1Index: linux-2.6.16/Documentation/Changes
2===================================================================
3--- linux-2.6.16.orig/Documentation/Changes
4+++ linux-2.6.16/Documentation/Changes
5@@ -54,6 +54,7 @@ o module-init-tools 0.9.10
6 o e2fsprogs 1.29 # tune2fs
7 o jfsutils 1.1.3 # fsck.jfs -V
8 o reiserfsprogs 3.6.3 # reiserfsck -V 2>&1|grep reiserfsprogs
9+o reiser4progs 1.0.0 # fsck.reiser4 -V
10 o xfsprogs 2.6.0 # xfs_db -V
11 o pcmciautils 004
12 o pcmcia-cs 3.1.21 # cardmgr -V
13@@ -163,6 +164,13 @@ The reiserfsprogs package should be used
14 versions of mkreiserfs, resize_reiserfs, debugreiserfs and
15 reiserfsck. These utils work on both i386 and alpha platforms.
16
17+Reiser4progs
18+------------
19+
20+The reiser4progs package contains utilities for the reiser4 file system.
21+Detailed instructions are provided in the README file located at:
22+<ftp://ftp.namesys.com/pub/reiser4progs/README>.
23+
24 Xfsprogs
25 --------
26
27@@ -344,6 +352,10 @@ Reiserfsprogs
28 -------------
29 o <http://www.namesys.com/pub/reiserfsprogs/reiserfsprogs-3.6.3.tar.gz>
30
31+Reiser4progs
32+------------
33+o <ftp://ftp.namesys.com/pub/reiser4progs/>
34+
35 Xfsprogs
36 --------
37 o <ftp://oss.sgi.com/projects/xfs/download/>
38Index: linux-2.6.16/Documentation/filesystems/reiser4.txt
39===================================================================
40--- /dev/null
41+++ linux-2.6.16/Documentation/filesystems/reiser4.txt
42@@ -0,0 +1,75 @@
43+Reiser4 filesystem
44+==================
45+Reiser4 is a file system based on dancing tree algorithms, and is
46+described at http://www.namesys.com
47+
48+
49+References
50+==========
51+web page http://namesys.com/v4/v4.html
52+source code ftp://ftp.namesys.com/pub/reiser4-for-2.6/
53+userland tools ftp://ftp.namesys.com/pub/reiser4progs/
54+install page http://www.namesys.com/install_v4.html
55+
56+Compile options
57+===============
58+Enable reiser4 debug mode
59+ This checks everything imaginable while reiser4
60+	runs.
61+
62+Mount options
63+=============
64+tmgr.atom_max_size=N
65+ Atoms containing more than N blocks will be forced to commit.
66+ N is decimal.
67+ Default is nr_free_pagecache_pages() / 2 at mount time.
68+
69+tmgr.atom_max_age=N
70+ Atoms older than N seconds will be forced to commit. N is decimal.
71+ Default is 600.
72+
73+tmgr.atom_max_flushers=N
74+ Limit of concurrent flushers for one atom. 0 means no limit.
75+ Default is 0.
76+
77+tree.cbk_cache.nr_slots=N
78+ Number of slots in the cbk cache.
79+
80+flush.relocate_threshold=N
81+ If flush finds more than N adjacent dirty leaf-level blocks it
82+ will force them to be relocated.
83+ Default is 64.
84+
85+flush.relocate_distance=N
86+	If flush can find a block allocation no farther than N blocks
87+	from the preceder it will relocate to that position.
88+ Default is 64.
89+
90+flush.scan_maxnodes=N
91+ The maximum number of nodes to scan left on a level during
92+ flush.
93+ Default is 10000.
94+
95+optimal_io_size=N
96+ Preferred IO size. This value is used to set st_blksize of
97+ struct stat.
98+ Default is 65536.
99+
100+bsdgroups
101+ Turn on BSD-style gid assignment.
102+
103+32bittimes
104+	By default files in reiser4 have 64 bit timestamps. Files
105+	created when the filesystem is mounted with the 32bittimes mount
106+	option will get 32 bit timestamps.
107+
108+mtflush
109+ Turn off concurrent flushing.
110+
111+nopseudo
112+ Disable pseudo files support. See
113+ http://namesys.com/v4/pseudo.html for more about pseudo files.
114+
115+dont_load_bitmap
116+	Don't load all bitmap blocks at mount time; this is useful for
117+	machines with tiny RAM and large disks.
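
For concreteness, a hedged example of passing the options above at mount
time follows. This is an editor's sketch, not part of the patch: the device,
mount point, and option values are placeholders, and the shell equivalent
would be "mount -t reiser4 -o tmgr.atom_max_age=300,dont_load_bitmap
/dev/sdb1 /mnt/reiser4".

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* hypothetical device and mount point; the data string carries the
	   reiser4 mount options documented above */
	if (mount("/dev/sdb1", "/mnt/reiser4", "reiser4", 0,
		  "tmgr.atom_max_age=300,dont_load_bitmap") != 0) {
		perror("mount");
		return 1;
	}
	return 0;
}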
118Index: linux-2.6.16/fs/Kconfig
119===================================================================
120--- linux-2.6.16.orig/fs/Kconfig
121+++ linux-2.6.16/fs/Kconfig
122@@ -177,6 +177,8 @@ config FS_MBCACHE
123 default y if EXT2_FS=y || EXT3_FS=y
124 default m if EXT2_FS=m || EXT3_FS=m
125
126+source "fs/reiser4/Kconfig"
127+
128 config REISERFS_FS
129 tristate "Reiserfs support"
130 help
131Index: linux-2.6.16/fs/Makefile
132===================================================================
133--- linux-2.6.16.orig/fs/Makefile
134+++ linux-2.6.16/fs/Makefile
135@@ -51,6 +51,7 @@ obj-$(CONFIG_PROFILING) += dcookies.o
136
137 # Do not add any filesystems before this line
138 obj-$(CONFIG_REISERFS_FS) += reiserfs/
139+obj-$(CONFIG_REISER4_FS) += reiser4/
140 obj-$(CONFIG_EXT3_FS) += ext3/ # Before ext2 so root fs can be ext3
141 obj-$(CONFIG_JBD) += jbd/
142 obj-$(CONFIG_EXT2_FS) += ext2/
143Index: linux-2.6.16/fs/fs-writeback.c
144===================================================================
145--- linux-2.6.16.orig/fs/fs-writeback.c
146+++ linux-2.6.16/fs/fs-writeback.c
147@@ -286,8 +286,6 @@ __writeback_single_inode(struct inode *i
148 * WB_SYNC_HOLD is a hack for sys_sync(): reattach the inode to sb->s_dirty so
149 * that it can be located for waiting on in __writeback_single_inode().
150 *
151- * Called under inode_lock.
152- *
153 * If `bdi' is non-zero then we're being asked to writeback a specific queue.
154 * This function assumes that the blockdev superblock's inodes are backed by
155 * a variety of queues, so all inodes are searched. For other superblocks,
156@@ -303,11 +301,13 @@ __writeback_single_inode(struct inode *i
157 * on the writer throttling path, and we get decent balancing between many
158 * throttled threads: we don't want them all piling up on __wait_on_inode.
159 */
160-static void
161-sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
162+void
163+generic_sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
164 {
165 const unsigned long start = jiffies; /* livelock avoidance */
166
167+ spin_lock(&inode_lock);
168+
169 if (!wbc->for_kupdate || list_empty(&sb->s_io))
170 list_splice_init(&sb->s_dirty, &sb->s_io);
171
172@@ -387,8 +387,19 @@ sync_sb_inodes(struct super_block *sb, s
173 if (wbc->nr_to_write <= 0)
174 break;
175 }
176+ spin_unlock(&inode_lock);
177 return; /* Leave any unwritten inodes on s_io */
178 }
179+EXPORT_SYMBOL(generic_sync_sb_inodes);
180+
181+static void
182+sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
183+{
184+ if (sb->s_op->sync_inodes)
185+ sb->s_op->sync_inodes(sb, wbc);
186+ else
187+ generic_sync_sb_inodes(sb, wbc);
188+}
189
190 /*
191 * Start writeback of dirty pagecache data against all unlocked inodes.
192@@ -429,11 +440,8 @@ restart:
193 * be unmounted by the time it is released.
194 */
195 if (down_read_trylock(&sb->s_umount)) {
196- if (sb->s_root) {
197- spin_lock(&inode_lock);
198+ if (sb->s_root)
199 sync_sb_inodes(sb, wbc);
200- spin_unlock(&inode_lock);
201- }
202 up_read(&sb->s_umount);
203 }
204 spin_lock(&sb_lock);
205@@ -469,9 +477,7 @@ void sync_inodes_sb(struct super_block *
206 (inodes_stat.nr_inodes - inodes_stat.nr_unused) +
207 nr_dirty + nr_unstable;
208 wbc.nr_to_write += wbc.nr_to_write / 2; /* Bit more for luck */
209- spin_lock(&inode_lock);
210 sync_sb_inodes(sb, &wbc);
211- spin_unlock(&inode_lock);
212 }
213
214 /*
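
The hunks above make generic_sync_sb_inodes() take inode_lock itself and
export it, while sync_sb_inodes() now dispatches to a per-filesystem callback
when one is provided. As a hedged illustration of the new hook (assuming the
patch also adds a sync_inodes member to struct super_operations, as the call
through sb->s_op->sync_inodes implies; the myfs_* names are hypothetical):

static void myfs_sync_inodes(struct super_block *sb,
			     struct writeback_control *wbc)
{
	/* filesystem-specific preparation could go here */
	generic_sync_sb_inodes(sb, wbc);	/* fall back to the generic path */
}

static struct super_operations myfs_super_ops = {
	.sync_inodes = myfs_sync_inodes,
	/* other operations omitted */
};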
215Index: linux-2.6.16/fs/reiser4/Kconfig
216===================================================================
217--- /dev/null
218+++ linux-2.6.16/fs/reiser4/Kconfig
219@@ -0,0 +1,31 @@
220+config REISER4_FS
221+ tristate "Reiser4 (EXPERIMENTAL)"
222+ depends on EXPERIMENTAL
223+ select ZLIB_INFLATE
224+ select ZLIB_DEFLATE
225+ help
226+ Reiser4 is a filesystem that performs all filesystem operations
227+ as atomic transactions, which means that it either performs a
228+ write, or it does not, and in the event of a crash it does not
229+ partially perform it or corrupt it.
230+
231+ It stores files in dancing trees, which are like balanced trees but
232+ faster. It packs small files together so that they share blocks
233+ without wasting space. This means you can use it to store really
234+ small files. It also means that it saves you disk space. It avoids
235+ hassling you with anachronisms like having a maximum number of
236+ inodes, and wasting space if you use less than that number.
237+
238+ Reiser4 is a distinct filesystem type from reiserfs (V3).
239+ It's therefore not possible to use reiserfs file systems
240+ with reiser4.
241+
242+ To learn more about reiser4, go to http://www.namesys.com
243+
244+config REISER4_DEBUG
245+ bool "Enable reiser4 debug mode"
246+ depends on REISER4_FS
247+ help
248+ Don't use this unless you are debugging reiser4.
249+
250+ If unsure, say N.
251Index: linux-2.6.16/fs/reiser4/Makefile
252===================================================================
253--- /dev/null
254+++ linux-2.6.16/fs/reiser4/Makefile
255@@ -0,0 +1,100 @@
256+#
257+# reiser4/Makefile
258+#
259+
260+obj-$(CONFIG_REISER4_FS) += reiser4.o
261+
262+reiser4-y := \
263+ debug.o \
264+ jnode.o \
265+ znode.o \
266+ key.o \
267+ pool.o \
268+ tree_mod.o \
269+ estimate.o \
270+ carry.o \
271+ carry_ops.o \
272+ lock.o \
273+ tree.o \
274+ context.o \
275+ tap.o \
276+ coord.o \
277+ block_alloc.o \
278+ txnmgr.o \
279+ kassign.o \
280+ flush.o \
281+ wander.o \
282+ eottl.o \
283+ search.o \
284+ page_cache.o \
285+ seal.o \
286+ dscale.o \
287+ flush_queue.o \
288+ ktxnmgrd.o \
289+ blocknrset.o \
290+ super.o \
291+ super_ops.o \
292+ fsdata.o \
293+ export_ops.o \
294+ oid.o \
295+ tree_walk.o \
296+ inode.o \
297+ vfs_ops.o \
298+ as_ops.o \
299+	   entd.o \
300+ readahead.o \
301+ status_flags.o \
302+ init_super.o \
303+ safe_link.o \
304+ \
305+ plugin/plugin.o \
306+ plugin/plugin_set.o \
307+ plugin/node/node.o \
308+ plugin/object.o \
309+ plugin/cluster.o \
310+ plugin/inode_ops.o \
311+ plugin/inode_ops_rename.o \
312+ plugin/file_ops.o \
313+ plugin/file_ops_readdir.o \
314+ plugin/file_plugin_common.o \
315+ plugin/file/file.o \
316+ plugin/file/tail_conversion.o \
317+ plugin/file/symlink.o \
318+ plugin/file/cryptcompress.o \
319+ plugin/dir_plugin_common.o \
320+ plugin/dir/hashed_dir.o \
321+ plugin/dir/seekable_dir.o \
322+ plugin/node/node40.o \
323+ \
324+ plugin/crypto/cipher.o \
325+ plugin/crypto/digest.o \
326+ \
327+ plugin/compress/minilzo.o \
328+ plugin/compress/compress.o \
329+ plugin/compress/compress_mode.o \
330+ \
331+ plugin/item/static_stat.o \
332+ plugin/item/sde.o \
333+ plugin/item/cde.o \
334+ plugin/item/blackbox.o \
335+ plugin/item/internal.o \
336+ plugin/item/tail.o \
337+ plugin/item/ctail.o \
338+ plugin/item/extent.o \
339+ plugin/item/extent_item_ops.o \
340+ plugin/item/extent_file_ops.o \
341+ plugin/item/extent_flush_ops.o \
342+ \
343+ plugin/hash.o \
344+ plugin/fibration.o \
345+ plugin/tail_policy.o \
346+ plugin/item/item.o \
347+ \
348+ plugin/security/perm.o \
349+ plugin/space/bitmap.o \
350+ \
351+ plugin/disk_format/disk_format40.o \
352+ plugin/disk_format/disk_format.o \
353+ \
354+ plugin/regular.o
355+
356Index: linux-2.6.16/fs/reiser4/README
357===================================================================
358--- /dev/null
359+++ linux-2.6.16/fs/reiser4/README
360@@ -0,0 +1,125 @@
361+[LICENSING]
362+
363+Reiser4 is hereby licensed under the GNU General
364+Public License version 2.
365+
366+Source code files that contain the phrase "licensing governed by
367+reiser4/README" are "governed files" throughout this file. Governed
368+files are licensed under the GPL. The portions of them owned by Hans
369+Reiser, or authorized to be licensed by him, have been in the past,
370+and likely will be in the future, licensed to other parties under
371+other licenses. If you add your code to governed files, and don't
372+want it to be owned by Hans Reiser, put your copyright label on that
373+code so the poor blight and his customers can keep things straight.
374+All portions of governed files not labeled otherwise are owned by Hans
375+Reiser, and by adding your code to it, widely distributing it to
376+others or sending us a patch, and leaving the sentence in stating that
377+licensing is governed by the statement in this file, you accept this.
378+It will be a kindness if you identify whether Hans Reiser is allowed
379+to license code labeled as owned by you on your behalf other than
380+under the GPL, because he wants to know if it is okay to do so and put
381+a check in the mail to you (for non-trivial improvements) when he
382+makes his next sale. He makes no guarantees as to the amount if any,
383+though he feels motivated to motivate contributors, and you can surely
384+discuss this with him before or after contributing. You have the
385+right to decline to allow him to license your code contribution other
386+than under the GPL.
387+
388+Further licensing options are available for commercial and/or other
389+interests directly from Hans Reiser: reiser@namesys.com. If you interpret
390+the GPL as not allowing those additional licensing options, you read
391+it wrongly, and Richard Stallman agrees with me, when carefully read
392+you can see that those restrictions on additional terms do not apply
393+to the owner of the copyright, and my interpretation of this shall
394+govern for this license.
395+
396+[END LICENSING]
397+
398+Reiser4 is a file system based on dancing tree algorithms, and is
399+described at http://www.namesys.com
400+
401+mkfs.reiser4 and other utilities are on our webpage or wherever your
402+Linux provider put them. You really want to be running the latest
403+version off the website if you use fsck.
404+
405+Yes, if you update your reiser4 kernel module you do have to
406+recompile your kernel, most of the time. The errors you get will be
407+quite cryptic if your forget to do so.
408+
409+Hideous Commercial Pitch: Spread your development costs across other OS
410+vendors. Select from the best in the world, not the best in your
411+building, by buying from third party OS component suppliers. Leverage
412+the software component development power of the internet. Be the most
413+aggressive in taking advantage of the commercial possibilities of
414+decentralized internet development, and add value through your branded
415+integration that you sell as an operating system. Let your competitors
416+be the ones to compete against the entire internet by themselves. Be
417+hip, get with the new economic trend, before your competitors do. Send
418+email to reiser@namesys.com
419+
420+Hans Reiser was the primary architect of Reiser4, but a whole team
421+chipped their ideas in. He invested everything he had into Namesys
422+for 5.5 dark years of no money before Reiser3 finally started to work well
423+enough to bring in money. He owns the copyright.
424+
425+DARPA was the primary sponsor of Reiser4. DARPA does not endorse
426+Reiser4, it merely sponsors it. DARPA is, in solely Hans's personal
427+opinion, unique in its willingness to invest into things more
428+theoretical than the VC community can readily understand, and more
429+longterm than allows them to be sure that they will be the ones to
430+extract the economic benefits from. DARPA also integrated us into a
431+security community that transformed our security worldview.
432+
433+Vladimir Saveliev is our lead programmer, with us from the beginning,
434+and he worked long hours writing the cleanest code. This is why he is
435+now the lead programmer after years of commitment to our work. He
436+always made the effort to be the best he could be, and to make his
437+code the best that it could be. What resulted was quite remarkable. I
438+don't think that money can ever motivate someone to work the way he
439+did, he is one of the most selfless men I know.
440+
441+Alexander Lyamin was our sysadmin, and helped to educate us in
442+security issues. Moscow State University and IMT were very generous
443+in the internet access they provided us, and in lots of other little
444+ways that a generous institution can be.
445+
446+Alexander Zarochentcev (sometimes known as zam, or sasha), wrote the
447+locking code, the block allocator, and finished the flushing code.
448+His code is always crystal clean and well structured.
449+
450+Nikita Danilov wrote the core of the balancing code, the core of the
451+plugins code, and the directory code. He worked at a steady pace of long
452+hours that produced a whole lot of well abstracted code. He is our
453+senior computer scientist.
454+
455+Vladimir Demidov wrote the parser. Writing an in kernel parser is
456+something very few persons have the skills for, and it is thanks to
457+him that we can say that the parser is really not so big compared to
458+various bits of our other code, and making a parser work in the kernel
459+was not so complicated as everyone would imagine mainly because it was
460+him doing it...
461+
462+Joshua McDonald wrote the transaction manager, and the flush code.
463+The flush code unexpectedly turned out be extremely hairy for reasons
464+you can read about on our web page, and he did a great job on an
465+extremely difficult task.
466+
467+Nina Reiser handled our accounting, government relations, and much
468+more.
469+
470+Ramon Reiser developed our website.
471+
472+Beverly Palmer drew our graphics.
473+
474+Vitaly Fertman developed librepair, userspace plugins repair code, fsck
475+and worked with Umka on developing libreiser4 and userspace plugins.
476+
477+Yury Umanets (aka Umka) developed libreiser4, userspace plugins and
478+userspace tools (reiser4progs).
479+
480+Oleg Drokin (aka Green) is the release manager who fixes everything.
481+It is so nice to have someone like that on the team. He (plus Chris
482+and Jeff) make it possible for the entire rest of the Namesys team to
483+focus on Reiser4, and he fixed a whole lot of Reiser4 bugs also. It
484+is just amazing to watch his talent for spotting bugs in action.
485+
486Index: linux-2.6.16/fs/reiser4/as_ops.c
487===================================================================
488--- /dev/null
489+++ linux-2.6.16/fs/reiser4/as_ops.c
490@@ -0,0 +1,392 @@
491+/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */
492+
493+/* Interface to VFS. Reiser4 address_space_operations are defined here. */
494+
495+#include "forward.h"
496+#include "debug.h"
497+#include "dformat.h"
498+#include "coord.h"
499+#include "plugin/item/item.h"
500+#include "plugin/file/file.h"
501+#include "plugin/security/perm.h"
502+#include "plugin/disk_format/disk_format.h"
503+#include "plugin/plugin.h"
504+#include "plugin/plugin_set.h"
505+#include "plugin/object.h"
506+#include "txnmgr.h"
507+#include "jnode.h"
508+#include "znode.h"
509+#include "block_alloc.h"
510+#include "tree.h"
511+#include "vfs_ops.h"
512+#include "inode.h"
513+#include "page_cache.h"
514+#include "ktxnmgrd.h"
515+#include "super.h"
516+#include "reiser4.h"
517+#include "entd.h"
518+
519+#include <linux/profile.h>
520+#include <linux/types.h>
521+#include <linux/mount.h>
522+#include <linux/vfs.h>
523+#include <linux/mm.h>
524+#include <linux/buffer_head.h>
525+#include <linux/dcache.h>
526+#include <linux/list.h>
527+#include <linux/pagemap.h>
528+#include <linux/slab.h>
529+#include <linux/seq_file.h>
530+#include <linux/init.h>
531+#include <linux/module.h>
532+#include <linux/writeback.h>
533+#include <linux/backing-dev.h>
534+#include <linux/quotaops.h>
535+#include <linux/security.h>
536+
537+/* address space operations */
538+
539+/**
540+ * reiser4_set_page_dirty - set dirty bit, tag in page tree, dirty accounting
541+ * @page: page to be dirtied
542+ *
543+ * Operation of struct address_space_operations. This implementation is used by
544+ * unix and crc file plugins.
545+ *
546+ * This is called when reiser4 page gets dirtied outside of reiser4, for
547+ * example, when dirty bit is moved from pte to physical page.
548+ *
549+ * Tags page in the mapping's page tree with special tag so that it is possible
550+ * to do all the reiser4 specific work wrt dirty pages (jnode creation,
551+ * capturing by an atom) later because it can not be done in the contexts where
552+ * set_page_dirty is called.
553+ */
554+int reiser4_set_page_dirty(struct page *page)
555+{
556+ /* this page can be unformatted only */
557+ assert("vs-1734", (page->mapping &&
558+ page->mapping->host &&
559+ get_super_fake(page->mapping->host->i_sb) !=
560+ page->mapping->host
561+ && get_cc_fake(page->mapping->host->i_sb) !=
562+ page->mapping->host
563+ && get_bitmap_fake(page->mapping->host->i_sb) !=
564+ page->mapping->host));
565+
566+ if (!TestSetPageDirty(page)) {
567+ struct address_space *mapping = page->mapping;
568+
569+ if (mapping) {
570+ write_lock_irq(&mapping->tree_lock);
571+
572+ /* check for race with truncate */
573+ if (page->mapping) {
574+ assert("vs-1652", page->mapping == mapping);
575+ if (mapping_cap_account_dirty(mapping))
576+ inc_page_state(nr_dirty);
577+ radix_tree_tag_set(&mapping->page_tree,
578+ page->index,
579+ PAGECACHE_TAG_REISER4_MOVED);
580+ }
581+ write_unlock_irq(&mapping->tree_lock);
582+ __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
583+ }
584+ }
585+ return 0;
586+}
587+
588+static int filler(void *vp, struct page *page)
589+{
590+ return page->mapping->a_ops->readpage(vp, page);
591+}
592+
593+/**
594+ * reiser4_readpages - submit read for a set of pages
595+ * @file: file to read
596+ * @mapping: address space
597+ * @pages: list of pages to submit read for
598+ * @nr_pages: number of pages on the list
599+ *
600+ * Operation of struct address_space_operations. This implementation is used by
601+ * unix and crc file plugins.
602+ *
603+ * Calls read_cache_pages or readpages hook if it is set.
604+ */
605+int
606+reiser4_readpages(struct file *file, struct address_space *mapping,
607+ struct list_head *pages, unsigned nr_pages)
608+{
609+ reiser4_context *ctx;
610+ reiser4_file_fsdata *fsdata;
611+
612+ ctx = init_context(mapping->host->i_sb);
613+ if (IS_ERR(ctx))
614+ return PTR_ERR(ctx);
615+
616+ fsdata = reiser4_get_file_fsdata(file);
617+ if (IS_ERR(fsdata)) {
618+ reiser4_exit_context(ctx);
619+ return PTR_ERR(fsdata);
620+ }
621+
622+ if (fsdata->ra2.readpages)
623+ fsdata->ra2.readpages(mapping, pages, fsdata->ra2.data);
624+ else {
625+ /*
626+ * filler (reiser4 readpage method) may involve tree search
627+ * which is not allowed when lock stack is not clean. If lock
628+ * stack is not clean - do nothing.
629+ */
630+ if (lock_stack_isclean(get_current_lock_stack()))
631+ read_cache_pages(mapping, pages, filler, file);
632+ else {
633+ while (!list_empty(pages)) {
634+ struct page *victim;
635+
636+ victim = list_entry(pages->prev, struct page, lru);
637+ list_del(&victim->lru);
638+ page_cache_release(victim);
639+ }
640+ }
641+ }
642+ reiser4_exit_context(ctx);
643+ return 0;
644+}
645+
646+/* ->invalidatepage method for reiser4 */
647+
648+/*
649+ * this is called for each truncated page from
650+ * truncate_inode_pages()->truncate_{complete,partial}_page().
651+ *
652+ * At the moment of call, page is under lock, and outstanding io (if any) has
653+ * completed.
654+ */
655+
656+/**
657+ * reiser4_invalidatepage
658+ * @page: page to invalidate
659+ * @offset: starting offset for partial invalidation
660+ *
661+ */
662+int reiser4_invalidatepage(struct page *page, unsigned long offset)
663+{
664+ int ret = 0;
665+ reiser4_context *ctx;
666+ struct inode *inode;
667+ jnode *node;
668+
669+ /*
670+ * This is called to truncate file's page.
671+ *
672+ * Originally, reiser4 implemented truncate in a standard way
673+ * (vmtruncate() calls ->invalidatepage() on all truncated pages
674+ * first, then file system ->truncate() call-back is invoked).
675+ *
676+	 * This led to a problem where ->invalidatepage() was called on a
677+	 * page with a jnode that had been captured into an atom in the
678+	 * ASTAGE_PRE_COMMIT stage. That is, truncate was bypassing transactions;
679+	 * to avoid this, the try_capture_page_to_invalidate() call was added here.
680+ *
681+ * After many troubles with vmtruncate() based truncate (including
682+ * races with flush, tail conversion, etc.) it was re-written in the
683+ * top-to-bottom style: items are killed in cut_tree_object() and
684+ * pages belonging to extent are invalidated in kill_hook_extent(). So
685+ * probably now additional call to capture is not needed here.
686+ */
687+
688+ assert("nikita-3137", PageLocked(page));
689+ assert("nikita-3138", !PageWriteback(page));
690+ inode = page->mapping->host;
691+
692+ /*
693+ * ->invalidatepage() should only be called for the unformatted
694+ * jnodes. Destruction of all other types of jnodes is performed
695+ * separately. But, during some corner cases (like handling errors
696+	 * during mount) it is simpler to let ->invalidatepage be called on
697+ * them. Check for this, and do nothing.
698+ */
699+ if (get_super_fake(inode->i_sb) == inode)
700+ return 0;
701+ if (get_cc_fake(inode->i_sb) == inode)
702+ return 0;
703+ if (get_bitmap_fake(inode->i_sb) == inode)
704+ return 0;
705+ assert("vs-1426", PagePrivate(page));
706+ assert("vs-1427",
707+ page->mapping == jnode_get_mapping(jnode_by_page(page)));
708+ assert("", jprivate(page) != NULL);
709+ assert("", ergo(inode_file_plugin(inode) !=
710+ file_plugin_by_id(CRC_FILE_PLUGIN_ID), offset == 0));
711+
712+ ctx = init_context(inode->i_sb);
713+ if (IS_ERR(ctx))
714+ return PTR_ERR(ctx);
715+
716+ node = jprivate(page);
717+ spin_lock_jnode(node);
718+ if (!(node->state & ((1 << JNODE_DIRTY) | (1<< JNODE_FLUSH_QUEUED) |
719+ (1 << JNODE_WRITEBACK) | (1 << JNODE_OVRWR)))) {
720+		/* there is no need to capture */
721+ jref(node);
722+ JF_SET(node, JNODE_HEARD_BANSHEE);
723+ page_clear_jnode(page, node);
724+ uncapture_jnode(node);
725+ unhash_unformatted_jnode(node);
726+ jput(node);
727+ reiser4_exit_context(ctx);
728+ return 0;
729+ }
730+ spin_unlock_jnode(node);
731+
732+ /* capture page being truncated. */
733+ ret = try_capture_page_to_invalidate(page);
734+ if (ret != 0)
735+ warning("nikita-3141", "Cannot capture: %i", ret);
736+
737+ if (offset == 0) {
738+ /* remove jnode from transaction and detach it from page. */
739+ jref(node);
740+ JF_SET(node, JNODE_HEARD_BANSHEE);
741+ /* page cannot be detached from jnode concurrently, because it
742+ * is locked */
743+ uncapture_page(page);
744+
745+ /* this detaches page from jnode, so that jdelete will not try
746+ * to lock page which is already locked */
747+ spin_lock_jnode(node);
748+ page_clear_jnode(page, node);
749+ spin_unlock_jnode(node);
750+ unhash_unformatted_jnode(node);
751+
752+ jput(node);
753+ }
754+
755+ reiser4_exit_context(ctx);
756+ return 0;
757+}
758+
759+/* helper function called from reiser4_releasepage(). It returns true if jnode
760+ * can be detached from its page and page released. */
761+int jnode_is_releasable(jnode * node /* node to check */ )
762+{
763+ assert("nikita-2781", node != NULL);
764+ assert_spin_locked(&(node->guard));
765+ assert_spin_locked(&(node->load));
766+
767+	/* if some thread is currently using the jnode page, the latter cannot
768+	 * be detached */
769+ if (atomic_read(&node->d_count) != 0) {
770+ return 0;
771+ }
772+
773+ assert("vs-1214", !jnode_is_loaded(node));
774+
775+ /*
776+ * can only release page if real block number is assigned to it. Simple
777+ * check for ->atom wouldn't do, because it is possible for node to be
778+	 * clean, not in an atom yet, and still have a fake block number. For
779+ * example, node just created in jinit_new().
780+ */
781+ if (blocknr_is_fake(jnode_get_block(node)))
782+ return 0;
783+
784+ /*
785+ * pages prepared for write can not be released anyway, so avoid
786+ * detaching jnode from the page
787+ */
788+ if (JF_ISSET(node, JNODE_WRITE_PREPARED))
789+ return 0;
790+
791+ /*
792+ * dirty jnode cannot be released. It can however be submitted to disk
793+ * as part of early flushing, but only after getting flush-prepped.
794+ */
795+ if (JF_ISSET(node, JNODE_DIRTY))
796+ return 0;
797+
798+ /* overwrite set is only written by log writer. */
799+ if (JF_ISSET(node, JNODE_OVRWR))
800+ return 0;
801+
802+ /* jnode is already under writeback */
803+ if (JF_ISSET(node, JNODE_WRITEBACK))
804+ return 0;
805+
806+ /* don't flush bitmaps or journal records */
807+ if (!jnode_is_znode(node) && !jnode_is_unformatted(node))
808+ return 0;
809+
810+ return 1;
811+}
812+
813+/*
814+ * ->releasepage method for reiser4
815+ *
816+ * This is called by the VM scanner when it comes across a clean page. What we have
817+ * to do here is to check whether page can really be released (freed that is)
818+ * and if so, detach jnode from it and remove page from the page cache.
819+ *
820+ * The check for releasability is done by jnode_is_releasable().
821+ */
822+int reiser4_releasepage(struct page *page, gfp_t gfp UNUSED_ARG)
823+{
824+ jnode *node;
825+
826+ assert("nikita-2257", PagePrivate(page));
827+ assert("nikita-2259", PageLocked(page));
828+ assert("nikita-2892", !PageWriteback(page));
829+ assert("nikita-3019", schedulable());
830+
831+ /* NOTE-NIKITA: this can be called in the context of reiser4 call. It
832+	   is not clear what to do in this case. A lot of deadlocks seem
833+ possible. */
834+
835+ node = jnode_by_page(page);
836+ assert("nikita-2258", node != NULL);
837+ assert("reiser4-4", page->mapping != NULL);
838+ assert("reiser4-5", page->mapping->host != NULL);
839+
840+ if (PageDirty(page))
841+ return 0;
842+
843+ if (page_count(page) > 3)
844+ return 0;
845+
846+	/* jnode_is_releasable() needs the jnode lock, because it looks at jnode fields
847+ * and we need jload_lock here to avoid races with jload(). */
848+ spin_lock_jnode(node);
849+ spin_lock(&(node->load));
850+ if (jnode_is_releasable(node)) {
851+ struct address_space *mapping;
852+
853+ mapping = page->mapping;
854+ jref(node);
855+ /* there is no need to synchronize against
856+ * jnode_extent_write() here, because pages seen by
857+ * jnode_extent_write() are !releasable(). */
858+ page_clear_jnode(page, node);
859+ spin_unlock(&(node->load));
860+ spin_unlock_jnode(node);
861+
862+ /* we are under memory pressure so release jnode also. */
863+ jput(node);
864+
865+ return 1;
866+ } else {
867+ spin_unlock(&(node->load));
868+ spin_unlock_jnode(node);
869+ assert("nikita-3020", schedulable());
870+ return 0;
871+ }
872+}
873+
874+/* Make Linus happy.
875+ Local variables:
876+ c-indentation-style: "K&R"
877+ mode-name: "LC"
878+ c-basic-offset: 8
879+ tab-width: 8
880+ fill-column: 120
881+ End:
882+*/
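
Taken together, the four operations defined in as_ops.c are reiser4's entries
in an address_space_operations table. A hedged sketch of that wiring follows;
the table name is hypothetical, and the full table (with readpage, writepage,
and the rest) is defined elsewhere in the patch:

static const struct address_space_operations reiser4_aops_sketch = {
	.set_page_dirty	 = reiser4_set_page_dirty,
	.readpages	 = reiser4_readpages,
	.invalidatepage	 = reiser4_invalidatepage,
	.releasepage	 = reiser4_releasepage,
	/* readpage, writepage, etc. are supplied elsewhere in the patch */
};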
883Index: linux-2.6.16/fs/reiser4/block_alloc.c
884===================================================================
885--- /dev/null
886+++ linux-2.6.16/fs/reiser4/block_alloc.c
887@@ -0,0 +1,1139 @@
888+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
889+
890+#include "debug.h"
891+#include "dformat.h"
892+#include "plugin/plugin.h"
893+#include "txnmgr.h"
894+#include "znode.h"
895+#include "block_alloc.h"
896+#include "tree.h"
897+#include "super.h"
898+
899+#include <linux/types.h> /* for __u?? */
900+#include <linux/fs.h> /* for struct super_block */
901+#include <linux/spinlock.h>
902+
903+/* THE REISER4 DISK SPACE RESERVATION SCHEME. */
904+
905+/* We need to be able to reserve enough disk space to ensure that an atomic
906+ operation will have enough disk space to flush (see flush.c and
907+ http://namesys.com/v4/v4.html) and commit it once it is started.
908+
909+ In our design a call for reserving disk space may fail but not an actual
910+ block allocation.
911+
912+ All free blocks, already allocated blocks, and all kinds of reserved blocks
913+ are counted in different per-fs block counters.
914+
915+ A reiser4 super block's set of block counters currently is:
916+
917+ free -- free blocks,
918+ used -- already allocated blocks,
919+
920+ grabbed -- initially reserved for performing an fs operation, those blocks
921+ are taken from free blocks, then grabbed disk space leaks from grabbed
922+ blocks counter to other counters like "fake allocated", "flush
923+ reserved", "used", the rest of not used grabbed space is returned to
924+ free space at the end of fs operation;
925+
926+ fake allocated -- counts all nodes without real disk block numbers assigned,
927+ we have separate accounting for formatted and unformatted
928+ nodes (for easier debugging);
929+
930+ flush reserved -- disk space needed for flushing and committing an atom.
931+ Each dirty already allocated block could be written as a
932+ part of atom's overwrite set or as a part of atom's
933+		    relocate set. In both cases one additional block is needed,
934+ it is used as a wandered block if we do overwrite or as a
935+ new location for a relocated block.
936+
937+ In addition, blocks in some states are counted on per-thread and per-atom
938+ basis. A reiser4 context has a counter of blocks grabbed by this transaction
939+ and the sb's grabbed blocks counter is a sum of grabbed blocks counter values
940+ of each reiser4 context. Each reiser4 atom has a counter of "flush reserved"
941+ blocks, which are reserved for flush processing and atom commit. */
942+
943+/* AN EXAMPLE: suppose we insert a new item into the reiser4 tree. We estimate
944+   the number of blocks to grab for the most expensive case of balancing, when
945+   the leaf node we insert the new item into gets split and a new leaf node is allocated.
946+
947+ So, we need to grab blocks for
948+
949+   1) one block for possibly dirtying the node we insert an item into. That block
950+   would be used for node relocation at flush time or for allocating a
951+   wandered one, depending on the result (which set, relocate or
952+   overwrite, the node gets assigned to) of the node's processing by the flush
953+   algorithm.
954+
955+   2) one block for either allocating a new node, or dirtying the right or left
956+   clean neighbor; only one of these cases may happen.
957+
958+ VS-FIXME-HANS: why can only one case happen? I would expect to see dirtying of left neighbor, right neighbor, current
959+ node, and creation of new node. have I forgotten something? email me.
960+
961+ These grabbed blocks are counted in both reiser4 context "grabbed blocks"
962+ counter and in the fs-wide one (both ctx->grabbed_blocks and
963+ sbinfo->blocks_grabbed get incremented by 2), sb's free blocks counter is
964+ decremented by 2.
965+
966+   Suppose both blocks were spent: one for dirtying an already allocated clean
967+   node (that block went from "grabbed" to "flush reserved") and one for
968+   allocating a new block (that block went from "grabbed" to "fake allocated formatted").
969+
970+   Inserting a child pointer into the parent node may cause the parent to be
971+   split; the balancing code takes care of this by grabbing the necessary space
972+   immediately, calling reiser4_grab with the BA_RESERVED flag set, which means
973+ "can use the 5% reserved disk space".
974+
975+ At this moment insertion completes and grabbed blocks (if they were not used)
976+ should be returned to the free space counter.
977+
978+ However the atom life-cycle is not completed. The atom had one "flush
979+ reserved" block added by our insertion and the new fake allocated node is
980+ counted as a "fake allocated formatted" one. The atom has to be fully
981+ processed by flush before commit. Suppose that the flush moved the first,
982+ already allocated node to the atom's overwrite list, the new fake allocated
983+ node, obviously, went into the atom relocate set. The reiser4 flush
984+ allocates the new node using one unit from "fake allocated formatted"
985+ counter, the log writer uses one from "flush reserved" for wandered block
986+ allocation.
987+
988+ And, it is not the end. When the wandered block is deallocated after the
989+ atom gets fully played (see wander.c for term description), the disk space
990+ occupied for it is returned to free blocks. */
991+
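/* Editor's sketch, not part of the original patch: the grab -> use -> free
 * life cycle described above, expressed with helpers defined in this file.
 * The two-block estimate mirrors the insertion example; the function name
 * is hypothetical and the block is not compiled. */
#if 0
static int example_insert_reservation(void)
{
	/* the reservation may fail with -ENOSPC; the later real block
	 * allocation will not */
	if (reiser4_grab_space(2, BA_CAN_COMMIT))
		return RETERR(-ENOSPC);

	/* ... perform the insertion: grabbed blocks leak into "used",
	 * "fake allocated" or "flush reserved" as described above ... */

	/* return whatever was not consumed to the free blocks counter */
	grabbed2free_mark(0);
	return 0;
}
#endif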
992+/* BLOCK NUMBERS */
993+
994+/* Any reiser4 node has a block number assigned to it. We use these numbers for
995+ indexing in hash tables, so if a block has not yet been assigned a location
996+ on disk we need to give it a temporary fake block number.
997+
998+ Current implementation of reiser4 uses 64-bit integers for block numbers. We
999+ use highest bit in 64-bit block number to distinguish fake and real block
1000+   numbers. So, only 63 bits may be used for addressing real device
1001+ blocks. That "fake" block numbers space is divided into subspaces of fake
1002+ block numbers for data blocks and for shadow (working) bitmap blocks.
1003+
1004+ Fake block numbers for data blocks are generated by a cyclic counter, which
1005+ gets incremented after each real block allocation. We assume that it is
1006+   impossible to overflow this counter during the life of one transaction. */
1007+
1008+/* Initialize a blocknr hint. */
1009+void blocknr_hint_init(reiser4_blocknr_hint * hint)
1010+{
1011+ memset(hint, 0, sizeof(reiser4_blocknr_hint));
1012+}
1013+
1014+/* Release any resources of a blocknr hint. */
1015+void blocknr_hint_done(reiser4_blocknr_hint * hint UNUSED_ARG)
1016+{
1017+ /* No resources should be freed in current blocknr_hint implementation. */
1018+}
1019+
1020+/* see above for explanation of fake block number. */
1021+/* Audited by: green(2002.06.11) */
1022+int blocknr_is_fake(const reiser4_block_nr * da)
1023+{
1024+ /* The reason for not simply returning result of '&' operation is that
1025+ while return value is (possibly 32bit) int, the reiser4_block_nr is
1026+ at least 64 bits long, and high bit (which is the only possible
1027+ non zero bit after the masking) would be stripped off */
1028+ return (*da & REISER4_FAKE_BLOCKNR_BIT_MASK) ? 1 : 0;
1029+}
1030+
1031+/* Static functions for <reiser4 super block>/<reiser4 context> block counters
1032+   arithmetic. Mostly, they are isolated so as not to repeat the same assertions in
1033+ several places. */
1034+static void sub_from_ctx_grabbed(reiser4_context * ctx, __u64 count)
1035+{
1036+ BUG_ON(ctx->grabbed_blocks < count);
1037+ assert("zam-527", ctx->grabbed_blocks >= count);
1038+ ctx->grabbed_blocks -= count;
1039+}
1040+
1041+static void add_to_ctx_grabbed(reiser4_context * ctx, __u64 count)
1042+{
1043+ ctx->grabbed_blocks += count;
1044+}
1045+
1046+static void sub_from_sb_grabbed(reiser4_super_info_data * sbinfo, __u64 count)
1047+{
1048+ assert("zam-525", sbinfo->blocks_grabbed >= count);
1049+ sbinfo->blocks_grabbed -= count;
1050+}
1051+
1052+/* Decrease the counter of block reserved for flush in super block. */
1053+static void
1054+sub_from_sb_flush_reserved(reiser4_super_info_data * sbinfo, __u64 count)
1055+{
1056+ assert("vpf-291", sbinfo->blocks_flush_reserved >= count);
1057+ sbinfo->blocks_flush_reserved -= count;
1058+}
1059+
1060+static void
1061+sub_from_sb_fake_allocated(reiser4_super_info_data * sbinfo, __u64 count,
1062+ reiser4_ba_flags_t flags)
1063+{
1064+ if (flags & BA_FORMATTED) {
1065+ assert("zam-806", sbinfo->blocks_fake_allocated >= count);
1066+ sbinfo->blocks_fake_allocated -= count;
1067+ } else {
1068+ assert("zam-528",
1069+ sbinfo->blocks_fake_allocated_unformatted >= count);
1070+ sbinfo->blocks_fake_allocated_unformatted -= count;
1071+ }
1072+}
1073+
1074+static void sub_from_sb_used(reiser4_super_info_data * sbinfo, __u64 count)
1075+{
1076+ assert("zam-530",
1077+ sbinfo->blocks_used >= count + sbinfo->min_blocks_used);
1078+ sbinfo->blocks_used -= count;
1079+}
1080+
1081+static void
1082+sub_from_cluster_reserved(reiser4_super_info_data * sbinfo, __u64 count)
1083+{
1084+ assert("edward-501", sbinfo->blocks_clustered >= count);
1085+ sbinfo->blocks_clustered -= count;
1086+}
1087+
1088+/* Increase the counter of block reserved for flush in atom. */
1089+static void add_to_atom_flush_reserved_nolock(txn_atom * atom, __u32 count)
1090+{
1091+ assert("zam-772", atom != NULL);
1092+ assert_spin_locked(&(atom->alock));
1093+ atom->flush_reserved += count;
1094+}
1095+
1096+/* Decrease the counter of block reserved for flush in atom. */
1097+static void sub_from_atom_flush_reserved_nolock(txn_atom * atom, __u32 count)
1098+{
1099+ assert("zam-774", atom != NULL);
1100+ assert_spin_locked(&(atom->alock));
1101+ assert("nikita-2790", atom->flush_reserved >= count);
1102+ atom->flush_reserved -= count;
1103+}
1104+
1105+/* super block has 7 counters: free, used, grabbed, fake allocated
1106+   (formatted and unformatted), flush reserved and clustered. Their sum must
1107+   equal the number of blocks on the device. This function checks this. */
1108+int check_block_counters(const struct super_block *super)
1109+{
1110+ __u64 sum;
1111+
1112+ sum = reiser4_grabbed_blocks(super) + reiser4_free_blocks(super) +
1113+ reiser4_data_blocks(super) + reiser4_fake_allocated(super) +
1114+ reiser4_fake_allocated_unformatted(super) + flush_reserved(super) +
1115+ reiser4_clustered_blocks(super);
1116+ if (reiser4_block_count(super) != sum) {
1117+ printk("super block counters: "
1118+ "used %llu, free %llu, "
1119+ "grabbed %llu, fake allocated (formatetd %llu, unformatted %llu), "
1120+ "reserved %llu, clustered %llu, sum %llu, must be (block count) %llu\n",
1121+ (unsigned long long)reiser4_data_blocks(super),
1122+ (unsigned long long)reiser4_free_blocks(super),
1123+ (unsigned long long)reiser4_grabbed_blocks(super),
1124+ (unsigned long long)reiser4_fake_allocated(super),
1125+ (unsigned long long)
1126+ reiser4_fake_allocated_unformatted(super),
1127+ (unsigned long long)flush_reserved(super),
1128+ (unsigned long long)reiser4_clustered_blocks(super),
1129+ (unsigned long long)sum,
1130+ (unsigned long long)reiser4_block_count(super));
1131+ return 0;
1132+ }
1133+ return 1;
1134+}
1135+
1136+/* Adjust "working" free blocks counter for number of blocks we are going to
1137+ allocate. Record number of grabbed blocks in fs-wide and per-thread
1138+ counters. This function should be called before bitmap scanning or
1139+ allocating fake block numbers
1140+
1141+ @super -- pointer to reiser4 super block;
1142+ @count -- number of blocks we reserve;
1143+
1144+   @return -- 0 on success, -ENOSPC if all
1145+   free blocks are reserved or already allocated.
1146+*/
1147+
1148+static int
1149+reiser4_grab(reiser4_context * ctx, __u64 count, reiser4_ba_flags_t flags)
1150+{
1151+ __u64 free_blocks;
1152+ int ret = 0, use_reserved = flags & BA_RESERVED;
1153+ reiser4_super_info_data *sbinfo;
1154+
1155+ assert("vs-1276", ctx == get_current_context());
1156+
1157+ /* Do not grab anything on ro-mounted fs. */
1158+ if (rofs_super(ctx->super)) {
1159+ ctx->grab_enabled = 0;
1160+ return 0;
1161+ }
1162+
1163+ sbinfo = get_super_private(ctx->super);
1164+
1165+ spin_lock_reiser4_super(sbinfo);
1166+
1167+ free_blocks = sbinfo->blocks_free;
1168+
1169+ if ((use_reserved && free_blocks < count) ||
1170+ (!use_reserved && free_blocks < count + sbinfo->blocks_reserved)) {
1171+ ret = RETERR(-ENOSPC);
1172+ goto unlock_and_ret;
1173+ }
1174+
1175+ add_to_ctx_grabbed(ctx, count);
1176+
1177+ sbinfo->blocks_grabbed += count;
1178+ sbinfo->blocks_free -= count;
1179+
1180+#if REISER4_DEBUG
1181+ if (ctx->grabbed_initially == 0)
1182+ ctx->grabbed_initially = count;
1183+#endif
1184+
1185+ assert("nikita-2986", check_block_counters(ctx->super));
1186+
1187+ /* disable grab space in current context */
1188+ ctx->grab_enabled = 0;
1189+
1190+ unlock_and_ret:
1191+ spin_unlock_reiser4_super(sbinfo);
1192+
1193+ return ret;
1194+}
1195+
1196+int reiser4_grab_space(__u64 count, reiser4_ba_flags_t flags)
1197+{
1198+ int ret;
1199+ reiser4_context *ctx;
1200+
1201+ assert("nikita-2964", ergo(flags & BA_CAN_COMMIT,
1202+ lock_stack_isclean(get_current_lock_stack
1203+ ())));
1204+ ctx = get_current_context();
1205+ if (!(flags & BA_FORCE) && !is_grab_enabled(ctx)) {
1206+ return 0;
1207+ }
1208+
1209+ ret = reiser4_grab(ctx, count, flags);
1210+ if (ret == -ENOSPC) {
1211+
1212+		/* Try to commit all transactions if the BA_CAN_COMMIT flag is present */
1213+ if (flags & BA_CAN_COMMIT) {
1214+ txnmgr_force_commit_all(ctx->super, 0);
1215+ ctx->grab_enabled = 1;
1216+ ret = reiser4_grab(ctx, count, flags);
1217+ }
1218+ }
1219+ /*
1220+ * allocation from reserved pool cannot fail. This is severe error.
1221+ */
1222+ assert("nikita-3005", ergo(flags & BA_RESERVED, ret == 0));
1223+ return ret;
1224+}
1225+
1226+/*
1227+ * SPACE RESERVED FOR UNLINK/TRUNCATE
1228+ *
1229+ * Unlink and truncate require space in a transaction (to update stat data, at
1230+ * least), but we don't want rm(1) to fail with a "No space left on device" error.
1231+ *
1232+ * Solution is to reserve 5% of disk space for truncates and
1233+ * unlinks. Specifically, normal space grabbing requests don't grab space from
1234+ * reserved area. Only requests with BA_RESERVED bit in flags are allowed to
1235+ * drain it. Per super block delete_sema semaphore is used to allow only one
1236+ * thread at a time to grab from reserved area.
1237+ *
1238+ * Grabbing from reserved area should always be performed with BA_CAN_COMMIT
1239+ * flag.
1240+ *
1241+ */
1242+
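/* Editor's sketch, not part of the original patch: a hypothetical caller
 * pairing the two functions below around an unlink-style operation.
 * reiser4_grab_reserved() releases the reserve itself on failure. */
#if 0
static int example_unlink_reservation(struct super_block *super, __u64 count)
{
	int ret;

	ret = reiser4_grab_reserved(super, count, BA_CAN_COMMIT);
	if (ret)
		return ret;	/* even the 5% reserve was not enough */

	/* ... update stat data, cut items ... */

	reiser4_release_reserved(super);
	return 0;
}
#endif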
1243+int reiser4_grab_reserved(struct super_block *super,
1244+ __u64 count, reiser4_ba_flags_t flags)
1245+{
1246+ reiser4_super_info_data *sbinfo = get_super_private(super);
1247+
1248+ assert("nikita-3175", flags & BA_CAN_COMMIT);
1249+
1250+	/* Check whether the delete semaphore is already taken by us; we assume
1251+	 * that reading a machine word is atomic. */
1252+ if (sbinfo->delete_sema_owner == current) {
1253+ if (reiser4_grab_space
1254+ (count, (flags | BA_RESERVED) & ~BA_CAN_COMMIT)) {
1255+ warning("zam-1003",
1256+ "nested call of grab_reserved fails count=(%llu)",
1257+ (unsigned long long)count);
1258+ reiser4_release_reserved(super);
1259+ return RETERR(-ENOSPC);
1260+ }
1261+ return 0;
1262+ }
1263+
1264+ if (reiser4_grab_space(count, flags)) {
1265+ down(&sbinfo->delete_sema);
1266+ assert("nikita-2929", sbinfo->delete_sema_owner == NULL);
1267+ sbinfo->delete_sema_owner = current;
1268+
1269+ if (reiser4_grab_space(count, flags | BA_RESERVED)) {
1270+ warning("zam-833",
1271+ "reserved space is not enough (%llu)",
1272+ (unsigned long long)count);
1273+ reiser4_release_reserved(super);
1274+ return RETERR(-ENOSPC);
1275+ }
1276+ }
1277+ return 0;
1278+}
1279+
1280+void reiser4_release_reserved(struct super_block *super)
1281+{
1282+ reiser4_super_info_data *info;
1283+
1284+ info = get_super_private(super);
1285+ if (info->delete_sema_owner == current) {
1286+ info->delete_sema_owner = NULL;
1287+ up(&info->delete_sema);
1288+ }
1289+}
1290+
1291+static reiser4_super_info_data *grabbed2fake_allocated_head(int count)
1292+{
1293+ reiser4_context *ctx;
1294+ reiser4_super_info_data *sbinfo;
1295+
1296+ ctx = get_current_context();
1297+ sub_from_ctx_grabbed(ctx, count);
1298+
1299+ sbinfo = get_super_private(ctx->super);
1300+ spin_lock_reiser4_super(sbinfo);
1301+
1302+ sub_from_sb_grabbed(sbinfo, count);
1303+ /* return sbinfo locked */
1304+ return sbinfo;
1305+}
1306+
1307+/* called after @count fake block numbers are allocated and pointers to
1308+   those blocks are inserted into the tree. */
1309+static void grabbed2fake_allocated_formatted(void)
1310+{
1311+ reiser4_super_info_data *sbinfo;
1312+
1313+ sbinfo = grabbed2fake_allocated_head(1);
1314+ sbinfo->blocks_fake_allocated++;
1315+
1316+ assert("vs-922", check_block_counters(reiser4_get_current_sb()));
1317+
1318+ spin_unlock_reiser4_super(sbinfo);
1319+}
1320+
1321+/**
1322+ * grabbed2fake_allocated_unformatted
1323+ * @count:
1324+ *
1325+ */
1326+static void grabbed2fake_allocated_unformatted(int count)
1327+{
1328+ reiser4_super_info_data *sbinfo;
1329+
1330+ sbinfo = grabbed2fake_allocated_head(count);
1331+ sbinfo->blocks_fake_allocated_unformatted += count;
1332+
1333+ assert("vs-9221", check_block_counters(reiser4_get_current_sb()));
1334+
1335+ spin_unlock_reiser4_super(sbinfo);
1336+}
1337+
1338+void grabbed2cluster_reserved(int count)
1339+{
1340+ reiser4_context *ctx;
1341+ reiser4_super_info_data *sbinfo;
1342+
1343+ ctx = get_current_context();
1344+ sub_from_ctx_grabbed(ctx, count);
1345+
1346+ sbinfo = get_super_private(ctx->super);
1347+ spin_lock_reiser4_super(sbinfo);
1348+
1349+ sub_from_sb_grabbed(sbinfo, count);
1350+ sbinfo->blocks_clustered += count;
1351+
1352+ assert("edward-504", check_block_counters(ctx->super));
1353+
1354+ spin_unlock_reiser4_super(sbinfo);
1355+}
1356+
1357+void cluster_reserved2grabbed(int count)
1358+{
1359+ reiser4_context *ctx;
1360+ reiser4_super_info_data *sbinfo;
1361+
1362+ ctx = get_current_context();
1363+
1364+ sbinfo = get_super_private(ctx->super);
1365+ spin_lock_reiser4_super(sbinfo);
1366+
1367+ sub_from_cluster_reserved(sbinfo, count);
1368+ sbinfo->blocks_grabbed += count;
1369+
1370+ assert("edward-505", check_block_counters(ctx->super));
1371+
1372+ spin_unlock_reiser4_super(sbinfo);
1373+ add_to_ctx_grabbed(ctx, count);
1374+}
1375+
1376+void cluster_reserved2free(int count)
1377+{
1378+ reiser4_context *ctx;
1379+ reiser4_super_info_data *sbinfo;
1380+
1381+ assert("edward-503", get_current_context()->grabbed_blocks == 0);
1382+
1383+ ctx = get_current_context();
1384+ sbinfo = get_super_private(ctx->super);
1385+ spin_lock_reiser4_super(sbinfo);
1386+
1387+ sub_from_cluster_reserved(sbinfo, count);
1388+ sbinfo->blocks_free += count;
1389+
1390+ assert("edward-502", check_block_counters(ctx->super));
1391+
1392+ spin_unlock_reiser4_super(sbinfo);
1393+}
1394+
1395+static DEFINE_SPINLOCK(fake_lock);
1396+static reiser4_block_nr fake_gen = 0;
1397+
1398+/**
1399+ * assign_fake_blocknr
1400+ * @blocknr:
1401+ * @count:
1402+ *
1403+ * Obtain a fake block number for new node which will be used to refer to
1404+ * this newly allocated node until real allocation is done.
1405+ */
1406+static void assign_fake_blocknr(reiser4_block_nr *blocknr, int count)
1407+{
1408+ spin_lock(&fake_lock);
1409+ *blocknr = fake_gen;
1410+ fake_gen += count;
1411+ spin_unlock(&fake_lock);
1412+
1413+ BUG_ON(*blocknr & REISER4_BLOCKNR_STATUS_BIT_MASK);
1414+ /**blocknr &= ~REISER4_BLOCKNR_STATUS_BIT_MASK;*/
1415+ *blocknr |= REISER4_UNALLOCATED_STATUS_VALUE;
1416+ assert("zam-394", zlook(current_tree, blocknr) == NULL);
1417+}
1418+
1419+int assign_fake_blocknr_formatted(reiser4_block_nr * blocknr)
1420+{
1421+ assign_fake_blocknr(blocknr, 1);
1422+ grabbed2fake_allocated_formatted();
1423+ return 0;
1424+}
1425+
1426+/**
1427+ * fake_blocknr_unformatted
1428+ * @count: number of fake numbers to get
1429+ *
1430+ * Allocates @count fake block numbers which will be assigned to jnodes
1431+ */
1432+reiser4_block_nr fake_blocknr_unformatted(int count)
1433+{
1434+ reiser4_block_nr blocknr;
1435+
1436+ assign_fake_blocknr(&blocknr, count);
1437+ grabbed2fake_allocated_unformatted(count);
1438+
1439+ return blocknr;
1440+}
1441+
1442+/* adjust sb block counters, if real (on-disk) block allocation immediately
1443+ follows grabbing of free disk space. */
1444+void grabbed2used(reiser4_context *ctx, reiser4_super_info_data *sbinfo,
1445+ __u64 count)
1446+{
1447+ sub_from_ctx_grabbed(ctx, count);
1448+
1449+ spin_lock_reiser4_super(sbinfo);
1450+
1451+ sub_from_sb_grabbed(sbinfo, count);
1452+ sbinfo->blocks_used += count;
1453+
1454+ assert("nikita-2679", check_block_counters(ctx->super));
1455+
1456+ spin_unlock_reiser4_super(sbinfo);
1457+}
1458+
1459+/* adjust sb block counters when @count unallocated blocks get mapped to disk */
1460+void fake_allocated2used(reiser4_super_info_data *sbinfo, __u64 count,
1461+ reiser4_ba_flags_t flags)
1462+{
1463+ spin_lock_reiser4_super(sbinfo);
1464+
1465+ sub_from_sb_fake_allocated(sbinfo, count, flags);
1466+ sbinfo->blocks_used += count;
1467+
1468+ assert("nikita-2680", check_block_counters(reiser4_get_current_sb()));
1469+
1470+ spin_unlock_reiser4_super(sbinfo);
1471+}
1472+
1473+void flush_reserved2used(txn_atom * atom, __u64 count)
1474+{
1475+ reiser4_super_info_data *sbinfo;
1476+
1477+ assert("zam-787", atom != NULL);
1478+ assert_spin_locked(&(atom->alock));
1479+
1480+ sub_from_atom_flush_reserved_nolock(atom, (__u32) count);
1481+
1482+ sbinfo = get_current_super_private();
1483+ spin_lock_reiser4_super(sbinfo);
1484+
1485+ sub_from_sb_flush_reserved(sbinfo, count);
1486+ sbinfo->blocks_used += count;
1487+
1488+ assert("zam-789", check_block_counters(reiser4_get_current_sb()));
1489+
1490+ spin_unlock_reiser4_super(sbinfo);
1491+}
1492+
1493+/* update the per fs blocknr hint default value. */
1494+void
1495+update_blocknr_hint_default(const struct super_block *s,
1496+ const reiser4_block_nr * block)
1497+{
1498+ reiser4_super_info_data *sbinfo = get_super_private(s);
1499+
1500+ assert("nikita-3342", !blocknr_is_fake(block));
1501+
1502+ spin_lock_reiser4_super(sbinfo);
1503+ if (*block < sbinfo->block_count) {
1504+ sbinfo->blocknr_hint_default = *block;
1505+ } else {
1506+ warning("zam-676",
1507+ "block number %llu is too large to be used in a blocknr hint\n",
1508+ (unsigned long long)*block);
1509+ dump_stack();
1510+ DEBUGON(1);
1511+ }
1512+ spin_unlock_reiser4_super(sbinfo);
1513+}
1514+
1515+/* get current value of the default blocknr hint. */
1516+void get_blocknr_hint_default(reiser4_block_nr * result)
1517+{
1518+ reiser4_super_info_data *sbinfo = get_current_super_private();
1519+
1520+ spin_lock_reiser4_super(sbinfo);
1521+ *result = sbinfo->blocknr_hint_default;
1522+ assert("zam-677", *result < sbinfo->block_count);
1523+ spin_unlock_reiser4_super(sbinfo);
1524+}
1525+
1526+/* Allocate "real" disk blocks by calling a proper space allocation plugin
1527+ * method. Blocks are allocated in one contiguous disk region. The plugin
1528+ * independent part accounts blocks by subtracting the allocated amount from the grabbed
1529+ * or fake block counter and adding the same amount to the counter of allocated
1530+ * blocks.
1531+ *
1532+ * @hint -- a reiser4 blocknr hint object which contains further block
1533+ * allocation hints and parameters (search start, a stage of block
1534+ * which will be mapped to disk, etc.),
1535+ * @blk -- an out parameter for the beginning of the allocated region,
1536+ * @len -- in/out parameter, it should contain the maximum number of allocated
1537+ * blocks, after block allocation completes, it contains the length of
1538+ * allocated disk region.
1539+ * @flags -- see reiser4_ba_flags_t description.
1540+ *
1541+ * @return -- 0 if success, error code otherwise.
1542+ */
1543+int
1544+reiser4_alloc_blocks(reiser4_blocknr_hint * hint, reiser4_block_nr * blk,
1545+ reiser4_block_nr * len, reiser4_ba_flags_t flags)
1546+{
1547+ __u64 needed = *len;
1548+ reiser4_context *ctx;
1549+ reiser4_super_info_data *sbinfo;
1550+ int ret;
1551+
1552+ assert("zam-986", hint != NULL);
1553+
1554+ ctx = get_current_context();
1555+ sbinfo = get_super_private(ctx->super);
1556+
1557+ /* For write-optimized data we use default search start value, which is
1558+ * close to last write location. */
1559+ if (flags & BA_USE_DEFAULT_SEARCH_START) {
1560+ get_blocknr_hint_default(&hint->blk);
1561+ }
1562+
1563+ /* VITALY: allocator should grab this for internal/tx-lists/similar only. */
1564+/* VS-FIXME-HANS: why is this comment above addressed to vitaly (from vitaly)? */
1565+ if (hint->block_stage == BLOCK_NOT_COUNTED) {
1566+ ret = reiser4_grab_space_force(*len, flags);
1567+ if (ret != 0)
1568+ return ret;
1569+ }
1570+
1571+ ret =
1572+ sa_alloc_blocks(get_space_allocator(ctx->super), hint, (int)needed,
1573+ blk, len);
1574+
1575+ if (!ret) {
1576+ assert("zam-680", *blk < reiser4_block_count(ctx->super));
1577+ assert("zam-681",
1578+ *blk + *len <= reiser4_block_count(ctx->super));
1579+
1580+ if (flags & BA_PERMANENT) {
1581+ /* we assume that current atom exists at this moment */
1582+ txn_atom *atom = get_current_atom_locked();
1583+ atom->nr_blocks_allocated += *len;
1584+ spin_unlock_atom(atom);
1585+ }
1586+
1587+ switch (hint->block_stage) {
1588+ case BLOCK_NOT_COUNTED:
1589+ case BLOCK_GRABBED:
1590+ grabbed2used(ctx, sbinfo, *len);
1591+ break;
1592+ case BLOCK_UNALLOCATED:
1593+ fake_allocated2used(sbinfo, *len, flags);
1594+ break;
1595+ case BLOCK_FLUSH_RESERVED:
1596+ {
1597+ txn_atom *atom = get_current_atom_locked();
1598+ flush_reserved2used(atom, *len);
1599+ spin_unlock_atom(atom);
1600+ }
1601+ break;
1602+ default:
1603+ impossible("zam-531", "wrong block stage");
1604+ }
1605+ } else {
1606+ assert("zam-821",
1607+ ergo(hint->max_dist == 0
1608+ && !hint->backward, ret != -ENOSPC));
1609+ if (hint->block_stage == BLOCK_NOT_COUNTED)
1610+ grabbed2free(ctx, sbinfo, needed);
1611+ }
1612+
1613+ return ret;
1614+}
1615+
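/* Editor's sketch, not part of the original patch: a hypothetical caller of
 * reiser4_alloc_blocks() for space grabbed beforehand. Field and flag names
 * follow their uses above; the block is not compiled. */
#if 0
static int example_allocate_region(reiser4_block_nr *start, reiser4_block_nr *len)
{
	reiser4_blocknr_hint hint;
	int ret;

	blocknr_hint_init(&hint);
	hint.block_stage = BLOCK_GRABBED;	/* space was grabbed earlier */

	*len = 16;	/* ask for at most 16 blocks; actual length comes back in *len */
	ret = reiser4_alloc_blocks(&hint, start, len,
				   BA_USE_DEFAULT_SEARCH_START);
	blocknr_hint_done(&hint);
	return ret;
}
#endif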
1616+/* used -> fake_allocated -> grabbed -> free */
1617+
1618+/* adjust sb block counters when @count unallocated blocks get unmapped from
1619+ disk */
1620+static void
1621+used2fake_allocated(reiser4_super_info_data * sbinfo, __u64 count,
1622+ int formatted)
1623+{
1624+ spin_lock_reiser4_super(sbinfo);
1625+
1626+ if (formatted)
1627+ sbinfo->blocks_fake_allocated += count;
1628+ else
1629+ sbinfo->blocks_fake_allocated_unformatted += count;
1630+
1631+ sub_from_sb_used(sbinfo, count);
1632+
1633+ assert("nikita-2681", check_block_counters(reiser4_get_current_sb()));
1634+
1635+ spin_unlock_reiser4_super(sbinfo);
1636+}
1637+
1638+static void
1639+used2flush_reserved(reiser4_super_info_data * sbinfo, txn_atom * atom,
1640+ __u64 count, reiser4_ba_flags_t flags UNUSED_ARG)
1641+{
1642+ assert("nikita-2791", atom != NULL);
1643+ assert_spin_locked(&(atom->alock));
1644+
1645+ add_to_atom_flush_reserved_nolock(atom, (__u32) count);
1646+
1647+ spin_lock_reiser4_super(sbinfo);
1648+
1649+ sbinfo->blocks_flush_reserved += count;
1650+ /*add_to_sb_flush_reserved(sbinfo, count); */
1651+ sub_from_sb_used(sbinfo, count);
1652+
1653+ assert("nikita-2681", check_block_counters(reiser4_get_current_sb()));
1654+
1655+ spin_unlock_reiser4_super(sbinfo);
1656+}
1657+
1658+/* disk space virtually used by fake block numbers is counted as "grabbed" again */
1659+static void
1660+fake_allocated2grabbed(reiser4_context * ctx, reiser4_super_info_data * sbinfo,
1661+ __u64 count, reiser4_ba_flags_t flags)
1662+{
1663+ add_to_ctx_grabbed(ctx, count);
1664+
1665+ spin_lock_reiser4_super(sbinfo);
1666+
1667+ assert("nikita-2682", check_block_counters(ctx->super));
1668+
1669+ sbinfo->blocks_grabbed += count;
1670+ sub_from_sb_fake_allocated(sbinfo, count, flags & BA_FORMATTED);
1671+
1672+ assert("nikita-2683", check_block_counters(ctx->super));
1673+
1674+ spin_unlock_reiser4_super(sbinfo);
1675+}
1676+
1677+void fake_allocated2free(__u64 count, reiser4_ba_flags_t flags)
1678+{
1679+ reiser4_context *ctx;
1680+ reiser4_super_info_data *sbinfo;
1681+
1682+ ctx = get_current_context();
1683+ sbinfo = get_super_private(ctx->super);
1684+
1685+ fake_allocated2grabbed(ctx, sbinfo, count, flags);
1686+ grabbed2free(ctx, sbinfo, count);
1687+}
1688+
1689+void grabbed2free_mark(__u64 mark)
1690+{
1691+ reiser4_context *ctx;
1692+ reiser4_super_info_data *sbinfo;
1693+
1694+ ctx = get_current_context();
1695+ sbinfo = get_super_private(ctx->super);
1696+
1697+ assert("nikita-3007", (__s64) mark >= 0);
1698+ assert("nikita-3006", ctx->grabbed_blocks >= mark);
1699+ grabbed2free(ctx, sbinfo, ctx->grabbed_blocks - mark);
1700+}
1701+
1702+/**
1703+ * grabbed2free - adjust grabbed and free block counters
1704+ * @ctx: context to update grabbed block counter of
1705+ * @sbinfo: super block to update grabbed and free block counters of
1706+ * @count: number of blocks to adjust counters by
1707+ *
1708+ * Decreases context's and per filesystem's counters of grabbed
1709+ * blocks. Increases per filesystem's counter of free blocks.
1710+ */
1711+void grabbed2free(reiser4_context *ctx, reiser4_super_info_data *sbinfo,
1712+ __u64 count)
1713+{
1714+ sub_from_ctx_grabbed(ctx, count);
1715+
1716+ spin_lock_reiser4_super(sbinfo);
1717+
1718+ sub_from_sb_grabbed(sbinfo, count);
1719+ sbinfo->blocks_free += count;
1720+ assert("nikita-2684", check_block_counters(ctx->super));
1721+
1722+ spin_unlock_reiser4_super(sbinfo);
1723+}
1724+
1725+void grabbed2flush_reserved_nolock(txn_atom * atom, __u64 count)
1726+{
1727+ reiser4_context *ctx;
1728+ reiser4_super_info_data *sbinfo;
1729+
1730+ assert("vs-1095", atom);
1731+
1732+ ctx = get_current_context();
1733+ sbinfo = get_super_private(ctx->super);
1734+
1735+ sub_from_ctx_grabbed(ctx, count);
1736+
1737+ add_to_atom_flush_reserved_nolock(atom, count);
1738+
1739+ spin_lock_reiser4_super(sbinfo);
1740+
1741+ sbinfo->blocks_flush_reserved += count;
1742+ sub_from_sb_grabbed(sbinfo, count);
1743+
1744+ assert("vpf-292", check_block_counters(ctx->super));
1745+
1746+ spin_unlock_reiser4_super(sbinfo);
1747+}
1748+
1749+void grabbed2flush_reserved(__u64 count)
1750+{
1751+ txn_atom *atom = get_current_atom_locked();
1752+
1753+ grabbed2flush_reserved_nolock(atom, count);
1754+
1755+ spin_unlock_atom(atom);
1756+}
1757+
1758+void flush_reserved2grabbed(txn_atom * atom, __u64 count)
1759+{
1760+ reiser4_context *ctx;
1761+ reiser4_super_info_data *sbinfo;
1762+
1763+ assert("nikita-2788", atom != NULL);
1764+ assert_spin_locked(&(atom->alock));
1765+
1766+ ctx = get_current_context();
1767+ sbinfo = get_super_private(ctx->super);
1768+
1769+ add_to_ctx_grabbed(ctx, count);
1770+
1771+ sub_from_atom_flush_reserved_nolock(atom, (__u32) count);
1772+
1773+ spin_lock_reiser4_super(sbinfo);
1774+
1775+ sbinfo->blocks_grabbed += count;
1776+ sub_from_sb_flush_reserved(sbinfo, count);
1777+
1778+ assert("vpf-292", check_block_counters(ctx->super));
1779+
1780+ spin_unlock_reiser4_super(sbinfo);
1781+}
1782+
1783+/**
1784+ * all_grabbed2free - releases all blocks grabbed in context
1785+ *
1786+ * Decreases context's and super block's grabbed block counters by number of
1787+ * blocks grabbed by current context and increases super block's free block
1788+ * counter correspondingly.
1789+ */
1790+void all_grabbed2free(void)
1791+{
1792+ reiser4_context *ctx = get_current_context();
1793+
1794+ grabbed2free(ctx, get_super_private(ctx->super), ctx->grabbed_blocks);
1795+}
1796+
1797+/* adjust sb block counters when real (on-disk) blocks do not become
1798+ unallocated after freeing; @count blocks become "grabbed" */
1799+static void
1800+used2grabbed(reiser4_context * ctx, reiser4_super_info_data * sbinfo,
1801+ __u64 count)
1802+{
1803+ add_to_ctx_grabbed(ctx, count);
1804+
1805+ spin_lock_reiser4_super(sbinfo);
1806+
1807+ sbinfo->blocks_grabbed += count;
1808+ sub_from_sb_used(sbinfo, count);
1809+
1810+ assert("nikita-2685", check_block_counters(ctx->super));
1811+
1812+ spin_unlock_reiser4_super(sbinfo);
1813+}
1814+
1815+/* this used to be done through used2grabbed and grabbed2free */
1816+static void used2free(reiser4_super_info_data * sbinfo, __u64 count)
1817+{
1818+ spin_lock_reiser4_super(sbinfo);
1819+
1820+ sbinfo->blocks_free += count;
1821+ sub_from_sb_used(sbinfo, count);
1822+
1823+ assert("nikita-2685", check_block_counters(reiser4_get_current_sb()));
1824+
1825+ spin_unlock_reiser4_super(sbinfo);
1826+}
1827+
1828+#if REISER4_DEBUG
1829+
1830+/* check "allocated" state of given block range */
1831+static void
1832+reiser4_check_blocks(const reiser4_block_nr * start,
1833+ const reiser4_block_nr * len, int desired)
1834+{
1835+ sa_check_blocks(start, len, desired);
1836+}
1837+
1838+/* check "allocated" state of given block */
1839+void reiser4_check_block(const reiser4_block_nr * block, int desired)
1840+{
1841+ const reiser4_block_nr one = 1;
1842+
1843+ reiser4_check_blocks(block, &one, desired);
1844+}
1845+
1846+#endif
1847+
1848+/* The block deallocation function either does an actual deallocation through
1849+ the space plugin or stores the deleted block numbers in the atom's
1850+ delete_set data structure, depending on the BA_DEFER flag. */
1851+
1852+/* If the BA_DEFER bit is not set, @target_stage means the stage of the blocks
1853+ which will be deleted from the WORKING bitmap. They might be just unmapped
1854+ from disk, or freed while their disk space is still grabbed by the current
1855+ thread, or these blocks must not be counted in any reiser4 sb block
1856+ counters; see the block_stage_t comment. */
1857+
1858+/* The BA_FORMATTED bit is only used when BA_DEFER is not present: it
1859+ distinguishes blocks allocated for unformatted and formatted nodes. */
1859+
1860+int
1861+reiser4_dealloc_blocks(const reiser4_block_nr * start,
1862+ const reiser4_block_nr * len,
1863+ block_stage_t target_stage, reiser4_ba_flags_t flags)
1864+{
1865+ txn_atom *atom = NULL;
1866+ int ret;
1867+ reiser4_context *ctx;
1868+ reiser4_super_info_data *sbinfo;
1869+
1870+ ctx = get_current_context();
1871+ sbinfo = get_super_private(ctx->super);
1872+
1873+ if (REISER4_DEBUG) {
1874+ assert("zam-431", *len != 0);
1875+ assert("zam-432", *start != 0);
1876+ assert("zam-558", !blocknr_is_fake(start));
1877+
1878+ spin_lock_reiser4_super(sbinfo);
1879+ assert("zam-562", *start < sbinfo->block_count);
1880+ spin_unlock_reiser4_super(sbinfo);
1881+ }
1882+
1883+ if (flags & BA_DEFER) {
1884+ blocknr_set_entry *bsep = NULL;
1885+
1886+ /* store deleted block numbers in a blocknr set
1887+ data structure for further actual deletion */
1888+ do {
1889+ atom = get_current_atom_locked();
1890+ assert("zam-430", atom != NULL);
1891+
1892+ ret =
1893+ blocknr_set_add_extent(atom, &atom->delete_set,
1894+ &bsep, start, len);
1895+
1896+ if (ret == -ENOMEM)
1897+ return ret;
1898+
1899+ /* This loop might spin at most two times */
1900+ } while (ret == -E_REPEAT);
1901+
1902+ assert("zam-477", ret == 0);
1903+ assert("zam-433", atom != NULL);
1904+
1905+ spin_unlock_atom(atom);
1906+
1907+ } else {
1908+ assert("zam-425", get_current_super_private() != NULL);
1909+ sa_dealloc_blocks(get_space_allocator(ctx->super), *start,
1910+ *len);
1911+
1912+ if (flags & BA_PERMANENT) {
1913+ /* These blocks were counted as allocated; we have to
1914+ * revert that if the allocation is discarded. */
1915+ txn_atom *atom = get_current_atom_locked();
1916+ atom->nr_blocks_allocated -= *len;
1917+ spin_unlock_atom(atom);
1918+ }
1919+
1920+ switch (target_stage) {
1921+ case BLOCK_NOT_COUNTED:
1922+ assert("vs-960", flags & BA_FORMATTED);
1923+ /* VITALY: This is what was grabbed for internal/tx-lists/similar only */
1924+ used2free(sbinfo, *len);
1925+ break;
1926+
1927+ case BLOCK_GRABBED:
1928+ used2grabbed(ctx, sbinfo, *len);
1929+ break;
1930+
1931+ case BLOCK_UNALLOCATED:
1932+ used2fake_allocated(sbinfo, *len, flags & BA_FORMATTED);
1933+ break;
1934+
1935+ case BLOCK_FLUSH_RESERVED:{
1936+ txn_atom *atom;
1937+
1938+ atom = get_current_atom_locked();
1939+ used2flush_reserved(sbinfo, atom, *len,
1940+ flags & BA_FORMATTED);
1941+ spin_unlock_atom(atom);
1942+ break;
1943+ }
1944+ default:
1945+ impossible("zam-532", "wrong block stage");
1946+ }
1947+ }
1948+
1949+ return 0;
1950+}
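+
+/* A disabled sketch of the two deallocation modes described above; the
+ * single-block extent and the chosen stages are illustrative assumptions. */
+#if 0
+static int dealloc_example(const reiser4_block_nr *blk, int defer)
+{
+	const reiser4_block_nr one = 1;
+
+	if (defer)
+		/* only record @blk in the atom's delete_set; the WORKING
+		 * bitmap is updated by apply_dset() at commit time and
+		 * @target_stage is ignored */
+		return reiser4_dealloc_blocks(blk, &one, BLOCK_NOT_COUNTED,
+					      BA_DEFER);
+	/* immediate: deallocate now; the disk space returns to "grabbed" */
+	return reiser4_dealloc_blocks(blk, &one, BLOCK_GRABBED, BA_FORMATTED);
+}
+#endif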
1951+
1952+/* wrappers for block allocator plugin methods */
1953+int pre_commit_hook(void)
1954+{
1955+ assert("zam-502", get_current_super_private() != NULL);
1956+ sa_pre_commit_hook();
1957+ return 0;
1958+}
1959+
1960+/* an actor which applies delete set to block allocator data */
1961+static int
1962+apply_dset(txn_atom * atom UNUSED_ARG, const reiser4_block_nr * a,
1963+ const reiser4_block_nr * b, void *data UNUSED_ARG)
1964+{
1965+ reiser4_context *ctx;
1966+ reiser4_super_info_data *sbinfo;
1967+
1968+ __u64 len = 1;
1969+
1970+ ctx = get_current_context();
1971+ sbinfo = get_super_private(ctx->super);
1972+
1973+ assert("zam-877", atom->stage >= ASTAGE_PRE_COMMIT);
1974+ assert("zam-552", sbinfo != NULL);
1975+
1976+ if (b != NULL)
1977+ len = *b;
1978+
1979+ if (REISER4_DEBUG) {
1980+ spin_lock_reiser4_super(sbinfo);
1981+
1982+ assert("zam-554", *a < reiser4_block_count(ctx->super));
1983+ assert("zam-555", *a + len <= reiser4_block_count(ctx->super));
1984+
1985+ spin_unlock_reiser4_super(sbinfo);
1986+ }
1987+
1988+ sa_dealloc_blocks(&sbinfo->space_allocator, *a, len);
1989+ /* adjust sb block counters */
1990+ used2free(sbinfo, len);
1991+ return 0;
1992+}
1993+
1994+void post_commit_hook(void)
1995+{
1996+ txn_atom *atom;
1997+
1998+ atom = get_current_atom_locked();
1999+ assert("zam-452", atom->stage == ASTAGE_POST_COMMIT);
2000+ spin_unlock_atom(atom);
2001+
2002+ /* do the block deallocation which was deferred
2003+ until commit is done */
2004+ blocknr_set_iterator(atom, &atom->delete_set, apply_dset, NULL, 1);
2005+
2006+ assert("zam-504", get_current_super_private() != NULL);
2007+ sa_post_commit_hook();
2008+}
2009+
2010+void post_write_back_hook(void)
2011+{
2012+ assert("zam-504", get_current_super_private() != NULL);
2013+
2014+ sa_post_commit_hook();
2015+}
2016+
2017+/*
2018+ Local variables:
2019+ c-indentation-style: "K&R"
2020+ mode-name: "LC"
2021+ c-basic-offset: 8
2022+ tab-width: 8
2023+ fill-column: 120
2024+ scroll-step: 1
2025+ End:
2026+*/
2027Index: linux-2.6.16/fs/reiser4/block_alloc.h
2028===================================================================
2029--- /dev/null
2030+++ linux-2.6.16/fs/reiser4/block_alloc.h
2031@@ -0,0 +1,175 @@
2032+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
2033+
2034+#if !defined (__FS_REISER4_BLOCK_ALLOC_H__)
2035+#define __FS_REISER4_BLOCK_ALLOC_H__
2036+
2037+#include "dformat.h"
2038+#include "forward.h"
2039+
2040+#include <linux/types.h> /* for __u?? */
2041+#include <linux/fs.h>
2042+
2043+/* Mask which, when applied to a given block number, shows whether that block number is a fake one */
2044+#define REISER4_FAKE_BLOCKNR_BIT_MASK 0x8000000000000000ULL
2045+/* Mask which isolates the type of object this fake block number was assigned to */
2046+#define REISER4_BLOCKNR_STATUS_BIT_MASK 0xC000000000000000ULL
2047+
2048+/* The result of applying REISER4_BLOCKNR_STATUS_BIT_MASK should be compared
2049+ against these two values to tell whether the object is an unallocated or a
2050+ bitmap shadow object (a WORKING BITMAP block, see plugin/space/bitmap.c) */
2051+#define REISER4_UNALLOCATED_STATUS_VALUE 0xC000000000000000ULL
2052+#define REISER4_BITMAP_BLOCKS_STATUS_VALUE 0x8000000000000000ULL
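+
+/* A disabled restatement of the bit logic above; the real predicate is
+ * blocknr_is_fake(), declared later in this header. */
+#if 0
+static inline int blocknr_is_unallocated(reiser4_block_nr blk)
+{
+	/* top two bits 11 -> unallocated in-memory object */
+	return (blk & REISER4_BLOCKNR_STATUS_BIT_MASK) ==
+		REISER4_UNALLOCATED_STATUS_VALUE;
+}
+
+static inline int blocknr_is_bitmap_shadow(reiser4_block_nr blk)
+{
+	/* top two bits 10 -> WORKING bitmap block */
+	return (blk & REISER4_BLOCKNR_STATUS_BIT_MASK) ==
+		REISER4_BITMAP_BLOCKS_STATUS_VALUE;
+}
+#endif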
2053+
2054+/* specification how block allocation was counted in sb block counters */
2055+typedef enum {
2056+ BLOCK_NOT_COUNTED = 0, /* reiser4 has no info about this block yet */
2057+ BLOCK_GRABBED = 1, /* free space grabbed for further allocation
2058+ of this block */
2059+ BLOCK_FLUSH_RESERVED = 2, /* block is reserved for flush needs. */
2060+ BLOCK_UNALLOCATED = 3, /* block is used for existing in-memory object
2061+ ( unallocated formatted or unformatted
2062+ node) */
2063+ BLOCK_ALLOCATED = 4 /* block is mapped to disk, real on-disk block
2064+ number assigned */
2065+} block_stage_t;
2066+
2067+/* a hint for block allocator */
2068+struct reiser4_blocknr_hint {
2069+ /* FIXME: I think we want to add a longterm lock on the bitmap block here. This
2070+ is to prevent jnode_flush() calls from interleaving allocations on the same
2071+ bitmap, once a hint is established. */
2072+
2073+ /* search start hint */
2074+ reiser4_block_nr blk;
2075+ /* if not zero, it is a region size we search for free blocks in */
2076+ reiser4_block_nr max_dist;
2077+ /* level for allocation; it may be useful to have branch-level and
2078+ higher levels write-optimized. */
2079+ tree_level level;
2080+ /* block allocator assumes that blocks, which will be mapped to disk,
2081+ are in this specified block_stage */
2082+ block_stage_t block_stage;
2083+ /* If backward = 1, allocate blocks in the backward direction, from
2084+ * the end of the disk to its beginning. */
2085+ unsigned int backward:1;
2086+
2087+};
2088+
2089+/* These flags control block allocation/deallocation behavior */
2090+enum reiser4_ba_flags {
2091+ /* do allocations from the reserved (5%) area */
2092+ BA_RESERVED = (1 << 0),
2093+
2094+ /* block allocator can do commit trying to recover free space */
2095+ BA_CAN_COMMIT = (1 << 1),
2096+
2097+ /* the operation will be applied to a formatted block */
2098+ BA_FORMATTED = (1 << 2),
2099+
2100+ /* defer actual block freeing until transaction commit */
2101+ BA_DEFER = (1 << 3),
2102+
2103+ /* allocate blocks for permanent fs objects (formatted or unformatted),
2104+ not wandered or log blocks */
2105+ BA_PERMANENT = (1 << 4),
2106+
2107+ /* grab space even if it was disabled */
2108+ BA_FORCE = (1 << 5),
2109+
2110+ /* use default start value for free blocks search. */
2111+ BA_USE_DEFAULT_SEARCH_START = (1 << 6)
2112+};
2113+
2114+typedef enum reiser4_ba_flags reiser4_ba_flags_t;
2115+
2116+extern void blocknr_hint_init(reiser4_blocknr_hint * hint);
2117+extern void blocknr_hint_done(reiser4_blocknr_hint * hint);
2118+extern void update_blocknr_hint_default(const struct super_block *,
2119+ const reiser4_block_nr *);
2120+extern void get_blocknr_hint_default(reiser4_block_nr *);
2121+
2122+extern reiser4_block_nr reiser4_fs_reserved_space(struct super_block *super);
2123+
2124+int assign_fake_blocknr_formatted(reiser4_block_nr *);
2125+reiser4_block_nr fake_blocknr_unformatted(int);
2126+
2127+/* free -> grabbed -> fake_allocated -> used */
2128+
2129+int reiser4_grab_space(__u64 count, reiser4_ba_flags_t flags);
2130+void all_grabbed2free(void);
2131+void grabbed2free(reiser4_context *, reiser4_super_info_data *, __u64 count);
2132+void fake_allocated2free(__u64 count, reiser4_ba_flags_t flags);
2133+void grabbed2flush_reserved_nolock(txn_atom * atom, __u64 count);
2134+void grabbed2flush_reserved(__u64 count);
2135+int reiser4_alloc_blocks(reiser4_blocknr_hint * hint,
2136+ reiser4_block_nr * start,
2137+ reiser4_block_nr * len, reiser4_ba_flags_t flags);
2138+int reiser4_dealloc_blocks(const reiser4_block_nr *,
2139+ const reiser4_block_nr *,
2140+ block_stage_t, reiser4_ba_flags_t flags);
2141+
2142+static inline int reiser4_alloc_block(reiser4_blocknr_hint * hint,
2143+ reiser4_block_nr * start,
2144+ reiser4_ba_flags_t flags)
2145+{
2146+ reiser4_block_nr one = 1;
2147+ return reiser4_alloc_blocks(hint, start, &one, flags);
2148+}
2149+
2150+static inline int reiser4_dealloc_block(const reiser4_block_nr * block,
2151+ block_stage_t stage,
2152+ reiser4_ba_flags_t flags)
2153+{
2154+ const reiser4_block_nr one = 1;
2155+ return reiser4_dealloc_blocks(block, &one, stage, flags);
2156+}
2157+
2158+#define reiser4_grab_space_force(count, flags) \
2159+ reiser4_grab_space(count, flags | BA_FORCE)
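+
+/* A disabled sketch of the free -> grabbed -> used path the declarations
+ * above implement, assuming a running reiser4 context; the function name and
+ * flow are illustrative, not a real call site. */
+#if 0
+static int grab_and_alloc_example(void)
+{
+	reiser4_blocknr_hint hint;
+	reiser4_block_nr blk;
+	int ret;
+
+	ret = reiser4_grab_space(1, 0);		/* free -> grabbed */
+	if (ret != 0)
+		return ret;
+
+	blocknr_hint_init(&hint);
+	hint.block_stage = BLOCK_GRABBED;	/* space was grabbed above */
+	ret = reiser4_alloc_block(&hint, &blk, BA_PERMANENT); /* -> used */
+	blocknr_hint_done(&hint);
+	if (ret != 0)
+		all_grabbed2free();	/* releases everything this context grabbed */
+	return ret;
+}
+#endif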
2160+
2161+extern void grabbed2free_mark(__u64 mark);
2162+extern int reiser4_grab_reserved(struct super_block *,
2163+ __u64, reiser4_ba_flags_t);
2164+extern void reiser4_release_reserved(struct super_block *super);
2165+
2166+/* grabbed -> fake_allocated */
2167+
2168+/* fake_allocated -> used */
2169+
2170+/* used -> fake_allocated -> grabbed -> free */
2171+
2172+extern void flush_reserved2grabbed(txn_atom * atom, __u64 count);
2173+
2174+extern int blocknr_is_fake(const reiser4_block_nr * da);
2175+
2176+extern void grabbed2cluster_reserved(int count);
2177+extern void cluster_reserved2grabbed(int count);
2178+extern void cluster_reserved2free(int count);
2179+
2180+extern int check_block_counters(const struct super_block *);
2181+
2182+#if REISER4_DEBUG
2183+
2184+extern void reiser4_check_block(const reiser4_block_nr *, int);
2185+
2186+#else
2187+
2188+# define reiser4_check_block(beg, val) noop
2189+
2190+#endif
2191+
2192+extern int pre_commit_hook(void);
2193+extern void post_commit_hook(void);
2194+extern void post_write_back_hook(void);
2195+
2196+#endif /* __FS_REISER4_BLOCK_ALLOC_H__ */
2197+
2198+/* Make Linus happy.
2199+ Local variables:
2200+ c-indentation-style: "K&R"
2201+ mode-name: "LC"
2202+ c-basic-offset: 8
2203+ tab-width: 8
2204+ fill-column: 120
2205+ End:
2206+*/
2207Index: linux-2.6.16/fs/reiser4/blocknrset.c
2208===================================================================
2209--- /dev/null
2210+++ linux-2.6.16/fs/reiser4/blocknrset.c
2211@@ -0,0 +1,368 @@
2212+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
2213+
2214+/* This file contains code for various block number sets used by the atom to
2215+ track the deleted set and wandered block mappings. */
2216+
2217+#include "debug.h"
2218+#include "dformat.h"
2219+#include "txnmgr.h"
2220+#include "context.h"
2221+
2222+#include <linux/slab.h>
2223+
2224+/* The data structure for storing unordered block number sets is a list of
2225+ elements, each of which contains an array of block numbers and/or an array
2226+ of block number pairs. Such an element, called blocknr_set_entry, stores
2227+ single block numbers from the beginning of its data field and pairs
2228+ (extents) from its end. The ->nr_singles and ->nr_pairs fields count the
2229+ numbers of single blocks and of pairs.
2230+
2231+ +------------------- blocknr_set_entry->data ------------------+
2232+ |block1|block2| ... <free space> ... |pair3|pair2|pair1|
2233+ +--------------------------------------------------------------+
2234+
2235+ When the current blocknr_set_entry is full, a new one is allocated. */
2236+
2237+/* Usage examples: blocknr sets are used in reiser4 for storing the atom's
2238+ * delete set (single blocks and block extents); there a blocknr pair
2239+ * represents an extent. The atom's wandered map is also stored as a blocknr
2240+ * set; its pairs represent a (real block) -> (wandered block) mapping. */
2241+
2242+typedef struct blocknr_pair blocknr_pair;
2243+
2244+/* The total size of a blocknr_set_entry. */
2245+#define BLOCKNR_SET_ENTRY_SIZE 128
2246+
2247+/* The number of block number slots that fit in the blocknr data area. */
2248+#define BLOCKNR_SET_ENTRIES_NUMBER \
2249+ ((BLOCKNR_SET_ENTRY_SIZE - \
2250+ 2 * sizeof (unsigned) - \
2251+ sizeof(struct list_head)) / \
2252+ sizeof(reiser4_block_nr))
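+
+/* With the 128-byte entry size above this works out, on a 64-bit kernel
+ * (sizeof(unsigned) == 4, sizeof(struct list_head) == 16), to
+ * (128 - 8 - 16) / 8 = 13 slots per entry, i.e. 13 singles or 6 pairs plus
+ * one single; the same formula gives 14 slots on a 32-bit kernel. */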
2253+
2254+/* An entry of the blocknr_set */
2255+struct blocknr_set_entry {
2256+ unsigned nr_singles;
2257+ unsigned nr_pairs;
2258+ struct list_head link;
2259+ reiser4_block_nr entries[BLOCKNR_SET_ENTRIES_NUMBER];
2260+};
2261+
2262+/* A pair of blocks as recorded in the blocknr_set_entry data. */
2263+struct blocknr_pair {
2264+ reiser4_block_nr a;
2265+ reiser4_block_nr b;
2266+};
2267+
2268+/* Return the number of blocknr slots available in a blocknr_set_entry. */
2269+/* Audited by: green(2002.06.11) */
2270+static unsigned bse_avail(blocknr_set_entry * bse)
2271+{
2272+ unsigned used = bse->nr_singles + 2 * bse->nr_pairs;
2273+
2274+ assert("jmacd-5088", BLOCKNR_SET_ENTRIES_NUMBER >= used);
2275+ cassert(sizeof(blocknr_set_entry) == BLOCKNR_SET_ENTRY_SIZE);
2276+
2277+ return BLOCKNR_SET_ENTRIES_NUMBER - used;
2278+}
2279+
2280+/* Initialize a blocknr_set_entry. */
2281+static void bse_init(blocknr_set_entry *bse)
2282+{
2283+ bse->nr_singles = 0;
2284+ bse->nr_pairs = 0;
2285+ INIT_LIST_HEAD(&bse->link);
2286+}
2287+
2288+/* Allocate and initialize a blocknr_set_entry. */
2289+/* Audited by: green(2002.06.11) */
2290+static blocknr_set_entry *bse_alloc(void)
2291+{
2292+ blocknr_set_entry *e;
2293+
2294+ if ((e = (blocknr_set_entry *) kmalloc(sizeof(blocknr_set_entry),
2295+ get_gfp_mask())) == NULL)
2296+ return NULL;
2297+
2298+ bse_init(e);
2299+
2300+ return e;
2301+}
2302+
2303+/* Free a blocknr_set_entry. */
2304+/* Audited by: green(2002.06.11) */
2305+static void bse_free(blocknr_set_entry * bse)
2306+{
2307+ kfree(bse);
2308+}
2309+
2310+/* Add a block number to a blocknr_set_entry */
2311+/* Audited by: green(2002.06.11) */
2312+static void
2313+bse_put_single(blocknr_set_entry * bse, const reiser4_block_nr * block)
2314+{
2315+ assert("jmacd-5099", bse_avail(bse) >= 1);
2316+
2317+ bse->entries[bse->nr_singles++] = *block;
2318+}
2319+
2320+/* Get a pair of block numbers */
2321+/* Audited by: green(2002.06.11) */
2322+static inline blocknr_pair *bse_get_pair(blocknr_set_entry * bse, unsigned pno)
2323+{
2324+ assert("green-1", BLOCKNR_SET_ENTRIES_NUMBER >= 2 * (pno + 1));
2325+
2326+ return (blocknr_pair *) (bse->entries + BLOCKNR_SET_ENTRIES_NUMBER -
2327+ 2 * (pno + 1));
2328+}
2329+
2330+/* Add a pair of block numbers to a blocknr_set_entry */
2331+/* Audited by: green(2002.06.11) */
2332+static void
2333+bse_put_pair(blocknr_set_entry * bse, const reiser4_block_nr * a,
2334+ const reiser4_block_nr * b)
2335+{
2336+ blocknr_pair *pair;
2337+
2338+ assert("jmacd-5100", bse_avail(bse) >= 2 && a != NULL && b != NULL);
2339+
2340+ pair = bse_get_pair(bse, bse->nr_pairs++);
2341+
2342+ pair->a = *a;
2343+ pair->b = *b;
2344+}
2345+
2346+/* Add either a block or pair of blocks to the block number set. The first
2347+ blocknr (@a) must be non-NULL. If @b is NULL a single blocknr is added, if
2348+ @b is non-NULL a pair is added. The block number set belongs to atom, and
2349+ the call is made with the atom lock held. There may not be enough space in
2350+ the current blocknr_set_entry. If new_bsep points to a non-NULL
2351+ blocknr_set_entry then it will be added to the blocknr_set and new_bsep
2352+ will be set to NULL. If new_bsep contains NULL then the atom lock will be
2353+ released and a new bse will be allocated in new_bsep. E_REPEAT will be
2354+ returned with the atom unlocked for the operation to be tried again. If
2355+ the operation succeeds, 0 is returned. If new_bsep is non-NULL and not
2356+ used during the call, it will be freed automatically. */
2357+static int blocknr_set_add(txn_atom *atom, blocknr_set *bset,
2358+ blocknr_set_entry **new_bsep, const reiser4_block_nr *a,
2359+ const reiser4_block_nr *b)
2360+{
2361+ blocknr_set_entry *bse;
2362+ unsigned entries_needed;
2363+
2364+ assert("jmacd-5101", a != NULL);
2365+
2366+ entries_needed = (b == NULL) ? 1 : 2;
2367+ if (list_empty(&bset->entries) ||
2368+ bse_avail(list_entry(bset->entries.next, blocknr_set_entry, link)) < entries_needed) {
2369+ /* See if a bse was previously allocated. */
2370+ if (*new_bsep == NULL) {
2371+ spin_unlock_atom(atom);
2372+ *new_bsep = bse_alloc();
2373+ return (*new_bsep != NULL) ? -E_REPEAT :
2374+ RETERR(-ENOMEM);
2375+ }
2376+
2377+ /* Put it on the head of the list. */
2378+ list_add(&((*new_bsep)->link), &bset->entries);
2379+
2380+ *new_bsep = NULL;
2381+ }
2382+
2383+ /* Add the single or pair. */
2384+ bse = list_entry(bset->entries.next, blocknr_set_entry, link);
2385+ if (b == NULL) {
2386+ bse_put_single(bse, a);
2387+ } else {
2388+ bse_put_pair(bse, a, b);
2389+ }
2390+
2391+ /* If new_bsep is non-NULL then there was an allocation race, free this copy. */
2392+ if (*new_bsep != NULL) {
2393+ bse_free(*new_bsep);
2394+ *new_bsep = NULL;
2395+ }
2396+
2397+ return 0;
2398+}
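+
+/* A disabled sketch of the retry protocol described above, mirroring the
+ * delete-set loop in block_alloc.c; get_current_atom_locked() re-takes the
+ * atom lock after the -E_REPEAT round trip. */
+#if 0
+static int record_extent_example(const reiser4_block_nr *start,
+				 const reiser4_block_nr *len)
+{
+	blocknr_set_entry *bsep = NULL;
+	txn_atom *atom;
+	int ret;
+
+	do {
+		atom = get_current_atom_locked();
+		ret = blocknr_set_add_extent(atom, &atom->delete_set,
+					     &bsep, start, len);
+		if (ret == -ENOMEM)
+			return ret;	/* atom is already unlocked here */
+		/* the loop spins at most twice */
+	} while (ret == -E_REPEAT);
+	spin_unlock_atom(atom);
+	return 0;
+}
+#endif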
2399+
2400+/* Add an extent to the block set. If the length is 1, it is treated as a
2401+ single block (e.g., reiser4_set_add_block). */
2402+/* Audited by: green(2002.06.11) */
2403+/* Auditor note: Entire call chain cannot hold any spinlocks, because
2404+ kmalloc might schedule. The only exception is atom spinlock, which is
2405+ properly freed. */
2406+int
2407+blocknr_set_add_extent(txn_atom * atom,
2408+ blocknr_set * bset,
2409+ blocknr_set_entry ** new_bsep,
2410+ const reiser4_block_nr * start,
2411+ const reiser4_block_nr * len)
2412+{
2413+ assert("jmacd-5102", start != NULL && len != NULL && *len > 0);
2414+ return blocknr_set_add(atom, bset, new_bsep, start,
2415+ *len == 1 ? NULL : len);
2416+}
2417+
2418+/* Add a block pair to the block set. It adds exactly a pair, which is checked
2419+ * by an assertion that both arguments are not null.*/
2420+/* Audited by: green(2002.06.11) */
2421+/* Auditor note: Entire call chain cannot hold any spinlocks, because
2422+ kmalloc might schedule. The only exception is atom spinlock, which is
2423+ properly freed. */
2424+int
2425+blocknr_set_add_pair(txn_atom * atom,
2426+ blocknr_set * bset,
2427+ blocknr_set_entry ** new_bsep, const reiser4_block_nr * a,
2428+ const reiser4_block_nr * b)
2429+{
2430+ assert("jmacd-5103", a != NULL && b != NULL);
2431+ return blocknr_set_add(atom, bset, new_bsep, a, b);
2432+}
2433+
2434+/* Initialize a blocknr_set. */
2435+void blocknr_set_init(blocknr_set *bset)
2436+{
2437+ INIT_LIST_HEAD(&bset->entries);
2438+}
2439+
2440+/* Release the entries of a blocknr_set. */
2441+void blocknr_set_destroy(blocknr_set *bset)
2442+{
2443+ blocknr_set_entry *bse;
2444+
2445+ while (!list_empty_careful(&bset->entries)) {
2446+ bse = list_entry(bset->entries.next, blocknr_set_entry, link);
2447+ list_del_init(&bse->link);
2448+ bse_free(bse);
2449+ }
2450+}
2451+
2452+/* Merge blocknr_set entries out of @from into @into. */
2453+/* Audited by: green(2002.06.11) */
2454+/* Auditor comments: This merge does not know whether the merged sets contain
2455+ block pairs (as for wandered sets) or extents, so it cannot really merge
2456+ overlapping ranges if there are any. So I believe it may lead to
2457+ some blocks appearing several times in one blocknr_set. To help
2458+ debug such problems it might help to check for duplicate entries when
2459+ the set is actually processed. Testing this kind of thing right here is
2460+ also complicated by the fact that these sets are not sorted, and going
2461+ through the whole set on each element addition would be a CPU-heavy task */
2462+void blocknr_set_merge(blocknr_set * from, blocknr_set * into)
2463+{
2464+ blocknr_set_entry *bse_into = NULL;
2465+
2466+ /* If @from is empty, no work to perform. */
2467+ if (list_empty_careful(&from->entries)) {
2468+ return;
2469+ }
2470+
2471+ /* If @into is not empty, try merging partial-entries. */
2472+ if (!list_empty_careful(&into->entries)) {
2473+
2474+ /* Neither set is empty, pop the front two members and try to combine them. */
2475+ blocknr_set_entry *bse_from;
2476+ unsigned into_avail;
2477+
2478+ bse_into = list_entry(into->entries.next, blocknr_set_entry, link);
2479+ list_del_init(&bse_into->link);
2480+ bse_from = list_entry(from->entries.next, blocknr_set_entry, link);
2481+ list_del_init(&bse_from->link);
2482+
2483+ /* Combine singles. */
2484+ for (into_avail = bse_avail(bse_into);
2485+ into_avail != 0 && bse_from->nr_singles != 0;
2486+ into_avail -= 1) {
2487+ bse_put_single(bse_into,
2488+ &bse_from->entries[--bse_from->
2489+ nr_singles]);
2490+ }
2491+
2492+ /* Combine pairs. */
2493+ for (; into_avail > 1 && bse_from->nr_pairs != 0;
2494+ into_avail -= 2) {
2495+ blocknr_pair *pair =
2496+ bse_get_pair(bse_from, --bse_from->nr_pairs);
2497+ bse_put_pair(bse_into, &pair->a, &pair->b);
2498+ }
2499+
2500+ /* If bse_from is empty, delete it now. */
2501+ if (bse_avail(bse_from) == BLOCKNR_SET_ENTRIES_NUMBER) {
2502+ bse_free(bse_from);
2503+ } else {
2504+ /* Otherwise, bse_into is full or nearly full (e.g.,
2505+ it could have one slot avail and bse_from has one
2506+ pair left). Push it back onto the list. bse_from
2507+ becomes bse_into, which will be the new partial. */
2508+ list_add(&bse_into->link, &into->entries);
2509+ bse_into = bse_from;
2510+ }
2511+ }
2512+
2513+ /* Splice lists together. */
2514+ list_splice_init(&from->entries, into->entries.prev);
2515+
2516+ /* Add the partial entry back to the head of the list. */
2517+ if (bse_into != NULL) {
2518+ list_add(&bse_into->link, &into->entries);
2519+ }
2520+}
2521+
2522+/* Iterate over all blocknr set elements. */
2523+int blocknr_set_iterator(txn_atom *atom, blocknr_set *bset,
2524+ blocknr_set_actor_f actor, void *data, int delete)
2525+{
2526+
2527+ blocknr_set_entry *entry;
2528+
2529+ assert("zam-429", atom != NULL);
2530+ assert("zam-430", atom_is_protected(atom));
2531+ assert("zam-431", bset != 0);
2532+ assert("zam-432", actor != NULL);
2533+
2534+ entry = list_entry(bset->entries.next, blocknr_set_entry, link);
2535+ while (&bset->entries != &entry->link) {
2536+ blocknr_set_entry *tmp = list_entry(entry->link.next, blocknr_set_entry, link);
2537+ unsigned int i;
2538+ int ret;
2539+
2540+ for (i = 0; i < entry->nr_singles; i++) {
2541+ ret = actor(atom, &entry->entries[i], NULL, data);
2542+
2543+ /* We can't break a loop if delete flag is set. */
2544+ if (ret != 0 && !delete)
2545+ return ret;
2546+ }
2547+
2548+ for (i = 0; i < entry->nr_pairs; i++) {
2549+ struct blocknr_pair *ab;
2550+
2551+ ab = bse_get_pair(entry, i);
2552+
2553+ ret = actor(atom, &ab->a, &ab->b, data);
2554+
2555+ if (ret != 0 && !delete)
2556+ return ret;
2557+ }
2558+
2559+ if (delete) {
2560+ list_del(&entry->link);
2561+ bse_free(entry);
2562+ }
2563+
2564+ entry = tmp;
2565+ }
2566+
2567+ return 0;
2568+}
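+
+/* A disabled sketch of the actor pattern served by the iterator above
+ * (compare apply_dset() in block_alloc.c); here the actor merely counts
+ * entries into *data. */
+#if 0
+static int count_actor(txn_atom *atom, const reiser4_block_nr *a,
+		       const reiser4_block_nr *b, void *data)
+{
+	/* @b is NULL for a single block, otherwise it is the second half
+	 * of the pair */
+	(*(unsigned long *)data)++;
+	return 0;
+}
+
+/* a caller, with the atom lock held, would do:
+ *
+ *	unsigned long n = 0;
+ *	blocknr_set_iterator(atom, &atom->delete_set, count_actor, &n, 0);
+ */
+#endif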
2569+
2570+/*
2571+ * Local variables:
2572+ * c-indentation-style: "K&R"
2573+ * mode-name: "LC"
2574+ * c-basic-offset: 8
2575+ * tab-width: 8
2576+ * fill-column: 79
2577+ * scroll-step: 1
2578+ * End:
2579+ */
2580Index: linux-2.6.16/fs/reiser4/carry.c
2581===================================================================
2582--- /dev/null
2583+++ linux-2.6.16/fs/reiser4/carry.c
2584@@ -0,0 +1,1381 @@
2585+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
2586+/* Functions to "carry" tree modification(s) upward. */
2587+/* Tree is modified one level at a time. As we modify a level we accumulate a
2588+ set of changes that need to be propagated to the next level. We manage
2589+ node locking such that any searches that collide with carrying are
2590+ restarted, from the root if necessary.
2591+
2592+ Insertion of a new item may result in items being moved among nodes and
2593+ this requires the delimiting key to be updated at the least common parent
2594+ of the nodes modified to preserve search tree invariants. Also, insertion
2595+ may require allocation of a new node. A pointer to the new node has to be
2596+ inserted into some node on the parent level, etc.
2597+
2598+ Tree carrying is meant to be analogous to arithmetic carrying.
2599+
2600+ A carry operation is always associated with some node (&carry_node).
2601+
2602+ Carry process starts with some initial set of operations to be performed
2603+ and an initial set of already locked nodes. Operations are performed one
2604+ by one. Performing each single operation has following possible effects:
2605+
2606+ - content of carry node associated with operation is modified
2607+ - new carry nodes are locked and involved into carry process on this level
2608+ - new carry operations are posted to the next level
2609+
2610+ After all carry operations on this level are done, process is repeated for
2611+ the accumulated sequence on carry operations for the next level. This
2612+ starts by trying to lock (in left to right order) all carry nodes
2613+ associated with carry operations on the parent level. After this, we decide
2614+ whether more nodes are required on the left of already locked set. If so,
2615+ all locks taken on the parent level are released, new carry nodes are
2616+ added, and locking process repeats.
2617+
2618+ It may happen that the balancing process fails owing to an unrecoverable
2619+ error on some of the upper levels of the tree (possible causes are an io
2620+ error, failure to allocate a new node, etc.). In this case we should unmount
2621+ the filesystem, rebooting if it is the root, and possibly advise the use of fsck.
2622+
2623+ USAGE:
2624+
2625+ int some_tree_operation( znode *node, ... )
2626+ {
2627+ // Allocate a pool of carry objects -- operations and nodes -- plus
2628+ // the three carry levels used by carry(). Most carry processes will
2629+ // only take objects from here, without further dynamic allocation.
2630+
2631+I feel uneasy about this pool. It adds to code complexity, I understand why it exists, but.... -Hans
2632+
2633+ carry_pool *pool;
2634+ carry_level *lowest_level;
2635+ carry_op *op;
2636+
2637+ pool = init_carry_pool( sizeof( *pool ) + 3 * sizeof( carry_level ) );
2638+ if( IS_ERR( pool ) )
2639+ return PTR_ERR( pool );
2640+ lowest_level = ( carry_level * )( pool + 1 );
2641+ init_carry_level( lowest_level, pool );
2642+
2643+ // operation may be one of:
2644+ // COP_INSERT --- insert new item into node
2645+ // COP_CUT --- remove part of or whole node
2646+ // COP_PASTE --- increase size of item
2647+ // COP_DELETE --- delete pointer from parent node
2648+ // COP_UPDATE --- update delimiting key in least
2649+ // common ancestor of two
2650+
2651+ op = post_carry( lowest_level, operation, node, 0 );
2652+ if( IS_ERR( op ) || ( op == NULL ) ) {
2653+ handle error
2654+ } else {
2655+ // fill in remaining fields in @op, according to carry.h:carry_op
2656+ result = carry( lowest_level, NULL );
2657+ }
2658+ done_carry_pool( pool );
2659+ }
2657+
2658+ When you are implementing node plugin method that participates in carry
2659+ (shifting, insertion, deletion, etc.), do the following:
2660+
2661+ int foo_node_method( znode *node, ..., carry_level *todo )
2662+ {
2663+ carry_op *op;
2664+
2665+ ....
2666+
2667+ // note, that last argument to post_carry() is non-null
2668+ // here, because @op is to be applied to the parent of @node, rather
2669+ // than to the @node itself as in the previous case.
2670+
2671+ op = node_post_carry( todo, operation, node, 1 );
2672+ // fill in remaining fields in @op, according to carry.h:carry_op
2673+
2674+ ....
2675+
2676+ }
2677+
2678+ BATCHING:
2679+
2680+ One of the main advantages of the level-by-level balancing implemented here
2681+ is the ability to batch updates on a parent level and to perform them more
2682+ efficiently as a result.
2683+
2684+ Description To Be Done (TBD).
2685+
2686+ DIFFICULTIES AND SUBTLE POINTS:
2687+
2688+ 1. complex plumbing is required, because:
2689+
2690+ a. effective allocation through pools is needed
2691+
2692+ b. target of operation is not exactly known when operation is
2693+ posted. This is worked around through bitfields in &carry_node and
2694+ logic in lock_carry_node()
2695+
2696+ c. of interaction with locking code: node should be added into sibling
2697+ list when pointer to it is inserted into its parent, which is some time
2698+ after node was created. Between these moments, node is somewhat in
2699+ suspended state and is only registered in the carry lists
2700+
2701+ 2. whole balancing logic is implemented here, in particular, insertion
2702+ logic is coded in make_space().
2703+
2704+ 3. special cases like insertion (add_tree_root()) or deletion
2705+ (kill_tree_root()) of tree root and morphing of paste into insert
2706+ (insert_paste()) have to be handled.
2707+
2708+ 4. there is non-trivial interdependency between allocation of new nodes
2709+ and almost everything else. This is mainly due to the (1.c) above. I shall
2710+ write about this later.
2711+
2712+*/
2713+
2714+#include "forward.h"
2715+#include "debug.h"
2716+#include "key.h"
2717+#include "coord.h"
2718+#include "plugin/item/item.h"
2719+#include "plugin/item/extent.h"
2720+#include "plugin/node/node.h"
2721+#include "jnode.h"
2722+#include "znode.h"
2723+#include "tree_mod.h"
2724+#include "tree_walk.h"
2725+#include "block_alloc.h"
2726+#include "pool.h"
2727+#include "tree.h"
2728+#include "carry.h"
2729+#include "carry_ops.h"
2730+#include "super.h"
2731+#include "reiser4.h"
2732+
2733+#include <linux/types.h>
2734+
2735+/* level locking/unlocking */
2736+static int lock_carry_level(carry_level * level);
2737+static void unlock_carry_level(carry_level * level, int failure);
2738+static void done_carry_level(carry_level * level);
2739+static void unlock_carry_node(carry_level * level, carry_node * node, int fail);
2740+
2741+int lock_carry_node(carry_level * level, carry_node * node);
2742+int lock_carry_node_tail(carry_node * node);
2743+
2744+/* carry processing proper */
2745+static int carry_on_level(carry_level * doing, carry_level * todo);
2746+
2747+static carry_op *add_op(carry_level * level, pool_ordering order,
2748+ carry_op * reference);
2749+
2750+/* handlers for carry operations. */
2751+
2752+static void fatal_carry_error(carry_level * doing, int ecode);
2753+static int add_new_root(carry_level * level, carry_node * node, znode * fake);
2754+
2755+
2756+static void print_level(const char *prefix, carry_level * level);
2757+
2758+#if REISER4_DEBUG
2759+typedef enum {
2760+ CARRY_TODO,
2761+ CARRY_DOING
2762+} carry_queue_state;
2763+static int carry_level_invariant(carry_level * level, carry_queue_state state);
2764+#endif
2765+
2766+/* main entry point for tree balancing.
2767+
2768+ Tree carry performs operations from @doing and while doing so accumulates
2769+ information about operations to be performed on the next level ("carried"
2770+ to the parent level). Carried operations are performed, causing possibly
2771+ more operations to be carried upward etc. carry() takes care about
2772+ locking and pinning znodes while operating on them.
2773+
2774+ For usage, see comment at the top of fs/reiser4/carry.c
2775+
2776+*/
2777+int carry(carry_level * doing /* set of carry operations to be performed */ ,
2778+ carry_level * done /* set of nodes, already performed at the
2779+ * previous level. NULL in most cases */ )
2780+{
2781+ int result = 0;
2782+ /* queue of new requests */
2783+ carry_level *todo;
2784+ ON_DEBUG(STORE_COUNTERS);
2785+
2786+ assert("nikita-888", doing != NULL);
2787+ BUG_ON(done != NULL);
2788+
2789+ todo = doing + 1;
2790+ init_carry_level(todo, doing->pool);
2791+
2792+ /* queue of requests performed on the previous level */
2793+ done = todo + 1;
2794+ init_carry_level(done, doing->pool);
2795+
2796+ /* iterate until there is nothing more to do */
2797+ while (result == 0 && doing->ops_num > 0) {
2798+ carry_level *tmp;
2799+
2800+ /* at this point @done is locked. */
2801+ /* repeat lock/do/unlock while
2802+
2803+ (1) lock_carry_level() fails due to deadlock avoidance, or
2804+
2805+ (2) carry_on_level() decides that more nodes have to
2806+ be involved.
2807+
2808+ (3) some unexpected error occurred while balancing on the
2809+ upper levels. In this case all changes are rolled back.
2810+
2811+ */
2812+ while (1) {
2813+ result = lock_carry_level(doing);
2814+ if (result == 0) {
2815+ /* perform operations from @doing and
2816+ accumulate new requests in @todo */
2817+ result = carry_on_level(doing, todo);
2818+ if (result == 0)
2819+ break;
2820+ else if (result != -E_REPEAT ||
2821+ !doing->restartable) {
2822+ warning("nikita-1043",
2823+ "Fatal error during carry: %i",
2824+ result);
2825+ print_level("done", done);
2826+ print_level("doing", doing);
2827+ print_level("todo", todo);
2828+ /* do some rough stuff like aborting
2829+ all pending transcrashes and thus
2830+ pushing tree back to the consistent
2831+ state. Alternatively, just panic.
2832+ */
2833+ fatal_carry_error(doing, result);
2834+ return result;
2835+ }
2836+ } else if (result != -E_REPEAT) {
2837+ fatal_carry_error(doing, result);
2838+ return result;
2839+ }
2840+ unlock_carry_level(doing, 1);
2841+ }
2842+ /* at this point @done can be safely unlocked */
2843+ done_carry_level(done);
2844+
2845+ /* cyclically shift queues */
2846+ tmp = done;
2847+ done = doing;
2848+ doing = todo;
2849+ todo = tmp;
2850+ init_carry_level(todo, doing->pool);
2851+
2852+ /* give other threads chance to run */
2853+ preempt_point();
2854+ }
2855+ done_carry_level(done);
2856+
2857+ /* all counters, but x_refs should remain the same. x_refs can change
2858+ owing to transaction manager */
2859+ ON_DEBUG(CHECK_COUNTERS);
2860+ return result;
2861+}
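+
+/* Illustration of the cyclic shift above: each iteration climbs one level --
+ * the old @todo becomes @doing, the old @doing becomes @done, and the emptied
+ * @done queue is recycled as the new @todo, so the three levels allocated
+ * after the carry pool are reused for the whole climb. */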
2862+
2863+/* perform carry operations on given level.
2864+
2865+ Optimizations proposed by pooh:
2866+
2867+ (1) don't lock all nodes from queue at the same time. Lock nodes lazily as
2868+ required;
2869+
2870+ (2) unlock node if there are no more operations to be performed upon it and
2871+ node didn't add any operation to @todo. This can be implemented by
2872+ attaching to each node two counters: counter of operaions working on this
2873+ node and counter and operations carried upward from this node.
2874+
2875+*/
2876+static int carry_on_level(carry_level * doing /* queue of carry operations to
2877+ * do on this level */ ,
2878+ carry_level * todo /* queue where new carry
2879+ * operations to be performed on
2880+ * the parent level are
2881+ * accumulated during @doing
2882+ * processing. */ )
2883+{
2884+ int result;
2885+ int (*f) (carry_op *, carry_level *, carry_level *);
2886+ carry_op *op;
2887+ carry_op *tmp_op;
2888+
2889+ assert("nikita-1034", doing != NULL);
2890+ assert("nikita-1035", todo != NULL);
2891+
2892+ /* @doing->nodes are locked. */
2893+
2894+ /* This function can be split into two phases: analysis and modification.
2895+
2896+ Analysis calculates precisely what items should be moved between
2897+ nodes. This information is gathered in some structures attached to
2898+ each carry_node in a @doing queue. Analysis also determines whether
2899+ new nodes are to be allocated etc.
2900+
2901+ After analysis is completed, actual modification is performed. Here
2902+ we can take advantage of "batch modification": if there are several
2903+ operations acting on the same node, modifications can be performed
2904+ more efficiently when batched together.
2905+
2906+ Above is an optimization left for the future.
2907+ */
2908+ /* Important, but delayed optimization: it's possible to batch
2909+ operations together and perform them more efficiently as a
2910+ result. For example, deletion of several neighboring items from a
2911+ node can be converted to a single ->cut() operation.
2912+
2913+ Before processing queue, it should be scanned and "mergeable"
2914+ operations merged.
2915+ */
2916+ result = 0;
2917+ for_all_ops(doing, op, tmp_op) {
2918+ carry_opcode opcode;
2919+
2920+ assert("nikita-1041", op != NULL);
2921+ opcode = op->op;
2922+ assert("nikita-1042", op->op < COP_LAST_OP);
2923+ f = op_dispatch_table[op->op].handler;
2924+ result = f(op, doing, todo);
2925+ /* locking can fail with -E_REPEAT. Any different error is fatal
2926+ and will be handled by fatal_carry_error() sledgehammer.
2927+ */
2928+ if (result != 0)
2929+ break;
2930+ }
2931+ if (result == 0) {
2932+ carry_plugin_info info;
2933+ carry_node *scan;
2934+ carry_node *tmp_scan;
2935+
2936+ info.doing = doing;
2937+ info.todo = todo;
2938+
2939+ assert("nikita-3002",
2940+ carry_level_invariant(doing, CARRY_DOING));
2941+ for_all_nodes(doing, scan, tmp_scan) {
2942+ znode *node;
2943+
2944+ node = carry_real(scan);
2945+ assert("nikita-2547", node != NULL);
2946+ if (node_is_empty(node)) {
2947+ result =
2948+ node_plugin_by_node(node)->
2949+ prepare_removal(node, &info);
2950+ if (result != 0)
2951+ break;
2952+ }
2953+ }
2954+ }
2955+ return result;
2956+}
2957+
2958+/* post carry operation
2959+
2960+ This is main function used by external carry clients: node layout plugins
2961+ and tree operations to create new carry operation to be performed on some
2962+ level.
2963+
2964+ New operation will be included in the @level queue. To actually perform it,
2965+ call carry( level, ... ). This function takes write lock on @node. Carry
2966+ manages all its locks by itself, don't worry about this.
2967+
2968+ This function adds operation and node at the end of the queue. It is up to
2969+ caller to guarantee proper ordering of node queue.
2970+
2971+*/
2972+carry_op *post_carry(carry_level * level /* queue where new operation is to
2973+ * be posted at */ ,
2974+ carry_opcode op /* opcode of operation */ ,
2975+ znode * node /* node on which this operation
2976+ * will operate */ ,
2977+ int apply_to_parent_p /* whether operation will operate
2978+ * directly on @node or on it
2979+ * parent. */ )
2980+{
2981+ carry_op *result;
2982+ carry_node *child;
2983+
2984+ assert("nikita-1046", level != NULL);
2985+ assert("nikita-1788", znode_is_write_locked(node));
2986+
2987+ result = add_op(level, POOLO_LAST, NULL);
2988+ if (IS_ERR(result))
2989+ return result;
2990+ child = add_carry(level, POOLO_LAST, NULL);
2991+ if (IS_ERR(child)) {
2992+ reiser4_pool_free(&level->pool->op_pool, &result->header);
2993+ return (carry_op *) child;
2994+ }
2995+ result->node = child;
2996+ result->op = op;
2997+ child->parent = apply_to_parent_p;
2998+ if (ZF_ISSET(node, JNODE_ORPHAN))
2999+ child->left_before = 1;
3000+ child->node = node;
3001+ return result;
3002+}
3003+
3004+/* initialize carry queue */
3005+void init_carry_level(carry_level * level /* level to initialize */ ,
3006+ carry_pool * pool /* pool @level will allocate objects
3007+ * from */ )
3008+{
3009+ assert("nikita-1045", level != NULL);
3010+ assert("nikita-967", pool != NULL);
3011+
3012+ memset(level, 0, sizeof *level);
3013+ level->pool = pool;
3014+
3015+ INIT_LIST_HEAD(&level->nodes);
3016+ INIT_LIST_HEAD(&level->ops);
3017+}
3018+
3019+/* allocate carry pool and initialize pools within queue */
3020+carry_pool *init_carry_pool(int size)
3021+{
3022+ carry_pool *pool;
3023+
3024+ assert("", size >= sizeof(carry_pool) + 3 * sizeof(carry_level));
3025+ pool = kmalloc(size, get_gfp_mask());
3026+ if (pool == NULL)
3027+ return ERR_PTR(RETERR(-ENOMEM));
3028+
3029+ reiser4_init_pool(&pool->op_pool, sizeof(carry_op), CARRIES_POOL_SIZE,
3030+ (char *)pool->op);
3031+ reiser4_init_pool(&pool->node_pool, sizeof(carry_node),
3032+ NODES_LOCKED_POOL_SIZE, (char *)pool->node);
3033+ return pool;
3034+}
3035+
3036+/* finish with queue pools */
3037+void done_carry_pool(carry_pool * pool /* pool to destroy */ )
3038+{
3039+ reiser4_done_pool(&pool->op_pool);
3040+ reiser4_done_pool(&pool->node_pool);
3041+ kfree(pool);
3042+}
3043+
3044+/* add new carry node to the @level.
3045+
3046+ Returns pointer to the new carry node allocated from pool. It's up to
3047+ callers to maintain proper order in the @level. Assumption is that if carry
3048+ nodes on one level are already sorted and modifications are performed from
3049+ left to right, carry nodes added on the parent level will be ordered
3050+ automatically. To control ordering use @order and @reference parameters.
3051+
3052+*/
3053+carry_node *add_carry_skip(carry_level * level /* &carry_level to add node
3054+ * to */ ,
3055+ pool_ordering order /* where to insert: at the
3056+ * beginning of @level,
3057+ * before @reference, after
3058+ * @reference, at the end
3059+ * of @level */ ,
3060+ carry_node * reference /* reference node for
3061+ * insertion */ )
3062+{
3063+ ON_DEBUG(carry_node * orig_ref = reference);
3064+
3065+ if (order == POOLO_BEFORE) {
3066+ reference = find_left_carry(reference, level);
3067+ if (reference == NULL)
3068+ reference = list_entry(level->nodes.next, carry_node,
3069+ header.level_linkage);
3070+ else
3071+ reference = list_entry(reference->header.level_linkage.next,
3072+ carry_node, header.level_linkage);
3073+ } else if (order == POOLO_AFTER) {
3074+ reference = find_right_carry(reference, level);
3075+ if (reference == NULL)
3076+ reference = list_entry(level->nodes.prev, carry_node,
3077+ header.level_linkage);
3078+ else
3079+ reference = list_entry(reference->header.level_linkage.prev,
3080+ carry_node, header.level_linkage);
3081+ }
3082+ assert("nikita-2209",
3083+ ergo(orig_ref != NULL,
3084+ carry_real(reference) == carry_real(orig_ref)));
3085+ return add_carry(level, order, reference);
3086+}
3087+
3088+carry_node *add_carry(carry_level * level /* &carry_level to add node
3089+ * to */ ,
3090+ pool_ordering order /* where to insert: at the
3091+ * beginning of @level, before
3092+ * @reference, after @reference,
3093+ * at the end of @level */ ,
3094+ carry_node * reference /* reference node for
3095+ * insertion */ )
3096+{
3097+ carry_node *result;
3098+
3099+ result =
3100+ (carry_node *) add_obj(&level->pool->node_pool, &level->nodes,
3101+ order, &reference->header);
3102+ if (!IS_ERR(result) && (result != NULL))
3103+ ++level->nodes_num;
3104+ return result;
3105+}
3106+
3107+/* add new carry operation to the @level.
3108+
3109+ Returns pointer to the new carry operations allocated from pool. It's up to
3110+ callers to maintain proper order in the @level. To control ordering use
3111+ @order and @reference parameters.
3112+
3113+*/
3114+static carry_op *add_op(carry_level * level /* &carry_level to add node to */ ,
3115+ pool_ordering order /* where to insert: at the beginning of
3116+ * @level, before @reference, after
3117+ * @reference, at the end of @level */ ,
3118+ carry_op *
3119+ reference /* reference node for insertion */ )
3120+{
3121+ carry_op *result;
3122+
3123+ result =
3124+ (carry_op *) add_obj(&level->pool->op_pool, &level->ops, order,
3125+ &reference->header);
3126+ if (!IS_ERR(result) && (result != NULL))
3127+ ++level->ops_num;
3128+ return result;
3129+}
3130+
3131+/* Return the node on the right of which @node was created.
3132+
3133+ Each node is created on the right of some existing node (or it is a new
3134+ root, which is a special case not handled here).
3135+
3136+ @node is a new node created on some level, but not yet inserted into its
3137+ parent; it has the corresponding bit (JNODE_ORPHAN) set in zstate.
3138+
3139+*/
3140+static carry_node *find_begetting_brother(carry_node * node /* node to start search
3141+ * from */ ,
3142+ carry_level * kin UNUSED_ARG /* level to
3143+ * scan */ )
3144+{
3145+ carry_node *scan;
3146+
3147+ assert("nikita-1614", node != NULL);
3148+ assert("nikita-1615", kin != NULL);
3149+ assert("nikita-1616", LOCK_CNT_GTZ(rw_locked_tree));
3150+ assert("nikita-1619", ergo(carry_real(node) != NULL,
3151+ ZF_ISSET(carry_real(node), JNODE_ORPHAN)));
3152+
3153+ for (scan = node;;
3154+ scan = list_entry(scan->header.level_linkage.prev, carry_node,
3155+ header.level_linkage)) {
3156+ assert("nikita-1617", &kin->nodes != &scan->header.level_linkage);
3157+ if ((scan->node != node->node) &&
3158+ !ZF_ISSET(scan->node, JNODE_ORPHAN)) {
3159+ assert("nikita-1618", carry_real(scan) != NULL);
3160+ break;
3161+ }
3162+ }
3163+ return scan;
3164+}
3165+
3166+static cmp_t
3167+carry_node_cmp(carry_level * level, carry_node * n1, carry_node * n2)
3168+{
3169+ assert("nikita-2199", n1 != NULL);
3170+ assert("nikita-2200", n2 != NULL);
3171+
3172+ if (n1 == n2)
3173+ return EQUAL_TO;
3174+ while (1) {
3175+ n1 = carry_node_next(n1);
3176+ if (carry_node_end(level, n1))
3177+ return GREATER_THAN;
3178+ if (n1 == n2)
3179+ return LESS_THAN;
3180+ }
3181+ impossible("nikita-2201", "End of level reached");
3182+}
3183+
3184+carry_node *find_carry_node(carry_level * level, const znode * node)
3185+{
3186+ carry_node *scan;
3187+ carry_node *tmp_scan;
3188+
3189+ assert("nikita-2202", level != NULL);
3190+ assert("nikita-2203", node != NULL);
3191+
3192+ for_all_nodes(level, scan, tmp_scan) {
3193+ if (carry_real(scan) == node)
3194+ return scan;
3195+ }
3196+ return NULL;
3197+}
3198+
3199+znode *carry_real(const carry_node * node)
3200+{
3201+ assert("nikita-3061", node != NULL);
3202+
3203+ return node->lock_handle.node;
3204+}
3205+
3206+carry_node *insert_carry_node(carry_level * doing, carry_level * todo,
3207+ const znode * node)
3208+{
3209+ carry_node *base;
3210+ carry_node *scan;
3211+ carry_node *tmp_scan;
3212+ carry_node *proj;
3213+
3214+ base = find_carry_node(doing, node);
3215+ assert("nikita-2204", base != NULL);
3216+
3217+ for_all_nodes(todo, scan, tmp_scan) {
3218+ proj = find_carry_node(doing, scan->node);
3219+ assert("nikita-2205", proj != NULL);
3220+ if (carry_node_cmp(doing, proj, base) != LESS_THAN)
3221+ break;
3222+ }
3223+ return scan;
3224+}
3225+
3226+static carry_node *add_carry_atplace(carry_level * doing, carry_level * todo,
3227+ znode * node)
3228+{
3229+ carry_node *reference;
3230+
3231+ assert("nikita-2994", doing != NULL);
3232+ assert("nikita-2995", todo != NULL);
3233+ assert("nikita-2996", node != NULL);
3234+
3235+ reference = insert_carry_node(doing, todo, node);
3236+ assert("nikita-2997", reference != NULL);
3237+
3238+ return add_carry(todo, POOLO_BEFORE, reference);
3239+}
3240+
3241+/* like post_carry(), but designed to be called from node plugin methods.
3242+ This function is different from post_carry() in that it finds proper place
3243+ to insert node in the queue. */
3244+carry_op *node_post_carry(carry_plugin_info * info /* carry parameters
3245+ * passed down to node
3246+ * plugin */ ,
3247+ carry_opcode op /* opcode of operation */ ,
3248+ znode * node /* node on which this
3249+ * operation will operate */ ,
3250+ int apply_to_parent_p /* whether operation will
3251+ * operate directly on @node
3252+ * or on it parent. */ )
3253+{
3254+ carry_op *result;
3255+ carry_node *child;
3256+
3257+ assert("nikita-2207", info != NULL);
3258+ assert("nikita-2208", info->todo != NULL);
3259+
3260+ if (info->doing == NULL)
3261+ return post_carry(info->todo, op, node, apply_to_parent_p);
3262+
3263+ result = add_op(info->todo, POOLO_LAST, NULL);
3264+ if (IS_ERR(result))
3265+ return result;
3266+ child = add_carry_atplace(info->doing, info->todo, node);
3267+ if (IS_ERR(child)) {
3268+ reiser4_pool_free(&info->todo->pool->op_pool, &result->header);
3269+ return (carry_op *) child;
3270+ }
3271+ result->node = child;
3272+ result->op = op;
3273+ child->parent = apply_to_parent_p;
3274+ if (ZF_ISSET(node, JNODE_ORPHAN))
3275+ child->left_before = 1;
3276+ child->node = node;
3277+ return result;
3278+}
3279+
3280+/* lock all carry nodes in @level */
3281+static int lock_carry_level(carry_level * level /* level to lock */ )
3282+{
3283+ int result;
3284+ carry_node *node;
3285+ carry_node *tmp_node;
3286+
3287+ assert("nikita-881", level != NULL);
3288+ assert("nikita-2229", carry_level_invariant(level, CARRY_TODO));
3289+
3290+ /* lock nodes from left to right */
3291+ result = 0;
3292+ for_all_nodes(level, node, tmp_node) {
3293+ result = lock_carry_node(level, node);
3294+ if (result != 0)
3295+ break;
3296+ }
3297+ return result;
3298+}
3299+
3300+/* Synchronize delimiting keys between @node and its left neighbor.
3301+
3302+ To reduce contention on dk key and simplify carry code, we synchronize
3303+ delimiting keys only when carry ultimately leaves tree level (carrying
3304+ changes upward) and unlocks nodes at this level.
3305+
3306+ This function first finds the left neighbor of @node and then updates the
3307+ left neighbor's right delimiting key to coincide with the least key in @node.
3308+
3309+*/
3310+
3311+ON_DEBUG(extern atomic_t delim_key_version;
3312+ )
3313+
3314+static void sync_dkeys(znode * spot /* node to update */ )
3315+{
3316+ reiser4_key pivot;
3317+ reiser4_tree *tree;
3318+
3319+ assert("nikita-1610", spot != NULL);
3320+ assert("nikita-1612", LOCK_CNT_NIL(rw_locked_dk));
3321+
3322+ tree = znode_get_tree(spot);
3323+ read_lock_tree(tree);
3324+ write_lock_dk(tree);
3325+
3326+ assert("nikita-2192", znode_is_loaded(spot));
3327+
3328+ /* sync left delimiting key of @spot with key in its leftmost item */
3329+ if (node_is_empty(spot))
3330+ pivot = *znode_get_rd_key(spot);
3331+ else
3332+ leftmost_key_in_node(spot, &pivot);
3333+
3334+ znode_set_ld_key(spot, &pivot);
3335+
3336+ /* there can be a sequence of empty nodes pending removal on the left of
3337+ @spot. Scan them and update their left and right delimiting keys to
3338+ match the left delimiting key of @spot. Also, update the right delimiting
3339+ key of the first non-empty left neighbor.
3340+ */
3341+ while (1) {
3342+ if (!ZF_ISSET(spot, JNODE_LEFT_CONNECTED))
3343+ break;
3344+
3345+ spot = spot->left;
3346+ if (spot == NULL)
3347+ break;
3348+
3349+ znode_set_rd_key(spot, &pivot);
3350+ /* don't sink into the domain of another balancing */
3351+ if (!znode_is_write_locked(spot))
3352+ break;
3353+ if (ZF_ISSET(spot, JNODE_HEARD_BANSHEE))
3354+ znode_set_ld_key(spot, &pivot);
3355+ else
3356+ break;
3357+ }
3358+
3359+ write_unlock_dk(tree);
3360+ read_unlock_tree(tree);
3361+}
3362+
3363+/* unlock all carry nodes in @level */
3364+static void unlock_carry_level(carry_level * level /* level to unlock */ ,
3365+ int failure /* true if unlocking owing to
3366+ * failure */ )
3367+{
3368+ carry_node *node;
3369+ carry_node *tmp_node;
3370+
3371+ assert("nikita-889", level != NULL);
3372+
3373+ if (!failure) {
3374+ znode *spot;
3375+
3376+ spot = NULL;
3377+ /* update delimiting keys */
3378+ for_all_nodes(level, node, tmp_node) {
3379+ if (carry_real(node) != spot) {
3380+ spot = carry_real(node);
3381+ sync_dkeys(spot);
3382+ }
3383+ }
3384+ }
3385+
3386+ /* nodes can be unlocked in arbitrary order. In a preemptible
3387+ environment, though, it is better to unlock in the reverse order
3388+ of locking.
3389+ */
3390+ for_all_nodes_back(level, node, tmp_node) {
3391+ /* all allocated nodes should be already linked to their
3392+ parents at this moment. */
3393+ assert("nikita-1631", ergo(!failure, !ZF_ISSET(carry_real(node),
3394+ JNODE_ORPHAN)));
3395+ ON_DEBUG(check_dkeys(carry_real(node)));
3396+ unlock_carry_node(level, node, failure);
3397+ }
3398+ level->new_root = NULL;
3399+}
3400+
3401+/* finish with @level
3402+
3403+ Unlock nodes and release all allocated resources */
3404+static void done_carry_level(carry_level * level /* level to finish */ )
3405+{
3406+ carry_node *node;
3407+ carry_node *tmp_node;
3408+ carry_op *op;
3409+ carry_op *tmp_op;
3410+
3411+ assert("nikita-1076", level != NULL);
3412+
3413+ unlock_carry_level(level, 0);
3414+ for_all_nodes(level, node, tmp_node) {
3415+ assert("nikita-2113", list_empty_careful(&node->lock_handle.locks_link));
3416+ assert("nikita-2114", list_empty_careful(&node->lock_handle.owners_link));
3417+ reiser4_pool_free(&level->pool->node_pool, &node->header);
3418+ }
3419+ for_all_ops(level, op, tmp_op)
3420+ reiser4_pool_free(&level->pool->op_pool, &op->header);
3421+}
3422+
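+/* Editor's sketch (not part of the original patch): the overall life cycle
+   of a carry level as the functions above and the declarations in carry.h
+   suggest it. The pool sizing and the placement of the carry_level
+   directly after the pool are assumptions for illustration only. */
+#if 0
+static int example_carry_cycle(znode * node)
+{
+	carry_pool *pool;
+	carry_level *lowest_level;
+	carry_op *op;
+	int result;
+
+	pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level));
+	if (IS_ERR(pool))
+		return PTR_ERR(pool);
+	lowest_level = (carry_level *) (pool + 1);	/* assumed layout */
+	init_carry_level(lowest_level, pool);
+
+	/* queue an operation against @node itself (last argument 0) */
+	op = post_carry(lowest_level, COP_INSERT, node, 0);
+	if (IS_ERR(op)) {
+		done_carry_pool(pool);
+		return PTR_ERR(op);
+	}
+	/* ... fill op->u.insert here ... */
+
+	/* carry() locks the level, executes the ops, posts follow-up ops
+	   to the parent level, and releases the level internally */
+	result = carry(lowest_level, NULL);
+	done_carry_pool(pool);
+	return result;
+}
+#endif
+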
3423+/* helper function to complete locking of carry node
3424+
3425+ Finish locking of a carry node. There are several ways in which a new
3426+ carry node can be added into a carry level and locked. The normal way is
3427+ through lock_carry_node(), but also from find_{left|right}_neighbor(). This
3428+ function factors out the common final part of all locking scenarios. It
3429+ assumes that @node->lock_handle is the lock handle for the lock just taken
3430+ and fills ->real_node from this lock handle.
3431+
3432+*/
3433+int lock_carry_node_tail(carry_node * node /* node to complete locking of */ )
3434+{
3435+ assert("nikita-1052", node != NULL);
3436+ assert("nikita-1187", carry_real(node) != NULL);
3437+ assert("nikita-1188", !node->unlock);
3438+
3439+ node->unlock = 1;
3440+ /* Load node content into memory and install node plugin by
3441+ looking at the node header.
3442+
3443+ Most of the time this call is cheap because the node is
3444+ already in memory.
3445+
3446+ Corresponding zrelse() is in unlock_carry_node()
3447+ */
3448+ return zload(carry_real(node));
3449+}
3450+
3451+/* lock carry node
3452+
3453+ "Resolve" node to real znode, lock it and mark as locked.
3454+ This requires recursive locking of znodes.
3455+
3456+ When an operation is posted to the parent level, the node it will be applied
3457+ to is not yet known. For example, when shifting data between two nodes,
3458+ the delimiting key has to be updated in the parent or parents of the nodes
3459+ involved. But their parents are not yet locked and, moreover, said nodes
3460+ can be reparented by concurrent balancing.
3461+
3462+ To work around this, the carry operation is applied to a special "carry
3463+ node" rather than to the znode itself. A carry node consists of some "base"
3464+ or "reference" znode and flags indicating how to get from the base to the
3465+ target of the carry operation (the ->real_node field of carry_node).
3466+
3467+*/
3468+int lock_carry_node(carry_level * level /* level @node is in */ ,
3469+ carry_node * node /* node to lock */ )
3470+{
3471+ int result;
3472+ znode *reference_point;
3473+ lock_handle lh;
3474+ lock_handle tmp_lh;
3475+ reiser4_tree *tree;
3476+
3477+ assert("nikita-887", level != NULL);
3478+ assert("nikita-882", node != NULL);
3479+
3480+ result = 0;
3481+ reference_point = node->node;
3482+ init_lh(&lh);
3483+ init_lh(&tmp_lh);
3484+ if (node->left_before) {
3485+ /* handling of new nodes, allocated on the previous level:
3486+
3487+ some carry ops were probably posted from the new node, but
3488+ this node neither has parent pointer set, nor is
3489+ connected. This will be done in ->create_hook() for
3490+ internal item.
3491+
3492+ Nonetheless, the parent of the new node has to be locked. To do
3493+ this, first go to the "left" in the carry order. This
3494+ depends on the decision to always allocate new node on the
3495+ right of existing one.
3496+
3497+ The loop handles the case when multiple nodes, all orphans, were
3498+ inserted.
3499+
3500+ Strictly speaking, taking tree lock is not necessary here,
3501+ because all nodes scanned by loop in
3502+ find_begetting_brother() are write-locked by this thread,
3503+ and thus, their sibling linkage cannot change.
3504+
3505+ */
3506+ tree = znode_get_tree(reference_point);
3507+ read_lock_tree(tree);
3508+ reference_point = find_begetting_brother(node, level)->node;
3509+ read_unlock_tree(tree);
3510+ assert("nikita-1186", reference_point != NULL);
3511+ }
3512+ if (node->parent && (result == 0)) {
3513+ result =
3514+ reiser4_get_parent(&tmp_lh, reference_point,
3515+ ZNODE_WRITE_LOCK);
3516+ if (result != 0) {
3517+ ; /* nothing */
3518+ } else if (znode_get_level(tmp_lh.node) == 0) {
3519+ assert("nikita-1347", znode_above_root(tmp_lh.node));
3520+ result = add_new_root(level, node, tmp_lh.node);
3521+ if (result == 0) {
3522+ reference_point = level->new_root;
3523+ move_lh(&lh, &node->lock_handle);
3524+ }
3525+ } else if ((level->new_root != NULL)
3526+ && (level->new_root !=
3527+ znode_parent_nolock(reference_point))) {
3528+ /* parent of node exists, but this level already
3529+ created a different new root, so */
3530+ warning("nikita-1109",
3531+ /* it should be "radicis", but tradition is
3532+ tradition. do banshees read latin? */
3533+ "hodie natus est radici frater");
3534+ result = -EIO;
3535+ } else {
3536+ move_lh(&lh, &tmp_lh);
3537+ reference_point = lh.node;
3538+ }
3539+ }
3540+ if (node->left && (result == 0)) {
3541+ assert("nikita-1183", node->parent);
3542+ assert("nikita-883", reference_point != NULL);
3543+ result =
3544+ reiser4_get_left_neighbor(&tmp_lh, reference_point,
3545+ ZNODE_WRITE_LOCK,
3546+ GN_CAN_USE_UPPER_LEVELS);
3547+ if (result == 0) {
3548+ done_lh(&lh);
3549+ move_lh(&lh, &tmp_lh);
3550+ reference_point = lh.node;
3551+ }
3552+ }
3553+ if (!node->parent && !node->left && !node->left_before) {
3554+ result =
3555+ longterm_lock_znode(&lh, reference_point, ZNODE_WRITE_LOCK,
3556+ ZNODE_LOCK_HIPRI);
3557+ }
3558+ if (result == 0) {
3559+ move_lh(&node->lock_handle, &lh);
3560+ result = lock_carry_node_tail(node);
3561+ }
3562+ done_lh(&tmp_lh);
3563+ done_lh(&lh);
3564+ return result;
3565+}
3566+
3567+/* release a lock on &carry_node.
3568+
3569+ Release the lock on @node if necessary. This operation is the pair of
3570+ lock_carry_node() and is idempotent: you can call it more than once on the
3571+ same node.
3572+
3573+*/
3574+static void
3575+unlock_carry_node(carry_level * level,
3576+ carry_node * node /* node to be released */ ,
3577+ int failure /* non-0 if node is unlocked
3578+ * due to some error */ )
3579+{
3580+ znode *real_node;
3581+
3582+ assert("nikita-884", node != NULL);
3583+
3584+ real_node = carry_real(node);
3585+ /* pair to zload() in lock_carry_node_tail() */
3586+ zrelse(real_node);
3587+ if (node->unlock && (real_node != NULL)) {
3588+ assert("nikita-899", real_node == node->lock_handle.node);
3589+ longterm_unlock_znode(&node->lock_handle);
3590+ }
3591+ if (failure) {
3592+ if (node->deallocate && (real_node != NULL)) {
3593+ /* free node in bitmap
3594+
3595+ Prepare node for removal. Last zput() will finish
3596+ with it.
3597+ */
3598+ ZF_SET(real_node, JNODE_HEARD_BANSHEE);
3599+ }
3600+ if (node->free) {
3601+ assert("nikita-2177",
3602+ list_empty_careful(&node->lock_handle.locks_link));
3603+ assert("nikita-2112",
3604+ list_empty_careful(&node->lock_handle.owners_link));
3605+ reiser4_pool_free(&level->pool->node_pool,
3606+ &node->header);
3607+ }
3608+ }
3609+}
3610+
3611+/* fatal_carry_error() - all-catching error handling function
3612+
3613+ It is possible that carry faces an unrecoverable error, like the inability
3614+ to insert a pointer at the internal level. Our simple solution is to just
3615+ panic in this situation. More sophisticated things, like attempting to
3616+ remount the file system read-only, can be implemented without much difficulty.
3617+
3618+ It is believed, that:
3619+
3620+ 1. instead of panicking, all current transactions can be aborted, rolling
3621+ the system back to a consistent state.
3622+
3623+Umm, if you simply panic without doing anything more at all, then all current
3624+transactions are aborted and the system is rolled back to a consistent state,
3625+by virtue of the design of the transactional mechanism. Well, wait, let's be
3626+precise. If an internal node is corrupted on disk due to hardware failure,
3627+then there may be no consistent state that can be rolled back to, so instead
3628+we should say that it will roll back the transactions, which, barring other
3629+factors means rolling back to a consistent state.
3630+
3631+# Nikita: there is a subtle difference between panic and aborting
3632+# transactions: the machine doesn't reboot. Processes aren't killed. Processes
3633+# not using reiser4 (not that we care about such processes), or using other
3634+# reiser4 mounts (we do care about those), will simply continue to run. With
3635+# some luck, even an application using the aborted file system can survive: it
3636+# will get some error, like EBADF, from each file descriptor on the failed file
3637+# system, but applications that do care about fault tolerance will cope with
3638+# this (squid will).
3639+
3640+It would be a nice feature though to support rollback without rebooting
3641+followed by remount, but this can wait for later versions.
3642+
3643+ 2. once isolated transactions will be implemented it will be possible to
3644+ roll back offending transaction.
3645+
3646+Option 2 adds code complexity of uncertain value (it implies that a broken tree should be kept in operation), so we
3647+must think about it more before deciding whether it should be done. -Hans
3648+
3649+*/
3650+static void fatal_carry_error(carry_level * doing UNUSED_ARG /* carry level
3651+ * where
3652+ * unrecoverable
3653+ * error
3654+ * occurred */ ,
3655+ int ecode /* error code */ )
3656+{
3657+ assert("nikita-1230", doing != NULL);
3658+ assert("nikita-1231", ecode < 0);
3659+
3660+ reiser4_panic("nikita-1232", "Carry failed: %i", ecode);
3661+}
3662+
3663+/* add new root to the tree
3664+
3665+ This function itself only manages changes in carry structures and delegates
3666+ all hard work (allocation of a znode for the new root, changes of parent and
3667+ sibling pointers) to add_tree_root().
3668+
3669+ Locking: old tree root is locked by carry at this point. Fake znode is also
3670+ locked.
3671+
3672+*/
3673+static int add_new_root(carry_level * level /* carry level in context of which
3674+ * operation is performed */ ,
3675+ carry_node * node /* carry node for existing root */ ,
3676+ znode * fake /* "fake" znode already locked by
3677+ * us */ )
3678+{
3679+ int result;
3680+
3681+ assert("nikita-1104", level != NULL);
3682+ assert("nikita-1105", node != NULL);
3683+
3684+ assert("nikita-1403", znode_is_write_locked(node->node));
3685+ assert("nikita-1404", znode_is_write_locked(fake));
3686+
3687+ /* trying to create new root. */
3688+ /* @node is root and it's already locked by us. This
3689+ means that nobody else can be trying to add/remove
3690+ tree root right now.
3691+ */
3692+ if (level->new_root == NULL)
3693+ level->new_root = add_tree_root(node->node, fake);
3694+ if (!IS_ERR(level->new_root)) {
3695+ assert("nikita-1210", znode_is_root(level->new_root));
3696+ node->deallocate = 1;
3697+ result =
3698+ longterm_lock_znode(&node->lock_handle, level->new_root,
3699+ ZNODE_WRITE_LOCK, ZNODE_LOCK_LOPRI);
3700+ if (result == 0)
3701+ zput(level->new_root);
3702+ } else {
3703+ result = PTR_ERR(level->new_root);
3704+ level->new_root = NULL;
3705+ }
3706+ return result;
3707+}
3708+
3709+/* allocate a new znode and add to the @todo level the operation that
3710+ inserts a pointer to it into the parent node
3711+
3712+ Allocate a new znode, add it into the carry queue and post into the @todo
3713+ queue a request to add a pointer to the new node into its parent.
3714+
3715+ This is a carry-related routine that calls new_node() to allocate the new
3716+ node.
3717+*/
3718+carry_node *add_new_znode(znode * brother /* existing left neighbor of new
3719+ * node */ ,
3720+ carry_node * ref /* carry node after which new
3721+ * carry node is to be inserted
3722+ * into queue. This affects
3723+ * locking. */ ,
3724+ carry_level * doing /* carry queue where new node is
3725+ * to be added */ ,
3726+ carry_level * todo /* carry queue where COP_INSERT
3727+ * operation to add pointer to
3728+ * new node will be added */ )
3729+{
3730+ carry_node *fresh;
3731+ znode *new_znode;
3732+ carry_op *add_pointer;
3733+ carry_plugin_info info;
3734+
3735+ assert("nikita-1048", brother != NULL);
3736+ assert("nikita-1049", todo != NULL);
3737+
3738+ /* There are a lot of possible variations here: to what parent
3739+ new node will be attached and where. For simplicity, always
3740+ do the following:
3741+
3742+ (1) new node and @brother will have the same parent.
3743+
3744+ (2) new node is added on the right of @brother
3745+
3746+ */
3747+
3748+ fresh = add_carry_skip(doing, ref ? POOLO_AFTER : POOLO_LAST, ref);
3749+ if (IS_ERR(fresh))
3750+ return fresh;
3751+
3752+ fresh->deallocate = 1;
3753+ fresh->free = 1;
3754+
3755+ new_znode = new_node(brother, znode_get_level(brother));
3756+ if (IS_ERR(new_znode))
3757+ /* @fresh will be deallocated automatically by error
3758+ handling code in the caller. */
3759+ return (carry_node *) new_znode;
3760+
3761+ /* new_znode returned znode with x_count 1. Caller has to decrease
3762+ it. make_space() does. */
3763+
3764+ ZF_SET(new_znode, JNODE_ORPHAN);
3765+ fresh->node = new_znode;
3766+
3767+ while (ZF_ISSET(carry_real(ref), JNODE_ORPHAN)) {
3768+ ref = carry_node_prev(ref);
3769+ assert("nikita-1606", !carry_node_end(doing, ref));
3770+ }
3771+
3772+ info.todo = todo;
3773+ info.doing = doing;
3774+ add_pointer = node_post_carry(&info, COP_INSERT, carry_real(ref), 1);
3775+ if (IS_ERR(add_pointer)) {
3776+ /* no need to deallocate @new_znode here: it will be
3777+ deallocated during carry error handling. */
3778+ return (carry_node *) add_pointer;
3779+ }
3780+
3781+ add_pointer->u.insert.type = COPT_CHILD;
3782+ add_pointer->u.insert.child = fresh;
3783+ add_pointer->u.insert.brother = brother;
3784+ /* initially the new node spans an empty key range */
3785+ write_lock_dk(znode_get_tree(brother));
3786+ znode_set_ld_key(new_znode,
3787+ znode_set_rd_key(new_znode,
3788+ znode_get_rd_key(brother)));
3789+ write_unlock_dk(znode_get_tree(brother));
3790+ return fresh;
3791+}
3792+
3793+/* DEBUGGING FUNCTIONS.
3794+
3795+ Probably we should also leave them enabled even when
3796+ debugging is turned off, to print dumps on errors.
3797+*/
3798+#if REISER4_DEBUG
3799+static int carry_level_invariant(carry_level * level, carry_queue_state state)
3800+{
3801+ carry_node *node;
3802+ carry_node *tmp_node;
3803+
3804+ if (level == NULL)
3805+ return 0;
3806+
3807+ if (level->track_type != 0 &&
3808+ level->track_type != CARRY_TRACK_NODE &&
3809+ level->track_type != CARRY_TRACK_CHANGE)
3810+ return 0;
3811+
3812+ /* check that nodes are in ascending order */
3813+ for_all_nodes(level, node, tmp_node) {
3814+ znode *left;
3815+ znode *right;
3816+
3817+ reiser4_key lkey;
3818+ reiser4_key rkey;
3819+
3820+ if (node != carry_node_front(level)) {
3821+ if (state == CARRY_TODO) {
3822+ right = node->node;
3823+ left = carry_node_prev(node)->node;
3824+ } else {
3825+ right = carry_real(node);
3826+ left = carry_real(carry_node_prev(node));
3827+ }
3828+ if (right == NULL || left == NULL)
3829+ continue;
3830+ if (node_is_empty(right) || node_is_empty(left))
3831+ continue;
3832+ if (!keyle(leftmost_key_in_node(left, &lkey),
3833+ leftmost_key_in_node(right, &rkey))) {
3834+ warning("", "wrong key order");
3835+ return 0;
3836+ }
3837+ }
3838+ }
3839+ return 1;
3840+}
3841+#endif
3842+
3843+/* get symbolic name for boolean */
3844+static const char *tf(int boolean /* truth value */ )
3845+{
3846+ return boolean ? "t" : "f";
3847+}
3848+
3849+/* symbolic name for carry operation */
3850+static const char *carry_op_name(carry_opcode op /* carry opcode */ )
3851+{
3852+ switch (op) {
3853+ case COP_INSERT:
3854+ return "COP_INSERT";
3855+ case COP_DELETE:
3856+ return "COP_DELETE";
3857+ case COP_CUT:
3858+ return "COP_CUT";
3859+ case COP_PASTE:
3860+ return "COP_PASTE";
3861+ case COP_UPDATE:
3862+ return "COP_UPDATE";
3863+ case COP_EXTENT:
3864+ return "COP_EXTENT";
3865+ case COP_INSERT_FLOW:
3866+ return "COP_INSERT_FLOW";
3867+ default:{
3868+ /* not mt safe, but who cares? */
3869+ static char buf[24];
3870+
3871+ sprintf(buf, "unknown op: %x", op);
3872+ return buf;
3873+ }
3874+ }
3875+}
3876+
3877+/* dump information about carry node */
3878+static void print_carry(const char *prefix /* prefix to print */ ,
3879+ carry_node * node /* node to print */ )
3880+{
3881+ if (node == NULL) {
3882+ printk("%s: null\n", prefix);
3883+ return;
3884+ }
3885+ printk
3886+ ("%s: %p parent: %s, left: %s, unlock: %s, free: %s, dealloc: %s\n",
3887+ prefix, node, tf(node->parent), tf(node->left), tf(node->unlock),
3888+ tf(node->free), tf(node->deallocate));
3889+}
3890+
3891+/* dump information about carry operation */
3892+static void print_op(const char *prefix /* prefix to print */ ,
3893+ carry_op * op /* operation to print */ )
3894+{
3895+ if (op == NULL) {
3896+ printk("%s: null\n", prefix);
3897+ return;
3898+ }
3899+ printk("%s: %p carry_opcode: %s\n", prefix, op, carry_op_name(op->op));
3900+ print_carry("\tnode", op->node);
3901+ switch (op->op) {
3902+ case COP_INSERT:
3903+ case COP_PASTE:
3904+ print_coord("\tcoord",
3905+ op->u.insert.d ? op->u.insert.d->coord : NULL, 0);
3906+ print_key("\tkey", op->u.insert.d ? op->u.insert.d->key : NULL);
3907+ print_carry("\tchild", op->u.insert.child);
3908+ break;
3909+ case COP_DELETE:
3910+ print_carry("\tchild", op->u.delete.child);
3911+ break;
3912+ case COP_CUT:
3913+ if (op->u.cut_or_kill.is_cut) {
3914+ print_coord("\tfrom",
3915+ op->u.cut_or_kill.u.kill->params.from, 0);
3916+ print_coord("\tto", op->u.cut_or_kill.u.kill->params.to,
3917+ 0);
3918+ } else {
3919+ print_coord("\tfrom",
3920+ op->u.cut_or_kill.u.cut->params.from, 0);
3921+ print_coord("\tto", op->u.cut_or_kill.u.cut->params.to,
3922+ 0);
3923+ }
3924+ break;
3925+ case COP_UPDATE:
3926+ print_carry("\tleft", op->u.update.left);
3927+ break;
3928+ default:
3929+ /* do nothing */
3930+ break;
3931+ }
3932+}
3933+
3934+/* dump information about all nodes and operations in a @level */
3935+static void print_level(const char *prefix /* prefix to print */ ,
3936+ carry_level * level /* level to print */ )
3937+{
3938+ carry_node *node;
3939+ carry_node *tmp_node;
3940+ carry_op *op;
3941+ carry_op *tmp_op;
3942+
3943+ if (level == NULL) {
3944+ printk("%s: null\n", prefix);
3945+ return;
3946+ }
3947+ printk("%s: %p, restartable: %s\n",
3948+ prefix, level, tf(level->restartable));
3949+
3950+ for_all_nodes(level, node, tmp_node)
3951+ print_carry("\tcarry node", node);
3952+ for_all_ops(level, op, tmp_op)
3953+ print_op("\tcarry op", op);
3954+}
3955+
3956+/* Make Linus happy.
3957+ Local variables:
3958+ c-indentation-style: "K&R"
3959+ mode-name: "LC"
3960+ c-basic-offset: 8
3961+ tab-width: 8
3962+ fill-column: 120
3963+ scroll-step: 1
3964+ End:
3965+*/
3966Index: linux-2.6.16/fs/reiser4/carry.h
3967===================================================================
3968--- /dev/null
3969+++ linux-2.6.16/fs/reiser4/carry.h
3970@@ -0,0 +1,442 @@
3971+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
3972+
3973+/* Functions and data types to "carry" tree modification(s) upward.
3974+ See fs/reiser4/carry.c for details. */
3975+
3976+#if !defined( __FS_REISER4_CARRY_H__ )
3977+#define __FS_REISER4_CARRY_H__
3978+
3979+#include "forward.h"
3980+#include "debug.h"
3981+#include "pool.h"
3982+#include "znode.h"
3983+
3984+#include <linux/types.h>
3985+
3986+/* &carry_node - "location" of carry node.
3987+
3988+ "location" of node that is involved or going to be involved into
3989+ carry process. Node where operation will be carried to on the
3990+ parent level cannot be recorded explicitly. Operation will be carried
3991+ usually to the parent of some node (where changes are performed at
3992+ the current level) or, to the left neighbor of its parent. But while
3993+ modifications are performed at the current level, parent may
3994+ change. So, we have to allow some indirection (or, positevly,
3995+ flexibility) in locating carry nodes.
3996+
3997+*/
3998+typedef struct carry_node {
3999+ /* pool linkage */
4000+ reiser4_pool_header header;
4001+
4002+ /* base node from which real_node is calculated. See
4003+ fs/reiser4/carry.c:lock_carry_node(). */
4004+ znode *node;
4005+
4006+ /* how to get ->real_node */
4007+ /* to get ->real_node obtain parent of ->node */
4008+ __u32 parent:1;
4009+ /* to get ->real_node obtain left neighbor of parent of
4010+ ->node */
4011+ __u32 left:1;
4012+ __u32 left_before:1;
4013+
4014+ /* locking */
4015+
4016+ /* this node was locked by carry process and should be
4017+ unlocked when carry leaves a level */
4018+ __u32 unlock:1;
4019+
4020+ /* disk block for this node was allocated by carry process and
4021+ should be deallocated when carry leaves a level */
4022+ __u32 deallocate:1;
4023+ /* this carry node was allocated by carry process and should be
4024+ freed when carry leaves a level */
4025+ __u32 free:1;
4026+
4027+ /* lock handle for the lock taken on this node */
4028+ lock_handle lock_handle;
4029+} carry_node;
4030+
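+/* Editor's note (not part of the original patch): the flag bits above
+   combine to locate the target ("real") node roughly as follows:
+
+	parent	left	target of the operation
+	------	----	-----------------------
+	0	0	->node itself
+	1	0	parent of ->node
+	1	1	left neighbor of the parent of ->node
+
+   left_before marks freshly allocated nodes that are not yet linked to a
+   parent (JNODE_ORPHAN); see lock_carry_node() in carry.c for the exact
+   resolution logic. */
+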
4031+/* &carry_opcode - elementary operations that can be carried upward
4032+
4033+ Operations that carry() can handle. This list is supposed to be
4034+ expanded.
4035+
4036+ Each carry operation (cop) is handled by appropriate function defined
4037+ in fs/reiser4/carry.c. For example COP_INSERT is handled by
4038+ fs/reiser4/carry.c:carry_insert() etc. These functions in turn
4039+ call plugins of nodes affected by operation to modify nodes' content
4040+ and to gather operations to be performed on the next level.
4041+
4042+*/
4043+typedef enum {
4044+ /* insert new item into node. */
4045+ COP_INSERT,
4046+ /* delete pointer from parent node */
4047+ COP_DELETE,
4048+ /* remove part of or whole node. */
4049+ COP_CUT,
4050+ /* increase size of item. */
4051+ COP_PASTE,
4052+ /* insert extent (that is sequence of unformatted nodes). */
4053+ COP_EXTENT,
4054+ /* update delimiting key in least common ancestor of two
4055+ nodes. This is performed when items are moved between two
4056+ nodes.
4057+ */
4058+ COP_UPDATE,
4059+ /* insert flow */
4060+ COP_INSERT_FLOW,
4061+ COP_LAST_OP,
4062+} carry_opcode;
4063+
4064+#define CARRY_FLOW_NEW_NODES_LIMIT 20
4065+
4066+/* mode (or subtype) of COP_{INSERT|PASTE} operation. Specifies how target
4067+ item is determined. */
4068+typedef enum {
4069+ /* target item is one containing pointer to the ->child node */
4070+ COPT_CHILD,
4071+ /* target item is given explicitly by @coord */
4072+ COPT_ITEM_DATA,
4073+ /* target item is given by key */
4074+ COPT_KEY,
4075+ /* see insert_paste_common() for more comments on this. */
4076+ COPT_PASTE_RESTARTED,
4077+} cop_insert_pos_type;
4078+
4079+/* flags to cut and delete */
4080+typedef enum {
4081+ /* don't kill node even if it became completely empty as a result of
4082+ * cut. This is needed for eottl handling. See carry_extent() for
4083+ * details. */
4084+ DELETE_RETAIN_EMPTY = (1 << 0)
4085+} cop_delete_flag;
4086+
4087+/*
4088+ * carry() implements "lock handle tracking" feature.
4089+ *
4090+ * Callers supply carry with the node where the initial operation is to be
4091+ * performed and a lock handle on this node. Trying to optimize node
4092+ * utilization, carry may actually move the insertion point to a different
4093+ * node. Callers expect that the lock handle will be transferred there too.
4094+ *
4095+ */
4096+typedef enum {
4097+ /* transfer lock handle along with insertion point */
4098+ CARRY_TRACK_CHANGE = 1,
4099+ /* acquire new lock handle to the node where insertion point is. This
4100+ * is used when carry() client doesn't initially possess lock handle
4101+ * on the insertion point node, for example, by extent insertion
4102+ * code. See carry_extent(). */
4103+ CARRY_TRACK_NODE = 2
4104+} carry_track_type;
4105+
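+/* Editor's sketch (not part of the original patch): how a caller might arm
+   lock handle tracking, as implied by the carry_level fields below and by
+   make_space_tail() in carry_ops.c. Setup and error handling are
+   abbreviated. */
+#if 0
+	carry_level *doing;
+	lock_handle lh;		/* caller's lock on the insertion node */
+
+	/* ... pool and level initialized, @lh holds the insertion node ... */
+	doing->track_type = CARRY_TRACK_CHANGE;
+	doing->tracked = &lh;
+	/* if carry() succeeds, @lh now refers to the node that ended up
+	   holding the insertion point, even if carry moved it */
+#endif
+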
4106+/* data supplied to COP_{INSERT|PASTE} by callers */
4107+typedef struct carry_insert_data {
4108+ /* position where new item is to be inserted */
4109+ coord_t *coord;
4110+ /* new item description */
4111+ reiser4_item_data *data;
4112+ /* key of new item */
4113+ const reiser4_key *key;
4114+} carry_insert_data;
4115+
4116+/* cut and kill are similar, so carry_cut_data and carry_kill_data share the below structure of parameters */
4117+struct cut_kill_params {
4118+ /* coord where cut starts (inclusive) */
4119+ coord_t *from;
4120+ /* coord where cut stops (inclusive, this item/unit will also be
4121+ * cut) */
4122+ coord_t *to;
4123+ /* starting key. This is necessary when item and unit pos don't
4124+ * uniquely identify what portion of the tree to remove. For example,
4125+ * this indicates what portion of an extent unit will be affected. */
4126+ const reiser4_key *from_key;
4127+ /* exclusive stop key */
4128+ const reiser4_key *to_key;
4129+ /* if this is not NULL, smallest actually removed key is stored
4130+ * here. */
4131+ reiser4_key *smallest_removed;
4132+ /* kill_node_content() is called for file truncate */
4133+ int truncate;
4134+};
4135+
4136+struct carry_cut_data {
4137+ struct cut_kill_params params;
4138+};
4139+
4140+struct carry_kill_data {
4141+ struct cut_kill_params params;
4142+ /* parameter to be passed to the ->kill_hook() method of item
4143+ * plugin */
4144+ /*void *iplug_params; *//* FIXME: unused currently */
4145+ /* if not NULL---inode whose items are being removed. This is needed
4146+ * for ->kill_hook() of extent item to update VM structures when
4147+ * removing pages. */
4148+ struct inode *inode;
4149+ /* sibling list maintenance is complicated by existence of eottl. When
4150+ * eottl whose left and right neighbors are formatted leaves is
4151+ * removed, one has to connect said leaves in the sibling list. This
4152+ * cannot be done when extent removal is just started as locking rules
4153+ * require sibling list update to happen atomically with removal of
4154+ * extent item. Therefore: 1. pointers to left and right neighbors
4155+ * have to be passed down to the ->kill_hook() of extent item, and
4156+ * 2. said neighbors have to be locked. */
4157+ lock_handle *left;
4158+ lock_handle *right;
4159+ /* flags modifying behavior of kill. Currently, it may have DELETE_RETAIN_EMPTY set. */
4160+ unsigned flags;
4161+ char *buf;
4162+};
4163+
4164+/* &carry_tree_op - operation to "carry" upward.
4165+
4166+ Description of an operation we want to "carry" to the upper level of
4167+ a tree: e.g, when we insert something and there is not enough space
4168+ we allocate a new node and "carry" the operation of inserting a
4169+ pointer to the new node to the upper level, on removal of empty node,
4170+ we carry up operation of removing appropriate entry from parent.
4171+
4172+ There are two types of carry ops: when adding or deleting a node, the
4173+ node at the parent level where the appropriate modification has to be
4174+ performed is known in advance. When shifting items between nodes
4175+ (split, merge), the delimiting key should be changed in the least common
4176+ parent of the nodes involved, which is not known in advance.
4177+
4178+ For operations of the first type we store in &carry_op a pointer to
4179+ the &carry_node at the parent level. For operations of the second
4180+ type we store &carry_nodes for the parents of the left and right nodes
4181+ modified and keep track of them upward until they coincide.
4182+
4183+*/
4184+typedef struct carry_op {
4185+ /* pool linkage */
4186+ reiser4_pool_header header;
4187+ carry_opcode op;
4188+ /* node on which operation is to be performed:
4189+
4190+ for insert, paste: node where new item is to be inserted
4191+
4192+ for delete: node where pointer is to be deleted
4193+
4194+ for cut: node to cut from
4195+
4196+ for update: node where delimiting key is to be modified
4197+
4198+ for modify: parent of modified node
4199+
4200+ */
4201+ carry_node *node;
4202+ union {
4203+ struct {
4204+ /* (sub-)type of insertion/paste. Taken from
4205+ cop_insert_pos_type. */
4206+ __u8 type;
4207+ /* various operation flags. Taken from
4208+ cop_insert_flag. */
4209+ __u8 flags;
4210+ carry_insert_data *d;
4211+ carry_node *child;
4212+ znode *brother;
4213+ } insert, paste, extent;
4214+
4215+ struct {
4216+ int is_cut;
4217+ union {
4218+ carry_kill_data *kill;
4219+ carry_cut_data *cut;
4220+ } u;
4221+ } cut_or_kill;
4222+
4223+ struct {
4224+ carry_node *left;
4225+ } update;
4226+ struct {
4227+ /* changed child */
4228+ carry_node *child;
4229+ /* bitmask of changes. See &cop_modify_flag */
4230+ __u32 flag;
4231+ } modify;
4232+ struct {
4233+ /* flags to deletion operation. Are taken from
4234+ cop_delete_flag */
4235+ __u32 flags;
4236+ /* child to delete from parent. If this is
4237+ NULL, delete op->node. */
4238+ carry_node *child;
4239+ } delete;
4240+ struct {
4241+ /* various operation flags. Taken from
4242+ cop_insert_flag. */
4243+ __u32 flags;
4244+ flow_t *flow;
4245+ coord_t *insert_point;
4246+ reiser4_item_data *data;
4247+ /* flow insertion is limited by the number of new blocks
4248+ added in that operation which do not get any data
4249+ except part of the flow. This limit is set by the macro
4250+ CARRY_FLOW_NEW_NODES_LIMIT. This field stores the number
4251+ of nodes already added during one carry_flow */
4252+ int new_nodes;
4253+ } insert_flow;
4254+ } u;
4255+} carry_op;
4256+
4257+/* &carry_op_pool - preallocated pool of carry operations, and nodes */
4258+typedef struct carry_pool {
4259+ carry_op op[CARRIES_POOL_SIZE];
4260+ reiser4_pool op_pool;
4261+ carry_node node[NODES_LOCKED_POOL_SIZE];
4262+ reiser4_pool node_pool;
4263+} carry_pool;
4264+
4265+/* &carry_tree_level - carry process on given level
4266+
4267+ Description of balancing process on the given level.
4268+
4269+ No need for locking here, as carry_tree_level is essentially a
4270+ per-thread thing (for now).
4271+
4272+*/
4273+struct carry_level {
4274+ /* this level may be restarted */
4275+ __u32 restartable:1;
4276+ /* list of carry nodes on this level, ordered by key order */
4277+ struct list_head nodes;
4278+ struct list_head ops;
4279+ /* pool where new objects are allocated from */
4280+ carry_pool *pool;
4281+ int ops_num;
4282+ int nodes_num;
4283+ /* new root created on this level, if any */
4284+ znode *new_root;
4285+ /* This is set by caller (insert_by_key(), resize_item(), etc.) when
4286+ they want ->tracked to automagically wander to the node where
4287+ insertion point moved after insert or paste.
4288+ */
4289+ carry_track_type track_type;
4290+ /* lock handle supplied by user that we are tracking. See
4291+ above. */
4292+ lock_handle *tracked;
4293+};
4294+
4295+/* information carry passes to plugin methods that may add new operations to
4296+ the @todo queue */
4297+struct carry_plugin_info {
4298+ carry_level *doing;
4299+ carry_level *todo;
4300+};
4301+
4302+int carry(carry_level * doing, carry_level * done);
4303+
4304+carry_node *add_carry(carry_level * level, pool_ordering order,
4305+ carry_node * reference);
4306+carry_node *add_carry_skip(carry_level * level, pool_ordering order,
4307+ carry_node * reference);
4308+
4309+extern carry_node *insert_carry_node(carry_level * doing,
4310+ carry_level * todo, const znode * node);
4311+
4312+extern carry_pool *init_carry_pool(int);
4313+extern void done_carry_pool(carry_pool * pool);
4314+
4315+extern void init_carry_level(carry_level * level, carry_pool * pool);
4316+
4317+extern carry_op *post_carry(carry_level * level, carry_opcode op, znode * node,
4318+ int apply_to_parent);
4319+extern carry_op *node_post_carry(carry_plugin_info * info, carry_opcode op,
4320+ znode * node, int apply_to_parent_p);
4321+
4322+carry_node *add_new_znode(znode * brother, carry_node * reference,
4323+ carry_level * doing, carry_level * todo);
4324+
4325+carry_node *find_carry_node(carry_level * level, const znode * node);
4326+
4327+extern znode *carry_real(const carry_node * node);
4328+
4329+/* helper macros to iterate over carry queues */
4330+
4331+#define carry_node_next( node ) \
4332+ list_entry((node)->header.level_linkage.next, carry_node, \
4333+ header.level_linkage)
4334+
4335+#define carry_node_prev( node ) \
4336+ list_entry((node)->header.level_linkage.prev, carry_node, \
4337+ header.level_linkage)
4338+
4339+#define carry_node_front( level ) \
4340+ list_entry((level)->nodes.next, carry_node, header.level_linkage)
4341+
4342+#define carry_node_back( level ) \
4343+ list_entry((level)->nodes.prev, carry_node, header.level_linkage)
4344+
4345+#define carry_node_end( level, node ) \
4346+ (&(level)->nodes == &(node)->header.level_linkage)
4347+
4348+/* macro to iterate over all operations in a @level */
4349+#define for_all_ops( level /* carry level (of type carry_level *) */, \
4350+ op /* pointer to carry operation, modified by loop (of \
4351+ * type carry_op *) */, \
4352+ tmp /* pointer to carry operation (of type carry_op *), \
4353+ * used to make iterator stable in the face of \
4354+ * deletions from the level */ ) \
4355+for (op = list_entry(level->ops.next, carry_op, header.level_linkage), \
4356+ tmp = list_entry(op->header.level_linkage.next, carry_op, header.level_linkage); \
4357+ &op->header.level_linkage != &level->ops; \
4358+ op = tmp, \
4359+ tmp = list_entry(op->header.level_linkage.next, carry_op, header.level_linkage))
4360+
4361+#if 0
4362+for( op = ( carry_op * ) pool_level_list_front( &level -> ops ), \
4363+ tmp = ( carry_op * ) pool_level_list_next( &op -> header ) ; \
4364+ ! pool_level_list_end( &level -> ops, &op -> header ) ; \
4365+ op = tmp, tmp = ( carry_op * ) pool_level_list_next( &op -> header ) )
4366+#endif
4367+
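+/* Editor's note (not part of the original patch): @tmp keeps the iterator
+   valid while the current op is freed, e.g. as done_carry_level() in
+   carry.c drains a level:
+
+	for_all_ops(level, op, tmp)
+		reiser4_pool_free(&level->pool->op_pool, &op->header);
+*/
+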
4368+/* macro to iterate over all nodes in a @level */
4369+#define for_all_nodes( level /* carry level (of type carry_level *) */, \
4370+ node /* pointer to carry node, modified by loop (of \
4371+ * type carry_node *) */, \
4372+ tmp /* pointer to carry node (of type carry_node *), \
4373+ * used to make iterator stable in the face of \
4374+ * deletions from the level */ ) \
4375+for (node = list_entry(level->nodes.next, carry_node, header.level_linkage), \
4376+ tmp = list_entry(node->header.level_linkage.next, carry_node, header.level_linkage); \
4377+ &node->header.level_linkage != &level->nodes; \
4378+ node = tmp, \
4379+ tmp = list_entry(node->header.level_linkage.next, carry_node, header.level_linkage))
4380+
4381+#if 0
4382+for( node = carry_node_front( level ), \
4383+ tmp = carry_node_next( node ) ; ! carry_node_end( level, node ) ; \
4384+ node = tmp, tmp = carry_node_next( node ) )
4385+#endif
4386+
4387+/* macro to iterate over all nodes in a @level in reverse order
4388+
4389+ This is used, because nodes are unlocked in reversed order of locking */
4390+#define for_all_nodes_back( level /* carry level (of type carry_level *) */, \
4391+ node /* pointer to carry node, modified by loop \
4392+ * (of type carry_node *) */, \
4393+ tmp /* pointer to carry node (of type carry_node \
4394+ * *), used to make iterator stable in the \
4395+ * face of deletions from the level */ ) \
4396+for( node = carry_node_back( level ), \
4397+ tmp = carry_node_prev( node ) ; ! carry_node_end( level, node ) ; \
4398+ node = tmp, tmp = carry_node_prev( node ) )
4399+
4400+/* __FS_REISER4_CARRY_H__ */
4401+#endif
4402+
4403+/* Make Linus happy.
4404+ Local variables:
4405+ c-indentation-style: "K&R"
4406+ mode-name: "LC"
4407+ c-basic-offset: 8
4408+ tab-width: 8
4409+ fill-column: 120
4410+ scroll-step: 1
4411+ End:
4412+*/
4413Index: linux-2.6.16/fs/reiser4/carry_ops.c
4414===================================================================
4415--- /dev/null
4416+++ linux-2.6.16/fs/reiser4/carry_ops.c
4417@@ -0,0 +1,2103 @@
4418+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
4419+
4420+/* implementation of carry operations */
4421+
4422+#include "forward.h"
4423+#include "debug.h"
4424+#include "key.h"
4425+#include "coord.h"
4426+#include "plugin/item/item.h"
4427+#include "plugin/node/node.h"
4428+#include "jnode.h"
4429+#include "znode.h"
4430+#include "block_alloc.h"
4431+#include "tree_walk.h"
4432+#include "pool.h"
4433+#include "tree_mod.h"
4434+#include "carry.h"
4435+#include "carry_ops.h"
4436+#include "tree.h"
4437+#include "super.h"
4438+#include "reiser4.h"
4439+
4440+#include <linux/types.h>
4441+#include <linux/err.h>
4442+
4443+static int carry_shift_data(sideof side, coord_t * insert_coord, znode * node,
4444+ carry_level * doing, carry_level * todo,
4445+ unsigned int including_insert_coord_p);
4446+
4447+extern int lock_carry_node(carry_level * level, carry_node * node);
4448+extern int lock_carry_node_tail(carry_node * node);
4449+
4450+/* find left neighbor of a carry node
4451+
4452+ Look for left neighbor of @node and add it to the @doing queue. See
4453+ comments in the body.
4454+
4455+*/
4456+static carry_node *find_left_neighbor(carry_op * op /* node to find left
4457+ * neighbor of */ ,
4458+ carry_level * doing /* level to scan */ )
4459+{
4460+ int result;
4461+ carry_node *node;
4462+ carry_node *left;
4463+ int flags;
4464+ reiser4_tree *tree;
4465+
4466+ node = op->node;
4467+
4468+ tree = current_tree;
4469+ read_lock_tree(tree);
4470+ /* first, check whether left neighbor is already in a @doing queue */
4471+ if (carry_real(node)->left != NULL) {
4472+ /* NOTE: there is locking subtlety here. Look into
4473+ * find_right_neighbor() for more info */
4474+ if (find_carry_node(doing, carry_real(node)->left) != NULL) {
4475+ read_unlock_tree(tree);
4476+ left = node;
4477+ do {
4478+ left = list_entry(left->header.level_linkage.prev,
4479+ carry_node, header.level_linkage);
4480+ assert("nikita-3408", !carry_node_end(doing,
4481+ left));
4482+ } while (carry_real(left) == carry_real(node));
4483+ return left;
4484+ }
4485+ }
4486+ read_unlock_tree(tree);
4487+
4488+ left = add_carry_skip(doing, POOLO_BEFORE, node);
4489+ if (IS_ERR(left))
4490+ return left;
4491+
4492+ left->node = node->node;
4493+ left->free = 1;
4494+
4495+ flags = GN_TRY_LOCK;
4496+ if (!op->u.insert.flags & COPI_LOAD_LEFT)
4497+ flags |= GN_NO_ALLOC;
4498+
4499+ /* then, feeling lucky, peek at the left neighbor in the cache. */
4500+ result = reiser4_get_left_neighbor(&left->lock_handle, carry_real(node),
4501+ ZNODE_WRITE_LOCK, flags);
4502+ if (result == 0) {
4503+ /* ok, node found and locked. */
4504+ result = lock_carry_node_tail(left);
4505+ if (result != 0)
4506+ left = ERR_PTR(result);
4507+ } else if (result == -E_NO_NEIGHBOR || result == -ENOENT) {
4508+ /* node is leftmost node in a tree, or neighbor wasn't in
4509+ cache, or there is an extent on the left. */
4510+ reiser4_pool_free(&doing->pool->node_pool, &left->header);
4511+ left = NULL;
4512+ } else if (doing->restartable) {
4513+ /* if left neighbor is locked, and level is restartable, add
4514+ new node to @doing and restart. */
4515+ assert("nikita-913", node->parent != 0);
4516+ assert("nikita-914", node->node != NULL);
4517+ left->left = 1;
4518+ left->free = 0;
4519+ left = ERR_PTR(-E_REPEAT);
4520+ } else {
4521+ /* left neighbor is locked, level cannot be restarted. Just
4522+ ignore left neighbor. */
4523+ reiser4_pool_free(&doing->pool->node_pool, &left->header);
4524+ left = NULL;
4525+ }
4526+ return left;
4527+}
4528+
4529+/* find right neighbor of a carry node
4530+
4531+ Look for right neighbor of @node and add it to the @doing queue. See
4532+ comments in the body.
4533+
4534+*/
4535+static carry_node *find_right_neighbor(carry_op * op /* node to find right
4536+ * neighbor of */ ,
4537+ carry_level * doing /* level to scan */ )
4538+{
4539+ int result;
4540+ carry_node *node;
4541+ carry_node *right;
4542+ lock_handle lh;
4543+ int flags;
4544+ reiser4_tree *tree;
4545+
4546+ init_lh(&lh);
4547+
4548+ node = op->node;
4549+
4550+ tree = current_tree;
4551+ read_lock_tree(tree);
4552+ /* first, check whether right neighbor is already in a @doing queue */
4553+ if (carry_real(node)->right != NULL) {
4554+ /*
4555+ * Tree lock is taken here anyway, because, even if the _outcome_
4556+ * of (find_carry_node() != NULL) doesn't depend on
4557+ * concurrent updates to ->right, find_carry_node() cannot
4558+ * work with a second argument of NULL. Hence, the following
4559+ * comment is of historic importance only.
4560+ *
4561+ * Subtle:
4562+ *
4563+ * Q: why don't we need tree lock here, looking for the right
4564+ * neighbor?
4565+ *
4566+ * A: even if value of node->real_node->right were changed
4567+ * during find_carry_node() execution, outcome of execution
4568+ * wouldn't change, because (in short) other thread cannot add
4569+ * elements to the @doing, and if node->real_node->right
4570+ * already was in @doing, value of node->real_node->right
4571+ * couldn't change, because node cannot be inserted between
4572+ * locked neighbors.
4573+ */
4574+ if (find_carry_node(doing, carry_real(node)->right) != NULL) {
4575+ read_unlock_tree(tree);
4576+ /*
4577+ * What we are doing here (this is also applicable to
4578+ * the find_left_neighbor()).
4579+ *
4580+ * tree_walk.c code requires that insertion of a
4581+ * pointer to a child, modification of parent pointer
4582+ * in the child, and insertion of the child into
4583+ * sibling list are atomic (see
4584+ * plugin/item/internal.c:create_hook_internal()).
4585+ *
4586+ * carry allocates new node long before pointer to it
4587+ * is inserted into parent and, actually, long before
4588+ * parent is even known. Such allocated-but-orphaned
4589+ * nodes are only trackable through carry level lists.
4590+ *
4591+ * Situation that is handled here is following: @node
4592+ * has valid ->right pointer, but there is
4593+ * allocated-but-orphaned node in the carry queue that
4594+ * is logically between @node and @node->right. Here
4595+ * we are searching for it. Critical point is that
4596+ * this is only possible if @node->right is also in
4597+ * the carry queue (this is checked above), because
4598+ * this is the only way new orphaned node could be
4599+ * inserted between them (before inserting new node,
4600+ * make_space() first tries to shift to the right, so,
4601+ * right neighbor will be locked and queued).
4602+ *
4603+ */
4604+ right = node;
4605+ do {
4606+ right = list_entry(right->header.level_linkage.next,
4607+ carry_node, header.level_linkage);
4608+ assert("nikita-3408", !carry_node_end(doing,
4609+ right));
4610+ } while (carry_real(right) == carry_real(node));
4611+ return right;
4612+ }
4613+ }
4614+ read_unlock_tree(tree);
4615+
4616+ flags = GN_CAN_USE_UPPER_LEVELS;
4617+ if (!op->u.insert.flags & COPI_LOAD_RIGHT)
4618+ flags = GN_NO_ALLOC;
4619+
4620+ /* then, try to lock right neighbor */
4621+ init_lh(&lh);
4622+ result = reiser4_get_right_neighbor(&lh, carry_real(node),
4623+ ZNODE_WRITE_LOCK, flags);
4624+ if (result == 0) {
4625+ /* ok, node found and locked. */
4626+ right = add_carry_skip(doing, POOLO_AFTER, node);
4627+ if (!IS_ERR(right)) {
4628+ right->node = lh.node;
4629+ move_lh(&right->lock_handle, &lh);
4630+ right->free = 1;
4631+ result = lock_carry_node_tail(right);
4632+ if (result != 0)
4633+ right = ERR_PTR(result);
4634+ }
4635+ } else if ((result == -E_NO_NEIGHBOR) || (result == -ENOENT)) {
4636+ /* node is rightmost node in a tree, or neighbor wasn't in
4637+ cache, or there is an extent on the right. */
4638+ right = NULL;
4639+ } else
4640+ right = ERR_PTR(result);
4641+ done_lh(&lh);
4642+ return right;
4643+}
4644+
4645+/* how much free space in a @node is needed for @op
4646+
4647+ How much space in @node is required for completion of @op, where @op is
4648+ an insert or paste operation.
4649+*/
4650+static unsigned int space_needed_for_op(znode * node /* znode data are
4651+ * inserted or
4652+ * pasted in */ ,
4653+ carry_op * op /* carry
4654+ operation */ )
4655+{
4656+ assert("nikita-919", op != NULL);
4657+
4658+ switch (op->op) {
4659+ default:
4660+ impossible("nikita-1701", "Wrong opcode");
4661+ case COP_INSERT:
4662+ return space_needed(node, NULL, op->u.insert.d->data, 1);
4663+ case COP_PASTE:
4664+ return space_needed(node, op->u.insert.d->coord,
4665+ op->u.insert.d->data, 0);
4666+ }
4667+}
4668+
4669+/* how much space in @node is required to insert or paste @data at
4670+ @coord. */
4671+unsigned int space_needed(const znode * node /* node data are inserted or
4672+ * pasted in */ ,
4673+ const coord_t * coord /* coord where data are
4674+ * inserted or pasted
4675+ * at */ ,
4676+ const reiser4_item_data * data /* data to insert or
4677+ * paste */ ,
4678+ int insertion /* non-0 if inserting, 0 if pasting */ )
4679+{
4680+ int result;
4681+ item_plugin *iplug;
4682+
4683+ assert("nikita-917", node != NULL);
4684+ assert("nikita-918", node_plugin_by_node(node) != NULL);
4685+ assert("vs-230", !insertion || (coord == NULL));
4686+
4687+ result = 0;
4688+ iplug = data->iplug;
4689+ if (iplug->b.estimate != NULL) {
4690+ /* ask item plugin how much space is needed to insert this
4691+ item */
4692+ result += iplug->b.estimate(insertion ? NULL : coord, data);
4693+ } else {
4694+ /* reasonable default */
4695+ result += data->length;
4696+ }
4697+ if (insertion) {
4698+ node_plugin *nplug;
4699+
4700+ nplug = node->nplug;
4701+ /* and add node overhead */
4702+ if (nplug->item_overhead != NULL) {
4703+ result += nplug->item_overhead(node, NULL);
4704+ }
4705+ }
4706+ return result;
4707+}
4708+
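+/* Editor's note (not part of the original patch): free_space_shortage()
+   below turns this estimate into a signed shortage, essentially
+
+	shortage = space_needed_for_op(node, op) - znode_free_space(node);
+
+   where shortage <= 0 means the operation fits without any balancing (see
+   the early return in make_space()). */
+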
4709+/* find &coord in parent where pointer to new child is to be stored. */
4710+static int find_new_child_coord(carry_op * op /* COP_INSERT carry operation to
4711+ * insert pointer to new
4712+ * child */ )
4713+{
4714+ int result;
4715+ znode *node;
4716+ znode *child;
4717+
4718+ assert("nikita-941", op != NULL);
4719+ assert("nikita-942", op->op == COP_INSERT);
4720+
4721+ node = carry_real(op->node);
4722+ assert("nikita-943", node != NULL);
4723+ assert("nikita-944", node_plugin_by_node(node) != NULL);
4724+
4725+ child = carry_real(op->u.insert.child);
4726+ result =
4727+ find_new_child_ptr(node, child, op->u.insert.brother,
4728+ op->u.insert.d->coord);
4729+
4730+ build_child_ptr_data(child, op->u.insert.d->data);
4731+ return result;
4732+}
4733+
4734+/* additional amount of free space in @node required to complete @op */
4735+static int free_space_shortage(znode * node /* node to check */ ,
4736+ carry_op * op /* operation being performed */ )
4737+{
4738+ assert("nikita-1061", node != NULL);
4739+ assert("nikita-1062", op != NULL);
4740+
4741+ switch (op->op) {
4742+ default:
4743+ impossible("nikita-1702", "Wrong opcode");
4744+ case COP_INSERT:
4745+ case COP_PASTE:
4746+ return space_needed_for_op(node, op) - znode_free_space(node);
4747+ case COP_EXTENT:
4748+ /* when inserting an extent, shift data around until the
4749+ insertion point is at the very edge of the node. */
4750+ if (coord_wrt(op->u.insert.d->coord) == COORD_INSIDE)
4751+ return +1;
4752+ else
4753+ return -1;
4754+ }
4755+}
4756+
4757+/* helper function: update node pointer in operation after insertion
4758+ point was probably shifted into @target. */
4759+static znode *sync_op(carry_op * op, carry_node * target)
4760+{
4761+ znode *insertion_node;
4762+
4763+ /* reget node from coord: shift might move insertion coord to
4764+ the neighbor */
4765+ insertion_node = op->u.insert.d->coord->node;
4766+ /* if insertion point was actually moved into new node,
4767+ update carry node pointer in operation. */
4768+ if (insertion_node != carry_real(op->node)) {
4769+ op->node = target;
4770+ assert("nikita-2540", carry_real(target) == insertion_node);
4771+ }
4772+ assert("nikita-2541",
4773+ carry_real(op->node) == op->u.insert.d->coord->node);
4774+ return insertion_node;
4775+}
4776+
4777+/*
4778+ * complete make_space() call: update tracked lock handle if necessary. See
4779+ * comments for fs/reiser4/carry.h:carry_track_type
4780+ */
4781+static int
4782+make_space_tail(carry_op * op, carry_level * doing, znode * orig_node)
4783+{
4784+ int result;
4785+ carry_track_type tracking;
4786+ znode *node;
4787+
4788+ tracking = doing->track_type;
4789+ node = op->u.insert.d->coord->node;
4790+
4791+ if (tracking == CARRY_TRACK_NODE ||
4792+ (tracking == CARRY_TRACK_CHANGE && node != orig_node)) {
4793+ /* inserting or pasting into node different from
4794+ original. Update lock handle supplied by caller. */
4795+ assert("nikita-1417", doing->tracked != NULL);
4796+ done_lh(doing->tracked);
4797+ init_lh(doing->tracked);
4798+ result = longterm_lock_znode(doing->tracked, node,
4799+ ZNODE_WRITE_LOCK,
4800+ ZNODE_LOCK_HIPRI);
4801+ } else
4802+ result = 0;
4803+ return result;
4804+}
4805+
4806+/* This is the insertion policy function. It shifts data to the left and right
4807+ neighbors of insertion coord and allocates new nodes until there is enough
4808+ free space to complete @op.
4809+
4810+ See comments in the body.
4811+
4812+ Assumes that the node format favors insertions at the right end of the node
4813+ as node40 does.
4814+
4815+ See carry_flow() for details about flow insertion.
4816+*/
4817+static int make_space(carry_op * op /* carry operation, insert or paste */ ,
4818+ carry_level * doing /* current carry queue */ ,
4819+ carry_level * todo /* carry queue on the parent level */ )
4820+{
4821+ znode *node;
4822+ int result;
4823+ int not_enough_space;
4824+ int blk_alloc;
4825+ znode *orig_node;
4826+ __u32 flags;
4827+
4828+ coord_t *coord;
4829+
4830+ assert("nikita-890", op != NULL);
4831+ assert("nikita-891", todo != NULL);
4832+ assert("nikita-892",
4833+ op->op == COP_INSERT ||
4834+ op->op == COP_PASTE || op->op == COP_EXTENT);
4835+ assert("nikita-1607",
4836+ carry_real(op->node) == op->u.insert.d->coord->node);
4837+
4838+ flags = op->u.insert.flags;
4839+
4840+ /* NOTE check that new node can only be allocated after checking left
4841+ * and right neighbors. This is necessary for proper work of
4842+ * find_{left,right}_neighbor(). */
4843+ assert("nikita-3410", ergo(flags & COPI_DONT_ALLOCATE,
4844+ flags & COPI_DONT_SHIFT_LEFT));
4845+ assert("nikita-3411", ergo(flags & COPI_DONT_ALLOCATE,
4846+ flags & COPI_DONT_SHIFT_RIGHT));
4847+
4848+ coord = op->u.insert.d->coord;
4849+ orig_node = node = coord->node;
4850+
4851+ assert("nikita-908", node != NULL);
4852+ assert("nikita-909", node_plugin_by_node(node) != NULL);
4853+
4854+ result = 0;
4855+ /* If there is not enough space in a node, try to shift something to
4856+ the left neighbor. This is a bit tricky, as locking to the left is
4857+ low priority. This is handled by restart logic in carry().
4858+ */
4859+ not_enough_space = free_space_shortage(node, op);
4860+ if (not_enough_space <= 0)
4861+ /* it is possible that carry was called when there actually
4862+ was enough space in the node. For example, when inserting
4863+ leftmost item so that delimiting keys have to be updated.
4864+ */
4865+ return make_space_tail(op, doing, orig_node);
4866+ if (!(flags & COPI_DONT_SHIFT_LEFT)) {
4867+ carry_node *left;
4868+ /* make note in statistics of an attempt to move
4869+ something into the left neighbor */
4870+ left = find_left_neighbor(op, doing);
4871+ if (unlikely(IS_ERR(left))) {
4872+ if (PTR_ERR(left) == -E_REPEAT)
4873+ return -E_REPEAT;
4874+ else {
4875+ /* some error other than restart request
4876+ occurred. This shouldn't happen. Issue a
4877+ warning and continue as if the left
4878+ neighbor didn't exist.
4879+ */
4880+ warning("nikita-924",
4881+ "Error accessing left neighbor: %li",
4882+ PTR_ERR(left));
4883+ }
4884+ } else if (left != NULL) {
4885+
4886+ /* shift everything possible on the left of and
4887+ including insertion coord into the left neighbor */
4888+ result = carry_shift_data(LEFT_SIDE, coord,
4889+ carry_real(left), doing, todo,
4890+ flags & COPI_GO_LEFT);
4891+
4892+ /* reget node from coord: shift_left() might move
4893+ insertion coord to the left neighbor */
4894+ node = sync_op(op, left);
4895+
4896+ not_enough_space = free_space_shortage(node, op);
4897+ /* There is not enough free space in @node but,
4898+ maybe, there is enough free space in
4899+ @left. Various balancing decisions are valid here.
4900+ The same goes for shifting to the right.
4901+ */
4902+ }
4903+ }
4904+ /* If there still is not enough space, shift to the right */
4905+ if (not_enough_space > 0 && !(flags & COPI_DONT_SHIFT_RIGHT)) {
4906+ carry_node *right;
4907+
4908+ right = find_right_neighbor(op, doing);
4909+ if (IS_ERR(right)) {
4910+ warning("nikita-1065",
4911+ "Error accessing right neighbor: %li",
4912+ PTR_ERR(right));
4913+ } else if (right != NULL) {
4914+ /* node containing insertion point, and its right
4915+ neighbor node are write locked by now.
4916+
4917+ shift everything possible on the right of but
4918+ excluding insertion coord into the right neighbor
4919+ */
4920+ result = carry_shift_data(RIGHT_SIDE, coord,
4921+ carry_real(right),
4922+ doing, todo,
4923+ flags & COPI_GO_RIGHT);
4924+ /* reget node from coord: shift_right() might move
4925+ insertion coord to the right neighbor */
4926+ node = sync_op(op, right);
4927+ not_enough_space = free_space_shortage(node, op);
4928+ }
4929+ }
4930+ /* If there is still not enough space, allocate new node(s).
4931+
4932+ We try to allocate new blocks if COPI_DONT_ALLOCATE is not set in
4933+ the carry operation flags (currently this is needed during flush
4934+ only).
4935+ */
4936+ for (blk_alloc = 0;
4937+ not_enough_space > 0 && result == 0 && blk_alloc < 2 &&
4938+ !(flags & COPI_DONT_ALLOCATE); ++blk_alloc) {
4939+ carry_node *fresh; /* new node we are allocating */
4940+ coord_t coord_shadow; /* remembered insertion point before
4941+ * shifting data into new node */
4942+ carry_node *node_shadow; /* remembered insertion node before
4943+ * shifting */
4944+ unsigned int gointo; /* whether insertion point should move
4945+ * into newly allocated node */
4946+
4947+ /* allocate new node on the right of @node. Znode and disk
4948+ fake block number for new node are allocated.
4949+
4950+ add_new_znode() posts carry operation COP_INSERT with
4951+ COPT_CHILD option to the parent level to add
4952+ pointer to newly created node to its parent.
4953+
4954+ Subtle point: if several new nodes are required to complete
4955+ insertion operation at this level, they will be inserted
4956+ into their parents in the order of creation, which means
4957+		   that @node will be a valid "cookie" at the time of insertion.
4958+
4959+ */
4960+ fresh = add_new_znode(node, op->node, doing, todo);
4961+ if (IS_ERR(fresh))
4962+ return PTR_ERR(fresh);
4963+
4964+ /* Try to shift into new node. */
4965+ result = lock_carry_node(doing, fresh);
4966+ zput(carry_real(fresh));
4967+ if (result != 0) {
4968+ warning("nikita-947",
4969+ "Cannot lock new node: %i", result);
4970+ return result;
4971+ }
4972+
4973+ /* both nodes are write locked by now.
4974+
4975+ shift everything possible on the right of and
4976+ including insertion coord into the right neighbor.
4977+ */
4978+ coord_dup(&coord_shadow, op->u.insert.d->coord);
4979+ node_shadow = op->node;
4980+ /* move insertion point into newly created node if:
4981+
4982+ . insertion point is rightmost in the source node, or
4983+ . this is not the first node we are allocating in a row.
4984+ */
4985+ gointo =
4986+ (blk_alloc > 0) ||
4987+ coord_is_after_rightmost(op->u.insert.d->coord);
4988+
4989+ result = carry_shift_data(RIGHT_SIDE, coord, carry_real(fresh),
4990+ doing, todo, gointo);
4991+ /* if insertion point was actually moved into new node,
4992+ update carry node pointer in operation. */
4993+ node = sync_op(op, fresh);
4994+ not_enough_space = free_space_shortage(node, op);
4995+ if ((not_enough_space > 0) && (node != coord_shadow.node)) {
4996+			/* there is not enough free space in the new node.
4997+			   Shift the insertion point back to @node_shadow so
4998+			   that the next new node will be inserted between
4999+			   @node_shadow and @fresh.
5000+			*/
5001+ coord_normalize(&coord_shadow);
5002+ coord_dup(coord, &coord_shadow);
5003+ node = coord->node;
5004+ op->node = node_shadow;
5005+ if (1 || (flags & COPI_STEP_BACK)) {
5006+				/* still not enough space?! Maybe there is
5007+				   enough space in the source node (i.e., the
5008+				   node data are moved from) now.
5009+				 */
5010+ not_enough_space =
5011+ free_space_shortage(node, op);
5012+ }
5013+ }
5014+ }
5015+ if (not_enough_space > 0) {
5016+ if (!(flags & COPI_DONT_ALLOCATE))
5017+ warning("nikita-948", "Cannot insert new item");
5018+ result = -E_NODE_FULL;
5019+ }
5020+ assert("nikita-1622", ergo(result == 0,
5021+ carry_real(op->node) == coord->node));
5022+ assert("nikita-2616", coord == op->u.insert.d->coord);
5023+ if (result == 0)
5024+ result = make_space_tail(op, doing, orig_node);
5025+ return result;
5026+}
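
/*
 * Sketch (editorial illustration, not part of the patch): the strategy
 * order of make_space() above, modeled as self-contained userspace C.
 * A node is reduced to a free-byte counter; toy_shortage() stands in
 * for free_space_shortage(), and the byte constants are arbitrary.
 */
struct toy_node { int free_bytes; };

static int toy_shortage(const struct toy_node *n, int needed)
{
	return needed - n->free_bytes;	/* > 0 means space is still missing */
}

static int toy_make_space(struct toy_node *n, int have_left, int have_right,
			  int needed)
{
	int blk_alloc;

	if (toy_shortage(n, needed) <= 0)
		return 0;			/* fast path: already fits */
	if (have_left)
		n->free_bytes += 40;		/* pretend shift to the left */
	if (toy_shortage(n, needed) > 0 && have_right)
		n->free_bytes += 40;		/* pretend shift to the right */
	/* at most two fresh nodes, mirroring the blk_alloc < 2 loop above */
	for (blk_alloc = 0; toy_shortage(n, needed) > 0 && blk_alloc < 2;
	     ++blk_alloc)
		n->free_bytes += 100;		/* pretend a new empty node */
	return toy_shortage(n, needed) > 0 ? -1 : 0;	/* -1 ~ -E_NODE_FULL */
}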
5027+
5028+/* insert_paste_common() - common part of insert and paste operations
5029+
5030+ This function performs common part of COP_INSERT and COP_PASTE.
5031+
5032+   There are three ways in which insertion/paste can be requested:
5033+
5034+    . by directly supplying reiser4_item_data. In this case, op ->
5035+   u.insert.type is set to COPT_ITEM_DATA.
5036+
5037+    . by supplying a pointer to the child which is to be inserted into the
5038+   parent. In this case op -> u.insert.type == COPT_CHILD.
5039+
5040+    . by supplying the key of the new item/unit. This is currently only used
5041+   during extent insertion.
5042+
5043+   This is required because, when a new node is allocated, we don't know at
5044+   what position the pointer to it is to be stored in the parent. Actually,
5045+   we don't even know what its parent will be, because the parent can be
5046+   re-balanced concurrently and the new node re-parented, and because the
5047+   parent can be full and the pointer to the new node will go into some other node.
5048+
5049+   insert_paste_common() resolves a pointer to a child node into a position
5050+   in the parent by calling find_new_child_coord(), which fills
5051+   reiser4_item_data. After this, insertion/paste proceeds uniformly.
5052+
5053+   Another complication is with finding free space during pasting. It may
5054+   happen that, while shifting items to the neighbors and newly allocated
5055+   nodes, the insertion coord ends up outside the item we wanted to paste
5056+   into. At this point, paste becomes (morphs into) an insert. Moreover, the free
5057+   space analysis has to be repeated, because the amount of space required for
5058+   insertion is different from that of paste (item header overhead, etc).
5059+
5060+ This function "unifies" different insertion modes (by resolving child
5061+ pointer or key into insertion coord), and then calls make_space() to free
5062+ enough space in the node by shifting data to the left and right and by
5063+ allocating new nodes if necessary. Carry operation knows amount of space
5064+ required for its completion. After enough free space is obtained, caller of
5065+ this function (carry_{insert,paste,etc.}) performs actual insertion/paste
5066+ by calling item plugin method.
5067+
5068+*/
5069+static int insert_paste_common(carry_op * op /* carry operation being
5070+ * performed */ ,
5071+ carry_level * doing /* current carry level */ ,
5072+ carry_level * todo /* next carry level */ ,
5073+ carry_insert_data * cdata /* pointer to
5074+ * cdata */ ,
5075+ coord_t * coord /* insertion/paste coord */ ,
5076+ reiser4_item_data * data /* data to be
5077+ * inserted/pasted */ )
5078+{
5079+ assert("nikita-981", op != NULL);
5080+ assert("nikita-980", todo != NULL);
5081+ assert("nikita-979", (op->op == COP_INSERT) || (op->op == COP_PASTE)
5082+ || (op->op == COP_EXTENT));
5083+
5084+ if (op->u.insert.type == COPT_PASTE_RESTARTED) {
5085+ /* nothing to do. Fall through to make_space(). */
5086+ ;
5087+ } else if (op->u.insert.type == COPT_KEY) {
5088+ node_search_result intra_node;
5089+ znode *node;
5090+		/* The problem with doing batching at the lowest level is that
5091+		   operations here are given by coords where the modification
5092+		   is to be performed, and one modification can invalidate the
5093+		   coords of all following operations.
5094+
5095+		   So we implement yet another operation type that uses the
5096+		   only "locator" stable across shifting of data between
5097+		   nodes, etc.: a key (COPT_KEY).
5098+
5099+		   This clause resolves the key to a coord in the node.
5100+
5101+		   But the node can change too. Probably some pieces have to be
5102+		   added to lock_carry_node() to lock a node by its key.
5103+
5104+		*/
5105+ /* NOTE-NIKITA Lookup bias is fixed to FIND_EXACT. Complain
5106+ if you need something else. */
5107+ op->u.insert.d->coord = coord;
5108+ node = carry_real(op->node);
5109+ intra_node = node_plugin_by_node(node)->lookup
5110+ (node, op->u.insert.d->key, FIND_EXACT,
5111+ op->u.insert.d->coord);
5112+ if ((intra_node != NS_FOUND) && (intra_node != NS_NOT_FOUND)) {
5113+ warning("nikita-1715", "Intra node lookup failure: %i",
5114+ intra_node);
5115+ return intra_node;
5116+ }
5117+ } else if (op->u.insert.type == COPT_CHILD) {
5118+		/* if we are asked to insert a pointer to the child into an
5119+		   internal node, first convert the pointer to the child into
5120+		   a coord within the parent node.
5121+		 */
5122+ znode *child;
5123+ int result;
5124+
5125+ op->u.insert.d = cdata;
5126+ op->u.insert.d->coord = coord;
5127+ op->u.insert.d->data = data;
5128+ op->u.insert.d->coord->node = carry_real(op->node);
5129+ result = find_new_child_coord(op);
5130+ child = carry_real(op->u.insert.child);
5131+ if (result != NS_NOT_FOUND) {
5132+ warning("nikita-993",
5133+ "Cannot find a place for child pointer: %i",
5134+ result);
5135+ return result;
5136+ }
5137+		/* This only happens when we did multiple insertions at
5138+		   the previous level, trying to insert a single item, and
5139+		   it so happened that insertion of pointers to all new
5140+		   nodes before this one already caused the parent node to
5141+		   split (maybe several times).
5142+
5143+		   I am going to come up with a better solution.
5144+
5145+		   You are not expected to understand this.
5146+		          -- v6root/usr/sys/ken/slp.c
5147+
5148+ Basically, what happens here is the following: carry came
5149+ to the parent level and is about to insert internal item
5150+ pointing to the child node that it just inserted in the
5151+ level below. Position where internal item is to be inserted
5152+ was found by find_new_child_coord() above, but node of the
5153+ current carry operation (that is, parent node of child
5154+ inserted on the previous level), was determined earlier in
5155+		   the lock_carry_level/lock_carry_node. It could so happen
5156+		   that other carry operations already performed on the parent
5157+		   level have split the parent node, so that the insertion
5158+		   point moved into another node. Handle this by creating a new
5159+		   carry node for the insertion point if necessary.
5160+ */
5161+ if (carry_real(op->node) != op->u.insert.d->coord->node) {
5162+ pool_ordering direction;
5163+ znode *z1;
5164+ znode *z2;
5165+ reiser4_key k1;
5166+ reiser4_key k2;
5167+
5168+ /*
5169+ * determine in what direction insertion point
5170+ * moved. Do this by comparing delimiting keys.
5171+ */
5172+ z1 = op->u.insert.d->coord->node;
5173+ z2 = carry_real(op->node);
5174+ if (keyle(leftmost_key_in_node(z1, &k1),
5175+ leftmost_key_in_node(z2, &k2)))
5176+ /* insertion point moved to the left */
5177+ direction = POOLO_BEFORE;
5178+ else
5179+ /* insertion point moved to the right */
5180+ direction = POOLO_AFTER;
5181+
5182+ op->node = add_carry_skip(doing, direction, op->node);
5183+ if (IS_ERR(op->node))
5184+ return PTR_ERR(op->node);
5185+ op->node->node = op->u.insert.d->coord->node;
5186+ op->node->free = 1;
5187+ result = lock_carry_node(doing, op->node);
5188+ if (result != 0)
5189+ return result;
5190+ }
5191+
5192+ /*
5193+		 * set up the key of the item being inserted: we are inserting
5194+		 * an internal item and its key is (by the very definition of a
5195+		 * search tree) the leftmost key in the child node.
5196+ */
5197+ write_lock_dk(znode_get_tree(child));
5198+ op->u.insert.d->key = leftmost_key_in_node(child,
5199+ znode_get_ld_key(child));
5200+ write_unlock_dk(znode_get_tree(child));
5201+ op->u.insert.d->data->arg = op->u.insert.brother;
5202+ } else {
5203+ assert("vs-243", op->u.insert.d->coord != NULL);
5204+ op->u.insert.d->coord->node = carry_real(op->node);
5205+ }
5206+
5207+ /* find free space. */
5208+ return make_space(op, doing, todo);
5209+}
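
/*
 * Sketch (editorial illustration, not part of the patch): the three
 * request modes that insert_paste_common() unifies, as a toy enum and
 * dispatch.  The names are stand-ins for COPT_ITEM_DATA, COPT_CHILD and
 * COPT_KEY; only the shape of the dispatch is meant to carry over.
 */
enum toy_insert_type { TOY_ITEM_DATA, TOY_CHILD, TOY_KEY };

static const char *toy_resolve(enum toy_insert_type type)
{
	switch (type) {
	case TOY_ITEM_DATA:
		return "coord and item data supplied directly";
	case TOY_CHILD:
		return "child pointer resolved to a coord in the parent";
	case TOY_KEY:
		return "key looked up to a coord in the node";
	default:
		return "unknown request mode";
	}
}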
5210+
5211+/* handle carry COP_INSERT operation.
5212+
5213+   Insert a new item into a node. The new item can be given in one of two ways:
5214+
5215+   - by passing &tree_coord and &reiser4_item_data as part of @op. This is
5216+   only applicable at the leaf/twig level.
5217+
5218+   - by passing a pointer to the child node which is to be inserted by this
5219+   operation.
5220+
5221+*/
5222+static int carry_insert(carry_op * op /* operation to perform */ ,
5223+ carry_level * doing /* queue of operations @op
5224+ * is part of */ ,
5225+ carry_level * todo /* queue where new operations
5226+ * are accumulated */ )
5227+{
5228+ znode *node;
5229+ carry_insert_data cdata;
5230+ coord_t coord;
5231+ reiser4_item_data data;
5232+ carry_plugin_info info;
5233+ int result;
5234+
5235+ assert("nikita-1036", op != NULL);
5236+ assert("nikita-1037", todo != NULL);
5237+ assert("nikita-1038", op->op == COP_INSERT);
5238+
5239+ coord_init_zero(&coord);
5240+
5241+ /* perform common functionality of insert and paste. */
5242+ result = insert_paste_common(op, doing, todo, &cdata, &coord, &data);
5243+ if (result != 0)
5244+ return result;
5245+
5246+ node = op->u.insert.d->coord->node;
5247+ assert("nikita-1039", node != NULL);
5248+ assert("nikita-1040", node_plugin_by_node(node) != NULL);
5249+
5250+ assert("nikita-949",
5251+ space_needed_for_op(node, op) <= znode_free_space(node));
5252+
5253+ /* ask node layout to create new item. */
5254+ info.doing = doing;
5255+ info.todo = todo;
5256+ result = node_plugin_by_node(node)->create_item
5257+ (op->u.insert.d->coord, op->u.insert.d->key, op->u.insert.d->data,
5258+ &info);
5259+ doing->restartable = 0;
5260+ znode_make_dirty(node);
5261+
5262+ return result;
5263+}
5264+
5265+/*
5266+ * Flow insertion code. COP_INSERT_FLOW is a special tree operation that is
5267+ * supplied with a "flow" (that is, a stream of data) and inserts it into the
5268+ * tree by slicing it into multiple items.
5269+ */
5270+
5271+#define flow_insert_point(op) ( ( op ) -> u.insert_flow.insert_point )
5272+#define flow_insert_flow(op) ( ( op ) -> u.insert_flow.flow )
5273+#define flow_insert_data(op) ( ( op ) -> u.insert_flow.data )
5274+
5275+static size_t item_data_overhead(carry_op * op)
5276+{
5277+ if (flow_insert_data(op)->iplug->b.estimate == NULL)
5278+ return 0;
5279+ return (flow_insert_data(op)->iplug->b.
5280+ estimate(NULL /* estimate insertion */ , flow_insert_data(op)) -
5281+ flow_insert_data(op)->length);
5282+}
5283+
5284+/* FIXME-VS: this is called several times during one make_flow_for_insertion
5285+ and it will always return the same result. Some optimization could be made
5286+ by calculating this value once at the beginning and passing it around. That
5287+   would reduce some flexibility for future changes.
5288+*/
5289+static int can_paste(coord_t *, const reiser4_key *, const reiser4_item_data *);
5290+static size_t flow_insertion_overhead(carry_op * op)
5291+{
5292+ znode *node;
5293+ size_t insertion_overhead;
5294+
5295+ node = flow_insert_point(op)->node;
5296+ insertion_overhead = 0;
5297+ if (node->nplug->item_overhead &&
5298+ !can_paste(flow_insert_point(op), &flow_insert_flow(op)->key,
5299+ flow_insert_data(op)))
5300+ insertion_overhead =
5301+ node->nplug->item_overhead(node, NULL) +
5302+ item_data_overhead(op);
5303+ return insertion_overhead;
5304+}
5305+
5306+/* how many bytes of the flow fit into the node */
5307+static int what_can_fit_into_node(carry_op * op)
5308+{
5309+ size_t free, overhead;
5310+
5311+ overhead = flow_insertion_overhead(op);
5312+ free = znode_free_space(flow_insert_point(op)->node);
5313+ if (free <= overhead)
5314+ return 0;
5315+ free -= overhead;
5316+	/* FIXME: flow->length is loff_t only so that it does not overflow in case of an expanding truncate */
5317+ if (free < op->u.insert_flow.flow->length)
5318+ return free;
5319+ return (int)op->u.insert_flow.flow->length;
5320+}
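
/*
 * Sketch (editorial illustration, not part of the patch): the arithmetic
 * of what_can_fit_into_node() above in isolation -- subtract the per-item
 * overhead from the node's free space and clamp to the remaining flow
 * length.  Toy parameters only.
 */
static unsigned long toy_what_fits(unsigned long free_space,
				   unsigned long overhead,
				   unsigned long flow_len)
{
	if (free_space <= overhead)
		return 0;	/* not even the item overhead fits */
	free_space -= overhead;
	return free_space < flow_len ? free_space : flow_len;
}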
5321+
5322+/* in make_space_for_flow_insertion we need to check either whether the whole
5323+   flow fits into a node or whether a minimal fraction of the flow fits */
5324+static int enough_space_for_whole_flow(carry_op * op)
5325+{
5326+ return (unsigned)what_can_fit_into_node(op) ==
5327+ op->u.insert_flow.flow->length;
5328+}
5329+
5330+#define MIN_FLOW_FRACTION 1
5331+static int enough_space_for_min_flow_fraction(carry_op * op)
5332+{
5333+ assert("vs-902", coord_is_after_rightmost(flow_insert_point(op)));
5334+
5335+ return what_can_fit_into_node(op) >= MIN_FLOW_FRACTION;
5336+}
5337+
5338+/* this returns 0 if the left neighbor was obtained successfully, everything
5339+   up to and including the insertion point was shifted into it, and the left
5340+   neighbor still has enough free space for a minimal fraction of the flow */
5341+static int
5342+make_space_by_shift_left(carry_op * op, carry_level * doing, carry_level * todo)
5343+{
5344+ carry_node *left;
5345+ znode *orig;
5346+
5347+ left = find_left_neighbor(op, doing);
5348+ if (unlikely(IS_ERR(left))) {
5349+ warning("vs-899",
5350+ "make_space_by_shift_left: "
5351+ "error accessing left neighbor: %li", PTR_ERR(left));
5352+ return 1;
5353+ }
5354+ if (left == NULL)
5355+		/* the left neighbor either does not exist or is an
5356+		   unformatted node */
5357+ return 1;
5358+
5359+ orig = flow_insert_point(op)->node;
5360+	/* try to shift the content of node @orig from its head up to and
5361+	   including the insert point into the left neighbor */
5362+ carry_shift_data(LEFT_SIDE, flow_insert_point(op), carry_real(left), doing, todo, 1 /* including insert
5363+ * point */ );
5364+ if (carry_real(left) != flow_insert_point(op)->node) {
5365+ /* insertion point did not move */
5366+ return 1;
5367+ }
5368+
5369+ /* insertion point is set after last item in the node */
5370+ assert("vs-900", coord_is_after_rightmost(flow_insert_point(op)));
5371+
5372+ if (!enough_space_for_min_flow_fraction(op)) {
5373+		/* the insertion point node does not have enough free space to
5374+		   hold even a minimal portion of the flow; therefore, move the
5375+		   insertion point back to @orig (before the first item) */
5376+ coord_init_before_first_item(flow_insert_point(op), orig);
5377+ return 1;
5378+ }
5379+
5380+ /* part of flow is to be written to the end of node */
5381+ op->node = left;
5382+ return 0;
5383+}
5384+
5385+/* this returns 0 if the right neighbor was obtained successfully, everything
5386+   to the right of the insertion point was shifted to it, and the node got
5387+   enough free space for a minimal fraction of the flow */
5388+static int
5389+make_space_by_shift_right(carry_op * op, carry_level * doing,
5390+ carry_level * todo)
5391+{
5392+ carry_node *right;
5393+
5394+ right = find_right_neighbor(op, doing);
5395+ if (unlikely(IS_ERR(right))) {
5396+ warning("nikita-1065", "shift_right_excluding_insert_point: "
5397+ "error accessing right neighbor: %li", PTR_ERR(right));
5398+ return 1;
5399+ }
5400+ if (right) {
5401+ /* shift everything possible on the right of but excluding
5402+ insertion coord into the right neighbor */
5403+ carry_shift_data(RIGHT_SIDE, flow_insert_point(op), carry_real(right), doing, todo, 0 /* not
5404+ * including
5405+ * insert
5406+ * point */ );
5407+ } else {
5408+		/* the right neighbor either does not exist or is an
5409+		   unformatted node */
5410+ ;
5411+ }
5412+ if (coord_is_after_rightmost(flow_insert_point(op))) {
5413+ if (enough_space_for_min_flow_fraction(op)) {
5414+ /* part of flow is to be written to the end of node */
5415+ return 0;
5416+ }
5417+ }
5418+
5419+	/* a new node is to be added if the insert point node did not get
5420+	   enough space for the whole flow */
5421+ return 1;
5422+}
5423+
5424+/* this returns 0 when the insert coord is set at the node end and a fraction
5425+   of the flow fits into that node */
5426+static int
5427+make_space_by_new_nodes(carry_op * op, carry_level * doing, carry_level * todo)
5428+{
5429+ int result;
5430+ znode *node;
5431+ carry_node *new;
5432+
5433+ node = flow_insert_point(op)->node;
5434+
5435+ if (op->u.insert_flow.new_nodes == CARRY_FLOW_NEW_NODES_LIMIT)
5436+ return RETERR(-E_NODE_FULL);
5437+ /* add new node after insert point node */
5438+ new = add_new_znode(node, op->node, doing, todo);
5439+ if (unlikely(IS_ERR(new))) {
5440+ return PTR_ERR(new);
5441+ }
5442+ result = lock_carry_node(doing, new);
5443+ zput(carry_real(new));
5444+ if (unlikely(result)) {
5445+ return result;
5446+ }
5447+ op->u.insert_flow.new_nodes++;
5448+ if (!coord_is_after_rightmost(flow_insert_point(op))) {
5449+ carry_shift_data(RIGHT_SIDE, flow_insert_point(op), carry_real(new), doing, todo, 0 /* not
5450+ * including
5451+ * insert
5452+ * point */ );
5453+
5454+ assert("vs-901",
5455+ coord_is_after_rightmost(flow_insert_point(op)));
5456+
5457+ if (enough_space_for_min_flow_fraction(op)) {
5458+ return 0;
5459+ }
5460+ if (op->u.insert_flow.new_nodes == CARRY_FLOW_NEW_NODES_LIMIT)
5461+ return RETERR(-E_NODE_FULL);
5462+
5463+ /* add one more new node */
5464+ new = add_new_znode(node, op->node, doing, todo);
5465+ if (unlikely(IS_ERR(new))) {
5466+ return PTR_ERR(new);
5467+ }
5468+ result = lock_carry_node(doing, new);
5469+ zput(carry_real(new));
5470+ if (unlikely(result)) {
5471+ return result;
5472+ }
5473+ op->u.insert_flow.new_nodes++;
5474+ }
5475+
5476+ /* move insertion point to new node */
5477+ coord_init_before_first_item(flow_insert_point(op), carry_real(new));
5478+ op->node = new;
5479+ return 0;
5480+}
5481+
5482+static int
5483+make_space_for_flow_insertion(carry_op * op, carry_level * doing,
5484+ carry_level * todo)
5485+{
5486+ __u32 flags = op->u.insert_flow.flags;
5487+
5488+ if (enough_space_for_whole_flow(op)) {
5489+ /* whole flow fits into insert point node */
5490+ return 0;
5491+ }
5492+
5493+ if (!(flags & COPI_DONT_SHIFT_LEFT)
5494+ && (make_space_by_shift_left(op, doing, todo) == 0)) {
5495+ /* insert point is shifted to left neighbor of original insert
5496+ point node and is set after last unit in that node. It has
5497+ enough space to fit at least minimal fraction of flow. */
5498+ return 0;
5499+ }
5500+
5501+ if (enough_space_for_whole_flow(op)) {
5502+ /* whole flow fits into insert point node */
5503+ return 0;
5504+ }
5505+
5506+ if (!(flags & COPI_DONT_SHIFT_RIGHT)
5507+ && (make_space_by_shift_right(op, doing, todo) == 0)) {
5508+ /* insert point is still set to the same node, but there is
5509+ nothing to the right of insert point. */
5510+ return 0;
5511+ }
5512+
5513+ if (enough_space_for_whole_flow(op)) {
5514+ /* whole flow fits into insert point node */
5515+ return 0;
5516+ }
5517+
5518+ return make_space_by_new_nodes(op, doing, todo);
5519+}
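
/*
 * Sketch (editorial illustration, not part of the patch): the "cheapest
 * strategy first" cascade of make_space_for_flow_insertion() above, with
 * toy function pointers.  Between steps the whole-flow check is repeated,
 * because an earlier shift may already have freed enough space.
 */
typedef int (*toy_strategy)(void *op);

static int toy_flow_cascade(void *op, int (*whole_flow_fits)(void *op),
			    toy_strategy shift_left, toy_strategy shift_right,
			    toy_strategy add_nodes)
{
	if (whole_flow_fits(op))
		return 0;
	if (shift_left(op) == 0)	/* the left neighbor took enough */
		return 0;
	if (whole_flow_fits(op))
		return 0;
	if (shift_right(op) == 0)	/* the right neighbor took the tail */
		return 0;
	if (whole_flow_fits(op))
		return 0;
	return add_nodes(op);		/* last resort: allocate new nodes */
}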
5520+
5521+/* implements COP_INSERT_FLOW operation */
5522+static int
5523+carry_insert_flow(carry_op * op, carry_level * doing, carry_level * todo)
5524+{
5525+ int result;
5526+ flow_t *f;
5527+ coord_t *insert_point;
5528+ node_plugin *nplug;
5529+ carry_plugin_info info;
5530+ znode *orig_node;
5531+ lock_handle *orig_lh;
5532+
5533+ f = op->u.insert_flow.flow;
5534+ result = 0;
5535+
5536+ /* carry system needs this to work */
5537+ info.doing = doing;
5538+ info.todo = todo;
5539+
5540+ orig_node = flow_insert_point(op)->node;
5541+ orig_lh = doing->tracked;
5542+
5543+ while (f->length) {
5544+ result = make_space_for_flow_insertion(op, doing, todo);
5545+ if (result)
5546+ break;
5547+
5548+ insert_point = flow_insert_point(op);
5549+ nplug = node_plugin_by_node(insert_point->node);
5550+
5551+ /* compose item data for insertion/pasting */
5552+ flow_insert_data(op)->data = f->data;
5553+ flow_insert_data(op)->length = what_can_fit_into_node(op);
5554+
5555+ if (can_paste(insert_point, &f->key, flow_insert_data(op))) {
5556+			/* the insert point is set to an item of the file we are writing to, and we have to append to it */
5557+ assert("vs-903", insert_point->between == AFTER_UNIT);
5558+ nplug->change_item_size(insert_point,
5559+ flow_insert_data(op)->length);
5560+ flow_insert_data(op)->iplug->b.paste(insert_point,
5561+ flow_insert_data
5562+ (op), &info);
5563+ } else {
5564+ /* new item must be inserted */
5565+ pos_in_node_t new_pos;
5566+ flow_insert_data(op)->length += item_data_overhead(op);
5567+
5568+ /* FIXME-VS: this is because node40_create_item changes
5569+ insert_point for obscure reasons */
5570+ switch (insert_point->between) {
5571+ case AFTER_ITEM:
5572+ new_pos = insert_point->item_pos + 1;
5573+ break;
5574+ case EMPTY_NODE:
5575+ new_pos = 0;
5576+ break;
5577+ case BEFORE_ITEM:
5578+ assert("vs-905", insert_point->item_pos == 0);
5579+ new_pos = 0;
5580+ break;
5581+ default:
5582+ impossible("vs-906",
5583+ "carry_insert_flow: invalid coord");
5584+ new_pos = 0;
5585+ break;
5586+ }
5587+
5588+ nplug->create_item(insert_point, &f->key,
5589+ flow_insert_data(op), &info);
5590+ coord_set_item_pos(insert_point, new_pos);
5591+ }
5592+ coord_init_after_item_end(insert_point);
5593+ doing->restartable = 0;
5594+ znode_make_dirty(insert_point->node);
5595+
5596+ move_flow_forward(f, (unsigned)flow_insert_data(op)->length);
5597+ }
5598+
5599+ if (orig_node != flow_insert_point(op)->node) {
5600+ /* move lock to new insert point */
5601+ done_lh(orig_lh);
5602+ init_lh(orig_lh);
5603+ result =
5604+ longterm_lock_znode(orig_lh, flow_insert_point(op)->node,
5605+ ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI);
5606+ }
5607+
5608+ return result;
5609+}
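
/*
 * Sketch (editorial illustration, not part of the patch): the
 * flow-slicing loop of carry_insert_flow() in miniature.  Each iteration
 * stands for "make space, write what fits, advance the flow"; node_cap
 * plays the role of what_can_fit_into_node() (assumed > 0), and
 * write_chunk stands in for create_item()/paste().
 */
static void toy_insert_flow(const char *data, unsigned long len,
			    unsigned long node_cap,
			    void (*write_chunk)(const char *, unsigned long))
{
	while (len > 0) {
		unsigned long chunk = len < node_cap ? len : node_cap;

		write_chunk(data, chunk);	/* ~ create_item()/paste() */
		data += chunk;			/* ~ move_flow_forward() */
		len -= chunk;
	}
}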
5610+
5611+/* implements COP_DELETE operation
5612+
5613+   Remove the pointer to @op -> u.delete.child from its parent.
5614+
5615+   This function also handles killing of the tree root if the last pointer
5616+   was removed from it. This is complicated by our handling of the "twig"
5617+   level: a root on the twig level is never killed.
5618+
5619+*/
5620+static int carry_delete(carry_op * op /* operation to be performed */ ,
5621+ carry_level * doing UNUSED_ARG /* current carry
5622+ * level */ ,
5623+ carry_level * todo /* next carry level */ )
5624+{
5625+ int result;
5626+ coord_t coord;
5627+ coord_t coord2;
5628+ znode *parent;
5629+ znode *child;
5630+ carry_plugin_info info;
5631+ reiser4_tree *tree;
5632+
5633+ /*
5634+ * This operation is called to delete internal item pointing to the
5635+ * child node that was removed by carry from the tree on the previous
5636+ * tree level.
5637+ */
5638+
5639+ assert("nikita-893", op != NULL);
5640+ assert("nikita-894", todo != NULL);
5641+ assert("nikita-895", op->op == COP_DELETE);
5642+
5643+ coord_init_zero(&coord);
5644+ coord_init_zero(&coord2);
5645+
5646+ parent = carry_real(op->node);
5647+ child = op->u.delete.child ?
5648+ carry_real(op->u.delete.child) : op->node->node;
5649+ tree = znode_get_tree(child);
5650+ read_lock_tree(tree);
5651+
5652+ /*
5653+ * @parent was determined when carry entered parent level
5654+ * (lock_carry_level/lock_carry_node). Since then, actual parent of
5655+ * @child node could change due to other carry operations performed on
5656+ * the parent level. Check for this.
5657+ */
5658+
5659+ if (znode_parent(child) != parent) {
5660+ /* NOTE-NIKITA add stat counter for this. */
5661+ parent = znode_parent(child);
5662+ assert("nikita-2581", find_carry_node(doing, parent));
5663+ }
5664+ read_unlock_tree(tree);
5665+
5666+ assert("nikita-1213", znode_get_level(parent) > LEAF_LEVEL);
5667+
5668+	/* Twig level horrors: the tree should be of height at least 2. So the
5669+	   last pointer from the root at the twig level is preserved even if
5670+	   the child is empty. This is ugly, but that is how it was architected.
5671+ */
5672+
5673+ if (znode_is_root(parent) &&
5674+ znode_get_level(parent) <= REISER4_MIN_TREE_HEIGHT &&
5675+ node_num_items(parent) == 1) {
5676+ /* Delimiting key manipulations. */
5677+ write_lock_dk(tree);
5678+ znode_set_ld_key(child, znode_set_ld_key(parent, min_key()));
5679+ znode_set_rd_key(child, znode_set_rd_key(parent, max_key()));
5680+ ZF_SET(child, JNODE_DKSET);
5681+ write_unlock_dk(tree);
5682+
5683+ /* @child escaped imminent death! */
5684+ ZF_CLR(child, JNODE_HEARD_BANSHEE);
5685+ return 0;
5686+ }
5687+
5688+ /* convert child pointer to the coord_t */
5689+ result = find_child_ptr(parent, child, &coord);
5690+ if (result != NS_FOUND) {
5691+ warning("nikita-994", "Cannot find child pointer: %i", result);
5692+ print_coord_content("coord", &coord);
5693+ return result;
5694+ }
5695+
5696+ coord_dup(&coord2, &coord);
5697+ info.doing = doing;
5698+ info.todo = todo;
5699+ {
5700+ /*
5701+ * Actually kill internal item: prepare structure with
5702+ * arguments for ->cut_and_kill() method...
5703+ */
5704+
5705+ struct carry_kill_data kdata;
5706+ kdata.params.from = &coord;
5707+ kdata.params.to = &coord2;
5708+ kdata.params.from_key = NULL;
5709+ kdata.params.to_key = NULL;
5710+ kdata.params.smallest_removed = NULL;
5711+ kdata.params.truncate = 1;
5712+ kdata.flags = op->u.delete.flags;
5713+ kdata.inode = NULL;
5714+ kdata.left = NULL;
5715+ kdata.right = NULL;
5716+ kdata.buf = NULL;
5717+ /* ... and call it. */
5718+ result = node_plugin_by_node(parent)->cut_and_kill(&kdata,
5719+ &info);
5720+ }
5721+ doing->restartable = 0;
5722+
5723+ /* check whether root should be killed violently */
5724+ if (znode_is_root(parent) &&
5725+	    /* don't kill roots at or below the twig level */
5726+ znode_get_level(parent) > REISER4_MIN_TREE_HEIGHT &&
5727+ node_num_items(parent) == 1) {
5728+ result = kill_tree_root(coord.node);
5729+ }
5730+
5731+ return result < 0 ? : 0;
5732+}
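
/*
 * Sketch (editorial illustration, not part of the patch): the
 * root-preservation test from carry_delete() above.  A root at or below
 * the twig level keeps its last child pointer, so the tree never drops
 * below height 2; the constant is a stand-in for REISER4_MIN_TREE_HEIGHT.
 */
#define TOY_MIN_TREE_HEIGHT 2

static int toy_root_must_survive(int is_root, int level, int nr_items)
{
	return is_root && level <= TOY_MIN_TREE_HEIGHT && nr_items == 1;
}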
5733+
5734+/* implements COP_CUT operation
5735+
5736+   Cuts part or the whole content of a node.
5737+
5738+*/
5739+static int carry_cut(carry_op * op /* operation to be performed */ ,
5740+ carry_level * doing /* current carry level */ ,
5741+ carry_level * todo /* next carry level */ )
5742+{
5743+ int result;
5744+ carry_plugin_info info;
5745+ node_plugin *nplug;
5746+
5747+ assert("nikita-896", op != NULL);
5748+ assert("nikita-897", todo != NULL);
5749+ assert("nikita-898", op->op == COP_CUT);
5750+
5751+ info.doing = doing;
5752+ info.todo = todo;
5753+
5754+ nplug = node_plugin_by_node(carry_real(op->node));
5755+ if (op->u.cut_or_kill.is_cut)
5756+ result = nplug->cut(op->u.cut_or_kill.u.cut, &info);
5757+ else
5758+ result = nplug->cut_and_kill(op->u.cut_or_kill.u.kill, &info);
5759+
5760+ doing->restartable = 0;
5761+ return result < 0 ? : 0;
5762+}
5763+
5764+/* helper function for carry_paste(): returns true if @op can be continued as
5765+ paste */
5766+static int
5767+can_paste(coord_t * icoord, const reiser4_key * key,
5768+ const reiser4_item_data * data)
5769+{
5770+ coord_t circa;
5771+ item_plugin *new_iplug;
5772+ item_plugin *old_iplug;
5773+ int result = 0; /* to keep gcc shut */
5774+
5775+ assert("", icoord->between != AT_UNIT);
5776+
5777+ /* obviously, one cannot paste when node is empty---there is nothing
5778+ to paste into. */
5779+ if (node_is_empty(icoord->node))
5780+ return 0;
5781+ /* if insertion point is at the middle of the item, then paste */
5782+ if (!coord_is_between_items(icoord))
5783+ return 1;
5784+ coord_dup(&circa, icoord);
5785+ circa.between = AT_UNIT;
5786+
5787+ old_iplug = item_plugin_by_coord(&circa);
5788+ new_iplug = data->iplug;
5789+
5790+ /* check whether we can paste to the item @icoord is "at" when we
5791+ ignore ->between field */
5792+ if (old_iplug == new_iplug && item_can_contain_key(&circa, key, data)) {
5793+ result = 1;
5794+ } else if (icoord->between == BEFORE_UNIT
5795+ || icoord->between == BEFORE_ITEM) {
5796+ /* otherwise, try to glue to the item at the left, if any */
5797+ coord_dup(&circa, icoord);
5798+ if (coord_set_to_left(&circa)) {
5799+ result = 0;
5800+ coord_init_before_item(icoord);
5801+ } else {
5802+ old_iplug = item_plugin_by_coord(&circa);
5803+ result = (old_iplug == new_iplug)
5804+ && item_can_contain_key(icoord, key, data);
5805+ if (result) {
5806+ coord_dup(icoord, &circa);
5807+ icoord->between = AFTER_UNIT;
5808+ }
5809+ }
5810+ } else if (icoord->between == AFTER_UNIT
5811+ || icoord->between == AFTER_ITEM) {
5812+ coord_dup(&circa, icoord);
5813+ /* otherwise, try to glue to the item at the right, if any */
5814+ if (coord_set_to_right(&circa)) {
5815+ result = 0;
5816+ coord_init_after_item(icoord);
5817+ } else {
5818+ int (*cck) (const coord_t *, const reiser4_key *,
5819+ const reiser4_item_data *);
5820+
5821+ old_iplug = item_plugin_by_coord(&circa);
5822+
5823+ cck = old_iplug->b.can_contain_key;
5824+ if (cck == NULL)
5825+ /* item doesn't define ->can_contain_key
5826+ method? So it is not expandable. */
5827+ result = 0;
5828+ else {
5829+ result = (old_iplug == new_iplug)
5830+ && cck(&circa /*icoord */ , key, data);
5831+ if (result) {
5832+ coord_dup(icoord, &circa);
5833+ icoord->between = BEFORE_UNIT;
5834+ }
5835+ }
5836+ }
5837+ } else
5838+ impossible("nikita-2513", "Nothing works");
5839+ if (result) {
5840+ if (icoord->between == BEFORE_ITEM) {
5841+ assert("vs-912", icoord->unit_pos == 0);
5842+ icoord->between = BEFORE_UNIT;
5843+ } else if (icoord->between == AFTER_ITEM) {
5844+ coord_init_after_item_end(icoord);
5845+ }
5846+ }
5847+ return result;
5848+}
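
/*
 * Sketch (editorial illustration, not part of the patch): the outcomes
 * of can_paste() above, stripped of the coord manipulation -- paste when
 * the coord is inside an item, or glue to a neighboring item (left for
 * BEFORE_*, right for AFTER_*) when the item plugins match and the key
 * fits.  Toy predicate only.
 */
enum toy_between { TOY_INSIDE_ITEM, TOY_BEFORE, TOY_AFTER };

static int toy_can_paste(enum toy_between pos, int same_plugin, int key_fits)
{
	if (pos == TOY_INSIDE_ITEM)
		return 1;		/* middle of an item: always paste */
	return same_plugin && key_fits;	/* try to glue to a neighbor item */
}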
5849+
5850+/* implements COP_PASTE operation
5851+
5852+ Paste data into existing item. This is complicated by the fact that after
5853+ we shifted something to the left or right neighbors trying to free some
5854+ space, item we were supposed to paste into can be in different node than
5855+ insertion coord. If so, we are no longer doing paste, but insert. See
5856+ comments in insert_paste_common().
5857+
5858+*/
5859+static int carry_paste(carry_op * op /* operation to be performed */ ,
5860+ carry_level * doing UNUSED_ARG /* current carry
5861+ * level */ ,
5862+ carry_level * todo /* next carry level */ )
5863+{
5864+ znode *node;
5865+ carry_insert_data cdata;
5866+ coord_t dcoord;
5867+ reiser4_item_data data;
5868+ int result;
5869+ int real_size;
5870+ item_plugin *iplug;
5871+ carry_plugin_info info;
5872+ coord_t *coord;
5873+
5874+ assert("nikita-982", op != NULL);
5875+ assert("nikita-983", todo != NULL);
5876+ assert("nikita-984", op->op == COP_PASTE);
5877+
5878+ coord_init_zero(&dcoord);
5879+
5880+ result = insert_paste_common(op, doing, todo, &cdata, &dcoord, &data);
5881+ if (result != 0)
5882+ return result;
5883+
5884+ coord = op->u.insert.d->coord;
5885+
5886+	/* handle the case when op -> u.insert.coord doesn't point to an item
5887+	   of the required type. Restart as insert. */
5888+ if (!can_paste(coord, op->u.insert.d->key, op->u.insert.d->data)) {
5889+ op->op = COP_INSERT;
5890+ op->u.insert.type = COPT_PASTE_RESTARTED;
5891+ result = op_dispatch_table[COP_INSERT].handler(op, doing, todo);
5892+
5893+ return result;
5894+ }
5895+
5896+ node = coord->node;
5897+ iplug = item_plugin_by_coord(coord);
5898+ assert("nikita-992", iplug != NULL);
5899+
5900+ assert("nikita-985", node != NULL);
5901+ assert("nikita-986", node_plugin_by_node(node) != NULL);
5902+
5903+ assert("nikita-987",
5904+ space_needed_for_op(node, op) <= znode_free_space(node));
5905+
5906+ assert("nikita-1286", coord_is_existing_item(coord));
5907+
5908+ /*
5909+	 * if the item is expanded as a result of this operation, we should
5910+	 * first change the item size, then call the ->b.paste item method. If
5911+	 * the item is shrunk, it should be done the other way around: first
5912+	 * call the ->b.paste method, then reduce the item size.
5913+ */
5914+
5915+ real_size = space_needed_for_op(node, op);
5916+ if (real_size > 0)
5917+ node->nplug->change_item_size(coord, real_size);
5918+
5919+ doing->restartable = 0;
5920+ info.doing = doing;
5921+ info.todo = todo;
5922+
5923+ result = iplug->b.paste(coord, op->u.insert.d->data, &info);
5924+
5925+ if (real_size < 0)
5926+ node->nplug->change_item_size(coord, real_size);
5927+
5928+ /* if we pasted at the beginning of the item, update item's key. */
5929+ if (coord->unit_pos == 0 && coord->between != AFTER_UNIT)
5930+ node->nplug->update_item_key(coord, op->u.insert.d->key, &info);
5931+
5932+ znode_make_dirty(node);
5933+ return result;
5934+}
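
/*
 * Sketch (editorial illustration, not part of the patch): the resize
 * ordering rule that carry_paste() above follows -- grow the item before
 * pasting, shrink it only after pasting.  The callbacks are toy stand-ins
 * for ->change_item_size() and ->b.paste().
 */
static void toy_resize_and_paste(int size_delta, void (*resize)(int),
				 void (*paste)(void))
{
	if (size_delta > 0)
		resize(size_delta);	/* expand first, then paste */
	paste();
	if (size_delta < 0)
		resize(size_delta);	/* paste first, then shrink */
}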
5935+
5936+/* handle carry COP_EXTENT operation. */
5937+static int carry_extent(carry_op * op /* operation to perform */ ,
5938+ carry_level * doing /* queue of operations @op
5939+ * is part of */ ,
5940+ carry_level * todo /* queue where new operations
5941+ * are accumulated */ )
5942+{
5943+ znode *node;
5944+ carry_insert_data cdata;
5945+ coord_t coord;
5946+ reiser4_item_data data;
5947+ carry_op *delete_dummy;
5948+ carry_op *insert_extent;
5949+ int result;
5950+ carry_plugin_info info;
5951+
5952+ assert("nikita-1751", op != NULL);
5953+ assert("nikita-1752", todo != NULL);
5954+ assert("nikita-1753", op->op == COP_EXTENT);
5955+
5956+ /* extent insertion overview:
5957+
5958+ extents live on the TWIG LEVEL, which is level one above the leaf
5959+	   one. This complicates extent insertion logic somewhat: it may
5960+	   happen (and is going to happen all the time) that in logical key
5961+	   ordering an extent has to be placed between items I1 and I2, located
5962+	   at the leaf level, but I1 and I2 are in the same formatted leaf
5963+	   node N1. To insert an extent one has to
5964+
5965+ (1) reach node N1 and shift data between N1, its neighbors and
5966+ possibly newly allocated nodes until I1 and I2 fall into different
5967+ nodes. Since I1 and I2 are still neighboring items in logical key
5968+	   order, they will necessarily be the utmost items in their respective
5969+ nodes.
5970+
5971+ (2) After this new extent item is inserted into node on the twig
5972+ level.
5973+
5974+ Fortunately this process can reuse almost all code from standard
5975+ insertion procedure (viz. make_space() and insert_paste_common()),
5976+ due to the following observation: make_space() only shifts data up
5977+ to and excluding or including insertion point. It never
5978+ "over-moves" through insertion point. Thus, one can use
5979+	   make_space() to perform step (1). All that is required for this is to
5980+	   instruct free_space_shortage() to keep make_space() shifting data
5981+	   until the insertion point is at the node border.
5982+
5983+ */
5984+
5985+ /* perform common functionality of insert and paste. */
5986+ result = insert_paste_common(op, doing, todo, &cdata, &coord, &data);
5987+ if (result != 0)
5988+ return result;
5989+
5990+ node = op->u.extent.d->coord->node;
5991+ assert("nikita-1754", node != NULL);
5992+ assert("nikita-1755", node_plugin_by_node(node) != NULL);
5993+ assert("nikita-1700", coord_wrt(op->u.extent.d->coord) != COORD_INSIDE);
5994+
5995+ /* NOTE-NIKITA add some checks here. Not assertions, -EIO. Check that
5996+ extent fits between items. */
5997+
5998+ info.doing = doing;
5999+ info.todo = todo;
6000+
6001+	/* there is another complication due to the placement of extents on the
6002+	   twig level: extents are "rigid" in the sense that the key-range
6003+	   occupied by an extent cannot grow indefinitely to the right as it can
6004+	   for formatted leaf nodes. Because of this, when the search finds two
6005+	   adjacent extents on the twig level, it has to "drill" to the leaf
6006+	   level, creating a new node. Here we are removing this node.
6007+ */
6008+ if (node_is_empty(node)) {
6009+ delete_dummy = node_post_carry(&info, COP_DELETE, node, 1);
6010+ if (IS_ERR(delete_dummy))
6011+ return PTR_ERR(delete_dummy);
6012+ delete_dummy->u.delete.child = NULL;
6013+ delete_dummy->u.delete.flags = DELETE_RETAIN_EMPTY;
6014+ ZF_SET(node, JNODE_HEARD_BANSHEE);
6015+ }
6016+
6017+ /* proceed with inserting extent item into parent. We are definitely
6018+ inserting rather than pasting if we get that far. */
6019+ insert_extent = node_post_carry(&info, COP_INSERT, node, 1);
6020+ if (IS_ERR(insert_extent))
6021+ /* @delete_dummy will be automatically destroyed on the level
6022+ exiting */
6023+ return PTR_ERR(insert_extent);
6024+	/* NOTE-NIKITA insertion by key is the simplest option here. Another
6025+	   possibility is to insert on the left or right of an already existing
6026+	   item.
6027+ */
6028+ insert_extent->u.insert.type = COPT_KEY;
6029+ insert_extent->u.insert.d = op->u.extent.d;
6030+ assert("nikita-1719", op->u.extent.d->key != NULL);
6031+ insert_extent->u.insert.d->data->arg = op->u.extent.d->coord;
6032+ insert_extent->u.insert.flags =
6033+ znode_get_tree(node)->carry.new_extent_flags;
6034+
6035+ /*
6036+	 * if carry was asked to track a lock handle we should actually track
6037+	 * the lock handle on the twig node rather than on the leaf the
6038+	 * operation was started from. Transfer the tracked lock handle.
6039+ */
6040+ if (doing->track_type) {
6041+ assert("nikita-3242", doing->tracked != NULL);
6042+ assert("nikita-3244", todo->tracked == NULL);
6043+ todo->tracked = doing->tracked;
6044+ todo->track_type = CARRY_TRACK_NODE;
6045+ doing->tracked = NULL;
6046+ doing->track_type = 0;
6047+ }
6048+
6049+ return 0;
6050+}
6051+
6052+/* update the key in @parent between the pointers to @left and @right.
6053+
6054+   Find the coords of @left and @right and update the delimiting key between
6055+   them. This is a helper function called by carry_update(). It finds the
6056+   position of the internal item involved, updates the item key, and updates
6057+   the delimiting keys of the child nodes involved.
6058+*/
6059+static int update_delimiting_key(znode * parent /* node key is updated
6060+ * in */ ,
6061+ znode * left /* child of @parent */ ,
6062+ znode * right /* child of @parent */ ,
6063+ carry_level * doing /* current carry
6064+ * level */ ,
6065+ carry_level * todo /* parent carry
6066+ * level */ ,
6067+ const char **error_msg /* place to
6068+ * store error
6069+ * message */ )
6070+{
6071+ coord_t left_pos;
6072+ coord_t right_pos;
6073+ int result;
6074+ reiser4_key ldkey;
6075+ carry_plugin_info info;
6076+
6077+ assert("nikita-1177", right != NULL);
6078+	/* find the position of the right child in the parent */
6079+ result = find_child_ptr(parent, right, &right_pos);
6080+ if (result != NS_FOUND) {
6081+ *error_msg = "Cannot find position of right child";
6082+ return result;
6083+ }
6084+
6085+ if ((left != NULL) && !coord_is_leftmost_unit(&right_pos)) {
6086+ /* find position of the left child in a parent */
6087+ result = find_child_ptr(parent, left, &left_pos);
6088+ if (result != NS_FOUND) {
6089+ *error_msg = "Cannot find position of left child";
6090+ return result;
6091+ }
6092+ assert("nikita-1355", left_pos.node != NULL);
6093+ } else
6094+ left_pos.node = NULL;
6095+
6096+ /* check that they are separated by exactly one key and are basically
6097+ sane */
6098+ if (REISER4_DEBUG) {
6099+ if ((left_pos.node != NULL)
6100+ && !coord_is_existing_unit(&left_pos)) {
6101+ *error_msg = "Left child is bastard";
6102+ return RETERR(-EIO);
6103+ }
6104+ if (!coord_is_existing_unit(&right_pos)) {
6105+ *error_msg = "Right child is bastard";
6106+ return RETERR(-EIO);
6107+ }
6108+ if (left_pos.node != NULL &&
6109+ !coord_are_neighbors(&left_pos, &right_pos)) {
6110+ *error_msg = "Children are not direct siblings";
6111+ return RETERR(-EIO);
6112+ }
6113+ }
6114+ *error_msg = NULL;
6115+
6116+ info.doing = doing;
6117+ info.todo = todo;
6118+
6119+ /*
6120+ * If child node is not empty, new key of internal item is a key of
6121+ * leftmost item in the child node. If the child is empty, take its
6122+ * right delimiting key as a new key of the internal item. Precise key
6123+ * in the latter case is not important per se, because the child (and
6124+ * the internal item) are going to be killed shortly anyway, but we
6125+ * have to preserve correct order of keys in the parent node.
6126+ */
6127+
6128+ if (!ZF_ISSET(right, JNODE_HEARD_BANSHEE))
6129+ leftmost_key_in_node(right, &ldkey);
6130+ else {
6131+ read_lock_dk(znode_get_tree(parent));
6132+ ldkey = *znode_get_rd_key(right);
6133+ read_unlock_dk(znode_get_tree(parent));
6134+ }
6135+ node_plugin_by_node(parent)->update_item_key(&right_pos, &ldkey, &info);
6136+ doing->restartable = 0;
6137+ znode_make_dirty(parent);
6138+ return 0;
6139+}
6140+
6141+/* implements COP_UPDATE operation
6142+
6143+ Update delimiting keys.
6144+
6145+*/
6146+static int carry_update(carry_op * op /* operation to be performed */ ,
6147+ carry_level * doing /* current carry level */ ,
6148+ carry_level * todo /* next carry level */ )
6149+{
6150+ int result;
6151+ carry_node *missing UNUSED_ARG;
6152+ znode *left;
6153+ znode *right;
6154+ carry_node *lchild;
6155+ carry_node *rchild;
6156+ const char *error_msg;
6157+ reiser4_tree *tree;
6158+
6159+ /*
6160+	 * This operation is called to update the key of an internal item. This
6161+	 * is necessary when carry shifted or cut data on the child
6162+	 * level. The arguments of this operation are:
6163+ *
6164+ * @right --- child node. Operation should update key of internal
6165+ * item pointing to @right.
6166+ *
6167+ * @left --- left neighbor of @right. This parameter is optional.
6168+ */
6169+
6170+ assert("nikita-902", op != NULL);
6171+ assert("nikita-903", todo != NULL);
6172+ assert("nikita-904", op->op == COP_UPDATE);
6173+
6174+ lchild = op->u.update.left;
6175+ rchild = op->node;
6176+
6177+ if (lchild != NULL) {
6178+ assert("nikita-1001", lchild->parent);
6179+ assert("nikita-1003", !lchild->left);
6180+ left = carry_real(lchild);
6181+ } else
6182+ left = NULL;
6183+
6184+ tree = znode_get_tree(rchild->node);
6185+ read_lock_tree(tree);
6186+ right = znode_parent(rchild->node);
6187+ read_unlock_tree(tree);
6188+
6189+ if (right != NULL) {
6190+ result = update_delimiting_key(right,
6191+ lchild ? lchild->node : NULL,
6192+ rchild->node,
6193+ doing, todo, &error_msg);
6194+ } else {
6195+ error_msg = "Cannot find node to update key in";
6196+ result = RETERR(-EIO);
6197+ }
6198+ /* operation will be reposted to the next level by the
6199+ ->update_item_key() method of node plugin, if necessary. */
6200+
6201+ if (result != 0) {
6202+ warning("nikita-999", "Error updating delimiting key: %s (%i)",
6203+ error_msg ? : "", result);
6204+ }
6205+ return result;
6206+}
6207+
6208+/* move items into @node during carry */
6209+static int carry_shift_data(sideof side /* in what direction to move data */ ,
6210+ coord_t * insert_coord /* coord where new item
6211+ * is to be inserted */ ,
6212+			    znode * node /* node which data are moved into */ ,
6213+ carry_level * doing /* active carry queue */ ,
6214+ carry_level * todo /* carry queue where new
6215+ * operations are to be put
6216+ * in */ ,
6217+ unsigned int including_insert_coord_p /* true if
6218+ * @insertion_coord
6219+ * can be moved */ )
6220+{
6221+ int result;
6222+ znode *source;
6223+ carry_plugin_info info;
6224+ node_plugin *nplug;
6225+
6226+ source = insert_coord->node;
6227+
6228+ info.doing = doing;
6229+ info.todo = todo;
6230+
6231+ nplug = node_plugin_by_node(node);
6232+ result = nplug->shift(insert_coord, node,
6233+ (side == LEFT_SIDE) ? SHIFT_LEFT : SHIFT_RIGHT, 0,
6234+ (int)including_insert_coord_p, &info);
6235+ /* the only error ->shift() method of node plugin can return is
6236+ -ENOMEM due to carry node/operation allocation. */
6237+ assert("nikita-915", result >= 0 || result == -ENOMEM);
6238+ if (result > 0) {
6239+ /*
6240+ * if some number of bytes was actually shifted, mark nodes
6241+ * dirty, and carry level as non-restartable.
6242+ */
6243+ doing->restartable = 0;
6244+ znode_make_dirty(source);
6245+ znode_make_dirty(node);
6246+ }
6247+
6248+ assert("nikita-2077", coord_check(insert_coord));
6249+ return 0;
6250+}
6251+
6252+typedef carry_node *(*carry_iterator) (carry_node * node);
6253+static carry_node *find_dir_carry(carry_node * node, carry_level * level,
6254+ carry_iterator iterator);
6255+
6256+static carry_node *pool_level_list_prev(carry_node *node)
6257+{
6258+ return list_entry(node->header.level_linkage.prev, carry_node, header.level_linkage);
6259+}
6260+
6261+/* look for the left neighbor of given carry node in a carry queue.
6262+
6263+ This is used by find_left_neighbor(), but I am not sure that this
6264+ really gives any advantage. More statistics required.
6265+
6266+*/
6267+carry_node *find_left_carry(carry_node * node /* node to find left neighbor
6268+ * of */ ,
6269+ carry_level * level /* level to scan */ )
6270+{
6271+ return find_dir_carry(node, level,
6272+ (carry_iterator) pool_level_list_prev);
6273+}
6274+
6275+static carry_node *pool_level_list_next(carry_node *node)
6276+{
6277+ return list_entry(node->header.level_linkage.next, carry_node, header.level_linkage);
6278+}
6279+
6280+/* look for the right neighbor of given carry node in a
6281+ carry queue.
6282+
6283+ This is used by find_right_neighbor(), but I am not sure that this
6284+ really gives any advantage. More statistics required.
6285+
6286+*/
6287+carry_node *find_right_carry(carry_node * node /* node to find right neighbor
6288+ * of */ ,
6289+ carry_level * level /* level to scan */ )
6290+{
6291+ return find_dir_carry(node, level,
6292+ (carry_iterator) pool_level_list_next);
6293+}
6294+
6295+/* look for the left or right neighbor of given carry node in a carry
6296+ queue.
6297+
6298+ Helper function used by find_{left|right}_carry().
6299+*/
6300+static carry_node *find_dir_carry(carry_node * node /* node to start scanning
6301+ * from */ ,
6302+ carry_level * level /* level to scan */ ,
6303+ carry_iterator iterator /* operation to
6304+ * move to the next
6305+ * node */ )
6306+{
6307+ carry_node *neighbor;
6308+
6309+ assert("nikita-1059", node != NULL);
6310+ assert("nikita-1060", level != NULL);
6311+
6312+	/* scan the list of carry nodes dir-ward, skipping all
6313+	   carry nodes referencing the same znode. */
6314+ neighbor = node;
6315+ while (1) {
6316+ neighbor = iterator(neighbor);
6317+ if (carry_node_end(level, neighbor))
6318+ /* list head is reached */
6319+ return NULL;
6320+ if (carry_real(neighbor) != carry_real(node))
6321+ return neighbor;
6322+ }
6323+}
6324+
6325+/*
6326+ * Memory reservation estimation.
6327+ *
6328+ * The carry process proceeds through tree levels upwards. Carry assumes that
6329+ * it takes the tree in a consistent state (e.g., that search tree invariants
6330+ * hold), and leaves the tree consistent after it finishes. This means that
6331+ * when some error occurs carry cannot simply return if there are pending
6332+ * carry operations. A generic solution for this problem is carry-undo, either
6333+ * as a transaction manager feature (requiring checkpoints and isolation), or
6334+ * through some carry-specific mechanism.
6335+ *
6336+ * Our current approach is to panic if carry hits an error while the tree is
6337+ * inconsistent. Unfortunately -ENOMEM can easily be triggered. To work around
6338+ * this, a "memory reservation" mechanism was added.
6339+ *
6340+ * Memory reservation is implemented by perthread-pages.diff patch from
6341+ * core-patches. Its API is defined in <linux/gfp.h>
6342+ *
6343+ * int perthread_pages_reserve(int nrpages, gfp_t gfp);
6344+ * void perthread_pages_release(int nrpages);
6345+ * int perthread_pages_count(void);
6346+ *
6347+ * carry estimates its worst case memory requirements at entry, reserves
6348+ * enough memory, and releases unused pages before returning.
6349+ *
6350+ * The code below estimates worst case memory requirements for a given carry
6351+ * queue. This is done by summing the worst case memory requirements of each
6352+ * operation in the queue.
6353+ *
6354+ */
6355+
6356+/*
6357+ * Memory requirements of many operations depend on the tree
6358+ * height. For example, item insertion requires a new node to be inserted at
6359+ * each tree level in the worst case. What tree height should be used for
6360+ * estimation? The current tree height is wrong, because the tree height can
6361+ * change between the time when the estimation is done and the time when the
6362+ * operation is actually performed. The maximal possible tree height
6363+ * (REISER4_MAX_ZTREE_HEIGHT) is not desirable either, because it would lead
6364+ * to huge over-estimation all the time. A plausible solution is a "capped
6365+ * tree height": if the current tree height is less than some TREE_HEIGHT_CAP
6366+ * constant, the capped tree height is TREE_HEIGHT_CAP; otherwise it is the
6367+ * current tree height. The idea behind this is that if the tree height is
6368+ * TREE_HEIGHT_CAP or larger, it is extremely unlikely to grow even more during a short interval of time.
6369+ */
6370+#define TREE_HEIGHT_CAP (5)
6371+
6372+/* return capped tree height for the @tree. See comment above. */
6373+static int cap_tree_height(reiser4_tree * tree)
6374+{
6375+ return max_t(int, tree->height, TREE_HEIGHT_CAP);
6376+}
6377+
6378+/* return capped tree height for the current tree. */
6379+static int capped_height(void)
6380+{
6381+ return cap_tree_height(current_tree);
6382+}
6383+
6384+/* return number of pages required to store given number of bytes */
6385+static int bytes_to_pages(int bytes)
6386+{
6387+ return (bytes + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
6388+}
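
/*
 * Sketch (editorial illustration, not part of the patch): bytes_to_pages()
 * above is the usual round-up division.  Assuming 4096-byte pages
 * (PAGE_CACHE_SHIFT == 12): 0 -> 0, 1 -> 1, 4096 -> 1, 4097 -> 2.
 */
#define TOY_PAGE_SHIFT 12
#define TOY_PAGE_SIZE (1 << TOY_PAGE_SHIFT)

static int toy_bytes_to_pages(int bytes)
{
	return (bytes + TOY_PAGE_SIZE - 1) >> TOY_PAGE_SHIFT;
}

/*
 * usage: toy_bytes_to_pages(0) == 0, toy_bytes_to_pages(1) == 1,
 *        toy_bytes_to_pages(4096) == 1, toy_bytes_to_pages(4097) == 2.
 */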
6389+
6390+/* how many pages are required to allocate znodes during item insertion. */
6391+static int carry_estimate_znodes(void)
6392+{
6393+ /*
6394+	 * Note that we have a problem here: there is no way to
6395+	 * reserve pages specifically for a given slab. This means that
6396+	 * these pages can be hijacked for some other purpose.
6397+ */
6398+
6399+	/* in the worst case we need 3 new znodes on each tree level */
6400+ return bytes_to_pages(capped_height() * sizeof(znode) * 3);
6401+}
6402+
6403+/*
6404+ * how many pages are required to load bitmaps. One bitmap per level.
6405+ */
6406+static int carry_estimate_bitmaps(void)
6407+{
6408+ if (reiser4_is_set(reiser4_get_current_sb(), REISER4_DONT_LOAD_BITMAP)) {
6409+ int bytes;
6410+
6411+		bytes = capped_height() * (0 +	/* bnode should be added, but it is private to
6412+						 * bitmap.c; skip for now. */
6413+ 2 * sizeof(jnode)); /* working and commit jnodes */
6414+ return bytes_to_pages(bytes) + 2; /* and their contents */
6415+ } else
6416+ /* bitmaps were pre-loaded during mount */
6417+ return 0;
6418+}
6419+
6420+/* worst case item insertion memory requirements */
6421+static int carry_estimate_insert(carry_op * op, carry_level * level)
6422+{
6423+ return carry_estimate_bitmaps() + carry_estimate_znodes() + 1 + /* new atom */
6424+ capped_height() + /* new block on each level */
6425+ 1 + /* and possibly extra new block at the leaf level */
6426+ 3; /* loading of leaves into memory */
6427+}
6428+
6429+/* worst case item deletion memory requirements */
6430+static int carry_estimate_delete(carry_op * op, carry_level * level)
6431+{
6432+ return carry_estimate_bitmaps() + carry_estimate_znodes() + 1 + /* new atom */
6433+ 3; /* loading of leaves into memory */
6434+}
6435+
6436+/* worst case tree cut memory requirements */
6437+static int carry_estimate_cut(carry_op * op, carry_level * level)
6438+{
6439+ return carry_estimate_bitmaps() + carry_estimate_znodes() + 1 + /* new atom */
6440+ 3; /* loading of leaves into memory */
6441+}
6442+
6443+/* worst case memory requirements of pasting into item */
6444+static int carry_estimate_paste(carry_op * op, carry_level * level)
6445+{
6446+ return carry_estimate_bitmaps() + carry_estimate_znodes() + 1 + /* new atom */
6447+ capped_height() + /* new block on each level */
6448+ 1 + /* and possibly extra new block at the leaf level */
6449+ 3; /* loading of leaves into memory */
6450+}
6451+
6452+/* worst case memory requirements of extent insertion */
6453+static int carry_estimate_extent(carry_op * op, carry_level * level)
6454+{
6455+ return carry_estimate_insert(op, level) + /* insert extent */
6456+ carry_estimate_delete(op, level); /* kill leaf */
6457+}
6458+
6459+/* worst case memory requirements of key update */
6460+static int carry_estimate_update(carry_op * op, carry_level * level)
6461+{
6462+ return 0;
6463+}
6464+
6465+/* worst case memory requirements of flow insertion */
6466+static int carry_estimate_insert_flow(carry_op * op, carry_level * level)
6467+{
6468+ int newnodes;
6469+
6470+ newnodes = min(bytes_to_pages(op->u.insert_flow.flow->length),
6471+ CARRY_FLOW_NEW_NODES_LIMIT);
6472+ /*
6473+ * roughly estimate insert_flow as a sequence of insertions.
6474+ */
6475+ return newnodes * carry_estimate_insert(op, level);
6476+}
6477+
6478+/* This is the dispatch table for carry operations. It could be trivially
6479+   abstracted into a useful plugin: a tunable balancing policy is a good
6480+   thing. */
6481+carry_op_handler op_dispatch_table[COP_LAST_OP] = {
6482+ [COP_INSERT] = {
6483+ .handler = carry_insert,
6484+ .estimate = carry_estimate_insert}
6485+ ,
6486+ [COP_DELETE] = {
6487+ .handler = carry_delete,
6488+ .estimate = carry_estimate_delete}
6489+ ,
6490+ [COP_CUT] = {
6491+ .handler = carry_cut,
6492+ .estimate = carry_estimate_cut}
6493+ ,
6494+ [COP_PASTE] = {
6495+ .handler = carry_paste,
6496+ .estimate = carry_estimate_paste}
6497+ ,
6498+ [COP_EXTENT] = {
6499+ .handler = carry_extent,
6500+ .estimate = carry_estimate_extent}
6501+ ,
6502+ [COP_UPDATE] = {
6503+ .handler = carry_update,
6504+ .estimate = carry_estimate_update}
6505+ ,
6506+ [COP_INSERT_FLOW] = {
6507+ .handler = carry_insert_flow,
6508+ .estimate = carry_estimate_insert_flow}
6509+};
6510+
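/*
 * Sketch (editorial illustration, not part of the patch): how the table
 * above is meant to be consumed -- index by the opcode and call through
 * the handler.  The real call site is in the carry code elsewhere in
 * this patch; this wrapper exists only to show the shape of the dispatch.
 */
static int toy_dispatch(carry_op * op, carry_level * doing, carry_level * todo)
{
	return op_dispatch_table[op->op].handler(op, doing, todo);
}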
6511+/* Make Linus happy.
6512+ Local variables:
6513+ c-indentation-style: "K&R"
6514+ mode-name: "LC"
6515+ c-basic-offset: 8
6516+ tab-width: 8
6517+ fill-column: 120
6518+ scroll-step: 1
6519+ End:
6520+*/
6521Index: linux-2.6.16/fs/reiser4/carry_ops.h
6522===================================================================
6523--- /dev/null
6524+++ linux-2.6.16/fs/reiser4/carry_ops.h
6525@@ -0,0 +1,42 @@
6526+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
6527+
6528+/* implementation of carry operations. See carry_ops.c for details. */
6529+
6530+#if !defined( __CARRY_OPS_H__ )
6531+#define __CARRY_OPS_H__
6532+
6533+#include "forward.h"
6534+#include "znode.h"
6535+#include "carry.h"
6536+
6537+/* carry operation handlers */
6538+typedef struct carry_op_handler {
6539+ /* perform operation */
6540+ int (*handler) (carry_op * op, carry_level * doing, carry_level * todo);
6541+ /* estimate memory requirements for @op */
6542+ int (*estimate) (carry_op * op, carry_level * level);
6543+} carry_op_handler;
6544+
6545+/* This is the dispatch table for carry operations. It could be trivially
6546+   abstracted into a useful plugin: a tunable balancing policy is a good
6547+   thing. */
6548+extern carry_op_handler op_dispatch_table[COP_LAST_OP];
6549+
6550+unsigned int space_needed(const znode * node, const coord_t * coord,
6551+ const reiser4_item_data * data, int inserting);
6552+extern carry_node *find_left_carry(carry_node * node, carry_level * level);
6553+extern carry_node *find_right_carry(carry_node * node, carry_level * level);
6554+
6555+/* __CARRY_OPS_H__ */
6556+#endif
6557+
6558+/* Make Linus happy.
6559+ Local variables:
6560+ c-indentation-style: "K&R"
6561+ mode-name: "LC"
6562+ c-basic-offset: 8
6563+ tab-width: 8
6564+ fill-column: 120
6565+ scroll-step: 1
6566+ End:
6567+*/
6568Index: linux-2.6.16/fs/reiser4/context.c
6569===================================================================
6570--- /dev/null
6571+++ linux-2.6.16/fs/reiser4/context.c
6572@@ -0,0 +1,278 @@
6573+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
6574+
6575+/* Manipulation of reiser4_context */
6576+
6577+/*
6578+ * global context used during a system call. A variable of this type is
6579+ * allocated on the stack at the beginning of the reiser4 part of the system
6580+ * call, and a pointer to it is stored in current->fs_context. This allows us
6581+ * to avoid passing pointers to the current transaction and current lockstack
6582+ * (both in one-to-one mapping with threads) all over the call chain.
6583+ *
6584+ * It's kind of like those global variables the prof used to tell you not to
6585+ * use in CS1, except thread-specific. ;-) Nikita, this was a good idea.
6586+ *
6587+ * In some situations it is desirable to have the ability to enter
6588+ * reiser4_context more than once for the same thread (nested contexts). For
6589+ * example, there are some functions that can be called either directly from
6590+ * VFS/VM or from an already active reiser4 context (->writepage, for example).
6591+ *
6592+ * In such situations the "child" context acts like a dummy: all activity is
6593+ * actually performed in the top level context, and get_current_context()
6594+ * always returns the top level context. Of course, init_context()/done_context()
6595+ * have to be properly nested anyway.
6596+ *
6597+ * Note that there is an important difference between the way reiser4 uses
6598+ * ->fs_context and the way other file systems use it. Other file systems
6599+ * (ext3 and reiserfs) use ->fs_context only for the duration of a _transaction_
6600+ * (this is why ->fs_context was initially called ->journal_info). This means
6601+ * that when ext3 or reiserfs finds that ->fs_context is not NULL on entry
6602+ * to the file system, they assume that some transaction is already underway,
6603+ * and usually bail out, because starting a nested transaction would most likely
6604+ * lead to deadlock. This gives false positives with reiser4, because we
6605+ * set ->fs_context before starting a transaction.
6606+ */
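+
+/*
+ * A sketch of the resulting calling convention (error handling elided):
+ *
+ *	reiser4_context *ctx;
+ *
+ *	ctx = init_context(sb);
+ *	if (IS_ERR(ctx))
+ *		return PTR_ERR(ctx);
+ *	...reiser4 proper...
+ *	reiser4_exit_context(ctx);
+ *
+ * init_stack_context() followed by reiser4_exit_context() is the same
+ * pattern with a caller-provided on-stack context: ->on_stack is set, so
+ * done_context() will not kfree() it.
+ */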
6607+
6608+#include "debug.h"
6609+#include "super.h"
6610+#include "context.h"
6611+
6612+#include <linux/writeback.h> /* balance_dirty_pages() */
6613+#include <linux/hardirq.h>
6614+
6615+
6616+static void _init_context(reiser4_context * context, struct super_block *super)
6617+{
6618+ memset(context, 0, sizeof(*context));
6619+
6620+ context->super = super;
6621+ context->magic = context_magic;
6622+ context->outer = current->journal_info;
6623+ current->journal_info = (void *)context;
6624+ context->nr_children = 0;
6625+ context->gfp_mask = GFP_KERNEL;
6626+
6627+ init_lock_stack(&context->stack);
6628+
6629+ txn_begin(context);
6630+
6631+ /* initialize head of tap list */
6632+ INIT_LIST_HEAD(&context->taps);
6633+#if REISER4_DEBUG
6634+ context->task = current;
6635+#endif
6636+ grab_space_enable();
6637+}
6638+
6639+/* initialize context and bind it to the current thread
6640+
6641+   This function should be called at the beginning of the reiser4 part of
6642+   a syscall.
6643+*/
6644+reiser4_context *init_context(struct super_block *super /* super block we are going to
6645+ * work with */ )
6646+{
6647+ reiser4_context *context;
6648+
6649+ assert("nikita-2662", !in_interrupt() && !in_irq());
6650+ assert("nikita-3357", super != NULL);
6651+ assert("nikita-3358", super->s_op == NULL || is_reiser4_super(super));
6652+
6653+ context = get_current_context_check();
6654+ if (context && context->super == super) {
6655+ context = (reiser4_context *) current->journal_info;
6656+ context->nr_children++;
6657+ return context;
6658+ }
6659+
6660+ context = kmalloc(sizeof(*context), GFP_KERNEL);
6661+ if (context == NULL)
6662+ return ERR_PTR(RETERR(-ENOMEM));
6663+
6664+ _init_context(context, super);
6665+ return context;
6666+}
6667+
6668+/* this is used in scan_mgr, which is called with a spinlock held, and in
6669+   reiser4_fill_super magic */
6670+void init_stack_context(reiser4_context *context, struct super_block *super)
6671+{
6672+ assert("nikita-2662", !in_interrupt() && !in_irq());
6673+ assert("nikita-3357", super != NULL);
6674+ assert("nikita-3358", super->s_op == NULL || is_reiser4_super(super));
6675+ assert("vs-12", !is_in_reiser4_context());
6676+
6677+ _init_context(context, super);
6678+ context->on_stack = 1;
6679+ return;
6680+}
6681+
6682+/* cast lock stack embedded into reiser4 context up to its container */
6683+reiser4_context *get_context_by_lock_stack(lock_stack * owner)
6684+{
6685+ return container_of(owner, reiser4_context, stack);
6686+}
6687+
6688+/* true if there is already _any_ reiser4 context for the current thread */
6689+int is_in_reiser4_context(void)
6690+{
6691+ reiser4_context *ctx;
6692+
6693+ ctx = current->journal_info;
6694+ return ctx != NULL && ((unsigned long)ctx->magic) == context_magic;
6695+}
6696+
6697+/*
6698+ * call balance dirty pages for the current context.
6699+ *
6700+ * The file system is expected to call balance_dirty_pages_ratelimited()
6701+ * whenever it dirties a page. reiser4 does this for unformatted nodes (that
6702+ * is, during write---this covers the vast majority of all dirty traffic), but
6703+ * we cannot do this immediately when a formatted node is dirtied, because a
6704+ * long-term lock is usually held at that time. To work around this, dirtying
6705+ * a formatted node simply increases the ->nr_marked_dirty counter in the
6706+ * current reiser4 context. When we are about to leave this context,
6707+ * balance_dirty_pages_ratelimited() is called, if necessary.
6708+ *
6709+ * This introduces another problem: sometimes we do not want to run
6710+ * balance_dirty_pages_ratelimited() when leaving a context, for example
6711+ * because some important lock (like ->i_mutex on the parent directory) is
6712+ * held. To achieve this, the ->nobalance flag can be set in the current context.
6713+ */
6714+static void balance_dirty_pages_at(reiser4_context *context)
6715+{
6716+ reiser4_super_info_data *sbinfo = get_super_private(context->super);
6717+
6718+ /*
6719+ * call balance_dirty_pages_ratelimited() to process formatted nodes
6720+ * dirtied during this system call. Do that only if we are not in mount
6721+ * and there were nodes dirtied in this context and we are not in
6722+ * writepage (to avoid deadlock) and not in pdflush
6723+ */
6724+ if (sbinfo != NULL && sbinfo->fake != NULL &&
6725+ context->nr_marked_dirty != 0 &&
6726+ !(current->flags & PF_MEMALLOC) &&
6727+ !current_is_pdflush())
6728+ balance_dirty_pages_ratelimited(sbinfo->fake->i_mapping);
6729+}
6730+
6731+/* release resources associated with context.
6732+
6733+   This function should be called at the end of a "session" with reiser4,
6734+   typically just before leaving the reiser4 driver back to VFS.
6735+
6736+   This is a good place to put some debugging consistency checks, like that
6737+   the thread released all locks and closed the transcrash etc.
6738+
6739+*/
6740+static void done_context(reiser4_context * context /* context being released */ )
6741+{
6742+ assert("nikita-860", context != NULL);
6743+ assert("nikita-859", context->magic == context_magic);
6744+ assert("vs-646", (reiser4_context *) current->journal_info == context);
6745+ assert("zam-686", !in_interrupt() && !in_irq());
6746+
6747+	/* only do anything when leaving the top-level reiser4 context. All nested
6748+ * contexts are just dummies. */
6749+ if (context->nr_children == 0) {
6750+ assert("jmacd-673", context->trans == NULL);
6751+ assert("jmacd-1002", lock_stack_isclean(&context->stack));
6752+ assert("nikita-1936", no_counters_are_held());
6753+ assert("nikita-2626", list_empty_careful(taps_list()));
6754+ assert("zam-1004", ergo(get_super_private(context->super),
6755+ get_super_private(context->super)->delete_sema_owner !=
6756+ current));
6757+
6758+ /* release all grabbed but as yet unused blocks */
6759+ if (context->grabbed_blocks != 0)
6760+ all_grabbed2free();
6761+
6762+ /*
6763+ * synchronize against longterm_unlock_znode():
6764+ * wake_up_requestor() wakes up requestors without holding
6765+ * zlock (otherwise they will immediately bump into that lock
6766+		 * after wake up on another CPU). To work around the (rare)
6767+		 * situation where a requestor has been woken up asynchronously
6768+		 * and managed to run until completion (and destroy its
6769+		 * context and lock stack) before wake_up_requestor() called
6770+		 * wake_up() on it, wake_up_requestor() synchronizes on the lock
6771+		 * stack spin lock. It has actually been observed that the spin
6772+		 * lock _was_ locked at this point, because
6773+		 * wake_up_requestor() took an interrupt.
6774+ */
6775+ spin_lock_stack(&context->stack);
6776+ spin_unlock_stack(&context->stack);
6777+
6778+ assert("zam-684", context->nr_children == 0);
6779+ /* restore original ->fs_context value */
6780+ current->journal_info = context->outer;
6781+ if (context->on_stack == 0)
6782+ kfree(context);
6783+ } else {
6784+ context->nr_children--;
6785+#if REISER4_DEBUG
6786+ assert("zam-685", context->nr_children >= 0);
6787+#endif
6788+ }
6789+}
6790+
6791+/*
6792+ * exit reiser4 context. Call balance_dirty_pages_at() if necessary. Close
6793+ * transaction. Call done_context() to do context-related book-keeping.
6794+ */
6795+void reiser4_exit_context(reiser4_context * context)
6796+{
6797+ assert("nikita-3021", schedulable());
6798+
6799+ if (context->nr_children == 0) {
6800+ if (!context->nobalance) {
6801+ txn_restart(context);
6802+ balance_dirty_pages_at(context);
6803+ }
6804+
6805+		/* if the filesystem is mounted with -o sync or -o dirsync, commit
6806+		   the transaction. FIXME: TXNH_DONT_COMMIT is used to avoid
6807+		   committing on exit_context when an inode semaphore is held, and
6808+		   to have ktxnmgrd do the commit instead, to get better
6809+		   concurrent filesystem access. But when one mounts with -o
6810+		   sync, one cares more about reliability than about
6811+		   performance. So, for now we have this simple mount -o sync
6812+		   support. */
6813+ if (context->super->s_flags & (MS_SYNCHRONOUS | MS_DIRSYNC)) {
6814+ txn_atom *atom;
6815+
6816+ atom = get_current_atom_locked_nocheck();
6817+ if (atom) {
6818+ atom->flags |= ATOM_FORCE_COMMIT;
6819+ context->trans->flags &= ~TXNH_DONT_COMMIT;
6820+ spin_unlock_atom(atom);
6821+ }
6822+ }
6823+ txn_end(context);
6824+ }
6825+ done_context(context);
6826+}
6827+
6828+void set_gfp_mask(void)
6829+{
6830+ reiser4_context *ctx;
6831+
6832+ ctx = get_current_context();
6833+ if (ctx->entd == 0 &&
6834+ list_empty(&ctx->stack.locks) &&
6835+ ctx->trans->atom == NULL)
6836+ ctx->gfp_mask = GFP_KERNEL;
6837+ else
6838+ ctx->gfp_mask = GFP_NOFS;
6839+}
6840+
6841+/*
6842+ * Local variables:
6843+ * c-indentation-style: "K&R"
6844+ * mode-name: "LC"
6845+ * c-basic-offset: 8
6846+ * tab-width: 8
6847+ * fill-column: 120
6848+ * scroll-step: 1
6849+ * End:
6850+ */
6851Index: linux-2.6.16/fs/reiser4/context.h
6852===================================================================
6853--- /dev/null
6854+++ linux-2.6.16/fs/reiser4/context.h
6855@@ -0,0 +1,228 @@
6856+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
6857+ * reiser4/README */
6858+
6859+/* Reiser4 context. See context.c for details. */
6860+
6861+#if !defined( __REISER4_CONTEXT_H__ )
6862+#define __REISER4_CONTEXT_H__
6863+
6864+#include "forward.h"
6865+#include "debug.h"
6866+#include "dformat.h"
6867+#include "tap.h"
6868+#include "lock.h"
6869+
6870+#include <linux/types.h> /* for __u?? */
6871+#include <linux/fs.h> /* for struct super_block */
6872+#include <linux/spinlock.h>
6873+#include <linux/sched.h> /* for struct task_struct */
6874+
6875+
6876+/* reiser4 per-thread context */
6877+struct reiser4_context {
6878+ /* magic constant. For identification of reiser4 contexts. */
6879+ __u32 magic;
6880+
6881+	/* current lock stack. See lock.[ch]. This is where the list of all
6882+	   locks taken by the current thread is kept. This is also used in
6883+ deadlock detection. */
6884+ lock_stack stack;
6885+
6886+ /* current transcrash. */
6887+ txn_handle *trans;
6888+ /* transaction handle embedded into reiser4_context. ->trans points
6889+ * here by default. */
6890+ txn_handle trans_in_ctx;
6891+
6892+ /* super block we are working with. To get the current tree
6893+ use &get_super_private (reiser4_get_current_sb ())->tree. */
6894+ struct super_block *super;
6895+
6896+ /* parent fs activation */
6897+ struct fs_activation *outer;
6898+
6899+ /* per-thread grabbed (for further allocation) blocks counter */
6900+ reiser4_block_nr grabbed_blocks;
6901+
6902+ /* list of taps currently monitored. See tap.c */
6903+ struct list_head taps;
6904+
6905+ /* grabbing space is enabled */
6906+ unsigned int grab_enabled:1;
6907+	/* should be set when we are writing dirty nodes to disk in jnode_flush or
6908+ * reiser4_write_logs() */
6909+ unsigned int writeout_mode:1;
6910+ /* true, if current thread is an ent thread */
6911+ unsigned int entd:1;
6912+ /* true, if balance_dirty_pages() should not be run when leaving this
6913+	 * context. This is used to avoid a lengthy balance_dirty_pages()
6914+	 * operation when holding some important resource, like a directory's
6915+	 * ->i_mutex */
6916+ unsigned int nobalance:1;
6917+
6918+ /* this bit is used on done_context to decide whether context is
6919+ kmalloc-ed and has to be kfree-ed */
6920+ unsigned int on_stack:1;
6921+
6922+ /* count non-trivial jnode_set_dirty() calls */
6923+ unsigned long nr_marked_dirty;
6924+
6925+ /* reiser4_sync_inodes calls (via generic_sync_sb_inodes)
6926+	 * reiser4_writepages for each of the dirty inodes. reiser4_writepages
6927+	 * captures pages. When the number of pages captured in one
6928+	 * reiser4_sync_inodes call reaches some threshold, some atoms get
6929+ * flushed */
6930+ int nr_captured;
6931+ int nr_children; /* number of child contexts */
6932+#if REISER4_DEBUG
6933+ /* debugging information about reiser4 locks held by the current
6934+ * thread */
6935+ lock_counters_info locks;
6936+ struct task_struct *task; /* so we can easily find owner of the stack */
6937+
6938+ /*
6939+ * disk space grabbing debugging support
6940+ */
6941+ /* how many disk blocks were grabbed by the first call to
6942+ * reiser4_grab_space() in this context */
6943+ reiser4_block_nr grabbed_initially;
6944+
6945+ /* list of all threads doing flush currently */
6946+ struct list_head flushers_link;
6947+ /* information about last error encountered by reiser4 */
6948+ err_site err;
6949+#endif
6950+ void *vp;
6951+ gfp_t gfp_mask;
6952+};
6953+
6954+extern reiser4_context *get_context_by_lock_stack(lock_stack *);
6955+
6956+/* Debugging helpers. */
6957+#if REISER4_DEBUG
6958+extern void print_contexts(void);
6959+#endif
6960+
6961+#define current_tree (&(get_super_private(reiser4_get_current_sb())->tree))
6962+#define current_blocksize reiser4_get_current_sb()->s_blocksize
6963+#define current_blocksize_bits reiser4_get_current_sb()->s_blocksize_bits
6964+
6965+extern reiser4_context *init_context(struct super_block *);
6966+extern void init_stack_context(reiser4_context *, struct super_block *);
6967+extern void reiser4_exit_context(reiser4_context *);
6968+
6969+/* magic constant we store in a reiser4_context allocated on the stack. Used to
6970+   catch accesses to stale or uninitialized contexts. */
6971+#define context_magic ((__u32) 0x4b1b5d0b)
6972+
6973+extern int is_in_reiser4_context(void);
6974+
6975+/*
6976+ * return reiser4_context for the thread @tsk
6977+ */
6978+static inline reiser4_context *get_context(const struct task_struct *tsk)
6979+{
6980+ assert("vs-1682",
6981+ ((reiser4_context *) tsk->journal_info)->magic == context_magic);
6982+ return (reiser4_context *) tsk->journal_info;
6983+}
6984+
6985+/*
6986+ * return reiser4 context of the current thread, or NULL if there is none.
6987+ */
6988+static inline reiser4_context *get_current_context_check(void)
6989+{
6990+ if (is_in_reiser4_context())
6991+ return get_context(current);
6992+ else
6993+ return NULL;
6994+}
6995+
6996+static inline reiser4_context *get_current_context(void); /* __attribute__((const)); */
6997+
6998+/* return context associated with current thread */
6999+static inline reiser4_context *get_current_context(void)
7000+{
7001+ return get_context(current);
7002+}
7003+
7004+static inline gfp_t get_gfp_mask(void)
7005+{
7006+ reiser4_context *ctx;
7007+
7008+ ctx = get_current_context_check();
7009+ return (ctx == NULL) ? GFP_KERNEL : ctx->gfp_mask;
7010+}
7011+
7012+void set_gfp_mask(void);
7013+
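+/*
+ * A sketch of the intended use: allocations made while inside reiser4
+ * pick up this mask, e.g.
+ *
+ *	data = kmalloc(size, get_gfp_mask());
+ *
+ * so a thread that already holds long-term locks or has an atom open
+ * degrades to GFP_NOFS, and memory reclaim cannot re-enter the filesystem
+ * under those locks.
+ */
+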
7014+/*
7015+ * true if current thread is in the write-out mode. Thread enters write-out
7016+ * mode during jnode_flush and reiser4_write_logs().
7017+ */
7018+static inline int is_writeout_mode(void)
7019+{
7020+ return get_current_context()->writeout_mode;
7021+}
7022+
7023+/*
7024+ * enter write-out mode
7025+ */
7026+static inline void writeout_mode_enable(void)
7027+{
7028+ assert("zam-941", !get_current_context()->writeout_mode);
7029+ get_current_context()->writeout_mode = 1;
7030+}
7031+
7032+/*
7033+ * leave write-out mode
7034+ */
7035+static inline void writeout_mode_disable(void)
7036+{
7037+ assert("zam-942", get_current_context()->writeout_mode);
7038+ get_current_context()->writeout_mode = 0;
7039+}
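+
+/*
+ * The two helpers above are used as a bracket around write-out, roughly:
+ *
+ *	writeout_mode_enable();
+ *	...submit dirty nodes to disk...
+ *	writeout_mode_disable();
+ *
+ * the asserts make an unbalanced bracket fail fast in debug builds.
+ */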
7040+
7041+static inline void grab_space_enable(void)
7042+{
7043+ get_current_context()->grab_enabled = 1;
7044+}
7045+
7046+static inline void grab_space_disable(void)
7047+{
7048+ get_current_context()->grab_enabled = 0;
7049+}
7050+
7051+static inline void grab_space_set_enabled(int enabled)
7052+{
7053+ get_current_context()->grab_enabled = enabled;
7054+}
7055+
7056+static inline int is_grab_enabled(reiser4_context * ctx)
7057+{
7058+ return ctx->grab_enabled;
7059+}
7060+
7061+/* mark the transaction handle in @context as TXNH_DONT_COMMIT, so that no
7062+ * commit or flush will be performed when it is closed. This is necessary when
7063+ * the handle has to be closed under some coarse semaphore, like the i_mutex of
7064+ * a directory. The commit will be performed by ktxnmgrd. */
7065+static inline void context_set_commit_async(reiser4_context * context)
7066+{
7067+ context->nobalance = 1;
7068+ context->trans->flags |= TXNH_DONT_COMMIT;
7069+}
7070+
7071+/* __REISER4_CONTEXT_H__ */
7072+#endif
7073+
7074+/* Make Linus happy.
7075+ Local variables:
7076+ c-indentation-style: "K&R"
7077+ mode-name: "LC"
7078+ c-basic-offset: 8
7079+ tab-width: 8
7080+ fill-column: 120
7081+ scroll-step: 1
7082+ End:
7083+*/
7084Index: linux-2.6.16/fs/reiser4/coord.c
7085===================================================================
7086--- /dev/null
7087+++ linux-2.6.16/fs/reiser4/coord.c
7088@@ -0,0 +1,937 @@
7089+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
7090+
7091+#include "forward.h"
7092+#include "debug.h"
7093+#include "dformat.h"
7094+#include "tree.h"
7095+#include "plugin/item/item.h"
7096+#include "znode.h"
7097+#include "coord.h"
7098+
7099+/* Internal constructor. */
7100+static inline void
7101+coord_init_values(coord_t * coord, const znode * node, pos_in_node_t item_pos,
7102+ pos_in_node_t unit_pos, between_enum between)
7103+{
7104+ coord->node = (znode *) node;
7105+ coord_set_item_pos(coord, item_pos);
7106+ coord->unit_pos = unit_pos;
7107+ coord->between = between;
7108+ ON_DEBUG(coord->plug_v = 0);
7109+ ON_DEBUG(coord->body_v = 0);
7110+
7111+ /*ON_TRACE (TRACE_COORDS, "init coord %p node %p: %u %u %s\n", coord, node, item_pos, unit_pos, coord_tween_tostring (between)); */
7112+}
7113+
7114+/* after shifting of node content, a coord previously set properly may become
7115+   invalid; try to "normalize" it. */
7116+void coord_normalize(coord_t * coord)
7117+{
7118+ znode *node;
7119+
7120+ node = coord->node;
7121+ assert("vs-683", node);
7122+
7123+ coord_clear_iplug(coord);
7124+
7125+ if (node_is_empty(node)) {
7126+ coord_init_first_unit(coord, node);
7127+ } else if ((coord->between == AFTER_ITEM)
7128+ || (coord->between == AFTER_UNIT)) {
7129+ return;
7130+ } else if (coord->item_pos == coord_num_items(coord)
7131+ && coord->between == BEFORE_ITEM) {
7132+ coord_dec_item_pos(coord);
7133+ coord->between = AFTER_ITEM;
7134+ } else if (coord->unit_pos == coord_num_units(coord)
7135+ && coord->between == BEFORE_UNIT) {
7136+ coord->unit_pos--;
7137+ coord->between = AFTER_UNIT;
7138+ } else if (coord->item_pos == coord_num_items(coord)
7139+ && coord->unit_pos == 0 && coord->between == BEFORE_UNIT) {
7140+ coord_dec_item_pos(coord);
7141+ coord->unit_pos = 0;
7142+ coord->between = AFTER_ITEM;
7143+ }
7144+}
7145+
7146+/* Copy a coordinate. */
7147+void coord_dup(coord_t * coord, const coord_t * old_coord)
7148+{
7149+ assert("jmacd-9800", coord_check(old_coord));
7150+ coord_dup_nocheck(coord, old_coord);
7151+}
7152+
7153+/* Copy a coordinate without check. Useful when old_coord->node is not
7154+ loaded. As in cbk_tree_lookup -> connect_znode -> connect_one_side */
7155+void coord_dup_nocheck(coord_t * coord, const coord_t * old_coord)
7156+{
7157+ coord->node = old_coord->node;
7158+ coord_set_item_pos(coord, old_coord->item_pos);
7159+ coord->unit_pos = old_coord->unit_pos;
7160+ coord->between = old_coord->between;
7161+ coord->iplugid = old_coord->iplugid;
7162+ ON_DEBUG(coord->plug_v = old_coord->plug_v);
7163+ ON_DEBUG(coord->body_v = old_coord->body_v);
7164+}
7165+
7166+/* Initialize an invalid coordinate. */
7167+void coord_init_invalid(coord_t * coord, const znode * node)
7168+{
7169+ coord_init_values(coord, node, 0, 0, INVALID_COORD);
7170+}
7171+
7172+void coord_init_first_unit_nocheck(coord_t * coord, const znode * node)
7173+{
7174+ coord_init_values(coord, node, 0, 0, AT_UNIT);
7175+}
7176+
7177+/* Initialize a coordinate to point at the first unit of the first item. If the node is
7178+ empty, it is positioned at the EMPTY_NODE. */
7179+void coord_init_first_unit(coord_t * coord, const znode * node)
7180+{
7181+ int is_empty = node_is_empty(node);
7182+
7183+ coord_init_values(coord, node, 0, 0, (is_empty ? EMPTY_NODE : AT_UNIT));
7184+
7185+ assert("jmacd-9801", coord_check(coord));
7186+}
7187+
7188+/* Initialize a coordinate to point at the last unit of the last item. If the node is
7189+ empty, it is positioned at the EMPTY_NODE. */
7190+void coord_init_last_unit(coord_t * coord, const znode * node)
7191+{
7192+ int is_empty = node_is_empty(node);
7193+
7194+ coord_init_values(coord, node,
7195+ (is_empty ? 0 : node_num_items(node) - 1), 0,
7196+ (is_empty ? EMPTY_NODE : AT_UNIT));
7197+ if (!is_empty)
7198+ coord->unit_pos = coord_last_unit_pos(coord);
7199+ assert("jmacd-9802", coord_check(coord));
7200+}
7201+
7202+/* Initialize a coordinate to before the first item. If the node is empty, it is
7203+ positioned at the EMPTY_NODE. */
7204+void coord_init_before_first_item(coord_t * coord, const znode * node)
7205+{
7206+ int is_empty = node_is_empty(node);
7207+
7208+ coord_init_values(coord, node, 0, 0,
7209+ (is_empty ? EMPTY_NODE : BEFORE_UNIT));
7210+
7211+ assert("jmacd-9803", coord_check(coord));
7212+}
7213+
7214+/* Initialize a coordinate to after the last item. If the node is empty, it is positioned
7215+ at the EMPTY_NODE. */
7216+void coord_init_after_last_item(coord_t * coord, const znode * node)
7217+{
7218+ int is_empty = node_is_empty(node);
7219+
7220+ coord_init_values(coord, node,
7221+ (is_empty ? 0 : node_num_items(node) - 1), 0,
7222+ (is_empty ? EMPTY_NODE : AFTER_ITEM));
7223+
7224+ assert("jmacd-9804", coord_check(coord));
7225+}
7226+
7227+/* Initialize a coordinate to after last unit in the item. Coord must be set
7228+ already to existing item */
7229+void coord_init_after_item_end(coord_t * coord)
7230+{
7231+ coord->between = AFTER_UNIT;
7232+ coord->unit_pos = coord_last_unit_pos(coord);
7233+}
7234+
7235+/* Initialize a coordinate to before the item. Coord must be set already to existing item */
7236+void coord_init_before_item(coord_t * coord)
7237+{
7238+ coord->unit_pos = 0;
7239+ coord->between = BEFORE_ITEM;
7240+}
7241+
7242+/* Initialize a coordinate to after the item. Coord must be set already to existing item */
7243+void coord_init_after_item(coord_t * coord)
7244+{
7245+ coord->unit_pos = 0;
7246+ coord->between = AFTER_ITEM;
7247+}
7248+
7249+/* Initialize a coordinate with zeros. Used in places where init_coord was
7250+   used and it was not clear how it should actually be initialized */
7251+void coord_init_zero(coord_t * coord)
7252+{
7253+ memset(coord, 0, sizeof(*coord));
7254+}
7255+
7256+/* Return the number of units in the current item. Asserts coord_is_existing_item(). */
7257+unsigned coord_num_units(const coord_t * coord)
7258+{
7259+ assert("jmacd-9806", coord_is_existing_item(coord));
7260+
7261+ return item_plugin_by_coord(coord)->b.nr_units(coord);
7262+}
7263+
7264+/* Returns true if the coord was initialized by coord_init_invalid(). */
7265+/* Audited by: green(2002.06.15) */
7266+int coord_is_invalid(const coord_t * coord)
7267+{
7268+ return coord->between == INVALID_COORD;
7269+}
7270+
7271+/* Returns true if the coordinate is positioned at an existing item, not before or after
7272+ an item. It may be placed at, before, or after any unit within the item, whether
7273+ existing or not. */
7274+int coord_is_existing_item(const coord_t * coord)
7275+{
7276+ switch (coord->between) {
7277+ case EMPTY_NODE:
7278+ case BEFORE_ITEM:
7279+ case AFTER_ITEM:
7280+ case INVALID_COORD:
7281+ return 0;
7282+
7283+ case BEFORE_UNIT:
7284+ case AT_UNIT:
7285+ case AFTER_UNIT:
7286+ return coord->item_pos < coord_num_items(coord);
7287+ }
7288+
7289+ impossible("jmacd-9900", "unreachable coord: %p", coord);
7290+ return 0;
7291+}
7292+
7293+/* Returns true if the coordinate is positioned at an existing unit, not before or after a
7294+ unit. */
7295+/* Audited by: green(2002.06.15) */
7296+int coord_is_existing_unit(const coord_t * coord)
7297+{
7298+ switch (coord->between) {
7299+ case EMPTY_NODE:
7300+ case BEFORE_UNIT:
7301+ case AFTER_UNIT:
7302+ case BEFORE_ITEM:
7303+ case AFTER_ITEM:
7304+ case INVALID_COORD:
7305+ return 0;
7306+
7307+ case AT_UNIT:
7308+ return (coord->item_pos < coord_num_items(coord)
7309+ && coord->unit_pos < coord_num_units(coord));
7310+ }
7311+
7312+ impossible("jmacd-9902", "unreachable");
7313+ return 0;
7314+}
7315+
7316+/* Returns true if the coordinate is positioned at the first unit of the first item. Not
7317+ true for empty nodes nor coordinates positioned before the first item. */
7318+/* Audited by: green(2002.06.15) */
7319+int coord_is_leftmost_unit(const coord_t * coord)
7320+{
7321+ return (coord->between == AT_UNIT && coord->item_pos == 0
7322+ && coord->unit_pos == 0);
7323+}
7324+
7325+#if REISER4_DEBUG
7326+/* For assertions only, checks for a valid coordinate. */
7327+int coord_check(const coord_t * coord)
7328+{
7329+ if (coord->node == NULL) {
7330+ return 0;
7331+ }
7332+ if (znode_above_root(coord->node))
7333+ return 1;
7334+
7335+ switch (coord->between) {
7336+ default:
7337+ case INVALID_COORD:
7338+ return 0;
7339+ case EMPTY_NODE:
7340+ if (!node_is_empty(coord->node)) {
7341+ return 0;
7342+ }
7343+ return coord->item_pos == 0 && coord->unit_pos == 0;
7344+
7345+ case BEFORE_UNIT:
7346+ case AFTER_UNIT:
7347+ if (node_is_empty(coord->node) && (coord->item_pos == 0)
7348+ && (coord->unit_pos == 0))
7349+ return 1;
7350+ case AT_UNIT:
7351+ break;
7352+ case AFTER_ITEM:
7353+ case BEFORE_ITEM:
7354+ /* before/after item should not set unit_pos. */
7355+ if (coord->unit_pos != 0) {
7356+ return 0;
7357+ }
7358+ break;
7359+ }
7360+
7361+ if (coord->item_pos >= node_num_items(coord->node)) {
7362+ return 0;
7363+ }
7364+
7365+ /* FIXME-VS: we are going to check unit_pos. This makes no sense when
7366+	   between is set to either AFTER_ITEM or BEFORE_ITEM */
7367+ if (coord->between == AFTER_ITEM || coord->between == BEFORE_ITEM)
7368+ return 1;
7369+
7370+ if (coord_is_iplug_set(coord) &&
7371+ coord->unit_pos >
7372+ item_plugin_by_coord(coord)->b.nr_units(coord) - 1) {
7373+ return 0;
7374+ }
7375+ return 1;
7376+}
7377+#endif
7378+
7379+/* Adjust coordinate boundaries based on the number of items prior to coord_next/prev.
7380+   Returns 1 if the new position does not exist. */
7381+static int coord_adjust_items(coord_t * coord, unsigned items, int is_next)
7382+{
7383+ /* If the node is invalid, leave it. */
7384+ if (coord->between == INVALID_COORD) {
7385+ return 1;
7386+ }
7387+
7388+ /* If the node is empty, set it appropriately. */
7389+ if (items == 0) {
7390+ coord->between = EMPTY_NODE;
7391+ coord_set_item_pos(coord, 0);
7392+ coord->unit_pos = 0;
7393+ return 1;
7394+ }
7395+
7396+ /* If it was empty and it no longer is, set to BEFORE/AFTER_ITEM. */
7397+ if (coord->between == EMPTY_NODE) {
7398+ coord->between = (is_next ? BEFORE_ITEM : AFTER_ITEM);
7399+ coord_set_item_pos(coord, 0);
7400+ coord->unit_pos = 0;
7401+ return 0;
7402+ }
7403+
7404+	/* If the item_pos is out-of-range, set it appropriately. */
7405+ if (coord->item_pos >= items) {
7406+ coord->between = AFTER_ITEM;
7407+ coord_set_item_pos(coord, items - 1);
7408+ coord->unit_pos = 0;
7409+ /* If is_next, return 1 (can't go any further). */
7410+ return is_next;
7411+ }
7412+
7413+ return 0;
7414+}
7415+
7416+/* Advances the coordinate by one unit to the right. If empty, no change. If
7417+ coord_is_rightmost_unit, advances to AFTER THE LAST ITEM. Returns 0 if new position is an
7418+ existing unit. */
7419+int coord_next_unit(coord_t * coord)
7420+{
7421+ unsigned items = coord_num_items(coord);
7422+
7423+ if (coord_adjust_items(coord, items, 1) == 1) {
7424+ return 1;
7425+ }
7426+
7427+ switch (coord->between) {
7428+ case BEFORE_UNIT:
7429+ /* Now it is positioned at the same unit. */
7430+ coord->between = AT_UNIT;
7431+ return 0;
7432+
7433+ case AFTER_UNIT:
7434+ case AT_UNIT:
7435+ /* If it was at or after a unit and there are more units in this item,
7436+ advance to the next one. */
7437+ if (coord->unit_pos < coord_last_unit_pos(coord)) {
7438+ coord->unit_pos += 1;
7439+ coord->between = AT_UNIT;
7440+ return 0;
7441+ }
7442+
7443+ /* Otherwise, it is crossing an item boundary and treated as if it was
7444+ after the current item. */
7445+ coord->between = AFTER_ITEM;
7446+ coord->unit_pos = 0;
7447+ /* FALLTHROUGH */
7448+
7449+ case AFTER_ITEM:
7450+ /* Check for end-of-node. */
7451+ if (coord->item_pos == items - 1) {
7452+ return 1;
7453+ }
7454+
7455+ coord_inc_item_pos(coord);
7456+ coord->unit_pos = 0;
7457+ coord->between = AT_UNIT;
7458+ return 0;
7459+
7460+ case BEFORE_ITEM:
7461+ /* The adjust_items checks ensure that we are valid here. */
7462+ coord->unit_pos = 0;
7463+ coord->between = AT_UNIT;
7464+ return 0;
7465+
7466+ case INVALID_COORD:
7467+ case EMPTY_NODE:
7468+ /* Handled in coord_adjust_items(). */
7469+ break;
7470+ }
7471+
7472+ impossible("jmacd-9902", "unreachable");
7473+ return 0;
7474+}
7475+
7476+/* Advances the coordinate by one item to the right. If empty, no change. If
7477+ coord_is_rightmost_unit, advances to AFTER THE LAST ITEM. Returns 0 if new position is
7478+ an existing item. */
7479+int coord_next_item(coord_t * coord)
7480+{
7481+ unsigned items = coord_num_items(coord);
7482+
7483+ if (coord_adjust_items(coord, items, 1) == 1) {
7484+ return 1;
7485+ }
7486+
7487+ switch (coord->between) {
7488+ case AFTER_UNIT:
7489+ case AT_UNIT:
7490+ case BEFORE_UNIT:
7491+ case AFTER_ITEM:
7492+ /* Check for end-of-node. */
7493+ if (coord->item_pos == items - 1) {
7494+ coord->between = AFTER_ITEM;
7495+ coord->unit_pos = 0;
7496+ coord_clear_iplug(coord);
7497+ return 1;
7498+ }
7499+
7500+ /* Anywhere in an item, go to the next one. */
7501+ coord->between = AT_UNIT;
7502+ coord_inc_item_pos(coord);
7503+ coord->unit_pos = 0;
7504+ return 0;
7505+
7506+ case BEFORE_ITEM:
7507+ /* The out-of-range check ensures that we are valid here. */
7508+ coord->unit_pos = 0;
7509+ coord->between = AT_UNIT;
7510+ return 0;
7511+ case INVALID_COORD:
7512+ case EMPTY_NODE:
7513+ /* Handled in coord_adjust_items(). */
7514+ break;
7515+ }
7516+
7517+ impossible("jmacd-9903", "unreachable");
7518+ return 0;
7519+}
7520+
7521+/* Advances the coordinate by one unit to the left. If empty, no change. If
7522+ coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM. Returns 0 if new position
7523+ is an existing unit. */
7524+int coord_prev_unit(coord_t * coord)
7525+{
7526+ unsigned items = coord_num_items(coord);
7527+
7528+ if (coord_adjust_items(coord, items, 0) == 1) {
7529+ return 1;
7530+ }
7531+
7532+ switch (coord->between) {
7533+ case AT_UNIT:
7534+ case BEFORE_UNIT:
7535+ if (coord->unit_pos > 0) {
7536+ coord->unit_pos -= 1;
7537+ coord->between = AT_UNIT;
7538+ return 0;
7539+ }
7540+
7541+ if (coord->item_pos == 0) {
7542+ coord->between = BEFORE_ITEM;
7543+ return 1;
7544+ }
7545+
7546+ coord_dec_item_pos(coord);
7547+ coord->unit_pos = coord_last_unit_pos(coord);
7548+ coord->between = AT_UNIT;
7549+ return 0;
7550+
7551+ case AFTER_UNIT:
7552+ /* What if unit_pos is out-of-range? */
7553+ assert("jmacd-5442",
7554+ coord->unit_pos <= coord_last_unit_pos(coord));
7555+ coord->between = AT_UNIT;
7556+ return 0;
7557+
7558+ case BEFORE_ITEM:
7559+ if (coord->item_pos == 0) {
7560+ return 1;
7561+ }
7562+
7563+ coord_dec_item_pos(coord);
7564+ /* FALLTHROUGH */
7565+
7566+ case AFTER_ITEM:
7567+ coord->between = AT_UNIT;
7568+ coord->unit_pos = coord_last_unit_pos(coord);
7569+ return 0;
7570+
7571+ case INVALID_COORD:
7572+ case EMPTY_NODE:
7573+ break;
7574+ }
7575+
7576+ impossible("jmacd-9904", "unreachable");
7577+ return 0;
7578+}
7579+
7580+/* Advances the coordinate by one item to the left. If empty, no change. If
7581+ coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM. Returns 0 if new position
7582+ is an existing item. */
7583+int coord_prev_item(coord_t * coord)
7584+{
7585+ unsigned items = coord_num_items(coord);
7586+
7587+ if (coord_adjust_items(coord, items, 0) == 1) {
7588+ return 1;
7589+ }
7590+
7591+ switch (coord->between) {
7592+ case AT_UNIT:
7593+ case AFTER_UNIT:
7594+ case BEFORE_UNIT:
7595+ case BEFORE_ITEM:
7596+
7597+ if (coord->item_pos == 0) {
7598+ coord->between = BEFORE_ITEM;
7599+ coord->unit_pos = 0;
7600+ return 1;
7601+ }
7602+
7603+ coord_dec_item_pos(coord);
7604+ coord->unit_pos = 0;
7605+ coord->between = AT_UNIT;
7606+ return 0;
7607+
7608+ case AFTER_ITEM:
7609+ coord->between = AT_UNIT;
7610+ coord->unit_pos = 0;
7611+ return 0;
7612+
7613+ case INVALID_COORD:
7614+ case EMPTY_NODE:
7615+ break;
7616+ }
7617+
7618+ impossible("jmacd-9905", "unreachable");
7619+ return 0;
7620+}
7621+
7622+/* Calls either coord_init_first_unit or coord_init_last_unit depending on sideof argument. */
7623+void coord_init_sideof_unit(coord_t * coord, const znode * node, sideof dir)
7624+{
7625+ assert("jmacd-9821", dir == LEFT_SIDE || dir == RIGHT_SIDE);
7626+ if (dir == LEFT_SIDE) {
7627+ coord_init_first_unit(coord, node);
7628+ } else {
7629+ coord_init_last_unit(coord, node);
7630+ }
7631+}
7632+
7633+/* Calls either coord_is_before_leftmost or coord_is_after_rightmost depending on sideof
7634+ argument. */
7635+/* Audited by: green(2002.06.15) */
7636+int coord_is_after_sideof_unit(coord_t * coord, sideof dir)
7637+{
7638+ assert("jmacd-9822", dir == LEFT_SIDE || dir == RIGHT_SIDE);
7639+ if (dir == LEFT_SIDE) {
7640+ return coord_is_before_leftmost(coord);
7641+ } else {
7642+ return coord_is_after_rightmost(coord);
7643+ }
7644+}
7645+
7646+/* Calls either coord_next_unit or coord_prev_unit depending on sideof argument. */
7647+/* Audited by: green(2002.06.15) */
7648+int coord_sideof_unit(coord_t * coord, sideof dir)
7649+{
7650+ assert("jmacd-9823", dir == LEFT_SIDE || dir == RIGHT_SIDE);
7651+ if (dir == LEFT_SIDE) {
7652+ return coord_prev_unit(coord);
7653+ } else {
7654+ return coord_next_unit(coord);
7655+ }
7656+}
7657+
7658+#if REISER4_DEBUG
7659+#define DEBUG_COORD_FIELDS (sizeof(c1->plug_v) + sizeof(c1->body_v))
7660+#else
7661+#define DEBUG_COORD_FIELDS (0)
7662+#endif
7663+
7664+int coords_equal(const coord_t * c1, const coord_t * c2)
7665+{
7666+ assert("nikita-2840", c1 != NULL);
7667+ assert("nikita-2841", c2 != NULL);
7668+
7669+ return
7670+ c1->node == c2->node &&
7671+ c1->item_pos == c2->item_pos &&
7672+ c1->unit_pos == c2->unit_pos && c1->between == c2->between;
7673+}
7674+
7675+/* If coord_is_after_rightmost return COORD_ON_THE_RIGHT, if coord_is_before_leftmost
7676+   return COORD_ON_THE_LEFT, otherwise return COORD_INSIDE. */
7677+/* Audited by: green(2002.06.15) */
7678+coord_wrt_node coord_wrt(const coord_t * coord)
7679+{
7680+ if (coord_is_before_leftmost(coord)) {
7681+ return COORD_ON_THE_LEFT;
7682+ }
7683+
7684+ if (coord_is_after_rightmost(coord)) {
7685+ return COORD_ON_THE_RIGHT;
7686+ }
7687+
7688+ return COORD_INSIDE;
7689+}
7690+
7691+/* Returns true if the coordinate is positioned after the last item or after the last unit
7692+ of the last item or it is an empty node. */
7693+/* Audited by: green(2002.06.15) */
7694+int coord_is_after_rightmost(const coord_t * coord)
7695+{
7696+ assert("jmacd-7313", coord_check(coord));
7697+
7698+ switch (coord->between) {
7699+ case INVALID_COORD:
7700+ case AT_UNIT:
7701+ case BEFORE_UNIT:
7702+ case BEFORE_ITEM:
7703+ return 0;
7704+
7705+ case EMPTY_NODE:
7706+ return 1;
7707+
7708+ case AFTER_ITEM:
7709+ return (coord->item_pos == node_num_items(coord->node) - 1);
7710+
7711+ case AFTER_UNIT:
7712+ return ((coord->item_pos == node_num_items(coord->node) - 1) &&
7713+ coord->unit_pos == coord_last_unit_pos(coord));
7714+ }
7715+
7716+ impossible("jmacd-9908", "unreachable");
7717+ return 0;
7718+}
7719+
7720+/* Returns true if the coordinate is positioned before the first item or it is an empty
7721+ node. */
7722+int coord_is_before_leftmost(const coord_t * coord)
7723+{
7724+ /* FIXME-VS: coord_check requires node to be loaded whereas it is not
7725+ necessary to check if coord is set before leftmost
7726+ assert ("jmacd-7313", coord_check (coord)); */
7727+ switch (coord->between) {
7728+ case INVALID_COORD:
7729+ case AT_UNIT:
7730+ case AFTER_ITEM:
7731+ case AFTER_UNIT:
7732+ return 0;
7733+
7734+ case EMPTY_NODE:
7735+ return 1;
7736+
7737+ case BEFORE_ITEM:
7738+ case BEFORE_UNIT:
7739+ return (coord->item_pos == 0) && (coord->unit_pos == 0);
7740+ }
7741+
7742+ impossible("jmacd-9908", "unreachable");
7743+ return 0;
7744+}
7745+
7746+/* Returns true if the coordinate is positioned after an item, before an item, after the
7747+ last unit of an item, before the first unit of an item, or at an empty node. */
7748+/* Audited by: green(2002.06.15) */
7749+int coord_is_between_items(const coord_t * coord)
7750+{
7751+ assert("jmacd-7313", coord_check(coord));
7752+
7753+ switch (coord->between) {
7754+ case INVALID_COORD:
7755+ case AT_UNIT:
7756+ return 0;
7757+
7758+ case AFTER_ITEM:
7759+ case BEFORE_ITEM:
7760+ case EMPTY_NODE:
7761+ return 1;
7762+
7763+ case BEFORE_UNIT:
7764+ return coord->unit_pos == 0;
7765+
7766+ case AFTER_UNIT:
7767+ return coord->unit_pos == coord_last_unit_pos(coord);
7768+ }
7769+
7770+ impossible("jmacd-9908", "unreachable");
7771+ return 0;
7772+}
7773+
7774+/* Returns true if the coordinates are positioned at adjacent units, regardless of
7775+ before-after or item boundaries. */
7776+int coord_are_neighbors(coord_t * c1, coord_t * c2)
7777+{
7778+ coord_t *left;
7779+ coord_t *right;
7780+
7781+ assert("nikita-1241", c1 != NULL);
7782+ assert("nikita-1242", c2 != NULL);
7783+ assert("nikita-1243", c1->node == c2->node);
7784+ assert("nikita-1244", coord_is_existing_unit(c1));
7785+ assert("nikita-1245", coord_is_existing_unit(c2));
7786+
7787+ left = right = NULL;
7788+ switch (coord_compare(c1, c2)) {
7789+ case COORD_CMP_ON_LEFT:
7790+ left = c1;
7791+ right = c2;
7792+ break;
7793+ case COORD_CMP_ON_RIGHT:
7794+ left = c2;
7795+ right = c1;
7796+ break;
7797+ case COORD_CMP_SAME:
7798+ return 0;
7799+ default:
7800+ wrong_return_value("nikita-1246", "compare_coords()");
7801+ }
7802+ assert("vs-731", left && right);
7803+ if (left->item_pos == right->item_pos) {
7804+ return left->unit_pos + 1 == right->unit_pos;
7805+ } else if (left->item_pos + 1 == right->item_pos) {
7806+ return (left->unit_pos == coord_last_unit_pos(left))
7807+ && (right->unit_pos == 0);
7808+ } else {
7809+ return 0;
7810+ }
7811+}
7812+
7813+/* Assuming two coordinates are positioned in the same node, return COORD_CMP_ON_RIGHT,
7814+ COORD_CMP_ON_LEFT, or COORD_CMP_SAME depending on c1's position relative to c2. */
7815+/* Audited by: green(2002.06.15) */
7816+coord_cmp coord_compare(coord_t * c1, coord_t * c2)
7817+{
7818+ assert("vs-209", c1->node == c2->node);
7819+ assert("vs-194", coord_is_existing_unit(c1)
7820+ && coord_is_existing_unit(c2));
7821+
7822+ if (c1->item_pos > c2->item_pos)
7823+ return COORD_CMP_ON_RIGHT;
7824+ if (c1->item_pos < c2->item_pos)
7825+ return COORD_CMP_ON_LEFT;
7826+ if (c1->unit_pos > c2->unit_pos)
7827+ return COORD_CMP_ON_RIGHT;
7828+ if (c1->unit_pos < c2->unit_pos)
7829+ return COORD_CMP_ON_LEFT;
7830+ return COORD_CMP_SAME;
7831+}
7832+
7833+/* If the coordinate is between items, shifts it to the right. Returns 0 on success and
7834+ non-zero if there is no position to the right. */
7835+int coord_set_to_right(coord_t * coord)
7836+{
7837+ unsigned items = coord_num_items(coord);
7838+
7839+ if (coord_adjust_items(coord, items, 1) == 1) {
7840+ return 1;
7841+ }
7842+
7843+ switch (coord->between) {
7844+ case AT_UNIT:
7845+ return 0;
7846+
7847+ case BEFORE_ITEM:
7848+ case BEFORE_UNIT:
7849+ coord->between = AT_UNIT;
7850+ return 0;
7851+
7852+ case AFTER_UNIT:
7853+ if (coord->unit_pos < coord_last_unit_pos(coord)) {
7854+ coord->unit_pos += 1;
7855+ coord->between = AT_UNIT;
7856+ return 0;
7857+ } else {
7858+
7859+ coord->unit_pos = 0;
7860+
7861+ if (coord->item_pos == items - 1) {
7862+ coord->between = AFTER_ITEM;
7863+ return 1;
7864+ }
7865+
7866+ coord_inc_item_pos(coord);
7867+ coord->between = AT_UNIT;
7868+ return 0;
7869+ }
7870+
7871+ case AFTER_ITEM:
7872+ if (coord->item_pos == items - 1) {
7873+ return 1;
7874+ }
7875+
7876+ coord_inc_item_pos(coord);
7877+ coord->unit_pos = 0;
7878+ coord->between = AT_UNIT;
7879+ return 0;
7880+
7881+ case EMPTY_NODE:
7882+ return 1;
7883+
7884+ case INVALID_COORD:
7885+ break;
7886+ }
7887+
7888+ impossible("jmacd-9920", "unreachable");
7889+ return 0;
7890+}
7891+
7892+/* If the coordinate is between items, shifts it to the left. Returns 0 on success and
7893+ non-zero if there is no position to the left. */
7894+int coord_set_to_left(coord_t * coord)
7895+{
7896+ unsigned items = coord_num_items(coord);
7897+
7898+ if (coord_adjust_items(coord, items, 0) == 1) {
7899+ return 1;
7900+ }
7901+
7902+ switch (coord->between) {
7903+ case AT_UNIT:
7904+ return 0;
7905+
7906+ case AFTER_UNIT:
7907+ coord->between = AT_UNIT;
7908+ return 0;
7909+
7910+ case AFTER_ITEM:
7911+ coord->between = AT_UNIT;
7912+ coord->unit_pos = coord_last_unit_pos(coord);
7913+ return 0;
7914+
7915+ case BEFORE_UNIT:
7916+ if (coord->unit_pos > 0) {
7917+ coord->unit_pos -= 1;
7918+ coord->between = AT_UNIT;
7919+ return 0;
7920+ } else {
7921+
7922+ if (coord->item_pos == 0) {
7923+ coord->between = BEFORE_ITEM;
7924+ return 1;
7925+ }
7926+
7927+ coord->unit_pos = coord_last_unit_pos(coord);
7928+ coord_dec_item_pos(coord);
7929+ coord->between = AT_UNIT;
7930+ return 0;
7931+ }
7932+
7933+ case BEFORE_ITEM:
7934+ if (coord->item_pos == 0) {
7935+ return 1;
7936+ }
7937+
7938+ coord_dec_item_pos(coord);
7939+ coord->unit_pos = coord_last_unit_pos(coord);
7940+ coord->between = AT_UNIT;
7941+ return 0;
7942+
7943+ case EMPTY_NODE:
7944+ return 1;
7945+
7946+ case INVALID_COORD:
7947+ break;
7948+ }
7949+
7950+ impossible("jmacd-9920", "unreachable");
7951+ return 0;
7952+}
7953+
7954+static const char *coord_tween_tostring(between_enum n)
7955+{
7956+ switch (n) {
7957+ case BEFORE_UNIT:
7958+ return "before unit";
7959+ case BEFORE_ITEM:
7960+ return "before item";
7961+ case AT_UNIT:
7962+ return "at unit";
7963+ case AFTER_UNIT:
7964+ return "after unit";
7965+ case AFTER_ITEM:
7966+ return "after item";
7967+ case EMPTY_NODE:
7968+ return "empty node";
7969+ case INVALID_COORD:
7970+ return "invalid";
7971+ default:
7972+ {
7973+ static char buf[30];
7974+
7975+ sprintf(buf, "unknown: %i", n);
7976+ return buf;
7977+ }
7978+ }
7979+}
7980+
7981+void print_coord(const char *mes, const coord_t * coord, int node)
7982+{
7983+ if (coord == NULL) {
7984+ printk("%s: null\n", mes);
7985+ return;
7986+ }
7987+ printk("%s: item_pos = %d, unit_pos %d, tween=%s, iplug=%d\n",
7988+ mes, coord->item_pos, coord->unit_pos,
7989+ coord_tween_tostring(coord->between), coord->iplugid);
7990+}
7991+
7992+int
7993+item_utmost_child_real_block(const coord_t * coord, sideof side,
7994+ reiser4_block_nr * blk)
7995+{
7996+ return item_plugin_by_coord(coord)->f.utmost_child_real_block(coord,
7997+ side,
7998+ blk);
7999+}
8000+
8001+int item_utmost_child(const coord_t * coord, sideof side, jnode ** child)
8002+{
8003+ return item_plugin_by_coord(coord)->f.utmost_child(coord, side, child);
8004+}
8005+
8006+/* @count bytes of flow @f got written, update correspondingly f->length,
8007+ f->data and f->key */
8008+void move_flow_forward(flow_t * f, unsigned count)
8009+{
8010+ if (f->data)
8011+ f->data += count;
8012+ f->length -= count;
8013+ set_key_offset(&f->key, get_key_offset(&f->key) + count);
8014+}
8015+
8016+/*
8017+ Local variables:
8018+ c-indentation-style: "K&R"
8019+ mode-name: "LC"
8020+ c-basic-offset: 8
8021+ tab-width: 8
8022+ fill-column: 120
8023+ scroll-step: 1
8024+ End:
8025+*/
8026Index: linux-2.6.16/fs/reiser4/coord.h
8027===================================================================
8028--- /dev/null
8029+++ linux-2.6.16/fs/reiser4/coord.h
8030@@ -0,0 +1,389 @@
8031+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
8032+
8033+/* Coords */
8034+
8035+#if !defined( __REISER4_COORD_H__ )
8036+#define __REISER4_COORD_H__
8037+
8038+#include "forward.h"
8039+#include "debug.h"
8040+#include "dformat.h"
8041+#include "key.h"
8042+
8043+/* insertions happen between coords in the tree, so we need some means
8044+ of specifying the sense of betweenness. */
8045+typedef enum {
8046+	BEFORE_UNIT,		/* Note: init_coord depends on this value being zero. */
8047+ AT_UNIT,
8048+ AFTER_UNIT,
8049+ BEFORE_ITEM,
8050+ AFTER_ITEM,
8051+ INVALID_COORD,
8052+ EMPTY_NODE,
8053+} between_enum;
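+
+/*
+ * For example, in a node [ I0: u0 u1 | I1: u0 ] the coord
+ * { .item_pos = 0, .unit_pos = 1, .between = AFTER_UNIT } names the gap
+ * between I0 and I1, while the same positions with .between = AT_UNIT name
+ * the unit u1 itself: AT_UNIT addresses existing contents, the remaining
+ * values address the gaps where insertions happen.
+ */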
8054+
8055+/* location of coord w.r.t. its node */
8056+typedef enum {
8057+ COORD_ON_THE_LEFT = -1,
8058+ COORD_ON_THE_RIGHT = +1,
8059+ COORD_INSIDE = 0
8060+} coord_wrt_node;
8061+
8062+typedef enum {
8063+ COORD_CMP_SAME = 0, COORD_CMP_ON_LEFT = -1, COORD_CMP_ON_RIGHT = +1
8064+} coord_cmp;
8065+
8066+struct coord {
8067+ /* node in a tree */
8068+ /* 0 */ znode *node;
8069+
8070+ /* position of item within node */
8071+ /* 4 */ pos_in_node_t item_pos;
8072+ /* position of unit within item */
8073+ /* 6 */ pos_in_node_t unit_pos;
8074+ /* optimization: plugin of item is stored in coord_t. Until this was
8075+	   implemented, item_plugin_by_coord() was a major CPU consumer. ->iplugid
8076+ is invalidated (set to 0xff) on each modification of ->item_pos,
8077+ and all such modifications are funneled through coord_*_item_pos()
8078+ functions below.
8079+ */
8080+ /* 8 */ char iplugid;
8081+ /* position of coord w.r.t. to neighboring items and/or units.
8082+ Values are taken from &between_enum above.
8083+ */
8084+ /* 9 */ char between;
8085+ /* padding. It will be added by the compiler anyway to conform to the
8086+ * C language alignment requirements. We keep it here to be on the
8087+ * safe side and to have a clear picture of the memory layout of this
8088+ * structure. */
8089+ /* 10 */ __u16 pad;
8090+ /* 12 */ int offset;
8091+#if REISER4_DEBUG
8092+ unsigned long plug_v;
8093+ unsigned long body_v;
8094+#endif
8095+};
8096+
8097+#define INVALID_PLUGID ((char)((1 << 8) - 1))
8098+#define INVALID_OFFSET -1
8099+
8100+static inline void coord_clear_iplug(coord_t * coord)
8101+{
8102+ assert("nikita-2835", coord != NULL);
8103+ coord->iplugid = INVALID_PLUGID;
8104+ coord->offset = INVALID_OFFSET;
8105+}
8106+
8107+static inline int coord_is_iplug_set(const coord_t * coord)
8108+{
8109+ assert("nikita-2836", coord != NULL);
8110+ return coord->iplugid != INVALID_PLUGID;
8111+}
8112+
8113+static inline void coord_set_item_pos(coord_t * coord, pos_in_node_t pos)
8114+{
8115+ assert("nikita-2478", coord != NULL);
8116+ coord->item_pos = pos;
8117+ coord_clear_iplug(coord);
8118+}
8119+
8120+static inline void coord_dec_item_pos(coord_t * coord)
8121+{
8122+ assert("nikita-2480", coord != NULL);
8123+ --coord->item_pos;
8124+ coord_clear_iplug(coord);
8125+}
8126+
8127+static inline void coord_inc_item_pos(coord_t * coord)
8128+{
8129+ assert("nikita-2481", coord != NULL);
8130+ ++coord->item_pos;
8131+ coord_clear_iplug(coord);
8132+}
8133+
8134+static inline void coord_add_item_pos(coord_t * coord, int delta)
8135+{
8136+ assert("nikita-2482", coord != NULL);
8137+ coord->item_pos += delta;
8138+ coord_clear_iplug(coord);
8139+}
8140+
8141+static inline void coord_invalid_item_pos(coord_t * coord)
8142+{
8143+ assert("nikita-2832", coord != NULL);
8144+ coord->item_pos = (unsigned short)~0;
8145+ coord_clear_iplug(coord);
8146+}
8147+
8148+/* Reverse a direction. */
8149+static inline sideof sideof_reverse(sideof side)
8150+{
8151+ return side == LEFT_SIDE ? RIGHT_SIDE : LEFT_SIDE;
8152+}
8153+
8154+/* NOTE: There is a somewhat odd mixture of the following opposed terms:
8155+
8156+ "first" and "last"
8157+ "next" and "prev"
8158+ "before" and "after"
8159+ "leftmost" and "rightmost"
8160+
8161+ But I think the chosen names are decent the way they are.
8162+*/
8163+
8164+/* COORD INITIALIZERS */
8165+
8166+/* Initialize an invalid coordinate. */
8167+extern void coord_init_invalid(coord_t * coord, const znode * node);
8168+
8169+extern void coord_init_first_unit_nocheck(coord_t * coord, const znode * node);
8170+
8171+/* Initialize a coordinate to point at the first unit of the first item. If the node is
8172+ empty, it is positioned at the EMPTY_NODE. */
8173+extern void coord_init_first_unit(coord_t * coord, const znode * node);
8174+
8175+/* Initialize a coordinate to point at the last unit of the last item. If the node is
8176+ empty, it is positioned at the EMPTY_NODE. */
8177+extern void coord_init_last_unit(coord_t * coord, const znode * node);
8178+
8179+/* Initialize a coordinate to before the first item. If the node is empty, it is
8180+ positioned at the EMPTY_NODE. */
8181+extern void coord_init_before_first_item(coord_t * coord, const znode * node);
8182+
8183+/* Initialize a coordinate to after the last item. If the node is empty, it is positioned
8184+ at the EMPTY_NODE. */
8185+extern void coord_init_after_last_item(coord_t * coord, const znode * node);
8186+
8187+/* Initialize a coordinate to after last unit in the item. Coord must be set
8188+ already to existing item */
8189+void coord_init_after_item_end(coord_t * coord);
8190+
8191+/* Initialize a coordinate to before the item. Coord must be set already to existing item */
8192+void coord_init_before_item(coord_t *);
8193+/* Initialize a coordinate to after the item. Coord must be set already to existing item */
8194+void coord_init_after_item(coord_t *);
8195+
8196+/* Calls either coord_init_first_unit or coord_init_last_unit depending on sideof argument. */
8197+extern void coord_init_sideof_unit(coord_t * coord, const znode * node,
8198+ sideof dir);
8199+
8200+/* Initialize a coordinate with zeros. Used in places where init_coord was
8201+   used and it was not clear how it should actually be initialized
8202+ FIXME-VS: added by vs (2002, june, 8) */
8203+extern void coord_init_zero(coord_t * coord);
8204+
8205+/* COORD METHODS */
8206+
8207+/* after shifting of node content, a coord previously set properly may become
8208+   invalid; try to "normalize" it. */
8209+void coord_normalize(coord_t * coord);
8210+
8211+/* Copy a coordinate. */
8212+extern void coord_dup(coord_t * coord, const coord_t * old_coord);
8213+
8214+/* Copy a coordinate without check. */
8215+void coord_dup_nocheck(coord_t * coord, const coord_t * old_coord);
8216+
8217+unsigned coord_num_units(const coord_t * coord);
8218+
8219+/* Return the last valid unit number in the current item (i.e.,
8220+ coord_num_units() - 1). */
8221+static inline unsigned coord_last_unit_pos(const coord_t * coord)
8222+{
8223+ return coord_num_units(coord) - 1;
8224+}
8225+
8226+#if REISER4_DEBUG
8227+/* For assertions only, checks for a valid coordinate. */
8228+extern int coord_check(const coord_t * coord);
8229+
8230+extern unsigned long znode_times_locked(const znode * z);
8231+
8232+static inline void coord_update_v(coord_t * coord)
8233+{
8234+ coord->plug_v = coord->body_v = znode_times_locked(coord->node);
8235+}
8236+#endif
8237+
8238+extern int coords_equal(const coord_t * c1, const coord_t * c2);
8239+
8240+extern void print_coord(const char *mes, const coord_t * coord, int print_node);
8241+
8242+/* If coord_is_after_rightmost return COORD_ON_THE_RIGHT, if coord_is_before_leftmost
8243+   return COORD_ON_THE_LEFT, otherwise return COORD_INSIDE. */
8244+extern coord_wrt_node coord_wrt(const coord_t * coord);
8245+
8246+/* Returns true if the coordinates are positioned at adjacent units, regardless of
8247+ before-after or item boundaries. */
8248+extern int coord_are_neighbors(coord_t * c1, coord_t * c2);
8249+
8250+/* Assuming two coordinates are positioned in the same node, return COORD_CMP_ON_RIGHT,
8251+   COORD_CMP_ON_LEFT, or COORD_CMP_SAME depending on c1's position relative to c2. */
8252+extern coord_cmp coord_compare(coord_t * c1, coord_t * c2);
8253+
8254+/* COORD PREDICATES */
8255+
8256+/* Returns true if the coord was initialized by coord_init_invalid(). */
8257+extern int coord_is_invalid(const coord_t * coord);
8258+
8259+/* Returns true if the coordinate is positioned at an existing item, not before or after
8260+ an item. It may be placed at, before, or after any unit within the item, whether
8261+ existing or not. If this is true you can call methods of the item plugin. */
8262+extern int coord_is_existing_item(const coord_t * coord);
8263+
8264+/* Returns true if the coordinate is positioned after an item, before an item, after the
8265+ last unit of an item, before the first unit of an item, or at an empty node. */
8266+extern int coord_is_between_items(const coord_t * coord);
8267+
8268+/* Returns true if the coordinate is positioned at an existing unit, not before or after a
8269+ unit. */
8270+extern int coord_is_existing_unit(const coord_t * coord);
8271+
8272+/* Returns true if the coordinate is positioned at an empty node. */
8273+extern int coord_is_empty(const coord_t * coord);
8274+
8275+/* Returns true if the coordinate is positioned at the first unit of the first item. Not
8276+ true for empty nodes nor coordinates positioned before the first item. */
8277+extern int coord_is_leftmost_unit(const coord_t * coord);
8278+
8279+/* Returns true if the coordinate is positioned after the last item or after the last unit
8280+ of the last item or it is an empty node. */
8281+extern int coord_is_after_rightmost(const coord_t * coord);
8282+
8283+/* Returns true if the coordinate is positioned before the first item or it is an empty
8284+ node. */
8285+extern int coord_is_before_leftmost(const coord_t * coord);
8286+
8287+/* Calls either coord_is_before_leftmost or coord_is_after_rightmost depending on sideof
8288+ argument. */
8289+extern int coord_is_after_sideof_unit(coord_t * coord, sideof dir);
8290+
8291+/* COORD MODIFIERS */
8292+
8293+/* Advances the coordinate by one unit to the right. If empty, no change. If
8294+ coord_is_rightmost_unit, advances to AFTER THE LAST ITEM. Returns 0 if new position is
8295+ an existing unit. */
8296+extern int coord_next_unit(coord_t * coord);
8297+
8298+/* Advances the coordinate by one item to the right. If empty, no change. If
8299+ coord_is_rightmost_unit, advances to AFTER THE LAST ITEM. Returns 0 if new position is
8300+ an existing item. */
8301+extern int coord_next_item(coord_t * coord);
8302+
8303+/* Advances the coordinate by one unit to the left. If empty, no change. If
8304+ coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM. Returns 0 if new position
8305+ is an existing unit. */
8306+extern int coord_prev_unit(coord_t * coord);
8307+
8308+/* Advances the coordinate by one item to the left. If empty, no change. If
8309+ coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM. Returns 0 if new position
8310+ is an existing item. */
8311+extern int coord_prev_item(coord_t * coord);
8312+
8313+/* If the coordinate is between items, shifts it to the right. Returns 0 on success and
8314+ non-zero if there is no position to the right. */
8315+extern int coord_set_to_right(coord_t * coord);
8316+
8317+/* If the coordinate is between items, shifts it to the left. Returns 0 on success and
8318+ non-zero if there is no position to the left. */
8319+extern int coord_set_to_left(coord_t * coord);
8320+
8321+/* If the coordinate is at an existing unit, set to after that unit. Returns 0 on success
8322+ and non-zero if the unit did not exist. */
8323+extern int coord_set_after_unit(coord_t * coord);
8324+
8325+/* Calls either coord_next_unit or coord_prev_unit depending on sideof argument. */
8326+extern int coord_sideof_unit(coord_t * coord, sideof dir);
8327+
8328+/* iterate over all units in @node */
8329+#define for_all_units( coord, node ) \
8330+ for( coord_init_before_first_item( ( coord ), ( node ) ) ; \
8331+ coord_next_unit( coord ) == 0 ; )
8332+
8333+/* iterate over all items in @node */
8334+#define for_all_items( coord, node ) \
8335+ for( coord_init_before_first_item( ( coord ), ( node ) ) ; \
8336+ coord_next_item( coord ) == 0 ; )
8337+
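+/* Editor's note (illustrative, not part of the original patch): a minimal
+   sketch of how the iteration macros above are used; @node and the counter
+   are hypothetical:
+
+	coord_t scan;
+	int nr_items = 0;
+
+	for_all_items( &scan, node )
+		++nr_items;
+
+   The loop body runs with @scan positioned at an existing item; iteration
+   stops once coord_next_item() steps past the last item. */
+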
8338+/* COORD/ITEM METHODS */
8339+
8340+extern int item_utmost_child_real_block(const coord_t * coord, sideof side,
8341+ reiser4_block_nr * blk);
8342+extern int item_utmost_child(const coord_t * coord, sideof side,
8343+ jnode ** child);
8344+
8345+/* a flow is a sequence of bytes being written to or read from the tree. The
8346+ tree will slice the flow into items while storing it into nodes, but all of
8347+ that is hidden from anything outside the tree. */
8348+
8349+struct flow {
8350+ reiser4_key key; /* key of start of flow's sequence of bytes */
8351+ loff_t length; /* length of flow's sequence of bytes */
8352+ char *data; /* start of flow's sequence of bytes */
8353+ int user; /* if 1 data is user space, 0 - kernel space */
8354+	rw_op op;	/* type of operation: read or write */
8355+};
8356+
8357+void move_flow_forward(flow_t * f, unsigned count);
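+
+/* Editor's sketch (an assumption, not the authoritative body): a plausible
+   implementation of move_flow_forward(), using the get_key_offset() /
+   set_key_offset() key helpers defined elsewhere in reiser4:
+
+	void move_flow_forward(flow_t * f, unsigned count)
+	{
+		if (f->data != NULL)
+			f->data += count;
+		f->length -= count;
+		set_key_offset(&f->key, get_key_offset(&f->key) + count);
+	}
+
+   That is, the flow is advanced past @count bytes that have already been
+   transferred to or from the tree. */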
8358+
8359+/* &reiser4_item_data - description of data to be inserted or pasted
8360+
8361+ Q: articulate the reasons for the difference between this and flow.
8362+
8363+   A: Besides flows, we insert other things into the tree: stat data,
8364+   directory entries, etc. To insert them into the tree one has to provide
8365+   this structure. If one is going to insert a flow, one can use insert_flow,
8366+   where this structure does not have to be created.
8367+*/
8368+struct reiser4_item_data {
8369+ /* actual data to be inserted. If NULL, ->create_item() will not
8370+ do xmemcpy itself, leaving this up to the caller. This can
8371+ save some amount of unnecessary memory copying, for example,
8372+ during insertion of stat data.
8373+
8374+ */
8375+ char *data;
8376+ /* 1 if 'char * data' contains pointer to user space and 0 if it is
8377+ kernel space */
8378+ int user;
8379+ /* amount of data we are going to insert or paste */
8380+ int length;
8381+ /* "Arg" is opaque data that is passed down to the
8382+ ->create_item() method of node layout, which in turn
8383+ hands it to the ->create_hook() of item being created. This
8384+ arg is currently used by:
8385+
8386+ . ->create_hook() of internal item
8387+ (fs/reiser4/plugin/item/internal.c:internal_create_hook()),
8388+ . ->paste() method of directory item.
8389+ . ->create_hook() of extent item
8390+
8391+	 For an internal item, this is the left "brother" of the new node
8392+	 being inserted; it is used to add the new node into the sibling list
8393+	 after the pointer to it has just been inserted into the parent.
8394+
8395+	 While ->arg does look like a somewhat unnecessary complication,
8396+	 it actually saves a lot of headache in many places, because
8397+	 all data necessary to insert or paste new data into the tree are
8398+ collected in one place, and this eliminates a lot of extra
8399+ argument passing and storing everywhere.
8400+
8401+ */
8402+ void *arg;
8403+ /* plugin of item we are inserting */
8404+ item_plugin *iplug;
8405+};
8406+
8407+/* __REISER4_COORD_H__ */
8408+#endif
8409+
8410+/* Make Linus happy.
8411+ Local variables:
8412+ c-indentation-style: "K&R"
8413+ mode-name: "LC"
8414+ c-basic-offset: 8
8415+ tab-width: 8
8416+ fill-column: 120
8417+ scroll-step: 1
8418+ End:
8419+*/
8420Index: linux-2.6.16/fs/reiser4/debug.c
8421===================================================================
8422--- /dev/null
8423+++ linux-2.6.16/fs/reiser4/debug.c
8424@@ -0,0 +1,300 @@
8425+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
8426+ * reiser4/README */
8427+
8428+/* Debugging facilities. */
8429+
8430+/*
8431+ * This file contains generic debugging functions used by reiser4. Roughly
8432+ * following:
8433+ *
8434+ * panicking: reiser4_do_panic(), reiser4_print_prefix().
8435+ *
8436+ * locking: schedulable(), lock_counters(), print_lock_counters(),
8437+ * no_counters_are_held(), commit_check_locks()
8438+ *
8439+ * error code monitoring (see comment before RETERR macro): return_err(),
8440+ * report_err().
8441+ *
8442+ * stack back-tracing: fill_backtrace()
8443+ *
8444+ * miscellaneous: preempt_point(), call_on_each_assert(), debugtrap().
8445+ *
8446+ */
8447+
8448+#include "reiser4.h"
8449+#include "context.h"
8450+#include "super.h"
8451+#include "txnmgr.h"
8452+#include "znode.h"
8453+
8454+#include <linux/sysfs.h>
8455+#include <linux/slab.h>
8456+#include <linux/types.h>
8457+#include <linux/fs.h>
8458+#include <linux/spinlock.h>
8459+#include <linux/kallsyms.h>
8460+#include <linux/vmalloc.h>
8461+#include <linux/ctype.h>
8462+#include <linux/sysctl.h>
8463+#include <linux/hardirq.h>
8464+
8465+#if REISER4_DEBUG
8466+static void report_err(void);
8467+#else
8468+#define report_err() noop
8469+#endif
8470+
8471+/*
8472+ * global buffer where message given to reiser4_panic is formatted.
8473+ */
8474+static char panic_buf[REISER4_PANIC_MSG_BUFFER_SIZE];
8475+
8476+/*
8477+ * lock protecting consistency of panic_buf under concurrent panics
8478+ */
8479+static DEFINE_SPINLOCK(panic_guard);
8480+
8481+/* Your best friend. Call it on each occasion. This is called by
8482+ fs/reiser4/debug.h:reiser4_panic(). */
8483+void reiser4_do_panic(const char *format /* format string */ , ... /* rest */ )
8484+{
8485+ static int in_panic = 0;
8486+ va_list args;
8487+
8488+ /*
8489+ * check for recursive panic.
8490+ */
8491+ if (in_panic == 0) {
8492+ in_panic = 1;
8493+
8494+ spin_lock(&panic_guard);
8495+ va_start(args, format);
8496+ vsnprintf(panic_buf, sizeof(panic_buf), format, args);
8497+ va_end(args);
8498+ printk(KERN_EMERG "reiser4 panicked cowardly: %s", panic_buf);
8499+ spin_unlock(&panic_guard);
8500+
8501+ /*
8502+ * if kernel debugger is configured---drop in. Early dropping
8503+		 * into kgdb is not always convenient, because the panic message
8504+		 * is not yet printed most of the time. But:
8505+ *
8506+ * (1) message can be extracted from printk_buf[]
8507+ * (declared static inside of printk()), and
8508+ *
8509+ * (2) sometimes serial/kgdb combo dies while printing
8510+ * long panic message, so it's more prudent to break into
8511+ * debugger earlier.
8512+ *
8513+ */
8514+ DEBUGON(1);
8515+ }
8516+ /* to make gcc happy about noreturn attribute */
8517+ panic("%s", panic_buf);
8518+}
8519+
8520+void
8521+reiser4_print_prefix(const char *level, int reperr, const char *mid,
8522+ const char *function, const char *file, int lineno)
8523+{
8524+ const char *comm;
8525+ int pid;
8526+
8527+ if (unlikely(in_interrupt() || in_irq())) {
8528+ comm = "interrupt";
8529+ pid = 0;
8530+ } else {
8531+ comm = current->comm;
8532+ pid = current->pid;
8533+ }
8534+ printk("%sreiser4[%.16s(%i)]: %s (%s:%i)[%s]:\n",
8535+ level, comm, pid, function, file, lineno, mid);
8536+ if (reperr)
8537+ report_err();
8538+}
8539+
8540+/* Preemption point: this should be called periodically during long running
8541+ operations (carry, allocate, and squeeze are best examples) */
8542+int preempt_point(void)
8543+{
8544+ assert("nikita-3008", schedulable());
8545+ cond_resched();
8546+ return signal_pending(current);
8547+}
8548+
8549+#if REISER4_DEBUG
8550+/* Debugging aid: return struct where information about locks taken by current
8551+ thread is accumulated. This can be used to formulate lock ordering
8552+ constraints and various assertions.
8553+
8554+*/
8555+lock_counters_info *lock_counters(void)
8556+{
8557+ reiser4_context *ctx = get_current_context();
8558+ assert("jmacd-1123", ctx != NULL);
8559+ return &ctx->locks;
8560+}
8561+
8562+/*
8563+ * print human readable information about locks held by the reiser4 context.
8564+ */
8565+static void print_lock_counters(const char *prefix,
8566+ const lock_counters_info * info)
8567+{
8568+ printk("%s: jnode: %i, tree: %i (r:%i,w:%i), dk: %i (r:%i,w:%i)\n"
8569+ "jload: %i, "
8570+ "txnh: %i, atom: %i, stack: %i, txnmgr: %i, "
8571+ "ktxnmgrd: %i, fq: %i\n"
8572+ "inode: %i, "
8573+ "cbk_cache: %i (r:%i,w%i), "
8574+ "eflush: %i, "
8575+ "zlock: %i,\n"
8576+ "spin: %i, long: %i inode_sem: (r:%i,w:%i)\n"
8577+ "d: %i, x: %i, t: %i\n", prefix,
8578+ info->spin_locked_jnode,
8579+ info->rw_locked_tree, info->read_locked_tree,
8580+ info->write_locked_tree,
8581+ info->rw_locked_dk, info->read_locked_dk, info->write_locked_dk,
8582+ info->spin_locked_jload,
8583+ info->spin_locked_txnh,
8584+ info->spin_locked_atom, info->spin_locked_stack,
8585+ info->spin_locked_txnmgr, info->spin_locked_ktxnmgrd,
8586+ info->spin_locked_fq,
8587+ info->spin_locked_inode,
8588+ info->rw_locked_cbk_cache,
8589+ info->read_locked_cbk_cache,
8590+ info->write_locked_cbk_cache,
8591+ info->spin_locked_super_eflush,
8592+ info->spin_locked_zlock,
8593+ info->spin_locked,
8594+ info->long_term_locked_znode,
8595+ info->inode_sem_r, info->inode_sem_w,
8596+ info->d_refs, info->x_refs, info->t_refs);
8597+}
8598+
8599+/* check that no spinlocks are held */
8600+int schedulable(void)
8601+{
8602+ if (get_current_context_check() != NULL) {
8603+ if (!LOCK_CNT_NIL(spin_locked)) {
8604+ print_lock_counters("in atomic", lock_counters());
8605+ return 0;
8606+ }
8607+ }
8608+ might_sleep();
8609+ return 1;
8610+}
8611+/*
8612+ * return true, iff no locks are held.
8613+ */
8614+int no_counters_are_held(void)
8615+{
8616+ lock_counters_info *counters;
8617+
8618+ counters = lock_counters();
8619+ return
8620+ (counters->spin_locked_zlock == 0) &&
8621+ (counters->spin_locked_jnode == 0) &&
8622+ (counters->rw_locked_tree == 0) &&
8623+ (counters->read_locked_tree == 0) &&
8624+ (counters->write_locked_tree == 0) &&
8625+ (counters->rw_locked_dk == 0) &&
8626+ (counters->read_locked_dk == 0) &&
8627+ (counters->write_locked_dk == 0) &&
8628+ (counters->spin_locked_txnh == 0) &&
8629+ (counters->spin_locked_atom == 0) &&
8630+ (counters->spin_locked_stack == 0) &&
8631+ (counters->spin_locked_txnmgr == 0) &&
8632+ (counters->spin_locked_inode == 0) &&
8633+ (counters->spin_locked == 0) &&
8634+ (counters->long_term_locked_znode == 0) &&
8635+ (counters->inode_sem_r == 0) &&
8636+ (counters->inode_sem_w == 0) && (counters->d_refs == 0);
8637+}
8638+
8639+/*
8640+ * return true, iff transaction commit can be done under locks held by the
8641+ * current thread.
8642+ */
8643+int commit_check_locks(void)
8644+{
8645+ lock_counters_info *counters;
8646+ int inode_sem_r;
8647+ int inode_sem_w;
8648+ int result;
8649+
8650+ /*
8651+ * inode's read/write semaphore is the only reiser4 lock that can be
8652+ * held during commit.
8653+ */
8654+
8655+ counters = lock_counters();
8656+ inode_sem_r = counters->inode_sem_r;
8657+ inode_sem_w = counters->inode_sem_w;
8658+
8659+ counters->inode_sem_r = counters->inode_sem_w = 0;
8660+ result = no_counters_are_held();
8661+ counters->inode_sem_r = inode_sem_r;
8662+ counters->inode_sem_w = inode_sem_w;
8663+ return result;
8664+}
8665+
8666+/*
8667+ * fill "error site" in the current reiser4 context. See comment before RETERR
8668+ * macro for more details.
8669+ */
8670+void return_err(int code, const char *file, int line)
8671+{
8672+ if (code < 0 && is_in_reiser4_context()) {
8673+ reiser4_context *ctx = get_current_context();
8674+
8675+ if (ctx != NULL) {
8676+ ctx->err.code = code;
8677+ ctx->err.file = file;
8678+ ctx->err.line = line;
8679+ }
8680+ }
8681+}
8682+
8683+/*
8684+ * report error information recorded by return_err().
8685+ */
8686+static void report_err(void)
8687+{
8688+ reiser4_context *ctx = get_current_context_check();
8689+
8690+ if (ctx != NULL) {
8691+ if (ctx->err.code != 0) {
8692+ printk("code: %i at %s:%i\n",
8693+ ctx->err.code, ctx->err.file, ctx->err.line);
8694+ }
8695+ }
8696+}
8697+
8698+#endif /* REISER4_DEBUG */
8699+
8700+#if KERNEL_DEBUGGER
8701+
8702+/*
8703+ * this functions just drops into kernel debugger. It is a convenient place to
8704+ * put breakpoint in.
8705+ */
8706+void debugtrap(void)
8707+{
8708+ /* do nothing. Put break point here. */
8709+#if defined(CONFIG_KGDB) && !defined(CONFIG_REISER4_FS_MODULE)
8710+ extern void breakpoint(void);
8711+ breakpoint();
8712+#endif
8713+}
8714+#endif
8715+
8716+/* Make Linus happy.
8717+ Local variables:
8718+ c-indentation-style: "K&R"
8719+ mode-name: "LC"
8720+ c-basic-offset: 8
8721+ tab-width: 8
8722+ fill-column: 120
8723+ End:
8724+*/
8725Index: linux-2.6.16/fs/reiser4/debug.h
8726===================================================================
8727--- /dev/null
8728+++ linux-2.6.16/fs/reiser4/debug.h
8729@@ -0,0 +1,350 @@
8730+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
8731+
8732+/* Declarations of debug macros. */
8733+
8734+#if !defined( __FS_REISER4_DEBUG_H__ )
8735+#define __FS_REISER4_DEBUG_H__
8736+
8737+#include "forward.h"
8738+#include "reiser4.h"
8739+
8740+/* generic function to produce formatted output, decorating it with
8741+ whatever standard prefixes/postfixes we want. "Fun" is a function
8742+ that will be actually called, can be printk, panic etc.
8743+ This is for use by other debugging macros, not by users. */
8744+#define DCALL(lev, fun, reperr, label, format, ...) \
8745+({ \
8746+ fun(lev "reiser4[%.16s(%i)]: %s (%s:%i)[%s]:\n" format "\n" , \
8747+ current->comm, current->pid, __FUNCTION__, \
8748+ __FILE__, __LINE__, label, ## __VA_ARGS__); \
8749+})
8750+
8751+/*
8752+ * cause kernel to crash
8753+ */
8754+#define reiser4_panic(mid, format, ...) \
8755+ DCALL("", reiser4_do_panic, 1, mid, format , ## __VA_ARGS__)
8756+
8757+/* print message with indication of current process, file, line and
8758+ function */
8759+#define reiser4_log(label, format, ...) \
8760+ DCALL(KERN_DEBUG, printk, 0, label, format , ## __VA_ARGS__)
8761+
8762+/* Assertion checked during compilation.
8763+ If "cond" is false (0) we get duplicate case label in switch.
8764+ Use this to check something like famous
8765+ cassert (sizeof(struct reiserfs_journal_commit) == 4096) ;
8766+ in 3.x journal.c. If cassertion fails you get compiler error,
8767+ so no "maintainer-id".
8768+*/
8769+#define cassert(cond) ({ switch(-1) { case (cond): case 0: break; } })
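+
+/* Editor's usage example: check an on-disk type size at compile time (d64 is
+   declared in dformat.h later in this patch); a false condition produces a
+   duplicate case label and the build fails:
+
+	cassert(sizeof(d64) == 8);
+*/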
8770+
8771+#define noop do {;} while(0)
8772+
8773+#if REISER4_DEBUG
8774+/* version of info that only actually prints anything when _d_ebugging
8775+ is on */
8776+#define dinfo(format, ...) printk(format , ## __VA_ARGS__)
8777+/* macro to catch logical errors. Put it into `default' clause of
8778+ switch() statement. */
8779+#define impossible(label, format, ...) \
8780+ reiser4_panic(label, "impossible: " format , ## __VA_ARGS__)
8781+/* assert assures that @cond is true. If it is not, reiser4_panic() is
8782+ called. Use this for checking logical consistency and _never_ call
8783+ this to check correctness of external data: disk blocks and user-input . */
8784+#define assert(label, cond) \
8785+({ \
8786+ /* call_on_each_assert(); */ \
8787+ if (cond) { \
8788+ /* put negated check to avoid using !(cond) that would lose \
8789+ * warnings for things like assert(a = b); */ \
8790+ ; \
8791+ } else { \
8792+ DEBUGON(1); \
8793+ reiser4_panic(label, "assertion failed: %s", #cond); \
8794+ } \
8795+})
8796+
8797+/* like assertion, but @expr is evaluated even if REISER4_DEBUG is off. */
8798+#define check_me( label, expr ) assert( label, ( expr ) )
8799+
8800+#define ON_DEBUG( exp ) exp
8801+
8802+extern int schedulable(void);
8803+extern void call_on_each_assert(void);
8804+
8805+#else
8806+
8807+#define dinfo( format, args... ) noop
8808+#define impossible( label, format, args... ) noop
8809+#define assert( label, cond ) noop
8810+#define check_me( label, expr ) ( ( void ) ( expr ) )
8811+#define ON_DEBUG( exp )
8812+#define schedulable() might_sleep()
8813+
8814+/* REISER4_DEBUG */
8815+#endif
8816+
8817+#if REISER4_DEBUG
8818+/* per-thread information about lock acquired by this thread. Used by lock
8819+ * ordering checking in spin_macros.h */
8820+typedef struct lock_counters_info {
8821+ int rw_locked_tree;
8822+ int read_locked_tree;
8823+ int write_locked_tree;
8824+
8825+ int rw_locked_dk;
8826+ int read_locked_dk;
8827+ int write_locked_dk;
8828+
8829+ int rw_locked_cbk_cache;
8830+ int read_locked_cbk_cache;
8831+ int write_locked_cbk_cache;
8832+
8833+ int spin_locked_zlock;
8834+ int spin_locked_jnode;
8835+ int spin_locked_jload;
8836+ int spin_locked_txnh;
8837+ int spin_locked_atom;
8838+ int spin_locked_stack;
8839+ int spin_locked_txnmgr;
8840+ int spin_locked_ktxnmgrd;
8841+ int spin_locked_fq;
8842+ int spin_locked_inode;
8843+ int spin_locked_super_eflush;
8844+ int spin_locked;
8845+ int long_term_locked_znode;
8846+
8847+ int inode_sem_r;
8848+ int inode_sem_w;
8849+
8850+ int d_refs;
8851+ int x_refs;
8852+ int t_refs;
8853+} lock_counters_info;
8854+
8855+extern lock_counters_info *lock_counters(void);
8856+#define IN_CONTEXT(a, b) (is_in_reiser4_context() ? (a) : (b))
8857+
8858+/* increment lock-counter @counter, if present */
8859+#define LOCK_CNT_INC(counter) IN_CONTEXT(++(lock_counters()->counter), 0)
8860+
8861+/* decrement lock-counter @counter, if present */
8862+#define LOCK_CNT_DEC(counter) IN_CONTEXT(--(lock_counters()->counter), 0)
8863+
8864+/* check that lock-counter is zero. This is for use in assertions */
8865+#define LOCK_CNT_NIL(counter) IN_CONTEXT(lock_counters()->counter == 0, 1)
8866+
8867+/* check that lock-counter is greater than zero. This is for use in
8868+ * assertions */
8869+#define LOCK_CNT_GTZ(counter) IN_CONTEXT(lock_counters()->counter > 0, 1)
8870+#define LOCK_CNT_LT(counter,n) IN_CONTEXT(lock_counters()->counter < n, 1)
8871+
8872+#else /* REISER4_DEBUG */
8873+
8874+/* no-op versions of the above */
8875+
8876+typedef struct lock_counters_info {
8877+} lock_counters_info;
8878+
8879+#define lock_counters() ((lock_counters_info *)NULL)
8880+#define LOCK_CNT_INC(counter) noop
8881+#define LOCK_CNT_DEC(counter) noop
8882+#define LOCK_CNT_NIL(counter) (1)
8883+#define LOCK_CNT_GTZ(counter) (1)
8884+#define LOCK_CNT_LT(counter,n) (1)
8885+
8886+#endif /* REISER4_DEBUG */
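+
+/* Editor's sketch (assumed pattern, wrapper name hypothetical): the LOCK_CNT_*
+   macros are meant to be paired with lock acquisition/release, so that
+   assertions can later query what the current thread holds:
+
+	static inline void spin_lock_jnode_tracked(spinlock_t * guard)
+	{
+		spin_lock(guard);
+		LOCK_CNT_INC(spin_locked_jnode);
+		LOCK_CNT_INC(spin_locked);
+	}
+
+   Assertions then use LOCK_CNT_NIL()/LOCK_CNT_GTZ() on the same counters.
+   With REISER4_DEBUG off everything above compiles away to no-ops. */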
8887+
8888+#define assert_spin_not_locked(lock) BUG_ON(0)
8889+#define assert_rw_write_locked(lock) BUG_ON(0)
8890+#define assert_rw_read_locked(lock) BUG_ON(0)
8891+#define assert_rw_locked(lock) BUG_ON(0)
8892+#define assert_rw_not_write_locked(lock) BUG_ON(0)
8893+#define assert_rw_not_read_locked(lock) BUG_ON(0)
8894+#define assert_rw_not_locked(lock) BUG_ON(0)
8895+
8896+/* flags controlling debugging behavior. Are set through debug_flags=N mount
8897+ option. */
8898+typedef enum {
8899+ /* print a lot of information during panic. When this is on all jnodes
8900+ * are listed. This can be *very* large output. Usually you don't want
8901+ * this. Especially over serial line. */
8902+ REISER4_VERBOSE_PANIC = 0x00000001,
8903+ /* print a lot of information during umount */
8904+ REISER4_VERBOSE_UMOUNT = 0x00000002,
8905+ /* print gathered statistics on umount */
8906+ REISER4_STATS_ON_UMOUNT = 0x00000004,
8907+ /* check node consistency */
8908+ REISER4_CHECK_NODE = 0x00000008
8909+} reiser4_debug_flags;
8910+
8911+extern int is_in_reiser4_context(void);
8912+
8913+/*
8914+ * evaluate expression @e only if with reiser4 context
8915+ */
8916+#define ON_CONTEXT(e) do { \
8917+ if(is_in_reiser4_context()) { \
8918+ e; \
8919+ } } while(0)
8920+
8921+/*
8922+ * evaluate expression @e only when within reiser4_context and debugging is
8923+ * on.
8924+ */
8925+#define ON_DEBUG_CONTEXT( e ) ON_DEBUG( ON_CONTEXT( e ) )
8926+
8927+/*
8928+ * complain about unexpected function result and crash. Used in "default"
8929+ * branches of switch statements and alike to assert that invalid results are
8930+ * not silently ignored.
8931+ */
8932+#define wrong_return_value( label, function ) \
8933+ impossible( label, "wrong return value from " function )
8934+
8935+/* Issue different types of reiser4 messages to the console */
8936+#define warning( label, format, ... ) \
8937+ DCALL( KERN_WARNING, \
8938+ printk, 1, label, "WARNING: " format , ## __VA_ARGS__ )
8939+#define notice( label, format, ... ) \
8940+ DCALL( KERN_NOTICE, \
8941+ printk, 1, label, "NOTICE: " format , ## __VA_ARGS__ )
8942+
8943+/* mark not yet implemented functionality */
8944+#define not_yet( label, format, ... ) \
8945+ reiser4_panic( label, "NOT YET IMPLEMENTED: " format , ## __VA_ARGS__ )
8946+
8947+extern void reiser4_do_panic(const char *format, ...)
8948+ __attribute__ ((noreturn, format(printf, 1, 2)));
8949+
8950+extern void reiser4_print_prefix(const char *level, int reperr, const char *mid,
8951+ const char *function,
8952+ const char *file, int lineno);
8953+
8954+extern int preempt_point(void);
8955+extern void reiser4_print_stats(void);
8956+
8957+
8958+#if REISER4_DEBUG
8959+extern int no_counters_are_held(void);
8960+extern int commit_check_locks(void);
8961+#else
8962+#define no_counters_are_held() (1)
8963+#define commit_check_locks() (1)
8964+#endif
8965+
8966+/* true if @i is power-of-two. Useful for rate-limited warnings, etc. */
8967+#define IS_POW(i) \
8968+({ \
8969+ typeof(i) __i; \
8970+ \
8971+ __i = (i); \
8972+ !(__i & (__i - 1)); \
8973+})
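+
+/* Editor's example (nr_odd_events is a hypothetical counter): rate-limit a
+   warning so it fires only while the running count is 0, 1, 2, 4, 8, ...:
+
+	if (IS_POW(nr_odd_events++))
+		warning("ed-1", "odd event, seen %d times", nr_odd_events);
+*/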
8974+
8975+#define KERNEL_DEBUGGER (1)
8976+
8977+#if KERNEL_DEBUGGER
8978+
8979+extern void debugtrap(void);
8980+
8981+/*
8982+ * Check condition @cond and drop into kernel debugger (kgdb) if it's true. If
8983+ * kgdb is not compiled in, do nothing.
8984+ */
8985+#define DEBUGON(cond) \
8986+({ \
8987+ if (unlikely(cond)) \
8988+ debugtrap(); \
8989+})
8990+#else
8991+#define DEBUGON(cond) noop
8992+#endif
8993+
8994+/*
8995+ * Error code tracing facility. (Idea is borrowed from XFS code.)
8996+ *
8997+ * Suppose some strange and/or unexpected error code is returned from some
8998+ * function (for example, write(2) returns -EEXIST). It is possible to place a
8999+ * breakpoint in reiser4_write(), but by then it is too late. How does one find
9000+ * out in what particular place -EEXIST was generated first?
9001+ *
9002+ * In reiser4 all places where actual error codes are produced (that is,
9003+ * statements of the form
9004+ *
9005+ * return -EFOO; // (1), or
9006+ *
9007+ * result = -EFOO; // (2)
9008+ *
9009+ * are replaced with
9010+ *
9011+ * return RETERR(-EFOO); // (1a), and
9012+ *
9013+ * result = RETERR(-EFOO); // (2a) respectively
9014+ *
9015+ * The RETERR() macro records the error site in the reiser4_context. This
9016+ * information is printed in error and warning messages. Moreover, it's
9017+ * possible to put a conditional breakpoint in return_err (the low-level
9018+ * function called by RETERR() to do the actual work) to break into the
9019+ * debugger immediately when a particular error happens.
9020+ *
9021+ */
9022+
9023+#if REISER4_DEBUG
9024+
9025+/*
9026+ * data-type to store information about where error happened ("error site").
9027+ */
9028+typedef struct err_site {
9029+ int code; /* error code */
9030+ const char *file; /* source file, filled by __FILE__ */
9031+ int line; /* source file line, filled by __LINE__ */
9032+} err_site;
9033+
9034+extern void return_err(int code, const char *file, int line);
9035+
9036+/*
9037+ * fill &get_current_context()->err_site with error information.
9038+ */
9039+#define RETERR(code) \
9040+({ \
9041+ typeof(code) __code; \
9042+ \
9043+ __code = (code); \
9044+ return_err(__code, __FILE__, __LINE__); \
9045+ __code; \
9046+})
9047+
9048+#else
9049+
9050+/*
9051+ * no-op versions of the above
9052+ */
9053+
9054+typedef struct err_site {
9055+} err_site;
9056+#define RETERR(code) code
9057+#endif
9058+
9059+#if REISER4_LARGE_KEY
9060+/*
9061+ * conditionally compile arguments only if REISER4_LARGE_KEY is on.
9062+ */
9063+#define ON_LARGE_KEY(...) __VA_ARGS__
9064+#else
9065+#define ON_LARGE_KEY(...)
9066+#endif
9067+
9068+/* __FS_REISER4_DEBUG_H__ */
9069+#endif
9070+
9071+/* Make Linus happy.
9072+ Local variables:
9073+ c-indentation-style: "K&R"
9074+ mode-name: "LC"
9075+ c-basic-offset: 8
9076+ tab-width: 8
9077+ fill-column: 120
9078+ End:
9079+*/
9080Index: linux-2.6.16/fs/reiser4/dformat.h
9081===================================================================
9082--- /dev/null
9083+++ linux-2.6.16/fs/reiser4/dformat.h
9084@@ -0,0 +1,71 @@
9085+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
9086+
9087+/* Formats of on-disk data and conversion functions. */
9088+
9089+/* put all item formats in the files describing the particular items.
9090+   Our model is: everything you need to do to add an item to reiser4
9091+   (excepting the changes to the plugin that uses the item, which go
9092+   into the file defining that plugin) goes into one file. */
9093+/* Data on disk are stored in little-endian format.
9094+ To declare fields of on-disk structures, use d8, d16, d32 and d64.
9095+ d??tocpu() and cputod??() to convert. */
9096+
9097+#if !defined( __FS_REISER4_DFORMAT_H__ )
9098+#define __FS_REISER4_DFORMAT_H__
9099+
9100+#include <asm/byteorder.h>
9101+#include <asm/unaligned.h>
9102+#include <linux/types.h>
9103+
9104+
9105+typedef __u8 d8;
9106+typedef __le16 d16;
9107+typedef __le32 d32;
9108+typedef __le64 d64;
9109+
9110+#define PACKED __attribute__((packed))
9111+
9112+/* data-type for block number */
9113+typedef __u64 reiser4_block_nr;
9114+
9115+/* data-type for block number on disk, disk format */
9116+typedef __le64 reiser4_dblock_nr;
9117+
9118+/**
9119+ * disk_addr_eq - compare disk addresses
9120+ * @b1: pointer to block number to compare
9121+ * @b2: pointer to block number to compare
9122+ *
9123+ * Returns true if the disk addresses are the same
9124+ */
9125+static inline int disk_addr_eq(const reiser4_block_nr *b1,
9126+ const reiser4_block_nr * b2)
9127+{
9128+ assert("nikita-1033", b1 != NULL);
9129+ assert("nikita-1266", b2 != NULL);
9130+
9131+ return !memcmp(b1, b2, sizeof *b1);
9132+}
9133+
9134+/* structure of master reiser4 super block */
9135+typedef struct reiser4_master_sb {
9136+ char magic[16]; /* "ReIsEr4" */
9137+ __le16 disk_plugin_id; /* id of disk layout plugin */
9138+ __le16 blocksize;
9139+ char uuid[16]; /* unique id */
9140+ char label[16]; /* filesystem label */
9141+ __le64 diskmap; /* location of the diskmap. 0 if not present */
9142+} reiser4_master_sb;
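+
+/* Editor's sketch (not part of the patch): one plausible way to recognize a
+   master super block, using the "ReIsEr4" magic documented above:
+
+	static inline int is_reiser4_master(const reiser4_master_sb * master)
+	{
+		return !memcmp(master->magic, "ReIsEr4", sizeof "ReIsEr4");
+	}
+
+   Note that disk_plugin_id, blocksize and diskmap are little-endian and must
+   be converted with le16_to_cpu()/le64_to_cpu() before use. */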
9143+
9144+/* __FS_REISER4_DFORMAT_H__ */
9145+#endif
9146+
9147+/*
9148+ * Local variables:
9149+ * c-indentation-style: "K&R"
9150+ * mode-name: "LC"
9151+ * c-basic-offset: 8
9152+ * tab-width: 8
9153+ * fill-column: 79
9154+ * End:
9155+ */
9156Index: linux-2.6.16/fs/reiser4/dscale.c
9157===================================================================
9158--- /dev/null
9159+++ linux-2.6.16/fs/reiser4/dscale.c
9160@@ -0,0 +1,174 @@
9161+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
9162+ * reiser4/README */
9163+
9164+/* Scalable on-disk integers */
9165+
9166+/*
9167+ * Various on-disk structures contain integer-like structures. Stat-data
9168+ * contain [yes, "data" is plural, check the dictionary] file size, link
9169+ * count; extent unit contains extent width etc. To accommodate the general
9170+ * case, enough space is reserved to keep the largest possible value: 64 bits
9171+ * in all cases above. But in the overwhelming majority of cases the numbers
9172+ * actually stored in these fields will be comparatively small, and reserving
9173+ * 8 bytes is a waste of precious disk bandwidth.
9174+ *
9175+ * Scalable integers are one way to solve this problem. dscale_write()
9176+ * function stores __u64 value in the given area consuming from 1 to 9 bytes,
9177+ * depending on the magnitude of the value supplied. dscale_read() reads value
9178+ * previously stored by dscale_write().
9179+ *
9180+ * dscale_write() produces a format not completely unlike UTF: the two highest
9181+ * bits of the first byte are used to store "tag". One of 4 possible tag
9182+ * values is chosen depending on the number being encoded:
9183+ *
9184+ * 0 ... 0x3f => 0 [table 1]
9185+ * 0x40 ... 0x3fff => 1
9186+ * 0x4000 ... 0x3fffffff => 2
9187+ * 0x40000000 ... 0xffffffffffffffff => 3
9188+ *
9189+ * (see dscale_range() function)
9190+ *
9191+ * Values in the range 0x40000000 ... 0xffffffffffffffff require 8 full bytes
9192+ * to be stored, so in this case there is no place in the first byte to store
9193+ * tag. For such values tag is stored in an extra 9th byte.
9194+ *
9195+ * As _highest_ bits are used for the test (which is natural) scaled integers
9196+ * are stored in BIG-ENDIAN format in contrast with the rest of reiser4 which
9197+ * uses LITTLE-ENDIAN.
9198+ *
9199+ */
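+
+/* Editor's worked example: encoding the value 0x123. It lies in the
+ * 0x40 ... 0x3fff range, so the tag is 1 and the value is stored big-endian
+ * in 2 bytes:
+ *
+ *	0x0123				-> bytes 0x01 0x23
+ *	first byte |= (tag << 6)	-> bytes 0x41 0x23
+ *
+ * On read, tag = 0x41 >> 6 == 1, the two bytes decode to 0x4123, and
+ * cleartag() clears the two highest bits of the 2nd byte, restoring 0x0123.
+ */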
9200+
9201+#include "debug.h"
9202+#include "dscale.h"
9203+
9204+/* return tag of scaled integer stored at @address */
9205+static int gettag(const unsigned char *address)
9206+{
9207+ /* tag is stored in two highest bits */
9208+ return (*address) >> 6;
9209+}
9210+
9211+/* clear tag from value. Clear tag embedded into @value. */
9212+static void cleartag(__u64 * value, int tag)
9213+{
9214+ /*
9215+ * W-w-what ?!
9216+ *
9217+ * Actually, this is rather simple: @value passed here was read by
9218+ * dscale_read(), converted from BIG-ENDIAN, and padded to __u64 by
9219+ * zeroes. Tag is still stored in the highest (arithmetically)
9220+ * non-zero bits of @value, but relative position of tag within __u64
9221+ * depends on @tag.
9222+ *
9223+	 * For example, if @tag is 0, it's stored in the 2 highest bits of the
9224+	 * lowest byte, and its offset (counting from the lowest bit) is 8 - 2 == 6 bits.
9225+	 *
9226+	 * If tag is 1, it's stored in the two highest bits of the 2nd lowest byte,
9227+	 * and its offset is (2 * 8) - 2 == 14 bits.
9228+ *
9229+ * See table 1 above for details.
9230+ *
9231+ * All these cases are captured by the formula:
9232+ */
9233+ *value &= ~(3 << (((1 << tag) << 3) - 2));
9234+ /*
9235+ * That is, clear two (3 == 0t11) bits at the offset
9236+ *
9237+ * 8 * (2 ^ tag) - 2,
9238+ *
9239+ * that is, two highest bits of (2 ^ tag)-th byte of @value.
9240+ */
9241+}
9242+
9243+/* return tag for @value. See table 1 above for details. */
9244+static int dscale_range(__u64 value)
9245+{
9246+ if (value > 0x3fffffff)
9247+ return 3;
9248+ if (value > 0x3fff)
9249+ return 2;
9250+ if (value > 0x3f)
9251+ return 1;
9252+ return 0;
9253+}
9254+
9255+/* restore value stored at @address by dscale_write() and return number of
9256+ * bytes consumed */
9257+int dscale_read(unsigned char *address, __u64 * value)
9258+{
9259+ int tag;
9260+
9261+ /* read tag */
9262+ tag = gettag(address);
9263+ switch (tag) {
9264+ case 3:
9265+ /* In this case tag is stored in an extra byte, skip this byte
9266+ * and decode value stored in the next 8 bytes.*/
9267+ *value = __be64_to_cpu(get_unaligned((__be64 *)(address + 1)));
9268+ /* worst case: 8 bytes for value itself plus one byte for
9269+ * tag. */
9270+ return 9;
9271+ case 0:
9272+ *value = get_unaligned(address);
9273+ break;
9274+ case 1:
9275+ *value = __be16_to_cpu(get_unaligned((__be16 *)address));
9276+ break;
9277+ case 2:
9278+ *value = __be32_to_cpu(get_unaligned((__be32 *)address));
9279+ break;
9280+ default:
9281+ return RETERR(-EIO);
9282+ }
9283+ /* clear tag embedded into @value */
9284+ cleartag(value, tag);
9285+ /* number of bytes consumed is (2 ^ tag)---see table 1. */
9286+ return 1 << tag;
9287+}
9288+
9289+/* store @value at @address and return number of bytes consumed */
9290+int dscale_write(unsigned char *address, __u64 value)
9291+{
9292+ int tag;
9293+ int shift;
9294+ __be64 v;
9295+ unsigned char *valarr;
9296+
9297+ tag = dscale_range(value);
9298+ v = __cpu_to_be64(value);
9299+ valarr = (unsigned char *)&v;
9300+ shift = (tag == 3) ? 1 : 0;
9301+ memcpy(address + shift, valarr + sizeof v - (1 << tag), 1 << tag);
9302+ *address |= (tag << 6);
9303+ return shift + (1 << tag);
9304+}
9305+
9306+/* number of bytes required to store @value */
9307+int dscale_bytes(__u64 value)
9308+{
9309+ int bytes;
9310+
9311+ bytes = 1 << dscale_range(value);
9312+ if (bytes == 8)
9313+ ++bytes;
9314+ return bytes;
9315+}
9316+
9317+/* returns true if @value and @other require the same number of bytes to be
9318+ * stored. Used to detect when a data structure (like stat-data) has to be
9319+ * expanded or contracted. */
9320+int dscale_fit(__u64 value, __u64 other)
9321+{
9322+ return dscale_range(value) == dscale_range(other);
9323+}
9324+
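+/* Editor's usage sketch (hypothetical caller): a round trip through the
+ * interface above. A 9-byte buffer covers the worst case of an 8-byte value
+ * plus the extra tag byte:
+ *
+ *	unsigned char buf[9];
+ *	__u64 out;
+ *	int len;
+ *
+ *	len = dscale_write(buf, 0x123ULL);	(len == 2 == dscale_bytes(0x123))
+ *	len = dscale_read(buf, &out);		(out == 0x123, len == 2 again)
+ */
+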
9325+/* Make Linus happy.
9326+ Local variables:
9327+ c-indentation-style: "K&R"
9328+ mode-name: "LC"
9329+ c-basic-offset: 8
9330+ tab-width: 8
9331+ fill-column: 120
9332+ scroll-step: 1
9333+ End:
9334+*/
9335Index: linux-2.6.16/fs/reiser4/dscale.h
9336===================================================================
9337--- /dev/null
9338+++ linux-2.6.16/fs/reiser4/dscale.h
9339@@ -0,0 +1,27 @@
9340+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
9341+ * reiser4/README */
9342+
9343+/* Scalable on-disk integers. See dscale.c for details. */
9344+
9345+#if !defined( __FS_REISER4_DSCALE_H__ )
9346+#define __FS_REISER4_DSCALE_H__
9347+
9348+#include "dformat.h"
9349+
9350+extern int dscale_read(unsigned char *address, __u64 * value);
9351+extern int dscale_write(unsigned char *address, __u64 value);
9352+extern int dscale_bytes(__u64 value);
9353+extern int dscale_fit(__u64 value, __u64 other);
9354+
9355+/* __FS_REISER4_DSCALE_H__ */
9356+#endif
9357+
9358+/* Make Linus happy.
9359+ Local variables:
9360+ c-indentation-style: "K&R"
9361+ mode-name: "LC"
9362+ c-basic-offset: 8
9363+ tab-width: 8
9364+ fill-column: 120
9365+ End:
9366+*/
9367Index: linux-2.6.16/fs/reiser4/entd.c
9368===================================================================
9369--- /dev/null
9370+++ linux-2.6.16/fs/reiser4/entd.c
9371@@ -0,0 +1,356 @@
9372+/* Copyright 2003, 2004 by Hans Reiser, licensing governed by
9373+ * reiser4/README */
9374+
9375+/* Ent daemon. */
9376+
9377+#include "debug.h"
9378+#include "txnmgr.h"
9379+#include "tree.h"
9380+#include "entd.h"
9381+#include "super.h"
9382+#include "context.h"
9383+#include "reiser4.h"
9384+#include "vfs_ops.h"
9385+#include "page_cache.h"
9386+#include "inode.h"
9387+
9388+#include <linux/sched.h> /* struct task_struct */
9389+#include <linux/suspend.h>
9390+#include <linux/kernel.h>
9391+#include <linux/writeback.h>
9392+#include <linux/time.h> /* INITIAL_JIFFIES */
9393+#include <linux/backing-dev.h> /* bdi_write_congested */
9394+#include <linux/wait.h>
9395+#include <linux/kthread.h>
9396+
9397+#define LLONG_MAX ((long long)(~0ULL>>1))
9398+
9399+#define DEF_PRIORITY 12
9400+#define MAX_ENTD_ITERS 10
9401+
9402+static void entd_flush(struct super_block *, struct wbq *);
9403+static int entd(void *arg);
9404+
9405+/*
9406+ * set ->comm field of ent thread to make its state visible to the user level
9407+ */
9408+#define entd_set_comm(state) \
9409+ snprintf(current->comm, sizeof(current->comm), \
9410+ "ent:%s%s", super->s_id, (state))
9411+
9412+/**
9413+ * init_entd - initialize entd context and start kernel daemon
9414+ * @super: super block to start ent thread for
9415+ *
9416+ * Creates the entd context, starts the kernel thread and waits until it
9417+ * initializes.
9418+ */
9419+int init_entd(struct super_block *super)
9420+{
9421+ entd_context *ctx;
9422+
9423+ assert("nikita-3104", super != NULL);
9424+
9425+ ctx = get_entd_context(super);
9426+
9427+ memset(ctx, 0, sizeof *ctx);
9428+ spin_lock_init(&ctx->guard);
9429+ init_waitqueue_head(&ctx->wait);
9430+#if REISER4_DEBUG
9431+ INIT_LIST_HEAD(&ctx->flushers_list);
9432+#endif
9433+ /* lists of writepage requests */
9434+ INIT_LIST_HEAD(&ctx->todo_list);
9435+ INIT_LIST_HEAD(&ctx->done_list);
9436+ /* start entd */
9437+ ctx->tsk = kthread_run(entd, super, "ent:%s", super->s_id);
9438+ if (IS_ERR(ctx->tsk))
9439+ return PTR_ERR(ctx->tsk);
9440+ return 0;
9441+}
9442+
9443+static void __put_wbq(entd_context *ent, struct wbq *rq)
9444+{
9445+ up(&rq->sem);
9446+}
9447+
9448+/* ent should be locked */
9449+static struct wbq *__get_wbq(entd_context * ent)
9450+{
9451+ struct wbq *wbq;
9452+
9453+ if (list_empty_careful(&ent->todo_list))
9454+ return NULL;
9455+
9456+ ent->nr_todo_reqs --;
9457+ wbq = list_entry(ent->todo_list.next, struct wbq, link);
9458+ list_del_init(&wbq->link);
9459+ return wbq;
9460+}
9461+
9462+static void wakeup_all_wbq(entd_context * ent)
9463+{
9464+ struct wbq *rq;
9465+
9466+ spin_lock(&ent->guard);
9467+ while ((rq = __get_wbq(ent)) != NULL)
9468+ __put_wbq(ent, rq);
9469+ spin_unlock(&ent->guard);
9470+}
9471+
9472+/* ent thread function */
9473+static int entd(void *arg)
9474+{
9475+ struct super_block *super;
9476+ entd_context *ent;
9477+ int done = 0;
9478+
9479+ super = arg;
9480+ /* do_fork() just copies task_struct into the new
9481+ thread. ->fs_context shouldn't be copied of course. This shouldn't
9482+ be a problem for the rest of the code though.
9483+ */
9484+ current->journal_info = NULL;
9485+
9486+ ent = get_entd_context(super);
9487+
9488+ while (!done) {
9489+ try_to_freeze();
9490+
9491+ spin_lock(&ent->guard);
9492+ while (ent->nr_todo_reqs != 0) {
9493+ struct wbq *rq, *next;
9494+
9495+ assert("", list_empty_careful(&ent->done_list));
9496+
9497+ /* take request from the queue head */
9498+ rq = __get_wbq(ent);
9499+ assert("", rq != NULL);
9500+ ent->cur_request = rq;
9501+ spin_unlock(&ent->guard);
9502+
9503+ entd_set_comm("!");
9504+ entd_flush(super, rq);
9505+
9506+ iput(rq->mapping->host);
9507+ up(&(rq->sem));
9508+
9509+ /*
9510+ * wakeup all requestors and iput their inodes
9511+ */
9512+ spin_lock(&ent->guard);
9513+ list_for_each_entry_safe(rq, next, &ent->done_list, link) {
9514+ list_del_init(&(rq->link));
9515+ ent->nr_done_reqs --;
9516+ spin_unlock(&ent->guard);
9517+
9518+ assert("", rq->written == 1);
9519+ iput(rq->mapping->host);
9520+ up(&(rq->sem));
9521+ spin_lock(&ent->guard);
9522+ }
9523+ }
9524+ spin_unlock(&ent->guard);
9525+
9526+ entd_set_comm(".");
9527+
9528+ {
9529+ DEFINE_WAIT(__wait);
9530+
9531+ do {
9532+ prepare_to_wait(&ent->wait, &__wait, TASK_INTERRUPTIBLE);
9533+ if (kthread_should_stop()) {
9534+ done = 1;
9535+ break;
9536+ }
9537+ if (ent->nr_todo_reqs != 0)
9538+ break;
9539+ schedule();
9540+ } while (0);
9541+ finish_wait(&ent->wait, &__wait);
9542+ }
9543+ }
9544+ spin_lock(&ent->guard);
9545+ BUG_ON(ent->nr_todo_reqs != 0);
9546+ spin_unlock(&ent->guard);
9547+ wakeup_all_wbq(ent);
9548+ return 0;
9549+}
9550+
9551+/**
9552+ * done_entd - stop entd kernel thread
9553+ * @super: super block to stop ent thread for
9554+ *
9555+ * It is called on umount. Sends a stop signal to entd and waits until it handles
9556+ * it.
9557+ */
9558+void done_entd(struct super_block *super)
9559+{
9560+ entd_context *ent;
9561+
9562+ assert("nikita-3103", super != NULL);
9563+
9564+ ent = get_entd_context(super);
9565+ assert("zam-1055", ent->tsk != NULL);
9566+ kthread_stop(ent->tsk);
9567+}
9568+
9569+/* called at the beginning of jnode_flush to register flusher thread with ent
9570+ * daemon */
9571+void enter_flush(struct super_block *super)
9572+{
9573+ entd_context *ent;
9574+
9575+ assert("zam-1029", super != NULL);
9576+ ent = get_entd_context(super);
9577+
9578+ assert("zam-1030", ent != NULL);
9579+
9580+ spin_lock(&ent->guard);
9581+ ent->flushers++;
9582+#if REISER4_DEBUG
9583+ list_add(&get_current_context()->flushers_link, &ent->flushers_list);
9584+#endif
9585+ spin_unlock(&ent->guard);
9586+}
9587+
9588+/* called at the end of jnode_flush */
9589+void leave_flush(struct super_block *super)
9590+{
9591+ entd_context *ent;
9592+ int wake_up_ent;
9593+
9594+ assert("zam-1027", super != NULL);
9595+ ent = get_entd_context(super);
9596+
9597+ assert("zam-1028", ent != NULL);
9598+
9599+ spin_lock(&ent->guard);
9600+ ent->flushers--;
9601+ wake_up_ent = (ent->flushers == 0 && ent->nr_todo_reqs != 0);
9602+#if REISER4_DEBUG
9603+ list_del_init(&get_current_context()->flushers_link);
9604+#endif
9605+ spin_unlock(&ent->guard);
9606+ if (wake_up_ent)
9607+ wake_up(&ent->wait);
9608+}
9609+
9610+#define ENTD_CAPTURE_APAGE_BURST SWAP_CLUSTER_MAX
9611+
9612+static void entd_flush(struct super_block *super, struct wbq *rq)
9613+{
9614+ reiser4_context ctx;
9615+ int tmp;
9616+
9617+ init_stack_context(&ctx, super);
9618+ ctx.entd = 1;
9619+ ctx.gfp_mask = GFP_NOFS;
9620+
9621+ rq->wbc->start = rq->page->index << PAGE_CACHE_SHIFT;
9622+ rq->wbc->end = (rq->page->index + ENTD_CAPTURE_APAGE_BURST) << PAGE_CACHE_SHIFT;
9623+ tmp = rq->wbc->nr_to_write;
9624+ rq->mapping->a_ops->writepages(rq->mapping, rq->wbc);
9625+
9626+ if (rq->wbc->nr_to_write > 0) {
9627+ rq->wbc->start = 0;
9628+ rq->wbc->end = LLONG_MAX;
9629+ generic_sync_sb_inodes(super, rq->wbc);
9630+ }
9631+ rq->wbc->nr_to_write = ENTD_CAPTURE_APAGE_BURST;
9632+ writeout(super, rq->wbc);
9633+
9634+ context_set_commit_async(&ctx);
9635+ reiser4_exit_context(&ctx);
9636+}
9637+
9638+/**
9639+ * write_page_by_ent - ask entd thread to flush this page as part of slum
9640+ * @page: page to be written
9641+ * @wbc: writeback control passed to reiser4_writepage
9642+ *
9643+ * Creates a request, puts it on entd list of requests, wakeups entd if
9644+ * necessary, waits until entd completes with the request.
9645+ */
9646+int write_page_by_ent(struct page *page, struct writeback_control *wbc)
9647+{
9648+ struct super_block *sb;
9649+ struct inode *inode;
9650+ entd_context *ent;
9651+ struct wbq rq;
9652+
9653+ assert("", PageLocked(page));
9654+ assert("", page->mapping != NULL);
9655+
9656+ sb = page->mapping->host->i_sb;
9657+ ent = get_entd_context(sb);
9658+ assert("", ent && ent->done == 0);
9659+
9660+ /*
9661+ * we are going to unlock page and ask ent thread to write the
9662+ * page. Re-dirty page before unlocking so that if ent thread fails to
9663+ * write it - it will remain dirty
9664+ */
9665+ set_page_dirty_internal(page);
9666+
9667+ /*
9668+	 * pin inode in memory, unlock page, entd_flush will iput. We cannot
9669+	 * iput here because we cannot allow delete_inode to be called here
9670+ */
9671+ inode = igrab(page->mapping->host);
9672+ unlock_page(page);
9673+ if (inode == NULL)
9674+ /* inode is getting freed */
9675+ return 0;
9676+
9677+ /* init wbq */
9678+ INIT_LIST_HEAD(&rq.link);
9679+ rq.magic = WBQ_MAGIC;
9680+ rq.wbc = wbc;
9681+ rq.page = page;
9682+ rq.mapping = inode->i_mapping;
9683+ rq.node = NULL;
9684+ rq.written = 0;
9685+ sema_init(&rq.sem, 0);
9686+
9687+ /* add request to entd's list of writepage requests */
9688+ spin_lock(&ent->guard);
9689+ ent->nr_todo_reqs++;
9690+ list_add_tail(&rq.link, &ent->todo_list);
9691+ if (ent->nr_todo_reqs == 1)
9692+ wake_up(&ent->wait);
9693+
9694+ spin_unlock(&ent->guard);
9695+
9696+ /* wait until entd finishes */
9697+ down(&rq.sem);
9698+
9699+ /*
9700+	 * spin until the entd thread that did up(&rq.sem) no longer needs
9701+	 * rq
9702+ */
9703+ spin_lock(&ent->guard);
9704+ spin_unlock(&ent->guard);
9705+
9706+ if (rq.written)
9707+ /* Eventually ENTD has written the page to disk. */
9708+ return 0;
9709+ return 0;
9710+}
9711+
9712+int wbq_available(void)
9713+{
9714+ struct super_block *sb = reiser4_get_current_sb();
9715+ entd_context *ent = get_entd_context(sb);
9716+ return ent->nr_todo_reqs;
9717+}
9718+
9719+/*
9720+ * Local variables:
9721+ * c-indentation-style: "K&R"
9722+ * mode-name: "LC"
9723+ * c-basic-offset: 8
9724+ * tab-width: 8
9725+ * fill-column: 79
9726+ * End:
9727+ */
9728Index: linux-2.6.16/fs/reiser4/entd.h
9729===================================================================
9730--- /dev/null
9731+++ linux-2.6.16/fs/reiser4/entd.h
9732@@ -0,0 +1,90 @@
9733+/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */
9734+
9735+/* Ent daemon. */
9736+
9737+#ifndef __ENTD_H__
9738+#define __ENTD_H__
9739+
9740+#include "context.h"
9741+
9742+#include <linux/fs.h>
9743+#include <linux/completion.h>
9744+#include <linux/wait.h>
9745+#include <linux/spinlock.h>
9746+#include <linux/sched.h> /* for struct task_struct */
9747+
9748+#define WBQ_MAGIC 0x7876dc76
9749+
9750+/* write-back request. */
9751+struct wbq {
9752+ int magic;
9753+ struct list_head link; /* list head of this list is in entd context */
9754+ struct writeback_control *wbc;
9755+ struct page *page;
9756+ struct address_space *mapping;
9757+ struct semaphore sem;
9758+ jnode *node; /* set if ent thread captured requested page */
9759+ int written; /* set if ent thread wrote requested page */
9760+};
9761+
9762+/* ent-thread context. This is used to synchronize starting/stopping ent
9763+ * threads. */
9764+typedef struct entd_context {
9765+ /* wait queue that ent thread waits on for more work. It's
9766+ * signaled by write_page_by_ent(). */
9767+ wait_queue_head_t wait;
9768+ /* spinlock protecting other fields */
9769+ spinlock_t guard;
9770+ /* ent thread */
9771+ struct task_struct *tsk;
9772+ /* set to indicate that ent thread should leave. */
9773+ int done;
9774+ /* counter of active flushers */
9775+ int flushers;
9776+ /*
9777+ * when reiser4_writepage asks entd to write a page - it adds struct
9778+ * wbq to this list
9779+ */
9780+ struct list_head todo_list;
9781+ /* number of elements on the above list */
9782+ int nr_todo_reqs;
9783+
9784+ struct wbq *cur_request;
9785+ /*
9786+ * when entd writes a page it moves write-back request from todo_list
9787+ * to done_list. This list is used at the end of entd iteration to
9788+ * wakeup requestors and iput inodes.
9789+ */
9790+ struct list_head done_list;
9791+ /* number of elements on the above list */
9792+ int nr_done_reqs;
9793+
9794+#if REISER4_DEBUG
9795+ /* list of all active flushers */
9796+ struct list_head flushers_list;
9797+#endif
9798+} entd_context;
9799+
9800+extern int init_entd(struct super_block *);
9801+extern void done_entd(struct super_block *);
9802+
9803+extern void enter_flush(struct super_block *);
9804+extern void leave_flush(struct super_block *);
9805+
9806+extern int write_page_by_ent(struct page *, struct writeback_control *);
9807+extern int wbq_available(void);
9808+extern void ent_writes_page(struct super_block *, struct page *);
9809+
9810+extern jnode *get_jnode_by_wbq(struct super_block *, struct wbq *);
9811+/* __ENTD_H__ */
9812+#endif
9813+
9814+/* Make Linus happy.
9815+ Local variables:
9816+ c-indentation-style: "K&R"
9817+ mode-name: "LC"
9818+ c-basic-offset: 8
9819+ tab-width: 8
9820+ fill-column: 120
9821+ End:
9822+*/
9823Index: linux-2.6.16/fs/reiser4/eottl.c
9824===================================================================
9825--- /dev/null
9826+++ linux-2.6.16/fs/reiser4/eottl.c
9827@@ -0,0 +1,510 @@
9828+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
9829+
9830+#include "forward.h"
9831+#include "debug.h"
9832+#include "key.h"
9833+#include "coord.h"
9834+#include "plugin/item/item.h"
9835+#include "plugin/node/node.h"
9836+#include "znode.h"
9837+#include "block_alloc.h"
9838+#include "tree_walk.h"
9839+#include "tree_mod.h"
9840+#include "carry.h"
9841+#include "tree.h"
9842+#include "super.h"
9843+
9844+#include <linux/types.h> /* for __u?? */
9845+
9846+/*
9847+ * Extents on the twig level (EOTTL) handling.
9848+ *
9849+ * EOTTL poses some problems for tree traversal that are better explained
9850+ * by example.
9851+ *
9852+ * Suppose we have block B1 on the twig level with the following items:
9853+ *
9854+ * 0. internal item I0 with key (0:0:0:0) (locality, key-type, object-id,
9855+ * offset)
9856+ * 1. extent item E1 with key (1:4:100:0), having 10 blocks of 4k each
9857+ * 2. internal item I2 with key (10:0:0:0)
9858+ *
9859+ * We are trying to insert item with key (5:0:0:0). Lookup finds node B1, and
9860+ * then intra-node lookup is done. This lookup finished on the E1, because the
9861+ * key we are looking for is larger than the key of E1 and is smaller than key
9862+ * the of I2.
9863+ *
9864+ * Here search is stuck.
9865+ *
9866+ * After some thought it is clear what is wrong here: extents on the twig level
9867+ * break a basic property of the *search* tree (on the pretext that they
9868+ * restore the property of a balanced tree).
9869+ *
9870+ * Said property is the following: if in the internal node of the search tree
9871+ * we have [ ... Key1 Pointer Key2 ... ] then, all data that are or will be
9872+ * keyed in the tree with the Key such that Key1 <= Key < Key2 are accessible
9873+ * through the Pointer.
9874+ *
9875+ * This is not true, when Pointer is Extent-Pointer, simply because extent
9876+ * cannot expand indefinitely to the right to include any item with
9877+ *
9878+ * Key1 <= Key <= Key2.
9879+ *
9880+ * For example, our E1 extent is only responsible for the data with keys
9881+ *
9882+ * (1:4:100:0) <= key <= (1:4:100:0xffffffffffffffff), and
9883+ *
9884+ * so, key range
9885+ *
9886+ * ( (1:4:100:0xffffffffffffffff), (10:0:0:0) )
9887+ *
9888+ * is orphaned: there is no way to get there from the tree root.
9889+ *
9890+ * In other words, extent pointers are different than normal child pointers as
9891+ * far as search tree is concerned, and this creates such problems.
9892+ *
9893+ * A possible solution for this problem is to insert our item into the node
9894+ * pointed to by I2. There are some problems with this, though:
9895+ *
9896+ * (1) I2 can be in a different node.
9897+ * (2) E1 can be immediately followed by another extent E2.
9898+ *
9899+ * (1) is solved by calling reiser4_get_right_neighbor() and accounting
9900+ * for locks/coords as necessary.
9901+ *
9902+ * (2) is more complex. The solution here is to insert a new empty leaf node and
9903+ * insert an internal item between E1 and E2 pointing to said leaf node. This is
9904+ * further complicated by the possibility that E2 is in a different node, etc.
9905+ *
9906+ * Problems:
9907+ *
9908+ * (1) if there is an internal item I2 immediately on the right of an extent E1
9909+ * and we decide to insert a new item S1 into the node N2 pointed to by I2, then
9910+ * the key of S1 will be less than the smallest key in N2. Normally, the search
9911+ * checks that the key we are looking for is in the range of keys covered by
9912+ * the node it is being looked up in. To work around this situation, while
9913+ * preserving this useful consistency check, a new flag CBK_TRUST_DK was added
9914+ * to the cbk flags bitmask. This flag is automatically set on entrance to
9915+ * coord_by_key() and is only cleared when we are about to enter the situation
9916+ * described above.
9917+ *
9918+ * (2) If extent E1 is immediately followed by another extent E2 and we are
9919+ * searching for a key that is between E1 and E2, we only have to insert a new
9920+ * empty leaf node when coord_by_key was called for insertion, rather than just
9921+ * for lookup. To distinguish these cases, a new flag CBK_FOR_INSERT was added to
9922+ * the cbk flags bitmask. This flag is automatically set by coord_by_key calls
9923+ * performed by insert_by_key() and friends.
9924+ *
9925+ * (3) Insertion of new empty leaf node (possibly) requires balancing. In any
9926+ * case it requires modification of node content which is only possible under
9927+ * a write lock. It may well happen that we only have a read lock on the node where
9928+ * the new internal pointer is to be inserted (common case: lookup of a non-existent
9929+ * stat-data that falls between two extents). If only a read lock is held, tree
9930+ * traversal is restarted with lock_level modified so that next time we hit
9931+ * this problem, write lock will be held. Once we have write lock, balancing
9932+ * will be performed.
9933+ */
9934+
9935+/**
9936+ * is_next_item_internal - check whether next item is internal
9937+ * @coord: coordinate of extent item in twig node
9938+ * @key: search key
9939+ * @lh: twig node lock handle
9940+ *
9941+ * Looks at the unit next to @coord. If it is an internal one - 1 is returned,
9942+ * @coord is set to that unit. If that unit is in right neighbor, @lh is moved
9943+ * to that node, @coord is set to its first unit. If next item is not internal
9944+ * or does not exist then 0 is returned, @coord and @lh are left unchanged. 2
9945+ * is returned if search restart has to be done.
9946+ */
9947+static int
9948+is_next_item_internal(coord_t *coord, const reiser4_key *key,
9949+ lock_handle *lh)
9950+{
9951+ coord_t next;
9952+ lock_handle rn;
9953+ int result;
9954+
9955+ coord_dup(&next, coord);
9956+ if (coord_next_unit(&next) == 0) {
9957+ /* next unit is in this node */
9958+ if (item_is_internal(&next)) {
9959+ coord_dup(coord, &next);
9960+ return 1;
9961+ }
9962+ assert("vs-3", item_is_extent(&next));
9963+ return 0;
9964+ }
9965+
9966+ /*
9967+ * next unit either does not exist or is in right neighbor. If it is in
9968+ * right neighbor we have to check right delimiting key because
9969+	 * a concurrent thread could get there first and insert an item with a key
9970+ * smaller than @key
9971+ */
9972+ read_lock_dk(current_tree);
9973+ result = keycmp(key, znode_get_rd_key(coord->node));
9974+ read_unlock_dk(current_tree);
9975+ assert("vs-6", result != EQUAL_TO);
9976+ if (result == GREATER_THAN)
9977+ return 2;
9978+
9979+ /* lock right neighbor */
9980+ init_lh(&rn);
9981+ result = reiser4_get_right_neighbor(&rn, coord->node,
9982+ znode_is_wlocked(coord->node) ?
9983+ ZNODE_WRITE_LOCK : ZNODE_READ_LOCK,
9984+ GN_CAN_USE_UPPER_LEVELS);
9985+ if (result == -E_NO_NEIGHBOR) {
9986+ /* we are on the rightmost edge of the tree */
9987+ done_lh(&rn);
9988+ return 0;
9989+ }
9990+
9991+ if (result) {
9992+ assert("vs-4", result < 0);
9993+ done_lh(&rn);
9994+ return result;
9995+ }
9996+
9997+ /*
9998+ * check whether concurrent thread managed to insert item with a key
9999+ * smaller than @key
10000+ */
10001+ read_lock_dk(current_tree);
10002+ result = keycmp(key, znode_get_ld_key(rn.node));
10003+ read_unlock_dk(current_tree);
10004+ assert("vs-6", result != EQUAL_TO);
10005+ if (result == GREATER_THAN) {
10006+ done_lh(&rn);
10007+ return 2;
10008+ }
10009+
10010+ result = zload(rn.node);
10011+ if (result) {
10012+ assert("vs-5", result < 0);
10013+ done_lh(&rn);
10014+ return result;
10015+ }
10016+
10017+ coord_init_first_unit(&next, rn.node);
10018+ if (item_is_internal(&next)) {
10019+ /*
10020+		 * next unit is in the right neighbor and it is a unit of an internal
10021+ * item. Unlock coord->node. Move @lh to right neighbor. @coord
10022+ * is set to the first unit of right neighbor.
10023+ */
10024+ coord_dup(coord, &next);
10025+ zrelse(rn.node);
10026+ done_lh(lh);
10027+ move_lh(lh, &rn);
10028+ return 1;
10029+ }
10030+
10031+ /*
10032+	 * next unit is a unit of an extent item. Return without changing @lh and
10033+ * @coord.
10034+ */
10035+ assert("vs-6", item_is_extent(&next));
10036+ zrelse(rn.node);
10037+ done_lh(&rn);
10038+ return 0;
10039+}
10040+
10041+/**
10042+ * rd_key - calculate key of an item next to the given one
10043+ * @coord: position in a node
10044+ * @key: storage for result key
10045+ *
10046+ * @coord is set between items or after the last item in a node. Calculate key
10047+ * of item to the right of @coord.
10048+ */
10049+static reiser4_key *rd_key(const coord_t *coord, reiser4_key *key)
10050+{
10051+ coord_t dup;
10052+
10053+ assert("nikita-2281", coord_is_between_items(coord));
10054+ coord_dup(&dup, coord);
10055+
10056+ if (coord_set_to_right(&dup) == 0)
10057+ /* next item is in this node. Return its key. */
10058+ unit_key_by_coord(&dup, key);
10059+ else {
10060+ /*
10061+ * next item either does not exist or is in right
10062+ * neighbor. Return znode's right delimiting key.
10063+ */
10064+ read_lock_dk(current_tree);
10065+ *key = *znode_get_rd_key(coord->node);
10066+ read_unlock_dk(current_tree);
10067+ }
10068+ return key;
10069+}
10070+
10071+/**
10072+ * add_empty_leaf - insert empty leaf between two extents
10073+ * @insert_coord: position in twig node between two extents
10074+ * @lh: twig node lock handle
10075+ * @key: left delimiting key of new node
10076+ * @rdkey: right delimiting key of new node
10077+ *
10078+ * Inserts empty leaf node between two extent items. It is necessary when we
10079+ * have to insert an item on leaf level between two extents (items on the twig
10080+ * level).
10081+ */
10082+static int
10083+add_empty_leaf(coord_t *insert_coord, lock_handle *lh,
10084+ const reiser4_key *key, const reiser4_key *rdkey)
10085+{
10086+ int result;
10087+ carry_pool *pool;
10088+ carry_level *todo;
10089+ reiser4_item_data *item;
10090+ carry_insert_data *cdata;
10091+ carry_op *op;
10092+ znode *node;
10093+ reiser4_tree *tree;
10094+
10095+ assert("vs-49827", znode_contains_key_lock(insert_coord->node, key));
10096+ tree = znode_get_tree(insert_coord->node);
10097+ node = new_node(insert_coord->node, LEAF_LEVEL);
10098+ if (IS_ERR(node))
10099+ return PTR_ERR(node);
10100+
10101+ /* setup delimiting keys for node being inserted */
10102+ write_lock_dk(tree);
10103+ znode_set_ld_key(node, key);
10104+ znode_set_rd_key(node, rdkey);
10105+ ON_DEBUG(node->creator = current);
10106+ ON_DEBUG(node->first_key = *key);
10107+ write_unlock_dk(tree);
10108+
10109+ ZF_SET(node, JNODE_ORPHAN);
10110+
10111+ /*
10112+ * allocate carry_pool, 3 carry_level-s, reiser4_item_data and
10113+ * carry_insert_data
10114+ */
10115+ pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*todo) +
10116+ sizeof(*item) + sizeof(*cdata));
10117+ if (IS_ERR(pool))
10118+ return PTR_ERR(pool);
10119+ todo = (carry_level *) (pool + 1);
10120+ init_carry_level(todo, pool);
10121+
10122+ item = (reiser4_item_data *) (todo + 3);
10123+ cdata = (carry_insert_data *) (item + 1);
10124+
10125+ op = post_carry(todo, COP_INSERT, insert_coord->node, 0);
10126+ if (!IS_ERR(op)) {
10127+ cdata->coord = insert_coord;
10128+ cdata->key = key;
10129+ cdata->data = item;
10130+ op->u.insert.d = cdata;
10131+ op->u.insert.type = COPT_ITEM_DATA;
10132+ build_child_ptr_data(node, item);
10133+ item->arg = NULL;
10134+ /* have @insert_coord be set at the inserted item after
10135+ insertion is done */
10136+ todo->track_type = CARRY_TRACK_CHANGE;
10137+ todo->tracked = lh;
10138+
10139+ result = carry(todo, NULL);
10140+ if (result == 0) {
10141+ /*
10142+ * pin node in memory. This is necessary for
10143+ * znode_make_dirty() below.
10144+ */
10145+ result = zload(node);
10146+ if (result == 0) {
10147+ lock_handle local_lh;
10148+
10149+ /*
10150+ * if we inserted new child into tree we have
10151+ * to mark it dirty so that flush will be able
10152+ * to process it.
10153+ */
10154+ init_lh(&local_lh);
10155+ result = longterm_lock_znode(&local_lh, node,
10156+ ZNODE_WRITE_LOCK,
10157+ ZNODE_LOCK_LOPRI);
10158+ if (result == 0) {
10159+ znode_make_dirty(node);
10160+
10161+ /*
10162+ * when internal item pointing to @node
10163+ * was inserted into twig node
10164+ * create_hook_internal did not connect
10165+ * it properly because its right
10166+ * neighbor was not known. Do it
10167+ * here
10168+ */
10169+ write_lock_tree(tree);
10170+ assert("nikita-3312",
10171+ znode_is_right_connected(node));
10172+ assert("nikita-2984",
10173+ node->right == NULL);
10174+ ZF_CLR(node, JNODE_RIGHT_CONNECTED);
10175+ write_unlock_tree(tree);
10176+ result =
10177+ connect_znode(insert_coord, node);
10178+ if (result == 0)
10179+ ON_DEBUG(check_dkeys(node));
10180+
10181+ done_lh(lh);
10182+ move_lh(lh, &local_lh);
10183+ assert("vs-1676", node_is_empty(node));
10184+ coord_init_first_unit(insert_coord,
10185+ node);
10186+ } else {
10187+ warning("nikita-3136",
10188+ "Cannot lock child");
10189+ }
10190+ done_lh(&local_lh);
10191+ zrelse(node);
10192+ }
10193+ }
10194+ } else
10195+ result = PTR_ERR(op);
10196+ zput(node);
10197+ done_carry_pool(pool);
10198+ return result;
10199+}
10200+
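+/*
+ * In short, add_empty_leaf() above proceeds as follows:
+ *  1) allocate a new leaf znode and set its delimiting keys;
+ *  2) post a COP_INSERT carry operation to insert an internal item pointing
+ *     to the new node into the twig node at @insert_coord;
+ *  3) write-lock the new child, mark it dirty and connect it to its right
+ *     neighbor (create_hook_internal could not do this earlier because the
+ *     neighbor was not known);
+ *  4) move @lh to the new node and set @insert_coord to its first unit.
+ */
+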
10201+/**
10202+ * handle_eottl - handle extent-on-the-twig-level cases in tree traversal
10203+ * @h: search handle
10204+ * @outcome: flag saying whether search has to restart or is done
10205+ *
10206+ * Handles search on the twig level. If this function completes the search
10207+ * itself, it returns 1. If the search has to go one level down, 0 is
10208+ * returned. If an error happens, LOOKUP_DONE is returned via @outcome and the
10209+ * error code is saved in @h->result.
10210+ */
10211+int handle_eottl(cbk_handle *h, int *outcome)
10212+{
10213+ int result;
10214+ reiser4_key key;
10215+ coord_t *coord;
10216+
10217+ coord = h->coord;
10218+
10219+ if (h->level != TWIG_LEVEL ||
10220+ (coord_is_existing_item(coord) && item_is_internal(coord))) {
10221+ /* Continue to traverse tree downward. */
10222+ return 0;
10223+ }
10224+
10225+ /*
10226+ * make sure that @h->coord is set to twig node and that it is either
10227+ * set to extent item or after extent item
10228+ */
10229+ assert("vs-356", h->level == TWIG_LEVEL);
10230+ assert("vs-357", ( {
10231+ coord_t lcoord;
10232+ coord_dup(&lcoord, coord);
10233+ check_me("vs-733", coord_set_to_left(&lcoord) == 0);
10234+ item_is_extent(&lcoord);
10235+ }
10236+ ));
10237+
10238+ if (*outcome == NS_FOUND) {
10239+ /* we have found desired key on twig level in extent item */
10240+ h->result = CBK_COORD_FOUND;
10241+ *outcome = LOOKUP_DONE;
10242+ return 1;
10243+ }
10244+
10245+ if (!(h->flags & CBK_FOR_INSERT)) {
10246+ /* tree traversal is not for insertion. Just return
10247+ CBK_COORD_NOTFOUND. */
10248+ h->result = CBK_COORD_NOTFOUND;
10249+ *outcome = LOOKUP_DONE;
10250+ return 1;
10251+ }
10252+
10253+ /* take a look at the item to the right of h -> coord */
10254+ result = is_next_item_internal(coord, h->key, h->active_lh);
10255+ if (unlikely(result < 0)) {
10256+ h->error = "get_right_neighbor failed";
10257+ h->result = result;
10258+ *outcome = LOOKUP_DONE;
10259+ return 1;
10260+ }
10261+ if (result == 0) {
10262+ /*
10263+ * item to the right is also an extent one. Allocate a new node
10264+ * and insert pointer to it after item h -> coord.
10265+ *
10266+ * This is a result of extents being located at the twig
10267+ * level. For explanation, see comment just above
10268+ * is_next_item_internal().
10269+ */
10270+ znode *loaded;
10271+
10272+ if (cbk_lock_mode(h->level, h) != ZNODE_WRITE_LOCK) {
10273+ /*
10274+ * we got node read locked, restart coord_by_key to
10275+ * have write lock on twig level
10276+ */
10277+ h->lock_level = TWIG_LEVEL;
10278+ h->lock_mode = ZNODE_WRITE_LOCK;
10279+ *outcome = LOOKUP_REST;
10280+ return 1;
10281+ }
10282+
10283+ loaded = coord->node;
10284+ result =
10285+ add_empty_leaf(coord, h->active_lh, h->key,
10286+ rd_key(coord, &key));
10287+ if (result) {
10288+ h->error = "could not add empty leaf";
10289+ h->result = result;
10290+ *outcome = LOOKUP_DONE;
10291+ return 1;
10292+ }
10293+ /* added empty leaf is locked (h->active_lh), its parent node
10294+ is unlocked, h->coord is set as EMPTY */
10295+ assert("vs-13", coord->between == EMPTY_NODE);
10296+ assert("vs-14", znode_is_write_locked(coord->node));
10297+ assert("vs-15",
10298+ WITH_DATA(coord->node, node_is_empty(coord->node)));
10299+ assert("vs-16", jnode_is_leaf(ZJNODE(coord->node)));
10300+ assert("vs-17", coord->node == h->active_lh->node);
10301+ *outcome = LOOKUP_DONE;
10302+ h->result = CBK_COORD_NOTFOUND;
10303+ return 1;
10304+ } else if (result == 1) {
10305+ /*
10306+ * this is special case mentioned in the comment on
10307+ * tree.h:cbk_flags. We have found internal item immediately on
10308+ * the right of extent, and we are going to insert new item
10309+ * there. Key of item we are going to insert is smaller than
10310+ * leftmost key in the node pointed to by said internal item
10311+ * (otherwise search wouldn't come to the extent in the first
10312+ * place).
10313+ *
10314+ * This is a result of extents being located at the twig
10315+ * level. For explanation, see comment just above
10316+ * is_next_item_internal().
10317+ */
10318+ h->flags &= ~CBK_TRUST_DK;
10319+ } else {
10320+ assert("vs-8", result == 2);
10321+ *outcome = LOOKUP_REST;
10322+ return 1;
10323+ }
10324+ assert("vs-362", WITH_DATA(coord->node, item_is_internal(coord)));
10325+ return 0;
10326+}
10327+
10328+/*
10329+ * Local variables:
10330+ * c-indentation-style: "K&R"
10331+ * mode-name: "LC"
10332+ * c-basic-offset: 8
10333+ * tab-width: 8
10334+ * fill-column: 120
10335+ * scroll-step: 1
10336+ * End:
10337+ */
10338Index: linux-2.6.16/fs/reiser4/estimate.c
10339===================================================================
10340--- /dev/null
10341+++ linux-2.6.16/fs/reiser4/estimate.c
10342@@ -0,0 +1,111 @@
10343+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
10344+
10345+#include "debug.h"
10346+#include "dformat.h"
10347+#include "tree.h"
10348+#include "carry.h"
10349+#include "inode.h"
10350+#include "plugin/cluster.h"
10351+#include "plugin/item/ctail.h"
10352+
10353+/* this returns how many nodes might get dirty and added if @children nodes are dirtied
10354+
10355+ We estimate the number of internal nodes which will get dirty or get allocated as 10% of the children (103/1024, the
10356+ ten_percent below) + 1 balancing. 1 balancing is 2 neighbours, 2 new blocks and the current block on the leaf level;
10357+ 2 neighbour nodes + the current (or 1 neighbour, 1 new and the current) on the twig level; 2 neighbour nodes on upper
10358+ levels and 1 for a new root. So 5 for the leaf level, 3 for the twig level, 2 on upper levels + 1 for the root.
10359+
10360+ Do not count the current node of the lowest level here - this is overhead only.
10361+
10362+ children is almost always 1 here. The exception is flow insertion.
10363+*/
10364+static reiser4_block_nr
10365+max_balance_overhead(reiser4_block_nr children, tree_level tree_height)
10366+{
10367+ reiser4_block_nr ten_percent;
10368+
10369+ ten_percent = ((103 * children) >> 10);
10370+
10371+ /* If too many balancings happen at the same time, the tree height can grow
10372+ by more than 1. Assume that if tree_height is at least 5, it can grow by 1 only. */
10373+ return ((tree_height < 5 ? 5 : tree_height) * 2 + (4 + ten_percent));
10374+}
10375+
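+/* Worked example of the estimate above: for children == 1 and
+   tree_height == 4, ten_percent == (103 * 1) >> 10 == 0, the height is
+   rounded up to 5, so max_balance_overhead() == 5 * 2 + (4 + 0) == 14 and
+   calc_estimate_one_insert() below returns 1 + 14 == 15 blocks. */
+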
10376+/* this returns the maximal possible number of nodes which can be modified plus the number of new nodes which may be
10377+ required to perform insertion of one item into the tree */
10378+/* it is only called when tree height changes, or gets initialized */
10379+reiser4_block_nr calc_estimate_one_insert(tree_level height)
10380+{
10381+ return 1 + max_balance_overhead(1, height);
10382+}
10383+
10384+reiser4_block_nr estimate_one_insert_item(reiser4_tree * tree)
10385+{
10386+ return tree->estimate_one_insert;
10387+}
10388+
10389+/* this returns the maximal possible number of nodes which can be modified plus the number of new nodes which may be
10390+ required to perform insertion of one unit into an item in the tree */
10391+reiser4_block_nr estimate_one_insert_into_item(reiser4_tree * tree)
10392+{
10393+ /* estimate insert into item just like item insertion */
10394+ return tree->estimate_one_insert;
10395+}
10396+
10397+reiser4_block_nr estimate_one_item_removal(reiser4_tree * tree)
10398+{
10399+ /* on item removal reiser4 does not try to pack nodes more compactly, so only one node may be dirtied on the leaf
10400+ level */
10401+ return tree->estimate_one_insert;
10402+}
10403+
10404+/* on leaf level insert_flow may add CARRY_FLOW_NEW_NODES_LIMIT new nodes and dirty 3 existing nodes (insert point and
10405+ both its neighbors). max_balance_overhead() should estimate the number of blocks which may change or get added on
10406+ the internal levels */
10407+reiser4_block_nr estimate_insert_flow(tree_level height)
10408+{
10409+ return 3 + CARRY_FLOW_NEW_NODES_LIMIT + max_balance_overhead(3 +
10410+ CARRY_FLOW_NEW_NODES_LIMIT,
10411+ height);
10412+}
10413+
10414+/* returns the max number of nodes that can be occupied by a disk cluster */
10415+static reiser4_block_nr estimate_cluster(struct inode * inode, int unprepped)
10416+{
10417+ int per_cluster;
10418+ per_cluster = (unprepped ? 1 : cluster_nrpages(inode));
10419+ return 3 + per_cluster +
10420+ max_balance_overhead(3 + per_cluster,
10421+ REISER4_MAX_ZTREE_HEIGHT);
10422+}
10423+
10424+/* how many nodes might get dirty and added
10425+ during insertion of a disk cluster */
10426+reiser4_block_nr estimate_insert_cluster(struct inode * inode)
10427+{
10428+ return estimate_cluster(inode, 1); /* 24 */
10429+}
10430+
10431+/* how many nodes might get dirty and added
10432+ during update of a (prepped or unprepped) disk cluster */
10433+reiser4_block_nr estimate_update_cluster(struct inode * inode)
10434+{
10435+ return estimate_cluster(inode, 0); /* 44, for 64K-cluster */
10436+}
10437+
10438+/* how many nodes occupied by a disk cluster might get dirty */
10439+reiser4_block_nr estimate_dirty_cluster(struct inode * inode)
10440+{
10441+ return 2 + cluster_nrpages(inode);
10442+}
10443+
10444+/* Make Linus happy.
10445+ Local variables:
10446+ c-indentation-style: "K&R"
10447+ mode-name: "LC"
10448+ c-basic-offset: 8
10449+ tab-width: 8
10450+ fill-column: 120
10451+ scroll-step: 1
10452+ End:
10453+*/
10454Index: linux-2.6.16/fs/reiser4/export_ops.c
10455===================================================================
10456--- /dev/null
10457+++ linux-2.6.16/fs/reiser4/export_ops.c
10458@@ -0,0 +1,296 @@
10459+/* Copyright 2005 by Hans Reiser, licensing governed by
10460+ * reiser4/README */
10461+
10462+#include "inode.h"
10463+#include "plugin/plugin.h"
10464+
10465+
10466+/*
10467+ * Supported file-handle types
10468+ */
10469+typedef enum {
10470+ FH_WITH_PARENT = 0x10, /* file handle with parent */
10471+ FH_WITHOUT_PARENT = 0x11 /* file handle without parent */
10472+} reiser4_fhtype;
10473+
10474+#define NFSERROR (255)
10475+
10476+/* initialize place-holder for object */
10477+static void object_on_wire_init(reiser4_object_on_wire *o)
10478+{
10479+ o->plugin = NULL;
10480+}
10481+
10482+/* finish with @o */
10483+static void object_on_wire_done(reiser4_object_on_wire *o)
10484+{
10485+ if (o->plugin != NULL)
10486+ o->plugin->wire.done(o);
10487+}
10488+
10489+/*
10490+ * read serialized object identity from @addr and store information about
10491+ * object in @obj. This is dual to encode_inode().
10492+ */
10493+static char *decode_inode(struct super_block *s, char *addr,
10494+ reiser4_object_on_wire * obj)
10495+{
10496+ file_plugin *fplug;
10497+
10498+ /* identifier of object plugin is stored in the first two bytes,
10499+ * followed by... */
10500+ fplug = file_plugin_by_disk_id(get_tree(s), (d16 *) addr);
10501+ if (fplug != NULL) {
10502+ addr += sizeof(d16);
10503+ obj->plugin = fplug;
10504+ assert("nikita-3520", fplug->wire.read != NULL);
10505+ /* plugin specific encoding of object identity. */
10506+ addr = fplug->wire.read(addr, obj);
10507+ } else
10508+ addr = ERR_PTR(RETERR(-EINVAL));
10509+ return addr;
10510+}
10511+
10512+/**
10513+ * reiser4_decode_fh - decode_fh of export operations
10514+ * @super: super block
10515+ * @fh: nfsd file handle
10516+ * @len: length of file handle
10517+ * @fhtype: type of file handle
10518+ * @acceptable: acceptability testing function
10519+ * @context: argument for @acceptable
10520+ *
10521+ * Returns dentry referring to the same file as @fh.
10522+ */
10523+static struct dentry *reiser4_decode_fh(struct super_block *super, __u32 *fh,
10524+ int len, int fhtype,
10525+ int (*acceptable) (void *context,
10526+ struct dentry *de),
10527+ void *context)
10528+{
10529+ reiser4_context *ctx;
10530+ reiser4_object_on_wire object;
10531+ reiser4_object_on_wire parent;
10532+ char *addr;
10533+ int with_parent;
10534+
10535+ ctx = init_context(super);
10536+ if (IS_ERR(ctx))
10537+ return (struct dentry *)ctx;
10538+
10539+ assert("vs-1482",
10540+ fhtype == FH_WITH_PARENT || fhtype == FH_WITHOUT_PARENT);
10541+
10542+ with_parent = (fhtype == FH_WITH_PARENT);
10543+
10544+ addr = (char *)fh;
10545+
10546+ object_on_wire_init(&object);
10547+ object_on_wire_init(&parent);
10548+
10549+ addr = decode_inode(super, addr, &object);
10550+ if (!IS_ERR(addr)) {
10551+ if (with_parent)
10552+ addr = decode_inode(super, addr, &parent);
10553+ if (!IS_ERR(addr)) {
10554+ struct dentry *d;
10555+ typeof(super->s_export_op->find_exported_dentry) fn;
10556+
10557+ fn = super->s_export_op->find_exported_dentry;
10558+ assert("nikita-3521", fn != NULL);
10559+ d = fn(super, &object, with_parent ? &parent : NULL,
10560+ acceptable, context);
10561+ if (d != NULL && !IS_ERR(d))
10562+ /* FIXME check for -ENOMEM */
10563+ reiser4_get_dentry_fsdata(d)->stateless = 1;
10564+ addr = (char *)d;
10565+ }
10566+ }
10567+
10568+ object_on_wire_done(&object);
10569+ object_on_wire_done(&parent);
10570+
10571+ reiser4_exit_context(ctx);
10572+ return (void *)addr;
10573+}
10574+
10575+/*
10576+ * Object serialization support.
10577+ *
10578+ * To support knfsd, the file system provides export_operations that are used
10579+ * to construct and interpret NFS file handles. As a generalization of this,
10580+ * reiser4 object plugins have serialization support: a plugin provides
10581+ * methods to create an on-wire representation of the identity of a reiser4
10582+ * object, and to re-create/locate an object given its on-wire identity.
10583+ *
10584+ */
10585+
10586+/*
10587+ * return number of bytes that on-wire representation of @inode's identity
10588+ * consumes.
10589+ */
10590+static int encode_inode_size(struct inode *inode)
10591+{
10592+ assert("nikita-3514", inode != NULL);
10593+ assert("nikita-3515", inode_file_plugin(inode) != NULL);
10594+ assert("nikita-3516", inode_file_plugin(inode)->wire.size != NULL);
10595+
10596+ return inode_file_plugin(inode)->wire.size(inode) + sizeof(d16);
10597+}
10598+
10599+/*
10600+ * store on-wire representation of @inode's identity at the area beginning at
10601+ * @start.
10602+ */
10603+static char *encode_inode(struct inode *inode, char *start)
10604+{
10605+ assert("nikita-3517", inode != NULL);
10606+ assert("nikita-3518", inode_file_plugin(inode) != NULL);
10607+ assert("nikita-3519", inode_file_plugin(inode)->wire.write != NULL);
10608+
10609+ /*
10610+ * first, store two-byte identifier of object plugin, then
10611+ */
10612+ save_plugin_id(file_plugin_to_plugin(inode_file_plugin(inode)),
10613+ (d16 *) start);
10614+ start += sizeof(d16);
10615+ /*
10616+ * call plugin to serialize object's identity
10617+ */
10618+ return inode_file_plugin(inode)->wire.write(inode, start);
10619+}
10620+
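+/*
+ * On-wire layout produced by encode_inode() and consumed by decode_inode()
+ * above:
+ *
+ *   +-----------------+---------------------------------------------+
+ *   | plugin id (d16) | plugin-specific identity (wire.write/read)  |
+ *   +-----------------+---------------------------------------------+
+ */
+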
10621+/* the number of 32-bit words occupied by the file handle is returned via
10622+ * @lenp; 255 (NFSERROR) is returned if the file handle cannot be stored */
10623+/**
10624+ * reiser4_encode_fh - encode_fh of export operations
10625+ * @dentry: object to serialize
10626+ * @fh: buffer to store the file handle in
10627+ * @lenp: pointer to the buffer length in 32-bit words
10628+ * @need_parent: whether to encode the identity of @dentry's parent as well
10629+ *
10630+ */
10631+static int
10632+reiser4_encode_fh(struct dentry *dentry, __u32 *fh, int *lenp,
10633+ int need_parent)
10634+{
10635+ struct inode *inode;
10636+ struct inode *parent;
10637+ char *addr;
10638+ int need;
10639+ int delta;
10640+ int result;
10641+ reiser4_context *ctx;
10642+
10643+ /*
10644+ * knfsd asks us to serialize the object in @dentry and, optionally, its
10645+ * parent (if need_parent != 0).
10646+ *
10647+ * encode_inode() and encode_inode_size() are used to build
10648+ * representation of object and its parent. All hard work is done by
10649+ * object plugins.
10650+ */
10651+ inode = dentry->d_inode;
10652+ parent = dentry->d_parent->d_inode;
10653+
10654+ addr = (char *)fh;
10655+
10656+ need = encode_inode_size(inode);
10657+ if (need < 0)
10658+ return NFSERROR;
10659+ if (need_parent) {
10660+ delta = encode_inode_size(parent);
10661+ if (delta < 0)
10662+ return NFSERROR;
10663+ need += delta;
10664+ }
10665+
10666+ ctx = init_context(dentry->d_inode->i_sb);
10667+ if (IS_ERR(ctx))
10668+ return PTR_ERR(ctx);
10669+
10670+ if (need <= sizeof(__u32) * (*lenp)) {
10671+ addr = encode_inode(inode, addr);
10672+ if (need_parent)
10673+ addr = encode_inode(parent, addr);
10674+
10675+ /* store in lenp number of 32bit words required for file
10676+ * handle. */
10677+ *lenp = (need + sizeof(__u32) - 1) >> 2;
10678+ result = need_parent ? FH_WITH_PARENT : FH_WITHOUT_PARENT;
10679+ } else
10680+ /* not enough space in the file handle */
10681+ result = NFSERROR;
10682+ reiser4_exit_context(ctx);
10683+ return result;
10684+}
10685+
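+/* Example of the length arithmetic above: if the object identity takes
+   need == 10 bytes (including the two-byte plugin id), the buffer passes the
+   check when *lenp >= 3 on entry (4 * 3 >= 10), and *lenp is set to
+   (10 + 4 - 1) >> 2 == 3 32-bit words on exit. */
+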
10686+/**
10687+ * reiser4_get_dentry_parent - get_parent of export operations
10688+ * @child: dentry of the directory whose parent is to be found
10689+ *
10690+ */
10691+static struct dentry *reiser4_get_dentry_parent(struct dentry *child)
10692+{
10693+ struct inode *dir;
10694+ dir_plugin *dplug;
10695+
10696+ assert("nikita-3527", child != NULL);
10697+ /* see comment in reiser4_get_dentry() about following assertion */
10698+ assert("nikita-3528", is_in_reiser4_context());
10699+
10700+ dir = child->d_inode;
10701+ assert("nikita-3529", dir != NULL);
10702+ dplug = inode_dir_plugin(dir);
10703+ assert("nikita-3531", ergo(dplug != NULL, dplug->get_parent != NULL));
10704+ if (dplug != NULL)
10705+ return dplug->get_parent(dir);
10706+ else
10707+ return ERR_PTR(RETERR(-ENOTDIR));
10708+}
10709+
10710+/**
10711+ * reiser4_get_dentry - get_dentry of export operations
10712+ * @super: super block
10713+ * @data: pointer to an on-wire object identity (reiser4_object_on_wire)
10714+ *
10715+ *
10716+ */
10717+static struct dentry *reiser4_get_dentry(struct super_block *super, void *data)
10718+{
10719+ reiser4_object_on_wire *o;
10720+
10721+ assert("nikita-3522", super != NULL);
10722+ assert("nikita-3523", data != NULL);
10723+ /*
10724+ * this is only supposed to be called by
10725+ *
10726+ * reiser4_decode_fh->find_exported_dentry
10727+ *
10728+ * so, reiser4_context should be here already.
10729+ */
10730+ assert("nikita-3526", is_in_reiser4_context());
10731+
10732+ o = (reiser4_object_on_wire *)data;
10733+ assert("nikita-3524", o->plugin != NULL);
10734+ assert("nikita-3525", o->plugin->wire.get != NULL);
10735+
10736+ return o->plugin->wire.get(super, o);
10737+}
10738+
10739+struct export_operations reiser4_export_operations = {
10740+ .encode_fh = reiser4_encode_fh,
10741+ .decode_fh = reiser4_decode_fh,
10742+ .get_parent = reiser4_get_dentry_parent,
10743+ .get_dentry = reiser4_get_dentry
10744+};
10745+
10746+/*
10747+ * Local variables:
10748+ * c-indentation-style: "K&R"
10749+ * mode-name: "LC"
10750+ * c-basic-offset: 8
10751+ * tab-width: 8
10752+ * fill-column: 79
10753+ * End:
10754+ */
10755Index: linux-2.6.16/fs/reiser4/flush.c
10756===================================================================
10757--- /dev/null
10758+++ linux-2.6.16/fs/reiser4/flush.c
10759@@ -0,0 +1,3626 @@
10760+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
10761+
10762+/* The design document for this file is at http://www.namesys.com/v4/v4.html. */
10763+
10764+#include "forward.h"
10765+#include "debug.h"
10766+#include "dformat.h"
10767+#include "key.h"
10768+#include "coord.h"
10769+#include "plugin/item/item.h"
10770+#include "plugin/plugin.h"
10771+#include "plugin/object.h"
10772+#include "txnmgr.h"
10773+#include "jnode.h"
10774+#include "znode.h"
10775+#include "block_alloc.h"
10776+#include "tree_walk.h"
10777+#include "carry.h"
10778+#include "tree.h"
10779+#include "vfs_ops.h"
10780+#include "inode.h"
10781+#include "page_cache.h"
10782+#include "wander.h"
10783+#include "super.h"
10784+#include "entd.h"
10785+#include "reiser4.h"
10786+#include "flush.h"
10787+#include "writeout.h"
10788+
10789+#include <asm/atomic.h>
10790+#include <linux/fs.h> /* for struct super_block */
10791+#include <linux/mm.h> /* for struct page */
10792+#include <linux/bio.h> /* for struct bio */
10793+#include <linux/pagemap.h>
10794+#include <linux/blkdev.h>
10795+
10796+/* IMPLEMENTATION NOTES */
10797+
10798+/* PARENT-FIRST: Some terminology: A parent-first traversal is a way of assigning a total
10799+ order to the nodes of the tree in which the parent is placed before its children, which
10800+ are ordered (recursively) in left-to-right order. When we speak of a "parent-first preceder", it
10801+ describes the node that "came before in forward parent-first order". When we speak of a
10802+ "parent-first follower", it describes the node that "comes next in parent-first
10803+ order" (alternatively the node that "came before in reverse parent-first order").
10804+
10805+ The following pseudo-code prints the nodes of a tree in forward parent-first order:
10806+
10807+ void parent_first (node)
10808+ {
10809+ print_node (node);
10810+ if (node->level > leaf) {
10811+ for (i = 0; i < num_children; i += 1) {
10812+ parent_first (node->child[i]);
10813+ }
10814+ }
10815+ }
10816+*/
10817+
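+/* For example, applying the pseudo-code above to the following tree (R is
+   the root, A and B internal nodes, a1..b2 leaves):
+
+               R
+             /   \
+            A     B
+           / \   / \
+          a1 a2 b1 b2
+
+   prints the nodes in forward parent-first order: R, A, a1, a2, B, b1, b2.
+*/
+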
10818+/* JUST WHAT ARE WE TRYING TO OPTIMIZE, HERE? The idea is to optimize block allocation so
10819+ that a left-to-right scan of the tree's data (i.e., the leaves in left-to-right order)
10820+ can be accomplished with sequential reads, which results in reading nodes in their
10821+ parent-first order. This is a read-optimization aspect of the flush algorithm, and
10822+ there is also a write-optimization aspect, which is that we wish to make large
10823+ sequential writes to the disk by allocating or reallocating blocks so that they can be
10824+ written in sequence. Sometimes the read-optimization and write-optimization goals
10825+ conflict with each other, as we discuss in more detail below.
10826+*/
10827+
10828+/* STATE BITS: The flush code revolves around the state of the jnodes it covers. Here are
10829+ the relevant jnode->state bits and their relevance to flush:
10830+
10831+ JNODE_DIRTY: If a node is dirty, it must be flushed. But in order to be written it
10832+ must be allocated first. In order to be considered allocated, the jnode must have
10833+ exactly one of { JNODE_OVRWR, JNODE_RELOC } set. These two bits are exclusive, and
10834+ all dirtied jnodes eventually have one of these bits set during each transaction.
10835+
10836+ JNODE_CREATED: The node was freshly created in its transaction and has no previous
10837+ block address, so it is unconditionally assigned to be relocated, although this is
10838+ mainly for code-convenience. It is not being 'relocated' from anything, but in
10839+ almost every regard it is treated as part of the relocate set. The JNODE_CREATED bit
10840+ remains set even after JNODE_RELOC is set, so the actual relocate set can be
10841+ distinguished from the created-and-allocated set easily: relocate-set members
10842+ (belonging to the preserve-set) have (JNODE_RELOC) set and created-set members which
10843+ have no previous location to preserve have (JNODE_RELOC | JNODE_CREATED) set.
10844+
10845+ JNODE_OVRWR: The node belongs to atom's overwrite set. The flush algorithm made the
10846+ decision to maintain the pre-existing location for this node and it will be written
10847+ to the wandered-log.
10848+
10849+ JNODE_RELOC: The flush algorithm made the decision to relocate this block (if it was
10850+ not created, see note above). A block with JNODE_RELOC set is eligible for
10851+ early-flushing and may be submitted during flush_empty_queues. When the JNODE_RELOC
10852+ bit is set on a znode, the parent node's internal item is modified and the znode is
10853+ rehashed.
10854+
10855+ JNODE_SQUEEZABLE: Before shifting everything left, the flush algorithm scans the node
10856+ and calls the plugin->f.squeeze() method for its items. By this technique we update disk
10857+ clusters of cryptcompress objects. Also if the leftmost point that was found by the flush scan
10858+ has this flag (races with write(), rare case) the flush algorithm makes the decision
10859+ to pass it to squalloc() in spite of its flushprepped status for squeezing, not for
10860+ repeated allocation.
10861+
10862+ JNODE_FLUSH_QUEUED: This bit is set when a call to flush enters the jnode into its
10863+ flush queue. This means the jnode is not on any clean or dirty list; instead it is
10864+ moved to one of the flush queue object's private lists (see flush_queue.h). This
10865+ prevents multiple concurrent flushes from attempting to start flushing from the
10866+ same node.
10867+
10868+ (DEAD STATE BIT) JNODE_FLUSH_BUSY: This bit was set during the bottom-up
10869+ squeeze-and-allocate on a node while its children are actively being squeezed and
10870+ allocated. This flag was created to avoid submitting a write request for a node
10871+ while its children are still being allocated and squeezed. Then flush queue was
10872+ re-implemented to allow unlimited number of nodes be queued. This flag support was
10873+ commented out in source code because we decided that there was no reason to submit
10874+ queued nodes before jnode_flush() finishes. However, current code calls fq_write()
10875+ during a slum traversal and may submit "busy nodes" to disk. Probably we can
10876+ re-enable the JNODE_FLUSH_BUSY bit support in future.
10877+
10878+ With these state bits, we describe a test used frequently in the code below,
10879+ jnode_is_flushprepped() (and the spin-lock-taking jnode_check_flushprepped()). The
10880+ test for "flushprepped" returns true if any of the following are true:
10881+
10882+ - The node is not dirty
10883+ - The node has JNODE_RELOC set
10884+ - The node has JNODE_OVRWR set
10885+
10886+ If either the node is not dirty or it has already been processed by flush (and assigned
10887+ JNODE_OVRWR or JNODE_RELOC), then it is prepped. If jnode_is_flushprepped() returns
10888+ false then flush has work to do on that node.
10889+*/
10890+
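+/* A minimal sketch of the "flushprepped" test described above (the actual
+   helper is declared in jnode.h; JF_ISSET tests a jnode state bit):
+
+   jnode_is_flushprepped(node) ==
+       !JF_ISSET(node, JNODE_DIRTY) ||
+        JF_ISSET(node, JNODE_RELOC) ||
+        JF_ISSET(node, JNODE_OVRWR)
+*/
+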
10891+/* FLUSH_PREP_ONCE_PER_TRANSACTION: Within a single transaction a node is never
10892+ flushprepped twice (unless an explicit call to flush_unprep is made as described in
10893+ detail below). For example a node is dirtied, allocated, and then early-flushed to
10894+ disk and set clean. Before the transaction commits, the page is dirtied again and, due
10895+ to memory pressure, the node is flushed again. The flush algorithm will not relocate
10896+ the node to a new disk location, it will simply write it to the same, previously
10897+ relocated position again.
10898+*/
10899+
10900+/* THE BOTTOM-UP VS. TOP-DOWN ISSUE: This code implements a bottom-up algorithm where we
10901+ start at a leaf node and allocate in parent-first order by iterating to the right. At
10902+ each step of the iteration, we check for the right neighbor. Before advancing to the
10903+ right neighbor, we check if the current position and the right neighbor share the same
10904+ parent. If they do not share the same parent, the parent is allocated before the right
10905+ neighbor.
10906+
10907+ This process goes recursively up the tree and squeezes nodes level by level as long as
10908+ the right neighbor and the current position have different parents, then it allocates
10909+ the right-neighbors-with-different-parents on the way back down. This process is
10910+ described in more detail in flush_squalloc_changed_ancestor and the recursive function
10911+ squalloc_one_changed_ancestor. But the purpose here is not to discuss the
10912+ specifics of the bottom-up approach as it is to contrast the bottom-up and top-down
10913+ approaches.
10914+
10915+ The top-down algorithm was implemented earlier (April-May 2002). In the top-down
10916+ approach, we find a starting point by scanning left along each level past dirty nodes,
10917+ then going up and repeating the process until the left node and the parent node are
10918+ clean. We then perform a parent-first traversal from the starting point, which makes
10919+ allocating in parent-first order trivial. After one subtree has been allocated in this
10920+ manner, we move to the right, try moving upward, then repeat the parent-first
10921+ traversal.
10922+
10923+ Both approaches have problems that need to be addressed. Both are approximately the
10924+ same amount of code, but the bottom-up approach has advantages in the order it acquires
10925+ locks which, at the very least, make it the better approach. At first glance each one
10926+ makes the other one look simpler, so it is important to remember a few of the problems
10927+ with each one.
10928+
10929+ Main problem with the top-down approach: When you encounter a clean child during the
10930+ parent-first traversal, what do you do? You would like to avoid searching through a
10931+ large tree of nodes just to find a few dirty leaves at the bottom, and there is not an
10932+ obvious solution. One of the advantages of the top-down approach is that during the
10933+ parent-first traversal you check every child of a parent to see if it is dirty. In
10934+ this way, the top-down approach easily handles the main problem of the bottom-up
10935+ approach: unallocated children.
10936+
10937+ The unallocated children problem is that before writing a node to disk we must make
10938+ sure that all of its children are allocated. Otherwise, writing the node means
10939+ extra I/O because the node will have to be written again when the child is finally
10940+ allocated.
10941+
10942+ WE HAVE NOT YET ELIMINATED THE UNALLOCATED CHILDREN PROBLEM. Except for bugs, this
10943+ should not cause any file system corruption; it only degrades I/O performance because a
10944+ node may be written when it is sure to be written at least one more time in the same
10945+ transaction when the remaining children are allocated. What follows is a description
10946+ of how we will solve the problem.
10947+*/
10948+
10949+/* HANDLING UNALLOCATED CHILDREN: During flush we may allocate a parent node then,
10950+ proceeding in parent first order, allocate some of its left-children, then encounter a
10951+ clean child in the middle of the parent. We do not allocate the clean child, but there
10952+ may remain unallocated (dirty) children to the right of the clean child. If we were to
10953+ stop flushing at this moment and write everything to disk, the parent might still
10954+ contain unallocated children.
10955+
10956+ We could try to allocate all the descendants of every node that we allocate, but this
10957+ is not necessary. Doing so could result in allocating the entire tree: if the root
10958+ node is allocated then every unallocated node would have to be allocated before
10959+ flushing. Actually, we do not have to write a node just because we allocate it. It is
10960+ possible to allocate but not write a node during flush, when it still has unallocated
10961+ children. However, this approach is probably not optimal for the following reason.
10962+
10963+ The flush algorithm is designed to allocate nodes in parent-first order in an attempt
10964+ to optimize reads that occur in the same order. Thus we are read-optimizing for a
10965+ left-to-right scan through all the leaves in the system, and we are hoping to
10966+ write-optimize at the same time because those nodes will be written together in batch.
10967+ What happens, however, if we assign a block number to a node in its read-optimized
10968+ order but then avoid writing it because it has unallocated children? In that
10969+ situation, we lose out on the write-optimization aspect because a node will have to be
10970+ written again to its location on the device, later, which likely means seeking back
10971+ to that location.
10972+
10973+ So there are tradeoffs. We can choose either:
10974+
10975+ A. Allocate all unallocated children to preserve both write-optimization and
10976+ read-optimization, but this is not always desirable because it may mean having to
10977+ allocate and flush very many nodes at once.
10978+
10979+ B. Defer writing nodes with unallocated children, keep their read-optimized locations,
10980+ but sacrifice write-optimization because those nodes will be written again.
10981+
10982+ C. Defer writing nodes with unallocated children, but do not keep their read-optimized
10983+ locations. Instead, choose to write-optimize them later, when they are written. To
10984+ facilitate this, we "undo" the read-optimized allocation that was given to the node so
10985+ that later it can be write-optimized, thus "unpreparing" the flush decision. This is a
10986+ case where we disturb the FLUSH_PREP_ONCE_PER_TRANSACTION rule described above. By a
10987+ call to flush_unprep() we will: if the node was wandered, unset the JNODE_OVRWR bit;
10988+ if the node was relocated, unset the JNODE_RELOC bit, non-deferred-deallocate its block
10989+ location, and set the JNODE_CREATED bit, effectively setting the node back to an
10990+ unallocated state.
10991+
10992+ We will take the following approach in v4.0: for twig nodes we will always finish
10993+ allocating unallocated children (A). For nodes with (level > TWIG) we will defer
10994+ writing and choose write-optimization (C).
10995+
10996+ To summarize, there are several parts to a solution that avoids the problem with
10997+ unallocated children:
10998+
10999+ FIXME-ZAM: No approach has been implemented yet to eliminate the "UNALLOCATED CHILDREN"
11000+ problem, because an experiment showed that we have 1-2 nodes with unallocated children
11001+ per thousands of written nodes. The experiment was simple, like copying / deleting the
11002+ Linux kernel sources. However, the problem can arise in more complex tests. I think we
11003+ can use jnode_io_hook to insert a check for unallocated children and see what kind of
11004+ problem we have.
11005+
11006+ 1. When flush reaches a stopping point (e.g., a clean node), it should continue calling
11007+ squeeze-and-allocate on any remaining unallocated children. FIXME: Difficulty to
11008+ implement: should be simple -- amounts to adding a while loop to jnode_flush, see
11009+ comments in that function.
11010+
11011+ 2. When flush reaches flush_empty_queue(), some of the (level > TWIG) nodes may still
11012+ have unallocated children. If the twig level has unallocated children it is an
11013+ assertion failure. If a higher-level node has unallocated children, then it should be
11014+ explicitly de-allocated by a call to flush_unprep(). FIXME: Difficulty to implement:
11015+ should be simple.
11016+
11017+ 3. (CPU-Optimization) Checking whether a node has unallocated children may consume more
11018+ CPU cycles than we would like, and it is possible (but medium complexity) to optimize
11019+ this somewhat in the case where large sub-trees are flushed. The following observation
11020+ helps: if both the left- and right-neighbor of a node are processed by the flush
11021+ algorithm then the node itself is guaranteed to have all of its children allocated.
11022+ However, the cost of this check may not be so expensive after all: it is not needed for
11023+ leaves and flush can guarantee this property for twigs. That leaves only (level >
11024+ TWIG) nodes that have to be checked, so this optimization only helps if at least three
11025+ (level > TWIG) nodes are flushed in one pass, and the savings will be very small unless
11026+ there are many more (level > TWIG) nodes. But if there are many (level > TWIG) nodes
11027+ then the number of blocks being written will be very large, so the savings may be
11028+ insignificant. That said, the idea is to maintain both the left and right edges of
11029+ nodes that are processed in flush. When flush_empty_queue() is called, a relatively
11030+ simple test will tell whether the (level > TWIG) node is on the edge. If it is on the
11031+ edge, the slow check is necessary, but if it is in the interior then it can be assumed
11032+ to have all of its children allocated. FIXME: medium complexity to implement, but
11033+ simple to verify given that we must have a slow check anyway.
11034+
11035+ 4. (Optional) This part is optional, not for v4.0--flush should work independently of
11036+ whether this option is used or not. Called RAPID_SCAN, the idea is to amend the
11037+ left-scan operation to take unallocated children into account. Normally, the left-scan
11038+ operation goes left as long as adjacent nodes are dirty up until some large maximum
11039+ value (FLUSH_SCAN_MAXNODES) at which point it stops and begins flushing. But scan-left
11040+ may stop at a position where there are unallocated children to the left with the same
11041+ parent. When RAPID_SCAN is enabled, the ordinary scan-left operation stops after
11042+ FLUSH_RELOCATE_THRESHOLD, which is much smaller than FLUSH_SCAN_MAXNODES, then proceeds
11043+ with a rapid scan. The rapid scan skips all the interior children of a node--if the
11044+ leftmost child of a twig is dirty, check its left neighbor (the rightmost child of the
11045+ twig to the left). If the left neighbor of the leftmost child is also dirty, then
11046+ continue the scan at the left twig and repeat. This option will cause flush to
11047+ allocate more twigs in a single pass, but it also has the potential to write many more
11048+ nodes than would otherwise be written without the RAPID_SCAN option. RAPID_SCAN
11049+ was partially implemented, code removed August 12, 2002 by JMACD.
11050+*/
11051+
11052+/* FLUSH CALLED ON NON-LEAF LEVEL. Most of our design considerations assume that the
11053+ starting point for flush is a leaf node, but actually the flush code cares very little
11054+ about whether or not this is true. It is possible that all the leaf nodes are flushed
11055+ and dirty parent nodes still remain, in which case jnode_flush() is called on a
11056+ non-leaf argument. Flush doesn't care--it treats the argument node as if it were a
11057+ leaf, even when it is not. This is a simple approach, and there may be a more optimal
11058+ policy but until a problem with this approach is discovered, simplest is probably best.
11059+
11060+ NOTE: In this case, the ordering produced by flush is parent-first only if you ignore
11061+ the leaves. This is done as a matter of simplicity and there is only one (shaky)
11062+ justification. When an atom commits, it flushes all leaf level nodes first, followed
11063+ by twigs, and so on. With flushing done in this order, if flush is eventually called
11064+ on a non-leaf node it means that (somehow) we reached a point where all leaves are
11065+ clean and only internal nodes need to be flushed. If that is the case, then it means
11066+ there were no leaves that were the parent-first preceder/follower of the parent. This
11067+ is expected to be a rare case, which is why we do nothing special about it. However,
11068+ memory pressure may pass an internal node to flush when there are still dirty leaf
11069+ nodes that need to be flushed, which could prove our original assumptions
11070+ "inoperative". If this needs to be fixed, then scan_left/right should have
11071+ special checks for the non-leaf levels. For example, instead of passing from a node to
11072+ the left neighbor, it should pass from the node to the left neighbor's rightmost
11073+ descendent (if dirty).
11074+
11075+*/
11076+
11077+/* UNIMPLEMENTED AS YET: REPACKING AND RESIZING. We walk the tree in 4MB-16MB chunks, dirtying everything and putting
11078+ it into a transaction. We tell the allocator to allocate the blocks as far as possible towards one end of the
11079+ logical device--the left (starting) end of the device if we are walking from left to right, the right end of the
11080+ device if we are walking from right to left. We then make passes in alternating directions, and as we do this the
11081+ device becomes sorted such that tree order and block number order fully correlate.
11082+
11083+ Resizing is done by shifting everything either all the way to the left or all the way
11084+ to the right, and then reporting the last block.
11085+*/
11086+
11087+/* RELOCATE DECISIONS: The code makes a decision to relocate in several places. This
11088+ describes the policy from the highest level:
11089+
11090+ The FLUSH_RELOCATE_THRESHOLD parameter: If we count this many consecutive nodes on the
11091+ leaf level during flush-scan (right, left), then we unconditionally decide to relocate
11092+ leaf nodes.
11093+
11094+ Otherwise, there are two contexts in which we make a decision to relocate:
11095+
11096+ 1. The REVERSE PARENT-FIRST context: Implemented in reverse_relocate_test().
11097+ During the initial stages of flush, after scan-right completes, we want to ask the
11098+ question: should we relocate this leaf node and thus dirty the parent node. Then if
11099+ the node is a leftmost child its parent is its own parent-first preceder, thus we repeat
11100+ the question at the next level up, and so on. In these cases we are moving in the
11101+ reverse-parent first direction.
11102+
11103+ There is another case which is considered the reverse direction, which comes at the end
11104+ of a twig in reverse_relocate_end_of_twig(). As we finish processing a twig we may
11105+ reach a point where there is a clean twig to the right with a dirty leftmost child. In
11106+ this case, we may wish to relocate the child by testing if it should be relocated
11107+ relative to its parent.
11108+
11109+ 2. The FORWARD PARENT-FIRST context: Testing for forward relocation is done in
11110+ allocate_znode. What distinguishes the forward parent-first case from the
11111+ reverse-parent first case is that the preceder has already been allocated in the
11112+ forward case, whereas in the reverse case we don't know what the preceder is until we
11113+ finish "going in reverse". That simplifies the forward case considerably, and there we
11114+ actually use the block allocator to determine whether, e.g., a block closer to the
11115+ preceder is available.
11116+*/
11117+
11118+/* SQUEEZE_LEFT_EDGE: Unimplemented idea for future consideration. The idea is, once we
11119+ finish scan-left and find a starting point, if the parent's left neighbor is dirty then
11120+ squeeze the parent's left neighbor and the parent. This may change the
11121+ flush-starting-node's parent. Repeat until the child's parent is stable. If the child
11122+ is a leftmost child, repeat this left-edge squeezing operation at the next level up.
11123+ Note that we cannot allocate extents during this or they will be out of parent-first
11124+ order. There are also some difficult coordinate maintenance issues. We can't do a tree
11125+ search to find coordinates again (because we hold locks), we have to determine them
11126+ from the two nodes being squeezed. Looks difficult, but has potential to increase
11127+ space utilization. */
11128+
11129+/* Flush-scan helper functions. */
11130+static void scan_init(flush_scan * scan);
11131+static void scan_done(flush_scan * scan);
11132+
11133+/* Flush-scan algorithm. */
11134+static int scan_left(flush_scan * scan, flush_scan * right, jnode * node,
11135+ unsigned limit);
11136+static int scan_right(flush_scan * scan, jnode * node, unsigned limit);
11137+static int scan_common(flush_scan * scan, flush_scan * other);
11138+static int scan_formatted(flush_scan * scan);
11139+static int scan_unformatted(flush_scan * scan, flush_scan * other);
11140+static int scan_by_coord(flush_scan * scan);
11141+
11142+/* Initial flush-point ancestor allocation. */
11143+static int alloc_pos_and_ancestors(flush_pos_t * pos);
11144+static int alloc_one_ancestor(const coord_t * coord, flush_pos_t * pos);
11145+static int set_preceder(const coord_t * coord_in, flush_pos_t * pos);
11146+
11147+/* Main flush algorithm. Note on abbreviation: "squeeze and allocate" == "squalloc". */
11148+static int squalloc(flush_pos_t * pos);
11149+
11150+/* Flush squeeze implementation. */
11151+static int squeeze_right_non_twig(znode * left, znode * right);
11152+static int shift_one_internal_unit(znode * left, znode * right);
11153+
11154+/* Flush reverse parent-first relocation routines. */
11155+static int reverse_relocate_if_close_enough(const reiser4_block_nr * pblk,
11156+ const reiser4_block_nr * nblk);
11157+static int reverse_relocate_test(jnode * node, const coord_t * parent_coord,
11158+ flush_pos_t * pos);
11159+static int reverse_relocate_check_dirty_parent(jnode * node,
11160+ const coord_t * parent_coord,
11161+ flush_pos_t * pos);
11162+
11163+/* Flush allocate write-queueing functions: */
11164+static int allocate_znode(znode * node, const coord_t * parent_coord,
11165+ flush_pos_t * pos);
11166+static int allocate_znode_update(znode * node, const coord_t * parent_coord,
11167+ flush_pos_t * pos);
11168+static int lock_parent_and_allocate_znode(znode *, flush_pos_t *);
11169+
11170+/* Flush helper functions: */
11171+static int jnode_lock_parent_coord(jnode * node,
11172+ coord_t * coord,
11173+ lock_handle * parent_lh,
11174+ load_count * parent_zh,
11175+ znode_lock_mode mode, int try);
11176+static int neighbor_in_slum(znode * node, lock_handle * right_lock, sideof side,
11177+ znode_lock_mode mode, int check_dirty);
11178+static int znode_same_parents(znode * a, znode * b);
11179+
11180+static int znode_check_flushprepped(znode * node)
11181+{
11182+ return jnode_check_flushprepped(ZJNODE(node));
11183+}
11184+
11185+/* Flush position functions */
11186+static void pos_init(flush_pos_t * pos);
11187+static int pos_valid(flush_pos_t * pos);
11188+static void pos_done(flush_pos_t * pos);
11189+static int pos_stop(flush_pos_t * pos);
11190+
11191+/* check that @org is the first jnode of its extent unit, if the extent is unallocated,
11192+ * because all jnodes of an unallocated extent are dirty and of the same atom. */
11193+#define checkchild(scan) \
11194+assert("nikita-3435", \
11195+ ergo(scan->direction == LEFT_SIDE && \
11196+ (scan->parent_coord.node->level == TWIG_LEVEL) && \
11197+ jnode_is_unformatted(scan->node) && \
11198+ extent_is_unallocated(&scan->parent_coord), \
11199+ extent_unit_index(&scan->parent_coord) == index_jnode(scan->node)))
11200+
11201+/* This flush_cnt variable is used to track the number of concurrent flush operations,
11202+ useful for debugging. It is initialized in txnmgr.c out of laziness (because flush has
11203+ no static initializer function...) */
11204+ON_DEBUG(atomic_t flush_cnt;
11205+ )
11206+
11207+/* check fs backing device for write congestion */
11208+static int check_write_congestion(void)
11209+{
11210+ struct super_block *sb;
11211+ struct backing_dev_info *bdi;
11212+
11213+ sb = reiser4_get_current_sb();
11214+ bdi = get_super_fake(sb)->i_mapping->backing_dev_info;
11215+ return bdi_write_congested(bdi);
11216+}
11217+
11218+/* conditionally write flush queue */
11219+static int write_prepped_nodes(flush_pos_t * pos)
11220+{
11221+ int ret;
11222+
11223+ assert("zam-831", pos);
11224+ assert("zam-832", pos->fq);
11225+
11226+ if (!(pos->flags & JNODE_FLUSH_WRITE_BLOCKS))
11227+ return 0;
11228+
11229+ if (check_write_congestion())
11230+ return 0;
11231+
11232+ ret = write_fq(pos->fq, pos->nr_written,
11233+ WRITEOUT_SINGLE_STREAM | WRITEOUT_FOR_PAGE_RECLAIM);
11234+ return ret;
11235+}
11236+
11237+/* Properly release all flush position resources, then move the flush position
11238+ to the new locked node */
11239+static void move_flush_pos(flush_pos_t * pos, lock_handle * new_lock,
11240+ load_count * new_load, const coord_t * new_coord)
11241+{
11242+ assert("zam-857", new_lock->node == new_load->node);
11243+
11244+ if (new_coord) {
11245+ assert("zam-858", new_coord->node == new_lock->node);
11246+ coord_dup(&pos->coord, new_coord);
11247+ } else {
11248+ coord_init_first_unit(&pos->coord, new_lock->node);
11249+ }
11250+
11251+ if (pos->child) {
11252+ jput(pos->child);
11253+ pos->child = NULL;
11254+ }
11255+
11256+ move_load_count(&pos->load, new_load);
11257+ done_lh(&pos->lock);
11258+ move_lh(&pos->lock, new_lock);
11259+}
11260+
11261+/* delete an empty node whose link from the parent still exists. */
11262+static int delete_empty_node(znode * node)
11263+{
11264+ reiser4_key smallest_removed;
11265+
11266+ assert("zam-1019", node != NULL);
11267+ assert("zam-1020", node_is_empty(node));
11268+ assert("zam-1023", znode_is_wlocked(node));
11269+
11270+ return delete_node(node, &smallest_removed, NULL, 1);
11271+}
11272+
11273+/* Prepare flush position for alloc_pos_and_ancestors() and squalloc() */
11274+static int prepare_flush_pos(flush_pos_t * pos, jnode * org)
11275+{
11276+ int ret;
11277+ load_count load;
11278+ lock_handle lock;
11279+
11280+ init_lh(&lock);
11281+ init_load_count(&load);
11282+
11283+ if (jnode_is_znode(org)) {
11284+ ret = longterm_lock_znode(&lock, JZNODE(org),
11285+ ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI);
11286+ if (ret)
11287+ return ret;
11288+
11289+ ret = incr_load_count_znode(&load, JZNODE(org));
11290+ if (ret)
11291+ goto done; /* release the long term lock taken above */
11292+
11293+ pos->state =
11294+ (jnode_get_level(org) ==
11295+ LEAF_LEVEL) ? POS_ON_LEAF : POS_ON_INTERNAL;
11296+ move_flush_pos(pos, &lock, &load, NULL);
11297+ } else {
11298+ coord_t parent_coord;
11299+ ret = jnode_lock_parent_coord(org, &parent_coord, &lock,
11300+ &load, ZNODE_WRITE_LOCK, 0);
11301+ if (ret)
11302+ goto done;
11303+ if (!item_is_extent(&parent_coord)) {
11304+ /* file was converted to tail, org became HB, we found internal
11305+ item */
11306+ ret = -EAGAIN;
11307+ goto done;
11308+ }
11309+
11310+ pos->state = POS_ON_EPOINT;
11311+ move_flush_pos(pos, &lock, &load, &parent_coord);
11312+ pos->child = jref(org);
11313+ if (extent_is_unallocated(&parent_coord)
11314+ && extent_unit_index(&parent_coord) != index_jnode(org)) {
11315+ /* @org is not the first child of its parent unit. This may happen
11316+ because the long term lock of its parent node was released between
11317+ scan_left and scan_right. For now, work around this by having flush repeat */
11318+ ret = -EAGAIN;
11319+ }
11320+ }
11321+
11322+ done:
11323+ done_load_count(&load);
11324+ done_lh(&lock);
11325+ return ret;
11326+}
11327+
11328+/* TODO LIST (no particular order): */
11329+/* I have labelled most of the legitimate FIXME comments in this file with letters to
11330+ indicate which issue they relate to. There are a few miscellaneous FIXMEs with
11331+ specific names mentioned instead that need to be inspected/resolved. */
11332+/* B. There is an issue described in reverse_relocate_test having to do with an
11333+ imprecise is_preceder? check for partially-dirty extents. The code that
11334+ sets preceder hints and computes the preceder is basically untested. Careful testing
11335+ needs to be done that preceder calculations are done correctly, since if it doesn't
11336+ affect correctness we will not catch this stuff during regular testing. */
11337+/* C. EINVAL, E_DEADLOCK, E_NO_NEIGHBOR, ENOENT handling. It is unclear which of these are
11338+ considered expected but unlikely conditions. Flush currently returns 0 (i.e., success
11339+ but no progress, i.e., restart) whenever it receives any of these in jnode_flush().
11340+ Many of the calls that may produce one of these return values (i.e.,
11341+ longterm_lock_znode, reiser4_get_parent, reiser4_get_neighbor, ...) check some of these
11342+ values themselves and, for instance, stop flushing instead of resulting in a restart.
11343+ If any of these results are true error conditions then flush will go into a busy-loop,
11344+ as we noticed during testing when a corrupt tree caused find_child_ptr to return
11345+ ENOENT. It needs careful thought and testing of corner conditions.
11346+*/
11347+/* D. Atomicity of flush_prep against deletion and flush concurrency. Suppose a created
11348+ block is assigned a block number then early-flushed to disk. It is dirtied again and
11349+ flush is called again. Concurrently, that block is deleted, and the de-allocation of
11350+ its block number does not need to be deferred, since it is not part of the preserve set
11351+ (i.e., it didn't exist before the transaction). I think there may be a race condition
11352+ where flush writes the dirty, created block after the non-deferred deallocated block
11353+ number is re-allocated, making it possible to write deleted data on top of non-deleted
11354+ data. Its just a theory, but it needs to be thought out. */
11355+/* F. bio_alloc() failure is not handled gracefully. */
11356+/* G. Unallocated children. */
11357+/* H. Add a WANDERED_LIST to the atom to clarify the placement of wandered blocks. */
11358+/* I. Rename flush-scan to scan-point, (flush-pos to flush-point?) */
11359+
11360+/* JNODE_FLUSH: MAIN ENTRY POINT */
11361+/* This is the main entry point for flushing a jnode and its dirty neighborhood (dirty
11362+ neighborhood is named "slum"). Jnode_flush() is called if reiser4 has to write dirty
11363+ blocks to disk; this happens when the Linux VM decides to reduce the number of dirty
11364+ pages or as a part of transaction commit.
11365+
11366+ Our objective here is to prep and flush the slum the jnode belongs to. We want to
11367+ squish the slum together, and allocate the nodes in it as we squish because allocation
11368+ of children affects squishing of parents.
11369+
11370+ The "argument" @node tells flush where to start. From there, flush finds the left edge
11371+ of the slum, and calls squalloc (in which nodes are squeezed and allocated). To find a
11372+ "better place" to start squalloc first we perform a flush_scan.
11373+
11374+ Flush-scanning may be performed in both left and right directions, but for different
11375+ purposes. When scanning to the left, we are searching for a node that precedes a
11376+ sequence of parent-first-ordered nodes which we will then flush in parent-first order.
11377+ During flush-scanning, we also take the opportunity to count the number of consecutive
11378+ leaf nodes. If this number is past some threshold (FLUSH_RELOCATE_THRESHOLD), then we
11379+ make a decision to reallocate leaf nodes (thus favoring write-optimization).
11380+
11381+ Since the flush argument node can be anywhere in a sequence of dirty leaves, there may
11382+ also be dirty nodes to the right of the argument. If the scan-left operation does not
11383+ count at least FLUSH_RELOCATE_THRESHOLD nodes then we follow it with a right-scan
11384+ operation to see whether there are, in fact, enough nodes to meet the relocate
11385+ threshold. Each right- and left-scan operation uses a single flush_scan object.
11386+
11387+ After left-scan and possibly right-scan, we prepare a flush_position object with the
11388+ starting flush point or parent coordinate, which was determined using scan-left.
11389+
11390+ Next we call the main flush routine, squalloc, which iterates along the
11391+ leaf level, squeezing and allocating nodes (and placing them into the flush queue).
11392+
11393+ After squalloc returns we take extra steps to ensure that all the children
11394+ of the final twig node are allocated--this involves repeating squalloc
11395+ until we finish at a twig with no unallocated children.
11396+
11397+ Finally, we call flush_empty_queue to submit write-requests to disk. If we encounter
11398+ any above-twig nodes during flush_empty_queue that still have unallocated children, we
11399+ flush_unprep them.
11400+
11401+ Flush treats several "failure" cases as non-failures, essentially causing flush to start
11402+ over. E_DEADLOCK is one example. FIXME:(C) EINVAL, E_NO_NEIGHBOR, ENOENT: these should
11403+ probably be handled properly rather than restarting, but there are a bunch of cases to
11404+ audit.
11405+*/
11406+
11407+static int
11408+jnode_flush(jnode * node, long nr_to_write, long *nr_written,
11409+ flush_queue_t * fq, int flags)
11410+{
11411+ long ret = 0;
11412+ flush_scan *right_scan;
11413+ flush_scan *left_scan;
11414+ flush_pos_t *flush_pos;
11415+ int todo;
11416+ struct super_block *sb;
11417+ reiser4_super_info_data *sbinfo;
11418+ jnode *leftmost_in_slum = NULL;
11419+
11420+ assert("jmacd-76619", lock_stack_isclean(get_current_lock_stack()));
11421+ assert("nikita-3022", schedulable());
11422+
11423+ /* lock ordering: delete_sema and flush_sema are unordered */
11424+ assert("nikita-3185",
11425+ get_current_super_private()->delete_sema_owner != current);
11426+
11427+ /* allocate right_scan, left_scan and flush_pos */
11428+ right_scan =
11429+ kmalloc(2 * sizeof(*right_scan) + sizeof(*flush_pos), get_gfp_mask());
11430+ if (right_scan == NULL)
11431+ return RETERR(-ENOMEM);
11432+ left_scan = right_scan + 1;
11433+ flush_pos = (flush_pos_t *) (left_scan + 1);
11434+
11435+ sb = reiser4_get_current_sb();
11436+ sbinfo = get_super_private(sb);
11437+ if (!reiser4_is_set(sb, REISER4_MTFLUSH)) {
11438+ down(&sbinfo->flush_sema);
11439+ }
11440+
11441+ /* Flush-concurrency debug code */
11442+#if REISER4_DEBUG
11443+ atomic_inc(&flush_cnt);
11444+#endif
11445+
11446+ enter_flush(sb);
11447+
11448+ /* Initialize a flush position. */
11449+ pos_init(flush_pos);
11450+
11451+ flush_pos->nr_written = nr_written;
11452+ flush_pos->fq = fq;
11453+ flush_pos->flags = flags;
11454+ flush_pos->nr_to_write = nr_to_write;
11455+
11456+ scan_init(right_scan);
11457+ scan_init(left_scan);
11458+
11459+ /* First scan left and remember the leftmost scan position. If the leftmost
11460+ position is unformatted we remember its parent_coord. We scan until we have
11461+ counted FLUSH_SCAN_MAXNODES nodes.
11462+
11463+ If the starting @node is unformatted, at the beginning of the left scan its
11464+ parent (a twig-level node containing the extent item) will be long-term
11465+ locked and the lock handle will be stored in
11466+ @right_scan->parent_lock. This lock is used to start the rightward
11467+ scan without redoing the tree traversal (necessary to find the parent)
11468+ and, hence, is kept during the leftward scan. As a result, we have to
11469+ use try-lock when taking long-term locks during the leftward scan.
11470+ */
11471+ ret = scan_left(left_scan, right_scan,
11472+ node, sbinfo->flush.scan_maxnodes);
11473+ if (ret != 0)
11474+ goto failed;
11475+
11476+ leftmost_in_slum = jref(left_scan->node);
11477+ scan_done(left_scan);
11478+
11479+ /* Then possibly go right to decide if we will use a policy of relocating leaves.
11480+ This is only done if we did not scan past (and count) enough nodes during the
11481+ leftward scan. If we do scan right, we only care to go far enough to establish
11482+ that at least FLUSH_RELOCATE_THRESHOLD nodes are being flushed. The
11483+ scan limit is the difference between left_scan.count and the threshold. */
11484+
11485+ todo = sbinfo->flush.relocate_threshold - left_scan->count;
11486+ /* scan right is inherently deadlock prone, because we are
11487+ * (potentially) holding a lock on the twig node at this moment.
11488+ * FIXME: this comment is incorrect: the lock is not held */
11489+ if (todo > 0) {
11490+ ret = scan_right(right_scan, node, (unsigned)todo);
11491+ if (ret != 0)
11492+ goto failed;
11493+ }
11494+
11495+ /* Only the right-scan count is needed, release any rightward locks right away. */
11496+ scan_done(right_scan);
11497+
11498+ /* ... and the answer is: we should relocate leaf nodes if at least
11499+ FLUSH_RELOCATE_THRESHOLD nodes were found. */
11500+ flush_pos->leaf_relocate = JF_ISSET(node, JNODE_REPACK) ||
11501+ (left_scan->count + right_scan->count >=
11502+ sbinfo->flush.relocate_threshold);
11503+
11504+ /* Funny business here. We set the 'point' in the flush_position prior to
11505+ starting squalloc regardless of whether the first point is
11506+ formatted or unformatted. Without this there would be an invariant, in the
11507+ rest of the code, that if the flush_position is unformatted then
11508+ flush_position->point is NULL and flush_position->parent_{lock,coord} is set,
11509+ and if the flush_position is formatted then flush_position->point is non-NULL
11510+ and no parent info is set.
11511+
11512+ This seems lazy, but it makes the initial calls to reverse_relocate_test
11513+ (which asks "is pos->point the leftmost child of its parent?") much easier
11514+ because we know the first child already. Nothing is broken by this, but the
11515+ reasoning is subtle. Holding an extra reference on a jnode during flush can
11516+ cause us to see nodes with HEARD_BANSHEE during squalloc, because nodes are not
11517+ removed from sibling lists until they have zero reference count. Flush would
11518+ never observe a HEARD_BANSHEE node on the left edge of flush; nodes are only
11519+ deleted to the right. So if nothing is broken, why fix it?
11520+
11521+ NOTE-NIKITA actually, flush can meet a HEARD_BANSHEE node at any
11522+ point and at any moment, because of concurrent file system
11523+ activity (for example, truncate). */
11524+
11525+ /* Check jnode state after flush_scan completed. Having a lock on this
11526+ node or its parent (in case of unformatted) helps us in case of
11527+ concurrent flushing. */
11528+ if (jnode_check_flushprepped(leftmost_in_slum)
11529+ && !jnode_convertible(leftmost_in_slum)) {
11530+ ret = 0;
11531+ goto failed;
11532+ }
11533+
11534+ /* Now set up flush_pos using scan_left's endpoint. */
11535+ ret = prepare_flush_pos(flush_pos, leftmost_in_slum);
11536+ if (ret)
11537+ goto failed;
11538+
11539+ if (znode_get_level(flush_pos->coord.node) == LEAF_LEVEL
11540+ && node_is_empty(flush_pos->coord.node)) {
11541+ znode *empty = flush_pos->coord.node;
11542+
11543+ assert("zam-1022", !ZF_ISSET(empty, JNODE_HEARD_BANSHEE));
11544+ ret = delete_empty_node(empty);
11545+ goto failed;
11546+ }
11547+
11548+ if (jnode_check_flushprepped(leftmost_in_slum)
11549+ && !jnode_convertible(leftmost_in_slum)) {
11550+ ret = 0;
11551+ goto failed;
11552+ }
11553+
11554+ /* Set pos->preceder and (re)allocate pos and its ancestors if needed */
11555+ ret = alloc_pos_and_ancestors(flush_pos);
11556+ if (ret)
11557+ goto failed;
11558+
11559+ /* Do the main rightward-bottom-up squeeze and allocate loop. */
11560+ ret = squalloc(flush_pos);
11561+ pos_stop(flush_pos);
11562+ if (ret)
11563+ goto failed;
11564+
11565+ /* FIXME_NFQUCMPD: Here, handle the twig-special case for unallocated children.
11566+ First, the pos_stop() and pos_valid() routines should be modified
11567+ so that pos_stop() sets a flush_position->stop flag to 1 without
11568+ releasing the current position immediately--instead release it in
11569+ pos_done(). This is a better implementation than the current one anyway.
11570+
11571+ It is not clear which fields of the flush_position can be released,
11572+ but at the very least the parent_lock, parent_coord, and parent_load should
11573+ remain held because they hold the last twig when pos_stop() is
11574+ called.
11575+
11576+ When we reach this point in the code, if the parent_coord is set to after the
11577+ last item then we know that flush reached the end of a twig (and according to
11578+ the new flush queueing design, we will return now). If parent_coord is not
11579+ past the last item, we should check if the current twig has any unallocated
11580+ children to the right (we are not concerned with unallocated children to the
11581+ left--in that case the twig itself should not have been allocated). If the
11582+ twig has unallocated children to the right, set the parent_coord to that
11583+ position and then repeat the call to squalloc.
11584+
11585+ Testing for unallocated children may be defined in two ways: if any internal
11586+ item has a fake block number, it is unallocated; if any extent item is
11587+ unallocated then all of its children are unallocated. But there is a more
11588+ aggressive approach: if there are any dirty children of the twig to the right
11589+ of the current position, we may wish to relocate those nodes now. Checking for
11590+ potential relocation is more expensive as it requires knowing whether there are
11591+ any dirty children that are not unallocated. The extent_needs_allocation
11592+ should be used after setting the correct preceder.
11593+
11594+ When we reach the end of a twig at this point in the code, if the flush can
11595+ continue (when the queue is ready) it will need some information on the future
11596+ starting point. That should be stored away in the flush_handle using a seal, I
11597+ believe. Holding a jref() on the future starting point may break other code
11598+ that deletes that node.
11599+ */
11600+
11601+ /* FIXME_NFQUCMPD: Also, we don't want to do any flushing when flush is called
11602+ above the twig level. If the VM calls flush above the twig level, do nothing
11603+ and return (but figure out why this happens). The txnmgr should be modified to
11604+ only flush its leaf-level dirty list. This will do all the necessary squeeze
11605+ and allocate steps but leave unallocated branches and possibly unallocated
11606+ twigs (when the twig's leftmost child is not dirty). After flushing the leaf
11607+ level, the remaining unallocated nodes should be given write-optimized
11608+ locations. (Possibly, the remaining unallocated twigs should be allocated just
11609+ before their leftmost child.)
11610+ */
11611+
11612+ /* Any failure reaches this point. */
11613+ failed:
11614+
11615+ switch (ret) {
11616+ case -E_REPEAT:
11617+ case -EINVAL:
11618+ case -E_DEADLOCK:
11619+ case -E_NO_NEIGHBOR:
11620+ case -ENOENT:
11621+ /* FIXME(C): Except for E_DEADLOCK, these should probably be handled properly
11622+ in each case. They already are handled in many cases. */
11623+ /* Something bad happened, but difficult to avoid... Try again! */
11624+ ret = 0;
11625+ }
11626+
11627+ if (leftmost_in_slum)
11628+ jput(leftmost_in_slum);
11629+
11630+ pos_done(flush_pos);
11631+ scan_done(left_scan);
11632+ scan_done(right_scan);
11633+ kfree(right_scan);
11634+
11635+ ON_DEBUG(atomic_dec(&flush_cnt));
11636+
11637+ leave_flush(sb);
11638+
11639+ if (!reiser4_is_set(sb, REISER4_MTFLUSH))
11640+ up(&sbinfo->flush_sema);
11641+
11642+ return ret;
11643+}
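+
+/* A note on the allocation above: jnode_flush() carves a single kmalloc()
+ into two flush_scan objects and one flush_pos_t, so the single
+ kfree(right_scan) at the end releases all three. A minimal standalone
+ sketch of the same carving pattern, using plain malloc() and hypothetical
+ toy types:
+
+ struct demo_scan { int count; };
+ struct demo_pos { long nr_written; };
+
+ static int demo_carved_alloc(void)
+ {
+ struct demo_scan *right, *left;
+ struct demo_pos *pos;
+
+ right = malloc(2 * sizeof(*right) + sizeof(*pos));
+ if (right == NULL)
+ return -1;
+ left = right + 1;
+ pos = (struct demo_pos *)(left + 1);
+
+ right->count = left->count = 0;
+ pos->nr_written = 0;
+
+ free(right);
+ return 0;
+ }
+*/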
11644+
11645+/* The reiser4 flush subsystem can be put into "rapid flush mode", which means
11646+ * that the flusher should submit all prepped nodes immediately, without keeping
11647+ * them in flush queues for a long time. The reason for rapid flush mode is to
11648+ * free memory as fast as possible. */
11649+
11650+#if REISER4_USE_RAPID_FLUSH
11651+
11652+/**
11653+ * submit all prepped nodes if rapid flush mode is set, then
11654+ * turn rapid flush mode off.
11655+ */
11656+
11657+static int rapid_flush(flush_pos_t * pos)
11658+{
11659+ if (!wbq_available())
11660+ return 0;
11661+
11662+ return write_prepped_nodes(pos);
11663+}
11664+
11665+#else
11666+
11667+#define rapid_flush(pos) (0)
11668+
11669+#endif /* REISER4_USE_RAPID_FLUSH */
11670+
11671+static jnode *find_flush_start_jnode(jnode *start, txn_atom *atom,
11672+ flush_queue_t *fq, int *nr_queued,
11673+ int flags)
11674+{
11675+ jnode * node;
11676+
11677+ if (start != NULL) {
11678+ spin_lock_jnode(start);
11679+ if (!jnode_is_flushprepped(start)) {
11680+ assert("zam-1056", start->atom == atom);
11681+ node = start;
11682+ goto enter;
11683+ }
11684+ spin_unlock_jnode(start);
11685+ }
11686+ /*
11687+ * In this loop we process all nodes that were already prepped (RELOC or OVRWR)
11688+ * and then dirtied again. The atom spin lock is not released until all dirty
11689+ * nodes are processed or a not-yet-prepped node is found in the atom dirty lists.
11690+ */
11691+ while ((node = find_first_dirty_jnode(atom, flags))) {
11692+ spin_lock_jnode(node);
11693+ enter:
11694+ assert("zam-881", JF_ISSET(node, JNODE_DIRTY));
11695+ assert("zam-898", !JF_ISSET(node, JNODE_OVRWR));
11696+
11697+ if (JF_ISSET(node, JNODE_WRITEBACK)) {
11698+ /* move node to the end of atom's writeback list */
11699+ list_move_tail(&node->capture_link, ATOM_WB_LIST(atom));
11700+
11701+ /*
11702+ * the jnode is not necessarily on the dirty list: if it was dirtied
11703+ * while it was on the flush queue, it does not get moved to the dirty list
11704+ */
11705+ ON_DEBUG(count_jnode(atom, node, NODE_LIST(node),
11706+ WB_LIST, 1));
11707+
11708+ } else if (jnode_is_znode(node)
11709+ && znode_above_root(JZNODE(node))) {
11710+ /*
11711+ * A special case for znode-above-root. The above-root (fake)
11712+ * znode is captured and dirtied when the tree height changes or
11713+ * when the root node is relocated. This causes atoms to fuse so
11714+ * that changes at the root are serialized. However, this node is
11715+ * never flushed. This special case used to be in lock.c to
11716+ * prevent the above-root node from ever being captured, but now
11717+ * that it is captured we simply prevent it from flushing. The
11718+ * log-writer code relies on this to properly log superblock
11719+ * modifications of the tree height.
11720+ */
11721+ jnode_make_wander_nolock(node);
11722+ } else if (JF_ISSET(node, JNODE_RELOC)) {
11723+ queue_jnode(fq, node);
11724+ ++(*nr_queued);
11725+ } else
11726+ break;
11727+
11728+ spin_unlock_jnode(node);
11729+ }
11730+ return node;
11731+}
11732+
11733+
11734+/* Flush some nodes of the current atom, usually a slum. Return -E_REPEAT if there are
11735+ * more nodes to flush; return 0 if the atom's dirty lists are empty, keeping the current
11736+ * atom locked; return other errors as they are. */
11737+int
11738+flush_current_atom(int flags, long nr_to_write, long *nr_submitted,
11739+ txn_atom ** atom, jnode *start)
11740+{
11741+ reiser4_super_info_data *sinfo = get_current_super_private();
11742+ flush_queue_t *fq = NULL;
11743+ jnode *node;
11744+ int nr_queued;
11745+ int ret;
11746+
11747+ assert("zam-889", atom != NULL && *atom != NULL);
11748+ assert_spin_locked(&((*atom)->alock));
11749+ assert("zam-892", get_current_context()->trans->atom == *atom);
11750+
11751+ nr_to_write = LONG_MAX;
11752+ while (1) {
11753+ ret = fq_by_atom(*atom, &fq);
11754+ if (ret != -E_REPEAT)
11755+ break;
11756+ *atom = get_current_atom_locked();
11757+ }
11758+ if (ret)
11759+ return ret;
11760+
11761+ assert_spin_locked(&((*atom)->alock));
11762+
11763+ /* parallel flushers limit */
11764+ if (sinfo->tmgr.atom_max_flushers != 0) {
11765+ while ((*atom)->nr_flushers >= sinfo->tmgr.atom_max_flushers) {
11766+ /* An atom_send_event() call is inside fq_put_nolock() which is
11767+ called when flush is finished and nr_flushers is
11768+ decremented. */
11769+ atom_wait_event(*atom);
11770+ *atom = get_current_atom_locked();
11771+ }
11772+ }
11773+
11774+ /* count ourself as a flusher */
11775+ (*atom)->nr_flushers++;
11776+
11777+ writeout_mode_enable();
11778+
11779+ nr_queued = 0;
11780+ node = find_flush_start_jnode(start, *atom, fq, &nr_queued, flags);
11781+
11782+ if (node == NULL) {
11783+ if (nr_queued == 0) {
11784+ (*atom)->nr_flushers--;
11785+ fq_put_nolock(fq);
11786+ atom_send_event(*atom);
11787+ /* current atom remains locked */
11788+ writeout_mode_disable();
11789+ return 0;
11790+ }
11791+ spin_unlock_atom(*atom);
11792+ } else {
11793+ jref(node);
11794+ BUG_ON((*atom)->super != node->tree->super);
11795+ spin_unlock_atom(*atom);
11796+ spin_unlock_jnode(node);
11797+ BUG_ON(nr_to_write == 0);
11798+ ret = jnode_flush(node, nr_to_write, nr_submitted, fq, flags);
11799+ jput(node);
11800+ }
11801+
11802+ ret =
11803+ write_fq(fq, nr_submitted,
11804+ WRITEOUT_SINGLE_STREAM | WRITEOUT_FOR_PAGE_RECLAIM);
11805+
11806+ *atom = get_current_atom_locked();
11807+ (*atom)->nr_flushers--;
11808+ fq_put_nolock(fq);
11809+ atom_send_event(*atom);
11810+ spin_unlock_atom(*atom);
11811+
11812+ writeout_mode_disable();
11813+
11814+ if (ret == 0)
11815+ ret = -E_REPEAT;
11816+
11817+ return ret;
11818+}
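+
+/* The return convention above implies a retry loop in the caller: repeat
+ while -E_REPEAT is returned, and remember that only the 0 return leaves the
+ atom spin-locked. A simplified, hypothetical caller sketch (real callers
+ also deal with atom fusion and commit logic):
+
+ do {
+ atom = get_current_atom_locked();
+ ret = flush_current_atom(flags, LONG_MAX, &nr_submitted,
+ &atom, NULL);
+ } while (ret == -E_REPEAT);
+ if (ret == 0)
+ spin_unlock_atom(atom);
+*/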
11819+
11820+/* REVERSE PARENT-FIRST RELOCATION POLICIES */
11821+
11822+/* This implements the is-it-close-enough-to-its-preceder? test for relocation in the
11823+ reverse parent-first relocate context. Here all we know is the preceder and the block
11824+ number. Since we are going in reverse, the preceder may still be relocated as well, so
11825+ we can't ask the block allocator "is there a closer block available to relocate?" here.
11826+ In the _forward_ parent-first relocate context (not here) we actually call the block
11827+ allocator to try and find a closer location. */
11828+static int
11829+reverse_relocate_if_close_enough(const reiser4_block_nr * pblk,
11830+ const reiser4_block_nr * nblk)
11831+{
11832+ reiser4_block_nr dist;
11833+
11834+ assert("jmacd-7710", *pblk != 0 && *nblk != 0);
11835+ assert("jmacd-7711", !blocknr_is_fake(pblk));
11836+ assert("jmacd-7712", !blocknr_is_fake(nblk));
11837+
11838+ /* Distance is the absolute value. */
11839+ dist = (*pblk > *nblk) ? (*pblk - *nblk) : (*nblk - *pblk);
11840+
11841+ /* If the block is less than FLUSH_RELOCATE_DISTANCE blocks away from its preceder
11842+ block, do not relocate. */
11843+ if (dist <= get_current_super_private()->flush.relocate_distance) {
11844+ return 0;
11845+ }
11846+
11847+ return 1;
11848+}
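+
+/* Worked example, assuming the default flush.relocate_distance of 64: a
+ preceder at block 1000 and a node at block 1050 give dist == 50 <= 64, so
+ the node stays in place (return 0); blocks 1000 and 1100 give dist == 100,
+ so the node becomes a relocate candidate (return 1). Because dist is an
+ absolute value, the test is symmetric in both directions. */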
11849+
11850+/* This function is a predicate that tests for relocation. Always called in the
11851+ reverse-parent-first context, when we are asking whether the current node should be
11852+ relocated in order to expand the flush by dirtying the parent level (and thus
11853+ proceeding to flush that level). When traversing in the forward parent-first direction
11854+ (not here), relocation decisions are handled in two places: allocate_znode() and
11855+ extent_needs_allocation(). */
11856+static int
11857+reverse_relocate_test(jnode * node, const coord_t * parent_coord,
11858+ flush_pos_t * pos)
11859+{
11860+ reiser4_block_nr pblk = 0;
11861+ reiser4_block_nr nblk = 0;
11862+
11863+ assert("jmacd-8989", !jnode_is_root(node));
11864+
11865+ /*
11866+ * This function is called only from
11867+ * reverse_relocate_check_dirty_parent() and only if the parent
11868+ * node is clean. This implies that the parent has a real (i.e., not
11869+ * fake) block number, and so does the child, because otherwise the
11870+ * parent would be dirty.
11871+ */
11872+
11873+ /* New nodes are treated as if they are being relocated. */
11874+ if (JF_ISSET (node, JNODE_CREATED) ||
11875+ (pos->leaf_relocate && jnode_get_level(node) == LEAF_LEVEL)) {
11876+ return 1;
11877+ }
11878+
11879+ /* Find the preceder. FIXME(B): When the child is an unformatted, previously
11880+ existing node, the coord may be leftmost even though the child is not the
11881+ parent-first preceder of the parent. If the first dirty node appears somewhere
11882+ in the middle of the first extent unit, this preceder calculation is wrong.
11883+ Needs more logic in here. */
11884+ if (coord_is_leftmost_unit(parent_coord)) {
11885+ pblk = *znode_get_block(parent_coord->node);
11886+ } else {
11887+ pblk = pos->preceder.blk;
11888+ }
11889+ check_preceder(pblk);
11890+
11891+ /* If (pblk == 0) then the preceder isn't allocated or isn't known: relocate. */
11892+ if (pblk == 0) {
11893+ return 1;
11894+ }
11895+
11896+ nblk = *jnode_get_block(node);
11897+
11898+ if (blocknr_is_fake(&nblk))
11899+ /* child is unallocated, mark parent dirty */
11900+ return 1;
11901+
11902+ return reverse_relocate_if_close_enough(&pblk, &nblk);
11903+}
11904+
11905+/* This function calls reverse_relocate_test to make a reverse-parent-first
11906+ relocation decision and then, if yes, it marks the parent dirty. */
11907+static int
11908+reverse_relocate_check_dirty_parent(jnode * node, const coord_t * parent_coord,
11909+ flush_pos_t * pos)
11910+{
11911+ int ret;
11912+
11913+ if (!JF_ISSET(ZJNODE(parent_coord->node), JNODE_DIRTY)) {
11914+
11915+ ret = reverse_relocate_test(node, parent_coord, pos);
11916+ if (ret < 0) {
11917+ return ret;
11918+ }
11919+
11920+ /* FIXME-ZAM
11921+ if parent is already relocated - we do not want to grab space, right? */
11922+ if (ret == 1) {
11923+ int grabbed;
11924+
11925+ grabbed = get_current_context()->grabbed_blocks;
11926+ if (reiser4_grab_space_force((__u64) 1, BA_RESERVED) !=
11927+ 0)
11928+ reiser4_panic("umka-1250",
11929+ "No space left during flush.");
11930+
11931+ assert("jmacd-18923",
11932+ znode_is_write_locked(parent_coord->node));
11933+ znode_make_dirty(parent_coord->node);
11934+ grabbed2free_mark(grabbed);
11935+ }
11936+ }
11937+
11938+ return 0;
11939+}
11940+
11941+/* INITIAL ALLOCATE ANCESTORS STEP (REVERSE PARENT-FIRST ALLOCATION BEFORE FORWARD
11942+ PARENT-FIRST LOOP BEGINS) */
11943+
11944+/* Get the leftmost child for given coord. */
11945+static int get_leftmost_child_of_unit(const coord_t * coord, jnode ** child)
11946+{
11947+ int ret;
11948+
11949+ ret = item_utmost_child(coord, LEFT_SIDE, child);
11950+
11951+ if (ret)
11952+ return ret;
11953+
11954+ if (IS_ERR(*child))
11955+ return PTR_ERR(*child);
11956+
11957+ return 0;
11958+}
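+
+/* Note that get_leftmost_child_of_unit() copes with two error channels: the
+ int return of item_utmost_child() and an ERR_PTR-encoded *child. The
+ error-pointer idiom packs a small negative errno into an invalid pointer
+ value. A minimal standalone model of that idiom (the real
+ IS_ERR()/PTR_ERR() helpers live in include/linux/err.h):
+
+ static inline void *demo_err_ptr(long error)
+ {
+ return (void *)error;
+ }
+
+ static inline long demo_ptr_err(const void *ptr)
+ {
+ return (long)ptr;
+ }
+
+ static inline int demo_is_err(const void *ptr)
+ {
+ return (unsigned long)ptr >= (unsigned long)-4095L;
+ }
+*/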
11959+
11960+/* This step occurs after the left- and right-scans are completed, before starting the
11961+ forward parent-first traversal. Here we attempt to allocate ancestors of the starting
11962+ flush point, which means continuing in the reverse parent-first direction to the
11963+ parent, grandparent, and so on (as long as the child is a leftmost child). This
11964+ routine calls a recursive process, alloc_one_ancestor, which does the real work,
11965+ except there is special-case handling here for the first ancestor, which may be a twig.
11966+ At each level (here and alloc_one_ancestor), we check for relocation and then, if
11967+ the child is a leftmost child, repeat at the next level. On the way back down (the
11968+ recursion), we allocate the ancestors in parent-first order. */
11969+static int alloc_pos_and_ancestors(flush_pos_t * pos)
11970+{
11971+ int ret = 0;
11972+ lock_handle plock;
11973+ load_count pload;
11974+ coord_t pcoord;
11975+
11976+ if (znode_check_flushprepped(pos->lock.node))
11977+ return 0;
11978+
11979+ coord_init_invalid(&pcoord, NULL);
11980+ init_lh(&plock);
11981+ init_load_count(&pload);
11982+
11983+ if (pos->state == POS_ON_EPOINT) {
11984+ /* a special case for pos on twig level, where we already have
11985+ a lock on parent node. */
11986+ /* The parent may not be dirty, in which case we should decide
11987+ whether to relocate the child now. If decision is made to
11988+ relocate the child, the parent is marked dirty. */
11989+ ret =
11990+ reverse_relocate_check_dirty_parent(pos->child, &pos->coord,
11991+ pos);
11992+ if (ret)
11993+ goto exit;
11994+
11995+ /* FIXME_NFQUCMPD: We only need to allocate the twig (if child
11996+ is leftmost) and the leaf/child, so recursion is not needed.
11997+ Levels above the twig will be allocated for
11998+ write-optimization before the transaction commits. */
11999+
12000+ /* Do the recursive step, allocating zero or more of our
12001+ * ancestors. */
12002+ ret = alloc_one_ancestor(&pos->coord, pos);
12003+
12004+ } else {
12005+ if (!znode_is_root(pos->lock.node)) {
12006+ /* all formatted nodes except tree root */
12007+ ret =
12008+ reiser4_get_parent(&plock, pos->lock.node,
12009+ ZNODE_WRITE_LOCK);
12010+ if (ret)
12011+ goto exit;
12012+
12013+ ret = incr_load_count_znode(&pload, plock.node);
12014+ if (ret)
12015+ goto exit;
12016+
12017+ ret =
12018+ find_child_ptr(plock.node, pos->lock.node, &pcoord);
12019+ if (ret)
12020+ goto exit;
12021+
12022+ ret =
12023+ reverse_relocate_check_dirty_parent(ZJNODE
12024+ (pos->lock.
12025+ node), &pcoord,
12026+ pos);
12027+ if (ret)
12028+ goto exit;
12029+
12030+ ret = alloc_one_ancestor(&pcoord, pos);
12031+ if (ret)
12032+ goto exit;
12033+ }
12034+
12035+ ret = allocate_znode(pos->lock.node, &pcoord, pos);
12036+ }
12037+ exit:
12038+ done_load_count(&pload);
12039+ done_lh(&plock);
12040+ return ret;
12041+}
12042+
12043+/* This is the recursive step described in alloc_pos_and_ancestors, above. Ignoring the
12044+ call to set_preceder, which is the next function described, this checks if the
12045+ child is a leftmost child and returns if it is not. If the child is a leftmost child
12046+ it checks for relocation, possibly dirtying the parent. Then it performs the recursive
12047+ step. */
12048+static int alloc_one_ancestor(const coord_t * coord, flush_pos_t * pos)
12049+{
12050+ int ret = 0;
12051+ lock_handle alock;
12052+ load_count aload;
12053+ coord_t acoord;
12054+
12055+ /* As we ascend at the left-edge of the region to flush, take this opportunity at
12056+ the twig level to find our parent-first preceder unless we have already set
12057+ it. */
12058+ if (pos->preceder.blk == 0) {
12059+ ret = set_preceder(coord, pos);
12060+ if (ret != 0)
12061+ return ret;
12062+ }
12063+
12064+ /* If the ancestor is clean or already allocated, or if the child is not a
12065+ leftmost child, stop going up, even leaving coord->node not flushprepped. */
12066+ if (znode_check_flushprepped(coord->node)
12067+ || !coord_is_leftmost_unit(coord))
12068+ return 0;
12069+
12070+ init_lh(&alock);
12071+ init_load_count(&aload);
12072+ coord_init_invalid(&acoord, NULL);
12073+
12074+ /* Only ascend to the next level if it is a leftmost child, but write-lock the
12075+ parent in case we will relocate the child. */
12076+ if (!znode_is_root(coord->node)) {
12077+
12078+ ret =
12079+ jnode_lock_parent_coord(ZJNODE(coord->node), &acoord,
12080+ &alock, &aload, ZNODE_WRITE_LOCK,
12081+ 0);
12082+ if (ret != 0) {
12083+ /* FIXME(C): check EINVAL, E_DEADLOCK */
12084+ goto exit;
12085+ }
12086+
12087+ ret =
12088+ reverse_relocate_check_dirty_parent(ZJNODE(coord->node),
12089+ &acoord, pos);
12090+ if (ret != 0) {
12091+ goto exit;
12092+ }
12093+
12094+ /* Recursive call. */
12095+ if (!znode_check_flushprepped(acoord.node)) {
12096+ ret = alloc_one_ancestor(&acoord, pos);
12097+ if (ret)
12098+ goto exit;
12099+ }
12100+ }
12101+
12102+ /* Note: we call allocate with the parent write-locked (except at the root) in
12103+ case we relocate the child, in which case it will modify the parent during this
12104+ call. */
12105+ ret = allocate_znode(coord->node, &acoord, pos);
12106+
12107+ exit:
12108+ done_load_count(&aload);
12109+ done_lh(&alock);
12110+ return ret;
12111+}
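+
+/* The ascend-then-allocate order of alloc_one_ancestor() in a toy standalone
+ sketch: climb while the current node is its parent's leftmost child, then
+ allocate on the way back down, which yields parent-first order. The types
+ are hypothetical stand-ins ("allocated" stands in for the flushprepped
+ state); locking, loading and the relocation test are omitted:
+
+ struct demo_node {
+ struct demo_node *parent;
+ int is_leftmost_child;
+ int allocated;
+ };
+
+ static void demo_alloc_ancestors(struct demo_node *node)
+ {
+ if (node->parent != NULL && node->is_leftmost_child &&
+ !node->parent->allocated)
+ demo_alloc_ancestors(node->parent);
+
+ node->allocated = 1;
+ }
+*/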
12112+
12113+/* During the reverse parent-first alloc_pos_and_ancestors process described above there is
12114+ a call to this function at the twig level. During alloc_pos_and_ancestors we may ask:
12115+ should this node be relocated (in reverse parent-first context)? We repeat this
12116+ process as long as the child is the leftmost child, eventually reaching an ancestor of
12117+ the flush point that is not a leftmost child. The parent-first preceder of that
12118+ ancestor is actually on the leaf level: it is the left neighbor of the flush point,
12119+ which in turn is the rightmost child of the twig to the left. So, when
12120+ alloc_pos_and_ancestors passes upward through the twig level, it stops momentarily
12121+ to remember the block of the rightmost child of the twig on the left and sets it
12122+ as the flush_position's preceder hint.
12123+
12124+ There is one other place where we may set the flush_position's preceder hint, which is
12125+ during scan-left.
12126+*/
12127+static int set_preceder(const coord_t * coord_in, flush_pos_t * pos)
12128+{
12129+ int ret;
12130+ coord_t coord;
12131+ lock_handle left_lock;
12132+ load_count left_load;
12133+
12134+ coord_dup(&coord, coord_in);
12135+
12136+ init_lh(&left_lock);
12137+ init_load_count(&left_load);
12138+
12139+ /* FIXME(B): Same FIXME as in "Find the preceder" in reverse_relocate_test.
12140+ coord_is_leftmost_unit is not the right test if the unformatted child is in the
12141+ middle of the first extent unit. */
12142+ if (!coord_is_leftmost_unit(&coord)) {
12143+ coord_prev_unit(&coord);
12144+ } else {
12145+ ret =
12146+ reiser4_get_left_neighbor(&left_lock, coord.node,
12147+ ZNODE_READ_LOCK, GN_SAME_ATOM);
12148+ if (ret) {
12149+ /* If we fail for any reason it doesn't matter because the
12150+ preceder is only a hint. We are low-priority at this point, so
12151+ this must be the case. */
12152+ if (ret == -E_REPEAT || ret == -E_NO_NEIGHBOR ||
12153+ ret == -ENOENT || ret == -EINVAL
12154+ || ret == -E_DEADLOCK) {
12155+ ret = 0;
12156+ }
12157+ goto exit;
12158+ }
12159+
12160+ ret = incr_load_count_znode(&left_load, left_lock.node);
12161+ if (ret)
12162+ goto exit;
12163+
12164+ coord_init_last_unit(&coord, left_lock.node);
12165+ }
12166+
12167+ ret =
12168+ item_utmost_child_real_block(&coord, RIGHT_SIDE,
12169+ &pos->preceder.blk);
12170+ exit:
12171+ check_preceder(pos->preceder.blk);
12172+ done_load_count(&left_load);
12173+ done_lh(&left_lock);
12174+ return ret;
12175+}
12176+
12177+/* MAIN SQUEEZE AND ALLOCATE LOOP (THREE BIG FUNCTIONS) */
12178+
12179+/* This procedure implements the outer loop of the flush algorithm. To put this in
12180+ context, here is the general list of steps taken by the flush routine as a whole:
12181+
12182+ 1. Scan-left
12183+ 2. Scan-right (maybe)
12184+ 3. Allocate initial flush position and its ancestors
12185+ 4. <handle extents>
12186+ 5. <squeeze and next position and its ancestors to-the-right,
12187+ then update position to-the-right>
12188+ 6. <repeat from #4 until flush is stopped>
12189+
12190+ This procedure implements the loop in steps 4 through 6 in the above listing.
12191+
12192+ Step 4: if the current flush position is an extent item (position on the twig level),
12193+ it allocates the extent (allocate_extent_item_in_place) then shifts to the next
12194+ coordinate. If the next coordinate's leftmost child needs flushprep, we will continue.
12195+ If the next coordinate is an internal item, we descend back to the leaf level,
12196+ otherwise we repeat a step #4 (labeled ALLOC_EXTENTS below). If the "next coordinate"
12197+ brings us past the end of the twig level, then we call
12198+ reverse_relocate_end_of_twig to possibly dirty the next (right) twig, prior to
12199+ step #5 which moves to the right.
12200+
12201+ Step 5: calls squalloc_changed_ancestors, which initiates a recursive call up the
12202+ tree to allocate any ancestors of the next-right flush position that are not also
12203+ ancestors of the current position. Those ancestors (in top-down order) are the next in
12204+ parent-first order. We squeeze adjacent nodes on the way up until the right node and
12205+ current node share the same parent, then allocate on the way back down. Finally, this
12206+ step sets the flush position to the next-right node. Then repeat steps 4 and 5.
12207+*/
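+
+/* The loop in steps 4 through 6 is naturally a small state machine: each
+ handle_pos_* routine below performs one unit of work and records the next
+ state in pos->state, and a driver dispatches until the position becomes
+ invalid. A simplified, hypothetical driver sketch (the actual squalloc()
+ dispatcher has this shape):
+
+ while (pos_valid(pos)) {
+ switch (pos->state) {
+ case POS_ON_LEAF: ret = handle_pos_on_leaf(pos); break;
+ case POS_ON_EPOINT: ret = handle_pos_on_twig(pos); break;
+ case POS_END_OF_TWIG: ret = handle_pos_end_of_twig(pos); break;
+ case POS_TO_LEAF: ret = handle_pos_to_leaf(pos); break;
+ case POS_TO_TWIG: ret = handle_pos_to_twig(pos); break;
+ case POS_ON_INTERNAL: ret = handle_pos_on_internal(pos); break;
+ default: ret = 0; pos_stop(pos); break;
+ }
+ if (ret)
+ break;
+ }
+*/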
12208+
12209+/* SQUEEZE CODE */
12210+
12211+/* squalloc_right_twig helper function: cut a range of extent items from
12212+ node @to->node, from the beginning up to coord @to. */
12213+static int squalloc_right_twig_cut(coord_t * to, reiser4_key * to_key,
12214+ znode * left)
12215+{
12216+ coord_t from;
12217+ reiser4_key from_key;
12218+
12219+ coord_init_first_unit(&from, to->node);
12220+ item_key_by_coord(&from, &from_key);
12221+
12222+ return cut_node_content(&from, to, &from_key, to_key, NULL);
12223+}
12224+
12225+/* Copy as many of the leading extents as possible from @right to @left,
12226+ allocating unallocated extents as they are copied. Returns SQUEEZE_TARGET_FULL or
12227+ SQUEEZE_SOURCE_EMPTY when no more can be shifted. If the next item is an
12228+ internal item it calls shift_one_internal_unit and may then return
12229+ SUBTREE_MOVED. */
12230+static int squeeze_right_twig(znode * left, znode * right, flush_pos_t * pos)
12231+{
12232+ int ret = SUBTREE_MOVED;
12233+ coord_t coord; /* used to iterate over items */
12234+ reiser4_key stop_key;
12235+
12236+ assert("jmacd-2008", !node_is_empty(right));
12237+ coord_init_first_unit(&coord, right);
12238+
12239+ /* FIXME: can be optimized to cut once */
12240+ while (!node_is_empty(coord.node) && item_is_extent(&coord)) {
12241+ ON_DEBUG(void *vp);
12242+
12243+ assert("vs-1468", coord_is_leftmost_unit(&coord));
12244+ ON_DEBUG(vp = shift_check_prepare(left, coord.node));
12245+
12246+ /* stop_key is used to find what was copied and what to cut */
12247+ stop_key = *min_key();
12248+ ret = squalloc_extent(left, &coord, pos, &stop_key);
12249+ if (ret != SQUEEZE_CONTINUE) {
12250+ ON_DEBUG(kfree(vp));
12251+ break;
12252+ }
12253+ assert("vs-1465", !keyeq(&stop_key, min_key()));
12254+
12255+ /* Helper function to do the cutting. */
12256+ set_key_offset(&stop_key, get_key_offset(&stop_key) - 1);
12257+ check_me("vs-1466",
12258+ squalloc_right_twig_cut(&coord, &stop_key, left) == 0);
12259+
12260+ ON_DEBUG(shift_check(vp, left, coord.node));
12261+ }
12262+
12263+ if (node_is_empty(coord.node))
12264+ ret = SQUEEZE_SOURCE_EMPTY;
12265+
12266+ if (ret == SQUEEZE_TARGET_FULL) {
12267+ goto out;
12268+ }
12269+
12270+ if (node_is_empty(right)) {
12271+ /* The whole right node was copied into @left. */
12272+ assert("vs-464", ret == SQUEEZE_SOURCE_EMPTY);
12273+ goto out;
12274+ }
12275+
12276+ coord_init_first_unit(&coord, right);
12277+
12278+ if (!item_is_internal(&coord)) {
12279+ /* we do not want to squeeze anything else to the left neighbor because
12280+ the "slum" is over */
12281+ ret = SQUEEZE_TARGET_FULL;
12282+ goto out;
12283+ }
12284+ assert("jmacd-433", item_is_internal(&coord));
12285+
12286+ /* Shift an internal unit. The child must be allocated before shifting any more
12287+ extents, so we stop here. */
12288+ ret = shift_one_internal_unit(left, right);
12289+
12290+ out:
12291+ assert("jmacd-8612", ret < 0 || ret == SQUEEZE_TARGET_FULL
12292+ || ret == SUBTREE_MOVED || ret == SQUEEZE_SOURCE_EMPTY);
12293+
12294+ if (ret == SQUEEZE_TARGET_FULL) {
12295+ /* We submit prepped nodes here and expect that this @left twig
12296+ * will not be modified again during this jnode_flush() call. */
12297+ int ret1;
12298+
12299+ /* NOTE: it seems that I/O is done under long-term locks. */
12300+ ret1 = write_prepped_nodes(pos);
12301+ if (ret1 < 0)
12302+ return ret1;
12303+ }
12304+
12305+ return ret;
12306+}
12307+
12308+#if REISER4_DEBUG
12309+static void item_convert_invariant(flush_pos_t * pos)
12310+{
12311+ assert("edward-1225", coord_is_existing_item(&pos->coord));
12312+ if (chaining_data_present(pos)) {
12313+ item_plugin *iplug = item_convert_plug(pos);
12314+
12315+ assert("edward-1000",
12316+ iplug == item_plugin_by_coord(&pos->coord));
12317+ assert("edward-1001", iplug->f.convert != NULL);
12318+ } else
12319+ assert("edward-1226", pos->child == NULL);
12320+}
12321+#else
12322+
12323+#define item_convert_invariant(pos) noop
12324+
12325+#endif
12326+
12327+/* Scan node items starting from the first one and apply to each
12328+ item its flush ->convert() method (if any). This method may
12329+ resize/kill the item, so the tree will be changed.
12330+*/
12331+static int convert_node(flush_pos_t * pos, znode * node)
12332+{
12333+ int ret = 0;
12334+ item_plugin *iplug;
12335+
12336+ assert("edward-304", pos != NULL);
12337+ assert("edward-305", pos->child == NULL);
12338+ assert("edward-475", znode_convertible(node));
12339+ assert("edward-669", znode_is_wlocked(node));
12340+ assert("edward-1210", !node_is_empty(node));
12341+
12342+ if (znode_get_level(node) != LEAF_LEVEL)
12343+ /* unsupported */
12344+ goto exit;
12345+
12346+ coord_init_first_unit(&pos->coord, node);
12347+
12348+ while (1) {
12349+ ret = 0;
12350+ coord_set_to_left(&pos->coord);
12351+ item_convert_invariant(pos);
12352+
12353+ iplug = item_plugin_by_coord(&pos->coord);
12354+ assert("edward-844", iplug != NULL);
12355+
12356+ if (iplug->f.convert) {
12357+ ret = iplug->f.convert(pos);
12358+ if (ret)
12359+ goto exit;
12360+ }
12361+ assert("edward-307", pos->child == NULL);
12362+
12363+ if (coord_next_item(&pos->coord)) {
12364+ /* node is over */
12365+
12366+ if (!chaining_data_present(pos))
12367+ /* finished this node */
12368+ break;
12369+ if (should_chain_next_node(pos)) {
12370+ /* go to next node */
12371+ move_chaining_data(pos, 0 /* to next node */ );
12372+ break;
12373+ }
12374+ /* repeat this node */
12375+ move_chaining_data(pos, 1 /* this node */ );
12376+ continue;
12377+ }
12378+ /* The node is not over.
12379+ Check if there is attached convert data.
12380+ If so, roll one item position back and repeat
12381+ on this node.
12382+ */
12383+ if (chaining_data_present(pos)) {
12384+
12385+ if (iplug != item_plugin_by_coord(&pos->coord))
12386+ set_item_convert_count(pos, 0);
12387+
12388+ ret = coord_prev_item(&pos->coord);
12389+ assert("edward-1003", !ret);
12390+
12391+ move_chaining_data(pos, 1 /* this node */ );
12392+ }
12393+ }
12394+ JF_CLR(ZJNODE(node), JNODE_CONVERTIBLE);
12395+ znode_make_dirty(node);
12396+ exit:
12397+ assert("edward-1004", !ret);
12398+ return ret;
12399+}
12400+
12401+/* Squeeze and allocate the right neighbor. This is called after @left and
12402+ its current children have been squeezed and allocated already. This
12403+ procedure's job is to squeeze items from @right to @left.
12404+
12405+ If at the leaf level, use the shift_everything_left memcpy-optimized
12406+ version of shifting (squeeze_right_leaf).
12407+
12408+ If at the twig level, extents are allocated as they are shifted from @right
12409+ to @left (squalloc_right_twig).
12410+
12411+ At any other level, shift one internal item and return to the caller
12412+ (squalloc_parent_first) so that the shifted-subtree can be processed in
12413+ parent-first order.
12414+
12415+ When unit of internal item is moved, squeezing stops and SUBTREE_MOVED is
12416+ returned. When all content of @right is squeezed, SQUEEZE_SOURCE_EMPTY is
12417+ returned. If nothing can be moved into @left anymore, SQUEEZE_TARGET_FULL
12418+ is returned.
12419+*/
12420+
12421+static int squeeze_right_neighbor(flush_pos_t * pos, znode * left,
12422+ znode * right)
12423+{
12424+ int ret;
12425+
12426+ /* FIXME: it is possible to see an empty hasn't-heard-banshee node in a
12427+ * tree owing to an error (for example, ENOSPC) during write */
12428+ /* assert("jmacd-9321", !node_is_empty(left)); */
12429+ assert("jmacd-9322", !node_is_empty(right));
12430+ assert("jmacd-9323", znode_get_level(left) == znode_get_level(right));
12431+
12432+ switch (znode_get_level(left)) {
12433+ case TWIG_LEVEL:
12434+ /* Shift with extent allocating until either an internal item
12435+ is encountered or everything is shifted or no free space
12436+ left in @left */
12437+ ret = squeeze_right_twig(left, right, pos);
12438+ break;
12439+
12440+ default:
12441+ /* All other levels can use shift_everything until we implement per-item
12442+ flush plugins. */
12443+ ret = squeeze_right_non_twig(left, right);
12444+ break;
12445+ }
12446+
12447+ assert("jmacd-2011", (ret < 0 ||
12448+ ret == SQUEEZE_SOURCE_EMPTY
12449+ || ret == SQUEEZE_TARGET_FULL
12450+ || ret == SUBTREE_MOVED));
12451+ return ret;
12452+}
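+
+/* A caller-side view of the three non-error results, as a hypothetical
+ sketch: SUBTREE_MOVED means an internal unit crossed to @left and its
+ subtree must be processed before squeezing continues; the other two mean
+ squeezing between this pair of nodes is finished:
+
+ ret = squeeze_right_neighbor(pos, left, right);
+ if (ret == SUBTREE_MOVED)
+ ... allocate the shifted subtree in parent-first order ...
+ else if (ret == SQUEEZE_SOURCE_EMPTY)
+ ... @right is empty and can be bypassed ...
+ else if (ret == SQUEEZE_TARGET_FULL)
+ ... @left is full; advance the position to @right ...
+*/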
12453+
12454+static int squeeze_right_twig_and_advance_coord(flush_pos_t * pos,
12455+ znode * right)
12456+{
12457+ int ret;
12458+
12459+ ret = squeeze_right_twig(pos->lock.node, right, pos);
12460+ if (ret < 0)
12461+ return ret;
12462+ if (ret > 0) {
12463+ coord_init_after_last_item(&pos->coord, pos->lock.node);
12464+ return ret;
12465+ }
12466+
12467+ coord_init_last_unit(&pos->coord, pos->lock.node);
12468+ return 0;
12469+}
12470+
12471+/* forward declaration */
12472+static int squalloc_upper_levels(flush_pos_t *, znode *, znode *);
12473+
12474+/* do a fast check for "same parents" condition before calling
12475+ * squalloc_upper_levels() */
12476+static inline int check_parents_and_squalloc_upper_levels(flush_pos_t * pos,
12477+ znode * left,
12478+ znode * right)
12479+{
12480+ if (znode_same_parents(left, right))
12481+ return 0;
12482+
12483+ return squalloc_upper_levels(pos, left, right);
12484+}
12485+
12486+/* Check whether the parent of the given @right node needs to be processed
12487+ ((re)allocated) prior to processing of the child. If @left and @right do not
12488+ share a parent, then the parent of @right comes after @left but before
12489+ @right in parent-first order, and we have to (re)allocate it before @right
12490+ gets (re)allocated. */
12491+static int squalloc_upper_levels(flush_pos_t * pos, znode * left, znode * right)
12492+{
12493+ int ret;
12494+
12495+ lock_handle left_parent_lock;
12496+ lock_handle right_parent_lock;
12497+
12498+ load_count left_parent_load;
12499+ load_count right_parent_load;
12500+
12501+ init_lh(&left_parent_lock);
12502+ init_lh(&right_parent_lock);
12503+
12504+ init_load_count(&left_parent_load);
12505+ init_load_count(&right_parent_load);
12506+
12507+ ret = reiser4_get_parent(&left_parent_lock, left, ZNODE_WRITE_LOCK);
12508+ if (ret)
12509+ goto out;
12510+
12511+ ret = reiser4_get_parent(&right_parent_lock, right, ZNODE_WRITE_LOCK);
12512+ if (ret)
12513+ goto out;
12514+
12515+ /* Check for same parents */
12516+ if (left_parent_lock.node == right_parent_lock.node)
12517+ goto out;
12518+
12519+ if (znode_check_flushprepped(right_parent_lock.node)) {
12520+ /* Keep parent-first order. In that order, the right parent node stands
12521+ before the @right node. If it is already allocated, we set the
12522+ preceder (the starting point of the next block search) to its block
12523+ number; the @right node should be allocated after it.
12524+
12525+ However, the preceder is set only if the right parent is on the twig
12526+ level. The explanation is the following: new branch nodes are allocated
12527+ over already allocated children while the tree grows, so it is difficult
12528+ to keep the tree ordered, and we assume that only leaves and twigs are
12529+ correctly allocated. So, only twigs are used as a preceder for
12530+ allocating the rest of the slum. */
12531+ if (znode_get_level(right_parent_lock.node) == TWIG_LEVEL) {
12532+ pos->preceder.blk =
12533+ *znode_get_block(right_parent_lock.node);
12534+ check_preceder(pos->preceder.blk);
12535+ }
12536+ goto out;
12537+ }
12538+
12539+ ret = incr_load_count_znode(&left_parent_load, left_parent_lock.node);
12540+ if (ret)
12541+ goto out;
12542+
12543+ ret = incr_load_count_znode(&right_parent_load, right_parent_lock.node);
12544+ if (ret)
12545+ goto out;
12546+
12547+ ret =
12548+ squeeze_right_neighbor(pos, left_parent_lock.node,
12549+ right_parent_lock.node);
12550+ /* We stop on error. We also stop if some items/units were shifted (ret == 0)
12551+ * and thus @right changed its parent; it means we do not have to process
12552+ * the right_parent node prior to processing @right. Positive return
12553+ * values say that shifting items did not happen, because of the "empty
12554+ * source" or "target full" conditions. */
12555+ if (ret <= 0)
12556+ goto out;
12557+
12558+ /* parent(@left) and parent(@right) may have different parents also. We
12559+ * do a recursive call for checking that. */
12560+ ret =
12561+ check_parents_and_squalloc_upper_levels(pos, left_parent_lock.node,
12562+ right_parent_lock.node);
12563+ if (ret)
12564+ goto out;
12565+
12566+ /* allocate znode when going down */
12567+ ret = lock_parent_and_allocate_znode(right_parent_lock.node, pos);
12568+
12569+ out:
12570+ done_load_count(&left_parent_load);
12571+ done_load_count(&right_parent_load);
12572+
12573+ done_lh(&left_parent_lock);
12574+ done_lh(&right_parent_lock);
12575+
12576+ return ret;
12577+}
12578+
12579+/* Check the leftmost child's "flushprepped" status; also returns true if the
12580+ * child node was not found in the cache. */
12581+static int leftmost_child_of_unit_check_flushprepped(const coord_t * coord)
12582+{
12583+ int ret;
12584+ int prepped;
12585+
12586+ jnode *child;
12587+
12588+ ret = get_leftmost_child_of_unit(coord, &child);
12589+
12590+ if (ret)
12591+ return ret;
12592+
12593+ if (child) {
12594+ prepped = jnode_check_flushprepped(child);
12595+ jput(child);
12596+ } else {
12597+ /* We treat a nonexistent child as a node to which slum
12598+ processing should not continue. A node that is not cached is
12599+ clean, so it is flushprepped. */
12600+ prepped = 1;
12601+ }
12602+
12603+ return prepped;
12604+}
12605+
12606+/* (re)allocate a znode, obtaining its parent node automatically */
12607+static int lock_parent_and_allocate_znode(znode * node, flush_pos_t * pos)
12608+{
12609+ int ret;
12610+ lock_handle parent_lock;
12611+ load_count parent_load;
12612+ coord_t pcoord;
12613+
12614+ assert("zam-851", znode_is_write_locked(node));
12615+
12616+ init_lh(&parent_lock);
12617+ init_load_count(&parent_load);
12618+
12619+ ret = reiser4_get_parent(&parent_lock, node, ZNODE_WRITE_LOCK);
12620+ if (ret)
12621+ goto out;
12622+
12623+ ret = incr_load_count_znode(&parent_load, parent_lock.node);
12624+ if (ret)
12625+ goto out;
12626+
12627+ ret = find_child_ptr(parent_lock.node, node, &pcoord);
12628+ if (ret)
12629+ goto out;
12630+
12631+ ret = allocate_znode(node, &pcoord, pos);
12632+
12633+ out:
12634+ done_load_count(&parent_load);
12635+ done_lh(&parent_lock);
12636+ return ret;
12637+}
12638+
12639+/* Process formatted nodes on the current level until an unformatted node or
12640+ * the rightmost node in the slum is reached. */
12641+static int handle_pos_on_formatted(flush_pos_t * pos)
12642+{
12643+ int ret;
12644+ lock_handle right_lock;
12645+ load_count right_load;
12646+
12647+ init_lh(&right_lock);
12648+ init_load_count(&right_load);
12649+
12650+ if (should_convert_node(pos, pos->lock.node)) {
12651+ ret = convert_node(pos, pos->lock.node);
12652+ if (ret)
12653+ return ret;
12654+ }
12655+
12656+ while (1) {
12657+ ret =
12658+ neighbor_in_slum(pos->lock.node, &right_lock, RIGHT_SIDE,
12659+ ZNODE_WRITE_LOCK,
12660+ !should_convert_next_node(pos,
12661+ right_lock.
12662+ node));
12663+ if (ret)
12664+ break;
12665+
12666+ /* We don't prep (allocate) nodes for flushing twice. This may or may not
12667+ * be optimal. For now we choose to live with the risk that it will
12668+ * be suboptimal, because it would be quite complex to code it to be
12669+ * smarter. */
12670+ if (znode_check_flushprepped(right_lock.node)
12671+ && !znode_convertible(right_lock.node)) {
12672+ assert("edward-1005",
12673+ !should_convert_next_node(pos, right_lock.node));
12674+ pos_stop(pos);
12675+ break;
12676+ }
12677+
12678+ ret = incr_load_count_znode(&right_load, right_lock.node);
12679+ if (ret)
12680+ break;
12681+
12682+ if (should_convert_node(pos, right_lock.node)) {
12683+ ret = convert_node(pos, right_lock.node);
12684+ if (ret)
12685+ break;
12686+ if (node_is_empty(right_lock.node)) {
12687+ /* node became empty after converting, repeat */
12688+ done_load_count(&right_load);
12689+ done_lh(&right_lock);
12690+ continue;
12691+ }
12692+ }
12693+
12694+ /* squeeze _before_ going upward. */
12695+ ret =
12696+ squeeze_right_neighbor(pos, pos->lock.node,
12697+ right_lock.node);
12698+ if (ret < 0)
12699+ break;
12700+
12701+ if (znode_check_flushprepped(right_lock.node)) {
12702+ if (should_convert_next_node(pos, right_lock.node)) {
12703+ /* in spite of the flushprepped status of the node,
12704+ this right slum neighbor still has to be converted */
12705+ assert("edward-953", convert_data(pos));
12706+ assert("edward-954", item_convert_data(pos));
12707+
12708+ if (node_is_empty(right_lock.node)) {
12709+ done_load_count(&right_load);
12710+ done_lh(&right_lock);
12711+ } else
12712+ move_flush_pos(pos, &right_lock,
12713+ &right_load, NULL);
12714+ continue;
12715+ }
12716+ pos_stop(pos);
12717+ break;
12718+ }
12719+
12720+ if (node_is_empty(right_lock.node)) {
12721+ /* repeat if right node was squeezed completely */
12722+ done_load_count(&right_load);
12723+ done_lh(&right_lock);
12724+ continue;
12725+ }
12726+
12727+ /* parent(right_lock.node) has to be processed before
12728+ * (right_lock.node) due to "parent-first" allocation order. */
12729+ ret =
12730+ check_parents_and_squalloc_upper_levels(pos, pos->lock.node,
12731+ right_lock.node);
12732+ if (ret)
12733+ break;
12734+ /* (re)allocate _after_ going upward */
12735+ ret = lock_parent_and_allocate_znode(right_lock.node, pos);
12736+ if (ret)
12737+ break;
12738+
12739+ if (should_terminate_squalloc(pos)) {
12740+ set_item_convert_count(pos, 0);
12741+ break;
12742+ }
12743+
12744+ /* advance the flush position to the right neighbor */
12745+ move_flush_pos(pos, &right_lock, &right_load, NULL);
12746+
12747+ ret = rapid_flush(pos);
12748+ if (ret)
12749+ break;
12750+ }
12751+
12752+ assert("edward-1006", !convert_data(pos) || !item_convert_data(pos));
12753+
12754+ done_load_count(&right_load);
12755+ done_lh(&right_lock);
12756+
12757+ /* This function indicates via pos whether to stop, go to the twig level, or
12758+ * continue on the current level. */
12759+ return ret;
12760+
12761+}
12762+
12763+/* Process nodes on the leaf level until an unformatted node or the rightmost
12764+ * node in the slum is reached. */
12765+static int handle_pos_on_leaf(flush_pos_t * pos)
12766+{
12767+ int ret;
12768+
12769+ assert("zam-845", pos->state == POS_ON_LEAF);
12770+
12771+ ret = handle_pos_on_formatted(pos);
12772+
12773+ if (ret == -E_NO_NEIGHBOR) {
12774+ /* cannot get right neighbor, go process extents. */
12775+ pos->state = POS_TO_TWIG;
12776+ return 0;
12777+ }
12778+
12779+ return ret;
12780+}
12781+
12782+/* Process slum on level > 1 */
12783+static int handle_pos_on_internal(flush_pos_t * pos)
12784+{
12785+ assert("zam-850", pos->state == POS_ON_INTERNAL);
12786+ return handle_pos_on_formatted(pos);
12787+}
12788+
12789+/* check whether squalloc should stop before processing given extent */
12790+static int squalloc_extent_should_stop(flush_pos_t * pos)
12791+{
12792+ assert("zam-869", item_is_extent(&pos->coord));
12793+
12794+ /* pos->child is the jnode that handle_pos_on_extent() should start with,
12795+ * instead of the first child of the first extent unit. */
12796+ if (pos->child) {
12797+ int prepped;
12798+
12799+ assert("vs-1383", jnode_is_unformatted(pos->child));
12800+ prepped = jnode_check_flushprepped(pos->child);
12801+ pos->pos_in_unit =
12802+ jnode_get_index(pos->child) -
12803+ extent_unit_index(&pos->coord);
12804+ assert("vs-1470",
12805+ pos->pos_in_unit < extent_unit_width(&pos->coord));
12806+ assert("nikita-3434",
12807+ ergo(extent_is_unallocated(&pos->coord),
12808+ pos->pos_in_unit == 0));
12809+ jput(pos->child);
12810+ pos->child = NULL;
12811+
12812+ return prepped;
12813+ }
12814+
12815+ pos->pos_in_unit = 0;
12816+ if (extent_is_unallocated(&pos->coord))
12817+ return 0;
12818+
12819+ return leftmost_child_of_unit_check_flushprepped(&pos->coord);
12820+}
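+
+/* Worked example of the pos_in_unit computation above, with hypothetical
+ numbers: if the current extent unit starts at file-page index 100 and is 64
+ pages wide, and pos->child is the jnode of page 110, then pos_in_unit
+ becomes 110 - 100 == 10, which satisfies the "within the unit" assertion. */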
12821+
12822+/* Handle the case when the regular reiser4 tree (znodes connected to their
12823+ * neighbors by sibling pointers) is interrupted on the leaf level by one or
12824+ * more unformatted nodes. By holding a lock on the twig level and using extent
12825+ * code routines to process the unformatted nodes, we navigate around an
12826+ * irregular part of the reiser4 tree. */
12827+static int handle_pos_on_twig(flush_pos_t * pos)
12828+{
12829+ int ret;
12830+
12831+ assert("zam-844", pos->state == POS_ON_EPOINT);
12832+ assert("zam-843", item_is_extent(&pos->coord));
12833+
12834+ /* We decide whether to continue slum processing with the current extent
12835+ unit: if the leftmost child of the current extent unit is flushprepped
12836+ (i.e. clean or already processed by flush) we stop squalloc(). There
12837+ is a fast check for unallocated extents, which we assume contain only
12838+ not-yet-flushprepped nodes. */
12839+ /* FIXME: Here we implement a simple check; we only look at the
12840+ leftmost child. */
12841+ ret = squalloc_extent_should_stop(pos);
12842+ if (ret != 0) {
12843+ pos_stop(pos);
12844+ return ret;
12845+ }
12846+
12847+ while (pos_valid(pos) && coord_is_existing_unit(&pos->coord)
12848+ && item_is_extent(&pos->coord)) {
12849+ ret = alloc_extent(pos);
12850+ if (ret) {
12851+ break;
12852+ }
12853+ coord_next_unit(&pos->coord);
12854+ }
12855+
12856+ if (coord_is_after_rightmost(&pos->coord)) {
12857+ pos->state = POS_END_OF_TWIG;
12858+ return 0;
12859+ }
12860+ if (item_is_internal(&pos->coord)) {
12861+ pos->state = POS_TO_LEAF;
12862+ return 0;
12863+ }
12864+
12865+ assert("zam-860", item_is_extent(&pos->coord));
12866+
12867+ /* "slum" is over */
12868+ pos->state = POS_INVALID;
12869+ return 0;
12870+}
12871+
12872+/* When we are about to return the flush position from the twig to the leaf level,
12873+ * we can either process the right twig node or move the position to the leaf. This
12874+ * processes the right twig if possible and jumps to the leaf level if not. */
12875+static int handle_pos_end_of_twig(flush_pos_t * pos)
12876+{
12877+ int ret;
12878+ lock_handle right_lock;
12879+ load_count right_load;
12880+ coord_t at_right;
12881+ jnode *child = NULL;
12882+
12883+ assert("zam-848", pos->state == POS_END_OF_TWIG);
12884+ assert("zam-849", coord_is_after_rightmost(&pos->coord));
12885+
12886+ init_lh(&right_lock);
12887+ init_load_count(&right_load);
12888+
12889+ /* We take a lock on the right twig node even if it is not dirty, because
12890+ * the slum continues or discontinues on the leaf level, not on the next
12891+ * twig. This lock on the right twig is needed for getting its leftmost child. */
12892+ ret =
12893+ reiser4_get_right_neighbor(&right_lock, pos->lock.node,
12894+ ZNODE_WRITE_LOCK, GN_SAME_ATOM);
12895+ if (ret)
12896+ goto out;
12897+
12898+ ret = incr_load_count_znode(&right_load, right_lock.node);
12899+ if (ret)
12900+ goto out;
12901+
12902+ /* the right twig may not be dirty */
12903+ if (JF_ISSET(ZJNODE(right_lock.node), JNODE_DIRTY)) {
12904+ /* If the right twig node is dirty we always attempt to squeeze its
12905+ * content to the left... */
12906+ became_dirty:
12907+ ret =
12908+ squeeze_right_twig_and_advance_coord(pos, right_lock.node);
12909+ if (ret <= 0) {
12910+ /* pos->coord is on an internal item: go to the leaf level, or
12911+ * we have an error which will be caught in squalloc() */
12912+ pos->state = POS_TO_LEAF;
12913+ goto out;
12914+ }
12915+
12916+ /* If the right twig was squeezed completely we have to re-lock the
12917+ * right twig; now it is done through the top-level squalloc
12918+ * routine. */
12919+ if (node_is_empty(right_lock.node))
12920+ goto out;
12921+
12922+ /* ... and prep it if it is not yet prepped */
12923+ if (!znode_check_flushprepped(right_lock.node)) {
12924+ /* As usual, process parent before ... */
12925+ ret =
12926+ check_parents_and_squalloc_upper_levels(pos,
12927+ pos->lock.
12928+ node,
12929+ right_lock.
12930+ node);
12931+ if (ret)
12932+ goto out;
12933+
12934+ /* ... processing the child */
12935+ ret =
12936+ lock_parent_and_allocate_znode(right_lock.node,
12937+ pos);
12938+ if (ret)
12939+ goto out;
12940+ }
12941+ } else {
12942+ coord_init_first_unit(&at_right, right_lock.node);
12943+
12944+ /* check the first child of the next twig: should we continue there? */
12945+ ret = get_leftmost_child_of_unit(&at_right, &child);
12946+ if (ret || child == NULL || jnode_check_flushprepped(child)) {
12947+ pos_stop(pos);
12948+ goto out;
12949+ }
12950+
12951+ /* check clean twig for possible relocation */
12952+ if (!znode_check_flushprepped(right_lock.node)) {
12953+ ret =
12954+ reverse_relocate_check_dirty_parent(child,
12955+ &at_right, pos);
12956+ if (ret)
12957+ goto out;
12958+ if (JF_ISSET(ZJNODE(right_lock.node), JNODE_DIRTY))
12959+ goto became_dirty;
12960+ }
12961+ }
12962+
12963+ assert("zam-875", znode_check_flushprepped(right_lock.node));
12964+
12965+	/* Update the preceder with the block number of the just-processed right
12966+	 * twig node. The code above could miss the preceder update because
12967+	 * allocate_znode() might not have been called for this node. */
12968+ pos->preceder.blk = *znode_get_block(right_lock.node);
12969+ check_preceder(pos->preceder.blk);
12970+
12971+ coord_init_first_unit(&at_right, right_lock.node);
12972+ assert("zam-868", coord_is_existing_unit(&at_right));
12973+
12974+ pos->state = item_is_extent(&at_right) ? POS_ON_EPOINT : POS_TO_LEAF;
12975+ move_flush_pos(pos, &right_lock, &right_load, &at_right);
12976+
12977+ out:
12978+ done_load_count(&right_load);
12979+ done_lh(&right_lock);
12980+
12981+ if (child)
12982+ jput(child);
12983+
12984+ return ret;
12985+}
12986+
12987+/* Move pos->lock to the leaf node pointed to by pos->coord and check whether
12988+ * we should continue there. */
12989+static int handle_pos_to_leaf(flush_pos_t * pos)
12990+{
12991+ int ret;
12992+ lock_handle child_lock;
12993+ load_count child_load;
12994+ jnode *child;
12995+
12996+ assert("zam-846", pos->state == POS_TO_LEAF);
12997+ assert("zam-847", item_is_internal(&pos->coord));
12998+
12999+ init_lh(&child_lock);
13000+ init_load_count(&child_load);
13001+
13002+ ret = get_leftmost_child_of_unit(&pos->coord, &child);
13003+ if (ret)
13004+ return ret;
13005+ if (child == NULL) {
13006+ pos_stop(pos);
13007+ return 0;
13008+ }
13009+
13010+ if (jnode_check_flushprepped(child)) {
13011+ pos->state = POS_INVALID;
13012+ goto out;
13013+ }
13014+
13015+ ret =
13016+ longterm_lock_znode(&child_lock, JZNODE(child), ZNODE_WRITE_LOCK,
13017+ ZNODE_LOCK_LOPRI);
13018+ if (ret)
13019+ goto out;
13020+
13021+ ret = incr_load_count_znode(&child_load, JZNODE(child));
13022+ if (ret)
13023+ goto out;
13024+
13025+ ret = allocate_znode(JZNODE(child), &pos->coord, pos);
13026+ if (ret)
13027+ goto out;
13028+
13029+ /* move flush position to leaf level */
13030+ pos->state = POS_ON_LEAF;
13031+ move_flush_pos(pos, &child_lock, &child_load, NULL);
13032+
13033+ if (node_is_empty(JZNODE(child))) {
13034+ ret = delete_empty_node(JZNODE(child));
13035+ pos->state = POS_INVALID;
13036+ }
13037+ out:
13038+ done_load_count(&child_load);
13039+ done_lh(&child_lock);
13040+ jput(child);
13041+
13042+ return ret;
13043+}
13044+
13045+/* Move pos from the leaf to the twig level: move pos->lock to the upper
13046+ * (twig) level. */
13047+static int handle_pos_to_twig(flush_pos_t * pos)
13048+{
13049+ int ret;
13050+
13051+ lock_handle parent_lock;
13052+ load_count parent_load;
13053+ coord_t pcoord;
13054+
13055+ assert("zam-852", pos->state == POS_TO_TWIG);
13056+
13057+ init_lh(&parent_lock);
13058+ init_load_count(&parent_load);
13059+
13060+ ret =
13061+ reiser4_get_parent(&parent_lock, pos->lock.node, ZNODE_WRITE_LOCK);
13062+ if (ret)
13063+ goto out;
13064+
13065+ ret = incr_load_count_znode(&parent_load, parent_lock.node);
13066+ if (ret)
13067+ goto out;
13068+
13069+ ret = find_child_ptr(parent_lock.node, pos->lock.node, &pcoord);
13070+ if (ret)
13071+ goto out;
13072+
13073+ assert("zam-870", item_is_internal(&pcoord));
13074+ coord_next_item(&pcoord);
13075+
13076+ if (coord_is_after_rightmost(&pcoord))
13077+ pos->state = POS_END_OF_TWIG;
13078+ else if (item_is_extent(&pcoord))
13079+ pos->state = POS_ON_EPOINT;
13080+ else {
13081+		/* Here we understand that getting -E_NO_NEIGHBOR in
13082+		 * handle_pos_on_leaf() was just because we reached the edge
13083+		 * of the slum */
13084+ pos_stop(pos);
13085+ goto out;
13086+ }
13087+
13088+ move_flush_pos(pos, &parent_lock, &parent_load, &pcoord);
13089+
13090+ out:
13091+ done_load_count(&parent_load);
13092+ done_lh(&parent_lock);
13093+
13094+ return ret;
13095+}
13096+
13097+typedef int (*pos_state_handle_t) (flush_pos_t *);
13098+static pos_state_handle_t flush_pos_handlers[] = {
13099+ /* process formatted nodes on leaf level, keep lock on a leaf node */
13100+ [POS_ON_LEAF] = handle_pos_on_leaf,
13101+ /* process unformatted nodes, keep lock on twig node, pos->coord points to extent currently
13102+ * being processed */
13103+ [POS_ON_EPOINT] = handle_pos_on_twig,
13104+ /* move a lock from leaf node to its parent for further processing of unformatted nodes */
13105+ [POS_TO_TWIG] = handle_pos_to_twig,
13106+ /* move a lock from twig to leaf level when a processing of unformatted nodes finishes,
13107+ * pos->coord points to the leaf node we jump to */
13108+ [POS_TO_LEAF] = handle_pos_to_leaf,
13109+	/* after processing the last extent in the twig node, attempt to shift items from the twig's
13110+	 * right neighbor and process them while shifting */
13111+ [POS_END_OF_TWIG] = handle_pos_end_of_twig,
13112+ /* process formatted nodes on internal level, keep lock on an internal node */
13113+ [POS_ON_INTERNAL] = handle_pos_on_internal
13114+};
13115+
13116+/* Advance flush position horizontally, prepare for flushing ((re)allocate, squeeze,
13117+ * encrypt) nodes and their ancestors in "parent-first" order */
13118+static int squalloc(flush_pos_t * pos)
13119+{
13120+ int ret = 0;
13121+
13122+ /* maybe needs to be made a case statement with handle_pos_on_leaf as first case, for
13123+ * greater CPU efficiency? Measure and see.... -Hans */
13124+ while (pos_valid(pos)) {
13125+ ret = flush_pos_handlers[pos->state] (pos);
13126+ if (ret < 0)
13127+ break;
13128+
13129+ ret = rapid_flush(pos);
13130+ if (ret)
13131+ break;
13132+ }
13133+
13134+	/* any positive value or -E_NO_NEIGHBOR is a legal return code for the handle_pos*
13135+	   routines; -E_NO_NEIGHBOR means that the slum edge was reached */
13136+ if (ret > 0 || ret == -E_NO_NEIGHBOR)
13137+ ret = 0;
13138+
13139+ return ret;
13140+}
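
The table-plus-loop above is a classic state-machine dispatch: squalloc() indexes an array of handler functions by pos->state until the position becomes invalid or a handler fails. Below is a minimal, self-contained sketch of the same pattern; the states, handlers, and terminal condition are hypothetical stand-ins for the reiser4 types, not the real ones.

	#include <stdio.h>

	/* Hypothetical states, playing the role of flushpos_state_t. */
	enum pos_state { POS_A, POS_B, POS_DONE, POS_NR_STATES };

	struct pos { enum pos_state state; };

	static int handle_a(struct pos *p) { p->state = POS_B;    return 0; }
	static int handle_b(struct pos *p) { p->state = POS_DONE; return 0; }

	typedef int (*pos_handler_t)(struct pos *);

	/* Designated initializers index the table by state, exactly as
	 * flush_pos_handlers[] does above. */
	static pos_handler_t handlers[POS_NR_STATES] = {
		[POS_A] = handle_a,
		[POS_B] = handle_b,
	};

	int main(void)
	{
		struct pos pos = { .state = POS_A };

		/* Loop until the terminal state, as squalloc() loops while
		 * pos_valid(pos) holds. */
		while (pos.state != POS_DONE) {
			int ret = handlers[pos.state](&pos);
			if (ret < 0)
				return 1;
		}
		printf("dispatch reached the terminal state\n");
		return 0;
	}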
13141+
13142+static void update_ldkey(znode * node)
13143+{
13144+ reiser4_key ldkey;
13145+
13146+ assert_rw_write_locked(&(znode_get_tree(node)->dk_lock));
13147+ if (node_is_empty(node))
13148+ return;
13149+
13150+ znode_set_ld_key(node, leftmost_key_in_node(node, &ldkey));
13151+}
13152+
13153+/* this is to be called after the node's shift method has shifted data from @right to
13154+   @left. It sets the left delimiting keys of @left and @right to the keys of their first
13155+   items respectively, and sets the right delimiting key of @left to the first key of @right */
13156+static void update_znode_dkeys(znode * left, znode * right)
13157+{
13158+ assert_rw_write_locked(&(znode_get_tree(right)->dk_lock));
13159+ assert("vs-1629", (znode_is_write_locked(left) &&
13160+ znode_is_write_locked(right)));
13161+
13162+	/* we need to update the left delimiting key of @left if it was empty before the shift */
13163+ update_ldkey(left);
13164+ update_ldkey(right);
13165+ if (node_is_empty(right))
13166+ znode_set_rd_key(left, znode_get_rd_key(right));
13167+ else
13168+ znode_set_rd_key(left, znode_get_ld_key(right));
13169+}
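
The invariant maintained above: after a shift, each node's left delimiting key equals the key of its first item, and the right delimiting key of @left meets the left delimiting key of @right (or inherits @right's right delimiting key when @right was emptied). A toy sketch of that invariant with plain integer keys, assuming a simplified node holding a sorted key array:

	#include <stdio.h>

	/* Toy node: sorted integer keys plus left/right delimiting keys. */
	struct toy_node {
		int keys[8];
		int nr;
		int ld_key, rd_key;
	};

	static void toy_update_dkeys(struct toy_node *left, struct toy_node *right)
	{
		if (left->nr)
			left->ld_key = left->keys[0];
		if (right->nr) {
			right->ld_key = right->keys[0];
			left->rd_key = right->ld_key;   /* right still non-empty */
		} else {
			left->rd_key = right->rd_key;   /* right was emptied */
		}
	}

	int main(void)
	{
		/* State just after key 5 was shifted from right to left. */
		struct toy_node left  = { .keys = {1, 3, 5}, .nr = 3, .ld_key = 1 };
		struct toy_node right = { .keys = {7},       .nr = 1, .rd_key = 9 };

		toy_update_dkeys(&left, &right);
		printf("left: [%d..%d)  right: [%d..%d)\n",
		       left.ld_key, left.rd_key, right.ld_key, right.rd_key);
		/* prints: left: [1..7)  right: [7..9) */
		return 0;
	}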
13170+
13171+/* try to shift everything from @right to @left. If everything was shifted -
13172+ @right is removed from the tree. Result is the number of bytes shifted. */
13173+static int
13174+shift_everything_left(znode * right, znode * left, carry_level * todo)
13175+{
13176+ coord_t from;
13177+ node_plugin *nplug;
13178+ carry_plugin_info info;
13179+
13180+ coord_init_after_last_item(&from, right);
13181+
13182+ nplug = node_plugin_by_node(right);
13183+ info.doing = NULL;
13184+ info.todo = todo;
13185+ return nplug->shift(&from, left, SHIFT_LEFT,
13186+ 1 /* delete @right if it becomes empty */ ,
13187+ 1
13188+ /* move coord @from to node @left if everything will be shifted */
13189+ ,
13190+ &info);
13191+}
13192+
13193+/* Shift as much as possible from @right to @left using the memcpy-optimized
13194+ shift_everything_left. @left and @right are formatted neighboring nodes on
13195+ leaf level. */
13196+static int squeeze_right_non_twig(znode * left, znode * right)
13197+{
13198+ int ret;
13199+ carry_pool *pool;
13200+ carry_level *todo;
13201+
13202+ assert("nikita-2246", znode_get_level(left) == znode_get_level(right));
13203+
13204+ if (!JF_ISSET(ZJNODE(left), JNODE_DIRTY) ||
13205+ !JF_ISSET(ZJNODE(right), JNODE_DIRTY))
13206+ return SQUEEZE_TARGET_FULL;
13207+
13208+ pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*todo));
13209+ if (IS_ERR(pool))
13210+ return PTR_ERR(pool);
13211+ todo = (carry_level *) (pool + 1);
13212+ init_carry_level(todo, pool);
13213+
13214+ ret = shift_everything_left(right, left, todo);
13215+ if (ret > 0) {
13216+ /* something was shifted */
13217+ reiser4_tree *tree;
13218+ __u64 grabbed;
13219+
13220+ znode_make_dirty(left);
13221+ znode_make_dirty(right);
13222+
13223+		/* update the delimiting keys of the nodes which participated
13224+		   in the shift. FIXME: it would be better to have this in the
13225+		   node's shift operation, but it cannot be done there. Nobody
13226+		   remembers why, though */
13227+ tree = znode_get_tree(left);
13228+ write_lock_dk(tree);
13229+ update_znode_dkeys(left, right);
13230+ write_unlock_dk(tree);
13231+
13232+ /* Carry is called to update delimiting key and, maybe, to remove empty
13233+ node. */
13234+ grabbed = get_current_context()->grabbed_blocks;
13235+ ret = reiser4_grab_space_force(tree->height, BA_RESERVED);
13236+ assert("nikita-3003", ret == 0); /* reserved space is exhausted. Ask Hans. */
13237+ ret = carry(todo, NULL /* previous level */ );
13238+ grabbed2free_mark(grabbed);
13239+ } else {
13240+		/* Shifting was impossible; return the appropriate result code */
13241+ ret =
13242+ node_is_empty(right) ? SQUEEZE_SOURCE_EMPTY :
13243+ SQUEEZE_TARGET_FULL;
13244+ }
13245+
13246+ done_carry_pool(pool);
13247+
13248+ return ret;
13249+}
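
The result convention above (a positive byte count is consumed internally; SQUEEZE_SOURCE_EMPTY or SQUEEZE_TARGET_FULL is what the caller sees) is what lets the top-level squeeze loop decide whether to advance to the next source node or stop. A toy model of that source-empty / target-full convention, with byte arrays standing in for nodes and simplified result names:

	#include <stdio.h>
	#include <string.h>

	/* Simplified stand-ins for the SQUEEZE_* result codes. */
	enum { SQ_SOURCE_EMPTY = 1, SQ_TARGET_FULL = 2 };

	struct toy { char data[8]; int used; };

	/* Move as many bytes as fit from @right to @left and report which
	 * side ended the squeeze, as squeeze_right_non_twig() does. */
	static int toy_squeeze(struct toy *left, struct toy *right)
	{
		int room = (int)sizeof(left->data) - left->used;
		int n = room < right->used ? room : right->used;

		memcpy(left->data + left->used, right->data, n);
		memmove(right->data, right->data + n, right->used - n);
		left->used += n;
		right->used -= n;

		return right->used == 0 ? SQ_SOURCE_EMPTY : SQ_TARGET_FULL;
	}

	int main(void)
	{
		struct toy left = { "abcde", 5 }, right = { "XYZ", 3 };

		/* All 3 bytes fit: the source empties first. */
		printf("result=%d left.used=%d\n",
		       toy_squeeze(&left, &right), left.used);
		return 0;
	}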
13250+
13251+#if REISER4_DEBUG
13252+static int sibling_link_is_ok(const znode *left, const znode *right)
13253+{
13254+ int result;
13255+
13256+ read_lock_tree(znode_get_tree(left));
13257+ result = (left->right == right && left == right->left);
13258+ read_unlock_tree(znode_get_tree(left));
13259+ return result;
13260+}
13261+#endif
13262+
13263+/* Shift first unit of first item if it is an internal one. Return
13264+ SQUEEZE_TARGET_FULL if it fails to shift an item, otherwise return
13265+ SUBTREE_MOVED. */
13266+static int shift_one_internal_unit(znode * left, znode * right)
13267+{
13268+ int ret;
13269+ carry_pool *pool;
13270+ carry_level *todo;
13271+ coord_t *coord;
13272+ carry_plugin_info *info;
13273+ int size, moved;
13274+
13275+ assert("nikita-2247", znode_get_level(left) == znode_get_level(right));
13276+ assert("nikita-2435", znode_is_write_locked(left));
13277+ assert("nikita-2436", znode_is_write_locked(right));
13278+ assert("nikita-2434", sibling_link_is_ok(left, right));
13279+
13280+ pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*todo) +
13281+ sizeof(*coord) + sizeof(*info)
13282+#if REISER4_DEBUG
13283+ + sizeof(*coord) + 2 * sizeof(reiser4_key)
13284+#endif
13285+ );
13286+ if (IS_ERR(pool))
13287+ return PTR_ERR(pool);
13288+ todo = (carry_level *) (pool + 1);
13289+ init_carry_level(todo, pool);
13290+
13291+ coord = (coord_t *) (todo + 3);
13292+ coord_init_first_unit(coord, right);
13293+ info = (carry_plugin_info *) (coord + 1);
13294+
13295+#if REISER4_DEBUG
13296+ if (!node_is_empty(left)) {
13297+ coord_t *last;
13298+ reiser4_key *right_key;
13299+ reiser4_key *left_key;
13300+
13301+ last = (coord_t *) (info + 1);
13302+ right_key = (reiser4_key *) (last + 1);
13303+ left_key = right_key + 1;
13304+ coord_init_last_unit(last, left);
13305+
13306+ assert("nikita-2463",
13307+ keyle(item_key_by_coord(last, left_key),
13308+ item_key_by_coord(coord, right_key)));
13309+ }
13310+#endif
13311+
13312+ assert("jmacd-2007", item_is_internal(coord));
13313+
13314+ size = item_length_by_coord(coord);
13315+ info->todo = todo;
13316+ info->doing = NULL;
13317+
13318+ ret = node_plugin_by_node(left)->shift(coord, left, SHIFT_LEFT,
13319+ 1
13320+ /* delete @right if it becomes empty */
13321+ ,
13322+ 0
13323+ /* do not move coord @coord to node @left */
13324+ ,
13325+ info);
13326+
13327+ /* If shift returns positive, then we shifted the item. */
13328+ assert("vs-423", ret <= 0 || size == ret);
13329+ moved = (ret > 0);
13330+
13331+ if (moved) {
13332+ /* something was moved */
13333+ reiser4_tree *tree;
13334+ int grabbed;
13335+
13336+ znode_make_dirty(left);
13337+ znode_make_dirty(right);
13338+ tree = znode_get_tree(left);
13339+ write_lock_dk(tree);
13340+ update_znode_dkeys(left, right);
13341+ write_unlock_dk(tree);
13342+
13343+ /* reserve space for delimiting keys after shifting */
13344+ grabbed = get_current_context()->grabbed_blocks;
13345+ ret = reiser4_grab_space_force(tree->height, BA_RESERVED);
13346+ assert("nikita-3003", ret == 0); /* reserved space is exhausted. Ask Hans. */
13347+
13348+ ret = carry(todo, NULL /* previous level */ );
13349+ grabbed2free_mark(grabbed);
13350+ }
13351+
13352+ done_carry_pool(pool);
13353+
13354+ if (ret != 0) {
13355+ /* Shift or carry operation failed. */
13356+ assert("jmacd-7325", ret < 0);
13357+ return ret;
13358+ }
13359+
13360+ return moved ? SUBTREE_MOVED : SQUEEZE_TARGET_FULL;
13361+}
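
Note how both squeeze helpers make a single init_carry_pool() allocation and then carve the carry level, coord, and plugin info out of its tail with pointer arithmetic (pool + 1, todo + 3, coord + 1). A minimal sketch of this single-allocation idiom, using placeholder structures of identical alignment in place of the reiser4 types:

	#include <stdio.h>
	#include <stdlib.h>

	/* Placeholders for carry_pool, carry_level, coord_t. */
	struct pool  { long reserved; };
	struct level { long reserved; };
	struct coord { long reserved; };

	int main(void)
	{
		/* One allocation sized for everything we need... */
		struct pool *pool = malloc(sizeof(*pool) +
					   3 * sizeof(struct level) +
					   sizeof(struct coord));
		if (!pool)
			return 1;

		/* ...then carve the tail into typed pieces, exactly like
		 * todo = (carry_level *)(pool + 1) and
		 * coord = (coord_t *)(todo + 3) above. */
		struct level *todo  = (struct level *)(pool + 1);
		struct coord *coord = (struct coord *)(todo + 3);

		printf("pool=%p todo=%p coord=%p\n",
		       (void *)pool, (void *)todo, (void *)coord);
		free(pool);	/* one free releases all the pieces */
		return 0;
	}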
13362+
13363+/* Make the final relocate/wander decision during forward parent-first squalloc for a
13364+ znode. For unformatted nodes this is done in plugin/item/extent.c:extent_needs_allocation(). */
13365+static int
13366+allocate_znode_loaded(znode * node,
13367+ const coord_t * parent_coord, flush_pos_t * pos)
13368+{
13369+ int ret;
13370+ reiser4_super_info_data *sbinfo = get_current_super_private();
13371+ /* FIXME(D): We have the node write-locked and should have checked for !
13372+ allocated() somewhere before reaching this point, but there can be a race, so
13373+ this assertion is bogus. */
13374+ assert("jmacd-7987", !jnode_check_flushprepped(ZJNODE(node)));
13375+ assert("jmacd-7988", znode_is_write_locked(node));
13376+ assert("jmacd-7989", coord_is_invalid(parent_coord)
13377+ || znode_is_write_locked(parent_coord->node));
13378+
13379+ if (ZF_ISSET(node, JNODE_REPACK) || ZF_ISSET(node, JNODE_CREATED) ||
13380+ znode_is_root(node) ||
13381+ /* We have enough nodes to relocate no matter what. */
13382+ (pos->leaf_relocate != 0 && znode_get_level(node) == LEAF_LEVEL)) {
13383+		/* No need to decide for new nodes; they are treated the same as
13384+		   relocate. If the root node is dirty, relocate. */
13385+ if (pos->preceder.blk == 0) {
13386+			/* the preceder is unknown and we have decided to relocate the
13387+			   node -- using the default value for the search start is better
13388+			   than searching from block #0. */
13389+ get_blocknr_hint_default(&pos->preceder.blk);
13390+ check_preceder(pos->preceder.blk);
13391+ }
13392+
13393+ goto best_reloc;
13394+
13395+ } else if (pos->preceder.blk == 0) {
13396+ /* If we don't know the preceder, leave it where it is. */
13397+ jnode_make_wander(ZJNODE(node));
13398+ } else {
13399+ /* Make a decision based on block distance. */
13400+ reiser4_block_nr dist;
13401+ reiser4_block_nr nblk = *znode_get_block(node);
13402+
13403+ assert("jmacd-6172", !blocknr_is_fake(&nblk));
13404+ assert("jmacd-6173", !blocknr_is_fake(&pos->preceder.blk));
13405+ assert("jmacd-6174", pos->preceder.blk != 0);
13406+
13407+ if (pos->preceder.blk == nblk - 1) {
13408+ /* Ideal. */
13409+ jnode_make_wander(ZJNODE(node));
13410+ } else {
13411+
13412+ dist =
13413+ (nblk <
13414+ pos->preceder.blk) ? (pos->preceder.blk -
13415+ nblk) : (nblk -
13416+ pos->preceder.blk);
13417+
13418+ /* See if we can find a closer block (forward direction only). */
13419+ pos->preceder.max_dist =
13420+ min((reiser4_block_nr) sbinfo->flush.
13421+ relocate_distance, dist);
13422+ pos->preceder.level = znode_get_level(node);
13423+
13424+ ret = allocate_znode_update(node, parent_coord, pos);
13425+
13426+ pos->preceder.max_dist = 0;
13427+
13428+ if (ret && (ret != -ENOSPC))
13429+ return ret;
13430+
13431+ if (ret == 0) {
13432+ /* Got a better allocation. */
13433+ znode_make_reloc(node, pos->fq);
13434+ } else if (dist < sbinfo->flush.relocate_distance) {
13435+ /* The present allocation is good enough. */
13436+ jnode_make_wander(ZJNODE(node));
13437+ } else {
13438+ /* Otherwise, try to relocate to the best position. */
13439+ best_reloc:
13440+ ret =
13441+ allocate_znode_update(node, parent_coord,
13442+ pos);
13443+ if (ret != 0)
13444+ return ret;
13445+
13446+ /* set JNODE_RELOC bit _after_ node gets allocated */
13447+ znode_make_reloc(node, pos->fq);
13448+ }
13449+ }
13450+ }
13451+
13452+ /* This is the new preceder. */
13453+ pos->preceder.blk = *znode_get_block(node);
13454+ check_preceder(pos->preceder.blk);
13455+ pos->alloc_cnt += 1;
13456+
13457+ assert("jmacd-4277", !blocknr_is_fake(&pos->preceder.blk));
13458+
13459+ return 0;
13460+}
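
The relocate/wander policy above reduces to a distance test against the preceder: stay put (wander) when the current block is adjacent to the preceder or within the relocate distance of it, otherwise try to relocate closer. A simplified, self-contained sketch of just that decision; the real code additionally bounds the allocator search with preceder.max_dist and consults allocate_znode_update()'s result, which this sketch omits:

	#include <stdio.h>

	typedef unsigned long long blocknr_t;

	/* Hypothetical helper mirroring the distance test above;
	 * relocate_distance plays the role of sbinfo->flush.relocate_distance. */
	static const char *decide(blocknr_t preceder, blocknr_t nblk,
				  blocknr_t relocate_distance)
	{
		blocknr_t dist;

		if (preceder == 0)
			return "wander (preceder unknown)";
		if (nblk == preceder + 1)
			return "wander (already adjacent to preceder)";
		dist = nblk < preceder ? preceder - nblk : nblk - preceder;
		return dist < relocate_distance ?
			"wander (close enough)" : "relocate (too far)";
	}

	int main(void)
	{
		printf("%s\n", decide(100, 101, 64)); /* adjacent       -> wander   */
		printf("%s\n", decide(100, 130, 64)); /* dist 30 < 64   -> wander   */
		printf("%s\n", decide(100, 500, 64)); /* dist 400 >= 64 -> relocate */
		return 0;
	}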
13461+
13462+static int
13463+allocate_znode(znode * node, const coord_t * parent_coord, flush_pos_t * pos)
13464+{
13465+ /*
13466+ * perform znode allocation with znode pinned in memory to avoid races
13467+ * with asynchronous emergency flush (which plays with
13468+ * JNODE_FLUSH_RESERVED bit).
13469+ */
13470+ return WITH_DATA(node, allocate_znode_loaded(node, parent_coord, pos));
13471+}
13472+
13473+/* A subroutine of allocate_znode, this is called first to see if there is a close
13474+   position to relocate to. It may return -ENOSPC if there is no close position, in
13475+   which case the node is not relocated. This takes care of updating the parent node
13476+   with the relocated block address. */
13477+static int
13478+allocate_znode_update(znode * node, const coord_t * parent_coord,
13479+ flush_pos_t * pos)
13480+{
13481+ int ret;
13482+ reiser4_block_nr blk;
13483+ lock_handle uber_lock;
13484+ int flush_reserved_used = 0;
13485+ int grabbed;
13486+ reiser4_context *ctx;
13487+ reiser4_super_info_data *sbinfo;
13488+
13489+ init_lh(&uber_lock);
13490+
13491+ ctx = get_current_context();
13492+ sbinfo = get_super_private(ctx->super);
13493+
13494+ grabbed = ctx->grabbed_blocks;
13495+
13496+ /* discard e-flush allocation */
13497+ ret = zload(node);
13498+ if (ret)
13499+ return ret;
13500+
13501+ if (ZF_ISSET(node, JNODE_CREATED)) {
13502+ assert("zam-816", blocknr_is_fake(znode_get_block(node)));
13503+ pos->preceder.block_stage = BLOCK_UNALLOCATED;
13504+ } else {
13505+ pos->preceder.block_stage = BLOCK_GRABBED;
13506+
13507+		/* The disk space for relocating @node is already reserved in the "flush reserved"
13508+		 * counter if @node is a leaf; otherwise we grab space using BA_RESERVED (meaning we
13509+		 * grab space from the whole disk, not from only 95% of it). */
13510+ if (znode_get_level(node) == LEAF_LEVEL) {
13511+ /*
13512+ * earlier (during do_jnode_make_dirty()) we decided
13513+ * that @node can possibly go into overwrite set and
13514+ * reserved block for its wandering location.
13515+ */
13516+ txn_atom *atom = get_current_atom_locked();
13517+ assert("nikita-3449",
13518+ ZF_ISSET(node, JNODE_FLUSH_RESERVED));
13519+ flush_reserved2grabbed(atom, (__u64) 1);
13520+ spin_unlock_atom(atom);
13521+ /*
13522+ * we are trying to move node into relocate
13523+ * set. Allocation of relocated position "uses"
13524+ * reserved block.
13525+ */
13526+ ZF_CLR(node, JNODE_FLUSH_RESERVED);
13527+ flush_reserved_used = 1;
13528+ } else {
13529+ ret = reiser4_grab_space_force((__u64) 1, BA_RESERVED);
13530+ if (ret != 0)
13531+ goto exit;
13532+ }
13533+ }
13534+
13535+	/* We might not use the 5% of reserved disk space here, and then flush will not pack tightly. */
13536+ ret = reiser4_alloc_block(&pos->preceder, &blk,
13537+ BA_FORMATTED | BA_PERMANENT);
13538+ if (ret)
13539+ goto exit;
13540+
13541+ if (!ZF_ISSET(node, JNODE_CREATED) &&
13542+ (ret =
13543+ reiser4_dealloc_block(znode_get_block(node), 0,
13544+ BA_DEFER | BA_FORMATTED)))
13545+ goto exit;
13546+
13547+ if (likely(!znode_is_root(node))) {
13548+ item_plugin *iplug;
13549+
13550+ iplug = item_plugin_by_coord(parent_coord);
13551+ assert("nikita-2954", iplug->f.update != NULL);
13552+ iplug->f.update(parent_coord, &blk);
13553+
13554+ znode_make_dirty(parent_coord->node);
13555+
13556+ } else {
13557+ reiser4_tree *tree = znode_get_tree(node);
13558+ znode *uber;
13559+
13560+ /* We take a longterm lock on the fake node in order to change
13561+ the root block number. This may cause atom fusion. */
13562+ ret = get_uber_znode(tree, ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI,
13563+ &uber_lock);
13564+		/* The fake node cannot be deleted, we must have priority here,
13565+		   and this cannot be confused with ENOSPC. */
13566+ assert("jmacd-74412",
13567+ ret != -EINVAL && ret != -E_DEADLOCK && ret != -ENOSPC);
13568+
13569+ if (ret)
13570+ goto exit;
13571+
13572+ uber = uber_lock.node;
13573+
13574+ write_lock_tree(tree);
13575+ tree->root_block = blk;
13576+ write_unlock_tree(tree);
13577+
13578+ znode_make_dirty(uber);
13579+ }
13580+
13581+ ret = znode_rehash(node, &blk);
13582+ exit:
13583+ if (ret) {
13584+		/* Get the flush reserved block back if something fails, because
13585+		 * callers assume that on error the block wasn't relocated and its
13586+		 * flush reserved block wasn't used. */
13587+ if (flush_reserved_used) {
13588+ /*
13589+ * ok, we failed to move node into relocate
13590+ * set. Restore status quo.
13591+ */
13592+ grabbed2flush_reserved((__u64) 1);
13593+ ZF_SET(node, JNODE_FLUSH_RESERVED);
13594+ }
13595+ }
13596+ zrelse(node);
13597+ done_lh(&uber_lock);
13598+ grabbed2free_mark(grabbed);
13599+ return ret;
13600+}
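
The exit path above restores the flush-reserved block whenever the function fails after consuming it, so callers observe unchanged accounting on error. A toy sketch of that restore-on-failure pattern, with plain counters standing in for the grabbed/flush-reserved block accounting:

	#include <stdio.h>

	/* Toy counters playing the roles of the flush-reserved and
	 * grabbed block counts. */
	static int flush_reserved = 1, grabbed;

	static int do_allocate(int fail)
	{
		int ret = 0, reserved_used = 0;

		/* consume the reserved block, as flush_reserved2grabbed() does */
		flush_reserved--;
		grabbed++;
		reserved_used = 1;

		if (fail) {		/* e.g. the block allocation failed */
			ret = -1;
			goto exit;
		}
	exit:
		if (ret && reserved_used) {
			/* restore status quo so the caller sees no change */
			grabbed--;
			flush_reserved++;
		}
		return ret;
	}

	int main(void)
	{
		do_allocate(1);	/* exercise the failing path */
		printf("after failure: reserved=%d grabbed=%d\n",
		       flush_reserved, grabbed);	/* back to 1 and 0 */
		return 0;
	}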
13601+
13602+/* JNODE INTERFACE */
13603+
13604+/* Lock a node (if formatted) and then get its parent locked, set the child's
13605+ coordinate in the parent. If the child is the root node, the above_root
13606+ znode is returned but the coord is not set. This function may cause atom
13607+ fusion, but it is only used for read locks (at this point) and therefore
13608+ fusion only occurs when the parent is already dirty. */
13609+/* Hans adds this note: remember to ask how expensive this operation is vs. storing parent
13610+ pointer in jnodes. */
13611+static int
13612+jnode_lock_parent_coord(jnode * node,
13613+ coord_t * coord,
13614+ lock_handle * parent_lh,
13615+ load_count * parent_zh,
13616+ znode_lock_mode parent_mode, int try)
13617+{
13618+ int ret;
13619+
13620+ assert("edward-53", jnode_is_unformatted(node) || jnode_is_znode(node));
13621+ assert("edward-54", jnode_is_unformatted(node)
13622+ || znode_is_any_locked(JZNODE(node)));
13623+
13624+ if (!jnode_is_znode(node)) {
13625+ reiser4_key key;
13626+ tree_level stop_level = TWIG_LEVEL;
13627+ lookup_bias bias = FIND_EXACT;
13628+
13629+ assert("edward-168", !(jnode_get_type(node) == JNODE_BITMAP));
13630+
13631+		/* The case when the node is not a znode but can have a parent
13632+		   coord (an unformatted node, a node which represents a cluster
13633+		   page, etc.). Generate a key for the appropriate entry and
13634+		   search the tree using coord_by_key, which handles locking
13635+		   for us. */
13636+
13637+ /*
13638+		 * Nothing is locked at this moment, so nothing prevents a
13639+		 * concurrent truncate from removing the jnode from its inode.
13640+		 * To prevent this, spin-lock the jnode. The jnode can still be
13641+		 * truncated just after the call to jnode_build_key(), but this
13642+		 * is ok, because coord_by_key() will just fail to find the
13643+		 * appropriate extent.
13644+ */
13645+ spin_lock_jnode(node);
13646+ if (!JF_ISSET(node, JNODE_HEARD_BANSHEE)) {
13647+ jnode_build_key(node, &key);
13648+ ret = 0;
13649+ } else
13650+ ret = RETERR(-ENOENT);
13651+ spin_unlock_jnode(node);
13652+
13653+ if (ret != 0)
13654+ return ret;
13655+
13656+ if (jnode_is_cluster_page(node))
13657+ stop_level = LEAF_LEVEL;
13658+
13659+ assert("jmacd-1812", coord != NULL);
13660+
13661+ ret = coord_by_key(jnode_get_tree(node), &key, coord, parent_lh,
13662+ parent_mode, bias, stop_level, stop_level,
13663+ CBK_UNIQUE, NULL /*ra_info */ );
13664+ switch (ret) {
13665+ case CBK_COORD_NOTFOUND:
13666+ assert("edward-1038",
13667+ ergo(jnode_is_cluster_page(node),
13668+ JF_ISSET(node, JNODE_HEARD_BANSHEE)));
13669+ if (!JF_ISSET(node, JNODE_HEARD_BANSHEE))
13670+ warning("nikita-3177", "Parent not found");
13671+ return ret;
13672+ case CBK_COORD_FOUND:
13673+ if (coord->between != AT_UNIT) {
13674+ /* FIXME: comment needed */
13675+ done_lh(parent_lh);
13676+ if (!JF_ISSET(node, JNODE_HEARD_BANSHEE)) {
13677+ warning("nikita-3178",
13678+ "Found but not happy: %i",
13679+ coord->between);
13680+ }
13681+ return RETERR(-ENOENT);
13682+ }
13683+ ret = incr_load_count_znode(parent_zh, parent_lh->node);
13684+ if (ret != 0)
13685+ return ret;
13686+ /* if (jnode_is_cluster_page(node)) {
13687+ races with write() are possible
13688+ check_child_cluster (parent_lh->node);
13689+ }
13690+ */
13691+ break;
13692+ default:
13693+ return ret;
13694+ }
13695+
13696+ } else {
13697+ int flags;
13698+ znode *z;
13699+
13700+ z = JZNODE(node);
13701+ /* Formatted node case: */
13702+ assert("jmacd-2061", !znode_is_root(z));
13703+
13704+ flags = GN_ALLOW_NOT_CONNECTED;
13705+ if (try)
13706+ flags |= GN_TRY_LOCK;
13707+
13708+ ret =
13709+ reiser4_get_parent_flags(parent_lh, z, parent_mode, flags);
13710+ if (ret != 0)
13711+ /* -E_REPEAT is ok here, it is handled by the caller. */
13712+ return ret;
13713+
13714+ /* Make the child's position "hint" up-to-date. (Unless above
13715+ root, which caller must check.) */
13716+ if (coord != NULL) {
13717+
13718+ ret = incr_load_count_znode(parent_zh, parent_lh->node);
13719+ if (ret != 0) {
13720+ warning("jmacd-976812386",
13721+ "incr_load_count_znode failed: %d",
13722+ ret);
13723+ return ret;
13724+ }
13725+
13726+ ret = find_child_ptr(parent_lh->node, z, coord);
13727+ if (ret != 0) {
13728+ warning("jmacd-976812",
13729+ "find_child_ptr failed: %d", ret);
13730+ return ret;
13731+ }
13732+ }
13733+ }
13734+
13735+ return 0;
13736+}
13737+
13738+/* Get the (locked) next neighbor of a znode which is dirty and a member of the same atom.
13739+ If there is no next neighbor or the neighbor is not in memory or if there is a
13740+ neighbor but it is not dirty or not in the same atom, -E_NO_NEIGHBOR is returned.
13741+   In some cases the slum may include nodes which are not dirty; if so, @check_dirty should be 0. */
13742+static int neighbor_in_slum(znode * node, /* starting point */
13743+ lock_handle * lock, /* lock on starting point */
13744+ sideof side, /* left or right direction we seek the next node in */
13745+ znode_lock_mode mode, /* kind of lock we want */
13746+			    int check_dirty /* true if the neighbor should be dirty */ )
13747+{
13748+ int ret;
13749+
13750+ assert("jmacd-6334", znode_is_connected(node));
13751+
13752+ ret =
13753+ reiser4_get_neighbor(lock, node, mode,
13754+ GN_SAME_ATOM | (side ==
13755+ LEFT_SIDE ? GN_GO_LEFT : 0));
13756+
13757+ if (ret) {
13758+ /* May return -ENOENT or -E_NO_NEIGHBOR. */
13759+ /* FIXME(C): check EINVAL, E_DEADLOCK */
13760+ if (ret == -ENOENT) {
13761+ ret = RETERR(-E_NO_NEIGHBOR);
13762+ }
13763+
13764+ return ret;
13765+ }
13766+ if (!check_dirty)
13767+ return 0;
13768+ /* Check dirty bit of locked znode, no races here */
13769+ if (JF_ISSET(ZJNODE(lock->node), JNODE_DIRTY))
13770+ return 0;
13771+
13772+ done_lh(lock);
13773+ return RETERR(-E_NO_NEIGHBOR);
13774+}
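
Mapping -ENOENT to -E_NO_NEIGHBOR, as done above, folds several "no usable neighbor" outcomes into the single code the squalloc loop tests for. A trivial sketch of the idea; the E_NO_NEIGHBOR value here is purely illustrative, not reiser4's actual definition:

	#include <errno.h>
	#include <stdio.h>

	#define E_NO_NEIGHBOR 4001	/* illustrative value only */

	/* Normalize "neighbor missing" into one caller-visible code. */
	static int normalize(int ret)
	{
		return ret == -ENOENT ? -E_NO_NEIGHBOR : ret;
	}

	int main(void)
	{
		printf("%d %d\n", normalize(-ENOENT), normalize(-EIO));
		return 0;
	}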
13775+
13776+/* Return true if two znodes have the same parent. This is called with both nodes
13777+ write-locked (for squeezing) so no tree lock is needed. */
13778+static int znode_same_parents(znode * a, znode * b)
13779+{
13780+ int result;
13781+
13782+ assert("jmacd-7011", znode_is_write_locked(a));
13783+ assert("jmacd-7012", znode_is_write_locked(b));
13784+
13785+ /* We lock the whole tree for this check.... I really don't like whole tree
13786+ * locks... -Hans */
13787+ read_lock_tree(znode_get_tree(a));
13788+ result = (znode_parent(a) == znode_parent(b));
13789+ read_unlock_tree(znode_get_tree(a));
13790+ return result;
13791+}
13792+
13793+/* FLUSH SCAN */
13794+
13795+/* Initialize the flush_scan data structure. */
13796+static void scan_init(flush_scan * scan)
13797+{
13798+ memset(scan, 0, sizeof(*scan));
13799+ init_lh(&scan->node_lock);
13800+ init_lh(&scan->parent_lock);
13801+ init_load_count(&scan->parent_load);
13802+ init_load_count(&scan->node_load);
13803+ coord_init_invalid(&scan->parent_coord, NULL);
13804+}
13805+
13806+/* Release any resources held by the flush scan, e.g., release locks, free memory, etc. */
13807+static void scan_done(flush_scan * scan)
13808+{
13809+ done_load_count(&scan->node_load);
13810+ if (scan->node != NULL) {
13811+ jput(scan->node);
13812+ scan->node = NULL;
13813+ }
13814+ done_load_count(&scan->parent_load);
13815+ done_lh(&scan->parent_lock);
13816+ done_lh(&scan->node_lock);
13817+}
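
scan_init() zeroes the structure and initializes every handle to an empty state, which is what lets scan_done() unconditionally release whatever happened to be acquired. The same init-to-empty / done-releases-everything pairing in miniature, with a toy resource standing in for the lock and load handles:

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	struct toy_scan {
		char *buf;	/* stands in for the lock/load handles */
	};

	static void toy_init(struct toy_scan *s)
	{
		memset(s, 0, sizeof(*s));	/* empty handles are safe to release */
	}

	static int toy_use(struct toy_scan *s)
	{
		s->buf = malloc(64);
		return s->buf ? 0 : -1;
	}

	static void toy_done(struct toy_scan *s)
	{
		free(s->buf);	/* free(NULL) is a no-op, like releasing an empty handle */
		s->buf = NULL;
	}

	int main(void)
	{
		struct toy_scan scan;

		toy_init(&scan);
		if (toy_use(&scan) == 0)
			printf("resource acquired\n");
		toy_done(&scan);	/* safe whether or not toy_use() succeeded */
		return 0;
	}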
13818+
13819+/* Returns true if flush scanning is finished. */
13820+int scan_finished(flush_scan * scan)
13821+{
13822+ return scan->stop || (scan->direction == RIGHT_SIDE &&
13823+ scan->count >= scan->max_count);
13824+}
13825+
13826+/* Return true if the scan should continue to @tonode, i.e., if the node meets the
13827+   same_slum_check condition. If not, drop the reference to @tonode and stop the scan. */
13828+int scan_goto(flush_scan * scan, jnode * tonode)
13829+{
13830+ int go = same_slum_check(scan->node, tonode, 1, 0);
13831+
13832+ if (!go) {
13833+ scan->stop = 1;
13834+ jput(tonode);
13835+ }
13836+
13837+ return go;
13838+}
13839+
13840+/* Set the current scan->node, take a reference to it, increment the count by @add_count
13841+   (the number of nodes to count, e.g., skipped unallocated nodes), drop the reference to
13842+   the previous current node, and copy the current parent coordinate. */
13843+int
13844+scan_set_current(flush_scan * scan, jnode * node, unsigned add_count,
13845+ const coord_t * parent)
13846+{
13847+ /* Release the old references, take the new reference. */
13848+ done_load_count(&scan->node_load);
13849+
13850+ if (scan->node != NULL) {
13851+ jput(scan->node);
13852+ }
13853+ scan->node = node;
13854+ scan->count += add_count;
13855+
13856+ /* This next stmt is somewhat inefficient. The scan_extent_coord code could
13857+ delay this update step until it finishes and update the parent_coord only once.
13858+ It did that before, but there was a bug and this was the easiest way to make it
13859+ correct. */
13860+ if (parent != NULL) {
13861+ coord_dup(&scan->parent_coord, parent);
13862+ }
13863+
13864+ /* Failure may happen at the incr_load_count call, but the caller can assume the reference
13865+ is safely taken. */
13866+ return incr_load_count_jnode(&scan->node_load, node);
13867+}
13868+
13869+/* Return true if scanning in the leftward direction. */
13870+int scanning_left(flush_scan * scan)
13871+{
13872+ return scan->direction == LEFT_SIDE;
13873+}
13874+
13875+/* Performs leftward scanning starting from either kind of node. Counts the starting
13876+ node. The right-scan object is passed in for the left-scan in order to copy the parent
13877+ of an unformatted starting position. This way we avoid searching for the unformatted
13878+ node's parent when scanning in each direction. If we search for the parent once it is
13879+ set in both scan objects. The limit parameter tells flush-scan when to stop.
13880+
13881+ Rapid scanning is used only during scan_left, where we are interested in finding the
13882+ 'leftpoint' where we begin flushing. We are interested in stopping at the left child
13883+ of a twig that does not have a dirty left neighbor. THIS IS A SPECIAL CASE. The
13884+ problem is finding a way to flush only those nodes without unallocated children, and it
13885+ is difficult to solve in the bottom-up flushing algorithm we are currently using. The
13886+ problem can be solved by scanning left at every level as we go upward, but this would
13887+ basically bring us back to using a top-down allocation strategy, which we already tried
13888+ (see BK history from May 2002), and has a different set of problems. The top-down
13889+ strategy makes avoiding unallocated children easier, but makes it difficult to
13890+ propertly flush dirty children with clean parents that would otherwise stop the
13891+ top-down flush, only later to dirty the parent once the children are flushed. So we
13892+ solve the problem in the bottom-up algorithm with a special case for twigs and leaves
13893+ only.
13894+
13895+ The first step in solving the problem is this rapid leftward scan. After we determine
13896+ that there are at least enough nodes counted to qualify for FLUSH_RELOCATE_THRESHOLD, we
13897+ are no longer interested in the exact count; we are only interested in finding the
13898+ best place to start the flush. We could choose one of two possibilities:
13899+
13900+ 1. Stop at the leftmost child (of a twig) that does not have a dirty left neighbor.
13901+ This requires checking one leaf per rapid-scan twig.
13902+
13903+ 2. Stop at the leftmost child (of a twig) where there are no dirty children of the twig
13904+ to the left. This requires checking possibly all of the in-memory children of each
13905+ twig during the rapid scan.
13906+
13907+ For now we implement the first policy.
13908+*/
13909+static int
13910+scan_left(flush_scan * scan, flush_scan * right, jnode * node, unsigned limit)
13911+{
13912+ int ret = 0;
13913+
13914+ scan->max_count = limit;
13915+ scan->direction = LEFT_SIDE;
13916+
13917+ ret = scan_set_current(scan, jref(node), 1, NULL);
13918+ if (ret != 0) {
13919+ return ret;
13920+ }
13921+
13922+ ret = scan_common(scan, right);
13923+ if (ret != 0) {
13924+ return ret;
13925+ }
13926+
13927+ /* Before rapid scanning, we need a lock on scan->node so that we can get its
13928+ parent, only if formatted. */
13929+ if (jnode_is_znode(scan->node)) {
13930+ ret = longterm_lock_znode(&scan->node_lock, JZNODE(scan->node),
13931+ ZNODE_WRITE_LOCK, ZNODE_LOCK_LOPRI);
13932+ }
13933+
13934+ /* Rapid_scan would go here (with limit set to FLUSH_RELOCATE_THRESHOLD). */
13935+ return ret;
13936+}
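
As the long comment above explains, the left scan needs an exact count only up to FLUSH_RELOCATE_THRESHOLD; past that it merely looks for a good starting point. A toy sketch of such saturating leftward counting over a dirty bitmap; the threshold value and the array representation are illustrative only:

	#include <stdio.h>

	#define TOY_THRESHOLD 4	/* stands in for FLUSH_RELOCATE_THRESHOLD */

	/* Count dirty neighbors leftward from @start, stopping at a clean
	 * node or when the count saturates at @limit, like scan_left()
	 * with its limit parameter set. */
	static unsigned count_left(const int *dirty, int start, unsigned limit)
	{
		unsigned count = 0;
		int i;

		for (i = start; i >= 0 && dirty[i] && count < limit; i--)
			count++;
		return count;
	}

	int main(void)
	{
		int dirty[] = { 1, 1, 1, 1, 1, 1, 0, 1 };

		/* Starting at index 5: six dirty nodes lie leftward, but
		 * the count saturates at the threshold. */
		printf("count=%u\n", count_left(dirty, 5, TOY_THRESHOLD));
		return 0;
	}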
13937+
13938+/* Performs rightward scanning... Does not count the starting node. The limit parameter
13939+ is described in scan_left. If the starting node is unformatted then the
13940+ parent_coord was already set during scan_left. The rapid_after parameter is not used
13941+ during right-scanning.
13942+
13943+ scan_right is only called if the scan_left operation does not count at least
13944+ FLUSH_RELOCATE_THRESHOLD nodes for flushing. Otherwise, the limit parameter is set to
13945+ the difference between scan-left's count and FLUSH_RELOCATE_THRESHOLD, meaning
13946+ scan-right counts as high as FLUSH_RELOCATE_THRESHOLD and then stops. */
13947+static int scan_right(flush_scan * scan, jnode * node, unsigned limit)
13948+{
13949+ int ret;
13950+
13951+ scan->max_count = limit;
13952+ scan->direction = RIGHT_SIDE;
13953+
13954+ ret = scan_set_current(scan, jref(node), 0, NULL);
13955+ if (ret != 0) {
13956+ return ret;
13957+ }
13958+
13959+ return scan_common(scan, NULL);
13960+}
13961+
13962+/* Common code to perform left or right scanning. */
13963+static int scan_common(flush_scan * scan, flush_scan * other)
13964+{
13965+ int ret;
13966+
13967+ assert("nikita-2376", scan->node != NULL);
13968+ assert("edward-54", jnode_is_unformatted(scan->node)
13969+ || jnode_is_znode(scan->node));
13970+
13971+ /* Special case for starting at an unformatted node. Optimization: we only want
13972+ to search for the parent (which requires a tree traversal) once. Obviously, we
13973+ shouldn't have to call it once for the left scan and once for the right scan.
13974+ For this reason, if we search for the parent during scan-left we then duplicate
13975+ the coord/lock/load into the scan-right object. */
13976+ if (jnode_is_unformatted(scan->node)) {
13977+ ret = scan_unformatted(scan, other);
13978+ if (ret != 0)
13979+ return ret;
13980+ }
13981+ /* This loop expects to start at a formatted position and performs chaining of
13982+ formatted regions */
13983+ while (!scan_finished(scan)) {
13984+
13985+ ret = scan_formatted(scan);
13986+ if (ret != 0) {
13987+ return ret;
13988+ }
13989+ }
13990+
13991+ return 0;
13992+}
13993+
13994+static int scan_unformatted(flush_scan * scan, flush_scan * other)
13995+{
13996+ int ret = 0;
13997+ int try = 0;
13998+
13999+ if (!coord_is_invalid(&scan->parent_coord))
14000+ goto scan;
14001+
14002+	/* set the parent coord from the current scan position */
14003+ if (!jnode_is_unformatted(scan->node)) {
14004+ /* formatted position */
14005+
14006+ lock_handle lock;
14007+ assert("edward-301", jnode_is_znode(scan->node));
14008+ init_lh(&lock);
14009+
14010+ /*
14011+		 * When flush starts from an unformatted node, the first thing
14012+		 * it does is a tree traversal to find the formatted parent of
14013+		 * the starting node. This parent is then kept locked across
14014+		 * the scans to the left and to the right. This means that
14015+		 * during the scan to the left we cannot take a left-ward lock,
14016+		 * because this is deadlock-prone. So, if we are scanning to
14017+		 * the left and there is already a lock held by this thread,
14018+		 * jnode_lock_parent_coord() should use a try-lock.
14019+ */
14020+ try = scanning_left(scan)
14021+ && !lock_stack_isclean(get_current_lock_stack());
14022+		/* We need the node locked to get the parent lock. We have to
14023+		   take a write lock since there is at least one call path
14024+		   where this znode is already write-locked by us. */
14025+ ret =
14026+ longterm_lock_znode(&lock, JZNODE(scan->node),
14027+ ZNODE_WRITE_LOCK,
14028+ scanning_left(scan) ? ZNODE_LOCK_LOPRI :
14029+ ZNODE_LOCK_HIPRI);
14030+ if (ret != 0)
14031+ /* EINVAL or E_DEADLOCK here mean... try again! At this point we've
14032+ scanned too far and can't back out, just start over. */
14033+ return ret;
14034+
14035+ ret = jnode_lock_parent_coord(scan->node,
14036+ &scan->parent_coord,
14037+ &scan->parent_lock,
14038+ &scan->parent_load,
14039+ ZNODE_WRITE_LOCK, try);
14040+
14041+ /* FIXME(C): check EINVAL, E_DEADLOCK */
14042+ done_lh(&lock);
14043+ if (ret == -E_REPEAT) {
14044+ scan->stop = 1;
14045+ return 0;
14046+ }
14047+ if (ret)
14048+ return ret;
14049+
14050+ } else {
14051+ /* unformatted position */
14052+
14053+ ret =
14054+ jnode_lock_parent_coord(scan->node, &scan->parent_coord,
14055+ &scan->parent_lock,
14056+ &scan->parent_load,
14057+ ZNODE_WRITE_LOCK, try);
14058+
14059+ if (IS_CBKERR(ret))
14060+ return ret;
14061+
14062+ if (ret == CBK_COORD_NOTFOUND)
14063+ /* FIXME(C): check EINVAL, E_DEADLOCK */
14064+ return ret;
14065+
14066+ /* parent was found */
14067+ assert("jmacd-8661", other != NULL);
14068+ /* Duplicate the reference into the other flush_scan. */
14069+ coord_dup(&other->parent_coord, &scan->parent_coord);
14070+ copy_lh(&other->parent_lock, &scan->parent_lock);
14071+ copy_load_count(&other->parent_load, &scan->parent_load);
14072+ }
14073+ scan:
14074+ return scan_by_coord(scan);
14075+}
14076+
14077+/* Performs left- or rightward scanning starting from a formatted node. Follow left
14078+ pointers under tree lock as long as:
14079+
14080+ - node->left/right is non-NULL
14081+ - node->left/right is connected, dirty
14082+ - node->left/right belongs to the same atom
14083+ - scan has not reached maximum count
14084+*/
14085+static int scan_formatted(flush_scan * scan)
14086+{
14087+ int ret;
14088+ znode *neighbor = NULL;
14089+
14090+ assert("jmacd-1401", !scan_finished(scan));
14091+
14092+ do {
14093+ znode *node = JZNODE(scan->node);
14094+
14095+ /* Node should be connected, but if not stop the scan. */
14096+ if (!znode_is_connected(node)) {
14097+ scan->stop = 1;
14098+ break;
14099+ }
14100+
14101+ /* Lock the tree, check-for and reference the next sibling. */
14102+ read_lock_tree(znode_get_tree(node));
14103+
14104+ /* It may be that a node is inserted or removed between a node and its
14105+ left sibling while the tree lock is released, but the flush-scan count
14106+ does not need to be precise. Thus, we release the tree lock as soon as
14107+ we get the neighboring node. */
14108+ neighbor = scanning_left(scan) ? node->left : node->right;
14109+ if (neighbor != NULL) {
14110+ zref(neighbor);
14111+ }
14112+
14113+ read_unlock_tree(znode_get_tree(node));
14114+
14115+		/* If the neighbor is NULL at the leaf level, we need to check for an
14116+		   unformatted sibling using the parent -- break in any case. */
14117+ if (neighbor == NULL) {
14118+ break;
14119+ }
14120+
14121+ /* Check the condition for going left, break if it is not met. This also
14122+ releases (jputs) the neighbor if false. */
14123+ if (!scan_goto(scan, ZJNODE(neighbor))) {
14124+ break;
14125+ }
14126+
14127+ /* Advance the flush_scan state to the left, repeat. */
14128+ ret = scan_set_current(scan, ZJNODE(neighbor), 1, NULL);
14129+ if (ret != 0) {
14130+ return ret;
14131+ }
14132+
14133+ } while (!scan_finished(scan));
14134+
14135+	/* If the neighbor is NULL then we reached the end of a formatted region, or
14136+	   the sibling is out of memory; now check for an extent to the left (only at
14137+	   LEAF_LEVEL). */
14138+ if (neighbor != NULL || jnode_get_level(scan->node) != LEAF_LEVEL
14139+ || scan_finished(scan)) {
14140+ scan->stop = 1;
14141+ return 0;
14142+ }
14143+	/* Otherwise, call scan_by_coord for the right(left)most item of the
14144+	   left(right) neighbor on the parent level, then possibly continue. */
14145+
14146+ coord_init_invalid(&scan->parent_coord, NULL);
14147+ return scan_unformatted(scan, NULL);
14148+}
14149+
14150+/* NOTE-EDWARD:
14151+   This scans adjacent items of the same type and calls the scan flush plugin for each one.
14152+   It performs left(right)ward scanning starting from a (possibly) unformatted node. If we
14153+   start from an unformatted node, then we continue only if the next neighbor is also
14154+   unformatted. When called from scan_formatted, we skip the first iteration (to make sure
14155+   that the right(left)most item of the left(right) neighbor on the parent level is of the
14156+   same type, and to set the appropriate coord). */
14157+static int scan_by_coord(flush_scan * scan)
14158+{
14159+ int ret = 0;
14160+ int scan_this_coord;
14161+ lock_handle next_lock;
14162+ load_count next_load;
14163+ coord_t next_coord;
14164+ jnode *child;
14165+ item_plugin *iplug;
14166+
14167+ init_lh(&next_lock);
14168+ init_load_count(&next_load);
14169+ scan_this_coord = (jnode_is_unformatted(scan->node) ? 1 : 0);
14170+
14171+ /* set initial item id */
14172+ iplug = item_plugin_by_coord(&scan->parent_coord);
14173+
14174+ for (; !scan_finished(scan); scan_this_coord = 1) {
14175+ if (scan_this_coord) {
14176+			/* Here we expect the unit to be scannable. It might not
14177+			 * be, due to a race with extent->tail conversion. */
14178+ if (iplug->f.scan == NULL) {
14179+ scan->stop = 1;
14180+ ret = -E_REPEAT;
14181+ /* skip the check at the end. */
14182+ goto race;
14183+ }
14184+
14185+ ret = iplug->f.scan(scan);
14186+ if (ret != 0)
14187+ goto exit;
14188+
14189+ if (scan_finished(scan)) {
14190+ checkchild(scan);
14191+ break;
14192+ }
14193+ } else {
14194+ /* the same race against truncate as above is possible
14195+ * here, it seems */
14196+
14197+ /* NOTE-JMACD: In this case, apply the same end-of-node logic but don't scan
14198+ the first coordinate. */
14199+ assert("jmacd-1231",
14200+ item_is_internal(&scan->parent_coord));
14201+ }
14202+
14203+ if (iplug->f.utmost_child == NULL
14204+ || znode_get_level(scan->parent_coord.node) != TWIG_LEVEL) {
14205+			/* stop at this coord and continue on the parent level */
14206+ ret =
14207+ scan_set_current(scan,
14208+ ZJNODE(zref
14209+ (scan->parent_coord.node)),
14210+ 1, NULL);
14211+ if (ret != 0)
14212+ goto exit;
14213+ break;
14214+ }
14215+
14216+ /* Either way, the invariant is that scan->parent_coord is set to the
14217+ parent of scan->node. Now get the next unit. */
14218+ coord_dup(&next_coord, &scan->parent_coord);
14219+ coord_sideof_unit(&next_coord, scan->direction);
14220+
14221+ /* If off-the-end of the twig, try the next twig. */
14222+ if (coord_is_after_sideof_unit(&next_coord, scan->direction)) {
14223+ /* We take the write lock because we may start flushing from this
14224+ * coordinate. */
14225+ ret =
14226+ neighbor_in_slum(next_coord.node, &next_lock,
14227+ scan->direction, ZNODE_WRITE_LOCK,
14228+ 1 /* check dirty */ );
14229+ if (ret == -E_NO_NEIGHBOR) {
14230+ scan->stop = 1;
14231+ ret = 0;
14232+ break;
14233+ }
14234+
14235+ if (ret != 0) {
14236+ goto exit;
14237+ }
14238+
14239+ ret = incr_load_count_znode(&next_load, next_lock.node);
14240+ if (ret != 0) {
14241+ goto exit;
14242+ }
14243+
14244+ coord_init_sideof_unit(&next_coord, next_lock.node,
14245+ sideof_reverse(scan->direction));
14246+ }
14247+
14248+ iplug = item_plugin_by_coord(&next_coord);
14249+
14250+ /* Get the next child. */
14251+ ret =
14252+ iplug->f.utmost_child(&next_coord,
14253+ sideof_reverse(scan->direction),
14254+ &child);
14255+ if (ret != 0)
14256+ goto exit;
14257+		/* If the next child is not in memory, or item_utmost_child
14258+		   failed (most probably due to a race with unlink), stop
14259+		   here. */
14260+ if (child == NULL || IS_ERR(child)) {
14261+ scan->stop = 1;
14262+ checkchild(scan);
14263+ break;
14264+ }
14265+
14266+ assert("nikita-2374", jnode_is_unformatted(child)
14267+ || jnode_is_znode(child));
14268+
14269+ /* See if it is dirty, part of the same atom. */
14270+ if (!scan_goto(scan, child)) {
14271+ checkchild(scan);
14272+ break;
14273+ }
14274+
14275+ /* If so, make this child current. */
14276+ ret = scan_set_current(scan, child, 1, &next_coord);
14277+ if (ret != 0)
14278+ goto exit;
14279+
14280+		/* Now continue. If the child is formatted we break out; the
14281+		   parent lock is released below and scanning proceeds. */
14282+ if (jnode_is_znode(child))
14283+ break;
14284+
14285+ /* Otherwise, repeat the above loop with next_coord. */
14286+ if (next_load.node != NULL) {
14287+ done_lh(&scan->parent_lock);
14288+ move_lh(&scan->parent_lock, &next_lock);
14289+ move_load_count(&scan->parent_load, &next_load);
14290+ }
14291+ }
14292+
14293+ assert("jmacd-6233", scan_finished(scan) || jnode_is_znode(scan->node));
14294+ exit:
14295+ checkchild(scan);
14296+ race: /* skip the above check */
14297+ if (jnode_is_znode(scan->node)) {
14298+ done_lh(&scan->parent_lock);
14299+ done_load_count(&scan->parent_load);
14300+ }
14301+
14302+ done_load_count(&next_load);
14303+ done_lh(&next_lock);
14304+ return ret;
14305+}
14306+
14307+/* FLUSH POS HELPERS */
14308+
14309+/* Initialize the fields of a flush_position. */
14310+static void pos_init(flush_pos_t * pos)
14311+{
14312+ memset(pos, 0, sizeof *pos);
14313+
14314+ pos->state = POS_INVALID;
14315+ coord_init_invalid(&pos->coord, NULL);
14316+ init_lh(&pos->lock);
14317+ init_load_count(&pos->load);
14318+
14319+ blocknr_hint_init(&pos->preceder);
14320+}
14321+
14322+/* The flush loop inside squalloc periodically checks pos_valid to
14323+ determine when "enough flushing" has been performed. This will return true until one
14324+ of the following conditions is met:
14325+
14326+ 1. the number of flush-queued nodes has reached the kernel-supplied "int *nr_to_flush"
14327+ parameter, meaning we have flushed as many blocks as the kernel requested. When
14328+ flushing to commit, this parameter is NULL.
14329+
14330+ 2. pos_stop() is called because squalloc discovers that the "next" node in the
14331+ flush order is either non-existent, not dirty, or not in the same atom.
14332+*/
14333+
14334+static int pos_valid(flush_pos_t * pos)
14335+{
14336+ return pos->state != POS_INVALID;
14337+}
14338+
14339+/* Release any resources of a flush_position. Called when jnode_flush finishes. */
14340+static void pos_done(flush_pos_t * pos)
14341+{
14342+ pos_stop(pos);
14343+ blocknr_hint_done(&pos->preceder);
14344+ if (convert_data(pos))
14345+ free_convert_data(pos);
14346+}
14347+
14348+/* Reset the point and parent. Called during flush subroutines to terminate the
14349+ squalloc loop. */
14350+static int pos_stop(flush_pos_t * pos)
14351+{
14352+ pos->state = POS_INVALID;
14353+ done_lh(&pos->lock);
14354+ done_load_count(&pos->load);
14355+ coord_init_invalid(&pos->coord, NULL);
14356+
14357+ if (pos->child) {
14358+ jput(pos->child);
14359+ pos->child = NULL;
14360+ }
14361+
14362+ return 0;
14363+}
14364+
14365+/* Return the flush_position's block allocator hint. */
14366+reiser4_blocknr_hint *pos_hint(flush_pos_t * pos)
14367+{
14368+ return &pos->preceder;
14369+}
14370+
14371+flush_queue_t *pos_fq(flush_pos_t * pos)
14372+{
14373+ return pos->fq;
14374+}
14375+
14376+/* Make Linus happy.
14377+ Local variables:
14378+ c-indentation-style: "K&R"
14379+ mode-name: "LC"
14380+ c-basic-offset: 8
14381+ tab-width: 8
14382+ fill-column: 90
14383+ LocalWords: preceder
14384+ End:
14385+*/
14386Index: linux-2.6.16/fs/reiser4/flush.h
14387===================================================================
14388--- /dev/null
14389+++ linux-2.6.16/fs/reiser4/flush.h
14390@@ -0,0 +1,274 @@
14391+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
14392+
14393+/* DECLARATIONS: */
14394+
14395+#if !defined(__REISER4_FLUSH_H__)
14396+#define __REISER4_FLUSH_H__
14397+
14398+#include "plugin/cluster.h"
14399+
14400+/* The flush_scan data structure maintains the state of an in-progress flush-scan on a
14401+ single level of the tree. A flush-scan is used for counting the number of adjacent
14402+ nodes to flush, which is used to determine whether we should relocate, and it is also
14403+ used to find a starting point for flush. A flush-scan object can scan in both right
14404+ and left directions via the scan_left() and scan_right() interfaces. The
14405+ right- and left-variations are similar but perform different functions. When scanning
14406+ left we (optionally perform rapid scanning and then) longterm-lock the endpoint node.
14407+ When scanning right we are simply counting the number of adjacent, dirty nodes. */
14408+struct flush_scan {
14409+
14410+ /* The current number of nodes scanned on this level. */
14411+ unsigned count;
14412+
14413+ /* There may be a maximum number of nodes for a scan on any single level. When
14414+	   going leftward, max_count is determined by FLUSH_SCAN_MAXNODES (see reiser4.h). */
14415+ unsigned max_count;
14416+
14417+ /* Direction: Set to one of the sideof enumeration: { LEFT_SIDE, RIGHT_SIDE }. */
14418+ sideof direction;
14419+
14420+	/* Initially @stop is set to false; it is set to true once some condition stops
14421+	   the search (e.g., we found a clean node before reaching max_count, or we found
14422+	   a node belonging to another atom). */
14423+ int stop;
14424+
14425+ /* The current scan position. If @node is non-NULL then its reference count has
14426+ been incremented to reflect this reference. */
14427+ jnode *node;
14428+
14429+ /* A handle for zload/zrelse of current scan position node. */
14430+ load_count node_load;
14431+
14432+ /* During left-scan, if the final position (a.k.a. endpoint node) is formatted the
14433+ node is locked using this lock handle. The endpoint needs to be locked for
14434+ transfer to the flush_position object after scanning finishes. */
14435+ lock_handle node_lock;
14436+
14437+ /* When the position is unformatted, its parent, coordinate, and parent
14438+ zload/zrelse handle. */
14439+ lock_handle parent_lock;
14440+ coord_t parent_coord;
14441+ load_count parent_load;
14442+
14443+ /* The block allocator preceder hint. Sometimes flush_scan determines what the
14444+ preceder is and if so it sets it here, after which it is copied into the
14445+ flush_position. Otherwise, the preceder is computed later. */
14446+ reiser4_block_nr preceder_blk;
14447+};
14448+
14449+typedef struct convert_item_info {
14450+ dc_item_stat d_cur; /* disk cluster state of the current item */
14451+ dc_item_stat d_next; /* disk cluster state of the next slum item */
14452+ struct inode *inode;
14453+ flow_t flow;
14454+} convert_item_info_t;
14455+
14456+typedef struct convert_info {
14457+ int count; /* for squalloc terminating */
14458+ reiser4_cluster_t clust; /* transform cluster */
14459+ item_plugin *iplug; /* current item plugin */
14460+ convert_item_info_t *itm; /* current item info */
14461+} convert_info_t;
14462+
14463+typedef enum flush_position_state {
14464+ POS_INVALID, /* Invalid or stopped pos, do not continue slum
14465+ * processing */
14466+ POS_ON_LEAF, /* pos points to already prepped, locked formatted node at
14467+ * leaf level */
14468+ POS_ON_EPOINT, /* pos keeps a lock on twig level, "coord" field is used
14469+ * to traverse unformatted nodes */
14470+ POS_TO_LEAF, /* pos is being moved to leaf level */
14471+ POS_TO_TWIG, /* pos is being moved to twig level */
14472+ POS_END_OF_TWIG, /* special case of POS_ON_TWIG, when coord is after
14473+ * rightmost unit of the current twig */
14474+ POS_ON_INTERNAL /* same as POS_ON_LEAF, but points to internal node */
14475+} flushpos_state_t;
14476+
14477+/* An encapsulation of the current flush point and all the parameters that are passed
14478+ through the entire squeeze-and-allocate stage of the flush routine. A single
14479+ flush_position object is constructed after left- and right-scanning finishes. */
14480+struct flush_position {
14481+ flushpos_state_t state;
14482+
14483+ coord_t coord; /* coord to traverse unformatted nodes */
14484+ lock_handle lock; /* current lock we hold */
14485+ load_count load; /* load status for current locked formatted node */
14486+
14487+ jnode *child; /* for passing a reference to unformatted child
14488+ * across pos state changes */
14489+
14490+ reiser4_blocknr_hint preceder; /* The flush 'hint' state. */
14491+ int leaf_relocate; /* True if enough leaf-level nodes were
14492+ * found to suggest a relocate policy. */
14493+	int alloc_cnt;		/* The number of nodes allocated during squeeze and allocate. */
14494+ int prep_or_free_cnt; /* The number of nodes prepared for write (allocate) or squeezed and freed. */
14495+ flush_queue_t *fq;
14496+ long *nr_written; /* number of nodes submitted to disk */
14497+ int flags; /* a copy of jnode_flush flags argument */
14498+
14499+ znode *prev_twig; /* previous parent pointer value, used to catch
14500+ * processing of new twig node */
14501+ convert_info_t *sq; /* convert info */
14502+
14503+ unsigned long pos_in_unit; /* for extents only. Position
14504+ within an extent unit of first
14505+ jnode of slum */
14506+ long nr_to_write; /* number of unformatted nodes to handle on flush */
14507+};
14508+
14509+static inline int item_convert_count(flush_pos_t * pos)
14510+{
14511+ return pos->sq->count;
14512+}
14513+static inline void inc_item_convert_count(flush_pos_t * pos)
14514+{
14515+ pos->sq->count++;
14516+}
14517+static inline void set_item_convert_count(flush_pos_t * pos, int count)
14518+{
14519+ pos->sq->count = count;
14520+}
14521+static inline item_plugin *item_convert_plug(flush_pos_t * pos)
14522+{
14523+ return pos->sq->iplug;
14524+}
14525+
14526+static inline convert_info_t *convert_data(flush_pos_t * pos)
14527+{
14528+ return pos->sq;
14529+}
14530+
14531+static inline convert_item_info_t *item_convert_data(flush_pos_t * pos)
14532+{
14533+ assert("edward-955", convert_data(pos));
14534+ return pos->sq->itm;
14535+}
14536+
14537+static inline tfm_cluster_t *tfm_cluster_sq(flush_pos_t * pos)
14538+{
14539+ return &pos->sq->clust.tc;
14540+}
14541+
14542+static inline tfm_stream_t *tfm_stream_sq(flush_pos_t * pos, tfm_stream_id id)
14543+{
14544+ assert("edward-854", pos->sq != NULL);
14545+ return tfm_stream(tfm_cluster_sq(pos), id);
14546+}
14547+
14548+static inline int chaining_data_present(flush_pos_t * pos)
14549+{
14550+ return convert_data(pos) && item_convert_data(pos);
14551+}
14552+
14553+/* Returns true if the next node contains the next item of the disk cluster,
14554+   so the item convert data should be moved to the right slum neighbor.
14555+*/
14556+static inline int should_chain_next_node(flush_pos_t * pos)
14557+{
14558+ int result = 0;
14559+
14560+ assert("edward-1007", chaining_data_present(pos));
14561+
14562+ switch (item_convert_data(pos)->d_next) {
14563+ case DC_CHAINED_ITEM:
14564+ result = 1;
14565+ break;
14566+ case DC_AFTER_CLUSTER:
14567+ break;
14568+ default:
14569+ impossible("edward-1009", "bad state of next slum item");
14570+ }
14571+ return result;
14572+}
14573+
14574+/* update item state in a disk cluster to assign conversion mode */
14575+static inline void
14576+move_chaining_data(flush_pos_t * pos, int this_node /* where is next item */ )
14577+{
14578+
14579+ assert("edward-1010", chaining_data_present(pos));
14580+
14581+ if (this_node == 0) {
14582+ /* next item is on the right neighbor */
14583+ assert("edward-1011",
14584+ item_convert_data(pos)->d_cur == DC_FIRST_ITEM ||
14585+ item_convert_data(pos)->d_cur == DC_CHAINED_ITEM);
14586+ assert("edward-1012",
14587+ item_convert_data(pos)->d_next == DC_CHAINED_ITEM);
14588+
14589+ item_convert_data(pos)->d_cur = DC_CHAINED_ITEM;
14590+ item_convert_data(pos)->d_next = DC_INVALID_STATE;
14591+ } else {
14592+ /* next item is on the same node */
14593+ assert("edward-1013",
14594+ item_convert_data(pos)->d_cur == DC_FIRST_ITEM ||
14595+ item_convert_data(pos)->d_cur == DC_CHAINED_ITEM);
14596+ assert("edward-1227",
14597+ item_convert_data(pos)->d_next == DC_AFTER_CLUSTER ||
14598+ item_convert_data(pos)->d_next == DC_INVALID_STATE);
14599+
14600+ item_convert_data(pos)->d_cur = DC_AFTER_CLUSTER;
14601+ item_convert_data(pos)->d_next = DC_INVALID_STATE;
14602+ }
14603+}
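+
+/* A compact summary of the state transitions performed above, derived from
+   the assertions in move_chaining_data(); this is a reading aid only, not
+   additional logic:
+
+	this_node == 0 (next item is on the right neighbor):
+		d_cur:  DC_FIRST_ITEM | DC_CHAINED_ITEM     -> DC_CHAINED_ITEM
+		d_next: DC_CHAINED_ITEM                     -> DC_INVALID_STATE
+
+	this_node != 0 (next item is on the same node):
+		d_cur:  DC_FIRST_ITEM | DC_CHAINED_ITEM     -> DC_AFTER_CLUSTER
+		d_next: DC_AFTER_CLUSTER | DC_INVALID_STATE -> DC_INVALID_STATE
+*/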
14604+
14605+static inline int should_convert_node(flush_pos_t * pos, znode * node)
14606+{
14607+ return znode_convertible(node);
14608+}
14609+
14610+/* true if there is attached convert item info */
14611+static inline int should_convert_next_node(flush_pos_t * pos, znode * node)
14612+{
14613+ return convert_data(pos) && item_convert_data(pos);
14614+}
14615+
14616+#define SQUALLOC_THRESHOLD 256
14617+
14618+static inline int should_terminate_squalloc(flush_pos_t * pos)
14619+{
14620+ return convert_data(pos) &&
14621+ !item_convert_data(pos) &&
14622+ item_convert_count(pos) >= SQUALLOC_THRESHOLD;
14623+}
14624+
14625+void free_convert_data(flush_pos_t * pos);
14626+/* used in extent.c */
14627+int scan_set_current(flush_scan * scan, jnode * node, unsigned add_size,
14628+ const coord_t * parent);
14629+int scan_finished(flush_scan * scan);
14630+int scanning_left(flush_scan * scan);
14631+int scan_goto(flush_scan * scan, jnode * tonode);
14632+txn_atom *atom_locked_by_fq(flush_queue_t * fq);
14633+int alloc_extent(flush_pos_t *flush_pos);
14634+squeeze_result squalloc_extent(znode *left, const coord_t *, flush_pos_t *,
14635+ reiser4_key *stop_key);
14636+extern int init_fqs(void);
14637+extern void done_fqs(void);
14638+
14639+#if REISER4_DEBUG
14640+
14641+extern void check_fq(const txn_atom *atom);
14642+extern atomic_t flush_cnt;
14643+
14644+#define check_preceder(blk) \
14645+assert("nikita-2588", blk < reiser4_block_count(reiser4_get_current_sb()));
14646+extern void check_pos(flush_pos_t * pos);
14647+#else
14648+#define check_preceder(b) noop
14649+#define check_pos(pos) noop
14650+#endif
14651+
14652+/* __REISER4_FLUSH_H__ */
14653+#endif
14654+
14655+/* Make Linus happy.
14656+ Local variables:
14657+ c-indentation-style: "K&R"
14658+ mode-name: "LC"
14659+ c-basic-offset: 8
14660+ tab-width: 8
14661+ fill-column: 90
14662+ LocalWords: preceder
14663+ End:
14664+*/
14665Index: linux-2.6.16/fs/reiser4/flush_queue.c
14666===================================================================
14667--- /dev/null
14668+++ linux-2.6.16/fs/reiser4/flush_queue.c
14669@@ -0,0 +1,681 @@
14670+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
14671+
14672+#include "debug.h"
14673+#include "super.h"
14674+#include "txnmgr.h"
14675+#include "jnode.h"
14676+#include "znode.h"
14677+#include "page_cache.h"
14678+#include "wander.h"
14679+#include "vfs_ops.h"
14680+#include "writeout.h"
14681+#include "flush.h"
14682+
14683+#include <linux/bio.h>
14684+#include <linux/mm.h>
14685+#include <linux/pagemap.h>
14686+#include <linux/blkdev.h>
14687+#include <linux/writeback.h>
14688+
14689+/* A flush queue object is an accumulator for keeping jnodes prepared
14690+   by the jnode_flush() function for writing to disk. Those "queued" jnodes are
14691+   kept on the flush queue until memory pressure or atom commit asks
14692+   flush queues to write some or all of their jnodes. */
14693+
14694+/*
14695+ LOCKING:
14696+
14697+   The fq->guard spin lock protects the fq->atom pointer and nothing else. The
14698+   fq->prepped list is protected by the atom spin lock and uses the following
14699+   locking rules:
14700+
14701+ two ways to protect fq->prepped list for read-only list traversal:
14702+
14703+ 1. atom spin-lock atom.
14704+ 2. fq is IN_USE, atom->nr_running_queues increased.
14705+
14706+ and one for list modification:
14707+
14708+ 1. atom is spin-locked and one condition is true: fq is IN_USE or
14709+ atom->nr_running_queues == 0.
14710+
14711+ The deadlock-safe order for flush queues and atoms is: first lock atom, then
14712+ lock flush queue, then lock jnode.
14713+*/
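+
+/* A minimal sketch of the deadlock-safe lock ordering described above,
+   assuming a caller that already holds references to the atom, the flush
+   queue and a queued jnode (illustrative only, not part of the interface):
+
+	spin_lock_atom(atom);		// 1: atom lock first
+	spin_lock(&fq->guard);		// 2: then the flush queue guard
+	spin_lock_jnode(node);		// 3: then the jnode
+	// ... inspect or modify the queued node ...
+	spin_unlock_jnode(node);
+	spin_unlock(&fq->guard);
+	spin_unlock_atom(atom);
+*/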
14714+
14715+#define fq_in_use(fq) ((fq)->state & FQ_IN_USE)
14716+#define fq_ready(fq) (!fq_in_use(fq))
14717+
14718+#define mark_fq_in_use(fq) do { (fq)->state |= FQ_IN_USE; } while (0)
14719+#define mark_fq_ready(fq) do { (fq)->state &= ~FQ_IN_USE; } while (0)
14720+
14721+/* get lock on atom from locked flush queue object */
14722+static txn_atom *atom_locked_by_fq_nolock(flush_queue_t * fq)
14723+{
14724+ /* This code is similar to jnode_get_atom(), look at it for the
14725+ * explanation. */
14726+ txn_atom *atom;
14727+
14728+ assert_spin_locked(&(fq->guard));
14729+
14730+ while (1) {
14731+ atom = fq->atom;
14732+ if (atom == NULL)
14733+ break;
14734+
14735+ if (spin_trylock_atom(atom))
14736+ break;
14737+
14738+ atomic_inc(&atom->refcount);
14739+ spin_unlock(&(fq->guard));
14740+ spin_lock_atom(atom);
14741+ spin_lock(&(fq->guard));
14742+
14743+ if (fq->atom == atom) {
14744+ atomic_dec(&atom->refcount);
14745+ break;
14746+ }
14747+
14748+ spin_unlock(&(fq->guard));
14749+ atom_dec_and_unlock(atom);
14750+ spin_lock(&(fq->guard));
14751+ }
14752+
14753+ return atom;
14754+}
14755+
14756+txn_atom *atom_locked_by_fq(flush_queue_t * fq)
14757+{
14758+ txn_atom *atom;
14759+
14760+ spin_lock(&(fq->guard));
14761+ atom = atom_locked_by_fq_nolock(fq);
14762+ spin_unlock(&(fq->guard));
14763+ return atom;
14764+}
14765+
14766+static void init_fq(flush_queue_t * fq)
14767+{
14768+ memset(fq, 0, sizeof *fq);
14769+
14770+ atomic_set(&fq->nr_submitted, 0);
14771+
14772+ INIT_LIST_HEAD(ATOM_FQ_LIST(fq));
14773+
14774+ sema_init(&fq->io_sem, 0);
14775+ spin_lock_init(&fq->guard);
14776+}
14777+
14778+/* slab for flush queues */
14779+static kmem_cache_t *fq_slab;
14780+
14781+
14782+/**
14783+ * init_fqs - create flush queue cache
14784+ *
14785+ * Initializes slab cache of flush queues. It is part of reiser4 module
14786+ * initialization.
14787+ */
14788+int init_fqs(void)
14789+{
14790+ fq_slab = kmem_cache_create("fq",
14791+ sizeof(flush_queue_t),
14792+ 0, SLAB_HWCACHE_ALIGN, NULL, NULL);
14793+ if (fq_slab == NULL)
14794+ return RETERR(-ENOMEM);
14795+ return 0;
14796+}
14797+
14798+/**
14799+ * done_fqs - delete flush queue cache
14800+ *
14801+ * This is called on reiser4 module unloading or system shutdown.
14802+ */
14803+void done_fqs(void)
14804+{
14805+ destroy_reiser4_cache(&fq_slab);
14806+}
14807+
14808+/* create new flush queue object */
14809+static flush_queue_t *create_fq(gfp_t gfp)
14810+{
14811+ flush_queue_t *fq;
14812+
14813+ fq = kmem_cache_alloc(fq_slab, gfp);
14814+ if (fq)
14815+ init_fq(fq);
14816+
14817+ return fq;
14818+}
14819+
14820+/* adjust atom's and flush queue's counters of queued nodes */
14821+static void count_enqueued_node(flush_queue_t * fq)
14822+{
14823+ ON_DEBUG(fq->atom->num_queued++);
14824+}
14825+
14826+static void count_dequeued_node(flush_queue_t * fq)
14827+{
14828+ assert("zam-993", fq->atom->num_queued > 0);
14829+ ON_DEBUG(fq->atom->num_queued--);
14830+}
14831+
14832+/* attach flush queue object to the atom */
14833+static void attach_fq(txn_atom *atom, flush_queue_t *fq)
14834+{
14835+ assert_spin_locked(&(atom->alock));
14836+ list_add(&fq->alink, &atom->flush_queues);
14837+ fq->atom = atom;
14838+ ON_DEBUG(atom->nr_flush_queues++);
14839+}
14840+
14841+static void detach_fq(flush_queue_t * fq)
14842+{
14843+ assert_spin_locked(&(fq->atom->alock));
14844+
14845+ spin_lock(&(fq->guard));
14846+ list_del_init(&fq->alink);
14847+ assert("vs-1456", fq->atom->nr_flush_queues > 0);
14848+ ON_DEBUG(fq->atom->nr_flush_queues--);
14849+ fq->atom = NULL;
14850+ spin_unlock(&(fq->guard));
14851+}
14852+
14853+/* destroy flush queue object */
14854+static void done_fq(flush_queue_t * fq)
14855+{
14856+ assert("zam-763", list_empty_careful(ATOM_FQ_LIST(fq)));
14857+ assert("zam-766", atomic_read(&fq->nr_submitted) == 0);
14858+
14859+ kmem_cache_free(fq_slab, fq);
14860+}
14861+
14862+/* mark jnode as queued and account it in the atom's counter of queued nodes */
14863+void mark_jnode_queued(flush_queue_t * fq, jnode * node)
14864+{
14865+ JF_SET(node, JNODE_FLUSH_QUEUED);
14866+ count_enqueued_node(fq);
14867+}
14868+
14869+/* Putting jnode into the flush queue. Both atom and jnode should be
14870+ spin-locked. */
14871+void queue_jnode(flush_queue_t * fq, jnode * node)
14872+{
14873+ assert_spin_locked(&(node->guard));
14874+ assert("zam-713", node->atom != NULL);
14875+ assert_spin_locked(&(node->atom->alock));
14876+ assert("zam-716", fq->atom != NULL);
14877+ assert("zam-717", fq->atom == node->atom);
14878+ assert("zam-907", fq_in_use(fq));
14879+
14880+ assert("zam-714", JF_ISSET(node, JNODE_DIRTY));
14881+ assert("zam-826", JF_ISSET(node, JNODE_RELOC));
14882+ assert("vs-1481", !JF_ISSET(node, JNODE_FLUSH_QUEUED));
14883+ assert("vs-1481", NODE_LIST(node) != FQ_LIST);
14884+
14885+ mark_jnode_queued(fq, node);
14886+ list_move_tail(&node->capture_link, ATOM_FQ_LIST(fq));
14887+
14888+ ON_DEBUG(count_jnode(node->atom, node, NODE_LIST(node),
14889+ FQ_LIST, 1));
14890+}
14891+
14892+/* repeatable process for waiting for i/o completion on a flush queue object */
14893+static int wait_io(flush_queue_t * fq, int *nr_io_errors)
14894+{
14895+ assert("zam-738", fq->atom != NULL);
14896+ assert_spin_locked(&(fq->atom->alock));
14897+ assert("zam-736", fq_in_use(fq));
14898+ assert("zam-911", list_empty_careful(ATOM_FQ_LIST(fq)));
14899+
14900+ if (atomic_read(&fq->nr_submitted) != 0) {
14901+ struct super_block *super;
14902+
14903+ spin_unlock_atom(fq->atom);
14904+
14905+ assert("nikita-3013", schedulable());
14906+
14907+ super = reiser4_get_current_sb();
14908+
14909+ /* FIXME: this is instead of blk_run_queues() */
14910+ blk_run_address_space(get_super_fake(super)->i_mapping);
14911+
14912+ if (!(super->s_flags & MS_RDONLY))
14913+ down(&fq->io_sem);
14914+
14915+ /* Ask the caller to re-acquire the locks and call this
14916+ function again. Note: this technique is commonly used in
14917+ the txnmgr code. */
14918+ return -E_REPEAT;
14919+ }
14920+
14921+ *nr_io_errors += atomic_read(&fq->nr_errors);
14922+ return 0;
14923+}
14924+
14925+/* wait on I/O completion, re-submit dirty nodes to write */
14926+static int finish_fq(flush_queue_t * fq, int *nr_io_errors)
14927+{
14928+ int ret;
14929+ txn_atom *atom = fq->atom;
14930+
14931+ assert("zam-801", atom != NULL);
14932+ assert_spin_locked(&(atom->alock));
14933+ assert("zam-762", fq_in_use(fq));
14934+
14935+ ret = wait_io(fq, nr_io_errors);
14936+ if (ret)
14937+ return ret;
14938+
14939+ detach_fq(fq);
14940+ done_fq(fq);
14941+
14942+ atom_send_event(atom);
14943+
14944+ return 0;
14945+}
14946+
14947+/* wait for all i/o for the given atom to be completed; actually does one
14948+   iteration of that and returns -E_REPEAT if more iterations are needed */
14949+static int finish_all_fq(txn_atom * atom, int *nr_io_errors)
14950+{
14951+ flush_queue_t *fq;
14952+
14953+ assert_spin_locked(&(atom->alock));
14954+
14955+ if (list_empty_careful(&atom->flush_queues))
14956+ return 0;
14957+
14958+ list_for_each_entry(fq, &atom->flush_queues, alink) {
14959+ if (fq_ready(fq)) {
14960+ int ret;
14961+
14962+ mark_fq_in_use(fq);
14963+ assert("vs-1247", fq->owner == NULL);
14964+ ON_DEBUG(fq->owner = current);
14965+ ret = finish_fq(fq, nr_io_errors);
14966+
14967+ if (*nr_io_errors)
14968+ reiser4_handle_error();
14969+
14970+ if (ret) {
14971+ fq_put(fq);
14972+ return ret;
14973+ }
14974+
14975+ spin_unlock_atom(atom);
14976+
14977+ return -E_REPEAT;
14978+ }
14979+ }
14980+
14981+ /* All flush queues are in use; atom remains locked */
14982+ return -EBUSY;
14983+}
14984+
14985+/* wait for all i/o for the current atom */
14986+int current_atom_finish_all_fq(void)
14987+{
14988+ txn_atom *atom;
14989+ int nr_io_errors = 0;
14990+ int ret = 0;
14991+
14992+ do {
14993+ while (1) {
14994+ atom = get_current_atom_locked();
14995+ ret = finish_all_fq(atom, &nr_io_errors);
14996+ if (ret != -EBUSY)
14997+ break;
14998+ atom_wait_event(atom);
14999+ }
15000+ } while (ret == -E_REPEAT);
15001+
15002+	/* we do not need the locked atom after this function finishes; SUCCESS
15003+	   and -EBUSY are the two return codes for which the atom remains locked
15004+	   after finish_all_fq */
15005+ if (!ret)
15006+ spin_unlock_atom(atom);
15007+
15008+ assert_spin_not_locked(&(atom->alock));
15009+
15010+ if (ret)
15011+ return ret;
15012+
15013+ if (nr_io_errors)
15014+ return RETERR(-EIO);
15015+
15016+ return 0;
15017+}
15018+
15019+/* change the node->atom field for all jnodes on the given list */
15020+static void
15021+scan_fq_and_update_atom_ref(struct list_head *list, txn_atom *atom)
15022+{
15023+ jnode *cur;
15024+
15025+ list_for_each_entry(cur, list, capture_link) {
15026+ spin_lock_jnode(cur);
15027+ cur->atom = atom;
15028+ spin_unlock_jnode(cur);
15029+ }
15030+}
15031+
15032+/* support for atom fusion operation */
15033+void fuse_fq(txn_atom *to, txn_atom *from)
15034+{
15035+ flush_queue_t *fq;
15036+
15037+ assert_spin_locked(&(to->alock));
15038+ assert_spin_locked(&(from->alock));
15039+
15040+ list_for_each_entry(fq, &from->flush_queues, alink) {
15041+ scan_fq_and_update_atom_ref(ATOM_FQ_LIST(fq), to);
15042+ spin_lock(&(fq->guard));
15043+ fq->atom = to;
15044+ spin_unlock(&(fq->guard));
15045+ }
15046+
15047+ list_splice_init(&from->flush_queues, to->flush_queues.prev);
15048+
15049+#if REISER4_DEBUG
15050+ to->num_queued += from->num_queued;
15051+ to->nr_flush_queues += from->nr_flush_queues;
15052+ from->nr_flush_queues = 0;
15053+#endif
15054+}
15055+
15056+#if REISER4_DEBUG
15057+int atom_fq_parts_are_clean(txn_atom * atom)
15058+{
15059+ assert("zam-915", atom != NULL);
15060+ return list_empty_careful(&atom->flush_queues);
15061+}
15062+#endif
15063+/* Bio i/o completion routine for reiser4 write operations. */
15064+static int
15065+end_io_handler(struct bio *bio, unsigned int bytes_done UNUSED_ARG,
15066+ int err)
15067+{
15068+ int i;
15069+ int nr_errors = 0;
15070+ flush_queue_t *fq;
15071+
15072+ assert("zam-958", bio->bi_rw & WRITE);
15073+
15074+ /* i/o op. is not fully completed */
15075+ if (bio->bi_size != 0)
15076+ return 1;
15077+
15078+ if (err == -EOPNOTSUPP)
15079+ set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
15080+
15081+	/* we expect that bio->bi_private is set to NULL or to an fq object which
15082+	 * is used for synchronization and error counting. */
15083+ fq = bio->bi_private;
15084+ /* Check all elements of io_vec for correct write completion. */
15085+ for (i = 0; i < bio->bi_vcnt; i += 1) {
15086+ struct page *pg = bio->bi_io_vec[i].bv_page;
15087+
15088+ if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
15089+ SetPageError(pg);
15090+ nr_errors++;
15091+ }
15092+
15093+ {
15094+ /* jnode WRITEBACK ("write is in progress bit") is
15095+ * atomically cleared here. */
15096+ jnode *node;
15097+
15098+ assert("zam-736", pg != NULL);
15099+ assert("zam-736", PagePrivate(pg));
15100+ node = jprivate(pg);
15101+
15102+ JF_CLR(node, JNODE_WRITEBACK);
15103+ }
15104+
15105+ end_page_writeback(pg);
15106+ page_cache_release(pg);
15107+ }
15108+
15109+ if (fq) {
15110+ /* count i/o error in fq object */
15111+ atomic_add(nr_errors, &fq->nr_errors);
15112+
15113+ /* If all write requests registered in this "fq" are done we up
15114+ * the semaphore. */
15115+ if (atomic_sub_and_test(bio->bi_vcnt, &fq->nr_submitted))
15116+ up(&fq->io_sem);
15117+ }
15118+
15119+ bio_put(bio);
15120+ return 0;
15121+}
15122+
15123+/* Count I/O requests which will be submitted by @bio in the given flush
15124+   queue @fq */
15125+void add_fq_to_bio(flush_queue_t * fq, struct bio *bio)
15126+{
15127+ bio->bi_private = fq;
15128+ bio->bi_end_io = end_io_handler;
15129+
15130+ if (fq)
15131+ atomic_add(bio->bi_vcnt, &fq->nr_submitted);
15132+}
15133+
15134+/* Move all queued nodes out of the @fq->prepped list. */
15135+static void release_prepped_list(flush_queue_t * fq)
15136+{
15137+ txn_atom *atom;
15138+
15139+ assert("zam-904", fq_in_use(fq));
15140+ atom = atom_locked_by_fq(fq);
15141+
15142+ while (!list_empty(ATOM_FQ_LIST(fq))) {
15143+ jnode *cur;
15144+
15145+ cur = list_entry(ATOM_FQ_LIST(fq)->next, jnode, capture_link);
15146+ list_del_init(&cur->capture_link);
15147+
15148+ count_dequeued_node(fq);
15149+ spin_lock_jnode(cur);
15150+ assert("nikita-3154", !JF_ISSET(cur, JNODE_OVRWR));
15151+ assert("nikita-3154", JF_ISSET(cur, JNODE_RELOC));
15152+ assert("nikita-3154", JF_ISSET(cur, JNODE_FLUSH_QUEUED));
15153+ JF_CLR(cur, JNODE_FLUSH_QUEUED);
15154+
15155+ if (JF_ISSET(cur, JNODE_DIRTY)) {
15156+ list_add_tail(&cur->capture_link,
15157+ ATOM_DIRTY_LIST(atom, jnode_get_level(cur)));
15158+ ON_DEBUG(count_jnode(atom, cur, FQ_LIST,
15159+ DIRTY_LIST, 1));
15160+ } else {
15161+ list_add_tail(&cur->capture_link, ATOM_CLEAN_LIST(atom));
15162+ ON_DEBUG(count_jnode(atom, cur, FQ_LIST,
15163+ CLEAN_LIST, 1));
15164+ }
15165+
15166+ spin_unlock_jnode(cur);
15167+ }
15168+
15169+ if (--atom->nr_running_queues == 0)
15170+ atom_send_event(atom);
15171+
15172+ spin_unlock_atom(atom);
15173+}
15174+
15175+/* Submit write requests for nodes on the already filled flush queue @fq.
15176+
15177+ @fq: flush queue object which contains jnodes we can (and will) write.
15178+   @return: number of submitted blocks (>=0) on success, otherwise an error
15179+   code (<0). */
15180+int write_fq(flush_queue_t * fq, long *nr_submitted, int flags)
15181+{
15182+ int ret;
15183+ txn_atom *atom;
15184+
15185+ while (1) {
15186+ atom = atom_locked_by_fq(fq);
15187+ assert("zam-924", atom);
15188+ /* do not write fq in parallel. */
15189+ if (atom->nr_running_queues == 0
15190+ || !(flags & WRITEOUT_SINGLE_STREAM))
15191+ break;
15192+ atom_wait_event(atom);
15193+ }
15194+
15195+ atom->nr_running_queues++;
15196+ spin_unlock_atom(atom);
15197+
15198+ ret = write_jnode_list(ATOM_FQ_LIST(fq), fq, nr_submitted, flags);
15199+ release_prepped_list(fq);
15200+
15201+ return ret;
15202+}
15203+
15204+/* Get a flush queue object for exclusive use by one thread. May require
15205+   several iterations, which is indicated by the -E_REPEAT return code.
15206+
15207+   This function does not contain code for obtaining an atom lock because an
15208+   atom lock is obtained in different ways in different parts of reiser4;
15209+   usually it is the current atom, but we also need the ability to get an fq
15210+   for the atom of a given jnode. */
15211+static int fq_by_atom_gfp(txn_atom *atom, flush_queue_t **new_fq, gfp_t gfp)
15212+{
15213+ flush_queue_t *fq;
15214+
15215+ assert_spin_locked(&(atom->alock));
15216+
15217+ fq = list_entry(atom->flush_queues.next, flush_queue_t, alink);
15218+ while (&atom->flush_queues != &fq->alink) {
15219+ spin_lock(&(fq->guard));
15220+
15221+ if (fq_ready(fq)) {
15222+ mark_fq_in_use(fq);
15223+ assert("vs-1246", fq->owner == NULL);
15224+ ON_DEBUG(fq->owner = current);
15225+ spin_unlock(&(fq->guard));
15226+
15227+ if (*new_fq)
15228+ done_fq(*new_fq);
15229+
15230+ *new_fq = fq;
15231+
15232+ return 0;
15233+ }
15234+
15235+ spin_unlock(&(fq->guard));
15236+
15237+ fq = list_entry(fq->alink.next, flush_queue_t, alink);
15238+ }
15239+
15240+ /* Use previously allocated fq object */
15241+ if (*new_fq) {
15242+ mark_fq_in_use(*new_fq);
15243+ assert("vs-1248", (*new_fq)->owner == 0);
15244+ ON_DEBUG((*new_fq)->owner = current);
15245+ attach_fq(atom, *new_fq);
15246+
15247+ return 0;
15248+ }
15249+
15250+ spin_unlock_atom(atom);
15251+
15252+ *new_fq = create_fq(gfp);
15253+
15254+ if (*new_fq == NULL)
15255+ return RETERR(-ENOMEM);
15256+
15257+ return RETERR(-E_REPEAT);
15258+}
15259+
15260+int fq_by_atom(txn_atom * atom, flush_queue_t ** new_fq)
15261+{
15262+ return fq_by_atom_gfp(atom, new_fq, get_gfp_mask());
15263+}
15264+
15265+/* A wrapper around fq_by_atom for getting a flush queue object for the
15266+ * current atom; on success fq->atom remains locked. */
15267+flush_queue_t *get_fq_for_current_atom(void)
15268+{
15269+ flush_queue_t *fq = NULL;
15270+ txn_atom *atom;
15271+ int ret;
15272+
15273+ do {
15274+ atom = get_current_atom_locked();
15275+ ret = fq_by_atom(atom, &fq);
15276+ } while (ret == -E_REPEAT);
15277+
15278+ if (ret)
15279+ return ERR_PTR(ret);
15280+ return fq;
15281+}
15282+
15283+/* Releasing flush queue object after exclusive use */
15284+void fq_put_nolock(flush_queue_t *fq)
15285+{
15286+ assert("zam-747", fq->atom != NULL);
15287+ assert("zam-902", list_empty_careful(ATOM_FQ_LIST(fq)));
15288+ mark_fq_ready(fq);
15289+ assert("vs-1245", fq->owner == current);
15290+ ON_DEBUG(fq->owner = NULL);
15291+}
15292+
15293+void fq_put(flush_queue_t * fq)
15294+{
15295+ txn_atom *atom;
15296+
15297+ spin_lock(&(fq->guard));
15298+ atom = atom_locked_by_fq_nolock(fq);
15299+
15300+ assert("zam-746", atom != NULL);
15301+
15302+ fq_put_nolock(fq);
15303+ atom_send_event(atom);
15304+
15305+ spin_unlock(&(fq->guard));
15306+ spin_unlock_atom(atom);
15307+}
15308+
15309+/* A part of atom object initialization related to the embedded flush queue
15310+ list head */
15311+
15312+void init_atom_fq_parts(txn_atom *atom)
15313+{
15314+ INIT_LIST_HEAD(&atom->flush_queues);
15315+}
15316+
15317+#if REISER4_DEBUG
15318+
15319+void check_fq(const txn_atom *atom)
15320+{
15321+ /* check number of nodes on all atom's flush queues */
15322+ flush_queue_t *fq;
15323+ int count;
15324+ struct list_head *pos;
15325+
15326+ count = 0;
15327+ list_for_each_entry(fq, &atom->flush_queues, alink) {
15328+ spin_lock(&(fq->guard));
15329+ /* calculate number of jnodes on fq' list of prepped jnodes */
15330+ list_for_each(pos, ATOM_FQ_LIST(fq))
15331+ count++;
15332+ spin_unlock(&(fq->guard));
15333+ }
15334+	if (count != atom->num_queued)
15335+		warning("", "fq counter %d, real %d\n", atom->num_queued, count);
15336+
15337+}
15338+
15339+#endif
15340+
15341+/*
15342+ * Local variables:
15343+ * c-indentation-style: "K&R"
15344+ * mode-name: "LC"
15345+ * c-basic-offset: 8
15346+ * tab-width: 8
15347+ * fill-column: 79
15348+ * scroll-step: 1
15349+ * End:
15350+ */
15351Index: linux-2.6.16/fs/reiser4/forward.h
15352===================================================================
15353--- /dev/null
15354+++ linux-2.6.16/fs/reiser4/forward.h
15355@@ -0,0 +1,258 @@
15356+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
15357+
15358+/* Forward declarations. Thank you Kernighan. */
15359+
15360+#if !defined( __REISER4_FORWARD_H__ )
15361+#define __REISER4_FORWARD_H__
15362+
15363+#include <asm/errno.h>
15364+#include <linux/types.h>
15365+
15366+typedef struct zlock zlock;
15367+typedef struct lock_stack lock_stack;
15368+typedef struct lock_handle lock_handle;
15369+typedef struct znode znode;
15370+typedef struct flow flow_t;
15371+typedef struct coord coord_t;
15372+typedef struct tree_access_pointer tap_t;
15373+typedef struct item_coord item_coord;
15374+typedef struct shift_params shift_params;
15375+typedef struct reiser4_object_create_data reiser4_object_create_data;
15376+typedef union reiser4_plugin reiser4_plugin;
15377+typedef __u16 reiser4_plugin_id;
15378+typedef struct item_plugin item_plugin;
15379+typedef struct jnode_plugin jnode_plugin;
15380+typedef struct reiser4_item_data reiser4_item_data;
15381+typedef union reiser4_key reiser4_key;
15382+typedef struct reiser4_tree reiser4_tree;
15383+typedef struct carry_cut_data carry_cut_data;
15384+typedef struct carry_kill_data carry_kill_data;
15385+typedef struct carry_tree_op carry_tree_op;
15386+typedef struct carry_tree_node carry_tree_node;
15387+typedef struct carry_plugin_info carry_plugin_info;
15388+typedef struct reiser4_journal reiser4_journal;
15389+typedef struct txn_atom txn_atom;
15390+typedef struct txn_handle txn_handle;
15391+typedef struct txn_mgr txn_mgr;
15392+typedef struct reiser4_dir_entry_desc reiser4_dir_entry_desc;
15393+typedef struct reiser4_context reiser4_context;
15394+typedef struct carry_level carry_level;
15395+typedef struct blocknr_set blocknr_set;
15396+typedef struct blocknr_set_entry blocknr_set_entry;
15397+/* super_block->s_fs_info points to this */
15398+typedef struct reiser4_super_info_data reiser4_super_info_data;
15399+/* next two objects are fields of reiser4_super_info_data */
15400+typedef struct reiser4_oid_allocator reiser4_oid_allocator;
15401+typedef struct reiser4_space_allocator reiser4_space_allocator;
15402+
15403+typedef struct flush_scan flush_scan;
15404+typedef struct flush_position flush_pos_t;
15405+
15406+typedef unsigned short pos_in_node_t;
15407+#define MAX_POS_IN_NODE 65535
15408+
15409+typedef struct jnode jnode;
15410+typedef struct reiser4_blocknr_hint reiser4_blocknr_hint;
15411+
15412+typedef struct uf_coord uf_coord_t;
15413+typedef struct hint hint_t;
15414+
15415+typedef struct ktxnmgrd_context ktxnmgrd_context;
15416+
15417+typedef struct reiser4_xattr_plugin reiser4_xattr_plugin;
15418+
15419+struct inode;
15420+struct page;
15421+struct file;
15422+struct dentry;
15423+struct super_block;
15424+
15425+/* return values of coord_by_key(). cbk == coord_by_key */
15426+typedef enum {
15427+ CBK_COORD_FOUND = 0,
15428+ CBK_COORD_NOTFOUND = -ENOENT,
15429+} lookup_result;
15430+
15431+/* results of lookup with directory file */
15432+typedef enum {
15433+ FILE_NAME_FOUND = 0,
15434+ FILE_NAME_NOTFOUND = -ENOENT,
15435+ FILE_IO_ERROR = -EIO, /* FIXME: it seems silly to have special OOM, IO_ERROR return codes for each search. */
15436+ FILE_OOM = -ENOMEM /* FIXME: it seems silly to have special OOM, IO_ERROR return codes for each search. */
15437+} file_lookup_result;
15438+
15439+/* behaviors of lookup. If the coord we are looking for is actually in the
15440+   tree, both behaviors coincide. */
15441+typedef enum {
15442+ /* search exactly for the coord with key given */
15443+ FIND_EXACT,
15444+ /* search for coord with the maximal key not greater than one
15445+ given */
15446+ FIND_MAX_NOT_MORE_THAN /*LEFT_SLANT_BIAS */
15447+} lookup_bias;
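+
+/* A worked example of the two biases, using coord_by_key() with a key of 20
+   in a tree that contains keys 10 and 30 (illustrative values only):
+
+	FIND_EXACT             -> CBK_COORD_NOTFOUND (no item with key 20)
+	FIND_MAX_NOT_MORE_THAN -> CBK_COORD_FOUND, coord set at the item
+	                          with key 10 (maximal key not greater than 20)
+*/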
15448+
15449+typedef enum {
15450+	/* level number of the leaf level of the tree.
15451+ The fake root has (tree_level=0). */
15452+ LEAF_LEVEL = 1,
15453+
15454+	/* level number of the level one above the leaf level of the tree.
15455+
15456+ It is supposed that internal tree used by reiser4 to store file
15457+ system data and meta data will have height 2 initially (when
15458+ created by mkfs).
15459+ */
15460+ TWIG_LEVEL = 2,
15461+} tree_level;
15462+
15463+/* The "real" maximum ztree height is the 0-origin size of any per-level
15464+ array, since the zero'th level is not used. */
15465+#define REAL_MAX_ZTREE_HEIGHT (REISER4_MAX_ZTREE_HEIGHT-LEAF_LEVEL)
15466+
15467+/* enumeration of possible mutual positions of an item and a coord. This enum
15468+   is the return type of the ->is_in_item() item plugin method (which see). */
15469+typedef enum {
15470+ /* coord is on the left of an item */
15471+ IP_ON_THE_LEFT,
15472+ /* coord is inside item */
15473+ IP_INSIDE,
15474+ /* coord is inside item, but to the right of the rightmost unit of
15475+ this item */
15476+ IP_RIGHT_EDGE,
15477+ /* coord is on the right of an item */
15478+ IP_ON_THE_RIGHT
15479+} interposition;
15480+
15481+/* type of lock to acquire on znode before returning it to caller */
15482+typedef enum {
15483+ ZNODE_NO_LOCK = 0,
15484+ ZNODE_READ_LOCK = 1,
15485+ ZNODE_WRITE_LOCK = 2,
15486+} znode_lock_mode;
15487+
15488+/* type of lock request */
15489+typedef enum {
15490+ ZNODE_LOCK_LOPRI = 0,
15491+ ZNODE_LOCK_HIPRI = (1 << 0),
15492+
15493+ /* By setting the ZNODE_LOCK_NONBLOCK flag in a lock request the call to longterm_lock_znode will not sleep
15494+ waiting for the lock to become available. If the lock is unavailable, reiser4_znode_lock will immediately
15495+ return the value -E_REPEAT. */
15496+ ZNODE_LOCK_NONBLOCK = (1 << 1),
15497+ /* An option for longterm_lock_znode which prevents atom fusion */
15498+ ZNODE_LOCK_DONT_FUSE = (1 << 2)
15499+} znode_lock_request;
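+
+/* A hypothetical usage sketch: the flags above are OR-ed into the request
+   argument of longterm_lock_znode(); variable names here are illustrative,
+   only the enum values are real:
+
+	// high-priority write lock that returns -E_REPEAT instead of sleeping
+	ret = longterm_lock_znode(&handle, node, ZNODE_WRITE_LOCK,
+				  ZNODE_LOCK_HIPRI | ZNODE_LOCK_NONBLOCK);
+*/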
15500+
15501+typedef enum { READ_OP = 0, WRITE_OP = 1 } rw_op;
15502+
15503+/* used to specify direction of shift. These must be -1 and 1 */
15504+typedef enum {
15505+ SHIFT_LEFT = 1,
15506+ SHIFT_RIGHT = -1
15507+} shift_direction;
15508+
15509+typedef enum {
15510+ LEFT_SIDE,
15511+ RIGHT_SIDE
15512+} sideof;
15513+
15514+#define round_up( value, order ) \
15515+ ( ( typeof( value ) )( ( ( long ) ( value ) + ( order ) - 1U ) & \
15516+ ~( ( order ) - 1 ) ) )
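+
+/* round_up() rounds @value up to the nearest multiple of @order, which must
+   be a power of two. A few illustrative evaluations:
+
+	round_up(10, 8)   == 16
+	round_up(16, 8)   == 16
+	round_up(1, 4096) == 4096
+*/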
15517+
15518+/* values returned by squalloc_right_neighbor and its auxiliary functions */
15519+typedef enum {
15520+ /* unit of internal item is moved */
15521+ SUBTREE_MOVED = 0,
15522+ /* nothing else can be squeezed into left neighbor */
15523+ SQUEEZE_TARGET_FULL = 1,
15524+ /* all content of node is squeezed into its left neighbor */
15525+ SQUEEZE_SOURCE_EMPTY = 2,
15526+ /* one more item is copied (this is only returned by
15527+	   allocate_and_copy_extent to squalloc_twig) */
15528+ SQUEEZE_CONTINUE = 3
15529+} squeeze_result;
15530+
15531+/* Do not change items ids. If you do - there will be format change */
15532+typedef enum {
15533+ STATIC_STAT_DATA_ID = 0x0,
15534+ SIMPLE_DIR_ENTRY_ID = 0x1,
15535+ COMPOUND_DIR_ID = 0x2,
15536+ NODE_POINTER_ID = 0x3,
15537+ EXTENT_POINTER_ID = 0x5,
15538+ FORMATTING_ID = 0x6,
15539+ CTAIL_ID = 0x7,
15540+ BLACK_BOX_ID = 0x8,
15541+ LAST_ITEM_ID = 0x9
15542+} item_id;
15543+
15544+/* Flags passed to jnode_flush() to allow it to distinguish default settings based on
15545+ whether commit() was called or VM memory pressure was applied. */
15546+typedef enum {
15547+ /* submit flush queue to disk at jnode_flush completion */
15548+ JNODE_FLUSH_WRITE_BLOCKS = 1,
15549+
15550+ /* flush is called for commit */
15551+ JNODE_FLUSH_COMMIT = 2,
15552+ /* not implemented */
15553+ JNODE_FLUSH_MEMORY_FORMATTED = 4,
15554+
15555+ /* not implemented */
15556+ JNODE_FLUSH_MEMORY_UNFORMATTED = 8,
15557+} jnode_flush_flags;
15558+
15559+/* Flags to insert/paste carry operations. Currently they are only used in
15560+   the flushing code, but in the future they can be used to optimize for
15561+   repetitive accesses. */
15562+typedef enum {
15563+ /* carry is not allowed to shift data to the left when trying to find
15564+ free space */
15565+ COPI_DONT_SHIFT_LEFT = (1 << 0),
15566+ /* carry is not allowed to shift data to the right when trying to find
15567+ free space */
15568+ COPI_DONT_SHIFT_RIGHT = (1 << 1),
15569+ /* carry is not allowed to allocate new node(s) when trying to find
15570+ free space */
15571+ COPI_DONT_ALLOCATE = (1 << 2),
15572+	/* try to load the left neighbor if it is not in the cache */
15573+	COPI_LOAD_LEFT = (1 << 3),
15574+	/* try to load the right neighbor if it is not in the cache */
15575+ COPI_LOAD_RIGHT = (1 << 4),
15576+ /* shift insertion point to the left neighbor */
15577+ COPI_GO_LEFT = (1 << 5),
15578+ /* shift insertion point to the right neighbor */
15579+ COPI_GO_RIGHT = (1 << 6),
15580+ /* try to step back into original node if insertion into new node
15581+ fails after shifting data there. */
15582+ COPI_STEP_BACK = (1 << 7)
15583+} cop_insert_flag;
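+
+/* These are bit flags and are meant to be OR-ed together. For instance, a
+   caller that must not disturb either neighbor and must not grow the tree
+   could pass a combination like this (hypothetical combination, real flags):
+
+	COPI_DONT_SHIFT_LEFT | COPI_DONT_SHIFT_RIGHT | COPI_DONT_ALLOCATE
+*/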
15584+
15585+typedef enum {
15586+ SAFE_UNLINK, /* safe-link for unlink */
15587+ SAFE_TRUNCATE /* safe-link for truncate */
15588+} reiser4_safe_link_t;
15589+
15590+/* this is to show on which of the atom's lists a jnode is */
15591+typedef enum {
15592+ NOT_CAPTURED,
15593+ DIRTY_LIST,
15594+ CLEAN_LIST,
15595+ FQ_LIST,
15596+ WB_LIST,
15597+ OVRWR_LIST
15598+} atom_list;
15599+
15600+
15601+
15602+/* __REISER4_FORWARD_H__ */
15603+#endif
15604+
15605+/* Make Linus happy.
15606+ Local variables:
15607+ c-indentation-style: "K&R"
15608+ mode-name: "LC"
15609+ c-basic-offset: 8
15610+ tab-width: 8
15611+ fill-column: 120
15612+ End:
15613+*/
15614Index: linux-2.6.16/fs/reiser4/fsdata.c
15615===================================================================
15616--- /dev/null
15617+++ linux-2.6.16/fs/reiser4/fsdata.c
15618@@ -0,0 +1,803 @@
15619+/* Copyright 2001, 2002, 2003, 2004, 2005 by Hans Reiser, licensing governed by
15620+ * reiser4/README */
15621+
15622+#include "fsdata.h"
15623+#include "inode.h"
15624+
15625+
15626+/* cache of dir_cursors */
15627+static kmem_cache_t *d_cursor_cache;
15628+static struct shrinker *d_cursor_shrinker;
15629+
15630+/* list of unused cursors */
15631+static LIST_HEAD(cursor_cache);
15632+
15633+/* number of cursors in the list of unused cursors */
15634+static unsigned long d_cursor_unused = 0;
15635+
15636+/* spinlock protecting manipulations with dir_cursor's hash table and lists */
15637+DEFINE_SPINLOCK(d_lock);
15638+
15639+static reiser4_file_fsdata *create_fsdata(struct file *file);
15640+static int file_is_stateless(struct file *file);
15641+static void free_fsdata(reiser4_file_fsdata *fsdata);
15642+static void kill_cursor(dir_cursor *);
15643+
15644+/**
15645+ * d_cursor_shrink - shrink callback for cache of dir_cursor-s
15646+ * @nr: number of objects to free
15647+ * @mask: GFP mask
15648+ *
15649+ * Shrinks d_cursor_cache: scans the LRU list of unused cursors, freeing the
15650+ * requested number. Returns the number of still freeable cursors.
15651+ */
15652+static int d_cursor_shrink(int nr, gfp_t mask)
15653+{
15654+ if (nr != 0) {
15655+ dir_cursor *scan;
15656+ int killed;
15657+
15658+ killed = 0;
15659+ spin_lock(&d_lock);
15660+ while (!list_empty(&cursor_cache)) {
15661+ scan = list_entry(cursor_cache.next, dir_cursor, alist);
15662+ assert("nikita-3567", scan->ref == 0);
15663+ kill_cursor(scan);
15664+ ++killed;
15665+ --nr;
15666+ if (nr == 0)
15667+ break;
15668+ }
15669+ spin_unlock(&d_lock);
15670+ }
15671+ return d_cursor_unused;
15672+}
15673+
15674+/**
15675+ * init_d_cursor - create d_cursor cache
15676+ *
15677+ * Initializes slab cache of d_cursors. It is part of reiser4 module
15678+ * initialization.
15679+ */
15680+int init_d_cursor(void)
15681+{
15682+ d_cursor_cache = kmem_cache_create("d_cursor", sizeof(dir_cursor), 0,
15683+ SLAB_HWCACHE_ALIGN, NULL, NULL);
15684+ if (d_cursor_cache == NULL)
15685+ return RETERR(-ENOMEM);
15686+
15687+ /*
15688+ * actually, d_cursors are "priceless", because there is no way to
15689+ * recover information stored in them. On the other hand, we don't
15690+ * want to consume all kernel memory by them. As a compromise, just
15691+ * assign higher "seeks" value to d_cursor cache, so that it will be
15692+ * shrunk only if system is really tight on memory.
15693+ */
15694+ d_cursor_shrinker = set_shrinker(DEFAULT_SEEKS << 3,
15695+ d_cursor_shrink);
15696+ if (d_cursor_shrinker == NULL) {
15697+ destroy_reiser4_cache(&d_cursor_cache);
15698+ d_cursor_cache = NULL;
15699+ return RETERR(-ENOMEM);
15700+ }
15701+ return 0;
15702+}
15703+
15704+/**
15705+ * done_d_cursor - delete d_cursor cache and d_cursor shrinker
15706+ *
15707+ * This is called on reiser4 module unloading or system shutdown.
15708+ */
15709+void done_d_cursor(void)
15710+{
15711+ BUG_ON(d_cursor_shrinker == NULL);
15712+ remove_shrinker(d_cursor_shrinker);
15713+ d_cursor_shrinker = NULL;
15714+
15715+ destroy_reiser4_cache(&d_cursor_cache);
15716+}
15717+
15718+#define D_CURSOR_TABLE_SIZE (256)
15719+
15720+static inline unsigned long
15721+d_cursor_hash(d_cursor_hash_table *table, const d_cursor_key *key)
15722+{
15723+ assert("nikita-3555", IS_POW(D_CURSOR_TABLE_SIZE));
15724+ return (key->oid + key->cid) & (D_CURSOR_TABLE_SIZE - 1);
15725+}
15726+
15727+static inline int d_cursor_eq(const d_cursor_key *k1, const d_cursor_key *k2)
15728+{
15729+ return k1->cid == k2->cid && k1->oid == k2->oid;
15730+}
15731+
15732+/*
15733+ * define functions to manipulate reiser4 super block's hash table of
15734+ * dir_cursors
15735+ */
15736+#define KMALLOC(size) kmalloc((size), get_gfp_mask())
15737+#define KFREE(ptr, size) kfree(ptr)
15738+TYPE_SAFE_HASH_DEFINE(d_cursor,
15739+ dir_cursor,
15740+ d_cursor_key, key, hash, d_cursor_hash, d_cursor_eq);
15741+#undef KFREE
15742+#undef KMALLOC
15743+
15744+/**
15745+ * init_super_d_info - initialize per-super-block d_cursor resources
15746+ * @super: super block to initialize
15747+ *
15748+ * Initializes per-super-block d_cursor's hash table and radix tree. It is part
15749+ * of mount.
15750+ */
15751+int init_super_d_info(struct super_block *super)
15752+{
15753+ d_cursor_info *p;
15754+
15755+ p = &get_super_private(super)->d_info;
15756+
15757+ INIT_RADIX_TREE(&p->tree, get_gfp_mask());
15758+ return d_cursor_hash_init(&p->table, D_CURSOR_TABLE_SIZE);
15759+}
15760+
15761+/**
15762+ * done_super_d_info - release per-super-block d_cursor resources
15763+ * @super: super block being umounted
15764+ *
15765+ * It is called on umount. Kills all directory cursors attached to the super block.
15766+ */
15767+void done_super_d_info(struct super_block *super)
15768+{
15769+ d_cursor_info *d_info;
15770+ dir_cursor *cursor, *next;
15771+
15772+ d_info = &get_super_private(super)->d_info;
15773+ for_all_in_htable(&d_info->table, d_cursor, cursor, next)
15774+ kill_cursor(cursor);
15775+
15776+ BUG_ON(d_info->tree.rnode != NULL);
15777+ d_cursor_hash_done(&d_info->table);
15778+}
15779+
15780+/**
15781+ * kill_cursor - free dir_cursor and reiser4_file_fsdata attached to it
15782+ * @cursor: cursor to free
15783+ *
15784+ * Removes reiser4_file_fsdata attached to @cursor from readdir list of
15785+ * reiser4_inode, frees that reiser4_file_fsdata. Removes @cursor from the
15786+ * indices, hash table and list of unused cursors, and frees it.
15787+ */
15788+static void kill_cursor(dir_cursor *cursor)
15789+{
15790+ unsigned long index;
15791+
15792+ assert("nikita-3566", cursor->ref == 0);
15793+ assert("nikita-3572", cursor->fsdata != NULL);
15794+
15795+ index = (unsigned long)cursor->key.oid;
15796+ list_del_init(&cursor->fsdata->dir.linkage);
15797+ free_fsdata(cursor->fsdata);
15798+ cursor->fsdata = NULL;
15799+
15800+ if (list_empty_careful(&cursor->list))
15801+ /* this is last cursor for a file. Kill radix-tree entry */
15802+ radix_tree_delete(&cursor->info->tree, index);
15803+ else {
15804+ void **slot;
15805+
15806+ /*
15807+ * there are other cursors for the same oid.
15808+ */
15809+
15810+ /*
15811+		 * if the radix tree points to the cursor being removed,
15812+		 * re-target the radix tree slot to the next cursor in the
15813+		 * (non-empty, as was checked above) circular list of all
15814+		 * cursors for this oid.
15815+ */
15816+ slot = radix_tree_lookup_slot(&cursor->info->tree, index);
15817+ assert("nikita-3571", *slot != NULL);
15818+ if (*slot == cursor)
15819+ *slot = list_entry(cursor->list.next, dir_cursor, list);
15820+ /* remove cursor from circular list */
15821+ list_del_init(&cursor->list);
15822+ }
15823+ /* remove cursor from the list of unused cursors */
15824+ list_del_init(&cursor->alist);
15825+ /* remove cursor from the hash table */
15826+ d_cursor_hash_remove(&cursor->info->table, cursor);
15827+ /* and free it */
15828+ kmem_cache_free(d_cursor_cache, cursor);
15829+ --d_cursor_unused;
15830+}
15831+
15832+/* possible actions that can be performed on all cursors for the given file */
15833+enum cursor_action {
15834+ /*
15835+ * load all detached state: this is called when stat-data is loaded
15836+ * from the disk to recover information about all pending readdirs
15837+ */
15838+ CURSOR_LOAD,
15839+ /*
15840+ * detach all state from inode, leaving it in the cache. This is called
15841+ * when inode is removed form the memory by memory pressure
15842+ */
15843+ CURSOR_DISPOSE,
15844+ /*
15845+ * detach cursors from the inode, and free them. This is called when
15846+ * inode is destroyed
15847+ */
15848+ CURSOR_KILL
15849+};
15850+
15851+/*
15852+ * return d_cursor data for the file system @inode is in.
15853+ */
15854+static inline d_cursor_info *d_info(struct inode *inode)
15855+{
15856+ return &get_super_private(inode->i_sb)->d_info;
15857+}
15858+
15859+/*
15860+ * lookup d_cursor in the per-super-block radix tree.
15861+ */
15862+static inline dir_cursor *lookup(d_cursor_info * info, unsigned long index)
15863+{
15864+ return (dir_cursor *) radix_tree_lookup(&info->tree, index);
15865+}
15866+
15867+/*
15868+ * attach @cursor to the radix tree. There may be multiple cursors for the
15869+ * same oid, they are chained into circular list.
15870+ */
15871+static void bind_cursor(dir_cursor * cursor, unsigned long index)
15872+{
15873+ dir_cursor *head;
15874+
15875+ head = lookup(cursor->info, index);
15876+ if (head == NULL) {
15877+ /* this is the first cursor for this index */
15878+ INIT_LIST_HEAD(&cursor->list);
15879+ radix_tree_insert(&cursor->info->tree, index, cursor);
15880+ } else {
15881+ /* some cursor already exists. Chain ours */
15882+ list_add(&cursor->list, &head->list);
15883+ }
15884+}
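+
+/* The resulting structure, sketched for one oid with three cursors; the
+   radix tree slot references one cursor and the rest are reachable through
+   the circular ->list (illustrative diagram only):
+
+	radix_tree[oid] --> cursor A <-> cursor B <-> cursor C <-> (back to A)
+*/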
15885+
15886+/*
15887+ * detach fsdata (if detachable) from file descriptor, and put cursor on the
15888+ * "unused" list. Called when file descriptor is not longer in active use.
15889+ */
15890+static void clean_fsdata(struct file *file)
15891+{
15892+ dir_cursor *cursor;
15893+ reiser4_file_fsdata *fsdata;
15894+
15895+ assert("nikita-3570", file_is_stateless(file));
15896+
15897+ fsdata = (reiser4_file_fsdata *) file->private_data;
15898+ if (fsdata != NULL) {
15899+ cursor = fsdata->cursor;
15900+ if (cursor != NULL) {
15901+ spin_lock(&d_lock);
15902+ --cursor->ref;
15903+ if (cursor->ref == 0) {
15904+ list_add_tail(&cursor->alist, &cursor_cache);
15905+ ++d_cursor_unused;
15906+ }
15907+ spin_unlock(&d_lock);
15908+ file->private_data = NULL;
15909+ }
15910+ }
15911+}
15912+
15913+/*
15914+ * global counter used to generate "client ids". These ids are encoded into
15915+ * high bits of fpos.
15916+ */
15917+static __u32 cid_counter = 0;
15918+#define CID_SHIFT (20)
15919+#define CID_MASK (0xfffffull)
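+
+/* How a stateless readdir cookie is laid out under this scheme; a sketch of
+   the encoding and decoding done by insert_cursor(), try_to_attach_fsdata()
+   and get_dir_fpos() below:
+
+	cookie = ((__u64)cid << CID_SHIFT) | pos;	// pos fits in CID_MASK
+	cid    = cookie >> CID_SHIFT;
+	pos    = cookie & CID_MASK;
+*/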
15920+
15921+static void free_file_fsdata_nolock(struct file *);
15922+
15923+/**
15924+ * insert_cursor - allocate file_fsdata, insert cursor to tree and hash table
15925+ * @cursor: cursor to insert
15926+ * @file: file to attach detachable readdir state to
15927+ * @inode: inode of the directory being read
15928+ *
15929+ * Allocates reiser4_file_fsdata, attaches it to @cursor, inserts cursor to
15930+ * reiser4 super block's hash table and radix tree, and adds detachable
15931+ * readdir state to @file. On success, @file->f_pos is set to the new
15932+ * cursor's starting cookie.
15933+ */
15934+static int insert_cursor(dir_cursor *cursor, struct file *file,
15935+ struct inode *inode)
15936+{
15937+ int result;
15938+ reiser4_file_fsdata *fsdata;
15939+
15940+ memset(cursor, 0, sizeof *cursor);
15941+
15942+ /* this is either first call to readdir, or rewind. Anyway, create new
15943+ * cursor. */
15944+ fsdata = create_fsdata(NULL);
15945+ if (fsdata != NULL) {
15946+ result = radix_tree_preload(get_gfp_mask());
15947+ if (result == 0) {
15948+ d_cursor_info *info;
15949+ oid_t oid;
15950+
15951+ info = d_info(inode);
15952+ oid = get_inode_oid(inode);
15953+ /* cid occupies higher 12 bits of f->f_pos. Don't
15954+ * allow it to become negative: this confuses
15955+ * nfsd_readdir() */
15956+ cursor->key.cid = (++cid_counter) & 0x7ff;
15957+ cursor->key.oid = oid;
15958+ cursor->fsdata = fsdata;
15959+ cursor->info = info;
15960+ cursor->ref = 1;
15961+
15962+ spin_lock_inode(inode);
15963+ /* install cursor as @f's private_data, discarding old
15964+ * one if necessary */
15965+#if REISER4_DEBUG
15966+ if (file->private_data)
15967+ warning("", "file has fsdata already");
15968+#endif
15969+ clean_fsdata(file);
15970+ free_file_fsdata_nolock(file);
15971+ file->private_data = fsdata;
15972+ fsdata->cursor = cursor;
15973+ spin_unlock_inode(inode);
15974+ spin_lock(&d_lock);
15975+ /* insert cursor into hash table */
15976+ d_cursor_hash_insert(&info->table, cursor);
15977+ /* and chain it into radix-tree */
15978+ bind_cursor(cursor, (unsigned long)oid);
15979+ spin_unlock(&d_lock);
15980+ radix_tree_preload_end();
15981+ file->f_pos = ((__u64) cursor->key.cid) << CID_SHIFT;
15982+ }
15983+ } else
15984+ result = RETERR(-ENOMEM);
15985+ return result;
15986+}
15987+
15988+/**
15989+ * process_cursors - do action on each cursor attached to inode
15990+ * @inode:
15991+ * @act: action to do
15992+ *
15993+ * Finds all cursors of @inode in reiser4's super block radix tree of cursors
15994+ * and performs action specified by @act on each of cursors.
15995+ */
15996+static void process_cursors(struct inode *inode, enum cursor_action act)
15997+{
15998+ oid_t oid;
15999+ dir_cursor *start;
16000+ struct list_head *head;
16001+ reiser4_context *ctx;
16002+ d_cursor_info *info;
16003+
16004+ /* this can be called by
16005+ *
16006+ * kswapd->...->prune_icache->..reiser4_destroy_inode
16007+ *
16008+ * without reiser4_context
16009+ */
16010+ ctx = init_context(inode->i_sb);
16011+ if (IS_ERR(ctx)) {
16012+ warning("vs-23", "failed to init context");
16013+ return;
16014+ }
16015+
16016+ assert("nikita-3558", inode != NULL);
16017+
16018+ info = d_info(inode);
16019+ oid = get_inode_oid(inode);
16020+ spin_lock_inode(inode);
16021+ head = get_readdir_list(inode);
16022+ spin_lock(&d_lock);
16023+	/* find any cursor for this oid: a reference to it is hanging off the
16024+	 * radix tree */
16025+ start = lookup(info, (unsigned long)oid);
16026+ if (start != NULL) {
16027+ dir_cursor *scan;
16028+ reiser4_file_fsdata *fsdata;
16029+
16030+ /* process circular list of cursors for this oid */
16031+ scan = start;
16032+ do {
16033+ dir_cursor *next;
16034+
16035+ next = list_entry(scan->list.next, dir_cursor, list);
16036+ fsdata = scan->fsdata;
16037+ assert("nikita-3557", fsdata != NULL);
16038+ if (scan->key.oid == oid) {
16039+ switch (act) {
16040+ case CURSOR_DISPOSE:
16041+ list_del_init(&fsdata->dir.linkage);
16042+ break;
16043+ case CURSOR_LOAD:
16044+ list_add(&fsdata->dir.linkage, head);
16045+ break;
16046+ case CURSOR_KILL:
16047+ kill_cursor(scan);
16048+ break;
16049+ }
16050+ }
16051+ if (scan == next)
16052+ /* last cursor was just killed */
16053+ break;
16054+ scan = next;
16055+ } while (scan != start);
16056+ }
16057+ spin_unlock(&d_lock);
16058+ /* check that we killed 'em all */
16059+ assert("nikita-3568",
16060+ ergo(act == CURSOR_KILL,
16061+ list_empty_careful(get_readdir_list(inode))));
16062+ assert("nikita-3569",
16063+ ergo(act == CURSOR_KILL, lookup(info, oid) == NULL));
16064+ spin_unlock_inode(inode);
16065+ reiser4_exit_context(ctx);
16066+}
16067+
16068+/**
16069+ * dispose_cursors - removes cursors from inode's list
16070+ * @inode: inode to dispose cursors of
16071+ *
16072+ * For each of cursors corresponding to @inode - removes reiser4_file_fsdata
16073+ * attached to cursor from inode's readdir list. This is called when inode is
16074+ * removed from the memory by memory pressure.
16075+ */
16076+void dispose_cursors(struct inode *inode)
16077+{
16078+ process_cursors(inode, CURSOR_DISPOSE);
16079+}
16080+
16081+/**
16082+ * load_cursors - attach cursors to inode
16083+ * @inode: inode to load cursors to
16084+ *
16085+ * For each of cursors corresponding to @inode - attaches reiser4_file_fsdata
16086+ * attached to cursor to inode's readdir list. This is done when inode is
16087+ * loaded into memory.
16088+ */
16089+void load_cursors(struct inode *inode)
16090+{
16091+ process_cursors(inode, CURSOR_LOAD);
16092+}
16093+
16094+/**
16095+ * kill_cursors - kill all inode cursors
16096+ * @inode: inode to kill cursors of
16097+ *
16098+ * Frees all cursors for this inode. This is called when inode is destroyed.
16099+ */
16100+void kill_cursors(struct inode *inode)
16101+{
16102+ process_cursors(inode, CURSOR_KILL);
16103+}
16104+
16105+/**
16106+ * file_is_stateless - check whether file descriptor was created on demand
16107+ * @file: file to check
16108+ *
16109+ * Returns true if file descriptor @file was created by the NFS server on
16110+ * demand to serve one file system operation. This means that there may be
16111+ * "detached state" for the underlying inode.
16112+ */
16113+static int file_is_stateless(struct file *file)
16114+{
16115+ return reiser4_get_dentry_fsdata(file->f_dentry)->stateless;
16116+}
16117+
16118+/**
16119+ * get_dir_fpos - calculate logical readdir position
16120+ * @dir: directory file
16121+ *
16122+ * Calculates ->fpos from user-supplied cookie. Normally it is dir->f_pos, but
16123+ * in the case of stateless directory operation (readdir-over-nfs), the client
16124+ * id was encoded in the high bits of the cookie and should be masked off.
16125+ */
16126+loff_t get_dir_fpos(struct file *dir)
16127+{
16128+ if (file_is_stateless(dir))
16129+ return dir->f_pos & CID_MASK;
16130+ else
16131+ return dir->f_pos;
16132+}
16133+
16134+/**
16135+ * try_to_attach_fsdata - attach readdir state to a stateless file
16136+ * @file: file to attach fsdata to
16137+ * @inode: inode of the directory being read
16138+ *
16139+ * Finds or creates cursor for readdir-over-nfs.
16140+ */
16141+int try_to_attach_fsdata(struct file *file, struct inode *inode)
16142+{
16143+ loff_t pos;
16144+ int result;
16145+ dir_cursor *cursor;
16146+
16147+ /*
16148+ * we are serialized by inode->i_mutex
16149+ */
16150+ if (!file_is_stateless(file))
16151+ return 0;
16152+
16153+ pos = file->f_pos;
16154+ result = 0;
16155+ if (pos == 0) {
16156+ /*
16157+ * first call to readdir (or rewind to the beginning of
16158+ * directory)
16159+ */
16160+ cursor = kmem_cache_alloc(d_cursor_cache, get_gfp_mask());
16161+ if (cursor != NULL)
16162+ result = insert_cursor(cursor, file, inode);
16163+ else
16164+ result = RETERR(-ENOMEM);
16165+ } else {
16166+ /* try to find existing cursor */
16167+ d_cursor_key key;
16168+
16169+ key.cid = pos >> CID_SHIFT;
16170+ key.oid = get_inode_oid(inode);
16171+ spin_lock(&d_lock);
16172+ cursor = d_cursor_hash_find(&d_info(inode)->table, &key);
16173+ if (cursor != NULL) {
16174+ /* cursor was found */
16175+ if (cursor->ref == 0) {
16176+ /* move it from unused list */
16177+ list_del_init(&cursor->alist);
16178+ --d_cursor_unused;
16179+ }
16180+ ++cursor->ref;
16181+ }
16182+ spin_unlock(&d_lock);
16183+ if (cursor != NULL) {
16184+ spin_lock_inode(inode);
16185+ assert("nikita-3556", cursor->fsdata->back == NULL);
16186+ clean_fsdata(file);
16187+ free_file_fsdata_nolock(file);
16188+ file->private_data = cursor->fsdata;
16189+ spin_unlock_inode(inode);
16190+ }
16191+ }
16192+ return result;
16193+}
16194+
16195+/**
16196+ * detach_fsdata - detach fsdata from a stateless file
16197+ * @file: file to detach fsdata from
16198+ *
16199+ * Detaches fsdata from @file if it was created for a stateless file descriptor.
16200+ */
16201+void detach_fsdata(struct file *file)
16202+{
16203+ struct inode *inode;
16204+
16205+ if (!file_is_stateless(file))
16206+ return;
16207+
16208+ inode = file->f_dentry->d_inode;
16209+ spin_lock_inode(inode);
16210+ clean_fsdata(file);
16211+ spin_unlock_inode(inode);
16212+}
16213+
16214+/* slab for reiser4_dentry_fsdata */
16215+static kmem_cache_t *dentry_fsdata_cache;
16216+
16217+/**
16218+ * init_dentry_fsdata - create cache of dentry_fsdata
16219+ *
16220+ * Initializes slab cache of structures attached to dentry->d_fsdata. It is
16221+ * part of reiser4 module initialization.
16222+ */
16223+int init_dentry_fsdata(void)
16224+{
16225+ dentry_fsdata_cache = kmem_cache_create("dentry_fsdata",
16226+ sizeof(reiser4_dentry_fsdata),
16227+ 0,
16228+ SLAB_HWCACHE_ALIGN |
16229+ SLAB_RECLAIM_ACCOUNT, NULL,
16230+ NULL);
16231+ if (dentry_fsdata_cache == NULL)
16232+ return RETERR(-ENOMEM);
16233+ return 0;
16234+}
16235+
16236+/**
16237+ * done_dentry_fsdata - delete cache of dentry_fsdata
16238+ *
16239+ * This is called on reiser4 module unloading or system shutdown.
16240+ */
16241+void done_dentry_fsdata(void)
16242+{
16243+ destroy_reiser4_cache(&dentry_fsdata_cache);
16244+}
16245+
16246+/**
16247+ * reiser4_get_dentry_fsdata - get fs-specific dentry data
16248+ * @dentry: queried dentry
16249+ *
16250+ * Allocates if necessary and returns per-dentry data that we attach to each
16251+ * dentry.
16252+ */
16253+reiser4_dentry_fsdata *reiser4_get_dentry_fsdata(struct dentry *dentry)
16254+{
16255+ assert("nikita-1365", dentry != NULL);
16256+
16257+ if (dentry->d_fsdata == NULL) {
16258+ dentry->d_fsdata = kmem_cache_alloc(dentry_fsdata_cache,
16259+ get_gfp_mask());
16260+ if (dentry->d_fsdata == NULL)
16261+ return ERR_PTR(RETERR(-ENOMEM));
16262+ memset(dentry->d_fsdata, 0, sizeof(reiser4_dentry_fsdata));
16263+ }
16264+ return dentry->d_fsdata;
16265+}
16266+
16267+/**
16268+ * reiser4_free_dentry_fsdata - detach and free dentry_fsdata
16269+ * @dentry: dentry to free fsdata of
16270+ *
16271+ * Detaches and frees fs-specific dentry data
16272+ */
16273+void reiser4_free_dentry_fsdata(struct dentry *dentry)
16274+{
16275+ if (dentry->d_fsdata != NULL) {
16276+ kmem_cache_free(dentry_fsdata_cache, dentry->d_fsdata);
16277+ dentry->d_fsdata = NULL;
16278+ }
16279+}
16280+
16281+
16282+/* slab for reiser4_file_fsdata */
16283+static kmem_cache_t *file_fsdata_cache;
16284+
16285+/**
16286+ * init_file_fsdata - create cache of reiser4_file_fsdata
16287+ *
16288+ * Initializes slab cache of structures attached to file->private_data. It is
16289+ * part of reiser4 module initialization.
16290+ */
16291+int init_file_fsdata(void)
16292+{
16293+ file_fsdata_cache = kmem_cache_create("file_fsdata",
16294+ sizeof(reiser4_file_fsdata),
16295+ 0,
16296+ SLAB_HWCACHE_ALIGN |
16297+ SLAB_RECLAIM_ACCOUNT, NULL, NULL);
16298+ if (file_fsdata_cache == NULL)
16299+ return RETERR(-ENOMEM);
16300+ return 0;
16301+}
16302+
16303+/**
16304+ * done_file_fsdata - delete cache of reiser4_file_fsdata
16305+ *
16306+ * This is called on reiser4 module unloading or system shutdown.
16307+ */
16308+void done_file_fsdata(void)
16309+{
16310+ destroy_reiser4_cache(&file_fsdata_cache);
16311+}
16312+
16313+/**
16314+ * create_fsdata - allocate and initialize reiser4_file_fsdata
16315+ * @file: what to create file_fsdata for, may be NULL
16316+ *
16317+ * Allocates and initializes reiser4_file_fsdata structure.
16318+ */
16319+static reiser4_file_fsdata *create_fsdata(struct file *file)
16320+{
16321+ reiser4_file_fsdata *fsdata;
16322+
16323+ fsdata = kmem_cache_alloc(file_fsdata_cache, get_gfp_mask());
16324+ if (fsdata != NULL) {
16325+ memset(fsdata, 0, sizeof *fsdata);
16326+ fsdata->ra1.max_window_size = VM_MAX_READAHEAD * 1024;
16327+ fsdata->back = file;
16328+ INIT_LIST_HEAD(&fsdata->dir.linkage);
16329+ }
16330+ return fsdata;
16331+}
16332+
16333+/**
16334+ * free_fsdata - free reiser4_file_fsdata
16335+ * @fsdata: object to free
16336+ *
16337+ * Dual to create_fsdata(). Free reiser4_file_fsdata.
16338+ */
16339+static void free_fsdata(reiser4_file_fsdata *fsdata)
16340+{
16341+ BUG_ON(fsdata == NULL);
16342+ kmem_cache_free(file_fsdata_cache, fsdata);
16343+}
16344+
16345+/**
16346+ * reiser4_get_file_fsdata - get fs-specific file data
16347+ * @file: queried file
16348+ *
16349+ * Returns fs-specific data of @file. If it is NULL, allocates it and attaches
16350+ * to @file.
16351+ */
16352+reiser4_file_fsdata *reiser4_get_file_fsdata(struct file *file)
16353+{
16354+ assert("nikita-1603", file != NULL);
16355+
16356+ if (file->private_data == NULL) {
16357+ reiser4_file_fsdata *fsdata;
16358+ struct inode *inode;
16359+
16360+ fsdata = create_fsdata(file);
16361+ if (fsdata == NULL)
16362+ return ERR_PTR(RETERR(-ENOMEM));
16363+
16364+ inode = file->f_dentry->d_inode;
16365+ spin_lock_inode(inode);
16366+ if (file->private_data == NULL) {
16367+ file->private_data = fsdata;
16368+ fsdata = NULL;
16369+ }
16370+ spin_unlock_inode(inode);
16371+ if (fsdata != NULL)
16372+ /* other thread initialized ->fsdata */
16373+ kmem_cache_free(file_fsdata_cache, fsdata);
16374+ }
16375+ assert("nikita-2665", file->private_data != NULL);
16376+ return file->private_data;
16377+}
16378+
16379+/**
16380+ * free_file_fsdata_nolock - detach and free reiser4_file_fsdata
16381+ * @file:
16382+ *
16383+ * Detaches reiser4_file_fsdata from @file, removes reiser4_file_fsdata from
16384+ * readdir list, frees if it is not linked to d_cursor object.
16385+ */
16386+static void free_file_fsdata_nolock(struct file *file)
16387+{
16388+ reiser4_file_fsdata *fsdata;
16389+
16390+ assert("", spin_inode_is_locked(file->f_dentry->d_inode));
16391+ fsdata = file->private_data;
16392+ if (fsdata != NULL) {
16393+ list_del_init(&fsdata->dir.linkage);
16394+ if (fsdata->cursor == NULL)
16395+ free_fsdata(fsdata);
16396+ }
16397+ file->private_data = NULL;
16398+}
16399+
16400+/**
16401+ * reiser4_free_file_fsdata - detach from struct file and free reiser4_file_fsdata
16402+ * @file:
16403+ *
16404+ * Spinlocks inode and calls free_file_fsdata_nolock to do the work.
16405+ */
16406+void reiser4_free_file_fsdata(struct file *file)
16407+{
16408+ spin_lock_inode(file->f_dentry->d_inode);
16409+ free_file_fsdata_nolock(file);
16410+ spin_unlock_inode(file->f_dentry->d_inode);
16411+}
16412+
16413+/*
16414+ * Local variables:
16415+ * c-indentation-style: "K&R"
16416+ * mode-name: "LC"
16417+ * c-basic-offset: 8
16418+ * tab-width: 8
16419+ * fill-column: 79
16420+ * End:
16421+ */
16422Index: linux-2.6.16/fs/reiser4/fsdata.h
16423===================================================================
16424--- /dev/null
16425+++ linux-2.6.16/fs/reiser4/fsdata.h
16426@@ -0,0 +1,218 @@
16427+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
16428+ * reiser4/README */
16429+
16430+#if !defined( __REISER4_FSDATA_H__ )
16431+#define __REISER4_FSDATA_H__
16432+
16433+#include "debug.h"
16434+#include "kassign.h"
16435+#include "seal.h"
16436+#include "type_safe_hash.h"
16437+#include "plugin/file/file.h"
16438+#include "readahead.h"
16439+
16440+/*
16441+ * reiser4_dentry_fsdata (defined below) caches the location of a directory
16442+ * entry in the dentry to speed up subsequent lookups; see the comment
16443+ * above de_location.
16444+ */
16445+
16446+/*
16447+ * locking: the per-file-descriptor readdir_pos fields and ->f_pos are
16448+ * protected by ->i_mutex on the inode. Under this lock the following
16449+ * invariant holds:
16450+ *
16451+ * the file descriptor is "looking" at the entry_no-th directory entry
16452+ * from the beginning of the directory. This entry has key dir_entry_key
16453+ * and is the pos-th entry within its duplicate-key sequence.
16454+ *
16455+ */
16456+
16457+/* logical position within directory */
16458+typedef struct {
16459+ /* key of directory entry (actually, part of a key sufficient to
16460+ identify directory entry) */
16461+ de_id dir_entry_key;
16462+ /* ordinal number of directory entry among all entries with the same
16463+ key. (Starting from 0.) */
16464+ unsigned pos;
16465+} dir_pos;
16466+
16467+typedef struct {
16468+ /* f_pos corresponding to this readdir position */
16469+ __u64 fpos;
16470+ /* logical position within directory */
16471+ dir_pos position;
16472+ /* logical number of directory entry within
16473+ directory */
16474+ __u64 entry_no;
16475+} readdir_pos;
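+
+/*
+ * Worked example (illustrative): in a directory holding three entries whose
+ * names hash to the same key, the second of them has position.dir_entry_key
+ * equal to that key and position.pos == 1, while entry_no is its ordinal
+ * number among all entries of the directory.
+ */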
16476+
16477+/*
16478+ * this is used to speed up lookups for directory entry: on initial call to
16479+ * ->lookup() seal and coord of directory entry (if found, that is) are stored
16480+ * in struct dentry and reused later to avoid tree traversals.
16481+ */
16482+typedef struct de_location {
16483+ /* seal covering directory entry */
16484+ seal_t entry_seal;
16485+ /* coord of directory entry */
16486+ coord_t entry_coord;
16487+ /* ordinal number of directory entry among all entries with the same
16488+ key. (Starting from 0.) */
16489+ int pos;
16490+} de_location;
16491+
16492+/**
16493+ * reiser4_dentry_fsdata - reiser4-specific data attached to dentries
16494+ *
16495+ * This is allocated dynamically and released in d_op->d_release()
16496+ *
16497+ * Currently it only contains cached location (hint) of directory entry, but
16498+ * it is expected that other information will be accumulated here.
16499+ */
16500+typedef struct reiser4_dentry_fsdata {
16501+ /*
16502+ * here will go fields filled by ->lookup() to speedup next
16503+ * create/unlink, like blocknr of znode with stat-data, or key of
16504+ * stat-data.
16505+ */
16506+ de_location dec;
16507+ int stateless; /* created through reiser4_decode_fh, needs special
16508+ * treatment in readdir. */
16509+} reiser4_dentry_fsdata;
16510+
16511+extern int init_dentry_fsdata(void);
16512+extern void done_dentry_fsdata(void);
16513+extern reiser4_dentry_fsdata *reiser4_get_dentry_fsdata(struct dentry *);
16514+extern void reiser4_free_dentry_fsdata(struct dentry *dentry);
16515+
16516+
16517+/**
16518+ * reiser4_file_fsdata - reiser4-specific data attached to file->private_data
16519+ *
16520+ * This is allocated dynamically and released in inode->i_fop->release
16521+ */
16522+typedef struct reiser4_file_fsdata {
16523+ /*
16524+ * pointer back to the struct file which this reiser4_file_fsdata is
16525+ * part of
16526+ */
16527+ struct file *back;
16528+ /* detached cursor for stateless readdir. */
16529+ struct dir_cursor *cursor;
16530+ /*
16531+ * We need both directory and regular file parts here, because there
16532+	 * are file system objects that are simultaneously files and directories.
16533+ */
16534+ struct {
16535+ /*
16536+ * position in directory. It is updated each time directory is
16537+ * modified
16538+ */
16539+ readdir_pos readdir;
16540+ /* head of this list is reiser4_inode->lists.readdir_list */
16541+ struct list_head linkage;
16542+ } dir;
16543+ /* hints to speed up operations with regular files: read and write. */
16544+ struct {
16545+ hint_t hint;
16546+ } reg;
16547+	/* read-ahead hooks */
16548+ struct {
16549+ /* this is called by reiser4_readpages if set */
16550+ void (*readpages) (struct address_space *,
16551+ struct list_head * pages, void *data);
16552+		/* data for ->readpages: an extended coord. It is set by
16553+		   read_extent before calling page_cache_readahead */
16554+ void *data;
16555+ } ra2;
16556+ struct reiser4_file_ra_state ra1;
16557+
16558+} reiser4_file_fsdata;
16559+
16560+extern int init_file_fsdata(void);
16561+extern void done_file_fsdata(void);
16562+extern reiser4_file_fsdata *reiser4_get_file_fsdata(struct file *);
16563+extern void reiser4_free_file_fsdata(struct file *);
16564+
16565+
16566+/*
16567+ * A d_cursor is a reiser4_file_fsdata not attached to a struct file. d_cursors
16568+ * are used to address a problem reiser4 has with readdir accesses via NFS. See
16569+ * plugin/file_ops_readdir.c for more details.
16570+ */
16571+typedef struct {
16572+ __u16 cid;
16573+ __u64 oid;
16574+} d_cursor_key;
16575+
16576+/*
16577+ * define structures d_cursor_hash_table and d_cursor_hash_link, which are
16578+ * used to maintain the hash table of dir_cursor-s in reiser4's super block
16579+ */
16580+typedef struct dir_cursor dir_cursor;
16581+TYPE_SAFE_HASH_DECLARE(d_cursor, dir_cursor);
16582+
16583+typedef struct d_cursor_info d_cursor_info;
16584+
16585+struct dir_cursor {
16586+ int ref;
16587+ reiser4_file_fsdata *fsdata;
16588+
16589+ /* link to reiser4 super block hash table of cursors */
16590+ d_cursor_hash_link hash;
16591+
16592+ /*
16593+ * this is to link cursors to reiser4 super block's radix tree of
16594+	 * cursors if there is more than one cursor with the same objectid
16595+ */
16596+ struct list_head list;
16597+ d_cursor_key key;
16598+ d_cursor_info *info;
16599+ /* list of unused cursors */
16600+ struct list_head alist;
16601+};
16602+
16603+extern int init_d_cursor(void);
16604+extern void done_d_cursor(void);
16605+
16606+extern int init_super_d_info(struct super_block *);
16607+extern void done_super_d_info(struct super_block *);
16608+
16609+extern loff_t get_dir_fpos(struct file *);
16610+extern int try_to_attach_fsdata(struct file *, struct inode *);
16611+extern void detach_fsdata(struct file *);
16612+
16613+
16614+/* these are needed for "stateless" readdir. See plugin/file_ops_readdir.c for
16615+ more details */
16616+void dispose_cursors(struct inode *inode);
16617+void load_cursors(struct inode *inode);
16618+void kill_cursors(struct inode *inode);
16619+void adjust_dir_file(struct inode *dir, const struct dentry *de, int offset, int adj);
16620+
16621+/*
16622+ * this structure is embedded into reiser4_super_info_data. It maintains d_cursors
16623+ * (detached readdir state). See plugin/file_ops_readdir.c for more details.
16624+ */
16625+struct d_cursor_info {
16626+ d_cursor_hash_table table;
16627+ struct radix_tree_root tree;
16628+};
16629+
16630+/* spinlock protecting readdir cursors */
16631+extern spinlock_t d_lock;
16632+
16633+/* __REISER4_FSDATA_H__ */
16634+#endif
16635+
16636+/*
16637+ * Local variables:
16638+ * c-indentation-style: "K&R"
16639+ * mode-name: "LC"
16640+ * c-basic-offset: 8
16641+ * tab-width: 8
16642+ * fill-column: 120
16643+ * End:
16644+ */
16645Index: linux-2.6.16/fs/reiser4/init_super.c
16646===================================================================
16647--- /dev/null
16648+++ linux-2.6.16/fs/reiser4/init_super.c
16649@@ -0,0 +1,739 @@
16650+/* Copyright by Hans Reiser, 2003 */
16651+
16652+#include "super.h"
16653+#include "inode.h"
16654+#include "plugin/plugin_set.h"
16655+
16656+#include <linux/swap.h>
16657+
16658+
16659+/**
16660+ * init_fs_info - allocate reiser4 specific super block
16661+ * @super: super block of filesystem
16662+ *
16663+ * Allocates and initializes reiser4_super_info_data, attaches it to
16664+ * super->s_fs_info, initializes structures maintaining d_cursor-s.
16665+ */
16666+int init_fs_info(struct super_block *super)
16667+{
16668+ reiser4_super_info_data *sbinfo;
16669+
16670+ sbinfo = kmalloc(sizeof(reiser4_super_info_data), get_gfp_mask());
16671+ if (!sbinfo)
16672+ return RETERR(-ENOMEM);
16673+
16674+ super->s_fs_info = sbinfo;
16675+ super->s_op = NULL;
16676+ memset(sbinfo, 0, sizeof(*sbinfo));
16677+
16678+ ON_DEBUG(INIT_LIST_HEAD(&sbinfo->all_jnodes));
16679+ ON_DEBUG(spin_lock_init(&sbinfo->all_guard));
16680+
16681+ sema_init(&sbinfo->delete_sema, 1);
16682+ sema_init(&sbinfo->flush_sema, 1);
16683+ spin_lock_init(&(sbinfo->guard));
16684+
16685+ /* initialize per-super-block d_cursor resources */
16686+ init_super_d_info(super);
16687+
16688+ return 0;
16689+}
16690+
16691+/**
16692+ * done_fs_info - free reiser4 specific super block
16693+ * @super: super block of filesystem
16694+ *
16695+ * Performs some sanity checks, releases structures maintaining d_cursor-s,
16696+ * frees reiser4_super_info_data.
16697+ */
16698+void done_fs_info(struct super_block *super)
16699+{
16700+ assert("zam-990", super->s_fs_info != NULL);
16701+
16702+ /* release per-super-block d_cursor resources */
16703+ done_super_d_info(super);
16704+
16705+	/* make sure that there are no jnodes left */
16706+ assert("", list_empty(&get_super_private(super)->all_jnodes));
16707+ assert("", get_current_context()->trans->atom == NULL);
16708+ check_block_counters(super);
16709+ kfree(super->s_fs_info);
16710+ super->s_fs_info = NULL;
16711+}
16712+
16713+/* type of option parseable by parse_option() */
16714+typedef enum {
16715+ /* value of option is arbitrary string */
16716+ OPT_STRING,
16717+
16718+ /*
16719+	 * option specifies a bit in a bitmask. When the option is given, the bit in
16720+ * sbinfo->fs_flags is set. Examples are bsdgroups, 32bittimes, mtflush,
16721+ * dont_load_bitmap, atomic_write.
16722+ */
16723+ OPT_BIT,
16724+
16725+ /*
16726+	 * value of option should conform to an sscanf() format. Examples are
16727+ * tmgr.atom_max_size=N, tmgr.atom_max_age=N
16728+ */
16729+ OPT_FORMAT,
16730+
16731+ /*
16732+ * option can take one of predefined values. Example is onerror=panic or
16733+ * onerror=remount-ro
16734+ */
16735+ OPT_ONEOF,
16736+} opt_type_t;
16737+
16738+typedef struct opt_bitmask_bit {
16739+ const char *bit_name;
16740+ int bit_nr;
16741+} opt_bitmask_bit;
16742+
16743+/* description of option parseable by parse_option() */
16744+typedef struct opt_desc {
16745+ /* option name.
16746+
16747+	   the parsed portion of the string has the form "name=value".
16748+ */
16749+ const char *name;
16750+ /* type of option */
16751+ opt_type_t type;
16752+ union {
16753+ /* where to store value of string option (type == OPT_STRING) */
16754+ char **string;
16755+ /* description of bits for bit option (type == OPT_BIT) */
16756+ struct {
16757+ int nr;
16758+ void *addr;
16759+ } bit;
16760+ /* description of format and targets for format option (type
16761+ == OPT_FORMAT) */
16762+ struct {
16763+ const char *format;
16764+ int nr_args;
16765+ void *arg1;
16766+ void *arg2;
16767+ void *arg3;
16768+ void *arg4;
16769+ } f;
16770+ struct {
16771+ int *result;
16772+ const char *list[10];
16773+ } oneof;
16774+ struct {
16775+ void *addr;
16776+ int nr_bits;
16777+ opt_bitmask_bit *bits;
16778+ } bitmask;
16779+ } u;
16780+} opt_desc_t;
16781+
16782+/**
16783+ * parse_option - parse one option
16784+ * @opt_string: starting point of parsing
16785+ * @opt: option description
16786+ *
16787+ * foo=bar,
16788+ * ^ ^ ^
16789+ * |   |     +-- replaced with '\0'
16790+ * |   +-- val_start
16791+ * +-- opt_string
16792+ * Figures out the option type and handles the option accordingly.
16793+ */
16794+static int parse_option(char *opt_string, opt_desc_t *opt)
16795+{
16796+ char *val_start;
16797+ int result;
16798+ const char *err_msg;
16799+
16800+ /* NOTE-NIKITA think about using lib/cmdline.c functions here. */
16801+
16802+ val_start = strchr(opt_string, '=');
16803+ if (val_start != NULL) {
16804+ *val_start = '\0';
16805+ ++val_start;
16806+ }
16807+
16808+ err_msg = NULL;
16809+ result = 0;
16810+ switch (opt->type) {
16811+ case OPT_STRING:
16812+ if (val_start == NULL) {
16813+ err_msg = "String arg missing";
16814+ result = RETERR(-EINVAL);
16815+ } else
16816+ *opt->u.string = val_start;
16817+ break;
16818+ case OPT_BIT:
16819+ if (val_start != NULL)
16820+ err_msg = "Value ignored";
16821+ else
16822+ set_bit(opt->u.bit.nr, opt->u.bit.addr);
16823+ break;
16824+ case OPT_FORMAT:
16825+ if (val_start == NULL) {
16826+ err_msg = "Formatted arg missing";
16827+ result = RETERR(-EINVAL);
16828+ break;
16829+ }
16830+ if (sscanf(val_start, opt->u.f.format,
16831+ opt->u.f.arg1, opt->u.f.arg2, opt->u.f.arg3,
16832+ opt->u.f.arg4) != opt->u.f.nr_args) {
16833+ err_msg = "Wrong conversion";
16834+ result = RETERR(-EINVAL);
16835+ }
16836+ break;
16837+ case OPT_ONEOF:
16838+ {
16839+ int i = 0;
16840+
16841+ if (val_start == NULL) {
16842+ err_msg = "Value is missing";
16843+ result = RETERR(-EINVAL);
16844+ break;
16845+ }
16846+ err_msg = "Wrong option value";
16847+ result = RETERR(-EINVAL);
16848+ while (opt->u.oneof.list[i]) {
16849+ if (!strcmp(opt->u.oneof.list[i], val_start)) {
16850+ result = 0;
16851+ err_msg = NULL;
16852+ *opt->u.oneof.result = i;
16853+ break;
16854+ }
16855+ i++;
16856+ }
16857+ break;
16858+ }
16859+ default:
16860+ wrong_return_value("nikita-2100", "opt -> type");
16861+ break;
16862+ }
16863+ if (err_msg != NULL) {
16864+ warning("nikita-2496", "%s when parsing option \"%s%s%s\"",
16865+ err_msg, opt->name, val_start ? "=" : "",
16866+ val_start ? : "");
16867+ }
16868+ return result;
16869+}
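+
+/*
+ * Illustrative sketch (not part of the original code): parsing one formatted
+ * option with the machinery above; "max_age" and "buf" are made-up names.
+ *
+ *	__u32 max_age;
+ *	char buf[] = "tmgr.atom_max_age=600";
+ *	opt_desc_t desc = {
+ *		.name = "tmgr.atom_max_age",
+ *		.type = OPT_FORMAT,
+ *		.u = { .f = { .format = "%u", .nr_args = 1,
+ *			      .arg1 = &max_age } }
+ *	};
+ *
+ *	parse_option(buf, &desc);
+ *
+ * Note that buf is modified in place: the '=' is replaced with '\0' and the
+ * value "600" is converted by sscanf() into max_age.
+ */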
16870+
16871+/**
16872+ * parse_options - parse reiser4 mount options
16873+ * @opt_string: starting point
16874+ * @opts: array of option description
16875+ * @nr_opts: number of elements in @opts
16876+ *
16877+ * Parses comma separated list of reiser4 mount options.
16878+ */
16879+static int parse_options(char *opt_string, opt_desc_t *opts, int nr_opts)
16880+{
16881+ int result;
16882+
16883+ result = 0;
16884+ while ((result == 0) && opt_string && *opt_string) {
16885+ int j;
16886+ char *next;
16887+
16888+ next = strchr(opt_string, ',');
16889+ if (next != NULL) {
16890+ *next = '\0';
16891+ ++next;
16892+ }
16893+ for (j = 0; j < nr_opts; ++j) {
16894+ if (!strncmp(opt_string, opts[j].name,
16895+ strlen(opts[j].name))) {
16896+ result = parse_option(opt_string, &opts[j]);
16897+ break;
16898+ }
16899+ }
16900+ if (j == nr_opts) {
16901+ warning("nikita-2307", "Unrecognized option: \"%s\"",
16902+ opt_string);
16903+ /* traditionally, -EINVAL is returned on wrong mount
16904+ option */
16905+ result = RETERR(-EINVAL);
16906+ }
16907+ opt_string = next;
16908+ }
16909+ return result;
16910+}
16911+
16912+#define NUM_OPT( label, fmt, addr ) \
16913+ { \
16914+ .name = ( label ), \
16915+ .type = OPT_FORMAT, \
16916+ .u = { \
16917+ .f = { \
16918+ .format = ( fmt ), \
16919+ .nr_args = 1, \
16920+ .arg1 = ( addr ), \
16921+ .arg2 = NULL, \
16922+ .arg3 = NULL, \
16923+ .arg4 = NULL \
16924+ } \
16925+ } \
16926+ }
16927+
16928+#define SB_FIELD_OPT( field, fmt ) NUM_OPT( #field, fmt, &sbinfo -> field )
16929+
16930+#define BIT_OPT(label, bitnr) \
16931+ { \
16932+ .name = label, \
16933+ .type = OPT_BIT, \
16934+ .u = { \
16935+ .bit = { \
16936+ .nr = bitnr, \
16937+ .addr = &sbinfo->fs_flags \
16938+ } \
16939+ } \
16940+ }
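+
+/*
+ * For example, BIT_OPT("bsdgroups", REISER4_BSD_GID) expands to an opt_desc_t
+ * of type OPT_BIT; parse_option() then sets bit REISER4_BSD_GID in
+ * sbinfo->fs_flags when "bsdgroups" appears in the mount option string.
+ */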
16941+
16942+#define MAX_NR_OPTIONS (30)
16943+
16944+/**
16945+ * init_super_data - initialize reiser4 private super block
16946+ * @super: super block to initialize
16947+ * @opt_string: list of reiser4 mount options
16948+ *
16949+ * Sets various reiser4 parameters to default values. Parses mount options and
16950+ * overwrites default settings.
16951+ */
16952+int init_super_data(struct super_block *super, char *opt_string)
16953+{
16954+ int result;
16955+ opt_desc_t *opts, *p;
16956+ reiser4_super_info_data *sbinfo = get_super_private(super);
16957+
16958+ /* initialize super, export, dentry operations */
16959+ sbinfo->ops.super = reiser4_super_operations;
16960+ sbinfo->ops.export = reiser4_export_operations;
16961+ sbinfo->ops.dentry = reiser4_dentry_operations;
16962+ super->s_op = &sbinfo->ops.super;
16963+ super->s_export_op = &sbinfo->ops.export;
16964+
16965+ /* initialize transaction manager parameters to default values */
16966+ sbinfo->tmgr.atom_max_size = totalram_pages / 4;
16967+ sbinfo->tmgr.atom_max_age = REISER4_ATOM_MAX_AGE / HZ;
16968+ sbinfo->tmgr.atom_min_size = 256;
16969+ sbinfo->tmgr.atom_max_flushers = ATOM_MAX_FLUSHERS;
16970+
16971+ /* initialize cbk cache parameter */
16972+ sbinfo->tree.cbk_cache.nr_slots = CBK_CACHE_SLOTS;
16973+
16974+ /* initialize flush parameters */
16975+ sbinfo->flush.relocate_threshold = FLUSH_RELOCATE_THRESHOLD;
16976+ sbinfo->flush.relocate_distance = FLUSH_RELOCATE_DISTANCE;
16977+ sbinfo->flush.written_threshold = FLUSH_WRITTEN_THRESHOLD;
16978+ sbinfo->flush.scan_maxnodes = FLUSH_SCAN_MAXNODES;
16979+
16980+ sbinfo->optimal_io_size = REISER4_OPTIMAL_IO_SIZE;
16981+
16982+ /* preliminary tree initializations */
16983+ sbinfo->tree.super = super;
16984+ sbinfo->tree.carry.new_node_flags = REISER4_NEW_NODE_FLAGS;
16985+ sbinfo->tree.carry.new_extent_flags = REISER4_NEW_EXTENT_FLAGS;
16986+ sbinfo->tree.carry.paste_flags = REISER4_PASTE_FLAGS;
16987+ sbinfo->tree.carry.insert_flags = REISER4_INSERT_FLAGS;
16988+ rwlock_init(&(sbinfo->tree.tree_lock));
16989+ spin_lock_init(&(sbinfo->tree.epoch_lock));
16990+
16991+ /* initialize default readahead params */
16992+ sbinfo->ra_params.max = num_physpages / 4;
16993+ sbinfo->ra_params.flags = 0;
16994+
16995+ /* allocate memory for structure describing reiser4 mount options */
16996+ opts = kmalloc(sizeof(opt_desc_t) * MAX_NR_OPTIONS, get_gfp_mask());
16997+ if (opts == NULL)
16998+ return RETERR(-ENOMEM);
16999+
17000+ /* initialize structure describing reiser4 mount options */
17001+ p = opts;
17002+
17003+#if REISER4_DEBUG
17004+# define OPT_ARRAY_CHECK if ((p) >= (opts) + MAX_NR_OPTIONS) {		\
17005+ warning ("zam-1046", "opt array is overloaded"); break; \
17006+ }
17007+#else
17008+# define OPT_ARRAY_CHECK noop
17009+#endif
17010+
17011+#define PUSH_OPT(...) \
17012+do { \
17013+ opt_desc_t o = __VA_ARGS__; \
17014+ OPT_ARRAY_CHECK; \
17015+ *p ++ = o; \
17016+} while (0)
17017+
17018+#define PUSH_SB_FIELD_OPT(field, format) PUSH_OPT(SB_FIELD_OPT(field, format))
17019+#define PUSH_BIT_OPT(name, bit) PUSH_OPT(BIT_OPT(name, bit))
17020+
17021+ /*
17022+ * tmgr.atom_max_size=N
17023+ * Atoms containing more than N blocks will be forced to commit. N is
17024+ * decimal.
17025+ */
17026+ PUSH_SB_FIELD_OPT(tmgr.atom_max_size, "%u");
17027+ /*
17028+ * tmgr.atom_max_age=N
17029+ * Atoms older than N seconds will be forced to commit. N is decimal.
17030+ */
17031+ PUSH_SB_FIELD_OPT(tmgr.atom_max_age, "%u");
17032+ /*
17033+ * tmgr.atom_min_size=N
17034+	 * When committing an atom to free dirty pages, force an atom smaller
17035+	 * than N blocks to fuse with another one.
17036+ */
17037+ PUSH_SB_FIELD_OPT(tmgr.atom_min_size, "%u");
17038+ /*
17039+ * tmgr.atom_max_flushers=N
17040+ * limit of concurrent flushers for one atom. 0 means no limit.
17041+ */
17042+ PUSH_SB_FIELD_OPT(tmgr.atom_max_flushers, "%u");
17043+ /*
17044+	 * tree.cbk_cache.nr_slots=N
17045+ * Number of slots in the cbk cache.
17046+ */
17047+ PUSH_SB_FIELD_OPT(tree.cbk_cache.nr_slots, "%u");
17048+ /*
17049+ * If flush finds more than FLUSH_RELOCATE_THRESHOLD adjacent dirty
17050+ * leaf-level blocks it will force them to be relocated.
17051+ */
17052+ PUSH_SB_FIELD_OPT(flush.relocate_threshold, "%u");
17053+ /*
17054+	 * If flush can find a block allocation within at most
17055+	 * FLUSH_RELOCATE_DISTANCE of the preceder, it will relocate to that
17056+ * position.
17057+ */
17058+ PUSH_SB_FIELD_OPT(flush.relocate_distance, "%u");
17059+ /*
17060+	 * If we have written this many blocks or more before encountering a
17061+	 * busy jnode in the flush list, abort flushing in the hope that the
17062+	 * jnode will be clean by the time we are called next, saving some
17063+	 * seeks.
17064+ */
17065+ PUSH_SB_FIELD_OPT(flush.written_threshold, "%u");
17066+ /* The maximum number of nodes to scan left on a level during flush. */
17067+ PUSH_SB_FIELD_OPT(flush.scan_maxnodes, "%u");
17068+ /* preferred IO size */
17069+ PUSH_SB_FIELD_OPT(optimal_io_size, "%u");
17070+ /* carry flags used for insertion of new nodes */
17071+ PUSH_SB_FIELD_OPT(tree.carry.new_node_flags, "%u");
17072+ /* carry flags used for insertion of new extents */
17073+ PUSH_SB_FIELD_OPT(tree.carry.new_extent_flags, "%u");
17074+ /* carry flags used for paste operations */
17075+ PUSH_SB_FIELD_OPT(tree.carry.paste_flags, "%u");
17076+ /* carry flags used for insert operations */
17077+ PUSH_SB_FIELD_OPT(tree.carry.insert_flags, "%u");
17078+
17079+#ifdef CONFIG_REISER4_BADBLOCKS
17080+ /*
17081+	 * Alternative master superblock location, in case its original
17082+	 * location is not writable/accessible. This is an offset in BYTES.
17083+ */
17084+ PUSH_SB_FIELD_OPT(altsuper, "%lu");
17085+#endif
17086+
17087+ /* turn on BSD-style gid assignment */
17088+ PUSH_BIT_OPT("bsdgroups", REISER4_BSD_GID);
17089+ /* turn on 32 bit times */
17090+ PUSH_BIT_OPT("32bittimes", REISER4_32_BIT_TIMES);
17091+ /* turn off concurrent flushing */
17092+ PUSH_BIT_OPT("mtflush", REISER4_MTFLUSH);
17093+ /*
17094+ * Don't load all bitmap blocks at mount time, it is useful for
17095+ * machines with tiny RAM and large disks.
17096+ */
17097+ PUSH_BIT_OPT("dont_load_bitmap", REISER4_DONT_LOAD_BITMAP);
17098+ /* disable transaction commits during write() */
17099+ PUSH_BIT_OPT("atomic_write", REISER4_ATOMIC_WRITE);
17100+ /* disable use of write barriers in the reiser4 log writer. */
17101+ PUSH_BIT_OPT("no_write_barrier", REISER4_NO_WRITE_BARRIER);
17102+
17103+ PUSH_OPT(
17104+ {
17105+ /*
17106+ * tree traversal readahead parameters:
17107+ * -o readahead:MAXNUM:FLAGS
17108+		 * MAXNUM - max number of nodes to request readahead for: -1UL
17109+ * will set it to max_sane_readahead()
17110+ * FLAGS - combination of bits: RA_ADJCENT_ONLY, RA_ALL_LEVELS,
17111+ * CONTINUE_ON_PRESENT
17112+ */
17113+ .name = "readahead",
17114+ .type = OPT_FORMAT,
17115+ .u = {
17116+ .f = {
17117+ .format = "%u:%u",
17118+ .nr_args = 2,
17119+ .arg1 = &sbinfo->ra_params.max,
17120+ .arg2 = &sbinfo->ra_params.flags,
17121+ .arg3 = NULL,
17122+ .arg4 = NULL
17123+ }
17124+ }
17125+ }
17126+ );
17127+
17128+ /* What to do in case of fs error */
17129+ PUSH_OPT(
17130+ {
17131+ .name = "onerror",
17132+ .type = OPT_ONEOF,
17133+ .u = {
17134+ .oneof = {
17135+ .result = &sbinfo->onerror,
17136+ .list = {
17137+ "panic", "remount-ro", NULL
17138+ },
17139+ }
17140+ }
17141+ }
17142+ );
17143+
17144+ /* modify default settings to values set by mount options */
17145+ result = parse_options(opt_string, opts, p - opts);
17146+ kfree(opts);
17147+ if (result != 0)
17148+ return result;
17149+
17150+	/* clamp settings to sane values */
17151+ sbinfo->tmgr.atom_max_age *= HZ;
17152+ if (sbinfo->tmgr.atom_max_age <= 0)
17153+ /* overflow */
17154+ sbinfo->tmgr.atom_max_age = REISER4_ATOM_MAX_AGE;
17155+
17156+	/* round optimal io size down to a multiple of 512 bytes */
17157+ sbinfo->optimal_io_size >>= VFS_BLKSIZE_BITS;
17158+ sbinfo->optimal_io_size <<= VFS_BLKSIZE_BITS;
17159+ if (sbinfo->optimal_io_size == 0) {
17160+ warning("nikita-2497", "optimal_io_size is too small");
17161+ return RETERR(-EINVAL);
17162+ }
17163+
17164+ /* disable single-threaded flush as it leads to deadlock */
17165+ sbinfo->fs_flags |= (1 << REISER4_MTFLUSH);
17166+ return result;
17167+}
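+
+/*
+ * Illustrative example: a mount command line such as
+ *
+ *	mount -t reiser4 -o tmgr.atom_max_age=600,bsdgroups /dev/hda1 /mnt
+ *
+ * reaches this function with opt_string "tmgr.atom_max_age=600,bsdgroups".
+ * After parsing, atom_max_age is converted from seconds to jiffies and
+ * optimal_io_size is truncated to a multiple of 512 bytes, as above.
+ */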
17168+
17169+/**
17170+ * init_read_super - read reiser4 master super block
17171+ * @super: super block to fill
17172+ * @silent: if 0 - print warnings
17173+ *
17174+ * Reads reiser4 master super block either from predefined location or from
17175+ * location specified by altsuper mount option, initializes disk format plugin.
17176+ */
17177+int init_read_super(struct super_block *super, int silent)
17178+{
17179+ struct buffer_head *super_bh;
17180+ struct reiser4_master_sb *master_sb;
17181+ reiser4_super_info_data *sbinfo = get_super_private(super);
17182+ unsigned long blocksize;
17183+
17184+ read_super_block:
17185+#ifdef CONFIG_REISER4_BADBLOCKS
17186+ if (sbinfo->altsuper)
17187+ /*
17188+ * read reiser4 master super block at position specified by
17189+ * mount option
17190+ */
17191+ super_bh = sb_bread(super,
17192+ (sector_t)(sbinfo->altsuper / super->s_blocksize));
17193+ else
17194+#endif
17195+	/* read reiser4 master super block at the 16-th 4096-byte block */
17196+ super_bh = sb_bread(super,
17197+ (sector_t)(REISER4_MAGIC_OFFSET / super->s_blocksize));
17198+ if (!super_bh)
17199+ return RETERR(-EIO);
17200+
17201+ master_sb = (struct reiser4_master_sb *)super_bh->b_data;
17202+ /* check reiser4 magic string */
17203+ if (!strncmp(master_sb->magic, REISER4_SUPER_MAGIC_STRING,
17204+ sizeof(REISER4_SUPER_MAGIC_STRING))) {
17205+ /* reiser4 master super block contains filesystem blocksize */
17206+ blocksize = le16_to_cpu(get_unaligned(&master_sb->blocksize));
17207+
17208+ if (blocksize != PAGE_CACHE_SIZE) {
17209+ /*
17210+			 * currently reiser4's blocksize must be equal to
17211+ * pagesize
17212+ */
17213+ if (!silent)
17214+ warning("nikita-2609",
17215+ "%s: wrong block size %ld\n", super->s_id,
17216+ blocksize);
17217+ brelse(super_bh);
17218+ return RETERR(-EINVAL);
17219+ }
17220+ if (blocksize != super->s_blocksize) {
17221+ /*
17222+ * filesystem uses different blocksize. Reread master
17223+ * super block with correct blocksize
17224+ */
17225+ brelse(super_bh);
17226+ if (!sb_set_blocksize(super, (int)blocksize))
17227+ return RETERR(-EINVAL);
17228+ goto read_super_block;
17229+ }
17230+
17231+ sbinfo->df_plug =
17232+ disk_format_plugin_by_id(
17233+ le16_to_cpu(get_unaligned(&master_sb->disk_plugin_id)));
17234+ if (sbinfo->df_plug == NULL) {
17235+ if (!silent)
17236+ warning("nikita-26091",
17237+ "%s: unknown disk format plugin %d\n",
17238+ super->s_id,
17239+ le16_to_cpu(get_unaligned(&master_sb->disk_plugin_id)));
17240+ brelse(super_bh);
17241+ return RETERR(-EINVAL);
17242+ }
17243+ sbinfo->diskmap_block = le64_to_cpu(get_unaligned(&master_sb->diskmap));
17244+ brelse(super_bh);
17245+ return 0;
17246+ }
17247+
17248+ /* there is no reiser4 on the device */
17249+ if (!silent)
17250+ warning("nikita-2608",
17251+ "%s: wrong master super block magic", super->s_id);
17252+ brelse(super_bh);
17253+ return RETERR(-EINVAL);
17254+}
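+
+/*
+ * Note on the reread above: sb_bread() reads in units of super->s_blocksize,
+ * so when the master super block records a different block size the buffer
+ * is released, sb_set_blocksize() switches the device block size, and the
+ * read is repeated from the recomputed block number.
+ */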
17255+
17256+static struct {
17257+ reiser4_plugin_type type;
17258+ reiser4_plugin_id id;
17259+} default_plugins[PSET_LAST] = {
17260+ [PSET_FILE] = {
17261+ .type = REISER4_FILE_PLUGIN_TYPE,
17262+ .id = UNIX_FILE_PLUGIN_ID
17263+ },
17264+ [PSET_DIR] = {
17265+ .type = REISER4_DIR_PLUGIN_TYPE,
17266+ .id = HASHED_DIR_PLUGIN_ID
17267+ },
17268+ [PSET_HASH] = {
17269+ .type = REISER4_HASH_PLUGIN_TYPE,
17270+ .id = R5_HASH_ID
17271+ },
17272+ [PSET_FIBRATION] = {
17273+ .type = REISER4_FIBRATION_PLUGIN_TYPE,
17274+ .id = FIBRATION_DOT_O
17275+ },
17276+ [PSET_PERM] = {
17277+ .type = REISER4_PERM_PLUGIN_TYPE,
17278+ .id = NULL_PERM_ID
17279+ },
17280+ [PSET_FORMATTING] = {
17281+ .type = REISER4_FORMATTING_PLUGIN_TYPE,
17282+ .id = SMALL_FILE_FORMATTING_ID
17283+ },
17284+ [PSET_SD] = {
17285+ .type = REISER4_ITEM_PLUGIN_TYPE,
17286+ .id = STATIC_STAT_DATA_ID
17287+ },
17288+ [PSET_DIR_ITEM] = {
17289+ .type = REISER4_ITEM_PLUGIN_TYPE,
17290+ .id = COMPOUND_DIR_ID
17291+ },
17292+ [PSET_CIPHER] = {
17293+ .type = REISER4_CIPHER_PLUGIN_TYPE,
17294+ .id = NONE_CIPHER_ID
17295+ },
17296+ [PSET_DIGEST] = {
17297+ .type = REISER4_DIGEST_PLUGIN_TYPE,
17298+ .id = SHA256_32_DIGEST_ID
17299+ },
17300+ [PSET_COMPRESSION] = {
17301+ .type = REISER4_COMPRESSION_PLUGIN_TYPE,
17302+ .id = LZO1_COMPRESSION_ID
17303+ },
17304+ [PSET_COMPRESSION_MODE] = {
17305+ .type = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
17306+ .id = COL_16_COMPRESSION_MODE_ID
17307+ },
17308+ [PSET_CLUSTER] = {
17309+ .type = REISER4_CLUSTER_PLUGIN_TYPE,
17310+ .id = CLUSTER_64K_ID
17311+ },
17312+ [PSET_REGULAR_ENTRY] = {
17313+ .type = REISER4_REGULAR_PLUGIN_TYPE,
17314+ .id = UF_REGULAR_ID
17315+ }
17316+};
17317+
17318+/* access to default plugin table */
17319+static reiser4_plugin *get_default_plugin(pset_member memb)
17320+{
17321+ return plugin_by_id(default_plugins[memb].type,
17322+ default_plugins[memb].id);
17323+}
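+
+/*
+ * For instance, get_default_plugin(PSET_HASH) yields the R5 hash plugin
+ * (REISER4_HASH_PLUGIN_TYPE, R5_HASH_ID), per the default_plugins[] table
+ * above.
+ */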
17324+
17325+/**
17326+ * init_root_inode - obtain inode of root directory
17327+ * @super: super block of filesystem
17328+ *
17329+ * Obtains the inode of the root directory (reading it from disk) and
17330+ * initializes its plugin set if it was not initialized yet.
17331+ */
17332+int init_root_inode(struct super_block *super)
17333+{
17334+ reiser4_super_info_data *sbinfo = get_super_private(super);
17335+ struct inode *inode;
17336+ int result = 0;
17337+
17338+ inode = reiser4_iget(super, sbinfo->df_plug->root_dir_key(super), 0);
17339+ if (IS_ERR(inode))
17340+ return RETERR(PTR_ERR(inode));
17341+
17342+ super->s_root = d_alloc_root(inode);
17343+ if (!super->s_root) {
17344+ iput(inode);
17345+ return RETERR(-ENOMEM);
17346+ }
17347+
17348+ super->s_root->d_op = &sbinfo->ops.dentry;
17349+
17350+ if (!is_inode_loaded(inode)) {
17351+ pset_member memb;
17352+
17353+ for (memb = 0; memb < PSET_LAST; ++memb) {
17354+ reiser4_plugin *plug;
17355+
17356+ plug = get_default_plugin(memb);
17357+ result = grab_plugin_from(inode, memb, plug);
17358+ if (result != 0)
17359+ break;
17360+ }
17361+
17362+ if (result == 0) {
17363+ if (REISER4_DEBUG) {
17364+ plugin_set *pset;
17365+
17366+ pset = reiser4_inode_data(inode)->pset;
17367+ for (memb = 0; memb < PSET_LAST; ++memb)
17368+ assert("nikita-3500",
17369+ pset_get(pset, memb) != NULL);
17370+ }
17371+ } else
17372+ warning("nikita-3448", "Cannot set plugins of root: %i",
17373+ result);
17374+ reiser4_iget_complete(inode);
17375+ }
17376+ super->s_maxbytes = MAX_LFS_FILESIZE;
17377+ return result;
17378+}
17379+
17380+/*
17381+ * Local variables:
17382+ * c-indentation-style: "K&R"
17383+ * mode-name: "LC"
17384+ * c-basic-offset: 8
17385+ * tab-width: 8
17386+ * fill-column: 79
17387+ * End:
17388+ */
17389Index: linux-2.6.16/fs/reiser4/inode.c
17390===================================================================
17391--- /dev/null
17392+++ linux-2.6.16/fs/reiser4/inode.c
17393@@ -0,0 +1,727 @@
17394+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
17395+
17396+/* Inode specific operations. */
17397+
17398+#include "forward.h"
17399+#include "debug.h"
17400+#include "key.h"
17401+#include "kassign.h"
17402+#include "coord.h"
17403+#include "seal.h"
17404+#include "dscale.h"
17405+#include "plugin/item/item.h"
17406+#include "plugin/security/perm.h"
17407+#include "plugin/plugin.h"
17408+#include "plugin/object.h"
17409+#include "znode.h"
17410+#include "vfs_ops.h"
17411+#include "inode.h"
17412+#include "super.h"
17413+#include "reiser4.h"
17414+
17415+#include <linux/fs.h> /* for struct super_block, address_space */
17416+
17417+/* return reiser4 internal tree which inode belongs to */
17418+/* Audited by: green(2002.06.17) */
17419+reiser4_tree *tree_by_inode(const struct inode *inode /* inode queried */ )
17420+{
17421+ assert("nikita-256", inode != NULL);
17422+ assert("nikita-257", inode->i_sb != NULL);
17423+ return get_tree(inode->i_sb);
17424+}
17425+
17426+/* return reiser4-specific inode flags */
17427+static inline unsigned long *inode_flags(const struct inode *const inode)
17428+{
17429+ assert("nikita-2842", inode != NULL);
17430+ return &reiser4_inode_data(inode)->flags;
17431+}
17432+
17433+/* set reiser4-specific flag @f in @inode */
17434+void inode_set_flag(struct inode *inode, reiser4_file_plugin_flags f)
17435+{
17436+ assert("nikita-2248", inode != NULL);
17437+ set_bit((int)f, inode_flags(inode));
17438+}
17439+
17440+/* clear reiser4-specific flag @f in @inode */
17441+void inode_clr_flag(struct inode *inode, reiser4_file_plugin_flags f)
17442+{
17443+ assert("nikita-2250", inode != NULL);
17444+ clear_bit((int)f, inode_flags(inode));
17445+}
17446+
17447+/* true if reiser4-specific flag @f is set in @inode */
17448+int inode_get_flag(const struct inode *inode, reiser4_file_plugin_flags f)
17449+{
17450+ assert("nikita-2251", inode != NULL);
17451+ return test_bit((int)f, inode_flags(inode));
17452+}
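+
+/*
+ * Illustrative use of the flag helpers above (flag values are defined in
+ * inode.h):
+ *
+ *	inode_set_flag(inode, REISER4_NO_SD);
+ *	if (inode_get_flag(inode, REISER4_NO_SD))
+ *		(stat data has not been created yet)
+ *	inode_clr_flag(inode, REISER4_NO_SD);
+ */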
17453+
17454+/* convert oid to inode number */
17455+ino_t oid_to_ino(oid_t oid)
17456+{
17457+ return (ino_t) oid;
17458+}
17459+
17460+/* convert oid to user visible inode number */
17461+ino_t oid_to_uino(oid_t oid)
17462+{
17463+ /* reiser4 object is uniquely identified by oid which is 64 bit
17464+ quantity. Kernel in-memory inode is indexed (in the hash table) by
17465+ 32 bit i_ino field, but this is not a problem, because there is a
17466+ way to further distinguish inodes with identical inode numbers
17467+ (find_actor supplied to iget()).
17468+
17469+ But user space expects unique 32 bit inode number. Obviously this
17470+ is impossible. Work-around is to somehow hash oid into user visible
17471+ inode number.
17472+ */
17473+ oid_t max_ino = (ino_t) ~ 0;
17474+
17475+ if (REISER4_INO_IS_OID || (oid <= max_ino))
17476+ return oid;
17477+ else
17478+ /* this is remotely similar to algorithm used to find next pid
17479+ to use for process: after wrap-around start from some
17480+ offset rather than from 0. Idea is that there are some long
17481+ living objects with which we don't want to collide.
17482+ */
17483+ return REISER4_UINO_SHIFT + ((oid - max_ino) & (max_ino >> 1));
17484+}
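+
+/*
+ * Worked example (assuming 32-bit ino_t, so max_ino == 0xffffffff): for
+ * oid == 0x100000005, oid - max_ino == 6; masking with max_ino >> 1
+ * (0x7fffffff) leaves 6, so the user-visible inode number is
+ * REISER4_UINO_SHIFT + 6. Distinct oids may therefore collide in user
+ * space, which is unavoidable when folding 64 bits into 32.
+ */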
17485+
17486+/* check that "inode" is on reiser4 file-system */
17487+int is_reiser4_inode(const struct inode *inode /* inode queried */ )
17488+{
17489+ return inode != NULL && is_reiser4_super(inode->i_sb);
17490+}
17491+
17492+/* Maximal length of a name that can be stored in directory @inode.
17493+
17494+   This is used in checks during file creation and lookup. */
17495+int reiser4_max_filename_len(const struct inode *inode /* inode queried */ )
17496+{
17497+ assert("nikita-287", is_reiser4_inode(inode));
17498+ assert("nikita-1710", inode_dir_item_plugin(inode));
17499+ if (inode_dir_item_plugin(inode)->s.dir.max_name_len)
17500+ return inode_dir_item_plugin(inode)->s.dir.max_name_len(inode);
17501+ else
17502+ return 255;
17503+}
17504+
17505+#if REISER4_USE_COLLISION_LIMIT
17506+/* Maximal number of hash collisions for this directory. */
17507+int max_hash_collisions(const struct inode *dir /* inode queried */ )
17508+{
17509+ assert("nikita-1711", dir != NULL);
17510+ return reiser4_inode_data(dir)->plugin.max_collisions;
17511+}
17512+#endif /* REISER4_USE_COLLISION_LIMIT */
17513+
17514+/* Install file, inode, and address_space operation on @inode, depending on
17515+ its mode. */
17516+int setup_inode_ops(struct inode *inode /* inode to initialize */ ,
17517+ reiser4_object_create_data * data /* parameters to create
17518+ * object */ )
17519+{
17520+ reiser4_super_info_data *sinfo;
17521+ file_plugin *fplug;
17522+ dir_plugin *dplug;
17523+
17524+ fplug = inode_file_plugin(inode);
17525+ dplug = inode_dir_plugin(inode);
17526+
17527+ sinfo = get_super_private(inode->i_sb);
17528+
17529+ switch (inode->i_mode & S_IFMT) {
17530+ case S_IFSOCK:
17531+ case S_IFBLK:
17532+ case S_IFCHR:
17533+ case S_IFIFO:
17534+ {
17535+ dev_t rdev; /* to keep gcc happy */
17536+
17537+ assert("vs-46", fplug != NULL);
17538+ /* ugly hack with rdev */
17539+ if (data == NULL) {
17540+ rdev = inode->i_rdev;
17541+ inode->i_rdev = 0;
17542+ } else
17543+ rdev = data->rdev;
17544+ inode->i_blocks = 0;
17545+ assert("vs-42", fplug->h.id == SPECIAL_FILE_PLUGIN_ID);
17546+ inode->i_op = &file_plugins[fplug->h.id].inode_ops;
17547+ /* initialize inode->i_fop and inode->i_rdev for block and char
17548+ devices */
17549+ init_special_inode(inode, inode->i_mode, rdev);
17550+ /* all address space operations are null */
17551+ inode->i_mapping->a_ops =
17552+ &file_plugins[fplug->h.id].as_ops;
17553+ break;
17554+ }
17555+ case S_IFLNK:
17556+ assert("vs-46", fplug != NULL);
17557+ assert("vs-42", fplug->h.id == SYMLINK_FILE_PLUGIN_ID);
17558+ inode->i_op = &file_plugins[fplug->h.id].inode_ops;
17559+ inode->i_fop = NULL;
17560+ /* all address space operations are null */
17561+ inode->i_mapping->a_ops = &file_plugins[fplug->h.id].as_ops;
17562+ break;
17563+ case S_IFDIR:
17564+ assert("vs-46", dplug != NULL);
17565+ assert("vs-43", (dplug->h.id == HASHED_DIR_PLUGIN_ID ||
17566+ dplug->h.id == SEEKABLE_HASHED_DIR_PLUGIN_ID));
17567+ inode->i_op = &dir_plugins[dplug->h.id].inode_ops;
17568+ inode->i_fop = &dir_plugins[dplug->h.id].file_ops;
17569+ inode->i_mapping->a_ops = &dir_plugins[dplug->h.id].as_ops;
17570+ break;
17571+ case S_IFREG:
17572+ assert("vs-46", fplug != NULL);
17573+ assert("vs-43", (fplug->h.id == UNIX_FILE_PLUGIN_ID ||
17574+ fplug->h.id == CRC_FILE_PLUGIN_ID));
17575+ inode->i_op = &file_plugins[fplug->h.id].inode_ops;
17576+ inode->i_fop = &file_plugins[fplug->h.id].file_ops;
17577+ inode->i_mapping->a_ops = &file_plugins[fplug->h.id].as_ops;
17578+ break;
17579+ default:
17580+ warning("nikita-291", "wrong file mode: %o for %llu",
17581+ inode->i_mode,
17582+ (unsigned long long)get_inode_oid(inode));
17583+ reiser4_make_bad_inode(inode);
17584+ return RETERR(-EINVAL);
17585+ }
17586+ return 0;
17587+}
17588+
17589+/* initialize inode from disk data. Called with inode locked.
17590+ Return inode locked. */
17591+static int init_inode(struct inode *inode /* inode to initialise */ ,
17592+ coord_t * coord /* coord of stat data */ )
17593+{
17594+ int result;
17595+ item_plugin *iplug;
17596+ void *body;
17597+ int length;
17598+ reiser4_inode *state;
17599+
17600+ assert("nikita-292", coord != NULL);
17601+ assert("nikita-293", inode != NULL);
17602+
17603+ coord_clear_iplug(coord);
17604+ result = zload(coord->node);
17605+ if (result)
17606+ return result;
17607+ iplug = item_plugin_by_coord(coord);
17608+ body = item_body_by_coord(coord);
17609+ length = item_length_by_coord(coord);
17610+
17611+ assert("nikita-295", iplug != NULL);
17612+ assert("nikita-296", body != NULL);
17613+ assert("nikita-297", length > 0);
17614+
17615+ /* inode is under I_LOCK now */
17616+
17617+ state = reiser4_inode_data(inode);
17618+ /* call stat-data plugin method to load sd content into inode */
17619+ result = iplug->s.sd.init_inode(inode, body, length);
17620+ plugin_set_sd(&state->pset, iplug);
17621+ if (result == 0) {
17622+ result = setup_inode_ops(inode, NULL);
17623+ if (result == 0 &&
17624+ inode->i_sb->s_root && inode->i_sb->s_root->d_inode) {
17625+ struct inode *root;
17626+ pset_member ind;
17627+
17628+ /* take missing plugins from file-system defaults */
17629+ root = inode->i_sb->s_root->d_inode;
17630+ /* file and directory plugins are already initialized. */
17631+ for (ind = PSET_DIR + 1; ind < PSET_LAST; ++ind) {
17632+ result = grab_plugin(inode, root, ind);
17633+ if (result != 0)
17634+ break;
17635+ }
17636+ if (result != 0) {
17637+ warning("nikita-3447",
17638+ "Cannot set up plugins for %lli",
17639+ (unsigned long long)
17640+ get_inode_oid(inode));
17641+ }
17642+ }
17643+ }
17644+ zrelse(coord->node);
17645+ return result;
17646+}
17647+
17648+/* read `inode' from the disk. This is what was previously in
17649+ reiserfs_read_inode2().
17650+
17651+ Must be called with inode locked. Return inode still locked.
17652+*/
17653+static int read_inode(struct inode *inode /* inode to read from disk */ ,
17654+ const reiser4_key * key /* key of stat data */ ,
17655+ int silent)
17656+{
17657+ int result;
17658+ lock_handle lh;
17659+ reiser4_inode *info;
17660+ coord_t coord;
17661+
17662+ assert("nikita-298", inode != NULL);
17663+ assert("nikita-1945", !is_inode_loaded(inode));
17664+
17665+ info = reiser4_inode_data(inode);
17666+ assert("nikita-300", info->locality_id != 0);
17667+
17668+ coord_init_zero(&coord);
17669+ init_lh(&lh);
17670+ /* locate stat-data in a tree and return znode locked */
17671+ result = lookup_sd(inode, ZNODE_READ_LOCK, &coord, &lh, key, silent);
17672+ assert("nikita-301", !is_inode_loaded(inode));
17673+ if (result == 0) {
17674+ /* use stat-data plugin to load sd into inode. */
17675+ result = init_inode(inode, &coord);
17676+ if (result == 0) {
17677+ /* initialize stat-data seal */
17678+ spin_lock_inode(inode);
17679+ seal_init(&info->sd_seal, &coord, key);
17680+ info->sd_coord = coord;
17681+ spin_unlock_inode(inode);
17682+
17683+ /* call file plugin's method to initialize plugin
17684+ * specific part of inode */
17685+ if (inode_file_plugin(inode)->init_inode_data)
17686+ inode_file_plugin(inode)->init_inode_data(inode,
17687+ NULL,
17688+ 0);
17689+ /* load detached directory cursors for stateless
17690+ * directory readers (NFS). */
17691+ load_cursors(inode);
17692+
17693+ /* Check the opened inode for consistency. */
17694+ result =
17695+ get_super_private(inode->i_sb)->df_plug->
17696+ check_open(inode);
17697+ }
17698+ }
17699+	/* lookup_sd() doesn't release coord because we want the znode to
17700+	   stay read-locked while stat-data fields are accessed in
17701+ init_inode() */
17702+ done_lh(&lh);
17703+
17704+ if (result != 0)
17705+ reiser4_make_bad_inode(inode);
17706+ return result;
17707+}
17708+
17709+/* initialise new reiser4 inode being inserted into hash table. */
17710+static int init_locked_inode(struct inode *inode /* new inode */ ,
17711+ void *opaque /* key of stat data passed to the
17712+ * iget5_locked as cookie */ )
17713+{
17714+ reiser4_key *key;
17715+
17716+ assert("nikita-1995", inode != NULL);
17717+ assert("nikita-1996", opaque != NULL);
17718+ key = opaque;
17719+ set_inode_oid(inode, get_key_objectid(key));
17720+ reiser4_inode_data(inode)->locality_id = get_key_locality(key);
17721+ return 0;
17722+}
17723+
17724+/* reiser4_inode_find_actor() - "find actor" supplied by reiser4 to iget5_locked().
17725+
17726+ This function is called by iget5_locked() to distinguish reiser4 inodes
17727+ having the same inode numbers. Such inodes can only exist due to some error
17728+ condition. One of them should be bad. Inodes with identical inode numbers
17729+ (objectids) are distinguished by their packing locality.
17730+
17731+*/
17732+static int reiser4_inode_find_actor(struct inode *inode /* inode from hash table to
17733+ * check */ ,
17734+ void *opaque /* "cookie" passed to
17735+ * iget5_locked(). This is stat data
17736+ * key */ )
17737+{
17738+ reiser4_key *key;
17739+
17740+ key = opaque;
17741+ return
17742+ /* oid is unique, so first term is enough, actually. */
17743+ get_inode_oid(inode) == get_key_objectid(key) &&
17744+ /*
17745+ * also, locality should be checked, but locality is stored in
17746+ * the reiser4-specific part of the inode, and actor can be
17747+ * called against arbitrary inode that happened to be in this
17748+ * hash chain. Hence we first have to check that this is
17749+ * reiser4 inode at least. is_reiser4_inode() is probably too
17750+ * early to call, as inode may have ->i_op not yet
17751+ * initialised.
17752+ */
17753+ is_reiser4_super(inode->i_sb) &&
17754+ /*
17755+ * usually objectid is unique, but pseudo files use counter to
17756+ * generate objectid. All pseudo files are placed into special
17757+ * (otherwise unused) locality.
17758+ */
17759+ reiser4_inode_data(inode)->locality_id == get_key_locality(key);
17760+}
17761+
17762+/* hook for kmem_cache_create */
17763+void loading_init_once(reiser4_inode * info)
17764+{
17765+ sema_init(&info->loading, 1);
17766+}
17767+
17768+/* for reiser4_alloc_inode */
17769+void loading_alloc(reiser4_inode * info)
17770+{
17771+#if REISER4_DEBUG
17772+ assert("vs-1717", down_trylock(&info->loading) == 0);
17773+ up(&info->loading);
17774+#endif
17775+}
17776+
17777+/* for reiser4_destroy */
17778+void loading_destroy(reiser4_inode * info)
17779+{
17780+#if REISER4_DEBUG
17781+ assert("vs-1717", down_trylock(&info->loading) == 0);
17782+ up(&info->loading);
17783+#endif
17784+}
17785+
17786+static void loading_down(reiser4_inode * info)
17787+{
17788+ down(&info->loading);
17789+}
17790+
17791+static void loading_up(reiser4_inode * info)
17792+{
17793+ up(&info->loading);
17794+}
17795+
17796+/**
17797+ * reiser4_iget - obtain inode via iget5_locked, read from disk if necessary
17798+ * @super: super block of filesystem
17799+ * @key: key of inode's stat-data
17800+ * @silent: suppress warning messages when non-zero
17801+ *
17802+ * This is our helper function a la iget(). It is called by
17803+ * reiser4_lookup() and reiser4_read_super(). Returns the inode locked, or
17804+ * the error encountered.
17805+ */
17806+struct inode *reiser4_iget(struct super_block *super, const reiser4_key *key,
17807+ int silent)
17808+{
17809+ struct inode *inode;
17810+ int result;
17811+ reiser4_inode *info;
17812+
17813+ assert("nikita-302", super != NULL);
17814+ assert("nikita-303", key != NULL);
17815+
17816+ result = 0;
17817+
17818+ /* call iget(). Our ->read_inode() is dummy, so this will either
17819+ find inode in cache or return uninitialised inode */
17820+ inode = iget5_locked(super,
17821+ (unsigned long)get_key_objectid(key),
17822+ reiser4_inode_find_actor,
17823+ init_locked_inode, (reiser4_key *) key);
17824+ if (inode == NULL)
17825+ return ERR_PTR(RETERR(-ENOMEM));
17826+ if (is_bad_inode(inode)) {
17827+ warning("nikita-304", "Bad inode found");
17828+ print_key("key", key);
17829+ iput(inode);
17830+ return ERR_PTR(RETERR(-EIO));
17831+ }
17832+
17833+ info = reiser4_inode_data(inode);
17834+
17835+	/* The reiser4 inode state bit REISER4_LOADED is used to distinguish a
17836+	   fully loaded and initialized inode from a just allocated one. If the
17837+	   REISER4_LOADED bit is not set, reiser4_iget() completes loading under
17838+	   info->loading. The place in reiser4 which uses a not yet initialized
17839+	   inode is the reiser4 repacker, see repacker-related functions in
17840+ plugin/item/extent.c */
17841+ if (!is_inode_loaded(inode)) {
17842+ loading_down(info);
17843+ if (!is_inode_loaded(inode)) {
17844+ /* locking: iget5_locked returns locked inode */
17845+ assert("nikita-1941", !is_inode_loaded(inode));
17846+ assert("nikita-1949",
17847+ reiser4_inode_find_actor(inode,
17848+ (reiser4_key *) key));
17849+ /* now, inode has objectid as ->i_ino and locality in
17850+ reiser4-specific part. This is enough for
17851+ read_inode() to read stat data from the disk */
17852+ result = read_inode(inode, key, silent);
17853+ } else
17854+ loading_up(info);
17855+ }
17856+
17857+ if (inode->i_state & I_NEW)
17858+ unlock_new_inode(inode);
17859+
17860+ if (is_bad_inode(inode)) {
17861+ assert("vs-1717", result != 0);
17862+ loading_up(info);
17863+ iput(inode);
17864+ inode = ERR_PTR(result);
17865+ } else if (REISER4_DEBUG) {
17866+ reiser4_key found_key;
17867+
17868+ assert("vs-1717", result == 0);
17869+ build_sd_key(inode, &found_key);
17870+ if (!keyeq(&found_key, key)) {
17871+ warning("nikita-305", "Wrong key in sd");
17872+ print_key("sought for", key);
17873+ print_key("found", &found_key);
17874+ }
17875+ if (inode->i_nlink == 0) {
17876+ warning("nikita-3559", "Unlinked inode found: %llu\n",
17877+ (unsigned long long)get_inode_oid(inode));
17878+ }
17879+ }
17880+ return inode;
17881+}
17882+
17883+/* reiser4_iget() may return a not fully initialized inode; this function
17884+ * should be called once reiser4 inode initialization is complete. */
17885+void reiser4_iget_complete(struct inode *inode)
17886+{
17887+ assert("zam-988", is_reiser4_inode(inode));
17888+
17889+ if (!is_inode_loaded(inode)) {
17890+ inode_set_flag(inode, REISER4_LOADED);
17891+ loading_up(reiser4_inode_data(inode));
17892+ }
17893+}
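+
+/*
+ * Typical calling pattern (see init_root_inode() in init_super.c for a real
+ * caller):
+ *
+ *	inode = reiser4_iget(super, key, 0);
+ *	if (IS_ERR(inode))
+ *		return PTR_ERR(inode);
+ *	if (!is_inode_loaded(inode)) {
+ *		(finish plugin setup here)
+ *		reiser4_iget_complete(inode);
+ *	}
+ */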
17894+
17895+void reiser4_make_bad_inode(struct inode *inode)
17896+{
17897+ assert("nikita-1934", inode != NULL);
17898+
17899+ /* clear LOADED bit */
17900+ inode_clr_flag(inode, REISER4_LOADED);
17901+ make_bad_inode(inode);
17902+ return;
17903+}
17904+
17905+file_plugin *inode_file_plugin(const struct inode * inode)
17906+{
17907+ assert("nikita-1997", inode != NULL);
17908+ return reiser4_inode_data(inode)->pset->file;
17909+}
17910+
17911+dir_plugin *inode_dir_plugin(const struct inode * inode)
17912+{
17913+ assert("nikita-1998", inode != NULL);
17914+ return reiser4_inode_data(inode)->pset->dir;
17915+}
17916+
17917+#if 0
17918+perm_plugin *inode_perm_plugin(const struct inode * inode)
17919+{
17920+ assert("nikita-1999", inode != NULL);
17921+ return reiser4_inode_data(inode)->pset->perm;
17922+}
17923+#endif /* 0 */
17924+
17925+formatting_plugin *inode_formatting_plugin(const struct inode * inode)
17926+{
17927+ assert("nikita-2000", inode != NULL);
17928+ return reiser4_inode_data(inode)->pset->formatting;
17929+}
17930+
17931+hash_plugin *inode_hash_plugin(const struct inode * inode)
17932+{
17933+ assert("nikita-2001", inode != NULL);
17934+ return reiser4_inode_data(inode)->pset->hash;
17935+}
17936+
17937+fibration_plugin *inode_fibration_plugin(const struct inode * inode)
17938+{
17939+ assert("nikita-2001", inode != NULL);
17940+ return reiser4_inode_data(inode)->pset->fibration;
17941+}
17942+
17943+cipher_plugin *inode_cipher_plugin(const struct inode * inode)
17944+{
17945+ assert("edward-36", inode != NULL);
17946+ return reiser4_inode_data(inode)->pset->cipher;
17947+}
17948+
17949+compression_plugin *inode_compression_plugin(const struct inode * inode)
17950+{
17951+ assert("edward-37", inode != NULL);
17952+ return reiser4_inode_data(inode)->pset->compression;
17953+}
17954+
17955+compression_mode_plugin *inode_compression_mode_plugin(const struct inode *
17956+ inode)
17957+{
17958+ assert("edward-1330", inode != NULL);
17959+ return reiser4_inode_data(inode)->pset->compression_mode;
17960+}
17961+
17962+cluster_plugin *inode_cluster_plugin(const struct inode * inode)
17963+{
17964+ assert("edward-1328", inode != NULL);
17965+ return reiser4_inode_data(inode)->pset->cluster;
17966+}
17967+
17968+regular_plugin *inode_regular_plugin(const struct inode * inode)
17969+{
17970+ assert("edward-1329", inode != NULL);
17971+ return reiser4_inode_data(inode)->pset->regular_entry;
17972+}
17973+
17974+digest_plugin *inode_digest_plugin(const struct inode * inode)
17975+{
17976+ assert("edward-86", inode != NULL);
17977+ return reiser4_inode_data(inode)->pset->digest;
17978+}
17979+
17980+item_plugin *inode_sd_plugin(const struct inode * inode)
17981+{
17982+ assert("vs-534", inode != NULL);
17983+ return reiser4_inode_data(inode)->pset->sd;
17984+}
17985+
17986+item_plugin *inode_dir_item_plugin(const struct inode * inode)
17987+{
17988+ assert("vs-534", inode != NULL);
17989+ return reiser4_inode_data(inode)->pset->dir_item;
17990+}
17991+
17992+void inode_set_extension(struct inode *inode, sd_ext_bits ext)
17993+{
17994+ reiser4_inode *state;
17995+
17996+ assert("nikita-2716", inode != NULL);
17997+ assert("nikita-2717", ext < LAST_SD_EXTENSION);
17998+ assert("nikita-3491", spin_inode_is_locked(inode));
17999+
18000+ state = reiser4_inode_data(inode);
18001+ state->extmask |= 1 << ext;
18002+ /* force re-calculation of stat-data length on next call to
18003+ update_sd(). */
18004+ inode_clr_flag(inode, REISER4_SDLEN_KNOWN);
18005+}
18006+
18007+void
18008+inode_set_plugin(struct inode *inode, reiser4_plugin * plug, pset_member memb)
18009+{
18010+ assert("nikita-2718", inode != NULL);
18011+ assert("nikita-2719", plug != NULL);
18012+
18013+ reiser4_inode_data(inode)->plugin_mask |= (1 << memb);
18014+}
18015+
18016+void inode_check_scale_nolock(struct inode *inode, __u64 old, __u64 new)
18017+{
18018+ assert("edward-1287", inode != NULL);
18019+ if (!dscale_fit(old, new))
18020+ inode_clr_flag(inode, REISER4_SDLEN_KNOWN);
18021+ return;
18022+}
18023+
18024+void inode_check_scale(struct inode *inode, __u64 old, __u64 new)
18025+{
18026+ assert("nikita-2875", inode != NULL);
18027+ spin_lock_inode(inode);
18028+ inode_check_scale_nolock(inode, old, new);
18029+ spin_unlock_inode(inode);
18030+}
18031+
18032+/*
18033+ * initialize ->ordering field of inode. This field defines how file stat-data
18034+ * and body are ordered within the tree with respect to other objects within the
18035+ * same parent directory.
18036+ */
18037+void
18038+init_inode_ordering(struct inode *inode,
18039+ reiser4_object_create_data * crd, int create)
18040+{
18041+ reiser4_key key;
18042+
18043+ if (create) {
18044+ struct inode *parent;
18045+
18046+ parent = crd->parent;
18047+ assert("nikita-3224", inode_dir_plugin(parent) != NULL);
18048+ inode_dir_plugin(parent)->build_entry_key(parent,
18049+ &crd->dentry->d_name,
18050+ &key);
18051+ } else {
18052+ coord_t *coord;
18053+
18054+ coord = &reiser4_inode_data(inode)->sd_coord;
18055+ coord_clear_iplug(coord);
18056+ /* safe to use ->sd_coord, because node is under long term
18057+ * lock */
18058+ WITH_DATA(coord->node, item_key_by_coord(coord, &key));
18059+ }
18060+
18061+ set_inode_ordering(inode, get_key_ordering(&key));
18062+}
18063+
18064+znode *inode_get_vroot(struct inode *inode)
18065+{
18066+ reiser4_block_nr blk;
18067+ znode *result;
18068+
18069+ spin_lock_inode(inode);
18070+ blk = reiser4_inode_data(inode)->vroot;
18071+ spin_unlock_inode(inode);
18072+ if (!disk_addr_eq(&UBER_TREE_ADDR, &blk))
18073+ result = zlook(tree_by_inode(inode), &blk);
18074+ else
18075+ result = NULL;
18076+ return result;
18077+}
18078+
18079+void inode_set_vroot(struct inode *inode, znode *vroot)
18080+{
18081+ spin_lock_inode(inode);
18082+ reiser4_inode_data(inode)->vroot = *znode_get_block(vroot);
18083+ spin_unlock_inode(inode);
18084+}
18085+
18086+#if REISER4_DEBUG
18087+
18088+void inode_invariant(const struct inode *inode)
18089+{
18090+ assert("nikita-3077", spin_inode_is_locked(inode));
18091+}
18092+
18093+int inode_has_no_jnodes(reiser4_inode * r4_inode)
18094+{
18095+ return jnode_tree_by_reiser4_inode(r4_inode)->rnode == NULL &&
18096+ r4_inode->nr_jnodes == 0;
18097+}
18098+
18099+#endif
18100+
18101+/* 0 if directory is empty (only contains dot and dotdot), -ENOTEMPTY otherwise */
18102+/* FIXME: shouldn't it be dir plugin method? */
18103+int is_dir_empty(const struct inode *dir)
18104+{
18105+ assert("nikita-1976", dir != NULL);
18106+
18107+ /* rely on our method to maintain directory i_size being equal to the
18108+ number of entries. */
18109+ return dir->i_size <= 2 ? 0 : RETERR(-ENOTEMPTY);
18110+}
18111+
18112+/* Make Linus happy.
18113+ Local variables:
18114+ c-indentation-style: "K&R"
18115+ mode-name: "LC"
18116+ c-basic-offset: 8
18117+ tab-width: 8
18118+ fill-column: 120
18119+ End:
18120+*/
18121Index: linux-2.6.16/fs/reiser4/inode.h
18122===================================================================
18123--- /dev/null
18124+++ linux-2.6.16/fs/reiser4/inode.h
18125@@ -0,0 +1,430 @@
18126+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
18127+
18128+/* Inode functions. */
18129+
18130+#if !defined( __REISER4_INODE_H__ )
18131+#define __REISER4_INODE_H__
18132+
18133+#include "forward.h"
18134+#include "debug.h"
18135+#include "key.h"
18136+#include "seal.h"
18137+#include "plugin/plugin.h"
18138+#include "plugin/file/cryptcompress.h"
18139+#include "plugin/file/file.h"
18140+#include "plugin/dir/dir.h"
18141+#include "plugin/plugin_set.h"
18142+#include "plugin/security/perm.h"
18143+#include "vfs_ops.h"
18144+#include "jnode.h"
18145+#include "fsdata.h"
18146+
18147+#include <linux/types.h> /* for __u?? , ino_t */
18148+#include <linux/fs.h> /* for struct super_block, struct
18149+ * rw_semaphore, etc */
18150+#include <linux/spinlock.h>
18151+#include <asm/types.h>
18152+
18153+/* reiser4-specific inode flags. They are "transient" and are not
18154+   supposed to be stored on disk. Used to track the "state" of the
18155+   inode
18156+*/
18157+typedef enum {
18158+ /* this is light-weight inode, inheriting some state from its
18159+ parent */
18160+ REISER4_LIGHT_WEIGHT = 0,
18161+ /* stat data wasn't yet created */
18162+ REISER4_NO_SD = 1,
18163+ /* internal immutable flag. Currently is only used
18164+ to avoid race condition during file creation.
18165+ See comment in create_object(). */
18166+ REISER4_IMMUTABLE = 2,
18167+ /* inode was read from storage */
18168+ REISER4_LOADED = 3,
18169+ /* this bit is set for symlinks. inode->u.generic_ip points to target
18170+ name of symlink. */
18171+ REISER4_GENERIC_PTR_USED = 4,
18172+ /* set if size of stat-data item for this inode is known. If this is
18173+ * set we can avoid recalculating size of stat-data on each update. */
18174+ REISER4_SDLEN_KNOWN = 5,
18175+ /* reiser4_inode->crypt points to the crypto stat */
18176+ REISER4_CRYPTO_STAT_LOADED = 6,
18177+ /* cryptcompress_inode_data points to the secret key */
18178+ REISER4_SECRET_KEY_INSTALLED = 7,
18179+ /* File (possibly) has pages corresponding to the tail items, that
18180+ * were created by ->readpage. It is set by mmap_unix_file() and
18181+ * sendfile_unix_file(). This bit is inspected by write_unix_file and
18182+ * kill-hook of tail items. It is never cleared once set. This bit is
18183+ * modified and inspected under i_mutex. */
18184+ REISER4_HAS_MMAP = 8,
18185+
18186+ REISER4_PART_MIXED = 9,
18187+ REISER4_PART_IN_CONV = 10
18188+} reiser4_file_plugin_flags;
18189+
18190+/* state associated with each inode.
18191+ reiser4 inode.
18192+
18193+ NOTE-NIKITA In 2.5 kernels it is not necessary that all file-system
18194+ inodes be of the same size. The file system allocates inodes itself
18195+ through the s_op->alloc_inode() method, so the size of the inode can
18196+ be adjusted at the time of its creation.
18197+
18198+ Invariants involving parts of this data-type:
18199+
18200+ [inode->eflushed]
18201+
18202+*/
18203+
18204+typedef struct reiser4_inode reiser4_inode;
18205+/* return pointer to reiser4-specific part of inode */
18206+static inline reiser4_inode *reiser4_inode_data(const struct inode *inode
18207+ /* inode queried */ );
18208+
18209+#if BITS_PER_LONG == 64
18210+
18211+#define REISER4_INO_IS_OID (1)
18212+typedef struct {
18213+} oid_hi_t;
18214+
18215+/* BITS_PER_LONG == 64 */
18216+#else
18217+
18218+#define REISER4_INO_IS_OID (0)
18219+typedef __u32 oid_hi_t;
18220+
18221+/* BITS_PER_LONG == 64 */
18222+#endif
18223+
18224+struct reiser4_inode {
18225+ /* spin lock protecting fields of this structure. */
18226+ spinlock_t guard;
18227+ /* object plugins */
18228+ plugin_set *pset;
18229+ /* plugins set for inheritance */
18230+ plugin_set *hset;
18231+ /* high 32 bits of object id */
18232+ oid_hi_t oid_hi;
18233+ /* seal for stat-data */
18234+ seal_t sd_seal;
18235+ /* locality id for this file */
18236+ oid_t locality_id;
18237+#if REISER4_LARGE_KEY
18238+ __u64 ordering;
18239+#endif
18240+ /* coord of stat-data in sealed node */
18241+ coord_t sd_coord;
18242+ /* bit-mask of stat-data extensions used by this file */
18243+ __u64 extmask;
18244+ /* bitmask of non-default plugins for this inode */
18245+ __u16 plugin_mask;
18246+ union {
18247+ struct list_head readdir_list;
18248+ struct list_head not_used;
18249+ } lists;
18250+ /* per-inode flags. Filled by values of reiser4_file_plugin_flags */
18251+ unsigned long flags;
18252+ union {
18253+ /* fields specific to unix_file plugin */
18254+ unix_file_info_t unix_file_info;
18255+ /* fields specific to cryptcompress plugin */
18256+ cryptcompress_info_t cryptcompress_info;
18257+ } file_plugin_data;
18258+
18259+ /* tree of jnodes. Phantom jnodes (ones not attached to any atom) are
18260+ tagged in that tree by EFLUSH_TAG_ANONYMOUS */
18261+ struct radix_tree_root jnodes_tree;
18262+#if REISER4_DEBUG
18263+ /* number of unformatted node jnodes of this file in jnode hash table */
18264+ unsigned long nr_jnodes;
18265+#endif
18266+
18267+ /* block number of virtual root for this object. See comment above
18268+ * fs/reiser4/search.c:handle_vroot() */
18269+ reiser4_block_nr vroot;
18270+ struct semaphore loading;
18271+};
18272+
18273+void loading_init_once(reiser4_inode *);
18274+void loading_alloc(reiser4_inode *);
18275+void loading_destroy(reiser4_inode *);
18276+
18277+typedef struct reiser4_inode_object {
18278+ /* private part */
18279+ reiser4_inode p;
18280+ /* generic fields not specific to reiser4, but used by VFS */
18281+ struct inode vfs_inode;
18282+} reiser4_inode_object;
18283+
18284+/* return pointer to the reiser4 specific portion of @inode */
18285+static inline reiser4_inode *reiser4_inode_data(const struct inode *inode
18286+ /* inode queried */ )
18287+{
18288+ assert("nikita-254", inode != NULL);
18289+ return &container_of(inode, reiser4_inode_object, vfs_inode)->p;
18290+}
18291+
18292+static inline struct inode *inode_by_reiser4_inode(const reiser4_inode *
18293+ r4_inode /* inode queried */
18294+ )
18295+{
18296+ return &container_of(r4_inode, reiser4_inode_object, p)->vfs_inode;
18297+}
18298+
18299+/*
18300+ * reiser4 inodes are identified by 64bit object-id (oid_t), but in struct
18301+ * inode ->i_ino field is of type ino_t (long) that can be either 32 or 64
18302+ * bits.
18303+ *
18304+ * If ->i_ino is 32 bits, we store the remaining 32 bits in the reiser4
18305+ * specific part of the inode; otherwise the whole oid is stored in i_ino.
18306+ *
18307+ * Wrappers below ([sg]et_inode_oid()) are used to hide this difference.
18308+ */
18309+
18310+#define OID_HI_SHIFT (sizeof(ino_t) * 8)
18311+
18312+#if REISER4_INO_IS_OID
18313+
18314+static inline oid_t get_inode_oid(const struct inode *inode)
18315+{
18316+ return inode->i_ino;
18317+}
18318+
18319+static inline void set_inode_oid(struct inode *inode, oid_t oid)
18320+{
18321+ inode->i_ino = oid;
18322+}
18323+
18324+/* REISER4_INO_IS_OID */
18325+#else
18326+
18327+static inline oid_t get_inode_oid(const struct inode *inode)
18328+{
18329+ return
18330+ ((__u64) reiser4_inode_data(inode)->oid_hi << OID_HI_SHIFT) |
18331+ inode->i_ino;
18332+}
18333+
18334+static inline void set_inode_oid(struct inode *inode, oid_t oid)
18335+{
18336+ assert("nikita-2519", inode != NULL);
18337+ inode->i_ino = (ino_t) (oid);
18338+ reiser4_inode_data(inode)->oid_hi = (oid) >> OID_HI_SHIFT;
18339+ assert("nikita-2521", get_inode_oid(inode) == (oid));
18340+}
18341+
18342+/* REISER4_INO_IS_OID */
18343+#endif
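As a minimal illustration of the split described above (a self-contained
user-space sketch, not part of the patch; the demo value is arbitrary), a
64-bit oid round-trips losslessly through a 32-bit ino plus the stored high
half:

	#include <assert.h>
	#include <stdint.h>

	int main(void)
	{
		uint64_t oid = 0x0000000500000abcULL;
		uint32_t ino = (uint32_t)oid;             /* low 32 bits -> i_ino */
		uint32_t oid_hi = (uint32_t)(oid >> 32);  /* high 32 bits -> reiser4_inode */
		uint64_t back = ((uint64_t)oid_hi << 32) | ino;

		assert(back == oid);	/* lossless round-trip */
		return 0;
	}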
18344+
18345+static inline oid_t get_inode_locality(const struct inode *inode)
18346+{
18347+ return reiser4_inode_data(inode)->locality_id;
18348+}
18349+
18350+#if REISER4_LARGE_KEY
18351+static inline __u64 get_inode_ordering(const struct inode *inode)
18352+{
18353+ return reiser4_inode_data(inode)->ordering;
18354+}
18355+
18356+static inline void set_inode_ordering(const struct inode *inode, __u64 ordering)
18357+{
18358+ reiser4_inode_data(inode)->ordering = ordering;
18359+}
18360+
18361+#else
18362+
18363+#define get_inode_ordering(inode) (0)
18364+#define set_inode_ordering(inode, val) noop
18365+
18366+#endif
18367+
18368+/* return inode in which @uf_info is embedded */
18369+static inline struct inode *unix_file_info_to_inode(const unix_file_info_t *
18370+ uf_info)
18371+{
18372+ return &container_of(uf_info, reiser4_inode_object,
18373+ p.file_plugin_data.unix_file_info)->vfs_inode;
18374+}
18375+
18376+
18377+extern ino_t oid_to_ino(oid_t oid) __attribute__ ((const));
18378+extern ino_t oid_to_uino(oid_t oid) __attribute__ ((const));
18379+
18380+extern reiser4_tree *tree_by_inode(const struct inode *inode);
18381+
18382+#if REISER4_DEBUG
18383+extern void inode_invariant(const struct inode *inode);
18384+extern int inode_has_no_jnodes(reiser4_inode *);
18385+#else
18386+#define inode_invariant(inode) noop
18387+#endif
18388+
18389+static inline int spin_inode_is_locked(const struct inode *inode)
18390+{
18391+ assert_spin_locked(&reiser4_inode_data(inode)->guard);
18392+ return 1;
18393+}
18394+
18395+/**
18396+ * spin_lock_inode - lock reiser4_inode's embedded spinlock
18397+ * @inode: inode to lock
18398+ *
18399+ * In debug mode it checks that lower priority locks are not held and
18400+ * increments reiser4_context's lock counters on which lock ordering checking
18401+ * is based.
18402+ */
18403+static inline void spin_lock_inode(struct inode *inode)
18404+{
18405+ assert("", LOCK_CNT_NIL(spin_locked));
18406+ /* check lock ordering */
18407+ assert_spin_not_locked(&d_lock);
18408+
18409+ spin_lock(&reiser4_inode_data(inode)->guard);
18410+
18411+ LOCK_CNT_INC(spin_locked_inode);
18412+ LOCK_CNT_INC(spin_locked);
18413+
18414+ inode_invariant(inode);
18415+}
18416+
18417+/**
18418+ * spin_unlock_inode - unlock reiser4_inode's embedded spinlock
18419+ * @inode: inode to unlock
18420+ *
18421+ * In debug mode it checks that spinlock is held and decrements
18422+ * reiser4_context's lock counters on which lock ordering checking is based.
18423+ */
18424+static inline void spin_unlock_inode(struct inode *inode)
18425+{
18426+ assert_spin_locked(&reiser4_inode_data(inode)->guard);
18427+ assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_inode));
18428+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
18429+
18430+ inode_invariant(inode);
18431+
18432+ LOCK_CNT_DEC(spin_locked_inode);
18433+ LOCK_CNT_DEC(spin_locked);
18434+
18435+ spin_unlock(&reiser4_inode_data(inode)->guard);
18436+}
18437+
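A typical caller pairs these helpers around short updates of fields covered
by reiser4_inode->guard; a minimal sketch (the updated field is left
abstract, as an assumption):

	static void update_guarded_state(struct inode *inode)
	{
		spin_lock_inode(inode);		/* takes ->guard, bumps lock counters */
		/* ... modify reiser4_inode fields protected by ->guard ... */
		spin_unlock_inode(inode);	/* checks counters, drops ->guard */
	}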
18438+
18439+extern znode *inode_get_vroot(struct inode *inode);
18440+extern void inode_set_vroot(struct inode *inode, znode * vroot);
18441+
18442+extern int reiser4_max_filename_len(const struct inode *inode);
18443+extern int max_hash_collisions(const struct inode *dir);
18444+extern void reiser4_unlock_inode(struct inode *inode);
18445+extern int is_reiser4_inode(const struct inode *inode);
18446+extern int setup_inode_ops(struct inode *inode, reiser4_object_create_data *);
18447+extern struct inode *reiser4_iget(struct super_block *super,
18448+ const reiser4_key * key, int silent);
18449+extern void reiser4_iget_complete(struct inode *inode);
18450+extern void inode_set_flag(struct inode *inode, reiser4_file_plugin_flags f);
18451+extern void inode_clr_flag(struct inode *inode, reiser4_file_plugin_flags f);
18452+extern int inode_get_flag(const struct inode *inode,
18453+ reiser4_file_plugin_flags f);
18454+
18455+/* has inode been initialized? */
18456+static inline int
18457+is_inode_loaded(const struct inode *inode /* inode queried */ )
18458+{
18459+ assert("nikita-1120", inode != NULL);
18460+ return inode_get_flag(inode, REISER4_LOADED);
18461+}
18462+
18463+extern file_plugin *inode_file_plugin(const struct inode *inode);
18464+extern dir_plugin *inode_dir_plugin(const struct inode *inode);
18465+extern formatting_plugin *inode_formatting_plugin(const struct inode *inode);
18466+extern hash_plugin *inode_hash_plugin(const struct inode *inode);
18467+extern fibration_plugin *inode_fibration_plugin(const struct inode *inode);
18468+extern cipher_plugin *inode_cipher_plugin(const struct inode *inode);
18469+extern digest_plugin *inode_digest_plugin(const struct inode *inode);
18470+extern compression_plugin *inode_compression_plugin(const struct inode *inode);
18471+extern compression_mode_plugin *inode_compression_mode_plugin(const struct inode
18472+ *inode);
18473+extern cluster_plugin *inode_cluster_plugin(const struct inode *inode);
18474+extern regular_plugin *inode_regular_plugin(const struct inode *inode);
18475+extern item_plugin *inode_sd_plugin(const struct inode *inode);
18476+extern item_plugin *inode_dir_item_plugin(const struct inode *inode);
18477+
18478+extern void inode_set_plugin(struct inode *inode,
18479+ reiser4_plugin * plug, pset_member memb);
18480+extern void reiser4_make_bad_inode(struct inode *inode);
18481+
18482+extern void inode_set_extension(struct inode *inode, sd_ext_bits ext);
18483+extern void inode_check_scale(struct inode *inode, __u64 old, __u64 new);
18484+extern void inode_check_scale_nolock(struct inode * inode, __u64 old, __u64 new);
18485+
18486+/*
18487+ * update field @field in inode @i to contain value @value.
18488+ */
18489+#define INODE_SET_FIELD(i, field, value) \
18490+({ \
18491+ struct inode *__i; \
18492+ typeof(value) __v; \
18493+ \
18494+ __i = (i); \
18495+ __v = (value); \
18496+ inode_check_scale(__i, __i->field, __v); \
18497+ __i->field = __v; \
18498+})
18499+
18500+#define INODE_INC_FIELD(i, field) \
18501+({ \
18502+ struct inode *__i; \
18503+ \
18504+ __i = (i); \
18505+ inode_check_scale(__i, __i->field, __i->field + 1); \
18506+ ++ __i->field; \
18507+})
18508+
18509+#define INODE_DEC_FIELD(i, field) \
18510+({ \
18511+ struct inode *__i; \
18512+ \
18513+ __i = (i); \
18514+ inode_check_scale(__i, __i->field, __i->field - 1); \
18515+ -- __i->field; \
18516+})
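For illustration (hypothetical call sites, not from the patch), size and
link-count updates would route through these wrappers so that
inode_check_scale() can watch the transitions:

	INODE_SET_FIELD(inode, i_size, (loff_t) new_size);	/* new_size is hypothetical */
	INODE_INC_FIELD(inode, i_nlink);
	INODE_DEC_FIELD(inode, i_nlink);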
18517+
18518+/* See comment before readdir_common() for description. */
18519+static inline struct list_head *get_readdir_list(const struct inode *inode)
18520+{
18521+ return &reiser4_inode_data(inode)->lists.readdir_list;
18522+}
18523+
18524+extern void init_inode_ordering(struct inode *inode,
18525+ reiser4_object_create_data * crd, int create);
18526+
18527+static inline struct radix_tree_root *jnode_tree_by_inode(struct inode *inode)
18528+{
18529+ return &reiser4_inode_data(inode)->jnodes_tree;
18530+}
18531+
18532+static inline struct radix_tree_root *jnode_tree_by_reiser4_inode(reiser4_inode
18533+ * r4_inode)
18534+{
18535+ return &r4_inode->jnodes_tree;
18536+}
18537+
18538+#if REISER4_DEBUG
18539+extern void print_inode(const char *prefix, const struct inode *i);
18540+#endif
18541+
18542+int is_dir_empty(const struct inode *);
18543+
18544+/* __REISER4_INODE_H__ */
18545+#endif
18546+
18547+/* Make Linus happy.
18548+ Local variables:
18549+ c-indentation-style: "K&R"
18550+ mode-name: "LC"
18551+ c-basic-offset: 8
18552+ tab-width: 8
18553+ fill-column: 120
18554+ End:
18555+*/
18556Index: linux-2.6.16/fs/reiser4/ioctl.h
18557===================================================================
18558--- /dev/null
18559+++ linux-2.6.16/fs/reiser4/ioctl.h
18560@@ -0,0 +1,41 @@
18561+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
18562+ * reiser4/README */
18563+
18564+#if !defined( __REISER4_IOCTL_H__ )
18565+#define __REISER4_IOCTL_H__
18566+
18567+#include <linux/fs.h>
18568+
18569+/*
18570+ * ioctl(2) command used to "unpack" a reiser4 file, that is, convert it into
18571+ * extents and fix it in that state. This is used by applications that rely on
18572+ *
18573+ * . files being block aligned, and
18574+ *
18575+ * . files never migrating on disk
18576+ *
18577+ * for example, boot loaders (LILO) need this.
18578+ *
18579+ * This ioctl should be used as
18580+ *
18581+ * result = ioctl(fd, REISER4_IOC_UNPACK);
18582+ *
18583+ * The file behind the descriptor fd will be converted to extents (if
18584+ * necessary), and its stat-data will be updated so that it is never
18585+ * converted back into tails again.
18586+ */
18587+#define REISER4_IOC_UNPACK _IOW(0xCD,1,long)
18588+
18589+/* __REISER4_IOCTL_H__ */
18590+#endif
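A complete user-space caller is barely longer than the one-line example in
the comment above; a sketch (assuming this header, or an equivalent
definition of REISER4_IOC_UNPACK, is available to the program):

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/ioctl.h>
	#include "ioctl.h"	/* REISER4_IOC_UNPACK */

	int main(int argc, char **argv)
	{
		int fd;

		if (argc < 2)
			return 1;
		fd = open(argv[1], O_RDONLY);
		if (fd < 0 || ioctl(fd, REISER4_IOC_UNPACK) < 0) {
			perror("REISER4_IOC_UNPACK");
			return 1;
		}
		/* the file is now fixed in extent form */
		return 0;
	}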
18591+
18592+/* Make Linus happy.
18593+ Local variables:
18594+ c-indentation-style: "K&R"
18595+ mode-name: "LC"
18596+ c-basic-offset: 8
18597+ tab-width: 8
18598+ fill-column: 120
18599+ scroll-step: 1
18600+ End:
18601+*/
18602Index: linux-2.6.16/fs/reiser4/jnode.c
18603===================================================================
18604--- /dev/null
18605+++ linux-2.6.16/fs/reiser4/jnode.c
18606@@ -0,0 +1,1921 @@
18607+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
18608+ * reiser4/README */
18609+/* Jnode manipulation functions. */
18610+/* A jnode is an entity used to track blocks holding data and meta-data in reiser4.
18611+
18612+ In particular, jnodes are used to track transactional information
18613+ associated with each block. Each znode contains a jnode as its ->zjnode field.
18614+
18615+ Jnode stands for either Josh or Journal node.
18616+*/
18617+
18618+/*
18619+ * Taxonomy.
18620+ *
18621+ * A jnode represents a block containing data or meta-data. There are jnodes
18622+ * for:
18623+ *
18624+ * unformatted blocks (jnodes proper). There are plans, however, to
18625+ * have a handle per extent unit rather than per unformatted
18626+ * block, because there are so many of them.
18627+ *
18628+ * For bitmaps. Each bitmap is actually represented by two jnodes--one
18629+ * for working and another for "commit" data, together forming bnode.
18630+ *
18631+ * For io-heads. These are used by log writer.
18632+ *
18633+ * For formatted nodes (znode). See comment at the top of znode.c for
18634+ * details specific to the formatted nodes (znodes).
18635+ *
18636+ * Node data.
18637+ *
18638+ * A jnode provides access to the data of the node it represents. The data
18639+ * is stored in a page kept in the page cache. This means that jnodes
18640+ * are highly interconnected with the page cache and VM internals.
18641+ *
18642+ * jnode has a pointer to page (->pg) containing its data. Pointer to data
18643+ * themselves is cached in ->data field to avoid frequent calls to
18644+ * page_address().
18645+ *
18646+ * A jnode and a page are attached to each other by jnode_attach_page(). This
18647+ * function stores a pointer to the jnode via set_page_private(), sets the
18648+ * PG_private flag and increments the page reference counter.
18649+ *
18650+ * Opposite operation is performed by page_clear_jnode().
18651+ *
18652+ * jnode->pg is protected by jnode spin lock, and page->private is
18653+ * protected by page lock. See comment at the top of page_cache.c for
18654+ * more.
18655+ *
18656+ * page can be detached from jnode for two reasons:
18657+ *
18658+ * . jnode is removed from a tree (the file is truncated, or a formatted
18659+ * node is removed by balancing).
18660+ *
18661+ * . during memory pressure, VM calls ->releasepage() method
18662+ * (reiser4_releasepage()) to evict page from memory.
18663+ *
18664+ * (there is, of course, also umount, but that is a special case we are not
18665+ * concerned with here).
18666+ *
18667+ * To protect a jnode's page from eviction, one calls jload(), which
18668+ * "pins" the page in memory (loading it if necessary), increments
18669+ * jnode->d_count, and kmap()s the page. The page is unpinned by a call to
18670+ * jrelse().
18671+ *
18672+ * Jnode life cycle.
18673+ *
18674+ * jnode is created, placed in hash table, and, optionally, in per-inode
18675+ * radix tree. Page can be attached to jnode, pinned, released, etc.
18676+ *
18677+ * When a jnode is captured into an atom, its reference counter is
18678+ * increased. While being part of an atom, a jnode can be "early
18679+ * flushed". This means that as part of flush procedure, jnode is placed
18680+ * into "relocate set", and its page is submitted to the disk. After io
18681+ * completes, page can be detached, then loaded again, re-dirtied, etc.
18682+ *
18683+ * A thread acquires a reference to a jnode by calling jref() and releases it
18684+ * with jput(). When the last reference is dropped, the jnode is still retained
18685+ * in memory (cached) if it has a page attached, _unless_ it is scheduled for
18686+ * destruction (has the JNODE_HEARD_BANSHEE bit set).
18687+ *
18688+ * The tree read-write lock used to serve as an "existential" lock for jnodes.
18689+ * That is, jnode->x_count could be changed from 0 to 1 only under the tree
18690+ * write lock, so the tree lock protected unreferenced jnodes stored in the
18691+ * hash table from recycling.
18692+ *
18693+ * This resulted in high contention on the tree lock, because jref()/jput()
18694+ * are frequent operations. To ameliorate this problem, RCU is used: when
18695+ * jput() is just about to release the last reference on a jnode, it sets the
18696+ * JNODE_RIP bit on it, and then proceeds with jnode destruction (removing the
18697+ * jnode from the hash table and cbk_cache, detaching the page, etc.). All
18698+ * places that change the jnode reference counter from 0 to 1 (jlookup(),
18699+ * zlook(), zget(), and cbk_cache_scan_slots()) check for the JNODE_RIP bit
18700+ * (this is done by the jnode_rip_check() function), and pretend that nothing
18701+ * was found in the hash table if the bit is set.
18702+ *
18703+ * jput() defers the actual return of the jnode to the slab cache until some
18704+ * later time (via call_rcu()); this guarantees that other threads can safely
18705+ * continue working with a JNODE_RIP-ped jnode.
18706+ *
18707+ */
18708+
18709+#include "reiser4.h"
18710+#include "debug.h"
18711+#include "dformat.h"
18712+#include "jnode.h"
18713+#include "plugin/plugin_header.h"
18714+#include "plugin/plugin.h"
18715+#include "txnmgr.h"
18717+#include "znode.h"
18718+#include "tree.h"
18719+#include "tree_walk.h"
18720+#include "super.h"
18721+#include "inode.h"
18722+#include "page_cache.h"
18723+
18724+#include <asm/uaccess.h> /* UML needs this for PAGE_OFFSET */
18725+#include <linux/types.h>
18726+#include <linux/slab.h>
18727+#include <linux/pagemap.h>
18728+#include <linux/vmalloc.h> /* for vmalloc(), vfree() */
18729+#include <linux/swap.h>
18730+#include <linux/fs.h> /* for struct address_space */
18731+#include <linux/writeback.h> /* for inode_lock */
18732+
18733+static kmem_cache_t *_jnode_slab = NULL;
18734+
18735+static void jnode_set_type(jnode * node, jnode_type type);
18736+static int jdelete(jnode * node);
18737+static int jnode_try_drop(jnode * node);
18738+
18739+#if REISER4_DEBUG
18740+static int jnode_invariant(const jnode * node, int tlocked, int jlocked);
18741+#endif
18742+
18743+/* true if the page attached to the jnode holds parsed, valid data */
18744+static inline int jnode_is_parsed(jnode * node)
18745+{
18746+ return JF_ISSET(node, JNODE_PARSED);
18747+}
18748+
18749+/* hash table support */
18750+
18751+/* compare two jnode keys for equality. Used by hash-table macros */
18752+static inline int jnode_key_eq(const jnode_key_t * k1, const jnode_key_t * k2)
18753+{
18754+ assert("nikita-2350", k1 != NULL);
18755+ assert("nikita-2351", k2 != NULL);
18756+
18757+ return (k1->index == k2->index && k1->objectid == k2->objectid);
18758+}
18759+
18760+/* Hash jnode by its key (inode plus offset). Used by hash-table macros */
18761+static inline __u32
18762+jnode_key_hashfn(j_hash_table * table, const jnode_key_t * key)
18763+{
18764+ assert("nikita-2352", key != NULL);
18765+ assert("nikita-3346", IS_POW(table->_buckets));
18766+
18767+ /* yes, this is a remarkably simple (if not simplistic) hash function. */
18768+ return (key->objectid + key->index) & (table->_buckets - 1);
18769+}
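The IS_POW assertion is what makes the mask form valid: for a power-of-two
bucket count b, (x & (b - 1)) equals x % b for unsigned x. An equivalent
(illustrative) way to state the hash would be:

	/* same result as the masked form above, power-of-two _buckets assumed */
	return (key->objectid + key->index) % table->_buckets;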
18770+
18771+/* The hash table definition */
18772+#define KMALLOC(size) vmalloc(size)
18773+#define KFREE(ptr, size) vfree(ptr)
18774+TYPE_SAFE_HASH_DEFINE(j, jnode, jnode_key_t, key.j, link.j, jnode_key_hashfn,
18775+ jnode_key_eq);
18776+#undef KFREE
18777+#undef KMALLOC
18778+
18779+/* call this to initialise jnode hash table */
18780+int jnodes_tree_init(reiser4_tree * tree /* tree to initialise jnodes for */ )
18781+{
18782+ assert("nikita-2359", tree != NULL);
18783+ return j_hash_init(&tree->jhash_table, 16384);
18784+}
18785+
18786+/* call this to destroy jnode hash table. This is called during umount. */
18787+int jnodes_tree_done(reiser4_tree * tree /* tree to destroy jnodes for */ )
18788+{
18789+ j_hash_table *jtable;
18790+ jnode *node;
18791+ jnode *next;
18792+
18793+ assert("nikita-2360", tree != NULL);
18794+
18795+ /*
18796+ * Scan hash table and free all jnodes.
18797+ */
18798+ jtable = &tree->jhash_table;
18799+ if (jtable->_table) {
18800+ for_all_in_htable(jtable, j, node, next) {
18801+ assert("nikita-2361", !atomic_read(&node->x_count));
18802+ jdrop(node);
18803+ }
18804+
18805+ j_hash_done(&tree->jhash_table);
18806+ }
18807+ return 0;
18808+}
18809+
18810+/**
18811+ * init_jnodes - create jnode cache
18812+ *
18813+ * Initializes the jnode slab cache. This is part of reiser4 module initialization.
18814+ */
18815+int init_jnodes(void)
18816+{
18817+ assert("umka-168", _jnode_slab == NULL);
18818+
18819+ _jnode_slab = kmem_cache_create("jnode", sizeof(jnode), 0,
18820+ SLAB_HWCACHE_ALIGN |
18821+ SLAB_RECLAIM_ACCOUNT, NULL, NULL);
18822+ if (_jnode_slab == NULL)
18823+ return RETERR(-ENOMEM);
18824+
18825+ return 0;
18826+}
18827+
18828+/**
18829+ * done_jnodes - delete jnode cache
18830+ *
18831+ * This is called on reiser4 module unloading or system shutdown.
18832+ */
18833+void done_jnodes(void)
18834+{
18835+ destroy_reiser4_cache(&_jnode_slab);
18836+}
18837+
18838+/* Initialize a jnode. */
18839+void jnode_init(jnode * node, reiser4_tree * tree, jnode_type type)
18840+{
18841+ assert("umka-175", node != NULL);
18842+
18843+ memset(node, 0, sizeof(jnode));
18844+ ON_DEBUG(node->magic = JMAGIC);
18845+ jnode_set_type(node, type);
18846+ atomic_set(&node->d_count, 0);
18847+ atomic_set(&node->x_count, 0);
18848+ spin_lock_init(&node->guard);
18849+ spin_lock_init(&node->load);
18850+ node->atom = NULL;
18851+ node->tree = tree;
18852+ INIT_LIST_HEAD(&node->capture_link);
18853+
18854+ ASSIGN_NODE_LIST(node, NOT_CAPTURED);
18855+
18856+ INIT_RCU_HEAD(&node->rcu);
18857+
18858+#if REISER4_DEBUG
18859+ {
18860+ reiser4_super_info_data *sbinfo;
18861+
18862+ sbinfo = get_super_private(tree->super);
18863+ spin_lock_irq(&sbinfo->all_guard);
18864+ list_add(&node->jnodes, &sbinfo->all_jnodes);
18865+ spin_unlock_irq(&sbinfo->all_guard);
18866+ }
18867+#endif
18868+}
18869+
18870+#if REISER4_DEBUG
18871+/*
18872+ * Remove jnode from ->all_jnodes list.
18873+ */
18874+static void jnode_done(jnode * node, reiser4_tree * tree)
18875+{
18876+ reiser4_super_info_data *sbinfo;
18877+
18878+ sbinfo = get_super_private(tree->super);
18879+
18880+ spin_lock_irq(&sbinfo->all_guard);
18881+ assert("nikita-2422", !list_empty(&node->jnodes));
18882+ list_del_init(&node->jnodes);
18883+ spin_unlock_irq(&sbinfo->all_guard);
18884+}
18885+#endif
18886+
18887+/* return already existing jnode of page */
18888+jnode *jnode_by_page(struct page *pg)
18889+{
18890+ assert("nikita-2066", pg != NULL);
18891+ assert("nikita-2400", PageLocked(pg));
18892+ assert("nikita-2068", PagePrivate(pg));
18893+ assert("nikita-2067", jprivate(pg) != NULL);
18894+ return jprivate(pg);
18895+}
18896+
18897+/* exported functions to allocate/free jnode objects outside this file */
18898+jnode *jalloc(void)
18899+{
18900+ jnode *jal = kmem_cache_alloc(_jnode_slab, get_gfp_mask());
18901+ return jal;
18902+}
18903+
18904+/* return jnode back to the slab allocator */
18905+inline void jfree(jnode * node)
18906+{
18907+ assert("zam-449", node != NULL);
18908+
18909+ assert("nikita-2663", (list_empty_careful(&node->capture_link) &&
18910+ NODE_LIST(node) == NOT_CAPTURED));
18911+ assert("nikita-3222", list_empty(&node->jnodes));
18912+ assert("nikita-3221", jnode_page(node) == NULL);
18913+
18914+ /* not yet phash_jnode_destroy(node); */
18915+
18916+ kmem_cache_free(_jnode_slab, node);
18917+}
18918+
18919+/*
18920+ * This function is supplied as RCU callback. It actually frees jnode when
18921+ * last reference to it is gone.
18922+ */
18923+static void jnode_free_actor(struct rcu_head *head)
18924+{
18925+ jnode *node;
18926+ jnode_type jtype;
18927+
18928+ node = container_of(head, jnode, rcu);
18929+ jtype = jnode_get_type(node);
18930+
18931+ ON_DEBUG(jnode_done(node, jnode_get_tree(node)));
18932+
18933+ switch (jtype) {
18934+ case JNODE_IO_HEAD:
18935+ case JNODE_BITMAP:
18936+ case JNODE_UNFORMATTED_BLOCK:
18937+ jfree(node);
18938+ break;
18939+ case JNODE_FORMATTED_BLOCK:
18940+ zfree(JZNODE(node));
18941+ break;
18942+ case JNODE_INODE:
18943+ default:
18944+ wrong_return_value("nikita-3197", "Wrong jnode type");
18945+ }
18946+}
18947+
18948+/*
18949+ * Free a jnode. Post a callback to be executed later through RCU when all
18950+ * references to @node are released.
18951+ */
18952+static inline void jnode_free(jnode * node, jnode_type jtype)
18953+{
18954+ if (jtype != JNODE_INODE) {
18955+ /*assert("nikita-3219", list_empty(&node->rcu.list)); */
18956+ call_rcu(&node->rcu, jnode_free_actor);
18957+ } else
18958+ jnode_list_remove(node);
18959+}
18960+
18961+/* allocate new unformatted jnode */
18962+static jnode *jnew_unformatted(void)
18963+{
18964+ jnode *jal;
18965+
18966+ jal = jalloc();
18967+ if (jal == NULL)
18968+ return NULL;
18969+
18970+ jnode_init(jal, current_tree, JNODE_UNFORMATTED_BLOCK);
18971+ jal->key.j.mapping = NULL;
18972+ jal->key.j.index = (unsigned long)-1;
18973+ jal->key.j.objectid = 0;
18974+ return jal;
18975+}
18976+
18977+/* look for a jnode with the given oid and index in the hash table */
18978+jnode *jlookup(reiser4_tree * tree, oid_t objectid, unsigned long index)
18979+{
18980+ jnode_key_t jkey;
18981+ jnode *node;
18982+
18983+ assert("nikita-2353", tree != NULL);
18984+
18985+ jkey.objectid = objectid;
18986+ jkey.index = index;
18987+
18988+ /*
18989+ * hash table is _not_ protected by any lock during lookups. All we
18990+ * have to do is to disable preemption to keep RCU happy.
18991+ */
18992+
18993+ rcu_read_lock();
18994+ node = j_hash_find(&tree->jhash_table, &jkey);
18995+ if (node != NULL) {
18996+ /* protect @node from recycling */
18997+ jref(node);
18998+ assert("nikita-2955", jnode_invariant(node, 0, 0));
18999+ node = jnode_rip_check(tree, node);
19000+ }
19001+ rcu_read_unlock();
19002+ return node;
19003+}
19004+
19005+/* the per-inode radix tree of jnodes is protected by the tree's read-write spin lock */
19006+static jnode *jfind_nolock(struct address_space *mapping, unsigned long index)
19007+{
19008+ assert("vs-1694", mapping->host != NULL);
19009+
19010+ return radix_tree_lookup(jnode_tree_by_inode(mapping->host), index);
19011+}
19012+
19013+jnode *jfind(struct address_space * mapping, unsigned long index)
19014+{
19015+ reiser4_tree *tree;
19016+ jnode *node;
19017+
19018+ assert("vs-1694", mapping->host != NULL);
19019+ tree = tree_by_inode(mapping->host);
19020+
19021+ read_lock_tree(tree);
19022+ node = jfind_nolock(mapping, index);
19023+ if (node != NULL)
19024+ jref(node);
19025+ read_unlock_tree(tree);
19026+ return node;
19027+}
19028+
19029+static void inode_attach_jnode(jnode * node)
19030+{
19031+ struct inode *inode;
19032+ reiser4_inode *info;
19033+ struct radix_tree_root *rtree;
19034+
19035+ assert_rw_write_locked(&(jnode_get_tree(node)->tree_lock));
19036+ assert("zam-1043", node->key.j.mapping != NULL);
19037+ inode = node->key.j.mapping->host;
19038+ info = reiser4_inode_data(inode);
19039+ rtree = jnode_tree_by_reiser4_inode(info);
19040+ if (rtree->rnode == NULL) {
19041+ /* prevent inode from being pruned when it has jnodes attached
19042+ to it */
19043+ write_lock_irq(&inode->i_data.tree_lock);
19044+ inode->i_data.nrpages++;
19045+ write_unlock_irq(&inode->i_data.tree_lock);
19046+ }
19047+ assert("zam-1049", equi(rtree->rnode != NULL, info->nr_jnodes != 0));
19048+ check_me("zam-1045",
19049+ !radix_tree_insert(rtree, node->key.j.index, node));
19050+ ON_DEBUG(info->nr_jnodes++);
19051+}
19052+
19053+static void inode_detach_jnode(jnode * node)
19054+{
19055+ struct inode *inode;
19056+ reiser4_inode *info;
19057+ struct radix_tree_root *rtree;
19058+
19059+ assert_rw_write_locked(&(jnode_get_tree(node)->tree_lock));
19060+ assert("zam-1044", node->key.j.mapping != NULL);
19061+ inode = node->key.j.mapping->host;
19062+ info = reiser4_inode_data(inode);
19063+ rtree = jnode_tree_by_reiser4_inode(info);
19064+
19065+ assert("zam-1051", info->nr_jnodes != 0);
19066+ assert("zam-1052", rtree->rnode != NULL);
19067+ ON_DEBUG(info->nr_jnodes--);
19068+
19069+ /* delete jnode from inode's radix tree of jnodes */
19070+ check_me("zam-1046", radix_tree_delete(rtree, node->key.j.index));
19071+ if (rtree->rnode == NULL) {
19072+ /* inode can be pruned now */
19073+ write_lock_irq(&inode->i_data.tree_lock);
19074+ inode->i_data.nrpages--;
19075+ write_unlock_irq(&inode->i_data.tree_lock);
19076+ }
19077+}
19078+
19079+/* put a jnode into the hash table (where flush, which does not know the
19080+ mapping, can find it) and into the inode's radix tree of jnodes (where it
19081+ can be found, hopefully faster, where the mapping is known). Currently it is
19082+ used by fs/reiser4/plugin/item/extent_file_ops.c:index_extent_jnode when a
19083+ new jnode is created */
19084+static void
19085+hash_unformatted_jnode(jnode * node, struct address_space *mapping,
19086+ unsigned long index)
19087+{
19088+ j_hash_table *jtable;
19089+
19090+ assert("vs-1446", jnode_is_unformatted(node));
19091+ assert("vs-1442", node->key.j.mapping == 0);
19092+ assert("vs-1443", node->key.j.objectid == 0);
19093+ assert("vs-1444", node->key.j.index == (unsigned long)-1);
19094+ assert_rw_write_locked(&(jnode_get_tree(node)->tree_lock));
19095+
19096+ node->key.j.mapping = mapping;
19097+ node->key.j.objectid = get_inode_oid(mapping->host);
19098+ node->key.j.index = index;
19099+
19100+ jtable = &jnode_get_tree(node)->jhash_table;
19101+
19102+ /* race with some other thread inserting jnode into the hash table is
19103+ * impossible, because we keep the page lock. */
19104+ /*
19105+ * following assertion no longer holds because of RCU: it is possible
19106+ * jnode is in the hash table, but with JNODE_RIP bit set.
19107+ */
19108+ /* assert("nikita-3211", j_hash_find(jtable, &node->key.j) == NULL); */
19109+ j_hash_insert_rcu(jtable, node);
19110+ inode_attach_jnode(node);
19111+}
19112+
19113+static void unhash_unformatted_node_nolock(jnode * node)
19114+{
19115+ assert("vs-1683", node->key.j.mapping != NULL);
19116+ assert("vs-1684",
19117+ node->key.j.objectid ==
19118+ get_inode_oid(node->key.j.mapping->host));
19119+
19120+ /* remove jnode from hash-table */
19121+ j_hash_remove_rcu(&node->tree->jhash_table, node);
19122+ inode_detach_jnode(node);
19123+ node->key.j.mapping = NULL;
19124+ node->key.j.index = (unsigned long)-1;
19125+ node->key.j.objectid = 0;
19126+
19127+}
19128+
19129+/* remove jnode from hash table and from inode's tree of jnodes. This is used in
19130+ reiser4_invalidatepage and in kill_hook_extent -> truncate_inode_jnodes ->
19131+ uncapture_jnode */
19132+void unhash_unformatted_jnode(jnode * node)
19133+{
19134+ assert("vs-1445", jnode_is_unformatted(node));
19135+
19136+ write_lock_tree(node->tree);
19137+ unhash_unformatted_node_nolock(node);
19138+ write_unlock_tree(node->tree);
19139+}
19140+
19141+/*
19142+ * search hash table for a jnode with given oid and index. If not found,
19143+ * allocate new jnode, insert it, and also insert into radix tree for the
19144+ * given inode/mapping.
19145+ */
19146+jnode *find_get_jnode(reiser4_tree * tree, struct address_space *mapping,
19147+ oid_t oid, unsigned long index)
19148+{
19149+ jnode *result;
19150+ jnode *shadow;
19151+ int preload;
19152+
19153+ result = jnew_unformatted();
19154+
19155+ if (unlikely(result == NULL))
19156+ return ERR_PTR(RETERR(-ENOMEM));
19157+
19158+ preload = radix_tree_preload(get_gfp_mask());
19159+ if (preload != 0)
19160+ return ERR_PTR(preload);
19161+
19162+ write_lock_tree(tree);
19163+ shadow = jfind_nolock(mapping, index);
19164+ if (likely(shadow == NULL)) {
19165+ /* add new jnode to hash table and inode's radix tree of jnodes */
19166+ jref(result);
19167+ hash_unformatted_jnode(result, mapping, index);
19168+ } else {
19169+ /* jnode is found in inode's radix tree of jnodes */
19170+ jref(shadow);
19171+ jnode_free(result, JNODE_UNFORMATTED_BLOCK);
19172+ assert("vs-1498", shadow->key.j.mapping == mapping);
19173+ result = shadow;
19174+ }
19175+ write_unlock_tree(tree);
19176+
19177+ assert("nikita-2955",
19178+ ergo(result != NULL, jnode_invariant(result, 0, 0)));
19179+ radix_tree_preload_end();
19180+ return result;
19181+}
19182+
19183+/* jget() (a la zget() but for unformatted nodes). Returns (and possibly
19184+ creates) jnode corresponding to page @pg. jnode is attached to page and
19185+ inserted into jnode hash-table. */
19186+static jnode *do_jget(reiser4_tree * tree, struct page *pg)
19187+{
19188+ /*
19189+ * There are two ways to create jnode: starting with pre-existing page
19190+ * and without page.
19191+ *
19192+ * When page already exists, jnode is created
19193+ * (jnode_of_page()->do_jget()) under page lock. This is done in
19194+ * ->writepage(), or when capturing anonymous page dirtied through
19195+ * mmap.
19196+ *
19197+ * Jnode without page is created by index_extent_jnode().
19198+ *
19199+ */
19200+
19201+ jnode *result;
19202+ oid_t oid = get_inode_oid(pg->mapping->host);
19203+
19204+ assert("umka-176", pg != NULL);
19205+ assert("nikita-2394", PageLocked(pg));
19206+
19207+ result = jprivate(pg);
19208+ if (likely(result != NULL))
19209+ return jref(result);
19210+
19211+ tree = tree_by_page(pg);
19212+
19213+ /* check hash-table first */
19214+ result = jfind(pg->mapping, pg->index);
19215+ if (unlikely(result != NULL)) {
19216+ spin_lock_jnode(result);
19217+ jnode_attach_page(result, pg);
19218+ spin_unlock_jnode(result);
19219+ result->key.j.mapping = pg->mapping;
19220+ return result;
19221+ }
19222+
19223+ result = find_get_jnode(tree, pg->mapping, oid, pg->index);
19224+ if (unlikely(IS_ERR(result)))
19225+ return result;
19226+ /* attach jnode to page */
19227+ spin_lock_jnode(result);
19228+ jnode_attach_page(result, pg);
19229+ spin_unlock_jnode(result);
19230+ return result;
19231+}
19232+
19233+/*
19234+ * return jnode for @pg, creating it if necessary.
19235+ */
19236+jnode *jnode_of_page(struct page * pg)
19237+{
19238+ jnode *result;
19239+
19240+ assert("umka-176", pg != NULL);
19241+ assert("nikita-2394", PageLocked(pg));
19242+
19243+ result = do_jget(tree_by_page(pg), pg);
19244+
19245+ if (REISER4_DEBUG && !IS_ERR(result)) {
19246+ assert("nikita-3210", result == jprivate(pg));
19247+ assert("nikita-2046", jnode_page(jprivate(pg)) == pg);
19248+ if (jnode_is_unformatted(jprivate(pg))) {
19249+ assert("nikita-2364",
19250+ jprivate(pg)->key.j.index == pg->index);
19251+ assert("nikita-2367",
19252+ jprivate(pg)->key.j.mapping == pg->mapping);
19253+ assert("nikita-2365",
19254+ jprivate(pg)->key.j.objectid ==
19255+ get_inode_oid(pg->mapping->host));
19256+ assert("vs-1200",
19257+ jprivate(pg)->key.j.objectid ==
19258+ pg->mapping->host->i_ino);
19259+ assert("nikita-2356",
19260+ jnode_is_unformatted(jnode_by_page(pg)));
19261+ }
19262+ assert("nikita-2956", jnode_invariant(jprivate(pg), 0, 0));
19263+ }
19264+ return result;
19265+}
19266+
19267+/* attach page to jnode: set ->pg pointer in jnode, and ->private one in the
19268+ * page.*/
19269+void jnode_attach_page(jnode * node, struct page *pg)
19270+{
19271+ assert("nikita-2060", node != NULL);
19272+ assert("nikita-2061", pg != NULL);
19273+
19274+ assert("nikita-2050", jprivate(pg) == 0ul);
19275+ assert("nikita-2393", !PagePrivate(pg));
19276+ assert("vs-1741", node->pg == NULL);
19277+
19278+ assert("nikita-2396", PageLocked(pg));
19279+ assert_spin_locked(&(node->guard));
19280+
19281+ page_cache_get(pg);
19282+ set_page_private(pg, (unsigned long)node);
19283+ node->pg = pg;
19284+ SetPagePrivate(pg);
19285+}
19286+
19287+/* Dual to jnode_attach_page: break a binding between page and jnode */
19288+void page_clear_jnode(struct page *page, jnode * node)
19289+{
19290+ assert("nikita-2424", page != NULL);
19291+ assert("nikita-2425", PageLocked(page));
19292+ assert("nikita-2426", node != NULL);
19293+ assert_spin_locked(&(node->guard));
19294+ assert("nikita-2428", PagePrivate(page));
19295+
19296+ assert("nikita-3551", !PageWriteback(page));
19297+
19298+ JF_CLR(node, JNODE_PARSED);
19299+ set_page_private(page, 0ul);
19300+ ClearPagePrivate(page);
19301+ node->pg = NULL;
19302+ page_cache_release(page);
19303+}
19304+
19305+/* this is used in only one place, to handle an error */
19306+void
19307+page_detach_jnode(struct page *page, struct address_space *mapping,
19308+ unsigned long index)
19309+{
19310+ assert("nikita-2395", page != NULL);
19311+
19312+ lock_page(page);
19313+ if ((page->mapping == mapping) && (page->index == index)
19314+ && PagePrivate(page)) {
19315+ jnode *node;
19316+
19317+ node = jprivate(page);
19318+ spin_lock_jnode(node);
19319+ page_clear_jnode(page, node);
19320+ spin_unlock_jnode(node);
19321+ }
19322+ unlock_page(page);
19323+}
19324+
19325+/* return @node's page locked.
19326+
19327+ Lock ordering requires that one first takes the page lock and afterwards
19328+ the spin lock on the jnode attached to this page. Sometimes it is necessary
19329+ to go in the opposite direction. This is done through a standard
19330+ trylock-and-release loop.
19331+*/
19332+static struct page *jnode_lock_page(jnode * node)
19333+{
19334+ struct page *page;
19335+
19336+ assert("nikita-2052", node != NULL);
19337+ assert("nikita-2401", LOCK_CNT_NIL(spin_locked_jnode));
19338+
19339+ while (1) {
19340+
19341+ spin_lock_jnode(node);
19342+ page = jnode_page(node);
19343+ if (page == NULL) {
19344+ break;
19345+ }
19346+
19347+ /* no need to page_cache_get( page ) here, because page cannot
19348+ be evicted from memory without detaching it from jnode and
19349+ this requires spin lock on jnode that we already hold.
19350+ */
19351+ if (!TestSetPageLocked(page)) {
19352+ /* We won a lock on jnode page, proceed. */
19353+ break;
19354+ }
19355+
19356+ /* Page is locked by someone else. */
19357+ page_cache_get(page);
19358+ spin_unlock_jnode(node);
19359+ wait_on_page_locked(page);
19360+ /* it is possible that page was detached from jnode and
19361+ returned to the free pool, or re-assigned while we were
19362+ waiting on locked bit. This will be rechecked on the next
19363+ loop iteration.
19364+ */
19365+ page_cache_release(page);
19366+
19367+ /* try again */
19368+ }
19369+ return page;
19370+}
19371+
19372+/*
19373+ * if the JNODE_PARSED bit is not set, call the ->parse() method of the jnode
19374+ * to verify the validity of its content.
19375+ */
19376+static inline int jparse(jnode * node)
19377+{
19378+ int result;
19379+
19380+ assert("nikita-2466", node != NULL);
19381+
19382+ spin_lock_jnode(node);
19383+ if (likely(!jnode_is_parsed(node))) {
19384+ result = jnode_ops(node)->parse(node);
19385+ if (likely(result == 0))
19386+ JF_SET(node, JNODE_PARSED);
19387+ } else
19388+ result = 0;
19389+ spin_unlock_jnode(node);
19390+ return result;
19391+}
19392+
19393+/* Lock the page attached to a jnode, creating and attaching a page if it had
19394+ * none. */
19395+struct page *jnode_get_page_locked(jnode * node, gfp_t gfp_flags)
19396+{
19397+ struct page *page;
19398+
19399+ spin_lock_jnode(node);
19400+ page = jnode_page(node);
19401+
19402+ if (page == NULL) {
19403+ spin_unlock_jnode(node);
19404+ page = find_or_create_page(jnode_get_mapping(node),
19405+ jnode_get_index(node), gfp_flags);
19406+ if (page == NULL)
19407+ return ERR_PTR(RETERR(-ENOMEM));
19408+ } else {
19409+ if (!TestSetPageLocked(page)) {
19410+ spin_unlock_jnode(node);
19411+ return page;
19412+ }
19413+ page_cache_get(page);
19414+ spin_unlock_jnode(node);
19415+ lock_page(page);
19416+ assert("nikita-3134", page->mapping == jnode_get_mapping(node));
19417+ }
19418+
19419+ spin_lock_jnode(node);
19420+ if (!jnode_page(node))
19421+ jnode_attach_page(node, page);
19422+ spin_unlock_jnode(node);
19423+
19424+ page_cache_release(page);
19425+ assert("zam-894", jnode_page(node) == page);
19426+ return page;
19427+}
19428+
19429+/* Start read operation for jnode's page if page is not up-to-date. */
19430+static int jnode_start_read(jnode * node, struct page *page)
19431+{
19432+ assert("zam-893", PageLocked(page));
19433+
19434+ if (PageUptodate(page)) {
19435+ unlock_page(page);
19436+ return 0;
19437+ }
19438+ return page_io(page, node, READ, get_gfp_mask());
19439+}
19440+
19441+#if REISER4_DEBUG
19442+static void check_jload(jnode * node, struct page *page)
19443+{
19444+ if (jnode_is_znode(node)) {
19445+ node40_header *nh;
19446+ znode *z;
19447+
19448+ z = JZNODE(node);
19449+ if (znode_is_any_locked(z)) {
19450+ nh = (node40_header *) kmap(page);
19451+ /* this only works for node40-only file systems. For
19452+ * debugging. */
19453+ assert("nikita-3253",
19454+ z->nr_items == le16_to_cpu(get_unaligned(&nh->nr_items)));
19455+ kunmap(page);
19456+ }
19457+ assert("nikita-3565", znode_invariant(z));
19458+ }
19459+}
19460+#else
19461+#define check_jload(node, page) noop
19462+#endif
19463+
19464+/* prefetch jnode to speed up the next call to jload(). Call this when you are
19465+ * going to call jload() shortly. This will bring the appropriate portion of
19466+ * the jnode into the CPU cache. */
19467+void jload_prefetch(jnode * node)
19468+{
19469+ prefetchw(&node->x_count);
19470+}
19471+
19472+/* load jnode's data into memory */
19473+int jload_gfp(jnode * node /* node to load */ ,
19474+ gfp_t gfp_flags /* allocation flags */ ,
19475+ int do_kmap /* true if page should be kmapped */ )
19476+{
19477+ struct page *page;
19478+ int result = 0;
19479+ int parsed;
19480+
19481+ assert("nikita-3010", schedulable());
19482+
19483+ prefetchw(&node->pg);
19484+
19485+ /* taking d-reference implies taking x-reference. */
19486+ jref(node);
19487+
19488+ /*
19489+ * acquiring d-reference to @jnode and check for JNODE_PARSED bit
19490+ * should be atomic, otherwise there is a race against
19491+ * reiser4_releasepage().
19492+ */
19493+ spin_lock(&(node->load));
19494+ add_d_ref(node);
19495+ parsed = jnode_is_parsed(node);
19496+ spin_unlock(&(node->load));
19497+
19498+ if (unlikely(!parsed)) {
19499+ page = jnode_get_page_locked(node, gfp_flags);
19500+ if (unlikely(IS_ERR(page))) {
19501+ result = PTR_ERR(page);
19502+ goto failed;
19503+ }
19504+
19505+ result = jnode_start_read(node, page);
19506+ if (unlikely(result != 0))
19507+ goto failed;
19508+
19509+ wait_on_page_locked(page);
19510+ if (unlikely(!PageUptodate(page))) {
19511+ result = RETERR(-EIO);
19512+ goto failed;
19513+ }
19514+
19515+ if (do_kmap)
19516+ node->data = kmap(page);
19517+
19518+ result = jparse(node);
19519+ if (unlikely(result != 0)) {
19520+ if (do_kmap)
19521+ kunmap(page);
19522+ goto failed;
19523+ }
19524+ check_jload(node, page);
19525+ } else {
19526+ page = jnode_page(node);
19527+ check_jload(node, page);
19528+ if (do_kmap)
19529+ node->data = kmap(page);
19530+ }
19531+
19532+ if (!is_writeout_mode())
19533+ /* We do not mark pages active if jload is called as a part of
19534+ * jnode_flush() or reiser4_write_logs(). Both jnode_flush()
19535+ * and write_logs() add no value to cached data, there is no
19536+ * sense to mark pages as active when they go to disk, it just
19537+ * confuses vm scanning routines because clean page could be
19538+ * moved out from inactive list as a result of this
19539+ * mark_page_accessed() call. */
19540+ mark_page_accessed(page);
19541+
19542+ return 0;
19543+
19544+ failed:
19545+ jrelse_tail(node);
19546+ return result;
19547+
19548+}
19549+
19550+/* start asynchronous reading for given jnode's page. */
19551+int jstartio(jnode * node)
19552+{
19553+ struct page *page;
19554+
19555+ page = jnode_get_page_locked(node, get_gfp_mask());
19556+ if (IS_ERR(page))
19557+ return PTR_ERR(page);
19558+
19559+ return jnode_start_read(node, page);
19560+}
19561+
19562+/* Initialize a node by calling appropriate plugin instead of reading
19563+ * node from disk as in jload(). */
19564+int jinit_new(jnode * node, gfp_t gfp_flags)
19565+{
19566+ struct page *page;
19567+ int result;
19568+
19569+ jref(node);
19570+ add_d_ref(node);
19571+
19572+ page = jnode_get_page_locked(node, gfp_flags);
19573+ if (IS_ERR(page)) {
19574+ result = PTR_ERR(page);
19575+ goto failed;
19576+ }
19577+
19578+ SetPageUptodate(page);
19579+ unlock_page(page);
19580+
19581+ node->data = kmap(page);
19582+
19583+ if (!jnode_is_parsed(node)) {
19584+ jnode_plugin *jplug = jnode_ops(node);
19585+ spin_lock_jnode(node);
19586+ result = jplug->init(node);
19587+ spin_unlock_jnode(node);
19588+ if (result) {
19589+ kunmap(page);
19590+ goto failed;
19591+ }
19592+ JF_SET(node, JNODE_PARSED);
19593+ }
19594+
19595+ return 0;
19596+
19597+ failed:
19598+ jrelse(node);
19599+ return result;
19600+}
19601+
19602+/* release a reference to jnode acquired by jload(), decrement ->d_count */
19603+void jrelse_tail(jnode * node /* jnode to release references to */ )
19604+{
19605+ assert("nikita-489", atomic_read(&node->d_count) > 0);
19606+ atomic_dec(&node->d_count);
19607+ /* release reference acquired in jload_gfp() or jinit_new() */
19608+ jput(node);
19609+ if (jnode_is_unformatted(node) || jnode_is_znode(node))
19610+ LOCK_CNT_DEC(d_refs);
19611+}
19612+
19613+/* drop reference to node data. When last reference is dropped, data are
19614+ unloaded. */
19615+void jrelse(jnode * node /* jnode to release references to */ )
19616+{
19617+ struct page *page;
19618+
19619+ assert("nikita-487", node != NULL);
19620+ assert_spin_not_locked(&(node->guard));
19621+
19622+ page = jnode_page(node);
19623+ if (likely(page != NULL)) {
19624+ /*
19625+ * it is safe not to lock jnode here, because at this point
19626+ * @node->d_count is greater than zero (if jrelse() is used
19627+ * correctly, that is). JNODE_PARSED may be not set yet, if,
19628+ * for example, we got here as a result of error handling path
19629+ * in jload(). Anyway, page cannot be detached by
19630+ * reiser4_releasepage(). truncate will invalidate page
19631+ * regardless, but this should not be a problem.
19632+ */
19633+ kunmap(page);
19634+ }
19635+ jrelse_tail(node);
19636+}
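A typical pin/use/unpin sequence matching the contract above would look like
the following sketch (it assumes the usual jload() wrapper around jload_gfp()
with kmapping enabled, and a jdata() accessor for the ->data pointer; both
are assumed here rather than shown):

	static int peek_first_byte(jnode * node, char *out)
	{
		int result;

		result = jload(node);	/* pins and kmaps the page */
		if (result != 0)
			return result;
		*out = *(char *)jdata(node);
		jrelse(node);		/* unpins; the page may now be evicted */
		return 0;
	}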
19637+
19638+/* called from jput() to wait for io completion */
19639+static void jnode_finish_io(jnode * node)
19640+{
19641+ struct page *page;
19642+
19643+ assert("nikita-2922", node != NULL);
19644+
19645+ spin_lock_jnode(node);
19646+ page = jnode_page(node);
19647+ if (page != NULL) {
19648+ page_cache_get(page);
19649+ spin_unlock_jnode(node);
19650+ wait_on_page_writeback(page);
19651+ page_cache_release(page);
19652+ } else
19653+ spin_unlock_jnode(node);
19654+}
19655+
19656+/*
19657+ * This is called by jput() when the last reference to a jnode is released. It
19658+ * is a separate function because we want the fast path of jput() to be inline
19659+ * and, therefore, small.
19660+ */
19661+void jput_final(jnode * node)
19662+{
19663+ int r_i_p;
19664+
19665+ /* A fast check for keeping node in cache. We always keep node in cache
19666+ * if its page is present and node was not marked for deletion */
19667+ if (jnode_page(node) != NULL && !JF_ISSET(node, JNODE_HEARD_BANSHEE)) {
19668+ rcu_read_unlock();
19669+ return;
19670+ }
19671+ assert("edward-1432", node->page_count == 0);
19672+
19673+ r_i_p = !JF_TEST_AND_SET(node, JNODE_RIP);
19674+ /*
19675+ * if r_i_p is true, we were first to set JNODE_RIP on this node. In
19676+ * this case it is safe to access node after unlock.
19677+ */
19678+ rcu_read_unlock();
19679+ if (r_i_p) {
19680+ jnode_finish_io(node);
19681+ if (JF_ISSET(node, JNODE_HEARD_BANSHEE))
19682+ /* node is removed from the tree. */
19683+ jdelete(node);
19684+ else
19685+ jnode_try_drop(node);
19686+ }
19687+ /* if !r_i_p some other thread is already killing it */
19688+}
19689+
19690+int jwait_io(jnode * node, int rw)
19691+{
19692+ struct page *page;
19693+ int result;
19694+
19695+ assert("zam-447", node != NULL);
19696+ assert("zam-448", jnode_page(node) != NULL);
19697+
19698+ page = jnode_page(node);
19699+
19700+ result = 0;
19701+ if (rw == READ) {
19702+ wait_on_page_locked(page);
19703+ } else {
19704+ assert("nikita-2227", rw == WRITE);
19705+ wait_on_page_writeback(page);
19706+ }
19707+ if (PageError(page))
19708+ result = RETERR(-EIO);
19709+
19710+ return result;
19711+}
19712+
19713+/*
19714+ * jnode types and plugins.
19715+ *
19716+ * jnode by itself is a "base type". There are several different jnode
19717+ * flavors, called "jnode types" (see jnode_type for a list). Sometimes code
19718+ * has to do different things based on jnode type. In the standard reiser4 way
19719+ * this is done with a jnode plugin (see fs/reiser4/plugin.h:jnode_plugin).
19720+ *
19721+ * Functions below deal with jnode types and define methods of jnode plugin.
19722+ *
19723+ */
19724+
19725+/* set jnode type. This is done during jnode initialization. */
19726+static void jnode_set_type(jnode * node, jnode_type type)
19727+{
19728+ static unsigned long type_to_mask[] = {
19729+ [JNODE_UNFORMATTED_BLOCK] = 1,
19730+ [JNODE_FORMATTED_BLOCK] = 0,
19731+ [JNODE_BITMAP] = 2,
19732+ [JNODE_IO_HEAD] = 6,
19733+ [JNODE_INODE] = 4
19734+ };
19735+
19736+ assert("zam-647", type < LAST_JNODE_TYPE);
19737+ assert("nikita-2815", !jnode_is_loaded(node));
19738+ assert("nikita-3386", node->state == 0);
19739+
19740+ node->state |= (type_to_mask[type] << JNODE_TYPE_1);
19741+}
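Decoding reverses the shift; an illustrative inverse (assuming the three
consecutive type bits start at JNODE_TYPE_1, as the encoding above implies):

	static inline unsigned long jnode_type_bits(const jnode * node)
	{
		return (node->state >> JNODE_TYPE_1) & 7;	/* three type bits */
	}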
19742+
19743+/* ->init() method of jnode plugin for jnodes that don't require plugin
19744+ * specific initialization. */
19745+static int init_noinit(jnode * node UNUSED_ARG)
19746+{
19747+ return 0;
19748+}
19749+
19750+/* ->parse() method of jnode plugin for jnodes that don't require plugin
19751+ * specific parsing. */
19752+static int parse_noparse(jnode * node UNUSED_ARG)
19753+{
19754+ return 0;
19755+}
19756+
19757+/* ->mapping() method for unformatted jnode */
19758+struct address_space *mapping_jnode(const jnode * node)
19759+{
19760+ struct address_space *map;
19761+
19762+ assert("nikita-2713", node != NULL);
19763+
19764+ /* mapping is stored in jnode */
19765+
19766+ map = node->key.j.mapping;
19767+ assert("nikita-2714", map != NULL);
19768+ assert("nikita-2897", is_reiser4_inode(map->host));
19769+ assert("nikita-2715", get_inode_oid(map->host) == node->key.j.objectid);
19770+ return map;
19771+}
19772+
19773+/* ->index() method for unformatted jnodes */
19774+unsigned long index_jnode(const jnode * node)
19775+{
19776+ /* index is stored in jnode */
19777+ return node->key.j.index;
19778+}
19779+
19780+/* ->remove() method for unformatted jnodes */
19781+static inline void remove_jnode(jnode * node, reiser4_tree * tree)
19782+{
19783+ /* remove jnode from hash table and radix tree */
19784+ if (node->key.j.mapping)
19785+ unhash_unformatted_node_nolock(node);
19786+}
19787+
19788+/* ->mapping() method for znodes */
19789+static struct address_space *mapping_znode(const jnode * node)
19790+{
19791+ /* all znodes belong to fake inode */
19792+ return get_super_fake(jnode_get_tree(node)->super)->i_mapping;
19793+}
19794+
19795+/* ->index() method for znodes */
19796+static unsigned long index_znode(const jnode * node)
19797+{
19798+ unsigned long addr;
19799+ assert("nikita-3317", (1 << znode_shift_order) < sizeof(znode));
19800+
19801+ /* index of znode is just its address (shifted) */
19802+ addr = (unsigned long)node;
19803+ return (addr - PAGE_OFFSET) >> znode_shift_order;
19804+}
19805+
19806+/* ->mapping() method for bitmap jnode */
19807+static struct address_space *mapping_bitmap(const jnode * node)
19808+{
19809+ /* all bitmap blocks belong to special bitmap inode */
19810+ return get_super_private(jnode_get_tree(node)->super)->bitmap->
19811+ i_mapping;
19812+}
19813+
19814+/* ->index() method for jnodes that are indexed by address */
19815+static unsigned long index_is_address(const jnode * node)
19816+{
19817+ unsigned long ind;
19818+
19819+ ind = (unsigned long)node;
19820+ return ind - PAGE_OFFSET;
19821+}
19822+
19823+/* resolve race with jput */
19824+jnode *jnode_rip_sync(reiser4_tree *tree, jnode *node)
19825+{
19826+ /*
19827+ * This is used as part of RCU-based jnode handling.
19828+ *
19829+ * jlookup(), zlook(), zget(), and cbk_cache_scan_slots() have to work
19830+ * with unreferenced jnodes (ones with ->x_count == 0). Hash table is
19831+ * not protected during this, so a concurrent thread may execute
19832+ * zget-set-HEARD_BANSHEE-zput, or otherwise cause the jnode to be
19833+ * freed in jput_final(). To avoid such races, jput_final() sets
19834+ * JNODE_RIP on jnode (under tree lock). All places that work with
19835+ * unreferenced jnodes call this function. It checks for the JNODE_RIP bit
19836+ * (first without taking the tree lock), and if the bit is set, releases the
19837+ * reference acquired by the current thread and returns NULL.
19838+ *
19839+ * As a result, if jnode is being concurrently freed, NULL is returned
19840+ * and caller should pretend that jnode wasn't found in the first
19841+ * place.
19842+ *
19843+ * Otherwise it's safe to release "rcu-read-lock" and continue with
19844+ * jnode.
19845+ */
19846+ if (unlikely(JF_ISSET(node, JNODE_RIP))) {
19847+ read_lock_tree(tree);
19848+ if (JF_ISSET(node, JNODE_RIP)) {
19849+ dec_x_ref(node);
19850+ node = NULL;
19851+ }
19852+ read_unlock_tree(tree);
19853+ }
19854+ return node;
19855+}
19856+
19857+reiser4_key *jnode_build_key(const jnode * node, reiser4_key * key)
19858+{
19859+ struct inode *inode;
19860+ item_plugin *iplug;
19861+ loff_t off;
19862+
19863+ assert("nikita-3092", node != NULL);
19864+ assert("nikita-3093", key != NULL);
19865+ assert("nikita-3094", jnode_is_unformatted(node));
19866+
19867+ off = ((loff_t) index_jnode(node)) << PAGE_CACHE_SHIFT;
19868+ inode = mapping_jnode(node)->host;
19869+
19870+ if (node->parent_item_id != 0)
19871+ iplug = item_plugin_by_id(node->parent_item_id);
19872+ else
19873+ iplug = NULL;
19874+
19875+ if (iplug != NULL && iplug->f.key_by_offset)
19876+ iplug->f.key_by_offset(inode, off, key);
19877+ else {
19878+ file_plugin *fplug;
19879+
19880+ fplug = inode_file_plugin(inode);
19881+ assert("zam-1007", fplug != NULL);
19882+ assert("zam-1008", fplug->key_by_inode != NULL);
19883+
19884+ fplug->key_by_inode(inode, off, key);
19885+ }
19886+
19887+ return key;
19888+}
19889+
19890+/* ->parse() method for formatted nodes */
19891+static int parse_znode(jnode * node)
19892+{
19893+ return zparse(JZNODE(node));
19894+}
19895+
19896+/* ->delete() method for formatted nodes */
19897+static void delete_znode(jnode * node, reiser4_tree * tree)
19898+{
19899+ znode *z;
19900+
19901+ assert_rw_write_locked(&(tree->tree_lock));
19902+ assert("vs-898", JF_ISSET(node, JNODE_HEARD_BANSHEE));
19903+
19904+ z = JZNODE(node);
19905+ assert("vs-899", z->c_count == 0);
19906+
19907+ /* delete znode from sibling list. */
19908+ sibling_list_remove(z);
19909+
19910+ znode_remove(z, tree);
19911+}
19912+
19913+/* ->remove() method for formatted nodes */
19914+static int remove_znode(jnode * node, reiser4_tree * tree)
19915+{
19916+ znode *z;
19917+
19918+ assert_rw_write_locked(&(tree->tree_lock));
19919+ z = JZNODE(node);
19920+
19921+ if (z->c_count == 0) {
19922+ /* detach znode from sibling list. */
19923+ sibling_list_drop(z);
19924+ /* this is called with tree spin-lock held, so call
19925+ znode_remove() directly (rather than znode_lock_remove()). */
19926+ znode_remove(z, tree);
19927+ return 0;
19928+ }
19929+ return RETERR(-EBUSY);
19930+}
19931+
19932+/* ->init() method for formatted nodes */
19933+static int init_znode(jnode * node)
19934+{
19935+ znode *z;
19936+
19937+ z = JZNODE(node);
19938+ /* call node plugin to do actual initialization */
19939+ return z->nplug->init(z);
19940+}
19941+
19942+/* ->clone() method for formatted nodes */
19943+static jnode *clone_formatted(jnode * node)
19944+{
19945+ znode *clone;
19946+
19947+ assert("vs-1430", jnode_is_znode(node));
19948+ clone = zalloc(get_gfp_mask());
19949+ if (clone == NULL)
19950+ return ERR_PTR(RETERR(-ENOMEM));
19951+ zinit(clone, NULL, current_tree);
19952+ jnode_set_block(ZJNODE(clone), jnode_get_block(node));
19953+ /* ZJNODE(clone)->key.z is not initialized */
19954+ clone->level = JZNODE(node)->level;
19955+
19956+ return ZJNODE(clone);
19957+}
19958+
19959+/* jplug->clone for unformatted nodes */
19960+static jnode *clone_unformatted(jnode * node)
19961+{
19962+ jnode *clone;
19963+
19964+ assert("vs-1431", jnode_is_unformatted(node));
19965+ clone = jalloc();
19966+ if (clone == NULL)
19967+ return ERR_PTR(RETERR(-ENOMEM));
19968+
19969+ jnode_init(clone, current_tree, JNODE_UNFORMATTED_BLOCK);
19970+ jnode_set_block(clone, jnode_get_block(node));
19971+
19972+ return clone;
19973+
19974+}
19975+
19976+/*
19977+ * Setup jnode plugin methods for various jnode types.
19978+ */
19979+jnode_plugin jnode_plugins[LAST_JNODE_TYPE] = {
19980+ [JNODE_UNFORMATTED_BLOCK] = {
19981+ .h = {
19982+ .type_id = REISER4_JNODE_PLUGIN_TYPE,
19983+ .id = JNODE_UNFORMATTED_BLOCK,
19984+ .pops = NULL,
19985+ .label = "unformatted",
19986+ .desc = "unformatted node",
19987+ .linkage = {NULL, NULL}
19988+ },
19989+ .init = init_noinit,
19990+ .parse = parse_noparse,
19991+ .mapping = mapping_jnode,
19992+ .index = index_jnode,
19993+ .clone = clone_unformatted
19994+ },
19995+ [JNODE_FORMATTED_BLOCK] = {
19996+ .h = {
19997+ .type_id = REISER4_JNODE_PLUGIN_TYPE,
19998+ .id = JNODE_FORMATTED_BLOCK,
19999+ .pops = NULL,
20000+ .label = "formatted",
20001+ .desc = "formatted tree node",
20002+ .linkage = {NULL, NULL}
20003+ },
20004+ .init = init_znode,
20005+ .parse = parse_znode,
20006+ .mapping = mapping_znode,
20007+ .index = index_znode,
20008+ .clone = clone_formatted
20009+ },
20010+ [JNODE_BITMAP] = {
20011+ .h = {
20012+ .type_id = REISER4_JNODE_PLUGIN_TYPE,
20013+ .id = JNODE_BITMAP,
20014+ .pops = NULL,
20015+ .label = "bitmap",
20016+ .desc = "bitmap node",
20017+ .linkage = {NULL, NULL}
20018+ },
20019+ .init = init_noinit,
20020+ .parse = parse_noparse,
20021+ .mapping = mapping_bitmap,
20022+ .index = index_is_address,
20023+ .clone = NULL
20024+ },
20025+ [JNODE_IO_HEAD] = {
20026+ .h = {
20027+ .type_id = REISER4_JNODE_PLUGIN_TYPE,
20028+ .id = JNODE_IO_HEAD,
20029+ .pops = NULL,
20030+ .label = "io head",
20031+ .desc = "io head",
20032+ .linkage = {NULL, NULL}
20033+ },
20034+ .init = init_noinit,
20035+ .parse = parse_noparse,
20036+ .mapping = mapping_bitmap,
20037+ .index = index_is_address,
20038+ .clone = NULL
20039+ },
20040+ [JNODE_INODE] = {
20041+ .h = {
20042+ .type_id = REISER4_JNODE_PLUGIN_TYPE,
20043+ .id = JNODE_INODE,
20044+ .pops = NULL,
20045+ .label = "inode",
20046+ .desc = "inode's builtin jnode",
20047+ .linkage = {NULL, NULL}
20048+ },
20049+ .init = NULL,
20050+ .parse = NULL,
20051+ .mapping = NULL,
20052+ .index = NULL,
20053+ .clone = NULL
20054+ }
20055+};
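+
+/*
+ * All per-type behaviour is reached by indexing the table above with the
+ * jnode type, e.g. (sketch; jnode_ops() in jnode.h provides this dispatch):
+ *
+ *	struct address_space *mapping;
+ *	mapping = jnode_plugins[jnode_get_type(node)].mapping(node);
+ */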
20056+
20057+/*
20058+ * jnode destruction.
20059+ *
20060+ * Thread may use a jnode after it acquired a reference to it. References are
20061+ * counted in ->x_count field. Reference protects jnode from being
20062+ * recycled. This is different from protecting jnode data (that are stored in
20063+ * jnode page) from being evicted from memory. Data are protected by jload()
20064+ * and released by jrelse().
20065+ *
20066+ * If thread already possesses a reference to the jnode it can acquire another
20067+ * one through jref(). Initial reference is obtained (usually) by locating
20068+ * jnode in some indexing structure that depends on jnode type: formatted
20069+ * nodes are kept in global hash table, where they are indexed by block
20070+ * number, and also in the cbk cache. Unformatted jnodes are also kept in hash
20071+ * table, which is indexed by oid and offset within file, and in per-inode
20072+ * radix tree.
20073+ *
20074+ * Reference to jnode is released by jput(). If last reference is released,
20075+ * jput_final() is called. This function determines whether jnode has to be
20076+ * deleted (this happens when corresponding node is removed from the file
20077+ * system, jnode is marked with JNODE_HEARD_BANSHEE bit in this case), or it
20078+ * should be just "removed" (deleted from memory).
20079+ *
20080+ * Jnode destruction is a signally delicate dance because of locking and RCU.
20081+ */
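+
+/*
+ * Illustrative life cycle under the rules above (a sketch, error handling
+ * omitted; see jnode.h for the declarations):
+ *
+ *	jnode *node = jlookup(tree, oid, index);  // initial x_count reference
+ *	if (node != NULL) {
+ *		if (jload(node) == 0) {           // pin data page, bump d_count
+ *			// ... access jdata(node) ...
+ *			jrelse(node);             // unpin data, drop d_count
+ *		}
+ *		jput(node);  // drop x_count; last ref goes through jput_final()
+ *	}
+ */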
20082+
20083+/*
20084+ * Returns true if jnode cannot be removed right now. This check is called
20085+ * under tree lock. If it returns true, jnode is irrevocably committed to be
20086+ * deleted/removed.
20087+ */
20088+static inline int jnode_is_busy(const jnode * node, jnode_type jtype)
20089+{
20090+ /* if other thread managed to acquire a reference to this jnode, don't
20091+ * free it. */
20092+ if (atomic_read(&node->x_count) > 0)
20093+ return 1;
20094+ /* also, don't free znode that has children in memory */
20095+ if (jtype == JNODE_FORMATTED_BLOCK && JZNODE(node)->c_count > 0)
20096+ return 1;
20097+ return 0;
20098+}
20099+
20100+/*
20101+ * this is called as part of removing jnode. Based on jnode type, call
20102+ * corresponding function that removes jnode from indices and returns it back
20103+ * to the appropriate slab (through RCU).
20104+ */
20105+static inline void
20106+jnode_remove(jnode * node, jnode_type jtype, reiser4_tree * tree)
20107+{
20108+ switch (jtype) {
20109+ case JNODE_UNFORMATTED_BLOCK:
20110+ remove_jnode(node, tree);
20111+ break;
20112+ case JNODE_IO_HEAD:
20113+ case JNODE_BITMAP:
20114+ break;
20115+ case JNODE_INODE:
20116+ break;
20117+ case JNODE_FORMATTED_BLOCK:
20118+ remove_znode(node, tree);
20119+ break;
20120+ default:
20121+ wrong_return_value("nikita-3196", "Wrong jnode type");
20122+ }
20123+}
20124+
20125+/*
20126+ * this is called as part of deleting jnode. Based on jnode type, call
20127+ * corresponding function that removes jnode from indices and returns it back
20128+ * to the appropriate slab (through RCU).
20129+ *
20130+ * This differs from jnode_remove() only for formatted nodes---for them
20131+ * sibling list handling is different for removal and deletion.
20132+ */
20133+static inline void
20134+jnode_delete(jnode * node, jnode_type jtype, reiser4_tree * tree UNUSED_ARG)
20135+{
20136+ switch (jtype) {
20137+ case JNODE_UNFORMATTED_BLOCK:
20138+ remove_jnode(node, tree);
20139+ break;
20140+ case JNODE_IO_HEAD:
20141+ case JNODE_BITMAP:
20142+ break;
20143+ case JNODE_FORMATTED_BLOCK:
20144+ delete_znode(node, tree);
20145+ break;
20146+ case JNODE_INODE:
20147+ default:
20148+ wrong_return_value("nikita-3195", "Wrong jnode type");
20149+ }
20150+}
20151+
20152+#if REISER4_DEBUG
20153+/*
20154+ * remove jnode from the debugging list of all jnodes hanging off super-block.
20155+ */
20156+void jnode_list_remove(jnode * node)
20157+{
20158+ reiser4_super_info_data *sbinfo;
20159+
20160+ sbinfo = get_super_private(jnode_get_tree(node)->super);
20161+
20162+ spin_lock_irq(&sbinfo->all_guard);
20163+ assert("nikita-2422", !list_empty(&node->jnodes));
20164+ list_del_init(&node->jnodes);
20165+ spin_unlock_irq(&sbinfo->all_guard);
20166+}
20167+#endif
20168+
20169+/*
20170+ * this is called by jput_final() to remove jnode when last reference to it is
20171+ * released.
20172+ */
20173+static int jnode_try_drop(jnode * node)
20174+{
20175+ int result;
20176+ reiser4_tree *tree;
20177+ jnode_type jtype;
20178+
20179+ assert("nikita-2491", node != NULL);
20180+ assert("nikita-2583", JF_ISSET(node, JNODE_RIP));
20181+
20182+ tree = jnode_get_tree(node);
20183+ jtype = jnode_get_type(node);
20184+
20185+ spin_lock_jnode(node);
20186+ write_lock_tree(tree);
20187+ /*
20188+ * if jnode has a page---leave it alone. Memory pressure will
20189+ * eventually kill page and jnode.
20190+ */
20191+ if (jnode_page(node) != NULL) {
20192+ write_unlock_tree(tree);
20193+ spin_unlock_jnode(node);
20194+ JF_CLR(node, JNODE_RIP);
20195+ return RETERR(-EBUSY);
20196+ }
20197+
20198+ /* re-check ->x_count under tree lock. */
20199+ result = jnode_is_busy(node, jtype);
20200+ if (result == 0) {
20201+ assert("nikita-2582", !JF_ISSET(node, JNODE_HEARD_BANSHEE));
20202+ assert("jmacd-511/b", atomic_read(&node->d_count) == 0);
20203+
20204+ spin_unlock_jnode(node);
20205+ /* no page and no references---despatch him. */
20206+ jnode_remove(node, jtype, tree);
20207+ write_unlock_tree(tree);
20208+ jnode_free(node, jtype);
20209+ } else {
20210+ /* busy check failed: reference was acquired by concurrent
20211+ * thread. */
20212+ write_unlock_tree(tree);
20213+ spin_unlock_jnode(node);
20214+ JF_CLR(node, JNODE_RIP);
20215+ }
20216+ return result;
20217+}
20218+
20219+/* jdelete() -- Delete jnode from the tree and file system */
20220+static int jdelete(jnode * node /* jnode to finish with */ )
20221+{
20222+ struct page *page;
20223+ int result;
20224+ reiser4_tree *tree;
20225+ jnode_type jtype;
20226+
20227+ assert("nikita-467", node != NULL);
20228+ assert("nikita-2531", JF_ISSET(node, JNODE_RIP));
20229+
20230+ jtype = jnode_get_type(node);
20231+
20232+ page = jnode_lock_page(node);
20233+ assert_spin_locked(&(node->guard));
20234+
20235+ tree = jnode_get_tree(node);
20236+
20237+ write_lock_tree(tree);
20238+ /* re-check ->x_count under tree lock. */
20239+ result = jnode_is_busy(node, jtype);
20240+ if (likely(!result)) {
20241+ assert("nikita-2123", JF_ISSET(node, JNODE_HEARD_BANSHEE));
20242+ assert("jmacd-511", atomic_read(&node->d_count) == 0);
20243+
20244+ /* detach page */
20245+ if (page != NULL) {
20246+ /*
20247+ * FIXME this is racy against jnode_extent_write().
20248+ */
20249+ page_clear_jnode(page, node);
20250+ }
20251+ spin_unlock_jnode(node);
20252+ /* goodbye */
20253+ jnode_delete(node, jtype, tree);
20254+ write_unlock_tree(tree);
20255+ jnode_free(node, jtype);
20256+ /* @node is no longer valid pointer */
20257+ if (page != NULL)
20258+ drop_page(page);
20259+ } else {
20260+ /* busy check failed: reference was acquired by concurrent
20261+ * thread. */
20262+ JF_CLR(node, JNODE_RIP);
20263+ write_unlock_tree(tree);
20264+ spin_unlock_jnode(node);
20265+ if (page != NULL)
20266+ unlock_page(page);
20267+ }
20268+ return result;
20269+}
20270+
20271+/* drop jnode on the floor.
20272+
20273+ Return value:
20274+
20275+ -EBUSY: failed to drop jnode, because there are still references to it
20276+
20277+ 0: successfully dropped jnode
20278+
20279+*/
20280+static int jdrop_in_tree(jnode * node, reiser4_tree * tree)
20281+{
20282+ struct page *page;
20283+ jnode_type jtype;
20284+ int result;
20285+
20286+ assert("zam-602", node != NULL);
20287+ assert_rw_not_read_locked(&(tree->tree_lock));
20288+ assert_rw_not_write_locked(&(tree->tree_lock));
20289+ assert("nikita-2403", !JF_ISSET(node, JNODE_HEARD_BANSHEE));
20290+
20291+ jtype = jnode_get_type(node);
20292+
20293+ page = jnode_lock_page(node);
20294+ assert_spin_locked(&(node->guard));
20295+
20296+ write_lock_tree(tree);
20297+
20298+ /* re-check ->x_count under tree lock. */
20299+ result = jnode_is_busy(node, jtype);
20300+ if (!result) {
20301+ assert("nikita-2488", page == jnode_page(node));
20302+ assert("nikita-2533", atomic_read(&node->d_count) == 0);
20303+ if (page != NULL) {
20304+ assert("nikita-2126", !PageDirty(page));
20305+ assert("nikita-2127", PageUptodate(page));
20306+ assert("nikita-2181", PageLocked(page));
20307+ page_clear_jnode(page, node);
20308+ }
20309+ spin_unlock_jnode(node);
20310+ jnode_remove(node, jtype, tree);
20311+ write_unlock_tree(tree);
20312+ jnode_free(node, jtype);
20313+ if (page != NULL) {
20314+ drop_page(page);
20315+ }
20316+ } else {
20317+ /* busy check failed: reference was acquired by concurrent
20318+ * thread. */
20319+ JF_CLR(node, JNODE_RIP);
20320+ write_unlock_tree(tree);
20321+ spin_unlock_jnode(node);
20322+ if (page != NULL)
20323+ unlock_page(page);
20324+ }
20325+ return result;
20326+}
20327+
20328+/* This function frees jnode "if possible". In particular, [dcx]_count has to
20329+ be 0 (where applicable). */
20330+void jdrop(jnode * node)
20331+{
20332+ jdrop_in_tree(node, jnode_get_tree(node));
20333+}
20334+
20335+/* IO head jnode implementation. IO heads are simple jnodes with limited
20336+   functionality (they are not kept in any hash table), used just for reading
20337+   from and writing to disk. */
20338+
20339+jnode *alloc_io_head(const reiser4_block_nr * block)
20340+{
20341+ jnode *jal = jalloc();
20342+
20343+	if (jal != NULL) {
20344+		jnode_init(jal, current_tree, JNODE_IO_HEAD);
20345+		jnode_set_block(jal, block);
20346+		/* take the reference only on successful allocation; jref()
20347+		   must not be handed a NULL jnode */
20348+		jref(jal);
20349+	}
20350+	return jal;
20351+}
20352+
20353+void drop_io_head(jnode * node)
20354+{
20355+ assert("zam-648", jnode_get_type(node) == JNODE_IO_HEAD);
20356+
20357+ jput(node);
20358+ jdrop(node);
20359+}
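+
+/*
+ * Typical io head usage is a short alloc/io/drop bracket (sketch; the io
+ * submission step is indicative only):
+ *
+ *	jnode *io = alloc_io_head(&blocknr);
+ *	if (io != NULL) {
+ *		// submit io against jnode_get_block(io) and wait for it
+ *		drop_io_head(io);
+ *	}
+ */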
20360+
20361+/* pin jnode data so that reiser4_releasepage() cannot free it */
20362+void pin_jnode_data(jnode * node)
20363+{
20364+ assert("zam-671", jnode_page(node) != NULL);
20365+ page_cache_get(jnode_page(node));
20366+}
20367+
20368+/* make jnode data free-able again */
20369+void unpin_jnode_data(jnode * node)
20370+{
20371+ assert("zam-672", jnode_page(node) != NULL);
20372+ page_cache_release(jnode_page(node));
20373+}
20374+
20375+struct address_space *jnode_get_mapping(const jnode * node)
20376+{
20377+ assert("nikita-3162", node != NULL);
20378+ return jnode_ops(node)->mapping(node);
20379+}
20380+
20381+#if REISER4_DEBUG
20382+/* debugging aid: jnode invariant */
20383+int jnode_invariant_f(const jnode * node, char const **msg)
20384+{
20385+#define _ergo(ant, con) \
20386+ ((*msg) = "{" #ant "} ergo {" #con "}", ergo((ant), (con)))
20387+#define _check(exp) ((*msg) = #exp, (exp))
20388+
20389+ return _check(node != NULL) &&
20390+ /* [jnode-queued] */
20391+ /* only relocated node can be queued, except that when znode
20392+ * is being deleted, its JNODE_RELOC bit is cleared */
20393+ _ergo(JF_ISSET(node, JNODE_FLUSH_QUEUED),
20394+ JF_ISSET(node, JNODE_RELOC) ||
20395+ JF_ISSET(node, JNODE_HEARD_BANSHEE)) &&
20396+ _check(node->jnodes.prev != NULL) &&
20397+ _check(node->jnodes.next != NULL) &&
20398+ /* [jnode-dirty] invariant */
20399+	/* dirty jnode is part of an atom */
20400+ _ergo(JF_ISSET(node, JNODE_DIRTY), node->atom != NULL) &&
20401+ /* [jnode-oid] invariant */
20402+ /* for unformatted node ->objectid and ->mapping fields are
20403+ * consistent */
20404+ _ergo(jnode_is_unformatted(node) && node->key.j.mapping != NULL,
20405+ node->key.j.objectid ==
20406+ get_inode_oid(node->key.j.mapping->host)) &&
20407+ /* [jnode-atom-valid] invariant */
20408+ /* node atom has valid state */
20409+ _ergo(node->atom != NULL, node->atom->stage != ASTAGE_INVALID) &&
20410+ /* [jnode-page-binding] invariant */
20411+ /* if node points to page, it points back to node */
20412+ _ergo(node->pg != NULL, jprivate(node->pg) == node) &&
20413+ /* [jnode-refs] invariant */
20414+ /* only referenced jnode can be loaded */
20415+ _check(atomic_read(&node->x_count) >= atomic_read(&node->d_count));
20416+
20417+}
20418+
20419+static const char *jnode_type_name(jnode_type type)
20420+{
20421+ switch (type) {
20422+ case JNODE_UNFORMATTED_BLOCK:
20423+ return "unformatted";
20424+ case JNODE_FORMATTED_BLOCK:
20425+ return "formatted";
20426+ case JNODE_BITMAP:
20427+ return "bitmap";
20428+ case JNODE_IO_HEAD:
20429+ return "io head";
20430+ case JNODE_INODE:
20431+ return "inode";
20432+ case LAST_JNODE_TYPE:
20433+ return "last";
20434+ default:{
20435+ static char unknown[30];
20436+
20437+ sprintf(unknown, "unknown %i", type);
20438+ return unknown;
20439+ }
20440+ }
20441+}
20442+
20443+#define jnode_state_name( node, flag ) \
20444+ ( JF_ISSET( ( node ), ( flag ) ) ? ((#flag "|")+6) : "" )
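+
+/*
+ * The "+6" above skips the "JNODE_" prefix of the stringified flag name:
+ * e.g. jnode_state_name(node, JNODE_DIRTY) evaluates to "DIRTY|" when the
+ * bit is set and to "" otherwise.
+ */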
20445+
20446+/* debugging aid: output human readable information about @node */
20447+static void info_jnode(const char *prefix /* prefix to print */ ,
20448+ const jnode * node /* node to print */ )
20449+{
20450+ assert("umka-068", prefix != NULL);
20451+
20452+ if (node == NULL) {
20453+ printk("%s: null\n", prefix);
20454+ return;
20455+ }
20456+
20457+ printk
20458+ ("%s: %p: state: %lx: [%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s], level: %i,"
20459+ " block: %s, d_count: %d, x_count: %d, "
20460+ "pg: %p, atom: %p, lock: %i:%i, type: %s, ", prefix, node,
20461+ node->state,
20462+ jnode_state_name(node, JNODE_PARSED),
20463+ jnode_state_name(node, JNODE_HEARD_BANSHEE),
20464+ jnode_state_name(node, JNODE_LEFT_CONNECTED),
20465+ jnode_state_name(node, JNODE_RIGHT_CONNECTED),
20466+ jnode_state_name(node, JNODE_ORPHAN),
20467+ jnode_state_name(node, JNODE_CREATED),
20468+ jnode_state_name(node, JNODE_RELOC),
20469+ jnode_state_name(node, JNODE_OVRWR),
20470+ jnode_state_name(node, JNODE_DIRTY),
20471+ jnode_state_name(node, JNODE_IS_DYING),
20472+ jnode_state_name(node, JNODE_RIP),
20473+ jnode_state_name(node, JNODE_MISSED_IN_CAPTURE),
20474+ jnode_state_name(node, JNODE_WRITEBACK),
20475+ jnode_state_name(node, JNODE_NEW),
20476+ jnode_state_name(node, JNODE_DKSET),
20477+ jnode_state_name(node, JNODE_REPACK),
20478+ jnode_state_name(node, JNODE_CLUSTER_PAGE),
20479+ jnode_get_level(node), sprint_address(jnode_get_block(node)),
20480+ atomic_read(&node->d_count), atomic_read(&node->x_count),
20481+ jnode_page(node), node->atom, 0, 0,
20482+ jnode_type_name(jnode_get_type(node)));
20483+ if (jnode_is_unformatted(node)) {
20484+ printk("inode: %llu, index: %lu, ",
20485+ node->key.j.objectid, node->key.j.index);
20486+ }
20487+}
20488+
20489+/* debugging aid: check znode invariant and panic if it doesn't hold */
20490+static int jnode_invariant(const jnode * node, int tlocked, int jlocked)
20491+{
20492+ char const *failed_msg;
20493+ int result;
20494+ reiser4_tree *tree;
20495+
20496+ tree = jnode_get_tree(node);
20497+
20498+ assert("umka-063312", node != NULL);
20499+ assert("umka-064321", tree != NULL);
20500+
20501+ if (!jlocked && !tlocked)
20502+ spin_lock_jnode((jnode *) node);
20503+ if (!tlocked)
20504+ read_lock_tree(jnode_get_tree(node));
20505+ result = jnode_invariant_f(node, &failed_msg);
20506+ if (!result) {
20507+ info_jnode("corrupted node", node);
20508+ warning("jmacd-555", "Condition %s failed", failed_msg);
20509+ }
20510+ if (!tlocked)
20511+ read_unlock_tree(jnode_get_tree(node));
20512+ if (!jlocked && !tlocked)
20513+ spin_unlock_jnode((jnode *) node);
20514+ return result;
20515+}
20516+
20517+#endif /* REISER4_DEBUG */
20518+
20519+/* Make Linus happy.
20520+ Local variables:
20521+ c-indentation-style: "K&R"
20522+ mode-name: "LC"
20523+ c-basic-offset: 8
20524+ tab-width: 8
20525+ fill-column: 80
20526+ End:
20527+*/
20528Index: linux-2.6.16/fs/reiser4/jnode.h
20529===================================================================
20530--- /dev/null
20531+++ linux-2.6.16/fs/reiser4/jnode.h
20532@@ -0,0 +1,711 @@
20533+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
20534+ * reiser4/README */
20535+
20536+/* Declaration of jnode. See jnode.c for details. */
20537+
20538+#ifndef __JNODE_H__
20539+#define __JNODE_H__
20540+
20541+#include "forward.h"
20542+#include "type_safe_hash.h"
20543+#include "txnmgr.h"
20544+#include "key.h"
20545+#include "debug.h"
20546+#include "dformat.h"
20547+#include "context.h"
20548+
20549+#include "plugin/plugin.h"
20550+
20551+#include <linux/fs.h>
20552+#include <linux/mm.h>
20553+#include <linux/spinlock.h>
20554+#include <asm/atomic.h>
20555+#include <asm/bitops.h>
20556+#include <linux/list.h>
20557+#include <linux/rcupdate.h>
20558+
20559+/* declare hash table of jnodes (jnodes proper, that is, unformatted
20560+ nodes) */
20561+TYPE_SAFE_HASH_DECLARE(j, jnode);
20562+
20563+/* declare hash table of znodes */
20564+TYPE_SAFE_HASH_DECLARE(z, znode);
20565+
20566+typedef struct {
20567+ __u64 objectid;
20568+ unsigned long index;
20569+ struct address_space *mapping;
20570+} jnode_key_t;
20571+
20572+/*
20573+   Jnode is the "base class" of other nodes in reiser4. It also happens to
20574+ be exactly the node we use for unformatted tree nodes.
20575+
20576+   Jnode provides the following basic functionality:
20577+
20578+ . reference counting and indexing.
20579+
20580+ . integration with page cache. Jnode has ->pg reference to which page can
20581+ be attached.
20582+
20583+ . interface to transaction manager. It is jnode that is kept in transaction
20584+ manager lists, attached to atoms, etc. (NOTE-NIKITA one may argue that this
20585+ means, there should be special type of jnode for inode.)
20586+
20587+ Locking:
20588+
20589+ Spin lock: the following fields are protected by the per-jnode spin lock:
20590+
20591+ ->state
20592+ ->atom
20593+ ->capture_link
20594+
20595+ Following fields are protected by the global tree lock:
20596+
20597+ ->link
20598+ ->key.z (content of ->key.z is only changed in znode_rehash())
20599+ ->key.j
20600+
20601+ Atomic counters
20602+
20603+ ->x_count
20604+ ->d_count
20605+
20606+ ->pg, and ->data are protected by spin lock for unused jnode and are
20607+ immutable for used jnode (one for which fs/reiser4/vfs_ops.c:releasable()
20608+ is false).
20609+
20610+ ->tree is immutable after creation
20611+
20612+ Unclear
20613+
20614+ ->blocknr: should be under jnode spin-lock, but current interface is based
20615+ on passing of block address.
20616+
20617+ If you ever need to spin lock two nodes at once, do this in "natural"
20618+ memory order: lock znode with lower address first. (See lock_two_nodes().)
20619+
20620+ Invariants involving this data-type:
20621+
20622+ [jnode-dirty]
20623+ [jnode-refs]
20624+ [jnode-oid]
20625+ [jnode-queued]
20626+ [jnode-atom-valid]
20627+ [jnode-page-binding]
20628+*/
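+
+/*
+ * Sketch of the address-ordered double lock mentioned above (illustrative
+ * only; lock_two_nodes() in the sources is the authoritative version):
+ *
+ *	if (a < b) {
+ *		spin_lock_jnode(a);
+ *		spin_lock_jnode(b);
+ *	} else {
+ *		spin_lock_jnode(b);
+ *		spin_lock_jnode(a);
+ *	}
+ */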
20629+
20630+struct jnode {
20631+#if REISER4_DEBUG
20632+#define JMAGIC 0x52654973 /* "ReIs" */
20633+ int magic;
20634+#endif
20635+ /* FIRST CACHE LINE (16 bytes): data used by jload */
20636+
20637+ /* jnode's state: bitwise flags from the reiser4_jnode_state enum. */
20638+ /* 0 */ unsigned long state;
20639+
20640+ /* lock, protecting jnode's fields. */
20641+ /* 4 */ spinlock_t load;
20642+
20643+ /* counter of references to jnode itself. Increased on jref().
20644+ Decreased on jput().
20645+ */
20646+ /* 8 */ atomic_t x_count;
20647+
20648+ /* counter of references to jnode's data. Pin data page(s) in
20649+ memory while this is greater than 0. Increased on jload().
20650+ Decreased on jrelse().
20651+ */
20652+ /* 12 */ atomic_t d_count;
20653+
20654+ /* SECOND CACHE LINE: data used by hash table lookups */
20655+
20656+ /* 16 */ union {
20657+ /* znodes are hashed by block number */
20658+ reiser4_block_nr z;
20659+ /* unformatted nodes are hashed by mapping plus offset */
20660+ jnode_key_t j;
20661+ } key;
20662+
20663+ /* THIRD CACHE LINE */
20664+
20665+ /* 32 */ union {
20666+ /* pointers to maintain hash-table */
20667+ z_hash_link z;
20668+ j_hash_link j;
20669+ } link;
20670+
20671+ /* pointer to jnode page. */
20672+ /* 36 */ struct page *pg;
20673+ /* pointer to node itself. This is page_address(node->pg) when page is
20674+ attached to the jnode
20675+ */
20676+ /* 40 */ void *data;
20677+
20678+ /* 44 */ reiser4_tree *tree;
20679+
20680+ /* FOURTH CACHE LINE: atom related fields */
20681+
20682+ /* 48 */ spinlock_t guard;
20683+
20684+ /* atom the block is in, if any */
20685+ /* 52 */ txn_atom *atom;
20686+
20687+ /* capture list */
20688+ /* 56 */ struct list_head capture_link;
20689+
20690+ /* FIFTH CACHE LINE */
20691+
20692+ /* 64 */ struct rcu_head rcu;
20693+ /* crosses cache line */
20694+
20695+ /* SIXTH CACHE LINE */
20696+
20697+ /* the real blocknr (where io is going to/from) */
20698+ /* 80 */ reiser4_block_nr blocknr;
20699+ /* Parent item type, unformatted and CRC need it for offset => key conversion. */
20700+ /* NOTE: this parent_item_id looks like jnode type. */
20701+ /* 88 */ reiser4_plugin_id parent_item_id;
20702+ /* 92 */
20703+#if REISER4_DEBUG
20704+	/* number of pages referenced by the jnode (meaningful while capturing
20705+	   page clusters) */
20706+ int page_count;
20707+ /* list of all jnodes for debugging purposes. */
20708+ struct list_head jnodes;
20709+ /* how many times this jnode was written in one transaction */
20710+ int written;
20711+ /* this indicates which atom's list the jnode is on */
20712+ atom_list list;
20713+#endif
20714+} __attribute__ ((aligned(16)));
20715+
20716+/*
20717+ * jnode types. Enumeration of existing jnode types.
20718+ */
20719+typedef enum {
20720+ JNODE_UNFORMATTED_BLOCK, /* unformatted block */
20721+ JNODE_FORMATTED_BLOCK, /* formatted block, znode */
20722+ JNODE_BITMAP, /* bitmap */
20723+ JNODE_IO_HEAD, /* jnode representing a block in the
20724+ * wandering log */
20725+ JNODE_INODE, /* jnode embedded into inode */
20726+ LAST_JNODE_TYPE
20727+} jnode_type;
20728+
20729+/* jnode states */
20730+typedef enum {
20731+ /* jnode's page is loaded and data checked */
20732+ JNODE_PARSED = 0,
20733+ /* node was deleted, not all locks on it were released. This
20734+ node is empty and is going to be removed from the tree
20735+ shortly. */
20736+ JNODE_HEARD_BANSHEE = 1,
20737+ /* left sibling pointer is valid */
20738+ JNODE_LEFT_CONNECTED = 2,
20739+ /* right sibling pointer is valid */
20740+ JNODE_RIGHT_CONNECTED = 3,
20741+
20742+ /* znode was just created and doesn't yet have a pointer from
20743+ its parent */
20744+ JNODE_ORPHAN = 4,
20745+
20746+ /* this node was created by its transaction and has not been assigned
20747+ a block address. */
20748+ JNODE_CREATED = 5,
20749+
20750+ /* this node is currently relocated */
20751+ JNODE_RELOC = 6,
20752+ /* this node is currently wandered */
20753+ JNODE_OVRWR = 7,
20754+
20755+ /* this znode has been modified */
20756+ JNODE_DIRTY = 8,
20757+
20758+ /* znode lock is being invalidated */
20759+ JNODE_IS_DYING = 9,
20760+
20761+ /* THIS PLACE IS INTENTIONALLY LEFT BLANK */
20762+
20763+ /* jnode is queued for flushing. */
20764+ JNODE_FLUSH_QUEUED = 12,
20765+
20766+ /* In the following bits jnode type is encoded. */
20767+ JNODE_TYPE_1 = 13,
20768+ JNODE_TYPE_2 = 14,
20769+ JNODE_TYPE_3 = 15,
20770+
20771+ /* jnode is being destroyed */
20772+ JNODE_RIP = 16,
20773+
20774+ /* znode was not captured during locking (it might so be because
20775+ ->level != LEAF_LEVEL and lock_mode == READ_LOCK) */
20776+ JNODE_MISSED_IN_CAPTURE = 17,
20777+
20778+ /* write is in progress */
20779+ JNODE_WRITEBACK = 18,
20780+
20781+ /* FIXME: now it is used by crypto-compress plugin only */
20782+ JNODE_NEW = 19,
20783+
20784+ /* delimiting keys are already set for this znode. */
20785+ JNODE_DKSET = 20,
20786+
20787+	/* when this bit is set, the page and jnode cannot be disconnected */
20788+ JNODE_WRITE_PREPARED = 21,
20789+
20790+ JNODE_CLUSTER_PAGE = 22,
20791+ /* Jnode is marked for repacking, that means the reiser4 flush and the
20792+ * block allocator should process this node special way */
20793+ JNODE_REPACK = 23,
20794+ /* node should be converted by flush in squalloc phase */
20795+ JNODE_CONVERTIBLE = 24,
20796+ /*
20797+ * When jnode is dirtied for the first time in given transaction,
20798+	 * do_jnode_make_dirty() checks whether this jnode can possibly become
20799+	 * a member of the overwrite set. If so, this bit is set, and one block is
20800+ * reserved in the ->flush_reserved space of atom.
20801+ *
20802+ * This block is "used" (and JNODE_FLUSH_RESERVED bit is cleared) when
20803+ *
20804+ * (1) flush decides that we want this block to go into relocate
20805+ * set after all.
20806+ *
20807+ * (2) wandering log is allocated (by log writer)
20808+ *
20809+ * (3) extent is allocated
20810+ *
20811+ */
20812+ JNODE_FLUSH_RESERVED = 29
20813+} reiser4_jnode_state;
20814+
20815+/* Macros for accessing the jnode state. */
20816+
20817+static inline void JF_CLR(jnode * j, int f)
20818+{
20819+ assert("unknown-1", j->magic == JMAGIC);
20820+ clear_bit(f, &j->state);
20821+}
20822+static inline int JF_ISSET(const jnode * j, int f)
20823+{
20824+ assert("unknown-2", j->magic == JMAGIC);
20825+ return test_bit(f, &((jnode *) j)->state);
20826+}
20827+static inline void JF_SET(jnode * j, int f)
20828+{
20829+ assert("unknown-3", j->magic == JMAGIC);
20830+ set_bit(f, &j->state);
20831+}
20832+
20833+static inline int JF_TEST_AND_SET(jnode * j, int f)
20834+{
20835+ assert("unknown-4", j->magic == JMAGIC);
20836+ return test_and_set_bit(f, &j->state);
20837+}
20838+
20839+static inline void spin_lock_jnode(jnode *node)
20840+{
20841+ /* check that spinlocks of lower priorities are not held */
20842+ assert("", (LOCK_CNT_NIL(rw_locked_tree) &&
20843+ LOCK_CNT_NIL(spin_locked_txnh) &&
20844+ LOCK_CNT_NIL(spin_locked_zlock) &&
20845+ LOCK_CNT_NIL(rw_locked_dk) &&
20846+ LOCK_CNT_LT(spin_locked_jnode, 2)));
20847+
20848+ spin_lock(&(node->guard));
20849+
20850+ LOCK_CNT_INC(spin_locked_jnode);
20851+ LOCK_CNT_INC(spin_locked);
20852+}
20853+
20854+static inline void spin_unlock_jnode(jnode *node)
20855+{
20856+ assert_spin_locked(&(node->guard));
20857+ assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_jnode));
20858+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
20859+
20860+ LOCK_CNT_DEC(spin_locked_jnode);
20861+ LOCK_CNT_DEC(spin_locked);
20862+
20863+ spin_unlock(&(node->guard));
20864+}
20865+
20866+static inline int jnode_is_in_deleteset(const jnode * node)
20867+{
20868+ return JF_ISSET(node, JNODE_RELOC);
20869+}
20870+
20871+extern int init_jnodes(void);
20872+extern void done_jnodes(void);
20873+
20874+/* Jnode routines */
20875+extern jnode *jalloc(void);
20876+extern void jfree(jnode * node) NONNULL;
20877+extern jnode *jclone(jnode *);
20878+extern jnode *jlookup(reiser4_tree * tree,
20879+ oid_t objectid, unsigned long ind) NONNULL;
20880+extern jnode *jfind(struct address_space *, unsigned long index) NONNULL;
20881+extern jnode *jnode_by_page(struct page *pg) NONNULL;
20882+extern jnode *jnode_of_page(struct page *pg) NONNULL;
20883+void jnode_attach_page(jnode * node, struct page *pg);
20884+jnode *find_get_jnode(reiser4_tree * tree,
20885+ struct address_space *mapping, oid_t oid,
20886+ unsigned long index);
20887+
20888+void unhash_unformatted_jnode(jnode *);
20889+struct page *jnode_get_page_locked(jnode *, gfp_t gfp_flags);
20890+extern jnode *page_next_jnode(jnode * node) NONNULL;
20891+extern void jnode_init(jnode * node, reiser4_tree * tree, jnode_type) NONNULL;
20892+extern void jnode_make_dirty(jnode * node) NONNULL;
20893+extern void jnode_make_clean(jnode * node) NONNULL;
20894+extern void jnode_make_wander_nolock(jnode * node) NONNULL;
20895+extern void jnode_make_wander(jnode *) NONNULL;
20896+extern void znode_make_reloc(znode *, flush_queue_t *) NONNULL;
20897+extern void unformatted_make_reloc(jnode *, flush_queue_t *) NONNULL;
20898+extern struct address_space *jnode_get_mapping(const jnode * node) NONNULL;
20899+
20900+/**
20901+ * jnode_get_block
20902+ * @node: jnode to query
20903+ *
20904+ */
20905+static inline const reiser4_block_nr *jnode_get_block(const jnode *node)
20906+{
20907+ assert("nikita-528", node != NULL);
20908+
20909+ return &node->blocknr;
20910+}
20911+
20912+/**
20913+ * jnode_set_block
20914+ * @node: jnode to update
20915+ * @blocknr: new block nr
20916+ */
20917+static inline void jnode_set_block(jnode *node, const reiser4_block_nr *blocknr)
20918+{
20919+ assert("nikita-2020", node != NULL);
20920+ assert("umka-055", blocknr != NULL);
20921+ node->blocknr = *blocknr;
20922+}
20923+
20924+
20925+/* block number for IO. Usually this is the same as jnode_get_block(), unless
20926+ * jnode was emergency flushed---then block number chosen by eflush is
20927+ * used. */
20928+static inline const reiser4_block_nr *jnode_get_io_block(jnode * node)
20929+{
20930+ assert("nikita-2768", node != NULL);
20931+ assert_spin_locked(&(node->guard));
20932+
20933+ return jnode_get_block(node);
20934+}
20935+
20936+/* Jnode flush interface. */
20937+extern reiser4_blocknr_hint *pos_hint(flush_pos_t * pos);
20938+extern flush_queue_t *pos_fq(flush_pos_t * pos);
20939+
20940+/* FIXME-VS: these are used in plugin/item/extent.c */
20941+
20942+/* does extent_get_block have to be called */
20943+#define jnode_mapped(node) JF_ISSET (node, JNODE_MAPPED)
20944+#define jnode_set_mapped(node) JF_SET (node, JNODE_MAPPED)
20945+
20946+/* the node should be converted during flush squalloc phase */
20947+#define jnode_convertible(node) JF_ISSET (node, JNODE_CONVERTIBLE)
20948+#define jnode_set_convertible(node) JF_SET (node, JNODE_CONVERTIBLE)
20949+
20950+/* Macros to convert from jnode to znode, znode to jnode. These are macros
20951+ because C doesn't allow overloading of const prototypes. */
20952+#define ZJNODE(x) (& (x) -> zjnode)
20953+#define JZNODE(x) \
20954+({ \
20955+ typeof (x) __tmp_x; \
20956+ \
20957+ __tmp_x = (x); \
20958+ assert ("jmacd-1300", jnode_is_znode (__tmp_x)); \
20959+ (znode*) __tmp_x; \
20960+})
20961+
20962+extern int jnodes_tree_init(reiser4_tree * tree);
20963+extern int jnodes_tree_done(reiser4_tree * tree);
20964+
20965+#if REISER4_DEBUG
20966+
20967+extern int znode_is_any_locked(const znode * node);
20968+extern void jnode_list_remove(jnode * node);
20969+
20970+#else
20971+
20972+#define jnode_list_remove(node) noop
20973+
20974+#endif
20975+
20976+int znode_is_root(const znode * node) NONNULL;
20977+
20978+/* bump reference counter on @node */
20979+static inline void add_x_ref(jnode * node /* node to increase x_count of */ )
20980+{
20981+ assert("nikita-1911", node != NULL);
20982+
20983+ atomic_inc(&node->x_count);
20984+ LOCK_CNT_INC(x_refs);
20985+}
20986+
20987+static inline void dec_x_ref(jnode * node)
20988+{
20989+ assert("nikita-3215", node != NULL);
20990+ assert("nikita-3216", atomic_read(&node->x_count) > 0);
20991+
20992+ atomic_dec(&node->x_count);
20993+ assert("nikita-3217", LOCK_CNT_GTZ(x_refs));
20994+ LOCK_CNT_DEC(x_refs);
20995+}
20996+
20997+/* jref() - increase counter of references to jnode/znode (x_count) */
20998+static inline jnode *jref(jnode * node)
20999+{
21000+ assert("jmacd-508", (node != NULL) && !IS_ERR(node));
21001+ add_x_ref(node);
21002+ return node;
21003+}
21004+
21005+/* get the page of jnode */
21006+static inline struct page *jnode_page(const jnode * node)
21007+{
21008+ return node->pg;
21009+}
21010+
21011+/* return pointer to jnode data */
21012+static inline char *jdata(const jnode * node)
21013+{
21014+ assert("nikita-1415", node != NULL);
21015+ assert("nikita-3198", jnode_page(node) != NULL);
21016+ return node->data;
21017+}
21018+
21019+static inline int jnode_is_loaded(const jnode * node)
21020+{
21021+ assert("zam-506", node != NULL);
21022+ return atomic_read(&node->d_count) > 0;
21023+}
21024+
21025+extern void page_detach_jnode(struct page *page,
21026+ struct address_space *mapping,
21027+ unsigned long index) NONNULL;
21028+extern void page_clear_jnode(struct page *page, jnode * node) NONNULL;
21029+
21030+static inline void jnode_set_reloc(jnode * node)
21031+{
21032+ assert("nikita-2431", node != NULL);
21033+ assert("nikita-2432", !JF_ISSET(node, JNODE_OVRWR));
21034+ JF_SET(node, JNODE_RELOC);
21035+}
21036+
21037+/* jload/jwrite/junload give a bread/bwrite/brelse functionality for jnodes */
21038+
21039+extern int jload_gfp(jnode *, gfp_t, int do_kmap) NONNULL;
21040+
21041+static inline int jload(jnode *node)
21042+{
21043+ return jload_gfp(node, get_gfp_mask(), 1);
21044+}
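+
+/*
+ * Usage mirrors bread()/brelse() (sketch, error handling elided):
+ *
+ *	if (jload(node) == 0) {
+ *		char *data = jdata(node);   // valid while d_count is held
+ *		// ... read or modify node contents ...
+ *		jrelse(node);
+ *	}
+ */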
21045+
21046+extern int jinit_new(jnode *, gfp_t) NONNULL;
21047+extern int jstartio(jnode *) NONNULL;
21048+
21049+extern void jdrop(jnode *) NONNULL;
21050+extern int jwait_io(jnode *, int rw) NONNULL;
21051+
21052+void jload_prefetch(jnode *);
21053+
21054+extern jnode *alloc_io_head(const reiser4_block_nr * block) NONNULL;
21055+extern void drop_io_head(jnode * node) NONNULL;
21056+
21057+static inline reiser4_tree *jnode_get_tree(const jnode * node)
21058+{
21059+ assert("nikita-2691", node != NULL);
21060+ return node->tree;
21061+}
21062+
21063+extern void pin_jnode_data(jnode *);
21064+extern void unpin_jnode_data(jnode *);
21065+
21066+static inline jnode_type jnode_get_type(const jnode * node)
21067+{
21068+ static const unsigned long state_mask =
21069+ (1 << JNODE_TYPE_1) | (1 << JNODE_TYPE_2) | (1 << JNODE_TYPE_3);
21070+
21071+ static jnode_type mask_to_type[] = {
21072+ /* JNODE_TYPE_3 : JNODE_TYPE_2 : JNODE_TYPE_1 */
21073+
21074+ /* 000 */
21075+ [0] = JNODE_FORMATTED_BLOCK,
21076+ /* 001 */
21077+ [1] = JNODE_UNFORMATTED_BLOCK,
21078+ /* 010 */
21079+ [2] = JNODE_BITMAP,
21080+ /* 011 */
21081+ [3] = LAST_JNODE_TYPE, /*invalid */
21082+ /* 100 */
21083+ [4] = JNODE_INODE,
21084+ /* 101 */
21085+ [5] = LAST_JNODE_TYPE,
21086+ /* 110 */
21087+ [6] = JNODE_IO_HEAD,
21088+ /* 111 */
21089+ [7] = LAST_JNODE_TYPE, /* invalid */
21090+ };
21091+
21092+ return mask_to_type[(node->state & state_mask) >> JNODE_TYPE_1];
21093+}
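+
+/*
+ * Worked example: a bitmap jnode has only JNODE_TYPE_2 (bit 14) set, so
+ * (state & state_mask) >> JNODE_TYPE_1 == 2 and mask_to_type[2] yields
+ * JNODE_BITMAP; the all-zero encoding is reserved for formatted nodes.
+ */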
21094+
21095+/* returns true if node is a znode */
21096+static inline int jnode_is_znode(const jnode * node)
21097+{
21098+ return jnode_get_type(node) == JNODE_FORMATTED_BLOCK;
21099+}
21100+
21101+static inline int jnode_is_flushprepped(jnode * node)
21102+{
21103+ assert("jmacd-78212", node != NULL);
21104+ assert_spin_locked(&(node->guard));
21105+ return !JF_ISSET(node, JNODE_DIRTY) || JF_ISSET(node, JNODE_RELOC) ||
21106+ JF_ISSET(node, JNODE_OVRWR);
21107+}
21108+
21109+/* Return true if @node has already been processed by the squeeze and allocate
21110+ process. This implies the block address has been finalized for the
21111+ duration of this atom (or it is clean and will remain in place). If this
21112+ returns true you may use the block number as a hint. */
21113+static inline int jnode_check_flushprepped(jnode * node)
21114+{
21115+ int result;
21116+
21117+ /* It must be clean or relocated or wandered. New allocations are set to relocate. */
21118+ spin_lock_jnode(node);
21119+ result = jnode_is_flushprepped(node);
21120+ spin_unlock_jnode(node);
21121+ return result;
21122+}
21123+
21124+/* returns true if node is unformatted */
21125+static inline int jnode_is_unformatted(const jnode * node)
21126+{
21127+ assert("jmacd-0123", node != NULL);
21128+ return jnode_get_type(node) == JNODE_UNFORMATTED_BLOCK;
21129+}
21130+
21131+/* returns true if node represents a cluster cache page */
21132+static inline int jnode_is_cluster_page(const jnode * node)
21133+{
21134+ assert("edward-50", node != NULL);
21135+ return (JF_ISSET(node, JNODE_CLUSTER_PAGE));
21136+}
21137+
21138+/* returns true if node is an inode's builtin jnode */
21139+static inline int jnode_is_inode(const jnode * node)
21140+{
21141+ assert("vs-1240", node != NULL);
21142+ return jnode_get_type(node) == JNODE_INODE;
21143+}
21144+
21145+static inline jnode_plugin *jnode_ops_of(const jnode_type type)
21146+{
21147+ assert("nikita-2367", type < LAST_JNODE_TYPE);
21148+ return jnode_plugin_by_id((reiser4_plugin_id) type);
21149+}
21150+
21151+static inline jnode_plugin *jnode_ops(const jnode * node)
21152+{
21153+ assert("nikita-2366", node != NULL);
21154+
21155+ return jnode_ops_of(jnode_get_type(node));
21156+}
21157+
21158+/* Get the index of a block. */
21159+static inline unsigned long jnode_get_index(jnode * node)
21160+{
21161+ return jnode_ops(node)->index(node);
21162+}
21163+
21164+/* return true if "node" is the root */
21165+static inline int jnode_is_root(const jnode * node)
21166+{
21167+ return jnode_is_znode(node) && znode_is_root(JZNODE(node));
21168+}
21169+
21170+extern struct address_space *mapping_jnode(const jnode * node);
21171+extern unsigned long index_jnode(const jnode * node);
21172+
21173+static inline void jput(jnode * node);
21174+extern void jput_final(jnode * node);
21175+
21176+/* bump data counter on @node */
21177+static inline void add_d_ref(jnode * node /* node to increase d_count of */ )
21178+{
21179+ assert("nikita-1962", node != NULL);
21180+
21181+ atomic_inc(&node->d_count);
21182+ if (jnode_is_unformatted(node) || jnode_is_znode(node))
21183+ LOCK_CNT_INC(d_refs);
21184+}
21185+
21186+/* jput() - decrement x_count reference counter on jnode/znode.
21187+
21188+   Count may drop to 0; the jnode stays in cache until memory pressure causes the
21189+ eviction of its page. The c_count variable also ensures that children are
21190+ pressured out of memory before the parent. The jnode remains hashed as
21191+ long as the VM allows its page to stay in memory.
21192+*/
21193+static inline void jput(jnode * node)
21194+{
21195+ assert("jmacd-509", node != NULL);
21196+ assert("jmacd-510", atomic_read(&node->x_count) > 0);
21197+ assert("zam-926", schedulable());
21198+ LOCK_CNT_DEC(x_refs);
21199+
21200+ rcu_read_lock();
21201+ /*
21202+ * we don't need any kind of lock here--jput_final() uses RCU.
21203+ */
21204+ if (unlikely(atomic_dec_and_test(&node->x_count))) {
21205+ jput_final(node);
21206+ } else
21207+ rcu_read_unlock();
21208+ assert("nikita-3473", schedulable());
21209+}
21210+
21211+extern void jrelse(jnode * node);
21212+extern void jrelse_tail(jnode * node);
21213+
21214+extern jnode *jnode_rip_sync(reiser4_tree * t, jnode * node);
21215+
21216+/* resolve race with jput */
21217+static inline jnode *jnode_rip_check(reiser4_tree * tree, jnode * node)
21218+{
21219+ if (unlikely(JF_ISSET(node, JNODE_RIP)))
21220+ node = jnode_rip_sync(tree, node);
21221+ return node;
21222+}
21223+
21224+extern reiser4_key *jnode_build_key(const jnode *node, reiser4_key * key);
21225+
21226+#if REISER4_DEBUG
21227+extern int jnode_invariant_f(const jnode *node, char const **msg);
21228+#endif
21229+
21230+extern jnode_plugin jnode_plugins[LAST_JNODE_TYPE];
21231+
21232+/* __JNODE_H__ */
21233+#endif
21234+
21235+/* Make Linus happy.
21236+ Local variables:
21237+ c-indentation-style: "K&R"
21238+ mode-name: "LC"
21239+ c-basic-offset: 8
21240+ tab-width: 8
21241+ fill-column: 120
21242+ End:
21243+*/
21244Index: linux-2.6.16/fs/reiser4/kassign.c
21245===================================================================
21246--- /dev/null
21247+++ linux-2.6.16/fs/reiser4/kassign.c
21248@@ -0,0 +1,659 @@
21249+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
21250+ * reiser4/README */
21251+
21252+/* Key assignment policy implementation */
21253+
21254+/*
21255+ * In reiser4 every piece of file system data and meta-data has a key. Keys
21256+ * are used to store information in and retrieve it from reiser4 internal
21257+ * tree. In addition to this, keys define _ordering_ of all file system
21258+ * information: things having close keys are placed into the same or
21259+ * neighboring (in the tree order) nodes of the tree. As our block allocator
21260+ * tries to respect tree order (see flush.c), keys also define order in which
21261+ * things are laid out on the disk, and hence, affect performance directly.
21262+ *
21263+ * Obviously, assignment of keys to data and meta-data should be consistent
21264+ * across whole file system. Algorithm that calculates a key for a given piece
21265+ * of data or meta-data is referred to as "key assignment".
21266+ *
21267+ * Key assignment is too expensive to be implemented as a plugin (that is,
21268+ * with an ability to support different key assignment schemas in the same
21269+ * compiled kernel image). As a compromise, all key-assignment functions and
21270+ * data-structures are collected in this single file, so that modifications to
21271+ * key assignment algorithm can be localized. Additional changes may be
21272+ * required in key.[ch].
21273+ *
21274+ * Current default reiser4 key assignment algorithm is dubbed "Plan A". As one
21275+ * may guess, there is "Plan B" too.
21276+ *
21277+ */
21278+
21279+/*
21280+ * Additional complication with key assignment implementation is a requirement
21281+ * to support different key length.
21282+ */
21283+
21284+/*
21285+ * KEY ASSIGNMENT: PLAN A, LONG KEYS.
21286+ *
21287+ * DIRECTORY ITEMS
21288+ *
21289+ * | 60 | 4 | 7 |1| 56 | 64 | 64 |
21290+ * +--------------+---+---+-+-------------+------------------+-----------------+
21291+ * | dirid | 0 | F |H| prefix-1 | prefix-2 | prefix-3/hash |
21292+ * +--------------+---+---+-+-------------+------------------+-----------------+
21293+ * | | | | |
21294+ * | 8 bytes | 8 bytes | 8 bytes | 8 bytes |
21295+ *
21296+ * dirid objectid of directory this item is for
21297+ *
21298+ * F fibration, see fs/reiser4/plugin/fibration.[ch]
21299+ *
21300+ * H 1 if last 8 bytes of the key contain hash,
21301+ * 0 if last 8 bytes of the key contain prefix-3
21302+ *
21303+ * prefix-1 first 7 characters of file name.
21304+ * Padded by zeroes if name is not long enough.
21305+ *
21306+ * prefix-2 next 8 characters of the file name.
21307+ *
21308+ * prefix-3 next 8 characters of the file name.
21309+ *
21310+ * hash hash of the rest of file name (i.e., portion of file
21311+ * name not included into prefix-1 and prefix-2).
21312+ *
21313+ * File names shorter than 23 (== 7 + 8 + 8) characters are completely encoded
21314+ * in the key. Such file names are called "short". They are distinguished by H
21315+ * bit set 0 in the key.
21316+ *
21317+ * Other file names are "long". For long name, H bit is 1, and first 15 (== 7
21318+ * + 8) characters are encoded in prefix-1 and prefix-2 portions of the
21319+ * key. Last 8 bytes of the key are occupied by hash of the remaining
21320+ * characters of the name.
21321+ *
21322+ * This key assignment reaches following important goals:
21323+ *
21324+ * (1) directory entries are sorted in approximately lexicographical
21325+ * order.
21326+ *
21327+ * (2) collisions (when multiple directory items have the same key), while
21328+ * principally unavoidable in a tree with fixed length keys, are rare.
21329+ *
21330+ * STAT DATA
21331+ *
21332+ * | 60 | 4 | 64 | 4 | 60 | 64 |
21333+ * +--------------+---+-----------------+---+--------------+-----------------+
21334+ * | locality id | 1 | ordering | 0 | objectid | 0 |
21335+ * +--------------+---+-----------------+---+--------------+-----------------+
21336+ * | | | | |
21337+ * | 8 bytes | 8 bytes | 8 bytes | 8 bytes |
21338+ *
21339+ * locality id object id of a directory where first name was created for
21340+ * the object
21341+ *
21342+ * ordering copy of second 8-byte portion of the key of directory
21343+ * entry for the first name of this object. Ordering has a form
21344+ * {
21345+ * fibration :7;
21346+ * h :1;
21347+ * prefix1 :56;
21348+ * }
21349+ * see description of key for directory entry above.
21350+ *
21351+ * objectid object id for this object
21352+ *
21353+ * This key assignment policy is designed to keep stat-data in the same order
21354+ * as corresponding directory items, thus speeding up readdir/stat types of
21355+ * workload.
21356+ *
21357+ * FILE BODY
21358+ *
21359+ * | 60 | 4 | 64 | 4 | 60 | 64 |
21360+ * +--------------+---+-----------------+---+--------------+-----------------+
21361+ * | locality id | 4 | ordering | 0 | objectid | offset |
21362+ * +--------------+---+-----------------+---+--------------+-----------------+
21363+ * | | | | |
21364+ * | 8 bytes | 8 bytes | 8 bytes | 8 bytes |
21365+ *
21366+ * locality id object id of a directory where first name was created for
21367+ * the object
21368+ *
21369+ * ordering the same as in the key of stat-data for this object
21370+ *
21371+ * objectid object id for this object
21372+ *
21373+ * offset logical offset from the beginning of this file.
21374+ * Measured in bytes.
21375+ *
21376+ *
21377+ * KEY ASSIGNMENT: PLAN A, SHORT KEYS.
21378+ *
21379+ * DIRECTORY ITEMS
21380+ *
21381+ * | 60 | 4 | 7 |1| 56 | 64 |
21382+ * +--------------+---+---+-+-------------+-----------------+
21383+ * | dirid | 0 | F |H| prefix-1 | prefix-2/hash |
21384+ * +--------------+---+---+-+-------------+-----------------+
21385+ * | | | |
21386+ * | 8 bytes | 8 bytes | 8 bytes |
21387+ *
21388+ * dirid objectid of directory this item is for
21389+ *
21390+ * F fibration, see fs/reiser4/plugin/fibration.[ch]
21391+ *
21392+ * H 1 if last 8 bytes of the key contain hash,
21393+ * 0 if last 8 bytes of the key contain prefix-2
21394+ *
21395+ * prefix-1 first 7 characters of file name.
21396+ * Padded by zeroes if name is not long enough.
21397+ *
21398+ * prefix-2 next 8 characters of the file name.
21399+ *
21400+ * hash hash of the rest of file name (i.e., portion of file
21401+ * name not included into prefix-1).
21402+ *
21403+ * File names shorter than 15 (== 7 + 8) characters are completely encoded in
21404+ * the key. Such file names are called "short". They are distinguished by H
21405+ * bit set 0 in the key.
21406+ *
21407+ * Other file names are "long". For long name, H bit is 1, and first 7
21408+ * characters are encoded in prefix-1 portion of the key. Last 8 bytes of the
21409+ * key are occupied by hash of the remaining characters of the name.
21410+ *
21411+ * STAT DATA
21412+ *
21413+ * | 60 | 4 | 4 | 60 | 64 |
21414+ * +--------------+---+---+--------------+-----------------+
21415+ * | locality id | 1 | 0 | objectid | 0 |
21416+ * +--------------+---+---+--------------+-----------------+
21417+ * | | | |
21418+ * | 8 bytes | 8 bytes | 8 bytes |
21419+ *
21420+ * locality id object id of a directory where first name was created for
21421+ * the object
21422+ *
21423+ * objectid object id for this object
21424+ *
21425+ * FILE BODY
21426+ *
21427+ * | 60 | 4 | 4 | 60 | 64 |
21428+ * +--------------+---+---+--------------+-----------------+
21429+ * | locality id | 4 | 0 | objectid | offset |
21430+ * +--------------+---+---+--------------+-----------------+
21431+ * | | | |
21432+ * | 8 bytes | 8 bytes | 8 bytes |
21433+ *
21434+ * locality id object id of a directory where first name was created for
21435+ * the object
21436+ *
21437+ * objectid object id for this object
21438+ *
21439+ * offset logical offset from the beginning of this file.
21440+ * Measured in bytes.
21441+ *
21442+ *
21443+ */
21444+
21445+#include "debug.h"
21446+#include "key.h"
21447+#include "kassign.h"
21448+#include "vfs_ops.h"
21449+#include "inode.h"
21450+#include "super.h"
21451+#include "dscale.h"
21452+
21453+#include <linux/types.h> /* for __u?? */
21454+#include <linux/fs.h> /* for struct super_block, etc */
21455+
21456+/* bitmask for H bit (see comment at the beginning of this file) */
21457+static const __u64 longname_mark = 0x0100000000000000ull;
21458+/* bitmask for F and H portions of the key. */
21459+static const __u64 fibration_mask = 0xff00000000000000ull;
21460+
21461+/* return true if name is not completely encoded in @key */
21462+int is_longname_key(const reiser4_key * key)
21463+{
21464+ __u64 highpart;
21465+
21466+ assert("nikita-2863", key != NULL);
21467+ if (get_key_type(key) != KEY_FILE_NAME_MINOR)
21468+ print_key("oops", key);
21469+ assert("nikita-2864", get_key_type(key) == KEY_FILE_NAME_MINOR);
21470+
21471+ if (REISER4_LARGE_KEY)
21472+ highpart = get_key_ordering(key);
21473+ else
21474+ highpart = get_key_objectid(key);
21475+
21476+ return (highpart & longname_mark) ? 1 : 0;
21477+}
21478+
21479+/* return true if @name is too long to be completely encoded in the key */
21480+int is_longname(const char *name UNUSED_ARG, int len)
21481+{
21482+ if (REISER4_LARGE_KEY)
21483+ return len > 23;
21484+ else
21485+ return len > 15;
21486+}
21487+
21488+/* code ascii string into __u64.
21489+
21490+ Put characters of @name into result (@str) one after another starting
21491+ from @start_idx-th highest (arithmetically) byte. This produces
21492+   endian-safe encoding; a plain memcpy() will not do.
21493+
21494+*/
21495+static __u64 pack_string(const char *name /* string to encode */ ,
21496+ int start_idx /* highest byte in result from
21497+ * which to start encoding */ )
21498+{
21499+ unsigned i;
21500+ __u64 str;
21501+
21502+ str = 0;
21503+ for (i = 0; (i < sizeof str - start_idx) && name[i]; ++i) {
21504+ str <<= 8;
21505+ str |= (unsigned char)name[i];
21506+ }
21507+ str <<= (sizeof str - i - start_idx) << 3;
21508+ return str;
21509+}
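+
+/*
+ * Worked example (illustrative): pack_string("foo", 1) keeps the highest
+ * byte free for the fibration/H bits and returns 0x00666f6f00000000, i.e.
+ * 'f' 'o' 'o' followed by zero padding; unpack_string() below inverts it.
+ */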
21510+
21511+/* opposite to pack_string(). Takes value produced by pack_string(), restores
21512+ * string encoded in it and stores result in @buf */
21513+char *unpack_string(__u64 value, char *buf)
21514+{
21515+ do {
21516+ *buf = value >> (64 - 8);
21517+ if (*buf)
21518+ ++buf;
21519+ value <<= 8;
21520+ } while (value != 0);
21521+ *buf = 0;
21522+ return buf;
21523+}
21524+
21525+/* obtain name encoded in @key and store it in @buf */
21526+char *extract_name_from_key(const reiser4_key * key, char *buf)
21527+{
21528+ char *c;
21529+
21530+ assert("nikita-2868", !is_longname_key(key));
21531+
21532+ c = buf;
21533+ if (REISER4_LARGE_KEY) {
21534+ c = unpack_string(get_key_ordering(key) & ~fibration_mask, c);
21535+ c = unpack_string(get_key_fulloid(key), c);
21536+ } else
21537+ c = unpack_string(get_key_fulloid(key) & ~fibration_mask, c);
21538+ unpack_string(get_key_offset(key), c);
21539+ return buf;
21540+}
21541+
21542+/**
21543+ * complete_entry_key - calculate entry key by name
21544+ * @dir: directory where entry is (or will be) in
21545+ * @name: name to calculate key of
21546+ * @len: length of name
21547+ * @result: place to store result in
21548+ *
21549+ * Sets fields of entry key @result which depend on file name.
21550+ * When REISER4_LARGE_KEY is defined three fields of @result are set: ordering,
21551+ * objectid and offset. Otherwise, objectid and offset are set.
21552+ */
21553+void complete_entry_key(const struct inode *dir, const char *name,
21554+ int len, reiser4_key *result)
21555+{
21556+#if REISER4_LARGE_KEY
21557+ __u64 ordering;
21558+ __u64 objectid;
21559+ __u64 offset;
21560+
21561+ assert("nikita-1139", dir != NULL);
21562+ assert("nikita-1142", result != NULL);
21563+ assert("nikita-2867", strlen(name) == len);
21564+
21565+ /*
21566+ * key allocation algorithm for directory entries in case of large
21567+ * keys:
21568+ *
21569+ * If name is not longer than 7 + 8 + 8 = 23 characters, put first 7
21570+	 * characters into ordering field of key, next 8 characters (if any)
21571+	 * into objectid field of key and next 8 ones (if any) into offset
21572+	 * field of key.
21573+ *
21574+ * If file name is longer than 23 characters, put first 7 characters
21575+ * into key's ordering, next 8 to objectid and hash of remaining
21576+ * characters into offset field.
21577+ *
21578+	 * To distinguish the above cases, in the latter an otherwise unused
21579+	 * high bit is set in the ordering field.
21580+ */
21581+
21582+ /* [0-6] characters to ordering */
21583+ ordering = pack_string(name, 1);
21584+ if (len > 7) {
21585+ /* [7-14] characters to objectid */
21586+ objectid = pack_string(name + 7, 0);
21587+ if (len > 15) {
21588+ if (len <= 23) {
21589+ /* [15-23] characters to offset */
21590+ offset = pack_string(name + 15, 0);
21591+ } else {
21592+ /* note in a key the fact that offset contains hash. */
21593+ ordering |= longname_mark;
21594+
21595+ /* offset is the hash of the file name's tail. */
21596+ offset = inode_hash_plugin(dir)->hash(name + 15,
21597+ len - 15);
21598+ }
21599+ } else {
21600+ offset = 0ull;
21601+ }
21602+ } else {
21603+ objectid = 0ull;
21604+ offset = 0ull;
21605+ }
21606+
21607+ assert("nikita-3480", inode_fibration_plugin(dir) != NULL);
21608+ ordering |= inode_fibration_plugin(dir)->fibre(dir, name, len);
21609+
21610+ set_key_ordering(result, ordering);
21611+ set_key_fulloid(result, objectid);
21612+ set_key_offset(result, offset);
21613+ return;
21614+
21615+#else
21616+ __u64 objectid;
21617+ __u64 offset;
21618+
21619+ assert("nikita-1139", dir != NULL);
21620+ assert("nikita-1142", result != NULL);
21621+ assert("nikita-2867", strlen(name) == len);
21622+
21623+ /*
21624+ * key allocation algorithm for directory entries in case of not large
21625+ * keys:
21626+ *
21627+ * If name is not longer than 7 + 8 = 15 characters, put first 7
21628+	 * characters into objectid field of key, next 8 characters (if any)
21629+	 * into offset field of key.
21630+ *
21631+ * If file name is longer than 15 characters, put first 7 characters
21632+ * into key's objectid, and hash of remaining characters into offset
21633+ * field.
21634+ *
21635+	 * To distinguish the above cases, in the latter an otherwise unused
21636+	 * high bit is set in the objectid field.
21637+ */
21638+
21639+ /* [0-6] characters to objectid */
21640+ objectid = pack_string(name, 1);
21641+ if (len > 7) {
21642+ if (len <= 15) {
21643+ /* [7-14] characters to offset */
21644+ offset = pack_string(name + 7, 0);
21645+ } else {
21646+ /* note in a key the fact that offset contains hash. */
21647+ objectid |= longname_mark;
21648+
21649+ /* offset is the hash of the file name. */
21650+ offset = inode_hash_plugin(dir)->hash(name + 7,
21651+ len - 7);
21652+ }
21653+ } else
21654+ offset = 0ull;
21655+
21656+ assert("nikita-3480", inode_fibration_plugin(dir) != NULL);
21657+ objectid |= inode_fibration_plugin(dir)->fibre(dir, name, len);
21658+
21659+ set_key_fulloid(result, objectid);
21660+ set_key_offset(result, offset);
21661+ return;
21662+#endif /* ! REISER4_LARGE_KEY */
21663+}
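
For illustration, the split described in the comments above can be reproduced in a few lines of user-space C. This is a minimal sketch, not patch code: pack() below merely mirrors what this patch's pack_string() appears to do (left-justified big-endian packing, with the top byte left free when the first position is skipped), and the sample name is arbitrary.

#include <stdio.h>
#include <string.h>

typedef unsigned long long u64;

/* stand-in mirroring pack_string(): pack up to (8 - skip) leading
   characters of @name into a u64, most significant byte first */
static u64 pack(const char *name, int skip)
{
	u64 str = 0;
	unsigned int i;

	for (i = 0; i < sizeof(u64) - skip && name[i]; i++)
		str = (str << 8) | (unsigned char)name[i];
	return str << ((sizeof(u64) - skip - i) * 8);
}

int main(void)
{
	/* 11 characters: fits the 7 + 8 + 8 = 23 character budget,
	   so no hashing is needed and no longname mark is set */
	const char *name = "hello_world";
	int len = strlen(name);
	u64 ordering = pack(name, 1);                   /* chars [0-6] */
	u64 objectid = len > 7 ? pack(name + 7, 0) : 0; /* chars [7-14] */
	u64 offset = len > 15 ? pack(name + 15, 0) : 0; /* chars [15-22] */

	printf("%016llx %016llx %016llx\n", ordering, objectid, offset);
	return 0;
}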
21664+
21665+/* true, if @key is the key of "." */
21666+int is_dot_key(const reiser4_key * key /* key to check */ )
21667+{
21668+ assert("nikita-1717", key != NULL);
21669+ assert("nikita-1718", get_key_type(key) == KEY_FILE_NAME_MINOR);
21670+ return
21671+ (get_key_ordering(key) == 0ull) &&
21672+ (get_key_objectid(key) == 0ull) && (get_key_offset(key) == 0ull);
21673+}
21674+
21675+/* build key for stat-data.
21676+
21677+ return key of stat-data of this object. This should become an sd plugin
21678+ method in the future. For now, let it be here.
21679+
21680+*/
21681+reiser4_key *build_sd_key(const struct inode * target /* inode of an object */ ,
21682+ reiser4_key * result /* resulting key of @target
21683+ stat-data */ )
21684+{
21685+ assert("nikita-261", result != NULL);
21686+
21687+ reiser4_key_init(result);
21688+ set_key_locality(result, reiser4_inode_data(target)->locality_id);
21689+ set_key_ordering(result, get_inode_ordering(target));
21690+ set_key_objectid(result, get_inode_oid(target));
21691+ set_key_type(result, KEY_SD_MINOR);
21692+ set_key_offset(result, (__u64) 0);
21693+ return result;
21694+}
21695+
21696+/* encode part of key into &obj_key_id
21697+
21698+ This encodes into @id the part of @key sufficient to restore @key later,
21699+ given that the latter is a key of an object (key of stat-data).
21700+
21701+ See &obj_key_id
21702+*/
21703+int build_obj_key_id(const reiser4_key * key /* key to encode */ ,
21704+ obj_key_id * id /* id where key is encoded in */ )
21705+{
21706+ assert("nikita-1151", key != NULL);
21707+ assert("nikita-1152", id != NULL);
21708+
21709+ memcpy(id, key, sizeof *id);
21710+ return 0;
21711+}
21712+
21713+/* encode reference to @obj in @id.
21714+
21715+ This is like build_obj_key_id() above, but takes inode as parameter. */
21716+int build_inode_key_id(const struct inode *obj /* object to build key of */ ,
21717+ obj_key_id * id /* result */ )
21718+{
21719+ reiser4_key sdkey;
21720+
21721+ assert("nikita-1166", obj != NULL);
21722+ assert("nikita-1167", id != NULL);
21723+
21724+ build_sd_key(obj, &sdkey);
21725+ build_obj_key_id(&sdkey, id);
21726+ return 0;
21727+}
21728+
21729+/* decode @id back into @key
21730+
21731+ Restore key of object stat-data from @id. This is dual to
21732+ build_obj_key_id() above.
21733+*/
21734+int extract_key_from_id(const obj_key_id * id /* object key id to extract key
21735+ * from */ ,
21736+ reiser4_key * key /* result */ )
21737+{
21738+ assert("nikita-1153", id != NULL);
21739+ assert("nikita-1154", key != NULL);
21740+
21741+ reiser4_key_init(key);
21742+ memcpy(key, id, sizeof *id);
21743+ return 0;
21744+}
21745+
21746+/* extract objectid of directory from key of directory entry within said
21747+ directory.
21748+ */
21749+oid_t extract_dir_id_from_key(const reiser4_key * de_key /* key of
21750+ * directory
21751+ * entry */ )
21752+{
21753+ assert("nikita-1314", de_key != NULL);
21754+ return get_key_locality(de_key);
21755+}
21756+
21757+/* encode into @id key of directory entry.
21758+
21759+ Encode into @id information sufficient to later distinguish directory
21760+ entries within the same directory. This is not whole key, because all
21761+ directory entries within directory item share locality which is equal
21762+ to objectid of their directory.
21763+
21764+*/
21765+int build_de_id(const struct inode *dir /* inode of directory */ ,
21766+ const struct qstr *name /* name to be given to @obj by
21767+ * directory entry being
21768+ * constructed */ ,
21769+ de_id * id /* short key of directory entry */ )
21770+{
21771+ reiser4_key key;
21772+
21773+ assert("nikita-1290", dir != NULL);
21774+ assert("nikita-1292", id != NULL);
21775+
21776+ /* NOTE-NIKITA this is suboptimal. */
21777+ inode_dir_plugin(dir)->build_entry_key(dir, name, &key);
21778+ return build_de_id_by_key(&key, id);
21779+}
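
A hedged usage sketch (kernel context and a valid reiser4 directory inode @dir assumed; the name is arbitrary): this is how a caller would compute the short directory-entry key before storing or looking up an entry.

	struct qstr name = {
		.name = (const unsigned char *)"foo",
		.len = 3,
	};
	de_id id;

	if (build_de_id(dir, &name, &id) == 0) {
		/* @id now distinguishes the entry within @dir's
		   directory item; locality is implied by @dir */
	}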
21780+
21781+/* encode into @id key of directory entry.
21782+
21783+ Encode into @id information sufficient to later distinguish directory
21784+ entries within the same directory. This is not whole key, because all
21785+ directory entries within directory item share locality which is equal
21786+ to objectid of their directory.
21787+
21788+*/
21789+int build_de_id_by_key(const reiser4_key * entry_key /* full key of directory
21790+ * entry */ ,
21791+ de_id * id /* short key of directory entry */ )
21792+{
21793+ memcpy(id, ((__u64 *) entry_key) + 1, sizeof *id);
21794+ return 0;
21795+}
21796+
21797+/* restore from @id key of directory entry.
21798+
21799+ Function dual to build_de_id(): given @id and locality, build full
21800+ key of directory entry within directory item.
21801+
21802+*/
21803+int extract_key_from_de_id(const oid_t locality /* locality of directory
21804+ * entry */ ,
21805+ const de_id * id /* directory entry id */ ,
21806+ reiser4_key * key /* result */ )
21807+{
21808+ /* no need to initialise key here: all fields are overwritten */
21809+ memcpy(((__u64 *) key) + 1, id, sizeof *id);
21810+ set_key_locality(key, locality);
21811+ set_key_type(key, KEY_FILE_NAME_MINOR);
21812+ return 0;
21813+}
21814+
21815+/* compare two &de_id's */
21816+cmp_t de_id_cmp(const de_id * id1 /* first &de_id to compare */ ,
21817+ const de_id * id2 /* second &de_id to compare */ )
21818+{
21819+ /* NOTE-NIKITA ugly implementation */
21820+ reiser4_key k1;
21821+ reiser4_key k2;
21822+
21823+ extract_key_from_de_id((oid_t) 0, id1, &k1);
21824+ extract_key_from_de_id((oid_t) 0, id2, &k2);
21825+ return keycmp(&k1, &k2);
21826+}
21827+
21828+/* compare &de_id with key */
21829+cmp_t de_id_key_cmp(const de_id * id /* directory entry id to compare */ ,
21830+ const reiser4_key * key /* key to compare */ )
21831+{
21832+ cmp_t result;
21833+ reiser4_key *k1;
21834+
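	/*
	 * Editorial note: the cast below relies on de_id having exactly the
	 * layout of the trailing elements of a reiser4_key (see the struct
	 * definitions in kassign.h), so stepping one key element back from
	 * @id yields a pointer whose elements 1..3 line up with the id and
	 * can be compared with KEY_DIFF_EL() as if it were a full key.
	 */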
21835+ k1 = (reiser4_key *) (((unsigned long)id) - sizeof key->el[0]);
21836+ result = KEY_DIFF_EL(k1, key, 1);
21837+ if (result == EQUAL_TO) {
21838+ result = KEY_DIFF_EL(k1, key, 2);
21839+ if (REISER4_LARGE_KEY && result == EQUAL_TO) {
21840+ result = KEY_DIFF_EL(k1, key, 3);
21841+ }
21842+ }
21843+ return result;
21844+}
21845+
21846+/*
21847+ * return number of bytes necessary to encode @inode identity.
21848+ */
21849+int inode_onwire_size(const struct inode *inode)
21850+{
21851+ int result;
21852+
21853+ result = dscale_bytes(get_inode_oid(inode));
21854+ result += dscale_bytes(get_inode_locality(inode));
21855+
21856+ /*
21857+ * ordering is large (it usually has highest bits set), so it makes
21858+ * little sense to dscale it.
21859+ */
21860+ if (REISER4_LARGE_KEY)
21861+ result += sizeof(get_inode_ordering(inode));
21862+ return result;
21863+}
21864+
21865+/*
21866+ * encode @inode identity at @start
21867+ */
21868+char *build_inode_onwire(const struct inode *inode, char *start)
21869+{
21870+ start += dscale_write(start, get_inode_locality(inode));
21871+ start += dscale_write(start, get_inode_oid(inode));
21872+
21873+ if (REISER4_LARGE_KEY) {
21874+ put_unaligned(cpu_to_le64(get_inode_ordering(inode)), (__le64 *)start);
21875+ start += sizeof(get_inode_ordering(inode));
21876+ }
21877+ return start;
21878+}
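
A sketch of the intended round trip (kernel context assumed; dscale_bytes()/dscale_write()/dscale_read() are the variable-length integer helpers defined elsewhere in this patch, and get_gfp_mask() is the allocation-mask helper it already uses):

	obj_key_id id;
	int len = inode_onwire_size(inode);	/* bytes needed for identity */
	char *area = kmalloc(len, get_gfp_mask());

	if (area != NULL) {
		char *end = build_inode_onwire(inode, area);

		/* area[0 .. end - area - 1] is the encoded identity */
		extract_obj_key_id_from_onwire(area, &id);
		kfree(area);
	}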
21879+
21880+/*
21881+ * extract key that was previously encoded by build_inode_onwire() at @addr
21882+ */
21883+char *extract_obj_key_id_from_onwire(char *addr, obj_key_id * key_id)
21884+{
21885+ __u64 val;
21886+
21887+ addr += dscale_read(addr, &val);
21888+ val = (val << KEY_LOCALITY_SHIFT) | KEY_SD_MINOR;
21889+ put_unaligned(cpu_to_le64(val), (__le64 *)key_id->locality);
21890+ addr += dscale_read(addr, &val);
21891+ put_unaligned(cpu_to_le64(val), (__le64 *)key_id->objectid);
21892+#if REISER4_LARGE_KEY
21893+ memcpy(&key_id->ordering, addr, sizeof key_id->ordering);
21894+ addr += sizeof key_id->ordering;
21895+#endif
21896+ return addr;
21897+}
21898+
21899+/* Make Linus happy.
21900+ Local variables:
21901+ c-indentation-style: "K&R"
21902+ mode-name: "LC"
21903+ c-basic-offset: 8
21904+ tab-width: 8
21905+ fill-column: 120
21906+ End:
21907+*/
21908Index: linux-2.6.16/fs/reiser4/kassign.h
21909===================================================================
21910--- /dev/null
21911+++ linux-2.6.16/fs/reiser4/kassign.h
21912@@ -0,0 +1,110 @@
21913+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
21914+ * reiser4/README */
21915+
21916+/* Key assignment policy interface. See kassign.c for details. */
21917+
21918+#if !defined( __KASSIGN_H__ )
21919+#define __KASSIGN_H__
21920+
21921+#include "forward.h"
21922+#include "key.h"
21923+#include "dformat.h"
21924+
21925+#include <linux/types.h> /* for __u?? */
21926+#include <linux/fs.h> /* for struct super_block, etc */
21927+#include <linux/dcache.h> /* for struct qstr */
21928+
21929+/* key assignment functions */
21930+
21931+/* Information from which key of file stat-data can be uniquely
21932+ restored. This depends on key assignment policy for
21933+ stat-data. Currently it's enough to store object id and locality id
21934+ (60+60==120) bits, because minor packing locality and offset of
21935+ stat-data key are always known constants: KEY_SD_MINOR and 0
21936+ respectively. For simplicity 4 bits are wasted in each id, and just
21937+ two 64 bit integers are stored.
21938+
21939+ This field has to be byte-aligned, because we don't want to waste
21940+ space in directory entries. There is another side of the coin, of
21941+ course: we waste CPU and bus bandwidth instead, by copying data back
21942+ and forth.
21943+
21944+ Next optimization: &obj_key_id is mainly used to address stat data from
21945+ directory entries. Under the assumption that the majority of files have
21946+ only one name (one hard link) from *the* parent directory, it seems
21947+ reasonable to store only the objectid of stat data and take its locality
21948+ from the key of the directory item.
21949+
21950+ This requires some flag to be added to the &obj_key_id to distinguish
21951+ between these two cases. The remaining bits in the flag byte could then
21952+ be used to store the file type.
21953+
21954+ This optimization requires changes in directory item handling code.
21955+
21956+*/
21957+typedef struct obj_key_id {
21958+ d8 locality[sizeof(__u64)];
21959+ ON_LARGE_KEY(d8 ordering[sizeof(__u64)];
21960+ )
21961+ d8 objectid[sizeof(__u64)];
21962+}
21963+obj_key_id;
21964+
21965+/* Information sufficient to uniquely identify directory entry within
21966+ compressed directory item.
21967+
21968+ For alignment issues see &obj_key_id above.
21969+*/
21970+typedef struct de_id {
21971+ ON_LARGE_KEY(d8 ordering[sizeof(__u64)];)
21972+ d8 objectid[sizeof(__u64)];
21973+ d8 offset[sizeof(__u64)];
21974+}
21975+de_id;
21976+
21977+extern int inode_onwire_size(const struct inode *obj);
21978+extern char *build_inode_onwire(const struct inode *obj, char *area);
21979+extern char *extract_obj_key_id_from_onwire(char *area, obj_key_id * key_id);
21980+
21981+extern int build_inode_key_id(const struct inode *obj, obj_key_id * id);
21982+extern int extract_key_from_id(const obj_key_id * id, reiser4_key * key);
21983+extern int build_obj_key_id(const reiser4_key * key, obj_key_id * id);
21984+extern oid_t extract_dir_id_from_key(const reiser4_key * de_key);
21985+extern int build_de_id(const struct inode *dir, const struct qstr *name,
21986+ de_id * id);
21987+extern int build_de_id_by_key(const reiser4_key * entry_key, de_id * id);
21988+extern int extract_key_from_de_id(const oid_t locality, const de_id * id,
21989+ reiser4_key * key);
21990+extern cmp_t de_id_cmp(const de_id * id1, const de_id * id2);
21991+extern cmp_t de_id_key_cmp(const de_id * id, const reiser4_key * key);
21992+
21993+extern int build_readdir_key_common(struct file *dir, reiser4_key * result);
21994+extern void build_entry_key_common(const struct inode *dir,
21995+ const struct qstr *name,
21996+ reiser4_key * result);
21997+extern void build_entry_key_stable_entry(const struct inode *dir,
21998+ const struct qstr *name,
21999+ reiser4_key * result);
22000+extern int is_dot_key(const reiser4_key * key);
22001+extern reiser4_key *build_sd_key(const struct inode *target,
22002+ reiser4_key * result);
22003+
22004+extern int is_longname_key(const reiser4_key * key);
22005+extern int is_longname(const char *name, int len);
22006+extern char *extract_name_from_key(const reiser4_key * key, char *buf);
22007+extern char *unpack_string(__u64 value, char *buf);
22008+extern void complete_entry_key(const struct inode *dir, const char *name,
22009+ int len, reiser4_key *result);
22010+
22011+/* __KASSIGN_H__ */
22012+#endif
22013+
22014+/* Make Linus happy.
22015+ Local variables:
22016+ c-indentation-style: "K&R"
22017+ mode-name: "LC"
22018+ c-basic-offset: 8
22019+ tab-width: 8
22020+ fill-column: 120
22021+ End:
22022+*/
22023Index: linux-2.6.16/fs/reiser4/key.c
22024===================================================================
22025--- /dev/null
22026+++ linux-2.6.16/fs/reiser4/key.c
22027@@ -0,0 +1,137 @@
22028+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
22029+
22030+/* Key manipulations. */
22031+
22032+#include "debug.h"
22033+#include "key.h"
22034+#include "super.h"
22035+#include "reiser4.h"
22036+
22037+#include <linux/types.h> /* for __u?? */
22038+
22039+/* Minimal possible key: all components are zero. It is presumed that this is
22040+ independent of key scheme. */
22041+static const reiser4_key MINIMAL_KEY = {
22042+ .el = {
22043+ 0ull,
22044+ ON_LARGE_KEY(0ull,)
22045+ 0ull,
22046+ 0ull
22047+ }
22048+};
22049+
22050+/* Maximal possible key: all components are ~0. It is presumed that this is
22051+ independent of key scheme. */
22052+static const reiser4_key MAXIMAL_KEY = {
22053+ .el = {
22054+ __constant_cpu_to_le64(~0ull),
22055+ ON_LARGE_KEY(__constant_cpu_to_le64(~0ull),)
22056+ __constant_cpu_to_le64(~0ull),
22057+ __constant_cpu_to_le64(~0ull)
22058+ }
22059+};
22060+
22061+/* Initialize key. */
22062+void reiser4_key_init(reiser4_key * key /* key to init */ )
22063+{
22064+ assert("nikita-1169", key != NULL);
22065+ memset(key, 0, sizeof *key);
22066+}
22067+
22068+/* minimal possible key in the tree. Return pointer to the static storage. */
22069+const reiser4_key *min_key(void)
22070+{
22071+ return &MINIMAL_KEY;
22072+}
22073+
22074+/* maximum possible key in the tree. Return pointer to the static storage. */
22075+const reiser4_key *max_key(void)
22076+{
22077+ return &MAXIMAL_KEY;
22078+}
22079+
22080+#if REISER4_DEBUG
22081+/* debugging aid: print symbolic name of key type */
22082+static const char *type_name(unsigned int key_type /* key type */ )
22083+{
22084+ switch (key_type) {
22085+ case KEY_FILE_NAME_MINOR:
22086+ return "file name";
22087+ case KEY_SD_MINOR:
22088+ return "stat data";
22089+ case KEY_ATTR_NAME_MINOR:
22090+ return "attr name";
22091+ case KEY_ATTR_BODY_MINOR:
22092+ return "attr body";
22093+ case KEY_BODY_MINOR:
22094+ return "file body";
22095+ default:
22096+ return "unknown";
22097+ }
22098+}
22099+
22100+/* debugging aid: print human readable information about key */
22101+void print_key(const char *prefix /* prefix to print */ ,
22102+ const reiser4_key * key /* key to print */ )
22103+{
22104+ /* turn bold on */
22105+ /* printf ("\033[1m"); */
22106+ if (key == NULL)
22107+ printk("%s: null key\n", prefix);
22108+ else {
22109+ if (REISER4_LARGE_KEY)
22110+ printk("%s: (%Lx:%x:%Lx:%Lx:%Lx:%Lx)", prefix,
22111+ get_key_locality(key),
22112+ get_key_type(key),
22113+ get_key_ordering(key),
22114+ get_key_band(key),
22115+ get_key_objectid(key), get_key_offset(key));
22116+ else
22117+ printk("%s: (%Lx:%x:%Lx:%Lx:%Lx)", prefix,
22118+ get_key_locality(key),
22119+ get_key_type(key),
22120+ get_key_band(key),
22121+ get_key_objectid(key), get_key_offset(key));
22122+ /*
22123+ * if this is a key of directory entry, try to decode part of
22124+ * a name stored in the key, and output it.
22125+ */
22126+ if (get_key_type(key) == KEY_FILE_NAME_MINOR) {
22127+ char buf[DE_NAME_BUF_LEN];
22128+ char *c;
22129+
22130+ c = buf;
22131+ c = unpack_string(get_key_ordering(key), c);
22132+ unpack_string(get_key_fulloid(key), c);
22133+ printk("[%s", buf);
22134+ if (is_longname_key(key))
22135+ /*
22136+ * only part of the name is stored in the key.
22137+ */
22138+ printk("...]\n");
22139+ else {
22140+ /*
22141+ * whole name is stored in the key.
22142+ */
22143+ unpack_string(get_key_offset(key), buf);
22144+ printk("%s]\n", buf);
22145+ }
22146+ } else {
22147+ printk("[%s]\n", type_name(get_key_type(key)));
22148+ }
22149+ }
22150+ /* turn bold off */
22151+ /* printf ("\033[m\017"); */
22152+}
22153+
22154+#endif
22155+
22156+/* Make Linus happy.
22157+ Local variables:
22158+ c-indentation-style: "K&R"
22159+ mode-name: "LC"
22160+ c-basic-offset: 8
22161+ tab-width: 8
22162+ fill-column: 120
22163+ End:
22164+*/
22165Index: linux-2.6.16/fs/reiser4/key.h
22166===================================================================
22167--- /dev/null
22168+++ linux-2.6.16/fs/reiser4/key.h
22169@@ -0,0 +1,384 @@
22170+/* Copyright 2000, 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
22171+
22172+/* Declarations of key-related data-structures and operations on keys. */
22173+
22174+#if !defined( __REISER4_KEY_H__ )
22175+#define __REISER4_KEY_H__
22176+
22177+#include "dformat.h"
22178+#include "forward.h"
22179+#include "debug.h"
22180+
22181+#include <linux/types.h> /* for __u?? */
22182+
22183+/* Operations on keys in reiser4 tree */
22184+
22185+/* No access to any of these fields shall be done except via a
22186+ wrapping macro/function, and that wrapping macro/function shall
22187+ convert to little endian order. Key comparison operates on cpu byte order. */
22188+
22189+/* A storage layer implementation difference between a regular unix file body and its attributes is in the typedef below
22190+ which causes all of the attributes of a file to be near in key to all of the other attributes for all of the files
22191+ within that directory, and not near to the file itself. It is interesting to consider whether this is the wrong
22192+ approach, and whether there should be no difference at all. For current usage patterns this choice is probably the
22193+ right one. */
22194+
22195+/* possible values for minor packing locality (4 bits required) */
22196+typedef enum {
22197+ /* file name */
22198+ KEY_FILE_NAME_MINOR = 0,
22199+ /* stat-data */
22200+ KEY_SD_MINOR = 1,
22201+ /* file attribute name */
22202+ KEY_ATTR_NAME_MINOR = 2,
22203+ /* file attribute value */
22204+ KEY_ATTR_BODY_MINOR = 3,
22205+ /* file body (tail or extent) */
22206+ KEY_BODY_MINOR = 4,
22207+} key_minor_locality;
22208+
22209+/* everything stored in the tree has a unique key, which means that the tree is (logically) fully ordered by key.
22210+ Physical order is determined by dynamic heuristics that attempt to reflect key order when allocating available space,
22211+ and by the repacker. It is stylistically better to put aggregation information into the key. Thus, if you want to
22212+ segregate extents from tails, it is better to give them distinct minor packing localities rather than changing
22213+ block_alloc.c to check the node type when deciding where to allocate the node.
22214+
22215+ The need to randomly displace new directories and large files disturbs this symmetry unfortunately. However, it
22216+ should be noted that this is a need that is not clearly established given the existence of a repacker. Also, in our
22217+ current implementation tails have a different minor packing locality from extents, and no files have both extents and
22218+ tails, so maybe symmetry can be had without performance cost after all. Symmetry is what we ship for now....
22219+*/
22220+
22221+/* Arbitrary major packing localities can be assigned to objects using
22222+ the reiser4(filenameA/..packing<=some_number) system call.
22223+
22224+ In reiser4, the creat() syscall creates a directory
22225+
22226+ whose default flow (that which is referred to if the directory is
22227+ read as a file) is the traditional unix file body.
22228+
22229+ whose directory plugin is the 'filedir'
22230+
22231+ whose major packing locality is that of the parent of the object created.
22232+
22233+ The static_stat item is a particular commonly used directory
22234+ compression (the one for normal unix files).
22235+
22236+ The filedir plugin checks to see if the static_stat item exists.
22237+ There is a unique key for static_stat. If yes, then it uses the
22238+ static_stat item for all of the values that it contains. The
22239+ static_stat item contains a flag for each stat it contains which
22240+ indicates whether one should look outside the static_stat item for its
22241+ contents.
22242+*/
22243+
22244+/* offset of fields in reiser4_key. Value of each element of this enum
22245+ is the index within the key (thought of as an array of __u64's) where this field
22246+ is. */
22247+typedef enum {
22248+ /* major "locale", aka dirid. Sits in 1st element */
22249+ KEY_LOCALITY_INDEX = 0,
22250+ /* minor "locale", aka item type. Sits in 1st element */
22251+ KEY_TYPE_INDEX = 0,
22252+ ON_LARGE_KEY(KEY_ORDERING_INDEX,)
22253+ /* "object band". Sits in 2nd element */
22254+ KEY_BAND_INDEX,
22255+ /* objectid. Sits in 2nd element */
22256+ KEY_OBJECTID_INDEX = KEY_BAND_INDEX,
22257+ /* full objectid. Sits in 2nd element */
22258+ KEY_FULLOID_INDEX = KEY_BAND_INDEX,
22259+ /* Offset. Sits in 3rd element */
22260+ KEY_OFFSET_INDEX,
22261+ /* Name hash. Sits in 3rd element */
22262+ KEY_HASH_INDEX = KEY_OFFSET_INDEX,
22263+ KEY_CACHELINE_END = KEY_OFFSET_INDEX,
22264+ KEY_LAST_INDEX
22265+} reiser4_key_field_index;
22266+
22267+/* key in reiser4 internal "balanced" tree. It is just an array of three
22268+ (four, with large keys) 64bit integers in disk byte order (little-endian
22269+ by default). This array is actually indexed by reiser4_key_field. Each
22270+ __u64 within this array is called an "element". Logical key components
22271+ encoded within elements are called "fields".
22272+
22273+ We declare this as union with second component dummy to suppress
22274+ inconvenient array<->pointer casts implied in C. */
22275+union reiser4_key {
22276+ __le64 el[KEY_LAST_INDEX];
22277+ int pad;
22278+};
22279+
22280+/* bitmasks showing where within reiser4_key particular key is stored. */
22281+/* major locality occupies higher 60 bits of the first element */
22282+#define KEY_LOCALITY_MASK 0xfffffffffffffff0ull
22283+
22284+/* minor locality occupies lower 4 bits of the first element */
22285+#define KEY_TYPE_MASK 0xfull
22286+
22287+/* controversial band occupies higher 4 bits of the 2nd element */
22288+#define KEY_BAND_MASK 0xf000000000000000ull
22289+
22290+/* objectid occupies lower 60 bits of the 2nd element */
22291+#define KEY_OBJECTID_MASK 0x0fffffffffffffffull
22292+
22293+/* full 64bit objectid*/
22294+#define KEY_FULLOID_MASK 0xffffffffffffffffull
22295+
22296+/* offset is just the 3rd element itself */
22297+#define KEY_OFFSET_MASK 0xffffffffffffffffull
22298+
22299+/* ordering is whole second element */
22300+#define KEY_ORDERING_MASK 0xffffffffffffffffull
22301+
22302+/* how many bits key element should be shifted to left to get particular field */
22303+typedef enum {
22304+ KEY_LOCALITY_SHIFT = 4,
22305+ KEY_TYPE_SHIFT = 0,
22306+ KEY_BAND_SHIFT = 60,
22307+ KEY_OBJECTID_SHIFT = 0,
22308+ KEY_FULLOID_SHIFT = 0,
22309+ KEY_OFFSET_SHIFT = 0,
22310+ KEY_ORDERING_SHIFT = 0,
22311+} reiser4_key_field_shift;
22312+
22313+static inline __u64
22314+get_key_el(const reiser4_key * key, reiser4_key_field_index off)
22315+{
22316+ assert("nikita-753", key != NULL);
22317+ assert("nikita-754", off < KEY_LAST_INDEX);
22318+ return le64_to_cpu(get_unaligned(&key->el[off]));
22319+}
22320+
22321+static inline void
22322+set_key_el(reiser4_key * key, reiser4_key_field_index off, __u64 value)
22323+{
22324+ assert("nikita-755", key != NULL);
22325+ assert("nikita-756", off < KEY_LAST_INDEX);
22326+ put_unaligned(cpu_to_le64(value), &key->el[off]);
22327+}
22328+
22329+/* macro to define getter and setter functions for field F with type T */
22330+#define DEFINE_KEY_FIELD( L, U, T ) \
22331+static inline T get_key_ ## L ( const reiser4_key *key ) \
22332+{ \
22333+ assert( "nikita-750", key != NULL ); \
22334+ return ( T ) ( get_key_el( key, KEY_ ## U ## _INDEX ) & \
22335+ KEY_ ## U ## _MASK ) >> KEY_ ## U ## _SHIFT; \
22336+} \
22337+ \
22338+static inline void set_key_ ## L ( reiser4_key *key, T loc ) \
22339+{ \
22340+ __u64 el; \
22341+ \
22342+ assert( "nikita-752", key != NULL ); \
22343+ \
22344+ el = get_key_el( key, KEY_ ## U ## _INDEX ); \
22345+ /* clear field bits in the key */ \
22346+ el &= ~KEY_ ## U ## _MASK; \
22347+ /* actually it should be \
22348+ \
22349+ el |= ( loc << KEY_ ## U ## _SHIFT ) & KEY_ ## U ## _MASK; \
22350+ \
22351+ but we trust user to never pass values that wouldn't fit \
22352+ into field. Clearing extra bits is one operation, but this \
22353+ function is time-critical. \
22354+ But check this in assertion. */ \
22355+ assert( "nikita-759", ( ( loc << KEY_ ## U ## _SHIFT ) & \
22356+ ~KEY_ ## U ## _MASK ) == 0 ); \
22357+ el |= ( loc << KEY_ ## U ## _SHIFT ); \
22358+ set_key_el( key, KEY_ ## U ## _INDEX, el ); \
22359+}
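
For readability, here is the mechanical expansion of DEFINE_KEY_FIELD(offset, OFFSET, __u64) from below, shown only as illustration. Since KEY_OFFSET_MASK covers the whole element and KEY_OFFSET_SHIFT is 0, the offset accessors reduce to a plain load and store of the third element.

	static inline __u64 get_key_offset(const reiser4_key *key)
	{
		assert("nikita-750", key != NULL);
		return (__u64)(get_key_el(key, KEY_OFFSET_INDEX) &
			       KEY_OFFSET_MASK) >> KEY_OFFSET_SHIFT;
	}

	static inline void set_key_offset(reiser4_key *key, __u64 loc)
	{
		__u64 el;

		assert("nikita-752", key != NULL);
		el = get_key_el(key, KEY_OFFSET_INDEX);
		el &= ~KEY_OFFSET_MASK;	/* clear field bits in the key */
		assert("nikita-759", ((loc << KEY_OFFSET_SHIFT) &
				      ~KEY_OFFSET_MASK) == 0);
		el |= (loc << KEY_OFFSET_SHIFT);
		set_key_el(key, KEY_OFFSET_INDEX, el);
	}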
22360+
22361+typedef __u64 oid_t;
22362+
22363+/* define get_key_locality(), set_key_locality() */
22364+DEFINE_KEY_FIELD(locality, LOCALITY, oid_t);
22365+/* define get_key_type(), set_key_type() */
22366+DEFINE_KEY_FIELD(type, TYPE, key_minor_locality);
22367+/* define get_key_band(), set_key_band() */
22368+DEFINE_KEY_FIELD(band, BAND, __u64);
22369+/* define get_key_objectid(), set_key_objectid() */
22370+DEFINE_KEY_FIELD(objectid, OBJECTID, oid_t);
22371+/* define get_key_fulloid(), set_key_fulloid() */
22372+DEFINE_KEY_FIELD(fulloid, FULLOID, oid_t);
22373+/* define get_key_offset(), set_key_offset() */
22374+DEFINE_KEY_FIELD(offset, OFFSET, __u64);
22375+#if (REISER4_LARGE_KEY)
22376+/* define get_key_ordering(), set_key_ordering() */
22377+DEFINE_KEY_FIELD(ordering, ORDERING, __u64);
22378+#else
22379+static inline __u64 get_key_ordering(const reiser4_key * key)
22380+{
22381+ return 0;
22382+}
22383+
22384+static inline void set_key_ordering(reiser4_key * key, __u64 val)
22385+{
22386+}
22387+#endif
22388+
22389+/* key comparison result */
22390+typedef enum { LESS_THAN = -1, /* if first key is less than second */
22391+ EQUAL_TO = 0, /* if keys are equal */
22392+ GREATER_THAN = +1 /* if first key is greater than second */
22393+} cmp_t;
22394+
22395+void reiser4_key_init(reiser4_key * key);
22396+
22397+/* minimal possible key in the tree. Return pointer to the static storage. */
22398+extern const reiser4_key *min_key(void);
22399+extern const reiser4_key *max_key(void);
22400+
22401+/* helper macro for keycmp() */
22402+#define KEY_DIFF(k1, k2, field) \
22403+({ \
22404+ typeof (get_key_ ## field (k1)) f1; \
22405+ typeof (get_key_ ## field (k2)) f2; \
22406+ \
22407+ f1 = get_key_ ## field (k1); \
22408+ f2 = get_key_ ## field (k2); \
22409+ \
22410+ (f1 < f2) ? LESS_THAN : ((f1 == f2) ? EQUAL_TO : GREATER_THAN); \
22411+})
22412+
22413+/* helper macro for keycmp() */
22414+#define KEY_DIFF_EL(k1, k2, off) \
22415+({ \
22416+ __u64 e1; \
22417+ __u64 e2; \
22418+ \
22419+ e1 = get_key_el(k1, off); \
22420+ e2 = get_key_el(k2, off); \
22421+ \
22422+ (e1 < e2) ? LESS_THAN : ((e1 == e2) ? EQUAL_TO : GREATER_THAN); \
22423+})
22424+
22425+/* compare `k1' and `k2'. This function is the heart of the "key allocation
22426+ policy". All you need to implement a new policy is to add yet another
22427+ clause here. */
22428+static inline cmp_t keycmp(const reiser4_key * k1 /* first key to compare */ ,
22429+ const reiser4_key * k2 /* second key to compare */ )
22430+{
22431+ cmp_t result;
22432+
22433+ /*
22434+ * This function is the heart of reiser4 tree-routines. Key comparison
22435+ * is among most heavily used operations in the file system.
22436+ */
22437+
22438+ assert("nikita-439", k1 != NULL);
22439+ assert("nikita-440", k2 != NULL);
22440+
22441+ /* there is no actual branch here: condition is compile time constant
22442+ * and constant folding and propagation ensures that only one branch
22443+ * is actually compiled in. */
22444+
22445+ if (REISER4_PLANA_KEY_ALLOCATION) {
22446+ /* if physical order of fields in a key is identical
22447+ with logical order, we can implement key comparison
22448+ as three 64bit comparisons. */
22449+ /* logical order of fields in plan-a:
22450+ locality->type->objectid->offset. */
22451+ /* compare locality and type at once */
22452+ result = KEY_DIFF_EL(k1, k2, 0);
22453+ if (result == EQUAL_TO) {
22454+ /* compare objectid (and band if it's there) */
22455+ result = KEY_DIFF_EL(k1, k2, 1);
22456+ /* compare offset */
22457+ if (result == EQUAL_TO) {
22458+ result = KEY_DIFF_EL(k1, k2, 2);
22459+ if (REISER4_LARGE_KEY && result == EQUAL_TO) {
22460+ result = KEY_DIFF_EL(k1, k2, 3);
22461+ }
22462+ }
22463+ }
22464+ } else if (REISER4_3_5_KEY_ALLOCATION) {
22465+ result = KEY_DIFF(k1, k2, locality);
22466+ if (result == EQUAL_TO) {
22467+ result = KEY_DIFF(k1, k2, objectid);
22468+ if (result == EQUAL_TO) {
22469+ result = KEY_DIFF(k1, k2, type);
22470+ if (result == EQUAL_TO)
22471+ result = KEY_DIFF(k1, k2, offset);
22472+ }
22473+ }
22474+ } else
22475+ impossible("nikita-441", "Unknown key allocation scheme!");
22476+ return result;
22477+}
22478+
22479+/* true if @k1 equals @k2 */
22480+static inline int keyeq(const reiser4_key * k1 /* first key to compare */ ,
22481+ const reiser4_key * k2 /* second key to compare */ )
22482+{
22483+ assert("nikita-1879", k1 != NULL);
22484+ assert("nikita-1880", k2 != NULL);
22485+ return !memcmp(k1, k2, sizeof *k1);
22486+}
22487+
22488+/* true if @k1 is less than @k2 */
22489+static inline int keylt(const reiser4_key * k1 /* first key to compare */ ,
22490+ const reiser4_key * k2 /* second key to compare */ )
22491+{
22492+ assert("nikita-1952", k1 != NULL);
22493+ assert("nikita-1953", k2 != NULL);
22494+ return keycmp(k1, k2) == LESS_THAN;
22495+}
22496+
22497+/* true if @k1 is less than or equal to @k2 */
22498+static inline int keyle(const reiser4_key * k1 /* first key to compare */ ,
22499+ const reiser4_key * k2 /* second key to compare */ )
22500+{
22501+ assert("nikita-1954", k1 != NULL);
22502+ assert("nikita-1955", k2 != NULL);
22503+ return keycmp(k1, k2) != GREATER_THAN;
22504+}
22505+
22506+/* true if @k1 is greater than @k2 */
22507+static inline int keygt(const reiser4_key * k1 /* first key to compare */ ,
22508+ const reiser4_key * k2 /* second key to compare */ )
22509+{
22510+ assert("nikita-1959", k1 != NULL);
22511+ assert("nikita-1960", k2 != NULL);
22512+ return keycmp(k1, k2) == GREATER_THAN;
22513+}
22514+
22515+/* true if @k1 is greater than or equal to @k2 */
22516+static inline int keyge(const reiser4_key * k1 /* first key to compare */ ,
22517+ const reiser4_key * k2 /* second key to compare */ )
22518+{
22519+ assert("nikita-1956", k1 != NULL);
22520+ assert("nikita-1957", k2 != NULL); /* October 4: sputnik launched
22521+ * November 3: Laika */
22522+ return keycmp(k1, k2) != LESS_THAN;
22523+}
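
A short usage sketch (two valid inodes in a kernel context assumed, kassign.h included for build_sd_key()): stat-data keys compare with these predicates, which is how tree lookups decide which way to descend.

	reiser4_key k1, k2;

	build_sd_key(inode1, &k1);
	build_sd_key(inode2, &k2);
	if (keylt(&k1, &k2)) {
		/* inode1's stat-data sorts before inode2's in the tree */
	}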
22524+
22525+static inline void prefetchkey(reiser4_key * key)
22526+{
22527+ prefetch(key);
22528+ prefetch(&key->el[KEY_CACHELINE_END]);
22529+}
22530+
22531+/* (%Lx:%x:%Lx:%Lx:%Lx:%Lx) =
22532+ 1 + 16 + 1 + 1 + 1 + 1 + 1 + 16 + 1 + 16 + 1 + 16 + 1 */
22533+/* size of a buffer suitable to hold human readable key representation */
22534+#define KEY_BUF_LEN (80)
22535+
22536+#if REISER4_DEBUG
22537+extern void print_key(const char *prefix, const reiser4_key * key);
22538+#else
22539+#define print_key(p,k) noop
22540+#endif
22541+
22542+/* __REISER4_KEY_H__ */
22543+#endif
22544+
22545+/* Make Linus happy.
22546+ Local variables:
22547+ c-indentation-style: "K&R"
22548+ mode-name: "LC"
22549+ c-basic-offset: 8
22550+ tab-width: 8
22551+ fill-column: 120
22552+ End:
22553+*/
22554Index: linux-2.6.16/fs/reiser4/ktxnmgrd.c
22555===================================================================
22556--- /dev/null
22557+++ linux-2.6.16/fs/reiser4/ktxnmgrd.c
22558@@ -0,0 +1,214 @@
22559+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
22560+/* Transaction manager daemon. */
22561+
22562+/*
22563+ * ktxnmgrd is a kernel daemon responsible for committing transactions. It is
22564+ * needed/important for the following reasons:
22565+ *
22566+ * 1. in reiser4 atom is not committed immediately when last transaction
22567+ * handle closes, unless atom is either too old or too large (see
22568+ * atom_should_commit()). This is done to avoid committing too frequently,
22569+ * because:
22570+ *
22571+ * 2. sometimes we don't want to commit atom when closing last transaction
22572+ * handle even if it is old and fat enough. For example, because we are at
22573+ * this point under directory semaphore, and committing would stall all
22574+ * accesses to this directory.
22575+ *
22576+ * ktxnmgrd spends its time sleeping on a condition variable. When it awakes,
22577+ * either due to a (tunable) timeout or because it was explicitly woken up by
22578+ * a call to ktxnmgrd_kick(), it scans the list of all atoms and commits the
22579+ * eligible ones.
22580+ *
22581+ */
22582+
22583+#include "debug.h"
22584+#include "txnmgr.h"
22585+#include "tree.h"
22586+#include "ktxnmgrd.h"
22587+#include "super.h"
22588+#include "reiser4.h"
22589+
22590+#include <linux/sched.h> /* for struct task_struct */
22591+#include <linux/wait.h>
22592+#include <linux/suspend.h>
22593+#include <linux/kernel.h>
22594+#include <linux/writeback.h>
22595+#include <linux/kthread.h>
22596+
22597+static int scan_mgr(struct super_block *);
22598+
22599+/*
22600+ * change current->comm so that ps, top, and friends will see the changed
22601+ * state. This serves no useful purpose whatsoever, but also costs nothing.
22602+ * Maybe it will make a lonely system administrator feel less alone at 3 A.M.
22603+ */
22604+#define set_comm( state ) \
22605+ snprintf( current -> comm, sizeof( current -> comm ), \
22606+ "%s:%s:%s", __FUNCTION__, (super)->s_id, ( state ) )
22607+
22608+/**
22609+ * ktxnmgrd - kernel txnmgr daemon
22610+ * @arg: pointer to super block
22611+ *
22612+ * The background transaction manager daemon, started as a kernel thread during
22613+ * reiser4 initialization.
22614+ */
22615+static int ktxnmgrd(void *arg)
22616+{
22617+ struct super_block *super;
22618+ ktxnmgrd_context *ctx;
22619+ txn_mgr *mgr;
22620+ int done = 0;
22621+
22622+ super = arg;
22623+ mgr = &get_super_private(super)->tmgr;
22624+
22625+ /*
22626+ * do_fork() just copies task_struct into the new thread. ->fs_context
22627+ * shouldn't be copied of course. This shouldn't be a problem for the
22628+ * rest of the code though.
22629+ */
22630+ current->journal_info = NULL;
22631+ ctx = mgr->daemon;
22632+ while (1) {
22633+ try_to_freeze();
22634+ set_comm("wait");
22635+ {
22636+ DEFINE_WAIT(__wait);
22637+
22638+ prepare_to_wait(&ctx->wait, &__wait, TASK_INTERRUPTIBLE);
22639+ if (kthread_should_stop()) {
22640+ done = 1;
22641+ } else
22642+ schedule_timeout(ctx->timeout);
22643+ finish_wait(&ctx->wait, &__wait);
22644+ }
22645+ if (done)
22646+ break;
22647+ set_comm("run");
22648+ spin_lock(&ctx->guard);
22649+ /*
22650+ * wait timed out or ktxnmgrd was woken up by explicit request
22651+ * to commit something. Scan list of atoms in txnmgr and look
22652+ * for too old atoms.
22653+ */
22654+ do {
22655+ ctx->rescan = 0;
22656+ scan_mgr(super);
22657+ spin_lock(&ctx->guard);
22658+ if (ctx->rescan) {
22659+ /*
22660+ * the list could be modified while ctx
22661+ * spinlock was released, we have to repeat
22662+ * scanning from the beginning
22663+ */
22664+ break;
22665+ }
22666+ } while (ctx->rescan);
22667+ spin_unlock(&ctx->guard);
22668+ }
22669+ return 0;
22670+}
22671+
22672+#undef set_comm
22673+
22674+/**
22675+ * init_ktxnmgrd - initialize ktxnmgrd context and start kernel daemon
22676+ * @super: pointer to super block
22677+ *
22678+ * Allocates and initializes ktxnmgrd_context, attaches it to transaction
22679+ * manager. Starts kernel txnmgr daemon. This is called on mount.
22680+ */
22681+int init_ktxnmgrd(struct super_block *super)
22682+{
22683+ txn_mgr *mgr;
22684+ ktxnmgrd_context *ctx;
22685+
22686+ mgr = &get_super_private(super)->tmgr;
22687+
22688+ assert("zam-1014", mgr->daemon == NULL);
22689+
22690+ ctx = kmalloc(sizeof(ktxnmgrd_context), get_gfp_mask());
22691+ if (ctx == NULL)
22692+ return RETERR(-ENOMEM);
22693+
22694+ assert("nikita-2442", ctx != NULL);
22695+
22696+ memset(ctx, 0, sizeof *ctx);
22697+ init_waitqueue_head(&ctx->wait);
22698+
22699+ /*kcond_init(&ctx->startup);*/
22700+ spin_lock_init(&ctx->guard);
22701+ ctx->timeout = REISER4_TXNMGR_TIMEOUT;
22702+ ctx->rescan = 1;
22703+ mgr->daemon = ctx;
22704+
22705+ ctx->tsk = kthread_run(ktxnmgrd, super, "ktxnmgrd");
22706+ if (IS_ERR(ctx->tsk)) {
22707+ int ret = PTR_ERR(ctx->tsk);
22708+ mgr->daemon = NULL;
22709+ kfree(ctx);
22710+ return RETERR(ret);
22711+ }
22712+ return 0;
22713+}
22714+
22715+void ktxnmgrd_kick(txn_mgr *mgr)
22716+{
22717+ assert("nikita-3234", mgr != NULL);
22718+ assert("nikita-3235", mgr->daemon != NULL);
22719+ wake_up(&mgr->daemon->wait);
22720+}
22721+
22722+int is_current_ktxnmgrd(void)
22723+{
22724+ return (get_current_super_private()->tmgr.daemon->tsk == current);
22725+}
22726+
22727+/**
22728+ * scan_mgr - commit atoms which are to be committed
22729+ * @super: super block to commit atoms of
22730+ *
22731+ * Commits old atoms.
22732+ */
22733+static int scan_mgr(struct super_block *super)
22734+{
22735+ int ret;
22736+ reiser4_context ctx;
22737+
22738+ init_stack_context(&ctx, super);
22739+
22740+ ret = commit_some_atoms(&get_super_private(super)->tmgr);
22741+
22742+ reiser4_exit_context(&ctx);
22743+ return ret;
22744+}
22745+
22746+/**
22747+ * done_ktxnmgrd - stop kernel thread and free ktxnmgrd context
22748+ * @super: super block whose ktxnmgrd is to be stopped
22749+ *
22750+ * This is called on umount. Stops ktxnmgrd and frees its context.
22751+ */
22752+void done_ktxnmgrd(struct super_block *super)
22753+{
22754+ txn_mgr *mgr;
22755+
22756+ mgr = &get_super_private(super)->tmgr;
22757+ assert("zam-1012", mgr->daemon != NULL);
22758+
22759+ kthread_stop(mgr->daemon->tsk);
22760+ kfree(mgr->daemon);
22761+ mgr->daemon = NULL;
22762+}
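
Put together, the daemon's lifecycle as this file suggests it (a sketch; the real call sites are in the mount/umount and transaction-manager paths elsewhere in the patch):

	int err = init_ktxnmgrd(super);	/* mount: start the daemon */

	if (err == 0) {
		/* while the file system is active, code that wants an
		   atom committed soon nudges the sleeping daemon: */
		ktxnmgrd_kick(&get_super_private(super)->tmgr);
		/* ... */
		done_ktxnmgrd(super);	/* umount: stop thread, free ctx */
	}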
22763+
22764+/*
22765+ * Local variables:
22766+ * c-indentation-style: "K&R"
22767+ * mode-name: "LC"
22768+ * c-basic-offset: 8
22769+ * tab-width: 8
22770+ * fill-column: 120
22771+ * End:
22772+ */
22773Index: linux-2.6.16/fs/reiser4/ktxnmgrd.h
22774===================================================================
22775--- /dev/null
22776+++ linux-2.6.16/fs/reiser4/ktxnmgrd.h
22777@@ -0,0 +1,52 @@
22778+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
22779+ * reiser4/README */
22780+
22781+/* Transaction manager daemon. See ktxnmgrd.c for comments. */
22782+
22783+#ifndef __KTXNMGRD_H__
22784+#define __KTXNMGRD_H__
22785+
22786+#include "txnmgr.h"
22787+
22788+#include <linux/fs.h>
22789+#include <linux/wait.h>
22790+#include <linux/completion.h>
22791+#include <linux/spinlock.h>
22792+#include <asm/atomic.h>
22793+#include <linux/sched.h> /* for struct task_struct */
22794+
22795+/* in this structure all data necessary to start up, shut down and communicate
22796+ * with ktxnmgrd are kept. */
22797+struct ktxnmgrd_context {
22798+ /* wait queue head on which ktxnmgrd sleeps */
22799+ wait_queue_head_t wait;
22800+ /* spin lock protecting all fields of this structure */
22801+ spinlock_t guard;
22802+ /* timeout of sleeping on ->wait */
22803+ signed long timeout;
22804+ /* kernel thread running ktxnmgrd */
22805+ struct task_struct *tsk;
22806+ /* list of all file systems served by this ktxnmgrd */
22807+ struct list_head queue;
22808+ /* should ktxnmgrd repeat scanning of atoms? */
22809+ unsigned int rescan:1;
22810+};
22811+
22812+extern int init_ktxnmgrd(struct super_block *);
22813+extern void done_ktxnmgrd(struct super_block *);
22814+
22815+extern void ktxnmgrd_kick(txn_mgr * mgr);
22816+extern int is_current_ktxnmgrd(void);
22817+
22818+/* __KTXNMGRD_H__ */
22819+#endif
22820+
22821+/* Make Linus happy.
22822+ Local variables:
22823+ c-indentation-style: "K&R"
22824+ mode-name: "LC"
22825+ c-basic-offset: 8
22826+ tab-width: 8
22827+ fill-column: 120
22828+ End:
22829+*/
22830Index: linux-2.6.16/fs/reiser4/lock.c
22831===================================================================
22832--- /dev/null
22833+++ linux-2.6.16/fs/reiser4/lock.c
22834@@ -0,0 +1,1261 @@
22835+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
22836+ * reiser4/README */
22837+
22838+/* Traditional deadlock avoidance is achieved by acquiring all locks in a single
22839+ order. V4 balances the tree from the bottom up, and searches the tree from
22840+ the top down, and that is really the way we want it, so tradition won't work
22841+ for us.
22842+
22843+ Instead we have two lock orderings, a high priority lock ordering, and a low
22844+ priority lock ordering. Each node in the tree has a lock in its znode.
22845+
22846+ Suppose we have a set of processes which lock (R/W) tree nodes. Each process
22847+ has a set (maybe empty) of already locked nodes ("process locked set"). Each
22848+ process may have a pending lock request to a node locked by another process.
22849+ Note: we lock and unlock, but do not transfer locks: it is possible
22850+ transferring locks instead would save some bus locking....
22851+
22852+ Deadlock occurs when we have a loop constructed from process locked sets and
22853+ lock request vectors.
22854+
22855+ NOTE: The reiser4 "tree" is a tree on disk, but its cached representation in
22856+ memory is extended with "znodes" with which we connect nodes with their left
22857+ and right neighbors using sibling pointers stored in the znodes. When we
22858+ perform balancing operations we often go from left to right and from right to
22859+ left.
22860+
22861+ +-P1-+ +-P3-+
22862+ |+--+| V1 |+--+|
22863+ ||N1|| -------> ||N3||
22864+ |+--+| |+--+|
22865+ +----+ +----+
22866+ ^ |
22867+ |V2 |V3
22868+ | v
22869+ +---------P2---------+
22870+ |+--+ +--+|
22871+ ||N2| -------- |N4||
22872+ |+--+ +--+|
22873+ +--------------------+
22874+
22875+ We solve this by ensuring that only low priority processes lock in top to
22876+ bottom order and from right to left, and high priority processes lock from
22877+ bottom to top and left to right.
22878+
22879+ ZAM-FIXME-HANS: order not just node locks in this way, order atom locks, and
22880+ kill those damn busy loops.
22881+ ANSWER(ZAM): atom locks (which are introduced by ASTAGE_CAPTURE_WAIT atom
22882+ stage) cannot be ordered that way. There are no rules about which nodes can
22883+ belong to the atom and which cannot. We cannot define what is right or left
22884+ direction, what is top or bottom. We can take immediate parent or side
22885+ neighbor of one node, but nobody guarantees that, say, left neighbor node is
22886+ not a far right neighbor for other nodes from the same atom. It breaks
22887+ deadlock avoidance rules and hi-low priority locking cannot be applied for
22888+ atom locks.
22889+
22890+ How does it help to avoid deadlocks?
22891+
22892+ Suppose we have a deadlock with n processes. Processes from one priority
22893+ class never deadlock because they take locks in one consistent
22894+ order.
22895+
22896+ So, any possible deadlock loop must have low priority as well as high
22897+ priority processes. There are no other lock priority levels except low and
22898+ high. We know that any deadlock loop contains at least one node locked by a
22899+ low priority process and requested by a high priority process. If this
22900+ situation is caught and resolved it is sufficient to avoid deadlocks.
22901+
22902+ V4 DEADLOCK PREVENTION ALGORITHM IMPLEMENTATION.
22903+
22904+ The deadlock prevention algorithm is based on comparing
22905+ priorities of node owners (processes which keep znode locked) and
22906+ requesters (processes which want to acquire a lock on znode). We
22907+ implement a scheme where low-priority owners yield locks to
22908+ high-priority requesters. We created a signal passing system that
22909+ is used to ask low-priority processes to yield one or more locked
22910+ znodes.
22911+
22912+ The condition when a znode needs to change its owners is described by the
22913+ following formula:
22914+
22915+ #############################################
22916+ # #
22917+ # (number of high-priority requesters) > 0 #
22918+ # AND #
22919+ # (numbers of high-priority owners) == 0 #
22920+ # #
22921+ #############################################
22922+
22923+ Note that a low-priority process delays node releasing if another
22924+ high-priority process owns this node. So, slightly more strictly speaking,
22925+ to have a deadlock capable cycle you must have a loop in which a high
22926+ priority process is waiting on a low priority process to yield a node, which
22927+ is slightly different from saying a high priority process is waiting on a
22928+ node owned by a low priority process.
22929+
22930+ It is enough to avoid deadlocks if we prevent any low-priority process from
22931+ falling asleep if its locked set contains a node which satisfies the
22932+ deadlock condition.
22933+
22934+ That condition is implicitly or explicitly checked in all places where new
22935+ high-priority requests may be added or removed from node request queue or
22936+ high-priority process takes or releases a lock on node. The main
22937+ goal of these checks is to never lose the moment when node becomes "has
22938+ wrong owners" and send "must-yield-this-lock" signals to its low-pri owners
22939+ at that time.
22940+
22941+ The information about received signals is stored in the per-process
22942+ structure (lock stack) and analyzed before a low-priority process goes to
22943+ sleep but after a "fast" attempt to lock a node fails. Any signal wakes
22944+ sleeping process up and forces it to re-check lock status and received
22945+ signal info. If "must-yield-this-lock" signals were received the locking
22946+ primitive (longterm_lock_znode()) fails with -E_DEADLOCK error code.
22947+
22948+ V4 LOCKING DRAWBACKS
22949+
22950+ If we have already balanced on one level, and we are propagating our changes
22951+ upward to a higher level, it could be very messy to surrender all locks on
22952+ the lower level because we put so much computational work into it, and
22953+ reverting them to their state before they were locked might be very complex.
22954+ We also don't want to acquire all locks before performing balancing because
22955+ that would either be almost as much work as the balancing, or it would be
22956+ too conservative and lock too much. We want balancing to be done only at
22957+ high priority. Yet, we might want to go one node to the left and use some
22958+ of its empty space... So we make one attempt at getting the node to the left
22959+ using try_lock, and if it fails we do without it, because we didn't really
22960+ need it; it was only nice to have.
22961+
22962+ LOCK STRUCTURES DESCRIPTION
22963+
22964+ The following data structures are used in the reiser4 locking
22965+ implementation:
22966+
22967+ All fields related to long-term locking are stored in znode->lock.
22968+
22969+ The lock stack is a per thread object. It owns all znodes locked by the
22970+ thread. One znode may be locked by several threads in case of read lock or
22971+ one znode may be write locked by one thread several times. The special link
22972+ objects (lock handles) support n<->m relation between znodes and lock
22973+ owners.
22974+
22975+ <Thread 1> <Thread 2>
22976+
22977+ +---------+ +---------+
22978+ | LS1 | | LS2 |
22979+ +---------+ +---------+
22980+ ^ ^
22981+ |---------------+ +----------+
22982+ v v v v
22983+ +---------+ +---------+ +---------+ +---------+
22984+ | LH1 | | LH2 | | LH3 | | LH4 |
22985+ +---------+ +---------+ +---------+ +---------+
22986+ ^ ^ ^ ^
22987+ | +------------+ |
22988+ v v v
22989+ +---------+ +---------+ +---------+
22990+ | Z1 | | Z2 | | Z3 |
22991+ +---------+ +---------+ +---------+
22992+
22993+ Thread 1 locked znodes Z1 and Z2, thread 2 locked znodes Z2 and Z3. The
22994+ picture above shows that lock stack LS1 has a list of 2 lock handles LH1 and
22995+ LH2, lock stack LS2 has a list with lock handles LH3 and LH4 on it. Znode
22996+ Z1 is locked by only one thread, znode has only one lock handle LH1 on its
22997+ list, similar situation is for Z3 which is locked by the thread 2 only. Z2
22998+ is locked (for read) twice by different threads and two lock handles are on
22999+ its list. Each lock handle represents a single relation of a locking of a
23000+ znode by a thread. Locking a znode establishes a locking
23001+ relation between the lock stack and the znode by adding a new lock handle
23002+ to a list of lock handles, the lock stack. The lock stack links all lock
23003+ handles for all znodes locked by the lock stack. The znode list groups all
23004+ lock handles for all locks stacks which locked the znode.
23005+
23006+ Yet another relation may exist between znode and lock owners. If lock
23007+ procedure cannot immediately take a lock on an object, it adds the lock
23008+ owner to a special `requestors' list belonging to the znode. That list
23009+ represents a queue of pending lock requests. Because one lock owner may
23010+ request only one lock object at a time, it is a 1->n relation between
23011+ lock objects and lock owners, implemented as described above. Full information
23012+ (priority, pointers to lock and link objects) about each lock request is
23013+ stored in lock owner structure in `request' field.
23014+
23015+ SHORT_TERM LOCKING
23016+
23017+ This is a list of primitive operations over lock stacks / lock handles /
23018+ znodes and locking descriptions for them.
23019+
23020+ 1. locking / unlocking which is done by two list insertion/deletion, one
23021+ to/from znode's list of lock handles, another one is to/from lock stack's
23022+ list of lock handles. The first insertion is protected by
23023+ znode->lock.guard spinlock. The list owned by the lock stack can be
23024+ modified only by thread who owns the lock stack and nobody else can
23025+ modify/read it. There is nothing to be protected by a spinlock or
23026+ something else.
23027+
23028+ 2. adding/removing a lock request to/from znode requesters list. The rule is
23029+ that znode->lock.guard spinlock should be taken for this.
23030+
23031+ 3. we can traverse list of lock handles and use references to lock stacks who
23032+ locked given znode if znode->lock.guard spinlock is taken.
23033+
23034+ 4. If a lock stack is associated with a znode as a lock requestor or lock
23035+ owner, its existence is guaranteed by the znode->lock.guard spinlock. Some
23036+ of its (the lock stack's) fields should be protected from being accessed in parallel
23037+ by two or more threads. Please look at lock_stack structure definition
23038+ for the info how those fields are protected. */
23039+
23040+/* Znode lock and capturing intertwining. */
23041+/* In current implementation we capture formatted nodes before locking
23042+ them. Take a look at longterm_lock_znode(): the try_capture() request precedes
23043+ locking requests. The longterm_lock_znode function unconditionally captures
23044+ znode before even checking of locking conditions.
23045+
23046+ Another variant is to capture znode after locking it. It was not tested, but
23047+ at least one deadlock condition is supposed to be there. One thread has
23048+ locked a znode (Node-1) and calls try_capture() for it. Try_capture() sleeps
23049+ because znode's atom has CAPTURE_WAIT state. Second thread is a flushing
23050+ thread, its current atom is the atom Node-1 belongs to. Second thread wants
23051+ to lock Node-1 and sleeps because Node-1 is locked by the first thread. The
23052+ described situation is a deadlock. */
23053+
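
For orientation, a sketch of the long-term locking idiom built on the machinery in this file (init_lh(), done_lh() and longterm_lock_znode() are part of this patch's locking API; the particular mode and request flags here are illustrative assumptions):

	lock_handle lh;
	int ret;

	init_lh(&lh);
	ret = longterm_lock_znode(&lh, node, ZNODE_WRITE_LOCK,
				  ZNODE_LOCK_HIPRI);
	if (ret == 0) {
		/* node is write locked; on -E_DEADLOCK the caller is
		   expected to drop its locks and restart instead */
		done_lh(&lh);	/* unlock and unlink the handle */
	}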
23054+#include "debug.h"
23055+#include "txnmgr.h"
23056+#include "znode.h"
23057+#include "jnode.h"
23058+#include "tree.h"
23059+#include "plugin/node/node.h"
23060+#include "super.h"
23061+
23062+#include <linux/spinlock.h>
23063+
23064+#if REISER4_DEBUG
23065+static int request_is_deadlock_safe(znode *, znode_lock_mode,
23066+ znode_lock_request);
23067+#endif
23068+
23069+/* Returns a lock owner associated with current thread */
23070+lock_stack *get_current_lock_stack(void)
23071+{
23072+ return &get_current_context()->stack;
23073+}
23074+
23075+/* Wakes up all low priority owners informing them about possible deadlock */
23076+static void wake_up_all_lopri_owners(znode * node)
23077+{
23078+ lock_handle *handle;
23079+
23080+ assert_spin_locked(&(node->lock.guard));
23081+ list_for_each_entry(handle, &node->lock.owners, owners_link) {
23082+ assert("nikita-1832", handle->node == node);
23083+ /* count this signal in owner->nr_signaled */
23084+ if (!handle->signaled) {
23085+ handle->signaled = 1;
23086+ atomic_inc(&handle->owner->nr_signaled);
23087+ /* Wake up a single process */
23088+ reiser4_wake_up(handle->owner);
23089+ }
23090+ }
23091+}
23092+
23093+/* Adds a lock to a lock owner, which means creating a link to the lock and
23094+ putting the link into the two lists all links are on (the doubly linked list
23095+ that forms the lock_stack, and the doubly linked list of links attached
23096+ to a lock.
23097+*/
23098+static inline void
23099+link_object(lock_handle * handle, lock_stack * owner, znode * node)
23100+{
23101+ assert("jmacd-810", handle->owner == NULL);
23102+ assert_spin_locked(&(node->lock.guard));
23103+
23104+ handle->owner = owner;
23105+ handle->node = node;
23106+
23107+ assert("reiser4-4",
23108+ ergo(list_empty_careful(&owner->locks), owner->nr_locks == 0));
23109+
23110+ /* add lock handle to the end of lock_stack's list of locks */
23111+ list_add_tail(&handle->locks_link, &owner->locks);
23112+ ON_DEBUG(owner->nr_locks++);
23113+ set_gfp_mask();
23114+
23115+ /* add lock handle to the head of znode's list of owners */
23116+ list_add(&handle->owners_link, &node->lock.owners);
23117+ handle->signaled = 0;
23118+}
23119+
23120+/* Breaks a relation between a lock and its owner */
23121+static inline void unlink_object(lock_handle * handle)
23122+{
23123+ assert("zam-354", handle->owner != NULL);
23124+ assert("nikita-1608", handle->node != NULL);
23125+ assert_spin_locked(&(handle->node->lock.guard));
23126+ assert("nikita-1829", handle->owner == get_current_lock_stack());
23127+ assert("reiser4-5", handle->owner->nr_locks > 0);
23128+
23129+ /* remove lock handle from lock_stack's list of locks */
23130+ list_del(&handle->locks_link);
23131+ ON_DEBUG(handle->owner->nr_locks--);
23132+ set_gfp_mask();
23133+ assert("reiser4-6",
23134+ ergo(list_empty_careful(&handle->owner->locks),
23135+ handle->owner->nr_locks == 0));
23136+ /* remove lock handle from znode's list of owners */
23137+ list_del(&handle->owners_link);
23138+ /* indicates that lock handle is free now */
23139+ handle->node = NULL;
23140+#if REISER4_DEBUG
23141+ INIT_LIST_HEAD(&handle->locks_link);
23142+ INIT_LIST_HEAD(&handle->owners_link);
23143+ handle->owner = NULL;
23144+#endif
23145+}
23146+
23147+/* Actually locks an object knowing that we are able to do this */
23148+static void lock_object(lock_stack * owner)
23149+{
23150+ lock_request *request;
23151+ znode *node;
23152+
23153+ request = &owner->request;
23154+ node = request->node;
23155+ assert_spin_locked(&(node->lock.guard));
23156+ if (request->mode == ZNODE_READ_LOCK) {
23157+ node->lock.nr_readers++;
23158+ } else {
23159+ /* check that we didn't switch from read to write lock */
23160+ assert("nikita-1840", node->lock.nr_readers <= 0);
23161+ /* We allow recursive locking; a node can be locked several
23162+ times for write by same process */
23163+ node->lock.nr_readers--;
23164+ }
23165+
23166+ link_object(request->handle, owner, node);
23167+
23168+ if (owner->curpri) {
23169+ node->lock.nr_hipri_owners++;
23170+ }
23171+}
23172+
23173+/* Check for recursive write locking */
23174+static int recursive(lock_stack * owner)
23175+{
23176+ int ret;
23177+ znode *node;
23178+ lock_handle *lh;
23179+
23180+ node = owner->request.node;
23181+
23182+ /* Owners list is not empty for a locked node */
23183+ assert("zam-314", !list_empty_careful(&node->lock.owners));
23184+ assert("nikita-1841", owner == get_current_lock_stack());
23185+ assert_spin_locked(&(node->lock.guard));
23186+
23187+
23188+ lh = list_entry(node->lock.owners.next, lock_handle, owners_link);
23189+ ret = (lh->owner == owner);
23190+
23191+	/* Recursive read locking should be done the usual way */
23192+ assert("zam-315", !ret || owner->request.mode == ZNODE_WRITE_LOCK);
23193+ /* mixing of read/write locks is not allowed */
23194+ assert("zam-341", !ret || znode_is_wlocked(node));
23195+
23196+ return ret;
23197+}
23198+
23199+#if REISER4_DEBUG
23200+/* Returns true if the lock is held by the calling thread. */
23201+int znode_is_any_locked(const znode * node)
23202+{
23203+ lock_handle *handle;
23204+ lock_stack *stack;
23205+ int ret;
23206+
23207+ if (!znode_is_locked(node)) {
23208+ return 0;
23209+ }
23210+
23211+ stack = get_current_lock_stack();
23212+
23213+ spin_lock_stack(stack);
23214+
23215+ ret = 0;
23216+
23217+ list_for_each_entry(handle, &stack->locks, locks_link) {
23218+ if (handle->node == node) {
23219+ ret = 1;
23220+ break;
23221+ }
23222+ }
23223+
23224+ spin_unlock_stack(stack);
23225+
23226+ return ret;
23227+}
23228+
23229+#endif
23230+
23231+/* Returns true if a write lock is held by the calling thread. */
23232+int znode_is_write_locked(const znode * node)
23233+{
23234+ lock_stack *stack;
23235+ lock_handle *handle;
23236+
23237+ assert("jmacd-8765", node != NULL);
23238+
23239+ if (!znode_is_wlocked(node)) {
23240+ return 0;
23241+ }
23242+
23243+ stack = get_current_lock_stack();
23244+
23245+ /*
23246+ * When znode is write locked, all owner handles point to the same lock
23247+ * stack. Get pointer to lock stack from the first lock handle from
23248+ * znode's owner list
23249+ */
23250+ handle = list_entry(node->lock.owners.next, lock_handle, owners_link);
23251+
23252+ return (handle->owner == stack);
23253+}
23254+
23255+/* This "deadlock" condition is the essential part of reiser4 locking
23256+ implementation. This condition is checked explicitly by calling
23257+ check_deadlock_condition() or implicitly in all places where znode lock
23258+ state (set of owners and request queue) is changed. Locking code is
23259+ designed to use this condition to trigger procedure of passing object from
23260+ low priority owner(s) to high priority one(s).
23261+
23262+   The procedure results in passing an event (setting the
23263+   lock_handle->signaled flag), counting this event in the nr_signaled
23264+   field of the owner's lock stack object, and waking up the owner's process.
23265+*/
23266+static inline int check_deadlock_condition(znode * node)
23267+{
23268+ assert_spin_locked(&(node->lock.guard));
23269+ return node->lock.nr_hipri_requests > 0
23270+ && node->lock.nr_hipri_owners == 0;
23271+}
23272+
23273+static int check_livelock_condition(znode * node, znode_lock_mode mode)
23274+{
23275+ zlock * lock = &node->lock;
23276+
23277+ return mode == ZNODE_READ_LOCK &&
23278+ lock -> nr_readers >= 0 && lock->nr_hipri_write_requests > 0;
23279+}
23280+
23281+/* checks lock/request compatibility */
23282+static int can_lock_object(lock_stack * owner)
23283+{
23284+ znode *node = owner->request.node;
23285+
23286+ assert_spin_locked(&(node->lock.guard));
23287+
23288+ /* See if the node is disconnected. */
23289+ if (unlikely(ZF_ISSET(node, JNODE_IS_DYING)))
23290+ return RETERR(-EINVAL);
23291+
23292+	/* Do not ever try to take a lock if we are going in the low priority
23293+	   direction and the node has a high priority request without high
23294+	   priority owners. */
23295+ if (unlikely(!owner->curpri && check_deadlock_condition(node)))
23296+ return RETERR(-E_REPEAT);
23297+ if (unlikely(owner->curpri && check_livelock_condition(node, owner->request.mode)))
23298+ return RETERR(-E_REPEAT);
23299+ if (unlikely(!is_lock_compatible(node, owner->request.mode)))
23300+ return RETERR(-E_REPEAT);
23301+ return 0;
23302+}
23303+
23304+/* Sets high priority for the process. It clears the "signaled" flags
23305+   because a znode locked by a high-priority process can't satisfy our
23306+   "deadlock condition". */
23307+static void set_high_priority(lock_stack * owner)
23308+{
23309+ assert("nikita-1846", owner == get_current_lock_stack());
23310+ /* Do nothing if current priority is already high */
23311+ if (!owner->curpri) {
23312+ /* We don't need locking for owner->locks list, because, this
23313+ * function is only called with the lock stack of the current
23314+ * thread, and no other thread can play with owner->locks list
23315+ * and/or change ->node pointers of lock handles in this list.
23316+ *
23317+ * (Interrupts also are not involved.)
23318+ */
23319+ lock_handle *item = list_entry(owner->locks.next, lock_handle, locks_link);
23320+ while (&owner->locks != &item->locks_link) {
23321+ znode *node = item->node;
23322+
23323+ spin_lock_zlock(&node->lock);
23324+
23325+ node->lock.nr_hipri_owners++;
23326+
23327+			/* we can safely set signaled to zero, because the
23328+			   previous statement (nr_hipri_owners++) guarantees
23329+			   that signaled will never be set again. */
23330+ item->signaled = 0;
23331+ spin_unlock_zlock(&node->lock);
23332+
23333+ item = list_entry(item->locks_link.next, lock_handle, locks_link);
23334+ }
23335+ owner->curpri = 1;
23336+ atomic_set(&owner->nr_signaled, 0);
23337+ }
23338+}
23339+
23340+/* Sets a low priority to the process. */
23341+static void set_low_priority(lock_stack * owner)
23342+{
23343+ assert("nikita-3075", owner == get_current_lock_stack());
23344+ /* Do nothing if current priority is already low */
23345+ if (owner->curpri) {
23346+		/* scan all locks (lock handles) held by @owner, which is
23347+		   actually the current thread, and check whether we are
23348+		   creating a deadlock possibility anywhere.
23349+		 */
23350+ lock_handle *handle = list_entry(owner->locks.next, lock_handle, locks_link);
23351+ while (&owner->locks != &handle->locks_link) {
23352+ znode *node = handle->node;
23353+ spin_lock_zlock(&node->lock);
23354+			/* this thread was just a hipri owner of @node, so
23355+			   nr_hipri_owners has to be greater than zero. */
23356+ assert("nikita-1835", node->lock.nr_hipri_owners > 0);
23357+ node->lock.nr_hipri_owners--;
23358+			/* If we have a deadlock condition, adjust the
23359+			   nr_signaled field. It is enough to set the "signaled"
23360+			   flag only for the current process; other low-pri
23361+			   owners will be signaled and woken up after the
23362+			   current process unlocks this object and any
23363+			   high-priority requestor takes control. */
23364+ if (check_deadlock_condition(node)
23365+ && !handle->signaled) {
23366+ handle->signaled = 1;
23367+ atomic_inc(&owner->nr_signaled);
23368+ }
23369+ spin_unlock_zlock(&node->lock);
23370+ handle = list_entry(handle->locks_link.next, lock_handle, locks_link);
23371+ }
23372+ owner->curpri = 0;
23373+ }
23374+}
23375+
23376+static void remove_lock_request(lock_stack * requestor)
23377+{
23378+ zlock * lock = &requestor->request.node->lock;
23379+
23380+ if (requestor->curpri) {
23381+ assert("nikita-1838", lock->nr_hipri_requests > 0);
23382+ lock->nr_hipri_requests--;
23383+ if (requestor->request.mode == ZNODE_WRITE_LOCK)
23384+ lock->nr_hipri_write_requests --;
23385+ }
23386+ list_del(&requestor->requestors_link);
23387+}
23388+
23389+
23390+static void invalidate_all_lock_requests(znode * node)
23391+{
23392+ lock_stack *requestor, *tmp;
23393+
23394+ assert_spin_locked(&(node->lock.guard));
23395+
23396+ list_for_each_entry_safe(requestor, tmp, &node->lock.requestors, requestors_link) {
23397+ remove_lock_request(requestor);
23398+ requestor->request.ret_code = -EINVAL;
23399+ reiser4_wake_up(requestor);
23400+ requestor->request.mode = ZNODE_NO_LOCK;
23401+ }
23402+}
23403+
23404+static void dispatch_lock_requests(znode * node)
23405+{
23406+ lock_stack *requestor, *tmp;
23407+
23408+ assert_spin_locked(&(node->lock.guard));
23409+
23410+ list_for_each_entry_safe(requestor, tmp, &node->lock.requestors, requestors_link) {
23411+ if (znode_is_write_locked(node))
23412+ break;
23413+ if (!can_lock_object(requestor)) {
23414+ lock_object(requestor);
23415+ remove_lock_request(requestor);
23416+ requestor->request.ret_code = 0;
23417+ reiser4_wake_up(requestor);
23418+ requestor->request.mode = ZNODE_NO_LOCK;
23419+ }
23420+ }
23421+}
23422+
23423+/* release long-term lock, acquired by longterm_lock_znode() */
23424+void longterm_unlock_znode(lock_handle * handle)
23425+{
23426+ znode *node = handle->node;
23427+ lock_stack *oldowner = handle->owner;
23428+ int hipri;
23429+ int readers;
23430+ int rdelta;
23431+ int youdie;
23432+
23433+ /*
23434+ * this is time-critical and highly optimized code. Modify carefully.
23435+ */
23436+
23437+ assert("jmacd-1021", handle != NULL);
23438+ assert("jmacd-1022", handle->owner != NULL);
23439+ assert("nikita-1392", LOCK_CNT_GTZ(long_term_locked_znode));
23440+
23441+ assert("zam-130", oldowner == get_current_lock_stack());
23442+
23443+ LOCK_CNT_DEC(long_term_locked_znode);
23444+
23445+ /*
23446+ * to minimize amount of operations performed under lock, pre-compute
23447+ * all variables used within critical section. This makes code
23448+ * obscure.
23449+ */
23450+
23451+ /* was this lock of hi or lo priority */
23452+ hipri = oldowner->curpri ? -1 : 0;
23453+ /* number of readers */
23454+ readers = node->lock.nr_readers;
23455+ /* +1 if write lock, -1 if read lock */
23456+ rdelta = (readers > 0) ? -1 : +1;
23457+ /* true if node is to die and write lock is released */
23458+ youdie = ZF_ISSET(node, JNODE_HEARD_BANSHEE) && (readers < 0);
23459+
23460+ spin_lock_zlock(&node->lock);
23461+
23462+ assert("zam-101", znode_is_locked(node));
23463+
23464+ /* Adjust a number of high priority owners of this lock */
23465+ node->lock.nr_hipri_owners += hipri;
23466+ assert("nikita-1836", node->lock.nr_hipri_owners >= 0);
23467+
23468+ /* Handle znode deallocation on last write-lock release. */
23469+ if (znode_is_wlocked_once(node)) {
23470+ if (youdie) {
23471+ forget_znode(handle);
23472+ assert("nikita-2191", znode_invariant(node));
23473+ zput(node);
23474+ return;
23475+ }
23476+ }
23477+
23478+ if (handle->signaled)
23479+ atomic_dec(&oldowner->nr_signaled);
23480+
23481+ /* Unlocking means owner<->object link deletion */
23482+ unlink_object(handle);
23483+
23484+ /* This is enough to be sure whether an object is completely
23485+ unlocked. */
23486+ node->lock.nr_readers += rdelta;
23487+
23488+ /* If the node is locked it must have an owners list. Likewise, if
23489+ the node is unlocked it must have an empty owners list. */
23490+ assert("zam-319", equi(znode_is_locked(node),
23491+ !list_empty_careful(&node->lock.owners)));
23492+
23493+#if REISER4_DEBUG
23494+ if (!znode_is_locked(node))
23495+ ++node->times_locked;
23496+#endif
23497+
23498+ /* If there are pending lock requests we wake up a requestor */
23499+ if (!znode_is_wlocked(node))
23500+ dispatch_lock_requests(node);
23501+ if (check_deadlock_condition(node))
23502+ wake_up_all_lopri_owners(node);
23503+ spin_unlock_zlock(&node->lock);
23504+
23505+ /* minus one reference from handle->node */
23506+ assert("nikita-2190", znode_invariant(node));
23507+ ON_DEBUG(check_lock_data());
23508+ ON_DEBUG(check_lock_node_data(node));
23509+ zput(node);
23510+}
23511+
23512+/* final portion of longterm-lock */
23513+static int
23514+lock_tail(lock_stack * owner, int ok, znode_lock_mode mode)
23515+{
23516+ znode *node = owner->request.node;
23517+
23518+ assert_spin_locked(&(node->lock.guard));
23519+
23520+ /* If we broke with (ok == 0) it means we can_lock, now do it. */
23521+ if (ok == 0) {
23522+ lock_object(owner);
23523+ owner->request.mode = 0;
23524+ /* count a reference from lockhandle->node
23525+
23526+ znode was already referenced at the entry to this function,
23527+ hence taking spin-lock here is not necessary (see comment
23528+ in the zref()).
23529+ */
23530+ zref(node);
23531+
23532+ LOCK_CNT_INC(long_term_locked_znode);
23533+ }
23534+ spin_unlock_zlock(&node->lock);
23535+ ON_DEBUG(check_lock_data());
23536+ ON_DEBUG(check_lock_node_data(node));
23537+ return ok;
23538+}
23539+
23540+/*
23541+ * version of longterm_lock_znode() optimized for the most common case: read
23542+ * lock without any special flags. This is the kind of lock that any tree
23543+ * traversal takes on the root node of the tree, which is very frequent.
23544+ */
23545+static int longterm_lock_tryfast(lock_stack * owner)
23546+{
23547+ int result;
23548+ znode *node;
23549+ zlock *lock;
23550+
23551+ node = owner->request.node;
23552+ lock = &node->lock;
23553+
23554+ assert("nikita-3340", schedulable());
23555+ assert("nikita-3341", request_is_deadlock_safe(node,
23556+ ZNODE_READ_LOCK,
23557+ ZNODE_LOCK_LOPRI));
23558+ spin_lock_zlock(lock);
23559+ result = can_lock_object(owner);
23560+ spin_unlock_zlock(lock);
23561+
23562+ if (likely(result != -EINVAL)) {
23563+ spin_lock_znode(node);
23564+ result = try_capture(ZJNODE(node), ZNODE_READ_LOCK, 0);
23565+ spin_unlock_znode(node);
23566+ spin_lock_zlock(lock);
23567+ if (unlikely(result != 0)) {
23568+ owner->request.mode = 0;
23569+ } else {
23570+ result = can_lock_object(owner);
23571+ if (unlikely(result == -E_REPEAT)) {
23572+ /* fall back to longterm_lock_znode() */
23573+ spin_unlock_zlock(lock);
23574+ return 1;
23575+ }
23576+ }
23577+ return lock_tail(owner, result, ZNODE_READ_LOCK);
23578+ } else
23579+ return 1;
23580+}
23581+
23582+/* locks given lock object */
23583+int longterm_lock_znode(
23584+ /* local link object (allocated by lock owner thread, usually on its own
23585+ * stack) */
23586+ lock_handle * handle,
23587+ /* znode we want to lock. */
23588+ znode * node,
23589+ /* {ZNODE_READ_LOCK, ZNODE_WRITE_LOCK}; */
23590+ znode_lock_mode mode,
23591+ /* {0, -EINVAL, -E_DEADLOCK}, see return codes description. */
23592+ znode_lock_request request) {
23593+ int ret;
23594+ int hipri = (request & ZNODE_LOCK_HIPRI) != 0;
23595+ int non_blocking = 0;
23596+ int has_atom;
23597+ txn_capture cap_flags;
23598+ zlock *lock;
23599+ txn_handle *txnh;
23600+ tree_level level;
23601+
23602+ /* Get current process context */
23603+ lock_stack *owner = get_current_lock_stack();
23604+
23605+ /* Check that the lock handle is initialized and isn't already being
23606+ * used. */
23607+ assert("jmacd-808", handle->owner == NULL);
23608+ assert("nikita-3026", schedulable());
23609+ assert("nikita-3219", request_is_deadlock_safe(node, mode, request));
23610+ assert("zam-1056", atomic_read(&ZJNODE(node)->x_count) > 0);
23611+ /* long term locks are not allowed in the VM contexts (->writepage(),
23612+ * prune_{d,i}cache()).
23613+ *
23614+ * FIXME this doesn't work due to unused-dentry-with-unlinked-inode
23615+ * bug caused by d_splice_alias() only working for directories.
23616+ */
23617+ assert("nikita-3547", 1 || ((current->flags & PF_MEMALLOC) == 0));
23618+ assert ("zam-1055", mode != ZNODE_NO_LOCK);
23619+
23620+ cap_flags = 0;
23621+ if (request & ZNODE_LOCK_NONBLOCK) {
23622+ cap_flags |= TXN_CAPTURE_NONBLOCKING;
23623+ non_blocking = 1;
23624+ }
23625+
23626+ if (request & ZNODE_LOCK_DONT_FUSE)
23627+ cap_flags |= TXN_CAPTURE_DONT_FUSE;
23628+
23629+ /* If we are changing our process priority we must adjust a number
23630+ of high priority owners for each znode that we already lock */
23631+ if (hipri) {
23632+ set_high_priority(owner);
23633+ } else {
23634+ set_low_priority(owner);
23635+ }
23636+
23637+ level = znode_get_level(node);
23638+
23639+ /* Fill request structure with our values. */
23640+ owner->request.mode = mode;
23641+ owner->request.handle = handle;
23642+ owner->request.node = node;
23643+
23644+ txnh = get_current_context()->trans;
23645+ lock = &node->lock;
23646+
23647+ if (mode == ZNODE_READ_LOCK && request == 0) {
23648+ ret = longterm_lock_tryfast(owner);
23649+ if (ret <= 0)
23650+ return ret;
23651+ }
23652+
23653+ has_atom = (txnh->atom != NULL);
23654+
23655+ /* Synchronize on node's zlock guard lock. */
23656+ spin_lock_zlock(lock);
23657+
23658+ if (znode_is_locked(node) &&
23659+ mode == ZNODE_WRITE_LOCK && recursive(owner))
23660+ return lock_tail(owner, 0, mode);
23661+
23662+ for (;;) {
23663+		/* Check the lock's availability: if it is unavailable we get
23664+ E_REPEAT, 0 indicates "can_lock", otherwise the node is
23665+ invalid. */
23666+ ret = can_lock_object(owner);
23667+
23668+ if (unlikely(ret == -EINVAL)) {
23669+ /* @node is dying. Leave it alone. */
23670+ break;
23671+ }
23672+
23673+ if (unlikely(ret == -E_REPEAT && non_blocking)) {
23674+ /* either locking of @node by the current thread will
23675+ * lead to the deadlock, or lock modes are
23676+ * incompatible. */
23677+ break;
23678+ }
23679+
23680+ assert("nikita-1844", (ret == 0)
23681+ || ((ret == -E_REPEAT) && !non_blocking));
23682+ /* If we can get the lock... Try to capture first before
23683+ taking the lock. */
23684+
23685+ /* first handle commonest case where node and txnh are already
23686+ * in the same atom. */
23687+ /* safe to do without taking locks, because:
23688+ *
23689+ * 1. read of aligned word is atomic with respect to writes to
23690+ * this word
23691+ *
23692+ * 2. false negatives are handled in try_capture().
23693+ *
23694+ * 3. false positives are impossible.
23695+ *
23696+ * PROOF: left as an exercise to the curious reader.
23697+ *
23698+ * Just kidding. Here is one:
23699+ *
23700+ * At the time T0 txnh->atom is stored in txnh_atom.
23701+ *
23702+ * At the time T1 node->atom is stored in node_atom.
23703+ *
23704+ * At the time T2 we observe that
23705+ *
23706+ * txnh_atom != NULL && node_atom == txnh_atom.
23707+ *
23708+ * Imagine that at this moment we acquire node and txnh spin
23709+ * lock in this order. Suppose that under spin lock we have
23710+ *
23711+ * node->atom != txnh->atom, (S1)
23712+ *
23713+ * at the time T3.
23714+ *
23715+ * txnh->atom != NULL still, because txnh is open by the
23716+ * current thread.
23717+ *
23718+ * Suppose node->atom == NULL, that is, node was un-captured
23719+ * between T1, and T3. But un-capturing of formatted node is
23720+ * always preceded by the call to invalidate_lock(), which
23721+ * marks znode as JNODE_IS_DYING under zlock spin
23722+ * lock. Contradiction, because can_lock_object() above checks
23723+ * for JNODE_IS_DYING. Hence, node->atom != NULL at T3.
23724+ *
23725+		 * Suppose that node->atom != node_atom, that is, the atom that
23726+		 * node belongs to was fused into another atom: node_atom was fused
23727+ * into node->atom. Atom of txnh was equal to node_atom at T2,
23728+ * which means that under spin lock, txnh->atom == node->atom,
23729+ * because txnh->atom can only follow fusion
23730+ * chain. Contradicts S1.
23731+ *
23732+ * The same for hypothesis txnh->atom != txnh_atom. Hence,
23733+ * node->atom == node_atom == txnh_atom == txnh->atom. Again
23734+ * contradicts S1. Hence S1 is false. QED.
23735+ *
23736+ */
23737+
23738+ if (likely(has_atom && ZJNODE(node)->atom == txnh->atom)) {
23739+ ;
23740+ } else {
23741+ /*
23742+ * unlock zlock spin lock here. It is possible for
23743+ * longterm_unlock_znode() to sneak in here, but there
23744+ * is no harm: invalidate_lock() will mark znode as
23745+ * JNODE_IS_DYING and this will be noted by
23746+ * can_lock_object() below.
23747+ */
23748+ spin_unlock_zlock(lock);
23749+ spin_lock_znode(node);
23750+ ret = try_capture(ZJNODE(node), mode, cap_flags);
23751+ spin_unlock_znode(node);
23752+ spin_lock_zlock(lock);
23753+ if (unlikely(ret != 0)) {
23754+				/* In the failure case, the txnmgr releases
23755+				   the znode's lock (or in some cases, it was
23756+				   released a while ago). There's no need to
23757+				   reacquire it, so we should return here and
23758+				   avoid releasing the lock. */
23759+ owner->request.mode = 0;
23760+ break;
23761+ }
23762+
23763+ /* Check the lock's availability again -- this is
23764+ because under some circumstances the capture code
23765+ has to release and reacquire the znode spinlock. */
23766+ ret = can_lock_object(owner);
23767+ }
23768+
23769+ /* This time, a return of (ret == 0) means we can lock, so we
23770+ should break out of the loop. */
23771+ if (likely(ret != -E_REPEAT || non_blocking)) {
23772+ break;
23773+ }
23774+
23775+ /* Lock is unavailable, we have to wait. */
23776+
23777+		/* By having semaphore initialization here we cannot lose a
23778+		   wakeup signal even if it comes after the `nr_signaled' field
23779+		   check. */
23780+ ret = prepare_to_sleep(owner);
23781+ if (unlikely(ret != 0)) {
23782+ break;
23783+ }
23784+
23785+ assert_spin_locked(&(node->lock.guard));
23786+ if (hipri) {
23787+ /* If we are going in high priority direction then
23788+ increase high priority requests counter for the
23789+ node */
23790+ lock->nr_hipri_requests++;
23791+ if (mode == ZNODE_WRITE_LOCK)
23792+ lock->nr_hipri_write_requests ++;
23793+ /* If there are no high priority owners for a node,
23794+ then immediately wake up low priority owners, so
23795+ they can detect possible deadlock */
23796+ if (lock->nr_hipri_owners == 0)
23797+ wake_up_all_lopri_owners(node);
23798+ }
23799+ list_add_tail(&owner->requestors_link, &lock->requestors);
23800+
23801+ /* Ok, here we have prepared a lock request, so unlock
23802+ a znode ... */
23803+ spin_unlock_zlock(lock);
23804+ /* ... and sleep */
23805+ go_to_sleep(owner);
23806+ if (owner->request.mode == ZNODE_NO_LOCK)
23807+ goto request_is_done;
23808+ spin_lock_zlock(lock);
23809+ if (owner->request.mode == ZNODE_NO_LOCK) {
23810+ spin_unlock_zlock(lock);
23811+ request_is_done:
23812+ if (owner->request.ret_code == 0) {
23813+ LOCK_CNT_INC(long_term_locked_znode);
23814+ zref(node);
23815+ }
23816+ return owner->request.ret_code;
23817+ }
23818+ remove_lock_request(owner);
23819+ }
23820+
23821+ return lock_tail(owner, ret, mode);
23822+}
23823+
23824+/* lock object invalidation means changing of lock object state to `INVALID'
23825+   and waiting for all other processes to cancel their lock requests. */
23826+void invalidate_lock(lock_handle * handle /* path to lock
23827+ * owner and lock
23828+ * object is being
23829+ * invalidated. */ )
23830+{
23831+ znode *node = handle->node;
23832+ lock_stack *owner = handle->owner;
23833+
23834+ assert("zam-325", owner == get_current_lock_stack());
23835+ assert("zam-103", znode_is_write_locked(node));
23836+ assert("nikita-1393", !ZF_ISSET(node, JNODE_LEFT_CONNECTED));
23837+ assert("nikita-1793", !ZF_ISSET(node, JNODE_RIGHT_CONNECTED));
23838+ assert("nikita-1394", ZF_ISSET(node, JNODE_HEARD_BANSHEE));
23839+ assert("nikita-3097", znode_is_wlocked_once(node));
23840+ assert_spin_locked(&(node->lock.guard));
23841+
23842+ if (handle->signaled)
23843+ atomic_dec(&owner->nr_signaled);
23844+
23845+ ZF_SET(node, JNODE_IS_DYING);
23846+ unlink_object(handle);
23847+ node->lock.nr_readers = 0;
23848+
23849+ invalidate_all_lock_requests(node);
23850+ spin_unlock_zlock(&node->lock);
23851+}
23852+
23853+/* Initializes lock_stack. */
23854+void init_lock_stack(lock_stack * owner /* pointer to
23855+ * allocated
23856+ * structure. */ )
23857+{
23858+ INIT_LIST_HEAD(&owner->locks);
23859+ INIT_LIST_HEAD(&owner->requestors_link);
23860+ spin_lock_init(&owner->sguard);
23861+ owner->curpri = 1;
23862+ sema_init(&owner->sema, 0);
23863+}
23864+
23865+/* Initializes lock object. */
23866+void reiser4_init_lock(zlock * lock /* pointer on allocated
23867+ * uninitialized lock object
23868+ * structure. */ )
23869+{
23870+ memset(lock, 0, sizeof(zlock));
23871+ spin_lock_init(&lock->guard);
23872+ INIT_LIST_HEAD(&lock->requestors);
23873+ INIT_LIST_HEAD(&lock->owners);
23874+}
23875+
23876+/* Transfer a lock handle (presumably so that variables can be moved between stack and
23877+ heap locations). */
23878+static void
23879+move_lh_internal(lock_handle * new, lock_handle * old, int unlink_old)
23880+{
23881+ znode *node = old->node;
23882+ lock_stack *owner = old->owner;
23883+ int signaled;
23884+
23885+	/* locks_list, modified by link_object(), is not protected by
23886+	   anything. This is valid because only the current thread ever
23887+	   modifies the locks_list of its lock_stack.
23888+	 */
23889+ assert("nikita-1827", owner == get_current_lock_stack());
23890+ assert("nikita-1831", new->owner == NULL);
23891+
23892+ spin_lock_zlock(&node->lock);
23893+
23894+ signaled = old->signaled;
23895+ if (unlink_old) {
23896+ unlink_object(old);
23897+ } else {
23898+ if (node->lock.nr_readers > 0) {
23899+ node->lock.nr_readers += 1;
23900+ } else {
23901+ node->lock.nr_readers -= 1;
23902+ }
23903+ if (signaled) {
23904+ atomic_inc(&owner->nr_signaled);
23905+ }
23906+ if (owner->curpri) {
23907+ node->lock.nr_hipri_owners += 1;
23908+ }
23909+ LOCK_CNT_INC(long_term_locked_znode);
23910+
23911+ zref(node);
23912+ }
23913+ link_object(new, owner, node);
23914+ new->signaled = signaled;
23915+
23916+ spin_unlock_zlock(&node->lock);
23917+}
23918+
23919+void move_lh(lock_handle * new, lock_handle * old)
23920+{
23921+ move_lh_internal(new, old, /*unlink_old */ 1);
23922+}
23923+
23924+void copy_lh(lock_handle * new, lock_handle * old)
23925+{
23926+ move_lh_internal(new, old, /*unlink_old */ 0);
23927+}
23928+
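/*
 * A minimal usage sketch of move_lh(), not part of the patch itself: it
 * migrates a long-term lock handle from the stack to the heap, the use
 * case the comment above alludes to. The helper name and the
 * kmalloc-based allocation are illustrative assumptions only.
 */
static lock_handle *example_move_handle_to_heap(lock_handle * on_stack)
{
	lock_handle *on_heap;

	on_heap = kmalloc(sizeof(*on_heap), get_gfp_mask());
	if (on_heap == NULL)
		return NULL;
	init_lh(on_heap);
	/* the long-term lock stays held throughout; only the handle that
	   represents it changes */
	move_lh(on_heap, on_stack);
	return on_heap;
}
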
23929+/* after getting -E_DEADLOCK we unlock znodes until this function returns false */
23930+int check_deadlock(void)
23931+{
23932+ lock_stack *owner = get_current_lock_stack();
23933+ return atomic_read(&owner->nr_signaled) != 0;
23934+}
23935+
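/*
 * An illustrative sketch, not part of the patch, of the caller-side
 * protocol described above. The array-of-handles shape is a hypothetical
 * simplification; real callers walk their own lock handles and then
 * restart the whole tree operation.
 */
static void example_deadlock_recovery(lock_handle * held, int nr_held)
{
	int i;

	/* yield locks until no "give me the lock" signals remain */
	for (i = 0; i < nr_held && check_deadlock(); i++)
		done_lh(&held[i]);
	/* now the interrupted tree operation is restarted from scratch */
}
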
23936+/* Before going to sleep we re-check "release lock" requests which might have
23937+   come from hi-pri threads. */
23938+int prepare_to_sleep(lock_stack * owner)
23939+{
23940+ assert("nikita-1847", owner == get_current_lock_stack());
23941+	/* NOTE(Zam): We cannot reset the lock semaphore here because it may
23942+	   clear a wake-up signal. The initial design was to re-check all
23943+	   conditions under which we continue locking, release locks or sleep
23944+	   until conditions are changed. However, even lock.c does not follow
23945+	   that design. So, a wake-up signal which is stored in the semaphore
23946+	   state could be lost by a semaphore reset. The less complex scheme
23947+	   without resetting the semaphore is enough not to lose wake-ups.
23948+
23949+	   if (0) {
23950+
23951+	   NOTE-NIKITA: I commented the call to sema_init() out hoping
23952+	   that it is the reason for a thread sleeping in
23953+	   down(&owner->sema) without any other thread running.
23954+
23955+	   Anyway, it is just an optimization: if the semaphore is not
23956+	   reinitialised at this point, in the worst case
23957+	   longterm_lock_znode() would have to iterate its loop once
23958+	   more.
23959+ spin_lock_stack(owner);
23960+ sema_init(&owner->sema, 0);
23961+ spin_unlock_stack(owner);
23962+ }
23963+ */
23964+
23965+ /* We return -E_DEADLOCK if one or more "give me the lock" messages are
23966+ * counted in nr_signaled */
23967+ if (unlikely(atomic_read(&owner->nr_signaled) != 0)) {
23968+ assert("zam-959", !owner->curpri);
23969+ return RETERR(-E_DEADLOCK);
23970+ }
23971+ return 0;
23972+}
23973+
23974+/* Wakes up a single thread */
23975+void __reiser4_wake_up(lock_stack * owner)
23976+{
23977+ up(&owner->sema);
23978+}
23979+
23980+/* Puts a thread to sleep */
23981+void go_to_sleep(lock_stack * owner)
23982+{
23983+ /* Well, we might sleep here, so holding of any spinlocks is no-no */
23984+ assert("nikita-3027", schedulable());
23985+ /* return down_interruptible(&owner->sema); */
23986+ down(&owner->sema);
23987+}
23988+
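/*
 * A condensed sketch, not part of the patch, of the sleep/wakeup handshake
 * built from the primitives above; longterm_lock_znode() is the real user.
 * It assumes @lock->guard is held on entry; request bookkeeping and the
 * post-wakeup re-check are elided.
 */
static int example_wait_for_lock(lock_stack * owner, zlock * lock)
{
	int ret;

	ret = prepare_to_sleep(owner);	/* may fail with -E_DEADLOCK */
	if (ret != 0)
		return ret;
	list_add_tail(&owner->requestors_link, &lock->requestors);
	spin_unlock_zlock(lock);
	/* a wakeup arriving between these two calls is remembered in the
	   semaphore, so it cannot be lost */
	go_to_sleep(owner);
	return 0;
}
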
23989+int lock_stack_isclean(lock_stack * owner)
23990+{
23991+ if (list_empty_careful(&owner->locks)) {
23992+ assert("zam-353", atomic_read(&owner->nr_signaled) == 0);
23993+ return 1;
23994+ }
23995+
23996+ return 0;
23997+}
23998+
23999+#if REISER4_DEBUG
24000+
24001+/*
24002+ * debugging functions
24003+ */
24004+
24005+static void list_check(struct list_head *head)
24006+{
24007+ struct list_head *pos;
24008+
24009+ list_for_each(pos, head)
24010+ assert("", (pos->prev != NULL && pos->next != NULL &&
24011+ pos->prev->next == pos && pos->next->prev == pos));
24012+}
24013+
24014+/* check consistency of locking data-structures hanging off the @stack */
24015+static void check_lock_stack(lock_stack * stack)
24016+{
24017+ spin_lock_stack(stack);
24018+ /* check that stack->locks is not corrupted */
24019+ list_check(&stack->locks);
24020+ spin_unlock_stack(stack);
24021+}
24022+
24023+/* check consistency of locking data structures */
24024+void check_lock_data(void)
24025+{
24026+ check_lock_stack(&get_current_context()->stack);
24027+}
24028+
24029+/* check consistency of locking data structures for @node */
24030+void check_lock_node_data(znode * node)
24031+{
24032+ spin_lock_zlock(&node->lock);
24033+ list_check(&node->lock.owners);
24034+ list_check(&node->lock.requestors);
24035+ spin_unlock_zlock(&node->lock);
24036+}
24037+
24038+/* check that a given lock request is deadlock safe. This check is, of course,
24039+ * not exhaustive. */
24040+static int
24041+request_is_deadlock_safe(znode * node, znode_lock_mode mode,
24042+ znode_lock_request request)
24043+{
24044+ lock_stack *owner;
24045+
24046+ owner = get_current_lock_stack();
24047+ /*
24048+ * check that hipri lock request is not issued when there are locked
24049+ * nodes at the higher levels.
24050+ */
24051+ if (request & ZNODE_LOCK_HIPRI && !(request & ZNODE_LOCK_NONBLOCK) &&
24052+ znode_get_level(node) != 0) {
24053+ lock_handle *item;
24054+
24055+ list_for_each_entry(item, &owner->locks, locks_link) {
24056+ znode *other;
24057+
24058+ other = item->node;
24059+
24060+ if (znode_get_level(other) == 0)
24061+ continue;
24062+ if (znode_get_level(other) > znode_get_level(node))
24063+ return 0;
24064+ }
24065+ }
24066+ return 1;
24067+}
24068+
24069+#endif
24070+
24071+/* return pointer to static storage with name of lock_mode. For
24072+ debugging */
24073+const char *lock_mode_name(znode_lock_mode lock /* lock mode to get name of */ )
24074+{
24075+ if (lock == ZNODE_READ_LOCK)
24076+ return "read";
24077+ else if (lock == ZNODE_WRITE_LOCK)
24078+ return "write";
24079+ else {
24080+ static char buf[30];
24081+
24082+ sprintf(buf, "unknown: %i", lock);
24083+ return buf;
24084+ }
24085+}
24086+
24087+/* Make Linus happy.
24088+ Local variables:
24089+ c-indentation-style: "K&R"
24090+ mode-name: "LC"
24091+ c-basic-offset: 8
24092+ tab-width: 8
24093+ fill-column: 79
24094+ End:
24095+*/
24096Index: linux-2.6.16/fs/reiser4/lock.h
24097===================================================================
24098--- /dev/null
24099+++ linux-2.6.16/fs/reiser4/lock.h
24100@@ -0,0 +1,272 @@
24101+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
24102+
24103+/* Long term locking data structures. See lock.c for details. */
24104+
24105+#ifndef __LOCK_H__
24106+#define __LOCK_H__
24107+
24108+#include "forward.h"
24109+#include "debug.h"
24110+#include "dformat.h"
24111+#include "key.h"
24112+#include "coord.h"
24113+#include "plugin/node/node.h"
24114+#include "txnmgr.h"
24115+#include "readahead.h"
24116+
24117+#include <linux/types.h>
24118+#include <linux/spinlock.h>
24119+#include <linux/pagemap.h> /* for PAGE_CACHE_SIZE */
24120+#include <asm/atomic.h>
24121+#include <asm/semaphore.h>
24122+
24123+/* Per-znode lock object */
24124+struct zlock {
24125+ spinlock_t guard;
24126+	/* The number of readers if positive; if negative, the absolute value
24127+	   is the number of recursively taken write locks. Protected by zlock spin lock. */
24128+ int nr_readers;
24129+ /* A number of processes (lock_stacks) that have this object
24130+ locked with high priority */
24131+ unsigned nr_hipri_owners;
24132+ /* A number of attempts to lock znode in high priority direction */
24133+ unsigned nr_hipri_requests;
24134+	/* A number of write lock requests in the high priority direction */
24135+	unsigned nr_hipri_write_requests;
24136+	/* A list of lock_handle objects for all lock_stacks holding this lock */
24137+	struct list_head owners;
24138+ /* A linked list of lock_stacks that wait for this lock */
24139+ struct list_head requestors;
24140+};
24141+
24142+static inline void spin_lock_zlock(zlock *lock)
24143+{
24144+ /* check that zlock is not locked */
24145+ assert("", LOCK_CNT_NIL(spin_locked_zlock));
24146+ /* check that spinlocks of lower priorities are not held */
24147+ assert("", LOCK_CNT_NIL(spin_locked_stack));
24148+
24149+ spin_lock(&lock->guard);
24150+
24151+ LOCK_CNT_INC(spin_locked_zlock);
24152+ LOCK_CNT_INC(spin_locked);
24153+}
24154+
24155+static inline void spin_unlock_zlock(zlock *lock)
24156+{
24157+ assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_zlock));
24158+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
24159+
24160+ LOCK_CNT_DEC(spin_locked_zlock);
24161+ LOCK_CNT_DEC(spin_locked);
24162+
24163+ spin_unlock(&lock->guard);
24164+}
24165+
24166+#define lock_is_locked(lock) ((lock)->nr_readers != 0)
24167+#define lock_is_rlocked(lock) ((lock)->nr_readers > 0)
24168+#define lock_is_wlocked(lock) ((lock)->nr_readers < 0)
24169+#define lock_is_wlocked_once(lock) ((lock)->nr_readers == -1)
24170+#define lock_can_be_rlocked(lock) ((lock)->nr_readers >= 0)
24171+#define lock_mode_compatible(lock, mode) \
24172+ (((mode) == ZNODE_WRITE_LOCK && !lock_is_locked(lock)) || \
24173+ ((mode) == ZNODE_READ_LOCK && lock_can_be_rlocked(lock)))
24174+
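/*
 * An illustrative sketch, not part of the patch, of the nr_readers encoding
 * tested by the macros above. The zlock is local and unshared, so its guard
 * is never taken; the concrete values are arbitrary.
 */
static inline void example_zlock_encoding(void)
{
	zlock l;

	memset(&l, 0, sizeof l);	/* nr_readers == 0: unlocked */
	l.nr_readers = 2;		/* two read locks held */
	assert("", lock_is_rlocked(&l) && !lock_is_wlocked(&l));
	l.nr_readers = -1;		/* one write lock held */
	assert("", lock_is_wlocked_once(&l));
	assert("", !lock_mode_compatible(&l, ZNODE_READ_LOCK));
}
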
24175+/* Since we have R/W znode locks we need additional bidirectional `link'
24176+ objects to implement n<->m relationship between lock owners and lock
24177+ objects. We call them `lock handles'.
24178+
24179+ Locking: see lock.c/"SHORT-TERM LOCKING"
24180+*/
24181+struct lock_handle {
24182+	/* This flag indicates that a signal to yield a lock was passed to
24183+	   the lock owner and counted in owner->nr_signaled
24184+
24185+ Locking: this is accessed under spin lock on ->node.
24186+ */
24187+ int signaled;
24188+ /* A link to owner of a lock */
24189+ lock_stack *owner;
24190+ /* A link to znode locked */
24191+ znode *node;
24192+ /* A list of all locks for a process */
24193+ struct list_head locks_link;
24194+ /* A list of all owners for a znode */
24195+ struct list_head owners_link;
24196+};
24197+
24198+typedef struct lock_request {
24199+ /* A pointer to uninitialized link object */
24200+ lock_handle *handle;
24201+ /* A pointer to the object we want to lock */
24202+ znode *node;
24203+ /* Lock mode (ZNODE_READ_LOCK or ZNODE_WRITE_LOCK) */
24204+ znode_lock_mode mode;
24205+	/* field through which dispatch_lock_requests() returns the result code */
24206+ int ret_code;
24207+} lock_request;
24208+
24209+/* A lock stack structure for accumulating locks owned by a process */
24210+struct lock_stack {
24211+ /* A guard lock protecting a lock stack */
24212+ spinlock_t sguard;
24213+ /* number of znodes which were requested by high priority processes */
24214+ atomic_t nr_signaled;
24215+ /* Current priority of a process
24216+
24217+ This is only accessed by the current thread and thus requires no
24218+ locking.
24219+ */
24220+ int curpri;
24221+ /* A list of all locks owned by this process. Elements can be added to
24222+ * this list only by the current thread. ->node pointers in this list
24223+ * can be only changed by the current thread. */
24224+ struct list_head locks;
24225+ /* When lock_stack waits for the lock, it puts itself on double-linked
24226+ requestors list of that lock */
24227+ struct list_head requestors_link;
24228+ /* Current lock request info.
24229+
24230+ This is only accessed by the current thread and thus requires no
24231+ locking.
24232+ */
24233+ lock_request request;
24234+	/* This is the lock_stack's synchronization object, used when the
24235+	   process sleeps because a lock that it wishes to add to this
24236+	   lock_stack is not immediately available. It is used instead of a
24237+	   wait_queue_t object due to locking problems (lost wakeup): a "lost
24238+	   wakeup" occurs when a process is woken up before it actually goes
24239+	   to sleep (through sleep_on()). Using a semaphore object is the
24240+	   simplest way to avoid that problem.
24241+
24242+	   A semaphore is used in the following way: only the process that is
24243+	   the owner of the lock_stack initializes it (to zero) and calls
24244+	   down(sema) on it. Usually this causes the process to sleep on the
24245+	   semaphore. Other processes may wake it up by calling up(sema). The
24246+	   advantage of a semaphore is that up() and down() calls are not
24247+	   required to preserve order. Unlike a wait_queue, it works when the
24248+	   process is woken up before getting to sleep.
24249+
24250+ NOTE-NIKITA: Transaction manager is going to have condition variables
24251+ (&kcondvar_t) anyway, so this probably will be replaced with
24252+ one in the future.
24253+
24254+	   After further discussion, Nikita has shown me that Zam's implementation
24255+	   is exactly a condition variable. The znode's {zguard,requestors_list}
24256+	   represents the condition variable and the lock_stack's {sguard,semaphore}
24257+	   guards entry and exit from the condition variable's wait queue. But the
24258+	   existing code can't just be replaced with a more general abstraction, and
24259+	   I think it's fine the way it is. */
24260+ struct semaphore sema;
24261+#if REISER4_DEBUG
24262+ int nr_locks; /* number of lock handles in the above list */
24263+#endif
24264+};
24265+
24266+
24267+/*
24268+ User-visible znode locking functions
24269+*/
24270+
24271+extern int longterm_lock_znode(lock_handle * handle,
24272+ znode * node,
24273+ znode_lock_mode mode,
24274+ znode_lock_request request);
24275+
24276+extern void longterm_unlock_znode(lock_handle * handle);
24277+
24278+extern int check_deadlock(void);
24279+
24280+extern lock_stack *get_current_lock_stack(void);
24281+
24282+extern void init_lock_stack(lock_stack * owner);
24283+extern void reiser4_init_lock(zlock * lock);
24284+
24285+static inline void init_lh(lock_handle *lh)
24286+{
24287+#if REISER4_DEBUG
24288+ memset(lh, 0, sizeof *lh);
24289+ INIT_LIST_HEAD(&lh->locks_link);
24290+ INIT_LIST_HEAD(&lh->owners_link);
24291+#else
24292+ lh->node = NULL;
24293+#endif
24294+}
24295+
24296+static inline void done_lh(lock_handle *lh)
24297+{
24298+ assert("zam-342", lh != NULL);
24299+ if (lh->node != NULL)
24300+ longterm_unlock_znode(lh);
24301+}
24302+
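/*
 * A minimal usage sketch of the API above, not part of the patch. The
 * restart policy on -E_DEADLOCK is deliberately omitted; see
 * check_deadlock() in lock.c for what a real caller does after a failure.
 */
static inline int example_read_locked_access(znode * node)
{
	lock_handle lh;
	int ret;

	init_lh(&lh);
	ret = longterm_lock_znode(&lh, node, ZNODE_READ_LOCK,
				  ZNODE_LOCK_LOPRI);
	if (ret == 0) {
		/* ... inspect the node under the read lock ... */
		done_lh(&lh);	/* releases the long-term lock */
	}
	return ret;
}
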
24303+extern void move_lh(lock_handle * new, lock_handle * old);
24304+extern void copy_lh(lock_handle * new, lock_handle * old);
24305+
24306+extern int prepare_to_sleep(lock_stack * owner);
24307+extern void go_to_sleep(lock_stack * owner);
24308+extern void __reiser4_wake_up(lock_stack * owner);
24309+
24310+extern int lock_stack_isclean(lock_stack * owner);
24311+
24312+/* zlock object state check macros: only used in assertions. Both forms imply that the
24313+ lock is held by the current thread. */
24314+extern int znode_is_write_locked(const znode *);
24315+extern void invalidate_lock(lock_handle *);
24316+
24317+/* lock ordering is: first take zlock spin lock, then lock stack spin lock */
24318+#define spin_ordering_pred_stack(stack) \
24319+ (LOCK_CNT_NIL(spin_locked_stack) && \
24320+ LOCK_CNT_NIL(spin_locked_txnmgr) && \
24321+ LOCK_CNT_NIL(spin_locked_inode) && \
24322+ LOCK_CNT_NIL(rw_locked_cbk_cache) && \
24323+ LOCK_CNT_NIL(spin_locked_super_eflush) )
24324+
24325+static inline void spin_lock_stack(lock_stack *stack)
24326+{
24327+ assert("", spin_ordering_pred_stack(stack));
24328+ spin_lock(&(stack->sguard));
24329+ LOCK_CNT_INC(spin_locked_stack);
24330+ LOCK_CNT_INC(spin_locked);
24331+}
24332+
24333+static inline void spin_unlock_stack(lock_stack *stack)
24334+{
24335+ assert_spin_locked(&(stack->sguard));
24336+ assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_stack));
24337+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
24338+ LOCK_CNT_DEC(spin_locked_stack);
24339+ LOCK_CNT_DEC(spin_locked);
24340+ spin_unlock(&(stack->sguard));
24341+}
24342+
24343+
24344+static inline void reiser4_wake_up(lock_stack * owner)
24345+{
24346+ spin_lock_stack(owner);
24347+ __reiser4_wake_up(owner);
24348+ spin_unlock_stack(owner);
24349+}
24350+
24351+const char *lock_mode_name(znode_lock_mode lock);
24352+
24353+#if REISER4_DEBUG
24354+extern void check_lock_data(void);
24355+extern void check_lock_node_data(znode * node);
24356+#else
24357+#define check_lock_data() noop
24358+#define check_lock_node_data(node) noop
24359+#endif
24360+
24361+/* __LOCK_H__ */
24362+#endif
24363+
24364+/* Make Linus happy.
24365+ Local variables:
24366+ c-indentation-style: "K&R"
24367+ mode-name: "LC"
24368+ c-basic-offset: 8
24369+ tab-width: 8
24370+ fill-column: 120
24371+ End:
24372+*/
24373Index: linux-2.6.16/fs/reiser4/oid.c
24374===================================================================
24375--- /dev/null
24376+++ linux-2.6.16/fs/reiser4/oid.c
24377@@ -0,0 +1,141 @@
24378+/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */
24379+
24380+#include "debug.h"
24381+#include "super.h"
24382+#include "txnmgr.h"
24383+
24384+/* We used to have an oid allocation plugin. It was removed because it
24385+   was recognized as providing an unneeded level of abstraction. If anyone
24386+   ever finds it useful, look at yet_unneeded_abstractions/oid
24387+*/
24388+
24389+/*
24390+ * initialize in-memory data for oid allocator at @super. @nr_files and @next
24391+ * are provided by disk format plugin that reads them from the disk during
24392+ * mount.
24393+ */
24394+int oid_init_allocator(struct super_block *super, oid_t nr_files, oid_t next)
24395+{
24396+ reiser4_super_info_data *sbinfo;
24397+
24398+ sbinfo = get_super_private(super);
24399+
24400+ sbinfo->next_to_use = next;
24401+ sbinfo->oids_in_use = nr_files;
24402+ return 0;
24403+}
24404+
24405+/*
24406+ * allocate oid and return it. ABSOLUTE_MAX_OID is returned when allocator
24407+ * runs out of oids.
24408+ */
24409+oid_t oid_allocate(struct super_block * super)
24410+{
24411+ reiser4_super_info_data *sbinfo;
24412+ oid_t oid;
24413+
24414+ sbinfo = get_super_private(super);
24415+
24416+ spin_lock_reiser4_super(sbinfo);
24417+ if (sbinfo->next_to_use != ABSOLUTE_MAX_OID) {
24418+ oid = sbinfo->next_to_use++;
24419+ sbinfo->oids_in_use++;
24420+ } else
24421+ oid = ABSOLUTE_MAX_OID;
24422+ spin_unlock_reiser4_super(sbinfo);
24423+ return oid;
24424+}
24425+
24426+/*
24427+ * Tell oid allocator that @oid is now free.
24428+ */
24429+int oid_release(struct super_block *super, oid_t oid UNUSED_ARG)
24430+{
24431+ reiser4_super_info_data *sbinfo;
24432+
24433+ sbinfo = get_super_private(super);
24434+
24435+ spin_lock_reiser4_super(sbinfo);
24436+ sbinfo->oids_in_use--;
24437+ spin_unlock_reiser4_super(sbinfo);
24438+ return 0;
24439+}
24440+
24441+/*
24442+ * return next @oid that would be allocated (i.e., returned by oid_allocate())
24443+ * without actually allocating it. This is used by disk format plugin to save
24444+ * oid allocator state on the disk.
24445+ */
24446+oid_t oid_next(const struct super_block * super)
24447+{
24448+ reiser4_super_info_data *sbinfo;
24449+ oid_t oid;
24450+
24451+ sbinfo = get_super_private(super);
24452+
24453+ spin_lock_reiser4_super(sbinfo);
24454+ oid = sbinfo->next_to_use;
24455+ spin_unlock_reiser4_super(sbinfo);
24456+ return oid;
24457+}
24458+
24459+/*
24460+ * returns number of currently used oids. This is used by statfs(2) to report
24461+ * number of "inodes" and by disk format plugin to save oid allocator state on
24462+ * the disk.
24463+ */
24464+long oids_used(const struct super_block *super)
24465+{
24466+ reiser4_super_info_data *sbinfo;
24467+ oid_t used;
24468+
24469+ sbinfo = get_super_private(super);
24470+
24471+ spin_lock_reiser4_super(sbinfo);
24472+ used = sbinfo->oids_in_use;
24473+ spin_unlock_reiser4_super(sbinfo);
24474+ if (used < (__u64) ((long)~0) >> 1)
24475+ return (long)used;
24476+ else
24477+ return (long)-1;
24478+}
24479+
24480+/*
24481+ * Count oid as allocated in atom. This is done after call to oid_allocate()
24482+ * at the point when we are irrevocably committed to creation of the new file
24483+ * (i.e., when oid allocation cannot be any longer rolled back due to some
24484+ * error).
24485+ */
24486+void oid_count_allocated(void)
24487+{
24488+ txn_atom *atom;
24489+
24490+ atom = get_current_atom_locked();
24491+ atom->nr_objects_created++;
24492+ spin_unlock_atom(atom);
24493+}
24494+
24495+/*
24496+ * Count oid as free in atom. This is done after call to oid_release() at the
24497+ * point when we are irrevocably committed to the deletion of the file (i.e.,
24498+ * when oid release cannot be any longer rolled back due to some error).
24499+ */
24500+void oid_count_released(void)
24501+{
24502+ txn_atom *atom;
24503+
24504+ atom = get_current_atom_locked();
24505+ atom->nr_objects_deleted++;
24506+ spin_unlock_atom(atom);
24507+}
24508+
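/*
 * An illustrative sketch, not part of the patch, of how the helpers above
 * compose during file creation. insert_new_file() is a hypothetical
 * placeholder for the real object-creation path.
 */
static int example_create_file(struct super_block *super)
{
	oid_t oid;

	oid = oid_allocate(super);
	if (oid == ABSOLUTE_MAX_OID)
		return RETERR(-ENOSPC);
	if (insert_new_file(super, oid) != 0) {	/* hypothetical helper */
		/* creation failed; the allocation can still be rolled back */
		oid_release(super, oid);
		return RETERR(-EIO);
	}
	/* past this point creation is irrevocable: count the oid in the
	   current atom */
	oid_count_allocated();
	return 0;
}
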
24509+/*
24510+ Local variables:
24511+ c-indentation-style: "K&R"
24512+ mode-name: "LC"
24513+ c-basic-offset: 8
24514+ tab-width: 8
24515+ fill-column: 120
24516+ scroll-step: 1
24517+ End:
24518+*/
24519Index: linux-2.6.16/fs/reiser4/page_cache.c
24520===================================================================
24521--- /dev/null
24522+++ linux-2.6.16/fs/reiser4/page_cache.c
24523@@ -0,0 +1,712 @@
24524+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
24525+ * reiser4/README */
24526+
24527+/* Memory pressure hooks. Fake inodes handling. */
24528+/* We store all file system meta data (and data, of course) in the page cache.
24529+
24530+   What does this mean? Instead of using bread/brelse we create a special
24531+   "fake" inode (one per super block) and store the content of formatted
24532+   nodes in pages bound to this inode in the page cache. In newer kernels
24533+   bread() already uses an inode attached to the block device (bd_inode).
24534+   The advantage of having our own fake inode is that we can install
24535+   appropriate methods in its address_space operations. Such methods are
24536+   called by the VM on memory pressure (or during background page flushing)
24537+   and we can use them to react appropriately.
24538+
24539+   In the initial version we only support one block per page. Support for
24540+   multiple blocks per page is complicated by relocation.
24541+
24542+   To each page used by reiser4 a jnode is attached. A jnode is analogous to a
24543+   buffer head. The difference is that a jnode is bound to the page permanently:
24544+   a jnode cannot be removed from memory until its backing page is.
24545+
24546+   A jnode contains a pointer to the page (->pg field) and the page contains
24547+   a pointer to the jnode in its ->private field. The pointer from jnode to
24548+   page is protected by the jnode's spinlock and the pointer from page to
24549+   jnode is protected by the page lock (PG_locked bit). Lock ordering is:
24550+   first take the page lock, then the jnode spin lock. To go in the reverse
24551+   direction use jnode_lock_page(), which uses the standard try-lock-and-release device.
24552+
24553+ Properties:
24554+
24555+ 1. when jnode-to-page mapping is established (by jnode_attach_page()), page
24556+ reference counter is increased.
24557+
24558+ 2. when jnode-to-page mapping is destroyed (by jnode_detach_page() and
24559+ page_detach_jnode()), page reference counter is decreased.
24560+
24561+ 3. on jload() reference counter on jnode page is increased, page is
24562+ kmapped and `referenced'.
24563+
24564+ 4. on jrelse() inverse operations are performed.
24565+
24566+ 5. kmapping/kunmapping of unformatted pages is done by read/write methods.
24567+
24568+ DEADLOCKS RELATED TO MEMORY PRESSURE. [OUTDATED. Only interesting
24569+ historically.]
24570+
24571+ [In the following discussion, `lock' invariably means long term lock on
24572+ znode.] (What about page locks?)
24573+
24574+   There is a special class of deadlock possibilities related to memory
24575+   pressure. Locks acquired by other reiser4 threads are accounted for in
24576+   the deadlock prevention mechanism (lock.c), but when ->vm_writeback() is
24577+   invoked an additional hidden arc is added to the locking graph: the thread
24578+   that tries to allocate memory waits for ->vm_writeback() to finish. If this
24579+   thread holds a lock and ->vm_writeback() tries to acquire that lock,
24580+   deadlock prevention is useless.
24581+
24582+   Another related problem is the possibility for ->vm_writeback() to run out
24583+   of memory itself. This is not a problem for ext2 and friends, because their
24584+   ->vm_writeback() implementations don't allocate much memory, but reiser4
24585+   flush is definitely able to allocate huge amounts of memory.
24586+
24587+   It seems that there is no reliable way to cope with the problems above.
24588+   Instead it was decided that ->vm_writeback() (as invoked in the kswapd
24589+ context) wouldn't perform any flushing itself, but rather should just wake
24590+ up some auxiliary thread dedicated for this purpose (or, the same thread
24591+ that does periodic commit of old atoms (ktxnmgrd.c)).
24592+
24593+ Details:
24594+
24595+   1. A page is called `reclaimable' against a particular reiser4 mount F if
24596+   this page can be ultimately released by try_to_free_pages() under the
24597+   presumptions that:
24598+
24599+ a. ->vm_writeback() for F is no-op, and
24600+
24601+ b. none of the threads accessing F are making any progress, and
24602+
24603+ c. other reiser4 mounts obey the same memory reservation protocol as F
24604+ (described below).
24605+
24606+   For example, a clean un-pinned page, or a page occupied by ext2 data, is
24607+   reclaimable against any reiser4 mount.
24608+
24609+ When there is more than one reiser4 mount in a system, condition (c) makes
24610+ reclaim-ability not easily verifiable beyond trivial cases mentioned above.
24611+
24612+ THIS COMMENT IS VALID FOR "MANY BLOCKS ON PAGE" CASE
24613+
24614+   The fake inode is used to bind formatted nodes, and each node is indexed
24615+   within the fake inode by its block number. If the block size is smaller
24616+   than the page size, it may so happen that a block mapped to a page with a
24617+   formatted node is occupied by an unformatted node or is unallocated. This
24618+   leads to some complications, because flushing the whole page can lead to
24619+   an incorrect overwrite of an unformatted node that, moreover, can be
24620+   cached in some other place as part of the file body. To avoid this,
24621+   buffers for unformatted nodes are never marked dirty. Also, pages in the
24622+   fake inode are never marked dirty. This rules out usage of ->writepage()
24623+   as a memory pressure hook. Instead, ->releasepage() is used.
24624+
24625+   Josh is concerned that page->buffer is going to die. This should not pose
24626+   a significant problem though, because we need to add some data structures
24627+   to the page anyway (jnode) and all necessary bookkeeping can be put there.
24628+
24629+*/
24630+
24631+/* Life cycle of pages/nodes.
24632+
24633+   A jnode contains a reference to the page and the page contains a reference
24634+   back to the jnode. This reference is counted in the page's ->count. Thus, a
24635+   page bound to a jnode cannot be released back into the free pool.
24636+
24637+ 1. Formatted nodes.
24638+
24639+ 1. formatted node is represented by znode. When new znode is created its
24640+ ->pg pointer is NULL initially.
24641+
24642+   2. when node content is loaded into the znode (by a call to zload()) for
24643+   the first time, the following happens (in a call to ->read_node() or
24644+   ->allocate_node()):
24645+
24646+ 1. new page is added to the page cache.
24647+
24648+ 2. this page is attached to znode and its ->count is increased.
24649+
24650+ 3. page is kmapped.
24651+
24652+   3. if more calls to zload() follow (without corresponding zrelses), the
24653+   page counter is left intact and instead ->d_count is increased in the znode.
24654+
24655+   4. each call to zrelse decreases ->d_count. When ->d_count drops to zero
24656+   ->release_node() is called and the page is kunmapped as a result.
24657+
24658+ 5. at some moment node can be captured by a transaction. Its ->x_count
24659+ is then increased by transaction manager.
24660+
24661+   6. if the node is removed from the tree (an empty node with the
24662+   JNODE_HEARD_BANSHEE bit set) the following will happen (also see the comment at the top of znode.c):
24663+
24664+   1. when the last lock is released, the node will be uncaptured from the
24665+   transaction. This releases the reference that the transaction manager
24666+   acquired at step 5.
24667+
24668+   2. when the last reference is released, zput() detects that the node is
24669+   actually deleted and calls the ->delete_node() operation. The
24670+   page_cache_delete_node() implementation detaches the jnode from the
24671+   page and releases the page.
24672+
24673+   7. otherwise (the node wasn't removed from the tree), the last reference
24674+   to the znode will be released after the transaction manager commits the
24675+   transaction the node was in. This implies squallocing of this node (see
24676+   flush.c). Nothing special happens at this point. The znode is still in
24677+   the hash table and the page is still attached to it.
24678+
24679+   8. the znode is actually removed from memory because of memory
24680+   pressure, or during umount (znodes_tree_done()). Either way, the znode is
24681+   removed by a call to zdrop(). At this moment, the page is detached from
24682+   the znode and removed from the inode address space.
24683+
24684+*/
24685+
24686+#include "debug.h"
24687+#include "dformat.h"
24688+#include "key.h"
24689+#include "txnmgr.h"
24690+#include "jnode.h"
24691+#include "znode.h"
24692+#include "block_alloc.h"
24693+#include "tree.h"
24694+#include "vfs_ops.h"
24695+#include "inode.h"
24696+#include "super.h"
24697+#include "entd.h"
24698+#include "page_cache.h"
24699+#include "ktxnmgrd.h"
24700+
24701+#include <linux/types.h>
24702+#include <linux/fs.h>
24703+#include <linux/mm.h> /* for struct page */
24704+#include <linux/swap.h> /* for struct page */
24705+#include <linux/pagemap.h>
24706+#include <linux/bio.h>
24707+#include <linux/writeback.h>
24708+#include <linux/blkdev.h>
24709+
24710+static struct bio *page_bio(struct page *, jnode *, int rw, gfp_t gfp);
24711+
24712+static struct address_space_operations formatted_fake_as_ops;
24713+
24714+static const oid_t fake_ino = 0x1;
24715+static const oid_t bitmap_ino = 0x2;
24716+static const oid_t cc_ino = 0x3;
24717+
24718+static void
24719+init_fake_inode(struct super_block *super, struct inode *fake,
24720+ struct inode **pfake)
24721+{
24722+ assert("nikita-2168", fake->i_state & I_NEW);
24723+ fake->i_mapping->a_ops = &formatted_fake_as_ops;
24724+ *pfake = fake;
24725+ /* NOTE-NIKITA something else? */
24726+ unlock_new_inode(fake);
24727+}
24728+
24729+/**
24730+ * init_formatted_fake - iget inodes for formatted nodes and bitmaps
24731+ * @super: super block to init fake inode for
24732+ *
24733+ * Initializes fake inode to which formatted nodes are bound in the page cache
24734+ * and inode for bitmaps.
24735+ */
24736+int init_formatted_fake(struct super_block *super)
24737+{
24738+ struct inode *fake;
24739+ struct inode *bitmap;
24740+ struct inode *cc;
24741+ reiser4_super_info_data *sinfo;
24742+
24743+ assert("nikita-1703", super != NULL);
24744+
24745+ sinfo = get_super_private_nocheck(super);
24746+ fake = iget_locked(super, oid_to_ino(fake_ino));
24747+
24748+ if (fake != NULL) {
24749+ init_fake_inode(super, fake, &sinfo->fake);
24750+
24751+ bitmap = iget_locked(super, oid_to_ino(bitmap_ino));
24752+ if (bitmap != NULL) {
24753+ init_fake_inode(super, bitmap, &sinfo->bitmap);
24754+
24755+ cc = iget_locked(super, oid_to_ino(cc_ino));
24756+ if (cc != NULL) {
24757+ init_fake_inode(super, cc, &sinfo->cc);
24758+ return 0;
24759+ } else {
24760+ iput(sinfo->fake);
24761+ iput(sinfo->bitmap);
24762+ sinfo->fake = NULL;
24763+ sinfo->bitmap = NULL;
24764+ }
24765+ } else {
24766+ iput(sinfo->fake);
24767+ sinfo->fake = NULL;
24768+ }
24769+ }
24770+ return RETERR(-ENOMEM);
24771+}
24772+
24773+/**
24774+ * done_formatted_fake - release inode used by formatted nodes and bitmaps
24775+ * @super: super block to release fake inodes for
24776+ *
24777+ * Releases inodes which were used as address spaces of bitmap and formatted
24778+ * nodes.
24779+ */
24780+void done_formatted_fake(struct super_block *super)
24781+{
24782+ reiser4_super_info_data *sinfo;
24783+
24784+ sinfo = get_super_private_nocheck(super);
24785+
24786+ if (sinfo->fake != NULL) {
24787+ assert("vs-1426", sinfo->fake->i_data.nrpages == 0);
24788+ iput(sinfo->fake);
24789+ sinfo->fake = NULL;
24790+ }
24791+
24792+ if (sinfo->bitmap != NULL) {
24793+ iput(sinfo->bitmap);
24794+ sinfo->bitmap = NULL;
24795+ }
24796+
24797+ if (sinfo->cc != NULL) {
24798+ iput(sinfo->cc);
24799+ sinfo->cc = NULL;
24800+ }
24801+ return;
24802+}
24803+
24804+void reiser4_wait_page_writeback(struct page *page)
24805+{
24806+ assert("zam-783", PageLocked(page));
24807+
24808+ do {
24809+ unlock_page(page);
24810+ wait_on_page_writeback(page);
24811+ lock_page(page);
24812+ } while (PageWriteback(page));
24813+}
24814+
24815+/* return tree @page is in */
24816+reiser4_tree *tree_by_page(const struct page *page /* page to query */ )
24817+{
24818+ assert("nikita-2461", page != NULL);
24819+ return &get_super_private(page->mapping->host->i_sb)->tree;
24820+}
24821+
24822+/* completion handler for single page bio-based read.
24823+
24824+ mpage_end_io_read() would also do. But it's static.
24825+
24826+*/
24827+static int
24828+end_bio_single_page_read(struct bio *bio, unsigned int bytes_done UNUSED_ARG,
24829+ int err UNUSED_ARG)
24830+{
24831+ struct page *page;
24832+
24833+ if (bio->bi_size != 0) {
24834+ warning("nikita-3332", "Truncated single page read: %i",
24835+ bio->bi_size);
24836+ return 1;
24837+ }
24838+
24839+ page = bio->bi_io_vec[0].bv_page;
24840+
24841+ if (test_bit(BIO_UPTODATE, &bio->bi_flags)) {
24842+ SetPageUptodate(page);
24843+ } else {
24844+ ClearPageUptodate(page);
24845+ SetPageError(page);
24846+ }
24847+ unlock_page(page);
24848+ bio_put(bio);
24849+ return 0;
24850+}
24851+
24852+/* completion handler for single page bio-based write.
24853+
24854+ mpage_end_io_write() would also do. But it's static.
24855+
24856+*/
24857+static int
24858+end_bio_single_page_write(struct bio *bio, unsigned int bytes_done UNUSED_ARG,
24859+ int err UNUSED_ARG)
24860+{
24861+ struct page *page;
24862+
24863+ if (bio->bi_size != 0) {
24864+ warning("nikita-3333", "Truncated single page write: %i",
24865+ bio->bi_size);
24866+ return 1;
24867+ }
24868+
24869+ page = bio->bi_io_vec[0].bv_page;
24870+
24871+ if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
24872+ SetPageError(page);
24873+ end_page_writeback(page);
24874+ bio_put(bio);
24875+ return 0;
24876+}
24877+
24878+/* ->readpage() method for formatted nodes */
24879+static int formatted_readpage(struct file *f UNUSED_ARG,
24880+ struct page *page /* page to read */ )
24881+{
24882+ assert("nikita-2412", PagePrivate(page) && jprivate(page));
24883+ return page_io(page, jprivate(page), READ, get_gfp_mask());
24884+}
24885+
24886+/**
24887+ * page_io - submit single-page bio request
24888+ * @page: page to perform io for
24889+ * @node: jnode of page
24890+ * @rw: read or write
24891+ * @gfp: gfp mask for bio allocation
24892+ *
24893+ * Submits single page read or write.
24894+ */
24895+int page_io(struct page *page, jnode *node, int rw, gfp_t gfp)
24896+{
24897+ struct bio *bio;
24898+ int result;
24899+
24900+ assert("nikita-2094", page != NULL);
24901+ assert("nikita-2226", PageLocked(page));
24902+ assert("nikita-2634", node != NULL);
24903+ assert("nikita-2893", rw == READ || rw == WRITE);
24904+
24905+	if (rw == WRITE) {
24906+ if (unlikely(page->mapping->host->i_sb->s_flags & MS_RDONLY)) {
24907+ unlock_page(page);
24908+ return 0;
24909+ }
24910+ }
24911+
24912+ bio = page_bio(page, node, rw, gfp);
24913+ if (!IS_ERR(bio)) {
24914+ if (rw == WRITE) {
24915+ SetPageWriteback(page);
24916+ unlock_page(page);
24917+ }
24918+ reiser4_submit_bio(rw, bio);
24919+ result = 0;
24920+ } else {
24921+ unlock_page(page);
24922+ result = PTR_ERR(bio);
24923+ }
24924+
24925+ return result;
24926+}
24927+
24928+/* helper function to construct bio for page */
24929+static struct bio *page_bio(struct page *page, jnode * node, int rw, gfp_t gfp)
24930+{
24931+ struct bio *bio;
24932+ assert("nikita-2092", page != NULL);
24933+ assert("nikita-2633", node != NULL);
24934+
24935+	/* Simple implementation under the assumption that blocksize == pagesize.
24936+
24937+	   We only have to submit one block, but submit_bh() would allocate a bio
24938+	   anyway, so let's use all the bells-and-whistles of the bio code.
24939+ */
24940+
24941+ bio = bio_alloc(gfp, 1);
24942+ if (bio != NULL) {
24943+ int blksz;
24944+ struct super_block *super;
24945+ reiser4_block_nr blocknr;
24946+
24947+ super = page->mapping->host->i_sb;
24948+ assert("nikita-2029", super != NULL);
24949+ blksz = super->s_blocksize;
24950+ assert("nikita-2028", blksz == (int)PAGE_CACHE_SIZE);
24951+
24952+ spin_lock_jnode(node);
24953+ blocknr = *jnode_get_io_block(node);
24954+ spin_unlock_jnode(node);
24955+
24956+ assert("nikita-2275", blocknr != (reiser4_block_nr) 0);
24957+ assert("nikita-2276", !blocknr_is_fake(&blocknr));
24958+
24959+ bio->bi_bdev = super->s_bdev;
24960+ /* fill bio->bi_sector before calling bio_add_page(), because
24961+ * q->merge_bvec_fn may want to inspect it (see
24962+		 * drivers/md/linear.c:linear_mergeable_bvec() for an example). */
24963+ bio->bi_sector = blocknr * (blksz >> 9);
24964+
24965+ if (!bio_add_page(bio, page, blksz, 0)) {
24966+ warning("nikita-3452",
24967+ "Single page bio cannot be constructed");
24968+ return ERR_PTR(RETERR(-EINVAL));
24969+ }
24970+
24971+ /* bio -> bi_idx is filled by bio_init() */
24972+ bio->bi_end_io = (rw == READ) ?
24973+ end_bio_single_page_read : end_bio_single_page_write;
24974+
24975+ return bio;
24976+ } else
24977+ return ERR_PTR(RETERR(-ENOMEM));
24978+}
24979+
24980+/* this function is internally called by jnode_make_dirty() */
24981+int set_page_dirty_internal(struct page *page)
24982+{
24983+ struct address_space *mapping;
24984+
24985+ mapping = page->mapping;
24986+ BUG_ON(mapping == NULL);
24987+
24988+ if (!TestSetPageDirty(page)) {
24989+ if (mapping_cap_account_dirty(mapping))
24990+ inc_page_state(nr_dirty);
24991+
24992+ __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
24993+ }
24994+
24995+	/* for pages of the fake inode the jnode must already be dirty here */
24996+ if (mapping->host == get_super_fake(mapping->host->i_sb))
24997+ assert("", JF_ISSET(jprivate(page), JNODE_DIRTY));
24998+ return 0;
24999+}
25000+
25001+#if REISER4_DEBUG
25002+
25003+/**
25004+ * can_hit_entd
25005+ *
25006+ * Debugging check used in reiser4_writepage(): may the current context hand the page over to the ent daemon?
25007+ */
25008+static int can_hit_entd(reiser4_context *ctx, struct super_block *s)
25009+{
25010+ if (ctx == NULL || ((unsigned long)ctx->magic) != context_magic)
25011+ return 1;
25012+ if (ctx->super != s)
25013+ return 1;
25014+ if (get_super_private(s)->entd.tsk == current)
25015+ return 0;
25016+ if (!lock_stack_isclean(&ctx->stack))
25017+ return 0;
25018+ if (ctx->trans->atom != NULL)
25019+ return 0;
25020+ return 1;
25021+}
25022+
25023+#endif
25024+
25025+/**
25026+ * reiser4_writepage - writepage of struct address_space_operations
25027+ * @page: page to write
25028+ * @wbc: writeback control passed by the VM
25029+ *
25030+ * Common memory pressure notification: reiser4 hands the page over to
25031+ * the ent daemon for writeout.
25032+ */
25033+int reiser4_writepage(struct page *page,
25034+ struct writeback_control *wbc)
25035+{
25036+ struct super_block *s;
25037+ reiser4_context *ctx;
25038+
25039+ assert("vs-828", PageLocked(page));
25040+
25041+ s = page->mapping->host->i_sb;
25042+ ctx = get_current_context_check();
25043+
25044+ assert("", can_hit_entd(ctx, s));
25045+
25046+ return write_page_by_ent(page, wbc);
25047+}
25048+
25049+/* ->set_page_dirty() method of formatted address_space */
25050+static int formatted_set_page_dirty(struct page *page)
25051+{
25052+ assert("nikita-2173", page != NULL);
25053+ BUG();
25054+ return __set_page_dirty_nobuffers(page);
25055+}
25056+
25057+/* In reiser4 the writepages method of address space operations is used to
25058+   capture into transactions those pages which are dirtied via mmap. Only
25059+   regular files can have such pages. The fake inode is used to access
25060+   formatted nodes via the page cache; as formatted nodes can never be
25061+   mmapped, the fake inode's writepages has nothing to do */
25062+static int
25063+writepages_fake(struct address_space *mapping, struct writeback_control *wbc)
25064+{
25065+ return 0;
25066+}
25067+
25068+/* address space operations for the fake inode */
25069+static struct address_space_operations formatted_fake_as_ops = {
25070+ /* Perform a writeback of a single page as a memory-freeing
25071+ * operation. */
25072+ .writepage = reiser4_writepage,
25073+ /* this is called to read formatted node */
25074+ .readpage = formatted_readpage,
25075+ /* ->sync_page() method of fake inode address space operations. Called
25076+ from wait_on_page() and lock_page().
25077+
25078+	   This is a most annoyingly misnamed method. It is actually called
25079+	   from wait_on_page_bit() and lock_page(), and its purpose is to
25080+	   start io by jabbing the device drivers.
25081+ */
25082+ .sync_page = block_sync_page,
25083+	/* Write back some dirty pages from this mapping; called during
25084+	   sync (pdflush). */
25085+ .writepages = writepages_fake,
25086+ /* Set a page dirty */
25087+ .set_page_dirty = formatted_set_page_dirty,
25088+ /* used for read-ahead. Not applicable */
25089+ .readpages = NULL,
25090+ .prepare_write = NULL,
25091+ .commit_write = NULL,
25092+ .bmap = NULL,
25093+ /* called just before page is being detached from inode mapping and
25094+ removed from memory. Called on truncate, cut/squeeze, and
25095+ umount. */
25096+ .invalidatepage = reiser4_invalidatepage,
25097+	/* this is called by shrink_cache() so that the file system can try to
25098+	   release objects (jnodes, buffers, journal heads) attached to the
25099+	   page and, possibly, make the page itself freeable.
25100+ */
25101+ .releasepage = reiser4_releasepage,
25102+ .direct_IO = NULL
25103+};
25104+
25105+/* called just before page is released (no longer used by reiser4). Callers:
25106+ jdelete() and extent2tail(). */
25107+void drop_page(struct page *page)
25108+{
25109+ assert("nikita-2181", PageLocked(page));
25110+ clear_page_dirty_for_io(page);
25111+ ClearPageUptodate(page);
25112+#if defined(PG_skipped)
25113+ ClearPageSkipped(page);
25114+#endif
25115+ if (page->mapping != NULL) {
25116+ remove_from_page_cache(page);
25117+ unlock_page(page);
25118+ page_cache_release(page);
25119+ } else
25120+ unlock_page(page);
25121+}
25122+
25123+/* this is called by truncate_jnodes_range, which in its turn is always called
25124+   after truncate_mapping_pages_range; therefore the jnode cannot have a page
25125+   here. New pages cannot be created, because truncate_jnodes_range runs with
25126+   exclusive access to the file, whereas new page creation requires
25127+   non-exclusive access */
25128+static void invalidate_unformatted(jnode * node)
25129+{
25130+ struct page *page;
25131+
25132+ spin_lock_jnode(node);
25133+ page = node->pg;
25134+ if (page) {
25135+ loff_t from, to;
25136+
25137+ page_cache_get(page);
25138+ spin_unlock_jnode(node);
25139+ /* FIXME: use truncate_complete_page instead */
25140+ from = (loff_t) page->index << PAGE_CACHE_SHIFT;
25141+ to = from + PAGE_CACHE_SIZE - 1;
25142+ truncate_inode_pages_range(page->mapping, from, to);
25143+ page_cache_release(page);
25144+ } else {
25145+ JF_SET(node, JNODE_HEARD_BANSHEE);
25146+ uncapture_jnode(node);
25147+ unhash_unformatted_jnode(node);
25148+ }
25149+}
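/* An illustration of the range arithmetic above (not part of the patch):
 * with 4K pages (PAGE_CACHE_SHIFT == 12) and page->index == 5,
 *
 *	from == 5 << 12          == 20480
 *	to   == 20480 + 4096 - 1 == 24575,
 *
 * i.e. exactly the one page backing the jnode is truncated. */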
25150+
25151+#define JNODE_GANG_SIZE (16)
25152+
25153+/* find all eflushed jnodes from range specified and invalidate them */
25154+static int
25155+truncate_jnodes_range(struct inode *inode, pgoff_t from, pgoff_t count)
25156+{
25157+ reiser4_inode *info;
25158+ int truncated_jnodes;
25159+ reiser4_tree *tree;
25160+ unsigned long index;
25161+ unsigned long end;
25162+
25163+ truncated_jnodes = 0;
25164+
25165+ info = reiser4_inode_data(inode);
25166+ tree = tree_by_inode(inode);
25167+
25168+ index = from;
25169+ end = from + count;
25170+
25171+ while (1) {
25172+ jnode *gang[JNODE_GANG_SIZE];
25173+ int taken;
25174+ int i;
25175+ jnode *node;
25176+
25177+ assert("nikita-3466", index <= end);
25178+
25179+ read_lock_tree(tree);
25180+ taken =
25181+ radix_tree_gang_lookup(jnode_tree_by_reiser4_inode(info),
25182+ (void **)gang, index,
25183+ JNODE_GANG_SIZE);
25184+ for (i = 0; i < taken; ++i) {
25185+ node = gang[i];
25186+ if (index_jnode(node) < end)
25187+ jref(node);
25188+ else
25189+ gang[i] = NULL;
25190+ }
25191+ read_unlock_tree(tree);
25192+
25193+ for (i = 0; i < taken; ++i) {
25194+ node = gang[i];
25195+ if (node != NULL) {
25196+ index = max(index, index_jnode(node));
25197+ invalidate_unformatted(node);
25198+ truncated_jnodes++;
25199+ jput(node);
25200+ } else
25201+ break;
25202+ }
25203+ if (i != taken || taken == 0)
25204+ break;
25205+ }
25206+ return truncated_jnodes;
25207+}
25208+
25209+void
25210+reiser4_invalidate_pages(struct address_space *mapping, pgoff_t from,
25211+ unsigned long count, int even_cows)
25212+{
25213+ loff_t from_bytes, count_bytes;
25214+
25215+ if (count == 0)
25216+ return;
25217+ from_bytes = ((loff_t) from) << PAGE_CACHE_SHIFT;
25218+ count_bytes = ((loff_t) count) << PAGE_CACHE_SHIFT;
25219+
25220+ unmap_mapping_range(mapping, from_bytes, count_bytes, even_cows);
25221+ truncate_inode_pages_range(mapping, from_bytes,
25222+ from_bytes + count_bytes - 1);
25223+ truncate_jnodes_range(mapping->host, from, count);
25224+}
25225+
25226+/*
25227+ * Local variables:
25228+ * c-indentation-style: "K&R"
25229+ * mode-name: "LC"
25230+ * c-basic-offset: 8
25231+ * tab-width: 8
25232+ * fill-column: 120
25233+ * scroll-step: 1
25234+ * End:
25235+ */
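/* A minimal sketch, added for illustration only, of the block-to-sector
 * arithmetic that page_bio() above relies on under its stated assumption
 * that blocksize == pagesize; example_blocknr_to_sector() is a hypothetical
 * helper, not part of reiser4. */

#include <linux/types.h>	/* for u64, sector_t */

static inline sector_t example_blocknr_to_sector(u64 blocknr, int blksz)
{
	/* blksz >> 9 is the number of 512-byte sectors per block, so
	 * block 1000 with blksz == 4096 lands at sector 1000 * 8 == 8000,
	 * matching "bio->bi_sector = blocknr * (blksz >> 9)" above. */
	return blocknr * (blksz >> 9);
}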
25236Index: linux-2.6.16/fs/reiser4/page_cache.h
25237===================================================================
25238--- /dev/null
25239+++ linux-2.6.16/fs/reiser4/page_cache.h
25240@@ -0,0 +1,62 @@
25241+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
25242+ * reiser4/README */
25243+/* Memory pressure hooks. Fake inodes handling. See page_cache.c. */
25244+
25245+#if !defined( __REISER4_PAGE_CACHE_H__ )
25246+#define __REISER4_PAGE_CACHE_H__
25247+
25248+#include "forward.h"
25249+#include "debug.h"
25250+
25251+#include <linux/fs.h> /* for struct super_block, address_space */
25252+#include <linux/mm.h> /* for struct page */
25253+#include <linux/pagemap.h> /* for lock_page() */
25254+
25255+
25256+extern int init_formatted_fake(struct super_block *);
25257+extern void done_formatted_fake(struct super_block *);
25258+
25259+extern reiser4_tree *tree_by_page(const struct page *);
25260+
25261+extern int set_page_dirty_internal(struct page *);
25262+
25263+#define reiser4_submit_bio(rw, bio) submit_bio((rw), (bio))
25264+
25265+extern void reiser4_wait_page_writeback(struct page *);
25266+static inline void lock_and_wait_page_writeback(struct page *page)
25267+{
25268+ lock_page(page);
25269+ if (unlikely(PageWriteback(page)))
25270+ reiser4_wait_page_writeback(page);
25271+}
25272+
25273+#define jprivate(page) ((jnode *)page_private(page))
25274+
25275+extern int page_io(struct page *, jnode *, int rw, gfp_t);
25276+extern void drop_page(struct page *);
25277+extern void reiser4_invalidate_pages(struct address_space *, pgoff_t from,
25278+ unsigned long count, int even_cows);
25279+extern void capture_reiser4_inodes(struct super_block *,
25280+ struct writeback_control *);
25281+
25282+#define PAGECACHE_TAG_REISER4_MOVED PAGECACHE_TAG_DIRTY
25283+
25284+#if REISER4_DEBUG
25285+extern void print_page(const char *prefix, struct page *page);
25286+#else
25287+#define print_page(prf, p) noop
25288+#endif
25289+
25290+/* __REISER4_PAGE_CACHE_H__ */
25291+#endif
25292+
25293+/* Make Linus happy.
25294+ Local variables:
25295+ c-indentation-style: "K&R"
25296+ mode-name: "LC"
25297+ c-basic-offset: 8
25298+ tab-width: 8
25299+ fill-column: 120
25300+ scroll-step: 1
25301+ End:
25302+*/
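/* A minimal usage sketch for lock_and_wait_page_writeback() declared above;
 * example_touch_page() is hypothetical and only illustrates the intended
 * locking discipline: on return from the helper the page is locked and
 * guaranteed not to be under writeback, so its contents may be modified. */

static void example_touch_page(struct page *page)
{
	lock_and_wait_page_writeback(page);
	/* ... modify page data under the page lock ... */
	unlock_page(page);
}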
25303Index: linux-2.6.16/fs/reiser4/plugin/Makefile
25304===================================================================
25305--- /dev/null
25306+++ linux-2.6.16/fs/reiser4/plugin/Makefile
25307@@ -0,0 +1,26 @@
25308+obj-$(CONFIG_REISER4_FS) += plugins.o
25309+
25310+plugins-objs := \
25311+ plugin.o \
25312+ plugin_set.o \
25313+ object.o \
25314+ inode_ops.o \
25315+ inode_ops_rename.o \
25316+ file_ops.o \
25317+ file_ops_readdir.o \
25318+ file_plugin_common.o \
25319+ dir_plugin_common.o \
25320+ digest.o \
25321+ hash.o \
25322+ fibration.o \
25323+ tail_policy.o \
25324+ regular.o
25325+
25326+obj-$(CONFIG_REISER4_FS) += item/
25327+obj-$(CONFIG_REISER4_FS) += file/
25328+obj-$(CONFIG_REISER4_FS) += dir/
25329+obj-$(CONFIG_REISER4_FS) += node/
25330+obj-$(CONFIG_REISER4_FS) += compress/
25331+obj-$(CONFIG_REISER4_FS) += space/
25332+obj-$(CONFIG_REISER4_FS) += disk_format/
25333+obj-$(CONFIG_REISER4_FS) += security/
25334Index: linux-2.6.16/fs/reiser4/plugin/cluster.c
25335===================================================================
25336--- /dev/null
25337+++ linux-2.6.16/fs/reiser4/plugin/cluster.c
25338@@ -0,0 +1,66 @@
25339+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
25340+
25341+/* Contains reiser4 cluster plugins (see
25342+ http://www.namesys.com/cryptcompress_design.html
25343+ "Concepts of clustering" for details). */
25344+
25345+#include "plugin_header.h"
25346+#include "plugin.h"
25347+#include "../inode.h"
25348+
25349+static int change_cluster(struct inode *inode, reiser4_plugin * plugin)
25350+{
25351+ int result = 0;
25352+
25353+ assert("edward-1324", inode != NULL);
25354+ assert("edward-1325", plugin != NULL);
25355+ assert("edward-1326", is_reiser4_inode(inode));
25356+ assert("edward-1327", plugin->h.type_id == REISER4_CLUSTER_PLUGIN_TYPE);
25357+
25358+ if (inode_file_plugin(inode)->h.id == DIRECTORY_FILE_PLUGIN_ID)
25359+ result = plugin_set_cluster(&reiser4_inode_data(inode)->pset,
25360+ &plugin->clust);
25361+ else
25362+ result = RETERR(-EINVAL);
25363+ return result;
25364+}
25365+
25366+static reiser4_plugin_ops cluster_plugin_ops = {
25367+ .init = NULL,
25368+ .load = NULL,
25369+ .save_len = NULL,
25370+ .save = NULL,
25371+ .change = &change_cluster
25372+};
25373+
25374+#define SUPPORT_CLUSTER(SHIFT, ID, LABEL, DESC) \
25375+ [CLUSTER_ ## ID ## _ID] = { \
25376+ .h = { \
25377+ .type_id = REISER4_CLUSTER_PLUGIN_TYPE, \
25378+ .id = CLUSTER_ ## ID ## _ID, \
25379+ .pops = &cluster_plugin_ops, \
25380+ .label = LABEL, \
25381+ .desc = DESC, \
25382+ .linkage = {NULL, NULL} \
25383+ }, \
25384+ .shift = SHIFT \
25385+ }
25386+
25387+cluster_plugin cluster_plugins[LAST_CLUSTER_ID] = {
25388+ SUPPORT_CLUSTER(16, 64K, "64K", "Large"),
25389+ SUPPORT_CLUSTER(15, 32K, "32K", "Big"),
25390+ SUPPORT_CLUSTER(14, 16K, "16K", "Average"),
25391+ SUPPORT_CLUSTER(13, 8K, "8K", "Small"),
25392+ SUPPORT_CLUSTER(12, 4K, "4K", "Minimal")
25393+};
25394+
25395+/*
25396+ Local variables:
25397+ c-indentation-style: "K&R"
25398+ mode-name: "LC"
25399+ c-basic-offset: 8
25400+ tab-width: 8
25401+ fill-column: 120
25402+ scroll-step: 1
25403+ End:
25404+*/
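/* An illustration (not part of the patch) of how the .shift values above
 * encode the logical cluster size as a power of two:
 *
 *	shift 16 -> 1 << 16 == 65536 bytes (64K, "Large")
 *	shift 15 -> 1 << 15 == 32768 bytes (32K, "Big")
 *	shift 12 -> 1 << 12 ==  4096 bytes (4K, "Minimal", a single page
 *	                                    on 4K-page configurations)
 */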
25405Index: linux-2.6.16/fs/reiser4/plugin/cluster.h
25406===================================================================
25407--- /dev/null
25408+++ linux-2.6.16/fs/reiser4/plugin/cluster.h
25409@@ -0,0 +1,316 @@
25410+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
25411+
25412+/* This file contains page/cluster index translators and offset modulators.
25413+   See http://www.namesys.com/cryptcompress_design.html for details */
25414+
25415+#if !defined( __FS_REISER4_CLUSTER_H__ )
25416+#define __FS_REISER4_CLUSTER_H__
25417+
25418+#include "../inode.h"
25419+
25420+static inline int inode_cluster_shift(struct inode *inode)
25421+{
25422+ assert("edward-92", inode != NULL);
25423+ assert("edward-93", reiser4_inode_data(inode) != NULL);
25424+
25425+ return inode_cluster_plugin(inode)->shift;
25426+}
25427+
25428+static inline unsigned cluster_nrpages_shift(struct inode *inode)
25429+{
25430+ return inode_cluster_shift(inode) - PAGE_CACHE_SHIFT;
25431+}
25432+
25433+/* cluster size in page units */
25434+static inline unsigned cluster_nrpages(struct inode *inode)
25435+{
25436+ return 1U << cluster_nrpages_shift(inode);
25437+}
25438+
25439+static inline size_t inode_cluster_size(struct inode *inode)
25440+{
25441+ assert("edward-96", inode != NULL);
25442+
25443+ return 1U << inode_cluster_shift(inode);
25444+}
25445+
25446+static inline cloff_t pg_to_clust(pgoff_t idx, struct inode *inode)
25447+{
25448+ return idx >> cluster_nrpages_shift(inode);
25449+}
25450+
25451+static inline pgoff_t clust_to_pg(cloff_t idx, struct inode *inode)
25452+{
25453+ return idx << cluster_nrpages_shift(inode);
25454+}
25455+
25456+static inline pgoff_t pg_to_clust_to_pg(pgoff_t idx, struct inode *inode)
25457+{
25458+ return clust_to_pg(pg_to_clust(idx, inode), inode);
25459+}
25460+
25461+static inline pgoff_t off_to_pg(loff_t off)
25462+{
25463+ return (off >> PAGE_CACHE_SHIFT);
25464+}
25465+
25466+static inline loff_t pg_to_off(pgoff_t idx)
25467+{
25468+ return ((loff_t) (idx) << PAGE_CACHE_SHIFT);
25469+}
25470+
25471+static inline cloff_t off_to_clust(loff_t off, struct inode *inode)
25472+{
25473+ return off >> inode_cluster_shift(inode);
25474+}
25475+
25476+static inline loff_t clust_to_off(cloff_t idx, struct inode *inode)
25477+{
25478+ return (loff_t) idx << inode_cluster_shift(inode);
25479+}
25480+
25481+static inline unsigned long count_to_nr(loff_t count, unsigned shift)
25482+{
25483+ return (count + (1UL << shift) - 1) >> shift;
25484+}
25485+
25486+/* number of pages occupied by @count bytes */
25487+static inline pgoff_t count_to_nrpages(loff_t count)
25488+{
25489+ return count_to_nr(count, PAGE_CACHE_SHIFT);
25490+}
25491+
25492+/* number of clusters occupied by @count bytes */
25493+static inline cloff_t count_to_nrclust(loff_t count, struct inode *inode)
25494+{
25495+ return count_to_nr(count, inode_cluster_shift(inode));
25496+}
25497+
25498+/* number of clusters occupied by @count pages */
25499+static inline cloff_t pgcount_to_nrclust(pgoff_t count, struct inode *inode)
25500+{
25501+ return count_to_nr(count, cluster_nrpages_shift(inode));
25502+}
25503+
25504+static inline loff_t off_to_clust_to_off(loff_t off, struct inode *inode)
25505+{
25506+ return clust_to_off(off_to_clust(off, inode), inode);
25507+}
25508+
25509+static inline pgoff_t off_to_clust_to_pg(loff_t off, struct inode *inode)
25510+{
25511+ return clust_to_pg(off_to_clust(off, inode), inode);
25512+}
25513+
25514+static inline unsigned off_to_pgoff(loff_t off)
25515+{
25516+ return off & (PAGE_CACHE_SIZE - 1);
25517+}
25518+
25519+static inline unsigned off_to_cloff(loff_t off, struct inode *inode)
25520+{
25521+ return off & ((loff_t) (inode_cluster_size(inode)) - 1);
25522+}
25523+
25524+static inline unsigned
25525+pg_to_off_to_cloff(unsigned long idx, struct inode *inode)
25526+{
25527+ return off_to_cloff(pg_to_off(idx), inode);
25528+}
25529+
25530+/* if @size != 0, returns index of the page
25531+ which contains the last byte of the file */
25532+static inline pgoff_t size_to_pg(loff_t size)
25533+{
25534+ return (size ? off_to_pg(size - 1) : 0);
25535+}
25536+
25537+/* minimal index of the page which doesn't contain
25538+ file data */
25539+static inline pgoff_t size_to_next_pg(loff_t size)
25540+{
25541+ return (size ? off_to_pg(size - 1) + 1 : 0);
25542+}
25543+
25544+/* how many bytes of a file of size @cnt can be contained
25545+   in the page of index @idx */
25546+static inline unsigned cnt_to_pgcnt(loff_t cnt, pgoff_t idx)
25547+{
25548+ if (idx > off_to_pg(cnt))
25549+ return 0;
25550+ if (idx < off_to_pg(cnt))
25551+ return PAGE_CACHE_SIZE;
25552+ return off_to_pgoff(cnt);
25553+}
25554+
25555+/* how many bytes of a file of size @cnt can be contained
25556+   in the logical cluster of index @idx */
25557+static inline unsigned cnt_to_clcnt(loff_t cnt, cloff_t idx,
25558+ struct inode *inode)
25559+{
25560+ if (idx > off_to_clust(cnt, inode))
25561+ return 0;
25562+ if (idx < off_to_clust(cnt, inode))
25563+ return inode_cluster_size(inode);
25564+ return off_to_cloff(cnt, inode);
25565+}
25566+
25567+static inline unsigned
25568+fsize_to_count(reiser4_cluster_t * clust, struct inode *inode)
25569+{
25570+ assert("edward-288", clust != NULL);
25571+ assert("edward-289", inode != NULL);
25572+
25573+ return cnt_to_clcnt(inode->i_size, clust->index, inode);
25574+}
25575+
25576+static inline int
25577+cluster_is_complete(reiser4_cluster_t * clust, struct inode * inode)
25578+{
25579+ return clust->tc.lsize == inode_cluster_size(inode);
25580+}
25581+
25582+static inline void reiser4_slide_init(reiser4_slide_t * win)
25583+{
25584+ assert("edward-1084", win != NULL);
25585+ memset(win, 0, sizeof *win);
25586+}
25587+
25588+static inline void
25589+tfm_cluster_init_act(tfm_cluster_t * tc, tfm_action act)
25590+{
25591+ assert("edward-1356", tc != NULL);
25592+ tc->act = act;
25593+}
25594+
25595+static inline void
25596+cluster_init_act(reiser4_cluster_t * clust, tfm_action act, reiser4_slide_t * window)
25596+{
25597+ assert("edward-84", clust != NULL);
25598+ memset(clust, 0, sizeof *clust);
25599+ tfm_cluster_init_act(&clust->tc, act);
25600+ clust->dstat = INVAL_DISK_CLUSTER;
25601+ clust->win = window;
25602+}
25603+
25604+static inline void
25605+cluster_init_read(reiser4_cluster_t * clust, reiser4_slide_t * window)
25606+{
25607+	cluster_init_act(clust, TFM_READ_ACT, window);
25608+}
25609+
25610+static inline void
25611+cluster_init_write(reiser4_cluster_t * clust, reiser4_slide_t * window)
25612+{
25613+	cluster_init_act(clust, TFM_WRITE_ACT, window);
25614+}
25615+
25616+static inline int dclust_get_extension(hint_t * hint)
25617+{
25618+ return hint->ext_coord.extension.ctail.shift;
25619+}
25620+
25621+static inline void dclust_set_extension(hint_t * hint)
25622+{
25623+ assert("edward-1270",
25624+ item_id_by_coord(&hint->ext_coord.coord) == CTAIL_ID);
25625+ hint->ext_coord.extension.ctail.shift =
25626+ cluster_shift_by_coord(&hint->ext_coord.coord);
25627+}
25628+
25629+static inline int hint_is_unprepped_dclust(hint_t * hint)
25630+{
25631+ return dclust_get_extension(hint) == (int)UCTAIL_SHIFT;
25632+}
25633+
25634+static inline void coord_set_between_clusters(coord_t * coord)
25635+{
25636+#if REISER4_DEBUG
25637+ int result;
25638+ result = zload(coord->node);
25639+ assert("edward-1296", !result);
25640+#endif
25641+ if (!coord_is_between_items(coord)) {
25642+ coord->between = AFTER_ITEM;
25643+ coord->unit_pos = 0;
25644+ }
25645+#if REISER4_DEBUG
25646+ zrelse(coord->node);
25647+#endif
25648+}
25649+
25650+int inflate_cluster(reiser4_cluster_t *, struct inode *);
25651+int find_cluster(reiser4_cluster_t *, struct inode *, int read, int write);
25652+void forget_cluster_pages(struct page **page, int nrpages);
25653+int flush_cluster_pages(reiser4_cluster_t *, jnode *, struct inode *);
25654+int deflate_cluster(reiser4_cluster_t *, struct inode *);
25655+void truncate_page_cluster(struct inode *inode, cloff_t start);
25656+void invalidate_hint_cluster(reiser4_cluster_t * clust);
25657+void put_hint_cluster(reiser4_cluster_t * clust, struct inode *inode,
25658+ znode_lock_mode mode);
25659+int get_disk_cluster_locked(reiser4_cluster_t * clust, struct inode *inode,
25660+ znode_lock_mode lock_mode);
25661+void reset_cluster_params(reiser4_cluster_t * clust);
25662+int set_cluster_by_page(reiser4_cluster_t * clust, struct page * page,
25663+ int count);
25664+int prepare_page_cluster(struct inode *inode, reiser4_cluster_t * clust,
25665+ int capture);
25666+void release_cluster_pages(reiser4_cluster_t *);
25667+void put_cluster_handle(reiser4_cluster_t * clust);
25668+int grab_tfm_stream(struct inode *inode, tfm_cluster_t * tc, tfm_stream_id id);
25669+int tfm_cluster_is_uptodate(tfm_cluster_t * tc);
25670+void tfm_cluster_set_uptodate(tfm_cluster_t * tc);
25671+void tfm_cluster_clr_uptodate(tfm_cluster_t * tc);
25672+
25673+/* move cluster handle to the target position
25674+ specified by the page of index @pgidx
25675+*/
25676+static inline void
25677+move_cluster_forward(reiser4_cluster_t * clust, struct inode *inode,
25678+ pgoff_t pgidx, int *progress)
25679+{
25680+ assert("edward-1297", clust != NULL);
25681+ assert("edward-1298", inode != NULL);
25682+
25683+ reset_cluster_params(clust);
25684+ if (*progress &&
25685+	    /* Hole in the indices. The hint became invalid and cannot be
25686+	       used by find_cluster_item() even if the seal/node versions
25687+	       coincide */
25688+ pg_to_clust(pgidx, inode) != clust->index + 1) {
25689+ unset_hint(clust->hint);
25690+ invalidate_hint_cluster(clust);
25691+ }
25692+ *progress = 1;
25693+ clust->index = pg_to_clust(pgidx, inode);
25694+}
25695+
25696+static inline int
25697+alloc_clust_pages(reiser4_cluster_t * clust, struct inode *inode)
25698+{
25699+ assert("edward-791", clust != NULL);
25700+ assert("edward-792", inode != NULL);
25701+ clust->pages =
25702+ kmalloc(sizeof(*clust->pages) << inode_cluster_shift(inode),
25703+ GFP_KERNEL);
25704+ if (!clust->pages)
25705+ return -ENOMEM;
25706+ return 0;
25707+}
25708+
25709+static inline void free_clust_pages(reiser4_cluster_t * clust)
25710+{
25711+ kfree(clust->pages);
25712+}
25713+
25714+#endif /* __FS_REISER4_CLUSTER_H__ */
25715+
25716+/* Make Linus happy.
25717+ Local variables:
25718+ c-indentation-style: "K&R"
25719+ mode-name: "LC"
25720+ c-basic-offset: 8
25721+ tab-width: 8
25722+ fill-column: 120
25723+ scroll-step: 1
25724+ End:
25725+*/
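/* A worked example of the translators above, assuming 4K pages
 * (PAGE_CACHE_SHIFT == 12) and the 64K cluster plugin (shift == 16):
 *
 *	cluster_nrpages_shift(inode) == 16 - 12      == 4
 *	cluster_nrpages(inode)       == 1 << 4       == 16 pages per cluster
 *	pg_to_clust(35, inode)       == 35 >> 4      == 2
 *	clust_to_pg(2, inode)        == 2 << 4       == 32 (first page of cluster 2)
 *	off_to_clust(100000, inode)  == 100000 >> 16 == 1
 *	count_to_nrpages(8193)       == (8193 + 4095) >> 12 == 3
 */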
25726Index: linux-2.6.16/fs/reiser4/plugin/compress/Makefile
25727===================================================================
25728--- /dev/null
25729+++ linux-2.6.16/fs/reiser4/plugin/compress/Makefile
25730@@ -0,0 +1,6 @@
25731+obj-$(CONFIG_REISER4_FS) += compress_plugins.o
25732+
25733+compress_plugins-objs := \
25734+ compress.o \
25735+ minilzo.o \
25736+ compress_mode.o
25737Index: linux-2.6.16/fs/reiser4/plugin/compress/compress.c
25738===================================================================
25739--- /dev/null
25740+++ linux-2.6.16/fs/reiser4/plugin/compress/compress.c
25741@@ -0,0 +1,370 @@
25742+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
25743+/* reiser4 compression transform plugins */
25744+
25745+#include "../../debug.h"
25746+#include "../../inode.h"
25747+#include "../plugin.h"
25748+#include "minilzo.h"
25749+
25750+#include <linux/config.h>
25751+#include <linux/zlib.h>
25752+#include <linux/types.h>
25753+#include <linux/hardirq.h>
25754+
25755+static int change_compression(struct inode *inode, reiser4_plugin * plugin)
25756+{
25757+ assert("edward-1316", inode != NULL);
25758+ assert("edward-1317", plugin != NULL);
25759+ assert("edward-1318", is_reiser4_inode(inode));
25760+ assert("edward-1319",
25761+ plugin->h.type_id == REISER4_COMPRESSION_PLUGIN_TYPE);
25762+ /* cannot change compression plugin of already existing object */
25763+ return RETERR(-EINVAL);
25764+}
25765+
25766+static reiser4_plugin_ops compression_plugin_ops = {
25767+ .init = NULL,
25768+ .load = NULL,
25769+ .save_len = NULL,
25770+ .save = NULL,
25771+ .change = &change_compression
25772+};
25773+
25774+/******************************************************************************/
25775+/* gzip1 compression */
25776+/******************************************************************************/
25777+
25778+#define GZIP1_DEF_LEVEL Z_BEST_SPEED
25779+#define GZIP1_DEF_WINBITS 15
25780+#define GZIP1_DEF_MEMLEVEL MAX_MEM_LEVEL
25781+
25782+static int gzip1_init(void)
25783+{
25784+ int ret = -EINVAL;
25785+#if REISER4_ZLIB
25786+ ret = 0;
25787+#endif
25788+ if (ret == -EINVAL)
25789+ warning("edward-1337", "Zlib not compiled into kernel");
25790+ return ret;
25791+}
25792+
25793+static int gzip1_overrun(unsigned src_len UNUSED_ARG)
25794+{
25795+ return 0;
25796+}
25797+
25798+static coa_t gzip1_alloc(tfm_action act)
25799+{
25800+ coa_t coa = NULL;
25801+#if REISER4_ZLIB
25802+ int ret = 0;
25803+ switch (act) {
25804+ case TFM_WRITE_ACT: /* compress */
25805+ coa = vmalloc(zlib_deflate_workspacesize());
25806+ if (!coa) {
25807+ ret = -ENOMEM;
25808+ break;
25809+ }
25810+ memset(coa, 0, zlib_deflate_workspacesize());
25811+ break;
25812+ case TFM_READ_ACT: /* decompress */
25813+ coa = vmalloc(zlib_inflate_workspacesize());
25814+ if (!coa) {
25815+ ret = -ENOMEM;
25816+ break;
25817+ }
25818+ memset(coa, 0, zlib_inflate_workspacesize());
25819+ break;
25820+ default:
25821+ impossible("edward-767",
25822+ "trying to alloc workspace for unknown tfm action");
25823+ }
25824+ if (ret) {
25825+ warning("edward-768",
25826+ "alloc workspace for gzip1 (tfm action = %d) failed\n",
25827+ act);
25828+ return ERR_PTR(ret);
25829+ }
25830+#endif
25831+ return coa;
25832+}
25833+
25834+static void gzip1_free(coa_t coa, tfm_action act)
25835+{
25836+ assert("edward-769", coa != NULL);
25837+
25838+ switch (act) {
25839+ case TFM_WRITE_ACT: /* compress */
25840+ vfree(coa);
25841+ break;
25842+ case TFM_READ_ACT: /* decompress */
25843+ vfree(coa);
25844+ break;
25845+ default:
25846+ impossible("edward-770", "unknown tfm action");
25847+ }
25848+ return;
25849+}
25850+
25851+static int gzip1_min_size_deflate(void)
25852+{
25853+ return 64;
25854+}
25855+
25856+static void
25857+gzip1_compress(coa_t coa, __u8 * src_first, unsigned src_len,
25858+ __u8 * dst_first, unsigned *dst_len)
25859+{
25860+#if REISER4_ZLIB
25861+ int ret = 0;
25862+ struct z_stream_s stream;
25863+
25864+ memset(&stream, 0, sizeof(stream));
25865+
25866+ assert("edward-842", coa != NULL);
25867+ assert("edward-875", src_len != 0);
25868+
25869+ stream.workspace = coa;
25870+ ret = zlib_deflateInit2(&stream, GZIP1_DEF_LEVEL, Z_DEFLATED,
25871+ -GZIP1_DEF_WINBITS, GZIP1_DEF_MEMLEVEL,
25872+ Z_DEFAULT_STRATEGY);
25873+ if (ret != Z_OK) {
25874+ warning("edward-771", "zlib_deflateInit2 returned %d\n", ret);
25875+ goto rollback;
25876+ }
25877+ ret = zlib_deflateReset(&stream);
25878+ if (ret != Z_OK) {
25879+ warning("edward-772", "zlib_deflateReset returned %d\n", ret);
25880+ goto rollback;
25881+ }
25882+ stream.next_in = src_first;
25883+ stream.avail_in = src_len;
25884+ stream.next_out = dst_first;
25885+ stream.avail_out = *dst_len;
25886+
25887+ ret = zlib_deflate(&stream, Z_FINISH);
25888+ if (ret != Z_STREAM_END) {
25889+ if (ret != Z_OK)
25890+ warning("edward-773",
25891+ "zlib_deflate returned %d\n", ret);
25892+ goto rollback;
25893+ }
25894+ *dst_len = stream.total_out;
25895+ return;
25896+ rollback:
25897+ *dst_len = src_len;
25898+#endif
25899+ return;
25900+}
25901+
25902+static void
25903+gzip1_decompress(coa_t coa, __u8 * src_first, unsigned src_len,
25904+ __u8 * dst_first, unsigned *dst_len)
25905+{
25906+#if REISER4_ZLIB
25907+ int ret = 0;
25908+ struct z_stream_s stream;
25909+
25910+ memset(&stream, 0, sizeof(stream));
25911+
25912+ assert("edward-843", coa != NULL);
25913+ assert("edward-876", src_len != 0);
25914+
25915+ stream.workspace = coa;
25916+ ret = zlib_inflateInit2(&stream, -GZIP1_DEF_WINBITS);
25917+ if (ret != Z_OK) {
25918+ warning("edward-774", "zlib_inflateInit2 returned %d\n", ret);
25919+ return;
25920+ }
25921+ ret = zlib_inflateReset(&stream);
25922+ if (ret != Z_OK) {
25923+ warning("edward-775", "zlib_inflateReset returned %d\n", ret);
25924+ return;
25925+ }
25926+
25927+ stream.next_in = src_first;
25928+ stream.avail_in = src_len;
25929+ stream.next_out = dst_first;
25930+ stream.avail_out = *dst_len;
25931+
25932+ ret = zlib_inflate(&stream, Z_SYNC_FLUSH);
25933+ /*
25934+ * Work around a bug in zlib, which sometimes wants to taste an extra
25935+ * byte when being used in the (undocumented) raw deflate mode.
25936+ * (From USAGI).
25937+ */
25938+ if (ret == Z_OK && !stream.avail_in && stream.avail_out) {
25939+ u8 zerostuff = 0;
25940+ stream.next_in = &zerostuff;
25941+ stream.avail_in = 1;
25942+ ret = zlib_inflate(&stream, Z_FINISH);
25943+ }
25944+ if (ret != Z_STREAM_END) {
25945+ warning("edward-776", "zlib_inflate returned %d\n", ret);
25946+ return;
25947+ }
25948+ *dst_len = stream.total_out;
25949+#endif
25950+ return;
25951+}
25952+
25953+/******************************************************************************/
25954+/* lzo1 compression */
25955+/******************************************************************************/
25956+
25957+static int lzo1_init(void)
25958+{
25959+ int ret;
25960+ ret = lzo_init();
25961+ if (ret != LZO_E_OK)
25962+ warning("edward-848", "lzo_init() failed with ret = %d\n", ret);
25963+ return ret;
25964+}
25965+
25966+static int lzo1_overrun(unsigned in_len)
25967+{
25968+ return in_len / 64 + 16 + 3;
25969+}
25970+
25971+#define LZO_HEAP_SIZE(size) \
25972+ sizeof(lzo_align_t) * (((size) + (sizeof(lzo_align_t) - 1)) / sizeof(lzo_align_t))
25973+
25974+static coa_t lzo1_alloc(tfm_action act)
25975+{
25976+ int ret = 0;
25977+ coa_t coa = NULL;
25978+
25979+ switch (act) {
25980+ case TFM_WRITE_ACT: /* compress */
25981+ coa = vmalloc(LZO_HEAP_SIZE(LZO1X_1_MEM_COMPRESS));
25982+ if (!coa) {
25983+ ret = -ENOMEM;
25984+ break;
25985+ }
25986+		memset(coa, 0, LZO_HEAP_SIZE(LZO1X_1_MEM_COMPRESS));
25986+		/* fall through: decompression needs no workspace */
25987+	case TFM_READ_ACT:	/* decompress */
25988+ break;
25989+ default:
25990+ impossible("edward-877",
25991+ "trying to alloc workspace for unknown tfm action");
25992+ }
25993+ if (ret) {
25994+ warning("edward-878",
25995+ "alloc workspace for lzo1 (tfm action = %d) failed\n",
25996+ act);
25997+ return ERR_PTR(ret);
25998+ }
25999+ return coa;
26000+}
26001+
26002+static void lzo1_free(coa_t coa, tfm_action act)
26003+{
26004+ assert("edward-879", coa != NULL);
26005+
26006+ switch (act) {
26007+ case TFM_WRITE_ACT: /* compress */
26008+ vfree(coa);
26009+ break;
26010+	case TFM_READ_ACT:	/* decompress */
26011+		impossible("edward-1304",
26012+			   "trying to free non-allocated workspace");
26012+		break;
26013+ default:
26014+ impossible("edward-880", "unknown tfm action");
26015+ }
26016+ return;
26017+}
26018+
26019+static int lzo1_min_size_deflate(void)
26020+{
26021+ return 256;
26022+}
26023+
26024+static void
26025+lzo1_compress(coa_t coa, __u8 * src_first, unsigned src_len,
26026+ __u8 * dst_first, unsigned *dst_len)
26027+{
26028+ int result;
26029+
26030+ assert("edward-846", coa != NULL);
26031+ assert("edward-847", src_len != 0);
26032+
26033+ result = lzo1x_1_compress(src_first, src_len, dst_first, dst_len, coa);
26034+ if (result != LZO_E_OK) {
26035+ warning("edward-849", "lzo1x_1_compress failed\n");
26036+ goto out;
26037+ }
26038+ if (*dst_len >= src_len) {
26039+ //warning("edward-850", "lzo1x_1_compress: incompressible data\n");
26040+ goto out;
26041+ }
26042+ return;
26043+ out:
26044+ *dst_len = src_len;
26045+ return;
26046+}
26047+
26048+static void
26049+lzo1_decompress(coa_t coa, __u8 * src_first, unsigned src_len,
26050+ __u8 * dst_first, unsigned *dst_len)
26051+{
26052+ int result;
26053+
26054+ assert("edward-851", coa == NULL);
26055+ assert("edward-852", src_len != 0);
26056+
26057+ result = lzo1x_decompress(src_first, src_len, dst_first, dst_len, NULL);
26058+ if (result != LZO_E_OK)
26059+ warning("edward-853", "lzo1x_1_decompress failed\n");
26060+ return;
26061+}
26062+
26063+compression_plugin compression_plugins[LAST_COMPRESSION_ID] = {
26064+ [LZO1_COMPRESSION_ID] = {
26065+ .h = {
26066+ .type_id = REISER4_COMPRESSION_PLUGIN_TYPE,
26067+ .id = LZO1_COMPRESSION_ID,
26068+ .pops = &compression_plugin_ops,
26069+ .label = "lzo1",
26070+ .desc = "lzo1 compression transform",
26071+ .linkage = {NULL, NULL}
26072+ },
26073+ .init = lzo1_init,
26074+ .overrun = lzo1_overrun,
26075+ .alloc = lzo1_alloc,
26076+ .free = lzo1_free,
26077+ .min_size_deflate = lzo1_min_size_deflate,
26078+ .checksum = reiser4_adler32,
26079+ .compress = lzo1_compress,
26080+ .decompress = lzo1_decompress
26081+ },
26082+ [GZIP1_COMPRESSION_ID] = {
26083+ .h = {
26084+ .type_id = REISER4_COMPRESSION_PLUGIN_TYPE,
26085+ .id = GZIP1_COMPRESSION_ID,
26086+ .pops = &compression_plugin_ops,
26087+ .label = "gzip1",
26088+ .desc = "gzip1 compression transform",
26089+ .linkage = {NULL, NULL}
26090+ },
26091+ .init = gzip1_init,
26092+ .overrun = gzip1_overrun,
26093+ .alloc = gzip1_alloc,
26094+ .free = gzip1_free,
26095+ .min_size_deflate = gzip1_min_size_deflate,
26096+ .checksum = NULL,
26097+ .compress = gzip1_compress,
26098+ .decompress = gzip1_decompress
26099+ }
26100+};
26101+
26102+/*
26103+ Local variables:
26104+ c-indentation-style: "K&R"
26105+ mode-name: "LC"
26106+ c-basic-offset: 8
26107+ tab-width: 8
26108+ fill-column: 120
26109+ scroll-step: 1
26110+ End:
26111+*/
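/* A sketch (an illustration, not part of the patch) of how the ->overrun()
 * hooks above size the worst-case output buffer. For lzo1 on a 64K logical
 * cluster:
 *
 *	lzo1_overrun(65536) == 65536 / 64 + 16 + 3 == 1043,
 *
 * so a destination stream of 65536 + 1043 bytes holds even an
 * incompressible cluster. gzip1 declares no overrun and instead detects
 * expansion via the Z_FINISH result, rolling *dst_len back to src_len. */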
26112Index: linux-2.6.16/fs/reiser4/plugin/compress/compress.h
26113===================================================================
26114--- /dev/null
26115+++ linux-2.6.16/fs/reiser4/plugin/compress/compress.h
26116@@ -0,0 +1,38 @@
26117+#if !defined( __FS_REISER4_COMPRESS_H__ )
26118+#define __FS_REISER4_COMPRESS_H__
26119+
26120+#include <linux/types.h>
26121+#include <linux/string.h>
26122+
26123+typedef enum {
26124+ TFM_READ_ACT,
26125+ TFM_WRITE_ACT,
26126+ TFM_LAST_ACT
26127+} tfm_action;
26128+
26129+/* builtin compression plugins */
26130+
26131+typedef enum {
26132+ LZO1_COMPRESSION_ID,
26133+ GZIP1_COMPRESSION_ID,
26134+ LAST_COMPRESSION_ID,
26135+} reiser4_compression_id;
26136+
26137+typedef unsigned long cloff_t;
26138+typedef void *coa_t;
26139+typedef coa_t coa_set[LAST_COMPRESSION_ID][TFM_LAST_ACT];
26140+
26141+__u32 reiser4_adler32(char *data, __u32 len);
26142+
26143+#endif /* __FS_REISER4_COMPRESS_H__ */
26144+
26145+/* Make Linus happy.
26146+ Local variables:
26147+ c-indentation-style: "K&R"
26148+ mode-name: "LC"
26149+ c-basic-offset: 8
26150+ tab-width: 8
26151+ fill-column: 120
26152+ scroll-step: 1
26153+ End:
26154+*/
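/* A sketch of how the coa_set type above appears to be indexed (an
 * assumption drawn from its shape, not stated in this header): one
 * workspace slot per (compression id, tfm action) pair, filled by the
 * corresponding plugin's ->alloc() hook, e.g.
 *
 *	coa_set ws;
 *	ws[LZO1_COMPRESSION_ID][TFM_WRITE_ACT] =
 *		compression_plugins[LZO1_COMPRESSION_ID].alloc(TFM_WRITE_ACT);
 */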
26155Index: linux-2.6.16/fs/reiser4/plugin/compress/compress_mode.c
26156===================================================================
26157--- /dev/null
26158+++ linux-2.6.16/fs/reiser4/plugin/compress/compress_mode.c
26159@@ -0,0 +1,163 @@
26160+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
26161+/* This file contains Reiser4 compression mode plugins.
26162+
26163+   A compression mode plugin is a set of handlers called by the
26164+   compressor at flush time; they implement heuristics, including ones
26165+   that avoid compressing incompressible data. See
26166+   http://www.namesys.com/cryptcompress_design.html for more details.
26167+*/
26168+#include "../../inode.h"
26169+#include "../plugin.h"
26170+
26171+static int should_deflate_test(struct inode * inode, cloff_t index)
26172+{
26173+ return !test_bit(0, &index);
26174+}
26175+
26176+static int should_deflate_none(struct inode * inode, cloff_t index)
26177+{
26178+ return 0;
26179+}
26180+
26181+static int should_deflate_common(struct inode * inode, cloff_t index)
26182+{
26183+ return compression_is_on(cryptcompress_inode_data(inode));
26184+}
26185+
26186+static int turn_off_compression(struct inode *inode, cloff_t index)
26187+{
26188+ toggle_compression(cryptcompress_inode_data(inode), 0);
26189+ return 0;
26190+}
26191+
26192+static int turn_on_compression(struct inode *inode, cloff_t index)
26193+{
26194+ toggle_compression(cryptcompress_inode_data(inode), 1);
26195+ return 0;
26196+}
26197+
26198+static int turn_off_compression_on_zero(struct inode *inode, cloff_t index)
26199+{
26200+ assert("edward-1308", inode != NULL);
26201+ if (index == 0)
26202+ toggle_compression(cryptcompress_inode_data(inode), 0);
26203+ return 0;
26204+}
26205+
26206+/* Check on lattice (COL) of some sparseness factor:
26207+   a family of adaptive compression modes which define
26208+   the following behavior:
26209+
26210+   Compression is on: try to compress everything and turn
26211+   it off whenever a cluster turns out incompressible.
26212+
26213+   Compression is off: try to compress clusters of indexes
26214+   k * FACTOR (k = 0, 1, 2, ...) and turn it back on if any
26215+   of them is compressible. */
26216+
26217+/* check if @index belongs to the one-dimensional lattice
26218+   of sparseness factor @factor */
26219+static int check_on_lattice(cloff_t index, int factor)
26220+{
26221+ return (factor ? index % factor == 0: index == 0);
26222+}
26223+
26224+#define DEFINE_CHECK_ON_LATTICE(FACTOR) \
26225+ static int check_on_lattice_ ## FACTOR (struct inode * inode, \
26226+ cloff_t index) \
26227+{ \
26228+ return should_deflate_common(inode, index) || \
26229+ check_on_lattice(index, FACTOR); \
26230+}
26231+
26232+#define SUPPORT_COL_COMPRESSION_MODE(FACTOR, LABEL) \
26233+[COL_ ## FACTOR ## _COMPRESSION_MODE_ID] = { \
26234+ .h = { \
26235+ .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE, \
26236+ .id = COL_ ## FACTOR ## _COMPRESSION_MODE_ID, \
26237+ .pops = NULL, \
26238+ .label = LABEL, \
26239+ .desc = LABEL, \
26240+ .linkage = {NULL, NULL} \
26241+ }, \
26242+ .should_deflate = check_on_lattice_ ## FACTOR, \
26243+ .accept_hook = turn_on_compression, \
26244+ .discard_hook = turn_off_compression \
26245+}
26246+
26247+DEFINE_CHECK_ON_LATTICE(8)
26248+DEFINE_CHECK_ON_LATTICE(16)
26249+DEFINE_CHECK_ON_LATTICE(32)
26250+
26251+/* compression mode_plugins */
26252+compression_mode_plugin compression_mode_plugins[LAST_COMPRESSION_MODE_ID] = {
26253+ [NONE_COMPRESSION_MODE_ID] = {
26254+ .h = {
26255+ .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
26256+ .id = NONE_COMPRESSION_MODE_ID,
26257+ .pops = NULL,
26258+ .label = "none",
26259+ .desc = "Don't compress",
26260+ .linkage = {NULL, NULL}
26261+ },
26262+ .should_deflate = should_deflate_none,
26263+ .accept_hook = NULL,
26264+ .discard_hook = NULL
26265+ },
26266+ /* Check-on-lattice adaptive compression modes */
26267+ SUPPORT_COL_COMPRESSION_MODE(8, "col8"),
26268+ SUPPORT_COL_COMPRESSION_MODE(16, "col16"),
26269+ SUPPORT_COL_COMPRESSION_MODE(32, "col32"),
26270+ /* Turn off compression if logical cluster of index == 0
26271+ is incompressible, then don't check anymore */
26272+ [COZ_COMPRESSION_MODE_ID] = {
26273+ .h = {
26274+ .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
26275+ .id = COZ_COMPRESSION_MODE_ID,
26276+ .pops = NULL,
26277+ .label = "coz",
26278+ .desc = "Check on zero",
26279+ .linkage = {NULL, NULL}
26280+ },
26281+ .should_deflate = should_deflate_common,
26282+ .accept_hook = NULL,
26283+ .discard_hook = turn_off_compression_on_zero
26284+ },
26285+ [FORCE_COMPRESSION_MODE_ID] = {
26286+ .h = {
26287+ .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
26288+ .id = FORCE_COMPRESSION_MODE_ID,
26289+ .pops = NULL,
26290+ .label = "force",
26291+ .desc = "Compress everything",
26292+ .linkage = {NULL, NULL}
26293+ },
26294+ .should_deflate = NULL,
26295+ .accept_hook = NULL,
26296+ .discard_hook = NULL
26297+ },
26298+ [TEST_COMPRESSION_MODE_ID] = {
26299+ .h = {
26300+ .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
26301+ .id = TEST_COMPRESSION_MODE_ID,
26302+ .pops = NULL,
26303+ .label = "test", /* This mode is for benchmarks only */
26304+ .desc = "Don't compress odd clusters",
26305+ .linkage = {NULL, NULL}
26306+ },
26307+ .should_deflate = should_deflate_test,
26308+ .accept_hook = NULL,
26309+ .discard_hook = NULL
26310+ }
26311+};
26312+
26313+/*
26314+ Local variables:
26315+ c-indentation-style: "K&R"
26316+ mode-name: "LC"
26317+ c-basic-offset: 8
26318+ tab-width: 8
26319+ fill-column: 120
26320+ scroll-step: 1
26321+ End:
26322+*/
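/* A worked example of the COL heuristic above (an illustration, not part
 * of the patch). With factor 8 and compression currently off,
 * check_on_lattice_8() trial-compresses only lattice clusters:
 *
 *	check_on_lattice(0, 8)  == 1	(0 % 8 == 0)
 *	check_on_lattice(7, 8)  == 0
 *	check_on_lattice(16, 8) == 1
 *
 * If a lattice cluster proves compressible, turn_on_compression() (the
 * accept_hook) re-enables compression for subsequent clusters. */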
26323Index: linux-2.6.16/fs/reiser4/plugin/compress/lzoconf.h
26324===================================================================
26325--- /dev/null
26326+++ linux-2.6.16/fs/reiser4/plugin/compress/lzoconf.h
26327@@ -0,0 +1,420 @@
26328+/* lzoconf.h -- configuration for the LZO real-time data compression library
26329+ adopted for reiser4 compression transform plugin.
26330+
26331+ This file is part of the LZO real-time data compression library
26332+ and not included in any proprietary licenses of reiser4.
26333+
26334+ Copyright (C) 2002 Markus Franz Xaver Johannes Oberhumer
26335+ Copyright (C) 2001 Markus Franz Xaver Johannes Oberhumer
26336+ Copyright (C) 2000 Markus Franz Xaver Johannes Oberhumer
26337+ Copyright (C) 1999 Markus Franz Xaver Johannes Oberhumer
26338+ Copyright (C) 1998 Markus Franz Xaver Johannes Oberhumer
26339+ Copyright (C) 1997 Markus Franz Xaver Johannes Oberhumer
26340+ Copyright (C) 1996 Markus Franz Xaver Johannes Oberhumer
26341+ All Rights Reserved.
26342+
26343+ The LZO library is free software; you can redistribute it and/or
26344+ modify it under the terms of the GNU General Public License as
26345+ published by the Free Software Foundation; either version 2 of
26346+ the License, or (at your option) any later version.
26347+
26348+ The LZO library is distributed in the hope that it will be useful,
26349+ but WITHOUT ANY WARRANTY; without even the implied warranty of
26350+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26351+ GNU General Public License for more details.
26352+
26353+ You should have received a copy of the GNU General Public License
26354+ along with the LZO library; see the file COPYING.
26355+ If not, write to the Free Software Foundation, Inc.,
26356+ 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
26357+
26358+ Markus F.X.J. Oberhumer
26359+ <markus@oberhumer.com>
26360+ http://www.oberhumer.com/opensource/lzo/
26361+ */
26362+
26363+#include <linux/kernel.h> /* for UINT_MAX, ULONG_MAX - edward */
26364+
26365+#ifndef __LZOCONF_H
26366+#define __LZOCONF_H
26367+
26368+#define LZO_VERSION 0x1080
26369+#define LZO_VERSION_STRING "1.08"
26370+#define LZO_VERSION_DATE "Jul 12 2002"
26371+
26372+/* internal Autoconf configuration file - only used when building LZO */
26373+#if defined(LZO_HAVE_CONFIG_H)
26374+# include <config.h>
26375+#endif
26376+#ifdef __cplusplus
26377+extern "C" {
26378+#endif
26379+
26380+/***********************************************************************
26381+// LZO requires a conforming <limits.h>
26382+************************************************************************/
26383+
26384+#define CHAR_BIT 8
26385+#define USHRT_MAX 0xffff
26386+
26387+/* workaround a cpp bug under hpux 10.20 */
26388+#define LZO_0xffffffffL 4294967295ul
26389+
26390+/***********************************************************************
26391+// architecture defines
26392+************************************************************************/
26393+
26394+#if !defined(__LZO_WIN) && !defined(__LZO_DOS) && !defined(__LZO_OS2)
26395+# if defined(__WINDOWS__) || defined(_WINDOWS) || defined(_Windows)
26396+# define __LZO_WIN
26397+# elif defined(__WIN32__) || defined(_WIN32) || defined(WIN32)
26398+# define __LZO_WIN
26399+# elif defined(__NT__) || defined(__NT_DLL__) || defined(__WINDOWS_386__)
26400+# define __LZO_WIN
26401+# elif defined(__DOS__) || defined(__MSDOS__) || defined(MSDOS)
26402+# define __LZO_DOS
26403+# elif defined(__OS2__) || defined(__OS2V2__) || defined(OS2)
26404+# define __LZO_OS2
26405+# elif defined(__palmos__)
26406+# define __LZO_PALMOS
26407+# elif defined(__TOS__) || defined(__atarist__)
26408+# define __LZO_TOS
26409+# endif
26410+#endif
26411+
26412+#if (UINT_MAX < LZO_0xffffffffL)
26413+# if defined(__LZO_WIN)
26414+# define __LZO_WIN16
26415+# elif defined(__LZO_DOS)
26416+# define __LZO_DOS16
26417+# elif defined(__LZO_PALMOS)
26418+# define __LZO_PALMOS16
26419+# elif defined(__LZO_TOS)
26420+# define __LZO_TOS16
26421+# elif defined(__C166__)
26422+# else
26423+ /* porting hint: for pure 16-bit architectures try compiling
26424+ * everything with -D__LZO_STRICT_16BIT */
26425+# error "16-bit target not supported - contact me for porting hints"
26426+# endif
26427+#endif
26428+
26429+#if !defined(__LZO_i386)
26430+# if defined(__LZO_DOS) || defined(__LZO_WIN16)
26431+# define __LZO_i386
26432+# elif defined(__i386__) || defined(__386__) || defined(_M_IX86)
26433+# define __LZO_i386
26434+# endif
26435+#endif
26436+
26437+#if defined(__LZO_STRICT_16BIT)
26438+# if (UINT_MAX < LZO_0xffffffffL)
26439+# include <lzo16bit.h>
26440+# endif
26441+#endif
26442+
26443+/* memory checkers */
26444+#if !defined(__LZO_CHECKER)
26445+# if defined(__BOUNDS_CHECKING_ON)
26446+# define __LZO_CHECKER
26447+# elif defined(__CHECKER__)
26448+# define __LZO_CHECKER
26449+# elif defined(__INSURE__)
26450+# define __LZO_CHECKER
26451+# elif defined(__PURIFY__)
26452+# define __LZO_CHECKER
26453+# endif
26454+#endif
26455+
26456+/***********************************************************************
26457+// integral and pointer types
26458+************************************************************************/
26459+
26460+/* Integral types with 32 bits or more */
26461+#if !defined(LZO_UINT32_MAX)
26462+# if (UINT_MAX >= LZO_0xffffffffL)
26463+ typedef unsigned int lzo_uint32;
26464+ typedef int lzo_int32;
26465+# define LZO_UINT32_MAX UINT_MAX
26466+# define LZO_INT32_MAX INT_MAX
26467+# define LZO_INT32_MIN INT_MIN
26468+# elif (ULONG_MAX >= LZO_0xffffffffL)
26469+ typedef unsigned long lzo_uint32;
26470+ typedef long lzo_int32;
26471+# define LZO_UINT32_MAX ULONG_MAX
26472+# define LZO_INT32_MAX LONG_MAX
26473+# define LZO_INT32_MIN LONG_MIN
26474+# else
26475+# error "lzo_uint32"
26476+# endif
26477+#endif
26478+
26479+/* lzo_uint is used like size_t */
26480+#if !defined(LZO_UINT_MAX)
26481+# if (UINT_MAX >= LZO_0xffffffffL)
26482+ typedef unsigned int lzo_uint;
26483+ typedef int lzo_int;
26484+# define LZO_UINT_MAX UINT_MAX
26485+# define LZO_INT_MAX INT_MAX
26486+# define LZO_INT_MIN INT_MIN
26487+# elif (ULONG_MAX >= LZO_0xffffffffL)
26488+ typedef unsigned long lzo_uint;
26489+ typedef long lzo_int;
26490+# define LZO_UINT_MAX ULONG_MAX
26491+# define LZO_INT_MAX LONG_MAX
26492+# define LZO_INT_MIN LONG_MIN
26493+# else
26494+# error "lzo_uint"
26495+# endif
26496+#endif
26497+
26498+ typedef int lzo_bool;
26499+
26500+/***********************************************************************
26501+// memory models
26502+************************************************************************/
26503+
26504+/* Memory model for the public code segment. */
26505+#if !defined(__LZO_CMODEL)
26506+# if defined(__LZO_DOS16) || defined(__LZO_WIN16)
26507+# define __LZO_CMODEL __far
26508+# elif defined(__LZO_i386) && defined(__WATCOMC__)
26509+# define __LZO_CMODEL __near
26510+# else
26511+# define __LZO_CMODEL
26512+# endif
26513+#endif
26514+
26515+/* Memory model for the public data segment. */
26516+#if !defined(__LZO_DMODEL)
26517+# if defined(__LZO_DOS16) || defined(__LZO_WIN16)
26518+# define __LZO_DMODEL __far
26519+# elif defined(__LZO_i386) && defined(__WATCOMC__)
26520+# define __LZO_DMODEL __near
26521+# else
26522+# define __LZO_DMODEL
26523+# endif
26524+#endif
26525+
26526+/* Memory model that allows accessing memory at offsets of lzo_uint. */
26527+#if !defined(__LZO_MMODEL)
26528+# if (LZO_UINT_MAX <= UINT_MAX)
26529+# define __LZO_MMODEL
26530+# elif defined(__LZO_DOS16) || defined(__LZO_WIN16)
26531+# define __LZO_MMODEL __huge
26532+# define LZO_999_UNSUPPORTED
26533+# elif defined(__LZO_PALMOS16) || defined(__LZO_TOS16)
26534+# define __LZO_MMODEL
26535+# else
26536+# error "__LZO_MMODEL"
26537+# endif
26538+#endif
26539+
26540+/* no typedef here because of const-pointer issues */
26541+#define lzo_byte unsigned char __LZO_MMODEL
26542+#define lzo_bytep unsigned char __LZO_MMODEL *
26543+#define lzo_charp char __LZO_MMODEL *
26544+#define lzo_voidp void __LZO_MMODEL *
26545+#define lzo_shortp short __LZO_MMODEL *
26546+#define lzo_ushortp unsigned short __LZO_MMODEL *
26547+#define lzo_uint32p lzo_uint32 __LZO_MMODEL *
26548+#define lzo_int32p lzo_int32 __LZO_MMODEL *
26549+#define lzo_uintp lzo_uint __LZO_MMODEL *
26550+#define lzo_intp lzo_int __LZO_MMODEL *
26551+#define lzo_voidpp lzo_voidp __LZO_MMODEL *
26552+#define lzo_bytepp lzo_bytep __LZO_MMODEL *
26553+
26554+#ifndef lzo_sizeof_dict_t
26555+# define lzo_sizeof_dict_t sizeof(lzo_bytep)
26556+#endif
26557+
26558+/***********************************************************************
26559+// calling conventions and function types
26560+************************************************************************/
26561+
26562+/* linkage */
26563+#if !defined(__LZO_EXTERN_C)
26564+# ifdef __cplusplus
26565+# define __LZO_EXTERN_C extern "C"
26566+# else
26567+# define __LZO_EXTERN_C extern
26568+# endif
26569+#endif
26570+
26571+/* calling convention */
26572+#if !defined(__LZO_CDECL)
26573+# if defined(__LZO_DOS16) || defined(__LZO_WIN16)
26574+# define __LZO_CDECL __LZO_CMODEL __cdecl
26575+# elif defined(__LZO_i386) && defined(_MSC_VER)
26576+# define __LZO_CDECL __LZO_CMODEL __cdecl
26577+# elif defined(__LZO_i386) && defined(__WATCOMC__)
26578+# define __LZO_CDECL __LZO_CMODEL __cdecl
26579+# else
26580+# define __LZO_CDECL __LZO_CMODEL
26581+# endif
26582+#endif
26583+#if !defined(__LZO_ENTRY)
26584+# define __LZO_ENTRY __LZO_CDECL
26585+#endif
26586+
26587+/* C++ exception specification for extern "C" function types */
26588+#if !defined(__cplusplus)
26589+# undef LZO_NOTHROW
26590+# define LZO_NOTHROW
26591+#elif !defined(LZO_NOTHROW)
26592+# define LZO_NOTHROW
26593+#endif
26594+
26595+ typedef int
26596+ (__LZO_ENTRY * lzo_compress_t) (const lzo_byte * src, lzo_uint src_len,
26597+ lzo_byte * dst, lzo_uintp dst_len,
26598+ lzo_voidp wrkmem);
26599+
26600+ typedef int
26601+ (__LZO_ENTRY * lzo_decompress_t) (const lzo_byte * src,
26602+ lzo_uint src_len, lzo_byte * dst,
26603+ lzo_uintp dst_len, lzo_voidp wrkmem);
26604+
26605+ typedef int
26606+ (__LZO_ENTRY * lzo_optimize_t) (lzo_byte * src, lzo_uint src_len,
26607+ lzo_byte * dst, lzo_uintp dst_len,
26608+ lzo_voidp wrkmem);
26609+
26610+ typedef int
26611+ (__LZO_ENTRY * lzo_compress_dict_t) (const lzo_byte * src,
26612+ lzo_uint src_len, lzo_byte * dst,
26613+ lzo_uintp dst_len,
26614+ lzo_voidp wrkmem,
26615+ const lzo_byte * dict,
26616+ lzo_uint dict_len);
26617+
26618+ typedef int
26619+ (__LZO_ENTRY * lzo_decompress_dict_t) (const lzo_byte * src,
26620+ lzo_uint src_len,
26621+ lzo_byte * dst,
26622+ lzo_uintp dst_len,
26623+ lzo_voidp wrkmem,
26624+ const lzo_byte * dict,
26625+ lzo_uint dict_len);
26626+
26627+/* assembler versions always use __cdecl */
26628+ typedef int
26629+ (__LZO_CDECL * lzo_compress_asm_t) (const lzo_byte * src,
26630+ lzo_uint src_len, lzo_byte * dst,
26631+ lzo_uintp dst_len,
26632+ lzo_voidp wrkmem);
26633+
26634+ typedef int
26635+ (__LZO_CDECL * lzo_decompress_asm_t) (const lzo_byte * src,
26636+ lzo_uint src_len, lzo_byte * dst,
26637+ lzo_uintp dst_len,
26638+ lzo_voidp wrkmem);
26639+
26640+/* a progress indicator callback function */
26641+ typedef void (__LZO_ENTRY * lzo_progress_callback_t) (lzo_uint,
26642+ lzo_uint);
26643+
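+
+/* Usage sketch (illustrative only; variable names are placeholders):
+ * these pointer types let a caller select a compressor at run time, e.g.
+ *
+ *     lzo_compress_t compress = lzo1x_1_compress;
+ *     int err = compress(src, src_len, dst, &dst_len, wrkmem);
+ */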
26644+/***********************************************************************
26645+// export information
26646+************************************************************************/
26647+
26648+/* DLL export information */
26649+#if !defined(__LZO_EXPORT1)
26650+# define __LZO_EXPORT1
26651+#endif
26652+#if !defined(__LZO_EXPORT2)
26653+# define __LZO_EXPORT2
26654+#endif
26655+
26656+/* exported calling convention for C functions */
26657+#if !defined(LZO_PUBLIC)
26658+# define LZO_PUBLIC(_rettype) \
26659+ __LZO_EXPORT1 _rettype __LZO_EXPORT2 __LZO_ENTRY
26660+#endif
26661+#if !defined(LZO_EXTERN)
26662+# define LZO_EXTERN(_rettype) __LZO_EXTERN_C LZO_PUBLIC(_rettype)
26663+#endif
26664+#if !defined(LZO_PRIVATE)
26665+# define LZO_PRIVATE(_rettype) static _rettype __LZO_ENTRY
26666+#endif
26667+
26668+/* exported __cdecl calling convention for assembler functions */
26669+#if !defined(LZO_PUBLIC_CDECL)
26670+# define LZO_PUBLIC_CDECL(_rettype) \
26671+ __LZO_EXPORT1 _rettype __LZO_EXPORT2 __LZO_CDECL
26672+#endif
26673+#if !defined(LZO_EXTERN_CDECL)
26674+# define LZO_EXTERN_CDECL(_rettype) __LZO_EXTERN_C LZO_PUBLIC_CDECL(_rettype)
26675+#endif
26676+
26677+/* exported global variables (LZO currently uses no static variables and
26678+ * is fully thread safe) */
26679+#if !defined(LZO_PUBLIC_VAR)
26680+# define LZO_PUBLIC_VAR(_type) \
26681+ __LZO_EXPORT1 _type __LZO_EXPORT2 __LZO_DMODEL
26682+#endif
26683+#if !defined(LZO_EXTERN_VAR)
26684+# define LZO_EXTERN_VAR(_type) extern LZO_PUBLIC_VAR(_type)
26685+#endif
26686+
26687+/***********************************************************************
26688+// error codes and prototypes
26689+************************************************************************/
26690+
26691+/* Error codes for the compression/decompression functions. Negative
26692+ * values are errors, positive values will be used for special but
26693+ * normal events.
26694+ */
26695+#define LZO_E_OK 0
26696+#define LZO_E_ERROR (-1)
26697+#define LZO_E_OUT_OF_MEMORY (-2) /* not used right now */
26698+#define LZO_E_NOT_COMPRESSIBLE (-3) /* not used right now */
26699+#define LZO_E_INPUT_OVERRUN (-4)
26700+#define LZO_E_OUTPUT_OVERRUN (-5)
26701+#define LZO_E_LOOKBEHIND_OVERRUN (-6)
26702+#define LZO_E_EOF_NOT_FOUND (-7)
26703+#define LZO_E_INPUT_NOT_CONSUMED (-8)
26704+
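+/* Minimal error-handling sketch (illustrative, not from the original
+ * source; buffer names are placeholders).  wrkmem is unused by the
+ * decompressor, so NULL is acceptable:
+ *
+ *     int err = lzo1x_decompress(in, in_len, out, &out_len, NULL);
+ *     if (err != LZO_E_OK)
+ *             return err;      (a negative LZO_E_* code on malformed input)
+ */
+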
26705+/* lzo_init() should be the first function you call.
26706+ * Check the return code!
26707+ *
26708+ * lzo_init() is a macro to allow checking that the library and the
26709+ * compiler's view of various types are consistent.
26710+ */
26711+#define lzo_init() __lzo_init2(LZO_VERSION,(int)sizeof(short),(int)sizeof(int),\
26712+ (int)sizeof(long),(int)sizeof(lzo_uint32),(int)sizeof(lzo_uint),\
26713+ (int)lzo_sizeof_dict_t,(int)sizeof(char *),(int)sizeof(lzo_voidp),\
26714+ (int)sizeof(lzo_compress_t))
26715+ LZO_EXTERN(int) __lzo_init2(unsigned, int, int, int, int, int, int,
26716+ int, int, int);
26717+
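+/* Typical initialization sketch (illustrative): the macro passes the
+ * compiler's view of the type sizes to __lzo_init2(), so a mismatched
+ * build is rejected at run time:
+ *
+ *     if (lzo_init() != LZO_E_OK)
+ *             return LZO_E_ERROR;
+ */
+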
26718+/* checksum functions */
26719+ LZO_EXTERN(lzo_uint32)
26720+ lzo_crc32(lzo_uint32 _c, const lzo_byte * _buf, lzo_uint _len);
26721+
26722+/* misc. */
26723+ typedef union {
26724+ lzo_bytep p;
26725+ lzo_uint u;
26726+ } __lzo_pu_u;
26727+ typedef union {
26728+ lzo_bytep p;
26729+ lzo_uint32 u32;
26730+ } __lzo_pu32_u;
26731+ typedef union {
26732+ void *vp;
26733+ lzo_bytep bp;
26734+ lzo_uint32 u32;
26735+ long l;
26736+ } lzo_align_t;
26737+
26738+#define LZO_PTR_ALIGN_UP(_ptr,_size) \
26739+ ((_ptr) + (lzo_uint) __lzo_align_gap((const lzo_voidp)(_ptr),(lzo_uint)(_size)))
26740+
26741+/* deprecated - only for backward compatibility */
26742+#define LZO_ALIGN(_ptr,_size) LZO_PTR_ALIGN_UP(_ptr,_size)
26743+
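+/* Worked example (illustrative): LZO_PTR_ALIGN_UP rounds a pointer up to
+ * the next multiple of _size; with _ptr == 0x1003 and _size == 8 the gap
+ * computed by __lzo_align_gap() is 5, yielding the aligned value 0x1008.
+ */
+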
26744+#ifdef __cplusplus
26745+} /* extern "C" */
26746+#endif
26747+#endif /* already included */
26748Index: linux-2.6.16/fs/reiser4/plugin/compress/minilzo.c
26749===================================================================
26750--- /dev/null
26751+++ linux-2.6.16/fs/reiser4/plugin/compress/minilzo.c
26752@@ -0,0 +1,2155 @@
26753+/* minilzo.c -- mini subset of the LZO real-time data compression library
26754+   adapted for the reiser4 compression transform plugin.
26755+
26756+ This file is part of the LZO real-time data compression library
26757+   and is not included in any proprietary licenses of reiser4.
26758+
26759+ Copyright (C) 2002 Markus Franz Xaver Johannes Oberhumer
26760+ Copyright (C) 2001 Markus Franz Xaver Johannes Oberhumer
26761+ Copyright (C) 2000 Markus Franz Xaver Johannes Oberhumer
26762+ Copyright (C) 1999 Markus Franz Xaver Johannes Oberhumer
26763+ Copyright (C) 1998 Markus Franz Xaver Johannes Oberhumer
26764+ Copyright (C) 1997 Markus Franz Xaver Johannes Oberhumer
26765+ Copyright (C) 1996 Markus Franz Xaver Johannes Oberhumer
26766+ All Rights Reserved.
26767+
26768+ The LZO library is free software; you can redistribute it and/or
26769+ modify it under the terms of the GNU General Public License as
26770+ published by the Free Software Foundation; either version 2 of
26771+ the License, or (at your option) any later version.
26772+
26773+ The LZO library is distributed in the hope that it will be useful,
26774+ but WITHOUT ANY WARRANTY; without even the implied warranty of
26775+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26776+ GNU General Public License for more details.
26777+
26778+ You should have received a copy of the GNU General Public License
26779+ along with the LZO library; see the file COPYING.
26780+ If not, write to the Free Software Foundation, Inc.,
26781+ 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
26782+
26783+ Markus F.X.J. Oberhumer
26784+ <markus@oberhumer.com>
26785+ http://www.oberhumer.com/opensource/lzo/
26786+ */
26787+
26788+/*
26789+ * NOTE:
26790+ * the full LZO package can be found at
26791+ * http://www.oberhumer.com/opensource/lzo/
26792+ */
26793+
26794+#include "../../debug.h" /* for reiser4 assert macro -edward */
26795+
26796+#define __LZO_IN_MINILZO
26797+#define LZO_BUILD
26798+
26799+#ifdef MINILZO_HAVE_CONFIG_H
26800+# include <config.h>
26801+#endif
26802+
26803+#undef LZO_HAVE_CONFIG_H
26804+#include "minilzo.h"
26805+
26806+#if !defined(MINILZO_VERSION) || (MINILZO_VERSION != 0x1080)
26807+# error "version mismatch in miniLZO source files"
26808+#endif
26809+
26810+#ifdef MINILZO_HAVE_CONFIG_H
26811+# define LZO_HAVE_CONFIG_H
26812+#endif
26813+
26814+
26815+#ifndef __LZO_CONF_H
26816+#define __LZO_CONF_H
26817+
26818+#if !defined(__LZO_IN_MINILZO)
26819+# ifndef __LZOCONF_H
26820+# include <lzoconf.h>
26821+# endif
26822+#endif
26823+
26824+#if defined(__BOUNDS_CHECKING_ON)
26825+# include <unchecked.h>
26826+#else
26827+# define BOUNDS_CHECKING_OFF_DURING(stmt) stmt
26828+# define BOUNDS_CHECKING_OFF_IN_EXPR(expr) (expr)
26829+#endif
26830+
26831+# define HAVE_MEMCMP
26832+# define HAVE_MEMCPY
26833+# define HAVE_MEMMOVE
26834+# define HAVE_MEMSET
26835+
26836+#if defined(__LZO_DOS16) || defined(__LZO_WIN16)
26837+# define HAVE_MALLOC_H
26838+# define HAVE_HALLOC
26839+#endif
26840+
26841+#undef NDEBUG
26842+#if !defined(LZO_DEBUG)
26843+# define NDEBUG
26844+#endif
26845+#if defined(LZO_DEBUG) || !defined(NDEBUG)
26846+# if !defined(NO_STDIO_H)
26847+# include <stdio.h>
26848+# endif
26849+#endif
26850+# if 0 /* edward */
26851+#include <assert.h>
26852+#endif /* edward */
26853+
26854+#if !defined(LZO_COMPILE_TIME_ASSERT)
26855+# define LZO_COMPILE_TIME_ASSERT(expr) \
26856+ { typedef int __lzo_compile_time_assert_fail[1 - 2 * !(expr)]; }
26857+#endif
26858+
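+/* How the assertion works (explanatory note, not in the original): when
+ * expr is false, 1 - 2 * !(expr) evaluates to -1 and the typedef declares
+ * an array of negative size, which no C compiler accepts; when expr is
+ * true the typedef is harmless.  E.g.
+ *
+ *     LZO_COMPILE_TIME_ASSERT(sizeof(char) == 1)    compiles,
+ *     LZO_COMPILE_TIME_ASSERT(sizeof(char) == 2)    is a compile error.
+ */
+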
26859+#if !defined(LZO_UNUSED)
26860+# if 1
26861+# define LZO_UNUSED(var) ((void)&var)
26862+# elif 0
26863+# define LZO_UNUSED(var) { typedef int __lzo_unused[sizeof(var) ? 2 : 1]; }
26864+# else
26865+# define LZO_UNUSED(parm) (parm = parm)
26866+# endif
26867+#endif
26868+
26869+#if !defined(__inline__) && !defined(__GNUC__)
26870+# if defined(__cplusplus)
26871+# define __inline__ inline
26872+# else
26873+# define __inline__
26874+# endif
26875+#endif
26876+
26877+#if defined(NO_MEMCMP)
26878+# undef HAVE_MEMCMP
26879+#endif
26880+
26881+#if !defined(HAVE_MEMSET)
26882+# undef memset
26883+# define memset lzo_memset
26884+#endif
26885+
26886+# define LZO_BYTE(x) ((unsigned char) ((x) & 0xff))
26887+
26888+#define LZO_MAX(a,b) ((a) >= (b) ? (a) : (b))
26889+#define LZO_MIN(a,b) ((a) <= (b) ? (a) : (b))
26890+#define LZO_MAX3(a,b,c) ((a) >= (b) ? LZO_MAX(a,c) : LZO_MAX(b,c))
26891+#define LZO_MIN3(a,b,c) ((a) <= (b) ? LZO_MIN(a,c) : LZO_MIN(b,c))
26892+
26893+#define lzo_sizeof(type) ((lzo_uint) (sizeof(type)))
26894+
26895+#define LZO_HIGH(array) ((lzo_uint) (sizeof(array)/sizeof(*(array))))
26896+
26897+#define LZO_SIZE(bits) (1u << (bits))
26898+#define LZO_MASK(bits) (LZO_SIZE(bits) - 1)
26899+
26900+#define LZO_LSIZE(bits) (1ul << (bits))
26901+#define LZO_LMASK(bits) (LZO_LSIZE(bits) - 1)
26902+
26903+#define LZO_USIZE(bits) ((lzo_uint) 1 << (bits))
26904+#define LZO_UMASK(bits) (LZO_USIZE(bits) - 1)
26905+
26906+#define LZO_STYPE_MAX(b) (((1l << (8*(b)-2)) - 1l) + (1l << (8*(b)-2)))
26907+#define LZO_UTYPE_MAX(b) (((1ul << (8*(b)-1)) - 1ul) + (1ul << (8*(b)-1)))
26908+
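+/* Worked example (illustrative): for a 4-byte type,
+ *
+ *     LZO_UTYPE_MAX(4) == ((1ul << 31) - 1ul) + (1ul << 31) == 0xffffffff
+ *
+ * the two-step form avoids shifting by the full type width, which would
+ * be undefined behaviour on a 32-bit unsigned long.
+ */
+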
26909+#if !defined(SIZEOF_UNSIGNED)
26910+# if (UINT_MAX == 0xffff)
26911+# define SIZEOF_UNSIGNED 2
26912+# elif (UINT_MAX == LZO_0xffffffffL)
26913+# define SIZEOF_UNSIGNED 4
26914+# elif (UINT_MAX >= LZO_0xffffffffL)
26915+# define SIZEOF_UNSIGNED 8
26916+# else
26917+# error "SIZEOF_UNSIGNED"
26918+# endif
26919+#endif
26920+
26921+#if !defined(SIZEOF_UNSIGNED_LONG)
26922+# if (ULONG_MAX == LZO_0xffffffffL)
26923+# define SIZEOF_UNSIGNED_LONG 4
26924+# elif (ULONG_MAX >= LZO_0xffffffffL)
26925+# define SIZEOF_UNSIGNED_LONG 8
26926+# else
26927+# error "SIZEOF_UNSIGNED_LONG"
26928+# endif
26929+#endif
26930+
26931+#if !defined(SIZEOF_SIZE_T)
26932+# define SIZEOF_SIZE_T SIZEOF_UNSIGNED
26933+#endif
26934+#if !defined(SIZE_T_MAX)
26935+# define SIZE_T_MAX LZO_UTYPE_MAX(SIZEOF_SIZE_T)
26936+#endif
26937+
26938+#if 1 && defined(__LZO_i386) && (UINT_MAX == LZO_0xffffffffL)
26939+# if !defined(LZO_UNALIGNED_OK_2) && (USHRT_MAX == 0xffff)
26940+# define LZO_UNALIGNED_OK_2
26941+# endif
26942+# if !defined(LZO_UNALIGNED_OK_4) && (LZO_UINT32_MAX == LZO_0xffffffffL)
26943+# define LZO_UNALIGNED_OK_4
26944+# endif
26945+#endif
26946+
26947+#if defined(LZO_UNALIGNED_OK_2) || defined(LZO_UNALIGNED_OK_4)
26948+# if !defined(LZO_UNALIGNED_OK)
26949+# define LZO_UNALIGNED_OK
26950+# endif
26951+#endif
26952+
26953+#if defined(__LZO_NO_UNALIGNED)
26954+# undef LZO_UNALIGNED_OK
26955+# undef LZO_UNALIGNED_OK_2
26956+# undef LZO_UNALIGNED_OK_4
26957+#endif
26958+
26959+#if defined(LZO_UNALIGNED_OK_2) && (USHRT_MAX != 0xffff)
26960+# error "LZO_UNALIGNED_OK_2 must not be defined on this system"
26961+#endif
26962+#if defined(LZO_UNALIGNED_OK_4) && (LZO_UINT32_MAX != LZO_0xffffffffL)
26963+# error "LZO_UNALIGNED_OK_4 must not be defined on this system"
26964+#endif
26965+
26966+#if defined(__LZO_NO_ALIGNED)
26967+# undef LZO_ALIGNED_OK_4
26968+#endif
26969+
26970+#if defined(LZO_ALIGNED_OK_4) && (LZO_UINT32_MAX != LZO_0xffffffffL)
26971+# error "LZO_ALIGNED_OK_4 must not be defined on this system"
26972+#endif
26973+
26974+#define LZO_LITTLE_ENDIAN 1234
26975+#define LZO_BIG_ENDIAN 4321
26976+#define LZO_PDP_ENDIAN 3412
26977+
26978+#if !defined(LZO_BYTE_ORDER)
26979+# if defined(MFX_BYTE_ORDER)
26980+# define LZO_BYTE_ORDER MFX_BYTE_ORDER
26981+# elif defined(__LZO_i386)
26982+# define LZO_BYTE_ORDER LZO_LITTLE_ENDIAN
26983+# elif defined(BYTE_ORDER)
26984+# define LZO_BYTE_ORDER BYTE_ORDER
26985+# elif defined(__BYTE_ORDER)
26986+# define LZO_BYTE_ORDER __BYTE_ORDER
26987+# endif
26988+#endif
26989+
26990+#if defined(LZO_BYTE_ORDER)
26991+# if (LZO_BYTE_ORDER != LZO_LITTLE_ENDIAN) && \
26992+ (LZO_BYTE_ORDER != LZO_BIG_ENDIAN)
26993+# error "invalid LZO_BYTE_ORDER"
26994+# endif
26995+#endif
26996+
26997+#if defined(LZO_UNALIGNED_OK) && !defined(LZO_BYTE_ORDER)
26998+# error "LZO_BYTE_ORDER is not defined"
26999+#endif
27000+
27001+#define LZO_OPTIMIZE_GNUC_i386_IS_BUGGY
27002+
27003+#if defined(NDEBUG) && !defined(LZO_DEBUG) && !defined(__LZO_CHECKER)
27004+# if defined(__GNUC__) && defined(__i386__)
27005+# if !defined(LZO_OPTIMIZE_GNUC_i386_IS_BUGGY)
27006+# define LZO_OPTIMIZE_GNUC_i386
27007+# endif
27008+# endif
27009+#endif
27010+
27011+__LZO_EXTERN_C const lzo_uint32 _lzo_crc32_table[256];
27012+
27013+#define _LZO_STRINGIZE(x) #x
27014+#define _LZO_MEXPAND(x) _LZO_STRINGIZE(x)
27015+
27016+#define _LZO_CONCAT2(a,b) a ## b
27017+#define _LZO_CONCAT3(a,b,c) a ## b ## c
27018+#define _LZO_CONCAT4(a,b,c,d) a ## b ## c ## d
27019+#define _LZO_CONCAT5(a,b,c,d,e) a ## b ## c ## d ## e
27020+
27021+#define _LZO_ECONCAT2(a,b) _LZO_CONCAT2(a,b)
27022+#define _LZO_ECONCAT3(a,b,c) _LZO_CONCAT3(a,b,c)
27023+#define _LZO_ECONCAT4(a,b,c,d) _LZO_CONCAT4(a,b,c,d)
27024+#define _LZO_ECONCAT5(a,b,c,d,e) _LZO_CONCAT5(a,b,c,d,e)
27025+
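+/* Example (illustrative): _LZO_MEXPAND expands its argument before
+ * stringizing, so with LZO_BYTE_ORDER defined as LZO_LITTLE_ENDIAN (1234),
+ * _LZO_MEXPAND(LZO_BYTE_ORDER) yields the string literal "1234"; this is
+ * how the copyright string below embeds configuration values.
+ */
+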
27026+#ifndef __LZO_PTR_H
27027+#define __LZO_PTR_H
27028+
27029+#ifdef __cplusplus
27030+extern "C" {
27031+#endif
27032+
27033+#if defined(__LZO_DOS16) || defined(__LZO_WIN16)
27034+# include <dos.h>
27035+# if 1 && defined(__WATCOMC__)
27036+# include <i86.h>
27037+ __LZO_EXTERN_C unsigned char _HShift;
27038+# define __LZO_HShift _HShift
27039+# elif 1 && defined(_MSC_VER)
27040+ __LZO_EXTERN_C unsigned short __near _AHSHIFT;
27041+# define __LZO_HShift ((unsigned) &_AHSHIFT)
27042+# elif defined(__LZO_WIN16)
27043+# define __LZO_HShift 3
27044+# else
27045+# define __LZO_HShift 12
27046+# endif
27047+# if !defined(_FP_SEG) && defined(FP_SEG)
27048+# define _FP_SEG FP_SEG
27049+# endif
27050+# if !defined(_FP_OFF) && defined(FP_OFF)
27051+# define _FP_OFF FP_OFF
27052+# endif
27053+#endif
27054+
27055+#if !defined(lzo_ptrdiff_t)
27056+# if (UINT_MAX >= LZO_0xffffffffL)
27057+ typedef ptrdiff_t lzo_ptrdiff_t;
27058+# else
27059+ typedef long lzo_ptrdiff_t;
27060+# endif
27061+#endif
27062+
27063+#if !defined(__LZO_HAVE_PTR_T)
27064+# if defined(lzo_ptr_t)
27065+# define __LZO_HAVE_PTR_T
27066+# endif
27067+#endif
27068+#if !defined(__LZO_HAVE_PTR_T)
27069+# if defined(SIZEOF_CHAR_P) && defined(SIZEOF_UNSIGNED_LONG)
27070+# if (SIZEOF_CHAR_P == SIZEOF_UNSIGNED_LONG)
27071+ typedef unsigned long lzo_ptr_t;
27072+ typedef long lzo_sptr_t;
27073+# define __LZO_HAVE_PTR_T
27074+# endif
27075+# endif
27076+#endif
27077+#if !defined(__LZO_HAVE_PTR_T)
27078+# if defined(SIZEOF_CHAR_P) && defined(SIZEOF_UNSIGNED)
27079+# if (SIZEOF_CHAR_P == SIZEOF_UNSIGNED)
27080+ typedef unsigned int lzo_ptr_t;
27081+ typedef int lzo_sptr_t;
27082+# define __LZO_HAVE_PTR_T
27083+# endif
27084+# endif
27085+#endif
27086+#if !defined(__LZO_HAVE_PTR_T)
27087+# if defined(SIZEOF_CHAR_P) && defined(SIZEOF_UNSIGNED_SHORT)
27088+# if (SIZEOF_CHAR_P == SIZEOF_UNSIGNED_SHORT)
27089+ typedef unsigned short lzo_ptr_t;
27090+ typedef short lzo_sptr_t;
27091+# define __LZO_HAVE_PTR_T
27092+# endif
27093+# endif
27094+#endif
27095+#if !defined(__LZO_HAVE_PTR_T)
27096+# if defined(LZO_HAVE_CONFIG_H) || defined(SIZEOF_CHAR_P)
27097+# error "no suitable type for lzo_ptr_t"
27098+# else
27099+ typedef unsigned long lzo_ptr_t;
27100+ typedef long lzo_sptr_t;
27101+# define __LZO_HAVE_PTR_T
27102+# endif
27103+#endif
27104+
27105+#if defined(__LZO_DOS16) || defined(__LZO_WIN16)
27106+#define PTR(a) ((lzo_bytep) (a))
27107+#define PTR_ALIGNED_4(a) ((_FP_OFF(a) & 3) == 0)
27108+#define PTR_ALIGNED2_4(a,b) (((_FP_OFF(a) | _FP_OFF(b)) & 3) == 0)
27109+#else
27110+#define PTR(a) ((lzo_ptr_t) (a))
27111+#define PTR_LINEAR(a) PTR(a)
27112+#define PTR_ALIGNED_4(a) ((PTR_LINEAR(a) & 3) == 0)
27113+#define PTR_ALIGNED_8(a) ((PTR_LINEAR(a) & 7) == 0)
27114+#define PTR_ALIGNED2_4(a,b) (((PTR_LINEAR(a) | PTR_LINEAR(b)) & 3) == 0)
27115+#define PTR_ALIGNED2_8(a,b) (((PTR_LINEAR(a) | PTR_LINEAR(b)) & 7) == 0)
27116+#endif
27117+
27118+#define PTR_LT(a,b) (PTR(a) < PTR(b))
27119+#define PTR_GE(a,b) (PTR(a) >= PTR(b))
27120+#define PTR_DIFF(a,b) ((lzo_ptrdiff_t) (PTR(a) - PTR(b)))
27121+#define pd(a,b) ((lzo_uint) ((a)-(b)))
27122+
27123+ typedef union {
27124+ char a_char;
27125+ unsigned char a_uchar;
27126+ short a_short;
27127+ unsigned short a_ushort;
27128+ int a_int;
27129+ unsigned int a_uint;
27130+ long a_long;
27131+ unsigned long a_ulong;
27132+ lzo_int a_lzo_int;
27133+ lzo_uint a_lzo_uint;
27134+ lzo_int32 a_lzo_int32;
27135+ lzo_uint32 a_lzo_uint32;
27136+ ptrdiff_t a_ptrdiff_t;
27137+ lzo_ptrdiff_t a_lzo_ptrdiff_t;
27138+ lzo_ptr_t a_lzo_ptr_t;
27139+ lzo_voidp a_lzo_voidp;
27140+ void *a_void_p;
27141+ lzo_bytep a_lzo_bytep;
27142+ lzo_bytepp a_lzo_bytepp;
27143+ lzo_uintp a_lzo_uintp;
27144+ lzo_uint *a_lzo_uint_p;
27145+ lzo_uint32p a_lzo_uint32p;
27146+ lzo_uint32 *a_lzo_uint32_p;
27147+ unsigned char *a_uchar_p;
27148+ char *a_char_p;
27149+ } lzo_full_align_t;
27150+
27151+#ifdef __cplusplus
27152+}
27153+#endif
27154+#endif
27155+#define LZO_DETERMINISTIC
27156+#define LZO_DICT_USE_PTR
27157+#if defined(__LZO_DOS16) || defined(__LZO_WIN16) || defined(__LZO_STRICT_16BIT)
27158+# undef LZO_DICT_USE_PTR
27159+#endif
27160+#if defined(LZO_DICT_USE_PTR)
27161+# define lzo_dict_t const lzo_bytep
27162+# define lzo_dict_p lzo_dict_t __LZO_MMODEL *
27163+#else
27164+# define lzo_dict_t lzo_uint
27165+# define lzo_dict_p lzo_dict_t __LZO_MMODEL *
27166+#endif
27167+#if !defined(lzo_moff_t)
27168+#define lzo_moff_t lzo_uint
27169+#endif
27170+#endif
27171+static lzo_ptr_t __lzo_ptr_linear(const lzo_voidp ptr)
27172+{
27173+ lzo_ptr_t p;
27174+
27175+#if defined(__LZO_DOS16) || defined(__LZO_WIN16)
27176+ p = (((lzo_ptr_t) (_FP_SEG(ptr))) << (16 - __LZO_HShift)) +
27177+ (_FP_OFF(ptr));
27178+#else
27179+ p = PTR_LINEAR(ptr);
27180+#endif
27181+
27182+ return p;
27183+}
27184+
27185+static unsigned __lzo_align_gap(const lzo_voidp ptr, lzo_uint size)
27186+{
27187+ lzo_ptr_t p, s, n;
27188+
27189+ assert("lzo-01", size > 0);
27190+
27191+ p = __lzo_ptr_linear(ptr);
27192+ s = (lzo_ptr_t) (size - 1);
27193+ n = (((p + s) / size) * size) - p;
27194+
27195+ assert("lzo-02", (long)n >= 0);
27196+ assert("lzo-03", n <= s);
27197+
27198+ return (unsigned)n;
27199+}
27200+
27201+#ifndef __LZO_UTIL_H
27202+#define __LZO_UTIL_H
27203+
27204+#ifndef __LZO_CONF_H
27205+#endif
27206+
27207+#ifdef __cplusplus
27208+extern "C" {
27209+#endif
27210+
27211+#if 1 && defined(HAVE_MEMCPY)
27212+#if !defined(__LZO_DOS16) && !defined(__LZO_WIN16)
27213+
27214+#define MEMCPY8_DS(dest,src,len) \
27215+ memcpy(dest,src,len); \
27216+ dest += len; \
27217+ src += len
27218+
27219+#endif
27220+#endif
27221+
27222+#if !defined(MEMCPY8_DS)
27223+
27224+#define MEMCPY8_DS(dest,src,len) \
27225+ { register lzo_uint __l = (len) / 8; \
27226+ do { \
27227+ *dest++ = *src++; \
27228+ *dest++ = *src++; \
27229+ *dest++ = *src++; \
27230+ *dest++ = *src++; \
27231+ *dest++ = *src++; \
27232+ *dest++ = *src++; \
27233+ *dest++ = *src++; \
27234+ *dest++ = *src++; \
27235+ } while (--__l > 0); }
27236+
27237+#endif
27238+
27239+#define MEMCPY_DS(dest,src,len) \
27240+ do *dest++ = *src++; \
27241+ while (--len > 0)
27242+
27243+#define MEMMOVE_DS(dest,src,len) \
27244+ do *dest++ = *src++; \
27245+ while (--len > 0)
27246+
27247+
27248+#if (LZO_UINT_MAX <= SIZE_T_MAX) && defined(HAVE_MEMSET)
27249+
27250+#define BZERO8_PTR(s,l,n) memset((s),0,(lzo_uint)(l)*(n))
27251+
27252+#else
27253+
27254+#define BZERO8_PTR(s,l,n) \
27255+ lzo_memset((lzo_voidp)(s),0,(lzo_uint)(l)*(n))
27256+
27257+#endif
27258+
27259+#ifdef __cplusplus
27260+}
27261+#endif
27262+
27263+#endif
27264+
27265+/* If you use the LZO library in a product, you *must* keep this
27266+ * copyright string in the executable of your product.
27267+ */
27268+
27269+static const lzo_byte __lzo_copyright[] =
27270+#if !defined(__LZO_IN_MINILZO)
27271+ LZO_VERSION_STRING;
27272+#else
27273+ "\n\n\n"
27274+ "LZO real-time data compression library.\n"
27275+ "Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002 Markus Franz Xaver Johannes Oberhumer\n"
27276+ "<markus.oberhumer@jk.uni-linz.ac.at>\n"
27277+ "http://www.oberhumer.com/opensource/lzo/\n"
27278+ "\n"
27279+ "LZO version: v" LZO_VERSION_STRING ", " LZO_VERSION_DATE "\n"
27280+ "LZO build date: " __DATE__ " " __TIME__ "\n\n"
27281+ "LZO special compilation options:\n"
27282+#ifdef __cplusplus
27283+ " __cplusplus\n"
27284+#endif
27285+#if defined(__PIC__)
27286+ " __PIC__\n"
27287+#elif defined(__pic__)
27288+ " __pic__\n"
27289+#endif
27290+#if (UINT_MAX < LZO_0xffffffffL)
27291+ " 16BIT\n"
27292+#endif
27293+#if defined(__LZO_STRICT_16BIT)
27294+ " __LZO_STRICT_16BIT\n"
27295+#endif
27296+#if (UINT_MAX > LZO_0xffffffffL)
27297+ " UINT_MAX=" _LZO_MEXPAND(UINT_MAX) "\n"
27298+#endif
27299+#if (ULONG_MAX > LZO_0xffffffffL)
27300+ " ULONG_MAX=" _LZO_MEXPAND(ULONG_MAX) "\n"
27301+#endif
27302+#if defined(LZO_BYTE_ORDER)
27303+ " LZO_BYTE_ORDER=" _LZO_MEXPAND(LZO_BYTE_ORDER) "\n"
27304+#endif
27305+#if defined(LZO_UNALIGNED_OK_2)
27306+ " LZO_UNALIGNED_OK_2\n"
27307+#endif
27308+#if defined(LZO_UNALIGNED_OK_4)
27309+ " LZO_UNALIGNED_OK_4\n"
27310+#endif
27311+#if defined(LZO_ALIGNED_OK_4)
27312+ " LZO_ALIGNED_OK_4\n"
27313+#endif
27314+#if defined(LZO_DICT_USE_PTR)
27315+ " LZO_DICT_USE_PTR\n"
27316+#endif
27317+#if defined(__LZO_QUERY_COMPRESS)
27318+ " __LZO_QUERY_COMPRESS\n"
27319+#endif
27320+#if defined(__LZO_QUERY_DECOMPRESS)
27321+ " __LZO_QUERY_DECOMPRESS\n"
27322+#endif
27323+#if defined(__LZO_IN_MINILZO)
27324+ " __LZO_IN_MINILZO\n"
27325+#endif
27326+ "\n\n" "$Id: LZO " LZO_VERSION_STRING " built " __DATE__ " " __TIME__
27327+#if defined(__GNUC__) && defined(__VERSION__)
27328+ " by gcc " __VERSION__
27329+#elif defined(__BORLANDC__)
27330+ " by Borland C " _LZO_MEXPAND(__BORLANDC__)
27331+#elif defined(_MSC_VER)
27332+ " by Microsoft C " _LZO_MEXPAND(_MSC_VER)
27333+#elif defined(__PUREC__)
27334+ " by Pure C " _LZO_MEXPAND(__PUREC__)
27335+#elif defined(__SC__)
27336+ " by Symantec C " _LZO_MEXPAND(__SC__)
27337+#elif defined(__TURBOC__)
27338+ " by Turbo C " _LZO_MEXPAND(__TURBOC__)
27339+#elif defined(__WATCOMC__)
27340+ " by Watcom C " _LZO_MEXPAND(__WATCOMC__)
27341+#endif
27342+ " $\n"
27343+ "$Copyright: LZO (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002 Markus Franz Xaver Johannes Oberhumer $\n";
27344+#endif
27345+
27346+#define LZO_BASE 65521u
27347+#define LZO_NMAX 5552
27348+
27349+#define LZO_DO1(buf,i) {s1 += buf[i]; s2 += s1;}
27350+#define LZO_DO2(buf,i) LZO_DO1(buf,i); LZO_DO1(buf,i+1);
27351+#define LZO_DO4(buf,i) LZO_DO2(buf,i); LZO_DO2(buf,i+2);
27352+#define LZO_DO8(buf,i) LZO_DO4(buf,i); LZO_DO4(buf,i+4);
27353+#define LZO_DO16(buf,i) LZO_DO8(buf,i); LZO_DO8(buf,i+8);
27354+
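+/* Context (explanatory note, not in the original): these are Adler-32
+ * building blocks.  LZO_BASE is the largest prime below 2^16, and
+ * LZO_NMAX is the largest n for which 255*n*(n+1)/2 + (n+1)*(LZO_BASE-1)
+ * still fits in 32 bits, so the modulo can be deferred.  The checksum
+ * they implement is, in outline:
+ *
+ *     s1 = adler & 0xffff;  s2 = (adler >> 16) & 0xffff;
+ *     for each byte b:  s1 = (s1 + b) % LZO_BASE;  s2 = (s2 + s1) % LZO_BASE;
+ *     return (s2 << 16) | s1;
+ */
+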
27355+# define IS_SIGNED(type) (((type) (-1)) < ((type) 0))
27356+# define IS_UNSIGNED(type) (((type) (-1)) > ((type) 0))
27357+
27358+#define IS_POWER_OF_2(x) (((x) & ((x) - 1)) == 0)
27359+
27360+static lzo_bool schedule_insns_bug(void);
27361+static lzo_bool strength_reduce_bug(int *);
27362+
27363+# define __lzo_assert(x) ((x) ? 1 : 0)
27364+
27365+#undef COMPILE_TIME_ASSERT
27366+
27367+# define COMPILE_TIME_ASSERT(expr) LZO_COMPILE_TIME_ASSERT(expr)
27368+
27369+static lzo_bool basic_integral_check(void)
27370+{
27371+ lzo_bool r = 1;
27372+
27373+ COMPILE_TIME_ASSERT(CHAR_BIT == 8);
27374+ COMPILE_TIME_ASSERT(sizeof(char) == 1);
27375+ COMPILE_TIME_ASSERT(sizeof(short) >= 2);
27376+ COMPILE_TIME_ASSERT(sizeof(long) >= 4);
27377+ COMPILE_TIME_ASSERT(sizeof(int) >= sizeof(short));
27378+ COMPILE_TIME_ASSERT(sizeof(long) >= sizeof(int));
27379+
27380+ COMPILE_TIME_ASSERT(sizeof(lzo_uint) == sizeof(lzo_int));
27381+ COMPILE_TIME_ASSERT(sizeof(lzo_uint32) == sizeof(lzo_int32));
27382+
27383+ COMPILE_TIME_ASSERT(sizeof(lzo_uint32) >= 4);
27384+ COMPILE_TIME_ASSERT(sizeof(lzo_uint32) >= sizeof(unsigned));
27385+#if defined(__LZO_STRICT_16BIT)
27386+ COMPILE_TIME_ASSERT(sizeof(lzo_uint) == 2);
27387+#else
27388+ COMPILE_TIME_ASSERT(sizeof(lzo_uint) >= 4);
27389+ COMPILE_TIME_ASSERT(sizeof(lzo_uint) >= sizeof(unsigned));
27390+#endif
27391+
27392+#if (USHRT_MAX == 65535u)
27393+ COMPILE_TIME_ASSERT(sizeof(short) == 2);
27394+#elif (USHRT_MAX == LZO_0xffffffffL)
27395+ COMPILE_TIME_ASSERT(sizeof(short) == 4);
27396+#elif (USHRT_MAX >= LZO_0xffffffffL)
27397+ COMPILE_TIME_ASSERT(sizeof(short) > 4);
27398+#endif
27399+#if 0 /* to make gcc happy -edward */
27400+#if (UINT_MAX == 65535u)
27401+ COMPILE_TIME_ASSERT(sizeof(int) == 2);
27402+#elif (UINT_MAX == LZO_0xffffffffL)
27403+ COMPILE_TIME_ASSERT(sizeof(int) == 4);
27404+#elif (UINT_MAX >= LZO_0xffffffffL)
27405+ COMPILE_TIME_ASSERT(sizeof(int) > 4);
27406+#endif
27407+#if (ULONG_MAX == 65535ul)
27408+ COMPILE_TIME_ASSERT(sizeof(long) == 2);
27409+#elif (ULONG_MAX == LZO_0xffffffffL)
27410+ COMPILE_TIME_ASSERT(sizeof(long) == 4);
27411+#elif (ULONG_MAX >= LZO_0xffffffffL)
27412+ COMPILE_TIME_ASSERT(sizeof(long) > 4);
27413+#endif
27414+#if defined(SIZEOF_UNSIGNED)
27415+ COMPILE_TIME_ASSERT(SIZEOF_UNSIGNED == sizeof(unsigned));
27416+#endif
27417+#if defined(SIZEOF_UNSIGNED_LONG)
27418+ COMPILE_TIME_ASSERT(SIZEOF_UNSIGNED_LONG == sizeof(unsigned long));
27419+#endif
27420+#if defined(SIZEOF_UNSIGNED_SHORT)
27421+ COMPILE_TIME_ASSERT(SIZEOF_UNSIGNED_SHORT == sizeof(unsigned short));
27422+#endif
27423+#if !defined(__LZO_IN_MINILZO)
27424+#if defined(SIZEOF_SIZE_T)
27425+ COMPILE_TIME_ASSERT(SIZEOF_SIZE_T == sizeof(size_t));
27426+#endif
27427+#endif
27428+#endif /* -edward */
27429+
27430+ COMPILE_TIME_ASSERT(IS_UNSIGNED(unsigned char));
27431+ COMPILE_TIME_ASSERT(IS_UNSIGNED(unsigned short));
27432+ COMPILE_TIME_ASSERT(IS_UNSIGNED(unsigned));
27433+ COMPILE_TIME_ASSERT(IS_UNSIGNED(unsigned long));
27434+ COMPILE_TIME_ASSERT(IS_SIGNED(short));
27435+ COMPILE_TIME_ASSERT(IS_SIGNED(int));
27436+ COMPILE_TIME_ASSERT(IS_SIGNED(long));
27437+
27438+ COMPILE_TIME_ASSERT(IS_UNSIGNED(lzo_uint32));
27439+ COMPILE_TIME_ASSERT(IS_UNSIGNED(lzo_uint));
27440+ COMPILE_TIME_ASSERT(IS_SIGNED(lzo_int32));
27441+ COMPILE_TIME_ASSERT(IS_SIGNED(lzo_int));
27442+
27443+ COMPILE_TIME_ASSERT(INT_MAX == LZO_STYPE_MAX(sizeof(int)));
27444+ COMPILE_TIME_ASSERT(UINT_MAX == LZO_UTYPE_MAX(sizeof(unsigned)));
27445+ COMPILE_TIME_ASSERT(LONG_MAX == LZO_STYPE_MAX(sizeof(long)));
27446+ COMPILE_TIME_ASSERT(ULONG_MAX == LZO_UTYPE_MAX(sizeof(unsigned long)));
27447+ // COMPILE_TIME_ASSERT(SHRT_MAX == LZO_STYPE_MAX(sizeof(short))); /* edward */
27448+ COMPILE_TIME_ASSERT(USHRT_MAX == LZO_UTYPE_MAX(sizeof(unsigned short)));
27449+ COMPILE_TIME_ASSERT(LZO_UINT32_MAX ==
27450+ LZO_UTYPE_MAX(sizeof(lzo_uint32)));
27451+ COMPILE_TIME_ASSERT(LZO_UINT_MAX == LZO_UTYPE_MAX(sizeof(lzo_uint)));
27452+#if !defined(__LZO_IN_MINILZO)
27453+ COMPILE_TIME_ASSERT(SIZE_T_MAX == LZO_UTYPE_MAX(sizeof(size_t)));
27454+#endif
27455+
27456+ r &= __lzo_assert(LZO_BYTE(257) == 1);
27457+
27458+ return r;
27459+}
27460+
27461+static lzo_bool basic_ptr_check(void)
27462+{
27463+ lzo_bool r = 1;
27464+
27465+ COMPILE_TIME_ASSERT(sizeof(char *) >= sizeof(int));
27466+ COMPILE_TIME_ASSERT(sizeof(lzo_byte *) >= sizeof(char *));
27467+
27468+ COMPILE_TIME_ASSERT(sizeof(lzo_voidp) == sizeof(lzo_byte *));
27469+ COMPILE_TIME_ASSERT(sizeof(lzo_voidp) == sizeof(lzo_voidpp));
27470+ COMPILE_TIME_ASSERT(sizeof(lzo_voidp) == sizeof(lzo_bytepp));
27471+ COMPILE_TIME_ASSERT(sizeof(lzo_voidp) >= sizeof(lzo_uint));
27472+
27473+ COMPILE_TIME_ASSERT(sizeof(lzo_ptr_t) == sizeof(lzo_voidp));
27474+ COMPILE_TIME_ASSERT(sizeof(lzo_ptr_t) == sizeof(lzo_sptr_t));
27475+ COMPILE_TIME_ASSERT(sizeof(lzo_ptr_t) >= sizeof(lzo_uint));
27476+
27477+ COMPILE_TIME_ASSERT(sizeof(lzo_ptrdiff_t) >= 4);
27478+ COMPILE_TIME_ASSERT(sizeof(lzo_ptrdiff_t) >= sizeof(ptrdiff_t));
27479+
27480+ COMPILE_TIME_ASSERT(sizeof(ptrdiff_t) >= sizeof(size_t));
27481+ COMPILE_TIME_ASSERT(sizeof(lzo_ptrdiff_t) >= sizeof(lzo_uint));
27482+
27483+#if defined(SIZEOF_CHAR_P)
27484+ COMPILE_TIME_ASSERT(SIZEOF_CHAR_P == sizeof(char *));
27485+#endif
27486+#if defined(SIZEOF_PTRDIFF_T)
27487+ COMPILE_TIME_ASSERT(SIZEOF_PTRDIFF_T == sizeof(ptrdiff_t));
27488+#endif
27489+
27490+ COMPILE_TIME_ASSERT(IS_SIGNED(ptrdiff_t));
27491+ COMPILE_TIME_ASSERT(IS_UNSIGNED(size_t));
27492+ COMPILE_TIME_ASSERT(IS_SIGNED(lzo_ptrdiff_t));
27493+ COMPILE_TIME_ASSERT(IS_SIGNED(lzo_sptr_t));
27494+ COMPILE_TIME_ASSERT(IS_UNSIGNED(lzo_ptr_t));
27495+ COMPILE_TIME_ASSERT(IS_UNSIGNED(lzo_moff_t));
27496+
27497+ return r;
27498+}
27499+
27500+static lzo_bool ptr_check(void)
27501+{
27502+ lzo_bool r = 1;
27503+ int i;
27504+ char _wrkmem[10 * sizeof(lzo_byte *) + sizeof(lzo_full_align_t)];
27505+ lzo_bytep wrkmem;
27506+ lzo_bytepp dict;
27507+ unsigned char x[4 * sizeof(lzo_full_align_t)];
27508+ long d;
27509+ lzo_full_align_t a;
27510+ lzo_full_align_t u;
27511+
27512+ for (i = 0; i < (int)sizeof(x); i++)
27513+ x[i] = LZO_BYTE(i);
27514+
27515+ wrkmem =
27516+ LZO_PTR_ALIGN_UP((lzo_byte *) _wrkmem, sizeof(lzo_full_align_t));
27517+
27518+ u.a_lzo_bytep = wrkmem;
27519+ dict = u.a_lzo_bytepp;
27520+
27521+ d = (long)((const lzo_bytep)dict - (const lzo_bytep)_wrkmem);
27522+ r &= __lzo_assert(d >= 0);
27523+ r &= __lzo_assert(d < (long)sizeof(lzo_full_align_t));
27524+
27525+ memset(&a, 0, sizeof(a));
27526+ r &= __lzo_assert(a.a_lzo_voidp == NULL);
27527+
27528+ memset(&a, 0xff, sizeof(a));
27529+ r &= __lzo_assert(a.a_ushort == USHRT_MAX);
27530+ r &= __lzo_assert(a.a_uint == UINT_MAX);
27531+ r &= __lzo_assert(a.a_ulong == ULONG_MAX);
27532+ r &= __lzo_assert(a.a_lzo_uint == LZO_UINT_MAX);
27533+ r &= __lzo_assert(a.a_lzo_uint32 == LZO_UINT32_MAX);
27534+
27535+ if (r == 1) {
27536+ for (i = 0; i < 8; i++)
27537+ r &= __lzo_assert((const lzo_voidp)(&dict[i]) ==
27538+					  (const lzo_voidp)(&wrkmem[i * sizeof(lzo_byte *)]));
27542+ }
27543+
27544+ memset(&a, 0, sizeof(a));
27545+ r &= __lzo_assert(a.a_char_p == NULL);
27546+ r &= __lzo_assert(a.a_lzo_bytep == NULL);
27547+ r &= __lzo_assert(NULL == (void *)0);
27548+ if (r == 1) {
27549+ for (i = 0; i < 10; i++)
27550+ dict[i] = wrkmem;
27551+ BZERO8_PTR(dict + 1, sizeof(dict[0]), 8);
27552+ r &= __lzo_assert(dict[0] == wrkmem);
27553+ for (i = 1; i < 9; i++)
27554+ r &= __lzo_assert(dict[i] == NULL);
27555+ r &= __lzo_assert(dict[9] == wrkmem);
27556+ }
27557+
27558+ if (r == 1) {
27559+ unsigned k = 1;
27560+ const unsigned n = (unsigned)sizeof(lzo_uint32);
27561+ lzo_byte *p0;
27562+ lzo_byte *p1;
27563+
27564+ k += __lzo_align_gap(&x[k], n);
27565+ p0 = (lzo_bytep) & x[k];
27566+#if defined(PTR_LINEAR)
27567+ r &= __lzo_assert((PTR_LINEAR(p0) & (n - 1)) == 0);
27568+#else
27569+ r &= __lzo_assert(n == 4);
27570+ r &= __lzo_assert(PTR_ALIGNED_4(p0));
27571+#endif
27572+
27573+ r &= __lzo_assert(k >= 1);
27574+ p1 = (lzo_bytep) & x[1];
27575+ r &= __lzo_assert(PTR_GE(p0, p1));
27576+
27577+ r &= __lzo_assert(k < 1 + n);
27578+ p1 = (lzo_bytep) & x[1 + n];
27579+ r &= __lzo_assert(PTR_LT(p0, p1));
27580+
27581+ if (r == 1) {
27582+ lzo_uint32 v0, v1;
27583+
27584+ u.a_uchar_p = &x[k];
27585+ v0 = *u.a_lzo_uint32_p;
27586+ u.a_uchar_p = &x[k + n];
27587+ v1 = *u.a_lzo_uint32_p;
27588+
27589+ r &= __lzo_assert(v0 > 0);
27590+ r &= __lzo_assert(v1 > 0);
27591+ }
27592+ }
27593+
27594+ return r;
27595+}
27596+
27597+static int _lzo_config_check(void)
27598+{
27599+ lzo_bool r = 1;
27600+ int i;
27601+ union {
27602+ lzo_uint32 a;
27603+ unsigned short b;
27604+ lzo_uint32 aa[4];
27605+ unsigned char x[4 * sizeof(lzo_full_align_t)];
27606+ }
27607+ u;
27608+
27609+ COMPILE_TIME_ASSERT((int)((unsigned char)((signed char)-1)) == 255);
27610+ COMPILE_TIME_ASSERT((((unsigned char)128) << (int)(8 * sizeof(int) - 8))
27611+ < 0);
27612+
27613+ r &= basic_integral_check();
27614+ r &= basic_ptr_check();
27615+ if (r != 1)
27616+ return LZO_E_ERROR;
27617+
27618+ u.a = 0;
27619+ u.b = 0;
27620+ for (i = 0; i < (int)sizeof(u.x); i++)
27621+ u.x[i] = LZO_BYTE(i);
27622+
27623+#if defined(LZO_BYTE_ORDER)
27624+ if (r == 1) {
27625+# if (LZO_BYTE_ORDER == LZO_LITTLE_ENDIAN)
27626+ lzo_uint32 a = (lzo_uint32) (u.a & LZO_0xffffffffL);
27627+ unsigned short b = (unsigned short)(u.b & 0xffff);
27628+ r &= __lzo_assert(a == 0x03020100L);
27629+ r &= __lzo_assert(b == 0x0100);
27630+# elif (LZO_BYTE_ORDER == LZO_BIG_ENDIAN)
27631+ lzo_uint32 a = u.a >> (8 * sizeof(u.a) - 32);
27632+ unsigned short b = u.b >> (8 * sizeof(u.b) - 16);
27633+ r &= __lzo_assert(a == 0x00010203L);
27634+ r &= __lzo_assert(b == 0x0001);
27635+# else
27636+# error "invalid LZO_BYTE_ORDER"
27637+# endif
27638+ }
27639+#endif
27640+
27641+#if defined(LZO_UNALIGNED_OK_2)
27642+ COMPILE_TIME_ASSERT(sizeof(short) == 2);
27643+ if (r == 1) {
27644+ unsigned short b[4];
27645+
27646+ for (i = 0; i < 4; i++)
27647+ b[i] = *(const unsigned short *)&u.x[i];
27648+
27649+# if (LZO_BYTE_ORDER == LZO_LITTLE_ENDIAN)
27650+ r &= __lzo_assert(b[0] == 0x0100);
27651+ r &= __lzo_assert(b[1] == 0x0201);
27652+ r &= __lzo_assert(b[2] == 0x0302);
27653+ r &= __lzo_assert(b[3] == 0x0403);
27654+# elif (LZO_BYTE_ORDER == LZO_BIG_ENDIAN)
27655+ r &= __lzo_assert(b[0] == 0x0001);
27656+ r &= __lzo_assert(b[1] == 0x0102);
27657+ r &= __lzo_assert(b[2] == 0x0203);
27658+ r &= __lzo_assert(b[3] == 0x0304);
27659+# endif
27660+ }
27661+#endif
27662+
27663+#if defined(LZO_UNALIGNED_OK_4)
27664+ COMPILE_TIME_ASSERT(sizeof(lzo_uint32) == 4);
27665+ if (r == 1) {
27666+ lzo_uint32 a[4];
27667+
27668+ for (i = 0; i < 4; i++)
27669+ a[i] = *(const lzo_uint32 *)&u.x[i];
27670+
27671+# if (LZO_BYTE_ORDER == LZO_LITTLE_ENDIAN)
27672+ r &= __lzo_assert(a[0] == 0x03020100L);
27673+ r &= __lzo_assert(a[1] == 0x04030201L);
27674+ r &= __lzo_assert(a[2] == 0x05040302L);
27675+ r &= __lzo_assert(a[3] == 0x06050403L);
27676+# elif (LZO_BYTE_ORDER == LZO_BIG_ENDIAN)
27677+ r &= __lzo_assert(a[0] == 0x00010203L);
27678+ r &= __lzo_assert(a[1] == 0x01020304L);
27679+ r &= __lzo_assert(a[2] == 0x02030405L);
27680+ r &= __lzo_assert(a[3] == 0x03040506L);
27681+# endif
27682+ }
27683+#endif
27684+
27685+#if defined(LZO_ALIGNED_OK_4)
27686+ COMPILE_TIME_ASSERT(sizeof(lzo_uint32) == 4);
27687+#endif
27688+
27689+ COMPILE_TIME_ASSERT(lzo_sizeof_dict_t == sizeof(lzo_dict_t));
27690+
27691+ if (r == 1) {
27692+ r &= __lzo_assert(!schedule_insns_bug());
27693+ }
27694+
27695+ if (r == 1) {
27696+ static int x[3];
27697+ static unsigned xn = 3;
27698+ register unsigned j;
27699+
27700+ for (j = 0; j < xn; j++)
27701+ x[j] = (int)j - 3;
27702+ r &= __lzo_assert(!strength_reduce_bug(x));
27703+ }
27704+
27705+ if (r == 1) {
27706+ r &= ptr_check();
27707+ }
27708+
27709+ return r == 1 ? LZO_E_OK : LZO_E_ERROR;
27710+}
27711+
27712+static lzo_bool schedule_insns_bug(void)
27713+{
27714+#if defined(__LZO_CHECKER)
27715+ return 0;
27716+#else
27717+ const int clone[] = { 1, 2, 0 };
27718+ const int *q;
27719+ q = clone;
27720+ return (*q) ? 0 : 1;
27721+#endif
27722+}
27723+
27724+static lzo_bool strength_reduce_bug(int *x)
27725+{
27726+ return x[0] != -3 || x[1] != -2 || x[2] != -1;
27727+}
27728+
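+/* Explanatory note (not in the original): the two probes above detect
+ * historic gcc miscompilations at run time -- schedule_insns_bug() reads
+ * through a freshly assigned pointer, and strength_reduce_bug() checks a
+ * loop whose induction variable was once rewritten incorrectly.  If
+ * either probe fails, _lzo_config_check() (above) returns LZO_E_ERROR
+ * and lzo_init() refuses to enable the library.
+ */
+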
27729+#undef COMPILE_TIME_ASSERT
27730+
27731+LZO_PUBLIC(int)
27732+ __lzo_init2(unsigned v, int s1, int s2, int s3, int s4, int s5,
27733+ int s6, int s7, int s8, int s9)
27734+{
27735+ int r;
27736+
27737+ if (v == 0)
27738+ return LZO_E_ERROR;
27739+
27740+ r = (s1 == -1 || s1 == (int)sizeof(short)) &&
27741+ (s2 == -1 || s2 == (int)sizeof(int)) &&
27742+ (s3 == -1 || s3 == (int)sizeof(long)) &&
27743+ (s4 == -1 || s4 == (int)sizeof(lzo_uint32)) &&
27744+ (s5 == -1 || s5 == (int)sizeof(lzo_uint)) &&
27745+ (s6 == -1 || s6 == (int)lzo_sizeof_dict_t) &&
27746+ (s7 == -1 || s7 == (int)sizeof(char *)) &&
27747+ (s8 == -1 || s8 == (int)sizeof(lzo_voidp)) &&
27748+ (s9 == -1 || s9 == (int)sizeof(lzo_compress_t));
27749+ if (!r)
27750+ return LZO_E_ERROR;
27751+
27752+ r = _lzo_config_check();
27753+ if (r != LZO_E_OK)
27754+ return r;
27755+
27756+ return r;
27757+}
27758+
27759+#if !defined(__LZO_IN_MINILZO)
27760+
27761+LZO_EXTERN(int)
27762+ __lzo_init(unsigned v, int s1, int s2, int s3, int s4, int s5, int s6, int s7);
27763+
27764+LZO_PUBLIC(int)
27765+__lzo_init(unsigned v, int s1, int s2, int s3, int s4, int s5, int s6, int s7)
27766+{
27767+ if (v == 0 || v > 0x1010)
27768+ return LZO_E_ERROR;
27769+ return __lzo_init2(v, s1, s2, s3, s4, s5, -1, -1, s6, s7);
27770+}
27771+
27772+#endif
27773+
27774+#define do_compress _lzo1x_1_do_compress
27775+
27776+#define LZO_NEED_DICT_H
27777+#define D_BITS 14
27778+#define D_INDEX1(d,p) d = DM((0x21*DX3(p,5,5,6)) >> 5)
27779+#define D_INDEX2(d,p) d = (d & (D_MASK & 0x7ff)) ^ (D_HIGH | 0x1f)
27780+
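+/* Hash note (explanatory, not in the original): D_INDEX1 mixes the four
+ * bytes at p with shifted XORs (DX3) and a multiply by 0x21, folding the
+ * result into a D_BITS-wide table index; D_INDEX2 derives a second probe
+ * slot from the first, giving a cheap two-way hash.
+ */
+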
27781+#ifndef __LZO_CONFIG1X_H
27782+#define __LZO_CONFIG1X_H
27783+
27784+#if !defined(LZO1X) && !defined(LZO1Y) && !defined(LZO1Z)
27785+# define LZO1X
27786+#endif
27787+
27788+#if !defined(__LZO_IN_MINILZO)
27789+#include <lzo1x.h>
27790+#endif
27791+
27792+#define LZO_EOF_CODE
27793+#undef LZO_DETERMINISTIC
27794+
27795+#define M1_MAX_OFFSET 0x0400
27796+#ifndef M2_MAX_OFFSET
27797+#define M2_MAX_OFFSET 0x0800
27798+#endif
27799+#define M3_MAX_OFFSET 0x4000
27800+#define M4_MAX_OFFSET 0xbfff
27801+
27802+#define MX_MAX_OFFSET (M1_MAX_OFFSET + M2_MAX_OFFSET)
27803+
27804+#define M1_MIN_LEN 2
27805+#define M1_MAX_LEN 2
27806+#define M2_MIN_LEN 3
27807+#ifndef M2_MAX_LEN
27808+#define M2_MAX_LEN 8
27809+#endif
27810+#define M3_MIN_LEN 3
27811+#define M3_MAX_LEN 33
27812+#define M4_MIN_LEN 3
27813+#define M4_MAX_LEN 9
27814+
27815+#define M1_MARKER 0
27816+#define M2_MARKER 64
27817+#define M3_MARKER 32
27818+#define M4_MARKER 16
27819+
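+/* Format note (explanatory, not in the original): LZO1X distinguishes
+ * four match classes -- M1 (2-byte matches, offset <= 0x0400), M2
+ * (lengths 3..8, offset <= 0x0800), M3 (lengths 3..33, offset <= 0x4000)
+ * and M4 (lengths 3..9, offset <= 0xbfff); longer M3/M4 matches carry
+ * the excess length in zero-stuffed extension bytes.  The marker values
+ * above select the class in the top bits of the first code byte.
+ */
+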
27820+#ifndef MIN_LOOKAHEAD
27821+#define MIN_LOOKAHEAD (M2_MAX_LEN + 1)
27822+#endif
27823+
27824+#if defined(LZO_NEED_DICT_H)
27825+
27826+#ifndef LZO_HASH
27827+#define LZO_HASH LZO_HASH_LZO_INCREMENTAL_B
27828+#endif
27829+#define DL_MIN_LEN M2_MIN_LEN
27830+
27831+#ifndef __LZO_DICT_H
27832+#define __LZO_DICT_H
27833+
27834+#ifdef __cplusplus
27835+extern "C" {
27836+#endif
27837+
27838+#if !defined(D_BITS) && defined(DBITS)
27839+# define D_BITS DBITS
27840+#endif
27841+#if !defined(D_BITS)
27842+# error "D_BITS is not defined"
27843+#endif
27844+#if (D_BITS < 16)
27845+# define D_SIZE LZO_SIZE(D_BITS)
27846+# define D_MASK LZO_MASK(D_BITS)
27847+#else
27848+# define D_SIZE LZO_USIZE(D_BITS)
27849+# define D_MASK LZO_UMASK(D_BITS)
27850+#endif
27851+#define D_HIGH ((D_MASK >> 1) + 1)
27852+
27853+#if !defined(DD_BITS)
27854+# define DD_BITS 0
27855+#endif
27856+#define DD_SIZE LZO_SIZE(DD_BITS)
27857+#define DD_MASK LZO_MASK(DD_BITS)
27858+
27859+#if !defined(DL_BITS)
27860+# define DL_BITS (D_BITS - DD_BITS)
27861+#endif
27862+#if (DL_BITS < 16)
27863+# define DL_SIZE LZO_SIZE(DL_BITS)
27864+# define DL_MASK LZO_MASK(DL_BITS)
27865+#else
27866+# define DL_SIZE LZO_USIZE(DL_BITS)
27867+# define DL_MASK LZO_UMASK(DL_BITS)
27868+#endif
27869+
27870+#if (D_BITS != DL_BITS + DD_BITS)
27871+# error "D_BITS does not match"
27872+#endif
27873+#if (D_BITS < 8 || D_BITS > 18)
27874+# error "invalid D_BITS"
27875+#endif
27876+#if (DL_BITS < 8 || DL_BITS > 20)
27877+# error "invalid DL_BITS"
27878+#endif
27879+#if (DD_BITS < 0 || DD_BITS > 6)
27880+# error "invalid DD_BITS"
27881+#endif
27882+
27883+#if !defined(DL_MIN_LEN)
27884+# define DL_MIN_LEN 3
27885+#endif
27886+#if !defined(DL_SHIFT)
27887+# define DL_SHIFT ((DL_BITS + (DL_MIN_LEN - 1)) / DL_MIN_LEN)
27888+#endif
27889+
27890+#define LZO_HASH_GZIP 1
27891+#define LZO_HASH_GZIP_INCREMENTAL 2
27892+#define LZO_HASH_LZO_INCREMENTAL_A 3
27893+#define LZO_HASH_LZO_INCREMENTAL_B 4
27894+
27895+#if !defined(LZO_HASH)
27896+# error "choose a hashing strategy"
27897+#endif
27898+
27899+#if (DL_MIN_LEN == 3)
27900+# define _DV2_A(p,shift1,shift2) \
27901+ (((( (lzo_uint32)((p)[0]) << shift1) ^ (p)[1]) << shift2) ^ (p)[2])
27902+# define _DV2_B(p,shift1,shift2) \
27903+ (((( (lzo_uint32)((p)[2]) << shift1) ^ (p)[1]) << shift2) ^ (p)[0])
27904+# define _DV3_B(p,shift1,shift2,shift3) \
27905+ ((_DV2_B((p)+1,shift1,shift2) << (shift3)) ^ (p)[0])
27906+#elif (DL_MIN_LEN == 2)
27907+# define _DV2_A(p,shift1,shift2) \
27908+ (( (lzo_uint32)(p[0]) << shift1) ^ p[1])
27909+# define _DV2_B(p,shift1,shift2) \
27910+ (( (lzo_uint32)(p[1]) << shift1) ^ p[2])
27911+#else
27912+# error "invalid DL_MIN_LEN"
27913+#endif
27914+#define _DV_A(p,shift) _DV2_A(p,shift,shift)
27915+#define _DV_B(p,shift) _DV2_B(p,shift,shift)
27916+#define DA2(p,s1,s2) \
27917+ (((((lzo_uint32)((p)[2]) << (s2)) + (p)[1]) << (s1)) + (p)[0])
27918+#define DS2(p,s1,s2) \
27919+ (((((lzo_uint32)((p)[2]) << (s2)) - (p)[1]) << (s1)) - (p)[0])
27920+#define DX2(p,s1,s2) \
27921+ (((((lzo_uint32)((p)[2]) << (s2)) ^ (p)[1]) << (s1)) ^ (p)[0])
27922+#define DA3(p,s1,s2,s3) ((DA2((p)+1,s2,s3) << (s1)) + (p)[0])
27923+#define DS3(p,s1,s2,s3) ((DS2((p)+1,s2,s3) << (s1)) - (p)[0])
27924+#define DX3(p,s1,s2,s3) ((DX2((p)+1,s2,s3) << (s1)) ^ (p)[0])
27925+#define DMS(v,s) ((lzo_uint) (((v) & (D_MASK >> (s))) << (s)))
27926+#define DM(v) DMS(v,0)
27927+
27928+#if (LZO_HASH == LZO_HASH_GZIP)
27929+# define _DINDEX(dv,p) (_DV_A((p),DL_SHIFT))
27930+
27931+#elif (LZO_HASH == LZO_HASH_GZIP_INCREMENTAL)
27932+# define __LZO_HASH_INCREMENTAL
27933+# define DVAL_FIRST(dv,p) dv = _DV_A((p),DL_SHIFT)
27934+# define DVAL_NEXT(dv,p) dv = (((dv) << DL_SHIFT) ^ p[2])
27935+# define _DINDEX(dv,p) (dv)
27936+# define DVAL_LOOKAHEAD DL_MIN_LEN
27937+
27938+#elif (LZO_HASH == LZO_HASH_LZO_INCREMENTAL_A)
27939+# define __LZO_HASH_INCREMENTAL
27940+# define DVAL_FIRST(dv,p) dv = _DV_A((p),5)
27941+# define DVAL_NEXT(dv,p) \
27942+ dv ^= (lzo_uint32)(p[-1]) << (2*5); dv = (((dv) << 5) ^ p[2])
27943+# define _DINDEX(dv,p) ((0x9f5f * (dv)) >> 5)
27944+# define DVAL_LOOKAHEAD DL_MIN_LEN
27945+
27946+#elif (LZO_HASH == LZO_HASH_LZO_INCREMENTAL_B)
27947+# define __LZO_HASH_INCREMENTAL
27948+# define DVAL_FIRST(dv,p) dv = _DV_B((p),5)
27949+# define DVAL_NEXT(dv,p) \
27950+ dv ^= p[-1]; dv = (((dv) >> 5) ^ ((lzo_uint32)(p[2]) << (2*5)))
27951+# define _DINDEX(dv,p) ((0x9f5f * (dv)) >> 5)
27952+# define DVAL_LOOKAHEAD DL_MIN_LEN
27953+
27954+#else
27955+# error "choose a hashing strategy"
27956+#endif
27957+
27958+#ifndef DINDEX
27959+#define DINDEX(dv,p) ((lzo_uint)((_DINDEX(dv,p)) & DL_MASK) << DD_BITS)
27960+#endif
27961+#if !defined(DINDEX1) && defined(D_INDEX1)
27962+#define DINDEX1 D_INDEX1
27963+#endif
27964+#if !defined(DINDEX2) && defined(D_INDEX2)
27965+#define DINDEX2 D_INDEX2
27966+#endif
27967+
27968+#if !defined(__LZO_HASH_INCREMENTAL)
27969+# define DVAL_FIRST(dv,p) ((void) 0)
27970+# define DVAL_NEXT(dv,p) ((void) 0)
27971+# define DVAL_LOOKAHEAD 0
27972+#endif
27973+
27974+#if !defined(DVAL_ASSERT)
27975+#if defined(__LZO_HASH_INCREMENTAL) && !defined(NDEBUG)
27976+ static void DVAL_ASSERT(lzo_uint32 dv, const lzo_byte * p) {
27977+ lzo_uint32 df;
27978+ DVAL_FIRST(df, (p));
27979+ assert(DINDEX(dv, p) == DINDEX(df, p));
27980+ }
27981+#else
27982+# define DVAL_ASSERT(dv,p) ((void) 0)
27983+#endif
27984+#endif
27985+
27986+#if defined(LZO_DICT_USE_PTR)
27987+# define DENTRY(p,in) (p)
27988+# define GINDEX(m_pos,m_off,dict,dindex,in) m_pos = dict[dindex]
27989+#else
27990+# define DENTRY(p,in) ((lzo_uint) ((p)-(in)))
27991+# define GINDEX(m_pos,m_off,dict,dindex,in) m_off = dict[dindex]
27992+#endif
27993+
27994+#if (DD_BITS == 0)
27995+
27996+# define UPDATE_D(dict,drun,dv,p,in) dict[ DINDEX(dv,p) ] = DENTRY(p,in)
27997+# define UPDATE_I(dict,drun,index,p,in) dict[index] = DENTRY(p,in)
27998+# define UPDATE_P(ptr,drun,p,in) (ptr)[0] = DENTRY(p,in)
27999+
28000+#else
28001+
28002+# define UPDATE_D(dict,drun,dv,p,in) \
28003+ dict[ DINDEX(dv,p) + drun++ ] = DENTRY(p,in); drun &= DD_MASK
28004+# define UPDATE_I(dict,drun,index,p,in) \
28005+ dict[ (index) + drun++ ] = DENTRY(p,in); drun &= DD_MASK
28006+# define UPDATE_P(ptr,drun,p,in) \
28007+ (ptr) [ drun++ ] = DENTRY(p,in); drun &= DD_MASK
28008+
28009+#endif
28010+
28011+#if defined(LZO_DICT_USE_PTR)
28012+
28013+#define LZO_CHECK_MPOS_DET(m_pos,m_off,in,ip,max_offset) \
28014+ (m_pos == NULL || (m_off = (lzo_moff_t) (ip - m_pos)) > max_offset)
28015+
28016+#define LZO_CHECK_MPOS_NON_DET(m_pos,m_off,in,ip,max_offset) \
28017+ (BOUNDS_CHECKING_OFF_IN_EXPR( \
28018+ (PTR_LT(m_pos,in) || \
28019+ (m_off = (lzo_moff_t) PTR_DIFF(ip,m_pos)) <= 0 || \
28020+ m_off > max_offset) ))
28021+
28022+#else
28023+
28024+#define LZO_CHECK_MPOS_DET(m_pos,m_off,in,ip,max_offset) \
28025+ (m_off == 0 || \
28026+ ((m_off = (lzo_moff_t) ((ip)-(in)) - m_off) > max_offset) || \
28027+ (m_pos = (ip) - (m_off), 0) )
28028+
28029+#define LZO_CHECK_MPOS_NON_DET(m_pos,m_off,in,ip,max_offset) \
28030+ ((lzo_moff_t) ((ip)-(in)) <= m_off || \
28031+ ((m_off = (lzo_moff_t) ((ip)-(in)) - m_off) > max_offset) || \
28032+ (m_pos = (ip) - (m_off), 0) )
28033+
28034+#endif
28035+
28036+#if defined(LZO_DETERMINISTIC)
28037+# define LZO_CHECK_MPOS LZO_CHECK_MPOS_DET
28038+#else
28039+# define LZO_CHECK_MPOS LZO_CHECK_MPOS_NON_DET
28040+#endif
28041+
28042+#ifdef __cplusplus
28043+}
28044+#endif
28045+#endif
28046+#endif
28047+#endif
28048+#define DO_COMPRESS lzo1x_1_compress
28049+static
28050+lzo_uint do_compress(const lzo_byte * in, lzo_uint in_len,
28051+ lzo_byte * out, lzo_uintp out_len, lzo_voidp wrkmem)
28052+{
28053+ register const lzo_byte *ip;
28054+ lzo_byte *op;
28055+ const lzo_byte *const in_end = in + in_len;
28056+ const lzo_byte *const ip_end = in + in_len - M2_MAX_LEN - 5;
28057+ const lzo_byte *ii;
28058+ lzo_dict_p const dict = (lzo_dict_p) wrkmem;
28059+
28060+ op = out;
28061+ ip = in;
28062+ ii = ip;
28063+
28064+ ip += 4;
28065+ for (;;) {
28066+ register const lzo_byte *m_pos;
28067+
28068+ lzo_moff_t m_off;
28069+ lzo_uint m_len;
28070+ lzo_uint dindex;
28071+
28072+ DINDEX1(dindex, ip);
28073+ GINDEX(m_pos, m_off, dict, dindex, in);
28074+ if (LZO_CHECK_MPOS_NON_DET(m_pos, m_off, in, ip, M4_MAX_OFFSET))
28075+ goto literal;
28076+#if 1
28077+ if (m_off <= M2_MAX_OFFSET || m_pos[3] == ip[3])
28078+ goto try_match;
28079+ DINDEX2(dindex, ip);
28080+#endif
28081+ GINDEX(m_pos, m_off, dict, dindex, in);
28082+ if (LZO_CHECK_MPOS_NON_DET(m_pos, m_off, in, ip, M4_MAX_OFFSET))
28083+ goto literal;
28084+ if (m_off <= M2_MAX_OFFSET || m_pos[3] == ip[3])
28085+ goto try_match;
28086+ goto literal;
28087+
28088+ try_match:
28089+#if 1 && defined(LZO_UNALIGNED_OK_2)
28090+ if (*(const lzo_ushortp)m_pos != *(const lzo_ushortp)ip) {
28091+#else
28092+ if (m_pos[0] != ip[0] || m_pos[1] != ip[1]) {
28093+#endif
28094+ ;
28095+ } else {
28096+ if (m_pos[2] == ip[2]) {
28097+ goto match;
28098+ } else {
28099+ ;
28100+ }
28101+ }
28102+
28103+ literal:
28104+ UPDATE_I(dict, 0, dindex, ip, in);
28105+ ++ip;
28106+ if (ip >= ip_end)
28107+ break;
28108+ continue;
28109+
28110+ match:
28111+ UPDATE_I(dict, 0, dindex, ip, in);
28112+ if (pd(ip, ii) > 0) {
28113+ register lzo_uint t = pd(ip, ii);
28114+
28115+ if (t <= 3) {
28116+ assert("lzo-04", op - 2 > out);
28117+ op[-2] |= LZO_BYTE(t);
28118+ } else if (t <= 18)
28119+ *op++ = LZO_BYTE(t - 3);
28120+ else {
28121+ register lzo_uint tt = t - 18;
28122+
28123+ *op++ = 0;
28124+ while (tt > 255) {
28125+ tt -= 255;
28126+ *op++ = 0;
28127+ }
28128+ assert("lzo-05", tt > 0);
28129+ *op++ = LZO_BYTE(tt);
28130+ }
28131+ do
28132+ *op++ = *ii++;
28133+ while (--t > 0);
28134+ }
28135+
28136+ assert("lzo-06", ii == ip);
28137+ ip += 3;
28138+ if (m_pos[3] != *ip++ || m_pos[4] != *ip++ || m_pos[5] != *ip++
28139+ || m_pos[6] != *ip++ || m_pos[7] != *ip++
28140+ || m_pos[8] != *ip++
28141+#ifdef LZO1Y
28142+ || m_pos[9] != *ip++ || m_pos[10] != *ip++
28143+ || m_pos[11] != *ip++ || m_pos[12] != *ip++
28144+ || m_pos[13] != *ip++ || m_pos[14] != *ip++
28145+#endif
28146+ ) {
28147+ --ip;
28148+ m_len = ip - ii;
28149+ assert("lzo-07", m_len >= 3);
28150+ assert("lzo-08", m_len <= M2_MAX_LEN);
28151+
28152+ if (m_off <= M2_MAX_OFFSET) {
28153+ m_off -= 1;
28154+#if defined(LZO1X)
28155+				*op++ = LZO_BYTE(((m_len - 1) << 5) |
28156+						 ((m_off & 7) << 2));
28158+ *op++ = LZO_BYTE(m_off >> 3);
28159+#elif defined(LZO1Y)
28160+				*op++ = LZO_BYTE(((m_len + 1) << 4) |
28161+						 ((m_off & 3) << 2));
28163+ *op++ = LZO_BYTE(m_off >> 2);
28164+#endif
28165+ } else if (m_off <= M3_MAX_OFFSET) {
28166+ m_off -= 1;
28167+ *op++ = LZO_BYTE(M3_MARKER | (m_len - 2));
28168+ goto m3_m4_offset;
28169+ } else
28170+#if defined(LZO1X)
28171+ {
28172+ m_off -= 0x4000;
28173+ assert("lzo-09", m_off > 0);
28174+ assert("lzo-10", m_off <= 0x7fff);
28175+ *op++ = LZO_BYTE(M4_MARKER |
28176+ ((m_off & 0x4000) >> 11) |
28177+ (m_len - 2));
28178+ goto m3_m4_offset;
28179+ }
28180+#elif defined(LZO1Y)
28181+ goto m4_match;
28182+#endif
28183+ } else {
28184+ {
28185+ const lzo_byte *end = in_end;
28186+ const lzo_byte *m = m_pos + M2_MAX_LEN + 1;
28187+ while (ip < end && *m == *ip)
28188+ m++, ip++;
28189+ m_len = (ip - ii);
28190+ }
28191+ assert("lzo-11", m_len > M2_MAX_LEN);
28192+
28193+ if (m_off <= M3_MAX_OFFSET) {
28194+ m_off -= 1;
28195+ if (m_len <= 33)
28196+					*op++ = LZO_BYTE(M3_MARKER | (m_len - 2));
28198+ else {
28199+ m_len -= 33;
28200+ *op++ = M3_MARKER | 0;
28201+ goto m3_m4_len;
28202+ }
28203+ } else {
28204+#if defined(LZO1Y)
28205+ m4_match:
28206+#endif
28207+ m_off -= 0x4000;
28208+ assert("lzo-12", m_off > 0);
28209+ assert("lzo-13", m_off <= 0x7fff);
28210+ if (m_len <= M4_MAX_LEN)
28211+				*op++ = LZO_BYTE(M4_MARKER |
28212+						 ((m_off & 0x4000) >> 11) |
28213+						 (m_len - 2));
28214+ else {
28215+ m_len -= M4_MAX_LEN;
28216+				*op++ = LZO_BYTE(M4_MARKER |
28217+						 ((m_off & 0x4000) >> 11));
28219+ m3_m4_len:
28220+ while (m_len > 255) {
28221+ m_len -= 255;
28222+ *op++ = 0;
28223+ }
28224+ assert("lzo-14", m_len > 0);
28225+ *op++ = LZO_BYTE(m_len);
28226+ }
28227+ }
28228+
28229+ m3_m4_offset:
28230+ *op++ = LZO_BYTE((m_off & 63) << 2);
28231+ *op++ = LZO_BYTE(m_off >> 6);
28232+ }
28233+
28234+ ii = ip;
28235+ if (ip >= ip_end)
28236+ break;
28237+ }
28238+
28239+ *out_len = op - out;
28240+ return pd(in_end, ii);
28241+}
28242+
28243+LZO_PUBLIC(int)
28244+ DO_COMPRESS(const lzo_byte * in, lzo_uint in_len,
28245+ lzo_byte * out, lzo_uintp out_len, lzo_voidp wrkmem)
28246+{
28247+ lzo_byte *op = out;
28248+ lzo_uint t;
28249+
28250+#if defined(__LZO_QUERY_COMPRESS)
28251+ if (__LZO_IS_COMPRESS_QUERY(in, in_len, out, out_len, wrkmem))
28252+ return __LZO_QUERY_COMPRESS(in, in_len, out, out_len, wrkmem,
28253+ D_SIZE, lzo_sizeof(lzo_dict_t));
28254+#endif
28255+
28256+ if (in_len <= M2_MAX_LEN + 5)
28257+ t = in_len;
28258+ else {
28259+ t = do_compress(in, in_len, op, out_len, wrkmem);
28260+ op += *out_len;
28261+ }
28262+
28263+ if (t > 0) {
28264+ const lzo_byte *ii = in + in_len - t;
28265+
28266+ if (op == out && t <= 238)
28267+ *op++ = LZO_BYTE(17 + t);
28268+ else if (t <= 3)
28269+ op[-2] |= LZO_BYTE(t);
28270+ else if (t <= 18)
28271+ *op++ = LZO_BYTE(t - 3);
28272+ else {
28273+ lzo_uint tt = t - 18;
28274+
28275+ *op++ = 0;
28276+ while (tt > 255) {
28277+ tt -= 255;
28278+ *op++ = 0;
28279+ }
28280+ assert("lzo-15", tt > 0);
28281+ *op++ = LZO_BYTE(tt);
28282+ }
28283+ do
28284+ *op++ = *ii++;
28285+ while (--t > 0);
28286+ }
28287+
28288+ *op++ = M4_MARKER | 1;
28289+ *op++ = 0;
28290+ *op++ = 0;
28291+
28292+ *out_len = op - out;
28293+ return LZO_E_OK;
28294+}
28295+
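+/* Usage sketch (illustrative, following the standard miniLZO calling
+ * convention; buffer names are placeholders): the caller provides D_SIZE
+ * dictionary slots of work memory and an output buffer sized for the
+ * documented worst case, since incompressible input expands slightly:
+ *
+ *     lzo_byte wrkmem[D_SIZE * sizeof(lzo_dict_t)];
+ *     lzo_uint dst_len = src_len + src_len / 16 + 64 + 3;
+ *     if (lzo1x_1_compress(src, src_len, dst, &dst_len, wrkmem) == LZO_E_OK)
+ *             ...   dst_len now holds the compressed size
+ */
+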
28296+#undef do_compress
28297+#undef DO_COMPRESS
28298+#undef LZO_HASH
28299+
28300+#undef LZO_TEST_DECOMPRESS_OVERRUN
28301+#undef LZO_TEST_DECOMPRESS_OVERRUN_INPUT
28302+#undef LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT
28303+#undef LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND
28304+#undef DO_DECOMPRESS
28305+#define DO_DECOMPRESS lzo1x_decompress
28306+
28307+#if defined(LZO_TEST_DECOMPRESS_OVERRUN)
28308+# if !defined(LZO_TEST_DECOMPRESS_OVERRUN_INPUT)
28309+# define LZO_TEST_DECOMPRESS_OVERRUN_INPUT 2
28310+# endif
28311+# if !defined(LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT)
28312+# define LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT 2
28313+# endif
28314+# if !defined(LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND)
28315+# define LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND
28316+# endif
28317+#endif
28318+
28319+#undef TEST_IP
28320+#undef TEST_OP
28321+#undef TEST_LOOKBEHIND
28322+#undef NEED_IP
28323+#undef NEED_OP
28324+#undef HAVE_TEST_IP
28325+#undef HAVE_TEST_OP
28326+#undef HAVE_NEED_IP
28327+#undef HAVE_NEED_OP
28328+#undef HAVE_ANY_IP
28329+#undef HAVE_ANY_OP
28330+
28331+#if defined(LZO_TEST_DECOMPRESS_OVERRUN_INPUT)
28332+# if (LZO_TEST_DECOMPRESS_OVERRUN_INPUT >= 1)
28333+# define TEST_IP (ip < ip_end)
28334+# endif
28335+# if (LZO_TEST_DECOMPRESS_OVERRUN_INPUT >= 2)
28336+# define NEED_IP(x) \
28337+ if ((lzo_uint)(ip_end - ip) < (lzo_uint)(x)) goto input_overrun
28338+# endif
28339+#endif
28340+
28341+#if defined(LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT)
28342+# if (LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT >= 1)
28343+# define TEST_OP (op <= op_end)
28344+# endif
28345+# if (LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT >= 2)
28346+# undef TEST_OP
28347+# define NEED_OP(x) \
28348+ if ((lzo_uint)(op_end - op) < (lzo_uint)(x)) goto output_overrun
28349+# endif
28350+#endif
28351+
28352+#if defined(LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND)
28353+# define TEST_LOOKBEHIND(m_pos,out) if (m_pos < out) goto lookbehind_overrun
28354+#else
28355+# define TEST_LOOKBEHIND(m_pos,op) ((void) 0)
28356+#endif
28357+
28358+#if !defined(LZO_EOF_CODE) && !defined(TEST_IP)
28359+# define TEST_IP (ip < ip_end)
28360+#endif
28361+
28362+#if defined(TEST_IP)
28363+# define HAVE_TEST_IP
28364+#else
28365+# define TEST_IP 1
28366+#endif
28367+#if defined(TEST_OP)
28368+# define HAVE_TEST_OP
28369+#else
28370+# define TEST_OP 1
28371+#endif
28372+
28373+#if defined(NEED_IP)
28374+# define HAVE_NEED_IP
28375+#else
28376+# define NEED_IP(x) ((void) 0)
28377+#endif
28378+#if defined(NEED_OP)
28379+# define HAVE_NEED_OP
28380+#else
28381+# define NEED_OP(x) ((void) 0)
28382+#endif
28383+
28384+#if defined(HAVE_TEST_IP) || defined(HAVE_NEED_IP)
28385+# define HAVE_ANY_IP
28386+#endif
28387+#if defined(HAVE_TEST_OP) || defined(HAVE_NEED_OP)
28388+# define HAVE_ANY_OP
28389+#endif
28390+
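+/* Explanatory note (not in the original): when the decompressor is built
+ * with LZO_TEST_DECOMPRESS_OVERRUN, NEED_IP(x)/NEED_OP(x) verify before
+ * every copy that x bytes of input/output remain and jump to the
+ * input_overrun/output_overrun exits (LZO_E_INPUT_OVERRUN,
+ * LZO_E_OUTPUT_OVERRUN), while TEST_LOOKBEHIND rejects back-references
+ * before the start of the output buffer (LZO_E_LOOKBEHIND_OVERRUN).
+ * Without it they compile to no-ops, as in the lzo1x_decompress build
+ * below.
+ */
+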
28391+#undef __COPY4
28392+#define __COPY4(dst,src) * (lzo_uint32p)(dst) = * (const lzo_uint32p)(src)
28393+
28394+#undef COPY4
28395+#if defined(LZO_UNALIGNED_OK_4)
28396+# define COPY4(dst,src) __COPY4(dst,src)
28397+#elif defined(LZO_ALIGNED_OK_4)
28398+# define COPY4(dst,src) __COPY4((lzo_ptr_t)(dst),(lzo_ptr_t)(src))
28399+#endif
28400+
28401+#if defined(DO_DECOMPRESS)
28402+LZO_PUBLIC(int)
28403+ DO_DECOMPRESS(const lzo_byte * in, lzo_uint in_len,
28404+ lzo_byte * out, lzo_uintp out_len, lzo_voidp wrkmem)
28405+#endif
28406+{
28407+ register lzo_byte *op;
28408+ register const lzo_byte *ip;
28409+ register lzo_uint t;
28410+#if defined(COPY_DICT)
28411+ lzo_uint m_off;
28412+ const lzo_byte *dict_end;
28413+#else
28414+ register const lzo_byte *m_pos;
28415+#endif
28416+
28417+ const lzo_byte *const ip_end = in + in_len;
28418+#if defined(HAVE_ANY_OP)
28419+ lzo_byte *const op_end = out + *out_len;
28420+#endif
28421+#if defined(LZO1Z)
28422+ lzo_uint last_m_off = 0;
28423+#endif
28424+
28425+ LZO_UNUSED(wrkmem);
28426+
28427+#if defined(__LZO_QUERY_DECOMPRESS)
28428+ if (__LZO_IS_DECOMPRESS_QUERY(in, in_len, out, out_len, wrkmem))
28429+ return __LZO_QUERY_DECOMPRESS(in, in_len, out, out_len, wrkmem,
28430+ 0, 0);
28431+#endif
28432+
28433+#if defined(COPY_DICT)
28434+ if (dict) {
28435+ if (dict_len > M4_MAX_OFFSET) {
28436+ dict += dict_len - M4_MAX_OFFSET;
28437+ dict_len = M4_MAX_OFFSET;
28438+ }
28439+ dict_end = dict + dict_len;
28440+ } else {
28441+ dict_len = 0;
28442+ dict_end = NULL;
28443+ }
28444+#endif
28445+
28446+ *out_len = 0;
28447+
28448+ op = out;
28449+ ip = in;
28450+
28451+ if (*ip > 17) {
28452+ t = *ip++ - 17;
28453+ if (t < 4)
28454+ goto match_next;
28455+ assert("lzo-16", t > 0);
28456+ NEED_OP(t);
28457+ NEED_IP(t + 1);
28458+ do
28459+ *op++ = *ip++;
28460+ while (--t > 0);
28461+ goto first_literal_run;
28462+ }
28463+
28464+ while (TEST_IP && TEST_OP) {
28465+ t = *ip++;
28466+ if (t >= 16)
28467+ goto match;
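+ /* a zero count byte is run-length extended: each additional 0x00
+ adds 255, and the final non-zero byte adds 15 plus its value */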
28468+ if (t == 0) {
28469+ NEED_IP(1);
28470+ while (*ip == 0) {
28471+ t += 255;
28472+ ip++;
28473+ NEED_IP(1);
28474+ }
28475+ t += 15 + *ip++;
28476+ }
28477+ assert("lzo-17", t > 0);
28478+ NEED_OP(t + 3);
28479+ NEED_IP(t + 4);
28480+#if defined(LZO_UNALIGNED_OK_4) || defined(LZO_ALIGNED_OK_4)
28481+#if !defined(LZO_UNALIGNED_OK_4)
28482+ if (PTR_ALIGNED2_4(op, ip)) {
28483+#endif
28484+ COPY4(op, ip);
28485+ op += 4;
28486+ ip += 4;
28487+ if (--t > 0) {
28488+ if (t >= 4) {
28489+ do {
28490+ COPY4(op, ip);
28491+ op += 4;
28492+ ip += 4;
28493+ t -= 4;
28494+ } while (t >= 4);
28495+ if (t > 0)
28496+ do
28497+ *op++ = *ip++;
28498+ while (--t > 0);
28499+ } else
28500+ do
28501+ *op++ = *ip++;
28502+ while (--t > 0);
28503+ }
28504+#if !defined(LZO_UNALIGNED_OK_4)
28505+ } else
28506+#endif
28507+#endif
28508+#if !defined(LZO_UNALIGNED_OK_4)
28509+ {
28510+ *op++ = *ip++;
28511+ *op++ = *ip++;
28512+ *op++ = *ip++;
28513+ do
28514+ *op++ = *ip++;
28515+ while (--t > 0);
28516+ }
28517+#endif
28518+
28519+ first_literal_run:
28520+
28521+ t = *ip++;
28522+ if (t >= 16)
28523+ goto match;
28524+#if defined(COPY_DICT)
28525+#if defined(LZO1Z)
28526+ m_off = (1 + M2_MAX_OFFSET) + (t << 6) + (*ip++ >> 2);
28527+ last_m_off = m_off;
28528+#else
28529+ m_off = (1 + M2_MAX_OFFSET) + (t >> 2) + (*ip++ << 2);
28530+#endif
28531+ NEED_OP(3);
28532+ t = 3;
28533+ COPY_DICT(t, m_off)
28534+#else
28535+#if defined(LZO1Z)
28536+ t = (1 + M2_MAX_OFFSET) + (t << 6) + (*ip++ >> 2);
28537+ m_pos = op - t;
28538+ last_m_off = t;
28539+#else
28540+ m_pos = op - (1 + M2_MAX_OFFSET);
28541+ m_pos -= t >> 2;
28542+ m_pos -= *ip++ << 2;
28543+#endif
28544+ TEST_LOOKBEHIND(m_pos, out);
28545+ NEED_OP(3);
28546+ *op++ = *m_pos++;
28547+ *op++ = *m_pos++;
28548+ *op++ = *m_pos;
28549+#endif
28550+ goto match_done;
28551+
28552+ while (TEST_IP && TEST_OP) {
28553+ match:
28554+ if (t >= 64) {
28555+#if defined(COPY_DICT)
28556+#if defined(LZO1X)
28557+ m_off = 1 + ((t >> 2) & 7) + (*ip++ << 3);
28558+ t = (t >> 5) - 1;
28559+#elif defined(LZO1Y)
28560+ m_off = 1 + ((t >> 2) & 3) + (*ip++ << 2);
28561+ t = (t >> 4) - 3;
28562+#elif defined(LZO1Z)
28563+ m_off = t & 0x1f;
28564+ if (m_off >= 0x1c)
28565+ m_off = last_m_off;
28566+ else {
28567+ m_off = 1 + (m_off << 6) + (*ip++ >> 2);
28568+ last_m_off = m_off;
28569+ }
28570+ t = (t >> 5) - 1;
28571+#endif
28572+#else
28573+#if defined(LZO1X)
28574+ m_pos = op - 1;
28575+ m_pos -= (t >> 2) & 7;
28576+ m_pos -= *ip++ << 3;
28577+ t = (t >> 5) - 1;
28578+#elif defined(LZO1Y)
28579+ m_pos = op - 1;
28580+ m_pos -= (t >> 2) & 3;
28581+ m_pos -= *ip++ << 2;
28582+ t = (t >> 4) - 3;
28583+#elif defined(LZO1Z)
28584+ {
28585+ lzo_uint off = t & 0x1f;
28586+ m_pos = op;
28587+ if (off >= 0x1c) {
28588+ assert(last_m_off > 0);
28589+ m_pos -= last_m_off;
28590+ } else {
28591+ off =
28592+ 1 + (off << 6) +
28593+ (*ip++ >> 2);
28594+ m_pos -= off;
28595+ last_m_off = off;
28596+ }
28597+ }
28598+ t = (t >> 5) - 1;
28599+#endif
28600+ TEST_LOOKBEHIND(m_pos, out);
28601+ assert("lzo-18", t > 0);
28602+ NEED_OP(t + 3 - 1);
28603+ goto copy_match;
28604+#endif
28605+ } else if (t >= 32) {
28606+ t &= 31;
28607+ if (t == 0) {
28608+ NEED_IP(1);
28609+ while (*ip == 0) {
28610+ t += 255;
28611+ ip++;
28612+ NEED_IP(1);
28613+ }
28614+ t += 31 + *ip++;
28615+ }
28616+#if defined(COPY_DICT)
28617+#if defined(LZO1Z)
28618+ m_off = 1 + (ip[0] << 6) + (ip[1] >> 2);
28619+ last_m_off = m_off;
28620+#else
28621+ m_off = 1 + (ip[0] >> 2) + (ip[1] << 6);
28622+#endif
28623+#else
28624+#if defined(LZO1Z)
28625+ {
28626+ lzo_uint off =
28627+ 1 + (ip[0] << 6) + (ip[1] >> 2);
28628+ m_pos = op - off;
28629+ last_m_off = off;
28630+ }
28631+#elif defined(LZO_UNALIGNED_OK_2) && (LZO_BYTE_ORDER == LZO_LITTLE_ENDIAN)
28632+ m_pos = op - 1;
28633+ m_pos -= (*(const lzo_ushortp)ip) >> 2;
28634+#else
28635+ m_pos = op - 1;
28636+ m_pos -= (ip[0] >> 2) + (ip[1] << 6);
28637+#endif
28638+#endif
28639+ ip += 2;
28640+ } else if (t >= 16) {
28641+#if defined(COPY_DICT)
28642+ m_off = (t & 8) << 11;
28643+#else
28644+ m_pos = op;
28645+ m_pos -= (t & 8) << 11;
28646+#endif
28647+ t &= 7;
28648+ if (t == 0) {
28649+ NEED_IP(1);
28650+ while (*ip == 0) {
28651+ t += 255;
28652+ ip++;
28653+ NEED_IP(1);
28654+ }
28655+ t += 7 + *ip++;
28656+ }
28657+#if defined(COPY_DICT)
28658+#if defined(LZO1Z)
28659+ m_off += (ip[0] << 6) + (ip[1] >> 2);
28660+#else
28661+ m_off += (ip[0] >> 2) + (ip[1] << 6);
28662+#endif
28663+ ip += 2;
28664+ if (m_off == 0)
28665+ goto eof_found;
28666+ m_off += 0x4000;
28667+#if defined(LZO1Z)
28668+ last_m_off = m_off;
28669+#endif
28670+#else
28671+#if defined(LZO1Z)
28672+ m_pos -= (ip[0] << 6) + (ip[1] >> 2);
28673+#elif defined(LZO_UNALIGNED_OK_2) && (LZO_BYTE_ORDER == LZO_LITTLE_ENDIAN)
28674+ m_pos -= (*(const lzo_ushortp)ip) >> 2;
28675+#else
28676+ m_pos -= (ip[0] >> 2) + (ip[1] << 6);
28677+#endif
28678+ ip += 2;
28679+ if (m_pos == op)
28680+ goto eof_found;
28681+ m_pos -= 0x4000;
28682+#if defined(LZO1Z)
28683+ last_m_off = op - m_pos;
28684+#endif
28685+#endif
28686+ } else {
28687+#if defined(COPY_DICT)
28688+#if defined(LZO1Z)
28689+ m_off = 1 + (t << 6) + (*ip++ >> 2);
28690+ last_m_off = m_off;
28691+#else
28692+ m_off = 1 + (t >> 2) + (*ip++ << 2);
28693+#endif
28694+ NEED_OP(2);
28695+ t = 2;
28696+ COPY_DICT(t, m_off)
28697+#else
28698+#if defined(LZO1Z)
28699+ t = 1 + (t << 6) + (*ip++ >> 2);
28700+ m_pos = op - t;
28701+ last_m_off = t;
28702+#else
28703+ m_pos = op - 1;
28704+ m_pos -= t >> 2;
28705+ m_pos -= *ip++ << 2;
28706+#endif
28707+ TEST_LOOKBEHIND(m_pos, out);
28708+ NEED_OP(2);
28709+ *op++ = *m_pos++;
28710+ *op++ = *m_pos;
28711+#endif
28712+ goto match_done;
28713+ }
28714+
28715+#if defined(COPY_DICT)
28716+
28717+ NEED_OP(t + 3 - 1);
28718+ t += 3 - 1;
28719+ COPY_DICT(t, m_off)
28720+#else
28721+
28722+ TEST_LOOKBEHIND(m_pos, out);
28723+ assert("lzo-19", t > 0);
28724+ NEED_OP(t + 3 - 1);
28725+#if defined(LZO_UNALIGNED_OK_4) || defined(LZO_ALIGNED_OK_4)
28726+#if !defined(LZO_UNALIGNED_OK_4)
28727+ if (t >= 2 * 4 - (3 - 1) && PTR_ALIGNED2_4(op, m_pos)) {
28728+ assert((op - m_pos) >= 4);
28729+#else
28730+ if (t >= 2 * 4 - (3 - 1) && (op - m_pos) >= 4) {
28731+#endif
28732+ COPY4(op, m_pos);
28733+ op += 4;
28734+ m_pos += 4;
28735+ t -= 4 - (3 - 1);
28736+ do {
28737+ COPY4(op, m_pos);
28738+ op += 4;
28739+ m_pos += 4;
28740+ t -= 4;
28741+ } while (t >= 4);
28742+ if (t > 0)
28743+ do
28744+ *op++ = *m_pos++;
28745+ while (--t > 0);
28746+ } else
28747+#endif
28748+ {
28749+ copy_match:
28750+ *op++ = *m_pos++;
28751+ *op++ = *m_pos++;
28752+ do
28753+ *op++ = *m_pos++;
28754+ while (--t > 0);
28755+ }
28756+
28757+#endif
28758+
28759+ match_done:
28760+#if defined(LZO1Z)
28761+ t = ip[-1] & 3;
28762+#else
28763+ t = ip[-2] & 3;
28764+#endif
28765+ if (t == 0)
28766+ break;
28767+
28768+ match_next:
28769+ assert("lzo-20", t > 0);
28770+ NEED_OP(t);
28771+ NEED_IP(t + 1);
28772+ do
28773+ *op++ = *ip++;
28774+ while (--t > 0);
28775+ t = *ip++;
28776+ }
28777+ }
28778+
28779+#if defined(HAVE_TEST_IP) || defined(HAVE_TEST_OP)
28780+ *out_len = op - out;
28781+ return LZO_E_EOF_NOT_FOUND;
28782+#endif
28783+
28784+ eof_found:
28785+ assert("lzo-21", t == 1);
28786+ *out_len = op - out;
28787+ return (ip == ip_end ? LZO_E_OK :
28788+ (ip < ip_end ? LZO_E_INPUT_NOT_CONSUMED : LZO_E_INPUT_OVERRUN));
28789+
28790+#if defined(HAVE_NEED_IP)
28791+ input_overrun:
28792+ *out_len = op - out;
28793+ return LZO_E_INPUT_OVERRUN;
28794+#endif
28795+
28796+#if defined(HAVE_NEED_OP)
28797+ output_overrun:
28798+ *out_len = op - out;
28799+ return LZO_E_OUTPUT_OVERRUN;
28800+#endif
28801+
28802+#if defined(LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND)
28803+ lookbehind_overrun:
28804+ *out_len = op - out;
28805+ return LZO_E_LOOKBEHIND_OVERRUN;
28806+#endif
28807+}
28808+
28809+#define LZO_TEST_DECOMPRESS_OVERRUN
28810+#undef DO_DECOMPRESS
28811+#define DO_DECOMPRESS lzo1x_decompress_safe
28812+
28813+#if defined(LZO_TEST_DECOMPRESS_OVERRUN)
28814+# if !defined(LZO_TEST_DECOMPRESS_OVERRUN_INPUT)
28815+# define LZO_TEST_DECOMPRESS_OVERRUN_INPUT 2
28816+# endif
28817+# if !defined(LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT)
28818+# define LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT 2
28819+# endif
28820+# if !defined(LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND)
28821+# define LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND
28822+# endif
28823+#endif
28824+
28825+#undef TEST_IP
28826+#undef TEST_OP
28827+#undef TEST_LOOKBEHIND
28828+#undef NEED_IP
28829+#undef NEED_OP
28830+#undef HAVE_TEST_IP
28831+#undef HAVE_TEST_OP
28832+#undef HAVE_NEED_IP
28833+#undef HAVE_NEED_OP
28834+#undef HAVE_ANY_IP
28835+#undef HAVE_ANY_OP
28836+
28837+#if defined(LZO_TEST_DECOMPRESS_OVERRUN_INPUT)
28838+# if (LZO_TEST_DECOMPRESS_OVERRUN_INPUT >= 1)
28839+# define TEST_IP (ip < ip_end)
28840+# endif
28841+# if (LZO_TEST_DECOMPRESS_OVERRUN_INPUT >= 2)
28842+# define NEED_IP(x) \
28843+ if ((lzo_uint)(ip_end - ip) < (lzo_uint)(x)) goto input_overrun
28844+# endif
28845+#endif
28846+
28847+#if defined(LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT)
28848+# if (LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT >= 1)
28849+# define TEST_OP (op <= op_end)
28850+# endif
28851+# if (LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT >= 2)
28852+# undef TEST_OP
28853+# define NEED_OP(x) \
28854+ if ((lzo_uint)(op_end - op) < (lzo_uint)(x)) goto output_overrun
28855+# endif
28856+#endif
28857+
28858+#if defined(LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND)
28859+# define TEST_LOOKBEHIND(m_pos,out) if (m_pos < out) goto lookbehind_overrun
28860+#else
28861+# define TEST_LOOKBEHIND(m_pos,op) ((void) 0)
28862+#endif
28863+
28864+#if !defined(LZO_EOF_CODE) && !defined(TEST_IP)
28865+# define TEST_IP (ip < ip_end)
28866+#endif
28867+
28868+#if defined(TEST_IP)
28869+# define HAVE_TEST_IP
28870+#else
28871+# define TEST_IP 1
28872+#endif
28873+#if defined(TEST_OP)
28874+# define HAVE_TEST_OP
28875+#else
28876+# define TEST_OP 1
28877+#endif
28878+
28879+#if defined(NEED_IP)
28880+# define HAVE_NEED_IP
28881+#else
28882+# define NEED_IP(x) ((void) 0)
28883+#endif
28884+#if defined(NEED_OP)
28885+# define HAVE_NEED_OP
28886+#else
28887+# define NEED_OP(x) ((void) 0)
28888+#endif
28889+
28890+#if defined(HAVE_TEST_IP) || defined(HAVE_NEED_IP)
28891+# define HAVE_ANY_IP
28892+#endif
28893+#if defined(HAVE_TEST_OP) || defined(HAVE_NEED_OP)
28894+# define HAVE_ANY_OP
28895+#endif
28896+
28897+#undef __COPY4
28898+#define __COPY4(dst,src) * (lzo_uint32p)(dst) = * (const lzo_uint32p)(src)
28899+
28900+#undef COPY4
28901+#if defined(LZO_UNALIGNED_OK_4)
28902+# define COPY4(dst,src) __COPY4(dst,src)
28903+#elif defined(LZO_ALIGNED_OK_4)
28904+# define COPY4(dst,src) __COPY4((lzo_ptr_t)(dst),(lzo_ptr_t)(src))
28905+#endif
28906+
28907+/***** End of minilzo.c *****/
28908Index: linux-2.6.16/fs/reiser4/plugin/compress/minilzo.h
28909===================================================================
28910--- /dev/null
28911+++ linux-2.6.16/fs/reiser4/plugin/compress/minilzo.h
28912@@ -0,0 +1,94 @@
28913+/* minilzo.h -- mini subset of the LZO real-time data compression library
28914+ adapted for the reiser4 compression transform plugin.
28915+
28916+ This file is part of the LZO real-time data compression library
28917+ and not included in any proprietary licenses of reiser4.
28918+
28919+ Copyright (C) 2002 Markus Franz Xaver Johannes Oberhumer
28920+ Copyright (C) 2001 Markus Franz Xaver Johannes Oberhumer
28921+ Copyright (C) 2000 Markus Franz Xaver Johannes Oberhumer
28922+ Copyright (C) 1999 Markus Franz Xaver Johannes Oberhumer
28923+ Copyright (C) 1998 Markus Franz Xaver Johannes Oberhumer
28924+ Copyright (C) 1997 Markus Franz Xaver Johannes Oberhumer
28925+ Copyright (C) 1996 Markus Franz Xaver Johannes Oberhumer
28926+ All Rights Reserved.
28927+
28928+ The LZO library is free software; you can redistribute it and/or
28929+ modify it under the terms of the GNU General Public License as
28930+ published by the Free Software Foundation; either version 2 of
28931+ the License, or (at your option) any later version.
28932+
28933+ The LZO library is distributed in the hope that it will be useful,
28934+ but WITHOUT ANY WARRANTY; without even the implied warranty of
28935+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
28936+ GNU General Public License for more details.
28937+
28938+ You should have received a copy of the GNU General Public License
28939+ along with the LZO library; see the file COPYING.
28940+ If not, write to the Free Software Foundation, Inc.,
28941+ 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
28942+
28943+ Markus F.X.J. Oberhumer
28944+ <markus@oberhumer.com>
28945+ http://www.oberhumer.com/opensource/lzo/
28946+ */
28947+
28948+/*
28949+ * NOTE:
28950+ * the full LZO package can be found at
28951+ * http://www.oberhumer.com/opensource/lzo/
28952+ */
28953+
28954+#ifndef __MINILZO_H
28955+#define __MINILZO_H
28956+
28957+#define MINILZO_VERSION 0x1080
28958+
28959+#ifdef __LZOCONF_H
28960+# error "you cannot use both LZO and miniLZO"
28961+#endif
28962+
28963+#undef LZO_HAVE_CONFIG_H
28964+#include "lzoconf.h"
28965+
28966+#if !defined(LZO_VERSION) || (LZO_VERSION != MINILZO_VERSION)
28967+# error "version mismatch in header files"
28968+#endif
28969+
28970+#ifdef __cplusplus
28971+extern "C" {
28972+#endif
28973+
28974+/***********************************************************************
28975+//
28976+************************************************************************/
28977+
28978+/* Memory required for the wrkmem parameter.
28979+ * When the required size is 0, you can also pass a NULL pointer.
28980+ */
28981+
28982+#define LZO1X_MEM_COMPRESS LZO1X_1_MEM_COMPRESS
28983+#define LZO1X_1_MEM_COMPRESS ((lzo_uint32) (16384L * lzo_sizeof_dict_t))
28984+#define LZO1X_MEM_DECOMPRESS (0)
28985+
28986+/* compression */
28987+ LZO_EXTERN(int)
28988+ lzo1x_1_compress(const lzo_byte * src, lzo_uint src_len,
28989+ lzo_byte * dst, lzo_uintp dst_len, lzo_voidp wrkmem);
28990+
28991+/* decompression */
28992+ LZO_EXTERN(int)
28993+ lzo1x_decompress(const lzo_byte * src, lzo_uint src_len,
28994+ lzo_byte * dst, lzo_uintp dst_len,
28995+ lzo_voidp wrkmem /* NOT USED */ );
28996+
28997+/* safe decompression with overrun testing */
28998+ LZO_EXTERN(int)
28999+ lzo1x_decompress_safe(const lzo_byte * src, lzo_uint src_len,
29000+ lzo_byte * dst, lzo_uintp dst_len,
29001+ lzo_voidp wrkmem /* NOT USED */ );
29002+
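+/* Usage sketch (illustrative only; src/dst/back and their lengths are
+ * hypothetical buffers, not part of this API). wrkmem must provide
+ * LZO1X_1_MEM_COMPRESS bytes (tens of kilobytes, so allocate it rather
+ * than placing it on the stack), and dst should allow for the usual
+ * LZO1X worst-case expansion bound:
+ *
+ *	lzo_uint dst_len = src_len + src_len / 16 + 64 + 3;
+ *	int err = lzo1x_1_compress(src, src_len, dst, &dst_len, wrkmem);
+ *	if (err == LZO_E_OK)
+ *		err = lzo1x_decompress_safe(dst, dst_len, back, &back_len,
+ *					    NULL);
+ *
+ * LZO1X_MEM_DECOMPRESS is 0, so NULL is a valid wrkmem for decompression.
+ */
+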
29003+#ifdef __cplusplus
29004+} /* extern "C" */
29005+#endif
29006+#endif /* already included */
29007Index: linux-2.6.16/fs/reiser4/plugin/crypto/cipher.c
29008===================================================================
29009--- /dev/null
29010+++ linux-2.6.16/fs/reiser4/plugin/crypto/cipher.c
29011@@ -0,0 +1,116 @@
29012+/* Copyright 2001, 2002, 2003 by Hans Reiser,
29013+ licensing governed by reiser4/README */
29014+/* Reiser4 cipher transform plugins */
29015+
29016+#include "../../debug.h"
29017+#include "../plugin.h"
29018+#include "../file/cryptcompress.h"
29019+#include <linux/types.h>
29020+#include <linux/random.h>
29021+
29022+#define MIN_CIPHER_BLOCKSIZE 8
29023+#define MAX_CIPHER_BLOCKSIZE 128
29024+
29025+/*
29026+ Default align() method of the cipher plugin (look for description of this
29027+ method in plugin/plugin.h)
29028+
29029+ 1) creates the aligning armored format of the input flow before encryption.
29030+ "armored" means that padding is filled by private data (for example,
29031+ pseudo-random sequence of bytes is not private data).
29032+ 2) returns length of appended padding
29033+
29034+ [ flow | aligning_padding ]
29035+ ^
29036+ |
29037+ @pad
29038+*/
29039+static int align_stream_common(__u8 * pad,
29040+ int flow_size /* size of non-aligned flow */,
29041+ int blocksize /* cipher block size */)
29042+{
29043+ int pad_size;
29044+
29045+ assert("edward-01", pad != NULL);
29046+ assert("edward-02", flow_size != 0);
29047+ assert("edward-03", blocksize != 0
29048+ && blocksize <= MAX_CIPHER_BLOCKSIZE);
29049+
29050+ pad_size = blocksize - (flow_size % blocksize);
29051+ get_random_bytes(pad, pad_size);
29052+ return pad_size;
29053+}
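+
+/* Example: for flow_size = 10 and blocksize = 16 this appends
+ pad_size = 16 - (10 % 16) = 6 random bytes. An already aligned flow
+ still receives a full block of padding (pad_size = blocksize), so the
+ armored flow is always strictly longer than the input. */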
29054+
29055+/* This is used for all the cipher algorithms which do not inflate
29056+ block-aligned data */
29057+static loff_t scale_common(struct inode *inode, size_t blocksize,
29058+ loff_t src_off /* offset to scale */ )
29059+{
29060+ return src_off;
29061+}
29062+
29063+static void free_aes (struct crypto_tfm * tfm)
29064+{
29065+#if REISER4_AES
29066+ crypto_free_tfm(tfm);
29067+#endif
29068+ return;
29069+}
29070+
29071+static struct crypto_tfm * alloc_aes (void)
29072+{
29073+#if REISER4_AES
29074+ return crypto_alloc_tfm ("aes", 0);
29075+#else
29076+ warning("edward-1417", "aes unsupported");
29077+ return ERR_PTR(-EINVAL);
29078+#endif /* REISER4_AES */
29079+}
29080+
29081+cipher_plugin cipher_plugins[LAST_CIPHER_ID] = {
29082+ [NONE_CIPHER_ID] = {
29083+ .h = {
29084+ .type_id = REISER4_CIPHER_PLUGIN_TYPE,
29085+ .id = NONE_CIPHER_ID,
29086+ .pops = NULL,
29087+ .label = "none",
29088+ .desc = "no cipher transform",
29089+ .linkage = {NULL, NULL}
29090+ },
29091+ .alloc = NULL,
29092+ .free = NULL,
29093+ .scale = NULL,
29094+ .align_stream = NULL,
29095+ .setkey = NULL,
29096+ .encrypt = NULL,
29097+ .decrypt = NULL
29098+ },
29099+ [AES_CIPHER_ID] = {
29100+ .h = {
29101+ .type_id = REISER4_CIPHER_PLUGIN_TYPE,
29102+ .id = AES_CIPHER_ID,
29103+ .pops = NULL,
29104+ .label = "aes",
29105+ .desc = "aes cipher transform",
29106+ .linkage = {NULL, NULL}
29107+ },
29108+ .alloc = alloc_aes,
29109+ .free = free_aes,
29110+ .scale = scale_common,
29111+ .align_stream = align_stream_common,
29112+ .setkey = NULL,
29113+ .encrypt = NULL,
29114+ .decrypt = NULL
29115+ }
29116+};
29117+
29118+/* Make Linus happy.
29119+ Local variables:
29120+ c-indentation-style: "K&R"
29121+ mode-name: "LC"
29122+ c-basic-offset: 8
29123+ tab-width: 8
29124+ fill-column: 120
29125+ scroll-step: 1
29126+ End:
29127+*/
29128Index: linux-2.6.16/fs/reiser4/plugin/crypto/cipher.h
29129===================================================================
29130--- /dev/null
29131+++ linux-2.6.16/fs/reiser4/plugin/crypto/cipher.h
29132@@ -0,0 +1,67 @@
29133+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
29134+/* This file contains definitions for the objects operated
29135+ by reiser4 key manager, which is something like keyring
29136+ wrapped by appropriate reiser4 plugin */
29137+
29138+#if !defined( __FS_REISER4_CRYPT_H__ )
29139+#define __FS_REISER4_CRYPT_H__
29140+
29141+#include <linux/crypto.h>
29142+
29143+
29144+/* Transform actions involved in ciphering process and
29145+ supported by reiser4 via appropriate transform plugins */
29146+typedef enum {
29147+ CIPHER_TFM, /* cipher transform */
29148+ DIGEST_TFM, /* digest transform */
29149+ LAST_TFM
29150+} reiser4_tfm;
29151+
29152+/* This represents a transform action in reiser4 */
29153+typedef struct reiser4_tfma {
29154+ reiser4_plugin * plug; /* transform plugin */
29155+ struct crypto_tfm * tfm; /* low-level info, operated by
29156+ linux crypto-api (see linux/crypto) */
29157+} reiser4_tfma_t;
29158+
29159+/* key info imported from user space */
29160+typedef struct crypto_data {
29161+ int keysize; /* uninstantiated key size */
29162+ __u8 * key; /* uninstantiated key */
29163+ int keyid_size; /* size of passphrase */
29164+ __u8 * keyid; /* passphrase */
29165+} crypto_data_t;
29166+
29167+/* This object contains all needed infrastructure to implement
29168+ cipher transform. This is operated (allocating, inheriting,
29169+ validating, binding to host inode, etc..) by reiser4 key manager.
29170+
29171+ This info can be allocated in two cases:
29172+ 1. importing a key from user space.
29173+ 2. reading inode from disk */
29174+typedef struct crypto_stat {
29175+ reiser4_tfma_t tfma[LAST_TFM];
29176+// cipher_key_plugin * kplug; /* key manager */
29177+ __u8 * keyid; /* key fingerprint, created by digest plugin,
29178+ using uninstantiated key and passphrase.
29179+ supposed to be stored in disk stat-data */
29180+ int inst; /* this indicates if the cipher key is
29181+ instantiated (case 1 above) */
29182+ int keysize; /* uninstantiated key size (bytes), supposed
29183+ to be stored in disk stat-data */
29184+ int keyload_count; /* number of objects which have this
29185+ crypto-stat attached */
29186+} crypto_stat_t;
29187+
29188+#endif /* __FS_REISER4_CRYPT_H__ */
29189+
29190+/*
29191+ Local variables:
29192+ c-indentation-style: "K&R"
29193+ mode-name: "LC"
29194+ c-basic-offset: 8
29195+ tab-width: 8
29196+ fill-column: 120
29197+ scroll-step: 1
29198+ End:
29199+*/
29200Index: linux-2.6.16/fs/reiser4/plugin/crypto/digest.c
29201===================================================================
29202--- /dev/null
29203+++ linux-2.6.16/fs/reiser4/plugin/crypto/digest.c
29204@@ -0,0 +1,58 @@
29205+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
29206+
29207+/* reiser4 digest transform plugin (used by the cryptcompress object plugin).
29208+   A digest is a fixed-size hash fingerprint of a data stream; here it is used
+   to fingerprint cipher keys (see the keyid field in plugin/crypto/cipher.h). */
29209+#include "../../debug.h"
29210+#include "../plugin_header.h"
29211+#include "../plugin.h"
29212+#include "../file/cryptcompress.h"
29213+
29214+#include <linux/types.h>
29215+
29216+extern digest_plugin digest_plugins[LAST_DIGEST_ID];
29217+
29218+static struct crypto_tfm * alloc_sha256 (void)
29219+{
29220+#if REISER4_SHA256
29221+ return crypto_alloc_tfm ("sha256", 0);
29222+#else
29223+ warning("edward-1418", "sha256 unsupported");
29224+ return ERR_PTR(-EINVAL);
29225+#endif
29226+}
29227+
29228+static void free_sha256 (struct crypto_tfm * tfm)
29229+{
29230+#if REISER4_SHA256
29231+ crypto_free_tfm(tfm);
29232+#endif
29233+ return;
29234+}
29235+
29236+/* digest plugins */
29237+digest_plugin digest_plugins[LAST_DIGEST_ID] = {
29238+ [SHA256_32_DIGEST_ID] = {
29239+ .h = {
29240+ .type_id = REISER4_DIGEST_PLUGIN_TYPE,
29241+ .id = SHA256_32_DIGEST_ID,
29242+ .pops = NULL,
29243+ .label = "sha256_32",
29244+ .desc = "sha256_32 digest transform",
29245+ .linkage = {NULL, NULL}
29246+ },
29247+ .fipsize = sizeof(__u32),
29248+ .alloc = alloc_sha256,
29249+ .free = free_sha256
29250+ }
29251+};
29252+
29253+/*
29254+ Local variables:
29255+ c-indentation-style: "K&R"
29256+ mode-name: "LC"
29257+ c-basic-offset: 8
29258+ tab-width: 8
29259+ fill-column: 120
29260+ scroll-step: 1
29261+ End:
29262+*/
29263Index: linux-2.6.16/fs/reiser4/plugin/dir/Makefile
29264===================================================================
29265--- /dev/null
29266+++ linux-2.6.16/fs/reiser4/plugin/dir/Makefile
29267@@ -0,0 +1,5 @@
29268+obj-$(CONFIG_REISER4_FS) += dir_plugins.o
29269+
29270+dir_plugins-objs := \
29271+ hashed_dir.o \
29272+ seekable_dir.o
29273Index: linux-2.6.16/fs/reiser4/plugin/dir/dir.h
29274===================================================================
29275--- /dev/null
29276+++ linux-2.6.16/fs/reiser4/plugin/dir/dir.h
29277@@ -0,0 +1,36 @@
29278+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
29279+ * reiser4/README */
29280+
29281+/* this file contains declarations of methods implementing directory plugins */
29282+
29283+#if !defined( __REISER4_DIR_H__ )
29284+#define __REISER4_DIR_H__
29285+
29286+/*#include "../../key.h"
29287+
29288+#include <linux/fs.h>*/
29289+
29290+/* declarations of functions implementing HASHED_DIR_PLUGIN_ID dir plugin */
29291+
29292+/* "hashed" directory methods of dir plugin */
29293+void build_entry_key_hashed(const struct inode *, const struct qstr *,
29294+ reiser4_key *);
29295+
29296+/* declarations of functions implementing SEEKABLE_HASHED_DIR_PLUGIN_ID dir plugin */
29297+
29298+/* "seekable" directory methods of dir plugin */
29299+void build_entry_key_seekable(const struct inode *, const struct qstr *,
29300+ reiser4_key *);
29301+
29302+/* __REISER4_DIR_H__ */
29303+#endif
29304+
29305+/*
29306+ Local variables:
29307+ c-indentation-style: "K&R"
29308+ mode-name: "LC"
29309+ c-basic-offset: 8
29310+ tab-width: 8
29311+ fill-column: 120
29312+ End:
29313+*/
29314Index: linux-2.6.16/fs/reiser4/plugin/dir/hashed_dir.c
29315===================================================================
29316--- /dev/null
29317+++ linux-2.6.16/fs/reiser4/plugin/dir/hashed_dir.c
29318@@ -0,0 +1,81 @@
29319+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
29320+ * reiser4/README */
29321+
29322+/* Directory plugin using hashes (see fs/reiser4/plugin/hash.c) to map file
29323+ names to the files. */
29324+
29325+/*
29326+ * Hashed directory logically consists of persistent directory
29327+ * entries. Directory entry is a pair of a file name and a key of stat-data of
29328+ * a file that has this name in the given directory.
29329+ *
29330+ * Directory entries are stored in the tree in the form of directory
29331+ * items. Directory item should implement dir_entry_ops portion of item plugin
29332+ * interface (see plugin/item/item.h). Hashed directory interacts with
29333+ * directory item plugin exclusively through dir_entry_ops operations.
29334+ *
29335+ * Currently there are two implementations of directory items: "simple
29336+ * directory item" (plugin/item/sde.[ch]), and "compound directory item"
29337+ * (plugin/item/cde.[ch]) with the latter being the default.
29338+ *
29339+ * There is, however, one delicate way in which the directory code interferes
29340+ * with the item plugin: key assignment policy. A key for a directory item is
29341+ * chosen by directory code, and as described in kassign.c, this key contains
29342+ * a portion of file name. Directory item uses this knowledge to avoid storing
29343+ * this portion of file name twice: in the key and in the directory item body.
29344+ *
29345+ */
29346+
29347+#include "../../inode.h"
29348+
29349+void complete_entry_key(const struct inode *, const char *name,
29350+ int len, reiser4_key * result);
29351+
29352+/* this is implementation of build_entry_key method of dir
29353+ plugin for HASHED_DIR_PLUGIN_ID
29354+ */
29355+void build_entry_key_hashed(const struct inode *dir, /* directory where entry is
29356+ * (or will be) in.*/
29357+ const struct qstr *qname, /* name of file referenced
29358+ * by this entry */
29359+ reiser4_key * result /* resulting key of directory
29360+ * entry */ )
29361+{
29362+ const char *name;
29363+ int len;
29364+
29365+ assert("nikita-1139", dir != NULL);
29366+ assert("nikita-1140", qname != NULL);
29367+ assert("nikita-1141", qname->name != NULL);
29368+ assert("nikita-1142", result != NULL);
29369+
29370+ name = qname->name;
29371+ len = qname->len;
29372+
29373+ assert("nikita-2867", strlen(name) == len);
29374+
29375+ reiser4_key_init(result);
29376+ /* locality of directory entry's key is objectid of parent
29377+ directory */
29378+ set_key_locality(result, get_inode_oid(dir));
29379+ /* minor packing locality is constant */
29380+ set_key_type(result, KEY_FILE_NAME_MINOR);
29381+ /* dot is a special case---we always want it to be the first entry in
29382+ a directory. Actually, we just want it to be the smallest
29383+ directory entry.
29384+ */
29385+ if (len == 1 && name[0] == '.')
29386+ return;
29387+
29388+ /* initialize part of entry key which depends on file name */
29389+ complete_entry_key(dir, name, len, result);
29390+}
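+
+/* For illustration (values hypothetical): creating "foo" in a directory
+ with oid 0x2a yields an entry key with locality 0x2a and minor
+ KEY_FILE_NAME_MINOR; complete_entry_key() then packs a fragment of the
+ name and its hash into the objectid/offset fields (see kassign.c), so
+ all entries of one directory sort together, ordered by name. */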
29391+
29392+/* Local variables:
29393+ c-indentation-style: "K&R"
29394+ mode-name: "LC"
29395+ c-basic-offset: 8
29396+ tab-width: 8
29397+ fill-column: 120
29398+ End:
29399+*/
29400Index: linux-2.6.16/fs/reiser4/plugin/dir/seekable_dir.c
29401===================================================================
29402--- /dev/null
29403+++ linux-2.6.16/fs/reiser4/plugin/dir/seekable_dir.c
29404@@ -0,0 +1,46 @@
29405+/* Copyright 2005 by Hans Reiser, licensing governed by
29406+ * reiser4/README */
29407+
29408+#include "../../inode.h"
29409+
29410+/* this is implementation of build_entry_key method of dir
29411+ plugin for SEEKABLE_HASHED_DIR_PLUGIN_ID
29412+ This is for directories where we want repeatable and restartable readdir()
29413+ even in the case of a 32-bit user-level struct dirent (readdir(3)).
29414+*/
29415+void
29416+build_entry_key_seekable(const struct inode *dir, const struct qstr *name,
29417+ reiser4_key * result)
29418+{
29419+ oid_t objectid;
29420+
29421+ assert("nikita-2283", dir != NULL);
29422+ assert("nikita-2284", name != NULL);
29423+ assert("nikita-2285", name->name != NULL);
29424+ assert("nikita-2286", result != NULL);
29425+
29426+ reiser4_key_init(result);
29427+ /* locality of directory entry's key is objectid of parent
29428+ directory */
29429+ set_key_locality(result, get_inode_oid(dir));
29430+ /* minor packing locality is constant */
29431+ set_key_type(result, KEY_FILE_NAME_MINOR);
29432+ /* dot is a special case---we always want it to be the first entry in
29433+ a directory. Actually, we just want it to be the smallest
29434+ directory entry.
29435+ */
29436+ if ((name->len == 1) && (name->name[0] == '.'))
29437+ return;
29438+
29439+ /* objectid of key is 31 lowest bits of hash. */
29440+ objectid =
29441+ inode_hash_plugin(dir)->hash(name->name,
29442+ (int)name->len) & 0x7fffffff;
29443+
29444+ assert("nikita-2303", !(objectid & ~KEY_OBJECTID_MASK));
29445+ set_key_objectid(result, objectid);
29446+
29447+ /* offset is always 0. */
29448+ set_key_offset(result, (__u64) 0);
29449+ return;
29450+}
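+
+/* For illustration (values hypothetical): a name hashing to 0x89abcdef
+ gives objectid 0x09abcdef (bit 31 masked off), so the readdir position
+ derived from this key always fits in the 31 bits that a 32-bit
+ user-level dirent offset can safely carry. */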
29451Index: linux-2.6.16/fs/reiser4/plugin/dir_plugin_common.c
29452===================================================================
29453--- /dev/null
29454+++ linux-2.6.16/fs/reiser4/plugin/dir_plugin_common.c
29455@@ -0,0 +1,864 @@
29456+/* Copyright 2005 by Hans Reiser, licensing governed by
29457+ reiser4/README */
29458+
29459+/* this file contains typical implementations for most of methods of
29460+ directory plugin
29461+*/
29462+
29463+#include "../inode.h"
29464+
29465+int find_entry(struct inode *dir, struct dentry *name,
29466+ lock_handle *, znode_lock_mode, reiser4_dir_entry_desc *);
29467+int lookup_name(struct inode *parent, struct dentry *dentry, reiser4_key * key);
29468+void check_light_weight(struct inode *inode, struct inode *parent);
29469+
29470+/* this is common implementation of get_parent method of dir plugin
29471+ this is used by the NFS kernel server to "climb" up the directory tree to
29472+ check permissions
29473+ */
29474+struct dentry *get_parent_common(struct inode *child)
29475+{
29476+ struct super_block *s;
29477+ struct inode *parent;
29478+ struct dentry dotdot;
29479+ struct dentry *dentry;
29480+ reiser4_key key;
29481+ int result;
29482+
29483+ /*
29484+ * lookup dotdot entry.
29485+ */
29486+
29487+ s = child->i_sb;
29488+ memset(&dotdot, 0, sizeof(dotdot));
29489+ dotdot.d_name.name = "..";
29490+ dotdot.d_name.len = 2;
29491+ dotdot.d_op = &get_super_private(s)->ops.dentry;
29492+
29493+ result = lookup_name(child, &dotdot, &key);
29494+ if (result != 0)
29495+ return ERR_PTR(result);
29496+
29497+ parent = reiser4_iget(s, &key, 1);
29498+ if (!IS_ERR(parent)) {
29499+ /*
29500+ * FIXME-NIKITA dubious: attributes are inherited from @child
29501+ * to @parent. But:
29502+ *
29503+ * (*) this is the only thing we can do
29504+ *
29505+ * (*) attributes of light-weight object are inherited
29506+ * from a parent through which object was looked up first,
29507+ * so it is ambiguous anyway.
29508+ *
29509+ */
29510+ check_light_weight(parent, child);
29511+ reiser4_iget_complete(parent);
29512+ dentry = d_alloc_anon(parent);
29513+ if (dentry == NULL) {
29514+ iput(parent);
29515+ dentry = ERR_PTR(RETERR(-ENOMEM));
29516+ } else
29517+ dentry->d_op = &get_super_private(s)->ops.dentry;
29518+ } else if (PTR_ERR(parent) == -ENOENT)
29519+ dentry = ERR_PTR(RETERR(-ESTALE));
29520+ else
29521+ dentry = (void *)parent;
29522+ return dentry;
29523+}
29524+
29525+/* this is common implementation of is_name_acceptable method of dir
29526+ plugin
29527+ */
29528+int is_name_acceptable_common(const struct inode *inode, /* directory to check */
29529+ const char *name UNUSED_ARG, /* name to check */
29530+ int len /* @name's length */ )
29531+{
29532+ assert("nikita-733", inode != NULL);
29533+ assert("nikita-734", name != NULL);
29534+ assert("nikita-735", len > 0);
29535+
29536+ return len <= reiser4_max_filename_len(inode);
29537+}
29538+
29539+/* there is no common implementation of build_entry_key method of dir
29540+ plugin. See plugin/dir/hashed_dir.c:build_entry_key_hashed() or
29541+ plugin/dir/seekable_dir.c:build_entry_key_seekable() for an example
29542+*/
29543+
29544+/* this is common implementation of build_readdir_key method of dir
29545+ plugin
29546+ see readdir_common for more details
29547+*/
29548+int build_readdir_key_common(struct file *dir /* directory being read */ ,
29549+ reiser4_key * result /* where to store key */ )
29550+{
29551+ reiser4_file_fsdata *fdata;
29552+ struct inode *inode;
29553+
29554+ assert("nikita-1361", dir != NULL);
29555+ assert("nikita-1362", result != NULL);
29556+ assert("nikita-1363", dir->f_dentry != NULL);
29557+ inode = dir->f_dentry->d_inode;
29558+ assert("nikita-1373", inode != NULL);
29559+
29560+ fdata = reiser4_get_file_fsdata(dir);
29561+ if (IS_ERR(fdata))
29562+ return PTR_ERR(fdata);
29563+ assert("nikita-1364", fdata != NULL);
29564+ return extract_key_from_de_id(get_inode_oid(inode),
29565+ &fdata->dir.readdir.position.
29566+ dir_entry_key, result);
29567+
29568+}
29569+
29570+void adjust_dir_file(struct inode *, const struct dentry *, int offset,
29571+ int adj);
29572+
29573+/* this is common implementation of add_entry method of dir plugin
29574+*/
29575+int add_entry_common(struct inode *object, /* directory to add new name
29576+ * in */
29577+ struct dentry *where, /* new name */
29578+ reiser4_object_create_data * data UNUSED_ARG, /* parameters
29579+ * of new
29580+ * object */
29581+ reiser4_dir_entry_desc * entry /* parameters of new
29582+ * directory entry */ )
29583+{
29584+ int result;
29585+ coord_t *coord;
29586+ lock_handle lh;
29587+ reiser4_dentry_fsdata *fsdata;
29588+ reiser4_block_nr reserve;
29589+
29590+ assert("nikita-1114", object != NULL);
29591+ assert("nikita-1250", where != NULL);
29592+
29593+ fsdata = reiser4_get_dentry_fsdata(where);
29594+ if (unlikely(IS_ERR(fsdata)))
29595+ return PTR_ERR(fsdata);
29596+
29597+ reserve = inode_dir_plugin(object)->estimate.add_entry(object);
29598+ if (reiser4_grab_space(reserve, BA_CAN_COMMIT))
29599+ return RETERR(-ENOSPC);
29600+
29601+ init_lh(&lh);
29602+ coord = &fsdata->dec.entry_coord;
29603+ coord_clear_iplug(coord);
29604+
29605+ /* check for this entry in a directory. This is plugin method. */
29606+ result = find_entry(object, where, &lh, ZNODE_WRITE_LOCK, entry);
29607+ if (likely(result == -ENOENT)) {
29608+ /* add new entry. Just pass control to the directory
29609+ item plugin. */
29610+ assert("nikita-1709", inode_dir_item_plugin(object));
29611+ assert("nikita-2230", coord->node == lh.node);
29612+ seal_done(&fsdata->dec.entry_seal);
29613+ result =
29614+ inode_dir_item_plugin(object)->s.dir.add_entry(object,
29615+ coord, &lh,
29616+ where,
29617+ entry);
29618+ if (result == 0) {
29619+ adjust_dir_file(object, where, fsdata->dec.pos + 1, +1);
29620+ INODE_INC_FIELD(object, i_size);
29621+ }
29622+ } else if (result == 0) {
29623+ assert("nikita-2232", coord->node == lh.node);
29624+ result = RETERR(-EEXIST);
29625+ }
29626+ done_lh(&lh);
29627+
29628+ return result;
29629+}
29630+
29631+/**
29632+ * rem_entry - remove entry from directory item
29633+ * @dir:
29634+ * @dentry:
29635+ * @entry:
29636+ * @coord:
29637+ * @lh:
29638+ *
29639+ * Checks that coordinate @coord is set properly and calls item plugin
29640+ * method to cut entry.
29641+ */
29642+static int
29643+rem_entry(struct inode *dir, struct dentry *dentry,
29644+ reiser4_dir_entry_desc * entry, coord_t * coord, lock_handle * lh)
29645+{
29646+ item_plugin *iplug;
29647+ struct inode *child;
29648+
29649+ iplug = inode_dir_item_plugin(dir);
29650+ child = dentry->d_inode;
29651+ assert("nikita-3399", child != NULL);
29652+
29653+ /* check that we are really destroying an entry for @child */
29654+ if (REISER4_DEBUG) {
29655+ int result;
29656+ reiser4_key key;
29657+
29658+ result = iplug->s.dir.extract_key(coord, &key);
29659+ if (result != 0)
29660+ return result;
29661+ if (get_key_objectid(&key) != get_inode_oid(child)) {
29662+ warning("nikita-3397",
29663+ "rem_entry: %#llx != %#llx\n",
29664+ get_key_objectid(&key),
29665+ (unsigned long long)get_inode_oid(child));
29666+ return RETERR(-EIO);
29667+ }
29668+ }
29669+ return iplug->s.dir.rem_entry(dir, &dentry->d_name, coord, lh, entry);
29670+}
29671+
29672+/**
29673+ * rem_entry_common - remove entry from a directory
29674+ * @dir: directory to remove entry from
29675+ * @where: name that is being removed
29676+ * @entry: description of entry being removed
29677+ *
29678+ * This is common implementation of rem_entry method of dir plugin.
29679+ */
29680+int rem_entry_common(struct inode *dir,
29681+ struct dentry *dentry,
29682+ reiser4_dir_entry_desc *entry)
29683+{
29684+ int result;
29685+ coord_t *coord;
29686+ lock_handle lh;
29687+ reiser4_dentry_fsdata *fsdata;
29688+ __u64 tograb;
29689+
29690+ assert("nikita-1124", dir != NULL);
29691+ assert("nikita-1125", dentry != NULL);
29692+
29693+ tograb = inode_dir_plugin(dir)->estimate.rem_entry(dir);
29694+ result = reiser4_grab_space(tograb, BA_CAN_COMMIT | BA_RESERVED);
29695+ if (result != 0)
29696+ return RETERR(-ENOSPC);
29697+
29698+ init_lh(&lh);
29699+
29700+ /* check for this entry in a directory. This is plugin method. */
29701+ result = find_entry(dir, dentry, &lh, ZNODE_WRITE_LOCK, entry);
29702+ fsdata = reiser4_get_dentry_fsdata(dentry);
29703+ if (IS_ERR(fsdata)) {
29704+ done_lh(&lh);
29705+ return PTR_ERR(fsdata);
29706+ }
29707+
29708+ coord = &fsdata->dec.entry_coord;
29709+
29710+ assert("nikita-3404",
29711+ get_inode_oid(dentry->d_inode) != get_inode_oid(dir) ||
29712+ dir->i_size <= 1);
29713+
29714+ coord_clear_iplug(coord);
29715+ if (result == 0) {
29716+ /* remove entry. Just pass control to the directory item
29717+ plugin. */
29718+ assert("vs-542", inode_dir_item_plugin(dir));
29719+ seal_done(&fsdata->dec.entry_seal);
29720+ adjust_dir_file(dir, dentry, fsdata->dec.pos, -1);
29721+ result =
29722+ WITH_COORD(coord,
29723+ rem_entry(dir, dentry, entry, coord, &lh));
29724+ if (result == 0) {
29725+ if (dir->i_size >= 1)
29726+ INODE_DEC_FIELD(dir, i_size);
29727+ else {
29728+ warning("nikita-2509", "Dir %llu is runt",
29729+ (unsigned long long)
29730+ get_inode_oid(dir));
29731+ result = RETERR(-EIO);
29732+ }
29733+
29734+ assert("nikita-3405", dentry->d_inode->i_nlink != 1 ||
29735+ dentry->d_inode->i_size != 2 ||
29736+ inode_dir_plugin(dentry->d_inode) == NULL);
29737+ }
29738+ }
29739+ done_lh(&lh);
29740+
29741+ return result;
29742+}
29743+
29744+static reiser4_block_nr estimate_init(struct inode *parent,
29745+ struct inode *object);
29746+static int create_dot_dotdot(struct inode *object, struct inode *parent);
29747+
29748+/* this is common implementation of init method of dir plugin
29749+ create "." and ".." entries
29750+*/
29751+int init_common(struct inode *object, /* new directory */
29752+ struct inode *parent, /* parent directory */
29753+ reiser4_object_create_data * data UNUSED_ARG /* info passed
29754+ * to us, this
29755+ * is filled by
29756+ * reiser4()
29757+ * syscall in
29758+ * particular */ )
29759+{
29760+ reiser4_block_nr reserve;
29761+
29762+ assert("nikita-680", object != NULL);
29763+ assert("nikita-681", S_ISDIR(object->i_mode));
29764+ assert("nikita-682", parent != NULL);
29765+ assert("nikita-684", data != NULL);
29766+ assert("nikita-686", data->id == DIRECTORY_FILE_PLUGIN_ID);
29767+ assert("nikita-687", object->i_mode & S_IFDIR);
29768+
29769+ reserve = estimate_init(parent, object);
29770+ if (reiser4_grab_space(reserve, BA_CAN_COMMIT))
29771+ return RETERR(-ENOSPC);
29772+
29773+ return create_dot_dotdot(object, parent);
29774+}
29775+
29776+/* this is common implementation of done method of dir plugin
29777+ remove "." entry
29778+*/
29779+int done_common(struct inode *object /* object being deleted */ )
29780+{
29781+ int result;
29782+ reiser4_block_nr reserve;
29783+ struct dentry goodby_dots;
29784+ reiser4_dir_entry_desc entry;
29785+
29786+ assert("nikita-1449", object != NULL);
29787+
29788+ if (inode_get_flag(object, REISER4_NO_SD))
29789+ return 0;
29790+
29791+ /* of course, this can be rewritten to sweep everything in one
29792+ cut_tree(). */
29793+ memset(&entry, 0, sizeof entry);
29794+
29795+ /* FIXME: this done method is called from delete_directory_common which
29796+ * reserved space already */
29797+ reserve = inode_dir_plugin(object)->estimate.rem_entry(object);
29798+ if (reiser4_grab_space(reserve, BA_CAN_COMMIT | BA_RESERVED))
29799+ return RETERR(-ENOSPC);
29800+
29801+ memset(&goodby_dots, 0, sizeof goodby_dots);
29802+ entry.obj = goodby_dots.d_inode = object;
29803+ goodby_dots.d_name.name = ".";
29804+ goodby_dots.d_name.len = 1;
29805+ result = rem_entry_common(object, &goodby_dots, &entry);
29806+ reiser4_free_dentry_fsdata(&goodby_dots);
29807+ if (unlikely(result != 0 && result != -ENOMEM && result != -ENOENT))
29808+ /* only worth a warning
29809+
29810+ "values of \ eB\ f will give rise to dom!\n"
29811+ -- v6src/s2/mv.c:89
29812+ */
29813+ warning("nikita-2252", "Cannot remove dot of %lli: %i",
29814+ (unsigned long long)get_inode_oid(object), result);
29815+ return 0;
29816+}
29817+
29818+/* this is common implementation of attach method of dir plugin
29819+*/
29820+int
29821+attach_common(struct inode *child UNUSED_ARG, struct inode *parent UNUSED_ARG)
29822+{
29823+ assert("nikita-2647", child != NULL);
29824+ assert("nikita-2648", parent != NULL);
29825+
29826+ return 0;
29827+}
29828+
29829+/* this is common implementation of detach method of dir plugin
29830+ remove "..", decrease nlink on parent
29831+*/
29832+int detach_common(struct inode *object, struct inode *parent)
29833+{
29834+ int result;
29835+ struct dentry goodby_dots;
29836+ reiser4_dir_entry_desc entry;
29837+
29838+ assert("nikita-2885", object != NULL);
29839+ assert("nikita-2886", !inode_get_flag(object, REISER4_NO_SD));
29840+
29841+ memset(&entry, 0, sizeof entry);
29842+
29843+ /* NOTE-NIKITA this only works if @parent is -the- parent of
29844+ @object, viz. object whose key is stored in dotdot
29845+ entry. Wouldn't work with hard-links on directories. */
29846+ memset(&goodby_dots, 0, sizeof goodby_dots);
29847+ entry.obj = goodby_dots.d_inode = parent;
29848+ goodby_dots.d_name.name = "..";
29849+ goodby_dots.d_name.len = 2;
29850+ result = rem_entry_common(object, &goodby_dots, &entry);
29851+ reiser4_free_dentry_fsdata(&goodby_dots);
29852+ if (result == 0) {
29853+ /* the dot should be the only entry remaining at this time... */
29854+ assert("nikita-3400", object->i_size == 1 &&
29855+ (object->i_nlink >= 0 && object->i_nlink <= 2));
29856+#if 0
29857+ /* and, together with the only name a directory can have, they
29858+ * provide for the last 2 remaining references. If we get
29859+ * here as part of error handling during mkdir, @object
29860+ * possibly has no name yet, so its nlink == 1. If we get here
29861+ * from rename (targeting empty directory), it has no name
29862+ * already, so its nlink == 1. */
29863+ assert("nikita-3401",
29864+ object->i_nlink == 2 || object->i_nlink == 1);
29865+#endif
29866+
29867+ /* decrement nlink of directory removed ".." pointed
29868+ to */
29869+ reiser4_del_nlink(parent, NULL, 0);
29870+ }
29871+ return result;
29872+}
29873+
29874+/* this is common implementation of estimate.add_entry method of
29875+ dir plugin
29876+ estimation of adding an entry, which assumes that the entry is
29877+ inserted as a unit into an existing item
29878+*/
29879+reiser4_block_nr estimate_add_entry_common(const struct inode * inode)
29880+{
29881+ return estimate_one_insert_into_item(tree_by_inode(inode));
29882+}
29883+
29884+/* this is common implementation of estimate.rem_entry method of dir
29885+ plugin
29886+*/
29887+reiser4_block_nr estimate_rem_entry_common(const struct inode * inode)
29888+{
29889+ return estimate_one_item_removal(tree_by_inode(inode));
29890+}
29891+
29892+/* this is common implementation of estimate.unlink method of dir
29893+ plugin
29894+*/
29895+reiser4_block_nr
29896+dir_estimate_unlink_common(const struct inode * parent,
29897+ const struct inode * object)
29898+{
29899+ reiser4_block_nr res;
29900+
29901+ /* hashed_rem_entry(object) */
29902+ res = inode_dir_plugin(object)->estimate.rem_entry(object);
29903+ /* del_nlink(parent) */
29904+ res += 2 * inode_file_plugin(parent)->estimate.update(parent);
29905+
29906+ return res;
29907+}
29908+
29909+/*
29910+ * helper for inode_ops ->lookup() and dir plugin's ->get_parent()
29911+ * methods: if @inode is a light-weight file, setup its credentials
29912+ * that are not stored in the stat-data in this case
29913+ */
29914+void check_light_weight(struct inode *inode, struct inode *parent)
29915+{
29916+ if (inode_get_flag(inode, REISER4_LIGHT_WEIGHT)) {
29917+ inode->i_uid = parent->i_uid;
29918+ inode->i_gid = parent->i_gid;
29919+ /* clear the light-weight flag. If the inode were later reached by
29920+ any other name, its [ug]id wouldn't change. */
29921+ inode_clr_flag(inode, REISER4_LIGHT_WEIGHT);
29922+ }
29923+}
29924+
29925+/* looks for the name specified in @dentry in directory @parent; if the name
29926+ is found, the key of the object the entry points to is stored in @entry->key */
29927+int lookup_name(struct inode *parent, /* inode of directory to lookup for
29928+ * name in */
29929+ struct dentry *dentry, /* name to look for */
29930+ reiser4_key * key /* place to store key */ )
29931+{
29932+ int result;
29933+ coord_t *coord;
29934+ lock_handle lh;
29935+ const char *name;
29936+ int len;
29937+ reiser4_dir_entry_desc entry;
29938+ reiser4_dentry_fsdata *fsdata;
29939+
29940+ assert("nikita-1247", parent != NULL);
29941+ assert("nikita-1248", dentry != NULL);
29942+ assert("nikita-1123", dentry->d_name.name != NULL);
29943+ assert("vs-1486",
29944+ dentry->d_op == &get_super_private(parent->i_sb)->ops.dentry);
29945+
29946+ name = dentry->d_name.name;
29947+ len = dentry->d_name.len;
29948+
29949+ if (!inode_dir_plugin(parent)->is_name_acceptable(parent, name, len))
29950+ /* some arbitrary error code to return */
29951+ return RETERR(-ENAMETOOLONG);
29952+
29953+ fsdata = reiser4_get_dentry_fsdata(dentry);
29954+ if (IS_ERR(fsdata))
29955+ return PTR_ERR(fsdata);
29956+
29957+ coord = &fsdata->dec.entry_coord;
29958+ coord_clear_iplug(coord);
29959+ init_lh(&lh);
29960+
29961+ /* find entry in a directory. This is plugin method. */
29962+ result = find_entry(parent, dentry, &lh, ZNODE_READ_LOCK, &entry);
29963+ if (result == 0) {
29964+ /* entry was found, extract object key from it. */
29965+ result =
29966+ WITH_COORD(coord,
29967+ item_plugin_by_coord(coord)->s.dir.
29968+ extract_key(coord, key));
29969+ }
29970+ done_lh(&lh);
29971+ return result;
29972+
29973+}
29974+
29975+/* helper for init_common(): estimate number of blocks to reserve */
29976+static reiser4_block_nr
29977+estimate_init(struct inode *parent, struct inode *object)
29978+{
29979+ reiser4_block_nr res = 0;
29980+
29981+ assert("vpf-321", parent != NULL);
29982+ assert("vpf-322", object != NULL);
29983+
29984+ /* hashed_add_entry(object) */
29985+ res += inode_dir_plugin(object)->estimate.add_entry(object);
29986+ /* reiser4_add_nlink(object) */
29987+ res += inode_file_plugin(object)->estimate.update(object);
29988+ /* hashed_add_entry(object) */
29989+ res += inode_dir_plugin(object)->estimate.add_entry(object);
29990+ /* reiser4_add_nlink(parent) */
29991+ res += inode_file_plugin(parent)->estimate.update(parent);
29992+
29993+ return res;
29994+}
29995+
29996+/* helper function for init_common(). Create "." and ".." */
29997+static int create_dot_dotdot(struct inode *object /* object to create dot and
29998+ * dotdot for */ ,
29999+ struct inode *parent /* parent of @object */ )
30000+{
30001+ int result;
30002+ struct dentry dots_entry;
30003+ reiser4_dir_entry_desc entry;
30004+
30005+ assert("nikita-688", object != NULL);
30006+ assert("nikita-689", S_ISDIR(object->i_mode));
30007+ assert("nikita-691", parent != NULL);
30008+
30009+ /* We store dot and dotdot as normal directory entries. This is
30010+ not strictly necessary, because almost all information stored in
30011+ them is already in the stat-data of the directory; the only missing
30012+ piece is the objectid of the grand-parent directory, which could
30013+ easily be added there as an extension.
30014+
30015+ But it is done the way it is done, because not storing dot
30016+ and dotdot would lead to the following complications:
30017+
30018+ . special case handling in ->lookup().
30019+ . addition of another extension to the sd.
30020+ . dependency on key allocation policy for stat data.
30021+
30022+ */
30023+
30024+ memset(&entry, 0, sizeof entry);
30025+ memset(&dots_entry, 0, sizeof dots_entry);
30026+ entry.obj = dots_entry.d_inode = object;
30027+ dots_entry.d_name.name = ".";
30028+ dots_entry.d_name.len = 1;
30029+ result = add_entry_common(object, &dots_entry, NULL, &entry);
30030+ reiser4_free_dentry_fsdata(&dots_entry);
30031+
30032+ if (result == 0) {
30033+ result = reiser4_add_nlink(object, object, 0);
30034+ if (result == 0) {
30035+ entry.obj = dots_entry.d_inode = parent;
30036+ dots_entry.d_name.name = "..";
30037+ dots_entry.d_name.len = 2;
30038+ result = add_entry_common(object,
30039+ &dots_entry, NULL, &entry);
30040+ reiser4_free_dentry_fsdata(&dots_entry);
30041+ /* if creation of ".." failed, iput() will delete
30042+ object with ".". */
30043+ if (result == 0) {
30044+ result = reiser4_add_nlink(parent, object, 0);
30045+ if (result != 0)
30046+ /*
30047+ * if we failed to bump i_nlink, try
30048+ * to remove ".."
30049+ */
30050+ detach_common(object, parent);
30051+ }
30052+ }
30053+ }
30054+
30055+ if (result != 0) {
30056+ /*
30057+ * in the case of error, at least update the stat-data so that
30058+ * ->i_nlink updates do not linger.
30059+ */
30060+ reiser4_update_sd(object);
30061+ reiser4_update_sd(parent);
30062+ }
30063+
30064+ return result;
30065+}
30066+
30067+/*
30068+ * return 0 iff @coord contains a directory entry for the file with the name
30069+ * @name.
30070+ */
30071+static int
30072+check_item(const struct inode *dir, const coord_t * coord, const char *name)
30073+{
30074+ item_plugin *iplug;
30075+ char buf[DE_NAME_BUF_LEN];
30076+
30077+ iplug = item_plugin_by_coord(coord);
30078+ if (iplug == NULL) {
30079+ warning("nikita-1135", "Cannot get item plugin");
30080+ print_coord("coord", coord, 1);
30081+ return RETERR(-EIO);
30082+ } else if (item_id_by_coord(coord) !=
30083+ item_id_by_plugin(inode_dir_item_plugin(dir))) {
30084+ /* item id of current item does not match to id of items a
30085+ directory is built of */
30086+ warning("nikita-1136", "Wrong item plugin");
30087+ print_coord("coord", coord, 1);
30088+ return RETERR(-EIO);
30089+ }
30090+ assert("nikita-1137", iplug->s.dir.extract_name);
30091+
30092+ /* Compare name stored in this entry with name we are looking for.
30093+
30094+ NOTE-NIKITA Here should go code for support of something like
30095+ unicode, code tables, etc.
30096+ */
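+ /* 0 when the names match; a positive value signals a same-key (hash
+ collision) mismatch, which makes find_entry() keep scanning. */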
30097+ return !!strcmp(name, iplug->s.dir.extract_name(coord, buf));
30098+}
30099+
30100+static int
30101+check_entry(const struct inode *dir, coord_t * coord, const struct qstr *name)
30102+{
30103+ return WITH_COORD(coord, check_item(dir, coord, name->name));
30104+}
30105+
30106+/*
30107+ * argument package used by entry_actor to scan entries with identical keys.
30108+ */
30109+typedef struct entry_actor_args {
30110+ /* name we are looking for */
30111+ const char *name;
30112+ /* key of directory entry. entry_actor() scans through sequence of
30113+ * items/units having the same key */
30114+ reiser4_key *key;
30115+ /* how many entries with a duplicate key were scanned so far. */
30116+ int non_uniq;
30117+#if REISER4_USE_COLLISION_LIMIT
30118+ /* scan limit */
30119+ int max_non_uniq;
30120+#endif
30121+ /* return parameter: set to true, if ->name wasn't found */
30122+ int not_found;
30123+ /* what type of lock to take when moving to the next node during
30124+ * scan */
30125+ znode_lock_mode mode;
30126+
30127+ /* last coord that was visited during scan */
30128+ coord_t last_coord;
30129+ /* last node locked during scan */
30130+ lock_handle last_lh;
30131+ /* inode of directory */
30132+ const struct inode *inode;
30133+} entry_actor_args;
30134+
30135+/* Function called by find_entry() to look for given name in the directory. */
30136+static int entry_actor(reiser4_tree * tree UNUSED_ARG /* tree being scanned */ ,
30137+ coord_t * coord /* current coord */ ,
30138+ lock_handle * lh /* current lock handle */ ,
30139+ void *entry_actor_arg /* argument to scan */ )
30140+{
30141+ reiser4_key unit_key;
30142+ entry_actor_args *args;
30143+
30144+ assert("nikita-1131", tree != NULL);
30145+ assert("nikita-1132", coord != NULL);
30146+ assert("nikita-1133", entry_actor_arg != NULL);
30147+
30148+ args = entry_actor_arg;
30149+ ++args->non_uniq;
30150+#if REISER4_USE_COLLISION_LIMIT
30151+ if (args->non_uniq > args->max_non_uniq) {
30152+ args->not_found = 1;
30153+ /* hash collision overflow. */
30154+ return RETERR(-EBUSY);
30155+ }
30156+#endif
30157+
30158+ /*
30159+ * did we just reach the end of the sequence of items/units with
30160+ * identical keys?
30161+ */
30162+ if (!keyeq(args->key, unit_key_by_coord(coord, &unit_key))) {
30163+ assert("nikita-1791",
30164+ keylt(args->key, unit_key_by_coord(coord, &unit_key)));
30165+ args->not_found = 1;
30166+ args->last_coord.between = AFTER_UNIT;
30167+ return 0;
30168+ }
30169+
30170+ coord_dup(&args->last_coord, coord);
30171+ /*
30172+ * did the scan just move to the next node?
30173+ */
30174+ if (args->last_lh.node != lh->node) {
30175+ int lock_result;
30176+
30177+ /*
30178+ * if so, lock new node with the mode requested by the caller
30179+ */
30180+ done_lh(&args->last_lh);
30181+ assert("nikita-1896", znode_is_any_locked(lh->node));
30182+ lock_result = longterm_lock_znode(&args->last_lh, lh->node,
30183+ args->mode, ZNODE_LOCK_HIPRI);
30184+ if (lock_result != 0)
30185+ return lock_result;
30186+ }
30187+ return check_item(args->inode, coord, args->name);
30188+}
30189+
30190+/* Look for given @name within directory @dir.
30191+
30192+ This is called during lookup, creation and removal of directory
30193+ entries and on rename_common
30194+
30195+ First calculate the key that the directory entry for @name would have.
30196+ Search for this key in the tree. If such a key is found, scan all items
30197+ with the same key, checking the name in each directory entry along the way.
30198+*/
30199+int find_entry(struct inode *dir, /* directory to scan */
30200+ struct dentry *de, /* name to search for */
30201+ lock_handle * lh, /* resulting lock handle */
30202+ znode_lock_mode mode, /* required lock mode */
30203+ reiser4_dir_entry_desc * entry /* parameters of found directory
30204+ * entry */ )
30205+{
30206+ const struct qstr *name;
30207+ seal_t *seal;
30208+ coord_t *coord;
30209+ int result;
30210+ __u32 flags;
30211+ de_location *dec;
30212+ reiser4_dentry_fsdata *fsdata;
30213+
30214+ assert("nikita-1130", lh != NULL);
30215+ assert("nikita-1128", dir != NULL);
30216+
30217+ name = &de->d_name;
30218+ assert("nikita-1129", name != NULL);
30219+
30220+ /* dentry private data don't require a lock, because dentry
30221+ manipulations are protected by i_mutex on the parent.
30222+
30223+ This is not so for inodes, because there is no -the- parent in
30224+ the inode case.
30225+ */
30226+ fsdata = reiser4_get_dentry_fsdata(de);
30227+ if (IS_ERR(fsdata))
30228+ return PTR_ERR(fsdata);
30229+ dec = &fsdata->dec;
30230+
30231+ coord = &dec->entry_coord;
30232+ coord_clear_iplug(coord);
30233+ seal = &dec->entry_seal;
30234+ /* compose key of directory entry for @name */
30235+ inode_dir_plugin(dir)->build_entry_key(dir, name, &entry->key);
30236+
30237+ if (seal_is_set(seal)) {
30238+ /* check seal */
30239+ result = seal_validate(seal, coord, &entry->key,
30240+ lh, mode, ZNODE_LOCK_LOPRI);
30241+ if (result == 0) {
30242+ /* key was found. Check that it is really item we are
30243+ looking for. */
30244+ result = check_entry(dir, coord, name);
30245+ if (result == 0)
30246+ return 0;
30247+ }
30248+ }
30249+ flags = (mode == ZNODE_WRITE_LOCK) ? CBK_FOR_INSERT : 0;
30250+ /*
30251+ * find place in the tree where directory item should be located.
30252+ */
30253+ result = object_lookup(dir, &entry->key, coord, lh, mode,
30254+ FIND_EXACT, LEAF_LEVEL, LEAF_LEVEL, flags,
30255+ NULL /*ra_info */ );
30256+ if (result == CBK_COORD_FOUND) {
30257+ entry_actor_args arg;
30258+
30259+ /* fast path: no hash collisions */
30260+ result = check_entry(dir, coord, name);
30261+ if (result == 0) {
30262+ seal_init(seal, coord, &entry->key);
30263+ dec->pos = 0;
30264+ } else if (result > 0) {
30265+ /* Iterate through all units with the same keys. */
30266+ arg.name = name->name;
30267+ arg.key = &entry->key;
30268+ arg.not_found = 0;
30269+ arg.non_uniq = 0;
30270+#if REISER4_USE_COLLISION_LIMIT
30271+ arg.max_non_uniq = max_hash_collisions(dir);
30272+ assert("nikita-2851", arg.max_non_uniq > 1);
30273+#endif
30274+ arg.mode = mode;
30275+ arg.inode = dir;
30276+ coord_init_zero(&arg.last_coord);
30277+ init_lh(&arg.last_lh);
30278+
30279+ result = iterate_tree(tree_by_inode(dir), coord, lh,
30280+ entry_actor, &arg, mode, 1);
30281+			/* if the end of the tree or an extent was reached
30282+			   during scanning */
30283+ if (arg.not_found || (result == -E_NO_NEIGHBOR)) {
30284+ /* step back */
30285+ done_lh(lh);
30286+
30287+ result = zload(arg.last_coord.node);
30288+ if (result == 0) {
30289+ coord_clear_iplug(&arg.last_coord);
30290+ coord_dup(coord, &arg.last_coord);
30291+ move_lh(lh, &arg.last_lh);
30292+ result = RETERR(-ENOENT);
30293+ zrelse(arg.last_coord.node);
30294+ --arg.non_uniq;
30295+ }
30296+ }
30297+
30298+ done_lh(&arg.last_lh);
30299+ if (result == 0)
30300+ seal_init(seal, coord, &entry->key);
30301+
30302+ if (result == 0 || result == -ENOENT) {
30303+ assert("nikita-2580", arg.non_uniq > 0);
30304+ dec->pos = arg.non_uniq - 1;
30305+ }
30306+ }
30307+ } else
30308+ dec->pos = -1;
30309+ return result;
30310+}
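The scan above is easier to follow in isolation. Below is a minimal, self-contained sketch of the same collision-handling idea in plain C: position at the first entry whose key matches, then walk forward through equal-keyed slots comparing names, bailing out past a collision limit. All names (struct slot, scan_collisions) are hypothetical, not reiser4 API.

#include <stdio.h>
#include <string.h>

struct slot { unsigned long key; const char *name; };

/* return index of the matching slot, or -1; *collisions counts
 * how many equal-keyed slots were examined, as args->non_uniq does */
static int scan_collisions(const struct slot *tab, int n,
			   unsigned long key, const char *name,
			   int max_collisions, int *collisions)
{
	int i;

	*collisions = 0;
	for (i = 0; i < n && tab[i].key < key; i++)
		;				/* position at first candidate */
	for (; i < n && tab[i].key == key; i++) {
		if (++*collisions > max_collisions)
			return -1;		/* hash collision overflow (-EBUSY) */
		if (strcmp(tab[i].name, name) == 0)
			return i;		/* exact name match */
	}
	return -1;				/* key range exhausted (-ENOENT) */
}

int main(void)
{
	const struct slot tab[] = {
		{ 10, "foo" }, { 17, "bar" }, { 17, "baz" }, { 21, "quux" },
	};
	int c, idx = scan_collisions(tab, 4, 17, "baz", 8, &c);

	printf("found at %d after %d collision(s)\n", idx, c);
	return 0;
}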
30311+
30312+/* Local variables:
30313+ c-indentation-style: "K&R"
30314+ mode-name: "LC"
30315+ c-basic-offset: 8
30316+ tab-width: 8
30317+ fill-column: 120
30318+ End:
30319+*/
30320Index: linux-2.6.16/fs/reiser4/plugin/disk_format/Makefile
30321===================================================================
30322--- /dev/null
30323+++ linux-2.6.16/fs/reiser4/plugin/disk_format/Makefile
30324@@ -0,0 +1,5 @@
30325+obj-$(CONFIG_REISER4_FS) += df_plugins.o
30326+
30327+df_plugins-objs := \
30328+ disk_format40.o \
30329+ disk_format.o
30330Index: linux-2.6.16/fs/reiser4/plugin/disk_format/disk_format.c
30331===================================================================
30332--- /dev/null
30333+++ linux-2.6.16/fs/reiser4/plugin/disk_format/disk_format.c
30334@@ -0,0 +1,37 @@
30335+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
30336+
30337+#include "../../debug.h"
30338+#include "../plugin_header.h"
30339+#include "disk_format40.h"
30340+#include "disk_format.h"
30341+#include "../plugin.h"
30342+
30343+/* initialization of disk layout plugins */
30344+disk_format_plugin format_plugins[LAST_FORMAT_ID] = {
30345+ [FORMAT40_ID] = {
30346+ .h = {
30347+ .type_id = REISER4_FORMAT_PLUGIN_TYPE,
30348+ .id = FORMAT40_ID,
30349+ .pops = NULL,
30350+ .label = "reiser40",
30351+ .desc = "standard disk layout for reiser40",
30352+ .linkage = {NULL, NULL}
30353+ },
30354+ .init_format = init_format_format40,
30355+ .root_dir_key = root_dir_key_format40,
30356+ .release = release_format40,
30357+ .log_super = log_super_format40,
30358+ .check_open = check_open_format40
30359+ }
30360+};
30361+
30362+/* Make Linus happy.
30363+ Local variables:
30364+ c-indentation-style: "K&R"
30365+ mode-name: "LC"
30366+ c-basic-offset: 8
30367+ tab-width: 8
30368+ fill-column: 120
30369+ scroll-step: 1
30370+ End:
30371+*/
30372Index: linux-2.6.16/fs/reiser4/plugin/disk_format/disk_format.h
30373===================================================================
30374--- /dev/null
30375+++ linux-2.6.16/fs/reiser4/plugin/disk_format/disk_format.h
30376@@ -0,0 +1,27 @@
30377+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
30378+
30379+/* identifiers for disk layouts; they are also used as indexes into the
30380+   array of disk format plugins */
30381+
30382+#if !defined( __REISER4_DISK_FORMAT_H__ )
30383+#define __REISER4_DISK_FORMAT_H__
30384+
30385+typedef enum {
30386+ /* standard reiser4 disk layout plugin id */
30387+ FORMAT40_ID,
30388+ LAST_FORMAT_ID
30389+} disk_format_id;
30390+
30391+/* __REISER4_DISK_FORMAT_H__ */
30392+#endif
30393+
30394+/* Make Linus happy.
30395+ Local variables:
30396+ c-indentation-style: "K&R"
30397+ mode-name: "LC"
30398+ c-basic-offset: 8
30399+ tab-width: 8
30400+ fill-column: 120
30401+ scroll-step: 1
30402+ End:
30403+*/
30404Index: linux-2.6.16/fs/reiser4/plugin/disk_format/disk_format40.c
30405===================================================================
30406--- /dev/null
30407+++ linux-2.6.16/fs/reiser4/plugin/disk_format/disk_format40.c
30408@@ -0,0 +1,556 @@
30409+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
30410+
30411+#include "../../debug.h"
30412+#include "../../dformat.h"
30413+#include "../../key.h"
30414+#include "../node/node.h"
30415+#include "../space/space_allocator.h"
30416+#include "disk_format40.h"
30417+#include "../plugin.h"
30418+#include "../../txnmgr.h"
30419+#include "../../jnode.h"
30420+#include "../../tree.h"
30421+#include "../../super.h"
30422+#include "../../wander.h"
30423+#include "../../inode.h"
30424+#include "../../ktxnmgrd.h"
30425+#include "../../status_flags.h"
30426+
30427+#include <linux/types.h> /* for __u?? */
30428+#include <linux/fs.h> /* for struct super_block */
30429+#include <linux/buffer_head.h>
30430+
30431+/* reiser 4.0 default disk layout */
30432+
30433+/* Number of free blocks needed to perform release_format40 when the fs is
30434+   mounted RW: 1 for the SB, 1 for non-leaves in the overwrite set, 2 for the
30435+   tx header & tx record. */
30436+#define RELEASE_RESERVED 4
30437+
30438+/* functions to access fields of format40_disk_super_block */
30439+static __u64 get_format40_block_count(const format40_disk_super_block * sb)
30440+{
30441+ return le64_to_cpu(get_unaligned(&sb->block_count));
30442+}
30443+
30444+static __u64 get_format40_free_blocks(const format40_disk_super_block * sb)
30445+{
30446+ return le64_to_cpu(get_unaligned(&sb->free_blocks));
30447+}
30448+
30449+static __u64 get_format40_root_block(const format40_disk_super_block * sb)
30450+{
30451+ return le64_to_cpu(get_unaligned(&sb->root_block));
30452+}
30453+
30454+static __u16 get_format40_tree_height(const format40_disk_super_block * sb)
30455+{
30456+ return le16_to_cpu(get_unaligned(&sb->tree_height));
30457+}
30458+
30459+static __u64 get_format40_file_count(const format40_disk_super_block * sb)
30460+{
30461+ return le64_to_cpu(get_unaligned(&sb->file_count));
30462+}
30463+
30464+static __u64 get_format40_oid(const format40_disk_super_block * sb)
30465+{
30466+ return le64_to_cpu(get_unaligned(&sb->oid));
30467+}
30468+
30469+static __u32 get_format40_mkfs_id(const format40_disk_super_block * sb)
30470+{
30471+ return le32_to_cpu(get_unaligned(&sb->mkfs_id));
30472+}
30473+
30474+static __u64 get_format40_flags(const format40_disk_super_block * sb)
30475+{
30476+ return le64_to_cpu(get_unaligned(&sb->flags));
30477+}
30478+
30479+static format40_super_info *get_sb_info(struct super_block *super)
30480+{
30481+ return &get_super_private(super)->u.format40;
30482+}
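Each accessor above reads a little-endian on-disk field that may be unaligned, hence the le64_to_cpu(get_unaligned(...)) pairing. A standalone restatement of that pattern in portable C, with byte-by-byte assembly standing in for the kernel helpers (get_le64 is a hypothetical name):

#include <stdint.h>
#include <stdio.h>

/* assemble a little-endian 64-bit field from a possibly unaligned
 * buffer; stands in for le64_to_cpu(get_unaligned(...)) above */
static uint64_t get_le64(const unsigned char *p)
{
	uint64_t v = 0;
	int i;

	for (i = 7; i >= 0; i--)
		v = (v << 8) | p[i];
	return v;
}

int main(void)
{
	/* first 8 bytes of a hypothetical on-disk super block: block_count */
	unsigned char raw[8] = { 0x00, 0x00, 0x10, 0, 0, 0, 0, 0 };

	printf("block_count = %llu\n",
	       (unsigned long long)get_le64(raw));	/* 0x100000 = 1048576 */
	return 0;
}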
30483+
30484+static int consult_diskmap(struct super_block *s)
30485+{
30486+ format40_super_info *info;
30487+ journal_location *jloc;
30488+
30489+ info = get_sb_info(s);
30490+ jloc = &get_super_private(s)->jloc;
30491+ /* Default format-specific locations, if there is nothing in
30492+ * diskmap */
30493+ jloc->footer = FORMAT40_JOURNAL_FOOTER_BLOCKNR;
30494+ jloc->header = FORMAT40_JOURNAL_HEADER_BLOCKNR;
30495+ info->loc.super = FORMAT40_OFFSET / s->s_blocksize;
30496+#ifdef CONFIG_REISER4_BADBLOCKS
30497+ reiser4_get_diskmap_value(FORMAT40_PLUGIN_DISKMAP_ID, FORMAT40_JF,
30498+ &jloc->footer);
30499+ reiser4_get_diskmap_value(FORMAT40_PLUGIN_DISKMAP_ID, FORMAT40_JH,
30500+ &jloc->header);
30501+ reiser4_get_diskmap_value(FORMAT40_PLUGIN_DISKMAP_ID, FORMAT40_SUPER,
30502+ &info->loc.super);
30503+#endif
30504+ return 0;
30505+}
30506+
30507+/* Find any valid super block of disk_format40 (even if the first
30508+   super block is destroyed); this will change the block numbers of the
30509+   actual journal header/footer (jh/jf) if needed */
30510+static struct buffer_head *find_a_disk_format40_super_block(struct super_block
30511+ *s)
30512+{
30513+ struct buffer_head *super_bh;
30514+ format40_disk_super_block *disk_sb;
30515+ format40_super_info *info;
30516+
30517+ assert("umka-487", s != NULL);
30518+
30519+ info = get_sb_info(s);
30520+
30521+ super_bh = sb_bread(s, info->loc.super);
30522+ if (super_bh == NULL)
30523+ return ERR_PTR(RETERR(-EIO));
30524+
30525+ disk_sb = (format40_disk_super_block *) super_bh->b_data;
30526+ if (strncmp(disk_sb->magic, FORMAT40_MAGIC, sizeof(FORMAT40_MAGIC))) {
30527+ brelse(super_bh);
30528+ return ERR_PTR(RETERR(-EINVAL));
30529+ }
30530+
30531+ reiser4_set_block_count(s, le64_to_cpu(get_unaligned(&disk_sb->block_count)));
30532+ reiser4_set_data_blocks(s, le64_to_cpu(get_unaligned(&disk_sb->block_count)) -
30533+ le64_to_cpu(get_unaligned(&disk_sb->free_blocks)));
30534+ reiser4_set_free_blocks(s, le64_to_cpu(get_unaligned(&disk_sb->free_blocks)));
30535+
30536+ return super_bh;
30537+}
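The same probe can be sketched from user space against a filesystem image: seek to where the format40 super block is expected and compare the magic. The byte offset below assumes the usual 64KiB master offset plus one 4KiB page (69632) and the magic at offset 52 within the block; treat it as an illustration, not a tool:

#include <stdio.h>
#include <string.h>

#define FORMAT40_MAGIC "ReIsEr40FoRmAt"

int main(int argc, char **argv)
{
	char buf[sizeof(FORMAT40_MAGIC)];
	FILE *f;

	if (argc < 2 || (f = fopen(argv[1], "rb")) == NULL)
		return 1;
	/* assumed location: REISER4_MASTER_OFFSET (64KiB) + one 4KiB page,
	 * plus 52 bytes into the super block for the magic field */
	if (fseek(f, 69632L + 52, SEEK_SET) != 0 ||
	    fread(buf, 1, sizeof(buf), f) != sizeof(buf)) {
		fclose(f);
		return 1;
	}
	fclose(f);
	puts(memcmp(buf, FORMAT40_MAGIC, sizeof(FORMAT40_MAGIC)) == 0 ?
	     "format40 super block found" : "no format40 magic");
	return 0;
}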
30538+
30539+/* find the most recent version of super block. This is called after journal is
30540+ replayed */
30541+static struct buffer_head *read_super_block(struct super_block *s UNUSED_ARG)
30542+{
30543+	/* Here the most recent super block copy has to be read. However, as
30544+	   journal replay isn't complete yet, we use the
30545+	   find_a_disk_format40_super_block() function instead. */
30546+ return find_a_disk_format40_super_block(s);
30547+}
30548+
30549+static int get_super_jnode(struct super_block *s)
30550+{
30551+ reiser4_super_info_data *sbinfo = get_super_private(s);
30552+ jnode *sb_jnode;
30553+ int ret;
30554+
30555+ sb_jnode = alloc_io_head(&get_sb_info(s)->loc.super);
30556+
30557+ ret = jload(sb_jnode);
30558+
30559+ if (ret) {
30560+ drop_io_head(sb_jnode);
30561+ return ret;
30562+ }
30563+
30564+ pin_jnode_data(sb_jnode);
30565+ jrelse(sb_jnode);
30566+
30567+ sbinfo->u.format40.sb_jnode = sb_jnode;
30568+
30569+ return 0;
30570+}
30571+
30572+static void done_super_jnode(struct super_block *s)
30573+{
30574+ jnode *sb_jnode = get_super_private(s)->u.format40.sb_jnode;
30575+
30576+ if (sb_jnode) {
30577+ unpin_jnode_data(sb_jnode);
30578+ drop_io_head(sb_jnode);
30579+ }
30580+}
30581+
30582+typedef enum format40_init_stage {
30583+ NONE_DONE = 0,
30584+ CONSULT_DISKMAP,
30585+ FIND_A_SUPER,
30586+ INIT_JOURNAL_INFO,
30587+ INIT_STATUS,
30588+ JOURNAL_REPLAY,
30589+ READ_SUPER,
30590+ KEY_CHECK,
30591+ INIT_OID,
30592+ INIT_TREE,
30593+ JOURNAL_RECOVER,
30594+ INIT_SA,
30595+ INIT_JNODE,
30596+ ALL_DONE
30597+} format40_init_stage;
30598+
30599+static format40_disk_super_block *copy_sb(const struct buffer_head *super_bh)
30600+{
30601+ format40_disk_super_block *sb_copy;
30602+
30603+ sb_copy = kmalloc(sizeof(format40_disk_super_block), get_gfp_mask());
30604+ if (sb_copy == NULL)
30605+ return ERR_PTR(RETERR(-ENOMEM));
30606+ memcpy(sb_copy, ((format40_disk_super_block *) super_bh->b_data),
30607+ sizeof(format40_disk_super_block));
30608+ return sb_copy;
30609+}
30610+
30611+static int check_key_format(const format40_disk_super_block *sb_copy)
30612+{
30613+ if (!equi(REISER4_LARGE_KEY,
30614+ get_format40_flags(sb_copy) & (1 << FORMAT40_LARGE_KEYS))) {
30615+ warning("nikita-3228", "Key format mismatch. "
30616+ "Only %s keys are supported.",
30617+ REISER4_LARGE_KEY ? "large" : "small");
30618+ return RETERR(-EINVAL);
30619+ }
30620+ return 0;
30621+}
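equi() here is logical equivalence: the mount is refused unless the kernel's compiled-in key width and the on-disk FORMAT40_LARGE_KEYS flag are both set or both clear. A tiny standalone restatement (the equi macro below is a stand-in for the reiser4 one; the values are hypothetical):

#include <stdio.h>

#define equi(a, b) (!!(a) == !!(b))	/* both true or both false */
#define FORMAT40_LARGE_KEYS 0

int main(void)
{
	int kernel_large_keys = 1;	/* REISER4_LARGE_KEY stand-in */
	unsigned long long disk_flags = 1ULL << FORMAT40_LARGE_KEYS;

	if (!equi(kernel_large_keys, disk_flags & (1 << FORMAT40_LARGE_KEYS)))
		printf("key format mismatch, refuse mount (-EINVAL)\n");
	else
		printf("key formats agree\n");
	return 0;
}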
30622+
30623+/**
30624+ * try_init_format40
30625+ * @super:
30626+ * @stage:
30627+ *
30628+ */
30629+static int try_init_format40(struct super_block *super,
30630+ format40_init_stage *stage)
30631+{
30632+ int result;
30633+ struct buffer_head *super_bh;
30634+ reiser4_super_info_data *sbinfo;
30635+ format40_disk_super_block *sb_copy;
30636+ tree_level height;
30637+ reiser4_block_nr root_block;
30638+ node_plugin *nplug;
30639+
30640+ assert("vs-475", super != NULL);
30641+ assert("vs-474", get_super_private(super));
30642+
30643+ *stage = NONE_DONE;
30644+
30645+ result = consult_diskmap(super);
30646+ if (result)
30647+ return result;
30648+ *stage = CONSULT_DISKMAP;
30649+
30650+ super_bh = find_a_disk_format40_super_block(super);
30651+ if (IS_ERR(super_bh))
30652+ return PTR_ERR(super_bh);
30653+ brelse(super_bh);
30654+ *stage = FIND_A_SUPER;
30655+
30656+ /* map jnodes for journal control blocks (header, footer) to disk */
30657+ result = init_journal_info(super);
30658+ if (result)
30659+ return result;
30660+ *stage = INIT_JOURNAL_INFO;
30661+
30662+	/* ok, we are sure that the filesystem format is format40 */
30663+	/* now check its state */
30664+ result = reiser4_status_init(FORMAT40_STATUS_BLOCKNR);
30665+ if (result != 0 && result != -EINVAL)
30666+		/* -EINVAL means there is no magic, so this is probably
30667+		 * just an old fs. */
30668+ return result;
30669+ *stage = INIT_STATUS;
30670+
30671+ result = reiser4_status_query(NULL, NULL);
30672+ if (result == REISER4_STATUS_MOUNT_WARN)
30673+ printk("Warning, mounting filesystem with errors\n");
30674+ if (result == REISER4_STATUS_MOUNT_RO) {
30675+ printk
30676+ ("Warning, mounting filesystem with fatal errors, forcing read-only mount\n");
30677+ /* FIXME: here we should actually enforce read-only mount,
30678+ * only it is unsupported yet. */
30679+ }
30680+
30681+ result = reiser4_journal_replay(super);
30682+ if (result)
30683+ return result;
30684+ *stage = JOURNAL_REPLAY;
30685+
30686+ super_bh = read_super_block(super);
30687+ if (IS_ERR(super_bh))
30688+ return PTR_ERR(super_bh);
30689+ *stage = READ_SUPER;
30690+
30691+ /* allocate and make a copy of format40_disk_super_block */
30692+ sb_copy = copy_sb(super_bh);
30693+ brelse(super_bh);
30694+ if (IS_ERR(sb_copy))
30695+ return PTR_ERR(sb_copy);
30696+
30697+	/* make sure that the key formats of kernel and filesystem match */
30698+ result = check_key_format(sb_copy);
30699+ if (result) {
30700+ kfree(sb_copy);
30701+ return result;
30702+ }
30703+ *stage = KEY_CHECK;
30704+
30705+ result = oid_init_allocator(super, get_format40_file_count(sb_copy),
30706+ get_format40_oid(sb_copy));
30707+ if (result) {
30708+ kfree(sb_copy);
30709+ return result;
30710+ }
30711+ *stage = INIT_OID;
30712+
30713+ /* get things necessary to init reiser4_tree */
30714+ root_block = get_format40_root_block(sb_copy);
30715+ height = get_format40_tree_height(sb_copy);
30716+ nplug = node_plugin_by_id(NODE40_ID);
30717+
30718+
30719+ /* initialize reiser4_super_info_data */
30720+ sbinfo = get_super_private(super);
30721+ assert("", sbinfo->tree.super == super);
30722+ /* init reiser4_tree for the filesystem */
30723+ result = init_tree(&sbinfo->tree, &root_block, height, nplug);
30724+ if (result) {
30725+ kfree(sb_copy);
30726+ return result;
30727+ }
30728+ *stage = INIT_TREE;
30729+
30730+ /*
30731+ * initialize reiser4_super_info_data with data from format40 super
30732+ * block
30733+ */
30734+ sbinfo->default_uid = 0;
30735+ sbinfo->default_gid = 0;
30736+ sbinfo->mkfs_id = get_format40_mkfs_id(sb_copy);
30737+ /* number of blocks in filesystem and reserved space */
30738+ reiser4_set_block_count(super, get_format40_block_count(sb_copy));
30739+ sbinfo->blocks_free = get_format40_free_blocks(sb_copy);
30740+ kfree(sb_copy);
30741+
30742+ sbinfo->fsuid = 0;
30743+ sbinfo->fs_flags |= (1 << REISER4_ADG); /* hard links for directories
30744+ * are not supported */
30745+ sbinfo->fs_flags |= (1 << REISER4_ONE_NODE_PLUGIN); /* all nodes in
30746+ * layout 40 are
30747+ * of one
30748+ * plugin */
30749+ /* sbinfo->tmgr is initialized already */
30750+
30751+ /* recover sb data which were logged separately from sb block */
30752+
30753+ /* NOTE-NIKITA: reiser4_journal_recover_sb_data() calls
30754+ * oid_init_allocator() and reiser4_set_free_blocks() with new
30755+ * data. What's the reason to call them above? */
30756+ result = reiser4_journal_recover_sb_data(super);
30757+ if (result != 0)
30758+ return result;
30759+ *stage = JOURNAL_RECOVER;
30760+
30761+	/*
30762+	 * Set the number of used blocks. The number of used blocks is stored
30763+	 * neither in the on-disk super block nor in the journal footer blocks.
30764+	 * At this moment the actual values of the total block and free block
30765+	 * counters are set in the reiser4 super block (in-memory structure),
30766+	 * so we can calculate the number of used blocks from them.
30767+	 */
30768+ reiser4_set_data_blocks(super,
30769+ reiser4_block_count(super) -
30770+ reiser4_free_blocks(super));
30771+
30772+#if REISER4_DEBUG
30773+ sbinfo->min_blocks_used = 16 /* reserved area */ +
30774+ 2 /* super blocks */ +
30775+ 2 /* journal footer and header */ ;
30776+#endif
30777+
30778+ /* init disk space allocator */
30779+ result = sa_init_allocator(get_space_allocator(super), super, NULL);
30780+ if (result)
30781+ return result;
30782+ *stage = INIT_SA;
30783+
30784+ result = get_super_jnode(super);
30785+ if (result == 0)
30786+ *stage = ALL_DONE;
30787+ return result;
30788+}
30789+
30790+/* plugin->u.format.get_ready */
30791+int init_format_format40(struct super_block *s, void *data UNUSED_ARG)
30792+{
30793+ int result;
30794+ format40_init_stage stage;
30795+
30796+ result = try_init_format40(s, &stage);
30797+ switch (stage) {
30798+ case ALL_DONE:
30799+ assert("nikita-3458", result == 0);
30800+ break;
30801+ case INIT_JNODE:
30802+ done_super_jnode(s);
30803+ case INIT_SA:
30804+ sa_destroy_allocator(get_space_allocator(s), s);
30805+ case JOURNAL_RECOVER:
30806+ case INIT_TREE:
30807+ done_tree(&get_super_private(s)->tree);
30808+ case INIT_OID:
30809+ case KEY_CHECK:
30810+ case READ_SUPER:
30811+ case JOURNAL_REPLAY:
30812+ case INIT_STATUS:
30813+ reiser4_status_finish();
30814+ case INIT_JOURNAL_INFO:
30815+ done_journal_info(s);
30816+ case FIND_A_SUPER:
30817+ case CONSULT_DISKMAP:
30818+ case NONE_DONE:
30819+ break;
30820+ default:
30821+ impossible("nikita-3457", "init stage: %i", stage);
30822+ }
30823+
30824+ if (!rofs_super(s) && reiser4_free_blocks(s) < RELEASE_RESERVED)
30825+ return RETERR(-ENOSPC);
30826+
30827+ return result;
30828+}
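The switch above relies on deliberate fall-through: try_init_format40() records how far it got, and each case label releases exactly one resource before falling into the next, so any failure unwinds precisely what was acquired. The pattern in miniature (standalone C, hypothetical resources A and B):

#include <stdio.h>

enum stage { NONE_DONE, GOT_A, GOT_B, ALL_DONE };

static int try_init(enum stage *stage, int fail_at)
{
	*stage = NONE_DONE;
	if (fail_at == 1) return -1;
	*stage = GOT_A;			/* acquired resource A */
	if (fail_at == 2) return -1;
	*stage = GOT_B;			/* acquired resource B */
	*stage = ALL_DONE;
	return 0;
}

static int init(int fail_at)
{
	enum stage stage;
	int result = try_init(&stage, fail_at);

	switch (stage) {		/* deliberate fall-through unwinds */
	case ALL_DONE:
		break;
	case GOT_B:
		printf("release B\n");	/* fall through */
	case GOT_A:
		printf("release A\n");	/* fall through */
	case NONE_DONE:
		break;
	}
	return result;
}

int main(void)
{
	return init(2) ? 1 : 0;		/* fails after A: prints "release A" */
}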
30829+
30830+static void pack_format40_super(const struct super_block *s, char *data)
30831+{
30832+ format40_disk_super_block *super_data =
30833+ (format40_disk_super_block *) data;
30834+ reiser4_super_info_data *sbinfo = get_super_private(s);
30835+
30836+ assert("zam-591", data != NULL);
30837+
30838+ put_unaligned(cpu_to_le64(reiser4_free_committed_blocks(s)),
30839+ &super_data->free_blocks);
30840+ put_unaligned(cpu_to_le64(sbinfo->tree.root_block), &super_data->root_block);
30841+
30842+ put_unaligned(cpu_to_le64(oid_next(s)), &super_data->oid);
30843+ put_unaligned(cpu_to_le64(oids_used(s)), &super_data->file_count);
30844+
30845+ put_unaligned(cpu_to_le16(sbinfo->tree.height), &super_data->tree_height);
30846+}
30847+
30848+/* plugin->u.format.log_super
30849+ return a jnode which should be added to transaction when the super block
30850+ gets logged */
30851+jnode *log_super_format40(struct super_block *s)
30852+{
30853+ jnode *sb_jnode;
30854+
30855+ sb_jnode = get_super_private(s)->u.format40.sb_jnode;
30856+
30857+ jload(sb_jnode);
30858+
30859+ pack_format40_super(s, jdata(sb_jnode));
30860+
30861+ jrelse(sb_jnode);
30862+
30863+ return sb_jnode;
30864+}
30865+
30866+/* plugin->u.format.release */
30867+int release_format40(struct super_block *s)
30868+{
30869+ int ret;
30870+ reiser4_super_info_data *sbinfo;
30871+
30872+ sbinfo = get_super_private(s);
30873+ assert("zam-579", sbinfo != NULL);
30874+
30875+ if (!rofs_super(s)) {
30876+ ret = capture_super_block(s);
30877+ if (ret != 0)
30878+ warning("vs-898", "capture_super_block failed: %d",
30879+ ret);
30880+
30881+ ret = txnmgr_force_commit_all(s, 1);
30882+ if (ret != 0)
30883+ warning("jmacd-74438", "txn_force failed: %d", ret);
30884+
30885+ all_grabbed2free();
30886+ }
30887+
30888+ sa_destroy_allocator(&sbinfo->space_allocator, s);
30889+ done_journal_info(s);
30890+ done_super_jnode(s);
30891+
30892+ rcu_barrier();
30893+ done_tree(&sbinfo->tree);
30894+	/* call rcu_barrier() again, because some znodes were "released" in
30895+	 * done_tree(). */
30896+ rcu_barrier();
30897+
30898+ return 0;
30899+}
30900+
30901+#define FORMAT40_ROOT_LOCALITY 41
30902+#define FORMAT40_ROOT_OBJECTID 42
30903+
30904+/* plugin->u.format.root_dir_key */
30905+const reiser4_key *root_dir_key_format40(const struct super_block *super
30906+ UNUSED_ARG)
30907+{
30908+ static const reiser4_key FORMAT40_ROOT_DIR_KEY = {
30909+ .el = {
30910+ __constant_cpu_to_le64((FORMAT40_ROOT_LOCALITY << 4) | KEY_SD_MINOR),
30911+#if REISER4_LARGE_KEY
30912+ ON_LARGE_KEY(0ull,)
30913+#endif
30914+ __constant_cpu_to_le64(FORMAT40_ROOT_OBJECTID),
30915+ 0ull
30916+ }
30917+ };
30918+
30919+ return &FORMAT40_ROOT_DIR_KEY;
30920+}
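The first element of the static root key packs the locality and the minor key type as (locality << 4) | minor before the byte-order conversion. A worked check of that arithmetic, assuming (hypothetically) that KEY_SD_MINOR has the value 1:

#include <stdio.h>

#define FORMAT40_ROOT_LOCALITY 41
#define KEY_SD_MINOR 1		/* assumption: stat-data minor key type */

int main(void)
{
	unsigned long long el0 =
		((unsigned long long)FORMAT40_ROOT_LOCALITY << 4) | KEY_SD_MINOR;

	/* 41 << 4 = 656, so under this assumption the first key element
	 * is 657 before the cpu_to_le64 conversion applied above */
	printf("el[0] = %llu\n", el0);
	return 0;
}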
30921+
30922+/* plugin->u.format.check_open.
30923+   Check the opened object for validity. For now it checks only that the oid
30924+   and locality are valid; this can be improved later, and its work may depend
30925+   on the mount options. */
30926+int check_open_format40(const struct inode *object)
30927+{
30928+ oid_t max, oid;
30929+
30930+ max = oid_next(object->i_sb) - 1;
30931+
30932+ /* Check the oid. */
30933+ oid = get_inode_oid(object);
30934+ if (oid > max) {
30935+		warning("vpf-1360", "Found an object with oid %llu "
30936+			"greater than the max used oid %llu.",
30937+ (unsigned long long)oid, (unsigned long long)max);
30938+
30939+ return RETERR(-EIO);
30940+ }
30941+
30942+ /* Check the locality. */
30943+ oid = reiser4_inode_data(object)->locality_id;
30944+ if (oid > max) {
30945+		warning("vpf-1360", "Found an object with locality %llu "
30946+			"greater than the max used oid %llu.",
30947+ (unsigned long long)oid, (unsigned long long)max);
30948+
30949+ return RETERR(-EIO);
30950+ }
30951+
30952+ return 0;
30953+}
30954+
30955+/* Make Linus happy.
30956+ Local variables:
30957+ c-indentation-style: "K&R"
30958+ mode-name: "LC"
30959+ c-basic-offset: 8
30960+ tab-width: 8
30961+ fill-column: 120
30962+ scroll-step: 1
30963+ End:
30964+*/
30965Index: linux-2.6.16/fs/reiser4/plugin/disk_format/disk_format40.h
30966===================================================================
30967--- /dev/null
30968+++ linux-2.6.16/fs/reiser4/plugin/disk_format/disk_format40.h
30969@@ -0,0 +1,99 @@
30970+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
30971+
30972+/* this file contains:
30973+   - definition of the on-disk super block of the standard disk layout for
30974+     reiser 4.0 (layout 40)
30975+ - definition of layout 40 specific portion of in-core super block
30976+ - declarations of functions implementing methods of layout plugin
30977+ for layout 40
30978+ - declarations of functions used to get/set fields in layout 40 super block
30979+*/
30980+
30981+#ifndef __DISK_FORMAT40_H__
30982+#define __DISK_FORMAT40_H__
30983+
30984+/* magic for default reiser4 layout */
30985+#define FORMAT40_MAGIC "ReIsEr40FoRmAt"
30986+#define FORMAT40_OFFSET (REISER4_MASTER_OFFSET + PAGE_CACHE_SIZE)
30987+
30988+#include "../../dformat.h"
30989+
30990+#include <linux/fs.h> /* for struct super_block */
30991+
30992+typedef enum {
30993+ FORMAT40_LARGE_KEYS
30994+} format40_flags;
30995+
30996+/* on-disk super block for format 40. It is 512 bytes long */
30997+typedef struct format40_disk_super_block {
30998+ /* 0 */ d64 block_count;
30999+	/* number of blocks in the filesystem */
31000+ /* 8 */ d64 free_blocks;
31001+ /* number of free blocks */
31002+ /* 16 */ d64 root_block;
31003+ /* filesystem tree root block */
31004+ /* 24 */ d64 oid;
31005+ /* smallest free objectid */
31006+ /* 32 */ d64 file_count;
31007+ /* number of files in a filesystem */
31008+ /* 40 */ d64 flushes;
31009+	/* number of times the super block was
31010+	   flushed. Needed if format 40 ever
31011+	   has several super blocks */
31012+ /* 48 */ d32 mkfs_id;
31013+ /* unique identifier of fs */
31014+ /* 52 */ char magic[16];
31015+ /* magic string ReIsEr40FoRmAt */
31016+ /* 68 */ d16 tree_height;
31017+ /* height of filesystem tree */
31018+ /* 70 */ d16 formatting_policy;
31019+ /* 72 */ d64 flags;
31020+	/* 80 */ char not_used[432];
31021+} format40_disk_super_block;
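The offset comments in the struct above can be verified mechanically with offsetof(). A standalone sketch using fixed-width integers as stand-ins for d16/d32/d64 (an assumption; the real types are reiser4's on-disk wrappers of the same widths):

#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

/* stand-ins for reiser4's d16/d32/d64 on-disk types (assumed raw widths) */
typedef uint16_t d16; typedef uint32_t d32; typedef uint64_t d64;

struct sb40 {
	d64 block_count;	/*   0 */
	d64 free_blocks;	/*   8 */
	d64 root_block;		/*  16 */
	d64 oid;		/*  24 */
	d64 file_count;		/*  32 */
	d64 flushes;		/*  40 */
	d32 mkfs_id;		/*  48 */
	char magic[16];		/*  52 */
	d16 tree_height;	/*  68 */
	d16 formatting_policy;	/*  70 */
	d64 flags;		/*  72 */
	char not_used[432];	/*  80 */
};

int main(void)
{
	printf("magic at %zu, flags at %zu, total %zu\n",
	       offsetof(struct sb40, magic),
	       offsetof(struct sb40, flags),
	       sizeof(struct sb40));	/* expect 52, 72, 512 */
	return 0;
}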
31022+
31023+/* format 40 specific part of reiser4_super_info_data */
31024+typedef struct format40_super_info {
31025+/* format40_disk_super_block actual_sb; */
31026+ jnode *sb_jnode;
31027+ struct {
31028+ reiser4_block_nr super;
31029+ } loc;
31030+} format40_super_info;
31031+
31032+/* Defines for journal header and footer respectively. */
31033+#define FORMAT40_JOURNAL_HEADER_BLOCKNR \
31034+ ((REISER4_MASTER_OFFSET / PAGE_CACHE_SIZE) + 3)
31035+
31036+#define FORMAT40_JOURNAL_FOOTER_BLOCKNR \
31037+ ((REISER4_MASTER_OFFSET / PAGE_CACHE_SIZE) + 4)
31038+
31039+#define FORMAT40_STATUS_BLOCKNR \
31040+ ((REISER4_MASTER_OFFSET / PAGE_CACHE_SIZE) + 5)
31041+
31042+/* Diskmap declarations */
31043+#define FORMAT40_PLUGIN_DISKMAP_ID ((REISER4_FORMAT_PLUGIN_TYPE<<16) | (FORMAT40_ID))
31044+#define FORMAT40_SUPER 1
31045+#define FORMAT40_JH 2
31046+#define FORMAT40_JF 3
31047+
31048+/* declarations of functions implementing methods of the layout plugin for
31049+   format 40. The functions themselves are in disk_format40.c */
31050+int init_format_format40(struct super_block *, void *data);
31051+const reiser4_key *root_dir_key_format40(const struct super_block *);
31052+int release_format40(struct super_block *s);
31053+jnode *log_super_format40(struct super_block *s);
31054+int check_open_format40(const struct inode *object);
31055+
31056+/* __DISK_FORMAT40_H__ */
31057+#endif
31058+
31059+/* Make Linus happy.
31060+ Local variables:
31061+ c-indentation-style: "K&R"
31062+ mode-name: "LC"
31063+ c-basic-offset: 8
31064+ tab-width: 8
31065+ fill-column: 120
31066+ scroll-step: 1
31067+ End:
31068+*/
31069Index: linux-2.6.16/fs/reiser4/plugin/fibration.c
31070===================================================================
31071--- /dev/null
31072+++ linux-2.6.16/fs/reiser4/plugin/fibration.c
31073@@ -0,0 +1,174 @@
31074+/* Copyright 2004 by Hans Reiser, licensing governed by
31075+ * reiser4/README */
31076+
31077+/* Directory fibrations */
31078+
31079+/*
31080+ * Suppose we have a directory tree with the sources of some project. During
31081+ * compilation .o files are created within this tree. This makes access
31082+ * to the original source files less efficient, because source files are
31083+ * now "diluted" by object files: the default directory plugin uses a prefix
31084+ * of the file name as a part of the key for the directory entry (and this
31085+ * part is also inherited by the key of the file body). This means that
31086+ * foo.o will be located close to foo.c and foo.h in the tree.
31087+ *
31088+ * To avoid this effect the directory plugin fills the highest 7 (originally
31089+ * unused) bits of the second component of the directory entry key with a
31090+ * bit-pattern that depends on the file name (see
31091+ * fs/reiser4/kassign.c:build_entry_key_common()). These bits are called the
31092+ * "fibre". The fibre of the file name key is inherited by the key of the
31093+ * stat data and the keys of the file body (in the case of REISER4_LARGE_KEY).
31094+ *
31095+ * The fibre for a given file is chosen by a per-directory fibration
31096+ * plugin. Names within a given fibre are ordered lexicographically.
31097+ */
31098+
31099+#include "../debug.h"
31100+#include "plugin_header.h"
31101+#include "plugin.h"
31102+#include "../super.h"
31103+#include "../inode.h"
31104+
31105+#include <linux/types.h>
31106+
31107+static const int fibre_shift = 57;
31108+
31109+#define FIBRE_NO(n) (((__u64)(n)) << fibre_shift)
31110+
31111+/*
31112+ * Trivial fibration: all files of directory are just ordered
31113+ * lexicographically.
31114+ */
31115+static __u64 fibre_trivial(const struct inode *dir, const char *name, int len)
31116+{
31117+ return FIBRE_NO(0);
31118+}
31119+
31120+/*
31121+ * dot-o fibration: place .o files after all others.
31122+ */
31123+static __u64 fibre_dot_o(const struct inode *dir, const char *name, int len)
31124+{
31125+ /* special treatment for .*\.o */
31126+ if (len > 2 && name[len - 1] == 'o' && name[len - 2] == '.')
31127+ return FIBRE_NO(1);
31128+ else
31129+ return FIBRE_NO(0);
31130+}
31131+
31132+/*
31133+ * ext.1 fibration: subdivide the directory into 128 fibres, one for each
31134+ * 7-bit extension character (file "foo.h" goes into fibre "h"), plus a
31135+ * default fibre for the rest.
31136+ */
31137+static __u64 fibre_ext_1(const struct inode *dir, const char *name, int len)
31138+{
31139+ if (len > 2 && name[len - 2] == '.')
31140+ return FIBRE_NO(name[len - 1]);
31141+ else
31142+ return FIBRE_NO(0);
31143+}
31144+
31145+/*
31146+ * ext.3 fibration: try to separate files with different 3-character
31147+ * extensions from each other.
31148+ */
31149+static __u64 fibre_ext_3(const struct inode *dir, const char *name, int len)
31150+{
31151+ if (len > 4 && name[len - 4] == '.')
31152+ return FIBRE_NO(name[len - 3] + name[len - 2] + name[len - 1]);
31153+ else
31154+ return FIBRE_NO(0);
31155+}
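To see the effect of a fibration, it is enough to watch which fibre each name lands in: names sharing a fibre keep their top key bits equal and therefore sort together. A small standalone driver that duplicates the fibre_dot_o() rule above (plain C, no reiser4 types):

#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define FIBRE_SHIFT 57
#define FIBRE_NO(n) (((uint64_t)(n)) << FIBRE_SHIFT)

/* same rule as fibre_dot_o() above: .o files go to fibre 1 */
static uint64_t dot_o_fibre(const char *name)
{
	size_t len = strlen(name);

	if (len > 2 && name[len - 1] == 'o' && name[len - 2] == '.')
		return FIBRE_NO(1);
	return FIBRE_NO(0);
}

int main(void)
{
	const char *names[] = { "foo.c", "foo.o", "foo.h" };
	int i;

	for (i = 0; i < 3; i++)
		printf("%-6s -> fibre %d\n", names[i],
		       (int)(dot_o_fibre(names[i]) >> FIBRE_SHIFT));
	/* foo.c and foo.h share fibre 0 and sort together; foo.o gets 1 */
	return 0;
}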
31156+
31157+static int change_fibration(struct inode *inode, reiser4_plugin * plugin)
31158+{
31159+ int result;
31160+
31161+ assert("nikita-3503", inode != NULL);
31162+ assert("nikita-3504", plugin != NULL);
31163+
31164+ assert("nikita-3505", is_reiser4_inode(inode));
31165+ assert("nikita-3506", inode_dir_plugin(inode) != NULL);
31166+ assert("nikita-3507",
31167+ plugin->h.type_id == REISER4_FIBRATION_PLUGIN_TYPE);
31168+
31169+ result = 0;
31170+ if (inode_fibration_plugin(inode) == NULL ||
31171+ inode_fibration_plugin(inode)->h.id != plugin->h.id) {
31172+ if (is_dir_empty(inode) == 0)
31173+ result =
31174+ plugin_set_fibration(&reiser4_inode_data(inode)->
31175+ pset, &plugin->fibration);
31176+ else
31177+ result = RETERR(-ENOTEMPTY);
31178+
31179+ }
31180+ return result;
31181+}
31182+
31183+static reiser4_plugin_ops fibration_plugin_ops = {
31184+ .init = NULL,
31185+ .load = NULL,
31186+ .save_len = NULL,
31187+ .save = NULL,
31188+ .change = change_fibration
31189+};
31190+
31191+/* fibration plugins */
31192+fibration_plugin fibration_plugins[LAST_FIBRATION_ID] = {
31193+ [FIBRATION_LEXICOGRAPHIC] = {
31194+ .h = {
31195+ .type_id = REISER4_FIBRATION_PLUGIN_TYPE,
31196+ .id = FIBRATION_LEXICOGRAPHIC,
31197+ .pops = &fibration_plugin_ops,
31198+ .label = "lexicographic",
31199+ .desc = "no fibration",
31200+ .linkage = {NULL, NULL}
31201+ },
31202+ .fibre = fibre_trivial
31203+ },
31204+ [FIBRATION_DOT_O] = {
31205+ .h = {
31206+ .type_id = REISER4_FIBRATION_PLUGIN_TYPE,
31207+ .id = FIBRATION_DOT_O,
31208+ .pops = &fibration_plugin_ops,
31209+ .label = "dot-o",
31210+ .desc = "fibrate .o files separately",
31211+ .linkage = {NULL, NULL}
31212+ },
31213+ .fibre = fibre_dot_o
31214+ },
31215+ [FIBRATION_EXT_1] = {
31216+ .h = {
31217+ .type_id = REISER4_FIBRATION_PLUGIN_TYPE,
31218+ .id = FIBRATION_EXT_1,
31219+ .pops = &fibration_plugin_ops,
31220+ .label = "ext-1",
31221+ .desc = "fibrate file by single character extension",
31222+ .linkage = {NULL, NULL}
31223+ },
31224+ .fibre = fibre_ext_1
31225+ },
31226+ [FIBRATION_EXT_3] = {
31227+ .h = {
31228+ .type_id = REISER4_FIBRATION_PLUGIN_TYPE,
31229+ .id = FIBRATION_EXT_3,
31230+ .pops = &fibration_plugin_ops,
31231+ .label = "ext-3",
31232+ .desc = "fibrate file by three character extension",
31233+ .linkage = {NULL, NULL}
31234+ },
31235+ .fibre = fibre_ext_3
31236+ }
31237+};
31238+
31239+/*
31240+ * Local variables:
31241+ * c-indentation-style: "K&R"
31242+ * mode-name: "LC"
31243+ * c-basic-offset: 8
31244+ * tab-width: 8
31245+ * fill-column: 79
31246+ * End:
31247+ */
31248Index: linux-2.6.16/fs/reiser4/plugin/fibration.h
31249===================================================================
31250--- /dev/null
31251+++ linux-2.6.16/fs/reiser4/plugin/fibration.h
31252@@ -0,0 +1,37 @@
31253+/* Copyright 2004 by Hans Reiser, licensing governed by reiser4/README */
31254+
31255+/* Fibration plugin used by hashed directory plugin to segment content
31256+ * of directory. See fs/reiser4/plugin/fibration.c for more on this. */
31257+
31258+#if !defined( __FS_REISER4_PLUGIN_FIBRATION_H__ )
31259+#define __FS_REISER4_PLUGIN_FIBRATION_H__
31260+
31261+#include "plugin_header.h"
31262+
31263+typedef struct fibration_plugin {
31264+ /* generic fields */
31265+ plugin_header h;
31266+
31267+ __u64(*fibre) (const struct inode * dir, const char *name, int len);
31268+} fibration_plugin;
31269+
31270+typedef enum {
31271+ FIBRATION_LEXICOGRAPHIC,
31272+ FIBRATION_DOT_O,
31273+ FIBRATION_EXT_1,
31274+ FIBRATION_EXT_3,
31275+ LAST_FIBRATION_ID
31276+} reiser4_fibration_id;
31277+
31278+/* __FS_REISER4_PLUGIN_FIBRATION_H__ */
31279+#endif
31280+
31281+/* Make Linus happy.
31282+ Local variables:
31283+ c-indentation-style: "K&R"
31284+ mode-name: "LC"
31285+ c-basic-offset: 8
31286+ tab-width: 8
31287+ fill-column: 120
31288+ End:
31289+*/
31290Index: linux-2.6.16/fs/reiser4/plugin/file/Makefile
31291===================================================================
31292--- /dev/null
31293+++ linux-2.6.16/fs/reiser4/plugin/file/Makefile
31294@@ -0,0 +1,7 @@
31295+obj-$(CONFIG_REISER4_FS) += file_plugins.o
31296+
31297+file_plugins-objs := \
31298+ file.o \
31299+ tail_conversion.o \
31300+ symlink.o \
31301+ cryptcompress.o
31302Index: linux-2.6.16/fs/reiser4/plugin/file/cryptcompress.c
31303===================================================================
31304--- /dev/null
31305+++ linux-2.6.16/fs/reiser4/plugin/file/cryptcompress.c
31306@@ -0,0 +1,3817 @@
31307+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
31308+ reiser4/README */
31309+
31310+/* This file contains implementations of inode/file/address_space/file plugin
31311+ * operations specific for cryptcompress file plugin which manages files with
31312+ * compressed and encrypted bodies. "Cryptcompress file" is built of items of
31313+ * CTAIL_ID (see http://www.namesys.com/cryptcompress_design.html for details).
31314+ */
31315+
31316+#include "../../page_cache.h"
31317+#include "../../inode.h"
31318+#include "../cluster.h"
31319+#include "../object.h"
31320+#include "../../tree_walk.h"
31321+#include "cryptcompress.h"
31322+
31323+#include <asm/scatterlist.h>
31324+#include <linux/pagevec.h>
31325+#include <asm/uaccess.h>
31326+#include <linux/swap.h>
31327+#include <linux/writeback.h>
31328+#include <linux/random.h>
31329+
31330+/* get cryptcompress specific portion of inode */
31331+cryptcompress_info_t *cryptcompress_inode_data(const struct inode *inode)
31332+{
31333+ return &reiser4_inode_data(inode)->file_plugin_data.cryptcompress_info;
31334+}
31335+
31336+/* plugin->u.file.init_inode_data */
31337+void
31338+init_inode_data_cryptcompress(struct inode *inode,
31339+ reiser4_object_create_data * crd, int create)
31340+{
31341+ cryptcompress_info_t *data;
31342+
31343+ data = cryptcompress_inode_data(inode);
31344+ assert("edward-685", data != NULL);
31345+
31346+ memset(data, 0, sizeof(*data));
31347+
31348+ init_rwsem(&data->lock);
31349+ toggle_compression(data, 1);
31350+ init_inode_ordering(inode, crd, create);
31351+}
31352+
31353+#if REISER4_DEBUG
31354+int crc_inode_ok(struct inode *inode)
31355+{
31356+ if (cluster_shift_ok(inode_cluster_shift(inode)))
31357+ return 1;
31358+ assert("edward-686", 0);
31359+ return 0;
31360+}
31361+#endif
31362+
31363+static int check_cryptcompress(struct inode *inode)
31364+{
31365+ int result = 0;
31366+ assert("edward-1307", inode_compression_plugin(inode) != NULL);
31367+
31368+ if (inode_cluster_size(inode) < PAGE_CACHE_SIZE) {
31369+ warning("edward-1331",
31370+ "%s clusters are unsupported",
31371+ inode_cluster_plugin(inode)->h.label);
31372+ return RETERR(-EINVAL);
31373+ }
31374+
31375+ /* FIXME-EDWARD: init? or check? */
31376+ if (inode_compression_plugin(inode)->init)
31377+ result = inode_compression_plugin(inode)->init();
31378+ return result;
31379+}
31380+
31381+/* The following is a part of the reiser4 cipher key manager,
31382+   which is called when opening/creating a cryptcompress file */
31383+
31384+/* get/set cipher key info */
31385+crypto_stat_t * inode_crypto_stat (struct inode * inode)
31386+{
31387+ assert("edward-90", inode != NULL);
31388+ assert("edward-91", reiser4_inode_data(inode) != NULL);
31389+ return cryptcompress_inode_data(inode)->crypt;
31390+}
31391+
31392+static void set_inode_crypto_stat (struct inode * inode, crypto_stat_t * stat)
31393+{
31394+ cryptcompress_inode_data(inode)->crypt = stat;
31395+}
31396+
31397+/* allocate a cipher key info */
31398+crypto_stat_t * alloc_crypto_stat (struct inode * inode)
31399+{
31400+ crypto_stat_t * info;
31401+ int fipsize;
31402+
31403+ assert("edward-1421", 0);
31404+ info = kmalloc(sizeof(*info), GFP_KERNEL);
31405+ if (!info)
31406+ return ERR_PTR(-ENOMEM);
31407+ memset(info, 0, sizeof (*info));
31408+ fipsize = inode_digest_plugin(inode)->fipsize;
31409+ info->keyid = kmalloc(fipsize, GFP_KERNEL);
31410+ if (!info->keyid) {
31411+ kfree(info);
31412+ return ERR_PTR(-ENOMEM);
31413+ }
31414+ return info;
31415+}
31416+
31417+#if 0
31418+/* allocate/free low-level info for cipher and digest
31419+ transforms */
31420+static int
31421+alloc_crypto_tfms(plugin_set * pset, crypto_stat_t * info)
31422+{
31423+ struct crypto_tfm * ret = NULL;
31424+ cipher_plugin * cplug = pset->cipher;
31425+ digest_plugin * dplug = pset->digest;
31426+
31427+ assert("edward-1363", info != NULL);
31428+ assert("edward-414", cplug != NULL);
31429+ assert("edward-415", dplug != NULL);
31430+
31431+ if (cplug->alloc) {
31432+ ret = cplug->alloc();
31433+ if (ret == NULL) {
31434+ warning("edward-1364",
31435+ "Can not allocate info for %s\n",
31436+ cplug->h.desc);
31437+ return RETERR(-EINVAL);
31438+ }
31439+ }
31440+ info_set_tfm(info, CIPHER_TFM, ret);
31441+ if (dplug->alloc) {
31442+ ret = dplug->alloc();
31443+ if (ret == NULL) {
31444+ warning("edward-1365",
31445+ "Can not allocate info for %s\n",
31446+ dplug->h.desc);
31447+ goto err;
31448+ }
31449+ }
31450+ info_set_tfm(info, DIGEST_TFM, ret);
31451+ return 0;
31452+ err:
31453+ if (cplug->free) {
31454+ cplug->free(info->tfma[CIPHER_TFM].tfm);
31455+ info_set_tfm(info, CIPHER_TFM, NULL);
31456+ }
31457+ return RETERR(-EINVAL);
31458+}
31459+#endif
31460+
31461+static void
31462+free_crypto_tfms(crypto_stat_t * info)
31463+{
31464+ assert("edward-1366", info != NULL);
31465+ if (!info_cipher_tfm(info))
31466+ return;
31467+ info_cipher_plugin(info)->free(info_cipher_tfm(info));
31468+ info_set_tfm(info, CIPHER_TFM, NULL);
31469+ info_digest_plugin(info)->free(info_digest_tfm(info));
31470+ info_set_tfm(info, DIGEST_TFM, NULL);
31471+ return;
31472+}
31473+
31474+#if 0
31475+/* create a key fingerprint for disk stat-data */
31476+static int create_keyid (crypto_stat_t * info, crypto_data_t * data)
31477+{
31478+ int ret = -ENOMEM;
31479+ size_t blk, pad;
31480+ __u8 * dmem;
31481+ __u8 * cmem;
31482+ struct crypto_tfm * dtfm;
31483+ struct crypto_tfm * ctfm;
31484+ struct scatterlist sg;
31485+
31486+ assert("edward-1422", 0);
31487+ assert("edward-1367", info != NULL);
31488+ assert("edward-1368", info->keyid != NULL);
31489+
31490+ dtfm = info_digest_tfm(info);
31491+ ctfm = info_cipher_tfm(info);
31492+
31493+ dmem = kmalloc((size_t)crypto_tfm_alg_digestsize(dtfm),
31494+ GFP_KERNEL);
31495+ if (!dmem)
31496+ goto exit1;
31497+
31498+ blk = crypto_tfm_alg_blocksize(ctfm);
31499+
31500+ pad = data->keyid_size % blk;
31501+ pad = (pad ? blk - pad : 0);
31502+
31503+ cmem = kmalloc((size_t)data->keyid_size + pad, GFP_KERNEL);
31504+ if (!cmem)
31505+ goto exit2;
31506+ memcpy(cmem, data->keyid, data->keyid_size);
31507+ memset(cmem + data->keyid_size, 0, pad);
31508+
31509+ sg.page = virt_to_page(cmem);
31510+ sg.offset = offset_in_page(cmem);
31511+ sg.length = data->keyid_size + pad;
31512+
31513+ ret = crypto_cipher_encrypt(ctfm, &sg, &sg, data->keyid_size + pad);
31514+ if (ret) {
31515+ warning("edward-1369",
31516+ "encryption failed flags=%x\n", ctfm->crt_flags);
31517+ goto exit3;
31518+ }
31519+ crypto_digest_init (dtfm);
31520+ crypto_digest_update (dtfm, &sg, 1);
31521+ crypto_digest_final (dtfm, dmem);
31522+ memcpy(info->keyid, dmem, info_digest_plugin(info)->fipsize);
31523+ exit3:
31524+ kfree(cmem);
31525+ exit2:
31526+ kfree(dmem);
31527+ exit1:
31528+ return ret;
31529+}
31530+#endif
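So the stored keyid above is derived by encrypting a user-supplied id with the instantiated key and then digesting the ciphertext; the fingerprint identifies the key without revealing it. The shape of that derivation with toy stand-ins for the cipher and digest transforms (XOR and FNV-1a here are illustrations only, not what the kernel crypto layer does):

#include <stdio.h>
#include <stdint.h>

/* toy stand-ins: the real code uses kernel crypto cipher/digest tfms */
static void toy_encrypt(uint8_t *buf, size_t n, uint8_t key)
{
	size_t i;
	for (i = 0; i < n; i++)
		buf[i] ^= key;
}

static uint32_t toy_digest(const uint8_t *buf, size_t n)
{
	uint32_t h = 2166136261u;	/* FNV-1a */
	size_t i;
	for (i = 0; i < n; i++)
		h = (h ^ buf[i]) * 16777619u;
	return h;
}

int main(void)
{
	uint8_t id[16] = "user-chosen-id";	/* padded to cipher block */

	toy_encrypt(id, sizeof(id), 0x5a);	/* encrypt with the key... */
	printf("keyid = %08x\n", toy_digest(id, sizeof(id)));	/* ...digest */
	return 0;
}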
31531+
31532+static void destroy_keyid(crypto_stat_t * info)
31533+{
31534+ assert("edward-1370", info != NULL);
31535+ assert("edward-1371", info->keyid != NULL);
31536+ kfree(info->keyid);
31537+ return;
31538+}
31539+
31540+static void free_crypto_stat (crypto_stat_t * info)
31541+{
31542+ assert("edward-1372", info != NULL);
31543+
31544+ free_crypto_tfms(info);
31545+ destroy_keyid(info);
31546+ kfree(info);
31547+}
31548+
31549+#if 0
31550+static void instantiate_crypto_stat(crypto_stat_t * info)
31551+{
31552+ assert("edward-1373", info != NULL);
31553+ assert("edward-1374", info->inst == 0);
31554+ info->inst = 1;
31555+}
31556+#endif
31557+
31558+static void uninstantiate_crypto_stat(crypto_stat_t * info)
31559+{
31560+ assert("edward-1375", info != NULL);
31561+ info->inst = 0;
31562+}
31563+
31564+static int crypto_stat_instantiated(crypto_stat_t * info)
31565+{
31566+ return info->inst;
31567+}
31568+
31569+static int inode_has_cipher_key(struct inode * inode)
31570+{
31571+ assert("edward-1376", inode != NULL);
31572+ return inode_crypto_stat(inode) &&
31573+ crypto_stat_instantiated(inode_crypto_stat(inode));
31574+}
31575+
31576+static void inode_free_crypto_stat (struct inode * inode)
31577+{
31578+ uninstantiate_crypto_stat(inode_crypto_stat(inode));
31579+ free_crypto_stat(inode_crypto_stat(inode));
31580+}
31581+
31582+static int need_cipher(struct inode * inode)
31583+{
31584+ return inode_cipher_plugin(inode) !=
31585+ cipher_plugin_by_id(NONE_CIPHER_ID);
31586+}
31587+
31588+/* Create a crypto-stat and attach the result to @object.
31589+   If success is returned, then the low-level cipher info contains
31590+   an instantiated key */
31591+#if 0
31592+crypto_stat_t *
31593+create_crypto_stat(struct inode * object,
31594+ crypto_data_t * data /* this contains a (uninstantiated)
31595+ cipher key imported from user
31596+ space */)
31597+{
31598+ int ret;
31599+ crypto_stat_t * info;
31600+
31601+ assert("edward-1377", data != NULL);
31602+ assert("edward-1378", need_cipher(object));
31603+
31604+ if (inode_file_plugin(object) !=
31605+ file_plugin_by_id(DIRECTORY_FILE_PLUGIN_ID))
31606+ return ERR_PTR(-EINVAL);
31607+
31608+ info = alloc_crypto_stat(object);
31609+ if (IS_ERR(info))
31610+ return info;
31611+ ret = alloc_crypto_tfms(reiser4_inode_data(object)->pset, info);
31612+ if (ret)
31613+ goto err;
31614+ /* Someone can change plugins of the host (for example if
31615+ the host is a directory), so we keep the original ones
31616+ in the crypto-stat. */
31617+ info_set_cipher_plugin(info, inode_cipher_plugin(object));
31618+ info_set_digest_plugin(info, inode_digest_plugin(object));
31619+ /* instantiating a key */
31620+ ret = crypto_cipher_setkey(info_cipher_tfm(info),
31621+ data->key,
31622+ data->keysize);
31623+ if (ret) {
31624+ warning("edward-1379",
31625+ "setkey failed flags=%x\n",
31626+ info_cipher_tfm(info)->crt_flags);
31627+ goto err;
31628+ }
31629+ info->keysize = data->keysize;
31630+ ret = create_keyid(info, data);
31631+ if (ret)
31632+ goto err;
31633+ instantiate_crypto_stat(info);
31634+ return info;
31635+ err:
31636+ free_crypto_stat(info);
31637+ return ERR_PTR(ret);
31638+}
31639+#endif
31640+
31641+/* increment/decrement a load counter when
31642+   attaching/detaching the crypto-stat to/from any object */
31643+static void load_crypto_stat(crypto_stat_t * info)
31644+{
31645+ assert("edward-1380", info != NULL);
31646+ inc_keyload_count(info);
31647+}
31648+
31649+static void unload_crypto_stat(struct inode * inode)
31650+{
31651+ crypto_stat_t * info = inode_crypto_stat(inode);
31652+ assert("edward-1381", info->keyload_count > 0);
31653+
31654+ dec_keyload_count(inode_crypto_stat(inode));
31655+ if (info->keyload_count == 0)
31656+ /* final release */
31657+ inode_free_crypto_stat(inode);
31658+}
31659+
31660+/* attach/detach an existing crypto-stat */
31661+void attach_crypto_stat(struct inode * inode, crypto_stat_t * info)
31662+{
31663+ assert("edward-1382", inode != NULL);
31664+ assert("edward-1383", info != NULL);
31665+ assert("edward-1384", inode_crypto_stat(inode) == NULL);
31666+
31667+ set_inode_crypto_stat(inode, info);
31668+ load_crypto_stat(info);
31669+}
31670+
31671+/* returns true if the crypto-stat can be attached to @host */
31672+#if REISER4_DEBUG
31673+static int host_allows_crypto_stat(struct inode * host)
31674+{
31675+ int ret;
31676+ file_plugin * fplug = inode_file_plugin(host);
31677+
31678+ switch (fplug->h.id) {
31679+ case CRC_FILE_PLUGIN_ID:
31680+ ret = 1;
31681+ break;
31682+ default:
31683+ ret = 0;
31684+ }
31685+ return ret;
31686+}
31687+#endif /* REISER4_DEBUG */
31688+
31689+void detach_crypto_stat(struct inode * inode)
31690+{
31691+ assert("edward-1385", inode != NULL);
31692+ assert("edward-1386", host_allows_crypto_stat(inode));
31693+
31694+ if (inode_crypto_stat(inode))
31695+ unload_crypto_stat(inode);
31696+ set_inode_crypto_stat(inode, NULL);
31697+}
31698+
31699+#if 0
31700+
31701+/* compare fingerprints of @child and @parent */
31702+static int keyid_eq(crypto_stat_t * child, crypto_stat_t * parent)
31703+{
31704+ return !memcmp(child->keyid, parent->keyid, info_digest_plugin(parent)->fipsize);
31705+}
31706+
31707+/* check if a crypto-stat (which is bound to @parent) can be inherited */
31708+int can_inherit_crypto_crc(struct inode *child, struct inode *parent)
31709+{
31710+ if (!need_cipher(child))
31711+ return 0;
31712+ /* the child is created */
31713+ if (!inode_crypto_stat(child))
31714+ return 1;
31715+ /* the child is looked up */
31716+ if (!inode_crypto_stat(parent))
31717+ return 0;
31718+ return (inode_cipher_plugin(child) == inode_cipher_plugin(parent) &&
31719+ inode_digest_plugin(child) == inode_digest_plugin(parent) &&
31720+ inode_crypto_stat(child)->keysize == inode_crypto_stat(parent)->keysize &&
31721+ keyid_eq(inode_crypto_stat(child), inode_crypto_stat(parent)));
31722+}
31723+#endif
31724+
31725+/* helper functions for ->create() method of the cryptcompress plugin */
31726+static int inode_set_crypto(struct inode * object)
31727+{
31728+ reiser4_inode * info;
31729+ if (!inode_crypto_stat(object)) {
31730+ if (need_cipher(object))
31731+ return RETERR(-EINVAL);
31732+ /* the file is not to be encrypted */
31733+ return 0;
31734+ }
31735+ info = reiser4_inode_data(object);
31736+ info->extmask |= (1 << CRYPTO_STAT);
31737+ info->plugin_mask |= (1 << PSET_CIPHER) | (1 << PSET_DIGEST);
31738+ return 0;
31739+}
31740+
31741+static int
31742+inode_set_compression(struct inode * object)
31743+{
31744+ int result = 0;
31745+ compression_plugin * cplug;
31746+ reiser4_inode * info = reiser4_inode_data(object);
31747+
31748+ cplug = inode_compression_plugin(object);
31749+
31750+ if (cplug->init != NULL) {
31751+ result = cplug->init();
31752+ if (result)
31753+ return result;
31754+ }
31755+ info->plugin_mask |= (1 << PSET_COMPRESSION);
31756+
31757+ return 0;
31758+}
31759+
31760+static void
31761+inode_set_compression_mode(struct inode * object)
31762+{
31763+ reiser4_inode * info = reiser4_inode_data(object);
31764+
31765+ info->plugin_mask |= (1 << PSET_COMPRESSION_MODE);
31766+ return;
31767+}
31768+
31769+static int inode_set_cluster(struct inode *object)
31770+{
31771+ reiser4_inode *info;
31772+ cluster_plugin *cplug;
31773+
31774+ assert("edward-696", object != NULL);
31775+
31776+ info = reiser4_inode_data(object);
31777+ cplug = inode_cluster_plugin(object);
31778+
31779+ if (cplug->shift < PAGE_CACHE_SHIFT) {
31780+ warning("edward-1320",
31781+			"Can not support %s clusters (less than page size)",
31782+ cplug->h.label);
31783+ return RETERR(-EINVAL);
31784+ }
31785+ info->plugin_mask |= (1 << PSET_CLUSTER);
31786+ return 0;
31787+}
31788+
31789+/* ->destroy_inode() method of the cryptcompress plugin */
31790+void destroy_inode_cryptcompress(struct inode * inode)
31791+{
31792+ assert("edward-23", cryptcompress_inode_data(inode)->pgcount == 0);
31793+ detach_crypto_stat(inode);
31794+ return;
31795+}
31796+
31797+/* ->create() method of the cryptcompress plugin
31798+
31799+. install plugins
31800+. attach crypto info if specified
31801+. attach compression info if specified
31802+. attach cluster info
31803+*/
31804+int
31805+create_cryptcompress(struct inode *object, struct inode *parent,
31806+ reiser4_object_create_data * data)
31807+{
31808+ int result;
31809+ reiser4_inode *info;
31810+
31811+ assert("edward-23", object != NULL);
31812+ assert("edward-24", parent != NULL);
31813+ assert("edward-30", data != NULL);
31814+ assert("edward-26", inode_get_flag(object, REISER4_NO_SD));
31815+ assert("edward-27", data->id == CRC_FILE_PLUGIN_ID);
31816+
31817+ info = reiser4_inode_data(object);
31818+
31819+ assert("edward-29", info != NULL);
31820+
31821+ /* set file bit */
31822+ info->plugin_mask |= (1 << PSET_FILE);
31823+
31824+ /* set crypto */
31825+ result = inode_set_crypto(object);
31826+ if (result)
31827+ goto error;
31828+ /* set compression */
31829+ result = inode_set_compression(object);
31830+ if (result)
31831+ goto error;
31832+ inode_set_compression_mode(object);
31833+
31834+ /* set cluster info */
31835+ result = inode_set_cluster(object);
31836+ if (result)
31837+ goto error;
31838+ /* set plugin mask */
31839+ info->extmask |= (1 << PLUGIN_STAT);
31840+
31841+ /* save everything in disk stat-data */
31842+ result = write_sd_by_inode_common(object);
31843+ if (!result)
31844+ return 0;
31845+ error:
31846+ detach_crypto_stat(object);
31847+ return result;
31848+}
31849+
31850+/* ->open() method of the cryptcompress plugin */
31851+int open_cryptcompress(struct inode * inode, struct file * file)
31852+{
31853+ struct inode * parent;
31854+
31855+ assert("edward-1394", inode != NULL);
31856+ assert("edward-1395", file != NULL);
31857+ assert("edward-1396", file != NULL);
31858+ assert("edward-1397", file->f_dentry->d_inode == inode);
31859+ assert("edward-1398", file->f_dentry->d_parent != NULL);
31860+ assert("edward-1399", file->f_dentry->d_parent->d_inode != NULL);
31861+ assert("edward-698",
31862+ inode_file_plugin(inode) ==
31863+ file_plugin_by_id(CRC_FILE_PLUGIN_ID));
31864+
31865+ if (!need_cipher(inode))
31866+ /* the file is not to be ciphered */
31867+ return 0;
31868+ parent = file->f_dentry->d_parent->d_inode;
31869+ if (!inode_has_cipher_key(inode))
31870+ return RETERR(-EINVAL);
31871+ return 0;
31872+}
31873+
31874+/* returns a blocksize, the attribute of a cipher algorithm */
31875+static unsigned int
31876+cipher_blocksize(struct inode * inode)
31877+{
31878+ assert("edward-758", need_cipher(inode));
31879+ assert("edward-1400", inode_crypto_stat(inode) != NULL);
31880+ return crypto_tfm_alg_blocksize
31881+ (info_cipher_tfm(inode_crypto_stat(inode)));
31882+}
31883+
31884+/* returns the offset translated by the scale factor of the cipher algorithm */
31885+static loff_t inode_scaled_offset (struct inode * inode,
31886+ const loff_t src_off /* input offset */)
31887+{
31888+ assert("edward-97", inode != NULL);
31889+
31890+ if (!need_cipher(inode) ||
31891+ src_off == get_key_offset(min_key()) ||
31892+ src_off == get_key_offset(max_key()))
31893+ return src_off;
31894+
31895+ return inode_cipher_plugin(inode)->scale(inode,
31896+ cipher_blocksize(inode),
31897+ src_off);
31898+}
31899+
31900+/* returns disk cluster size */
31901+size_t inode_scaled_cluster_size(struct inode * inode)
31902+{
31903+ assert("edward-110", inode != NULL);
31904+
31905+ return inode_scaled_offset(inode, inode_cluster_size(inode));
31906+}
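inode_scaled_offset() maps a plaintext offset to its position in the possibly expanded ciphertext; for a block cipher the natural scale function rounds up to the cipher block size. A hedged sketch of that rounding (the real ->scale() method is defined by each cipher plugin and may differ):

#include <stdio.h>

/* round @off up to a multiple of the cipher block size @blk --
 * one plausible scale function, not necessarily the plugin's */
static unsigned long long scale_offset(unsigned long long off,
				       unsigned int blk)
{
	return (off + blk - 1) / blk * blk;
}

int main(void)
{
	/* a tail padded to 16-byte AES-style blocks */
	printf("scaled = %llu\n", scale_offset(4000, 16));	/* 4000 */
	printf("scaled = %llu\n", scale_offset(4001, 16));	/* 4016 */
	return 0;
}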
31907+
31908+static int new_cluster(reiser4_cluster_t * clust, struct inode *inode)
31909+{
31910+ return (clust_to_off(clust->index, inode) >= inode->i_size);
31911+}
31912+
31913+/* set number of cluster pages */
31914+static void set_cluster_nrpages(reiser4_cluster_t * clust, struct inode *inode)
31915+{
31916+ reiser4_slide_t *win;
31917+
31918+ assert("edward-180", clust != NULL);
31919+ assert("edward-1040", inode != NULL);
31920+
31921+ win = clust->win;
31922+ if (!win) {
31923+ /* NOTE-EDWARD: i_size should be protected */
31924+ clust->nr_pages =
31925+ count_to_nrpages(fsize_to_count(clust, inode));
31926+ return;
31927+ }
31928+ assert("edward-1176", clust->op != PCL_UNKNOWN);
31929+ assert("edward-1064", win->off + win->count + win->delta != 0);
31930+
31931+ if (win->stat == HOLE_WINDOW &&
31932+ win->off == 0 && win->count == inode_cluster_size(inode)) {
31933+		/* special case: we start writing a hole from a fake cluster */
31934+ clust->nr_pages = 0;
31935+ return;
31936+ }
31937+ clust->nr_pages =
31938+ count_to_nrpages(max_count(win->off + win->count + win->delta,
31939+ fsize_to_count(clust, inode)));
31940+ return;
31941+}
31942+
31943+/* ->key_by_inode() method of the cryptcompress plugin */
31944+/* see plugin/plugin.h for details */
31945+int
31946+key_by_inode_cryptcompress(struct inode *inode, loff_t off, reiser4_key * key)
31947+{
31948+ loff_t clust_off;
31949+
31950+ assert("edward-64", inode != 0);
31951+ // assert("edward-112", ergo(off != get_key_offset(max_key()), !off_to_cloff(off, inode)));
31952+ /* don't come here with other offsets */
31953+
31954+ clust_off =
31955+ (off ==
31956+ get_key_offset(max_key())? get_key_offset(max_key()) :
31957+ off_to_clust_to_off(off, inode));
31958+
31959+ key_by_inode_and_offset_common(inode, 0, key);
31960+ set_key_offset(key,
31961+ (__u64) (!inode_crypto_stat(inode) ? clust_off :
31962+ inode_scaled_offset(inode, clust_off)));
31963+ return 0;
31964+}
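key_by_inode_cryptcompress() never keys into the middle of a logical cluster: an ordinary offset is first rounded down to its cluster boundary (off_to_clust_to_off()), then scaled if the file is ciphered. The rounding step in isolation, for a power-of-two cluster size (a sketch; clust_start is a hypothetical name):

#include <stdio.h>

/* round @off down to the start of its logical cluster of size 1 << shift */
static unsigned long long clust_start(unsigned long long off, int shift)
{
	return off & ~((1ULL << shift) - 1);
}

int main(void)
{
	/* 64KiB clusters (shift 16): offset 70000 keys at cluster start 65536 */
	printf("%llu\n", clust_start(70000, 16));
	return 0;
}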
31965+
31966+/* plugin->flow_by_inode */
31967+int
31968+flow_by_inode_cryptcompress(struct inode *inode /* file to build flow for */ ,
31969+ const char __user *buf /* user level buffer */ ,
31970+ int user /* 1 if @buf is of user space, 0 - if it is
31971+ kernel space */ ,
31972+ loff_t size /* buffer size */ ,
31973+ loff_t off /* offset to start io from */ ,
31974+ rw_op op /* READ or WRITE */ ,
31975+ flow_t * f /* resulting flow */ )
31976+{
31977+ assert("edward-436", f != NULL);
31978+ assert("edward-149", inode != NULL);
31979+ assert("edward-150", inode_file_plugin(inode) != NULL);
31980+ assert("edward-151",
31981+ inode_file_plugin(inode)->key_by_inode ==
31982+ key_by_inode_cryptcompress);
31983+
31984+ f->length = size;
31985+ memcpy(&f->data, &buf, sizeof(buf));
31986+ f->user = user;
31987+ f->op = op;
31988+
31989+ if (op == WRITE_OP && user == 1)
31990+ return 0;
31991+ return key_by_inode_cryptcompress(inode, off, &f->key);
31992+}
31993+
31994+static int
31995+crc_hint_validate(hint_t * hint, const reiser4_key * key,
31996+ znode_lock_mode lock_mode)
31997+{
31998+ coord_t *coord;
31999+
32000+ assert("edward-704", hint != NULL);
32001+ assert("edward-1089", !hint->ext_coord.valid);
32002+ assert("edward-706", hint->lh.owner == NULL);
32003+
32004+ coord = &hint->ext_coord.coord;
32005+
32006+ if (!hint || !hint_is_set(hint) || hint->mode != lock_mode)
32007+ /* hint either not set or set by different operation */
32008+ return RETERR(-E_REPEAT);
32009+
32010+ if (get_key_offset(key) != hint->offset)
32011+ /* hint is set for different key */
32012+ return RETERR(-E_REPEAT);
32013+
32014+ assert("edward-707", schedulable());
32015+
32016+ return seal_validate(&hint->seal, &hint->ext_coord.coord,
32017+ key, &hint->lh, lock_mode, ZNODE_LOCK_LOPRI);
32018+}
32019+
32020+/* reserve disk space when writing a logical cluster */
32021+static int reserve4cluster(struct inode *inode, reiser4_cluster_t *clust)
32022+{
32023+ int result = 0;
32024+
32025+ assert("edward-965", schedulable());
32026+ assert("edward-439", inode != NULL);
32027+ assert("edward-440", clust != NULL);
32028+ assert("edward-441", clust->pages != NULL);
32029+ assert("edward-1261", get_current_context()->grabbed_blocks == 0);
32030+
32031+ if (clust->nr_pages == 0) {
32032+ assert("edward-1152", clust->win != NULL);
32033+ assert("edward-1153", clust->win->stat == HOLE_WINDOW);
32034+		/* don't reserve space for a fake disk cluster */
32035+ return 0;
32036+ }
32037+ assert("edward-442", jprivate(clust->pages[0]) != NULL);
32038+
32039+ result = reiser4_grab_space_force(estimate_insert_cluster(inode) +
32040+ estimate_update_cluster(inode),
32041+ BA_CAN_COMMIT);
32042+ if (result)
32043+ return result;
32044+ clust->reserved = 1;
32045+ grabbed2cluster_reserved(estimate_insert_cluster(inode) +
32046+ estimate_update_cluster(inode));
32047+#if REISER4_DEBUG
32048+ clust->reserved_prepped = estimate_update_cluster(inode);
32049+ clust->reserved_unprepped = estimate_insert_cluster(inode);
32050+#endif
32051+ /* there can be space grabbed by txnmgr_force_commit_all */
32052+ all_grabbed2free();
32053+ return 0;
32054+}
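+/* Note on the arithmetic above: the grab covers the worst case of
+   both inserting a new (unprepped) disk cluster and updating an
+   existing one, i.e. estimate_insert_cluster() +
+   estimate_update_cluster() blocks; whatever part turns out to be
+   unneeded is returned later (see free_reserved4cluster()). */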
32055+
32056+/* free reserved disk space if writing a logical cluster fails */
32057+static void
32058+free_reserved4cluster(struct inode *inode, reiser4_cluster_t * clust, int count)
32059+{
32060+ assert("edward-967", clust->reserved == 1);
32061+
32062+ cluster_reserved2free(count);
32063+ clust->reserved = 0;
32064+}
32065+
32066+/* The core search procedure of the cryptcompress plugin.
32067+   If the returned value is not cbk_errored, then the current znode is locked */
32068+static int find_cluster_item(hint_t * hint,
32069+ const reiser4_key * key, /* key of the item we are
32070+ looking for */
32071+ znode_lock_mode lock_mode /* which lock */ ,
32072+ ra_info_t * ra_info, lookup_bias bias, __u32 flags)
32073+{
32074+ int result;
32075+ reiser4_key ikey;
32076+ coord_t *coord = &hint->ext_coord.coord;
32077+ coord_t orig = *coord;
32078+
32079+ assert("edward-152", hint != NULL);
32080+
32081+ if (hint->ext_coord.valid == 0) {
32082+ result = crc_hint_validate(hint, key, lock_mode);
32083+ if (result == -E_REPEAT)
32084+ goto traverse_tree;
32085+ else if (result) {
32086+ assert("edward-1216", 0);
32087+ return result;
32088+ }
32089+ hint->ext_coord.valid = 1;
32090+ }
32091+ assert("edward-709", znode_is_any_locked(coord->node));
32092+
32093+	/* An in-place lookup is going on here: we just need to
32094+	   check whether the item next to @coord matches the @key */
32095+
32096+ if (equal_to_rdk(coord->node, key)) {
32097+ result = goto_right_neighbor(coord, &hint->lh);
32098+ if (result == -E_NO_NEIGHBOR) {
32099+ assert("edward-1217", 0);
32100+ return RETERR(-EIO);
32101+ }
32102+ if (result)
32103+ return result;
32104+ assert("edward-1218", equal_to_ldk(coord->node, key));
32105+ } else {
32106+ coord->item_pos++;
32107+ coord->unit_pos = 0;
32108+ coord->between = AT_UNIT;
32109+ }
32110+ result = zload(coord->node);
32111+ if (result)
32112+ return result;
32113+ assert("edward-1219", !node_is_empty(coord->node));
32114+
32115+ if (!coord_is_existing_item(coord)) {
32116+ zrelse(coord->node);
32117+ goto not_found;
32118+ }
32119+ item_key_by_coord(coord, &ikey);
32120+ zrelse(coord->node);
32121+ if (!keyeq(key, &ikey))
32122+ goto not_found;
32123+ return CBK_COORD_FOUND;
32124+
32125+ not_found:
32126+ assert("edward-1220", coord->item_pos > 0);
32127+ //coord->item_pos--;
32128+ /* roll back */
32129+ *coord = orig;
32130+ ON_DEBUG(coord_update_v(coord));
32131+ return CBK_COORD_NOTFOUND;
32132+
32133+ traverse_tree:
32134+ assert("edward-713", hint->lh.owner == NULL);
32135+ assert("edward-714", schedulable());
32136+
32137+ unset_hint(hint);
32138+ coord_init_zero(coord);
32139+ result = coord_by_key(current_tree, key, coord, &hint->lh,
32140+ lock_mode, bias, LEAF_LEVEL, LEAF_LEVEL,
32141+ CBK_UNIQUE | flags, ra_info);
32142+ if (cbk_errored(result))
32143+ return result;
32144+ hint->ext_coord.valid = 1;
32145+ return result;
32146+}
32147+
32148+/* This function is called by the deflate (inflate) manager when
32149+   creating a transformed (plain) stream to check whether we should
32150+   create (cut) some overhead. If this returns true, then @oh
32151+   contains the size of that overhead.
32152+ */
32153+static int
32154+need_cut_or_align(struct inode * inode, reiser4_cluster_t * clust,
32155+ rw_op rw, int * oh)
32156+{
32157+ tfm_cluster_t * tc = &clust->tc;
32158+ switch (rw) {
32159+ case WRITE_OP: /* estimate align */
32160+ *oh = tc->len % cipher_blocksize(inode);
32161+ if (*oh != 0)
32162+ return 1;
32163+ break;
32164+ case READ_OP: /* estimate cut */
32165+ *oh = *(tfm_output_data(clust) + tc->len - 1);
32166+ break;
32167+ default:
32168+ impossible("edward-1401", "bad option");
32169+ }
32170+ return (tc->len != tc->lsize);
32171+}
32172+
32173+/* create/cut an overhead of transformed/plain stream */
32174+static void
32175+align_or_cut_overhead(struct inode * inode, reiser4_cluster_t * clust, rw_op rw)
32176+{
32177+ int oh;
32178+ cipher_plugin * cplug = inode_cipher_plugin(inode);
32179+
32180+ assert("edward-1402", need_cipher(inode));
32181+
32182+ if (!need_cut_or_align(inode, clust, rw, &oh))
32183+ return;
32184+ switch (rw) {
32185+ case WRITE_OP: /* do align */
32186+ clust->tc.len +=
32187+ cplug->align_stream(tfm_input_data(clust) +
32188+ clust->tc.len, clust->tc.len,
32189+ cipher_blocksize(inode));
32190+ *(tfm_input_data(clust) + clust->tc.len - 1) =
32191+ cipher_blocksize(inode) - oh;
32192+ break;
32193+ case READ_OP: /* do cut */
32194+ assert("edward-1403", oh <= cipher_blocksize(inode));
32195+ clust->tc.len -= oh;
32196+ break;
32197+ default:
32198+ impossible("edward-1404", "bad option");
32199+ }
32200+ return;
32201+}
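+/* Worked example (assuming cipher_blocksize(inode) == 16):
+   on WRITE, tc->len == 41 gives oh == 41 % 16 == 9; the stream is
+   padded up to 48 bytes and the last (control) byte is set to
+   16 - 9 == 7. On READ, oh is taken from that control byte (7),
+   and the cut restores tc->len to 48 - 7 == 41. */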
32202+
32203+/* the following helpers are used to evaluate the results
32204+   of a compression transform */
32205+static unsigned
32206+max_cipher_overhead(struct inode * inode)
32207+{
32208+ if (!need_cipher(inode) || !inode_cipher_plugin(inode)->align_stream)
32209+ return 0;
32210+ return cipher_blocksize(inode);
32211+}
32212+
32213+static int deflate_overhead(struct inode *inode)
32214+{
32215+ return (inode_compression_plugin(inode)->
32216+ checksum ? DC_CHECKSUM_SIZE : 0);
32217+}
32218+
32219+static unsigned deflate_overrun(struct inode * inode, int ilen)
32220+{
32221+ return coa_overrun(inode_compression_plugin(inode), ilen);
32222+}
32223+
32224+/* Estimate compressibility of a logical cluster by the various
32225+   policies represented by the compression mode plugin.
32226+   If this returns false, then the compressor won't be called for
32227+   the cluster of index @index.
32228+*/
32229+static int should_compress(tfm_cluster_t * tc, cloff_t index,
32230+ struct inode *inode)
32231+{
32232+ compression_plugin *cplug = inode_compression_plugin(inode);
32233+ compression_mode_plugin *mplug = inode_compression_mode_plugin(inode);
32234+
32235+ assert("edward-1321", tc->len != 0);
32236+ assert("edward-1322", cplug != NULL);
32237+ assert("edward-1323", mplug != NULL);
32238+
32239+ return /* estimate by size */
32240+ (cplug->min_size_deflate ?
32241+ tc->len >= cplug->min_size_deflate() :
32242+ 1) &&
32243+ /* estimate by compression mode plugin */
32244+ (mplug->should_deflate ?
32245+ mplug->should_deflate(inode, index) :
32246+ 1);
32247+}
32248+
32249+/* Evaluate the results of a compression transform.
32250+   Returns true if we need to accept these results */
32251+static int
32252+save_compressed(int size_before, int size_after, struct inode * inode)
32253+{
32254+ return (size_after + deflate_overhead(inode) +
32255+ max_cipher_overhead(inode) < size_before);
32256+}
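+/* Example: a 65536-byte logical cluster that compresses to 65500
+   bytes is accepted only if 65500 + 4 (checksum) + cipher overhead
+   (assuming a 16-byte cipher block, at most 16) < 65536, which
+   holds here: 65520 < 65536. */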
32257+
32258+/* Guess result of the evaluation above */
32259+static int
32260+need_inflate(reiser4_cluster_t * clust, struct inode *inode,
32261+ int encrypted /* is cluster encrypted */ )
32262+{
32263+ tfm_cluster_t *tc = &clust->tc;
32264+
32265+	assert("edward-142", tc != NULL);
32266+ assert("edward-143", inode != NULL);
32267+
32268+ return tc->len <
32269+ (encrypted ?
32270+ inode_scaled_offset(inode, tc->lsize) :
32271+ tc->lsize);
32272+}
32273+
32274+/* If results of compression were accepted, then we add
32275+ a checksum to catch possible disk cluster corruption.
32276+ The following is a format of the data stored in disk clusters:
32277+
32278+ data This is (transformed) logical cluster.
32279+ cipher_overhead This is created by ->align() method
32280+ of cipher plugin. May be absent.
32281+ checksum (4) This is created by ->checksum method
32282+ of compression plugin to check
32283+ integrity. May be absent.
32284+
32285+ Crypto overhead format:
32286+
32287+ data
32288+ control_byte (1) contains aligned overhead size:
32289+ 1 <= overhead <= cipher_blksize
32290+*/
32291+/* Append a checksum at the end of a transformed stream */
32292+static void dc_set_checksum(compression_plugin * cplug, tfm_cluster_t * tc)
32293+{
32294+ __u32 checksum;
32295+
32296+ assert("edward-1309", tc != NULL);
32297+ assert("edward-1310", tc->len > 0);
32298+ assert("edward-1311", cplug->checksum != NULL);
32299+
32300+ checksum = cplug->checksum(tfm_stream_data(tc, OUTPUT_STREAM), tc->len);
32301+ put_unaligned(cpu_to_le32(checksum),
32302+ (d32 *)(tfm_stream_data(tc, OUTPUT_STREAM) + tc->len));
32303+ tc->len += (int)DC_CHECKSUM_SIZE;
32304+}
32305+
32306+/* Check a disk cluster checksum.
32307+ Returns 0 if checksum is correct, otherwise returns 1 */
32308+static int dc_check_checksum(compression_plugin * cplug, tfm_cluster_t * tc)
32309+{
32310+ assert("edward-1312", tc != NULL);
32311+ assert("edward-1313", tc->len > (int)DC_CHECKSUM_SIZE);
32312+ assert("edward-1314", cplug->checksum != NULL);
32313+
32314+ if (cplug->checksum(tfm_stream_data(tc, INPUT_STREAM),
32315+ tc->len - (int)DC_CHECKSUM_SIZE) !=
32316+ le32_to_cpu(get_unaligned((d32 *)
32317+ (tfm_stream_data(tc, INPUT_STREAM)
32318+ + tc->len - (int)DC_CHECKSUM_SIZE)))) {
32319+ warning("edward-156",
32320+			"Bad disk cluster checksum %d (should be %d). Fsck?\n",
32321+ (int)le32_to_cpu
32322+ (get_unaligned((d32 *)
32323+ (tfm_stream_data(tc, INPUT_STREAM) +
32324+ tc->len - (int)DC_CHECKSUM_SIZE))),
32325+ (int)cplug->checksum
32326+ (tfm_stream_data(tc, INPUT_STREAM),
32327+ tc->len - (int)DC_CHECKSUM_SIZE));
32328+ return 1;
32329+ }
32330+ tc->len -= (int)DC_CHECKSUM_SIZE;
32331+ return 0;
32332+}
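+/* On-disk layout recap (see the format comment above): the 32-bit
+   checksum is stored little-endian in the last DC_CHECKSUM_SIZE (4)
+   bytes of the transformed stream; dc_set_checksum() appends it and
+   grows tc->len, dc_check_checksum() verifies and shrinks tc->len
+   back, so callers never see the checksum bytes. */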
32333+
32334+/* get input/output stream for some transform action */
32335+int grab_tfm_stream(struct inode * inode, tfm_cluster_t * tc,
32336+ tfm_stream_id id)
32337+{
32338+ size_t size = inode_scaled_cluster_size(inode);
32339+
32340+ assert("edward-901", tc != NULL);
32341+ assert("edward-1027", inode_compression_plugin(inode) != NULL);
32342+
32343+ if (tc->act == TFM_WRITE_ACT)
32344+ size += deflate_overrun(inode, inode_cluster_size(inode));
32345+
32346+ if (!tfm_stream(tc, id) && id == INPUT_STREAM)
32347+ alternate_streams(tc);
32348+ if (!tfm_stream(tc, id))
32349+ return alloc_tfm_stream(tc, size, id);
32350+
32351+ assert("edward-902", tfm_stream_is_set(tc, id));
32352+
32353+ if (tfm_stream_size(tc, id) < size)
32354+ return realloc_tfm_stream(tc, size, id);
32355+ return 0;
32356+}
32357+
32358+/* Common deflate manager */
32359+int deflate_cluster(reiser4_cluster_t * clust, struct inode * inode)
32360+{
32361+ int result = 0;
32362+ int compressed = 0;
32363+ int encrypted = 0;
32364+ tfm_cluster_t * tc = &clust->tc;
32365+ compression_plugin * coplug;
32366+
32367+ assert("edward-401", inode != NULL);
32368+ assert("edward-903", tfm_stream_is_set(tc, INPUT_STREAM));
32369+ assert("edward-1348", tc->act == TFM_WRITE_ACT);
32370+ assert("edward-498", !tfm_cluster_is_uptodate(tc));
32371+
32372+ coplug = inode_compression_plugin(inode);
32373+ if (should_compress(tc, clust->index, inode)) {
32374+ /* try to compress, discard bad results */
32375+ __u32 dst_len;
32376+ compression_mode_plugin * mplug =
32377+ inode_compression_mode_plugin(inode);
32378+ assert("edward-602", coplug != NULL);
32379+ assert("edward-1423", coplug->compress != NULL);
32380+
32381+ result = grab_coa(tc, coplug);
32382+ if (result) {
32383+ warning("edward-1424",
32384+				"alloc_coa failed with ret=%d, skipping compression",
32385+ result);
32386+ goto cipher;
32387+ }
32388+ result = grab_tfm_stream(inode, tc, OUTPUT_STREAM);
32389+ if (result) {
32390+ warning("edward-1425",
32391+				"alloc stream failed with ret=%d, skipping compression",
32392+ result);
32393+ goto cipher;
32394+ }
32395+ dst_len = tfm_stream_size(tc, OUTPUT_STREAM);
32396+ coplug->compress(get_coa(tc, coplug->h.id, tc->act),
32397+ tfm_input_data(clust), tc->len,
32398+ tfm_output_data(clust), &dst_len);
32399+ /* make sure we didn't overwrite extra bytes */
32400+ assert("edward-603",
32401+ dst_len <= tfm_stream_size(tc, OUTPUT_STREAM));
32402+
32403+ /* evaluate results of compression transform */
32404+ if (save_compressed(tc->len, dst_len, inode)) {
32405+ /* good result, accept */
32406+ tc->len = dst_len;
32407+ if (mplug->accept_hook != NULL) {
32408+ result = mplug->accept_hook(inode, clust->index);
32409+ if (result)
32410+ warning("edward-1426",
32411+ "accept_hook failed with ret=%d",
32412+ result);
32413+ }
32414+ compressed = 1;
32415+		} else {
32417+ /* bad result, discard */
32418+#if REISER4_DEBUG
32419+ if (cluster_is_complete(clust, inode))
32420+ warning("edward-1338",
32421+ "incompressible cluster %lu (inode %llu)",
32422+ clust->index,
32423+ (unsigned long long)get_inode_oid(inode));
32424+#endif
32425+ if (mplug->discard_hook != NULL &&
32426+ cluster_is_complete(clust, inode)) {
32427+ result = mplug->discard_hook(inode,
32428+ clust->index);
32429+ if (result)
32430+ warning("edward-1427",
32431+ "discard_hook failed with ret=%d",
32432+ result);
32433+ }
32434+ }
32435+ }
32436+ cipher:
32437+ if (need_cipher(inode)) {
32438+ cipher_plugin * ciplug;
32439+ struct crypto_tfm * tfm;
32440+ struct scatterlist src;
32441+ struct scatterlist dst;
32442+
32443+ ciplug = inode_cipher_plugin(inode);
32444+ tfm = info_cipher_tfm(inode_crypto_stat(inode));
32445+ if (compressed)
32446+ alternate_streams(tc);
32447+ result = grab_tfm_stream(inode, tc, OUTPUT_STREAM);
32448+ if (result)
32449+ return result;
32450+
32451+ align_or_cut_overhead(inode, clust, WRITE_OP);
32452+ src.page = virt_to_page(tfm_input_data(clust));
32453+ src.offset = offset_in_page(tfm_input_data(clust));
32454+ src.length = tc->len;
32455+
32456+ dst.page = virt_to_page(tfm_output_data(clust));
32457+ dst.offset = offset_in_page(tfm_output_data(clust));
32458+ dst.length = tc->len;
32459+
32460+ result = crypto_cipher_encrypt(tfm, &dst, &src, tc->len);
32461+ if (result) {
32462+ warning("edward-1405",
32463+ "encryption failed flags=%x\n", tfm->crt_flags);
32464+ return result;
32465+ }
32466+ encrypted = 1;
32467+ }
32468+ if (compressed && coplug->checksum != NULL)
32469+ dc_set_checksum(coplug, tc);
32470+ if (!compressed && !encrypted)
32471+ alternate_streams(tc);
32472+ return result;
32473+}
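+/* A sketch of the write-side pipeline above (descriptive only):
+
+	logical cluster --(compress, if the result is accepted)-->
+	                --(align + encrypt, if the file is ciphered)-->
+	                --(append checksum, if compressed)--> disk cluster
+
+   alternate_streams() swaps the input/output stream roles between
+   stages, and is also called when nothing was transformed, so the
+   caller always finds the final data in the OUTPUT_STREAM. */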
32474+
32475+/* Common inflate manager. */
32476+int inflate_cluster(reiser4_cluster_t * clust, struct inode * inode)
32477+{
32478+ int result = 0;
32479+ int transformed = 0;
32480+ tfm_cluster_t * tc = &clust->tc;
32481+ compression_plugin * coplug;
32482+
32483+ assert("edward-905", inode != NULL);
32484+ assert("edward-1178", clust->dstat == PREP_DISK_CLUSTER);
32485+ assert("edward-906", tfm_stream_is_set(&clust->tc, INPUT_STREAM));
32486+ assert("edward-1349", tc->act == TFM_READ_ACT);
32487+ assert("edward-907", !tfm_cluster_is_uptodate(tc));
32488+
32489+ /* Handle a checksum (if any) */
32490+ coplug = inode_compression_plugin(inode);
32491+ if (need_inflate(clust, inode, need_cipher(inode)) &&
32492+ coplug->checksum != NULL) {
32493+ result = dc_check_checksum(coplug, tc);
32494+ if (result)
32495+ return RETERR(-EIO);
32496+ }
32497+ if (need_cipher(inode)) {
32498+ cipher_plugin * ciplug;
32499+ struct crypto_tfm * tfm;
32500+ struct scatterlist src;
32501+ struct scatterlist dst;
32502+
32503+ ciplug = inode_cipher_plugin(inode);
32504+ tfm = info_cipher_tfm(inode_crypto_stat(inode));
32505+ result = grab_tfm_stream(inode, tc, OUTPUT_STREAM);
32506+ if (result)
32507+ return result;
32508+ assert("edward-909", tfm_cluster_is_set(tc));
32509+
32510+ src.page = virt_to_page(tfm_input_data(clust));
32511+ src.offset = offset_in_page(tfm_input_data(clust));
32512+ src.length = tc->len;
32513+
32514+ dst.page = virt_to_page(tfm_output_data(clust));
32515+ dst.offset = offset_in_page(tfm_output_data(clust));
32516+ dst.length = tc->len;
32517+
32518+ result = crypto_cipher_decrypt(tfm, &dst, &src, tc->len);
32519+ if (result)
32520+ return result;
32521+ align_or_cut_overhead(inode, clust, READ_OP);
32522+ transformed = 1;
32523+ }
32524+ if (need_inflate(clust, inode, 0)) {
32525+ unsigned dst_len = inode_cluster_size(inode);
32526+		if (transformed)
32527+ alternate_streams(tc);
32528+
32529+ result = grab_tfm_stream(inode, tc, OUTPUT_STREAM);
32530+ if (result)
32531+ return result;
32532+ assert("edward-1305", coplug->decompress != NULL);
32533+ assert("edward-910", tfm_cluster_is_set(tc));
32534+
32535+ coplug->decompress(get_coa(tc, coplug->h.id, tc->act),
32536+ tfm_input_data(clust), tc->len,
32537+ tfm_output_data(clust), &dst_len);
32538+		/* set the final length and sanity-check it */
32539+ tc->len = dst_len;
32540+ assert("edward-157", dst_len == tc->lsize);
32541+ transformed = 1;
32542+ }
32543+ if (!transformed)
32544+ alternate_streams(tc);
32545+ return result;
32546+}
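+/* The read-side pipeline mirrors deflate_cluster(): the checksum is
+   checked (and stripped) first since it was appended last, then the
+   stream is decrypted and its cipher overhead cut, and finally it is
+   decompressed; alternate_streams() again keeps the current data in
+   the input role of each following stage. */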
32547+
32548+/* This is the implementation of the ->readpage() method of struct
32549+   address_space_operations for the cryptcompress plugin. */
32550+int readpage_cryptcompress(struct file *file, struct page *page)
32551+{
32552+ reiser4_context *ctx;
32553+ reiser4_cluster_t clust;
32554+ item_plugin *iplug;
32555+ int result;
32556+
32557+ assert("edward-88", PageLocked(page));
32558+ assert("vs-976", !PageUptodate(page));
32559+ assert("edward-89", page->mapping && page->mapping->host);
32560+
32561+ ctx = init_context(page->mapping->host->i_sb);
32562+ if (IS_ERR(ctx))
32563+ return PTR_ERR(ctx);
32564+ result = check_cryptcompress(page->mapping->host);
32565+ if (result) {
32566+ unlock_page(page);
32567+ reiser4_exit_context(ctx);
32568+ return result;
32569+ }
32570+ assert("edward-113",
32571+ ergo(file != NULL,
32572+ page->mapping == file->f_dentry->d_inode->i_mapping));
32573+
32574+ if (PageUptodate(page)) {
32575+ warning("edward-1338", "page is already uptodate\n");
32576+ reiser4_exit_context(ctx);
32577+ return 0;
32578+ }
32579+ cluster_init_read(&clust, NULL);
32580+ clust.file = file;
32581+ iplug = item_plugin_by_id(CTAIL_ID);
32582+ if (!iplug->s.file.readpage) {
32583+ unlock_page(page);
32584+ put_cluster_handle(&clust);
32585+ reiser4_exit_context(ctx);
32586+ return -EINVAL;
32587+ }
32588+ result = iplug->s.file.readpage(&clust, page);
32589+ if (result)
32590+ unlock_page(page);
32591+ assert("edward-64",
32592+ ergo(result == 0, (PageLocked(page) || PageUptodate(page))));
32593+ put_cluster_handle(&clust);
32594+ reiser4_exit_context(ctx);
32595+ return result;
32596+}
32597+
32598+/* how many pages will be captured */
32599+static int cluster_nrpages_to_capture(reiser4_cluster_t * clust)
32600+{
32601+ switch (clust->op) {
32602+ case PCL_APPEND:
32603+ return clust->nr_pages;
32604+ case PCL_TRUNCATE:
32605+ assert("edward-1179", clust->win != NULL);
32606+ return count_to_nrpages(clust->win->off + clust->win->count);
32607+ default:
32608+ impossible("edward-1180", "bad page cluster option");
32609+ return 0;
32610+ }
32611+}
32612+
32613+static void set_cluster_pages_dirty(reiser4_cluster_t * clust)
32614+{
32615+ int i;
32616+ struct page *pg;
32617+ int nrpages = cluster_nrpages_to_capture(clust);
32618+
32619+ for (i = 0; i < nrpages; i++) {
32620+
32621+ pg = clust->pages[i];
32622+ assert("edward-968", pg != NULL);
32623+ lock_page(pg);
32624+ assert("edward-1065", PageUptodate(pg));
32625+ set_page_dirty_internal(pg);
32626+ unlock_page(pg);
32627+ mark_page_accessed(pg);
32628+ }
32629+}
32630+
32631+static void clear_cluster_pages_dirty(reiser4_cluster_t * clust)
32632+{
32633+ int i;
32634+ assert("edward-1275", clust != NULL);
32635+
32636+ for (i = 0; i < clust->nr_pages; i++) {
32637+ assert("edward-1276", clust->pages[i] != NULL);
32638+
32639+ lock_page(clust->pages[i]);
32640+ if (PageDirty(clust->pages[i])) {
32641+ assert("edward-1277", PageUptodate(clust->pages[i]));
32642+ clear_page_dirty_for_io(clust->pages[i]);
32643+ }
32644+#if REISER4_DEBUG
32645+ else
32646+			/* Race between flush and write:
32647+			   some pages became clean while write() (or another
32648+			   process modifying the data) captured the cluster. */
32649+ warning("edward-985", "Page of index %lu (inode %llu)"
32650+ " is not dirty\n", clust->pages[i]->index,
32651+ (unsigned long long)get_inode_oid(clust->
32652+ pages[i]->
32653+ mapping->
32654+ host));
32655+#endif
32656+ unlock_page(clust->pages[i]);
32657+ }
32658+}
32659+
32660+/* update i_size by window */
32661+static void inode_set_new_size(reiser4_cluster_t * clust, struct inode *inode)
32662+{
32663+ loff_t size;
32664+ reiser4_slide_t *win;
32665+
32666+ assert("edward-1181", clust != NULL);
32667+ assert("edward-1182", inode != NULL);
32668+
32669+ win = clust->win;
32670+ assert("edward-1183", win != NULL);
32671+
32672+ size = clust_to_off(clust->index, inode) + win->off;
32673+
32674+ switch (clust->op) {
32675+ case PCL_APPEND:
32676+ if (size + win->count <= inode->i_size)
32677+ /* overwrite only */
32678+ return;
32679+ size += win->count;
32680+ break;
32681+ case PCL_TRUNCATE:
32682+ break;
32683+ default:
32684+ impossible("edward-1184", "bad page cluster option");
32685+ break;
32686+ }
32687+ inode_check_scale_nolock(inode, inode->i_size, size);
32688+ inode->i_size = size;
32689+ return;
32690+}
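+/* Example (assuming a 64KiB logical cluster): appending win->count
+   == 100 bytes at win->off == 500 of cluster 2 gives
+   size = clust_to_off(2, inode) + 500 + 100 == 131672; i_size is
+   updated to that value unless the write fits entirely below the
+   current i_size (the overwrite-only case above). */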
32691+
32692+/* Check in page cluster modifications:
32693+   . make the jnode dirty, if it wasn't;
32694+   . reserve space for a disk cluster update by the flush algorithm, if needed;
32695+   . clean up old references (if any);
32696+   . put pages (grabbed in this thread) which will be truncated.
32697+*/
32698+static void
32699+make_cluster_jnode_dirty_locked(reiser4_cluster_t * clust, jnode * node,
32700+ loff_t * old_isize, struct inode *inode)
32701+{
32702+ int i;
32703+ int old_nrpages;
32704+ int new_nrpages = cluster_nrpages_to_capture(clust);
32705+
32706+ assert("edward-973", new_nrpages > 0);
32707+ assert("edward-221", node != NULL);
32708+ assert("edward-971", clust->reserved == 1);
32709+ assert_spin_locked(&(node->guard));
32710+ assert("edward-972", node->page_count < cluster_nrpages(inode));
32711+ assert("edward-1263",
32712+ clust->reserved_prepped == estimate_update_cluster(inode));
32713+ assert("edward-1264", clust->reserved_unprepped == 0);
32714+
32715+ if (JF_ISSET(node, JNODE_DIRTY)) {
32716+ /* someone has modified this cluster, but
32717+ the modifications are not committed yet */
32718+ old_nrpages =
32719+ count_to_nrpages(cnt_to_clcnt(*old_isize,
32720+ clust->index, inode));
32721+ /* free space which is already reserved */
32722+ free_reserved4cluster(inode, clust,
32723+ estimate_update_cluster(inode));
32724+ /* put old references */
32725+ for (i = 0; i < old_nrpages; i++) {
32726+ assert("edward-975", clust->pages[i]);
32727+ assert("edward-1185", PageUptodate(clust->pages[i]));
32728+
32729+ page_cache_release(clust->pages[i]);
32730+#if REISER4_DEBUG
32731+ cryptcompress_inode_data(inode)->pgcount --;
32732+#endif
32733+ }
32734+ } else {
32735+ /* no captured pages */
32736+ assert("edward-1043", node->page_count == 0);
32737+ jnode_make_dirty_locked(node);
32738+ clust->reserved = 0;
32739+ }
32740+ /* put pages that will be truncated (if any) */
32741+ for (i = new_nrpages; i < clust->nr_pages; i++) {
32742+ assert("edward-1433", clust->pages[i]);
32743+ assert("edward-1434", PageUptodate(clust->pages[i]));
32744+ page_cache_release(clust->pages[i]);
32745+#if REISER4_DEBUG
32746+ cryptcompress_inode_data(inode)->pgcount --;
32747+#endif
32748+ }
32749+#if REISER4_DEBUG
32750+ clust->reserved_prepped -= estimate_update_cluster(inode);
32751+ node->page_count = new_nrpages - 1;
32752+#endif
32753+ return;
32754+}
32755+
32756+/* This function spawns a transaction and is called by any thread
32757+   as the final step in a page cluster modification.
32758+*/
32759+static int try_capture_cluster(reiser4_cluster_t * clust, struct inode *inode)
32760+{
32761+ int result = 0;
32762+ loff_t old_size;
32763+ jnode *node;
32764+
32765+ assert("edward-1029", clust != NULL);
32766+ assert("edward-1030", clust->reserved == 1);
32767+ assert("edward-1031", clust->nr_pages != 0);
32768+ assert("edward-1032", clust->pages != NULL);
32769+ assert("edward-1033", clust->pages[0] != NULL);
32770+
32771+ node = jprivate(clust->pages[0]);
32772+
32773+ assert("edward-1035", node != NULL);
32774+
32775+ spin_lock_jnode(node);
32776+ old_size = inode->i_size;
32777+ if (clust->win)
32778+ inode_set_new_size(clust, inode);
32779+
32780+ result = try_capture(node, ZNODE_WRITE_LOCK, 0);
32781+ if (result)
32782+ goto exit;
32783+ make_cluster_jnode_dirty_locked(clust, node, &old_size, inode);
32784+ exit:
32785+ assert("edward-1034", !result);
32786+ spin_unlock_jnode(node);
32787+ jput(node);
32788+ return result;
32789+}
32790+
32791+/* Collect unlocked cluster pages for any modifications and attach a jnode.
32792+   We allocate only one jnode per cluster; it is bound to the first
32793+   page of the cluster, so we have an extra reference that will live
32794+   as long as the jnode does; the other references are cleaned up at flush time.
32795+*/
32796+static int
32797+grab_cluster_pages_jnode(struct inode *inode, reiser4_cluster_t * clust)
32798+{
32799+ int i;
32800+ int result = 0;
32801+ jnode *node = NULL;
32802+
32803+ assert("edward-182", clust != NULL);
32804+ assert("edward-183", clust->pages != NULL);
32805+ assert("edward-184", clust->nr_pages <= cluster_nrpages(inode));
32806+
32807+ if (clust->nr_pages == 0)
32808+ return 0;
32809+
32810+ for (i = 0; i < clust->nr_pages; i++) {
32811+
32812+ assert("edward-1044", clust->pages[i] == NULL);
32813+
32814+ clust->pages[i] =
32815+ grab_cache_page(inode->i_mapping,
32816+ clust_to_pg(clust->index, inode) + i);
32817+ if (!clust->pages[i]) {
32818+ result = RETERR(-ENOMEM);
32819+ break;
32820+ }
32821+ if (i == 0) {
32822+ node = jnode_of_page(clust->pages[i]);
32823+ if (IS_ERR(node)) {
32824+ result = PTR_ERR(node);
32825+ unlock_page(clust->pages[i]);
32826+ break;
32827+ }
32828+ JF_SET(node, JNODE_CLUSTER_PAGE);
32829+ unlock_page(clust->pages[i]);
32830+ assert("edward-919", node);
32831+ continue;
32832+ }
32833+ unlock_page(clust->pages[i]);
32834+ }
32835+ if (result) {
32836+ while (i)
32837+ page_cache_release(clust->pages[--i]);
32838+ if (node && !IS_ERR(node))
32839+ jput(node);
32840+ return result;
32841+ }
32842+ assert("edward-920", jprivate(clust->pages[0]));
32843+#if REISER4_DEBUG
32844+ cryptcompress_inode_data(inode)->pgcount += clust->nr_pages;
32845+#endif
32846+ return 0;
32847+}
32848+
32849+/* Collect unlocked cluster pages only for read (not to modify) */
32850+static int grab_cluster_pages(struct inode *inode, reiser4_cluster_t * clust)
32851+{
32852+ int i;
32853+ int result = 0;
32854+
32855+ assert("edward-1428", inode != NULL);
32856+ assert("edward-1429", inode->i_mapping != NULL);
32857+ assert("edward-787", clust != NULL);
32858+ assert("edward-788", clust->pages != NULL);
32859+ assert("edward-789", clust->nr_pages != 0);
32860+ assert("edward-790", clust->nr_pages <= cluster_nrpages(inode));
32861+
32862+ for (i = 0; i < clust->nr_pages; i++) {
32863+ clust->pages[i] =
32864+ grab_cache_page(inode->i_mapping,
32865+ clust_to_pg(clust->index, inode) + i);
32866+ if (!clust->pages[i]) {
32867+ result = RETERR(-ENOMEM);
32868+ break;
32869+ }
32870+ unlock_page(clust->pages[i]);
32871+ }
32872+ if (result)
32873+ while (i)
32874+ page_cache_release(clust->pages[--i]);
32875+ return result;
32876+}
32877+
32878+/* @node might be attached by reiser4_writepage(), not by
32879+ cryptcompress plugin code, but emergency flush should
32880+ understand that pages of cryptcompress files are not
32881+ flushable.
32882+*/
32883+int jnode_of_cluster(const jnode * node, struct page * page)
32884+{
32885+ assert("edward-1339", node != NULL);
32886+ assert("edward-1340", page != NULL);
32887+ assert("edward-1341", page->mapping != NULL);
32888+ assert("edward-1342", page->mapping->host != NULL);
32889+ assert("edward-1343",
32890+ ergo(jnode_is_unformatted(node),
32891+ get_inode_oid(page->mapping->host) ==
32892+ node->key.j.objectid));
32893+ if (inode_file_plugin(page->mapping->host) ==
32894+ file_plugin_by_id(CRC_FILE_PLUGIN_ID)) {
32895+#if REISER4_DEBUG
32896+ if (!jnode_is_cluster_page(node))
32897+ warning("edward-1345",
32898+ "inode %llu: cluster page of index %lu became private",
32899+ (unsigned long long)get_inode_oid(page->mapping->host),
32900+ page->index);
32901+#endif
32902+ return 1;
32903+ }
32904+ return 0;
32905+}
32906+
32907+/* put cluster pages */
32908+void release_cluster_pages(reiser4_cluster_t * clust)
32909+{
32910+ int i;
32911+
32912+ assert("edward-447", clust != NULL);
32913+ for (i = 0; i < clust->nr_pages; i++) {
32914+
32915+ assert("edward-449", clust->pages[i] != NULL);
32916+
32917+ page_cache_release(clust->pages[i]);
32918+ }
32919+}
32920+
32921+/* this is called when something has failed */
32922+static void release_cluster_pages_and_jnode(reiser4_cluster_t * clust)
32923+{
32924+ jnode *node;
32925+
32926+ assert("edward-445", clust != NULL);
32927+ assert("edward-922", clust->pages != NULL);
32928+ assert("edward-446", clust->pages[0] != NULL);
32929+
32930+ node = jprivate(clust->pages[0]);
32931+
32932+ assert("edward-447", node != NULL);
32933+
32934+ release_cluster_pages(clust);
32935+ jput(node);
32936+}
32937+
32938+#if REISER4_DEBUG
32939+static int window_ok(reiser4_slide_t * win, struct inode *inode)
32940+{
32941+ assert("edward-1115", win != NULL);
32942+ assert("edward-1116", ergo(win->delta, win->stat == HOLE_WINDOW));
32943+
32944+ return (win->off != inode_cluster_size(inode)) &&
32945+ (win->off + win->count + win->delta <= inode_cluster_size(inode));
32946+}
32947+
32948+static int cluster_ok(reiser4_cluster_t * clust, struct inode *inode)
32949+{
32950+ assert("edward-279", clust != NULL);
32951+
32952+ if (!clust->pages)
32953+ return 0;
32954+ return (clust->win ? window_ok(clust->win, inode) : 1);
32955+}
32956+#endif
32957+
32958+/* guess next window stat */
32959+static inline window_stat next_window_stat(reiser4_slide_t * win)
32960+{
32961+ assert("edward-1130", win != NULL);
32962+ return ((win->stat == HOLE_WINDOW && win->delta == 0) ?
32963+ HOLE_WINDOW : DATA_WINDOW);
32964+}
32965+
32966+/* guess next cluster index and window params */
32967+static void
32968+update_cluster(struct inode *inode, reiser4_cluster_t * clust, loff_t file_off,
32969+ loff_t to_file)
32970+{
32971+ reiser4_slide_t *win;
32972+
32973+ assert("edward-185", clust != NULL);
32974+ assert("edward-438", clust->pages != NULL);
32975+ assert("edward-281", cluster_ok(clust, inode));
32976+
32977+ win = clust->win;
32978+ if (!win)
32979+ return;
32980+
32981+ switch (win->stat) {
32982+ case DATA_WINDOW:
32983+ /* increment window position */
32984+ clust->index++;
32985+ win->stat = DATA_WINDOW;
32986+ win->off = 0;
32987+ win->count = min_count(inode_cluster_size(inode), to_file);
32988+ break;
32989+ case HOLE_WINDOW:
32990+ switch (next_window_stat(win)) {
32991+ case HOLE_WINDOW:
32992+ /* set window to fit the offset we start write from */
32993+ clust->index = off_to_clust(file_off, inode);
32994+ win->stat = HOLE_WINDOW;
32995+ win->off = 0;
32996+ win->count = off_to_cloff(file_off, inode);
32997+ win->delta =
32998+ min_count(inode_cluster_size(inode) - win->count,
32999+ to_file);
33000+ break;
33001+ case DATA_WINDOW:
33002+			/* do not move the window, just change its state
33003+			   (off + count + delta stays invariant) */
33004+ win->stat = DATA_WINDOW;
33005+ win->off = win->off + win->count;
33006+ win->count = win->delta;
33007+ win->delta = 0;
33008+ break;
33009+ default:
33010+ impossible("edward-282", "wrong next window state");
33011+ }
33012+ break;
33013+ default:
33014+ impossible("edward-283", "wrong current window state");
33015+ }
33016+ assert("edward-1068", cluster_ok(clust, inode));
33017+}
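+/* Example of the window state machine above: writing into a hole
+   with win->delta != 0 (a hole followed by user data in the same
+   cluster) keeps the window in place and turns HOLE_WINDOW into
+   DATA_WINDOW, preserving off + count + delta; a pure hole
+   (delta == 0) instead jumps to the cluster of @file_off. */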
33018+
33019+static int update_sd_cryptcompress(struct inode *inode)
33020+{
33021+ int result = 0;
33022+
33023+ assert("edward-978", schedulable());
33024+ assert("edward-1265", get_current_context()->grabbed_blocks == 0);
33025+
33026+ result = reiser4_grab_space_force( /* one for stat data update */
33027+ estimate_update_common(inode),
33028+ BA_CAN_COMMIT);
33029+ assert("edward-979", !result);
33030+ if (result)
33031+ return result;
33032+ inode->i_ctime = inode->i_mtime = CURRENT_TIME;
33033+ result = reiser4_update_sd(inode);
33034+
33035+ all_grabbed2free();
33036+ return result;
33037+}
33038+
33039+
33040+/* NOTE-Edward: this is too similar to reiser4/txnmgr.c:uncapture_jnode() */
33041+static void uncapture_cluster_jnode(jnode * node)
33042+{
33043+ txn_atom *atom;
33044+
33045+ assert_spin_locked(&(node->guard));
33046+
33047+ /*jnode_make_clean(node); */
33048+ atom = jnode_get_atom(node);
33049+ if (atom == NULL) {
33050+ assert("jmacd-7111", !JF_ISSET(node, JNODE_DIRTY));
33051+ spin_unlock_jnode(node);
33052+ return;
33053+ }
33054+
33055+ uncapture_block(node);
33056+ spin_unlock_atom(atom);
33057+ jput(node);
33058+}
33059+
33060+void forget_cluster_pages(struct page **pages, int nr)
33061+{
33062+ int i;
33063+ for (i = 0; i < nr; i++) {
33064+
33065+ assert("edward-1045", pages[i] != NULL);
33066+ page_cache_release(pages[i]);
33067+ }
33068+}
33069+
33070+/* Check out the last modifications we are about to commit,
33071+   and prepare the input stream for transform operations.
33072+*/
33073+int
33074+flush_cluster_pages(reiser4_cluster_t * clust, jnode * node,
33075+ struct inode *inode)
33076+{
33077+ int result = 0;
33078+ int i;
33079+ int nr_pages = 0;
33080+ tfm_cluster_t *tc = &clust->tc;
33081+
33082+ assert("edward-980", node != NULL);
33083+ assert("edward-236", inode != NULL);
33084+ assert("edward-237", clust != NULL);
33085+ assert("edward-240", !clust->win);
33086+ assert("edward-241", schedulable());
33087+ assert("edward-718", crc_inode_ok(inode));
33088+
33089+ result = grab_tfm_stream(inode, tc, INPUT_STREAM);
33090+ if (result) {
33091+ warning("edward-1430",
33092+ "alloc stream failed with ret=%d", result);
33093+ return result;
33094+ }
33095+ spin_lock_jnode(node);
33096+ assert("edward-1435", JF_ISSET(node, JNODE_DIRTY));
33097+
33098+	/* Check out the size of the logical cluster and
33099+	   set the number of cluster pages to commit. */
33100+ tc->len = tc->lsize = fsize_to_count(clust, inode);
33101+ clust->nr_pages = count_to_nrpages(tc->len);
33102+
33103+ assert("edward-983", clust->nr_pages == node->page_count + 1);
33104+#if REISER4_DEBUG
33105+ node->page_count = 0;
33106+#endif
33107+ cluster_reserved2grabbed(estimate_update_cluster(inode));
33108+ uncapture_cluster_jnode(node);
33109+
33110+ assert("edward-1224", schedulable());
33111+ /* Check out cluster pages to commit */
33112+ nr_pages =
33113+ find_get_pages(inode->i_mapping, clust_to_pg(clust->index, inode),
33114+ clust->nr_pages, clust->pages);
33115+
33116+ assert("edward-1280", nr_pages == clust->nr_pages);
33117+ /* Construct input stream from the checked out pages */
33118+ for (i = 0; i < clust->nr_pages; i++) {
33119+ char *data;
33120+
33121+ assert("edward-242", clust->pages[i] != NULL);
33122+ assert("edward-1436", clust->pages[i]->index ==
33123+ clust_to_pg(clust->index, inode) + i);
33124+ assert("edward-1437", PageUptodate(clust->pages[i]));
33125+ /* flush the page into the input stream */
33126+ lock_page(clust->pages[i]);
33127+ data = kmap(clust->pages[i]);
33128+
33129+ assert("edward-986", cnt_to_pgcnt(tc->len, i) != 0);
33130+
33131+ memcpy(tfm_stream_data(tc, INPUT_STREAM) + pg_to_off(i),
33132+ data, cnt_to_pgcnt(tc->len, i));
33133+ kunmap(clust->pages[i]);
33134+ unlock_page(clust->pages[i]);
33135+ }
33136+ clear_cluster_pages_dirty(clust);
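+	/* the first release below drops the references acquired when
+	   the cluster pages were grabbed for modification (note the
+	   pgcount accounting); the second one drops the references
+	   taken by find_get_pages() above */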
33137+ release_cluster_pages(clust);
33138+#if REISER4_DEBUG
33139+ cryptcompress_inode_data(inode)->pgcount -= clust->nr_pages;
33140+#endif
33141+ /* put pages that were found here */
33142+ release_cluster_pages(clust);
33143+ return result;
33144+}
33145+
33146+/* set hint for the cluster of the index @index */
33147+static void set_hint_cluster(struct inode *inode, hint_t * hint,
33148+ cloff_t index, znode_lock_mode mode)
33149+{
33150+ reiser4_key key;
33151+ assert("edward-722", crc_inode_ok(inode));
33152+ assert("edward-723",
33153+ inode_file_plugin(inode) ==
33154+ file_plugin_by_id(CRC_FILE_PLUGIN_ID));
33155+
33156+ inode_file_plugin(inode)->key_by_inode(inode,
33157+ clust_to_off(index, inode),
33158+ &key);
33159+
33160+ seal_init(&hint->seal, &hint->ext_coord.coord, &key);
33161+ hint->offset = get_key_offset(&key);
33162+ hint->mode = mode;
33163+}
33164+
33165+void invalidate_hint_cluster(reiser4_cluster_t * clust)
33166+{
33167+ assert("edward-1291", clust != NULL);
33168+ assert("edward-1292", clust->hint != NULL);
33169+
33170+ done_lh(&clust->hint->lh);
33171+ clust->hint->ext_coord.valid = 0;
33172+}
33173+
33174+void put_hint_cluster(reiser4_cluster_t * clust, struct inode *inode,
33175+ znode_lock_mode mode)
33176+{
33177+ assert("edward-1286", clust != NULL);
33178+ assert("edward-1287", clust->hint != NULL);
33179+
33180+ set_hint_cluster(inode, clust->hint, clust->index + 1, mode);
33181+ invalidate_hint_cluster(clust);
33182+}
33183+
33184+static int
33185+balance_dirty_page_cluster(reiser4_cluster_t * clust, struct inode *inode,
33186+ loff_t off, loff_t to_file)
33187+{
33188+ int result;
33189+
33190+ assert("edward-724", inode != NULL);
33191+ assert("edward-725", crc_inode_ok(inode));
33192+ assert("edward-1272", get_current_context()->grabbed_blocks == 0);
33193+
33194+ /* set next window params */
33195+ update_cluster(inode, clust, off, to_file);
33196+
33197+ result = update_sd_cryptcompress(inode);
33198+ assert("edward-988", !result);
33199+ if (result)
33200+ return result;
33201+ assert("edward-726", clust->hint->lh.owner == NULL);
33202+
33203+ reiser4_throttle_write(inode);
33204+ all_grabbed2free();
33205+ return 0;
33206+}
33207+
33208+/* write zeroes to the cluster, update it and maybe try to capture its pages */
33209+static int
33210+write_hole(struct inode *inode, reiser4_cluster_t * clust, loff_t file_off,
33211+ loff_t to_file)
33212+{
33213+ char *data;
33214+ int result = 0;
33215+ unsigned cl_off, cl_count = 0;
33216+ unsigned to_pg, pg_off;
33217+ reiser4_slide_t *win;
33218+
33219+ assert("edward-190", clust != NULL);
33220+ assert("edward-1069", clust->win != NULL);
33221+ assert("edward-191", inode != NULL);
33222+ assert("edward-727", crc_inode_ok(inode));
33223+ assert("edward-1171", clust->dstat != INVAL_DISK_CLUSTER);
33224+ assert("edward-1154",
33225+ ergo(clust->dstat != FAKE_DISK_CLUSTER, clust->reserved == 1));
33226+
33227+ win = clust->win;
33228+
33229+ assert("edward-1070", win != NULL);
33230+ assert("edward-201", win->stat == HOLE_WINDOW);
33231+ assert("edward-192", cluster_ok(clust, inode));
33232+
33233+ if (win->off == 0 && win->count == inode_cluster_size(inode)) {
33234+ /* the hole will be represented by fake disk cluster */
33235+ update_cluster(inode, clust, file_off, to_file);
33236+ return 0;
33237+ }
33238+ cl_count = win->count; /* number of zeroes to write */
33239+ cl_off = win->off;
33240+ pg_off = off_to_pgoff(win->off);
33241+
33242+ while (cl_count) {
33243+ struct page *page;
33244+ page = clust->pages[off_to_pg(cl_off)];
33245+
33246+ assert("edward-284", page != NULL);
33247+
33248+ to_pg = min_count(PAGE_CACHE_SIZE - pg_off, cl_count);
33249+ lock_page(page);
33250+ data = kmap_atomic(page, KM_USER0);
33251+ memset(data + pg_off, 0, to_pg);
33252+ flush_dcache_page(page);
33253+ kunmap_atomic(data, KM_USER0);
33254+ SetPageUptodate(page);
33255+ unlock_page(page);
33256+
33257+ cl_off += to_pg;
33258+ cl_count -= to_pg;
33259+ pg_off = 0;
33260+ }
33261+ if (!win->delta) {
33262+ /* only zeroes, try to capture */
33263+
33264+ set_cluster_pages_dirty(clust);
33265+ result = try_capture_cluster(clust, inode);
33266+ if (result)
33267+ return result;
33268+ put_hint_cluster(clust, inode, ZNODE_WRITE_LOCK);
33269+ result =
33270+ balance_dirty_page_cluster(clust, inode, file_off, to_file);
33271+ } else
33272+ update_cluster(inode, clust, file_off, to_file);
33273+ return result;
33274+}
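+/* Worked example of the zeroing loop above (assuming 4KiB pages):
+   win->off == 1000, win->count == 5000 gives pg_off == 1000, so the
+   first memset() clears min(4096 - 1000, 5000) == 3096 bytes, and the
+   second clears the remaining 1904 bytes from offset 0 of the next
+   page. */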
33275+
33276+/*
33277+   The main disk search procedure for the cryptcompress plugin, which
33278+   . scans all items of a disk cluster
33279+ . maybe reads each one (if @read != 0)
33280+ . maybe makes its znode dirty (if @write != 0)
33281+
33282+ NOTE-EDWARD: Callers should handle the case when disk cluster
33283+ is incomplete (-EIO)
33284+*/
33285+int
33286+find_cluster(reiser4_cluster_t * clust,
33287+ struct inode *inode, int read, int write)
33288+{
33289+ flow_t f;
33290+ hint_t *hint;
33291+ int result = 0;
33292+ unsigned long cl_idx;
33293+ ra_info_t ra_info;
33294+ file_plugin *fplug;
33295+ item_plugin *iplug;
33296+ tfm_cluster_t *tc;
33297+ int was_grabbed;
33298+
33299+ assert("edward-138", clust != NULL);
33300+ assert("edward-728", clust->hint != NULL);
33301+ assert("edward-225", read || write);
33302+ assert("edward-226", schedulable());
33303+ assert("edward-137", inode != NULL);
33304+ assert("edward-729", crc_inode_ok(inode));
33305+
33306+ hint = clust->hint;
33307+ cl_idx = clust->index;
33308+ fplug = inode_file_plugin(inode);
33309+ was_grabbed = get_current_context()->grabbed_blocks;
33310+ tc = &clust->tc;
33311+
33312+ assert("edward-462", !tfm_cluster_is_uptodate(tc));
33313+ assert("edward-461", ergo(read, tfm_stream_is_set(tc, INPUT_STREAM)));
33314+
33315+ /* set key of the first disk cluster item */
33316+ fplug->flow_by_inode(inode,
33317+ (read ? (char __user *)tfm_stream_data(tc, INPUT_STREAM) : NULL),
33318+ 0 /* kernel space */ ,
33319+ inode_scaled_cluster_size(inode),
33320+ clust_to_off(cl_idx, inode), READ_OP, &f);
33321+ if (write) {
33322+ /* reserve for flush to make dirty all the leaf nodes
33323+ which contain disk cluster */
33324+ result =
33325+ reiser4_grab_space_force(estimate_dirty_cluster(inode),
33326+ BA_CAN_COMMIT);
33327+ assert("edward-990", !result);
33328+ if (result)
33329+ goto out;
33330+ }
33331+
33332+ ra_info.key_to_stop = f.key;
33333+ set_key_offset(&ra_info.key_to_stop, get_key_offset(max_key()));
33334+
33335+ while (f.length) {
33336+ result = find_cluster_item(hint,
33337+ &f.key,
33338+ (write ? ZNODE_WRITE_LOCK :
33339+ ZNODE_READ_LOCK), NULL, FIND_EXACT,
33340+ (write ? CBK_FOR_INSERT : 0));
33341+ switch (result) {
33342+ case CBK_COORD_NOTFOUND:
33343+ result = 0;
33344+ if (inode_scaled_offset
33345+ (inode,
33346+ clust_to_off(cl_idx,
33347+ inode)) == get_key_offset(&f.key)) {
33348+				/* the first item was not found; this is
33349+				   treated as the disk cluster being absent */
33350+ clust->dstat = FAKE_DISK_CLUSTER;
33351+ goto out;
33352+ }
33353+ /* we are outside the cluster, stop search here */
33354+ assert("edward-146",
33355+ f.length != inode_scaled_cluster_size(inode));
33356+ goto ok;
33357+ case CBK_COORD_FOUND:
33358+ assert("edward-148",
33359+ hint->ext_coord.coord.between == AT_UNIT);
33360+ assert("edward-460",
33361+ hint->ext_coord.coord.unit_pos == 0);
33362+
33363+ coord_clear_iplug(&hint->ext_coord.coord);
33364+ result = zload_ra(hint->ext_coord.coord.node, &ra_info);
33365+ if (unlikely(result))
33366+ goto out;
33367+ iplug = item_plugin_by_coord(&hint->ext_coord.coord);
33368+ assert("edward-147",
33369+ item_id_by_coord(&hint->ext_coord.coord) ==
33370+ CTAIL_ID);
33371+
33372+ result = iplug->s.file.read(NULL, &f, hint);
33373+ if (result) {
33374+ zrelse(hint->ext_coord.coord.node);
33375+ goto out;
33376+ }
33377+ if (write) {
33378+ znode_make_dirty(hint->ext_coord.coord.node);
33379+ znode_set_convertible(hint->ext_coord.coord.
33380+ node);
33381+ }
33382+ zrelse(hint->ext_coord.coord.node);
33383+ break;
33384+ default:
33385+ goto out;
33386+ }
33387+ }
33388+ ok:
33389+ /* at least one item was found */
33390+ /* NOTE-EDWARD: Callers should handle the case
33391+ when disk cluster is incomplete (-EIO) */
33392+ tc->len = inode_scaled_cluster_size(inode) - f.length;
33393+ tc->lsize = fsize_to_count(clust, inode);
33394+ assert("edward-1196", tc->len > 0);
33395+ assert("edward-1406", tc->lsize > 0);
33396+
33397+ if (hint_is_unprepped_dclust(clust->hint))
33398+ clust->dstat = UNPR_DISK_CLUSTER;
33399+ else
33400+ clust->dstat = PREP_DISK_CLUSTER;
33401+ out:
33402+ assert("edward-1339",
33403+ get_current_context()->grabbed_blocks >= was_grabbed);
33404+ grabbed2free(get_current_context(),
33405+ get_current_super_private(),
33406+ get_current_context()->grabbed_blocks - was_grabbed);
33407+ return result;
33408+}
33409+
33410+int
33411+get_disk_cluster_locked(reiser4_cluster_t * clust, struct inode *inode,
33412+ znode_lock_mode lock_mode)
33413+{
33414+ reiser4_key key;
33415+ ra_info_t ra_info;
33416+
33417+ assert("edward-730", schedulable());
33418+ assert("edward-731", clust != NULL);
33419+ assert("edward-732", inode != NULL);
33420+
33421+ if (clust->hint->ext_coord.valid) {
33422+ assert("edward-1293", clust->dstat != INVAL_DISK_CLUSTER);
33423+ assert("edward-1294",
33424+ znode_is_write_locked(clust->hint->lh.node));
33425+ /* already have a valid locked position */
33426+ return (clust->dstat ==
33427+ FAKE_DISK_CLUSTER ? CBK_COORD_NOTFOUND :
33428+ CBK_COORD_FOUND);
33429+ }
33430+ key_by_inode_cryptcompress(inode, clust_to_off(clust->index, inode),
33431+ &key);
33432+ ra_info.key_to_stop = key;
33433+ set_key_offset(&ra_info.key_to_stop, get_key_offset(max_key()));
33434+
33435+ return find_cluster_item(clust->hint, &key, lock_mode, NULL, FIND_EXACT,
33436+ CBK_FOR_INSERT);
33437+}
33438+
33439+/* Read needed cluster pages before modifying.
33440+   On success, @clust->hint contains a locked position in the tree.
33441+ Also:
33442+ . find and set disk cluster state
33443+ . make disk cluster dirty if its state is not FAKE_DISK_CLUSTER.
33444+*/
33445+static int
33446+read_some_cluster_pages(struct inode *inode, reiser4_cluster_t * clust)
33447+{
33448+ int i;
33449+ int result = 0;
33450+ item_plugin *iplug;
33451+ reiser4_slide_t *win = clust->win;
33452+
33453+ iplug = item_plugin_by_id(CTAIL_ID);
33454+
33455+ assert("edward-733", get_current_context()->grabbed_blocks == 0);
33456+ assert("edward-924", !tfm_cluster_is_uptodate(&clust->tc));
33457+
33458+#if REISER4_DEBUG
33459+ if (clust->nr_pages == 0) {
33460+ /* start write hole from fake disk cluster */
33461+ assert("edward-1117", win != NULL);
33462+ assert("edward-1118", win->stat == HOLE_WINDOW);
33463+ assert("edward-1119", new_cluster(clust, inode));
33464+ }
33465+#endif
33466+ if (new_cluster(clust, inode)) {
33467+ /*
33468+		   a new page cluster is about to be written; nothing to read
33469+ */
33470+ assert("edward-734", schedulable());
33471+ assert("edward-735", clust->hint->lh.owner == NULL);
33472+
33473+ if (clust->nr_pages) {
33474+ int off;
33475+ char *data;
33476+ struct page * pg;
33477+ assert("edward-1419", clust->pages != NULL);
33478+ pg = clust->pages[clust->nr_pages - 1];
33479+ assert("edward-1420", pg != NULL);
33480+ off = off_to_pgoff(win->off+win->count+win->delta);
33481+ if (off) {
33482+ lock_page(pg);
33483+ data = kmap_atomic(pg, KM_USER0);
33484+ memset(data + off, 0, PAGE_CACHE_SIZE - off);
33485+ flush_dcache_page(pg);
33486+ kunmap_atomic(data, KM_USER0);
33487+ unlock_page(pg);
33488+ }
33489+ }
33490+ clust->dstat = FAKE_DISK_CLUSTER;
33491+ return 0;
33492+ }
33493+ /*
33494+	   Here we should search for the disk cluster to figure out its
33495+	   real state. There is one more important reason to do the disk
33496+	   search: we need to make the disk cluster _dirty_ if it exists.
33497+ */
33498+
33499+	/* if a window is specified, read only the pages
33500+	   that will be modified partially */
33501+
33502+ for (i = 0; i < clust->nr_pages; i++) {
33503+ struct page *pg = clust->pages[i];
33504+
33505+ lock_page(pg);
33506+ if (PageUptodate(pg)) {
33507+ unlock_page(pg);
33508+ continue;
33509+ }
33510+ unlock_page(pg);
33511+
33512+ if (win &&
33513+ i >= count_to_nrpages(win->off) &&
33514+ i < off_to_pg(win->off + win->count + win->delta))
33515+ /* page will be completely overwritten */
33516+ continue;
33517+
33518+ if (win && (i == clust->nr_pages - 1) &&
33519+ /* the last page is
33520+ partially modified,
33521+ not uptodate .. */
33522+ (count_to_nrpages(inode->i_size) <= pg->index)) {
33523+ /* .. and appended,
33524+ so set zeroes to the rest */
33525+ char *data;
33526+ int offset;
33527+ lock_page(pg);
33528+ data = kmap_atomic(pg, KM_USER0);
33529+
33530+ assert("edward-1260",
33531+ count_to_nrpages(win->off + win->count +
33532+ win->delta) - 1 == i);
33533+
33534+ offset =
33535+ off_to_pgoff(win->off + win->count + win->delta);
33536+ memset(data + offset, 0, PAGE_CACHE_SIZE - offset);
33537+ flush_dcache_page(pg);
33538+ kunmap_atomic(data, KM_USER0);
33539+ unlock_page(pg);
33540+ /* still not uptodate */
33541+ break;
33542+ }
33543+ if (!tfm_cluster_is_uptodate(&clust->tc)) {
33544+ result = ctail_read_disk_cluster(clust, inode, 1);
33545+ assert("edward-992", !result);
33546+ if (result)
33547+ goto out;
33548+ assert("edward-925",
33549+ tfm_cluster_is_uptodate(&clust->tc));
33550+ }
33551+ lock_page(pg);
33552+ result = do_readpage_ctail(inode, clust, pg);
33553+ unlock_page(pg);
33554+ assert("edward-993", !result);
33555+ if (result) {
33556+ impossible("edward-219",
33557+ "do_readpage_ctail returned crap");
33558+ goto out;
33559+ }
33560+ }
33561+ if (!tfm_cluster_is_uptodate(&clust->tc)) {
33562+ /* disk cluster unclaimed, but we need to make its znodes dirty
33563+ to make flush update convert its content */
33564+ result =
33565+ find_cluster(clust, inode, 0 /* do not read */ ,
33566+ 1 /* write */ );
33567+ assert("edward-994", !result);
33568+ }
33569+ out:
33570+ tfm_cluster_clr_uptodate(&clust->tc);
33571+ return result;
33572+}
33573+
33574+static int
33575+should_create_unprepped_cluster(reiser4_cluster_t * clust, struct inode *inode)
33576+{
33577+ assert("edward-737", clust != NULL);
33578+
33579+ switch (clust->dstat) {
33580+ case PREP_DISK_CLUSTER:
33581+ case UNPR_DISK_CLUSTER:
33582+ return 0;
33583+ case FAKE_DISK_CLUSTER:
33584+ if (clust->win &&
33585+ clust->win->stat == HOLE_WINDOW && clust->nr_pages == 0) {
33586+ assert("edward-1172", new_cluster(clust, inode));
33587+ return 0;
33588+ }
33589+ return 1;
33590+ default:
33591+ impossible("edward-1173", "bad disk cluster state");
33592+ return 0;
33593+ }
33594+}
33595+
33596+static int
33597+crc_make_unprepped_cluster(reiser4_cluster_t * clust, struct inode *inode)
33598+{
33599+ int result;
33600+
33601+ assert("edward-1123", schedulable());
33602+ assert("edward-737", clust != NULL);
33603+ assert("edward-738", inode != NULL);
33604+ assert("edward-739", crc_inode_ok(inode));
33605+ assert("edward-1053", clust->hint != NULL);
33606+ assert("edward-1266", get_current_context()->grabbed_blocks == 0);
33607+
33608+ if (clust->reserved) {
33609+ cluster_reserved2grabbed(estimate_insert_cluster(inode));
33610+#if REISER4_DEBUG
33611+ assert("edward-1267",
33612+ clust->reserved_unprepped ==
33613+ estimate_insert_cluster(inode));
33614+ clust->reserved_unprepped -= estimate_insert_cluster(inode);
33615+#endif
33616+ }
33617+ if (!should_create_unprepped_cluster(clust, inode)) {
33618+ all_grabbed2free();
33619+ return 0;
33620+ } else {
33621+ assert("edward-1268", clust->reserved == 1);
33622+ }
33623+ result = ctail_insert_unprepped_cluster(clust, inode);
33624+ all_grabbed2free();
33625+ if (result)
33626+ return result;
33627+
33628+ assert("edward-743", crc_inode_ok(inode));
33629+ assert("edward-1269", get_current_context()->grabbed_blocks == 0);
33630+ assert("edward-744", znode_is_write_locked(clust->hint->lh.node));
33631+
33632+ clust->dstat = UNPR_DISK_CLUSTER;
33633+ return 0;
33634+}
33635+
33636+#if REISER4_DEBUG
33637+static int jnode_truncate_ok(struct inode *inode, cloff_t index)
33638+{
33639+ jnode *node;
33640+ node =
33641+ jlookup(current_tree, get_inode_oid(inode),
33642+ clust_to_pg(index, inode));
33643+ if (likely(!node))
33644+ return 1;
33645+ /* someone got this jnode */
33646+ warning("edward-1315", "jnode %p is untruncated\n", node);
33647+ jput(node);
33648+ return (atomic_read(&node->x_count));
33649+}
33650+#endif
33651+
33652+/* Collect unlocked cluster pages and a jnode (the latter only
33653+   when the page cluster will be modified and captured) */
33654+int
33655+prepare_page_cluster(struct inode *inode, reiser4_cluster_t * clust,
33656+ int capture)
33657+{
33658+ assert("edward-177", inode != NULL);
33659+ assert("edward-741", crc_inode_ok(inode));
33660+ assert("edward-740", clust->pages != NULL);
33661+
33662+ set_cluster_nrpages(clust, inode);
33663+ reset_cluster_pgset(clust, cluster_nrpages(inode));
33664+ return (capture ?
33665+ grab_cluster_pages_jnode(inode, clust) :
33666+ grab_cluster_pages(inode, clust));
33667+}
33668+
33669+/* Truncate all pages of the cluster of index @index.
33670+ This is called by ->kill_hook() method of item plugin */
33671+void truncate_page_cluster(struct inode *inode, cloff_t index)
33672+{
33673+ int i;
33674+ int found = 0;
33675+ int nr_pages;
33676+ jnode *node;
33677+ struct page *pages[MAX_CLUSTER_NRPAGES];
33678+
33679+ node =
33680+ jlookup(current_tree, get_inode_oid(inode),
33681+ clust_to_pg(index, inode));
33682+	/* the jnode is absent; just drop the pages, which can not
33683+	   acquire a jnode because of the exclusive access */
33684+ if (!node) {
33685+ truncate_inode_pages_range(inode->i_mapping,
33686+ clust_to_off(index, inode),
33687+ clust_to_off(index,
33688+ inode) +
33689+ inode_cluster_size(inode) - 1);
33690+ return;
33691+ }
33692+ /* jnode is present and may be dirty */
33693+ nr_pages = count_to_nrpages(cnt_to_clcnt(inode->i_size, index, inode));
33694+
33695+ found = find_get_pages(inode->i_mapping, clust_to_pg(index, inode),
33696+ nr_pages, pages);
33697+ spin_lock_jnode(node);
33698+ if (JF_ISSET(node, JNODE_DIRTY)) {
33699+ /* someone has done modifications which are not
33700+ yet committed, so we need to release some resources */
33701+
33702+ /* free disk space grabbed for disk cluster converting */
33703+ cluster_reserved2grabbed(estimate_update_cluster(inode));
33704+ grabbed2free(get_current_context(),
33705+ get_current_super_private(),
33706+ estimate_update_cluster(inode));
33707+
33708+ assert("edward-1198", found == nr_pages);
33709+ assert("edward-1199", node->page_count + 1 == nr_pages);
33710+#if REISER4_DEBUG
33711+ node->page_count = 0;
33712+#endif
33713+ /* This will clear dirty bit */
33714+ uncapture_cluster_jnode(node);
33715+
33716+ /* put pages grabbed for last uncommitted modifications */
33717+ for (i = 0; i < nr_pages; i++) {
33718+ assert("edward-1200", PageUptodate(pages[i]));
33719+ page_cache_release(pages[i]);
33720+#if REISER4_DEBUG
33721+ cryptcompress_inode_data(inode)->pgcount --;
33722+#endif
33723+ }
33724+ } else
33725+ spin_unlock_jnode(node);
33726+ /* FIXME-EDWARD: Use truncate_complete_page in the loop above instead */
33727+
33728+ jput(node);
33729+ /* put pages found here */
33730+ forget_cluster_pages(pages, found);
33731+ truncate_inode_pages_range(inode->i_mapping,
33732+ clust_to_off(index, inode),
33733+ clust_to_off(index,
33734+ inode) +
33735+ inode_cluster_size(inode) - 1);
33736+ assert("edward-1201", jnode_truncate_ok(inode, index));
33737+ return;
33738+}
33739+
33740+/* Prepare a cluster handle before (after) modifications
33741+ which are supposed to be committed.
33742+
33743+ . grab cluster pages;
33744+ . reserve disk space;
33745+ . maybe read pages from disk and set the disk cluster dirty;
33746+ . maybe write hole;
33747+ . maybe create 'unprepped' disk cluster if the last one is fake
33748+     (i.e. is not represented by any items)
33749+*/
33750+
33751+static int
33752+prepare_cluster(struct inode *inode,
33753+ loff_t file_off /* write position in the file */ ,
33754+ loff_t to_file, /* bytes of users data to write to the file */
33755+ reiser4_cluster_t * clust, page_cluster_op op)
33756+{
33757+ int result = 0;
33758+ reiser4_slide_t *win = clust->win;
33759+
33760+ assert("edward-1273", get_current_context()->grabbed_blocks == 0);
33761+ reset_cluster_params(clust);
33762+#if REISER4_DEBUG
33763+ clust->ctx = get_current_context();
33764+#endif
33765+ assert("edward-1190", op != PCL_UNKNOWN);
33766+
33767+ clust->op = op;
33768+
33769+ result = prepare_page_cluster(inode, clust, 1);
33770+ if (result)
33771+ return result;
33772+ result = reserve4cluster(inode, clust);
33773+ if (result)
33774+ goto err1;
33775+ result = read_some_cluster_pages(inode, clust);
33776+ if (result) {
33777+ free_reserved4cluster(inode,
33778+ clust,
33779+ estimate_update_cluster(inode) +
33780+ estimate_insert_cluster(inode));
33781+ goto err1;
33782+ }
33783+ assert("edward-1124", clust->dstat != INVAL_DISK_CLUSTER);
33784+
33785+ result = crc_make_unprepped_cluster(clust, inode);
33786+ if (result)
33787+ goto err2;
33788+ if (win && win->stat == HOLE_WINDOW) {
33789+ result = write_hole(inode, clust, file_off, to_file);
33790+ if (result)
33791+ goto err2;
33792+ }
33793+ return 0;
33794+ err2:
33795+ free_reserved4cluster(inode, clust,
33796+ estimate_update_cluster(inode));
33797+ err1:
33798+ release_cluster_pages_and_jnode(clust);
33799+ assert("edward-1125", result == -ENOSPC);
33800+ return result;
33801+}
33802+
33803+/* set window by two offsets */
33804+static void
33805+set_window(reiser4_cluster_t * clust, reiser4_slide_t * win,
33806+ struct inode *inode, loff_t o1, loff_t o2)
33807+{
33808+ assert("edward-295", clust != NULL);
33809+ assert("edward-296", inode != NULL);
33810+ assert("edward-1071", win != NULL);
33811+ assert("edward-297", o1 <= o2);
33812+
33813+ clust->index = off_to_clust(o1, inode);
33814+
33815+ win->off = off_to_cloff(o1, inode);
33816+ win->count = min_count(inode_cluster_size(inode) - win->off, o2 - o1);
33817+ win->delta = 0;
33818+
33819+ clust->win = win;
33820+}
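+
+/* Worked example (illustrative only, not part of the original code; it
+   assumes a 64KiB logical cluster, i.e. cluster shift 16, and power-of-two
+   cluster arithmetic in off_to_clust()/off_to_cloff()):
+
+	set_window(clust, win, inode, 70000, 100000);
+
+   yields
+
+	clust->index = 70000 >> 16         = 1;
+	win->off     = 70000 & (65536 - 1) = 4464;
+	win->count   = min(65536 - 4464, 100000 - 70000) = 30000;
+
+   i.e. the window covers bytes [4464, 34464) of logical cluster 1. */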
33821+
33822+static int
33823+set_cluster_by_window(struct inode *inode, reiser4_cluster_t * clust,
33824+ reiser4_slide_t * win, flow_t * f, loff_t file_off)
33825+{
33826+ int result;
33827+
33828+ assert("edward-197", clust != NULL);
33829+ assert("edward-1072", win != NULL);
33830+ assert("edward-198", inode != NULL);
33831+
33832+ result = alloc_cluster_pgset(clust, cluster_nrpages(inode));
33833+ if (result)
33834+ return result;
33835+
33836+ if (file_off > inode->i_size) {
33837+ /* Uhmm, hole in cryptcompress file... */
33838+ loff_t hole_size;
33839+ hole_size = file_off - inode->i_size;
33840+
33841+ set_window(clust, win, inode, inode->i_size, file_off);
33842+ win->stat = HOLE_WINDOW;
33843+ if (win->off + hole_size < inode_cluster_size(inode))
33844+ /* there is also user's data to append to the hole */
33845+ win->delta =
33846+ min_count(inode_cluster_size(inode) -
33847+ (win->off + win->count), f->length);
33848+ return 0;
33849+ }
33850+ set_window(clust, win, inode, file_off, file_off + f->length);
33851+ win->stat = DATA_WINDOW;
33852+ return 0;
33853+}
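+
+/* Illustration (same 64KiB-cluster assumption as in the set_window example
+   above): with i_size = 70000 and file_off = 200000 the hole branch above
+   builds a HOLE_WINDOW over [70000, 70000 + 61072), i.e. it only zero-fills
+   the tail of logical cluster 1; the rest of the hole is represented by fake
+   clusters. win->delta stays 0 here because the hole does not end inside
+   this cluster (4464 + 130000 >= 65536). */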
33854+
33855+int set_cluster_by_page(reiser4_cluster_t * clust, struct page * page,
33856+ int count)
33857+{
33858+ int result = 0;
33859+ int (*setting_actor)(reiser4_cluster_t * clust, int count);
33860+
33861+ assert("edward-1358", clust != NULL);
33862+ assert("edward-1359", page != NULL);
33863+ assert("edward-1360", page->mapping != NULL);
33864+ assert("edward-1361", page->mapping->host != NULL);
33865+
33866+ setting_actor = (clust->pages ? reset_cluster_pgset : alloc_cluster_pgset);
33867+ result = setting_actor(clust, count);
33868+ clust->index = pg_to_clust(page->index, page->mapping->host);
33869+ return result;
33870+}
33871+
33872+/* reset all the params that do not get updated */
33873+void reset_cluster_params(reiser4_cluster_t * clust)
33874+{
33875+ assert("edward-197", clust != NULL);
33876+
33877+ clust->dstat = INVAL_DISK_CLUSTER;
33878+ clust->tc.uptodate = 0;
33879+ clust->tc.len = 0;
33880+}
33881+
33882+/* Core write procedure of the cryptcompress plugin: it slices the user's
33883+   flow into logical clusters, maps those to the appropriate
33884+   page clusters, and tries to capture them.
33885+   If @buf != NULL, returns the number of successfully written bytes,
33886+   otherwise returns an error code.
33887+*/
33888+static loff_t
33889+write_cryptcompress_flow(struct file *file, struct inode *inode,
33890+ const char __user *buf, size_t count, loff_t pos)
33891+{
33892+ int i;
33893+ flow_t f;
33894+ hint_t *hint;
33895+ int result = 0;
33896+ size_t to_write = 0;
33897+ loff_t file_off;
33898+ reiser4_slide_t win;
33899+ reiser4_cluster_t clust;
33900+
33901+ assert("edward-161", schedulable());
33902+ assert("edward-748", crc_inode_ok(inode));
33903+ assert("edward-159", current_blocksize == PAGE_CACHE_SIZE);
33904+ assert("edward-1274", get_current_context()->grabbed_blocks == 0);
33905+
33906+ result = check_cryptcompress(inode);
33907+ if (result)
33908+ return result;
33909+ hint = kmalloc(sizeof(*hint), GFP_KERNEL);
33910+ if (hint == NULL)
33911+ return RETERR(-ENOMEM);
33912+
33913+ result = load_file_hint(file, hint);
33914+ if (result) {
33915+ kfree(hint);
33916+ return result;
33917+ }
33918+
33919+ result =
33920+ flow_by_inode_cryptcompress(inode, buf, 1 /* user space */ ,
33921+ count, pos, WRITE_OP, &f);
33922+ if (result)
33923+ goto out;
33924+ to_write = f.length;
33925+
33926+ /* current write position in file */
33927+ file_off = pos;
33928+ reiser4_slide_init(&win);
33929+ cluster_init_read(&clust, &win);
33930+ clust.hint = hint;
33931+
33932+ result = set_cluster_by_window(inode, &clust, &win, &f, file_off);
33933+ if (result)
33934+ goto out;
33935+
33936+ if (next_window_stat(&win) == HOLE_WINDOW) {
33937+ result =
33938+ prepare_cluster(inode, file_off, f.length, &clust,
33939+ PCL_APPEND);
33940+ if (result)
33941+ goto out;
33942+ }
33943+ do {
33944+ char *src;
33945+ unsigned page_off, page_count;
33946+
33947+ assert("edward-750", schedulable());
33948+
33949+ result =
33950+ prepare_cluster(inode, file_off, f.length, &clust,
33951+ PCL_APPEND);
33952+ if (result)
33953+ goto out;
33954+
33955+ assert("edward-751", crc_inode_ok(inode));
33956+ assert("edward-204", win.stat == DATA_WINDOW);
33957+ assert("edward-1288", clust.hint->ext_coord.valid);
33958+ assert("edward-752",
33959+ znode_is_write_locked(hint->ext_coord.coord.node));
33960+
33961+ put_hint_cluster(&clust, inode, ZNODE_WRITE_LOCK);
33962+
33963+ /* set write position in page */
33964+ page_off = off_to_pgoff(win.off);
33965+
33966+ /* copy user's data to cluster pages */
33967+ for (i = off_to_pg(win.off), src = f.data;
33968+ i < count_to_nrpages(win.off + win.count);
33969+ i++, src += page_count) {
33970+ page_count =
33971+ cnt_to_pgcnt(win.off + win.count, i) - page_off;
33972+
33973+ assert("edward-1039",
33974+ page_off + page_count <= PAGE_CACHE_SIZE);
33975+ assert("edward-287", clust.pages[i] != NULL);
33976+
33977+ lock_page(clust.pages[i]);
33978+ result =
33979+ __copy_from_user((char *)kmap(clust.pages[i]) +
33980+ page_off, (char __user *)src, page_count);
33981+ kunmap(clust.pages[i]);
33982+ if (unlikely(result)) {
33983+ unlock_page(clust.pages[i]);
33984+ result = -EFAULT;
33985+ goto err2;
33986+ }
33987+ SetPageUptodate(clust.pages[i]);
33988+ unlock_page(clust.pages[i]);
33989+ page_off = 0;
33990+ }
33991+ assert("edward-753", crc_inode_ok(inode));
33992+
33993+ set_cluster_pages_dirty(&clust);
33994+
33995+ result = try_capture_cluster(&clust, inode);
33996+ if (result)
33997+ goto err2;
33998+
33999+ assert("edward-998", f.user == 1);
34000+
34001+ move_flow_forward(&f, win.count);
34002+
34003+ /* disk cluster may be already clean at this point */
34004+
34005+ /* . update cluster
34006+ . set hint for new offset
34007+ . unlock znode
34008+ . update inode
34009+ . balance dirty pages
34010+ */
34011+ result = balance_dirty_page_cluster(&clust, inode, 0, f.length);
34012+ if (result)
34013+ goto err1;
34014+ assert("edward-755", hint->lh.owner == NULL);
34015+ reset_cluster_params(&clust);
34016+ continue;
34017+ err2:
34018+ release_cluster_pages_and_jnode(&clust);
34019+ err1:
34020+ if (clust.reserved)
34021+ free_reserved4cluster(inode,
34022+ &clust,
34023+ estimate_update_cluster(inode));
34024+ break;
34025+ } while (f.length);
34026+ out:
34027+ done_lh(&hint->lh);
34028+ if (result == -EEXIST)
34029+ warning("edward-1407", "write returns EEXIST!\n");
34030+
34031+ put_cluster_handle(&clust);
34032+ save_file_hint(file, hint);
34033+ kfree(hint);
34034+ if (buf) {
34035+		/* if nothing was written, there must be an error */
34036+ assert("edward-195", ergo((to_write == f.length), result < 0));
34037+ return (to_write - f.length) ? (to_write - f.length) : result;
34038+ }
34039+ return result;
34040+}
34041+
34042+static ssize_t write_crc_file(struct file *file, /* file to write to */
34043+ struct inode *inode, /* inode */
34044+ const char __user *buf, /* address of user-space buffer */
34045+ size_t count, /* number of bytes to write */
34046+			      loff_t * off /* position in file to write to */ )
34047+{
34048+
34049+ int result;
34050+ loff_t pos;
34051+ ssize_t written;
34052+ cryptcompress_info_t *info = cryptcompress_inode_data(inode);
34053+
34054+ assert("edward-196", crc_inode_ok(inode));
34055+
34056+ result = generic_write_checks(file, off, &count, 0);
34057+ if (unlikely(result != 0))
34058+ return result;
34059+
34060+ if (unlikely(count == 0))
34061+ return 0;
34062+
34063+ down_write(&info->lock);
34064+ LOCK_CNT_INC(inode_sem_w);
34065+
34066+ pos = *off;
34067+ written =
34068+ write_cryptcompress_flow(file, inode, buf, count, pos);
34069+
34070+ up_write(&info->lock);
34071+ LOCK_CNT_DEC(inode_sem_w);
34072+
34073+ if (written < 0) {
34074+ if (written == -EEXIST)
34075+ printk("write_crc_file returns EEXIST!\n");
34076+ return written;
34077+ }
34078+ /* update position in a file */
34079+ *off = pos + written;
34080+ /* return number of written bytes */
34081+ return written;
34082+}
34083+
34084+/**
34085+ * write_cryptcompress - write of struct file_operations
34086+ * @file: file to write to
34087+ * @buf: address of user-space buffer
34088+ * @count: number of bytes to write
34089+ * @off: position in file to write to
34090+ *
34091+ * This is the implementation of vfs's write method of struct file_operations
34092+ * for the cryptcompress plugin.
34093+ */
34094+ssize_t write_cryptcompress(struct file *file, const char __user *buf,
34095+ size_t count, loff_t *off)
34096+{
34097+ ssize_t result;
34098+ struct inode *inode;
34099+ reiser4_context *ctx;
34100+
34101+ inode = file->f_dentry->d_inode;
34102+
34103+ ctx = init_context(inode->i_sb);
34104+ if (IS_ERR(ctx))
34105+ return PTR_ERR(ctx);
34106+
34107+ mutex_lock(&inode->i_mutex);
34108+
34109+ result = write_crc_file(file, inode, buf, count, off);
34110+
34111+ mutex_unlock(&inode->i_mutex);
34112+
34113+ context_set_commit_async(ctx);
34114+ reiser4_exit_context(ctx);
34115+ return result;
34116+}
34117+
34118+static void
34119+readpages_crc(struct address_space *mapping, struct list_head *pages,
34120+ void *data)
34121+{
34122+ file_plugin *fplug;
34123+ item_plugin *iplug;
34124+
34125+ assert("edward-1112", mapping != NULL);
34126+ assert("edward-1113", mapping->host != NULL);
34127+
34128+ fplug = inode_file_plugin(mapping->host);
34129+ assert("edward-1114", fplug == file_plugin_by_id(CRC_FILE_PLUGIN_ID));
34130+ iplug = item_plugin_by_id(CTAIL_ID);
34131+
34132+ iplug->s.file.readpages(data, mapping, pages);
34133+
34134+ return;
34135+}
34136+
34137+static reiser4_block_nr cryptcompress_estimate_read(struct inode *inode)
34138+{
34139+ /* reserve one block to update stat data item */
34140+ assert("edward-1193",
34141+ inode_file_plugin(inode)->estimate.update ==
34142+ estimate_update_common);
34143+ return estimate_update_common(inode);
34144+}
34145+
34146+/**
34147+ * read_cryptcompress - read of struct file_operations
34148+ * @file: file to read from
34149+ * @buf: address of user-space buffer
34150+ * @size: number of bytes to read
34151+ * @off: position in file to read from
34152+ *
34153+ * This is the implementation of vfs's read method of struct file_operations
34154+ * for the cryptcompress plugin.
34155+ */
34156+ssize_t read_cryptcompress(struct file * file, char __user *buf, size_t size,
34157+ loff_t * off)
34158+{
34159+ ssize_t result;
34160+ struct inode *inode;
34161+ reiser4_context *ctx;
34162+ reiser4_file_fsdata *fsdata;
34163+ cryptcompress_info_t *info;
34164+ reiser4_block_nr needed;
34165+
34166+ inode = file->f_dentry->d_inode;
34167+ assert("edward-1194", !inode_get_flag(inode, REISER4_NO_SD));
34168+
34169+ ctx = init_context(inode->i_sb);
34170+ if (IS_ERR(ctx))
34171+ return PTR_ERR(ctx);
34172+
34173+ info = cryptcompress_inode_data(inode);
34174+ needed = cryptcompress_estimate_read(inode);
34175+
34176+ /* FIXME-EDWARD:
34177+ Grab space for sd_update so find_cluster will be happy */
34178+ result = reiser4_grab_space(needed, BA_CAN_COMMIT);
34179+ if (result != 0) {
34180+ reiser4_exit_context(ctx);
34181+ return result;
34182+ }
34183+ fsdata = reiser4_get_file_fsdata(file);
34184+ fsdata->ra2.data = file;
34185+ fsdata->ra2.readpages = readpages_crc;
34186+
34187+ down_read(&info->lock);
34188+ LOCK_CNT_INC(inode_sem_r);
34189+
34190+ result = generic_file_read(file, buf, size, off);
34191+
34192+ up_read(&info->lock);
34193+ LOCK_CNT_DEC(inode_sem_r);
34194+
34195+ context_set_commit_async(ctx);
34196+ reiser4_exit_context(ctx);
34197+
34198+ return result;
34199+}
34200+
34201+/* If @index > 0, find the real disk cluster of index (@index - 1);
34202+   if @index == 0, find the real disk cluster of the object with maximal index.
34203+   Keep the incremented index of the result in @found.
34204+   If success was returned:
34205+   (@index == 0 && @found == 0) means that the object doesn't have real disk
34206+   clusters;
34207+   (@index != 0 && @found == 0) means that the disk cluster of (@index - 1)
34208+   doesn't exist.
34209+*/
34210+static int
34211+find_real_disk_cluster(struct inode *inode, cloff_t * found, cloff_t index)
34212+{
34213+ int result;
34214+ reiser4_key key;
34215+ loff_t offset;
34216+ hint_t *hint;
34217+ lock_handle *lh;
34218+ lookup_bias bias;
34219+ coord_t *coord;
34220+ item_plugin *iplug;
34221+
34222+ assert("edward-1131", inode != NULL);
34223+ assert("edward-95", crc_inode_ok(inode));
34224+
34225+ hint = kmalloc(sizeof(*hint), GFP_KERNEL);
34226+ if (hint == NULL)
34227+ return RETERR(-ENOMEM);
34228+ hint_init_zero(hint);
34229+ lh = &hint->lh;
34230+
34231+ bias = (index ? FIND_EXACT : FIND_MAX_NOT_MORE_THAN);
34232+ offset =
34233+ (index ? clust_to_off(index, inode) -
34234+ 1 : get_key_offset(max_key()));
34235+
34236+ key_by_inode_cryptcompress(inode, offset, &key);
34237+
34238+ /* find the last item of this object */
34239+ result =
34240+ find_cluster_item(hint, &key, ZNODE_READ_LOCK, NULL /* ra_info */,
34241+ bias, 0);
34242+ if (cbk_errored(result)) {
34243+ done_lh(lh);
34244+ kfree(hint);
34245+ return result;
34246+ }
34247+ if (result == CBK_COORD_NOTFOUND) {
34248+ /* no real disk clusters */
34249+ done_lh(lh);
34250+ kfree(hint);
34251+ *found = 0;
34252+ return 0;
34253+ }
34254+ /* disk cluster is found */
34255+ coord = &hint->ext_coord.coord;
34256+ coord_clear_iplug(coord);
34257+ result = zload(coord->node);
34258+ if (unlikely(result)) {
34259+ done_lh(lh);
34260+ kfree(hint);
34261+ return result;
34262+ }
34263+ iplug = item_plugin_by_coord(coord);
34264+ assert("edward-277", iplug == item_plugin_by_id(CTAIL_ID));
34265+ assert("edward-1202", ctail_ok(coord));
34266+
34267+ item_key_by_coord(coord, &key);
34268+ *found = off_to_clust(get_key_offset(&key), inode) + 1;
34269+
34270+ assert("edward-1132", ergo(index, index == *found));
34271+
34272+ zrelse(coord->node);
34273+ done_lh(lh);
34274+ kfree(hint);
34275+ return 0;
34276+}
34277+
34278+static int find_fake_appended(struct inode *inode, cloff_t * index)
34279+{
34280+ return find_real_disk_cluster(inode, index,
34281+ 0 /* find last real one */ );
34282+}
34283+
34284+/* Set left coord when unit is not found after node_lookup()
34285+ This takes into account that there can be holes in a sequence
34286+ of disk clusters */
34287+
34288+static void adjust_left_coord(coord_t * left_coord)
34289+{
34290+ switch (left_coord->between) {
34291+ case AFTER_UNIT:
34292+ left_coord->between = AFTER_ITEM;
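+		/* fall through: a coord positioned after the last unit is
+		   normalized to "after the item" before cutting */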
34293+ case AFTER_ITEM:
34294+ case BEFORE_UNIT:
34295+ break;
34296+ default:
34297+ impossible("edward-1204", "bad left coord to cut");
34298+ }
34299+ return;
34300+}
34301+
34302+#define CRC_CUT_TREE_MIN_ITERATIONS 64
34303+int
34304+cut_tree_worker_cryptcompress(tap_t * tap, const reiser4_key * from_key,
34305+ const reiser4_key * to_key,
34306+ reiser4_key * smallest_removed,
34307+ struct inode *object, int truncate, int *progress)
34308+{
34309+ lock_handle next_node_lock;
34310+ coord_t left_coord;
34311+ int result;
34312+
34313+ assert("edward-1158", tap->coord->node != NULL);
34314+ assert("edward-1159", znode_is_write_locked(tap->coord->node));
34315+ assert("edward-1160", znode_get_level(tap->coord->node) == LEAF_LEVEL);
34316+
34317+ *progress = 0;
34318+ init_lh(&next_node_lock);
34319+
34320+ while (1) {
34321+ znode *node; /* node from which items are cut */
34322+ node_plugin *nplug; /* node plugin for @node */
34323+
34324+ node = tap->coord->node;
34325+
34326+ /* Move next_node_lock to the next node on the left. */
34327+ result =
34328+ reiser4_get_left_neighbor(&next_node_lock, node,
34329+ ZNODE_WRITE_LOCK,
34330+ GN_CAN_USE_UPPER_LEVELS);
34331+ if (result != 0 && result != -E_NO_NEIGHBOR)
34332+ break;
34333+ /* FIXME-EDWARD: Check can we delete the node as a whole. */
34334+ result = tap_load(tap);
34335+ if (result)
34336+ return result;
34337+
34338+ /* Prepare the second (right) point for cut_node() */
34339+ if (*progress)
34340+ coord_init_last_unit(tap->coord, node);
34341+
34342+ else if (item_plugin_by_coord(tap->coord)->b.lookup == NULL)
34343+ /* set rightmost unit for the items without lookup method */
34344+ tap->coord->unit_pos = coord_last_unit_pos(tap->coord);
34345+
34346+ nplug = node->nplug;
34347+
34348+ assert("edward-1161", nplug);
34349+ assert("edward-1162", nplug->lookup);
34350+
34351+ /* left_coord is leftmost unit cut from @node */
34352+ result = nplug->lookup(node, from_key, FIND_EXACT, &left_coord);
34353+
34354+ if (IS_CBKERR(result))
34355+ break;
34356+
34357+ if (result == CBK_COORD_NOTFOUND)
34358+ adjust_left_coord(&left_coord);
34359+
34360+ /* adjust coordinates so that they are set to existing units */
34361+ if (coord_set_to_right(&left_coord)
34362+ || coord_set_to_left(tap->coord)) {
34363+ result = 0;
34364+ break;
34365+ }
34366+
34367+ if (coord_compare(&left_coord, tap->coord) ==
34368+ COORD_CMP_ON_RIGHT) {
34369+ /* keys from @from_key to @to_key are not in the tree */
34370+ result = 0;
34371+ break;
34372+ }
34373+
34374+ /* cut data from one node */
34375+ *smallest_removed = *min_key();
34376+ result = kill_node_content(&left_coord,
34377+ tap->coord,
34378+ from_key,
34379+ to_key,
34380+ smallest_removed,
34381+ next_node_lock.node,
34382+ object, truncate);
34383+#if REISER4_DEBUG
34384+ /*node_check(node, ~0U); */
34385+#endif
34386+ tap_relse(tap);
34387+
34388+ if (result)
34389+ break;
34390+
34391+ ++(*progress);
34392+
34393+ /* Check whether all items with keys >= from_key were removed
34394+ * from the tree. */
34395+ if (keyle(smallest_removed, from_key))
34396+ /* result = 0; */
34397+ break;
34398+
34399+ if (next_node_lock.node == NULL)
34400+ break;
34401+
34402+ result = tap_move(tap, &next_node_lock);
34403+ done_lh(&next_node_lock);
34404+ if (result)
34405+ break;
34406+
34407+ /* Break long cut_tree operation (deletion of a large file) if
34408+ * atom requires commit. */
34409+ if (*progress > CRC_CUT_TREE_MIN_ITERATIONS
34410+ && current_atom_should_commit()) {
34411+ result = -E_REPEAT;
34412+ break;
34413+ }
34414+ }
34415+ done_lh(&next_node_lock);
34416+ return result;
34417+}
34418+
34419+/* Append or expand a hole in two steps (exclusive access should be acquired!):
34420+   1) write zeroes to the current real cluster,
34421+   2) expand the hole via fake clusters (just increase i_size) */
34422+static int
34423+cryptcompress_append_hole(struct inode *inode /*contains old i_size */ ,
34424+ loff_t new_size)
34425+{
34426+ int result = 0;
34427+ hint_t *hint;
34428+ lock_handle *lh;
34429+ loff_t hole_size;
34430+ int nr_zeroes;
34431+ reiser4_slide_t win;
34432+ reiser4_cluster_t clust;
34433+
34434+ assert("edward-1133", inode->i_size < new_size);
34435+ assert("edward-1134", schedulable());
34436+ assert("edward-1135", crc_inode_ok(inode));
34437+ assert("edward-1136", current_blocksize == PAGE_CACHE_SIZE);
34438+ assert("edward-1333", off_to_cloff(inode->i_size, inode) != 0);
34439+
34440+ hint = kmalloc(sizeof(*hint), GFP_KERNEL);
34441+ if (hint == NULL)
34442+ return RETERR(-ENOMEM);
34443+ hint_init_zero(hint);
34444+ lh = &hint->lh;
34445+
34446+ reiser4_slide_init(&win);
34447+ cluster_init_read(&clust, &win);
34448+ clust.hint = hint;
34449+
34450+ result = alloc_cluster_pgset(&clust, cluster_nrpages(inode));
34451+ if (result)
34452+ goto out;
34453+ if (off_to_cloff(inode->i_size, inode) == 0)
34454+ goto fake_append;
34455+ hole_size = new_size - inode->i_size;
34456+ nr_zeroes =
34457+ inode_cluster_size(inode) - off_to_cloff(inode->i_size, inode);
34458+ if (hole_size < nr_zeroes)
34459+ nr_zeroes = hole_size;
34460+ set_window(&clust, &win, inode, inode->i_size,
34461+ inode->i_size + nr_zeroes);
34462+ win.stat = HOLE_WINDOW;
34463+
34464+ assert("edward-1137",
34465+ clust.index == off_to_clust(inode->i_size, inode));
34466+
34467+ result = prepare_cluster(inode, 0, 0, &clust, PCL_APPEND);
34468+
34469+ assert("edward-1271", !result || result == -ENOSPC);
34470+ if (result)
34471+ goto out;
34472+ assert("edward-1139",
34473+ clust.dstat == PREP_DISK_CLUSTER ||
34474+ clust.dstat == UNPR_DISK_CLUSTER);
34475+
34476+ assert("edward-1431", hole_size >= nr_zeroes);
34477+ if (hole_size == nr_zeroes)
34478+ /* nothing to append anymore */
34479+ goto out;
34480+ fake_append:
34481+ INODE_SET_FIELD(inode, i_size, new_size);
34482+ out:
34483+ done_lh(lh);
34484+ kfree(hint);
34485+ put_cluster_handle(&clust);
34486+ return result;
34487+}
34488+
34489+#if REISER4_DEBUG
34490+static int
34491+pages_truncate_ok(struct inode *inode, loff_t old_size, pgoff_t start)
34492+{
34493+ struct pagevec pvec;
34494+ int i;
34495+ int count;
34496+ int rest;
34497+
34498+ rest = count_to_nrpages(old_size) - start;
34499+
34500+ pagevec_init(&pvec, 0);
34501+ count = min_count(pagevec_space(&pvec), rest);
34502+
34503+ while (rest) {
34504+ count = min_count(pagevec_space(&pvec), rest);
34505+ pvec.nr = find_get_pages(inode->i_mapping, start,
34506+ count, pvec.pages);
34507+ for (i = 0; i < pagevec_count(&pvec); i++) {
34508+ if (PageUptodate(pvec.pages[i])) {
34509+ warning("edward-1205",
34510+ "truncated page of index %lu is uptodate",
34511+ pvec.pages[i]->index);
34512+ return 0;
34513+ }
34514+ }
34515+ start += count;
34516+ rest -= count;
34517+ pagevec_release(&pvec);
34518+ }
34519+ return 1;
34520+}
34521+
34522+static int body_truncate_ok(struct inode *inode, cloff_t aidx)
34523+{
34524+ int result;
34525+ cloff_t raidx;
34526+
34527+ result = find_fake_appended(inode, &raidx);
34528+ return !result && (aidx == raidx);
34529+}
34530+#endif
34531+
34532+static int
34533+update_cryptcompress_size(struct inode *inode, reiser4_key * key, int update_sd)
34534+{
34535+ return (get_key_offset(key) & ((loff_t) (inode_cluster_size(inode)) - 1)
34536+ ? 0 : update_file_size(inode, key, update_sd));
34537+}
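+
+/* Illustration (assumes a power-of-two cluster size, e.g. 64KiB):
+   get_key_offset(key) & (cluster_size - 1) is zero exactly when the key
+   sits on a cluster boundary, so the file size in the stat data is only
+   updated when the cut stopped at a cluster-aligned offset
+   (offset 131072 -> update_file_size() is called, 131073 -> skipped). */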
34538+
34539+/* Prune a cryptcompress file in two steps (exclusive access should be acquired!):
34540+   1) cut all disk clusters except the partially truncated last one,
34541+   2) write zeroes to and capture the last partially truncated page cluster if
34542+   it exists, otherwise truncate via pruning the fake cluster (just decrease
34543+   i_size) */
34544+static int
34545+prune_cryptcompress(struct inode *inode, loff_t new_size, int update_sd,
34546+ cloff_t aidx)
34547+{
34548+ int result = 0;
34549+ unsigned nr_zeroes;
34550+ loff_t to_prune;
34551+ loff_t old_size;
34552+ cloff_t ridx;
34553+
34554+ hint_t *hint;
34555+ lock_handle *lh;
34556+ reiser4_slide_t win;
34557+ reiser4_cluster_t clust;
34558+
34559+ assert("edward-1140", inode->i_size >= new_size);
34560+ assert("edward-1141", schedulable());
34561+ assert("edward-1142", crc_inode_ok(inode));
34562+ assert("edward-1143", current_blocksize == PAGE_CACHE_SIZE);
34563+
34564+ hint = kmalloc(sizeof(*hint), GFP_KERNEL);
34565+ if (hint == NULL)
34566+ return RETERR(-ENOMEM);
34567+ hint_init_zero(hint);
34568+ lh = &hint->lh;
34569+
34570+ reiser4_slide_init(&win);
34571+ cluster_init_read(&clust, &win);
34572+ clust.hint = hint;
34573+
34574+ /* rightmost completely truncated cluster */
34575+ ridx = count_to_nrclust(new_size, inode);
34576+
34577+ assert("edward-1174", ridx <= aidx);
34578+ old_size = inode->i_size;
34579+ if (ridx != aidx) {
34580+ result = cut_file_items(inode,
34581+ clust_to_off(ridx, inode),
34582+ update_sd,
34583+ clust_to_off(aidx, inode),
34584+ update_cryptcompress_size);
34585+ if (result)
34586+ goto out;
34587+ }
34588+ if (!off_to_cloff(new_size, inode)) {
34589+ /* no partially truncated clusters */
34590+ assert("edward-1145", inode->i_size == new_size);
34591+ goto finish;
34592+ }
34593+ assert("edward-1146", new_size < inode->i_size);
34594+
34595+ to_prune = inode->i_size - new_size;
34596+
34597+ /* partial truncate of leftmost cluster,
34598+ first check if it is fake */
34599+ result = find_real_disk_cluster(inode, &aidx, ridx);
34600+ if (result)
34601+ goto out;
34602+ if (!aidx)
34603+		/* yup, this is a fake one */
34604+ goto finish;
34605+
34606+ assert("edward-1148", aidx == ridx);
34607+
34608+ /* do partial truncate of the leftmost page cluster,
34609+ then try to capture this one */
34610+ result = alloc_cluster_pgset(&clust, cluster_nrpages(inode));
34611+ if (result)
34612+ goto out;
34613+ nr_zeroes = (off_to_pgoff(new_size) ?
34614+ PAGE_CACHE_SIZE - off_to_pgoff(new_size) : 0);
34615+ set_window(&clust, &win, inode, new_size, new_size + nr_zeroes);
34616+ win.stat = HOLE_WINDOW;
34617+
34618+ assert("edward-1149", clust.index == ridx - 1);
34619+
34620+ result = prepare_cluster(inode, 0, 0, &clust, PCL_TRUNCATE);
34621+ if (result)
34622+ goto out;
34623+ assert("edward-1151",
34624+ clust.dstat == PREP_DISK_CLUSTER ||
34625+ clust.dstat == UNPR_DISK_CLUSTER);
34626+
34627+ assert("edward-1191", inode->i_size == new_size);
34628+ assert("edward-1206", body_truncate_ok(inode, ridx));
34629+ finish:
34630+	/* drop all the pages that don't have jnodes (i.e. pages
34631+	   which cannot be truncated by cut_file_items() because
34632+	   of holes represented by fake disk clusters), including
34633+	   the pages of the partially truncated cluster which was
34634+	   released by prepare_cluster() */
34635+ truncate_inode_pages(inode->i_mapping, new_size);
34636+ INODE_SET_FIELD(inode, i_size, new_size);
34637+ out:
34638+ assert("edward-1334", !result || result == -ENOSPC);
34639+ assert("edward-1209",
34640+ pages_truncate_ok(inode, old_size, count_to_nrpages(new_size)));
34641+ done_lh(lh);
34642+ kfree(hint);
34643+ put_cluster_handle(&clust);
34644+ return result;
34645+}
34646+
34647+/* Prepare cryptcompress file for truncate:
34648+ prune or append rightmost fake logical clusters (if any)
34649+*/
34650+static int
34651+start_truncate_fake(struct inode *inode, cloff_t aidx, loff_t new_size,
34652+ int update_sd)
34653+{
34654+ int result = 0;
34655+ int bytes;
34656+
34657+ if (new_size > inode->i_size) {
34658+ /* append */
34659+ if (inode->i_size < clust_to_off(aidx, inode))
34660+ /* no fake bytes */
34661+ return 0;
34662+ bytes = new_size - inode->i_size;
34663+ INODE_SET_FIELD(inode, i_size, inode->i_size + bytes);
34664+ } else {
34665+ /* prune */
34666+ if (inode->i_size <= clust_to_off(aidx, inode))
34667+ /* no fake bytes */
34668+ return 0;
34669+ bytes =
34670+ inode->i_size - max_count(new_size,
34671+ clust_to_off(aidx, inode));
34672+ if (!bytes)
34673+ return 0;
34674+ INODE_SET_FIELD(inode, i_size, inode->i_size - bytes);
34675+		/* In the case of a fake prune we need to drop the page cluster.
34676+		   There are only 2 cases for a partially truncated page:
34677+		   1. It is dirty; then it is anonymous
34678+		   (was dirtied via mmap) and will be captured
34679+		   later via ->capture().
34680+		   2. It is clean; then it is filled with zeroes.
34681+		   In both cases we don't need to make it dirty and
34682+		   capture it here.
34683+		*/
34684+ truncate_inode_pages(inode->i_mapping, inode->i_size);
34685+ }
34686+ if (update_sd)
34687+ result = update_sd_cryptcompress(inode);
34688+ return result;
34689+}
34690+
34691+/* This is called in setattr_cryptcompress when it is used to truncate,
34692+ and in delete_cryptcompress */
34693+static int cryptcompress_truncate(struct inode *inode, /* old size */
34694+ loff_t new_size, /* new size */
34695+ int update_sd)
34696+{
34697+ int result;
34698+ cloff_t aidx;
34699+
34700+ result = find_fake_appended(inode, &aidx);
34701+ if (result)
34702+ return result;
34703+ assert("edward-1208",
34704+ ergo(aidx > 0, inode->i_size > clust_to_off(aidx - 1, inode)));
34705+
34706+ result = start_truncate_fake(inode, aidx, new_size, update_sd);
34707+ if (result)
34708+ return result;
34709+ if (inode->i_size == new_size)
34710+ /* nothing to truncate anymore */
34711+ return 0;
34712+ return (inode->i_size < new_size ?
34713+ cryptcompress_append_hole(inode, new_size) :
34714+ prune_cryptcompress(inode, new_size, update_sd, aidx));
34715+}
34716+
34717+static void clear_moved_tag_cluster(struct address_space * mapping,
34718+ reiser4_cluster_t * clust)
34719+{
34720+ int i;
34721+ void * ret;
34722+ read_lock_irq(&mapping->tree_lock);
34723+ for (i = 0; i < clust->nr_pages; i++) {
34724+ assert("edward-1438", clust->pages[i] != NULL);
34725+ ret = radix_tree_tag_clear(&mapping->page_tree,
34726+ clust->pages[i]->index,
34727+ PAGECACHE_TAG_REISER4_MOVED);
34728+ assert("edward-1439", ret == clust->pages[i]);
34729+ }
34730+ read_unlock_irq(&mapping->tree_lock);
34731+}
34732+
34733+/* Capture an anonymous page cluster. (A page cluster is
34734+   anonymous if it contains at least one anonymous page.) */
34735+static int
34736+capture_page_cluster(reiser4_cluster_t * clust, struct inode *inode)
34737+{
34738+ int result;
34739+
34740+ assert("edward-1073", clust != NULL);
34741+ assert("edward-1074", inode != NULL);
34742+ assert("edward-1075", clust->dstat == INVAL_DISK_CLUSTER);
34743+
34744+ result = prepare_cluster(inode, 0, 0, clust, PCL_APPEND);
34745+ if (result)
34746+ return result;
34747+ set_cluster_pages_dirty(clust);
34748+ clear_moved_tag_cluster(inode->i_mapping, clust);
34749+
34750+ result = try_capture_cluster(clust, inode);
34751+ put_hint_cluster(clust, inode, ZNODE_WRITE_LOCK);
34752+ if (unlikely(result)) {
34753+ /* set cleared tag back, so it will be
34754+ possible to capture it again later */
34755+ read_lock_irq(&inode->i_mapping->tree_lock);
34756+ radix_tree_tag_set(&inode->i_mapping->page_tree,
34757+ clust_to_pg(clust->index, inode),
34758+ PAGECACHE_TAG_REISER4_MOVED);
34759+ read_unlock_irq(&inode->i_mapping->tree_lock);
34760+
34761+ release_cluster_pages_and_jnode(clust);
34762+ }
34763+ return result;
34764+}
34765+
34766+#define MAX_CLUSTERS_TO_CAPTURE(inode) (1024 >> cluster_nrpages_shift(inode))
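+/* i.e. capture at most 1024 pages worth of clusters per pass; e.g. with
+   4KiB pages and 64KiB clusters cluster_nrpages_shift() is 4, so up to
+   1024 >> 4 = 64 clusters are captured per call (illustrative figures). */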
34767+
34768+/* read lock should be acquired */
34769+static int
34770+capture_anonymous_clusters(struct address_space *mapping, pgoff_t * index,
34771+ int to_capture)
34772+{
34773+ int result = 0;
34774+ int found;
34775+ int progress = 0;
34776+ struct page *page = NULL;
34777+ hint_t *hint;
34778+ lock_handle *lh;
34779+ reiser4_cluster_t clust;
34780+
34781+ assert("edward-1127", mapping != NULL);
34782+ assert("edward-1128", mapping->host != NULL);
34783+ assert("edward-1440", mapping->host->i_mapping == mapping);
34784+
34785+ hint = kmalloc(sizeof(*hint), GFP_KERNEL);
34786+ if (hint == NULL)
34787+ return RETERR(-ENOMEM);
34788+ hint_init_zero(hint);
34789+ lh = &hint->lh;
34790+
34791+ cluster_init_read(&clust, NULL);
34792+ clust.hint = hint;
34793+
34794+ result = alloc_cluster_pgset(&clust, cluster_nrpages(mapping->host));
34795+ if (result)
34796+ goto out;
34797+
34798+ while (to_capture > 0) {
34799+ found =
34800+ find_get_pages_tag(mapping, index,
34801+ PAGECACHE_TAG_REISER4_MOVED, 1, &page);
34802+ if (!found) {
34803+ *index = (pgoff_t) - 1;
34804+ break;
34805+ }
34806+ assert("edward-1109", page != NULL);
34807+
34808+ move_cluster_forward(&clust, mapping->host, page->index,
34809+ &progress);
34810+ result = capture_page_cluster(&clust, mapping->host);
34811+ page_cache_release(page);
34812+ if (result)
34813+ break;
34814+ to_capture--;
34815+ }
34816+ if (result) {
34817+ warning("edward-1077",
34818+ "Cannot capture anon pages: result=%i (captured=%d)\n",
34819+ result,
34820+ ((__u32) MAX_CLUSTERS_TO_CAPTURE(mapping->host)) -
34821+ to_capture);
34822+ } else {
34823+ /* something had to be found */
34824+ assert("edward-1078",
34825+ to_capture <= MAX_CLUSTERS_TO_CAPTURE(mapping->host));
34826+ if (to_capture <= 0)
34827+			/* there may be more pages left */
34828+ __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
34829+ }
34830+ out:
34831+ done_lh(lh);
34832+ kfree(hint);
34833+ put_cluster_handle(&clust);
34834+ return result;
34835+}
34836+
34837+/* Check the mapping for existence of non-captured dirty pages.
34838+   This returns !0 if the page tree contains pages tagged
34839+   PAGECACHE_TAG_REISER4_MOVED */
34840+static int crc_inode_has_anon_pages(struct inode *inode)
34841+{
34842+ return mapping_tagged(inode->i_mapping, PAGECACHE_TAG_REISER4_MOVED);
34843+}
34844+
34845+/* this is the implementation of vfs's writepages method of struct
34846+   address_space_operations */
34847+int
34848+writepages_cryptcompress(struct address_space *mapping,
34849+ struct writeback_control *wbc)
34850+{
34851+ int result;
34852+ int to_capture;
34853+ pgoff_t nrpages;
34854+ pgoff_t index = 0;
34855+ cryptcompress_info_t *info;
34856+ struct inode *inode;
34857+
34858+ inode = mapping->host;
34859+ if (!crc_inode_has_anon_pages(inode)) {
34860+ result = 0;
34861+ goto end;
34862+ }
34863+
34864+ info = cryptcompress_inode_data(inode);
34865+ nrpages = count_to_nrpages(i_size_read(inode));
34866+
34867+ if (wbc->sync_mode != WB_SYNC_ALL)
34868+ to_capture =
34869+ min_count(wbc->nr_to_write, MAX_CLUSTERS_TO_CAPTURE(inode));
34870+ else
34871+ to_capture = MAX_CLUSTERS_TO_CAPTURE(inode);
34872+ do {
34873+ reiser4_context *ctx;
34874+
34875+ if (is_in_reiser4_context()) {
34876+ /* FIXME-EDWARD: REMOVEME */
34877+ all_grabbed2free();
34878+
34879+			/* We may be in the context of a write system call,
34880+			   invoked from balance_dirty_pages() */
34881+ if (down_read_trylock(&info->lock) == 0) {
34882+ result = RETERR(-EBUSY);
34883+ break;
34884+ }
34885+ } else
34886+ down_read(&info->lock);
34887+
34888+ ctx = init_context(inode->i_sb);
34889+ if (IS_ERR(ctx)) {
34890+ result = PTR_ERR(ctx);
34891+ break;
34892+ }
34893+ ctx->nobalance = 1;
34894+
34895+ assert("edward-1079",
34896+ lock_stack_isclean(get_current_lock_stack()));
34897+
34898+ LOCK_CNT_INC(inode_sem_r);
34899+
34900+ result =
34901+ capture_anonymous_clusters(inode->i_mapping, &index,
34902+ to_capture);
34903+
34904+ up_read(&info->lock);
34905+
34906+ LOCK_CNT_DEC(inode_sem_r);
34907+
34908+ if (result != 0 || wbc->sync_mode != WB_SYNC_ALL) {
34909+ reiser4_exit_context(ctx);
34910+ break;
34911+ }
34912+ result = txnmgr_force_commit_all(inode->i_sb, 0);
34913+ reiser4_exit_context(ctx);
34914+ } while (result == 0 && index < nrpages);
34915+
34916+ end:
34917+ if (is_in_reiser4_context()) {
34918+ if (get_current_context()->nr_captured >= CAPTURE_APAGE_BURST) {
34919+ /* there are already pages to flush, flush them out, do
34920+ not delay until end of reiser4_sync_inodes */
34921+ writeout(inode->i_sb, wbc);
34922+ get_current_context()->nr_captured = 0;
34923+ }
34924+ }
34925+ return result;
34926+}
34927+
34928+/* plugin->u.file.mmap */
34929+int mmap_cryptcompress(struct file *file, struct vm_area_struct *vma)
34930+{
34931+ //return -ENOSYS;
34932+ return generic_file_mmap(file, vma);
34933+}
34934+
34935+/* plugin->u.file.release */
34936+/* plugin->u.file.get_block */
34937+
34938+/* this is implementation of delete method of file plugin for
34939+ cryptcompress objects */
34940+int delete_cryptcompress(struct inode *inode)
34941+{
34942+ int result;
34943+
34944+ assert("edward-429", inode->i_nlink == 0);
34945+
34946+ if (inode->i_size) {
34947+ result = cryptcompress_truncate(inode, 0, 0);
34948+ if (result) {
34949+ warning("edward-430",
34950+ "cannot truncate cryptcompress file %lli: %i",
34951+ (unsigned long long)get_inode_oid(inode),
34952+ result);
34953+ return result;
34954+ }
34955+ }
34956+ /* and remove stat data */
34957+ return delete_object_common(inode);
34958+}
34959+
34960+/* plugin->u.file.setattr method
34961+ see plugin.h for description */
34962+int setattr_cryptcompress(struct dentry *dentry, /* Object to change attributes */
34963+ struct iattr *attr /* change description */ )
34964+{
34965+ int result;
34966+ struct inode *inode;
34967+
34968+ inode = dentry->d_inode;
34969+ result = check_cryptcompress(inode);
34970+ if (result)
34971+ return result;
34972+ if (attr->ia_valid & ATTR_SIZE) {
34973+ /* EDWARD-FIXME-HANS: VS-FIXME-HANS:
34974+ Q: this case occurs when? truncate?
34975+ A: yes
34976+
34977+ Q: If so, why isn't this code in truncate itself instead of here?
34978+
34979+		   A: because vfs calls the fs's truncate after it has called truncate_inode_pages to get rid of pages
34980+		   corresponding to the part of the file being truncated. In reiser4 this may leave unallocated
34981+		   extents which do not have jnodes. Flush code does not expect that. The solution of this problem is
34982+		   straightforward. As vfs's truncate is implemented via the setattr operation (the common implementation
34983+		   of which calls truncate_inode_pages and the fs's truncate when the file size changes), it seems
34984+		   reasonable to have reiser4_setattr which will take care of removing pages, jnodes and extents
34985+		   simultaneously in case of truncate.
34986+ Q: do you think implementing truncate using setattr is ugly,
34987+ and vfs needs improving, or is there some sense in which this is a good design?
34988+
34989+ A: VS-FIXME-HANS:
34990+ */
34991+
34992+ /* truncate does reservation itself and requires exclusive access obtained */
34993+ if (inode->i_size != attr->ia_size) {
34994+ reiser4_context *ctx;
34995+ loff_t old_size;
34996+ cryptcompress_info_t *info =
34997+ cryptcompress_inode_data(inode);
34998+
34999+ ctx = init_context(dentry->d_inode->i_sb);
35000+ if (IS_ERR(ctx))
35001+ return PTR_ERR(ctx);
35002+
35003+ down_write(&info->lock);
35004+ LOCK_CNT_INC(inode_sem_w);
35005+
35006+ inode_check_scale(inode, inode->i_size, attr->ia_size);
35007+
35008+ old_size = inode->i_size;
35009+
35010+ result =
35011+ cryptcompress_truncate(inode, attr->ia_size,
35012+ 1 /* update stat data */ );
35013+ if (result) {
35014+ warning("edward-1192",
35015+ "truncate_cryptcompress failed: oid %lli, "
35016+ "old size %lld, new size %lld, retval %d",
35017+ (unsigned long long)
35018+ get_inode_oid(inode), old_size,
35019+ attr->ia_size, result);
35020+ }
35021+ up_write(&info->lock);
35022+ LOCK_CNT_DEC(inode_sem_w);
35023+ context_set_commit_async(ctx);
35024+ reiser4_exit_context(ctx);
35025+ } else
35026+ result = 0;
35027+ } else
35028+ result = setattr_common(dentry, attr);
35029+ return result;
35030+}
35031+
35032+/* sendfile_cryptcompress - sendfile of struct file_operations */
35033+ssize_t
35034+sendfile_cryptcompress(struct file *file, loff_t *ppos, size_t count,
35035+ read_actor_t actor, void *target)
35036+{
35037+ reiser4_context *ctx;
35038+ ssize_t result;
35039+ struct inode *inode;
35040+ cryptcompress_info_t *info;
35041+
35042+ inode = file->f_dentry->d_inode;
35043+ ctx = init_context(inode->i_sb);
35044+ if (IS_ERR(ctx))
35045+ return PTR_ERR(ctx);
35046+ /*
35047+	 * generic_file_sendfile may want to call update_atime. Grab space for
35048+ * stat data update
35049+ */
35050+ result = reiser4_grab_space(estimate_update_common(inode),
35051+ BA_CAN_COMMIT);
35052+ if (result)
35053+ goto exit;
35054+ info = cryptcompress_inode_data(inode);
35055+ down_read(&info->lock);
35056+ result = generic_file_sendfile(file, ppos, count, actor, target);
35057+ up_read(&info->lock);
35058+ exit:
35059+ reiser4_exit_context(ctx);
35060+ return result;
35061+}
35062+
35063+/*
35064+ * release_cryptcompress - release of struct file_operations
35065+ * @inode: inode of released file
35066+ * @file: file to release
35067+ */
35068+int release_cryptcompress(struct inode *inode, struct file *file)
35069+{
35070+ reiser4_context *ctx = init_context(inode->i_sb);
35071+
35072+ if (IS_ERR(ctx))
35073+ return PTR_ERR(ctx);
35074+ reiser4_free_file_fsdata(file);
35075+ reiser4_exit_context(ctx);
35076+ return 0;
35077+}
35078+
35079+static int
35080+save_len_cryptcompress_plugin(struct inode *inode, reiser4_plugin * plugin)
35081+{
35082+ assert("edward-457", inode != NULL);
35083+ assert("edward-458", plugin != NULL);
35084+ assert("edward-459", plugin->h.id == CRC_FILE_PLUGIN_ID);
35085+ return 0;
35086+}
35087+
35088+static int
35089+load_cryptcompress_plugin(struct inode *inode, reiser4_plugin * plugin,
35090+ char **area, int *len)
35091+{
35092+ assert("edward-455", inode != NULL);
35093+ assert("edward-456", (reiser4_inode_data(inode)->pset != NULL));
35094+
35095+ plugin_set_file(&reiser4_inode_data(inode)->pset,
35096+ file_plugin_by_id(CRC_FILE_PLUGIN_ID));
35097+ return 0;
35098+}
35099+
35100+static int change_cryptcompress(struct inode *inode, reiser4_plugin * plugin)
35101+{
35102+ /* cannot change object plugin of already existing object */
35103+ return RETERR(-EINVAL);
35104+}
35105+
35106+struct reiser4_plugin_ops cryptcompress_plugin_ops = {
35107+ .load = load_cryptcompress_plugin,
35108+ .save_len = save_len_cryptcompress_plugin,
35109+ .save = NULL,
35110+ .alignment = 8,
35111+ .change = change_cryptcompress
35112+};
35113+
35114+/*
35115+ Local variables:
35116+ c-indentation-style: "K&R"
35117+ mode-name: "LC"
35118+ c-basic-offset: 8
35119+ tab-width: 8
35120+ fill-column: 80
35121+ scroll-step: 1
35122+ End:
35123+*/
35124Index: linux-2.6.16/fs/reiser4/plugin/file/cryptcompress.h
35125===================================================================
35126--- /dev/null
35127+++ linux-2.6.16/fs/reiser4/plugin/file/cryptcompress.h
35128@@ -0,0 +1,551 @@
35129+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
35130+/* See http://www.namesys.com/cryptcompress_design.html */
35131+
35132+#if !defined( __FS_REISER4_CRYPTCOMPRESS_H__ )
35133+#define __FS_REISER4_CRYPTCOMPRESS_H__
35134+
35135+#include "../compress/compress.h"
35136+#include "../crypto/cipher.h"
35137+
35138+#include <linux/pagemap.h>
35139+#include <linux/vmalloc.h>
35140+
35141+#define MIN_CLUSTER_SIZE PAGE_CACHE_SIZE
35142+#define MIN_CLUSTER_SHIFT PAGE_CACHE_SHIFT
35143+#define MAX_CLUSTER_SHIFT 16
35144+#define MAX_CLUSTER_NRPAGES (1U << MAX_CLUSTER_SHIFT >> PAGE_CACHE_SHIFT)
35145+#define DC_CHECKSUM_SIZE 4
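+
+/* For example, with 4KiB pages (PAGE_CACHE_SHIFT == 12) the largest
+   supported logical cluster is 1 << MAX_CLUSTER_SHIFT = 64KiB, so
+   MAX_CLUSTER_NRPAGES == (1U << 16) >> 12 == 16 pages (illustrative;
+   the page size is architecture-dependent). */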
35146+
35147+static inline loff_t min_count(loff_t a, loff_t b)
35148+{
35149+ return (a < b ? a : b);
35150+}
35151+
35152+static inline loff_t max_count(loff_t a, loff_t b)
35153+{
35154+ return (a > b ? a : b);
35155+}
35156+
35157+#if REISER4_DEBUG
35158+static inline int cluster_shift_ok(int shift)
35159+{
35160+ return (shift >= MIN_CLUSTER_SHIFT) && (shift <= MAX_CLUSTER_SHIFT);
35161+}
35162+#endif
35163+
35164+typedef struct tfm_stream {
35165+ __u8 *data;
35166+ size_t size;
35167+} tfm_stream_t;
35168+
35169+typedef enum {
35170+ INPUT_STREAM,
35171+ OUTPUT_STREAM,
35172+ LAST_STREAM
35173+} tfm_stream_id;
35174+
35175+typedef tfm_stream_t *tfm_unit[LAST_STREAM];
35176+
35177+static inline __u8 *ts_data(tfm_stream_t * stm)
35178+{
35179+ assert("edward-928", stm != NULL);
35180+ return stm->data;
35181+}
35182+
35183+static inline size_t ts_size(tfm_stream_t * stm)
35184+{
35185+ assert("edward-929", stm != NULL);
35186+ return stm->size;
35187+}
35188+
35189+static inline void set_ts_size(tfm_stream_t * stm, size_t size)
35190+{
35191+ assert("edward-930", stm != NULL);
35192+
35193+ stm->size = size;
35194+}
35195+
35196+static inline int alloc_ts(tfm_stream_t ** stm)
35197+{
35198+ assert("edward-931", stm);
35199+ assert("edward-932", *stm == NULL);
35200+
35201+ *stm = kmalloc(sizeof **stm, GFP_KERNEL);
35202+ if (*stm == NULL)
35203+ return -ENOMEM;
35204+ memset(*stm, 0, sizeof **stm);
35205+ return 0;
35206+}
35207+
35208+static inline void free_ts(tfm_stream_t * stm)
35209+{
35210+ assert("edward-933", !ts_data(stm));
35211+ assert("edward-934", !ts_size(stm));
35212+
35213+ kfree(stm);
35214+}
35215+
35216+static inline int alloc_ts_data(tfm_stream_t * stm, size_t size)
35217+{
35218+ assert("edward-935", !ts_data(stm));
35219+ assert("edward-936", !ts_size(stm));
35220+ assert("edward-937", size != 0);
35221+
35222+ stm->data = vmalloc(size);
35223+ if (!stm->data)
35224+ return -ENOMEM;
35225+ set_ts_size(stm, size);
35226+ return 0;
35227+}
35228+
35229+static inline void free_ts_data(tfm_stream_t * stm)
35230+{
35231+ assert("edward-938", equi(ts_data(stm), ts_size(stm)));
35232+
35233+ if (ts_data(stm))
35234+ vfree(ts_data(stm));
35235+ memset(stm, 0, sizeof *stm);
35236+}
35237+
35238+/* Write modes for item conversion in flush convert phase */
35239+typedef enum {
35240+ CRC_APPEND_ITEM = 1,
35241+ CRC_OVERWRITE_ITEM = 2,
35242+ CRC_CUT_ITEM = 3
35243+} crc_write_mode_t;
35244+
35245+typedef enum {
35246+ PCL_UNKNOWN = 0, /* invalid option */
35247+ PCL_APPEND = 1, /* append and/or overwrite */
35248+ PCL_TRUNCATE = 2 /* truncate */
35249+} page_cluster_op;
35250+
35251+/* Reiser4 file write/read transforms a page cluster into a disk cluster (and
35252+   back) using crypto/compression transforms implemented by reiser4 transform
35253+   plugins. Before each transform we allocate a pair of streams (tfm_unit) and
35254+   assemble the page cluster into the input one. After the transform we split
35255+   the output stream into a set of items (the disk cluster).
35256+*/
35257+typedef struct tfm_cluster {
35258+ coa_set coa;
35259+ tfm_unit tun;
35260+ tfm_action act;
35261+ int uptodate;
35262+ int lsize; /* size of the logical cluster */
35263+ int len; /* length of the transform stream */
35264+} tfm_cluster_t;
35265+
35266+static inline coa_t get_coa(tfm_cluster_t * tc, reiser4_compression_id id, tfm_action act)
35267+{
35268+ return tc->coa[id][act];
35269+}
35270+
35271+static inline void
35272+set_coa(tfm_cluster_t * tc, reiser4_compression_id id, tfm_action act, coa_t coa)
35273+{
35274+ tc->coa[id][act] = coa;
35275+}
35276+
35277+static inline int
35278+alloc_coa(tfm_cluster_t * tc, compression_plugin * cplug)
35279+{
35280+ coa_t coa;
35281+
35282+ coa = cplug->alloc(tc->act);
35283+ if (IS_ERR(coa))
35284+ return PTR_ERR(coa);
35285+ set_coa(tc, cplug->h.id, tc->act, coa);
35286+ return 0;
35287+}
35288+
35289+static inline int
35290+grab_coa(tfm_cluster_t * tc, compression_plugin * cplug)
35291+{
35292+ return (cplug->alloc && !get_coa(tc, cplug->h.id, tc->act) ?
35293+ alloc_coa(tc, cplug) : 0);
35294+}
35295+
35296+static inline void free_coa_set(tfm_cluster_t * tc)
35297+{
35298+ tfm_action j;
35299+ reiser4_compression_id i;
35300+ compression_plugin *cplug;
35301+
35302+ assert("edward-810", tc != NULL);
35303+
35304+ for (j = 0; j < LAST_TFM; j++)
35305+ for (i = 0; i < LAST_COMPRESSION_ID; i++) {
35306+ if (!get_coa(tc, i, j))
35307+ continue;
35308+ cplug = compression_plugin_by_id(i);
35309+ assert("edward-812", cplug->free != NULL);
35310+ cplug->free(get_coa(tc, i, j), j);
35311+ set_coa(tc, i, j, 0);
35312+ }
35313+ return;
35314+}
35315+
35316+static inline tfm_stream_t *tfm_stream(tfm_cluster_t * tc, tfm_stream_id id)
35317+{
35318+ return tc->tun[id];
35319+}
35320+
35321+static inline void
35322+set_tfm_stream(tfm_cluster_t * tc, tfm_stream_id id, tfm_stream_t * ts)
35323+{
35324+ tc->tun[id] = ts;
35325+}
35326+
35327+static inline __u8 *tfm_stream_data(tfm_cluster_t * tc, tfm_stream_id id)
35328+{
35329+ return ts_data(tfm_stream(tc, id));
35330+}
35331+
35332+static inline void
35333+set_tfm_stream_data(tfm_cluster_t * tc, tfm_stream_id id, __u8 * data)
35334+{
35335+ tfm_stream(tc, id)->data = data;
35336+}
35337+
35338+static inline size_t tfm_stream_size(tfm_cluster_t * tc, tfm_stream_id id)
35339+{
35340+ return ts_size(tfm_stream(tc, id));
35341+}
35342+
35343+static inline void
35344+set_tfm_stream_size(tfm_cluster_t * tc, tfm_stream_id id, size_t size)
35345+{
35346+ tfm_stream(tc, id)->size = size;
35347+}
35348+
35349+static inline int
35350+alloc_tfm_stream(tfm_cluster_t * tc, size_t size, tfm_stream_id id)
35351+{
35352+ assert("edward-939", tc != NULL);
35353+ assert("edward-940", !tfm_stream(tc, id));
35354+
35355+ tc->tun[id] = kmalloc(sizeof(tfm_stream_t), GFP_KERNEL);
35356+ if (!tc->tun[id])
35357+ return -ENOMEM;
35358+ memset(tfm_stream(tc, id), 0, sizeof(tfm_stream_t));
35359+ return alloc_ts_data(tfm_stream(tc, id), size);
35360+}
35361+
35362+static inline int
35363+realloc_tfm_stream(tfm_cluster_t * tc, size_t size, tfm_stream_id id)
35364+{
35365+ assert("edward-941", tfm_stream_size(tc, id) < size);
35366+ free_ts_data(tfm_stream(tc, id));
35367+ return alloc_ts_data(tfm_stream(tc, id), size);
35368+}
35369+
35370+static inline void free_tfm_stream(tfm_cluster_t * tc, tfm_stream_id id)
35371+{
35372+ free_ts_data(tfm_stream(tc, id));
35373+ free_ts(tfm_stream(tc, id));
35374+ set_tfm_stream(tc, id, 0);
35375+}
35376+
35377+static inline unsigned coa_overrun(compression_plugin * cplug, int ilen)
35378+{
35379+ return (cplug->overrun != NULL ? cplug->overrun(ilen) : 0);
35380+}
35381+
35382+static inline void free_tfm_unit(tfm_cluster_t * tc)
35383+{
35384+ tfm_stream_id id;
35385+ for (id = 0; id < LAST_STREAM; id++) {
35386+ if (!tfm_stream(tc, id))
35387+ continue;
35388+ free_tfm_stream(tc, id);
35389+ }
35390+}
35391+
35392+static inline void put_tfm_cluster(tfm_cluster_t * tc)
35393+{
35394+ assert("edward-942", tc != NULL);
35395+ free_coa_set(tc);
35396+ free_tfm_unit(tc);
35397+}
35398+
35399+static inline int tfm_cluster_is_uptodate(tfm_cluster_t * tc)
35400+{
35401+ assert("edward-943", tc != NULL);
35402+ assert("edward-944", tc->uptodate == 0 || tc->uptodate == 1);
35403+ return (tc->uptodate == 1);
35404+}
35405+
35406+static inline void tfm_cluster_set_uptodate(tfm_cluster_t * tc)
35407+{
35408+ assert("edward-945", tc != NULL);
35409+ assert("edward-946", tc->uptodate == 0 || tc->uptodate == 1);
35410+ tc->uptodate = 1;
35411+ return;
35412+}
35413+
35414+static inline void tfm_cluster_clr_uptodate(tfm_cluster_t * tc)
35415+{
35416+ assert("edward-947", tc != NULL);
35417+ assert("edward-948", tc->uptodate == 0 || tc->uptodate == 1);
35418+ tc->uptodate = 0;
35419+ return;
35420+}
35421+
35422+static inline int tfm_stream_is_set(tfm_cluster_t * tc, tfm_stream_id id)
35423+{
35424+ return (tfm_stream(tc, id) &&
35425+ tfm_stream_data(tc, id) && tfm_stream_size(tc, id));
35426+}
35427+
35428+static inline int tfm_cluster_is_set(tfm_cluster_t * tc)
35429+{
35430+ int i;
35431+ for (i = 0; i < LAST_STREAM; i++)
35432+ if (!tfm_stream_is_set(tc, i))
35433+ return 0;
35434+ return 1;
35435+}
35436+
35437+static inline void alternate_streams(tfm_cluster_t * tc)
35438+{
35439+ tfm_stream_t *tmp = tfm_stream(tc, INPUT_STREAM);
35440+
35441+ set_tfm_stream(tc, INPUT_STREAM, tfm_stream(tc, OUTPUT_STREAM));
35442+ set_tfm_stream(tc, OUTPUT_STREAM, tmp);
35443+}
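+
+/* Usage sketch (illustrative): after one transform stage has run, swapping
+   the streams lets its output feed the next stage without copying:
+
+	assemble page cluster into INPUT_STREAM;
+	transform (e.g. compress): INPUT_STREAM -> OUTPUT_STREAM;
+	alternate_streams(tc);
+	next transform (e.g. encrypt): INPUT_STREAM -> OUTPUT_STREAM;
+
+   which stages actually run depends on the plugins installed on the file. */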
35444+
35445+/* a kind of data that we can write to the window */
35446+typedef enum {
35447+	DATA_WINDOW,		/* the data we copy from user space */
35448+	HOLE_WINDOW		/* zeroes if we write a hole */
35449+} window_stat;
35450+
35451+/* Sliding window of cluster size which should be set to the appropriate
35452+   position (defined by the cluster index) in a file before page cluster
35453+   modification by file_write. We then translate the file size, the offset
35454+   to write from, the number of bytes to write, etc. into the following
35455+   configuration, needed to estimate the number of pages to read before
35456+   writing, etc. */
35457+typedef struct reiser4_slide {
35458+ unsigned off; /* offset we start to write/truncate from */
35459+ unsigned count; /* number of bytes (zeroes) to write/truncate */
35460+ unsigned delta; /* number of bytes to append to the hole */
35461+ window_stat stat; /* a kind of data to write to the window */
35462+} reiser4_slide_t;
35463+
35464+/* The following is a set of possible disk cluster states */
35465+typedef enum {
35466+ INVAL_DISK_CLUSTER, /* unknown state */
35467+ PREP_DISK_CLUSTER, /* disk cluster got converted by flush
35468+ at least 1 time */
35469+ UNPR_DISK_CLUSTER, /* disk cluster just created and should be
35470+ converted by flush */
35471+ FAKE_DISK_CLUSTER /* disk cluster doesn't exist neither in memory
35472+ nor on disk */
35473+} disk_cluster_stat;
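+
+/* Sketch of the typical disk cluster lifecycle implied by the states above:
+
+	FAKE_DISK_CLUSTER --(first modification)--> UNPR_DISK_CLUSTER
+	UNPR_DISK_CLUSTER --(converted by flush)--> PREP_DISK_CLUSTER
+
+   INVAL_DISK_CLUSTER only marks a cluster handle whose disk state has not
+   been determined yet (see reset_cluster_params()). */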
35474+
35475+/*
35476+   While implementing all transforms (from page to disk cluster, and back)
35477+   the reiser4 cluster manager fills the following structure encapsulating
35478+   pointers to all the clusters of the same index, including the sliding window above
35479+*/
35480+typedef struct reiser4_cluster {
35481+ tfm_cluster_t tc; /* transform cluster */
35482+ int nr_pages; /* number of pages */
35483+ struct page **pages; /* page cluster */
35484+ page_cluster_op op; /* page cluster operation */
35485+ struct file *file;
35486+ hint_t *hint; /* disk cluster item for traversal */
35487+ disk_cluster_stat dstat; /* state of the current disk cluster */
35488+ cloff_t index; /* offset in the units of cluster size */
35489+ reiser4_slide_t *win; /* sliding window of cluster size */
35490+ int reserved; /* this indicates that space for disk
35491+ cluster modification is reserved */
35492+#if REISER4_DEBUG
35493+ reiser4_context *ctx;
35494+ int reserved_prepped;
35495+ int reserved_unprepped;
35496+#endif
35497+
35498+} reiser4_cluster_t;
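+
+/* Typical lifecycle of a cluster handle, as used by the write path in
+   cryptcompress.c (sketch):
+
+	reiser4_slide_t win;
+	reiser4_cluster_t clust;
+
+	reiser4_slide_init(&win);
+	cluster_init_read(&clust, &win);
+	clust.hint = hint;
+	alloc_cluster_pgset(&clust, cluster_nrpages(inode));
+	... prepare_cluster(), modify pages, try_capture_cluster() ...
+	put_cluster_handle(&clust);
+*/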
35499+
35500+static inline __u8 * tfm_input_data (reiser4_cluster_t * clust)
35501+{
35502+ return tfm_stream_data(&clust->tc, INPUT_STREAM);
35503+}
35504+
35505+static inline __u8 * tfm_output_data (reiser4_cluster_t * clust)
35506+{
35507+ return tfm_stream_data(&clust->tc, OUTPUT_STREAM);
35508+}
35509+
35510+static inline int reset_cluster_pgset(reiser4_cluster_t * clust, int nrpages)
35511+{
35512+ assert("edward-1057", clust->pages != NULL);
35513+ memset(clust->pages, 0, sizeof(*clust->pages) * nrpages);
35514+ return 0;
35515+}
35516+
35517+static inline int alloc_cluster_pgset(reiser4_cluster_t * clust, int nrpages)
35518+{
35519+ assert("edward-949", clust != NULL);
35520+ assert("edward-1362", clust->pages == NULL);
35521+ assert("edward-950", nrpages != 0 && nrpages <= MAX_CLUSTER_NRPAGES);
35522+
35523+ clust->pages =
35524+ kmalloc(sizeof(*clust->pages) * nrpages, GFP_KERNEL);
35525+ if (!clust->pages)
35526+ return RETERR(-ENOMEM);
35527+ reset_cluster_pgset(clust, nrpages);
35528+ return 0;
35529+}
35530+
35531+static inline void free_cluster_pgset(reiser4_cluster_t * clust)
35532+{
35533+ assert("edward-951", clust->pages != NULL);
35534+ kfree(clust->pages);
35535+ clust->pages = NULL;
35536+}
35537+
35538+static inline void put_cluster_handle(reiser4_cluster_t * clust)
35539+{
35540+ assert("edward-435", clust != NULL);
35541+
35542+ put_tfm_cluster(&clust->tc);
35543+ if (clust->pages)
35544+ free_cluster_pgset(clust);
35545+ memset(clust, 0, sizeof *clust);
35546+}
35547+
35548+static inline void inc_keyload_count(crypto_stat_t * data)
35549+{
35550+ assert("edward-1410", data != NULL);
35551+ data->keyload_count++;
35552+}
35553+
35554+static inline void dec_keyload_count(crypto_stat_t * data)
35555+{
35556+ assert("edward-1411", data != NULL);
35557+ assert("edward-1412", data->keyload_count > 0);
35558+ data->keyload_count--;
35559+}
35560+
35561+/* cryptcompress specific part of reiser4_inode */
35562+typedef struct cryptcompress_info {
35563+ struct rw_semaphore lock;
35564+ crypto_stat_t *crypt;
35565+	int compress_toggle;	/* current status of compressibility;
35566+				   set by the compression mode plugin */
35567+#if REISER4_DEBUG
35568+ int pgcount; /* number of captured pages */
35569+#endif
35570+} cryptcompress_info_t;
35571+
35572+
35573+static inline void toggle_compression (cryptcompress_info_t * info, int val)
35574+{
35575+ info->compress_toggle = val;
35576+}
35577+
35578+static inline int compression_is_on (cryptcompress_info_t * info)
35579+{
35580+ return info->compress_toggle;
35581+}
35582+
35583+cryptcompress_info_t *cryptcompress_inode_data(const struct inode *);
35584+int equal_to_rdk(znode *, const reiser4_key *);
35585+int goto_right_neighbor(coord_t *, lock_handle *);
35586+int load_file_hint(struct file *, hint_t *);
35587+void save_file_hint(struct file *, const hint_t *);
35588+void hint_init_zero(hint_t *);
35589+int crc_inode_ok(struct inode *inode);
35590+int jnode_of_cluster(const jnode * node, struct page * page);
35591+extern int ctail_read_disk_cluster (reiser4_cluster_t *, struct inode *, int);
35592+extern int do_readpage_ctail(struct inode *, reiser4_cluster_t *,
35593+ struct page * page);
35594+extern int ctail_insert_unprepped_cluster(reiser4_cluster_t * clust,
35595+ struct inode * inode);
35596+int bind_cryptcompress(struct inode *child, struct inode *parent);
35597+void destroy_inode_cryptcompress(struct inode * inode);
35598+crypto_stat_t * inode_crypto_stat (struct inode * inode);
35599+void inherit_crypto_stat_common(struct inode * parent, struct inode * object,
35600+ int (*can_inherit)(struct inode * child,
35601+ struct inode * parent));
35602+void attach_crypto_stat(struct inode * inode, crypto_stat_t * info);
35603+void detach_crypto_stat(struct inode * inode);
35604+void change_crypto_stat(struct inode * inode, crypto_stat_t * new);
35605+crypto_stat_t * alloc_crypto_stat (struct inode * inode);
35606+
35607+
35608+static inline reiser4_tfma_t *
35609+info_get_tfma (crypto_stat_t * info, reiser4_tfm id)
35610+{
35611+ return &info->tfma[id];
35612+}
35613+
35614+static inline struct crypto_tfm *
35615+info_get_tfm (crypto_stat_t * info, reiser4_tfm id)
35616+{
35617+ return info_get_tfma(info, id)->tfm;
35618+}
35619+
35620+static inline void
35621+info_set_tfm (crypto_stat_t * info, reiser4_tfm id, struct crypto_tfm * tfm)
35622+{
35623+ info_get_tfma(info, id)->tfm = tfm;
35624+}
35625+
35626+static inline struct crypto_tfm *
35627+info_cipher_tfm (crypto_stat_t * info)
35628+{
35629+ return info_get_tfm(info, CIPHER_TFM);
35630+}
35631+
35632+static inline struct crypto_tfm *
35633+info_digest_tfm (crypto_stat_t * info)
35634+{
35635+ return info_get_tfm(info, DIGEST_TFM);
35636+}
35637+
35638+static inline cipher_plugin *
35639+info_cipher_plugin (crypto_stat_t * info)
35640+{
35641+ return &info_get_tfma(info, CIPHER_TFM)->plug->cipher;
35642+}
35643+
35644+static inline digest_plugin *
35645+info_digest_plugin (crypto_stat_t * info)
35646+{
35647+ return &info_get_tfma(info, DIGEST_TFM)->plug->digest;
35648+}
35649+
35650+static inline void
35651+info_set_plugin(crypto_stat_t * info, reiser4_tfm id, reiser4_plugin * plugin)
35652+{
35653+ info_get_tfma(info, id)->plug = plugin;
35654+}
35655+
35656+static inline void
35657+info_set_cipher_plugin(crypto_stat_t * info, cipher_plugin * cplug)
35658+{
35659+ info_set_plugin(info, CIPHER_TFM, cipher_plugin_to_plugin(cplug));
35660+}
35661+
35662+static inline void
35663+info_set_digest_plugin(crypto_stat_t * info, digest_plugin * plug)
35664+{
35665+ info_set_plugin(info, DIGEST_TFM, digest_plugin_to_plugin(plug));
35666+}
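+
+/* Editor's note: an illustrative (non-original) sketch of how the accessors
+   above compose when wiring a cipher into a crypto_stat_t. The plugin and
+   the transform are assumed to have been obtained elsewhere. */
+static inline void info_wire_cipher_sketch(crypto_stat_t * info,
+					   cipher_plugin * cplug,
+					   struct crypto_tfm *tfm)
+{
+	info_set_cipher_plugin(info, cplug);	/* record the plugin */
+	info_set_tfm(info, CIPHER_TFM, tfm);	/* record the transform */
+}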
35667+
35668+#endif /* __FS_REISER4_CRYPTCOMPRESS_H__ */
35669+
35670+/* Make Linus happy.
35671+ Local variables:
35672+ c-indentation-style: "K&R"
35673+ mode-name: "LC"
35674+ c-basic-offset: 8
35675+ tab-width: 8
35676+ fill-column: 120
35677+ scroll-step: 1
35678+ End:
35679+*/
35680Index: linux-2.6.16/fs/reiser4/plugin/file/file.c
35681===================================================================
35682--- /dev/null
35683+++ linux-2.6.16/fs/reiser4/plugin/file/file.c
35684@@ -0,0 +1,2712 @@
35685+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
35686+ * reiser4/README */
35687+
35688+/*
35689+ * this file contains implementations of inode/file/address_space/file plugin
35690+ * operations specific for "unix file plugin" (plugin id is
35691+ * UNIX_FILE_PLUGIN_ID). "Unix file" is either built of tail items only
35692+ * (FORMATTING_ID), or of extent items only (EXTENT_POINTER_ID), or is empty
35693+ * (has no items but stat data)
35694+ */
35695+
35696+#include "../../inode.h"
35697+#include "../../super.h"
35698+#include "../../tree_walk.h"
35699+#include "../../carry.h"
35700+#include "../../page_cache.h"
35701+#include "../../ioctl.h"
35702+#include "../object.h"
35703+#include "../../safe_link.h"
35704+
35705+#include <linux/writeback.h>
35706+#include <linux/pagevec.h>
35707+#include <linux/syscalls.h>
35708+
35709+
35710+static int unpack(struct file *file, struct inode *inode, int forever);
35711+
35712+/* get unix file plugin specific portion of inode */
35713+unix_file_info_t *unix_file_inode_data(const struct inode *inode)
35714+{
35715+ return &reiser4_inode_data(inode)->file_plugin_data.unix_file_info;
35716+}
35717+
35718+/**
35719+ * equal_to_rdk - compare key and znode's right delimiting key
35720+ * @node: node whose right delimiting key to compare with @key
35721+ * @key: key to compare with @node's right delimiting key
35722+ *
35723+ * Returns true if @key is equal to right delimiting key of @node.
35724+ */
35725+int equal_to_rdk(znode *node, const reiser4_key *key)
35726+{
35727+ int result;
35728+
35729+ read_lock_dk(znode_get_tree(node));
35730+ result = keyeq(key, znode_get_rd_key(node));
35731+ read_unlock_dk(znode_get_tree(node));
35732+ return result;
35733+}
35734+
35735+#if REISER4_DEBUG
35736+
35737+/**
35738+ * equal_to_ldk - compare key and znode's left delimiting key
35739+ * @node: node whose left delimiting key to compare with @key
35740+ * @key: key to compare with @node's left delimiting key
35741+ *
35742+ * Returns true if @key is equal to left delimiting key of @node.
35743+ */
35744+int equal_to_ldk(znode *node, const reiser4_key *key)
35745+{
35746+ int result;
35747+
35748+ read_lock_dk(znode_get_tree(node));
35749+ result = keyeq(key, znode_get_ld_key(node));
35750+ read_unlock_dk(znode_get_tree(node));
35751+ return result;
35752+}
35753+
35754+/**
35755+ * check_coord - check whether coord corresponds to key
35756+ * @coord: coord to check
35757+ * @key: key @coord has to correspond to
35758+ *
35759+ * Returns true if @coord is set as if it was set as result of lookup with @key
35760+ * in coord->node.
35761+ */
35762+static int check_coord(const coord_t *coord, const reiser4_key *key)
35763+{
35764+ coord_t twin;
35765+
35766+ node_plugin_by_node(coord->node)->lookup(coord->node, key,
35767+ FIND_MAX_NOT_MORE_THAN, &twin);
35768+ return coords_equal(coord, &twin);
35769+}
35770+
35771+#endif /* REISER4_DEBUG */
35772+
35773+/**
35774+ * init_uf_coord - initialize extended coord
35775+ * @uf_coord: extended coord to initialize
35776+ * @lh: lock handle to attach to @uf_coord
35777+ *
35778+ * Zeroes @uf_coord and hooks @lh into it.
35779+ */
35780+void init_uf_coord(uf_coord_t *uf_coord, lock_handle *lh)
35781+{
35782+ coord_init_zero(&uf_coord->coord);
35783+ coord_clear_iplug(&uf_coord->coord);
35784+ uf_coord->lh = lh;
35785+ init_lh(lh);
35786+ memset(&uf_coord->extension, 0, sizeof(uf_coord->extension));
35787+ uf_coord->valid = 0;
35788+}
35789+
35790+void validate_extended_coord(uf_coord_t *uf_coord, loff_t offset)
35791+{
35792+ assert("vs-1333", uf_coord->valid == 0);
35793+
35794+ if (coord_is_between_items(&uf_coord->coord))
35795+ return;
35796+
35797+ assert("vs-1348",
35798+ item_plugin_by_coord(&uf_coord->coord)->s.file.
35799+ init_coord_extension);
35800+
35801+ item_body_by_coord(&uf_coord->coord);
35802+ item_plugin_by_coord(&uf_coord->coord)->s.file.
35803+ init_coord_extension(uf_coord, offset);
35804+}
35805+
35806+/**
35807+ * goto_right_neighbor - lock right neighbor, drop current node lock
35808+ * @coord: coord of the currently locked node
35809+ * @lh: lock handle of the currently locked node
35810+ *
35811+ * Obtain lock on right neighbor and drop lock on current node.
35812+ */
35813+int goto_right_neighbor(coord_t *coord, lock_handle *lh)
35814+{
35815+ int result;
35816+ lock_handle lh_right;
35817+
35818+ assert("vs-1100", znode_is_locked(coord->node));
35819+
35820+ init_lh(&lh_right);
35821+ result = reiser4_get_right_neighbor(&lh_right, coord->node,
35822+ znode_is_wlocked(coord->node) ?
35823+ ZNODE_WRITE_LOCK : ZNODE_READ_LOCK,
35824+ GN_CAN_USE_UPPER_LEVELS);
35825+ if (result) {
35826+ done_lh(&lh_right);
35827+ return result;
35828+ }
35829+
35830+ /*
35831+	 * we hold two longterm locks on neighboring nodes. Unlock the left
35832+	 * one
35833+ */
35834+ done_lh(lh);
35835+
35836+ coord_init_first_unit_nocheck(coord, lh_right.node);
35837+ move_lh(lh, &lh_right);
35838+
35839+ return 0;
35840+
35841+}
35842+
35843+/**
35844+ * set_file_state
35845+ * @uf_info: unix file info to update
35846+ * @cbk_result: result of the tree lookup
35847+ * @level: tree level at which the lookup stopped
35848+ *
35849+ * This is used by find_file_item and find_file_state to determine the
35850+ * real state of the file
35851+ */
35852+static void set_file_state(unix_file_info_t *uf_info, int cbk_result,
35853+ tree_level level)
35854+{
35855+ if (cbk_errored(cbk_result))
35856+ /* error happened in find_file_item */
35857+ return;
35858+
35859+ assert("vs-1164", level == LEAF_LEVEL || level == TWIG_LEVEL);
35860+
35861+ if (uf_info->container == UF_CONTAINER_UNKNOWN) {
35862+ /*
35863+ * container is unknown, therefore conversion can not be in
35864+ * progress
35865+ */
35866+ assert("", !inode_get_flag(unix_file_info_to_inode(uf_info),
35867+ REISER4_PART_IN_CONV));
35868+ if (cbk_result == CBK_COORD_NOTFOUND)
35869+ uf_info->container = UF_CONTAINER_EMPTY;
35870+ else if (level == LEAF_LEVEL)
35871+ uf_info->container = UF_CONTAINER_TAILS;
35872+ else
35873+ uf_info->container = UF_CONTAINER_EXTENTS;
35874+ } else {
35875+ /*
35876+ * file state is known, check whether it is set correctly if
35877+ * file is not being tail converted
35878+ */
35879+ if (!inode_get_flag(unix_file_info_to_inode(uf_info),
35880+ REISER4_PART_IN_CONV)) {
35881+ assert("vs-1162",
35882+ ergo(level == LEAF_LEVEL &&
35883+ cbk_result == CBK_COORD_FOUND,
35884+ uf_info->container == UF_CONTAINER_TAILS));
35885+ assert("vs-1165",
35886+ ergo(level == TWIG_LEVEL &&
35887+ cbk_result == CBK_COORD_FOUND,
35888+ uf_info->container == UF_CONTAINER_EXTENTS));
35889+ }
35890+ }
35891+}
35892+
35893+int find_file_item_nohint(coord_t *coord, lock_handle *lh,
35894+ const reiser4_key *key, znode_lock_mode lock_mode,
35895+ struct inode *inode)
35896+{
35897+ return object_lookup(inode, key, coord, lh, lock_mode,
35898+ FIND_MAX_NOT_MORE_THAN,
35899+ TWIG_LEVEL, LEAF_LEVEL,
35900+ (lock_mode == ZNODE_READ_LOCK) ? CBK_UNIQUE :
35901+ (CBK_UNIQUE | CBK_FOR_INSERT),
35902+ NULL /* ra_info */ );
35903+}
35904+
35905+/**
35906+ * find_file_item - look for file item in the tree
35907+ * @hint: provides coordinate, lock handle, seal
35908+ * @key: key for search
35909+ * @lock_mode: mode of lock to put on returned node
35910+ * @inode: inode of the file being searched
35912+ *
35913+ * This finds position in the tree corresponding to @key. It first tries to use
35914+ * @hint's seal if it is set.
35915+ */
35916+int find_file_item(hint_t *hint, const reiser4_key *key,
35917+ znode_lock_mode lock_mode,
35918+ struct inode *inode)
35919+{
35920+ int result;
35921+ coord_t *coord;
35922+ lock_handle *lh;
35923+
35924+ assert("nikita-3030", schedulable());
35925+ assert("vs-1707", hint != NULL);
35926+ assert("vs-47", inode != NULL);
35927+
35928+ coord = &hint->ext_coord.coord;
35929+ lh = hint->ext_coord.lh;
35930+ init_lh(lh);
35931+
35932+ result = hint_validate(hint, key, 1 /* check key */, lock_mode);
35933+ if (!result) {
35934+ if (coord->between == AFTER_UNIT &&
35935+ equal_to_rdk(coord->node, key)) {
35936+ result = goto_right_neighbor(coord, lh);
35937+ if (result == -E_NO_NEIGHBOR)
35938+ return RETERR(-EIO);
35939+ if (result)
35940+ return result;
35941+ assert("vs-1152", equal_to_ldk(coord->node, key));
35942+ /*
35943+			 * we moved to a different node. Invalidate coord
35944+ * extension, zload is necessary to init it again
35945+ */
35946+ hint->ext_coord.valid = 0;
35947+ }
35948+
35949+ set_file_state(unix_file_inode_data(inode), CBK_COORD_FOUND,
35950+ znode_get_level(coord->node));
35951+
35952+ return CBK_COORD_FOUND;
35953+ }
35954+
35955+ coord_init_zero(coord);
35956+ result = find_file_item_nohint(coord, lh, key, lock_mode, inode);
35957+ set_file_state(unix_file_inode_data(inode), result,
35958+ znode_get_level(coord->node));
35959+
35960+ /* FIXME: we might already have coord extension initialized */
35961+ hint->ext_coord.valid = 0;
35962+ return result;
35963+}
35964+
35965+/* plugin->u.file.write_flow = NULL
35966+ plugin->u.file.read_flow = NULL */
35967+
35968+void hint_init_zero(hint_t * hint)
35969+{
35970+ memset(hint, 0, sizeof(*hint));
35971+ init_lh(&hint->lh);
35972+ hint->ext_coord.lh = &hint->lh;
35973+}
35974+
35975+static int find_file_state(struct inode *inode, unix_file_info_t *uf_info)
35976+{
35977+ int result;
35978+ reiser4_key key;
35979+ coord_t coord;
35980+ lock_handle lh;
35981+
35982+ assert("vs-1628", ea_obtained(uf_info));
35983+
35984+ if (uf_info->container == UF_CONTAINER_UNKNOWN) {
35985+ key_by_inode_and_offset_common(inode, 0, &key);
35986+ init_lh(&lh);
35987+ result = find_file_item_nohint(&coord, &lh, &key,
35988+ ZNODE_READ_LOCK, inode);
35989+ set_file_state(uf_info, result, znode_get_level(coord.node));
35990+ done_lh(&lh);
35991+ if (!cbk_errored(result))
35992+ result = 0;
35993+ } else
35994+ result = 0;
35995+ assert("vs-1074",
35996+ ergo(result == 0, uf_info->container != UF_CONTAINER_UNKNOWN));
35997+ txn_restart_current();
35998+ return result;
35999+}
36000+
36001+/* estimate and reserve space needed to truncate a page which gets partially truncated: one block for the page
36002+   itself, a stat data update (estimate_one_insert_into_item) and one item insertion (estimate_one_insert_into_item)
36003+   which may happen if the page corresponds to a hole extent and an unallocated one has to be created */
36004+static int reserve_partial_page(reiser4_tree * tree)
36005+{
36006+ grab_space_enable();
36007+ return reiser4_grab_reserved(reiser4_get_current_sb(),
36008+ 1 +
36009+ 2 * estimate_one_insert_into_item(tree),
36010+ BA_CAN_COMMIT);
36011+}
36012+
36013+/* estimate and reserve space needed to cut one item and update one stat data */
36014+static int reserve_cut_iteration(reiser4_tree * tree)
36015+{
36016+ __u64 estimate = estimate_one_item_removal(tree)
36017+ + estimate_one_insert_into_item(tree);
36018+
36019+ assert("nikita-3172", lock_stack_isclean(get_current_lock_stack()));
36020+
36021+ grab_space_enable();
36022+ /* We need to double our estimate now that we can delete more than one
36023+ node. */
36024+ return reiser4_grab_reserved(reiser4_get_current_sb(), estimate * 2,
36025+ BA_CAN_COMMIT);
36026+}
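+
+/* Editor's note: illustrative (non-original) pairing of the reservation
+   helpers above, mirroring cut_file_items() below. Every call to
+   reiser4_grab_reserved() is followed by reiser4_release_reserved(), which
+   also does up(sbinfo->delete_sema). */
+static inline int reserved_cut_sketch(struct inode *inode)
+{
+	int ret;
+
+	ret = reserve_cut_iteration(tree_by_inode(inode));
+	if (ret == 0) {
+		/* ... cut items and update stat data here ... */
+	}
+	/* releases reserved space and does up(sbinfo->delete_sema) */
+	reiser4_release_reserved(inode->i_sb);
+	return ret;
+}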
36027+
36028+int update_file_size(struct inode *inode, reiser4_key * key, int update_sd)
36029+{
36030+ int result = 0;
36031+
36032+ INODE_SET_FIELD(inode, i_size, get_key_offset(key));
36033+ if (update_sd) {
36034+ inode->i_ctime = inode->i_mtime = CURRENT_TIME;
36035+ result = reiser4_update_sd(inode);
36036+ }
36037+ return result;
36038+}
36039+
36040+/* cut file items one by one starting from the last one until new file size (inode->i_size) is reached. Reserve space
36041+ and update file stat data on every single cut from the tree */
36042+int
36043+cut_file_items(struct inode *inode, loff_t new_size, int update_sd,
36044+ loff_t cur_size, int (*update_actor) (struct inode *,
36045+ reiser4_key *, int))
36046+{
36047+ reiser4_key from_key, to_key;
36048+ reiser4_key smallest_removed;
36049+ file_plugin *fplug = inode_file_plugin(inode);
36050+ int result;
36051+ int progress = 0;
36052+
36053+ assert("vs-1248",
36054+ fplug == file_plugin_by_id(UNIX_FILE_PLUGIN_ID) ||
36055+ fplug == file_plugin_by_id(CRC_FILE_PLUGIN_ID));
36056+
36057+ fplug->key_by_inode(inode, new_size, &from_key);
36058+ to_key = from_key;
36059+ set_key_offset(&to_key, cur_size - 1 /*get_key_offset(max_key()) */ );
36060+ /* this loop normally runs just once */
36061+ while (1) {
36062+ result = reserve_cut_iteration(tree_by_inode(inode));
36063+ if (result)
36064+ break;
36065+
36066+ result = cut_tree_object(current_tree, &from_key, &to_key,
36067+ &smallest_removed, inode, 1,
36068+ &progress);
36069+ if (result == -E_REPEAT) {
36070+ /* -E_REPEAT is a signal to interrupt a long file truncation process */
36071+ if (progress) {
36072+ result =
36073+ update_actor(inode, &smallest_removed,
36074+ update_sd);
36075+ if (result)
36076+ break;
36077+ }
36078+
36079+			/* the below does up(sbinfo->delete_sema). Do not get fooled */
36080+ reiser4_release_reserved(inode->i_sb);
36081+
36082+ /* cut_tree_object() was interrupted probably because
36083+ * current atom requires commit, we have to release
36084+ * transaction handle to allow atom commit. */
36085+ txn_restart_current();
36086+ continue;
36087+ }
36088+ if (result
36089+ && !(result == CBK_COORD_NOTFOUND && new_size == 0
36090+ && inode->i_size == 0))
36091+ break;
36092+
36093+ set_key_offset(&smallest_removed, new_size);
36094+ /* Final sd update after the file gets its correct size */
36095+ result = update_actor(inode, &smallest_removed, update_sd);
36096+ break;
36097+ }
36098+
36099+	/* the below does up(sbinfo->delete_sema). Do not get fooled */
36100+ reiser4_release_reserved(inode->i_sb);
36101+
36102+ return result;
36103+}
36104+
36105+int find_or_create_extent(struct page *page);
36106+
36107+static int filler(void *vp, struct page *page)
36108+{
36109+ return readpage_unix_file_nolock(vp, page);
36110+}
36111+
36112+/* part of truncate_file_body: it is called when truncate is used to make file
36113+ shorter */
36114+static int shorten_file(struct inode *inode, loff_t new_size)
36115+{
36116+ int result;
36117+ struct page *page;
36118+ int padd_from;
36119+ unsigned long index;
36120+ char *kaddr;
36121+ unix_file_info_t *uf_info;
36122+
36123+ /*
36124+	 * all items of an ordinary reiser4 file are grouped together. That
36125+	 * is why we can use cut_tree. Plan B files (for instance) cannot be
36126+	 * truncated that simply
36127+ */
36128+ result = cut_file_items(inode, new_size, 1 /*update_sd */ ,
36129+ get_key_offset(max_key()), update_file_size);
36130+ if (result)
36131+ return result;
36132+
36133+ uf_info = unix_file_inode_data(inode);
36134+ assert("vs-1105", new_size == inode->i_size);
36135+ if (new_size == 0) {
36136+ uf_info->container = UF_CONTAINER_EMPTY;
36137+ return 0;
36138+ }
36139+
36140+ result = find_file_state(inode, uf_info);
36141+ if (result)
36142+ return result;
36143+ if (uf_info->container == UF_CONTAINER_TAILS)
36144+ /*
36145+ * No need to worry about zeroing last page after new file
36146+ * end
36147+ */
36148+ return 0;
36149+
36150+ padd_from = inode->i_size & (PAGE_CACHE_SIZE - 1);
36151+ if (!padd_from)
36152+ /* file is truncated to page boundary */
36153+ return 0;
36154+
36155+ result = reserve_partial_page(tree_by_inode(inode));
36156+ if (result) {
36157+ reiser4_release_reserved(inode->i_sb);
36158+ return result;
36159+ }
36160+
36161+ /* last page is partially truncated - zero its content */
36162+ index = (inode->i_size >> PAGE_CACHE_SHIFT);
36163+ page = read_cache_page(inode->i_mapping, index, filler, NULL);
36164+ if (IS_ERR(page)) {
36165+ /*
36166+ * the below does up(sbinfo->delete_sema). Do not get
36167+ * confused
36168+ */
36169+ reiser4_release_reserved(inode->i_sb);
36170+ if (likely(PTR_ERR(page) == -EINVAL)) {
36171+ /* looks like file is built of tail items */
36172+ return 0;
36173+ }
36174+ return PTR_ERR(page);
36175+ }
36176+ wait_on_page_locked(page);
36177+ if (!PageUptodate(page)) {
36178+ page_cache_release(page);
36179+ /*
36180+ * the below does up(sbinfo->delete_sema). Do not get
36181+ * confused
36182+ */
36183+ reiser4_release_reserved(inode->i_sb);
36184+ return RETERR(-EIO);
36185+ }
36186+
36187+ /*
36188+	 * if the page corresponds to a hole extent unit, an unallocated one
36189+	 * will be created here. This is not necessary
36190+ */
36191+ result = find_or_create_extent(page);
36192+
36193+ /*
36194+ * FIXME: cut_file_items has already updated inode. Probably it would
36195+ * be better to update it here when file is really truncated
36196+ */
36197+ if (result) {
36198+ page_cache_release(page);
36199+ /*
36200+ * the below does up(sbinfo->delete_sema). Do not get
36201+ * confused
36202+ */
36203+ reiser4_release_reserved(inode->i_sb);
36204+ return result;
36205+ }
36206+
36207+ lock_page(page);
36208+ assert("vs-1066", PageLocked(page));
36209+ kaddr = kmap_atomic(page, KM_USER0);
36210+ memset(kaddr + padd_from, 0, PAGE_CACHE_SIZE - padd_from);
36211+ flush_dcache_page(page);
36212+ kunmap_atomic(kaddr, KM_USER0);
36213+ unlock_page(page);
36214+ page_cache_release(page);
36215+ /* the below does up(sbinfo->delete_sema). Do not get confused */
36216+ reiser4_release_reserved(inode->i_sb);
36217+ return 0;
36218+}
36219+
36220+/**
36221+ * should_have_notail
36222+ * @uf_info: unix file info of the file
36223+ * @new_size: file size to check against the formatting policy
36224+ *
36225+ * Calls the formatting plugin to see whether a file of size @new_size has to
36226+ * be stored in unformatted nodes or in tail items. 0 is returned for the latter case.
36227+ */
36228+static int should_have_notail(const unix_file_info_t *uf_info, loff_t new_size)
36229+{
36230+ if (!uf_info->tplug)
36231+ return 1;
36232+ return !uf_info->tplug->have_tail(unix_file_info_to_inode(uf_info),
36233+ new_size);
36234+
36235+}
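+
+/* Editor's note: illustrative (non-original) use of the policy check above;
+   truncate_file_body() below applies the same logic when expanding a file. */
+static inline int pick_container_sketch(const unix_file_info_t * uf_info,
+					loff_t size)
+{
+	/* nonzero policy result: store as extents (unformatted nodes);
+	   zero: store as tail items */
+	return should_have_notail(uf_info, size) ?
+	    UF_CONTAINER_EXTENTS : UF_CONTAINER_TAILS;
+}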
36236+
36237+/**
36238+ * truncate_file_body - change length of file
36239+ * @inode: inode of file
36240+ * @new_size: new file length
36241+ *
36242+ * Adjusts items file @inode is built of to match @new_size. It may either cut
36243+ * items or add them to represent a hole at the end of file. The caller has to
36244+ * obtain exclusive access to the file.
36245+ */
36246+static int truncate_file_body(struct inode *inode, loff_t new_size)
36247+{
36248+ int result;
36249+
36250+ if (inode->i_size < new_size) {
36251+ /* expanding truncate */
36252+ struct dentry dentry;
36253+ struct file file;
36254+ unix_file_info_t *uf_info;
36255+
36256+ dentry.d_inode = inode;
36257+ file.f_dentry = &dentry;
36258+		file.private_data = NULL;
36259+		file.f_pos = new_size;
36261+ uf_info = unix_file_inode_data(inode);
36262+ result = find_file_state(inode, uf_info);
36263+ if (result)
36264+ return result;
36265+
36266+ if (should_have_notail(uf_info, new_size)) {
36267+ /*
36268+ * file of size @new_size has to be built of
36269+ * extents. If it is built of tails - convert to
36270+ * extents
36271+ */
36272+ if (uf_info->container == UF_CONTAINER_TAILS) {
36273+ /*
36274+				 * if file is being converted by another process
36275+ * - wait until it completes
36276+ */
36277+ while (1) {
36278+ if (inode_get_flag(inode, REISER4_PART_IN_CONV)) {
36279+ drop_exclusive_access(uf_info);
36280+ schedule();
36281+ get_exclusive_access(uf_info);
36282+ continue;
36283+ }
36284+ break;
36285+ }
36286+
36287+ if (uf_info->container == UF_CONTAINER_TAILS) {
36288+ result = tail2extent(uf_info);
36289+ if (result)
36290+ return result;
36291+ }
36292+ }
36293+ result = write_extent(&file, NULL, 0, &new_size);
36294+ if (result)
36295+ return result;
36296+ uf_info->container = UF_CONTAINER_EXTENTS;
36297+ } else {
36298+ if (uf_info->container == UF_CONTAINER_EXTENTS) {
36299+ result = write_extent(&file, NULL, 0, &new_size);
36300+ if (result)
36301+ return result;
36302+ } else {
36303+ result = write_tail(&file, NULL, 0, &new_size);
36304+ if (result)
36305+ return result;
36306+ uf_info->container = UF_CONTAINER_TAILS;
36307+ }
36308+ }
36309+ BUG_ON(result > 0);
36310+ INODE_SET_FIELD(inode, i_size, new_size);
36311+ file_update_time(&file);
36312+ result = reiser4_update_sd(inode);
36313+ BUG_ON(result != 0);
36314+ reiser4_free_file_fsdata(&file);
36315+ } else
36316+ result = shorten_file(inode, new_size);
36317+ return result;
36318+}
36319+
36320+/* plugin->u.write_sd_by_inode = write_sd_by_inode_common */
36321+
36322+/**
36323+ * load_file_hint - copy hint from struct file to local variable
36324+ * @file: file to get hint from
36325+ * @hint: structure to fill
36326+ *
36327+ * Reiser4 specific portion of struct file may contain information (hint)
36328+ * stored on exiting from previous read or write. That information includes
36329+ * seal of znode and coord within that znode where previous read or write
36330+ * stopped. This function copies that information to @hint if it was stored,
36331+ * or initializes @hint with zeros otherwise.
36332+ */
36333+int load_file_hint(struct file *file, hint_t *hint)
36334+{
36335+ reiser4_file_fsdata *fsdata;
36336+
36337+ if (file) {
36338+ fsdata = reiser4_get_file_fsdata(file);
36339+ if (IS_ERR(fsdata))
36340+ return PTR_ERR(fsdata);
36341+
36342+ spin_lock_inode(file->f_dentry->d_inode);
36343+ if (seal_is_set(&fsdata->reg.hint.seal)) {
36344+ *hint = fsdata->reg.hint;
36345+ init_lh(&hint->lh);
36346+ hint->ext_coord.lh = &hint->lh;
36347+ spin_unlock_inode(file->f_dentry->d_inode);
36348+ /*
36349+ * force re-validation of the coord on the first
36350+ * iteration of the read/write loop.
36351+ */
36352+ hint->ext_coord.valid = 0;
36353+ assert("nikita-19892", coords_equal(&hint->seal.coord1,
36354+ &hint->ext_coord.
36355+ coord));
36356+ return 0;
36357+ }
36358+ memset(&fsdata->reg.hint, 0, sizeof(hint_t));
36359+ spin_unlock_inode(file->f_dentry->d_inode);
36360+ }
36361+ hint_init_zero(hint);
36362+ return 0;
36363+}
36364+
36365+/**
36366+ * save_file_hint - copy hint to reiser4 private struct file's part
36367+ * @file: file to save hint in
36368+ * @hint: hint to save
36369+ *
36370+ * This copies @hint to the reiser4 private part of struct file. It can help
36371+ * speed up future accesses to the file.
36372+ */
36373+void save_file_hint(struct file *file, const hint_t *hint)
36374+{
36375+ reiser4_file_fsdata *fsdata;
36376+
36377+ assert("edward-1337", hint != NULL);
36378+
36379+ if (!file || !seal_is_set(&hint->seal))
36380+ return;
36381+ fsdata = reiser4_get_file_fsdata(file);
36382+ assert("vs-965", !IS_ERR(fsdata));
36383+ assert("nikita-19891",
36384+ coords_equal(&hint->seal.coord1, &hint->ext_coord.coord));
36385+ assert("vs-30", hint->lh.owner == NULL);
36386+ spin_lock_inode(file->f_dentry->d_inode);
36387+ fsdata->reg.hint = *hint;
36388+ spin_unlock_inode(file->f_dentry->d_inode);
36389+ return;
36390+}
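+
+/* Editor's note: an illustrative (non-original) sketch of the load/save
+   protocol around a read or write loop. The hint is placed on the stack for
+   brevity; the real callers below kmalloc a hint_t because it is large. */
+static inline int hint_roundtrip_sketch(struct file *file)
+{
+	hint_t hint;
+	int ret;
+
+	ret = load_file_hint(file, &hint);	/* seal of last access, or zeros */
+	if (ret)
+		return ret;
+	/* ... look up items with find_file_item(&hint, ...), read or write ... */
+	save_file_hint(file, &hint);	/* persist the seal for the next call */
+	done_lh(&hint.lh);
+	return 0;
+}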
36391+
36392+void unset_hint(hint_t * hint)
36393+{
36394+ assert("vs-1315", hint);
36395+ hint->ext_coord.valid = 0;
36396+ seal_done(&hint->seal);
36397+ done_lh(&hint->lh);
36398+}
36399+
36400+/* coord must already be set properly, so set_hint has nothing to adjust */
36401+void set_hint(hint_t * hint, const reiser4_key * key, znode_lock_mode mode)
36402+{
36403+ ON_DEBUG(coord_t * coord = &hint->ext_coord.coord);
36404+ assert("vs-1207", WITH_DATA(coord->node, check_coord(coord, key)));
36405+
36406+ seal_init(&hint->seal, &hint->ext_coord.coord, key);
36407+ hint->offset = get_key_offset(key);
36408+ hint->mode = mode;
36409+ done_lh(&hint->lh);
36410+}
36411+
36412+int hint_is_set(const hint_t * hint)
36413+{
36414+ return seal_is_set(&hint->seal);
36415+}
36416+
36417+#if REISER4_DEBUG
36418+static int all_but_offset_key_eq(const reiser4_key * k1, const reiser4_key * k2)
36419+{
36420+ return (get_key_locality(k1) == get_key_locality(k2) &&
36421+ get_key_type(k1) == get_key_type(k2) &&
36422+ get_key_band(k1) == get_key_band(k2) &&
36423+ get_key_ordering(k1) == get_key_ordering(k2) &&
36424+ get_key_objectid(k1) == get_key_objectid(k2));
36425+}
36426+#endif
36427+
36428+int
36429+hint_validate(hint_t * hint, const reiser4_key * key, int check_key,
36430+ znode_lock_mode lock_mode)
36431+{
36432+ if (!hint || !hint_is_set(hint) || hint->mode != lock_mode)
36433+ /* hint either not set or set by different operation */
36434+ return RETERR(-E_REPEAT);
36435+
36436+ assert("vs-1277", all_but_offset_key_eq(key, &hint->seal.key));
36437+
36438+ if (check_key && get_key_offset(key) != hint->offset)
36439+ /* hint is set for different key */
36440+ return RETERR(-E_REPEAT);
36441+
36442+ assert("vs-31", hint->ext_coord.lh == &hint->lh);
36443+ return seal_validate(&hint->seal, &hint->ext_coord.coord, key,
36444+ hint->ext_coord.lh, lock_mode, ZNODE_LOCK_LOPRI);
36445+}
36446+
36447+int xversion;
36448+
36449+/**
36450+ * find_or_create_extent - make sure @page is covered by an extent
36451+ * @page: page to capture
36452+ *
36453+ * Looks for a place at the twig level for an extent corresponding to @page,
36454+ * calls the extent's writepage method to create an unallocated extent if it
36455+ * does not exist yet, initializes the jnode and captures the page.
36456+ */
36457+int find_or_create_extent(struct page *page)
36458+{
36459+ int result;
36460+ struct inode *inode;
36461+ int plugged_hole;
36462+
36463+ jnode *node;
36464+
36465+ assert("vs-1065", page->mapping && page->mapping->host);
36466+ inode = page->mapping->host;
36467+
36468+ lock_page(page);
36469+ node = jnode_of_page(page);
36470+ unlock_page(page);
36471+ if (IS_ERR(node))
36472+ return PTR_ERR(node);
36473+
36474+ if (node->blocknr == 0) {
36475+ plugged_hole = 0;
36476+ result = update_extent(inode, node,
36477+ (loff_t)page->index << PAGE_CACHE_SHIFT,
36478+ &plugged_hole);
36479+ if (result) {
36480+ jput(node);
36481+ warning("", "update_extent failed: %d", result);
36482+ return result;
36483+ }
36484+ if (plugged_hole)
36485+ reiser4_update_sd(inode);
36486+ } else {
36487+ spin_lock_jnode(node);
36488+ result = try_capture(node, ZNODE_WRITE_LOCK, 0);
36489+ BUG_ON(result != 0);
36490+ jnode_make_dirty_locked(node);
36491+ spin_unlock_jnode(node);
36492+ }
36493+
36494+ BUG_ON(node->atom == NULL);
36495+ jput(node);
36496+
36497+ if (get_current_context()->entd) {
36498+ entd_context *ent = get_entd_context(node->tree->super);
36499+
36500+ if (ent->cur_request->page == page)
36501+ ent->cur_request->node = node;
36502+ }
36503+ return 0;
36504+}
36505+
36506+/**
36507+ * has_anonymous_pages - check whether inode has pages dirtied via mmap
36508+ * @inode: inode to check
36509+ *
36510+ * Returns true if inode's mapping has dirty pages which do not belong to any
36511+ * atom. Those are either tagged PAGECACHE_TAG_REISER4_MOVED in mapping's page
36512+ * tree or were eflushed and can be found via jnodes tagged
36513+ * EFLUSH_TAG_ANONYMOUS in radix tree of jnodes.
36514+ */
36515+static int has_anonymous_pages(struct inode *inode)
36516+{
36517+ int result;
36518+
36519+ read_lock_irq(&inode->i_mapping->tree_lock);
36520+ result = radix_tree_tagged(&inode->i_mapping->page_tree, PAGECACHE_TAG_REISER4_MOVED);
36521+ read_unlock_irq(&inode->i_mapping->tree_lock);
36522+ return result;
36523+}
36524+
36525+/**
36526+ * capture_page_and_create_extent -
36527+ * @page: page to be captured
36528+ *
36529+ * Grabs space for extent creation and stat data update and calls function to
36530+ * do actual work.
36531+ */
36532+static int capture_page_and_create_extent(struct page *page)
36533+{
36534+ int result;
36535+ struct inode *inode;
36536+
36537+ assert("vs-1084", page->mapping && page->mapping->host);
36538+ inode = page->mapping->host;
36539+ assert("vs-1139",
36540+ unix_file_inode_data(inode)->container == UF_CONTAINER_EXTENTS);
36541+ /* page belongs to file */
36542+ assert("vs-1393",
36543+ inode->i_size > ((loff_t) page->index << PAGE_CACHE_SHIFT));
36544+
36545+ /* page capture may require extent creation (if it does not exist yet)
36546+ and stat data's update (number of blocks changes on extent
36547+ creation) */
36548+ grab_space_enable();
36549+ result =
36550+ reiser4_grab_space(2 *
36551+ estimate_one_insert_into_item(tree_by_inode
36552+ (inode)),
36553+ BA_CAN_COMMIT);
36554+ if (likely(!result))
36555+ result = find_or_create_extent(page);
36556+
36557+ if (result != 0)
36558+ SetPageError(page);
36559+ return result;
36560+}
36561+
36562+/* this is the implementation of the commit_write method of struct
36563+   address_space_operations for the unix file plugin */
36564+int
36565+commit_write_unix_file(struct file *file, struct page *page,
36566+ unsigned from, unsigned to)
36567+{
36568+ reiser4_context *ctx;
36569+ struct inode *inode;
36570+ int result;
36571+
36572+ assert("umka-3101", file != NULL);
36573+ assert("umka-3102", page != NULL);
36574+ assert("umka-3093", PageLocked(page));
36575+
36576+ SetPageUptodate(page);
36577+
36578+ inode = page->mapping->host;
36579+ ctx = init_context(page->mapping->host->i_sb);
36580+ if (IS_ERR(ctx))
36581+ return PTR_ERR(ctx);
36582+ page_cache_get(page);
36583+ unlock_page(page);
36584+ result = capture_page_and_create_extent(page);
36585+ lock_page(page);
36586+ page_cache_release(page);
36587+
36588+ /* don't commit transaction under inode semaphore */
36589+ context_set_commit_async(ctx);
36590+ reiser4_exit_context(ctx);
36591+ return result;
36592+}
36593+
36594+/*
36595+ * Support for "anonymous" pages and jnodes.
36596+ *
36597+ * When a file is write-accessed through mmap, pages can be dirtied from user
36598+ * level. In this case the kernel is not notified until one of the following happens:
36599+ *
36600+ * (1) msync()
36601+ *
36602+ * (2) truncate() (either explicit or through unlink)
36603+ *
36604+ * (3) VM scanner starts reclaiming mapped pages, dirtying them before
36605+ * starting write-back.
36606+ *
36607+ * As a result of (3) ->writepage may be called on a dirty page without
36608+ * jnode. Such a page is called "anonymous" in reiser4. Certain workloads
36609+ * (iozone) generate a huge number of anonymous pages. Emergency flush handles
36610+ * this situation by creating jnode for anonymous page, starting IO on the
36611+ * page, and marking jnode with JNODE_KEEPME bit so that it's not thrown out of
36612+ * memory. Such jnode is also called anonymous.
36613+ *
36614+ * reiser4_sync_sb() method tries to insert anonymous pages and jnodes into
36615+ * tree. This is done by capture_anonymous_*() functions below.
36616+ */
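+
+/* Editor's note: an illustrative (non-original) sketch of the tagging the
+   text above describes. The real tagging happens in reiser4's set_page_dirty
+   path; this only shows the mechanism that capture_anonymous_pages() below
+   relies on. */
+static inline void mark_page_moved_sketch(struct address_space *mapping,
+					  struct page *page)
+{
+	write_lock_irq(&mapping->tree_lock);
+	/* remember the page so that writepages can later capture it */
+	radix_tree_tag_set(&mapping->page_tree, page->index,
+			   PAGECACHE_TAG_REISER4_MOVED);
+	write_unlock_irq(&mapping->tree_lock);
+}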
36617+
36618+/**
36619+ * capture_anonymous_page - involve page into transaction
36620+ * @pg: page to deal with
36621+ *
36622+ * Takes care that @page has corresponding metadata in the tree, creates jnode
36623+ * for @page and captures it. On success 1 is returned.
36624+ */
36625+static int capture_anonymous_page(struct page *page)
36626+{
36627+ int result;
36628+
36629+ if (PageWriteback(page))
36630+ /* FIXME: do nothing? */
36631+ return 0;
36632+
36633+ result = capture_page_and_create_extent(page);
36634+ if (result == 0) {
36635+ result = 1;
36636+ } else
36637+ warning("nikita-3329",
36638+ "Cannot capture anon page: %i", result);
36639+
36640+ return result;
36641+}
36642+
36643+/**
36644+ * capture_anonymous_pages - find and capture pages dirtied via mmap
36645+ * @mapping: address space where to look for pages
36646+ * @index: start index
36647+ * @to_capture: maximum number of pages to capture
36648+ *
36649+ * Looks for pages tagged REISER4_MOVED starting from the *@index-th page,
36650+ * captures them (involves them into an atom), returns the number of captured
36651+ * pages, and updates @index to the next page after the last captured one.
36652+ */
36653+static int
36654+capture_anonymous_pages(struct address_space *mapping, pgoff_t *index,
36655+ unsigned int to_capture)
36656+{
36657+ int result;
36658+ struct pagevec pvec;
36659+ unsigned int i, count;
36660+ int nr;
36661+
36662+ pagevec_init(&pvec, 0);
36663+ count = min(pagevec_space(&pvec), to_capture);
36664+ nr = 0;
36665+
36666+ /* find pages tagged MOVED */
36667+ write_lock_irq(&mapping->tree_lock);
36668+ pvec.nr = radix_tree_gang_lookup_tag(&mapping->page_tree,
36669+ (void **)pvec.pages, *index, count,
36670+ PAGECACHE_TAG_REISER4_MOVED);
36671+ if (pagevec_count(&pvec) == 0) {
36672+ /*
36673+ * there are no pages tagged MOVED in mapping->page_tree
36674+ * starting from *index
36675+ */
36676+ write_unlock_irq(&mapping->tree_lock);
36677+ *index = (pgoff_t)-1;
36678+ return 0;
36679+ }
36680+
36681+ /* clear MOVED tag for all found pages */
36682+ for (i = 0; i < pagevec_count(&pvec); i++) {
36683+ void *p;
36684+
36685+ page_cache_get(pvec.pages[i]);
36686+ p = radix_tree_tag_clear(&mapping->page_tree, pvec.pages[i]->index,
36687+ PAGECACHE_TAG_REISER4_MOVED);
36688+ assert("vs-49", p == pvec.pages[i]);
36689+ }
36690+ write_unlock_irq(&mapping->tree_lock);
36691+
36692+
36693+ *index = pvec.pages[i - 1]->index + 1;
36694+
36695+ for (i = 0; i < pagevec_count(&pvec); i++) {
36696+ /*
36697+ * tag PAGECACHE_TAG_REISER4_MOVED will be cleared by
36698+ * set_page_dirty_internal which is called when jnode is
36699+ * captured
36700+ */
36701+ result = capture_anonymous_page(pvec.pages[i]);
36702+ if (result == 1)
36703+ nr++;
36704+ else {
36705+ if (result < 0) {
36706+ warning("vs-1454",
36707+ "failed to capture page: "
36708+ "result=%d, captured=%d)\n",
36709+ result, i);
36710+
36711+ /*
36712+				 * set the MOVED tag on all pages that were
36713+				 * left uncaptured
36714+ */
36715+ write_lock_irq(&mapping->tree_lock);
36716+ for (; i < pagevec_count(&pvec); i ++) {
36717+ radix_tree_tag_set(&mapping->page_tree,
36718+ pvec.pages[i]->index,
36719+ PAGECACHE_TAG_REISER4_MOVED);
36720+ }
36721+ write_unlock_irq(&mapping->tree_lock);
36722+
36723+ pagevec_release(&pvec);
36724+ return result;
36725+ } else {
36726+ /*
36727+ * result == 0. capture_anonymous_page returns
36728+ * 0 for Writeback-ed page. Set MOVED tag on
36729+ * that page
36730+ */
36731+ write_lock_irq(&mapping->tree_lock);
36732+ radix_tree_tag_set(&mapping->page_tree,
36733+ pvec.pages[i]->index,
36734+ PAGECACHE_TAG_REISER4_MOVED);
36735+ write_unlock_irq(&mapping->tree_lock);
36736+ if (i == 0)
36737+ *index = pvec.pages[0]->index;
36738+ else
36739+ *index = pvec.pages[i - 1]->index + 1;
36740+ }
36741+ }
36742+ }
36743+ pagevec_release(&pvec);
36744+ return nr;
36745+}
36746+
36747+/**
36748+ * capture_anonymous_jnodes - find and capture anonymous jnodes
36749+ * @mapping: address space where to look for jnodes
36750+ * @from: start index
36751+ * @to: end index
36752+ * @to_capture: maximum number of jnodes to capture
36753+ *
36754+ * Looks for jnodes tagged EFLUSH_TAG_ANONYMOUS in inode's tree of jnodes in
36755+ * the range of indexes @from-@to and captures them, returns number of captured
36756+ * jnodes, updates @from to next jnode after the last captured one.
36757+ */
36758+static int
36759+capture_anonymous_jnodes(struct address_space *mapping,
36760+ pgoff_t *from, pgoff_t to, int to_capture)
36761+{
36762+ *from = to;
36763+ return 0;
36764+}
36765+
36766+/*
36767+ * Commit atom of the jnode of a page.
36768+ */
36769+static int sync_page(struct page *page)
36770+{
36771+ int result;
36772+ do {
36773+ jnode *node;
36774+ txn_atom *atom;
36775+
36776+ lock_page(page);
36777+ node = jprivate(page);
36778+ if (node != NULL) {
36779+ spin_lock_jnode(node);
36780+ atom = jnode_get_atom(node);
36781+ spin_unlock_jnode(node);
36782+ } else
36783+ atom = NULL;
36784+ unlock_page(page);
36785+ result = sync_atom(atom);
36786+ } while (result == -E_REPEAT);
36787+ /*
36788+ * ZAM-FIXME-HANS: document the logic of this loop, is it just to
36789+ * handle the case where more pages get added to the atom while we are
36790+ * syncing it?
36791+ */
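+	/*
+	 * Editor's note (assumption): the restart appears to cover atom
+	 * fusion: while sync_atom() runs, the page's jnode can migrate into
+	 * another atom, so the jnode/atom pair is looked up again until
+	 * sync_atom() stops returning -E_REPEAT.
+	 */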
36792+ assert("nikita-3485", ergo(result == 0,
36793+ get_current_context()->trans->atom == NULL));
36794+ return result;
36795+}
36796+
36797+/*
36798+ * Commit atoms of all pages of the inode:
36799+ * call sync_page for each page found in the mapping's page tree
36800+ */
36801+static int sync_page_list(struct inode *inode)
36802+{
36803+ int result;
36804+ struct address_space *mapping;
36805+ unsigned long from; /* start index for radix_tree_gang_lookup */
36806+ unsigned int found; /* return value for radix_tree_gang_lookup */
36807+
36808+ mapping = inode->i_mapping;
36809+ from = 0;
36810+ result = 0;
36811+ read_lock_irq(&mapping->tree_lock);
36812+ while (result == 0) {
36813+ struct page *page;
36814+
36815+ found =
36816+ radix_tree_gang_lookup(&mapping->page_tree, (void **)&page,
36817+ from, 1);
36818+ assert("", found < 2);
36819+ if (found == 0)
36820+ break;
36821+
36822+		/* the page cannot leave the radix tree because it is protected from
36823+		   truncation by inode->i_mutex, which sys_fsync holds */
36824+ page_cache_get(page);
36825+ read_unlock_irq(&mapping->tree_lock);
36826+
36827+ from = page->index + 1;
36828+
36829+ result = sync_page(page);
36830+
36831+ page_cache_release(page);
36832+ read_lock_irq(&mapping->tree_lock);
36833+ }
36834+
36835+ read_unlock_irq(&mapping->tree_lock);
36836+ return result;
36837+}
36838+
36839+static int commit_file_atoms(struct inode *inode)
36840+{
36841+ int result;
36842+ unix_file_info_t *uf_info;
36843+
36844+ uf_info = unix_file_inode_data(inode);
36845+
36846+ get_exclusive_access(uf_info);
36847+ /*
36848+ * find what items file is made from
36849+ */
36850+ result = find_file_state(inode, uf_info);
36851+ drop_exclusive_access(uf_info);
36852+ if (result != 0)
36853+ return result;
36854+
36855+ /*
36856+ * file state cannot change because we are under ->i_mutex
36857+ */
36858+ switch (uf_info->container) {
36859+ case UF_CONTAINER_EXTENTS:
36860+		/* find_file_state might open or join an atom */
36861+ txn_restart_current();
36862+ result =
36863+ /*
36864+ * when we are called by
36865+ * filemap_fdatawrite->
36866+ * do_writepages()->
36867+ * reiser4_writepages()
36868+ *
36869+		     * inode->i_mapping->dirty_pages are spliced into
36870+ * ->io_pages, leaving ->dirty_pages dirty.
36871+ *
36872+ * When we are called from
36873+ * reiser4_fsync()->sync_unix_file(), we have to
36874+ * commit atoms of all pages on the ->dirty_list.
36875+ *
36876+ * So for simplicity we just commit ->io_pages and
36877+ * ->dirty_pages.
36878+ */
36879+ sync_page_list(inode);
36880+ break;
36881+ case UF_CONTAINER_TAILS:
36882+ /*
36883+ * NOTE-NIKITA probably we can be smarter for tails. For now
36884+ * just commit all existing atoms.
36885+ */
36886+ result = txnmgr_force_commit_all(inode->i_sb, 0);
36887+ break;
36888+ case UF_CONTAINER_EMPTY:
36889+ result = 0;
36890+ break;
36891+ case UF_CONTAINER_UNKNOWN:
36892+ default:
36893+ result = -EIO;
36894+ break;
36895+ }
36896+
36897+ /*
36898+ * commit current transaction: there can be captured nodes from
36899+ * find_file_state() and finish_conversion().
36900+ */
36901+ txn_restart_current();
36902+ return result;
36903+}
36904+
36905+/**
36906+ * writepages_unix_file - writepages of struct address_space_operations
36907+ * @mapping: address space to capture dirty pages of
36908+ * @wbc: writeback control passed by the VM
36909+ *
36910+ * This captures anonymous pages and anonymous jnodes. Anonymous pages are
36911+ * pages which are dirtied via mmap. Anonymous jnodes are ones which were
36912+ * created by reiser4_writepage.
36913+ */
36914+int writepages_unix_file(struct address_space *mapping,
36915+ struct writeback_control *wbc)
36916+{
36917+ int result;
36918+ unix_file_info_t *uf_info;
36919+ pgoff_t pindex, jindex, nr_pages;
36920+ long to_capture;
36921+ struct inode *inode;
36922+
36923+ inode = mapping->host;
36924+ if (!has_anonymous_pages(inode)) {
36925+ result = 0;
36926+ goto end;
36927+ }
36928+ jindex = pindex = wbc->start >> PAGE_CACHE_SHIFT;
36929+ result = 0;
36930+ nr_pages =
36931+ (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
36932+ uf_info = unix_file_inode_data(inode);
36933+
36934+ do {
36935+ reiser4_context *ctx;
36936+
36937+ if (wbc->sync_mode != WB_SYNC_ALL)
36938+ to_capture = min(wbc->nr_to_write, CAPTURE_APAGE_BURST);
36939+ else
36940+ to_capture = CAPTURE_APAGE_BURST;
36941+
36942+ ctx = init_context(inode->i_sb);
36943+ if (IS_ERR(ctx)) {
36944+ result = PTR_ERR(ctx);
36945+ break;
36946+ }
36947+ /* avoid recursive calls to ->sync_inodes */
36948+ ctx->nobalance = 1;
36949+ assert("zam-760", lock_stack_isclean(get_current_lock_stack()));
36950+ assert("", LOCK_CNT_NIL(inode_sem_w));
36951+ assert("", LOCK_CNT_NIL(inode_sem_r));
36952+
36953+ txn_restart_current();
36954+
36955+ /* we have to get nonexclusive access to the file */
36956+ if (get_current_context()->entd) {
36957+ /*
36958+ * use nonblocking version of nonexclusive_access to
36959+ * avoid deadlock which might look like the following:
36960+ * process P1 holds NEA on file F1 and called entd to
36961+ * reclaim some memory. Entd works for P1 and is going
36962+ * to capture pages of file F2. To do that entd has to
36963+ * get NEA to F2. F2 is held by process P2 which also
36964+ * called entd. But entd is serving P1 at the moment
36965+			 * and P2 has to wait. Process P3 is trying to get EA
36966+			 * on file F2. The existence of a pending EA request on
36967+			 * file F2 makes it impossible for entd to get NEA on
36968+			 * F2, so none of these processes can continue. Using
36969+			 * the nonblocking version of getting NEA is supposed
36970+			 * to avoid this deadlock.
36971+ */
36972+ if (try_to_get_nonexclusive_access(uf_info) == 0) {
36973+ result = RETERR(-EBUSY);
36974+ reiser4_exit_context(ctx);
36975+ break;
36976+ }
36977+ } else
36978+ get_nonexclusive_access(uf_info);
36979+
36980+ while (to_capture > 0) {
36981+ pgoff_t start;
36982+
36983+ assert("vs-1727", jindex <= pindex);
36984+ if (pindex == jindex) {
36985+ start = pindex;
36986+ result =
36987+ capture_anonymous_pages(inode->i_mapping,
36988+ &pindex,
36989+ to_capture);
36990+ if (result <= 0)
36991+ break;
36992+ to_capture -= result;
36993+ wbc->nr_to_write -= result;
36994+ if (start + result == pindex) {
36995+ jindex = pindex;
36996+ continue;
36997+ }
36998+ if (to_capture <= 0)
36999+ break;
37000+ }
37001+ /* deal with anonymous jnodes between jindex and pindex */
37002+ result =
37003+ capture_anonymous_jnodes(inode->i_mapping, &jindex,
37004+ pindex, to_capture);
37005+ if (result < 0)
37006+ break;
37007+ to_capture -= result;
37008+ get_current_context()->nr_captured += result;
37009+
37010+ if (jindex == (pgoff_t) - 1) {
37011+ assert("vs-1728", pindex == (pgoff_t) - 1);
37012+ break;
37013+ }
37014+ }
37015+ if (to_capture <= 0)
37016+			/* there may be more pages left */
37017+ __mark_inode_dirty(inode, I_DIRTY_PAGES);
37018+
37019+ drop_nonexclusive_access(uf_info);
37020+ if (result < 0) {
37021+ /* error happened */
37022+ reiser4_exit_context(ctx);
37023+ return result;
37024+ }
37025+ if (wbc->sync_mode != WB_SYNC_ALL) {
37026+ reiser4_exit_context(ctx);
37027+ return 0;
37028+ }
37029+ result = commit_file_atoms(inode);
37030+ reiser4_exit_context(ctx);
37031+ if (pindex >= nr_pages && jindex == pindex)
37032+ break;
37033+ } while (1);
37034+
37035+ end:
37036+ if (is_in_reiser4_context()) {
37037+ if (get_current_context()->nr_captured >= CAPTURE_APAGE_BURST) {
37038+ /*
37039+ * there are already pages to flush, flush them out, do
37040+ * not delay until end of reiser4_sync_inodes
37041+ */
37042+ writeout(inode->i_sb, wbc);
37043+ get_current_context()->nr_captured = 0;
37044+ }
37045+ }
37046+ return result;
37047+}
37048+
37049+/*
37050+ * ->sync() method for unix file.
37051+ *
37052+ * We are trying to be smart here. Instead of committing all atoms (original
37053+ * solution), we scan dirty pages of this file and commit all atoms they are
37054+ * part of.
37055+ *
37056+ * Situation is complicated by anonymous pages: i.e., extent-less pages
37057+ * dirtied through mmap. Fortunately sys_fsync() first calls
37058+ * filemap_fdatawrite() that will ultimately call reiser4_writepages(), insert
37059+ * all missing extents and capture anonymous pages.
37060+ */
37061+int sync_unix_file(struct file *file, struct dentry *dentry, int datasync)
37062+{
37063+ reiser4_context *ctx;
37064+ txn_atom *atom;
37065+ reiser4_block_nr reserve;
37066+
37067+ ctx = init_context(dentry->d_inode->i_sb);
37068+ if (IS_ERR(ctx))
37069+ return PTR_ERR(ctx);
37070+
37071+ reserve = estimate_update_common(dentry->d_inode);
37072+ if (reiser4_grab_space(reserve, BA_CAN_COMMIT)) {
37073+ reiser4_exit_context(ctx);
37074+ return RETERR(-ENOSPC);
37075+ }
37076+ write_sd_by_inode_common(dentry->d_inode);
37077+
37078+ atom = get_current_atom_locked();
37079+ spin_lock_txnh(ctx->trans);
37080+ force_commit_atom(ctx->trans);
37081+ reiser4_exit_context(ctx);
37082+ return 0;
37083+}
37084+
37085+/**
37086+ * readpage_unix_file_nolock - readpage of struct address_space_operations
37087+ * @file: file @page belongs to (may be NULL)
37088+ * @page: page to read
37089+ *
37090+ * Compose a key and search for item containing information about @page
37091+ * data. If item is found - its readpage method is called.
37092+ */
37093+int readpage_unix_file_nolock(struct file *file, struct page *page)
37094+{
37095+ reiser4_context *ctx;
37096+ int result;
37097+ struct inode *inode;
37098+ reiser4_key key;
37099+ item_plugin *iplug;
37100+ hint_t *hint;
37101+ lock_handle *lh;
37102+ coord_t *coord;
37103+
37104+ assert("vs-1062", PageLocked(page));
37105+ assert("vs-976", !PageUptodate(page));
37106+ assert("vs-1061", page->mapping && page->mapping->host);
37107+
37108+ if ((page->mapping->host->i_size <=
37109+ ((loff_t) page->index << PAGE_CACHE_SHIFT))) {
37110+ /* page is out of file already */
37111+ unlock_page(page);
37112+ return -EINVAL;
37113+ }
37114+
37115+ inode = page->mapping->host;
37116+ ctx = init_context(inode->i_sb);
37117+ if (IS_ERR(ctx)) {
37118+ unlock_page(page);
37119+ return PTR_ERR(ctx);
37120+ }
37121+
37122+ hint = kmalloc(sizeof(*hint), get_gfp_mask());
37123+ if (hint == NULL) {
37124+ unlock_page(page);
37125+ reiser4_exit_context(ctx);
37126+ return RETERR(-ENOMEM);
37127+ }
37128+
37129+ result = load_file_hint(file, hint);
37130+ if (result) {
37131+ kfree(hint);
37132+ unlock_page(page);
37133+ reiser4_exit_context(ctx);
37134+ return result;
37135+ }
37136+ lh = &hint->lh;
37137+
37138+ /* get key of first byte of the page */
37139+ key_by_inode_and_offset_common(inode,
37140+ (loff_t) page->index << PAGE_CACHE_SHIFT,
37141+ &key);
37142+
37143+ /* look for file metadata corresponding to first byte of page */
37144+ page_cache_get(page);
37145+ unlock_page(page);
37146+ result = find_file_item(hint, &key, ZNODE_READ_LOCK, inode);
37147+ lock_page(page);
37148+ page_cache_release(page);
37149+
37150+ if (page->mapping == NULL) {
37151+ /*
37152+ * readpage allows truncate to run concurrently. Page was
37153+ * truncated while it was not locked
37154+ */
37155+ done_lh(lh);
37156+ kfree(hint);
37157+ unlock_page(page);
37158+ txn_restart(ctx);
37159+ reiser4_exit_context(ctx);
37160+ return -EINVAL;
37161+ }
37162+
37163+ if (result != CBK_COORD_FOUND || hint->ext_coord.coord.between != AT_UNIT) {
37164+ if (result == CBK_COORD_FOUND &&
37165+ hint->ext_coord.coord.between != AT_UNIT)
37166+ /* file is truncated */
37167+ result = -EINVAL;
37168+ done_lh(lh);
37169+ kfree(hint);
37170+ unlock_page(page);
37171+ txn_restart(ctx);
37172+ reiser4_exit_context(ctx);
37173+ return result;
37174+ }
37175+
37176+ /*
37177+ * item corresponding to page is found. It can not be removed because
37178+ * znode lock is held
37179+ */
37180+ if (PageUptodate(page)) {
37181+ done_lh(lh);
37182+ kfree(hint);
37183+ unlock_page(page);
37184+ txn_restart(ctx);
37185+ reiser4_exit_context(ctx);
37186+ return 0;
37187+ }
37188+
37189+ coord = &hint->ext_coord.coord;
37190+ result = zload(coord->node);
37191+ if (result) {
37192+ done_lh(lh);
37193+ kfree(hint);
37194+ unlock_page(page);
37195+ txn_restart(ctx);
37196+ reiser4_exit_context(ctx);
37197+ return result;
37198+ }
37199+
37200+ validate_extended_coord(&hint->ext_coord,
37201+ (loff_t) page->index << PAGE_CACHE_SHIFT);
37202+
37203+ if (!coord_is_existing_unit(coord)) {
37204+ /* this indicates corruption */
37205+ warning("vs-280",
37206+ "Looking for page %lu of file %llu (size %lli). "
37207+ "No file items found (%d). File is corrupted?\n",
37208+ page->index, (unsigned long long)get_inode_oid(inode),
37209+ inode->i_size, result);
37210+ zrelse(coord->node);
37211+ done_lh(lh);
37212+ kfree(hint);
37213+ unlock_page(page);
37214+ txn_restart(ctx);
37215+ reiser4_exit_context(ctx);
37216+ return RETERR(-EIO);
37217+ }
37218+
37219+ /*
37220+	 * get the plugin of the found item, or use the extent plugin if
37221+	 * there is none
37222+ */
37223+ iplug = item_plugin_by_coord(coord);
37224+ if (iplug->s.file.readpage)
37225+ result = iplug->s.file.readpage(coord, page);
37226+ else
37227+ result = RETERR(-EINVAL);
37228+
37229+ if (!result) {
37230+ set_key_offset(&key,
37231+ (loff_t) (page->index + 1) << PAGE_CACHE_SHIFT);
37232+ /* FIXME should call set_hint() */
37233+ unset_hint(hint);
37234+ } else {
37235+ unlock_page(page);
37236+ unset_hint(hint);
37237+ }
37238+ assert("vs-979",
37239+ ergo(result == 0, (PageLocked(page) || PageUptodate(page))));
37240+ assert("vs-9791", ergo(result != 0, !PageLocked(page)));
37241+
37242+ zrelse(coord->node);
37243+ done_lh(lh);
37244+
37245+ save_file_hint(file, hint);
37246+ kfree(hint);
37247+
37248+ /*
37249+	 * FIXME: explain why it is needed. HINT: page allocation in write
37250+	 * cannot be done when the atom is not NULL, because reiser4_writepage
37251+	 * cannot kick entd and has to eflush
37252+ */
37253+ txn_restart(ctx);
37254+ reiser4_exit_context(ctx);
37255+ return result;
37256+}
37257+
37258+/**
37259+ * readpage_unix_file - readpage of struct address_space_operations
37260+ * @file: file @page belongs to
37261+ * @page: page to read
37262+ *
37263+ * Get non exclusive access to a file to avoid races with truncate. If page is
37264+ * out of file - return error. Call readpage_unix_file_nolock to do the rest.
37265+ */
37266+int readpage_unix_file(struct file *file, struct page *page)
37267+{
37268+ return readpage_unix_file_nolock(file, page);
37269+}
37270+
37271+static reiser4_block_nr unix_file_estimate_read(struct inode *inode,
37272+ loff_t count UNUSED_ARG)
37273+{
37274+	/* We should reserve one block for the update of the stat data
37275+	   item */
37276+ assert("vs-1249",
37277+ inode_file_plugin(inode)->estimate.update ==
37278+ estimate_update_common);
37279+ return estimate_update_common(inode);
37280+}
37281+
37282+/* this is called with nonexclusive access obtained, file's container can not change */
37283+static ssize_t read_file(hint_t * hint, struct file *file,	/* file to read from */
37284+ char __user *buf, /* address of user-space buffer */
37285+ size_t count, /* number of bytes to read */
37286+ loff_t * off)
37287+{
37288+ int result;
37289+ struct inode *inode;
37290+ flow_t flow;
37291+ int (*read_f) (struct file *, flow_t *, hint_t *);
37292+ coord_t *coord;
37293+ znode *loaded;
37294+
37295+ inode = file->f_dentry->d_inode;
37296+
37297+ /* build flow */
37298+ assert("vs-1250",
37299+ inode_file_plugin(inode)->flow_by_inode ==
37300+ flow_by_inode_unix_file);
37301+ result =
37302+ flow_by_inode_unix_file(inode, buf, 1 /* user space */ , count,
37303+ *off, READ_OP, &flow);
37304+ if (unlikely(result))
37305+ return result;
37306+
37307+ /* get seal and coord sealed with it from reiser4 private data
37308+ of struct file. The coord will tell us where our last read
37309+ of this file finished, and the seal will help to determine
37310+ if that location is still valid.
37311+ */
37312+ coord = &hint->ext_coord.coord;
37313+ while (flow.length && result == 0) {
37314+ result =
37315+ find_file_item(hint, &flow.key, ZNODE_READ_LOCK, inode);
37316+ if (cbk_errored(result))
37317+ /* error happened */
37318+ break;
37319+
37320+ if (coord->between != AT_UNIT) {
37321+ /* there were no items corresponding to given offset */
37322+ done_lh(hint->ext_coord.lh);
37323+ break;
37324+ }
37325+
37326+ loaded = coord->node;
37327+ result = zload(loaded);
37328+ if (unlikely(result)) {
37329+ done_lh(hint->ext_coord.lh);
37330+ break;
37331+ }
37332+
37333+ if (hint->ext_coord.valid == 0)
37334+ validate_extended_coord(&hint->ext_coord,
37335+ get_key_offset(&flow.key));
37336+
37337+ assert("vs-4", hint->ext_coord.valid == 1);
37338+ assert("vs-33", hint->ext_coord.lh == &hint->lh);
37339+ /* call item's read method */
37340+ read_f = item_plugin_by_coord(coord)->s.file.read;
37341+ result = read_f(file, &flow, hint);
37342+ zrelse(loaded);
37343+ done_lh(hint->ext_coord.lh);
37344+ }
37345+
37346+ return (count - flow.length) ? (count - flow.length) : result;
37347+}
37348+
37349+/**
37350+ * read_unix_file - read of struct file_operations
37351+ * @file: file to read from
37352+ * @buf: address of user-space buffer
37353+ * @read_amount: number of bytes to read
37354+ * @off: position in file to read from
37355+ *
37356+ * This is implementation of vfs's read method of struct file_operations for
37357+ * unix file plugin.
37358+ */
37359+ssize_t read_unix_file(struct file *file, char __user *buf, size_t read_amount,
37360+ loff_t *off)
37361+{
37362+ reiser4_context *ctx;
37363+ int result;
37364+ struct inode *inode;
37365+ hint_t *hint;
37366+ unix_file_info_t *uf_info;
37367+	size_t count, left;
37368+	ssize_t read;	/* may carry a negative error code from read_file */
37368+ reiser4_block_nr needed;
37369+ loff_t size;
37370+
37371+ if (unlikely(read_amount == 0))
37372+ return 0;
37373+
37374+ assert("umka-072", file != NULL);
37375+ assert("umka-074", off != NULL);
37376+ inode = file->f_dentry->d_inode;
37377+ assert("vs-972", !inode_get_flag(inode, REISER4_NO_SD));
37378+
37379+ ctx = init_context(inode->i_sb);
37380+ if (IS_ERR(ctx))
37381+ return PTR_ERR(ctx);
37382+
37383+ hint = kmalloc(sizeof(*hint), get_gfp_mask());
37384+ if (hint == NULL) {
37385+ context_set_commit_async(ctx);
37386+ reiser4_exit_context(ctx);
37387+ return RETERR(-ENOMEM);
37388+ }
37389+
37390+ result = load_file_hint(file, hint);
37391+ if (result) {
37392+ kfree(hint);
37393+ context_set_commit_async(ctx);
37394+ reiser4_exit_context(ctx);
37395+ return result;
37396+ }
37397+
37398+ left = read_amount;
37399+ count = 0;
37400+ uf_info = unix_file_inode_data(inode);
37401+ while (left > 0) {
37402+ txn_restart_current();
37403+
37404+ get_nonexclusive_access(uf_info);
37405+
37406+ size = i_size_read(inode);
37407+ if (*off >= size) {
37408+ /* position to read from is past the end of file */
37409+ drop_nonexclusive_access(uf_info);
37410+ break;
37411+ }
37412+ if (*off + left > size)
37413+ left = size - *off;
37414+
37415+		/* fault in user page */
37416+		if (fault_in_pages_writeable(buf, left > PAGE_CACHE_SIZE ? PAGE_CACHE_SIZE : left)) {
37417+ drop_nonexclusive_access(uf_info);
37418+ result = RETERR(-EFAULT);
37419+ break;
37420+ }
37421+
37422+ read = read_file(hint, file, buf,
37423+ left > PAGE_CACHE_SIZE ? PAGE_CACHE_SIZE : left,
37424+ off);
37425+
37426+ drop_nonexclusive_access(uf_info);
37427+
37428+ if (read < 0) {
37429+ result = read;
37430+ break;
37431+ }
37432+ left -= read;
37433+ buf += read;
37434+
37435+ /* update position in a file */
37436+ *off += read;
37437+ /* total number of read bytes */
37438+ count += read;
37439+ }
37440+ save_file_hint(file, hint);
37441+ done_lh(&hint->lh);
37442+ kfree(hint);
37443+
37444+ if (count) {
37445+ /*
37446+ * something was read. Grab space for stat data update and
37447+ * update atime
37448+ */
37449+ needed = unix_file_estimate_read(inode, read_amount);
37450+ result = reiser4_grab_space_force(needed, BA_CAN_COMMIT);
37451+ if (result == 0)
37452+ file_accessed(file);
37453+ else
37454+ warning("", "failed to grab space for atime update");
37455+ }
37456+
37457+ context_set_commit_async(ctx);
37458+ reiser4_exit_context(ctx);
37459+
37460+	/* return the number of bytes read, or an error code if nothing was read */
37461+ return count ? count : result;
37462+}
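+
+/*
+ * A note on the loop above (a summary inferred from the code, not from the
+ * original comments): reads are done in chunks of at most PAGE_CACHE_SIZE,
+ * and each chunk of the user buffer is pre-faulted with
+ * fault_in_pages_writeable() before read_file() takes tree locks, so that
+ * the __copy_to_user() done by the item read method cannot fault and
+ * re-enter reiser4 while znodes are locked (compare the backtrace quoted in
+ * release_unix_file() below).
+ */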
37463+
37464+/* This function takes care of @file's pages. First of all it checks whether
37465+   the filesystem is readonly and bails out if so. Otherwise, it throws out
37466+   all pages of the file if it was mapped for read, is going to be mapped
37467+   for write, and consists of tails. This is done so that we do not have to
37468+   manage two copies of the data (one in the page cache and one in the tails
37469+   themselves) when mapping files built of tails.
37470+
37471+   Here tail2extent conversion is also performed if it is allowed and the
37472+   file is going to be written or mapped for write. This function may be
37473+   called from write_unix_file() or mmap_unix_file(). */
37474+static int check_pages_unix_file(struct file *file, struct inode *inode)
37475+{
37476+ reiser4_invalidate_pages(inode->i_mapping, 0,
37477+ (inode->i_size + PAGE_CACHE_SIZE -
37478+ 1) >> PAGE_CACHE_SHIFT, 0);
37479+ return unpack(file, inode, 0 /* not forever */ );
37480+}
37481+
37482+/**
37483+ * mmap_unix_file - mmap of struct file_operations
37484+ * @file: file to mmap
37485+ * @vma: virtual memory area to map the file into
37486+ *
37487+ * This is the implementation of vfs's mmap method of struct file_operations
37488+ * for the unix file plugin. It converts the file to extents if necessary and
37489+ * sets the reiser4_inode flag REISER4_HAS_MMAP.
37490+ */
37491+int mmap_unix_file(struct file *file, struct vm_area_struct *vma)
37492+{
37493+ reiser4_context *ctx;
37494+ int result;
37495+ struct inode *inode;
37496+ unix_file_info_t *uf_info;
37497+ reiser4_block_nr needed;
37498+
37499+ inode = file->f_dentry->d_inode;
37500+ ctx = init_context(inode->i_sb);
37501+ if (IS_ERR(ctx))
37502+ return PTR_ERR(ctx);
37503+
37504+ uf_info = unix_file_inode_data(inode);
37505+
37506+ down(&uf_info->write);
37507+ get_exclusive_access(uf_info);
37508+
37509+ if (!IS_RDONLY(inode) && (vma->vm_flags & (VM_MAYWRITE | VM_SHARED))) {
37510+ /*
37511+		 * we need the file to be built of extent items. If it is still
37512+		 * built of tail items we have to convert it. Find out what
37513+		 * items the file is built of
37514+ */
37515+ result = find_file_state(inode, uf_info);
37516+ if (result != 0) {
37517+ drop_exclusive_access(uf_info);
37518+ up(&uf_info->write);
37519+ reiser4_exit_context(ctx);
37520+ return result;
37521+ }
37522+
37523+ assert("vs-1648", (uf_info->container == UF_CONTAINER_TAILS ||
37524+ uf_info->container == UF_CONTAINER_EXTENTS ||
37525+ uf_info->container == UF_CONTAINER_EMPTY));
37526+ if (uf_info->container == UF_CONTAINER_TAILS) {
37527+ /*
37528+ * invalidate all pages and convert file from tails to
37529+ * extents
37530+ */
37531+ result = check_pages_unix_file(file, inode);
37532+ if (result) {
37533+ drop_exclusive_access(uf_info);
37534+ up(&uf_info->write);
37535+ reiser4_exit_context(ctx);
37536+ return result;
37537+ }
37538+ }
37539+ }
37540+
37541+ /*
37542+ * generic_file_mmap will do update_atime. Grab space for stat data
37543+ * update.
37544+ */
37545+ needed = inode_file_plugin(inode)->estimate.update(inode);
37546+ result = reiser4_grab_space_force(needed, BA_CAN_COMMIT);
37547+ if (result) {
37548+ drop_exclusive_access(uf_info);
37549+ up(&uf_info->write);
37550+ reiser4_exit_context(ctx);
37551+ return result;
37552+ }
37553+
37554+ result = generic_file_mmap(file, vma);
37555+ if (result == 0) {
37556+ /* mark file as having mapping. */
37557+ inode_set_flag(inode, REISER4_HAS_MMAP);
37558+ }
37559+
37560+ drop_exclusive_access(uf_info);
37561+ up(&uf_info->write);
37562+ reiser4_exit_context(ctx);
37563+ return result;
37564+}
37565+
37566+/**
37567+ * find_first_item
37568+ * @inode: inode of the file to look up
37569+ *
37570+ * Finds the file item which is responsible for the first byte of the file.
37571+ */
37572+static int find_first_item(struct inode *inode)
37573+{
37574+ coord_t coord;
37575+ lock_handle lh;
37576+ reiser4_key key;
37577+ int result;
37578+
37579+ coord_init_zero(&coord);
37580+ init_lh(&lh);
37581+ inode_file_plugin(inode)->key_by_inode(inode, 0, &key);
37582+ result = find_file_item_nohint(&coord, &lh, &key, ZNODE_READ_LOCK,
37583+ inode);
37584+ if (result == CBK_COORD_FOUND) {
37585+ if (coord.between == AT_UNIT) {
37586+ result = zload(coord.node);
37587+ if (result == 0) {
37588+ result = item_id_by_coord(&coord);
37589+ zrelse(coord.node);
37590+ if (result != EXTENT_POINTER_ID &&
37591+ result != FORMATTING_ID)
37592+ result = RETERR(-EIO);
37593+ }
37594+ } else
37595+ result = RETERR(-EIO);
37596+ }
37597+ done_lh(&lh);
37598+ return result;
37599+}
37600+
37601+/**
37602+ * open_unix_file
37603+ * @inode: inode of the file being opened
37604+ * @file: file to open
37605+ *
37606+ * If the filesystem is not readonly, complete an interrupted tail
37607+ * conversion if there was one
37608+ */
37609+int open_unix_file(struct inode *inode, struct file *file)
37610+{
37611+ int result;
37612+ reiser4_context *ctx;
37613+ unix_file_info_t *uf_info;
37614+
37615+ if (IS_RDONLY(inode))
37616+ return 0;
37617+
37618+ if (!inode_get_flag(inode, REISER4_PART_MIXED))
37619+ return 0;
37620+
37621+ ctx = init_context(inode->i_sb);
37622+ if (IS_ERR(ctx))
37623+ return PTR_ERR(ctx);
37624+
37625+ uf_info = unix_file_inode_data(inode);
37626+ get_exclusive_access(uf_info);
37627+
37628+ /*
37629+	 * it may happen that another process is doing tail conversion; wait
37630+	 * until it completes (a factored-out sketch of this wait loop
+	 * follows this function)
37631+ */
37632+ while (1) {
37633+ if (inode_get_flag(inode, REISER4_PART_IN_CONV)) {
37634+ drop_exclusive_access(uf_info);
37635+ schedule();
37636+ get_exclusive_access(uf_info);
37637+ continue;
37638+ }
37639+ break;
37640+ }
37641+
37642+ if (!inode_get_flag(inode, REISER4_PART_MIXED)) {
37643+ /*
37644+ * other process completed the conversion
37645+ */
37646+ drop_exclusive_access(uf_info);
37647+ reiser4_exit_context(ctx);
37648+ return 0;
37649+ }
37650+
37651+ /*
37652+	 * the file was left in a semi-converted state after an unclean
37653+	 * shutdown, or another thread doing the conversion dropped exclusive
37654+	 * access while balancing dirty pages. Complete the conversion
37655+ */
37656+ result = find_first_item(inode);
37657+ if (result == EXTENT_POINTER_ID)
37658+ /*
37659+ * first item is extent, therefore there was incomplete
37660+ * tail2extent conversion. Complete it
37661+ */
37662+ result = tail2extent(unix_file_inode_data(inode));
37663+ else if (result == FORMATTING_ID)
37664+ /*
37665+ * first item is formatting item, therefore there was
37666+ * incomplete extent2tail conversion. Complete it
37667+ */
37668+ result = extent2tail(unix_file_inode_data(inode));
37669+ else
37670+ result = -EIO;
37671+
37672+ assert("vs-1712",
37673+ ergo(result == 0, (!inode_get_flag(inode, REISER4_PART_MIXED) &&
37674+ !inode_get_flag(inode, REISER4_PART_IN_CONV))));
37675+ drop_exclusive_access(uf_info);
37676+ reiser4_exit_context(ctx);
37677+ return result;
37678+}
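+
+/*
+ * The wait-until-conversion-completes loop above also appears in
+ * write_unix_file() and unpack() below. A minimal sketch of how it could be
+ * factored into a helper; it uses only identifiers already used in this
+ * file, but the helper itself is an illustration, not an API of this plugin:
+ */
+static inline void wait_for_partial_conversion(struct inode *inode,
+					       unix_file_info_t *uf_info)
+{
+	/* drop exclusive access so the converting thread can finish,
+	   reschedule, then retake exclusive access and recheck the flag */
+	while (inode_get_flag(inode, REISER4_PART_IN_CONV)) {
+		drop_exclusive_access(uf_info);
+		schedule();
+		get_exclusive_access(uf_info);
+	}
+}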
37679+
37680+#define NEITHER_OBTAINED 0
37681+#define EA_OBTAINED 1
37682+#define NEA_OBTAINED 2
37683+
37684+static void drop_access(unix_file_info_t *uf_info)
37685+{
37686+ if (uf_info->exclusive_use)
37687+ drop_exclusive_access(uf_info);
37688+ else
37689+ drop_nonexclusive_access(uf_info);
37690+}
37691+
37692+#define debug_wuf(format, ...) printk("%s: %d: %s: " format "\n", \
37693+ __FILE__, __LINE__, __FUNCTION__, ## __VA_ARGS__)
37694+
37695+void balance_dirty_pages(struct address_space *mapping);
37696+
37697+/**
37698+ * write_unix_file - write of struct file_operations
37699+ * @file: file to write to
37700+ * @buf: address of user-space buffer
37701+ * @count: number of bytes to write
37702+ * @pos: position in file to write to
37703+ *
37704+ * This is the implementation of vfs's write method of struct file_operations
37705+ * for the unix file plugin.
37706+ */
37707+ssize_t write_unix_file(struct file *file, const char __user *buf,
37708+ size_t count, loff_t *pos)
37709+{
37710+ int result;
37711+ reiser4_context *ctx;
37712+ struct inode *inode;
37713+ unix_file_info_t *uf_info;
37714+ ssize_t written;
37715+ int try_free_space;
37716+ int to_write = PAGE_CACHE_SIZE * WRITE_GRANULARITY;
37717+ size_t left;
37718+ ssize_t (*write_op)(struct file *, const char __user *, size_t,
37719+ loff_t *pos);
37720+ int ea;
37721+ loff_t new_size;
37722+
37723+ inode = file->f_dentry->d_inode;
37724+ ctx = init_context(inode->i_sb);
37725+ if (IS_ERR(ctx))
37726+ return PTR_ERR(ctx);
37727+
37728+ mutex_lock(&inode->i_mutex);
37729+
37730+ assert("vs-947", !inode_get_flag(inode, REISER4_NO_SD));
37731+ assert("vs-9471", (!inode_get_flag(inode, REISER4_PART_MIXED)));
37732+
37733+ /* check amount of bytes to write and writing position */
37734+ result = generic_write_checks(file, pos, &count, 0);
37735+ if (result) {
37736+ mutex_unlock(&inode->i_mutex);
37737+ context_set_commit_async(ctx);
37738+ reiser4_exit_context(ctx);
37739+ return result;
37740+ }
37741+
37742+ result = remove_suid(file->f_dentry);
37743+ if (result) {
37744+ mutex_unlock(&inode->i_mutex);
37745+ context_set_commit_async(ctx);
37746+ reiser4_exit_context(ctx);
37747+ return result;
37748+ }
37749+
37750+ uf_info = unix_file_inode_data(inode);
37751+
37752+ current->backing_dev_info = inode->i_mapping->backing_dev_info;
37753+ written = 0;
37754+ try_free_space = 0;
37755+ left = count;
37756+ ea = NEITHER_OBTAINED;
37757+
37758+ new_size = i_size_read(inode);
37759+ if (*pos + count > new_size)
37760+ new_size = *pos + count;
37761+
37762+ while (left) {
37763+ if (left < to_write)
37764+ to_write = left;
37765+
37766+ if (uf_info->container == UF_CONTAINER_EMPTY) {
37767+ get_exclusive_access(uf_info);
37768+ ea = EA_OBTAINED;
37769+ if (uf_info->container != UF_CONTAINER_EMPTY) {
37770+ /* file is made not empty by another process */
37771+ drop_exclusive_access(uf_info);
37772+ ea = NEITHER_OBTAINED;
37773+ continue;
37774+ }
37775+ } else if (uf_info->container == UF_CONTAINER_UNKNOWN) {
37776+ /*
37777+			 * take exclusive access directly so that we do not
37778+			 * have to re-obtain it if the file turns out to be empty
37779+ */
37780+ get_exclusive_access(uf_info);
37781+ ea = EA_OBTAINED;
37782+ result = find_file_state(inode, uf_info);
37783+ if (result) {
37784+ drop_exclusive_access(uf_info);
37785+ ea = NEITHER_OBTAINED;
37786+ break;
37787+ }
37788+ } else {
37789+ get_nonexclusive_access(uf_info);
37790+ ea = NEA_OBTAINED;
37791+ }
37792+
37793+ /* either EA or NEA is obtained. Choose item write method */
37794+ if (uf_info->container == UF_CONTAINER_EXTENTS) {
37795+ /* file is built of extent items */
37796+ write_op = write_extent;
37797+ } else if (uf_info->container == UF_CONTAINER_EMPTY) {
37798+ /* file is empty */
37799+ if (should_have_notail(uf_info, new_size))
37800+ write_op = write_extent;
37801+ else
37802+ write_op = write_tail;
37803+ } else {
37804+ /* file is built of tail items */
37805+ if (should_have_notail(uf_info, new_size)) {
37806+ if (ea == NEA_OBTAINED) {
37807+ drop_nonexclusive_access(uf_info);
37808+ get_exclusive_access(uf_info);
37809+ ea = EA_OBTAINED;
37810+ }
37811+ if (uf_info->container == UF_CONTAINER_TAILS) {
37812+ /*
37813+					 * if the file is being converted by
37814+					 * another process, wait until it completes
37815+ */
37816+ while (1) {
37817+ if (inode_get_flag(inode, REISER4_PART_IN_CONV)) {
37818+ drop_exclusive_access(uf_info);
37819+ schedule();
37820+ get_exclusive_access(uf_info);
37821+ continue;
37822+ }
37823+ break;
37824+ }
37825+ if (uf_info->container == UF_CONTAINER_TAILS) {
37826+ result = tail2extent(uf_info);
37827+ if (result)
37828+ break;
37829+ }
37830+ }
37831+ drop_exclusive_access(uf_info);
37832+ ea = NEITHER_OBTAINED;
37833+ continue;
37834+ }
37835+ write_op = write_tail;
37836+ }
37837+
37838+ written = write_op(file, buf, to_write, pos);
37839+ if (written == -ENOSPC && try_free_space) {
37840+ drop_access(uf_info);
37841+ txnmgr_force_commit_all(inode->i_sb, 0);
37842+ try_free_space = 0;
37843+ continue;
37844+ }
37845+ if (written < 0) {
37846+ drop_access(uf_info);
37847+ result = written;
37848+ break;
37849+ }
37850+ /* something is written. */
37851+ if (uf_info->container == UF_CONTAINER_EMPTY) {
37852+ assert("", ea == EA_OBTAINED);
37853+ uf_info->container = (write_op == write_extent) ?
37854+ UF_CONTAINER_EXTENTS : UF_CONTAINER_TAILS;
37855+ } else {
37856+ assert("", ergo(uf_info->container == UF_CONTAINER_EXTENTS,
37857+ write_op == write_extent));
37858+ assert("", ergo(uf_info->container == UF_CONTAINER_TAILS,
37859+ write_op == write_tail));
37860+ }
37861+ if (*pos + written > inode->i_size)
37862+ INODE_SET_FIELD(inode, i_size, *pos + written);
37863+ file_update_time(file);
37864+ result = reiser4_update_sd(inode);
37865+ if (result) {
37866+ mutex_unlock(&inode->i_mutex);
37867+ current->backing_dev_info = NULL;
37868+ drop_access(uf_info);
37869+ context_set_commit_async(ctx);
37870+ reiser4_exit_context(ctx);
37871+ return result;
37872+ }
37873+ drop_access(uf_info);
37874+ ea = NEITHER_OBTAINED;
37875+ txn_restart(ctx);
37876+ current->journal_info = NULL;
37877+ /*
37878+		 * tell the VM how many pages were dirtied. Arguably, pages
37879+		 * which were already dirty should not be counted
37880+ */
37881+ balance_dirty_pages(inode->i_mapping);
37882+ current->journal_info = ctx;
37883+
37884+ left -= written;
37885+ buf += written;
37886+ *pos += written;
37887+ }
37888+
37889+ mutex_unlock(&inode->i_mutex);
37890+
37891+ if (result == 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
37892+ txn_restart_current();
37893+ grab_space_enable();
37894+ result = sync_unix_file(file, file->f_dentry,
37895+ 0 /* data and stat data */ );
37896+ if (result)
37897+ warning("reiser4-7", "failed to sync file %llu",
37898+ (unsigned long long)get_inode_oid(inode));
37899+ }
37900+
37901+ current->backing_dev_info = NULL;
37902+
37903+ reiser4_exit_context(ctx);
37904+
37905+ /*
37906+	 * return the number of bytes written, or an error code if nothing
37907+	 * was written. Note that this does not work correctly when
37908+	 * sync_unix_file returns an error
37909+ */
37910+ return (count - left) ? (count - left) : result;
37911+}
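+
+/*
+ * Summary of the write_op dispatch in write_unix_file() (a reading aid
+ * inferred from the code above):
+ *
+ *	UF_CONTAINER_EXTENTS                -> write_extent
+ *	UF_CONTAINER_EMPTY, notail policy   -> write_extent
+ *	UF_CONTAINER_EMPTY, tails allowed   -> write_tail
+ *	UF_CONTAINER_TAILS, notail policy   -> tail2extent first, then retry
+ *	UF_CONTAINER_TAILS, tails allowed   -> write_tail
+ */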
37912+
37913+/**
37914+ * release_unix_file - release of struct file_operations
37915+ * @inode: inode of released file
37916+ * @file: file to release
37917+ *
37918+ * Implementation of release method of struct file_operations for unix file
37919+ * plugin. If the last reference to the inode is released, convert all extent
37920+ * items into tail items if necessary. Frees reiser4-specific file data.
37921+ */
37922+int release_unix_file(struct inode *inode, struct file *file)
37923+{
37924+ reiser4_context *ctx;
37925+ unix_file_info_t *uf_info;
37926+ int result;
37927+ int in_reiser4;
37928+
37929+ in_reiser4 = is_in_reiser4_context();
37930+
37931+ ctx = init_context(inode->i_sb);
37932+ if (IS_ERR(ctx))
37933+ return PTR_ERR(ctx);
37934+
37935+ result = 0;
37936+ if (in_reiser4 == 0) {
37937+ uf_info = unix_file_inode_data(inode);
37938+
37939+ down(&uf_info->write);
37940+ get_exclusive_access(uf_info);
37941+ if (atomic_read(&file->f_dentry->d_count) == 1 &&
37942+ uf_info->container == UF_CONTAINER_EXTENTS &&
37943+ !should_have_notail(uf_info, inode->i_size) &&
37944+ !rofs_inode(inode)) {
37945+ result = extent2tail(uf_info);
37946+ if (result != 0) {
37947+ warning("nikita-3233",
37948+ "Failed (%d) to convert in %s (%llu)",
37949+ result, __FUNCTION__,
37950+ (unsigned long long)
37951+ get_inode_oid(inode));
37952+ }
37953+ }
37954+ drop_exclusive_access(uf_info);
37955+ up(&uf_info->write);
37956+ } else {
37957+ /*
37958+		   we are already within a reiser4 context. How is that
37959+		   possible? Simple:
37960+
37961+ (gdb) bt
37962+ #0 get_exclusive_access ()
37963+ #2 0xc01e56d3 in release_unix_file ()
37964+ #3 0xc01c3643 in reiser4_release ()
37965+ #4 0xc014cae0 in __fput ()
37966+ #5 0xc013ffc3 in remove_vm_struct ()
37967+ #6 0xc0141786 in exit_mmap ()
37968+ #7 0xc0118480 in mmput ()
37969+ #8 0xc0133205 in oom_kill ()
37970+ #9 0xc01332d1 in out_of_memory ()
37971+ #10 0xc013bc1d in try_to_free_pages ()
37972+ #11 0xc013427b in __alloc_pages ()
37973+ #12 0xc013f058 in do_anonymous_page ()
37974+ #13 0xc013f19d in do_no_page ()
37975+ #14 0xc013f60e in handle_mm_fault ()
37976+ #15 0xc01131e5 in do_page_fault ()
37977+ #16 0xc0104935 in error_code ()
37978+ #17 0xc025c0c6 in __copy_to_user_ll ()
37979+ #18 0xc01d496f in read_tail ()
37980+ #19 0xc01e4def in read_unix_file ()
37981+ #20 0xc01c3504 in reiser4_read ()
37982+ #21 0xc014bd4f in vfs_read ()
37983+ #22 0xc014bf66 in sys_read ()
37984+ */
37985+ warning("vs-44", "out of memory?");
37986+ }
37987+
37988+ reiser4_free_file_fsdata(file);
37989+
37990+ reiser4_exit_context(ctx);
37991+ return result;
37992+}
37993+
37994+static void set_file_notail(struct inode *inode)
37995+{
37996+ reiser4_inode *state;
37997+ formatting_plugin *tplug;
37998+
37999+ state = reiser4_inode_data(inode);
38000+ tplug = formatting_plugin_by_id(NEVER_TAILS_FORMATTING_ID);
38001+ plugin_set_formatting(&state->pset, tplug);
38002+ inode_set_plugin(inode,
38003+ formatting_plugin_to_plugin(tplug), PSET_FORMATTING);
38004+}
38005+
38006+/* if file is built of tails - convert it to extents */
38007+static int unpack(struct file *filp, struct inode *inode, int forever)
38008+{
38009+ int result = 0;
38010+ unix_file_info_t *uf_info;
38011+
38012+ uf_info = unix_file_inode_data(inode);
38013+ assert("vs-1628", ea_obtained(uf_info));
38014+
38015+ result = find_file_state(inode, uf_info);
38016+ if (result)
38017+ return result;
38018+ assert("vs-1074", uf_info->container != UF_CONTAINER_UNKNOWN);
38019+
38020+ if (uf_info->container == UF_CONTAINER_TAILS) {
38021+ /*
38022+		 * if the file is being converted by another process - wait until it
38023+ * completes
38024+ */
38025+ while (1) {
38026+ if (inode_get_flag(inode, REISER4_PART_IN_CONV)) {
38027+ drop_exclusive_access(uf_info);
38028+ schedule();
38029+ get_exclusive_access(uf_info);
38030+ continue;
38031+ }
38032+ break;
38033+ }
38034+ if (uf_info->container == UF_CONTAINER_TAILS) {
38035+ result = tail2extent(uf_info);
38036+ if (result)
38037+ return result;
38038+ }
38039+ }
38040+ if (forever) {
38041+		/* save the new formatting plugin in the stat data */
38042+ __u64 tograb;
38043+
38044+ set_file_notail(inode);
38045+
38046+ grab_space_enable();
38047+ tograb = inode_file_plugin(inode)->estimate.update(inode);
38048+		result = reiser4_grab_space(tograb, BA_CAN_COMMIT);
38049+		if (result == 0)
+			result = reiser4_update_sd(inode);
38050+ }
38051+
38052+ return result;
38053+}
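+
+/*
+ * An orienting note, summarized from the call sites in this file: unpack()
+ * with @forever set backs the REISER4_IOC_UNPACK ioctl below and pins the
+ * notail formatting policy in the stat data, while check_pages_unix_file()
+ * calls it with @forever == 0 for a one-off conversion before mmap.
+ */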
38054+
38055+/* implementation of vfs's ioctl method of struct file_operations for unix file
38056+ plugin
38057+*/
38058+int
38059+ioctl_unix_file(struct inode *inode, struct file *filp,
38060+ unsigned int cmd, unsigned long arg UNUSED_ARG)
38061+{
38062+ reiser4_context *ctx;
38063+ int result;
38064+
38065+ ctx = init_context(inode->i_sb);
38066+ if (IS_ERR(ctx))
38067+ return PTR_ERR(ctx);
38068+
38069+ switch (cmd) {
38070+ case REISER4_IOC_UNPACK:
38071+ get_exclusive_access(unix_file_inode_data(inode));
38072+ result = unpack(filp, inode, 1 /* forever */ );
38073+ drop_exclusive_access(unix_file_inode_data(inode));
38074+ break;
38075+
38076+ default:
38077+ result = RETERR(-ENOSYS);
38078+ break;
38079+ }
38080+ reiser4_exit_context(ctx);
38081+ return result;
38082+}
38083+
38084+/* implementation of vfs's bmap method of struct address_space_operations for unix
38085+ file plugin
38086+*/
38087+sector_t bmap_unix_file(struct address_space * mapping, sector_t lblock)
38088+{
38089+ reiser4_context *ctx;
38090+ sector_t result;
38091+ reiser4_key key;
38092+ coord_t coord;
38093+ lock_handle lh;
38094+ struct inode *inode;
38095+ item_plugin *iplug;
38096+ sector_t block;
38097+
38098+ inode = mapping->host;
38099+
38100+ ctx = init_context(inode->i_sb);
38101+ if (IS_ERR(ctx))
38102+ return PTR_ERR(ctx);
38103+ key_by_inode_and_offset_common(inode,
38104+ (loff_t) lblock * current_blocksize,
38105+ &key);
38106+
38107+ init_lh(&lh);
38108+ result =
38109+ find_file_item_nohint(&coord, &lh, &key, ZNODE_READ_LOCK, inode);
38110+ if (cbk_errored(result)) {
38111+ done_lh(&lh);
38112+ reiser4_exit_context(ctx);
38113+ return result;
38114+ }
38115+
38116+ result = zload(coord.node);
38117+ if (result) {
38118+ done_lh(&lh);
38119+ reiser4_exit_context(ctx);
38120+ return result;
38121+ }
38122+
38123+ iplug = item_plugin_by_coord(&coord);
38124+ if (iplug->s.file.get_block) {
38125+ result = iplug->s.file.get_block(&coord, lblock, &block);
38126+ if (result == 0)
38127+ result = block;
38128+ } else
38129+ result = RETERR(-EINVAL);
38130+
38131+ zrelse(coord.node);
38132+ done_lh(&lh);
38133+ reiser4_exit_context(ctx);
38134+ return result;
38135+}
38136+
38137+/**
38138+ * flow_by_inode_unix_file - initialize structure flow
38139+ * @inode: inode of file for which read or write is about to be done
38140+ * @buf: buffer to perform read to or write from
38141+ * @user: flag showing whether @buf is user space or kernel space
38142+ * @size: size of buffer @buf
38143+ * @off: start offset for read or write
38144+ * @op: READ or WRITE
38145+ * @flow: flow to initialize
38146+ *
38147+ * Initializes fields of @flow: key, size of data, i/o mode (read or write).
38148+ */
38149+int flow_by_inode_unix_file(struct inode *inode,
38150+ const char __user *buf, int user,
38151+ loff_t size, loff_t off,
38152+ rw_op op, flow_t *flow)
38153+{
38154+ assert("nikita-1100", inode != NULL);
38155+
38156+ flow->length = size;
38157+ memcpy(&flow->data, &buf, sizeof(buf));
38158+ flow->user = user;
38159+ flow->op = op;
38160+ assert("nikita-1931", inode_file_plugin(inode) != NULL);
38161+ assert("nikita-1932",
38162+ inode_file_plugin(inode)->key_by_inode ==
38163+ key_by_inode_and_offset_common);
38164+ /* calculate key of write position and insert it into flow->key */
38165+ return key_by_inode_and_offset_common(inode, off, &flow->key);
38166+}
38167+
38168+/* plugin->u.file.set_plug_in_sd = NULL
38169+ plugin->u.file.set_plug_in_inode = NULL
38170+ plugin->u.file.create_blank_sd = NULL */
38171+/* plugin->u.file.delete */
38172+/*
38173+ plugin->u.file.add_link = add_link_common
38174+ plugin->u.file.rem_link = NULL */
38175+
38176+/* plugin->u.file.owns_item
38177+ this is common_file_owns_item with assertion */
38178+/* Audited by: green(2002.06.15) */
38179+int
38180+owns_item_unix_file(const struct inode *inode /* object to check against */ ,
38181+ const coord_t * coord /* coord to check */ )
38182+{
38183+ int result;
38184+
38185+ result = owns_item_common(inode, coord);
38186+ if (!result)
38187+ return 0;
38188+ if (item_type_by_coord(coord) != UNIX_FILE_METADATA_ITEM_TYPE)
38189+ return 0;
38190+ assert("vs-547",
38191+ item_id_by_coord(coord) == EXTENT_POINTER_ID ||
38192+ item_id_by_coord(coord) == FORMATTING_ID);
38193+ return 1;
38194+}
38195+
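+/*
+ * A note on the safe-link calls in setattr_truncate() below (background on
+ * the mechanism, not text from the original): a safe link records the
+ * pending truncate in the tree so that, after an unclean shutdown, the
+ * half-done truncate can be completed; the link is removed again once the
+ * truncate finishes.
+ */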
38196+static int setattr_truncate(struct inode *inode, struct iattr *attr)
38197+{
38198+ int result;
38199+ int s_result;
38200+ loff_t old_size;
38201+ reiser4_tree *tree;
38202+
38203+ inode_check_scale(inode, inode->i_size, attr->ia_size);
38204+
38205+ old_size = inode->i_size;
38206+ tree = tree_by_inode(inode);
38207+
38208+ result = safe_link_grab(tree, BA_CAN_COMMIT);
38209+ if (result == 0)
38210+ result = safe_link_add(inode, SAFE_TRUNCATE);
38211+ if (result == 0)
38212+ result = truncate_file_body(inode, attr->ia_size);
38213+ if (result)
38214+ warning("vs-1588", "truncate_file failed: oid %lli, "
38215+ "old size %lld, new size %lld, retval %d",
38216+ (unsigned long long)get_inode_oid(inode),
38217+ old_size, attr->ia_size, result);
38218+
38219+ s_result = safe_link_grab(tree, BA_CAN_COMMIT);
38220+ if (s_result == 0)
38221+ s_result =
38222+ safe_link_del(tree, get_inode_oid(inode), SAFE_TRUNCATE);
38223+ if (s_result != 0) {
38224+ warning("nikita-3417", "Cannot kill safelink %lli: %i",
38225+ (unsigned long long)get_inode_oid(inode), s_result);
38226+ }
38227+ safe_link_release(tree);
38228+ return result;
38229+}
38230+
38231+/* plugin->u.file.setattr method */
38232+/* This calls inode_setattr and if truncate is in effect it also takes
38233+ exclusive inode access to avoid races */
38234+int setattr_unix_file(struct dentry *dentry, /* Object to change attributes */
38235+ struct iattr *attr /* change description */ )
38236+{
38237+ int result;
38238+
38239+ if (attr->ia_valid & ATTR_SIZE) {
38240+ reiser4_context *ctx;
38241+ unix_file_info_t *uf_info;
38242+
38243+		/* truncate does its own space reservation and requires
38244+		   exclusive access to be held */
38245+ ctx = init_context(dentry->d_inode->i_sb);
38246+ if (IS_ERR(ctx))
38247+ return PTR_ERR(ctx);
38248+
38249+ uf_info = unix_file_inode_data(dentry->d_inode);
38250+ down(&uf_info->write);
38251+ get_exclusive_access(uf_info);
38252+ result = setattr_truncate(dentry->d_inode, attr);
38253+ drop_exclusive_access(uf_info);
38254+ up(&uf_info->write);
38255+ context_set_commit_async(ctx);
38256+ reiser4_exit_context(ctx);
38257+ } else
38258+ result = setattr_common(dentry, attr);
38259+
38260+ return result;
38261+}
38262+
38263+/* plugin->u.file.init_inode_data */
38264+void
38265+init_inode_data_unix_file(struct inode *inode,
38266+ reiser4_object_create_data * crd, int create)
38267+{
38268+ unix_file_info_t *data;
38269+
38270+ data = unix_file_inode_data(inode);
38271+ data->container = create ? UF_CONTAINER_EMPTY : UF_CONTAINER_UNKNOWN;
38272+ init_rwsem(&data->latch);
38273+ sema_init(&data->write, 1);
38274+ data->tplug = inode_formatting_plugin(inode);
38275+ data->exclusive_use = 0;
38276+
38277+#if REISER4_DEBUG
38278+ data->ea_owner = NULL;
38279+ atomic_set(&data->nr_neas, 0);
38280+#endif
38281+ init_inode_ordering(inode, crd, create);
38282+}
38283+
38284+/**
38285+ * delete_object_unix_file - delete_object of file_plugin
38286+ * @inode: inode to be deleted
38287+ *
38288+ * Truncates file to length 0, removes stat data and safe link.
38289+ */
38290+int delete_object_unix_file(struct inode *inode)
38291+{
38292+ unix_file_info_t *uf_info;
38293+ int result;
38294+
38295+ if (inode_get_flag(inode, REISER4_NO_SD))
38296+ return 0;
38297+
38298+	/* truncate the file body first */
38299+ uf_info = unix_file_inode_data(inode);
38300+ get_exclusive_access(uf_info);
38301+ result = truncate_file_body(inode, 0 /* size */ );
38302+ drop_exclusive_access(uf_info);
38303+
38304+ if (result)
38305+ warning("", "failed to truncate file (%llu) on removal: %d",
38306+			(unsigned long long)get_inode_oid(inode), result);
38307+
38308+ /* remove stat data and safe link */
38309+ return delete_object_common(inode);
38310+}
38311+
38312+/**
38313+ * sendfile_unix_file - sendfile of struct file_operations
38314+ * @file: file to be sent
38315+ * @ppos: position to start from
38316+ * @count: number of bytes to send
38317+ * @actor: function to copy data
38318+ * @target: where to copy read data
38319+ *
38320+ * Reads @count bytes from @file and calls @actor for every page read. This is
38321+ * needed for loopback device support.
38322+ */
38323+ssize_t
38324+sendfile_unix_file(struct file *file, loff_t *ppos, size_t count,
38325+ read_actor_t actor, void *target)
38326+{
38327+ reiser4_context *ctx;
38328+ ssize_t result;
38329+ struct inode *inode;
38330+ unix_file_info_t *uf_info;
38331+
38332+ inode = file->f_dentry->d_inode;
38333+ ctx = init_context(inode->i_sb);
38334+ if (IS_ERR(ctx))
38335+ return PTR_ERR(ctx);
38336+
38337+ /*
38338+	 * generic_file_sendfile may want to call update_atime. Grab space for
38339+ * stat data update
38340+ */
38341+ result = reiser4_grab_space(estimate_update_common(inode),
38342+ BA_CAN_COMMIT);
38343+ if (result)
38344+ goto error;
38345+ mutex_lock(&inode->i_mutex);
38346+ inode_set_flag(inode, REISER4_HAS_MMAP);
38347+ mutex_unlock(&inode->i_mutex);
38348+
38349+ uf_info = unix_file_inode_data(inode);
38350+ get_nonexclusive_access(uf_info);
38351+ result = generic_file_sendfile(file, ppos, count, actor, target);
38352+ drop_nonexclusive_access(uf_info);
38353+ error:
38354+ reiser4_exit_context(ctx);
38355+ return result;
38356+}
38357+
38358+int
38359+prepare_write_unix_file(struct file *file, struct page *page,
38360+ unsigned from, unsigned to)
38361+{
38362+ reiser4_context *ctx;
38363+ unix_file_info_t *uf_info;
38364+ int ret;
38365+
38366+ ctx = init_context(file->f_dentry->d_inode->i_sb);
38367+ if (IS_ERR(ctx))
38368+ return PTR_ERR(ctx);
38369+
38370+ uf_info = unix_file_inode_data(file->f_dentry->d_inode);
38371+ get_exclusive_access(uf_info);
38372+ ret = find_file_state(file->f_dentry->d_inode, uf_info);
38373+ if (ret == 0) {
38374+ if (uf_info->container == UF_CONTAINER_TAILS)
38375+ ret = -EINVAL;
38376+ else
38377+ ret = do_prepare_write(file, page, from, to);
38378+ }
38379+ drop_exclusive_access(uf_info);
38380+
38381+ /* don't commit transaction under inode semaphore */
38382+ context_set_commit_async(ctx);
38383+ reiser4_exit_context(ctx);
38384+ return ret;
38385+}
38386+
38387+/*
38388+ * Local variables:
38389+ * c-indentation-style: "K&R"
38390+ * mode-name: "LC"
38391+ * c-basic-offset: 8
38392+ * tab-width: 8
38393+ * fill-column: 79
38394+ * scroll-step: 1
38395+ * End:
38396+ */
38397Index: linux-2.6.16/fs/reiser4/plugin/file/file.h
38398===================================================================
38399--- /dev/null
38400+++ linux-2.6.16/fs/reiser4/plugin/file/file.h
38401@@ -0,0 +1,257 @@
38402+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
38403+ * reiser4/README */
38404+
38405+/* this file contains declarations of methods implementing file plugins
38406+ (UNIX_FILE_PLUGIN_ID, SYMLINK_FILE_PLUGIN_ID and CRC_FILE_PLUGIN_ID) */
38407+
38408+#if !defined( __REISER4_FILE_H__ )
38409+#define __REISER4_FILE_H__
38410+
38411+/* declarations of functions implementing UNIX_FILE_PLUGIN_ID file plugin */
38412+
38413+/* inode operations */
38414+int setattr_unix_file(struct dentry *, struct iattr *);
38415+
38416+/* file operations */
38417+ssize_t read_unix_file(struct file *, char __user *buf, size_t read_amount,
38418+ loff_t *off);
38419+ssize_t write_unix_file(struct file *, const char __user *buf, size_t write_amount,
38420+ loff_t * off);
38421+int ioctl_unix_file(struct inode *, struct file *, unsigned int cmd,
38422+ unsigned long arg);
38423+int mmap_unix_file(struct file *, struct vm_area_struct *);
38424+int open_unix_file(struct inode *, struct file *);
38425+int release_unix_file(struct inode *, struct file *);
38426+int sync_unix_file(struct file *, struct dentry *, int datasync);
38427+ssize_t sendfile_unix_file(struct file *, loff_t *ppos, size_t count,
38428+ read_actor_t, void *target);
38429+
38430+/* address space operations */
38431+int readpage_unix_file(struct file *, struct page *);
38432+int readpage_unix_file_nolock(struct file *, struct page *);
38433+int writepages_unix_file(struct address_space *, struct writeback_control *);
38434+int prepare_write_unix_file(struct file *, struct page *, unsigned from,
38435+ unsigned to);
38436+int commit_write_unix_file(struct file *, struct page *, unsigned from,
38437+ unsigned to);
38438+sector_t bmap_unix_file(struct address_space *, sector_t lblock);
38439+
38440+/* file plugin operations */
38441+int flow_by_inode_unix_file(struct inode *, const char __user *buf,
38442+ int user, loff_t, loff_t, rw_op, flow_t *);
38443+int owns_item_unix_file(const struct inode *, const coord_t *);
38444+void init_inode_data_unix_file(struct inode *, reiser4_object_create_data *,
38445+ int create);
38446+int delete_object_unix_file(struct inode *);
38447+
38448+/*
38449+ * all writing to a unix file is performed by the item write method. The
38450+ * write method of the unix file plugin only decides which item plugin
38451+ * (extent or tail) to call, and in which mode (one of the enum below)
38452+ */
38453+typedef enum {
38454+ FIRST_ITEM = 1,
38455+ APPEND_ITEM = 2,
38456+ OVERWRITE_ITEM = 3
38457+} write_mode_t;
38458+
38459+/* a unix file may be in one of the following states */
38460+typedef enum {
38461+ UF_CONTAINER_UNKNOWN = 0,
38462+ UF_CONTAINER_TAILS = 1,
38463+ UF_CONTAINER_EXTENTS = 2,
38464+ UF_CONTAINER_EMPTY = 3
38465+} file_container_t;
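+
+/*
+ * How the container state changes (a summary inferred from file.c, not part
+ * of the original comments):
+ *
+ *	UF_CONTAINER_UNKNOWN  --find_file_state()-->  EMPTY | TAILS | EXTENTS
+ *	UF_CONTAINER_EMPTY    --first write-------->  TAILS | EXTENTS
+ *	UF_CONTAINER_TAILS    --tail2extent()------>  UF_CONTAINER_EXTENTS
+ *	UF_CONTAINER_EXTENTS  --extent2tail()------>  UF_CONTAINER_TAILS
+ */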
38466+
38467+struct formatting_plugin;
38468+struct inode;
38469+
38470+/* unix file plugin specific part of reiser4 inode */
38471+typedef struct unix_file_info {
38472+ /*
38473+ * this read-write lock protects file containerization change. Accesses
38474+ * which do not change file containerization (see file_container_t)
38475+ * (read, readpage, writepage, write (until tail conversion is
38476+ * involved)) take read-lock. Accesses which modify file
38477+ * containerization (truncate, conversion from tail to extent and back)
38478+ * take write-lock.
38479+ */
38480+ struct rw_semaphore latch;
38481+ /*
38482+	 * this semaphore is used to serialize writes instead of
38483+	 * inode->i_mutex, because write_unix_file uses get_user_pages, which
38484+	 * must be called under mm->mmap_sem, and mm->mmap_sem has to be taken
38485+	 * before inode->i_mutex; inode->i_mutex would therefore have to be
38486+	 * unlocked before calling get_user_pages, which is unacceptable
38487+ */
38488+ struct semaphore write;
38489+ /* this enum specifies which items are used to build the file */
38490+ file_container_t container;
38491+ /*
38492+ * plugin which controls when file is to be converted to extents and
38493+ * back to tail
38494+ */
38495+ struct formatting_plugin *tplug;
38496+ /* if this is set, file is in exclusive use */
38497+ int exclusive_use;
38498+#if REISER4_DEBUG
38499+ /* pointer to task struct of thread owning exclusive access to file */
38500+ void *ea_owner;
38501+ atomic_t nr_neas;
38502+ void *last_reader;
38503+#endif
38504+} unix_file_info_t;
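+
+/*
+ * Typical use of the latch above (a sketch distilled from file.c): accesses
+ * that cannot change the container take nonexclusive access, accesses that
+ * can take exclusive access:
+ *
+ *	get_nonexclusive_access(uf_info);
+ *	... read, or write within the current container ...
+ *	drop_nonexclusive_access(uf_info);
+ *
+ *	get_exclusive_access(uf_info);
+ *	... truncate, or tail<->extent conversion ...
+ *	drop_exclusive_access(uf_info);
+ */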
38505+
38506+struct unix_file_info *unix_file_inode_data(const struct inode *inode);
38507+void get_exclusive_access(unix_file_info_t *);
38508+void drop_exclusive_access(unix_file_info_t *);
38509+void get_nonexclusive_access(unix_file_info_t *);
38510+void drop_nonexclusive_access(unix_file_info_t *);
38511+int try_to_get_nonexclusive_access(unix_file_info_t *);
38512+int find_file_item(hint_t *, const reiser4_key *, znode_lock_mode,
38513+ struct inode *);
38514+int find_file_item_nohint(coord_t *, lock_handle *,
38515+ const reiser4_key *, znode_lock_mode,
38516+ struct inode *);
38517+
38518+void validate_extended_coord(uf_coord_t *, loff_t offset);
38519+int load_file_hint(struct file *, hint_t *);
38520+void save_file_hint(struct file *, const hint_t *);
38521+
38522+
38523+#include "../item/extent.h"
38524+#include "../item/tail.h"
38525+#include "../item/ctail.h"
38526+
38527+struct uf_coord {
38528+ coord_t coord;
38529+ lock_handle *lh;
38530+ int valid;
38531+ union {
38532+ extent_coord_extension_t extent;
38533+ tail_coord_extension_t tail;
38534+ ctail_coord_extension_t ctail;
38535+ } extension;
38536+};
38537+
38538+#include "../../forward.h"
38539+#include "../../seal.h"
38540+#include "../../lock.h"
38541+
38542+/*
38543+ * This structure is used to speed up file operations (reads and writes). A
38544+ * hint is a suggestion about where a key resolved to last time. A seal
38545+ * indicates whether a node has been modified since a hint was last recorded.
38546+ * You check the seal, and if the seal is still valid, you can use the hint
38547+ * without traversing the tree again.
38548+ */
38549+struct hint {
38550+ seal_t seal; /* a seal over last file item accessed */
38551+ uf_coord_t ext_coord;
38552+ loff_t offset;
38553+ znode_lock_mode mode;
38554+ lock_handle lh;
38555+};
38556+
38557+void set_hint(hint_t *, const reiser4_key *, znode_lock_mode);
38558+int hint_is_set(const hint_t *);
38559+void unset_hint(hint_t *);
38560+int hint_validate(hint_t *, const reiser4_key *, int check_key,
38561+ znode_lock_mode);
38562+void hint_init_zero(hint_t *);
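+
+/*
+ * A minimal sketch of the intended seal/hint protocol (an illustration
+ * assuming the usual 0-on-success convention of the declarations above;
+ * find_file_item() in file.c is the real consumer):
+ *
+ *	if (hint_validate(hint, key, 1, mode) == 0)
+ *		use hint->ext_coord.coord directly - no tree traversal;
+ *	else
+ *		search the tree from the root, then set_hint(hint, key, mode);
+ */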
38563+
38564+int update_file_size(struct inode *, reiser4_key *, int update_sd);
38565+int cut_file_items(struct inode *, loff_t new_size, int update_sd,
38566+ loff_t cur_size, int (*update_actor) (struct inode *,
38567+ reiser4_key *, int));
38568+
38569+
38570+#if REISER4_DEBUG
38571+
38572+/* return 1 if exclusive access is held, 0 otherwise */
38573+static inline int ea_obtained(unix_file_info_t * uf_info)
38574+{
38575+ int ret;
38576+
38577+ ret = down_read_trylock(&uf_info->latch);
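+	/* the trylock can only fail while a writer, i.e. an exclusive
+	   access holder, owns the latch */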
38578+ if (ret)
38579+ up_read(&uf_info->latch);
38580+ return !ret;
38581+}
38582+
38583+#endif
38584+
38585+/* declarations of functions implementing SYMLINK_FILE_PLUGIN_ID file plugin */
38586+int create_symlink(struct inode *symlink, struct inode *dir,
38587+ reiser4_object_create_data *);
38588+void destroy_inode_symlink(struct inode *);
38589+
38590+/* declarations of functions implementing CRC_FILE_PLUGIN_ID file plugin */
38591+
38592+/* inode operations */
38593+int setattr_cryptcompress(struct dentry *, struct iattr *);
38594+
38595+/* file operations */
38596+ssize_t read_cryptcompress(struct file *, char __user *buf, size_t read_amount,
38597+ loff_t * off);
38598+ssize_t write_cryptcompress(struct file *, const char __user *buf, size_t write_amount,
38599+ loff_t * off);
38600+int mmap_cryptcompress(struct file *, struct vm_area_struct *);
38601+ssize_t sendfile_cryptcompress(struct file *file, loff_t *ppos, size_t count,
38602+ read_actor_t actor, void *target);
38603+int release_cryptcompress(struct inode *, struct file *);
38604+
38605+/* address space operations */
38606+extern int readpage_cryptcompress(struct file *, struct page *);
38607+extern int writepages_cryptcompress(struct address_space *,
38608+ struct writeback_control *);
38609+
38610+
38611+/* file plugin operations */
38612+int flow_by_inode_cryptcompress(struct inode *, const char __user *buf,
38613+ int user, loff_t, loff_t, rw_op, flow_t *);
38614+int key_by_inode_cryptcompress(struct inode *, loff_t off, reiser4_key *);
38615+int create_cryptcompress(struct inode *, struct inode *,
38616+ reiser4_object_create_data *);
38617+int delete_cryptcompress(struct inode *);
38618+void init_inode_data_cryptcompress(struct inode *, reiser4_object_create_data *,
38619+ int create);
38620+int cut_tree_worker_cryptcompress(tap_t *, const reiser4_key * from_key,
38621+ const reiser4_key * to_key,
38622+ reiser4_key * smallest_removed,
38623+ struct inode *object, int truncate,
38624+ int *progress);
38625+void destroy_inode_cryptcompress(struct inode *);
38626+
38627+extern reiser4_plugin_ops cryptcompress_plugin_ops;
38628+
38629+#define WRITE_GRANULARITY 32
38630+
38631+
38632+int tail2extent(unix_file_info_t *);
38633+int extent2tail(unix_file_info_t *);
38634+
38635+int goto_right_neighbor(coord_t *, lock_handle *);
38636+int find_or_create_extent(struct page *);
38637+int equal_to_ldk(znode *, const reiser4_key *);
38638+
38639+
38640+static inline int cbk_errored(int cbk_result)
38641+{
38642+ return (cbk_result != CBK_COORD_NOTFOUND
38643+ && cbk_result != CBK_COORD_FOUND);
38644+}
38645+
38646+/* __REISER4_FILE_H__ */
38647+#endif
38648+
38649+/*
38650+ * Local variables:
38651+ * c-indentation-style: "K&R"
38652+ * mode-name: "LC"
38653+ * c-basic-offset: 8
38654+ * tab-width: 8
38655+ * fill-column: 79
38656+ * scroll-step: 1
38657+ * End:
38658+*/
38659Index: linux-2.6.16/fs/reiser4/plugin/file/invert.c
38660===================================================================
38661--- /dev/null
38662+++ linux-2.6.16/fs/reiser4/plugin/file/invert.c
38663@@ -0,0 +1,493 @@
38664+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
38665+
38666+/* Suppose you want to conveniently read and write a large variety of small files within a single emacs
38667+ buffer, without having a separate buffer for each 8 byte or so file. Inverts are the way to do that. An invert
38668+ provides you with the contents of a set of subfiles plus its own contents. It is a file which inherits other files
38669+ when you read it, and allows you to write to it and through it to the files that it inherits from. In order for it
38670+ to know which subfiles each part of your write should go into, there must be delimiters indicating that. It tries to
38671+ make that easy for you by providing those delimiters in what you read from it.
38672+
38673+ When you read it, an invert performs an inverted assignment. Instead of taking an assignment command and writing a
38674+ bunch of files, it takes a bunch of files and composes an assignment command for you to read from it that if executed
38675+ would create those files. But which files? Well, that must be specified in the body of the invert using a special
38676+ syntax, and that specification is called the invert of the assignment.
38677+
38678+ When written to, an invert performs the assignment command that is written
38679+ to it, and modifies its own body to contain the invert of that
38680+ assignment.
38681+
38682+ In other words, writing to an invert file what you have read from it
38683+ is the identity operation.
38684+
38685+ Malformed assignments cause write errors. Partial writes are not
38686+ supported in v4.0, but will be.
38687+
38688+ Example:
38689+
38690+ If an invert contains:
38691+
38692+ /filenameA/<>+"(some text stored in the invert)+/filenameB/<>
38693+
38694+======================
38695+Each element in this definition should be an invert, and all files
38696+should be called recursively as well. This is bad. If one of the
38697+included files is not a regular or invert file, then we can't read
38698+the main file.
38699+
38700+I think it is possible to make this easier:
38701+
38702+the internal structure of an invert file should be like that of a symlink
38703+file, but the read and write methods should be explicitly indicated in the
38704+i/o operation.
38705+
38706+By default we read and write (if possible) as a symlink, and if we can
38707+specify ..invert at read time, we can specify it at write time too.
38707+
38708+example:
38709+/my_invert_file/..invert<- ( (/filenameA<-"(The contents of filenameA))+"(some text stored in the invert)+(/filenameB<-"(The contents of filenameB) ) )
38710+will create /my_invert_file as an invert, and will create /filenameA and /filenameB with the specified bodies.
38711+
38712+read of /my_invert_file/..invert will be
38713+/filenameA<-"(The contents of filenameA)+"(some text stored in the invert)+/filenameB<-"(The contents of filenameB)
38714+
38715+but read of /my_invert_file/ will be
38716+The contents of filenameAsome text stored in the invertThe contents of filenameB
38717+
38718+we can also create this file as
38719+/my_invert_file/<-/filenameA+"(some text stored in the invert)+/filenameB
38720+which will create /my_invert_file and use the existing files /filenameA and /filenameB.
38721+
38722+and when we read it, it will behave as the invert file shown previously.
38723+
38724+Is this correct?
38725+
38726+ vv
38727+DEMIDOV-FIXME-HANS:
38728+
38729+Maybe you are right, but then you must disable writes to /my_invert_file/ and only allow writes to /my_invert_file/..invert
38730+
38731+Do you agree? Discuss it on reiserfs-list....
38732+
38733+-Hans
38734+=======================
38735+
38736+ Then a read will return:
38737+
38738+ /filenameA<-"(The contents of filenameA)+"(some text stored in the invert)+/filenameB<-"(The contents of filenameB)
38739+
38740+ and a write of the line above to the invert will set the contents of
38741+ the invert and filenameA and filenameB to their original values.
38742+
38743+ Note that the contents of an invert have no influence on the effect
38744+ of a write unless the write is a partial write (and a write of a
38745+ shorter file without using truncate first is a partial write).
38746+
38747+ truncate() has no effect on filenameA and filenameB, it merely
38748+ resets the value of the invert.
38749+
38750+ Writes to subfiles via the invert are implemented by preceding them
38751+ with truncates.
38752+
38753+ Parse failures cause write failures.
38754+
38755+ Questions to ponder: should the invert be acted on prior to file
38756+   close when writing to an open file descriptor?
38757+
38758+ Example:
38759+
38760+ If an invert contains:
38761+
38762+ "(This text and a pair of quotes are all that is here.)
38763+
38764+Then a read will return:
38765+
38766+ "(This text and a pair of quotes are all that is here.)
38767+
38768+*/
38769+
38770+/* The OPEN method places a struct file in memory associated with the invert
38771+   body and returns something like a file descriptor to the user for future
38772+   access to the invert file.
38773+   During opening we parse the body of the invert and get a list of 'entries'
38774+   (describing all its subfiles), and place a pointer to the first struct in
38775+   the reiserfs-specific part of the invert inode (arbitrary decision).
38776+
38777+   Each subfile is described by a struct inv_entry that has a pointer @sd to
38778+   an in-core stat-data and a pointer to a struct file @f (if we find that the
38779+   subfile uses more than one unformatted node (arbitrary decision), we load a
38780+   struct file in memory, otherwise we load the base stat-data (and maybe 1-2
38781+   bytes of other information we need))
38782+
38783+ Since READ and WRITE methods for inverts were formulated in assignment
38784+ language, they don't contain arguments 'size' and 'offset' that make sense
38785+ only in ordinary read/write methods.
38786+
38787+ READ method is a combination of two methods:
38788+   1) the ordinary read method (with offset=0, length = @f->...->i_size) for
38789+   entries with @f != 0; this method takes a pointer to struct file as an argument
38790+   2) the read method for inode-less files with @sd != 0; this method takes the
38791+   in-core stat-data instead of a struct file as an argument.
38792+   In the first case we don't use the pagecache, we just copy the data we got after
38793+ cbk() into userspace.
38794+
38795+   The WRITE method for invert files is more complex.
38796+   Besides the WRITE interface declared in the assignment language above, we
38797+   need an opportunity to edit the unwrapped body of an invert file with a
38798+   text editor; this means we need a GENERIC WRITE METHOD for invert files:
38799+
38800+   my_invert_file/..invert <- "string"
38801+
38802+   this method parses "string" and looks for correct subfile signatures; the
38803+   parsing process also splits this "string" into a set of flows in accordance
38804+   with the set of subfiles specified by this signature.
38805+   The found list of signatures #S is compared with the opened one #I of the
38806+   invert file. If the latter is absent (#I==0, as is the case, for instance,
38807+   if we have just created this invert file), the write method assigns the
38808+   found signature (#I=#S;) to the invert file. Then, if #I==#S, the generic
38809+   write method splits itself into write methods for ordinary or light-weight
38810+   files, or calls itself recursively for invert files with the corresponding
38811+   flows.
38812+   I am not sure, but the list of signatures looks like what Mr. Demidov means
38813+   by 'delimiters'.
38814+
38815+   The cases where #S<#I (#I<#S) (in the set-theoretic sense) are also allowed
38816+   and cause subfiles to be deleted (created) (arbitrary decision - it may look
38817+   too complex, but this interface will be the most complete). The order of
38818+   entries of list #S (#I) and the inherited order on #I (#S) must coincide.
38819+   Any other parsing result gives a malformed signature that aborts the READ
38820+   method and releases all resources.
38820+
38821+ Format of subfile (entry) signature:
38822+
38823+ "START_MAGIC"<>(TYPE="...",LOOKUP_ARG="...")SUBFILE_BODY"END_MAGIC"
38824+
38825+ Legend:
38826+
38827+ START_MAGIC - keyword indicates the start of subfile signature;
38828+
38829+ <> indicates the start of 'subfile metadata', that is the pair
38830+ (TYPE="...",LOOKUP_ARG="...") in parenthesis separated by comma.
38831+
38832+ TYPE - the string "type" indicates the start of one of the three words:
38833+ - ORDINARY_FILE,
38834+ - LIGHT_WEIGHT_FILE,
38835+ - INVERT_FILE;
38836+
38837+ LOOKUP_ARG - lookup argument depends on previous type:
38838+ */
38839+
38840+ /************************************************************/
38841+ /* TYPE * LOOKUP ARGUMENT */
38842+ /************************************************************/
38843+ /* LIGH_WEIGHT_FILE * stat-data key */
38844+ /************************************************************/
38845+ /* ORDINARY_FILE * filename */
38846+ /************************************************************/
38847+ /* INVERT_FILE * filename */
38848+ /************************************************************/
38849+
38850+ /* where:
38851+ *stat-data key - the string contains the stat-data key of this subfile; it
38852+ will be passed to the fast-access lookup method for light-weight files;
38853+ *filename - pathname of this subfile; it will be passed to the VFS lookup
38854+ methods for ordinary and invert files;
38855+
38856+ SUBFILE_BODY - data of this subfile (it will go to the flow)
38857+ END_MAGIC - the keyword indicating the end of the subfile signature.
38858+
38859+ The other symbols inside the signature are interpreted as 'unformatted
38860+ content', which is available with VFS's read_link() (arbitrary decision).
38861+
38862+ NOTE: The parse method for the body of an invert file uses the mentioned
38863+ signatures _without_ subfile bodies.
38864+
38865+ Now the only unclear thing is a WRITE into a regular light-weight subfile A,
38866+ which we can describe only in assignment language:
38867+
38868+ A <- "some_string"
38869+
38870+ I guess we don't want to change the stat-data and body items of file A
38871+ if this file exists and size(A) != size("some_string"), because this
38872+ operation is expensive; so we only do a partial write if
38873+ size(A) > size("some_string"), and truncate "some_string" and then do
38874+ A <- "truncated string" if size(A) < size("some_string"). This decision is also arbitrary..
38875+ */
38876+
38877+/* here is the infrastructure for formatted flows */
38878+
38879+#define SUBFILE_HEADER_MAGIC 0x19196605
38880+#define FLOW_HEADER_MAGIC 0x01194304
38881+
38882+#include "../plugin.h"
38883+#include "../../debug.h"
38884+#include "../../forward.h"
38885+#include "../object.h"
38886+#include "../item/item.h"
38887+#include "../item/static_stat.h"
38888+#include "../../dformat.h"
38889+#include "../znode.h"
38890+#include "../inode.h"
38891+
38892+#include <linux/types.h>
38893+#include <linux/fs.h> /* for struct file */
38894+#include <linux/list.h> /* for struct list_head */
38895+
38896+typedef enum {
38897+ LIGHT_WEIGHT_FILE,
38898+ ORDINARY_FILE,
38899+ INVERT_FILE
38900+} inv_entry_type;
38901+
38902+typedef struct flow_header {
38903+	d32 fh_magic;
38904+	d16 fh_nr; /* number of subfiles in the flow */
38905+} flow_header;
38906+
38907+typedef struct subfile_header {
38908+ d32 sh_magic; /* subfile magic */
38909+ d16 sh_type; /* type of subfile: light-weight, ordinary, invert */
38910+	d16 sh_arg_len; /* length of lookup argument (filename, key) */
38911+	d32 sh_body_len; /* length of subfile body */
38912+} subfile_header;
38913+
38914+/* functions to get/set fields of flow header */
38915+
38916+static void fl_set_magic(flow_header * fh, __u32 value)
38917+{
38918+ cputod32(value, &fh->fh_magic);
38919+}
38920+
38921+static __u32 fl_get_magic(flow_header * fh)
38922+{
38923+ return d32tocpu(&fh->fh_magic);
38924+}
38925+static void fl_set_number(flow_header * fh, __u16 value)
38926+{
38927+ cputod16(value, &fh->fh_nr);
38928+}
38929+static unsigned fl_get_number(flow_header * fh)
38930+{
38931+ return d16tocpu(&fh->fh_nr);
38932+}
38933+
38934+/* functions to get/set fields of subfile header */
38935+
38936+static void sh_set_magic(subfile_header * sh, __u32 value)
38937+{
38938+ cputod32(value, &sh->sh_magic);
38939+}
38940+
38941+static __u32 sh_get_magic(subfile_header * sh)
38942+{
38943+ return d32tocpu(&sh->sh_magic);
38944+}
38945+static void sh_set_type(subfile_header * sh, __u16 value)
38946+{
38947+	cputod16(value, &sh->sh_type);
38948+}
38949+static unsigned sh_get_type(subfile_header * sh)
38950+{
38951+	return d16tocpu(&sh->sh_type);
38952+}
38953+static void sh_set_arg_len(subfile_header * sh, __u16 value)
38954+{
38955+ cputod16(value, &sh->sh_arg_len);
38956+}
38957+static unsigned sh_get_arg_len(subfile_header * sh)
38958+{
38959+ return d16tocpu(&sh->sh_arg_len);
38960+}
38961+static void sh_set_body_len(subfile_header * sh, __u32 value)
38962+{
38963+ cputod32(value, &sh->sh_body_len);
38964+}
38965+
38966+static __u32 sh_get_body_len(subfile_header * sh)
38967+{
38968+ return d32tocpu(&sh->sh_body_len);
38969+}
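+
+/*
+ * A sketch of how a formatted flow could be walked with the accessors above
+ * (illustrative only: the 'area'/'end' bounds and the handle_subfile()
+ * callback are hypothetical, and no such parser exists in this file yet):
+ */
+static int parse_flow_sketch(char *area, char *end,
+			     int (*handle_subfile) (subfile_header *))
+{
+	flow_header *fh = (flow_header *) area;
+	char *p = area + sizeof(flow_header);
+	unsigned i;
+
+	if (fl_get_magic(fh) != FLOW_HEADER_MAGIC)
+		return RETERR(-EINVAL);
+	for (i = 0; i < fl_get_number(fh); i++) {
+		subfile_header *sh = (subfile_header *) p;
+		int ret;
+
+		if (p + sizeof(subfile_header) > end ||
+		    sh_get_magic(sh) != SUBFILE_HEADER_MAGIC)
+			return RETERR(-EINVAL);
+		ret = handle_subfile(sh);
+		if (ret)
+			return ret;
+		/* the lookup argument and the body follow the fixed header */
+		p += sizeof(subfile_header) + sh_get_arg_len(sh) +
+			sh_get_body_len(sh);
+	}
+	return 0;
+}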
38970+
38971+/* in-core minimal stat-data, light-weight analog of inode */
38972+
38973+struct incore_sd_base {
38974+ umode_t isd_mode;
38975+ nlink_t isd_nlink;
38976+ loff_t isd_size;
38977+ char *isd_data; /* 'subflow' to write */
38978+};
38979+
38980+/* open invert creates a list of invert entries;
38981+ every entry is represented by structure inv_entry */
38982+
38983+struct inv_entry {
38984+	struct list_head ie_list;	/* embedded: see INIT_LIST_HEAD(&ie_list) below */
38985+	struct file *ie_file;	/* this is NULL if the file doesn't
38986+				   have unformatted nodes */
38987+ struct incore_sd_base *ie_sd; /* inode-less analog of struct file */
38988+};
38989+
38990+/* allocate and init invert entry */
38991+
38992+static struct inv_entry *allocate_inv_entry(void)
38993+{
38994+ struct inv_entry *inv_entry;
38995+
38996+ inv_entry = reiser4_kmalloc(sizeof(struct inv_entry), GFP_KERNEL);
38997+ if (!inv_entry)
38998+ return ERR_PTR(RETERR(-ENOMEM));
38999+ inv_entry->ie_file = NULL;
39000+ inv_entry->ie_sd = NULL;
39001+ INIT_LIST_HEAD(&inv_entry->ie_list);
39002+ return inv_entry;
39003+}
39004+
39005+static int put_inv_entry(struct inv_entry *ientry)
39006+{
39007+ int result = 0;
39008+
39009+ assert("edward-96", ientry != NULL);
39010+
39011+	list_del(&ientry->ie_list);
39012+	/* close the subfile first, then free the entry itself */
39013+	if (ientry->ie_file != NULL)
39014+		result = filp_close(ientry->ie_file, NULL);
39015+	if (ientry->ie_sd != NULL)
39016+		kfree(ientry->ie_sd);
39017+	kfree(ientry);
39018+	return result;
39020+}
39021+
39022+static int allocate_incore_sd_base(struct inv_entry *inv_entry)
39023+{
39024+	struct incore_sd_base *isd_base;
39025+
39026+	assert("edward-98", inv_entry != NULL);
39027+	assert("edward-99", inv_entry->ie_file == NULL);
39028+	assert("edward-100", inv_entry->ie_sd == NULL);
39029+
39028+ isd_base = reiser4_kmalloc(sizeof(struct incore_sd_base), GFP_KERNEL);
39029+ if (!isd_base)
39030+ return RETERR(-ENOMEM);
39031+ inv_entry->ie_sd = isd_base;
39032+ return 0;
39033+}
39034+
39035+/* this can be installed as ->init_inv_entry () method of
39036+ item_plugins[ STATIC_STAT_DATA_IT ] (fs/reiser4/plugin/item/item.c).
39037+   Copies data from the on-disk stat-data format into a light-weight analog
39038+   of an inode. Doesn't handle stat-data extensions. */
39039+
39040+static void sd_base_load(struct inv_entry *inv_entry, char *sd)
39041+{
39042+ reiser4_stat_data_base *sd_base;
39043+
39044+ assert("edward-101", inv_entry != NULL);
39045+ assert("edward-101", inv_entry->ie_sd != NULL);
39046+ assert("edward-102", sd != NULL);
39047+
39048+ sd_base = (reiser4_stat_data_base *) sd;
39049+	inv_entry->ie_sd->isd_mode = d16tocpu(&sd_base->mode);
39050+	inv_entry->ie_sd->isd_nlink = d32tocpu(&sd_base->nlink);
39051+	inv_entry->ie_sd->isd_size = d64tocpu(&sd_base->size);
39052+	inv_entry->ie_sd->isd_data = NULL;
39053+}
39054+
39055+/* initialise incore stat-data */
39056+
39057+static void init_incore_sd_base(struct inv_entry *inv_entry, coord_t * coord)
39058+{
39059+ reiser4_plugin *plugin = item_plugin_by_coord(coord);
39060+ void *body = item_body_by_coord(coord);
39061+
39062+ assert("edward-103", inv_entry != NULL);
39063+ assert("edward-104", plugin != NULL);
39064+ assert("edward-105", body != NULL);
39065+
39066+ sd_base_load(inv_entry, body);
39067+}
39068+
39069+/* takes a key or filename and allocates new invert_entry,
39070+ init and adds it into the list,
39071+ we use lookup_sd_by_key() for light-weight files and VFS lookup by filename */
39072+
39073+int get_inv_entry(struct inode *invert_inode, /* inode of invert's body */
39074+ inv_entry_type type, /* LIGHT-WEIGHT or ORDINARY */
39075+ const reiser4_key * key, /* key of invert entry stat-data */
39076+ char *filename, /* filename of the file to be opened */
39077+ int flags, int mode)
39078+{
39079+ int result;
39080+ struct inv_entry *ientry;
39081+
39082+ assert("edward-107", invert_inode != NULL);
39083+
39084+ ientry = allocate_inv_entry();
39085+ if (IS_ERR(ientry))
39086+ return (PTR_ERR(ientry));
39087+
39088+ if (type == LIGHT_WEIGHT_FILE) {
39089+ coord_t coord;
39090+ lock_handle lh;
39091+
39092+ assert("edward-108", key != NULL);
39093+
39094+ init_coord(&coord);
39095+ init_lh(&lh);
39096+ result =
39097+ lookup_sd_by_key(tree_by_inode(invert_inode),
39098+ ZNODE_READ_LOCK, &coord, &lh, key);
39099+ if (result == 0)
39100+ init_incore_sd_base(ientry, &coord);
39101+
39102+ done_lh(&lh);
39103+ done_coord(&coord);
39104+ return (result);
39105+ } else {
39106+ struct file *file;
39107+
39108+ assert("edward-108", filename != NULL);
39109+
39110+ file = filp_open(filename, flags, mode);
39111+ /* FIXME_EDWARD here we need to check that we
39112+ didn't follow any mount point */
39113+
39112+ if (IS_ERR(file))
39113+ return (PTR_ERR(file));
39114+ ientry->ie_file = file;
39115+ return 0;
39116+ }
39117+}
39118+
39119+/* takes inode of invert, reads the body of this invert, parses it,
39120+ opens all invert entries and return pointer on the first inv_entry */
39121+
39122+struct inv_entry *open_invert(struct file *invert_file)
39123+{
39124+ /* stub: not implemented yet */
39125+ return NULL;
39126+}
39127+
39128+ssize_t subfile_read(struct inv_entry *invert_entry, flow * f)
39129+{
39130+ /* stub: not implemented yet */
39131+ return 0;
39132+}
39133+
39134+ssize_t subfile_write(struct inv_entry *invert_entry, flow * f)
39135+{
39136+ /* stub: not implemented yet */
39137+ return 0;
39138+}
39139+
39140+ssize_t invert_read(struct file *file, flow * f)
39141+{
39142+ /* stub: not implemented yet */
39143+ return 0;
39144+}
39145+
39146+ssize_t invert_write(struct file *file, flow * f)
39147+{
39148+ /* stub: not implemented yet */
39149+ return 0;
39150+}
39146+
39147+/* Make Linus happy.
39148+ Local variables:
39149+ c-indentation-style: "K&R"
39150+ mode-name: "LC"
39151+ c-basic-offset: 8
39152+ tab-width: 8
39153+ fill-column: 120
39154+ scroll-step: 1
39155+ End:
39156+*/
39157Index: linux-2.6.16/fs/reiser4/plugin/file/symfile.c
39158===================================================================
39159--- /dev/null
39160+++ linux-2.6.16/fs/reiser4/plugin/file/symfile.c
39161@@ -0,0 +1,87 @@
39162+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
39163+
39164+/* Symfiles are a generalization of Unix symlinks.
39165+
39166+ A symfile when read behaves as though you took its contents and
39167+ substituted them into the reiser4 naming system as the right hand side
39168+ of an assignment, and then read that which you had assigned to it.
39169+
39170+ A key issue for symfiles is how to implement writes through to
39171+ subfiles. In general, one must have some method of determining what
39172+ of that which is written to the symfile is written to what subfile.
39173+ This can be done by use of custom plugin methods written by users, or
39174+ by using a few general methods we provide for those willing to endure
39175+ the insertion of delimiters into what is read.
39176+
39177+ Writing to symfiles without delimiters to denote what is written to
39178+ what subfile is not supported by any plugins we provide in this
39179+ release. Our most sophisticated support for writes is that embodied
39180+ by the invert plugin (see invert.c).
39181+
39182+ A read only version of the /etc/passwd file might be
39183+ constructed as a symfile whose contents are as follows:
39184+
39185+ /etc/passwd/userlines/*
39186+
39187+ or
39188+
39189+ /etc/passwd/userlines/demidov+/etc/passwd/userlines/edward+/etc/passwd/userlines/reiser+/etc/passwd/userlines/root
39190+
39191+ or
39192+
39193+ /etc/passwd/userlines/(demidov+edward+reiser+root)
39194+
39195+ A symfile with contents
39196+
39197+ /filenameA+"(some text stored in the uninvertable symfile)+/filenameB
39198+
39199+ will return when read
39200+
39201+ The contents of filenameAsome text stored in the uninvertable symfileThe contents of filenameB
39202+
39203+ and write of what has been read will not be possible to implement as
39204+ an identity operation because there are no delimiters denoting the
39205+ boundaries of what is to be written to what subfile.
39206+
39207+ Note that one could make this a read/write symfile if one specified
39208+ delimiters, and the write method understood that those delimiters mark
39209+ the boundaries of what is written to each subfile.
39210+
39211+ So, specifying the symfile in a manner that allows writes:
39212+
39213+ /etc/passwd/userlines/demidov+"(
39214+ )+/etc/passwd/userlines/edward+"(
39215+ )+/etc/passwd/userlines/reiser+"(
39216+ )+/etc/passwd/userlines/root+"(
39217+ )
39218+
39219+ or
39220+
39221+ /etc/passwd/userlines/(demidov+"(
39222+ )+edward+"(
39223+ )+reiser+"(
39224+ )+root+"(
39225+ ))
39226+
39227+ and the file demidov might be specified as:
39228+
39229+ /etc/passwd/userlines/demidov/username+"(:)+/etc/passwd/userlines/demidov/password+"(:)+/etc/passwd/userlines/demidov/userid+"(:)+/etc/passwd/userlines/demidov/groupid+"(:)+/etc/passwd/userlines/demidov/gecos+"(:)+/etc/passwd/userlines/demidov/home+"(:)+/etc/passwd/userlines/demidov/shell
39230+
39231+ or
39232+
39233+ /etc/passwd/userlines/demidov/(username+"(:)+password+"(:)+userid+"(:)+groupid+"(:)+gecos+"(:)+home+"(:)+shell)
39234+
39235+ Notice that if the file demidov has a carriage return in it, the
39236+ parsing fails, but then if you put carriage returns in the wrong place
39237+ in a normal /etc/passwd file it breaks things also.
39238+
39239+ Note that it is forbidden to have no text between two interpolations
39240+ if one wants to be able to define what parts of a write go to what
39241+ subfiles referenced in an interpolation.
39242+
39243+ If one wants to be able to add new lines by writing to the file, one
39244+ must either write a custom plugin for /etc/passwd that knows how to
39245+ name an added line, or one must use an invert, or one must use a more
39246+ sophisticated symfile syntax that we are not planning to write for
39247+ version 4.0.
39248+*/
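/* A userspace sketch (illustrative only, not shipped in this patch) of how
 * the "+"-separated symfile syntax above could be tokenized into subfile
 * interpolations and "(...)-quoted literals. All names are hypothetical. */
#include <stdio.h>
#include <string.h>

static void tokenize_symfile(const char *body)
{
	const char *p = body;

	while (*p != '\0') {
		if (*p == '+') {
			/* skip the concatenation operator */
			p++;
		} else if (p[0] == '"' && p[1] == '(') {
			/* quoted literal: "(text), closed by ')' */
			const char *end = strchr(p + 2, ')');

			if (end == NULL)
				return;	/* unterminated literal */
			printf("literal: %.*s\n", (int)(end - p - 2), p + 2);
			p = end + 1;
		} else {
			/* interpolation: a subfile path up to the next '+' */
			size_t len = strcspn(p, "+");

			printf("subfile: %.*s\n", (int)len, p);
			p += len;
		}
	}
}

int main(void)
{
	tokenize_symfile("/filenameA+\"(some text)+/filenameB");
	return 0;
}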
39249Index: linux-2.6.16/fs/reiser4/plugin/file/symlink.c
39250===================================================================
39251--- /dev/null
39252+++ linux-2.6.16/fs/reiser4/plugin/file/symlink.c
39253@@ -0,0 +1,92 @@
39254+/* Copyright 2002, 2003, 2005 by Hans Reiser, licensing governed by reiser4/README */
39255+
39256+#include "../../inode.h"
39257+
39258+#include <linux/types.h>
39259+#include <linux/fs.h>
39260+
39261+/* file plugin methods specific for symlink files
39262+ (SYMLINK_FILE_PLUGIN_ID) */
39263+
39264+/* this is implementation of create_object method of file plugin for
39265+ SYMLINK_FILE_PLUGIN_ID
39266+ */
39267+
39268+/**
39269+ * create_symlink - create_object of file plugin for SYMLINK_FILE_PLUGIN_ID
39270+ * @symlink: inode of symlink object
39271+ * @dir: inode of parent directory
39272+ * @info: parameters of new object
39273+ *
39274+ * Inserts stat data with symlink extension into the tree.
39275+ */
39276+int create_symlink(struct inode *symlink,
39277+ struct inode *dir UNUSED_ARG,
39278+ reiser4_object_create_data *data /* info passed to us,
39279+ * this is filled by
39280+ * reiser4() syscall
39281+ * in particular */ )
39282+{
39283+ int result;
39284+
39285+ assert("nikita-680", symlink != NULL);
39286+ assert("nikita-681", S_ISLNK(symlink->i_mode));
39287+ assert("nikita-685", inode_get_flag(symlink, REISER4_NO_SD));
39288+ assert("nikita-682", dir != NULL);
39289+ assert("nikita-684", data != NULL);
39290+ assert("nikita-686", data->id == SYMLINK_FILE_PLUGIN_ID);
39291+
39292+ /*
39293+ * stat data of symlink has symlink extension in which we store
39294+ * symlink content, that is, path symlink is pointing to.
39295+ */
39296+ reiser4_inode_data(symlink)->extmask |= (1 << SYMLINK_STAT);
39297+
39298+ assert("vs-838", symlink->u.generic_ip == NULL);
39299+ symlink->u.generic_ip = (void *)data->name;
39300+
39301+ assert("vs-843", symlink->i_size == 0);
39302+ INODE_SET_FIELD(symlink, i_size, strlen(data->name));
39303+
39304+ /* insert stat data appended with data->name */
39305+ result = inode_file_plugin(symlink)->write_sd_by_inode(symlink);
39306+ if (result) {
39307+ /* FIXME-VS: Make sure that symlink->u.generic_ip is not attached
39308+ to kmalloced data */
39309+ INODE_SET_FIELD(symlink, i_size, 0);
39310+ } else {
39311+ assert("vs-849", symlink->u.generic_ip
39312+ && inode_get_flag(symlink, REISER4_GENERIC_PTR_USED));
39313+ assert("vs-850",
39314+ !memcmp((char *)symlink->u.generic_ip, data->name,
39315+ (size_t) symlink->i_size + 1));
39316+ }
39317+ return result;
39318+}
39319+
39320+/* this is implementation of destroy_inode method of file plugin for
39321+ SYMLINK_FILE_PLUGIN_ID
39322+ */
39323+void destroy_inode_symlink(struct inode *inode)
39324+{
39325+ assert("edward-799",
39326+ inode_file_plugin(inode) ==
39327+ file_plugin_by_id(SYMLINK_FILE_PLUGIN_ID));
39328+ assert("edward-800", !is_bad_inode(inode) && is_inode_loaded(inode));
39329+ assert("edward-801", inode_get_flag(inode, REISER4_GENERIC_PTR_USED));
39330+ assert("vs-839", S_ISLNK(inode->i_mode));
39331+
39332+ kfree(inode->u.generic_ip);
39333+ inode->u.generic_ip = NULL;
39334+ inode_clr_flag(inode, REISER4_GENERIC_PTR_USED);
39335+}
39336+
39337+/* Local variables:
39338+ c-indentation-style: "K&R"
39339+ mode-name: "LC"
39340+ c-basic-offset: 8
39341+ tab-width: 8
39342+ fill-column: 120
39343+ scroll-step: 1
39344+ End:
39345+*/
39346Index: linux-2.6.16/fs/reiser4/plugin/file/tail_conversion.c
39347===================================================================
39348--- /dev/null
39349+++ linux-2.6.16/fs/reiser4/plugin/file/tail_conversion.c
39350@@ -0,0 +1,728 @@
39351+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
39352+
39353+#include "../../inode.h"
39354+#include "../../super.h"
39355+#include "../../page_cache.h"
39356+#include "../../carry.h"
39357+#include "../../safe_link.h"
39358+#include "../../vfs_ops.h"
39359+
39360+#include <linux/writeback.h>
39361+
39362+/* this file contains:
39363+ tail2extent and extent2tail */
39364+
39365+/* exclusive access to a file is acquired when file state changes: tail2extent, empty2tail, extent2tail, etc */
39366+void get_exclusive_access(unix_file_info_t * uf_info)
39367+{
39368+ assert("nikita-3028", schedulable());
39369+ assert("nikita-3047", LOCK_CNT_NIL(inode_sem_w));
39370+ assert("nikita-3048", LOCK_CNT_NIL(inode_sem_r));
39371+ /*
39372+ * "deadlock avoidance": sometimes we commit a transaction under
39373+ * rw-semaphore on a file. Such commit can deadlock with another
39374+ * thread that captured some block (hence preventing atom from being
39375+ * committed) and waits on rw-semaphore.
39376+ */
39377+ txn_restart_current();
39378+ LOCK_CNT_INC(inode_sem_w);
39379+ down_write(&uf_info->latch);
39380+ uf_info->exclusive_use = 1;
39381+ assert("vs-1713", uf_info->ea_owner == NULL);
39382+ assert("vs-1713", atomic_read(&uf_info->nr_neas) == 0);
39383+ ON_DEBUG(uf_info->ea_owner = current);
39384+}
39385+
39386+void drop_exclusive_access(unix_file_info_t * uf_info)
39387+{
39388+ assert("vs-1714", uf_info->ea_owner == current);
39389+ assert("vs-1715", atomic_read(&uf_info->nr_neas) == 0);
39390+ ON_DEBUG(uf_info->ea_owner = NULL);
39391+ uf_info->exclusive_use = 0;
39392+ up_write(&uf_info->latch);
39393+ assert("nikita-3049", LOCK_CNT_NIL(inode_sem_r));
39394+ assert("nikita-3049", LOCK_CNT_GTZ(inode_sem_w));
39395+ LOCK_CNT_DEC(inode_sem_w);
39396+ txn_restart_current();
39397+}
39398+
39399+/**
39400+ * nea_grabbed - do something when file semaphore is down_read-ed
39401+ * @uf_info:
39402+ *
39403+ * This is called when nonexclusive access is obtained on a file. Everything
39404+ * it does is for debugging purposes only.
39405+ */
39406+static void nea_grabbed(unix_file_info_t *uf_info)
39407+{
39408+#if REISER4_DEBUG
39409+ LOCK_CNT_INC(inode_sem_r);
39410+ assert("vs-1716", uf_info->ea_owner == NULL);
39411+ atomic_inc(&uf_info->nr_neas);
39412+ uf_info->last_reader = current;
39413+#endif
39414+}
39415+
39416+/**
39417+ * get_nonexclusive_access - get nonexclusive access to a file
39418+ * @uf_info: unix file specific part of inode to obtain access to
39419+ *
39420+ * Nonexclusive access is obtained on a file before read, write, readpage.
39421+ */
39422+void get_nonexclusive_access(unix_file_info_t *uf_info)
39423+{
39424+ assert("nikita-3029", schedulable());
39425+ assert("nikita-3361", get_current_context()->trans->atom == NULL);
39426+
39427+ down_read(&uf_info->latch);
39428+ nea_grabbed(uf_info);
39429+}
39430+
39431+/**
39432+ * try_to_get_nonexclusive_access - try to get nonexclusive access to a file
39433+ * @uf_info: unix file specific part of inode to obtain access to
39434+ *
39435+ * Non-blocking version of nonexclusive access obtaining.
39436+ */
39437+int try_to_get_nonexclusive_access(unix_file_info_t *uf_info)
39438+{
39439+ int result;
39440+
39441+ result = down_read_trylock(&uf_info->latch);
39442+ if (result)
39443+ nea_grabbed(uf_info);
39444+ return result;
39445+}
39446+
39447+void drop_nonexclusive_access(unix_file_info_t * uf_info)
39448+{
39449+ assert("vs-1718", uf_info->ea_owner == NULL);
39450+ assert("vs-1719", atomic_read(&uf_info->nr_neas) > 0);
39451+ ON_DEBUG(atomic_dec(&uf_info->nr_neas));
39452+
39453+ up_read(&uf_info->latch);
39454+
39455+ LOCK_CNT_DEC(inode_sem_r);
39456+ txn_restart_current();
39457+}
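/* A userspace analogue (sketch, not reiser4 API) of the discipline above: a
 * pthread rwlock stands in for uf_info->latch. State changes such as
 * tail2extent take it exclusively; read, write and readpage paths take it
 * shared, so many readers can proceed concurrently. */
#include <pthread.h>

static pthread_rwlock_t latch = PTHREAD_RWLOCK_INITIALIZER;

static void convert(void)
{
	pthread_rwlock_wrlock(&latch);	/* get_exclusive_access() */
	/* ... change file state: tail2extent(), extent2tail() ... */
	pthread_rwlock_unlock(&latch);	/* drop_exclusive_access() */
}

static void read_path(void)
{
	pthread_rwlock_rdlock(&latch);	/* get_nonexclusive_access() */
	/* ... many readers may hold the latch concurrently ... */
	pthread_rwlock_unlock(&latch);	/* drop_nonexclusive_access() */
}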
39458+
39459+/* part of tail2extent. Cut all items covering @count bytes starting from
39460+ @offset */
39461+/* Audited by: green(2002.06.15) */
39462+static int cut_formatting_items(struct inode *inode, loff_t offset, int count)
39463+{
39464+ reiser4_key from, to;
39465+
39466+ /* AUDIT: How about putting an assertion here, what would check
39467+ all provided range is covered by tail items only? */
39468+ /* key of first byte in the range to be cut */
39469+ inode_file_plugin(inode)->key_by_inode(inode, offset, &from);
39470+
39471+ /* key of last byte in that range */
39472+ to = from;
39473+ set_key_offset(&to, (__u64) (offset + count - 1));
39474+
39475+ /* cut everything between those keys */
39476+ return cut_tree(tree_by_inode(inode), &from, &to, inode, 0);
39477+}
39478+
39479+static void release_all_pages(struct page **pages, unsigned nr_pages)
39480+{
39481+ unsigned i;
39482+
39483+ for (i = 0; i < nr_pages; i++) {
39484+ if (pages[i] == NULL) {
39485+ unsigned j;
39486+ for (j = i + 1; j < nr_pages; j++)
39487+ assert("vs-1620", pages[j] == NULL);
39488+ break;
39489+ }
39490+ page_cache_release(pages[i]);
39491+ pages[i] = NULL;
39492+ }
39493+}
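/* Example (illustrative): pages[] is filled densely from index 0, so the
 * first NULL marks the end of the used prefix; with nr_pages == 3 and
 * pages == {p0, NULL, NULL} only p0 is released. */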
39494+
39495+/* part of tail2extent. Replace tail items with an extent item. The content
39496+ of the tail items (@count bytes) being cut has already been copied into
39497+ pages. find_or_create_extent() is called to create extents corresponding
39498+ to those pages */
39499+static int replace(struct inode *inode, struct page **pages, unsigned nr_pages, int count)
39500+{
39501+ int result;
39502+ unsigned i;
39503+ STORE_COUNTERS;
39504+
39505+ if (nr_pages == 0)
39506+ return 0;
39507+
39508+ assert("vs-596", pages[0]);
39509+
39510+ /* cut copied items */
39511+ result =
39512+ cut_formatting_items(inode,
39513+ (loff_t) pages[0]->index << PAGE_CACHE_SHIFT,
39514+ count);
39515+ if (result)
39516+ return result;
39517+
39518+ CHECK_COUNTERS;
39519+
39520+ /* put into tree replacement for just removed items: extent item, namely */
39521+ for (i = 0; i < nr_pages; i++) {
39522+ result = add_to_page_cache_lru(pages[i], inode->i_mapping,
39523+ pages[i]->index,
39524+ mapping_gfp_mask(inode->
39525+ i_mapping));
39526+ if (result)
39527+ break;
39528+ unlock_page(pages[i]);
39529+ result = find_or_create_extent(pages[i]);
39530+ if (result)
39531+ break;
39532+ SetPageUptodate(pages[i]);
39533+ }
39534+ return result;
39535+}
39536+
39537+#define TAIL2EXTENT_PAGE_NUM 3 /* number of pages to fill before cutting tail
39538+ * items */
39539+
39540+static int reserve_tail2extent_iteration(struct inode *inode)
39541+{
39542+ reiser4_block_nr unformatted_nodes;
39543+ reiser4_tree *tree;
39544+
39545+ tree = tree_by_inode(inode);
39546+
39547+ /* number of unformatted nodes which will be created */
39548+ unformatted_nodes = TAIL2EXTENT_PAGE_NUM;
39549+
39550+ /*
39551+ * space required for one iteration of tail->extent conversion:
39552+ *
39553+ * 1. kill N tail items
39554+ *
39555+ * 2. insert TAIL2EXTENT_PAGE_NUM unformatted nodes
39556+ *
39557+ * 3. insert TAIL2EXTENT_PAGE_NUM (worst-case single-block
39558+ * extents) extent units.
39559+ *
39560+ * 4. drilling to the leaf level by coord_by_key()
39561+ *
39562+ * 5. possible update of stat-data
39563+ *
39564+ */
39565+ grab_space_enable();
39566+ return reiser4_grab_space
39567+ (2 * tree->height +
39568+ TAIL2EXTENT_PAGE_NUM +
39569+ TAIL2EXTENT_PAGE_NUM * estimate_one_insert_into_item(tree) +
39570+ 1 + estimate_one_insert_item(tree) +
39571+ inode_file_plugin(inode)->estimate.update(inode), BA_CAN_COMMIT);
39572+}
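/* Worked example with assumed costs (illustrative; not the values the real
 * estimate_*() helpers return): for a tree of height 5, if one insertion
 * into an existing item costs 5 blocks, one item insertion costs 6 blocks
 * and a stat-data update costs 1 block, an iteration reserves
 * 2*5 + 3 + 3*5 + 1 + 6 + 1 = 36 blocks. */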
39573+
39574+/* clear the stat-data flag indicating that the file is being converted */
39575+static int complete_conversion(struct inode *inode)
39576+{
39577+ int result;
39578+
39579+ grab_space_enable();
39580+ result =
39581+ reiser4_grab_space(inode_file_plugin(inode)->estimate.update(inode),
39582+ BA_CAN_COMMIT);
39583+ if (result == 0) {
39584+ inode_clr_flag(inode, REISER4_PART_MIXED);
39585+ result = reiser4_update_sd(inode);
39586+ }
39587+ if (result)
39588+ warning("vs-1696", "Failed to clear converting bit of %llu: %i",
39589+ (unsigned long long)get_inode_oid(inode), result);
39590+ return 0;
39591+}
39592+
39593+/**
39594+ * find_start
39595+ * @inode: inode of the file being converted
39596+ * @id: id of the item type (tail or extent) to look for
39597+ * @offset: in/out parameter: the offset to start searching from, set on
39598+ * return to the offset at which the previous conversion stopped
39599+ *
39600+ * this is used by tail2extent and extent2tail to detect where previous
39601+ * uncompleted conversion stopped
39602+ */
39602+static int find_start(struct inode *inode, reiser4_plugin_id id, __u64 *offset)
39603+{
39604+ int result;
39605+ lock_handle lh;
39606+ coord_t coord;
39607+ unix_file_info_t *ufo;
39608+ int found;
39609+ reiser4_key key;
39610+
39611+ ufo = unix_file_inode_data(inode);
39612+ init_lh(&lh);
39613+ result = 0;
39614+ found = 0;
39615+ inode_file_plugin(inode)->key_by_inode(inode, *offset, &key);
39616+ do {
39617+ init_lh(&lh);
39618+ result = find_file_item_nohint(&coord, &lh, &key,
39619+ ZNODE_READ_LOCK, inode);
39620+
39621+ if (result == CBK_COORD_FOUND) {
39622+ if (coord.between == AT_UNIT) {
39623+ /*coord_clear_iplug(&coord); */
39624+ result = zload(coord.node);
39625+ if (result == 0) {
39626+ if (item_id_by_coord(&coord) == id)
39627+ found = 1;
39628+ else
39629+ item_plugin_by_coord(&coord)->s.
39630+ file.append_key(&coord,
39631+ &key);
39632+ zrelse(coord.node);
39633+ }
39634+ } else
39635+ result = RETERR(-ENOENT);
39636+ }
39637+ done_lh(&lh);
39638+ } while (result == 0 && !found);
39639+ *offset = get_key_offset(&key);
39640+ return result;
39641+}
39642+
39643+/**
39644+ * tail2extent
39645+ * @uf_info: unix file specific part of the inode being converted
39646+ *
39647+ * Converts a file built of tail (formatting) items into one built of
39648+ * extent items, restarting an interrupted conversion if necessary.
39649+ */
39649+int tail2extent(unix_file_info_t *uf_info)
39650+{
39651+ int result;
39652+ reiser4_key key; /* key of next byte to be moved to page */
39653+ char *p_data; /* data of page */
39654+ unsigned page_off = 0, /* offset within the page where to copy data */
39655+ count; /* number of bytes of item which can be
39656+ * copied to page */
39657+ struct page *pages[TAIL2EXTENT_PAGE_NUM];
39658+ struct page *page;
39659+ int done; /* set to 1 when all file is read */
39660+ char *item;
39661+ int i;
39662+ struct inode *inode;
39663+ int first_iteration;
39664+ int bytes;
39665+ __u64 offset;
39666+
39667+ assert("nikita-3362", ea_obtained(uf_info));
39668+ inode = unix_file_info_to_inode(uf_info);
39669+ assert("nikita-3412", !IS_RDONLY(inode));
39670+ assert("vs-1649", uf_info->container != UF_CONTAINER_EXTENTS);
39671+ assert("", !inode_get_flag(inode, REISER4_PART_IN_CONV));
39672+
39673+ offset = 0;
39674+ first_iteration = 1;
39675+ result = 0;
39676+ if (inode_get_flag(inode, REISER4_PART_MIXED)) {
39677+ /*
39678+ * file is marked on disk as there was a conversion which did
39679+ * not complete due to either crash or some error. Find which
39680+ * offset tail conversion stopped at
39681+ */
39682+ result = find_start(inode, FORMATTING_ID, &offset);
39683+ if (result == -ENOENT) {
39684+ /* no tail items found, everything is converted */
39685+ uf_info->container = UF_CONTAINER_EXTENTS;
39686+ complete_conversion(inode);
39687+ return 0;
39688+ } else if (result != 0)
39689+ /* some other error */
39690+ return result;
39691+ first_iteration = 0;
39692+ }
39693+
39694+ inode_set_flag(inode, REISER4_PART_IN_CONV);
39695+
39696+ /* get key of first byte of a file */
39697+ inode_file_plugin(inode)->key_by_inode(inode, offset, &key);
39698+
39699+ done = 0;
39700+ while (done == 0) {
39701+ memset(pages, 0, sizeof(pages));
39702+ result = reserve_tail2extent_iteration(inode);
39703+ if (result != 0)
39704+ goto out;
39705+ if (first_iteration) {
39706+ inode_set_flag(inode, REISER4_PART_MIXED);
39707+ reiser4_update_sd(inode);
39708+ first_iteration = 0;
39709+ }
39710+ bytes = 0;
39711+ for (i = 0; i < sizeof_array(pages) && done == 0; i++) {
39712+ assert("vs-598",
39713+ (get_key_offset(&key) & ~PAGE_CACHE_MASK) == 0);
39714+ page = alloc_page(get_gfp_mask());
39715+ if (!page) {
39716+ result = RETERR(-ENOMEM);
39717+ goto error;
39718+ }
39719+
39720+ page->index =
39721+ (unsigned long)(get_key_offset(&key) >>
39722+ PAGE_CACHE_SHIFT);
39723+ /*
39724+ * usually whoever is going to longterm lock a znode (as
39725+ * find_file_item does, for instance) must not hold
39726+ * locked pages. However, there is an exception for the
39727+ * tail2extent case. Pages appearing here are not
39728+ * reachable by anyone else, they are clean, and they do
39729+ * not have jnodes attached, so keeping them locked does
39730+ * not risk deadlock
39731+ */
39732+ assert("vs-983", !PagePrivate(page));
39733+ reiser4_invalidate_pages(inode->i_mapping, page->index,
39734+ 1, 0);
39735+
39736+ for (page_off = 0; page_off < PAGE_CACHE_SIZE;) {
39737+ coord_t coord;
39738+ lock_handle lh;
39739+
39740+ /* get next item */
39741+ /* FIXME: we might want to readahead here */
39742+ init_lh(&lh);
39743+ result =
39744+ find_file_item_nohint(&coord, &lh, &key,
39745+ ZNODE_READ_LOCK,
39746+ inode);
39747+ if (result != CBK_COORD_FOUND) {
39748+ /*
39749+ * either an error happened or no items
39750+ * of the file were found
39751+ */
39752+ done_lh(&lh);
39753+ page_cache_release(page);
39754+ goto error;
39755+ }
39756+
39757+ if (coord.between == AFTER_UNIT) {
39758+ /*
39759+ * end of file is reached. Pad the
39760+ * page with zeros
39761+ */
39762+ done_lh(&lh);
39763+ done = 1;
39764+ p_data = kmap_atomic(page, KM_USER0);
39765+ memset(p_data + page_off, 0,
39766+ PAGE_CACHE_SIZE - page_off);
39767+ kunmap_atomic(p_data, KM_USER0);
39768+ break;
39769+ }
39770+
39771+ result = zload(coord.node);
39772+ if (result) {
39773+ page_cache_release(page);
39774+ done_lh(&lh);
39775+ goto error;
39776+ }
39777+ assert("vs-856", coord.between == AT_UNIT);
39778+ item = ((char *)item_body_by_coord(&coord)) +
39779+ coord.unit_pos;
39780+
39781+ /* how many bytes to copy */
39782+ count =
39783+ item_length_by_coord(&coord) -
39784+ coord.unit_pos;
39785+ /* limit length of copy to end of page */
39786+ if (count > PAGE_CACHE_SIZE - page_off)
39787+ count = PAGE_CACHE_SIZE - page_off;
39788+
39789+ /*
39790+ * copy item (as much as will fit starting from
39791+ * the beginning of the item) into the page
39792+ */
39793+ p_data = kmap_atomic(page, KM_USER0);
39794+ memcpy(p_data + page_off, item, count);
39795+ kunmap_atomic(p_data, KM_USER0);
39796+
39797+ page_off += count;
39798+ bytes += count;
39799+ set_key_offset(&key,
39800+ get_key_offset(&key) + count);
39801+
39802+ zrelse(coord.node);
39803+ done_lh(&lh);
39804+ } /* end of loop which fills one page by content of
39805+ * formatting items */
39806+
39807+ if (page_off) {
39808+ /* something was copied into page */
39809+ pages[i] = page;
39810+ } else {
39811+ page_cache_release(page);
39812+ assert("vs-1648", done == 1);
39813+ break;
39814+ }
39815+ } /* end of loop through pages of one conversion iteration */
39816+
39817+ if (i > 0) {
39818+ result = replace(inode, pages, i, bytes);
39819+ release_all_pages(pages, sizeof_array(pages));
39820+ if (result)
39821+ goto error;
39822+ /*
39823+ * we have to drop exclusive access to avoid deadlock
39824+ * which may happen because capture_unix_file, called by
39825+ * reiser4_writepages, needs to get
39826+ * non-exclusive access to a file. It is safe to drop
39827+ * EA in the middle of tail2extent conversion because
39828+ * write_unix_file/unix_setattr(truncate)/release_unix_file(extent2tail)
39829+ * are serialized by uf_info->write semaphore and
39830+ * because read_unix_file works (should at least) on
39831+ * partially converted files
39832+ */
39833+ drop_exclusive_access(uf_info);
39834+ /* throttle the conversion */
39835+ reiser4_throttle_write(inode);
39836+ get_exclusive_access(uf_info);
39837+
39838+ /*
39839+ * nobody is allowed to complete conversion but a
39840+ * process which started it
39841+ */
39842+ assert("", inode_get_flag(inode, REISER4_PART_MIXED));
39843+ }
39844+ }
39845+
39846+ inode_clr_flag(inode, REISER4_PART_IN_CONV);
39847+
39848+ if (result == 0) {
39849+ /* file is converted to extent items */
39850+ assert("vs-1697", inode_get_flag(inode, REISER4_PART_MIXED));
39851+
39852+ uf_info->container = UF_CONTAINER_EXTENTS;
39853+ complete_conversion(inode);
39854+ } else {
39855+ /*
39856+ * conversion is not complete. Inode was already marked as
39857+ * REISER4_PART_MIXED and stat-data was updated at the first
39858+ * iteration of the loop above.
39859+ */
39860+ error:
39861+ release_all_pages(pages, sizeof_array(pages));
39862+ warning("nikita-2282", "Partial conversion of %llu: %i",
39863+ (unsigned long long)get_inode_oid(inode), result);
39864+ }
39865+
39866+ out:
39867+ return result;
39868+}
39869+
39870+static int reserve_extent2tail_iteration(struct inode *inode)
39871+{
39872+ reiser4_tree *tree;
39873+
39874+ tree = tree_by_inode(inode);
39875+ /*
39876+ * reserve blocks for (in this order):
39877+ *
39878+ * 1. removal of extent item
39879+ *
39880+ * 2. insertion of tail by insert_flow()
39881+ *
39882+ * 3. drilling to the leaf level by coord_by_key()
39883+ *
39884+ * 4. possible update of stat-data
39885+ */
39886+ grab_space_enable();
39887+ return reiser4_grab_space
39888+ (estimate_one_item_removal(tree) +
39889+ estimate_insert_flow(tree->height) +
39890+ 1 + estimate_one_insert_item(tree) +
39891+ inode_file_plugin(inode)->estimate.update(inode), BA_CAN_COMMIT);
39892+}
39893+
39894+static int filler(void *vp, struct page *page)
39895+{
39896+ return readpage_unix_file_nolock(vp, page);
39897+}
39898+
39899+/* for every page of file: read page, cut part of extent pointing to this page,
39900+ put data of page tree by tail item */
39901+int extent2tail(unix_file_info_t *uf_info)
39902+{
39903+ int result;
39904+ struct inode *inode;
39905+ struct page *page;
39906+ unsigned long num_pages, i;
39907+ unsigned long start_page;
39908+ reiser4_key from;
39909+ reiser4_key to;
39910+ unsigned count;
39911+ __u64 offset;
39912+
39913+ assert("nikita-3362", ea_obtained(uf_info));
39914+ inode = unix_file_info_to_inode(uf_info);
39915+ assert("nikita-3412", !IS_RDONLY(inode));
39916+ assert("vs-1649", uf_info->container != UF_CONTAINER_TAILS);
39917+ assert("", !inode_get_flag(inode, REISER4_PART_IN_CONV));
39918+
39919+ offset = 0;
39920+ if (inode_get_flag(inode, REISER4_PART_MIXED)) {
39921+ /*
39922+ * file is marked on disk as there was a conversion which did
39923+ * not complete due to either crash or some error. Find which
39924+ * offset tail conversion stopped at
39925+ */
39926+ result = find_start(inode, EXTENT_POINTER_ID, &offset);
39927+ if (result == -ENOENT) {
39928+ /* no extent found, everything is converted */
39929+ uf_info->container = UF_CONTAINER_TAILS;
39930+ complete_conversion(inode);
39931+ return 0;
39932+ } else if (result != 0)
39933+ /* some other error */
39934+ return result;
39935+ }
39936+
39937+ inode_set_flag(inode, REISER4_PART_IN_CONV);
39938+
39939+ /* number of pages in the file */
39940+ num_pages =
39941+ (inode->i_size - offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
39942+ start_page = offset >> PAGE_CACHE_SHIFT;
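	/* e.g. (illustrative numbers): with 4K pages, i_size == 10000 and a
	 * conversion resumed at offset == 4096, num_pages ==
	 * (10000 - 4096 + 4095) >> 12 == 2 and start_page == 1, i.e. pages
	 * 1 and 2 remain to be converted. */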
39943+
39944+ inode_file_plugin(inode)->key_by_inode(inode, offset, &from);
39945+ to = from;
39946+
39947+ result = 0;
39948+ for (i = 0; i < num_pages; i++) {
39949+ __u64 start_byte;
39950+
39951+ result = reserve_extent2tail_iteration(inode);
39952+ if (result != 0)
39953+ break;
39954+ if (i == 0 && offset == 0) {
39955+ inode_set_flag(inode, REISER4_PART_MIXED);
39956+ reiser4_update_sd(inode);
39957+ }
39958+
39959+ page = read_cache_page(inode->i_mapping,
39960+ (unsigned)(i + start_page), filler, NULL);
39961+ if (IS_ERR(page)) {
39962+ result = PTR_ERR(page);
39963+ break;
39964+ }
39965+
39966+ wait_on_page_locked(page);
39967+
39968+ if (!PageUptodate(page)) {
39969+ page_cache_release(page);
39970+ result = RETERR(-EIO);
39971+ break;
39972+ }
39973+
39974+ /* cut part of file we have read */
39975+ start_byte = (__u64) ((i + start_page) << PAGE_CACHE_SHIFT);
39976+ set_key_offset(&from, start_byte);
39977+ set_key_offset(&to, start_byte + PAGE_CACHE_SIZE - 1);
39978+ /*
39979+ * cut_tree_object() returns -E_REPEAT to allow atom
39980+ * commits during over-long truncates. But
39981+ * extent->tail conversion should be performed in one
39982+ * transaction.
39983+ */
39984+ result = cut_tree(tree_by_inode(inode), &from, &to, inode, 0);
39985+
39986+ if (result) {
39987+ page_cache_release(page);
39988+ break;
39989+ }
39990+
39991+ /* put page data into tree via tail_write */
39992+ count = PAGE_CACHE_SIZE;
39993+ if ((i == (num_pages - 1)) &&
39994+ (inode->i_size & ~PAGE_CACHE_MASK))
39995+ /* the last page may be incomplete */
39996+ count = (inode->i_size & ~PAGE_CACHE_MASK);
39997+ while (count) {
39998+ struct dentry dentry;
39999+ struct file file;
40000+ loff_t pos;
40001+
40002+ dentry.d_inode = inode;
40003+ file.f_dentry = &dentry;
40004+ file.private_data = NULL;
40005+ file.f_pos = start_byte;
40007+ pos = start_byte;
40008+ result = write_tail(&file, (char __user *)kmap(page),
40009+ count, &pos);
40010+ reiser4_free_file_fsdata(&file);
40011+ if (result <= 0) {
40012+ warning("", "write_tail failed");
40013+ page_cache_release(page);
40014+ inode_clr_flag(inode, REISER4_PART_IN_CONV);
40015+ return result;
40016+ }
40017+ count -= result;
40018+ }
40019+
40020+ /* release page */
40021+ lock_page(page);
40022+ /* page is already detached from jnode and mapping. */
40023+ assert("vs-1086", page->mapping == NULL);
40024+ assert("nikita-2690",
40025+ (!PagePrivate(page) && jprivate(page) == 0));
40026+ /* waiting for writeback completion with page lock held is
40027+ * perfectly valid. */
40028+ wait_on_page_writeback(page);
40029+ drop_page(page);
40030+ /* release reference taken by read_cache_page() above */
40031+ page_cache_release(page);
40032+
40033+ drop_exclusive_access(uf_info);
40034+ /* throttle the conversion */
40035+ reiser4_throttle_write(inode);
40036+ get_exclusive_access(uf_info);
40037+ /*
40038+ * nobody is allowed to complete conversion but a process which
40039+ * started it
40040+ */
40041+ assert("", inode_get_flag(inode, REISER4_PART_MIXED));
40042+ }
40043+
40044+ inode_clr_flag(inode, REISER4_PART_IN_CONV);
40045+
40046+ if (i == num_pages) {
40047+ /* file is converted to formatted items */
40048+ assert("vs-1698", inode_get_flag(inode, REISER4_PART_MIXED));
40049+ assert("vs-1260",
40050+ inode_has_no_jnodes(reiser4_inode_data(inode)));
40051+
40052+ uf_info->container = UF_CONTAINER_TAILS;
40053+ complete_conversion(inode);
40054+ return 0;
40055+ }
40056+ /*
40057+ * conversion is not complete. Inode was already marked as
40058+ * REISER4_PART_MIXED and stat-data was updated at the first
40059+ * iteration of the loop above.
40060+ */
40061+ warning("nikita-2282",
40062+ "Partial conversion of %llu: %lu of %lu: %i",
40063+ (unsigned long long)get_inode_oid(inode), i,
40064+ num_pages, result);
40065+
40066+ return result;
40067+}
40068+
40069+/*
40070+ * Local variables:
40071+ * c-indentation-style: "K&R"
40072+ * mode-name: "LC"
40073+ * c-basic-offset: 8
40074+ * tab-width: 8
40075+ * fill-column: 79
40076+ * scroll-step: 1
40077+ * End:
40078+ */
40079Index: linux-2.6.16/fs/reiser4/plugin/file_ops.c
40080===================================================================
40081--- /dev/null
40082+++ linux-2.6.16/fs/reiser4/plugin/file_ops.c
40083@@ -0,0 +1,167 @@
40084+/* Copyright 2005 by Hans Reiser, licensing governed by
40085+ reiser4/README */
40086+
40087+/* this file contains typical implementations for some of methods of
40088+ struct file_operations and of struct address_space_operations
40089+*/
40090+
40091+#include "../inode.h"
40092+#include "object.h"
40093+
40094+/* file operations */
40095+
40096+/* implementation of vfs's llseek method of struct file_operations for
40097+ typical directory can be found in file_ops_readdir.c
40098+*/
40099+loff_t llseek_common_dir(struct file *, loff_t, int origin);
40100+
40101+/* implementation of vfs's readdir method of struct file_operations for
40102+ typical directory can be found in readdir_common.c
40103+*/
40104+int readdir_common(struct file *, void *dirent, filldir_t);
40105+
40106+/**
40107+ * release_dir_common - release method of struct file_operations
40108+ * @inode: inode of released file
40109+ * @file: file to release
40110+ *
40111+ * Implementation of release method of struct file_operations for typical
40112+ * directory. All it does is free the reiser4-specific file data.
40113+*/
40114+int release_dir_common(struct inode *inode, struct file *file)
40115+{
40116+ reiser4_context *ctx;
40117+
40118+ ctx = init_context(inode->i_sb);
40119+ if (IS_ERR(ctx))
40120+ return PTR_ERR(ctx);
40121+ reiser4_free_file_fsdata(file);
40122+ reiser4_exit_context(ctx);
40123+ return 0;
40124+}
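/* The bracketing above recurs throughout these files: every VFS entry point
 * wraps its work in init_context()/reiser4_exit_context(). A schematic
 * sketch (some_method_common is hypothetical, not defined by this patch): */
static int some_method_common(struct inode *inode)
{
	reiser4_context *ctx;
	int result;

	ctx = init_context(inode->i_sb);
	if (IS_ERR(ctx))
		return PTR_ERR(ctx);
	result = 0;	/* ... the method's real work goes here ... */
	reiser4_exit_context(ctx);
	return result;
}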
40125+
40126+/* this is common implementation of vfs's fsync method of struct
40127+ file_operations
40128+*/
40129+int sync_common(struct file *file, struct dentry *dentry, int datasync)
40130+{
40131+ reiser4_context *ctx;
40132+ int result;
40133+
40134+ ctx = init_context(dentry->d_inode->i_sb);
40135+ if (IS_ERR(ctx))
40136+ return PTR_ERR(ctx);
40137+ result = txnmgr_force_commit_all(dentry->d_inode->i_sb, 0);
40138+
40139+ context_set_commit_async(ctx);
40140+ reiser4_exit_context(ctx);
40141+ return result;
40142+}
40143+
40144+/* this is common implementation of vfs's sendfile method of struct
40145+ file_operations
40146+
40147+ Reads @count bytes from @file and calls @actor for every page read. This is
40148+ needed for loop back devices support.
40149+*/
40150+#if 0
40151+ssize_t
40152+sendfile_common(struct file *file, loff_t *ppos, size_t count,
40153+ read_actor_t actor, void *target)
40154+{
40155+ reiser4_context *ctx;
40156+ ssize_t result;
40157+
40158+ ctx = init_context(file->f_dentry->d_inode->i_sb);
40159+ if (IS_ERR(ctx))
40160+ return PTR_ERR(ctx);
40161+ result = generic_file_sendfile(file, ppos, count, actor, target);
40162+ reiser4_exit_context(ctx);
40163+ return result;
40164+}
40165+#endif /* 0 */
40166+
40167+/* address space operations */
40168+
40169+/* this is common implementation of vfs's prepare_write method of struct
40170+ address_space_operations
40171+*/
40172+int
40173+prepare_write_common(struct file *file, struct page *page, unsigned from,
40174+ unsigned to)
40175+{
40176+ reiser4_context *ctx;
40177+ int result;
40178+
40179+ ctx = init_context(page->mapping->host->i_sb);
40180+ if (IS_ERR(ctx))
40181+ return PTR_ERR(ctx);
40180+ result = do_prepare_write(file, page, from, to);
40181+
40182+ /* don't commit transaction under inode semaphore */
40183+ context_set_commit_async(ctx);
40184+ reiser4_exit_context(ctx);
40185+
40186+ return result;
40187+}
40188+
40189+/* this is helper for prepare_write_common and prepare_write_unix_file
40190+ */
40191+int
40192+do_prepare_write(struct file *file, struct page *page, unsigned from,
40193+ unsigned to)
40194+{
40195+ int result;
40196+ file_plugin *fplug;
40197+ struct inode *inode;
40198+
40199+ assert("umka-3099", file != NULL);
40200+ assert("umka-3100", page != NULL);
40201+ assert("umka-3095", PageLocked(page));
40202+
40203+ if (to - from == PAGE_CACHE_SIZE || PageUptodate(page))
40204+ return 0;
40205+
40206+ inode = page->mapping->host;
40207+ fplug = inode_file_plugin(inode);
40208+
40209+ if (page->mapping->a_ops->readpage == NULL)
40210+ return RETERR(-EINVAL);
40211+
40212+ result = page->mapping->a_ops->readpage(file, page);
40213+ if (result != 0) {
40214+ SetPageError(page);
40215+ ClearPageUptodate(page);
40216+ /* All reiser4 readpage() implementations should return the
40217+ * page locked in case of error. */
40218+ assert("nikita-3472", PageLocked(page));
40219+ } else {
40220+ /*
40221+ * ->readpage() either:
40222+ *
40223+ * 1. starts IO against @page. @page is locked for IO in
40224+ * this case.
40225+ *
40226+ * 2. doesn't start IO. @page is unlocked.
40227+ *
40228+ * In either case, page should be locked.
40229+ */
40230+ lock_page(page);
40231+ /*
40232+ * IO (if any) is completed at this point. Check for IO
40233+ * errors.
40234+ */
40235+ if (!PageUptodate(page))
40236+ result = RETERR(-EIO);
40237+ }
40238+ assert("umka-3098", PageLocked(page));
40239+ return result;
40240+}
40241+
40242+/*
40243+ * Local variables:
40244+ * c-indentation-style: "K&R"
40245+ * mode-name: "LC"
40246+ * c-basic-offset: 8
40247+ * tab-width: 8
40248+ * fill-column: 79
40249+ * End:
40250+ */
40251Index: linux-2.6.16/fs/reiser4/plugin/file_ops_readdir.c
40252===================================================================
40253--- /dev/null
40254+++ linux-2.6.16/fs/reiser4/plugin/file_ops_readdir.c
40255@@ -0,0 +1,654 @@
40256+/* Copyright 2005 by Hans Reiser, licensing governed by
40257+ * reiser4/README */
40258+
40259+#include "../inode.h"
40260+
40261+/* return true iff @coord points to a valid directory item that is part of
40262+ * the @inode directory. */
40263+static int is_valid_dir_coord(struct inode *inode, coord_t * coord)
40264+{
40265+ return
40266+ item_type_by_coord(coord) == DIR_ENTRY_ITEM_TYPE &&
40267+ inode_file_plugin(inode)->owns_item(inode, coord);
40268+}
40269+
40270+/* compare two logical positions within the same directory */
40271+static cmp_t dir_pos_cmp(const dir_pos * p1, const dir_pos * p2)
40272+{
40273+ cmp_t result;
40274+
40275+ assert("nikita-2534", p1 != NULL);
40276+ assert("nikita-2535", p2 != NULL);
40277+
40278+ result = de_id_cmp(&p1->dir_entry_key, &p2->dir_entry_key);
40279+ if (result == EQUAL_TO) {
40280+ int diff;
40281+
40282+ diff = p1->pos - p2->pos;
40283+ result =
40284+ (diff < 0) ? LESS_THAN : (diff ? GREATER_THAN : EQUAL_TO);
40285+ }
40286+ return result;
40287+}
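/* Example: p1 == {key K, pos 2} and p2 == {key K, pos 5} compare EQUAL_TO on
 * the key alone, so the duplicate counter decides and the result is
 * LESS_THAN. */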
40288+
40289+
40290+/* see comment before readdir_common() for overview of why "adjustment" is
40291+ * necessary. */
40292+static void
40293+adjust_dir_pos(struct file *dir,
40294+ readdir_pos * readdir_spot, const dir_pos * mod_point, int adj)
40295+{
40296+ dir_pos *pos;
40297+
40298+ /*
40299+ * new directory entry was added (adj == +1) or removed (adj == -1) at
40300+ * the @mod_point. Directory file descriptor @dir is doing readdir and
40301+ * is currently positioned at @readdir_spot. Latter has to be updated
40302+ * to maintain stable readdir.
40303+ */
40304+ /* directory is positioned to the beginning. */
40305+ if (readdir_spot->entry_no == 0)
40306+ return;
40307+
40308+ pos = &readdir_spot->position;
40309+ switch (dir_pos_cmp(mod_point, pos)) {
40310+ case LESS_THAN:
40311+ /* @mod_point is _before_ @readdir_spot, that is, the entry was
40312+ * added/removed on the left (in key order) of current
40313+ * position. */
40314+ /* logical number of directory entry readdir is "looking" at
40315+ * changes */
40316+ readdir_spot->entry_no += adj;
40317+ assert("nikita-2577",
40318+ ergo(dir != NULL, get_dir_fpos(dir) + adj >= 0));
40319+ if (de_id_cmp(&pos->dir_entry_key,
40320+ &mod_point->dir_entry_key) == EQUAL_TO) {
40321+ assert("nikita-2575", mod_point->pos < pos->pos);
40322+ /*
40323+ * if entry added/removed has the same key as current
40324+ * for readdir, update counter of duplicate keys in
40325+ * @readdir_spot.
40326+ */
40327+ pos->pos += adj;
40328+ }
40329+ break;
40330+ case GREATER_THAN:
40331+ /* directory is modified after @pos: nothing to do. */
40332+ break;
40333+ case EQUAL_TO:
40334+ /* cannot insert an entry readdir is looking at, because it
40335+ already exists. */
40336+ assert("nikita-2576", adj < 0);
40337+ /* directory entry to which @pos points to is being
40338+ removed.
40339+
40340+ NOTE-NIKITA: Right thing to do is to update @pos to point
40341+ to the next entry. This is complex (we are under spin-lock
40342+ for one thing). Just rewind it to the beginning. Next
40343+ readdir will have to scan the beginning of
40344+ directory. Proper solution is to use semaphore in
40345+ spin lock's stead and use rewind_right() here.
40346+
40347+ NOTE-NIKITA: now, semaphore is used, so...
40348+ */
40349+ memset(readdir_spot, 0, sizeof *readdir_spot);
40350+ }
40351+}
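/* Example: a directory holds names A, B, C and a reader sits at B
 * (entry_no == 1). Removing A calls this with adj == -1 and a @mod_point to
 * the left of the cursor, so entry_no drops to 0 and the next readdir still
 * continues at B instead of skipping C. */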
40352+
40353+/* scan all file-descriptors for this directory and adjust their
40354+ positions respectively. Should be used by implementations of
40355+ add_entry and rem_entry of dir plugin */
40356+void
40357+adjust_dir_file(struct inode *dir, const struct dentry *de, int offset, int adj)
40358+{
40359+ reiser4_file_fsdata *scan;
40360+ dir_pos mod_point;
40361+
40362+ assert("nikita-2536", dir != NULL);
40363+ assert("nikita-2538", de != NULL);
40364+ assert("nikita-2539", adj != 0);
40365+
40366+ build_de_id(dir, &de->d_name, &mod_point.dir_entry_key);
40367+ mod_point.pos = offset;
40368+
40369+ spin_lock_inode(dir);
40370+
40371+ /*
40372+ * new entry was added/removed in directory @dir. Scan all file
40373+ * descriptors for @dir that are currently involved into @readdir and
40374+ * update them.
40375+ */
40376+
40377+ list_for_each_entry(scan, get_readdir_list(dir), dir.linkage)
40378+ adjust_dir_pos(scan->back, &scan->dir.readdir, &mod_point, adj);
40379+
40380+ spin_unlock_inode(dir);
40381+}
40382+
40383+/*
40384+ * traverse tree to start/continue readdir from the readdir position @pos.
40385+ */
40386+static int dir_go_to(struct file *dir, readdir_pos * pos, tap_t * tap)
40387+{
40388+ reiser4_key key;
40389+ int result;
40390+ struct inode *inode;
40391+
40392+ assert("nikita-2554", pos != NULL);
40393+
40394+ inode = dir->f_dentry->d_inode;
40395+ result = inode_dir_plugin(inode)->build_readdir_key(dir, &key);
40396+ if (result != 0)
40397+ return result;
40398+ result = object_lookup(inode,
40399+ &key,
40400+ tap->coord,
40401+ tap->lh,
40402+ tap->mode,
40403+ FIND_EXACT,
40404+ LEAF_LEVEL, LEAF_LEVEL, 0, &tap->ra_info);
40405+ if (result == CBK_COORD_FOUND)
40406+ result = rewind_right(tap, (int)pos->position.pos);
40407+ else {
40408+ tap->coord->node = NULL;
40409+ done_lh(tap->lh);
40410+ result = RETERR(-EIO);
40411+ }
40412+ return result;
40413+}
40414+
40415+/*
40416+ * handling of non-unique keys: calculate at what ordinal position within
40417+ * sequence of directory items with identical keys @pos is.
40418+ */
40419+static int set_pos(struct inode *inode, readdir_pos * pos, tap_t * tap)
40420+{
40421+ int result;
40422+ coord_t coord;
40423+ lock_handle lh;
40424+ tap_t scan;
40425+ de_id *did;
40426+ reiser4_key de_key;
40427+
40428+ coord_init_zero(&coord);
40429+ init_lh(&lh);
40430+ tap_init(&scan, &coord, &lh, ZNODE_READ_LOCK);
40431+ tap_copy(&scan, tap);
40432+ tap_load(&scan);
40433+ pos->position.pos = 0;
40434+
40435+ did = &pos->position.dir_entry_key;
40436+
40437+ if (is_valid_dir_coord(inode, scan.coord)) {
40438+
40439+ build_de_id_by_key(unit_key_by_coord(scan.coord, &de_key), did);
40440+
40441+ while (1) {
40442+
40443+ result = go_prev_unit(&scan);
40444+ if (result != 0)
40445+ break;
40446+
40447+ if (!is_valid_dir_coord(inode, scan.coord)) {
40448+ result = -EINVAL;
40449+ break;
40450+ }
40451+
40452+ /* get key of directory entry */
40453+ unit_key_by_coord(scan.coord, &de_key);
40454+ if (de_id_key_cmp(did, &de_key) != EQUAL_TO) {
40455+ /* duplicate-sequence is over */
40456+ break;
40457+ }
40458+ pos->position.pos++;
40459+ }
40460+ } else
40461+ result = RETERR(-ENOENT);
40462+ tap_relse(&scan);
40463+ tap_done(&scan);
40464+ return result;
40465+}
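/* Example: if three directory entries share one key and @tap sits on the
 * third of them, the loop above steps left twice before the key changes,
 * leaving pos->position.pos == 2. */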
40466+
40467+/*
40468+ * "rewind" directory to @offset, i.e., set @pos and @tap correspondingly.
40469+ */
40470+static int dir_rewind(struct file *dir, readdir_pos * pos, tap_t * tap)
40471+{
40472+ __u64 destination;
40473+ __s64 shift;
40474+ int result;
40475+ struct inode *inode;
40476+ loff_t dirpos;
40477+
40478+ assert("nikita-2553", dir != NULL);
40479+ assert("nikita-2548", pos != NULL);
40480+ assert("nikita-2551", tap->coord != NULL);
40481+ assert("nikita-2552", tap->lh != NULL);
40482+
40483+ dirpos = get_dir_fpos(dir);
40484+ shift = dirpos - pos->fpos;
40485+ /* this is logical directory entry within @dir which we are rewinding
40486+ * to */
40487+ destination = pos->entry_no + shift;
40488+
40489+ inode = dir->f_dentry->d_inode;
40490+ if (dirpos < 0)
40491+ return RETERR(-EINVAL);
40492+ else if (destination == 0ll || dirpos == 0) {
40493+ /* rewind to the beginning of directory */
40494+ memset(pos, 0, sizeof *pos);
40495+ return dir_go_to(dir, pos, tap);
40496+ } else if (destination >= inode->i_size)
40497+ return RETERR(-ENOENT);
40498+
40499+ if (shift < 0) {
40500+ /* I am afraid of negative numbers */
40501+ shift = -shift;
40502+ /* rewinding to the left */
40503+ if (shift <= (int)pos->position.pos) {
40504+ /* destination is within sequence of entries with
40505+ duplicate keys. */
40506+ result = dir_go_to(dir, pos, tap);
40507+ } else {
40508+ shift -= pos->position.pos;
40509+ while (1) {
40510+ /* repetitions: deadlock is possible when
40511+ going to the left. */
40512+ result = dir_go_to(dir, pos, tap);
40513+ if (result == 0) {
40514+ result = rewind_left(tap, shift);
40515+ if (result == -E_DEADLOCK) {
40516+ tap_done(tap);
40517+ continue;
40518+ }
40519+ }
40520+ break;
40521+ }
40522+ }
40523+ } else {
40524+ /* rewinding to the right */
40525+ result = dir_go_to(dir, pos, tap);
40526+ if (result == 0)
40527+ result = rewind_right(tap, shift);
40528+ }
40529+ if (result == 0) {
40530+ result = set_pos(inode, pos, tap);
40531+ if (result == 0) {
40532+ /* update pos->position.pos */
40533+ pos->entry_no = destination;
40534+ pos->fpos = dirpos;
40535+ }
40536+ }
40537+ return result;
40538+}
40539+
40540+/*
40541+ * Function that is called by readdir_common() on each directory entry while
40542+ * doing readdir. The ->filldir callback may block, so we have to release the
40543+ * long term lock while calling it. To avoid repeating tree traversal, a seal is used. If the
40544+ * seal is broken, we return -E_REPEAT. Node is unlocked in this case.
40545+ *
40546+ * Whether node is unlocked in case of any other error is undefined. It is
40547+ * guaranteed to be still locked if success (0) is returned.
40548+ *
40549+ * When ->filldir() wants no more, feed_entry() returns 1, and node is
40550+ * unlocked.
40551+ */
40552+static int
40553+feed_entry(struct file *f,
40554+ readdir_pos * pos, tap_t * tap, filldir_t filldir, void *dirent)
40555+{
40556+ item_plugin *iplug;
40557+ char *name;
40558+ reiser4_key sd_key;
40559+ int result;
40560+ char buf[DE_NAME_BUF_LEN];
40561+ char name_buf[32];
40562+ char *local_name;
40563+ unsigned file_type;
40564+ seal_t seal;
40565+ coord_t *coord;
40566+ reiser4_key entry_key;
40567+
40568+ coord = tap->coord;
40569+ iplug = item_plugin_by_coord(coord);
40570+
40571+ /* pointer to name within the node */
40572+ name = iplug->s.dir.extract_name(coord, buf);
40573+ assert("nikita-1371", name != NULL);
40574+
40575+ /* key of object the entry points to */
40576+ if (iplug->s.dir.extract_key(coord, &sd_key) != 0)
40577+ return RETERR(-EIO);
40578+
40579+ /* we must release longterm znode lock before calling filldir to avoid
40580+ deadlock which may happen if filldir causes page fault. So, copy
40581+ name to intermediate buffer */
40582+ if (strlen(name) + 1 > sizeof(name_buf)) {
40583+ local_name = kmalloc(strlen(name) + 1, get_gfp_mask());
40584+ if (local_name == NULL)
40585+ return RETERR(-ENOMEM);
40586+ } else
40587+ local_name = name_buf;
40588+
40589+ strcpy(local_name, name);
40590+ file_type = iplug->s.dir.extract_file_type(coord);
40591+
40592+ unit_key_by_coord(coord, &entry_key);
40593+ seal_init(&seal, coord, &entry_key);
40594+
40595+ longterm_unlock_znode(tap->lh);
40596+
40597+ /*
40598+ * send information about directory entry to the ->filldir() filler
40599+ * supplied to us by caller (VFS).
40600+ *
40601+ * ->filldir is entitled to do weird things. For example, ->filldir
40602+ * supplied by knfsd re-enters file system. Make sure no locks are
40603+ * held.
40604+ */
40605+ assert("nikita-3436", lock_stack_isclean(get_current_lock_stack()));
40606+
40607+ result = filldir(dirent, name, (int)strlen(name),
40608+ /* offset of this entry */
40609+ f->f_pos,
40610+ /* inode number of the object bound by this entry */
40611+ oid_to_uino(get_key_objectid(&sd_key)), file_type);
40612+ if (local_name != name_buf)
40613+ kfree(local_name);
40614+ if (result < 0)
40615+ /* ->filldir() is satisfied. (no space in buffer, IOW) */
40616+ result = 1;
40617+ else
40618+ result = seal_validate(&seal, coord, &entry_key,
40619+ tap->lh, tap->mode, ZNODE_LOCK_HIPRI);
40620+ return result;
40621+}
40622+
40623+static void move_entry(readdir_pos * pos, coord_t * coord)
40624+{
40625+ reiser4_key de_key;
40626+ de_id *did;
40627+
40628+ /* update @pos */
40629+ ++pos->entry_no;
40630+ did = &pos->position.dir_entry_key;
40631+
40632+ /* get key of directory entry */
40633+ unit_key_by_coord(coord, &de_key);
40634+
40635+ if (de_id_key_cmp(did, &de_key) == EQUAL_TO)
40636+ /* we are within sequence of directory entries
40637+ with duplicate keys. */
40638+ ++pos->position.pos;
40639+ else {
40640+ pos->position.pos = 0;
40641+ build_de_id_by_key(&de_key, did);
40642+ }
40643+ ++pos->fpos;
40644+}
40645+
40646+/*
40647+ * STATELESS READDIR
40648+ *
40649+ * readdir support in reiser4 relies on ability to update readdir_pos embedded
40650+ * into reiser4_file_fsdata on each directory modification (name insertion and
40651+ * removal), see readdir_common() function below. This obviously doesn't work
40652+ * when reiser4 is accessed over NFS, because NFS doesn't keep any state
40653+ * across client READDIR requests for the same directory.
40654+ *
40655+ * To address this we maintain a "pool" of detached reiser4_file_fsdata
40656+ * (d_cursor). Whenever NFS readdir request comes, we detect this, and try to
40657+ * find detached reiser4_file_fsdata corresponding to previous readdir
40658+ * request. In other words, additional state is maintained on the
40659+ * server. (This is somewhat contrary to the design goals of NFS protocol.)
40660+ *
40661+ * To efficiently detect when our ->readdir() method is called by NFS server,
40662+ * dentry is marked as "stateless" in reiser4_decode_fh() (this is checked by
40663+ * file_is_stateless() function).
40664+ *
40665+ * To find out d_cursor in the pool, we encode client id (cid) in the highest
40666+ * bits of NFS readdir cookie: when first readdir request comes to the given
40667+ * directory from the given client, cookie is set to 0. This situation is
40668+ * detected, global cid_counter is incremented, and stored in highest bits of
40669+ * all direntry offsets returned to the client, including last one. As the
40670+ * only valid readdir cookie is one obtained as direntry->offset, we are
40671+ * guaranteed that next readdir request (continuing current one) will have
40672+ * current cid in the highest bits of starting readdir cookie. All d_cursors
40673+ * are hashed into per-super-block hash table by (oid, cid) key.
40674+ *
40675+ * In addition d_cursors are placed into per-super-block radix tree where they
40676+ * are keyed by oid alone. This is necessary to efficiently remove them during
40677+ * rmdir.
40678+ *
40679+ * Finally, currently unused d_cursors are linked into a special list, which
40680+ * is used by d_cursor_shrink to reclaim d_cursors under memory pressure.
40681+ *
40682+ */
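/* A userspace sketch of packing a client id into the high bits of a readdir
 * cookie as described above. The 16-bit field width is an assumption for
 * illustration; the real layout lives in reiser4's d_cursor code. */
#include <stdint.h>
#include <stdio.h>

#define CID_BITS 16	/* assumed width of the client id field */
#define ENTRY_MASK ((UINT64_C(1) << (64 - CID_BITS)) - 1)

static uint64_t make_cookie(uint16_t cid, uint64_t entry_no)
{
	return ((uint64_t)cid << (64 - CID_BITS)) | (entry_no & ENTRY_MASK);
}

int main(void)
{
	uint64_t cookie = make_cookie(7, 12345);

	printf("cid=%u entry=%llu\n",
	       (unsigned)(cookie >> (64 - CID_BITS)),
	       (unsigned long long)(cookie & ENTRY_MASK));
	return 0;
}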
40683+
40684+
40685+/*
40686+ * prepare for readdir.
40687+ */
40688+static int dir_readdir_init(struct file *f, tap_t * tap, readdir_pos ** pos)
40689+{
40690+ struct inode *inode;
40691+ reiser4_file_fsdata *fsdata;
40692+ int result;
40693+
40694+ assert("nikita-1359", f != NULL);
40695+ inode = f->f_dentry->d_inode;
40696+ assert("nikita-1360", inode != NULL);
40697+
40698+ if (!S_ISDIR(inode->i_mode))
40699+ return RETERR(-ENOTDIR);
40700+
40701+ /* try to find detached readdir state */
40702+ result = try_to_attach_fsdata(f, inode);
40703+ if (result != 0)
40704+ return result;
40705+
40706+ fsdata = reiser4_get_file_fsdata(f);
40707+ assert("nikita-2571", fsdata != NULL);
40708+ if (IS_ERR(fsdata))
40709+ return PTR_ERR(fsdata);
40710+
40711+ /* add file descriptor to the readdir list hanging off the directory
40712+ * inode. This list is used to scan "readdirs-in-progress" while
40713+ * inserting or removing names in the directory. */
40714+ spin_lock_inode(inode);
40715+ if (list_empty_careful(&fsdata->dir.linkage))
40716+ list_add(&fsdata->dir.linkage, get_readdir_list(inode));
40717+ *pos = &fsdata->dir.readdir;
40718+ spin_unlock_inode(inode);
40719+
40720+ /* move @tap to the current position */
40721+ return dir_rewind(f, *pos, tap);
40722+}
40723+
40724+/* this is implementation of vfs's llseek method of struct file_operations for
40725+ typical directory
40726+ See comment before readdir_common() for explanation.
40727+*/
40728+loff_t llseek_common_dir(struct file * file, loff_t off, int origin)
40729+{
40730+ reiser4_context *ctx;
40731+ loff_t result;
40732+ struct inode *inode;
40733+
40734+ inode = file->f_dentry->d_inode;
40735+
40736+ ctx = init_context(inode->i_sb);
40737+ if (IS_ERR(ctx))
40738+ return PTR_ERR(ctx);
40739+
40740+ mutex_lock(&inode->i_mutex);
40741+
40742+ /* update ->f_pos */
40743+ result = default_llseek(file, off, origin);
40744+ if (result >= 0) {
40745+ int ff;
40746+ coord_t coord;
40747+ lock_handle lh;
40748+ tap_t tap;
40749+ readdir_pos *pos;
40750+
40751+ coord_init_zero(&coord);
40752+ init_lh(&lh);
40753+ tap_init(&tap, &coord, &lh, ZNODE_READ_LOCK);
40754+
40755+ ff = dir_readdir_init(file, &tap, &pos);
40756+ detach_fsdata(file);
40757+ if (ff != 0)
40758+ result = (loff_t) ff;
40759+ tap_done(&tap);
40760+ }
40761+ detach_fsdata(file);
40762+ mutex_unlock(&inode->i_mutex);
40763+
40764+ reiser4_exit_context(ctx);
40765+ return result;
40766+}
40767+
40768+/* this is common implementation of vfs's readdir method of struct
40769+ file_operations
40770+
40771+ readdir problems:
40772+
40773+ readdir(2)/getdents(2) interface is based on implicit assumption that
40774+ readdir can be restarted from any particular point by supplying file system
40775+ with off_t-full of data. That is, file system fills ->d_off field in struct
40776+ dirent and later user passes ->d_off to the seekdir(3), which is, actually,
40777+ implemented by glibc as lseek(2) on directory.
40778+
40779+ Reiser4 cannot restart readdir from 64 bits of data, because the two last
40780+ components of the key of a directory entry are unknown, which would require
40781+ 128 bits: the locality and type fields in the key of a directory entry are
40782+ always known, but to start readdir() from a given point the objectid and
40783+ offset fields have to be filled in as well.
40784+
40785+   The traditional UNIX API for scanning through a directory
40786+   (readdir/seekdir/telldir/opendir/closedir/rewinddir/getdents) is based on
40787+   the assumption that a directory is structured very much like a regular
40788+   file. In particular, it is implied that each name within a given directory
40789+   (directory entry) can be uniquely identified by a scalar offset and that
40790+   such an offset is stable across the life-time of the name it identifies.
40791+
40792+   This is manifestly not so for reiser4. In reiser4 the only stable unique
40793+   identifier of a directory entry is its key, which doesn't fit into the
40794+   seekdir/telldir API.
40795+
40796+ solution:
40797+
40798+   Within each file descriptor participating in readdir-ing of a directory a
40799+   plugin/dir/dir.h:readdir_pos is maintained. This structure keeps track of
40800+   the "current" directory entry that the file descriptor looks at. It
40801+   contains the key of the directory entry (plus some additional info to deal
40802+   with non-unique keys, which we won't dwell on here) and the logical
40803+   position of this directory entry counted from the beginning of the
40804+   directory, that is, the ordinal number of this entry in the readdir order.
40805+
40806+   Obviously this logical position is not stable in the face of directory
40807+   modifications. To work around this, on each addition or removal of a
40808+   directory entry all file descriptors opened on the directory inode are
40809+   scanned and their readdir_pos updated accordingly (adjust_dir_pos()).
40810+*/
40811+int readdir_common(struct file *f /* directory file being read */ ,
40812+ void *dirent /* opaque data passed to us by VFS */ ,
40813+ filldir_t filld /* filler function passed to us by VFS */ )
40814+{
40815+ reiser4_context *ctx;
40816+ int result;
40817+ struct inode *inode;
40818+ coord_t coord;
40819+ lock_handle lh;
40820+ tap_t tap;
40821+ readdir_pos *pos;
40822+
40823+ assert("nikita-1359", f != NULL);
40824+ inode = f->f_dentry->d_inode;
40825+ assert("nikita-1360", inode != NULL);
40826+
40827+ if (!S_ISDIR(inode->i_mode))
40828+ return RETERR(-ENOTDIR);
40829+
40830+ ctx = init_context(inode->i_sb);
40831+ if (IS_ERR(ctx))
40832+ return PTR_ERR(ctx);
40833+
40834+ coord_init_zero(&coord);
40835+ init_lh(&lh);
40836+ tap_init(&tap, &coord, &lh, ZNODE_READ_LOCK);
40837+
40838+ reiser4_readdir_readahead_init(inode, &tap);
40839+
40840+ repeat:
40841+ result = dir_readdir_init(f, &tap, &pos);
40842+ if (result == 0) {
40843+ result = tap_load(&tap);
40844+ /* scan entries one by one feeding them to @filld */
40845+ while (result == 0) {
40846+ coord_t *coord;
40847+
40848+ coord = tap.coord;
40849+ assert("nikita-2572", coord_is_existing_unit(coord));
40850+ assert("nikita-3227", is_valid_dir_coord(inode, coord));
40851+
40852+ result = feed_entry(f, pos, &tap, filld, dirent);
40853+ if (result > 0) {
40854+ break;
40855+ } else if (result == 0) {
40856+ ++f->f_pos;
40857+ result = go_next_unit(&tap);
40858+ if (result == -E_NO_NEIGHBOR ||
40859+ result == -ENOENT) {
40860+ result = 0;
40861+ break;
40862+ } else if (result == 0) {
40863+ if (is_valid_dir_coord(inode, coord))
40864+ move_entry(pos, coord);
40865+ else
40866+ break;
40867+ }
40868+ } else if (result == -E_REPEAT) {
40869+ /* feed_entry() had to restart. */
40870+ ++f->f_pos;
40871+ tap_relse(&tap);
40872+ goto repeat;
40873+ } else
40874+ warning("vs-1617",
40875+ "readdir_common: unexpected error %d",
40876+ result);
40877+ }
40878+ tap_relse(&tap);
40879+
40880+ if (result >= 0)
40881+ f->f_version = inode->i_version;
40882+ } else if (result == -E_NO_NEIGHBOR || result == -ENOENT)
40883+ result = 0;
40884+ tap_done(&tap);
40885+ detach_fsdata(f);
40886+
40887+ /* try to update directory's atime */
40888+ if (reiser4_grab_space(inode_file_plugin(inode)->estimate.update(inode),
40889+ BA_CAN_COMMIT) != 0)
40890+ warning("", "failed to update atime on readdir: %llu",
40891+ get_inode_oid(inode));
40892+ else
40893+ file_accessed(f);
40894+
40895+ context_set_commit_async(ctx);
40896+ reiser4_exit_context(ctx);
40897+
40898+ return (result <= 0) ? result : 0;
40899+}
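/* Editorial sketch (not part of this patch): the position-adjustment
 * scheme described above, reduced to a minimal model. The struct and
 * function below are hypothetical; the real adjust_dir_pos() also has
 * to compare entry keys and cope with non-unique keys.
 */
struct sketch_readdir_pos {
	loff_t fpos;			/* ordinal number of "current" entry */
	struct list_head linkage;	/* readdir list hanging off the inode */
};

/* after an entry is inserted at (@delta == +1) or removed from
 * (@delta == -1) ordinal position @pos, shift every cursor that points
 * at or past @pos so it keeps naming the same entry */
static void sketch_adjust_dir_pos(struct list_head *readdir_list,
				  loff_t pos, int delta)
{
	struct sketch_readdir_pos *scan;

	list_for_each_entry(scan, readdir_list, linkage)
		if (scan->fpos >= pos)
			scan->fpos += delta;
}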
40900+
40901+/*
40902+ * Local variables:
40903+ * c-indentation-style: "K&R"
40904+ * mode-name: "LC"
40905+ * c-basic-offset: 8
40906+ * tab-width: 8
40907+ * fill-column: 79
40908+ * End:
40909+ */
40910Index: linux-2.6.16/fs/reiser4/plugin/file_plugin_common.c
40911===================================================================
40912--- /dev/null
40913+++ linux-2.6.16/fs/reiser4/plugin/file_plugin_common.c
40914@@ -0,0 +1,929 @@
40915+/* Copyright 2005 by Hans Reiser, licensing governed by
40916+ reiser4/README */
40917+
40918+/* this file contains typical implementations for most of methods of
40919+ file plugin
40920+*/
40921+
40922+#include "../inode.h"
40923+#include "object.h"
40924+#include "../safe_link.h"
40925+
40926+#include <linux/quotaops.h>
40927+
40928+static int insert_new_sd(struct inode *inode);
40929+static int update_sd(struct inode *inode);
40930+
40931+/* this is common implementation of write_sd_by_inode method of file plugin
40932+ either insert stat data or update it
40933+ */
40934+int write_sd_by_inode_common(struct inode *inode /* object to save */ )
40935+{
40936+ int result;
40937+
40938+ assert("nikita-730", inode != NULL);
40939+
40940+ if (inode_get_flag(inode, REISER4_NO_SD))
40941+ /* object doesn't have stat-data yet */
40942+ result = insert_new_sd(inode);
40943+ else
40944+ result = update_sd(inode);
40945+ if (result != 0 && result != -ENAMETOOLONG && result != -ENOMEM)
40946+ /* Don't issue warnings about "name is too long" */
40947+ warning("nikita-2221", "Failed to save sd for %llu: %i",
40948+ (unsigned long long)get_inode_oid(inode), result);
40949+ return result;
40950+}
40951+
40952+/* this is common implementation of key_by_inode method of file plugin
40953+ */
40954+int
40955+key_by_inode_and_offset_common(struct inode *inode, loff_t off,
40956+ reiser4_key * key)
40957+{
40958+ reiser4_key_init(key);
40959+ set_key_locality(key, reiser4_inode_data(inode)->locality_id);
40960+ set_key_ordering(key, get_inode_ordering(inode));
40961+ set_key_objectid(key, get_inode_oid(inode)); /*FIXME: inode->i_ino */
40962+ set_key_type(key, KEY_BODY_MINOR);
40963+ set_key_offset(key, (__u64) off);
40964+ return 0;
40965+}
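/* Editorial illustration: for a file with locality L, ordering O and
 * objectid X, the key built above addresses the file body at byte @off:
 *
 *	(locality = L, ordering = O, objectid = X,
 *	 type = KEY_BODY_MINOR, offset = off)
 *
 * so, e.g., the data at file offset 8192 is found by a lookup with the
 * offset field set to 8192; only the offset varies along one file body.
 */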
40966+
40967+/* this is common implementation of set_plug_in_inode method of file plugin
40968+ */
40969+int set_plug_in_inode_common(struct inode *object /* inode to set plugin on */ ,
40970+ struct inode *parent /* parent object */ ,
40971+ reiser4_object_create_data * data /* creational
40972+ * data */ )
40973+{
40974+ __u64 mask;
40975+
40976+ object->i_mode = data->mode;
40977+ /* this should be plugin decision */
40978+ object->i_uid = current->fsuid;
40979+ object->i_mtime = object->i_atime = object->i_ctime = CURRENT_TIME;
40980+
40981+ /* support for BSD style group-id assignment. See mount's manual page
40982+ description of bsdgroups ext2 mount options for more details */
40983+ if (reiser4_is_set(object->i_sb, REISER4_BSD_GID))
40984+ object->i_gid = parent->i_gid;
40985+ else if (parent->i_mode & S_ISGID) {
40986+ /* parent directory has sguid bit */
40987+ object->i_gid = parent->i_gid;
40988+ if (S_ISDIR(object->i_mode))
40989+ /* sguid is inherited by sub-directories */
40990+ object->i_mode |= S_ISGID;
40991+ } else
40992+ object->i_gid = current->fsgid;
40993+
40994+ /* this object doesn't have stat-data yet */
40995+ inode_set_flag(object, REISER4_NO_SD);
40996+#if 0
40997+ /* this is now called after all inode plugins are initialized:
40998+ do_create_vfs_child after adjust_to_parent */
40999+ /* setup inode and file-operations for this inode */
41000+ setup_inode_ops(object, data);
41001+#endif
41002+ object->i_nlink = 0;
41003+ seal_init(&reiser4_inode_data(object)->sd_seal, NULL, NULL);
41004+ mask = (1 << UNIX_STAT) | (1 << LIGHT_WEIGHT_STAT);
41005+ if (!reiser4_is_set(object->i_sb, REISER4_32_BIT_TIMES))
41006+ mask |= (1 << LARGE_TIMES_STAT);
41007+
41008+ reiser4_inode_data(object)->extmask = mask;
41009+ return 0;
41010+}
41011+
41012+/* this is common implementation of adjust_to_parent method of file plugin for
41013+ regular files
41014+ */
41015+int adjust_to_parent_common(struct inode *object /* new object */ ,
41016+ struct inode *parent /* parent directory */ ,
41017+ struct inode *root /* root directory */ )
41018+{
41019+ assert("nikita-2165", object != NULL);
41020+ if (parent == NULL)
41021+ parent = root;
41022+ assert("nikita-2069", parent != NULL);
41023+
41024+ /*
41025+ * inherit missing plugins from parent
41026+ */
41027+
41028+ grab_plugin(object, parent, PSET_FILE);
41029+ grab_plugin(object, parent, PSET_SD);
41030+ grab_plugin(object, parent, PSET_FORMATTING);
41031+ grab_plugin(object, parent, PSET_PERM);
41032+ return 0;
41033+}
41034+
41035+/* this is common implementation of adjust_to_parent method of file plugin for
41036+ typical directories
41037+ */
41038+int adjust_to_parent_common_dir(struct inode *object /* new object */ ,
41039+ struct inode *parent /* parent directory */ ,
41040+ struct inode *root /* root directory */ )
41041+{
41042+ int result = 0;
41043+ pset_member memb;
41044+
41045+ assert("nikita-2166", object != NULL);
41046+ if (parent == NULL)
41047+ parent = root;
41048+ assert("nikita-2167", parent != NULL);
41049+
41050+ /*
41051+ * inherit missing plugins from parent
41052+ */
41053+ for (memb = 0; memb < PSET_LAST; ++memb) {
41054+ result = grab_plugin(object, parent, memb);
41055+ if (result != 0)
41056+ break;
41057+ }
41058+ return result;
41059+}
41060+
41061+int adjust_to_parent_cryptcompress(struct inode *object /* new object */ ,
41062+ struct inode *parent /* parent directory */,
41063+ struct inode *root /* root directory */)
41064+{
41065+ int result;
41066+ result = adjust_to_parent_common(object, parent, root);
41067+ if (result)
41068+ return result;
41069+ assert("edward-1416", parent != NULL);
41070+
41071+ grab_plugin(object, parent, PSET_CLUSTER);
41072+ grab_plugin(object, parent, PSET_CIPHER);
41073+ grab_plugin(object, parent, PSET_DIGEST);
41074+ grab_plugin(object, parent, PSET_COMPRESSION);
41075+ grab_plugin(object, parent, PSET_COMPRESSION_MODE);
41076+
41077+ return 0;
41078+}
41079+
41080+/* this is common implementation of create_object method of file plugin
41081+ */
41082+int
41083+create_object_common(struct inode *object, struct inode *parent UNUSED_ARG,
41084+ reiser4_object_create_data * data UNUSED_ARG)
41085+{
41086+ reiser4_block_nr reserve;
41087+ assert("nikita-744", object != NULL);
41088+ assert("nikita-745", parent != NULL);
41089+ assert("nikita-747", data != NULL);
41090+ assert("nikita-748", inode_get_flag(object, REISER4_NO_SD));
41091+
41092+ reserve = estimate_create_common(object);
41093+ if (reiser4_grab_space(reserve, BA_CAN_COMMIT))
41094+ return RETERR(-ENOSPC);
41095+ return write_sd_by_inode_common(object);
41096+}
41097+
41098+static int common_object_delete_no_reserve(struct inode *inode);
41099+
41100+/**
41101+ * delete_object_common - delete_object of file_plugin
41102+ * @inode: inode to be deleted
41103+ *
41104+ * This is common implementation of delete_object method of file_plugin. It
41105+ * applies to objects whose deletion consists of removing two items - stat
41106+ * data and safe-link.
41107+ */
41108+int delete_object_common(struct inode *inode)
41109+{
41110+ int result;
41111+
41112+ assert("nikita-1477", inode != NULL);
41113+ /* FIXME: if file body deletion failed (i/o error, for instance),
41114+ inode->i_size can be != 0 here */
41115+ assert("nikita-3420", inode->i_size == 0 || S_ISLNK(inode->i_mode));
41116+ assert("nikita-3421", inode->i_nlink == 0);
41117+
41118+
41119+ if (!inode_get_flag(inode, REISER4_NO_SD)) {
41120+ reiser4_block_nr reserve;
41121+
41122+ /* grab space which is needed to remove 2 items from the tree:
41123+ stat data and safe-link */
41124+ reserve = 2 * estimate_one_item_removal(tree_by_inode(inode));
41125+ if (reiser4_grab_space_force(reserve,
41126+ BA_RESERVED | BA_CAN_COMMIT))
41127+ return RETERR(-ENOSPC);
41128+ result = common_object_delete_no_reserve(inode);
41129+ } else
41130+ result = 0;
41131+ return result;
41132+}
41133+
41134+/**
41135+ * delete_directory_common - delete_object of file_plugin
41136+ * @inode: inode to be deleted
41137+ *
41138+ * This is common implementation of delete_object method of file_plugin for
41139+ * typical directory. It calls done method of dir_plugin to remove "." and
41140+ * removes stat data and safe-link.
41141+ */
41142+int delete_directory_common(struct inode *inode)
41143+{
41144+ int result;
41145+ dir_plugin *dplug;
41146+
41147+ assert("", (get_current_context() &&
41148+ get_current_context()->trans->atom == NULL));
41149+
41150+ dplug = inode_dir_plugin(inode);
41151+ assert("vs-1101", dplug && dplug->done);
41152+
41153+ /* kill cursors which might be attached to inode */
41154+ kill_cursors(inode);
41155+
41156+ /* grab space enough for removing two items */
41157+ if (reiser4_grab_space
41158+ (2 * estimate_one_item_removal(tree_by_inode(inode)),
41159+ BA_RESERVED | BA_CAN_COMMIT))
41160+ return RETERR(-ENOSPC);
41161+
41162+ result = dplug->done(inode);
41163+ if (!result)
41164+ result = common_object_delete_no_reserve(inode);
41165+ return result;
41166+}
41167+
41168+/* this is common implementation of add_link method of file plugin
41169+ */
41170+int add_link_common(struct inode *object, struct inode *parent UNUSED_ARG)
41171+{
41172+ /*
41173+ * increment ->i_nlink and update ->i_ctime
41174+ */
41175+
41176+ INODE_INC_FIELD(object, i_nlink);
41177+ object->i_ctime = CURRENT_TIME;
41178+ return 0;
41179+}
41180+
41181+/* this is common implementation of rem_link method of file plugin
41182+ */
41183+int rem_link_common(struct inode *object, struct inode *parent UNUSED_ARG)
41184+{
41185+ assert("nikita-2021", object != NULL);
41186+ assert("nikita-2163", object->i_nlink > 0);
41187+
41188+ /*
41189+ * decrement ->i_nlink and update ->i_ctime
41190+ */
41191+
41192+ INODE_DEC_FIELD(object, i_nlink);
41193+ object->i_ctime = CURRENT_TIME;
41194+ return 0;
41195+}
41196+
41197+/* this is common implementation of rem_link method of file plugin for typical
41198+ directory
41199+*/
41200+int rem_link_common_dir(struct inode *object, struct inode *parent UNUSED_ARG)
41201+{
41202+ assert("nikita-20211", object != NULL);
41203+ assert("nikita-21631", object->i_nlink > 0);
41204+
41205+ /*
41206+ * decrement ->i_nlink and update ->i_ctime
41207+ */
41208+ INODE_DEC_FIELD(object, i_nlink);
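	/* the extra decrement below accounts for the "." self-reference:
	 * once the last name of the directory is gone, only "." keeps the
	 * count at 1, so push i_nlink all the way down to zero */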
41209+ if (object->i_nlink == 1)
41210+ INODE_DEC_FIELD(object, i_nlink);
41211+ object->i_ctime = CURRENT_TIME;
41212+ return 0;
41213+}
41214+
41215+/* this is common implementation of owns_item method of file plugin:
41216+   compare objectids of keys in inode and coord */
41217+int owns_item_common(const struct inode *inode, /* object to check
41218+ * against */
41219+ const coord_t * coord /* coord to check */ )
41220+{
41221+ reiser4_key item_key;
41222+ reiser4_key file_key;
41223+
41224+ assert("nikita-760", inode != NULL);
41225+ assert("nikita-761", coord != NULL);
41226+
41227+ return coord_is_existing_item(coord) &&
41228+ (get_key_objectid(build_sd_key(inode, &file_key)) ==
41229+ get_key_objectid(item_key_by_coord(coord, &item_key)));
41230+}
41231+
41232+/* this is common implementation of owns_item method of file plugin
41233+ for typical directory
41234+*/
41235+int owns_item_common_dir(const struct inode *inode, /* object to check against */
41236+ const coord_t * coord /* coord of item to check */ )
41237+{
41238+ reiser4_key item_key;
41239+
41240+ assert("nikita-1335", inode != NULL);
41241+ assert("nikita-1334", coord != NULL);
41242+
41243+ if (item_type_by_coord(coord) == DIR_ENTRY_ITEM_TYPE)
41244+ return get_key_locality(item_key_by_coord(coord, &item_key)) ==
41245+ get_inode_oid(inode);
41246+ else
41247+ return owns_item_common(inode, coord);
41248+}
41249+
41250+/* this is common implementation of can_add_link method of file plugin
41251+ checks whether yet another hard links to this object can be added
41252+*/
41253+int can_add_link_common(const struct inode *object /* object to check */ )
41254+{
41255+ assert("nikita-732", object != NULL);
41256+
41257+ /* inode->i_nlink is unsigned int, so just check for integer
41258+ overflow */
41259+ return object->i_nlink + 1 != 0;
41260+}
41261+
41262+/* this is common implementation of can_rem_link method of file plugin for
41263+ typical directory
41264+*/
41265+int can_rem_link_common_dir(const struct inode *inode)
41266+{
41267+	/* is_dir_empty() returns 0 if the dir is empty */
41268+ return !is_dir_empty(inode);
41269+}
41270+
41271+/* this is common implementation of detach method of file plugin for typical
41272+ directory
41273+*/
41274+int detach_common_dir(struct inode *child, struct inode *parent)
41275+{
41276+ dir_plugin *dplug;
41277+
41278+ dplug = inode_dir_plugin(child);
41279+ assert("nikita-2883", dplug != NULL);
41280+ assert("nikita-2884", dplug->detach != NULL);
41281+ return dplug->detach(child, parent);
41282+}
41283+
41284+/* this is common implementation of bind method of file plugin for typical
41285+ directory
41286+*/
41287+int bind_common_dir(struct inode *child, struct inode *parent)
41288+{
41289+ dir_plugin *dplug;
41290+
41291+ dplug = inode_dir_plugin(child);
41292+ assert("nikita-2646", dplug != NULL);
41293+ return dplug->attach(child, parent);
41294+}
41295+
41296+static int process_truncate(struct inode *, __u64 size);
41297+
41298+/* this is common implementation of safelink method of file plugin
41299+ */
41300+int safelink_common(struct inode *object, reiser4_safe_link_t link, __u64 value)
41301+{
41302+ int result;
41303+
41304+ assert("vs-1705", get_current_context()->trans->atom == NULL);
41305+ if (link == SAFE_UNLINK)
41306+ /* nothing to do. iput() in the caller (process_safelink) will
41307+ * finish with file */
41308+ result = 0;
41309+ else if (link == SAFE_TRUNCATE)
41310+ result = process_truncate(object, value);
41311+ else {
41312+ warning("nikita-3438", "Unrecognized safe-link type: %i", link);
41313+ result = RETERR(-EIO);
41314+ }
41315+ return result;
41316+}
41317+
41318+/* this is common implementation of estimate.create method of file plugin;
41319+   can be used when object creation involves insertion of one item (usually
41320+   stat data) into the tree
41321+*/
41322+reiser4_block_nr estimate_create_common(const struct inode * object)
41323+{
41324+ return estimate_one_insert_item(tree_by_inode(object));
41325+}
41326+
41327+/* this is common implementation of estimate.create method of file plugin for
41328+   typical directory;
41329+   can be used when directory creation involves insertion of two items (usually
41330+   stat data and an item containing "." and "..") into the tree
41331+*/
41332+reiser4_block_nr estimate_create_common_dir(const struct inode * object)
41333+{
41334+ return 2 * estimate_one_insert_item(tree_by_inode(object));
41335+}
41336+
41337+/* this is common implementation of estimate.update method of file plugin;
41338+   can be used when a stat data update does no more than insert a unit
41339+   into a stat data item, which is probably true in most cases
41340+*/
41341+reiser4_block_nr estimate_update_common(const struct inode * inode)
41342+{
41343+ return estimate_one_insert_into_item(tree_by_inode(inode));
41344+}
41345+
41346+/* this is common implementation of estimate.unlink method of file plugin
41347+ */
41348+reiser4_block_nr
41349+estimate_unlink_common(const struct inode * object UNUSED_ARG,
41350+ const struct inode * parent UNUSED_ARG)
41351+{
41352+ return 0;
41353+}
41354+
41355+/* this is common implementation of estimate.unlink method of file plugin for
41356+ typical directory
41357+*/
41358+reiser4_block_nr
41359+estimate_unlink_common_dir(const struct inode * object,
41360+ const struct inode * parent)
41361+{
41362+ dir_plugin *dplug;
41363+
41364+ dplug = inode_dir_plugin(object);
41365+ assert("nikita-2888", dplug != NULL);
41366+ assert("nikita-2887", dplug->estimate.unlink != NULL);
41367+ return dplug->estimate.unlink(object, parent);
41368+}
41369+
41370+char *wire_write_common(struct inode *inode, char *start)
41371+{
41372+ return build_inode_onwire(inode, start);
41373+}
41374+
41375+char *wire_read_common(char *addr, reiser4_object_on_wire * obj)
41376+{
41377+ return extract_obj_key_id_from_onwire(addr, &obj->u.std.key_id);
41378+}
41379+
41380+struct dentry *wire_get_common(struct super_block *sb,
41381+ reiser4_object_on_wire * obj)
41382+{
41383+ struct inode *inode;
41384+ struct dentry *dentry;
41385+ reiser4_key key;
41386+
41387+ extract_key_from_id(&obj->u.std.key_id, &key);
41388+ inode = reiser4_iget(sb, &key, 1);
41389+ if (!IS_ERR(inode)) {
41390+ reiser4_iget_complete(inode);
41391+ dentry = d_alloc_anon(inode);
41392+ if (dentry == NULL) {
41393+ iput(inode);
41394+ dentry = ERR_PTR(-ENOMEM);
41395+ } else
41396+ dentry->d_op = &get_super_private(sb)->ops.dentry;
41397+ } else if (PTR_ERR(inode) == -ENOENT)
41398+		/*
41399+		 * the inode wasn't found at the key encoded in the file
41400+		 * handle. Hence, the file handle is stale.
41401+		 */
41402+ dentry = ERR_PTR(RETERR(-ESTALE));
41403+ else
41404+ dentry = (void *)inode;
41405+ return dentry;
41406+}
41407+
41408+int wire_size_common(struct inode *inode)
41409+{
41410+ return inode_onwire_size(inode);
41411+}
41412+
41413+void wire_done_common(reiser4_object_on_wire * obj)
41414+{
41415+ /* nothing to do */
41416+}
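/* Editorial sketch of how the wire helpers above compose into an
 * NFS-style file handle round-trip. The function is hypothetical and not
 * part of this patch; real callers live in the export-ops glue and get
 * their handle buffer from knfsd rather than the stack.
 */
static int sketch_wire_roundtrip(struct super_block *sb, struct inode *inode)
{
	char buf[64];
	reiser4_object_on_wire obj;
	struct dentry *dentry;

	if (wire_size_common(inode) > (int)sizeof(buf))
		return RETERR(-ENOSPC);		/* handle wouldn't fit */
	wire_write_common(inode, buf);		/* encode key id into buf */
	wire_read_common(buf, &obj);		/* decode it back */
	dentry = wire_get_common(sb, &obj);	/* look inode up by key */
	wire_done_common(&obj);
	if (IS_ERR(dentry))
		return PTR_ERR(dentry);		/* e.g. -ESTALE */
	dput(dentry);
	return 0;
}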
41417+
41418+/* helper function to print errors */
41419+static void key_warning(const reiser4_key * key /* key to print */ ,
41420+ const struct inode *inode,
41421+ int code /* error code to print */ )
41422+{
41423+ assert("nikita-716", key != NULL);
41424+
41425+ if (code != -ENOMEM) {
41426+ warning("nikita-717", "Error for inode %llu (%i)",
41427+ (unsigned long long)get_key_objectid(key), code);
41428+ print_key("for key", key);
41429+ }
41430+}
41431+
41432+/* NIKITA-FIXME-HANS: perhaps this function belongs in another file? */
41433+#if REISER4_DEBUG
41434+static void
41435+check_inode_seal(const struct inode *inode,
41436+ const coord_t * coord, const reiser4_key * key)
41437+{
41438+ reiser4_key unit_key;
41439+
41440+ unit_key_by_coord(coord, &unit_key);
41441+ assert("nikita-2752",
41442+ WITH_DATA_RET(coord->node, 1, keyeq(key, &unit_key)));
41443+ assert("nikita-2753", get_inode_oid(inode) == get_key_objectid(key));
41444+}
41445+
41446+static void check_sd_coord(coord_t * coord, const reiser4_key * key)
41447+{
41448+ reiser4_key ukey;
41449+
41450+ coord_clear_iplug(coord);
41451+ if (zload(coord->node))
41452+ return;
41453+
41454+ if (!coord_is_existing_unit(coord) ||
41455+ !item_plugin_by_coord(coord) ||
41456+ !keyeq(unit_key_by_coord(coord, &ukey), key) ||
41457+ (znode_get_level(coord->node) != LEAF_LEVEL) ||
41458+ !item_is_statdata(coord)) {
41459+ warning("nikita-1901", "Conspicuous seal");
41460+ print_key("key", key);
41461+ print_coord("coord", coord, 1);
41462+ impossible("nikita-2877", "no way");
41463+ }
41464+ zrelse(coord->node);
41465+}
41466+
41467+#else
41468+#define check_inode_seal(inode, coord, key) noop
41469+#define check_sd_coord(coord, key) noop
41470+#endif
41471+
41472+/* insert new stat-data into tree. Called with inode state
41473+ locked. Return inode state locked. */
41474+static int insert_new_sd(struct inode *inode /* inode to create sd for */ )
41475+{
41476+ int result;
41477+ reiser4_key key;
41478+ coord_t coord;
41479+ reiser4_item_data data;
41480+ char *area;
41481+ reiser4_inode *ref;
41482+ lock_handle lh;
41483+ oid_t oid;
41484+
41485+ assert("nikita-723", inode != NULL);
41486+ assert("nikita-3406", inode_get_flag(inode, REISER4_NO_SD));
41487+
41488+ ref = reiser4_inode_data(inode);
41489+ spin_lock_inode(inode);
41490+
41491+ if (ref->plugin_mask != 0)
41492+ /* inode has non-standard plugins */
41493+ inode_set_extension(inode, PLUGIN_STAT);
41494+ /*
41495+ * prepare specification of new item to be inserted
41496+ */
41497+
41498+ data.iplug = inode_sd_plugin(inode);
41499+ data.length = data.iplug->s.sd.save_len(inode);
41500+ spin_unlock_inode(inode);
41501+
41502+ data.data = NULL;
41503+ data.user = 0;
41504+/* could be optimized for case where there is only one node format in
41505+ * use in the filesystem, probably there are lots of such
41506+ * places we could optimize for only one node layout.... -Hans */
41507+ if (data.length > tree_by_inode(inode)->nplug->max_item_size()) {
41508+	/* This is a silly check, but we don't know the actual node
41509+	   the insertion will go into. */
41510+ return RETERR(-ENAMETOOLONG);
41511+ }
41512+ oid = oid_allocate(inode->i_sb);
41513+/* NIKITA-FIXME-HANS: what is your opinion on whether this error check should be encapsulated into oid_allocate? */
41514+ if (oid == ABSOLUTE_MAX_OID)
41515+ return RETERR(-EOVERFLOW);
41516+
41517+ set_inode_oid(inode, oid);
41518+
41519+ coord_init_zero(&coord);
41520+ init_lh(&lh);
41521+
41522+ result = insert_by_key(tree_by_inode(inode),
41523+ build_sd_key(inode, &key), &data, &coord, &lh,
41524+ /* stat data lives on a leaf level */
41525+ LEAF_LEVEL, CBK_UNIQUE);
41526+
41527+	/* we don't want to re-check that somebody didn't insert
41528+	   stat-data while we were doing io, because if somebody did,
41529+	   insert_by_key() would have returned an error. */
41530+	/* but what _is_ possible is that the plugin for the inode's
41531+	   stat-data, the list of non-standard plugins or their state
41532+	   could change during io, so that the stat-data wouldn't fit
41533+	   into the item anymore. To avoid this race we keep the
41534+	   inode_state lock, which has to be taken each time you access
41535+	   the inode in a way that could change the sd size: changing
41536+	   plugins etc. */
41537+
41538+ if (result == IBK_INSERT_OK) {
41539+ coord_clear_iplug(&coord);
41540+ result = zload(coord.node);
41541+ if (result == 0) {
41542+ /* have we really inserted stat data? */
41543+ assert("nikita-725", item_is_statdata(&coord));
41544+
41545+			/* inode was just created. It is inserted into the hash
41546+			   table, but no directory entry has been inserted into
41547+			   the parent yet. So the inode is inaccessible through
41548+			   ->lookup(). All places that directly grab the inode
41549+			   from the hash-table (like old knfsd) should check the
41550+			   IMMUTABLE flag that is set by common_create_child.
41551+ */
41552+ assert("nikita-3240", data.iplug != NULL);
41553+ assert("nikita-3241", data.iplug->s.sd.save != NULL);
41554+ area = item_body_by_coord(&coord);
41555+ result = data.iplug->s.sd.save(inode, &area);
41556+ znode_make_dirty(coord.node);
41557+ if (result == 0) {
41558+ /* object has stat-data now */
41559+ inode_clr_flag(inode, REISER4_NO_SD);
41560+ inode_set_flag(inode, REISER4_SDLEN_KNOWN);
41561+ /* initialise stat-data seal */
41562+ seal_init(&ref->sd_seal, &coord, &key);
41563+ ref->sd_coord = coord;
41564+ check_inode_seal(inode, &coord, &key);
41565+ } else if (result != -ENOMEM)
41566+ /*
41567+ * convert any other error code to -EIO to
41568+ * avoid confusing user level with unexpected
41569+ * errors.
41570+ */
41571+ result = RETERR(-EIO);
41572+ zrelse(coord.node);
41573+ }
41574+ }
41575+ done_lh(&lh);
41576+
41577+ if (result != 0)
41578+ key_warning(&key, inode, result);
41579+ else
41580+ oid_count_allocated();
41581+
41582+ return result;
41583+}
41584+
41585+/* find sd of inode in a tree, deal with errors */
41586+int lookup_sd(struct inode *inode /* inode to look sd for */ ,
41587+ znode_lock_mode lock_mode /* lock mode */ ,
41588+ coord_t * coord /* resulting coord */ ,
41589+ lock_handle * lh /* resulting lock handle */ ,
41590+ const reiser4_key * key /* resulting key */ ,
41591+ int silent)
41592+{
41593+ int result;
41594+ __u32 flags;
41595+
41596+ assert("nikita-1692", inode != NULL);
41597+ assert("nikita-1693", coord != NULL);
41598+ assert("nikita-1694", key != NULL);
41599+
41600+	/* look for the object's stat data in the tree.
41601+	   This fills @coord with the position of the item found and @lh
41602+	   with a handle to the locked znode holding it; both are only
41603+	   valid if the lookup succeeds. */
41604+ flags = (lock_mode == ZNODE_WRITE_LOCK) ? CBK_FOR_INSERT : 0;
41605+ flags |= CBK_UNIQUE;
41606+ /*
41607+ * traverse tree to find stat data. We cannot use vroot here, because
41608+ * it only covers _body_ of the file, and stat data don't belong
41609+ * there.
41610+ */
41611+ result = coord_by_key(tree_by_inode(inode),
41612+ key,
41613+ coord,
41614+ lh,
41615+ lock_mode,
41616+ FIND_EXACT, LEAF_LEVEL, LEAF_LEVEL, flags, NULL);
41617+ if (REISER4_DEBUG && result == 0)
41618+ check_sd_coord(coord, key);
41619+
41620+ if (result != 0 && !silent)
41621+ key_warning(key, inode, result);
41622+ return result;
41623+}
41624+
41625+static int
41626+locate_inode_sd(struct inode *inode,
41627+ reiser4_key * key, coord_t * coord, lock_handle * lh)
41628+{
41629+ reiser4_inode *state;
41630+ seal_t seal;
41631+ int result;
41632+
41633+ assert("nikita-3483", inode != NULL);
41634+
41635+ state = reiser4_inode_data(inode);
41636+ spin_lock_inode(inode);
41637+ *coord = state->sd_coord;
41638+ coord_clear_iplug(coord);
41639+ seal = state->sd_seal;
41640+ spin_unlock_inode(inode);
41641+
41642+ build_sd_key(inode, key);
41643+ if (seal_is_set(&seal)) {
41644+ /* first, try to use seal */
41645+ result = seal_validate(&seal,
41646+ coord,
41647+ key,
41648+ lh, ZNODE_WRITE_LOCK, ZNODE_LOCK_LOPRI);
41649+ if (result == 0)
41650+ check_sd_coord(coord, key);
41651+ } else
41652+ result = -E_REPEAT;
41653+
41654+ if (result != 0) {
41655+ coord_init_zero(coord);
41656+ result = lookup_sd(inode, ZNODE_WRITE_LOCK, coord, lh, key, 0);
41657+ }
41658+ return result;
41659+}
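/* Editorial note on the seal dance above (names as used in this file):
 *
 *	seal_init(&seal, &coord, &key);		remember coord + node version
 *	...					tree may change meanwhile
 *	if (seal_validate(&seal, &coord, &key, &lh, mode, pri) != 0)
 *		redo the full lookup, e.g. lookup_sd();
 *
 * i.e. a seal is an optimistic cache of a tree position: validation
 * either cheaply re-locks the remembered node or fails, forcing the
 * caller back to a top-down search.
 */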
41660+
41661+/* update stat-data at @coord */
41662+static int
41663+update_sd_at(struct inode *inode, coord_t * coord, reiser4_key * key,
41664+ lock_handle * lh)
41665+{
41666+ int result;
41667+ reiser4_item_data data;
41668+ char *area;
41669+ reiser4_inode *state;
41670+ znode *loaded;
41671+
41672+ state = reiser4_inode_data(inode);
41673+
41674+ coord_clear_iplug(coord);
41675+ result = zload(coord->node);
41676+ if (result != 0)
41677+ return result;
41678+ loaded = coord->node;
41679+
41680+ spin_lock_inode(inode);
41681+ assert("nikita-728", inode_sd_plugin(inode) != NULL);
41682+ data.iplug = inode_sd_plugin(inode);
41683+
41684+ /* if inode has non-standard plugins, add appropriate stat data
41685+ * extension */
41686+ if (state->plugin_mask != 0)
41687+ inode_set_extension(inode, PLUGIN_STAT);
41688+
41689+ /* data.length is how much space to add to (or remove
41690+ from if negative) sd */
41691+ if (!inode_get_flag(inode, REISER4_SDLEN_KNOWN)) {
41692+ /* recalculate stat-data length */
41693+ data.length =
41694+ data.iplug->s.sd.save_len(inode) -
41695+ item_length_by_coord(coord);
41696+ inode_set_flag(inode, REISER4_SDLEN_KNOWN);
41697+ } else
41698+ data.length = 0;
41699+ spin_unlock_inode(inode);
41700+
41701+ /* if on-disk stat data is of different length than required
41702+ for this inode, resize it */
41703+ if (data.length != 0) {
41704+ data.data = NULL;
41705+ data.user = 0;
41706+
41707+		/* insertion code requires that the insertion point (coord) is
41708+		 * between units. */
41709+ coord->between = AFTER_UNIT;
41710+ result = resize_item(coord,
41711+ &data, key, lh, COPI_DONT_SHIFT_LEFT);
41712+ if (result != 0) {
41713+ key_warning(key, inode, result);
41714+ zrelse(loaded);
41715+ return result;
41716+ }
41717+ if (loaded != coord->node) {
41718+ /* resize_item moved coord to another node. Zload it */
41719+ zrelse(loaded);
41720+ coord_clear_iplug(coord);
41721+ result = zload(coord->node);
41722+ if (result != 0)
41723+ return result;
41724+ loaded = coord->node;
41725+ }
41726+ }
41727+
41728+ area = item_body_by_coord(coord);
41729+ spin_lock_inode(inode);
41730+ result = data.iplug->s.sd.save(inode, &area);
41731+ znode_make_dirty(coord->node);
41732+
41733+ /* re-initialise stat-data seal */
41734+
41735+ /*
41736+ * coord.between was possibly skewed from AT_UNIT when stat-data size
41737+ * was changed and new extensions were pasted into item.
41738+ */
41739+ coord->between = AT_UNIT;
41740+ seal_init(&state->sd_seal, coord, key);
41741+ state->sd_coord = *coord;
41742+ spin_unlock_inode(inode);
41743+ check_inode_seal(inode, coord, key);
41744+ zrelse(loaded);
41745+ return result;
41746+}
41747+
41748+/* Update existing stat-data in a tree. Called with inode state locked. Return
41749+ inode state locked. */
41750+static int update_sd(struct inode *inode /* inode to update sd for */ )
41751+{
41752+ int result;
41753+ reiser4_key key;
41754+ coord_t coord;
41755+ lock_handle lh;
41756+
41757+ assert("nikita-726", inode != NULL);
41758+
41759+ /* no stat-data, nothing to update?! */
41760+ assert("nikita-3482", !inode_get_flag(inode, REISER4_NO_SD));
41761+
41762+ init_lh(&lh);
41763+
41764+ result = locate_inode_sd(inode, &key, &coord, &lh);
41765+ if (result == 0)
41766+ result = update_sd_at(inode, &coord, &key, &lh);
41767+ done_lh(&lh);
41768+
41769+ return result;
41770+}
41771+
41772+/* helper for delete_object_common and delete_directory_common. Remove object
41773+   stat data. Space for that must be reserved by the caller beforehand.
41774+*/
41775+static int
41776+common_object_delete_no_reserve(struct inode *inode /* object to remove */ )
41777+{
41778+ int result;
41779+
41780+ assert("nikita-1477", inode != NULL);
41781+
41782+ if (!inode_get_flag(inode, REISER4_NO_SD)) {
41783+ reiser4_key sd_key;
41784+
41785+ DQUOT_FREE_INODE(inode);
41786+ DQUOT_DROP(inode);
41787+
41788+ build_sd_key(inode, &sd_key);
41789+ result =
41790+ cut_tree(tree_by_inode(inode), &sd_key, &sd_key, NULL, 0);
41791+ if (result == 0) {
41792+ inode_set_flag(inode, REISER4_NO_SD);
41793+ result = oid_release(inode->i_sb, get_inode_oid(inode));
41794+ if (result == 0) {
41795+ oid_count_released();
41796+
41797+ result = safe_link_del(tree_by_inode(inode),
41798+ get_inode_oid(inode),
41799+ SAFE_UNLINK);
41800+ }
41801+ }
41802+ } else
41803+ result = 0;
41804+ return result;
41805+}
41806+
41807+/* helper for safelink_common */
41808+static int process_truncate(struct inode *inode, __u64 size)
41809+{
41810+ int result;
41811+ struct iattr attr;
41812+ file_plugin *fplug;
41813+ reiser4_context *ctx;
41814+ struct dentry dentry;
41815+
41816+ assert("vs-21", is_in_reiser4_context());
41817+ ctx = init_context(inode->i_sb);
41818+ assert("vs-22", !IS_ERR(ctx));
41819+
41820+ attr.ia_size = size;
41821+ attr.ia_valid = ATTR_SIZE | ATTR_CTIME;
41822+ fplug = inode_file_plugin(inode);
41823+
41824+ mutex_lock(&inode->i_mutex);
41825+ assert("vs-1704", get_current_context()->trans->atom == NULL);
41826+ dentry.d_inode = inode;
41827+ result = inode->i_op->setattr(&dentry, &attr);
41828+ mutex_unlock(&inode->i_mutex);
41829+
41830+ context_set_commit_async(ctx);
41831+ reiser4_exit_context(ctx);
41832+
41833+ return result;
41834+}
41835+
41836+/* Local variables:
41837+ c-indentation-style: "K&R"
41838+ mode-name: "LC"
41839+ c-basic-offset: 8
41840+ tab-width: 8
41841+ fill-column: 120
41842+ End:
41843+*/
41844Index: linux-2.6.16/fs/reiser4/plugin/hash.c
41845===================================================================
41846--- /dev/null
41847+++ linux-2.6.16/fs/reiser4/plugin/hash.c
41848@@ -0,0 +1,350 @@
41849+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
41850+ * reiser4/README */
41851+
41852+/* Hash functions */
41853+
41854+#include "../debug.h"
41855+#include "plugin_header.h"
41856+#include "plugin.h"
41857+#include "../super.h"
41858+#include "../inode.h"
41859+
41860+#include <linux/types.h>
41861+
41862+/* old rupasov (yura) hash */
41863+static __u64 hash_rupasov(const unsigned char *name /* name to hash */ ,
41864+ int len /* @name's length */ )
41865+{
41866+ int i;
41867+ int j;
41868+ int pow;
41869+ __u64 a;
41870+ __u64 c;
41871+
41872+ assert("nikita-672", name != NULL);
41873+ assert("nikita-673", len >= 0);
41874+
41875+ for (pow = 1, i = 1; i < len; ++i)
41876+ pow = pow * 10;
41877+
41878+ if (len == 1)
41879+ a = name[0] - 48;
41880+ else
41881+ a = (name[0] - 48) * pow;
41882+
41883+ for (i = 1; i < len; ++i) {
41884+ c = name[i] - 48;
41885+ for (pow = 1, j = i; j < len - 1; ++j)
41886+ pow = pow * 10;
41887+ a = a + c * pow;
41888+ }
41889+ for (; i < 40; ++i) {
41890+ c = '0' - 48;
41891+ for (pow = 1, j = i; j < len - 1; ++j)
41892+ pow = pow * 10;
41893+ a = a + c * pow;
41894+ }
41895+
41896+ for (; i < 256; ++i) {
41897+ c = i;
41898+ for (pow = 1, j = i; j < len - 1; ++j)
41899+ pow = pow * 10;
41900+ a = a + c * pow;
41901+ }
41902+
41903+ a = a << 7;
41904+ return a;
41905+}
41906+
41907+/* r5 hash */
41908+static __u64 hash_r5(const unsigned char *name /* name to hash */ ,
41909+ int len UNUSED_ARG /* @name's length */ )
41910+{
41911+ __u64 a = 0;
41912+
41913+ assert("nikita-674", name != NULL);
41914+ assert("nikita-675", len >= 0);
41915+
41916+ while (*name) {
41917+ a += *name << 4;
41918+ a += *name >> 4;
41919+ a *= 11;
41920+ name++;
41921+ }
41922+ return a;
41923+}
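/* Worked example (editorial, computed by hand): hashing the name "foo"
 * with hash_r5:
 *
 *	'f' (102): a = 0 + 102*16 + 102/16 = 1638;	a *= 11 -> 18018
 *	'o' (111): a = 18018 + 1776 + 6 = 19800;	a *= 11 -> 217800
 *	'o' (111): a = 217800 + 1776 + 6 = 219582;	a *= 11 -> 2415402
 *
 * i.e. hash_r5("foo", 3) == 0x24db2a. Note that the loop walks the
 * NUL-terminated name and ignores @len.
 */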
41924+
41925+/* Keyed 32-bit hash function using TEA in a Davis-Meyer function
41926+ H0 = Key
41927+ Hi = E Mi(Hi-1) + Hi-1
41928+
41929+ (see Applied Cryptography, 2nd edition, p448).
41930+
41931+ Jeremy Fitzhardinge <jeremy@zip.com.au> 1998
41932+
41933+ Jeremy has agreed to the contents of reiserfs/README. -Hans
41934+
41935+ This code was blindly upgraded to __u64 by s/__u32/__u64/g.
41936+*/
41937+static __u64 hash_tea(const unsigned char *name /* name to hash */ ,
41938+ int len /* @name's length */ )
41939+{
41940+ __u64 k[] = { 0x9464a485u, 0x542e1a94u, 0x3e846bffu, 0xb75bcfc3u };
41941+
41942+ __u64 h0 = k[0], h1 = k[1];
41943+ __u64 a, b, c, d;
41944+ __u64 pad;
41945+ int i;
41946+
41947+ assert("nikita-676", name != NULL);
41948+ assert("nikita-677", len >= 0);
41949+
41950+#define DELTA 0x9E3779B9u
41951+#define FULLROUNDS 10 /* 32 is overkill, 16 is strong crypto */
41952+#define PARTROUNDS 6 /* 6 gets complete mixing */
41953+
41954+/* a, b, c, d - data; h0, h1 - accumulated hash */
41955+#define TEACORE(rounds) \
41956+ do { \
41957+ __u64 sum = 0; \
41958+ int n = rounds; \
41959+ __u64 b0, b1; \
41960+ \
41961+ b0 = h0; \
41962+ b1 = h1; \
41963+ \
41964+ do \
41965+ { \
41966+ sum += DELTA; \
41967+ b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b); \
41968+ b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d); \
41969+ } while(--n); \
41970+ \
41971+ h0 += b0; \
41972+ h1 += b1; \
41973+ } while(0)
41974+
41975+ pad = (__u64) len | ((__u64) len << 8);
41976+ pad |= pad << 16;
41977+
41978+ while (len >= 16) {
41979+ a = (__u64) name[0] | (__u64) name[1] << 8 | (__u64) name[2] <<
41980+ 16 | (__u64) name[3] << 24;
41981+ b = (__u64) name[4] | (__u64) name[5] << 8 | (__u64) name[6] <<
41982+ 16 | (__u64) name[7] << 24;
41983+ c = (__u64) name[8] | (__u64) name[9] << 8 | (__u64) name[10] <<
41984+ 16 | (__u64) name[11] << 24;
41985+ d = (__u64) name[12] | (__u64) name[13] << 8 | (__u64) name[14]
41986+ << 16 | (__u64) name[15] << 24;
41987+
41988+ TEACORE(PARTROUNDS);
41989+
41990+ len -= 16;
41991+ name += 16;
41992+ }
41993+
41994+ if (len >= 12) {
41995+ //assert(len < 16);
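		/* hand-rolled BUG(), here and in the branches below:
		 * writing through NULL forces an oops if the length
		 * invariant is violated */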
41996+ if (len >= 16)
41997+ *(int *)0 = 0;
41998+
41999+ a = (__u64) name[0] | (__u64) name[1] << 8 | (__u64) name[2] <<
42000+ 16 | (__u64) name[3] << 24;
42001+ b = (__u64) name[4] | (__u64) name[5] << 8 | (__u64) name[6] <<
42002+ 16 | (__u64) name[7] << 24;
42003+ c = (__u64) name[8] | (__u64) name[9] << 8 | (__u64) name[10] <<
42004+ 16 | (__u64) name[11] << 24;
42005+
42006+ d = pad;
42007+ for (i = 12; i < len; i++) {
42008+ d <<= 8;
42009+ d |= name[i];
42010+ }
42011+ } else if (len >= 8) {
42012+ //assert(len < 12);
42013+ if (len >= 12)
42014+ *(int *)0 = 0;
42015+ a = (__u64) name[0] | (__u64) name[1] << 8 | (__u64) name[2] <<
42016+ 16 | (__u64) name[3] << 24;
42017+ b = (__u64) name[4] | (__u64) name[5] << 8 | (__u64) name[6] <<
42018+ 16 | (__u64) name[7] << 24;
42019+
42020+ c = d = pad;
42021+ for (i = 8; i < len; i++) {
42022+ c <<= 8;
42023+ c |= name[i];
42024+ }
42025+ } else if (len >= 4) {
42026+ //assert(len < 8);
42027+ if (len >= 8)
42028+ *(int *)0 = 0;
42029+ a = (__u64) name[0] | (__u64) name[1] << 8 | (__u64) name[2] <<
42030+ 16 | (__u64) name[3] << 24;
42031+
42032+ b = c = d = pad;
42033+ for (i = 4; i < len; i++) {
42034+ b <<= 8;
42035+ b |= name[i];
42036+ }
42037+ } else {
42038+ //assert(len < 4);
42039+ if (len >= 4)
42040+ *(int *)0 = 0;
42041+ a = b = c = d = pad;
42042+ for (i = 0; i < len; i++) {
42043+ a <<= 8;
42044+ a |= name[i];
42045+ }
42046+ }
42047+
42048+ TEACORE(FULLROUNDS);
42049+
42050+/* return 0;*/
42051+ return h0 ^ h1;
42052+
42053+}
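/* Editorial note: the Davies-Meyer feed-forward is the "h0 += b0;
 * h1 += b1" at the end of TEACORE -- the current message block (a,b,c,d)
 * keys the TEA rounds while the running hash (h0,h1) is the "plaintext",
 * which is what makes the construction one-way.
 */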
42054+
42055+/* classical 64 bit Fowler/Noll/Vo-1 (FNV-1) hash.
42056+
42057+ See http://www.isthe.com/chongo/tech/comp/fnv/ for details.
42058+
42059+ Excerpts:
42060+
42061+ FNV hashes are designed to be fast while maintaining a low collision
42062+ rate.
42063+
42064+ [This version also seems to preserve lexicographical order locally.]
42065+
42066+ FNV hash algorithms and source code have been released into the public
42067+ domain.
42068+
42069+*/
42070+static __u64 hash_fnv1(const unsigned char *name /* name to hash */ ,
42071+ int len UNUSED_ARG /* @name's length */ )
42072+{
42073+ unsigned long long a = 0xcbf29ce484222325ull;
42074+ const unsigned long long fnv_64_prime = 0x100000001b3ull;
42075+
42076+ assert("nikita-678", name != NULL);
42077+ assert("nikita-679", len >= 0);
42078+
42079+ /* FNV-1 hash each octet in the buffer */
42080+ for (; *name; ++name) {
42081+ /* multiply by the 32 bit FNV magic prime mod 2^64 */
42082+ a *= fnv_64_prime;
42083+ /* xor the bottom with the current octet */
42084+ a ^= (unsigned long long)(*name);
42085+ }
42086+ /* return our new hash value */
42087+ return a;
42088+}
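/* Editorial note: like hash_r5 above, the loop walks the NUL-terminated
 * name and ignores @len. For the empty name the body never runs and the
 * function returns the 64-bit FNV-1 offset basis:
 *
 *	hash_fnv1("", 0) == 0xcbf29ce484222325ull
 */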
42089+
42090+/* degenerate hash function used to simplify testing of non-unique key
42091+ handling */
42092+static __u64 hash_deg(const unsigned char *name UNUSED_ARG /* name to hash */ ,
42093+ int len UNUSED_ARG /* @name's length */ )
42094+{
42095+ return 0xc0c0c0c010101010ull;
42096+}
42097+
42098+static int change_hash(struct inode *inode, reiser4_plugin * plugin)
42099+{
42100+ int result;
42101+
42102+ assert("nikita-3503", inode != NULL);
42103+ assert("nikita-3504", plugin != NULL);
42104+
42105+ assert("nikita-3505", is_reiser4_inode(inode));
42106+ assert("nikita-3506", inode_dir_plugin(inode) != NULL);
42107+ assert("nikita-3507", plugin->h.type_id == REISER4_HASH_PLUGIN_TYPE);
42108+
42109+ result = 0;
42110+ if (inode_hash_plugin(inode) == NULL ||
42111+ inode_hash_plugin(inode)->h.id != plugin->h.id) {
42112+ if (is_dir_empty(inode) == 0)
42113+ result =
42114+ plugin_set_hash(&reiser4_inode_data(inode)->pset,
42115+ &plugin->hash);
42116+ else
42117+ result = RETERR(-ENOTEMPTY);
42118+
42119+ }
42120+ return result;
42121+}
42122+
42123+static reiser4_plugin_ops hash_plugin_ops = {
42124+ .init = NULL,
42125+ .load = NULL,
42126+ .save_len = NULL,
42127+ .save = NULL,
42128+ .change = change_hash
42129+};
42130+
42131+/* hash plugins */
42132+hash_plugin hash_plugins[LAST_HASH_ID] = {
42133+ [RUPASOV_HASH_ID] = {
42134+ .h = {
42135+ .type_id = REISER4_HASH_PLUGIN_TYPE,
42136+ .id = RUPASOV_HASH_ID,
42137+ .pops = &hash_plugin_ops,
42138+ .label = "rupasov",
42139+ .desc = "Original Yura's hash",
42140+ .linkage = {NULL, NULL}
42141+ },
42142+ .hash = hash_rupasov
42143+ },
42144+ [R5_HASH_ID] = {
42145+ .h = {
42146+ .type_id = REISER4_HASH_PLUGIN_TYPE,
42147+ .id = R5_HASH_ID,
42148+ .pops = &hash_plugin_ops,
42149+ .label = "r5",
42150+ .desc = "r5 hash",
42151+ .linkage = {NULL, NULL}
42152+ },
42153+ .hash = hash_r5
42154+ },
42155+ [TEA_HASH_ID] = {
42156+ .h = {
42157+ .type_id = REISER4_HASH_PLUGIN_TYPE,
42158+ .id = TEA_HASH_ID,
42159+ .pops = &hash_plugin_ops,
42160+ .label = "tea",
42161+ .desc = "tea hash",
42162+ .linkage = {NULL, NULL}
42163+ },
42164+ .hash = hash_tea
42165+ },
42166+ [FNV1_HASH_ID] = {
42167+ .h = {
42168+ .type_id = REISER4_HASH_PLUGIN_TYPE,
42169+ .id = FNV1_HASH_ID,
42170+ .pops = &hash_plugin_ops,
42171+ .label = "fnv1",
42172+ .desc = "fnv1 hash",
42173+ .linkage = {NULL, NULL}
42174+ },
42175+ .hash = hash_fnv1
42176+ },
42177+ [DEGENERATE_HASH_ID] = {
42178+ .h = {
42179+ .type_id = REISER4_HASH_PLUGIN_TYPE,
42180+ .id = DEGENERATE_HASH_ID,
42181+ .pops = &hash_plugin_ops,
42182+ .label = "degenerate hash",
42183+ .desc = "Degenerate hash: only for testing",
42184+ .linkage = {NULL, NULL}
42185+ },
42186+ .hash = hash_deg
42187+ }
42188+};
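/* Editorial sketch (hypothetical helper, not part of this patch):
 * callers reach the functions above through this table, normally via
 * inode_hash_plugin() on the directory, and call the ->hash method:
 */
static __u64 sketch_hash_dirent_name(const unsigned char *name, int len)
{
	hash_plugin *hplug = &hash_plugins[R5_HASH_ID];	/* assume r5 */

	return hplug->hash(name, len);
}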
42189+
42190+/* Make Linus happy.
42191+ Local variables:
42192+ c-indentation-style: "K&R"
42193+ mode-name: "LC"
42194+ c-basic-offset: 8
42195+ tab-width: 8
42196+ fill-column: 120
42197+ End:
42198+*/
42199Index: linux-2.6.16/fs/reiser4/plugin/inode_ops.c
42200===================================================================
42201--- /dev/null
42202+++ linux-2.6.16/fs/reiser4/plugin/inode_ops.c
42203@@ -0,0 +1,886 @@
42204+/*
42205+ * Copyright 2005 by Hans Reiser, licensing governed by reiser4/README
42206+ */
42207+
42208+/*
42209+ * this file contains typical implementations for most of methods of struct
42210+ * inode_operations
42211+ */
42212+
42213+#include "../inode.h"
42214+#include "../safe_link.h"
42215+
42216+#include <linux/quotaops.h>
42217+#include <linux/namei.h>
42218+
42219+
42220+static int create_vfs_object(struct inode *parent, struct dentry *dentry,
42221+ reiser4_object_create_data *data);
42222+
42223+/**
42224+ * create_common - create of inode operations
42225+ * @parent: inode of parent directory
42226+ * @dentry: dentry of new object to create
42227+ * @mode: the permissions to use
42228+ * @nameidata:
42229+ *
42230+ * This is common implementation of vfs's create method of struct
42231+ * inode_operations.
42232+ * Creates regular file using file plugin from parent directory plugin set.
42233+ */
42234+int create_common(struct inode *parent, struct dentry *dentry,
42235+ int mode, struct nameidata *nameidata)
42236+{
42237+ reiser4_object_create_data data;
42238+
42239+ memset(&data, 0, sizeof data);
42240+ data.mode = S_IFREG | mode;
42241+ data.id = inode_regular_plugin(parent)->id;
42242+ return create_vfs_object(parent, dentry, &data);
42243+}
42244+
42245+int lookup_name(struct inode *dir, struct dentry *, reiser4_key *);
42246+void check_light_weight(struct inode *inode, struct inode *parent);
42247+
42248+/**
42249+ * lookup_common - lookup of inode operations
42250+ * @parent: inode of directory to lookup into
42251+ * @dentry: name to look for
42252+ * @nameidata:
42253+ *
42254+ * This is common implementation of vfs's lookup method of struct
42255+ * inode_operations.
42256+ */
42257+struct dentry *lookup_common(struct inode *parent, struct dentry *dentry,
42258+ struct nameidata *nameidata)
42259+{
42260+ reiser4_context *ctx;
42261+ int result;
42262+ struct dentry *new;
42263+ struct inode *inode;
42264+ reiser4_dir_entry_desc entry;
42265+
42266+ ctx = init_context(parent->i_sb);
42267+ if (IS_ERR(ctx))
42268+ return (struct dentry *)ctx;
42269+
42270+ /* set up operations on dentry. */
42271+ dentry->d_op = &get_super_private(parent->i_sb)->ops.dentry;
42272+
42273+ result = lookup_name(parent, dentry, &entry.key);
42274+ if (result) {
42275+ context_set_commit_async(ctx);
42276+ reiser4_exit_context(ctx);
42277+ if (result == -ENOENT) {
42278+ /* object not found */
42279+ if (!IS_DEADDIR(parent))
42280+ d_add(dentry, NULL);
42281+ return NULL;
42282+ }
42283+ return ERR_PTR(result);
42284+ }
42285+
42286+ inode = reiser4_iget(parent->i_sb, &entry.key, 0);
42287+ if (IS_ERR(inode)) {
42288+ context_set_commit_async(ctx);
42289+ reiser4_exit_context(ctx);
42290+ return ERR_PTR(PTR_ERR(inode));
42291+ }
42292+
42293+ /* success */
42294+ check_light_weight(inode, parent);
42295+ new = d_splice_alias(inode, dentry);
42296+ reiser4_iget_complete(inode);
42297+
42298+ /* prevent balance_dirty_pages() from being called: we don't want to
42299+ * do this under directory i_mutex. */
42300+ context_set_commit_async(ctx);
42301+ reiser4_exit_context(ctx);
42302+ return new;
42303+}
42304+
42305+static reiser4_block_nr common_estimate_link(struct inode *parent,
42306+ struct inode *object);
42307+int reiser4_update_dir(struct inode *);
42308+
42309+/**
42310+ * link_common - link of inode operations
42311+ * @existing: dentry of object which is to get new name
42312+ * @parent: directory where new name is to be created
42313+ * @newname: new name
42314+ *
42315+ * This is common implementation of vfs's link method of struct
42316+ * inode_operations.
42317+ */
42318+int link_common(struct dentry *existing, struct inode *parent,
42319+ struct dentry *newname)
42320+{
42321+ reiser4_context *ctx;
42322+ int result;
42323+ struct inode *object;
42324+ dir_plugin *parent_dplug;
42325+ reiser4_dir_entry_desc entry;
42326+ reiser4_object_create_data data;
42327+ reiser4_block_nr reserve;
42328+
42329+ ctx = init_context(parent->i_sb);
42330+ if (IS_ERR(ctx))
42331+ return PTR_ERR(ctx);
42332+
42333+ assert("nikita-1431", existing != NULL);
42334+ assert("nikita-1432", parent != NULL);
42335+ assert("nikita-1433", newname != NULL);
42336+
42337+ object = existing->d_inode;
42338+ assert("nikita-1434", object != NULL);
42339+
42340+ /* check for race with create_object() */
42341+ if (inode_get_flag(object, REISER4_IMMUTABLE)) {
42342+ context_set_commit_async(ctx);
42343+ reiser4_exit_context(ctx);
42344+ return RETERR(-E_REPEAT);
42345+ }
42346+
42347+ parent_dplug = inode_dir_plugin(parent);
42348+
42349+ memset(&entry, 0, sizeof entry);
42350+ entry.obj = object;
42351+
42352+ data.mode = object->i_mode;
42353+ data.id = inode_file_plugin(object)->h.id;
42354+
42355+ reserve = common_estimate_link(parent, existing->d_inode);
42356+ if ((__s64) reserve < 0) {
42357+ context_set_commit_async(ctx);
42358+ reiser4_exit_context(ctx);
42359+ return reserve;
42360+ }
42361+
42362+ if (reiser4_grab_space(reserve, BA_CAN_COMMIT)) {
42363+ context_set_commit_async(ctx);
42364+ reiser4_exit_context(ctx);
42365+ return RETERR(-ENOSPC);
42366+ }
42367+
42368+ /*
42369+ * Subtle race handling: sys_link() doesn't take i_mutex on @parent. It
42370+	 * means that link(2) can race against unlink(2) or rename(2), and the
42371+	 * inode is dead (->i_nlink == 0) when reiser4_link() is entered.
42372+	 *
42373+	 * For such an inode we have to undo the special processing done in
42374+	 * reiser4_unlink(), viz. the creation of a safe-link.
42375+ */
42376+ if (unlikely(object->i_nlink == 0)) {
42377+ result = safe_link_del(tree_by_inode(object),
42378+ get_inode_oid(object), SAFE_UNLINK);
42379+ if (result != 0) {
42380+ context_set_commit_async(ctx);
42381+ reiser4_exit_context(ctx);
42382+ return result;
42383+ }
42384+ }
42385+
42386+ /* increment nlink of @existing and update its stat data */
42387+ result = reiser4_add_nlink(object, parent, 1);
42388+ if (result == 0) {
42389+ /* add entry to the parent */
42390+ result =
42391+ parent_dplug->add_entry(parent, newname, &data, &entry);
42392+ if (result != 0) {
42393+ /* failed to add entry to the parent, decrement nlink
42394+ of @existing */
42395+ reiser4_del_nlink(object, parent, 1);
42396+ /*
42397+			 * now, if that failed too, we have a file with too
42398+			 * big an nlink - a space leak, but much better than
42399+			 * a directory entry pointing to nowhere
42400+ */
42401+ }
42402+ }
42403+ if (result == 0) {
42404+ atomic_inc(&object->i_count);
42405+ /*
42406+ * Upon successful completion, link() shall mark for update
42407+ * the st_ctime field of the file. Also, the st_ctime and
42408+ * st_mtime fields of the directory that contains the new
42409+ * entry shall be marked for update. --SUS
42410+ */
42411+ result = reiser4_update_dir(parent);
42412+ }
42413+ if (result == 0)
42414+ d_instantiate(newname, existing->d_inode);
42415+
42416+ context_set_commit_async(ctx);
42417+ reiser4_exit_context(ctx);
42418+ return result;
42419+}
42420+
42421+static int unlink_check_and_grab(struct inode *parent, struct dentry *victim);
42422+
42423+/**
42424+ * unlink_common - unlink of inode operations
42425+ * @parent: inode of directory to remove name from
42426+ * @victim: name to be removed
42427+ *
42428+ * This is common implementation of vfs's unlink method of struct
42429+ * inode_operations.
42430+ */
42431+int unlink_common(struct inode *parent, struct dentry *victim)
42432+{
42433+ reiser4_context *ctx;
42434+ int result;
42435+ struct inode *object;
42436+ file_plugin *fplug;
42437+
42438+ ctx = init_context(parent->i_sb);
42439+ if (IS_ERR(ctx))
42440+ return PTR_ERR(ctx);
42441+
42442+ object = victim->d_inode;
42443+ fplug = inode_file_plugin(object);
42444+ assert("nikita-2882", fplug->detach != NULL);
42445+
42446+ result = unlink_check_and_grab(parent, victim);
42447+ if (result != 0) {
42448+ context_set_commit_async(ctx);
42449+ reiser4_exit_context(ctx);
42450+ return result;
42451+ }
42452+
42453+ result = fplug->detach(object, parent);
42454+ if (result == 0) {
42455+ dir_plugin *parent_dplug;
42456+ reiser4_dir_entry_desc entry;
42457+
42458+ parent_dplug = inode_dir_plugin(parent);
42459+ memset(&entry, 0, sizeof entry);
42460+
42461+ /* first, delete directory entry */
42462+ result = parent_dplug->rem_entry(parent, victim, &entry);
42463+ if (result == 0) {
42464+ /*
42465+ * if name was removed successfully, we _have_ to
42466+			 * return 0 from this function, because the upper
42467+			 * level callers (vfs_{rmdir,unlink}) expect this.
42468+ *
42469+ * now that directory entry is removed, update
42470+ * stat-data
42471+ */
42472+ reiser4_del_nlink(object, parent, 1);
42473+ /*
42474+ * Upon successful completion, unlink() shall mark for
42475+ * update the st_ctime and st_mtime fields of the
42476+ * parent directory. Also, if the file's link count is
42477+ * not 0, the st_ctime field of the file shall be
42478+ * marked for update. --SUS
42479+ */
42480+ reiser4_update_dir(parent);
42481+ /* add safe-link for this file */
42482+ if (object->i_nlink == 0)
42483+ safe_link_add(object, SAFE_UNLINK);
42484+ }
42485+ }
42486+
42487+ if (unlikely(result != 0)) {
42488+ if (result != -ENOMEM)
42489+ warning("nikita-3398", "Cannot unlink %llu (%i)",
42490+ (unsigned long long)get_inode_oid(object),
42491+ result);
42492+		/* if the operation failed, commit pending inode modifications
42493+		 * to the stat-data */
42494+ reiser4_update_sd(object);
42495+ reiser4_update_sd(parent);
42496+ }
42497+
42498+ reiser4_release_reserved(object->i_sb);
42499+
42500+	/* @object's i_ctime was updated by the ->rem_link() method. */
42501+
42502+	/* @victim may already be removed from the disk by this time. The
42503+	   inode is then marked so that iput() won't try to remove the stat
42504+	   data. But the inode itself is still there.
42505+	 */
42506+
42507+ /*
42508+ * we cannot release directory semaphore here, because name has
42509+ * already been deleted, but dentry (@victim) still exists. Prevent
42510+ * balance_dirty_pages() from being called on exiting this context: we
42511+ * don't want to do this under directory i_mutex.
42512+ */
42513+ context_set_commit_async(ctx);
42514+ reiser4_exit_context(ctx);
42515+ return result;
42516+}
42517+
42518+/**
42519+ * symlink_common - symlink of inode operations
42520+ * @parent: inode of parent directory
42521+ * @dentry: dentry of object to be created
42522+ * @linkname: string symlink is to contain
42523+ *
42524+ * This is common implementation of vfs's symlink method of struct
42525+ * inode_operations.
42526+ * Creates object using file plugin SYMLINK_FILE_PLUGIN_ID.
42527+ */
42528+int symlink_common(struct inode *parent, struct dentry *dentry,
42529+ const char *linkname)
42530+{
42531+ reiser4_object_create_data data;
42532+
42533+ memset(&data, 0, sizeof data);
42534+ data.name = linkname;
42535+ data.id = SYMLINK_FILE_PLUGIN_ID;
42536+ data.mode = S_IFLNK | S_IRWXUGO;
42537+ return create_vfs_object(parent, dentry, &data);
42538+}
42539+
42540+/**
42541+ * mkdir_common - mkdir of inode operations
42542+ * @parent: inode of parent directory
42543+ * @dentry: dentry of object to be created
42544+ * @mode: the permissions to use
42545+ *
42546+ * This is common implementation of vfs's mkdir method of struct
42547+ * inode_operations.
42548+ * Creates object using file plugin DIRECTORY_FILE_PLUGIN_ID.
42549+ */
42550+int mkdir_common(struct inode *parent, struct dentry *dentry, int mode)
42551+{
42552+ reiser4_object_create_data data;
42553+
42554+ memset(&data, 0, sizeof data);
42555+ data.mode = S_IFDIR | mode;
42556+ data.id = DIRECTORY_FILE_PLUGIN_ID;
42557+ return create_vfs_object(parent, dentry, &data);
42558+}
42559+
42560+/**
42561+ * mknod_common - mknod of inode operations
42562+ * @parent: inode of parent directory
42563+ * @dentry: dentry of object to be created
42564+ * @mode: the permissions to use and file type
42565+ * @rdev: minor and major of new device file
42566+ *
42567+ * This is common implementation of vfs's mknod method of struct
42568+ * inode_operations.
42569+ * Creates object using file plugin SPECIAL_FILE_PLUGIN_ID.
42570+ */
42571+int mknod_common(struct inode *parent, struct dentry *dentry,
42572+ int mode, dev_t rdev)
42573+{
42574+ reiser4_object_create_data data;
42575+
42576+ memset(&data, 0, sizeof data);
42577+ data.mode = mode;
42578+ data.rdev = rdev;
42579+ data.id = SPECIAL_FILE_PLUGIN_ID;
42580+ return create_vfs_object(parent, dentry, &data);
42581+}
42582+
42583+/*
42584+ * the implementation of the VFS rename method of struct inode_operations for a
42585+ * typical directory is in inode_ops_rename.c
42586+ */
42587+
42588+/**
42589+ * follow_link_common - follow_link of inode operations
42590+ * @dentry: dentry of symlink
42591+ * @nd: nameidata to store the symlink content in
42592+ *
42593+ * This is the common implementation of the VFS follow_link method of struct
42594+ * inode_operations.
42595+ * Assumes that the inode's generic_ip points to the content of the symbolic link.
42596+ */
42597+void *follow_link_common(struct dentry *dentry, struct nameidata *nd)
42598+{
42599+ assert("vs-851", S_ISLNK(dentry->d_inode->i_mode));
42600+
42601+ if (!dentry->d_inode->u.generic_ip
42602+ || !inode_get_flag(dentry->d_inode, REISER4_GENERIC_PTR_USED))
42603+ return ERR_PTR(RETERR(-EINVAL));
42604+ nd_set_link(nd, dentry->d_inode->u.generic_ip);
42605+ return NULL;
42606+}
42607+
42608+/**
42609+ * permission_common - permission of inode operations
42610+ * @inode: inode to check permissions for
42611+ * @mask: mode bits to check permissions for
42612+ * @nameidata: unused
42613+ *
42614+ * Uses generic function to check for rwx permissions.
42615+ */
42616+int permission_common(struct inode *inode, int mask,
42617+ struct nameidata *nameidata)
42618+{
42619+ return generic_permission(inode, mask, NULL);
42620+}
42621+
42622+static int setattr_reserve(reiser4_tree *);
42623+
42624+/* this is the common implementation of the VFS setattr method of struct
42625+   inode_operations
42626+*/
42627+int setattr_common(struct dentry *dentry, struct iattr *attr)
42628+{
42629+ reiser4_context *ctx;
42630+ struct inode *inode;
42631+ int result;
42632+
42633+ inode = dentry->d_inode;
42634+ result = inode_change_ok(inode, attr);
42635+ if (result)
42636+ return result;
42637+
42638+ ctx = init_context(inode->i_sb);
42639+ if (IS_ERR(ctx))
42640+ return PTR_ERR(ctx);
42641+
42642+ assert("nikita-3119", !(attr->ia_valid & ATTR_SIZE));
42643+
42644+ /*
42645+ * grab disk space and call standard inode_setattr().
42646+ */
42647+ result = setattr_reserve(tree_by_inode(inode));
42648+ if (!result) {
42649+ if ((attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid)
42650+ || (attr->ia_valid & ATTR_GID
42651+ && attr->ia_gid != inode->i_gid)) {
42652+ result = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0;
42653+ if (result) {
42654+ context_set_commit_async(ctx);
42655+ reiser4_exit_context(ctx);
42656+ return result;
42657+ }
42658+ }
42659+ result = inode_setattr(inode, attr);
42660+ if (!result)
42661+ reiser4_update_sd(inode);
42662+ }
42663+
42664+ context_set_commit_async(ctx);
42665+ reiser4_exit_context(ctx);
42666+ return result;
42667+}
42668+
42669+/* this is the common implementation of the VFS getattr method of struct
42670+ inode_operations
42671+*/
42672+int
42673+getattr_common(struct vfsmount *mnt UNUSED_ARG, struct dentry *dentry,
42674+ struct kstat *stat)
42675+{
42676+ struct inode *obj;
42677+
42678+ assert("nikita-2298", dentry != NULL);
42679+ assert("nikita-2299", stat != NULL);
42680+ assert("nikita-2300", dentry->d_inode != NULL);
42681+
42682+ obj = dentry->d_inode;
42683+
42684+ stat->dev = obj->i_sb->s_dev;
42685+ stat->ino = oid_to_uino(get_inode_oid(obj));
42686+ stat->mode = obj->i_mode;
42687+	/* don't confuse userland with a huge nlink. This is not entirely
42688+	 * correct, because nlink_t is not necessarily a signed 16-bit type. */
42689+ stat->nlink = min(obj->i_nlink, (typeof(obj->i_nlink)) 0x7fff);
42690+ stat->uid = obj->i_uid;
42691+ stat->gid = obj->i_gid;
42692+ stat->rdev = obj->i_rdev;
42693+ stat->atime = obj->i_atime;
42694+ stat->mtime = obj->i_mtime;
42695+ stat->ctime = obj->i_ctime;
42696+ stat->size = obj->i_size;
42697+ stat->blocks =
42698+ (inode_get_bytes(obj) + VFS_BLKSIZE - 1) >> VFS_BLKSIZE_BITS;
42699+ /* "preferred" blocksize for efficient file system I/O */
42700+ stat->blksize = get_super_private(obj->i_sb)->optimal_io_size;
42701+
42702+ return 0;
42703+}
42704+
42705+/* Estimate the maximum number of nodes which might be allocated or changed on
42706+   typical new object creation. Typical creation consists of calling the create
42707+   method of the file plugin, adding a directory entry to the parent and
42708+   updating the parent directory's stat data.
42709+*/
42710+static reiser4_block_nr estimate_create_vfs_object(struct inode *parent, /* parent object */
42711+ struct inode *object
42712+ /* object */ )
42713+{
42714+ assert("vpf-309", parent != NULL);
42715+ assert("vpf-307", object != NULL);
42716+
42717+ return
42718+ /* object creation estimation */
42719+ inode_file_plugin(object)->estimate.create(object) +
42720+ /* stat data of parent directory estimation */
42721+ inode_file_plugin(parent)->estimate.update(parent) +
42722+ /* adding entry estimation */
42723+ inode_dir_plugin(parent)->estimate.add_entry(parent) +
42724+ /* to undo in the case of failure */
42725+ inode_dir_plugin(parent)->estimate.rem_entry(parent);
42726+}
42727+
42728+/* Create child in directory.
42729+
42730+ . get object's plugin
42731+ . get fresh inode
42732+ . initialize inode
42733+ . add object's stat-data
42734+ . initialize object's directory
42735+ . add entry to the parent
42736+ . instantiate dentry
42737+
42738+*/
42739+static int do_create_vfs_child(reiser4_object_create_data * data, /* parameters of new
42740+ object */
42741+ struct inode **retobj)
42742+{
42743+ int result;
42744+
42745+	struct dentry *dentry;	/* new name */
42746+	struct inode *parent;	/* parent object */
42747+
42748+ dir_plugin *par_dir; /* directory plugin on the parent */
42749+ dir_plugin *obj_dir; /* directory plugin on the new object */
42750+ file_plugin *obj_plug; /* object plugin on the new object */
42751+ struct inode *object; /* new object */
42752+ reiser4_block_nr reserve;
42753+
42754+ reiser4_dir_entry_desc entry; /* new directory entry */
42755+
42756+ assert("nikita-1420", data != NULL);
42757+ parent = data->parent;
42758+ dentry = data->dentry;
42759+
42760+ assert("nikita-1418", parent != NULL);
42761+ assert("nikita-1419", dentry != NULL);
42762+
42763+ /* check, that name is acceptable for parent */
42764+ par_dir = inode_dir_plugin(parent);
42765+ if (par_dir->is_name_acceptable &&
42766+ !par_dir->is_name_acceptable(parent,
42767+ dentry->d_name.name,
42768+ (int)dentry->d_name.len))
42769+ return RETERR(-ENAMETOOLONG);
42770+
42771+ result = 0;
42772+ obj_plug = file_plugin_by_id((int)data->id);
42773+ if (obj_plug == NULL) {
42774+ warning("nikita-430", "Cannot find plugin %i", data->id);
42775+ return RETERR(-ENOENT);
42776+ }
42777+ object = new_inode(parent->i_sb);
42778+ if (object == NULL)
42779+ return RETERR(-ENOMEM);
42780+ /* we'll update i_nlink below */
42781+ object->i_nlink = 0;
42782+ /* new_inode() initializes i_ino to "arbitrary" value. Reset it to 0,
42783+ * to simplify error handling: if some error occurs before i_ino is
42784+ * initialized with oid, i_ino should already be set to some
42785+ * distinguished value. */
42786+ object->i_ino = 0;
42787+
42788+ /* So that on error iput will be called. */
42789+ *retobj = object;
42790+
42791+ if (DQUOT_ALLOC_INODE(object)) {
42792+ DQUOT_DROP(object);
42793+ object->i_flags |= S_NOQUOTA;
42794+ return RETERR(-EDQUOT);
42795+ }
42796+
42797+ memset(&entry, 0, sizeof entry);
42798+ entry.obj = object;
42799+
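+	/* install the requested file plugin on the new inode; the plugin's
+	 * own initialization hooks are invoked below */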
42800+ plugin_set_file(&reiser4_inode_data(object)->pset, obj_plug);
42801+ result = obj_plug->set_plug_in_inode(object, parent, data);
42802+ if (result) {
42803+ warning("nikita-431", "Cannot install plugin %i on %llx",
42804+ data->id, (unsigned long long)get_inode_oid(object));
42805+ DQUOT_FREE_INODE(object);
42806+ object->i_flags |= S_NOQUOTA;
42807+ return result;
42808+ }
42809+
42810+ /* reget plugin after installation */
42811+ obj_plug = inode_file_plugin(object);
42812+
42813+ if (obj_plug->create_object == NULL) {
42814+ DQUOT_FREE_INODE(object);
42815+ object->i_flags |= S_NOQUOTA;
42816+ return RETERR(-EPERM);
42817+ }
42818+
42819+	/* if any of the hash, tail, sd or permission plugins for the newly
42820+	   created object are not set yet, set them here, inheriting them
42821+	   from the parent directory
42822+ */
42823+ assert("nikita-2070", obj_plug->adjust_to_parent != NULL);
42824+ result = obj_plug->adjust_to_parent(object,
42825+ parent,
42826+ object->i_sb->s_root->d_inode);
42827+ if (result != 0) {
42828+ warning("nikita-432", "Cannot inherit from %llx to %llx",
42829+ (unsigned long long)get_inode_oid(parent),
42830+ (unsigned long long)get_inode_oid(object));
42831+ DQUOT_FREE_INODE(object);
42832+ object->i_flags |= S_NOQUOTA;
42833+ return result;
42834+ }
42835+
42836+ /* setup inode and file-operations for this inode */
42837+ setup_inode_ops(object, data);
42838+
42839+ /* call file plugin's method to initialize plugin specific part of
42840+ * inode */
42841+ if (obj_plug->init_inode_data)
42842+ obj_plug->init_inode_data(object, data, 1 /*create */ );
42843+
42844+ /* obtain directory plugin (if any) for new object. */
42845+ obj_dir = inode_dir_plugin(object);
42846+ if (obj_dir != NULL && obj_dir->init == NULL) {
42847+ DQUOT_FREE_INODE(object);
42848+ object->i_flags |= S_NOQUOTA;
42849+ return RETERR(-EPERM);
42850+ }
42851+
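+	/* the new object's key locality is inherited from the parent
+	 * directory's object id */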
42852+ reiser4_inode_data(object)->locality_id = get_inode_oid(parent);
42853+
42854+ reserve = estimate_create_vfs_object(parent, object);
42855+ if (reiser4_grab_space(reserve, BA_CAN_COMMIT)) {
42856+ DQUOT_FREE_INODE(object);
42857+ object->i_flags |= S_NOQUOTA;
42858+ return RETERR(-ENOSPC);
42859+ }
42860+
42861+	/* mark inode `immutable'. We disable changes to the file being
42862+	   created until a valid directory entry for it is inserted. Otherwise,
42863+	   if the file were expanded and insertion of the directory entry
42864+	   failed, we would have to remove the file, but we only allotted
42865+	   enough space in the transaction to remove an _empty_ file. 3.x code
42866+	   used to remove the stat data in a different transaction, thus
42867+	   possibly leaking disk space on a crash. This all only matters if it
42868+	   is possible to access a file without a name, for example, by inode number.
42869+	 */
42870+ inode_set_flag(object, REISER4_IMMUTABLE);
42871+
42872+	/* create an empty object; this includes allocation of a new objectid.
42873+	   For directories this implies creation of the dot and dotdot entries */
42874+ assert("nikita-2265", inode_get_flag(object, REISER4_NO_SD));
42875+
42876+ /* mark inode as `loaded'. From this point onward
42877+ reiser4_delete_inode() will try to remove its stat-data. */
42878+ inode_set_flag(object, REISER4_LOADED);
42879+
42880+ result = obj_plug->create_object(object, parent, data);
42881+ if (result != 0) {
42882+ inode_clr_flag(object, REISER4_IMMUTABLE);
42883+ if (result != -ENAMETOOLONG && result != -ENOMEM)
42884+ warning("nikita-2219",
42885+ "Failed to create sd for %llu",
42886+ (unsigned long long)get_inode_oid(object));
42887+ DQUOT_FREE_INODE(object);
42888+ object->i_flags |= S_NOQUOTA;
42889+ return result;
42890+ }
42891+
42892+ if (obj_dir != NULL)
42893+ result = obj_dir->init(object, parent, data);
42894+ if (result == 0) {
42895+ assert("nikita-434", !inode_get_flag(object, REISER4_NO_SD));
42896+ /* insert inode into VFS hash table */
42897+ insert_inode_hash(object);
42898+ /* create entry */
42899+ result = par_dir->add_entry(parent, dentry, data, &entry);
42900+ if (result == 0) {
42901+ result = reiser4_add_nlink(object, parent, 0);
42902+ /* If O_CREAT is set and the file did not previously
42903+ exist, upon successful completion, open() shall
42904+ mark for update the st_atime, st_ctime, and
42905+ st_mtime fields of the file and the st_ctime and
42906+ st_mtime fields of the parent directory. --SUS
42907+ */
42908+ /* @object times are already updated by
42909+ reiser4_add_nlink() */
42910+ if (result == 0)
42911+ reiser4_update_dir(parent);
42912+ if (result != 0)
42913+ /* cleanup failure to add nlink */
42914+ par_dir->rem_entry(parent, dentry, &entry);
42915+ }
42916+ if (result != 0)
42917+ /* cleanup failure to add entry */
42918+ obj_plug->detach(object, parent);
42919+ } else if (result != -ENOMEM)
42920+ warning("nikita-2219", "Failed to initialize dir for %llu: %i",
42921+ (unsigned long long)get_inode_oid(object), result);
42922+
42923+ /*
42924+ * update stat-data, committing all pending modifications to the inode
42925+ * fields.
42926+ */
42927+ reiser4_update_sd(object);
42928+ if (result != 0) {
42929+ DQUOT_FREE_INODE(object);
42930+ object->i_flags |= S_NOQUOTA;
42931+ /* if everything was ok (result == 0), parent stat-data is
42932+	 * already updated above (reiser4_update_dir()) */
42933+ reiser4_update_sd(parent);
42934+ /* failure to create entry, remove object */
42935+ obj_plug->delete_object(object);
42936+ }
42937+
42938+ /* file has name now, clear immutable flag */
42939+ inode_clr_flag(object, REISER4_IMMUTABLE);
42940+
42941+ /* on error, iput() will call ->delete_inode(). We should keep track
42942+	   of the existence of stat-data for this inode and avoid trying to
42943+ remove it in reiser4_delete_inode(). This is accomplished through
42944+ REISER4_NO_SD bit in inode.u.reiser4_i.plugin.flags
42945+ */
42946+ return result;
42947+}
42948+
42949+/* this is a helper for the common implementations of reiser4_mkdir, reiser4_create,
42950+ reiser4_mknod and reiser4_symlink
42951+*/
42952+static int
42953+create_vfs_object(struct inode *parent,
42954+ struct dentry *dentry, reiser4_object_create_data * data)
42955+{
42956+ reiser4_context *ctx;
42957+ int result;
42958+ struct inode *child;
42959+
42960+ ctx = init_context(parent->i_sb);
42961+ if (IS_ERR(ctx))
42962+ return PTR_ERR(ctx);
42963+ context_set_commit_async(ctx);
42964+
42965+ data->parent = parent;
42966+ data->dentry = dentry;
42967+ child = NULL;
42968+ result = do_create_vfs_child(data, &child);
42969+ if (unlikely(result != 0)) {
42970+ if (child != NULL) {
42971+ reiser4_make_bad_inode(child);
42972+ iput(child);
42973+ }
42974+ } else
42975+ d_instantiate(dentry, child);
42976+
42977+ reiser4_exit_context(ctx);
42978+ return result;
42979+}
42980+
42981+/* helper for link_common. Estimate disk space necessary to add a link
42982+ from @parent to @object
42983+*/
42984+static reiser4_block_nr common_estimate_link(struct inode *parent, /* parent directory */
42985+ struct inode *object
42986+					     /* object to which new link is being created */
42987+ )
42988+{
42989+ reiser4_block_nr res = 0;
42990+ file_plugin *fplug;
42991+ dir_plugin *dplug;
42992+
42993+ assert("vpf-317", object != NULL);
42994+ assert("vpf-318", parent != NULL);
42995+
42996+ fplug = inode_file_plugin(object);
42997+ dplug = inode_dir_plugin(parent);
42998+ /* VS-FIXME-HANS: why do we do fplug->estimate.update(object) twice instead of multiplying by 2? */
42999+ /* reiser4_add_nlink(object) */
43000+ res += fplug->estimate.update(object);
43001+ /* add_entry(parent) */
43002+ res += dplug->estimate.add_entry(parent);
43003+ /* reiser4_del_nlink(object) */
43004+ res += fplug->estimate.update(object);
43005+ /* update_dir(parent) */
43006+ res += inode_file_plugin(parent)->estimate.update(parent);
43007+ /* safe-link */
43008+ res += estimate_one_item_removal(tree_by_inode(object));
43009+
43010+ return res;
43011+}
43012+
43013+/* Estimate disk space necessary to remove a link between @parent and
43014+ @object.
43015+*/
43016+static reiser4_block_nr estimate_unlink(struct inode *parent, /* parent directory */
43017+ struct inode *object
43018+					/* object whose link is being removed */
43019+ )
43020+{
43021+ reiser4_block_nr res = 0;
43022+ file_plugin *fplug;
43023+ dir_plugin *dplug;
43024+
43025+ assert("vpf-317", object != NULL);
43026+ assert("vpf-318", parent != NULL);
43027+
43028+ fplug = inode_file_plugin(object);
43029+ dplug = inode_dir_plugin(parent);
43030+
43031+ /* rem_entry(parent) */
43032+ res += dplug->estimate.rem_entry(parent);
43033+ /* reiser4_del_nlink(object) */
43034+ res += fplug->estimate.update(object);
43035+ /* update_dir(parent) */
43036+ res += inode_file_plugin(parent)->estimate.update(parent);
43037+ /* fplug->unlink */
43038+ res += fplug->estimate.unlink(object, parent);
43039+ /* safe-link */
43040+ res += estimate_one_insert_item(tree_by_inode(object));
43041+
43042+ return res;
43043+}
43044+
43045+/* helper for unlink_common. Estimate and grab space for unlink. */
43046+static int unlink_check_and_grab(struct inode *parent, struct dentry *victim)
43047+{
43048+ file_plugin *fplug;
43049+ struct inode *child;
43050+ int result;
43051+
43052+ result = 0;
43053+ child = victim->d_inode;
43054+ fplug = inode_file_plugin(child);
43055+
43056+ /* check for race with create_object() */
43057+ if (inode_get_flag(child, REISER4_IMMUTABLE))
43058+ return RETERR(-E_REPEAT);
43059+ /* object being deleted should have stat data */
43060+ assert("vs-949", !inode_get_flag(child, REISER4_NO_SD));
43061+
43062+ /* ask object plugin */
43063+ if (fplug->can_rem_link != NULL && !fplug->can_rem_link(child))
43064+ return RETERR(-ENOTEMPTY);
43065+
43066+ result = (int)estimate_unlink(parent, child);
43067+ if (result < 0)
43068+ return result;
43069+
43070+ return reiser4_grab_reserved(child->i_sb, result, BA_CAN_COMMIT);
43071+}
43072+
43073+/* helper for setattr_common */
43074+static int setattr_reserve(reiser4_tree * tree)
43075+{
43076+ assert("vs-1096", is_grab_enabled(get_current_context()));
43077+ return reiser4_grab_space(estimate_one_insert_into_item(tree),
43078+ BA_CAN_COMMIT);
43079+}
43080+
43081+/* helper function. Standards require that, for many file-system operations,
43082+   on success the ctime and mtime of the parent directory are to be updated. */
43083+int reiser4_update_dir(struct inode *dir)
43084+{
43085+ assert("nikita-2525", dir != NULL);
43086+
43087+ dir->i_ctime = dir->i_mtime = CURRENT_TIME;
43088+ return reiser4_update_sd(dir);
43089+}
43090Index: linux-2.6.16/fs/reiser4/plugin/inode_ops_rename.c
43091===================================================================
43092--- /dev/null
43093+++ linux-2.6.16/fs/reiser4/plugin/inode_ops_rename.c
43094@@ -0,0 +1,904 @@
43095+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
43096+ * reiser4/README */
43097+
43098+#include "../inode.h"
43099+#include "../safe_link.h"
43100+
43101+static const char *possible_leak = "Possible disk space leak.";
43102+
43103+/* re-bind existing name at @from_coord in @from_dir to point to @to_inode.
43104+
43105+   Helper function called from rename_common() */
43106+static int replace_name(struct inode *to_inode, /* inode where @from_coord is
43107+ * to be re-targeted at */
43108+ struct inode *from_dir, /* directory where @from_coord
43109+ * lives */
43110+ struct inode *from_inode, /* inode @from_coord
43111+						 * originally points to */
43112+ coord_t * from_coord, /* where directory entry is in
43113+ * the tree */
43114+ lock_handle * from_lh /* lock handle on @from_coord */ )
43115+{
43116+ item_plugin *from_item;
43117+ int result;
43118+ znode *node;
43119+
43120+ coord_clear_iplug(from_coord);
43121+ node = from_coord->node;
43122+ result = zload(node);
43123+ if (result != 0)
43124+ return result;
43125+ from_item = item_plugin_by_coord(from_coord);
43126+ if (item_type_by_coord(from_coord) == DIR_ENTRY_ITEM_TYPE) {
43127+ reiser4_key to_key;
43128+
43129+ build_sd_key(to_inode, &to_key);
43130+
43131+ /* everything is found and prepared to change directory entry
43132+ at @from_coord to point to @to_inode.
43133+
43134+ @to_inode is just about to get new name, so bump its link
43135+ counter.
43136+
43137+ */
43138+ result = reiser4_add_nlink(to_inode, from_dir, 0);
43139+ if (result != 0) {
43140+ /* Don't issue warning: this may be plain -EMLINK */
43141+ zrelse(node);
43142+ return result;
43143+ }
43144+
43145+ result =
43146+ from_item->s.dir.update_key(from_coord, &to_key, from_lh);
43147+ if (result != 0) {
43148+ reiser4_del_nlink(to_inode, from_dir, 0);
43149+ zrelse(node);
43150+ return result;
43151+ }
43152+
43153+ /* @from_inode just lost its name, he-he.
43154+
43155+		   If @from_inode was a directory, it contained a dotdot entry
43156+		   pointing to @from_dir. @from_dir's i_nlink will be decreased
43157+		   when iput() is called on @from_inode.
43158+
43159+		   If the file-system is not ADG (hard-links are
43160+		   supported on directories), iput(from_inode) will not remove
43161+		   @from_inode, and thus the above is incorrect, but hard-links
43162+		   on directories are problematic in many other respects.
43163+ */
43164+ result = reiser4_del_nlink(from_inode, from_dir, 0);
43165+ if (result != 0) {
43166+ warning("nikita-2330",
43167+ "Cannot remove link from source: %i. %s",
43168+ result, possible_leak);
43169+ }
43170+		/* Has to return success, because the entry has already been
43171+		 * modified. */
43172+ result = 0;
43173+
43174+		/* NOTE-NIKITA consider calling a plugin method instead of
43175+ accessing inode fields directly. */
43176+ from_dir->i_mtime = CURRENT_TIME;
43177+ } else {
43178+ warning("nikita-2326", "Unexpected item type");
43179+ result = RETERR(-EIO);
43180+ }
43181+ zrelse(node);
43182+ return result;
43183+}
43184+
43185+/* add new entry pointing to @inode into @dir at @coord, locked by @lh
43186+
43187+   Helper function used by rename_common(). */
43188+static int add_name(struct inode *inode, /* inode where @coord is to be
43189+ * re-targeted at */
43190+ struct inode *dir, /* directory where @coord lives */
43191+ struct dentry *name, /* new name */
43192+ coord_t * coord, /* where directory entry is in the tree */
43193+ lock_handle * lh, /* lock handle on @coord */
43194+ int is_dir /* true, if @inode is directory */ )
43195+{
43196+ int result;
43197+ reiser4_dir_entry_desc entry;
43198+
43199+ assert("nikita-2333", lh->node == coord->node);
43200+ assert("nikita-2334", is_dir == S_ISDIR(inode->i_mode));
43201+
43202+ memset(&entry, 0, sizeof entry);
43203+ entry.obj = inode;
43204+ /* build key of directory entry description */
43205+ inode_dir_plugin(dir)->build_entry_key(dir, &name->d_name, &entry.key);
43206+
43207+	/* ext2 does this in a different order: it first inserts the new
43208+	   entry, then increases the directory nlink. We don't want to do
43209+	   this, because reiser4_add_nlink() calls the ->add_link() plugin
43210+	   method which can fail for whatever reason, leaving us with
43211+	   cleanup problems.
43212+ */
43213+ /* @inode is getting new name */
43214+ reiser4_add_nlink(inode, dir, 0);
43215+ /* create @new_name in @new_dir pointing to
43216+ @old_inode */
43217+ result = WITH_COORD(coord,
43218+ inode_dir_item_plugin(dir)->s.dir.add_entry(dir,
43219+ coord,
43220+ lh,
43221+ name,
43222+ &entry));
43223+ if (result != 0) {
43224+ int result2;
43225+ result2 = reiser4_del_nlink(inode, dir, 0);
43226+ if (result2 != 0) {
43227+ warning("nikita-2327",
43228+ "Cannot drop link on %lli %i. %s",
43229+ (unsigned long long)get_inode_oid(inode),
43230+ result2, possible_leak);
43231+ }
43232+ } else
43233+ INODE_INC_FIELD(dir, i_size);
43234+ return result;
43235+}
43236+
43237+static reiser4_block_nr estimate_rename(struct inode *old_dir, /* directory where @old is located */
43238+ struct dentry *old_name, /* old name */
43239+ struct inode *new_dir, /* directory where @new is located */
43240+ struct dentry *new_name /* new name */ )
43241+{
43242+ reiser4_block_nr res1, res2;
43243+ dir_plugin *p_parent_old, *p_parent_new;
43244+ file_plugin *p_child_old, *p_child_new;
43245+
43246+ assert("vpf-311", old_dir != NULL);
43247+ assert("vpf-312", new_dir != NULL);
43248+ assert("vpf-313", old_name != NULL);
43249+ assert("vpf-314", new_name != NULL);
43250+
43251+ p_parent_old = inode_dir_plugin(old_dir);
43252+ p_parent_new = inode_dir_plugin(new_dir);
43253+ p_child_old = inode_file_plugin(old_name->d_inode);
43254+ if (new_name->d_inode)
43255+ p_child_new = inode_file_plugin(new_name->d_inode);
43256+ else
43257+ p_child_new = NULL;
43258+
43259+ /* find_entry - can insert one leaf. */
43260+ res1 = res2 = 1;
43261+
43262+ /* replace_name */
43263+ {
43264+ /* reiser4_add_nlink(p_child_old) and reiser4_del_nlink(p_child_old) */
43265+ res1 += 2 * p_child_old->estimate.update(old_name->d_inode);
43266+ /* update key */
43267+ res1 += 1;
43268+ /* reiser4_del_nlink(p_child_new) */
43269+ if (p_child_new)
43270+ res1 += p_child_new->estimate.update(new_name->d_inode);
43271+ }
43272+
43273+ /* else add_name */
43274+ {
43275+ /* reiser4_add_nlink(p_parent_new) and reiser4_del_nlink(p_parent_new) */
43276+ res2 +=
43277+ 2 * inode_file_plugin(new_dir)->estimate.update(new_dir);
43278+ /* reiser4_add_nlink(p_parent_old) */
43279+ res2 += p_child_old->estimate.update(old_name->d_inode);
43280+ /* add_entry(p_parent_new) */
43281+ res2 += p_parent_new->estimate.add_entry(new_dir);
43282+ /* reiser4_del_nlink(p_parent_old) */
43283+ res2 += p_child_old->estimate.update(old_name->d_inode);
43284+ }
43285+
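+	/* a rename is either a replace_name or an add_name; reserve space
+	 * for the worse of the two */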
43286+ res1 = res1 < res2 ? res2 : res1;
43287+
43288+ /* reiser4_write_sd(p_parent_new) */
43289+ res1 += inode_file_plugin(new_dir)->estimate.update(new_dir);
43290+
43291+ /* reiser4_write_sd(p_child_new) */
43292+ if (p_child_new)
43293+ res1 += p_child_new->estimate.update(new_name->d_inode);
43294+
43295+ /* hashed_rem_entry(p_parent_old) */
43296+ res1 += p_parent_old->estimate.rem_entry(old_dir);
43297+
43298+ /* reiser4_del_nlink(p_child_old) */
43299+ res1 += p_child_old->estimate.update(old_name->d_inode);
43300+
43301+ /* replace_name */
43302+ {
43303+ /* reiser4_add_nlink(p_parent_dir_new) */
43304+ res1 += inode_file_plugin(new_dir)->estimate.update(new_dir);
43305+ /* update_key */
43306+ res1 += 1;
43307+ /* reiser4_del_nlink(p_parent_new) */
43308+ res1 += inode_file_plugin(new_dir)->estimate.update(new_dir);
43309+ /* reiser4_del_nlink(p_parent_old) */
43310+ res1 += inode_file_plugin(old_dir)->estimate.update(old_dir);
43311+ }
43312+
43313+ /* reiser4_write_sd(p_parent_old) */
43314+ res1 += inode_file_plugin(old_dir)->estimate.update(old_dir);
43315+
43316+ /* reiser4_write_sd(p_child_old) */
43317+ res1 += p_child_old->estimate.update(old_name->d_inode);
43318+
43319+ return res1;
43320+}
43321+
43322+static int hashed_rename_estimate_and_grab(struct inode *old_dir, /* directory where @old is located */
43323+ struct dentry *old_name, /* old name */
43324+ struct inode *new_dir, /* directory where @new is located */
43325+ struct dentry *new_name
43326+ /* new name */ )
43327+{
43328+ reiser4_block_nr reserve;
43329+
43330+ reserve = estimate_rename(old_dir, old_name, new_dir, new_name);
43331+
43332+ if (reiser4_grab_space(reserve, BA_CAN_COMMIT))
43333+ return RETERR(-ENOSPC);
43334+
43335+ return 0;
43336+}
43337+
43338+/* check whether @old_inode and @new_inode can be moved within file system
43339+ * tree. This singles out attempts to rename pseudo-files, for example. */
43340+static int can_rename(struct inode *old_dir, struct inode *old_inode,
43341+ struct inode *new_dir, struct inode *new_inode)
43342+{
43343+ file_plugin *fplug;
43344+ dir_plugin *dplug;
43345+
43346+ assert("nikita-3370", old_inode != NULL);
43347+
43348+ dplug = inode_dir_plugin(new_dir);
43349+ fplug = inode_file_plugin(old_inode);
43350+
43351+ if (dplug == NULL)
43352+ return RETERR(-ENOTDIR);
43353+ else if (new_dir->i_op->create == NULL)
43354+ return RETERR(-EPERM);
43355+ else if (!fplug->can_add_link(old_inode))
43356+ return RETERR(-EMLINK);
43357+ else if (new_inode != NULL) {
43358+ fplug = inode_file_plugin(new_inode);
43359+ if (fplug->can_rem_link != NULL &&
43360+ !fplug->can_rem_link(new_inode))
43361+ return RETERR(-EBUSY);
43362+ }
43363+ return 0;
43364+}
43365+
43366+int find_entry(struct inode *, struct dentry *, lock_handle *,
43367+ znode_lock_mode, reiser4_dir_entry_desc *);
43368+int reiser4_update_dir(struct inode *);
43369+
43370+/* This is the common implementation of the VFS rename method of struct
43371+   inode_operations.
43372+   See comments in the body.
43373+
43374+   It is arguable that this function can be made generic so that it
43375+   will be applicable to any kind of directory plugin that deals with
43376+   directories composed of directory entries. The only obstacle
43377+   here is that we don't have any data-type to represent a directory
43378+   entry. This should be re-considered when more than one
43379+   directory plugin is implemented.
43380+*/
43381+int rename_common(struct inode *old_dir /* directory where @old is located */ ,
43382+ struct dentry *old_name /* old name */ ,
43383+ struct inode *new_dir /* directory where @new is located */ ,
43384+ struct dentry *new_name /* new name */ )
43385+{
43386+ /* From `The Open Group Base Specifications Issue 6'
43387+
43388+ If either the old or new argument names a symbolic link, rename()
43389+ shall operate on the symbolic link itself, and shall not resolve
43390+ the last component of the argument. If the old argument and the new
43391+ argument resolve to the same existing file, rename() shall return
43392+ successfully and perform no other action.
43393+
43394+ [this is done by VFS: vfs_rename()]
43395+
43396+ If the old argument points to the pathname of a file that is not a
43397+ directory, the new argument shall not point to the pathname of a
43398+ directory.
43399+
43400+ [checked by VFS: vfs_rename->may_delete()]
43401+
43402+ If the link named by the new argument exists, it shall
43403+ be removed and old renamed to new. In this case, a link named new
43404+ shall remain visible to other processes throughout the renaming
43405+ operation and refer either to the file referred to by new or old
43406+ before the operation began.
43407+
43408+ [we should assure this]
43409+
43410+ Write access permission is required for
43411+ both the directory containing old and the directory containing new.
43412+
43413+ [checked by VFS: vfs_rename->may_delete(), may_create()]
43414+
43415+ If the old argument points to the pathname of a directory, the new
43416+ argument shall not point to the pathname of a file that is not a
43417+ directory.
43418+
43419+ [checked by VFS: vfs_rename->may_delete()]
43420+
43421+ If the directory named by the new argument exists, it
43422+ shall be removed and old renamed to new. In this case, a link named
43423+ new shall exist throughout the renaming operation and shall refer
43424+ either to the directory referred to by new or old before the
43425+ operation began.
43426+
43427+ [we should assure this]
43428+
43429+ If new names an existing directory, it shall be
43430+ required to be an empty directory.
43431+
43432+ [we should check this]
43433+
43434+ If the old argument points to a pathname of a symbolic link, the
43435+ symbolic link shall be renamed. If the new argument points to a
43436+ pathname of a symbolic link, the symbolic link shall be removed.
43437+
43438+ The new pathname shall not contain a path prefix that names
43439+ old. Write access permission is required for the directory
43440+ containing old and the directory containing new. If the old
43441+ argument points to the pathname of a directory, write access
43442+ permission may be required for the directory named by old, and, if
43443+ it exists, the directory named by new.
43444+
43445+ [checked by VFS: vfs_rename(), vfs_rename_dir()]
43446+
43447+ If the link named by the new argument exists and the file's link
43448+ count becomes 0 when it is removed and no process has the file
43449+ open, the space occupied by the file shall be freed and the file
43450+ shall no longer be accessible. If one or more processes have the
43451+ file open when the last link is removed, the link shall be removed
43452+ before rename() returns, but the removal of the file contents shall
43453+ be postponed until all references to the file are closed.
43454+
43455+ [iput() handles this, but we can do this manually, a la
43456+ reiser4_unlink()]
43457+
43458+ Upon successful completion, rename() shall mark for update the
43459+ st_ctime and st_mtime fields of the parent directory of each file.
43460+
43461+ [N/A]
43462+
43463+ */
43464+ reiser4_context *ctx;
43465+ int result;
43466+ int is_dir; /* is @old_name directory */
43467+
43468+ struct inode *old_inode;
43469+ struct inode *new_inode;
43470+ coord_t *new_coord;
43471+
43472+ reiser4_dentry_fsdata *new_fsdata;
43473+ dir_plugin *dplug;
43474+ file_plugin *fplug;
43475+
43476+ reiser4_dir_entry_desc *old_entry, *new_entry, *dotdot_entry;
43477+ lock_handle *new_lh, *dotdot_lh;
43478+ struct dentry *dotdot_name;
43479+ reiser4_dentry_fsdata *dataonstack;
43480+
43481+ ctx = init_context(old_dir->i_sb);
43482+ if (IS_ERR(ctx))
43483+ return PTR_ERR(ctx);
43484+
43485+ old_entry = kmalloc(3 * sizeof(*old_entry) + 2 * sizeof(*new_lh) +
43486+ sizeof(*dotdot_name) + sizeof(*dataonstack),
43487+ GFP_KERNEL);
43488+ if (old_entry == NULL) {
43489+ context_set_commit_async(ctx);
43490+ reiser4_exit_context(ctx);
43491+ return RETERR(-ENOMEM);
43492+ }
43493+ memset(old_entry, 0, 3 * sizeof(*old_entry) + 2 * sizeof(*new_lh) +
43494+ sizeof(*dotdot_name) + sizeof(*dataonstack));
43495+
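+	/* carve the single allocation into three entry descriptors, two
+	 * lock handles, a dentry and its fsdata */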
43496+ new_entry = old_entry + 1;
43497+ dotdot_entry = old_entry + 2;
43498+ new_lh = (lock_handle *)(old_entry + 3);
43499+ dotdot_lh = new_lh + 1;
43500+ dotdot_name = (struct dentry *)(new_lh + 2);
43501+ dataonstack = (reiser4_dentry_fsdata *)(dotdot_name + 1);
43502+
43503+ assert("nikita-2318", old_dir != NULL);
43504+ assert("nikita-2319", new_dir != NULL);
43505+ assert("nikita-2320", old_name != NULL);
43506+ assert("nikita-2321", new_name != NULL);
43507+
43508+ old_inode = old_name->d_inode;
43509+ new_inode = new_name->d_inode;
43510+
43511+ dplug = inode_dir_plugin(old_dir);
43512+ fplug = NULL;
43513+
43514+ new_fsdata = reiser4_get_dentry_fsdata(new_name);
43515+ if (IS_ERR(new_fsdata)) {
43516+ kfree(old_entry);
43517+ context_set_commit_async(ctx);
43518+ reiser4_exit_context(ctx);
43519+ return PTR_ERR(new_fsdata);
43520+ }
43521+
43522+ new_coord = &new_fsdata->dec.entry_coord;
43523+ coord_clear_iplug(new_coord);
43524+
43525+ is_dir = S_ISDIR(old_inode->i_mode);
43526+
43527+ assert("nikita-3461", old_inode->i_nlink >= 1 + !!is_dir);
43528+
43529+	/* if the target is an existing directory and it is not empty---return
43530+	   an error.
43531+	   This check is done separately, because is_dir_empty() requires a
43532+	   tree traversal and has to be done before locks are taken.
43533+ */
43534+ if (is_dir && new_inode != NULL && is_dir_empty(new_inode) != 0) {
43535+ kfree(old_entry);
43536+ context_set_commit_async(ctx);
43537+ reiser4_exit_context(ctx);
43538+ return RETERR(-ENOTEMPTY);
43539+ }
43540+
43541+ result = can_rename(old_dir, old_inode, new_dir, new_inode);
43542+ if (result != 0) {
43543+ kfree(old_entry);
43544+ context_set_commit_async(ctx);
43545+ reiser4_exit_context(ctx);
43546+ return result;
43547+ }
43548+
43549+ result = hashed_rename_estimate_and_grab(old_dir, old_name,
43550+ new_dir, new_name);
43551+ if (result != 0) {
43552+ kfree(old_entry);
43553+ context_set_commit_async(ctx);
43554+ reiser4_exit_context(ctx);
43555+ return result;
43556+ }
43557+
43558+ init_lh(new_lh);
43559+
43560+ /* find entry for @new_name */
43561+ result = find_entry(new_dir,
43562+ new_name, new_lh, ZNODE_WRITE_LOCK, new_entry);
43563+
43564+ if (IS_CBKERR(result)) {
43565+ done_lh(new_lh);
43566+ kfree(old_entry);
43567+ context_set_commit_async(ctx);
43568+ reiser4_exit_context(ctx);
43569+ return result;
43570+ }
43571+
43572+ seal_done(&new_fsdata->dec.entry_seal);
43573+
43574+ /* add or replace name for @old_inode as @new_name */
43575+ if (new_inode != NULL) {
43576+ /* target (@new_name) exists. */
43577+ /* Not clear what to do with objects that are
43578+ both directories and files at the same time. */
43579+ if (result == CBK_COORD_FOUND) {
43580+ result = replace_name(old_inode,
43581+ new_dir,
43582+ new_inode, new_coord, new_lh);
43583+ if (result == 0)
43584+ fplug = inode_file_plugin(new_inode);
43585+ } else if (result == CBK_COORD_NOTFOUND) {
43586+ /* VFS told us that @new_name is bound to existing
43587+ inode, but we failed to find directory entry. */
43588+ warning("nikita-2324", "Target not found");
43589+ result = RETERR(-ENOENT);
43590+ }
43591+ } else {
43592+		/* target (@new_name) doesn't exist. */
43593+ if (result == CBK_COORD_NOTFOUND)
43594+ result = add_name(old_inode,
43595+ new_dir,
43596+ new_name, new_coord, new_lh, is_dir);
43597+ else if (result == CBK_COORD_FOUND) {
43598+ /* VFS told us that @new_name is "negative" dentry,
43599+ but we found directory entry. */
43600+ warning("nikita-2331", "Target found unexpectedly");
43601+ result = RETERR(-EIO);
43602+ }
43603+ }
43604+
43605+ assert("nikita-3462", ergo(result == 0,
43606+ old_inode->i_nlink >= 2 + !!is_dir));
43607+
43608+	/* We are done with all modifications to @new_dir; release the lock
43609+	   on the node. */
43610+ done_lh(new_lh);
43611+
43612+ if (fplug != NULL) {
43613+ /* detach @new_inode from name-space */
43614+ result = fplug->detach(new_inode, new_dir);
43615+ if (result != 0)
43616+ warning("nikita-2330", "Cannot detach %lli: %i. %s",
43617+ (unsigned long long)get_inode_oid(new_inode),
43618+ result, possible_leak);
43619+ }
43620+
43621+ if (new_inode != NULL)
43622+ reiser4_update_sd(new_inode);
43623+
43624+ if (result == 0) {
43625+ old_entry->obj = old_inode;
43626+
43627+ dplug->build_entry_key(old_dir,
43628+ &old_name->d_name, &old_entry->key);
43629+
43630+		/* At this stage a new name has been introduced for
43631+		   @old_inode. The @old_inode, @new_dir, and @new_inode i_nlink
43632+		   counters have been updated.
43633+
43634+		   We want to remove @old_name now. If @old_inode wasn't a
43635+		   directory, this is simple.
43636+ */
43637+ result = dplug->rem_entry(old_dir, old_name, old_entry);
43638+ if (result != 0 && result != -ENOMEM) {
43639+ warning("nikita-2335",
43640+ "Cannot remove old name: %i", result);
43641+ } else {
43642+ result = reiser4_del_nlink(old_inode, old_dir, 0);
43643+ if (result != 0 && result != -ENOMEM) {
43644+ warning("nikita-2337",
43645+ "Cannot drop link on old: %i", result);
43646+ }
43647+ }
43648+
43649+ if (result == 0 && is_dir) {
43650+ /* @old_inode is directory. We also have to update
43651+ dotdot entry. */
43652+ coord_t *dotdot_coord;
43653+
43654+			memset(dataonstack, 0, sizeof *dataonstack);
43655+			memset(dotdot_entry, 0, sizeof *dotdot_entry);
43656+			dotdot_entry->obj = old_dir;
43657+			memset(dotdot_name, 0, sizeof *dotdot_name);
43658+ dotdot_name->d_name.name = "..";
43659+ dotdot_name->d_name.len = 2;
43660+ /*
43661+ * allocate ->d_fsdata on the stack to avoid using
43662+ * reiser4_get_dentry_fsdata(). Locking is not needed,
43663+ * because dentry is private to the current thread.
43664+ */
43665+ dotdot_name->d_fsdata = dataonstack;
43666+ init_lh(dotdot_lh);
43667+
43668+ dotdot_coord = &dataonstack->dec.entry_coord;
43669+ coord_clear_iplug(dotdot_coord);
43670+
43671+ result = find_entry(old_inode, dotdot_name, dotdot_lh,
43672+ ZNODE_WRITE_LOCK, dotdot_entry);
43673+ if (result == 0) {
43674+ /* replace_name() decreases i_nlink on
43675+ * @old_dir */
43676+ result = replace_name(new_dir,
43677+ old_inode,
43678+ old_dir,
43679+ dotdot_coord, dotdot_lh);
43680+ } else
43681+ result = RETERR(-EIO);
43682+ done_lh(dotdot_lh);
43683+ }
43684+ }
43685+ reiser4_update_dir(new_dir);
43686+ reiser4_update_dir(old_dir);
43687+ reiser4_update_sd(old_inode);
43688+ if (result == 0) {
43689+ file_plugin *fplug;
43690+
43691+ if (new_inode != NULL) {
43692+ /* add safe-link for target file (in case we removed
43693+			 * the last reference to the poor fellow) */
43694+ fplug = inode_file_plugin(new_inode);
43695+ if (new_inode->i_nlink == 0)
43696+ result = safe_link_add(new_inode, SAFE_UNLINK);
43697+ }
43698+ }
43699+ kfree(old_entry);
43700+ context_set_commit_async(ctx);
43701+ reiser4_exit_context(ctx);
43702+ return result;
43703+}
43704+
43705+#if 0
43706+int rename_common(struct inode *old_dir /* directory where @old is located */ ,
43707+ struct dentry *old_name /* old name */ ,
43708+ struct inode *new_dir /* directory where @new is located */ ,
43709+ struct dentry *new_name /* new name */ )
43710+{
43711+ /* From `The Open Group Base Specifications Issue 6'
43712+
43713+ If either the old or new argument names a symbolic link, rename()
43714+ shall operate on the symbolic link itself, and shall not resolve
43715+ the last component of the argument. If the old argument and the new
43716+ argument resolve to the same existing file, rename() shall return
43717+ successfully and perform no other action.
43718+
43719+ [this is done by VFS: vfs_rename()]
43720+
43721+ If the old argument points to the pathname of a file that is not a
43722+ directory, the new argument shall not point to the pathname of a
43723+ directory.
43724+
43725+ [checked by VFS: vfs_rename->may_delete()]
43726+
43727+ If the link named by the new argument exists, it shall
43728+ be removed and old renamed to new. In this case, a link named new
43729+ shall remain visible to other processes throughout the renaming
43730+ operation and refer either to the file referred to by new or old
43731+ before the operation began.
43732+
43733+ [we should assure this]
43734+
43735+ Write access permission is required for
43736+ both the directory containing old and the directory containing new.
43737+
43738+ [checked by VFS: vfs_rename->may_delete(), may_create()]
43739+
43740+ If the old argument points to the pathname of a directory, the new
43741+ argument shall not point to the pathname of a file that is not a
43742+ directory.
43743+
43744+ [checked by VFS: vfs_rename->may_delete()]
43745+
43746+ If the directory named by the new argument exists, it
43747+ shall be removed and old renamed to new. In this case, a link named
43748+ new shall exist throughout the renaming operation and shall refer
43749+ either to the directory referred to by new or old before the
43750+ operation began.
43751+
43752+ [we should assure this]
43753+
43754+ If new names an existing directory, it shall be
43755+ required to be an empty directory.
43756+
43757+ [we should check this]
43758+
43759+ If the old argument points to a pathname of a symbolic link, the
43760+ symbolic link shall be renamed. If the new argument points to a
43761+ pathname of a symbolic link, the symbolic link shall be removed.
43762+
43763+ The new pathname shall not contain a path prefix that names
43764+ old. Write access permission is required for the directory
43765+ containing old and the directory containing new. If the old
43766+ argument points to the pathname of a directory, write access
43767+ permission may be required for the directory named by old, and, if
43768+ it exists, the directory named by new.
43769+
43770+ [checked by VFS: vfs_rename(), vfs_rename_dir()]
43771+
43772+ If the link named by the new argument exists and the file's link
43773+ count becomes 0 when it is removed and no process has the file
43774+ open, the space occupied by the file shall be freed and the file
43775+ shall no longer be accessible. If one or more processes have the
43776+ file open when the last link is removed, the link shall be removed
43777+ before rename() returns, but the removal of the file contents shall
43778+ be postponed until all references to the file are closed.
43779+
43780+ [iput() handles this, but we can do this manually, a la
43781+ reiser4_unlink()]
43782+
43783+ Upon successful completion, rename() shall mark for update the
43784+ st_ctime and st_mtime fields of the parent directory of each file.
43785+
43786+ [N/A]
43787+
43788+ */
43789+ reiser4_context *ctx;
43790+ int result;
43791+ int is_dir; /* is @old_name directory */
43792+ struct inode *old_inode;
43793+ struct inode *new_inode;
43794+ reiser4_dir_entry_desc old_entry;
43795+ reiser4_dir_entry_desc new_entry;
43796+ coord_t *new_coord;
43797+ reiser4_dentry_fsdata *new_fsdata;
43798+ lock_handle new_lh;
43799+ dir_plugin *dplug;
43800+ file_plugin *fplug;
43801+
43802+ ctx = init_context(old_dir->i_sb);
43803+ if (IS_ERR(ctx))
43804+ return PTR_ERR(ctx);
43805+
43806+ assert("nikita-2318", old_dir != NULL);
43807+ assert("nikita-2319", new_dir != NULL);
43808+ assert("nikita-2320", old_name != NULL);
43809+ assert("nikita-2321", new_name != NULL);
43810+
43811+ old_inode = old_name->d_inode;
43812+ new_inode = new_name->d_inode;
43813+
43814+ dplug = inode_dir_plugin(old_dir);
43815+ fplug = NULL;
43816+
43817+ new_fsdata = reiser4_get_dentry_fsdata(new_name);
43818+ if (IS_ERR(new_fsdata)) {
43819+ result = PTR_ERR(new_fsdata);
43820+ goto exit;
43821+ }
43822+
43823+ new_coord = &new_fsdata->dec.entry_coord;
43824+ coord_clear_iplug(new_coord);
43825+
43826+ is_dir = S_ISDIR(old_inode->i_mode);
43827+
43828+ assert("nikita-3461", old_inode->i_nlink >= 1 + !!is_dir);
43829+
43830+	/* if the target is an existing directory and it is not empty---return
43831+	   an error.
43832+	   This check is done separately, because is_dir_empty() requires a
43833+	   tree traversal and has to be done before locks are taken.
43834+ */
43835+ if (is_dir && new_inode != NULL && is_dir_empty(new_inode) != 0)
43836+ return RETERR(-ENOTEMPTY);
43837+
43838+ result = can_rename(old_dir, old_inode, new_dir, new_inode);
43839+ if (result != 0)
43840+ goto exit;
43841+
43842+ result = hashed_rename_estimate_and_grab(old_dir, old_name,
43843+ new_dir, new_name);
43844+ if (result != 0)
43845+ goto exit;
43846+
43847+ init_lh(&new_lh);
43848+
43849+ /* find entry for @new_name */
43850+ result = find_entry(new_dir,
43851+ new_name, &new_lh, ZNODE_WRITE_LOCK, &new_entry);
43852+
43853+ if (IS_CBKERR(result)) {
43854+ done_lh(&new_lh);
43855+ goto exit;
43856+ }
43857+
43858+ seal_done(&new_fsdata->dec.entry_seal);
43859+
43860+ /* add or replace name for @old_inode as @new_name */
43861+ if (new_inode != NULL) {
43862+ /* target (@new_name) exists. */
43863+ /* Not clear what to do with objects that are
43864+ both directories and files at the same time. */
43865+ if (result == CBK_COORD_FOUND) {
43866+ result = replace_name(old_inode,
43867+ new_dir,
43868+ new_inode, new_coord, &new_lh);
43869+ if (result == 0)
43870+ fplug = inode_file_plugin(new_inode);
43871+ } else if (result == CBK_COORD_NOTFOUND) {
43872+ /* VFS told us that @new_name is bound to existing
43873+ inode, but we failed to find directory entry. */
43874+ warning("nikita-2324", "Target not found");
43875+ result = RETERR(-ENOENT);
43876+ }
43877+ } else {
43878+		/* target (@new_name) doesn't exist. */
43879+ if (result == CBK_COORD_NOTFOUND)
43880+ result = add_name(old_inode,
43881+ new_dir,
43882+ new_name, new_coord, &new_lh, is_dir);
43883+ else if (result == CBK_COORD_FOUND) {
43884+ /* VFS told us that @new_name is "negative" dentry,
43885+ but we found directory entry. */
43886+ warning("nikita-2331", "Target found unexpectedly");
43887+ result = RETERR(-EIO);
43888+ }
43889+ }
43890+
43891+ assert("nikita-3462", ergo(result == 0,
43892+ old_inode->i_nlink >= 2 + !!is_dir));
43893+
43894+	/* We are done with all modifications to @new_dir; release the lock
43895+	   on the node. */
43896+ done_lh(&new_lh);
43897+
43898+ if (fplug != NULL) {
43899+ /* detach @new_inode from name-space */
43900+ result = fplug->detach(new_inode, new_dir);
43901+ if (result != 0)
43902+ warning("nikita-2330", "Cannot detach %lli: %i. %s",
43903+ (unsigned long long)get_inode_oid(new_inode),
43904+ result, possible_leak);
43905+ }
43906+
43907+ if (new_inode != NULL)
43908+ reiser4_update_sd(new_inode);
43909+
43910+ if (result == 0) {
43911+ memset(&old_entry, 0, sizeof old_entry);
43912+ old_entry.obj = old_inode;
43913+
43914+ dplug->build_entry_key(old_dir,
43915+ &old_name->d_name, &old_entry.key);
43916+
43917+		/* At this stage a new name has been introduced for
43918+		   @old_inode. The @old_inode, @new_dir, and @new_inode i_nlink
43919+		   counters have been updated.
43920+
43921+		   We want to remove @old_name now. If @old_inode wasn't a
43922+		   directory, this is simple.
43923+ */
43924+ result = dplug->rem_entry(old_dir, old_name, &old_entry);
43925+ /*result = rem_entry_hashed(old_dir, old_name, &old_entry); */
43926+ if (result != 0 && result != -ENOMEM) {
43927+ warning("nikita-2335",
43928+ "Cannot remove old name: %i", result);
43929+ } else {
43930+ result = reiser4_del_nlink(old_inode, old_dir, 0);
43931+ if (result != 0 && result != -ENOMEM) {
43932+ warning("nikita-2337",
43933+ "Cannot drop link on old: %i", result);
43934+ }
43935+ }
43936+
43937+ if (result == 0 && is_dir) {
43938+ /* @old_inode is directory. We also have to update
43939+ dotdot entry. */
43940+ coord_t *dotdot_coord;
43941+ lock_handle dotdot_lh;
43942+ struct dentry dotdot_name;
43943+ reiser4_dir_entry_desc dotdot_entry;
43944+ reiser4_dentry_fsdata dataonstack;
43945+ reiser4_dentry_fsdata *fsdata;
43946+
43947+ memset(&dataonstack, 0, sizeof dataonstack);
43948+ memset(&dotdot_entry, 0, sizeof dotdot_entry);
43949+ dotdot_entry.obj = old_dir;
43950+ memset(&dotdot_name, 0, sizeof dotdot_name);
43951+ dotdot_name.d_name.name = "..";
43952+ dotdot_name.d_name.len = 2;
43953+ /*
43954+ * allocate ->d_fsdata on the stack to avoid using
43955+ * reiser4_get_dentry_fsdata(). Locking is not needed,
43956+ * because dentry is private to the current thread.
43957+ */
43958+ dotdot_name.d_fsdata = &dataonstack;
43959+ init_lh(&dotdot_lh);
43960+
43961+ fsdata = &dataonstack;
43962+ dotdot_coord = &fsdata->dec.entry_coord;
43963+ coord_clear_iplug(dotdot_coord);
43964+
43965+ result = find_entry(old_inode, &dotdot_name, &dotdot_lh,
43966+ ZNODE_WRITE_LOCK, &dotdot_entry);
43967+ if (result == 0) {
43968+ /* replace_name() decreases i_nlink on
43969+ * @old_dir */
43970+ result = replace_name(new_dir,
43971+ old_inode,
43972+ old_dir,
43973+ dotdot_coord, &dotdot_lh);
43974+ } else
43975+ result = RETERR(-EIO);
43976+ done_lh(&dotdot_lh);
43977+ }
43978+ }
43979+ reiser4_update_dir(new_dir);
43980+ reiser4_update_dir(old_dir);
43981+ reiser4_update_sd(old_inode);
43982+ if (result == 0) {
43983+ file_plugin *fplug;
43984+
43985+ if (new_inode != NULL) {
43986+ /* add safe-link for target file (in case we removed
43987+			 * the last reference to the poor fellow) */
43988+ fplug = inode_file_plugin(new_inode);
43989+ if (new_inode->i_nlink == 0)
43990+ result = safe_link_add(new_inode, SAFE_UNLINK);
43991+ }
43992+ }
43993+ exit:
43994+ context_set_commit_async(ctx);
43995+ reiser4_exit_context(ctx);
43996+ return result;
43997+}
43998+#endif
43999Index: linux-2.6.16/fs/reiser4/plugin/item/Makefile
44000===================================================================
44001--- /dev/null
44002+++ linux-2.6.16/fs/reiser4/plugin/item/Makefile
44003@@ -0,0 +1,18 @@
44004+obj-$(CONFIG_REISER4_FS) += item_plugins.o
44005+
44006+item_plugins-objs := \
44007+ item.o \
44008+ static_stat.o \
44009+ sde.o \
44010+ cde.o \
44011+ blackbox.o \
44012+ internal.o \
44013+ tail.o \
44014+ ctail.o \
44015+ extent.o \
44016+ extent_item_ops.o \
44017+ extent_file_ops.o \
44018+ extent_flush_ops.o
44019+
44020+
44021+
44022Index: linux-2.6.16/fs/reiser4/plugin/item/acl.h
44023===================================================================
44024--- /dev/null
44025+++ linux-2.6.16/fs/reiser4/plugin/item/acl.h
44026@@ -0,0 +1,66 @@
44027+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
44028+
44029+/* Directory entry. */
44030+
44031+#if !defined( __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ )
44032+#define __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__
44033+
44034+#include "../../forward.h"
44035+#include "../../dformat.h"
44036+#include "../../kassign.h"
44037+#include "../../key.h"
44038+
44039+#include <linux/fs.h>
44040+#include <linux/dcache.h> /* for struct dentry */
44041+
44042+typedef struct directory_entry_format {
44043+	/* key of object stat-data. It's not necessary to store the whole
44044+	   key here, because it's always a key of stat-data, so the minor
44045+	   packing locality and offset can be omitted here. But this
44046+	   relies on a particular key allocation scheme for stat-data, so,
44047+	   for extensibility's sake, the whole key can be stored here.
44048+
44049+	   We store the key as an array of bytes, because we don't want 8-byte
44050+	   alignment of dir entries.
44051+ */
44052+ obj_key_id id;
44053+ /* file name. Null terminated string. */
44054+ d8 name[0];
44055+} directory_entry_format;
44056+
44057+void print_de(const char *prefix, coord_t * coord);
44058+int extract_key_de(const coord_t * coord, reiser4_key * key);
44059+int update_key_de(const coord_t * coord, const reiser4_key * key,
44060+ lock_handle * lh);
44061+char *extract_name_de(const coord_t * coord, char *buf);
44062+unsigned extract_file_type_de(const coord_t * coord);
44063+int add_entry_de(struct inode *dir, coord_t * coord,
44064+ lock_handle * lh, const struct dentry *name,
44065+ reiser4_dir_entry_desc * entry);
44066+int rem_entry_de(struct inode *dir, const struct qstr *name, coord_t * coord,
44067+ lock_handle * lh, reiser4_dir_entry_desc * entry);
44068+int max_name_len_de(const struct inode *dir);
44069+
44070+int de_rem_and_shrink(struct inode *dir, coord_t * coord, int length);
44071+
44072+char *extract_dent_name(const coord_t * coord,
44073+ directory_entry_format * dent, char *buf);
44074+
44075+#if REISER4_LARGE_KEY
44076+#define DE_NAME_BUF_LEN (24)
44077+#else
44078+#define DE_NAME_BUF_LEN (16)
44079+#endif
44080+
44081+/* __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ */
44082+#endif
44083+
44084+/* Make Linus happy.
44085+ Local variables:
44086+ c-indentation-style: "K&R"
44087+ mode-name: "LC"
44088+ c-basic-offset: 8
44089+ tab-width: 8
44090+ fill-column: 120
44091+ End:
44092+*/
44093Index: linux-2.6.16/fs/reiser4/plugin/item/blackbox.c
44094===================================================================
44095--- /dev/null
44096+++ linux-2.6.16/fs/reiser4/plugin/item/blackbox.c
44097@@ -0,0 +1,142 @@
44098+/* Copyright 2003 by Hans Reiser, licensing governed by
44099+ * reiser4/README */
44100+
44101+/* Black box item implementation */
44102+
44103+#include "../../forward.h"
44104+#include "../../debug.h"
44105+#include "../../dformat.h"
44106+#include "../../kassign.h"
44107+#include "../../coord.h"
44108+#include "../../tree.h"
44109+#include "../../lock.h"
44110+
44111+#include "blackbox.h"
44112+#include "item.h"
44113+#include "../plugin.h"
44114+
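+/* insert a black box item with the given key and opaque payload at the
+ * leaf level */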
44115+int
44116+store_black_box(reiser4_tree * tree,
44117+ const reiser4_key * key, void *data, int length)
44118+{
44119+ int result;
44120+ reiser4_item_data idata;
44121+ coord_t coord;
44122+ lock_handle lh;
44123+
44124+ memset(&idata, 0, sizeof idata);
44125+
44126+ idata.data = data;
44127+ idata.user = 0;
44128+ idata.length = length;
44129+ idata.iplug = item_plugin_by_id(BLACK_BOX_ID);
44130+
44131+ init_lh(&lh);
44132+ result = insert_by_key(tree, key,
44133+ &idata, &coord, &lh, LEAF_LEVEL, CBK_UNIQUE);
44134+
44135+ assert("nikita-3413",
44136+ ergo(result == 0,
44137+ WITH_COORD(&coord,
44138+ item_length_by_coord(&coord) == length)));
44139+
44140+ done_lh(&lh);
44141+ return result;
44142+}
44143+
44144+int
44145+load_black_box(reiser4_tree * tree,
44146+ reiser4_key * key, void *data, int length, int exact)
44147+{
44148+ int result;
44149+ coord_t coord;
44150+ lock_handle lh;
44151+
44152+ init_lh(&lh);
44153+ result = coord_by_key(tree, key,
44154+ &coord, &lh, ZNODE_READ_LOCK,
44155+ exact ? FIND_EXACT : FIND_MAX_NOT_MORE_THAN,
44156+ LEAF_LEVEL, LEAF_LEVEL, CBK_UNIQUE, NULL);
44157+
44158+ if (result == 0) {
44159+ int ilen;
44160+
44161+ result = zload(coord.node);
44162+ if (result == 0) {
44163+ ilen = item_length_by_coord(&coord);
44164+ if (ilen <= length) {
44165+ memcpy(data, item_body_by_coord(&coord), ilen);
44166+ unit_key_by_coord(&coord, key);
44167+ } else if (exact) {
44168+ /*
44169+ * item is larger than buffer provided by the
44170+ * user. Only issue a warning if @exact is
44171+ * set. If @exact is false, we are iterating
44172+ * over all safe-links and here we are reaching
44173+ * the end of the iteration.
44174+ */
44175+ warning("nikita-3415",
44176+ "Wrong black box length: %i > %i",
44177+ ilen, length);
44178+ result = RETERR(-EIO);
44179+ }
44180+ zrelse(coord.node);
44181+ }
44182+ }
44183+
44184+ done_lh(&lh);
44185+ return result;
44186+
44187+}
44188+
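+/* overwrite the body of an existing black box item in place; the new data
+ * must fit within the stored item's length */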
44189+int
44190+update_black_box(reiser4_tree * tree,
44191+ const reiser4_key * key, void *data, int length)
44192+{
44193+ int result;
44194+ coord_t coord;
44195+ lock_handle lh;
44196+
44197+ init_lh(&lh);
44198+ result = coord_by_key(tree, key,
44199+ &coord, &lh, ZNODE_READ_LOCK,
44200+ FIND_EXACT,
44201+ LEAF_LEVEL, LEAF_LEVEL, CBK_UNIQUE, NULL);
44202+ if (result == 0) {
44203+ int ilen;
44204+
44205+ result = zload(coord.node);
44206+ if (result == 0) {
44207+ ilen = item_length_by_coord(&coord);
44208+ if (length <= ilen) {
44209+ memcpy(item_body_by_coord(&coord), data,
44210+ length);
44211+ } else {
44212+ warning("nikita-3437",
44213+ "Wrong black box length: %i < %i",
44214+ ilen, length);
44215+ result = RETERR(-EIO);
44216+ }
44217+ zrelse(coord.node);
44218+ }
44219+ }
44220+
44221+ done_lh(&lh);
44222+ return result;
44223+
44224+}
44225+
44226+int kill_black_box(reiser4_tree * tree, const reiser4_key * key)
44227+{
44228+ return cut_tree(tree, key, key, NULL, 1);
44229+}
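
/* Editor's aside - not part of the original patch: a minimal sketch of the
   black box API round-trip, assuming @tree and @key were obtained elsewhere
   and that the caller runs inside a reiser4 context. */
static int blackbox_roundtrip(reiser4_tree *tree, reiser4_key *key)
{
	char payload[16] = "opaque";
	char readback[16];
	int ret;

	/* insert a fixed-width opaque record at @key */
	ret = store_black_box(tree, key, payload, sizeof payload);
	if (ret != 0)
		return ret;
	/* read it back; a non-zero @exact argument demands an exact key match */
	ret = load_black_box(tree, key, readback, sizeof readback, 1);
	if (ret != 0)
		return ret;
	/* overwrite the item body in place with data of the same length */
	ret = update_black_box(tree, key, payload, sizeof payload);
	if (ret != 0)
		return ret;
	/* and remove the whole item again */
	return kill_black_box(tree, key);
}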
44230+
44231+/* Make Linus happy.
44232+ Local variables:
44233+ c-indentation-style: "K&R"
44234+ mode-name: "LC"
44235+ c-basic-offset: 8
44236+ tab-width: 8
44237+ fill-column: 120
44238+ End:
44239+*/
44240Index: linux-2.6.16/fs/reiser4/plugin/item/blackbox.h
44241===================================================================
44242--- /dev/null
44243+++ linux-2.6.16/fs/reiser4/plugin/item/blackbox.h
44244@@ -0,0 +1,33 @@
44245+/* Copyright 2003 by Hans Reiser, licensing governed by
44246+ * reiser4/README */
44247+
44248+/* "Black box" entry to fixed-width contain user supplied data */
44249+
44250+#if !defined( __FS_REISER4_BLACK_BOX_H__ )
44251+#define __FS_REISER4_BLACK_BOX_H__
44252+
44253+#include "../../forward.h"
44254+#include "../../dformat.h"
44255+#include "../../kassign.h"
44256+#include "../../key.h"
44257+
44258+extern int store_black_box(reiser4_tree * tree,
44259+ const reiser4_key * key, void *data, int length);
44260+extern int load_black_box(reiser4_tree * tree,
44261+ reiser4_key * key, void *data, int length, int exact);
44262+extern int kill_black_box(reiser4_tree * tree, const reiser4_key * key);
44263+extern int update_black_box(reiser4_tree * tree,
44264+ const reiser4_key * key, void *data, int length);
44265+
44266+/* __FS_REISER4_BLACK_BOX_H__ */
44267+#endif
44268+
44269+/* Make Linus happy.
44270+ Local variables:
44271+ c-indentation-style: "K&R"
44272+ mode-name: "LC"
44273+ c-basic-offset: 8
44274+ tab-width: 8
44275+ fill-column: 120
44276+ End:
44277+*/
44278Index: linux-2.6.16/fs/reiser4/plugin/item/cde.c
44279===================================================================
44280--- /dev/null
44281+++ linux-2.6.16/fs/reiser4/plugin/item/cde.c
44282@@ -0,0 +1,1007 @@
44283+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
44284+
44285+/* Directory entry implementation */
44286+
44287+/* DESCRIPTION:
44288+
44289+ This is "compound" directory item plugin implementation. This directory
44290+ item type is compound (as opposed to the "simple directory item" in
44291+ fs/reiser4/plugin/item/sde.[ch]), because it consists of several directory
44292+ entries.
44293+
44294+ The reason behind this decision is disk space efficiency: all directory
44295+ entries inside the same directory have identical fragment in their
44296+ keys. This, of course, depends on key assignment policy. In our default key
44297+ assignment policy, all directory entries have the same locality which is
44298+ equal to the object id of their directory.
44299+
44300+ Composing directory item out of several directory entries for the same
44301+ directory allows us to store said key fragment only once. That is, this is
44302+ some ad hoc form of key compression (stem compression) that is implemented
44303+ here, because general key compression is not supposed to be implemented in
44304+ v4.0.
44305+
44306+ Another decision that was made regarding all directory item plugins is
44307+ that they store entry keys unaligned. This, again, is for the sake of
44308+ disk space efficiency.
44309+
44310+ It should be noted that storing keys unaligned increases CPU consumption,
44311+ at least on some architectures.
44312+
44313+ Internal on-disk structure of the compound directory item is the following:
44314+
44315+ HEADER cde_item_format. Here number of entries is stored.
44316+ ENTRY_HEADER_0 cde_unit_header. Here part of entry key and
44317+ ENTRY_HEADER_1 offset of entry body are stored.
44318+ ENTRY_HEADER_2 (basically two last parts of key)
44319+ ...
44320+ ENTRY_HEADER_N
44321+ ENTRY_BODY_0 directory_entry_format. Here part of stat data key and
44322+ ENTRY_BODY_1 NUL-terminated name are stored.
44323+ ENTRY_BODY_2                (part of stat data key in the
44324+                              sense that since all SDs have
44325+ zero offset, this offset is not
44326+ stored on disk).
44327+ ...
44328+ ENTRY_BODY_N
44329+
44330+ When it comes to balancing, each directory entry in a compound directory
44331+ item is a unit, that is, something that can be cut from one item and pasted
44332+ into another item of the same type. Handling of unit cut and paste is the
44333+ major reason for the complexity of the code below.
44334+
44335+*/
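
/* Editor's aside - not part of the original patch: given the structures
   declared in cde.h, the layout pictured above can be walked as in this
   minimal sketch; @base is assumed to point at the raw item body. */
static void walk_cde_layout(char *base)
{
	cde_item_format *fmt = (cde_item_format *)base;
	int nr = le16_to_cpu(get_unaligned(&fmt->num_of_entries));
	int i;

	for (i = 0; i < nr; ++i) {
		/* each header carries the key fragment (hash) and the
		   offset of the matching entry body from the item start */
		cde_unit_header *h = &fmt->entry[i];
		directory_entry_format *body = (directory_entry_format *)
		    (base + le16_to_cpu(get_unaligned(&h->offset)));
		/* body->id identifies the stat data; for long names the
		   NUL-terminated name follows it */
		(void)body;
	}
}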
44336+
44337+#include "../../forward.h"
44338+#include "../../debug.h"
44339+#include "../../dformat.h"
44340+#include "../../kassign.h"
44341+#include "../../key.h"
44342+#include "../../coord.h"
44343+#include "sde.h"
44344+#include "cde.h"
44345+#include "item.h"
44346+#include "../node/node.h"
44347+#include "../plugin.h"
44348+#include "../../znode.h"
44349+#include "../../carry.h"
44350+#include "../../tree.h"
44351+#include "../../inode.h"
44352+
44353+#include <linux/fs.h> /* for struct inode */
44354+#include <linux/dcache.h> /* for struct dentry */
44355+#include <linux/quotaops.h>
44356+
44357+#if 0
44358+#define CHECKME(coord) \
44359+({ \
44360+ const char *message; \
44361+ coord_t dup; \
44362+ \
44363+ coord_dup_nocheck(&dup, (coord)); \
44364+ dup.unit_pos = 0; \
44365+ assert("nikita-2871", cde_check(&dup, &message) == 0); \
44366+})
44367+#else
44368+#define CHECKME(coord) noop
44369+#endif
44370+
44371+/* return body of compound directory item at @coord */
44372+static inline cde_item_format *formatted_at(const coord_t * coord)
44373+{
44374+ assert("nikita-1282", coord != NULL);
44375+ return item_body_by_coord(coord);
44376+}
44377+
44378+/* return entry header at @coord */
44379+static inline cde_unit_header *header_at(const coord_t *
44380+ coord /* coord of item */ ,
44381+ int idx /* index of unit */ )
44382+{
44383+ assert("nikita-1283", coord != NULL);
44384+ return &formatted_at(coord)->entry[idx];
44385+}
44386+
44387+/* return number of units in compound directory item at @coord */
44388+static int units(const coord_t * coord /* coord of item */ )
44389+{
44390+ return le16_to_cpu(get_unaligned(&formatted_at(coord)->num_of_entries));
44391+}
44392+
44393+/* return offset of the body of @idx-th entry in @coord */
44394+static unsigned int offset_of(const coord_t * coord /* coord of item */ ,
44395+ int idx /* index of unit */ )
44396+{
44397+ if (idx < units(coord))
44398+ return le16_to_cpu(get_unaligned(&header_at(coord, idx)->offset));
44399+ else if (idx == units(coord))
44400+ return item_length_by_coord(coord);
44401+ else
44402+ impossible("nikita-1308", "Wrong idx");
44403+ return 0;
44404+}
44405+
44406+/* set offset of the body of @idx-th entry in @coord */
44407+static void set_offset(const coord_t * coord /* coord of item */ ,
44408+ int idx /* index of unit */ ,
44409+ unsigned int offset /* new offset */ )
44410+{
44411+ put_unaligned(cpu_to_le16((__u16) offset), &header_at(coord, idx)->offset);
44412+}
44413+
44414+static void adj_offset(const coord_t * coord /* coord of item */ ,
44415+ int idx /* index of unit */ ,
44416+ int delta /* offset change */ )
44417+{
44418+ d16 *doffset;
44419+ __u16 offset;
44420+
44421+ doffset = &header_at(coord, idx)->offset;
44422+ offset = le16_to_cpu(get_unaligned(doffset));
44423+ offset += delta;
44424+ put_unaligned(cpu_to_le16((__u16) offset), doffset);
44425+}
44426+
44427+/* return pointer to @offset-th byte from the beginning of @coord */
44428+static char *address(const coord_t * coord /* coord of item */ ,
44429+ int offset)
44430+{
44431+ return ((char *)item_body_by_coord(coord)) + offset;
44432+}
44433+
44434+/* return pointer to the body of @idx-th entry in @coord */
44435+static directory_entry_format *entry_at(const coord_t * coord /* coord of
44436+ * item */ ,
44437+ int idx /* index of unit */ )
44438+{
44439+ return (directory_entry_format *) address(coord,
44440+ (int)offset_of(coord, idx));
44441+}
44442+
44443+/* return number of unit referenced by @coord */
44444+static int idx_of(const coord_t * coord /* coord of item */ )
44445+{
44446+ assert("nikita-1285", coord != NULL);
44447+ return coord->unit_pos;
44448+}
44449+
44450+/* find position where entry with @entry_key would be inserted into @coord */
44451+static int find(const coord_t * coord /* coord of item */ ,
44452+ const reiser4_key * entry_key /* key to look for */ ,
44453+ cmp_t * last /* result of last comparison */ )
44454+{
44455+ int entries;
44456+
44457+ int left;
44458+ int right;
44459+
44460+ cde_unit_header *header;
44461+
44462+ assert("nikita-1295", coord != NULL);
44463+ assert("nikita-1296", entry_key != NULL);
44464+ assert("nikita-1297", last != NULL);
44465+
44466+ entries = units(coord);
44467+ left = 0;
44468+ right = entries - 1;
44469+ while (right - left >= REISER4_SEQ_SEARCH_BREAK) {
44470+ int median;
44471+
44472+ median = (left + right) >> 1;
44473+
44474+ header = header_at(coord, median);
44475+ *last = de_id_key_cmp(&header->hash, entry_key);
44476+ switch (*last) {
44477+ case LESS_THAN:
44478+ left = median;
44479+ break;
44480+ case GREATER_THAN:
44481+ right = median;
44482+ break;
44483+ case EQUAL_TO:{
44484+ do {
44485+ median--;
44486+ header--;
44487+ } while (median >= 0 &&
44488+ de_id_key_cmp(&header->hash,
44489+ entry_key) == EQUAL_TO);
44490+ return median + 1;
44491+ }
44492+ }
44493+ }
44494+ header = header_at(coord, left);
44495+ for (; left < entries; ++left, ++header) {
44496+ prefetch(header + 1);
44497+ *last = de_id_key_cmp(&header->hash, entry_key);
44498+ if (*last != LESS_THAN)
44499+ break;
44500+ }
44501+ if (left < entries)
44502+ return left;
44503+ else
44504+ return RETERR(-ENOENT);
44505+
44506+}
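
/* Editor's note - not part of the original patch: find() above is a hybrid
   search. Binary search narrows [left, right] until the window drops below
   REISER4_SEQ_SEARCH_BREAK, a prefetching linear scan then finishes the
   job, and on an exact hit the loop backs up so that the first of several
   equal hashes is returned. */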
44507+
44508+/* expand @coord so as to accommodate the insertion of @no new entries
44509+   starting from @pos, with total body size @size. */
44510+static int expand_item(const coord_t * coord /* coord of item */ ,
44511+ int pos /* unit position */ , int no /* number of new
44512+ * units*/ ,
44513+ int size /* total size of new units' data */ ,
44514+ unsigned int data_size /* free space already reserved
44515+ * in the item for insertion */ )
44516+{
44517+ int entries;
44518+ cde_unit_header *header;
44519+ char *dent;
44520+ int i;
44521+
44522+ assert("nikita-1310", coord != NULL);
44523+ assert("nikita-1311", pos >= 0);
44524+ assert("nikita-1312", no > 0);
44525+ assert("nikita-1313", data_size >= no * sizeof(directory_entry_format));
44526+ assert("nikita-1343",
44527+ item_length_by_coord(coord) >=
44528+ (int)(size + data_size + no * sizeof *header));
44529+
44530+ entries = units(coord);
44531+
44532+ if (pos == entries)
44533+ dent = address(coord, size);
44534+ else
44535+ dent = (char *)entry_at(coord, pos);
44536+ /* place where new header will be in */
44537+ header = header_at(coord, pos);
44538+ /* free space for new entry headers */
44539+ memmove(header + no, header,
44540+ (unsigned)(address(coord, size) - (char *)header));
44541+	/* if adding to the end, initialise the first new header */
44542+ if (pos == entries) {
44543+ set_offset(coord, pos, (unsigned)size);
44544+ }
44545+
44546+ /* adjust entry pointer and size */
44547+ dent = dent + no * sizeof *header;
44548+ size += no * sizeof *header;
44549+ /* free space for new entries */
44550+ memmove(dent + data_size, dent,
44551+ (unsigned)(address(coord, size) - dent));
44552+
44553+ /* increase counter */
44554+ entries += no;
44555+ put_unaligned(cpu_to_le16((__u16) entries), &formatted_at(coord)->num_of_entries);
44556+
44557+ /* [ 0 ... pos ] entries were shifted by no * ( sizeof *header )
44558+ bytes. */
44559+ for (i = 0; i <= pos; ++i)
44560+ adj_offset(coord, i, no * sizeof *header);
44561+ /* [ pos + no ... +\infty ) entries were shifted by ( no *
44562+ sizeof *header + data_size ) bytes */
44563+ for (i = pos + no; i < entries; ++i)
44564+ adj_offset(coord, i, no * sizeof *header + data_size);
44565+ return 0;
44566+}
44567+
44568+/* insert new @entry into item */
44569+static int expand(const coord_t * coord /* coord of item */ ,
44570+ cde_entry * entry /* entry to insert */ ,
44571+ int len /* length of @entry data */ ,
44572+ int *pos /* position to insert */ ,
44573+ reiser4_dir_entry_desc * dir_entry /* parameters for new
44574+ * entry */ )
44575+{
44576+ cmp_t cmp_res;
44577+ int datasize;
44578+
44579+ *pos = find(coord, &dir_entry->key, &cmp_res);
44580+ if (*pos < 0)
44581+ *pos = units(coord);
44582+
44583+ datasize = sizeof(directory_entry_format);
44584+ if (is_longname(entry->name->name, entry->name->len))
44585+ datasize += entry->name->len + 1;
44586+
44587+ expand_item(coord, *pos, 1, item_length_by_coord(coord) - len,
44588+ datasize);
44589+ return 0;
44590+}
44591+
44592+/* paste body of @entry into item */
44593+static int paste_entry(const coord_t * coord /* coord of item */ ,
44594+ cde_entry * entry /* new entry */ ,
44595+ int pos /* position to insert */ ,
44596+ reiser4_dir_entry_desc * dir_entry /* parameters for
44597+ * new entry */ )
44598+{
44599+ cde_unit_header *header;
44600+ directory_entry_format *dent;
44601+ const char *name;
44602+ int len;
44603+
44604+ header = header_at(coord, pos);
44605+ dent = entry_at(coord, pos);
44606+
44607+ build_de_id_by_key(&dir_entry->key, &header->hash);
44608+ build_inode_key_id(entry->obj, &dent->id);
44609+	/* AUDIT unsafe strcpy() operation! It should be replaced with the
44610+	   much less CPU-hungry
44611+	   memcpy( ( char * ) dent -> name, entry -> name -> name , entry -> name -> len );
44612+
44613+	   Also, more importantly, there should be a way to figure out the
44614+	   amount of space in dent -> name and to check that we are not
44615+	   going to overwrite more than we are supposed to */
44616+ name = entry->name->name;
44617+ len = entry->name->len;
44618+ if (is_longname(name, len)) {
44619+ strcpy((unsigned char *)dent->name, name);
44620+ put_unaligned(0, &dent->name[len]);
44621+ }
44622+ return 0;
44623+}
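
/* Editor's aside - not part of the original patch: the AUDIT note above
   asks for a length-checked copy. A sketch of what the replacement might
   look like, where @room is a hypothetical variable standing for the space
   actually available for the name inside @dent: */

	if (is_longname(name, len)) {
		/* @room: assumed precomputed available space (hypothetical);
		   refuse to copy if name plus NUL would not fit */
		if (len + 1 > room)
			return RETERR(-EIO);
		/* bounded copy instead of the unsafe strcpy() */
		memcpy((char *)dent->name, name, len);
		put_unaligned(0, &dent->name[len]);
	}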
44624+
44625+/* estimate how much space is necessary in item to insert/paste set of entries
44626+ described in @data. */
44627+int estimate_cde(const coord_t * coord /* coord of item */ ,
44628+ const reiser4_item_data * data /* parameters for new item */ )
44629+{
44630+ cde_entry_data *e;
44631+ int result;
44632+ int i;
44633+
44634+ e = (cde_entry_data *) data->data;
44635+
44636+ assert("nikita-1288", e != NULL);
44637+ assert("nikita-1289", e->num_of_entries >= 0);
44638+
44639+ if (coord == NULL)
44640+ /* insert */
44641+ result = sizeof(cde_item_format);
44642+ else
44643+ /* paste */
44644+ result = 0;
44645+
44646+ result += e->num_of_entries *
44647+ (sizeof(cde_unit_header) + sizeof(directory_entry_format));
44648+ for (i = 0; i < e->num_of_entries; ++i) {
44649+ const char *name;
44650+ int len;
44651+
44652+ name = e->entry[i].name->name;
44653+ len = e->entry[i].name->len;
44654+ assert("nikita-2054", strlen(name) == len);
44655+ if (is_longname(name, len))
44656+ result += len + 1;
44657+ }
44658+ ((reiser4_item_data *) data)->length = result;
44659+ return result;
44660+}
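
/* Editor's aside - not part of the original patch: a worked example of the
   estimate above, with hypothetical sizes sizeof(cde_item_format) == 2,
   sizeof(cde_unit_header) == 10, sizeof(directory_entry_format) == 16.
   Inserting a fresh item (coord == NULL) with two entries, one short name
   and one 20-character long name, yields

	2 + 2 * (10 + 16) + (20 + 1) = 75 bytes,

   since only names for which is_longname() holds contribute their
   len + 1 trailing bytes. */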
44661+
44662+/* ->nr_units() method for this item plugin. */
44663+pos_in_node_t nr_units_cde(const coord_t * coord /* coord of item */ )
44664+{
44665+ return units(coord);
44666+}
44667+
44668+/* ->unit_key() method for this item plugin. */
44669+reiser4_key *unit_key_cde(const coord_t * coord /* coord of item */ ,
44670+ reiser4_key * key /* resulting key */ )
44671+{
44672+ assert("nikita-1452", coord != NULL);
44673+ assert("nikita-1345", idx_of(coord) < units(coord));
44674+ assert("nikita-1346", key != NULL);
44675+
44676+ item_key_by_coord(coord, key);
44677+ extract_key_from_de_id(extract_dir_id_from_key(key),
44678+ &header_at(coord, idx_of(coord))->hash, key);
44679+ return key;
44680+}
44681+
44682+/* mergeable_cde(): implementation of ->mergeable() item method.
44683+
44684+ Two directory items are mergeable iff they are from the same
44685+ directory. That simple.
44686+
44687+*/
44688+int mergeable_cde(const coord_t * p1 /* coord of first item */ ,
44689+ const coord_t * p2 /* coord of second item */ )
44690+{
44691+ reiser4_key k1;
44692+ reiser4_key k2;
44693+
44694+ assert("nikita-1339", p1 != NULL);
44695+ assert("nikita-1340", p2 != NULL);
44696+
44697+ return
44698+ (item_plugin_by_coord(p1) == item_plugin_by_coord(p2)) &&
44699+ (extract_dir_id_from_key(item_key_by_coord(p1, &k1)) ==
44700+ extract_dir_id_from_key(item_key_by_coord(p2, &k2)));
44701+
44702+}
44703+
44704+/* ->max_key_inside() method for this item plugin. */
44705+reiser4_key *max_key_inside_cde(const coord_t * coord /* coord of item */ ,
44706+ reiser4_key * result /* resulting key */ )
44707+{
44708+ assert("nikita-1342", coord != NULL);
44709+
44710+ item_key_by_coord(coord, result);
44711+ set_key_ordering(result, get_key_ordering(max_key()));
44712+ set_key_fulloid(result, get_key_fulloid(max_key()));
44713+ set_key_offset(result, get_key_offset(max_key()));
44714+ return result;
44715+}
44716+
44717+/* @data contains data which are to be put into tree */
44718+int can_contain_key_cde(const coord_t * coord /* coord of item */ ,
44719+ const reiser4_key * key /* key to check */ ,
44720+ const reiser4_item_data * data /* parameters of new
44721+ * item/unit being
44722+ * created */ )
44723+{
44724+ reiser4_key item_key;
44725+
44726+ /* FIXME-VS: do not rely on anything but iplug field of @data. Only
44727+ data->iplug is initialized */
44728+ assert("vs-457", data && data->iplug);
44729+/* assert( "vs-553", data -> user == 0 );*/
44730+ item_key_by_coord(coord, &item_key);
44731+
44732+ return (item_plugin_by_coord(coord) == data->iplug) &&
44733+ (extract_dir_id_from_key(&item_key) ==
44734+ extract_dir_id_from_key(key));
44735+}
44736+
44737+#if REISER4_DEBUG
44738+/* cde_check ->check() method for compressed directory items
44739+
44740+   used for debugging; every item should have here the most complete
44741+   possible check of the consistency of the item that the inventor can
44742+   construct
44743+*/
44744+int check_cde(const coord_t * coord /* coord of item to check */ ,
44745+ const char **error /* where to store error message */ )
44746+{
44747+ int i;
44748+ int result;
44749+ char *item_start;
44750+ char *item_end;
44751+ reiser4_key key;
44752+
44753+ coord_t c;
44754+
44755+ assert("nikita-1357", coord != NULL);
44756+ assert("nikita-1358", error != NULL);
44757+
44758+ if (!ergo(coord->item_pos != 0,
44759+ is_dot_key(item_key_by_coord(coord, &key)))) {
44760+ *error = "CDE doesn't start with dot";
44761+ return -1;
44762+ }
44763+ item_start = item_body_by_coord(coord);
44764+ item_end = item_start + item_length_by_coord(coord);
44765+
44766+ coord_dup(&c, coord);
44767+ result = 0;
44768+ for (i = 0; i < units(coord); ++i) {
44769+ directory_entry_format *entry;
44770+
44771+ if ((char *)(header_at(coord, i) + 1) >
44772+ item_end - units(coord) * sizeof *entry) {
44773+ *error = "CDE header is out of bounds";
44774+ result = -1;
44775+ break;
44776+ }
44777+ entry = entry_at(coord, i);
44778+ if ((char *)entry < item_start + sizeof(cde_item_format)) {
44779+ *error = "CDE header is too low";
44780+ result = -1;
44781+ break;
44782+ }
44783+ if ((char *)(entry + 1) > item_end) {
44784+ *error = "CDE header is too high";
44785+ result = -1;
44786+ break;
44787+ }
44788+ }
44789+
44790+ return result;
44791+}
44792+#endif
44793+
44794+/* ->init() method for this item plugin. */
44795+int init_cde(coord_t * coord /* coord of item */ ,
44796+	     coord_t * from UNUSED_ARG,
44797+	     reiser4_item_data * data /* structure used for insertion */ UNUSED_ARG)
44798+{
44799+ put_unaligned(cpu_to_le16(0), &formatted_at(coord)->num_of_entries);
44800+ return 0;
44801+}
44802+
44803+/* ->lookup() method for this item plugin. */
44804+lookup_result lookup_cde(const reiser4_key * key /* key to search for */ ,
44805+ lookup_bias bias /* search bias */ ,
44806+ coord_t * coord /* coord of item to lookup in */ )
44807+{
44808+ cmp_t last_comp;
44809+ int pos;
44810+
44811+ reiser4_key utmost_key;
44812+
44813+ assert("nikita-1293", coord != NULL);
44814+ assert("nikita-1294", key != NULL);
44815+
44816+ CHECKME(coord);
44817+
44818+ if (keygt(item_key_by_coord(coord, &utmost_key), key)) {
44819+ coord->unit_pos = 0;
44820+ coord->between = BEFORE_UNIT;
44821+ return CBK_COORD_NOTFOUND;
44822+ }
44823+ pos = find(coord, key, &last_comp);
44824+ if (pos >= 0) {
44825+ coord->unit_pos = (int)pos;
44826+ switch (last_comp) {
44827+ case EQUAL_TO:
44828+ coord->between = AT_UNIT;
44829+ return CBK_COORD_FOUND;
44830+ case GREATER_THAN:
44831+ coord->between = BEFORE_UNIT;
44832+ return RETERR(-ENOENT);
44833+ case LESS_THAN:
44834+ default:
44835+ impossible("nikita-1298", "Broken find");
44836+ return RETERR(-EIO);
44837+ }
44838+ } else {
44839+ coord->unit_pos = units(coord) - 1;
44840+ coord->between = AFTER_UNIT;
44841+ return (bias ==
44842+ FIND_MAX_NOT_MORE_THAN) ? CBK_COORD_FOUND :
44843+ CBK_COORD_NOTFOUND;
44844+ }
44845+}
44846+
44847+/* ->paste() method for this item plugin. */
44848+int paste_cde(coord_t * coord /* coord of item */ ,
44849+ reiser4_item_data * data /* parameters of new unit being
44850+ * inserted */ ,
44851+ carry_plugin_info * info UNUSED_ARG /* todo carry queue */ )
44852+{
44853+ cde_entry_data *e;
44854+ int result;
44855+ int i;
44856+
44857+ CHECKME(coord);
44858+ e = (cde_entry_data *) data->data;
44859+
44860+ result = 0;
44861+ for (i = 0; i < e->num_of_entries; ++i) {
44862+ int pos;
44863+ int phantom_size;
44864+
44865+ phantom_size = data->length;
44866+ if (units(coord) == 0)
44867+ phantom_size -= sizeof(cde_item_format);
44868+
44869+ result =
44870+ expand(coord, e->entry + i, phantom_size, &pos, data->arg);
44871+ if (result != 0)
44872+ break;
44873+ result = paste_entry(coord, e->entry + i, pos, data->arg);
44874+ if (result != 0)
44875+ break;
44876+ }
44877+ CHECKME(coord);
44878+ return result;
44879+}
44880+
44881+/* amount of space occupied by all entries starting from @idx, both headers
44882+   and bodies. */
44883+static unsigned int part_size(const coord_t * coord /* coord of item */ ,
44884+ int idx /* index of unit */ )
44885+{
44886+ assert("nikita-1299", coord != NULL);
44887+ assert("nikita-1300", idx < (int)units(coord));
44888+
44889+ return sizeof(cde_item_format) +
44890+ (idx + 1) * sizeof(cde_unit_header) + offset_of(coord,
44891+ idx + 1) -
44892+ offset_of(coord, 0);
44893+}
44894+
44895+/* how many units of @source, but not more than @want, can be merged with
44896+   the item in @target node. If pend == append we try to append the last
44897+   item of @target with the first units of @source. If pend == prepend we
44898+   try to "prepend" the first item in @target with the last units of
44899+   @source. @target node has @free_space bytes of free space. The total
44900+   size of those units is returned via @size */
44901+int can_shift_cde(unsigned free_space /* free space in item */ ,
44902+ coord_t * coord /* coord of source item */ ,
44903+ znode * target /* target node */ ,
44904+ shift_direction pend /* shift direction */ ,
44905+ unsigned *size /* resulting number of shifted bytes */ ,
44906+ unsigned want /* maximal number of bytes to shift */ )
44907+{
44908+ int shift;
44909+
44910+ CHECKME(coord);
44911+ if (want == 0) {
44912+ *size = 0;
44913+ return 0;
44914+ }
44915+
44916+ /* pend == SHIFT_LEFT <==> shifting to the left */
44917+ if (pend == SHIFT_LEFT) {
44918+ for (shift = min((int)want - 1, units(coord)); shift >= 0;
44919+ --shift) {
44920+ *size = part_size(coord, shift);
44921+ if (target != NULL)
44922+ *size -= sizeof(cde_item_format);
44923+ if (*size <= free_space)
44924+ break;
44925+ }
44926+ shift = shift + 1;
44927+ } else {
44928+ int total_size;
44929+
44930+ assert("nikita-1301", pend == SHIFT_RIGHT);
44931+
44932+ total_size = item_length_by_coord(coord);
44933+ for (shift = units(coord) - want - 1; shift < units(coord) - 1;
44934+ ++shift) {
44935+ *size = total_size - part_size(coord, shift);
44936+ if (target == NULL)
44937+ *size += sizeof(cde_item_format);
44938+ if (*size <= free_space)
44939+ break;
44940+ }
44941+ shift = units(coord) - shift - 1;
44942+ }
44943+ if (shift == 0)
44944+ *size = 0;
44945+ CHECKME(coord);
44946+ return shift;
44947+}
44948+
44949+/* ->copy_units() method for this item plugin. */
44950+void copy_units_cde(coord_t * target /* coord of target item */ ,
44951+ coord_t * source /* coord of source item */ ,
44952+ unsigned from /* starting unit */ ,
44953+ unsigned count /* how many units to copy */ ,
44954+ shift_direction where_is_free_space /* shift direction */ ,
44955+ unsigned free_space /* free space in item */ )
44956+{
44957+ char *header_from;
44958+ char *header_to;
44959+
44960+ char *entry_from;
44961+ char *entry_to;
44962+
44963+ int pos_in_target;
44964+ int data_size;
44965+ int data_delta;
44966+ int i;
44967+
44968+ assert("nikita-1303", target != NULL);
44969+ assert("nikita-1304", source != NULL);
44970+ assert("nikita-1305", (int)from < units(source));
44971+ assert("nikita-1307", (int)(from + count) <= units(source));
44972+
44973+ if (where_is_free_space == SHIFT_LEFT) {
44974+ assert("nikita-1453", from == 0);
44975+ pos_in_target = units(target);
44976+ } else {
44977+ assert("nikita-1309", (int)(from + count) == units(source));
44978+ pos_in_target = 0;
44979+ memmove(item_body_by_coord(target),
44980+ (char *)item_body_by_coord(target) + free_space,
44981+ item_length_by_coord(target) - free_space);
44982+ }
44983+
44984+ CHECKME(target);
44985+ CHECKME(source);
44986+
44987+ /* expand @target */
44988+ data_size =
44989+ offset_of(source, (int)(from + count)) - offset_of(source,
44990+ (int)from);
44991+
44992+ if (units(target) == 0)
44993+ free_space -= sizeof(cde_item_format);
44994+
44995+ expand_item(target, pos_in_target, (int)count,
44996+ (int)(item_length_by_coord(target) - free_space),
44997+ (unsigned)data_size);
44998+
44999+ /* copy first @count units of @source into @target */
45000+ data_delta =
45001+ offset_of(target, pos_in_target) - offset_of(source, (int)from);
45002+
45003+ /* copy entries */
45004+ entry_from = (char *)entry_at(source, (int)from);
45005+ entry_to = (char *)entry_at(source, (int)(from + count));
45006+ memmove(entry_at(target, pos_in_target), entry_from,
45007+ (unsigned)(entry_to - entry_from));
45008+
45009+ /* copy headers */
45010+ header_from = (char *)header_at(source, (int)from);
45011+ header_to = (char *)header_at(source, (int)(from + count));
45012+ memmove(header_at(target, pos_in_target), header_from,
45013+ (unsigned)(header_to - header_from));
45014+
45015+ /* update offsets */
45016+ for (i = pos_in_target; i < (int)(pos_in_target + count); ++i)
45017+ adj_offset(target, i, data_delta);
45018+ CHECKME(target);
45019+ CHECKME(source);
45020+}
45021+
45022+/* ->cut_units() method for this item plugin. */
45023+int cut_units_cde(coord_t * coord /* coord of item */ ,
45024+ pos_in_node_t from /* start unit pos */ ,
45025+ pos_in_node_t to /* stop unit pos */ ,
45026+ struct carry_cut_data *cdata UNUSED_ARG,
45027+ reiser4_key * smallest_removed, reiser4_key * new_first)
45028+{
45029+ char *header_from;
45030+ char *header_to;
45031+
45032+ char *entry_from;
45033+ char *entry_to;
45034+
45035+ int size;
45036+ int entry_delta;
45037+ int header_delta;
45038+ int i;
45039+
45040+ unsigned count;
45041+
45042+ CHECKME(coord);
45043+
45044+ count = to - from + 1;
45045+
45046+ assert("nikita-1454", coord != NULL);
45047+ assert("nikita-1455", (int)(from + count) <= units(coord));
45048+
45049+ if (smallest_removed)
45050+ unit_key_by_coord(coord, smallest_removed);
45051+
45052+ if (new_first) {
45053+ coord_t next;
45054+
45055+ /* not everything is cut from item head */
45056+ assert("vs-1527", from == 0);
45057+ assert("vs-1528", to < units(coord) - 1);
45058+
45059+ coord_dup(&next, coord);
45060+ next.unit_pos++;
45061+ unit_key_by_coord(&next, new_first);
45062+ }
45063+
45064+ size = item_length_by_coord(coord);
45065+ if (count == (unsigned)units(coord)) {
45066+ return size;
45067+ }
45068+
45069+ header_from = (char *)header_at(coord, (int)from);
45070+ header_to = (char *)header_at(coord, (int)(from + count));
45071+
45072+ entry_from = (char *)entry_at(coord, (int)from);
45073+ entry_to = (char *)entry_at(coord, (int)(from + count));
45074+
45075+ /* move headers */
45076+ memmove(header_from, header_to,
45077+ (unsigned)(address(coord, size) - header_to));
45078+
45079+ header_delta = header_to - header_from;
45080+
45081+ entry_from -= header_delta;
45082+ entry_to -= header_delta;
45083+ size -= header_delta;
45084+
45085+ /* copy entries */
45086+ memmove(entry_from, entry_to,
45087+ (unsigned)(address(coord, size) - entry_to));
45088+
45089+ entry_delta = entry_to - entry_from;
45090+ size -= entry_delta;
45091+
45092+ /* update offsets */
45093+
45094+ for (i = 0; i < (int)from; ++i)
45095+ adj_offset(coord, i, -header_delta);
45096+
45097+ for (i = from; i < units(coord) - (int)count; ++i)
45098+ adj_offset(coord, i, -header_delta - entry_delta);
45099+
45100+ put_unaligned(cpu_to_le16((__u16) units(coord) - count),
45101+ &formatted_at(coord)->num_of_entries);
45102+
45103+ if (from == 0) {
45104+		/* entries were removed from the head - move the rest to the right */
45105+ memmove((char *)item_body_by_coord(coord) +
45106+ header_delta + entry_delta, item_body_by_coord(coord),
45107+ (unsigned)size);
45108+ if (REISER4_DEBUG)
45109+ memset(item_body_by_coord(coord), 0,
45110+ (unsigned)header_delta + entry_delta);
45111+ } else {
45112+ /* freed space is already at the end of item */
45113+ if (REISER4_DEBUG)
45114+ memset((char *)item_body_by_coord(coord) + size, 0,
45115+ (unsigned)header_delta + entry_delta);
45116+ }
45117+
45118+ return header_delta + entry_delta;
45119+}
45120+
45121+int kill_units_cde(coord_t * coord /* coord of item */ ,
45122+ pos_in_node_t from /* start unit pos */ ,
45123+ pos_in_node_t to /* stop unit pos */ ,
45124+ struct carry_kill_data *kdata UNUSED_ARG,
45125+ reiser4_key * smallest_removed, reiser4_key * new_first)
45126+{
45127+ return cut_units_cde(coord, from, to, NULL, smallest_removed, new_first);
45128+}
45129+
45130+/* ->s.dir.extract_key() method for this item plugin. */
45131+int extract_key_cde(const coord_t * coord /* coord of item */ ,
45132+ reiser4_key * key /* resulting key */ )
45133+{
45134+ directory_entry_format *dent;
45135+
45136+ assert("nikita-1155", coord != NULL);
45137+ assert("nikita-1156", key != NULL);
45138+
45139+ dent = entry_at(coord, idx_of(coord));
45140+ return extract_key_from_id(&dent->id, key);
45141+}
45142+
45143+int
45144+update_key_cde(const coord_t * coord, const reiser4_key * key,
45145+ lock_handle * lh UNUSED_ARG)
45146+{
45147+ directory_entry_format *dent;
45148+ obj_key_id obj_id;
45149+ int result;
45150+
45151+ assert("nikita-2344", coord != NULL);
45152+ assert("nikita-2345", key != NULL);
45153+
45154+ dent = entry_at(coord, idx_of(coord));
45155+ result = build_obj_key_id(key, &obj_id);
45156+ if (result == 0) {
45157+ dent->id = obj_id;
45158+ znode_make_dirty(coord->node);
45159+ }
45160+ return 0;
45161+}
45162+
45163+/* ->s.dir.extract_name() method for this item plugin. */
45164+char *extract_name_cde(const coord_t * coord /* coord of item */ , char *buf)
45165+{
45166+ directory_entry_format *dent;
45167+
45168+ assert("nikita-1157", coord != NULL);
45169+
45170+ dent = entry_at(coord, idx_of(coord));
45171+ return extract_dent_name(coord, dent, buf);
45172+}
45173+
45174+static int cde_bytes(int pasting, const reiser4_item_data * data)
45175+{
45176+ int result;
45177+
45178+ result = data->length;
45179+ if (!pasting)
45180+ result -= sizeof(cde_item_format);
45181+ return result;
45182+}
45183+
45184+/* ->s.dir.add_entry() method for this item plugin */
45185+int add_entry_cde(struct inode *dir /* directory object */ ,
45186+ coord_t * coord /* coord of item */ ,
45187+ lock_handle * lh /* lock handle for insertion */ ,
45188+ const struct dentry *name /* name to insert */ ,
45189+ reiser4_dir_entry_desc * dir_entry /* parameters of new
45190+ * directory entry */ )
45191+{
45192+ reiser4_item_data data;
45193+ cde_entry entry;
45194+ cde_entry_data edata;
45195+ int result;
45196+
45197+ assert("nikita-1656", coord->node == lh->node);
45198+ assert("nikita-1657", znode_is_write_locked(coord->node));
45199+
45200+ edata.num_of_entries = 1;
45201+ edata.entry = &entry;
45202+
45203+ entry.dir = dir;
45204+ entry.obj = dir_entry->obj;
45205+ entry.name = &name->d_name;
45206+
45207+ data.data = (char *)&edata;
45208+ data.user = 0; /* &edata is not user space */
45209+ data.iplug = item_plugin_by_id(COMPOUND_DIR_ID);
45210+ data.arg = dir_entry;
45211+ assert("nikita-1302", data.iplug != NULL);
45212+
45213+ result = is_dot_key(&dir_entry->key);
45214+ data.length = estimate_cde(result ? coord : NULL, &data);
45215+
45216+ /* NOTE-NIKITA quota plugin? */
45217+ if (DQUOT_ALLOC_SPACE_NODIRTY(dir, cde_bytes(result, &data)))
45218+ return RETERR(-EDQUOT);
45219+
45220+ if (result)
45221+ result = insert_by_coord(coord, &data, &dir_entry->key, lh, 0);
45222+ else
45223+ result = resize_item(coord, &data, &dir_entry->key, lh, 0);
45224+ return result;
45225+}
45226+
45227+/* ->s.dir.rem_entry() */
45228+int rem_entry_cde(struct inode *dir /* directory of item */ ,
45229+ const struct qstr *name, coord_t * coord /* coord of item */ ,
45230+ lock_handle * lh UNUSED_ARG /* lock handle for
45231+ * removal */ ,
45232+ reiser4_dir_entry_desc * entry UNUSED_ARG /* parameters of
45233+ * directory entry
45234+ * being removed */ )
45235+{
45236+ coord_t shadow;
45237+ int result;
45238+ int length;
45239+ ON_DEBUG(char buf[DE_NAME_BUF_LEN]);
45240+
45241+ assert("nikita-2870", strlen(name->name) == name->len);
45242+ assert("nikita-2869",
45243+ !strcmp(name->name, extract_name_cde(coord, buf)));
45244+
45245+ length = sizeof(directory_entry_format) + sizeof(cde_unit_header);
45246+ if (is_longname(name->name, name->len))
45247+ length += name->len + 1;
45248+
45249+ if (inode_get_bytes(dir) < length) {
45250+ warning("nikita-2628", "Dir is broke: %llu: %llu",
45251+ (unsigned long long)get_inode_oid(dir),
45252+ inode_get_bytes(dir));
45253+
45254+ return RETERR(-EIO);
45255+ }
45256+
45257+ /* cut_node() is supposed to take pointers to _different_
45258+ coords, because it will modify them without respect to
45259+	   possible aliasing. To work around this, create a temporary
45260+	   copy of @coord.
45261+ */
45262+ coord_dup(&shadow, coord);
45263+ result =
45264+ kill_node_content(coord, &shadow, NULL, NULL, NULL, NULL, NULL, 0);
45265+ if (result == 0) {
45266+ /* NOTE-NIKITA quota plugin? */
45267+ DQUOT_FREE_SPACE_NODIRTY(dir, length);
45268+ }
45269+ return result;
45270+}
45271+
45272+/* ->s.dir.max_name_len() method for this item plugin */
45273+int max_name_len_cde(const struct inode *dir /* directory */ )
45274+{
45275+ return
45276+ tree_by_inode(dir)->nplug->max_item_size() -
45277+ sizeof(directory_entry_format) - sizeof(cde_item_format) -
45278+ sizeof(cde_unit_header) - 2;
45279+}
45280+
45281+/* Make Linus happy.
45282+ Local variables:
45283+ c-indentation-style: "K&R"
45284+ mode-name: "LC"
45285+ c-basic-offset: 8
45286+ tab-width: 8
45287+ fill-column: 120
45288+ End:
45289+*/
45290Index: linux-2.6.16/fs/reiser4/plugin/item/cde.h
45291===================================================================
45292--- /dev/null
45293+++ linux-2.6.16/fs/reiser4/plugin/item/cde.h
45294@@ -0,0 +1,87 @@
45295+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
45296+
45297+/* Compound directory item. See cde.c for description. */
45298+
45299+#if !defined( __FS_REISER4_PLUGIN_COMPRESSED_DE_H__ )
45300+#define __FS_REISER4_PLUGIN_COMPRESSED_DE_H__
45301+
45302+#include "../../forward.h"
45303+#include "../../kassign.h"
45304+#include "../../dformat.h"
45305+
45306+#include <linux/fs.h> /* for struct inode */
45307+#include <linux/dcache.h> /* for struct dentry, etc */
45308+
45309+typedef struct cde_unit_header {
45310+ de_id hash;
45311+ d16 offset;
45312+} cde_unit_header;
45313+
45314+typedef struct cde_item_format {
45315+ d16 num_of_entries;
45316+ cde_unit_header entry[0];
45317+} cde_item_format;
45318+
45319+typedef struct cde_entry {
45320+ const struct inode *dir;
45321+ const struct inode *obj;
45322+ const struct qstr *name;
45323+} cde_entry;
45324+
45325+typedef struct cde_entry_data {
45326+ int num_of_entries;
45327+ cde_entry *entry;
45328+} cde_entry_data;
45329+
45330+/* plugin->item.b.* */
45331+reiser4_key *max_key_inside_cde(const coord_t * coord, reiser4_key * result);
45332+int can_contain_key_cde(const coord_t * coord, const reiser4_key * key,
45333+ const reiser4_item_data *);
45334+int mergeable_cde(const coord_t * p1, const coord_t * p2);
45335+pos_in_node_t nr_units_cde(const coord_t * coord);
45336+reiser4_key *unit_key_cde(const coord_t * coord, reiser4_key * key);
45337+int estimate_cde(const coord_t * coord, const reiser4_item_data * data);
45338+void print_cde(const char *prefix, coord_t * coord);
45339+int init_cde(coord_t * coord, coord_t * from, reiser4_item_data * data);
45340+lookup_result lookup_cde(const reiser4_key * key, lookup_bias bias,
45341+ coord_t * coord);
45342+int paste_cde(coord_t * coord, reiser4_item_data * data,
45343+ carry_plugin_info * info UNUSED_ARG);
45344+int can_shift_cde(unsigned free_space, coord_t * coord, znode * target,
45345+ shift_direction pend, unsigned *size, unsigned want);
45346+void copy_units_cde(coord_t * target, coord_t * source, unsigned from,
45347+ unsigned count, shift_direction where_is_free_space,
45348+ unsigned free_space);
45349+int cut_units_cde(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
45350+ struct carry_cut_data *, reiser4_key * smallest_removed,
45351+ reiser4_key * new_first);
45352+int kill_units_cde(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
45353+ struct carry_kill_data *, reiser4_key * smallest_removed,
45354+ reiser4_key * new_first);
45356+int check_cde(const coord_t * coord, const char **error);
45357+
45358+/* plugin->u.item.s.dir.* */
45359+int extract_key_cde(const coord_t * coord, reiser4_key * key);
45360+int update_key_cde(const coord_t * coord, const reiser4_key * key,
45361+ lock_handle * lh);
45362+char *extract_name_cde(const coord_t * coord, char *buf);
45363+int add_entry_cde(struct inode *dir, coord_t * coord,
45364+ lock_handle * lh, const struct dentry *name,
45365+ reiser4_dir_entry_desc * entry);
45366+int rem_entry_cde(struct inode *dir, const struct qstr *name, coord_t * coord,
45367+ lock_handle * lh, reiser4_dir_entry_desc * entry);
45368+int max_name_len_cde(const struct inode *dir);
45369+
45370+/* __FS_REISER4_PLUGIN_COMPRESSED_DE_H__ */
45371+#endif
45372+
45373+/* Make Linus happy.
45374+ Local variables:
45375+ c-indentation-style: "K&R"
45376+ mode-name: "LC"
45377+ c-basic-offset: 8
45378+ tab-width: 8
45379+ fill-column: 120
45380+ End:
45381+*/
45382Index: linux-2.6.16/fs/reiser4/plugin/item/ctail.c
45383===================================================================
45384--- /dev/null
45385+++ linux-2.6.16/fs/reiser4/plugin/item/ctail.c
45386@@ -0,0 +1,1588 @@
45387+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
45388+
45389+/* ctails (aka "clustered tails") are items for cryptcompress objects */
45390+
45391+/* DESCRIPTION:
45392+
45393+Each cryptcompress object is stored on disk as a set of clusters sliced
45394+into ctails.
45395+
45396+Internal on-disk structure:
45397+
45398+ HEADER   (1)  Here the disk cluster shift is stored
45399+ BODY
45400+*/
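
/* Editor's aside - not part of the original patch: for ctails one unit is
   one byte of body, so a minimal sketch of the payload size is simply the
   item length minus the one-byte header, mirroring nr_units_ctail()
   below: */
static int ctail_payload_bytes(const coord_t *coord)
{
	/* skip the cluster_shift header; the rest of the item is body */
	return item_length_by_coord(coord) -
	    sizeof(ctail_formatted_at(coord)->cluster_shift);
}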
45401+
45402+#include "../../forward.h"
45403+#include "../../debug.h"
45404+#include "../../dformat.h"
45405+#include "../../kassign.h"
45406+#include "../../key.h"
45407+#include "../../coord.h"
45408+#include "item.h"
45409+#include "../node/node.h"
45410+#include "../plugin.h"
45411+#include "../object.h"
45412+#include "../../znode.h"
45413+#include "../../carry.h"
45414+#include "../../tree.h"
45415+#include "../../inode.h"
45416+#include "../../super.h"
45417+#include "../../context.h"
45418+#include "../../page_cache.h"
45419+#include "../cluster.h"
45420+#include "../../flush.h"
45421+#include "../../tree_walk.h"
45422+
45423+#include <linux/pagevec.h>
45424+#include <linux/swap.h>
45425+#include <linux/fs.h>
45426+
45427+/* return body of ctail item at @coord */
45428+static ctail_item_format *ctail_formatted_at(const coord_t * coord)
45429+{
45430+ assert("edward-60", coord != NULL);
45431+ return item_body_by_coord(coord);
45432+}
45433+
45434+int cluster_shift_by_coord(const coord_t * coord)
45435+{
45436+ return get_unaligned(&ctail_formatted_at(coord)->cluster_shift);
45437+}
45438+
45439+static loff_t off_by_coord(const coord_t * coord)
45440+{
45441+ reiser4_key key;
45442+ return get_key_offset(item_key_by_coord(coord, &key));
45443+}
45444+
45445+static int coord_is_unprepped_ctail(const coord_t * coord)
45446+{
45447+ assert("edward-1233", coord != NULL);
45448+ assert("edward-1234", item_id_by_coord(coord) == CTAIL_ID);
45449+ assert("edward-1235",
45450+ ergo((int)cluster_shift_by_coord(coord) == (int)UCTAIL_SHIFT,
45451+ nr_units_ctail(coord) == (pos_in_node_t) UCTAIL_NR_UNITS));
45452+
45453+ return (int)cluster_shift_by_coord(coord) == (int)UCTAIL_SHIFT;
45454+}
45455+
45456+static cloff_t clust_by_coord(const coord_t * coord, struct inode *inode)
45457+{
45458+ int shift;
45459+
45460+ if (inode != NULL) {
45461+ shift = inode_cluster_shift(inode);
45462+ assert("edward-1236",
45463+ ergo(!coord_is_unprepped_ctail(coord),
45464+ shift == cluster_shift_by_coord(coord)));
45465+ } else {
45466+ assert("edward-1237", !coord_is_unprepped_ctail(coord));
45467+ shift = cluster_shift_by_coord(coord);
45468+ }
45469+ return off_by_coord(coord) >> shift;
45470+}
45471+
45472+static int disk_cluster_size(const coord_t * coord)
45473+{
45474+ assert("edward-1156",
45475+ item_plugin_by_coord(coord) == item_plugin_by_id(CTAIL_ID));
45476+	/* calculation of disk cluster size
45477+	   is meaningless if ctail is unprepped */
45478+ assert("edward-1238", !coord_is_unprepped_ctail(coord));
45479+
45480+ return 1 << cluster_shift_by_coord(coord);
45481+}
45482+
45483+/* true if the key is of first disk cluster item */
45484+static int is_disk_cluster_key(const reiser4_key * key, const coord_t * coord)
45485+{
45486+ assert("edward-1239", item_id_by_coord(coord) == CTAIL_ID);
45487+
45488+ return coord_is_unprepped_ctail(coord) ||
45489+ ((get_key_offset(key) &
45490+ ((loff_t) disk_cluster_size(coord) - 1)) == 0);
45491+}
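
/* Editor's note - not part of the original patch: a worked example of the
   mask test above. With a hypothetical cluster shift of 16 (64KiB disk
   clusters) the mask is 0xffff, so key offsets 0, 65536, 131072, ... pass
   as first-item keys of their clusters, while an offset such as 4096
   falls inside a cluster and fails. */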
45492+
45493+static char *first_unit(coord_t * coord)
45494+{
45495+ /* FIXME: warning: pointer of type `void *' used in arithmetic */
45496+ return (char *)item_body_by_coord(coord) + sizeof(ctail_item_format);
45497+}
45498+
45499+/* plugin->u.item.b.max_key_inside :
45500+ tail_max_key_inside */
45501+
45502+/* plugin->u.item.b.can_contain_key */
45503+int
45504+can_contain_key_ctail(const coord_t * coord, const reiser4_key * key,
45505+ const reiser4_item_data * data)
45506+{
45507+ reiser4_key item_key;
45508+
45509+ if (item_plugin_by_coord(coord) != data->iplug)
45510+ return 0;
45511+
45512+ item_key_by_coord(coord, &item_key);
45513+ if (get_key_locality(key) != get_key_locality(&item_key) ||
45514+ get_key_objectid(key) != get_key_objectid(&item_key))
45515+ return 0;
45516+ if (get_key_offset(&item_key) + nr_units_ctail(coord) !=
45517+ get_key_offset(key))
45518+ return 0;
45519+ if (is_disk_cluster_key(key, coord))
45520+ return 0;
45521+ return 1;
45522+}
45523+
45524+/* plugin->u.item.b.mergeable
45525+ c-tails of different clusters are not mergeable */
45526+int mergeable_ctail(const coord_t * p1, const coord_t * p2)
45527+{
45528+ reiser4_key key1, key2;
45529+
45530+ assert("edward-62", item_id_by_coord(p1) == CTAIL_ID);
45531+ assert("edward-61",
45532+ item_type_by_coord(p1) == UNIX_FILE_METADATA_ITEM_TYPE);
45533+
45534+ if (item_id_by_coord(p2) != CTAIL_ID) {
45535+ /* second item is of another type */
45536+ return 0;
45537+ }
45538+
45539+ item_key_by_coord(p1, &key1);
45540+ item_key_by_coord(p2, &key2);
45541+ if (get_key_locality(&key1) != get_key_locality(&key2) ||
45542+ get_key_objectid(&key1) != get_key_objectid(&key2) ||
45543+ get_key_type(&key1) != get_key_type(&key2)) {
45544+ /* items of different objects */
45545+ return 0;
45546+ }
45547+ if (get_key_offset(&key1) + nr_units_ctail(p1) != get_key_offset(&key2))
45548+ /* not adjacent items */
45549+ return 0;
45550+ if (is_disk_cluster_key(&key2, p2))
45551+ return 0;
45552+ return 1;
45553+}
45554+
45555+/* plugin->u.item.b.nr_units */
45556+pos_in_node_t nr_units_ctail(const coord_t * coord)
45557+{
45558+ return (item_length_by_coord(coord) -
45559+ sizeof(ctail_formatted_at(coord)->cluster_shift));
45560+}
45561+
45562+/* plugin->u.item.b.estimate:
45563+ estimate how much space is needed to insert/paste @data->length bytes
45564+ into ctail at @coord */
45565+int estimate_ctail(const coord_t * coord /* coord of item */ ,
45566+ const reiser4_item_data *
45567+ data /* parameters for new item */ )
45568+{
45569+ if (coord == NULL)
45570+ /* insert */
45571+ return (sizeof(ctail_item_format) + data->length);
45572+ else
45573+ /* paste */
45574+ return data->length;
45575+}
45576+
45577+/* ->init() method for this item plugin. */
45578+int init_ctail(coord_t * to /* coord of item */ ,
45579+ coord_t * from /* old_item */ ,
45580+ reiser4_item_data * data /* structure used for insertion */ )
45581+{
45582+ int cluster_shift; /* cpu value to convert */
45583+
45584+ if (data) {
45585+ assert("edward-463", data->length > sizeof(ctail_item_format));
45586+ cluster_shift = *((int *)(data->arg));
45587+ data->length -= sizeof(ctail_item_format);
45588+ } else {
45589+ assert("edward-464", from != NULL);
45590+ assert("edward-855", ctail_ok(from));
45591+ cluster_shift = (int)(cluster_shift_by_coord(from));
45592+ }
45593+ put_unaligned((d8)cluster_shift, &ctail_formatted_at(to)->cluster_shift);
45594+ assert("edward-856", ctail_ok(to));
45595+ return 0;
45596+}
45597+
45598+/* plugin->u.item.b.lookup:
45599+ NULL: We are looking for item keys only */
45600+
45601+#if REISER4_DEBUG
45602+int ctail_ok(const coord_t * coord)
45603+{
45604+ return coord_is_unprepped_ctail(coord) ||
45605+ cluster_shift_ok(cluster_shift_by_coord(coord));
45606+}
45607+
45608+/* plugin->u.item.b.check */
45609+int check_ctail(const coord_t * coord, const char **error)
45610+{
45611+ if (!ctail_ok(coord)) {
45612+ if (error)
45613+ *error = "bad cluster shift in ctail";
45614+ return 1;
45615+ }
45616+ return 0;
45617+}
45618+#endif
45619+
45620+/* plugin->u.item.b.paste */
45621+int
45622+paste_ctail(coord_t * coord, reiser4_item_data * data,
45623+ carry_plugin_info * info UNUSED_ARG)
45624+{
45625+ unsigned old_nr_units;
45626+
45627+ assert("edward-268", data->data != NULL);
45628+ /* copy only from kernel space */
45629+ assert("edward-66", data->user == 0);
45630+
45631+ old_nr_units =
45632+ item_length_by_coord(coord) - sizeof(ctail_item_format) -
45633+ data->length;
45634+
45635+ /* ctail items never get pasted in the middle */
45636+
45637+ if (coord->unit_pos == 0 && coord->between == AT_UNIT) {
45638+
45639+		/* paste at the beginning when creating a new item */
45640+ assert("edward-450",
45641+ item_length_by_coord(coord) ==
45642+ data->length + sizeof(ctail_item_format));
45643+ assert("edward-451", old_nr_units == 0);
45644+ } else if (coord->unit_pos == old_nr_units - 1
45645+ && coord->between == AFTER_UNIT) {
45646+
45647+ /* paste at the end */
45648+ coord->unit_pos++;
45649+ } else
45650+ impossible("edward-453", "bad paste position");
45651+
45652+ memcpy(first_unit(coord) + coord->unit_pos, data->data, data->length);
45653+
45654+ assert("edward-857", ctail_ok(coord));
45655+
45656+ return 0;
45657+}
45658+
45659+/* plugin->u.item.b.fast_paste */
45660+
45661+/* plugin->u.item.b.can_shift
45662+ number of units is returned via return value, number of bytes via @size. For
45663+ ctail items they coincide */
45664+int
45665+can_shift_ctail(unsigned free_space, coord_t * source,
45666+ znode * target, shift_direction direction UNUSED_ARG,
45667+ unsigned *size /* number of bytes */ , unsigned want)
45668+{
45669+	/* make sure that we do not want to shift more than we have */
45670+ assert("edward-68", want > 0 && want <= nr_units_ctail(source));
45671+
45672+ *size = min(want, free_space);
45673+
45674+ if (!target) {
45675+ /* new item will be created */
45676+ if (*size <= sizeof(ctail_item_format)) {
45677+ *size = 0;
45678+ return 0;
45679+ }
45680+ return *size - sizeof(ctail_item_format);
45681+ }
45682+ return *size;
45683+}
45684+
45685+/* plugin->u.item.b.copy_units
45686+ cooperates with ->can_shift() */
45687+void
45688+copy_units_ctail(coord_t * target, coord_t * source,
45689+ unsigned from, unsigned count /* units */ ,
45690+ shift_direction where_is_free_space,
45691+ unsigned free_space /* bytes */ )
45692+{
45693+ /* make sure that item @target is expanded already */
45694+ assert("edward-69", (unsigned)item_length_by_coord(target) >= count);
45695+ assert("edward-70", free_space == count || free_space == count + 1);
45696+
45697+ assert("edward-858", ctail_ok(source));
45698+
45699+ if (where_is_free_space == SHIFT_LEFT) {
45700+ /* append item @target with @count first bytes of @source:
45701+ this restriction came from ordinary tails */
45702+ assert("edward-71", from == 0);
45703+ assert("edward-860", ctail_ok(target));
45704+
45705+ memcpy(first_unit(target) + nr_units_ctail(target) - count,
45706+ first_unit(source), count);
45707+ } else {
45708+		/* target item has been moved to the right already */
45709+ reiser4_key key;
45710+
45711+ assert("edward-72", nr_units_ctail(source) == from + count);
45712+
45713+ if (free_space == count) {
45714+ init_ctail(target, source, NULL);
45715+ } else {
45716+ /* new item has been created */
45717+ assert("edward-862", ctail_ok(target));
45718+ }
45719+ memcpy(first_unit(target), first_unit(source) + from, count);
45720+
45721+ assert("edward-863", ctail_ok(target));
45722+
45723+		/* new units are inserted before the first unit of the item,
45724+		   therefore we have to update the item key */
45725+ item_key_by_coord(source, &key);
45726+ set_key_offset(&key, get_key_offset(&key) + from);
45727+
45728+ node_plugin_by_node(target->node)->update_item_key(target, &key,
45729+ NULL /*info */);
45730+ }
45731+}
45732+
45733+/* plugin->u.item.b.create_hook */
45734+int create_hook_ctail(const coord_t * coord, void *arg)
45735+{
45736+ assert("edward-864", znode_is_loaded(coord->node));
45737+
45738+ znode_set_convertible(coord->node);
45739+ return 0;
45740+}
45741+
45742+/* plugin->u.item.b.kill_hook */
45743+int
45744+kill_hook_ctail(const coord_t * coord, pos_in_node_t from, pos_in_node_t count,
45745+ carry_kill_data * kdata)
45746+{
45747+ struct inode *inode;
45748+
45749+ assert("edward-1157", item_id_by_coord(coord) == CTAIL_ID);
45750+ assert("edward-291", znode_is_write_locked(coord->node));
45751+
45752+ inode = kdata->inode;
45753+ if (inode) {
45754+ reiser4_key key;
45755+ item_key_by_coord(coord, &key);
45756+
45757+ if (from == 0 && is_disk_cluster_key(&key, coord)) {
45758+ cloff_t start =
45759+ off_to_clust(get_key_offset(&key), inode);
45760+ truncate_page_cluster(inode, start);
45761+ }
45762+ }
45763+ return 0;
45764+}
45765+
45766+/* for shift_hook_ctail(),
45767+   return true if the first disk cluster item has a dirty child
45768+*/
45769+static int ctail_convertible(const coord_t * coord)
45770+{
45771+ int result;
45772+ reiser4_key key;
45773+ jnode *child = NULL;
45774+
45775+ assert("edward-477", coord != NULL);
45776+ assert("edward-478", item_id_by_coord(coord) == CTAIL_ID);
45777+
45778+ if (coord_is_unprepped_ctail(coord))
45779+ /* unprepped ctail should be converted */
45780+ return 1;
45781+
45782+ item_key_by_coord(coord, &key);
45783+ child = jlookup(current_tree,
45784+ get_key_objectid(&key),
45785+ off_to_pg(off_by_coord(coord)));
45786+ if (!child)
45787+ return 0;
45788+ result = JF_ISSET(child, JNODE_DIRTY);
45789+ jput(child);
45790+ return result;
45791+}
45792+
45793+/* FIXME-EDWARD */
45794+/* plugin->u.item.b.shift_hook */
45795+int shift_hook_ctail(const coord_t * item /* coord of item */ ,
45796+ unsigned from UNUSED_ARG /* start unit */ ,
45797+ unsigned count UNUSED_ARG /* stop unit */ ,
45798+ znode * old_node /* old parent */ )
45799+{
45800+ assert("edward-479", item != NULL);
45801+ assert("edward-480", item->node != old_node);
45802+
45803+ if (!znode_convertible(old_node) || znode_convertible(item->node))
45804+ return 0;
45805+ if (ctail_convertible(item))
45806+ znode_set_convertible(item->node);
45807+ return 0;
45808+}
45809+
45810+static int
45811+cut_or_kill_ctail_units(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
45812+ int cut, void *p, reiser4_key * smallest_removed,
45813+ reiser4_key * new_first)
45814+{
45815+ pos_in_node_t count; /* number of units to cut */
45816+ char *item;
45817+
45818+ count = to - from + 1;
45819+ item = item_body_by_coord(coord);
45820+
45821+ assert("edward-74", ergo(from != 0, to == coord_last_unit_pos(coord)));
45822+
45823+ if (smallest_removed) {
45824+ /* store smallest key removed */
45825+ item_key_by_coord(coord, smallest_removed);
45826+ set_key_offset(smallest_removed,
45827+ get_key_offset(smallest_removed) + from);
45828+ }
45829+
45830+ if (new_first) {
45831+ assert("vs-1531", from == 0);
45832+
45833+ item_key_by_coord(coord, new_first);
45834+ set_key_offset(new_first,
45835+ get_key_offset(new_first) + from + count);
45836+ }
45837+
45838+ if (!cut)
45839+ kill_hook_ctail(coord, from, 0, (struct carry_kill_data *)p);
45840+
45841+ if (from == 0) {
45842+ if (count != nr_units_ctail(coord)) {
45843+			/* part of the item is removed, so move the free space to the
45844+			   beginning of the item and update the item key */
45845+ reiser4_key key;
45846+ memcpy(item + to + 1, item, sizeof(ctail_item_format));
45847+ item_key_by_coord(coord, &key);
45848+ set_key_offset(&key, get_key_offset(&key) + count);
45849+ node_plugin_by_node(coord->node)->update_item_key(coord,
45850+ &key,
45851+ NULL);
45852+ } else {
45853+			/* cut_units should not be called to cut everything */
45854+ assert("vs-1532", ergo(cut, 0));
45855+			/* whole item is cut, so more than the amount of space occupied
45856+			   by the units got freed */
45857+ count += sizeof(ctail_item_format);
45858+ }
45859+ if (REISER4_DEBUG)
45860+ memset(item, 0, count);
45861+ } else if (REISER4_DEBUG)
45862+ memset(item + sizeof(ctail_item_format) + from, 0, count);
45863+ return count;
45864+}
45865+
45866+/* plugin->u.item.b.cut_units */
45867+int
45868+cut_units_ctail(coord_t * item, pos_in_node_t from, pos_in_node_t to,
45869+ carry_cut_data * cdata, reiser4_key * smallest_removed,
45870+ reiser4_key * new_first)
45871+{
45872+ return cut_or_kill_ctail_units(item, from, to, 1, NULL,
45873+ smallest_removed, new_first);
45874+}
45875+
45876+/* plugin->u.item.b.kill_units */
45877+int
45878+kill_units_ctail(coord_t * item, pos_in_node_t from, pos_in_node_t to,
45879+ struct carry_kill_data *kdata, reiser4_key * smallest_removed,
45880+ reiser4_key * new_first)
45881+{
45882+ return cut_or_kill_ctail_units(item, from, to, 0, kdata,
45883+ smallest_removed, new_first);
45884+}
45885+
45886+/* plugin->u.item.s.file.read */
45887+int read_ctail(struct file *file UNUSED_ARG, flow_t * f, hint_t * hint)
45888+{
45889+ uf_coord_t *uf_coord;
45890+ coord_t *coord;
45891+
45892+ uf_coord = &hint->ext_coord;
45893+ coord = &uf_coord->coord;
45894+ assert("edward-127", f->user == 0);
45895+ assert("edward-129", coord && coord->node);
45896+ assert("edward-130", coord_is_existing_unit(coord));
45897+ assert("edward-132", znode_is_loaded(coord->node));
45898+
45899+	/* start reading only from the beginning of ctail */
45900+ assert("edward-133", coord->unit_pos == 0);
45901+ /* read only whole ctails */
45902+ assert("edward-135", nr_units_ctail(coord) <= f->length);
45903+
45904+ assert("edward-136", schedulable());
45905+ assert("edward-886", ctail_ok(coord));
45906+
45907+ if (f->data)
45908+ memcpy(f->data, (char *)first_unit(coord),
45909+ (size_t) nr_units_ctail(coord));
45910+
45911+ dclust_set_extension(hint);
45912+ mark_page_accessed(znode_page(coord->node));
45913+ move_flow_forward(f, nr_units_ctail(coord));
45914+
45915+ return 0;
45916+}
45917+
45918+/* Reads a disk cluster consisting of ctail items and
45919+   attaches a transform stream with the plain text */
45920+int ctail_read_disk_cluster(reiser4_cluster_t * clust, struct inode *inode,
45921+ int write)
45922+{
45923+ int result;
45924+ assert("edward-671", clust->hint != NULL);
45925+ assert("edward-140", clust->dstat == INVAL_DISK_CLUSTER);
45926+ assert("edward-672", crc_inode_ok(inode));
45927+
45928+ /* set input stream */
45929+ result = grab_tfm_stream(inode, &clust->tc, INPUT_STREAM);
45930+ if (result)
45931+ return result;
45932+
45933+ result = find_cluster(clust, inode, 1 /* read */ , write);
45934+ assert("edward-1340", !result);
45935+ if (result)
45936+ return result;
45937+ if (!write)
45938+		/* write still needs the lock to insert unprepped
45939+		   items, etc... */
45940+ put_hint_cluster(clust, inode, ZNODE_READ_LOCK);
45941+
45942+ assert("edward-673",
45943+ ergo(write, znode_is_write_locked(clust->hint->lh.node)));
45944+
45945+ if (clust->dstat == FAKE_DISK_CLUSTER ||
45946+ clust->dstat == UNPR_DISK_CLUSTER) {
45947+ tfm_cluster_set_uptodate(&clust->tc);
45948+ return 0;
45949+ }
45950+ result = grab_coa(&clust->tc, inode_compression_plugin(inode));
45951+ if (result)
45952+ return result;
45953+ result = inflate_cluster(clust, inode);
45954+ if (result)
45955+ return result;
45956+ tfm_cluster_set_uptodate(&clust->tc);
45957+ return 0;
45958+}
45959+
45960+/* read one locked page */
45961+int do_readpage_ctail(struct inode * inode, reiser4_cluster_t * clust,
45962+ struct page *page)
45963+{
45964+ int ret;
45965+ unsigned cloff;
45966+ char *data;
45967+ size_t pgcnt;
45968+ tfm_cluster_t *tc = &clust->tc;
45969+
45970+ assert("edward-212", PageLocked(page));
45971+
45972+ if (PageUptodate(page))
45973+ goto exit;
45974+
45975+ if (!tfm_cluster_is_uptodate(&clust->tc)) {
45976+ clust->index = pg_to_clust(page->index, inode);
45977+ unlock_page(page);
45978+ ret = ctail_read_disk_cluster(clust, inode, 0 /* read */ );
45979+ lock_page(page);
45980+ if (ret)
45981+ return ret;
45982+ }
45983+ if (PageUptodate(page))
45984+ /* races with another read/write */
45985+ goto exit;
45986+
45987+ /* bytes in the page */
45988+ pgcnt = cnt_to_pgcnt(i_size_read(inode), page->index);
45989+
45990+ if (pgcnt == 0) {
45991+ assert("edward-1290", 0);
45992+ return RETERR(-EINVAL);
45993+ }
45994+ assert("edward-119", tfm_cluster_is_uptodate(tc));
45995+
45996+ switch (clust->dstat) {
45997+ case UNPR_DISK_CLUSTER:
45998+ assert("edward-1285", 0);
45999+#if REISER4_DEBUG
46000+ warning("edward-1168",
46001+ "page %lu is not uptodate and disk cluster %lu (inode %llu) is unprepped\n",
46002+ page->index, clust->index,
46003+ (unsigned long long)get_inode_oid(inode));
46004+#endif
46005+ case FAKE_DISK_CLUSTER:
46006+		/* fill the page with zeros */
46007+ data = kmap_atomic(page, KM_USER0);
46008+
46009+ memset(data, 0, PAGE_CACHE_SIZE);
46010+ flush_dcache_page(page);
46011+ kunmap_atomic(data, KM_USER0);
46012+ SetPageUptodate(page);
46013+ break;
46014+ case PREP_DISK_CLUSTER:
46015+		/* fill the page with transformed data */
46016+ assert("edward-1058", !PageUptodate(page));
46017+ assert("edward-120", tc->len <= inode_cluster_size(inode));
46018+
46019+ /* start page offset in the cluster */
46020+ cloff = pg_to_off_to_cloff(page->index, inode);
46021+
46022+ data = kmap(page);
46023+ memcpy(data, tfm_stream_data(tc, OUTPUT_STREAM) + cloff, pgcnt);
46024+ memset(data + pgcnt, 0, (size_t) PAGE_CACHE_SIZE - pgcnt);
46025+ flush_dcache_page(page);
46026+ kunmap(page);
46027+ SetPageUptodate(page);
46028+ break;
46029+ default:
46030+ impossible("edward-1169", "bad disk cluster state");
46031+ }
46032+ exit:
46033+ return 0;
46034+}
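+
+/* Illustration of the pgcnt computation above (assuming 4096-byte pages
+   and that cnt_to_pgcnt() returns the number of file bytes covered by
+   the page, per the comment above): for i_size == 10000 and
+   page->index == 2, pgcnt == 10000 - 2 * 4096 == 1808, and the
+   PREP_DISK_CLUSTER branch zeroes the remaining 4096 - 1808 bytes. */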
46035+
46036+/* plugin->u.item.s.file.readpage */
46037+int readpage_ctail(void *vp, struct page *page)
46038+{
46039+ int result;
46040+ hint_t *hint;
46041+ reiser4_cluster_t *clust = vp;
46042+
46043+ assert("edward-114", clust != NULL);
46044+ assert("edward-115", PageLocked(page));
46045+ assert("edward-116", !PageUptodate(page));
46046+ assert("edward-117", !jprivate(page) && !PagePrivate(page));
46047+ assert("edward-118", page->mapping && page->mapping->host);
46048+ assert("edward-867", !tfm_cluster_is_uptodate(&clust->tc));
46049+
46050+ hint = kmalloc(sizeof(*hint), GFP_KERNEL);
46051+ if (hint == NULL)
46052+ return RETERR(-ENOMEM);
46053+ clust->hint = hint;
46054+ result = load_file_hint(clust->file, hint);
46055+ if (result) {
46056+ kfree(hint);
46057+ return result;
46058+ }
46059+ assert("vs-25", hint->ext_coord.lh == &hint->lh);
46060+ result = do_readpage_ctail(page->mapping->host, clust, page);
46061+
46062+ assert("edward-213", PageLocked(page));
46063+ assert("edward-1163", ergo(!result, PageUptodate(page)));
46064+ assert("edward-868",
46065+ ergo(!result, tfm_cluster_is_uptodate(&clust->tc)));
46066+
46067+ unlock_page(page);
46068+ done_lh(&hint->lh);
46069+ hint->ext_coord.valid = 0;
46070+ save_file_hint(clust->file, hint);
46071+ kfree(hint);
46072+ tfm_cluster_clr_uptodate(&clust->tc);
46073+
46074+ return result;
46075+}
46076+
46077+/* This unconditionally reads a disk cluster.
46078+ Helper function for ->readpages() */
46079+static int
46080+ctail_read_page_cluster(reiser4_cluster_t * clust, struct inode *inode)
46081+{
46082+ int i;
46083+ int result;
46084+ assert("edward-779", clust != NULL);
46085+ assert("edward-1059", clust->win == NULL);
46086+ assert("edward-780", inode != NULL);
46087+
46088+ result = prepare_page_cluster(inode, clust, 0 /* do not capture */ );
46089+ if (result)
46090+ return result;
46091+ result = ctail_read_disk_cluster(clust, inode, 0 /* read */ );
46092+ if (result)
46093+ goto out;
46094+ /* at this point stream with valid plain text is attached */
46095+ assert("edward-781", tfm_cluster_is_uptodate(&clust->tc));
46096+
46097+ for (i = 0; i < clust->nr_pages; i++) {
46098+ struct page *page = clust->pages[i];
46099+ lock_page(page);
46100+ result = do_readpage_ctail(inode, clust, page);
46101+ unlock_page(page);
46102+ if (result)
46103+ break;
46104+ }
46105+ tfm_cluster_clr_uptodate(&clust->tc);
46106+ out:
46107+ release_cluster_pages(clust);
46108+ return result;
46109+}
46110+
46111+#define list_to_page(head) (list_entry((head)->prev, struct page, lru))
46112+#define list_to_next_page(head) (list_entry((head)->prev->prev, struct page, lru))
46113+
46114+#if REISER4_DEBUG
46115+#define check_order(pages) \
46116+assert("edward-214", ergo(!list_empty(pages) && pages->next != pages->prev, \
46117+ list_to_page(pages)->index < list_to_next_page(pages)->index))
46118+#endif
46119+
46120+/* plugin->u.item.s.file.readpages
46121+ Populate an address space with some page clusters,
46122+ and start reads against them.
46123+ FIXME-EDWARD: this function should return errors?
46124+*/
46125+void
46126+readpages_ctail(void *vp, struct address_space *mapping,
46127+ struct list_head *pages)
46128+{
46129+ int ret = 0;
46130+ hint_t *hint;
46131+ reiser4_cluster_t clust;
46132+ struct page *page;
46133+ struct pagevec lru_pvec;
46134+ struct inode *inode = mapping->host;
46135+ int progress = 0;
46136+
46137+ assert("edward-214", ergo(!list_empty(pages) &&
46138+ pages->next != pages->prev,
46139+ list_to_page(pages)->index <
46140+ list_to_next_page(pages)->index));
46141+ pagevec_init(&lru_pvec, 0);
46142+ cluster_init_read(&clust, NULL);
46143+ clust.file = vp;
46144+ hint = kmalloc(sizeof(*hint), GFP_KERNEL);
46145+ if (hint == NULL) {
46146+ warning("vs-28", "failed to allocate hint");
46147+ goto exit1;
46148+ }
46149+ clust.hint = hint;
46150+ ret = load_file_hint(clust.file, hint);
46151+ if (ret)
46152+ goto exit2;
46153+ ret = alloc_cluster_pgset(&clust, cluster_nrpages(inode));
46154+ if (ret)
46155+ goto exit3;
46156+ assert("vs-26", hint->ext_coord.lh == &hint->lh);
46157+
46158+	/* address_space-level file readahead doesn't know about the
46159+	   reiser4 concept of clustering, so we work around this fact:
46160+	   for each page of the list @pages the address space is
46161+	   populated with that page's whole page cluster (see the
46162+	   example following this function). */
46163+ while (!list_empty(pages)) {
46164+ page = list_to_page(pages);
46165+ list_del(&page->lru);
46166+ if (add_to_page_cache(page, mapping, page->index, GFP_KERNEL)) {
46167+ page_cache_release(page);
46168+ continue;
46169+ }
46170+ if (PageUptodate(page)) {
46171+ if (!pagevec_add(&lru_pvec, page))
46172+ __pagevec_lru_add(&lru_pvec);
46173+ unlock_page(page);
46174+ continue;
46175+ }
46176+ unlock_page(page);
46177+
46178+ move_cluster_forward(&clust, inode, page->index, &progress);
46179+ ret = ctail_read_page_cluster(&clust, inode);
46180+ if (ret)
46181+ break;
46182+ assert("edward-869", !tfm_cluster_is_uptodate(&clust.tc));
46183+ lock_page(page);
46184+
46185+ ret = do_readpage_ctail(inode, &clust, page);
46186+ if (!pagevec_add(&lru_pvec, page))
46187+ __pagevec_lru_add(&lru_pvec);
46188+ if (ret) {
46189+ warning("edward-215", "do_readpage_ctail failed");
46190+ unlock_page(page);
46191+ break;
46192+ }
46193+ assert("edward-1061", PageUptodate(page));
46194+
46195+ unlock_page(page);
46196+ }
46197+ assert("edward-870", !tfm_cluster_is_uptodate(&clust.tc));
46198+ exit3:
46199+ done_lh(&hint->lh);
46200+ save_file_hint(clust.file, hint);
46201+ hint->ext_coord.valid = 0;
46202+ exit2:
46203+ kfree(hint);
46204+ exit1:
46205+ while (!list_empty(pages)) {
46206+ struct page *victim;
46207+ victim = list_to_page(pages);
46208+ list_del(&victim->lru);
46209+ page_cache_release(victim);
46210+ }
46211+ put_cluster_handle(&clust);
46212+ pagevec_lru_add(&lru_pvec);
46213+ return;
46214+}
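+
+/* Example of the workaround above (a sketch assuming 4KiB pages and a
+   64KiB logical cluster, i.e. cluster_nrpages() == 16): if readahead
+   hands us a single page with index 17, move_cluster_forward() positions
+   the handle at page cluster 1 and ctail_read_page_cluster() populates
+   and reads all 16 pages (indices 16..31) of that cluster, not just
+   page 17. */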
46215+
46216+/*
46217+ plugin->u.item.s.file.append_key
46218+ key of the first item of the next disk cluster
46219+*/
46220+reiser4_key *append_key_ctail(const coord_t * coord, reiser4_key * key)
46221+{
46222+ assert("edward-1241", item_id_by_coord(coord) == CTAIL_ID);
46223+ assert("edward-1242", cluster_shift_ok(cluster_shift_by_coord(coord)));
46224+
46225+ item_key_by_coord(coord, key);
46226+ set_key_offset(key,
46227+ ((__u64) (clust_by_coord(coord, NULL)) +
46228+ 1) << cluster_shift_by_coord(coord));
46229+ return key;
46230+}
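+
+/* Worked example for append_key_ctail() (assuming key offsets are byte
+   offsets): with cluster_shift_by_coord() == 16, i.e. 64KiB disk
+   clusters, and clust_by_coord() == 2, the append key offset is
+   (2 + 1) << 16 == 196608, the first byte of the next disk cluster. */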
46231+
46232+static int
46233+insert_unprepped_ctail(reiser4_cluster_t * clust, struct inode *inode)
46234+{
46235+ int result;
46236+ char buf[UCTAIL_NR_UNITS];
46237+ reiser4_item_data data;
46238+ reiser4_key key;
46239+ int shift = (int)UCTAIL_SHIFT;
46240+
46241+ memset(buf, 0, (size_t) UCTAIL_NR_UNITS);
46242+ result = key_by_inode_cryptcompress(inode,
46243+ clust_to_off(clust->index, inode),
46244+ &key);
46245+ if (result)
46246+ return result;
46247+ data.user = 0;
46248+ data.iplug = item_plugin_by_id(CTAIL_ID);
46249+ data.arg = &shift;
46250+ data.length = sizeof(ctail_item_format) + (size_t) UCTAIL_NR_UNITS;
46251+ data.data = buf;
46252+
46253+ result = insert_by_coord(&clust->hint->ext_coord.coord,
46254+ &data, &key, clust->hint->ext_coord.lh, 0);
46255+ return result;
46256+}
46257+
46258+static int
46259+insert_crc_flow(coord_t * coord, lock_handle * lh, flow_t * f,
46260+ struct inode *inode)
46261+{
46262+ int result;
46263+ carry_pool *pool;
46264+ carry_level *lowest_level;
46265+ reiser4_item_data *data;
46266+ carry_op *op;
46267+ int cluster_shift = inode_cluster_shift(inode);
46268+
46269+ pool =
46270+ init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
46271+ sizeof(*data));
46272+ if (IS_ERR(pool))
46273+ return PTR_ERR(pool);
46274+ lowest_level = (carry_level *) (pool + 1);
46275+ init_carry_level(lowest_level, pool);
46276+ data = (reiser4_item_data *) (lowest_level + 3);
46277+
46278+ assert("edward-466", coord->between == AFTER_ITEM
46279+ || coord->between == AFTER_UNIT || coord->between == BEFORE_ITEM
46280+ || coord->between == EMPTY_NODE
46281+ || coord->between == BEFORE_UNIT);
46282+
46283+ if (coord->between == AFTER_UNIT) {
46284+ coord->unit_pos = 0;
46285+ coord->between = AFTER_ITEM;
46286+ }
46287+ op = post_carry(lowest_level, COP_INSERT_FLOW, coord->node,
46288+ 0 /* operate directly on coord -> node */ );
46289+ if (IS_ERR(op) || (op == NULL)) {
46290+ done_carry_pool(pool);
46291+ return RETERR(op ? PTR_ERR(op) : -EIO);
46292+ }
46293+ data->user = 0;
46294+ data->iplug = item_plugin_by_id(CTAIL_ID);
46295+ data->arg = &cluster_shift;
46296+
46297+ data->length = 0;
46298+ data->data = NULL;
46299+
46300+ op->u.insert_flow.flags = COPI_DONT_SHIFT_LEFT | COPI_DONT_SHIFT_RIGHT;
46301+ op->u.insert_flow.insert_point = coord;
46302+ op->u.insert_flow.flow = f;
46303+ op->u.insert_flow.data = data;
46304+ op->u.insert_flow.new_nodes = 0;
46305+
46306+ lowest_level->track_type = CARRY_TRACK_CHANGE;
46307+ lowest_level->tracked = lh;
46308+
46309+ result = carry(lowest_level, NULL);
46310+ done_carry_pool(pool);
46311+
46312+ return result;
46313+}
46314+
46315+/* Implementation of CRC_APPEND_ITEM mode of ctail conversion */
46316+static int
46317+insert_crc_flow_in_place(coord_t * coord, lock_handle * lh, flow_t * f,
46318+ struct inode *inode)
46319+{
46320+ int ret;
46321+ coord_t pos;
46322+ lock_handle lock;
46323+
46324+ assert("edward-674", f->length <= inode_scaled_cluster_size(inode));
46325+ assert("edward-484", coord->between == AT_UNIT
46326+ || coord->between == AFTER_ITEM);
46327+ assert("edward-485", item_id_by_coord(coord) == CTAIL_ID);
46328+
46329+ coord_dup(&pos, coord);
46330+ pos.unit_pos = 0;
46331+ pos.between = AFTER_ITEM;
46332+
46333+ init_lh(&lock);
46334+ copy_lh(&lock, lh);
46335+
46336+ ret = insert_crc_flow(&pos, &lock, f, inode);
46337+ done_lh(&lock);
46338+ assert("edward-1347", znode_is_write_locked(lh->node));
46339+ assert("edward-1228", !ret);
46340+ return ret;
46341+}
46342+
46343+/* Implementation of CRC_OVERWRITE_ITEM mode of ctail conversion */
46344+static int overwrite_ctail(coord_t * coord, flow_t * f)
46345+{
46346+ unsigned count;
46347+
46348+ assert("edward-269", f->user == 0);
46349+ assert("edward-270", f->data != NULL);
46350+ assert("edward-271", f->length > 0);
46351+ assert("edward-272", coord_is_existing_unit(coord));
46352+ assert("edward-273", coord->unit_pos == 0);
46353+ assert("edward-274", znode_is_write_locked(coord->node));
46354+ assert("edward-275", schedulable());
46355+ assert("edward-467", item_id_by_coord(coord) == CTAIL_ID);
46356+ assert("edward-1243", ctail_ok(coord));
46357+
46358+ count = nr_units_ctail(coord);
46359+
46360+ if (count > f->length)
46361+ count = f->length;
46362+ memcpy(first_unit(coord), f->data, count);
46363+ move_flow_forward(f, count);
46364+ coord->unit_pos += count;
46365+ return 0;
46366+}
46367+
46368+/* Implementation of CRC_CUT_ITEM mode of ctail conversion:
46369+ cut ctail (part or whole) starting from next unit position */
46370+static int cut_ctail(coord_t * coord)
46371+{
46372+ coord_t stop;
46373+
46374+ assert("edward-435", coord->between == AT_UNIT &&
46375+ coord->item_pos < coord_num_items(coord) &&
46376+ coord->unit_pos <= coord_num_units(coord));
46377+
46378+ if (coord->unit_pos == coord_num_units(coord))
46379+ /* nothing to cut */
46380+ return 0;
46381+ coord_dup(&stop, coord);
46382+ stop.unit_pos = coord_last_unit_pos(coord);
46383+
46384+ return cut_node_content(coord, &stop, NULL, NULL, NULL);
46385+}
46386+
46387+int
46388+ctail_insert_unprepped_cluster(reiser4_cluster_t * clust, struct inode *inode)
46389+{
46390+ int result;
46391+ assert("edward-1244", inode != NULL);
46392+ assert("edward-1245", clust->hint != NULL);
46393+ assert("edward-1246", clust->dstat == FAKE_DISK_CLUSTER);
46394+ assert("edward-1247", clust->reserved == 1);
46395+ assert("edward-1248", get_current_context()->grabbed_blocks ==
46396+ estimate_insert_cluster(inode));
46397+
46398+ result = get_disk_cluster_locked(clust, inode, ZNODE_WRITE_LOCK);
46399+ if (cbk_errored(result))
46400+ return result;
46401+ assert("edward-1249", result == CBK_COORD_NOTFOUND);
46402+ assert("edward-1250", znode_is_write_locked(clust->hint->lh.node));
46403+
46404+ assert("edward-1295",
46405+ clust->hint->ext_coord.lh->node ==
46406+ clust->hint->ext_coord.coord.node);
46407+
46408+ coord_set_between_clusters(&clust->hint->ext_coord.coord);
46409+
46410+ result = insert_unprepped_ctail(clust, inode);
46411+ all_grabbed2free();
46412+
46413+ assert("edward-1251", !result);
46414+ assert("edward-1252", crc_inode_ok(inode));
46415+ assert("edward-1253", znode_is_write_locked(clust->hint->lh.node));
46416+ assert("edward-1254",
46417+ reiser4_clustered_blocks(reiser4_get_current_sb()));
46418+ assert("edward-1255",
46419+ znode_convertible(clust->hint->ext_coord.coord.node));
46420+
46421+ return result;
46422+}
46423+
46424+static int do_convert_ctail(flush_pos_t * pos, crc_write_mode_t mode)
46425+{
46426+ int result = 0;
46427+ convert_item_info_t *info;
46428+
46429+ assert("edward-468", pos != NULL);
46430+ assert("edward-469", pos->sq != NULL);
46431+ assert("edward-845", item_convert_data(pos) != NULL);
46432+
46433+ info = item_convert_data(pos);
46434+ assert("edward-679", info->flow.data != NULL);
46435+
46436+ switch (mode) {
46437+ case CRC_APPEND_ITEM:
46438+ assert("edward-1229", info->flow.length != 0);
46439+ assert("edward-1256",
46440+ cluster_shift_ok(cluster_shift_by_coord(&pos->coord)));
46441+ result =
46442+ insert_crc_flow_in_place(&pos->coord, &pos->lock,
46443+ &info->flow, info->inode);
46444+ break;
46445+ case CRC_OVERWRITE_ITEM:
46446+ assert("edward-1230", info->flow.length != 0);
46447+ overwrite_ctail(&pos->coord, &info->flow);
46448+ if (info->flow.length != 0)
46449+ break;
46450+ case CRC_CUT_ITEM:
46451+ assert("edward-1231", info->flow.length == 0);
46452+ result = cut_ctail(&pos->coord);
46453+ break;
46454+ default:
46455+ result = RETERR(-EIO);
46456+ impossible("edward-244", "bad convert mode");
46457+ }
46458+ return result;
46459+}
46460+
46461+/* plugin->u.item.f.scan */
46462+int scan_ctail(flush_scan * scan)
46463+{
46464+ int result = 0;
46465+ struct page *page;
46466+ struct inode *inode;
46467+ jnode *node = scan->node;
46468+
46469+ assert("edward-227", scan->node != NULL);
46470+ assert("edward-228", jnode_is_cluster_page(scan->node));
46471+ assert("edward-639", znode_is_write_locked(scan->parent_lock.node));
46472+
46473+ page = jnode_page(node);
46474+ inode = page->mapping->host;
46475+
46476+ if (!scanning_left(scan))
46477+ return result;
46478+ if (!ZF_ISSET(scan->parent_lock.node, JNODE_DIRTY))
46479+ znode_make_dirty(scan->parent_lock.node);
46480+
46481+ if (!znode_convertible(scan->parent_lock.node)) {
46482+ if (JF_ISSET(scan->node, JNODE_DIRTY))
46483+ znode_set_convertible(scan->parent_lock.node);
46484+ else {
46485+ warning("edward-681",
46486+ "cluster page is already processed");
46487+ return -EAGAIN;
46488+ }
46489+ }
46490+ return result;
46491+}
46492+
46493+/* Returns true if convert info should be attached; then pos->child is set */
46494+static int should_attach_convert_idata(flush_pos_t * pos)
46495+{
46496+ int result;
46497+ assert("edward-431", pos != NULL);
46498+ assert("edward-432", pos->child == NULL);
46499+ assert("edward-619", znode_is_write_locked(pos->coord.node));
46500+ assert("edward-470",
46501+ item_plugin_by_coord(&pos->coord) ==
46502+ item_plugin_by_id(CTAIL_ID));
46503+
46504+ /* check for leftmost child */
46505+ utmost_child_ctail(&pos->coord, LEFT_SIDE, &pos->child);
46506+
46507+ if (!pos->child)
46508+ return 0;
46509+ spin_lock_jnode(pos->child);
46510+ result = (JF_ISSET(pos->child, JNODE_DIRTY) &&
46511+ pos->child->atom == ZJNODE(pos->coord.node)->atom);
46512+ spin_unlock_jnode(pos->child);
46513+ if (!result && pos->child) {
46514+		/* the found child is not to be attached, release it */
46515+ jput(pos->child);
46516+ pos->child = NULL;
46517+ }
46518+ return result;
46519+}
46520+
46521+/* plugin->init_convert_data() */
46522+static int
46523+init_convert_data_ctail(convert_item_info_t * idata, struct inode *inode)
46524+{
46525+ assert("edward-813", idata != NULL);
46526+ assert("edward-814", inode != NULL);
46527+
46528+ idata->inode = inode;
46529+ idata->d_cur = DC_FIRST_ITEM;
46530+ idata->d_next = DC_INVALID_STATE;
46531+
46532+ return 0;
46533+}
46534+
46535+static int alloc_item_convert_data(convert_info_t * sq)
46536+{
46537+ assert("edward-816", sq != NULL);
46538+ assert("edward-817", sq->itm == NULL);
46539+
46540+ sq->itm = kmalloc(sizeof(*sq->itm), GFP_KERNEL);
46541+ if (sq->itm == NULL)
46542+ return RETERR(-ENOMEM);
46543+ return 0;
46544+}
46545+
46546+static void free_item_convert_data(convert_info_t * sq)
46547+{
46548+ assert("edward-818", sq != NULL);
46549+ assert("edward-819", sq->itm != NULL);
46550+ assert("edward-820", sq->iplug != NULL);
46551+
46552+ kfree(sq->itm);
46553+ sq->itm = NULL;
46554+ return;
46555+}
46556+
46557+static int alloc_convert_data(flush_pos_t * pos)
46558+{
46559+ assert("edward-821", pos != NULL);
46560+ assert("edward-822", pos->sq == NULL);
46561+
46562+ pos->sq = kmalloc(sizeof(*pos->sq), GFP_KERNEL);
46563+ if (!pos->sq)
46564+ return RETERR(-ENOMEM);
46565+ memset(pos->sq, 0, sizeof(*pos->sq));
46566+ cluster_init_write(&pos->sq->clust, 0);
46567+ return 0;
46568+}
46569+
46570+void free_convert_data(flush_pos_t * pos)
46571+{
46572+ convert_info_t *sq;
46573+
46574+ assert("edward-823", pos != NULL);
46575+ assert("edward-824", pos->sq != NULL);
46576+
46577+ sq = pos->sq;
46578+ if (sq->itm)
46579+ free_item_convert_data(sq);
46580+ put_cluster_handle(&sq->clust);
46581+ kfree(pos->sq);
46582+ pos->sq = NULL;
46583+ return;
46584+}
46585+
46586+static int init_item_convert_data(flush_pos_t * pos, struct inode *inode)
46587+{
46588+ convert_info_t *sq;
46589+
46590+ assert("edward-825", pos != NULL);
46591+ assert("edward-826", pos->sq != NULL);
46592+ assert("edward-827", item_convert_data(pos) != NULL);
46593+ assert("edward-828", inode != NULL);
46594+
46595+ sq = pos->sq;
46596+
46597+ memset(sq->itm, 0, sizeof(*sq->itm));
46598+
46599+ /* iplug->init_convert_data() */
46600+ return init_convert_data_ctail(sq->itm, inode);
46601+}
46602+
46603+/* create and attach disk cluster info used by 'convert' phase of the flush
46604+ squalloc() */
46605+static int attach_convert_idata(flush_pos_t * pos, struct inode *inode)
46606+{
46607+ int ret = 0;
46608+ convert_item_info_t *info;
46609+ reiser4_cluster_t *clust;
46610+ file_plugin *fplug = inode_file_plugin(inode);
46611+ compression_plugin *cplug = inode_compression_plugin(inode);
46612+
46613+ assert("edward-248", pos != NULL);
46614+ assert("edward-249", pos->child != NULL);
46615+ assert("edward-251", inode != NULL);
46616+ assert("edward-682", crc_inode_ok(inode));
46617+ assert("edward-252", fplug == file_plugin_by_id(CRC_FILE_PLUGIN_ID));
46618+ assert("edward-473",
46619+ item_plugin_by_coord(&pos->coord) ==
46620+ item_plugin_by_id(CTAIL_ID));
46621+
46622+ if (!pos->sq) {
46623+ ret = alloc_convert_data(pos);
46624+ if (ret)
46625+ return ret;
46626+ }
46627+ clust = &pos->sq->clust;
46628+ ret = grab_coa(&clust->tc, cplug);
46629+ if (ret)
46630+ goto err;
46631+ ret = set_cluster_by_page(clust,
46632+ jnode_page(pos->child),
46633+ MAX_CLUSTER_NRPAGES);
46634+ if (ret)
46635+ goto err;
46636+
46637+ assert("edward-829", pos->sq != NULL);
46638+ assert("edward-250", item_convert_data(pos) == NULL);
46639+
46640+ pos->sq->iplug = item_plugin_by_id(CTAIL_ID);
46641+
46642+ ret = alloc_item_convert_data(pos->sq);
46643+ if (ret)
46644+ goto err;
46645+ ret = init_item_convert_data(pos, inode);
46646+ if (ret)
46647+ goto err;
46648+ info = item_convert_data(pos);
46649+
46650+ ret = flush_cluster_pages(clust, pos->child, inode);
46651+ if (ret)
46652+ goto err;
46653+
46654+ deflate_cluster(clust, inode);
46655+ inc_item_convert_count(pos);
46656+
46657+	/* build a flow from the transformed stream */
46658+ fplug->flow_by_inode(info->inode,
46659+ (const char __user *)tfm_stream_data(&clust->tc, OUTPUT_STREAM),
46660+ 0 /* kernel space */ ,
46661+ clust->tc.len,
46662+ clust_to_off(clust->index, inode),
46663+ WRITE_OP, &info->flow);
46664+ jput(pos->child);
46665+
46666+ assert("edward-683", crc_inode_ok(inode));
46667+ return 0;
46668+ err:
46669+ jput(pos->child);
46670+ free_convert_data(pos);
46671+ return ret;
46672+}
46673+
46674+/* clear up disk cluster info */
46675+static void detach_convert_idata(convert_info_t * sq)
46676+{
46677+ convert_item_info_t *info;
46678+
46679+ assert("edward-253", sq != NULL);
46680+ assert("edward-840", sq->itm != NULL);
46681+
46682+ info = sq->itm;
46683+ assert("edward-255", info->inode != NULL);
46684+ assert("edward-1212", info->flow.length == 0);
46685+
46686+ free_item_convert_data(sq);
46687+ return;
46688+}
46689+
46690+/* plugin->u.item.f.utmost_child */
46691+
46692+/* This function sets *child to the leftmost child of the first
46693+   cluster item if such a child exists, and to NULL otherwise.
46694+   NOTE-EDWARD: Do not call this for RIGHT_SIDE */
46695+
46696+int utmost_child_ctail(const coord_t * coord, sideof side, jnode ** child)
46697+{
46698+ reiser4_key key;
46699+
46700+ item_key_by_coord(coord, &key);
46701+
46702+ assert("edward-257", coord != NULL);
46703+ assert("edward-258", child != NULL);
46704+ assert("edward-259", side == LEFT_SIDE);
46705+ assert("edward-260",
46706+ item_plugin_by_coord(coord) == item_plugin_by_id(CTAIL_ID));
46707+
46708+ if (!is_disk_cluster_key(&key, coord))
46709+ *child = NULL;
46710+ else
46711+ *child = jlookup(current_tree,
46712+ get_key_objectid(item_key_by_coord
46713+ (coord, &key)),
46714+ off_to_pg(get_key_offset(&key)));
46715+ return 0;
46716+}
46717+
46718+/* Returns true if @p2 is the item following @p1
46719+   in the _same_ disk cluster.
46720+   A disk cluster is a set of items. If ->clustered() != NULL,
46721+   the whole disk cluster should be read/modified along with
46722+   each of its items. */
46723+static int clustered_ctail(const coord_t * p1, const coord_t * p2)
46724+{
46725+ return mergeable_ctail(p1, p2);
46726+}
46727+
46728+/* Go rightward and check for the next disk cluster item; set
46729+   d_next to DC_CHAINED_ITEM if such an item exists.
46730+   If the current position is the last item, go to the right
46731+   neighbor. Skip empty nodes. Note that a right neighbor may
46732+   not be in the slum because of races; if so, make it dirty
46733+   and convertible.
46734+*/
46735+static int next_item_dc_stat(flush_pos_t * pos)
46736+{
46737+ int ret = 0;
46738+ int stop = 0;
46739+ znode *cur;
46740+ coord_t coord;
46741+ lock_handle lh;
46742+ lock_handle right_lock;
46743+
46744+ assert("edward-1232", !node_is_empty(pos->coord.node));
46745+ assert("edward-1014",
46746+ pos->coord.item_pos < coord_num_items(&pos->coord));
46747+ assert("edward-1015", chaining_data_present(pos));
46748+ assert("edward-1017",
46749+ item_convert_data(pos)->d_next == DC_INVALID_STATE);
46750+
46751+ item_convert_data(pos)->d_next = DC_AFTER_CLUSTER;
46752+
46753+ if (item_convert_data(pos)->d_cur == DC_AFTER_CLUSTER)
46754+ return ret;
46755+ if (pos->coord.item_pos < coord_num_items(&pos->coord) - 1)
46756+ return ret;
46757+
46758+ /* check next slum item */
46759+ init_lh(&right_lock);
46760+ cur = pos->coord.node;
46761+
46762+ while (!stop) {
46763+ init_lh(&lh);
46764+ ret = reiser4_get_right_neighbor(&lh,
46765+ cur,
46766+ ZNODE_WRITE_LOCK,
46767+ GN_CAN_USE_UPPER_LEVELS);
46768+ if (ret)
46769+ break;
46770+ ret = zload(lh.node);
46771+ if (ret) {
46772+ done_lh(&lh);
46773+ break;
46774+ }
46775+ coord_init_before_first_item(&coord, lh.node);
46776+
46777+ if (node_is_empty(lh.node)) {
46778+ znode_make_dirty(lh.node);
46779+ znode_set_convertible(lh.node);
46780+ stop = 0;
46781+ } else if (clustered_ctail(&pos->coord, &coord)) {
46782+
46783+ item_convert_data(pos)->d_next = DC_CHAINED_ITEM;
46784+
46785+ if (!ZF_ISSET(lh.node, JNODE_DIRTY)) {
46786+ /*
46787+ warning("edward-1024",
46788+ "next slum item mergeable, "
46789+ "but znode %p isn't dirty\n",
46790+ lh.node);
46791+ */
46792+ znode_make_dirty(lh.node);
46793+ }
46794+ if (!znode_convertible(lh.node)) {
46795+ /*
46796+ warning("edward-1272",
46797+ "next slum item mergeable, "
46798+ "but znode %p isn't convertible\n",
46799+ lh.node);
46800+ */
46801+ znode_set_convertible(lh.node);
46802+ }
46803+ stop = 1;
46804+ } else
46805+ stop = 1;
46806+ zrelse(lh.node);
46807+ done_lh(&right_lock);
46808+ copy_lh(&right_lock, &lh);
46809+ done_lh(&lh);
46810+ cur = right_lock.node;
46811+ }
46812+ done_lh(&right_lock);
46813+
46814+ if (ret == -E_NO_NEIGHBOR)
46815+ ret = 0;
46816+ return ret;
46817+}
46818+
46819+static int
46820+assign_convert_mode(convert_item_info_t * idata, crc_write_mode_t * mode)
46821+{
46822+ int result = 0;
46823+
46824+ assert("edward-1025", idata != NULL);
46825+
46826+ if (idata->flow.length) {
46827+ /* append or overwrite */
46828+ switch (idata->d_cur) {
46829+ case DC_FIRST_ITEM:
46830+ case DC_CHAINED_ITEM:
46831+ *mode = CRC_OVERWRITE_ITEM;
46832+ break;
46833+ case DC_AFTER_CLUSTER:
46834+ *mode = CRC_APPEND_ITEM;
46835+ break;
46836+ default:
46837+ impossible("edward-1018", "wrong current item state");
46838+ }
46839+ } else {
46840+ /* cut or invalidate */
46841+ switch (idata->d_cur) {
46842+ case DC_FIRST_ITEM:
46843+ case DC_CHAINED_ITEM:
46844+ *mode = CRC_CUT_ITEM;
46845+ break;
46846+ case DC_AFTER_CLUSTER:
46847+ result = 1;
46848+ break;
46849+ default:
46850+ impossible("edward-1019", "wrong current item state");
46851+ }
46852+ }
46853+ return result;
46854+}
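+
+/* Summary of the mode assignment above:
+
+   flow.length  d_cur                           resulting mode
+   != 0         DC_FIRST_ITEM, DC_CHAINED_ITEM  CRC_OVERWRITE_ITEM
+   != 0         DC_AFTER_CLUSTER                CRC_APPEND_ITEM
+   == 0         DC_FIRST_ITEM, DC_CHAINED_ITEM  CRC_CUT_ITEM
+   == 0         DC_AFTER_CLUSTER                none (returns 1) */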
46855+
46856+/* plugin->u.item.f.convert */
46857+/* write ctail in guessed mode */
46858+int convert_ctail(flush_pos_t * pos)
46859+{
46860+ int result;
46861+ int nr_items;
46862+ crc_write_mode_t mode = CRC_OVERWRITE_ITEM;
46863+
46864+ assert("edward-1020", pos != NULL);
46865+ assert("edward-1213", coord_num_items(&pos->coord) != 0);
46866+ assert("edward-1257", item_id_by_coord(&pos->coord) == CTAIL_ID);
46867+ assert("edward-1258", ctail_ok(&pos->coord));
46868+ assert("edward-261", pos->coord.node != NULL);
46869+
46870+ nr_items = coord_num_items(&pos->coord);
46871+ if (!chaining_data_present(pos)) {
46872+ if (should_attach_convert_idata(pos)) {
46873+ /* attach convert item info */
46874+ struct inode *inode;
46875+
46876+ assert("edward-264", pos->child != NULL);
46877+ assert("edward-265", jnode_page(pos->child) != NULL);
46878+ assert("edward-266",
46879+ jnode_page(pos->child)->mapping != NULL);
46880+
46881+ inode = jnode_page(pos->child)->mapping->host;
46882+
46883+ assert("edward-267", inode != NULL);
46884+
46885+ /* attach item convert info by child and put the last one */
46886+ result = attach_convert_idata(pos, inode);
46887+ pos->child = NULL;
46888+ if (result == -E_REPEAT) {
46889+				/* jnode became clean, or there are no dirty
46890+				   pages (nothing to update in the disk cluster) */
46891+ warning("edward-1021",
46892+ "convert_ctail: nothing to attach");
46893+ return 0;
46894+ }
46895+ if (result != 0)
46896+ return result;
46897+ } else
46898+ /* unconvertible */
46899+ return 0;
46900+ } else {
46901+ /* use old convert info */
46902+
46903+ convert_item_info_t *idata;
46904+
46905+ idata = item_convert_data(pos);
46906+
46907+ result = assign_convert_mode(idata, &mode);
46908+ if (result) {
46909+ /* disk cluster is over,
46910+ nothing to update anymore */
46911+ detach_convert_idata(pos->sq);
46912+ return 0;
46913+ }
46914+ }
46915+
46916+ assert("edward-433", chaining_data_present(pos));
46917+ assert("edward-1022",
46918+ pos->coord.item_pos < coord_num_items(&pos->coord));
46919+
46920+ result = next_item_dc_stat(pos);
46921+ if (result) {
46922+ detach_convert_idata(pos->sq);
46923+ return result;
46924+ }
46925+ result = do_convert_ctail(pos, mode);
46926+ if (result) {
46927+ detach_convert_idata(pos->sq);
46928+ return result;
46929+ }
46930+ switch (mode) {
46931+ case CRC_CUT_ITEM:
46932+ assert("edward-1214", item_convert_data(pos)->flow.length == 0);
46933+ assert("edward-1215",
46934+ coord_num_items(&pos->coord) == nr_items ||
46935+ coord_num_items(&pos->coord) == nr_items - 1);
46936+ if (item_convert_data(pos)->d_next == DC_CHAINED_ITEM)
46937+ break;
46938+ if (coord_num_items(&pos->coord) != nr_items) {
46939+ /* the item was killed, no more chained items */
46940+ detach_convert_idata(pos->sq);
46941+ if (!node_is_empty(pos->coord.node))
46942+ /* make sure the next item will be scanned */
46943+ coord_init_before_item(&pos->coord);
46944+ break;
46945+ }
46946+ case CRC_APPEND_ITEM:
46947+ assert("edward-434", item_convert_data(pos)->flow.length == 0);
46948+ detach_convert_idata(pos->sq);
46949+ break;
46950+ case CRC_OVERWRITE_ITEM:
46951+ if (coord_is_unprepped_ctail(&pos->coord)) {
46952+			/* convert unprepped ctail to a prepped one */
46953+ int shift;
46954+ shift =
46955+ inode_cluster_shift(item_convert_data(pos)->inode);
46956+ assert("edward-1259", cluster_shift_ok(shift));
46957+ put_unaligned((d8)shift,
46958+ &ctail_formatted_at(&pos->coord)->
46959+ cluster_shift);
46960+ }
46961+ break;
46962+ }
46963+ return result;
46964+}
46965+
46966+/* Make Linus happy.
46967+ Local variables:
46968+ c-indentation-style: "K&R"
46969+ mode-name: "LC"
46970+ c-basic-offset: 8
46971+ tab-width: 8
46972+ fill-column: 120
46973+ End:
46974+*/
46975Index: linux-2.6.16/fs/reiser4/plugin/item/ctail.h
46976===================================================================
46977--- /dev/null
46978+++ linux-2.6.16/fs/reiser4/plugin/item/ctail.h
46979@@ -0,0 +1,89 @@
46980+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
46981+
46982+#if !defined( __FS_REISER4_CTAIL_H__ )
46983+#define __FS_REISER4_CTAIL_H__
46984+
46985+/* cryptcompress object item. See ctail.c for description. */
46986+
46987+#define UCTAIL_NR_UNITS 1
46988+#define UCTAIL_SHIFT 0xff
46989+
46990+typedef struct ctail_item_format {
46991+ /* cluster shift */
46992+ d8 cluster_shift;
46993+ /* ctail body */
46994+ d8 body[0];
46995+} __attribute__ ((packed)) ctail_item_format;
46996+
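+/* Example (assuming d8 is a one-byte type): a ctail item carrying N bytes
+   of transformed cluster data occupies sizeof(ctail_item_format) + N ==
+   1 + N bytes of node space, which is why cut_or_kill_ctail_units() adds
+   sizeof(ctail_item_format) to the freed count when the whole item is
+   removed. */
+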
46997+/* The following is a set of possible item states in a disk cluster.
46998+   A disk cluster is a set of items whose keys belong to the interval
46999+   [dc_key, dc_key + disk_cluster_size - 1] */
47000+typedef enum {
47001+ DC_INVALID_STATE = 0,
47002+ DC_FIRST_ITEM = 1,
47003+ DC_CHAINED_ITEM = 2,
47004+ DC_AFTER_CLUSTER = 3
47005+} dc_item_stat;
47006+
47007+typedef struct {
47008+ int shift; /* we keep here a cpu value of cluster_shift field
47009+ of ctail_item_format (see above) */
47010+} ctail_coord_extension_t;
47011+
47012+struct cut_list;
47013+
47014+/* plugin->item.b.* */
47015+int can_contain_key_ctail(const coord_t *, const reiser4_key *,
47016+ const reiser4_item_data *);
47017+int mergeable_ctail(const coord_t * p1, const coord_t * p2);
47018+pos_in_node_t nr_units_ctail(const coord_t * coord);
47019+int estimate_ctail(const coord_t * coord, const reiser4_item_data * data);
47020+void print_ctail(const char *prefix, coord_t * coord);
47021+lookup_result lookup_ctail(const reiser4_key *, lookup_bias, coord_t *);
47022+
47023+int paste_ctail(coord_t * coord, reiser4_item_data * data,
47024+ carry_plugin_info * info UNUSED_ARG);
47025+int init_ctail(coord_t *, coord_t *, reiser4_item_data *);
47026+int can_shift_ctail(unsigned free_space, coord_t * coord,
47027+ znode * target, shift_direction pend, unsigned *size,
47028+ unsigned want);
47029+void copy_units_ctail(coord_t * target, coord_t * source, unsigned from,
47030+ unsigned count, shift_direction where_is_free_space,
47031+ unsigned free_space);
47032+int cut_units_ctail(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
47033+ carry_cut_data *, reiser4_key * smallest_removed,
47034+ reiser4_key * new_first);
47035+int kill_units_ctail(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
47036+ carry_kill_data *, reiser4_key * smallest_removed,
47037+ reiser4_key * new_first);
47038+int ctail_ok(const coord_t * coord);
47039+int check_ctail(const coord_t * coord, const char **error);
47040+
47041+/* plugin->u.item.s.* */
47042+int read_ctail(struct file *, flow_t *, hint_t *);
47043+int readpage_ctail(void *, struct page *);
47044+void readpages_ctail(void *, struct address_space *, struct list_head *);
47045+reiser4_key *append_key_ctail(const coord_t *, reiser4_key *);
47046+int create_hook_ctail(const coord_t * coord, void *arg);
47047+int kill_hook_ctail(const coord_t *, pos_in_node_t, pos_in_node_t,
47048+ carry_kill_data *);
47049+int shift_hook_ctail(const coord_t *, unsigned, unsigned, znode *);
47050+
47051+/* plugin->u.item.f */
47052+int utmost_child_ctail(const coord_t *, sideof, jnode **);
47053+int scan_ctail(flush_scan *);
47054+int convert_ctail(flush_pos_t *);
47055+size_t inode_scaled_cluster_size(struct inode *);
47056+int cluster_shift_by_coord(const coord_t * coord);
47057+
47058+#endif /* __FS_REISER4_CTAIL_H__ */
47059+
47060+/* Make Linus happy.
47061+ Local variables:
47062+ c-indentation-style: "K&R"
47063+ mode-name: "LC"
47064+ c-basic-offset: 8
47065+ tab-width: 8
47066+ fill-column: 120
47067+ End:
47068+*/
47069Index: linux-2.6.16/fs/reiser4/plugin/item/extent.c
47070===================================================================
47071--- /dev/null
47072+++ linux-2.6.16/fs/reiser4/plugin/item/extent.c
47073@@ -0,0 +1,197 @@
47074+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
47075+
47076+#include "item.h"
47077+#include "../../key.h"
47078+#include "../../super.h"
47079+#include "../../carry.h"
47080+#include "../../inode.h"
47081+#include "../../page_cache.h"
47082+#include "../../flush.h"
47083+#include "../object.h"
47084+
47085+/* prepare structure reiser4_item_data. It is used to put one extent unit into the tree */
47086+/* Audited by: green(2002.06.13) */
47087+reiser4_item_data *init_new_extent(reiser4_item_data * data, void *ext_unit,
47088+ int nr_extents)
47089+{
47090+ data->data = ext_unit;
47091+ /* data->data is kernel space */
47092+ data->user = 0;
47093+ data->length = sizeof(reiser4_extent) * nr_extents;
47094+ data->arg = NULL;
47095+ data->iplug = item_plugin_by_id(EXTENT_POINTER_ID);
47096+ return data;
47097+}
47098+
47099+/* how many bytes are addressed by the first @nr extents of the extent item */
47100+reiser4_block_nr extent_size(const coord_t * coord, pos_in_node_t nr)
47101+{
47102+ pos_in_node_t i;
47103+ reiser4_block_nr blocks;
47104+ reiser4_extent *ext;
47105+
47106+ ext = item_body_by_coord(coord);
47107+ assert("vs-263", nr <= nr_units_extent(coord));
47108+
47109+ blocks = 0;
47110+ for (i = 0; i < nr; i++, ext++) {
47111+ blocks += extent_get_width(ext);
47112+ }
47113+
47114+ return blocks * current_blocksize;
47115+}
47116+
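+/* Worked example (assuming a 4096-byte block size): for an item whose
+   first three extents have widths 2, 5 and 1,
+   extent_size(coord, 3) == (2 + 5 + 1) * 4096 == 32768 bytes. */
+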
47117+extent_state state_of_extent(reiser4_extent * ext)
47118+{
47119+ switch ((int)extent_get_start(ext)) {
47120+ case 0:
47121+ return HOLE_EXTENT;
47122+ case 1:
47123+ return UNALLOCATED_EXTENT;
47124+ default:
47125+ break;
47126+ }
47127+ return ALLOCATED_EXTENT;
47128+}
47129+
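+/* The start field doubles as a state tag, per the cases above: start == 0
+   (HOLE_EXTENT_START) denotes a hole, start == 1 (UNALLOCATED_EXTENT_START)
+   an unallocated extent, and any real block number, e.g. start == 2000, an
+   allocated extent. */
+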
47130+int extent_is_unallocated(const coord_t * item)
47131+{
47132+ assert("jmacd-5133", item_is_extent(item));
47133+
47134+ return state_of_extent(extent_by_coord(item)) == UNALLOCATED_EXTENT;
47135+}
47136+
47137+/* set extent's start and width */
47138+void
47139+set_extent(reiser4_extent * ext, reiser4_block_nr start, reiser4_block_nr width)
47140+{
47141+ extent_set_start(ext, start);
47142+ extent_set_width(ext, width);
47143+}
47144+
47145+
47146+/**
47147+ * replace_extent - replace an extent and paste 1 or 2 new ones after it
47148+ * @h: replace handle; carries the coord and lock handle of the extent
47149+ *     to be overwritten (@h->coord, @h->lh), the replacement extent
47150+ *     (@h->overwrite), the 1 or 2 extents to paste after it
47151+ *     (@h->new_extents, @h->nr_new_extents), the paste key and the
47152+ *     insertion flags
47153+ * @return_inserted_position: selects what @h->coord and @h->lh are set
47154+ *     to on return
47155+ *
47156+ * Overwrites one extent, pastes 1 or 2 more ones after the overwritten
47157+ * one. If @return_inserted_position is 1, @h->coord and @h->lh are
47158+ * returned set to the first of the newly inserted units; if it is 0,
47159+ * they are returned set to the extent which was overwritten.
47160+ */
47161+int replace_extent(struct replace_handle *h, int return_inserted_position)
47162+{
47163+ int result;
47164+ znode *orig_znode;
47165+ /*ON_DEBUG(reiser4_extent orig_ext);*/ /* this is for debugging */
47166+
47167+ assert("vs-990", coord_is_existing_unit(h->coord));
47168+ assert("vs-1375", znode_is_write_locked(h->coord->node));
47169+ assert("vs-1426", extent_get_width(&h->overwrite) != 0);
47170+ assert("vs-1427", extent_get_width(&h->new_extents[0]) != 0);
47171+ assert("vs-1427", ergo(h->nr_new_extents == 2,
47172+ extent_get_width(&h->new_extents[1]) != 0));
47173+
47174+ /* compose structure for paste */
47175+ init_new_extent(&h->item, &h->new_extents[0], h->nr_new_extents);
47176+
47177+ coord_dup(&h->coord_after, h->coord);
47178+ init_lh(&h->lh_after);
47179+ copy_lh(&h->lh_after, h->lh);
47180+ tap_init(&h->watch, &h->coord_after, &h->lh_after, ZNODE_WRITE_LOCK);
47181+ tap_monitor(&h->watch);
47182+
47183+ ON_DEBUG(h->orig_ext = *extent_by_coord(h->coord));
47184+ orig_znode = h->coord->node;
47185+
47186+#if REISER4_DEBUG
47187+ /* make sure that key is set properly */
47188+ unit_key_by_coord(h->coord, &h->tmp);
47189+ set_key_offset(&h->tmp,
47190+ get_key_offset(&h->tmp) +
47191+ extent_get_width(&h->overwrite) * current_blocksize);
47192+ assert("vs-1080", keyeq(&h->tmp, &h->paste_key));
47193+#endif
47194+
47195+ /* set insert point after unit to be replaced */
47196+ h->coord->between = AFTER_UNIT;
47197+
47198+ result = insert_into_item(h->coord, return_inserted_position ? h->lh : NULL,
47199+ &h->paste_key, &h->item, h->flags);
47200+ if (!result) {
47201+ /* now we have to replace the unit after which new units were
47202+ inserted. Its position is tracked by @watch */
47203+ reiser4_extent *ext;
47204+ znode *node;
47205+
47206+ node = h->coord_after.node;
47207+ if (node != orig_znode) {
47208+ coord_clear_iplug(&h->coord_after);
47209+ result = zload(node);
47210+ }
47211+
47212+ if (likely(!result)) {
47213+ ext = extent_by_coord(&h->coord_after);
47214+
47215+ assert("vs-987", znode_is_loaded(node));
47216+ assert("vs-988", !memcmp(ext, &h->orig_ext, sizeof(*ext)));
47217+
47218+ /* overwrite extent unit */
47219+ memcpy(ext, &h->overwrite, sizeof(reiser4_extent));
47220+ znode_make_dirty(node);
47221+
47222+ if (node != orig_znode)
47223+ zrelse(node);
47224+
47225+ if (return_inserted_position == 0) {
47226+ /* coord and lh are to be set to overwritten
47227+ extent */
47228+ assert("vs-1662",
47229+ WITH_DATA(node, !memcmp(&h->overwrite,
47230+ extent_by_coord(
47231+ &h->coord_after),
47232+ sizeof(reiser4_extent))));
47233+
47234+ *h->coord = h->coord_after;
47235+ done_lh(h->lh);
47236+ copy_lh(h->lh, &h->lh_after);
47237+ } else {
47238+ /* h->coord and h->lh are to be set to first of
47239+ inserted units */
47240+ assert("vs-1663",
47241+ WITH_DATA(h->coord->node,
47242+ !memcmp(&h->new_extents[0],
47243+ extent_by_coord(h->coord),
47244+ sizeof(reiser4_extent))));
47245+ assert("vs-1664", h->lh->node == h->coord->node);
47246+ }
47247+ }
47248+ }
47249+ tap_done(&h->watch);
47250+
47251+ return result;
47252+}
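+
+/* Usage sketch (hypothetical values; field names are those of struct
+   replace_handle in extent.h): to plug one block into a 10-block hole, a
+   caller would set h->overwrite to a 1-block allocated extent and
+   h->new_extents[0] to a 9-block hole with h->nr_new_extents == 1;
+   replace_extent() then overwrites the hole unit in place and pastes the
+   remaining 9-block hole after it. */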
47253+
47254+lock_handle *znode_lh(znode *node)
47255+{
47256+ assert("vs-1371", znode_is_write_locked(node));
47257+ assert("vs-1372", znode_is_wlocked_once(node));
47258+ return list_entry(node->lock.owners.next, lock_handle, owners_link);
47259+}
47260+
47261+/*
47262+ * Local variables:
47263+ * c-indentation-style: "K&R"
47264+ * mode-name: "LC"
47265+ * c-basic-offset: 8
47266+ * tab-width: 8
47267+ * fill-column: 79
47268+ * scroll-step: 1
47269+ * End:
47270+ */
47271Index: linux-2.6.16/fs/reiser4/plugin/item/extent.h
47272===================================================================
47273--- /dev/null
47274+++ linux-2.6.16/fs/reiser4/plugin/item/extent.h
47275@@ -0,0 +1,228 @@
47276+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
47277+
47278+#ifndef __REISER4_EXTENT_H__
47279+#define __REISER4_EXTENT_H__
47280+
47281+/* on disk extent */
47282+typedef struct {
47283+ reiser4_dblock_nr start;
47284+ reiser4_dblock_nr width;
47285+} reiser4_extent;
47286+
47287+typedef struct extent_stat {
47288+ int unallocated_units;
47289+ int unallocated_blocks;
47290+ int allocated_units;
47291+ int allocated_blocks;
47292+ int hole_units;
47293+ int hole_blocks;
47294+} extent_stat;
47295+
47296+/* extents in an extent item can be holes, unallocated extents, or
47297+   allocated extents */
47298+typedef enum {
47299+ HOLE_EXTENT,
47300+ UNALLOCATED_EXTENT,
47301+ ALLOCATED_EXTENT
47302+} extent_state;
47303+
47304+#define HOLE_EXTENT_START 0
47305+#define UNALLOCATED_EXTENT_START 1
47306+#define UNALLOCATED_EXTENT_START2 2
47307+
47308+typedef struct {
47309+ reiser4_block_nr pos_in_unit;
47310+ reiser4_block_nr width; /* width of current unit */
47311+ pos_in_node_t nr_units; /* number of units */
47312+ int ext_offset; /* offset from the beginning of zdata() */
47313+ unsigned long expected_page;
47314+#if REISER4_DEBUG
47315+ reiser4_extent extent;
47316+#endif
47317+} extent_coord_extension_t;
47318+
47319+/* macros to set/get fields of on-disk extent */
47320+static inline reiser4_block_nr extent_get_start(const reiser4_extent * ext)
47321+{
47322+ return le64_to_cpu(ext->start);
47323+}
47324+
47325+static inline reiser4_block_nr extent_get_width(const reiser4_extent * ext)
47326+{
47327+ return le64_to_cpu(ext->width);
47328+}
47329+
47330+extern __u64 reiser4_current_block_count(void);
47331+
47332+static inline void
47333+extent_set_start(reiser4_extent * ext, reiser4_block_nr start)
47334+{
47335+ cassert(sizeof(ext->start) == 8);
47336+ assert("nikita-2510",
47337+ ergo(start > 1, start < reiser4_current_block_count()));
47338+ put_unaligned(cpu_to_le64(start), &ext->start);
47339+}
47340+
47341+static inline void
47342+extent_set_width(reiser4_extent * ext, reiser4_block_nr width)
47343+{
47344+ cassert(sizeof(ext->width) == 8);
47345+ assert("", width > 0);
47346+ put_unaligned(cpu_to_le64(width), &ext->width);
47347+ assert("nikita-2511",
47348+ ergo(extent_get_start(ext) > 1,
47349+ extent_get_start(ext) + width <=
47350+ reiser4_current_block_count()));
47351+}
47352+
47353+#define extent_item(coord) \
47354+({ \
47355+ assert("nikita-3143", item_is_extent(coord)); \
47356+ ((reiser4_extent *)item_body_by_coord (coord)); \
47357+})
47358+
47359+#define extent_by_coord(coord) \
47360+({ \
47361+ assert("nikita-3144", item_is_extent(coord)); \
47362+ (extent_item (coord) + (coord)->unit_pos); \
47363+})
47364+
47365+#define width_by_coord(coord) \
47366+({ \
47367+ assert("nikita-3145", item_is_extent(coord)); \
47368+ extent_get_width (extent_by_coord(coord)); \
47369+})
47370+
47371+struct carry_cut_data;
47372+struct carry_kill_data;
47373+
47374+/* plugin->u.item.b.* */
47375+reiser4_key *max_key_inside_extent(const coord_t *, reiser4_key *);
47376+int can_contain_key_extent(const coord_t * coord, const reiser4_key * key,
47377+ const reiser4_item_data *);
47378+int mergeable_extent(const coord_t * p1, const coord_t * p2);
47379+pos_in_node_t nr_units_extent(const coord_t *);
47380+lookup_result lookup_extent(const reiser4_key *, lookup_bias, coord_t *);
47381+void init_coord_extent(coord_t *);
47382+int init_extent(coord_t *, reiser4_item_data *);
47383+int paste_extent(coord_t *, reiser4_item_data *, carry_plugin_info *);
47384+int can_shift_extent(unsigned free_space,
47385+ coord_t * source, znode * target, shift_direction,
47386+ unsigned *size, unsigned want);
47387+void copy_units_extent(coord_t * target, coord_t * source, unsigned from,
47388+ unsigned count, shift_direction where_is_free_space,
47389+ unsigned free_space);
47390+int kill_hook_extent(const coord_t *, pos_in_node_t from, pos_in_node_t count,
47391+ struct carry_kill_data *);
47392+int create_hook_extent(const coord_t * coord, void *arg);
47393+int cut_units_extent(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
47394+ struct carry_cut_data *, reiser4_key * smallest_removed,
47395+ reiser4_key * new_first);
47396+int kill_units_extent(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
47397+ struct carry_kill_data *, reiser4_key * smallest_removed,
47398+ reiser4_key * new_first);
47399+reiser4_key *unit_key_extent(const coord_t *, reiser4_key *);
47400+reiser4_key *max_unit_key_extent(const coord_t *, reiser4_key *);
47401+void print_extent(const char *, coord_t *);
47402+int utmost_child_extent(const coord_t * coord, sideof side, jnode ** child);
47403+int utmost_child_real_block_extent(const coord_t * coord, sideof side,
47404+ reiser4_block_nr * block);
47405+void item_stat_extent(const coord_t * coord, void *vp);
47406+int check_extent(const coord_t * coord, const char **error);
47407+
47408+/* plugin->u.item.s.file.* */
47409+ssize_t write_extent(struct file *, const char __user *, size_t, loff_t *);
47410+int read_extent(struct file *, flow_t *, hint_t *);
47411+int readpage_extent(void *, struct page *);
47412+void readpages_extent(void *, struct address_space *, struct list_head *pages);
47413+reiser4_key *append_key_extent(const coord_t *, reiser4_key *);
47414+void init_coord_extension_extent(uf_coord_t *, loff_t offset);
47415+int get_block_address_extent(const coord_t *, sector_t block,
47416+ sector_t * result);
47417+
47418+/* these are used in flush.c
47419+ FIXME-VS: should they be somewhere in item_plugin? */
47420+int allocate_extent_item_in_place(coord_t *, lock_handle *, flush_pos_t * pos);
47421+int allocate_and_copy_extent(znode * left, coord_t * right, flush_pos_t * pos,
47422+ reiser4_key * stop_key);
47423+
47424+int extent_is_unallocated(const coord_t * item); /* True if this extent is unallocated (i.e., not a hole, not allocated). */
47425+__u64 extent_unit_index(const coord_t * item); /* Block offset of this unit. */
47426+__u64 extent_unit_width(const coord_t * item); /* Number of blocks in this unit. */
47427+
47428+/* plugin->u.item.f. */
47429+int scan_extent(flush_scan * scan);
47430+extern int key_by_offset_extent(struct inode *, loff_t, reiser4_key *);
47431+
47432+reiser4_item_data *init_new_extent(reiser4_item_data * data, void *ext_unit,
47433+ int nr_extents);
47434+reiser4_block_nr extent_size(const coord_t * coord, pos_in_node_t nr);
47435+extent_state state_of_extent(reiser4_extent * ext);
47436+void set_extent(reiser4_extent *, reiser4_block_nr start,
47437+ reiser4_block_nr width);
47438+int update_extent(struct inode *, jnode *, loff_t pos, int *plugged_hole);
47439+
47440+#include "../../coord.h"
47441+#include "../../lock.h"
47442+#include "../../tap.h"
47443+
47444+struct replace_handle {
47445+ /* these are to be set before calling replace_extent */
47446+ coord_t *coord;
47447+ lock_handle *lh;
47448+ reiser4_key key;
47449+ reiser4_key *pkey;
47450+ reiser4_extent overwrite;
47451+ reiser4_extent new_extents[2];
47452+ int nr_new_extents;
47453+ unsigned flags;
47454+
47455+ /* these are used by replace_extent */
47456+ reiser4_item_data item;
47457+ coord_t coord_after;
47458+ lock_handle lh_after;
47459+ tap_t watch;
47460+ reiser4_key paste_key;
47461+#if REISER4_DEBUG
47462+ reiser4_extent orig_ext;
47463+ reiser4_key tmp;
47464+#endif
47465+};
47466+
47467+/* this structure is kmalloced before calling make_extent to avoid excessive
47468+ stack consumption on plug_hole->replace_extent */
47469+struct make_extent_handle {
47470+ uf_coord_t *uf_coord;
47471+ reiser4_block_nr blocknr;
47472+ int created;
47473+ struct inode *inode;
47474+ union {
47475+ struct {
47476+ } append;
47477+ struct replace_handle replace;
47478+ } u;
47479+};
47480+
47481+int replace_extent(struct replace_handle *, int return_inserted_position);
47482+lock_handle *znode_lh(znode *);
47483+
47484+/* the reiser4 repacker support */
47485+struct repacker_cursor;
47486+extern int process_extent_backward_for_repacking(tap_t *,
47487+ struct repacker_cursor *);
47488+extern int mark_extent_for_repacking(tap_t *, int);
47489+
47490+#define coord_by_uf_coord(uf_coord) (&((uf_coord)->coord))
47491+#define ext_coord_by_uf_coord(uf_coord) (&((uf_coord)->extension.extent))
47492+
47493+/* __REISER4_EXTENT_H__ */
47494+#endif
47495+/*
47496+ Local variables:
47497+ c-indentation-style: "K&R"
47498+ mode-name: "LC"
47499+ c-basic-offset: 8
47500+ tab-width: 8
47501+ fill-column: 120
47502+ End:
47503+*/
47504Index: linux-2.6.16/fs/reiser4/plugin/item/extent_file_ops.c
47505===================================================================
47506--- /dev/null
47507+++ linux-2.6.16/fs/reiser4/plugin/item/extent_file_ops.c
47508@@ -0,0 +1,1712 @@
47509+/* COPYRIGHT 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
47510+
47511+#include "item.h"
47512+#include "../../inode.h"
47513+#include "../../page_cache.h"
47514+#include "../object.h"
47515+
47516+#include <linux/quotaops.h>
47517+#include <linux/swap.h>
47518+#include "../../../../mm/filemap.h"
47519+
47520+
47521+static inline reiser4_extent *ext_by_offset(const znode *node, int offset)
47522+{
47523+ reiser4_extent *ext;
47524+
47525+ ext = (reiser4_extent *) (zdata(node) + offset);
47526+ return ext;
47527+}
47528+
47529+/**
47530+ * check_uf_coord - verify coord extension
47531+ * @uf_coord: coord extension to check
47532+ * @key: key @uf_coord must correspond to, or NULL
47533+ *
47534+ * Makes sure that all fields of @uf_coord are set properly. If @key is
47535+ * specified - check whether @uf_coord is set correspondingly.
47536+ */
47537+static void check_uf_coord(const uf_coord_t *uf_coord, const reiser4_key *key)
47538+{
47539+#if REISER4_DEBUG
47540+ const coord_t *coord;
47541+ const extent_coord_extension_t *ext_coord;
47542+ reiser4_extent *ext;
47543+
47544+ coord = &uf_coord->coord;
47545+ ext_coord = &uf_coord->extension.extent;
47546+ ext = ext_by_offset(coord->node, uf_coord->extension.extent.ext_offset);
47547+
47548+ assert("",
47549+ WITH_DATA(coord->node,
47550+ (uf_coord->valid == 1 &&
47551+ coord_is_iplug_set(coord) &&
47552+ item_is_extent(coord) &&
47553+ ext_coord->nr_units == nr_units_extent(coord) &&
47554+ ext == extent_by_coord(coord) &&
47555+ ext_coord->width == extent_get_width(ext) &&
47556+ coord->unit_pos < ext_coord->nr_units &&
47557+ ext_coord->pos_in_unit < ext_coord->width &&
47558+ memcmp(ext, &ext_coord->extent,
47559+ sizeof(reiser4_extent)) == 0)));
47560+ if (key) {
47561+ reiser4_key coord_key;
47562+
47563+ unit_key_by_coord(&uf_coord->coord, &coord_key);
47564+ set_key_offset(&coord_key,
47565+ get_key_offset(&coord_key) +
47566+ (uf_coord->extension.extent.
47567+ pos_in_unit << PAGE_CACHE_SHIFT));
47568+ assert("", keyeq(key, &coord_key));
47569+ }
47570+#endif
47571+}
47572+
47573+static inline reiser4_extent *ext_by_ext_coord(const uf_coord_t *uf_coord)
47574+{
47575+ check_uf_coord(uf_coord, NULL);
47576+
47577+ return ext_by_offset(uf_coord->coord.node,
47578+ uf_coord->extension.extent.ext_offset);
47579+}
47580+
47581+#if REISER4_DEBUG
47582+
47583+/**
47584+ * offset_is_in_unit - check whether an offset falls into an extent unit
47585+ * @coord: coord of the extent unit
47586+ * @off: file offset to check
47587+ *
47588+ * Returns 1 if offset @off is inside of the extent unit pointed to by
47589+ * @coord, and 0 otherwise.
47590+ */
47591+static int offset_is_in_unit(const coord_t *coord, loff_t off)
47592+{
47593+ reiser4_key unit_key;
47594+ __u64 unit_off;
47595+ reiser4_extent *ext;
47596+
47597+ ext = extent_by_coord(coord);
47598+
47599+ unit_key_extent(coord, &unit_key);
47600+ unit_off = get_key_offset(&unit_key);
47601+ if (off < unit_off)
47602+ return 0;
47603+ if (off >= (unit_off + (current_blocksize * extent_get_width(ext))))
47604+ return 0;
47605+ return 1;
47606+}
47607+
47608+static int
47609+coord_matches_key_extent(const coord_t * coord, const reiser4_key * key)
47610+{
47611+ reiser4_key item_key;
47612+
47613+ assert("vs-771", coord_is_existing_unit(coord));
47614+ assert("vs-1258", keylt(key, append_key_extent(coord, &item_key)));
47615+ assert("vs-1259", keyge(key, item_key_by_coord(coord, &item_key)));
47616+
47617+ return offset_is_in_unit(coord, get_key_offset(key));
47618+}
47619+
47620+#endif
47621+
47622+/**
47623+ * can_append - check whether a key continues an extent item
47624+ * @key: key to check
47625+ * @coord: coord of an existing extent item
47626+ *
47627+ * Returns 1 if @key is equal to the append key of the item @coord is set to
47628+ */
47629+static int can_append(const reiser4_key *key, const coord_t *coord)
47630+{
47631+ reiser4_key append_key;
47632+
47633+ return keyeq(key, append_key_extent(coord, &append_key));
47634+}
47635+
47636+/**
47637+ * append_hole - append the last file item with a hole extent
47638+ * @coord: coord of the last item of the file
47639+ * @lh: lock handle of the node @coord is set to
47640+ * @key: key up to which the item has to be extended with a hole
47641+ *
47642+ */
47643+static int append_hole(coord_t *coord, lock_handle *lh,
47644+ const reiser4_key *key)
47645+{
47646+ reiser4_key append_key;
47647+ reiser4_block_nr hole_width;
47648+ reiser4_extent *ext, new_ext;
47649+ reiser4_item_data idata;
47650+
47651+ /* last item of file may have to be appended with hole */
47652+ assert("vs-708", znode_get_level(coord->node) == TWIG_LEVEL);
47653+ assert("vs-714", item_id_by_coord(coord) == EXTENT_POINTER_ID);
47654+
47655+ /* key of first byte which is not addressed by this extent */
47656+ append_key_extent(coord, &append_key);
47657+
47658+ assert("", keyle(&append_key, key));
47659+
47660+ /*
47661+ * the extent item has to be appended with a hole. Calculate the
47662+ * length of that hole in blocks
47663+ */
47664+ hole_width = ((get_key_offset(key) - get_key_offset(&append_key) +
47665+ current_blocksize - 1) >> current_blocksize_bits);
47666+ assert("vs-954", hole_width > 0);
47667+
47668+ /* set coord after last unit */
47669+ coord_init_after_item_end(coord);
47670+
47671+ /* get last extent in the item */
47672+ ext = extent_by_coord(coord);
47673+ if (state_of_extent(ext) == HOLE_EXTENT) {
47674+ /*
47675+ * last extent of a file is hole extent. Widen that extent by
47676+ * @hole_width blocks. Note that we do not worry about
47677+ * overflowing - extent width is 64 bits
47678+ */
47679+ set_extent(ext, HOLE_EXTENT_START,
47680+ extent_get_width(ext) + hole_width);
47681+ znode_make_dirty(coord->node);
47682+ return 0;
47683+ }
47684+
47685+ /* append last item of the file with hole extent unit */
47686+ assert("vs-713", (state_of_extent(ext) == ALLOCATED_EXTENT ||
47687+ state_of_extent(ext) == UNALLOCATED_EXTENT));
47688+
47689+ set_extent(&new_ext, HOLE_EXTENT_START, hole_width);
47690+ init_new_extent(&idata, &new_ext, 1);
47691+ return insert_into_item(coord, lh, &append_key, &idata, 0);
47692+}
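+/*
+ * Editor's note: a minimal user-space sketch of the hole-width rounding
+ * used by append_hole() above (and by insert_first_hole() below). The
+ * block size and sample offsets are assumptions for illustration only.
+ */
+#if 0
+#include <stdio.h>
+#include <stdint.h>
+
+int main(void)
+{
+ const uint64_t blocksize = 4096, blocksize_bits = 12;
+ uint64_t append_off = 8192; /* first byte not yet addressed */
+ uint64_t write_off = 20000; /* offset the write starts at */
+
+ /* round the byte gap up to a whole number of blocks */
+ uint64_t hole_width =
+ (write_off - append_off + blocksize - 1) >> blocksize_bits;
+ printf("hole width: %llu blocks\n", (unsigned long long)hole_width);
+ return 0; /* prints 3 */
+}
+#endif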
47693+
47694+/**
47695+ * check_jnodes - verify that jnodes fit the twig node's key range
47696+ * @twig: longterm locked twig node
47697+ * @key: key of the first of @count consecutive jnodes
47698+ * @count: number of jnodes to check
47699+ */
47700+static void check_jnodes(znode *twig, const reiser4_key *key, int count)
47701+{
47702+#if REISER4_DEBUG
47703+ coord_t c;
47704+ reiser4_key node_key, jnode_key;
47705+
47706+ jnode_key = *key;
47707+
47708+ assert("", twig != NULL);
47709+ assert("", znode_get_level(twig) == TWIG_LEVEL);
47710+ assert("", znode_is_write_locked(twig));
47711+
47712+ zload(twig);
47713+ /* get the smallest key in twig node */
47714+ coord_init_first_unit(&c, twig);
47715+ unit_key_by_coord(&c, &node_key);
47716+ assert("", keyle(&node_key, &jnode_key));
47717+
47718+ coord_init_last_unit(&c, twig);
47719+ unit_key_by_coord(&c, &node_key);
47720+ if (item_plugin_by_coord(&c)->s.file.append_key)
47721+ item_plugin_by_coord(&c)->s.file.append_key(&c, &node_key);
47722+ set_key_offset(&jnode_key,
47723+ get_key_offset(&jnode_key) + (loff_t)count * PAGE_CACHE_SIZE - 1);
47724+ assert("", keylt(&jnode_key, &node_key));
47725+ zrelse(twig);
47726+#endif
47727+}
47728+
47729+/**
47730+ * append_last_extent - append last file item
47731+ * @uf_coord: coord to start insertion from
47732+ * @key: key of the first byte to be written
47733+ * @jnodes: array of jnodes
47734+ * @count: number of jnodes in the array
47735+ *
47736+ * Appends the last extent item of the file with an unallocated extent unit
47737+ * of width @count and assigns fake block numbers to the corresponding jnodes.
47738+ */
47739+static int append_last_extent(uf_coord_t *uf_coord, const reiser4_key *key,
47740+ jnode **jnodes, int count)
47741+{
47742+ int result;
47743+ reiser4_extent new_ext;
47744+ reiser4_item_data idata;
47745+ coord_t *coord;
47746+ extent_coord_extension_t *ext_coord;
47747+ reiser4_extent *ext;
47748+ reiser4_block_nr block;
47749+ jnode *node;
47750+ int i;
47751+
47752+ coord = &uf_coord->coord;
47753+ ext_coord = &uf_coord->extension.extent;
47754+ ext = ext_by_ext_coord(uf_coord);
47755+
47756+ /* check correctness of position in the item */
47757+ assert("vs-228", coord->unit_pos == coord_last_unit_pos(coord));
47758+ assert("vs-1311", coord->between == AFTER_UNIT);
47759+ assert("vs-1302", ext_coord->pos_in_unit == ext_coord->width - 1);
47760+
47761+ if (!can_append(key, coord)) {
47762+ /* hole extent has to be inserted */
47763+ result = append_hole(coord, uf_coord->lh, key);
47764+ uf_coord->valid = 0;
47765+ return result;
47766+ }
47767+
47768+ if (count == 0)
47769+ return 0;
47770+
47771+ assert("", get_key_offset(key) == (loff_t)index_jnode(jnodes[0]) * PAGE_CACHE_SIZE);
47772+
47773+ result = DQUOT_ALLOC_BLOCK_NODIRTY(mapping_jnode(jnodes[0])->host,
47774+ count);
47775+ BUG_ON(result != 0);
47776+
47777+ switch (state_of_extent(ext)) {
47778+ case UNALLOCATED_EXTENT:
47779+ /*
47780+ * last extent unit of the file is unallocated one. Increase
47781+ * its width by @count
47782+ */
47783+ set_extent(ext, UNALLOCATED_EXTENT_START,
47784+ extent_get_width(ext) + count);
47785+ znode_make_dirty(coord->node);
47786+
47787+ /* update coord extension */
47788+ ext_coord->width += count;
47789+ ON_DEBUG(extent_set_width
47790+ (&uf_coord->extension.extent.extent,
47791+ ext_coord->width));
47792+ break;
47793+
47794+ case HOLE_EXTENT:
47795+ case ALLOCATED_EXTENT:
47796+ /*
47797+ * last extent unit of the file is either hole or allocated
47798+ * one. Append one unallocated extent of width @count
47799+ */
47800+ set_extent(&new_ext, UNALLOCATED_EXTENT_START, count);
47801+ init_new_extent(&idata, &new_ext, 1);
47802+ result = insert_into_item(coord, uf_coord->lh, key, &idata, 0);
47803+ uf_coord->valid = 0;
47804+ if (result)
47805+ return result;
47806+ break;
47807+
47808+ default:
47809+ return RETERR(-EIO);
47810+ }
47811+
47812+ /*
47813+ * make sure that we hold long term locked twig node containing all
47814+ * jnodes we are about to capture
47815+ */
47816+ check_jnodes(uf_coord->lh->node, key, count);
47817+
47818+ /*
47819+ * assign fake block numbers to all jnodes. FIXME: make sure whether
47820+ * twig node containing inserted extent item is locked
47821+ */
47822+ block = fake_blocknr_unformatted(count);
47823+ for (i = 0; i < count; i ++, block ++) {
47824+ node = jnodes[i];
47825+ spin_lock_jnode(node);
47826+ JF_SET(node, JNODE_CREATED);
47827+ jnode_set_block(node, &block);
47828+ result = try_capture(node, ZNODE_WRITE_LOCK, 0);
47829+ BUG_ON(result != 0);
47830+ jnode_make_dirty_locked(node);
47831+ spin_unlock_jnode(node);
47832+ }
47833+ return count;
47834+}
47835+
47836+/**
47837+ * insert_first_hole - insert hole extent into tree
47838+ * @coord: coord to insert the new item at
47839+ * @lh: lock handle of the node insertion starts from
47840+ * @key: key of the first byte to be written
47841+ *
47842+ * Inserts a hole extent covering the file from offset 0 up to @key.
47843+ */
47844+static int insert_first_hole(coord_t *coord, lock_handle *lh,
47845+ const reiser4_key *key)
47846+{
47847+ reiser4_extent new_ext;
47848+ reiser4_item_data idata;
47849+ reiser4_key item_key;
47850+ reiser4_block_nr hole_width;
47851+
47852+ /* @coord must be set for inserting of new item */
47853+ assert("vs-711", coord_is_between_items(coord));
47854+
47855+ item_key = *key;
47856+ set_key_offset(&item_key, 0ull);
47857+
47858+ hole_width = ((get_key_offset(key) + current_blocksize - 1) >>
47859+ current_blocksize_bits);
47860+ assert("vs-710", hole_width > 0);
47861+
47862+ /* compose body of hole extent and insert item into tree */
47863+ set_extent(&new_ext, HOLE_EXTENT_START, hole_width);
47864+ init_new_extent(&idata, &new_ext, 1);
47865+ return insert_extent_by_coord(coord, &idata, &item_key, lh);
47866+}
47867+
47868+
47869+/**
47870+ * insert_first_extent - insert first file item
47871+ * @uf_coord: coord to start insertion from
47872+ * @key: key of the first byte to be written
47873+ * @jnodes: array of jnodes
47874+ * @count: number of jnodes in the array
47875+ * @inode: inode of file
47876+ *
47877+ * There are no items of file @inode in the tree yet. Insert an unallocated
47878+ * extent of width @count, or a hole extent if the write does not start at
47879+ * the beginning of the file. Assign fake block numbers to the corresponding
47880+ * jnodes. Returns the number of jnodes or an error code.
47881+ */
47882+static int insert_first_extent(uf_coord_t *uf_coord, const reiser4_key *key,
47883+ jnode **jnodes, int count,
47884+ struct inode *inode)
47885+{
47886+ int result;
47887+ int i;
47888+ reiser4_extent new_ext;
47889+ reiser4_item_data idata;
47890+ reiser4_block_nr block;
47891+ unix_file_info_t *uf_info;
47892+ jnode *node;
47893+
47894+ /* first extent insertion starts at leaf level */
47895+ assert("vs-719", znode_get_level(uf_coord->coord.node) == LEAF_LEVEL);
47896+ assert("vs-711", coord_is_between_items(&uf_coord->coord));
47897+
47898+ if (get_key_offset(key) != 0) {
47899+ result = insert_first_hole(&uf_coord->coord, uf_coord->lh, key);
47900+ uf_coord->valid = 0;
47901+ uf_info = unix_file_inode_data(inode);
47902+
47903+ /*
47904+ * first item insertion is only possible when writing to empty
47905+ * file or performing tail conversion
47906+ */
47907+ assert("", (uf_info->container == UF_CONTAINER_EMPTY ||
47908+ (inode_get_flag(inode, REISER4_PART_MIXED) &&
47909+ inode_get_flag(inode, REISER4_PART_IN_CONV))));
47910+
47911+ /* if file was empty - update its state */
47912+ if (result == 0 && uf_info->container == UF_CONTAINER_EMPTY)
47913+ uf_info->container = UF_CONTAINER_EXTENTS;
47914+ return result;
47915+ }
47916+
47917+ if (count == 0)
47918+ return 0;
47919+
47920+ result = DQUOT_ALLOC_BLOCK_NODIRTY(mapping_jnode(jnodes[0])->host, count);
47921+ BUG_ON(result != 0);
47922+
47923+ /*
47924+ * prepare for tree modification: compose body of item and item data
47925+ * structure needed for insertion
47926+ */
47927+ set_extent(&new_ext, UNALLOCATED_EXTENT_START, count);
47928+ init_new_extent(&idata, &new_ext, 1);
47929+
47930+ /* insert extent item into the tree */
47931+ result = insert_extent_by_coord(&uf_coord->coord, &idata, key,
47932+ uf_coord->lh);
47933+ if (result)
47934+ return result;
47935+
47936+ /*
47937+ * make sure that we hold long term locked twig node containing all
47938+ * jnodes we are about to capture
47939+ */
47940+ check_jnodes(uf_coord->lh->node, key, count);
47941+ /*
47942+ * assign fake block numbers to all jnodes, capture and mark them dirty
47943+ */
47944+ block = fake_blocknr_unformatted(count);
47945+ for (i = 0; i < count; i ++, block ++) {
47946+ node = jnodes[i];
47947+ spin_lock_jnode(node);
47948+ JF_SET(node, JNODE_CREATED);
47949+ jnode_set_block(node, &block);
47950+ result = try_capture(node, ZNODE_WRITE_LOCK, 0);
47951+ BUG_ON(result != 0);
47952+ jnode_make_dirty_locked(node);
47953+ spin_unlock_jnode(node);
47954+ }
47955+
47956+ /*
47957+ * invalidate coordinate, research must be performed to continue
47958+ * because write will continue on twig level
47959+ */
47960+ uf_coord->valid = 0;
47961+ return count;
47962+}
47963+
47964+/**
47965+ * plug_hole - replace hole extent with unallocated and holes
47966+ * @uf_coord: coord set to the hole unit to be plugged
47967+ * @key: key of the block to be overwritten
47968+ * @how: returns which case of the replacement was taken (for debugging)
47969+ *
47970+ * Creates an unallocated extent of width 1 within a hole. In the worst case
47971+ * two additional extents can be created.
47972+ */
47974+static int plug_hole(uf_coord_t *uf_coord, const reiser4_key *key, int *how)
47975+{
47976+ struct replace_handle rh;
47977+ reiser4_extent *ext;
47978+ reiser4_block_nr width, pos_in_unit;
47979+ coord_t *coord;
47980+ extent_coord_extension_t *ext_coord;
47981+ int return_inserted_position;
47982+
47983+ check_uf_coord(uf_coord, key);
47984+
47985+ rh.coord = coord_by_uf_coord(uf_coord);
47986+ rh.lh = uf_coord->lh;
47987+ rh.flags = 0;
47988+
47989+ coord = coord_by_uf_coord(uf_coord);
47990+ ext_coord = ext_coord_by_uf_coord(uf_coord);
47991+ ext = ext_by_ext_coord(uf_coord);
47992+
47993+ width = ext_coord->width;
47994+ pos_in_unit = ext_coord->pos_in_unit;
47995+
47996+ *how = 0;
47997+ if (width == 1) {
47998+ set_extent(ext, UNALLOCATED_EXTENT_START, 1);
47999+ znode_make_dirty(coord->node);
48000+ /* update uf_coord */
48001+ ON_DEBUG(ext_coord->extent = *ext);
48002+ *how = 1;
48003+ return 0;
48004+ } else if (pos_in_unit == 0) {
48005+ /* we deal with first element of extent */
48006+ if (coord->unit_pos) {
48007+ /* there is an extent to the left */
48008+ if (state_of_extent(ext - 1) == UNALLOCATED_EXTENT) {
48009+ /*
48010+ * left neighboring unit is an unallocated
48011+ * extent. Increase its width and decrease
48012+ * width of hole
48013+ */
48014+ extent_set_width(ext - 1,
48015+ extent_get_width(ext - 1) + 1);
48016+ extent_set_width(ext, width - 1);
48017+ znode_make_dirty(coord->node);
48018+
48019+ /* update coord extension */
48020+ coord->unit_pos--;
48021+ ext_coord->width = extent_get_width(ext - 1);
48022+ ext_coord->pos_in_unit = ext_coord->width - 1;
48023+ ext_coord->ext_offset -= sizeof(reiser4_extent);
48024+ ON_DEBUG(ext_coord->extent =
48025+ *extent_by_coord(coord));
48026+ *how = 2;
48027+ return 0;
48028+ }
48029+ }
48030+ /* extent for replace */
48031+ set_extent(&rh.overwrite, UNALLOCATED_EXTENT_START, 1);
48032+ /* extent to be inserted */
48033+ set_extent(&rh.new_extents[0], HOLE_EXTENT_START, width - 1);
48034+ rh.nr_new_extents = 1;
48035+
48036+ /* have replace_extent return with @coord and @uf_coord->lh
48037+ set to the unit which was replaced */
48038+ return_inserted_position = 0;
48039+ *how = 3;
48040+ } else if (pos_in_unit == width - 1) {
48041+ /* we deal with last element of extent */
48042+ if (coord->unit_pos < nr_units_extent(coord) - 1) {
48043+ /* there is an extent unit to the right */
48044+ if (state_of_extent(ext + 1) == UNALLOCATED_EXTENT) {
48045+ /*
48046+ * right neighboring unit is an unallocated
48047+ * extent. Increase its width and decrease
48048+ * width of hole
48049+ */
48050+ extent_set_width(ext + 1,
48051+ extent_get_width(ext + 1) + 1);
48052+ extent_set_width(ext, width - 1);
48053+ znode_make_dirty(coord->node);
48054+
48055+ /* update coord extension */
48056+ coord->unit_pos++;
48057+ ext_coord->width = extent_get_width(ext + 1);
48058+ ext_coord->pos_in_unit = 0;
48059+ ext_coord->ext_offset += sizeof(reiser4_extent);
48060+ ON_DEBUG(ext_coord->extent =
48061+ *extent_by_coord(coord));
48062+ *how = 4;
48063+ return 0;
48064+ }
48065+ }
48066+ /* extent for replace */
48067+ set_extent(&rh.overwrite, HOLE_EXTENT_START, width - 1);
48068+ /* extent to be inserted */
48069+ set_extent(&rh.new_extents[0], UNALLOCATED_EXTENT_START, 1);
48070+ rh.nr_new_extents = 1;
48071+
48072+ /* have replace_extent return with @coord and @uf_coord->lh
48073+ set to the unit which was inserted */
48074+ return_inserted_position = 1;
48075+ *how = 5;
48076+ } else {
48077+ /* extent for replace */
48078+ set_extent(&rh.overwrite, HOLE_EXTENT_START, pos_in_unit);
48079+ /* extents to be inserted */
48080+ set_extent(&rh.new_extents[0], UNALLOCATED_EXTENT_START, 1);
48081+ set_extent(&rh.new_extents[1], HOLE_EXTENT_START,
48082+ width - pos_in_unit - 1);
48083+ rh.nr_new_extents = 2;
48084+
48085+ /* have replace_extent return with @coord and @uf_coord->lh
48086+ set to the first of the units which were inserted */
48087+ return_inserted_position = 1;
48088+ *how = 6;
48089+ }
48090+ unit_key_by_coord(coord, &rh.paste_key);
48091+ set_key_offset(&rh.paste_key, get_key_offset(&rh.paste_key) +
48092+ extent_get_width(&rh.overwrite) * current_blocksize);
48093+
48094+ uf_coord->valid = 0;
48095+ return replace_extent(&rh, return_inserted_position);
48096+}
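+/*
+ * Editor's note: a hedged user-space model of the case analysis in
+ * plug_hole() above. It only prints how a hole unit of @width blocks is
+ * split when block @pos inside it is overwritten; it deliberately ignores
+ * the merge-with-an-unallocated-neighbor optimization. Illustrative only.
+ */
+#if 0
+#include <stdio.h>
+#include <stdint.h>
+
+static void plug_hole_shape(uint64_t width, uint64_t pos)
+{
+ if (width == 1)
+ printf("convert the unit in place\n");
+ else if (pos == 0)
+ printf("[unalloc 1][hole %llu]\n",
+ (unsigned long long)(width - 1));
+ else if (pos == width - 1)
+ printf("[hole %llu][unalloc 1]\n",
+ (unsigned long long)(width - 1));
+ else /* worst case: two additional units */
+ printf("[hole %llu][unalloc 1][hole %llu]\n",
+ (unsigned long long)pos,
+ (unsigned long long)(width - pos - 1));
+}
+
+int main(void)
+{
+ plug_hole_shape(8, 0); /* first block of the hole */
+ plug_hole_shape(8, 7); /* last block of the hole */
+ plug_hole_shape(8, 3); /* middle: three units result */
+ return 0;
+}
+#endif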
48097+
48098+/**
48099+ * overwrite_one_block - set block number of one jnode
48100+ * @uf_coord: coord set to the extent unit addressing the block
48101+ * @key: key of the block
48102+ * @node: jnode of the block
48103+ * @hole_plugged: set to 1 if a hole had to be plugged
48104+ *
48105+ * If @node is in a hole extent, plug the hole and assign a fake block
48106+ * number; if in an allocated extent, assign the block number of the jnode.
48107+ */
48108+static int overwrite_one_block(uf_coord_t *uf_coord, const reiser4_key *key,
48109+ jnode *node, int *hole_plugged)
48110+{
48111+ int result;
48112+ extent_coord_extension_t *ext_coord;
48113+ reiser4_extent *ext;
48114+ reiser4_block_nr block;
48115+ int how;
48116+
48117+ assert("vs-1312", uf_coord->coord.between == AT_UNIT);
48118+
48119+ result = 0;
48120+ ext_coord = ext_coord_by_uf_coord(uf_coord);
48121+ ext = ext_by_ext_coord(uf_coord);
48122+ assert("", state_of_extent(ext) != UNALLOCATED_EXTENT);
48123+
48124+ switch (state_of_extent(ext)) {
48125+ case ALLOCATED_EXTENT:
48126+ block = extent_get_start(ext) + ext_coord->pos_in_unit;
48127+ break;
48128+
48129+ case HOLE_EXTENT:
48130+ result = DQUOT_ALLOC_BLOCK_NODIRTY(mapping_jnode(node)->host, 1);
48131+ BUG_ON(result != 0);
48132+ result = plug_hole(uf_coord, key, &how);
48133+ if (result)
48134+ return result;
48135+ block = fake_blocknr_unformatted(1);
48136+ if (hole_plugged)
48137+ *hole_plugged = 1;
48138+ JF_SET(node, JNODE_CREATED);
48139+ break;
48140+
48141+ default:
48142+ return RETERR(-EIO);
48143+ }
48144+
48145+ jnode_set_block(node, &block);
48146+ return 0;
48147+}
48148+
48149+/**
48150+ * move_coord - move coordinate forward
48151+ * @uf_coord: coord to advance
48152+ *
48153+ * Move the coordinate one data block pointer forward. Return 1 if the coord
48154+ * was already at the last position or is invalid, 0 otherwise.
48155+ */
48156+static int move_coord(uf_coord_t *uf_coord)
48157+{
48158+ extent_coord_extension_t *ext_coord;
48159+
48160+ if (uf_coord->valid == 0)
48161+ return 1;
48162+ ext_coord = &uf_coord->extension.extent;
48163+ ext_coord->pos_in_unit ++;
48164+ if (ext_coord->pos_in_unit < ext_coord->width)
48165+ /* coordinate moved within the unit */
48166+ return 0;
48167+
48168+ /* end of unit is reached. Try to move to next unit */
48169+ ext_coord->pos_in_unit = 0;
48170+ uf_coord->coord.unit_pos ++;
48171+ if (uf_coord->coord.unit_pos < ext_coord->nr_units) {
48172+ /* coordinate moved to next unit */
48173+ ext_coord->ext_offset += sizeof(reiser4_extent);
48174+ ext_coord->width =
48175+ extent_get_width(ext_by_offset
48176+ (uf_coord->coord.node,
48177+ ext_coord->ext_offset));
48178+ ON_DEBUG(ext_coord->extent =
48179+ *ext_by_offset(uf_coord->coord.node,
48180+ ext_coord->ext_offset));
48181+ return 0;
48182+ }
48183+ /* end of item is reached */
48184+ uf_coord->valid = 0;
48185+ return 1;
48186+}
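+/*
+ * Editor's note: a simplified user-space model of the stepping logic in
+ * move_coord() above. An extent item is modelled as an array of unit
+ * widths and (unit, pos) is advanced one block at a time. Illustrative
+ * only; not part of the patch.
+ */
+#if 0
+#include <stdio.h>
+
+/* returns 1 when the end of the item is reached */
+static int step(const int *widths, int nr_units, int *unit, int *pos)
+{
+ if (++*pos < widths[*unit])
+ return 0; /* moved within the unit */
+ *pos = 0;
+ if (++*unit < nr_units)
+ return 0; /* moved to the next unit */
+ return 1; /* end of item */
+}
+
+int main(void)
+{
+ int widths[] = { 2, 3 }; /* two units, 5 blocks total */
+ int unit = 0, pos = 0, n = 1;
+
+ while (!step(widths, 2, &unit, &pos))
+ n++;
+ printf("visited %d block pointers\n", n); /* prints 5 */
+ return 0;
+}
+#endif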
48187+
48188+/**
48189+ * overwrite_extent - overwrite blocks addressed by existing extent units
48190+ * @uf_coord: coord of the unit addressing the first block, @key: its key,
48191+ * @jnodes: array of @count jnodes, @plugged_hole: set if a hole was plugged.
48192+ * Returns number of handled jnodes or an error code.
48193+ */
48194+static int overwrite_extent(uf_coord_t *uf_coord, const reiser4_key *key,
48195+ jnode **jnodes, int count, int *plugged_hole)
48196+{
48197+ int result;
48198+ reiser4_key k;
48199+ int i;
48200+ jnode *node;
48201+
48202+ k = *key;
48203+ for (i = 0; i < count; i ++) {
48204+ node = jnodes[i];
48205+ if (*jnode_get_block(node) == 0) {
48206+ result = overwrite_one_block(uf_coord, &k, node, plugged_hole);
48207+ if (result)
48208+ return result;
48209+ }
48210+ /*
48211+ * make sure that we hold long term locked twig node containing
48212+ * all jnodes we are about to capture
48213+ */
48214+ check_jnodes(uf_coord->lh->node, &k, 1);
48215+ /*
48216+ * capture the jnode and mark it dirty; its block number was
48217+ * assigned by overwrite_one_block() when necessary
48218+ */
48219+ spin_lock_jnode(node);
48220+ result = try_capture(node, ZNODE_WRITE_LOCK, 0);
48221+ BUG_ON(result != 0);
48222+ jnode_make_dirty_locked(node);
48223+ spin_unlock_jnode(node);
48224+
48225+ if (uf_coord->valid == 0)
48226+ return i + 1;
48227+
48228+ check_uf_coord(uf_coord, &k);
48229+
48230+ if (move_coord(uf_coord)) {
48231+ /*
48232+ * failed to move to the next node pointer. Either end
48233+ * of file or end of twig node is reached. In the latter
48234+ * case we might go to the right neighbor.
48235+ */
48236+ uf_coord->valid = 0;
48237+ return i + 1;
48238+ }
48239+ set_key_offset(&k, get_key_offset(&k) + PAGE_CACHE_SIZE);
48240+ }
48241+
48242+ return count;
48243+}
48244+
48245+void init_uf_coord(uf_coord_t *uf_coord, lock_handle *lh);
48246+
48247+/**
48248+ * update_extent - update extent metadata for one page
48249+ * @inode: inode of the file
48250+ * @node: jnode of the page
48251+ * @pos: offset in the file
48252+ * @plugged_hole: set to 1 if a hole had to be plugged
48253+ *
48254+ */
48255+int update_extent(struct inode *inode, jnode *node, loff_t pos,
48256+ int *plugged_hole)
48257+{
48258+ int result;
48259+ znode *loaded;
48260+ uf_coord_t uf_coord;
48261+ coord_t *coord;
48262+ lock_handle lh;
48263+ reiser4_key key;
48264+
48265+ assert("", lock_counters()->d_refs == 0);
48266+
48267+ key_by_inode_and_offset_common(inode, pos, &key);
48268+
48269+ init_uf_coord(&uf_coord, &lh);
48270+ coord = &uf_coord.coord;
48271+ result = find_file_item_nohint(coord, &lh, &key,
48272+ ZNODE_WRITE_LOCK, inode);
48273+ if (IS_CBKERR(result)) {
48274+ assert("", lock_counters()->d_refs == 0);
48275+ return result;
48276+ }
48277+
48278+ result = zload(coord->node);
48279+ BUG_ON(result != 0);
48280+ loaded = coord->node;
48281+
48282+ if (coord->between == AFTER_UNIT) {
48283+ /*
48284+ * append the existing extent item with an unallocated extent
48285+ * of width 1
48286+ */
48287+ init_coord_extension_extent(&uf_coord,
48288+ get_key_offset(&key));
48289+ result = append_last_extent(&uf_coord, &key,
48290+ &node, 1);
48291+ } else if (coord->between == AT_UNIT) {
48292+ /*
48293+ * overwrite
48294+ * not optimal yet. Will be optimized if new write will show
48295+ * performance win.
48296+ */
48297+ init_coord_extension_extent(&uf_coord,
48298+ get_key_offset(&key));
48299+ result = overwrite_extent(&uf_coord, &key,
48300+ &node, 1, plugged_hole);
48301+ } else {
48302+ /*
48303+ * there are no items of this file in the tree yet. Create
48304+ * the first item of the file by inserting one unallocated
48305+ * extent of width 1
48306+ */
48307+ result = insert_first_extent(&uf_coord, &key, &node, 1, inode);
48308+ }
48309+ assert("", result == 1 || result < 0);
48310+ zrelse(loaded);
48311+ done_lh(&lh);
48312+ assert("", lock_counters()->d_refs == 0);
48313+ return (result == 1) ? 0 : result;
48314+}
48315+
48316+/**
48317+ * update_extents - update extent metadata for a range of pages
48318+ * @file: file written to
48319+ * @jnodes: array of jnodes of the written pages
48320+ * @count: number of jnodes (0 means expanding truncate)
48321+ * @pos: offset in the file
48322+ *
48323+ */
48324+static int update_extents(struct file *file, jnode **jnodes, int count, loff_t pos)
48325+{
48326+ struct inode *inode;
48327+ struct hint hint;
48328+ reiser4_key key;
48329+ int result;
48330+ znode *loaded;
48331+
48332+ result = load_file_hint(file, &hint);
48333+ BUG_ON(result != 0);
48334+
48335+ inode = file->f_dentry->d_inode;
48336+ if (count != 0)
48337+ /*
48338+ * count == 0 is special case: expanding truncate
48339+ */
48340+ pos = (loff_t)index_jnode(jnodes[0]) << PAGE_CACHE_SHIFT;
48341+ key_by_inode_and_offset_common(inode, pos, &key);
48342+
48343+ assert("", lock_counters()->d_refs == 0);
48344+
48345+ do {
48346+ result = find_file_item(&hint, &key, ZNODE_WRITE_LOCK, inode);
48347+ if (IS_CBKERR(result)) {
48348+ assert("", lock_counters()->d_refs == 0);
48349+ return result;
48350+ }
48351+
48352+ result = zload(hint.ext_coord.coord.node);
48353+ BUG_ON(result != 0);
48354+ loaded = hint.ext_coord.coord.node;
48355+
48356+ if (hint.ext_coord.coord.between == AFTER_UNIT) {
48357+ /*
48358+ * append existing extent item with unallocated extent
48359+ * of width nr_jnodes
48360+ */
48361+ if (hint.ext_coord.valid == 0)
48362+ /* NOTE: get statistics on this */
48363+ init_coord_extension_extent(&hint.ext_coord,
48364+ get_key_offset(&key));
48365+ result = append_last_extent(&hint.ext_coord, &key,
48366+ jnodes, count);
48367+ } else if (hint.ext_coord.coord.between == AT_UNIT) {
48368+ /*
48369+ * overwrite
48370+ * not optimal yet. Will be optimized if new write will
48371+ * show performance win.
48372+ */
48373+ if (hint.ext_coord.valid == 0)
48374+ /* NOTE: get statistics on this */
48375+ init_coord_extension_extent(&hint.ext_coord,
48376+ get_key_offset(&key));
48377+ result = overwrite_extent(&hint.ext_coord, &key,
48378+ jnodes, count, NULL);
48379+ } else {
48380+ /*
48381+ * there are no items of this file in the tree
48382+ * yet. Create the first item of the file by inserting one
48383+ * unallocated extent of width nr_jnodes
48384+ */
48385+ result = insert_first_extent(&hint.ext_coord, &key,
48386+ jnodes, count, inode);
48387+ }
48388+ zrelse(loaded);
48389+ if (result < 0) {
48390+ done_lh(hint.ext_coord.lh);
48391+ break;
48392+ }
48393+
48394+ jnodes += result;
48395+ count -= result;
48396+ set_key_offset(&key, get_key_offset(&key) + result * PAGE_CACHE_SIZE);
48397+
48398+ /* seal and unlock znode */
48399+ if (hint.ext_coord.valid)
48400+ set_hint(&hint, &key, ZNODE_WRITE_LOCK);
48401+ else
48402+ unset_hint(&hint);
48403+
48404+ } while (count > 0);
48405+
48406+ save_file_hint(file, &hint);
48407+ assert("", lock_counters()->d_refs == 0);
48408+ return result;
48409+}
48410+
48411+/**
48412+ * write_extent_reserve_space - reserve space for extent write operation
48413+ * @inode: inode of the file being written
48414+ *
48415+ * Estimates and reserves space which may be required for writing
48416+ * WRITE_GRANULARITY pages of file.
48417+ */
48418+static int write_extent_reserve_space(struct inode *inode)
48419+{
48420+ __u64 count;
48421+ reiser4_tree *tree;
48422+
48423+ /*
48424+ * to write WRITE_GRANULARITY pages to a file by extents we have to
48425+ * reserve disk space for:
48426+ *
48427+ * 1. find_file_item may have to insert an empty node into the tree
48428+ * (an empty leaf node between two extent items). This requires 1
48429+ * block plus the number of blocks needed to insert an internal
48430+ * item into the twig level.
48431+ *
48432+ * 2. each written page may need 1 block plus the number of blocks
48433+ * needed to insert into, or paste to, an extent item.
48434+ *
48435+ * 3. stat data update
48436+ */
48438+ tree = tree_by_inode(inode);
48439+ count = estimate_one_insert_item(tree) +
48440+ WRITE_GRANULARITY * (1 + estimate_one_insert_into_item(tree)) +
48441+ estimate_one_insert_item(tree);
48442+ grab_space_enable();
48443+ return reiser4_grab_space(count, 0 /* flags */);
48444+}
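+/*
+ * Editor's note: the reservation above spelled out with made-up numbers.
+ * estimate_one_insert_item() and estimate_one_insert_into_item() are
+ * tree-dependent; the values below are assumptions chosen only to make
+ * the formula concrete.
+ */
+#if 0
+#include <stdio.h>
+
+int main(void)
+{
+ const unsigned write_granularity = 32; /* pages per iteration (assumed) */
+ const unsigned insert_item = 5; /* assumed estimate */
+ const unsigned insert_into_item = 4; /* assumed estimate */
+
+ /* item insertion + per-page cost + stat data update */
+ unsigned count = insert_item +
+ write_granularity * (1 + insert_into_item) + insert_item;
+ printf("blocks to grab: %u\n", count); /* prints 170 */
+ return 0;
+}
+#endif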
48445+
48446+/**
48447+ * write_extent - write method of extent item plugin
48448+ * @file: file to write to
48449+ * @buf: address of user-space buffer
48450+ * @count: number of bytes to write
48451+ * @pos: position in file to write to
48452+ *
48453+ */
48454+ssize_t write_extent(struct file *file, const char __user *buf, size_t count,
48455+ loff_t *pos)
48456+{
48457+ int have_to_update_extent;
48458+ int nr_pages;
48459+ struct page *page;
48460+ jnode *jnodes[WRITE_GRANULARITY + 1];
48461+ struct inode *inode;
48462+ unsigned long index;
48463+ unsigned long end;
48464+ int i;
48465+ int to_page, page_off;
48466+ size_t left, written;
48467+ int result;
48468+
48469+ inode = file->f_dentry->d_inode;
48470+ if (write_extent_reserve_space(inode))
48471+ return RETERR(-ENOSPC);
48472+
48473+ if (count == 0) {
48474+ /* truncate case */
48475+ update_extents(file, jnodes, 0, *pos);
48476+ return 0;
48477+ }
48478+
48479+ BUG_ON(get_current_context()->trans->atom != NULL);
48480+
48481+ index = *pos >> PAGE_CACHE_SHIFT;
48482+ /* calculate number of pages which are to be written */
48483+ end = ((*pos + count - 1) >> PAGE_CACHE_SHIFT);
48484+ nr_pages = end - index + 1;
48485+ assert("", nr_pages <= WRITE_GRANULARITY + 1);
48486+
48487+ /* get pages and jnodes */
48488+ for (i = 0; i < nr_pages; i ++) {
48489+ page = find_or_create_page(inode->i_mapping, index + i, get_gfp_mask());
48490+ if (page == NULL) {
48491+ while(i --) {
48492+ unlock_page(jnode_page(jnodes[i]));
48493+ page_cache_release(jnode_page(jnodes[i]));
48494+ }
48495+ return RETERR(-ENOMEM);
48496+ }
48497+
48498+ jnodes[i] = jnode_of_page(page);
48499+ if (IS_ERR(jnodes[i])) {
48500+ unlock_page(page);
48501+ page_cache_release(page);
48502+ while (i --) {
48503+ jput(jnodes[i]);
48504+ page_cache_release(jnode_page(jnodes[i]));
48505+ }
48506+ return RETERR(-ENOMEM);
48507+ }
48508+ /* prevent jnode and page from disconnecting */
48509+ JF_SET(jnodes[i], JNODE_WRITE_PREPARED);
48510+ unlock_page(page);
48511+ }
48512+
48513+ BUG_ON(get_current_context()->trans->atom != NULL);
48514+
48515+ have_to_update_extent = 0;
48516+
48517+ left = count;
48518+ page_off = (*pos & (PAGE_CACHE_SIZE - 1));
48519+ for (i = 0; i < nr_pages; i ++) {
48520+ to_page = PAGE_CACHE_SIZE - page_off;
48521+ if (to_page > left)
48522+ to_page = left;
48523+ page = jnode_page(jnodes[i]);
48524+ if (((loff_t)page->index << PAGE_CACHE_SHIFT) < inode->i_size &&
48525+ !PageUptodate(page) && to_page != PAGE_CACHE_SIZE) {
48526+ /*
48527+ * the above is not optimal for a partial write to the
48528+ * last page of the file when the file size is not at a
48529+ * page boundary
48530+ */
48531+ lock_page(page);
48532+ if (!PageUptodate(page)) {
48533+ result = readpage_unix_file(NULL, page);
48534+ BUG_ON(result != 0);
48535+ /* wait for read completion */
48536+ lock_page(page);
48537+ BUG_ON(!PageUptodate(page));
48538+ unlock_page(page);
48539+ } else
48540+ result = 0;
48541+ }
48542+
48543+ BUG_ON(get_current_context()->trans->atom != NULL);
48544+ fault_in_pages_readable(buf, to_page);
48545+ BUG_ON(get_current_context()->trans->atom != NULL);
48546+
48547+ lock_page(page);
48548+ if (!PageUptodate(page) && to_page != PAGE_CACHE_SIZE) {
48549+ void *kaddr;
48550+
48551+ kaddr = kmap_atomic(page, KM_USER0);
48552+ memset(kaddr, 0, page_off);
48553+ memset(kaddr + page_off + to_page, 0,
48554+ PAGE_CACHE_SIZE - (page_off + to_page));
48555+ flush_dcache_page(page);
48556+ kunmap_atomic(kaddr, KM_USER0);
48557+ }
48558+
48559+ written = filemap_copy_from_user(page, page_off, buf, to_page);
48560+ if (written != to_page) {
48561+ unlock_page(page);
48562+ page_cache_release(page);
48563+ nr_pages = i;
48564+ jput(jnodes[i]);
48565+ result = RETERR(-EFAULT);
48566+ break;
48567+ }
48568+ flush_dcache_page(page);
48569+ set_page_dirty_internal(page);
48570+ unlock_page(page);
48571+ mark_page_accessed(page);
48572+ SetPageUptodate(page);
48573+ page_cache_release(page);
48574+
48575+ if (jnodes[i]->blocknr == 0)
48576+ have_to_update_extent ++;
48577+
48578+ page_off = 0;
48579+ buf += to_page;
48580+ left -= to_page;
48581+ BUG_ON(get_current_context()->trans->atom != NULL);
48582+ }
48583+
48584+ if (have_to_update_extent) {
48585+ update_extents(file, jnodes, nr_pages, *pos);
48586+ } else {
48587+ for (i = 0; i < nr_pages; i ++) {
48588+ spin_lock_jnode(jnodes[i]);
48589+ result = try_capture(jnodes[i], ZNODE_WRITE_LOCK, 0);
48590+ BUG_ON(result != 0);
48591+ jnode_make_dirty_locked(jnodes[i]);
48592+ spin_unlock_jnode(jnodes[i]);
48593+ }
48594+ }
48595+
48596+ for (i = 0; i < nr_pages; i ++) {
48597+ JF_CLR(jnodes[i], JNODE_WRITE_PREPARED);
48598+ jput(jnodes[i]);
48599+ }
48600+
48601+ /* the only error handled so far is EFAULT on copy_from_user */
48602+ return (count - left) ? (count - left) : -EFAULT;
48603+}
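+/*
+ * Editor's note: a user-space sketch of the page-splitting arithmetic in
+ * the copy loop of write_extent() above (page_off / to_page / left).
+ * 4096-byte pages and the sample write are assumptions for illustration.
+ */
+#if 0
+#include <stdio.h>
+#include <stdint.h>
+
+int main(void)
+{
+ const uint64_t page_size = 4096;
+ uint64_t pos = 5000, count = 10000; /* write 10000 bytes at 5000 */
+ uint64_t left = count;
+ uint64_t page_off = pos & (page_size - 1);
+
+ while (left > 0) {
+ uint64_t to_page = page_size - page_off;
+ if (to_page > left)
+ to_page = left;
+ printf("copy %llu bytes at page offset %llu\n",
+ (unsigned long long)to_page,
+ (unsigned long long)page_off);
+ left -= to_page;
+ page_off = 0; /* later pages are written from offset 0 */
+ }
+ return 0; /* prints 3192, 4096, 2712 */
+}
+#endif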
48604+
48605+static inline void zero_page(struct page *page)
48606+{
48607+ char *kaddr = kmap_atomic(page, KM_USER0);
48608+
48609+ memset(kaddr, 0, PAGE_CACHE_SIZE);
48610+ flush_dcache_page(page);
48611+ kunmap_atomic(kaddr, KM_USER0);
48612+ SetPageUptodate(page);
48613+ unlock_page(page);
48614+}
48615+
48616+static int
48617+do_readpage_extent(reiser4_extent * ext, reiser4_block_nr pos,
48618+ struct page *page)
48619+{
48620+ jnode *j;
48621+ struct address_space *mapping;
48622+ unsigned long index;
48623+ oid_t oid;
48624+ reiser4_block_nr block;
48625+
48626+ mapping = page->mapping;
48627+ oid = get_inode_oid(mapping->host);
48628+ index = page->index;
48629+
48630+ switch (state_of_extent(ext)) {
48631+ case HOLE_EXTENT:
48632+ /*
48633+ * it is possible for a hole page to have a jnode if the
48634+ * page was eflushed previously.
48635+ */
48636+ j = jfind(mapping, index);
48637+ if (j == NULL) {
48638+ zero_page(page);
48639+ return 0;
48640+ }
48641+ spin_lock_jnode(j);
48642+ if (!jnode_page(j)) {
48643+ jnode_attach_page(j, page);
48644+ } else {
48645+ BUG_ON(jnode_page(j) != page);
48646+ assert("vs-1504", jnode_page(j) == page);
48647+ }
48648+ block = *jnode_get_io_block(j);
48649+ spin_unlock_jnode(j);
48650+ if (block == 0) {
48651+ zero_page(page);
48652+ jput(j);
48653+ return 0;
48654+ }
48655+ break;
48656+
48657+ case ALLOCATED_EXTENT:
48658+ j = jnode_of_page(page);
48659+ if (IS_ERR(j))
48660+ return PTR_ERR(j);
48661+ if (*jnode_get_block(j) == 0) {
48662+ reiser4_block_nr blocknr;
48663+
48664+ blocknr = extent_get_start(ext) + pos;
48665+ jnode_set_block(j, &blocknr);
48666+ } else
48667+ assert("vs-1403",
48668+ j->blocknr == extent_get_start(ext) + pos);
48669+ break;
48670+
48671+ case UNALLOCATED_EXTENT:
48672+ j = jfind(mapping, index);
48673+ assert("nikita-2688", j);
48674+ assert("vs-1426", jnode_page(j) == NULL);
48675+
48676+ spin_lock_jnode(j);
48677+ jnode_attach_page(j, page);
48678+ spin_unlock_jnode(j);
48679+ break;
48680+
48681+ default:
48682+ warning("vs-957", "wrong extent\n");
48683+ return RETERR(-EIO);
48684+ }
48685+
48686+ BUG_ON(j == 0);
48687+ page_io(page, j, READ, get_gfp_mask());
48688+ jput(j);
48689+ return 0;
48690+}
48691+
48692+static int
48693+move_coord_pages(coord_t * coord, extent_coord_extension_t * ext_coord,
48694+ unsigned count)
48695+{
48696+ reiser4_extent *ext;
48697+
48698+ ext_coord->expected_page += count;
48699+
48700+ ext = ext_by_offset(coord->node, ext_coord->ext_offset);
48701+
48702+ do {
48703+ if (ext_coord->pos_in_unit + count < ext_coord->width) {
48704+ ext_coord->pos_in_unit += count;
48705+ break;
48706+ }
48707+
48708+ if (coord->unit_pos == ext_coord->nr_units - 1) {
48709+ coord->between = AFTER_UNIT;
48710+ return 1;
48711+ }
48712+
48713+ /* shift to next unit */
48714+ count -= (ext_coord->width - ext_coord->pos_in_unit);
48715+ coord->unit_pos++;
48716+ ext_coord->pos_in_unit = 0;
48717+ ext_coord->ext_offset += sizeof(reiser4_extent);
48718+ ext++;
48719+ ON_DEBUG(ext_coord->extent = *ext);
48720+ ext_coord->width = extent_get_width(ext);
48721+ } while (1);
48722+
48723+ return 0;
48724+}
48725+
48726+static int readahead_readpage_extent(void *vp, struct page *page)
48727+{
48728+ int result;
48729+ uf_coord_t *uf_coord;
48730+ coord_t *coord;
48731+ extent_coord_extension_t *ext_coord;
48732+
48733+ uf_coord = vp;
48734+ coord = &uf_coord->coord;
48735+
48736+ if (coord->between != AT_UNIT) {
48737+ unlock_page(page);
48738+ return RETERR(-EINVAL);
48739+ }
48740+
48741+ ext_coord = &uf_coord->extension.extent;
48742+ if (ext_coord->expected_page != page->index) {
48743+ /* read_cache_pages skipped a few pages. Try to adjust the coord to the page */
48744+ assert("vs-1269", page->index > ext_coord->expected_page);
48745+ if (move_coord_pages
48746+ (coord, ext_coord,
48747+ page->index - ext_coord->expected_page)) {
48748+ /* extent pointing to this page is not here */
48749+ unlock_page(page);
48750+ return RETERR(-EINVAL);
48751+ }
48752+
48753+ assert("vs-1274", offset_is_in_unit(coord,
48754+ (loff_t) page->
48755+ index << PAGE_CACHE_SHIFT));
48756+ ext_coord->expected_page = page->index;
48757+ }
48758+
48759+ assert("vs-1281", page->index == ext_coord->expected_page);
48760+ result =
48761+ do_readpage_extent(ext_by_ext_coord(uf_coord),
48762+ ext_coord->pos_in_unit, page);
48763+ if (!result)
48764+ move_coord_pages(coord, ext_coord, 1);
48765+ return result;
48766+}
48767+
48768+static int move_coord_forward(uf_coord_t *ext_coord)
48769+{
48770+ coord_t *coord;
48771+ extent_coord_extension_t *extension;
48772+
48773+ check_uf_coord(ext_coord, NULL);
48774+
48775+ extension = &ext_coord->extension.extent;
48776+ extension->pos_in_unit++;
48777+ if (extension->pos_in_unit < extension->width)
48778+ /* stay within the same extent unit */
48779+ return 0;
48780+
48781+ coord = &ext_coord->coord;
48782+
48783+ /* try to move to the next extent unit */
48784+ coord->unit_pos++;
48785+ if (coord->unit_pos < extension->nr_units) {
48786+ /* went to the next extent unit */
48787+ reiser4_extent *ext;
48788+
48789+ extension->pos_in_unit = 0;
48790+ extension->ext_offset += sizeof(reiser4_extent);
48791+ ext = ext_by_offset(coord->node, extension->ext_offset);
48792+ ON_DEBUG(extension->extent = *ext);
48793+ extension->width = extent_get_width(ext);
48794+ return 0;
48795+ }
48796+
48797+ /* there are no more units in the item */
48798+ return 1;
48799+}
48800+
48801+/* this is called by read_cache_pages for each of readahead pages */
48802+static int extent_readpage_filler(void *data, struct page *page)
48803+{
48804+ hint_t *hint;
48805+ loff_t offset;
48806+ reiser4_key key;
48807+ uf_coord_t *ext_coord;
48808+ int result;
48809+
48810+ offset = (loff_t) page->index << PAGE_CACHE_SHIFT;
48811+ key_by_inode_and_offset_common(page->mapping->host, offset, &key);
48812+
48813+ hint = (hint_t *) data;
48814+ ext_coord = &hint->ext_coord;
48815+
48816+ BUG_ON(PageUptodate(page));
48817+ unlock_page(page);
48818+
48819+ if (hint_validate(hint, &key, 1 /* check key */ , ZNODE_READ_LOCK) != 0) {
48820+ result = coord_by_key(current_tree, &key, &ext_coord->coord,
48821+ ext_coord->lh, ZNODE_READ_LOCK,
48822+ FIND_EXACT, TWIG_LEVEL,
48823+ TWIG_LEVEL, CBK_UNIQUE, NULL);
48824+ if (result != CBK_COORD_FOUND) {
48825+ unset_hint(hint);
48826+ return result;
48827+ }
48828+ ext_coord->valid = 0;
48829+ }
48830+
48831+ if (zload(ext_coord->coord.node)) {
48832+ unset_hint(hint);
48833+ return RETERR(-EIO);
48834+ }
48835+ if (!item_is_extent(&ext_coord->coord)) {
48836+ /* tail conversion is running in parallel */
48837+ zrelse(ext_coord->coord.node);
48838+ unset_hint(hint);
48839+ return RETERR(-EIO);
48840+ }
48841+
48842+ if (ext_coord->valid == 0)
48843+ init_coord_extension_extent(ext_coord, offset);
48844+
48845+ check_uf_coord(ext_coord, &key);
48846+
48847+ lock_page(page);
48848+ if (!PageUptodate(page)) {
48849+ result = do_readpage_extent(ext_by_ext_coord(ext_coord),
48850+ ext_coord->extension.extent.
48851+ pos_in_unit, page);
48852+ if (result)
48853+ unlock_page(page);
48854+ } else {
48855+ unlock_page(page);
48856+ result = 0;
48857+ }
48858+ if (!result && move_coord_forward(ext_coord) == 0) {
48859+ set_key_offset(&key, offset + PAGE_CACHE_SIZE);
48860+ set_hint(hint, &key, ZNODE_READ_LOCK);
48861+ } else
48862+ unset_hint(hint);
48863+ zrelse(ext_coord->coord.node);
48864+ return result;
48865+}
48866+
48867+/* this is called by reiser4_readpages */
48868+static void
48869+extent_readpages_hook(struct address_space *mapping, struct list_head *pages,
48870+ void *data)
48871+{
48872+ /* FIXME: try whether having reiser4_read_cache_pages improves anything */
48873+ read_cache_pages(mapping, pages, extent_readpage_filler, data);
48874+}
48875+
48876+static int
48877+call_page_cache_readahead(struct address_space *mapping, struct file *file,
48878+ hint_t * hint,
48879+ unsigned long page_nr,
48880+ unsigned long ra_pages, struct file_ra_state *ra)
48881+{
48882+ reiser4_file_fsdata *fsdata;
48883+ int result;
48884+
48885+ fsdata = reiser4_get_file_fsdata(file);
48886+ if (IS_ERR(fsdata))
48887+ return page_nr;
48888+ fsdata->ra2.data = hint;
48889+ fsdata->ra2.readpages = extent_readpages_hook;
48890+
48891+ result = page_cache_readahead(mapping, ra, file, page_nr, ra_pages);
48892+ fsdata->ra2.readpages = NULL;
48893+ return result;
48894+}
48895+
48896+/* this is called when readahead did not read the page */
48897+static int call_readpage(struct file *file, struct page *page)
48898+{
48899+ int result;
48900+
48901+ result = readpage_unix_file_nolock(file, page);
48902+ if (result)
48903+ return result;
48904+
48905+ lock_page(page);
48906+ if (!PageUptodate(page)) {
48907+ unlock_page(page);
48908+ page_detach_jnode(page, page->mapping, page->index);
48909+ warning("jmacd-97178", "page is not up to date");
48910+ return RETERR(-EIO);
48911+ }
48912+ unlock_page(page);
48913+ return 0;
48914+}
48915+
48916+static int filler(void *vp, struct page *page)
48917+{
48918+ return readpage_unix_file_nolock(vp, page);
48919+}
48920+
48921+/* Implements plugin->u.item.s.file.read operation for extent items. */
48922+int read_extent(struct file *file, flow_t *flow, hint_t *hint)
48923+{
48924+ int result;
48925+ struct page *page;
48926+ unsigned long cur_page, next_page;
48927+ unsigned long page_off, count;
48928+ struct address_space *mapping;
48929+ loff_t file_off;
48930+ uf_coord_t *uf_coord;
48931+ coord_t *coord;
48932+ extent_coord_extension_t *ext_coord;
48933+ unsigned long nr_pages, prev_page;
48934+ struct file_ra_state ra;
48935+ char *kaddr;
48936+
48937+ assert("vs-1353", current_blocksize == PAGE_CACHE_SIZE);
48938+ assert("vs-572", flow->user == 1);
48939+ assert("vs-1351", flow->length > 0);
48940+
48941+ uf_coord = &hint->ext_coord;
48942+
48943+ check_uf_coord(uf_coord, NULL);
48944+ assert("vs-33", uf_coord->lh == &hint->lh);
48945+
48946+ coord = &uf_coord->coord;
48947+ assert("vs-1119", znode_is_rlocked(coord->node));
48948+ assert("vs-1120", znode_is_loaded(coord->node));
48949+ assert("vs-1256", coord_matches_key_extent(coord, &flow->key));
48950+
48951+ mapping = file->f_dentry->d_inode->i_mapping;
48952+ ext_coord = &uf_coord->extension.extent;
48953+
48954+ /* offset in a file to start read from */
48955+ file_off = get_key_offset(&flow->key);
48956+ /* offset within the page to start read from */
48957+ page_off = (unsigned long)(file_off & (PAGE_CACHE_SIZE - 1));
48958+ /* bytes which can be read from the page which contains file_off */
48959+ count = PAGE_CACHE_SIZE - page_off;
48960+
48961+ /* index of the page containing the offset the read starts from */
48962+ cur_page = (unsigned long)(file_off >> PAGE_CACHE_SHIFT);
48963+ next_page = cur_page;
48964+ /* number of pages flow spans over */
48965+ nr_pages =
48966+ ((file_off + flow->length + PAGE_CACHE_SIZE -
48967+ 1) >> PAGE_CACHE_SHIFT) - cur_page;
48968+
48969+ /* we start with the twig node read locked. However, we do not want
48970+ to keep that lock for the whole time readahead works. So, set a
48971+ seal and release the twig node. */
48972+ set_hint(hint, &flow->key, ZNODE_READ_LOCK);
48973+ /* &hint->lh is done-ed */
48974+
48975+ ra = file->f_ra;
48976+ prev_page = ra.prev_page;
48977+ do {
48978+ txn_restart_current();
48979+ if (next_page == cur_page)
48980+ next_page =
48981+ call_page_cache_readahead(mapping, file, hint,
48982+ cur_page, nr_pages, &ra);
48983+
48984+ page = find_get_page(mapping, cur_page);
48985+ if (unlikely(page == NULL)) {
48986+ handle_ra_miss(mapping, &ra, cur_page);
48987+ page = read_cache_page(mapping, cur_page, filler, file);
48988+ if (IS_ERR(page))
48989+ return PTR_ERR(page);
48990+ lock_page(page);
48991+ if (!PageUptodate(page)) {
48992+ unlock_page(page);
48993+ page_detach_jnode(page, mapping, cur_page);
48994+ page_cache_release(page);
48995+ warning("jmacd-97178",
48996+ "extent_read: page is not up to date");
48997+ return RETERR(-EIO);
48998+ }
48999+ unlock_page(page);
49000+ } else {
49001+ if (!PageUptodate(page)) {
49002+ lock_page(page);
49003+
49004+ assert("", page->mapping == mapping);
49005+ if (PageUptodate(page))
49006+ unlock_page(page);
49007+ else {
49008+ result = call_readpage(file, page);
49009+ if (result) {
49010+ page_cache_release(page);
49011+ return RETERR(result);
49012+ }
49013+ }
49014+ }
49015+ if (prev_page != cur_page)
49016+ mark_page_accessed(page);
49017+ prev_page = cur_page;
49018+ }
49019+
49020+ /* If users can be writing to this page using arbitrary virtual
49021+ addresses, take care about potential aliasing before reading
49022+ the page on the kernel side.
49023+ */
49024+ if (mapping_writably_mapped(mapping))
49025+ flush_dcache_page(page);
49026+
49027+ assert("nikita-3034", schedulable());
49028+
49029+ /* number of bytes which are to be read from the page */
49030+ if (count > flow->length)
49031+ count = flow->length;
49032+
49033+ result = fault_in_pages_writeable(flow->data, count);
49034+ if (result) {
49035+ page_cache_release(page);
49036+ return RETERR(-EFAULT);
49037+ }
49038+
49039+ kaddr = kmap_atomic(page, KM_USER0);
49040+ result = __copy_to_user_inatomic(flow->data,
49041+ kaddr + page_off, count);
49042+ kunmap_atomic(kaddr, KM_USER0);
49043+ if (result != 0) {
49044+ kaddr = kmap(page);
49045+ result = __copy_to_user(flow->data, kaddr + page_off, count);
49046+ kunmap(page);
49047+ if (unlikely(result))
49048+ return RETERR(-EFAULT);
49049+ }
49050+
49051+ page_cache_release(page);
49052+
49053+ /* increase key (flow->key), update user area pointer (flow->data) */
49054+ move_flow_forward(flow, count);
49055+
49056+ page_off = 0;
49057+ cur_page ++;
49058+ count = PAGE_CACHE_SIZE;
49059+ nr_pages--;
49060+ } while (flow->length);
49061+
49062+ file->f_ra = ra;
49063+ return 0;
49064+}
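+/*
+ * Editor's note: the page-span arithmetic from the top of read_extent()
+ * as a standalone sketch. 4096-byte pages and the sample flow are
+ * assumptions for illustration.
+ */
+#if 0
+#include <stdio.h>
+#include <stdint.h>
+
+int main(void)
+{
+ const unsigned shift = 12; /* page shift for 4096-byte pages */
+ const uint64_t page_size = 1ull << shift;
+ uint64_t file_off = 6000, length = 9000;
+
+ uint64_t page_off = file_off & (page_size - 1); /* 1904 */
+ uint64_t cur_page = file_off >> shift; /* 1 */
+ uint64_t nr_pages =
+ ((file_off + length + page_size - 1) >> shift) - cur_page;
+ printf("start in page %llu at offset %llu, spanning %llu pages\n",
+ (unsigned long long)cur_page,
+ (unsigned long long)page_off,
+ (unsigned long long)nr_pages); /* 1, 1904, 3 */
+ return 0;
+}
+#endif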
49065+
49066+/*
49067+ plugin->u.item.s.file.readpages
49068+*/
49069+void
49070+readpages_extent(void *vp, struct address_space *mapping,
49071+ struct list_head *pages)
49072+{
49073+ assert("vs-1739", 0);
49074+ if (vp)
49075+ read_cache_pages(mapping, pages, readahead_readpage_extent, vp);
49076+}
49077+
49078+/*
49079+ plugin->s.file.readpage
49080+ reiser4_read->unix_file_read->page_cache_readahead->reiser4_readpage->unix_file_readpage->extent_readpage
49081+ or
49082+ filemap_nopage->reiser4_readpage->readpage_unix_file->readpage_extent
49083+
49084+ At the beginning: coord->node is read locked, zloaded, page is
49085+ locked, coord is set to an existing unit inside the extent item (the coord need not match page->index)
49086+*/
49087+int readpage_extent(void *vp, struct page *page)
49088+{
49089+ uf_coord_t *uf_coord = vp;
49090+ ON_DEBUG(coord_t * coord = &uf_coord->coord);
49091+ ON_DEBUG(reiser4_key key);
49092+
49093+ assert("vs-1040", PageLocked(page));
49094+ assert("vs-1050", !PageUptodate(page));
49095+ assert("vs-1039", page->mapping && page->mapping->host);
49096+
49097+ assert("vs-1044", znode_is_loaded(coord->node));
49098+ assert("vs-758", item_is_extent(coord));
49099+ assert("vs-1046", coord_is_existing_unit(coord));
49100+ assert("vs-1045", znode_is_rlocked(coord->node));
49101+ assert("vs-1047",
49102+ page->mapping->host->i_ino ==
49103+ get_key_objectid(item_key_by_coord(coord, &key)));
49104+ check_uf_coord(uf_coord, NULL);
49105+
49106+ return do_readpage_extent(ext_by_ext_coord(uf_coord),
49107+ uf_coord->extension.extent.pos_in_unit, page);
49108+}
49109+
49110+/**
49111+ * get_block_address_extent - map a logical block to a disk block
49112+ * @coord: coord of an extent unit
49113+ * @block: logical block number within the file
49114+ * @result: resulting disk block (0 for hole and unallocated extents)
49115+ *
49116+ */
49118+int get_block_address_extent(const coord_t *coord, sector_t block,
49119+ sector_t *result)
49120+{
49121+ reiser4_extent *ext;
49122+
49123+ if (!coord_is_existing_unit(coord))
49124+ return RETERR(-EINVAL);
49125+
49126+ ext = extent_by_coord(coord);
49127+
49128+ if (state_of_extent(ext) != ALLOCATED_EXTENT)
49129+ /* FIXME: bad things may happen if it is unallocated extent */
49130+ *result = 0;
49131+ else {
49132+ reiser4_key key;
49133+
49134+ unit_key_by_coord(coord, &key);
49135+ assert("vs-1645",
49136+ block >= get_key_offset(&key) >> current_blocksize_bits);
49137+ assert("vs-1646",
49138+ block <
49139+ (get_key_offset(&key) >> current_blocksize_bits) +
49140+ extent_get_width(ext));
49141+ *result =
49142+ extent_get_start(ext) + (block -
49143+ (get_key_offset(&key) >>
49144+ current_blocksize_bits));
49145+ }
49146+ return 0;
49147+}
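+/*
+ * Editor's note: a minimal model of the mapping above. An allocated unit
+ * starting at disk block @start that covers file blocks [first, first + width)
+ * maps file block @block to start + (block - first). Sample values are
+ * assumptions.
+ */
+#if 0
+#include <stdio.h>
+#include <stdint.h>
+
+int main(void)
+{
+ uint64_t start = 1000; /* extent_get_start(ext) */
+ uint64_t first = 16; /* key offset >> blocksize bits */
+ uint64_t width = 8; /* extent_get_width(ext) */
+ uint64_t block = 19; /* logical block being mapped */
+
+ if (block >= first && block < first + width)
+ printf("disk block %llu\n",
+ (unsigned long long)(start + (block - first)));
+ return 0; /* prints 1003 */
+}
+#endif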
49148+
49149+/*
49150+ plugin->u.item.s.file.append_key
49151+ key of the first byte which is next after the last byte addressed by this extent
49152+*/
49153+reiser4_key *append_key_extent(const coord_t * coord, reiser4_key * key)
49154+{
49155+ item_key_by_coord(coord, key);
49156+ set_key_offset(key,
49157+ get_key_offset(key) + extent_size(coord,
49158+ nr_units_extent
49159+ (coord)));
49160+
49161+ assert("vs-610", get_key_offset(key)
49162+ && (get_key_offset(key) & (current_blocksize - 1)) == 0);
49163+ return key;
49164+}
49165+
49166+/* plugin->u.item.s.file.init_coord_extension */
49167+void init_coord_extension_extent(uf_coord_t * uf_coord, loff_t lookuped)
49168+{
49169+ coord_t *coord;
49170+ extent_coord_extension_t *ext_coord;
49171+ reiser4_key key;
49172+ loff_t offset;
49173+
49174+ assert("vs-1295", uf_coord->valid == 0);
49175+
49176+ coord = &uf_coord->coord;
49177+ assert("vs-1288", coord_is_iplug_set(coord));
49178+ assert("vs-1327", znode_is_loaded(coord->node));
49179+
49180+ if (coord->between != AFTER_UNIT && coord->between != AT_UNIT)
49181+ return;
49182+
49183+ ext_coord = &uf_coord->extension.extent;
49184+ ext_coord->nr_units = nr_units_extent(coord);
49185+ ext_coord->ext_offset =
49186+ (char *)extent_by_coord(coord) - zdata(coord->node);
49187+ ext_coord->width = extent_get_width(extent_by_coord(coord));
49188+ ON_DEBUG(ext_coord->extent = *extent_by_coord(coord));
49189+ uf_coord->valid = 1;
49190+
49191+ /* pos_in_unit is the only uninitialized field in extended coord */
49192+ if (coord->between == AFTER_UNIT) {
49193+ assert("vs-1330",
49194+ coord->unit_pos == nr_units_extent(coord) - 1);
49195+
49196+ ext_coord->pos_in_unit = ext_coord->width - 1;
49197+ } else {
49198+ /* AT_UNIT */
49199+ unit_key_by_coord(coord, &key);
49200+ offset = get_key_offset(&key);
49201+
49202+ assert("vs-1328", offset <= lookuped);
49203+ assert("vs-1329",
49204+ lookuped <
49205+ offset + ext_coord->width * current_blocksize);
49206+ ext_coord->pos_in_unit =
49207+ ((lookuped - offset) >> current_blocksize_bits);
49208+ }
49209+}
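+/*
+ * Editor's note: the AT_UNIT pos_in_unit computation above in isolation.
+ * For a unit whose first byte has key offset @unit_off, the looked-up
+ * offset selects block (lookuped - unit_off) >> blocksize_bits within
+ * the unit. Sample values are assumptions.
+ */
+#if 0
+#include <stdio.h>
+#include <stdint.h>
+
+int main(void)
+{
+ const uint64_t blocksize_bits = 12; /* 4096-byte blocks */
+ uint64_t unit_off = 65536; /* offset of the unit's first byte */
+ uint64_t lookuped = 77000; /* offset the lookup was done for */
+
+ printf("pos_in_unit: %llu\n",
+ (unsigned long long)((lookuped - unit_off) >> blocksize_bits));
+ return 0; /* prints 2 */
+}
+#endif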
49210+
49211+/*
49212+ * Local variables:
49213+ * c-indentation-style: "K&R"
49214+ * mode-name: "LC"
49215+ * c-basic-offset: 8
49216+ * tab-width: 8
49217+ * fill-column: 79
49218+ * scroll-step: 1
49219+ * End:
49220+ */
49221Index: linux-2.6.16/fs/reiser4/plugin/item/extent_flush_ops.c
49222===================================================================
49223--- /dev/null
49224+++ linux-2.6.16/fs/reiser4/plugin/item/extent_flush_ops.c
49225@@ -0,0 +1,1018 @@
49226+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
49227+
49228+#include "item.h"
49229+#include "../../tree.h"
49230+#include "../../jnode.h"
49231+#include "../../super.h"
49232+#include "../../flush.h"
49233+#include "../../carry.h"
49234+#include "../object.h"
49235+
49236+#include <linux/pagemap.h>
49237+
49238+static reiser4_block_nr extent_unit_start(const coord_t * item);
49239+
49240+/* Return either first or last extent (depending on @side) of the item
49241+ @coord is set to. Set @pos_in_unit either to first or to last block
49242+ of extent. */
49243+static reiser4_extent *extent_utmost_ext(const coord_t * coord, sideof side,
49244+ reiser4_block_nr * pos_in_unit)
49245+{
49246+ reiser4_extent *ext;
49247+
49248+ if (side == LEFT_SIDE) {
49249+ /* get first extent of item */
49250+ ext = extent_item(coord);
49251+ *pos_in_unit = 0;
49252+ } else {
49253+ /* get last extent of item and last position within it */
49254+ assert("vs-363", side == RIGHT_SIDE);
49255+ ext = extent_item(coord) + coord_last_unit_pos(coord);
49256+ *pos_in_unit = extent_get_width(ext) - 1;
49257+ }
49258+
49259+ return ext;
49260+}
49261+
49262+/* item_plugin->f.utmost_child */
49263+/* Return the child. Coord is set to extent item. Find jnode corresponding
49264+ either to first or to last unformatted node pointed by the item */
49265+int utmost_child_extent(const coord_t * coord, sideof side, jnode ** childp)
49266+{
49267+ reiser4_extent *ext;
49268+ reiser4_block_nr pos_in_unit;
49269+
49270+ ext = extent_utmost_ext(coord, side, &pos_in_unit);
49271+
49272+ switch (state_of_extent(ext)) {
49273+ case HOLE_EXTENT:
49274+ *childp = NULL;
49275+ return 0;
49276+ case ALLOCATED_EXTENT:
49277+ case UNALLOCATED_EXTENT:
49278+ break;
49279+ default:
49280+ /* this should never happen */
49281+ assert("vs-1417", 0);
49282+ }
49283+
49284+ {
49285+ reiser4_key key;
49286+ reiser4_tree *tree;
49287+ unsigned long index;
49288+
49289+ if (side == LEFT_SIDE) {
49290+ /* get key of first byte addressed by the extent */
49291+ item_key_by_coord(coord, &key);
49292+ } else {
49293+ /* get key of the byte which is next after the last byte addressed by the extent */
49294+ append_key_extent(coord, &key);
49295+ }
49296+
49297+ assert("vs-544",
49298+ (get_key_offset(&key) >> PAGE_CACHE_SHIFT) < ~0ul);
49299+ /* index of first or last (depending on @side) page addressed
49300+ by the extent */
49301+ index =
49302+ (unsigned long)(get_key_offset(&key) >> PAGE_CACHE_SHIFT);
49303+ if (side == RIGHT_SIDE)
49304+ index--;
49305+
49306+ tree = coord->node->zjnode.tree;
49307+ *childp = jlookup(tree, get_key_objectid(&key), index);
49308+ }
49309+
49310+ return 0;
49311+}
49312+
49313+/* item_plugin->f.utmost_child_real_block */
49314+/* Return the child's block, if allocated. */
49315+int
49316+utmost_child_real_block_extent(const coord_t * coord, sideof side,
49317+ reiser4_block_nr * block)
49318+{
49319+ reiser4_extent *ext;
49320+
49321+ ext = extent_by_coord(coord);
49322+
49323+ switch (state_of_extent(ext)) {
49324+ case ALLOCATED_EXTENT:
49325+ *block = extent_get_start(ext);
49326+ if (side == RIGHT_SIDE)
49327+ *block += extent_get_width(ext) - 1;
49328+ break;
49329+ case HOLE_EXTENT:
49330+ case UNALLOCATED_EXTENT:
49331+ *block = 0;
49332+ break;
49333+ default:
49334+ /* this should never happen */
49335+ assert("vs-1418", 0);
49336+ }
49337+
49338+ return 0;
49339+}
49340+
49341+/* item_plugin->f.scan */
49342+/* Performs leftward scanning starting from an unformatted node and its parent coordinate.
49343+ This scan continues, advancing the parent coordinate, until either it encounters a
49344+ formatted child or it finishes scanning this node.
49345+
49346+ If unallocated, the entire extent must be dirty and in the same atom. (Actually, I'm
49347+ not sure this last property (same atom) is enforced, but it should be the case since
49348+ one atom must write the parent and the others must read the parent, thus fusing?). In
49349+ any case, the code below asserts this for unallocated extents. Unallocated
49350+ extents are thus optimized because we can skip to the endpoint when scanning.
49351+
49352+ When scanning stops, control returns to the caller of scan_extent, which handles
49353+ these terminating conditions, e.g., by loading the next twig.
49354+*/
49355+int scan_extent(flush_scan * scan)
49356+{
49357+ coord_t coord;
49358+ jnode *neighbor;
49359+ unsigned long scan_index, unit_index, unit_width, scan_max, scan_dist;
49360+ reiser4_block_nr unit_start;
49361+ __u64 oid;
49362+ reiser4_key key;
49363+ int ret = 0, allocated, incr;
49364+ reiser4_tree *tree;
49365+
49366+ if (!JF_ISSET(scan->node, JNODE_DIRTY)) {
49367+ scan->stop = 1;
49368+ return 0; /* Race with truncate, this node is already
49369+ * truncated. */
49370+ }
49371+
49372+ coord_dup(&coord, &scan->parent_coord);
49373+
49374+ assert("jmacd-1404", !scan_finished(scan));
49375+ assert("jmacd-1405", jnode_get_level(scan->node) == LEAF_LEVEL);
49376+ assert("jmacd-1406", jnode_is_unformatted(scan->node));
49377+
49378+ /* The scan_index variable corresponds to the current page index of the
49379+ unformatted block scan position. */
49380+ scan_index = index_jnode(scan->node);
49381+
49382+ assert("jmacd-7889", item_is_extent(&coord));
49383+
49384+ repeat:
49385+ /* objectid of file */
49386+ oid = get_key_objectid(item_key_by_coord(&coord, &key));
49387+
49388+ allocated = !extent_is_unallocated(&coord);
49389+ /* Get the values of this extent unit: */
49390+ unit_index = extent_unit_index(&coord);
49391+ unit_width = extent_unit_width(&coord);
49392+ unit_start = extent_unit_start(&coord);
49393+
49394+ assert("jmacd-7187", unit_width > 0);
49395+ assert("jmacd-7188", scan_index >= unit_index);
49396+ assert("jmacd-7189", scan_index <= unit_index + unit_width - 1);
49397+
49398+ /* Depending on the scan direction, we set different maximum values for scan_index
49399+ (scan_max) and the number of nodes that would be passed if the scan goes the
49400+ entire way (scan_dist). Incr is an integer reflecting the incremental
49401+ direction of scan_index. */
49402+ if (scanning_left(scan)) {
49403+ scan_max = unit_index;
49404+ scan_dist = scan_index - unit_index;
49405+ incr = -1;
49406+ } else {
49407+ scan_max = unit_index + unit_width - 1;
49408+ scan_dist = scan_max - unit_index;
49409+ incr = +1;
49410+ }
49411+
49412+ tree = coord.node->zjnode.tree;
49413+
49414+ /* If the extent is allocated we have to check each of its blocks. If the extent
49415+ is unallocated we can skip to the scan_max. */
49416+ if (allocated) {
49417+ do {
49418+ neighbor = jlookup(tree, oid, scan_index);
49419+ if (neighbor == NULL)
49420+ goto stop_same_parent;
49421+
49422+ if (scan->node != neighbor
49423+ && !scan_goto(scan, neighbor)) {
49424+ /* @neighbor was jput() by scan_goto(). */
49425+ goto stop_same_parent;
49426+ }
49427+
49428+ ret = scan_set_current(scan, neighbor, 1, &coord);
49429+ if (ret != 0) {
49430+ goto exit;
49431+ }
49432+
49433+ /* reference to @neighbor is stored in @scan, no need
49434+ to jput(). */
49435+ scan_index += incr;
49436+
49437+ } while (incr + scan_max != scan_index);
49438+
49439+ } else {
49440+ /* Optimized case for unallocated extents, skip to the end. */
49441+ neighbor = jlookup(tree, oid, scan_max /*index */ );
49442+ if (neighbor == NULL) {
49443+ /* Race with truncate */
49444+ scan->stop = 1;
49445+ ret = 0;
49446+ goto exit;
49447+ }
49448+
49449+ assert("zam-1043", blocknr_is_fake(jnode_get_block(neighbor)));
49450+
49451+ ret = scan_set_current(scan, neighbor, scan_dist, &coord);
49452+ if (ret != 0) {
49453+ goto exit;
49454+ }
49455+ }
49456+
49457+ if (coord_sideof_unit(&coord, scan->direction) == 0
49458+ && item_is_extent(&coord)) {
49459+ /* Continue as long as there are more extent units. */
49460+
49461+ scan_index =
49462+ extent_unit_index(&coord) +
49463+ (scanning_left(scan) ? extent_unit_width(&coord) - 1 : 0);
49464+ goto repeat;
49465+ }
49466+
49467+ if (0) {
49468+ stop_same_parent:
49469+
49470+ /* If we are scanning left and we stop in the middle of an allocated
49471+ extent, we know the preceder immediately. */
49472+ /* middle of extent is (scan_index - unit_index) != 0. */
49473+ if (scanning_left(scan) && (scan_index - unit_index) != 0) {
49474+ /* FIXME(B): Someone should step-through and verify that this preceder
49475+ calculation is indeed correct. */
49476+ /* @unit_start is starting block (number) of extent
49477+ unit. Flush stopped at the @scan_index block from
49478+ the beginning of the file, which is (scan_index -
49479+ unit_index) block within extent.
49480+ */
49481+ if (unit_start) {
49482+ /* skip preceder update when we are at hole */
49483+ scan->preceder_blk =
49484+ unit_start + scan_index - unit_index;
49485+ check_preceder(scan->preceder_blk);
49486+ }
49487+ }
49488+
49489+ /* In this case, we leave coord set to the parent of scan->node. */
49490+ scan->stop = 1;
49491+
49492+ } else {
49493+ /* In this case, we are still scanning, coord is set to the next item which is
49494+ either off-the-end of the node or not an extent. */
49495+ assert("jmacd-8912", scan->stop == 0);
49496+ assert("jmacd-7812",
49497+ (coord_is_after_sideof_unit(&coord, scan->direction)
49498+ || !item_is_extent(&coord)));
49499+ }
49500+
49501+ ret = 0;
49502+ exit:
49503+ return ret;
49504+}
49505+
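
An aside on the bounds computed in scan_extent() above: the direction-dependent triple (scan_max, scan_dist, incr) is pure index arithmetic over the unit geometry. The following is a minimal standalone sketch in plain C (simplified types and an invented helper name, not code from the patch) that exercises both directions, with the rightward scan_dist measured from scan_index as the comment above describes:

#include <assert.h>

/* Simplified model: a unit covers block indices
 * [unit_index, unit_index + unit_width - 1] and the scan currently
 * stands at scan_index inside that range. */
struct scan_bounds {
	unsigned long scan_max;		/* last index the scan may reach */
	unsigned long scan_dist;	/* blocks passed going all the way */
	int incr;			/* step applied to scan_index */
};

static struct scan_bounds bounds(int left, unsigned long unit_index,
				 unsigned long unit_width,
				 unsigned long scan_index)
{
	struct scan_bounds b;

	if (left) {
		b.scan_max = unit_index;
		b.scan_dist = scan_index - unit_index;
		b.incr = -1;
	} else {
		b.scan_max = unit_index + unit_width - 1;
		b.scan_dist = b.scan_max - scan_index;
		b.incr = +1;
	}
	return b;
}

int main(void)
{
	/* unit of width 8 at index 100, scan position 103 */
	struct scan_bounds l = bounds(1, 100, 8, 103);
	struct scan_bounds r = bounds(0, 100, 8, 103);

	assert(l.scan_max == 100 && l.scan_dist == 3 && l.incr == -1);
	assert(r.scan_max == 107 && r.scan_dist == 4 && r.incr == 1);
	return 0;
}
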
49506+/* ask block allocator for some blocks */
49507+static void extent_allocate_blocks(reiser4_blocknr_hint *preceder,
49508+ reiser4_block_nr wanted_count,
49509+ reiser4_block_nr *first_allocated,
49510+ reiser4_block_nr *allocated,
49511+ block_stage_t block_stage)
49512+{
49513+ *allocated = wanted_count;
49514+ preceder->max_dist = 0; /* scan whole disk, if needed */
49515+
49516+ /* that number of blocks (wanted_count) is either in UNALLOCATED or in GRABBED */
49517+ preceder->block_stage = block_stage;
49518+
49519+ /* FIXME: we do not handle errors here now */
49520+ check_me("vs-420",
49521+ reiser4_alloc_blocks(preceder, first_allocated, allocated,
49522+ BA_PERMANENT) == 0);
49523+ /* update flush_pos's preceder to last allocated block number */
49524+ preceder->blk = *first_allocated + *allocated - 1;
49525+}
49526+
49527+/* when, at flush time, an unallocated extent is to be replaced with an allocated one, a single unallocated extent may
49528+ have to be replaced with a set of allocated extents. In this case insert_into_item will be called, which may have
49529+ to add new nodes to the tree. Space for that is taken from the inviolable reserve (5%). */
49530+static reiser4_block_nr reserve_replace(void)
49531+{
49532+ reiser4_block_nr grabbed, needed;
49533+
49534+ grabbed = get_current_context()->grabbed_blocks;
49535+ needed = estimate_one_insert_into_item(current_tree);
49536+ check_me("vpf-340", !reiser4_grab_space_force(needed, BA_RESERVED));
49537+ return grabbed;
49538+}
49539+
49540+static void free_replace_reserved(reiser4_block_nr grabbed)
49541+{
49542+ reiser4_context *ctx;
49543+
49544+ ctx = get_current_context();
49545+ grabbed2free(ctx, get_super_private(ctx->super),
49546+ ctx->grabbed_blocks - grabbed);
49547+}
49548+
49549+/* Block offset of first block addressed by unit */
49550+__u64 extent_unit_index(const coord_t * item)
49551+{
49552+ reiser4_key key;
49553+
49554+ assert("vs-648", coord_is_existing_unit(item));
49555+ unit_key_by_coord(item, &key);
49556+ return get_key_offset(&key) >> current_blocksize_bits;
49557+}
49558+
49559+/* AUDIT: shouldn't the return value be of reiser4_block_nr type?
49560+ Josh's answer: who knows? Is a "number of blocks" the same type as "block offset"? */
49561+__u64 extent_unit_width(const coord_t * item)
49562+{
49563+ assert("vs-649", coord_is_existing_unit(item));
49564+ return width_by_coord(item);
49565+}
49566+
49567+/* Starting block location of this unit */
49568+static reiser4_block_nr extent_unit_start(const coord_t * item)
49569+{
49570+ return extent_get_start(extent_by_coord(item));
49571+}
49572+
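
The three accessors above reduce to shift arithmetic between byte offsets in the key and block indices. A tiny standalone illustration (plain C; a 4096-byte block size, i.e. a 12-bit shift, is assumed for the example):

#include <assert.h>

typedef unsigned long long u64;

/* With a block size of 4096 (12 bits), a unit whose key offset is
 * 0x19000 addresses file block 0x19 = 25. */
static u64 block_index_from_offset(u64 key_offset, unsigned blocksize_bits)
{
	return key_offset >> blocksize_bits;
}

int main(void)
{
	assert(block_index_from_offset(0x19000, 12) == 0x19);
	/* a unit of width 3 then covers file blocks 25, 26 and 27 */
	return 0;
}
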
49573+/**
49574+ * split_allocated_extent -
49575+ * @coord:
49576+ * @pos_in_unit:
49577+ *
49578+ * replace allocated extent with two allocated extents
49579+ */
49580+static int split_allocated_extent(coord_t *coord, reiser4_block_nr pos_in_unit)
49581+{
49582+ int result;
49583+ struct replace_handle *h;
49584+ reiser4_extent *ext;
49585+ reiser4_block_nr grabbed;
49586+
49587+ ext = extent_by_coord(coord);
49588+ assert("vs-1410", state_of_extent(ext) == ALLOCATED_EXTENT);
49589+ assert("vs-1411", extent_get_width(ext) > pos_in_unit);
49590+
49591+ h = kmalloc(sizeof(*h), get_gfp_mask());
49592+ if (h == NULL)
49593+ return RETERR(-ENOMEM);
49594+ h->coord = coord;
49595+ h->lh = znode_lh(coord->node);
49596+ h->pkey = &h->key;
49597+ unit_key_by_coord(coord, h->pkey);
49598+ set_key_offset(h->pkey,
49599+ (get_key_offset(h->pkey) +
49600+ pos_in_unit * current_blocksize));
49601+ set_extent(&h->overwrite, extent_get_start(ext), pos_in_unit);
49602+ set_extent(&h->new_extents[0], extent_get_start(ext) + pos_in_unit,
49603+ extent_get_width(ext) - pos_in_unit);
49604+ h->nr_new_extents = 1;
49605+ h->flags = COPI_DONT_SHIFT_LEFT;
49606+ h->paste_key = h->key;
49607+
49608+ /* reserve space for extent unit paste, @grabbed is reserved before */
49609+ grabbed = reserve_replace();
49610+ result = replace_extent(h, 0 /* leave @coord set to overwritten
49611+ extent */);
49612+ /* restore reserved */
49613+ free_replace_reserved(grabbed);
49614+ kfree(h);
49615+ return result;
49616+}
49617+
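
The split performed by split_allocated_extent() is plain interval arithmetic on (start, width) pairs. A minimal standalone sketch (simplified types and illustrative values, not the kernel structures) of the two extents that replace_extent() is asked to produce:

#include <assert.h>

typedef unsigned long long u64;

struct ext { u64 start, width; };

/* Split an allocated extent at @pos blocks from its beginning:
 * the overwrite part keeps the old start, the tail part begins
 * @pos blocks further on disk. */
static void split_at(const struct ext *e, u64 pos,
		     struct ext *overwrite, struct ext *tail)
{
	overwrite->start = e->start;
	overwrite->width = pos;
	tail->start = e->start + pos;
	tail->width = e->width - pos;
}

int main(void)
{
	struct ext e = { 2000, 10 }, head, tail;

	split_at(&e, 4, &head, &tail);
	assert(head.start == 2000 && head.width == 4);
	assert(tail.start == 2004 && tail.width == 6);
	return 0;
}
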
49618+/* replace extent @ext by extent @replace. Try to merge @replace with the previous extent of the item (if there is
49619+ one). Return 1 if that succeeded, 0 otherwise */
49620+static int try_to_merge_with_left(coord_t *coord, reiser4_extent *ext,
49621+ reiser4_extent *replace)
49622+{
49623+ assert("vs-1415", extent_by_coord(coord) == ext);
49624+
49625+ if (coord->unit_pos == 0
49626+ || state_of_extent(ext - 1) != ALLOCATED_EXTENT)
49627+		/* previous extent either does not exist or is not an allocated extent */
49628+ return 0;
49629+ if (extent_get_start(ext - 1) + extent_get_width(ext - 1) !=
49630+ extent_get_start(replace))
49631+ return 0;
49632+
49633+ /* we can glue, widen previous unit */
49634+ extent_set_width(ext - 1,
49635+ extent_get_width(ext - 1) + extent_get_width(replace));
49636+
49637+ if (extent_get_width(ext) != extent_get_width(replace)) {
49638+ /* make current extent narrower */
49639+ if (state_of_extent(ext) == ALLOCATED_EXTENT)
49640+ extent_set_start(ext,
49641+ extent_get_start(ext) +
49642+ extent_get_width(replace));
49643+ extent_set_width(ext,
49644+ extent_get_width(ext) -
49645+ extent_get_width(replace));
49646+ } else {
49647+ /* current extent completely glued with its left neighbor, remove it */
49648+ coord_t from, to;
49649+
49650+ coord_dup(&from, coord);
49651+ from.unit_pos = nr_units_extent(coord) - 1;
49652+ coord_dup(&to, &from);
49653+
49654+ /* currently cut from extent can cut either from the beginning or from the end. Move place which got
49655+ freed after unit removal to end of item */
49656+ memmove(ext, ext + 1,
49657+ (from.unit_pos -
49658+ coord->unit_pos) * sizeof(reiser4_extent));
49659+ /* wipe part of item which is going to be cut, so that node_check will not be confused */
49660+ cut_node_content(&from, &to, NULL, NULL, NULL);
49661+ }
49662+ znode_make_dirty(coord->node);
49663+ /* move coord back */
49664+ coord->unit_pos--;
49665+ return 1;
49666+}
49667+
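
The gluing condition used by try_to_merge_with_left() is a simple disk-adjacency test on the previous unit. The same predicate in isolation (simplified types, illustrative values only):

#include <assert.h>

typedef unsigned long long u64;

struct ext { u64 start, width; };

/* Two allocated extents can be merged into one iff the second
 * begins exactly where the first ends on disk. */
static int glueable(const struct ext *left, const struct ext *right)
{
	return left->start + left->width == right->start;
}

int main(void)
{
	struct ext a = { 1000, 16 };
	struct ext b = { 1016, 4 };
	struct ext c = { 1020, 4 };

	assert(glueable(&a, &b));	/* blocks 1000..1015, then 1016.. */
	assert(!glueable(&a, &c));	/* gap between 1016 and 1020 */
	return 0;
}
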
49668+/**
49669+ * conv_extent - replace one extent with two
49670+ * @coord: coordinate of extent to be replaced
49671+ * @replace: extent to overwrite the one @coord is set to
49672+ *
49673+ * Overwrites the extent @coord is set to and pastes one extent unit after
49674+ * the overwritten one if @replace is shorter than the initial extent
49675+ */
49676+static int conv_extent(coord_t *coord, reiser4_extent *replace)
49677+{
49678+ int result;
49679+ struct replace_handle *h;
49680+ reiser4_extent *ext;
49681+ reiser4_block_nr start, width, new_width;
49682+ reiser4_block_nr grabbed;
49683+ extent_state state;
49684+
49685+ ext = extent_by_coord(coord);
49686+ state = state_of_extent(ext);
49687+ start = extent_get_start(ext);
49688+ width = extent_get_width(ext);
49689+ new_width = extent_get_width(replace);
49690+
49691+ assert("vs-1458", (state == UNALLOCATED_EXTENT ||
49692+ state == ALLOCATED_EXTENT));
49693+ assert("vs-1459", width >= new_width);
49694+
49695+ if (try_to_merge_with_left(coord, ext, replace)) {
49696+ /* merged @replace with left neighbor. Current unit is either
49697+ removed or narrowed */
49698+ return 0;
49699+ }
49700+
49701+ if (width == new_width) {
49702+ /* replace current extent with @replace */
49703+ *ext = *replace;
49704+ znode_make_dirty(coord->node);
49705+ return 0;
49706+ }
49707+
49708+ h = kmalloc(sizeof(*h), get_gfp_mask());
49709+ if (h == NULL)
49710+ return RETERR(-ENOMEM);
49711+ h->coord = coord;
49712+ h->lh = znode_lh(coord->node);
49713+ h->pkey = &h->key;
49714+ unit_key_by_coord(coord, h->pkey);
49715+ set_key_offset(h->pkey,
49716+ (get_key_offset(h->pkey) + new_width * current_blocksize));
49717+ h->overwrite = *replace;
49718+
49719+ /* replace @ext with @replace and padding extent */
49720+ set_extent(&h->new_extents[0],
49721+ (state == ALLOCATED_EXTENT) ? (start + new_width) : UNALLOCATED_EXTENT_START,
49722+ width - new_width);
49723+ h->nr_new_extents = 1;
49724+ h->flags = COPI_DONT_SHIFT_LEFT;
49725+ h->paste_key = h->key;
49726+
49727+ /* reserve space for extent unit paste, @grabbed is reserved before */
49728+ grabbed = reserve_replace();
49729+ result = replace_extent(h, 0 /* leave @coord set to overwritten
49730+ extent */);
49731+
49732+ /* restore reserved */
49733+ free_replace_reserved(grabbed);
49734+ kfree(h);
49735+ return result;
49736+}
49737+
49738+/**
49739+ * assign_real_blocknrs
49740+ * @flush_pos:
49741+ * @oid: objectid of the file the jnodes belong to
49742+ * @index: first jnode on the range
49743+ * @count: number of jnodes to assign block numbers to
49744+ * @first: start of allocated block range
49745+ *
49746+ * Assigns block numbers to each of @count jnodes. The index of the first
49747+ * jnode is @index. Jnodes are looked up with jlookup.
49748+ */
49749+static void assign_real_blocknrs(flush_pos_t *flush_pos, oid_t oid,
49750+ unsigned long index, reiser4_block_nr count,
49751+ reiser4_block_nr first)
49752+{
49753+ unsigned long i;
49754+ reiser4_tree *tree;
49755+ txn_atom *atom;
49756+ int nr;
49757+
49758+ atom = atom_locked_by_fq(flush_pos->fq);
49759+ assert("vs-1468", atom);
49760+ BUG_ON(atom == NULL);
49761+
49762+ nr = 0;
49763+ tree = current_tree;
49764+ for (i = 0; i < count; ++i, ++index) {
49765+ jnode *node;
49766+
49767+ node = jlookup(tree, oid, index);
49768+ assert("", node != NULL);
49769+ BUG_ON(node == NULL);
49770+
49771+ spin_lock_jnode(node);
49772+ assert("", !jnode_is_flushprepped(node));
49773+ assert("vs-1475", node->atom == atom);
49774+ assert("vs-1476", atomic_read(&node->x_count) > 0);
49775+
49776+ JF_CLR(node, JNODE_FLUSH_RESERVED);
49777+ jnode_set_block(node, &first);
49778+ unformatted_make_reloc(node, flush_pos->fq);
49779+ ON_DEBUG(count_jnode(node->atom, node, NODE_LIST(node),
49780+ FQ_LIST, 0));
49781+ spin_unlock_jnode(node);
49782+ first++;
49783+
49784+ atomic_dec(&node->x_count);
49785+ nr ++;
49786+ }
49787+
49788+ spin_unlock_atom(atom);
49789+ return;
49790+}
49791+
49792+/**
49793+ * make_node_ovrwr - assign node to overwrite set
49794+ * @jnodes: overwrite set list head
49795+ * @node: jnode to be added to the overwrite set
49796+ *
49797+ * Sets the OVRWR jnode state bit and puts @node at the end of the list head
49798+ * @jnodes, which accumulates nodes before they get to the overwrite set list
49799+ * of the atom.
49800+ */
49801+static void make_node_ovrwr(struct list_head *jnodes, jnode *node)
49802+{
49803+ spin_lock_jnode(node);
49804+
49805+ assert("zam-917", !JF_ISSET(node, JNODE_RELOC));
49806+ assert("zam-918", !JF_ISSET(node, JNODE_OVRWR));
49807+
49808+ JF_SET(node, JNODE_OVRWR);
49809+ list_move_tail(&node->capture_link, jnodes);
49810+ ON_DEBUG(count_jnode(node->atom, node, DIRTY_LIST, OVRWR_LIST, 0));
49811+
49812+ spin_unlock_jnode(node);
49813+}
49814+
49815+/**
49816+ * mark_jnodes_overwrite - put bunch of jnodes to overwrite set
49817+ * @flush_pos: flush position
49818+ * @oid: objectid of file jnodes belong to
49819+ * @index: starting index
49820+ * @width: extent width
49821+ *
49822+ * Puts the nodes of one extent (file objectid @oid, extent width @width) into
49823+ * the atom's overwrite set, starting from the one with index @index. If the
49824+ * end of the slum is detected (a node is not found or is flushprepped), stop
49825+ * iterating and set the flush position's state to POS_INVALID.
49826+ */
49827+static void mark_jnodes_overwrite(flush_pos_t *flush_pos, oid_t oid,
49828+ unsigned long index, reiser4_block_nr width)
49829+{
49830+ unsigned long i;
49831+ reiser4_tree *tree;
49832+ jnode *node;
49833+ txn_atom *atom;
49834+ LIST_HEAD(jnodes);
49835+
49836+ tree = current_tree;
49837+
49838+ atom = atom_locked_by_fq(pos_fq(flush_pos));
49839+ assert("vs-1478", atom);
49840+
49841+ for (i = flush_pos->pos_in_unit; i < width; i++, index++) {
49842+ node = jlookup(tree, oid, index);
49843+ if (!node) {
49844+ flush_pos->state = POS_INVALID;
49845+ break;
49846+ }
49847+ if (jnode_check_flushprepped(node)) {
49848+ flush_pos->state = POS_INVALID;
49849+ atomic_dec(&node->x_count);
49850+ break;
49851+ }
49852+ if (node->atom != atom) {
49853+ flush_pos->state = POS_INVALID;
49854+ atomic_dec(&node->x_count);
49855+ break;
49856+ }
49857+ make_node_ovrwr(&jnodes, node);
49858+ atomic_dec(&node->x_count);
49859+ }
49860+
49861+ list_splice_init(&jnodes, ATOM_OVRWR_LIST(atom)->prev);
49862+ spin_unlock_atom(atom);
49863+}
49864+
49865+/**
49866+ * allocated_extent_slum_size
49867+ * @flush_pos: flush position
49868+ * @oid: objectid of the file the jnodes belong to
49869+ * @index: index of the first jnode to check
49870+ * @count: maximal number of jnodes to check
49871+ *
49872+ * Returns the number of consecutive not-flushprepped jnodes starting at @index.
49873+ */
49874+static int allocated_extent_slum_size(flush_pos_t *flush_pos, oid_t oid,
49875+ unsigned long index, unsigned long count)
49876+{
49877+ unsigned long i;
49878+ reiser4_tree *tree;
49879+ txn_atom *atom;
49880+ int nr;
49881+
49882+ atom = atom_locked_by_fq(pos_fq(flush_pos));
49883+ assert("vs-1468", atom);
49884+
49885+ nr = 0;
49886+ tree = current_tree;
49887+ for (i = 0; i < count; ++i, ++index) {
49888+ jnode *node;
49889+
49890+ node = jlookup(tree, oid, index);
49891+ if (!node)
49892+ break;
49893+
49894+ if (jnode_check_flushprepped(node)) {
49895+ atomic_dec(&node->x_count);
49896+ break;
49897+ }
49898+
49899+ if (node->atom != atom) {
49900+ /*
49901+ * this is possible on overwrite: extent_write may
49902+ * capture several unformatted nodes without capturing
49903+ * any formatted nodes.
49904+ */
49905+ atomic_dec(&node->x_count);
49906+ break;
49907+ }
49908+
49909+ assert("vs-1476", atomic_read(&node->x_count) > 1);
49910+ atomic_dec(&node->x_count);
49911+ nr ++;
49912+ }
49913+
49914+ spin_unlock_atom(atom);
49915+ return nr;
49916+}
49917+
49918+/**
49919+ * alloc_extent
49920+ * @flush_pos: flush position
49921+ *
49922+ * This is called by handle_pos_on_twig to process the extent unit
49923+ * flush_pos->coord is set to. It prepares a sequence of not-flushprepped
49924+ * nodes (a slum) for flushing, assuming that the slum starts at position
49925+ * flush_pos->pos_in_unit within the extent. The slum goes to the relocate
49926+ * set if flush_pos->leaf_relocate is set to 1, and to the overwrite set
49927+ * otherwise.
49928+ */
49929+int alloc_extent(flush_pos_t *flush_pos)
49930+{
49931+ coord_t *coord;
49932+ reiser4_extent *ext;
49933+ reiser4_extent replace_ext;
49934+ oid_t oid;
49935+ reiser4_block_nr protected;
49936+ reiser4_block_nr start;
49937+ __u64 index;
49938+ __u64 width;
49939+ extent_state state;
49940+ int result;
49941+ reiser4_block_nr first_allocated;
49942+ __u64 allocated;
49943+ reiser4_key key;
49944+ block_stage_t block_stage;
49945+
49946+ assert("vs-1468", flush_pos->state == POS_ON_EPOINT);
49947+ assert("vs-1469", coord_is_existing_unit(&flush_pos->coord)
49948+ && item_is_extent(&flush_pos->coord));
49949+
49950+ coord = &flush_pos->coord;
49951+
49952+ ext = extent_by_coord(coord);
49953+ state = state_of_extent(ext);
49954+ if (state == HOLE_EXTENT) {
49955+ flush_pos->state = POS_INVALID;
49956+ return 0;
49957+ }
49958+
49959+ item_key_by_coord(coord, &key);
49960+ oid = get_key_objectid(&key);
49961+ index = extent_unit_index(coord) + flush_pos->pos_in_unit;
49962+ start = extent_get_start(ext);
49963+ width = extent_get_width(ext);
49964+
49965+ assert("vs-1457", width > flush_pos->pos_in_unit);
49966+
49967+ if (flush_pos->leaf_relocate || state == UNALLOCATED_EXTENT) {
49968+ /* relocate */
49969+ if (flush_pos->pos_in_unit) {
49970+ /* split extent unit into two */
49971+ result =
49972+ split_allocated_extent(coord,
49973+ flush_pos->pos_in_unit);
49974+ flush_pos->pos_in_unit = 0;
49975+ return result;
49976+ }
49977+
49978+ /* limit number of nodes to allocate */
49979+ if (flush_pos->nr_to_write < width)
49980+ width = flush_pos->nr_to_write;
49981+
49982+ if (state == ALLOCATED_EXTENT) {
49983+ /*
49984+ * all protected nodes are not flushprepped, therefore
49985+ * they are counted as flush_reserved
49986+ */
49987+ block_stage = BLOCK_FLUSH_RESERVED;
49988+ protected = allocated_extent_slum_size(flush_pos, oid,
49989+ index, width);
49990+ if (protected == 0) {
49991+ flush_pos->state = POS_INVALID;
49992+ flush_pos->pos_in_unit = 0;
49993+ return 0;
49994+ }
49995+ } else {
49996+ block_stage = BLOCK_UNALLOCATED;
49997+ protected = width;
49998+ }
49999+
50000+ /*
50001+ * look at previous unit if possible. If it is allocated, make
50002+ * preceder more precise
50003+ */
50004+ if (coord->unit_pos &&
50005+ (state_of_extent(ext - 1) == ALLOCATED_EXTENT))
50006+ pos_hint(flush_pos)->blk = extent_get_start(ext - 1) +
50007+ extent_get_width(ext - 1);
50008+
50009+ /* allocate new block numbers for protected nodes */
50010+ extent_allocate_blocks(pos_hint(flush_pos), protected,
50011+ &first_allocated, &allocated,
50012+ block_stage);
50013+
50014+ if (state == ALLOCATED_EXTENT)
50015+ /*
50016+ * on relocating - free nodes which are going to be
50017+ * relocated
50018+ */
50019+ reiser4_dealloc_blocks(&start, &allocated,
50020+ BLOCK_ALLOCATED, BA_DEFER);
50021+
50022+ /* assign new block numbers to protected nodes */
50023+ assign_real_blocknrs(flush_pos, oid, index, allocated, first_allocated);
50024+
50025+
50026+ /* prepare extent which will replace current one */
50027+ set_extent(&replace_ext, first_allocated, allocated);
50028+
50029+ /* adjust extent item */
50030+ result = conv_extent(coord, &replace_ext);
50031+ if (result != 0 && result != -ENOMEM) {
50032+ warning("vs-1461",
50033+ "Failed to allocate extent. Should not happen\n");
50034+ return result;
50035+ }
50036+
50037+ /*
50038+ * break flush: we prepared for flushing as many blocks as we
50039+ * were asked for
50040+ */
50041+ if (flush_pos->nr_to_write == allocated)
50042+ flush_pos->state = POS_INVALID;
50043+ } else {
50044+ /* overwrite */
50045+ mark_jnodes_overwrite(flush_pos, oid, index, width);
50046+ }
50047+ flush_pos->pos_in_unit = 0;
50048+ return 0;
50049+}
50050+
50051+/* return 0 if @key is glueable to the item @coord is set to, 1 if a new item must be inserted */
50052+static int must_insert(const coord_t *coord, const reiser4_key *key)
50053+{
50054+ reiser4_key last;
50055+
50056+ if (item_id_by_coord(coord) == EXTENT_POINTER_ID
50057+ && keyeq(append_key_extent(coord, &last), key))
50058+ return 0;
50059+ return 1;
50060+}
50061+
50062+/* copy extent @copy_ext to the end of @node. It may have to insert a new item after the last one, append the last
50063+ item, or modify the last unit of the last item to have greater width */
50064+static int put_unit_to_end(znode *node, const reiser4_key *key,
50065+ reiser4_extent *copy_ext)
50066+{
50067+ int result;
50068+ coord_t coord;
50069+ cop_insert_flag flags;
50070+ reiser4_extent *last_ext;
50071+ reiser4_item_data data;
50072+
50073+ /* set coord after last unit in an item */
50074+ coord_init_last_unit(&coord, node);
50075+ coord.between = AFTER_UNIT;
50076+
50077+ flags =
50078+ COPI_DONT_SHIFT_LEFT | COPI_DONT_SHIFT_RIGHT | COPI_DONT_ALLOCATE;
50079+ if (must_insert(&coord, key)) {
50080+ result =
50081+ insert_by_coord(&coord, init_new_extent(&data, copy_ext, 1),
50082+ key, NULL /*lh */ , flags);
50083+
50084+ } else {
50085+ /* try to glue with last unit */
50086+ last_ext = extent_by_coord(&coord);
50087+ if (state_of_extent(last_ext) &&
50088+ extent_get_start(last_ext) + extent_get_width(last_ext) ==
50089+ extent_get_start(copy_ext)) {
50090+ /* widen last unit of node */
50091+ extent_set_width(last_ext,
50092+ extent_get_width(last_ext) +
50093+ extent_get_width(copy_ext));
50094+ znode_make_dirty(node);
50095+ return 0;
50096+ }
50097+
50098+ /* FIXME: put an assertion here that we can not merge last unit in @node and new unit */
50099+ result =
50100+ insert_into_item(&coord, NULL /*lh */ , key,
50101+ init_new_extent(&data, copy_ext, 1),
50102+ flags);
50103+ }
50104+
50105+ assert("vs-438", result == 0 || result == -E_NODE_FULL);
50106+ return result;
50107+}
50108+
50109+/* @coord is set to extent unit */
50110+squeeze_result squalloc_extent(znode *left, const coord_t *coord,
50111+ flush_pos_t *flush_pos,
50112+ reiser4_key *stop_key)
50113+{
50114+ reiser4_extent *ext;
50115+ __u64 index;
50116+ __u64 width;
50117+ reiser4_block_nr start;
50118+ extent_state state;
50119+ oid_t oid;
50120+ reiser4_block_nr first_allocated;
50121+ __u64 allocated;
50122+ __u64 protected;
50123+ reiser4_extent copy_extent;
50124+ reiser4_key key;
50125+ int result;
50126+ block_stage_t block_stage;
50127+
50128+ assert("vs-1457", flush_pos->pos_in_unit == 0);
50129+ assert("vs-1467", coord_is_leftmost_unit(coord));
50130+ assert("vs-1467", item_is_extent(coord));
50131+
50132+ ext = extent_by_coord(coord);
50133+ index = extent_unit_index(coord);
50134+ start = extent_get_start(ext);
50135+ width = extent_get_width(ext);
50136+ state = state_of_extent(ext);
50137+ unit_key_by_coord(coord, &key);
50138+ oid = get_key_objectid(&key);
50139+
50140+ if ((flush_pos->leaf_relocate && state == ALLOCATED_EXTENT) ||
50141+ (state == UNALLOCATED_EXTENT)) {
50142+ /* relocate */
50143+ if (state == ALLOCATED_EXTENT) {
50144+ /* all protected nodes are not flushprepped, therefore
50145+ * they are counted as flush_reserved */
50146+ block_stage = BLOCK_FLUSH_RESERVED;
50147+ protected = allocated_extent_slum_size(flush_pos, oid,
50148+ index, width);
50149+ if (protected == 0) {
50150+ flush_pos->state = POS_INVALID;
50151+ flush_pos->pos_in_unit = 0;
50152+ return 0;
50153+ }
50154+ } else {
50155+ block_stage = BLOCK_UNALLOCATED;
50156+ protected = width;
50157+ }
50158+
50159+ /*
50160+ * look at previous unit if possible. If it is allocated, make
50161+ * preceder more precise
50162+ */
50163+ if (coord->unit_pos &&
50164+ (state_of_extent(ext - 1) == ALLOCATED_EXTENT))
50165+ pos_hint(flush_pos)->blk = extent_get_start(ext - 1) +
50166+ extent_get_width(ext - 1);
50167+
50168+ /* allocate new block numbers for protected nodes */
50169+ extent_allocate_blocks(pos_hint(flush_pos), protected,
50170+ &first_allocated, &allocated,
50171+ block_stage);
50172+
50173+ /* prepare extent which will be copied to left */
50174+ set_extent(&copy_extent, first_allocated, allocated);
50175+
50176+ result = put_unit_to_end(left, &key, &copy_extent);
50177+ if (result == -E_NODE_FULL) {
50178+ int target_block_stage;
50179+
50180+ /* free blocks which were just allocated */
50181+ target_block_stage =
50182+ (state ==
50183+ ALLOCATED_EXTENT) ? BLOCK_FLUSH_RESERVED :
50184+ BLOCK_UNALLOCATED;
50185+ reiser4_dealloc_blocks(&first_allocated, &allocated,
50186+ target_block_stage,
50187+ BA_PERMANENT);
50188+
50189+ /* rewind the preceder. */
50190+ flush_pos->preceder.blk = first_allocated;
50191+ check_preceder(flush_pos->preceder.blk);
50192+
50193+ return SQUEEZE_TARGET_FULL;
50194+ }
50195+
50196+ if (state == ALLOCATED_EXTENT) {
50197+ /* free nodes which were relocated */
50198+ reiser4_dealloc_blocks(&start, &allocated,
50199+ BLOCK_ALLOCATED, BA_DEFER);
50200+ }
50201+
50202+ /* assign new block numbers to protected nodes */
50203+ assign_real_blocknrs(flush_pos, oid, index, allocated,
50204+ first_allocated);
50205+
50206+ set_key_offset(&key,
50207+ get_key_offset(&key) +
50208+ (allocated << current_blocksize_bits));
50209+ } else {
50210+ /*
50211+ * overwrite: try to copy unit as it is to left neighbor and
50212+ * make all first not flushprepped nodes overwrite nodes
50213+ */
50214+ set_extent(&copy_extent, start, width);
50215+ result = put_unit_to_end(left, &key, &copy_extent);
50216+ if (result == -E_NODE_FULL)
50217+ return SQUEEZE_TARGET_FULL;
50218+
50219+ if (state != HOLE_EXTENT)
50220+ mark_jnodes_overwrite(flush_pos, oid, index, width);
50221+ set_key_offset(&key,
50222+ get_key_offset(&key) +
50223+ (width << current_blocksize_bits));
50224+ }
50225+ *stop_key = key;
50226+ return SQUEEZE_CONTINUE;
50227+}
50228+
50229+int key_by_offset_extent(struct inode *inode, loff_t off, reiser4_key * key)
50230+{
50231+ return key_by_inode_and_offset_common(inode, off, key);
50232+}
50233+
50234+/*
50235+ * Local variables:
50236+ * c-indentation-style: "K&R"
50237+ * mode-name: "LC"
50238+ * c-basic-offset: 8
50239+ * tab-width: 8
50240+ * fill-column: 79
50241+ * scroll-step: 1
50242+ * End:
50243+ */
50244Index: linux-2.6.16/fs/reiser4/plugin/item/extent_item_ops.c
50245===================================================================
50246--- /dev/null
50247+++ linux-2.6.16/fs/reiser4/plugin/item/extent_item_ops.c
50248@@ -0,0 +1,882 @@
50249+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
50250+
50251+#include "item.h"
50252+#include "../../inode.h"
50253+#include "../../tree_walk.h" /* check_sibling_list() */
50254+#include "../../page_cache.h"
50255+#include "../../carry.h"
50256+
50257+#include <linux/quotaops.h>
50258+
50259+/* item_plugin->b.max_key_inside */
50260+reiser4_key *max_key_inside_extent(const coord_t * coord, reiser4_key * key)
50261+{
50262+ item_key_by_coord(coord, key);
50263+ set_key_offset(key, get_key_offset(max_key()));
50264+ return key;
50265+}
50266+
50267+/* item_plugin->b.can_contain_key
50268+ this checks whether @key of @data is matching to position set by @coord */
50269+int
50270+can_contain_key_extent(const coord_t * coord, const reiser4_key * key,
50271+ const reiser4_item_data * data)
50272+{
50273+ reiser4_key item_key;
50274+
50275+ if (item_plugin_by_coord(coord) != data->iplug)
50276+ return 0;
50277+
50278+ item_key_by_coord(coord, &item_key);
50279+ if (get_key_locality(key) != get_key_locality(&item_key) ||
50280+ get_key_objectid(key) != get_key_objectid(&item_key) ||
50281+ get_key_ordering(key) != get_key_ordering(&item_key))
50282+ return 0;
50283+
50284+ return 1;
50285+}
50286+
50287+/* item_plugin->b.mergeable
50288+ first item is of extent type */
50289+/* Audited by: green(2002.06.13) */
50290+int mergeable_extent(const coord_t * p1, const coord_t * p2)
50291+{
50292+ reiser4_key key1, key2;
50293+
50294+ assert("vs-299", item_id_by_coord(p1) == EXTENT_POINTER_ID);
50295+ /* FIXME-VS: Which is it? Assert or return 0 */
50296+ if (item_id_by_coord(p2) != EXTENT_POINTER_ID) {
50297+ return 0;
50298+ }
50299+
50300+ item_key_by_coord(p1, &key1);
50301+ item_key_by_coord(p2, &key2);
50302+ if (get_key_locality(&key1) != get_key_locality(&key2) ||
50303+ get_key_objectid(&key1) != get_key_objectid(&key2) ||
50304+ get_key_ordering(&key1) != get_key_ordering(&key2) ||
50305+ get_key_type(&key1) != get_key_type(&key2))
50306+ return 0;
50307+ if (get_key_offset(&key1) + extent_size(p1, nr_units_extent(p1)) !=
50308+ get_key_offset(&key2))
50309+ return 0;
50310+ return 1;
50311+}
50312+
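
Besides the locality/objectid/ordering/type comparisons, mergeable_extent() boils down to contiguity in key space: the second item must begin at the byte offset where the first one ends. A minimal sketch of that final check (plain C, illustrative values; 4 KiB blocks assumed):

#include <assert.h>

typedef unsigned long long u64;

/* item1 covers bytes [off1, off1 + size1); it can be merged with
 * item2 iff item2's key offset is exactly off1 + size1. */
static int items_mergeable(u64 off1, u64 size1, u64 off2)
{
	return off1 + size1 == off2;
}

int main(void)
{
	/* an item of 8 blocks of 4 KiB starting at byte 0 */
	assert(items_mergeable(0, 8 << 12, 8 << 12));
	assert(!items_mergeable(0, 8 << 12, 9 << 12));
	return 0;
}
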
50313+/* item_plugin->b.nr_units */
50314+pos_in_node_t nr_units_extent(const coord_t * coord)
50315+{
50316+ /* length of extent item has to be multiple of extent size */
50317+ assert("vs-1424",
50318+ (item_length_by_coord(coord) % sizeof(reiser4_extent)) == 0);
50319+ return item_length_by_coord(coord) / sizeof(reiser4_extent);
50320+}
50321+
50322+/* item_plugin->b.lookup */
50323+lookup_result
50324+lookup_extent(const reiser4_key * key, lookup_bias bias UNUSED_ARG,
50325+ coord_t * coord)
50326+{ /* znode and item_pos are
50327+ set to an extent item to
50328+ look through */
50329+ reiser4_key item_key;
50330+ reiser4_block_nr lookuped, offset;
50331+ unsigned i, nr_units;
50332+ reiser4_extent *ext;
50333+ unsigned blocksize;
50334+ unsigned char blocksize_bits;
50335+
50336+ item_key_by_coord(coord, &item_key);
50337+ offset = get_key_offset(&item_key);
50338+
50339+ /* key we are looking for must be greater than key of item @coord */
50340+ assert("vs-414", keygt(key, &item_key));
50341+
50342+ assert("umka-99945",
50343+ !keygt(key, max_key_inside_extent(coord, &item_key)));
50344+
50345+ ext = extent_item(coord);
50346+ assert("vs-1350", (char *)ext == (zdata(coord->node) + coord->offset));
50347+
50348+ blocksize = current_blocksize;
50349+ blocksize_bits = current_blocksize_bits;
50350+
50351+ /* offset we are looking for */
50352+ lookuped = get_key_offset(key);
50353+
50354+ nr_units = nr_units_extent(coord);
50355+	/* go through all extents until the one which addresses the given offset */
50356+ for (i = 0; i < nr_units; i++, ext++) {
50357+ offset += (extent_get_width(ext) << blocksize_bits);
50358+ if (offset > lookuped) {
50359+ /* desired byte is somewhere in this extent */
50360+ coord->unit_pos = i;
50361+ coord->between = AT_UNIT;
50362+ return CBK_COORD_FOUND;
50363+ }
50364+ }
50365+
50366+ /* set coord after last unit */
50367+ coord->unit_pos = nr_units - 1;
50368+ coord->between = AFTER_UNIT;
50369+ return CBK_COORD_FOUND;
50370+}
50371+
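
lookup_extent() above is a linear walk that accumulates unit widths until the running byte offset passes the looked-up offset. A standalone model of the same walk (simplified types; widths in 4 KiB blocks; the helper find_unit is invented for the example):

#include <assert.h>

typedef unsigned long long u64;

/* Return the index of the unit whose byte range contains @lookuped,
 * or nr_units if it lies past the item (the caller then positions
 * itself after the last unit, as lookup_extent() does). */
static unsigned find_unit(u64 item_offset, const u64 *widths,
			  unsigned nr_units, unsigned blocksize_bits,
			  u64 lookuped)
{
	u64 offset = item_offset;
	unsigned i;

	for (i = 0; i < nr_units; i++) {
		offset += widths[i] << blocksize_bits;
		if (offset > lookuped)
			return i;
	}
	return nr_units;
}

int main(void)
{
	u64 widths[] = { 2, 5, 1 };	/* 8 blocks => 32 KiB in total */

	assert(find_unit(0, widths, 3, 12, 0x1fff) == 0);	/* in unit 0 */
	assert(find_unit(0, widths, 3, 12, 0x2000) == 1);	/* first byte of unit 1 */
	assert(find_unit(0, widths, 3, 12, 0x8000) == 3);	/* past the item */
	return 0;
}
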
50372+/* item_plugin->b.paste
50373+ the item @coord is set to has been appended with @data->length bytes of free
50374+ space. data->data contains data to be pasted into the item in position
50375+ @coord->in_item.unit_pos. It must fit into that free space.
50376+ @coord must be set between units.
50377+*/
50378+int
50379+paste_extent(coord_t * coord, reiser4_item_data * data,
50380+ carry_plugin_info * info UNUSED_ARG)
50381+{
50382+ unsigned old_nr_units;
50383+ reiser4_extent *ext;
50384+ int item_length;
50385+
50386+ ext = extent_item(coord);
50387+ item_length = item_length_by_coord(coord);
50388+ old_nr_units = (item_length - data->length) / sizeof(reiser4_extent);
50389+
50390+ /* this is also used to copy extent into newly created item, so
50391+ old_nr_units could be 0 */
50392+ assert("vs-260", item_length >= data->length);
50393+
50394+ /* make sure that coord is set properly */
50395+ assert("vs-35",
50396+ ((!coord_is_existing_unit(coord))
50397+ || (!old_nr_units && !coord->unit_pos)));
50398+
50399+ /* first unit to be moved */
50400+ switch (coord->between) {
50401+ case AFTER_UNIT:
50402+ coord->unit_pos++;
50403+ case BEFORE_UNIT:
50404+ coord->between = AT_UNIT;
50405+ break;
50406+ case AT_UNIT:
50407+ assert("vs-331", !old_nr_units && !coord->unit_pos);
50408+ break;
50409+ default:
50410+ impossible("vs-330", "coord is set improperly");
50411+ }
50412+
50413+ /* prepare space for new units */
50414+ memmove(ext + coord->unit_pos + data->length / sizeof(reiser4_extent),
50415+ ext + coord->unit_pos,
50416+ (old_nr_units - coord->unit_pos) * sizeof(reiser4_extent));
50417+
50418+ /* copy new data from kernel space */
50419+ assert("vs-556", data->user == 0);
50420+ memcpy(ext + coord->unit_pos, data->data, (unsigned)data->length);
50421+
50422+ /* after paste @coord is set to first of pasted units */
50423+ assert("vs-332", coord_is_existing_unit(coord));
50424+ assert("vs-333",
50425+ !memcmp(data->data, extent_by_coord(coord),
50426+ (unsigned)data->length));
50427+ return 0;
50428+}
50429+
50430+/* item_plugin->b.can_shift */
50431+int
50432+can_shift_extent(unsigned free_space, coord_t * source,
50433+ znode * target UNUSED_ARG, shift_direction pend UNUSED_ARG,
50434+ unsigned *size, unsigned want)
50435+{
50436+ *size = item_length_by_coord(source);
50437+ if (*size > free_space)
50438+ /* never split a unit of extent item */
50439+ *size = free_space - free_space % sizeof(reiser4_extent);
50440+
50441+	/* we can shift *size bytes, calculate how many we want to shift */
50442+ if (*size > want * sizeof(reiser4_extent))
50443+ *size = want * sizeof(reiser4_extent);
50444+
50445+ if (*size % sizeof(reiser4_extent) != 0)
50446+ impossible("vs-119", "Wrong extent size: %i %zd", *size,
50447+ sizeof(reiser4_extent));
50448+ return *size / sizeof(reiser4_extent);
50449+
50450+}
50451+
50452+/* item_plugin->b.copy_units */
50453+void
50454+copy_units_extent(coord_t * target, coord_t * source,
50455+ unsigned from, unsigned count,
50456+ shift_direction where_is_free_space, unsigned free_space)
50457+{
50458+ char *from_ext, *to_ext;
50459+
50460+ assert("vs-217", free_space == count * sizeof(reiser4_extent));
50461+
50462+ from_ext = item_body_by_coord(source);
50463+ to_ext = item_body_by_coord(target);
50464+
50465+ if (where_is_free_space == SHIFT_LEFT) {
50466+ assert("vs-215", from == 0);
50467+
50468+ /* At this moment, item length was already updated in the item
50469+ header by shifting code, hence nr_units_extent() will
50470+ return "new" number of units---one we obtain after copying
50471+ units.
50472+ */
50473+ to_ext +=
50474+ (nr_units_extent(target) - count) * sizeof(reiser4_extent);
50475+ } else {
50476+ reiser4_key key;
50477+ coord_t coord;
50478+
50479+ assert("vs-216",
50480+ from + count == coord_last_unit_pos(source) + 1);
50481+
50482+ from_ext += item_length_by_coord(source) - free_space;
50483+
50484+ /* new units are inserted before first unit in an item,
50485+ therefore, we have to update item key */
50486+ coord = *source;
50487+ coord.unit_pos = from;
50488+ unit_key_extent(&coord, &key);
50489+
50490+ node_plugin_by_node(target->node)->update_item_key(target, &key,
50491+ NULL /*info */);
50492+ }
50493+
50494+ memcpy(to_ext, from_ext, free_space);
50495+}
50496+
50497+/* item_plugin->b.create_hook
50498+ @arg is znode of leaf node for which we need to update right delimiting key */
50499+int create_hook_extent(const coord_t * coord, void *arg)
50500+{
50501+ coord_t *child_coord;
50502+ znode *node;
50503+ reiser4_key key;
50504+ reiser4_tree *tree;
50505+
50506+ if (!arg)
50507+ return 0;
50508+
50509+ child_coord = arg;
50510+ tree = znode_get_tree(coord->node);
50511+
50512+ assert("nikita-3246", znode_get_level(child_coord->node) == LEAF_LEVEL);
50513+
50514+ write_lock_tree(tree);
50515+ write_lock_dk(tree);
50516+	/* find a node on the leaf level for which the right delimiting key
50517+	   has to be updated */
50518+ if (coord_wrt(child_coord) == COORD_ON_THE_LEFT) {
50519+ assert("vs-411", znode_is_left_connected(child_coord->node));
50520+ node = child_coord->node->left;
50521+ } else {
50522+ assert("vs-412", coord_wrt(child_coord) == COORD_ON_THE_RIGHT);
50523+ node = child_coord->node;
50524+ assert("nikita-3314", node != NULL);
50525+ }
50526+
50527+ if (node != NULL) {
50528+ znode_set_rd_key(node, item_key_by_coord(coord, &key));
50529+
50530+ assert("nikita-3282", check_sibling_list(node));
50531+ /* break sibling links */
50532+ if (ZF_ISSET(node, JNODE_RIGHT_CONNECTED) && node->right) {
50533+ ON_DEBUG(node->right->left_version =
50534+ atomic_inc_return(&delim_key_version);
50535+ node->right_version =
50536+ atomic_inc_return(&delim_key_version););
50537+
50538+ node->right->left = NULL;
50539+ node->right = NULL;
50540+ }
50541+ }
50542+ write_unlock_dk(tree);
50543+ write_unlock_tree(tree);
50544+ return 0;
50545+}
50546+
50547+#define ITEM_TAIL_KILLED 0
50548+#define ITEM_HEAD_KILLED 1
50549+#define ITEM_KILLED 2
50550+
50551+/* item_plugin->b.kill_hook
50552+ this is called when @count units starting from the @from-th one are going to be removed
50553+ */
50554+int
50555+kill_hook_extent(const coord_t * coord, pos_in_node_t from, pos_in_node_t count,
50556+ struct carry_kill_data *kdata)
50557+{
50558+ reiser4_extent *ext;
50559+ reiser4_block_nr start, length;
50560+ const reiser4_key *pfrom_key, *pto_key;
50561+ struct inode *inode;
50562+ reiser4_tree *tree;
50563+ pgoff_t from_off, to_off, offset, skip;
50564+ int retval;
50565+
50566+ /* these are located in memory kmalloc-ed by kill_node_content */
50567+ reiser4_key *min_item_key, *max_item_key, *from_key, *to_key, *key;
50568+ coord_t *dup, *next;
50569+
50570+ assert("zam-811", znode_is_write_locked(coord->node));
50571+ assert("nikita-3315", kdata != NULL);
50572+ assert("vs-34", kdata->buf != NULL);
50573+
50574+ /* map structures to kdata->buf */
50575+ min_item_key = (reiser4_key *) (kdata->buf);
50576+ max_item_key = min_item_key + 1;
50577+ from_key = max_item_key + 1;
50578+ to_key = from_key + 1;
50579+ key = to_key + 1;
50580+ dup = (coord_t *) (key + 1);
50581+ next = dup + 1;
50582+
50583+ item_key_by_coord(coord, min_item_key);
50584+ max_item_key_by_coord(coord, max_item_key);
50585+
50586+ if (kdata->params.from_key) {
50587+ pfrom_key = kdata->params.from_key;
50588+ pto_key = kdata->params.to_key;
50589+ } else {
50590+ assert("vs-1549", from == coord->unit_pos);
50591+ unit_key_by_coord(coord, from_key);
50592+ pfrom_key = from_key;
50593+
50594+ coord_dup(dup, coord);
50595+ dup->unit_pos = from + count - 1;
50596+ max_unit_key_by_coord(dup, to_key);
50597+ pto_key = to_key;
50598+ }
50599+
50600+ if (!keylt(pto_key, max_item_key)) {
50601+ if (!keygt(pfrom_key, min_item_key)) {
50602+ znode *left, *right;
50603+
50604+ /* item is to be removed completely */
50605+ assert("nikita-3316", kdata->left != NULL
50606+ && kdata->right != NULL);
50607+
50608+ left = kdata->left->node;
50609+ right = kdata->right->node;
50610+
50611+ tree = current_tree;
50612+ /* we have to do two things:
50613+ *
50614+ * 1. link left and right formatted neighbors of
50615+ * extent being removed, and
50616+ *
50617+ * 2. update their delimiting keys.
50618+ *
50619+ * atomicity of these operations is protected by
50620+ * taking dk-lock and tree-lock.
50621+ */
50622+ /* if neighbors of item being removed are znodes -
50623+ * link them */
50624+ write_lock_tree(tree);
50625+ write_lock_dk(tree);
50626+ link_left_and_right(left, right);
50627+ if (left) {
50628+ /* update right delimiting key of left
50629+ * neighbor of extent item */
50630+ /*coord_t next;
50631+ reiser4_key key; */
50632+
50633+ coord_dup(next, coord);
50634+
50635+ if (coord_next_item(next))
50636+ *key = *znode_get_rd_key(coord->node);
50637+ else
50638+ item_key_by_coord(next, key);
50639+ znode_set_rd_key(left, key);
50640+ }
50641+ write_unlock_dk(tree);
50642+ write_unlock_tree(tree);
50643+
50644+ from_off =
50645+ get_key_offset(min_item_key) >> PAGE_CACHE_SHIFT;
50646+ to_off =
50647+ (get_key_offset(max_item_key) +
50648+ 1) >> PAGE_CACHE_SHIFT;
50649+ retval = ITEM_KILLED;
50650+ } else {
50651+ /* tail of item is to be removed */
50652+ from_off =
50653+ (get_key_offset(pfrom_key) + PAGE_CACHE_SIZE -
50654+ 1) >> PAGE_CACHE_SHIFT;
50655+ to_off =
50656+ (get_key_offset(max_item_key) +
50657+ 1) >> PAGE_CACHE_SHIFT;
50658+ retval = ITEM_TAIL_KILLED;
50659+ }
50660+ } else {
50661+ /* head of item is to be removed */
50662+ assert("vs-1571", keyeq(pfrom_key, min_item_key));
50663+ assert("vs-1572",
50664+ (get_key_offset(pfrom_key) & (PAGE_CACHE_SIZE - 1)) ==
50665+ 0);
50666+ assert("vs-1573",
50667+ ((get_key_offset(pto_key) + 1) & (PAGE_CACHE_SIZE -
50668+ 1)) == 0);
50669+
50670+ if (kdata->left->node) {
50671+ /* update right delimiting key of left neighbor of extent item */
50672+ /*reiser4_key key; */
50673+
50674+ *key = *pto_key;
50675+ set_key_offset(key, get_key_offset(pto_key) + 1);
50676+
50677+ write_lock_dk(current_tree);
50678+ znode_set_rd_key(kdata->left->node, key);
50679+ write_unlock_dk(current_tree);
50680+ }
50681+
50682+ from_off = get_key_offset(pfrom_key) >> PAGE_CACHE_SHIFT;
50683+ to_off = (get_key_offset(pto_key) + 1) >> PAGE_CACHE_SHIFT;
50684+ retval = ITEM_HEAD_KILLED;
50685+ }
50686+
50687+ inode = kdata->inode;
50688+ assert("vs-1545", inode != NULL);
50689+ if (inode != NULL)
50690+ /* take care of pages and jnodes corresponding to part of item being killed */
50691+ reiser4_invalidate_pages(inode->i_mapping, from_off,
50692+ to_off - from_off,
50693+ kdata->params.truncate);
50694+
50695+ ext = extent_item(coord) + from;
50696+ offset =
50697+ (get_key_offset(min_item_key) +
50698+ extent_size(coord, from)) >> PAGE_CACHE_SHIFT;
50699+
50700+ assert("vs-1551", from_off >= offset);
50701+ assert("vs-1552", from_off - offset <= extent_get_width(ext));
50702+ skip = from_off - offset;
50703+ offset = from_off;
50704+
50705+ while (offset < to_off) {
50706+ length = extent_get_width(ext) - skip;
50707+ if (state_of_extent(ext) == HOLE_EXTENT) {
50708+ skip = 0;
50709+ offset += length;
50710+ ext++;
50711+ continue;
50712+ }
50713+
50714+ if (offset + length > to_off) {
50715+ length = to_off - offset;
50716+ }
50717+
50718+ DQUOT_FREE_BLOCK_NODIRTY(inode, length);
50719+
50720+ if (state_of_extent(ext) == UNALLOCATED_EXTENT) {
50721+ /* some jnodes corresponding to this unallocated extent */
50722+ fake_allocated2free(length, 0 /* unformatted */ );
50723+
50724+ skip = 0;
50725+ offset += length;
50726+ ext++;
50727+ continue;
50728+ }
50729+
50730+ assert("vs-1218", state_of_extent(ext) == ALLOCATED_EXTENT);
50731+
50732+ if (length != 0) {
50733+ start = extent_get_start(ext) + skip;
50734+
50735+ /* BA_DEFER bit parameter is turned on because blocks which get freed are not safe to be freed
50736+ immediately */
50737+ reiser4_dealloc_blocks(&start, &length,
50738+ 0 /* not used */ ,
50739+ BA_DEFER
50740+ /* unformatted with defer */ );
50741+ }
50742+ skip = 0;
50743+ offset += length;
50744+ ext++;
50745+ }
50746+ return retval;
50747+}
50748+
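
The from_off/to_off computations in kill_hook_extent() follow one rounding convention: the start of the killed byte range is rounded up to a whole page and the inclusive end is turned into an exclusive page bound by adding 1. A minimal sketch of that arithmetic (plain C; a 12-bit page shift is assumed):

#include <assert.h>

typedef unsigned long long u64;

#define SHIFT 12
#define SIZE  (1ULL << SHIFT)

/* Pages fully covered by the inclusive byte range [from, to]:
 * round the start up, convert the inclusive end to an exclusive
 * bound, exactly as the tail-kill case above does. */
static void killed_pages(u64 from, u64 to, u64 *first, u64 *end)
{
	*first = (from + SIZE - 1) >> SHIFT;
	*end = (to + 1) >> SHIFT;
}

int main(void)
{
	u64 first, end;

	/* bytes 0x1800..0x4fff: only pages 2, 3 and 4 are fully killed */
	killed_pages(0x1800, 0x4fff, &first, &end);
	assert(first == 2 && end == 5);
	return 0;
}
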
50749+/* item_plugin->b.kill_units */
50750+int
50751+kill_units_extent(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
50752+ struct carry_kill_data *kdata, reiser4_key * smallest_removed,
50753+ reiser4_key * new_first)
50754+{
50755+ reiser4_extent *ext;
50756+ reiser4_key item_key;
50757+ pos_in_node_t count;
50758+ reiser4_key from_key, to_key;
50759+ const reiser4_key *pfrom_key, *pto_key;
50760+ loff_t off;
50761+ int result;
50762+
50763+ assert("vs-1541",
50764+ ((kdata->params.from_key == NULL && kdata->params.to_key == NULL)
50765+ || (kdata->params.from_key != NULL
50766+ && kdata->params.to_key != NULL)));
50767+
50768+ if (kdata->params.from_key) {
50769+ pfrom_key = kdata->params.from_key;
50770+ pto_key = kdata->params.to_key;
50771+ } else {
50772+ coord_t dup;
50773+
50774+ /* calculate key range of kill */
50775+ assert("vs-1549", from == coord->unit_pos);
50776+ unit_key_by_coord(coord, &from_key);
50777+ pfrom_key = &from_key;
50778+
50779+ coord_dup(&dup, coord);
50780+ dup.unit_pos = to;
50781+ max_unit_key_by_coord(&dup, &to_key);
50782+ pto_key = &to_key;
50783+ }
50784+
50785+ item_key_by_coord(coord, &item_key);
50786+
50787+#if REISER4_DEBUG
50788+ {
50789+ reiser4_key max_item_key;
50790+
50791+ max_item_key_by_coord(coord, &max_item_key);
50792+
50793+ if (new_first) {
50794+ /* head of item is to be cut */
50795+ assert("vs-1542", keyeq(pfrom_key, &item_key));
50796+ assert("vs-1538", keylt(pto_key, &max_item_key));
50797+ } else {
50798+ /* tail of item is to be cut */
50799+ assert("vs-1540", keygt(pfrom_key, &item_key));
50800+ assert("vs-1543", !keylt(pto_key, &max_item_key));
50801+ }
50802+ }
50803+#endif
50804+
50805+ if (smallest_removed)
50806+ *smallest_removed = *pfrom_key;
50807+
50808+ if (new_first) {
50809+ /* item head is cut. Item key will change. This new key is calculated here */
50810+ assert("vs-1556",
50811+ (get_key_offset(pto_key) & (PAGE_CACHE_SIZE - 1)) ==
50812+ (PAGE_CACHE_SIZE - 1));
50813+ *new_first = *pto_key;
50814+ set_key_offset(new_first, get_key_offset(new_first) + 1);
50815+ }
50816+
50817+ count = to - from + 1;
50818+ result = kill_hook_extent(coord, from, count, kdata);
50819+ if (result == ITEM_TAIL_KILLED) {
50820+ assert("vs-1553",
50821+ get_key_offset(pfrom_key) >=
50822+ get_key_offset(&item_key) + extent_size(coord, from));
50823+ off =
50824+ get_key_offset(pfrom_key) - (get_key_offset(&item_key) +
50825+ extent_size(coord, from));
50826+ if (off) {
50827+ /* unit @from is to be cut partially. Its width decreases */
50828+ ext = extent_item(coord) + from;
50829+ extent_set_width(ext,
50830+ (off + PAGE_CACHE_SIZE -
50831+ 1) >> PAGE_CACHE_SHIFT);
50832+ count--;
50833+ }
50834+ } else {
50835+ __u64 max_to_offset;
50836+ __u64 rest;
50837+
50838+ assert("vs-1575", result == ITEM_HEAD_KILLED);
50839+ assert("", from == 0);
50840+ assert("",
50841+ ((get_key_offset(pto_key) + 1) & (PAGE_CACHE_SIZE -
50842+ 1)) == 0);
50843+ assert("",
50844+ get_key_offset(pto_key) + 1 >
50845+ get_key_offset(&item_key) + extent_size(coord, to));
50846+ max_to_offset =
50847+ get_key_offset(&item_key) + extent_size(coord, to + 1) - 1;
50848+ assert("", get_key_offset(pto_key) <= max_to_offset);
50849+
50850+ rest =
50851+ (max_to_offset -
50852+ get_key_offset(pto_key)) >> PAGE_CACHE_SHIFT;
50853+ if (rest) {
50854+ /* unit @to is to be cut partially */
50855+ ext = extent_item(coord) + to;
50856+
50857+ assert("", extent_get_width(ext) > rest);
50858+
50859+ if (state_of_extent(ext) == ALLOCATED_EXTENT)
50860+ extent_set_start(ext,
50861+ extent_get_start(ext) +
50862+ (extent_get_width(ext) -
50863+ rest));
50864+
50865+ extent_set_width(ext, rest);
50866+ count--;
50867+ }
50868+ }
50869+ return count * sizeof(reiser4_extent);
50870+}
50871+
50872+/* item_plugin->b.cut_units
50873+ this is too similar to kill_units_extent */
50874+int
50875+cut_units_extent(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
50876+ struct carry_cut_data *cdata, reiser4_key * smallest_removed,
50877+ reiser4_key * new_first)
50878+{
50879+ reiser4_extent *ext;
50880+ reiser4_key item_key;
50881+ pos_in_node_t count;
50882+ reiser4_key from_key, to_key;
50883+ const reiser4_key *pfrom_key, *pto_key;
50884+ loff_t off;
50885+
50886+ assert("vs-1541",
50887+ ((cdata->params.from_key == NULL && cdata->params.to_key == NULL)
50888+ || (cdata->params.from_key != NULL
50889+ && cdata->params.to_key != NULL)));
50890+
50891+ if (cdata->params.from_key) {
50892+ pfrom_key = cdata->params.from_key;
50893+ pto_key = cdata->params.to_key;
50894+ } else {
50895+ coord_t dup;
50896+
50897+ /* calculate key range of kill */
50898+ coord_dup(&dup, coord);
50899+ dup.unit_pos = from;
50900+ unit_key_by_coord(&dup, &from_key);
50901+
50902+ dup.unit_pos = to;
50903+ max_unit_key_by_coord(&dup, &to_key);
50904+
50905+ pfrom_key = &from_key;
50906+ pto_key = &to_key;
50907+ }
50908+
50909+ assert("vs-1555",
50910+ (get_key_offset(pfrom_key) & (PAGE_CACHE_SIZE - 1)) == 0);
50911+ assert("vs-1556",
50912+ (get_key_offset(pto_key) & (PAGE_CACHE_SIZE - 1)) ==
50913+ (PAGE_CACHE_SIZE - 1));
50914+
50915+ item_key_by_coord(coord, &item_key);
50916+
50917+#if REISER4_DEBUG
50918+ {
50919+ reiser4_key max_item_key;
50920+
50921+ assert("vs-1584",
50922+ get_key_locality(pfrom_key) ==
50923+ get_key_locality(&item_key));
50924+ assert("vs-1585",
50925+ get_key_type(pfrom_key) == get_key_type(&item_key));
50926+ assert("vs-1586",
50927+ get_key_objectid(pfrom_key) ==
50928+ get_key_objectid(&item_key));
50929+ assert("vs-1587",
50930+ get_key_ordering(pfrom_key) ==
50931+ get_key_ordering(&item_key));
50932+
50933+ max_item_key_by_coord(coord, &max_item_key);
50934+
50935+ if (new_first != NULL) {
50936+ /* head of item is to be cut */
50937+ assert("vs-1542", keyeq(pfrom_key, &item_key));
50938+ assert("vs-1538", keylt(pto_key, &max_item_key));
50939+ } else {
50940+ /* tail of item is to be cut */
50941+ assert("vs-1540", keygt(pfrom_key, &item_key));
50942+ assert("vs-1543", keyeq(pto_key, &max_item_key));
50943+ }
50944+ }
50945+#endif
50946+
50947+ if (smallest_removed)
50948+ *smallest_removed = *pfrom_key;
50949+
50950+ if (new_first) {
50951+ /* item head is cut. Item key will change. This new key is calculated here */
50952+ *new_first = *pto_key;
50953+ set_key_offset(new_first, get_key_offset(new_first) + 1);
50954+ }
50955+
50956+ count = to - from + 1;
50957+
50958+ assert("vs-1553",
50959+ get_key_offset(pfrom_key) >=
50960+ get_key_offset(&item_key) + extent_size(coord, from));
50961+ off =
50962+ get_key_offset(pfrom_key) - (get_key_offset(&item_key) +
50963+ extent_size(coord, from));
50964+ if (off) {
50965+ /* tail of unit @from is to be cut partially. Its width decreases */
50966+ assert("vs-1582", new_first == NULL);
50967+ ext = extent_item(coord) + from;
50968+ extent_set_width(ext, off >> PAGE_CACHE_SHIFT);
50969+ count--;
50970+ }
50971+
50972+ assert("vs-1554",
50973+ get_key_offset(pto_key) <=
50974+ get_key_offset(&item_key) + extent_size(coord, to + 1) - 1);
50975+ off =
50976+ (get_key_offset(&item_key) + extent_size(coord, to + 1) - 1) -
50977+ get_key_offset(pto_key);
50978+ if (off) {
50979+ /* @to_key is smaller than max key of unit @to. Unit @to will not be removed. It gets start increased
50980+ and width decreased. */
50981+ assert("vs-1583", (off & (PAGE_CACHE_SIZE - 1)) == 0);
50982+ ext = extent_item(coord) + to;
50983+ if (state_of_extent(ext) == ALLOCATED_EXTENT)
50984+ extent_set_start(ext,
50985+ extent_get_start(ext) +
50986+ (extent_get_width(ext) -
50987+ (off >> PAGE_CACHE_SHIFT)));
50988+
50989+ extent_set_width(ext, (off >> PAGE_CACHE_SHIFT));
50990+ count--;
50991+ }
50992+ return count * sizeof(reiser4_extent);
50993+}
50994+
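
The partial-cut branch for unit @to in cut_units_extent() keeps only the trailing pages of the unit: the start of an allocated unit moves forward and the width shrinks by the same amount. A standalone sketch of that adjustment (simplified types, illustrative values):

#include <assert.h>

typedef unsigned long long u64;

#define SHIFT 12

struct ext { u64 start, width; };	/* width in blocks (= pages here) */

/* Keep only the last @rest_bytes of @e: the start moves forward and
 * the width shrinks, as cut_units_extent() does for unit @to. */
static void keep_tail(struct ext *e, u64 rest_bytes)
{
	u64 rest = rest_bytes >> SHIFT;

	e->start += e->width - rest;
	e->width = rest;
}

int main(void)
{
	struct ext e = { 500, 8 };

	keep_tail(&e, 3 << SHIFT);	/* keep the last 3 pages */
	assert(e.start == 505 && e.width == 3);
	return 0;
}
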
50995+/* item_plugin->b.unit_key */
50996+reiser4_key *unit_key_extent(const coord_t * coord, reiser4_key * key)
50997+{
50998+ assert("vs-300", coord_is_existing_unit(coord));
50999+
51000+ item_key_by_coord(coord, key);
51001+ set_key_offset(key,
51002+ (get_key_offset(key) +
51003+ extent_size(coord, coord->unit_pos)));
51004+
51005+ return key;
51006+}
51007+
51008+/* item_plugin->b.max_unit_key */
51009+reiser4_key *max_unit_key_extent(const coord_t * coord, reiser4_key * key)
51010+{
51011+ assert("vs-300", coord_is_existing_unit(coord));
51012+
51013+ item_key_by_coord(coord, key);
51014+ set_key_offset(key,
51015+ (get_key_offset(key) +
51016+ extent_size(coord, coord->unit_pos + 1) - 1));
51017+ return key;
51018+}
51019+
51020+/* item_plugin->b.estimate
51021+ item_plugin->b.item_data_by_flow */
51022+
51023+#if REISER4_DEBUG
51024+
51025+/* item_plugin->b.check
51026+ used for debugging, every item should have here the most complete
51027+ possible check of the consistency of the item that the inventor can
51028+ construct
51029+*/
51030+int check_extent(const coord_t * coord /* coord of item to check */ ,
51031+ const char **error /* where to store error message */ )
51032+{
51033+ reiser4_extent *ext, *first;
51034+ unsigned i, j;
51035+ reiser4_block_nr start, width, blk_cnt;
51036+ unsigned num_units;
51037+ reiser4_tree *tree;
51038+ oid_t oid;
51039+ reiser4_key key;
51040+ coord_t scan;
51041+
51042+ assert("vs-933", REISER4_DEBUG);
51043+
51044+ if (znode_get_level(coord->node) != TWIG_LEVEL) {
51045+ *error = "Extent on the wrong level";
51046+ return -1;
51047+ }
51048+ if (item_length_by_coord(coord) % sizeof(reiser4_extent) != 0) {
51049+ *error = "Wrong item size";
51050+ return -1;
51051+ }
51052+ ext = first = extent_item(coord);
51053+ blk_cnt = reiser4_block_count(reiser4_get_current_sb());
51054+ num_units = coord_num_units(coord);
51055+ tree = znode_get_tree(coord->node);
51056+ item_key_by_coord(coord, &key);
51057+ oid = get_key_objectid(&key);
51058+ coord_dup(&scan, coord);
51059+
51060+ for (i = 0; i < num_units; ++i, ++ext) {
51061+ __u64 index;
51062+
51063+ scan.unit_pos = i;
51064+ index = extent_unit_index(&scan);
51065+
51066+#if 0
51067+ /* check that all jnodes are present for the unallocated
51068+ * extent */
51069+ if (state_of_extent(ext) == UNALLOCATED_EXTENT) {
51070+ for (j = 0; j < extent_get_width(ext); j++) {
51071+ jnode *node;
51072+
51073+ node = jlookup(tree, oid, index + j);
51074+ if (node == NULL) {
51075+ print_coord("scan", &scan, 0);
51076+ *error = "Jnode missing";
51077+ return -1;
51078+ }
51079+ jput(node);
51080+ }
51081+ }
51082+#endif
51083+
51084+ start = extent_get_start(ext);
51085+ if (start < 2)
51086+ continue;
51087+ /* extent is allocated one */
51088+ width = extent_get_width(ext);
51089+ if (start >= blk_cnt) {
51090+ *error = "Start too large";
51091+ return -1;
51092+ }
51093+ if (start + width > blk_cnt) {
51094+ *error = "End too large";
51095+ return -1;
51096+ }
51097+ /* make sure that this extent does not overlap with other
51098+		   allocated extents */
51099+ for (j = 0; j < i; j++) {
51100+ if (state_of_extent(first + j) != ALLOCATED_EXTENT)
51101+ continue;
51102+ if (!
51103+ ((extent_get_start(ext) >=
51104+ extent_get_start(first + j) +
51105+ extent_get_width(first + j))
51106+ || (extent_get_start(ext) +
51107+ extent_get_width(ext) <=
51108+ extent_get_start(first + j)))) {
51109+ *error = "Extent overlaps with others";
51110+ return -1;
51111+ }
51112+ }
51113+
51114+ }
51115+
51116+ return 0;
51117+}
51118+
51119+#endif /* REISER4_DEBUG */
51120+
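
The overlap check in check_extent() is the usual interval-disjointness test written in negated form. The same predicate in isolation (simplified types; not code from the patch):

#include <assert.h>

typedef unsigned long long u64;

/* Extents [s1, s1+w1) and [s2, s2+w2) are disjoint iff one ends at
 * or before the point where the other begins; overlap is the
 * negation of that. */
static int overlaps(u64 s1, u64 w1, u64 s2, u64 w2)
{
	return !(s1 >= s2 + w2 || s1 + w1 <= s2);
}

int main(void)
{
	assert(overlaps(10, 5, 12, 5));		/* 10..14 vs 12..16 */
	assert(!overlaps(10, 5, 15, 5));	/* 10..14 vs 15..19 */
	return 0;
}
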
51121+/*
51122+ Local variables:
51123+ c-indentation-style: "K&R"
51124+ mode-name: "LC"
51125+ c-basic-offset: 8
51126+ tab-width: 8
51127+ fill-column: 120
51128+ scroll-step: 1
51129+ End:
51130+*/
51131Index: linux-2.6.16/fs/reiser4/plugin/item/internal.c
51132===================================================================
51133--- /dev/null
51134+++ linux-2.6.16/fs/reiser4/plugin/item/internal.c
51135@@ -0,0 +1,392 @@
51136+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
51137+
51138+/* Implementation of internal-item plugin methods. */
51139+
51140+#include "../../forward.h"
51141+#include "../../debug.h"
51142+#include "../../dformat.h"
51143+#include "../../key.h"
51144+#include "../../coord.h"
51145+#include "internal.h"
51146+#include "item.h"
51147+#include "../node/node.h"
51148+#include "../plugin.h"
51149+#include "../../jnode.h"
51150+#include "../../znode.h"
51151+#include "../../tree_walk.h"
51152+#include "../../tree_mod.h"
51153+#include "../../tree.h"
51154+#include "../../super.h"
51155+#include "../../block_alloc.h"
51156+
51157+/* see internal.h for explanation */
51158+
51159+/* plugin->u.item.b.mergeable */
51160+int mergeable_internal(const coord_t * p1 UNUSED_ARG /* first item */ ,
51161+ const coord_t * p2 UNUSED_ARG /* second item */ )
51162+{
51163+ /* internal items are not mergeable */
51164+ return 0;
51165+}
51166+
51167+/* ->lookup() method for internal items */
51168+lookup_result lookup_internal(const reiser4_key * key /* key to look up */ ,
51169+ lookup_bias bias UNUSED_ARG /* lookup bias */ ,
51170+ coord_t * coord /* coord of item */ )
51171+{
51172+ reiser4_key ukey;
51173+
51174+ switch (keycmp(unit_key_by_coord(coord, &ukey), key)) {
51175+ default:
51176+ impossible("", "keycmp()?!");
51177+ case LESS_THAN:
51178+ /* FIXME-VS: AFTER_ITEM used to be here. But with new coord
51179+ item plugin can not be taken using coord set this way */
51180+ assert("vs-681", coord->unit_pos == 0);
51181+ coord->between = AFTER_UNIT;
51182+ case EQUAL_TO:
51183+ return CBK_COORD_FOUND;
51184+ case GREATER_THAN:
51185+ return CBK_COORD_NOTFOUND;
51186+ }
51187+}
51188+
51189+/* return body of internal item at @coord */
51190+static internal_item_layout *internal_at(const coord_t * coord /* coord of
51191+ * item */ )
51192+{
51193+ assert("nikita-607", coord != NULL);
51194+ assert("nikita-1650",
51195+ item_plugin_by_coord(coord) ==
51196+ item_plugin_by_id(NODE_POINTER_ID));
51197+ return (internal_item_layout *) item_body_by_coord(coord);
51198+}
51199+
51200+void update_internal(const coord_t * coord, const reiser4_block_nr * blocknr)
51201+{
51202+ internal_item_layout *item = internal_at(coord);
51203+ assert("nikita-2959", reiser4_blocknr_is_sane(blocknr));
51204+
51205+ put_unaligned(cpu_to_le64(*blocknr), &item->pointer);
51206+}
51207+
51208+/* return child block number stored in the internal item at @coord */
51209+static reiser4_block_nr pointer_at(const coord_t * coord /* coord of item */ )
51210+{
51211+ assert("nikita-608", coord != NULL);
51212+ return le64_to_cpu(get_unaligned(&internal_at(coord)->pointer));
51213+}
51214+
51215+/* get znode pointed to by internal @item */
51216+static znode *znode_at(const coord_t * item /* coord of item */ ,
51217+ znode * parent /* parent node */ )
51218+{
51219+ return child_znode(item, parent, 1, 0);
51220+}
51221+
51222+/* store pointer from internal item into "block". Implementation of
51223+ ->down_link() method */
51224+void down_link_internal(const coord_t * coord /* coord of item */ ,
51225+ const reiser4_key * key UNUSED_ARG /* key to get
51226+ * pointer for */ ,
51227+ reiser4_block_nr * block /* resulting block number */ )
51228+{
51229+ ON_DEBUG(reiser4_key item_key);
51230+
51231+ assert("nikita-609", coord != NULL);
51232+ assert("nikita-611", block != NULL);
51233+ assert("nikita-612", (key == NULL) ||
51234+ /* twig horrors */
51235+ (znode_get_level(coord->node) == TWIG_LEVEL)
51236+ || keyle(item_key_by_coord(coord, &item_key), key));
51237+
51238+ *block = pointer_at(coord);
51239+ assert("nikita-2960", reiser4_blocknr_is_sane(block));
51240+}
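+
+/*
+ * Illustrative sketch of a caller (hypothetical, for exposition; the
+ * real descent is driven by the coord_by_key() machinery): at each
+ * internal level, ask the item where to go next and continue in the
+ * child block:
+ *
+ *      reiser4_block_nr child;
+ *
+ *      down_link_internal(coord, key, &child);
+ *      ... load the node at @child and search one level lower ...
+ */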
51241+
51242+/* Get the child's block number, or 0 if the block is unallocated. */
51243+int
51244+utmost_child_real_block_internal(const coord_t * coord, sideof side UNUSED_ARG,
51245+ reiser4_block_nr * block)
51246+{
51247+ assert("jmacd-2059", coord != NULL);
51248+
51249+ *block = pointer_at(coord);
51250+ assert("nikita-2961", reiser4_blocknr_is_sane(block));
51251+
51252+ if (blocknr_is_fake(block)) {
51253+ *block = 0;
51254+ }
51255+
51256+ return 0;
51257+}
51258+
51259+/* Return the child. */
51260+int
51261+utmost_child_internal(const coord_t * coord, sideof side UNUSED_ARG,
51262+ jnode ** childp)
51263+{
51264+ reiser4_block_nr block = pointer_at(coord);
51265+ znode *child;
51266+
51267+ assert("jmacd-2059", childp != NULL);
51268+ assert("nikita-2962", reiser4_blocknr_is_sane(&block));
51269+
51270+ child = zlook(znode_get_tree(coord->node), &block);
51271+
51272+ if (IS_ERR(child)) {
51273+ return PTR_ERR(child);
51274+ }
51275+
51276+ *childp = ZJNODE(child);
51277+
51278+ return 0;
51279+}
51280+
51281+static void check_link(znode * left, znode * right)
51282+{
51283+ znode *scan;
51284+
51285+ for (scan = left; scan != right; scan = scan->right) {
51286+ if (ZF_ISSET(scan, JNODE_RIP))
51287+ break;
51288+ if (znode_is_right_connected(scan) && scan->right != NULL) {
51289+ if (ZF_ISSET(scan->right, JNODE_RIP))
51290+ break;
51291+ assert("nikita-3285",
51292+ znode_is_left_connected(scan->right));
51293+ assert("nikita-3265",
51294+ ergo(scan != left,
51295+ ZF_ISSET(scan, JNODE_HEARD_BANSHEE)));
51296+ assert("nikita-3284", scan->right->left == scan);
51297+ } else
51298+ break;
51299+ }
51300+}
51301+
51302+int check__internal(const coord_t * coord, const char **error)
51303+{
51304+ reiser4_block_nr blk;
51305+ znode *child;
51306+ coord_t cpy;
51307+
51308+ blk = pointer_at(coord);
51309+ if (!reiser4_blocknr_is_sane(&blk)) {
51310+ *error = "Invalid pointer";
51311+ return -1;
51312+ }
51313+ coord_dup(&cpy, coord);
51314+ child = znode_at(&cpy, cpy.node);
51315+ if (child != NULL) {
51316+ znode *left_child;
51317+ znode *right_child;
51318+
51319+ left_child = right_child = NULL;
51320+
51321+ assert("nikita-3256", znode_invariant(child));
51322+ if (coord_prev_item(&cpy) == 0 && item_is_internal(&cpy)) {
51323+ left_child = znode_at(&cpy, cpy.node);
51324+ if (left_child != NULL) {
51325+ read_lock_tree(znode_get_tree(child));
51326+ check_link(left_child, child);
51327+ read_unlock_tree(znode_get_tree(child));
51328+ zput(left_child);
51329+ }
51330+ }
51331+ coord_dup(&cpy, coord);
51332+ if (coord_next_item(&cpy) == 0 && item_is_internal(&cpy)) {
51333+ right_child = znode_at(&cpy, cpy.node);
51334+ if (right_child != NULL) {
51335+ read_lock_tree(znode_get_tree(child));
51336+ check_link(child, right_child);
51337+ read_unlock_tree(znode_get_tree(child));
51338+ zput(right_child);
51339+ }
51340+ }
51341+ zput(child);
51342+ }
51343+ return 0;
51344+}
51345+
51346+/* return true only if this item really points to "block" */
51347+/* Audited by: green(2002.06.14) */
51348+int has_pointer_to_internal(const coord_t * coord /* coord of item */ ,
51349+ const reiser4_block_nr * block /* block number to
51350+ * check */ )
51351+{
51352+ assert("nikita-613", coord != NULL);
51353+ assert("nikita-614", block != NULL);
51354+
51355+ return pointer_at(coord) == *block;
51356+}
51357+
51358+/* hook called by the ->create_item() method of the node plugin after a new
51359+ internal item has just been created.
51360+
51361+ This is the point where the pointer to the new node is inserted into the
51362+ tree. Initialize the parent pointer in the child znode, and insert the
51363+ child into the sibling list and the slum.
51364+*/
51365+int create_hook_internal(const coord_t * item /* coord of item */ ,
51366+ void *arg /* child's left neighbor, if any */ )
51367+{
51368+ znode *child;
51369+ __u64 child_ptr;
51370+
51371+ assert("nikita-1252", item != NULL);
51372+ assert("nikita-1253", item->node != NULL);
51373+ assert("nikita-1181", znode_get_level(item->node) > LEAF_LEVEL);
51374+ assert("nikita-1450", item->unit_pos == 0);
51375+
51376+ /*
51377+ * While preparing for item insertion, build_child_ptr_data() points
51378+ * the data to be inserted at the jnode's block number, which is in
51379+ * CPU byte order; the node's create_item() simply copies those data.
51380+ * As a result we have the child pointer in CPU byte order. Convert
51381+ * the content of the internal item to little-endian byte order.
51382+ */
51383+ child_ptr = get_unaligned((__u64 *)item_body_by_coord(item));
51384+ update_internal(item, &child_ptr);
51385+
51386+ child = znode_at(item, item->node);
51387+ if (child != NULL && !IS_ERR(child)) {
51388+ znode *left;
51389+ int result = 0;
51390+ reiser4_tree *tree;
51391+
51392+ left = arg;
51393+ tree = znode_get_tree(item->node);
51394+ write_lock_tree(tree);
51395+ write_lock_dk(tree);
51396+ assert("nikita-1400", (child->in_parent.node == NULL)
51397+ || (znode_above_root(child->in_parent.node)));
51398+ ++item->node->c_count;
51399+ coord_to_parent_coord(item, &child->in_parent);
51400+ sibling_list_insert_nolock(child, left);
51401+
51402+ assert("nikita-3297", ZF_ISSET(child, JNODE_ORPHAN));
51403+ ZF_CLR(child, JNODE_ORPHAN);
51404+
51405+ if ((left != NULL) && !keyeq(znode_get_rd_key(left),
51406+ znode_get_rd_key(child))) {
51407+ znode_set_rd_key(child, znode_get_rd_key(left));
51408+ }
51409+ write_unlock_dk(tree);
51410+ write_unlock_tree(tree);
51411+ zput(child);
51412+ return result;
51413+ } else {
51414+ if (child == NULL)
51415+ child = ERR_PTR(-EIO);
51416+ return PTR_ERR(child);
51417+ }
51418+}
51419+
51420+/* hook called by the ->cut_and_kill() method of the node plugin just before
51421+ an internal item is removed.
51422+
51423+ This is the point where an empty node is removed from the tree. Clear the
51424+ parent pointer in the child, and mark the node for pending deletion.
51425+
51426+ The node will actually be deleted later, in several stages:
51427+
51428+ . when the last lock on this node is released, the node is removed from
51429+ the sibling list and its lock is invalidated
51430+
51431+ . when the last reference to this node is dropped, the bitmap is updated
51432+ and the node is actually removed from memory.
51433+*/
51436+int kill_hook_internal(const coord_t * item /* coord of item */ ,
51437+ pos_in_node_t from UNUSED_ARG /* start unit */ ,
51438+ pos_in_node_t count UNUSED_ARG /* stop unit */ ,
51439+ struct carry_kill_data *p UNUSED_ARG)
51440+{
51441+ znode *child;
51442+
51443+ assert("nikita-1222", item != NULL);
51444+ assert("nikita-1224", from == 0);
51445+ assert("nikita-1225", count == 1);
51446+
51447+ child = znode_at(item, item->node);
51448+ if (IS_ERR(child))
51449+ return PTR_ERR(child);
51450+ else if (node_is_empty(child)) {
51451+ reiser4_tree *tree;
51452+
51453+ assert("nikita-1397", znode_is_write_locked(child));
51454+ assert("nikita-1398", child->c_count == 0);
51455+ assert("nikita-2546", ZF_ISSET(child, JNODE_HEARD_BANSHEE));
51456+
51457+ tree = znode_get_tree(item->node);
51458+ write_lock_tree(tree);
51459+ init_parent_coord(&child->in_parent, NULL);
51460+ --item->node->c_count;
51461+ write_unlock_tree(tree);
51462+ zput(child);
51463+ return 0;
51464+ } else {
51465+ warning("nikita-1223",
51466+ "Cowardly refuse to remove link to non-empty node");
51467+ zput(child);
51468+ return RETERR(-EIO);
51469+ }
51470+}
51471+
51472+/* hook called by the ->shift() node plugin method when an internal item has
51473+ just been moved from one node to another.
51474+
51475+ Update the parent pointer in the child and the c_counts in the old and new
51476+ parents.
51477+*/
51478+int shift_hook_internal(const coord_t * item /* coord of item */ ,
51479+ unsigned from UNUSED_ARG /* start unit */ ,
51480+ unsigned count UNUSED_ARG /* stop unit */ ,
51481+ znode * old_node /* old parent */ )
51482+{
51483+ znode *child;
51484+ znode *new_node;
51485+ reiser4_tree *tree;
51486+
51487+ assert("nikita-1276", item != NULL);
51488+ assert("nikita-1277", from == 0);
51489+ assert("nikita-1278", count == 1);
51490+ assert("nikita-1451", item->unit_pos == 0);
51491+
51492+ new_node = item->node;
51493+ assert("nikita-2132", new_node != old_node);
51494+ tree = znode_get_tree(item->node);
51495+ child = child_znode(item, old_node, 1, 0);
51496+ if (child == NULL)
51497+ return 0;
51498+ if (!IS_ERR(child)) {
51499+ write_lock_tree(tree);
51500+ ++new_node->c_count;
51501+ assert("nikita-1395", znode_parent(child) == old_node);
51502+ assert("nikita-1396", old_node->c_count > 0);
51503+ coord_to_parent_coord(item, &child->in_parent);
51504+ assert("nikita-1781", znode_parent(child) == new_node);
51505+ assert("nikita-1782",
51506+ check_tree_pointer(item, child) == NS_FOUND);
51507+ --old_node->c_count;
51508+ write_unlock_tree(tree);
51509+ zput(child);
51510+ return 0;
51511+ } else
51512+ return PTR_ERR(child);
51513+}
51514+
51515+/* plugin->u.item.b.max_key_inside - not defined */
51516+
51517+/* plugin->u.item.b.nr_units - item.c:single_unit */
51518+
51519+/* Make Linus happy.
51520+ Local variables:
51521+ c-indentation-style: "K&R"
51522+ mode-name: "LC"
51523+ c-basic-offset: 8
51524+ tab-width: 8
51525+ fill-column: 120
51526+ End:
51527+*/
51528Index: linux-2.6.16/fs/reiser4/plugin/item/internal.h
51529===================================================================
51530--- /dev/null
51531+++ linux-2.6.16/fs/reiser4/plugin/item/internal.h
51532@@ -0,0 +1,57 @@
51533+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
51534+/* Internal item contains down-link to the child of the internal/twig
51535+ node in a tree. It is internal items that are actually used during
51536+ tree traversal. */
51537+
51538+#if !defined( __FS_REISER4_PLUGIN_ITEM_INTERNAL_H__ )
51539+#define __FS_REISER4_PLUGIN_ITEM_INTERNAL_H__
51540+
51541+#include "../../forward.h"
51542+#include "../../dformat.h"
51543+
51544+/* on-disk layout of internal item */
51545+typedef struct internal_item_layout {
51546+ /* 0 */ reiser4_dblock_nr pointer;
51547+ /* 8 */
51548+} internal_item_layout;
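+
+/* The item body is thus a single 64-bit block number stored in
+   little-endian order. Item bodies are not guaranteed to be aligned,
+   which is why internal.c accesses the field with get_unaligned() /
+   put_unaligned() and converts with le64_to_cpu() / cpu_to_le64(). */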
51549+
51550+struct cut_list;
51551+
51552+int mergeable_internal(const coord_t * p1, const coord_t * p2);
51553+lookup_result lookup_internal(const reiser4_key * key, lookup_bias bias,
51554+ coord_t * coord);
51555+/* store pointer from internal item into "block". Implementation of
51556+ ->down_link() method */
51557+extern void down_link_internal(const coord_t * coord, const reiser4_key * key,
51558+ reiser4_block_nr * block);
51559+extern int has_pointer_to_internal(const coord_t * coord,
51560+ const reiser4_block_nr * block);
51561+extern int create_hook_internal(const coord_t * item, void *arg);
51562+extern int kill_hook_internal(const coord_t * item, pos_in_node_t from,
51563+ pos_in_node_t count, struct carry_kill_data *);
51564+extern int shift_hook_internal(const coord_t * item, unsigned from,
51565+ unsigned count, znode * old_node);
51566+extern void print_internal(const char *prefix, coord_t * coord);
51567+
51568+extern int utmost_child_internal(const coord_t * coord, sideof side,
51569+ jnode ** child);
51570+int utmost_child_real_block_internal(const coord_t * coord, sideof side,
51571+ reiser4_block_nr * block);
51572+
51573+extern void update_internal(const coord_t * coord,
51574+ const reiser4_block_nr * blocknr);
51575+/* FIXME: reiserfs has check_internal */
51576+extern int check__internal(const coord_t * coord, const char **error);
51577+
51578+/* __FS_REISER4_PLUGIN_ITEM_INTERNAL_H__ */
51579+#endif
51580+
51581+/* Make Linus happy.
51582+ Local variables:
51583+ c-indentation-style: "K&R"
51584+ mode-name: "LC"
51585+ c-basic-offset: 8
51586+ tab-width: 8
51587+ fill-column: 120
51588+ End:
51589+*/
51590Index: linux-2.6.16/fs/reiser4/plugin/item/item.c
51591===================================================================
51592--- /dev/null
51593+++ linux-2.6.16/fs/reiser4/plugin/item/item.c
51594@@ -0,0 +1,727 @@
51595+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
51596+
51597+/* definition of item plugins. */
51598+
51599+#include "../../forward.h"
51600+#include "../../debug.h"
51601+#include "../../key.h"
51602+#include "../../coord.h"
51603+#include "../plugin_header.h"
51604+#include "sde.h"
51605+#include "internal.h"
51606+#include "item.h"
51607+#include "static_stat.h"
51608+#include "../plugin.h"
51609+#include "../../znode.h"
51610+#include "../../tree.h"
51611+#include "../../context.h"
51612+#include "ctail.h"
51613+
51614+/* locate the item body and cache its offset in @coord */
51615+void item_body_by_coord_hard(coord_t * coord /* coord to query */ )
51616+{
51617+ assert("nikita-324", coord != NULL);
51618+ assert("nikita-325", coord->node != NULL);
51619+ assert("nikita-326", znode_is_loaded(coord->node));
51620+ assert("nikita-3200", coord->offset == INVALID_OFFSET);
51621+
51622+ coord->offset =
51623+ node_plugin_by_node(coord->node)->item_by_coord(coord) -
51624+ zdata(coord->node);
51625+ ON_DEBUG(coord->body_v = coord->node->times_locked);
51626+}
51627+
51628+void *item_body_by_coord_easy(const coord_t * coord /* coord to query */ )
51629+{
51630+ return zdata(coord->node) + coord->offset;
51631+}
51632+
51633+#if REISER4_DEBUG
51634+
51635+int item_body_is_valid(const coord_t * coord)
51636+{
51637+ return
51638+ coord->offset ==
51639+ node_plugin_by_node(coord->node)->item_by_coord(coord) -
51640+ zdata(coord->node);
51641+}
51642+
51643+#endif
51644+
51645+/* return length of item at @coord */
51646+pos_in_node_t item_length_by_coord(const coord_t * coord /* coord to query */ )
51647+{
51648+ int len;
51649+
51650+ assert("nikita-327", coord != NULL);
51651+ assert("nikita-328", coord->node != NULL);
51652+ assert("nikita-329", znode_is_loaded(coord->node));
51653+
51654+ len = node_plugin_by_node(coord->node)->length_by_coord(coord);
51655+ return len;
51656+}
51657+
51658+void obtain_item_plugin(const coord_t * coord)
51659+{
51660+ assert("nikita-330", coord != NULL);
51661+ assert("nikita-331", coord->node != NULL);
51662+ assert("nikita-332", znode_is_loaded(coord->node));
51663+
51664+ coord_set_iplug((coord_t *) coord,
51665+ node_plugin_by_node(coord->node)->
51666+ plugin_by_coord(coord));
51667+ assert("nikita-2479",
51668+ coord_iplug(coord) ==
51669+ node_plugin_by_node(coord->node)->plugin_by_coord(coord));
51670+}
51671+
51672+/* return type of item at @coord */
51673+item_type_id item_type_by_coord(const coord_t * coord /* coord to query */ )
51674+{
51675+ assert("nikita-333", coord != NULL);
51676+ assert("nikita-334", coord->node != NULL);
51677+ assert("nikita-335", znode_is_loaded(coord->node));
51678+ assert("nikita-336", item_plugin_by_coord(coord) != NULL);
51679+
51680+ return item_plugin_by_coord(coord)->b.item_type;
51681+}
51682+
51683+/* return id of item */
51684+/* Audited by: green(2002.06.15) */
51685+item_id item_id_by_coord(const coord_t * coord /* coord to query */ )
51686+{
51687+ assert("vs-539", coord != NULL);
51688+ assert("vs-538", coord->node != NULL);
51689+ assert("vs-537", znode_is_loaded(coord->node));
51690+ assert("vs-536", item_plugin_by_coord(coord) != NULL);
51691+ assert("vs-540",
51692+ item_id_by_plugin(item_plugin_by_coord(coord)) < LAST_ITEM_ID);
51693+
51694+ return item_id_by_plugin(item_plugin_by_coord(coord));
51695+}
51696+
51697+/* return key of item at @coord */
51698+/* Audited by: green(2002.06.15) */
51699+reiser4_key *item_key_by_coord(const coord_t * coord /* coord to query */ ,
51700+ reiser4_key * key /* result */ )
51701+{
51702+ assert("nikita-338", coord != NULL);
51703+ assert("nikita-339", coord->node != NULL);
51704+ assert("nikita-340", znode_is_loaded(coord->node));
51705+
51706+ return node_plugin_by_node(coord->node)->key_at(coord, key);
51707+}
51708+
51709+/* this returns max key in the item */
51710+reiser4_key *max_item_key_by_coord(const coord_t * coord /* coord to query */ ,
51711+ reiser4_key * key /* result */ )
51712+{
51713+ coord_t last;
51714+
51715+ assert("nikita-338", coord != NULL);
51716+ assert("nikita-339", coord->node != NULL);
51717+ assert("nikita-340", znode_is_loaded(coord->node));
51718+
51719+ /* make coord pointing to last item's unit */
51720+ coord_dup(&last, coord);
51721+ last.unit_pos = coord_num_units(&last) - 1;
51722+ assert("vs-1560", coord_is_existing_unit(&last));
51723+
51724+ max_unit_key_by_coord(&last, key);
51725+ return key;
51726+}
51727+
51728+/* return key of unit at @coord */
51729+reiser4_key *unit_key_by_coord(const coord_t * coord /* coord to query */ ,
51730+ reiser4_key * key /* result */ )
51731+{
51732+ assert("nikita-772", coord != NULL);
51733+ assert("nikita-774", coord->node != NULL);
51734+ assert("nikita-775", znode_is_loaded(coord->node));
51735+
51736+ if (item_plugin_by_coord(coord)->b.unit_key != NULL)
51737+ return item_plugin_by_coord(coord)->b.unit_key(coord, key);
51738+ else
51739+ return item_key_by_coord(coord, key);
51740+}
51741+
51742+/* return the biggest key contained in the unit @coord */
51743+reiser4_key *max_unit_key_by_coord(const coord_t * coord /* coord to query */ ,
51744+ reiser4_key * key /* result */ )
51745+{
51746+ assert("nikita-772", coord != NULL);
51747+ assert("nikita-774", coord->node != NULL);
51748+ assert("nikita-775", znode_is_loaded(coord->node));
51749+
51750+ if (item_plugin_by_coord(coord)->b.max_unit_key != NULL)
51751+ return item_plugin_by_coord(coord)->b.max_unit_key(coord, key);
51752+ else
51753+ return unit_key_by_coord(coord, key);
51754+}
51755+
51756+/* ->max_key_inside() method for items consisting of exactly one key (like
51757+ stat-data) */
51758+static reiser4_key *max_key_inside_single_key(const coord_t *
51759+ coord /* coord of item */ ,
51760+ reiser4_key *
51761+ result /* resulting key */ )
51762+{
51763+ assert("nikita-604", coord != NULL);
51764+
51765+ /* coord -> key is starting key of this item and it has to be already
51766+ filled in */
51767+ return unit_key_by_coord(coord, result);
51768+}
51769+
51770+/* ->nr_units() method for items consisting of exactly one unit always */
51771+static pos_in_node_t
51772+nr_units_single_unit(const coord_t * coord UNUSED_ARG /* coord of item */ )
51773+{
51774+ return 1;
51775+}
51776+
51777+static int
51778+paste_no_paste(coord_t * coord UNUSED_ARG,
51779+ reiser4_item_data * data UNUSED_ARG,
51780+ carry_plugin_info * info UNUSED_ARG)
51781+{
51782+ return 0;
51783+}
51784+
51785+/* default ->fast_paste() method */
51786+static int
51787+agree_to_fast_op(const coord_t * coord UNUSED_ARG /* coord of item */ )
51788+{
51789+ return 1;
51790+}
51791+
51792+int item_can_contain_key(const coord_t * item /* coord of item */ ,
51793+ const reiser4_key * key /* key to check */ ,
51794+ const reiser4_item_data * data /* parameters of item
51795+ * being created */ )
51796+{
51797+ item_plugin *iplug;
51798+ reiser4_key min_key_in_item;
51799+ reiser4_key max_key_in_item;
51800+
51801+ assert("nikita-1658", item != NULL);
51802+ assert("nikita-1659", key != NULL);
51803+
51804+ iplug = item_plugin_by_coord(item);
51805+ if (iplug->b.can_contain_key != NULL)
51806+ return iplug->b.can_contain_key(item, key, data);
51807+ else {
51808+ assert("nikita-1681", iplug->b.max_key_inside != NULL);
51809+ item_key_by_coord(item, &min_key_in_item);
51810+ iplug->b.max_key_inside(item, &max_key_in_item);
51811+
51812+ /* can contain key if
51813+ min_key_in_item <= key &&
51814+ key <= max_key_in_item
51815+ */
51816+ return keyle(&min_key_in_item, key)
51817+ && keyle(key, &max_key_in_item);
51818+ }
51819+}
51820+
51821+/* ->mergeable() method for non-mergeable items */
51822+static int
51823+not_mergeable(const coord_t * i1 UNUSED_ARG, const coord_t * i2 UNUSED_ARG)
51824+{
51825+ return 0;
51826+}
51827+
51828+/* return 0 if @item1 and @item2 are not mergeable, non-zero otherwise */
51829+int are_items_mergeable(const coord_t * i1 /* coord of first item */ ,
51830+ const coord_t * i2 /* coord of second item */ )
51831+{
51832+ item_plugin *iplug;
51833+ reiser4_key k1;
51834+ reiser4_key k2;
51835+
51836+ assert("nikita-1336", i1 != NULL);
51837+ assert("nikita-1337", i2 != NULL);
51838+
51839+ iplug = item_plugin_by_coord(i1);
51840+ assert("nikita-1338", iplug != NULL);
51841+
51842+ /* NOTE-NIKITA are_items_mergeable() is also called by assertions in
51843+ shifting code when nodes are in "suspended" state. */
51844+ assert("nikita-1663",
51845+ keyle(item_key_by_coord(i1, &k1), item_key_by_coord(i2, &k2)));
51846+
51847+ if (iplug->b.mergeable != NULL) {
51848+ return iplug->b.mergeable(i1, i2);
51849+ } else if (iplug->b.max_key_inside != NULL) {
51853+ /* mergeable if ->max_key_inside() >= key of i2; */
51854+ return keyge(iplug->b.max_key_inside(i1, &k1),
51855+ item_key_by_coord(i2, &k2));
51856+ } else {
51857+ item_key_by_coord(i1, &k1);
51858+ item_key_by_coord(i2, &k2);
51859+
51860+ return
51861+ (get_key_locality(&k1) == get_key_locality(&k2)) &&
51862+ (get_key_objectid(&k1) == get_key_objectid(&k2))
51863+ && (iplug == item_plugin_by_coord(i2));
51864+ }
51865+}
51866+
51867+int item_is_extent(const coord_t * item)
51868+{
51869+ assert("vs-482", coord_is_existing_item(item));
51870+ return item_id_by_coord(item) == EXTENT_POINTER_ID;
51871+}
51872+
51873+int item_is_tail(const coord_t * item)
51874+{
51875+ assert("vs-482", coord_is_existing_item(item));
51876+ return item_id_by_coord(item) == FORMATTING_ID;
51877+}
51878+
51879+int item_is_statdata(const coord_t * item)
51880+{
51881+ assert("vs-516", coord_is_existing_item(item));
51882+ return item_type_by_coord(item) == STAT_DATA_ITEM_TYPE;
51883+}
51884+
51885+int item_is_ctail(const coord_t * item)
51886+{
51887+ assert("edward-xx", coord_is_existing_item(item));
51888+ return item_id_by_coord(item) == CTAIL_ID;
51889+}
51890+
51891+static int change_item(struct inode *inode, reiser4_plugin * plugin)
51892+{
51893+ /* cannot change constituent item (sd, or dir_item) */
51894+ return RETERR(-EINVAL);
51895+}
51896+
51897+static reiser4_plugin_ops item_plugin_ops = {
51898+ .init = NULL,
51899+ .load = NULL,
51900+ .save_len = NULL,
51901+ .save = NULL,
51902+ .change = change_item
51903+};
51904+
51905+item_plugin item_plugins[LAST_ITEM_ID] = {
51906+ [STATIC_STAT_DATA_ID] = {
51907+ .h = {
51908+ .type_id = REISER4_ITEM_PLUGIN_TYPE,
51909+ .id = STATIC_STAT_DATA_ID,
51910+ .pops = &item_plugin_ops,
51911+ .label = "sd",
51912+ .desc = "stat-data",
51913+ .linkage = {NULL, NULL}
51914+ },
51915+ .b = {
51916+ .item_type = STAT_DATA_ITEM_TYPE,
51917+ .max_key_inside = max_key_inside_single_key,
51918+ .can_contain_key = NULL,
51919+ .mergeable = not_mergeable,
51920+ .nr_units = nr_units_single_unit,
51921+ .lookup = NULL,
51922+ .init = NULL,
51923+ .paste = paste_no_paste,
51924+ .fast_paste = NULL,
51925+ .can_shift = NULL,
51926+ .copy_units = NULL,
51927+ .create_hook = NULL,
51928+ .kill_hook = NULL,
51929+ .shift_hook = NULL,
51930+ .cut_units = NULL,
51931+ .kill_units = NULL,
51932+ .unit_key = NULL,
51933+ .max_unit_key = NULL,
51934+ .estimate = NULL,
51935+ .item_data_by_flow = NULL,
51936+#if REISER4_DEBUG
51937+ .check = NULL
51938+#endif
51939+ },
51940+ .f = {
51941+ .utmost_child = NULL,
51942+ .utmost_child_real_block = NULL,
51943+ .update = NULL,
51944+ .scan = NULL,
51945+ .convert = NULL
51946+ },
51947+ .s = {
51948+ .sd = {
51949+ .init_inode = init_inode_static_sd,
51950+ .save_len = save_len_static_sd,
51951+ .save = save_static_sd
51952+ }
51953+ }
51954+ },
51955+ [SIMPLE_DIR_ENTRY_ID] = {
51956+ .h = {
51957+ .type_id = REISER4_ITEM_PLUGIN_TYPE,
51958+ .id = SIMPLE_DIR_ENTRY_ID,
51959+ .pops = &item_plugin_ops,
51960+ .label = "de",
51961+ .desc = "directory entry",
51962+ .linkage = {NULL, NULL}
51963+ },
51964+ .b = {
51965+ .item_type = DIR_ENTRY_ITEM_TYPE,
51966+ .max_key_inside = max_key_inside_single_key,
51967+ .can_contain_key = NULL,
51968+ .mergeable = NULL,
51969+ .nr_units = nr_units_single_unit,
51970+ .lookup = NULL,
51971+ .init = NULL,
51972+ .paste = NULL,
51973+ .fast_paste = NULL,
51974+ .can_shift = NULL,
51975+ .copy_units = NULL,
51976+ .create_hook = NULL,
51977+ .kill_hook = NULL,
51978+ .shift_hook = NULL,
51979+ .cut_units = NULL,
51980+ .kill_units = NULL,
51981+ .unit_key = NULL,
51982+ .max_unit_key = NULL,
51983+ .estimate = NULL,
51984+ .item_data_by_flow = NULL,
51985+#if REISER4_DEBUG
51986+ .check = NULL
51987+#endif
51988+ },
51989+ .f = {
51990+ .utmost_child = NULL,
51991+ .utmost_child_real_block = NULL,
51992+ .update = NULL,
51993+ .scan = NULL,
51994+ .convert = NULL
51995+ },
51996+ .s = {
51997+ .dir = {
51998+ .extract_key = extract_key_de,
51999+ .update_key = update_key_de,
52000+ .extract_name = extract_name_de,
52001+ .extract_file_type = extract_file_type_de,
52002+ .add_entry = add_entry_de,
52003+ .rem_entry = rem_entry_de,
52004+ .max_name_len = max_name_len_de
52005+ }
52006+ }
52007+ },
52008+ [COMPOUND_DIR_ID] = {
52009+ .h = {
52010+ .type_id = REISER4_ITEM_PLUGIN_TYPE,
52011+ .id = COMPOUND_DIR_ID,
52012+ .pops = &item_plugin_ops,
52013+ .label = "cde",
52014+ .desc = "compressed directory entry",
52015+ .linkage = {NULL, NULL}
52016+ },
52017+ .b = {
52018+ .item_type = DIR_ENTRY_ITEM_TYPE,
52019+ .max_key_inside = max_key_inside_cde,
52020+ .can_contain_key = can_contain_key_cde,
52021+ .mergeable = mergeable_cde,
52022+ .nr_units = nr_units_cde,
52023+ .lookup = lookup_cde,
52024+ .init = init_cde,
52025+ .paste = paste_cde,
52026+ .fast_paste = agree_to_fast_op,
52027+ .can_shift = can_shift_cde,
52028+ .copy_units = copy_units_cde,
52029+ .create_hook = NULL,
52030+ .kill_hook = NULL,
52031+ .shift_hook = NULL,
52032+ .cut_units = cut_units_cde,
52033+ .kill_units = kill_units_cde,
52034+ .unit_key = unit_key_cde,
52035+ .max_unit_key = unit_key_cde,
52036+ .estimate = estimate_cde,
52037+ .item_data_by_flow = NULL,
52038+#if REISER4_DEBUG
52039+ .check = check_cde
52040+#endif
52041+ },
52042+ .f = {
52043+ .utmost_child = NULL,
52044+ .utmost_child_real_block = NULL,
52045+ .update = NULL,
52046+ .scan = NULL,
52047+ .convert = NULL
52048+ },
52049+ .s = {
52050+ .dir = {
52051+ .extract_key = extract_key_cde,
52052+ .update_key = update_key_cde,
52053+ .extract_name = extract_name_cde,
52054+ .extract_file_type = extract_file_type_de,
52055+ .add_entry = add_entry_cde,
52056+ .rem_entry = rem_entry_cde,
52057+ .max_name_len = max_name_len_cde
52058+ }
52059+ }
52060+ },
52061+ [NODE_POINTER_ID] = {
52062+ .h = {
52063+ .type_id = REISER4_ITEM_PLUGIN_TYPE,
52064+ .id = NODE_POINTER_ID,
52065+ .pops = NULL,
52066+ .label = "internal",
52067+ .desc = "internal item",
52068+ .linkage = {NULL, NULL}
52069+ },
52070+ .b = {
52071+ .item_type = INTERNAL_ITEM_TYPE,
52072+ .max_key_inside = NULL,
52073+ .can_contain_key = NULL,
52074+ .mergeable = mergeable_internal,
52075+ .nr_units = nr_units_single_unit,
52076+ .lookup = lookup_internal,
52077+ .init = NULL,
52078+ .paste = NULL,
52079+ .fast_paste = NULL,
52080+ .can_shift = NULL,
52081+ .copy_units = NULL,
52082+ .create_hook = create_hook_internal,
52083+ .kill_hook = kill_hook_internal,
52084+ .shift_hook = shift_hook_internal,
52085+ .cut_units = NULL,
52086+ .kill_units = NULL,
52087+ .unit_key = NULL,
52088+ .max_unit_key = NULL,
52089+ .estimate = NULL,
52090+ .item_data_by_flow = NULL,
52091+#if REISER4_DEBUG
52092+ .check = check__internal
52093+#endif
52094+ },
52095+ .f = {
52096+ .utmost_child = utmost_child_internal,
52097+ .utmost_child_real_block =
52098+ utmost_child_real_block_internal,
52099+ .update = update_internal,
52100+ .scan = NULL,
52101+ .convert = NULL
52102+ },
52103+ .s = {
52104+ .internal = {
52105+ .down_link = down_link_internal,
52106+ .has_pointer_to = has_pointer_to_internal
52107+ }
52108+ }
52109+ },
52110+ [EXTENT_POINTER_ID] = {
52111+ .h = {
52112+ .type_id = REISER4_ITEM_PLUGIN_TYPE,
52113+ .id = EXTENT_POINTER_ID,
52114+ .pops = NULL,
52115+ .label = "extent",
52116+ .desc = "extent item",
52117+ .linkage = {NULL, NULL}
52118+ },
52119+ .b = {
52120+ .item_type = UNIX_FILE_METADATA_ITEM_TYPE,
52121+ .max_key_inside = max_key_inside_extent,
52122+ .can_contain_key = can_contain_key_extent,
52123+ .mergeable = mergeable_extent,
52124+ .nr_units = nr_units_extent,
52125+ .lookup = lookup_extent,
52126+ .init = NULL,
52127+ .paste = paste_extent,
52128+ .fast_paste = agree_to_fast_op,
52129+ .can_shift = can_shift_extent,
52130+ .create_hook = create_hook_extent,
52131+ .copy_units = copy_units_extent,
52132+ .kill_hook = kill_hook_extent,
52133+ .shift_hook = NULL,
52134+ .cut_units = cut_units_extent,
52135+ .kill_units = kill_units_extent,
52136+ .unit_key = unit_key_extent,
52137+ .max_unit_key = max_unit_key_extent,
52138+ .estimate = NULL,
52139+ .item_data_by_flow = NULL,
52140+#if REISER4_DEBUG
52141+ .check = check_extent
52142+#endif
52143+ },
52144+ .f = {
52145+ .utmost_child = utmost_child_extent,
52146+ .utmost_child_real_block =
52147+ utmost_child_real_block_extent,
52148+ .update = NULL,
52149+ .scan = scan_extent,
52150+ .convert = NULL,
52151+ .key_by_offset = key_by_offset_extent
52152+ },
52153+ .s = {
52154+ .file = {
52155+ .write = write_extent,
52156+ .read = read_extent,
52157+ .readpage = readpage_extent,
52158+ .get_block = get_block_address_extent,
52159+ .readpages = readpages_extent,
52160+ .append_key = append_key_extent,
52161+ .init_coord_extension =
52162+ init_coord_extension_extent
52163+ }
52164+ }
52165+ },
52166+ [FORMATTING_ID] = {
52167+ .h = {
52168+ .type_id = REISER4_ITEM_PLUGIN_TYPE,
52169+ .id = FORMATTING_ID,
52170+ .pops = NULL,
52171+ .label = "body",
52172+ .desc = "body (or tail?) item",
52173+ .linkage = {NULL, NULL}
52174+ },
52175+ .b = {
52176+ .item_type = UNIX_FILE_METADATA_ITEM_TYPE,
52177+ .max_key_inside = max_key_inside_tail,
52178+ .can_contain_key = can_contain_key_tail,
52179+ .mergeable = mergeable_tail,
52180+ .nr_units = nr_units_tail,
52181+ .lookup = lookup_tail,
52182+ .init = NULL,
52183+ .paste = paste_tail,
52184+ .fast_paste = agree_to_fast_op,
52185+ .can_shift = can_shift_tail,
52186+ .create_hook = NULL,
52187+ .copy_units = copy_units_tail,
52188+ .kill_hook = kill_hook_tail,
52189+ .shift_hook = NULL,
52190+ .cut_units = cut_units_tail,
52191+ .kill_units = kill_units_tail,
52192+ .unit_key = unit_key_tail,
52193+ .max_unit_key = unit_key_tail,
52194+ .estimate = NULL,
52195+ .item_data_by_flow = NULL,
52196+#if REISER4_DEBUG
52197+ .check = NULL
52198+#endif
52199+ },
52200+ .f = {
52201+ .utmost_child = NULL,
52202+ .utmost_child_real_block = NULL,
52203+ .update = NULL,
52204+ .scan = NULL,
52205+ .convert = NULL
52206+ },
52207+ .s = {
52208+ .file = {
52209+ .write = write_tail,
52210+ .read = read_tail,
52211+ .readpage = readpage_tail,
52212+ .get_block = NULL,
52213+ .readpages = NULL,
52214+ .append_key = append_key_tail,
52215+ .init_coord_extension =
52216+ init_coord_extension_tail
52217+ }
52218+ }
52219+ },
52220+ [CTAIL_ID] = {
52221+ .h = {
52222+ .type_id = REISER4_ITEM_PLUGIN_TYPE,
52223+ .id = CTAIL_ID,
52224+ .pops = NULL,
52225+ .label = "ctail",
52226+ .desc = "cryptcompress tail item",
52227+ .linkage = {NULL, NULL}
52228+ },
52229+ .b = {
52230+ .item_type = UNIX_FILE_METADATA_ITEM_TYPE,
52231+ .max_key_inside = max_key_inside_tail,
52232+ .can_contain_key = can_contain_key_ctail,
52233+ .mergeable = mergeable_ctail,
52234+ .nr_units = nr_units_ctail,
52235+ .lookup = NULL,
52236+ .init = init_ctail,
52237+ .paste = paste_ctail,
52238+ .fast_paste = agree_to_fast_op,
52239+ .can_shift = can_shift_ctail,
52240+ .create_hook = create_hook_ctail,
52241+ .copy_units = copy_units_ctail,
52242+ .kill_hook = kill_hook_ctail,
52243+ .shift_hook = shift_hook_ctail,
52244+ .cut_units = cut_units_ctail,
52245+ .kill_units = kill_units_ctail,
52246+ .unit_key = unit_key_tail,
52247+ .max_unit_key = unit_key_tail,
52248+ .estimate = estimate_ctail,
52249+ .item_data_by_flow = NULL,
52250+#if REISER4_DEBUG
52251+ .check = check_ctail
52252+#endif
52253+ },
52254+ .f = {
52255+ .utmost_child = utmost_child_ctail,
52256+ /* FIXME-EDWARD: write this */
52257+ .utmost_child_real_block = NULL,
52258+ .update = NULL,
52259+ .scan = scan_ctail,
52260+ .convert = convert_ctail
52261+ },
52262+ .s = {
52263+ .file = {
52264+ .write = NULL,
52265+ .read = read_ctail,
52266+ .readpage = readpage_ctail,
52267+ .get_block = get_block_address_tail,
52268+ .readpages = readpages_ctail,
52269+ .append_key = append_key_ctail,
52270+ .init_coord_extension =
52271+ init_coord_extension_tail
52272+ }
52273+ }
52274+ },
52275+ [BLACK_BOX_ID] = {
52276+ .h = {
52277+ .type_id = REISER4_ITEM_PLUGIN_TYPE,
52278+ .id = BLACK_BOX_ID,
52279+ .pops = NULL,
52280+ .label = "blackbox",
52281+ .desc = "black box item",
52282+ .linkage = {NULL, NULL}
52283+ },
52284+ .b = {
52285+ .item_type = OTHER_ITEM_TYPE,
52286+ .max_key_inside = NULL,
52287+ .can_contain_key = NULL,
52288+ .mergeable = not_mergeable,
52289+ .nr_units = nr_units_single_unit,
52290+ /* no need for a ->lookup method */
52291+ .lookup = NULL,
52292+ .init = NULL,
52293+ .paste = NULL,
52294+ .fast_paste = NULL,
52295+ .can_shift = NULL,
52296+ .copy_units = NULL,
52297+ .create_hook = NULL,
52298+ .kill_hook = NULL,
52299+ .shift_hook = NULL,
52300+ .cut_units = NULL,
52301+ .kill_units = NULL,
52302+ .unit_key = NULL,
52303+ .max_unit_key = NULL,
52304+ .estimate = NULL,
52305+ .item_data_by_flow = NULL,
52306+#if REISER4_DEBUG
52307+ .check = NULL
52308+#endif
52309+ }
52310+ }
52311+};
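+
+/*
+ * Note: the array is indexed by item_id, so lookups such as
+ * item_plugin_by_id(NODE_POINTER_ID) resolve directly to the
+ * initializers above. A method left NULL is simply not implemented
+ * for that item type; callers are expected to check before
+ * dispatching (see, e.g., item_can_contain_key() and
+ * unit_key_by_coord() above).
+ */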
52312+
52313+/* Make Linus happy.
52314+ Local variables:
52315+ c-indentation-style: "K&R"
52316+ mode-name: "LC"
52317+ c-basic-offset: 8
52318+ tab-width: 8
52319+ fill-column: 120
52320+ End:
52321+*/
52322Index: linux-2.6.16/fs/reiser4/plugin/item/item.h
52323===================================================================
52324--- /dev/null
52325+++ linux-2.6.16/fs/reiser4/plugin/item/item.h
52326@@ -0,0 +1,399 @@
52327+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
52328+
52329+/* first read balance.c comments before reading this */
52330+
52331+/* An item_plugin implements all of the operations required for
52332+ balancing that are item specific. */
52333+
52334+/* an item plugin also implements other operations that are specific to that
52335+ item. These go into the item specific operations portion of the item
52336+ handler, and all of the item specific portions of the item handler are put
52337+ into a union. */
52338+
52339+#if !defined( __REISER4_ITEM_H__ )
52340+#define __REISER4_ITEM_H__
52341+
52342+#include "../../forward.h"
52343+#include "../plugin_header.h"
52344+#include "../../dformat.h"
52345+#include "../../seal.h"
52346+#include "../../plugin/file/file.h"
52347+
52348+#include <linux/fs.h> /* for struct file, struct inode */
52349+#include <linux/mm.h> /* for struct page */
52350+#include <linux/dcache.h> /* for struct dentry */
52351+
52352+typedef enum {
52353+ STAT_DATA_ITEM_TYPE,
52354+ DIR_ENTRY_ITEM_TYPE,
52355+ INTERNAL_ITEM_TYPE,
52356+ UNIX_FILE_METADATA_ITEM_TYPE,
52357+ OTHER_ITEM_TYPE
52358+} item_type_id;
52359+
52360+/* this is the part of each item plugin that all items are expected to
52361+ support or at least explicitly fail to support by setting the
52362+ pointer to null. */
52363+typedef struct {
52364+ item_type_id item_type;
52365+
52366+ /* operations called by balancing
52367+
52368+ It is interesting to consider that some of these item
52369+ operations could be given sources or targets that are not
52370+ really items in nodes. This could be ok/useful.
52371+
52372+ */
52373+ /* maximal key that can _possibly_ be occupied by this item
52374+
52375+ When inserting, the node ->lookup() method (called by
52376+ coord_by_key()) reaches an item after binary search; the
52377+ ->max_key_inside() item plugin method is then used to determine
52378+ whether the new data should be pasted into the existing item
52379+ (new_key <= max_key_inside()) or a new item has to be created
52380+ (new_key > max_key_inside()).
52381+
52382+ For items that occupy exactly one key (like stat-data)
52383+ this method should return that key. For items that can
52384+ grow indefinitely (extent, directory item) this should
52385+ return max_key().
52386+
52387+ For example, for an extent with the key
52388+
52389+ (LOCALITY,4,OBJID,STARTING-OFFSET) and length BLK blocks,
52390+
52391+ ->max_key_inside is (LOCALITY,4,OBJID,0xffffffffffffffff).
52392+ */
52393+ reiser4_key *(*max_key_inside) (const coord_t *, reiser4_key *);
52394+
52395+ /* true if item @coord can merge data at @key. */
52396+ int (*can_contain_key) (const coord_t *, const reiser4_key *,
52397+ const reiser4_item_data *);
52398+ /* mergeable() - check items for mergeability
52399+
52400+ Optional method. Returns true if two items can be merged.
52401+
52402+ */
52403+ int (*mergeable) (const coord_t *, const coord_t *);
52404+
52405+ /* number of atomic things in an item */
52406+ pos_in_node_t(*nr_units) (const coord_t *);
52407+
52408+ /* search within the item for a unit, and return a pointer to it.
52409+ This can be used to calculate how many bytes to shrink an item
52410+ by, using pointer arithmetic against the start of the item
52411+ body, provided the item's data are contiguous in the node. If
52412+ the item's data are not contiguous in the node, all sorts of
52413+ other things are probably going to break as well. */
52415+ lookup_result(*lookup) (const reiser4_key *, lookup_bias, coord_t *);
52416+ /* method called by node_plugin->create_item() to initialise a new
52417+ item */
52418+ int (*init) (coord_t * target, coord_t * from,
52419+ reiser4_item_data * data);
52420+ /* method called (e.g., by resize_item()) to place new data into
52421+ item when it grows */
52422+ int (*paste) (coord_t *, reiser4_item_data *, carry_plugin_info *);
52423+ /* return true if a paste into @coord is allowed to skip
52424+ carry. That is, if such a paste would not require any changes
52425+ at the parent level
52426+ */
52427+ int (*fast_paste) (const coord_t *);
52428+ /* how many units of @source (but not more than @want) can be
52429+ shifted into the @target node. If pend == append, we try to
52430+ append the last item of @target with the first units of @source.
52431+ If pend == prepend, we try to "prepend" the first item in
52432+ @target with the last units of @source. The @target node has
52433+ @free_space bytes of free space. The total size of those units
52434+ is returned via @size.
52435+
52436+ @target is not NULL if shifting into a mergeable item, and NULL
52437+ if a new item will be created during shifting. */
52438+ */
52439+ int (*can_shift) (unsigned free_space, coord_t *,
52440+ znode *, shift_direction, unsigned *size,
52441+ unsigned want);
52442+
52443+ /* starting from the @from-th unit of item @source, append or
52444+ prepend @count units to @target. @target has already been
52445+ expanded by @free_space bytes, which must be exactly what those
52446+ units need. If @where_is_free_space == SHIFT_LEFT, the free
52447+ space is at the end of the @target item; otherwise it is at the
52448+ beginning of it. */
52449+ void (*copy_units) (coord_t *, coord_t *,
52450+ unsigned from, unsigned count,
52451+ shift_direction where_is_free_space,
52452+ unsigned free_space);
52453+
52454+ int (*create_hook) (const coord_t *, void *);
52455+ /* do whatever is necessary to do when @count units starting
52456+ from @from-th one are removed from the tree */
52457+ /* FIXME-VS: this used to be here so that, in particular,
52458+ extents and items of internal type could free the blocks they
52459+ point to at the same time as the items were removed from the
52460+ tree. Problems start, however, when dealloc_block fails for
52461+ some reason: the item gets removed, but the blocks it pointed
52462+ to are not freed. It is not clear how to fix this for items of
52463+ internal type, because the need to remove an internal item may
52464+ appear in the middle of balancing, and there is no way to undo
52465+ the changes made. OTOH, if the space allocator involves
52466+ balancing to perform dealloc_block, this will probably break
52467+ balancing due to deadlock issues
52468+ */
52469+ int (*kill_hook) (const coord_t *, pos_in_node_t from,
52470+ pos_in_node_t count, struct carry_kill_data *);
52471+ int (*shift_hook) (const coord_t *, unsigned from, unsigned count,
52472+ znode * _node);
52473+
52474+ /* unit @*from contains @from_key and unit @*to contains @to_key. Cut all keys between @from_key and @to_key,
52475+ including the boundaries. When units are cut from the item beginning, move the space which gets freed to the head
52476+ of the item. When units are cut from the item end, move the freed space to the item end. When units are cut from
52477+ the middle, move the freed space to the item head. Return the amount of space which got freed. Save the smallest
52478+ removed key in @smallest_removed if it is not 0, and the new first item key in @new_first_key if it is not 0.
52479+ */
52480+ int (*cut_units) (coord_t *, pos_in_node_t from, pos_in_node_t to,
52481+ struct carry_cut_data *,
52482+ reiser4_key * smallest_removed,
52483+ reiser4_key * new_first_key);
52484+
52485+ /* like cut_units, except that these units are removed from the
52486+ tree, not only from a node */
52487+ int (*kill_units) (coord_t *, pos_in_node_t from, pos_in_node_t to,
52488+ struct carry_kill_data *,
52489+ reiser4_key * smallest_removed,
52490+ reiser4_key * new_first);
52491+
52492+ /* if @key_of_coord == 1, the key of the coord is returned;
52493+ otherwise the key of the unit is returned. If @coord is not set
52494+ to a certain unit, ERR_PTR(-ENOENT) is returned */
52495+ reiser4_key *(*unit_key) (const coord_t *, reiser4_key *);
52496+ reiser4_key *(*max_unit_key) (const coord_t *, reiser4_key *);
52497+ /* estimate how much space is needed to paste @data into the item
52498+ at @coord. If @coord == 0, estimate insertion; otherwise
52499+ estimate pasting
52500+ */
52501+ int (*estimate) (const coord_t *, const reiser4_item_data *);
52502+
52503+ /* converts flow @f to item data. @coord == 0 on insert */
52504+ int (*item_data_by_flow) (const coord_t *, const flow_t *,
52505+ reiser4_item_data *);
52506+
52507+ /*void (*show) (struct seq_file *, coord_t *); */
52508+
52509+#if REISER4_DEBUG
52510+ /* used for debugging, every item should have here the most
52511+ complete possible check of the consistency of the item that
52512+ the inventor can construct */
52513+ int (*check) (const coord_t *, const char **error);
52514+#endif
52515+
52516+} balance_ops;
52517+
52518+typedef struct {
52519+ /* return the right or left child of @coord, only if it is in memory */
52520+ int (*utmost_child) (const coord_t *, sideof side, jnode ** child);
52521+
52522+ /* return whether the right or left child of @coord has a non-fake
52523+ block number. */
52524+ int (*utmost_child_real_block) (const coord_t *, sideof side,
52525+ reiser4_block_nr *);
52526+ /* relocate child at @coord to the @block */
52527+ void (*update) (const coord_t *, const reiser4_block_nr *);
52528+ /* count unformatted nodes per item for the leaf relocation policy, etc. */
52529+ int (*scan) (flush_scan * scan);
52530+ /* convert item by flush */
52531+ int (*convert) (flush_pos_t * pos);
52532+ /* backward mapping from jnode offset to a key. */
52533+ int (*key_by_offset) (struct inode *, loff_t, reiser4_key *);
52534+} flush_ops;
52535+
52536+/* operations specific to the directory item */
52537+typedef struct {
52538+ /* extract stat-data key from directory entry at @coord and place it
52539+ into @key. */
52540+ int (*extract_key) (const coord_t *, reiser4_key * key);
52541+ /* update object key in item. */
52542+ int (*update_key) (const coord_t *, const reiser4_key *, lock_handle *);
52543+ /* extract name from directory entry at @coord and return it */
52544+ char *(*extract_name) (const coord_t *, char *buf);
52545+ /* extract file type (DT_* stuff) from directory entry at @coord and
52546+ return it */
52547+ unsigned (*extract_file_type) (const coord_t *);
52548+ int (*add_entry) (struct inode * dir,
52549+ coord_t *, lock_handle *,
52550+ const struct dentry * name,
52551+ reiser4_dir_entry_desc * entry);
52552+ int (*rem_entry) (struct inode * dir, const struct qstr * name,
52553+ coord_t *, lock_handle *,
52554+ reiser4_dir_entry_desc * entry);
52555+ int (*max_name_len) (const struct inode * dir);
52556+} dir_entry_ops;
52557+
52558+/* operations specific to items regular (unix) file metadata are built of */
52559+typedef struct {
52560+ int (*write) (struct file *, const char __user *, size_t, loff_t *pos);
52561+ int (*read) (struct file *, flow_t *, hint_t *);
52562+ int (*readpage) (void *, struct page *);
52563+ int (*get_block) (const coord_t *, sector_t, sector_t *);
52564+ void (*readpages) (void *, struct address_space *,
52565+ struct list_head * pages);
52566+ /*
52567+ * return the key of the first byte that is not addressed by the item
52568+ * @coord is set to.
52569+ * For example, for extent item with the key
52570+ *
52571+ * (LOCALITY,4,OBJID,STARTING-OFFSET), and length BLK blocks,
52572+ *
52573+ * ->append_key is
52574+ *
52575+ * (LOCALITY,4,OBJID,STARTING-OFFSET + BLK * block_size)
52576+ */
52577+ reiser4_key *(*append_key) (const coord_t *, reiser4_key *);
52578+
52579+ void (*init_coord_extension) (uf_coord_t *, loff_t);
52580+} file_ops;
52581+
52582+/* operations specific to items of stat data type */
52583+typedef struct {
52584+ int (*init_inode) (struct inode * inode, char *sd, int len);
52585+ int (*save_len) (struct inode * inode);
52586+ int (*save) (struct inode * inode, char **area);
52587+} sd_ops;
52588+
52589+/* operations specific to internal item */
52590+typedef struct {
52591+ /* all tree traversal wants to know from an internal item is where
52592+ to go next. */
52593+ void (*down_link) (const coord_t * coord,
52594+ const reiser4_key * key, reiser4_block_nr * block);
52595+ /* check that given internal item contains given pointer. */
52596+ int (*has_pointer_to) (const coord_t * coord,
52597+ const reiser4_block_nr * block);
52598+} internal_item_ops;
52599+
52600+struct item_plugin {
52601+ /* generic fields */
52602+ plugin_header h;
52603+
52604+ /* methods common for all item types */
52605+ balance_ops b;
52606+ /* methods used during flush */
52607+ flush_ops f;
52608+
52609+ /* methods specific to particular type of item */
52610+ union {
52611+ dir_entry_ops dir;
52612+ file_ops file;
52613+ sd_ops sd;
52614+ internal_item_ops internal;
52615+ } s;
52616+
52617+};
52618+
52619+static inline item_id item_id_by_plugin(item_plugin * plugin)
52620+{
52621+ return plugin->h.id;
52622+}
52623+
52624+static inline char get_iplugid(item_plugin * iplug)
52625+{
52626+ assert("nikita-2838", iplug != NULL);
52627+ assert("nikita-2839", iplug->h.id < 0xff);
52628+ return (char)item_id_by_plugin(iplug);
52629+}
52630+
52631+extern unsigned long znode_times_locked(const znode * z);
52632+
52633+static inline void coord_set_iplug(coord_t * coord, item_plugin * iplug)
52634+{
52635+ assert("nikita-2837", coord != NULL);
52636+ assert("nikita-2838", iplug != NULL);
52637+ coord->iplugid = get_iplugid(iplug);
52638+ ON_DEBUG(coord->plug_v = znode_times_locked(coord->node));
52639+}
52640+
52641+static inline item_plugin *coord_iplug(const coord_t * coord)
52642+{
52643+ assert("nikita-2833", coord != NULL);
52644+ assert("nikita-2834", coord->iplugid != INVALID_PLUGID);
52645+ assert("nikita-3549", coord->plug_v == znode_times_locked(coord->node));
52646+ return (item_plugin *) plugin_by_id(REISER4_ITEM_PLUGIN_TYPE,
52647+ coord->iplugid);
52648+}
52649+
52650+extern int item_can_contain_key(const coord_t * item, const reiser4_key * key,
52651+ const reiser4_item_data *);
52652+extern int are_items_mergeable(const coord_t * i1, const coord_t * i2);
52653+extern int item_is_extent(const coord_t *);
52654+extern int item_is_tail(const coord_t *);
52655+extern int item_is_statdata(const coord_t * item);
52656+extern int item_is_ctail(const coord_t *);
52657+
52658+extern pos_in_node_t item_length_by_coord(const coord_t * coord);
52659+extern item_type_id item_type_by_coord(const coord_t * coord);
52660+extern item_id item_id_by_coord(const coord_t * coord /* coord to query */ );
52661+extern reiser4_key *item_key_by_coord(const coord_t * coord, reiser4_key * key);
52662+extern reiser4_key *max_item_key_by_coord(const coord_t *, reiser4_key *);
52663+extern reiser4_key *unit_key_by_coord(const coord_t * coord, reiser4_key * key);
52664+extern reiser4_key *max_unit_key_by_coord(const coord_t * coord,
52665+ reiser4_key * key);
52666+
52667+extern void obtain_item_plugin(const coord_t * coord);
52668+
52669+#if defined(REISER4_DEBUG)
52670+extern int znode_is_loaded(const znode * node);
52671+#endif
52672+
52673+/* return plugin of item at @coord */
52674+static inline item_plugin *item_plugin_by_coord(const coord_t *
52675+ coord /* coord to query */ )
52676+{
52677+ assert("nikita-330", coord != NULL);
52678+ assert("nikita-331", coord->node != NULL);
52679+ assert("nikita-332", znode_is_loaded(coord->node));
52680+
52681+ if (unlikely(!coord_is_iplug_set(coord)))
52682+ obtain_item_plugin(coord);
52683+ return coord_iplug(coord);
52684+}
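+
+/* Typical dispatch pattern through the plugin (a minimal sketch;
+   assumes @coord references an existing item):
+
+       item_plugin *iplug = item_plugin_by_coord(coord);
+       pos_in_node_t units = iplug->b.nr_units(coord);
+ */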
52685+
52686+/* this returns true if item is of internal type */
52687+static inline int item_is_internal(const coord_t * item)
52688+{
52689+ assert("vs-483", coord_is_existing_item(item));
52690+ return item_type_by_coord(item) == INTERNAL_ITEM_TYPE;
52691+}
52692+
52693+extern void item_body_by_coord_hard(coord_t * coord);
52694+extern void *item_body_by_coord_easy(const coord_t * coord);
52695+#if REISER4_DEBUG
52696+extern int item_body_is_valid(const coord_t * coord);
52697+#endif
52698+
52699+/* return pointer to item body */
52700+static inline void *item_body_by_coord(const coord_t *
52701+ coord /* coord to query */ )
52702+{
52703+ assert("nikita-324", coord != NULL);
52704+ assert("nikita-325", coord->node != NULL);
52705+ assert("nikita-326", znode_is_loaded(coord->node));
52706+
52707+ if (coord->offset == INVALID_OFFSET)
52708+ item_body_by_coord_hard((coord_t *) coord);
52709+ assert("nikita-3201", item_body_is_valid(coord));
52710+ assert("nikita-3550", coord->body_v == znode_times_locked(coord->node));
52711+ return item_body_by_coord_easy(coord);
52712+}
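+
+/* Note: the INVALID_OFFSET test above makes the body offset a lazy,
+   per-coord cache: the first call takes the "hard" path, which asks
+   the node plugin to locate the item body and records its offset;
+   subsequent calls just add the cached offset to the node data (the
+   "easy" path). */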
52713+
52714+/* __REISER4_ITEM_H__ */
52715+#endif
52716+/* Make Linus happy.
52717+ Local variables:
52718+ c-indentation-style: "K&R"
52719+ mode-name: "LC"
52720+ c-basic-offset: 8
52721+ tab-width: 8
52722+ fill-column: 120
52723+ scroll-step: 1
52724+ End:
52725+*/
52726Index: linux-2.6.16/fs/reiser4/plugin/item/sde.c
52727===================================================================
52728--- /dev/null
52729+++ linux-2.6.16/fs/reiser4/plugin/item/sde.c
52730@@ -0,0 +1,190 @@
52731+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
52732+
52733+/* Directory entry implementation */
52734+#include "../../forward.h"
52735+#include "../../debug.h"
52736+#include "../../dformat.h"
52737+#include "../../kassign.h"
52738+#include "../../coord.h"
52739+#include "sde.h"
52740+#include "item.h"
52741+#include "../plugin.h"
52742+#include "../../znode.h"
52743+#include "../../carry.h"
52744+#include "../../tree.h"
52745+#include "../../inode.h"
52746+
52747+#include <linux/fs.h> /* for struct inode */
52748+#include <linux/dcache.h> /* for struct dentry */
52749+#include <linux/quotaops.h>
52750+
52751+/* ->extract_key() method of simple directory item plugin. */
52752+int extract_key_de(const coord_t * coord /* coord of item */ ,
52753+ reiser4_key * key /* resulting key */ )
52754+{
52755+ directory_entry_format *dent;
52756+
52757+ assert("nikita-1458", coord != NULL);
52758+ assert("nikita-1459", key != NULL);
52759+
52760+ dent = (directory_entry_format *) item_body_by_coord(coord);
52761+ assert("nikita-1158", item_length_by_coord(coord) >= (int)sizeof *dent);
52762+ return extract_key_from_id(&dent->id, key);
52763+}
52764+
52765+int
52766+update_key_de(const coord_t * coord, const reiser4_key * key,
52767+ lock_handle * lh UNUSED_ARG)
52768+{
52769+ directory_entry_format *dent;
52770+ obj_key_id obj_id;
52771+ int result;
52772+
52773+ assert("nikita-2342", coord != NULL);
52774+ assert("nikita-2343", key != NULL);
52775+
52776+ dent = (directory_entry_format *) item_body_by_coord(coord);
52777+ result = build_obj_key_id(key, &obj_id);
52778+ if (result == 0) {
52779+ dent->id = obj_id;
52780+ znode_make_dirty(coord->node);
52781+ }
52782+ return result;
52783+}
52784+
52785+char *extract_dent_name(const coord_t * coord, directory_entry_format * dent,
52786+ char *buf)
52787+{
52788+ reiser4_key key;
52789+
52790+ unit_key_by_coord(coord, &key);
52791+ if (get_key_type(&key) != KEY_FILE_NAME_MINOR)
52792+ reiser4_print_address("oops", znode_get_block(coord->node));
52793+ if (!is_longname_key(&key)) {
52794+ if (is_dot_key(&key))
52795+ return (char *)".";
52796+ else
52797+ return extract_name_from_key(&key, buf);
52798+ } else
52799+ return (char *)dent->name;
52800+}
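+
+/* Note on name storage: short names are encoded directly in the entry
+   key (with "." as a special case), so only long names occupy the
+   name[] array of the entry body; see is_longname_key() above and the
+   matching encoding in add_entry_de() below. */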
52801+
52802+/* ->extract_name() method of simple directory item plugin. */
52803+char *extract_name_de(const coord_t * coord /* coord of item */ , char *buf)
52804+{
52805+ directory_entry_format *dent;
52806+
52807+ assert("nikita-1460", coord != NULL);
52808+
52809+ dent = (directory_entry_format *) item_body_by_coord(coord);
52810+ return extract_dent_name(coord, dent, buf);
52811+}
52812+
52813+/* ->extract_file_type() method of simple directory item plugin. */
52814+unsigned extract_file_type_de(const coord_t * coord UNUSED_ARG /* coord of
52815+ * item */ )
52816+{
52817+ assert("nikita-1764", coord != NULL);
52818+ /* we don't store file type in the directory entry yet.
52819+
52820+ But see comments at kassign.h:obj_key_id
52821+ */
52822+ return DT_UNKNOWN;
52823+}
52824+
52825+int add_entry_de(struct inode *dir /* directory of item */ ,
52826+ coord_t * coord /* coord of item */ ,
52827+ lock_handle * lh /* insertion lock handle */ ,
52828+ const struct dentry *de /* name to add */ ,
52829+ reiser4_dir_entry_desc * entry /* parameters of new directory
52830+ * entry */ )
52831+{
52832+ reiser4_item_data data;
52833+ directory_entry_format *dent;
52834+ int result;
52835+ const char *name;
52836+ int len;
52837+ int longname;
52838+
52839+ name = de->d_name.name;
52840+ len = de->d_name.len;
52841+ assert("nikita-1163", strlen(name) == len);
52842+
52843+ longname = is_longname(name, len);
52844+
52845+ data.length = sizeof *dent;
52846+ if (longname)
52847+ data.length += len + 1;
52848+ data.data = NULL;
52849+ data.user = 0;
52850+ data.iplug = item_plugin_by_id(SIMPLE_DIR_ENTRY_ID);
52851+
52852+ /* NOTE-NIKITA quota plugin */
52853+ if (DQUOT_ALLOC_SPACE_NODIRTY(dir, data.length))
52854+ return -EDQUOT;
52855+
52856+ result = insert_by_coord(coord, &data, &entry->key, lh, 0 /*flags */ );
52857+ if (result != 0)
52858+ return result;
52859+
52860+ dent = (directory_entry_format *) item_body_by_coord(coord);
52861+ build_inode_key_id(entry->obj, &dent->id);
52862+ if (longname) {
52863+ memcpy(dent->name, name, len);
52864+ put_unaligned(0, &dent->name[len]);
52865+ }
52866+ return 0;
52867+}
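+
+/*
+ * Resulting on-disk layout for a long name of length len (a sketch;
+ * for short names only @id is stored, the name living in the key
+ * itself):
+ *
+ *      [ obj_key_id id ][ len name bytes ][ '\0' ]
+ */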
52868+
52869+int rem_entry_de(struct inode *dir /* directory of item */ ,
52870+ const struct qstr *name UNUSED_ARG,
52871+ coord_t * coord /* coord of item */ ,
52872+ lock_handle * lh UNUSED_ARG /* lock handle for
52873+ * removal */ ,
52874+ reiser4_dir_entry_desc * entry UNUSED_ARG /* parameters of
52875+ * directory entry
52876+ * being removed */ )
52877+{
52878+ coord_t shadow;
52879+ int result;
52880+ int length;
52881+
52882+ length = item_length_by_coord(coord);
52883+ if (inode_get_bytes(dir) < length) {
52884+ warning("nikita-2627", "Dir is broken: %llu: %llu",
52885+ (unsigned long long)get_inode_oid(dir),
52886+ inode_get_bytes(dir));
52887+
52888+ return RETERR(-EIO);
52889+ }
52890+
52891+ /* cut_node() is supposed to take pointers to _different_
52892+ coords, because it will modify them without respect to
52893+ possible aliasing. To work around this, create temporary copy
52894+ of @coord.
52895+ */
52896+ coord_dup(&shadow, coord);
52897+ result =
52898+ kill_node_content(coord, &shadow, NULL, NULL, NULL, NULL, NULL, 0);
52899+ if (result == 0) {
52900+ /* NOTE-NIKITA quota plugin */
52901+ DQUOT_FREE_SPACE_NODIRTY(dir, length);
52902+ }
52903+ return result;
52904+}
52905+
52906+int max_name_len_de(const struct inode *dir)
52907+{
52908+ return tree_by_inode(dir)->nplug->max_item_size() -
52909+ sizeof(directory_entry_format) - 2;
52910+}
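As a worked example of this bound, assuming a hypothetical node plugin whose max_item_size() is 4072 bytes and the same hypothetical 16-byte directory_entry_format header (the trailing 2 bytes presumably cover the NUL terminator and slack):

    #include <stdio.h>

    #define MAX_ITEM_SIZE 4072   /* assumption: node plugin max_item_size() */
    #define DENT_HEADER   16     /* assumption: sizeof(directory_entry_format) */

    int main(void)
    {
            /* a whole entry must fit in one item */
            printf("max name len = %d\n", MAX_ITEM_SIZE - DENT_HEADER - 2);
            return 0;
    }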
52911+
52912+/* Make Linus happy.
52913+ Local variables:
52914+ c-indentation-style: "K&R"
52915+ mode-name: "LC"
52916+ c-basic-offset: 8
52917+ tab-width: 8
52918+ fill-column: 120
52919+ End:
52920+*/
52921Index: linux-2.6.16/fs/reiser4/plugin/item/sde.h
52922===================================================================
52923--- /dev/null
52924+++ linux-2.6.16/fs/reiser4/plugin/item/sde.h
52925@@ -0,0 +1,66 @@
52926+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
52927+
52928+/* Directory entry. */
52929+
52930+#if !defined( __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ )
52931+#define __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__
52932+
52933+#include "../../forward.h"
52934+#include "../../dformat.h"
52935+#include "../../kassign.h"
52936+#include "../../key.h"
52937+
52938+#include <linux/fs.h>
52939+#include <linux/dcache.h> /* for struct dentry */
52940+
52941+typedef struct directory_entry_format {
52942+ /* key of object stat-data. It's not necessary to store whole
52943+ key here, because it's always key of stat-data, so minor
52944+ packing locality and offset can be omitted here. But this
52945+ relies on particular key allocation scheme for stat-data, so,
52946+ for extensibility sake, whole key can be stored here.
52947+
52948+ We store key as array of bytes, because we don't want 8-byte
52949+ alignment of dir entries.
52950+ */
52951+ obj_key_id id;
52952+ /* file name. Null terminated string. */
52953+ d8 name[0];
52954+} directory_entry_format;
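A rough sketch of the resulting on-disk layout (id holds the abbreviated stat-data key as an unaligned byte array; name[] is populated only for long names, since short names live in the entry key itself):

    offset 0                          sizeof(obj_key_id)
    +---------------------------------+------------------------+
    | id: abbreviated stat-data key   | name[]: NUL-terminated |
    |     (byte array, unaligned)     |         long name      |
    +---------------------------------+------------------------+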
52955+
52956+void print_de(const char *prefix, coord_t * coord);
52957+int extract_key_de(const coord_t * coord, reiser4_key * key);
52958+int update_key_de(const coord_t * coord, const reiser4_key * key,
52959+ lock_handle * lh);
52960+char *extract_name_de(const coord_t * coord, char *buf);
52961+unsigned extract_file_type_de(const coord_t * coord);
52962+int add_entry_de(struct inode *dir, coord_t * coord,
52963+ lock_handle * lh, const struct dentry *name,
52964+ reiser4_dir_entry_desc * entry);
52965+int rem_entry_de(struct inode *dir, const struct qstr *name, coord_t * coord,
52966+ lock_handle * lh, reiser4_dir_entry_desc * entry);
52967+int max_name_len_de(const struct inode *dir);
52968+
52969+int de_rem_and_shrink(struct inode *dir, coord_t * coord, int length);
52970+
52971+char *extract_dent_name(const coord_t * coord,
52972+ directory_entry_format * dent, char *buf);
52973+
52974+#if REISER4_LARGE_KEY
52975+#define DE_NAME_BUF_LEN (24)
52976+#else
52977+#define DE_NAME_BUF_LEN (16)
52978+#endif
52979+
52980+/* __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ */
52981+#endif
52982+
52983+/* Make Linus happy.
52984+ Local variables:
52985+ c-indentation-style: "K&R"
52986+ mode-name: "LC"
52987+ c-basic-offset: 8
52988+ tab-width: 8
52989+ fill-column: 120
52990+ End:
52991+*/
52992Index: linux-2.6.16/fs/reiser4/plugin/item/static_stat.c
52993===================================================================
52994--- /dev/null
52995+++ linux-2.6.16/fs/reiser4/plugin/item/static_stat.c
52996@@ -0,0 +1,1040 @@
52997+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
52998+
52999+/* stat data manipulation. */
53000+
53001+#include "../../forward.h"
53002+#include "../../super.h"
53003+#include "../../vfs_ops.h"
53004+#include "../../inode.h"
53005+#include "../../debug.h"
53006+#include "../../dformat.h"
53007+#include "../object.h"
53008+#include "../plugin.h"
53009+#include "../plugin_header.h"
53010+#include "static_stat.h"
53011+#include "item.h"
53012+
53013+#include <linux/types.h>
53014+#include <linux/fs.h>
53015+
53016+/* see static_stat.h for explanation */
53017+
53018+/* helper function used while we are dumping/loading inode/plugin state
53019+ to/from the stat-data. */
53020+
53021+static void move_on(int *length /* space remaining in stat-data */ ,
53022+ char **area /* current coord in stat data */ ,
53023+ int size_of /* how many bytes to move forward */ )
53024+{
53025+ assert("nikita-615", length != NULL);
53026+ assert("nikita-616", area != NULL);
53027+
53028+ *length -= size_of;
53029+ *area += size_of;
53030+
53031+ assert("nikita-617", *length >= 0);
53032+}
53033+
53034+/* helper function used while loading inode/plugin state from stat-data.
53035+ Complain if there is less space in stat-data than was expected.
53036+ Can only happen on disk corruption. */
53037+static int not_enough_space(struct inode *inode /* object being processed */ ,
53038+ const char *where /* error message */ )
53039+{
53040+ assert("nikita-618", inode != NULL);
53041+
53042+ warning("nikita-619", "Not enough space in %llu while loading %s",
53043+ (unsigned long long)get_inode_oid(inode), where);
53044+
53045+ return RETERR(-EINVAL);
53046+}
53047+
53048+/* helper function used while loading inode/plugin state from
53049+ stat-data. Call it if invalid plugin id was found. */
53050+static int unknown_plugin(reiser4_plugin_id id /* invalid id */ ,
53051+ struct inode *inode /* object being processed */ )
53052+{
53053+ warning("nikita-620", "Unknown plugin %i in %llu",
53054+ id, (unsigned long long)get_inode_oid(inode));
53055+
53056+ return RETERR(-EINVAL);
53057+}
53058+
53059+/* this is installed as ->init_inode() method of
53060+ item_plugins[ STATIC_STAT_DATA_IT ] (fs/reiser4/plugin/item/item.c).
53061+ Copies data from on-disk stat-data format into inode.
53062+ Handles stat-data extensions. */
53063+/* was sd_load */
53064+int init_inode_static_sd(struct inode *inode /* object being processed */ ,
53065+ char *sd /* stat-data body */ ,
53066+ int len /* length of stat-data */ )
53067+{
53068+ int result;
53069+ int bit;
53070+ int chunk;
53071+ __u16 mask;
53072+ __u64 bigmask;
53073+ reiser4_stat_data_base *sd_base;
53074+ reiser4_inode *state;
53075+
53076+ assert("nikita-625", inode != NULL);
53077+ assert("nikita-626", sd != NULL);
53078+
53079+ result = 0;
53080+ sd_base = (reiser4_stat_data_base *) sd;
53081+ state = reiser4_inode_data(inode);
53082+ mask = le16_to_cpu(get_unaligned(&sd_base->extmask));
53083+ bigmask = mask;
53084+ inode_set_flag(inode, REISER4_SDLEN_KNOWN);
53085+
53086+ move_on(&len, &sd, sizeof *sd_base);
53087+ for (bit = 0, chunk = 0;
53088+ mask != 0 || bit <= LAST_IMPORTANT_SD_EXTENSION;
53089+ ++bit, mask >>= 1) {
53090+ if (((bit + 1) % 16) != 0) {
53091+ /* handle extension */
53092+ sd_ext_plugin *sdplug;
53093+
53094+ if (bit >= LAST_SD_EXTENSION) {
53095+ warning("vpf-1904",
53096+ "No such extension %i in inode %llu",
53097+ bit,
53098+ (unsigned long long)
53099+ get_inode_oid(inode));
53100+
53101+ result = RETERR(-EINVAL);
53102+ break;
53103+ }
53104+
53105+ sdplug = sd_ext_plugin_by_id(bit);
53106+ if (sdplug == NULL) {
53107+ warning("nikita-627",
53108+ "No such extension %i in inode %llu",
53109+ bit,
53110+ (unsigned long long)
53111+ get_inode_oid(inode));
53112+
53113+ result = RETERR(-EINVAL);
53114+ break;
53115+ }
53116+ if (mask & 1) {
53117+ assert("nikita-628", sdplug->present);
53118+ /* alignment is not supported in node layout
53119+ plugin yet.
53120+ result = align( inode, &len, &sd,
53121+ sdplug -> alignment );
53122+ if( result != 0 )
53123+ return result; */
53124+ result = sdplug->present(inode, &sd, &len);
53125+ } else if (sdplug->absent != NULL)
53126+ result = sdplug->absent(inode);
53127+ if (result)
53128+ break;
53129+ /* else, we are looking at the last bit in 16-bit
53130+ portion of bitmask */
53131+ } else if (mask & 1) {
53132+ /* next portion of bitmask */
53133+ if (len < (int)sizeof(d16)) {
53134+ warning("nikita-629",
53135+ "No space for bitmap in inode %llu",
53136+ (unsigned long long)
53137+ get_inode_oid(inode));
53138+
53139+ result = RETERR(-EINVAL);
53140+ break;
53141+ }
53142+ mask = le16_to_cpu(get_unaligned((d16 *)sd));
53143+ bigmask <<= 16;
53144+ bigmask |= mask;
53145+ move_on(&len, &sd, sizeof(d16));
53146+ ++chunk;
53147+ if (chunk == 3) {
53148+ if (!(mask & 0x8000)) {
53149+ /* clear last bit */
53150+ mask &= ~0x8000;
53151+ continue;
53152+ }
53153+ /* too much */
53154+ warning("nikita-630",
53155+ "Too many extensions in %llu",
53156+ (unsigned long long)
53157+ get_inode_oid(inode));
53158+
53159+ result = RETERR(-EINVAL);
53160+ break;
53161+ }
53162+ } else
53163+ /* bitmask exhausted */
53164+ break;
53165+ }
53166+ state->extmask = bigmask;
53167+ /* common initialisations */
53168+ inode->i_blksize = get_super_private(inode->i_sb)->optimal_io_size;
53169+ if (len - (bit / 16 * sizeof(d16)) > 0) {
53170+ /* alignment in save_len_static_sd() is taken into account
53171+ -edward */
53172+ warning("nikita-631", "unused space in inode %llu",
53173+ (unsigned long long)get_inode_oid(inode));
53174+ }
53175+
53176+ return result;
53177+}
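The chunked mask walk above is easy to get lost in, so here is a simplified userspace model of just the mask-assembly part: bits 0..14 of each on-disk 16-bit word flag extensions, bit 15 says that another word follows, and the words accumulate into the 64-bit extmask exactly as bigmask is built above. The per-bit ->present()/->absent() dispatch is elided.

    #include <stdint.h>
    #include <stdio.h>

    static uint64_t decode_extmask(const uint16_t *words, int nwords)
    {
            uint64_t big = 0;
            int i;

            for (i = 0; i < nwords; i++) {
                    big = (big << 16) | words[i];
                    if (!(words[i] & 0x8000))  /* continuation bit clear */
                            break;
            }
            return big;
    }

    int main(void)
    {
            /* word 0 flags extensions 0 and 1 and links to word 1 */
            uint16_t on_disk[2] = { 0x8003, 0x0001 };

            printf("extmask = %#llx\n",
                   (unsigned long long)decode_extmask(on_disk, 2));
            return 0;
    }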
53178+
53179+/* estimates size of stat-data required to store inode.
53180+ Installed as ->save_len() method of
53181+ item_plugins[ STATIC_STAT_DATA_IT ] (fs/reiser4/plugin/item/item.c). */
53182+/* was sd_len */
53183+int save_len_static_sd(struct inode *inode /* object being processed */ )
53184+{
53185+ unsigned int result;
53186+ __u64 mask;
53187+ int bit;
53188+
53189+ assert("nikita-632", inode != NULL);
53190+
53191+ result = sizeof(reiser4_stat_data_base);
53192+ mask = reiser4_inode_data(inode)->extmask;
53193+ for (bit = 0; mask != 0; ++bit, mask >>= 1) {
53194+ if (mask & 1) {
53195+ sd_ext_plugin *sdplug;
53196+
53197+ sdplug = sd_ext_plugin_by_id(bit);
53198+ assert("nikita-633", sdplug != NULL);
53199+		/* no alignment support
53200+ result +=
53201+ round_up( result, sdplug -> alignment ) - result; */
53202+ result += sdplug->save_len(inode);
53203+ }
53204+ }
53205+ result += bit / 16 * sizeof(d16);
53206+ return result;
53207+}
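The closing "result += bit / 16 * sizeof(d16)" accounts for the continuation words: every full 16 bits walked cost one extra d16 in the stat-data. A compilable sketch of just that overhead:

    #include <stdint.h>
    #include <stdio.h>

    static int extmask_overhead(uint64_t mask)
    {
            int bit;

            for (bit = 0; mask != 0; ++bit, mask >>= 1)
                    ;                    /* count bits walked, as above */
            return bit / 16 * 2;         /* one 2-byte d16 per full chunk */
    }

    int main(void)
    {
            printf("%d\n", extmask_overhead(0x3));      /* 0: base mask only */
            printf("%d\n", extmask_overhead(0x20000));  /* 2: one extra d16 */
            return 0;
    }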
53208+
53209+/* saves inode into stat-data.
53210+ Installed as ->save() method of
53211+ item_plugins[ STATIC_STAT_DATA_IT ] (fs/reiser4/plugin/item/item.c). */
53212+/* was sd_save */
53213+int save_static_sd(struct inode *inode /* object being processed */ ,
53214+ char **area /* where to save stat-data */ )
53215+{
53216+ int result;
53217+ __u64 emask;
53218+ int bit;
53219+ unsigned int len;
53220+ reiser4_stat_data_base *sd_base;
53221+
53222+ assert("nikita-634", inode != NULL);
53223+ assert("nikita-635", area != NULL);
53224+
53225+ result = 0;
53226+ emask = reiser4_inode_data(inode)->extmask;
53227+ sd_base = (reiser4_stat_data_base *) * area;
53228+ put_unaligned(cpu_to_le16((__u16)(emask & 0xffff)), &sd_base->extmask);
53229+ /*cputod16((unsigned)(emask & 0xffff), &sd_base->extmask);*/
53230+
53231+ *area += sizeof *sd_base;
53232+ len = 0xffffffffu;
53233+ for (bit = 0; emask != 0; ++bit, emask >>= 1) {
53234+ if (emask & 1) {
53235+ if ((bit + 1) % 16 != 0) {
53236+ sd_ext_plugin *sdplug;
53237+ sdplug = sd_ext_plugin_by_id(bit);
53238+ assert("nikita-636", sdplug != NULL);
53239+ /* no alignment support yet
53240+ align( inode, &len, area,
53241+ sdplug -> alignment ); */
53242+ result = sdplug->save(inode, area);
53243+ if (result)
53244+ break;
53245+ } else {
53246+ put_unaligned(cpu_to_le16((__u16)(emask & 0xffff)),
53247+ (d16 *)(*area));
53248+ /*cputod16((unsigned)(emask & 0xffff),
53249+ (d16 *) * area);*/
53250+ *area += sizeof(d16);
53251+ }
53252+ }
53253+ }
53254+ return result;
53255+}
53256+
53257+/* stat-data extension handling functions. */
53258+
53259+static int present_lw_sd(struct inode *inode /* object being processed */ ,
53260+ char **area /* position in stat-data */ ,
53261+ int *len /* remaining length */ )
53262+{
53263+ if (*len >= (int)sizeof(reiser4_light_weight_stat)) {
53264+ reiser4_light_weight_stat *sd_lw;
53265+
53266+ sd_lw = (reiser4_light_weight_stat *) * area;
53267+
53268+ inode->i_mode = le16_to_cpu(get_unaligned(&sd_lw->mode));
53269+ inode->i_nlink = le32_to_cpu(get_unaligned(&sd_lw->nlink));
53270+ inode->i_size = le64_to_cpu(get_unaligned(&sd_lw->size));
53271+ if ((inode->i_mode & S_IFMT) == (S_IFREG | S_IFIFO)) {
53272+ inode->i_mode &= ~S_IFIFO;
53273+ warning("", "partially converted file is encountered");
53274+ inode_set_flag(inode, REISER4_PART_MIXED);
53275+ }
53276+ move_on(len, area, sizeof *sd_lw);
53277+ return 0;
53278+ } else
53279+ return not_enough_space(inode, "lw sd");
53280+}
53281+
53282+static int save_len_lw_sd(struct inode *inode UNUSED_ARG /* object being
53283+ * processed */ )
53284+{
53285+ return sizeof(reiser4_light_weight_stat);
53286+}
53287+
53288+static int save_lw_sd(struct inode *inode /* object being processed */ ,
53289+ char **area /* position in stat-data */ )
53290+{
53291+ reiser4_light_weight_stat *sd;
53292+ mode_t delta;
53293+
53294+ assert("nikita-2705", inode != NULL);
53295+ assert("nikita-2706", area != NULL);
53296+ assert("nikita-2707", *area != NULL);
53297+
53298+ sd = (reiser4_light_weight_stat *) * area;
53299+
53300+ delta = (inode_get_flag(inode, REISER4_PART_MIXED) ? S_IFIFO : 0);
53301+ put_unaligned(cpu_to_le16(inode->i_mode | delta), &sd->mode);
53302+ put_unaligned(cpu_to_le32(inode->i_nlink), &sd->nlink);
53303+ put_unaligned(cpu_to_le64((__u64) inode->i_size), &sd->size);
53304+ *area += sizeof *sd;
53305+ return 0;
53306+}
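The S_IFIFO trick shared by present_lw_sd() and save_lw_sd() deserves a worked example: a regular file caught in the middle of a tail/extent conversion is saved with S_IFIFO OR-ed into its mode, and on load the extra bit is stripped again and REISER4_PART_MIXED is set instead. A standalone check of the bit arithmetic:

    #include <sys/stat.h>
    #include <assert.h>

    int main(void)
    {
            unsigned mode = S_IFREG | 0644;
            unsigned stored = mode | S_IFIFO;        /* as written to disk */

            /* the load-side test from present_lw_sd() */
            assert((stored & S_IFMT) == (S_IFREG | S_IFIFO));

            unsigned loaded = stored & ~S_IFIFO;     /* as restored on load */
            assert(loaded == mode);
            return 0;
    }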
53307+
53308+static int present_unix_sd(struct inode *inode /* object being processed */ ,
53309+ char **area /* position in stat-data */ ,
53310+ int *len /* remaining length */ )
53311+{
53312+ assert("nikita-637", inode != NULL);
53313+ assert("nikita-638", area != NULL);
53314+ assert("nikita-639", *area != NULL);
53315+ assert("nikita-640", len != NULL);
53316+ assert("nikita-641", *len > 0);
53317+
53318+ if (*len >= (int)sizeof(reiser4_unix_stat)) {
53319+ reiser4_unix_stat *sd;
53320+
53321+ sd = (reiser4_unix_stat *) * area;
53322+
53323+ inode->i_uid = le32_to_cpu(get_unaligned(&sd->uid));
53324+ inode->i_gid = le32_to_cpu(get_unaligned(&sd->gid));
53325+ inode->i_atime.tv_sec = le32_to_cpu(get_unaligned(&sd->atime));
53326+ inode->i_mtime.tv_sec = le32_to_cpu(get_unaligned(&sd->mtime));
53327+ inode->i_ctime.tv_sec = le32_to_cpu(get_unaligned(&sd->ctime));
53328+ if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode))
53329+ inode->i_rdev = le64_to_cpu(get_unaligned(&sd->u.rdev));
53330+ else
53331+ inode_set_bytes(inode, (loff_t) le64_to_cpu(get_unaligned(&sd->u.bytes)));
53332+ move_on(len, area, sizeof *sd);
53333+ return 0;
53334+ } else
53335+ return not_enough_space(inode, "unix sd");
53336+}
53337+
53338+static int absent_unix_sd(struct inode *inode /* object being processed */ )
53339+{
53340+ inode->i_uid = get_super_private(inode->i_sb)->default_uid;
53341+ inode->i_gid = get_super_private(inode->i_sb)->default_gid;
53342+ inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
53343+ inode_set_bytes(inode, inode->i_size);
53344+ /* mark inode as lightweight, so that caller (reiser4_lookup) will
53345+ complete initialisation by copying [ug]id from a parent. */
53346+ inode_set_flag(inode, REISER4_LIGHT_WEIGHT);
53347+ return 0;
53348+}
53349+
53350+/* Audited by: green(2002.06.14) */
53351+static int save_len_unix_sd(struct inode *inode UNUSED_ARG /* object being
53352+ * processed */ )
53353+{
53354+ return sizeof(reiser4_unix_stat);
53355+}
53356+
53357+static int save_unix_sd(struct inode *inode /* object being processed */ ,
53358+ char **area /* position in stat-data */ )
53359+{
53360+ reiser4_unix_stat *sd;
53361+
53362+ assert("nikita-642", inode != NULL);
53363+ assert("nikita-643", area != NULL);
53364+ assert("nikita-644", *area != NULL);
53365+
53366+ sd = (reiser4_unix_stat *) * area;
53367+ put_unaligned(cpu_to_le32(inode->i_uid), &sd->uid);
53368+ put_unaligned(cpu_to_le32(inode->i_gid), &sd->gid);
53369+ put_unaligned(cpu_to_le32((__u32) inode->i_atime.tv_sec), &sd->atime);
53370+ put_unaligned(cpu_to_le32((__u32) inode->i_ctime.tv_sec), &sd->ctime);
53371+ put_unaligned(cpu_to_le32((__u32) inode->i_mtime.tv_sec), &sd->mtime);
53372+ if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode))
53373+ put_unaligned(cpu_to_le64(inode->i_rdev), &sd->u.rdev);
53374+ else
53375+ put_unaligned(cpu_to_le64((__u64) inode_get_bytes(inode)), &sd->u.bytes);
53376+ *area += sizeof *sd;
53377+ return 0;
53378+}
53379+
53380+static int
53381+present_large_times_sd(struct inode *inode /* object being processed */ ,
53382+ char **area /* position in stat-data */ ,
53383+ int *len /* remaining length */ )
53384+{
53385+ if (*len >= (int)sizeof(reiser4_large_times_stat)) {
53386+ reiser4_large_times_stat *sd_lt;
53387+
53388+ sd_lt = (reiser4_large_times_stat *) * area;
53389+
53390+ inode->i_atime.tv_nsec = le32_to_cpu(get_unaligned(&sd_lt->atime));
53391+ inode->i_mtime.tv_nsec = le32_to_cpu(get_unaligned(&sd_lt->mtime));
53392+ inode->i_ctime.tv_nsec = le32_to_cpu(get_unaligned(&sd_lt->ctime));
53393+
53394+ move_on(len, area, sizeof *sd_lt);
53395+ return 0;
53396+ } else
53397+ return not_enough_space(inode, "large times sd");
53398+}
53399+
53400+static int
53401+save_len_large_times_sd(struct inode *inode UNUSED_ARG
53402+ /* object being processed */ )
53403+{
53404+ return sizeof(reiser4_large_times_stat);
53405+}
53406+
53407+static int
53408+save_large_times_sd(struct inode *inode /* object being processed */ ,
53409+ char **area /* position in stat-data */ )
53410+{
53411+ reiser4_large_times_stat *sd;
53412+
53413+ assert("nikita-2817", inode != NULL);
53414+ assert("nikita-2818", area != NULL);
53415+ assert("nikita-2819", *area != NULL);
53416+
53417+ sd = (reiser4_large_times_stat *) * area;
53418+
53419+ put_unaligned(cpu_to_le32((__u32) inode->i_atime.tv_nsec), &sd->atime);
53420+ put_unaligned(cpu_to_le32((__u32) inode->i_ctime.tv_nsec), &sd->ctime);
53421+ put_unaligned(cpu_to_le32((__u32) inode->i_mtime.tv_nsec), &sd->mtime);
53422+
53423+ *area += sizeof *sd;
53424+ return 0;
53425+}
53426+
53427+/* symlink stat data extension */
53428+
53429+/* allocate memory for symlink target and attach it to inode->u.generic_ip */
53430+static int
53431+symlink_target_to_inode(struct inode *inode, const char *target, int len)
53432+{
53433+ assert("vs-845", inode->u.generic_ip == NULL);
53434+ assert("vs-846", !inode_get_flag(inode, REISER4_GENERIC_PTR_USED));
53435+
53436+ /* FIXME-VS: this is prone to deadlock. Not more than other similar
53437+ places, though */
53438+ inode->u.generic_ip = kmalloc((size_t) len + 1, get_gfp_mask());
53439+ if (!inode->u.generic_ip)
53440+ return RETERR(-ENOMEM);
53441+
53442+ memcpy((char *)(inode->u.generic_ip), target, (size_t) len);
53443+ ((char *)(inode->u.generic_ip))[len] = 0;
53444+ inode_set_flag(inode, REISER4_GENERIC_PTR_USED);
53445+ return 0;
53446+}
53447+
53448+/* this is called on read_inode. There is nothing to do actually, but some
53449+ sanity checks */
53450+static int present_symlink_sd(struct inode *inode, char **area, int *len)
53451+{
53452+ int result;
53453+ int length;
53454+ reiser4_symlink_stat *sd;
53455+
53456+ length = (int)inode->i_size;
53457+ /*
53458+ * *len is number of bytes in stat data item from *area to the end of
53459+	 * item. It must be at least the symlink length + 1 for the trailing 0
53460+ */
53461+ if (length > *len)
53462+ return not_enough_space(inode, "symlink");
53463+
53464+ if (*(*area + length) != 0) {
53465+ warning("vs-840", "Symlink is not zero terminated");
53466+ return RETERR(-EIO);
53467+ }
53468+
53469+ sd = (reiser4_symlink_stat *) * area;
53470+ result = symlink_target_to_inode(inode, sd->body, length);
53471+
53472+ move_on(len, area, length + 1);
53473+ return result;
53474+}
53475+
53476+static int save_len_symlink_sd(struct inode *inode)
53477+{
53478+ return inode->i_size + 1;
53479+}
53480+
53481+/* this is called on create and update stat data. Do nothing on update but
53482+ update @area */
53483+static int save_symlink_sd(struct inode *inode, char **area)
53484+{
53485+ int result;
53486+ int length;
53487+ reiser4_symlink_stat *sd;
53488+
53489+ length = (int)inode->i_size;
53490+ /* inode->i_size must be set already */
53491+ assert("vs-841", length);
53492+
53493+ result = 0;
53494+ sd = (reiser4_symlink_stat *) * area;
53495+ if (!inode_get_flag(inode, REISER4_GENERIC_PTR_USED)) {
53496+ const char *target;
53497+
53498+ target = (const char *)(inode->u.generic_ip);
53499+ inode->u.generic_ip = NULL;
53500+
53501+ result = symlink_target_to_inode(inode, target, length);
53502+
53503+ /* copy symlink to stat data */
53504+ memcpy(sd->body, target, (size_t) length);
53505+ (*area)[length] = 0;
53506+ } else {
53507+ /* there is nothing to do in update but move area */
53508+ assert("vs-844",
53509+ !memcmp(inode->u.generic_ip, sd->body,
53510+ (size_t) length + 1));
53511+ }
53512+
53513+ *area += (length + 1);
53514+ return result;
53515+}
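Both symlink paths rely on one invariant: the target is stored inline in the stat-data body, NUL-terminated, which is why save_len_symlink_sd() charges i_size + 1 bytes. A trivial illustration:

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
            const char *target = "/usr/local/bin";   /* link target */
            size_t i_size = strlen(target);          /* inode->i_size */

            /* body bytes plus the terminating NUL appended on save */
            printf("stat-data bytes: %zu\n", i_size + 1);
            return 0;
    }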
53516+
53517+static int present_flags_sd(struct inode *inode /* object being processed */ ,
53518+ char **area /* position in stat-data */ ,
53519+ int *len /* remaining length */ )
53520+{
53521+ assert("nikita-645", inode != NULL);
53522+ assert("nikita-646", area != NULL);
53523+ assert("nikita-647", *area != NULL);
53524+ assert("nikita-648", len != NULL);
53525+ assert("nikita-649", *len > 0);
53526+
53527+ if (*len >= (int)sizeof(reiser4_flags_stat)) {
53528+ reiser4_flags_stat *sd;
53529+
53530+ sd = (reiser4_flags_stat *) * area;
53531+ inode->i_flags = le32_to_cpu(get_unaligned(&sd->flags));
53532+ move_on(len, area, sizeof *sd);
53533+ return 0;
53534+ } else
53535+ return not_enough_space(inode, "generation and attrs");
53536+}
53537+
53538+/* Audited by: green(2002.06.14) */
53539+static int save_len_flags_sd(struct inode *inode UNUSED_ARG /* object being
53540+ * processed */ )
53541+{
53542+ return sizeof(reiser4_flags_stat);
53543+}
53544+
53545+static int save_flags_sd(struct inode *inode /* object being processed */ ,
53546+ char **area /* position in stat-data */ )
53547+{
53548+ reiser4_flags_stat *sd;
53549+
53550+ assert("nikita-650", inode != NULL);
53551+ assert("nikita-651", area != NULL);
53552+ assert("nikita-652", *area != NULL);
53553+
53554+ sd = (reiser4_flags_stat *) * area;
53555+ put_unaligned(cpu_to_le32(inode->i_flags), &sd->flags);
53556+ *area += sizeof *sd;
53557+ return 0;
53558+}
53559+
53560+static int absent_plugin_sd(struct inode *inode);
53561+static int present_plugin_sd(struct inode *inode /* object being processed */ ,
53562+ char **area /* position in stat-data */ ,
53563+ int *len /* remaining length */ )
53564+{
53565+ reiser4_plugin_stat *sd;
53566+ reiser4_plugin *plugin;
53567+ int i;
53568+ __u16 mask;
53569+ int result;
53570+ int num_of_plugins;
53571+
53572+ assert("nikita-653", inode != NULL);
53573+ assert("nikita-654", area != NULL);
53574+ assert("nikita-655", *area != NULL);
53575+ assert("nikita-656", len != NULL);
53576+ assert("nikita-657", *len > 0);
53577+
53578+ if (*len < (int)sizeof(reiser4_plugin_stat))
53579+ return not_enough_space(inode, "plugin");
53580+
53581+ sd = (reiser4_plugin_stat *) * area;
53582+
53583+ mask = 0;
53584+ num_of_plugins = le16_to_cpu(get_unaligned(&sd->plugins_no));
53585+ move_on(len, area, sizeof *sd);
53586+ result = 0;
53587+ for (i = 0; i < num_of_plugins; ++i) {
53588+ reiser4_plugin_slot *slot;
53589+ reiser4_plugin_type type;
53590+ pset_member memb;
53591+
53592+ slot = (reiser4_plugin_slot *) * area;
53593+ if (*len < (int)sizeof *slot)
53594+ return not_enough_space(inode, "additional plugin");
53595+
53596+ memb = le16_to_cpu(get_unaligned(&slot->pset_memb));
53597+ type = pset_member_to_type_unsafe(memb);
53598+ if (type == REISER4_PLUGIN_TYPES) {
53599+ warning("nikita-3502",
53600+ "wrong pset member (%i) for %llu", memb,
53601+ (unsigned long long)get_inode_oid(inode));
53602+ return RETERR(-EINVAL);
53603+ }
53604+ plugin = plugin_by_disk_id(tree_by_inode(inode),
53605+ type, &slot->id);
53606+ if (plugin == NULL)
53607+ return unknown_plugin(le16_to_cpu(get_unaligned(&slot->id)), inode);
53608+
53609+ /* plugin is loaded into inode, mark this into inode's
53610+ bitmask of loaded non-standard plugins */
53611+ if (!(mask & (1 << memb))) {
53612+ mask |= (1 << memb);
53613+ } else {
53614+ warning("nikita-658", "duplicate plugin for %llu",
53615+ (unsigned long long)get_inode_oid(inode));
53616+ return RETERR(-EINVAL);
53617+ }
53618+ move_on(len, area, sizeof *slot);
53619+ /* load plugin data, if any */
53620+ if (plugin->h.pops != NULL && plugin->h.pops->load) {
53621+ result = plugin->h.pops->load(inode, plugin, area, len);
53622+ if (result != 0)
53623+ return result;
53624+ } else
53625+ result = grab_plugin_from(inode, memb, plugin);
53626+ }
53627+ /* if object plugin wasn't loaded from stat-data, guess it by
53628+ mode bits */
53629+ plugin = file_plugin_to_plugin(inode_file_plugin(inode));
53630+ if (plugin == NULL)
53631+ result = absent_plugin_sd(inode);
53632+
53633+ reiser4_inode_data(inode)->plugin_mask = mask;
53634+ return result;
53635+}
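One detail of the loop above that rewards a second look is the duplicate check: each pset member read from disk sets one bit in a 16-bit mask, and a repeated member is rejected as corruption. A minimal model with made-up member numbers:

    #include <assert.h>

    int main(void)
    {
            unsigned short mask = 0;
            int members[3] = { 0, 2, 2 };   /* third slot repeats member 2 */
            int i, bad = 0;

            for (i = 0; i < 3; i++) {
                    if (mask & (1 << members[i])) {
                            bad = 1;        /* would be RETERR(-EINVAL) */
                            break;
                    }
                    mask |= 1 << members[i];
            }
            assert(bad);
            return 0;
    }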
53636+
53637+/* Determine object plugin for @inode based on i_mode.
53638+
53639+ Many objects in reiser4 file system are controlled by standard object
53640+ plugins that emulate traditional unix objects: unix file, directory, symlink, fifo, and so on.
53641+
53642+ For such files we don't explicitly store plugin id in object stat
53643+ data. Rather required plugin is guessed from mode bits, where file "type"
53644+ is encoded (see stat(2)).
53645+*/
53646+static int
53647+guess_plugin_by_mode(struct inode *inode /* object to guess plugins for */ )
53648+{
53649+ int fplug_id;
53650+ int dplug_id;
53651+ reiser4_inode *info;
53652+
53653+ assert("nikita-736", inode != NULL);
53654+
53655+ dplug_id = fplug_id = -1;
53656+
53657+ switch (inode->i_mode & S_IFMT) {
53658+ case S_IFSOCK:
53659+ case S_IFBLK:
53660+ case S_IFCHR:
53661+ case S_IFIFO:
53662+ fplug_id = SPECIAL_FILE_PLUGIN_ID;
53663+ break;
53664+ case S_IFLNK:
53665+ fplug_id = SYMLINK_FILE_PLUGIN_ID;
53666+ break;
53667+ case S_IFDIR:
53668+ fplug_id = DIRECTORY_FILE_PLUGIN_ID;
53669+ dplug_id = HASHED_DIR_PLUGIN_ID;
53670+ break;
53671+ default:
53672+ warning("nikita-737", "wrong file mode: %o", inode->i_mode);
53673+ return RETERR(-EIO);
53674+ case S_IFREG:
53675+ fplug_id = UNIX_FILE_PLUGIN_ID;
53676+ break;
53677+ }
53678+ info = reiser4_inode_data(inode);
53679+ plugin_set_file(&info->pset,
53680+ (fplug_id >= 0) ? file_plugin_by_id(fplug_id) : NULL);
53681+ plugin_set_dir(&info->pset,
53682+ (dplug_id >= 0) ? dir_plugin_by_id(dplug_id) : NULL);
53683+ return 0;
53684+}
53685+
53686+/* Audited by: green(2002.06.14) */
53687+static int absent_plugin_sd(struct inode *inode /* object being processed */ )
53688+{
53689+ int result;
53690+
53691+ assert("nikita-659", inode != NULL);
53692+
53693+ result = guess_plugin_by_mode(inode);
53694+ /* if mode was wrong, guess_plugin_by_mode() returns "regular file",
53695+ but setup_inode_ops() will call make_bad_inode().
53696+	   Another, more logical but a bit more complex solution is to add
53697+ "bad-file plugin". */
53698+ /* FIXME-VS: activate was called here */
53699+ return result;
53700+}
53701+
53702+/* helper function for plugin_sd_save_len(): calculate how much space
53703+ required to save state of given plugin */
53704+/* Audited by: green(2002.06.14) */
53705+static int len_for(reiser4_plugin * plugin /* plugin to save */ ,
53706+ struct inode *inode /* object being processed */ ,
53707+ pset_member memb, int len)
53708+{
53709+ reiser4_inode *info;
53710+ assert("nikita-661", inode != NULL);
53711+
53712+ info = reiser4_inode_data(inode);
53713+ if (plugin != NULL && (info->plugin_mask & (1 << memb))) {
53714+ len += sizeof(reiser4_plugin_slot);
53715+ if (plugin->h.pops && plugin->h.pops->save_len != NULL) {
53716+ /* non-standard plugin, call method */
53717+ /* commented as it is incompatible with alignment
53718+ * policy in save_plug() -edward */
53719+ /* len = round_up(len, plugin->h.pops->alignment); */
53720+ len += plugin->h.pops->save_len(inode, plugin);
53721+ }
53722+ }
53723+ return len;
53724+}
53725+
53726+/* calculate how much space is required to save state of all plugins,
53727+ associated with inode */
53728+static int save_len_plugin_sd(struct inode *inode /* object being processed */ )
53729+{
53730+ int len;
53731+ reiser4_inode *state;
53732+ pset_member memb;
53733+
53734+ assert("nikita-663", inode != NULL);
53735+
53736+ state = reiser4_inode_data(inode);
53737+ /* common case: no non-standard plugins */
53738+ if (state->plugin_mask == 0)
53739+ return 0;
53740+ len = sizeof(reiser4_plugin_stat);
53741+ for (memb = 0; memb < PSET_LAST; ++memb)
53742+ len = len_for(pset_get(state->pset, memb), inode, memb, len);
53743+ assert("nikita-664", len > (int)sizeof(reiser4_plugin_stat));
53744+ return len;
53745+}
53746+
53747+/* helper function for plugin_sd_save(): save plugin, associated with
53748+ inode. */
53749+static int save_plug(reiser4_plugin * plugin /* plugin to save */ ,
53750+ struct inode *inode /* object being processed */ ,
53751+ pset_member memb /* what element of pset is saved */ ,
53752+ char **area /* position in stat-data */ ,
53753+		     int *count /* incremented if plugin was actually
53754+ * saved. */ )
53755+{
53756+ reiser4_plugin_slot *slot;
53757+ int fake_len;
53758+ int result;
53759+
53760+ assert("nikita-665", inode != NULL);
53761+ assert("nikita-666", area != NULL);
53762+ assert("nikita-667", *area != NULL);
53763+
53764+ if (plugin == NULL)
53765+ return 0;
53766+ if (!(reiser4_inode_data(inode)->plugin_mask & (1 << memb)))
53767+ return 0;
53768+ slot = (reiser4_plugin_slot *) * area;
53769+ put_unaligned(cpu_to_le16(memb), &slot->pset_memb);
53770+ put_unaligned(cpu_to_le16(plugin->h.id), &slot->id);
53771+ fake_len = (int)0xffff;
53772+ move_on(&fake_len, area, sizeof *slot);
53773+ ++*count;
53774+ result = 0;
53775+ if (plugin->h.pops != NULL) {
53776+ if (plugin->h.pops->save != NULL)
53777+ result = plugin->h.pops->save(inode, plugin, area);
53778+ }
53779+ return result;
53780+}
53781+
53782+/* save state of all non-standard plugins associated with inode */
53783+static int save_plugin_sd(struct inode *inode /* object being processed */ ,
53784+ char **area /* position in stat-data */ )
53785+{
53786+ int result = 0;
53787+ int num_of_plugins;
53788+ reiser4_plugin_stat *sd;
53789+ reiser4_inode *state;
53790+ int fake_len;
53791+ pset_member memb;
53792+
53793+ assert("nikita-669", inode != NULL);
53794+ assert("nikita-670", area != NULL);
53795+ assert("nikita-671", *area != NULL);
53796+
53797+ state = reiser4_inode_data(inode);
53798+ if (state->plugin_mask == 0)
53799+ return 0;
53800+ sd = (reiser4_plugin_stat *) * area;
53801+ fake_len = (int)0xffff;
53802+ move_on(&fake_len, area, sizeof *sd);
53803+
53804+ num_of_plugins = 0;
53805+ for (memb = 0; memb < PSET_LAST; ++memb) {
53806+ result = save_plug(pset_get(state->pset, memb),
53807+ inode, memb, area, &num_of_plugins);
53808+ if (result != 0)
53809+ break;
53810+ }
53811+
53812+ put_unaligned(cpu_to_le16((__u16)num_of_plugins), &sd->plugins_no);
53813+ return result;
53814+}
53815+
53816+/* helper function for crypto_sd_present(), crypto_sd_save.
53817+ Allocates memory for crypto stat, keyid and attaches it to the inode */
53818+static int extract_crypto_stat (struct inode * inode,
53819+ reiser4_crypto_stat * sd)
53820+{
53821+ crypto_stat_t * info;
53822+ assert("edward-11", !inode_crypto_stat(inode));
53823+ assert("edward-1413",
53824+ !inode_get_flag(inode, REISER4_CRYPTO_STAT_LOADED));
53825+ /* create and attach a crypto-stat without secret key loaded */
53826+ info = alloc_crypto_stat(inode);
53827+ if (IS_ERR(info))
53828+ return PTR_ERR(info);
53829+ info->keysize = le16_to_cpu(get_unaligned(&sd->keysize));
53830+ memcpy(info->keyid, sd->keyid, inode_digest_plugin(inode)->fipsize);
53831+ attach_crypto_stat(inode, info);
53832+ inode_set_flag(inode, REISER4_CRYPTO_STAT_LOADED);
53833+ return 0;
53834+}
53835+
53836+/* crypto stat-data extension */
53837+
53838+static int present_crypto_sd(struct inode *inode, char **area, int *len)
53839+{
53840+ int result;
53841+ reiser4_crypto_stat *sd;
53842+ digest_plugin *dplug = inode_digest_plugin(inode);
53843+
53844+ assert("edward-06", dplug != NULL);
53845+ assert("edward-684", dplug->fipsize);
53846+ assert("edward-07", area != NULL);
53847+ assert("edward-08", *area != NULL);
53848+ assert("edward-09", len != NULL);
53849+ assert("edward-10", *len > 0);
53850+
53851+ if (*len < (int)sizeof(reiser4_crypto_stat)) {
53852+ return not_enough_space(inode, "crypto-sd");
53853+ }
53854+ /* *len is number of bytes in stat data item from *area to the end of
53855+	   item. It must be at least the size of this extension */
53856+ assert("edward-75", sizeof(*sd) + dplug->fipsize <= *len);
53857+
53858+ sd = (reiser4_crypto_stat *) * area;
53859+ result = extract_crypto_stat(inode, sd);
53860+ move_on(len, area, sizeof(*sd) + dplug->fipsize);
53861+
53862+ return result;
53863+}
53864+
53865+static int save_len_crypto_sd(struct inode *inode)
53866+{
53867+ return sizeof(reiser4_crypto_stat) +
53868+ inode_digest_plugin(inode)->fipsize;
53869+}
53870+
53871+static int save_crypto_sd(struct inode *inode, char **area)
53872+{
53873+ int result = 0;
53874+ reiser4_crypto_stat *sd;
53875+ crypto_stat_t * info = inode_crypto_stat(inode);
53876+ digest_plugin *dplug = inode_digest_plugin(inode);
53877+
53878+ assert("edward-12", dplug != NULL);
53879+ assert("edward-13", area != NULL);
53880+ assert("edward-14", *area != NULL);
53881+ assert("edward-15", info != NULL);
53882+ assert("edward-1414", info->keyid != NULL);
53883+ assert("edward-1415", info->keysize != 0);
53884+ assert("edward-76", reiser4_inode_data(inode) != NULL);
53885+
53886+ if (!inode_get_flag(inode, REISER4_CRYPTO_STAT_LOADED)) {
53887+ /* file is just created */
53888+ sd = (reiser4_crypto_stat *) *area;
53889+ /* copy everything but private key to the disk stat-data */
53890+ put_unaligned(cpu_to_le16(info->keysize), &sd->keysize);
53891+ memcpy(sd->keyid, info->keyid, (size_t) dplug->fipsize);
53892+ inode_set_flag(inode, REISER4_CRYPTO_STAT_LOADED);
53893+ }
53894+ *area += (sizeof(*sd) + dplug->fipsize);
53895+ return result;
53896+}
53897+
53898+static int eio(struct inode *inode, char **area, int *len)
53899+{
53900+ return RETERR(-EIO);
53901+}
53902+
53903+sd_ext_plugin sd_ext_plugins[LAST_SD_EXTENSION] = {
53904+ [LIGHT_WEIGHT_STAT] = {
53905+ .h = {
53906+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
53907+ .id = LIGHT_WEIGHT_STAT,
53908+ .pops = NULL,
53909+ .label = "light-weight sd",
53910+ .desc = "sd for light-weight files",
53911+ .linkage = {NULL,NULL}
53912+ },
53913+ .present = present_lw_sd,
53914+ .absent = NULL,
53915+ .save_len = save_len_lw_sd,
53916+ .save = save_lw_sd,
53917+ .alignment = 8
53918+ },
53919+ [UNIX_STAT] = {
53920+ .h = {
53921+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
53922+ .id = UNIX_STAT,
53923+ .pops = NULL,
53924+ .label = "unix-sd",
53925+ .desc = "unix stat-data fields",
53926+ .linkage = {NULL,NULL}
53927+ },
53928+ .present = present_unix_sd,
53929+ .absent = absent_unix_sd,
53930+ .save_len = save_len_unix_sd,
53931+ .save = save_unix_sd,
53932+ .alignment = 8
53933+ },
53934+ [LARGE_TIMES_STAT] = {
53935+ .h = {
53936+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
53937+ .id = LARGE_TIMES_STAT,
53938+ .pops = NULL,
53939+ .label = "64time-sd",
53940+ .desc = "nanosecond resolution for times",
53941+ .linkage = {NULL,NULL}
53942+ },
53943+ .present = present_large_times_sd,
53944+ .absent = NULL,
53945+ .save_len = save_len_large_times_sd,
53946+ .save = save_large_times_sd,
53947+ .alignment = 8
53948+ },
53949+ [SYMLINK_STAT] = {
53950+ /* stat data of symlink has this extension */
53951+ .h = {
53952+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
53953+ .id = SYMLINK_STAT,
53954+ .pops = NULL,
53955+ .label = "symlink-sd",
53956+ .desc =
53957+ "stat data is appended with symlink name",
53958+ .linkage = {NULL,NULL}
53959+ },
53960+ .present = present_symlink_sd,
53961+ .absent = NULL,
53962+ .save_len = save_len_symlink_sd,
53963+ .save = save_symlink_sd,
53964+ .alignment = 8
53965+ },
53966+ [PLUGIN_STAT] = {
53967+ .h = {
53968+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
53969+ .id = PLUGIN_STAT,
53970+ .pops = NULL,
53971+ .label = "plugin-sd",
53972+ .desc = "plugin stat-data fields",
53973+ .linkage = {NULL,NULL}
53974+ },
53975+ .present = present_plugin_sd,
53976+ .absent = absent_plugin_sd,
53977+ .save_len = save_len_plugin_sd,
53978+ .save = save_plugin_sd,
53979+ .alignment = 8
53980+ },
53981+ [FLAGS_STAT] = {
53982+ .h = {
53983+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
53984+ .id = FLAGS_STAT,
53985+ .pops = NULL,
53986+ .label = "flags-sd",
53987+ .desc = "inode bit flags",
53988+ .linkage = {NULL, NULL}
53989+ },
53990+ .present = present_flags_sd,
53991+ .absent = NULL,
53992+ .save_len = save_len_flags_sd,
53993+ .save = save_flags_sd,
53994+ .alignment = 8
53995+ },
53996+ [CAPABILITIES_STAT] = {
53997+ .h = {
53998+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
53999+ .id = CAPABILITIES_STAT,
54000+ .pops = NULL,
54001+ .label = "capabilities-sd",
54002+ .desc = "capabilities",
54003+ .linkage = {NULL, NULL}
54004+ },
54005+ .present = eio,
54006+ .absent = NULL,
54007+ .save_len = save_len_flags_sd,
54008+ .save = save_flags_sd,
54009+ .alignment = 8
54010+ },
54011+ [CRYPTO_STAT] = {
54012+ .h = {
54013+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
54014+ .id = CRYPTO_STAT,
54015+ .pops = NULL,
54016+ .label = "crypto-sd",
54017+ .desc = "secret key size and id",
54018+ .linkage = {NULL, NULL}
54019+ },
54020+ .present = present_crypto_sd,
54021+ .absent = NULL,
54022+ .save_len = save_len_crypto_sd,
54023+ .save = save_crypto_sd,
54024+ .alignment = 8
54025+ }
54026+};
54027+
54028+/* Make Linus happy.
54029+ Local variables:
54030+ c-indentation-style: "K&R"
54031+ mode-name: "LC"
54032+ c-basic-offset: 8
54033+ tab-width: 8
54034+ fill-column: 120
54035+ End:
54036+*/
54037Index: linux-2.6.16/fs/reiser4/plugin/item/static_stat.h
54038===================================================================
54039--- /dev/null
54040+++ linux-2.6.16/fs/reiser4/plugin/item/static_stat.h
54041@@ -0,0 +1,219 @@
54042+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
54043+
54044+/* This describes the static_stat item, used to hold all information needed by the stat() syscall.
54045+
54046+In the case where a file has just the fields needed by the
54047+stat() syscall, it is more compact to store those fields in this
54048+struct.
54049+
54050+If this item does not exist, then all stats are dynamically resolved.
54051+At the moment, we either resolve all stats dynamically or all of them
54052+statically. If you think this is not fully optimal, and the rest of
54053+reiser4 is working, then fix it...:-)
54054+
54055+*/
54056+
54057+#if !defined( __FS_REISER4_PLUGIN_ITEM_STATIC_STAT_H__ )
54058+#define __FS_REISER4_PLUGIN_ITEM_STATIC_STAT_H__
54059+
54060+#include "../../forward.h"
54061+#include "../../dformat.h"
54062+
54063+#include <linux/fs.h> /* for struct inode */
54064+
54065+/* Stat data layout: goals and implementation.
54066+
54067+ We want to be able to have lightweight files which have complete flexibility in what semantic metadata is attached to
54068+ them, including not having semantic metadata attached to them.
54069+
54070+ There is one problem with doing that, which is that if in fact you have exactly the same metadata for most files you
54071+ want to store, then it takes more space to store that metadata in a dynamically sized structure than in a statically
54072+ sized structure because the statically sized structure knows without recording it what the names and lengths of the
54073+ attributes are.
54074+
54075+ This leads to a natural compromise, which is to special case those files which have simply the standard unix file
54076+ attributes, and only employ the full dynamic stat data mechanism for those files that differ from the standard unix
54077+ file in their use of file attributes.
54078+
54079+ Yet this compromise deserves to be compromised a little.
54080+
54081+ We accommodate the case where you have no more than the standard unix file attributes by using an "extension
54082+ bitmask": each bit in it indicates presence or absence of or particular stat data extension (see sd_ext_bits enum).
54083+
54084+   If the first bit of the extension bitmask is 0, we have a light-weight file whose attributes are either inherited
54085+   from the parent directory (such as uid and gid) or initialised to sane values.
54086+
54087+ To capitalize on existing code infrastructure, extensions are
54088+ implemented as plugins of type REISER4_SD_EXT_PLUGIN_TYPE.
54089+ Each stat-data extension plugin implements four methods:
54090+
54091+ ->present() called by sd_load() when this extension is found in stat-data
54092+ ->absent() called by sd_load() when this extension is not found in stat-data
54093+ ->save_len() called by sd_len() to calculate total length of stat-data
54094+ ->save() called by sd_save() to store extension data into stat-data
54095+
54096+ Implementation is in fs/reiser4/plugin/item/static_stat.c
54097+*/
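To put numbers on the compactness argument, here is a sketch that mirrors the PACKED layouts declared below (field widths taken from reiser4_stat_data_base, reiser4_light_weight_stat and reiser4_unix_stat; the totals are illustrative, not normative):

    #include <stdio.h>
    #include <stdint.h>

    #pragma pack(push, 1)
    struct sd_base { uint16_t extmask; };
    struct sd_lw   { uint16_t mode; uint32_t nlink; uint64_t size; };
    struct sd_unix { uint32_t uid, gid, atime, mtime, ctime; uint64_t u; };
    #pragma pack(pop)

    int main(void)
    {
            printf("light-weight sd: %zu bytes\n",
                   sizeof(struct sd_base) + sizeof(struct sd_lw));       /* 16 */
            printf("unix sd:         %zu bytes\n",
                   sizeof(struct sd_base) + sizeof(struct sd_lw)
                   + sizeof(struct sd_unix));                            /* 44 */
            return 0;
    }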
54098+
54099+/* stat-data extension. Please order this by presumed frequency of use */
54100+typedef enum {
54101+ /* support for light-weight files */
54102+ LIGHT_WEIGHT_STAT,
54103+ /* data required to implement unix stat(2) call. Layout is in
54104+ reiser4_unix_stat. If this is not present, file is light-weight */
54105+ UNIX_STAT,
54106+ /* this contains additional set of 32bit [anc]time fields to implement
54107+ nanosecond resolution. Layout is in reiser4_large_times_stat. Usage
54108+	   of this extension is governed by the 32bittimes mount option. */
54109+ LARGE_TIMES_STAT,
54110+ /* stat data has link name included */
54111+ SYMLINK_STAT,
54112+ /* if this is present, file is controlled by non-standard
54113+ plugin (that is, plugin that cannot be deduced from file
54114+ mode bits), for example, aggregation, interpolation etc. */
54115+ PLUGIN_STAT,
54116+ /* this extension contains persistent inode flags. These flags are
54117+	   single bits: immutable, append-only, etc. Layout is in
54118+ reiser4_flags_stat. */
54119+ FLAGS_STAT,
54120+ /* this extension contains capabilities sets, associated with this
54121+ file. Layout is in reiser4_capabilities_stat */
54122+ CAPABILITIES_STAT,
54123+ /* this extension contains size and public id of the secret key.
54124+ Layout is in reiser4_crypto_stat */
54125+ CRYPTO_STAT,
54126+ LAST_SD_EXTENSION,
54127+ /*
54128+ * init_inode_static_sd() iterates over extension mask until all
54129+ * non-zero bits are processed. This means, that neither ->present(),
54130+ * nor ->absent() methods will be called for stat-data extensions that
54131+ * go after last present extension. But some basic extensions, we want
54132+ * either ->absent() or ->present() method to be called, because these
54133+ * extensions set up something in inode even when they are not
54134+ * present. This is what LAST_IMPORTANT_SD_EXTENSION is for: for all
54135+ * extensions before and including LAST_IMPORTANT_SD_EXTENSION either
54136+ * ->present(), or ->absent() method will be called, independently of
54137+ * what other extensions are present.
54138+ */
54139+ LAST_IMPORTANT_SD_EXTENSION = PLUGIN_STAT,
54140+} sd_ext_bits;
54141+
54142+/* minimal stat-data. This makes it possible to support light-weight files. */
54143+typedef struct reiser4_stat_data_base {
54144+ /* 0 */ __le16 extmask;
54145+ /* 2 */
54146+} PACKED reiser4_stat_data_base;
54147+
54148+typedef struct reiser4_light_weight_stat {
54149+ /* 0 */ __le16 mode;
54150+ /* 2 */ __le32 nlink;
54151+	/* 6 */ __le64 size;
54152+	/* size in bytes */
54153+	/* 14 */
54154+} PACKED reiser4_light_weight_stat;
54155+
54156+typedef struct reiser4_unix_stat {
54157+ /* owner id */
54158+ /* 0 */ __le32 uid;
54159+ /* group id */
54160+ /* 4 */ __le32 gid;
54161+ /* access time */
54162+ /* 8 */ __le32 atime;
54163+ /* modification time */
54164+ /* 12 */ __le32 mtime;
54165+ /* change time */
54166+ /* 16 */ __le32 ctime;
54167+ union {
54168+ /* minor:major for device files */
54169+ /* 20 */ __le64 rdev;
54170+ /* bytes used by file */
54171+ /* 20 */ __le64 bytes;
54172+ } u;
54173+ /* 28 */
54174+} PACKED reiser4_unix_stat;
54175+
54176+/* symlink stored as part of inode */
54177+typedef struct reiser4_symlink_stat {
54178+ char body[0];
54179+} PACKED reiser4_symlink_stat;
54180+
54181+typedef struct reiser4_plugin_slot {
54182+ /* 0 */ __le16 pset_memb;
54183+ /* 2 */ __le16 id;
54184+ /* 4 *//* here plugin stores its persistent state */
54185+} PACKED reiser4_plugin_slot;
54186+
54187+/* stat-data extension for files with non-standard plugin. */
54188+typedef struct reiser4_plugin_stat {
54189+ /* number of additional plugins, associated with this object */
54190+ /* 0 */ __le16 plugins_no;
54191+ /* 2 */ reiser4_plugin_slot slot[0];
54192+ /* 2 */
54193+} PACKED reiser4_plugin_stat;
54194+
54195+/* stat-data extension for inode flags. Currently it is just fixed-width 32
54196+ * bit mask. If need arise, this can be replaced with variable width
54197+ * bitmask. */
54198+typedef struct reiser4_flags_stat {
54199+ /* 0 */ __le32 flags;
54200+ /* 4 */
54201+} PACKED reiser4_flags_stat;
54202+
54203+typedef struct reiser4_capabilities_stat {
54204+ /* 0 */ __le32 effective;
54205+	/* 4 */ __le32 permitted;
54206+	/* 8 */
54207+} PACKED reiser4_capabilities_stat;
54208+
54209+typedef struct reiser4_cluster_stat {
54210+/* this defines cluster size (an attribute of cryptcompress objects) as PAGE_SIZE << cluster shift */
54211+ /* 0 */ d8 cluster_shift;
54212+ /* 1 */
54213+} PACKED reiser4_cluster_stat;
54214+
54215+typedef struct reiser4_crypto_stat {
54216+ /* secret key size, bits */
54217+ /* 0 */ d16 keysize;
54218+ /* secret key id */
54219+ /* 2 */ d8 keyid[0];
54220+ /* 2 */
54221+} PACKED reiser4_crypto_stat;
54222+
54223+typedef struct reiser4_large_times_stat {
54224+ /* access time */
54225+ /* 0 */ d32 atime;
54226+ /* modification time */
54227+	/* 4 */ d32 mtime;
54228+	/* change time */
54229+	/* 8 */ d32 ctime;
54230+	/* 12 */
54231+} PACKED reiser4_large_times_stat;
54232+
54233+/* this structure is filled by sd_item_stat */
54234+typedef struct sd_stat {
54235+ int dirs;
54236+ int files;
54237+ int others;
54238+} sd_stat;
54239+
54240+/* plugin->item.common.* */
54241+extern void print_sd(const char *prefix, coord_t * coord);
54242+extern void item_stat_static_sd(const coord_t * coord, void *vp);
54243+
54244+/* plugin->item.s.sd.* */
54245+extern int init_inode_static_sd(struct inode *inode, char *sd, int len);
54246+extern int save_len_static_sd(struct inode *inode);
54247+extern int save_static_sd(struct inode *inode, char **area);
54248+
54249+/* __FS_REISER4_PLUGIN_ITEM_STATIC_STAT_H__ */
54250+#endif
54251+
54252+/* Make Linus happy.
54253+ Local variables:
54254+ c-indentation-style: "K&R"
54255+ mode-name: "LC"
54256+ c-basic-offset: 8
54257+ tab-width: 8
54258+ fill-column: 120
54259+ End:
54260+*/
54261Index: linux-2.6.16/fs/reiser4/plugin/item/tail.c
54262===================================================================
54263--- /dev/null
54264+++ linux-2.6.16/fs/reiser4/plugin/item/tail.c
54265@@ -0,0 +1,805 @@
54266+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
54267+
54268+#include "item.h"
54269+#include "../../inode.h"
54270+#include "../../page_cache.h"
54271+#include "../../carry.h"
54272+#include "../../vfs_ops.h"
54273+
54274+#include <linux/quotaops.h>
54275+#include <asm/uaccess.h>
54276+#include <linux/swap.h>
54277+#include <linux/writeback.h>
54278+
54279+/* plugin->u.item.b.max_key_inside */
54280+reiser4_key *max_key_inside_tail(const coord_t *coord, reiser4_key *key)
54281+{
54282+ item_key_by_coord(coord, key);
54283+ set_key_offset(key, get_key_offset(max_key()));
54284+ return key;
54285+}
54286+
54287+/* plugin->u.item.b.can_contain_key */
54288+int can_contain_key_tail(const coord_t *coord, const reiser4_key *key,
54289+ const reiser4_item_data *data)
54290+{
54291+ reiser4_key item_key;
54292+
54293+ if (item_plugin_by_coord(coord) != data->iplug)
54294+ return 0;
54295+
54296+ item_key_by_coord(coord, &item_key);
54297+ if (get_key_locality(key) != get_key_locality(&item_key) ||
54298+ get_key_objectid(key) != get_key_objectid(&item_key))
54299+ return 0;
54300+
54301+ return 1;
54302+}
54303+
54304+/* plugin->u.item.b.mergeable
54305+ first item is of tail type */
54306+/* Audited by: green(2002.06.14) */
54307+int mergeable_tail(const coord_t *p1, const coord_t *p2)
54308+{
54309+ reiser4_key key1, key2;
54310+
54311+ assert("vs-535",
54312+ item_type_by_coord(p1) == UNIX_FILE_METADATA_ITEM_TYPE);
54313+ assert("vs-365", item_id_by_coord(p1) == FORMATTING_ID);
54314+
54315+ if (item_id_by_coord(p2) != FORMATTING_ID) {
54316+ /* second item is of another type */
54317+ return 0;
54318+ }
54319+
54320+ item_key_by_coord(p1, &key1);
54321+ item_key_by_coord(p2, &key2);
54322+ if (get_key_locality(&key1) != get_key_locality(&key2) ||
54323+ get_key_objectid(&key1) != get_key_objectid(&key2)
54324+ || get_key_type(&key1) != get_key_type(&key2)) {
54325+ /* items of different objects */
54326+ return 0;
54327+ }
54328+ if (get_key_offset(&key1) + nr_units_tail(p1) != get_key_offset(&key2)) {
54329+ /* not adjacent items */
54330+ return 0;
54331+ }
54332+ return 1;
54333+}
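The last test above is the important one: for tail items one unit is one byte, so nr_units equals the byte length, and two items of the same file merge exactly when the first ends where the second begins. A toy check of the condition:

    #include <assert.h>

    static int tails_adjacent(unsigned long long off1, unsigned len1,
                              unsigned long long off2)
    {
            /* item 1 covers [off1, off1 + len1); mergeable iff item 2 starts there */
            return off1 + len1 == off2;
    }

    int main(void)
    {
            assert(tails_adjacent(0, 4096, 4096));   /* contiguous bytes */
            assert(!tails_adjacent(0, 4096, 8192));  /* gap: not mergeable */
            return 0;
    }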
54334+
54335+/* plugin->u.item.b.print
54336+ plugin->u.item.b.check */
54337+
54338+/* plugin->u.item.b.nr_units */
54339+pos_in_node_t nr_units_tail(const coord_t * coord)
54340+{
54341+ return item_length_by_coord(coord);
54342+}
54343+
54344+/* plugin->u.item.b.lookup */
54345+lookup_result
54346+lookup_tail(const reiser4_key * key, lookup_bias bias, coord_t * coord)
54347+{
54348+ reiser4_key item_key;
54349+ __u64 lookuped, offset;
54350+ unsigned nr_units;
54351+
54352+ item_key_by_coord(coord, &item_key);
54353+ offset = get_key_offset(item_key_by_coord(coord, &item_key));
54354+ nr_units = nr_units_tail(coord);
54355+
54356+ /* key we are looking for must be greater than key of item @coord */
54357+ assert("vs-416", keygt(key, &item_key));
54358+
54359+ /* offset we are looking for */
54360+ lookuped = get_key_offset(key);
54361+
54362+ if (lookuped >= offset && lookuped < offset + nr_units) {
54363+ /* byte we are looking for is in this item */
54364+ coord->unit_pos = lookuped - offset;
54365+ coord->between = AT_UNIT;
54366+ return CBK_COORD_FOUND;
54367+ }
54368+
54369+ /* set coord after last unit */
54370+ coord->unit_pos = nr_units - 1;
54371+ coord->between = AFTER_UNIT;
54372+ return bias ==
54373+ FIND_MAX_NOT_MORE_THAN ? CBK_COORD_FOUND : CBK_COORD_NOTFOUND;
54374+}
54375+
54376+/* plugin->u.item.b.paste */
54377+int
54378+paste_tail(coord_t *coord, reiser4_item_data *data,
54379+ carry_plugin_info *info UNUSED_ARG)
54380+{
54381+ unsigned old_item_length;
54382+ char *item;
54383+
54384+ /* length the item had before resizing has been performed */
54385+ old_item_length = item_length_by_coord(coord) - data->length;
54386+
54387+ /* tail items never get pasted in the middle */
54388+ assert("vs-363",
54389+ (coord->unit_pos == 0 && coord->between == BEFORE_UNIT) ||
54390+ (coord->unit_pos == old_item_length - 1 &&
54391+ coord->between == AFTER_UNIT) ||
54392+ (coord->unit_pos == 0 && old_item_length == 0
54393+ && coord->between == AT_UNIT));
54394+
54395+ item = item_body_by_coord(coord);
54396+ if (coord->unit_pos == 0)
54397+ /* make space for pasted data when pasting at the beginning of
54398+ the item */
54399+ memmove(item + data->length, item, old_item_length);
54400+
54401+ if (coord->between == AFTER_UNIT)
54402+ coord->unit_pos++;
54403+
54404+ if (data->data) {
54405+ assert("vs-554", data->user == 0 || data->user == 1);
54406+ if (data->user) {
54407+ assert("nikita-3035", schedulable());
54408+ /* copy from user space */
54409+ if (__copy_from_user(item + coord->unit_pos,
54410+ (const char __user *)data->data,
54411+ (unsigned)data->length))
54412+ return RETERR(-EFAULT);
54413+ } else
54414+ /* copy from kernel space */
54415+ memcpy(item + coord->unit_pos, data->data,
54416+ (unsigned)data->length);
54417+ } else {
54418+ memset(item + coord->unit_pos, 0, (unsigned)data->length);
54419+ }
54420+ return 0;
54421+}
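The front-paste path above is the subtle one: the node layout has already grown the item by data->length, so the old bytes must slide right before the new bytes are copied in. A self-contained re-enactment on a plain buffer:

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
            char item[8] = "WORLD";       /* old body, already resized to 8 */

            memmove(item + 3, item, 5);   /* make room at the front */
            memcpy(item, "HEL", 3);       /* paste the new data */

            printf("%.8s\n", item);       /* HELWORLD */
            return 0;
    }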
54422+
54423+/* plugin->u.item.b.fast_paste */
54424+
54425+/* plugin->u.item.b.can_shift
54426+ number of units is returned via return value, number of bytes via @size. For
54427+ tail items they coincide */
54428+int
54429+can_shift_tail(unsigned free_space, coord_t * source UNUSED_ARG,
54430+ znode * target UNUSED_ARG, shift_direction direction UNUSED_ARG,
54431+ unsigned *size, unsigned want)
54432+{
54433+	/* make sure that we do not want to shift more than we have */
54434+ assert("vs-364", want > 0
54435+ && want <= (unsigned)item_length_by_coord(source));
54436+
54437+ *size = min(want, free_space);
54438+ return *size;
54439+}
54440+
54441+/* plugin->u.item.b.copy_units */
54442+void
54443+copy_units_tail(coord_t * target, coord_t * source,
54444+ unsigned from, unsigned count,
54445+ shift_direction where_is_free_space,
54446+ unsigned free_space UNUSED_ARG)
54447+{
54448+ /* make sure that item @target is expanded already */
54449+ assert("vs-366", (unsigned)item_length_by_coord(target) >= count);
54450+ assert("vs-370", free_space >= count);
54451+
54452+ if (where_is_free_space == SHIFT_LEFT) {
54453+ /* append item @target with @count first bytes of @source */
54454+ assert("vs-365", from == 0);
54455+
54456+ memcpy((char *)item_body_by_coord(target) +
54457+ item_length_by_coord(target) - count,
54458+ (char *)item_body_by_coord(source), count);
54459+ } else {
54460+ /* target item is moved to right already */
54461+ reiser4_key key;
54462+
54463+ assert("vs-367",
54464+ (unsigned)item_length_by_coord(source) == from + count);
54465+
54466+ memcpy((char *)item_body_by_coord(target),
54467+ (char *)item_body_by_coord(source) + from, count);
54468+
54469+ /* new units are inserted before first unit in an item,
54470+ therefore, we have to update item key */
54471+ item_key_by_coord(source, &key);
54472+ set_key_offset(&key, get_key_offset(&key) + from);
54473+
54474+ node_plugin_by_node(target->node)->update_item_key(target, &key,
54475+ NULL /*info */);
54476+ }
54477+}
54478+
54479+/* plugin->u.item.b.create_hook */
54480+
54481+/* item_plugin->b.kill_hook
54482+ this is called when @count units starting from @from-th one are going to be removed
54483+ */
54484+int
54485+kill_hook_tail(const coord_t * coord, pos_in_node_t from,
54486+ pos_in_node_t count, struct carry_kill_data *kdata)
54487+{
54488+ reiser4_key key;
54489+ loff_t start, end;
54490+
54491+ assert("vs-1577", kdata);
54492+ assert("vs-1579", kdata->inode);
54493+
54494+ item_key_by_coord(coord, &key);
54495+ start = get_key_offset(&key) + from;
54496+ end = start + count;
54497+ fake_kill_hook_tail(kdata->inode, start, end, kdata->params.truncate);
54498+ return 0;
54499+}
54500+
54501+/* plugin->u.item.b.shift_hook */
54502+
54503+/* helper for kill_units_tail and cut_units_tail */
54504+static int
54505+do_cut_or_kill(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
54506+ reiser4_key * smallest_removed, reiser4_key * new_first)
54507+{
54508+ pos_in_node_t count;
54509+
54510+ /* this method is only called to remove part of item */
54511+ assert("vs-374", (to - from + 1) < item_length_by_coord(coord));
54512+	/* tail items are never cut from the middle of an item */
54513+ assert("vs-396", ergo(from != 0, to == coord_last_unit_pos(coord)));
54514+ assert("vs-1558", ergo(from == 0, to < coord_last_unit_pos(coord)));
54515+
54516+ count = to - from + 1;
54517+
54518+ if (smallest_removed) {
54519+ /* store smallest key removed */
54520+ item_key_by_coord(coord, smallest_removed);
54521+ set_key_offset(smallest_removed,
54522+ get_key_offset(smallest_removed) + from);
54523+ }
54524+ if (new_first) {
54525+ /* head of item is cut */
54526+ assert("vs-1529", from == 0);
54527+
54528+ item_key_by_coord(coord, new_first);
54529+ set_key_offset(new_first,
54530+ get_key_offset(new_first) + from + count);
54531+ }
54532+
54533+ if (REISER4_DEBUG)
54534+ memset((char *)item_body_by_coord(coord) + from, 0, count);
54535+ return count;
54536+}
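/*
 * Worked example (editorial aside, not part of the patch): for a tail item
 * of length 100 the asserts above allow either a head cut, from == 0 and
 * to == 59 (count is 60, and @new_first gets the item key with its offset
 * advanced by from + count == 60), or a tail cut, from == 60 and to == 99.
 * A middle cut such as from == 10, to == 19 is forbidden for tail items.
 */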
54537+
54538+/* plugin->u.item.b.cut_units */
54539+int
54540+cut_units_tail(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
54541+ struct carry_cut_data *cdata UNUSED_ARG,
54542+ reiser4_key * smallest_removed, reiser4_key * new_first)
54543+{
54544+ return do_cut_or_kill(coord, from, to, smallest_removed, new_first);
54545+}
54546+
54547+/* plugin->u.item.b.kill_units */
54548+int
54549+kill_units_tail(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
54550+ struct carry_kill_data *kdata, reiser4_key * smallest_removed,
54551+ reiser4_key * new_first)
54552+{
54553+ kill_hook_tail(coord, from, to - from + 1, kdata);
54554+ return do_cut_or_kill(coord, from, to, smallest_removed, new_first);
54555+}
54556+
54557+/* plugin->u.item.b.unit_key */
54558+reiser4_key *unit_key_tail(const coord_t * coord, reiser4_key * key)
54559+{
54560+ assert("vs-375", coord_is_existing_unit(coord));
54561+
54562+ item_key_by_coord(coord, key);
54563+ set_key_offset(key, (get_key_offset(key) + coord->unit_pos));
54564+
54565+ return key;
54566+}
54567+
54568+/* plugin->u.item.b.estimate
54569+ plugin->u.item.b.item_data_by_flow */
54570+
54571+/* tail readpage function; called from readpage_tail(). */
54572+static int do_readpage_tail(uf_coord_t *uf_coord, struct page *page)
54573+{
54574+ tap_t tap;
54575+ int result;
54576+ coord_t coord;
54577+ lock_handle lh;
54578+ int count, mapped;
54579+ struct inode *inode;
54580+ char *pagedata;
54581+
54582+	/* save the passed coord so that the tap does not move it */
54583+ init_lh(&lh);
54584+ copy_lh(&lh, uf_coord->lh);
54585+ inode = page->mapping->host;
54586+ coord_dup(&coord, &uf_coord->coord);
54587+
54588+ tap_init(&tap, &coord, &lh, ZNODE_READ_LOCK);
54589+
54590+ if ((result = tap_load(&tap)))
54591+ goto out_tap_done;
54592+
54593+ /* lookup until page is filled up. */
54594+ for (mapped = 0; mapped < PAGE_CACHE_SIZE; ) {
54595+ /* number of bytes to be copied to page */
54596+ count = item_length_by_coord(&coord) - coord.unit_pos;
54597+ if (count > PAGE_CACHE_SIZE - mapped)
54598+ count = PAGE_CACHE_SIZE - mapped;
54599+
54600+		/* map @page and get its data address */
54601+ pagedata = kmap_atomic(page, KM_USER0);
54602+
54603+ /* copy tail item to page */
54604+ memcpy(pagedata + mapped,
54605+ ((char *)item_body_by_coord(&coord) + coord.unit_pos),
54606+ count);
54607+ mapped += count;
54608+
54609+ flush_dcache_page(page);
54610+
54611+		/* unmap the page */
54612+ kunmap_atomic(pagedata, KM_USER0);
54613+
54614+ /* Getting next tail item. */
54615+ if (mapped < PAGE_CACHE_SIZE) {
54616+ /*
54617+			 * unlock the page to avoid keeping it locked
54618+			 * during tree lookup, which takes long-term locks
54619+ */
54620+ unlock_page(page);
54621+
54622+ /* getting right neighbour. */
54623+ result = go_dir_el(&tap, RIGHT_SIDE, 0);
54624+
54625+ /* lock page back */
54626+ lock_page(page);
54627+ if (PageUptodate(page)) {
54628+ /*
54629+ * another thread read the page, we have
54630+ * nothing to do
54631+ */
54632+ result = 0;
54633+ goto out_unlock_page;
54634+ }
54635+
54636+ if (result) {
54637+ if (result == -E_NO_NEIGHBOR) {
54638+ /*
54639+					 * right neighbor is not a formatted
54640+ * node
54641+ */
54642+ result = 0;
54643+ goto done;
54644+ } else {
54645+ goto out_tap_relse;
54646+ }
54647+ } else {
54648+ if (!inode_file_plugin(inode)->
54649+ owns_item(inode, &coord)) {
54650+ /* item of another file is found */
54651+ result = 0;
54652+ goto done;
54653+ }
54654+ }
54655+ }
54656+ }
54657+
54658+ done:
54659+ if (mapped != PAGE_CACHE_SIZE) {
54660+ pagedata = kmap_atomic(page, KM_USER0);
54661+ memset(pagedata + mapped, 0, PAGE_CACHE_SIZE - mapped);
54662+ flush_dcache_page(page);
54663+ kunmap_atomic(pagedata, KM_USER0);
54664+ }
54665+ SetPageUptodate(page);
54666+ out_unlock_page:
54667+ unlock_page(page);
54668+ out_tap_relse:
54669+ tap_relse(&tap);
54670+ out_tap_done:
54671+ tap_done(&tap);
54672+ return result;
54673+}
54674+
54675+/*
54676+ plugin->s.file.readpage
54677+ reiser4_read->unix_file_read->page_cache_readahead->reiser4_readpage->unix_file_readpage->readpage_tail
54678+ or
54679+ filemap_nopage->reiser4_readpage->readpage_unix_file->readpage_tail
54680+
54681+ At the beginning: coord->node is read-locked and zloaded, the page is locked, coord is set to an existing unit inside a tail
54682+ item. */
54683+int readpage_tail(void *vp, struct page *page)
54684+{
54685+ uf_coord_t *uf_coord = vp;
54686+ ON_DEBUG(coord_t * coord = &uf_coord->coord);
54687+ ON_DEBUG(reiser4_key key);
54688+
54689+ assert("umka-2515", PageLocked(page));
54690+ assert("umka-2516", !PageUptodate(page));
54691+ assert("umka-2517", !jprivate(page) && !PagePrivate(page));
54692+ assert("umka-2518", page->mapping && page->mapping->host);
54693+
54694+ assert("umka-2519", znode_is_loaded(coord->node));
54695+ assert("umka-2520", item_is_tail(coord));
54696+ assert("umka-2521", coord_is_existing_unit(coord));
54697+ assert("umka-2522", znode_is_rlocked(coord->node));
54698+ assert("umka-2523",
54699+ page->mapping->host->i_ino ==
54700+ get_key_objectid(item_key_by_coord(coord, &key)));
54701+
54702+ return do_readpage_tail(uf_coord, page);
54703+}
54704+
54705+/**
54706+ * overwrite_tail
54707+ * @flow:
54708+ * @coord:
54709+ *
54710+ * Overwrites tail item or its part by user data. Returns number of bytes
54711+ * written or error code.
54712+ */
54713+static int overwrite_tail(flow_t *flow, coord_t *coord)
54714+{
54715+ unsigned count;
54716+
54717+ assert("vs-570", flow->user == 1);
54718+ assert("vs-946", flow->data);
54719+ assert("vs-947", coord_is_existing_unit(coord));
54720+ assert("vs-948", znode_is_write_locked(coord->node));
54721+ assert("nikita-3036", schedulable());
54722+
54723+ count = item_length_by_coord(coord) - coord->unit_pos;
54724+ if (count > flow->length)
54725+ count = flow->length;
54726+
54727+ if (__copy_from_user((char *)item_body_by_coord(coord) + coord->unit_pos,
54728+ (const char __user *)flow->data, count))
54729+ return RETERR(-EFAULT);
54730+
54731+ znode_make_dirty(coord->node);
54732+ return count;
54733+}
54734+
54735+/**
54736+ * insert_first_tail
54737+ * @inode:
54738+ * @flow:
54739+ * @coord:
54740+ * @lh:
54741+ *
54742+ * Returns number of bytes written or error code.
54743+ */
54744+static ssize_t insert_first_tail(struct inode *inode, flow_t *flow,
54745+ coord_t *coord, lock_handle *lh)
54746+{
54747+ int result;
54748+ loff_t to_write;
54749+ unix_file_info_t *uf_info;
54750+
54751+ if (get_key_offset(&flow->key) != 0) {
54752+ /*
54753+		 * the file is empty and the write starts past the beginning
54754+		 * of the file, so create a hole at the beginning. On success
54755+		 * insert_flow returns 0 as the number of written bytes, which
54756+		 * is what we have to return when padding a file with holes
54757+ */
54758+ flow->data = NULL;
54759+ flow->length = get_key_offset(&flow->key);
54760+ set_key_offset(&flow->key, 0);
54761+ /*
54762+		 * holes in files built of tails are stored as if they were
54763+		 * real data consisting of all zeros. Therefore we have to
54764+ * allocate quota here as well
54765+ */
54766+ if (DQUOT_ALLOC_SPACE_NODIRTY(inode, flow->length))
54767+ return RETERR(-EDQUOT);
54768+ result = insert_flow(coord, lh, flow);
54769+ if (flow->length)
54770+ DQUOT_FREE_SPACE_NODIRTY(inode, flow->length);
54771+
54772+ uf_info = unix_file_inode_data(inode);
54773+
54774+ /*
54775+ * first item insertion is only possible when writing to empty
54776+ * file or performing tail conversion
54777+ */
54778+ assert("", (uf_info->container == UF_CONTAINER_EMPTY ||
54779+ (inode_get_flag(inode, REISER4_PART_MIXED) &&
54780+ inode_get_flag(inode, REISER4_PART_IN_CONV))));
54781+
54782+ /* if file was empty - update its state */
54783+ if (result == 0 && uf_info->container == UF_CONTAINER_EMPTY)
54784+ uf_info->container = UF_CONTAINER_TAILS;
54785+ return result;
54786+ }
54787+
54788+ /* check quota before appending data */
54789+ if (DQUOT_ALLOC_SPACE_NODIRTY(inode, flow->length))
54790+ return RETERR(-EDQUOT);
54791+
54792+ to_write = flow->length;
54793+ result = insert_flow(coord, lh, flow);
54794+ if (flow->length)
54795+ DQUOT_FREE_SPACE_NODIRTY(inode, flow->length);
54796+ return (to_write - flow->length) ? (to_write - flow->length) : result;
54797+}
54798+
54799+/**
54800+ * append_tail
54801+ * @inode:
54802+ * @flow:
54803+ * @coord:
54804+ * @lh:
54805+ *
54806+ * Returns number of bytes written or error code.
54807+ */
54808+static ssize_t append_tail(struct inode *inode,
54809+ flow_t *flow, coord_t *coord, lock_handle *lh)
54810+{
54811+ int result;
54812+ reiser4_key append_key;
54813+ loff_t to_write;
54814+
54815+ if (!keyeq(&flow->key, append_key_tail(coord, &append_key))) {
54816+ flow->data = NULL;
54817+ flow->length = get_key_offset(&flow->key) - get_key_offset(&append_key);
54818+ set_key_offset(&flow->key, get_key_offset(&append_key));
54819+ /*
54820+		 * holes in files built of tails are stored as if they were
54821+		 * real data consisting of all zeros. Therefore we have to
54822+ * allocate quota here as well
54823+ */
54824+ if (DQUOT_ALLOC_SPACE_NODIRTY(inode, flow->length))
54825+ return RETERR(-EDQUOT);
54826+ result = insert_flow(coord, lh, flow);
54827+ if (flow->length)
54828+ DQUOT_FREE_SPACE_NODIRTY(inode, flow->length);
54829+ return result;
54830+ }
54831+
54832+ /* check quota before appending data */
54833+ if (DQUOT_ALLOC_SPACE_NODIRTY(inode, flow->length))
54834+ return RETERR(-EDQUOT);
54835+
54836+ to_write = flow->length;
54837+ result = insert_flow(coord, lh, flow);
54838+ if (flow->length)
54839+ DQUOT_FREE_SPACE_NODIRTY(inode, flow->length);
54840+ return (to_write - flow->length) ? (to_write - flow->length) : result;
54841+}
54842+
54843+/**
54844+ * write_tail_reserve_space - reserve space for tail write operation
54845+ * @inode:
54846+ *
54847+ * Estimates and reserves space which may be required for writing one flow to a
54848+ * file
54849+ */
54850+static int write_tail_reserve_space(struct inode *inode)
54851+{
54852+ __u64 count;
54853+ reiser4_tree *tree;
54854+
54855+ /*
54856+ * to write one flow to a file by tails we have to reserve disk space for:
54857+	 *
54858+ * 1. find_file_item may have to insert empty node to the tree (empty
54859+ * leaf node between two extent items). This requires 1 block and
54860+ * number of blocks which are necessary to perform insertion of an
54861+ * internal item into twig level.
54862+ *
54863+ * 2. flow insertion
54864+ *
54865+ * 3. stat data update
54866+ */
54867+ tree = tree_by_inode(inode);
54868+ count = estimate_one_insert_item(tree) +
54869+ estimate_insert_flow(tree->height) +
54870+ estimate_one_insert_item(tree);
54871+ grab_space_enable();
54872+ return reiser4_grab_space(count, 0 /* flags */);
54873+}
54874+
54875+#define PAGE_PER_FLOW 4
54876+
54877+static loff_t faultin_user_pages(const char __user *buf, size_t count)
54878+{
54879+ loff_t faulted;
54880+ int to_fault;
54881+
54882+ if (count > PAGE_PER_FLOW * PAGE_CACHE_SIZE)
54883+ count = PAGE_PER_FLOW * PAGE_CACHE_SIZE;
54884+ faulted = 0;
54885+ while (count > 0) {
54886+ to_fault = PAGE_CACHE_SIZE;
54887+ if (count < to_fault)
54888+ to_fault = count;
54889+ fault_in_pages_readable(buf + faulted, to_fault);
54890+ count -= to_fault;
54891+ faulted += to_fault;
54892+ }
54893+ return faulted;
54894+}
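/*
 * Design note (editorial aside, not part of the patch): faulting user pages
 * in here, before find_file_item() takes long-term znode locks in
 * write_tail(), makes it unlikely that the __copy_from_user() done under
 * those locks (see overwrite_tail()) hits a major page fault. PAGE_PER_FLOW
 * simply caps how much of the write is prefaulted, and therefore how much
 * one pass of write_tail() will try to copy.
 */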
54895+
54896+/**
54897+ * write_tail - write method of tail item plugin
54898+ * @file: file to write to
54899+ * @buf: address of user-space buffer
54900+ * @count: number of bytes to write
54901+ * @pos: position in file to write to
54902+ *
54903+ * Returns number of written bytes or error code.
54904+ */
54905+ssize_t write_tail(struct file *file, const char __user *buf, size_t count,
54906+ loff_t *pos)
54907+{
54908+ struct inode *inode;
54909+ struct hint hint;
54910+ int result;
54911+ flow_t flow;
54912+ coord_t *coord;
54913+ lock_handle *lh;
54914+ znode *loaded;
54915+
54916+ inode = file->f_dentry->d_inode;
54917+
54918+	if (write_tail_reserve_space(inode))
54919+ return RETERR(-ENOSPC);
54920+
54921+ result = load_file_hint(file, &hint);
54922+ BUG_ON(result != 0);
54923+
54924+ flow.length = faultin_user_pages(buf, count);
54925+ flow.user = 1;
54926+ memcpy(&flow.data, &buf, sizeof(buf));
54927+ flow.op = WRITE_OP;
54928+ key_by_inode_and_offset_common(inode, *pos, &flow.key);
54929+
54930+ result = find_file_item(&hint, &flow.key, ZNODE_WRITE_LOCK, inode);
54931+ if (IS_CBKERR(result))
54932+ return result;
54933+
54934+ coord = &hint.ext_coord.coord;
54935+ lh = hint.ext_coord.lh;
54936+
54937+ result = zload(coord->node);
54938+ BUG_ON(result != 0);
54939+ loaded = coord->node;
54940+
54941+ if (coord->between == AFTER_UNIT) {
54942+ /* append with data or hole */
54943+ result = append_tail(inode, &flow, coord, lh);
54944+ } else if (coord->between == AT_UNIT) {
54945+ /* overwrite */
54946+ result = overwrite_tail(&flow, coord);
54947+ } else {
54948+ /* no items of this file yet. insert data or hole */
54949+ result = insert_first_tail(inode, &flow, coord, lh);
54950+ }
54951+ zrelse(loaded);
54952+ if (result < 0) {
54953+ done_lh(lh);
54954+ return result;
54955+ }
54956+
54957+	/* seal and unlock znode; ext_coord is invalidated just below, so the hint is always unset */
54958+ hint.ext_coord.valid = 0;
54959+ if (hint.ext_coord.valid)
54960+ set_hint(&hint, &flow.key, ZNODE_WRITE_LOCK);
54961+ else
54962+ unset_hint(&hint);
54963+
54964+ save_file_hint(file, &hint);
54965+ return result;
54966+}
54967+
54968+#if REISER4_DEBUG
54969+
54970+static int
54971+coord_matches_key_tail(const coord_t * coord, const reiser4_key * key)
54972+{
54973+ reiser4_key item_key;
54974+
54975+ assert("vs-1356", coord_is_existing_unit(coord));
54976+ assert("vs-1354", keylt(key, append_key_tail(coord, &item_key)));
54977+ assert("vs-1355", keyge(key, item_key_by_coord(coord, &item_key)));
54978+ return get_key_offset(key) ==
54979+ get_key_offset(&item_key) + coord->unit_pos;
54980+
54981+}
54982+
54983+#endif
54984+
54985+/* plugin->u.item.s.file.read */
54986+int read_tail(struct file *file UNUSED_ARG, flow_t *f, hint_t *hint)
54987+{
54988+ unsigned count;
54989+ int item_length;
54990+ coord_t *coord;
54991+ uf_coord_t *uf_coord;
54992+
54993+ uf_coord = &hint->ext_coord;
54994+ coord = &uf_coord->coord;
54995+
54996+ assert("vs-571", f->user == 1);
54997+ assert("vs-571", f->data);
54998+ assert("vs-967", coord && coord->node);
54999+ assert("vs-1117", znode_is_rlocked(coord->node));
55000+ assert("vs-1118", znode_is_loaded(coord->node));
55001+
55002+ assert("nikita-3037", schedulable());
55003+ assert("vs-1357", coord_matches_key_tail(coord, &f->key));
55004+
55005+ /* calculate number of bytes to read off the item */
55006+ item_length = item_length_by_coord(coord);
55007+ count = item_length_by_coord(coord) - coord->unit_pos;
55008+ if (count > f->length)
55009+ count = f->length;
55010+
55011+ /* user page has to be brought in so that major page fault does not
55012+	 * occur here while a long-term lock is held */
55013+ if (__copy_to_user((char __user *)f->data,
55014+ ((char *)item_body_by_coord(coord) + coord->unit_pos),
55015+ count))
55016+ return RETERR(-EFAULT);
55017+
55018+ /* probably mark_page_accessed() should only be called if
55019+ * coord->unit_pos is zero. */
55020+ mark_page_accessed(znode_page(coord->node));
55021+ move_flow_forward(f, count);
55022+
55023+ coord->unit_pos += count;
55024+ if (item_length == coord->unit_pos) {
55025+ coord->unit_pos--;
55026+ coord->between = AFTER_UNIT;
55027+ }
55028+
55029+ return 0;
55030+}
55031+
55032+/*
55033+ plugin->u.item.s.file.append_key
55034+ key of the first byte after the last byte addressed by this item, i.e. item key offset plus item length
55035+*/
55036+reiser4_key *append_key_tail(const coord_t * coord, reiser4_key * key)
55037+{
55038+ item_key_by_coord(coord, key);
55039+ set_key_offset(key, get_key_offset(key) + item_length_by_coord(coord));
55040+ return key;
55041+}
55042+
55043+/* plugin->u.item.s.file.init_coord_extension */
55044+void init_coord_extension_tail(uf_coord_t * uf_coord, loff_t lookuped)
55045+{
55046+ uf_coord->valid = 1;
55047+}
55048+
55049+/*
55050+ plugin->u.item.s.file.get_block
55051+*/
55052+int
55053+get_block_address_tail(const coord_t * coord, sector_t lblock, sector_t * block)
55054+{
55055+ assert("nikita-3252", znode_get_level(coord->node) == LEAF_LEVEL);
55056+
55057+ *block = *znode_get_block(coord->node);
55058+ return 0;
55059+}
55060+
55061+/*
55062+ * Local variables:
55063+ * c-indentation-style: "K&R"
55064+ * mode-name: "LC"
55065+ * c-basic-offset: 8
55066+ * tab-width: 8
55067+ * fill-column: 79
55068+ * scroll-step: 1
55069+ * End:
55070+ */
55071Index: linux-2.6.16/fs/reiser4/plugin/item/tail.h
55072===================================================================
55073--- /dev/null
55074+++ linux-2.6.16/fs/reiser4/plugin/item/tail.h
55075@@ -0,0 +1,58 @@
55076+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
55077+
55078+#if !defined( __REISER4_TAIL_H__ )
55079+#define __REISER4_TAIL_H__
55080+
55081+typedef struct {
55082+ int not_used;
55083+} tail_coord_extension_t;
55084+
55085+struct cut_list;
55086+
55087+/* plugin->u.item.b.* */
55088+reiser4_key *max_key_inside_tail(const coord_t *, reiser4_key *);
55089+int can_contain_key_tail(const coord_t * coord, const reiser4_key * key,
55090+ const reiser4_item_data *);
55091+int mergeable_tail(const coord_t * p1, const coord_t * p2);
55092+pos_in_node_t nr_units_tail(const coord_t *);
55093+lookup_result lookup_tail(const reiser4_key *, lookup_bias, coord_t *);
55094+int paste_tail(coord_t *, reiser4_item_data *, carry_plugin_info *);
55095+int can_shift_tail(unsigned free_space, coord_t * source,
55096+ znode * target, shift_direction, unsigned *size,
55097+ unsigned want);
55098+void copy_units_tail(coord_t * target, coord_t * source, unsigned from,
55099+ unsigned count, shift_direction, unsigned free_space);
55100+int kill_hook_tail(const coord_t *, pos_in_node_t from, pos_in_node_t count,
55101+ struct carry_kill_data *);
55102+int cut_units_tail(coord_t *, pos_in_node_t from, pos_in_node_t to,
55103+ struct carry_cut_data *, reiser4_key * smallest_removed,
55104+ reiser4_key * new_first);
55105+int kill_units_tail(coord_t *, pos_in_node_t from, pos_in_node_t to,
55106+ struct carry_kill_data *, reiser4_key * smallest_removed,
55107+ reiser4_key * new_first);
55108+reiser4_key *unit_key_tail(const coord_t *, reiser4_key *);
55109+
55110+/* plugin->u.item.s.* */
55111+ssize_t write_tail(struct file *file, const char __user *buf, size_t count,
55112+ loff_t *pos);
55113+int read_tail(struct file *, flow_t *, hint_t *);
55114+int readpage_tail(void *vp, struct page *page);
55115+reiser4_key *append_key_tail(const coord_t *, reiser4_key *);
55116+void init_coord_extension_tail(uf_coord_t *, loff_t offset);
55117+int get_block_address_tail(const coord_t *, sector_t, sector_t *);
55118+int item_balance_dirty_pages(struct address_space *, const flow_t *,
55119+ hint_t *, int back_to_dirty, int set_hint);
55120+
55121+/* __REISER4_TAIL_H__ */
55122+#endif
55123+
55124+/* Make Linus happy.
55125+ Local variables:
55126+ c-indentation-style: "K&R"
55127+ mode-name: "LC"
55128+ c-basic-offset: 8
55129+ tab-width: 8
55130+ fill-column: 120
55131+ scroll-step: 1
55132+ End:
55133+*/
55134Index: linux-2.6.16/fs/reiser4/plugin/node/Makefile
55135===================================================================
55136--- /dev/null
55137+++ linux-2.6.16/fs/reiser4/plugin/node/Makefile
55138@@ -0,0 +1,5 @@
55139+obj-$(CONFIG_REISER4_FS) += node_plugins.o
55140+
55141+node_plugins-objs := \
55142+ node.o \
55143+ node40.o
55144Index: linux-2.6.16/fs/reiser4/plugin/node/node.c
55145===================================================================
55146--- /dev/null
55147+++ linux-2.6.16/fs/reiser4/plugin/node/node.c
55148@@ -0,0 +1,131 @@
55149+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
55150+
55151+/* Node plugin interface.
55152+
55153+ Description: The tree provides the abstraction of flows, which it
55154+ internally fragments into items which it stores in nodes.
55155+
55156+ A key_atom is a piece of data bound to a single key.
55157+
55158+ For reasonable space efficiency to be achieved it is often
55159+ necessary to store key_atoms in the nodes in the form of items, where
55160+ an item is a sequence of key_atoms of the same or similar type. It is
55161+ more space-efficient, because the item can implement (very)
55162+ efficient compression of key_atom's bodies using internal knowledge
55163+ about their semantics, and it can often avoid having a key for each
55164+ key_atom. Each type of item has specific operations implemented by its
55165+ item handler (see balance.c).
55166+
55167+ Rationale: the rest of the code (specifically balancing routines)
55168+ accesses leaf level nodes through this interface. This way we can
55169+ implement various block layouts and even combine various layouts
55170+ within the same tree. Balancing/allocating algorithms should not
55171+ care about peculiarities of splitting/merging specific item types,
55172+ but rather should leave that to the item's item handler.
55173+
55174+ Items, including those that provide the abstraction of flows, have
55175+ the property that if you move them in part or in whole to another
55176+ node, the balancing code invokes their is_left_mergeable()
55177+ item_operation to determine if they are mergeable with their new
55178+ neighbor in the node you have moved them to. For some items the
55179+ is_left_mergeable() function always returns false.
55180+
55181+ When moving the bodies of items from one node to another:
55182+
55183+ if a partial item is shifted to another node the balancing code invokes
55184+ an item handler method to handle the item splitting.
55185+
55186+ if the balancing code needs to merge with an item in the node it
55187+ is shifting to, it will invoke an item handler method to handle
55188+ the item merging.
55189+
55190+ if it needs to move whole item bodies unchanged, the balancing code uses xmemcpy()
55191+ and adjusts the item headers after the move using the node handler.
55192+*/
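/*
 * A minimal sketch (editorial aside, not part of the patch) of the
 * mergeability check described above, in the spirit of mergeable_tail()
 * from plugin/item/tail.c: two tail items are mergeable when they belong
 * to the same object and the right item continues the byte stream of the
 * left one. A real implementation would also compare item plugins; all
 * helpers used here appear elsewhere in this patch.
 */
static int example_tail_mergeable(const coord_t *left, const coord_t *right)
{
	reiser4_key lkey, rkey;

	item_key_by_coord(left, &lkey);
	item_key_by_coord(right, &rkey);

	/* items of different files never merge */
	if (get_key_objectid(&lkey) != get_key_objectid(&rkey))
		return 0;

	/* mergeable iff the right item starts where the left one ends */
	return get_key_offset(&rkey) ==
	    get_key_offset(&lkey) + item_length_by_coord(left);
}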
55193+
55194+#include "../../forward.h"
55195+#include "../../debug.h"
55196+#include "../../key.h"
55197+#include "../../coord.h"
55198+#include "../plugin_header.h"
55199+#include "../item/item.h"
55200+#include "node.h"
55201+#include "../plugin.h"
55202+#include "../../znode.h"
55203+#include "../../tree.h"
55204+#include "../../super.h"
55205+#include "../../reiser4.h"
55206+
55207+/**
55208+ * leftmost_key_in_node - get the smallest key in node
55209+ * @node:
55210+ * @key: store result here
55211+ *
55212+ * Stores the leftmost key of @node in @key.
55213+ */
55214+reiser4_key *leftmost_key_in_node(const znode *node, reiser4_key *key)
55215+{
55216+ assert("nikita-1634", node != NULL);
55217+ assert("nikita-1635", key != NULL);
55218+
55219+ if (!node_is_empty(node)) {
55220+ coord_t first_item;
55221+
55222+ coord_init_first_unit(&first_item, (znode *) node);
55223+ item_key_by_coord(&first_item, key);
55224+ } else
55225+ *key = *max_key();
55226+ return key;
55227+}
55228+
55229+node_plugin node_plugins[LAST_NODE_ID] = {
55230+ [NODE40_ID] = {
55231+ .h = {
55232+ .type_id = REISER4_NODE_PLUGIN_TYPE,
55233+ .id = NODE40_ID,
55234+ .pops = NULL,
55235+ .label = "unified",
55236+ .desc = "unified node layout",
55237+ .linkage = {NULL, NULL}
55238+ },
55239+ .item_overhead = item_overhead_node40,
55240+ .free_space = free_space_node40,
55241+ .lookup = lookup_node40,
55242+ .num_of_items = num_of_items_node40,
55243+ .item_by_coord = item_by_coord_node40,
55244+ .length_by_coord = length_by_coord_node40,
55245+ .plugin_by_coord = plugin_by_coord_node40,
55246+ .key_at = key_at_node40,
55247+ .estimate = estimate_node40,
55248+ .check = check_node40,
55249+ .parse = parse_node40,
55250+ .init = init_node40,
55251+#ifdef GUESS_EXISTS
55252+ .guess = guess_node40,
55253+#endif
55254+ .change_item_size = change_item_size_node40,
55255+ .create_item = create_item_node40,
55256+ .update_item_key = update_item_key_node40,
55257+ .cut_and_kill = kill_node40,
55258+ .cut = cut_node40,
55259+ .shift = shift_node40,
55260+ .shrink_item = shrink_item_node40,
55261+ .fast_insert = fast_insert_node40,
55262+ .fast_paste = fast_paste_node40,
55263+ .fast_cut = fast_cut_node40,
55264+ .max_item_size = max_item_size_node40,
55265+ .prepare_removal = prepare_removal_node40,
55266+ .set_item_plugin = set_item_plugin_node40
55267+ }
55268+};
55269+
55270+/*
55271+ Local variables:
55272+ c-indentation-style: "K&R"
55273+ mode-name: "LC"
55274+ c-basic-offset: 8
55275+ tab-width: 8
55276+ fill-column: 120
55277+ scroll-step: 1
55278+ End:
55279+*/
55280Index: linux-2.6.16/fs/reiser4/plugin/node/node.h
55281===================================================================
55282--- /dev/null
55283+++ linux-2.6.16/fs/reiser4/plugin/node/node.h
55284@@ -0,0 +1,272 @@
55285+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
55286+
55287+/* We need a definition of the default node layout here. */
55288+
55289+/* Generally speaking, it is best to have free space in the middle of the
55290+ node so that two sets of things can grow towards it, and to have the
55291+ item bodies on the left so that the last one of them grows into free
55292+ space. We optimize for the case where we append new items to the end
55293+ of the node, or grow the last item, because this optimization costs
55294+ nothing and massive insertion in increasing key order is a common
55295+ special case (and one of the cases in which a real user is most
55296+ likely to notice the delay).
55297+
55298+ formatted leaf default layout: (leaf1)
55299+
55300+ |node header:item bodies:free space:key + pluginid + item offset|
55301+
55302+ We grow towards the middle, optimizing layout for the case where we
55303+ append new items to the end of the node. The node header is fixed
55304+ length. Keys, and item offsets plus pluginids for the items
55305+ corresponding to them are in increasing key order, and are fixed
55306+ length. Item offsets are relative to start of node (16 bits creating
55307+ a node size limit of 64k, 12 bits might be a better choice....). Item
55308+ bodies are in decreasing key order. Item bodies have a variable size.
55309+ There is a one to one to one mapping of keys to item offsets to item
55310+ bodies. Item offsets consist of pointers to the zeroth byte of the
55311+ item body. Item length equals the start of the next item minus the
55312+ start of this item, except the zeroth item whose length equals the end
55313+ of the node minus the start of that item (plus a byte). In other
55314+ words, the item length is not recorded anywhere, and it does not need
55315+ to be since it is computable.
55316+
55317+ Leaf variable length items and keys layout : (lvar)
55318+
55319+ |node header:key offset + item offset + pluginid triplets:free space:key bodies:item bodies|
55320+
55321+ We grow towards the middle, optimizing layout for the case where we
55322+ append new items to the end of the node. The node header is fixed
55323+ length. Keys and item offsets for the items corresponding to them are
55324+ in increasing key order, and keys are variable length. Item offsets
55325+ are relative to start of node (16 bits). Item bodies are in
55326+ decreasing key order. Item bodies have a variable size. There is a
55327+ one to one to one mapping of keys to item offsets to item bodies.
55328+ Item offsets consist of pointers to the zeroth byte of the item body.
55329+ Item length equals the start of the next item's key minus the start of
55330+ this item, except the zeroth item whose length equals the end of the
55331+ node minus the start of that item (plus a byte).
55332+
55333+ leaf compressed keys layout: (lcomp)
55334+
55335+ |node header:key offset + key inherit + item offset pairs:free space:key bodies:item bodies|
55336+
55337+ We grow towards the middle, optimizing layout for the case where we
55338+ append new items to the end of the node. The node header is fixed
55339+ length. Keys and item offsets for the items corresponding to them are
55340+ in increasing key order, and keys are variable length. The "key
55341+ inherit" field indicates how much of the key prefix is identical to
55342+ the previous key (stem compression as described in "Managing
55343+ Gigabytes" is used). key_inherit is a one byte integer. The
55344+ intra-node searches performed through this layout are linear searches,
55345+ and this is theorized to not hurt performance much due to the high
55346+ cost of processor stalls on modern CPUs, and the small number of keys
55347+ in a single node. Item offsets are relative to start of node (16
55348+ bits). Item bodies are in decreasing key order. Item bodies have a
55349+ variable size. There is a one to one to one mapping of keys to item
55350+ offsets to item bodies. Item offsets consist of pointers to the
55351+ zeroth byte of the item body. Item length equals the start of the
55352+ next item minus the start of this item, except the zeroth item whose
55353+ length equals the end of the node minus the start of that item (plus a
55354+ byte). In other words, item length and key length is not recorded
55355+ anywhere, and it does not need to be since it is computable.
55356+
55357+ internal node default layout: (idef1)
55358+
55359+ just like leaf1 except that item bodies are either blocknrs of
55360+ children or extents, and moving them may require updating parent
55361+ pointers in the nodes that they point to.
55362+*/
55363+
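/*
 * Worked example (editorial aside, not part of the patch) of the "item
 * length is computable" property, with hypothetical offsets: suppose a
 * node holds two items, item 0 at offset 40, item 1 at offset 120, and
 * free_space_start == 300. Then
 *
 *	length(item 0) = offset(item 1) - offset(item 0) = 120 - 40 = 80
 *	length(item 1) = free_space_start - offset(item 1) = 300 - 120 = 180
 *
 * node40_item_length() in plugin/node/node40.c implements exactly this
 * arithmetic on adjacent item headers.
 */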
55364+/* There is an inherent 3-way tradeoff between optimizing and
55365+ exchanging disks between different architectures and code
55366+ complexity. This is optimal and simple and inexchangeable.
55367+ Someone else can do the code for exchanging disks and make it
55368+ complex. It would not be that hard. Using a node size other than PAGE_SIZE
55369+ might be suboptimal.
55370+*/
55371+
55372+#if !defined( __REISER4_NODE_H__ )
55373+#define __REISER4_NODE_H__
55374+
55375+#define LEAF40_NODE_SIZE PAGE_CACHE_SIZE
55376+
55377+#include "../../dformat.h"
55378+#include "../plugin_header.h"
55379+
55380+#include <linux/types.h>
55381+
55382+typedef enum {
55383+ NS_FOUND = 0,
55384+ NS_NOT_FOUND = -ENOENT
55385+} node_search_result;
55386+
55387+/* Maximal possible space overhead for creation of new item in a node */
55388+#define REISER4_NODE_MAX_OVERHEAD ( sizeof( reiser4_key ) + 32 )
55389+
55390+typedef enum {
55391+ REISER4_NODE_DKEYS = (1 << 0),
55392+ REISER4_NODE_TREE_STABLE = (1 << 1)
55393+} reiser4_node_check_flag;
55394+
55395+/* cut and cut_and_kill have too long a list of parameters. This structure is just to save some space on the stack */
55396+struct cut_list {
55397+ coord_t *from;
55398+ coord_t *to;
55399+ const reiser4_key *from_key;
55400+ const reiser4_key *to_key;
55401+ reiser4_key *smallest_removed;
55402+ carry_plugin_info *info;
55403+ __u32 flags;
55404+ struct inode *inode; /* this is to pass list of eflushed jnodes down to extent_kill_hook */
55405+ lock_handle *left;
55406+ lock_handle *right;
55407+};
55408+
55409+struct carry_cut_data;
55410+struct carry_kill_data;
55411+
55412+/* The responsibility of the node plugin is to store and give access
55413+ to the sequence of items within the node. */
55414+typedef struct node_plugin {
55415+ /* generic plugin fields */
55416+ plugin_header h;
55417+
55418+ /* calculates the amount of space that will be required to store an
55419+ item which is in addition to the space consumed by the item body.
55420+	   (the space consumed by the item body can be obtained by calling
55421+ item->estimate) */
55422+ size_t(*item_overhead) (const znode * node, flow_t * f);
55423+
55424+ /* returns free space by looking into node (i.e., without using
55425+ znode->free_space). */
55426+ size_t(*free_space) (znode * node);
55427+ /* search within the node for the one item which might
55428+ contain the key, invoking item->search_within to search within
55429+ that item to see if it is in there */
55430+ node_search_result(*lookup) (znode * node, const reiser4_key * key,
55431+ lookup_bias bias, coord_t * coord);
55432+ /* number of items in node */
55433+ int (*num_of_items) (const znode * node);
55434+
55435+ /* store information about item in @coord in @data */
55436+ /* break into several node ops, don't add any more uses of this before doing so */
55437+ /*int ( *item_at )( const coord_t *coord, reiser4_item_data *data ); */
55438+ char *(*item_by_coord) (const coord_t * coord);
55439+ int (*length_by_coord) (const coord_t * coord);
55440+ item_plugin *(*plugin_by_coord) (const coord_t * coord);
55441+
55442+ /* store item key in @key */
55443+ reiser4_key *(*key_at) (const coord_t * coord, reiser4_key * key);
55444+	/* conservatively estimate what size of unit can fit
55445+	   into the node. This estimation should be performed without
55446+ actually looking into the node's content (free space is saved in
55447+ znode). */
55448+ size_t(*estimate) (znode * node);
55449+
55450+ /* performs every consistency check the node plugin author could
55451+ imagine. Optional. */
55452+ int (*check) (const znode * node, __u32 flags, const char **error);
55453+
55454+ /* Called when node is read into memory and node plugin is
55455+ already detected. This should read some data into znode (like free
55456+ space counter) and, optionally, check data consistency.
55457+ */
55458+ int (*parse) (znode * node);
55459+ /* This method is called on a new node to initialise plugin specific
55460+ data (header, etc.) */
55461+ int (*init) (znode * node);
55462+ /* Check whether @node content conforms to this plugin format.
55463+ Probably only useful after support for old V3.x formats is added.
55464+ Uncomment after 4.0 only.
55465+ */
55466+ /* int ( *guess )( const znode *node ); */
55467+#if REISER4_DEBUG
55468+ void (*print) (const char *prefix, const znode * node, __u32 flags);
55469+#endif
55470+ /* change size of @item by @by bytes. @item->node has enough free
55471+ space. When @by > 0 - free space is appended to end of item. When
55472+	   @by < 0 - item is truncated - it is assumed that the last @by bytes of
55473+ the item are freed already */
55474+ void (*change_item_size) (coord_t * item, int by);
55475+
55476+ /* create new item @length bytes long in coord @target */
55477+ int (*create_item) (coord_t * target, const reiser4_key * key,
55478+ reiser4_item_data * data, carry_plugin_info * info);
55479+
55480+ /* update key of item. */
55481+ void (*update_item_key) (coord_t * target, const reiser4_key * key,
55482+ carry_plugin_info * info);
55483+
55484+ int (*cut_and_kill) (struct carry_kill_data *, carry_plugin_info *);
55485+ int (*cut) (struct carry_cut_data *, carry_plugin_info *);
55486+
55487+ /*
55488+ * shrink item pointed to by @coord by @delta bytes.
55489+ */
55490+ int (*shrink_item) (coord_t * coord, int delta);
55491+
55492+ /* copy as much as possible but not more than up to @stop from
55493+ @stop->node to @target. If (pend == append) then data from beginning of
55494+ @stop->node are copied to the end of @target. If (pend == prepend) then
55495+ data from the end of @stop->node are copied to the beginning of
55496+ @target. Copied data are removed from @stop->node. Information
55497+ about what to do on upper level is stored in @todo */
55498+ int (*shift) (coord_t * stop, znode * target, shift_direction pend,
55499+ int delete_node, int including_insert_coord,
55500+ carry_plugin_info * info);
55501+	/* return true if this node allows skipping carry() in some situations
55502+ (see fs/reiser4/tree.c:insert_by_coord()). Reiser3.x format
55503+ emulation doesn't.
55504+
55505+	   This speeds up insertions that do not require updates to the
55506+	   parent, by bypassing initialisation of carry() structures. It's
55507+	   believed that the majority of insertions will fit there.
55508+
55509+ */
55510+ int (*fast_insert) (const coord_t * coord);
55511+ int (*fast_paste) (const coord_t * coord);
55512+ int (*fast_cut) (const coord_t * coord);
55513+ /* this limits max size of item which can be inserted into a node and
55514+	   the number of bytes an item in a node may be appended with */
55515+ int (*max_item_size) (void);
55516+ int (*prepare_removal) (znode * empty, carry_plugin_info * info);
55517+	/* change plugin id of items which are already in a node. Currently it is used in tail conversion for regular
55518+ * files */
55519+ int (*set_item_plugin) (coord_t * coord, item_id);
55520+} node_plugin;
55521+
55522+typedef enum {
55523+ /* standard unified node layout used for both leaf and internal
55524+ nodes */
55525+ NODE40_ID,
55526+ LAST_NODE_ID
55527+} reiser4_node_id;
55528+
55529+extern reiser4_key *leftmost_key_in_node(const znode * node, reiser4_key * key);
55530+#if REISER4_DEBUG
55531+extern void print_node_content(const char *prefix, const znode * node,
55532+ __u32 flags);
55533+#endif
55534+
55535+extern void indent_znode(const znode * node);
55536+
55537+typedef struct common_node_header {
55538+ /*
55539+ * identifier of node plugin. Must be located at the very beginning of
55540+ * a node.
55541+ */
55542+ __le16 plugin_id;
55543+} common_node_header;
55544+
55545+/* __REISER4_NODE_H__ */
55546+#endif
55547+/*
55548+ * Local variables:
55549+ * c-indentation-style: "K&R"
55550+ * mode-name: "LC"
55551+ * c-basic-offset: 8
55552+ * tab-width: 8
55553+ * fill-column: 79
55554+ * scroll-step: 1
55555+ * End:
55556+ */
55557Index: linux-2.6.16/fs/reiser4/plugin/node/node40.c
55558===================================================================
55559--- /dev/null
55560+++ linux-2.6.16/fs/reiser4/plugin/node/node40.c
55561@@ -0,0 +1,2924 @@
55562+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
55563+
55564+#include "../../debug.h"
55565+#include "../../key.h"
55566+#include "../../coord.h"
55567+#include "../plugin_header.h"
55568+#include "../item/item.h"
55569+#include "node.h"
55570+#include "node40.h"
55571+#include "../plugin.h"
55572+#include "../../jnode.h"
55573+#include "../../znode.h"
55574+#include "../../pool.h"
55575+#include "../../carry.h"
55576+#include "../../tap.h"
55577+#include "../../tree.h"
55578+#include "../../super.h"
55579+#include "../../reiser4.h"
55580+
55581+#include <asm/uaccess.h>
55582+#include <linux/types.h>
55583+#include <linux/prefetch.h>
55584+
55585+/* leaf 40 format:
55586+
55587+ [node header | item 0, item 1, .., item N-1 | free space | item_head N-1, .. item_head 1, item head 0 ]
55588+ plugin_id (16) key
55589+ free_space (16) pluginid (16)
55590+ free_space_start (16) offset (16)
55591+ level (8)
55592+ num_items (16)
55593+ magic (32)
55594+ flush_id (64)
55595+*/
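/*
 * A sketch (editorial aside, not part of the patch) of the on-disk
 * structures the layout comment above describes, reconstructed from the
 * nh40_get_... and ih40_get_... accessors below. Field order and names are
 * assumptions; the authoritative definitions live in node40.h.
 */
struct example_node40_header {
	__le16 plugin_id;	 /* common_node_header, plugin/node/node.h */
	__le16 nr_items;	 /* see nh40_get_num_items() */
	__le16 free_space;	 /* see nh40_get_free_space() */
	__le16 free_space_start; /* see nh40_get_free_space_start() */
	__le32 magic;		 /* REISER4_NODE_MAGIC */
	__le32 mkfs_id;		 /* see nh40_set_mkfs_id() */
	__le64 flush_id;	 /* see nh40_get_flush_id() */
	__u8 level;		 /* see nh40_get_level() */
};

/* one per item, stored right-to-left at the very end of the node */
struct example_item_header40 {
	reiser4_key key;	 /* leftmost key of the item */
	__le16 plugin_id;	 /* item plugin disk id */
	__le16 offset;		 /* item body offset from start of node */
};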
55596+/* NIKITA-FIXME-HANS: I told you guys not less than 10 times to not call it r4fs. Change to "ReIs". */
55597+/* magic number that is stored in ->magic field of node header */
55598+static const __u32 REISER4_NODE_MAGIC = 0x52344653; /* (*(__u32 *)"R4FS"); */
55599+
55600+static int prepare_for_update(znode * left, znode * right,
55601+ carry_plugin_info * info);
55602+
55603+/* header of node of reiser40 format is at the beginning of node */
55604+static inline node40_header *node40_node_header(const znode * node /* node to
55605+ * query */ )
55606+{
55607+ assert("nikita-567", node != NULL);
55608+ assert("nikita-568", znode_page(node) != NULL);
55609+ assert("nikita-569", zdata(node) != NULL);
55610+ return (node40_header *) zdata(node);
55611+}
55612+
55613+/* functions to get/set fields of node40_header */
55614+#define nh40_get_magic(nh) le32_to_cpu(get_unaligned(&(nh)->magic))
55615+#define nh40_get_free_space(nh) le16_to_cpu(get_unaligned(&(nh)->free_space))
55616+#define nh40_get_free_space_start(nh) le16_to_cpu(get_unaligned(&(nh)->free_space_start))
55617+#define nh40_get_level(nh) get_unaligned(&(nh)->level)
55618+#define nh40_get_num_items(nh) le16_to_cpu(get_unaligned(&(nh)->nr_items))
55619+#define nh40_get_flush_id(nh) le64_to_cpu(get_unaligned(&(nh)->flush_id))
55620+
55621+#define nh40_set_magic(nh, value) put_unaligned(cpu_to_le32(value), &(nh)->magic)
55622+#define nh40_set_free_space(nh, value) put_unaligned(cpu_to_le16(value), &(nh)->free_space)
55623+#define nh40_set_free_space_start(nh, value) put_unaligned(cpu_to_le16(value), &(nh)->free_space_start)
55624+#define nh40_set_level(nh, value) put_unaligned(value, &(nh)->level)
55625+#define nh40_set_num_items(nh, value) put_unaligned(cpu_to_le16(value), &(nh)->nr_items)
55626+#define nh40_set_mkfs_id(nh, value) put_unaligned(cpu_to_le32(value), &(nh)->mkfs_id)
55627+
55628+
55629+/* plugin field of node header should be read/set by
55630+ plugin_by_disk_id/save_disk_plugin */
55631+
55632+/* array of item headers is at the end of node */
55633+static inline item_header40 *node40_ih_at(const znode * node, unsigned pos)
55634+{
55635+ return (item_header40 *) (zdata(node) + znode_size(node)) - pos - 1;
55636+}
55637+
55638+/* ( page_address( node -> pg ) + PAGE_CACHE_SIZE ) - pos - 1
55639+ */
55640+static inline item_header40 *node40_ih_at_coord(const coord_t * coord)
55641+{
55642+ return (item_header40 *) (zdata(coord->node) +
55643+ znode_size(coord->node)) - (coord->item_pos) -
55644+ 1;
55645+}
55646+
55647+/* functions to get/set fields of item_header40 */
55648+#define ih40_get_offset(ih) le16_to_cpu(get_unaligned(&(ih)->offset))
55649+
55650+#define ih40_set_offset(ih, value) put_unaligned(cpu_to_le16(value), &(ih)->offset)
55651+
55652+/* plugin field of item header should be read/set by
55653+ plugin_by_disk_id/save_disk_plugin */
55654+
55655+/* plugin methods */
55656+
55657+/* plugin->u.node.item_overhead
55658+ look for description of this method in plugin/node/node.h */
55659+size_t
55660+item_overhead_node40(const znode * node UNUSED_ARG, flow_t * f UNUSED_ARG)
55661+{
55662+ return sizeof(item_header40);
55663+}
55664+
55665+/* plugin->u.node.free_space
55666+ look for description of this method in plugin/node/node.h */
55667+size_t free_space_node40(znode * node)
55668+{
55669+ assert("nikita-577", node != NULL);
55670+ assert("nikita-578", znode_is_loaded(node));
55671+ assert("nikita-579", zdata(node) != NULL);
55672+
55673+ return nh40_get_free_space(node40_node_header(node));
55674+}
55675+
55676+/* private inline version of node40_num_of_items() for use in this file. This
55677+ is necessary, because address of node40_num_of_items() is taken and it is
55678+ never inlined as a result. */
55679+static inline short node40_num_of_items_internal(const znode * node)
55680+{
55681+ return nh40_get_num_items(node40_node_header(node));
55682+}
55683+
55684+#if REISER4_DEBUG
55685+static inline void check_num_items(const znode * node)
55686+{
55687+ assert("nikita-2749",
55688+ node40_num_of_items_internal(node) == node->nr_items);
55689+ assert("nikita-2746", znode_is_write_locked(node));
55690+}
55691+#else
55692+#define check_num_items(node) noop
55693+#endif
55694+
55695+/* plugin->u.node.num_of_items
55696+ look for description of this method in plugin/node/node.h */
55697+int num_of_items_node40(const znode * node)
55698+{
55699+ return node40_num_of_items_internal(node);
55700+}
55701+
55702+static void
55703+node40_set_num_items(znode * node, node40_header * nh, unsigned value)
55704+{
55705+ assert("nikita-2751", node != NULL);
55706+ assert("nikita-2750", nh == node40_node_header(node));
55707+
55708+ check_num_items(node);
55709+ nh40_set_num_items(nh, value);
55710+ node->nr_items = value;
55711+ check_num_items(node);
55712+}
55713+
55714+/* plugin->u.node.item_by_coord
55715+ look for description of this method in plugin/node/node.h */
55716+char *item_by_coord_node40(const coord_t * coord)
55717+{
55718+ item_header40 *ih;
55719+ char *p;
55720+
55721+ /* @coord is set to existing item */
55722+ assert("nikita-596", coord != NULL);
55723+ assert("vs-255", coord_is_existing_item(coord));
55724+
55725+ ih = node40_ih_at_coord(coord);
55726+ p = zdata(coord->node) + ih40_get_offset(ih);
55727+ return p;
55728+}
55729+
55730+/* plugin->u.node.length_by_coord
55731+ look for description of this method in plugin/node/node.h */
55732+int length_by_coord_node40(const coord_t * coord)
55733+{
55734+ item_header40 *ih;
55735+ int result;
55736+
55737+ /* @coord is set to existing item */
55738+ assert("vs-256", coord != NULL);
55739+ assert("vs-257", coord_is_existing_item(coord));
55740+
55741+ ih = node40_ih_at_coord(coord);
55742+ if ((int)coord->item_pos ==
55743+ node40_num_of_items_internal(coord->node) - 1)
55744+ result =
55745+ nh40_get_free_space_start(node40_node_header(coord->node)) -
55746+ ih40_get_offset(ih);
55747+ else
55748+ result = ih40_get_offset(ih - 1) - ih40_get_offset(ih);
55749+
55750+ return result;
55751+}
55752+
55753+static pos_in_node_t
55754+node40_item_length(const znode * node, pos_in_node_t item_pos)
55755+{
55756+ item_header40 *ih;
55757+ pos_in_node_t result;
55758+
55759+ /* @coord is set to existing item */
55760+ assert("vs-256", node != NULL);
55761+ assert("vs-257", node40_num_of_items_internal(node) > item_pos);
55762+
55763+ ih = node40_ih_at(node, item_pos);
55764+ if (item_pos == node40_num_of_items_internal(node) - 1)
55765+ result =
55766+ nh40_get_free_space_start(node40_node_header(node)) -
55767+ ih40_get_offset(ih);
55768+ else
55769+ result = ih40_get_offset(ih - 1) - ih40_get_offset(ih);
55770+
55771+ return result;
55772+}
55773+
55774+/* plugin->u.node.plugin_by_coord
55775+ look for description of this method in plugin/node/node.h */
55776+item_plugin *plugin_by_coord_node40(const coord_t * coord)
55777+{
55778+ item_header40 *ih;
55779+ item_plugin *result;
55780+
55781+ /* @coord is set to existing item */
55782+ assert("vs-258", coord != NULL);
55783+ assert("vs-259", coord_is_existing_item(coord));
55784+
55785+ ih = node40_ih_at_coord(coord);
55786+	/* pass NULL instead of the current tree. This is a time-critical call. */
55787+ result = item_plugin_by_disk_id(NULL, &ih->plugin_id);
55788+ return result;
55789+}
55790+
55791+/* plugin->u.node.key_at
55792+ look for description of this method in plugin/node/node.h */
55793+reiser4_key *key_at_node40(const coord_t * coord, reiser4_key * key)
55794+{
55795+ item_header40 *ih;
55796+
55797+ assert("nikita-1765", coord_is_existing_item(coord));
55798+
55799+ /* @coord is set to existing item */
55800+ ih = node40_ih_at_coord(coord);
55801+ memcpy(key, &ih->key, sizeof(reiser4_key));
55802+ return key;
55803+}
55804+
55805+/* VS-FIXME-HANS: please review whether the below are properly disabled when debugging is disabled */
55806+
55807+#define NODE_INCSTAT(n, counter) \
55808+ reiser4_stat_inc_at_level(znode_get_level(n), node.lookup.counter)
55809+
55810+#define NODE_ADDSTAT(n, counter, val) \
55811+ reiser4_stat_add_at_level(znode_get_level(n), node.lookup.counter, val)
55812+
55813+/* plugin->u.node.lookup
55814+ look for description of this method in plugin/node/node.h */
55815+node_search_result lookup_node40(znode * node /* node to query */ ,
55816+ const reiser4_key * key /* key to look for */ ,
55817+ lookup_bias bias /* search bias */ ,
55818+ coord_t * coord /* resulting coord */ )
55819+{
55820+ int left;
55821+ int right;
55822+ int found;
55823+ int items;
55824+
55825+ item_header40 *lefth;
55826+ item_header40 *righth;
55827+
55828+ item_plugin *iplug;
55829+ item_header40 *bstop;
55830+ item_header40 *ih;
55831+ cmp_t order;
55832+
55833+ assert("nikita-583", node != NULL);
55834+ assert("nikita-584", key != NULL);
55835+ assert("nikita-585", coord != NULL);
55836+ assert("nikita-2693", znode_is_any_locked(node));
55837+ cassert(REISER4_SEQ_SEARCH_BREAK > 2);
55838+
55839+ items = node_num_items(node);
55840+
55841+ if (unlikely(items == 0)) {
55842+ coord_init_first_unit(coord, node);
55843+ return NS_NOT_FOUND;
55844+ }
55845+
55846+ /* binary search for item that can contain given key */
55847+ left = 0;
55848+ right = items - 1;
55849+ coord->node = node;
55850+ coord_clear_iplug(coord);
55851+ found = 0;
55852+
55853+ lefth = node40_ih_at(node, left);
55854+ righth = node40_ih_at(node, right);
55855+
55856+ /* It is known that for small arrays sequential search is on average
55857+ more efficient than binary. This is because sequential search is
55858+	   coded as a tight loop that can be better optimized by compilers, and
55859+	   for small array sizes the gain from this optimization makes sequential
55860+	   search the winner. Another, maybe more important, reason is that
55861+	   sequential access is more CPU-cache friendly, whereas binary
55862+ search effectively destroys CPU caching.
55863+
55864+ Critical here is the notion of "smallness". Reasonable value of
55865+ REISER4_SEQ_SEARCH_BREAK can be found by playing with code in
55866+ fs/reiser4/ulevel/ulevel.c:test_search().
55867+
55868+ Don't try to further optimize sequential search by scanning from
55869+	   right to left in an attempt to use a more efficient loop-termination
55870+ condition (comparison with 0). This doesn't work.
55871+
55872+ */
55873+
55874+ while (right - left >= REISER4_SEQ_SEARCH_BREAK) {
55875+ int median;
55876+ item_header40 *medianh;
55877+
55878+ median = (left + right) / 2;
55879+ medianh = node40_ih_at(node, median);
55880+
55881+ assert("nikita-1084", median >= 0);
55882+ assert("nikita-1085", median < items);
55883+ switch (keycmp(key, &medianh->key)) {
55884+ case LESS_THAN:
55885+ right = median;
55886+ righth = medianh;
55887+ break;
55888+ default:
55889+ wrong_return_value("nikita-586", "keycmp");
55890+ case GREATER_THAN:
55891+ left = median;
55892+ lefth = medianh;
55893+ break;
55894+ case EQUAL_TO:
55895+ do {
55896+ --median;
55897+ /* headers are ordered from right to left */
55898+ ++medianh;
55899+ } while (median >= 0 && keyeq(key, &medianh->key));
55900+ right = left = median + 1;
55901+ ih = lefth = righth = medianh - 1;
55902+ found = 1;
55903+ break;
55904+ }
55905+ }
55906+ /* sequential scan. Item headers, and, therefore, keys are stored at
55907+ the rightmost part of a node from right to left. We are trying to
55908+ access memory from left to right, and hence, scan in _descending_
55909+ order of item numbers.
55910+ */
55911+ if (!found) {
55912+ for (left = right, ih = righth; left >= 0; ++ih, --left) {
55913+ cmp_t comparison;
55914+
55915+ prefetchkey(&(ih + 1)->key);
55916+ comparison = keycmp(&ih->key, key);
55917+ if (comparison == GREATER_THAN)
55918+ continue;
55919+ if (comparison == EQUAL_TO) {
55920+ found = 1;
55921+ do {
55922+ --left;
55923+ ++ih;
55924+ } while (left >= 0 && keyeq(&ih->key, key));
55925+ ++left;
55926+ --ih;
55927+ } else {
55928+ assert("nikita-1256", comparison == LESS_THAN);
55929+ }
55930+ break;
55931+ }
55932+ if (unlikely(left < 0))
55933+ left = 0;
55934+ }
55935+
55936+ assert("nikita-3212", right >= left);
55937+ assert("nikita-3214",
55938+ equi(found, keyeq(&node40_ih_at(node, left)->key, key)));
55939+
55940+ coord_set_item_pos(coord, left);
55941+ coord->unit_pos = 0;
55942+ coord->between = AT_UNIT;
55943+
55944+	/* key < leftmost key in a node, or node is corrupted and keys
55945+ are not sorted */
55946+ bstop = node40_ih_at(node, (unsigned)left);
55947+ order = keycmp(&bstop->key, key);
55948+ if (unlikely(order == GREATER_THAN)) {
55949+ if (unlikely(left != 0)) {
55950+ /* screw up */
55951+ warning("nikita-587", "Key less than %i key in a node",
55952+ left);
55953+ print_key("key", key);
55954+ print_key("min", &bstop->key);
55955+ print_coord_content("coord", coord);
55956+ return RETERR(-EIO);
55957+ } else {
55958+ coord->between = BEFORE_UNIT;
55959+ return NS_NOT_FOUND;
55960+ }
55961+ }
55962+ /* left <= key, ok */
55963+ iplug = item_plugin_by_disk_id(znode_get_tree(node), &bstop->plugin_id);
55964+
55965+ if (unlikely(iplug == NULL)) {
55966+ warning("nikita-588", "Unknown plugin %i",
55967+ le16_to_cpu(get_unaligned(&bstop->plugin_id)));
55968+ print_key("key", key);
55969+ print_coord_content("coord", coord);
55970+ return RETERR(-EIO);
55971+ }
55972+
55973+ coord_set_iplug(coord, iplug);
55974+
55975+ /* if exact key from item header was found by binary search, no
55976+ further checks are necessary. */
55977+ if (found) {
55978+ assert("nikita-1259", order == EQUAL_TO);
55979+ return NS_FOUND;
55980+ }
55981+ if (iplug->b.max_key_inside != NULL) {
55982+ reiser4_key max_item_key;
55983+
55984+ /* key > max_item_key --- outside of an item */
55985+ if (keygt(key, iplug->b.max_key_inside(coord, &max_item_key))) {
55986+ coord->unit_pos = 0;
55987+ coord->between = AFTER_ITEM;
55988+ /* FIXME-VS: key we are looking for does not fit into
55989+ found item. Return NS_NOT_FOUND then. Without that
55990+ the following case does not work: there is extent of
55991+			   file 10000, 10001. File 10000, 10002 has just been
55992+			   created. When writing to position 0 in that file,
55993+			   traverse_tree will stop here on the twig level, whereas
55994+			   we want it to go down to the leaf level
55995+ */
55996+ return NS_NOT_FOUND;
55997+ }
55998+ }
55999+
56000+ if (iplug->b.lookup != NULL) {
56001+ return iplug->b.lookup(key, bias, coord);
56002+ } else {
56003+ assert("nikita-1260", order == LESS_THAN);
56004+ coord->between = AFTER_UNIT;
56005+ return (bias == FIND_EXACT) ? NS_NOT_FOUND : NS_FOUND;
56006+ }
56007+}
56008+
56009+#undef NODE_ADDSTAT
56010+#undef NODE_INCSTAT
56011+
56012+/* plugin->u.node.estimate
56013+ look for description of this method in plugin/node/node.h */
56014+size_t estimate_node40(znode * node)
56015+{
56016+ size_t result;
56017+
56018+ assert("nikita-597", node != NULL);
56019+
56020+ result = free_space_node40(node) - sizeof(item_header40);
56021+
56022+ return (result > 0) ? result : 0;
56023+}
56024+
56025+/* plugin->u.node.check
56026+ look for description of this method in plugin/node/node.h */
56027+int check_node40(const znode * node /* node to check */ ,
56028+ __u32 flags /* check flags */ ,
56029+ const char **error /* where to store error message */ )
56030+{
56031+ int nr_items;
56032+ int i;
56033+ reiser4_key prev;
56034+ unsigned old_offset;
56035+ tree_level level;
56036+ coord_t coord;
56037+ int result;
56038+
56039+ assert("nikita-580", node != NULL);
56040+ assert("nikita-581", error != NULL);
56041+ assert("nikita-2948", znode_is_loaded(node));
56042+
56043+ if (ZF_ISSET(node, JNODE_HEARD_BANSHEE))
56044+ return 0;
56045+
56046+ assert("nikita-582", zdata(node) != NULL);
56047+
56048+ nr_items = node40_num_of_items_internal(node);
56049+ if (nr_items < 0) {
56050+ *error = "Negative number of items";
56051+ return -1;
56052+ }
56053+
56054+ if (flags & REISER4_NODE_DKEYS)
56055+ prev = *znode_get_ld_key((znode *) node);
56056+ else
56057+ prev = *min_key();
56058+
56059+ old_offset = 0;
56060+ coord_init_zero(&coord);
56061+ coord.node = (znode *) node;
56062+ coord.unit_pos = 0;
56063+ coord.between = AT_UNIT;
56064+ level = znode_get_level(node);
56065+ for (i = 0; i < nr_items; i++) {
56066+ item_header40 *ih;
56067+ reiser4_key unit_key;
56068+ unsigned j;
56069+
56070+ ih = node40_ih_at(node, (unsigned)i);
56071+ coord_set_item_pos(&coord, i);
56072+ if ((ih40_get_offset(ih) >=
56073+ znode_size(node) - nr_items * sizeof(item_header40)) ||
56074+ (ih40_get_offset(ih) < sizeof(node40_header))) {
56075+ *error = "Offset is out of bounds";
56076+ return -1;
56077+ }
56078+ if (ih40_get_offset(ih) <= old_offset) {
56079+ *error = "Offsets are in wrong order";
56080+ return -1;
56081+ }
56082+ if ((i == 0) && (ih40_get_offset(ih) != sizeof(node40_header))) {
56083+ *error = "Wrong offset of first item";
56084+ return -1;
56085+ }
56086+ old_offset = ih40_get_offset(ih);
56087+
56088+ if (keygt(&prev, &ih->key)) {
56089+ *error = "Keys are in wrong order";
56090+ return -1;
56091+ }
56092+ if (!keyeq(&ih->key, unit_key_by_coord(&coord, &unit_key))) {
56093+ *error = "Wrong key of first unit";
56094+ return -1;
56095+ }
56096+ prev = ih->key;
56097+ for (j = 0; j < coord_num_units(&coord); ++j) {
56098+ coord.unit_pos = j;
56099+ unit_key_by_coord(&coord, &unit_key);
56100+ if (keygt(&prev, &unit_key)) {
56101+ *error = "Unit keys are in wrong order";
56102+ return -1;
56103+ }
56104+ prev = unit_key;
56105+ }
56106+ coord.unit_pos = 0;
56107+ if (level != TWIG_LEVEL && item_is_extent(&coord)) {
56108+ *error = "extent on the wrong level";
56109+ return -1;
56110+ }
56111+ if (level == LEAF_LEVEL && item_is_internal(&coord)) {
56112+ *error = "internal item on the wrong level";
56113+ return -1;
56114+ }
56115+ if (level != LEAF_LEVEL &&
56116+ !item_is_internal(&coord) && !item_is_extent(&coord)) {
56117+ *error = "wrong item on the internal level";
56118+ return -1;
56119+ }
56120+ if (level > TWIG_LEVEL && !item_is_internal(&coord)) {
56121+ *error = "non-internal item on the internal level";
56122+ return -1;
56123+ }
56124+#if REISER4_DEBUG
56125+ if (item_plugin_by_coord(&coord)->b.check
56126+ && item_plugin_by_coord(&coord)->b.check(&coord, error))
56127+ return -1;
56128+#endif
56129+ if (i) {
56130+ coord_t prev_coord;
56131+			/* two neighboring items cannot be mergeable */
56132+ coord_dup(&prev_coord, &coord);
56133+ coord_prev_item(&prev_coord);
56134+ if (are_items_mergeable(&prev_coord, &coord)) {
56135+ *error = "mergeable items in one node";
56136+ return -1;
56137+ }
56139+		}
56140+ }
56141+
56142+ if ((flags & REISER4_NODE_DKEYS) && !node_is_empty(node)) {
56143+ coord_t coord;
56144+ item_plugin *iplug;
56145+
56146+ coord_init_last_unit(&coord, node);
56147+ iplug = item_plugin_by_coord(&coord);
56148+ if ((item_is_extent(&coord) || item_is_tail(&coord)) &&
56149+ iplug->s.file.append_key != NULL) {
56150+ reiser4_key mkey;
56151+
56152+ iplug->s.file.append_key(&coord, &mkey);
56153+ set_key_offset(&mkey, get_key_offset(&mkey) - 1);
56154+ read_lock_dk(current_tree);
56155+ result = keygt(&mkey, znode_get_rd_key((znode *) node));
56156+ read_unlock_dk(current_tree);
56157+ if (result) {
56158+ *error = "key of rightmost item is too large";
56159+ return -1;
56160+ }
56161+ }
56162+ }
56163+ if (flags & REISER4_NODE_DKEYS) {
56164+ read_lock_tree(current_tree);
56165+ read_lock_dk(current_tree);
56166+
56167+ flags |= REISER4_NODE_TREE_STABLE;
56168+
56169+ if (keygt(&prev, znode_get_rd_key((znode *) node))) {
56170+ if (flags & REISER4_NODE_TREE_STABLE) {
56171+ *error = "Last key is greater than rdkey";
56172+ read_unlock_dk(current_tree);
56173+ read_unlock_tree(current_tree);
56174+ return -1;
56175+ }
56176+ }
56177+ if (keygt
56178+ (znode_get_ld_key((znode *) node),
56179+ znode_get_rd_key((znode *) node))) {
56180+ *error = "ldkey is greater than rdkey";
56181+ read_unlock_dk(current_tree);
56182+ read_unlock_tree(current_tree);
56183+ return -1;
56184+ }
56185+ if (ZF_ISSET(node, JNODE_LEFT_CONNECTED) &&
56186+ (node->left != NULL) &&
56187+ !ZF_ISSET(node->left, JNODE_HEARD_BANSHEE) &&
56188+ ergo(flags & REISER4_NODE_TREE_STABLE,
56189+ !keyeq(znode_get_rd_key(node->left),
56190+ znode_get_ld_key((znode *) node)))
56191+ && ergo(!(flags & REISER4_NODE_TREE_STABLE),
56192+ keygt(znode_get_rd_key(node->left),
56193+ znode_get_ld_key((znode *) node)))) {
56194+ *error = "left rdkey or ldkey is wrong";
56195+ read_unlock_dk(current_tree);
56196+ read_unlock_tree(current_tree);
56197+ return -1;
56198+ }
56199+ if (ZF_ISSET(node, JNODE_RIGHT_CONNECTED) &&
56200+ (node->right != NULL) &&
56201+ !ZF_ISSET(node->right, JNODE_HEARD_BANSHEE) &&
56202+ ergo(flags & REISER4_NODE_TREE_STABLE,
56203+ !keyeq(znode_get_rd_key((znode *) node),
56204+ znode_get_ld_key(node->right)))
56205+ && ergo(!(flags & REISER4_NODE_TREE_STABLE),
56206+ keygt(znode_get_rd_key((znode *) node),
56207+ znode_get_ld_key(node->right)))) {
56208+ *error = "rdkey or right ldkey is wrong";
56209+ read_unlock_dk(current_tree);
56210+ read_unlock_tree(current_tree);
56211+ return -1;
56212+ }
56213+
56214+ read_unlock_dk(current_tree);
56215+ read_unlock_tree(current_tree);
56216+ }
56217+
56218+ return 0;
56219+}
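+
+/* To summarize, the checks above enforce the node40 invariants: item
+   offsets increase strictly, stay within
+   [sizeof(node40_header), znode_size - nr_items * sizeof(item_header40)),
+   and the first item starts right after the node header; item and unit
+   keys never decrease; extents appear on the twig level only; internal
+   items never appear on leaves; non-leaf levels hold only internal and
+   extent items; no two adjacent items are mergeable; and, with
+   REISER4_NODE_DKEYS, all keys agree with the node's own delimiting keys
+   and with those of its left and right neighbors. */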
56220+
56221+/* plugin->u.node.parse
56222+ look for description of this method in plugin/node/node.h */
56223+int parse_node40(znode * node /* node to parse */ )
56224+{
56225+ node40_header *header;
56226+ int result;
56227+ d8 level;
56228+
56229+ header = node40_node_header((znode *) node);
56230+ result = -EIO;
56231+ level = nh40_get_level(header);
56232+ if (unlikely(((__u8) znode_get_level(node)) != level))
56233+ warning("nikita-494", "Wrong level found in node: %i != %i",
56234+ znode_get_level(node), level);
56235+ else if (unlikely(nh40_get_magic(header) != REISER4_NODE_MAGIC))
56236+ warning("nikita-495",
56237+ "Wrong magic in tree node: want %x, got %x",
56238+ REISER4_NODE_MAGIC, nh40_get_magic(header));
56239+ else {
56240+ node->nr_items = node40_num_of_items_internal(node);
56241+ result = 0;
56242+ }
56243+ if (unlikely(result != 0))
56244+ /* print_znode("node", node) */ ;
56245+ return RETERR(result);
56246+}
56247+
56248+/* plugin->u.node.init
56249+ look for description of this method in plugin/node/node.h */
56250+int init_node40(znode * node /* node to initialise */ )
56251+{
56252+ node40_header *header;
56253+
56254+ assert("nikita-570", node != NULL);
56255+ assert("nikita-572", zdata(node) != NULL);
56256+
56257+ header = node40_node_header(node);
56258+ memset(header, 0, sizeof(node40_header));
56259+ nh40_set_free_space(header, znode_size(node) - sizeof(node40_header));
56260+ nh40_set_free_space_start(header, sizeof(node40_header));
56261+ /* sane hypothesis: 0 in CPU format is 0 in disk format */
56262+ /* items: 0 */
56263+ save_plugin_id(node_plugin_to_plugin(node->nplug),
56264+ &header->common_header.plugin_id);
56265+ nh40_set_level(header, znode_get_level(node));
56266+ nh40_set_magic(header, REISER4_NODE_MAGIC);
56267+ node->nr_items = 0;
56268+ nh40_set_mkfs_id(header, reiser4_mkfs_id(reiser4_get_current_sb()));
56269+
56270+ /* flags: 0 */
56271+ return 0;
56272+}
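+
+/* After init_node40() the header describes an empty node:
+
+       free_space_start == sizeof(node40_header)
+       free_space       == znode_size(node) - sizeof(node40_header)
+       node->nr_items   == 0
+
+   so the first create_item_node40() will place its item body exactly at
+   sizeof(node40_header), which is what check_node40() later verifies for
+   the first item's offset. */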
56273+
56274+#ifdef GUESS_EXISTS
56275+int guess_node40(const znode * node /* node to guess plugin of */ )
56276+{
56277+ node40_header *nethack;
56278+
56279+ assert("nikita-1058", node != NULL);
56280+ nethack = node40_node_header(node);
56281+ return
56282+ (nh40_get_magic(nethack) == REISER4_NODE_MAGIC) &&
56283+ (plugin_by_disk_id(znode_get_tree(node),
56284+ REISER4_NODE_PLUGIN_TYPE,
56285+ &nethack->common_header.plugin_id)->h.id ==
56286+ NODE40_ID);
56287+}
56288+#endif
56289+
56290+/* plugin->u.node.change_item_size
56291+ look for description of this method in plugin/node/node.h */
56292+void change_item_size_node40(coord_t * coord, int by)
56293+{
56294+ node40_header *nh;
56295+ item_header40 *ih;
56296+ char *item_data;
56297+ int item_length;
56298+ unsigned i;
56299+
56300+	/* make sure that @coord is a coord of an existing item */
56301+ assert("vs-210", coord_is_existing_item(coord));
56302+
56303+ nh = node40_node_header(coord->node);
56304+
56305+ item_data = item_by_coord_node40(coord);
56306+ item_length = length_by_coord_node40(coord);
56307+
56308+ /* move item bodies */
56309+ ih = node40_ih_at_coord(coord);
56310+ memmove(item_data + item_length + by, item_data + item_length,
56311+ nh40_get_free_space_start(node40_node_header(coord->node)) -
56312+ (ih40_get_offset(ih) + item_length));
56313+
56314+ /* update offsets of moved items */
56315+ for (i = coord->item_pos + 1; i < nh40_get_num_items(nh); i++) {
56316+ ih = node40_ih_at(coord->node, i);
56317+ ih40_set_offset(ih, ih40_get_offset(ih) + by);
56318+ }
56319+
56320+ /* update node header */
56321+ nh40_set_free_space(nh, nh40_get_free_space(nh) - by);
56322+ nh40_set_free_space_start(nh, nh40_get_free_space_start(nh) + by);
56323+}
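+
+/* A worked example for the above (all numbers illustrative): suppose a node
+   holds three items at offsets 40, 100 and 200, free_space_start is 300,
+   and the middle item grows by @by == 16.  The memmove() slides the bytes
+   in [200, 300) up to [216, 316), the loop bumps the third item's offset
+   from 200 to 216, free_space_start becomes 316 and free_space shrinks by
+   16.  A negative @by shrinks the item and moves everything back down. */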
56324+
56325+static int should_notify_parent(const znode * node)
56326+{
56327+ /* FIXME_JMACD This looks equivalent to znode_is_root(), right? -josh */
56328+ return !disk_addr_eq(znode_get_block(node),
56329+ &znode_get_tree(node)->root_block);
56330+}
56331+
56332+/* plugin->u.node.create_item
56333+ look for description of this method in plugin/node/node.h */
56334+int
56335+create_item_node40(coord_t *target, const reiser4_key *key,
56336+ reiser4_item_data *data, carry_plugin_info *info)
56337+{
56338+ node40_header *nh;
56339+ item_header40 *ih;
56340+ unsigned offset;
56341+ unsigned i;
56342+
56343+ nh = node40_node_header(target->node);
56344+
56345+ assert("vs-212", coord_is_between_items(target));
56346+ /* node must have enough free space */
56347+ assert("vs-254",
56348+ free_space_node40(target->node) >=
56349+ data->length + sizeof(item_header40));
56350+ assert("vs-1410", data->length >= 0);
56351+
56352+ if (coord_set_to_right(target))
56353+		/* there are no items to the right of @target, so the new
56354+		   item will be inserted after the last one */
56355+ coord_set_item_pos(target, nh40_get_num_items(nh));
56356+
56357+ if (target->item_pos < nh40_get_num_items(nh)) {
56358+ /* there are items to be moved to prepare space for new
56359+ item */
56360+ ih = node40_ih_at_coord(target);
56361+ /* new item will start at this offset */
56362+ offset = ih40_get_offset(ih);
56363+
56364+ memmove(zdata(target->node) + offset + data->length,
56365+ zdata(target->node) + offset,
56366+ nh40_get_free_space_start(nh) - offset);
56367+ /* update headers of moved items */
56368+ for (i = target->item_pos; i < nh40_get_num_items(nh); i++) {
56369+ ih = node40_ih_at(target->node, i);
56370+ ih40_set_offset(ih, ih40_get_offset(ih) + data->length);
56371+ }
56372+
56373+ /* @ih is set to item header of the last item, move item headers */
56374+ memmove(ih - 1, ih,
56375+ sizeof(item_header40) * (nh40_get_num_items(nh) -
56376+ target->item_pos));
56377+ } else {
56378+ /* new item will start at this offset */
56379+ offset = nh40_get_free_space_start(nh);
56380+ }
56381+
56382+ /* make item header for the new item */
56383+ ih = node40_ih_at_coord(target);
56384+ memcpy(&ih->key, key, sizeof(reiser4_key));
56385+ ih40_set_offset(ih, offset);
56386+ save_plugin_id(item_plugin_to_plugin(data->iplug), &ih->plugin_id);
56387+
56388+ /* update node header */
56389+ nh40_set_free_space(nh,
56390+ nh40_get_free_space(nh) - data->length -
56391+ sizeof(item_header40));
56392+ nh40_set_free_space_start(nh,
56393+ nh40_get_free_space_start(nh) + data->length);
56394+ node40_set_num_items(target->node, nh, nh40_get_num_items(nh) + 1);
56395+
56396+	/* FIXME: check how create_item works when between is set to BEFORE_UNIT */
56397+ target->unit_pos = 0;
56398+ target->between = AT_UNIT;
56399+ coord_clear_iplug(target);
56400+
56401+ /* initialize item */
56402+ if (data->iplug->b.init != NULL) {
56403+ data->iplug->b.init(target, NULL, data);
56404+ }
56405+ /* copy item body */
56406+ if (data->iplug->b.paste != NULL) {
56407+ data->iplug->b.paste(target, data, info);
56408+ } else if (data->data != NULL) {
56409+ if (data->user) {
56410+			/* AUDIT: should we not check that the pointer from
56411+			   userspace is valid and the data bytes are available?
56412+			   Without such a check, how would we ever return
56413+			   -EFAULT? */
56414+ assert("nikita-3038", schedulable());
56415+ /* copy data from user space */
56416+ __copy_from_user(zdata(target->node) + offset,
56417+ (const char __user *)data->data,
56418+ (unsigned)data->length);
56419+ } else
56420+ /* copy from kernel space */
56421+ memcpy(zdata(target->node) + offset, data->data,
56422+ (unsigned)data->length);
56423+ }
56424+
56425+ if (target->item_pos == 0) {
56426+ /* left delimiting key has to be updated */
56427+ prepare_for_update(NULL, target->node, info);
56428+ }
56429+
56430+ if (item_plugin_by_coord(target)->b.create_hook != NULL) {
56431+ item_plugin_by_coord(target)->b.create_hook(target, data->arg);
56432+ }
56433+
56434+ return 0;
56435+}
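+
+/* Bookkeeping summary for an insertion (illustrative): inserting an item of
+   length L consumes L + sizeof(item_header40) bytes of free space, advances
+   free_space_start by L (the new header is carved out of the other end of
+   the free area) and increments the item count.  When the insertion point
+   is not past the last item, the item bodies above it and their headers
+   are first shifted to make room - the two memmove() calls above. */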
56436+
56437+/* plugin->u.node.update_item_key
56438+ look for description of this method in plugin/node/node.h */
56439+void
56440+update_item_key_node40(coord_t * target, const reiser4_key * key,
56441+ carry_plugin_info * info)
56442+{
56443+ item_header40 *ih;
56444+
56445+ ih = node40_ih_at_coord(target);
56446+ memcpy(&ih->key, key, sizeof(reiser4_key));
56447+
56448+ if (target->item_pos == 0) {
56449+ prepare_for_update(NULL, target->node, info);
56450+ }
56451+}
56452+
56453+/* these bits encode the cut mode */
56454+#define CMODE_TAIL 1
56455+#define CMODE_WHOLE 2
56456+#define CMODE_HEAD 4
56457+
56458+struct cut40_info {
56459+ int mode;
56460+ pos_in_node_t tail_removed; /* position of item which gets tail removed */
56461+	pos_in_node_t first_removed;	/* position of the leftmost item removed completely */
56462+ pos_in_node_t removed_count; /* number of items removed completely */
56463+ pos_in_node_t head_removed; /* position of item which gets head removed */
56464+
56465+ pos_in_node_t freed_space_start;
56466+ pos_in_node_t freed_space_end;
56467+ pos_in_node_t first_moved;
56468+ pos_in_node_t head_removed_location;
56469+};
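+
+/* Example of how these fields describe a cut (positions hypothetical):
+   cutting a key range that starts inside item 2 and covers items 3..5
+   completely yields
+
+       mode          == CMODE_TAIL | CMODE_WHOLE
+       tail_removed  == 2
+       first_removed == 3
+       removed_count == 3
+
+   with head_removed left at MAX_POS_IN_NODE, the sentinel init_cinfo()
+   stores in every field that does not take part in a given cut. */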
56470+
56471+static void init_cinfo(struct cut40_info *cinfo)
56472+{
56473+ cinfo->mode = 0;
56474+ cinfo->tail_removed = MAX_POS_IN_NODE;
56475+ cinfo->first_removed = MAX_POS_IN_NODE;
56476+ cinfo->removed_count = MAX_POS_IN_NODE;
56477+ cinfo->head_removed = MAX_POS_IN_NODE;
56478+ cinfo->freed_space_start = MAX_POS_IN_NODE;
56479+ cinfo->freed_space_end = MAX_POS_IN_NODE;
56480+ cinfo->first_moved = MAX_POS_IN_NODE;
56481+ cinfo->head_removed_location = MAX_POS_IN_NODE;
56482+}
56483+
56484+/* complete cut_node40/kill_node40 by removing the gap the removal left in the node */
56485+static void compact(znode * node, struct cut40_info *cinfo)
56486+{
56487+ node40_header *nh;
56488+ item_header40 *ih;
56489+ pos_in_node_t freed;
56490+ pos_in_node_t pos, nr_items;
56491+
56492+ assert("vs-1526", (cinfo->freed_space_start != MAX_POS_IN_NODE &&
56493+ cinfo->freed_space_end != MAX_POS_IN_NODE &&
56494+ cinfo->first_moved != MAX_POS_IN_NODE));
56495+ assert("vs-1523", cinfo->freed_space_end >= cinfo->freed_space_start);
56496+
56497+ nh = node40_node_header(node);
56498+ nr_items = nh40_get_num_items(nh);
56499+
56500+ /* remove gap made up by removal */
56501+ memmove(zdata(node) + cinfo->freed_space_start,
56502+ zdata(node) + cinfo->freed_space_end,
56503+ nh40_get_free_space_start(nh) - cinfo->freed_space_end);
56504+
56505+ /* update item headers of moved items - change their locations */
56506+ pos = cinfo->first_moved;
56507+ ih = node40_ih_at(node, pos);
56508+ if (cinfo->head_removed_location != MAX_POS_IN_NODE) {
56509+ assert("vs-1580", pos == cinfo->head_removed);
56510+ ih40_set_offset(ih, cinfo->head_removed_location);
56511+ pos++;
56512+ ih--;
56513+ }
56514+
56515+ freed = cinfo->freed_space_end - cinfo->freed_space_start;
56516+ for (; pos < nr_items; pos++, ih--) {
56517+ assert("vs-1581", ih == node40_ih_at(node, pos));
56518+ ih40_set_offset(ih, ih40_get_offset(ih) - freed);
56519+ }
56520+
56521+	/* free space start moved to the left */
56522+ nh40_set_free_space_start(nh, nh40_get_free_space_start(nh) - freed);
56523+
56524+ if (cinfo->removed_count != MAX_POS_IN_NODE) {
56525+ /* number of items changed. Remove item headers of those items */
56526+ ih = node40_ih_at(node, nr_items - 1);
56527+ memmove(ih + cinfo->removed_count, ih,
56528+ sizeof(item_header40) * (nr_items -
56529+ cinfo->removed_count -
56530+ cinfo->first_removed));
56531+ freed += sizeof(item_header40) * cinfo->removed_count;
56532+ node40_set_num_items(node, nh, nr_items - cinfo->removed_count);
56533+ }
56534+
56535+ /* total amount of free space increased */
56536+ nh40_set_free_space(nh, nh40_get_free_space(nh) + freed);
56537+}
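+
+/* A worked example of compact() (numbers illustrative): if the cut freed
+   the byte range [120, 180) in a node whose free_space_start was 400, the
+   memmove() slides the bytes in [180, 400) down to [120, 340), every moved
+   item's offset drops by freed == 60, and free_space_start becomes 340.
+   When whole items were removed, their item_header40 slots are squeezed
+   out of the header array as well, and the reclaimed header bytes are
+   also credited to free space. */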
56538+
56539+int shrink_item_node40(coord_t * coord, int delta)
56540+{
56541+ node40_header *nh;
56542+ item_header40 *ih;
56543+ pos_in_node_t pos;
56544+ pos_in_node_t nr_items;
56545+ char *end;
56546+ znode *node;
56547+ int off;
56548+
56549+ assert("nikita-3487", coord != NULL);
56550+ assert("nikita-3488", delta >= 0);
56551+
56552+ node = coord->node;
56553+ nh = node40_node_header(node);
56554+ nr_items = nh40_get_num_items(nh);
56555+
56556+ ih = node40_ih_at_coord(coord);
56557+ assert("nikita-3489", delta <= length_by_coord_node40(coord));
56558+ off = ih40_get_offset(ih) + length_by_coord_node40(coord);
56559+ end = zdata(node) + off;
56560+
56561+ /* remove gap made up by removal */
56562+ memmove(end - delta, end, nh40_get_free_space_start(nh) - off);
56563+
56564+ /* update item headers of moved items - change their locations */
56565+ pos = coord->item_pos + 1;
56566+ ih = node40_ih_at(node, pos);
56567+ for (; pos < nr_items; pos++, ih--) {
56568+ assert("nikita-3490", ih == node40_ih_at(node, pos));
56569+ ih40_set_offset(ih, ih40_get_offset(ih) - delta);
56570+ }
56571+
56572+ /* free space start moved to left */
56573+ nh40_set_free_space_start(nh, nh40_get_free_space_start(nh) - delta);
56574+ /* total amount of free space increased */
56575+ nh40_set_free_space(nh, nh40_get_free_space(nh) + delta);
56576+	/*
56577+	 * This method does _not_ change the number of items.  Hence, it
56578+	 * cannot make the node empty.  It does not remove items at all,
56579+	 * which means that no keys have to be updated either.
56580+	 */
56581+ return 0;
56582+}
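+
+/* shrink_item_node40() is in effect a degenerate compact(): the freed gap
+   sits at the very end of a single item, so only the bodies and offsets of
+   the items after it move.  E.g. shrinking an item by delta == 8 slides
+   everything after it 8 bytes down and returns 8 bytes to free space,
+   while, as noted above, neither the item count nor any key changes. */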
56583+
56584+/* this is used by cut_node40 and kill_node40. It analyses the input and
56585+   calculates the cut mode. There are two types of cut: a unit removed from
56586+   the middle of a single item, for which this function returns 1; everything
56587+   else (0 or 1 tail cuts, 0 or more whole items, 0 or 1 head cuts) returns 0 */
56588+static int
56589+parse_cut(struct cut40_info *cinfo, const struct cut_kill_params *params)
56590+{
56591+ reiser4_key left_key, right_key;
56592+ reiser4_key min_from_key, max_to_key;
56593+ const reiser4_key *from_key, *to_key;
56594+
56595+ init_cinfo(cinfo);
56596+
56597+ /* calculate minimal key stored in first item of items to be cut (params->from) */
56598+ item_key_by_coord(params->from, &min_from_key);
56599+ /* and max key stored in last item of items to be cut (params->to) */
56600+ max_item_key_by_coord(params->to, &max_to_key);
56601+
56602+ /* if cut key range is not defined in input parameters - define it using cut coord range */
56603+ if (params->from_key == NULL) {
56604+ assert("vs-1513", params->to_key == NULL);
56605+ unit_key_by_coord(params->from, &left_key);
56606+ from_key = &left_key;
56607+ max_unit_key_by_coord(params->to, &right_key);
56608+ to_key = &right_key;
56609+ } else {
56610+ from_key = params->from_key;
56611+ to_key = params->to_key;
56612+ }
56613+
56614+ if (params->from->item_pos == params->to->item_pos) {
56615+ if (keylt(&min_from_key, from_key)
56616+ && keylt(to_key, &max_to_key))
56617+ return 1;
56618+
56619+ if (keygt(from_key, &min_from_key)) {
56620+			/* tail of item is to be cut */
56621+ cinfo->tail_removed = params->from->item_pos;
56622+ cinfo->mode |= CMODE_TAIL;
56623+ } else if (keylt(to_key, &max_to_key)) {
56624+ /* head of item is to be cut */
56625+ cinfo->head_removed = params->from->item_pos;
56626+ cinfo->mode |= CMODE_HEAD;
56627+ } else {
56628+ /* item is removed completely */
56629+ cinfo->first_removed = params->from->item_pos;
56630+ cinfo->removed_count = 1;
56631+ cinfo->mode |= CMODE_WHOLE;
56632+ }
56633+ } else {
56634+ cinfo->first_removed = params->from->item_pos + 1;
56635+ cinfo->removed_count =
56636+ params->to->item_pos - params->from->item_pos - 1;
56637+
56638+ if (keygt(from_key, &min_from_key)) {
56639+ /* first item is not cut completely */
56640+ cinfo->tail_removed = params->from->item_pos;
56641+ cinfo->mode |= CMODE_TAIL;
56642+ } else {
56643+ cinfo->first_removed--;
56644+ cinfo->removed_count++;
56645+ }
56646+ if (keylt(to_key, &max_to_key)) {
56647+ /* last item is not cut completely */
56648+ cinfo->head_removed = params->to->item_pos;
56649+ cinfo->mode |= CMODE_HEAD;
56650+ } else {
56651+ cinfo->removed_count++;
56652+ }
56653+ if (cinfo->removed_count)
56654+ cinfo->mode |= CMODE_WHOLE;
56655+ }
56656+
56657+ return 0;
56658+}
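+
+/* An example of the "return 1" case (positions hypothetical): cutting
+   units 3..5 out of a 10-unit item leaves both the item's first and last
+   units in place, so min_from_key < from_key and to_key < max_to_key hold
+   at the same time and parse_cut() reports a cut from the middle of a
+   single item.  prepare_for_compact() then calls kill_units()/cut_units()
+   directly instead of going through the mode switch. */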
56659+
56660+static void
56661+call_kill_hooks(znode * node, pos_in_node_t from, pos_in_node_t count,
56662+ carry_kill_data * kdata)
56663+{
56664+ coord_t coord;
56665+ item_plugin *iplug;
56666+ pos_in_node_t pos;
56667+
56668+ coord.node = node;
56669+ coord.unit_pos = 0;
56670+ coord.between = AT_UNIT;
56671+ for (pos = 0; pos < count; pos++) {
56672+ coord_set_item_pos(&coord, from + pos);
56673+ coord.unit_pos = 0;
56674+ coord.between = AT_UNIT;
56675+ iplug = item_plugin_by_coord(&coord);
56676+ if (iplug->b.kill_hook) {
56677+ iplug->b.kill_hook(&coord, 0, coord_num_units(&coord),
56678+ kdata);
56679+ }
56680+ }
56681+}
56682+
56683+/* this is used to kill item partially */
56684+static pos_in_node_t
56685+kill_units(coord_t * coord, pos_in_node_t from, pos_in_node_t to, void *data,
56686+ reiser4_key * smallest_removed, reiser4_key * new_first_key)
56687+{
56688+ struct carry_kill_data *kdata;
56689+ item_plugin *iplug;
56690+
56691+ kdata = data;
56692+ iplug = item_plugin_by_coord(coord);
56693+
56694+ assert("vs-1524", iplug->b.kill_units);
56695+ return iplug->b.kill_units(coord, from, to, kdata, smallest_removed,
56696+ new_first_key);
56697+}
56698+
56699+/* call item plugin to cut the tail of an item */
56700+static pos_in_node_t
56701+kill_tail(coord_t * coord, void *data, reiser4_key * smallest_removed)
56702+{
56703+ struct carry_kill_data *kdata;
56704+ pos_in_node_t to;
56705+
56706+ kdata = data;
56707+ to = coord_last_unit_pos(coord);
56708+ return kill_units(coord, coord->unit_pos, to, kdata, smallest_removed,
56709+ NULL);
56710+}
56711+
56712+/* call item plugin to cut head of item */
56713+static pos_in_node_t
56714+kill_head(coord_t * coord, void *data, reiser4_key * smallest_removed,
56715+ reiser4_key * new_first_key)
56716+{
56717+ return kill_units(coord, 0, coord->unit_pos, data, smallest_removed,
56718+ new_first_key);
56719+}
56720+
56721+/* this is used to cut item partially */
56722+static pos_in_node_t
56723+cut_units(coord_t * coord, pos_in_node_t from, pos_in_node_t to, void *data,
56724+ reiser4_key * smallest_removed, reiser4_key * new_first_key)
56725+{
56726+ carry_cut_data *cdata;
56727+ item_plugin *iplug;
56728+
56729+ cdata = data;
56730+ iplug = item_plugin_by_coord(coord);
56731+ assert("vs-302", iplug->b.cut_units);
56732+ return iplug->b.cut_units(coord, from, to, cdata, smallest_removed,
56733+ new_first_key);
56734+}
56735+
56736+/* call item plugin to cut the tail of an item */
56737+static pos_in_node_t
56738+cut_tail(coord_t * coord, void *data, reiser4_key * smallest_removed)
56739+{
56740+ carry_cut_data *cdata;
56741+ pos_in_node_t to;
56742+
56743+ cdata = data;
56744+ to = coord_last_unit_pos(cdata->params.from);
56745+ return cut_units(coord, coord->unit_pos, to, data, smallest_removed, NULL);
56746+}
56747+
56748+/* call item plugin to cut head of item */
56749+static pos_in_node_t
56750+cut_head(coord_t * coord, void *data, reiser4_key * smallest_removed,
56751+ reiser4_key * new_first_key)
56752+{
56753+ return cut_units(coord, 0, coord->unit_pos, data, smallest_removed,
56754+ new_first_key);
56755+}
56756+
56757+/* this returns 1 if the key of the first item changed, 0 if it did not */
56758+static int
56759+prepare_for_compact(struct cut40_info *cinfo,
56760+ const struct cut_kill_params *params, int is_cut,
56761+ void *data, carry_plugin_info * info)
56762+{
56763+ znode *node;
56764+ item_header40 *ih;
56765+ pos_in_node_t freed;
56766+ pos_in_node_t item_pos;
56767+ coord_t coord;
56768+ reiser4_key new_first_key;
56769+ pos_in_node_t(*kill_units_f) (coord_t *, pos_in_node_t, pos_in_node_t,
56770+ void *, reiser4_key *, reiser4_key *);
56771+ pos_in_node_t(*kill_tail_f) (coord_t *, void *, reiser4_key *);
56772+ pos_in_node_t(*kill_head_f) (coord_t *, void *, reiser4_key *,
56773+ reiser4_key *);
56774+ int retval;
56775+
56776+ retval = 0;
56777+
56778+ node = params->from->node;
56779+
56780+ assert("vs-184", node == params->to->node);
56781+ assert("vs-312", !node_is_empty(node));
56782+ assert("vs-297",
56783+ coord_compare(params->from, params->to) != COORD_CMP_ON_RIGHT);
56784+
56785+ if (is_cut) {
56786+ kill_units_f = cut_units;
56787+ kill_tail_f = cut_tail;
56788+ kill_head_f = cut_head;
56789+ } else {
56790+ kill_units_f = kill_units;
56791+ kill_tail_f = kill_tail;
56792+ kill_head_f = kill_head;
56793+ }
56794+
56795+ if (parse_cut(cinfo, params) == 1) {
56796+ /* cut from the middle of item */
56797+ freed =
56798+ kill_units_f(params->from, params->from->unit_pos,
56799+ params->to->unit_pos, data,
56800+ params->smallest_removed, NULL);
56801+
56802+ item_pos = params->from->item_pos;
56803+ ih = node40_ih_at(node, item_pos);
56804+ cinfo->freed_space_start =
56805+ ih40_get_offset(ih) + node40_item_length(node,
56806+ item_pos) - freed;
56807+ cinfo->freed_space_end = cinfo->freed_space_start + freed;
56808+ cinfo->first_moved = item_pos + 1;
56809+ } else {
56810+ assert("vs-1521", (cinfo->tail_removed != MAX_POS_IN_NODE ||
56811+ cinfo->first_removed != MAX_POS_IN_NODE ||
56812+ cinfo->head_removed != MAX_POS_IN_NODE));
56813+
56814+ switch (cinfo->mode) {
56815+ case CMODE_TAIL:
56816+ /* one item gets cut partially from its end */
56817+ assert("vs-1562",
56818+ cinfo->tail_removed == params->from->item_pos);
56819+
56820+ freed =
56821+ kill_tail_f(params->from, data,
56822+ params->smallest_removed);
56823+
56824+ item_pos = cinfo->tail_removed;
56825+ ih = node40_ih_at(node, item_pos);
56826+ cinfo->freed_space_start =
56827+ ih40_get_offset(ih) + node40_item_length(node,
56828+ item_pos) -
56829+ freed;
56830+ cinfo->freed_space_end =
56831+ cinfo->freed_space_start + freed;
56832+ cinfo->first_moved = cinfo->tail_removed + 1;
56833+ break;
56834+
56835+ case CMODE_WHOLE:
56836+ /* one or more items get removed completely */
56837+ assert("vs-1563",
56838+ cinfo->first_removed == params->from->item_pos);
56839+ assert("vs-1564", cinfo->removed_count > 0
56840+ && cinfo->removed_count != MAX_POS_IN_NODE);
56841+
56842+ /* call kill hook for all items removed completely */
56843+ if (is_cut == 0)
56844+ call_kill_hooks(node, cinfo->first_removed,
56845+ cinfo->removed_count, data);
56846+
56847+ item_pos = cinfo->first_removed;
56848+ ih = node40_ih_at(node, item_pos);
56849+
56850+ if (params->smallest_removed)
56851+ memcpy(params->smallest_removed, &ih->key,
56852+ sizeof(reiser4_key));
56853+
56854+ cinfo->freed_space_start = ih40_get_offset(ih);
56855+
56856+ item_pos += (cinfo->removed_count - 1);
56857+ ih -= (cinfo->removed_count - 1);
56858+ cinfo->freed_space_end =
56859+ ih40_get_offset(ih) + node40_item_length(node,
56860+ item_pos);
56861+ cinfo->first_moved = item_pos + 1;
56862+ if (cinfo->first_removed == 0)
56863+ /* key of first item of the node changes */
56864+ retval = 1;
56865+ break;
56866+
56867+ case CMODE_HEAD:
56868+ /* one item gets cut partially from its head */
56869+ assert("vs-1565",
56870+ cinfo->head_removed == params->from->item_pos);
56871+
56872+ freed =
56873+ kill_head_f(params->to, data,
56874+ params->smallest_removed,
56875+ &new_first_key);
56876+
56877+ item_pos = cinfo->head_removed;
56878+ ih = node40_ih_at(node, item_pos);
56879+ cinfo->freed_space_start = ih40_get_offset(ih);
56880+ cinfo->freed_space_end = ih40_get_offset(ih) + freed;
56881+ cinfo->first_moved = cinfo->head_removed + 1;
56882+
56883+ /* item head is removed, therefore, item key changed */
56884+ coord.node = node;
56885+ coord_set_item_pos(&coord, item_pos);
56886+ coord.unit_pos = 0;
56887+ coord.between = AT_UNIT;
56888+ update_item_key_node40(&coord, &new_first_key, NULL);
56889+ if (item_pos == 0)
56890+ /* key of first item of the node changes */
56891+ retval = 1;
56892+ break;
56893+
56894+ case CMODE_TAIL | CMODE_WHOLE:
56895+ /* one item gets cut from its end and one or more items get removed completely */
56896+ assert("vs-1566",
56897+ cinfo->tail_removed == params->from->item_pos);
56898+ assert("vs-1567",
56899+ cinfo->first_removed == cinfo->tail_removed + 1);
56900+ assert("vs-1564", cinfo->removed_count > 0
56901+ && cinfo->removed_count != MAX_POS_IN_NODE);
56902+
56903+ freed =
56904+ kill_tail_f(params->from, data,
56905+ params->smallest_removed);
56906+
56907+ item_pos = cinfo->tail_removed;
56908+ ih = node40_ih_at(node, item_pos);
56909+ cinfo->freed_space_start =
56910+ ih40_get_offset(ih) + node40_item_length(node,
56911+ item_pos) -
56912+ freed;
56913+
56914+ /* call kill hook for all items removed completely */
56915+ if (is_cut == 0)
56916+ call_kill_hooks(node, cinfo->first_removed,
56917+ cinfo->removed_count, data);
56918+
56919+ item_pos += cinfo->removed_count;
56920+ ih -= cinfo->removed_count;
56921+ cinfo->freed_space_end =
56922+ ih40_get_offset(ih) + node40_item_length(node,
56923+ item_pos);
56924+ cinfo->first_moved = item_pos + 1;
56925+ break;
56926+
56927+ case CMODE_WHOLE | CMODE_HEAD:
56928+ /* one or more items get removed completely and one item gets cut partially from its head */
56929+ assert("vs-1568",
56930+ cinfo->first_removed == params->from->item_pos);
56931+ assert("vs-1564", cinfo->removed_count > 0
56932+ && cinfo->removed_count != MAX_POS_IN_NODE);
56933+ assert("vs-1569",
56934+ cinfo->head_removed ==
56935+ cinfo->first_removed + cinfo->removed_count);
56936+
56937+ /* call kill hook for all items removed completely */
56938+ if (is_cut == 0)
56939+ call_kill_hooks(node, cinfo->first_removed,
56940+ cinfo->removed_count, data);
56941+
56942+ item_pos = cinfo->first_removed;
56943+ ih = node40_ih_at(node, item_pos);
56944+
56945+ if (params->smallest_removed)
56946+ memcpy(params->smallest_removed, &ih->key,
56947+ sizeof(reiser4_key));
56948+
56949+ freed =
56950+ kill_head_f(params->to, data, NULL, &new_first_key);
56951+
56952+ cinfo->freed_space_start = ih40_get_offset(ih);
56953+
56954+ ih = node40_ih_at(node, cinfo->head_removed);
56955+			/* this is the most complex case: the item whose head was
56956+			   removed and the items moved intact change their locations differently. */
56957+ cinfo->freed_space_end = ih40_get_offset(ih) + freed;
56958+ cinfo->first_moved = cinfo->head_removed;
56959+ cinfo->head_removed_location = cinfo->freed_space_start;
56960+
56961+ /* item head is removed, therefore, item key changed */
56962+ coord.node = node;
56963+ coord_set_item_pos(&coord, cinfo->head_removed);
56964+ coord.unit_pos = 0;
56965+ coord.between = AT_UNIT;
56966+ update_item_key_node40(&coord, &new_first_key, NULL);
56967+
56968+ assert("vs-1579", cinfo->first_removed == 0);
56969+ /* key of first item of the node changes */
56970+ retval = 1;
56971+ break;
56972+
56973+ case CMODE_TAIL | CMODE_HEAD:
56974+			/* one item gets its tail cut and its neighbor gets its head cut */
56975+ impossible("vs-1576", "this can not happen currently");
56976+ break;
56977+
56978+ case CMODE_TAIL | CMODE_WHOLE | CMODE_HEAD:
56979+ impossible("vs-1577", "this can not happen currently");
56980+ break;
56981+ default:
56982+ impossible("vs-1578", "unexpected cut mode");
56983+ break;
56984+ }
56985+ }
56986+ return retval;
56987+}
56988+
56989+/* plugin->u.node.kill
56990+ return value is number of items removed completely */
56991+int kill_node40(struct carry_kill_data *kdata, carry_plugin_info * info)
56992+{
56993+ znode *node;
56994+ struct cut40_info cinfo;
56995+ int first_key_changed;
56996+
56997+ node = kdata->params.from->node;
56998+
56999+ first_key_changed =
57000+ prepare_for_compact(&cinfo, &kdata->params, 0 /* not cut */ , kdata,
57001+ info);
57002+ compact(node, &cinfo);
57003+
57004+ if (info) {
57005+ /* it is not called by node40_shift, so we have to take care
57006+ of changes on upper levels */
57007+ if (node_is_empty(node)
57008+ && !(kdata->flags & DELETE_RETAIN_EMPTY))
57009+			/* all contents of the node are deleted */
57010+ prepare_removal_node40(node, info);
57011+ else if (first_key_changed) {
57012+ prepare_for_update(NULL, node, info);
57013+ }
57014+ }
57015+
57016+ coord_clear_iplug(kdata->params.from);
57017+ coord_clear_iplug(kdata->params.to);
57018+
57019+ znode_make_dirty(node);
57020+ return cinfo.removed_count == MAX_POS_IN_NODE ? 0 : cinfo.removed_count;
57021+}
57022+
57023+/* plugin->u.node.cut
57024+ return value is number of items removed completely */
57025+int cut_node40(struct carry_cut_data *cdata, carry_plugin_info * info)
57026+{
57027+ znode *node;
57028+ struct cut40_info cinfo;
57029+ int first_key_changed;
57030+
57031+ node = cdata->params.from->node;
57032+
57033+ first_key_changed =
57034+	    prepare_for_compact(&cinfo, &cdata->params, 1 /* cut */ , cdata,
57035+ info);
57036+ compact(node, &cinfo);
57037+
57038+ if (info) {
57039+ /* it is not called by node40_shift, so we have to take care
57040+ of changes on upper levels */
57041+ if (node_is_empty(node))
57042+			/* all contents of the node are deleted */
57043+ prepare_removal_node40(node, info);
57044+ else if (first_key_changed) {
57045+ prepare_for_update(NULL, node, info);
57046+ }
57047+ }
57048+
57049+ coord_clear_iplug(cdata->params.from);
57050+ coord_clear_iplug(cdata->params.to);
57051+
57052+ znode_make_dirty(node);
57053+ return cinfo.removed_count == MAX_POS_IN_NODE ? 0 : cinfo.removed_count;
57054+}
57055+
57056+/* this structure is used by shift method of node40 plugin */
57057+struct shift_params {
57058+	shift_direction pend;	/* when @pend == append we are shifting to the
57059+				   left; when @pend == prepend, to the right */
57060+ coord_t wish_stop; /* when shifting to left this is last unit we
57061+ want shifted, when shifting to right - this
57062+ is set to unit we want to start shifting
57063+ from */
57064+ znode *target;
57065+ int everything; /* it is set to 1 if everything we have to shift is
57066+ shifted, 0 - otherwise */
57067+
57068+ /* FIXME-VS: get rid of read_stop */
57069+
57070+ /* these are set by estimate_shift */
57071+ coord_t real_stop; /* this will be set to last unit which will be
57072+ really shifted */
57073+
57074+	/* coordinate in the source node, before the operation, of the unit which
57075+	   becomes first after a shift to the left or last after a shift to the right */
57076+ union {
57077+ coord_t future_first;
57078+ coord_t future_last;
57079+ } u;
57080+
57081+ unsigned merging_units; /* number of units of first item which have to
57082+ be merged with last item of target node */
57083+ unsigned merging_bytes; /* number of bytes in those units */
57084+
57085+ unsigned entire; /* items shifted in their entirety */
57086+ unsigned entire_bytes; /* number of bytes in those items */
57087+
57088+ unsigned part_units; /* number of units of partially copied item */
57089+ unsigned part_bytes; /* number of bytes in those units */
57090+
57091+ unsigned shift_bytes; /* total number of bytes in items shifted (item
57092+ headers not included) */
57093+
57094+};
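+
+/* Example of a filled-in shift_params (numbers illustrative): a shift to
+   the left where 3 units of the source's first item merge into the
+   target's last item, 2 items move in their entirety and 1 more item is
+   split gives merging_units == 3, entire == 2 and part_units == 1, with
+   shift_bytes == merging_bytes + entire_bytes + part_bytes - the identity
+   that copy() asserts ("vs-185") before touching any data. */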
57095+
57096+static int item_creation_overhead(coord_t *item)
57097+{
57098+ return node_plugin_by_coord(item)->item_overhead(item->node, NULL);
57099+}
57100+
57101+/* how many units are there in @source starting from source->unit_pos
57102+ but not further than @stop_coord */
57103+static int
57104+wanted_units(coord_t *source, coord_t *stop_coord, shift_direction pend)
57105+{
57106+ if (pend == SHIFT_LEFT) {
57107+ assert("vs-181", source->unit_pos == 0);
57108+ } else {
57109+ assert("vs-182",
57110+ source->unit_pos == coord_last_unit_pos(source));
57111+ }
57112+
57113+ if (source->item_pos != stop_coord->item_pos) {
57114+ /* @source and @stop_coord are different items */
57115+ return coord_last_unit_pos(source) + 1;
57116+ }
57117+
57118+ if (pend == SHIFT_LEFT) {
57119+ return stop_coord->unit_pos + 1;
57120+ } else {
57121+ return source->unit_pos - stop_coord->unit_pos + 1;
57122+ }
57123+}
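+
+/* Examples (positions hypothetical): shifting left with @stop_coord at
+   unit 4 of the same item, wanted_units() returns 5 - units 0..4.
+   Shifting right from unit 9 with @stop_coord at unit 6, it returns
+   9 - 6 + 1 == 4 - units 6..9.  If @stop_coord lies in a different item,
+   the whole item, coord_last_unit_pos() + 1 units, is wanted. */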
57124+
57125+/* this calculates what can be copied from @shift->wish_stop.node to
57126+ @shift->target */
57127+static void
57128+estimate_shift(struct shift_params *shift, const reiser4_context * ctx)
57129+{
57130+ unsigned target_free_space, size;
57131+ pos_in_node_t stop_item; /* item which estimating should not consider */
57132+ unsigned want; /* number of units of item we want shifted */
57133+ coord_t source; /* item being estimated */
57134+ item_plugin *iplug;
57135+
57136+ /* shifting to left/right starts from first/last units of
57137+ @shift->wish_stop.node */
57138+ if (shift->pend == SHIFT_LEFT) {
57139+ coord_init_first_unit(&source, shift->wish_stop.node);
57140+ } else {
57141+ coord_init_last_unit(&source, shift->wish_stop.node);
57142+ }
57143+ shift->real_stop = source;
57144+
57145+	/* free space in the target node */
57146+ target_free_space = znode_free_space(shift->target);
57147+
57148+ shift->everything = 0;
57149+ if (!node_is_empty(shift->target)) {
57150+ /* target node is not empty, check for boundary items
57151+ mergeability */
57152+ coord_t to;
57153+
57154+ /* item we try to merge @source with */
57155+ if (shift->pend == SHIFT_LEFT) {
57156+ coord_init_last_unit(&to, shift->target);
57157+ } else {
57158+ coord_init_first_unit(&to, shift->target);
57159+ }
57160+
57161+ if ((shift->pend == SHIFT_LEFT) ? are_items_mergeable(&to,
57162+ &source) :
57163+ are_items_mergeable(&source, &to)) {
57164+ /* how many units of @source do we want to merge to
57165+ item @to */
57166+ want =
57167+ wanted_units(&source, &shift->wish_stop,
57168+ shift->pend);
57169+
57170+ /* how many units of @source we can merge to item
57171+ @to */
57172+ iplug = item_plugin_by_coord(&source);
57173+ if (iplug->b.can_shift != NULL)
57174+ shift->merging_units =
57175+ iplug->b.can_shift(target_free_space,
57176+ &source, shift->target,
57177+ shift->pend, &size,
57178+ want);
57179+ else {
57180+ shift->merging_units = 0;
57181+ size = 0;
57182+ }
57183+ shift->merging_bytes = size;
57184+ shift->shift_bytes += size;
57185+ /* update stop coord to be set to last unit of @source
57186+ we can merge to @target */
57187+ if (shift->merging_units)
57188+ /* at least one unit can be shifted */
57189+ shift->real_stop.unit_pos =
57190+ (shift->merging_units - source.unit_pos -
57191+ 1) * shift->pend;
57192+ else {
57193+ /* nothing can be shifted */
57194+ if (shift->pend == SHIFT_LEFT)
57195+ coord_init_before_first_item(&shift->
57196+ real_stop,
57197+ source.
57198+ node);
57199+ else
57200+ coord_init_after_last_item(&shift->
57201+ real_stop,
57202+ source.node);
57203+ }
57204+ assert("nikita-2081", shift->real_stop.unit_pos + 1);
57205+
57206+ if (shift->merging_units != want) {
57207+ /* we could not copy as many as we want, so,
57208+ there is no reason for estimating any
57209+ longer */
57210+ return;
57211+ }
57212+
57213+ target_free_space -= size;
57214+ coord_add_item_pos(&source, shift->pend);
57215+ }
57216+ }
57217+
57218+	/* position of the item nothing of which we want shifted */
57219+ stop_item = shift->wish_stop.item_pos + shift->pend;
57220+
57221+	/* calculate how many items can be copied into the given free
57222+	   space in their entirety */
57223+ for (; source.item_pos != stop_item;
57224+ coord_add_item_pos(&source, shift->pend)) {
57225+ if (shift->pend == SHIFT_RIGHT)
57226+ source.unit_pos = coord_last_unit_pos(&source);
57227+
57228+ /* how many units of @source do we want to copy */
57229+ want = wanted_units(&source, &shift->wish_stop, shift->pend);
57230+
57231+ if (want == coord_last_unit_pos(&source) + 1) {
57232+ /* we want this item to be copied entirely */
57233+ size =
57234+ item_length_by_coord(&source) +
57235+ item_creation_overhead(&source);
57236+ if (size <= target_free_space) {
57237+ /* item fits into target node as whole */
57238+ target_free_space -= size;
57239+ shift->shift_bytes +=
57240+ size - item_creation_overhead(&source);
57241+ shift->entire_bytes +=
57242+ size - item_creation_overhead(&source);
57243+ shift->entire++;
57244+
57245+ /* update shift->real_stop coord to be set to
57246+ last unit of @source we can merge to
57247+ @target */
57248+ shift->real_stop = source;
57249+ if (shift->pend == SHIFT_LEFT)
57250+ shift->real_stop.unit_pos =
57251+ coord_last_unit_pos(&shift->
57252+ real_stop);
57253+ else
57254+ shift->real_stop.unit_pos = 0;
57255+ continue;
57256+ }
57257+ }
57258+
57259+		/* we reach here only for an item which does not fit into the
57260+		   target node in its entirety. This item may be either
57261+		   partially shifted, or not shifted at all. We will have to
57262+		   create a new item in the target node, so decrease the
57263+		   amount of free space by the item creation overhead. We can
57264+		   also reach here if the stop coord is in this item */
57265+ if (target_free_space >=
57266+ (unsigned)item_creation_overhead(&source)) {
57267+ target_free_space -= item_creation_overhead(&source);
57268+ iplug = item_plugin_by_coord(&source);
57269+ if (iplug->b.can_shift) {
57270+ shift->part_units = iplug->b.can_shift(target_free_space,
57271+ &source,
57272+ NULL, /* target */
57273+ shift->pend,
57274+ &size,
57275+ want);
57276+ } else {
57277+ target_free_space = 0;
57278+ shift->part_units = 0;
57279+ size = 0;
57280+ }
57281+ } else {
57282+ target_free_space = 0;
57283+ shift->part_units = 0;
57284+ size = 0;
57285+ }
57286+ shift->part_bytes = size;
57287+ shift->shift_bytes += size;
57288+
57289+ /* set @shift->real_stop to last unit of @source we can merge
57290+ to @shift->target */
57291+ if (shift->part_units) {
57292+ shift->real_stop = source;
57293+ shift->real_stop.unit_pos =
57294+ (shift->part_units - source.unit_pos -
57295+ 1) * shift->pend;
57296+ assert("nikita-2082", shift->real_stop.unit_pos + 1);
57297+ }
57298+
57299+ if (want != shift->part_units)
57300+			/* not everything wanted was shifted */
57301+ return;
57302+ break;
57303+ }
57304+
57305+ shift->everything = 1;
57306+}
57307+
57308+static void
57309+copy_units(coord_t * target, coord_t * source, unsigned from, unsigned count,
57310+ shift_direction dir, unsigned free_space)
57311+{
57312+ item_plugin *iplug;
57313+
57314+ assert("nikita-1463", target != NULL);
57315+ assert("nikita-1464", source != NULL);
57316+ assert("nikita-1465", from + count <= coord_num_units(source));
57317+
57318+ iplug = item_plugin_by_coord(source);
57319+ assert("nikita-1468", iplug == item_plugin_by_coord(target));
57320+ iplug->b.copy_units(target, source, from, count, dir, free_space);
57321+
57322+ if (dir == SHIFT_RIGHT) {
57323+		/* FIXME-VS: this looks unnecessary. update_item_key was
57324+		   already called by the copy_units method */
57325+ reiser4_key split_key;
57326+
57327+ assert("nikita-1469", target->unit_pos == 0);
57328+
57329+ unit_key_by_coord(target, &split_key);
57330+ node_plugin_by_coord(target)->update_item_key(target,
57331+ &split_key, NULL);
57332+ }
57333+}
57334+
57335+/* copy part of @shift->real_stop.node starting either from its beginning or
57336+ from its end and ending at @shift->real_stop to either the end or the
57337+ beginning of @shift->target */
57338+static void copy(struct shift_params *shift)
57339+{
57340+ node40_header *nh;
57341+ coord_t from;
57342+ coord_t to;
57343+ item_header40 *from_ih, *to_ih;
57344+ int free_space_start;
57345+ int new_items;
57346+ unsigned old_items;
57347+ int old_offset;
57348+ unsigned i;
57349+
57350+ nh = node40_node_header(shift->target);
57351+ free_space_start = nh40_get_free_space_start(nh);
57352+ old_items = nh40_get_num_items(nh);
57353+ new_items = shift->entire + (shift->part_units ? 1 : 0);
57354+ assert("vs-185",
57355+ shift->shift_bytes ==
57356+ shift->merging_bytes + shift->entire_bytes + shift->part_bytes);
57357+
57358+ from = shift->wish_stop;
57359+
57360+ coord_init_first_unit(&to, shift->target);
57361+
57362+ /* NOTE:NIKITA->VS not sure what I am doing: shift->target is empty,
57363+ hence to.between is set to EMPTY_NODE above. Looks like we want it
57364+ to be AT_UNIT.
57365+
57366+ Oh, wonders of ->betweeness...
57367+
57368+ */
57369+ to.between = AT_UNIT;
57370+
57371+ if (shift->pend == SHIFT_LEFT) {
57372+ /* copying to left */
57373+
57374+ coord_set_item_pos(&from, 0);
57375+ from_ih = node40_ih_at(from.node, 0);
57376+
57377+ coord_set_item_pos(&to,
57378+ node40_num_of_items_internal(to.node) - 1);
57379+ if (shift->merging_units) {
57380+ /* expand last item, so that plugin methods will see
57381+ correct data */
57382+ free_space_start += shift->merging_bytes;
57383+ nh40_set_free_space_start(nh,
57384+ (unsigned)free_space_start);
57385+ nh40_set_free_space(nh,
57386+ nh40_get_free_space(nh) -
57387+ shift->merging_bytes);
57388+
57389+ /* appending last item of @target */
57390+ copy_units(&to, &from, 0, /* starting from 0-th unit */
57391+ shift->merging_units, SHIFT_LEFT,
57392+ shift->merging_bytes);
57393+ coord_inc_item_pos(&from);
57394+ from_ih--;
57395+ coord_inc_item_pos(&to);
57396+ }
57397+
57398+ to_ih = node40_ih_at(shift->target, old_items);
57399+ if (shift->entire) {
57400+ /* copy @entire items entirely */
57401+
57402+ /* copy item headers */
57403+ memcpy(to_ih - shift->entire + 1,
57404+ from_ih - shift->entire + 1,
57405+ shift->entire * sizeof(item_header40));
57406+ /* update item header offset */
57407+ old_offset = ih40_get_offset(from_ih);
57408+			/* AUDIT: precomputing free_space_start - old_offset outside the loop would save one "add" per iteration */
57409+ for (i = 0; i < shift->entire; i++, to_ih--, from_ih--)
57410+ ih40_set_offset(to_ih,
57411+ ih40_get_offset(from_ih) -
57412+ old_offset + free_space_start);
57413+
57414+ /* copy item bodies */
57415+ memcpy(zdata(shift->target) + free_space_start, zdata(from.node) + old_offset, /*ih40_get_offset (from_ih), */
57416+ shift->entire_bytes);
57417+
57418+ coord_add_item_pos(&from, (int)shift->entire);
57419+ coord_add_item_pos(&to, (int)shift->entire);
57420+ }
57421+
57422+ nh40_set_free_space_start(nh,
57423+ free_space_start +
57424+ shift->shift_bytes -
57425+ shift->merging_bytes);
57426+ nh40_set_free_space(nh,
57427+ nh40_get_free_space(nh) -
57428+ (shift->shift_bytes - shift->merging_bytes +
57429+ sizeof(item_header40) * new_items));
57430+
57431+ /* update node header */
57432+ node40_set_num_items(shift->target, nh, old_items + new_items);
57433+ assert("vs-170",
57434+ nh40_get_free_space(nh) < znode_size(shift->target));
57435+
57436+ if (shift->part_units) {
57437+ /* copy heading part (@part units) of @source item as
57438+ a new item into @target->node */
57439+
57440+ /* copy item header of partially copied item */
57441+ coord_set_item_pos(&to,
57442+ node40_num_of_items_internal(to.node)
57443+ - 1);
57444+ memcpy(to_ih, from_ih, sizeof(item_header40));
57445+ ih40_set_offset(to_ih,
57446+ nh40_get_free_space_start(nh) -
57447+ shift->part_bytes);
57448+ if (item_plugin_by_coord(&to)->b.init)
57449+ item_plugin_by_coord(&to)->b.init(&to, &from,
57450+ NULL);
57451+ copy_units(&to, &from, 0, shift->part_units, SHIFT_LEFT,
57452+ shift->part_bytes);
57453+ }
57454+
57455+ } else {
57456+ /* copying to right */
57457+
57458+ coord_set_item_pos(&from,
57459+ node40_num_of_items_internal(from.node) - 1);
57460+ from_ih = node40_ih_at_coord(&from);
57461+
57462+ coord_set_item_pos(&to, 0);
57463+
57464+ /* prepare space for new items */
57465+ memmove(zdata(to.node) + sizeof(node40_header) +
57466+ shift->shift_bytes,
57467+ zdata(to.node) + sizeof(node40_header),
57468+ free_space_start - sizeof(node40_header));
57469+ /* update item headers of moved items */
57470+ to_ih = node40_ih_at(to.node, 0);
57471+ /* first item gets @merging_bytes longer. free space appears
57472+ at its beginning */
57473+ if (!node_is_empty(to.node))
57474+ ih40_set_offset(to_ih,
57475+ ih40_get_offset(to_ih) +
57476+ shift->shift_bytes -
57477+ shift->merging_bytes);
57478+
57479+ for (i = 1; i < old_items; i++)
57480+ ih40_set_offset(to_ih - i,
57481+ ih40_get_offset(to_ih - i) +
57482+ shift->shift_bytes);
57483+
57484+ /* move item headers to make space for new items */
57485+ memmove(to_ih - old_items + 1 - new_items,
57486+ to_ih - old_items + 1,
57487+ sizeof(item_header40) * old_items);
57488+ to_ih -= (new_items - 1);
57489+
57490+ nh40_set_free_space_start(nh,
57491+ free_space_start +
57492+ shift->shift_bytes);
57493+ nh40_set_free_space(nh,
57494+ nh40_get_free_space(nh) -
57495+ (shift->shift_bytes +
57496+ sizeof(item_header40) * new_items));
57497+
57498+ /* update node header */
57499+ node40_set_num_items(shift->target, nh, old_items + new_items);
57500+ assert("vs-170",
57501+ nh40_get_free_space(nh) < znode_size(shift->target));
57502+
57503+ if (shift->merging_units) {
57504+ coord_add_item_pos(&to, new_items);
57505+ to.unit_pos = 0;
57506+ to.between = AT_UNIT;
57507+ /* prepend first item of @to */
57508+ copy_units(&to, &from,
57509+ coord_last_unit_pos(&from) -
57510+ shift->merging_units + 1,
57511+ shift->merging_units, SHIFT_RIGHT,
57512+ shift->merging_bytes);
57513+ coord_dec_item_pos(&from);
57514+ from_ih++;
57515+ }
57516+
57517+ if (shift->entire) {
57518+ /* copy @entire items entirely */
57519+
57520+ /* copy item headers */
57521+ memcpy(to_ih, from_ih,
57522+ shift->entire * sizeof(item_header40));
57523+
57524+ /* update item header offset */
57525+ old_offset =
57526+ ih40_get_offset(from_ih + shift->entire - 1);
57527+			/* AUDIT: the constant sizeof(node40_header) + shift->part_bytes - old_offset could be hoisted out of the loop. */
57528+ for (i = 0; i < shift->entire; i++, to_ih++, from_ih++)
57529+ ih40_set_offset(to_ih,
57530+ ih40_get_offset(from_ih) -
57531+ old_offset +
57532+ sizeof(node40_header) +
57533+ shift->part_bytes);
57534+ /* copy item bodies */
57535+ coord_add_item_pos(&from, -(int)(shift->entire - 1));
57536+ memcpy(zdata(to.node) + sizeof(node40_header) +
57537+ shift->part_bytes, item_by_coord_node40(&from),
57538+ shift->entire_bytes);
57539+ coord_dec_item_pos(&from);
57540+ }
57541+
57542+ if (shift->part_units) {
57543+ coord_set_item_pos(&to, 0);
57544+ to.unit_pos = 0;
57545+ to.between = AT_UNIT;
57546+ /* copy heading part (@part units) of @source item as
57547+ a new item into @target->node */
57548+
57549+ /* copy item header of partially copied item */
57550+ memcpy(to_ih, from_ih, sizeof(item_header40));
57551+ ih40_set_offset(to_ih, sizeof(node40_header));
57552+ if (item_plugin_by_coord(&to)->b.init)
57553+ item_plugin_by_coord(&to)->b.init(&to, &from,
57554+ NULL);
57555+ copy_units(&to, &from,
57556+ coord_last_unit_pos(&from) -
57557+ shift->part_units + 1, shift->part_units,
57558+ SHIFT_RIGHT, shift->part_bytes);
57559+ }
57560+ }
57561+}
57562+
57563+/* remove everything either before or after @shift->real_stop. The number of
57564+   items removed completely is returned */
57565+static int delete_copied(struct shift_params *shift)
57566+{
57567+ coord_t from;
57568+ coord_t to;
57569+ struct carry_cut_data cdata;
57570+
57571+ if (shift->pend == SHIFT_LEFT) {
57572+		/* we were shifting to the left, remove everything from the
57573+		   beginning of @shift->wish_stop->node up to
57574+		   @shift->wish_stop */
57575+ coord_init_first_unit(&from, shift->real_stop.node);
57576+ to = shift->real_stop;
57577+
57578+ /* store old coordinate of unit which will be first after
57579+ shift to left */
57580+ shift->u.future_first = to;
57581+ coord_next_unit(&shift->u.future_first);
57582+ } else {
57583+		/* we were shifting to the right, remove everything from
57584+		   @shift->stop_coord up to the end of
57585+		   @shift->stop_coord->node */
57586+ from = shift->real_stop;
57587+ coord_init_last_unit(&to, from.node);
57588+
57589+ /* store old coordinate of unit which will be last after
57590+ shift to right */
57591+ shift->u.future_last = from;
57592+ coord_prev_unit(&shift->u.future_last);
57593+ }
57594+
57595+ cdata.params.from = &from;
57596+ cdata.params.to = &to;
57597+ cdata.params.from_key = NULL;
57598+ cdata.params.to_key = NULL;
57599+ cdata.params.smallest_removed = NULL;
57600+ return cut_node40(&cdata, NULL);
57601+}
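+
+/* A shift is thus "copy, then cut": copy() duplicates the units into
+   @shift->target and delete_copied() removes the originals via
+   cut_node40() with a NULL carry_plugin_info, because it is the caller of
+   the shift, not the cut, that is responsible for updating delimiting
+   keys and the parent level. */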
57602+
57603+/* something was moved between @left and @right. Add a carry operation to the
57604+   @info list to have carry update the delimiting key between them */
57605+static int
57606+prepare_for_update(znode * left, znode * right, carry_plugin_info * info)
57607+{
57608+ carry_op *op;
57609+ carry_node *cn;
57610+
57611+ if (info == NULL)
57612+ /* nowhere to send operation to. */
57613+ return 0;
57614+
57615+ if (!should_notify_parent(right))
57616+ return 0;
57617+
57618+ op = node_post_carry(info, COP_UPDATE, right, 1);
57619+ if (IS_ERR(op) || op == NULL)
57620+ return op ? PTR_ERR(op) : -EIO;
57621+
57622+ if (left != NULL) {
57623+ carry_node *reference;
57624+
57625+ if (info->doing)
57626+ reference = insert_carry_node(info->doing,
57627+ info->todo, left);
57628+ else
57629+ reference = op->node;
57630+ assert("nikita-2992", reference != NULL);
57631+ cn = add_carry(info->todo, POOLO_BEFORE, reference);
57632+ if (IS_ERR(cn))
57633+ return PTR_ERR(cn);
57634+ cn->parent = 1;
57635+ cn->node = left;
57636+ if (ZF_ISSET(left, JNODE_ORPHAN))
57637+ cn->left_before = 1;
57638+ op->u.update.left = cn;
57639+ } else
57640+ op->u.update.left = NULL;
57641+ return 0;
57642+}
57643+
57644+/* plugin->u.node.prepare_removal
57645+   to delete a pointer to @empty from the tree, add the corresponding carry
57646+   operation (COP_DELETE) to the @info list */
57647+int prepare_removal_node40(znode * empty, carry_plugin_info * info)
57648+{
57649+ carry_op *op;
57650+ reiser4_tree *tree;
57651+
57652+ if (!should_notify_parent(empty))
57653+ return 0;
57654+ /* already on a road to Styx */
57655+ if (ZF_ISSET(empty, JNODE_HEARD_BANSHEE))
57656+ return 0;
57657+ op = node_post_carry(info, COP_DELETE, empty, 1);
57658+ if (IS_ERR(op) || op == NULL)
57659+ return RETERR(op ? PTR_ERR(op) : -EIO);
57660+
57661+ op->u.delete.child = NULL;
57662+ op->u.delete.flags = 0;
57663+
57664+ /* fare thee well */
57665+ tree = znode_get_tree(empty);
57666+ read_lock_tree(tree);
57667+ write_lock_dk(tree);
57668+ znode_set_ld_key(empty, znode_get_rd_key(empty));
57669+ if (znode_is_left_connected(empty) && empty->left)
57670+ znode_set_rd_key(empty->left, znode_get_rd_key(empty));
57671+ write_unlock_dk(tree);
57672+ read_unlock_tree(tree);
57673+
57674+ ZF_SET(empty, JNODE_HEARD_BANSHEE);
57675+ return 0;
57676+}
57677+
57678+/* something was shifted from @insert_coord->node to @shift->target, update
57679+   @insert_coord correspondingly */
57680+static void
57681+adjust_coord(coord_t * insert_coord, struct shift_params *shift, int removed,
57682+ int including_insert_coord)
57683+{
57684+ /* item plugin was invalidated by shifting */
57685+ coord_clear_iplug(insert_coord);
57686+
57687+ if (node_is_empty(shift->wish_stop.node)) {
57688+ assert("vs-242", shift->everything);
57689+ if (including_insert_coord) {
57690+ if (shift->pend == SHIFT_RIGHT) {
57691+ /* set @insert_coord before first unit of
57692+ @shift->target node */
57693+ coord_init_before_first_item(insert_coord,
57694+ shift->target);
57695+ } else {
57696+ /* set @insert_coord after last in target node */
57697+ coord_init_after_last_item(insert_coord,
57698+ shift->target);
57699+ }
57700+ } else {
57701+ /* set @insert_coord inside of empty node. There is
57702+ only one possible coord within an empty
57703+ node. init_first_unit will set that coord */
57704+ coord_init_first_unit(insert_coord,
57705+ shift->wish_stop.node);
57706+ }
57707+ return;
57708+ }
57709+
57710+ if (shift->pend == SHIFT_RIGHT) {
57711+ /* there was shifting to right */
57712+ if (shift->everything) {
57713+ /* everything wanted was shifted */
57714+ if (including_insert_coord) {
57715+ /* @insert_coord is set before first unit of
57716+ @to node */
57717+ coord_init_before_first_item(insert_coord,
57718+ shift->target);
57719+ insert_coord->between = BEFORE_UNIT;
57720+ } else {
57721+ /* @insert_coord is set after last unit of
57722+ @insert->node */
57723+ coord_init_last_unit(insert_coord,
57724+ shift->wish_stop.node);
57725+ insert_coord->between = AFTER_UNIT;
57726+ }
57727+ }
57728+ return;
57729+ }
57730+
57731+ /* there was shifting to left */
57732+ if (shift->everything) {
57733+ /* everything wanted was shifted */
57734+ if (including_insert_coord) {
57735+ /* @insert_coord is set after last unit in @to node */
57736+ coord_init_after_last_item(insert_coord, shift->target);
57737+ } else {
57738+ /* @insert_coord is set before first unit in the same
57739+ node */
57740+ coord_init_before_first_item(insert_coord,
57741+ shift->wish_stop.node);
57742+ }
57743+ return;
57744+ }
57745+
57746+ /* FIXME-VS: the code below is complicated because with between ==
57747+ AFTER_ITEM unit_pos is set to 0 */
57748+
57749+ if (!removed) {
57750+ /* no items were shifted entirely */
57751+ assert("vs-195", shift->merging_units == 0
57752+ || shift->part_units == 0);
57753+
57754+ if (shift->real_stop.item_pos == insert_coord->item_pos) {
57755+ if (shift->merging_units) {
57756+ if (insert_coord->between == AFTER_UNIT) {
57757+ assert("nikita-1441",
57758+ insert_coord->unit_pos >=
57759+ shift->merging_units);
57760+ insert_coord->unit_pos -=
57761+ shift->merging_units;
57762+ } else if (insert_coord->between == BEFORE_UNIT) {
57763+ assert("nikita-2090",
57764+ insert_coord->unit_pos >
57765+ shift->merging_units);
57766+ insert_coord->unit_pos -=
57767+ shift->merging_units;
57768+ }
57769+
57770+ assert("nikita-2083",
57771+ insert_coord->unit_pos + 1);
57772+ } else {
57773+ if (insert_coord->between == AFTER_UNIT) {
57774+ assert("nikita-1442",
57775+ insert_coord->unit_pos >=
57776+ shift->part_units);
57777+ insert_coord->unit_pos -=
57778+ shift->part_units;
57779+ } else if (insert_coord->between == BEFORE_UNIT) {
57780+ assert("nikita-2089",
57781+ insert_coord->unit_pos >
57782+ shift->part_units);
57783+ insert_coord->unit_pos -=
57784+ shift->part_units;
57785+ }
57786+
57787+ assert("nikita-2084",
57788+ insert_coord->unit_pos + 1);
57789+ }
57790+ }
57791+ return;
57792+ }
57793+
57794+	/* we shifted to the left and there was not enough space for everything */
57795+ switch (insert_coord->between) {
57796+ case AFTER_UNIT:
57797+ case BEFORE_UNIT:
57798+ if (shift->real_stop.item_pos == insert_coord->item_pos)
57799+ insert_coord->unit_pos -= shift->part_units;
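		/* fall through: item_pos must be adjusted for the removed
		   items in all three cases */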
57800+ case AFTER_ITEM:
57801+ coord_add_item_pos(insert_coord, -removed);
57802+ break;
57803+ default:
57804+ impossible("nikita-2087", "not ready");
57805+ }
57806+ assert("nikita-2085", insert_coord->unit_pos + 1);
57807+}
57808+
57809+static int call_shift_hooks(struct shift_params *shift)
57810+{
57811+ unsigned i, shifted;
57812+ coord_t coord;
57813+ item_plugin *iplug;
57814+
57815+ assert("vs-275", !node_is_empty(shift->target));
57816+
57817+ /* number of items shift touches */
57818+ shifted =
57819+ shift->entire + (shift->merging_units ? 1 : 0) +
57820+ (shift->part_units ? 1 : 0);
57821+
57822+ if (shift->pend == SHIFT_LEFT) {
57823+ /* moved items are at the end */
57824+ coord_init_last_unit(&coord, shift->target);
57825+ coord.unit_pos = 0;
57826+
57827+ assert("vs-279", shift->pend == 1);
57828+ for (i = 0; i < shifted; i++) {
57829+ unsigned from, count;
57830+
57831+ iplug = item_plugin_by_coord(&coord);
57832+ if (i == 0 && shift->part_units) {
57833+ assert("vs-277",
57834+ coord_num_units(&coord) ==
57835+ shift->part_units);
57836+ count = shift->part_units;
57837+ from = 0;
57838+ } else if (i == shifted - 1 && shift->merging_units) {
57839+ count = shift->merging_units;
57840+ from = coord_num_units(&coord) - count;
57841+ } else {
57842+ count = coord_num_units(&coord);
57843+ from = 0;
57844+ }
57845+
57846+ if (iplug->b.shift_hook) {
57847+ iplug->b.shift_hook(&coord, from, count,
57848+ shift->wish_stop.node);
57849+ }
57850+ coord_add_item_pos(&coord, -shift->pend);
57851+ }
57852+ } else {
57853+ /* moved items are at the beginning */
57854+ coord_init_first_unit(&coord, shift->target);
57855+
57856+ assert("vs-278", shift->pend == -1);
57857+ for (i = 0; i < shifted; i++) {
57858+ unsigned from, count;
57859+
57860+ iplug = item_plugin_by_coord(&coord);
57861+ if (i == 0 && shift->part_units) {
57862+ assert("vs-277",
57863+ coord_num_units(&coord) ==
57864+ shift->part_units);
57865+ count = coord_num_units(&coord);
57866+ from = 0;
57867+ } else if (i == shifted - 1 && shift->merging_units) {
57868+ count = shift->merging_units;
57869+ from = 0;
57870+ } else {
57871+ count = coord_num_units(&coord);
57872+ from = 0;
57873+ }
57874+
57875+ if (iplug->b.shift_hook) {
57876+ iplug->b.shift_hook(&coord, from, count,
57877+ shift->wish_stop.node);
57878+ }
57879+ coord_add_item_pos(&coord, -shift->pend);
57880+ }
57881+ }
57882+
57883+ return 0;
57884+}
57885+
57886+/* shift to left is completed. Return 1 if unit @old was moved to left neighbor */
57887+static int
57888+unit_moved_left(const struct shift_params *shift, const coord_t * old)
57889+{
57890+ assert("vs-944", shift->real_stop.node == old->node);
57891+
57892+ if (shift->real_stop.item_pos < old->item_pos)
57893+ return 0;
57894+ if (shift->real_stop.item_pos == old->item_pos) {
57895+ if (shift->real_stop.unit_pos < old->unit_pos)
57896+ return 0;
57897+ }
57898+ return 1;
57899+}
57900+
57901+/* shift to right is completed. Return 1 if unit @old was moved to right
57902+ neighbor */
57903+static int
57904+unit_moved_right(const struct shift_params *shift, const coord_t * old)
57905+{
57906+ assert("vs-944", shift->real_stop.node == old->node);
57907+
57908+ if (shift->real_stop.item_pos > old->item_pos)
57909+ return 0;
57910+ if (shift->real_stop.item_pos == old->item_pos) {
57911+ if (shift->real_stop.unit_pos > old->unit_pos)
57912+ return 0;
57913+ }
57914+ return 1;
57915+}
57916+
57917+/* coord @old was set in the node from which the shift was performed. What was
57918+   shifted is stored in @shift. Update @old according to the performed shift */
57919+static coord_t *adjust_coord2(const struct shift_params *shift,
57920+ const coord_t * old, coord_t * new)
57921+{
57922+	coord_clear_iplug(new);
57923+	new->between = old->between;
57924+
57926+ if (old->node == shift->target) {
57927+ if (shift->pend == SHIFT_LEFT) {
57928+ /* coord which is set inside of left neighbor does not
57929+ change during shift to left */
57930+ coord_dup(new, old);
57931+ return new;
57932+ }
57933+ new->node = old->node;
57934+ coord_set_item_pos(new,
57935+ old->item_pos + shift->entire +
57936+ (shift->part_units ? 1 : 0));
57937+ new->unit_pos = old->unit_pos;
57938+ if (old->item_pos == 0 && shift->merging_units)
57939+ new->unit_pos += shift->merging_units;
57940+ return new;
57941+ }
57942+
57943+ assert("vs-977", old->node == shift->wish_stop.node);
57944+ if (shift->pend == SHIFT_LEFT) {
57945+ if (unit_moved_left(shift, old)) {
57946+ /* unit @old moved to left neighbor. Calculate its
57947+ coordinate there */
57948+ new->node = shift->target;
57949+ coord_set_item_pos(new,
57950+ node_num_items(shift->target) -
57951+ shift->entire -
57952+ (shift->part_units ? 1 : 0) +
57953+ old->item_pos);
57954+
57955+ new->unit_pos = old->unit_pos;
57956+ if (shift->merging_units) {
57957+ coord_dec_item_pos(new);
57958+ if (old->item_pos == 0) {
57959+ /* unit_pos only changes if item got
57960+ merged */
57961+ new->unit_pos =
57962+ coord_num_units(new) -
57963+ (shift->merging_units -
57964+ old->unit_pos);
57965+ }
57966+ }
57967+ } else {
57968+ /* unit @old did not move to left neighbor.
57969+
57970+ Use _nocheck, because @old is outside of its node.
57971+ */
57972+ coord_dup_nocheck(new, old);
57973+ coord_add_item_pos(new,
57974+ -shift->u.future_first.item_pos);
57975+ if (new->item_pos == 0)
57976+ new->unit_pos -= shift->u.future_first.unit_pos;
57977+ }
57978+ } else {
57979+ if (unit_moved_right(shift, old)) {
57980+ /* unit @old moved to right neighbor */
57981+ new->node = shift->target;
57982+ coord_set_item_pos(new,
57983+ old->item_pos -
57984+ shift->real_stop.item_pos);
57985+ if (new->item_pos == 0) {
57986+ /* unit @old might change unit pos */
57987+ coord_set_item_pos(new,
57988+ old->unit_pos -
57989+ shift->real_stop.unit_pos);
57990+ }
57991+ } else {
57992+ /* unit @old did not move to right neighbor, therefore
57993+ it did not change */
57994+ coord_dup(new, old);
57995+ }
57996+ }
57997+ coord_set_iplug(new, item_plugin_by_coord(new));
57998+ return new;
57999+}
58000+
58001+/* this is called when shift is completed (something of source node is copied
58002+ to target and deleted in source) to update all taps set in current
58003+ context */
58004+static void update_taps(const struct shift_params *shift)
58005+{
58006+ tap_t *tap;
58007+ coord_t new;
58008+
58009+ for_all_taps(tap) {
58010+ /* update only taps set to nodes participating in shift */
58011+ if (tap->coord->node == shift->wish_stop.node
58012+ || tap->coord->node == shift->target)
58013+ tap_to_coord(tap,
58014+ adjust_coord2(shift, tap->coord, &new));
58015+ }
58016+}
58017+
58018+#if REISER4_DEBUG
58019+
58020+struct shift_check {
58021+ reiser4_key key;
58022+ __u16 plugin_id;
58023+ union {
58024+ __u64 bytes;
58025+ __u64 entries;
58026+ void *unused;
58027+ } u;
58028+};
58029+
58030+void *shift_check_prepare(const znode * left, const znode * right)
58031+{
58032+ pos_in_node_t i, nr_items;
58033+ int mergeable;
58034+ struct shift_check *data;
58035+ item_header40 *ih;
58036+
58037+ if (node_is_empty(left) || node_is_empty(right))
58038+ mergeable = 0;
58039+ else {
58040+ coord_t l, r;
58041+
58042+ coord_init_last_unit(&l, left);
58043+ coord_init_first_unit(&r, right);
58044+ mergeable = are_items_mergeable(&l, &r);
58045+ }
58046+ nr_items =
58047+ node40_num_of_items_internal(left) +
58048+ node40_num_of_items_internal(right) - (mergeable ? 1 : 0);
58049+ data =
58050+ kmalloc(sizeof(struct shift_check) * nr_items, get_gfp_mask());
58051+ if (data != NULL) {
58052+ coord_t coord;
58053+ pos_in_node_t item_pos;
58054+
58055+ coord_init_first_unit(&coord, left);
58056+ i = 0;
58057+
58058+ for (item_pos = 0;
58059+ item_pos < node40_num_of_items_internal(left);
58060+ item_pos++) {
58061+
58062+ coord_set_item_pos(&coord, item_pos);
58063+ ih = node40_ih_at_coord(&coord);
58064+
58065+ data[i].key = ih->key;
58066+ data[i].plugin_id = le16_to_cpu(get_unaligned(&ih->plugin_id));
58067+ switch (data[i].plugin_id) {
58068+ case CTAIL_ID:
58069+ case FORMATTING_ID:
58070+ data[i].u.bytes = coord_num_units(&coord);
58071+ break;
58072+ case EXTENT_POINTER_ID:
58073+ data[i].u.bytes =
58074+ extent_size(&coord,
58075+ coord_num_units(&coord));
58076+ break;
58077+ case COMPOUND_DIR_ID:
58078+ data[i].u.entries = coord_num_units(&coord);
58079+ break;
58080+ default:
58081+ data[i].u.unused = NULL;
58082+ break;
58083+ }
58084+ i++;
58085+ }
58086+
58087+ coord_init_first_unit(&coord, right);
58088+
58089+ if (mergeable) {
58090+ assert("vs-1609", i != 0);
58091+
58092+ ih = node40_ih_at_coord(&coord);
58093+
58094+ assert("vs-1589",
58095+ data[i - 1].plugin_id ==
58096+ le16_to_cpu(get_unaligned(&ih->plugin_id)));
58097+ switch (data[i - 1].plugin_id) {
58098+ case CTAIL_ID:
58099+ case FORMATTING_ID:
58100+ data[i - 1].u.bytes += coord_num_units(&coord);
58101+ break;
58102+ case EXTENT_POINTER_ID:
58103+ data[i - 1].u.bytes +=
58104+ extent_size(&coord,
58105+ coord_num_units(&coord));
58106+ break;
58107+ case COMPOUND_DIR_ID:
58108+ data[i - 1].u.entries +=
58109+ coord_num_units(&coord);
58110+ break;
58111+ default:
58112+ impossible("vs-1605", "wrong mergeable item");
58113+ break;
58114+ }
58115+ item_pos = 1;
58116+ } else
58117+ item_pos = 0;
58118+ for (; item_pos < node40_num_of_items_internal(right);
58119+ item_pos++) {
58120+
58121+ assert("vs-1604", i < nr_items);
58122+ coord_set_item_pos(&coord, item_pos);
58123+ ih = node40_ih_at_coord(&coord);
58124+
58125+ data[i].key = ih->key;
58126+ data[i].plugin_id = le16_to_cpu(get_unaligned(&ih->plugin_id));
58127+ switch (data[i].plugin_id) {
58128+ case CTAIL_ID:
58129+ case FORMATTING_ID:
58130+ data[i].u.bytes = coord_num_units(&coord);
58131+ break;
58132+ case EXTENT_POINTER_ID:
58133+ data[i].u.bytes =
58134+ extent_size(&coord,
58135+ coord_num_units(&coord));
58136+ break;
58137+ case COMPOUND_DIR_ID:
58138+ data[i].u.entries = coord_num_units(&coord);
58139+ break;
58140+ default:
58141+ data[i].u.unused = NULL;
58142+ break;
58143+ }
58144+ i++;
58145+ }
58146+ assert("vs-1606", i == nr_items);
58147+ }
58148+ return data;
58149+}
58150+
58151+void shift_check(void *vp, const znode * left, const znode * right)
58152+{
58153+ pos_in_node_t i, nr_items;
58154+ coord_t coord;
58155+ __u64 last_bytes;
58156+ int mergeable;
58157+ item_header40 *ih;
58158+ pos_in_node_t item_pos;
58159+ struct shift_check *data;
58160+
58161+ data = (struct shift_check *)vp;
58162+
58163+ if (data == NULL)
58164+ return;
58165+
58166+ if (node_is_empty(left) || node_is_empty(right))
58167+ mergeable = 0;
58168+ else {
58169+ coord_t l, r;
58170+
58171+ coord_init_last_unit(&l, left);
58172+ coord_init_first_unit(&r, right);
58173+ mergeable = are_items_mergeable(&l, &r);
58174+ }
58175+
58176+ nr_items =
58177+ node40_num_of_items_internal(left) +
58178+ node40_num_of_items_internal(right) - (mergeable ? 1 : 0);
58179+
58180+ i = 0;
58181+ last_bytes = 0;
58182+
58183+ coord_init_first_unit(&coord, left);
58184+
58185+ for (item_pos = 0; item_pos < node40_num_of_items_internal(left);
58186+ item_pos++) {
58187+
58188+ coord_set_item_pos(&coord, item_pos);
58189+ ih = node40_ih_at_coord(&coord);
58190+
58191+ assert("vs-1611", i == item_pos);
58192+ assert("vs-1590", keyeq(&ih->key, &data[i].key));
58193+ assert("vs-1591",
58194+ le16_to_cpu(get_unaligned(&ih->plugin_id)) == data[i].plugin_id);
58195+ if ((i < (node40_num_of_items_internal(left) - 1))
58196+ || !mergeable) {
58197+ switch (data[i].plugin_id) {
58198+ case CTAIL_ID:
58199+ case FORMATTING_ID:
58200+ assert("vs-1592",
58201+ data[i].u.bytes ==
58202+ coord_num_units(&coord));
58203+ break;
58204+ case EXTENT_POINTER_ID:
58205+ assert("vs-1593",
58206+ data[i].u.bytes == extent_size(&coord,
58207+ coord_num_units
58208+ (&coord)));
58209+ break;
58210+ case COMPOUND_DIR_ID:
58211+ assert("vs-1594",
58212+ data[i].u.entries ==
58213+ coord_num_units(&coord));
58214+ break;
58215+ default:
58216+ break;
58217+ }
58218+ }
58219+ if (item_pos == (node40_num_of_items_internal(left) - 1)
58220+ && mergeable) {
58221+ switch (data[i].plugin_id) {
58222+ case CTAIL_ID:
58223+ case FORMATTING_ID:
58224+ last_bytes = coord_num_units(&coord);
58225+ break;
58226+ case EXTENT_POINTER_ID:
58227+ last_bytes =
58228+ extent_size(&coord,
58229+ coord_num_units(&coord));
58230+ break;
58231+ case COMPOUND_DIR_ID:
58232+ last_bytes = coord_num_units(&coord);
58233+ break;
58234+ default:
58235+ impossible("vs-1595", "wrong mergeable item");
58236+ break;
58237+ }
58238+ }
58239+ i++;
58240+ }
58241+
58242+ coord_init_first_unit(&coord, right);
58243+ if (mergeable) {
58244+ ih = node40_ih_at_coord(&coord);
58245+
58246+ assert("vs-1589",
58247+ data[i - 1].plugin_id == le16_to_cpu(get_unaligned(&ih->plugin_id)));
58248+ assert("vs-1608", last_bytes != 0);
58249+ switch (data[i - 1].plugin_id) {
58250+ case CTAIL_ID:
58251+ case FORMATTING_ID:
58252+ assert("vs-1596",
58253+ data[i - 1].u.bytes ==
58254+ last_bytes + coord_num_units(&coord));
58255+ break;
58256+
58257+ case EXTENT_POINTER_ID:
58258+ assert("vs-1597",
58259+ data[i - 1].u.bytes ==
58260+ last_bytes + extent_size(&coord,
58261+ coord_num_units
58262+ (&coord)));
58263+ break;
58264+
58265+ case COMPOUND_DIR_ID:
58266+ assert("vs-1598",
58267+ data[i - 1].u.bytes ==
58268+ last_bytes + coord_num_units(&coord));
58269+ break;
58270+ default:
58271+ impossible("vs-1599", "wrong mergeable item");
58272+ break;
58273+ }
58274+ item_pos = 1;
58275+ } else
58276+ item_pos = 0;
58277+
58278+ for (; item_pos < node40_num_of_items_internal(right); item_pos++) {
58279+
58280+ coord_set_item_pos(&coord, item_pos);
58281+ ih = node40_ih_at_coord(&coord);
58282+
58283+ assert("vs-1612", keyeq(&ih->key, &data[i].key));
58284+ assert("vs-1613",
58285+ le16_to_cpu(get_unaligned(&ih->plugin_id)) == data[i].plugin_id);
58286+ switch (data[i].plugin_id) {
58287+ case CTAIL_ID:
58288+ case FORMATTING_ID:
58289+ assert("vs-1600",
58290+ data[i].u.bytes == coord_num_units(&coord));
58291+ break;
58292+ case EXTENT_POINTER_ID:
58293+ assert("vs-1601",
58294+ data[i].u.bytes == extent_size(&coord,
58295+ coord_num_units
58296+ (&coord)));
58297+ break;
58298+ case COMPOUND_DIR_ID:
58299+ assert("vs-1602",
58300+ data[i].u.entries == coord_num_units(&coord));
58301+ break;
58302+ default:
58303+ break;
58304+ }
58305+ i++;
58306+ }
58307+
58308+ assert("vs-1603", i == nr_items);
58309+ kfree(data);
58310+}
58311+
58312+#endif
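
For orientation, a minimal sketch (not part of the patch itself) of how these
two debugging helpers are meant to bracket a shift under REISER4_DEBUG; the
write-locked @left and @right znodes are assumed to come from the caller:

	void *check;

	/* snapshot keys, plugin ids and sizes of all items in both nodes */
	check = shift_check_prepare(left, right);

	/* ... shift units/items between @left and @right here ... */

	/* verify every item survived the shift intact; this also frees the
	   snapshot allocated by shift_check_prepare() */
	shift_check(check, left, right);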
58313+
58314+/* plugin->u.node.shift
58315+ look for description of this method in plugin/node/node.h */
58316+int shift_node40(coord_t * from, znode * to, shift_direction pend, int delete_child, /* if @from->node becomes empty - it will be
58317+ deleted from the tree if this is set to 1 */
58318+ int including_stop_coord, carry_plugin_info * info)
58319+{
58320+ struct shift_params shift;
58321+ int result;
58322+ znode *left, *right;
58323+ znode *source;
58324+ int target_empty;
58325+
58326+ assert("nikita-2161", coord_check(from));
58327+
58328+ memset(&shift, 0, sizeof(shift));
58329+ shift.pend = pend;
58330+ shift.wish_stop = *from;
58331+ shift.target = to;
58332+
58333+ assert("nikita-1473", znode_is_write_locked(from->node));
58334+ assert("nikita-1474", znode_is_write_locked(to));
58335+
58336+ source = from->node;
58337+
58338+ /* set @shift.wish_stop to rightmost/leftmost unit among units we want
58339+ shifted */
58340+ if (pend == SHIFT_LEFT) {
58341+ result = coord_set_to_left(&shift.wish_stop);
58342+ left = to;
58343+ right = from->node;
58344+ } else {
58345+ result = coord_set_to_right(&shift.wish_stop);
58346+ left = from->node;
58347+ right = to;
58348+ }
58349+
58350+ if (result) {
58351+ /* move insertion coord even if there is nothing to move */
58352+ if (including_stop_coord) {
58353+ /* move insertion coord (@from) */
58354+ if (pend == SHIFT_LEFT) {
58355+ /* after last item in target node */
58356+ coord_init_after_last_item(from, to);
58357+ } else {
58358+ /* before first item in target node */
58359+ coord_init_before_first_item(from, to);
58360+ }
58361+ }
58362+
58363+ if (delete_child && node_is_empty(shift.wish_stop.node))
58364+ result =
58365+ prepare_removal_node40(shift.wish_stop.node, info);
58366+ else
58367+ result = 0;
58368+ /* there is nothing to shift */
58369+ assert("nikita-2078", coord_check(from));
58370+ return result;
58371+ }
58372+
58373+ target_empty = node_is_empty(to);
58374+
58375+	/* when the first node plugin with item body compression is
58376+	   implemented, this must be changed to call the node-specific plugin */
58377+
58378+	/* shift->stop_coord is updated to the last unit that will actually
58379+	   be shifted */
58380+ estimate_shift(&shift, get_current_context());
58381+ if (!shift.shift_bytes) {
58382+ /* we could not shift anything */
58383+ assert("nikita-2079", coord_check(from));
58384+ return 0;
58385+ }
58386+
58387+ copy(&shift);
58388+
58389+	/* the result value is important: it is used by adjust_coord below */
58390+ result = delete_copied(&shift);
58391+
58392+ assert("vs-1610", result >= 0);
58393+ assert("vs-1471",
58394+ ((reiser4_context *) current->journal_info)->magic ==
58395+ context_magic);
58396+
58397+	/* an item which has been moved from one node to another might want to
58398+	   do something on that event. This can be done by the item's shift_hook
58399+	   method, which will now be called for every moved item */
58400+ call_shift_hooks(&shift);
58401+
58402+ assert("vs-1472",
58403+ ((reiser4_context *) current->journal_info)->magic ==
58404+ context_magic);
58405+
58406+ update_taps(&shift);
58407+
58408+ assert("vs-1473",
58409+ ((reiser4_context *) current->journal_info)->magic ==
58410+ context_magic);
58411+
58412+ /* adjust @from pointer in accordance with @including_stop_coord flag
58413+ and amount of data which was really shifted */
58414+ adjust_coord(from, &shift, result, including_stop_coord);
58415+
58416+ if (target_empty)
58417+ /*
58418+ * items were shifted into empty node. Update delimiting key.
58419+ */
58420+ result = prepare_for_update(NULL, left, info);
58421+
58422+ /* add update operation to @info, which is the list of operations to
58423+ be performed on a higher level */
58424+ result = prepare_for_update(left, right, info);
58425+ if (!result && node_is_empty(source) && delete_child) {
58426+		/* all contents of @from->node are moved to @to and @from->node
58427+		   has to be removed from the tree, so on a higher level we
58428+		   will remove the pointer to node @from->node */
58429+ result = prepare_removal_node40(source, info);
58430+ }
58431+ assert("nikita-2080", coord_check(from));
58432+ return result ? result : (int)shift.shift_bytes;
58433+}
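
A hedged example of a call, roughly as the carry machinery might issue it;
@left, the lock context and @info are assumed to be prepared by the caller:

	coord_t from;		/* set to the rightmost unit we wish to move,
				   in a write-locked node */
	carry_plugin_info info;	/* collects operations for the parent level */
	int ret;

	/* shift everything up to and including @from into @left; delete
	   @from's node from the tree if it drains (delete_child == 1) */
	ret = shift_node40(&from, left, SHIFT_LEFT, 1, 1, &info);
	/* negative @ret is an error; otherwise @ret is the number of bytes
	   actually shifted */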
58434+
58435+/* plugin->u.node.fast_insert()
58436+ look for description of this method in plugin/node/node.h */
58437+int fast_insert_node40(const coord_t * coord UNUSED_ARG /* node to query */ )
58438+{
58439+ return 1;
58440+}
58441+
58442+/* plugin->u.node.fast_paste()
58443+ look for description of this method in plugin/node/node.h */
58444+int fast_paste_node40(const coord_t * coord UNUSED_ARG /* node to query */ )
58445+{
58446+ return 1;
58447+}
58448+
58449+/* plugin->u.node.fast_cut()
58450+ look for description of this method in plugin/node/node.h */
58451+int fast_cut_node40(const coord_t * coord UNUSED_ARG /* node to query */ )
58452+{
58453+ return 1;
58454+}
58455+
58456+/* plugin->u.node.modify - not defined */
58457+
58458+/* plugin->u.node.max_item_size */
58459+int max_item_size_node40(void)
58460+{
58461+ return reiser4_get_current_sb()->s_blocksize - sizeof(node40_header) -
58462+ sizeof(item_header40);
58463+}
58464+
58465+/* plugin->u.node.set_item_plugin */
58466+int set_item_plugin_node40(coord_t *coord, item_id id)
58467+{
58468+ item_header40 *ih;
58469+
58470+ ih = node40_ih_at_coord(coord);
58471+ put_unaligned(cpu_to_le16(id), &ih->plugin_id);
58472+ coord->iplugid = id;
58473+ return 0;
58474+}
58475+
58476+/*
58477+ Local variables:
58478+ c-indentation-style: "K&R"
58479+ mode-name: "LC"
58480+ c-basic-offset: 8
58481+ tab-width: 8
58482+ fill-column: 120
58483+ scroll-step: 1
58484+ End:
58485+*/
58486Index: linux-2.6.16/fs/reiser4/plugin/node/node40.h
58487===================================================================
58488--- /dev/null
58489+++ linux-2.6.16/fs/reiser4/plugin/node/node40.h
58490@@ -0,0 +1,125 @@
58491+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
58492+
58493+#if !defined( __REISER4_NODE40_H__ )
58494+#define __REISER4_NODE40_H__
58495+
58496+#include "../../forward.h"
58497+#include "../../dformat.h"
58498+#include "node.h"
58499+
58500+#include <linux/types.h>
58501+
58502+/* format of node header for 40 node layouts. Keep bloat out of this struct. */
58503+typedef struct node40_header {
58504+ /* identifier of node plugin. Must be located at the very beginning
58505+ of a node. */
58506+ common_node_header common_header; /* this is 16 bits */
58507+ /* number of items. Should be first element in the node header,
58508+ because we haven't yet finally decided whether it shouldn't go into
58509+ common_header.
58510+ */
58511+/* NIKITA-FIXME-HANS: Create a macro such that if there is only one
58512+ * node format at compile time, and it is this one, accesses to these fields do not go
58513+ * through a function dereference (and otherwise they do). Probably 80% of users will only have one node format at a time throughout the life of reiser4. */
58514+ d16 nr_items;
58515+ /* free space in node measured in bytes */
58516+ d16 free_space;
58517+ /* offset to start of free space in node */
58518+ d16 free_space_start;
58519+ /* for reiser4_fsck. When information about what is a free
58520+ block is corrupted, and we try to recover everything even
58521+ if marked as freed, then old versions of data may
58522+ duplicate newer versions, and this field allows us to
58523+ restore the newer version. Also useful for when users
58524+ who don't have the new trashcan installed on their linux distro
58525+ delete the wrong files and send us desperate emails
58526+ offering $25 for them back. */
58527+
58528+ /* magic field we need to tell formatted nodes NIKITA-FIXME-HANS: improve this comment */
58529+ d32 magic;
58530+ /* flushstamp is made of mk_id and write_counter. mk_id is an
58531+ id generated randomly at mkreiserfs time. So we can just
58532+ skip all nodes with different mk_id. write_counter is d64
58533+ incrementing counter of writes on disk. It is used for
58534+ choosing the newest data at fsck time. NIKITA-FIXME-HANS: why was field name changed but not comment? */
58535+
58536+ d32 mkfs_id;
58537+ d64 flush_id;
58538+ /* node flags to be used by fsck (reiser4ck or reiser4fsck?)
58539+ and repacker NIKITA-FIXME-HANS: say more or reference elsewhere that says more */
58540+ d16 flags;
58541+
58542+ /* 1 is leaf level, 2 is twig level, root is the numerically
58543+ largest level */
58544+ d8 level;
58545+
58546+ d8 pad;
58547+} PACKED node40_header;
58548+
58549+/* item headers are not standard across all node layouts, pass
58550+ pos_in_node to functions instead */
58551+typedef struct item_header40 {
58552+ /* key of item */
58553+ /* 0 */ reiser4_key key;
58554+ /* offset from start of a node measured in 8-byte chunks */
58555+ /* 24 */ d16 offset;
58556+ /* 26 */ d16 flags;
58557+ /* 28 */ d16 plugin_id;
58558+} PACKED item_header40;
58559+
58560+size_t item_overhead_node40(const znode * node, flow_t * aflow);
58561+size_t free_space_node40(znode * node);
58562+node_search_result lookup_node40(znode * node, const reiser4_key * key,
58563+ lookup_bias bias, coord_t * coord);
58564+int num_of_items_node40(const znode * node);
58565+char *item_by_coord_node40(const coord_t * coord);
58566+int length_by_coord_node40(const coord_t * coord);
58567+item_plugin *plugin_by_coord_node40(const coord_t * coord);
58568+reiser4_key *key_at_node40(const coord_t * coord, reiser4_key * key);
58569+size_t estimate_node40(znode * node);
58570+int check_node40(const znode * node, __u32 flags, const char **error);
58571+int parse_node40(znode * node);
58572+int init_node40(znode * node);
58573+#ifdef GUESS_EXISTS
58574+int guess_node40(const znode * node);
58575+#endif
58576+void change_item_size_node40(coord_t * coord, int by);
58577+int create_item_node40(coord_t * target, const reiser4_key * key,
58578+ reiser4_item_data * data, carry_plugin_info * info);
58579+void update_item_key_node40(coord_t * target, const reiser4_key * key,
58580+ carry_plugin_info * info);
58581+int kill_node40(struct carry_kill_data *, carry_plugin_info *);
58582+int cut_node40(struct carry_cut_data *, carry_plugin_info *);
58583+int shift_node40(coord_t * from, znode * to, shift_direction pend,
58584+ /* if @from->node becomes
58585+ empty - it will be deleted from
58586+ the tree if this is set to 1
58587+ */
58588+ int delete_child, int including_stop_coord,
58589+ carry_plugin_info * info);
58590+
58591+int fast_insert_node40(const coord_t * coord);
58592+int fast_paste_node40(const coord_t * coord);
58593+int fast_cut_node40(const coord_t * coord);
58594+int max_item_size_node40(void);
58595+int prepare_removal_node40(znode * empty, carry_plugin_info * info);
58596+int set_item_plugin_node40(coord_t * coord, item_id id);
58597+int shrink_item_node40(coord_t * coord, int delta);
58598+
58599+#if REISER4_DEBUG
58600+void *shift_check_prepare(const znode *left, const znode *right);
58601+void shift_check(void *vp, const znode *left, const znode *right);
58602+#endif
58603+
58604+/* __REISER4_NODE40_H__ */
58605+#endif
58606+/*
58607+ Local variables:
58608+ c-indentation-style: "K&R"
58609+ mode-name: "LC"
58610+ c-basic-offset: 8
58611+ tab-width: 8
58612+ fill-column: 120
58613+ scroll-step: 1
58614+ End:
58615+*/
58616Index: linux-2.6.16/fs/reiser4/plugin/object.c
58617===================================================================
58618--- /dev/null
58619+++ linux-2.6.16/fs/reiser4/plugin/object.c
58620@@ -0,0 +1,501 @@
58621+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
58622+ * reiser4/README */
58623+
58624+/*
58625+ * Examples of object plugins: file, directory, symlink, special file.
58626+ *
58627+ * Plugins associated with inode:
58628+ *
58629+ * Plugin of inode is plugin referenced by plugin-id field of on-disk
58630+ * stat-data. How we store this plugin in in-core inode is not
58631+ * important. Currently pointers are used, another variant is to store offsets
58632+ * and do array lookup on each access.
58633+ *
58634+ * Now, each inode has one selected plugin: object plugin that
58635+ * determines what type of file this object is: directory, regular etc.
58636+ *
58637+ * This main plugin can use other plugins that are thus subordinated to
58638+ * it. Directory instance of object plugin uses hash; regular file
58639+ * instance uses tail policy plugin.
58640+ *
58641+ * Object plugin is either taken from id in stat-data or guessed from
58642+ * i_mode bits. Once it is established we ask it to install its
58643+ * subordinate plugins, by looking again in stat-data or inheriting them
58644+ * from parent.
58645+ *
58646+ * How new inode is initialized during ->read_inode():
58647+ * 1 read stat-data and initialize inode fields: i_size, i_mode,
58648+ * i_generation, capabilities etc.
58649+ * 2 read plugin id from stat data or try to guess plugin id
58650+ * from inode->i_mode bits if plugin id is missing.
58651+ * 3 Call ->init_inode() method of stat-data plugin to initialise inode fields.
58652+ *
58653+ * NIKITA-FIXME-HANS: can you say a little about 1 being done before 3? What
58654+ * if stat data does contain i_size, etc., due to it being an unusual plugin?
58655+ *
58656+ * 4 Call ->activate() method of object's plugin. Plugin is either read
58657+ *   from stat-data or guessed from mode bits
58658+ * 5 Call ->inherit() method of object plugin to inherit as yet uninitialized
58659+ *   plugins from parent.
58660+ *
58661+ * Easy induction proves that on last step all plugins of inode would be
58662+ * initialized.
58663+ *
58664+ * When creating new object:
58665+ * 1 obtain object plugin id (see next period)
58666+ * NIKITA-FIXME-HANS: period?
58667+ * 2 ->install() this plugin
58668+ * 3 ->inherit() the rest from the parent
58669+ *
58670+ * We need some examples of creating an object with default and non-default
58671+ * plugin ids. Nikita, please create them.
58672+ */
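
A condensed sketch of the five steps above as straight-line code; every helper
name here is hypothetical and merely stands in for the plugin methods named in
the comment:

	/* hypothetical outline of ->read_inode() for a reiser4 inode */
	static void read_inode_outline(struct inode *inode, struct inode *parent)
	{
		read_stat_data(inode);          /* steps 1-3: stat-data plugin
						   fills i_size, i_mode etc. */
		activate_object_plugin(inode);  /* step 4: from stat-data id,
						   or guessed from i_mode bits */
		inherit_plugins(inode, parent); /* step 5: fill in whatever is
						   still uninitialized */
	}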
58673+
58674+#include "../inode.h"
58675+
58676+static int _bugop(void)
58677+{
58678+ BUG_ON(1);
58679+ return 0;
58680+}
58681+
58682+#define bugop ((void *)_bugop)
58683+
58684+static int _dummyop(void)
58685+{
58686+ return 0;
58687+}
58688+
58689+#define dummyop ((void *)_dummyop)
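
/* a note on the two stubs above: the casts through void * let one stub fill
   table slots of differing prototypes in the plugin definitions below; bugop
   marks operations that must never be reached for a given plugin, while
   dummyop is a benign no-op */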
58690+
58691+static int change_file(struct inode *inode, reiser4_plugin * plugin)
58692+{
58693+ /* cannot change object plugin of already existing object */
58694+ return RETERR(-EINVAL);
58695+}
58696+
58697+static reiser4_plugin_ops file_plugin_ops = {
58698+ .change = change_file
58699+};
58700+
58701+/*
58702+ * Definitions of object plugins.
58703+ */
58704+
58705+file_plugin file_plugins[LAST_FILE_PLUGIN_ID] = {
58706+ [UNIX_FILE_PLUGIN_ID] = {
58707+ .h = {
58708+ .type_id = REISER4_FILE_PLUGIN_TYPE,
58709+ .id = UNIX_FILE_PLUGIN_ID,
58710+ .pops = &file_plugin_ops,
58711+ .label = "reg",
58712+ .desc = "regular file",
58713+ .linkage = {NULL, NULL},
58714+ },
58715+ .inode_ops = {
58716+ .permission = permission_common,
58717+ .setattr = setattr_unix_file,
58718+ .getattr = getattr_common
58719+ },
58720+ .file_ops = {
58721+ .llseek = generic_file_llseek,
58722+ .read = read_unix_file,
58723+ .write = write_unix_file,
58724+ .ioctl = ioctl_unix_file,
58725+ .mmap = mmap_unix_file,
58726+ .open = open_unix_file,
58727+ .release = release_unix_file,
58728+ .fsync = sync_unix_file,
58729+ .sendfile = sendfile_unix_file
58730+ },
58731+ .as_ops = {
58732+ .writepage = reiser4_writepage,
58733+ .readpage = readpage_unix_file,
58734+ .sync_page = block_sync_page,
58735+ .writepages = writepages_unix_file,
58736+ .set_page_dirty = reiser4_set_page_dirty,
58737+ .readpages = reiser4_readpages,
58738+ .prepare_write = prepare_write_unix_file,
58739+ .commit_write = commit_write_unix_file,
58740+ .bmap = bmap_unix_file,
58741+ .invalidatepage = reiser4_invalidatepage,
58742+ .releasepage = reiser4_releasepage
58743+ },
58744+ .write_sd_by_inode = write_sd_by_inode_common,
58745+ .flow_by_inode = flow_by_inode_unix_file,
58746+ .key_by_inode = key_by_inode_and_offset_common,
58747+ .set_plug_in_inode = set_plug_in_inode_common,
58748+ .adjust_to_parent = adjust_to_parent_common,
58749+ .create_object = create_object_common, /* this is not inode_operations's create */
58750+ .delete_object = delete_object_unix_file,
58751+ .add_link = add_link_common,
58752+ .rem_link = rem_link_common,
58753+ .owns_item = owns_item_unix_file,
58754+ .can_add_link = can_add_link_common,
58755+ .detach = dummyop,
58756+ .bind = dummyop,
58757+ .safelink = safelink_common,
58758+ .estimate = {
58759+ .create = estimate_create_common,
58760+ .update = estimate_update_common,
58761+ .unlink = estimate_unlink_common
58762+ },
58763+ .init_inode_data = init_inode_data_unix_file,
58764+ .cut_tree_worker = cut_tree_worker_common,
58765+ .wire = {
58766+ .write = wire_write_common,
58767+ .read = wire_read_common,
58768+ .get = wire_get_common,
58769+ .size = wire_size_common,
58770+ .done = wire_done_common
58771+ }
58772+ },
58773+ [DIRECTORY_FILE_PLUGIN_ID] = {
58774+ .h = {
58775+ .type_id = REISER4_FILE_PLUGIN_TYPE,
58776+ .id = DIRECTORY_FILE_PLUGIN_ID,
58777+ .pops = &file_plugin_ops,
58778+ .label = "dir",
58779+ .desc = "directory",
58780+ .linkage = {NULL, NULL}
58781+ },
58782+ .inode_ops = {NULL,},
58783+ .file_ops = {NULL,},
58784+ .as_ops = {NULL,},
58785+
58786+ .write_sd_by_inode = write_sd_by_inode_common,
58787+ .flow_by_inode = bugop,
58788+ .key_by_inode = bugop,
58789+ .set_plug_in_inode = set_plug_in_inode_common,
58790+ .adjust_to_parent = adjust_to_parent_common_dir,
58791+ .create_object = create_object_common,
58792+ .delete_object = delete_directory_common,
58793+ .add_link = add_link_common,
58794+ .rem_link = rem_link_common_dir,
58795+ .owns_item = owns_item_common_dir,
58796+ .can_add_link = can_add_link_common,
58797+ .can_rem_link = can_rem_link_common_dir,
58798+ .detach = detach_common_dir,
58799+ .bind = bind_common_dir,
58800+ .safelink = safelink_common,
58801+ .estimate = {
58802+ .create = estimate_create_common_dir,
58803+ .update = estimate_update_common,
58804+ .unlink = estimate_unlink_common_dir
58805+ },
58806+ .wire = {
58807+ .write = wire_write_common,
58808+ .read = wire_read_common,
58809+ .get = wire_get_common,
58810+ .size = wire_size_common,
58811+ .done = wire_done_common
58812+ },
58813+ .init_inode_data = init_inode_ordering,
58814+ .cut_tree_worker = cut_tree_worker_common,
58815+ },
58816+ [SYMLINK_FILE_PLUGIN_ID] = {
58817+ .h = {
58818+ .type_id = REISER4_FILE_PLUGIN_TYPE,
58819+ .id = SYMLINK_FILE_PLUGIN_ID,
58820+ .pops = &file_plugin_ops,
58821+ .label = "symlink",
58822+ .desc = "symbolic link",
58823+ .linkage = {NULL,NULL}
58824+ },
58825+ .inode_ops = {
58826+ .readlink = generic_readlink,
58827+ .follow_link = follow_link_common,
58828+ .permission = permission_common,
58829+ .setattr = setattr_common,
58830+ .getattr = getattr_common
58831+ },
58832+		 /* inode->i_fop of symlink is initialized to NULL in setup_inode_ops */
58833+ .file_ops = {NULL,},
58834+ .as_ops = {NULL,},
58835+
58836+ .write_sd_by_inode = write_sd_by_inode_common,
58837+ .set_plug_in_inode = set_plug_in_inode_common,
58838+ .adjust_to_parent = adjust_to_parent_common,
58839+ .create_object = create_symlink,
58840+ .delete_object = delete_object_common,
58841+ .add_link = add_link_common,
58842+ .rem_link = rem_link_common,
58843+ .can_add_link = can_add_link_common,
58844+ .detach = dummyop,
58845+ .bind = dummyop,
58846+ .safelink = safelink_common,
58847+ .estimate = {
58848+ .create = estimate_create_common,
58849+ .update = estimate_update_common,
58850+ .unlink = estimate_unlink_common
58851+ },
58852+ .init_inode_data = init_inode_ordering,
58853+ .cut_tree_worker = cut_tree_worker_common,
58854+ .destroy_inode = destroy_inode_symlink,
58855+ .wire = {
58856+ .write = wire_write_common,
58857+ .read = wire_read_common,
58858+ .get = wire_get_common,
58859+ .size = wire_size_common,
58860+ .done = wire_done_common
58861+ }
58862+ },
58863+ [SPECIAL_FILE_PLUGIN_ID] = {
58864+ .h = {
58865+ .type_id = REISER4_FILE_PLUGIN_TYPE,
58866+ .id = SPECIAL_FILE_PLUGIN_ID,
58867+ .pops = &file_plugin_ops,
58868+ .label = "special",
58869+ .desc =
58870+ "special: fifo, device or socket",
58871+ .linkage = {NULL, NULL}
58872+ },
58873+ .inode_ops = {
58874+ .permission = permission_common,
58875+ .setattr = setattr_common,
58876+ .getattr = getattr_common
58877+ },
58878+ /* file_ops of special files (sockets, block, char, fifo) are
58879+ initialized by init_special_inode. */
58880+ .file_ops = {NULL,},
58881+ .as_ops = {NULL,},
58882+
58883+ .write_sd_by_inode = write_sd_by_inode_common,
58884+ .set_plug_in_inode = set_plug_in_inode_common,
58885+ .adjust_to_parent = adjust_to_parent_common,
58886+ .create_object = create_object_common,
58887+ .delete_object = delete_object_common,
58888+ .add_link = add_link_common,
58889+ .rem_link = rem_link_common,
58890+ .owns_item = owns_item_common,
58891+ .can_add_link = can_add_link_common,
58892+ .detach = dummyop,
58893+ .bind = dummyop,
58894+ .safelink = safelink_common,
58895+ .estimate = {
58896+ .create = estimate_create_common,
58897+ .update = estimate_update_common,
58898+ .unlink = estimate_unlink_common
58899+ },
58900+ .init_inode_data = init_inode_ordering,
58901+ .cut_tree_worker = cut_tree_worker_common,
58902+ .wire = {
58903+ .write = wire_write_common,
58904+ .read = wire_read_common,
58905+ .get = wire_get_common,
58906+ .size = wire_size_common,
58907+ .done = wire_done_common
58908+ }
58909+ },
58910+ [CRC_FILE_PLUGIN_ID] = {
58911+ .h = {
58912+ .type_id = REISER4_FILE_PLUGIN_TYPE,
58913+ .id = CRC_FILE_PLUGIN_ID,
58914+ .pops = &cryptcompress_plugin_ops,
58915+ .label = "cryptcompress",
58916+ .desc = "cryptcompress file",
58917+ .linkage = {NULL, NULL}
58918+ },
58919+ .inode_ops = {
58920+ .permission = permission_common,
58921+ .setattr = setattr_cryptcompress,
58922+ .getattr = getattr_common
58923+ },
58924+ .file_ops = {
58925+ .llseek = generic_file_llseek,
58926+ .read = read_cryptcompress,
58927+ .write = write_cryptcompress,
58928+ .mmap = mmap_cryptcompress,
58929+ .release = release_cryptcompress,
58930+ .fsync = sync_common,
58931+ .sendfile = sendfile_cryptcompress
58932+ },
58933+ .as_ops = {
58934+ .writepage = reiser4_writepage,
58935+ .readpage = readpage_cryptcompress,
58936+ .sync_page = block_sync_page,
58937+ .writepages = writepages_cryptcompress,
58938+ .set_page_dirty = reiser4_set_page_dirty,
58939+ .readpages = reiser4_readpages,
58940+ .prepare_write = prepare_write_common,
58941+ .invalidatepage = reiser4_invalidatepage,
58942+ .releasepage = reiser4_releasepage
58943+ },
58944+ .write_sd_by_inode = write_sd_by_inode_common,
58945+ .flow_by_inode = flow_by_inode_cryptcompress,
58946+ .key_by_inode = key_by_inode_cryptcompress,
58947+ .set_plug_in_inode = set_plug_in_inode_common,
58948+ .adjust_to_parent = adjust_to_parent_cryptcompress,
58949+ .create_object = create_cryptcompress,
58950+ .open_object = open_cryptcompress,
58951+ .delete_object = delete_cryptcompress,
58952+ .add_link = add_link_common,
58953+ .rem_link = rem_link_common,
58954+ .owns_item = owns_item_common,
58955+ .can_add_link = can_add_link_common,
58956+ .detach = dummyop,
58957+ .bind = dummyop,
58958+ .safelink = safelink_common,
58959+ .estimate = {
58960+ .create = estimate_create_common,
58961+ .update = estimate_update_common,
58962+ .unlink = estimate_unlink_common
58963+ },
58964+ .init_inode_data = init_inode_data_cryptcompress,
58965+ .cut_tree_worker = cut_tree_worker_cryptcompress,
58966+ .destroy_inode = destroy_inode_cryptcompress,
58967+ .wire = {
58968+ .write = wire_write_common,
58969+ .read = wire_read_common,
58970+ .get = wire_get_common,
58971+ .size = wire_size_common,
58972+ .done = wire_done_common
58973+ }
58974+ }
58975+};
58976+
58977+static int change_dir(struct inode *inode, reiser4_plugin * plugin)
58978+{
58979+ /* cannot change dir plugin of already existing object */
58980+ return RETERR(-EINVAL);
58981+}
58982+
58983+static reiser4_plugin_ops dir_plugin_ops = {
58984+ .change = change_dir
58985+};
58986+
58987+/*
58988+ * definition of directory plugins
58989+ */
58990+
58991+dir_plugin dir_plugins[LAST_DIR_ID] = {
58992+ /* standard hashed directory plugin */
58993+ [HASHED_DIR_PLUGIN_ID] = {
58994+ .h = {
58995+ .type_id = REISER4_DIR_PLUGIN_TYPE,
58996+ .id = HASHED_DIR_PLUGIN_ID,
58997+ .pops = &dir_plugin_ops,
58998+ .label = "dir",
58999+ .desc = "hashed directory",
59000+ .linkage = {NULL, NULL}
59001+ },
59002+ .inode_ops = {
59003+ .create = create_common,
59004+ .lookup = lookup_common,
59005+ .link = link_common,
59006+ .unlink = unlink_common,
59007+ .symlink = symlink_common,
59008+ .mkdir = mkdir_common,
59009+ .rmdir = unlink_common,
59010+ .mknod = mknod_common,
59011+ .rename = rename_common,
59012+ .permission = permission_common,
59013+ .setattr = setattr_common,
59014+ .getattr = getattr_common
59015+ },
59016+ .file_ops = {
59017+ .llseek = llseek_common_dir,
59018+ .read = generic_read_dir,
59019+ .readdir = readdir_common,
59020+ .release = release_dir_common,
59021+ .fsync = sync_common
59022+ },
59023+ .as_ops = {
59024+ .writepage = bugop,
59025+ .sync_page = bugop,
59026+ .writepages = dummyop,
59027+ .set_page_dirty = bugop,
59028+ .readpages = bugop,
59029+ .prepare_write = bugop,
59030+ .commit_write = bugop,
59031+ .bmap = bugop,
59032+ .invalidatepage = bugop,
59033+ .releasepage = bugop
59034+ },
59035+ .get_parent = get_parent_common,
59036+ .is_name_acceptable = is_name_acceptable_common,
59037+ .build_entry_key = build_entry_key_hashed,
59038+ .build_readdir_key = build_readdir_key_common,
59039+ .add_entry = add_entry_common,
59040+ .rem_entry = rem_entry_common,
59041+ .init = init_common,
59042+ .done = done_common,
59043+ .attach = attach_common,
59044+ .detach = detach_common,
59045+ .estimate = {
59046+ .add_entry = estimate_add_entry_common,
59047+ .rem_entry = estimate_rem_entry_common,
59048+ .unlink = dir_estimate_unlink_common
59049+ }
59050+ },
59051+ /* hashed directory for which seekdir/telldir are guaranteed to
59052+ * work. Brain-damage. */
59053+ [SEEKABLE_HASHED_DIR_PLUGIN_ID] = {
59054+ .h = {
59055+ .type_id = REISER4_DIR_PLUGIN_TYPE,
59056+ .id = SEEKABLE_HASHED_DIR_PLUGIN_ID,
59057+ .pops = &dir_plugin_ops,
59058+ .label = "dir32",
59059+ .desc = "directory hashed with 31 bit hash",
59060+ .linkage = {NULL, NULL}
59061+ },
59062+ .inode_ops = {
59063+ .create = create_common,
59064+ .lookup = lookup_common,
59065+ .link = link_common,
59066+ .unlink = unlink_common,
59067+ .symlink = symlink_common,
59068+ .mkdir = mkdir_common,
59069+ .rmdir = unlink_common,
59070+ .mknod = mknod_common,
59071+ .rename = rename_common,
59072+ .permission = permission_common,
59073+ .setattr = setattr_common,
59074+ .getattr = getattr_common
59075+ },
59076+ .file_ops = {
59077+ .llseek = llseek_common_dir,
59078+ .read = generic_read_dir,
59079+ .readdir = readdir_common,
59080+ .release = release_dir_common,
59081+ .fsync = sync_common
59082+ },
59083+ .as_ops = {
59084+ .writepage = bugop,
59085+ .sync_page = bugop,
59086+ .writepages = dummyop,
59087+ .set_page_dirty = bugop,
59088+ .readpages = bugop,
59089+ .prepare_write = bugop,
59090+ .commit_write = bugop,
59091+ .bmap = bugop,
59092+ .invalidatepage = bugop,
59093+ .releasepage = bugop
59094+ },
59095+ .get_parent = get_parent_common,
59096+ .is_name_acceptable = is_name_acceptable_common,
59097+ .build_entry_key = build_entry_key_seekable,
59098+ .build_readdir_key = build_readdir_key_common,
59099+ .add_entry = add_entry_common,
59100+ .rem_entry = rem_entry_common,
59101+ .init = init_common,
59102+ .done = done_common,
59103+ .attach = attach_common,
59104+ .detach = detach_common,
59105+ .estimate = {
59106+ .add_entry = estimate_add_entry_common,
59107+ .rem_entry = estimate_rem_entry_common,
59108+ .unlink = dir_estimate_unlink_common
59109+ }
59110+ }
59111+};
59112+
59113+/* Make Linus happy.
59114+ Local variables:
59115+ c-indentation-style: "K&R"
59116+ mode-name: "LC"
59117+ c-basic-offset: 8
59118+ tab-width: 8
59119+ fill-column: 120
59120+ End:
59121+*/
59122Index: linux-2.6.16/fs/reiser4/plugin/object.h
59123===================================================================
59124--- /dev/null
59125+++ linux-2.6.16/fs/reiser4/plugin/object.h
59126@@ -0,0 +1,121 @@
59127+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by
59128+ * reiser4/README */
59129+
59130+/* Declaration of object plugin functions. */
59131+
59132+#if !defined( __FS_REISER4_PLUGIN_OBJECT_H__ )
59133+#define __FS_REISER4_PLUGIN_OBJECT_H__
59134+
59135+#include "../type_safe_hash.h"
59136+
59137+/* common implementations of inode operations */
59138+int create_common(struct inode *parent, struct dentry *dentry,
59139+ int mode, struct nameidata *);
59140+struct dentry *lookup_common(struct inode *parent, struct dentry *dentry,
59141+ struct nameidata *nameidata);
59142+int link_common(struct dentry *existing, struct inode *parent,
59143+ struct dentry *newname);
59144+int unlink_common(struct inode *parent, struct dentry *victim);
59145+int mkdir_common(struct inode *parent, struct dentry *dentry, int mode);
59146+int symlink_common(struct inode *parent, struct dentry *dentry,
59147+ const char *linkname);
59148+int mknod_common(struct inode *parent, struct dentry *dentry,
59149+ int mode, dev_t rdev);
59150+int rename_common(struct inode *old_dir, struct dentry *old_name,
59151+ struct inode *new_dir, struct dentry *new_name);
59152+void *follow_link_common(struct dentry *, struct nameidata *data);
59153+int permission_common(struct inode *, int mask, /* mode bits to check permissions for */
59154+ struct nameidata *nameidata);
59155+int setattr_common(struct dentry *, struct iattr *);
59156+int getattr_common(struct vfsmount *mnt, struct dentry *, struct kstat *);
59157+
59158+/* common implementations of file operations */
59159+loff_t llseek_common_dir(struct file *, loff_t off, int origin);
59160+int readdir_common(struct file *, void *dirent, filldir_t);
59161+int release_dir_common(struct inode *, struct file *);
59162+int sync_common(struct file *, struct dentry *, int datasync);
59163+
59164+/* common implementations of address space operations */
59165+int prepare_write_common(struct file *, struct page *, unsigned from,
59166+ unsigned to);
59167+
59168+/* file plugin operations: common implementations */
59169+int write_sd_by_inode_common(struct inode *);
59170+int key_by_inode_and_offset_common(struct inode *, loff_t, reiser4_key *);
59171+int set_plug_in_inode_common(struct inode *object, struct inode *parent,
59172+ reiser4_object_create_data *);
59173+int adjust_to_parent_common(struct inode *object, struct inode *parent,
59174+ struct inode *root);
59175+int adjust_to_parent_common_dir(struct inode *object, struct inode *parent,
59176+ struct inode *root);
59177+int adjust_to_parent_cryptcompress(struct inode *object, struct inode *parent,
59178+ struct inode *root);
59179+int create_object_common(struct inode *object, struct inode *parent,
59180+ reiser4_object_create_data *);
59181+int delete_object_common(struct inode *);
59182+int delete_directory_common(struct inode *);
59183+int add_link_common(struct inode *object, struct inode *parent);
59184+int rem_link_common(struct inode *object, struct inode *parent);
59185+int rem_link_common_dir(struct inode *object, struct inode *parent);
59186+int owns_item_common(const struct inode *, const coord_t *);
59187+int owns_item_common_dir(const struct inode *, const coord_t *);
59188+int can_add_link_common(const struct inode *);
59189+int can_rem_link_common_dir(const struct inode *);
59190+int detach_common_dir(struct inode *child, struct inode *parent);
59191+int open_cryptcompress(struct inode * inode, struct file * file);
59192+int bind_common_dir(struct inode *child, struct inode *parent);
59193+int safelink_common(struct inode *, reiser4_safe_link_t, __u64 value);
59194+reiser4_block_nr estimate_create_common(const struct inode *);
59195+reiser4_block_nr estimate_create_common_dir(const struct inode *);
59196+reiser4_block_nr estimate_update_common(const struct inode *);
59197+reiser4_block_nr estimate_unlink_common(const struct inode *,
59198+ const struct inode *);
59199+reiser4_block_nr estimate_unlink_common_dir(const struct inode *,
59200+ const struct inode *);
59201+char *wire_write_common(struct inode *, char *start);
59202+char *wire_read_common(char *addr, reiser4_object_on_wire *);
59203+struct dentry *wire_get_common(struct super_block *, reiser4_object_on_wire *);
59204+int wire_size_common(struct inode *);
59205+void wire_done_common(reiser4_object_on_wire *);
59206+
59207+/* dir plugin operations: common implementations */
59208+struct dentry *get_parent_common(struct inode *child);
59209+int is_name_acceptable_common(const struct inode *, const char *name, int len);
59210+void build_entry_key_common(const struct inode *,
59211+ const struct qstr *qname, reiser4_key *);
59212+int build_readdir_key_common(struct file *dir, reiser4_key *);
59213+int add_entry_common(struct inode *object, struct dentry *where,
59214+ reiser4_object_create_data *, reiser4_dir_entry_desc *);
59215+int rem_entry_common(struct inode *object, struct dentry *where,
59216+ reiser4_dir_entry_desc *);
59217+int init_common(struct inode *object, struct inode *parent,
59218+ reiser4_object_create_data *);
59219+int done_common(struct inode *);
59220+int attach_common(struct inode *child, struct inode *parent);
59221+int detach_common(struct inode *object, struct inode *parent);
59222+reiser4_block_nr estimate_add_entry_common(const struct inode *);
59223+reiser4_block_nr estimate_rem_entry_common(const struct inode *);
59224+reiser4_block_nr dir_estimate_unlink_common(const struct inode *,
59225+ const struct inode *);
59226+
59227+/* these are essential parts of common implementations, they are to make
59228+ customized implementations easier */
59229+int do_prepare_write(struct file *, struct page *, unsigned from, unsigned to);
59230+
59231+/* merely useful functions */
59232+int lookup_sd(struct inode *, znode_lock_mode, coord_t *, lock_handle *,
59233+ const reiser4_key *, int silent);
59234+
59235+
59236+/* __FS_REISER4_PLUGIN_OBJECT_H__ */
59237+#endif
59238+
59239+/* Make Linus happy.
59240+ Local variables:
59241+ c-indentation-style: "K&R"
59242+ mode-name: "LC"
59243+ c-basic-offset: 8
59244+ tab-width: 8
59245+ fill-column: 120
59246+ End:
59247+*/
59248Index: linux-2.6.16/fs/reiser4/plugin/plugin.c
59249===================================================================
59250--- /dev/null
59251+++ linux-2.6.16/fs/reiser4/plugin/plugin.c
59252@@ -0,0 +1,533 @@
59253+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
59254+ * reiser4/README */
59255+
59256+/* Basic plugin infrastructure, lookup etc. */
59257+
59258+/* PLUGINS:
59259+
59260+ Plugins are internal Reiser4 "modules" or "objects" used to increase
59261+ extensibility and allow external users to easily adapt reiser4 to
59262+ their needs.
59263+
59264+ Plugins are classified into several disjoint "types". Plugins
59265+ belonging to the particular plugin type are termed "instances" of
59266+ this type. Currently the following types are present:
59267+
59268+ . object plugin
59269+ . hash plugin
59270+ . tail plugin
59271+ . perm plugin
59272+ . item plugin
59273+ . node layout plugin
59274+
59275+NIKITA-FIXME-HANS: update this list, and review this entire comment for currency
59276+
59277+   Object (file) plugin determines how a given file-system object serves
59278+   standard VFS requests for read, write, seek, mmap etc. Instances of
59279+   file plugins are: regular file, directory, symlink. Another example
59280+   of a file plugin is the audit plugin, which optionally records accesses
59281+   to the underlying object and forwards requests to it.
59282+
59283+ Hash plugins compute hashes used by reiser4 to store and locate
59284+ files within directories. Instances of hash plugin type are: r5,
59285+ tea, rupasov.
59286+
59287+ Tail plugins (or, more precisely, tail policy plugins) determine
59288+   when the last part of the file should be stored in a formatted item.
59289+
59290+ Perm plugins control permissions granted for a process accessing a file.
59291+
59292+ Scope and lookup:
59293+
59294+   Each plugin type and each plugin has a label such that the pair
59295+   (type_label, plugin_label) is unique. This pair is a globally
59296+   persistent and user-visible plugin identifier. Internally the kernel
59297+   maintains plugins and plugin types in arrays, using an index into
59298+   those arrays as plugin and plugin type identifiers. The file system,
59299+   in turn, also maintains a persistent "dictionary" mapping from plugin
59300+   label to the numerical identifier which is stored in file-system
59301+   objects. That is, we store the offset into the plugin array for that
59302+   plugin type as the plugin id in the stat data of the filesystem object.
59303+
59304+ plugin_labels have meaning for the user interface that assigns
59305+ plugins to files, and may someday have meaning for dynamic loading of
59306+ plugins and for copying of plugins from one fs instance to
59307+ another by utilities like cp and tar.
59308+
59309+ Internal kernel plugin type identifier (index in plugins[] array) is
59310+ of type reiser4_plugin_type. Set of available plugin types is
59311+ currently static, but dynamic loading doesn't seem to pose
59312+ insurmountable problems.
59313+
59314+ Within each type plugins are addressed by the identifiers of type
59315+ reiser4_plugin_id (indices in
59316+ reiser4_plugin_type_data.builtin[]). Such identifiers are only
59317+ required to be unique within one type, not globally.
59318+
59319+ Thus, plugin in memory is uniquely identified by the pair (type_id,
59320+ id).
59321+
59322+ Usage:
59323+
59324+   There exists only one instance of each plugin, but this single
59325+   instance can be associated with many entities (file-system objects,
59326+   items, nodes, transactions, file-descriptors etc.). An entity to
59327+   which a plugin of a given type is attached is termed (due to the lack
59328+   of imagination) the "subject" of this plugin type and, by abuse of
59329+   terminology, the subject of the particular instance of this type to
59330+   which it is currently attached. For example, an inode is a subject of
59331+   the object plugin type. An inode representing a directory is a subject
59332+   of the directory plugin, of the hash plugin type and of some particular
59333+   instance of the hash plugin type. An inode representing a regular file
59334+   is a subject of the "regular file" plugin, the tail-policy plugin type etc.
59335+
59336+ With each subject the plugin possibly stores some state. For example,
59337+   the state of a directory plugin (an instance of the object plugin type)
59338+   is a pointer to a hash plugin (if directories always use hashing, that is).
59339+   The state of the audit plugin is the file descriptor (struct file) of the
59340+   log file, or some magic value to do logging through printk().
59341+
59342+ Interface:
59343+
59344+ In addition to a scalar identifier, each plugin type and plugin
59345+   proper has a "label" (a short string) and a "description" (a longer
59346+   descriptive string). Labels and descriptions of plugin types are
59347+ hard-coded into plugins[] array, declared and defined in
59348+ plugin.c. Label and description of plugin are stored in .label and
59349+ .desc fields of reiser4_plugin_header respectively. It's possible to
59350+ locate plugin by the pair of labels.
59351+
59352+ Features:
59353+
59354+ . user-level plugin manipulations:
59355+ + reiser4("filename/..file_plugin<='audit'");
59356+ + write(open("filename/..file_plugin"), "audit", 8);
59357+
59358+ . user level utilities lsplug and chplug to manipulate plugins.
59359+     Utilities are not of primary priority. Possibly they will not be
59360+     working in v4.0.
59361+
59362+NIKITA-FIXME-HANS: this should be a mkreiserfs option not a mount option, do you agree? I don't think that specifying it at mount time, and then changing it with each mount, is a good model for usage.
59363+
59364+ . mount option "plug" to set-up plugins of root-directory.
59365+ "plug=foo:bar" will set "bar" as default plugin of type "foo".
59366+
59367+ Limitations:
59368+
59369+ . each plugin type has to provide at least one builtin
59370+ plugin. This is technical limitation and it can be lifted in the
59371+ future.
59372+
59373+ TODO:
59374+
59375+   New plugin types/plugins:
59376+ Things we should be able to separately choose to inherit:
59377+
59378+ security plugins
59379+
59380+ stat data
59381+
59382+ file bodies
59383+
59384+ file plugins
59385+
59386+ dir plugins
59387+
59388+ . perm:acl
59389+
59390+   d audi---audit plugin intercepting and possibly logging all
59391+   accesses to an object. Requires putting stub functions in
59392+   file_operations instead of generic_file_*.
59393+
59394+NIKITA-FIXME-HANS: why make overflows a plugin?
59395+ . over---handle hash overflows
59396+
59397+ . sqnt---handle different access patterns and instruments read-ahead
59398+
59399+NIKITA-FIXME-HANS: describe the line below in more detail.
59400+
59401+ . hier---handle inheritance of plugins along file-system hierarchy
59402+
59403+ Different kinds of inheritance: on creation vs. on access.
59404+ Compatible/incompatible plugins.
59405+ Inheritance for multi-linked files.
59406+ Layered plugins.
59407+ Notion of plugin context is abandoned.
59408+
59409+Each file is associated
59410+ with one plugin, and dependent plugins (hash, etc.) are stored as
59411+ main plugin state. Now, if we have plugins used for regular files
59412+ but not for directories, how would such plugins be inherited?
59413+ . always store them with directories also
59414+
59415+NIKITA-FIXME-HANS: Do the line above. It is not exclusive of doing the line below, which is also useful.
59416+
59417+ . use inheritance hierarchy, independent of file-system namespace
59418+
59419+*/
59420+
59421+#include "../debug.h"
59422+#include "../dformat.h"
59423+#include "plugin_header.h"
59424+#include "item/static_stat.h"
59425+#include "node/node.h"
59426+#include "security/perm.h"
59427+#include "space/space_allocator.h"
59428+#include "disk_format/disk_format.h"
59429+#include "plugin.h"
59430+#include "../reiser4.h"
59431+#include "../jnode.h"
59432+#include "../inode.h"
59433+
59434+#include <linux/fs.h> /* for struct super_block */
59435+
59436+/* public interface */
59437+
59438+/* initialize the plugin sub-system. Call this once at reiser4 startup. */
59439+int init_plugins(void);
59440+int setup_plugins(struct super_block *super, reiser4_plugin ** area);
59441+int locate_plugin(struct inode *inode, plugin_locator * loc);
59442+
59443+
59444+/**
59445+ * init_plugins - initialize plugins
59446+ *
59447+ * Initializes the plugin sub-system. It is part of reiser4 module
59448+ * initialization. For each plugin of each type the init method is called,
59449+ * and each plugin is put on its type's list of plugins.
59450+ */
59451+int init_plugins(void)
59452+{
59453+ reiser4_plugin_type type_id;
59454+
59455+ for (type_id = 0; type_id < REISER4_PLUGIN_TYPES; ++type_id) {
59456+ reiser4_plugin_type_data *ptype;
59457+ int i;
59458+
59459+ ptype = &plugins[type_id];
59460+ assert("nikita-3508", ptype->label != NULL);
59461+ assert("nikita-3509", ptype->type_id == type_id);
59462+
59463+ INIT_LIST_HEAD(&ptype->plugins_list);
59464+/* NIKITA-FIXME-HANS: change builtin_num to some other name lacking the term builtin. */
59465+ for (i = 0; i < ptype->builtin_num; ++i) {
59466+ reiser4_plugin *plugin;
59467+
59468+ plugin = plugin_at(ptype, i);
59469+
59470+ if (plugin->h.label == NULL)
59471+ /* uninitialized slot encountered */
59472+ continue;
59473+ assert("nikita-3445", plugin->h.type_id == type_id);
59474+ plugin->h.id = i;
59475+ if (plugin->h.pops != NULL &&
59476+ plugin->h.pops->init != NULL) {
59477+ int result;
59478+
59479+ result = plugin->h.pops->init(plugin);
59480+ if (result != 0)
59481+ return result;
59482+ }
59483+ INIT_LIST_HEAD(&plugin->h.linkage);
59484+ list_add_tail(&plugin->h.linkage, &ptype->plugins_list);
59485+ }
59486+ }
59487+ return 0;
59488+}
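+/*
+ * Call-site sketch (hedged): the real reiser4 module initialization lives
+ * elsewhere in this patch and does more work; reiser4_init_sketch() is a
+ * hypothetical illustration of where init_plugins() belongs (assumes
+ * <linux/init.h> for __init).
+ */
+static int __init reiser4_init_sketch(void)
+{
+	int result;
+
+	/* register every builtin plugin of every type exactly once */
+	result = init_plugins();
+	if (result != 0)
+		return result;
+	/* ... create caches, call register_filesystem(), etc. ... */
+	return 0;
+}
+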
59489+
59490+/* true if plugin type id is valid */
59491+int is_type_id_valid(reiser4_plugin_type type_id /* plugin type id */ )
59492+{
59493+ /* "type_id" is unsigned, so no comparison with 0 is
59494+ necessary */
59495+ return (type_id < REISER4_PLUGIN_TYPES);
59496+}
59497+
59498+/* true if plugin id is valid */
59499+int is_plugin_id_valid(reiser4_plugin_type type_id /* plugin type id */ ,
59500+ reiser4_plugin_id id /* plugin id */ )
59501+{
59502+ assert("nikita-1653", is_type_id_valid(type_id));
59503+ return id < plugins[type_id].builtin_num;
59504+}
59505+
59506+/* return plugin by its @type_id and @id.
59507+
59508+   Both arguments are checked for validity: this is supposed to be called
59509+   on ids coming from user level.
59510+
59511+NIKITA-FIXME-HANS: Do you instead mean that this checks ids created in
59512+user space, and passed to the filesystem by use of method files? Your
59513+comment really confused me on the first reading....
59514+
59515+*/
59516+reiser4_plugin *plugin_by_unsafe_id(reiser4_plugin_type type_id /* plugin
59517+ * type id,
59518+ * unchecked */ ,
59519+ reiser4_plugin_id id /* plugin id,
59520+ * unchecked */ )
59521+{
59522+ if (is_type_id_valid(type_id)) {
59523+ if (is_plugin_id_valid(type_id, id))
59524+ return plugin_at(&plugins[type_id], id);
59525+ else
59526+ /* id out of bounds */
59527+ warning("nikita-2913",
59528+ "Invalid plugin id: [%i:%i]", type_id, id);
59529+ } else
59530+ /* type_id out of bounds */
59531+ warning("nikita-2914", "Invalid type_id: %i", type_id);
59532+ return NULL;
59533+}
59534+
59535+/**
59536+ * save_plugin_id - store plugin id in disk format
59537+ * @plugin: plugin to convert
59538+ * @area: where to store result
59539+ *
59540+ * Puts id of @plugin in little endian format to address @area.
59541+ */
59542+int save_plugin_id(reiser4_plugin *plugin /* plugin to convert */ ,
59543+ d16 *area /* where to store result */ )
59544+{
59545+ assert("nikita-1261", plugin != NULL);
59546+ assert("nikita-1262", area != NULL);
59547+
59548+ put_unaligned(cpu_to_le16(plugin->h.id), area);
59549+ return 0;
59550+}
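+/*
+ * Load-side sketch (hypothetical helper, not part of the patch): decode a
+ * little-endian plugin id written by save_plugin_id() and resolve it,
+ * rejecting out-of-bounds values via plugin_by_unsafe_id().
+ */
+static inline reiser4_plugin *load_plugin_id_sketch(reiser4_plugin_type type_id,
+						    const d16 *area)
+{
+	reiser4_plugin_id id;
+
+	id = le16_to_cpu(get_unaligned(area));
+	return plugin_by_unsafe_id(type_id, id);
+}
+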
59551+
59552+/* list of all plugins of given type */
59553+struct list_head *get_plugin_list(reiser4_plugin_type type_id /* plugin type
59554+ * id */ )
59555+{
59556+ assert("nikita-1056", is_type_id_valid(type_id));
59557+ return &plugins[type_id].plugins_list;
59558+}
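+/*
+ * Lookup-by-label sketch (hypothetical helper): walk the per-type list
+ * built by init_plugins(); assumes list_for_each_entry() from
+ * <linux/list.h> and strcmp() are available here.
+ */
+static inline reiser4_plugin *plugin_by_label_sketch(reiser4_plugin_type type_id,
+						     const char *label)
+{
+	reiser4_plugin *plug;
+
+	list_for_each_entry(plug, get_plugin_list(type_id), h.linkage)
+		if (!strcmp(plug->h.label, label))
+			return plug;
+	return NULL;
+}
+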
59559+
59560+int grab_plugin(struct inode *self, struct inode *ancestor, pset_member memb)
59561+{
59562+ reiser4_plugin *plug;
59563+ reiser4_inode *parent;
59564+
59565+ parent = reiser4_inode_data(ancestor);
59566+ plug = pset_get(parent->hset, memb) ? : pset_get(parent->pset, memb);
59567+ return grab_plugin_from(self, memb, plug);
59568+}
59569+
59570+static void update_plugin_mask(reiser4_inode * info, pset_member memb)
59571+{
59572+ struct dentry *rootdir;
59573+ reiser4_inode *root;
59574+
59575+ rootdir = inode_by_reiser4_inode(info)->i_sb->s_root;
59576+ if (rootdir != NULL) {
59577+ root = reiser4_inode_data(rootdir->d_inode);
59578+ /*
59579+ * if inode is different from the default one, or we are
59580+ * changing plugin of root directory, update plugin_mask
59581+ */
59582+ if (pset_get(info->pset, memb) != pset_get(root->pset, memb) ||
59583+ info == root)
59584+ info->plugin_mask |= (1 << memb);
59585+ }
59586+}
59587+
59588+int
59589+grab_plugin_from(struct inode *self, pset_member memb, reiser4_plugin * plug)
59590+{
59591+ reiser4_inode *info;
59592+ int result = 0;
59593+
59594+ info = reiser4_inode_data(self);
59595+ if (pset_get(info->pset, memb) == NULL) {
59596+ result = pset_set(&info->pset, memb, plug);
59597+ if (result == 0)
59598+ update_plugin_mask(info, memb);
59599+ }
59600+ return result;
59601+}
59602+
59603+int force_plugin(struct inode *self, pset_member memb, reiser4_plugin * plug)
59604+{
59605+ reiser4_inode *info;
59606+ int result = 0;
59607+
59608+ info = reiser4_inode_data(self);
59609+ if (plug->h.pops != NULL && plug->h.pops->change != NULL)
59610+ result = plug->h.pops->change(self, plug);
59611+ else
59612+ result = pset_set(&info->pset, memb, plug);
59613+ if (result == 0)
59614+ update_plugin_mask(info, memb);
59615+ return result;
59616+}
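+/*
+ * Usage sketch (hypothetical): on creation a child inode typically
+ * inherits plugins with grab_plugin(), which is a no-op for pset members
+ * that are already set; force_plugin() overrides unconditionally and goes
+ * through the plugin's ->change() hook when one is defined.
+ */
+static inline int inherit_hash_sketch(struct inode *child,
+				      struct inode *parent)
+{
+	/* no-op if the child's PSET_HASH member is already set */
+	return grab_plugin(child, parent, PSET_HASH);
+}
+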
59617+
59618+reiser4_plugin_type_data plugins[REISER4_PLUGIN_TYPES] = {
59619+ /* C90 initializers */
59620+ [REISER4_FILE_PLUGIN_TYPE] = {
59621+ .type_id = REISER4_FILE_PLUGIN_TYPE,
59622+ .label = "file",
59623+ .desc = "Object plugins",
59624+ .builtin_num = sizeof_array(file_plugins),
59625+ .builtin = file_plugins,
59626+ .plugins_list = {NULL, NULL},
59627+ .size = sizeof(file_plugin)
59628+ },
59629+ [REISER4_DIR_PLUGIN_TYPE] = {
59630+ .type_id = REISER4_DIR_PLUGIN_TYPE,
59631+ .label = "dir",
59632+ .desc = "Directory plugins",
59633+ .builtin_num = sizeof_array(dir_plugins),
59634+ .builtin = dir_plugins,
59635+ .plugins_list = {NULL, NULL},
59636+ .size = sizeof(dir_plugin)
59637+ },
59638+ [REISER4_HASH_PLUGIN_TYPE] = {
59639+ .type_id = REISER4_HASH_PLUGIN_TYPE,
59640+ .label = "hash",
59641+ .desc = "Directory hashes",
59642+ .builtin_num = sizeof_array(hash_plugins),
59643+ .builtin = hash_plugins,
59644+ .plugins_list = {NULL, NULL},
59645+ .size = sizeof(hash_plugin)
59646+ },
59647+ [REISER4_FIBRATION_PLUGIN_TYPE] = {
59648+ .type_id =
59649+ REISER4_FIBRATION_PLUGIN_TYPE,
59650+ .label = "fibration",
59651+ .desc = "Directory fibrations",
59652+ .builtin_num = sizeof_array(fibration_plugins),
59653+ .builtin = fibration_plugins,
59654+ .plugins_list = {NULL, NULL},
59655+ .size = sizeof(fibration_plugin)
59656+ },
59657+ [REISER4_CIPHER_PLUGIN_TYPE] = {
59658+ .type_id = REISER4_CIPHER_PLUGIN_TYPE,
59659+ .label = "cipher",
59660+ .desc = "Cipher plugins",
59661+ .builtin_num = sizeof_array(cipher_plugins),
59662+ .builtin = cipher_plugins,
59663+ .plugins_list = {NULL, NULL},
59664+ .size = sizeof(cipher_plugin)
59665+ },
59666+ [REISER4_DIGEST_PLUGIN_TYPE] = {
59667+ .type_id = REISER4_DIGEST_PLUGIN_TYPE,
59668+ .label = "digest",
59669+ .desc = "Digest plugins",
59670+ .builtin_num = sizeof_array(digest_plugins),
59671+ .builtin = digest_plugins,
59672+ .plugins_list = {NULL, NULL},
59673+ .size = sizeof(digest_plugin)
59674+ },
59675+ [REISER4_COMPRESSION_PLUGIN_TYPE] = {
59676+ .type_id = REISER4_COMPRESSION_PLUGIN_TYPE,
59677+ .label = "compression",
59678+ .desc = "Compression plugins",
59679+ .builtin_num = sizeof_array(compression_plugins),
59680+ .builtin = compression_plugins,
59681+ .plugins_list = {NULL, NULL},
59682+ .size = sizeof(compression_plugin)
59683+ },
59684+ [REISER4_FORMATTING_PLUGIN_TYPE] = {
59685+ .type_id = REISER4_FORMATTING_PLUGIN_TYPE,
59686+ .label = "formatting",
59687+ .desc = "Tail inlining policies",
59688+ .builtin_num = sizeof_array(formatting_plugins),
59689+ .builtin = formatting_plugins,
59690+ .plugins_list = {NULL, NULL},
59691+ .size = sizeof(formatting_plugin)
59692+ },
59693+ [REISER4_PERM_PLUGIN_TYPE] = {
59694+ .type_id = REISER4_PERM_PLUGIN_TYPE,
59695+ .label = "perm",
59696+ .desc = "Permission checks",
59697+ .builtin_num = sizeof_array(perm_plugins),
59698+ .builtin = perm_plugins,
59699+ .plugins_list = {NULL, NULL},
59700+ .size = sizeof(perm_plugin)
59701+ },
59702+ [REISER4_ITEM_PLUGIN_TYPE] = {
59703+ .type_id = REISER4_ITEM_PLUGIN_TYPE,
59704+ .label = "item",
59705+ .desc = "Item handlers",
59706+ .builtin_num = sizeof_array(item_plugins),
59707+ .builtin = item_plugins,
59708+ .plugins_list = {NULL, NULL},
59709+ .size = sizeof(item_plugin)
59710+ },
59711+ [REISER4_NODE_PLUGIN_TYPE] = {
59712+ .type_id = REISER4_NODE_PLUGIN_TYPE,
59713+ .label = "node",
59714+ .desc = "node layout handlers",
59715+ .builtin_num = sizeof_array(node_plugins),
59716+ .builtin = node_plugins,
59717+ .plugins_list = {NULL, NULL},
59718+ .size = sizeof(node_plugin)
59719+ },
59720+ [REISER4_SD_EXT_PLUGIN_TYPE] = {
59721+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
59722+ .label = "sd_ext",
59723+ .desc = "Parts of stat-data",
59724+ .builtin_num = sizeof_array(sd_ext_plugins),
59725+ .builtin = sd_ext_plugins,
59726+ .plugins_list = {NULL, NULL},
59727+ .size = sizeof(sd_ext_plugin)
59728+ },
59729+ [REISER4_FORMAT_PLUGIN_TYPE] = {
59730+ .type_id = REISER4_FORMAT_PLUGIN_TYPE,
59731+ .label = "disk_layout",
59732+ .desc = "defines filesystem on disk layout",
59733+ .builtin_num = sizeof_array(format_plugins),
59734+ .builtin = format_plugins,
59735+ .plugins_list = {NULL, NULL},
59736+ .size = sizeof(disk_format_plugin)
59737+ },
59738+ [REISER4_JNODE_PLUGIN_TYPE] = {
59739+ .type_id = REISER4_JNODE_PLUGIN_TYPE,
59740+ .label = "jnode",
59741+ .desc = "defines kind of jnode",
59742+ .builtin_num = sizeof_array(jnode_plugins),
59743+ .builtin = jnode_plugins,
59744+ .plugins_list = {NULL, NULL},
59745+ .size = sizeof(jnode_plugin)
59746+ },
59747+ [REISER4_COMPRESSION_MODE_PLUGIN_TYPE] = {
59748+ .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
59749+ .label = "compression_mode",
59750+ .desc = "Defines compression mode",
59751+ .builtin_num = sizeof_array(compression_mode_plugins),
59752+ .builtin = compression_mode_plugins,
59753+ .plugins_list = {NULL, NULL},
59754+ .size = sizeof(compression_mode_plugin)
59755+ },
59756+ [REISER4_CLUSTER_PLUGIN_TYPE] = {
59757+ .type_id = REISER4_CLUSTER_PLUGIN_TYPE,
59758+ .label = "cluster",
59759+ .desc = "Defines cluster size",
59760+ .builtin_num = sizeof_array(cluster_plugins),
59761+ .builtin = cluster_plugins,
59762+ .plugins_list = {NULL, NULL},
59763+ .size = sizeof(cluster_plugin)
59764+ },
59765+ [REISER4_REGULAR_PLUGIN_TYPE] = {
59766+ .type_id = REISER4_REGULAR_PLUGIN_TYPE,
59767+ .label = "regular",
59768+ .desc = "Defines kind of regular file",
59769+ .builtin_num =
59770+ sizeof_array(regular_plugins),
59771+ .builtin = regular_plugins,
59772+ .plugins_list = {NULL, NULL},
59773+ .size = sizeof(regular_plugin)
59774+ }
59775+};
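+/*
+ * Illustration (hypothetical function): the .builtin arrays above have
+ * per-type element sizes, which is why generic code steps through them in
+ * units of .size bytes (see plugin_at() in plugin_header.h). Typed and
+ * generic access reach the same object:
+ */
+static inline void plugin_table_demo(void)
+{
+	hash_plugin *typed = &hash_plugins[R5_HASH_ID];
+	reiser4_plugin *generic;
+
+	generic = plugin_at(&plugins[REISER4_HASH_PLUGIN_TYPE], R5_HASH_ID);
+	assert("plugin-table-demo", &generic->hash == typed);
+}
+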
59776+
59777+/*
59778+ * Local variables:
59779+ * c-indentation-style: "K&R"
59780+ * mode-name: "LC"
59781+ * c-basic-offset: 8
59782+ * tab-width: 8
59783+ * fill-column: 120
59784+ * End:
59785+ */
59786Index: linux-2.6.16/fs/reiser4/plugin/plugin.h
59787===================================================================
59788--- /dev/null
59789+++ linux-2.6.16/fs/reiser4/plugin/plugin.h
59790@@ -0,0 +1,936 @@
59791+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
59792+
59793+/* Basic plugin data-types.
59794+ see fs/reiser4/plugin/plugin.c for details */
59795+
59796+#if !defined( __FS_REISER4_PLUGIN_TYPES_H__ )
59797+#define __FS_REISER4_PLUGIN_TYPES_H__
59798+
59799+#include "../forward.h"
59800+#include "../debug.h"
59801+#include "../dformat.h"
59802+#include "../key.h"
59803+#include "compress/compress.h"
59804+#include "crypto/cipher.h"
59805+#include "plugin_header.h"
59806+#include "item/static_stat.h"
59807+#include "item/internal.h"
59808+#include "item/sde.h"
59809+#include "item/cde.h"
59810+#include "item/item.h"
59811+#include "node/node.h"
59812+#include "node/node40.h"
59813+#include "security/perm.h"
59814+#include "fibration.h"
59815+
59816+#include "space/bitmap.h"
59817+#include "space/space_allocator.h"
59818+
59819+#include "disk_format/disk_format40.h"
59820+#include "disk_format/disk_format.h"
59821+
59822+#include <linux/fs.h> /* for struct super_block, address_space */
59823+#include <linux/mm.h> /* for struct page */
59824+#include <linux/buffer_head.h> /* for struct buffer_head */
59825+#include <linux/dcache.h> /* for struct dentry */
59826+#include <linux/types.h>
59827+#include <linux/crypto.h>
59828+
59829+typedef struct reiser4_object_on_wire reiser4_object_on_wire;
59830+
59831+/*
59832+ * File plugin. Defines the set of methods that file plugins implement, some
59833+ * of which are optional.
59834+ *
59835+ * A file plugin offers to the caller an interface for IO ( writing to and/or
59836+ * reading from) to what the caller sees as one sequence of bytes. An IO to it
59837+ * may affect more than one physical sequence of bytes, or no physical sequence
59838+ * of bytes, it may affect sequences of bytes offered by other file plugins to
59839+ * the semantic layer, and the file plugin may invoke other plugins and
59840+ * delegate work to them, but its interface is structured for offering the
59841+ * caller the ability to read and/or write what the caller sees as being a
59842+ * single sequence of bytes.
59843+ *
59844+ * The file plugin must present a sequence of bytes to the caller, but it does
59845+ * not necessarily have to store a sequence of bytes, nor does it necessarily
59846+ * have to support efficient tree traversal to any offset in the sequence of
59847+ * bytes (tail and extent items, whose keys contain offsets, do however provide
59848+ * efficient non-sequential lookup of any offset in the sequence of bytes).
59849+ *
59850+ * Directory plugins provide methods for selecting file plugins by resolving a
59851+ * name for them.
59852+ *
59853+ * The functionality other filesystems call an attribute, and rigidly tie
59854+ * together, we decompose into orthogonal selectable features of files. Using
59855+ * the terminology we will define next, an attribute is a perhaps constrained,
59856+ * perhaps static length, file whose parent has a uni-count-intra-link to it,
59857+ * which might be grandparent-major-packed, and whose parent has a deletion
59858+ * method that deletes it.
59859+ *
59860+ * File plugins can implement constraints.
59861+ *
59862+ * Files can be of variable length (e.g. regular unix files), or of static
59863+ * length (e.g. static sized attributes).
59864+ *
59865+ * An object may have many sequences of bytes, and many file plugins, but it
59866+ * has exactly one objectid. It is usually desirable that an object has a
59867+ * deletion method which deletes every item with that objectid. Items cannot
59868+ * in general be found by just their objectids. This means that an object must
59869+ * have either a method built into its deletion plugin method for knowing what
59870+ * items need to be deleted, or links stored with the object that provide the
59871+ * plugin with a method for finding those items. Deleting a file within an
59872+ * object may or may not have the effect of deleting the entire object,
59873+ * depending on the file plugin's deletion method.
59874+ *
59875+ * LINK TAXONOMY:
59876+ *
59877+ * Many objects have a reference count, and when the reference count reaches 0
59878+ * the object's deletion method is invoked. Some links embody a reference
59879+ * count increase ("countlinks"), and others do not ("nocountlinks").
59880+ *
59881+ * Some links are bi-directional links ("bilinks"), and some are
59882+ * uni-directional ("unilinks").
59883+ *
59884+ * Some links are between parts of the same object ("intralinks"), and some are
59885+ * between different objects ("interlinks").
59886+ *
59887+ * PACKING TAXONOMY:
59888+ *
59889+ * Some items of an object are stored with a major packing locality based on
59890+ * their object's objectid (e.g. unix directory items in plan A), and these are
59891+ * called "self-major-packed".
59892+ *
59893+ * Some items of an object are stored with a major packing locality based on
59894+ * their semantic parent object's objectid (e.g. unix file bodies in plan A),
59895+ * and these are called "parent-major-packed".
59896+ *
59897+ * Some items of an object are stored with a major packing locality based on
59898+ * their semantic grandparent, and these are called "grandparent-major-packed".
59899+ * Now carefully notice that we run into trouble with key length if we have to
59900+ * store an 8 byte major+minor grandparent-based packing locality, an 8 byte
59901+ * parent objectid, an 8 byte attribute objectid, and an 8 byte offset, all in
59902+ * a 24 byte key. One of these fields must be sacrificed if an item is to be
59903+ * grandparent-major-packed, and which to sacrifice is left to the item author
59904+ * choosing to make the item grandparent-major-packed. You cannot make tail
59905+ * items and extent items grandparent-major-packed, though you could make them
59906+ * self-major-packed (usually they are parent-major-packed).
59907+ *
59908+ * In the case of ACLs (which are composed of fixed length ACEs which consist
59909+ * of {subject-type, subject, and permission bitmask} triples), it makes sense
59910+ * to not have an offset field in the ACE item key, and to allow duplicate keys
59911+ * for ACEs. Thus, the set of ACEs for a given file is found by looking for a
59912+ * key consisting of the objectid of the grandparent (thus grouping all ACLs in
59913+ * a directory together), the minor packing locality of ACE, the objectid of
59914+ * the file, and 0.
59915+ *
59916+ * IO involves moving data from one location to another, which means that two
59917+ * locations must be specified, source and destination.
59918+ *
59919+ * This source and destination can be in the filesystem, or they can be a
59920+ * pointer in the user process address space plus a byte count.
59921+ *
59922+ * If both source and destination are in the filesystem, then at least one of
59923+ * them must be representable as a pure stream of bytes (which we call a flow,
59924+ * and define as a struct containing a key, a data pointer, and a length).
59925+ * This may mean converting one of them into a flow. We provide a generic
59926+ * cast_into_flow() method, which will work for any plugin supporting
59927+ * read_flow(), though it is inefficiently implemented in that it temporarily
59928+ * stores the flow in a buffer (Question: what to do with huge flows that
59929+ * cannot fit into memory? Answer: we must not convert them all at once.)
59930+ *
59931+ * Performing a write requires resolving the write request into a flow defining
59932+ * the source, and a method that performs the write, and a key that defines
59933+ * where in the tree the write is to go.
59934+ *
59935+ * Performing a read requires resolving the read request into a flow defining
59936+ * the target, and a method that performs the read, and a key that defines
59937+ * where in the tree the read is to come from.
59938+ *
59939+ * There will exist file plugins which have no pluginid stored on the disk for
59940+ * them, and which are only invoked by other plugins.
59941+ */
59942+
59943+/* builtin file-plugins */
59944+typedef enum {
59945+ /* regular file */
59946+ UNIX_FILE_PLUGIN_ID,
59947+ /* directory */
59948+ DIRECTORY_FILE_PLUGIN_ID,
59949+ /* symlink */
59950+ SYMLINK_FILE_PLUGIN_ID,
59951+ /* for objects completely handled by the VFS: fifos, devices,
59952+ sockets */
59953+ SPECIAL_FILE_PLUGIN_ID,
59954+ /* regular cryptcompress file */
59955+ CRC_FILE_PLUGIN_ID,
59956+ /* number of file plugins. Used as size of arrays to hold
59957+ file plugins. */
59958+ LAST_FILE_PLUGIN_ID
59959+} reiser4_file_id;
59960+
59961+typedef struct file_plugin {
59962+
59963+ /* generic fields */
59964+ plugin_header h;
59965+
59966+ struct inode_operations inode_ops;
59967+ struct file_operations file_ops;
59968+ struct address_space_operations as_ops;
59969+
59970+ /* save inode cached stat-data onto disk. It was called
59971+ reiserfs_update_sd() in 3.x */
59972+ int (*write_sd_by_inode) (struct inode *);
59973+
59974+ /*
59975+ * private methods: These are optional. If used they will allow you to
59976+ * minimize the amount of code needed to implement a deviation from
59977+ * some other method that also uses them.
59978+ */
59979+
59980+ /*
59981+ * Construct flow into @flow according to user-supplied data.
59982+ *
59983+ * This is used by read/write methods to construct a flow to
59984+	 * write/read. ->flow_by_inode() is a plugin method, rather than a single
59985+	 * global implementation, because the key in a flow used by a plugin may
59986+	 * depend on the data in @buf.
59987+ *
59988+ * NIKITA-FIXME-HANS: please create statistics on what functions are
59989+ * dereferenced how often for the mongo benchmark. You can supervise
59990+ * Elena doing this for you if that helps. Email me the list of the
59991+ * top 10, with their counts, and an estimate of the total number of
59992+ * CPU cycles spent dereferencing as a percentage of CPU cycles spent
59993+ * processing (non-idle processing). If the total percent is, say,
59994+ * less than 1%, it will make our coding discussions much easier, and
59995+ * keep me from questioning whether functions like the below are too
59996+ * frequently called to be dereferenced. If the total percent is more
59997+ * than 1%, perhaps private methods should be listed in a "required"
59998+ * comment at the top of each plugin (with stern language about how if
59999+ * the comment is missing it will not be accepted by the maintainer),
60000+ * and implemented using macros not dereferenced functions. How about
60001+ * replacing this whole private methods part of the struct with a
60002+ * thorough documentation of what the standard helper functions are for
60003+ * use in constructing plugins? I think users have been asking for
60004+ * that, though not in so many words.
60005+ */
60006+ int (*flow_by_inode) (struct inode *, const char __user *buf,
60007+ int user, loff_t size,
60008+ loff_t off, rw_op op, flow_t *);
60009+
60010+ /*
60011+ * Return the key used to retrieve an offset of a file. It is used by
60012+ * default implementation of ->flow_by_inode() method
60013+ * (common_build_flow()) and, among other things, to get to the extent
60014+ * from jnode of unformatted node.
60015+ */
60016+ int (*key_by_inode) (struct inode *, loff_t off, reiser4_key *);
60017+
60018+ /* NIKITA-FIXME-HANS: this comment is not as clear to others as you think.... */
60019+ /*
60020+ * set the plugin for a file. Called during file creation in creat()
60021+ * but not reiser4() unless an inode already exists for the file.
60022+ */
60023+ int (*set_plug_in_inode) (struct inode *inode, struct inode *parent,
60024+ reiser4_object_create_data *);
60025+
60026+ /* NIKITA-FIXME-HANS: comment and name seem to say different things,
60027+ * are you setting up the object itself also or just adjusting the
60028+ * parent?.... */
60029+ /* set up plugins for new @object created in @parent. @root is root
60030+ directory. */
60031+ int (*adjust_to_parent) (struct inode *object, struct inode *parent,
60032+ struct inode *root);
60033+ /*
60034+	 * this does whatever is necessary when an object is created. For
60035+	 * instance, for unix files the stat data is inserted. It is supposed to
60036+	 * be called by the create method of struct inode_operations.
60037+ */
60038+ int (*create_object) (struct inode *object, struct inode *parent,
60039+ reiser4_object_create_data *);
60040+
60041+	/* this does whatever is necessary when an object is opened */
60042+ int (*open_object) (struct inode * inode, struct file * file);
60043+ /*
60044+ * this method should check REISER4_NO_SD and set REISER4_NO_SD on
60045+ * success. Deletion of an object usually includes removal of items
60046+ * building file body (for directories this is removal of "." and "..")
60047+ * and removal of stat-data item.
60048+ */
60049+ int (*delete_object) (struct inode *);
60050+
60051+ /* add link from @parent to @object */
60052+ int (*add_link) (struct inode *object, struct inode *parent);
60053+
60054+ /* remove link from @parent to @object */
60055+ int (*rem_link) (struct inode *object, struct inode *parent);
60056+
60057+ /*
60058+ * return true if item addressed by @coord belongs to @inode. This is
60059+ * used by read/write to properly slice flow into items in presence of
60060+ * multiple key assignment policies, because items of a file are not
60061+ * necessarily contiguous in a key space, for example, in a plan-b.
60062+ */
60063+ int (*owns_item) (const struct inode *, const coord_t *);
60064+
60065+	/* checks whether yet another hard link to this object can be
60066+ added */
60067+ int (*can_add_link) (const struct inode *);
60068+
60069+ /* checks whether hard links to this object can be removed */
60070+ int (*can_rem_link) (const struct inode *);
60071+
60072+ /* not empty for DIRECTORY_FILE_PLUGIN_ID only currently. It calls
60073+ detach of directory plugin to remove ".." */
60074+ int (*detach) (struct inode * child, struct inode * parent);
60075+
60076+ /* called when @child was just looked up in the @parent. It is not
60077+ empty for DIRECTORY_FILE_PLUGIN_ID only where it calls attach of
60078+ directory plugin */
60079+ int (*bind) (struct inode * child, struct inode * parent);
60080+
60081+ /* process safe-link during mount */
60082+ int (*safelink) (struct inode * object, reiser4_safe_link_t link,
60083+ __u64 value);
60084+
60085+ /* The couple of estimate methods for all file operations */
60086+ struct {
60087+ reiser4_block_nr(*create) (const struct inode *);
60088+ reiser4_block_nr(*update) (const struct inode *);
60089+ reiser4_block_nr(*unlink) (const struct inode *,
60090+ const struct inode *);
60091+ } estimate;
60092+
60093+ /*
60094+	 * the reiser4-specific part of the inode has a union of structures which
60095+	 * are specific to a plugin. This method is called when an inode is read
60096+	 * (read_inode) and when a file is created (common_create_child) so that
60097+	 * the file plugin can initialize its inode data
60098+ */
60099+ void (*init_inode_data) (struct inode *, reiser4_object_create_data *,
60100+ int);
60101+
60102+ /*
60103+ * This method performs progressive deletion of items and whole nodes
60104+ * from right to left.
60105+ *
60106+ * @tap: the point deletion process begins from,
60107+ * @from_key: the beginning of the deleted key range,
60108+ * @to_key: the end of the deleted key range,
60109+ * @smallest_removed: the smallest removed key,
60110+ *
60111+	 * @return: 0 on success, an error code otherwise; -E_REPEAT means that a
60112+	 * long cut_tree operation was interrupted to allow an atom commit.
60113+ */
60114+ int (*cut_tree_worker) (tap_t *, const reiser4_key * from_key,
60115+ const reiser4_key * to_key,
60116+ reiser4_key * smallest_removed, struct inode *,
60117+ int, int *);
60118+
60119+ /* called from ->destroy_inode() */
60120+ void (*destroy_inode) (struct inode *);
60121+
60122+ /*
60123+	 * methods to serialize object identity. This is used, for example, by
60124+ * reiser4_{en,de}code_fh().
60125+ */
60126+ struct {
60127+ /* store object's identity at @area */
60128+ char *(*write) (struct inode * inode, char *area);
60129+ /* parse object from wire to the @obj */
60130+ char *(*read) (char *area, reiser4_object_on_wire * obj);
60131+ /* given object identity in @obj, find or create its dentry */
60132+ struct dentry *(*get) (struct super_block * s,
60133+ reiser4_object_on_wire * obj);
60134+ /* how many bytes ->wire.write() consumes */
60135+ int (*size) (struct inode * inode);
60136+		/* finish with object identity */
60137+ void (*done) (reiser4_object_on_wire * obj);
60138+ } wire;
60139+} file_plugin;
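+/*
+ * Dispatch sketch (hedged): callers reach these methods through the file
+ * plugin attached to an inode; inode_file_plugin() is assumed here to be
+ * the accessor provided by fs/reiser4/inode.h.
+ *
+ *	file_plugin *fplug = inode_file_plugin(inode);
+ *
+ *	result = fplug->write_sd_by_inode(inode);	(flush cached stat-data)
+ */
+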
60140+
60141+extern file_plugin file_plugins[LAST_FILE_PLUGIN_ID];
60142+
60143+struct reiser4_object_on_wire {
60144+ file_plugin *plugin;
60145+ union {
60146+ struct {
60147+ obj_key_id key_id;
60148+ } std;
60149+ void *generic;
60150+ } u;
60151+};
60152+
60153+/* builtin dir-plugins */
60154+typedef enum {
60155+ HASHED_DIR_PLUGIN_ID,
60156+ SEEKABLE_HASHED_DIR_PLUGIN_ID,
60157+ LAST_DIR_ID
60158+} reiser4_dir_id;
60159+
60160+typedef struct dir_plugin {
60161+ /* generic fields */
60162+ plugin_header h;
60163+
60164+ struct inode_operations inode_ops;
60165+ struct file_operations file_ops;
60166+ struct address_space_operations as_ops;
60167+
60168+ /*
60169+ * private methods: These are optional. If used they will allow you to
60170+ * minimize the amount of code needed to implement a deviation from
60171+ * some other method that uses them. You could logically argue that
60172+ * they should be a separate type of plugin.
60173+ */
60174+
60175+ struct dentry *(*get_parent) (struct inode * childdir);
60176+
60177+ /*
60178+	 * check whether "name" is an acceptable name to be inserted into this
60179+	 * object. Optionally implemented by directory-like objects. Can check
60180+	 * for maximal length, reserved symbols, etc.
60181+ */
60182+ int (*is_name_acceptable) (const struct inode * inode, const char *name,
60183+ int len);
60184+
60185+ void (*build_entry_key) (const struct inode * dir /* directory where
60186+ * entry is (or will
60187+ * be) in.*/ ,
60188+ const struct qstr * name /* name of file
60189+ * referenced by this
60190+ * entry */ ,
60191+ reiser4_key * result /* resulting key of
60192+ * directory entry */ );
60193+ int (*build_readdir_key) (struct file * dir, reiser4_key * result);
60194+ int (*add_entry) (struct inode * object, struct dentry * where,
60195+ reiser4_object_create_data * data,
60196+ reiser4_dir_entry_desc * entry);
60197+ int (*rem_entry) (struct inode * object, struct dentry * where,
60198+ reiser4_dir_entry_desc * entry);
60199+
60200+ /*
60201+ * initialize directory structure for newly created object. For normal
60202+ * unix directories, insert dot and dotdot.
60203+ */
60204+ int (*init) (struct inode * object, struct inode * parent,
60205+ reiser4_object_create_data * data);
60206+
60207+ /* destroy directory */
60208+ int (*done) (struct inode * child);
60209+
60210+ /* called when @subdir was just looked up in the @dir */
60211+ int (*attach) (struct inode * subdir, struct inode * dir);
60212+ int (*detach) (struct inode * subdir, struct inode * dir);
60213+
60214+ struct {
60215+ reiser4_block_nr(*add_entry) (const struct inode *);
60216+ reiser4_block_nr(*rem_entry) (const struct inode *);
60217+ reiser4_block_nr(*unlink) (const struct inode *,
60218+ const struct inode *);
60219+ } estimate;
60220+} dir_plugin;
60221+
60222+extern dir_plugin dir_plugins[LAST_DIR_ID];
60223+
60224+typedef struct formatting_plugin {
60225+ /* generic fields */
60226+ plugin_header h;
60227+ /* returns non-zero iff file's tail has to be stored
60228+ in a direct item. */
60229+ int (*have_tail) (const struct inode * inode, loff_t size);
60230+} formatting_plugin;
60231+
60232+typedef struct hash_plugin {
60233+ /* generic fields */
60234+ plugin_header h;
60235+ /* computes hash of the given name */
60236+ __u64(*hash) (const unsigned char *name, int len);
60237+} hash_plugin;
60238+
60239+typedef struct cipher_plugin {
60240+ /* generic fields */
60241+ plugin_header h;
60242+ struct crypto_tfm * (*alloc) (void);
60243+ void (*free) (struct crypto_tfm * tfm);
60244+ /* Offset translator. For each offset this returns (k * offset), where
60245+ k (k >= 1) is an expansion factor of the cipher algorithm.
60246+ For all symmetric algorithms k == 1. For asymmetric algorithms (which
60247+	   inflate data) offset translation guarantees that all of a disk
60248+	   cluster's units will have keys smaller than the next cluster's ones.
60249+ */
60250+ loff_t(*scale) (struct inode * inode, size_t blocksize, loff_t src);
60251+ /* Cipher algorithms can accept data only by chunks of cipher block
60252+ size. This method is to align any flow up to cipher block size when
60253+ we pass it to cipher algorithm. To align means to append padding of
60254+ special format specific to the cipher algorithm */
60255+ int (*align_stream) (__u8 * tail, int clust_size, int blocksize);
60256+ /* low-level key manager (check, install, etc..) */
60257+ int (*setkey) (struct crypto_tfm * tfm, const __u8 * key,
60258+ unsigned int keylen);
60259+ /* main text processing procedures */
60260+ void (*encrypt) (__u32 * expkey, __u8 * dst, const __u8 * src);
60261+ void (*decrypt) (__u32 * expkey, __u8 * dst, const __u8 * src);
60262+} cipher_plugin;
60263+
60264+typedef struct digest_plugin {
60265+ /* generic fields */
60266+ plugin_header h;
60267+ /* fingerprint size in bytes */
60268+ int fipsize;
60269+ struct crypto_tfm * (*alloc) (void);
60270+ void (*free) (struct crypto_tfm * tfm);
60271+} digest_plugin;
60272+
60273+typedef struct compression_plugin {
60274+ /* generic fields */
60275+ plugin_header h;
60276+ int (*init) (void);
60277+	/* the maximum number of bytes by which the size of the "compressed"
60278+	 * data can exceed the size of the uncompressed data. */
60279+ int (*overrun) (unsigned src_len);
60280+ coa_t(*alloc) (tfm_action act);
60281+ void (*free) (coa_t coa, tfm_action act);
60282+ /* minimal size of the flow we still try to compress */
60283+ int (*min_size_deflate) (void);
60284+ __u32(*checksum) (char *data, __u32 length);
60285+ /* main transform procedures */
60286+ void (*compress) (coa_t coa, __u8 * src_first, unsigned src_len,
60287+ __u8 * dst_first, unsigned *dst_len);
60288+ void (*decompress) (coa_t coa, __u8 * src_first, unsigned src_len,
60289+ __u8 * dst_first, unsigned *dst_len);
60290+} compression_plugin;
60291+
60292+typedef struct compression_mode_plugin {
60293+ /* generic fields */
60294+ plugin_header h;
60295+ /* this is called when estimating compressibility
60296+ of a logical cluster by its content */
60297+ int (*should_deflate) (struct inode * inode, cloff_t index);
60298+ /* this is called when results of compression should be saved */
60299+ int (*accept_hook) (struct inode * inode, cloff_t index);
60300+ /* this is called when results of compression should be discarded */
60301+ int (*discard_hook) (struct inode * inode, cloff_t index);
60302+} compression_mode_plugin;
60303+
60304+typedef struct regular_plugin {
60305+ /* generic fields */
60306+ plugin_header h;
60307+ /* file plugin id which implements regular file */
60308+ reiser4_file_id id;
60309+} regular_plugin;
60310+
60311+typedef struct cluster_plugin {
60312+ /* generic fields */
60313+ plugin_header h;
60314+ int shift;
60315+} cluster_plugin;
60316+
60317+typedef struct sd_ext_plugin {
60318+ /* generic fields */
60319+ plugin_header h;
60320+ int (*present) (struct inode * inode, char **area, int *len);
60321+ int (*absent) (struct inode * inode);
60322+ int (*save_len) (struct inode * inode);
60323+ int (*save) (struct inode * inode, char **area);
60324+ /* alignment requirement for this stat-data part */
60325+ int alignment;
60326+} sd_ext_plugin;
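+/*
+ * Save-protocol sketch (hedged, simplified from plugin/item/static_stat.c):
+ * stat-data is serialized extension by extension; each present extension
+ * reports its length, then writes itself while advancing the cursor:
+ *
+ *	len = sd_ext->save_len(inode);
+ *	... make sure len bytes are available at *area ...
+ *	result = sd_ext->save(inode, &area);	(advances area by len)
+ */
+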
60327+
60328+/* this plugin contains methods to allocate an objectid for newly created
60329+   files, to deallocate the objectid when a file gets removed, and to report
60330+   the numbers of used and free objectids */
60331+typedef struct oid_allocator_plugin {
60332+ /* generic fields */
60333+ plugin_header h;
60334+ int (*init_oid_allocator) (reiser4_oid_allocator * map, __u64 nr_files,
60335+ __u64 oids);
60336+ /* used to report statfs->f_files */
60337+ __u64(*oids_used) (reiser4_oid_allocator * map);
60338+ /* get next oid to use */
60339+ __u64(*next_oid) (reiser4_oid_allocator * map);
60340+ /* used to report statfs->f_ffree */
60341+ __u64(*oids_free) (reiser4_oid_allocator * map);
60342+ /* allocate new objectid */
60343+ int (*allocate_oid) (reiser4_oid_allocator * map, oid_t *);
60344+ /* release objectid */
60345+ int (*release_oid) (reiser4_oid_allocator * map, oid_t);
60346+ /* how many pages to reserve in transaction for allocation of new
60347+ objectid */
60348+ int (*oid_reserve_allocate) (reiser4_oid_allocator * map);
60349+ /* how many pages to reserve in transaction for freeing of an
60350+ objectid */
60351+ int (*oid_reserve_release) (reiser4_oid_allocator * map);
60352+ void (*print_info) (const char *, reiser4_oid_allocator *);
60353+} oid_allocator_plugin;
60354+
60355+/* disk layout plugin: this specifies super block, journal, bitmap (if there
60356+ are any) locations, etc */
60357+typedef struct disk_format_plugin {
60358+ /* generic fields */
60359+ plugin_header h;
60360+ /* replay journal, initialize super_info_data, etc */
60361+ int (*init_format) (struct super_block *, void *data);
60362+
60363+ /* key of root directory stat data */
60364+ const reiser4_key *(*root_dir_key) (const struct super_block *);
60365+
60366+ int (*release) (struct super_block *);
60367+ jnode *(*log_super) (struct super_block *);
60368+ int (*check_open) (const struct inode * object);
60369+} disk_format_plugin;
60370+
60371+struct jnode_plugin {
60372+ /* generic fields */
60373+ plugin_header h;
60374+ int (*init) (jnode * node);
60375+ int (*parse) (jnode * node);
60376+ struct address_space *(*mapping) (const jnode * node);
60377+ unsigned long (*index) (const jnode * node);
60378+ jnode *(*clone) (jnode * node);
60379+};
60380+
60381+/* plugin instance.                                                         */
60382+/*                                                                          */
60383+/* This is a "wrapper" union for all types of plugins. Most of the code    */
60384+/* uses plugins of a particular type (file_plugin, dir_plugin, etc.)       */
60385+/* rather than operating on pointers to reiser4_plugin. This union is      */
60386+/* only used in some generic code in plugin/plugin.c that operates on      */
60387+/* all plugins. Technically speaking, the purpose of this union is to      */
60388+/* add type safety to said generic code: each plugin type (file_plugin,    */
60389+/* for example) contains plugin_header as its first member. This first     */
60390+/* member is located at the same place in memory as the .h member of       */
60391+/* reiser4_plugin. Generic code obtains a pointer to reiser4_plugin and    */
60392+/* looks at .h, which is the header of the plugin type located in the      */
60393+/* union. This allows type-casts to be avoided.                            */
60394+union reiser4_plugin {
60395+ /* generic fields */
60396+ plugin_header h;
60397+ /* file plugin */
60398+ file_plugin file;
60399+ /* directory plugin */
60400+ dir_plugin dir;
60401+ /* hash plugin, used by directory plugin */
60402+ hash_plugin hash;
60403+ /* fibration plugin used by directory plugin */
60404+ fibration_plugin fibration;
60405+ /* cipher transform plugin, used by file plugin */
60406+ cipher_plugin cipher;
60407+ /* digest transform plugin, used by file plugin */
60408+ digest_plugin digest;
60409+ /* compression transform plugin, used by file plugin */
60410+ compression_plugin compression;
60411+ /* tail plugin, used by file plugin */
60412+ formatting_plugin formatting;
60413+ /* permission plugin */
60414+ perm_plugin perm;
60415+ /* node plugin */
60416+ node_plugin node;
60417+ /* item plugin */
60418+ item_plugin item;
60419+ /* stat-data extension plugin */
60420+ sd_ext_plugin sd_ext;
60421+ /* disk layout plugin */
60422+ disk_format_plugin format;
60423+ /* object id allocator plugin */
60424+ oid_allocator_plugin oid_allocator;
60425+ /* plugin for different jnode types */
60426+ jnode_plugin jnode;
60427+ /* compression mode plugin, used by object plugin */
60428+ compression_mode_plugin compression_mode;
60429+ /* cluster plugin, used by object plugin */
60430+ cluster_plugin clust;
60431+ /* regular plugin, used by directory plugin */
60432+ regular_plugin regular;
60433+ /* place-holder for new plugin types that can be registered
60434+ dynamically, and used by other dynamically loaded plugins. */
60435+ void *generic;
60436+};
60437+
60438+struct reiser4_plugin_ops {
60439+ /* called when plugin is initialized */
60440+ int (*init) (reiser4_plugin * plugin);
60441+ /* called when plugin is unloaded */
60442+ int (*done) (reiser4_plugin * plugin);
60443+ /* load given plugin from disk */
60444+ int (*load) (struct inode * inode,
60445+ reiser4_plugin * plugin, char **area, int *len);
60446+	/* how much space is required to store this plugin's state
60447+ in stat-data */
60448+ int (*save_len) (struct inode * inode, reiser4_plugin * plugin);
60449+ /* save persistent plugin-data to disk */
60450+ int (*save) (struct inode * inode, reiser4_plugin * plugin,
60451+ char **area);
60452+ /* alignment requirement for on-disk state of this plugin
60453+ in number of bytes */
60454+ int alignment;
60455+ /* install itself into given inode. This can return error
60456+ (e.g., you cannot change hash of non-empty directory). */
60457+ int (*change) (struct inode * inode, reiser4_plugin * plugin);
60458+	/* inherit plugin from the @parent inode into @inode. This can
60459+	   return an error. */
60460+ int (*inherit) (struct inode * inode, struct inode * parent,
60461+ reiser4_plugin * plugin);
60462+};
60463+
60464+/* functions implemented in fs/reiser4/plugin/plugin.c */
60465+
60466+/* stores plugin reference in reiser4-specific part of inode */
60467+extern int set_object_plugin(struct inode *inode, reiser4_plugin_id id);
60468+extern int setup_plugins(struct super_block *super, reiser4_plugin ** area);
60469+extern int init_plugins(void);
60470+
60471+/* builtin plugins */
60472+
60473+/* builtin hash-plugins */
60474+
60475+typedef enum {
60476+ RUPASOV_HASH_ID,
60477+ R5_HASH_ID,
60478+ TEA_HASH_ID,
60479+ FNV1_HASH_ID,
60480+ DEGENERATE_HASH_ID,
60481+ LAST_HASH_ID
60482+} reiser4_hash_id;
60483+
60484+/* builtin cipher plugins */
60485+
60486+typedef enum {
60487+ NONE_CIPHER_ID,
60488+ AES_CIPHER_ID,
60489+ LAST_CIPHER_ID
60490+} reiser4_cipher_id;
60491+
60492+/* builtin digest plugins */
60493+
60494+typedef enum {
60495+ SHA256_32_DIGEST_ID,
60496+ LAST_DIGEST_ID
60497+} reiser4_digest_id;
60498+
60499+/* builtin compression mode plugins */
60500+typedef enum {
60501+ NONE_COMPRESSION_MODE_ID,
60502+ COL_8_COMPRESSION_MODE_ID,
60503+ COL_16_COMPRESSION_MODE_ID,
60504+ COL_32_COMPRESSION_MODE_ID,
60505+ COZ_COMPRESSION_MODE_ID,
60506+ FORCE_COMPRESSION_MODE_ID,
60507+ TEST_COMPRESSION_MODE_ID,
60508+ LAST_COMPRESSION_MODE_ID
60509+} reiser4_compression_mode_id;
60510+
60511+/* builtin cluster plugins */
60512+typedef enum {
60513+ CLUSTER_64K_ID,
60514+ CLUSTER_32K_ID,
60515+ CLUSTER_16K_ID,
60516+ CLUSTER_8K_ID,
60517+ CLUSTER_4K_ID,
60518+ LAST_CLUSTER_ID
60519+} reiser4_cluster_id;
60520+
60521+/* builtin regular plugins */
60522+typedef enum {
60523+ UF_REGULAR_ID,
60524+ CRC_REGULAR_ID,
60525+ LAST_REGULAR_ID
60526+} reiser4_regular_id;
60527+
60528+/* builtin tail-plugins */
60529+
60530+typedef enum {
60531+ NEVER_TAILS_FORMATTING_ID,
60532+ ALWAYS_TAILS_FORMATTING_ID,
60533+ SMALL_FILE_FORMATTING_ID,
60534+ LAST_TAIL_FORMATTING_ID
60535+} reiser4_formatting_id;
60536+
60537+/* compression/clustering specific data */
60538+typedef struct compression_data {
60539+ reiser4_compression_id coa; /* id of the compression algorithm */
60540+} compression_data_t;
60541+
60542+typedef __u8 cluster_data_t; /* cluster info */
60543+
60544+/* data type used to pack parameters that we pass to vfs object creation
60545+ function create_object() */
60546+struct reiser4_object_create_data {
60547+ /* plugin to control created object */
60548+ reiser4_file_id id;
60549+ /* mode of regular file, directory or special file */
60550+/* what happens if some other sort of perm plugin is in use? */
60551+ int mode;
60552+ /* rdev of special file */
60553+ dev_t rdev;
60554+ /* symlink target */
60555+ const char *name;
60556+ /* add here something for non-standard objects you invent, like
60557+ query for interpolation file etc. */
60558+
60559+ crypto_stat_t * crypto;
60560+ compression_data_t *compression;
60561+ cluster_data_t *cluster;
60562+
60563+ struct inode *parent;
60564+ struct dentry *dentry;
60565+};
60566+
60567+/* description of directory entry being created/destroyed/sought for
60568+
60569+ It is passed down to the directory plugin and farther to the
60570+ directory item plugin methods. Creation of new directory is done in
60571+ several stages: first we search for an entry with the same name, then
60572+ create new one. reiser4_dir_entry_desc is used to store some information
60573+ collected at some stage of this process and required later: key of
60574+ item that we want to insert/delete and pointer to an object that will
60575+ be bound by the new directory entry. Probably some more fields will
60576+ be added there.
60577+
60578+*/
60579+struct reiser4_dir_entry_desc {
60580+ /* key of directory entry */
60581+ reiser4_key key;
60582+ /* object bound by this entry. */
60583+ struct inode *obj;
60584+};
60585+
60586+#define MAX_PLUGIN_TYPE_LABEL_LEN 32
60587+#define MAX_PLUGIN_PLUG_LABEL_LEN 32
60588+
60589+/* used for interface with user-land: table-driven parsing in
60590+ reiser4(). */
60591+typedef struct plugin_locator {
60592+ reiser4_plugin_type type_id;
60593+ reiser4_plugin_id id;
60594+ char type_label[MAX_PLUGIN_TYPE_LABEL_LEN];
60595+ char plug_label[MAX_PLUGIN_PLUG_LABEL_LEN];
60596+} plugin_locator;
60597+
60598+extern int locate_plugin(struct inode *inode, plugin_locator * loc);
60599+
60600+
60601+#define PLUGIN_BY_ID(TYPE,ID,FIELD) \
60602+static inline TYPE *TYPE ## _by_id( reiser4_plugin_id id ) \
60603+{ \
60604+ reiser4_plugin *plugin = plugin_by_id ( ID, id ); \
60605+ return plugin ? & plugin -> FIELD : NULL; \
60606+} \
60607+static inline TYPE *TYPE ## _by_disk_id( reiser4_tree *tree, d16 *id ) \
60608+{ \
60609+ reiser4_plugin *plugin = plugin_by_disk_id ( tree, ID, id ); \
60610+ return plugin ? & plugin -> FIELD : NULL; \
60611+} \
60612+static inline TYPE *TYPE ## _by_unsafe_id( reiser4_plugin_id id ) \
60613+{ \
60614+ reiser4_plugin *plugin = plugin_by_unsafe_id ( ID, id ); \
60615+ return plugin ? & plugin -> FIELD : NULL; \
60616+} \
60617+static inline reiser4_plugin* TYPE ## _to_plugin( TYPE* plugin ) \
60618+{ \
60619+ return ( reiser4_plugin * ) plugin; \
60620+} \
60621+static inline reiser4_plugin_id TYPE ## _id( TYPE* plugin ) \
60622+{ \
60623+ return TYPE ## _to_plugin (plugin) -> h.id; \
60624+} \
60625+typedef struct { int foo; } TYPE ## _plugin_dummy
60626+
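+/*
+ * Expansion sketch: PLUGIN_BY_ID(hash_plugin, REISER4_HASH_PLUGIN_TYPE,
+ * hash) below generates, among others, the typed wrapper
+ *
+ *	static inline hash_plugin *hash_plugin_by_id(reiser4_plugin_id id)
+ *	{
+ *		reiser4_plugin *plugin =
+ *			plugin_by_id(REISER4_HASH_PLUGIN_TYPE, id);
+ *		return plugin ? &plugin->hash : NULL;
+ *	}
+ *
+ * so callers get typed access, e.g. hash_plugin_by_id(R5_HASH_ID), without
+ * casting reiser4_plugin themselves.
+ */
+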
60627+PLUGIN_BY_ID(item_plugin, REISER4_ITEM_PLUGIN_TYPE, item);
60628+PLUGIN_BY_ID(file_plugin, REISER4_FILE_PLUGIN_TYPE, file);
60629+PLUGIN_BY_ID(dir_plugin, REISER4_DIR_PLUGIN_TYPE, dir);
60630+PLUGIN_BY_ID(node_plugin, REISER4_NODE_PLUGIN_TYPE, node);
60631+PLUGIN_BY_ID(sd_ext_plugin, REISER4_SD_EXT_PLUGIN_TYPE, sd_ext);
60632+PLUGIN_BY_ID(perm_plugin, REISER4_PERM_PLUGIN_TYPE, perm);
60633+PLUGIN_BY_ID(hash_plugin, REISER4_HASH_PLUGIN_TYPE, hash);
60634+PLUGIN_BY_ID(fibration_plugin, REISER4_FIBRATION_PLUGIN_TYPE, fibration);
60635+PLUGIN_BY_ID(cipher_plugin, REISER4_CIPHER_PLUGIN_TYPE, cipher);
60636+PLUGIN_BY_ID(digest_plugin, REISER4_DIGEST_PLUGIN_TYPE, digest);
60637+PLUGIN_BY_ID(compression_plugin, REISER4_COMPRESSION_PLUGIN_TYPE, compression);
60638+PLUGIN_BY_ID(formatting_plugin, REISER4_FORMATTING_PLUGIN_TYPE, formatting);
60639+PLUGIN_BY_ID(disk_format_plugin, REISER4_FORMAT_PLUGIN_TYPE, format);
60640+PLUGIN_BY_ID(jnode_plugin, REISER4_JNODE_PLUGIN_TYPE, jnode);
60641+PLUGIN_BY_ID(compression_mode_plugin, REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
60642+ compression_mode);
60643+PLUGIN_BY_ID(cluster_plugin, REISER4_CLUSTER_PLUGIN_TYPE, clust);
60644+PLUGIN_BY_ID(regular_plugin, REISER4_REGULAR_PLUGIN_TYPE, regular);
60645+
60646+extern int save_plugin_id(reiser4_plugin * plugin, d16 * area);
60647+
60648+extern struct list_head *get_plugin_list(reiser4_plugin_type type_id);
60649+
60650+#define for_all_plugins(ptype, plugin) \
60651+for (plugin = list_entry(get_plugin_list(ptype)->next, reiser4_plugin, h.linkage); \
60652+ get_plugin_list(ptype) != &plugin->h.linkage; \
60653+ plugin = list_entry(plugin->h.linkage.next, reiser4_plugin, h.linkage))
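+/*
+ * Usage sketch (hypothetical): enumerate every registered file plugin.
+ *
+ *	reiser4_plugin *plug;
+ *
+ *	for_all_plugins(REISER4_FILE_PLUGIN_TYPE, plug)
+ *		printk("file plugin: %s: %s\n", plug->h.label, plug->h.desc);
+ */
+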
60654+
60655+
60656+/* enumeration of fields within plugin_set */
60657+typedef enum {
60658+ PSET_FILE,
60659+ PSET_DIR, /* PSET_FILE and PSET_DIR should be first elements:
60660+ * inode.c:read_inode() depends on this. */
60661+ PSET_PERM,
60662+ PSET_FORMATTING,
60663+ PSET_HASH,
60664+ PSET_FIBRATION,
60665+ PSET_SD,
60666+ PSET_DIR_ITEM,
60667+ PSET_CIPHER,
60668+ PSET_DIGEST,
60669+ PSET_COMPRESSION,
60670+ PSET_COMPRESSION_MODE,
60671+ PSET_CLUSTER,
60672+ PSET_REGULAR_ENTRY,
60673+ PSET_LAST
60674+} pset_member;
60675+
60676+int grab_plugin(struct inode *self, struct inode *ancestor, pset_member memb);
60677+int grab_plugin_from(struct inode *self, pset_member memb,
60678+ reiser4_plugin * plug);
60679+int force_plugin(struct inode *self, pset_member memb, reiser4_plugin * plug);
60680+
60681+/* defined in fs/reiser4/plugin/object.c */
60682+extern file_plugin file_plugins[LAST_FILE_PLUGIN_ID];
60683+/* defined in fs/reiser4/plugin/object.c */
60684+extern dir_plugin dir_plugins[LAST_DIR_ID];
60685+/* defined in fs/reiser4/plugin/item/static_stat.c */
60686+extern sd_ext_plugin sd_ext_plugins[LAST_SD_EXTENSION];
60687+/* defined in fs/reiser4/plugin/hash.c */
60688+extern hash_plugin hash_plugins[LAST_HASH_ID];
60689+/* defined in fs/reiser4/plugin/fibration.c */
60690+extern fibration_plugin fibration_plugins[LAST_FIBRATION_ID];
60691+/* defined in fs/reiser4/plugin/crypt.c */
60692+extern cipher_plugin cipher_plugins[LAST_CIPHER_ID];
60693+/* defined in fs/reiser4/plugin/digest.c */
60694+extern digest_plugin digest_plugins[LAST_DIGEST_ID];
60695+/* defined in fs/reiser4/plugin/compress/compress.c */
60696+extern compression_plugin compression_plugins[LAST_COMPRESSION_ID];
60697+/* defined in fs/reiser4/plugin/compress/compression_mode.c */
60698+extern compression_mode_plugin
60699+compression_mode_plugins[LAST_COMPRESSION_MODE_ID];
60700+/* defined in fs/reiser4/plugin/cluster.c */
60701+extern cluster_plugin cluster_plugins[LAST_CLUSTER_ID];
60702+/* defined in fs/reiser4/plugin/regular.c */
60703+extern regular_plugin regular_plugins[LAST_REGULAR_ID];
60704+/* defined in fs/reiser4/plugin/tail.c */
60705+extern formatting_plugin formatting_plugins[LAST_TAIL_FORMATTING_ID];
60706+/* defined in fs/reiser4/plugin/security/security.c */
60707+extern perm_plugin perm_plugins[LAST_PERM_ID];
60708+/* defined in fs/reiser4/plugin/item/item.c */
60709+extern item_plugin item_plugins[LAST_ITEM_ID];
60710+/* defined in fs/reiser4/plugin/node/node.c */
60711+extern node_plugin node_plugins[LAST_NODE_ID];
60712+/* defined in fs/reiser4/plugin/disk_format/disk_format.c */
60713+extern disk_format_plugin format_plugins[LAST_FORMAT_ID];
60714+
60715+/* __FS_REISER4_PLUGIN_TYPES_H__ */
60716+#endif
60717+
60718+/* Make Linus happy.
60719+ Local variables:
60720+ c-indentation-style: "K&R"
60721+ mode-name: "LC"
60722+ c-basic-offset: 8
60723+ tab-width: 8
60724+ fill-column: 120
60725+ End:
60726+*/
60727Index: linux-2.6.16/fs/reiser4/plugin/plugin_header.h
60728===================================================================
60729--- /dev/null
60730+++ linux-2.6.16/fs/reiser4/plugin/plugin_header.h
60731@@ -0,0 +1,136 @@
60732+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
60733+
60734+/* plugin header. Data structures required by all plugin types. */
60735+
60736+#if !defined( __PLUGIN_HEADER_H__ )
60737+#define __PLUGIN_HEADER_H__
60738+
60739+/* plugin data-types and constants */
60740+
60741+#include "../debug.h"
60742+#include "../dformat.h"
60743+
60744+typedef enum {
60745+ REISER4_FILE_PLUGIN_TYPE,
60746+ REISER4_DIR_PLUGIN_TYPE,
60747+ REISER4_ITEM_PLUGIN_TYPE,
60748+ REISER4_NODE_PLUGIN_TYPE,
60749+ REISER4_HASH_PLUGIN_TYPE,
60750+ REISER4_FIBRATION_PLUGIN_TYPE,
60751+ REISER4_FORMATTING_PLUGIN_TYPE,
60752+ REISER4_PERM_PLUGIN_TYPE,
60753+ REISER4_SD_EXT_PLUGIN_TYPE,
60754+ REISER4_FORMAT_PLUGIN_TYPE,
60755+ REISER4_JNODE_PLUGIN_TYPE,
60756+ REISER4_CIPHER_PLUGIN_TYPE,
60757+ REISER4_DIGEST_PLUGIN_TYPE,
60758+ REISER4_COMPRESSION_PLUGIN_TYPE,
60759+ REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
60760+ REISER4_CLUSTER_PLUGIN_TYPE,
60761+ REISER4_REGULAR_PLUGIN_TYPE,
60762+ REISER4_PLUGIN_TYPES
60763+} reiser4_plugin_type;
60764+
60765+struct reiser4_plugin_ops;
60766+/* generic plugin operations, supported by each
60767+ plugin type. */
60768+typedef struct reiser4_plugin_ops reiser4_plugin_ops;
60769+
60770+/* the common part of all plugin instances. */
60771+typedef struct plugin_header {
60772+ /* plugin type */
60773+ reiser4_plugin_type type_id;
60774+ /* id of this plugin */
60775+ reiser4_plugin_id id;
60776+ /* plugin operations */
60777+ reiser4_plugin_ops *pops;
60778+/* NIKITA-FIXME-HANS: usage of and access to label and desc is not commented and defined. */
60779+ /* short label of this plugin */
60780+ const char *label;
60781+	/* descriptive string */
60782+ const char *desc;
60783+ /* list linkage */
60784+ struct list_head linkage;
60785+} plugin_header;
60786+
60787+/* PRIVATE INTERFACES */
60788+/* NIKITA-FIXME-HANS: what is this for and why does it duplicate what is in plugin_header? */
60789+/* plugin type representation. */
60790+typedef struct reiser4_plugin_type_data {
60791+ /* internal plugin type identifier. Should coincide with
60792+ index of this item in plugins[] array. */
60793+ reiser4_plugin_type type_id;
60794+ /* short symbolic label of this plugin type. Should be no longer
60795+ than MAX_PLUGIN_TYPE_LABEL_LEN characters including '\0'. */
60796+ const char *label;
60797+ /* plugin type description longer than .label */
60798+ const char *desc;
60799+
60800+/* NIKITA-FIXME-HANS: define built-in */
60801+ /* number of built-in plugin instances of this type */
60802+ int builtin_num;
60803+ /* array of built-in plugins */
60804+ void *builtin;
60805+ struct list_head plugins_list;
60806+ size_t size;
60807+} reiser4_plugin_type_data;
60808+
60809+extern reiser4_plugin_type_data plugins[REISER4_PLUGIN_TYPES];
60810+
60811+int is_type_id_valid(reiser4_plugin_type type_id);
60812+int is_plugin_id_valid(reiser4_plugin_type type_id, reiser4_plugin_id id);
60813+
60814+static inline reiser4_plugin *plugin_at(reiser4_plugin_type_data * ptype, int i)
60815+{
60816+ char *builtin;
60817+
60818+ builtin = ptype->builtin;
60819+ return (reiser4_plugin *) (builtin + i * ptype->size);
60820+}
60821+
60822+/* return plugin by its @type_id and @id */
60823+static inline reiser4_plugin *plugin_by_id(reiser4_plugin_type type_id
60824+ /* plugin type id */ ,
60825+ reiser4_plugin_id id /* plugin id */
60826+ )
60827+{
60828+ assert("nikita-1651", is_type_id_valid(type_id));
60829+ assert("nikita-1652", is_plugin_id_valid(type_id, id));
60830+ return plugin_at(&plugins[type_id], id);
60831+}
60832+
60833+extern reiser4_plugin *plugin_by_unsafe_id(reiser4_plugin_type type_id,
60834+ reiser4_plugin_id id);
60835+
60836+/**
60837+ * plugin_by_disk_id - get reiser4_plugin
60838+ * @type_id: plugin type id
60839+ * @plugin_id: plugin id in disk format
60840+ *
60841+ * Returns reiser4_plugin by plugin type id and plugin id.
60842+ */
60843+static inline reiser4_plugin *plugin_by_disk_id(reiser4_tree * tree UNUSED_ARG,
60844+ reiser4_plugin_type type_id,
60845+ __le16 *plugin_id)
60846+{
60847+ /*
60848+ * what we should do properly is to maintain within each file-system a
60849+ * dictionary that maps on-disk plugin ids to "universal" ids. This
60850+ * dictionary will be resolved on mount time, so that this function
60851+ * will perform just one additional array lookup.
60852+ */
60853+ return plugin_by_unsafe_id(type_id, le16_to_cpu(*plugin_id));
60854+}
60855+
60856+/* __PLUGIN_HEADER_H__ */
60857+#endif
60858+
60859+/*
60860+ * Local variables:
60861+ * c-indentation-style: "K&R"
60862+ * mode-name: "LC"
60863+ * c-basic-offset: 8
60864+ * tab-width: 8
60865+ * fill-column: 79
60866+ * End:
60867+ */
60868Index: linux-2.6.16/fs/reiser4/plugin/plugin_set.c
60869===================================================================
60870--- /dev/null
60871+++ linux-2.6.16/fs/reiser4/plugin/plugin_set.c
60872@@ -0,0 +1,378 @@
60873+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
60874+ * reiser4/README */
60875+/* NIKITA-FIXME-HANS: you didn't discuss this with me before coding it did you? Remove plugin-sets from code by March 15th, 2004 */
60876+/* plugin-sets */
60877+
60878+/*
60879+ * Each inode comes with a whole set of plugins: file plugin, directory
60880+ * plugin, hash plugin, tail policy plugin, security plugin, etc.
60881+ *
60882+ * Storing them (pointers to them, that is) in inode is a waste of
60883+ * space. Especially, given that on average file system plugins of vast
60884+ * majority of files will belong to few sets (e.g., one set for regular files,
60885+ * another set for standard directory, etc.)
60886+ *
60887+ * Plugin set (pset) is an object containing pointers to all plugins required
60888+ * by inode. Inode only stores a pointer to pset. psets are "interned", that
60889+ * is, different inodes with the same set of plugins point to the same
60890+ * pset. This is achieved by storing psets in a global hash table. Races are
60891+ * avoided by simple (and efficient so far) solution of never recycling psets,
60892+ * even when last inode pointing to it is destroyed.
60893+ *
60894+ */
60895+
60896+#include "../debug.h"
60897+#include "../super.h"
60898+#include "plugin_set.h"
60899+
60900+#include <linux/slab.h>
60901+#include <linux/stddef.h>
60902+
60903+/* slab for plugin sets */
60904+static kmem_cache_t *plugin_set_slab;
60905+
60906+static spinlock_t plugin_set_lock[8] __cacheline_aligned_in_smp = {
60907+ [0 ... 7] = SPIN_LOCK_UNLOCKED
60908+};
60909+
60910+/* hash table support */
60911+
60912+#define PS_TABLE_SIZE (32)
60913+
60914+static inline plugin_set *cast_to(const unsigned long *a)
60915+{
60916+ return container_of(a, plugin_set, hashval);
60917+}
60918+
60919+static inline int pseq(const unsigned long *a1, const unsigned long *a2)
60920+{
60921+ plugin_set *set1;
60922+ plugin_set *set2;
60923+
60924+ /* make sure fields are not missed in the code below */
60925+ cassert(sizeof *set1 ==
60926+ sizeof set1->hashval +
60927+ sizeof set1->link +
60928+ sizeof set1->file +
60929+ sizeof set1->dir +
60930+ sizeof set1->perm +
60931+ sizeof set1->formatting +
60932+ sizeof set1->hash +
60933+ sizeof set1->fibration +
60934+ sizeof set1->sd +
60935+ sizeof set1->dir_item +
60936+ sizeof set1->cipher +
60937+ sizeof set1->digest +
60938+ sizeof set1->compression +
60939+ sizeof set1->compression_mode +
60940+ sizeof set1->cluster + sizeof set1->regular_entry);
60941+
60942+ set1 = cast_to(a1);
60943+ set2 = cast_to(a2);
60944+ return
60945+ set1->hashval == set2->hashval &&
60946+ set1->file == set2->file &&
60947+ set1->dir == set2->dir &&
60948+ set1->perm == set2->perm &&
60949+ set1->formatting == set2->formatting &&
60950+ set1->hash == set2->hash &&
60951+ set1->fibration == set2->fibration &&
60952+ set1->sd == set2->sd &&
60953+ set1->dir_item == set2->dir_item &&
60954+ set1->cipher == set2->cipher &&
60955+ set1->digest == set2->digest &&
60956+ set1->compression == set2->compression &&
60957+ set1->compression_mode == set2->compression_mode &&
60958+ set1->cluster == set2->cluster &&
60959+ set1->regular_entry == set2->regular_entry;
60960+}
60961+
60962+#define HASH_FIELD(hash, set, field) \
60963+({ \
60964+ (hash) += (unsigned long)(set)->field >> 2; \
60965+})
60966+
60967+static inline unsigned long calculate_hash(const plugin_set * set)
60968+{
60969+ unsigned long result;
60970+
60971+ result = 0;
60972+ HASH_FIELD(result, set, file);
60973+ HASH_FIELD(result, set, dir);
60974+ HASH_FIELD(result, set, perm);
60975+ HASH_FIELD(result, set, formatting);
60976+ HASH_FIELD(result, set, hash);
60977+ HASH_FIELD(result, set, fibration);
60978+ HASH_FIELD(result, set, sd);
60979+ HASH_FIELD(result, set, dir_item);
60980+ HASH_FIELD(result, set, cipher);
60981+ HASH_FIELD(result, set, digest);
60982+ HASH_FIELD(result, set, compression);
60983+ HASH_FIELD(result, set, compression_mode);
60984+ HASH_FIELD(result, set, cluster);
60985+ HASH_FIELD(result, set, regular_entry);
60986+ return result & (PS_TABLE_SIZE - 1);
60987+}
60988+
60989+static inline unsigned long
60990+pshash(ps_hash_table * table, const unsigned long *a)
60991+{
60992+ return *a;
60993+}
60994+
60995+/* The hash table definition */
60996+#define KMALLOC(size) kmalloc((size), get_gfp_mask())
60997+#define KFREE(ptr, size) kfree(ptr)
60998+TYPE_SAFE_HASH_DEFINE(ps, plugin_set, unsigned long, hashval, link, pshash,
60999+ pseq);
61000+#undef KFREE
61001+#undef KMALLOC
61002+
61003+static ps_hash_table ps_table;
61004+static plugin_set empty_set = {
61005+ .hashval = 0,
61006+ .file = NULL,
61007+ .dir = NULL,
61008+ .perm = NULL,
61009+ .formatting = NULL,
61010+ .hash = NULL,
61011+ .fibration = NULL,
61012+ .sd = NULL,
61013+ .dir_item = NULL,
61014+ .cipher = NULL,
61015+ .digest = NULL,
61016+ .compression = NULL,
61017+ .compression_mode = NULL,
61018+ .cluster = NULL,
61019+ .regular_entry = NULL,
61020+ .link = {NULL}
61021+};
61022+
61023+plugin_set *plugin_set_get_empty(void)
61024+{
61025+ return &empty_set;
61026+}
61027+
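/* psets are interned and never recycled (see the comment at the top of this
 * file), so dropping a reference is deliberately a no-op. */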
61028+void plugin_set_put(plugin_set * set)
61029+{
61030+}
61031+
61032+static inline unsigned long *pset_field(plugin_set * set, int offset)
61033+{
61034+ return (unsigned long *)(((char *)set) + offset);
61035+}
61036+
61037+static int plugin_set_field(plugin_set ** set, const unsigned long val,
61038+ const int offset)
61039+{
61040+ unsigned long *spot;
61041+ spinlock_t *lock;
61042+ plugin_set replica;
61043+ plugin_set *twin;
61044+ plugin_set *psal;
61045+ plugin_set *orig;
61046+
61047+ assert("nikita-2902", set != NULL);
61048+ assert("nikita-2904", *set != NULL);
61049+
61050+ spot = pset_field(*set, offset);
61051+ if (unlikely(*spot == val))
61052+ return 0;
61053+
61054+ replica = *(orig = *set);
61055+ *pset_field(&replica, offset) = val;
61056+ replica.hashval = calculate_hash(&replica);
61057+ rcu_read_lock();
61058+ twin = ps_hash_find(&ps_table, &replica.hashval);
61059+ if (unlikely(twin == NULL)) {
61060+ rcu_read_unlock();
61061+ psal = kmem_cache_alloc(plugin_set_slab, get_gfp_mask());
61062+ if (psal == NULL)
61063+ return RETERR(-ENOMEM);
61064+ *psal = replica;
61065+ lock = &plugin_set_lock[replica.hashval & 7];
61066+ spin_lock(lock);
61067+ twin = ps_hash_find(&ps_table, &replica.hashval);
61068+ if (likely(twin == NULL)) {
61069+ *set = psal;
61070+ ps_hash_insert_rcu(&ps_table, psal);
61071+ } else {
61072+ *set = twin;
61073+ kmem_cache_free(plugin_set_slab, psal);
61074+ }
61075+ spin_unlock(lock);
61076+ } else {
61077+ rcu_read_unlock();
61078+ *set = twin;
61079+ }
61080+ return 0;
61081+}
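For intuition, here is a minimal userspace sketch (not reiser4 code; every name
in it is invented for illustration) of the copy-modify-intern pattern that
plugin_set_field() implements, with the RCU, hashing and locking machinery
stripped away:

#include <stdlib.h>
#include <string.h>

struct pset {
	const void *file;	/* stand-ins for two of the plugin members */
	const void *hash;	/* (a two-pointer struct has no padding, so
				 * memcmp below is a safe equality test) */
};

static struct pset *interned[32];
static int ninterned;

/* return the canonical instance equal to *tmpl, allocating one on a miss;
 * instances are never freed, mirroring the "never recycle psets" rule */
static struct pset *intern(const struct pset *tmpl)
{
	int i;

	for (i = 0; i < ninterned; i++)
		if (memcmp(interned[i], tmpl, sizeof *tmpl) == 0)
			return interned[i];	/* twin already exists */
	if (ninterned == 32)
		return NULL;			/* toy table is full */
	interned[ninterned] = malloc(sizeof *tmpl);
	if (interned[ninterned] == NULL)
		return NULL;
	*interned[ninterned] = *tmpl;
	return interned[ninterned++];
}

/* change one member: copy the current set, modify the replica, intern it;
 * two files choosing the same plugins end up sharing one instance */
static struct pset *pset_with_hash(const struct pset *cur, const void *hash)
{
	struct pset replica = *cur;

	replica.hash = hash;
	return intern(&replica);
}

plugin_set_field() does the same thing, but keys the table with a precomputed
hash, searches it under rcu_read_lock(), and re-checks under a per-bucket
spinlock before inserting, so two concurrent updaters cannot intern two
distinct twins.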
61082+
61083+static struct {
61084+ int offset;
61085+ reiser4_plugin_type type;
61086+} pset_descr[PSET_LAST] = {
61087+ [PSET_FILE] = {
61088+ .offset = offsetof(plugin_set, file),
61089+ .type = REISER4_FILE_PLUGIN_TYPE
61090+ },
61091+ [PSET_DIR] = {
61092+ .offset = offsetof(plugin_set, dir),
61093+ .type = REISER4_DIR_PLUGIN_TYPE
61094+ },
61095+ [PSET_PERM] = {
61096+ .offset = offsetof(plugin_set, perm),
61097+ .type = REISER4_PERM_PLUGIN_TYPE
61098+ },
61099+ [PSET_FORMATTING] = {
61100+ .offset = offsetof(plugin_set, formatting),
61101+ .type = REISER4_FORMATTING_PLUGIN_TYPE
61102+ },
61103+ [PSET_HASH] = {
61104+ .offset = offsetof(plugin_set, hash),
61105+ .type = REISER4_HASH_PLUGIN_TYPE
61106+ },
61107+ [PSET_FIBRATION] = {
61108+ .offset = offsetof(plugin_set, fibration),
61109+ .type = REISER4_FIBRATION_PLUGIN_TYPE
61110+ },
61111+ [PSET_SD] = {
61112+ .offset = offsetof(plugin_set, sd),
61113+ .type = REISER4_ITEM_PLUGIN_TYPE
61114+ },
61115+ [PSET_DIR_ITEM] = {
61116+ .offset = offsetof(plugin_set, dir_item),
61117+ .type = REISER4_ITEM_PLUGIN_TYPE
61118+ },
61119+ [PSET_CIPHER] = {
61120+ .offset = offsetof(plugin_set, cipher),
61121+ .type = REISER4_CIPHER_PLUGIN_TYPE
61122+ },
61123+ [PSET_DIGEST] = {
61124+ .offset = offsetof(plugin_set, digest),
61125+ .type = REISER4_DIGEST_PLUGIN_TYPE
61126+ },
61127+ [PSET_COMPRESSION] = {
61128+ .offset = offsetof(plugin_set, compression),
61129+ .type = REISER4_COMPRESSION_PLUGIN_TYPE
61130+ },
61131+ [PSET_COMPRESSION_MODE] = {
61132+ .offset = offsetof(plugin_set, compression_mode),
61133+ .type = REISER4_COMPRESSION_MODE_PLUGIN_TYPE
61134+ },
61135+ [PSET_CLUSTER] = {
61136+ .offset = offsetof(plugin_set, cluster),
61137+ .type = REISER4_CLUSTER_PLUGIN_TYPE
61138+ },
61139+ [PSET_REGULAR_ENTRY] = {
61140+ .offset = offsetof(plugin_set, regular_entry),
61141+ .type = REISER4_REGULAR_PLUGIN_TYPE
61142+ }
61143+};
61144+
61145+#if REISER4_DEBUG
61146+static reiser4_plugin_type pset_member_to_type(pset_member memb)
61147+{
61148+ assert("nikita-3501", 0 <= memb && memb < PSET_LAST);
61149+ return pset_descr[memb].type;
61150+}
61151+#endif
61152+
61153+reiser4_plugin_type pset_member_to_type_unsafe(pset_member memb)
61154+{
61155+ if (0 <= memb && memb < PSET_LAST)
61156+ return pset_descr[memb].type;
61157+ else
61158+ return REISER4_PLUGIN_TYPES;
61159+}
61160+
61161+int pset_set(plugin_set ** set, pset_member memb, reiser4_plugin * plugin)
61162+{
61163+ assert("nikita-3492", set != NULL);
61164+ assert("nikita-3493", *set != NULL);
61165+ assert("nikita-3494", plugin != NULL);
61166+ assert("nikita-3495", 0 <= memb && memb < PSET_LAST);
61167+ assert("nikita-3496", plugin->h.type_id == pset_member_to_type(memb));
61168+
61169+ return plugin_set_field(set,
61170+ (unsigned long)plugin, pset_descr[memb].offset);
61171+}
61172+
61173+reiser4_plugin *pset_get(plugin_set * set, pset_member memb)
61174+{
61175+ assert("nikita-3497", set != NULL);
61176+ assert("nikita-3498", 0 <= memb && memb < PSET_LAST);
61177+
61178+ return *(reiser4_plugin **) (((char *)set) + pset_descr[memb].offset);
61179+}
61180+
61181+#define DEFINE_PLUGIN_SET(type, field) \
61182+int plugin_set_ ## field(plugin_set **set, type *val) \
61183+{ \
61184+ cassert(sizeof val == sizeof(unsigned long)); \
61185+ return plugin_set_field(set, (unsigned long)val, \
61186+ offsetof(plugin_set, field)); \
61187+}
61188+
61189+DEFINE_PLUGIN_SET(file_plugin, file)
61190+DEFINE_PLUGIN_SET(dir_plugin, dir)
61191+DEFINE_PLUGIN_SET(formatting_plugin, formatting)
61192+DEFINE_PLUGIN_SET(hash_plugin, hash)
61193+DEFINE_PLUGIN_SET(fibration_plugin, fibration)
61194+DEFINE_PLUGIN_SET(item_plugin, sd)
61195+/* DEFINE_PLUGIN_SET(cipher_plugin, cipher) */
61196+/* DEFINE_PLUGIN_SET(digest_plugin, digest) */
61197+DEFINE_PLUGIN_SET(compression_plugin, compression)
61198+/* DEFINE_PLUGIN_SET(compression_mode_plugin, compression_mode) */
61199+DEFINE_PLUGIN_SET(cluster_plugin, cluster)
61200+/* DEFINE_PLUGIN_SET(regular_plugin, regular_entry) */
61201+
61202+
61203+/**
61204+ * init_plugin_set - create pset cache and hash table
61205+ *
61206+ * Initializes slab cache of plugin_set-s and their hash table. It is part of
61207+ * reiser4 module initialization.
61208+ */
61209+int init_plugin_set(void)
61210+{
61211+ int result;
61212+
61213+ result = ps_hash_init(&ps_table, PS_TABLE_SIZE);
61214+ if (result == 0) {
61215+ plugin_set_slab = kmem_cache_create("plugin_set",
61216+ sizeof(plugin_set), 0,
61217+ SLAB_HWCACHE_ALIGN,
61218+ NULL, NULL);
61219+ if (plugin_set_slab == NULL)
61220+ result = RETERR(-ENOMEM);
61221+ }
61222+ return result;
61223+}
61224+
61225+/**
61226+ * done_plugin_set - delete plugin_set cache and plugin_set hash table
61227+ *
61228+ * This is called on reiser4 module unloading or system shutdown.
61229+ */
61230+void done_plugin_set(void)
61231+{
61232+ plugin_set *cur, *next;
61233+
61234+ for_all_in_htable(&ps_table, ps, cur, next) {
61235+ ps_hash_remove(&ps_table, cur);
61236+ kmem_cache_free(plugin_set_slab, cur);
61237+ }
61238+ destroy_reiser4_cache(&plugin_set_slab);
61239+ ps_hash_done(&ps_table);
61240+}
61241+
61242+/*
61243+ * Local variables:
61244+ * c-indentation-style: "K&R"
61245+ * mode-name: "LC"
61246+ * c-basic-offset: 8
61247+ * tab-width: 8
61248+ * fill-column: 120
61249+ * End:
61250+ */
61251Index: linux-2.6.16/fs/reiser4/plugin/plugin_set.h
61252===================================================================
61253--- /dev/null
61254+++ linux-2.6.16/fs/reiser4/plugin/plugin_set.h
61255@@ -0,0 +1,83 @@
61256+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
61257+
61258+/* plugin-sets. see fs/reiser4/plugin/plugin_set.c for details */
61259+
61260+#if !defined( __PLUGIN_SET_H__ )
61261+#define __PLUGIN_SET_H__
61262+
61263+#include "../type_safe_hash.h"
61264+#include "plugin.h"
61265+
61266+#include <linux/rcupdate.h>
61267+
61268+struct plugin_set;
61269+typedef struct plugin_set plugin_set;
61270+
61271+TYPE_SAFE_HASH_DECLARE(ps, plugin_set);
61272+
61273+struct plugin_set {
61274+ unsigned long hashval;
61275+ /* plugin of file */
61276+ file_plugin *file;
61277+ /* plugin of dir */
61278+ dir_plugin *dir;
61279+ /* perm plugin for this file */
61280+ perm_plugin *perm;
61281+ /* tail policy plugin. Only meaningful for regular files */
61282+ formatting_plugin *formatting;
61283+ /* hash plugin. Only meaningful for directories. */
61284+ hash_plugin *hash;
61285+ /* fibration plugin. Only meaningful for directories. */
61286+ fibration_plugin *fibration;
61287+ /* plugin of stat-data */
61288+ item_plugin *sd;
61289+ /* plugin of items a directory is built of */
61290+ item_plugin *dir_item;
61291+ /* cipher plugin */
61292+ cipher_plugin *cipher;
61293+ /* digest plugin */
61294+ digest_plugin *digest;
61295+ /* compression plugin */
61296+ compression_plugin *compression;
61297+ /* compression mode plugin */
61298+ compression_mode_plugin *compression_mode;
61299+ /* cluster plugin */
61300+ cluster_plugin *cluster;
61301+	/* plugin with which a regular child should be created */
61302+ regular_plugin *regular_entry;
61303+ ps_hash_link link;
61304+};
61305+
61306+extern plugin_set *plugin_set_get_empty(void);
61307+extern void plugin_set_put(plugin_set * set);
61308+
61309+extern int plugin_set_file(plugin_set ** set, file_plugin * plug);
61310+extern int plugin_set_dir(plugin_set ** set, dir_plugin * plug);
61311+extern int plugin_set_formatting(plugin_set ** set, formatting_plugin * plug);
61312+extern int plugin_set_hash(plugin_set ** set, hash_plugin * plug);
61313+extern int plugin_set_fibration(plugin_set ** set, fibration_plugin * plug);
61314+extern int plugin_set_sd(plugin_set ** set, item_plugin * plug);
61315+extern int plugin_set_compression(plugin_set ** set, compression_plugin * plug);
61316+extern int plugin_set_cluster(plugin_set ** set, cluster_plugin * plug);
61317+
61318+extern int init_plugin_set(void);
61319+extern void done_plugin_set(void);
61320+
61321+extern int pset_set(plugin_set ** set, pset_member memb,
61322+ reiser4_plugin * plugin);
61323+extern reiser4_plugin *pset_get(plugin_set * set, pset_member memb);
61324+
61325+extern reiser4_plugin_type pset_member_to_type_unsafe(pset_member memb);
61326+
61327+/* __PLUGIN_SET_H__ */
61328+#endif
61329+
61330+/* Make Linus happy.
61331+ Local variables:
61332+ c-indentation-style: "K&R"
61333+ mode-name: "LC"
61334+ c-basic-offset: 8
61335+ tab-width: 8
61336+ fill-column: 120
61337+ End:
61338+*/
61339Index: linux-2.6.16/fs/reiser4/plugin/regular.c
61340===================================================================
61341--- /dev/null
61342+++ linux-2.6.16/fs/reiser4/plugin/regular.c
61343@@ -0,0 +1,44 @@
61344+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
61345+
61346+/* Contains Reiser4 regular plugins, which:
61347+   . specify a set of reiser4 regular object plugins,
61348+   . are used by the directory plugin to create entries powered by the
61349+     specified regular plugins */
61350+
61351+#include "plugin.h"
61352+
61353+regular_plugin regular_plugins[LAST_REGULAR_ID] = {
61354+ [UF_REGULAR_ID] = {
61355+ .h = {
61356+ .type_id = REISER4_REGULAR_PLUGIN_TYPE,
61357+ .id = UF_REGULAR_ID,
61358+ .pops = NULL,
61359+ .label = "unixfile",
61360+ .desc = "Unix file regular plugin",
61361+ .linkage = {NULL, NULL}
61362+ },
61363+ .id = UNIX_FILE_PLUGIN_ID
61364+ },
61365+ [CRC_REGULAR_ID] = {
61366+ .h = {
61367+ .type_id = REISER4_REGULAR_PLUGIN_TYPE,
61368+ .id = CRC_REGULAR_ID,
61369+ .pops = NULL,
61370+ .label = "cryptcompress",
61371+ .desc = "Cryptcompress regular plugin",
61372+ .linkage = {NULL, NULL}
61373+ },
61374+ .id = CRC_FILE_PLUGIN_ID
61375+ }
61376+};
61377+
61378+/*
61379+ Local variables:
61380+ c-indentation-style: "K&R"
61381+ mode-name: "LC"
61382+ c-basic-offset: 8
61383+ tab-width: 8
61384+ fill-column: 120
61385+ scroll-step: 1
61386+ End:
61387+*/
61388Index: linux-2.6.16/fs/reiser4/plugin/security/Makefile
61389===================================================================
61390--- /dev/null
61391+++ linux-2.6.16/fs/reiser4/plugin/security/Makefile
61392@@ -0,0 +1,4 @@
61393+obj-$(CONFIG_REISER4_FS) += security_plugins.o
61394+
61395+security_plugins-objs := \
61396+ perm.o
61397Index: linux-2.6.16/fs/reiser4/plugin/security/perm.c
61398===================================================================
61399--- /dev/null
61400+++ linux-2.6.16/fs/reiser4/plugin/security/perm.c
61401@@ -0,0 +1,44 @@
61402+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
61403+
61404+/*
61405+ * this file contains the implementation of permission plugins. Currently,
61406+ * only NULL_PERM_ID is implemented
61407+ */
61408+
61409+#include "../plugin.h"
61410+#include "../plugin_header.h"
61411+#include "../../debug.h"
61412+
61413+perm_plugin perm_plugins[LAST_PERM_ID] = {
61414+ [NULL_PERM_ID] = {
61415+ .h = {
61416+ .type_id = REISER4_PERM_PLUGIN_TYPE,
61417+ .id = NULL_PERM_ID,
61418+ .pops = NULL,
61419+ .label = "null",
61420+ .desc = "stub permission plugin",
61421+ .linkage = {NULL, NULL}
61422+ },
61423+ .read_ok = NULL,
61424+ .write_ok = NULL,
61425+ .lookup_ok = NULL,
61426+ .create_ok = NULL,
61427+ .link_ok = NULL,
61428+ .unlink_ok = NULL,
61429+ .delete_ok = NULL,
61430+ .mask_ok = NULL,
61431+ .setattr_ok = NULL,
61432+ .getattr_ok = NULL,
61433+ .rename_ok = NULL,
61434+ }
61435+};
61436+
61437+/*
61438+ * Local variables:
61439+ * c-indentation-style: "K&R"
61440+ * mode-name: "LC"
61441+ * c-basic-offset: 8
61442+ * tab-width: 8
61443+ * fill-column: 79
61444+ * End:
61445+ */
61446Index: linux-2.6.16/fs/reiser4/plugin/security/perm.h
61447===================================================================
61448--- /dev/null
61449+++ linux-2.6.16/fs/reiser4/plugin/security/perm.h
61450@@ -0,0 +1,82 @@
61451+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
61452+
61453+/* Perm (short for "permissions") plugins common stuff. */
61454+
61455+#if !defined( __REISER4_PERM_H__ )
61456+#define __REISER4_PERM_H__
61457+
61458+#include "../../forward.h"
61459+#include "../plugin_header.h"
61460+
61461+#include <linux/types.h>
61462+#include <linux/fs.h> /* for struct file */
61463+#include <linux/dcache.h> /* for struct dentry */
61464+
61465+/* interface for perm plugin.
61466+
61467+ A perm plugin method can be implemented through:
61468+
61469+ 1. consulting ->i_mode bits in stat data
61470+
61471+ 2. obtaining acl from the tree and inspecting it
61472+
61473+ 3. asking some kernel module or user-level program to authorize access.
61474+
61475+ This allows for integration with things like capabilities, SELinux-style
61476+ security contexts, etc.
61477+
61478+*/
61479+/* NIKITA-FIXME-HANS: define what this is targeted for. It does not seem to be intended for use with sys_reiser4. Explain. */
61480+typedef struct perm_plugin {
61481+ /* generic plugin fields */
61482+ plugin_header h;
61483+
61484+ /* check permissions for read/write */
61485+ int (*read_ok) (struct file *file, const char __user *buf,
61486+ size_t size, loff_t *off);
61487+ int (*write_ok) (struct file *file, const char __user *buf,
61488+ size_t size, loff_t *off);
61489+
61490+ /* check permissions for lookup */
61491+ int (*lookup_ok) (struct inode * parent, struct dentry * dentry);
61492+
61493+ /* check permissions for create */
61494+ int (*create_ok) (struct inode * parent, struct dentry * dentry,
61495+ reiser4_object_create_data * data);
61496+
61497+ /* check permissions for linking @where to @existing */
61498+ int (*link_ok) (struct dentry * existing, struct inode * parent,
61499+ struct dentry * where);
61500+
61501+ /* check permissions for unlinking @victim from @parent */
61502+ int (*unlink_ok) (struct inode * parent, struct dentry * victim);
61503+
61504+ /* check permissions for deletion of @object whose last reference is
61505+ by @parent */
61506+ int (*delete_ok) (struct inode * parent, struct dentry * victim);
61507+ int (*mask_ok) (struct inode * inode, int mask);
61508+ /* check whether attribute change is acceptable */
61509+ int (*setattr_ok) (struct dentry * dentry, struct iattr * attr);
61510+
61511+ /* check whether stat(2) is allowed */
61512+ int (*getattr_ok) (struct vfsmount * mnt UNUSED_ARG,
61513+ struct dentry * dentry, struct kstat * stat);
61514+ /* check whether rename(2) is allowed */
61515+ int (*rename_ok) (struct inode * old_dir, struct dentry * old,
61516+ struct inode * new_dir, struct dentry * new);
61517+} perm_plugin;
61518+
61519+typedef enum { NULL_PERM_ID, LAST_PERM_ID } reiser4_perm_id;
61520+
61521+/* __REISER4_PERM_H__ */
61522+#endif
61523+
61524+/* Make Linus happy.
61525+ Local variables:
61526+ c-indentation-style: "K&R"
61527+ mode-name: "LC"
61528+ c-basic-offset: 8
61529+ tab-width: 8
61530+ fill-column: 120
61531+ End:
61532+*/
61533Index: linux-2.6.16/fs/reiser4/plugin/space/Makefile
61534===================================================================
61535--- /dev/null
61536+++ linux-2.6.16/fs/reiser4/plugin/space/Makefile
61537@@ -0,0 +1,4 @@
61538+obj-$(CONFIG_REISER4_FS) += space_plugins.o
61539+
61540+space_plugins-objs := \
61541+ bitmap.o
61542Index: linux-2.6.16/fs/reiser4/plugin/space/bitmap.c
61543===================================================================
61544--- /dev/null
61545+++ linux-2.6.16/fs/reiser4/plugin/space/bitmap.c
61546@@ -0,0 +1,1592 @@
61547+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
61548+
61549+#include "../../debug.h"
61550+#include "../../dformat.h"
61551+#include "../../txnmgr.h"
61552+#include "../../jnode.h"
61553+#include "../../block_alloc.h"
61554+#include "../../tree.h"
61555+#include "../../super.h"
61556+#include "../plugin.h"
61557+#include "space_allocator.h"
61558+#include "bitmap.h"
61559+
61560+#include <linux/types.h>
61561+#include <linux/fs.h> /* for struct super_block */
61562+#include <asm/semaphore.h>
61563+#include <linux/vmalloc.h>
61564+#include <asm/div64.h>
61565+
61566+/* Proposed (but discarded) optimization: dynamic loading/unloading of bitmap
61567+ * blocks
61568+
61569+   A useful optimization of reiser4 bitmap handling would be dynamic loading
61570+   and unloading of bitmap blocks, unlike v3.x where all bitmap
61571+   blocks are loaded at mount time.
61572+
61573+   To implement bitmap block unloading we need to count bitmap block usage
61574+ and detect currently unused blocks allowing them to be unloaded. It is not
61575+ a simple task since we allow several threads to modify one bitmap block
61576+ simultaneously.
61577+
61578+   Briefly speaking, the following scheme is proposed: we keep a counter in a
61579+   special variable associated with each bitmap block, counting the block
61580+   alloc/dealloc operations on that bitmap block. With the deferred block
61581+   deallocation feature of reiser4 all those operations will be represented in
61582+ atom dirty/deleted lists as jnodes for freshly allocated or deleted
61583+ nodes.
61584+
61585+   So, we increment the usage counter for each new node allocated or deleted,
61586+   and decrement it at atom commit once for each node on the atom's
61587+   dirty/deleted lists. Of course, deletion of a freshly allocated node, and
61588+   node reuse from the atom's deleted list (if we do so), should decrement the
61589+   bitmap usage counter as well.
61590+
61591+   This scheme seems workable, but such reference counting is
61592+   not easy to debug. I think we should agree with Hans and not implement
61593+   it in v4.0. The current code implements "on-demand" bitmap block loading only.
61594+
61595+   For simplicity, all bitmap nodes (both commit and working bitmap blocks) are
61596+   either loaded into memory at fs mount time or each bitmap node is loaded at
61597+   the first access to it; the "dont_load_bitmap" mount option controls whether
61598+   bitmap nodes are loaded at mount time. Dynamic unloading of bitmap
61599+   nodes is currently not supported. */
61600+
61601+#define CHECKSUM_SIZE 4
61602+
61603+#define BYTES_PER_LONG (sizeof(long))
61604+
61605+#if BITS_PER_LONG == 64
61606+# define LONG_INT_SHIFT (6)
61607+#else
61608+# define LONG_INT_SHIFT (5)
61609+#endif
61610+
61611+#define LONG_INT_MASK (BITS_PER_LONG - 1UL)
61612+
61613+typedef unsigned long ulong_t;
61614+
61615+#define bmap_size(blocksize) ((blocksize) - CHECKSUM_SIZE)
61616+#define bmap_bit_count(blocksize) (bmap_size(blocksize) << 3)
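/* e.g. with a 4096-byte block: bmap_size() = 4092 bytes of bitmap data and
 * bmap_bit_count() = 32736 blocks addressed per bitmap block */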
61617+
61618+/* Block allocation/deallocation are done through special bitmap objects which
61619+ are allocated in an array at fs mount. */
61620+struct bitmap_node {
61621+ struct semaphore sema; /* long term lock object */
61622+
61623+ jnode *wjnode; /* j-nodes for WORKING ... */
61624+ jnode *cjnode; /* ... and COMMIT bitmap blocks */
61625+
61626+ bmap_off_t first_zero_bit; /* for skip_busy option implementation */
61627+
61628+ atomic_t loaded; /* a flag which shows that bnode is loaded
61629+ * already */
61630+};
61631+
61632+static inline char *bnode_working_data(struct bitmap_node *bnode)
61633+{
61634+ char *data;
61635+
61636+ data = jdata(bnode->wjnode);
61637+ assert("zam-429", data != NULL);
61638+
61639+ return data + CHECKSUM_SIZE;
61640+}
61641+
61642+static inline char *bnode_commit_data(const struct bitmap_node *bnode)
61643+{
61644+ char *data;
61645+
61646+ data = jdata(bnode->cjnode);
61647+ assert("zam-430", data != NULL);
61648+
61649+ return data + CHECKSUM_SIZE;
61650+}
61651+
61652+static inline __u32 bnode_commit_crc(const struct bitmap_node *bnode)
61653+{
61654+ char *data;
61655+
61656+ data = jdata(bnode->cjnode);
61657+ assert("vpf-261", data != NULL);
61658+
61659+ return le32_to_cpu(get_unaligned((d32 *)data));
61660+}
61661+
61662+static inline void bnode_set_commit_crc(struct bitmap_node *bnode, __u32 crc)
61663+{
61664+ char *data;
61665+
61666+ data = jdata(bnode->cjnode);
61667+ assert("vpf-261", data != NULL);
61668+
61669+ put_unaligned(cpu_to_le32(crc), (d32 *)data);
61670+}
61671+
61672+/* ZAM-FIXME-HANS: is the idea that this might be a union someday? having
61673+ * written the code, does this added abstraction still have any value? */
61674+/* ANSWER(Zam): No, the abstraction is in the level above (the exact place is
61675+ * the reiser4_space_allocator structure) */
61676+/* ZAM-FIXME-HANS: I don't understand your english in comment above. */
61677+/* FIXME-HANS(Zam): I don't understand the questions like "might be a union
61678+ * someday?". What are they about? If there is a reason to have a union, it should
61679+ * be a union, if not, it should not be a union. "..might be someday" means no
61680+ * reason. */
61681+struct bitmap_allocator_data {
61682+ /* an array for bitmap blocks direct access */
61683+ struct bitmap_node *bitmap;
61684+};
61685+
61686+#define get_barray(super) \
61687+(((struct bitmap_allocator_data *)(get_super_private(super)->space_allocator.u.generic)) -> bitmap)
61688+
61689+#define get_bnode(super, i) (get_barray(super) + i)
61690+
61691+/* allocate and initialize jnode with JNODE_BITMAP type */
61692+static jnode *bnew(void)
61693+{
61694+ jnode *jal = jalloc();
61695+
61696+ if (jal)
61697+ jnode_init(jal, current_tree, JNODE_BITMAP);
61698+
61699+ return jal;
61700+}
61701+
61702+/* this file contains:
61703+ - bitmap based implementation of space allocation plugin
61704+ - all the helper functions like set bit, find_first_zero_bit, etc */
61705+
61706+/* Audited by: green(2002.06.12) */
61707+static int find_next_zero_bit_in_word(ulong_t word, int start_bit)
61708+{
61709+ ulong_t mask = 1UL << start_bit;
61710+ int i = start_bit;
61711+
61712+ while ((word & mask) != 0) {
61713+ mask <<= 1;
61714+ if (++i >= BITS_PER_LONG)
61715+ break;
61716+ }
61717+
61718+ return i;
61719+}
61720+
61721+#include <asm/bitops.h>
61722+
61723+#if BITS_PER_LONG == 64
61724+
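/* The ext2_*_bit() helpers take a base pointer and a bit offset; OFF()/BASE()
 * below round an arbitrary byte address down to a long-aligned base and fold
 * the difference into the bit offset, so that on 64-bit platforms the helpers
 * never operate on a misaligned long. */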
61725+#define OFF(addr) (((ulong_t)(addr) & (BYTES_PER_LONG - 1)) << 3)
61726+#define BASE(addr) ((ulong_t*) ((ulong_t)(addr) & ~(BYTES_PER_LONG - 1)))
61727+
61728+static inline void reiser4_set_bit(int nr, void *addr)
61729+{
61730+ ext2_set_bit(nr + OFF(addr), BASE(addr));
61731+}
61732+
61733+static inline void reiser4_clear_bit(int nr, void *addr)
61734+{
61735+ ext2_clear_bit(nr + OFF(addr), BASE(addr));
61736+}
61737+
61738+static inline int reiser4_test_bit(int nr, void *addr)
61739+{
61740+ return ext2_test_bit(nr + OFF(addr), BASE(addr));
61741+}
61742+static inline int reiser4_find_next_zero_bit(void *addr, int maxoffset,
61743+ int offset)
61744+{
61745+ int off = OFF(addr);
61746+
61747+ return ext2_find_next_zero_bit(BASE(addr), maxoffset + off,
61748+ offset + off) - off;
61749+}
61750+
61751+#else
61752+
61753+#define reiser4_set_bit(nr, addr) ext2_set_bit(nr, addr)
61754+#define reiser4_clear_bit(nr, addr) ext2_clear_bit(nr, addr)
61755+#define reiser4_test_bit(nr, addr) ext2_test_bit(nr, addr)
61756+
61757+#define reiser4_find_next_zero_bit(addr, maxoffset, offset) \
61758+ext2_find_next_zero_bit(addr, maxoffset, offset)
61759+#endif
61760+
61761+/* Search for a set bit in the bit array [@start_offset, @max_offset), offsets
61762+ * are counted from @addr; return the offset of the first set bit if one is
61763+ * found, @max_offset otherwise. */
61764+static bmap_off_t __reiser4_find_next_set_bit(void *addr, bmap_off_t max_offset,
61765+ bmap_off_t start_offset)
61766+{
61767+ ulong_t *base = addr;
61768+	/* start_offset is in bits, convert it to a word offset within the bitmap. */
61769+ int word_nr = start_offset >> LONG_INT_SHIFT;
61770+	/* bit number within the word. */
61771+ int bit_nr = start_offset & LONG_INT_MASK;
61772+ int max_word_nr = (max_offset - 1) >> LONG_INT_SHIFT;
61773+
61774+ assert("zam-387", max_offset != 0);
61775+
61776+ /* Unaligned @start_offset case. */
61777+ if (bit_nr != 0) {
61778+ bmap_nr_t nr;
61779+
61780+ nr = find_next_zero_bit_in_word(~(base[word_nr]), bit_nr);
61781+
61782+ if (nr < BITS_PER_LONG)
61783+ return (word_nr << LONG_INT_SHIFT) + nr;
61784+
61785+ ++word_nr;
61786+ }
61787+
61788+	/* Fast scan through aligned words. */
61789+ while (word_nr <= max_word_nr) {
61790+ if (base[word_nr] != 0) {
61791+ return (word_nr << LONG_INT_SHIFT)
61792+ + find_next_zero_bit_in_word(~(base[word_nr]), 0);
61793+ }
61794+
61795+ ++word_nr;
61796+ }
61797+
61798+ return max_offset;
61799+}
61800+
61801+#if BITS_PER_LONG == 64
61802+
61803+static bmap_off_t reiser4_find_next_set_bit(void *addr, bmap_off_t max_offset,
61804+ bmap_off_t start_offset)
61805+{
61806+ bmap_off_t off = OFF(addr);
61807+
61808+ return __reiser4_find_next_set_bit(BASE(addr), max_offset + off,
61809+ start_offset + off) - off;
61810+}
61811+
61812+#else
61813+#define reiser4_find_next_set_bit(addr, max_offset, start_offset) \
61814+ __reiser4_find_next_set_bit(addr, max_offset, start_offset)
61815+#endif
61816+
61817+/* search backward, from @start_bit down, for the last set bit in a single word; returns BITS_PER_LONG if none is found. */
61818+static int find_last_set_bit_in_word(ulong_t word, int start_bit)
61819+{
61820+ ulong_t bit_mask;
61821+ int nr = start_bit;
61822+
61823+ assert("zam-965", start_bit < BITS_PER_LONG);
61824+ assert("zam-966", start_bit >= 0);
61825+
61826+ bit_mask = (1UL << nr);
61827+
61828+ while (bit_mask != 0) {
61829+ if (bit_mask & word)
61830+ return nr;
61831+ bit_mask >>= 1;
61832+ nr--;
61833+ }
61834+ return BITS_PER_LONG;
61835+}
61836+
61837+/* Search bitmap for a set bit in backward direction from the end to the
61838+ * beginning of given region
61839+ *
61840+ * @result: result offset of the last set bit
61841+ * @addr: base memory address,
61842+ * @low_off: low end of the search region, edge bit included into the region,
61843+ * @high_off: high end of the search region, edge bit included into the region,
61844+ *
61845+ * @return: 0 - set bit was found, -1 otherwise.
61846+ */
61847+static int
61848+reiser4_find_last_set_bit(bmap_off_t * result, void *addr, bmap_off_t low_off,
61849+ bmap_off_t high_off)
61850+{
61851+ ulong_t *base = addr;
61852+ int last_word;
61853+ int first_word;
61854+ int last_bit;
61855+ int nr;
61856+
61857+ assert("zam-961", high_off >= 0);
61858+ assert("zam-962", high_off >= low_off);
61859+
61860+ last_word = high_off >> LONG_INT_SHIFT;
61861+ last_bit = high_off & LONG_INT_MASK;
61862+ first_word = low_off >> LONG_INT_SHIFT;
61863+
61864+ if (last_bit < BITS_PER_LONG) {
61865+ nr = find_last_set_bit_in_word(base[last_word], last_bit);
61866+ if (nr < BITS_PER_LONG) {
61867+ *result = (last_word << LONG_INT_SHIFT) + nr;
61868+ return 0;
61869+ }
61870+ --last_word;
61871+ }
61872+ while (last_word >= first_word) {
61873+ if (base[last_word] != 0x0) {
61874+ last_bit =
61875+ find_last_set_bit_in_word(base[last_word],
61876+ BITS_PER_LONG - 1);
61877+ assert("zam-972", last_bit < BITS_PER_LONG);
61878+ *result = (last_word << LONG_INT_SHIFT) + last_bit;
61879+ return 0;
61880+ }
61881+ --last_word;
61882+ }
61883+
61884+ return -1; /* set bit not found */
61885+}
61886+
61887+/* Search bitmap for a clear bit in backward direction from the end to the
61888+ * beginning of given region */
61889+static int
61890+reiser4_find_last_zero_bit(bmap_off_t * result, void *addr, bmap_off_t low_off,
61891+ bmap_off_t high_off)
61892+{
61893+ ulong_t *base = addr;
61894+ int last_word;
61895+ int first_word;
61896+ int last_bit;
61897+ int nr;
61898+
61899+ last_word = high_off >> LONG_INT_SHIFT;
61900+ last_bit = high_off & LONG_INT_MASK;
61901+ first_word = low_off >> LONG_INT_SHIFT;
61902+
61903+ if (last_bit < BITS_PER_LONG) {
61904+ nr = find_last_set_bit_in_word(~base[last_word], last_bit);
61905+ if (nr < BITS_PER_LONG) {
61906+ *result = (last_word << LONG_INT_SHIFT) + nr;
61907+ return 0;
61908+ }
61909+ --last_word;
61910+ }
61911+ while (last_word >= first_word) {
61912+ if (base[last_word] != (ulong_t) (-1)) {
61913+ *result = (last_word << LONG_INT_SHIFT) +
61914+ find_last_set_bit_in_word(~base[last_word],
61915+ BITS_PER_LONG - 1);
61916+ return 0;
61917+ }
61918+ --last_word;
61919+ }
61920+
61921+ return -1; /* zero bit not found */
61922+}
61923+
61924+/* Audited by: green(2002.06.12) */
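/* Clear all bits in the half-open interval [@start, @end) of the bitmap at
 * @addr: whole interior bytes are zeroed with memset(), the two boundary
 * bytes are masked individually. */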
61925+static void reiser4_clear_bits(char *addr, bmap_off_t start, bmap_off_t end)
61926+{
61927+ int first_byte;
61928+ int last_byte;
61929+
61930+ unsigned char first_byte_mask = 0xFF;
61931+ unsigned char last_byte_mask = 0xFF;
61932+
61933+ assert("zam-410", start < end);
61934+
61935+ first_byte = start >> 3;
61936+ last_byte = (end - 1) >> 3;
61937+
61938+ if (last_byte > first_byte + 1)
61939+ memset(addr + first_byte + 1, 0,
61940+ (size_t) (last_byte - first_byte - 1));
61941+
61942+ first_byte_mask >>= 8 - (start & 0x7);
61943+ last_byte_mask <<= ((end - 1) & 0x7) + 1;
61944+
61945+ if (first_byte == last_byte) {
61946+ addr[first_byte] &= (first_byte_mask | last_byte_mask);
61947+ } else {
61948+ addr[first_byte] &= first_byte_mask;
61949+ addr[last_byte] &= last_byte_mask;
61950+ }
61951+}
61952+
61953+/* Audited by: green(2002.06.12) */
61954+/* ZAM-FIXME-HANS: comment this */
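/* Set all bits in the half-open interval [@start, @end) of the bitmap at
 * @addr: whole interior bytes are filled with memset(0xFF), the two boundary
 * bytes are patched with masks (a single combined mask when @start and
 * @end - 1 fall into the same byte). */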
61955+static void reiser4_set_bits(char *addr, bmap_off_t start, bmap_off_t end)
61956+{
61957+ int first_byte;
61958+ int last_byte;
61959+
61960+ unsigned char first_byte_mask = 0xFF;
61961+ unsigned char last_byte_mask = 0xFF;
61962+
61963+ assert("zam-386", start < end);
61964+
61965+ first_byte = start >> 3;
61966+ last_byte = (end - 1) >> 3;
61967+
61968+ if (last_byte > first_byte + 1)
61969+ memset(addr + first_byte + 1, 0xFF,
61970+ (size_t) (last_byte - first_byte - 1));
61971+
61972+ first_byte_mask <<= start & 0x7;
61973+ last_byte_mask >>= 7 - ((end - 1) & 0x7);
61974+
61975+ if (first_byte == last_byte) {
61976+ addr[first_byte] |= (first_byte_mask & last_byte_mask);
61977+ } else {
61978+ addr[first_byte] |= first_byte_mask;
61979+ addr[last_byte] |= last_byte_mask;
61980+ }
61981+}
61982+
61983+#define ADLER_BASE 65521
61984+#define ADLER_NMAX 5552
61985+
61986+/* Calculates the adler32 checksum for the data pointed by `data` of the
61987+ length `len`. This function was originally taken from zlib, version 1.1.3,
61988+ July 9th, 1998.
61989+
61990+ Copyright (C) 1995-1998 Jean-loup Gailly and Mark Adler
61991+
61992+ This software is provided 'as-is', without any express or implied
61993+ warranty. In no event will the authors be held liable for any damages
61994+ arising from the use of this software.
61995+
61996+ Permission is granted to anyone to use this software for any purpose,
61997+ including commercial applications, and to alter it and redistribute it
61998+ freely, subject to the following restrictions:
61999+
62000+ 1. The origin of this software must not be misrepresented; you must not
62001+ claim that you wrote the original software. If you use this software
62002+ in a product, an acknowledgment in the product documentation would be
62003+ appreciated but is not required.
62004+ 2. Altered source versions must be plainly marked as such, and must not be
62005+ misrepresented as being the original software.
62006+ 3. This notice may not be removed or altered from any source distribution.
62007+
62008+ Jean-loup Gailly Mark Adler
62009+ jloup@gzip.org madler@alumni.caltech.edu
62010+
62011+ The above comment applies only to the reiser4_adler32 function.
62012+*/
62013+
62014+__u32 reiser4_adler32(char *data, __u32 len)
62015+{
62016+ unsigned char *t = data;
62017+ __u32 s1 = 1;
62018+ __u32 s2 = 0;
62019+ int k;
62020+
62021+ while (len > 0) {
62022+ k = len < ADLER_NMAX ? len : ADLER_NMAX;
62023+ len -= k;
62024+
62025+ while (k--) {
62026+ s1 += *t++;
62027+ s2 += s1;
62028+ }
62029+
62030+ s1 %= ADLER_BASE;
62031+ s2 %= ADLER_BASE;
62032+ }
62033+ return (s2 << 16) | s1;
62034+}
62035+
62036+#define sb_by_bnode(bnode) \
62037+ ((struct super_block *)jnode_get_tree(bnode->wjnode)->super)
62038+
62039+static __u32 bnode_calc_crc(const struct bitmap_node *bnode, unsigned long size)
62040+{
62041+ return reiser4_adler32(bnode_commit_data(bnode), bmap_size(size));
62042+}
62043+
62044+static int
62045+bnode_check_adler32(const struct bitmap_node *bnode, unsigned long size)
62046+{
62047+ if (bnode_calc_crc(bnode, size) != bnode_commit_crc(bnode)) {
62048+ bmap_nr_t bmap;
62049+
62050+ bmap = bnode - get_bnode(sb_by_bnode(bnode), 0);
62051+
62052+ warning("vpf-263",
62053+ "Checksum for the bitmap block %llu is incorrect",
62054+ bmap);
62055+
62056+ return RETERR(-EIO);
62057+ }
62058+
62059+ return 0;
62060+}
62061+
62062+#define REISER4_CHECK_BMAP_CRC (0)
62063+
62064+#if REISER4_CHECK_BMAP_CRC
62065+static int bnode_check_crc(const struct bitmap_node *bnode)
62066+{
62067+ return bnode_check_adler32(bnode,
62068+ bmap_size(sb_by_bnode(bnode)->s_blocksize));
62069+}
62070+
62071+/* REISER4_CHECK_BMAP_CRC */
62072+#else
62073+
62074+#define bnode_check_crc(bnode) (0)
62075+
62076+/* REISER4_CHECK_BMAP_CRC */
62077+#endif
62078+
62079+/* Recalculates the adler32 checksum after a single byte change.
62080+   adler - previous adler checksum
62081+   old_data, data - old and new byte values.
62082+   tail == (chunk - offset): chunk is the length the checksum was calculated
62083+   over, offset is the offset of the changed byte within that chunk.
62084+ This function can be used for checksum calculation optimisation.
62085+*/
62086+
62087+static __u32
62088+adler32_recalc(__u32 adler, unsigned char old_data, unsigned char data,
62089+ __u32 tail)
62090+{
62091+ __u32 delta = data - old_data + 2 * ADLER_BASE;
62092+ __u32 s1 = adler & 0xffff;
62093+ __u32 s2 = (adler >> 16) & 0xffff;
62094+
62095+ s1 = (delta + s1) % ADLER_BASE;
62096+ s2 = (delta * tail + s2) % ADLER_BASE;
62097+
62098+ return (s2 << 16) | s1;
62099+}
62100+
62101+#define LIMIT(val, boundary) ((val) > (boundary) ? (boundary) : (val))
62102+
62103+/**
62104+ * get_nr_bmap - calculate number of bitmap blocks
62105+ * @super: super block with initialized blocksize and block count
62106+ *
62107+ * Calculates the number of bitmap blocks of a filesystem which uses bitmaps
62108+ * to maintain free disk space. It assumes that each bitmap addresses the same
62109+ * number of blocks, which is calculated by the bmap_bit_count macro defined
62110+ * above. The number of blocks in the filesystem has to be initialized in the
62111+ * reiser4 private data of the super block already so that it can be obtained
62112+ * via reiser4_block_count(). Unfortunately, the number of blocks addressed by
62113+ * a bitmap is not a power of 2 because 4 bytes are used for the checksum.
62114+ * Therefore, we have to use a special function to divide and modulo 64-bit
62115+ * filesystem block counters.
62116+ *
62117+ * Example: suppose a filesystem has 32768 blocks. Blocksize is 4096. Each bitmap
62118+ * block addresses (4096 - 4) * 8 = 32736 blocks. Number of bitmaps to address
62119+ * all 32768 blocks is calculated as (32768 - 1) / 32736 + 1 = 2.
62120+ */
62121+static bmap_nr_t get_nr_bmap(const struct super_block *super)
62122+{
62123+ u64 quotient;
62124+
62125+ assert("zam-393", reiser4_block_count(super) != 0);
62126+
62127+ quotient = reiser4_block_count(super) - 1;
62128+ do_div(quotient, bmap_bit_count(super->s_blocksize));
62129+ return quotient + 1;
62130+}
62131+
62132+/**
62133+ * parse_blocknr - calculate bitmap number and offset in it by block number
62134+ * @block: pointer to block number to calculate location in bitmap of
62135+ * @bmap: pointer where to store bitmap block number
62136+ * @offset: pointer where to store offset within bitmap block
62137+ *
62138+ * Calculates location of bit which is responsible for allocation/freeing of
62139+ * block @*block. That location is represented by bitmap block number and offset
62140+ * within that bitmap block.
62141+ */
62142+static void
62143+parse_blocknr(const reiser4_block_nr *block, bmap_nr_t *bmap,
62144+ bmap_off_t *offset)
62145+{
62146+ struct super_block *super = get_current_context()->super;
62147+ u64 quotient = *block;
62148+
62149+ *offset = do_div(quotient, bmap_bit_count(super->s_blocksize));
62150+ *bmap = quotient;
62151+
62152+ assert("zam-433", *bmap < get_nr_bmap(super));
62153+ assert("", *offset < bmap_bit_count(super->s_blocksize));
62154+}
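/* For example, with a 4096-byte blocksize each bitmap block addresses
 * bmap_bit_count(4096) = (4096 - 4) * 8 = 32736 blocks, so block number 40000
 * parses to *bmap = 1 and *offset = 40000 - 32736 = 7264. */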
62155+
62156+#if REISER4_DEBUG
62157+/* Audited by: green(2002.06.12) */
62158+static void
62159+check_block_range(const reiser4_block_nr * start, const reiser4_block_nr * len)
62160+{
62161+ struct super_block *sb = reiser4_get_current_sb();
62162+
62163+ assert("zam-436", sb != NULL);
62164+
62165+ assert("zam-455", start != NULL);
62166+ assert("zam-437", *start != 0);
62167+ assert("zam-541", !blocknr_is_fake(start));
62168+ assert("zam-441", *start < reiser4_block_count(sb));
62169+
62170+ if (len != NULL) {
62171+ assert("zam-438", *len != 0);
62172+ assert("zam-442", *start + *len <= reiser4_block_count(sb));
62173+ }
62174+}
62175+
62176+static void check_bnode_loaded(const struct bitmap_node *bnode)
62177+{
62178+ assert("zam-485", bnode != NULL);
62179+ assert("zam-483", jnode_page(bnode->wjnode) != NULL);
62180+ assert("zam-484", jnode_page(bnode->cjnode) != NULL);
62181+ assert("nikita-2820", jnode_is_loaded(bnode->wjnode));
62182+ assert("nikita-2821", jnode_is_loaded(bnode->cjnode));
62183+}
62184+
62185+#else
62186+
62187+# define check_block_range(start, len) do { /* nothing */} while(0)
62188+# define check_bnode_loaded(bnode) do { /* nothing */} while(0)
62189+
62190+#endif
62191+
62192+/* modify bnode->first_zero_bit (if we free bits before); bnode should be
62193+ spin-locked */
62194+static inline void
62195+adjust_first_zero_bit(struct bitmap_node *bnode, bmap_off_t offset)
62196+{
62197+ if (offset < bnode->first_zero_bit)
62198+ bnode->first_zero_bit = offset;
62199+}
62200+
62201+/* return a physical disk address for logical bitmap number @bmap */
62202+/* FIXME-VS: this is somehow related to disk layout? */
62203+/* ZAM-FIXME-HANS: your answer is? Use not more than one function dereference
62204+ * per block allocation so that performance is not affected. Probably this
62205+ * whole file should be considered part of the disk layout plugin, and other
62206+ * disk layouts can use other defines and efficiency will not be significantly
62207+ * affected. */
62208+
62209+#define REISER4_FIRST_BITMAP_BLOCK \
62210+ ((REISER4_MASTER_OFFSET / PAGE_CACHE_SIZE) + 2)
62211+
62212+/* Audited by: green(2002.06.12) */
62213+static void
62214+get_bitmap_blocknr(struct super_block *super, bmap_nr_t bmap,
62215+ reiser4_block_nr * bnr)
62216+{
62217+
62218+ assert("zam-390", bmap < get_nr_bmap(super));
62219+
62220+#ifdef CONFIG_REISER4_BADBLOCKS
62221+#define BITMAP_PLUGIN_DISKMAP_ID ((0xc0e1<<16) | (0xe0ff))
62222+ /* Check if the diskmap have this already, first. */
62223+ if (reiser4_get_diskmap_value(BITMAP_PLUGIN_DISKMAP_ID, bmap, bnr) == 0)
62224+ return; /* Found it in diskmap */
62225+#endif
62226+	/* FIXME_ZAM: before discussion of disk layouts and disk format
62227+	   plugins, I implement a bitmap location scheme which is close to the
62228+	   scheme used in reiser 3.6 */
62229+ if (bmap == 0) {
62230+ *bnr = REISER4_FIRST_BITMAP_BLOCK;
62231+ } else {
62232+ *bnr = bmap * bmap_bit_count(super->s_blocksize);
62233+ }
62234+}
62235+
62236+/* construct a fake block number for shadow bitmap (WORKING BITMAP) block */
62237+/* Audited by: green(2002.06.12) */
62238+static void get_working_bitmap_blocknr(bmap_nr_t bmap, reiser4_block_nr * bnr)
62239+{
62240+ *bnr =
62241+ (reiser4_block_nr) ((bmap & ~REISER4_BLOCKNR_STATUS_BIT_MASK) |
62242+ REISER4_BITMAP_BLOCKS_STATUS_VALUE);
62243+}
62244+
62245+/* bnode structure initialization */
62246+static void
62247+init_bnode(struct bitmap_node *bnode,
62248+ struct super_block *super UNUSED_ARG, bmap_nr_t bmap UNUSED_ARG)
62249+{
62250+ memset(bnode, 0, sizeof(struct bitmap_node));
62251+
62252+ sema_init(&bnode->sema, 1);
62253+ atomic_set(&bnode->loaded, 0);
62254+}
62255+
62256+static void release(jnode * node)
62257+{
62258+ jrelse(node);
62259+ JF_SET(node, JNODE_HEARD_BANSHEE);
62260+ jput(node);
62261+}
62262+
62263+/* This function is for internal bitmap.c use because it assumes that jnode is
62264+   under the full control of this thread */
62265+static void done_bnode(struct bitmap_node *bnode)
62266+{
62267+ if (bnode) {
62268+ atomic_set(&bnode->loaded, 0);
62269+ if (bnode->wjnode != NULL)
62270+ release(bnode->wjnode);
62271+ if (bnode->cjnode != NULL)
62272+ release(bnode->cjnode);
62273+ bnode->wjnode = bnode->cjnode = NULL;
62274+ }
62275+}
62276+
62277+/* ZAM-FIXME-HANS: comment this. Called only by load_and_lock_bnode()*/
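/* Allocate the pair of jnodes backing @bnode: read the on-disk COMMIT bitmap
 * block into *cjnode_ret and set up a fresh page for the WORKING copy in
 * *wjnode_ret. On failure both jnodes are put and the result pointers are
 * set to NULL. */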
62278+static int
62279+prepare_bnode(struct bitmap_node *bnode, jnode ** cjnode_ret,
62280+ jnode ** wjnode_ret)
62281+{
62282+ struct super_block *super;
62283+ jnode *cjnode;
62284+ jnode *wjnode;
62285+ bmap_nr_t bmap;
62286+ int ret;
62287+
62288+ super = reiser4_get_current_sb();
62289+
62290+ *wjnode_ret = wjnode = bnew();
62291+ if (wjnode == NULL) {
62292+ *cjnode_ret = NULL;
62293+ return RETERR(-ENOMEM);
62294+ }
62295+
62296+ *cjnode_ret = cjnode = bnew();
62297+ if (cjnode == NULL)
62298+ return RETERR(-ENOMEM);
62299+
62300+ bmap = bnode - get_bnode(super, 0);
62301+
62302+ get_working_bitmap_blocknr(bmap, &wjnode->blocknr);
62303+ get_bitmap_blocknr(super, bmap, &cjnode->blocknr);
62304+
62305+ jref(cjnode);
62306+ jref(wjnode);
62307+
62308+ /* load commit bitmap */
62309+ ret = jload_gfp(cjnode, GFP_NOFS, 1);
62310+
62311+ if (ret)
62312+ goto error;
62313+
62314+ /* allocate memory for working bitmap block. Note that for
62315+	 * bitmaps jinit_new() doesn't actually modify node content, so parallel
62316+ * so parallel calls to this are ok. */
62317+ ret = jinit_new(wjnode, GFP_NOFS);
62318+
62319+ if (ret != 0) {
62320+ jrelse(cjnode);
62321+ goto error;
62322+ }
62323+
62324+ return 0;
62325+
62326+ error:
62327+ jput(cjnode);
62328+ jput(wjnode);
62329+ *wjnode_ret = *cjnode_ret = NULL;
62330+ return ret;
62331+
62332+}
62333+
62334+/* Check the bnode data on read. */
62335+static int check_struct_bnode(struct bitmap_node *bnode, __u32 blksize)
62336+{
62337+ void *data;
62338+ int ret;
62339+
62340+ /* Check CRC */
62341+ ret = bnode_check_adler32(bnode, blksize);
62342+
62343+ if (ret) {
62344+ return ret;
62345+ }
62346+
62347+ data = jdata(bnode->cjnode) + CHECKSUM_SIZE;
62348+
62349+ /* Check the very first bit -- it must be busy. */
62350+ if (!reiser4_test_bit(0, data)) {
62351+ warning("vpf-1362", "The allocator block %llu is not marked "
62352+ "as used.", (unsigned long long)bnode->cjnode->blocknr);
62353+
62354+ return -EINVAL;
62355+ }
62356+
62357+ return 0;
62358+}
62359+
62360+/* load bitmap blocks "on-demand" */
62361+static int load_and_lock_bnode(struct bitmap_node *bnode)
62362+{
62363+ int ret;
62364+
62365+ jnode *cjnode;
62366+ jnode *wjnode;
62367+
62368+ assert("nikita-3040", schedulable());
62369+
62370+/* ZAM-FIXME-HANS: since bitmaps are never unloaded, this does not
62371+ * need to be atomic, right? Just leave a comment that if bitmaps were
62372+ * unloadable, this would need to be atomic. */
62373+ if (atomic_read(&bnode->loaded)) {
62374+ /* bitmap is already loaded, nothing to do */
62375+ check_bnode_loaded(bnode);
62376+ down(&bnode->sema);
62377+ assert("nikita-2827", atomic_read(&bnode->loaded));
62378+ return 0;
62379+ }
62380+
62381+ ret = prepare_bnode(bnode, &cjnode, &wjnode);
62382+ if (ret == 0) {
62383+ down(&bnode->sema);
62384+
62385+ if (!atomic_read(&bnode->loaded)) {
62386+ assert("nikita-2822", cjnode != NULL);
62387+ assert("nikita-2823", wjnode != NULL);
62388+ assert("nikita-2824", jnode_is_loaded(cjnode));
62389+ assert("nikita-2825", jnode_is_loaded(wjnode));
62390+
62391+ bnode->wjnode = wjnode;
62392+ bnode->cjnode = cjnode;
62393+
62394+ ret = check_struct_bnode(bnode, current_blocksize);
62395+ if (!ret) {
62396+ cjnode = wjnode = NULL;
62397+ atomic_set(&bnode->loaded, 1);
62398+ /* working bitmap is initialized by on-disk
62399+ * commit bitmap. This should be performed
62400+ * under semaphore. */
62401+ memcpy(bnode_working_data(bnode),
62402+ bnode_commit_data(bnode),
62403+ bmap_size(current_blocksize));
62404+ } else {
62405+ up(&bnode->sema);
62406+ }
62407+ } else
62408+ /* race: someone already loaded bitmap while we were
62409+ * busy initializing data. */
62410+ check_bnode_loaded(bnode);
62411+ }
62412+
62413+ if (wjnode != NULL) {
62414+ release(wjnode);
62415+ bnode->wjnode = NULL;
62416+ }
62417+ if (cjnode != NULL) {
62418+ release(cjnode);
62419+ bnode->cjnode = NULL;
62420+ }
62421+
62422+ return ret;
62423+}
62424+
62425+static void release_and_unlock_bnode(struct bitmap_node *bnode)
62426+{
62427+ check_bnode_loaded(bnode);
62428+ up(&bnode->sema);
62429+}
62430+
62431+/* This function does all block allocation work but only for one bitmap
62432+ block.*/
62433+/* FIXME_ZAM: It does not allow us to allocate block ranges across bitmap
62434+ block responsibility zone boundaries. This had no sense in v3.6 but may
62435+ have it in v4.x */
62436+/* ZAM-FIXME-HANS: do you mean search one bitmap block forward? */
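/* Returns the length of the extent actually allocated (between min_len and
 * max_len), 0 if no suitable extent was found in this bitmap block, or a
 * negative error code if loading the bitmap node failed; on success *offset
 * is updated to the start of the allocated extent. */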
62437+static int
62438+search_one_bitmap_forward(bmap_nr_t bmap, bmap_off_t * offset,
62439+ bmap_off_t max_offset, int min_len, int max_len)
62440+{
62441+ struct super_block *super = get_current_context()->super;
62442+ struct bitmap_node *bnode = get_bnode(super, bmap);
62443+
62444+ char *data;
62445+
62446+ bmap_off_t search_end;
62447+ bmap_off_t start;
62448+ bmap_off_t end;
62449+
62450+ int set_first_zero_bit = 0;
62451+
62452+ int ret;
62453+
62454+ assert("zam-364", min_len > 0);
62455+ assert("zam-365", max_len >= min_len);
62456+ assert("zam-366", *offset <= max_offset);
62457+
62458+ ret = load_and_lock_bnode(bnode);
62459+
62460+ if (ret)
62461+ return ret;
62462+
62463+ data = bnode_working_data(bnode);
62464+
62465+ start = *offset;
62466+
62467+ if (bnode->first_zero_bit >= start) {
62468+ start = bnode->first_zero_bit;
62469+ set_first_zero_bit = 1;
62470+ }
62471+
62472+ while (start + min_len < max_offset) {
62473+
62474+ start =
62475+ reiser4_find_next_zero_bit((long *)data, max_offset, start);
62476+ if (set_first_zero_bit) {
62477+ bnode->first_zero_bit = start;
62478+ set_first_zero_bit = 0;
62479+ }
62480+ if (start >= max_offset)
62481+ break;
62482+
62483+ search_end = LIMIT(start + max_len, max_offset);
62484+ end =
62485+ reiser4_find_next_set_bit((long *)data, search_end, start);
62486+ if (end >= start + min_len) {
62487+ /* we can't trust find_next_set_bit result if set bit
62488+			   was not found; the result may be bigger than
62489+ max_offset */
62490+ if (end > search_end)
62491+ end = search_end;
62492+
62493+ ret = end - start;
62494+ *offset = start;
62495+
62496+ reiser4_set_bits(data, start, end);
62497+
62498+ /* FIXME: we may advance first_zero_bit if [start,
62499+ end] region overlaps the first_zero_bit point */
62500+
62501+ break;
62502+ }
62503+
62504+ start = end + 1;
62505+ }
62506+
62507+ release_and_unlock_bnode(bnode);
62508+
62509+ return ret;
62510+}
62511+
62512+static int
62513+search_one_bitmap_backward(bmap_nr_t bmap, bmap_off_t * start_offset,
62514+ bmap_off_t end_offset, int min_len, int max_len)
62515+{
62516+ struct super_block *super = get_current_context()->super;
62517+ struct bitmap_node *bnode = get_bnode(super, bmap);
62518+ char *data;
62519+ bmap_off_t start;
62520+ int ret;
62521+
62522+ assert("zam-958", min_len > 0);
62523+ assert("zam-959", max_len >= min_len);
62524+ assert("zam-960", *start_offset >= end_offset);
62525+
62526+ ret = load_and_lock_bnode(bnode);
62527+ if (ret)
62528+ return ret;
62529+
62530+ data = bnode_working_data(bnode);
62531+ start = *start_offset;
62532+
62533+ while (1) {
62534+ bmap_off_t end, search_end;
62535+
62536+ /* Find the beginning of the zero filled region */
62537+ if (reiser4_find_last_zero_bit(&start, data, end_offset, start))
62538+ break;
62539+		/* Are there at least `min_len' bits from `start' down to
62540+		 * `end_offset'? */
62541+ if (start < end_offset + min_len - 1)
62542+ break;
62543+
62544+ /* Do not search to `end_offset' if we need to find less than
62545+ * `max_len' zero bits. */
62546+ if (end_offset + max_len - 1 < start)
62547+ search_end = start - max_len + 1;
62548+ else
62549+ search_end = end_offset;
62550+
62551+ if (reiser4_find_last_set_bit(&end, data, search_end, start))
62552+ end = search_end;
62553+ else
62554+ end++;
62555+
62556+ if (end + min_len <= start + 1) {
62557+ if (end < search_end)
62558+ end = search_end;
62559+ ret = start - end + 1;
62560+ *start_offset = end; /* `end' is lowest offset */
62561+ assert("zam-987",
62562+ reiser4_find_next_set_bit(data, start + 1,
62563+ end) >= start + 1);
62564+ reiser4_set_bits(data, end, start + 1);
62565+ break;
62566+ }
62567+
62568+ if (end <= end_offset)
62569+ /* left search boundary reached. */
62570+ break;
62571+ start = end - 1;
62572+ }
62573+
62574+ release_and_unlock_bnode(bnode);
62575+ return ret;
62576+}
62577+
62578+/* allocate contiguous range of blocks in bitmap */
62579+static int bitmap_alloc_forward(reiser4_block_nr * start,
62580+ const reiser4_block_nr * end, int min_len,
62581+ int max_len)
62582+{
62583+ bmap_nr_t bmap, end_bmap;
62584+ bmap_off_t offset, end_offset;
62585+ int len;
62586+
62587+ reiser4_block_nr tmp;
62588+
62589+ struct super_block *super = get_current_context()->super;
62590+ const bmap_off_t max_offset = bmap_bit_count(super->s_blocksize);
62591+
62592+ parse_blocknr(start, &bmap, &offset);
62593+
62594+ tmp = *end - 1;
62595+ parse_blocknr(&tmp, &end_bmap, &end_offset);
62596+ ++end_offset;
62597+
62598+ assert("zam-358", end_bmap >= bmap);
62599+ assert("zam-359", ergo(end_bmap == bmap, end_offset >= offset));
62600+
62601+ for (; bmap < end_bmap; bmap++, offset = 0) {
62602+ len =
62603+ search_one_bitmap_forward(bmap, &offset, max_offset,
62604+ min_len, max_len);
62605+ if (len != 0)
62606+ goto out;
62607+ }
62608+
62609+ len =
62610+ search_one_bitmap_forward(bmap, &offset, end_offset, min_len,
62611+ max_len);
62612+ out:
62613+ *start = bmap * max_offset + offset;
62614+ return len;
62615+}
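+/* The (bmap, offset) pair used above is just the block number split by
+   max_offset, the number of bits one bitmap block holds. A sketch of the
+   arithmetic behind parse_blocknr() and of the inverse used in the "out:"
+   path (assuming this split is all parse_blocknr() does):
+
+	bmap   = blk / max_offset;	// which bitmap block
+	offset = blk % max_offset;	// bit within that bitmap block
+	blk    = bmap * max_offset + offset;
+*/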
62616+
62617+/* allocate contiguous range of blocks in bitmap (from @start to @end in
62618+ * backward direction) */
62619+static int bitmap_alloc_backward(reiser4_block_nr * start,
62620+ const reiser4_block_nr * end, int min_len,
62621+ int max_len)
62622+{
62623+ bmap_nr_t bmap, end_bmap;
62624+ bmap_off_t offset, end_offset;
62625+ int len;
62626+ struct super_block *super = get_current_context()->super;
62627+ const bmap_off_t max_offset = bmap_bit_count(super->s_blocksize);
62628+
62629+ parse_blocknr(start, &bmap, &offset);
62630+ parse_blocknr(end, &end_bmap, &end_offset);
62631+
62632+ assert("zam-961", end_bmap <= bmap);
62633+ assert("zam-962", ergo(end_bmap == bmap, end_offset <= offset));
62634+
62635+ for (; bmap > end_bmap; bmap--, offset = max_offset - 1) {
62636+ len =
62637+ search_one_bitmap_backward(bmap, &offset, 0, min_len,
62638+ max_len);
62639+ if (len != 0)
62640+ goto out;
62641+ }
62642+
62643+ len =
62644+ search_one_bitmap_backward(bmap, &offset, end_offset, min_len,
62645+ max_len);
62646+ out:
62647+ *start = bmap * max_offset + offset;
62648+ return len;
62649+}
62650+
62651+/* plugin->u.space_allocator.alloc_blocks() */
62652+static int alloc_blocks_forward(reiser4_blocknr_hint *hint, int needed,
62653+ reiser4_block_nr *start, reiser4_block_nr *len)
62654+{
62655+ struct super_block *super = get_current_context()->super;
62656+ int actual_len;
62657+
62658+ reiser4_block_nr search_start;
62659+ reiser4_block_nr search_end;
62660+
62661+ assert("zam-398", super != NULL);
62662+ assert("zam-412", hint != NULL);
62663+ assert("zam-397", hint->blk <= reiser4_block_count(super));
62664+
62665+ if (hint->max_dist == 0)
62666+ search_end = reiser4_block_count(super);
62667+ else
62668+ search_end =
62669+ LIMIT(hint->blk + hint->max_dist,
62670+ reiser4_block_count(super));
62671+
62672+	/* We use @hint->blk as the search start and search from it to the end
62673+	   of the disk, or within the given region if @hint->max_dist is not zero */
62674+ search_start = hint->blk;
62675+
62676+ actual_len =
62677+ bitmap_alloc_forward(&search_start, &search_end, 1, needed);
62678+
62679+	/* Only one bitmap search is done if max_dist was specified or if the
62680+	   first pass already started from the beginning of the bitmap. We
62681+	   also do only one pass when scanning the bitmap backward. */
62682+ if (!(actual_len != 0 || hint->max_dist != 0 || search_start == 0)) {
62683+ /* next step is a scanning from 0 to search_start */
62684+ search_end = search_start;
62685+ search_start = 0;
62686+ actual_len =
62687+ bitmap_alloc_forward(&search_start, &search_end, 1, needed);
62688+ }
62689+ if (actual_len == 0)
62690+ return RETERR(-ENOSPC);
62691+ if (actual_len < 0)
62692+ return RETERR(actual_len);
62693+ *len = actual_len;
62694+ *start = search_start;
62695+ return 0;
62696+}
62697+
62698+static int alloc_blocks_backward(reiser4_blocknr_hint * hint, int needed,
62699+ reiser4_block_nr * start,
62700+ reiser4_block_nr * len)
62701+{
62702+ reiser4_block_nr search_start;
62703+ reiser4_block_nr search_end;
62704+ int actual_len;
62705+
62706+ ON_DEBUG(struct super_block *super = reiser4_get_current_sb());
62707+
62708+ assert("zam-969", super != NULL);
62709+ assert("zam-970", hint != NULL);
62710+ assert("zam-971", hint->blk <= reiser4_block_count(super));
62711+
62712+ search_start = hint->blk;
62713+ if (hint->max_dist == 0 || search_start <= hint->max_dist)
62714+ search_end = 0;
62715+ else
62716+ search_end = search_start - hint->max_dist;
62717+
62718+ actual_len =
62719+ bitmap_alloc_backward(&search_start, &search_end, 1, needed);
62720+ if (actual_len == 0)
62721+ return RETERR(-ENOSPC);
62722+ if (actual_len < 0)
62723+ return RETERR(actual_len);
62724+ *len = actual_len;
62725+ *start = search_start;
62726+ return 0;
62727+}
62728+
62729+/* plugin->u.space_allocator.alloc_blocks() */
62730+int
62731+alloc_blocks_bitmap(reiser4_space_allocator * allocator UNUSED_ARG,
62732+ reiser4_blocknr_hint * hint, int needed,
62733+ reiser4_block_nr * start, reiser4_block_nr * len)
62734+{
62735+ if (hint->backward)
62736+ return alloc_blocks_backward(hint, needed, start, len);
62737+ return alloc_blocks_forward(hint, needed, start, len);
62738+}
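+/* Usage sketch (hypothetical caller, not part of this patch; the hint is
+   simply zeroed here, real callers may use a dedicated init helper): fewer
+   than `needed' blocks may come back, so callers repeat until the request is
+   satisfied. */
+#if 0
+static int example_alloc(reiser4_space_allocator *allocator,
+			 reiser4_block_nr preceder, int needed)
+{
+	reiser4_blocknr_hint hint;
+	reiser4_block_nr start, len;
+
+	memset(&hint, 0, sizeof(hint));
+	hint.blk = preceder;	/* start searching near the preceder */
+	hint.max_dist = 0;	/* 0 means "up to the end of the device" */
+	hint.backward = 0;	/* forward search */
+	return alloc_blocks_bitmap(allocator, &hint, needed, &start, &len);
+}
+#endif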
62739+
62740+/* plugin->u.space_allocator.dealloc_blocks(). */
62741+/* It just frees blocks in the WORKING BITMAP. Usually deletion of formatted
62742+   and unformatted nodes is deferred until transaction commit. However,
62743+   deallocation of temporary objects like wandered blocks and transaction
62744+   commit records requires immediate node deletion from the WORKING BITMAP. */
62745+void
62746+dealloc_blocks_bitmap(reiser4_space_allocator * allocator UNUSED_ARG,
62747+ reiser4_block_nr start, reiser4_block_nr len)
62748+{
62749+ struct super_block *super = reiser4_get_current_sb();
62750+
62751+ bmap_nr_t bmap;
62752+ bmap_off_t offset;
62753+
62754+ struct bitmap_node *bnode;
62755+ int ret;
62756+
62757+ assert("zam-468", len != 0);
62758+ check_block_range(&start, &len);
62759+
62760+ parse_blocknr(&start, &bmap, &offset);
62761+
62762+ assert("zam-469", offset + len <= bmap_bit_count(super->s_blocksize));
62763+
62764+ bnode = get_bnode(super, bmap);
62765+
62766+ assert("zam-470", bnode != NULL);
62767+
62768+ ret = load_and_lock_bnode(bnode);
62769+ assert("zam-481", ret == 0);
62770+
62771+ reiser4_clear_bits(bnode_working_data(bnode), offset,
62772+ (bmap_off_t) (offset + len));
62773+
62774+ adjust_first_zero_bit(bnode, offset);
62775+
62776+ release_and_unlock_bnode(bnode);
62777+}
62778+
62779+/* plugin->u.space_allocator.check_blocks(). */
62780+void
62781+check_blocks_bitmap(const reiser4_block_nr * start,
62782+ const reiser4_block_nr * len, int desired)
62783+{
62784+#if REISER4_DEBUG
62785+ struct super_block *super = reiser4_get_current_sb();
62786+
62787+ bmap_nr_t bmap;
62788+ bmap_off_t start_offset;
62789+ bmap_off_t end_offset;
62790+
62791+ struct bitmap_node *bnode;
62792+ int ret;
62793+
62794+ assert("zam-622", len != NULL);
62795+ check_block_range(start, len);
62796+ parse_blocknr(start, &bmap, &start_offset);
62797+
62798+ end_offset = start_offset + *len;
62799+ assert("nikita-2214", end_offset <= bmap_bit_count(super->s_blocksize));
62800+
62801+ bnode = get_bnode(super, bmap);
62802+
62803+ assert("nikita-2215", bnode != NULL);
62804+
62805+ ret = load_and_lock_bnode(bnode);
62806+ assert("zam-626", ret == 0);
62807+
62808+ assert("nikita-2216", jnode_is_loaded(bnode->wjnode));
62809+
62810+ if (desired) {
62811+ assert("zam-623",
62812+ reiser4_find_next_zero_bit(bnode_working_data(bnode),
62813+ end_offset, start_offset)
62814+ >= end_offset);
62815+ } else {
62816+ assert("zam-624",
62817+ reiser4_find_next_set_bit(bnode_working_data(bnode),
62818+ end_offset, start_offset)
62819+ >= end_offset);
62820+ }
62821+
62822+ release_and_unlock_bnode(bnode);
62823+#endif
62824+}
62825+
62826+/* insert @node into the atom's overwrite set, unless it is already there */
62827+static void cond_add_to_overwrite_set(txn_atom * atom, jnode * node)
62828+{
62829+ assert("zam-546", atom != NULL);
62830+ assert("zam-547", atom->stage == ASTAGE_PRE_COMMIT);
62831+ assert("zam-548", node != NULL);
62832+
62833+ spin_lock_atom(atom);
62834+ spin_lock_jnode(node);
62835+
62836+ if (node->atom == NULL) {
62837+ JF_SET(node, JNODE_OVRWR);
62838+ insert_into_atom_ovrwr_list(atom, node);
62839+ } else {
62840+ assert("zam-549", node->atom == atom);
62841+ }
62842+
62843+ spin_unlock_jnode(node);
62844+ spin_unlock_atom(atom);
62845+}
62846+
62847+/* an actor which applies the delete set to COMMIT bitmap pages and links
62848+   modified pages into a singly-linked list */
62849+static int
62850+apply_dset_to_commit_bmap(txn_atom * atom, const reiser4_block_nr * start,
62851+ const reiser4_block_nr * len, void *data)
62852+{
62853+
62854+ bmap_nr_t bmap;
62855+ bmap_off_t offset;
62856+ int ret;
62857+
62858+ long long *blocks_freed_p = data;
62859+
62860+ struct bitmap_node *bnode;
62861+
62862+ struct super_block *sb = reiser4_get_current_sb();
62863+
62864+ check_block_range(start, len);
62865+
62866+ parse_blocknr(start, &bmap, &offset);
62867+
62868+	/* FIXME-ZAM: we assume that all block ranges are allocated by this
62869+	   bitmap-based allocator and that no block range crosses the zone of
62870+	   responsibility of one bitmap block; the same assumption is used in
62871+	   other journal hooks in the bitmap code. */
62872+ bnode = get_bnode(sb, bmap);
62873+ assert("zam-448", bnode != NULL);
62874+
62875+	/* it is safe to unlock an atom which is in ASTAGE_PRE_COMMIT */
62876+ assert("zam-767", atom->stage == ASTAGE_PRE_COMMIT);
62877+ ret = load_and_lock_bnode(bnode);
62878+ if (ret)
62879+ return ret;
62880+
62881+ /* put bnode into atom's overwrite set */
62882+ cond_add_to_overwrite_set(atom, bnode->cjnode);
62883+
62884+ data = bnode_commit_data(bnode);
62885+
62886+ ret = bnode_check_crc(bnode);
62887+ if (ret != 0)
62888+ return ret;
62889+
62890+ if (len != NULL) {
62891+ /* FIXME-ZAM: a check that all bits are set should be there */
62892+ assert("zam-443",
62893+ offset + *len <= bmap_bit_count(sb->s_blocksize));
62894+ reiser4_clear_bits(data, offset, (bmap_off_t) (offset + *len));
62895+
62896+ (*blocks_freed_p) += *len;
62897+ } else {
62898+ reiser4_clear_bit(offset, data);
62899+ (*blocks_freed_p)++;
62900+ }
62901+
62902+ bnode_set_commit_crc(bnode, bnode_calc_crc(bnode, sb->s_blocksize));
62903+
62904+ release_and_unlock_bnode(bnode);
62905+
62906+ return 0;
62907+}
62908+
62909+/* plugin->u.space_allocator.pre_commit_hook(). */
62910+/* It just applies transaction changes to fs-wide COMMIT BITMAP, hoping the
62911+ rest is done by transaction manager (allocate wandered locations for COMMIT
62912+ BITMAP blocks, copy COMMIT BITMAP blocks data). */
62913+/* Only one instance of this function can be running at any given time, because
62914+   only one transaction can be committed at a time; therefore it is safe to
62915+   access some global variables without any locking */
62916+
62917+int pre_commit_hook_bitmap(void)
62918+{
62919+ struct super_block *super = reiser4_get_current_sb();
62920+ txn_atom *atom;
62921+
62922+ long long blocks_freed = 0;
62923+
62924+ atom = get_current_atom_locked();
62925+ assert("zam-876", atom->stage == ASTAGE_PRE_COMMIT);
62926+ spin_unlock_atom(atom);
62927+
62928+	{ /* scan atom's captured list and find all freshly allocated nodes,
62929+	   * mark the corresponding bits in the COMMIT BITMAP as used */
62930+ struct list_head *head = ATOM_CLEAN_LIST(atom);
62931+ jnode *node = list_entry(head->next, jnode, capture_link);
62932+
62933+ while (head != &node->capture_link) {
62934+ /* we detect freshly allocated jnodes */
62935+ if (JF_ISSET(node, JNODE_RELOC)) {
62936+ int ret;
62937+ bmap_nr_t bmap;
62938+
62939+ bmap_off_t offset;
62940+ bmap_off_t index;
62941+ struct bitmap_node *bn;
62942+ __u32 size = bmap_size(super->s_blocksize);
62943+ __u32 crc;
62944+ char byte;
62945+
62946+ assert("zam-559", !JF_ISSET(node, JNODE_OVRWR));
62947+ assert("zam-460",
62948+ !blocknr_is_fake(&node->blocknr));
62949+
62950+ parse_blocknr(&node->blocknr, &bmap, &offset);
62951+ bn = get_bnode(super, bmap);
62952+
62953+ index = offset >> 3;
62954+ assert("vpf-276", index < size);
62955+
62956+				ret = bnode_check_crc(bn);
62957+ if (ret != 0)
62958+ return ret;
62959+
62960+ check_bnode_loaded(bn);
62961+ load_and_lock_bnode(bn);
62962+
62963+ byte = *(bnode_commit_data(bn) + index);
62964+ reiser4_set_bit(offset, bnode_commit_data(bn));
62965+
62966+				crc = adler32_recalc(bnode_commit_crc(bn), byte,
62967+						     *(bnode_commit_data(bn) +
62968+						       index),
62969+						     size - index);
62970+				bnode_set_commit_crc(bn, crc);
62971+
62972+ release_and_unlock_bnode(bn);
62973+
62974+ ret = bnode_check_crc(bn);
62975+ if (ret != 0)
62976+ return ret;
62977+
62978+				/* the correctness of this depends on how the new
62979+				   jnode is inserted into the clean list, because we
62980+				   are scanning that same list now. It is OK if
62981+				   insertion is done at the list front */
62982+ cond_add_to_overwrite_set(atom, bn->cjnode);
62983+ }
62984+
62985+ node = list_entry(node->capture_link.next, jnode, capture_link);
62986+ }
62987+ }
62988+
62989+ blocknr_set_iterator(atom, &atom->delete_set, apply_dset_to_commit_bmap,
62990+ &blocks_freed, 0);
62991+
62992+ blocks_freed -= atom->nr_blocks_allocated;
62993+
62994+ {
62995+ reiser4_super_info_data *sbinfo;
62996+
62997+ sbinfo = get_super_private(super);
62998+
62999+ spin_lock_reiser4_super(sbinfo);
63000+ sbinfo->blocks_free_committed += blocks_freed;
63001+ spin_unlock_reiser4_super(sbinfo);
63002+ }
63003+
63004+ return 0;
63005+}
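+/* Why a single-byte change can be folded into the checksum above without a
+   full rescan (a sketch, assuming the usual Adler-32 definition with modulus
+   65521): for data d[0..n-1],
+
+	s1 = 1 + d[0] + d[1] + ... + d[n-1]              (mod 65521)
+	s2 = n + n*d[0] + (n-1)*d[1] + ... + 1*d[n-1]    (mod 65521)
+
+   so d[i] occurs once in s1 and (n - i) times in s2. When d[i] changes from
+   old to new, adding (new - old) to s1 and (n - i) * (new - old) to s2 is
+   enough -- which is why adler32_recalc() above takes the old byte, the new
+   byte and the tail length `size - index'. */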
63006+
63007+/* plugin->u.space_allocator.init_allocator
63008+ constructor of reiser4_space_allocator object. It is called on fs mount */
63009+int
63010+init_allocator_bitmap(reiser4_space_allocator * allocator,
63011+ struct super_block *super, void *arg UNUSED_ARG)
63012+{
63013+ struct bitmap_allocator_data *data = NULL;
63014+ bmap_nr_t bitmap_blocks_nr;
63015+ bmap_nr_t i;
63016+
63017+ assert("nikita-3039", schedulable());
63018+
63019+ /* getting memory for bitmap allocator private data holder */
63020+ data =
63021+ kmalloc(sizeof(struct bitmap_allocator_data), GFP_KERNEL);
63022+
63023+ if (data == NULL)
63024+ return RETERR(-ENOMEM);
63025+
63026+ /* allocation and initialization for the array of bnodes */
63027+ bitmap_blocks_nr = get_nr_bmap(super);
63028+
63029+ /* FIXME-ZAM: it is not clear what to do with huge number of bitmaps
63030+ which is bigger than 2^32 (= 8 * 4096 * 4096 * 2^32 bytes = 5.76e+17,
63031+ may I never meet someone who still uses the ia32 architecture when
63032+ storage devices of that size enter the market, and wants to use ia32
63033+	   with that storage device, much less reiser4. ;-) -Hans). kmalloc is not possible and,
63034+ probably, another dynamic data structure should replace a static
63035+ array of bnodes. */
63036+ /*data->bitmap = reiser4_kmalloc((size_t) (sizeof (struct bitmap_node) * bitmap_blocks_nr), GFP_KERNEL); */
63037+ data->bitmap = vmalloc(sizeof(struct bitmap_node) * bitmap_blocks_nr);
63038+ if (data->bitmap == NULL) {
63039+ kfree(data);
63040+ return RETERR(-ENOMEM);
63041+ }
63042+
63043+ for (i = 0; i < bitmap_blocks_nr; i++)
63044+ init_bnode(data->bitmap + i, super, i);
63045+
63046+ allocator->u.generic = data;
63047+
63048+#if REISER4_DEBUG
63049+ get_super_private(super)->min_blocks_used += bitmap_blocks_nr;
63050+#endif
63051+
63052+ /* Load all bitmap blocks at mount time. */
63053+ if (!test_bit
63054+ (REISER4_DONT_LOAD_BITMAP, &get_super_private(super)->fs_flags)) {
63055+ __u64 start_time, elapsed_time;
63056+ struct bitmap_node *bnode;
63057+ int ret;
63058+
63059+ if (REISER4_DEBUG)
63060+ printk(KERN_INFO "loading reiser4 bitmap...");
63061+ start_time = jiffies;
63062+
63063+ for (i = 0; i < bitmap_blocks_nr; i++) {
63064+ bnode = data->bitmap + i;
63065+ ret = load_and_lock_bnode(bnode);
63066+ if (ret) {
63067+ destroy_allocator_bitmap(allocator, super);
63068+ return ret;
63069+ }
63070+ release_and_unlock_bnode(bnode);
63071+ }
63072+
63073+ elapsed_time = jiffies - start_time;
63074+ if (REISER4_DEBUG)
63075+ printk("...done (%llu jiffies)\n",
63076+ (unsigned long long)elapsed_time);
63077+ }
63078+
63079+ return 0;
63080+}
63081+
63082+/* plugin->u.space_allocator.destroy_allocator
63083+ destructor. It is called on fs unmount */
63084+int
63085+destroy_allocator_bitmap(reiser4_space_allocator * allocator,
63086+ struct super_block *super)
63087+{
63088+ bmap_nr_t bitmap_blocks_nr;
63089+ bmap_nr_t i;
63090+
63091+ struct bitmap_allocator_data *data = allocator->u.generic;
63092+
63093+ assert("zam-414", data != NULL);
63094+ assert("zam-376", data->bitmap != NULL);
63095+
63096+ bitmap_blocks_nr = get_nr_bmap(super);
63097+
63098+ for (i = 0; i < bitmap_blocks_nr; i++) {
63099+ struct bitmap_node *bnode = data->bitmap + i;
63100+
63101+ down(&bnode->sema);
63102+
63103+#if REISER4_DEBUG
63104+ if (atomic_read(&bnode->loaded)) {
63105+ jnode *wj = bnode->wjnode;
63106+ jnode *cj = bnode->cjnode;
63107+
63108+ assert("zam-480", jnode_page(cj) != NULL);
63109+ assert("zam-633", jnode_page(wj) != NULL);
63110+
63111+			assert("zam-634",
63112+			       memcmp(jdata(wj), jdata(cj),
63113+				      bmap_size(super->s_blocksize)) == 0);
63114+
63115+ }
63116+#endif
63117+ done_bnode(bnode);
63118+ up(&bnode->sema);
63119+ }
63120+
63121+ vfree(data->bitmap);
63122+ kfree(data);
63123+
63124+ allocator->u.generic = NULL;
63125+
63126+ return 0;
63127+}
63128+
63129+/*
63130+ Local variables:
63131+ c-indentation-style: "K&R"
63132+ mode-name: "LC"
63133+ c-basic-offset: 8
63134+ tab-width: 8
63135+ fill-column: 80
63136+ scroll-step: 1
63137+ End:
63138+*/
63139Index: linux-2.6.16/fs/reiser4/plugin/space/bitmap.h
63140===================================================================
63141--- /dev/null
63142+++ linux-2.6.16/fs/reiser4/plugin/space/bitmap.h
63143@@ -0,0 +1,47 @@
63144+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
63145+
63146+#if !defined (__REISER4_PLUGIN_SPACE_BITMAP_H__)
63147+#define __REISER4_PLUGIN_SPACE_BITMAP_H__
63148+
63149+#include "../../dformat.h"
63150+#include "../../block_alloc.h"
63151+
63152+#include <linux/types.h> /* for __u?? */
63153+#include <linux/fs.h> /* for struct super_block */
63154+/* EDWARD-FIXME-HANS: write something as informative as the below for every .h file lacking it. */
63155+/* declarations of functions implementing methods of space allocator plugin for
63156+ bitmap based allocator. The functions themselves are in bitmap.c */
63157+extern int init_allocator_bitmap(reiser4_space_allocator *,
63158+ struct super_block *, void *);
63159+extern int destroy_allocator_bitmap(reiser4_space_allocator *,
63160+ struct super_block *);
63161+extern int alloc_blocks_bitmap(reiser4_space_allocator *,
63162+ reiser4_blocknr_hint *, int needed,
63163+ reiser4_block_nr * start,
63164+ reiser4_block_nr * len);
63165+extern void check_blocks_bitmap(const reiser4_block_nr *,
63166+ const reiser4_block_nr *, int);
63167+
63168+extern void dealloc_blocks_bitmap(reiser4_space_allocator *, reiser4_block_nr,
63169+ reiser4_block_nr);
63170+extern int pre_commit_hook_bitmap(void);
63171+
63172+#define post_commit_hook_bitmap() do{}while(0)
63173+#define post_write_back_hook_bitmap() do{}while(0)
63174+#define print_info_bitmap(pref, al) do{}while(0)
63175+
63176+typedef __u64 bmap_nr_t;
63177+typedef __u32 bmap_off_t;
63178+
63179+#endif /* __REISER4_PLUGIN_SPACE_BITMAP_H__ */
63180+
63181+/* Make Linus happy.
63182+ Local variables:
63183+ c-indentation-style: "K&R"
63184+ mode-name: "LC"
63185+ c-basic-offset: 8
63186+ tab-width: 8
63187+ fill-column: 120
63188+ scroll-step: 1
63189+ End:
63190+*/
63191Index: linux-2.6.16/fs/reiser4/plugin/space/space_allocator.h
63192===================================================================
63193--- /dev/null
63194+++ linux-2.6.16/fs/reiser4/plugin/space/space_allocator.h
63195@@ -0,0 +1,80 @@
63196+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
63197+
63198+#ifndef __SPACE_ALLOCATOR_H__
63199+#define __SPACE_ALLOCATOR_H__
63200+
63201+#include "../../forward.h"
63202+#include "bitmap.h"
63203+/* NIKITA-FIXME-HANS: surely this could use a comment. Something about how bitmap is the only space allocator for now,
63204+ * but... */
63205+#define DEF_SPACE_ALLOCATOR(allocator) \
63206+ \
63207+static inline int sa_init_allocator (reiser4_space_allocator * al, struct super_block *s, void * opaque) \
63208+{ \
63209+ return init_allocator_##allocator (al, s, opaque); \
63210+} \
63211+ \
63212+static inline void sa_destroy_allocator (reiser4_space_allocator *al, struct super_block *s) \
63213+{ \
63214+ destroy_allocator_##allocator (al, s); \
63215+} \
63216+ \
63217+static inline int sa_alloc_blocks (reiser4_space_allocator *al, reiser4_blocknr_hint * hint, \
63218+ int needed, reiser4_block_nr * start, reiser4_block_nr * len) \
63219+{ \
63220+ return alloc_blocks_##allocator (al, hint, needed, start, len); \
63221+} \
63222+static inline void sa_dealloc_blocks (reiser4_space_allocator * al, reiser4_block_nr start, reiser4_block_nr len) \
63223+{ \
63224+ dealloc_blocks_##allocator (al, start, len); \
63225+} \
63226+ \
63227+static inline void sa_check_blocks (const reiser4_block_nr * start, const reiser4_block_nr * end, int desired) \
63228+{ \
63229+ check_blocks_##allocator (start, end, desired); \
63230+} \
63231+ \
63232+static inline void sa_pre_commit_hook (void) \
63233+{ \
63234+ pre_commit_hook_##allocator (); \
63235+} \
63236+ \
63237+static inline void sa_post_commit_hook (void) \
63238+{ \
63239+ post_commit_hook_##allocator (); \
63240+} \
63241+ \
63242+static inline void sa_post_write_back_hook (void) \
63243+{ \
63244+ post_write_back_hook_##allocator(); \
63245+} \
63246+ \
63247+static inline void sa_print_info(const char * prefix, reiser4_space_allocator * al) \
63248+{ \
63249+ print_info_##allocator (prefix, al); \
63250+}
63251+
63252+DEF_SPACE_ALLOCATOR(bitmap)
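+/* With the single DEF_SPACE_ALLOCATOR(bitmap) instantiation above, each sa_*
+   wrapper is bound to its bitmap counterpart at compile time; for example
+   sa_alloc_blocks(al, hint, n, start, len) expands to nothing more than a
+   call to alloc_blocks_bitmap(al, hint, n, start, len). */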
63253+
63254+/* this object is part of reiser4 private in-core super block */
63255+struct reiser4_space_allocator {
63256+ union {
63257+ /* space allocators might use this pointer to reference their
63258+ * data. */
63259+ void *generic;
63260+ } u;
63261+};
63262+
63263+/* __SPACE_ALLOCATOR_H__ */
63264+#endif
63265+
63266+/* Make Linus happy.
63267+ Local variables:
63268+ c-indentation-style: "K&R"
63269+ mode-name: "LC"
63270+ c-basic-offset: 8
63271+ tab-width: 8
63272+ fill-column: 120
63273+ scroll-step: 1
63274+ End:
63275+*/
63276Index: linux-2.6.16/fs/reiser4/plugin/tail_policy.c
63277===================================================================
63278--- /dev/null
63279+++ linux-2.6.16/fs/reiser4/plugin/tail_policy.c
63280@@ -0,0 +1,113 @@
63281+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
63282+ * reiser4/README */
63283+
63284+/* Formatting policy plugins */
63285+
63286+/*
63287+ * The formatting policy plugin is used by the object plugin (of a regular
63288+ * file) to convert a file between its two representations.
63289+ *
63290+ * Currently the following policies are implemented:
63291+ * never store file in formatted nodes
63292+ * always store file in formatted nodes
63293+ * store file in formatted nodes if file is smaller than 4 blocks (default)
63294+ */
63295+
63296+#include "../tree.h"
63297+#include "../inode.h"
63298+#include "../super.h"
63299+#include "object.h"
63300+#include "plugin.h"
63301+#include "node/node.h"
63302+#include "plugin_header.h"
63303+
63304+#include <linux/pagemap.h>
63305+#include <linux/fs.h> /* For struct inode */
63306+
63307+/**
63308+ * have_formatting_never - formatting policy that never tails
63309+ * @inode: inode to operate on
63310+ * @size: new object size
63311+ *
63312+ * Never store the file's tail as a direct item.
63313+ */
63315+/* Audited by: green(2002.06.12) */
63316+static int have_formatting_never(const struct inode *inode UNUSED_ARG
63317+ /* inode to operate on */ ,
63318+ loff_t size UNUSED_ARG /* new object size */ )
63319+{
63320+ return 0;
63321+}
63322+
63323+/* Always store file's tail as direct item */
63324+/* Audited by: green(2002.06.12) */
63325+static int
63326+have_formatting_always(const struct inode *inode UNUSED_ARG
63327+ /* inode to operate on */ ,
63328+ loff_t size UNUSED_ARG /* new object size */ )
63329+{
63330+ return 1;
63331+}
63332+
63333+/* This function tests whether the file denoted by @inode should be stored
63334+   as tails only or as extents only. */
63335+static int
63336+have_formatting_default(const struct inode *inode UNUSED_ARG
63337+ /* inode to operate on */ ,
63338+ loff_t size /* new object size */ )
63339+{
63340+ assert("umka-1253", inode != NULL);
63341+
63342+ if (size > inode->i_sb->s_blocksize * 4)
63343+ return 0;
63344+
63345+ return 1;
63346+}
63347+
63348+/* tail plugins */
63349+formatting_plugin formatting_plugins[LAST_TAIL_FORMATTING_ID] = {
63350+ [NEVER_TAILS_FORMATTING_ID] = {
63351+ .h = {
63352+ .type_id = REISER4_FORMATTING_PLUGIN_TYPE,
63353+ .id = NEVER_TAILS_FORMATTING_ID,
63354+ .pops = NULL,
63355+ .label = "never",
63356+ .desc = "Never store file's tail",
63357+ .linkage = {NULL, NULL}
63358+ },
63359+ .have_tail = have_formatting_never
63360+ },
63361+ [ALWAYS_TAILS_FORMATTING_ID] = {
63362+ .h = {
63363+ .type_id = REISER4_FORMATTING_PLUGIN_TYPE,
63364+ .id = ALWAYS_TAILS_FORMATTING_ID,
63365+ .pops = NULL,
63366+ .label = "always",
63367+ .desc = "Always store file's tail",
63368+ .linkage = {NULL, NULL}
63369+ },
63370+ .have_tail = have_formatting_always
63371+ },
63372+ [SMALL_FILE_FORMATTING_ID] = {
63373+ .h = {
63374+ .type_id = REISER4_FORMATTING_PLUGIN_TYPE,
63375+ .id = SMALL_FILE_FORMATTING_ID,
63376+ .pops = NULL,
63377+ .label = "4blocks",
63378+ .desc = "store files shorter than 4 blocks in tail items",
63379+ .linkage = {NULL, NULL}
63380+ },
63381+ .have_tail = have_formatting_default
63382+ }
63383+};
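+/* Choosing a policy is then one indirect call through the table above
+   (sketch; how the file plugin actually obtains the inode's formatting
+   plugin is outside this file). Note that with 4096-byte blocks the default
+   policy keeps files of up to 16384 bytes in tail items. */
+#if 0
+static int example_should_tail(const struct inode *inode, loff_t new_size)
+{
+	formatting_plugin *tplug =
+	    &formatting_plugins[SMALL_FILE_FORMATTING_ID];
+
+	return tplug->have_tail(inode, new_size);
+}
+#endif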
63384+
63385+/*
63386+ * Local variables:
63387+ * c-indentation-style: "K&R"
63388+ * mode-name: "LC"
63389+ * c-basic-offset: 8
63390+ * tab-width: 8
63391+ * fill-column: 79
63392+ * End:
63393+ */
63394Index: linux-2.6.16/fs/reiser4/pool.c
63395===================================================================
63396--- /dev/null
63397+++ linux-2.6.16/fs/reiser4/pool.c
63398@@ -0,0 +1,236 @@
63399+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
63400+ * reiser4/README */
63401+
63402+/* Fast pool allocation.
63403+
63404+   There are situations when some sub-system normally asks the memory
63405+   allocator for only a few objects, but under some circumstances could
63406+   require many more. The typical and actually motivating example is tree
63407+   balancing. It needs to keep track of the nodes that were involved in it,
63408+   and it is well known that in a reasonably packed balanced tree most
63409+   (92.938121%) balancings end up after working with only a few nodes
63410+   (3.141592 on average). But in rare cases balancing can involve many more
63411+   nodes (3*tree_height+1 in the extreme situation).
63412+
63413+   On the one hand, we don't want to resort to dynamic allocation (slab,
63414+   malloc(), etc.) to allocate the data structures required to keep track of
63415+   nodes during balancing. On the other hand, we cannot statically allocate
63416+   the required amount of space on the stack, because first: it is a useless
63417+   waste of a precious resource, and second: this amount is unknown in
63418+   advance (tree height can change).
63419+
63420+ Pools, implemented in this file are solution for this problem:
63421+
63422+ - some configurable amount of objects is statically preallocated on the
63423+ stack
63424+
63425+   - if this preallocated pool is exhausted and more objects are requested,
63426+   they are allocated dynamically.
63427+
63428+ Pools encapsulate distinction between statically and dynamically allocated
63429+ objects. Both allocation and recycling look exactly the same.
63430+
63431+ To keep track of dynamically allocated objects, pool adds its own linkage
63432+ to each object.
63433+
63434+ NOTE-NIKITA This linkage also contains some balancing-specific data. This
63435+ is not perfect. On the other hand, balancing is currently the only client
63436+ of pool code.
63437+
63438+ NOTE-NIKITA Another desirable feature is to rewrite all pool manipulation
63439+ functions in the style of tslist/tshash, i.e., make them unreadable, but
63440+ type-safe.
63441+
63442+
63443+*/
63444+
63445+#include "debug.h"
63446+#include "pool.h"
63447+#include "super.h"
63448+
63449+#include <linux/types.h>
63450+#include <linux/err.h>
63451+
63452+/* initialize new pool object */
63453+static void reiser4_init_pool_obj(reiser4_pool_header * h /* pool object to
63454+ * initialize */ )
63455+{
63456+ INIT_LIST_HEAD(&h->usage_linkage);
63457+ INIT_LIST_HEAD(&h->level_linkage);
63458+ INIT_LIST_HEAD(&h->extra_linkage);
63459+}
63460+
63461+/* initialize new pool */
63462+void reiser4_init_pool(reiser4_pool * pool /* pool to initialize */ ,
63463+ size_t obj_size /* size of objects in @pool */ ,
63464+ int num_of_objs /* number of preallocated objects */ ,
63465+ char *data /* area for preallocated objects */ )
63466+{
63467+ reiser4_pool_header *h;
63468+ int i;
63469+
63470+ assert("nikita-955", pool != NULL);
63471+ assert("nikita-1044", obj_size > 0);
63472+ assert("nikita-956", num_of_objs >= 0);
63473+ assert("nikita-957", data != NULL);
63474+
63475+ memset(pool, 0, sizeof *pool);
63476+ pool->obj_size = obj_size;
63477+ pool->data = data;
63478+ INIT_LIST_HEAD(&pool->free);
63479+ INIT_LIST_HEAD(&pool->used);
63480+ INIT_LIST_HEAD(&pool->extra);
63481+ memset(data, 0, obj_size * num_of_objs);
63482+ for (i = 0; i < num_of_objs; ++i) {
63483+ h = (reiser4_pool_header *) (data + i * obj_size);
63484+ reiser4_init_pool_obj(h);
63485+ /* add pool header to the end of pool's free list */
63486+ list_add_tail(&h->usage_linkage, &pool->free);
63487+ }
63488+}
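+/* Usage sketch (hypothetical, not from this patch): the intended pattern is
+   an on-stack area of num_of_objs preallocated objects whose first member is
+   a reiser4_pool_header, with overflow transparently served by kmalloc(): */
+#if 0
+struct toy_obj {
+	reiser4_pool_header header;	/* must be the first member */
+	int payload;
+};
+
+static void toy_pool_user(void)
+{
+	reiser4_pool pool;
+	char area[sizeof(struct toy_obj) * 10];	/* 10 preallocated objects */
+
+	reiser4_init_pool(&pool, sizeof(struct toy_obj), 10, area);
+	/* allocations beyond the 10 preallocated objects fall back to
+	   kmalloc(), and reiser4_pool_free() kfree()s those */
+}
+#endif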
63489+
63490+/* release pool resources
63491+
63492+ Release all resources acquired by this pool, specifically, dynamically
63493+ allocated objects.
63494+
63495+*/
63496+void reiser4_done_pool(reiser4_pool * pool UNUSED_ARG /* pool to destroy */ )
63497+{
63498+}
63499+
63500+/* allocate carry object from pool
63501+
63502+ First, try to get preallocated object. If this fails, resort to dynamic
63503+ allocation.
63504+
63505+*/
63506+static void *reiser4_pool_alloc(reiser4_pool * pool /* pool to allocate object
63507+ * from */ )
63508+{
63509+ reiser4_pool_header *result;
63510+
63511+ assert("nikita-959", pool != NULL);
63512+
63513+ if (!list_empty(&pool->free)) {
63514+ struct list_head *linkage;
63515+
63516+ linkage = pool->free.next;
63517+ list_del(linkage);
63518+ INIT_LIST_HEAD(linkage);
63519+ result = list_entry(linkage, reiser4_pool_header, usage_linkage);
63520+ BUG_ON(!list_empty(&result->level_linkage) ||
63521+ !list_empty(&result->extra_linkage));
63522+ } else {
63523+		/* pool is empty. Extra allocations don't deserve a dedicated
63524+		   slab to be served from, as they are expected to be rare. */
63525+		result = kmalloc(pool->obj_size, get_gfp_mask());
63526+		if (result != NULL) {
63527+ reiser4_init_pool_obj(result);
63528+ list_add(&result->extra_linkage, &pool->extra);
63529+ } else
63530+ return ERR_PTR(RETERR(-ENOMEM));
63531+ BUG_ON(!list_empty(&result->usage_linkage) ||
63532+ !list_empty(&result->level_linkage));
63533+ }
63534+ ++pool->objs;
63535+ list_add(&result->usage_linkage, &pool->used);
63536+ memset(result + 1, 0, pool->obj_size - sizeof *result);
63537+ return result;
63538+}
63539+
63540+/* return object back to the pool */
63541+void reiser4_pool_free(reiser4_pool * pool /* pool to return the object to */ ,
63542+		       reiser4_pool_header * h /* object to return */ )
63543+{
63544+ assert("nikita-961", h != NULL);
63545+ assert("nikita-962", pool != NULL);
63546+
63547+ --pool->objs;
63548+ assert("nikita-963", pool->objs >= 0);
63549+
63550+ list_del_init(&h->usage_linkage);
63551+ list_del_init(&h->level_linkage);
63552+
63553+ if (list_empty(&h->extra_linkage))
63554+ /*
63555+ * pool header is not an extra one. Push it onto free list
63556+ * using usage_linkage
63557+ */
63558+ list_add(&h->usage_linkage, &pool->free);
63559+ else {
63560+ /* remove pool header from pool's extra list and kfree it */
63561+ list_del(&h->extra_linkage);
63562+ kfree(h);
63563+ }
63564+}
63565+
63566+/* add new object to the carry level list
63567+
63568+   Carry level is FIFO most of the time, but not always. Complications arise
63569+   when the make_space() function tries to go to the left neighbor and thus
63570+   adds a carry node before existing nodes; and also, when updating
63571+   delimiting keys after moving data between two nodes, we want the left
63572+   node to be locked before the right node.
63573+
63574+   The latter case is confusing at first glance. The problem is that the
63575+   COP_UPDATE operation that updates delimiting keys is sometimes called with
63576+   two nodes (when data are moved between two nodes) and sometimes with only
63577+   one node (when the leftmost item is deleted in a node). In any case the
63578+   operation is supplied with at least the node whose left delimiting key is
63579+   to be updated (that is, the "right" node).
63580+
63581+*/
63582+reiser4_pool_header *add_obj(reiser4_pool * pool /* pool from which to
63583+ * allocate new object */ ,
63584+ struct list_head *list, /* list where to add
63585+ * object */
63586+ pool_ordering order /* where to add */ ,
63587+ reiser4_pool_header * reference /* after (or
63588+ * before) which
63589+ * existing
63590+ * object to
63591+ * add */ )
63592+{
63593+ reiser4_pool_header *result;
63594+
63595+ assert("nikita-972", pool != NULL);
63596+
63597+ result = reiser4_pool_alloc(pool);
63598+ if (IS_ERR(result))
63599+ return result;
63600+
63601+ assert("nikita-973", result != NULL);
63602+
63603+ switch (order) {
63604+ case POOLO_BEFORE:
63605+ __list_add(&result->level_linkage,
63606+ reference->level_linkage.prev,
63607+ &reference->level_linkage);
63608+ break;
63609+ case POOLO_AFTER:
63610+ __list_add(&result->level_linkage,
63611+ &reference->level_linkage,
63612+ reference->level_linkage.next);
63613+ break;
63614+ case POOLO_LAST:
63615+ list_add_tail(&result->level_linkage, list);
63616+ break;
63617+ case POOLO_FIRST:
63618+ list_add(&result->level_linkage, list);
63619+ break;
63620+ default:
63621+ wrong_return_value("nikita-927", "order");
63622+ }
63623+ return result;
63624+}
63625+
63626+/* Make Linus happy.
63627+ Local variables:
63628+ c-indentation-style: "K&R"
63629+ mode-name: "LC"
63630+ c-basic-offset: 8
63631+ tab-width: 8
63632+ fill-column: 120
63633+ End:
63634+*/
63635Index: linux-2.6.16/fs/reiser4/pool.h
63636===================================================================
63637--- /dev/null
63638+++ linux-2.6.16/fs/reiser4/pool.h
63639@@ -0,0 +1,54 @@
63640+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
63641+
63642+/* Fast pool allocation */
63643+
63644+#ifndef __REISER4_POOL_H__
63645+#define __REISER4_POOL_H__
63646+
63647+#include <linux/types.h>
63648+
63649+typedef struct reiser4_pool {
63650+ size_t obj_size;
63651+ int objs;
63652+ char *data;
63653+ struct list_head free;
63654+ struct list_head used;
63655+ struct list_head extra;
63656+} reiser4_pool;
63657+
63658+typedef struct reiser4_pool_header {
63659+ /* object is either on free or "used" lists */
63660+ struct list_head usage_linkage;
63661+ struct list_head level_linkage;
63662+ struct list_head extra_linkage;
63663+} reiser4_pool_header;
63664+
63665+typedef enum {
63666+ POOLO_BEFORE,
63667+ POOLO_AFTER,
63668+ POOLO_LAST,
63669+ POOLO_FIRST
63670+} pool_ordering;
63671+
63672+/* pool manipulation functions */
63673+
63674+extern void reiser4_init_pool(reiser4_pool * pool, size_t obj_size,
63675+ int num_of_objs, char *data);
63676+extern void reiser4_done_pool(reiser4_pool * pool);
63677+extern void reiser4_pool_free(reiser4_pool * pool, reiser4_pool_header * h);
63678+reiser4_pool_header *add_obj(reiser4_pool * pool, struct list_head * list,
63679+ pool_ordering order,
63680+ reiser4_pool_header * reference);
63681+
63682+/* __REISER4_POOL_H__ */
63683+#endif
63684+
63685+/* Make Linus happy.
63686+ Local variables:
63687+ c-indentation-style: "K&R"
63688+ mode-name: "LC"
63689+ c-basic-offset: 8
63690+ tab-width: 8
63691+ fill-column: 120
63692+ End:
63693+*/
63694Index: linux-2.6.16/fs/reiser4/readahead.c
63695===================================================================
63696--- /dev/null
63697+++ linux-2.6.16/fs/reiser4/readahead.c
63698@@ -0,0 +1,138 @@
63699+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
63700+ * reiser4/README */
63701+
63702+#include "forward.h"
63703+#include "tree.h"
63704+#include "tree_walk.h"
63705+#include "super.h"
63706+#include "inode.h"
63707+#include "key.h"
63708+#include "znode.h"
63709+
63710+#include <linux/swap.h> /* for totalram_pages */
63711+
63712+void init_ra_info(ra_info_t * rai)
63713+{
63714+ rai->key_to_stop = *min_key();
63715+}
63716+
63717+/* global formatted node readahead parameter. It can be set by mount option -o readahead:NUM:1 */
63718+static inline int ra_adjacent_only(int flags)
63719+{
63720+ return flags & RA_ADJACENT_ONLY;
63721+}
63722+
63723+/* this is used by formatted_readahead to decide whether a read for the right neighbor of a node is to be issued. It
63724+   returns 1 if the right neighbor's first key is less than or equal to the readahead's stop key */
63725+static int should_readahead_neighbor(znode * node, ra_info_t * info)
63726+{
63727+ int result;
63728+
63729+ read_lock_dk(znode_get_tree(node));
63730+ result = keyle(znode_get_rd_key(node), &info->key_to_stop);
63731+ read_unlock_dk(znode_get_tree(node));
63732+ return result;
63733+}
63734+
63735+#define LOW_MEM_PERCENTAGE (5)
63736+
63737+static int low_on_memory(void)
63738+{
63739+ unsigned int freepages;
63740+
63741+ freepages = nr_free_pages();
63742+ return freepages < (totalram_pages * LOW_MEM_PERCENTAGE / 100);
63743+}
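+/* e.g. with 4 KiB pages and 1 GiB of RAM (totalram_pages == 262144), this
+   reports low memory once fewer than 262144 * 5 / 100 = 13107 pages
+   (about 51 MiB) are free. */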
63744+
63745+/* start read for @node and for a few of its right neighbors */
63746+void formatted_readahead(znode * node, ra_info_t * info)
63747+{
63748+ ra_params_t *ra_params;
63749+ znode *cur;
63750+ int i;
63751+ int grn_flags;
63752+ lock_handle next_lh;
63753+
63754+	/* do nothing if a real block number has not been assigned to the node yet (it exists only in memory, so there is nothing to read). */
63755+ if (blocknr_is_fake(znode_get_block(node)))
63756+ return;
63757+
63758+ ra_params = get_current_super_ra_params();
63759+
63760+ if (znode_page(node) == NULL)
63761+ jstartio(ZJNODE(node));
63762+
63763+ if (znode_get_level(node) != LEAF_LEVEL)
63764+ return;
63765+
63766+ /* don't waste memory for read-ahead when low on memory */
63767+ if (low_on_memory())
63768+ return;
63769+
63770+ /* We can have locked nodes on upper tree levels, in this situation lock
63771+ priorities do not help to resolve deadlocks, we have to use TRY_LOCK
63772+ here. */
63773+ grn_flags = (GN_CAN_USE_UPPER_LEVELS | GN_TRY_LOCK);
63774+
63775+ i = 0;
63776+ cur = zref(node);
63777+ init_lh(&next_lh);
63778+ while (i < ra_params->max) {
63779+ const reiser4_block_nr *nextblk;
63780+
63781+ if (!should_readahead_neighbor(cur, info))
63782+ break;
63783+
63784+ if (reiser4_get_right_neighbor
63785+ (&next_lh, cur, ZNODE_READ_LOCK, grn_flags))
63786+ break;
63787+
63788+ nextblk = znode_get_block(next_lh.node);
63789+ if (blocknr_is_fake(nextblk) ||
63790+ (ra_adjacent_only(ra_params->flags)
63791+ && *nextblk != *znode_get_block(cur) + 1)) {
63792+ break;
63793+ }
63794+
63795+ zput(cur);
63796+ cur = zref(next_lh.node);
63797+ done_lh(&next_lh);
63798+ if (znode_page(cur) == NULL)
63799+ jstartio(ZJNODE(cur));
63800+ else
63801+ /* Do not scan read-ahead window if pages already
63802+ * allocated (and i/o already started). */
63803+ break;
63804+
63805+ i++;
63806+ }
63807+ zput(cur);
63808+ done_lh(&next_lh);
63809+}
63810+
63811+void reiser4_readdir_readahead_init(struct inode *dir, tap_t * tap)
63812+{
63813+ reiser4_key *stop_key;
63814+
63815+ assert("nikita-3542", dir != NULL);
63816+ assert("nikita-3543", tap != NULL);
63817+
63818+ stop_key = &tap->ra_info.key_to_stop;
63819+ /* initialize readdir readahead information: include into readahead
63820+ * stat data of all files of the directory */
63821+ set_key_locality(stop_key, get_inode_oid(dir));
63822+ set_key_type(stop_key, KEY_SD_MINOR);
63823+ set_key_ordering(stop_key, get_key_ordering(max_key()));
63824+ set_key_objectid(stop_key, get_key_objectid(max_key()));
63825+ set_key_offset(stop_key, get_key_offset(max_key()));
63826+}
63827+
63828+/*
63829+ Local variables:
63830+ c-indentation-style: "K&R"
63831+ mode-name: "LC"
63832+ c-basic-offset: 8
63833+ tab-width: 8
63834+ fill-column: 80
63835+ End:
63836+*/
63837Index: linux-2.6.16/fs/reiser4/readahead.h
63838===================================================================
63839--- /dev/null
63840+++ linux-2.6.16/fs/reiser4/readahead.h
63841@@ -0,0 +1,48 @@
63842+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
63843+
63844+#ifndef __READAHEAD_H__
63845+#define __READAHEAD_H__
63846+
63847+#include "key.h"
63848+
63849+typedef enum {
63850+ RA_ADJACENT_ONLY = 1, /* only requests nodes which are adjacent. Default is NO (not only adjacent) */
63851+} ra_global_flags;
63852+
63853+/* reiser4 super block has a field of this type. It controls readahead during tree traversals */
63854+typedef struct formatted_read_ahead_params {
63855+	unsigned long max;	/* request no more than this number of nodes. Default is totalram_pages / 4 */
63856+ int flags;
63857+} ra_params_t;
63858+
63859+typedef struct {
63860+ reiser4_key key_to_stop;
63861+} ra_info_t;
63862+
63863+void formatted_readahead(znode *, ra_info_t *);
63864+void init_ra_info(ra_info_t * rai);
63865+
63866+struct reiser4_file_ra_state {
63867+ loff_t start; /* Current window */
63868+ loff_t size;
63869+ loff_t next_size; /* Next window size */
63870+ loff_t ahead_start; /* Ahead window */
63871+ loff_t ahead_size;
63872+ loff_t max_window_size; /* Maximum readahead window */
63873+ loff_t slow_start; /* enlarging r/a size algorithm. */
63874+};
63875+
63876+extern void reiser4_readdir_readahead_init(struct inode *dir, tap_t * tap);
63877+
63878+/* __READAHEAD_H__ */
63879+#endif
63880+
63881+/*
63882+ Local variables:
63883+ c-indentation-style: "K&R"
63884+ mode-name: "LC"
63885+ c-basic-offset: 8
63886+ tab-width: 8
63887+ fill-column: 120
63888+ End:
63889+*/
63890Index: linux-2.6.16/fs/reiser4/reiser4.h
63891===================================================================
63892--- /dev/null
63893+++ linux-2.6.16/fs/reiser4/reiser4.h
63894@@ -0,0 +1,276 @@
63895+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
63896+ * reiser4/README */
63897+
63898+/* definitions of common constants used by reiser4 */
63899+
63900+#if !defined( __REISER4_H__ )
63901+#define __REISER4_H__
63902+
63903+#include <linux/config.h>
63904+#include <asm/param.h> /* for HZ */
63905+#include <linux/errno.h>
63906+#include <linux/types.h>
63907+#include <linux/fs.h>
63908+#include <asm/hardirq.h>
63909+#include <linux/sched.h>
63910+
63911+/*
63912+ * reiser4 compilation options.
63913+ */
63914+
63915+#if defined(CONFIG_REISER4_DEBUG)
63916+/* turn on assertion checks */
63917+#define REISER4_DEBUG (1)
63918+#else
63919+#define REISER4_DEBUG (0)
63920+#endif
63921+
63922+#if defined(CONFIG_ZLIB_INFLATE)
63923+/* turn on zlib */
63924+#define REISER4_ZLIB (1)
63925+#else
63926+#define REISER4_ZLIB (0)
63927+#endif
63928+
63929+#if defined(CONFIG_CRYPTO_SHA256)
63930+#define REISER4_SHA256 (1)
63931+#else
63932+#define REISER4_SHA256 (0)
63933+#endif
63934+
63935+#if defined(CONFIG_CRYPTO_AES_586)
63936+#define REISER4_AES (1)
63937+#else
63938+#define REISER4_AES (0)
63939+#endif
63940+
63941+/*
63942+ * Turn on large keys mode. In this mode (which is the default), a reiser4 key
63943+ * has 4 8-byte components. In the old "small key" mode, it has 3 8-byte
63944+ * components. The additional component, referred to as "ordering", is used to
63945+ * order the items a given object is composed of. As such, ordering is placed
63946+ * between locality and objectid. For a directory item, ordering contains an
63947+ * initial prefix of the file name this item is for. This sorts all directory
63948+ * items within a given directory lexicographically (but see fibration.[ch]).
63949+ * For file bodies and stat-data, ordering contains an initial prefix of the
63950+ * name the file was initially created with. In the common case (files with a
63951+ * single name) this orders file bodies and stat-data in the same order as
63952+ * their respective directory entries, thus speeding up readdir.
63953+ *
63954+ * Note that the kernel can only mount a file system with the same key size as
63955+ * the one it is compiled for, so flipping this option may render your data
63956+ * inaccessible.
63958+ */
63959+#define REISER4_LARGE_KEY (1)
63960+/*#define REISER4_LARGE_KEY (0)*/
63961+
63962+/*#define GUESS_EXISTS 1*/
63963+
63964+/*
63965+ * PLEASE update fs/reiser4/kattr.c:show_options() when adding new compilation
63966+ * option
63967+ */
63968+
63969+extern const char *REISER4_SUPER_MAGIC_STRING;
63970+extern const int REISER4_MAGIC_OFFSET; /* offset to magic string from the
63971+ * beginning of device */
63972+
63973+/* here go tunable parameters that are not worth special entry in kernel
63974+ configuration */
63975+
63976+/* default number of slots in coord-by-key caches */
63977+#define CBK_CACHE_SLOTS (16)
63978+/* how many elementary tree operation to carry on the next level */
63979+#define CARRIES_POOL_SIZE (5)
63980+/* size of pool of preallocated nodes for carry process. */
63981+#define NODES_LOCKED_POOL_SIZE (5)
63982+
63983+#define REISER4_NEW_NODE_FLAGS (COPI_LOAD_LEFT | COPI_LOAD_RIGHT | COPI_GO_LEFT)
63984+#define REISER4_NEW_EXTENT_FLAGS (COPI_LOAD_LEFT | COPI_LOAD_RIGHT | COPI_GO_LEFT)
63985+#define REISER4_PASTE_FLAGS (COPI_GO_LEFT)
63986+#define REISER4_INSERT_FLAGS (COPI_GO_LEFT)
63987+
63988+/* we are supporting reservation of disk space on uid basis */
63989+#define REISER4_SUPPORT_UID_SPACE_RESERVATION (0)
63990+/* we are supporting reservation of disk space for groups */
63991+#define REISER4_SUPPORT_GID_SPACE_RESERVATION (0)
63992+/* we are supporting reservation of disk space for root */
63993+#define REISER4_SUPPORT_ROOT_SPACE_RESERVATION (0)
63994+/* we use rapid flush mode, see flush.c for comments. */
63995+#define REISER4_USE_RAPID_FLUSH (1)
63996+
63997+/*
63998+ * set this to 0 if you don't want to use wait-for-flush in ->writepage().
63999+ */
64000+#define REISER4_USE_ENTD (1)
64001+
64002+/* key allocation is Plan-A */
64003+#define REISER4_PLANA_KEY_ALLOCATION (1)
64004+/* key allocation follows good old 3.x scheme */
64005+#define REISER4_3_5_KEY_ALLOCATION (0)
64006+
64007+/* size of hash-table for znodes */
64008+#define REISER4_ZNODE_HASH_TABLE_SIZE (1 << 13)
64009+
64010+/* number of buckets in lnode hash-table */
64011+#define LNODE_HTABLE_BUCKETS (1024)
64012+
64013+/* some ridiculously high maximal limit on the height of the znode tree. This
64014+   is used in the declaration of various per-level arrays and
64015+   to allocate the statistics-gathering array for per-level stats. */
64016+#define REISER4_MAX_ZTREE_HEIGHT (8)
64017+
64018+#define REISER4_PANIC_MSG_BUFFER_SIZE (1024)
64019+
64020+/* If an array contains fewer than REISER4_SEQ_SEARCH_BREAK elements, then
64021+   sequential search is on average faster than binary search. This is because
64022+   of better optimization and because sequential search is more CPU-cache
64023+   friendly. This number (25) was found by experiments on a dual AMD
64024+   Athlon(tm), 1400MHz.
64025+
64026+   NOTE: testing in the kernel has shown that binary search is more effective
64027+   than implied by the results of the user-level benchmarking, probably
64028+   because in the node keys are separated by other data. So the value was
64029+   adjusted after a few tests. More thorough tuning is needed.
64030+*/
64031+#define REISER4_SEQ_SEARCH_BREAK (3)
64032+
64033+/* don't allow tree to be lower than this */
64034+#define REISER4_MIN_TREE_HEIGHT (TWIG_LEVEL)
64035+
64036+/* NOTE NIKITA this is no longer used: maximal atom size is auto-adjusted to
64037+ * available memory. */
64038+/* Default value of maximal atom size. Can be overwritten by the
64039+   tmgr.atom_max_size mount option. By default infinity. */
64040+#define REISER4_ATOM_MAX_SIZE ((unsigned)(~0))
64041+
64042+/* Default value of maximal atom age (in jiffies). After reaching this age
64043+ atom will be forced to commit, either synchronously or asynchronously. Can
64044+ be overwritten by tmgr.atom_max_age mount option. */
64045+#define REISER4_ATOM_MAX_AGE (600 * HZ)
64046+
64047+/* sleeping period for ktxnmgrd */
64048+#define REISER4_TXNMGR_TIMEOUT (5 * HZ)
64049+
64050+/* timeout to wait for ent thread in writepage. Default: 3 milliseconds. */
64051+#define REISER4_ENTD_TIMEOUT (3 * HZ / 1000)
64052+
64053+/* start complaining after that many restarts in coord_by_key().
64054+
64055+ This either means incredibly heavy contention for this part of a tree, or
64056+ some corruption or bug.
64057+*/
64058+#define REISER4_CBK_ITERATIONS_LIMIT (100)
64059+
64060+/* return -EIO after that many iterations in coord_by_key().
64061+
64062+ I have witnessed more than 800 iterations (in 30 thread test) before cbk
64063+ finished. --nikita
64064+*/
64065+#define REISER4_MAX_CBK_ITERATIONS 500000
64066+
64067+/* put a per-inode limit on maximal number of directory entries with identical
64068+ keys in hashed directory.
64069+
64070+ Disable this until inheritance interfaces stabilize: we need some way to
64071+ set per directory limit.
64072+*/
64073+#define REISER4_USE_COLLISION_LIMIT (0)
64074+
64075+/* If flush finds more than FLUSH_RELOCATE_THRESHOLD adjacent dirty leaf-level blocks it
64076+ will force them to be relocated. */
64077+#define FLUSH_RELOCATE_THRESHOLD 64
64078+/* If flush can find a block allocation within FLUSH_RELOCATE_DISTANCE
64079+   of the preceder, it will relocate to that position. */
64080+#define FLUSH_RELOCATE_DISTANCE 64
64081+
64082+/* If we have written this many blocks or more before encountering a busy
64083+   jnode in the flush list, abort flushing, hoping that next time we get
64084+   called this jnode will be clean already, and we will save some seeks. */
64085+#define FLUSH_WRITTEN_THRESHOLD 50
64086+
64087+/* The maximum number of nodes to scan left on a level during flush. */
64088+#define FLUSH_SCAN_MAXNODES 10000
64089+
64090+/* per-atom limit of flushers */
64091+#define ATOM_MAX_FLUSHERS (1)
64092+
64093+/* default tracing buffer size */
64094+#define REISER4_TRACE_BUF_SIZE (1 << 15)
64095+
64096+/* what size units of IO we would like cp, etc., to use, in writing to
64097+ reiser4. In bytes.
64098+
64099+ Can be overwritten by optimal_io_size mount option.
64100+*/
64101+#define REISER4_OPTIMAL_IO_SIZE (64 * 1024)
64102+
64103+/* see comments in inode.c:oid_to_uino() */
64104+#define REISER4_UINO_SHIFT (1 << 30)
64105+
64106+/* Mark function argument as unused to avoid compiler warnings. */
64107+#define UNUSED_ARG __attribute__((unused))
64108+
64109+#if ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 3)) || (__GNUC__ > 3)
64110+#define NONNULL __attribute__((nonnull))
64111+#else
64112+#define NONNULL
64113+#endif
64114+
64115+/* master super block offset in bytes.*/
64116+#define REISER4_MASTER_OFFSET 65536
64117+
64118+/* size of VFS block */
64119+#define VFS_BLKSIZE 512
64120+/* number of bits in size of VFS block (512==2^9) */
64121+#define VFS_BLKSIZE_BITS 9
64122+
64123+#define REISER4_I reiser4_inode_data
64124+
64125+/* implication */
64126+#define ergo( antecedent, consequent ) ( !( antecedent ) || ( consequent ) )
64127+/* logical equivalence */
64128+#define equi( p1, p2 ) ( ergo( ( p1 ), ( p2 ) ) && ergo( ( p2 ), ( p1 ) ) )
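+/* ergo() makes preconditions readable in asserts; e.g.
+   assert("zam-359", ergo(end_bmap == bmap, end_offset >= offset)) in
+   plugin/space/bitmap.c reads "if both ends fall into the same bitmap block,
+   the offsets must be ordered", and holds vacuously otherwise. */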
64129+
64130+#define sizeof_array(x) ((int) (sizeof(x) / sizeof(x[0])))
64131+
64132+#define NOT_YET (0)
64133+
64134+/** Reiser4 specific error codes **/
64135+
64136+#define REISER4_ERROR_CODE_BASE 500
64137+
64138+/* Neighbor is not available (side neighbor or parent) */
64139+#define E_NO_NEIGHBOR (REISER4_ERROR_CODE_BASE)
64140+
64141+/* Node was not found in cache */
64142+#define E_NOT_IN_CACHE (REISER4_ERROR_CODE_BASE + 1)
64143+
64144+/* node does not have enough free space to complete the balancing operation */
64145+#define E_NODE_FULL (REISER4_ERROR_CODE_BASE + 2)
64146+
64147+/* repeat operation */
64148+#define E_REPEAT (REISER4_ERROR_CODE_BASE + 3)
64149+
64150+/* deadlock happens */
64151+#define E_DEADLOCK (REISER4_ERROR_CODE_BASE + 4)
64152+
64153+/* operation cannot be performed, because it would block and non-blocking mode
64154+ * was requested. */
64155+#define E_BLOCK (REISER4_ERROR_CODE_BASE + 5)
64156+
64157+/* wait some event (depends on context), then repeat */
64158+#define E_WAIT (REISER4_ERROR_CODE_BASE + 6)
64159+
64160+#endif /* __REISER4_H__ */
64161+
64162+/* Make Linus happy.
64163+ Local variables:
64164+ c-indentation-style: "K&R"
64165+ mode-name: "LC"
64166+ c-basic-offset: 8
64167+ tab-width: 8
64168+ fill-column: 120
64169+ End:
64170+*/
64171Index: linux-2.6.16/fs/reiser4/safe_link.c
64172===================================================================
64173--- /dev/null
64174+++ linux-2.6.16/fs/reiser4/safe_link.c
64175@@ -0,0 +1,351 @@
64176+/* Copyright 2003, 2004 by Hans Reiser, licensing governed by
64177+ * reiser4/README */
64178+
64179+/* Safe-links. */
64180+
64181+/*
64182+ * Safe-links are used to maintain file system consistency during operations
64183+ * that spawn multiple transactions. For example:
64184+ *
64185+ * 1. Unlink. UNIX supports "open-but-unlinked" files, that is files
64186+ * without user-visible names in the file system, but still opened by some
64187+ * active process. What happens here is that unlink proper (i.e., removal
64188+ * of the last file name) and file deletion (truncate of file body to zero
64189+ * and deletion of stat-data, that happens when last file descriptor is
64190+ * closed), may belong to different transactions T1 and T2. If a crash
64191+ *    happens after T1 commit, but before T2 commit, the on-disk file system
64192+ *    has a file without a name, that is, a disk space leak.
64193+ *
64194+ * 2. Truncate. Truncate of a large file may spawn multiple transactions. If
64195+ *    the system crashes while truncate was in progress, the file is left
64196+ *    partially truncated, which violates the "atomicity guarantees" of
64197+ *    reiser4, viz. that every system call is atomic.
64198+ *
64199+ * Safe-links address both of the above cases. Basically, a safe-link is a way
64200+ * to post some operation to be executed during the commit of some transaction
64201+ * other than the current one. (Another way to look at the safe-link is to
64202+ * interpret it as logical logging.)
64203+ *
64204+ * Specifically, at the beginning of unlink a safe-link is inserted in the
64205+ * tree. This safe-link is normally removed by the file deletion code (during
64206+ * transaction T2 in the above terms). Truncate also inserts a safe-link that
64207+ * is normally removed when the truncate operation is finished.
64208+ *
64209+ * This means that in the case of a "clean umount" there are no safe-links in
64210+ * the tree. If safe-links are observed during mount, it means that (a) the
64211+ * system was terminated abnormally, and (b) the safe-links correspond to
64212+ * "pending" (i.e., not finished) operations that were in progress during
64213+ * system termination. Each safe-link records enough information to complete
64214+ * the corresponding operation, and mount simply "replays" them (hence the
64215+ * analogy with logical logging).
64216+ *
64217+ * Safe-links are implemented as blackbox items (see
64218+ * plugin/item/blackbox.[ch]).
64219+ *
64220+ * For the reference: ext3 also has similar mechanism, it's called "an orphan
64221+ * list" there.
64222+ */
64223+
64224+#include "safe_link.h"
64225+#include "debug.h"
64226+#include "inode.h"
64227+
64228+#include "plugin/item/blackbox.h"
64229+
64230+#include <linux/fs.h>
64231+
64232+/*
64233+ * On-disk format of safe-link.
64234+ */
64235+typedef struct safelink {
64236+ reiser4_key sdkey; /* key of stat-data for the file safe-link is
64237+ * for */
64238+ d64 size; /* size to which file should be truncated */
64239+} safelink_t;
64240+
64241+/*
64242+ * locality where safe-link items are stored. Next to the objectid of root
64243+ * directory.
64244+ */
64245+static oid_t safe_link_locality(reiser4_tree * tree)
64246+{
64247+ return get_key_objectid(get_super_private(tree->super)->df_plug->
64248+ root_dir_key(tree->super)) + 1;
64249+}
64250+
64251+/*
64252+ Construct a key for the safe-link. Key has the following format:
64253+
64254+| 60 | 4 | 64 | 4 | 60 | 64 |
64255++---------------+---+------------------+---+---------------+------------------+
64256+| locality | 0 | 0 | 0 | objectid | link type |
64257++---------------+---+------------------+---+---------------+------------------+
64258+| | | | |
64259+| 8 bytes | 8 bytes | 8 bytes | 8 bytes |
64260+
64261+ This is in the large-keys format. In the small-keys format the second 8-byte
64262+ chunk is absent. Locality is a constant returned by safe_link_locality().
64263+ objectid is the oid of the file on which the operation protected by this
64264+ safe-link is performed. link-type is used to distinguish safe-links for
64265+ different operations.
64266+
64267+ */
64268+static reiser4_key *build_link_key(reiser4_tree * tree, oid_t oid,
64269+ reiser4_safe_link_t link, reiser4_key * key)
64270+{
64271+ reiser4_key_init(key);
64272+ set_key_locality(key, safe_link_locality(tree));
64273+ set_key_objectid(key, oid);
64274+ set_key_offset(key, link);
64275+ return key;
64276+}
64277+
64278+/*
64279+ * how much disk space is necessary to insert and remove (in the
64280+ * error-handling path) a safe-link.
64281+ */
64282+static __u64 safe_link_tograb(reiser4_tree * tree)
64283+{
64284+ return
64285+ /* insert safe link */
64286+ estimate_one_insert_item(tree) +
64287+ /* remove safe link */
64288+ estimate_one_item_removal(tree) +
64289+ /* drill to the leaf level during insertion */
64290+ 1 + estimate_one_insert_item(tree) +
64291+	    /*
64292+	     * possible update of an existing safe-link. Actually, if the
64293+	     * safe-link already existed (we failed to remove it), then no
64294+	     * insertion is necessary, so this term is already "covered",
64295+	     * but for simplicity let's leave it.
64296+	     */
64297+ 1;
64298+}
64299+
64300+/*
64301+ * grab enough disk space to insert and remove (in the error-handling path)
64302+ * a safe-link.
64303+ */
64304+int safe_link_grab(reiser4_tree * tree, reiser4_ba_flags_t flags)
64305+{
64306+ int result;
64307+
64308+ grab_space_enable();
64309+ /* The sbinfo->delete semaphore can be taken here.
64310+ * safe_link_release() should be called before leaving reiser4
64311+ * context. */
64312+ result =
64313+ reiser4_grab_reserved(tree->super, safe_link_tograb(tree), flags);
64314+ grab_space_enable();
64315+ return result;
64316+}
64317+
64318+/*
64319+ * release unused disk space reserved by safe_link_grab().
64320+ */
64321+void safe_link_release(reiser4_tree * tree)
64322+{
64323+ reiser4_release_reserved(tree->super);
64324+}
64325+
64326+/*
64327+ * insert into the tree a safe-link for operation @link on inode @inode.
64328+ */
64329+int safe_link_add(struct inode *inode, reiser4_safe_link_t link)
64330+{
64331+ reiser4_key key;
64332+ safelink_t sl;
64333+ int length;
64334+ int result;
64335+ reiser4_tree *tree;
64336+
64337+ build_sd_key(inode, &sl.sdkey);
64338+ length = sizeof sl.sdkey;
64339+
64340+ if (link == SAFE_TRUNCATE) {
64341+		/*
64342+		 * for truncate we also have to store the final file length,
64343+		 * so expand the item.
64344+		 */
64345+ length += sizeof(sl.size);
64346+ put_unaligned(cpu_to_le64(inode->i_size), &sl.size);
64347+ }
64348+ tree = tree_by_inode(inode);
64349+ build_link_key(tree, get_inode_oid(inode), link, &key);
64350+
64351+ result = store_black_box(tree, &key, &sl, length);
64352+ if (result == -EEXIST)
64353+ result = update_black_box(tree, &key, &sl, length);
64354+ return result;
64355+}
64356+
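+/*
+ * Illustrative sketch (editor's addition, not part of the original patch):
+ * how a caller is expected to pair the functions above at the start of a
+ * truncate. The reservation is taken first, the safe-link is inserted, and
+ * the reservation is released before leaving reiser4 context; the safe-link
+ * itself is removed later by safe_link_del() when the truncate finishes.
+ * "example_truncate_prologue" is a hypothetical name.
+ */
+#if 0
+static int example_truncate_prologue(struct inode *inode)
+{
+	reiser4_tree *tree = tree_by_inode(inode);
+	int result;
+
+	/* reserve space for safe-link insertion and its eventual removal */
+	result = safe_link_grab(tree, BA_CAN_COMMIT);
+	if (result == 0) {
+		result = safe_link_add(inode, SAFE_TRUNCATE);
+		safe_link_release(tree);
+	}
+	return result;
+}
+#endif
+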
64357+/*
64358+ * remove from the tree the safe-link corresponding to operation @link on
64359+ * the object with object id @oid.
64360+ */
64361+int safe_link_del(reiser4_tree * tree, oid_t oid, reiser4_safe_link_t link)
64362+{
64363+ reiser4_key key;
64364+
64365+ return kill_black_box(tree, build_link_key(tree, oid, link, &key));
64366+}
64367+
64368+/*
64369+ * in-memory structure to keep information extracted from safe-link. This is
64370+ * used to iterate over all safe-links.
64371+ */
64372+typedef struct {
64373+ reiser4_tree *tree; /* internal tree */
64374+ reiser4_key key; /* safe-link key */
64375+ reiser4_key sdkey; /* key of object stat-data */
64376+ reiser4_safe_link_t link; /* safe-link type */
64377+ oid_t oid; /* object oid */
64378+ __u64 size; /* final size for truncate */
64379+} safe_link_context;
64380+
64381+/*
64382+ * start iterating over all safe-links.
64383+ */
64384+static void safe_link_iter_begin(reiser4_tree * tree, safe_link_context * ctx)
64385+{
64386+ ctx->tree = tree;
64387+ reiser4_key_init(&ctx->key);
64388+ set_key_locality(&ctx->key, safe_link_locality(tree));
64389+ set_key_objectid(&ctx->key, get_key_objectid(max_key()));
64390+ set_key_offset(&ctx->key, get_key_offset(max_key()));
64391+}
64392+
64393+/*
64394+ * return next safe-link.
64395+ */
64396+static int safe_link_iter_next(safe_link_context * ctx)
64397+{
64398+ int result;
64399+ safelink_t sl;
64400+
64401+ result = load_black_box(ctx->tree, &ctx->key, &sl, sizeof sl, 0);
64402+ if (result == 0) {
64403+ ctx->oid = get_key_objectid(&ctx->key);
64404+ ctx->link = get_key_offset(&ctx->key);
64405+ ctx->sdkey = sl.sdkey;
64406+ if (ctx->link == SAFE_TRUNCATE)
64407+ ctx->size = le64_to_cpu(get_unaligned(&sl.size));
64408+ }
64409+ return result;
64410+}
64411+
64412+/*
64413+ * check whether there are any more safe-links left in the tree.
64414+ */
64415+static int safe_link_iter_finished(safe_link_context * ctx)
64416+{
64417+ return get_key_locality(&ctx->key) != safe_link_locality(ctx->tree);
64418+}
64419+
64420+/*
64421+ * finish safe-link iteration.
64422+ */
64423+static void safe_link_iter_end(safe_link_context * ctx)
64424+{
64425+ /* nothing special */
64426+}
64427+
64428+/*
64429+ * process single safe-link.
64430+ */
64431+static int process_safelink(struct super_block *super, reiser4_safe_link_t link,
64432+ reiser4_key * sdkey, oid_t oid, __u64 size)
64433+{
64434+ struct inode *inode;
64435+ int result;
64436+
64437+	/*
64438+	 * obtain the object's inode by reiser4_iget(), then call the object
64439+	 * plugin's ->safelink() method to do the actual work, then delete
64440+	 * the safe-link on success.
64441+	 */
64442+ inode = reiser4_iget(super, sdkey, 1);
64443+ if (!IS_ERR(inode)) {
64444+ file_plugin *fplug;
64445+
64446+ fplug = inode_file_plugin(inode);
64447+ assert("nikita-3428", fplug != NULL);
64448+ assert("", oid == get_inode_oid(inode));
64449+ if (fplug->safelink != NULL) {
64450+			/* txn_restart_current() is not necessary because
64451+			 * mounting is single threaded. However, without it
64452+			 * the deadlock detection code will complain (see
64453+			 * nikita-3361). */
64454+ txn_restart_current();
64455+ result = fplug->safelink(inode, link, size);
64456+ } else {
64457+ warning("nikita-3430",
64458+ "Cannot handle safelink for %lli",
64459+ (unsigned long long)oid);
64460+ print_key("key", sdkey);
64461+ result = 0;
64462+ }
64463+ if (result != 0) {
64464+ warning("nikita-3431",
64465+ "Error processing safelink for %lli: %i",
64466+ (unsigned long long)oid, result);
64467+ }
64468+ reiser4_iget_complete(inode);
64469+ iput(inode);
64470+ if (result == 0) {
64471+ result = safe_link_grab(get_tree(super), BA_CAN_COMMIT);
64472+ if (result == 0)
64473+ result =
64474+ safe_link_del(get_tree(super), oid, link);
64475+ safe_link_release(get_tree(super));
64476+			/*
64477+			 * restart the transaction: if there was a large
64478+			 * number of safe-links, their processing may fail to
64479+			 * fit into a single transaction.
64480+			 */
64481+ if (result == 0)
64482+ txn_restart_current();
64483+ }
64484+ } else
64485+ result = PTR_ERR(inode);
64486+ return result;
64487+}
64488+
64489+/*
64490+ * iterate over all safe-links in the file system, processing them one by one.
64491+ */
64492+int process_safelinks(struct super_block *super)
64493+{
64494+ safe_link_context ctx;
64495+ int result;
64496+
64497+ if (rofs_super(super))
64498+		/* do nothing on a read-only file system */
64499+ return 0;
64500+ safe_link_iter_begin(&get_super_private(super)->tree, &ctx);
64501+ result = 0;
64502+ do {
64503+ result = safe_link_iter_next(&ctx);
64504+ if (safe_link_iter_finished(&ctx) || result == -ENOENT) {
64505+ result = 0;
64506+ break;
64507+ }
64508+ if (result == 0)
64509+ result = process_safelink(super, ctx.link,
64510+ &ctx.sdkey, ctx.oid,
64511+ ctx.size);
64512+ } while (result == 0);
64513+ safe_link_iter_end(&ctx);
64514+ return result;
64515+}
64516+
64517+/* Make Linus happy.
64518+ Local variables:
64519+ c-indentation-style: "K&R"
64520+ mode-name: "LC"
64521+ c-basic-offset: 8
64522+ tab-width: 8
64523+ fill-column: 120
64524+ scroll-step: 1
64525+ End:
64526+*/
64527Index: linux-2.6.16/fs/reiser4/safe_link.h
64528===================================================================
64529--- /dev/null
64530+++ linux-2.6.16/fs/reiser4/safe_link.h
64531@@ -0,0 +1,29 @@
64532+/* Copyright 2003 by Hans Reiser, licensing governed by
64533+ * reiser4/README */
64534+
64535+/* Safe-links. See safe_link.c for details. */
64536+
64537+#if !defined( __FS_SAFE_LINK_H__ )
64538+#define __FS_SAFE_LINK_H__
64539+
64540+#include "tree.h"
64541+
64542+int safe_link_grab(reiser4_tree * tree, reiser4_ba_flags_t flags);
64543+void safe_link_release(reiser4_tree * tree);
64544+int safe_link_add(struct inode *inode, reiser4_safe_link_t link);
64545+int safe_link_del(reiser4_tree *, oid_t oid, reiser4_safe_link_t link);
64546+
64547+int process_safelinks(struct super_block *super);
64548+
64549+/* __FS_SAFE_LINK_H__ */
64550+#endif
64551+
64552+/* Make Linus happy.
64553+ Local variables:
64554+ c-indentation-style: "K&R"
64555+ mode-name: "LC"
64556+ c-basic-offset: 8
64557+ tab-width: 8
64558+ fill-column: 120
64559+ End:
64560+*/
64561Index: linux-2.6.16/fs/reiser4/seal.c
64562===================================================================
64563--- /dev/null
64564+++ linux-2.6.16/fs/reiser4/seal.c
64565@@ -0,0 +1,217 @@
64566+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
64567+/* Seals implementation. */
64568+/* Seals are "weak" tree pointers. They are analogous to tree coords in
64569+ that they allow bypassing tree traversal. But normal usage of coords
64570+ implies that the node pointed to by a coord is locked, whereas seals don't
64571+ keep a lock (or even a reference) to the znode. Instead, each znode
64572+ contains a version number, increased on each znode modification. This
64573+ version number is copied into a seal when the seal is created. Later, one
64574+ can "validate" a seal by calling seal_validate(). If the znode is in cache
64575+ and its version number is still the same, the seal is "pristine" and the
64576+ coord associated with it can be re-used immediately.
64577+
64578+ If, on the other hand, the znode is out of cache, or is obviously different
64579+ from the znode the seal was initially attached to (for example, it is on a
64580+ different level, or is being removed from the tree), the seal is irreparably
64581+ invalid ("burned") and tree traversal has to be repeated.
64582+
64583+ Otherwise, there is some hope that, while the znode was modified (and the
64584+ seal was "broken" as a result), the key attached to the seal is still in the
64585+ node. This is checked by first comparing the key with the delimiting keys of
64586+ the node and, if the key is ok, doing an intra-node lookup.
64587+
64588+ Znode versions are maintained in the following way:
64589+
64590+ there is a reiser4_tree.znode_epoch counter. Whenever a new znode is
64591+ created, znode_epoch is incremented and its new value is stored in the
64592+ ->version field of the new znode. Whenever a znode is dirtied (which means
64593+ it was probably modified), znode_epoch is also incremented and its new
64594+ value is stored in znode->version. This is done because just incrementing
64595+ znode->version on each update is not enough: it may happen that a znode
64596+ gets deleted, a new znode is allocated for the same disk block and gets
64597+ the same version counter, tricking the seal code into a false positive.
64598+*/
64599+
64600+#include "forward.h"
64601+#include "debug.h"
64602+#include "key.h"
64603+#include "coord.h"
64604+#include "seal.h"
64605+#include "plugin/item/item.h"
64606+#include "plugin/node/node.h"
64607+#include "jnode.h"
64608+#include "znode.h"
64609+#include "super.h"
64610+
64611+static znode *seal_node(const seal_t * seal);
64612+static int seal_matches(const seal_t * seal, znode * node);
64613+
64614+/* initialise seal. This can be called several times on the same seal. @coord
64615+ and @key can be NULL. */
64616+void seal_init(seal_t * seal /* seal to initialise */ ,
64617+ const coord_t * coord /* coord @seal will be attached to */ ,
64618+ const reiser4_key * key UNUSED_ARG /* key @seal will be
64619+ * attached to */ )
64620+{
64621+ assert("nikita-1886", seal != NULL);
64622+ memset(seal, 0, sizeof *seal);
64623+ if (coord != NULL) {
64624+ znode *node;
64625+
64626+ node = coord->node;
64627+ assert("nikita-1987", node != NULL);
64628+ spin_lock_znode(node);
64629+ seal->version = node->version;
64630+ assert("nikita-1988", seal->version != 0);
64631+ seal->block = *znode_get_block(node);
64632+#if REISER4_DEBUG
64633+ seal->coord1 = *coord;
64634+ if (key != NULL)
64635+ seal->key = *key;
64636+#endif
64637+ spin_unlock_znode(node);
64638+ }
64639+}
64640+
64641+/* finish with seal */
64642+void seal_done(seal_t * seal /* seal to clear */ )
64643+{
64644+ assert("nikita-1887", seal != NULL);
64645+ seal->version = 0;
64646+}
64647+
64648+/* true if seal was initialised */
64649+int seal_is_set(const seal_t * seal /* seal to query */ )
64650+{
64651+ assert("nikita-1890", seal != NULL);
64652+ return seal->version != 0;
64653+}
64654+
64655+#if REISER4_DEBUG
64656+/* helper function for seal_validate(). It checks that the item at @coord has
64657+ * the expected key. This is to detect cases where the node was modified but
64658+ * wasn't marked dirty. */
64659+static inline int check_seal_match(const coord_t * coord /* coord to check */ ,
64660+ const reiser4_key * k /* expected key */ )
64661+{
64662+ reiser4_key ukey;
64663+
64664+ return (coord->between != AT_UNIT) ||
64665+ /* FIXME-VS: we only can compare keys for items whose units
64666+ represent exactly one key */
64667+ ((coord_is_existing_unit(coord))
64668+ && (item_is_extent(coord)
64669+ || keyeq(k, unit_key_by_coord(coord, &ukey))))
64670+ || ((coord_is_existing_unit(coord)) && (item_is_ctail(coord))
64671+ && keyge(k, unit_key_by_coord(coord, &ukey)));
64672+}
64673+#endif
64674+
64675+/* this is used by seal_validate(). It accepts the return value of
64676+ * longterm_lock_znode() and returns 1 if it can be interpreted as a seal
64677+ * validation failure. For instance, when longterm_lock_znode() returns
64678+ * -EINVAL, seal_validate() returns -E_REPEAT and the caller will repeat the
64679+ * tree search. We cannot do this in longterm_lock_znode(), because sometimes
64680+ * we want to distinguish between -EINVAL and -E_REPEAT. */
64681+static int should_repeat(int return_code)
64682+{
64683+ return return_code == -EINVAL;
64684+}
64685+
64686+/* (re-)validate seal.
64687+
64688+ Checks whether the seal is pristine, and tries to revalidate it if possible.
64689+
64690+ If the seal was burned, or broken irreparably, return -E_REPEAT.
64691+
64692+ NOTE-NIKITA currently seal_validate() returns -E_REPEAT if the key we are
64693+ looking for is in the range of keys covered by the sealed node, but the item
64694+ wasn't found by the node's ->lookup() method. An alternative is to return
64695+ -ENOENT in this case, but this would complicate the callers' logic.
64696+
64697+*/
64698+int seal_validate(seal_t * seal /* seal to validate */ ,
64699+ coord_t * coord /* coord to validate against */ ,
64700+ const reiser4_key * key /* key to validate against */ ,
64701+ lock_handle * lh /* resulting lock handle */ ,
64702+ znode_lock_mode mode /* lock node */ ,
64703+ znode_lock_request request /* locking priority */ )
64704+{
64705+ znode *node;
64706+ int result;
64707+
64708+ assert("nikita-1889", seal != NULL);
64709+ assert("nikita-1881", seal_is_set(seal));
64710+ assert("nikita-1882", key != NULL);
64711+ assert("nikita-1883", coord != NULL);
64712+ assert("nikita-1884", lh != NULL);
64713+ assert("nikita-1885", keyeq(&seal->key, key));
64714+ assert("nikita-1989", coords_equal(&seal->coord1, coord));
64715+
64716+ /* obtain znode by block number */
64717+ node = seal_node(seal);
64718+ if (node != NULL) {
64719+ /* znode was in cache, lock it */
64720+ result = longterm_lock_znode(lh, node, mode, request);
64721+ zput(node);
64722+ if (result == 0) {
64723+ if (seal_matches(seal, node)) {
64724+ /* if seal version and znode version
64725+ coincide */
64726+ ON_DEBUG(coord_update_v(coord));
64727+ assert("nikita-1990",
64728+ node == seal->coord1.node);
64729+ assert("nikita-1898",
64730+ WITH_DATA_RET(coord->node, 1,
64731+ check_seal_match(coord,
64732+ key)));
64733+ } else
64734+ result = RETERR(-E_REPEAT);
64735+ }
64736+ if (result != 0) {
64737+ if (should_repeat(result))
64738+ result = RETERR(-E_REPEAT);
64739+ /* unlock node on failure */
64740+ done_lh(lh);
64741+ }
64742+ } else {
64743+ /* znode wasn't in cache */
64744+ result = RETERR(-E_REPEAT);
64745+ }
64746+ return result;
64747+}
64748+
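+/*
+ * Illustrative sketch (editor's addition, not part of the original patch):
+ * the typical seal life-cycle. A caller remembers the position found by a
+ * lookup in a seal and later tries to short-circuit the next lookup with it,
+ * falling back to a full traversal via coord_by_key() (see search.c) when
+ * the seal was burned. "example_sealed_lookup" is a hypothetical name.
+ */
+#if 0
+static int example_sealed_lookup(reiser4_tree *tree, const reiser4_key *key,
+				 seal_t *seal, coord_t *coord, lock_handle *lh)
+{
+	int result;
+
+	if (seal_is_set(seal)) {
+		result = seal_validate(seal, coord, key, lh,
+				       ZNODE_READ_LOCK, ZNODE_LOCK_LOPRI);
+		if (result == 0)
+			return 0;	/* seal was usable, coord re-used */
+	}
+	/* seal unset or burned (-E_REPEAT): repeat the tree traversal */
+	result = coord_by_key(tree, key, coord, lh, ZNODE_READ_LOCK,
+			      FIND_EXACT, LEAF_LEVEL, LEAF_LEVEL,
+			      CBK_UNIQUE, NULL);
+	if (result == CBK_COORD_FOUND)
+		seal_init(seal, coord, key);	/* re-arm for the next call */
+	return result;
+}
+#endif
+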
64749+/* helper functions */
64750+
64751+/* obtain a reference to the znode the seal points to, if it is in cache */
64752+static znode *seal_node(const seal_t * seal /* seal to query */ )
64753+{
64754+ assert("nikita-1891", seal != NULL);
64755+ return zlook(current_tree, &seal->block);
64756+}
64757+
64758+/* true if @seal version and @node version coincide */
64759+static int seal_matches(const seal_t * seal /* seal to check */ ,
64760+ znode * node /* node to check */ )
64761+{
64762+ int result;
64763+
64764+ assert("nikita-1991", seal != NULL);
64765+ assert("nikita-1993", node != NULL);
64766+
64767+ spin_lock_znode(node);
64768+ result = (seal->version == node->version);
64769+ spin_unlock_znode(node);
64770+ return result;
64771+}
64772+
64773+/* Make Linus happy.
64774+ Local variables:
64775+ c-indentation-style: "K&R"
64776+ mode-name: "LC"
64777+ c-basic-offset: 8
64778+ tab-width: 8
64779+ fill-column: 120
64780+ scroll-step: 1
64781+ End:
64782+*/
64783Index: linux-2.6.16/fs/reiser4/seal.h
64784===================================================================
64785--- /dev/null
64786+++ linux-2.6.16/fs/reiser4/seal.h
64787@@ -0,0 +1,49 @@
64788+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
64789+
64790+/* Declaration of seals: "weak" tree pointers. See seal.c for comments. */
64791+
64792+#ifndef __SEAL_H__
64793+#define __SEAL_H__
64794+
64795+#include "forward.h"
64796+#include "debug.h"
64797+#include "dformat.h"
64798+#include "key.h"
64799+#include "coord.h"
64800+
64801+/* for __u?? types */
64802+/*#include <linux/types.h>*/
64803+
64804+/* seal. See comment at the top of seal.c */
64805+typedef struct seal_s {
64806+	/* version of znode recorded at the time of seal creation */
64807+ __u64 version;
64808+ /* block number of znode attached to this seal */
64809+ reiser4_block_nr block;
64810+#if REISER4_DEBUG
64811+ /* coord this seal is attached to. For debugging. */
64812+ coord_t coord1;
64813+ /* key this seal is attached to. For debugging. */
64814+ reiser4_key key;
64815+#endif
64816+} seal_t;
64817+
64818+extern void seal_init(seal_t *, const coord_t *, const reiser4_key *);
64819+extern void seal_done(seal_t *);
64820+extern int seal_is_set(const seal_t *);
64821+extern int seal_validate(seal_t *, coord_t *,
64822+ const reiser4_key *, lock_handle *,
64823+ znode_lock_mode mode, znode_lock_request request);
64824+
64825+/* __SEAL_H__ */
64826+#endif
64827+
64828+/* Make Linus happy.
64829+ Local variables:
64830+ c-indentation-style: "K&R"
64831+ mode-name: "LC"
64832+ c-basic-offset: 8
64833+ tab-width: 8
64834+ fill-column: 120
64835+ End:
64836+*/
64837Index: linux-2.6.16/fs/reiser4/search.c
64838===================================================================
64839--- /dev/null
64840+++ linux-2.6.16/fs/reiser4/search.c
64841@@ -0,0 +1,1611 @@
64842+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
64843+ * reiser4/README */
64844+
64845+#include "forward.h"
64846+#include "debug.h"
64847+#include "dformat.h"
64848+#include "key.h"
64849+#include "coord.h"
64850+#include "seal.h"
64851+#include "plugin/item/item.h"
64852+#include "plugin/node/node.h"
64853+#include "plugin/plugin.h"
64854+#include "jnode.h"
64855+#include "znode.h"
64856+#include "block_alloc.h"
64857+#include "tree_walk.h"
64858+#include "tree.h"
64859+#include "reiser4.h"
64860+#include "super.h"
64861+#include "inode.h"
64862+
64863+#include <linux/slab.h>
64864+
64865+static const char *bias_name(lookup_bias bias);
64866+
64867+/* tree searching algorithm, intranode searching algorithms are in
64868+ plugin/node/ */
64869+
64870+/* tree lookup cache
64871+ *
64872+ * The coord-by-key cache consists of a small list of recently accessed
64873+ * nodes maintained according to the LRU discipline. Before doing a real
64874+ * top-to-bottom tree traversal this cache is scanned for nodes that can
64875+ * contain the requested key.
64876+ *
64877+ * The efficiency of the coord cache depends heavily on locality of
64878+ * reference for tree accesses. Our user-level simulations show reasonably
64879+ * good hit ratios for the coord cache under most loads so far.
64880+ */
64881+
64882+/* Initialise coord cache slot */
64883+static void cbk_cache_init_slot(cbk_cache_slot *slot)
64884+{
64885+ assert("nikita-345", slot != NULL);
64886+
64887+ INIT_LIST_HEAD(&slot->lru);
64888+ slot->node = NULL;
64889+}
64890+
64891+/* Initialize coord cache */
64892+int cbk_cache_init(cbk_cache *cache /* cache to init */ )
64893+{
64894+ int i;
64895+
64896+ assert("nikita-346", cache != NULL);
64897+
64898+ cache->slot =
64899+ kmalloc(sizeof(cbk_cache_slot) * cache->nr_slots, GFP_KERNEL);
64900+ if (cache->slot == NULL)
64901+ return RETERR(-ENOMEM);
64902+
64903+ INIT_LIST_HEAD(&cache->lru);
64904+ for (i = 0; i < cache->nr_slots; ++i) {
64905+ cbk_cache_init_slot(cache->slot + i);
64906+ list_add_tail(&((cache->slot + i)->lru), &cache->lru);
64907+ }
64908+ rwlock_init(&cache->guard);
64909+ return 0;
64910+}
64911+
64912+/* free cbk cache data */
64913+void cbk_cache_done(cbk_cache * cache /* cache to release */ )
64914+{
64915+ assert("nikita-2493", cache != NULL);
64916+ if (cache->slot != NULL) {
64917+ kfree(cache->slot);
64918+ cache->slot = NULL;
64919+ }
64920+}
64921+
64922+/* macro to iterate over all cbk cache slots */
64923+#define for_all_slots(cache, slot) \
64924+ for ((slot) = list_entry((cache)->lru.next, cbk_cache_slot, lru); \
64925+ &(cache)->lru != &(slot)->lru; \
64926+ (slot) = list_entry(slot->lru.next, cbk_cache_slot, lru))
64927+
64928+
64929+#if REISER4_DEBUG
64930+/* this function assures that [cbk-cache-invariant] invariant holds */
64931+static int cbk_cache_invariant(const cbk_cache *cache)
64932+{
64933+ cbk_cache_slot *slot;
64934+ int result;
64935+ int unused;
64936+
64937+ if (cache->nr_slots == 0)
64938+ return 1;
64939+
64940+ assert("nikita-2469", cache != NULL);
64941+ unused = 0;
64942+ result = 1;
64943+ read_lock(&((cbk_cache *)cache)->guard);
64944+ for_all_slots(cache, slot) {
64945+		/* in the LRU, all `used' slots come first, followed by `unused' */
64946+ if (unused && (slot->node != NULL))
64947+ result = 0;
64948+ if (slot->node == NULL)
64949+ unused = 1;
64950+ else {
64951+ cbk_cache_slot *scan;
64952+
64953+ /* all cached nodes are different */
64954+ scan = slot;
64955+ while (result) {
64956+ scan = list_entry(scan->lru.next, cbk_cache_slot, lru);
64957+ if (&cache->lru == &scan->lru)
64958+ break;
64959+ if (slot->node == scan->node)
64960+ result = 0;
64961+ }
64962+ }
64963+ if (!result)
64964+ break;
64965+ }
64966+ read_unlock(&((cbk_cache *)cache)->guard);
64967+ return result;
64968+}
64969+
64970+#endif
64971+
64972+/* Remove references, if any, to @node from coord cache */
64973+void cbk_cache_invalidate(const znode * node /* node to remove from cache */ ,
64974+ reiser4_tree * tree /* tree to remove node from */ )
64975+{
64976+ cbk_cache_slot *slot;
64977+ cbk_cache *cache;
64978+ int i;
64979+
64980+ assert("nikita-350", node != NULL);
64981+ assert("nikita-1479", LOCK_CNT_GTZ(rw_locked_tree));
64982+
64983+ cache = &tree->cbk_cache;
64984+ assert("nikita-2470", cbk_cache_invariant(cache));
64985+
64986+ write_lock(&(cache->guard));
64987+ for (i = 0, slot = cache->slot; i < cache->nr_slots; ++i, ++slot) {
64988+ if (slot->node == node) {
64989+ list_move_tail(&slot->lru, &cache->lru);
64990+ slot->node = NULL;
64991+ break;
64992+ }
64993+ }
64994+ write_unlock(&(cache->guard));
64995+ assert("nikita-2471", cbk_cache_invariant(cache));
64996+}
64997+
64998+/* add information about "node" to the cbk-cache of the "tree". This
64999+ can actually be an update of an existing slot in the cache. */
65000+static void cbk_cache_add(const znode *node /* node to add to the cache */ )
65001+{
65002+ cbk_cache *cache;
65003+ cbk_cache_slot *slot;
65004+ int i;
65005+
65006+ assert("nikita-352", node != NULL);
65007+
65008+ cache = &znode_get_tree(node)->cbk_cache;
65009+ assert("nikita-2472", cbk_cache_invariant(cache));
65010+
65011+ if (cache->nr_slots == 0)
65012+ return;
65013+
65014+ write_lock(&(cache->guard));
65015+ /* find slot to update/add */
65016+ for (i = 0, slot = cache->slot; i < cache->nr_slots; ++i, ++slot) {
65017+ /* oops, this node is already in a cache */
65018+ if (slot->node == node)
65019+ break;
65020+ }
65021+ /* if all slots are used, reuse least recently used one */
65022+ if (i == cache->nr_slots) {
65023+ slot = list_entry(cache->lru.prev, cbk_cache_slot, lru);
65024+ slot->node = (znode *) node;
65025+ }
65026+ list_move(&slot->lru, &cache->lru);
65027+ write_unlock(&(cache->guard));
65028+ assert("nikita-2473", cbk_cache_invariant(cache));
65029+}
65030+
65031+static int setup_delimiting_keys(cbk_handle * h);
65032+static lookup_result coord_by_handle(cbk_handle * handle);
65033+static lookup_result traverse_tree(cbk_handle * h);
65034+static int cbk_cache_search(cbk_handle * h);
65035+
65036+static level_lookup_result cbk_level_lookup(cbk_handle * h);
65037+static level_lookup_result cbk_node_lookup(cbk_handle * h);
65038+
65039+/* helper functions */
65040+
65041+static void update_stale_dk(reiser4_tree * tree, znode * node);
65042+
65043+/* release parent node during traversal */
65044+static void put_parent(cbk_handle * h);
65045+/* check consistency of fields */
65046+static int sanity_check(cbk_handle * h);
65047+/* release resources in handle */
65048+static void hput(cbk_handle * h);
65049+
65050+static level_lookup_result search_to_left(cbk_handle * h);
65051+
65052+/* pack numerous (numberous I should say) arguments of coord_by_key() into
65053+ * cbk_handle */
65054+static cbk_handle *cbk_pack(cbk_handle * handle,
65055+ reiser4_tree * tree,
65056+ const reiser4_key * key,
65057+ coord_t * coord,
65058+ lock_handle * active_lh,
65059+ lock_handle * parent_lh,
65060+ znode_lock_mode lock_mode,
65061+ lookup_bias bias,
65062+ tree_level lock_level,
65063+ tree_level stop_level,
65064+ __u32 flags, ra_info_t * info)
65065+{
65066+ memset(handle, 0, sizeof *handle);
65067+
65068+ handle->tree = tree;
65069+ handle->key = key;
65070+ handle->lock_mode = lock_mode;
65071+ handle->bias = bias;
65072+ handle->lock_level = lock_level;
65073+ handle->stop_level = stop_level;
65074+ handle->coord = coord;
65075+ /* set flags. See comment in tree.h:cbk_flags */
65076+ handle->flags = flags | CBK_TRUST_DK | CBK_USE_CRABLOCK;
65077+
65078+ handle->active_lh = active_lh;
65079+ handle->parent_lh = parent_lh;
65080+ handle->ra_info = info;
65081+ return handle;
65082+}
65083+
65084+/* main tree lookup procedure
65085+
65086+ Check the coord cache. If the key we are looking for is not found there,
65087+ call cbk() to do the real tree traversal.
65088+
65089+ As we have extents on the twig level, @lock_level and @stop_level can
65090+ be different from LEAF_LEVEL and from each other.
65091+
65092+ A thread cannot keep any reiser4 locks (tree, znode, dk spin-locks, or
65093+ znode long-term locks) while calling this.
65094+*/
65095+lookup_result coord_by_key(reiser4_tree * tree /* tree to perform search
65096+ * in. Usually this tree is
65097+ * part of file-system
65098+ * super-block */ ,
65099+ const reiser4_key * key /* key to look for */ ,
65100+ coord_t * coord /* where to store found
65101+ * position in a tree. Fields
65102+ * in "coord" are only valid if
65103+ * coord_by_key() returned
65104+ * "CBK_COORD_FOUND" */ ,
65105+ lock_handle * lh, /* resulting lock handle */
65106+ znode_lock_mode lock_mode /* type of lookup we
65107+ * want on node. Pass
65108+ * ZNODE_READ_LOCK here
65109+ * if you only want to
65110+ * read item found and
65111+ * ZNODE_WRITE_LOCK if
65112+ * you want to modify
65113+ * it */ ,
65114+ lookup_bias bias /* what to return if coord
65115+ * with exactly the @key is
65116+ * not in the tree */ ,
65117+ tree_level lock_level /* tree level where to start
65118+ * taking @lock type of
65119+ * locks */ ,
65120+			   tree_level stop_level /* tree level to stop at.
65121+						   * Pass LEAF_LEVEL or
65122+						   * TWIG_LEVEL here. The item
65123+						   * being looked for has to be
65124+						   * between @lock_level and
65125+						   * @stop_level, inclusive */ ,
65126+ __u32 flags /* search flags */ ,
65127+ ra_info_t *
65128+ info
65129+ /* information about desired tree traversal readahead */
65130+ )
65131+{
65132+ cbk_handle handle;
65133+ lock_handle parent_lh;
65134+ lookup_result result;
65135+
65136+ init_lh(lh);
65137+ init_lh(&parent_lh);
65138+
65139+ assert("nikita-3023", schedulable());
65140+
65141+ assert("nikita-353", tree != NULL);
65142+ assert("nikita-354", key != NULL);
65143+ assert("nikita-355", coord != NULL);
65144+ assert("nikita-356", (bias == FIND_EXACT)
65145+ || (bias == FIND_MAX_NOT_MORE_THAN));
65146+ assert("nikita-357", stop_level >= LEAF_LEVEL);
65147+ /* no locks can be held during tree traversal */
65148+ assert("nikita-2104", lock_stack_isclean(get_current_lock_stack()));
65149+
65150+ cbk_pack(&handle,
65151+ tree,
65152+ key,
65153+ coord,
65154+ lh,
65155+ &parent_lh,
65156+ lock_mode, bias, lock_level, stop_level, flags, info);
65157+
65158+ result = coord_by_handle(&handle);
65159+ assert("nikita-3247",
65160+ ergo(!IS_CBKERR(result), coord->node == lh->node));
65161+ return result;
65162+}
65163+
65164+/* like coord_by_key(), but starts traversal from the vroot of @object rather
65165+ * than from the tree root. */
65166+lookup_result
65167+object_lookup(struct inode * object,
65168+ const reiser4_key * key,
65169+ coord_t * coord,
65170+ lock_handle * lh,
65171+ znode_lock_mode lock_mode,
65172+ lookup_bias bias,
65173+ tree_level lock_level,
65174+ tree_level stop_level, __u32 flags, ra_info_t * info)
65175+{
65176+ cbk_handle handle;
65177+ lock_handle parent_lh;
65178+ lookup_result result;
65179+
65180+ init_lh(lh);
65181+ init_lh(&parent_lh);
65182+
65183+ assert("nikita-3023", schedulable());
65184+
65185+ assert("nikita-354", key != NULL);
65186+ assert("nikita-355", coord != NULL);
65187+ assert("nikita-356", (bias == FIND_EXACT)
65188+ || (bias == FIND_MAX_NOT_MORE_THAN));
65189+ assert("nikita-357", stop_level >= LEAF_LEVEL);
65190+ /* no locks can be held during tree search by key */
65191+ assert("nikita-2104", lock_stack_isclean(get_current_lock_stack()));
65192+
65193+ cbk_pack(&handle,
65194+ object != NULL ? tree_by_inode(object) : current_tree,
65195+ key,
65196+ coord,
65197+ lh,
65198+ &parent_lh,
65199+ lock_mode, bias, lock_level, stop_level, flags, info);
65200+ handle.object = object;
65201+
65202+ result = coord_by_handle(&handle);
65203+ assert("nikita-3247",
65204+ ergo(!IS_CBKERR(result), coord->node == lh->node));
65205+ return result;
65206+}
65207+
65208+/* lookup by cbk_handle. Common part of coord_by_key() and object_lookup(). */
65209+static lookup_result coord_by_handle(cbk_handle * handle)
65210+{
65211+	/*
65212+	 * first check the cbk_cache (which is a look-aside cache for our
65213+	 * tree) and if this fails, start a traversal.
65214+	 */
65215+ /* first check whether "key" is in cache of recent lookups. */
65216+ if (cbk_cache_search(handle) == 0)
65217+ return handle->result;
65218+ else
65219+ return traverse_tree(handle);
65220+}
65221+
65222+/* Execute actor for each item (or unit, depending on @through_units_p),
65223+ starting from @coord, rightward, until either:
65224+
65225+ - end of the tree is reached
65226+ - unformatted node is met
65227+ - error occurred
65228+ - @actor returns 0 or less
65229+
65230+ An error code, or the last actor return value, is returned.
65231+
65232+ This is used by plugin/dir/hashed_dir.c:find_entry() to move through
65233+ sequences of entries with identical keys and the like.
65234+*/
65235+int iterate_tree(reiser4_tree * tree /* tree to scan */ ,
65236+ coord_t * coord /* coord to start from */ ,
65237+ lock_handle * lh /* lock handle to start with and to
65238+ * update along the way */ ,
65239+ tree_iterate_actor_t actor /* function to call on each
65240+ * item/unit */ ,
65241+ void *arg /* argument to pass to @actor */ ,
65242+ znode_lock_mode mode /* lock mode on scanned nodes */ ,
65243+ int through_units_p /* call @actor on each item or on each
65244+ * unit */ )
65245+{
65246+ int result;
65247+
65248+ assert("nikita-1143", tree != NULL);
65249+ assert("nikita-1145", coord != NULL);
65250+ assert("nikita-1146", lh != NULL);
65251+ assert("nikita-1147", actor != NULL);
65252+
65253+ result = zload(coord->node);
65254+ coord_clear_iplug(coord);
65255+ if (result != 0)
65256+ return result;
65257+ if (!coord_is_existing_unit(coord)) {
65258+ zrelse(coord->node);
65259+ return -ENOENT;
65260+ }
65261+ while ((result = actor(tree, coord, lh, arg)) > 0) {
65262+ /* move further */
65263+ if ((through_units_p && coord_next_unit(coord)) ||
65264+ (!through_units_p && coord_next_item(coord))) {
65265+ do {
65266+ lock_handle couple;
65267+
65268+ /* move to the next node */
65269+ init_lh(&couple);
65270+ result =
65271+ reiser4_get_right_neighbor(&couple,
65272+ coord->node,
65273+ (int)mode,
65274+ GN_CAN_USE_UPPER_LEVELS);
65275+ zrelse(coord->node);
65276+ if (result == 0) {
65277+
65278+ result = zload(couple.node);
65279+ if (result != 0) {
65280+ done_lh(&couple);
65281+ return result;
65282+ }
65283+
65284+ coord_init_first_unit(coord,
65285+ couple.node);
65286+ done_lh(lh);
65287+ move_lh(lh, &couple);
65288+ } else
65289+ return result;
65290+ } while (node_is_empty(coord->node));
65291+ }
65292+
65293+ assert("nikita-1149", coord_is_existing_unit(coord));
65294+ }
65295+ zrelse(coord->node);
65296+ return result;
65297+}
65298+
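+/*
+ * Illustrative sketch (editor's addition, not part of the original patch):
+ * a minimal iterate_tree() actor. Returning a value greater than 0
+ * continues the iteration; returning 0 or less stops it, and the last
+ * return value is propagated to the caller of iterate_tree().
+ * "example_count_actor" is a hypothetical name.
+ */
+#if 0
+static int example_count_actor(reiser4_tree *tree, coord_t *coord,
+			       lock_handle *lh, void *arg)
+{
+	int *budget = arg;
+
+	/* visit up to *budget items/units, then stop the iteration */
+	return --(*budget) > 0 ? 1 : 0;
+}
+#endif
+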
65299+/* return locked uber znode for @tree */
65300+int get_uber_znode(reiser4_tree * tree, znode_lock_mode mode,
65301+ znode_lock_request pri, lock_handle * lh)
65302+{
65303+ int result;
65304+
65305+ result = longterm_lock_znode(lh, tree->uber, mode, pri);
65306+ return result;
65307+}
65308+
65309+/* true if @key is strictly within @node
65310+
65311+ We are looking for a possibly non-unique key, and the item is at the edge
65312+ of @node. Maybe it is in the neighbor.
65313+*/
65314+static int znode_contains_key_strict(znode * node /* node to check key
65315+ * against */ ,
65316+ const reiser4_key *
65317+ key /* key to check */ ,
65318+ int isunique)
65319+{
65320+ int answer;
65321+
65322+ assert("nikita-1760", node != NULL);
65323+ assert("nikita-1722", key != NULL);
65324+
65325+ if (keyge(key, &node->rd_key))
65326+ return 0;
65327+
65328+ answer = keycmp(&node->ld_key, key);
65329+
65330+ if (isunique)
65331+ return answer != GREATER_THAN;
65332+ else
65333+ return answer == LESS_THAN;
65334+}
65335+
65336+/*
65337+ * Virtual Root (vroot) code.
65338+ *
65339+ * For a given file system object (e.g., regular file or directory) let's
65340+ * define its "virtual root" as the lowest node in the tree (that is, the
65341+ * one furthest from the tree root) such that all body items of said
65342+ * object are located in a tree rooted at this node.
65343+ *
65344+ * Once the vroot of an object is found, all tree lookups for items within
65345+ * the body of this object ("object lookups") can be started from its vroot
65346+ * rather than from the real root. This has the following advantages:
65347+ *
65348+ * 1. the number of nodes traversed during lookup (and, hence, the number
65349+ * of key comparisons made) decreases, and
65350+ *
65351+ * 2. contention on the tree root is decreased. The latter was actually the
65352+ * motivating reason behind vroot, because the spin lock of the root node,
65353+ * which is taken when acquiring a long-term lock on the root node, is the
65354+ * hottest lock in reiser4.
65355+ *
65356+ * How to find vroot.
65357+ *
65358+ * When the vroot of object F is not yet determined, all object lookups
65359+ * start from the root of the tree. At each tree level during traversal we
65360+ * have a node N such that the key we are looking for (which is a key inside
65361+ * the object's body) is located within N. In the function handle_vroot()
65362+ * called from cbk_level_lookup() we check whether N is a possible vroot for
65363+ * F. The check is trivial---if neither the leftmost nor the rightmost item
65364+ * of N belongs to F (and we already have the helpful ->owns_item() method
65365+ * of the object plugin for this), then N is a possible vroot of F. This, of
65366+ * course, relies on the assumption that each object occupies a contiguous
65367+ * range of keys in the tree.
65368+ *
65369+ * Thus, traversing the tree downward and checking each node as we go, we
65370+ * can find the lowest such node, which, by definition, is the vroot.
65371+ *
65372+ * How to track vroot.
65373+ *
65374+ * Nohow. If the actual vroot changes, the next object lookup will just
65375+ * restart from the actual tree root, refreshing the object's vroot along the way.
65376+ *
65377+ */
65378+
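+/*
+ * Illustrative sketch (editor's addition, not part of the original patch):
+ * a lookup that benefits from vroot. The caller only has to use
+ * object_lookup() (defined above) instead of coord_by_key(); vroot discovery
+ * in handle_vroot() below and its invalidation are transparent.
+ * "example_body_lookup" is a hypothetical name.
+ */
+#if 0
+static lookup_result example_body_lookup(struct inode *inode,
+					 const reiser4_key *key,
+					 coord_t *coord, lock_handle *lh)
+{
+	/* traversal starts from inode's vroot when one is known */
+	return object_lookup(inode, key, coord, lh, ZNODE_READ_LOCK,
+			     FIND_EXACT, LEAF_LEVEL, LEAF_LEVEL,
+			     CBK_UNIQUE, NULL /* no readahead info */);
+}
+#endif
+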
65379+/*
65380+ * Check whether @node is a possible vroot of @object.
65381+ */
65382+static void handle_vroot(struct inode *object, znode * node)
65383+{
65384+ file_plugin *fplug;
65385+ coord_t coord;
65386+
65387+ fplug = inode_file_plugin(object);
65388+ assert("nikita-3353", fplug != NULL);
65389+ assert("nikita-3354", fplug->owns_item != NULL);
65390+
65391+ if (unlikely(node_is_empty(node)))
65392+ return;
65393+
65394+ coord_init_first_unit(&coord, node);
65395+	/*
65396+	 * if the leftmost item of @node belongs to @object, we cannot be sure
65397+	 * that @node is the vroot of @object, because some items of @object
65398+	 * are probably in the sub-tree rooted at the left neighbor of @node.
65399+	 */
65400+ if (fplug->owns_item(object, &coord))
65401+ return;
65402+ coord_init_last_unit(&coord, node);
65403+ /* mutatis mutandis for the rightmost item */
65404+ if (fplug->owns_item(object, &coord))
65405+ return;
65406+ /* otherwise, @node is possible vroot of @object */
65407+ inode_set_vroot(object, node);
65408+}
65409+
65410+/*
65411+ * helper function used by traverse_tree() to start tree traversal not from
65412+ * the tree root, but from @h->object's vroot, if possible.
65413+ */
65414+static int prepare_object_lookup(cbk_handle * h)
65415+{
65416+ znode *vroot;
65417+ int result;
65418+
65419+ vroot = inode_get_vroot(h->object);
65420+ if (vroot == NULL) {
65421+ /*
65422+ * object doesn't have known vroot, start from real tree root.
65423+ */
65424+ return LOOKUP_CONT;
65425+ }
65426+
65427+ h->level = znode_get_level(vroot);
65428+ /* take a long-term lock on vroot */
65429+ h->result = longterm_lock_znode(h->active_lh, vroot,
65430+ cbk_lock_mode(h->level, h),
65431+ ZNODE_LOCK_LOPRI);
65432+ result = LOOKUP_REST;
65433+ if (h->result == 0) {
65434+ int isunique;
65435+ int inside;
65436+
65437+ isunique = h->flags & CBK_UNIQUE;
65438+ /* check that key is inside vroot */
65439+ read_lock_dk(h->tree);
65440+ inside = (znode_contains_key_strict(vroot, h->key, isunique) &&
65441+ !ZF_ISSET(vroot, JNODE_HEARD_BANSHEE));
65442+ read_unlock_dk(h->tree);
65443+ if (inside) {
65444+ h->result = zload(vroot);
65445+ if (h->result == 0) {
65446+ /* search for key in vroot. */
65447+ result = cbk_node_lookup(h);
65448+ zrelse(vroot); /*h->active_lh->node); */
65449+ if (h->active_lh->node != vroot) {
65450+ result = LOOKUP_REST;
65451+ } else if (result == LOOKUP_CONT) {
65452+ move_lh(h->parent_lh, h->active_lh);
65453+ h->flags &= ~CBK_DKSET;
65454+ }
65455+ }
65456+ }
65457+ } else
65458+ /* long-term locking failed. Restart. */
65459+ ;
65460+
65461+ zput(vroot);
65462+
65463+ if (IS_CBKERR(h->result) || result == LOOKUP_REST)
65464+ hput(h);
65465+ return result;
65466+}
65467+
65468+/* main function that handles common parts of tree traversal: starting
65469+ (fake znode handling), restarts, error handling, completion */
65470+static lookup_result traverse_tree(cbk_handle * h /* search handle */ )
65471+{
65472+ int done;
65473+ int iterations;
65474+ int vroot_used;
65475+
65476+ assert("nikita-365", h != NULL);
65477+ assert("nikita-366", h->tree != NULL);
65478+ assert("nikita-367", h->key != NULL);
65479+ assert("nikita-368", h->coord != NULL);
65480+ assert("nikita-369", (h->bias == FIND_EXACT)
65481+ || (h->bias == FIND_MAX_NOT_MORE_THAN));
65482+ assert("nikita-370", h->stop_level >= LEAF_LEVEL);
65483+ assert("nikita-2949", !(h->flags & CBK_DKSET));
65484+ assert("zam-355", lock_stack_isclean(get_current_lock_stack()));
65485+
65486+ done = 0;
65487+ iterations = 0;
65488+ vroot_used = 0;
65489+
65490+ /* loop for restarts */
65491+ restart:
65492+
65493+ assert("nikita-3024", schedulable());
65494+
65495+ h->result = CBK_COORD_FOUND;
65496+ /* connect_znode() needs it */
65497+ h->ld_key = *min_key();
65498+ h->rd_key = *max_key();
65499+ h->flags |= CBK_DKSET;
65500+ h->error = NULL;
65501+
65502+ if (!vroot_used && h->object != NULL) {
65503+ vroot_used = 1;
65504+ done = prepare_object_lookup(h);
65505+ if (done == LOOKUP_REST) {
65506+ goto restart;
65507+ } else if (done == LOOKUP_DONE)
65508+ return h->result;
65509+ }
65510+ if (h->parent_lh->node == NULL) {
65511+ done =
65512+ get_uber_znode(h->tree, ZNODE_READ_LOCK, ZNODE_LOCK_LOPRI,
65513+ h->parent_lh);
65514+
65515+ assert("nikita-1637", done != -E_DEADLOCK);
65516+
65517+ h->block = h->tree->root_block;
65518+ h->level = h->tree->height;
65519+ h->coord->node = h->parent_lh->node;
65520+
65521+ if (done != 0)
65522+ return done;
65523+ }
65524+
65525+ /* loop descending a tree */
65526+ while (!done) {
65527+
65528+ if (unlikely((iterations > REISER4_CBK_ITERATIONS_LIMIT) &&
65529+ IS_POW(iterations))) {
65530+ warning("nikita-1481", "Too many iterations: %i",
65531+ iterations);
65532+ print_key("key", h->key);
65533+ ++iterations;
65534+ } else if (unlikely(iterations > REISER4_MAX_CBK_ITERATIONS)) {
65535+ h->error =
65536+ "reiser-2018: Too many iterations. Tree corrupted, or (less likely) starvation occurring.";
65537+ h->result = RETERR(-EIO);
65538+ break;
65539+ }
65540+ switch (cbk_level_lookup(h)) {
65541+ case LOOKUP_CONT:
65542+ move_lh(h->parent_lh, h->active_lh);
65543+ continue;
65544+ default:
65545+ wrong_return_value("nikita-372", "cbk_level");
65546+ case LOOKUP_DONE:
65547+ done = 1;
65548+ break;
65549+ case LOOKUP_REST:
65550+ hput(h);
65551+			/* deadlock avoidance is a normal case. */
65552+ if (h->result != -E_DEADLOCK)
65553+ ++iterations;
65554+ preempt_point();
65555+ goto restart;
65556+ }
65557+ }
65558+ /* that's all. The rest is error handling */
65559+ if (unlikely(h->error != NULL)) {
65560+ warning("nikita-373", "%s: level: %i, "
65561+ "lock_level: %i, stop_level: %i "
65562+ "lock_mode: %s, bias: %s",
65563+ h->error, h->level, h->lock_level, h->stop_level,
65564+ lock_mode_name(h->lock_mode), bias_name(h->bias));
65565+ reiser4_print_address("block", &h->block);
65566+ print_key("key", h->key);
65567+ print_coord_content("coord", h->coord);
65568+ }
65569+ /* `unlikely' error case */
65570+ if (unlikely(IS_CBKERR(h->result))) {
65571+ /* failure. do cleanup */
65572+ hput(h);
65573+ } else {
65574+ assert("nikita-1605", WITH_DATA_RET
65575+ (h->coord->node, 1,
65576+ ergo((h->result == CBK_COORD_FOUND) &&
65577+ (h->bias == FIND_EXACT) &&
65578+ (!node_is_empty(h->coord->node)),
65579+ coord_is_existing_item(h->coord))));
65580+ }
65581+ return h->result;
65582+}
65583+
65584+/* find delimiting keys of child
65585+
65586+ Determine left and right delimiting keys for child pointed to by
65587+ @parent_coord.
65588+
65589+*/
65590+static void find_child_delimiting_keys(znode * parent /* parent znode, passed
65591+ * locked */ ,
65592+ const coord_t * parent_coord /* coord where
65593+ * pointer to
65594+ * child is
65595+ * stored */ ,
65596+ reiser4_key * ld /* where to store left
65597+ * delimiting key */ ,
65598+ reiser4_key * rd /* where to store right
65599+ * delimiting key */ )
65600+{
65601+ coord_t neighbor;
65602+
65603+ assert("nikita-1484", parent != NULL);
65604+ assert_rw_locked(&(znode_get_tree(parent)->dk_lock));
65605+
65606+ coord_dup(&neighbor, parent_coord);
65607+
65608+ if (neighbor.between == AT_UNIT)
65609+ /* imitate item ->lookup() behavior. */
65610+ neighbor.between = AFTER_UNIT;
65611+
65612+ if (coord_set_to_left(&neighbor) == 0)
65613+ unit_key_by_coord(&neighbor, ld);
65614+ else {
65615+ assert("nikita-14851", 0);
65616+ *ld = *znode_get_ld_key(parent);
65617+ }
65618+
65619+ coord_dup(&neighbor, parent_coord);
65620+ if (neighbor.between == AT_UNIT)
65621+ neighbor.between = AFTER_UNIT;
65622+ if (coord_set_to_right(&neighbor) == 0)
65623+ unit_key_by_coord(&neighbor, rd);
65624+ else
65625+ *rd = *znode_get_rd_key(parent);
65626+}
65627+
65628+/*
65629+ * setup delimiting keys for a child
65630+ *
65631+ * @parent parent node
65632+ *
65633+ * @coord location in @parent where pointer to @child is
65634+ *
65635+ * @child child node
65636+ */
65637+int
65638+set_child_delimiting_keys(znode * parent, const coord_t * coord, znode * child)
65639+{
65640+ reiser4_tree *tree;
65641+
65642+ assert("nikita-2952",
65643+ znode_get_level(parent) == znode_get_level(coord->node));
65644+
65645+ /* fast check without taking dk lock. This is safe, because
65646+ * JNODE_DKSET is never cleared once set. */
65647+ if (!ZF_ISSET(child, JNODE_DKSET)) {
65648+ tree = znode_get_tree(parent);
65649+ write_lock_dk(tree);
65650+ if (likely(!ZF_ISSET(child, JNODE_DKSET))) {
65651+ find_child_delimiting_keys(parent, coord,
65652+ &child->ld_key,
65653+ &child->rd_key);
65654+ ON_DEBUG(child->ld_key_version =
65655+ atomic_inc_return(&delim_key_version);
65656+ child->rd_key_version =
65657+ atomic_inc_return(&delim_key_version););
65658+ ZF_SET(child, JNODE_DKSET);
65659+ }
65660+ write_unlock_dk(tree);
65661+ return 1;
65662+ }
65663+ return 0;
65664+}
65665+
65666+/* Perform a tree lookup at one level. This is called from traverse_tree(),
65667+ which drives the lookup through the tree and calls cbk_node_lookup() to
65668+ perform the lookup within one node.
65669+
65670+ See comments in the code.
65671+*/
65672+static level_lookup_result cbk_level_lookup(cbk_handle * h /* search handle */ )
65673+{
65674+ int ret;
65675+ int setdk;
65676+ int ldkeyset = 0;
65677+ reiser4_key ldkey;
65678+ reiser4_key key;
65679+ znode *active;
65680+
65681+ assert("nikita-3025", schedulable());
65682+
65683+ /* acquire reference to @active node */
65684+ active =
65685+ zget(h->tree, &h->block, h->parent_lh->node, h->level, get_gfp_mask());
65686+
65687+ if (IS_ERR(active)) {
65688+ h->result = PTR_ERR(active);
65689+ return LOOKUP_DONE;
65690+ }
65691+
65692+ /* lock @active */
65693+ h->result = longterm_lock_znode(h->active_lh,
65694+ active,
65695+ cbk_lock_mode(h->level, h),
65696+ ZNODE_LOCK_LOPRI);
65697+ /* longterm_lock_znode() acquires additional reference to znode (which
65698+ will be later released by longterm_unlock_znode()). Release
65699+ reference acquired by zget().
65700+ */
65701+ zput(active);
65702+ if (unlikely(h->result != 0))
65703+ goto fail_or_restart;
65704+
65705+ setdk = 0;
65706+ /* if @active is accessed for the first time, setup delimiting keys on
65707+ it. Delimiting keys are taken from the parent node. See
65708+ setup_delimiting_keys() for details.
65709+ */
65710+ if (h->flags & CBK_DKSET) {
65711+ setdk = setup_delimiting_keys(h);
65712+ h->flags &= ~CBK_DKSET;
65713+ } else {
65714+ znode *parent;
65715+
65716+ parent = h->parent_lh->node;
65717+ h->result = zload(parent);
65718+ if (unlikely(h->result != 0))
65719+ goto fail_or_restart;
65720+
65721+ if (!ZF_ISSET(active, JNODE_DKSET))
65722+ setdk = set_child_delimiting_keys(parent,
65723+ h->coord, active);
65724+ else {
65725+ read_lock_dk(h->tree);
65726+ find_child_delimiting_keys(parent, h->coord, &ldkey,
65727+ &key);
65728+ read_unlock_dk(h->tree);
65729+ ldkeyset = 1;
65730+ }
65731+ zrelse(parent);
65732+ }
65733+
65734+	/* this is an ugly kludge. Reminder: it is necessary because the
65735+	   ->lookup() method returns a coord with the ->between field possibly
65736+	   set to something different from AT_UNIT.
65737+	 */
65738+ h->coord->between = AT_UNIT;
65739+
65740+ if (znode_just_created(active) && (h->coord->node != NULL)) {
65741+ write_lock_tree(h->tree);
65742+ /* if we are going to load znode right now, setup
65743+ ->in_parent: coord where pointer to this node is stored in
65744+ parent.
65745+ */
65746+ coord_to_parent_coord(h->coord, &active->in_parent);
65747+ write_unlock_tree(h->tree);
65748+ }
65749+
65750+ /* check connectedness without holding tree lock---false negatives
65751+ * will be re-checked by connect_znode(), and false positives are
65752+ * impossible---@active cannot suddenly turn into unconnected
65753+ * state. */
65754+ if (!znode_is_connected(active)) {
65755+ h->result = connect_znode(h->coord, active);
65756+ if (unlikely(h->result != 0)) {
65757+ put_parent(h);
65758+ goto fail_or_restart;
65759+ }
65760+ }
65761+
65762+ jload_prefetch(ZJNODE(active));
65763+
65764+ if (setdk)
65765+ update_stale_dk(h->tree, active);
65766+
65767+	/* put_parent() cannot be called earlier, because connect_znode()
65768+	   assumes the parent node is referenced */
65769+ put_parent(h);
65770+
65771+ if ((!znode_contains_key_lock(active, h->key) &&
65772+ (h->flags & CBK_TRUST_DK))
65773+ || ZF_ISSET(active, JNODE_HEARD_BANSHEE)) {
65774+		/* 1. the key was moved out of this node while this thread was
65775+		   waiting for the lock. Restart. A more elaborate solution
65776+		   would be to determine where the key moved (to the left, or
65777+		   to the right) and to follow it through sibling pointers.
65778+
65779+		   2. or, the node itself is going to be removed from the
65780+		   tree. Release the lock and restart.
65781+		 */
65782+ h->result = -E_REPEAT;
65783+ }
65784+ if (h->result == -E_REPEAT)
65785+ return LOOKUP_REST;
65786+
65787+ h->result = zload_ra(active, h->ra_info);
65788+ if (h->result) {
65789+ return LOOKUP_DONE;
65790+ }
65791+
65792+ /* sanity checks */
65793+ if (sanity_check(h)) {
65794+ zrelse(active);
65795+ return LOOKUP_DONE;
65796+ }
65797+
65798+	/* check that the key of the leftmost item in @active is the same as
65799+	 * in its parent */
65800+ if (ldkeyset && !node_is_empty(active) &&
65801+ !keyeq(leftmost_key_in_node(active, &key), &ldkey)) {
65802+ warning("vs-3533", "Keys are inconsistent. Fsck?");
65803+ print_key("inparent", &ldkey);
65804+ print_key("inchild", &key);
65805+ h->result = RETERR(-EIO);
65806+ zrelse(active);
65807+ return LOOKUP_DONE;
65808+ }
65809+
65810+ if (h->object != NULL)
65811+ handle_vroot(h->object, active);
65812+
65813+ ret = cbk_node_lookup(h);
65814+
65815+ /* h->active_lh->node might change, but active is yet to be zrelsed */
65816+ zrelse(active);
65817+
65818+ return ret;
65819+
65820+ fail_or_restart:
65821+ if (h->result == -E_DEADLOCK)
65822+ return LOOKUP_REST;
65823+ return LOOKUP_DONE;
65824+}
65825+
65826+#if REISER4_DEBUG
65827+/* check left and right delimiting keys of a znode */
65828+void check_dkeys(znode * node)
65829+{
65830+ znode *left;
65831+ znode *right;
65832+
65833+ read_lock_tree(current_tree);
65834+ read_lock_dk(current_tree);
65835+
65836+ assert("vs-1710", znode_is_any_locked(node));
65837+ assert("vs-1197",
65838+ !keygt(znode_get_ld_key(node), znode_get_rd_key(node)));
65839+
65840+ left = node->left;
65841+ right = node->right;
65842+
65843+ if (ZF_ISSET(node, JNODE_LEFT_CONNECTED) && ZF_ISSET(node, JNODE_DKSET)
65844+ && left != NULL && ZF_ISSET(left, JNODE_DKSET))
65845+		/* check the left neighbor. Note that the left neighbor is not
65846+		   locked, so its delimiting keys might therefore be wrong */
65847+ assert("vs-1198",
65848+ (keyeq(znode_get_rd_key(left), znode_get_ld_key(node))
65849+ || ZF_ISSET(left, JNODE_HEARD_BANSHEE)));
65850+
65851+ if (ZF_ISSET(node, JNODE_RIGHT_CONNECTED) && ZF_ISSET(node, JNODE_DKSET)
65852+ && right != NULL && ZF_ISSET(right, JNODE_DKSET))
65853+		/* check the right neighbor. Note that the right neighbor is
65854+		   not locked, so its delimiting keys might therefore be wrong */
65855+ assert("vs-1199",
65856+ (keyeq(znode_get_rd_key(node), znode_get_ld_key(right))
65857+ || ZF_ISSET(right, JNODE_HEARD_BANSHEE)));
65858+
65859+ read_unlock_dk(current_tree);
65860+ read_unlock_tree(current_tree);
65861+}
65862+#endif
65863+
65864+/* true if @key is left delimiting key of @node */
65865+static int key_is_ld(znode * node, const reiser4_key * key)
65866+{
65867+ int ld;
65868+
65869+ assert("nikita-1716", node != NULL);
65870+ assert("nikita-1758", key != NULL);
65871+
65872+ read_lock_dk(znode_get_tree(node));
65873+ assert("nikita-1759", znode_contains_key(node, key));
65874+ ld = keyeq(znode_get_ld_key(node), key);
65875+ read_unlock_dk(znode_get_tree(node));
65876+ return ld;
65877+}
65878+
65879+/* Process one node during tree traversal.
65880+
65881+ This is called by cbk_level_lookup(). */
65882+static level_lookup_result cbk_node_lookup(cbk_handle * h /* search handle */ )
65883+{
65884+ /* node plugin of @active */
65885+ node_plugin *nplug;
65886+ /* item plugin of item that was found */
65887+ item_plugin *iplug;
65888+ /* search bias */
65889+ lookup_bias node_bias;
65890+ /* node we are operating upon */
65891+ znode *active;
65892+ /* tree we are searching in */
65893+ reiser4_tree *tree;
65894+ /* result */
65895+ int result;
65896+
65897+ assert("nikita-379", h != NULL);
65898+
65899+ active = h->active_lh->node;
65900+ tree = h->tree;
65901+
65902+ nplug = active->nplug;
65903+ assert("nikita-380", nplug != NULL);
65904+
65905+ ON_DEBUG(check_dkeys(active));
65906+
65907+ /* return item from "active" node with maximal key not greater than
65908+ "key" */
65909+ node_bias = h->bias;
65910+ result = nplug->lookup(active, h->key, node_bias, h->coord);
65911+ if (unlikely(result != NS_FOUND && result != NS_NOT_FOUND)) {
65912+ /* error occurred */
65913+ h->result = result;
65914+ return LOOKUP_DONE;
65915+ }
65916+ if (h->level == h->stop_level) {
65917+ /* welcome to the stop level */
65918+ assert("nikita-381", h->coord->node == active);
65919+ if (result == NS_FOUND) {
65920+ /* success of tree lookup */
65921+ if (!(h->flags & CBK_UNIQUE)
65922+ && key_is_ld(active, h->key)) {
65923+ return search_to_left(h);
65924+ } else
65925+ h->result = CBK_COORD_FOUND;
65926+ } else {
65927+ h->result = CBK_COORD_NOTFOUND;
65928+ }
65929+ if (!(h->flags & CBK_IN_CACHE))
65930+ cbk_cache_add(active);
65931+ return LOOKUP_DONE;
65932+ }
65933+
65934+ if (h->level > TWIG_LEVEL && result == NS_NOT_FOUND) {
65935+ h->error = "not found on internal node";
65936+ h->result = result;
65937+ return LOOKUP_DONE;
65938+ }
65939+
65940+ assert("vs-361", h->level > h->stop_level);
65941+
65942+ if (handle_eottl(h, &result)) {
65943+ assert("vs-1674", (result == LOOKUP_DONE ||
65944+ result == LOOKUP_REST));
65945+ return result;
65946+ }
65947+
65948+ /* go down to next level */
65949+ check_me("vs-12", zload(h->coord->node) == 0);
65950+ assert("nikita-2116", item_is_internal(h->coord));
65951+ iplug = item_plugin_by_coord(h->coord);
65952+ iplug->s.internal.down_link(h->coord, h->key, &h->block);
65953+ zrelse(h->coord->node);
65954+ --h->level;
65955+ return LOOKUP_CONT; /* continue */
65956+}
65957+
65958+/* scan cbk_cache slots looking for a match for @h */
65959+static int cbk_cache_scan_slots(cbk_handle * h /* cbk handle */ )
65960+{
65961+ level_lookup_result llr;
65962+ znode *node;
65963+ reiser4_tree *tree;
65964+ cbk_cache_slot *slot;
65965+ cbk_cache *cache;
65966+ tree_level level;
65967+ int isunique;
65968+ const reiser4_key *key;
65969+ int result;
65970+
65971+ assert("nikita-1317", h != NULL);
65972+ assert("nikita-1315", h->tree != NULL);
65973+ assert("nikita-1316", h->key != NULL);
65974+
65975+ tree = h->tree;
65976+ cache = &tree->cbk_cache;
65977+ if (cache->nr_slots == 0)
65978+ /* size of cbk cache was set to 0 by mount time option. */
65979+ return RETERR(-ENOENT);
65980+
65981+ assert("nikita-2474", cbk_cache_invariant(cache));
65982+ node = NULL; /* to keep gcc happy */
65983+ level = h->level;
65984+ key = h->key;
65985+ isunique = h->flags & CBK_UNIQUE;
65986+ result = RETERR(-ENOENT);
65987+
65988+	/*
65989+	 * this is a time-critical function and dragons had, hence, been
65990+	 * settled here.
65991+	 *
65992+	 * The loop below scans cbk cache slots trying to find a matching node
65993+	 * with a suitable range of delimiting keys, located at h->level.
65994+	 *
65995+	 * The scan is done under the cbk cache spin lock that protects the
65996+	 * slot->node pointers. If a suitable node is found we want to pin it
65997+	 * in memory. But slot->node can point to a node with x_count 0
65998+	 * (unreferenced). Such a node can be recycled at any moment, or can
65999+	 * already be in the process of being recycled (within jput()).
66000+	 *
66001+	 * As we found the node in the cbk cache, it means that jput() hasn't
66002+	 * yet called cbk_cache_invalidate().
66003+	 *
66004+	 * We acquire a reference to the node without holding the tree lock,
66005+	 * and later check the node's RIP bit. This avoids races with jput().
66006+	 */
66007+
66008+ rcu_read_lock();
66009+ read_lock(&((cbk_cache *)cache)->guard);
66010+
66011+ slot = list_entry(cache->lru.next, cbk_cache_slot, lru);
66012+ slot = list_entry(slot->lru.prev, cbk_cache_slot, lru);
66013+ BUG_ON(&slot->lru != &cache->lru);/*????*/
66014+ while (1) {
66015+
66016+ slot = list_entry(slot->lru.next, cbk_cache_slot, lru);
66017+
66018+ if (&cache->lru != &slot->lru)
66019+ node = slot->node;
66020+ else
66021+ node = NULL;
66022+
66023+ if (unlikely(node == NULL))
66024+ break;
66025+
66026+ /*
66027+ * this is (hopefully) the only place in the code where we are
66028+ * working with delimiting keys without holding dk lock. This
66029+		 * is fine here, because this is only a "guess" anyway---keys
66030+ * are rechecked under dk lock below.
66031+ */
66032+ if (znode_get_level(node) == level &&
66033+ /* min_key < key < max_key */
66034+ znode_contains_key_strict(node, key, isunique)) {
66035+ zref(node);
66036+ result = 0;
66037+ spin_lock_prefetch(&tree->tree_lock);
66038+ break;
66039+ }
66040+ }
66041+ read_unlock(&((cbk_cache *)cache)->guard);
66042+
66043+ assert("nikita-2475", cbk_cache_invariant(cache));
66044+
66045+ if (unlikely(result == 0 && ZF_ISSET(node, JNODE_RIP)))
66046+ result = -ENOENT;
66047+
66048+ rcu_read_unlock();
66049+
66050+ if (result != 0) {
66051+ h->result = CBK_COORD_NOTFOUND;
66052+ return RETERR(-ENOENT);
66053+ }
66054+
66055+ result =
66056+ longterm_lock_znode(h->active_lh, node, cbk_lock_mode(level, h),
66057+ ZNODE_LOCK_LOPRI);
66058+ zput(node);
66059+ if (result != 0)
66060+ return result;
66061+ result = zload(node);
66062+ if (result != 0)
66063+ return result;
66064+
66065+ /* recheck keys */
66066+ read_lock_dk(tree);
66067+ result = (znode_contains_key_strict(node, key, isunique) &&
66068+ !ZF_ISSET(node, JNODE_HEARD_BANSHEE));
66069+ read_unlock_dk(tree);
66070+ if (result) {
66071+ /* do lookup inside node */
66072+ llr = cbk_node_lookup(h);
66073+ /* if cbk_node_lookup() wandered to another node (due to eottl
66074+ or non-unique keys), adjust @node */
66075+ /*node = h->active_lh->node; */
66076+
66077+ if (llr != LOOKUP_DONE) {
66078+ /* restart or continue on the next level */
66079+ result = RETERR(-ENOENT);
66080+ } else if (IS_CBKERR(h->result))
66081+ /* io or oom */
66082+ result = RETERR(-ENOENT);
66083+ else {
66084+ /* good. Either item found or definitely not found. */
66085+ result = 0;
66086+
66087+ write_lock(&(cache->guard));
66088+ if (slot->node == h->active_lh->node /*node */ ) {
66089+ /* if this node is still in cbk cache---move
66090+ its slot to the head of the LRU list. */
66091+ list_move(&slot->lru, &cache->lru);
66092+ }
66093+ write_unlock(&(cache->guard));
66094+ }
66095+ } else {
66096+ /* race. While this thread was waiting for the lock, node was
66097+ rebalanced and item we are looking for, shifted out of it
66098+ (if it ever was here).
66099+
66100+		   Continuing scanning is almost hopeless: the node the key
66101+		   range was moved to is almost certainly at the beginning of
66102+		   the LRU list at this time, because it's hot, but restarting
66103+ scanning from the very beginning is complex. Just return,
66104+ so that cbk() will be performed. This is not that
66105+ important, because such races should be rare. Are they?
66106+ */
66107+ result = RETERR(-ENOENT); /* -ERAUGHT */
66108+ }
66109+ zrelse(node);
66110+ assert("nikita-2476", cbk_cache_invariant(cache));
66111+ return result;
66112+}
66113+
66114+/* look for item with given key in the coord cache
66115+
66116+ This function, called by coord_by_key(), scans "coord cache" (&cbk_cache)
66117+   which is a small LRU list of znodes accessed lately. For each znode in
66118+   this list, it checks whether the key we are looking for fits into the key
66119+   range covered by that node. If so, and if in addition the node lies at an
66120+   allowed level (this is to handle extents on a twig level), the node is
66121+   locked, and a lookup inside it is performed.
66122+
66123+   We need a measurement of the cost of this cache search compared to the cost
66124+   of coord_by_key().
66125+
66126+*/
66127+static int cbk_cache_search(cbk_handle * h /* cbk handle */ )
66128+{
66129+ int result = 0;
66130+ tree_level level;
66131+
66132+	/* add CBK_IN_CACHE to the handle flags. This means that
66133+	 * cbk_node_lookup() knows that the cbk_cache is being scanned and
66134+	 * will not re-add the found node to the cache. */
66135+ h->flags |= CBK_IN_CACHE;
66136+ for (level = h->stop_level; level <= h->lock_level; ++level) {
66137+ h->level = level;
66138+ result = cbk_cache_scan_slots(h);
66139+ if (result != 0) {
66140+ done_lh(h->active_lh);
66141+ done_lh(h->parent_lh);
66142+ } else {
66143+ assert("nikita-1319", !IS_CBKERR(h->result));
66144+ break;
66145+ }
66146+ }
66147+ h->flags &= ~CBK_IN_CACHE;
66148+ return result;
66149+}
66150+
66151+/* type of lock we want to obtain during tree traversal. On stop level
66152+ we want type of lock user asked for, on upper levels: read lock. */
66153+znode_lock_mode cbk_lock_mode(tree_level level, cbk_handle * h)
66154+{
66155+ assert("nikita-382", h != NULL);
66156+
66157+ return (level <= h->lock_level) ? h->lock_mode : ZNODE_READ_LOCK;
66158+}
66159+
66160+/* update outdated delimiting keys */
66161+static void stale_dk(reiser4_tree * tree, znode * node)
66162+{
66163+ znode *right;
66164+
66165+ read_lock_tree(tree);
66166+ write_lock_dk(tree);
66167+ right = node->right;
66168+
66169+ if (ZF_ISSET(node, JNODE_RIGHT_CONNECTED) &&
66170+ right && ZF_ISSET(right, JNODE_DKSET) &&
66171+ !keyeq(znode_get_rd_key(node), znode_get_ld_key(right)))
66172+ znode_set_rd_key(node, znode_get_ld_key(right));
66173+
66174+ write_unlock_dk(tree);
66175+ read_unlock_tree(tree);
66176+}
66177+
66178+/* check for possibly outdated delimiting keys, and update them if
66179+ * necessary. */
66180+static void update_stale_dk(reiser4_tree * tree, znode * node)
66181+{
66182+ znode *right;
66183+ reiser4_key rd;
66184+
66185+ read_lock_tree(tree);
66186+ read_lock_dk(tree);
66187+ rd = *znode_get_rd_key(node);
66188+ right = node->right;
66189+ if (unlikely(ZF_ISSET(node, JNODE_RIGHT_CONNECTED) &&
66190+ right && ZF_ISSET(right, JNODE_DKSET) &&
66191+ !keyeq(&rd, znode_get_ld_key(right)))) {
66192+ assert("nikita-38211", ZF_ISSET(node, JNODE_DKSET));
66193+ read_unlock_dk(tree);
66194+ read_unlock_tree(tree);
66195+ stale_dk(tree, node);
66196+ return;
66197+ }
66198+ read_unlock_dk(tree);
66199+ read_unlock_tree(tree);
66200+}
66201+
66202+/*
66203+ * handle searches for a non-unique key.
66204+ *
66205+ * Suppose that we are looking for an item with possibly non-unique key 100.
66206+ *
66207+ * Root node contains two pointers: one to a node with left delimiting key 0,
66208+ * and another to a node with left delimiting key 100. The item we are
66209+ * interested in may well be in the sub-tree rooted at the first pointer.
66210+ *
66211+ * To handle this, search_to_left() is called when the search reaches the
66212+ * stop level. It checks whether it is _possible_ that the item we are looking
66213+ * for is in the left neighbor (this can be done by comparing delimiting keys)
66214+ * and, if so, tries to lock the left neighbor (this is a low priority lock,
66215+ * so it can deadlock; tree traversal is simply restarted if it does) and then
66216+ * checks whether the left neighbor actually contains items with our key.
66217+ *
66218+ * Note that this is done on the stop level only. It is possible to try such
66219+ * left-check on each level, but as duplicate keys are supposed to be rare
66220+ * (very unlikely that more than one node is completely filled with items with
66221+ * duplicate keys), it is cheaper to scan to the left on the stop level once.
66222+ *
66223+ */
66224+static level_lookup_result search_to_left(cbk_handle * h /* search handle */ )
66225+{
66226+ level_lookup_result result;
66227+ coord_t *coord;
66228+ znode *node;
66229+ znode *neighbor;
66230+
66231+ lock_handle lh;
66232+
66233+ assert("nikita-1761", h != NULL);
66234+ assert("nikita-1762", h->level == h->stop_level);
66235+
66236+ init_lh(&lh);
66237+ coord = h->coord;
66238+ node = h->active_lh->node;
66239+ assert("nikita-1763", coord_is_leftmost_unit(coord));
66240+
66241+ h->result =
66242+ reiser4_get_left_neighbor(&lh, node, (int)h->lock_mode,
66243+ GN_CAN_USE_UPPER_LEVELS);
66244+ neighbor = NULL;
66245+ switch (h->result) {
66246+ case -E_DEADLOCK:
66247+ result = LOOKUP_REST;
66248+ break;
66249+ case 0:{
66250+ node_plugin *nplug;
66251+ coord_t crd;
66252+ lookup_bias bias;
66253+
66254+ neighbor = lh.node;
66255+ h->result = zload(neighbor);
66256+ if (h->result != 0) {
66257+ result = LOOKUP_DONE;
66258+ break;
66259+ }
66260+
66261+ nplug = neighbor->nplug;
66262+
66263+ coord_init_zero(&crd);
66264+ bias = h->bias;
66265+ h->bias = FIND_EXACT;
66266+ h->result =
66267+ nplug->lookup(neighbor, h->key, h->bias, &crd);
66268+ h->bias = bias;
66269+
66270+ if (h->result == NS_NOT_FOUND) {
66271+ case -E_NO_NEIGHBOR:
66272+ h->result = CBK_COORD_FOUND;
66273+ if (!(h->flags & CBK_IN_CACHE))
66274+ cbk_cache_add(node);
66275+ default: /* some other error */
66276+ result = LOOKUP_DONE;
66277+ } else if (h->result == NS_FOUND) {
66278+ read_lock_dk(znode_get_tree(neighbor));
66279+ h->rd_key = *znode_get_ld_key(node);
66280+ leftmost_key_in_node(neighbor, &h->ld_key);
66281+ read_unlock_dk(znode_get_tree(neighbor));
66282+ h->flags |= CBK_DKSET;
66283+
66284+ h->block = *znode_get_block(neighbor);
66285+ /* clear coord -> node so that cbk_level_lookup()
66286+ wouldn't overwrite parent hint in neighbor.
66287+
66288+ Parent hint was set up by
66289+ reiser4_get_left_neighbor()
66290+ */
66291+ /* FIXME: why do we have to spinlock here? */
66292+ write_lock_tree(znode_get_tree(neighbor));
66293+ h->coord->node = NULL;
66294+ write_unlock_tree(znode_get_tree(neighbor));
66295+ result = LOOKUP_CONT;
66296+ } else {
66297+ result = LOOKUP_DONE;
66298+ }
66299+ if (neighbor != NULL)
66300+ zrelse(neighbor);
66301+ }
66302+ }
66303+ done_lh(&lh);
66304+ return result;
66305+}
66306+
66307+/* debugging aid: return symbolic name of search bias */
66308+static const char *bias_name(lookup_bias bias /* bias to get name of */ )
66309+{
66310+ if (bias == FIND_EXACT)
66311+ return "exact";
66312+ else if (bias == FIND_MAX_NOT_MORE_THAN)
66313+ return "left-slant";
66314+/* else if( bias == RIGHT_SLANT_BIAS ) */
66315+/* return "right-bias"; */
66316+ else {
66317+ static char buf[30];
66318+
66319+ sprintf(buf, "unknown: %i", bias);
66320+ return buf;
66321+ }
66322+}
66323+
66324+#if REISER4_DEBUG
66325+/* debugging aid: print human readable information about @p */
66326+void print_coord_content(const char *prefix /* prefix to print */ ,
66327+ coord_t * p /* coord to print */ )
66328+{
66329+ reiser4_key key;
66330+
66331+ if (p == NULL) {
66332+ printk("%s: null\n", prefix);
66333+ return;
66334+ }
66335+ if ((p->node != NULL) && znode_is_loaded(p->node)
66336+ && coord_is_existing_item(p))
66337+ printk("%s: data: %p, length: %i\n", prefix,
66338+ item_body_by_coord(p), item_length_by_coord(p));
66339+ if (znode_is_loaded(p->node)) {
66340+ item_key_by_coord(p, &key);
66341+ print_key(prefix, &key);
66342+ }
66343+}
66344+
66345+/* debugging aid: print human readable information about @block */
66346+void reiser4_print_address(const char *prefix /* prefix to print */ ,
66347+ const reiser4_block_nr * block /* block number to print */ )
66348+{
66349+ printk("%s: %s\n", prefix, sprint_address(block));
66350+}
66351+#endif
66352+
66353+/* return string containing human readable representation of @block */
66354+char *sprint_address(const reiser4_block_nr *
66355+ block /* block number to print */ )
66356+{
66357+ static char address[30];
66358+
66359+ if (block == NULL)
66360+ sprintf(address, "null");
66361+ else if (blocknr_is_fake(block))
66362+ sprintf(address, "%llx", (unsigned long long)(*block));
66363+ else
66364+ sprintf(address, "%llu", (unsigned long long)(*block));
66365+ return address;
66366+}
66367+
66368+/* release parent node during traversal */
66369+static void put_parent(cbk_handle * h /* search handle */ )
66370+{
66371+ assert("nikita-383", h != NULL);
66372+ if (h->parent_lh->node != NULL) {
66373+ longterm_unlock_znode(h->parent_lh);
66374+ }
66375+}
66376+
66377+/* helper function used by coord_by_key(): release reference to parent znode
66378+ stored in handle before processing its child. */
66379+static void hput(cbk_handle * h /* search handle */ )
66380+{
66381+ assert("nikita-385", h != NULL);
66382+ done_lh(h->parent_lh);
66383+ done_lh(h->active_lh);
66384+}
66385+
66386+/* Helper function used by cbk(): update delimiting keys of child node (stored
66387+ in h->active_lh->node) using key taken from parent on the parent level. */
66388+static int setup_delimiting_keys(cbk_handle * h /* search handle */ )
66389+{
66390+ znode *active;
66391+ reiser4_tree *tree;
66392+
66393+ assert("nikita-1088", h != NULL);
66394+
66395+ active = h->active_lh->node;
66396+
66397+ /* fast check without taking dk lock. This is safe, because
66398+ * JNODE_DKSET is never cleared once set. */
66399+ if (!ZF_ISSET(active, JNODE_DKSET)) {
66400+ tree = znode_get_tree(active);
66401+ write_lock_dk(tree);
66402+ if (!ZF_ISSET(active, JNODE_DKSET)) {
66403+ znode_set_ld_key(active, &h->ld_key);
66404+ znode_set_rd_key(active, &h->rd_key);
66405+ ZF_SET(active, JNODE_DKSET);
66406+ }
66407+ write_unlock_dk(tree);
66408+ return 1;
66409+ }
66410+ return 0;
66411+}
66412+
66413+/* true if @block makes sense for the @tree. Used to detect corrupted node
66414+ * pointers */
66415+static int
66416+block_nr_is_correct(reiser4_block_nr * block /* block number to check */ ,
66417+ reiser4_tree * tree /* tree to check against */ )
66418+{
66419+ assert("nikita-757", block != NULL);
66420+ assert("nikita-758", tree != NULL);
66421+
66422+ /* check to see if it exceeds the size of the device. */
66423+ return reiser4_blocknr_is_sane_for(tree->super, block);
66424+}
66425+
66426+/* check consistency of fields */
66427+static int sanity_check(cbk_handle * h /* search handle */ )
66428+{
66429+ assert("nikita-384", h != NULL);
66430+
66431+ if (h->level < h->stop_level) {
66432+ h->error = "Buried under leaves";
66433+ h->result = RETERR(-EIO);
66434+ return LOOKUP_DONE;
66435+ } else if (!block_nr_is_correct(&h->block, h->tree)) {
66436+ h->error = "bad block number";
66437+ h->result = RETERR(-EIO);
66438+ return LOOKUP_DONE;
66439+ } else
66440+ return 0;
66441+}
66442+
66443+/* Make Linus happy.
66444+ Local variables:
66445+ c-indentation-style: "K&R"
66446+ mode-name: "LC"
66447+ c-basic-offset: 8
66448+ tab-width: 8
66449+ fill-column: 120
66450+ scroll-step: 1
66451+ End:
66452+*/
66453Index: linux-2.6.16/fs/reiser4/status_flags.c
66454===================================================================
66455--- /dev/null
66456+++ linux-2.6.16/fs/reiser4/status_flags.c
66457@@ -0,0 +1,176 @@
66458+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
66459+ * reiser4/README */
66460+
66461+/* Functions that deal with the reiser4 status block: query the status and update it, if needed */
66462+
66463+#include <linux/bio.h>
66464+#include <linux/highmem.h>
66465+#include <linux/fs.h>
66466+#include <linux/blkdev.h>
66467+#include "debug.h"
66468+#include "dformat.h"
66469+#include "status_flags.h"
66470+#include "super.h"
66471+
66472+/* This is our end I/O handler; it marks the page uptodate if I/O was successful. It also
66473+   unconditionally unlocks the page, so we can see that I/O was done.
66474+   We do not free the bio, because we hope to reuse it. */
66475+static int reiser4_status_endio(struct bio *bio, unsigned int bytes_done,
66476+ int err)
66477+{
66478+ if (bio->bi_size)
66479+ return 1;
66480+
66481+ if (test_bit(BIO_UPTODATE, &bio->bi_flags)) {
66482+ SetPageUptodate(bio->bi_io_vec->bv_page);
66483+ } else {
66484+ ClearPageUptodate(bio->bi_io_vec->bv_page);
66485+ SetPageError(bio->bi_io_vec->bv_page);
66486+ }
66487+ unlock_page(bio->bi_io_vec->bv_page);
66488+ return 0;
66489+}
66490+
66491+/* Initialise the status block code. This is expected to be called from the
66492+   disk format code. The block parameter is where the status block lives. */
66493+int reiser4_status_init(reiser4_block_nr block)
66494+{
66495+ struct super_block *sb = reiser4_get_current_sb();
66496+ struct reiser4_status *statuspage;
66497+ struct bio *bio;
66498+ struct page *page;
66499+
66500+
66501+ get_super_private(sb)->status_page = NULL;
66502+ get_super_private(sb)->status_bio = NULL;
66503+
66504+ page = alloc_pages(GFP_KERNEL, 0);
66505+ if (!page)
66506+ return -ENOMEM;
66507+
66508+ bio = bio_alloc(GFP_KERNEL, 1);
66509+ if (bio != NULL) {
66510+ bio->bi_sector = block * (sb->s_blocksize >> 9);
66511+ bio->bi_bdev = sb->s_bdev;
66512+ bio->bi_io_vec[0].bv_page = page;
66513+ bio->bi_io_vec[0].bv_len = sb->s_blocksize;
66514+ bio->bi_io_vec[0].bv_offset = 0;
66515+ bio->bi_vcnt = 1;
66516+ bio->bi_size = sb->s_blocksize;
66517+ bio->bi_end_io = reiser4_status_endio;
66518+ } else {
66519+ __free_pages(page, 0);
66520+ return -ENOMEM;
66521+ }
66522+ lock_page(page);
66523+ submit_bio(READ, bio);
66524+ blk_run_address_space(get_super_fake(sb)->i_mapping);
66525+ wait_on_page_locked(page);
66526+	if (!PageUptodate(page)) {
66527+		warning("green-2007", "I/O error while trying to read the status page\n");
66528+		__free_pages(page, 0); bio_put(bio);	/* don't leak on error */
66529+		return -EIO;
66530+	}
66531+
66532+ statuspage = (struct reiser4_status *)kmap_atomic(page, KM_USER0);
66533+ if (memcmp
66534+ (statuspage->magic, REISER4_STATUS_MAGIC,
66535+ sizeof(REISER4_STATUS_MAGIC))) {
66536+ /* Magic does not match. */
66537+ kunmap_atomic((char *)statuspage, KM_USER0);
66538+ warning("green-2008", "Wrong magic in status block\n");
66539+ __free_pages(page, 0);
66540+ bio_put(bio);
66541+ return -EINVAL;
66542+ }
66543+ kunmap_atomic((char *)statuspage, KM_USER0);
66544+
66545+ get_super_private(sb)->status_page = page;
66546+ get_super_private(sb)->status_bio = bio;
66547+ return 0;
66548+}
66549+
66550+/* Query the status of the fs. Returns whether the FS can be safely mounted.
66551+   Also, if the "status" and "extended" parameters are given, it will fill
66552+   them with the actual status parts read from disk. */
66553+int reiser4_status_query(u64 * status, u64 * extended)
66554+{
66555+ struct super_block *sb = reiser4_get_current_sb();
66556+ struct reiser4_status *statuspage;
66557+ int retval;
66558+
66559+ if (!get_super_private(sb)->status_page) { // No status page?
66560+ return REISER4_STATUS_MOUNT_UNKNOWN;
66561+ }
66562+ statuspage = (struct reiser4_status *)
66563+ kmap_atomic(get_super_private(sb)->status_page, KM_USER0);
66564+ switch ((long)le64_to_cpu(get_unaligned(&statuspage->status))) { // FIXME: this cast is a hack for 32 bit arches to work.
66565+ case REISER4_STATUS_OK:
66566+ retval = REISER4_STATUS_MOUNT_OK;
66567+ break;
66568+ case REISER4_STATUS_CORRUPTED:
66569+ retval = REISER4_STATUS_MOUNT_WARN;
66570+ break;
66571+ case REISER4_STATUS_DAMAGED:
66572+ case REISER4_STATUS_DESTROYED:
66573+ case REISER4_STATUS_IOERROR:
66574+ retval = REISER4_STATUS_MOUNT_RO;
66575+ break;
66576+ default:
66577+ retval = REISER4_STATUS_MOUNT_UNKNOWN;
66578+ break;
66579+ }
66580+
66581+ if (status)
66582+ *status = le64_to_cpu(get_unaligned(&statuspage->status));
66583+ if (extended)
66584+ *extended = le64_to_cpu(get_unaligned(&statuspage->extended_status));
66585+
66586+ kunmap_atomic((char *)statuspage, KM_USER0);
66587+ return retval;
66588+}
66589+
66590+/* This function should be called when something bad happens (e.g. from reiser4_panic).
66591+ It fills the status structure and tries to push it to disk. */
66592+int reiser4_status_write(__u64 status, __u64 extended_status, char *message)
66593+{
66594+ struct super_block *sb = reiser4_get_current_sb();
66595+ struct reiser4_status *statuspage;
66596+ struct bio *bio = get_super_private(sb)->status_bio;
66597+
66598+ if (!get_super_private(sb)->status_page) { // No status page?
66599+ return -1;
66600+ }
66601+ statuspage = (struct reiser4_status *)
66602+ kmap_atomic(get_super_private(sb)->status_page, KM_USER0);
66603+
66604+ put_unaligned(cpu_to_le64(status), &statuspage->status);
66605+ put_unaligned(cpu_to_le64(extended_status), &statuspage->extended_status);
66606+ strncpy(statuspage->texterror, message, REISER4_TEXTERROR_LEN);
66607+
66608+ kunmap_atomic((char *)statuspage, KM_USER0);
66609+ bio->bi_bdev = sb->s_bdev;
66610+ bio->bi_io_vec[0].bv_page = get_super_private(sb)->status_page;
66611+ bio->bi_io_vec[0].bv_len = sb->s_blocksize;
66612+ bio->bi_io_vec[0].bv_offset = 0;
66613+ bio->bi_vcnt = 1;
66614+ bio->bi_size = sb->s_blocksize;
66615+ bio->bi_end_io = reiser4_status_endio;
66616+ lock_page(get_super_private(sb)->status_page); // Safe as nobody should touch our page.
66617+ /* We can block now, but we have no other choice anyway */
66618+ submit_bio(WRITE, bio);
66619+ blk_run_address_space(get_super_fake(sb)->i_mapping);
66620+ return 0; // We do not wait for io to finish.
66621+}
66622+
66623+/* Frees the status page and bio structure. Should be called by the disk format code at umount time */
66624+int reiser4_status_finish(void)
66625+{
66626+ struct super_block *sb = reiser4_get_current_sb();
66627+
66628+ __free_pages(get_super_private(sb)->status_page, 0);
66629+ get_super_private(sb)->status_page = NULL;
66630+ bio_put(get_super_private(sb)->status_bio);
66631+ get_super_private(sb)->status_bio = NULL;
66632+ return 0;
66633+}
66634Index: linux-2.6.16/fs/reiser4/status_flags.h
66635===================================================================
66636--- /dev/null
66637+++ linux-2.6.16/fs/reiser4/status_flags.h
66638@@ -0,0 +1,43 @@
66639+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
66640+ * reiser4/README */
66641+
66642+/* Here we declare structures and flags that store reiser4 status on disk.
66643+   The status helps us to find out whether the filesystem is valid or whether
66644+   it contains some critical, or not so critical, errors. */
66645+
66646+#if !defined( __REISER4_STATUS_FLAGS_H__ )
66647+#define __REISER4_STATUS_FLAGS_H__
66648+
66649+#include "dformat.h"
66650+/* These are major status flags */
66651+#define REISER4_STATUS_OK 0
66652+#define REISER4_STATUS_CORRUPTED 0x1
66653+#define REISER4_STATUS_DAMAGED 0x2
66654+#define REISER4_STATUS_DESTROYED 0x4
66655+#define REISER4_STATUS_IOERROR 0x8
66656+
66657+/* Return values for reiser4_status_query() */
66658+#define REISER4_STATUS_MOUNT_OK 0
66659+#define REISER4_STATUS_MOUNT_WARN 1
66660+#define REISER4_STATUS_MOUNT_RO 2
66661+#define REISER4_STATUS_MOUNT_UNKNOWN -1
66662+
66663+#define REISER4_TEXTERROR_LEN 256
66664+
66665+#define REISER4_STATUS_MAGIC "ReiSeR4StATusBl"
66666+/* We probably need to keep its size under the sector size, which is 512 bytes */
66667+struct reiser4_status {
66668+ char magic[16];
66669+ d64 status; /* Current FS state */
66670+	d64 extended_status;	/* Any additional info that might make sense in addition to "status". E.g.
66671+				   the last sector where an I/O error happened if status is "io error encountered" */
66672+	d64 stacktrace[10];	/* Last ten function calls made (addresses) */
66673+ char texterror[REISER4_TEXTERROR_LEN]; /* Any error message if appropriate, otherwise filled with zeroes */
66674+};
66675+
66676+int reiser4_status_init(reiser4_block_nr block);
66677+int reiser4_status_query(u64 * status, u64 * extended);
66678+int reiser4_status_write(u64 status, u64 extended_status, char *message);
66679+int reiser4_status_finish(void);
66680+
66681+#endif
66682Index: linux-2.6.16/fs/reiser4/super.c
66683===================================================================
66684--- /dev/null
66685+++ linux-2.6.16/fs/reiser4/super.c
66686@@ -0,0 +1,313 @@
66687+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
66688+ * reiser4/README */
66689+
66690+/* Super-block manipulations. */
66691+
66692+#include "debug.h"
66693+#include "dformat.h"
66694+#include "key.h"
66695+#include "plugin/security/perm.h"
66696+#include "plugin/space/space_allocator.h"
66697+#include "plugin/plugin.h"
66698+#include "tree.h"
66699+#include "vfs_ops.h"
66700+#include "super.h"
66701+#include "reiser4.h"
66702+
66703+#include <linux/types.h> /* for __u?? */
66704+#include <linux/fs.h> /* for struct super_block */
66705+
66706+
66707+static __u64 reserved_for_gid(const struct super_block *super, gid_t gid);
66708+static __u64 reserved_for_uid(const struct super_block *super, uid_t uid);
66709+static __u64 reserved_for_root(const struct super_block *super);
66710+
66711+/* Return reiser4-specific part of super block */
66712+reiser4_super_info_data *get_super_private_nocheck(const struct super_block *super /* super block
66713+ * queried */ )
66714+{
66715+ return (reiser4_super_info_data *) super->s_fs_info;
66716+}
66717+
66718+/* Return reiser4 fstype: value that is returned in ->f_type field by statfs() */
66719+long statfs_type(const struct super_block *super UNUSED_ARG /* super block
66720+ * queried */ )
66721+{
66722+ assert("nikita-448", super != NULL);
66723+ assert("nikita-449", is_reiser4_super(super));
66724+ return (long)REISER4_SUPER_MAGIC;
66725+}
66726+
66727+/* functions to read/modify fields of reiser4_super_info_data */
66728+
66729+/* get number of blocks in file system */
66730+__u64 reiser4_block_count(const struct super_block *super /* super block
66731+ queried */ )
66732+{
66733+ assert("vs-494", super != NULL);
66734+ assert("vs-495", is_reiser4_super(super));
66735+ return get_super_private(super)->block_count;
66736+}
66737+
66738+/*
66739+ * number of blocks in the current file system
66740+ */
66741+__u64 reiser4_current_block_count(void)
66742+{
66743+ return get_current_super_private()->block_count;
66744+}
66745+
66746+/* set number of blocks in filesystem */
66747+void reiser4_set_block_count(const struct super_block *super, __u64 nr)
66748+{
66749+ assert("vs-501", super != NULL);
66750+ assert("vs-502", is_reiser4_super(super));
66751+ get_super_private(super)->block_count = nr;
66752+ /*
66753+	 * For the proper calculation of the reserved space counter (5% of the
66754+	 * device block count) we need a 64 bit division, which is missing in
66755+	 * Linux on the i386 platform. Because we do not need a precise
66756+	 * calculation here, we can replace the div64 operation by this
66757+	 * combination of multiplication and shift: 51 / (2^10) == .0498 .
66758+	 * FIXME: this is a bug. It comes up only for very small filesystems,
66759+	 * which probably are never used. Nevertheless, it is a bug. The number
66760+	 * of reserved blocks must not be less than the maximal number of
66761+	 * blocks which get grabbed with BA_RESERVED.
66762+ */
66763+ get_super_private(super)->blocks_reserved = ((nr * 51) >> 10);
66764+}
66765+
66766+/* amount of blocks used (allocated for data) in file system */
66767+__u64 reiser4_data_blocks(const struct super_block *super /* super block
66768+ queried */ )
66769+{
66770+ assert("nikita-452", super != NULL);
66771+ assert("nikita-453", is_reiser4_super(super));
66772+ return get_super_private(super)->blocks_used;
66773+}
66774+
66775+/* set number of blocks used in filesystem */
66776+void reiser4_set_data_blocks(const struct super_block *super, __u64 nr)
66777+{
66778+ assert("vs-503", super != NULL);
66779+ assert("vs-504", is_reiser4_super(super));
66780+ get_super_private(super)->blocks_used = nr;
66781+}
66782+
66783+/* amount of free blocks in file system */
66784+__u64 reiser4_free_blocks(const struct super_block *super /* super block
66785+ queried */ )
66786+{
66787+ assert("nikita-454", super != NULL);
66788+ assert("nikita-455", is_reiser4_super(super));
66789+ return get_super_private(super)->blocks_free;
66790+}
66791+
66792+/* set number of blocks free in filesystem */
66793+void reiser4_set_free_blocks(const struct super_block *super, __u64 nr)
66794+{
66795+ assert("vs-505", super != NULL);
66796+ assert("vs-506", is_reiser4_super(super));
66797+ get_super_private(super)->blocks_free = nr;
66798+}
66799+
66800+/* get mkfs unique identifier */
66801+__u32 reiser4_mkfs_id(const struct super_block *super /* super block
66802+ queried */ )
66803+{
66804+ assert("vpf-221", super != NULL);
66805+ assert("vpf-222", is_reiser4_super(super));
66806+ return get_super_private(super)->mkfs_id;
66807+}
66808+
66809+/* committed version of the free blocks counter */
66810+__u64 reiser4_free_committed_blocks(const struct super_block *super)
66811+{
66812+ assert("vs-497", super != NULL);
66813+ assert("vs-498", is_reiser4_super(super));
66814+ return get_super_private(super)->blocks_free_committed;
66815+}
66816+
66817+/* amount of blocks in the file system reserved for @uid and @gid */
66818+long reiser4_reserved_blocks(const struct super_block *super /* super block
66819+ queried */ ,
66820+ uid_t uid /* user id */ ,
66821+ gid_t gid /* group id */ )
66822+{
66823+ long reserved;
66824+
66825+ assert("nikita-456", super != NULL);
66826+ assert("nikita-457", is_reiser4_super(super));
66827+
66828+ reserved = 0;
66829+ if (REISER4_SUPPORT_GID_SPACE_RESERVATION)
66830+ reserved += reserved_for_gid(super, gid);
66831+ if (REISER4_SUPPORT_UID_SPACE_RESERVATION)
66832+ reserved += reserved_for_uid(super, uid);
66833+ if (REISER4_SUPPORT_ROOT_SPACE_RESERVATION && (uid == 0))
66834+ reserved += reserved_for_root(super);
66835+ return reserved;
66836+}
66837+
66838+/* get/set value of/to grabbed blocks counter */
66839+__u64 reiser4_grabbed_blocks(const struct super_block * super)
66840+{
66841+ assert("zam-512", super != NULL);
66842+ assert("zam-513", is_reiser4_super(super));
66843+
66844+ return get_super_private(super)->blocks_grabbed;
66845+}
66846+
66847+__u64 flush_reserved(const struct super_block * super)
66848+{
66849+ assert("vpf-285", super != NULL);
66850+ assert("vpf-286", is_reiser4_super(super));
66851+
66852+ return get_super_private(super)->blocks_flush_reserved;
66853+}
66854+
66855+/* get/set value of/to counter of fake allocated formatted blocks */
66856+__u64 reiser4_fake_allocated(const struct super_block * super)
66857+{
66858+ assert("zam-516", super != NULL);
66859+ assert("zam-517", is_reiser4_super(super));
66860+
66861+ return get_super_private(super)->blocks_fake_allocated;
66862+}
66863+
66864+/* get/set value of/to counter of fake allocated unformatted blocks */
66865+__u64 reiser4_fake_allocated_unformatted(const struct super_block * super)
66866+{
66867+ assert("zam-516", super != NULL);
66868+ assert("zam-517", is_reiser4_super(super));
66869+
66870+ return get_super_private(super)->blocks_fake_allocated_unformatted;
66871+}
66872+
66873+/* get/set value of/to counter of clustered blocks */
66874+__u64 reiser4_clustered_blocks(const struct super_block * super)
66875+{
66876+ assert("edward-601", super != NULL);
66877+ assert("edward-602", is_reiser4_super(super));
66878+
66879+ return get_super_private(super)->blocks_clustered;
66880+}
66881+
66882+/* space allocator used by this file system */
66883+reiser4_space_allocator *get_space_allocator(const struct super_block * super)
66884+{
66885+ assert("nikita-1965", super != NULL);
66886+ assert("nikita-1966", is_reiser4_super(super));
66887+ return &get_super_private(super)->space_allocator;
66888+}
66889+
66890+/* return fake inode used to bind formatted nodes in the page cache */
66891+struct inode *get_super_fake(const struct super_block *super /* super block
66892+ queried */ )
66893+{
66894+ assert("nikita-1757", super != NULL);
66895+ return get_super_private(super)->fake;
66896+}
66897+
66898+/* return fake inode used to bind copied on capture nodes in the page cache */
66899+struct inode *get_cc_fake(const struct super_block *super /* super block
66900+ queried */ )
66901+{
66902+ assert("nikita-1757", super != NULL);
66903+ return get_super_private(super)->cc;
66904+}
66905+
66906+/* return fake inode used to bind bitmaps and journal heads */
66907+struct inode *get_bitmap_fake(const struct super_block *super)
66908+{
66909+ assert("nikita-17571", super != NULL);
66910+ return get_super_private(super)->bitmap;
66911+}
66912+
66913+/* tree used by this file system */
66914+reiser4_tree *get_tree(const struct super_block * super /* super block
66915+ * queried */ )
66916+{
66917+ assert("nikita-460", super != NULL);
66918+ assert("nikita-461", is_reiser4_super(super));
66919+ return &get_super_private(super)->tree;
66920+}
66921+
66922+/* Check that @super is (looks like) reiser4 super block. This is mainly for
66923+ use in assertions. */
66924+int is_reiser4_super(const struct super_block *super /* super block
66925+ * queried */ )
66926+{
66927+ return
66928+ super != NULL &&
66929+ get_super_private(super) != NULL &&
66930+ super->s_op == &(get_super_private(super)->ops.super);
66931+}
66932+
66933+int reiser4_is_set(const struct super_block *super, reiser4_fs_flag f)
66934+{
66935+ return test_bit((int)f, &get_super_private(super)->fs_flags);
66936+}
66937+
66938+/* amount of blocks reserved for given group in file system */
66939+static __u64 reserved_for_gid(const struct super_block *super UNUSED_ARG /* super
66940+ * block
66941+ * queried */ ,
66942+ gid_t gid UNUSED_ARG /* group id */ )
66943+{
66944+ return 0;
66945+}
66946+
66947+/* amount of blocks reserved for given user in file system */
66948+static __u64 reserved_for_uid(const struct super_block *super UNUSED_ARG /* super
66949+ block
66950+ queried */ ,
66951+ uid_t uid UNUSED_ARG /* user id */ )
66952+{
66953+ return 0;
66954+}
66955+
66956+/* amount of blocks reserved for super user in file system */
66957+static __u64 reserved_for_root(const struct super_block *super UNUSED_ARG /* super
66958+ block
66959+ queried */ )
66960+{
66961+ return 0;
66962+}
66963+
66964+/*
66965+ * true if block number @blk makes sense for the file system at @super.
66966+ */
66967+int
66968+reiser4_blocknr_is_sane_for(const struct super_block *super,
66969+ const reiser4_block_nr * blk)
66970+{
66971+ reiser4_super_info_data *sbinfo;
66972+
66973+ assert("nikita-2957", super != NULL);
66974+ assert("nikita-2958", blk != NULL);
66975+
66976+ if (blocknr_is_fake(blk))
66977+ return 1;
66978+
66979+ sbinfo = get_super_private(super);
66980+ return *blk < sbinfo->block_count;
66981+}
66982+
66983+/*
66984+ * true, if block number @blk makes sense for the current file system
66985+ */
66986+int reiser4_blocknr_is_sane(const reiser4_block_nr * blk)
66987+{
66988+ return reiser4_blocknr_is_sane_for(reiser4_get_current_sb(), blk);
66989+}
66990+
66991+/* Make Linus happy.
66992+ Local variables:
66993+ c-indentation-style: "K&R"
66994+ mode-name: "LC"
66995+ c-basic-offset: 8
66996+ tab-width: 8
66997+ fill-column: 120
66998+ End:
66999+*/
67000Index: linux-2.6.16/fs/reiser4/super.h
67001===================================================================
67002--- /dev/null
67003+++ linux-2.6.16/fs/reiser4/super.h
67004@@ -0,0 +1,468 @@
67005+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
67006+ * reiser4/README */
67007+
67008+/* Super-block functions. See super.c for details. */
67009+
67010+#if !defined( __REISER4_SUPER_H__ )
67011+#define __REISER4_SUPER_H__
67012+
67013+#include "tree.h"
67014+#include "entd.h"
67015+#include "wander.h"
67016+#include "fsdata.h"
67017+#include "plugin/object.h"
67018+#include "plugin/space/space_allocator.h"
67019+
67020+/*
67021+ * Flush algorithms parameters.
67022+ */
67023+typedef struct {
67024+ unsigned relocate_threshold;
67025+ unsigned relocate_distance;
67026+ unsigned written_threshold;
67027+ unsigned scan_maxnodes;
67028+} flush_params;
67029+
67030+typedef enum {
67031+ /*
67032+ * True if this file system doesn't support hard-links (multiple names)
67033+ * for directories: this is default UNIX behavior.
67034+ *
67035+	 * If hard-links on directories are not allowed, the file system is an
67036+	 * Acyclic Directed Graph (modulo dot, and dotdot, of course).
67037+ *
67038+ * This is used by reiser4_link().
67039+ */
67040+ REISER4_ADG = 0,
67041+ /*
67042+ * set if all nodes in internal tree have the same node layout plugin.
67043+	 * If so, znode_guess_plugin() will return tree->node_plugin instead
67044+	 * of guessing the plugin by the plugin id stored in the node.
67045+ */
67046+ REISER4_ONE_NODE_PLUGIN = 1,
67047+ /* if set, bsd gid assignment is supported. */
67048+ REISER4_BSD_GID = 2,
67049+ /* [mac]_time are 32 bit in inode */
67050+ REISER4_32_BIT_TIMES = 3,
67051+ /* allow concurrent flushes */
67052+ REISER4_MTFLUSH = 4,
67053+ /* load all bitmap blocks at mount time */
67054+ REISER4_DONT_LOAD_BITMAP = 5,
67055+ /* enforce atomicity during write(2) */
67056+ REISER4_ATOMIC_WRITE = 6,
67057+ /* don't use write barriers in the log writer code. */
67058+ REISER4_NO_WRITE_BARRIER = 7
67059+
67060+} reiser4_fs_flag;
67061+
67062+/*
67063+ * VFS related operation vectors.
67064+ */
67065+typedef struct object_ops {
67066+ struct super_operations super;
67067+ struct dentry_operations dentry;
67068+ struct export_operations export;
67069+} object_ops;
67070+
67071+/* reiser4-specific part of super block
67072+
67073+ Locking
67074+
67075+ Fields immutable after mount:
67076+
67077+ ->oid*
67078+ ->space*
67079+ ->default_[ug]id
67080+ ->mkfs_id
67081+ ->trace_flags
67082+ ->debug_flags
67083+ ->fs_flags
67084+ ->df_plug
67085+ ->optimal_io_size
67086+ ->plug
67087+ ->flush
67088+ ->u (bad name)
67089+ ->txnmgr
67090+ ->ra_params
67091+ ->fsuid
67092+ ->journal_header
67093+ ->journal_footer
67094+
67095+ Fields protected by ->lnode_guard
67096+
67097+ ->lnode_htable
67098+
67099+ Fields protected by per-super block spin lock
67100+
67101+ ->block_count
67102+ ->blocks_used
67103+ ->blocks_free
67104+ ->blocks_free_committed
67105+ ->blocks_grabbed
67106+ ->blocks_fake_allocated_unformatted
67107+ ->blocks_fake_allocated
67108+ ->blocks_flush_reserved
67109+ ->eflushed
67110+ ->blocknr_hint_default
67111+
67112+ After journal replaying during mount,
67113+
67114+ ->last_committed_tx
67115+
67116+ is protected by ->tmgr.commit_semaphore
67117+
67118+ Invariants involving this data-type:
67119+
67120+ [sb-block-counts]
67121+ [sb-grabbed]
67122+ [sb-fake-allocated]
67123+*/
67124+struct reiser4_super_info_data {
67125+ /*
67126+ * guard spinlock which protects reiser4 super block fields (currently
67127+ * blocks_free, blocks_free_committed)
67128+ */
67129+ spinlock_t guard;
67130+
67131+ /* next oid that will be returned by oid_allocate() */
67132+ oid_t next_to_use;
67133+ /* total number of used oids */
67134+ oid_t oids_in_use;
67135+
67136+ /* space manager plugin */
67137+ reiser4_space_allocator space_allocator;
67138+
67139+ /* reiser4 internal tree */
67140+ reiser4_tree tree;
67141+
67142+ /*
67143+ * default user id used for light-weight files without their own
67144+ * stat-data.
67145+ */
67146+ uid_t default_uid;
67147+
67148+ /*
67149+ * default group id used for light-weight files without their own
67150+ * stat-data.
67151+ */
67152+ gid_t default_gid;
67153+
67154+ /* mkfs identifier generated at mkfs time. */
67155+ __u32 mkfs_id;
67156+ /* amount of blocks in a file system */
67157+ __u64 block_count;
67158+
67159+ /* inviolable reserve */
67160+ __u64 blocks_reserved;
67161+
67162+ /* amount of blocks used by file system data and meta-data. */
67163+ __u64 blocks_used;
67164+
67165+ /*
67166+ * amount of free blocks. This is "working" free blocks counter. It is
67167+ * like "working" bitmap, please see block_alloc.c for description.
67168+ */
67169+ __u64 blocks_free;
67170+
67171+ /*
67172+ * free block count for fs committed state. This is "commit" version of
67173+ * free block counter.
67174+ */
67175+ __u64 blocks_free_committed;
67176+
67177+ /*
67178+ * number of blocks reserved for further allocation, for all
67179+ * threads.
67180+ */
67181+ __u64 blocks_grabbed;
67182+
67183+ /* number of fake allocated unformatted blocks in tree. */
67184+ __u64 blocks_fake_allocated_unformatted;
67185+
67186+ /* number of fake allocated formatted blocks in tree. */
67187+ __u64 blocks_fake_allocated;
67188+
67189+ /* number of blocks reserved for flush operations. */
67190+ __u64 blocks_flush_reserved;
67191+
67192+ /* number of blocks reserved for cluster operations. */
67193+ __u64 blocks_clustered;
67194+
67195+ /* unique file-system identifier */
67196+ __u32 fsuid;
67197+
67198+ /* file-system wide flags. See reiser4_fs_flag enum */
67199+ unsigned long fs_flags;
67200+
67201+ /* transaction manager */
67202+ txn_mgr tmgr;
67203+
67204+ /* ent thread */
67205+ entd_context entd;
67206+
67207+ /* fake inode used to bind formatted nodes */
67208+ struct inode *fake;
67209+ /* inode used to bind bitmaps (and journal heads) */
67210+ struct inode *bitmap;
67211+ /* inode used to bind copied on capture nodes */
67212+ struct inode *cc;
67213+
67214+ /* disk layout plugin */
67215+ disk_format_plugin *df_plug;
67216+
67217+ /* disk layout specific part of reiser4 super info data */
67218+ union {
67219+ format40_super_info format40;
67220+ } u;
67221+
67222+ /* value we return in st_blksize on stat(2) */
67223+ unsigned long optimal_io_size;
67224+
67225+ /* parameters for the flush algorithm */
67226+ flush_params flush;
67227+
67228+ /* pointers to jnodes for journal header and footer */
67229+ jnode *journal_header;
67230+ jnode *journal_footer;
67231+
67232+ journal_location jloc;
67233+
67234+ /* head block number of last committed transaction */
67235+ __u64 last_committed_tx;
67236+
67237+ /*
67238+ * we remember last written location for using as a hint for new block
67239+ * allocation
67240+ */
67241+ __u64 blocknr_hint_default;
67242+
67243+ /* committed number of files (oid allocator state variable ) */
67244+ __u64 nr_files_committed;
67245+
67246+ ra_params_t ra_params;
67247+
67248+ /*
67249+ * A semaphore for serializing cut tree operation if out-of-free-space:
67250+ * the only one cut_tree thread is allowed to grab space from reserved
67251+ * area (it is 5% of disk space)
67252+ */
67253+ struct semaphore delete_sema;
67254+ /* task owning ->delete_sema */
67255+ struct task_struct *delete_sema_owner;
67256+
67257+ /* serialize semaphore */
67258+ struct semaphore flush_sema;
67259+
67260+	/* Diskmap's block number */
67261+ __u64 diskmap_block;
67262+
67263+ /* What to do in case of error */
67264+ int onerror;
67265+
67266+ /* operations for objects on this file system */
67267+ object_ops ops;
67268+
67269+ /*
67270+ * structure to maintain d_cursors. See plugin/file_ops_readdir.c for
67271+ * more details
67272+ */
67273+ d_cursor_info d_info;
67274+
67275+#ifdef CONFIG_REISER4_BADBLOCKS
67276+ /* Alternative master superblock offset (in bytes) */
67277+ unsigned long altsuper;
67278+#endif
67279+ struct repacker *repacker;
67280+ struct page *status_page;
67281+ struct bio *status_bio;
67282+
67283+#if REISER4_DEBUG
67284+ /*
67285+ * minimum used blocks value (includes super blocks, bitmap blocks and
67286+ * other fs reserved areas), depends on fs format and fs size.
67287+ */
67288+ __u64 min_blocks_used;
67289+
67290+ /*
67291+ * when debugging is on, all jnodes (including znodes, bitmaps, etc.)
67292+ * are kept on a list anchored at sbinfo->all_jnodes. This list is
67293+ * protected by sbinfo->all_guard spin lock. This lock should be taken
67294+ * with _irq modifier, because it is also modified from interrupt
67295+ * contexts (by RCU).
67296+ */
67297+ spinlock_t all_guard;
67298+ /* list of all jnodes */
67299+ struct list_head all_jnodes;
67300+#endif
67301+ struct dentry *debugfs_root;
67302+};
67303+
67304+extern reiser4_super_info_data *get_super_private_nocheck(const struct
67305+ super_block *super);
67306+
67307+
67308+/* Return reiser4-specific part of super block */
67309+static inline reiser4_super_info_data *get_super_private(const struct
67310+ super_block *super)
67311+{
67312+ assert("nikita-447", super != NULL);
67313+
67314+ return (reiser4_super_info_data *) super->s_fs_info;
67315+}
67316+
67317+/* get ent context for the @super */
67318+static inline entd_context *get_entd_context(struct super_block *super)
67319+{
67320+ return &get_super_private(super)->entd;
67321+}
67322+
67323+
67324+/* "Current" super-block: main super block used during current system
67325+ call. Reference to this super block is stored in reiser4_context. */
67326+static inline struct super_block *reiser4_get_current_sb(void)
67327+{
67328+ return get_current_context()->super;
67329+}
67330+
67331+/* Reiser4-specific part of "current" super-block: main super block used
67332+ during current system call. Reference to this super block is stored in
67333+ reiser4_context. */
67334+static inline reiser4_super_info_data *get_current_super_private(void)
67335+{
67336+ return get_super_private(reiser4_get_current_sb());
67337+}
67338+
67339+static inline ra_params_t *get_current_super_ra_params(void)
67340+{
67341+ return &(get_current_super_private()->ra_params);
67342+}
67343+
67344+/*
67345+ * true, if file system on @super is read-only
67346+ */
67347+static inline int rofs_super(struct super_block *super)
67348+{
67349+ return super->s_flags & MS_RDONLY;
67350+}
67351+
67352+/*
67353+ * true, if @tree represents read-only file system
67354+ */
67355+static inline int rofs_tree(reiser4_tree * tree)
67356+{
67357+ return rofs_super(tree->super);
67358+}
67359+
67360+/*
67361+ * true, if file system where @inode lives on, is read-only
67362+ */
67363+static inline int rofs_inode(struct inode *inode)
67364+{
67365+ return rofs_super(inode->i_sb);
67366+}
67367+
67368+/*
67369+ * true, if file system where @node lives on, is read-only
67370+ */
67371+static inline int rofs_jnode(jnode * node)
67372+{
67373+ return rofs_tree(jnode_get_tree(node));
67374+}
67375+
67376+extern __u64 reiser4_current_block_count(void);
67377+
67378+extern void build_object_ops(struct super_block *super, object_ops * ops);
67379+
67380+#define REISER4_SUPER_MAGIC 0x52345362 /* (*(__u32 *)"R4Sb"); */
67381+
67382+static inline void spin_lock_reiser4_super(reiser4_super_info_data *sbinfo)
67383+{
67384+ spin_lock(&(sbinfo->guard));
67385+}
67386+
67387+static inline void spin_unlock_reiser4_super(reiser4_super_info_data *sbinfo)
67388+{
67389+ assert_spin_locked(&(sbinfo->guard));
67390+ spin_unlock(&(sbinfo->guard));
67391+}
67392+
67393+extern __u64 flush_reserved(const struct super_block *);
67394+extern int reiser4_is_set(const struct super_block *super, reiser4_fs_flag f);
67395+extern long statfs_type(const struct super_block *super);
67396+extern __u64 reiser4_block_count(const struct super_block *super);
67397+extern void reiser4_set_block_count(const struct super_block *super, __u64 nr);
67398+extern __u64 reiser4_data_blocks(const struct super_block *super);
67399+extern void reiser4_set_data_blocks(const struct super_block *super, __u64 nr);
67400+extern __u64 reiser4_free_blocks(const struct super_block *super);
67401+extern void reiser4_set_free_blocks(const struct super_block *super, __u64 nr);
67402+extern __u32 reiser4_mkfs_id(const struct super_block *super);
67403+
67404+extern __u64 reiser4_free_committed_blocks(const struct super_block *super);
67405+
67406+extern __u64 reiser4_grabbed_blocks(const struct super_block *);
67407+extern __u64 reiser4_fake_allocated(const struct super_block *);
67408+extern __u64 reiser4_fake_allocated_unformatted(const struct super_block *);
67409+extern __u64 reiser4_clustered_blocks(const struct super_block *);
67410+
67411+extern long reiser4_reserved_blocks(const struct super_block *super, uid_t uid,
67412+ gid_t gid);
67413+
67414+extern reiser4_space_allocator *get_space_allocator(const struct super_block
67415+ *super);
67416+extern reiser4_oid_allocator *get_oid_allocator(const struct super_block
67417+ *super);
67418+extern struct inode *get_super_fake(const struct super_block *super);
67419+extern struct inode *get_cc_fake(const struct super_block *super);
67420+extern struct inode *get_bitmap_fake(const struct super_block *super);
67421+extern reiser4_tree *get_tree(const struct super_block *super);
67422+extern int is_reiser4_super(const struct super_block *super);
67423+
67424+extern int reiser4_blocknr_is_sane(const reiser4_block_nr * blk);
67425+extern int reiser4_blocknr_is_sane_for(const struct super_block *super,
67426+ const reiser4_block_nr * blk);
67427+extern int reiser4_fill_super(struct super_block *s, void *data, int silent);
67428+extern int reiser4_done_super(struct super_block *s);
67429+
67430+/* step of fill super */
67431+extern int init_fs_info(struct super_block *);
67432+extern void done_fs_info(struct super_block *);
67433+extern int init_super_data(struct super_block *, char *opt_string);
67434+extern int init_read_super(struct super_block *, int silent);
67435+extern int init_root_inode(struct super_block *);
67436+
67437+
67438+/* Maximal possible object id. */
67439+#define ABSOLUTE_MAX_OID ((oid_t)~0)
67440+
67441+#define OIDS_RESERVED ( 1 << 16 )
67442+int oid_init_allocator(struct super_block *, oid_t nr_files, oid_t next);
67443+oid_t oid_allocate(struct super_block *);
67444+int oid_release(struct super_block *, oid_t);
67445+oid_t oid_next(const struct super_block *);
67446+void oid_count_allocated(void);
67447+void oid_count_released(void);
67448+long oids_used(const struct super_block *);
67449+
67450+#if REISER4_DEBUG
67451+void print_fs_info(const char *prefix, const struct super_block *);
67452+#endif
67453+
67454+extern void destroy_reiser4_cache(kmem_cache_t **);
67455+
67456+extern struct super_operations reiser4_super_operations;
67457+extern struct export_operations reiser4_export_operations;
67458+extern struct dentry_operations reiser4_dentry_operations;
67459+extern struct dentry *reiser4_debugfs_root;
67460+
67461+/* __REISER4_SUPER_H__ */
67462+#endif
67463+
67464+/*
67465+ * Local variables:
67466+ * c-indentation-style: "K&R"
67467+ * mode-name: "LC"
67468+ * c-basic-offset: 8
67469+ * tab-width: 8
67470+ * fill-column: 120
67471+ * End:
67472+ */
67473Index: linux-2.6.16/fs/reiser4/super_ops.c
67474===================================================================
67475--- /dev/null
67476+++ linux-2.6.16/fs/reiser4/super_ops.c
67477@@ -0,0 +1,721 @@
67478+/* Copyright 2005 by Hans Reiser, licensing governed by
67479+ * reiser4/README */
67480+
67481+#include "inode.h"
67482+#include "page_cache.h"
67483+#include "ktxnmgrd.h"
67484+#include "flush.h"
67485+#include "safe_link.h"
67486+
67487+#include <linux/vfs.h>
67488+#include <linux/writeback.h>
67489+#include <linux/mount.h>
67490+#include <linux/seq_file.h>
67491+#include <linux/debugfs.h>
67492+
67493+/* slab cache for inodes */
67494+static kmem_cache_t *inode_cache;
67495+
67496+/**
67497+ * init_once - constructor for reiser4 inodes
67498+ * @obj: inode to be initialized
67499+ * @cache: cache @obj belongs to
67500+ * @flags: SLAB flags
67501+ *
67502+ * Initialization function called when a new object is set up by the reiser4
67503+ * inode cache. It is registered at inode cache creation.
67504+ */
67505+static void init_once(void *obj, kmem_cache_t *cache, unsigned long flags)
67506+{
67507+ reiser4_inode_object *info;
67508+
67509+ info = obj;
67510+
67511+ if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) ==
67512+ SLAB_CTOR_CONSTRUCTOR) {
67513+ /* initialize vfs inode */
67514+ inode_init_once(&info->vfs_inode);
67515+
67516+ /*
67517+		 * initialize reiser4 specific part of inode.
67518+ * NOTE-NIKITA add here initializations for locks, list heads,
67519+ * etc. that will be added to our private inode part.
67520+ */
67521+ INIT_LIST_HEAD(get_readdir_list(&info->vfs_inode));
67522+ /* init semaphore which is used during inode loading */
67523+ loading_init_once(&info->p);
67524+ INIT_RADIX_TREE(jnode_tree_by_reiser4_inode(&info->p),
67525+ GFP_ATOMIC);
67526+#if REISER4_DEBUG
67527+ info->p.nr_jnodes = 0;
67528+#endif
67529+ }
67530+}
67531+
67532+/**
67533+ * init_inodes - create znode cache
67534+ * init_inodes - create inode cache
67535+ * Initializes slab cache of inodes. It is part of reiser4 module initialization.
67536+ */
67537+static int init_inodes(void)
67538+{
67539+ inode_cache = kmem_cache_create("reiser4_inode",
67540+ sizeof(reiser4_inode_object),
67541+ 0,
67542+ SLAB_HWCACHE_ALIGN |
67543+ SLAB_RECLAIM_ACCOUNT, init_once, NULL);
67544+ if (inode_cache == NULL)
67545+ return RETERR(-ENOMEM);
67546+ return 0;
67547+}
67548+
67549+/**
67550+ * done_inodes - delete inode cache
67551+ *
67552+ * This is called on reiser4 module unloading or system shutdown.
67553+ */
67554+static void done_inodes(void)
67555+{
67556+ destroy_reiser4_cache(&inode_cache);
67557+}
67558+
67559+/**
67560+ * reiser4_alloc_inode - alloc_inode of super operations
67561+ * @super: super block new inode is allocated for
67562+ *
67563+ * Allocates new inode, initializes reiser4 specific part of it.
67564+ */
67565+static struct inode *reiser4_alloc_inode(struct super_block *super)
67566+{
67567+ reiser4_inode_object *obj;
67568+
67569+ assert("nikita-1696", super != NULL);
67570+ obj = kmem_cache_alloc(inode_cache, SLAB_KERNEL);
67571+ if (obj != NULL) {
67572+ reiser4_inode *info;
67573+
67574+ info = &obj->p;
67575+
67576+ info->hset = info->pset = plugin_set_get_empty();
67577+ info->extmask = 0;
67578+ info->locality_id = 0ull;
67579+ info->plugin_mask = 0;
67580+#if !REISER4_INO_IS_OID
67581+ info->oid_hi = 0;
67582+#endif
67583+ seal_init(&info->sd_seal, NULL, NULL);
67584+ coord_init_invalid(&info->sd_coord, NULL);
67585+ info->flags = 0;
67586+ spin_lock_init(&info->guard);
67587+ /* this deals with info's loading semaphore */
67588+ loading_alloc(info);
67589+ info->vroot = UBER_TREE_ADDR;
67590+ return &obj->vfs_inode;
67591+ } else
67592+ return NULL;
67593+}
67594+
67595+/**
67596+ * reiser4_destroy_inode - destroy_inode of super operations
67597+ * @inode: inode being destroyed
67598+ *
67599+ * Puts reiser4 specific portion of inode, frees memory occupied by inode.
67600+ */
67601+static void reiser4_destroy_inode(struct inode *inode)
67602+{
67603+ reiser4_inode *info;
67604+
67605+ info = reiser4_inode_data(inode);
67606+
67607+ assert("vs-1220", inode_has_no_jnodes(info));
67608+
67609+ if (!is_bad_inode(inode) && is_inode_loaded(inode)) {
67610+ file_plugin *fplug = inode_file_plugin(inode);
67611+ if (fplug->destroy_inode != NULL)
67612+ fplug->destroy_inode(inode);
67613+ }
67614+ dispose_cursors(inode);
67615+ if (info->pset)
67616+ plugin_set_put(info->pset);
67617+
67618+ /*
67619+	 * cannot add a similar assertion about ->i_list, as prune_icache
67620+	 * returns inodes into the slab with dangling ->list.{next,prev}. This
67621+	 * is safe, because they are re-initialized in new_inode().
67622+ */
67623+ assert("nikita-2895", list_empty(&inode->i_dentry));
67624+ assert("nikita-2896", hlist_unhashed(&inode->i_hash));
67625+ assert("nikita-2898", list_empty_careful(get_readdir_list(inode)));
67626+
67627+ /* this deals with info's loading semaphore */
67628+ loading_destroy(info);
67629+
67630+ kmem_cache_free(inode_cache,
67631+ container_of(info, reiser4_inode_object, p));
67632+}
67633+
67634+/**
67635+ * reiser4_dirty_inode - dirty_inode of super operations
67636+ * @inode: inode being dirtied
67637+ *
67638+ * Updates stat data.
67639+ */
67640+static void reiser4_dirty_inode(struct inode *inode)
67641+{
67642+ int result;
67643+
67644+ if (!is_in_reiser4_context())
67645+ return;
67646+ assert("", !IS_RDONLY(inode));
67647+ assert("", (inode_file_plugin(inode)->estimate.update(inode) <=
67648+ get_current_context()->grabbed_blocks));
67649+
67650+ result = reiser4_update_sd(inode);
67651+ if (result)
67652+ warning("", "failed to dirty inode for %llu: %d",
67653+ get_inode_oid(inode), result);
67654+}
67655+
67656+/**
67657+ * reiser4_delete_inode - delete_inode of super operations
67658+ * @inode: inode to delete
67659+ *
67660+ * Calls file plugin's delete_object method to delete object items from
67661+ * filesystem tree and calls clear_inode.
67662+ */
67663+static void reiser4_delete_inode(struct inode *inode)
67664+{
67665+ reiser4_context *ctx;
67666+ file_plugin *fplug;
67667+
67668+ ctx = init_context(inode->i_sb);
67669+ if (IS_ERR(ctx)) {
67670+ warning("vs-15", "failed to init context");
67671+ return;
67672+ }
67673+
67674+ if (is_inode_loaded(inode)) {
67675+ fplug = inode_file_plugin(inode);
67676+ if (fplug != NULL && fplug->delete_object != NULL)
67677+ fplug->delete_object(inode);
67678+ }
67679+
67680+ inode->i_blocks = 0;
67681+ clear_inode(inode);
67682+ reiser4_exit_context(ctx);
67683+}
67684+
67685+/**
67686+ * reiser4_put_super - put_super of super operations
67687+ * @super: super block to free
67688+ *
67689+ * Stops daemons and releases resources; in short, umounts.
67690+ */
67691+static void reiser4_put_super(struct super_block *super)
67692+{
67693+ reiser4_super_info_data *sbinfo;
67694+ reiser4_context *ctx;
67695+
67696+ sbinfo = get_super_private(super);
67697+ assert("vs-1699", sbinfo);
67698+
67699+ debugfs_remove(sbinfo->tmgr.debugfs_atom_count);
67700+ debugfs_remove(sbinfo->tmgr.debugfs_id_count);
67701+ debugfs_remove(sbinfo->debugfs_root);
67702+
67703+ ctx = init_context(super);
67704+ if (IS_ERR(ctx)) {
67705+ warning("vs-17", "failed to init context");
67706+ return;
67707+ }
67708+
67709+	/* let the disk format plugin free its resources */
67710+ if (get_super_private(super)->df_plug->release)
67711+ get_super_private(super)->df_plug->release(super);
67712+
67713+ done_formatted_fake(super);
67714+
67715+ /* stop daemons: ktxnmgr and entd */
67716+ done_entd(super);
67717+ done_ktxnmgrd(super);
67718+ done_txnmgr(&sbinfo->tmgr);
67719+
67720+ done_fs_info(super);
67721+ reiser4_exit_context(ctx);
67722+}
67723+
67724+/**
67725+ * reiser4_write_super - write_super of super operations
67726+ * @super: super block to write
67727+ *
67728+ * Captures the znode associated with the super block and commits all transactions.
67729+ */
67730+static void reiser4_write_super(struct super_block *super)
67731+{
67732+ int ret;
67733+ reiser4_context *ctx;
67734+
67735+ assert("vs-1700", !rofs_super(super));
67736+
67737+ ctx = init_context(super);
67738+ if (IS_ERR(ctx)) {
67739+ warning("vs-16", "failed to init context");
67740+ return;
67741+ }
67742+
67743+ ret = capture_super_block(super);
67744+ if (ret != 0)
67745+ warning("vs-1701",
67746+ "capture_super_block failed in write_super: %d", ret);
67747+ ret = txnmgr_force_commit_all(super, 0);
67748+ if (ret != 0)
67749+ warning("jmacd-77113",
67750+ "txn_force failed in write_super: %d", ret);
67751+
67752+ super->s_dirt = 0;
67753+
67754+ reiser4_exit_context(ctx);
67755+}
67756+
67757+/**
67758+ * reiser4_statfs - statfs of super operations
67759+ * @super: super block of the queried file system
67760+ * @statfs: buffer to fill with statistics
67761+ *
67762+ * Returns information about filesystem.
67763+ */
67764+static int reiser4_statfs(struct super_block *super, struct kstatfs *statfs)
67765+{
67766+ sector_t total;
67767+ sector_t reserved;
67768+ sector_t free;
67769+ sector_t forroot;
67770+ sector_t deleted;
67771+ reiser4_context *ctx;
67772+
67773+ assert("nikita-408", super != NULL);
67774+ assert("nikita-409", statfs != NULL);
67775+
67776+ ctx = init_context(super);
67777+ if (IS_ERR(ctx))
67778+ return PTR_ERR(ctx);
67779+
67780+ statfs->f_type = statfs_type(super);
67781+ statfs->f_bsize = super->s_blocksize;
67782+
67783+ /*
67784+ * 5% of total block space is reserved. This is needed for flush and
67785+ * for truncates (so that we are able to perform truncate/unlink even
67786+ * on the otherwise completely full file system). If this reservation
67787+ * is hidden from statfs(2), users will mistakenly guess that they
67788+ * have enough free space to complete some operation, which is
67789+ * frustrating.
67790+ *
67791+	 * Another possible solution is to subtract ->blocks_reserved from
67792+	 * ->f_bfree, but changing the available space seems less intrusive
67793+	 * than letting the user see 5% of the disk space in use directly
67794+	 * after mkfs.
67795+ */
67796+ total = reiser4_block_count(super);
67797+ reserved = get_super_private(super)->blocks_reserved;
67798+ deleted = txnmgr_count_deleted_blocks();
67799+ free = reiser4_free_blocks(super) + deleted;
67800+ forroot = reiser4_reserved_blocks(super, 0, 0);
67801+
67802+ /*
67803+ * These counters may be in inconsistent state because we take the
67804+ * values without keeping any global spinlock. Here we do a sanity
67805+ * check that free block counter does not exceed the number of all
67806+ * blocks.
67807+ */
67808+ if (free > total)
67809+ free = total;
67810+ statfs->f_blocks = total - reserved;
67811+ /* make sure statfs->f_bfree is never larger than statfs->f_blocks */
67812+ if (free > reserved)
67813+ free -= reserved;
67814+ else
67815+ free = 0;
67816+ statfs->f_bfree = free;
67817+
67818+ if (free > forroot)
67819+ free -= forroot;
67820+ else
67821+ free = 0;
67822+ statfs->f_bavail = free;
67823+
67824+ statfs->f_files = 0;
67825+ statfs->f_ffree = 0;
67826+
67827+ /* maximal acceptable name length depends on directory plugin. */
67828+ assert("nikita-3351", super->s_root->d_inode != NULL);
67829+ statfs->f_namelen = reiser4_max_filename_len(super->s_root->d_inode);
67830+ reiser4_exit_context(ctx);
67831+ return 0;
67832+}
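/*
 * Editor's note (not part of the original patch): a worked example of the
 * accounting above, with illustrative numbers only. For total = 1000 blocks,
 * reserved = 50 (the 5% reservation), free = 300 and forroot = 10, the
 * reported values are:
 *
 *	f_blocks = total - reserved   = 1000 - 50 = 950
 *	f_bfree  = free - reserved    =  300 - 50 = 250
 *	f_bavail = f_bfree - forroot  =  250 - 10 = 240
 */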
67833+
67834+/**
67835+ * reiser4_clear_inode - clear_inode of super operation
67836+ * @inode: inode about to destroy
67837+ *
67838+ * Does sanity checks: the inode being destroyed should have all jnodes detached.
67839+ */
67840+static void reiser4_clear_inode(struct inode *inode)
67841+{
67842+#if REISER4_DEBUG
67843+ reiser4_inode *r4_inode;
67844+
67845+ r4_inode = reiser4_inode_data(inode);
67846+ if (!inode_has_no_jnodes(r4_inode))
67847+ warning("vs-1732", "reiser4 inode has %ld jnodes\n",
67848+ r4_inode->nr_jnodes);
67849+#endif
67850+}
67851+
67852+/**
67853+ * reiser4_sync_inodes - sync_inodes of super operations
67854+ * @super: super block to synchronize
67855+ * @wbc: writeback control
67856+ *
67857+ * This method is called by background and non-background writeback. Reiser4's
67858+ * implementation uses generic_sync_sb_inodes to call reiser4_writepages for
67859+ * each dirty inode. reiser4_writepages handles pages dirtied via shared
67860+ * mappings: dirty pages get into atoms. writeout is then called to flush
67861+ * some atoms.
67862+ */
67863+static void reiser4_sync_inodes(struct super_block *super,
67864+ struct writeback_control *wbc)
67865+{
67866+ reiser4_context *ctx;
67867+ long to_write;
67868+
67869+ if (wbc->for_kupdate)
67870+		/* reiser4 has its own means of periodic write-out */
67871+ return;
67872+
67873+ to_write = wbc->nr_to_write;
67874+ assert("vs-49", wbc->older_than_this == NULL);
67875+
67876+ ctx = init_context(super);
67877+ if (IS_ERR(ctx)) {
67878+ warning("vs-13", "failed to init context");
67879+ return;
67880+ }
67881+
67882+ /*
67883+	 * call reiser4_writepages for each dirty inode to turn dirty pages
67884+	 * into transactions if they are not already part of one.
67885+ */
67886+ generic_sync_sb_inodes(super, wbc);
67887+
67888+ /* flush goes here */
67889+ wbc->nr_to_write = to_write;
67890+ writeout(super, wbc);
67891+
67892+ /* avoid recursive calls to ->sync_inodes */
67893+ context_set_commit_async(ctx);
67894+ reiser4_exit_context(ctx);
67895+}
67896+
67897+/**
67898+ * reiser4_show_options - show_options of super operations
67899+ * @m: file where to write information
67900+ * @mnt: mount structure
67901+ *
67902+ * Makes reiser4 mount options visible in /proc/mounts.
67903+ */
67904+static int reiser4_show_options(struct seq_file *m, struct vfsmount *mnt)
67905+{
67906+ struct super_block *super;
67907+ reiser4_super_info_data *sbinfo;
67908+
67909+ super = mnt->mnt_sb;
67910+ sbinfo = get_super_private(super);
67911+
67912+ seq_printf(m, ",atom_max_size=0x%x", sbinfo->tmgr.atom_max_size);
67913+ seq_printf(m, ",atom_max_age=0x%x", sbinfo->tmgr.atom_max_age);
67914+ seq_printf(m, ",atom_min_size=0x%x", sbinfo->tmgr.atom_min_size);
67915+ seq_printf(m, ",atom_max_flushers=0x%x",
67916+ sbinfo->tmgr.atom_max_flushers);
67917+ seq_printf(m, ",cbk_cache_slots=0x%x",
67918+ sbinfo->tree.cbk_cache.nr_slots);
67919+
67920+ return 0;
67921+}
67922+
67923+struct super_operations reiser4_super_operations = {
67924+ .alloc_inode = reiser4_alloc_inode,
67925+ .destroy_inode = reiser4_destroy_inode,
67926+ .dirty_inode = reiser4_dirty_inode,
67927+ .delete_inode = reiser4_delete_inode,
67928+ .put_super = reiser4_put_super,
67929+ .write_super = reiser4_write_super,
67930+ .statfs = reiser4_statfs,
67931+ .clear_inode = reiser4_clear_inode,
67932+ .sync_inodes = reiser4_sync_inodes,
67933+ .show_options = reiser4_show_options
67934+};
67935+
67936+/**
67937+ * fill_super - initialize super block on mount
67938+ * @super: super block to fill
67939+ * @data: reiser4 specific mount option
67940+ * @silent: when set, suppress error messages
67941+ *
67942+ * This is called by reiser4_get_sb; it mounts the filesystem.
67943+ */
67944+static int fill_super(struct super_block *super, void *data, int silent)
67945+{
67946+ reiser4_context ctx;
67947+ int result;
67948+ reiser4_super_info_data *sbinfo;
67949+
67950+ assert("zam-989", super != NULL);
67951+
67952+ super->s_op = NULL;
67953+ init_stack_context(&ctx, super);
67954+
67955+ /* allocate reiser4 specific super block */
67956+ if ((result = init_fs_info(super)) != 0)
67957+ goto failed_init_sinfo;
67958+
67959+ sbinfo = get_super_private(super);
67960+ /* initialize various reiser4 parameters, parse mount options */
67961+ if ((result = init_super_data(super, data)) != 0)
67962+ goto failed_init_super_data;
67963+
67964+ /* read reiser4 master super block, initialize disk format plugin */
67965+ if ((result = init_read_super(super, silent)) != 0)
67966+ goto failed_init_read_super;
67967+
67968+ /* initialize transaction manager */
67969+ init_txnmgr(&sbinfo->tmgr);
67970+
67971+	/* initialize ktxnmgrd context and start kernel thread ktxnmgrd */
67972+ if ((result = init_ktxnmgrd(super)) != 0)
67973+ goto failed_init_ktxnmgrd;
67974+
67975+ /* initialize entd context and start kernel thread entd */
67976+ if ((result = init_entd(super)) != 0)
67977+ goto failed_init_entd;
67978+
67979+ /* initialize address spaces for formatted nodes and bitmaps */
67980+ if ((result = init_formatted_fake(super)) != 0)
67981+ goto failed_init_formatted_fake;
67982+
67983+ /* initialize disk format plugin */
67984+ if ((result = get_super_private(super)->df_plug->init_format(super, data)) != 0 )
67985+ goto failed_init_disk_format;
67986+
67987+ /*
67988+ * There are some 'committed' versions of reiser4 super block counters,
67989+ * which correspond to reiser4 on-disk state. These counters are
67990+ * initialized here
67991+ */
67992+ sbinfo->blocks_free_committed = sbinfo->blocks_free;
67993+ sbinfo->nr_files_committed = oids_used(super);
67994+
67995+ /* get inode of root directory */
67996+ if ((result = init_root_inode(super)) != 0)
67997+ goto failed_init_root_inode;
67998+
67999+ process_safelinks(super);
68000+ reiser4_exit_context(&ctx);
68001+
68002+ sbinfo->debugfs_root = debugfs_create_dir(super->s_id,
68003+ reiser4_debugfs_root);
68004+ if (sbinfo->debugfs_root) {
68005+ sbinfo->tmgr.debugfs_atom_count =
68006+ debugfs_create_u32("atom_count", S_IFREG|S_IRUSR,
68007+ sbinfo->debugfs_root,
68008+ &sbinfo->tmgr.atom_count);
68009+ sbinfo->tmgr.debugfs_id_count =
68010+ debugfs_create_u32("id_count", S_IFREG|S_IRUSR,
68011+ sbinfo->debugfs_root,
68012+ &sbinfo->tmgr.id_count);
68013+ }
68014+ return 0;
68015+
68016+ failed_init_root_inode:
68017+ if (sbinfo->df_plug->release)
68018+ sbinfo->df_plug->release(super);
68019+ failed_init_disk_format:
68020+ done_formatted_fake(super);
68021+ failed_init_formatted_fake:
68022+ done_entd(super);
68023+ failed_init_entd:
68024+ done_ktxnmgrd(super);
68025+ failed_init_ktxnmgrd:
68026+ done_txnmgr(&sbinfo->tmgr);
68027+ failed_init_read_super:
68028+ failed_init_super_data:
68029+ done_fs_info(super);
68030+ failed_init_sinfo:
68031+ reiser4_exit_context(&ctx);
68032+ return result;
68033+}
68034+
68035+/**
68036+ * reiser4_get_sb - get_sb of file_system_type operations
68037+ * @fs_type: file system type
68038+ * @flags: mount flags MS_RDONLY, MS_VERBOSE, etc
68039+ * @dev_name: block device file name
68040+ * @data: specific mount options
68041+ *
68042+ * Reiser4 mount entry.
68043+ */
68044+static struct super_block *reiser4_get_sb(struct file_system_type *fs_type,
68045+ int flags,
68046+ const char *dev_name,
68047+ void *data)
68048+{
68049+ return get_sb_bdev(fs_type, flags, dev_name, data, fill_super);
68050+}
68051+
68052+/* structure describing the reiser4 filesystem implementation */
68053+static struct file_system_type reiser4_fs_type = {
68054+ .owner = THIS_MODULE,
68055+ .name = "reiser4",
68056+ .fs_flags = FS_REQUIRES_DEV,
68057+ .get_sb = reiser4_get_sb,
68058+ .kill_sb = kill_block_super,
68059+ .next = NULL
68060+};
68061+
68062+void destroy_reiser4_cache(kmem_cache_t **cachep)
68063+{
68064+ int result;
68065+
68066+ BUG_ON(*cachep == NULL);
68067+ result = kmem_cache_destroy(*cachep);
68068+ BUG_ON(result != 0);
68069+ *cachep = NULL;
68070+}
68071+
68072+struct dentry *reiser4_debugfs_root = NULL;
68073+
68074+/**
68075+ * init_reiser4 - reiser4 initialization entry point
68076+ *
68077+ * Initializes reiser4 slabs, registers reiser4 filesystem type. It is called
68078+ * on kernel initialization or during reiser4 module load.
68079+ */
68080+static int __init init_reiser4(void)
68081+{
68082+ int result;
68083+
68084+ printk(KERN_INFO
68085+ "Loading Reiser4. "
68086+ "See www.namesys.com for a description of Reiser4.\n");
68087+
68088+ /* initialize slab cache of inodes */
68089+ if ((result = init_inodes()) != 0)
68090+ goto failed_inode_cache;
68091+
68092+ /* initialize cache of znodes */
68093+ if ((result = init_znodes()) != 0)
68094+ goto failed_init_znodes;
68095+
68096+ /* initialize all plugins */
68097+ if ((result = init_plugins()) != 0)
68098+ goto failed_init_plugins;
68099+
68100+ /* initialize cache of plugin_set-s and plugin_set's hash table */
68101+ if ((result = init_plugin_set()) != 0)
68102+ goto failed_init_plugin_set;
68103+
68104+ /* initialize caches of txn_atom-s and txn_handle-s */
68105+ if ((result = init_txnmgr_static()) != 0)
68106+ goto failed_init_txnmgr_static;
68107+
68108+ /* initialize cache of jnodes */
68109+ if ((result = init_jnodes()) != 0)
68110+ goto failed_init_jnodes;
68111+
68112+ /* initialize cache of flush queues */
68113+ if ((result = init_fqs()) != 0)
68114+ goto failed_init_fqs;
68115+
68116+ /* initialize cache of structures attached to dentry->d_fsdata */
68117+ if ((result = init_dentry_fsdata()) != 0)
68118+ goto failed_init_dentry_fsdata;
68119+
68120+ /* initialize cache of structures attached to file->private_data */
68121+ if ((result = init_file_fsdata()) != 0)
68122+ goto failed_init_file_fsdata;
68123+
68124+ /*
68125+ * initialize cache of d_cursors. See plugin/file_ops_readdir.c for
68126+ * more details
68127+ */
68128+ if ((result = init_d_cursor()) != 0)
68129+ goto failed_init_d_cursor;
68130+
68131+ if ((result = register_filesystem(&reiser4_fs_type)) == 0) {
68132+ reiser4_debugfs_root = debugfs_create_dir("reiser4", NULL);
68133+ return 0;
68134+ }
68135+
68136+ done_d_cursor();
68137+ failed_init_d_cursor:
68138+ done_file_fsdata();
68139+ failed_init_file_fsdata:
68140+ done_dentry_fsdata();
68141+ failed_init_dentry_fsdata:
68142+ done_fqs();
68143+ failed_init_fqs:
68144+ done_jnodes();
68145+ failed_init_jnodes:
68146+ done_txnmgr_static();
68147+ failed_init_txnmgr_static:
68148+ done_plugin_set();
68149+ failed_init_plugin_set:
68150+ failed_init_plugins:
68151+ done_znodes();
68152+ failed_init_znodes:
68153+ done_inodes();
68154+ failed_inode_cache:
68155+ return result;
68156+}
68157+
68158+/**
68159+ * done_reiser4 - reiser4 exit entry point
68160+ *
68161+ * Unregisters reiser4 filesystem type and deletes caches. It is called on
68162+ * shutdown or at module unload.
68163+ */
68164+static void __exit done_reiser4(void)
68165+{
68166+ int result;
68167+
68168+ debugfs_remove(reiser4_debugfs_root);
68169+ result = unregister_filesystem(&reiser4_fs_type);
68170+ BUG_ON(result != 0);
68171+ done_d_cursor();
68172+ done_file_fsdata();
68173+ done_dentry_fsdata();
68174+ done_fqs();
68175+ done_jnodes();
68176+ done_txnmgr_static();
68177+ done_plugin_set();
68178+ done_znodes();
68179+ destroy_reiser4_cache(&inode_cache);
68180+}
68181+
68182+module_init(init_reiser4);
68183+module_exit(done_reiser4);
68184+
68185+MODULE_DESCRIPTION("Reiser4 filesystem");
68186+MODULE_AUTHOR("Hans Reiser <Reiser@Namesys.COM>");
68187+
68188+MODULE_LICENSE("GPL");
68189+
68190+/*
68191+ * Local variables:
68192+ * c-indentation-style: "K&R"
68193+ * mode-name: "LC"
68194+ * c-basic-offset: 8
68195+ * tab-width: 8
68196+ * fill-column: 79
68197+ * End:
68198+ */
68199Index: linux-2.6.16/fs/reiser4/tap.c
68200===================================================================
68201--- /dev/null
68202+++ linux-2.6.16/fs/reiser4/tap.c
68203@@ -0,0 +1,377 @@
68204+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
68205+ * reiser4/README */
68206+
68207+/*
68208+ Tree Access Pointer (tap).
68209+
68210+ A tap is a data structure combining a coord and a lock handle (mostly). It
68211+ is useful when one has to scan tree nodes (for example, in readdir or
68212+ flush), because the tap functions allow one to move a tap in either
68213+ direction, transparently crossing unit/item/node borders.
68214+
68215+ A tap doesn't provide automatic synchronization of its fields, as it is
68216+ supposed to be a per-thread object.
68217+*/
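/*
 * Editor's note (not part of the original patch): a minimal usage sketch of
 * the tap API defined below, assuming the caller already holds a long-term
 * lock, described by @lh, on the node that @coord points into:
 *
 *	tap_t tap;
 *	int result;
 *
 *	tap_init(&tap, coord, lh, ZNODE_READ_LOCK);
 *	result = tap_load(&tap);
 *	if (result == 0) {
 *		while (go_next_unit(&tap) == 0) {
 *			... inspect *tap.coord here ...
 *		}
 *		tap_relse(&tap);
 *	}
 *	tap_done(&tap);
 */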
68218+
68219+#include "forward.h"
68220+#include "debug.h"
68221+#include "coord.h"
68222+#include "tree.h"
68223+#include "context.h"
68224+#include "tap.h"
68225+#include "znode.h"
68226+#include "tree_walk.h"
68227+
68228+#if REISER4_DEBUG
68229+static int tap_invariant(const tap_t * tap);
68230+static void tap_check(const tap_t * tap);
68231+#else
68232+#define tap_check(tap) noop
68233+#endif
68234+
68235+/** load node tap is pointing to, if not loaded already */
68236+int tap_load(tap_t * tap)
68237+{
68238+ tap_check(tap);
68239+ if (tap->loaded == 0) {
68240+ int result;
68241+
68242+ result = zload_ra(tap->coord->node, &tap->ra_info);
68243+ if (result != 0)
68244+ return result;
68245+ coord_clear_iplug(tap->coord);
68246+ }
68247+ ++tap->loaded;
68248+ tap_check(tap);
68249+ return 0;
68250+}
68251+
68252+/** release node tap is pointing to. Dual to tap_load() */
68253+void tap_relse(tap_t * tap)
68254+{
68255+ tap_check(tap);
68256+ if (tap->loaded > 0) {
68257+ --tap->loaded;
68258+ if (tap->loaded == 0) {
68259+ zrelse(tap->coord->node);
68260+ }
68261+ }
68262+ tap_check(tap);
68263+}
68264+
68265+/**
68266+ * init tap to consist of @coord and @lh. Locks on nodes will be acquired with
68267+ * @mode
68268+ */
68269+void
68270+tap_init(tap_t * tap, coord_t * coord, lock_handle * lh, znode_lock_mode mode)
68271+{
68272+ tap->coord = coord;
68273+ tap->lh = lh;
68274+ tap->mode = mode;
68275+ tap->loaded = 0;
68276+ INIT_LIST_HEAD(&tap->linkage);
68277+ init_ra_info(&tap->ra_info);
68278+}
68279+
68280+/** add @tap to the per-thread list of all taps */
68281+void tap_monitor(tap_t * tap)
68282+{
68283+ assert("nikita-2623", tap != NULL);
68284+ tap_check(tap);
68285+ list_add(&tap->linkage, taps_list());
68286+ tap_check(tap);
68287+}
68288+
68289+/* duplicate @src into @dst. Copy lock handle. @dst is not initially
68290+ * loaded. */
68291+void tap_copy(tap_t * dst, tap_t * src)
68292+{
68293+ assert("nikita-3193", src != NULL);
68294+ assert("nikita-3194", dst != NULL);
68295+
68296+ *dst->coord = *src->coord;
68297+ if (src->lh->node)
68298+ copy_lh(dst->lh, src->lh);
68299+ dst->mode = src->mode;
68300+ dst->loaded = 0;
68301+ INIT_LIST_HEAD(&dst->linkage);
68302+ dst->ra_info = src->ra_info;
68303+}
68304+
68305+/** finish with @tap */
68306+void tap_done(tap_t * tap)
68307+{
68308+ assert("nikita-2565", tap != NULL);
68309+ tap_check(tap);
68310+ if (tap->loaded > 0)
68311+ zrelse(tap->coord->node);
68312+ done_lh(tap->lh);
68313+ tap->loaded = 0;
68314+ list_del_init(&tap->linkage);
68315+ tap->coord->node = NULL;
68316+}
68317+
68318+/**
68319+ * move @tap to the new node, locked with @target. Load @target, if @tap was
68320+ * already loaded.
68321+ */
68322+int tap_move(tap_t * tap, lock_handle * target)
68323+{
68324+ int result = 0;
68325+
68326+ assert("nikita-2567", tap != NULL);
68327+ assert("nikita-2568", target != NULL);
68328+ assert("nikita-2570", target->node != NULL);
68329+ assert("nikita-2569", tap->coord->node == tap->lh->node);
68330+
68331+ tap_check(tap);
68332+ if (tap->loaded > 0)
68333+ result = zload_ra(target->node, &tap->ra_info);
68334+
68335+ if (result == 0) {
68336+ if (tap->loaded > 0)
68337+ zrelse(tap->coord->node);
68338+ done_lh(tap->lh);
68339+ copy_lh(tap->lh, target);
68340+ tap->coord->node = target->node;
68341+ coord_clear_iplug(tap->coord);
68342+ }
68343+ tap_check(tap);
68344+ return result;
68345+}
68346+
68347+/**
68348+ * move @tap to @target. Acquires a lock on @target; @target is loaded
68349+ * if @tap was already loaded.
68350+ */
68351+static int tap_to(tap_t * tap, znode * target)
68352+{
68353+ int result;
68354+
68355+ assert("nikita-2624", tap != NULL);
68356+ assert("nikita-2625", target != NULL);
68357+
68358+ tap_check(tap);
68359+ result = 0;
68360+ if (tap->coord->node != target) {
68361+ lock_handle here;
68362+
68363+ init_lh(&here);
68364+ result = longterm_lock_znode(&here, target,
68365+ tap->mode, ZNODE_LOCK_HIPRI);
68366+ if (result == 0) {
68367+ result = tap_move(tap, &here);
68368+ done_lh(&here);
68369+ }
68370+ }
68371+ tap_check(tap);
68372+ return result;
68373+}
68374+
68375+/**
68376+ * move @tap to given @target, loading and locking @target->node if
68377+ * necessary
68378+ */
68379+int tap_to_coord(tap_t * tap, coord_t * target)
68380+{
68381+ int result;
68382+
68383+ tap_check(tap);
68384+ result = tap_to(tap, target->node);
68385+ if (result == 0)
68386+ coord_dup(tap->coord, target);
68387+ tap_check(tap);
68388+ return result;
68389+}
68390+
68391+/** return list of all taps */
68392+struct list_head *taps_list(void)
68393+{
68394+ return &get_current_context()->taps;
68395+}
68396+
68397+/** helper function for go_{next,prev}_{item,unit,node}() */
68398+int go_dir_el(tap_t * tap, sideof dir, int units_p)
68399+{
68400+ coord_t dup;
68401+ coord_t *coord;
68402+ int result;
68403+
68404+ int (*coord_dir) (coord_t *);
68405+ int (*get_dir_neighbor) (lock_handle *, znode *, int, int);
68406+ void (*coord_init) (coord_t *, const znode *);
68407+ ON_DEBUG(int (*coord_check) (const coord_t *));
68408+
68409+ assert("nikita-2556", tap != NULL);
68410+ assert("nikita-2557", tap->coord != NULL);
68411+ assert("nikita-2558", tap->lh != NULL);
68412+ assert("nikita-2559", tap->coord->node != NULL);
68413+
68414+ tap_check(tap);
68415+ if (dir == LEFT_SIDE) {
68416+ coord_dir = units_p ? coord_prev_unit : coord_prev_item;
68417+ get_dir_neighbor = reiser4_get_left_neighbor;
68418+ coord_init = coord_init_last_unit;
68419+ } else {
68420+ coord_dir = units_p ? coord_next_unit : coord_next_item;
68421+ get_dir_neighbor = reiser4_get_right_neighbor;
68422+ coord_init = coord_init_first_unit;
68423+ }
68424+ ON_DEBUG(coord_check =
68425+ units_p ? coord_is_existing_unit : coord_is_existing_item);
68426+ assert("nikita-2560", coord_check(tap->coord));
68427+
68428+ coord = tap->coord;
68429+ coord_dup(&dup, coord);
68430+ if (coord_dir(&dup) != 0) {
68431+ do {
68432+			/* move to the neighboring node in direction @dir */
68433+ lock_handle dup;
68434+
68435+ init_lh(&dup);
68436+ result =
68437+ get_dir_neighbor(&dup, coord->node, (int)tap->mode,
68438+ GN_CAN_USE_UPPER_LEVELS);
68439+ if (result == 0) {
68440+ result = tap_move(tap, &dup);
68441+ if (result == 0)
68442+ coord_init(tap->coord, dup.node);
68443+ done_lh(&dup);
68444+ }
68445+ /* skip empty nodes */
68446+ } while ((result == 0) && node_is_empty(coord->node));
68447+ } else {
68448+ result = 0;
68449+ coord_dup(coord, &dup);
68450+ }
68451+ assert("nikita-2564", ergo(!result, coord_check(tap->coord)));
68452+ tap_check(tap);
68453+ return result;
68454+}
68455+
68456+/**
68457+ * move @tap to the next unit, transparently crossing item and node
68458+ * boundaries
68459+ */
68460+int go_next_unit(tap_t * tap)
68461+{
68462+ return go_dir_el(tap, RIGHT_SIDE, 1);
68463+}
68464+
68465+/**
68466+ * move @tap to the previous unit, transparently crossing item and node
68467+ * boundaries
68468+ */
68469+int go_prev_unit(tap_t * tap)
68470+{
68471+ return go_dir_el(tap, LEFT_SIDE, 1);
68472+}
68473+
68474+/**
68475+ * @shift times apply @actor to the @tap. This is used to move @tap by
68476+ * @shift units (or items, or nodes) in either direction.
68477+ */
68478+static int rewind_to(tap_t * tap, go_actor_t actor, int shift)
68479+{
68480+ int result;
68481+
68482+ assert("nikita-2555", shift >= 0);
68483+ assert("nikita-2562", tap->coord->node == tap->lh->node);
68484+
68485+ tap_check(tap);
68486+ result = tap_load(tap);
68487+ if (result != 0)
68488+ return result;
68489+
68490+ for (; shift > 0; --shift) {
68491+ result = actor(tap);
68492+ assert("nikita-2563", tap->coord->node == tap->lh->node);
68493+ if (result != 0)
68494+ break;
68495+ }
68496+ tap_relse(tap);
68497+ tap_check(tap);
68498+ return result;
68499+}
68500+
68501+/** move @tap @shift units rightward */
68502+int rewind_right(tap_t * tap, int shift)
68503+{
68504+ return rewind_to(tap, go_next_unit, shift);
68505+}
68506+
68507+/** move @tap @shift units leftward */
68508+int rewind_left(tap_t * tap, int shift)
68509+{
68510+ return rewind_to(tap, go_prev_unit, shift);
68511+}
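/*
 * Editor's note (not part of the original patch): rewind_right() and
 * rewind_left() do the tap_load()/tap_relse() pairing themselves, so a
 * caller can move an initialized tap by a whole number of units in one
 * call, e.g. rewind_right(&tap, 3) advances the tap three units to the
 * right.
 */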
68512+
68513+#if REISER4_DEBUG
68514+/** debugging function: print @tap content in human readable form */
68515+static void print_tap(const char *prefix, const tap_t * tap)
68516+{
68517+ if (tap == NULL) {
68518+ printk("%s: null tap\n", prefix);
68519+ return;
68520+ }
68521+	printk("%s: loaded: %i, on-list: %i, node: %p, mode: %s\n", prefix,
68522+	       tap->loaded, !(&tap->linkage == tap->linkage.next &&
68523+			      &tap->linkage == tap->linkage.prev),
68524+ tap->lh->node,
68525+ lock_mode_name(tap->mode));
68526+ print_coord("\tcoord", tap->coord, 0);
68527+}
68528+
68529+/** check [tap-sane] invariant */
68530+static int tap_invariant(const tap_t * tap)
68531+{
68532+ /* [tap-sane] invariant */
68533+
68534+ if (tap == NULL)
68535+ return 1;
68536+ /* tap->mode is one of
68537+ *
68538+ * {ZNODE_NO_LOCK, ZNODE_READ_LOCK, ZNODE_WRITE_LOCK}, and
68539+ */
68540+ if (tap->mode != ZNODE_NO_LOCK &&
68541+ tap->mode != ZNODE_READ_LOCK && tap->mode != ZNODE_WRITE_LOCK)
68542+ return 2;
68543+ /* tap->coord != NULL, and */
68544+ if (tap->coord == NULL)
68545+ return 3;
68546+ /* tap->lh != NULL, and */
68547+ if (tap->lh == NULL)
68548+ return 4;
68549+ /* tap->loaded > 0 => znode_is_loaded(tap->coord->node), and */
68550+ if (!ergo(tap->loaded, znode_is_loaded(tap->coord->node)))
68551+ return 5;
68552+ /* tap->coord->node == tap->lh->node if tap->lh->node is not 0 */
68553+ if (tap->lh->node != NULL && tap->coord->node != tap->lh->node)
68554+ return 6;
68555+ return 0;
68556+}
68557+
68558+/** debugging function: check internal @tap consistency */
68559+static void tap_check(const tap_t * tap)
68560+{
68561+ int result;
68562+
68563+ result = tap_invariant(tap);
68564+ if (result != 0) {
68565+ print_tap("broken", tap);
68566+ reiser4_panic("nikita-2831", "tap broken: %i\n", result);
68567+ }
68568+}
68569+#endif
68570+
68571+/* Make Linus happy.
68572+ Local variables:
68573+ c-indentation-style: "K&R"
68574+ mode-name: "LC"
68575+ c-basic-offset: 8
68576+ tab-width: 8
68577+ fill-column: 120
68578+ scroll-step: 1
68579+ End:
68580+*/
68581Index: linux-2.6.16/fs/reiser4/tap.h
68582===================================================================
68583--- /dev/null
68584+++ linux-2.6.16/fs/reiser4/tap.h
68585@@ -0,0 +1,69 @@
68586+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
68587+
68588+/* Tree Access Pointers. See tap.c for more details. */
68589+
68590+#if !defined( __REISER4_TAP_H__ )
68591+#define __REISER4_TAP_H__
68592+
68593+#include "forward.h"
68594+#include "readahead.h"
68595+
68596+/**
68597+ tree_access_pointer aka tap. Data structure combining coord_t and lock
68598+ handle.
68599+ Invariants involving this data-type, see doc/lock-ordering for details:
68600+
68601+ [tap-sane]
68602+ */
68603+struct tree_access_pointer {
68604+ /* coord tap is at */
68605+ coord_t *coord;
68606+ /* lock handle on ->coord->node */
68607+ lock_handle *lh;
68608+ /* mode of lock acquired by this tap */
68609+ znode_lock_mode mode;
68610+ /* incremented by tap_load(). Decremented by tap_relse(). */
68611+ int loaded;
68612+ /* list of taps */
68613+ struct list_head linkage;
68614+ /* read-ahead hint */
68615+ ra_info_t ra_info;
68616+};
68617+
68618+typedef int (*go_actor_t) (tap_t * tap);
68619+
68620+extern int tap_load(tap_t * tap);
68621+extern void tap_relse(tap_t * tap);
68622+extern void tap_init(tap_t * tap, coord_t * coord, lock_handle * lh,
68623+ znode_lock_mode mode);
68624+extern void tap_monitor(tap_t * tap);
68625+extern void tap_copy(tap_t * dst, tap_t * src);
68626+extern void tap_done(tap_t * tap);
68627+extern int tap_move(tap_t * tap, lock_handle * target);
68628+extern int tap_to_coord(tap_t * tap, coord_t * target);
68629+
68630+extern int go_dir_el(tap_t * tap, sideof dir, int units_p);
68631+extern int go_next_unit(tap_t * tap);
68632+extern int go_prev_unit(tap_t * tap);
68633+extern int rewind_right(tap_t * tap, int shift);
68634+extern int rewind_left(tap_t * tap, int shift);
68635+
68636+extern struct list_head *taps_list(void);
68637+
68638+#define for_all_taps(tap) \
68639+ for (tap = list_entry(taps_list()->next, tap_t, linkage); \
68640+ taps_list() != &tap->linkage; \
68641+ tap = list_entry(tap->linkage.next, tap_t, linkage))
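/*
 * Editor's note (not part of the original patch): hypothetical use of
 * for_all_taps, iterating over the taps the current thread registered
 * with tap_monitor():
 *
 *	tap_t *scan;
 *
 *	for_all_taps(scan) {
 *		... examine *scan ...
 *	}
 */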
68642+
68643+/* __REISER4_TAP_H__ */
68644+#endif
68645+/* Make Linus happy.
68646+ Local variables:
68647+ c-indentation-style: "K&R"
68648+ mode-name: "LC"
68649+ c-basic-offset: 8
68650+ tab-width: 8
68651+ fill-column: 120
68652+ scroll-step: 1
68653+ End:
68654+*/
68655Index: linux-2.6.16/fs/reiser4/tree.c
68656===================================================================
68657--- /dev/null
68658+++ linux-2.6.16/fs/reiser4/tree.c
68659@@ -0,0 +1,1875 @@
68660+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
68661+ * reiser4/README */
68662+
68663+/*
68664+ * KEYS IN A TREE.
68665+ *
68666+ * The tree consists of nodes located on the disk. Node in the tree is either
68667+ * formatted or unformatted. Formatted node is one that has structure
68668+ * understood by the tree balancing and traversal code. Formatted nodes are
68669+ * further classified into leaf and internal nodes. The latter distinction is
68670+ * (almost) of only historical importance: the general structure of leaves and
68671+ * internal nodes is the same in Reiser4. Unformatted nodes contain raw data
68672+ * that are part of bodies of ordinary files and attributes.
68673+ *
68674+ * Each node in the tree spans some interval in the key space. Key ranges for
68675+ * all nodes in the tree are disjoint. Actually, this only holds in some weak
68676+ * sense, because of the non-unique keys: intersection of key ranges for
68677+ * different nodes is either empty, or consists of exactly one key.
68678+ *
68679+ * A formatted node consists of a sequence of items. Each item spans some
68680+ * interval in key space. Key ranges for all items in a tree are disjoint,
68681+ * modulo non-unique keys again. Items within nodes are ordered by the
68682+ * smallest key in each item.
68683+ *
68684+ * A particular type of item can be further split into units. A unit is a
68685+ * piece of an item that can be cut out and moved into another item of the
68686+ * same type. Units are used by balancing code to repack data during balancing.
68687+ *
68688+ * Unit can be further split into smaller entities (for example, extent unit
68689+ * represents several pages, and it is natural for extent code to operate on
68690+ * particular pages and even bytes within one unit), but this is of no
68691+ * relevance to the generic balancing and lookup code.
68692+ *
68693+ * Although an item is said to "span" a range or interval of keys, it is not
68694+ * necessary that the item contains a piece of data addressable by each and
68695+ * every key in this range. For example, a compound directory item, consisting
68696+ * of units corresponding to directory entries and keyed by hashes of file
68697+ * names, looks more like having a "discrete spectrum": only some disjoint keys
68698+ * inside the range occupied by this item really address data.
68699+ *
68700+ * Nonetheless, each item always has a well-defined least (minimal) key, which
68701+ * is recorded in the item header, stored in the node this item is in. Also,
68702+ * the item plugin can optionally define a method ->max_key_inside() returning
68703+ * the maximal key that can _possibly_ be located within this item. This method
68704+ * is used (mainly) to determine when a given piece of data should be merged
68705+ * into an existing item, instead of creating a new one. Because of this, even
68706+ * though ->max_key_inside() can be larger than any key actually in the item,
68707+ * intervals
68708+ *
68709+ * [ min_key( item ), ->max_key_inside( item ) ]
68710+ *
68711+ * are still disjoint for all items within the _same_ node.
68712+ *
68713+ * In memory node is represented by znode. It plays several roles:
68714+ *
68715+ * . something locks are taken on
68716+ *
68717+ * . something tracked by transaction manager (this is going to change)
68718+ *
68719+ * . something used to access node data
68720+ *
68721+ * . something used to maintain tree structure in memory: sibling and
68722+ * parental linkage.
68723+ *
68724+ * . something used to organize nodes into "slums"
68725+ *
68726+ * More on znodes see in znode.[ch]
68727+ *
68728+ * DELIMITING KEYS
68729+ *
68730+ * To simplify balancing, allow some flexibility in locking and speed up
68731+ * important coord cache optimization, we keep delimiting keys of nodes in
68732+ * memory. Depending on disk format (implemented by appropriate node plugin)
68733+ * node on disk can record both left and right delimiting key, only one of
68734+ * them, or none. Still, our balancing and tree traversal code keep both
68735+ * delimiting keys for a node that is in memory stored in the znode. When
68736+ * node is first brought into memory during tree traversal, its left
68737+ * delimiting key is taken from its parent, and its right delimiting key is
68738+ * either next key in its parent, or is right delimiting key of parent if
68739+ * node is the rightmost child of parent.
68740+ *
68741+ * Physical consistency of delimiting key is protected by special dk
68742+ * read-write lock. That is, delimiting keys can only be inspected or
68743+ * modified under this lock. But dk lock is only sufficient for fast
68744+ * "pessimistic" check, because to simplify code and to decrease lock
68745+ * contention, balancing (carry) only updates delimiting keys right before
68746+ * unlocking all locked nodes on the given tree level. For example,
68747+ * coord-by-key cache scans LRU list of recently accessed znodes. For each
68748+ * node it first does fast check under dk spin lock. If key looked for is
68749+ * not between delimiting keys for this node, next node is inspected and so
68750+ * on. If key is inside of the key range, long term lock is taken on node
68751+ * and key range is rechecked.
68752+ *
68753+ * COORDINATES
68754+ *
68755+ * To find something in the tree, you supply a key, and the key is resolved
68756+ * by coord_by_key() into a coord (coordinate) that is valid as long as the
68757+ * node the coord points to remains locked. As mentioned above trees
68758+ * consist of nodes that consist of items that consist of units. A unit is
68759+ * the smallest and indivisible piece of tree as far as balancing and tree
68760+ * search are concerned. Each node, item, and unit can be addressed by
68761+ * giving its level in the tree and the key occupied by this entity. A node
68762+ * knows what the key ranges are of the items within it, and how to find its
68763+ * items and invoke their item handlers, but it does not know how to access
68764+ * individual units within its items except through the item handlers.
68765+ * coord is a structure containing a pointer to the node, the ordinal number
68766+ * of the item within this node (a sort of item offset), and the ordinal
68767+ * number of the unit within this item.
68768+ *
68769+ * TREE LOOKUP
68770+ *
68771+ * There are two types of access to the tree: lookup and modification.
68772+ *
68773+ * Lookup is a search for the key in the tree. Search can look for either
68774+ * exactly the key given to it, or for the largest key that is not greater
68775+ * than the key given to it. This distinction is determined by "bias"
68776+ * parameter of search routine (coord_by_key()). coord_by_key() either
68777+ * returns error (key is not in the tree, or some kind of external error
68778+ * occurred), or successfully resolves key into coord.
68779+ *
68780+ * This resolution is done by traversing tree top-to-bottom from root level
68781+ * to the desired level. On levels above twig level (level one above the
68782+ * leaf level) nodes consist exclusively of internal items. Internal item is
68783+ * nothing more than pointer to the tree node on the child level. On twig
68784+ * level nodes consist of internal items intermixed with extent
68785+ * items. Internal items form the normal search tree structure used by
68786+ * traversal to descend through the tree.
68787+ *
68788+ * TREE LOOKUP OPTIMIZATIONS
68789+ *
68790+ * The tree lookup described above is expensive even if all nodes traversed
68791+ * are already in memory: a binary search has to be performed within each
68792+ * node, and binary searches are CPU-consuming and tend to destroy CPU
68793+ * caches.
68794+ *
68795+ * Several optimizations are used to work around this:
68796+ *
68797+ * . cbk_cache (look-aside cache for tree traversals, see search.c for
68798+ * details)
68799+ *
68800+ * . seals (see seal.[ch])
68801+ *
68802+ * . vroot (see search.c)
68803+ *
68804+ * General search-by-key is layered thusly:
68805+ *
68806+ * [check seal, if any] --ok--> done
68807+ * |
68808+ * failed
68809+ * |
68810+ * V
68811+ * [vroot defined] --no--> node = tree_root
68812+ * | |
68813+ * yes |
68814+ * | |
68815+ * V |
68816+ * node = vroot |
68817+ * | |
68818+ * | |
68819+ * | |
68820+ * V V
68821+ * [check cbk_cache for key] --ok--> done
68822+ * |
68823+ * failed
68824+ * |
68825+ * V
68826+ * [start tree traversal from node]
68827+ *
68828+ */
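/*
 * Editor's note (not part of the original patch): a minimal lookup sketch
 * under the scheme above. Flag and level values are illustrative; they
 * resolve @key into a read-locked coord on the leaf level:
 *
 *	coord_t coord;
 *	lock_handle lh;
 *	int result;
 *
 *	init_lh(&lh);
 *	result = coord_by_key(tree, key, &coord, &lh, ZNODE_READ_LOCK,
 *			      FIND_EXACT, LEAF_LEVEL, LEAF_LEVEL,
 *			      0, NULL);
 *	if (result == CBK_COORD_FOUND) {
 *		... coord now addresses the sought unit ...
 *	}
 *	done_lh(&lh);
 */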
68829+
68830+#include "forward.h"
68831+#include "debug.h"
68832+#include "dformat.h"
68833+#include "key.h"
68834+#include "coord.h"
68835+#include "plugin/item/static_stat.h"
68836+#include "plugin/item/item.h"
68837+#include "plugin/node/node.h"
68838+#include "plugin/plugin.h"
68839+#include "txnmgr.h"
68840+#include "jnode.h"
68841+#include "znode.h"
68842+#include "block_alloc.h"
68843+#include "tree_walk.h"
68844+#include "carry.h"
68845+#include "carry_ops.h"
68846+#include "tap.h"
68847+#include "tree.h"
68848+#include "vfs_ops.h"
68849+#include "page_cache.h"
68850+#include "super.h"
68851+#include "reiser4.h"
68852+#include "inode.h"
68853+
68854+#include <linux/fs.h> /* for struct super_block */
68855+#include <linux/spinlock.h>
68856+
68857+/* Disk address (block number) that is never used for any real tree node; it
68858+ serves as the block number of the "uber" znode.
68859+
68860+ Invalid block addresses are 0 by tradition.
68861+
68862+*/
68863+const reiser4_block_nr UBER_TREE_ADDR = 0ull;
68864+
68865+#define CUT_TREE_MIN_ITERATIONS 64
68866+
68867+static int find_child_by_addr(znode * parent, znode * child, coord_t * result);
68868+
68869+/* return node plugin of coord->node */
68870+node_plugin *node_plugin_by_coord(const coord_t * coord)
68871+{
68872+ assert("vs-1", coord != NULL);
68873+ assert("vs-2", coord->node != NULL);
68874+
68875+ return coord->node->nplug;
68876+}
68877+
68878+/* insert item into tree. Fields of @coord are updated so that they can be
68879+ * used by a subsequent insert operation. */
68880+insert_result insert_by_key(reiser4_tree * tree /* tree to insert new item
68881+ * into */ ,
68882+ const reiser4_key * key /* key of new item */ ,
68883+ reiser4_item_data * data /* parameters for item
68884+ * creation */ ,
68885+ coord_t * coord /* resulting insertion coord */ ,
68886+ lock_handle * lh /* resulting lock
68887+ * handle */ ,
68888+			    tree_level stop_level /* level where to insert */ ,
68889+ __u32 flags /* insertion flags */ )
68890+{
68891+ int result;
68892+
68893+ assert("nikita-358", tree != NULL);
68894+ assert("nikita-360", coord != NULL);
68895+
68896+ result = coord_by_key(tree, key, coord, lh, ZNODE_WRITE_LOCK,
68897+ FIND_EXACT, stop_level, stop_level,
68898+ flags | CBK_FOR_INSERT, NULL /*ra_info */ );
68899+ switch (result) {
68900+ default:
68901+ break;
68902+ case CBK_COORD_FOUND:
68903+ result = IBK_ALREADY_EXISTS;
68904+ break;
68905+ case CBK_COORD_NOTFOUND:
68906+ assert("nikita-2017", coord->node != NULL);
68907+ result = insert_by_coord(coord, data, key, lh, 0 /*flags */ );
68908+ break;
68909+ }
68910+ return result;
68911+}
68912+
68913+/* insert item by calling carry. Helper function called if short-cut
68914+ insertion fails */
68915+static insert_result insert_with_carry_by_coord(coord_t * coord, /* coord where to insert */
68916+ lock_handle * lh, /* lock handle of insertion
68917+ * node */
68918+ reiser4_item_data * data, /* parameters of new
68919+ * item */
68920+ const reiser4_key * key, /* key of new item */
68921+ carry_opcode cop, /* carry operation to perform */
68922+ cop_insert_flag flags
68923+ /* carry flags */ )
68924+{
68925+ int result;
68926+ carry_pool *pool;
68927+ carry_level *lowest_level;
68928+ carry_insert_data *cdata;
68929+ carry_op *op;
68930+
68931+ assert("umka-314", coord != NULL);
68932+
68933+	/* allocate carry_pool, 3 carry_level-s and carry_insert_data in one chunk */
68934+ pool =
68935+ init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
68936+ sizeof(*cdata));
68937+ if (IS_ERR(pool))
68938+ return PTR_ERR(pool);
68939+ lowest_level = (carry_level *) (pool + 1);
68940+ init_carry_level(lowest_level, pool);
68941+
68942+ op = post_carry(lowest_level, cop, coord->node, 0);
68943+ if (IS_ERR(op) || (op == NULL)) {
68944+ done_carry_pool(pool);
68945+ return RETERR(op ? PTR_ERR(op) : -EIO);
68946+ }
68947+ cdata = (carry_insert_data *) (lowest_level + 3);
68948+ cdata->coord = coord;
68949+ cdata->data = data;
68950+ cdata->key = key;
68951+ op->u.insert.d = cdata;
68952+ if (flags == 0)
68953+ flags = znode_get_tree(coord->node)->carry.insert_flags;
68954+ op->u.insert.flags = flags;
68955+ op->u.insert.type = COPT_ITEM_DATA;
68956+ op->u.insert.child = NULL;
68957+ if (lh != NULL) {
68958+ assert("nikita-3245", lh->node == coord->node);
68959+ lowest_level->track_type = CARRY_TRACK_CHANGE;
68960+ lowest_level->tracked = lh;
68961+ }
68962+
68963+ result = carry(lowest_level, NULL);
68964+ done_carry_pool(pool);
68965+
68966+ return result;
68967+}
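/*
 * Editor's note (not part of the original patch): the single allocation in
 * insert_with_carry_by_coord() above packs several objects into one chunk;
 * the pointer arithmetic (pool + 1, lowest_level + 3) implies this layout:
 *
 *	[ carry_pool | carry_level[3] | carry_insert_data ]
 *	  ^pool        ^lowest_level    ^cdata
 */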
68968+
68969+/* form carry queue to perform paste of @data with @key at @coord, and launch
68970+ its execution by calling carry().
68971+
68972+ Instruct carry to update @lh if, after balancing, the insertion coord
68973+ moves into a different block.
68974+
68975+*/
68976+static int paste_with_carry(coord_t * coord, /* coord of paste */
68977+ lock_handle * lh, /* lock handle of node
68978+ * where item is
68979+ * pasted */
68980+ reiser4_item_data * data, /* parameters of new
68981+ * item */
68982+ const reiser4_key * key, /* key of new item */
68983+ unsigned flags /* paste flags */ )
68984+{
68985+ int result;
68986+ carry_pool *pool;
68987+ carry_level *lowest_level;
68988+ carry_insert_data *cdata;
68989+ carry_op *op;
68990+
68991+ assert("umka-315", coord != NULL);
68992+ assert("umka-316", key != NULL);
68993+
68994+ pool =
68995+ init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
68996+ sizeof(*cdata));
68997+ if (IS_ERR(pool))
68998+ return PTR_ERR(pool);
68999+ lowest_level = (carry_level *) (pool + 1);
69000+ init_carry_level(lowest_level, pool);
69001+
69002+ op = post_carry(lowest_level, COP_PASTE, coord->node, 0);
69003+ if (IS_ERR(op) || (op == NULL)) {
69004+ done_carry_pool(pool);
69005+ return RETERR(op ? PTR_ERR(op) : -EIO);
69006+ }
69007+ cdata = (carry_insert_data *) (lowest_level + 3);
69008+ cdata->coord = coord;
69009+ cdata->data = data;
69010+ cdata->key = key;
69011+ op->u.paste.d = cdata;
69012+ if (flags == 0)
69013+ flags = znode_get_tree(coord->node)->carry.paste_flags;
69014+ op->u.paste.flags = flags;
69015+ op->u.paste.type = COPT_ITEM_DATA;
69016+ if (lh != NULL) {
69017+ lowest_level->track_type = CARRY_TRACK_CHANGE;
69018+ lowest_level->tracked = lh;
69019+ }
69020+
69021+ result = carry(lowest_level, NULL);
69022+ done_carry_pool(pool);
69023+
69024+ return result;
69025+}
69026+
69027+/* insert item at the given coord.
69028+
69029+ First try to skip carry by directly calling the ->create_item() method of
69030+ the node plugin. If this is impossible (there is not enough free space in
69031+ the node, or the item is being created at the leftmost position in the
69032+ node), call insert_with_carry_by_coord(), which does a full carry().
69033+
69034+*/
69035+insert_result insert_by_coord(coord_t * coord /* coord where to
69036+ * insert. coord->node has
69037+ * to be write locked by
69038+ * caller */ ,
69039+ reiser4_item_data * data /* data to be
69040+ * inserted */ ,
69041+ const reiser4_key * key /* key of new item */ ,
69042+ lock_handle * lh /* lock handle of write
69043+ * lock on node */ ,
69044+ __u32 flags /* insertion flags */ )
69045+{
69046+ unsigned item_size;
69047+ int result;
69048+ znode *node;
69049+
69050+ assert("vs-247", coord != NULL);
69051+ assert("vs-248", data != NULL);
69052+ assert("vs-249", data->length >= 0);
69053+ assert("nikita-1191", znode_is_write_locked(coord->node));
69054+
69055+ node = coord->node;
69056+ coord_clear_iplug(coord);
69057+ result = zload(node);
69058+ if (result != 0)
69059+ return result;
69060+
69061+ item_size = space_needed(node, NULL, data, 1);
69062+ if (item_size > znode_free_space(node) &&
69063+ (flags & COPI_DONT_SHIFT_LEFT) && (flags & COPI_DONT_SHIFT_RIGHT)
69064+ && (flags & COPI_DONT_ALLOCATE)) {
69065+ /* we are forced to use free space of coord->node and new item
69066+ does not fit into it.
69067+
69068+ Currently we get here only when we allocate and copy units
69069+ of extent item from a node to its left neighbor during
69070+ "squalloc"-ing. If @node (this is left neighbor) does not
69071+ have enough free space - we do not want to attempt any
69072+ shifting and allocations because we are in squeezing and
69073+ everything to the left of @node is tightly packed.
69074+ */
69075+ result = -E_NODE_FULL;
69076+ } else if ((item_size <= znode_free_space(node)) &&
69077+ !coord_is_before_leftmost(coord) &&
69078+ (node_plugin_by_node(node)->fast_insert != NULL)
69079+ && node_plugin_by_node(node)->fast_insert(coord)) {
69080+ /* shortcut insertion without carry() overhead.
69081+
69082+ Only possible if:
69083+
69084+ - there is enough free space
69085+
69086+ - insertion is not into the leftmost position in a node
69087+ (otherwise it would require updating of delimiting key in a
69088+ parent)
69089+
69090+ - node plugin agrees with this
69091+
69092+ */
69093+ result =
69094+ node_plugin_by_node(node)->create_item(coord, key, data,
69095+ NULL);
69096+ znode_make_dirty(node);
69097+ } else {
69098+ /* otherwise do full-fledged carry(). */
69099+ result =
69100+ insert_with_carry_by_coord(coord, lh, data, key, COP_INSERT,
69101+ flags);
69102+ }
69103+ zrelse(node);
69104+ return result;
69105+}
69106+
69107+/* @coord is set to leaf level and @data is to be inserted to twig level */
69108+insert_result
69109+insert_extent_by_coord(coord_t * coord /* coord where to insert.
69110+					 * coord->node has to be write
69111+					 * locked by caller */ ,
69112+		       reiser4_item_data * data /* data to be inserted */ ,
69113+		       const reiser4_key * key /* key of new item */ ,
69114+		       lock_handle * lh /* lock handle of write lock
69115+					 * on node */ )
69117+{
69118+ assert("vs-405", coord != NULL);
69119+ assert("vs-406", data != NULL);
69120+ assert("vs-407", data->length > 0);
69121+ assert("vs-408", znode_is_write_locked(coord->node));
69122+ assert("vs-409", znode_get_level(coord->node) == LEAF_LEVEL);
69123+
69124+ return insert_with_carry_by_coord(coord, lh, data, key, COP_EXTENT,
69125+ 0 /*flags */ );
69126+}
69127+
69128+/* Insert into the item at the given coord.
69129+
69130+ First try to skip carry by directly calling ->paste() method of item
69131+ plugin. If this is impossible (there is not enough free space in the node,
69132+ or we are pasting into leftmost position in the node), call
69133+ paste_with_carry() that will do full carry().
69134+
69135+*/
69136+/* paste_into_item */
69137+int insert_into_item(coord_t * coord /* coord of pasting */ ,
69138+ lock_handle * lh /* lock handle on node involved */ ,
69139+ const reiser4_key * key /* key of unit being pasted */ ,
69140+ reiser4_item_data * data /* parameters for new unit */ ,
69141+ unsigned flags /* insert/paste flags */ )
69142+{
69143+ int result;
69144+ int size_change;
69145+ node_plugin *nplug;
69146+ item_plugin *iplug;
69147+
69148+ assert("umka-317", coord != NULL);
69149+ assert("umka-318", key != NULL);
69150+
69151+ iplug = item_plugin_by_coord(coord);
69152+ nplug = node_plugin_by_coord(coord);
69153+
69154+ assert("nikita-1480", iplug == data->iplug);
69155+
69156+ size_change = space_needed(coord->node, coord, data, 0);
69157+ if (size_change > (int)znode_free_space(coord->node) &&
69158+ (flags & COPI_DONT_SHIFT_LEFT) && (flags & COPI_DONT_SHIFT_RIGHT)
69159+ && (flags & COPI_DONT_ALLOCATE)) {
69160+ /* we are forced to use free space of coord->node and new data
69161+ does not fit into it. */
69162+ return -E_NODE_FULL;
69163+ }
69164+
69165+ /* shortcut paste without carry() overhead.
69166+
69167+ Only possible if:
69168+
69169+ - there is enough free space
69170+
69171+ - paste is not into the leftmost unit in a node (otherwise
69172+ it would require updating of delimiting key in a parent)
69173+
69174+ - node plugin agrees with this
69175+
69176+ - item plugin agrees with us
69177+ */
69178+ if (size_change <= (int)znode_free_space(coord->node) &&
69179+ (coord->item_pos != 0 ||
69180+ coord->unit_pos != 0 || coord->between == AFTER_UNIT) &&
69181+ coord->unit_pos != 0 && nplug->fast_paste != NULL &&
69182+ nplug->fast_paste(coord) &&
69183+ iplug->b.fast_paste != NULL && iplug->b.fast_paste(coord)) {
69184+ if (size_change > 0)
69185+ nplug->change_item_size(coord, size_change);
69186+ /* NOTE-NIKITA: huh? where @key is used? */
69187+ result = iplug->b.paste(coord, data, NULL);
69188+ if (size_change < 0)
69189+ nplug->change_item_size(coord, size_change);
69190+ znode_make_dirty(coord->node);
69191+ } else
69192+ /* otherwise do full-fledged carry(). */
69193+ result = paste_with_carry(coord, lh, data, key, flags);
69194+ return result;
69195+}
69196+
69197+/* this either appends or truncates item @coord */
69198+int resize_item(coord_t * coord /* coord of item being resized */ ,
69199+ reiser4_item_data * data /* parameters of resize */ ,
69200+ reiser4_key * key /* key of new unit */ ,
69201+ lock_handle * lh /* lock handle of node
69202+ * being modified */ ,
69203+ cop_insert_flag flags /* carry flags */ )
69204+{
69205+ int result;
69206+ znode *node;
69207+
69208+ assert("nikita-362", coord != NULL);
69209+ assert("nikita-363", data != NULL);
69210+ assert("vs-245", data->length != 0);
69211+
69212+ node = coord->node;
69213+ coord_clear_iplug(coord);
69214+ result = zload(node);
69215+ if (result != 0)
69216+ return result;
69217+
69218+ if (data->length < 0)
69219+ result = node_plugin_by_coord(coord)->shrink_item(coord,
69220+ -data->length);
69221+ else
69222+ result = insert_into_item(coord, lh, key, data, flags);
69223+
69224+ zrelse(node);
69225+ return result;
69226+}
69227+
69228+/* insert flow @f */
69229+int insert_flow(coord_t * coord, lock_handle * lh, flow_t * f)
69230+{
69231+ int result;
69232+ carry_pool *pool;
69233+ carry_level *lowest_level;
69234+ reiser4_item_data *data;
69235+ carry_op *op;
69236+
69237+ pool =
69238+ init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
69239+ sizeof(*data));
69240+ if (IS_ERR(pool))
69241+ return PTR_ERR(pool);
69242+ lowest_level = (carry_level *) (pool + 1);
69243+ init_carry_level(lowest_level, pool);
69244+
69245+ op = post_carry(lowest_level, COP_INSERT_FLOW, coord->node,
69246+ 0 /* operate directly on coord -> node */ );
69247+ if (IS_ERR(op) || (op == NULL)) {
69248+ done_carry_pool(pool);
69249+ return RETERR(op ? PTR_ERR(op) : -EIO);
69250+ }
69251+
69252+ /* these are permanent during insert_flow */
69253+ data = (reiser4_item_data *) (lowest_level + 3);
69254+ data->user = 1;
69255+ data->iplug = item_plugin_by_id(FORMATTING_ID);
69256+ data->arg = NULL;
69257+ /* data.length and data.data will be set before calling paste or
69258+ insert */
69259+ data->length = 0;
69260+ data->data = NULL;
69261+
69262+ op->u.insert_flow.flags = 0;
69263+ op->u.insert_flow.insert_point = coord;
69264+ op->u.insert_flow.flow = f;
69265+ op->u.insert_flow.data = data;
69266+ op->u.insert_flow.new_nodes = 0;
69267+
69268+ lowest_level->track_type = CARRY_TRACK_CHANGE;
69269+ lowest_level->tracked = lh;
69270+
69271+ result = carry(lowest_level, NULL);
69272+ done_carry_pool(pool);
69273+
69274+ return result;
69275+}
69276+
69277+/* Given a coord in parent node, obtain a znode for the corresponding child */
69278+znode *child_znode(const coord_t * parent_coord /* coord of pointer to
69279+ * child */ ,
69280+ znode * parent /* parent of child */ ,
69281+ int incore_p /* if !0 only return child if already in
69282+ * memory */ ,
69283+ int setup_dkeys_p /* if !0 update delimiting keys of
69284+ * child */ )
69285+{
69286+ znode *child;
69287+
69288+ assert("nikita-1374", parent_coord != NULL);
69289+ assert("nikita-1482", parent != NULL);
69290+#if REISER4_DEBUG
69291+ if (setup_dkeys_p)
69292+ assert_rw_not_locked(&(znode_get_tree(parent)->dk_lock));
69293+#endif
69294+ assert("nikita-2947", znode_is_any_locked(parent));
69295+
69296+ if (znode_get_level(parent) <= LEAF_LEVEL) {
69297+ /* trying to get child of leaf node */
69298+ warning("nikita-1217", "Child of maize?");
69299+ return ERR_PTR(RETERR(-EIO));
69300+ }
69301+ if (item_is_internal(parent_coord)) {
69302+ reiser4_block_nr addr;
69303+ item_plugin *iplug;
69304+ reiser4_tree *tree;
69305+
69306+ iplug = item_plugin_by_coord(parent_coord);
69307+ assert("vs-512", iplug->s.internal.down_link);
69308+ iplug->s.internal.down_link(parent_coord, NULL, &addr);
69309+
69310+ tree = znode_get_tree(parent);
69311+ if (incore_p)
69312+ child = zlook(tree, &addr);
69313+ else
69314+ child =
69315+ zget(tree, &addr, parent,
69316+ znode_get_level(parent) - 1, get_gfp_mask());
69317+ if ((child != NULL) && !IS_ERR(child) && setup_dkeys_p)
69318+ set_child_delimiting_keys(parent, parent_coord, child);
69319+ } else {
69320+ warning("nikita-1483", "Internal item expected");
69321+ child = ERR_PTR(RETERR(-EIO));
69322+ }
69323+ return child;
69324+}
69325+
69326+/* remove znode from transaction */
69327+static void uncapture_znode(znode * node)
69328+{
69329+ struct page *page;
69330+
69331+ assert("zam-1001", ZF_ISSET(node, JNODE_HEARD_BANSHEE));
69332+
69333+ if (!blocknr_is_fake(znode_get_block(node))) {
69334+ int ret;
69335+
69336+ /* An already allocated block goes right to the atom's delete set. */
69337+ ret =
69338+ reiser4_dealloc_block(znode_get_block(node), 0,
69339+ BA_DEFER | BA_FORMATTED);
69340+ if (ret)
69341+ warning("zam-942",
69342+				"can't add block number (%llu) to atom's delete set\n",
69343+ (unsigned long long)(*znode_get_block(node)));
69344+
69345+ spin_lock_znode(node);
69346+ /* Here we return flush reserved block which was reserved at the
69347+ * moment when this allocated node was marked dirty and still
69348+ * not used by flush in node relocation procedure. */
69349+ if (ZF_ISSET(node, JNODE_FLUSH_RESERVED)) {
69350+ txn_atom *atom;
69351+
69352+ atom = jnode_get_atom(ZJNODE(node));
69353+ assert("zam-939", atom != NULL);
69354+ spin_unlock_znode(node);
69355+ flush_reserved2grabbed(atom, (__u64) 1);
69356+ spin_unlock_atom(atom);
69357+ } else
69358+ spin_unlock_znode(node);
69359+ } else {
69360+		/* znode has an assigned block which is counted as "fake
69361+		   allocated". Return it back to "free blocks". */
69362+ fake_allocated2free((__u64) 1, BA_FORMATTED);
69363+ }
69364+
69365+ /*
69366+ * uncapture page from transaction. There is a possibility of a race
69367+ * with ->releasepage(): reiser4_releasepage() detaches page from this
69368+ * jnode and we have nothing to uncapture. To avoid this, get
69369+ * reference of node->pg under jnode spin lock. uncapture_page() will
69370+ * deal with released page itself.
69371+ */
69372+ spin_lock_znode(node);
69373+ page = znode_page(node);
69374+ if (likely(page != NULL)) {
69375+ /*
69376+ * uncapture_page() can only be called when we are sure that
69377+ * znode is pinned in memory, which we are, because
69378+ * forget_znode() is only called from longterm_unlock_znode().
69379+ */
69380+ page_cache_get(page);
69381+ spin_unlock_znode(node);
69382+ lock_page(page);
69383+ uncapture_page(page);
69384+ unlock_page(page);
69385+ page_cache_release(page);
69386+ } else {
69387+ txn_atom *atom;
69388+
69389+ /* handle "flush queued" znodes */
69390+ while (1) {
69391+ atom = jnode_get_atom(ZJNODE(node));
69392+ assert("zam-943", atom != NULL);
69393+
69394+ if (!ZF_ISSET(node, JNODE_FLUSH_QUEUED)
69395+ || !atom->nr_running_queues)
69396+ break;
69397+
69398+ spin_unlock_znode(node);
69399+ atom_wait_event(atom);
69400+ spin_lock_znode(node);
69401+ }
69402+
69403+ uncapture_block(ZJNODE(node));
69404+ spin_unlock_atom(atom);
69405+ zput(node);
69406+ }
69407+}
69408+
69409+/* This is called from longterm_unlock_znode() when last lock is released from
69410+ the node that has been removed from the tree. At this point node is removed
69411+ from sibling list and its lock is invalidated. */
69412+void forget_znode(lock_handle * handle)
69413+{
69414+ znode *node;
69415+ reiser4_tree *tree;
69416+
69417+ assert("umka-319", handle != NULL);
69418+
69419+ node = handle->node;
69420+ tree = znode_get_tree(node);
69421+
69422+ assert("vs-164", znode_is_write_locked(node));
69423+ assert("nikita-1280", ZF_ISSET(node, JNODE_HEARD_BANSHEE));
69424+ assert_rw_locked(&(node->lock.guard));
69425+
69426+ /* We assume that this node was detached from its parent before
69427+ * unlocking, it gives no way to reach this node from parent through a
69428+ * down link. The node should have no children and, thereby, can't be
69429+ * reached from them by their parent pointers. The only way to obtain a
69430+ * reference to the node is to use sibling pointers from its left and
69431+ * right neighbors. In the next several lines we remove the node from
69432+ * the sibling list. */
69433+
69434+ write_lock_tree(tree);
69435+ sibling_list_remove(node);
69436+ znode_remove(node, tree);
69437+ write_unlock_tree(tree);
69438+
69439+ /* Here we set JNODE_DYING and cancel all pending lock requests. It
69440+ * forces all lock requestor threads to repeat iterations of getting
69441+ * lock on a child, neighbor or parent node. But, those threads can't
69442+ * come to this node again, because this node is no longer a child,
69443+ * neighbor or parent of any other node. This order of znode
69444+ * invalidation does not allow other threads to waste cpu time is a busy
69445+	 * invalidation does not allow other threads to waste cpu time in a busy
69446+	 * loop, trying to lock a dying object. The exception is in the flush
69447+ invalidate_lock(handle);
69448+ uncapture_znode(node);
69449+}
69450+
69451+/* Check that internal item at @pointer really contains pointer to @child. */
69452+int check_tree_pointer(const coord_t * pointer /* would-be pointer to
69453+ * @child */ ,
69454+ const znode * child /* child znode */ )
69455+{
69456+ assert("nikita-1016", pointer != NULL);
69457+ assert("nikita-1017", child != NULL);
69458+ assert("nikita-1018", pointer->node != NULL);
69459+
69460+ assert("nikita-1325", znode_is_any_locked(pointer->node));
69461+
69462+ assert("nikita-2985",
69463+ znode_get_level(pointer->node) == znode_get_level(child) + 1);
69464+
69465+ coord_clear_iplug((coord_t *) pointer);
69466+
69467+ if (coord_is_existing_unit(pointer)) {
69468+ item_plugin *iplug;
69469+ reiser4_block_nr addr;
69470+
69471+ if (item_is_internal(pointer)) {
69472+ iplug = item_plugin_by_coord(pointer);
69473+ assert("vs-513", iplug->s.internal.down_link);
69474+ iplug->s.internal.down_link(pointer, NULL, &addr);
69475+ /* check that cached value is correct */
69476+ if (disk_addr_eq(&addr, znode_get_block(child))) {
69477+ return NS_FOUND;
69478+ }
69479+ }
69480+ }
69481+ /* warning ("jmacd-1002", "tree pointer incorrect"); */
69482+ return NS_NOT_FOUND;
69483+}
69484+
69485+/* find coord of pointer to new @child in @parent.
69486+
69487+   Find the &coord_t in @parent where the pointer to a given @child
69488+   will be placed.
69489+
69490+*/
69491+int find_new_child_ptr(znode * parent /* parent znode, passed locked */ ,
69492+ znode *
69493+ child UNUSED_ARG /* child znode, passed locked */ ,
69494+ znode * left /* left brother of new node */ ,
69495+ coord_t * result /* where result is stored in */ )
69496+{
69497+ int ret;
69498+
69499+ assert("nikita-1486", parent != NULL);
69500+ assert("nikita-1487", child != NULL);
69501+ assert("nikita-1488", result != NULL);
69502+
69503+ ret = find_child_ptr(parent, left, result);
69504+ if (ret != NS_FOUND) {
69505+ warning("nikita-1489", "Cannot find brother position: %i", ret);
69506+ return RETERR(-EIO);
69507+ } else {
69508+ result->between = AFTER_UNIT;
69509+ return RETERR(NS_NOT_FOUND);
69510+ }
69511+}
69512+
69513+/* find coord of pointer to @child in @parent.
69514+
69515+   Find the &coord_t in @parent where the pointer to a given @child is.
69516+
69517+*/
69518+int find_child_ptr(znode * parent /* parent znode, passed locked */ ,
69519+ znode * child /* child znode, passed locked */ ,
69520+ coord_t * result /* where result is stored in */ )
69521+{
69522+ int lookup_res;
69523+ node_plugin *nplug;
69524+ /* left delimiting key of a child */
69525+ reiser4_key ld;
69526+ reiser4_tree *tree;
69527+
69528+ assert("nikita-934", parent != NULL);
69529+ assert("nikita-935", child != NULL);
69530+ assert("nikita-936", result != NULL);
69531+ assert("zam-356", znode_is_loaded(parent));
69532+
69533+ coord_init_zero(result);
69534+ result->node = parent;
69535+
69536+ nplug = parent->nplug;
69537+ assert("nikita-939", nplug != NULL);
69538+
69539+ tree = znode_get_tree(parent);
69540+ /* NOTE-NIKITA taking read-lock on tree here assumes that @result is
69541+ * not aliased to ->in_parent of some znode. Otherwise,
69542+ * parent_coord_to_coord() below would modify data protected by tree
69543+ * lock. */
69544+ read_lock_tree(tree);
69545+ /* fast path. Try to use cached value. Lock tree to keep
69546+ node->pos_in_parent and pos->*_blocknr consistent. */
69547+ if (child->in_parent.item_pos + 1 != 0) {
69548+ parent_coord_to_coord(&child->in_parent, result);
69549+ if (check_tree_pointer(result, child) == NS_FOUND) {
69550+ read_unlock_tree(tree);
69551+ return NS_FOUND;
69552+ }
69553+
69554+ child->in_parent.item_pos = (unsigned short)~0;
69555+ }
69556+ read_unlock_tree(tree);
69557+
69558+	/* if the above failed, find some key from @child. We are looking for the
69559+ least key in a child. */
69560+ read_lock_dk(tree);
69561+ ld = *znode_get_ld_key(child);
69562+ read_unlock_dk(tree);
69563+ /*
69564+ * now, lookup parent with key just found. Note, that left delimiting
69565+ * key doesn't identify node uniquely, because (in extremely rare
69566+ * case) two nodes can have equal left delimiting keys, if one of them
69567+ * is completely filled with directory entries that all happened to be
69568+	 * hash collisions. But we check the block number in check_tree_pointer()
69569+ * and, so, are safe.
69570+ */
69571+ lookup_res = nplug->lookup(parent, &ld, FIND_EXACT, result);
69572+ /* update cached pos_in_node */
69573+ if (lookup_res == NS_FOUND) {
69574+ write_lock_tree(tree);
69575+ coord_to_parent_coord(result, &child->in_parent);
69576+ write_unlock_tree(tree);
69577+ lookup_res = check_tree_pointer(result, child);
69578+ }
69579+ if (lookup_res == NS_NOT_FOUND)
69580+ lookup_res = find_child_by_addr(parent, child, result);
69581+ return lookup_res;
69582+}
69583+
69584+/* find coord of pointer to @child in @parent by scanning
69585+
69586+ Find the &coord_t in the @parent where pointer to a given @child
69587+ is in by scanning all internal items in @parent and comparing block
69588+ numbers in them with that of @child.
69589+
69590+*/
69591+static int find_child_by_addr(znode * parent /* parent znode, passed locked */ ,
69592+ znode * child /* child znode, passed locked */ ,
69593+ coord_t * result /* where result is stored in */ )
69594+{
69595+ int ret;
69596+
69597+ assert("nikita-1320", parent != NULL);
69598+ assert("nikita-1321", child != NULL);
69599+ assert("nikita-1322", result != NULL);
69600+
69601+ ret = NS_NOT_FOUND;
69602+
69603+ for_all_units(result, parent) {
69604+ if (check_tree_pointer(result, child) == NS_FOUND) {
69605+ write_lock_tree(znode_get_tree(parent));
69606+ coord_to_parent_coord(result, &child->in_parent);
69607+ write_unlock_tree(znode_get_tree(parent));
69608+ ret = NS_FOUND;
69609+ break;
69610+ }
69611+ }
69612+ return ret;
69613+}
69614+
69615+/* true if @addr is an "unallocated block number", which is just an address
69616+   with the highest bit set. */
69617+int is_disk_addr_unallocated(const reiser4_block_nr * addr /* address to
69618+ * check */ )
69619+{
69620+ assert("nikita-1766", addr != NULL);
69621+ cassert(sizeof(reiser4_block_nr) == 8);
69622+ return (*addr & REISER4_BLOCKNR_STATUS_BIT_MASK) ==
69623+ REISER4_UNALLOCATED_STATUS_VALUE;
69624+}
69625+
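The status-bit convention can be exercised in isolation. The real constants (REISER4_BLOCKNR_STATUS_BIT_MASK, REISER4_UNALLOCATED_STATUS_VALUE) are defined elsewhere in this patch; the values below are illustrative assumptions only, namely a status field occupying the top bit of the 64-bit block number.

/* Standalone userspace sketch of the "unallocated block number" test;
 * mask and status values are assumed for illustration. */
#include <assert.h>
#include <stdint.h>

#define EX_STATUS_BIT_MASK   UINT64_C(0x8000000000000000)	/* assumed */
#define EX_UNALLOCATED_VALUE UINT64_C(0x8000000000000000)	/* assumed */

int main(void)
{
	uint64_t real_addr = UINT64_C(0x1234);	/* ordinary disk address */
	uint64_t fake_addr = real_addr | EX_STATUS_BIT_MASK;

	assert((real_addr & EX_STATUS_BIT_MASK) != EX_UNALLOCATED_VALUE);
	assert((fake_addr & EX_STATUS_BIT_MASK) == EX_UNALLOCATED_VALUE);
	return 0;
}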
69626+/* returns true if removing bytes of given range of key [from_key, to_key]
69627+ causes removing of whole item @from */
69628+static int
69629+item_removed_completely(coord_t * from, const reiser4_key * from_key,
69630+ const reiser4_key * to_key)
69631+{
69632+ item_plugin *iplug;
69633+ reiser4_key key_in_item;
69634+
69635+ assert("umka-325", from != NULL);
69636+ assert("", item_is_extent(from));
69637+
69638+	/* check the first key, just in case */
69639+ item_key_by_coord(from, &key_in_item);
69640+ if (keygt(from_key, &key_in_item))
69641+ return 0;
69642+
69643+ /* check last key */
69644+ iplug = item_plugin_by_coord(from);
69645+ assert("vs-611", iplug && iplug->s.file.append_key);
69646+
69647+ iplug->s.file.append_key(from, &key_in_item);
69648+ set_key_offset(&key_in_item, get_key_offset(&key_in_item) - 1);
69649+
69650+ if (keylt(to_key, &key_in_item))
69651+ /* last byte is not removed */
69652+ return 0;
69653+ return 1;
69654+}
69655+
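A concrete instance of the boundary logic: if an extent item covers byte offsets [0, 4096) of a file, append_key() yields offset 4096 and key_in_item is stepped back to 4095, the item's last byte. A cut with from_key at offset 0 and to_key at 4095 or beyond removes the item completely; to_key at 4094 leaves the last byte, so the function returns 0. A userspace sketch of the same checks, with plain offsets standing in for reiser4 keys:

/* Sketch only: integers model key offsets; the item covers [start, end). */
#include <assert.h>

static int removed_completely(unsigned long start, unsigned long end,
			      unsigned long from, unsigned long to)
{
	if (from > start)
		return 0;	/* head of the item survives */
	if (to < end - 1)
		return 0;	/* the item's last byte survives */
	return 1;
}

int main(void)
{
	assert(removed_completely(0, 4096, 0, 4095));	/* exact cover */
	assert(!removed_completely(0, 4096, 0, 4094));	/* tail kept */
	assert(!removed_completely(0, 4096, 1, 4095));	/* head kept */
	return 0;
}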
69656+/* helper function for prepare_twig_kill(): @left and @right are formatted
69657+ * neighbors of extent item being completely removed. Load and lock neighbors
69658+ * and store lock handles into @kdata for later use by kill_hook_extent() */
69659+static int
69660+prepare_children(znode * left, znode * right, carry_kill_data * kdata)
69661+{
69662+ int result;
69663+ int left_loaded;
69664+ int right_loaded;
69665+
69666+ result = 0;
69667+ left_loaded = right_loaded = 0;
69668+
69669+ if (left != NULL) {
69670+ result = zload(left);
69671+ if (result == 0) {
69672+ left_loaded = 1;
69673+ result = longterm_lock_znode(kdata->left, left,
69674+ ZNODE_READ_LOCK,
69675+ ZNODE_LOCK_LOPRI);
69676+ }
69677+ }
69678+ if (result == 0 && right != NULL) {
69679+ result = zload(right);
69680+ if (result == 0) {
69681+ right_loaded = 1;
69682+ result = longterm_lock_znode(kdata->right, right,
69683+ ZNODE_READ_LOCK,
69684+ ZNODE_LOCK_HIPRI |
69685+ ZNODE_LOCK_NONBLOCK);
69686+ }
69687+ }
69688+ if (result != 0) {
69689+ done_lh(kdata->left);
69690+ done_lh(kdata->right);
69691+ if (left_loaded != 0)
69692+ zrelse(left);
69693+ if (right_loaded != 0)
69694+ zrelse(right);
69695+ }
69696+ return result;
69697+}
69698+
69699+static void done_children(carry_kill_data * kdata)
69700+{
69701+ if (kdata->left != NULL && kdata->left->node != NULL) {
69702+ zrelse(kdata->left->node);
69703+ done_lh(kdata->left);
69704+ }
69705+ if (kdata->right != NULL && kdata->right->node != NULL) {
69706+ zrelse(kdata->right->node);
69707+ done_lh(kdata->right);
69708+ }
69709+}
69710+
69711+/* part of cut_node. It is called when cut_node is called to remove or cut
69712+   part of an extent item. When the head of that item is removed, we have to
69713+   update the right delimiting key of the extent's left neighbor. When the
69714+   item is removed completely, we have to set a sibling link between the left
69715+   and right neighbors of the removed extent. This may return -E_DEADLOCK
69716+   while trying to get the left neighbor locked; the caller should then
69717+   repeat the attempt. */
69718+/* Audited by: umka (2002.06.16) */
69719+static int
69720+prepare_twig_kill(carry_kill_data * kdata, znode * locked_left_neighbor)
69721+{
69722+ int result;
69723+ reiser4_key key;
69724+ lock_handle left_lh;
69725+ lock_handle right_lh;
69726+ coord_t left_coord;
69727+ coord_t *from;
69728+ znode *left_child;
69729+ znode *right_child;
69730+ reiser4_tree *tree;
69731+ int left_zloaded_here, right_zloaded_here;
69732+
69733+ from = kdata->params.from;
69734+ assert("umka-326", from != NULL);
69735+ assert("umka-327", kdata->params.to != NULL);
69736+
69737+ /* for one extent item only yet */
69738+ assert("vs-591", item_is_extent(from));
69739+ assert("vs-592", from->item_pos == kdata->params.to->item_pos);
69740+
69741+ if ((kdata->params.from_key
69742+ && keygt(kdata->params.from_key, item_key_by_coord(from, &key)))
69743+ || from->unit_pos != 0) {
69744+ /* head of item @from is not removed, there is nothing to
69745+ worry about */
69746+ return 0;
69747+ }
69748+
69749+ result = 0;
69750+ left_zloaded_here = 0;
69751+ right_zloaded_here = 0;
69752+
69753+ left_child = right_child = NULL;
69754+
69755+ coord_dup(&left_coord, from);
69756+ init_lh(&left_lh);
69757+ init_lh(&right_lh);
69758+ if (coord_prev_unit(&left_coord)) {
69759+ /* @from is leftmost item in its node */
69760+ if (!locked_left_neighbor) {
69761+ result =
69762+ reiser4_get_left_neighbor(&left_lh, from->node,
69763+ ZNODE_READ_LOCK,
69764+ GN_CAN_USE_UPPER_LEVELS);
69765+ switch (result) {
69766+ case 0:
69767+ break;
69768+ case -E_NO_NEIGHBOR:
69769+ /* there is no formatted node to the left of
69770+ from->node */
69771+ warning("vs-605",
69772+ "extent item has smallest key in "
69773+ "the tree and it is about to be removed");
69774+ return 0;
69775+ case -E_DEADLOCK:
69776+ /* need to restart */
69777+ default:
69778+ return result;
69779+ }
69780+
69781+ /* we have acquired left neighbor of from->node */
69782+ result = zload(left_lh.node);
69783+ if (result)
69784+ goto done;
69785+
69786+ locked_left_neighbor = left_lh.node;
69787+ } else {
69788+ /* squalloc_right_twig_cut should have supplied locked
69789+ * left neighbor */
69790+ assert("vs-834",
69791+ znode_is_write_locked(locked_left_neighbor));
69792+ result = zload(locked_left_neighbor);
69793+ if (result)
69794+ return result;
69795+ }
69796+
69797+ left_zloaded_here = 1;
69798+ coord_init_last_unit(&left_coord, locked_left_neighbor);
69799+ }
69800+
69801+ if (!item_is_internal(&left_coord)) {
69802+ /* what else but extent can be on twig level */
69803+ assert("vs-606", item_is_extent(&left_coord));
69804+
69805+ /* there is no left formatted child */
69806+ if (left_zloaded_here)
69807+ zrelse(locked_left_neighbor);
69808+ done_lh(&left_lh);
69809+ return 0;
69810+ }
69811+
69812+ tree = znode_get_tree(left_coord.node);
69813+ left_child = child_znode(&left_coord, left_coord.node, 1, 0);
69814+
69815+ if (IS_ERR(left_child)) {
69816+ result = PTR_ERR(left_child);
69817+ goto done;
69818+ }
69819+
69820+ /* left child is acquired, calculate new right delimiting key for it
69821+ and get right child if it is necessary */
69822+ if (item_removed_completely
69823+ (from, kdata->params.from_key, kdata->params.to_key)) {
69824+ /* try to get right child of removed item */
69825+ coord_t right_coord;
69826+
69827+ assert("vs-607",
69828+ kdata->params.to->unit_pos ==
69829+ coord_last_unit_pos(kdata->params.to));
69830+ coord_dup(&right_coord, kdata->params.to);
69831+ if (coord_next_unit(&right_coord)) {
69832+ /* @to is rightmost unit in the node */
69833+ result =
69834+ reiser4_get_right_neighbor(&right_lh, from->node,
69835+ ZNODE_READ_LOCK,
69836+ GN_CAN_USE_UPPER_LEVELS);
69837+ switch (result) {
69838+ case 0:
69839+ result = zload(right_lh.node);
69840+ if (result)
69841+ goto done;
69842+
69843+ right_zloaded_here = 1;
69844+ coord_init_first_unit(&right_coord,
69845+ right_lh.node);
69846+ item_key_by_coord(&right_coord, &key);
69847+ break;
69848+
69849+ case -E_NO_NEIGHBOR:
69850+ /* there is no formatted node to the right of
69851+ from->node */
69852+ read_lock_dk(tree);
69853+ key = *znode_get_rd_key(from->node);
69854+ read_unlock_dk(tree);
69855+ right_coord.node = NULL;
69856+ result = 0;
69857+ break;
69858+ default:
69859+ /* real error */
69860+ goto done;
69861+ }
69862+ } else {
69863+ /* there is an item to the right of @from - take its key */
69864+ item_key_by_coord(&right_coord, &key);
69865+ }
69866+
69867+ /* try to get right child of @from */
69868+ if (right_coord.node && /* there is right neighbor of @from */
69869+ item_is_internal(&right_coord)) { /* it is internal item */
69870+ right_child = child_znode(&right_coord,
69871+ right_coord.node, 1, 0);
69872+
69873+ if (IS_ERR(right_child)) {
69874+ result = PTR_ERR(right_child);
69875+ goto done;
69876+ }
69877+
69878+ }
69879+		/* the whole extent is removed between znodes left_child and right_child. Prepare them for linking
69880+		   and for the update of the right delimiting key of left_child */
69881+ result = prepare_children(left_child, right_child, kdata);
69882+ } else {
69883+		/* head of item @to is removed. left_child has to get a right delimiting key update. Prepare it for that */
69884+ result = prepare_children(left_child, NULL, kdata);
69885+ }
69886+
69887+ done:
69888+ if (right_child)
69889+ zput(right_child);
69890+ if (right_zloaded_here)
69891+ zrelse(right_lh.node);
69892+ done_lh(&right_lh);
69893+
69894+ if (left_child)
69895+ zput(left_child);
69896+ if (left_zloaded_here)
69897+ zrelse(locked_left_neighbor);
69898+ done_lh(&left_lh);
69899+ return result;
69900+}
69901+
69902+/* this is used to remove part of node content between coordinates @from and @to. Units to which @from and @to are set
69903+ are to be cut completely */
69904+/* for try_to_merge_with_left, delete_copied, delete_node */
69905+int cut_node_content(coord_t * from, coord_t * to, const reiser4_key * from_key, /* first key to be removed */
69906+ const reiser4_key * to_key, /* last key to be removed */
69907+ reiser4_key *
69908+ smallest_removed /* smallest key actually removed */ )
69909+{
69910+ int result;
69911+ carry_pool *pool;
69912+ carry_level *lowest_level;
69913+ carry_cut_data *cut_data;
69914+ carry_op *op;
69915+
69916+ assert("vs-1715", coord_compare(from, to) != COORD_CMP_ON_RIGHT);
69917+
69918+ pool =
69919+ init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
69920+ sizeof(*cut_data));
69921+ if (IS_ERR(pool))
69922+ return PTR_ERR(pool);
69923+ lowest_level = (carry_level *) (pool + 1);
69924+ init_carry_level(lowest_level, pool);
69925+
69926+ op = post_carry(lowest_level, COP_CUT, from->node, 0);
69927+ assert("vs-1509", op != 0);
69928+ if (IS_ERR(op)) {
69929+ done_carry_pool(pool);
69930+ return PTR_ERR(op);
69931+ }
69932+
69933+ cut_data = (carry_cut_data *) (lowest_level + 3);
69934+ cut_data->params.from = from;
69935+ cut_data->params.to = to;
69936+ cut_data->params.from_key = from_key;
69937+ cut_data->params.to_key = to_key;
69938+ cut_data->params.smallest_removed = smallest_removed;
69939+
69940+ op->u.cut_or_kill.is_cut = 1;
69941+ op->u.cut_or_kill.u.cut = cut_data;
69942+
69943+ result = carry(lowest_level, NULL);
69944+ done_carry_pool(pool);
69945+
69946+ return result;
69947+}
69948+
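cut_node_content() above and kill_node_content() below share the same carry boilerplate: allocate the pool, the carry levels and the operation payload in a single allocation, post a COP_CUT operation against the lowest level, run carry(), and free the pool. A condensed sketch of that shape (the op payload is elided; this is the pattern used in this file, not a new API):

/* Sketch of the shared carry pattern; operation payload elided. */
static int carry_cut_pattern_sketch(znode *node)
{
	carry_pool *pool;
	carry_level *lowest_level;
	carry_op *op;
	int result;

	/* pool and three carry levels come from one allocation */
	pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level));
	if (IS_ERR(pool))
		return PTR_ERR(pool);
	lowest_level = (carry_level *) (pool + 1);
	init_carry_level(lowest_level, pool);

	op = post_carry(lowest_level, COP_CUT, node, 0);
	if (IS_ERR(op) || op == NULL) {
		done_carry_pool(pool);
		return op ? PTR_ERR(op) : RETERR(-EIO);
	}
	/* ... fill op->u.cut_or_kill (is_cut flag plus cut/kill data) ... */

	result = carry(lowest_level, NULL);
	done_carry_pool(pool);
	return result;
}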
69949+/* cut part of the node
69950+
69951+ Cut part or whole content of node.
69952+
69953+ cut data between @from and @to of @from->node and call carry() to make
69954+ corresponding changes in the tree. @from->node may become empty. If so -
69955+ pointer to it will be removed. Neighboring nodes are not changed. Smallest
69956+ removed key is stored in @smallest_removed
69957+
69958+*/
69959+int kill_node_content(coord_t * from, /* coord of the first unit/item that will be eliminated */
69960+ coord_t * to, /* coord of the last unit/item that will be eliminated */
69961+ const reiser4_key * from_key, /* first key to be removed */
69962+ const reiser4_key * to_key, /* last key to be removed */
69963+ reiser4_key * smallest_removed, /* smallest key actually removed */
69964+ znode * locked_left_neighbor, /* this is set when kill_node_content is called with left neighbor
69965+ * locked (in squalloc_right_twig_cut, namely) */
69966+ struct inode *inode, /* inode of file whose item (or its part) is to be killed. This is necessary to
69967+ invalidate pages together with item pointing to them */
69968+ int truncate)
69969+{	/* this call is made for file truncate */
69970+ int result;
69971+ carry_pool *pool;
69972+ carry_level *lowest_level;
69973+ carry_kill_data *kdata;
69974+ lock_handle *left_child;
69975+ lock_handle *right_child;
69976+ carry_op *op;
69977+
69978+ assert("umka-328", from != NULL);
69979+ assert("vs-316", !node_is_empty(from->node));
69980+ assert("nikita-1812", coord_is_existing_unit(from)
69981+ && coord_is_existing_unit(to));
69982+
69983+ /* allocate carry_pool, 3 carry_level-s, carry_kill_data and structures for kill_hook_extent */
69984+ pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
69985+ sizeof(carry_kill_data) +
69986+ 2 * sizeof(lock_handle) +
69987+ 5 * sizeof(reiser4_key) + 2 * sizeof(coord_t));
69988+ if (IS_ERR(pool))
69989+ return PTR_ERR(pool);
69990+
69991+ lowest_level = (carry_level *) (pool + 1);
69992+ init_carry_level(lowest_level, pool);
69993+
69994+ kdata = (carry_kill_data *) (lowest_level + 3);
69995+ left_child = (lock_handle *) (kdata + 1);
69996+ right_child = left_child + 1;
69997+
69998+ init_lh(left_child);
69999+ init_lh(right_child);
70000+
70001+ kdata->params.from = from;
70002+ kdata->params.to = to;
70003+ kdata->params.from_key = from_key;
70004+ kdata->params.to_key = to_key;
70005+ kdata->params.smallest_removed = smallest_removed;
70006+ kdata->params.truncate = truncate;
70007+ kdata->flags = 0;
70008+ kdata->inode = inode;
70009+ kdata->left = left_child;
70010+ kdata->right = right_child;
70011+ /* memory for 5 reiser4_key and 2 coord_t will be used in kill_hook_extent */
70012+ kdata->buf = (char *)(right_child + 1);
70013+
70014+ if (znode_get_level(from->node) == TWIG_LEVEL && item_is_extent(from)) {
70015+ /* left child of extent item may have to get updated right
70016+ delimiting key and to get linked with right child of extent
70017+ @from if it will be removed completely */
70018+ result = prepare_twig_kill(kdata, locked_left_neighbor);
70019+ if (result) {
70020+ done_children(kdata);
70021+ done_carry_pool(pool);
70022+ return result;
70023+ }
70024+ }
70025+
70026+ op = post_carry(lowest_level, COP_CUT, from->node, 0);
70027+ if (IS_ERR(op) || (op == NULL)) {
70028+ done_children(kdata);
70029+ done_carry_pool(pool);
70030+ return RETERR(op ? PTR_ERR(op) : -EIO);
70031+ }
70032+
70033+ op->u.cut_or_kill.is_cut = 0;
70034+ op->u.cut_or_kill.u.kill = kdata;
70035+
70036+ result = carry(lowest_level, NULL);
70037+
70038+ done_children(kdata);
70039+ done_carry_pool(pool);
70040+ return result;
70041+}
70042+
70043+void
70044+fake_kill_hook_tail(struct inode *inode, loff_t start, loff_t end, int truncate)
70045+{
70046+ if (inode_get_flag(inode, REISER4_HAS_MMAP)) {
70047+ pgoff_t start_pg, end_pg;
70048+
70049+ start_pg = start >> PAGE_CACHE_SHIFT;
70050+ end_pg = (end - 1) >> PAGE_CACHE_SHIFT;
70051+
70052+ if ((start & (PAGE_CACHE_SIZE - 1)) == 0) {
70053+ /*
70054+ * kill up to the page boundary.
70055+ */
70056+ assert("vs-123456", start_pg == end_pg);
70057+ reiser4_invalidate_pages(inode->i_mapping, start_pg, 1,
70058+ truncate);
70059+ } else if (start_pg != end_pg) {
70060+ /*
70061+ * page boundary is within killed portion of node.
70062+ */
70063+ assert("vs-654321", end_pg - start_pg == 1);
70064+ reiser4_invalidate_pages(inode->i_mapping, end_pg,
70065+ end_pg - start_pg, 1);
70066+ }
70067+ }
70068+ inode_sub_bytes(inode, end - start);
70069+}
70070+
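The page arithmetic in fake_kill_hook_tail() is easier to follow with numbers. Assuming 4096-byte pages (PAGE_CACHE_SHIFT == 12 is an assumption for the example): killing [4096, 4608) gives start_pg == end_pg == 1 with start page-aligned, so page 1 is invalidated; killing [4000, 4200) gives start_pg == 0 and end_pg == 1, so only page 1, whose start boundary lies inside the killed range, is invalidated. A standalone check of both branches:

/* Userspace check of the two branches above, assuming 4096-byte pages. */
#include <assert.h>

#define PG_SHIFT 12
#define PG_SIZE  (1UL << PG_SHIFT)

int main(void)
{
	unsigned long start, end, start_pg, end_pg;

	/* killed range starts exactly on a page boundary */
	start = 4096; end = 4608;
	start_pg = start >> PG_SHIFT;
	end_pg = (end - 1) >> PG_SHIFT;
	assert((start & (PG_SIZE - 1)) == 0 && start_pg == end_pg);

	/* a page boundary lies strictly inside the killed range */
	start = 4000; end = 4200;
	start_pg = start >> PG_SHIFT;
	end_pg = (end - 1) >> PG_SHIFT;
	assert((start & (PG_SIZE - 1)) != 0 && end_pg - start_pg == 1);
	return 0;
}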
70071+/**
70072+ * Delete whole @node from the reiser4 tree without loading it.
70073+ *
70074+ * @left: locked left neighbor,
70075+ * @node: node to be deleted,
70076+ * @smallest_removed: leftmost key of deleted node,
70077+ * @object: inode pointer, if we truncate a file body.
70078+ * @truncate: true if called for file truncate.
70079+ *
70080+ * @return: 0 if success, error code otherwise.
70081+ *
70082+ * NOTE: if @object!=NULL we assume that @smallest_removed != NULL and it
70083+ * contains the right value of the smallest removed key from the previous
70084+ * cut_worker() iteration. This is needed for proper accounting of
70085+ * "i_blocks" and "i_bytes" fields of the @object.
70086+ */
70087+int delete_node(znode * node, reiser4_key * smallest_removed,
70088+ struct inode *object, int truncate)
70089+{
70090+ lock_handle parent_lock;
70091+ coord_t cut_from;
70092+ coord_t cut_to;
70093+ reiser4_tree *tree;
70094+ int ret;
70095+
70096+ assert("zam-937", node != NULL);
70097+ assert("zam-933", znode_is_write_locked(node));
70098+ assert("zam-999", smallest_removed != NULL);
70099+
70100+ init_lh(&parent_lock);
70101+
70102+ ret = reiser4_get_parent(&parent_lock, node, ZNODE_WRITE_LOCK);
70103+ if (ret)
70104+ return ret;
70105+
70106+ assert("zam-934", !znode_above_root(parent_lock.node));
70107+
70108+ ret = zload(parent_lock.node);
70109+ if (ret)
70110+ goto failed_nozrelse;
70111+
70112+ ret = find_child_ptr(parent_lock.node, node, &cut_from);
70113+ if (ret)
70114+ goto failed;
70115+
70116+ /* decrement child counter and set parent pointer to NULL before
70117+	   deleting the pointer from the parent node because of checks in
70118+ internal_kill_item_hook (we can delete the last item from the parent
70119+ node, the parent node is going to be deleted and its c_count should
70120+ be zero). */
70121+
70122+ tree = znode_get_tree(node);
70123+ write_lock_tree(tree);
70124+ init_parent_coord(&node->in_parent, NULL);
70125+ --parent_lock.node->c_count;
70126+ write_unlock_tree(tree);
70127+
70128+ assert("zam-989", item_is_internal(&cut_from));
70129+
70130+ /* @node should be deleted after unlocking. */
70131+ ZF_SET(node, JNODE_HEARD_BANSHEE);
70132+
70133+ /* remove a pointer from the parent node to the node being deleted. */
70134+ coord_dup(&cut_to, &cut_from);
70135+ /* FIXME: shouldn't this be kill_node_content */
70136+ ret = cut_node_content(&cut_from, &cut_to, NULL, NULL, NULL);
70137+ if (ret)
70138+ /* FIXME(Zam): Should we re-connect the node to its parent if
70139+ * cut_node fails? */
70140+ goto failed;
70141+
70142+ {
70143+ reiser4_tree *tree = current_tree;
70144+ __u64 start_offset = 0, end_offset = 0;
70145+
70146+ read_lock_tree(tree);
70147+ write_lock_dk(tree);
70148+ if (object) {
70149+ /* We use @smallest_removed and the left delimiting of
70150+			/* We use @smallest_removed and the left delimiting key of
70151+ * calculation. We assume that the items after the
70152+ * *@smallest_removed key have been deleted from the
70153+ * file body. */
70154+ start_offset = get_key_offset(znode_get_ld_key(node));
70155+ end_offset = get_key_offset(smallest_removed);
70156+ }
70157+
70158+ assert("zam-1021", znode_is_connected(node));
70159+ if (node->left)
70160+ znode_set_rd_key(node->left, znode_get_rd_key(node));
70161+
70162+ *smallest_removed = *znode_get_ld_key(node);
70163+
70164+ write_unlock_dk(tree);
70165+ read_unlock_tree(tree);
70166+
70167+ if (object) {
70168+			/* actions to be performed on items upon their removal from the tree normally
70169+			   live in a special item method - kill_hook. Here, for optimization reasons, we
70170+			   avoid reading the node containing the item being removed and so cannot call
70171+			   the item's kill hook. Instead we call a function that does exactly what the
70172+			   tail kill hook does, assuming the unread node contains only a single tail item. */
70173+ fake_kill_hook_tail(object, start_offset, end_offset,
70174+ truncate);
70175+ }
70176+ }
70177+ failed:
70178+ zrelse(parent_lock.node);
70179+ failed_nozrelse:
70180+ done_lh(&parent_lock);
70181+
70182+ return ret;
70183+}
70184+
70185+static int can_delete(const reiser4_key *key, znode *node)
70186+{
70187+ int result;
70188+
70189+ read_lock_dk(current_tree);
70190+ result = keyle(key, znode_get_ld_key(node));
70191+ read_unlock_dk(current_tree);
70192+ return result;
70193+}
70194+
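can_delete() is the whole-node fast path used by cut_tree_worker_common() below: because the worker walks right to left, everything to the node's right has already been removed, so the node can be deleted wholesale as soon as from_key is not greater than its left delimiting key. A userspace sketch with integers standing in for keys:

/* Sketch: integers model keys; mirrors keyle(key, znode_get_ld_key(node)). */
#include <assert.h>

static int can_delete_sketch(unsigned long from_key, unsigned long ld_key)
{
	return from_key <= ld_key;
}

int main(void)
{
	assert(can_delete_sketch(100, 100));	/* range starts at the node */
	assert(can_delete_sketch(100, 500));	/* node wholly inside range */
	assert(!can_delete_sketch(100, 50));	/* range begins inside node */
	return 0;
}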
70195+/**
70196+ * This subroutine is not optimal, but its implementation is
70197+ * simpler.
70198+ *
70199+ * @tap: the point deletion process begins from,
70200+ * @from_key: the beginning of the deleted key range,
70201+ * @to_key: the end of the deleted key range,
70202+ * @smallest_removed: the smallest removed key,
70203+ * @truncate: true if called for file truncate.
70204+ * @progress: returns true if progress was made deleting file items;
70205+ * the @smallest_removed value is valid in that case.
70206+ *
70207+ * @return: 0 on success, error code otherwise; -E_REPEAT means that a long
70208+ * cut_tree operation was interrupted to allow an atom commit.
70209+ */
70210+int
70211+cut_tree_worker_common(tap_t * tap, const reiser4_key * from_key,
70212+ const reiser4_key * to_key,
70213+ reiser4_key * smallest_removed, struct inode *object,
70214+ int truncate, int *progress)
70215+{
70216+ lock_handle next_node_lock;
70217+ coord_t left_coord;
70218+ int result;
70219+
70220+ assert("zam-931", tap->coord->node != NULL);
70221+ assert("zam-932", znode_is_write_locked(tap->coord->node));
70222+
70223+ *progress = 0;
70224+ init_lh(&next_node_lock);
70225+
70226+ while (1) {
70227+ znode *node; /* node from which items are cut */
70228+ node_plugin *nplug; /* node plugin for @node */
70229+
70230+ node = tap->coord->node;
70231+
70232+ /* Move next_node_lock to the next node on the left. */
70233+ result =
70234+ reiser4_get_left_neighbor(&next_node_lock, node,
70235+ ZNODE_WRITE_LOCK,
70236+ GN_CAN_USE_UPPER_LEVELS);
70237+ if (result != 0 && result != -E_NO_NEIGHBOR)
70238+ break;
70239+ /* Check can we delete the node as a whole. */
70240+ if (*progress && znode_get_level(node) == LEAF_LEVEL &&
70241+ can_delete(from_key, node)) {
70242+ result = delete_node(node, smallest_removed, object,
70243+ truncate);
70244+ } else {
70245+ result = tap_load(tap);
70246+ if (result)
70247+ return result;
70248+
70249+ /* Prepare the second (right) point for cut_node() */
70250+ if (*progress)
70251+ coord_init_last_unit(tap->coord, node);
70252+
70253+ else if (item_plugin_by_coord(tap->coord)->b.lookup ==
70254+ NULL)
70255+ /* set rightmost unit for the items without lookup method */
70256+ tap->coord->unit_pos =
70257+ coord_last_unit_pos(tap->coord);
70258+
70259+ nplug = node->nplug;
70260+
70261+ assert("vs-686", nplug);
70262+ assert("vs-687", nplug->lookup);
70263+
70264+ /* left_coord is leftmost unit cut from @node */
70265+ result = nplug->lookup(node, from_key,
70266+ FIND_MAX_NOT_MORE_THAN,
70267+ &left_coord);
70268+
70269+ if (IS_CBKERR(result))
70270+ break;
70271+
70272+ /* adjust coordinates so that they are set to existing units */
70273+ if (coord_set_to_right(&left_coord)
70274+ || coord_set_to_left(tap->coord)) {
70275+ result = 0;
70276+ break;
70277+ }
70278+
70279+ if (coord_compare(&left_coord, tap->coord) ==
70280+ COORD_CMP_ON_RIGHT) {
70281+ /* keys from @from_key to @to_key are not in the tree */
70282+ result = 0;
70283+ break;
70284+ }
70285+
70286+ if (left_coord.item_pos != tap->coord->item_pos) {
70287+				/* do not allow cutting more than one item. This was added to solve the problem of
70288+				   truncating partially converted files. If a file is partially converted, there may
70289+				   exist a twig node containing both internal items pointing to leaf nodes with
70290+				   formatting items and an extent item. We do not want to kill internal items at the
70291+				   twig node here, because cut_tree_worker assumes killing them from the leaf level */
70292+ coord_dup(&left_coord, tap->coord);
70293+ assert("vs-1652",
70294+ coord_is_existing_unit(&left_coord));
70295+ left_coord.unit_pos = 0;
70296+ }
70297+
70298+ /* cut data from one node */
70299+ // *smallest_removed = *min_key();
70300+ result =
70301+ kill_node_content(&left_coord, tap->coord, from_key,
70302+ to_key, smallest_removed,
70303+ next_node_lock.node, object,
70304+ truncate);
70305+ tap_relse(tap);
70306+ }
70307+ if (result)
70308+ break;
70309+
70310+ ++(*progress);
70311+
70312+ /* Check whether all items with keys >= from_key were removed
70313+ * from the tree. */
70314+ if (keyle(smallest_removed, from_key))
70315+ /* result = 0; */
70316+ break;
70317+
70318+ if (next_node_lock.node == NULL)
70319+ break;
70320+
70321+ result = tap_move(tap, &next_node_lock);
70322+ done_lh(&next_node_lock);
70323+ if (result)
70324+ break;
70325+
70326+ /* Break long cut_tree operation (deletion of a large file) if
70327+ * atom requires commit. */
70328+ if (*progress > CUT_TREE_MIN_ITERATIONS
70329+ && current_atom_should_commit()) {
70330+ result = -E_REPEAT;
70331+ break;
70332+ }
70333+ }
70334+ done_lh(&next_node_lock);
70335+ // assert("vs-301", !keyeq(&smallest_removed, min_key()));
70336+ return result;
70337+}
70338+
70339+/* there is a fundamental problem with optimizing deletes: VFS does it
70340+ one file at a time. Another problem is that if an item can be
70341+ anything, then deleting items must be done one at a time. It just
70342+   seems clean to write this to specify a from and a to key, and cut
70343+ everything between them though. */
70344+
70345+/* use this function with care if deleting more than what is part of a single file. */
70346+/* do not use this when cutting a single item, it is suboptimal for that */
70347+
70348+/* You are encouraged to write plugin specific versions of this. It
70349+ cannot be optimal for all plugins because it works item at a time,
70350+ and some plugins could sometimes work node at a time. Regular files
70351+ however are not optimizable to work node at a time because of
70352+ extents needing to free the blocks they point to.
70353+
70354+ Optimizations compared to v3 code:
70355+
70356+ It does not balance (that task is left to memory pressure code).
70357+
70358+ Nodes are deleted only if empty.
70359+
70360+ Uses extents.
70361+
70362+ Performs read-ahead of formatted nodes whose contents are part of
70363+ the deletion.
70364+*/
70365+
70366+/**
70367+ * Delete everything from the reiser4 tree between two keys: @from_key and
70368+ * @to_key.
70369+ *
70370+ * @from_key: the beginning of the deleted key range,
70371+ * @to_key: the end of the deleted key range,
70372+ * @smallest_removed: the smallest removed key,
70373+ * @object: owner of cutting items.
70374+ * @truncate: true if called for file truncate.
70375+ * @progress: returns true if progress was made deleting file items;
70376+ * the @smallest_removed value is valid in that case.
70377+ *
70378+ * @return: 0 on success, error code otherwise; -E_REPEAT means that a long
70379+ * cut_tree operation was interrupted to allow an atom commit.
70380+ */
70381+
70382+int
70383+cut_tree_object(reiser4_tree * tree, const reiser4_key * from_key,
70384+ const reiser4_key * to_key, reiser4_key * smallest_removed_p,
70385+ struct inode *object, int truncate, int *progress)
70386+{
70387+ lock_handle lock;
70388+ int result;
70389+ tap_t tap;
70390+ coord_t right_coord;
70391+ reiser4_key smallest_removed;
70392+ int (*cut_tree_worker) (tap_t *, const reiser4_key *,
70393+ const reiser4_key *, reiser4_key *,
70394+ struct inode *, int, int *);
70395+ STORE_COUNTERS;
70396+
70397+ assert("umka-329", tree != NULL);
70398+ assert("umka-330", from_key != NULL);
70399+ assert("umka-331", to_key != NULL);
70400+ assert("zam-936", keyle(from_key, to_key));
70401+
70402+ if (smallest_removed_p == NULL)
70403+ smallest_removed_p = &smallest_removed;
70404+
70405+ init_lh(&lock);
70406+
70407+ do {
70408+ /* Find rightmost item to cut away from the tree. */
70409+ result = object_lookup(object, to_key, &right_coord, &lock,
70410+ ZNODE_WRITE_LOCK, FIND_MAX_NOT_MORE_THAN,
70411+ TWIG_LEVEL, LEAF_LEVEL, CBK_UNIQUE,
70412+ NULL /*ra_info */ );
70413+ if (result != CBK_COORD_FOUND)
70414+ break;
70415+ if (object == NULL
70416+ || inode_file_plugin(object)->cut_tree_worker == NULL)
70417+ cut_tree_worker = cut_tree_worker_common;
70418+ else
70419+ cut_tree_worker =
70420+ inode_file_plugin(object)->cut_tree_worker;
70421+ tap_init(&tap, &right_coord, &lock, ZNODE_WRITE_LOCK);
70422+ result =
70423+ cut_tree_worker(&tap, from_key, to_key, smallest_removed_p,
70424+ object, truncate, progress);
70425+ tap_done(&tap);
70426+
70427+ preempt_point();
70428+
70429+ } while (0);
70430+
70431+ done_lh(&lock);
70432+
70433+ if (result) {
70434+ switch (result) {
70435+ case -E_NO_NEIGHBOR:
70436+ result = 0;
70437+ break;
70438+ case -E_DEADLOCK:
70439+			result = -E_REPEAT;	/* fall through */
70440+ case -E_REPEAT:
70441+ case -ENOMEM:
70442+ case -ENOENT:
70443+ break;
70444+ default:
70445+ warning("nikita-2861", "failure: %i", result);
70446+ }
70447+ }
70448+
70449+ CHECK_COUNTERS;
70450+ return result;
70451+}
70452+
70453+/* repeat cut_tree_object until everything is deleted. unlike cut_file_items, it
70454+ * does not end current transaction if -E_REPEAT is returned by
70455+ * cut_tree_object. */
70456+int
70457+cut_tree(reiser4_tree * tree, const reiser4_key * from, const reiser4_key * to,
70458+ struct inode *inode, int truncate)
70459+{
70460+ int result;
70461+ int progress;
70462+
70463+ do {
70464+ result =
70465+ cut_tree_object(tree, from, to, NULL, inode, truncate,
70466+ &progress);
70467+ } while (result == -E_REPEAT);
70468+
70469+ return result;
70470+}
70471+
70472+/* finishing reiser4 initialization */
70473+int init_tree(reiser4_tree * tree /* pointer to structure being
70474+ * initialized */ ,
70475+ const reiser4_block_nr * root_block /* address of a root block
70476+ * on a disk */ ,
70477+ tree_level height /* height of a tree */ ,
70478+ node_plugin * nplug /* default node plugin */ )
70479+{
70480+ int result;
70481+
70482+ assert("nikita-306", tree != NULL);
70483+ assert("nikita-307", root_block != NULL);
70484+ assert("nikita-308", height > 0);
70485+ assert("nikita-309", nplug != NULL);
70486+ assert("zam-587", tree->super != NULL);
70487+
70488+ tree->root_block = *root_block;
70489+ tree->height = height;
70490+ tree->estimate_one_insert = calc_estimate_one_insert(height);
70491+ tree->nplug = nplug;
70492+
70493+ tree->znode_epoch = 1ull;
70494+
70495+ cbk_cache_init(&tree->cbk_cache);
70496+
70497+ result = znodes_tree_init(tree);
70498+ if (result == 0)
70499+ result = jnodes_tree_init(tree);
70500+ if (result == 0) {
70501+ tree->uber = zget(tree, &UBER_TREE_ADDR, NULL, 0, get_gfp_mask());
70502+ if (IS_ERR(tree->uber)) {
70503+ result = PTR_ERR(tree->uber);
70504+ tree->uber = NULL;
70505+ }
70506+ }
70507+ return result;
70508+}
70509+
70510+/* release resources associated with @tree */
70511+void done_tree(reiser4_tree * tree /* tree to release */ )
70512+{
70513+ if (tree == NULL)
70514+ return;
70515+
70516+ if (tree->uber != NULL) {
70517+ zput(tree->uber);
70518+ tree->uber = NULL;
70519+ }
70520+ znodes_tree_done(tree);
70521+ jnodes_tree_done(tree);
70522+ cbk_cache_done(&tree->cbk_cache);
70523+}
70524+
70525+/* Make Linus happy.
70526+ Local variables:
70527+ c-indentation-style: "K&R"
70528+ mode-name: "LC"
70529+ c-basic-offset: 8
70530+ tab-width: 8
70531+ fill-column: 120
70532+ scroll-step: 1
70533+ End:
70534+*/
70535Index: linux-2.6.16/fs/reiser4/tree.h
70536===================================================================
70537--- /dev/null
70538+++ linux-2.6.16/fs/reiser4/tree.h
70539@@ -0,0 +1,579 @@
70540+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
70541+ * reiser4/README */
70542+
70543+/* Tree operations. See fs/reiser4/tree.c for comments */
70544+
70545+#if !defined( __REISER4_TREE_H__ )
70546+#define __REISER4_TREE_H__
70547+
70548+#include "forward.h"
70549+#include "debug.h"
70550+#include "dformat.h"
70551+#include "plugin/node/node.h"
70552+#include "plugin/plugin.h"
70553+#include "znode.h"
70554+#include "tap.h"
70555+
70556+#include <linux/types.h> /* for __u?? */
70557+#include <linux/fs.h> /* for struct super_block */
70558+#include <linux/spinlock.h>
70559+#include <linux/sched.h> /* for struct task_struct */
70560+
70561+/* fictive block number never actually used */
70562+extern const reiser4_block_nr UBER_TREE_ADDR;
70563+
70564+/* &cbk_cache_slot - entry in a coord cache.
70565+
70566+   This is an entry in a coord_by_key (cbk) cache, represented by
70567+ &cbk_cache.
70568+
70569+*/
70570+typedef struct cbk_cache_slot {
70571+ /* cached node */
70572+ znode *node;
70573+ /* linkage to the next cbk cache slot in a LRU order */
70574+ struct list_head lru;
70575+} cbk_cache_slot;
70576+
70577+/* &cbk_cache - coord cache. This is part of reiser4_tree.
70578+
70579+ cbk_cache is supposed to speed up tree lookups by caching results of recent
70580+ successful lookups (we don't cache negative results as dentry cache
70581+   does). The cache consists of a relatively small number of entries kept in
70582+   LRU order. Each entry (&cbk_cache_slot) contains a pointer to a znode, from
70583+   which we can obtain the range of keys covered by this znode. Before
70584+   embarking on a real tree traversal we scan the cbk_cache slot by slot and
70585+   for each slot check whether the key we are looking for lies between the
70586+   minimal and maximal keys of the node pointed to by this slot. If no match
70587+   is found, a real tree traversal is performed and, if it succeeds, an
70588+   appropriate entry is inserted into the cache, possibly pulling the least
70589+   recently used entry out of it.
70590+
70591+ Tree spin lock is used to protect coord cache. If contention for this
70592+   lock proves to be too high, finer-grained locking can be added.
70593+
70594+ Invariants involving parts of this data-type:
70595+
70596+ [cbk-cache-invariant]
70597+*/
70598+typedef struct cbk_cache {
70599+ /* serializator */
70600+ rwlock_t guard;
70601+ int nr_slots;
70602+ /* head of LRU list of cache slots */
70603+ struct list_head lru;
70604+ /* actual array of slots */
70605+ cbk_cache_slot *slot;
70606+} cbk_cache;
70607+
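In code, the scan the comment describes amounts to walking the LRU list and testing whether the target key falls within a cached node's delimiting-key range. The sketch below is an illustration, not the actual cbk cache search: taking cache->guard, the dk lock, and the revalidation the real lookup performs are all omitted.

/* Hedged sketch of a cbk cache probe: return a cached node whose
 * delimiting keys cover @key, or NULL on a miss. */
static znode *cbk_cache_probe_sketch(cbk_cache *cache, const reiser4_key *key)
{
	cbk_cache_slot *slot;

	list_for_each_entry(slot, &cache->lru, lru) {
		znode *node = slot->node;

		if (node == NULL)
			continue;
		/* node covers keys in [ld_key, rd_key) */
		if (keyle(znode_get_ld_key(node), key) &&
		    keylt(key, znode_get_rd_key(node)))
			return node;
	}
	return NULL;	/* miss: fall back to a real tree traversal */
}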
70608+
70609+/* level_lookup_result - possible outcome of looking up key at some level.
70610+ This is used by coord_by_key when traversing tree downward. */
70611+typedef enum {
70612+ /* continue to the next level */
70613+ LOOKUP_CONT,
70614+ /* done. Either required item was found, or we can prove it
70615+ doesn't exist, or some error occurred. */
70616+ LOOKUP_DONE,
70617+ /* restart traversal from the root. Infamous "repetition". */
70618+ LOOKUP_REST
70619+} level_lookup_result;
70620+
70621+/* This is representation of internal reiser4 tree where all file-system
70622+ data and meta-data are stored. This structure is passed to all tree
70623+   manipulation functions. It's different from the super block because
70624+   we don't want to limit ourselves to a strictly one-to-one mapping
70625+   between super blocks and trees, and because they are logically
70626+   different: there are things in a super block that have no relation to
70627+ the tree (bitmaps, journalling area, mount options, etc.) and there
70628+ are things in a tree that bear no relation to the super block, like
70629+ tree of znodes.
70630+
70631+ At this time, there is only one tree
70632+ per filesystem, and this struct is part of the super block. We only
70633+ call the super block the super block for historical reasons (most
70634+ other filesystems call the per filesystem metadata the super block).
70635+*/
70636+
70637+struct reiser4_tree {
70638+	/* block_nr == 0 is the fake znode. Write-lock it while changing
70639+	   the tree height. */
70640+ /* disk address of root node of a tree */
70641+ reiser4_block_nr root_block;
70642+
70643+ /* level of the root node. If this is 1, tree consists of root
70644+ node only */
70645+ tree_level height;
70646+
70647+ /*
70648+	 * this is cached here to avoid calling plugins through function
70649+	 * dereferences all the time.
70650+ */
70651+ __u64 estimate_one_insert;
70652+
70653+ /* cache of recent tree lookup results */
70654+ cbk_cache cbk_cache;
70655+
70656+ /* hash table to look up znodes by block number. */
70657+ z_hash_table zhash_table;
70658+ z_hash_table zfake_table;
70659+ /* hash table to look up jnodes by inode and offset. */
70660+ j_hash_table jhash_table;
70661+
70662+ /* lock protecting:
70663+ - parent pointers,
70664+ - sibling pointers,
70665+ - znode hash table
70666+ - coord cache
70667+ */
70668+	/* NOTE: The "giant" tree lock can be replaced by more spin locks, in
70669+	   the hope that they will be less contended. We could use one spin lock
70670+	   per znode hash bucket. At the cost of some code complexity, sibling
70671+	   pointers can be protected by both znode spin locks. However, even if
70672+	   this looks more SMP scalable, we should test the locking change on
70673+	   n-way (n > 4) SMP machines. Current 4-way machine tests do not show
70674+	   that the tree lock is contended or a bottleneck (2003.07.25). */
70675+
70676+ rwlock_t tree_lock;
70677+
70678+ /* lock protecting delimiting keys */
70679+ rwlock_t dk_lock;
70680+
70681+ /* spin lock protecting znode_epoch */
70682+ spinlock_t epoch_lock;
70683+ /* version stamp used to mark znode updates. See seal.[ch] for more
70684+ * information. */
70685+ __u64 znode_epoch;
70686+
70687+ znode *uber;
70688+ node_plugin *nplug;
70689+ struct super_block *super;
70690+ struct {
70691+ /* carry flags used for insertion of new nodes */
70692+ __u32 new_node_flags;
70693+ /* carry flags used for insertion of new extents */
70694+ __u32 new_extent_flags;
70695+ /* carry flags used for paste operations */
70696+ __u32 paste_flags;
70697+ /* carry flags used for insert operations */
70698+ __u32 insert_flags;
70699+ } carry;
70700+};
70701+
70702+extern int init_tree(reiser4_tree * tree,
70703+ const reiser4_block_nr * root_block, tree_level height,
70704+ node_plugin * default_plugin);
70705+extern void done_tree(reiser4_tree * tree);
70706+
70707+/* cbk flags: options for coord_by_key() */
70708+typedef enum {
70709+ /* coord_by_key() is called for insertion. This is necessary because
70710+ of extents being located at the twig level. For explanation, see
70711+ comment just above is_next_item_internal().
70712+ */
70713+ CBK_FOR_INSERT = (1 << 0),
70714+ /* coord_by_key() is called with key that is known to be unique */
70715+ CBK_UNIQUE = (1 << 1),
70716+	/* coord_by_key() can trust delimiting keys. This option is not user
70717+ accessible. coord_by_key() will set it automatically. It will be
70718+ only cleared by special-case in extents-on-the-twig-level handling
70719+ where it is necessary to insert item with a key smaller than
70720+ leftmost key in a node. This is necessary because of extents being
70721+ located at the twig level. For explanation, see comment just above
70722+ is_next_item_internal().
70723+ */
70724+ CBK_TRUST_DK = (1 << 2),
70725+ CBK_READA = (1 << 3), /* original: readahead leaves which contain items of certain file */
70726+ CBK_READDIR_RA = (1 << 4), /* readdir: readahead whole directory and all its stat datas */
70727+ CBK_DKSET = (1 << 5),
70728+ CBK_EXTENDED_COORD = (1 << 6), /* coord_t is actually */
70729+ CBK_IN_CACHE = (1 << 7), /* node is already in cache */
70730+	CBK_USE_CRABLOCK = (1 << 8)	/* use crab_lock instead of long term
70731+ * lock */
70732+} cbk_flags;
70733+
70734+/* insertion outcome. IBK = insert by key */
70735+typedef enum {
70736+ IBK_INSERT_OK = 0,
70737+ IBK_ALREADY_EXISTS = -EEXIST,
70738+ IBK_IO_ERROR = -EIO,
70739+ IBK_NO_SPACE = -E_NODE_FULL,
70740+ IBK_OOM = -ENOMEM
70741+} insert_result;
70742+
70743+#define IS_CBKERR(err) ((err) != CBK_COORD_FOUND && (err) != CBK_COORD_NOTFOUND)
70744+
70745+typedef int (*tree_iterate_actor_t) (reiser4_tree * tree, coord_t * coord,
70746+ lock_handle * lh, void *arg);
70747+extern int iterate_tree(reiser4_tree * tree, coord_t * coord, lock_handle * lh,
70748+ tree_iterate_actor_t actor, void *arg,
70749+ znode_lock_mode mode, int through_units_p);
70750+extern int get_uber_znode(reiser4_tree * tree, znode_lock_mode mode,
70751+ znode_lock_request pri, lock_handle * lh);
70752+
70753+/* return node plugin of @node */
70754+static inline node_plugin *node_plugin_by_node(const znode *
70755+ node /* node to query */ )
70756+{
70757+ assert("vs-213", node != NULL);
70758+ assert("vs-214", znode_is_loaded(node));
70759+
70760+ return node->nplug;
70761+}
70762+
70763+/* number of items in @node */
70764+static inline pos_in_node_t node_num_items(const znode * node)
70765+{
70766+ assert("nikita-2754", znode_is_loaded(node));
70767+ assert("nikita-2468",
70768+ node_plugin_by_node(node)->num_of_items(node) == node->nr_items);
70769+
70770+ return node->nr_items;
70771+}
70772+
70773+/* Return the number of items at the present node. Asserts coord->node !=
70774+ NULL. */
70775+static inline unsigned coord_num_items(const coord_t * coord)
70776+{
70777+ assert("jmacd-9805", coord->node != NULL);
70778+
70779+ return node_num_items(coord->node);
70780+}
70781+
70782+/* true if @node is empty */
70783+static inline int node_is_empty(const znode * node)
70784+{
70785+ return node_num_items(node) == 0;
70786+}
70787+
70788+typedef enum {
70789+ SHIFTED_SOMETHING = 0,
70790+ SHIFT_NO_SPACE = -E_NODE_FULL,
70791+ SHIFT_IO_ERROR = -EIO,
70792+ SHIFT_OOM = -ENOMEM,
70793+} shift_result;
70794+
70795+extern node_plugin *node_plugin_by_coord(const coord_t * coord);
70796+extern int is_coord_in_node(const coord_t * coord);
70797+extern int key_in_node(const reiser4_key *, const coord_t *);
70798+extern void coord_item_move_to(coord_t * coord, int items);
70799+extern void coord_unit_move_to(coord_t * coord, int units);
70800+
70801+/* there are two types of repetitive accesses (ra): intra-syscall
70802+   (local) and inter-syscall (global). Local ra is used when, during a
70803+   single syscall, we add/delete several items and units in the
70804+   same place in a tree. Note that plan-A fragments local ra by
70805+   separating stat-data and file body in key-space. Global ra is
70806+   used when the user makes repetitive modifications in the same place in a
70807+ tree.
70808+
70809+   Our ra implementation serves the following purposes:
70810+   1 it affects balancing decisions so that the next operation in a row
70811+   can be performed faster;
70812+   2 it affects lower-level read-ahead in page-cache;
70813+   3 it allows us to avoid unnecessary lookups by maintaining some state
70814+ across several operations (this is only for local ra);
70815+ 4 it leaves room for lazy-micro-balancing: when we start a sequence of
70816+ operations they are performed without actually doing any intra-node
70817+ shifts, until we finish sequence or scope of sequence leaves
70818+ current node, only then we really pack node (local ra only).
70819+*/
70820+
70821+/* another thing that can be useful is to keep per-tree and/or
70822+ per-process cache of recent lookups. This cache can be organised as a
70823+ list of block numbers of formatted nodes sorted by starting key in
70824+ this node. Balancings should invalidate appropriate parts of this
70825+ cache.
70826+*/
70827+
70828+lookup_result coord_by_key(reiser4_tree * tree, const reiser4_key * key,
70829+ coord_t * coord, lock_handle * handle,
70830+ znode_lock_mode lock, lookup_bias bias,
70831+ tree_level lock_level, tree_level stop_level,
70832+ __u32 flags, ra_info_t *);
70833+
70834+lookup_result object_lookup(struct inode *object,
70835+ const reiser4_key * key,
70836+ coord_t * coord,
70837+ lock_handle * lh,
70838+ znode_lock_mode lock_mode,
70839+ lookup_bias bias,
70840+ tree_level lock_level,
70841+ tree_level stop_level,
70842+ __u32 flags, ra_info_t * info);
70843+
70844+insert_result insert_by_key(reiser4_tree * tree, const reiser4_key * key,
70845+ reiser4_item_data * data, coord_t * coord,
70846+ lock_handle * lh,
70847+ tree_level stop_level, __u32 flags);
70848+insert_result insert_by_coord(coord_t * coord,
70849+ reiser4_item_data * data, const reiser4_key * key,
70850+ lock_handle * lh, __u32);
70851+insert_result insert_extent_by_coord(coord_t * coord,
70852+ reiser4_item_data * data,
70853+ const reiser4_key * key, lock_handle * lh);
70854+int cut_node_content(coord_t * from, coord_t * to, const reiser4_key * from_key,
70855+ const reiser4_key * to_key,
70856+ reiser4_key * smallest_removed);
70857+int kill_node_content(coord_t * from, coord_t * to,
70858+ const reiser4_key * from_key, const reiser4_key * to_key,
70859+ reiser4_key * smallest_removed,
70860+ znode * locked_left_neighbor, struct inode *inode,
70861+ int truncate);
70862+
70863+int resize_item(coord_t * coord, reiser4_item_data * data,
70864+ reiser4_key * key, lock_handle * lh, cop_insert_flag);
70865+int insert_into_item(coord_t * coord, lock_handle * lh, const reiser4_key * key,
70866+ reiser4_item_data * data, unsigned);
70867+int insert_flow(coord_t * coord, lock_handle * lh, flow_t * f);
70868+int find_new_child_ptr(znode * parent, znode * child, znode * left,
70869+ coord_t * result);
70870+
70871+int shift_right_of_but_excluding_insert_coord(coord_t * insert_coord);
70872+int shift_left_of_and_including_insert_coord(coord_t * insert_coord);
70873+
70874+void fake_kill_hook_tail(struct inode *, loff_t start, loff_t end, int);
70875+
70876+extern int cut_tree_worker_common(tap_t *, const reiser4_key *,
70877+ const reiser4_key *, reiser4_key *,
70878+ struct inode *, int, int *);
70879+extern int cut_tree_object(reiser4_tree *, const reiser4_key *,
70880+ const reiser4_key *, reiser4_key *, struct inode *,
70881+ int, int *);
70882+extern int cut_tree(reiser4_tree * tree, const reiser4_key * from,
70883+ const reiser4_key * to, struct inode *, int);
70884+
70885+extern int delete_node(znode * node, reiser4_key *, struct inode *, int);
70886+extern int check_tree_pointer(const coord_t * pointer, const znode * child);
70887+extern int find_new_child_ptr(znode * parent, znode * child UNUSED_ARG,
70888+ znode * left, coord_t * result);
70889+extern int find_child_ptr(znode * parent, znode * child, coord_t * result);
70890+extern int set_child_delimiting_keys(znode * parent, const coord_t * in_parent,
70891+ znode * child);
70892+extern znode *child_znode(const coord_t * in_parent, znode * parent,
70893+ int incore_p, int setup_dkeys_p);
70894+
70895+extern int cbk_cache_init(cbk_cache * cache);
70896+extern void cbk_cache_done(cbk_cache * cache);
70897+extern void cbk_cache_invalidate(const znode * node, reiser4_tree * tree);
70898+
70899+extern char *sprint_address(const reiser4_block_nr * block);
70900+
70901+#if REISER4_DEBUG
70902+extern void print_coord_content(const char *prefix, coord_t * p);
70903+extern void reiser4_print_address(const char *prefix,
70904+ const reiser4_block_nr * block);
70905+extern void print_tree_rec(const char *prefix, reiser4_tree * tree,
70906+ __u32 flags);
70907+extern void check_dkeys(znode *node);
70908+#else
70909+#define print_coord_content(p, c) noop
70910+#define reiser4_print_address(p, b) noop
70911+#endif
70912+
70913+extern void forget_znode(lock_handle * handle);
70914+extern int deallocate_znode(znode * node);
70915+
70916+extern int is_disk_addr_unallocated(const reiser4_block_nr * addr);
70917+
70918+/* struct used internally to pack all numerous arguments of tree lookup.
70919+ Used to avoid passing a lot of arguments to helper functions. */
70920+typedef struct cbk_handle {
70921+ /* tree we are in */
70922+ reiser4_tree *tree;
70923+ /* key we are going after */
70924+ const reiser4_key *key;
70925+ /* coord we will store result in */
70926+ coord_t *coord;
70927+ /* type of lock to take on target node */
70928+ znode_lock_mode lock_mode;
70929+ /* lookup bias. See comments at the declaration of lookup_bias */
70930+ lookup_bias bias;
70931+ /* lock level: level starting from which tree traversal starts taking
70932+ * write locks. */
70933+ tree_level lock_level;
70934+ /* level where search will stop. Either item will be found between
70935+ lock_level and stop_level, or CBK_COORD_NOTFOUND will be
70936+ returned.
70937+ */
70938+ tree_level stop_level;
70939+ /* level we are currently at */
70940+ tree_level level;
70941+ /* block number of @active node. Tree traversal operates on two
70942+ nodes: active and parent. */
70943+ reiser4_block_nr block;
70944+ /* put here error message to be printed by caller */
70945+ const char *error;
70946+ /* result passed back to caller */
70947+ lookup_result result;
70948+ /* lock handles for active and parent */
70949+ lock_handle *parent_lh;
70950+ lock_handle *active_lh;
70951+ reiser4_key ld_key;
70952+ reiser4_key rd_key;
70953+ /* flags, passed to the cbk routine. Bits of this bitmask are defined
70954+ in tree.h:cbk_flags enum. */
70955+ __u32 flags;
70956+ ra_info_t *ra_info;
70957+ struct inode *object;
70958+} cbk_handle;
70959+
70960+extern znode_lock_mode cbk_lock_mode(tree_level level, cbk_handle * h);
70961+
70962+/* eottl.c */
70963+extern int handle_eottl(cbk_handle *h, int *outcome);
70964+
70965+int lookup_multikey(cbk_handle * handle, int nr_keys);
70966+int lookup_couple(reiser4_tree * tree,
70967+ const reiser4_key * key1, const reiser4_key * key2,
70968+ coord_t * coord1, coord_t * coord2,
70969+ lock_handle * lh1, lock_handle * lh2,
70970+ znode_lock_mode lock_mode, lookup_bias bias,
70971+ tree_level lock_level, tree_level stop_level, __u32 flags,
70972+ int *result1, int *result2);
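/*
 * A sketch of calling lookup_couple() with the signature declared above.
 * The bias (FIND_EXACT) and level (LEAF_LEVEL) constants are assumptions;
 * init_lh()/done_lh() are the lock handle helpers used throughout this
 * patch.
 */
static int lookup_couple_sketch(reiser4_tree * tree,
				const reiser4_key * k1, const reiser4_key * k2)
{
	coord_t c1, c2;
	lock_handle lh1, lh2;
	int r1, r2, ret;

	init_lh(&lh1);
	init_lh(&lh2);
	ret = lookup_couple(tree, k1, k2, &c1, &c2, &lh1, &lh2,
			    ZNODE_READ_LOCK, FIND_EXACT /* assumed */,
			    LEAF_LEVEL, LEAF_LEVEL, 0 /* no cbk_flags */,
			    &r1, &r2);
	/* ... inspect c1/c2 under the long-term locks ... */
	done_lh(&lh1);
	done_lh(&lh2);
	return ret;
}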
70973+
70974+
70975+static inline void read_lock_tree(reiser4_tree *tree)
70976+{
70977+ /* check that tree is not locked */
70978+ assert("", (LOCK_CNT_NIL(rw_locked_tree) &&
70979+ LOCK_CNT_NIL(read_locked_tree) &&
70980+ LOCK_CNT_NIL(write_locked_tree)));
70981+ /* check that spinlocks of lower priorities are not held */
70982+ assert("", (LOCK_CNT_NIL(spin_locked_txnh) &&
70983+ LOCK_CNT_NIL(rw_locked_dk) &&
70984+ LOCK_CNT_NIL(spin_locked_stack)));
70985+
70986+ read_lock(&(tree->tree_lock));
70987+
70988+ LOCK_CNT_INC(read_locked_tree);
70989+ LOCK_CNT_INC(rw_locked_tree);
70990+ LOCK_CNT_INC(spin_locked);
70991+}
70992+
70993+static inline void read_unlock_tree(reiser4_tree *tree)
70994+{
70995+ assert("nikita-1375", LOCK_CNT_GTZ(read_locked_tree));
70996+ assert("nikita-1376", LOCK_CNT_GTZ(rw_locked_tree));
70997+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
70998+
70999+ LOCK_CNT_DEC(read_locked_tree);
71000+ LOCK_CNT_DEC(rw_locked_tree);
71001+ LOCK_CNT_DEC(spin_locked);
71002+
71003+ read_unlock(&(tree->tree_lock));
71004+}
71005+
71006+static inline void write_lock_tree(reiser4_tree *tree)
71007+{
71008+ /* check that tree is not locked */
71009+ assert("", (LOCK_CNT_NIL(rw_locked_tree) &&
71010+ LOCK_CNT_NIL(read_locked_tree) &&
71011+ LOCK_CNT_NIL(write_locked_tree)));
71012+ /* check that spinlocks of lower priorities are not held */
71013+ assert("", (LOCK_CNT_NIL(spin_locked_txnh) &&
71014+ LOCK_CNT_NIL(rw_locked_dk) &&
71015+ LOCK_CNT_NIL(spin_locked_stack)));
71016+
71017+ write_lock(&(tree->tree_lock));
71018+
71019+ LOCK_CNT_INC(write_locked_tree);
71020+ LOCK_CNT_INC(rw_locked_tree);
71021+ LOCK_CNT_INC(spin_locked);
71022+}
71023+
71024+static inline void write_unlock_tree(reiser4_tree *tree)
71025+{
71026+ assert("nikita-1375", LOCK_CNT_GTZ(write_locked_tree));
71027+ assert("nikita-1376", LOCK_CNT_GTZ(rw_locked_tree));
71028+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
71029+
71030+ LOCK_CNT_DEC(write_locked_tree);
71031+ LOCK_CNT_DEC(rw_locked_tree);
71032+ LOCK_CNT_DEC(spin_locked);
71033+
71034+ write_unlock(&(tree->tree_lock));
71035+}
71036+
71037+static inline void read_lock_dk(reiser4_tree *tree)
71038+{
71039+ /* check that dk is not locked */
71040+ assert("", (LOCK_CNT_NIL(rw_locked_dk) &&
71041+ LOCK_CNT_NIL(read_locked_dk) &&
71042+ LOCK_CNT_NIL(write_locked_dk)));
71043+ /* check that spinlocks of lower priorities are not held */
71044+ assert("", LOCK_CNT_NIL(spin_locked_stack));
71045+
71046+ read_lock(&((tree)->dk_lock));
71047+
71048+ LOCK_CNT_INC(read_locked_dk);
71049+ LOCK_CNT_INC(rw_locked_dk);
71050+ LOCK_CNT_INC(spin_locked);
71051+}
71052+
71053+static inline void read_unlock_dk(reiser4_tree *tree)
71054+{
71055+ assert("nikita-1375", LOCK_CNT_GTZ(read_locked_dk));
71056+ assert("nikita-1376", LOCK_CNT_GTZ(rw_locked_dk));
71057+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
71058+
71059+ LOCK_CNT_DEC(read_locked_dk);
71060+ LOCK_CNT_DEC(rw_locked_dk);
71061+ LOCK_CNT_DEC(spin_locked);
71062+
71063+ read_unlock(&(tree->dk_lock));
71064+}
71065+
71066+static inline void write_lock_dk(reiser4_tree *tree)
71067+{
71068+ /* check that dk is not locked */
71069+ assert("", (LOCK_CNT_NIL(rw_locked_dk) &&
71070+ LOCK_CNT_NIL(read_locked_dk) &&
71071+ LOCK_CNT_NIL(write_locked_dk)));
71072+ /* check that spinlocks of lower priorities are not held */
71073+ assert("", LOCK_CNT_NIL(spin_locked_stack));
71074+
71075+ write_lock(&((tree)->dk_lock));
71076+
71077+ LOCK_CNT_INC(write_locked_dk);
71078+ LOCK_CNT_INC(rw_locked_dk);
71079+ LOCK_CNT_INC(spin_locked);
71080+}
71081+
71082+static inline void write_unlock_dk(reiser4_tree *tree)
71083+{
71084+ assert("nikita-1375", LOCK_CNT_GTZ(write_locked_dk));
71085+ assert("nikita-1376", LOCK_CNT_GTZ(rw_locked_dk));
71086+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
71087+
71088+ LOCK_CNT_DEC(write_locked_dk);
71089+ LOCK_CNT_DEC(rw_locked_dk);
71090+ LOCK_CNT_DEC(spin_locked);
71091+
71092+ write_unlock(&(tree->dk_lock));
71093+}
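/*
 * The assertions in the helpers above encode a fixed lock ordering: the
 * tree rwlock may only be taken while no dk lock or lower-priority
 * spinlock is held, and the dk lock nests inside it. A minimal sketch of
 * the legal nesting (mirroring what sibling_list_remove() in tree_walk.c
 * does):
 */
static inline void dk_update_sketch(reiser4_tree * tree)
{
	write_lock_tree(tree);	/* outermost: tree topology lock */
	write_lock_dk(tree);	/* nests inside the tree lock */
	/* ... update delimiting keys of the affected znodes here ... */
	write_unlock_dk(tree);
	write_unlock_tree(tree);
}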
71094+
71095+/* estimate api. Implementation is in estimate.c */
71096+reiser4_block_nr estimate_one_insert_item(reiser4_tree *);
71097+reiser4_block_nr estimate_one_insert_into_item(reiser4_tree *);
71098+reiser4_block_nr estimate_insert_flow(tree_level);
71099+reiser4_block_nr estimate_one_item_removal(reiser4_tree *);
71100+reiser4_block_nr calc_estimate_one_insert(tree_level);
71101+reiser4_block_nr estimate_dirty_cluster(struct inode *);
71102+reiser4_block_nr estimate_insert_cluster(struct inode *);
71103+reiser4_block_nr estimate_update_cluster(struct inode *);
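/*
 * A sketch of how these estimates are typically consumed: each returns a
 * worst-case block count, which a caller scales by the number of pending
 * operations when reserving space. The helper below is illustrative, not
 * part of this interface.
 */
static inline reiser4_block_nr estimate_nr_inserts_sketch(reiser4_tree * tree,
							  unsigned nr_items)
{
	/* worst case: every insertion pays the full balancing overhead */
	return nr_items * estimate_one_insert_item(tree);
}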
71104+
71105+
71106+/* __REISER4_TREE_H__ */
71107+#endif
71108+
71109+/* Make Linus happy.
71110+ Local variables:
71111+ c-indentation-style: "K&R"
71112+ mode-name: "LC"
71113+ c-basic-offset: 8
71114+ tab-width: 8
71115+ fill-column: 120
71116+ scroll-step: 1
71117+ End:
71118+*/
71119Index: linux-2.6.16/fs/reiser4/tree_mod.c
71120===================================================================
71121--- /dev/null
71122+++ linux-2.6.16/fs/reiser4/tree_mod.c
71123@@ -0,0 +1,383 @@
71124+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
71125+ * reiser4/README */
71126+
71127+/*
71128+ * Functions to add/delete new nodes to/from the tree.
71129+ *
71130+ * Functions from this file are used by carry (see carry*) to handle:
71131+ *
71132+ * . insertion of new formatted node into tree
71133+ *
71134+ * . addition of new tree root, increasing tree height
71135+ *
71136+ * . removing tree root, decreasing tree height
71137+ *
71138+ */
71139+
71140+#include "forward.h"
71141+#include "debug.h"
71142+#include "dformat.h"
71143+#include "key.h"
71144+#include "coord.h"
71145+#include "plugin/plugin.h"
71146+#include "jnode.h"
71147+#include "znode.h"
71148+#include "tree_mod.h"
71149+#include "block_alloc.h"
71150+#include "tree_walk.h"
71151+#include "tree.h"
71152+#include "super.h"
71153+
71154+#include <linux/err.h>
71155+
71156+static int add_child_ptr(znode * parent, znode * child);
71157+/* warning only issued if error is not -E_REPEAT */
71158+#define ewarning( error, ... ) \
71159+ if( ( error ) != -E_REPEAT ) \
71160+ warning( __VA_ARGS__ )
71161+
71162+/* allocate a new node at @level, immediately to the right of @brother. */
71163+znode *new_node(znode * brother /* existing left neighbor of new node */ ,
71164+ tree_level level /* tree level at which new node is to
71165+ * be allocated */ )
71166+{
71167+ znode *result;
71168+ int retcode;
71169+ reiser4_block_nr blocknr;
71170+
71171+ assert("nikita-930", brother != NULL);
71172+ assert("umka-264", level < REAL_MAX_ZTREE_HEIGHT);
71173+
71174+ retcode = assign_fake_blocknr_formatted(&blocknr);
71175+ if (retcode == 0) {
71176+ result =
71177+ zget(znode_get_tree(brother), &blocknr, NULL, level,
71178+ get_gfp_mask());
71179+ if (IS_ERR(result)) {
71180+ ewarning(PTR_ERR(result), "nikita-929",
71181+ "Cannot allocate znode for carry: %li",
71182+ PTR_ERR(result));
71183+ return result;
71184+ }
71185+ /* cheap test, can be executed even when debugging is off */
71186+ if (!znode_just_created(result)) {
71187+ warning("nikita-2213",
71188+ "Allocated already existing block: %llu",
71189+ (unsigned long long)blocknr);
71190+ zput(result);
71191+ return ERR_PTR(RETERR(-EIO));
71192+ }
71193+
71194+ assert("nikita-931", result != NULL);
71195+ result->nplug = znode_get_tree(brother)->nplug;
71196+ assert("nikita-933", result->nplug != NULL);
71197+
71198+ retcode = zinit_new(result, get_gfp_mask());
71199+ if (retcode == 0) {
71200+ ZF_SET(result, JNODE_CREATED);
71201+ zrelse(result);
71202+ } else {
71203+ zput(result);
71204+ result = ERR_PTR(retcode);
71205+ }
71206+ } else {
71207+		/* failure to allocate a new node during balancing.
71208+		   This should never happen. Ever. Returning -E_REPEAT
71209+		   is not a viable solution, because "out of disk space"
71210+		   is not a transient error that will go away by itself.
71211+		 */
71212+ ewarning(retcode, "nikita-928",
71213+ "Cannot allocate block for carry: %i", retcode);
71214+ result = ERR_PTR(retcode);
71215+ }
71216+ assert("nikita-1071", result != NULL);
71217+ return result;
71218+}
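/*
 * new_node() never returns NULL: it yields either a referenced znode or
 * an ERR_PTR()-encoded error, so callers test with IS_ERR(). A minimal
 * usage sketch; the surrounding balancing context is assumed:
 */
static int new_node_usage_sketch(znode * left_brother)
{
	znode *fresh;

	fresh = new_node(left_brother, znode_get_level(left_brother));
	if (IS_ERR(fresh))
		return PTR_ERR(fresh);	/* block or znode allocation failed */
	/* ... insert a pointer to @fresh into its parent ... */
	zput(fresh);	/* drop the reference obtained through zget() */
	return 0;
}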
71219+
71220+/* allocate new root and add it to the tree
71221+
71222+ This helper function is called by add_new_root().
71223+
71224+*/
71225+znode *add_tree_root(znode * old_root /* existing tree root */ ,
71226+ znode * fake /* "fake" znode */ )
71227+{
71228+ reiser4_tree *tree = znode_get_tree(old_root);
71229+ znode *new_root = NULL; /* to shut gcc up */
71230+ int result;
71231+
71232+ assert("nikita-1069", old_root != NULL);
71233+ assert("umka-262", fake != NULL);
71234+ assert("umka-263", tree != NULL);
71235+
71236+ /* "fake" znode---one always hanging just above current root. This
71237+ node is locked when new root is created or existing root is
71238+ deleted. Downward tree traversal takes lock on it before taking
71239+ lock on a root node. This avoids race conditions with root
71240+ manipulations.
71241+
71242+ */
71243+ assert("nikita-1348", znode_above_root(fake));
71244+ assert("nikita-1211", znode_is_root(old_root));
71245+
71246+ result = 0;
71247+ if (tree->height >= REAL_MAX_ZTREE_HEIGHT) {
71248+ warning("nikita-1344", "Tree is too tall: %i", tree->height);
71249+ /* ext2 returns -ENOSPC when it runs out of free inodes with a
71250+ following comment (fs/ext2/ialloc.c:441): Is it really
71251+ ENOSPC?
71252+
71253+ -EXFULL? -EINVAL?
71254+ */
71255+ result = RETERR(-ENOSPC);
71256+ } else {
71257+		/* Allocate a block for the new root. It is not that
71258+		   important where it is allocated, as the root is
71259+		   almost always in memory. Moreover, allocate-on-
71260+		   flush may be in effect here anyway.
71261+		 */
71262+ assert("nikita-1448", znode_is_root(old_root));
71263+ new_root = new_node(fake, tree->height + 1);
71264+ if (!IS_ERR(new_root) && (result = zload(new_root)) == 0) {
71265+ lock_handle rlh;
71266+
71267+ init_lh(&rlh);
71268+ result =
71269+ longterm_lock_znode(&rlh, new_root,
71270+ ZNODE_WRITE_LOCK,
71271+ ZNODE_LOCK_LOPRI);
71272+ if (result == 0) {
71273+ parent_coord_t *in_parent;
71274+
71275+ znode_make_dirty(fake);
71276+
71277+ /* new root is a child of "fake" node */
71278+ write_lock_tree(tree);
71279+
71280+ ++tree->height;
71281+
71282+ /* recalculate max balance overhead */
71283+ tree->estimate_one_insert =
71284+ estimate_one_insert_item(tree);
71285+
71286+ tree->root_block = *znode_get_block(new_root);
71287+ in_parent = &new_root->in_parent;
71288+ init_parent_coord(in_parent, fake);
71289+			/* manually insert the new root into the sibling
71290+			 * list. With this, all nodes involved in
71291+			 * balancing are connected after balancing is
71292+			 * done---a useful invariant to check. */
71293+ sibling_list_insert_nolock(new_root, NULL);
71294+ write_unlock_tree(tree);
71295+
71296+			/* insert into the new root a pointer to
71297+			   @old_root. */
71298+ assert("nikita-1110",
71299+ WITH_DATA(new_root,
71300+ node_is_empty(new_root)));
71301+ write_lock_dk(tree);
71302+ znode_set_ld_key(new_root, min_key());
71303+ znode_set_rd_key(new_root, max_key());
71304+ write_unlock_dk(tree);
71305+ if (REISER4_DEBUG) {
71306+ ZF_CLR(old_root, JNODE_LEFT_CONNECTED);
71307+ ZF_CLR(old_root, JNODE_RIGHT_CONNECTED);
71308+ ZF_SET(old_root, JNODE_ORPHAN);
71309+ }
71310+ result = add_child_ptr(new_root, old_root);
71311+ done_lh(&rlh);
71312+ }
71313+ zrelse(new_root);
71314+ }
71315+ }
71316+ if (result != 0)
71317+ new_root = ERR_PTR(result);
71318+ return new_root;
71319+}
71320+
71321+/* build &reiser4_item_data for inserting child pointer
71322+
71323+ Build &reiser4_item_data that can be later used to insert pointer to @child
71324+ in its parent.
71325+
71326+*/
71327+void build_child_ptr_data(znode * child /* node pointer to which will be
71328+ * inserted */ ,
71329+ reiser4_item_data * data /* where to store result */ )
71330+{
71331+ assert("nikita-1116", child != NULL);
71332+ assert("nikita-1117", data != NULL);
71333+
71334+	/*
71335+	 * NOTE: use the address of the child's blocknr as the address of the
71336+	 * data to be inserted. As a result, the data gets into the on-disk
71337+	 * structure in CPU byte order; the internal item's create_hook
71338+	 * converts it to little-endian byte order.
71339+	 */
71340+ data->data = (char *)znode_get_block(child);
71341+ /* data -> data is kernel space */
71342+ data->user = 0;
71343+ data->length = sizeof(reiser4_block_nr);
71344+ /* FIXME-VS: hardcoded internal item? */
71345+
71346+ /* AUDIT: Is it possible that "item_plugin_by_id" may find nothing? */
71347+ data->iplug = item_plugin_by_id(NODE_POINTER_ID);
71348+}
71349+
71350+/* add a pointer to @child into the empty @parent.
71351+
71352+   This is used when the pointer to the old root is inserted into the new
71353+   root, which is empty.
71354+*/
71355+static int add_child_ptr(znode * parent, znode * child)
71356+{
71357+ coord_t coord;
71358+ reiser4_item_data data;
71359+ int result;
71360+ reiser4_key key;
71361+
71362+ assert("nikita-1111", parent != NULL);
71363+ assert("nikita-1112", child != NULL);
71364+ assert("nikita-1115",
71365+ znode_get_level(parent) == znode_get_level(child) + 1);
71366+
71367+ result = zload(parent);
71368+ if (result != 0)
71369+ return result;
71370+ assert("nikita-1113", node_is_empty(parent));
71371+ coord_init_first_unit(&coord, parent);
71372+
71373+ build_child_ptr_data(child, &data);
71374+ data.arg = NULL;
71375+
71376+ read_lock_dk(znode_get_tree(parent));
71377+ key = *znode_get_ld_key(child);
71378+ read_unlock_dk(znode_get_tree(parent));
71379+
71380+ result = node_plugin_by_node(parent)->create_item(&coord, &key, &data,
71381+ NULL);
71382+ znode_make_dirty(parent);
71383+ zrelse(parent);
71384+ return result;
71385+}
71386+
71387+/* actually remove tree root */
71388+static int kill_root(reiser4_tree * tree /* tree from which root is being
71389+ * removed */ ,
71390+ znode * old_root /* root node that is being removed */ ,
71391+ znode * new_root /* new root---sole child of *
71392+ * @old_root */ ,
71393+ const reiser4_block_nr * new_root_blk /* disk address of
71394+ * @new_root */ )
71395+{
71396+ znode *uber;
71397+ int result;
71398+ lock_handle handle_for_uber;
71399+
71400+ assert("umka-265", tree != NULL);
71401+ assert("nikita-1198", new_root != NULL);
71402+ assert("nikita-1199",
71403+ znode_get_level(new_root) + 1 == znode_get_level(old_root));
71404+
71405+ assert("nikita-1201", znode_is_write_locked(old_root));
71406+
71407+ assert("nikita-1203",
71408+ disk_addr_eq(new_root_blk, znode_get_block(new_root)));
71409+
71410+ init_lh(&handle_for_uber);
71411+ /* obtain and lock "fake" znode protecting changes in tree height. */
71412+ result = get_uber_znode(tree, ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI,
71413+ &handle_for_uber);
71414+ if (result == 0) {
71415+ uber = handle_for_uber.node;
71416+
71417+ znode_make_dirty(uber);
71418+
71419+		/* don't take a long-term lock on @new_root; take the spinlock. */
71420+
71421+ write_lock_tree(tree);
71422+
71423+ tree->root_block = *new_root_blk;
71424+ --tree->height;
71425+
71426+ /* recalculate max balance overhead */
71427+ tree->estimate_one_insert = estimate_one_insert_item(tree);
71428+
71429+ assert("nikita-1202",
71430+ tree->height == znode_get_level(new_root));
71431+
71432+		/* new root is a child of the "fake" node */
71433+ init_parent_coord(&new_root->in_parent, uber);
71434+ ++uber->c_count;
71435+
71436+ /* sibling_list_insert_nolock(new_root, NULL); */
71437+ write_unlock_tree(tree);
71438+
71439+ /* reinitialise old root. */
71440+ result = node_plugin_by_node(old_root)->init(old_root);
71441+ znode_make_dirty(old_root);
71442+ if (result == 0) {
71443+ assert("nikita-1279", node_is_empty(old_root));
71444+ ZF_SET(old_root, JNODE_HEARD_BANSHEE);
71445+ old_root->c_count = 0;
71446+ }
71447+ }
71448+ done_lh(&handle_for_uber);
71449+
71450+ return result;
71451+}
71452+
71453+/* remove tree root
71454+
71455+   This function removes the tree root, decreasing the tree height by one.
71456+   The tree root and its only child (which is going to become the new tree
71457+   root) are write-locked on entry.
71458+
71459+   To remove the tree root we need to take a lock on the special "fake"
71460+   znode that protects changes of tree height. See comments in
71461+   add_tree_root() for more on this.
71462+
71463+   Parent pointers also have to be updated in the
71464+   old and new root. To simplify the code, the function is split into two
71465+   parts: the outer kill_tree_root() collects all necessary arguments and
71466+   calls kill_root() to do the actual job.
71467+
71468+*/
71469+int kill_tree_root(znode * old_root /* tree root that we are removing */ )
71470+{
71471+ int result;
71472+ coord_t down_link;
71473+ znode *new_root;
71474+ reiser4_tree *tree;
71475+
71476+ assert("umka-266", current_tree != NULL);
71477+ assert("nikita-1194", old_root != NULL);
71478+ assert("nikita-1196", znode_is_root(old_root));
71479+ assert("nikita-1200", node_num_items(old_root) == 1);
71480+ assert("nikita-1401", znode_is_write_locked(old_root));
71481+
71482+ coord_init_first_unit(&down_link, old_root);
71483+
71484+ tree = znode_get_tree(old_root);
71485+ new_root = child_znode(&down_link, old_root, 0, 1);
71486+ if (!IS_ERR(new_root)) {
71487+ result =
71488+ kill_root(tree, old_root, new_root,
71489+ znode_get_block(new_root));
71490+ zput(new_root);
71491+ } else
71492+ result = PTR_ERR(new_root);
71493+
71494+ return result;
71495+}
71496+
71497+/* Make Linus happy.
71498+ Local variables:
71499+ c-indentation-style: "K&R"
71500+ mode-name: "LC"
71501+ c-basic-offset: 8
71502+ tab-width: 8
71503+ fill-column: 120
71504+ scroll-step: 1
71505+ End:
71506+*/
71507Index: linux-2.6.16/fs/reiser4/tree_mod.h
71508===================================================================
71509--- /dev/null
71510+++ linux-2.6.16/fs/reiser4/tree_mod.h
71511@@ -0,0 +1,29 @@
71512+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
71513+ * reiser4/README */
71514+
71515+/* Functions to add/delete new nodes to/from the tree. See tree_mod.c for
71516+ * comments. */
71517+
71518+#if !defined( __REISER4_TREE_MOD_H__ )
71519+#define __REISER4_TREE_MOD_H__
71520+
71521+#include "forward.h"
71522+
71523+znode *new_node(znode * brother, tree_level level);
71524+znode *add_tree_root(znode * old_root, znode * fake);
71525+int kill_tree_root(znode * old_root);
71526+void build_child_ptr_data(znode * child, reiser4_item_data * data);
71527+
71528+/* __REISER4_TREE_MOD_H__ */
71529+#endif
71530+
71531+/* Make Linus happy.
71532+ Local variables:
71533+ c-indentation-style: "K&R"
71534+ mode-name: "LC"
71535+ c-basic-offset: 8
71536+ tab-width: 8
71537+ fill-column: 120
71538+ scroll-step: 1
71539+ End:
71540+*/
71541Index: linux-2.6.16/fs/reiser4/tree_walk.c
71542===================================================================
71543--- /dev/null
71544+++ linux-2.6.16/fs/reiser4/tree_walk.c
71545@@ -0,0 +1,926 @@
71546+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
71547+ * reiser4/README */
71548+
71549+/* Routines and macros to:
71550+
71551+ get_left_neighbor()
71552+
71553+ get_right_neighbor()
71554+
71555+ get_parent()
71556+
71557+ get_first_child()
71558+
71559+ get_last_child()
71560+
71561+ various routines to walk the whole tree and do things to it like
71562+ repack it, or move it to tertiary storage. Please make them as
71563+ generic as is reasonable.
71564+
71565+*/
71566+
71567+#include "forward.h"
71568+#include "debug.h"
71569+#include "dformat.h"
71570+#include "coord.h"
71571+#include "plugin/item/item.h"
71572+#include "jnode.h"
71573+#include "znode.h"
71574+#include "tree_walk.h"
71575+#include "tree.h"
71576+#include "super.h"
71577+
71578+/* These macros are used internally in tree_walk.c in an attempt to make
71579+   the lock_neighbor() code reusable for building lock_parent(),
71580+   lock_right_neighbor and lock_left_neighbor */
71581+#define GET_NODE_BY_PTR_OFFSET(node, off) (*(znode**)(((unsigned long)(node)) + (off)))
71582+#define FIELD_OFFSET(name) offsetof(znode, name)
71583+#define PARENT_PTR_OFFSET FIELD_OFFSET(in_parent.node)
71584+#define LEFT_PTR_OFFSET FIELD_OFFSET(left)
71585+#define RIGHT_PTR_OFFSET FIELD_OFFSET(right)
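/*
 * In effect, these macros let a single routine follow any of the three
 * znode links given only a byte offset. A sketch of the expansion:
 * GET_NODE_BY_PTR_OFFSET(node, RIGHT_PTR_OFFSET) becomes
 * *(znode**)((unsigned long)node + offsetof(znode, right)), i.e. it
 * simply reads node->right.
 */
static inline znode *right_neighbor_sketch(znode * node)
{
	return GET_NODE_BY_PTR_OFFSET(node, RIGHT_PTR_OFFSET);
}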
71586+
71587+/* This is the generic procedure to get and lock a `generic' neighbor (left
71588+   or right neighbor, or parent). It implements the common algorithm for all
71589+   cases of taking a lock on a neighbor node; only the znode structure field
71590+   differs in each case. It is parameterized by the ptr_offset argument: the
71591+   byte offset of the pointer to the desired neighbor within the current
71592+   node's znode structure. This function must be called with the tree lock held */
71593+static int lock_neighbor(
71594+ /* resulting lock handle */
71595+ lock_handle * result,
71596+ /* znode to lock */
71597+ znode * node,
71598+ /* pointer to neighbor (or parent) znode field offset, in bytes from
71599+ the base address of znode structure */
71600+ int ptr_offset,
71601+ /* lock mode for longterm_lock_znode call */
71602+ znode_lock_mode mode,
71603+ /* lock request for longterm_lock_znode call */
71604+ znode_lock_request req,
71605+ /* GN_* flags */
71606+ int flags, int rlocked)
71607+{
71608+ reiser4_tree *tree = znode_get_tree(node);
71609+ znode *neighbor;
71610+ int ret;
71611+
71612+ assert("umka-236", node != NULL);
71613+ assert("umka-237", tree != NULL);
71614+ assert_rw_locked(&(tree->tree_lock));
71615+
71616+ if (flags & GN_TRY_LOCK)
71617+ req |= ZNODE_LOCK_NONBLOCK;
71618+ if (flags & GN_SAME_ATOM)
71619+ req |= ZNODE_LOCK_DONT_FUSE;
71620+
71621+	/* get the neighbor's address by using the sibling link; quit the
71622+	   while loop (and return) if the link is not available. */
71623+ while (1) {
71624+ neighbor = GET_NODE_BY_PTR_OFFSET(node, ptr_offset);
71625+
71626+ /* return -E_NO_NEIGHBOR if parent or side pointer is NULL or if
71627+ * node pointed by it is not connected.
71628+ *
71629+ * However, GN_ALLOW_NOT_CONNECTED option masks "connected"
71630+ * check and allows passing reference to not connected znode to
71631+ * subsequent longterm_lock_znode() call. This kills possible
71632+ * busy loop if we are trying to get longterm lock on locked but
71633+ * not yet connected parent node. */
71634+ if (neighbor == NULL || !((flags & GN_ALLOW_NOT_CONNECTED)
71635+ || znode_is_connected(neighbor))) {
71636+ return RETERR(-E_NO_NEIGHBOR);
71637+ }
71638+
71639+ /* protect it from deletion. */
71640+ zref(neighbor);
71641+
71642+ rlocked ? read_unlock_tree(tree) : write_unlock_tree(tree);
71643+
71644+ ret = longterm_lock_znode(result, neighbor, mode, req);
71645+
71646+ /* The lock handle obtains its own reference, release the one from above. */
71647+ zput(neighbor);
71648+
71649+ rlocked ? read_lock_tree(tree) : write_lock_tree(tree);
71650+
71651+		/* restart if the node we got a reference to is being
71652+		   invalidated. We should not take a reference to this
71653+		   node again. */
71654+ if (ret == -EINVAL)
71655+ continue;
71656+ if (ret)
71657+ return ret;
71658+
71659+ /* check if neighbor link still points to just locked znode;
71660+ the link could have been changed while the process slept. */
71661+ if (neighbor == GET_NODE_BY_PTR_OFFSET(node, ptr_offset))
71662+ return 0;
71663+
71664+ /* znode was locked by mistake; unlock it and restart locking
71665+ process from beginning. */
71666+ rlocked ? read_unlock_tree(tree) : write_unlock_tree(tree);
71667+ longterm_unlock_znode(result);
71668+ rlocked ? read_lock_tree(tree) : write_lock_tree(tree);
71669+ }
71670+}
71671+
71672+/* get parent node with longterm lock, accepts GN* flags. */
71673+int reiser4_get_parent_flags(lock_handle * lh /* resulting lock handle */ ,
71674+ znode * node /* child node */ ,
71675+ znode_lock_mode mode
71676+ /* type of lock: read or write */ ,
71677+ int flags /* GN_* flags */ )
71678+{
71679+ int result;
71680+
71681+ read_lock_tree(znode_get_tree(node));
71682+ result = lock_neighbor(lh, node, PARENT_PTR_OFFSET, mode,
71683+ ZNODE_LOCK_HIPRI, flags, 1);
71684+ read_unlock_tree(znode_get_tree(node));
71685+ return result;
71686+}
71687+
71688+/* wrapper function to lock right or left neighbor depending on GN_GO_LEFT
71689+ bit in @flags parameter */
71690+/* Audited by: umka (2002.06.14) */
71691+static inline int
71692+lock_side_neighbor(lock_handle * result,
71693+ znode * node, znode_lock_mode mode, int flags, int rlocked)
71694+{
71695+ int ret;
71696+ int ptr_offset;
71697+ znode_lock_request req;
71698+
71699+ if (flags & GN_GO_LEFT) {
71700+ ptr_offset = LEFT_PTR_OFFSET;
71701+ req = ZNODE_LOCK_LOPRI;
71702+ } else {
71703+ ptr_offset = RIGHT_PTR_OFFSET;
71704+ req = ZNODE_LOCK_HIPRI;
71705+ }
71706+
71707+ ret =
71708+ lock_neighbor(result, node, ptr_offset, mode, req, flags, rlocked);
71709+
71710+	if (ret == -E_NO_NEIGHBOR)	/* when walking left or right,
71711+					 * -E_NO_NEIGHBOR does not guarantee
71712+					 * that the neighbor is absent from the
71713+					 * tree; in this case we return -ENOENT,
71714+					 * meaning the neighbor was at least not
71715+					 * found in the cache */
71715+ return RETERR(-ENOENT);
71716+
71717+ return ret;
71718+}
71719+
71720+#if REISER4_DEBUG
71721+
71722+int check_sibling_list(znode * node)
71723+{
71724+ znode *scan;
71725+ znode *next;
71726+
71727+ assert("nikita-3283", LOCK_CNT_GTZ(write_locked_tree));
71728+
71729+ if (node == NULL)
71730+ return 1;
71731+
71732+ if (ZF_ISSET(node, JNODE_RIP))
71733+ return 1;
71734+
71735+ assert("nikita-3270", node != NULL);
71736+ assert_rw_write_locked(&(znode_get_tree(node)->tree_lock));
71737+
71738+ for (scan = node; znode_is_left_connected(scan); scan = next) {
71739+ next = scan->left;
71740+ if (next != NULL && !ZF_ISSET(next, JNODE_RIP)) {
71741+ assert("nikita-3271", znode_is_right_connected(next));
71742+ assert("nikita-3272", next->right == scan);
71743+ } else
71744+ break;
71745+ }
71746+ for (scan = node; znode_is_right_connected(scan); scan = next) {
71747+ next = scan->right;
71748+ if (next != NULL && !ZF_ISSET(next, JNODE_RIP)) {
71749+ assert("nikita-3273", znode_is_left_connected(next));
71750+ assert("nikita-3274", next->left == scan);
71751+ } else
71752+ break;
71753+ }
71754+ return 1;
71755+}
71756+
71757+#endif
71758+
71759+/* Znode sibling pointers maintenance. */
71760+
71761+/* Znode sibling pointers are established between any neighboring nodes that
71762+   are in cache. There are two znode state bits (JNODE_LEFT_CONNECTED,
71763+   JNODE_RIGHT_CONNECTED): if the left or right sibling pointer contains an
71764+   actual value (even NULL), the corresponding JNODE_*_CONNECTED bit is set.
71765+
71766+   Reiser4 tree operations that may allocate new znodes (CBK, tree balancing)
71767+   take care of searching for znode neighbors (a hash table lookup may be
71768+   required), establishing sibling pointers between them and setting the
71769+   JNODE_*_CONNECTED state bits. */
71770+
71771+/* adjusting of sibling pointers and `connected' states for two
71772+ neighbors; works if one neighbor is NULL (was not found). */
71773+
71774+/* FIXME-VS: this is unstatic-ed to use in tree.c in prepare_twig_cut */
71775+void link_left_and_right(znode * left, znode * right)
71776+{
71777+ assert("nikita-3275", check_sibling_list(left));
71778+ assert("nikita-3275", check_sibling_list(right));
71779+
71780+ if (left != NULL) {
71781+ if (left->right == NULL) {
71782+ left->right = right;
71783+ ZF_SET(left, JNODE_RIGHT_CONNECTED);
71784+
71785+ ON_DEBUG(left->right_version =
71786+ atomic_inc_return(&delim_key_version);
71787+ );
71788+
71789+ } else if (ZF_ISSET(left->right, JNODE_HEARD_BANSHEE)
71790+ && left->right != right) {
71791+
71792+ ON_DEBUG(left->right->left_version =
71793+ atomic_inc_return(&delim_key_version);
71794+ left->right_version =
71795+ atomic_inc_return(&delim_key_version););
71796+
71797+ left->right->left = NULL;
71798+ left->right = right;
71799+ ZF_SET(left, JNODE_RIGHT_CONNECTED);
71800+ } else
71801+ /*
71802+ * there is a race condition in renew_sibling_link()
71803+ * and assertions below check that it is only one
71804+ * there. Thread T1 calls renew_sibling_link() without
71805+ * GN_NO_ALLOC flag. zlook() doesn't find neighbor
71806+ * node, but before T1 gets to the
71807+ * link_left_and_right(), another thread T2 creates
71808+ * neighbor node and connects it. check for
71809+ * left->right == NULL above protects T1 from
71810+ * overwriting correct left->right pointer installed
71811+ * by T2.
71812+ */
71813+ assert("nikita-3302",
71814+ right == NULL || left->right == right);
71815+ }
71816+ if (right != NULL) {
71817+ if (right->left == NULL) {
71818+ right->left = left;
71819+ ZF_SET(right, JNODE_LEFT_CONNECTED);
71820+
71821+ ON_DEBUG(right->left_version =
71822+ atomic_inc_return(&delim_key_version);
71823+ );
71824+
71825+ } else if (ZF_ISSET(right->left, JNODE_HEARD_BANSHEE)
71826+ && right->left != left) {
71827+
71828+ ON_DEBUG(right->left->right_version =
71829+ atomic_inc_return(&delim_key_version);
71830+ right->left_version =
71831+ atomic_inc_return(&delim_key_version););
71832+
71833+ right->left->right = NULL;
71834+ right->left = left;
71835+ ZF_SET(right, JNODE_LEFT_CONNECTED);
71836+
71837+ } else
71838+ assert("nikita-3303",
71839+ left == NULL || right->left == left);
71840+ }
71841+ assert("nikita-3275", check_sibling_list(left));
71842+ assert("nikita-3275", check_sibling_list(right));
71843+}
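/*
 * link_left_and_right() runs under the tree write lock, and either argument
 * may be NULL when the corresponding neighbor was not found in cache. A
 * minimal sketch: recording that @left currently has no cached right
 * neighbor still sets JNODE_RIGHT_CONNECTED on it.
 */
static void mark_rightmost_sketch(reiser4_tree * tree, znode * left)
{
	write_lock_tree(tree);
	link_left_and_right(left, NULL /* no cached right neighbor */);
	write_unlock_tree(tree);
}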
71844+
71845+/* Audited by: umka (2002.06.14) */
71846+static void link_znodes(znode * first, znode * second, int to_left)
71847+{
71848+ if (to_left)
71849+ link_left_and_right(second, first);
71850+ else
71851+ link_left_and_right(first, second);
71852+}
71853+
71854+/* get the next (to the left or to the right, depending on the GN_GO_LEFT bit
71855+   in flags) unit position of the coord in the horizontal direction, even
71856+   across a node boundary. Must be called under the tree lock, which
71857+   guarantees the absence of a sibling link on the parent level when
71858+   lock_side_neighbor() fails with -ENOENT. */
71859+static int far_next_coord(coord_t * coord, lock_handle * handle, int flags)
71860+{
71861+ int ret;
71862+ znode *node;
71863+ reiser4_tree *tree;
71864+
71865+ assert("umka-243", coord != NULL);
71866+ assert("umka-244", handle != NULL);
71867+ assert("zam-1069", handle->node == NULL);
71868+
71869+ ret =
71870+ (flags & GN_GO_LEFT) ? coord_prev_unit(coord) :
71871+ coord_next_unit(coord);
71872+ if (!ret)
71873+ return 0;
71874+
71875+ ret =
71876+ lock_side_neighbor(handle, coord->node, ZNODE_READ_LOCK, flags, 0);
71877+ if (ret)
71878+ return ret;
71879+
71880+ node = handle->node;
71881+ tree = znode_get_tree(node);
71882+ write_unlock_tree(tree);
71883+
71884+ coord_init_zero(coord);
71885+
71886+	/* We avoid a synchronous read here if the GN_ASYNC flag specifies it. */
71887+ if ((flags & GN_ASYNC) && znode_page(handle->node) == NULL) {
71888+ ret = jstartio(ZJNODE(handle->node));
71889+ if (!ret)
71890+ ret = -E_REPEAT;
71891+ goto error_locked;
71892+ }
71893+
71894+	/* the corresponding zrelse() should be called by the clients of
71895+	   far_next_coord(), at the place where this node gets unlocked. */
71896+ ret = zload(handle->node);
71897+ if (ret)
71898+ goto error_locked;
71899+
71900+ if (flags & GN_GO_LEFT)
71901+ coord_init_last_unit(coord, node);
71902+ else
71903+ coord_init_first_unit(coord, node);
71904+
71905+ if (0) {
71906+ error_locked:
71907+ longterm_unlock_znode(handle);
71908+ }
71909+ write_lock_tree(tree);
71910+ return ret;
71911+}
71912+
71913+/* Very significant function which performs a step in the horizontal direction
71914+   when the sibling pointer is not available. Actually, it is the only
71915+   function which does it.
71916+   Note: this function does not restore locking status at exit; the
71917+   caller must take care of proper unlocking and zrelsing */
71918+static int
71919+renew_sibling_link(coord_t * coord, lock_handle * handle, znode * child,
71920+ tree_level level, int flags, int *nr_locked)
71921+{
71922+ int ret;
71923+ int to_left = flags & GN_GO_LEFT;
71924+ reiser4_block_nr da;
71925+	/* parent of the neighbor node; it is set to the child's parent until
71926+	   we detect that the child and the neighbor do not share one parent */
71927+ znode *side_parent = coord->node;
71928+ reiser4_tree *tree = znode_get_tree(child);
71929+ znode *neighbor = NULL;
71930+
71931+ assert("umka-245", coord != NULL);
71932+ assert("umka-246", handle != NULL);
71933+ assert("umka-247", child != NULL);
71934+ assert("umka-303", tree != NULL);
71935+
71936+ init_lh(handle);
71937+ write_lock_tree(tree);
71938+ ret = far_next_coord(coord, handle, flags);
71939+
71940+ if (ret) {
71941+ if (ret != -ENOENT) {
71942+ write_unlock_tree(tree);
71943+ return ret;
71944+ }
71945+ } else {
71946+ item_plugin *iplug;
71947+
71948+ if (handle->node != NULL) {
71949+ (*nr_locked)++;
71950+ side_parent = handle->node;
71951+ }
71952+
71953+		/* does the coord object point to an internal item? We do not
71954+		   support sibling pointers between znodes for formatted and
71955+		   unformatted nodes, and return -E_NO_NEIGHBOR in that case. */
71956+ iplug = item_plugin_by_coord(coord);
71957+ if (!item_is_internal(coord)) {
71958+ link_znodes(child, NULL, to_left);
71959+ write_unlock_tree(tree);
71960+ /* we know there can't be formatted neighbor */
71961+ return RETERR(-E_NO_NEIGHBOR);
71962+ }
71963+ write_unlock_tree(tree);
71964+
71965+ iplug->s.internal.down_link(coord, NULL, &da);
71966+
71967+ if (flags & GN_NO_ALLOC) {
71968+ neighbor = zlook(tree, &da);
71969+ } else {
71970+ neighbor =
71971+ zget(tree, &da, side_parent, level, get_gfp_mask());
71972+ }
71973+
71974+ if (IS_ERR(neighbor)) {
71975+ ret = PTR_ERR(neighbor);
71976+ return ret;
71977+ }
71978+
71979+ if (neighbor)
71980+ /* update delimiting keys */
71981+ set_child_delimiting_keys(coord->node, coord, neighbor);
71982+
71983+ write_lock_tree(tree);
71984+ }
71985+
71986+ if (likely(neighbor == NULL ||
71987+ (znode_get_level(child) == znode_get_level(neighbor)
71988+ && child != neighbor)))
71989+ link_znodes(child, neighbor, to_left);
71990+ else {
71991+ warning("nikita-3532",
71992+ "Sibling nodes on the different levels: %i != %i\n",
71993+ znode_get_level(child), znode_get_level(neighbor));
71994+ ret = RETERR(-EIO);
71995+ }
71996+
71997+ write_unlock_tree(tree);
71998+
71999+	/* if GN_NO_ALLOC isn't set, we keep the reference to the neighbor znode */
72000+ if (neighbor != NULL && (flags & GN_NO_ALLOC))
72001+ /* atomic_dec(&ZJNODE(neighbor)->x_count); */
72002+ zput(neighbor);
72003+
72004+ return ret;
72005+}
72006+
72007+/* This function establishes a one-sided sibling relation. */
72008+/* Audited by: umka (2002.06.14) */
72009+static int connect_one_side(coord_t * coord, znode * node, int flags)
72010+{
72011+ coord_t local;
72012+ lock_handle handle;
72013+ int nr_locked;
72014+ int ret;
72015+
72016+ assert("umka-248", coord != NULL);
72017+ assert("umka-249", node != NULL);
72018+
72019+ coord_dup_nocheck(&local, coord);
72020+
72021+ init_lh(&handle);
72022+
72023+ ret =
72024+ renew_sibling_link(&local, &handle, node, znode_get_level(node),
72025+ flags | GN_NO_ALLOC, &nr_locked);
72026+
72027+ if (handle.node != NULL) {
72028+ /* complementary operations for zload() and lock() in far_next_coord() */
72029+ zrelse(handle.node);
72030+ longterm_unlock_znode(&handle);
72031+ }
72032+
72033+	/* we swallow error codes which are not interesting for us, because we
72034+	   run renew_sibling_link() only for znode connection. */
72035+ if (ret == -ENOENT || ret == -E_NO_NEIGHBOR)
72036+ return 0;
72037+
72038+ return ret;
72039+}
72040+
72041+/* if @child is not in `connected' state, performs hash searches for left and
72042+ right neighbor nodes and establishes horizontal sibling links */
72043+/* Audited by: umka (2002.06.14), umka (2002.06.15) */
72044+int connect_znode(coord_t * parent_coord, znode * child)
72045+{
72046+ reiser4_tree *tree = znode_get_tree(child);
72047+ int ret = 0;
72048+
72049+ assert("zam-330", parent_coord != NULL);
72050+ assert("zam-331", child != NULL);
72051+ assert("zam-332", parent_coord->node != NULL);
72052+ assert("umka-305", tree != NULL);
72053+
72054+ /* it is trivial to `connect' root znode because it can't have
72055+ neighbors */
72056+ if (znode_above_root(parent_coord->node)) {
72057+ child->left = NULL;
72058+ child->right = NULL;
72059+ ZF_SET(child, JNODE_LEFT_CONNECTED);
72060+ ZF_SET(child, JNODE_RIGHT_CONNECTED);
72061+
72062+ ON_DEBUG(child->left_version =
72063+ atomic_inc_return(&delim_key_version);
72064+ child->right_version =
72065+ atomic_inc_return(&delim_key_version););
72066+
72067+ return 0;
72068+ }
72069+
72070+ /* load parent node */
72071+ coord_clear_iplug(parent_coord);
72072+ ret = zload(parent_coord->node);
72073+
72074+ if (ret != 0)
72075+ return ret;
72076+
72077+ /* protect `connected' state check by tree_lock */
72078+ read_lock_tree(tree);
72079+
72080+ if (!znode_is_right_connected(child)) {
72081+ read_unlock_tree(tree);
72082+ /* connect right (default is right) */
72083+ ret = connect_one_side(parent_coord, child, GN_NO_ALLOC);
72084+ if (ret)
72085+ goto zrelse_and_ret;
72086+
72087+ read_lock_tree(tree);
72088+ }
72089+
72090+ ret = znode_is_left_connected(child);
72091+
72092+ read_unlock_tree(tree);
72093+
72094+ if (!ret) {
72095+ ret =
72096+ connect_one_side(parent_coord, child,
72097+ GN_NO_ALLOC | GN_GO_LEFT);
72098+ } else
72099+ ret = 0;
72100+
72101+ zrelse_and_ret:
72102+ zrelse(parent_coord->node);
72103+
72104+ return ret;
72105+}
72106+
72107+/* this function is like renew_sibling_link() but allocates the neighbor node
72108+   if it doesn't exist, and `connects' it. It may require making two steps in
72109+   the horizontal direction: the first for finding/allocating the neighbor
72110+   node, the second for finding the neighbor's neighbor to connect the
72111+   freshly allocated znode. */
72112+/* Audited by: umka (2002.06.14), umka (2002.06.15) */
72113+static int
72114+renew_neighbor(coord_t * coord, znode * node, tree_level level, int flags)
72115+{
72116+ coord_t local;
72117+ lock_handle empty[2];
72118+ reiser4_tree *tree = znode_get_tree(node);
72119+ znode *neighbor = NULL;
72120+ int nr_locked = 0;
72121+ int ret;
72122+
72123+ assert("umka-250", coord != NULL);
72124+ assert("umka-251", node != NULL);
72125+ assert("umka-307", tree != NULL);
72126+ assert("umka-308", level <= tree->height);
72127+
72128+	/* umka (2002.06.14)
72129+	   There should probably be a check here for the validity of "level".
72130+	   Something like assert("xxx-yyy", level < REAL_MAX_ZTREE_HEIGHT);
72131+	 */
72132+
72133+ coord_dup(&local, coord);
72134+
72135+ ret =
72136+ renew_sibling_link(&local, &empty[0], node, level,
72137+ flags & ~GN_NO_ALLOC, &nr_locked);
72138+ if (ret)
72139+ goto out;
72140+
72141+	/* the tree lock is not needed here because we keep the parent node(s)
72142+	   locked and the reference count of the neighbor znode incremented */
72143+ neighbor = (flags & GN_GO_LEFT) ? node->left : node->right;
72144+
72145+ read_lock_tree(tree);
72146+ ret = znode_is_connected(neighbor);
72147+ read_unlock_tree(tree);
72148+ if (ret) {
72149+ ret = 0;
72150+ goto out;
72151+ }
72152+
72153+ ret =
72154+ renew_sibling_link(&local, &empty[nr_locked], neighbor, level,
72155+ flags | GN_NO_ALLOC, &nr_locked);
72156+ /* second renew_sibling_link() call is used for znode connection only,
72157+ so we can live with these errors */
72158+ if (-ENOENT == ret || -E_NO_NEIGHBOR == ret)
72159+ ret = 0;
72160+
72161+ out:
72162+
72163+ for (--nr_locked; nr_locked >= 0; --nr_locked) {
72164+ zrelse(empty[nr_locked].node);
72165+ longterm_unlock_znode(&empty[nr_locked]);
72166+ }
72167+
72168+ if (neighbor != NULL)
72169+ /* decrement znode reference counter without actually
72170+ releasing it. */
72171+ atomic_dec(&ZJNODE(neighbor)->x_count);
72172+
72173+ return ret;
72174+}
72175+
72176+/*
72177+  reiser4_get_neighbor() -- lock node's neighbor.
72178+
72179+  reiser4_get_neighbor() locks the node's neighbor (left or right, depending
72180+  on the given parameter) using the sibling link to it. If the sibling link
72181+  is not available (i.e. the neighbor znode is not in cache) and the flags
72182+  allow reading blocks, we go one level up for information about the
72183+  neighbor's disk address. We lock the node's parent; if it is the common
72184+  parent of both 'node' and its neighbor, the neighbor's disk address is in
72185+  the next (to the left or right) down link from the link that points to
72186+  the original node. If not, we need to lock the parent's neighbor, read
72187+  its content and take the first (last) downlink with the neighbor's disk
72188+  address. That locking can be done via the sibling link and the
72189+  lock_neighbor() function, if the sibling link exists. Otherwise we have
72190+  to go a level up again until we find a common parent or a valid sibling
72191+  link, then go down allocating/connecting/locking/reading nodes until the
72192+  neighbor of the first one is locked.
72193+
72194+  @neighbor: result lock handle,
72195+  @node: the node whose neighbor we lock,
72196+  @lock_mode: lock mode {LM_READ, LM_WRITE},
72197+  @flags: logical OR of a subset of the {GN_*} flags (see above).
72198+
72199+  @return: 0 on success; a negative value on error or lack of a neighbor.
72200+*/
72201+
72202+/* Audited by: umka (2002.06.14), umka (2002.06.15) */
72203+int
72204+reiser4_get_neighbor(lock_handle * neighbor, znode * node,
72205+ znode_lock_mode lock_mode, int flags)
72206+{
72207+ reiser4_tree *tree = znode_get_tree(node);
72208+ lock_handle path[REAL_MAX_ZTREE_HEIGHT];
72209+
72210+ coord_t coord;
72211+
72212+ tree_level base_level;
72213+ tree_level h = 0;
72214+ int ret;
72215+
72216+ assert("umka-252", tree != NULL);
72217+ assert("umka-253", neighbor != NULL);
72218+ assert("umka-254", node != NULL);
72219+
72220+ base_level = znode_get_level(node);
72221+
72222+ assert("umka-310", base_level <= tree->height);
72223+
72224+ coord_init_zero(&coord);
72225+
72226+ again:
72227+ /* first, we try to use simple lock_neighbor() which requires sibling
72228+ link existence */
72229+ read_lock_tree(tree);
72230+ ret = lock_side_neighbor(neighbor, node, lock_mode, flags, 1);
72231+ read_unlock_tree(tree);
72232+ if (!ret) {
72233+ /* load znode content if it was specified */
72234+ if (flags & GN_LOAD_NEIGHBOR) {
72235+ ret = zload(node);
72236+ if (ret)
72237+ longterm_unlock_znode(neighbor);
72238+ }
72239+ return ret;
72240+ }
72241+
72242+ /* only -ENOENT means we may look upward and try to connect
72243+ @node with its neighbor (if @flags allow us to do it) */
72244+ if (ret != -ENOENT || !(flags & GN_CAN_USE_UPPER_LEVELS))
72245+ return ret;
72246+
72247+	/* before establishing the sibling link we lock the parent node;
72248+	   renew_neighbor() requires this to work. */
72249+ init_lh(&path[0]);
72250+ ret = reiser4_get_parent(&path[0], node, ZNODE_READ_LOCK);
72251+ if (ret)
72252+ return ret;
72253+ if (znode_above_root(path[0].node)) {
72254+ longterm_unlock_znode(&path[0]);
72255+ return RETERR(-E_NO_NEIGHBOR);
72256+ }
72257+
72258+ while (1) {
72259+ znode *child = (h == 0) ? node : path[h - 1].node;
72260+ znode *parent = path[h].node;
72261+
72262+ ret = zload(parent);
72263+ if (ret)
72264+ break;
72265+
72266+ ret = find_child_ptr(parent, child, &coord);
72267+
72268+ if (ret) {
72269+ zrelse(parent);
72270+ break;
72271+ }
72272+
72273+ /* try to establish missing sibling link */
72274+ ret = renew_neighbor(&coord, child, h + base_level, flags);
72275+
72276+ zrelse(parent);
72277+
72278+ switch (ret) {
72279+ case 0:
72280+			/* unlocking the parent znode prevents a simple
72281+			   deadlock situation */
72282+			done_lh(&path[h]);
72283+
72284+			/* depending on the tree level we are at, we repeat
72285+			   the first locking attempt ... */
72286+			if (h == 0)
72287+				goto again;
72288+
72289+			/* ... or repeat establishing the sibling link one
72290+			   level below. */
72291+ --h;
72292+ break;
72293+
72294+ case -ENOENT:
72295+ /* sibling link is not available -- we go
72296+ upward. */
72297+ init_lh(&path[h + 1]);
72298+ ret =
72299+ reiser4_get_parent(&path[h + 1], parent,
72300+ ZNODE_READ_LOCK);
72301+ if (ret)
72302+ goto fail;
72303+ ++h;
72304+ if (znode_above_root(path[h].node)) {
72305+ ret = RETERR(-E_NO_NEIGHBOR);
72306+ goto fail;
72307+ }
72308+ break;
72309+
72310+ case -E_DEADLOCK:
72311+			/* there was a lock request from a hi-pri locker.
72312+			   If possible, we unlock the last parent node and
72313+			   lock it again. */
72314+ for (; check_deadlock(); h--) {
72315+ done_lh(&path[h]);
72316+ if (h == 0)
72317+ goto fail;
72318+ }
72319+
72320+ break;
72321+
72322+ default: /* other errors. */
72323+ goto fail;
72324+ }
72325+ }
72326+ fail:
72327+ ON_DEBUG(check_lock_node_data(node));
72328+ ON_DEBUG(check_lock_data());
72329+
72330+ /* unlock path */
72331+ do {
72332+ /* FIXME-Zam: when we get here from case -E_DEADLOCK's goto
72333+ fail; path[0] is already done_lh-ed, therefore
72334+ longterm_unlock_znode(&path[h]); is not applicable */
72335+ done_lh(&path[h]);
72336+ --h;
72337+ } while (h + 1 != 0);
72338+
72339+ return ret;
72340+}
72341+
72342+/* remove node from sibling list */
72343+/* Audited by: umka (2002.06.14) */
72344+void sibling_list_remove(znode * node)
72345+{
72346+ reiser4_tree *tree;
72347+
72348+ tree = znode_get_tree(node);
72349+ assert("umka-255", node != NULL);
72350+ assert_rw_write_locked(&(tree->tree_lock));
72351+ assert("nikita-3275", check_sibling_list(node));
72352+
72353+ write_lock_dk(tree);
72354+ if (znode_is_right_connected(node) && node->right != NULL &&
72355+ znode_is_left_connected(node) && node->left != NULL) {
72356+ assert("zam-32245",
72357+ keyeq(znode_get_rd_key(node),
72358+ znode_get_ld_key(node->right)));
72359+ znode_set_rd_key(node->left, znode_get_ld_key(node->right));
72360+ }
72361+ write_unlock_dk(tree);
72362+
72363+ if (znode_is_right_connected(node) && node->right != NULL) {
72364+ assert("zam-322", znode_is_left_connected(node->right));
72365+ node->right->left = node->left;
72366+ ON_DEBUG(node->right->left_version =
72367+ atomic_inc_return(&delim_key_version);
72368+ );
72369+ }
72370+ if (znode_is_left_connected(node) && node->left != NULL) {
72371+ assert("zam-323", znode_is_right_connected(node->left));
72372+ node->left->right = node->right;
72373+ ON_DEBUG(node->left->right_version =
72374+ atomic_inc_return(&delim_key_version);
72375+ );
72376+ }
72377+
72378+ ZF_CLR(node, JNODE_LEFT_CONNECTED);
72379+ ZF_CLR(node, JNODE_RIGHT_CONNECTED);
72380+ ON_DEBUG(node->left = node->right = NULL;
72381+ node->left_version = atomic_inc_return(&delim_key_version);
72382+ node->right_version = atomic_inc_return(&delim_key_version););
72383+ assert("nikita-3276", check_sibling_list(node));
72384+}
72385+
72386+/* disconnect node from sibling list */
72387+void sibling_list_drop(znode * node)
72388+{
72389+ znode *right;
72390+ znode *left;
72391+
72392+ assert("nikita-2464", node != NULL);
72393+ assert("nikita-3277", check_sibling_list(node));
72394+
72395+ right = node->right;
72396+ if (right != NULL) {
72397+ assert("nikita-2465", znode_is_left_connected(right));
72398+ right->left = NULL;
72399+ ON_DEBUG(right->left_version =
72400+ atomic_inc_return(&delim_key_version);
72401+ );
72402+ }
72403+ left = node->left;
72404+ if (left != NULL) {
72405+ assert("zam-323", znode_is_right_connected(left));
72406+ left->right = NULL;
72407+ ON_DEBUG(left->right_version =
72408+ atomic_inc_return(&delim_key_version);
72409+ );
72410+ }
72411+ ZF_CLR(node, JNODE_LEFT_CONNECTED);
72412+ ZF_CLR(node, JNODE_RIGHT_CONNECTED);
72413+ ON_DEBUG(node->left = node->right = NULL;
72414+ node->left_version = atomic_inc_return(&delim_key_version);
72415+ node->right_version = atomic_inc_return(&delim_key_version););
72416+}
72417+
72418+/* Insert a new node into the sibling list. Regular balancing inserts the new
72419+   node after (to the right of) an existing, locked node (@before), except in
72420+   the one case of adding a new tree root node; @before must be NULL then. */
72421+void sibling_list_insert_nolock(znode * new, znode * before)
72422+{
72423+ assert("zam-334", new != NULL);
72424+ assert("nikita-3298", !znode_is_left_connected(new));
72425+ assert("nikita-3299", !znode_is_right_connected(new));
72426+ assert("nikita-3300", new->left == NULL);
72427+ assert("nikita-3301", new->right == NULL);
72428+ assert("nikita-3278", check_sibling_list(new));
72429+ assert("nikita-3279", check_sibling_list(before));
72430+
72431+ if (before != NULL) {
72432+ assert("zam-333", znode_is_connected(before));
72433+ new->right = before->right;
72434+ new->left = before;
72435+ ON_DEBUG(new->right_version =
72436+ atomic_inc_return(&delim_key_version);
72437+ new->left_version =
72438+ atomic_inc_return(&delim_key_version););
72439+ if (before->right != NULL) {
72440+ before->right->left = new;
72441+ ON_DEBUG(before->right->left_version =
72442+ atomic_inc_return(&delim_key_version);
72443+ );
72444+ }
72445+ before->right = new;
72446+ ON_DEBUG(before->right_version =
72447+ atomic_inc_return(&delim_key_version);
72448+ );
72449+ } else {
72450+ new->right = NULL;
72451+ new->left = NULL;
72452+ ON_DEBUG(new->right_version =
72453+ atomic_inc_return(&delim_key_version);
72454+ new->left_version =
72455+ atomic_inc_return(&delim_key_version););
72456+ }
72457+ ZF_SET(new, JNODE_LEFT_CONNECTED);
72458+ ZF_SET(new, JNODE_RIGHT_CONNECTED);
72459+ assert("nikita-3280", check_sibling_list(new));
72460+ assert("nikita-3281", check_sibling_list(before));
72461+}
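/*
 * The _nolock suffix means the caller already holds the tree write lock,
 * exactly as add_tree_root() in tree_mod.c does around its call. A sketch
 * of the typical sequence for a freshly allocated node:
 */
static void splice_new_node_sketch(reiser4_tree * tree, znode * fresh,
				   znode * left_brother)
{
	write_lock_tree(tree);
	/* splice @fresh between @left_brother and its old right neighbor */
	sibling_list_insert_nolock(fresh, left_brother);
	write_unlock_tree(tree);
}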
72462+
72463+/*
72464+ Local variables:
72465+ c-indentation-style: "K&R"
72466+ mode-name: "LC"
72467+ c-basic-offset: 8
72468+ tab-width: 8
72469+ fill-column: 80
72470+ End:
72471+*/
72472Index: linux-2.6.16/fs/reiser4/tree_walk.h
72473===================================================================
72474--- /dev/null
72475+++ linux-2.6.16/fs/reiser4/tree_walk.h
72476@@ -0,0 +1,125 @@
72477+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
72478+
72479+/* definitions of reiser4 tree walk functions */
72480+
72481+#ifndef __FS_REISER4_TREE_WALK_H__
72482+#define __FS_REISER4_TREE_WALK_H__
72483+
72484+#include "debug.h"
72485+#include "forward.h"
72486+
72487+/* establishes horizontal links between cached znodes */
72488+int connect_znode(coord_t * coord, znode * node);
72489+
72490+/* tree traversal functions (reiser4_get_parent(), reiser4_get_neighbor())
72491+ have the following common arguments:
72492+
72493+ return codes:
72494+
72495+ @return : 0 - OK,
72496+
72497+ZAM-FIXME-HANS: wrong return code name. Change them all.
72498+   -ENOENT - neighbor is not in cache, which is detected by the
72499+             absence of the sibling link.
72500+
72501+   -E_NO_NEIGHBOR - we are sure that the neighbor (or parent) node cannot
72502+                    be found (because we are the left-/right-most node of
72503+                    the tree, for example). This return code is also used
72504+                    by reiser4_get_parent() when we see no parent link --
72505+                    it means that our node is the root node.
72506+
72507+   -E_DEADLOCK - deadlock detected (a request from a high-priority
72508+                 process was received); other error codes conform to
72509+                 /usr/include/asm/errno.h .
72510+*/
72511+
72512+int
72513+reiser4_get_parent_flags(lock_handle * result, znode * node,
72514+ znode_lock_mode mode, int flags);
72515+
72516+/* bits definition for reiser4_get_neighbor function `flags' arg. */
72517+typedef enum {
72518+	/* If the sibling pointer is NULL, this flag allows get_neighbor() to
72519+	 * try to find a not-allocated, not-connected neighbor by going
72520+	 * through upper levels */
72521+ GN_CAN_USE_UPPER_LEVELS = 0x1,
72522+ /* locking left neighbor instead of right one */
72523+ GN_GO_LEFT = 0x2,
72524+ /* automatically load neighbor node content */
72525+ GN_LOAD_NEIGHBOR = 0x4,
72526+	/* return -E_REPEAT if the lock cannot be taken */
72527+ GN_TRY_LOCK = 0x8,
72528+	/* used internally in tree_walk.c; causes renew_sibling_link() not to
72529+	   allocate the neighbor znode, but only search for it in the znode cache */
72530+ GN_NO_ALLOC = 0x10,
72531+ /* do not go across atom boundaries */
72532+ GN_SAME_ATOM = 0x20,
72533+	/* allow locking of not-connected nodes */
72534+ GN_ALLOW_NOT_CONNECTED = 0x40,
72535+ /* Avoid synchronous jload, instead, call jstartio() and return -E_REPEAT. */
72536+ GN_ASYNC = 0x80
72537+} znode_get_neigbor_flags;
72538+
72539+/* A commonly used wrapper for reiser4_get_parent_flags(). */
72540+static inline int reiser4_get_parent(lock_handle * result, znode * node,
72541+ znode_lock_mode mode)
72542+{
72543+ return reiser4_get_parent_flags(result, node, mode,
72544+ GN_ALLOW_NOT_CONNECTED);
72545+}
72546+
72547+int reiser4_get_neighbor(lock_handle * neighbor, znode * node,
72548+ znode_lock_mode lock_mode, int flags);
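/*
 * The GN_* bits above combine by bitwise OR. A sketch of a caller that
 * wants the left neighbor loaded into memory and is willing to climb to
 * upper levels when the sibling link is missing; lock handle setup is
 * assumed to follow the init_lh()/done_lh() convention:
 */
static inline int left_neighbor_sketch(lock_handle * lh, znode * node)
{
	return reiser4_get_neighbor(lh, node, ZNODE_READ_LOCK,
				    GN_GO_LEFT | GN_LOAD_NEIGHBOR |
				    GN_CAN_USE_UPPER_LEVELS);
}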
72549+
72550+/* these are wrappers for the most common usages of reiser4_get_neighbor() */
72551+static inline int
72552+reiser4_get_left_neighbor(lock_handle * result, znode * node, int lock_mode,
72553+ int flags)
72554+{
72555+ return reiser4_get_neighbor(result, node, lock_mode,
72556+ flags | GN_GO_LEFT);
72557+}
72558+
72559+static inline int
72560+reiser4_get_right_neighbor(lock_handle * result, znode * node, int lock_mode,
72561+ int flags)
72562+{
72563+ ON_DEBUG(check_lock_node_data(node));
72564+ ON_DEBUG(check_lock_data());
72565+ return reiser4_get_neighbor(result, node, lock_mode,
72566+ flags & (~GN_GO_LEFT));
72567+}
72568+
72569+extern void sibling_list_remove(znode * node);
72570+extern void sibling_list_drop(znode * node);
72571+extern void sibling_list_insert_nolock(znode * new, znode * before);
72572+extern void link_left_and_right(znode * left, znode * right);
72573+
72574+/* Functions called by tree_walk() when tree_walk() ... */
72575+struct tree_walk_actor {
72576+ /* ... meets a formatted node, */
72577+ int (*process_znode) (tap_t *, void *);
72578+ /* ... meets an extent, */
72579+ int (*process_extent) (tap_t *, void *);
72580+ /* ... begins tree traversal or repeats it after -E_REPEAT was returned by
72581+ * node or extent processing functions. */
72582+ int (*before) (void *);
72583+};
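/*
 * A sketch of a minimal actor that counts formatted nodes. The walk driver
 * that invokes the callbacks and the tap_t iterator are defined elsewhere
 * in the patch; the return convention (0 to continue, -E_REPEAT to
 * restart, per the comment above) is inferred.
 */
static int count_znode_sketch(tap_t * tap, void *opaque)
{
	++*(unsigned long *)opaque;	/* one more formatted node visited */
	return 0;
}

static struct tree_walk_actor count_actor_sketch = {
	.process_znode = count_znode_sketch,
	/* .process_extent and .before left NULL in this sketch */
};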
72584+
72585+#if REISER4_DEBUG
72586+int check_sibling_list(znode * node);
72587+#else
72588+#define check_sibling_list(n) (1)
72589+#endif
72590+
72591+#endif /* __FS_REISER4_TREE_WALK_H__ */
72592+
72593+/*
72594+ Local variables:
72595+ c-indentation-style: "K&R"
72596+ mode-name: "LC"
72597+ c-basic-offset: 8
72598+ tab-width: 8
72599+ fill-column: 120
72600+ End:
72601+*/
72602Index: linux-2.6.16/fs/reiser4/txnmgr.c
72603===================================================================
72604--- /dev/null
72605+++ linux-2.6.16/fs/reiser4/txnmgr.c
72606@@ -0,0 +1,3158 @@
72607+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
72608+ * reiser4/README */
72609+
72610+/* Joshua MacDonald wrote the first draft of this code. */
72611+
72612+/* ZAM-LONGTERM-FIXME-HANS: The locking in this file is badly designed, and a
72613+filesystem scales only as well as its worst locking design. You need to
72614+substantially restructure this code. Josh was not as experienced a programmer
72615+as you. Particularly review how the locking style differs from what you did
72616+for znodes usingt hi-lo priority locking, and present to me an opinion on
72617+whether the differences are well founded. */
72618+
72619+/* I cannot help but disagree with the sentiment above. Locking of
72620+ * transaction manager is _not_ badly designed, and, at the very least, is not
72621+ * the scaling bottleneck. Scaling bottleneck is _exactly_ hi-lo priority
72622+ * locking on znodes, especially on the root node of the tree. --nikita,
72623+ * 2003.10.13 */
72624+
72625+/* The txnmgr is a set of interfaces that keep track of atoms and transcrash handles. The
72626+ txnmgr processes capture_block requests and manages the relationship between jnodes and
72627+ atoms through the various stages of a transcrash, and it also oversees the fusion and
72628+ capture-on-copy processes. The main difficulty with this task is maintaining a
72629+ deadlock-free lock ordering between atoms and jnodes/handles. The reason for the
72630+ difficulty is that jnodes, handles, and atoms contain pointer circles, and the cycle
72631+ must be broken. The main requirement is that atom-fusion be deadlock free, so once you
72632+ hold the atom_lock you may then wait to acquire any jnode or handle lock. This implies
72633+ that any time you check the atom-pointer of a jnode or handle and then try to lock that
72634+ atom, you must use trylock() and possibly reverse the order.
72635+
72636+ This code implements the design documented at:
72637+
72638+ http://namesys.com/txn-doc.html
72639+
72640+ZAM-FIXME-HANS: update v4.html to contain all of the information present in the above (but updated), and then remove the
72641+above document and reference the new one. Be sure to provide some credit to Josh. I already have some writings on this
72642+topic in v4.html, but they are lacking in details present in the above. Cure that. Remember to write for the bright
72643+12-year-old --- define all technical terms used.
72644+
72645+*/
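+
+/* What "use trylock() and possibly reverse the order" means in practice: a
+ thread that holds an inner lock and needs an outer lock cannot simply block
+ on the outer one, because another thread may already hold the outer lock
+ while waiting for the inner one. Below is a minimal, self-contained
+ user-space sketch of the pattern (illustrative only; object_t, owner_t and
+ lock_object_and_owner() are hypothetical names, not reiser4 code). It
+ assumes owner objects are never freed; the kernel code additionally takes a
+ reference on the atom before dropping the jnode lock to keep it alive. */
+#if 0
+#include <pthread.h>
+
+typedef struct owner { pthread_mutex_t lock; } owner_t;
+typedef struct object {
+        pthread_mutex_t lock;
+        owner_t *owner;         /* may change while object->lock is dropped */
+} object_t;
+
+/* Lock @obj and its current owner, although owner->lock ranks first. */
+static owner_t *lock_object_and_owner(object_t *obj)
+{
+        owner_t *owner;
+
+        pthread_mutex_lock(&obj->lock);
+        while ((owner = obj->owner) != NULL) {
+                if (pthread_mutex_trylock(&owner->lock) == 0)
+                        break;  /* got both locks without blocking */
+                /* Reverse the order: drop the inner lock, block on the
+                   outer one, relock the inner one, then re-validate the
+                   pointer, which may have changed in the meantime. */
+                pthread_mutex_unlock(&obj->lock);
+                pthread_mutex_lock(&owner->lock);
+                pthread_mutex_lock(&obj->lock);
+                if (obj->owner == owner)
+                        break;
+                /* Owner changed while we slept: drop the stale outer lock
+                   and retry with the inner lock still held. */
+                pthread_mutex_unlock(&owner->lock);
+        }
+        return owner;   /* NULL if the object has no owner */
+}
+#endif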
72646+
72647+/* Thoughts on the external transaction interface:
72648+
72649+ In the current code, a TRANSCRASH handle is created implicitly by init_context() (which
72650+ creates state that lasts for the duration of a system call and is called at the start
72651+ of ReiserFS methods implementing VFS operations), and closed by reiser4_exit_context(),
72652+ occupying the scope of a single system call. We wish to give certain applications an
72653+ interface to begin and close (commit) transactions. Since our implementation of
72654+ transactions does not yet support isolation, allowing an application to open a
72655+ transaction implies trusting it to later close the transaction. Part of the
72656+ transaction interface will be aimed at enabling that trust, but the interface for
72657+ actually using transactions is fairly narrow.
72658+
72659+ BEGIN_TRANSCRASH: Returns a transcrash identifier. It should be possible to translate
72660+ this identifier into a string that a shell-script could use, allowing you to start a
72661+ transaction by issuing a command. Once open, the transcrash should be set in the task
72662+ structure, and there should be options (I suppose) to allow it to be carried across
72663+ fork/exec. A transcrash has several options:
72664+
72665+ - READ_FUSING or WRITE_FUSING: The default policy is for txn-capture to capture only
72666+ on writes (WRITE_FUSING) and allow "dirty reads". If the application wishes to
72667+ capture on reads as well, it should set READ_FUSING.
72668+
72669+ - TIMEOUT: Since a non-isolated transcrash cannot be undone, every transcrash must
72670+ eventually close (or else the machine must crash). If the application dies an
72671+ unexpected death with an open transcrash, for example, or if it hangs for a long
72672+ duration, one solution (to avoid crashing the machine) is to simply close it anyway.
72673+ This is a dangerous option, but it is one way to solve the problem until isolated
72674+ transcrashes are available for untrusted applications.
72675+
72676+ It seems to be what databases do, though it is unclear how one avoids creating a
72677+ denial-of-service vulnerability based on resource starvation. Guaranteeing that some
72678+ minimum amount of computational resources are made available would seem more correct
72679+ than guaranteeing some amount of time. When we again have someone to code the work,
72680+ this issue should be considered carefully. -Hans
72681+
72682+ RESERVE_BLOCKS: A running transcrash should indicate to the transaction manager how
72683+ many dirty blocks it expects. The reserve_blocks interface should be called at a point
72684+ where it is safe for the application to fail, because the system may not be able to
72685+ grant the allocation and the application must be able to back-out. For this reason,
72686+ the number of reserve-blocks can also be passed as an argument to BEGIN_TRANSCRASH, but
72687+ the application may also wish to extend the allocation after beginning its transcrash.
72688+
72689+ CLOSE_TRANSCRASH: The application closes the transcrash when it is finished making
72690+ modifications that require transaction protection. When isolated transactions are
72691+ supported the CLOSE operation is replaced by either COMMIT or ABORT. For example, if a
72692+ RESERVE_BLOCKS call fails for the application, it should "abort" by calling
72693+ CLOSE_TRANSCRASH, even though it really commits any changes that were made (which is
72694+ why, for safety, the application should call RESERVE_BLOCKS before making any changes).
72695+
72696+ For actually implementing these out-of-system-call-scoped transcrashes, the
72697+ reiser4_context has a "txn_handle *trans" pointer that may be set to an open
72698+ transcrash. Currently there are no dynamically-allocated transcrashes, but there is a
72699+ "kmem_cache_t *_txnh_slab" created for that purpose in this file.
72700+*/
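+
+/* For concreteness, the user-visible interface discussed above might look
+ roughly like the sketch below. None of this exists in the patch; every name
+ (reiser4_begin_transcrash() and friends) is hypothetical and only
+ illustrates the BEGIN/RESERVE/CLOSE life cycle and the option flags. */
+#if 0
+typedef unsigned long transcrash_id_t;  /* printable, shell-usable id */
+
+enum transcrash_flags {
+        TRANSCRASH_WRITE_FUSING = 0,            /* default: capture on writes */
+        TRANSCRASH_READ_FUSING = 1 << 0,        /* also capture on reads */
+        TRANSCRASH_KEEP_ACROSS_EXEC = 1 << 1    /* survive fork()/exec() */
+};
+
+/* Open a transcrash; @timeout_secs bounds how long it may stay open, and
+   @reserved_blocks may be zero and extended later. */
+transcrash_id_t reiser4_begin_transcrash(int flags, unsigned timeout_secs,
+                                         unsigned long reserved_blocks);
+
+/* Extend the reservation; called where the application can still back out,
+   because the request may be denied. Returns 0 or a negative error. */
+int reiser4_reserve_blocks(transcrash_id_t id, unsigned long nr_blocks);
+
+/* Close (commit) the transcrash; with isolation this would split into
+   commit and abort variants. */
+int reiser4_close_transcrash(transcrash_id_t id);
+#endif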
72701+
72702+/* Extending the other system call interfaces for future transaction features:
72703+
72704+ Specialized applications may benefit from passing flags to the ordinary system call
72705+ interface such as read(), write(), or stat(). For example, the application specifies
72706+ WRITE_FUSING by default but wishes to add that a certain read() command should be
72707+ treated as READ_FUSING. But which read? Is it the directory-entry read, the stat-data
72708+ read, or the file-data read? These issues are straightforward, but there are a lot of
72709+ them and adding the necessary flags-passing code will be tedious.
72710+
72711+ When supporting isolated transactions, there is a corresponding READ_MODIFY_WRITE (RMW)
72712+ flag, which specifies that although it is a read operation being requested, a
72713+ write-lock should be taken. The reason is that read-locks are shared while write-locks
72714+ are exclusive, so taking a read-lock when a later-write is known in advance will often
72715+ lead to deadlock. If a reader knows it will write later, it should issue read
72716+ requests with the RMW flag set.
72717+*/
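+
+/* To illustrate the RMW point above (purely hypothetical interface; no such
+ flag exists in this patch): a reader that knows it will write back should
+ ask for the exclusive lock up front instead of upgrading later. */
+#if 0
+typedef struct txn txn_t;
+typedef struct record record_t;
+
+enum { TXN_RMW = 1 << 0 };      /* hypothetical read-modify-write hint */
+
+/* Without TXN_RMW, two such readers each take a shared lock and then
+   deadlock trying to upgrade to exclusive; with TXN_RMW the second caller
+   simply blocks until the first one finishes. */
+record_t *txn_read_record(txn_t *txn, unsigned long key, int flags);
+#endif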
72718+
72719+/*
72720+ The znode/atom deadlock avoidance.
72721+
72722+ FIXME(Zam): writing of this comment is in progress.
72723+
72724+ The atom's special stage ASTAGE_CAPTURE_WAIT introduces a kind of long-term
72725+ atom locking, which makes the reiser4 locking scheme more complex. It had
72726+ deadlocks until we implemented deadlock avoidance algorithms. Those deadlocks
72727+ looked like the following: one stopped thread waits for a long-term lock on a
72728+ znode, while the thread that owns that lock waits until fusion with another
72729+ atom is allowed.
72730+
72731+ The source of the deadlocks is an optimization of not capturing index nodes
72732+ for read. Let's prove it. Suppose we have a dumb node-capturing scheme which
72733+ unconditionally captures each block before locking it.
72734+
72735+ That scheme has no deadlocks. Let's begin with a thread whose atom's stage is
72736+ ASTAGE_CAPTURE_WAIT and which waits for a znode lock. The thread can't wait
72737+ for a capture because its stage allows fusion with any atom except those that
72738+ are currently being committed. The process of atom commit can't deadlock
72739+ because the atom commit procedure does not acquire locks and does not fuse
72740+ with other atoms. Reiser4 does capturing right before going to sleep inside
72741+ the longterm_lock_znode() function, which means the znode we want to lock is
72742+ already captured and its atom is in the ASTAGE_CAPTURE_WAIT stage. If we
72743+ continue the analysis we see that no process in the sequence may wait for
72744+ atom fusion. Thereby there are no deadlocks of the described kind.
72745+
72746+ The capturing optimization makes the deadlocks possible. A thread can wait for
72747+ a lock whose owner did not capture that node. The lock owner's current atom
72748+ is not fused with the first atom and does not reach the ASTAGE_CAPTURE_WAIT
72749+ state. A deadlock is possible when that atom meets another one which is
72750+ already in ASTAGE_CAPTURE_WAIT.
72751+
72752+ The deadlock avoidance scheme includes two algorithms:
72753+
72754+ The first algorithm is used when a thread captures a node which is locked but
72755+ not captured by another thread. Those nodes are marked MISSED_IN_CAPTURE at
72756+ the moment we skip their capturing. If such a node (marked MISSED_IN_CAPTURE)
72757+ is being captured by a thread whose current atom is in ASTAGE_CAPTURE_WAIT,
72758+ the routine which forces all lock owners to join the current atom is executed.
72759+
72760+ The second algorithm does not allow skipping the capture of already captured nodes.
72761+
72762+ Together, both algorithms prevent waiting for a long-term lock without fusing
72763+ with the atoms of all lock owners, which is the key to avoiding atom/znode
72764+ locking deadlocks.
72765+*/
72766+
72767+/*
72768+ * Transactions and mmap(2).
72769+ *
72770+ * 1. Transactions are not supported for accesses through mmap(2), because
72771+ * this would effectively amount to user-level transactions whose duration
72772+ * is beyond control of the kernel.
72773+ *
72774+ * 2. That said, we still want to preserve some decency with regard to
72775+ * mmap(2). During normal write(2) call, following sequence of events
72776+ * happens:
72777+ *
72778+ * 1. page is created;
72779+ *
72780+ * 2. jnode is created, dirtied and captured into current atom.
72781+ *
72782+ * 3. extent is inserted and modified.
72783+ *
72784+ * Steps (2) and (3) take place under long term lock on the twig node.
72785+ *
72786+ * When file is accessed through mmap(2) page is always created during
72787+ * page fault. After this (in reiser4_readpage()->readpage_extent()):
72788+ *
72789+ * 1. if access is made to a non-hole page, a new jnode is created (if
72790+ * necessary)
72791+ *
72792+ * 2. if access is made to a hole page, a jnode is not created (XXX
72793+ * not clear why).
72794+ *
72795+ * Also, even if page is created by write page fault it is not marked
72796+ * dirty immediately by handle_mm_fault(). Probably this is to avoid races
72797+ * with page write-out.
72798+ *
72799+ * Dirty bit installed by hardware is only transferred to the struct page
72800+ * later, when page is unmapped (in zap_pte_range(), or
72801+ * try_to_unmap_one()).
72802+ *
72803+ * So, with mmap(2) we have to handle the following irksome situations:
72804+ *
72805+ * 1. there exists modified page (clean or dirty) without jnode
72806+ *
72807+ * 2. there exists modified page (clean or dirty) with clean jnode
72808+ *
72809+ * 3. clean page which is a part of atom can be transparently modified
72810+ * at any moment through mapping without becoming dirty.
72811+ *
72812+ * (1) and (2) can lead to the out-of-memory situation: ->writepage()
72813+ * doesn't know what to do with such pages and ->sync_sb()/->writepages()
72814+ * don't see them, because these methods operate on atoms.
72815+ *
72816+ * (3) can lead to the loss of data: suppose we have a dirty page whose
72817+ * dirty jnode is captured by some atom. As part of early flush (for
72818+ * example) the page was written out. The dirty bit was cleared on both
72819+ * page and jnode. After this the page is modified through the mapping,
72820+ * but the kernel doesn't notice and just discards page and jnode as
72821+ * part of commit. (XXX actually it doesn't, because to reclaim the page
72822+ * ->releasepage() has to be called, and before this the dirty bit will
72823+ * be transferred to the struct page).
72824+ *
72825+ */
72826+
72827+#include "debug.h"
72828+#include "txnmgr.h"
72829+#include "jnode.h"
72830+#include "znode.h"
72831+#include "block_alloc.h"
72832+#include "tree.h"
72833+#include "wander.h"
72834+#include "ktxnmgrd.h"
72835+#include "super.h"
72836+#include "page_cache.h"
72837+#include "reiser4.h"
72838+#include "vfs_ops.h"
72839+#include "inode.h"
72840+#include "flush.h"
72841+
72842+#include <asm/atomic.h>
72843+#include <linux/types.h>
72844+#include <linux/fs.h>
72845+#include <linux/mm.h>
72846+#include <linux/slab.h>
72847+#include <linux/pagemap.h>
72848+#include <linux/writeback.h>
72849+#include <linux/swap.h> /* for totalram_pages */
72850+
72851+static void atom_free(txn_atom * atom);
72852+
72853+static int commit_txnh(txn_handle * txnh);
72854+
72855+static void wakeup_atom_waitfor_list(txn_atom * atom);
72856+static void wakeup_atom_waiting_list(txn_atom * atom);
72857+
72858+static void capture_assign_txnh_nolock(txn_atom * atom, txn_handle * txnh);
72859+
72860+static void capture_assign_block_nolock(txn_atom * atom, jnode * node);
72861+
72862+static void fuse_not_fused_lock_owners(txn_handle * txnh, znode * node);
72863+
72864+static int capture_init_fusion(jnode * node, txn_handle * txnh,
72865+ txn_capture mode);
72866+
72867+static int capture_fuse_wait(txn_handle *, txn_atom *, txn_atom *, txn_capture);
72868+
72869+static void capture_fuse_into(txn_atom * small, txn_atom * large);
72870+
72871+void invalidate_list(struct list_head *);
72872+
72873+/* GENERIC STRUCTURES */
72874+
72875+typedef struct _txn_wait_links txn_wait_links;
72876+
72877+struct _txn_wait_links {
72878+ lock_stack *_lock_stack;
72879+ struct list_head _fwaitfor_link;
72880+ struct list_head _fwaiting_link;
72881+ int (*waitfor_cb) (txn_atom * atom, struct _txn_wait_links * wlinks);
72882+ int (*waiting_cb) (txn_atom * atom, struct _txn_wait_links * wlinks);
72883+};
72884+
72885+/* FIXME: In theory, we should be using the slab cache init & destructor
72886+ methods instead of, e.g., jnode_init, etc. */
72887+static kmem_cache_t *_atom_slab = NULL;
72888+/* this is for user-visible, cross system-call transactions. */
72889+static kmem_cache_t *_txnh_slab = NULL;
72890+
72891+/**
72892+ * init_txnmgr_static - create transaction manager slab caches
72893+ *
72894+ * Initializes caches of txn-atoms and txn_handle. It is part of reiser4 module
72895+ * initialization.
72896+ */
72897+int init_txnmgr_static(void)
72898+{
72899+ assert("jmacd-600", _atom_slab == NULL);
72900+ assert("jmacd-601", _txnh_slab == NULL);
72901+
72902+ ON_DEBUG(atomic_set(&flush_cnt, 0));
72903+
72904+ _atom_slab = kmem_cache_create("txn_atom", sizeof(txn_atom), 0,
72905+ SLAB_HWCACHE_ALIGN |
72906+ SLAB_RECLAIM_ACCOUNT, NULL, NULL);
72907+ if (_atom_slab == NULL)
72908+ return RETERR(-ENOMEM);
72909+
72910+ _txnh_slab = kmem_cache_create("txn_handle", sizeof(txn_handle), 0,
72911+ SLAB_HWCACHE_ALIGN, NULL, NULL);
72912+ if (_txnh_slab == NULL) {
72913+ kmem_cache_destroy(_atom_slab);
72914+ _atom_slab = NULL;
72915+ return RETERR(-ENOMEM);
72916+ }
72917+
72918+ return 0;
72919+}
72920+
72921+/**
72922+ * done_txnmgr_static - delete txn_atom and txn_handle caches
72923+ *
72924+ * This is called on reiser4 module unloading or system shutdown.
72925+ */
72926+void done_txnmgr_static(void)
72927+{
72928+ destroy_reiser4_cache(&_atom_slab);
72929+ destroy_reiser4_cache(&_txnh_slab);
72930+}
72931+
72932+/**
72933+ * init_txnmgr - initialize a new transaction manager
72934+ * @mgr: pointer to transaction manager embedded in reiser4 super block
72935+ *
72936+ * This is called on mount. Makes necessary initializations.
72937+ */
72938+void init_txnmgr(txn_mgr *mgr)
72939+{
72940+ assert("umka-169", mgr != NULL);
72941+
72942+ mgr->atom_count = 0;
72943+ mgr->id_count = 1;
72944+ INIT_LIST_HEAD(&mgr->atoms_list);
72945+ spin_lock_init(&mgr->tmgr_lock);
72946+ sema_init(&mgr->commit_semaphore, 1);
72947+}
72948+
72949+/**
72950+ * done_txnmgr - stop transaction manager
72951+ * @mgr: pointer to transaction manager embedded in reiser4 super block
72952+ *
72953+ * This is called on umount. Does sanity checks.
72954+ */
72955+void done_txnmgr(txn_mgr *mgr)
72956+{
72957+ assert("umka-170", mgr != NULL);
72958+ assert("umka-1701", list_empty_careful(&mgr->atoms_list));
72959+ assert("umka-1702", mgr->atom_count == 0);
72960+}
72961+
72962+/* Initialize a transaction handle. */
72963+/* Audited by: umka (2002.06.13) */
72964+static void txnh_init(txn_handle * txnh, txn_mode mode)
72965+{
72966+ assert("umka-171", txnh != NULL);
72967+
72968+ txnh->mode = mode;
72969+ txnh->atom = NULL;
72970+ set_gfp_mask();
72971+ txnh->flags = 0;
72972+ spin_lock_init(&txnh->hlock);
72973+ INIT_LIST_HEAD(&txnh->txnh_link);
72974+}
72975+
72976+#if REISER4_DEBUG
72977+/* Check if a transaction handle is clean. */
72978+static int txnh_isclean(txn_handle * txnh)
72979+{
72980+ assert("umka-172", txnh != NULL);
72981+ return txnh->atom == NULL &&
72982+ LOCK_CNT_NIL(spin_locked_txnh);
72983+}
72984+#endif
72985+
72986+/* Initialize an atom. */
72987+static void atom_init(txn_atom * atom)
72988+{
72989+ int level;
72990+
72991+ assert("umka-173", atom != NULL);
72992+
72993+ memset(atom, 0, sizeof(txn_atom));
72994+
72995+ atom->stage = ASTAGE_FREE;
72996+ atom->start_time = jiffies;
72997+
72998+ for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1)
72999+ INIT_LIST_HEAD(ATOM_DIRTY_LIST(atom, level));
73000+
73001+ INIT_LIST_HEAD(ATOM_CLEAN_LIST(atom));
73002+ INIT_LIST_HEAD(ATOM_OVRWR_LIST(atom));
73003+ INIT_LIST_HEAD(ATOM_WB_LIST(atom));
73004+ INIT_LIST_HEAD(&atom->inodes);
73005+ spin_lock_init(&atom->alock);
73006+ /* list of transaction handles */
73007+ INIT_LIST_HEAD(&atom->txnh_list);
73008+ /* link to transaction manager's list of atoms */
73009+ INIT_LIST_HEAD(&atom->atom_link);
73010+ INIT_LIST_HEAD(&atom->fwaitfor_list);
73011+ INIT_LIST_HEAD(&atom->fwaiting_list);
73012+ blocknr_set_init(&atom->delete_set);
73013+ blocknr_set_init(&atom->wandered_map);
73014+
73015+ init_atom_fq_parts(atom);
73016+}
73017+
73018+#if REISER4_DEBUG
73019+/* Check if an atom is clean. */
73020+static int atom_isclean(txn_atom * atom)
73021+{
73022+ int level;
73023+
73024+ assert("umka-174", atom != NULL);
73025+
73026+ for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1) {
73027+ if (!list_empty_careful(ATOM_DIRTY_LIST(atom, level))) {
73028+ return 0;
73029+ }
73030+ }
73031+
73032+ return atom->stage == ASTAGE_FREE &&
73033+ atom->txnh_count == 0 &&
73034+ atom->capture_count == 0 &&
73035+ atomic_read(&atom->refcount) == 0 &&
73036+ (&atom->atom_link == atom->atom_link.next &&
73037+ &atom->atom_link == atom->atom_link.prev) &&
73038+ list_empty_careful(&atom->txnh_list) &&
73039+ list_empty_careful(ATOM_CLEAN_LIST(atom)) &&
73040+ list_empty_careful(ATOM_OVRWR_LIST(atom)) &&
73041+ list_empty_careful(ATOM_WB_LIST(atom)) &&
73042+ list_empty_careful(&atom->fwaitfor_list) &&
73043+ list_empty_careful(&atom->fwaiting_list) &&
73044+ atom_fq_parts_are_clean(atom);
73045+}
73046+#endif
73047+
73048+/* Begin a transaction in this context. Currently this uses the reiser4_context's
73049+ trans_in_ctx, which means that transaction handles are stack-allocated. Eventually
73050+ this will be extended to allow transaction handles to span several contexts. */
73051+/* Audited by: umka (2002.06.13) */
73052+void txn_begin(reiser4_context * context)
73053+{
73054+ assert("jmacd-544", context->trans == NULL);
73055+
73056+ context->trans = &context->trans_in_ctx;
73057+
73058+ /* FIXME_LATER_JMACD Currently there's no way to begin a TXN_READ_FUSING
73059+ transcrash. Default should be TXN_WRITE_FUSING. Also, the _trans variable is
73060+ stack allocated right now, but we would like to allow for dynamically allocated
73061+ transcrashes that span multiple system calls.
73062+ */
73063+ txnh_init(context->trans, TXN_WRITE_FUSING);
73064+}
73065+
73066+/* Finish a transaction handle context. */
73067+int txn_end(reiser4_context * context)
73068+{
73069+ long ret = 0;
73070+ txn_handle *txnh;
73071+
73072+ assert("umka-283", context != NULL);
73073+ assert("nikita-3012", schedulable());
73074+ assert("vs-24", context == get_current_context());
73075+ assert("nikita-2967", lock_stack_isclean(get_current_lock_stack()));
73076+
73077+ txnh = context->trans;
73078+ if (txnh != NULL) {
73079+ if (txnh->atom != NULL)
73080+ ret = commit_txnh(txnh);
73081+ assert("jmacd-633", txnh_isclean(txnh));
73082+ context->trans = NULL;
73083+ }
73084+ return ret;
73085+}
73086+
73087+void txn_restart(reiser4_context * context)
73088+{
73089+ txn_end(context);
73090+ preempt_point();
73091+ txn_begin(context);
73092+}
73093+
73094+void txn_restart_current(void)
73095+{
73096+ txn_restart(get_current_context());
73097+}
73098+
73099+/* TXN_ATOM */
73100+
73101+/* Get the atom belonging to a txnh, which is not locked. Return txnh locked. Locks atom, if atom
73102+ is not NULL. This performs the necessary spin_trylock to break the lock-ordering cycle. May
73103+ return NULL. */
73104+static txn_atom *txnh_get_atom(txn_handle * txnh)
73105+{
73106+ txn_atom *atom;
73107+
73108+ assert("umka-180", txnh != NULL);
73109+ assert_spin_not_locked(&(txnh->hlock));
73110+
73111+ while (1) {
73112+ spin_lock_txnh(txnh);
73113+ atom = txnh->atom;
73114+
73115+ if (atom == NULL)
73116+ break;
73117+
73118+ if (spin_trylock_atom(atom))
73119+ break;
73120+
73121+ atomic_inc(&atom->refcount);
73122+
73123+ spin_unlock_txnh(txnh);
73124+ spin_lock_atom(atom);
73125+ spin_lock_txnh(txnh);
73126+
73127+ if (txnh->atom == atom) {
73128+ atomic_dec(&atom->refcount);
73129+ break;
73130+ }
73131+
73132+ spin_unlock_txnh(txnh);
73133+ atom_dec_and_unlock(atom);
73134+ }
73135+
73136+ return atom;
73137+}
73138+
73139+/* Get the current atom and spinlock it if the current atom is present. May return NULL. */
73140+txn_atom *get_current_atom_locked_nocheck(void)
73141+{
73142+ reiser4_context *cx;
73143+ txn_atom *atom;
73144+ txn_handle *txnh;
73145+
73146+ cx = get_current_context();
73147+ assert("zam-437", cx != NULL);
73148+
73149+ txnh = cx->trans;
73150+ assert("zam-435", txnh != NULL);
73151+
73152+ atom = txnh_get_atom(txnh);
73153+
73154+ spin_unlock_txnh(txnh);
73155+ return atom;
73156+}
73157+
73158+/* Get the atom belonging to a jnode, which is initially locked. Return with
73159+ both jnode and atom locked. This performs the necessary spin_trylock to
73160+ break the lock-ordering cycle. Assumes the jnode is already locked, and
73161+ returns NULL if atom is not set. */
73162+txn_atom *jnode_get_atom(jnode * node)
73163+{
73164+ txn_atom *atom;
73165+
73166+ assert("umka-181", node != NULL);
73167+
73168+ while (1) {
73169+ assert_spin_locked(&(node->guard));
73170+
73171+ atom = node->atom;
73172+ /* node is not in any atom */
73173+ if (atom == NULL)
73174+ break;
73175+
73176+ /* If atom is not locked, grab the lock and return */
73177+ if (spin_trylock_atom(atom))
73178+ break;
73179+
73180+ /* At least one jnode belongs to this atom; this guarantees that
73181+ * atom->refcount > 0, so we can safely increment the refcount. */
73182+ atomic_inc(&atom->refcount);
73183+ spin_unlock_jnode(node);
73184+
73185+ /* re-acquire spin locks in the right order */
73186+ spin_lock_atom(atom);
73187+ spin_lock_jnode(node);
73188+
73189+ /* check if node still points to the same atom. */
73190+ if (node->atom == atom) {
73191+ atomic_dec(&atom->refcount);
73192+ break;
73193+ }
73194+
73195+ /* releasing of atom lock and reference requires not holding
73196+ * locks on jnodes. */
73197+ spin_unlock_jnode(node);
73198+
73199+ /* We are not sure that this atom has extra references besides
73200+ * ours, so we should call the proper function, which may free the
73201+ * atom if the last reference is released. */
73202+ atom_dec_and_unlock(atom);
73203+
73204+ /* lock jnode again for getting valid node->atom pointer
73205+ * value. */
73206+ spin_lock_jnode(node);
73207+ }
73208+
73209+ return atom;
73210+}
73211+
73212+/* Returns true if @node is dirty and part of the same atom as one of its neighbors. Used
73213+ by flush code to indicate whether the next node (in some direction) is suitable for
73214+ flushing. */
73215+int
73216+same_slum_check(jnode * node, jnode * check, int alloc_check, int alloc_value)
73217+{
73218+ int compat;
73219+ txn_atom *atom;
73220+
73221+ assert("umka-182", node != NULL);
73222+ assert("umka-183", check != NULL);
73223+
73224+ /* Not sure what this function is supposed to do if supplied with @check that is
73225+ neither formatted nor unformatted (bitmap or so). */
73226+ assert("nikita-2373", jnode_is_znode(check)
73227+ || jnode_is_unformatted(check));
73228+
73229+ /* Need a lock on CHECK to get its atom and to check various state bits.
73230+ Don't need a lock on NODE once we get the atom lock. */
73231+ /* It is not enough to lock two nodes and check (node->atom ==
73232+ check->atom) because atom could be locked and being fused at that
73233+ moment, jnodes of the atom of that state (being fused) can point to
73234+ different objects, but the atom is the same. */
73235+ spin_lock_jnode(check);
73236+
73237+ atom = jnode_get_atom(check);
73238+
73239+ if (atom == NULL) {
73240+ compat = 0;
73241+ } else {
73242+ compat = (node->atom == atom && JF_ISSET(check, JNODE_DIRTY));
73243+
73244+ if (compat && jnode_is_znode(check)) {
73245+ compat &= znode_is_connected(JZNODE(check));
73246+ }
73247+
73248+ if (compat && alloc_check) {
73249+ compat &= (alloc_value == jnode_is_flushprepped(check));
73250+ }
73251+
73252+ spin_unlock_atom(atom);
73253+ }
73254+
73255+ spin_unlock_jnode(check);
73256+
73257+ return compat;
73258+}
73259+
73260+/* Decrement the atom's reference count and if it falls to zero, free it. */
73261+void atom_dec_and_unlock(txn_atom * atom)
73262+{
73263+ txn_mgr *mgr = &get_super_private(reiser4_get_current_sb())->tmgr;
73264+
73265+ assert("umka-186", atom != NULL);
73266+ assert_spin_locked(&(atom->alock));
73267+ assert("zam-1039", atomic_read(&atom->refcount) > 0);
73268+
73269+ if (atomic_dec_and_test(&atom->refcount)) {
73270+ /* take txnmgr lock and atom lock in proper order. */
73271+ if (!spin_trylock_txnmgr(mgr)) {
73272+ /* This atom should exist after we re-acquire its
73273+ * spinlock, so we increment its reference counter. */
73274+ atomic_inc(&atom->refcount);
73275+ spin_unlock_atom(atom);
73276+ spin_lock_txnmgr(mgr);
73277+ spin_lock_atom(atom);
73278+
73279+ if (!atomic_dec_and_test(&atom->refcount)) {
73280+ spin_unlock_atom(atom);
73281+ spin_unlock_txnmgr(mgr);
73282+ return;
73283+ }
73284+ }
73285+ assert_spin_locked(&(mgr->tmgr_lock));
73286+ atom_free(atom);
73287+ spin_unlock_txnmgr(mgr);
73288+ } else
73289+ spin_unlock_atom(atom);
73290+}
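+
+/* The resurrection trick above, in isolation: when the count hits zero but
+ freeing requires a lock that ranks before the one we hold, temporarily
+ re-take a reference, reacquire both locks in the legal order, and free only
+ if we are still the last holder. A self-contained user-space sketch
+ (illustrative only; obj_t, registry_lock and obj_dec_and_unlock() are
+ hypothetical names), assuming all lookups go through registry_lock: */
+#if 0
+#include <pthread.h>
+#include <stdatomic.h>
+#include <stdlib.h>
+
+static pthread_mutex_t registry_lock = PTHREAD_MUTEX_INITIALIZER;
+
+typedef struct obj {
+        pthread_mutex_t lock;
+        atomic_int refcount;
+} obj_t;
+
+/* Called with obj->lock held; consumes one reference and unlocks. */
+static void obj_dec_and_unlock(obj_t *obj)
+{
+        /* atomic_fetch_sub() returns the old value: 1 means we just
+           dropped the last reference. */
+        if (atomic_fetch_sub(&obj->refcount, 1) == 1) {
+                if (pthread_mutex_trylock(&registry_lock) != 0) {
+                        /* Resurrect the object, then relock in legal order. */
+                        atomic_fetch_add(&obj->refcount, 1);
+                        pthread_mutex_unlock(&obj->lock);
+                        pthread_mutex_lock(&registry_lock);
+                        pthread_mutex_lock(&obj->lock);
+                        if (atomic_fetch_sub(&obj->refcount, 1) != 1) {
+                                /* Someone took a reference meanwhile. */
+                                pthread_mutex_unlock(&obj->lock);
+                                pthread_mutex_unlock(&registry_lock);
+                                return;
+                        }
+                }
+                /* Holding registry_lock: no new lookup can find us. */
+                pthread_mutex_unlock(&obj->lock);
+                pthread_mutex_destroy(&obj->lock);
+                free(obj);
+                pthread_mutex_unlock(&registry_lock);
+        } else
+                pthread_mutex_unlock(&obj->lock);
+}
+#endif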
73291+
73292+/* Create new atom and connect it to given transaction handle. This adds the
73293+ atom to the transaction manager's list and sets its reference count to 1, an
73294+ artificial reference which is kept until it commits. We play strange games
73295+ to avoid allocation under jnode & txnh spinlocks.*/
73296+
73297+static int atom_begin_and_assign_to_txnh(txn_atom ** atom_alloc, txn_handle * txnh)
73298+{
73299+ txn_atom *atom;
73300+ txn_mgr *mgr;
73301+
73302+ if (REISER4_DEBUG && rofs_tree(current_tree)) {
73303+ warning("nikita-3366", "Creating atom on rofs");
73304+ dump_stack();
73305+ }
73306+
73307+ if (*atom_alloc == NULL) {
73308+ (*atom_alloc) = kmem_cache_alloc(_atom_slab, get_gfp_mask());
73309+
73310+ if (*atom_alloc == NULL)
73311+ return RETERR(-ENOMEM);
73312+ }
73313+
73314+ /* also, the txnmgr spin lock should be taken before jnode and txnh
73315+ locks. */
73316+ mgr = &get_super_private(reiser4_get_current_sb())->tmgr;
73317+ spin_lock_txnmgr(mgr);
73318+ spin_lock_txnh(txnh);
73319+
73320+ /* Check whether new atom still needed */
73321+ if (txnh->atom != NULL) {
73322+ /* NOTE-NIKITA it is probably better to free
73323+ * atom_alloc here than thread it up to try_capture(). */
73324+
73325+ spin_unlock_txnh(txnh);
73326+ spin_unlock_txnmgr(mgr);
73327+
73328+ return -E_REPEAT;
73329+ }
73330+
73331+ atom = *atom_alloc;
73332+ *atom_alloc = NULL;
73333+
73334+ atom_init(atom);
73335+
73336+ assert("jmacd-17", atom_isclean(atom));
73337+
73338+ /*
73339+ * do not use spin_lock_atom because we have broken lock ordering here
73340+ * which is ok, as long as @atom is new and inaccessible for others.
73341+ */
73342+ spin_lock(&(atom->alock));
73343+
73344+ /* add atom to the end of transaction manager's list of atoms */
73345+ list_add_tail(&atom->atom_link, &mgr->atoms_list);
73346+ atom->atom_id = mgr->id_count++;
73347+ mgr->atom_count += 1;
73348+
73349+ /* Release txnmgr lock */
73350+ spin_unlock_txnmgr(mgr);
73351+
73352+ /* One reference until it commits. */
73353+ atomic_inc(&atom->refcount);
73354+ atom->stage = ASTAGE_CAPTURE_FUSE;
73355+ atom->super = reiser4_get_current_sb();
73356+ capture_assign_txnh_nolock(atom, txnh);
73357+
73358+ spin_unlock(&(atom->alock));
73359+ spin_unlock_txnh(txnh);
73360+
73361+ return -E_REPEAT;
73362+}
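+
+/* The allocation game played above, in miniature: allocate with no locks
+ held, take the locks, re-check whether the object is still needed, and keep
+ the preallocated spare across retries. A self-contained user-space sketch
+ (illustrative only; slot_t and slot_install() are hypothetical names): */
+#if 0
+#include <pthread.h>
+#include <stdlib.h>
+
+typedef struct slot {
+        pthread_mutex_t lock;
+        void *item;
+} slot_t;
+
+/* Mirrors atom_begin_and_assign_to_txnh(): both outcomes make the caller
+   restart its outer loop, and the spare survives across the retries. */
+static int slot_install(slot_t *s, void **spare)
+{
+        if (*spare == NULL) {
+                *spare = malloc(128);   /* allocate with no locks held */
+                if (*spare == NULL)
+                        return -1;      /* hard failure */
+        }
+        pthread_mutex_lock(&s->lock);
+        if (s->item == NULL) {
+                s->item = *spare;       /* still needed: install the spare */
+                *spare = NULL;
+        }
+        /* else: lost the race; keep *spare for a later attempt */
+        pthread_mutex_unlock(&s->lock);
+        return 1;                       /* "repeat", like -E_REPEAT above */
+}
+#endif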
73363+
73364+/* Return true if an atom is currently "open". */
73365+static int atom_isopen(const txn_atom * atom)
73366+{
73367+ assert("umka-185", atom != NULL);
73368+
73369+ return atom->stage > 0 && atom->stage < ASTAGE_PRE_COMMIT;
73370+}
73371+
73372+/* Return the number of pointers to this atom that must be updated during fusion. This
73373+ approximates the amount of work to be done. Fusion chooses the atom with fewer
73374+ pointers to fuse into the atom with more pointers. */
73375+static int atom_pointer_count(const txn_atom * atom)
73376+{
73377+ assert("umka-187", atom != NULL);
73378+
73379+ /* This is a measure of the amount of work needed to fuse this atom
73380+ * into another. */
73381+ return atom->txnh_count + atom->capture_count;
73382+}
73383+
73384+/* Called holding the atom lock, this removes the atom from the transaction manager list
73385+ and frees it. */
73386+static void atom_free(txn_atom * atom)
73387+{
73388+ txn_mgr *mgr = &get_super_private(reiser4_get_current_sb())->tmgr;
73389+
73390+ assert("umka-188", atom != NULL);
73391+ assert_spin_locked(&(atom->alock));
73392+
73393+ /* Remove from the txn_mgr's atom list */
73394+ assert_spin_locked(&(mgr->tmgr_lock));
73395+ mgr->atom_count -= 1;
73396+ list_del_init(&atom->atom_link);
73397+
73398+ /* Clean the atom */
73399+ assert("jmacd-16",
73400+ (atom->stage == ASTAGE_INVALID || atom->stage == ASTAGE_DONE));
73401+ atom->stage = ASTAGE_FREE;
73402+
73403+ blocknr_set_destroy(&atom->delete_set);
73404+ blocknr_set_destroy(&atom->wandered_map);
73405+
73406+ assert("jmacd-16", atom_isclean(atom));
73407+
73408+ spin_unlock_atom(atom);
73409+
73410+ kmem_cache_free(_atom_slab, atom);
73411+}
73412+
73413+static int atom_is_dotard(const txn_atom * atom)
73414+{
73415+ return time_after(jiffies, atom->start_time +
73416+ get_current_super_private()->tmgr.atom_max_age);
73417+}
73418+
73419+static int atom_can_be_committed(txn_atom * atom)
73420+{
73421+ assert_spin_locked(&(atom->alock));
73422+ assert("zam-885", atom->txnh_count > atom->nr_waiters);
73423+ return atom->txnh_count == atom->nr_waiters + 1;
73424+}
73425+
73426+/* Return true if an atom should commit now. This is determined by aging, atom
73427+ size or atom flags. */
73428+static int atom_should_commit(const txn_atom * atom)
73429+{
73430+ assert("umka-189", atom != NULL);
73431+ return
73432+ (atom->flags & ATOM_FORCE_COMMIT) ||
73433+ ((unsigned)atom_pointer_count(atom) >
73434+ get_current_super_private()->tmgr.atom_max_size)
73435+ || atom_is_dotard(atom);
73436+}
73437+
73438+/* return 1 if current atom exists and requires commit. */
73439+int current_atom_should_commit(void)
73440+{
73441+ txn_atom *atom;
73442+ int result = 0;
73443+
73444+ atom = get_current_atom_locked_nocheck();
73445+ if (atom) {
73446+ result = atom_should_commit(atom);
73447+ spin_unlock_atom(atom);
73448+ }
73449+ return result;
73450+}
73451+
73452+static int atom_should_commit_asap(const txn_atom * atom)
73453+{
73454+ unsigned int captured;
73455+ unsigned int pinnedpages;
73456+
73457+ assert("nikita-3309", atom != NULL);
73458+
73459+ captured = (unsigned)atom->capture_count;
73460+ pinnedpages = (captured >> PAGE_CACHE_SHIFT) * sizeof(znode);
73461+
73462+ return (pinnedpages > (totalram_pages >> 3)) || (atom->flushed > 100);
73463+}
73464+
73465+static jnode *find_first_dirty_in_list(struct list_head *head, int flags)
73466+{
73467+ jnode *first_dirty;
73468+
73469+ list_for_each_entry(first_dirty, head, capture_link) {
73470+ if (!(flags & JNODE_FLUSH_COMMIT)) {
73471+ /*
73472+ * skip jnodes which "heard banshee" or have active
73473+ * I/O
73474+ */
73475+ if (JF_ISSET(first_dirty, JNODE_HEARD_BANSHEE) ||
73476+ JF_ISSET(first_dirty, JNODE_WRITEBACK))
73477+ continue;
73478+ }
73479+ return first_dirty;
73480+ }
73481+ return NULL;
73482+}
73483+
73484+/* Get the first dirty node from the atom's dirty_nodes[n] lists; return NULL
73485+ if the atom has no dirty nodes on its lists */
73486+jnode *find_first_dirty_jnode(txn_atom * atom, int flags)
73487+{
73488+ jnode *first_dirty;
73489+ tree_level level;
73490+
73491+ assert_spin_locked(&(atom->alock));
73492+
73493+ /* The flush starts from LEAF_LEVEL (=1). */
73494+ for (level = 1; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1) {
73495+ if (list_empty_careful(ATOM_DIRTY_LIST(atom, level)))
73496+ continue;
73497+
73498+ first_dirty =
73499+ find_first_dirty_in_list(ATOM_DIRTY_LIST(atom, level),
73500+ flags);
73501+ if (first_dirty)
73502+ return first_dirty;
73503+ }
73504+
73505+ /* znode-above-root is on the list #0. */
73506+ return find_first_dirty_in_list(ATOM_DIRTY_LIST(atom, 0), flags);
73507+}
73508+
73509+static void dispatch_wb_list(txn_atom * atom, flush_queue_t * fq)
73510+{
73511+ jnode *cur;
73512+
73513+ assert("zam-905", atom_is_protected(atom));
73514+
73515+ cur = list_entry(ATOM_WB_LIST(atom)->next, jnode, capture_link);
73516+ while (ATOM_WB_LIST(atom) != &cur->capture_link) {
73517+ jnode *next = list_entry(cur->capture_link.next, jnode, capture_link);
73518+
73519+ spin_lock_jnode(cur);
73520+ if (!JF_ISSET(cur, JNODE_WRITEBACK)) {
73521+ if (JF_ISSET(cur, JNODE_DIRTY)) {
73522+ queue_jnode(fq, cur);
73523+ } else {
73524+ /* move jnode to atom's clean list */
73525+ list_move_tail(&cur->capture_link,
73526+ ATOM_CLEAN_LIST(atom));
73527+ }
73528+ }
73529+ spin_unlock_jnode(cur);
73530+
73531+ cur = next;
73532+ }
73533+}
73534+
73535+/* Scan current atom->writeback_nodes list, re-submit dirty and !writeback
73536+ * jnodes to disk. */
73537+static int submit_wb_list(void)
73538+{
73539+ int ret;
73540+ flush_queue_t *fq;
73541+
73542+ fq = get_fq_for_current_atom();
73543+ if (IS_ERR(fq))
73544+ return PTR_ERR(fq);
73545+
73546+ dispatch_wb_list(fq->atom, fq);
73547+ spin_unlock_atom(fq->atom);
73548+
73549+ ret = write_fq(fq, NULL, 1);
73550+ fq_put(fq);
73551+
73552+ return ret;
73553+}
73554+
73555+/* Wait completion of all writes, re-submit atom writeback list if needed. */
73556+static int current_atom_complete_writes(void)
73557+{
73558+ int ret;
73559+
73560+ /* Each jnode from that list was modified and dirtied while it already
73561+ * had an i/o request running. After i/o completion we have to resubmit
73562+ * them to disk again. */
73563+ ret = submit_wb_list();
73564+ if (ret < 0)
73565+ return ret;
73566+
73567+ /* Wait for all i/o completion */
73568+ ret = current_atom_finish_all_fq();
73569+ if (ret)
73570+ return ret;
73571+
73572+ /* Scan wb list again; all i/o should be completed, we re-submit dirty
73573+ * nodes to disk */
73574+ ret = submit_wb_list();
73575+ if (ret < 0)
73576+ return ret;
73577+
73578+ /* Wait for all nodes we just submitted */
73579+ return current_atom_finish_all_fq();
73580+}
73581+
73582+#define TOOMANYFLUSHES (1 << 13)
73583+
73584+/* Called with the atom locked and no open "active" transaction handles except
73585+ ours, this function calls flush_current_atom() until all dirty nodes are
73586+ processed. Then it initiates commit processing.
73587+
73588+ Called by the single remaining open "active" txnh, which is closing. Other
73589+ open txnhs belong to processes which wait for atom commit in the commit_txnh()
73590+ routine. They are counted as "waiters" in atom->nr_waiters. Therefore as
73591+ long as we hold the atom lock none of the jnodes can be captured and/or
73592+ locked.
73593+
73594+ Return value is an error code if commit fails.
73595+*/
73596+static int commit_current_atom(long *nr_submitted, txn_atom ** atom)
73597+{
73598+ reiser4_super_info_data *sbinfo = get_current_super_private();
73599+ long ret = 0;
73600+ /* how many times jnode_flush() was called as a part of attempt to
73601+ * commit this atom. */
73602+ int flushiters;
73603+
73604+ assert("zam-888", atom != NULL && *atom != NULL);
73605+ assert_spin_locked(&((*atom)->alock));
73606+ assert("zam-887", get_current_context()->trans->atom == *atom);
73607+ assert("jmacd-151", atom_isopen(*atom));
73608+
73609+ /* lock ordering: delete_sema and commit_sema are unordered */
73610+ assert("nikita-3184",
73611+ get_current_super_private()->delete_sema_owner != current);
73612+
73613+ for (flushiters = 0;; ++flushiters) {
73614+ ret =
73615+ flush_current_atom(JNODE_FLUSH_WRITE_BLOCKS |
73616+ JNODE_FLUSH_COMMIT,
73617+ LONG_MAX /* nr_to_write */ ,
73618+ nr_submitted, atom, NULL);
73619+ if (ret != -E_REPEAT)
73620+ break;
73621+
73622+ /* if atom's dirty list contains one znode which is
73623+ HEARD_BANSHEE and is locked, we have to allow the lock owner to
73624+ continue and uncapture that znode */
73625+ preempt_point();
73626+
73627+ *atom = get_current_atom_locked();
73628+ if (flushiters > TOOMANYFLUSHES && IS_POW(flushiters)) {
73629+ warning("nikita-3176",
73630+ "Flushing like mad: %i", flushiters);
73631+ info_atom("atom", *atom);
73632+ DEBUGON(flushiters > (1 << 20));
73633+ }
73634+ }
73635+
73636+ if (ret)
73637+ return ret;
73638+
73639+ assert_spin_locked(&((*atom)->alock));
73640+
73641+ if (!atom_can_be_committed(*atom)) {
73642+ spin_unlock_atom(*atom);
73643+ return RETERR(-E_REPEAT);
73644+ }
73645+
73646+ if ((*atom)->capture_count == 0)
73647+ goto done;
73648+
73649+ /* Up to this point we have been flushing, and after flush is called we
73650+ return -E_REPEAT. Now we can commit. We cannot return -E_REPEAT
73651+ at this point; commit must be successful. */
73652+ atom_set_stage(*atom, ASTAGE_PRE_COMMIT);
73653+ ON_DEBUG(((*atom)->committer = current));
73654+ spin_unlock_atom(*atom);
73655+
73656+ ret = current_atom_complete_writes();
73657+ if (ret)
73658+ return ret;
73659+
73660+ assert("zam-906", list_empty(ATOM_WB_LIST(*atom)));
73661+
73662+ /* isolate critical code path which should be executed by only one
73663+ * thread using tmgr semaphore */
73664+ down(&sbinfo->tmgr.commit_semaphore);
73665+
73666+ ret = reiser4_write_logs(nr_submitted);
73667+ if (ret < 0)
73668+ reiser4_panic("zam-597", "write log failed (%ld)\n", ret);
73669+
73670+ /* The atom->ovrwr_nodes list is processed with the commit semaphore held
73671+ because of bitmap nodes, which are captured in a special way in
73672+ bitmap_pre_commit_hook(); that way does not include
73673+ capture_fuse_wait() as the capturing of other nodes does -- the commit
73674+ semaphore is used for transaction isolation instead. */
73675+ invalidate_list(ATOM_OVRWR_LIST(*atom));
73676+ up(&sbinfo->tmgr.commit_semaphore);
73677+
73678+ invalidate_list(ATOM_CLEAN_LIST(*atom));
73679+ invalidate_list(ATOM_WB_LIST(*atom));
73680+ assert("zam-927", list_empty(&(*atom)->inodes));
73681+
73682+ spin_lock_atom(*atom);
73683+ done:
73684+ atom_set_stage(*atom, ASTAGE_DONE);
73685+ ON_DEBUG((*atom)->committer = NULL);
73686+
73687+ /* Atom's state changes, so wake up everybody waiting for this
73688+ event. */
73689+ wakeup_atom_waiting_list(*atom);
73690+
73691+ /* Decrement the "until commit" reference, at least one txnh (the caller) is
73692+ still open. */
73693+ atomic_dec(&(*atom)->refcount);
73694+
73695+ assert("jmacd-1070", atomic_read(&(*atom)->refcount) > 0);
73696+ assert("jmacd-1062", (*atom)->capture_count == 0);
73697+ BUG_ON((*atom)->capture_count != 0);
73698+ assert_spin_locked(&((*atom)->alock));
73699+
73700+ return ret;
73701+}
73702+
73703+/* TXN_TXNH */
73704+
73705+/**
73706+ * force_commit_atom - commit current atom and wait commit completion
73707+ * @txnh:
73708+ *
73709+ * Commits current atom and wait commit completion; current atom and @txnh have
73710+ * to be spinlocked before call, this function unlocks them on exit.
73711+ */
73712+int force_commit_atom(txn_handle *txnh)
73713+{
73714+ txn_atom *atom;
73715+
73716+ assert("zam-837", txnh != NULL);
73717+ assert_spin_locked(&(txnh->hlock));
73718+ assert("nikita-2966", lock_stack_isclean(get_current_lock_stack()));
73719+
73720+ atom = txnh->atom;
73721+
73722+ assert("zam-834", atom != NULL);
73723+ assert_spin_locked(&(atom->alock));
73724+
73725+ /*
73726+ * Set flags for atom and txnh: forcing atom commit and waiting for
73727+ * commit completion
73728+ */
73729+ txnh->flags |= TXNH_WAIT_COMMIT;
73730+ atom->flags |= ATOM_FORCE_COMMIT;
73731+
73732+ spin_unlock_txnh(txnh);
73733+ spin_unlock_atom(atom);
73734+
73735+ /* commit is here */
73736+ txn_restart_current();
73737+ return 0;
73738+}
73739+
73740+/* Called to force commit of any outstanding atoms. @commit_all_atoms controls
73741+ * whether we commit all atoms, including new ones created after this
73742+ * function is called. */
73743+int txnmgr_force_commit_all(struct super_block *super, int commit_all_atoms)
73744+{
73745+ int ret;
73746+ txn_atom *atom;
73747+ txn_mgr *mgr;
73748+ txn_handle *txnh;
73749+ unsigned long start_time = jiffies;
73750+ reiser4_context *ctx = get_current_context();
73751+
73752+ assert("nikita-2965", lock_stack_isclean(get_current_lock_stack()));
73753+ assert("nikita-3058", commit_check_locks());
73754+
73755+ txn_restart_current();
73756+
73757+ mgr = &get_super_private(super)->tmgr;
73758+
73759+ txnh = ctx->trans;
73760+
73761+ again:
73762+
73763+ spin_lock_txnmgr(mgr);
73764+
73765+ list_for_each_entry(atom, &mgr->atoms_list, atom_link) {
73766+ spin_lock_atom(atom);
73767+
73768+ /* Commit any atom which can be committed. If @commit_all_atoms
73769+ * is not set we commit only atoms which were created before
73770+ * this call started. */
73771+ if (commit_all_atoms
73772+ || time_before_eq(atom->start_time, start_time)) {
73773+ if (atom->stage <= ASTAGE_POST_COMMIT) {
73774+ spin_unlock_txnmgr(mgr);
73775+
73776+ if (atom->stage < ASTAGE_PRE_COMMIT) {
73777+ spin_lock_txnh(txnh);
73778+ /* Add force-context txnh */
73779+ capture_assign_txnh_nolock(atom, txnh);
73780+ ret = force_commit_atom(txnh);
73781+ if (ret)
73782+ return ret;
73783+ } else
73784+ /* wait atom commit */
73785+ atom_wait_event(atom);
73786+
73787+ goto again;
73788+ }
73789+ }
73790+
73791+ spin_unlock_atom(atom);
73792+ }
73793+
73794+#if REISER4_DEBUG
73795+ if (commit_all_atoms) {
73796+ reiser4_super_info_data *sbinfo = get_super_private(super);
73797+ spin_lock_reiser4_super(sbinfo);
73798+ assert("zam-813",
73799+ sbinfo->blocks_fake_allocated_unformatted == 0);
73800+ assert("zam-812", sbinfo->blocks_fake_allocated == 0);
73801+ spin_unlock_reiser4_super(sbinfo);
73802+ }
73803+#endif
73804+
73805+ spin_unlock_txnmgr(mgr);
73806+
73807+ return 0;
73808+}
73809+
73810+/* check whether commit_some_atoms() can commit @atom. Locking is up to the
73811+ * caller */
73812+static int atom_is_committable(txn_atom * atom)
73813+{
73814+ return
73815+ atom->stage < ASTAGE_PRE_COMMIT &&
73816+ atom->txnh_count == atom->nr_waiters && atom_should_commit(atom);
73817+}
73818+
73819+/* called periodically from ktxnmgrd to commit old atoms. Releases ktxnmgrd spin
73820+ * lock at exit */
73821+int commit_some_atoms(txn_mgr * mgr)
73822+{
73823+ int ret = 0;
73824+ txn_atom *atom;
73825+ txn_handle *txnh;
73826+ reiser4_context *ctx;
73827+ struct list_head *pos, *tmp;
73828+
73829+ ctx = get_current_context();
73830+ assert("nikita-2444", ctx != NULL);
73831+
73832+ txnh = ctx->trans;
73833+ spin_lock_txnmgr(mgr);
73834+
73835+ /*
73836+ * this is to avoid gcc complain that atom might be used
73837+ * uninitialized
73838+ */
73839+ atom = NULL;
73840+
73841+ /* look for atom to commit */
73842+ list_for_each_safe(pos, tmp, &mgr->atoms_list) {
73843+ atom = list_entry(pos, txn_atom, atom_link);
73844+ /*
73845+ * first test without taking atom spin lock, whether it is
73846+ * eligible for committing at all
73847+ */
73848+ if (atom_is_committable(atom)) {
73849+ /* now, take spin lock and re-check */
73850+ spin_lock_atom(atom);
73851+ if (atom_is_committable(atom))
73852+ break;
73853+ spin_unlock_atom(atom);
73854+ }
73855+ }
73856+
73857+ ret = (&mgr->atoms_list == pos);
73858+ spin_unlock_txnmgr(mgr);
73859+
73860+ if (ret) {
73861+ /* nothing found */
73862+ spin_unlock(&mgr->daemon->guard);
73863+ return 0;
73864+ }
73865+
73866+ spin_lock_txnh(txnh);
73867+
73868+ BUG_ON(atom == NULL);
73869+ /* Set the atom to force committing */
73870+ atom->flags |= ATOM_FORCE_COMMIT;
73871+
73872+ /* Add force-context txnh */
73873+ capture_assign_txnh_nolock(atom, txnh);
73874+
73875+ spin_unlock_txnh(txnh);
73876+ spin_unlock_atom(atom);
73877+
73878+ /* we are about to release daemon spin lock, notify daemon it
73879+ has to rescan atoms */
73880+ mgr->daemon->rescan = 1;
73881+ spin_unlock(&mgr->daemon->guard);
73882+ txn_restart_current();
73883+ return 0;
73884+}
73885+
73886+static int txn_try_to_fuse_small_atom(txn_mgr * tmgr, txn_atom * atom)
73887+{
73888+ int atom_stage;
73889+ txn_atom *atom_2;
73890+ int repeat;
73891+
73892+ assert("zam-1051", atom->stage < ASTAGE_PRE_COMMIT);
73893+
73894+ atom_stage = atom->stage;
73895+ repeat = 0;
73896+
73897+ if (!spin_trylock_txnmgr(tmgr)) {
73898+ atomic_inc(&atom->refcount);
73899+ spin_unlock_atom(atom);
73900+ spin_lock_txnmgr(tmgr);
73901+ spin_lock_atom(atom);
73902+ repeat = 1;
73903+ if (atom->stage != atom_stage) {
73904+ spin_unlock_txnmgr(tmgr);
73905+ atom_dec_and_unlock(atom);
73906+ return -E_REPEAT;
73907+ }
73908+ atomic_dec(&atom->refcount);
73909+ }
73910+
73911+ list_for_each_entry(atom_2, &tmgr->atoms_list, atom_link) {
73912+ if (atom == atom_2)
73913+ continue;
73914+ /*
73915+ * if trylock does not succeed we just do not fuse with that
73916+ * atom.
73917+ */
73918+ if (spin_trylock_atom(atom_2)) {
73919+ if (atom_2->stage < ASTAGE_PRE_COMMIT) {
73920+ spin_unlock_txnmgr(tmgr);
73921+ capture_fuse_into(atom_2, atom);
73922+ /* all locks are lost; we can only repeat here */
73923+ return -E_REPEAT;
73924+ }
73925+ spin_unlock_atom(atom_2);
73926+ }
73927+ }
73928+ atom->flags |= ATOM_CANCEL_FUSION;
73929+ spin_unlock_txnmgr(tmgr);
73930+ if (repeat) {
73931+ spin_unlock_atom(atom);
73932+ return -E_REPEAT;
73933+ }
73934+ return 0;
73935+}
73936+
73937+/* Calls jnode_flush() for the current atom if it exists; if not, just takes
73938+ another atom and calls jnode_flush() for it. If the current transaction
73939+ handle already has an assigned atom (the current atom) we have to close the
73940+ current transaction prior to switching to another atom or do something with
73941+ the current atom. This code tries to flush the current atom.
73942+
73943+ flush_some_atom() is called as part of the memory clearing process. It is
73944+ invoked from balance_dirty_pages(), pdflushd, and entd.
73945+
73946+ If we can flush no nodes, the atom is committed, because this frees memory.
73947+
73948+ If the atom is too large or too old it is committed as well.
73949+*/
73950+int
73951+flush_some_atom(jnode * start, long *nr_submitted, const struct writeback_control *wbc,
73952+ int flags)
73953+{
73954+ reiser4_context *ctx = get_current_context();
73955+ txn_mgr *tmgr = &get_super_private(ctx->super)->tmgr;
73956+ txn_handle *txnh = ctx->trans;
73957+ txn_atom *atom;
73958+ int ret;
73959+
73960+ BUG_ON(wbc->nr_to_write == 0);
73961+ BUG_ON(*nr_submitted != 0);
73962+ assert("zam-1042", txnh != NULL);
73963+ repeat:
73964+ if (txnh->atom == NULL) {
73965+ /* current atom is not available, take first from txnmgr */
73966+ spin_lock_txnmgr(tmgr);
73967+
73968+ /* traverse the list of all atoms */
73969+ list_for_each_entry(atom, &tmgr->atoms_list, atom_link) {
73970+ /* lock atom before checking its state */
73971+ spin_lock_atom(atom);
73972+
73973+ /*
73974+ * we need an atom which is not being committed and
73975+ * which has no flushers (jnode_flush() add one flusher
73976+ * at the beginning and subtract one at the end).
73977+ */
73978+ if (atom->stage < ASTAGE_PRE_COMMIT &&
73979+ atom->nr_flushers == 0) {
73980+ spin_lock_txnh(txnh);
73981+ capture_assign_txnh_nolock(atom, txnh);
73982+ spin_unlock_txnh(txnh);
73983+
73984+ goto found;
73985+ }
73986+
73987+ spin_unlock_atom(atom);
73988+ }
73989+
73990+ /*
73991+ * Write throttling is the case where no atom can be
73992+ * flushed/committed.
73993+ */
73994+ if (!current_is_pdflush() && !wbc->nonblocking) {
73995+ list_for_each_entry(atom, &tmgr->atoms_list, atom_link) {
73996+ spin_lock_atom(atom);
73997+ /* Repeat the check from the above. */
73998+ if (atom->stage < ASTAGE_PRE_COMMIT
73999+ && atom->nr_flushers == 0) {
74000+ spin_lock_txnh(txnh);
74001+ capture_assign_txnh_nolock(atom, txnh);
74002+ spin_unlock_txnh(txnh);
74003+
74004+ goto found;
74005+ }
74006+ if (atom->stage <= ASTAGE_POST_COMMIT) {
74007+ spin_unlock_txnmgr(tmgr);
74008+ /*
74009+ * we just wait until atom's flusher
74010+ * makes progress in flushing or
74011+ * committing the atom
74012+ */
74013+ atom_wait_event(atom);
74014+ goto repeat;
74015+ }
74016+ spin_unlock_atom(atom);
74017+ }
74018+ }
74019+ spin_unlock_txnmgr(tmgr);
74020+ return 0;
74021+ found:
74022+ spin_unlock_txnmgr(tmgr);
74023+ } else
74024+ atom = get_current_atom_locked();
74025+
74026+ BUG_ON(atom->super != ctx->super);
74027+ assert("vs-35", atom->super == ctx->super);
74028+ if (start) {
74029+ spin_lock_jnode(start);
74030+ ret = (atom == start->atom) ? 1 : 0;
74031+ spin_unlock_jnode(start);
74032+ if (ret == 0)
74033+ start = NULL;
74034+ }
74035+ ret = flush_current_atom(flags, wbc->nr_to_write, nr_submitted, &atom, start);
74036+ if (ret == 0) {
74037+ /* flush_current_atom returns 0 only if it submitted nothing
74038+ for write */
74039+ BUG_ON(*nr_submitted != 0);
74040+ if (*nr_submitted == 0 || atom_should_commit_asap(atom)) {
74041+ if (atom->capture_count < tmgr->atom_min_size &&
74042+ !(atom->flags & ATOM_CANCEL_FUSION)) {
74043+ ret = txn_try_to_fuse_small_atom(tmgr, atom);
74044+ if (ret == -E_REPEAT) {
74045+ preempt_point();
74046+ goto repeat;
74047+ }
74048+ }
74049+ /* if early flushing could not make more nodes clean,
74050+ * or atom is too old/large,
74051+ * we force current atom to commit */
74052+ /* wait for commit completion but only if this
74053+ * wouldn't stall pdflushd and the entd thread. */
74054+ if (!wbc->nonblocking && !ctx->entd)
74055+ txnh->flags |= TXNH_WAIT_COMMIT;
74056+ atom->flags |= ATOM_FORCE_COMMIT;
74057+ }
74058+ spin_unlock_atom(atom);
74059+ } else if (ret == -E_REPEAT) {
74060+ if (*nr_submitted == 0) {
74061+ /* let others who hamper flushing (hold long-term locks,
74062+ for instance) free the way for flush */
74063+ preempt_point();
74064+ goto repeat;
74065+ }
74066+ ret = 0;
74067+ }
74068+/*
74069+ if (*nr_submitted > wbc->nr_to_write)
74070+ warning("", "asked for %ld, written %ld\n", wbc->nr_to_write, *nr_submitted);
74071+*/
74072+ txn_restart(ctx);
74073+
74074+ return ret;
74075+}
74076+
74077+/* Remove processed nodes from an atom's list (thereby removing them from the transaction). */
74078+void invalidate_list(struct list_head *head)
74079+{
74080+ while (!list_empty(head)) {
74081+ jnode *node;
74082+
74083+ node = list_entry(head->next, jnode, capture_link);
74084+ spin_lock_jnode(node);
74085+ uncapture_block(node);
74086+ jput(node);
74087+ }
74088+}
74089+
74090+static void init_wlinks(txn_wait_links * wlinks)
74091+{
74092+ wlinks->_lock_stack = get_current_lock_stack();
74093+ INIT_LIST_HEAD(&wlinks->_fwaitfor_link);
74094+ INIT_LIST_HEAD(&wlinks->_fwaiting_link);
74095+ wlinks->waitfor_cb = NULL;
74096+ wlinks->waiting_cb = NULL;
74097+}
74098+
74099+/* Add the current thread to the atom's waitfor list and wait for somebody to wake us up. */
74100+void atom_wait_event(txn_atom * atom)
74101+{
74102+ txn_wait_links _wlinks;
74103+
74104+ assert_spin_locked(&(atom->alock));
74105+ assert("nikita-3156",
74106+ lock_stack_isclean(get_current_lock_stack()) ||
74107+ atom->nr_running_queues > 0);
74108+
74109+ init_wlinks(&_wlinks);
74110+ list_add_tail(&_wlinks._fwaitfor_link, &atom->fwaitfor_list);
74111+ atomic_inc(&atom->refcount);
74112+ spin_unlock_atom(atom);
74113+
74114+ prepare_to_sleep(_wlinks._lock_stack);
74115+ go_to_sleep(_wlinks._lock_stack);
74116+
74117+ spin_lock_atom(atom);
74118+ list_del(&_wlinks._fwaitfor_link);
74119+ atom_dec_and_unlock(atom);
74120+}
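+
+/* The wait protocol above, reduced to its user-space shape: register as a
+ waiter, drop the lock, sleep, then relock and unregister. A hedged sketch,
+ not reiser4 code (obj_t, obj_wait_event() and obj_send_event() are
+ hypothetical names); pthread_cond_wait() performs atomically what the
+ explicit unlock/prepare_to_sleep/go_to_sleep/relock dance does by hand, and
+ the embedded condition variable removes the need for the refcount that
+ keeps the atom alive while its waiters sleep. */
+#if 0
+#include <pthread.h>
+
+typedef struct obj {
+        pthread_mutex_t lock;
+        pthread_cond_t event;
+        int generation;         /* bumped on every state change */
+} obj_t;
+
+/* Called with obj->lock held; returns with it held after a state change. */
+static void obj_wait_event(obj_t *obj)
+{
+        int gen = obj->generation;
+
+        while (obj->generation == gen)
+                pthread_cond_wait(&obj->event, &obj->lock);
+}
+
+/* Called with obj->lock held; wakes every waiter. */
+static void obj_send_event(obj_t *obj)
+{
+        obj->generation++;
+        pthread_cond_broadcast(&obj->event);
+}
+#endif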
74121+
74122+void atom_set_stage(txn_atom * atom, txn_stage stage)
74123+{
74124+ assert("nikita-3535", atom != NULL);
74125+ assert_spin_locked(&(atom->alock));
74126+ assert("nikita-3536", ASTAGE_FREE <= stage && stage <= ASTAGE_INVALID);
74127+ /* Excelsior! */
74128+ assert("nikita-3537", stage >= atom->stage);
74129+ if (atom->stage != stage) {
74130+ atom->stage = stage;
74131+ atom_send_event(atom);
74132+ }
74133+}
74134+
74135+/* wake all threads which wait for an event */
74136+void atom_send_event(txn_atom * atom)
74137+{
74138+ assert_spin_locked(&(atom->alock));
74139+ wakeup_atom_waitfor_list(atom);
74140+}
74141+
74142+/* Informs txn manager code that the owner of this txn_handle should wait for
74143+ atom commit completion (for example, because it does fsync(2)) */
74144+static int should_wait_commit(txn_handle * h)
74145+{
74146+ return h->flags & TXNH_WAIT_COMMIT;
74147+}
74148+
74149+typedef struct commit_data {
74150+ txn_atom *atom;
74151+ txn_handle *txnh;
74152+ long nr_written;
74153+ /* as an optimization we start committing the atom by first trying to
74154+ * flush it a few times without switching into ASTAGE_CAPTURE_WAIT. This
74155+ * reduces stalls due to other threads waiting for the atom in the
74156+ * ASTAGE_CAPTURE_WAIT stage. ->preflush is a counter of these
74157+ * preliminary flushes. */
74158+ int preflush;
74159+ /* have we waited on atom. */
74160+ int wait;
74161+ int failed;
74162+ int wake_ktxnmgrd_up;
74163+} commit_data;
74164+
74165+/*
74166+ * Called from commit_txnh() repeatedly, until either error happens, or atom
74167+ * commits successfully.
74168+ */
74169+static int try_commit_txnh(commit_data * cd)
74170+{
74171+ int result;
74172+
74173+ assert("nikita-2968", lock_stack_isclean(get_current_lock_stack()));
74174+
74175+ /* Get the atom and txnh locked. */
74176+ cd->atom = txnh_get_atom(cd->txnh);
74177+ assert("jmacd-309", cd->atom != NULL);
74178+ spin_unlock_txnh(cd->txnh);
74179+
74180+ if (cd->wait) {
74181+ cd->atom->nr_waiters--;
74182+ cd->wait = 0;
74183+ }
74184+
74185+ if (cd->atom->stage == ASTAGE_DONE)
74186+ return 0;
74187+
74188+ if (cd->failed)
74189+ return 0;
74190+
74191+ if (atom_should_commit(cd->atom)) {
74192+ /* if atom is _very_ large schedule it for commit as soon as
74193+ * possible. */
74194+ if (atom_should_commit_asap(cd->atom)) {
74195+ /*
74196+ * When atom is in PRE_COMMIT or later stage following
74197+ * invariant (encoded in atom_can_be_committed())
74198+ * holds: there is exactly one non-waiter transaction
74199+ * handle opened on this atom. When thread wants to
74200+ * wait until atom commits (for example sync()) it
74201+ * waits on atom event after increasing
74202+ * atom->nr_waiters (see below in this function). It
74203+ * cannot be guaranteed that atom is already committed
74204+ * after receiving event, so loop has to be
74205+ * re-started. But if atom switched into PRE_COMMIT
74206+ * stage and became too large, we cannot change its
74207+ * state back to CAPTURE_WAIT (atom stage can only
74208+ * increase monotonically), hence this check.
74209+ */
74210+ if (cd->atom->stage < ASTAGE_CAPTURE_WAIT)
74211+ atom_set_stage(cd->atom, ASTAGE_CAPTURE_WAIT);
74212+ cd->atom->flags |= ATOM_FORCE_COMMIT;
74213+ }
74214+ if (cd->txnh->flags & TXNH_DONT_COMMIT) {
74215+ /*
74216+ * this thread (transaction handle that is) doesn't
74217+ * want to commit atom. Notify waiters that handle is
74218+ * closed. This can happen, for example, when we are
74219+ * under VFS directory lock and don't want to commit
74220+ * atom right now to avoid stalling other threads
74221+ * working in the same directory.
74222+ */
74223+
74224+ /* Wake the ktxnmgrd up if the ktxnmgrd is needed to
74225+ * commit this atom: no atom waiters and only one
74226+ * (our) open transaction handle. */
74227+ cd->wake_ktxnmgrd_up =
74228+ cd->atom->txnh_count == 1 &&
74229+ cd->atom->nr_waiters == 0;
74230+ atom_send_event(cd->atom);
74231+ result = 0;
74232+ } else if (!atom_can_be_committed(cd->atom)) {
74233+ if (should_wait_commit(cd->txnh)) {
74234+ /* sync(): wait for commit */
74235+ cd->atom->nr_waiters++;
74236+ cd->wait = 1;
74237+ atom_wait_event(cd->atom);
74238+ result = RETERR(-E_REPEAT);
74239+ } else {
74240+ result = 0;
74241+ }
74242+ } else if (cd->preflush > 0 && !is_current_ktxnmgrd()) {
74243+ /*
74244+ * optimization: flush atom without switching it into
74245+ * ASTAGE_CAPTURE_WAIT.
74246+ *
74247+ * But don't do this for ktxnmgrd, because ktxnmgrd
74248+ * should never block on atom fusion.
74249+ */
74250+ result = flush_current_atom(JNODE_FLUSH_WRITE_BLOCKS,
74251+ LONG_MAX, &cd->nr_written,
74252+ &cd->atom, NULL);
74253+ if (result == 0) {
74254+ spin_unlock_atom(cd->atom);
74255+ cd->preflush = 0;
74256+ result = RETERR(-E_REPEAT);
74257+ } else /* Atom wasn't flushed
74258+ * completely. Rinse. Repeat. */
74259+ --cd->preflush;
74260+ } else {
74261+ /* We change atom state to ASTAGE_CAPTURE_WAIT to
74262+			   prevent atom fusion and count ourselves as an active
74263+ flusher */
74264+ atom_set_stage(cd->atom, ASTAGE_CAPTURE_WAIT);
74265+ cd->atom->flags |= ATOM_FORCE_COMMIT;
74266+
74267+ result =
74268+ commit_current_atom(&cd->nr_written, &cd->atom);
74269+ if (result != 0 && result != -E_REPEAT)
74270+ cd->failed = 1;
74271+ }
74272+ } else
74273+ result = 0;
74274+
74275+#if REISER4_DEBUG
74276+ if (result == 0)
74277+ assert_spin_locked(&(cd->atom->alock));
74278+#endif
74279+
74280+ /* perfectly valid assertion, except that when atom/txnh is not locked
74281+ * fusion can take place, and cd->atom points nowhere. */
74282+ /*
74283+ assert("jmacd-1028", ergo(result != 0, spin_atom_is_not_locked(cd->atom)));
74284+ */
74285+ return result;
74286+}
74287+
74288+/* Called to commit a transaction handle. This decrements the atom's number of open
74289+   handles and, if it is the last handle to commit and the atom should commit, initiates
74290+   atom commit. If the commit does not fail, returns the number of written blocks */
74291+static int commit_txnh(txn_handle * txnh)
74292+{
74293+ commit_data cd;
74294+ assert("umka-192", txnh != NULL);
74295+
74296+ memset(&cd, 0, sizeof cd);
74297+ cd.txnh = txnh;
74298+ cd.preflush = 10;
74299+
74300+ /* calls try_commit_txnh() until either atom commits, or error
74301+ * happens */
74302+ while (try_commit_txnh(&cd) != 0)
74303+ preempt_point();
74304+
74305+ spin_lock_txnh(txnh);
74306+
74307+ cd.atom->txnh_count -= 1;
74308+ txnh->atom = NULL;
74309+ /* remove transaction handle from atom's list of transaction handles */
74310+ list_del_init(&txnh->txnh_link);
74311+
74312+ spin_unlock_txnh(txnh);
74313+ atom_dec_and_unlock(cd.atom);
74314+	/* if the current thread doesn't want to do the commit itself
74315+	 * (TXNH_DONT_COMMIT is set, probably because it takes time), we do
74316+	 * that work asynchronously via the ktxnmgrd daemon. */
74317+ if (cd.wake_ktxnmgrd_up)
74318+ ktxnmgrd_kick(&get_current_super_private()->tmgr);
74319+
74320+ return 0;
74321+}
74322+
74323+/* TRY_CAPTURE */
74324+
74325+/* This routine attempts a single block-capture request. It may return -E_REPEAT if some
74326+ condition indicates that the request should be retried, and it may block if the
74327+ txn_capture mode does not include the TXN_CAPTURE_NONBLOCKING request flag.
74328+
74329+ This routine encodes the basic logic of block capturing described by:
74330+
74331+ http://namesys.com/v4/v4.html
74332+
74333+ Our goal here is to ensure that any two blocks that contain dependent modifications
74334+ should commit at the same time. This function enforces this discipline by initiating
74335+ fusion whenever a transaction handle belonging to one atom requests to read or write a
74336+ block belonging to another atom (TXN_CAPTURE_WRITE or TXN_CAPTURE_READ_ATOMIC).
74337+
74338+ In addition, this routine handles the initial assignment of atoms to blocks and
74339+ transaction handles. These are possible outcomes of this function:
74340+
74341+ 1. The block and handle are already part of the same atom: return immediate success
74342+
74343+ 2. The block is assigned but the handle is not: call capture_assign_txnh to assign
74344+ the handle to the block's atom.
74345+
74346+ 3. The handle is assigned but the block is not: call capture_assign_block to assign
74347+ the block to the handle's atom.
74348+
74349+ 4. Both handle and block are assigned, but to different atoms: call capture_init_fusion
74350+ to fuse atoms.
74351+
74352+ 5. Neither block nor handle are assigned: create a new atom and assign them both.
74353+
74354+ 6. A read request for a non-captured block: return immediate success.
74355+
74356+ This function acquires and releases the handle's spinlock. This function is called
74357+ under the jnode lock and if the return value is 0, it returns with the jnode lock still
74358+ held. If the return is -E_REPEAT or some other error condition, the jnode lock is
74359+   released. The external interface (try_capture) manages re-acquiring the jnode lock
74360+ in the failure case.
74361+*/
74362+static int try_capture_block(
74363+ txn_handle * txnh, jnode * node, txn_capture mode,
74364+ txn_atom ** atom_alloc)
74365+{
74366+ txn_atom *block_atom;
74367+ txn_atom *txnh_atom;
74368+
74369+ /* Should not call capture for READ_NONCOM requests, handled in try_capture. */
74370+ assert("jmacd-567", CAPTURE_TYPE(mode) != TXN_CAPTURE_READ_NONCOM);
74371+
74372+ /* FIXME-ZAM-HANS: FIXME_LATER_JMACD Should assert that atom->tree ==
74373+ * node->tree somewhere. */
74374+ assert("umka-194", txnh != NULL);
74375+ assert("umka-195", node != NULL);
74376+
74377+ /* The jnode is already locked! Being called from try_capture(). */
74378+ assert_spin_locked(&(node->guard));
74379+ block_atom = node->atom;
74380+
74381+ /* Get txnh spinlock, this allows us to compare txn_atom pointers but it doesn't
74382+ let us touch the atoms themselves. */
74383+ spin_lock_txnh(txnh);
74384+ txnh_atom = txnh->atom;
74385+	/* The capture process continues into one of four branches, depending
74386+	   on which of the two atoms (block atom (node->atom), current atom
74387+	   (txnh->atom)) exist. */
74388+ if (txnh_atom == NULL) {
74389+ if (block_atom == NULL) {
74390+ spin_unlock_txnh(txnh);
74391+ spin_unlock_jnode(node);
74392+ /* assign empty atom to the txnh and repeat */
74393+ return atom_begin_and_assign_to_txnh(atom_alloc, txnh);
74394+ } else {
74395+ atomic_inc(&block_atom->refcount);
74396+ /* node spin-lock isn't needed anymore */
74397+ spin_unlock_jnode(node);
74398+ if (!spin_trylock_atom(block_atom)) {
74399+ spin_unlock_txnh(txnh);
74400+ spin_lock_atom(block_atom);
74401+ spin_lock_txnh(txnh);
74402+ }
74403+ /* re-check state after getting txnh and the node
74404+ * atom spin-locked */
74405+ if (node->atom != block_atom || txnh->atom != NULL) {
74406+ spin_unlock_txnh(txnh);
74407+ atom_dec_and_unlock(block_atom);
74408+ return RETERR(-E_REPEAT);
74409+ }
74410+ atomic_dec(&block_atom->refcount);
74411+ if (block_atom->stage > ASTAGE_CAPTURE_WAIT ||
74412+ (block_atom->stage == ASTAGE_CAPTURE_WAIT &&
74413+ block_atom->txnh_count != 0))
74414+ return capture_fuse_wait(txnh, block_atom, NULL, mode);
74415+ capture_assign_txnh_nolock(block_atom, txnh);
74416+ spin_unlock_txnh(txnh);
74417+ spin_unlock_atom(block_atom);
74418+ return RETERR(-E_REPEAT);
74419+ }
74420+ } else {
74421+		/* It is time to perform a deadlock prevention check over the
74422+		   node we want to capture. It is possible this node was locked
74423+		   for read without capturing it. The optimization which allows
74424+		   us to do that helps to keep atoms independent as long as
74425+		   possible, but it may cause lock/fuse deadlock problems.
74426+
74427+		   A number of similar deadlock situations with locked but not
74428+		   captured nodes were found. In each situation there are two
74429+		   or more threads: one of them does flushing while another one
74430+		   does routine balancing or tree lookup. The flushing thread
74431+		   (F) sleeps in a long term locking request for node (N); another
74432+		   thread (A) sleeps trying to capture some node that already
74433+		   belongs to the atom of F, while F is in a state which prevents
74434+		   immediate fusion.
74435+
74436+		   Deadlocks of this kind cannot happen if node N was properly
74437+		   captured by thread A. The F thread fuses atoms before locking,
74438+		   therefore the current atom of thread F and the current atom of
74439+		   thread A become the same atom and thread A may proceed. This
74440+		   does not work if node N was not captured, because the fusion
74441+		   of atoms does not happen.
74442+
74443+		   The following scheme solves the deadlock: If
74444+		   longterm_lock_znode locks and does not capture a znode, that
74445+		   znode is marked as MISSED_IN_CAPTURE. A node marked this way
74446+		   is processed by the code below, which restores the missed
74447+		   capture and fuses the current atoms of all the node's lock
74448+		   owners by calling the fuse_not_fused_lock_owners() function. */
74449+ if (JF_ISSET(node, JNODE_MISSED_IN_CAPTURE)) {
74450+ JF_CLR(node, JNODE_MISSED_IN_CAPTURE);
74451+ if (jnode_is_znode(node) && znode_is_locked(JZNODE(node))) {
74452+ spin_unlock_txnh(txnh);
74453+ spin_unlock_jnode(node);
74454+ fuse_not_fused_lock_owners(txnh, JZNODE(node));
74455+ return RETERR(-E_REPEAT);
74456+ }
74457+ }
74458+ if (block_atom == NULL) {
74459+ atomic_inc(&txnh_atom->refcount);
74460+ spin_unlock_txnh(txnh);
74461+ if (!spin_trylock_atom(txnh_atom)) {
74462+ spin_unlock_jnode(node);
74463+ spin_lock_atom(txnh_atom);
74464+ spin_lock_jnode(node);
74465+ }
74466+ if (txnh->atom != txnh_atom || node->atom != NULL
74467+ || JF_ISSET(node, JNODE_IS_DYING)) {
74468+ spin_unlock_jnode(node);
74469+ atom_dec_and_unlock(txnh_atom);
74470+ return RETERR(-E_REPEAT);
74471+ }
74472+ atomic_dec(&txnh_atom->refcount);
74473+ capture_assign_block_nolock(txnh_atom, node);
74474+ spin_unlock_atom(txnh_atom);
74475+ } else {
74476+ if (txnh_atom != block_atom) {
74477+ if (mode & TXN_CAPTURE_DONT_FUSE) {
74478+ spin_unlock_txnh(txnh);
74479+ spin_unlock_jnode(node);
74480+ /* we are in a "no-fusion" mode and @node is
74481+ * already part of transaction. */
74482+ return RETERR(-E_NO_NEIGHBOR);
74483+ }
74484+ return capture_init_fusion(node, txnh, mode);
74485+ }
74486+ spin_unlock_txnh(txnh);
74487+ }
74488+ }
74489+ return 0;
74490+}
74491+
74492+static txn_capture
74493+build_capture_mode(jnode * node, znode_lock_mode lock_mode, txn_capture flags)
74494+{
74495+ txn_capture cap_mode;
74496+
74497+ assert_spin_locked(&(node->guard));
74498+
74499+ /* FIXME_JMACD No way to set TXN_CAPTURE_READ_MODIFY yet. */
74500+
74501+ if (lock_mode == ZNODE_WRITE_LOCK) {
74502+ cap_mode = TXN_CAPTURE_WRITE;
74503+ } else if (node->atom != NULL) {
74504+ cap_mode = TXN_CAPTURE_WRITE;
74505+ } else if (0 && /* txnh->mode == TXN_READ_FUSING && */
74506+ jnode_get_level(node) == LEAF_LEVEL) {
74507+ /* NOTE-NIKITA TXN_READ_FUSING is not currently used */
74508+ /* We only need a READ_FUSING capture at the leaf level. This
74509+ is because the internal levels of the tree (twigs included)
74510+ are redundant from the point of the user that asked for a
74511+ read-fusing transcrash. The user only wants to read-fuse
74512+ atoms due to reading uncommitted data that another user has
74513+ written. It is the file system that reads/writes the
74514+ internal tree levels, the user only reads/writes leaves. */
74515+ cap_mode = TXN_CAPTURE_READ_ATOMIC;
74516+ } else {
74517+ /* In this case (read lock at a non-leaf) there's no reason to
74518+ * capture. */
74519+ /* cap_mode = TXN_CAPTURE_READ_NONCOM; */
74520+ return 0;
74521+ }
74522+
74523+ cap_mode |= (flags & (TXN_CAPTURE_NONBLOCKING | TXN_CAPTURE_DONT_FUSE));
74524+ assert("nikita-3186", cap_mode != 0);
74525+ return cap_mode;
74526+}
74527+
74528+/* This is an external interface to try_capture_block(); it calls
74529+   try_capture_block() repeatedly as long as -E_REPEAT is returned.
74530+
74531+   @node: node to capture,
74532+   @lock_mode: read or write lock is used in capture mode calculation,
74533+   @flags: see the txn_capture flags enumeration (only
74534+   TXN_CAPTURE_NONBLOCKING and TXN_CAPTURE_DONT_FUSE are honoured here).
74535+
74536+   @return: 0 - node was successfully captured, -E_REPEAT - capture request
74537+   cannot be processed immediately as was requested in flags,
74538+   < 0 - other errors.
74539+*/
74540+int try_capture(jnode *node, znode_lock_mode lock_mode,
74541+ txn_capture flags)
74542+{
74543+ txn_atom *atom_alloc = NULL;
74544+ txn_capture cap_mode;
74545+ txn_handle *txnh = get_current_context()->trans;
74546+ int ret;
74547+
74548+ assert_spin_locked(&(node->guard));
74549+
74550+ repeat:
74551+ if (JF_ISSET(node, JNODE_IS_DYING))
74552+ return RETERR(-EINVAL);
74553+ if (node->atom != NULL && txnh->atom == node->atom)
74554+ return 0;
74555+ cap_mode = build_capture_mode(node, lock_mode, flags);
74556+ if (cap_mode == 0 ||
74557+ (!(cap_mode & TXN_CAPTURE_WTYPES) && node->atom == NULL)) {
74558+ /* Mark this node as "MISSED". It helps in further deadlock
74559+ * analysis */
74560+ if (jnode_is_znode(node))
74561+ JF_SET(node, JNODE_MISSED_IN_CAPTURE);
74562+ return 0;
74563+ }
74564+ /* Repeat try_capture as long as -E_REPEAT is returned. */
74565+ ret = try_capture_block(txnh, node, cap_mode, &atom_alloc);
74566+ /* Regardless of non_blocking:
74567+
74568+ If ret == 0 then jnode is still locked.
74569+ If ret != 0 then jnode is unlocked.
74570+ */
74571+#if REISER4_DEBUG
74572+ if (ret == 0)
74573+ assert_spin_locked(&(node->guard));
74574+ else
74575+ assert_spin_not_locked(&(node->guard));
74576+#endif
74577+ assert_spin_not_locked(&(txnh->guard));
74578+
74579+ if (ret == -E_REPEAT) {
74580+ /* E_REPEAT implies all locks were released, therefore we need
74581+ to take the jnode's lock again. */
74582+ spin_lock_jnode(node);
74583+
74584+ /* Although this may appear to be a busy loop, it is not.
74585+ There are several conditions that cause E_REPEAT to be
74586+ returned by the call to try_capture_block, all cases
74587+ indicating some kind of state change that means you should
74588+		   retry the request, and you will get a different result.  In some
74589+ cases this could be avoided with some extra code, but
74590+ generally it is done because the necessary locks were
74591+ released as a result of the operation and repeating is the
74592+ simplest thing to do (less bug potential). The cases are:
74593+ atom fusion returns E_REPEAT after it completes (jnode and
74594+ txnh were unlocked); race conditions in assign_block,
74595+ assign_txnh, and init_fusion return E_REPEAT (trylock
74596+ failure); after going to sleep in capture_fuse_wait
74597+ (request was blocked but may now succeed). I'm not quite
74598+ sure how capture_copy works yet, but it may also return
74599+ E_REPEAT. When the request is legitimately blocked, the
74600+ requestor goes to sleep in fuse_wait, so this is not a busy
74601+ loop. */
74602+ /* NOTE-NIKITA: still don't understand:
74603+
74604+ try_capture_block->capture_assign_txnh->spin_trylock_atom->E_REPEAT
74605+
74606+ looks like busy loop?
74607+ */
74608+ goto repeat;
74609+ }
74610+
74611+ /* free extra atom object that was possibly allocated by
74612+ try_capture_block().
74613+
74614+ Do this before acquiring jnode spin lock to
74615+ minimize time spent under lock. --nikita */
74616+ if (atom_alloc != NULL) {
74617+ kmem_cache_free(_atom_slab, atom_alloc);
74618+ }
74619+
74620+ if (ret != 0) {
74621+ if (ret == -E_BLOCK) {
74622+ assert("nikita-3360",
74623+ cap_mode & TXN_CAPTURE_NONBLOCKING);
74624+ ret = -E_REPEAT;
74625+ }
74626+
74627+ /* Failure means jnode is not locked. FIXME_LATER_JMACD May
74628+ want to fix the above code to avoid releasing the lock and
74629+		   re-acquiring it, but there are cases where failure occurs
74630+ when the lock is not held, and those cases would need to be
74631+ modified to re-take the lock. */
74632+ spin_lock_jnode(node);
74633+ }
74634+
74635+ /* Jnode is still locked. */
74636+ assert_spin_locked(&(node->guard));
74637+ return ret;
74638+}
74639+
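+/* A hedged, illustrative caller-side sketch, not part of the original
+ * patch: it restates the locking contract of try_capture() (compare
+ * try_capture_page_to_invalidate() below).  The caller enters and leaves
+ * with the jnode spin lock held; the helper name demo_capture_for_write
+ * is hypothetical. */
+static inline int demo_capture_for_write(jnode * node)
+{
+	int ret;
+
+	spin_lock_jnode(node);
+	ret = try_capture(node, ZNODE_WRITE_LOCK, 0 /* no option flags */);
+	/* on success and on failure alike, the jnode lock is held here */
+	spin_unlock_jnode(node);
+	return ret;
+}
+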
74640+static void release_two_atoms(txn_atom *one, txn_atom *two)
74641+{
74642+ spin_unlock_atom(one);
74643+ atom_dec_and_unlock(two);
74644+ spin_lock_atom(one);
74645+ atom_dec_and_unlock(one);
74646+}
74647+
74648+/* This function sets up a call to try_capture_block and repeats as long as -E_REPEAT is
74649+ returned by that routine. The txn_capture request mode is computed here depending on
74650+ the transaction handle's type and the lock request. This is called from the depths of
74651+ the lock manager with the jnode lock held and it always returns with the jnode lock
74652+ held.
74653+*/
74654+
74655+/* fuse all 'active' atoms of lock owners of given node. */
74656+static void fuse_not_fused_lock_owners(txn_handle * txnh, znode * node)
74657+{
74658+ lock_handle *lh;
74659+ int repeat;
74660+ txn_atom *atomh, *atomf;
74661+ reiser4_context *me = get_current_context();
74662+ reiser4_context *ctx = NULL;
74663+
74664+ assert_spin_not_locked(&(ZJNODE(node)->guard));
74665+ assert_spin_not_locked(&(txnh->hlock));
74666+
74667+ repeat:
74668+ repeat = 0;
74669+ atomh = txnh_get_atom(txnh);
74670+ spin_unlock_txnh(txnh);
74671+ assert("zam-692", atomh != NULL);
74672+
74673+ spin_lock_zlock(&node->lock);
74674+ /* inspect list of lock owners */
74675+ list_for_each_entry(lh, &node->lock.owners, owners_link) {
74676+ ctx = get_context_by_lock_stack(lh->owner);
74677+ if (ctx == me)
74678+ continue;
74679+		/* below we use two assumptions to avoid additional spin-locks
74680+		   when checking the condition:
74681+
74682+		   1) if the lock stack holds a lock, the transaction should be
74683+		   open, i.e. ctx->trans != NULL;
74684+
74685+		   2) reading the well-aligned ctx->trans->atom is atomic; if it
74686+		   equals the address of the spin-locked atomh, we take it that
74687+		   the atoms are the same and nothing has to be captured. */
74688+ if (atomh != ctx->trans->atom) {
74689+ reiser4_wake_up(lh->owner);
74690+ repeat = 1;
74691+ break;
74692+ }
74693+ }
74694+ if (repeat) {
74695+ if (!spin_trylock_txnh(ctx->trans)) {
74696+ spin_unlock_zlock(&node->lock);
74697+ spin_unlock_atom(atomh);
74698+ goto repeat;
74699+ }
74700+ atomf = ctx->trans->atom;
74701+ if (atomf == NULL) {
74702+ capture_assign_txnh_nolock(atomh, ctx->trans);
74703+			/* release the zlock _after_ assigning the atom to the
74704+ * transaction handle, otherwise the lock owner thread
74705+ * may unlock all znodes, exit kernel context and here
74706+ * we would access an invalid transaction handle. */
74707+ spin_unlock_zlock(&node->lock);
74708+ spin_unlock_atom(atomh);
74709+ spin_unlock_txnh(ctx->trans);
74710+ goto repeat;
74711+ }
74712+ assert("zam-1059", atomf != atomh);
74713+ spin_unlock_zlock(&node->lock);
74714+ atomic_inc(&atomh->refcount);
74715+ atomic_inc(&atomf->refcount);
74716+ spin_unlock_txnh(ctx->trans);
74717+ if (atomf > atomh) {
74718+ spin_lock_atom(atomf);
74719+ } else {
74720+ spin_unlock_atom(atomh);
74721+ spin_lock_atom(atomf);
74722+ spin_lock_atom(atomh);
74723+ }
74724+ if (atomh == atomf || !atom_isopen(atomh) || !atom_isopen(atomf)) {
74725+ release_two_atoms(atomf, atomh);
74726+ goto repeat;
74727+ }
74728+ atomic_dec(&atomh->refcount);
74729+ atomic_dec(&atomf->refcount);
74730+ capture_fuse_into(atomf, atomh);
74731+ goto repeat;
74732+ }
74733+ spin_unlock_zlock(&node->lock);
74734+ spin_unlock_atom(atomh);
74735+}
74736+
74737+/* This is the interface to capture unformatted nodes via their struct page
74738+ reference. Currently it is only used in reiser4_invalidatepage */
74739+int try_capture_page_to_invalidate(struct page *pg)
74740+{
74741+ int ret;
74742+ jnode *node;
74743+
74744+ assert("umka-292", pg != NULL);
74745+ assert("nikita-2597", PageLocked(pg));
74746+
74747+ if (IS_ERR(node = jnode_of_page(pg))) {
74748+ return PTR_ERR(node);
74749+ }
74750+
74751+ spin_lock_jnode(node);
74752+ unlock_page(pg);
74753+
74754+ ret = try_capture(node, ZNODE_WRITE_LOCK, 0);
74755+ spin_unlock_jnode(node);
74756+ jput(node);
74757+ lock_page(pg);
74758+ return ret;
74759+}
74760+
74761+/* This informs the transaction manager when a node is deleted. Add the block to the
74762+ atom's delete set and uncapture the block.
74763+
74764+VS-FIXME-HANS: this E_REPEAT paradigm clutters the code and creates a need for
74765+explanations. Find all the functions that use it, and unless there is some very
74766+good reason to use it (I have not noticed one so far and I doubt it exists, but maybe somewhere somehow....),
74767+move the loop to inside the function.
74768+
74769+VS-FIXME-HANS: can this code be at all streamlined? In particular, can you lock and unlock the jnode fewer times?
74770+ */
74771+void uncapture_page(struct page *pg)
74772+{
74773+ jnode *node;
74774+ txn_atom *atom;
74775+
74776+ assert("umka-199", pg != NULL);
74777+ assert("nikita-3155", PageLocked(pg));
74778+
74779+ clear_page_dirty_for_io(pg);
74780+
74781+ reiser4_wait_page_writeback(pg);
74782+
74783+ node = jprivate(pg);
74784+ BUG_ON(node == NULL);
74785+
74786+ spin_lock_jnode(node);
74787+
74788+ atom = jnode_get_atom(node);
74789+ if (atom == NULL) {
74790+ assert("jmacd-7111", !JF_ISSET(node, JNODE_DIRTY));
74791+ spin_unlock_jnode(node);
74792+ return;
74793+ }
74794+
74795+	/* We can remove a jnode from the transaction even if it is on the
74796+	 * flush queue prepped list; we only need to be sure that the flush
74797+	 * queue is not being written by write_fq().  write_fq() does not use
74798+	 * the atom spin lock to protect the prepped nodes list; instead it
74799+	 * increments the atom's nr_running_queues counter for the time when
74800+	 * the prepped list is not protected by the spin lock.  Here we check
74801+	 * this counter if we want to remove the jnode from the flush queue
74802+	 * and, if the counter is not zero, wait for all write_fq() calls for
74803+	 * this atom to complete.  This is not significant overhead. */
74804+ while (JF_ISSET(node, JNODE_FLUSH_QUEUED) && atom->nr_running_queues) {
74805+ spin_unlock_jnode(node);
74806+ /*
74807+ * at this moment we want to wait for "atom event", viz. wait
74808+ * until @node can be removed from flush queue. But
74809+ * atom_wait_event() cannot be called with page locked, because
74810+ * it deadlocks with jnode_extent_write(). Unlock page, after
74811+ * making sure (through page_cache_get()) that it cannot be
74812+ * released from memory.
74813+ */
74814+ page_cache_get(pg);
74815+ unlock_page(pg);
74816+ atom_wait_event(atom);
74817+ lock_page(pg);
74818+ /*
74819+		 * page may have been detached by ->writepage()->releasepage().
74820+ */
74821+ reiser4_wait_page_writeback(pg);
74822+ spin_lock_jnode(node);
74823+ page_cache_release(pg);
74824+ atom = jnode_get_atom(node);
74825+/* VS-FIXME-HANS: improve the commenting in this function */
74826+ if (atom == NULL) {
74827+ spin_unlock_jnode(node);
74828+ return;
74829+ }
74830+ }
74831+ uncapture_block(node);
74832+ spin_unlock_atom(atom);
74833+ jput(node);
74834+}
74835+
74836+/* this is used in extent's kill hook to uncapture and unhash jnodes attached to
74837+ * inode's tree of jnodes */
74838+void uncapture_jnode(jnode * node)
74839+{
74840+ txn_atom *atom;
74841+
74842+ assert_spin_locked(&(node->guard));
74843+ assert("", node->pg == 0);
74844+
74845+ atom = jnode_get_atom(node);
74846+ if (atom == NULL) {
74847+ assert("jmacd-7111", !JF_ISSET(node, JNODE_DIRTY));
74848+ spin_unlock_jnode(node);
74849+ return;
74850+ }
74851+
74852+ uncapture_block(node);
74853+ spin_unlock_atom(atom);
74854+ jput(node);
74855+}
74856+
74857+/* No-locking version of assign_txnh. Sets the transaction handle's atom pointer,
74858+ increases atom refcount and txnh_count, adds to txnh_list. */
74859+static void capture_assign_txnh_nolock(txn_atom *atom, txn_handle *txnh)
74860+{
74861+ assert("umka-200", atom != NULL);
74862+ assert("umka-201", txnh != NULL);
74863+
74864+ assert_spin_locked(&(txnh->hlock));
74865+ assert_spin_locked(&(atom->alock));
74866+ assert("jmacd-824", txnh->atom == NULL);
74867+ assert("nikita-3540", atom_isopen(atom));
74868+ BUG_ON(txnh->atom != NULL);
74869+
74870+ atomic_inc(&atom->refcount);
74871+ txnh->atom = atom;
74872+ set_gfp_mask();
74873+ list_add_tail(&txnh->txnh_link, &atom->txnh_list);
74874+ atom->txnh_count += 1;
74875+}
74876+
74877+/* No-locking version of assign_block. Sets the block's atom pointer, references the
74878+ block, adds it to the clean or dirty capture_jnode list, increments capture_count. */
74879+static void capture_assign_block_nolock(txn_atom *atom, jnode *node)
74880+{
74881+ assert("umka-202", atom != NULL);
74882+ assert("umka-203", node != NULL);
74883+ assert_spin_locked(&(node->guard));
74884+ assert_spin_locked(&(atom->alock));
74885+ assert("jmacd-323", node->atom == NULL);
74886+ BUG_ON(!list_empty_careful(&node->capture_link));
74887+ assert("nikita-3470", !JF_ISSET(node, JNODE_DIRTY));
74888+
74889+ /* Pointer from jnode to atom is not counted in atom->refcount. */
74890+ node->atom = atom;
74891+
74892+ list_add_tail(&node->capture_link, ATOM_CLEAN_LIST(atom));
74893+ atom->capture_count += 1;
74894+ /* reference to jnode is acquired by atom. */
74895+ jref(node);
74896+
74897+ ON_DEBUG(count_jnode(atom, node, NOT_CAPTURED, CLEAN_LIST, 1));
74898+
74899+ LOCK_CNT_INC(t_refs);
74900+}
74901+
74902+/* common code for dirtying both unformatted jnodes and formatted znodes. */
74903+static void do_jnode_make_dirty(jnode * node, txn_atom * atom)
74904+{
74905+ assert_spin_locked(&(node->guard));
74906+ assert_spin_locked(&(atom->alock));
74907+ assert("jmacd-3981", !JF_ISSET(node, JNODE_DIRTY));
74908+
74909+ JF_SET(node, JNODE_DIRTY);
74910+
74911+ get_current_context()->nr_marked_dirty++;
74912+
74913+	/* We grab2flush_reserve one additional block only if the node was
74914+	   not CREATED and jnode_flush did not sort it into either the
74915+	   relocate set or the overwrite set. If the node is in the overwrite
74916+	   or relocate set we assume that the atom's flush reserved counter
74917+	   was already adjusted. */
74918+ if (!JF_ISSET(node, JNODE_CREATED) && !JF_ISSET(node, JNODE_RELOC)
74919+ && !JF_ISSET(node, JNODE_OVRWR) && jnode_is_leaf(node)
74920+ && !jnode_is_cluster_page(node)) {
74921+ assert("vs-1093", !blocknr_is_fake(&node->blocknr));
74922+ assert("vs-1506", *jnode_get_block(node) != 0);
74923+ grabbed2flush_reserved_nolock(atom, (__u64) 1);
74924+ JF_SET(node, JNODE_FLUSH_RESERVED);
74925+ }
74926+
74927+ if (!JF_ISSET(node, JNODE_FLUSH_QUEUED)) {
74928+		/* Sometimes a node is set dirty before being captured -- the
74929+		   case for new jnodes. In that case the jnode will be added to
74930+		   the appropriate list in capture_assign_block_nolock. Another
74931+		   reason not to re-link the jnode here is that it may be on a
74932+		   flush queue (see flush.c for details), in which case it must
74933+		   stay on that queue. */
74934+
74935+ int level = jnode_get_level(node);
74936+
74937+ assert("nikita-3152", !JF_ISSET(node, JNODE_OVRWR));
74938+ assert("zam-654", atom->stage < ASTAGE_PRE_COMMIT);
74939+ assert("nikita-2607", 0 <= level);
74940+ assert("nikita-2606", level <= REAL_MAX_ZTREE_HEIGHT);
74941+
74942+ /* move node to atom's dirty list */
74943+ list_move_tail(&node->capture_link, ATOM_DIRTY_LIST(atom, level));
74944+ ON_DEBUG(count_jnode
74945+ (atom, node, NODE_LIST(node), DIRTY_LIST, 1));
74946+ }
74947+}
74948+
74949+/* Set the dirty status for this (spin locked) jnode. */
74950+void jnode_make_dirty_locked(jnode * node)
74951+{
74952+ assert("umka-204", node != NULL);
74953+ assert_spin_locked(&(node->guard));
74954+
74955+ if (REISER4_DEBUG && rofs_jnode(node)) {
74956+ warning("nikita-3365", "Dirtying jnode on rofs");
74957+ dump_stack();
74958+ }
74959+
74960+ /* Fast check for already dirty node */
74961+ if (!JF_ISSET(node, JNODE_DIRTY)) {
74962+ txn_atom *atom;
74963+
74964+ atom = jnode_get_atom(node);
74965+ assert("vs-1094", atom);
74966+ /* Check jnode dirty status again because node spin lock might
74967+ * be released inside jnode_get_atom(). */
74968+ if (likely(!JF_ISSET(node, JNODE_DIRTY)))
74969+ do_jnode_make_dirty(node, atom);
74970+ spin_unlock_atom(atom);
74971+ }
74972+}
74973+
74974+/* Set the dirty status for this znode. */
74975+void znode_make_dirty(znode * z)
74976+{
74977+ jnode *node;
74978+ struct page *page;
74979+
74980+ assert("umka-204", z != NULL);
74981+ assert("nikita-3290", znode_above_root(z) || znode_is_loaded(z));
74982+ assert("nikita-3560", znode_is_write_locked(z));
74983+
74984+ node = ZJNODE(z);
74985+ /* znode is longterm locked, we can check dirty bit without spinlock */
74986+ if (JF_ISSET(node, JNODE_DIRTY)) {
74987+ /* znode is dirty already. All we have to do is to change znode version */
74988+ z->version = znode_build_version(jnode_get_tree(node));
74989+ return;
74990+ }
74991+
74992+ spin_lock_jnode(node);
74993+ jnode_make_dirty_locked(node);
74994+ page = jnode_page(node);
74995+ if (page != NULL) {
74996+		/* this is a useful assertion (it allows one to check that no
74997+		 * modifications are lost due to an update of an in-flight page),
74998+		 * but it requires locking the page to check the PG_writeback
74999+		 * bit. */
75000+ /* assert("nikita-3292",
75001+ !PageWriteback(page) || ZF_ISSET(z, JNODE_WRITEBACK)); */
75002+ page_cache_get(page);
75003+
75004+ /* jnode lock is not needed for the rest of
75005+ * znode_set_dirty(). */
75006+ spin_unlock_jnode(node);
75007+		/* reiser4 file write code calls set_page_dirty for
75008+		 * unformatted nodes; for formatted nodes we do it here. */
75009+ set_page_dirty_internal(page);
75010+ page_cache_release(page);
75011+ /* bump version counter in znode */
75012+ z->version = znode_build_version(jnode_get_tree(node));
75013+ } else {
75014+ assert("zam-596", znode_above_root(JZNODE(node)));
75015+ spin_unlock_jnode(node);
75016+ }
75017+
75018+ assert("nikita-1900", znode_is_write_locked(z));
75019+ assert("jmacd-9777", node->atom != NULL);
75020+}
75021+
75022+int sync_atom(txn_atom * atom)
75023+{
75024+ int result;
75025+ txn_handle *txnh;
75026+
75027+ txnh = get_current_context()->trans;
75028+
75029+ result = 0;
75030+ if (atom != NULL) {
75031+ if (atom->stage < ASTAGE_PRE_COMMIT) {
75032+ spin_lock_txnh(txnh);
75033+ capture_assign_txnh_nolock(atom, txnh);
75034+ result = force_commit_atom(txnh);
75035+ } else if (atom->stage < ASTAGE_POST_COMMIT) {
75036+			/* wait for atom commit */
75037+ atom_wait_event(atom);
75038+ /* try once more */
75039+ result = RETERR(-E_REPEAT);
75040+ } else
75041+ spin_unlock_atom(atom);
75042+ }
75043+ return result;
75044+}
75045+
75046+#if REISER4_DEBUG
75047+
75048+/* move a jnode from one list to another;
75049+   call this after atom->capture_count is updated */
75050+void
75051+count_jnode(txn_atom * atom, jnode * node, atom_list old_list,
75052+ atom_list new_list, int check_lists)
75053+{
75054+ struct list_head *pos;
75055+
75056+ assert("zam-1018", atom_is_protected(atom));
75057+ assert_spin_locked(&(node->guard));
75058+ assert("", NODE_LIST(node) == old_list);
75059+
75060+ switch (NODE_LIST(node)) {
75061+ case NOT_CAPTURED:
75062+ break;
75063+ case DIRTY_LIST:
75064+ assert("", atom->dirty > 0);
75065+ atom->dirty--;
75066+ break;
75067+ case CLEAN_LIST:
75068+ assert("", atom->clean > 0);
75069+ atom->clean--;
75070+ break;
75071+ case FQ_LIST:
75072+ assert("", atom->fq > 0);
75073+ atom->fq--;
75074+ break;
75075+ case WB_LIST:
75076+ assert("", atom->wb > 0);
75077+ atom->wb--;
75078+ break;
75079+ case OVRWR_LIST:
75080+ assert("", atom->ovrwr > 0);
75081+ atom->ovrwr--;
75082+ break;
75083+ default:
75084+ impossible("", "");
75085+ }
75086+
75087+ switch (new_list) {
75088+ case NOT_CAPTURED:
75089+ break;
75090+ case DIRTY_LIST:
75091+ atom->dirty++;
75092+ break;
75093+ case CLEAN_LIST:
75094+ atom->clean++;
75095+ break;
75096+ case FQ_LIST:
75097+ atom->fq++;
75098+ break;
75099+ case WB_LIST:
75100+ atom->wb++;
75101+ break;
75102+ case OVRWR_LIST:
75103+ atom->ovrwr++;
75104+ break;
75105+ default:
75106+ impossible("", "");
75107+ }
75108+ ASSIGN_NODE_LIST(node, new_list);
75109+ if (0 && check_lists) {
75110+ int count;
75111+ tree_level level;
75112+
75113+ count = 0;
75114+
75115+ /* flush queue list */
75116+ /*check_fq(atom); */
75117+
75118+ /* dirty list */
75119+ count = 0;
75120+ for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1) {
75121+ list_for_each(pos, ATOM_DIRTY_LIST(atom, level))
75122+ count++;
75123+ }
75124+ if (count != atom->dirty)
75125+ warning("", "dirty counter %d, real %d\n", atom->dirty,
75126+ count);
75127+
75128+ /* clean list */
75129+ count = 0;
75130+ list_for_each(pos, ATOM_CLEAN_LIST(atom))
75131+ count++;
75132+ if (count != atom->clean)
75133+ warning("", "clean counter %d, real %d\n", atom->clean,
75134+ count);
75135+
75136+ /* wb list */
75137+ count = 0;
75138+ list_for_each(pos, ATOM_WB_LIST(atom))
75139+ count++;
75140+ if (count != atom->wb)
75141+ warning("", "wb counter %d, real %d\n", atom->wb,
75142+ count);
75143+
75144+ /* overwrite list */
75145+ count = 0;
75146+ list_for_each(pos, ATOM_OVRWR_LIST(atom))
75147+ count++;
75148+
75149+ if (count != atom->ovrwr)
75150+ warning("", "ovrwr counter %d, real %d\n", atom->ovrwr,
75151+ count);
75152+ }
75153+ assert("vs-1624", atom->num_queued == atom->fq);
75154+ if (atom->capture_count !=
75155+ atom->dirty + atom->clean + atom->ovrwr + atom->wb + atom->fq) {
75156+ printk
75157+ ("count %d, dirty %d clean %d ovrwr %d wb %d fq %d\n",
75158+ atom->capture_count, atom->dirty, atom->clean, atom->ovrwr,
75159+ atom->wb, atom->fq);
75160+ assert("vs-1622",
75161+ atom->capture_count ==
75162+ atom->dirty + atom->clean + atom->ovrwr + atom->wb +
75163+ atom->fq);
75164+ }
75165+}
75166+
75167+#endif
75168+
75169+/* Make node OVRWR and put it on the atom->overwrite_nodes list. The atom
75170+ * lock and jnode lock must be taken before calling this function. */
75171+void jnode_make_wander_nolock(jnode * node)
75172+{
75173+ txn_atom *atom;
75174+
75175+ assert("nikita-2431", node != NULL);
75176+ assert("nikita-2432", !JF_ISSET(node, JNODE_RELOC));
75177+ assert("nikita-3153", JF_ISSET(node, JNODE_DIRTY));
75178+ assert("zam-897", !JF_ISSET(node, JNODE_FLUSH_QUEUED));
75179+ assert("nikita-3367", !blocknr_is_fake(jnode_get_block(node)));
75180+
75181+ atom = node->atom;
75182+
75183+ assert("zam-895", atom != NULL);
75184+ assert("zam-894", atom_is_protected(atom));
75185+
75186+ JF_SET(node, JNODE_OVRWR);
75187+ /* move node to atom's overwrite list */
75188+ list_move_tail(&node->capture_link, ATOM_OVRWR_LIST(atom));
75189+ ON_DEBUG(count_jnode(atom, node, DIRTY_LIST, OVRWR_LIST, 1));
75190+}
75191+
75192+/* Same as jnode_make_wander_nolock, but all necessary locks are taken inside
75193+ * this function. */
75194+void jnode_make_wander(jnode * node)
75195+{
75196+ txn_atom *atom;
75197+
75198+ spin_lock_jnode(node);
75199+ atom = jnode_get_atom(node);
75200+ assert("zam-913", atom != NULL);
75201+ assert("zam-914", !JF_ISSET(node, JNODE_RELOC));
75202+
75203+ jnode_make_wander_nolock(node);
75204+ spin_unlock_atom(atom);
75205+ spin_unlock_jnode(node);
75206+}
75207+
75208+/* this just sets RELOC bit */
75209+static void jnode_make_reloc_nolock(flush_queue_t * fq, jnode * node)
75210+{
75211+ assert_spin_locked(&(node->guard));
75212+ assert("zam-916", JF_ISSET(node, JNODE_DIRTY));
75213+ assert("zam-917", !JF_ISSET(node, JNODE_RELOC));
75214+ assert("zam-918", !JF_ISSET(node, JNODE_OVRWR));
75215+ assert("zam-920", !JF_ISSET(node, JNODE_FLUSH_QUEUED));
75216+ assert("nikita-3367", !blocknr_is_fake(jnode_get_block(node)));
75217+ jnode_set_reloc(node);
75218+}
75219+
75220+/* Make znode RELOC and put it on flush queue */
75221+void znode_make_reloc(znode * z, flush_queue_t * fq)
75222+{
75223+ jnode *node;
75224+ txn_atom *atom;
75225+
75226+ node = ZJNODE(z);
75227+ spin_lock_jnode(node);
75228+
75229+ atom = jnode_get_atom(node);
75230+ assert("zam-919", atom != NULL);
75231+
75232+ jnode_make_reloc_nolock(fq, node);
75233+ queue_jnode(fq, node);
75234+
75235+ spin_unlock_atom(atom);
75236+ spin_unlock_jnode(node);
75237+
75238+}
75239+
75240+/* Make unformatted node RELOC and put it on flush queue */
75241+void unformatted_make_reloc(jnode *node, flush_queue_t *fq)
75242+{
75243+ assert("vs-1479", jnode_is_unformatted(node));
75244+
75245+ jnode_make_reloc_nolock(fq, node);
75246+ queue_jnode(fq, node);
75247+}
75248+
75249+int capture_super_block(struct super_block *s)
75250+{
75251+ int result;
75252+ znode *uber;
75253+ lock_handle lh;
75254+
75255+ init_lh(&lh);
75256+ result = get_uber_znode(get_tree(s),
75257+ ZNODE_WRITE_LOCK, ZNODE_LOCK_LOPRI, &lh);
75258+ if (result)
75259+ return result;
75260+
75261+ uber = lh.node;
75262+ /* Grabbing one block for superblock */
75263+ result = reiser4_grab_space_force((__u64) 1, BA_RESERVED);
75264+	if (result != 0) {
75265+		done_lh(&lh);	/* don't leak the long-term lock on uber */
+		return result;
+	}
75266+
75267+ znode_make_dirty(uber);
75268+
75269+ done_lh(&lh);
75270+ return 0;
75271+}
75272+
75273+/* Wakeup every handle on the atom's WAITFOR list */
75274+static void wakeup_atom_waitfor_list(txn_atom * atom)
75275+{
75276+ txn_wait_links *wlinks;
75277+
75278+ assert("umka-210", atom != NULL);
75279+
75280+ /* atom is locked */
75281+ list_for_each_entry(wlinks, &atom->fwaitfor_list, _fwaitfor_link) {
75282+ if (wlinks->waitfor_cb == NULL ||
75283+ wlinks->waitfor_cb(atom, wlinks))
75284+ /* Wake up. */
75285+ reiser4_wake_up(wlinks->_lock_stack);
75286+ }
75287+}
75288+
75289+/* Wakeup every handle on the atom's WAITING list */
75290+static void wakeup_atom_waiting_list(txn_atom * atom)
75291+{
75292+ txn_wait_links *wlinks;
75293+
75294+ assert("umka-211", atom != NULL);
75295+
75296+ /* atom is locked */
75297+ list_for_each_entry(wlinks, &atom->fwaiting_list, _fwaiting_link) {
75298+ if (wlinks->waiting_cb == NULL ||
75299+ wlinks->waiting_cb(atom, wlinks))
75300+ /* Wake up. */
75301+ reiser4_wake_up(wlinks->_lock_stack);
75302+ }
75303+}
75304+
75305+/* helper function used by capture_fuse_wait() to avoid "spurious wake-ups" */
75306+static int wait_for_fusion(txn_atom * atom, txn_wait_links * wlinks)
75307+{
75308+ assert("nikita-3330", atom != NULL);
75309+ assert_spin_locked(&(atom->alock));
75310+
75311+ /* atom->txnh_count == 1 is for waking waiters up if we are releasing
75312+	 * the last transaction handle. */
75313+ return atom->stage != ASTAGE_CAPTURE_WAIT || atom->txnh_count == 1;
75314+}
75315+
75316+/* The general purpose of this function is to wait on the first of two possible events.
75317+ The situation is that a handle (and its atom atomh) is blocked trying to capture a
75318+ block (i.e., node) but the node's atom (atomf) is in the CAPTURE_WAIT state. The
75319+ handle's atom (atomh) is not in the CAPTURE_WAIT state. However, atomh could fuse with
75320+ another atom or, due to age, enter the CAPTURE_WAIT state itself, at which point it
75321+ needs to unblock the handle to avoid deadlock. When the txnh is unblocked it will
75322+ proceed and fuse the two atoms in the CAPTURE_WAIT state.
75323+
75324+ In other words, if either atomh or atomf change state, the handle will be awakened,
75325+ thus there are two lists per atom: WAITING and WAITFOR.
75326+
75327+   This is also called by capture_assign_txnh with (atomh == NULL) to wait for atomf to
75328+   close when the handle is not yet assigned to an atom of its own.
75329+
75330+ Lock ordering in this method: all four locks are held: JNODE_LOCK, TXNH_LOCK,
75331+ BOTH_ATOM_LOCKS. Result: all four locks are released.
75332+*/
75333+static int capture_fuse_wait(txn_handle * txnh, txn_atom * atomf,
75334+ txn_atom * atomh, txn_capture mode)
75335+{
75336+ int ret;
75337+ txn_wait_links wlinks;
75338+
75339+ assert("umka-213", txnh != NULL);
75340+ assert("umka-214", atomf != NULL);
75341+
75342+ if ((mode & TXN_CAPTURE_NONBLOCKING) != 0) {
75343+ spin_unlock_txnh(txnh);
75344+ spin_unlock_atom(atomf);
75345+
75346+ if (atomh) {
75347+ spin_unlock_atom(atomh);
75348+ }
75349+
75350+ return RETERR(-E_BLOCK);
75351+ }
75352+
75353+ /* Initialize the waiting list links. */
75354+ init_wlinks(&wlinks);
75355+
75356+ /* Add txnh to atomf's waitfor list, unlock atomf. */
75357+ list_add_tail(&wlinks._fwaitfor_link, &atomf->fwaitfor_list);
75358+ wlinks.waitfor_cb = wait_for_fusion;
75359+ atomic_inc(&atomf->refcount);
75360+ spin_unlock_atom(atomf);
75361+
75362+ if (atomh) {
75363+ /* Add txnh to atomh's waiting list, unlock atomh. */
75364+ list_add_tail(&wlinks._fwaiting_link, &atomh->fwaiting_list);
75365+ atomic_inc(&atomh->refcount);
75366+ spin_unlock_atom(atomh);
75367+ }
75368+
75369+ /* Go to sleep. */
75370+ spin_unlock_txnh(txnh);
75371+
75372+ ret = prepare_to_sleep(wlinks._lock_stack);
75373+ if (ret == 0) {
75374+ go_to_sleep(wlinks._lock_stack);
75375+ ret = RETERR(-E_REPEAT);
75376+ }
75377+
75378+ /* Remove from the waitfor list. */
75379+ spin_lock_atom(atomf);
75380+
75381+ list_del(&wlinks._fwaitfor_link);
75382+ atom_dec_and_unlock(atomf);
75383+
75384+ if (atomh) {
75385+ /* Remove from the waiting list. */
75386+ spin_lock_atom(atomh);
75387+ list_del(&wlinks._fwaiting_link);
75388+ atom_dec_and_unlock(atomh);
75389+ }
75390+ return ret;
75391+}
75392+
75393+static void lock_two_atoms(txn_atom * one, txn_atom * two)
75394+{
75395+ assert("zam-1067", one != two);
75396+
75397+ /* lock the atom with lesser address first */
75398+ if (one < two) {
75399+ spin_lock_atom(one);
75400+ spin_lock_atom(two);
75401+ } else {
75402+ spin_lock_atom(two);
75403+ spin_lock_atom(one);
75404+ }
75405+}
75406+
75407+
75408+/* Perform the necessary work to prepare for fusing two atoms, which involves
75409+ * acquiring two atom locks in the proper order.  If the node's atom is
75410+ * blocking fusion (i.e., it is in the CAPTURE_WAIT stage) and the handle's
75411+ * atom is not, then the handle's request is put to sleep.  If the node's atom
75412+ * is committing, then the node can be copy-on-captured.  Otherwise, pick the
75413+ * atom with fewer pointers to be fused into the atom with more pointers and
75414+ * call capture_fuse_into.
75415+ */
75416+static int capture_init_fusion(jnode *node, txn_handle *txnh, txn_capture mode)
75417+{
75418+ txn_atom * txnh_atom = txnh->atom;
75419+ txn_atom * block_atom = node->atom;
75420+
75421+ atomic_inc(&txnh_atom->refcount);
75422+ atomic_inc(&block_atom->refcount);
75423+
75424+ spin_unlock_txnh(txnh);
75425+ spin_unlock_jnode(node);
75426+
75427+ lock_two_atoms(txnh_atom, block_atom);
75428+
75429+ if (txnh->atom != txnh_atom || node->atom != block_atom ) {
75430+ release_two_atoms(txnh_atom, block_atom);
75431+ return RETERR(-E_REPEAT);
75432+ }
75433+
75434+ atomic_dec(&txnh_atom->refcount);
75435+ atomic_dec(&block_atom->refcount);
75436+
75437+ assert ("zam-1066", atom_isopen(txnh_atom));
75438+
75439+ if (txnh_atom->stage >= block_atom->stage ||
75440+ (block_atom->stage == ASTAGE_CAPTURE_WAIT && block_atom->txnh_count == 0)) {
75441+ capture_fuse_into(txnh_atom, block_atom);
75442+ return RETERR(-E_REPEAT);
75443+ }
75444+ spin_lock_txnh(txnh);
75445+ return capture_fuse_wait(txnh, block_atom, txnh_atom, mode);
75446+}
75447+
75448+/* This function splices together two jnode lists (small and large) and sets all jnodes in
75449+ the small list to point to the large atom. Returns the length of the list. */
75450+static int
75451+capture_fuse_jnode_lists(txn_atom *large, struct list_head *large_head,
75452+ struct list_head *small_head)
75453+{
75454+ int count = 0;
75455+ jnode *node;
75456+
75457+ assert("umka-218", large != NULL);
75458+ assert("umka-219", large_head != NULL);
75459+ assert("umka-220", small_head != NULL);
75460+ /* small atom should be locked also. */
75461+ assert_spin_locked(&(large->alock));
75462+
75463+ /* For every jnode on small's capture list... */
75464+ list_for_each_entry(node, small_head, capture_link) {
75465+ count += 1;
75466+
75467+ /* With the jnode lock held, update atom pointer. */
75468+ spin_lock_jnode(node);
75469+ node->atom = large;
75470+ spin_unlock_jnode(node);
75471+ }
75472+
75473+ /* Splice the lists. */
75474+ list_splice_init(small_head, large_head->prev);
75475+
75476+ return count;
75477+}
75478+
75479+/* This function splices together two txnh lists (small and large) and sets all txn handles in
75480+ the small list to point to the large atom. Returns the length of the list. */
75481+static int
75482+capture_fuse_txnh_lists(txn_atom *large, struct list_head *large_head,
75483+ struct list_head *small_head)
75484+{
75485+ int count = 0;
75486+ txn_handle *txnh;
75487+
75488+ assert("umka-221", large != NULL);
75489+ assert("umka-222", large_head != NULL);
75490+ assert("umka-223", small_head != NULL);
75491+
75492+ /* Adjust every txnh to the new atom. */
75493+ list_for_each_entry(txnh, small_head, txnh_link) {
75494+ count += 1;
75495+
75496+ /* With the txnh lock held, update atom pointer. */
75497+ spin_lock_txnh(txnh);
75498+ txnh->atom = large;
75499+ spin_unlock_txnh(txnh);
75500+ }
75501+
75502+ /* Splice the txn_handle list. */
75503+ list_splice_init(small_head, large_head->prev);
75504+
75505+ return count;
75506+}
75507+
75508+/* This function fuses two atoms. The captured nodes and handles belonging to SMALL are
75509+ added to LARGE and their ->atom pointers are all updated. The associated counts are
75510+ updated as well, and any waiting handles belonging to either are awakened. Finally the
75511+ smaller atom's refcount is decremented.
75512+*/
75513+static void capture_fuse_into(txn_atom * small, txn_atom * large)
75514+{
75515+ int level;
75516+ unsigned zcount = 0;
75517+ unsigned tcount = 0;
75518+
75519+ assert("umka-224", small != NULL);
75520+	assert("umka-225", large != NULL);
75521+
75522+ assert_spin_locked(&(large->alock));
75523+ assert_spin_locked(&(small->alock));
75524+
75525+ assert("jmacd-201", atom_isopen(small));
75526+ assert("jmacd-202", atom_isopen(large));
75527+
75528+ /* Splice and update the per-level dirty jnode lists */
75529+ for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1) {
75530+ zcount +=
75531+ capture_fuse_jnode_lists(large,
75532+ ATOM_DIRTY_LIST(large, level),
75533+ ATOM_DIRTY_LIST(small, level));
75534+ }
75535+
75536+ /* Splice and update the [clean,dirty] jnode and txnh lists */
75537+ zcount +=
75538+ capture_fuse_jnode_lists(large, ATOM_CLEAN_LIST(large),
75539+ ATOM_CLEAN_LIST(small));
75540+ zcount +=
75541+ capture_fuse_jnode_lists(large, ATOM_OVRWR_LIST(large),
75542+ ATOM_OVRWR_LIST(small));
75543+ zcount +=
75544+ capture_fuse_jnode_lists(large, ATOM_WB_LIST(large),
75545+ ATOM_WB_LIST(small));
75546+ zcount +=
75547+ capture_fuse_jnode_lists(large, &large->inodes, &small->inodes);
75548+ tcount +=
75549+ capture_fuse_txnh_lists(large, &large->txnh_list,
75550+ &small->txnh_list);
75551+
75552+ /* Check our accounting. */
75553+ assert("jmacd-1063",
75554+ zcount + small->num_queued == small->capture_count);
75555+ assert("jmacd-1065", tcount == small->txnh_count);
75556+
75557+	/* sum the numbers of waiting threads */
75558+ large->nr_waiters += small->nr_waiters;
75559+ small->nr_waiters = 0;
75560+
75561+ /* splice flush queues */
75562+ fuse_fq(large, small);
75563+
75564+	/* update the counters of jnodes on each of the atom's lists */
75565+ ON_DEBUG(large->dirty += small->dirty;
75566+ small->dirty = 0;
75567+ large->clean += small->clean;
75568+ small->clean = 0;
75569+ large->ovrwr += small->ovrwr;
75570+ small->ovrwr = 0;
75571+ large->wb += small->wb;
75572+ small->wb = 0;
75573+ large->fq += small->fq;
75574+ small->fq = 0;);
75575+
75576+ /* count flushers in result atom */
75577+ large->nr_flushers += small->nr_flushers;
75578+ small->nr_flushers = 0;
75579+
75580+ /* update counts of flushed nodes */
75581+ large->flushed += small->flushed;
75582+ small->flushed = 0;
75583+
75584+ /* Transfer list counts to large. */
75585+ large->txnh_count += small->txnh_count;
75586+ large->capture_count += small->capture_count;
75587+
75588+ /* Add all txnh references to large. */
75589+ atomic_add(small->txnh_count, &large->refcount);
75590+ atomic_sub(small->txnh_count, &small->refcount);
75591+
75592+ /* Reset small counts */
75593+ small->txnh_count = 0;
75594+ small->capture_count = 0;
75595+
75596+ /* Assign the oldest start_time, merge flags. */
75597+ large->start_time = min(large->start_time, small->start_time);
75598+ large->flags |= small->flags;
75599+
75600+ /* Merge blocknr sets. */
75601+ blocknr_set_merge(&small->delete_set, &large->delete_set);
75602+ blocknr_set_merge(&small->wandered_map, &large->wandered_map);
75603+
75604+ /* Merge allocated/deleted file counts */
75605+ large->nr_objects_deleted += small->nr_objects_deleted;
75606+ large->nr_objects_created += small->nr_objects_created;
75607+
75608+ small->nr_objects_deleted = 0;
75609+ small->nr_objects_created = 0;
75610+
75611+ /* Merge allocated blocks counts */
75612+ large->nr_blocks_allocated += small->nr_blocks_allocated;
75613+
75614+ large->nr_running_queues += small->nr_running_queues;
75615+ small->nr_running_queues = 0;
75616+
75617+ /* Merge blocks reserved for overwrite set. */
75618+ large->flush_reserved += small->flush_reserved;
75619+ small->flush_reserved = 0;
75620+
75621+ if (large->stage < small->stage) {
75622+ /* Large only needs to notify if it has changed state. */
75623+ atom_set_stage(large, small->stage);
75624+ wakeup_atom_waiting_list(large);
75625+ }
75626+
75627+ atom_set_stage(small, ASTAGE_INVALID);
75628+
75629+ /* Notify any waiters--small needs to unload its wait lists. Waiters
75630+ actually remove themselves from the list before returning from the
75631+ fuse_wait function. */
75632+ wakeup_atom_waiting_list(small);
75633+
75634+ /* Unlock atoms */
75635+ spin_unlock_atom(large);
75636+ atom_dec_and_unlock(small);
75637+}
75638+
75639+/* TXNMGR STUFF */
75640+
75641+/* Release a block from the atom, reversing the effects of being captured;
75642+   do not release the atom's reference to the jnode due to holding spin-locks.
75643+ Currently this is only called when the atom commits.
75644+
75645+ NOTE: this function does not release a (journal) reference to jnode
75646+ due to locking optimizations, you should call jput() somewhere after
75647+ calling uncapture_block(). */
75648+void uncapture_block(jnode * node)
75649+{
75650+ txn_atom *atom;
75651+
75652+ assert("umka-226", node != NULL);
75653+ atom = node->atom;
75654+ assert("umka-228", atom != NULL);
75655+
75656+ assert("jmacd-1021", node->atom == atom);
75657+ assert_spin_locked(&(node->guard));
75658+ assert("jmacd-1023", atom_is_protected(atom));
75659+
75660+ JF_CLR(node, JNODE_DIRTY);
75661+ JF_CLR(node, JNODE_RELOC);
75662+ JF_CLR(node, JNODE_OVRWR);
75663+ JF_CLR(node, JNODE_CREATED);
75664+ JF_CLR(node, JNODE_WRITEBACK);
75665+ JF_CLR(node, JNODE_REPACK);
75666+
75667+ list_del_init(&node->capture_link);
75668+ if (JF_ISSET(node, JNODE_FLUSH_QUEUED)) {
75669+ assert("zam-925", atom_isopen(atom));
75670+ assert("vs-1623", NODE_LIST(node) == FQ_LIST);
75671+ ON_DEBUG(atom->num_queued--);
75672+ JF_CLR(node, JNODE_FLUSH_QUEUED);
75673+ }
75674+ atom->capture_count -= 1;
75675+ ON_DEBUG(count_jnode(atom, node, NODE_LIST(node), NOT_CAPTURED, 1));
75676+ node->atom = NULL;
75677+
75678+ spin_unlock_jnode(node);
75679+ LOCK_CNT_DEC(t_refs);
75680+}
75681+
75682+/* Unconditional insert of jnode into atom's overwrite list. Currently used in
75683+   bitmap-based allocator code for adding modified bitmap blocks to the
75684+   transaction. @atom and @node are spin locked */
75685+void insert_into_atom_ovrwr_list(txn_atom * atom, jnode * node)
75686+{
75687+ assert("zam-538", atom_is_protected(atom));
75688+ assert_spin_locked(&(node->guard));
75689+ assert("zam-899", JF_ISSET(node, JNODE_OVRWR));
75690+ assert("zam-543", node->atom == NULL);
75691+ assert("vs-1433", !jnode_is_unformatted(node) && !jnode_is_znode(node));
75692+
75693+ list_add(&node->capture_link, ATOM_OVRWR_LIST(atom));
75694+ jref(node);
75695+ node->atom = atom;
75696+ atom->capture_count++;
75697+ ON_DEBUG(count_jnode(atom, node, NODE_LIST(node), OVRWR_LIST, 1));
75698+}
75699+
75700+
75701+#if REISER4_DEBUG
75702+
75703+void info_atom(const char *prefix, const txn_atom * atom)
75704+{
75705+ if (atom == NULL) {
75706+ printk("%s: no atom\n", prefix);
75707+ return;
75708+ }
75709+
75710+ printk("%s: refcount: %i id: %i flags: %x txnh_count: %i"
75711+ " capture_count: %i stage: %x start: %lu, flushed: %i\n", prefix,
75712+ atomic_read(&atom->refcount), atom->atom_id, atom->flags,
75713+ atom->txnh_count, atom->capture_count, atom->stage,
75714+ atom->start_time, atom->flushed);
75715+}
75716+
75717+#endif
75718+
75719+static int count_deleted_blocks_actor(txn_atom * atom,
75720+ const reiser4_block_nr * a,
75721+ const reiser4_block_nr * b, void *data)
75722+{
75723+ reiser4_block_nr *counter = data;
75724+
75725+ assert("zam-995", data != NULL);
75726+ assert("zam-996", a != NULL);
75727+ if (b == NULL)
75728+ *counter += 1;
75729+ else
75730+ *counter += *b;
75731+ return 0;
75732+}
75733+
75734+reiser4_block_nr txnmgr_count_deleted_blocks(void)
75735+{
75736+ reiser4_block_nr result;
75737+ txn_mgr *tmgr = &get_super_private(reiser4_get_current_sb())->tmgr;
75738+ txn_atom *atom;
75739+
75740+ result = 0;
75741+
75742+ spin_lock_txnmgr(tmgr);
75743+ list_for_each_entry(atom, &tmgr->atoms_list, atom_link) {
75744+ spin_lock_atom(atom);
75745+ if (atom_isopen(atom))
75746+ blocknr_set_iterator(
75747+ atom, &atom->delete_set,
75748+ count_deleted_blocks_actor, &result, 0);
75749+ spin_unlock_atom(atom);
75750+ }
75751+ spin_unlock_txnmgr(tmgr);
75752+
75753+ return result;
75754+}
75755+
75756+/*
75757+ * Local variables:
75758+ * c-indentation-style: "K&R"
75759+ * mode-name: "LC"
75760+ * c-basic-offset: 8
75761+ * tab-width: 8
75762+ * fill-column: 79
75763+ * End:
75764+ */
75765Index: linux-2.6.16/fs/reiser4/txnmgr.h
75766===================================================================
75767--- /dev/null
75768+++ linux-2.6.16/fs/reiser4/txnmgr.h
75769@@ -0,0 +1,704 @@
75770+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
75771+ * reiser4/README */
75772+
75773+/* data-types and function declarations for transaction manager. See txnmgr.c
75774+ * for details. */
75775+
75776+#ifndef __REISER4_TXNMGR_H__
75777+#define __REISER4_TXNMGR_H__
75778+
75779+#include "forward.h"
75780+#include "dformat.h"
75781+
75782+#include <linux/fs.h>
75783+#include <linux/mm.h>
75784+#include <linux/types.h>
75785+#include <linux/spinlock.h>
75786+#include <asm/atomic.h>
75787+#include <asm/semaphore.h>
75788+
75789+/* TYPE DECLARATIONS */
75790+
75791+/* This enumeration describes the possible types of a capture request (try_capture).
75792+ A capture request dynamically assigns a block to the calling thread's transaction
75793+ handle. */
75794+typedef enum {
75795+ /* A READ_ATOMIC request indicates that a block will be read and that the caller's
75796+ atom should fuse in order to ensure that the block commits atomically with the
75797+ caller. */
75798+ TXN_CAPTURE_READ_ATOMIC = (1 << 0),
75799+
75800+ /* A READ_NONCOM request indicates that a block will be read and that the caller is
75801+ willing to read a non-committed block without causing atoms to fuse. */
75802+ TXN_CAPTURE_READ_NONCOM = (1 << 1),
75803+
75804+ /* A READ_MODIFY request indicates that a block will be read but that the caller
75805+ wishes for the block to be captured as it will be written. This capture request
75806+ mode is not currently used, but eventually it will be useful for preventing
75807+ deadlock in read-modify-write cycles. */
75808+ TXN_CAPTURE_READ_MODIFY = (1 << 2),
75809+
75810+ /* A WRITE capture request indicates that a block will be modified and that atoms
75811+ should fuse to make the commit atomic. */
75812+ TXN_CAPTURE_WRITE = (1 << 3),
75813+
75814+ /* CAPTURE_TYPES is a mask of the four above capture types, used to separate the
75815+ exclusive type designation from extra bits that may be supplied -- see
75816+ below. */
75817+ TXN_CAPTURE_TYPES = (TXN_CAPTURE_READ_ATOMIC |
75818+ TXN_CAPTURE_READ_NONCOM | TXN_CAPTURE_READ_MODIFY |
75819+ TXN_CAPTURE_WRITE),
75820+
75821+ /* A subset of CAPTURE_TYPES, CAPTURE_WTYPES is a mask of request types that
75822+ indicate modification will occur. */
75823+ TXN_CAPTURE_WTYPES = (TXN_CAPTURE_READ_MODIFY | TXN_CAPTURE_WRITE),
75824+
75825+ /* An option to try_capture, NONBLOCKING indicates that the caller would
75826+ prefer not to sleep waiting for an aging atom to commit. */
75827+ TXN_CAPTURE_NONBLOCKING = (1 << 4),
75828+
75829+ /* An option to try_capture to prevent atom fusion, just simple capturing is allowed */
75830+ TXN_CAPTURE_DONT_FUSE = (1 << 5)
75831+
75832+ /* This macro selects only the exclusive capture request types, stripping out any
75833+ options that were supplied (i.e., NONBLOCKING). */
75834+#define CAPTURE_TYPE(x) ((x) & TXN_CAPTURE_TYPES)
75835+} txn_capture;
75836+
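+/* A minimal usage sketch, not part of the original patch: how a capture
+ * request mode is typically composed from one exclusive type plus option
+ * bits, and how CAPTURE_TYPE() strips the options back off.  The helper
+ * name demo_compose_capture_mode is purely illustrative. */
+static inline txn_capture demo_compose_capture_mode(void)
+{
+	/* request a write capture, but refuse to sleep on an aging atom */
+	txn_capture mode = TXN_CAPTURE_WRITE | TXN_CAPTURE_NONBLOCKING;
+
+	/* the mask leaves only the exclusive type, TXN_CAPTURE_WRITE */
+	return CAPTURE_TYPE(mode);
+}
+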
75837+/* There are two kinds of transaction handle: WRITE_FUSING and READ_FUSING, the only
75838+ difference is in the handling of read requests. A WRITE_FUSING transaction handle
75839+   defaults read capture requests to TXN_CAPTURE_READ_NONCOM whereas a READ_FUSING
75840+ transaction handle defaults to TXN_CAPTURE_READ_ATOMIC. */
75841+typedef enum {
75842+ TXN_WRITE_FUSING = (1 << 0),
75843+ TXN_READ_FUSING = (1 << 1) | TXN_WRITE_FUSING, /* READ implies WRITE */
75844+} txn_mode;
75845+
75846+/* Every atom has a stage, which is one of these exclusive values: */
75847+typedef enum {
75848+ /* Initially an atom is free. */
75849+ ASTAGE_FREE = 0,
75850+
75851+ /* An atom begins by entering the CAPTURE_FUSE stage, where it proceeds to capture
75852+ blocks and fuse with other atoms. */
75853+ ASTAGE_CAPTURE_FUSE = 1,
75854+
75855+	/* We need to have an ASTAGE_CAPTURE_SLOW in which an atom fuses with one node for every X nodes it flushes to disk, where X > 1. */
75856+
75857+ /* When an atom reaches a certain age it must do all it can to commit. An atom in
75858+ the CAPTURE_WAIT stage refuses new transaction handles and prevents fusion from
75859+ atoms in the CAPTURE_FUSE stage. */
75860+ ASTAGE_CAPTURE_WAIT = 2,
75861+
75862+ /* Waiting for I/O before commit. Copy-on-capture (see
75863+ http://namesys.com/v4/v4.html). */
75864+ ASTAGE_PRE_COMMIT = 3,
75865+
75866+ /* Post-commit overwrite I/O. Steal-on-capture. */
75867+ ASTAGE_POST_COMMIT = 4,
75868+
75869+	/* Atom which waits for the removal of the last reference to it before
75870+	 * being deleted from memory */
75871+ ASTAGE_DONE = 5,
75872+
75873+ /* invalid atom. */
75874+ ASTAGE_INVALID = 6,
75875+
75876+} txn_stage;
75877+
75878+/* Certain flags may be set in the txn_atom->flags field. */
75879+typedef enum {
75880+ /* Indicates that the atom should commit as soon as possible. */
75881+ ATOM_FORCE_COMMIT = (1 << 0),
75882+	/* to avoid an endless loop, mark an atom (which was considered too
75883+	 * small) after a failed attempt to fuse it. */
75884+ ATOM_CANCEL_FUSION = (1 << 1)
75885+} txn_flags;
75886+
75887+/* Flags for controlling commit_txnh */
75888+typedef enum {
75889+	/* Wait for the atom commit to complete in commit_txnh */
75890+ TXNH_WAIT_COMMIT = 0x2,
75891+ /* Don't commit atom when this handle is closed */
75892+ TXNH_DONT_COMMIT = 0x4
75893+} txn_handle_flags_t;
75894+
75895+/* TYPE DEFINITIONS */
75896+
75897+/* A note on lock ordering: the handle & jnode spinlocks protect reading of their ->atom
75898+ fields, so typically an operation on the atom through either of these objects must (1)
75899+ lock the object, (2) read the atom pointer, (3) lock the atom.
75900+
75901+ During atom fusion, the process holds locks on both atoms at once. Then, it iterates
75902+ through the list of handles and pages held by the smaller of the two atoms. For each
75903+ handle and page referencing the smaller atom, the fusing process must: (1) lock the
75904+ object, and (2) update the atom pointer.
75905+
75906+ You can see that there is a conflict of lock ordering here, so the more-complex
75907+ procedure should have priority, i.e., the fusing process has priority so that it is
75908+ guaranteed to make progress and to avoid restarts.
75909+
75910+   This decision, however, means additional complexity for acquiring the atom lock in the
75911+ first place.
75912+
75913+   The original procedure followed in the code was:
75914+
75915+ TXN_OBJECT *obj = ...;
75916+ TXN_ATOM *atom;
75917+
75918+ spin_lock (& obj->_lock);
75919+
75920+ atom = obj->_atom;
75921+
75922+ if (! spin_trylock_atom (atom))
75923+ {
75924+ spin_unlock (& obj->_lock);
75925+ RESTART OPERATION, THERE WAS A RACE;
75926+ }
75927+
75928+ ELSE YOU HAVE BOTH ATOM AND OBJ LOCKED
75929+
75930+   It has, however, been found that this wastes a lot of CPU in a manner that is
75931+   hard to profile. So proper refcounting was added to atoms, and the new
75932+   standard locking sequence is as follows:
75933+
75934+ TXN_OBJECT *obj = ...;
75935+ TXN_ATOM *atom;
75936+
75937+ spin_lock (& obj->_lock);
75938+
75939+ atom = obj->_atom;
75940+
75941+ if (! spin_trylock_atom (atom))
75942+ {
75943+ atomic_inc (& atom->refcount);
75944+ spin_unlock (& obj->_lock);
75945+ spin_lock (&atom->_lock);
75946+ atomic_dec (& atom->refcount);
75947+ // HERE atom is locked
75948+ spin_unlock (&atom->_lock);
75949+ RESTART OPERATION, THERE WAS A RACE;
75950+ }
75951+
75952+ ELSE YOU HAVE BOTH ATOM AND OBJ LOCKED
75953+
75954+ (core of this is implemented in trylock_throttle() function)
75955+
75956+ See the jnode_get_atom() function for a common case.
75957+
75958+   As an additional (and important) optimization that avoids restarts,
75959+   it is possible to re-check the required pre-conditions at the HERE point in
75960+   the code above and proceed without restarting if they are still satisfied.
75961+*/
75962+
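+/* Editor's illustrative sketch of the pattern above, written against the
+   txn_handle helpers declared later in this file; the precondition re-check
+   at HERE and all error handling are elided, and the caller is left holding
+   both the txnh lock and (if non-NULL) the atom lock. See jnode_get_atom()
+   for the real implementation of this idea.
+
+	static txn_atom *txnh_get_atom_sketch(txn_handle *txnh)
+	{
+		txn_atom *atom;
+
+		for (;;) {
+			spin_lock_txnh(txnh);
+			atom = txnh->atom;
+			if (atom == NULL || spin_trylock_atom(atom))
+				break;	// no atom, or both locks taken
+			// the txnh lock guarantees a live reference, so the
+			// refcount may be bumped without the atom lock
+			atomic_inc(&atom->refcount);
+			spin_unlock_txnh(txnh);
+			spin_lock_atom(atom);		// wait out the race ...
+			atom_dec_and_unlock(atom);	// ... then drop our ref
+			// HERE: preconditions could be re-checked instead of
+			// unconditionally restarting
+		}
+		return atom;
+	}
+*/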
75963+/* A block number set consists of only the list head. */
75964+struct blocknr_set {
75965+ struct list_head entries;
75966+};
75967+
75968+/* An atomic transaction: this is the underlying system representation
75969+ of a transaction, not the one seen by clients.
75970+
75971+ Invariants involving this data-type:
75972+
75973+ [sb-fake-allocated]
75974+*/
75975+struct txn_atom {
75976+ /* The spinlock protecting the atom, held during fusion and various other state
75977+ changes. */
75978+ spinlock_t alock;
75979+
75980+	/* The atom's reference counter. Increasing it (in case of a duplication
75981+	   of an existing reference, or when we are sure that some other
75982+	   reference exists) may be done without taking the spinlock; decrementing
75983+	   the reference counter requires the spinlock to be held.
75984+
75985+ Each transaction handle counts in ->refcount. All jnodes count as
75986+ one reference acquired in atom_begin_andlock(), released in
75987+ commit_current_atom().
75988+ */
75989+ atomic_t refcount;
75990+
75991+ /* The atom_id identifies the atom in persistent records such as the log. */
75992+ __u32 atom_id;
75993+
75994+ /* Flags holding any of the txn_flags enumerated values (e.g.,
75995+ ATOM_FORCE_COMMIT). */
75996+ __u32 flags;
75997+
75998+ /* Number of open handles. */
75999+ __u32 txnh_count;
76000+
76001+ /* The number of znodes captured by this atom. Equal to the sum of lengths of the
76002+ dirty_nodes[level] and clean_nodes lists. */
76003+ __u32 capture_count;
76004+
76005+#if REISER4_DEBUG
76006+ int clean;
76007+ int dirty;
76008+ int ovrwr;
76009+ int wb;
76010+ int fq;
76011+#endif
76012+
76013+ __u32 flushed;
76014+
76015+ /* Current transaction stage. */
76016+ txn_stage stage;
76017+
76018+ /* Start time. */
76019+ unsigned long start_time;
76020+
76021+ /* The atom's delete set. It collects block numbers of the nodes
76022+ which were deleted during the transaction. */
76023+ blocknr_set delete_set;
76024+
76025+ /* The atom's wandered_block mapping. */
76026+ blocknr_set wandered_map;
76027+
76028+	/* The transaction's lists of dirty captured nodes, one per tree level,
76029+	   indexed by level. dirty_nodes[0] is for the znode-above-root */
76030+ struct list_head dirty_nodes[REAL_MAX_ZTREE_HEIGHT + 1];
76031+
76032+ /* The transaction's list of clean captured nodes. */
76033+ struct list_head clean_nodes;
76034+
76035+ /* The atom's overwrite set */
76036+ struct list_head ovrwr_nodes;
76037+
76038+ /* nodes which are being written to disk */
76039+ struct list_head writeback_nodes;
76040+
76041+ /* list of inodes */
76042+ struct list_head inodes;
76043+
76044+ /* List of handles associated with this atom. */
76045+ struct list_head txnh_list;
76046+
76047+ /* Transaction list link: list of atoms in the transaction manager. */
76048+ struct list_head atom_link;
76049+
76050+ /* List of handles waiting FOR this atom: see 'capture_fuse_wait' comment. */
76051+ struct list_head fwaitfor_list;
76052+
76053+ /* List of this atom's handles that are waiting: see 'capture_fuse_wait' comment. */
76054+ struct list_head fwaiting_list;
76055+
76056+	/* Numbers of objects which were deleted/created in this transaction,
76057+	   and thereby the numbers of object IDs which were released/allocated. */
76058+ int nr_objects_deleted;
76059+ int nr_objects_created;
76060+ /* number of blocks allocated during the transaction */
76061+ __u64 nr_blocks_allocated;
76062+ /* All atom's flush queue objects are on this list */
76063+ struct list_head flush_queues;
76064+#if REISER4_DEBUG
76065+ /* number of flush queues for this atom. */
76066+ int nr_flush_queues;
76067+ /* Number of jnodes which were removed from atom's lists and put
76068+ on flush_queue */
76069+ int num_queued;
76070+#endif
76071+ /* number of threads who wait for this atom to complete commit */
76072+ int nr_waiters;
76073+ /* number of threads which do jnode_flush() over this atom */
76074+ int nr_flushers;
76075+ /* number of flush queues which are IN_USE and jnodes from fq->prepped
76076+ are submitted to disk by the write_fq() routine. */
76077+ int nr_running_queues;
76078+ /* A counter of grabbed unformatted nodes, see a description of the
76079+ * reiser4 space reservation scheme at block_alloc.c */
76080+ reiser4_block_nr flush_reserved;
76081+#if REISER4_DEBUG
76082+ void *committer;
76083+#endif
76084+ struct super_block *super;
76085+};
76086+
76087+#define ATOM_DIRTY_LIST(atom, level) (&(atom)->dirty_nodes[level])
76088+#define ATOM_CLEAN_LIST(atom) (&(atom)->clean_nodes)
76089+#define ATOM_OVRWR_LIST(atom) (&(atom)->ovrwr_nodes)
76090+#define ATOM_WB_LIST(atom) (&(atom)->writeback_nodes)
76091+#define ATOM_FQ_LIST(fq) (&(fq)->prepped)
76092+
76093+#define NODE_LIST(node) (node)->list
76094+#define ASSIGN_NODE_LIST(node, list) ON_DEBUG(NODE_LIST(node) = list)
76095+ON_DEBUG(void
76096+ count_jnode(txn_atom *, jnode *, atom_list old_list,
76097+ atom_list new_list, int check_lists));
76098+
76099+typedef struct protected_jnodes {
76100+	struct list_head inatom; /* link to the atom's list of these structures */
76101+ struct list_head nodes; /* head of list of protected nodes */
76102+} protected_jnodes;
76103+
76104+/* A transaction handle: the client obtains and commits this handle which is assigned by
76105+ the system to a txn_atom. */
76106+struct txn_handle {
76107+ /* Spinlock protecting ->atom pointer */
76108+ spinlock_t hlock;
76109+
76110+ /* Flags for controlling commit_txnh() behavior */
76111+ /* from txn_handle_flags_t */
76112+ txn_handle_flags_t flags;
76113+
76114+ /* Whether it is READ_FUSING or WRITE_FUSING. */
76115+ txn_mode mode;
76116+
76117+ /* If assigned, the atom it is part of. */
76118+ txn_atom *atom;
76119+
76120+ /* Transaction list link. Head is in txn_atom. */
76121+ struct list_head txnh_link;
76122+};
76123+
76124+/* The transaction manager: one is contained in the reiser4_super_info_data */
76125+struct txn_mgr {
76126+ /* A spinlock protecting the atom list, id_count, flush_control */
76127+ spinlock_t tmgr_lock;
76128+
76129+ /* List of atoms. */
76130+ struct list_head atoms_list;
76131+
76132+ /* Number of atoms. */
76133+ int atom_count;
76134+
76135+ /* A counter used to assign atom->atom_id values. */
76136+ __u32 id_count;
76137+
76138+ /* a semaphore object for commit serialization */
76139+ struct semaphore commit_semaphore;
76140+
76141+	/* a list of all txnmgrs served by a particular daemon. */
76142+ struct list_head linkage;
76143+
76144+ /* description of daemon for this txnmgr */
76145+ ktxnmgrd_context *daemon;
76146+
76147+ /* parameters. Adjustable through mount options. */
76148+ unsigned int atom_max_size;
76149+ unsigned int atom_max_age;
76150+ unsigned int atom_min_size;
76151+ /* max number of concurrent flushers for one atom, 0 - unlimited. */
76152+ unsigned int atom_max_flushers;
76153+ struct dentry *debugfs_atom_count;
76154+ struct dentry *debugfs_id_count;
76155+};
76156+
76157+/* FUNCTION DECLARATIONS */
76158+
76159+/* These are the externally (within Reiser4) visible transaction functions, therefore they
76160+ are prefixed with "txn_". For comments, see txnmgr.c. */
76161+
76162+extern int init_txnmgr_static(void);
76163+extern void done_txnmgr_static(void);
76164+
76165+extern void init_txnmgr(txn_mgr *);
76166+extern void done_txnmgr(txn_mgr *);
76167+
76168+extern int txn_reserve(int reserved);
76169+
76170+extern void txn_begin(reiser4_context * context);
76171+extern int txn_end(reiser4_context * context);
76172+
76173+extern void txn_restart(reiser4_context * context);
76174+extern void txn_restart_current(void);
76175+
76176+extern int txnmgr_force_commit_all(struct super_block *, int);
76177+extern int current_atom_should_commit(void);
76178+
76179+extern jnode *find_first_dirty_jnode(txn_atom *, int);
76180+
76181+extern int commit_some_atoms(txn_mgr *);
76182+extern int force_commit_atom(txn_handle *);
76183+extern int flush_current_atom(int, long, long *, txn_atom **, jnode *);
76184+
76185+extern int flush_some_atom(jnode *, long *, const struct writeback_control *, int);
76186+
76187+extern void atom_set_stage(txn_atom * atom, txn_stage stage);
76188+
76189+extern int same_slum_check(jnode * base, jnode * check, int alloc_check,
76190+ int alloc_value);
76191+extern void atom_dec_and_unlock(txn_atom * atom);
76192+
76193+extern int try_capture(jnode * node, znode_lock_mode mode, txn_capture flags);
76194+extern int try_capture_page_to_invalidate(struct page *pg);
76195+
76196+extern void uncapture_page(struct page *pg);
76197+extern void uncapture_block(jnode *);
76198+extern void uncapture_jnode(jnode *);
76199+
76200+extern int capture_inode(struct inode *);
76201+extern int uncapture_inode(struct inode *);
76202+
76203+extern txn_atom *get_current_atom_locked_nocheck(void);
76204+
76205+#if REISER4_DEBUG
76206+
76207+/**
76208+ * atom_is_protected - make sure that nobody but us can do anything with atom
76209+ * @atom: atom to be checked
76210+ *
76211+ * This is used to assert that atom either entered commit stages or is spin
76212+ * locked.
76213+ */
76214+static inline int atom_is_protected(txn_atom *atom)
76215+{
76216+ if (atom->stage >= ASTAGE_PRE_COMMIT)
76217+ return 1;
76218+ assert_spin_locked(&(atom->alock));
76219+ return 1;
76220+}
76221+
76222+#endif
76223+
76224+/* Get the current atom and spin-lock it. The current atom must be present, so this never returns NULL. */
76225+static inline txn_atom *get_current_atom_locked(void)
76226+{
76227+ txn_atom *atom;
76228+
76229+ atom = get_current_atom_locked_nocheck();
76230+ assert("zam-761", atom != NULL);
76231+
76232+ return atom;
76233+}
76234+
76235+extern txn_atom *jnode_get_atom(jnode *);
76236+
76237+extern void atom_wait_event(txn_atom *);
76238+extern void atom_send_event(txn_atom *);
76239+
76240+extern void insert_into_atom_ovrwr_list(txn_atom * atom, jnode * node);
76241+extern int capture_super_block(struct super_block *s);
76242+int capture_bulk(jnode **, int count);
76243+
76244+/* See the comment on the function blocknrset.c:blocknr_set_add for the
76245+ calling convention of these three routines. */
76246+extern void blocknr_set_init(blocknr_set * bset);
76247+extern void blocknr_set_destroy(blocknr_set * bset);
76248+extern void blocknr_set_merge(blocknr_set * from, blocknr_set * into);
76249+extern int blocknr_set_add_extent(txn_atom * atom,
76250+ blocknr_set * bset,
76251+ blocknr_set_entry ** new_bsep,
76252+ const reiser4_block_nr * start,
76253+ const reiser4_block_nr * len);
76254+extern int blocknr_set_add_pair(txn_atom * atom, blocknr_set * bset,
76255+ blocknr_set_entry ** new_bsep,
76256+ const reiser4_block_nr * a,
76257+ const reiser4_block_nr * b);
76258+
76259+typedef int (*blocknr_set_actor_f) (txn_atom *, const reiser4_block_nr *,
76260+ const reiser4_block_nr *, void *);
76261+
76262+extern int blocknr_set_iterator(txn_atom * atom, blocknr_set * bset,
76263+ blocknr_set_actor_f actor, void *data,
76264+ int delete);
76265+
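+/* Editor's illustrative sketch: a blocknr_set actor that merely counts the
+   (start, len) entries of an atom's delete set. The return convention of the
+   actor (0 to continue) and the meaning of a zero "delete" argument (keep
+   entries while iterating) are assumptions here; blocknrset.c is
+   authoritative.
+
+	static int count_entry(txn_atom *atom, const reiser4_block_nr *start,
+			       const reiser4_block_nr *len, void *data)
+	{
+		++*(int *)data;
+		return 0;
+	}
+
+	// with the atom lock held:
+	int nr = 0;
+	blocknr_set_iterator(atom, &atom->delete_set, count_entry, &nr, 0);
+*/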
76266+/* flush code takes care of fusing flush queues */
76267+extern void flush_init_atom(txn_atom * atom);
76268+extern void flush_fuse_queues(txn_atom * large, txn_atom * small);
76269+
76270+static inline void spin_lock_atom(txn_atom *atom)
76271+{
76272+ /* check that spinlocks of lower priorities are not held */
76273+ assert("", (LOCK_CNT_NIL(spin_locked_txnh) &&
76274+ LOCK_CNT_NIL(spin_locked_jnode) &&
76275+ LOCK_CNT_NIL(spin_locked_zlock) &&
76276+ LOCK_CNT_NIL(rw_locked_dk) &&
76277+ LOCK_CNT_NIL(rw_locked_tree)));
76278+
76279+ spin_lock(&(atom->alock));
76280+
76281+ LOCK_CNT_INC(spin_locked_atom);
76282+ LOCK_CNT_INC(spin_locked);
76283+}
76284+
76285+static inline int spin_trylock_atom(txn_atom *atom)
76286+{
76287+ if (spin_trylock(&(atom->alock))) {
76288+ LOCK_CNT_INC(spin_locked_atom);
76289+ LOCK_CNT_INC(spin_locked);
76290+ return 1;
76291+ }
76292+ return 0;
76293+}
76294+
76295+static inline void spin_unlock_atom(txn_atom *atom)
76296+{
76297+ assert_spin_locked(&(atom->alock));
76298+ assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_atom));
76299+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
76300+
76301+ LOCK_CNT_DEC(spin_locked_atom);
76302+ LOCK_CNT_DEC(spin_locked);
76303+
76304+ spin_unlock(&(atom->alock));
76305+}
76306+
76307+static inline void spin_lock_txnh(txn_handle *txnh)
76308+{
76309+ /* check that spinlocks of lower priorities are not held */
76310+ assert("", (LOCK_CNT_NIL(rw_locked_dk) &&
76311+ LOCK_CNT_NIL(spin_locked_zlock) &&
76312+ LOCK_CNT_NIL(rw_locked_tree)));
76313+
76314+ spin_lock(&(txnh->hlock));
76315+
76316+ LOCK_CNT_INC(spin_locked_txnh);
76317+ LOCK_CNT_INC(spin_locked);
76318+}
76319+
76320+static inline int spin_trylock_txnh(txn_handle *txnh)
76321+{
76322+ if (spin_trylock(&(txnh->hlock))) {
76323+ LOCK_CNT_INC(spin_locked_txnh);
76324+ LOCK_CNT_INC(spin_locked);
76325+ return 1;
76326+ }
76327+ return 0;
76328+}
76329+
76330+static inline void spin_unlock_txnh(txn_handle *txnh)
76331+{
76332+ assert_spin_locked(&(txnh->hlock));
76333+ assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_txnh));
76334+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
76335+
76336+ LOCK_CNT_DEC(spin_locked_txnh);
76337+ LOCK_CNT_DEC(spin_locked);
76338+
76339+ spin_unlock(&(txnh->hlock));
76340+}
76341+
76342+#define spin_ordering_pred_txnmgr(tmgr) \
76343+ ( LOCK_CNT_NIL(spin_locked_atom) && \
76344+ LOCK_CNT_NIL(spin_locked_txnh) && \
76345+ LOCK_CNT_NIL(spin_locked_jnode) && \
76346+ LOCK_CNT_NIL(rw_locked_zlock) && \
76347+ LOCK_CNT_NIL(rw_locked_dk) && \
76348+ LOCK_CNT_NIL(rw_locked_tree) )
76349+
76350+static inline void spin_lock_txnmgr(txn_mgr *mgr)
76351+{
76352+ /* check that spinlocks of lower priorities are not held */
76353+ assert("", (LOCK_CNT_NIL(spin_locked_atom) &&
76354+ LOCK_CNT_NIL(spin_locked_txnh) &&
76355+ LOCK_CNT_NIL(spin_locked_jnode) &&
76356+ LOCK_CNT_NIL(spin_locked_zlock) &&
76357+ LOCK_CNT_NIL(rw_locked_dk) &&
76358+ LOCK_CNT_NIL(rw_locked_tree)));
76359+
76360+ spin_lock(&(mgr->tmgr_lock));
76361+
76362+ LOCK_CNT_INC(spin_locked_txnmgr);
76363+ LOCK_CNT_INC(spin_locked);
76364+}
76365+
76366+static inline int spin_trylock_txnmgr(txn_mgr *mgr)
76367+{
76368+ if (spin_trylock(&(mgr->tmgr_lock))) {
76369+ LOCK_CNT_INC(spin_locked_txnmgr);
76370+ LOCK_CNT_INC(spin_locked);
76371+ return 1;
76372+ }
76373+ return 0;
76374+}
76375+
76376+static inline void spin_unlock_txnmgr(txn_mgr *mgr)
76377+{
76378+ assert_spin_locked(&(mgr->tmgr_lock));
76379+ assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_txnmgr));
76380+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
76381+
76382+ LOCK_CNT_DEC(spin_locked_txnmgr);
76383+ LOCK_CNT_DEC(spin_locked);
76384+
76385+ spin_unlock(&(mgr->tmgr_lock));
76386+}
76387+
76388+typedef enum {
76389+ FQ_IN_USE = 0x1
76390+} flush_queue_state_t;
76391+
76392+typedef struct flush_queue flush_queue_t;
76393+
76394+/* This is an accumulator for jnodes prepared for writing to disk. A flush queue
76395+ is filled by the jnode_flush() routine, and written to disk under memory
76396+ pressure or at atom commit time. */
76397+/* LOCKING: fq state and fq->atom are protected by the guard spinlock; the
76398+   fq->nr_queued field and fq->prepped list can be modified if the atom is
76399+   spin-locked and the fq object is in the "in-use" state. For read-only
76400+   traversal of the fq->prepped list and reading of the fq->nr_queued field it
76401+   is enough to keep fq "in-use" or only to have the atom spin-locked. */
76402+struct flush_queue {
76403+ /* linkage element is the first in this structure to make debugging
76404+ easier. See field in atom struct for description of list. */
76405+ struct list_head alink;
76406+ /* A spinlock to protect changes of fq state and fq->atom pointer */
76407+ spinlock_t guard;
76408+ /* flush_queue state: [in_use | ready] */
76409+ flush_queue_state_t state;
76410+ /* A list which contains queued nodes, queued nodes are removed from any
76411+ * atom's list and put on this ->prepped one. */
76412+ struct list_head prepped;
76413+ /* number of submitted i/o requests */
76414+ atomic_t nr_submitted;
76415+ /* number of i/o errors */
76416+ atomic_t nr_errors;
76417+ /* An atom this flush queue is attached to */
76418+ txn_atom *atom;
76419+ /* A semaphore for waiting on i/o completion */
76420+ struct semaphore io_sem;
76421+#if REISER4_DEBUG
76422+ /* A thread which took this fq in exclusive use, NULL if fq is free,
76423+ * used for debugging. */
76424+ struct task_struct *owner;
76425+#endif
76426+};
76427+
76428+extern int fq_by_atom(txn_atom *, flush_queue_t **);
76429+extern void fq_put_nolock(flush_queue_t *);
76430+extern void fq_put(flush_queue_t *);
76431+extern void fuse_fq(txn_atom * to, txn_atom * from);
76432+extern void queue_jnode(flush_queue_t *, jnode *);
76433+extern void mark_jnode_queued(flush_queue_t *, jnode *);
76434+
76435+extern int write_fq(flush_queue_t *, long *, int);
76436+extern int current_atom_finish_all_fq(void);
76437+extern void init_atom_fq_parts(txn_atom *);
76438+
76439+extern reiser4_block_nr txnmgr_count_deleted_blocks(void);
76440+
76441+extern void znode_make_dirty(znode * node);
76442+extern void jnode_make_dirty_locked(jnode * node);
76443+
76444+extern int sync_atom(txn_atom * atom);
76445+
76446+#if REISER4_DEBUG
76447+extern int atom_fq_parts_are_clean(txn_atom *);
76448+#endif
76449+
76450+extern void add_fq_to_bio(flush_queue_t *, struct bio *);
76451+extern flush_queue_t *get_fq_for_current_atom(void);
76452+
76453+void protected_jnodes_init(protected_jnodes * list);
76454+void protected_jnodes_done(protected_jnodes * list);
76455+void invalidate_list(struct list_head * head);
76456+
76457+#if REISER4_DEBUG
76458+void info_atom(const char *prefix, const txn_atom * atom);
76459+#else
76460+#define info_atom(p,a) noop
76461+#endif
76462+
76463+#endif /* __REISER4_TXNMGR_H__ */
76464+
76465+/* Make Linus happy.
76466+ Local variables:
76467+ c-indentation-style: "K&R"
76468+ mode-name: "LC"
76469+ c-basic-offset: 8
76470+ tab-width: 8
76471+ fill-column: 120
76472+ End:
76473+*/
76474Index: linux-2.6.16/fs/reiser4/type_safe_hash.h
76475===================================================================
76476--- /dev/null
76477+++ linux-2.6.16/fs/reiser4/type_safe_hash.h
76478@@ -0,0 +1,320 @@
76479+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
76480+ * reiser4/README */
76481+
76482+/* A hash table class that uses hash chains (singly-linked) and is
76483+ parametrized to provide type safety. */
76484+
76485+#ifndef __REISER4_TYPE_SAFE_HASH_H__
76486+#define __REISER4_TYPE_SAFE_HASH_H__
76487+
76488+#include "debug.h"
76489+
76490+#include <asm/errno.h>
76491+/* Step 1: Use TYPE_SAFE_HASH_DECLARE() to define the TABLE and LINK objects
76492+ based on the object type. You need to declare the item type before
76493+ this definition, define it after this definition. */
76494+#define TYPE_SAFE_HASH_DECLARE(PREFIX,ITEM_TYPE) \
76495+ \
76496+typedef struct PREFIX##_hash_table_ PREFIX##_hash_table; \
76497+typedef struct PREFIX##_hash_link_ PREFIX##_hash_link; \
76498+ \
76499+struct PREFIX##_hash_table_ \
76500+{ \
76501+ ITEM_TYPE **_table; \
76502+ __u32 _buckets; \
76503+}; \
76504+ \
76505+struct PREFIX##_hash_link_ \
76506+{ \
76507+ ITEM_TYPE *_next; \
76508+}
76509+
76510+/* Step 2: Define the object type of the hash: give it a field of type
76511+ PREFIX_hash_link. */
76512+
76513+/* Step 3: Use TYPE_SAFE_HASH_DEFINE to define the hash table interface using
76514+   the type and field name used in step 2. The arguments are:
76515+
76516+ ITEM_TYPE The item type being hashed
76517+ KEY_TYPE The type of key being hashed
76518+ KEY_NAME The name of the key field within the item
76519+   LINK_NAME The name of the link field within the item, which you must make of type PREFIX_hash_link
76520+   HASH_FUNC The name of the hash function (or macro; takes the table and a const pointer to the key)
76521+ EQ_FUNC The name of the equality function (or macro, takes const pointer to two keys)
76522+
76523+ It implements these functions:
76524+
76525+ prefix_hash_init Initialize the table given its size.
76526+ prefix_hash_insert Insert an item
76527+ prefix_hash_insert_index Insert an item w/ precomputed hash_index
76528+ prefix_hash_find Find an item by key
76529+ prefix_hash_find_index Find an item w/ precomputed hash_index
76530+ prefix_hash_remove Remove an item, returns 1 if found, 0 if not found
76531+ prefix_hash_remove_index Remove an item w/ precomputed hash_index
76532+
76533+ If you'd like something to be done differently, feel free to ask me
76534+ for modifications. Additional features that could be added but
76535+ have not been:
76536+
76537+ prefix_hash_remove_key Find and remove an item by key
76538+ prefix_hash_remove_key_index Find and remove an item by key w/ precomputed hash_index
76539+
76540+   The hash function currently receives the hash table and the key as
76541+   arguments, so it can take the number of buckets from the table. If this
76542+   is a problem let me know.
76543+
76544+   This hash table uses a singly-linked hash chain. This means
76545+ insertion is fast but deletion requires searching the chain.
76546+
76547+ There is also the doubly-linked hash chain approach, under which
76548+ deletion requires no search but the code is longer and it takes two
76549+ pointers per item.
76550+
76551+ The circularly-linked approach has the shortest code but requires
76552+ two pointers per bucket, doubling the size of the bucket array (in
76553+ addition to two pointers per item).
76554+*/
76555+#define TYPE_SAFE_HASH_DEFINE(PREFIX,ITEM_TYPE,KEY_TYPE,KEY_NAME,LINK_NAME,HASH_FUNC,EQ_FUNC) \
76556+ \
76557+static __inline__ void \
76558+PREFIX##_check_hash (PREFIX##_hash_table *table UNUSED_ARG, \
76559+ __u32 hash UNUSED_ARG) \
76560+{ \
76561+ assert("nikita-2780", hash < table->_buckets); \
76562+} \
76563+ \
76564+static __inline__ int \
76565+PREFIX##_hash_init (PREFIX##_hash_table *hash, \
76566+ __u32 buckets) \
76567+{ \
76568+ hash->_table = (ITEM_TYPE**) KMALLOC (sizeof (ITEM_TYPE*) * buckets); \
76569+ hash->_buckets = buckets; \
76570+ if (hash->_table == NULL) \
76571+ { \
76572+ return RETERR(-ENOMEM); \
76573+ } \
76574+ memset (hash->_table, 0, sizeof (ITEM_TYPE*) * buckets); \
76575+ ON_DEBUG(printk(#PREFIX "_hash_table: %i buckets\n", buckets)); \
76576+ return 0; \
76577+} \
76578+ \
76579+static __inline__ void \
76580+PREFIX##_hash_done (PREFIX##_hash_table *hash) \
76581+{ \
76582+ if (REISER4_DEBUG && hash->_table != NULL) { \
76583+ __u32 i; \
76584+ for (i = 0 ; i < hash->_buckets ; ++ i) \
76585+ assert("nikita-2905", hash->_table[i] == NULL); \
76586+ } \
76587+ if (hash->_table != NULL) \
76588+ KFREE (hash->_table, sizeof (ITEM_TYPE*) * hash->_buckets); \
76589+ hash->_table = NULL; \
76590+} \
76591+ \
76592+static __inline__ void \
76593+PREFIX##_hash_prefetch_next (ITEM_TYPE *item) \
76594+{ \
76595+ prefetch(item->LINK_NAME._next); \
76596+} \
76597+ \
76598+static __inline__ void \
76599+PREFIX##_hash_prefetch_bucket (PREFIX##_hash_table *hash, \
76600+ __u32 index) \
76601+{ \
76602+ prefetch(hash->_table[index]); \
76603+} \
76604+ \
76605+static __inline__ ITEM_TYPE* \
76606+PREFIX##_hash_find_index (PREFIX##_hash_table *hash, \
76607+ __u32 hash_index, \
76608+ KEY_TYPE const *find_key) \
76609+{ \
76610+ ITEM_TYPE *item; \
76611+ \
76612+ PREFIX##_check_hash(hash, hash_index); \
76613+ \
76614+ for (item = hash->_table[hash_index]; \
76615+ item != NULL; \
76616+ item = item->LINK_NAME._next) \
76617+ { \
76618+ prefetch(item->LINK_NAME._next); \
76619+ prefetch(item->LINK_NAME._next + offsetof(ITEM_TYPE, KEY_NAME)); \
76620+ if (EQ_FUNC (& item->KEY_NAME, find_key)) \
76621+ { \
76622+ return item; \
76623+ } \
76624+ } \
76625+ \
76626+ return NULL; \
76627+} \
76628+ \
76629+static __inline__ ITEM_TYPE* \
76630+PREFIX##_hash_find_index_lru (PREFIX##_hash_table *hash, \
76631+ __u32 hash_index, \
76632+ KEY_TYPE const *find_key) \
76633+{ \
76634+ ITEM_TYPE ** item = &hash->_table[hash_index]; \
76635+ \
76636+ PREFIX##_check_hash(hash, hash_index); \
76637+ \
76638+ while (*item != NULL) { \
76639+ prefetch(&(*item)->LINK_NAME._next); \
76640+ if (EQ_FUNC (&(*item)->KEY_NAME, find_key)) { \
76641+ ITEM_TYPE *found; \
76642+ \
76643+ found = *item; \
76644+ *item = found->LINK_NAME._next; \
76645+ found->LINK_NAME._next = hash->_table[hash_index]; \
76646+ hash->_table[hash_index] = found; \
76647+ return found; \
76648+ } \
76649+ item = &(*item)->LINK_NAME._next; \
76650+ } \
76651+ return NULL; \
76652+} \
76653+ \
76654+static __inline__ int \
76655+PREFIX##_hash_remove_index (PREFIX##_hash_table *hash, \
76656+ __u32 hash_index, \
76657+ ITEM_TYPE *del_item) \
76658+{ \
76659+ ITEM_TYPE ** hash_item_p = &hash->_table[hash_index]; \
76660+ \
76661+ PREFIX##_check_hash(hash, hash_index); \
76662+ \
76663+ while (*hash_item_p != NULL) { \
76664+ prefetch(&(*hash_item_p)->LINK_NAME._next); \
76665+ if (*hash_item_p == del_item) { \
76666+ *hash_item_p = (*hash_item_p)->LINK_NAME._next; \
76667+ return 1; \
76668+ } \
76669+ hash_item_p = &(*hash_item_p)->LINK_NAME._next; \
76670+ } \
76671+ return 0; \
76672+} \
76673+ \
76674+static __inline__ void \
76675+PREFIX##_hash_insert_index (PREFIX##_hash_table *hash, \
76676+ __u32 hash_index, \
76677+ ITEM_TYPE *ins_item) \
76678+{ \
76679+ PREFIX##_check_hash(hash, hash_index); \
76680+ \
76681+ ins_item->LINK_NAME._next = hash->_table[hash_index]; \
76682+ hash->_table[hash_index] = ins_item; \
76683+} \
76684+ \
76685+static __inline__ void \
76686+PREFIX##_hash_insert_index_rcu (PREFIX##_hash_table *hash, \
76687+ __u32 hash_index, \
76688+ ITEM_TYPE *ins_item) \
76689+{ \
76690+ PREFIX##_check_hash(hash, hash_index); \
76691+ \
76692+ ins_item->LINK_NAME._next = hash->_table[hash_index]; \
76693+ smp_wmb(); \
76694+ hash->_table[hash_index] = ins_item; \
76695+} \
76696+ \
76697+static __inline__ ITEM_TYPE* \
76698+PREFIX##_hash_find (PREFIX##_hash_table *hash, \
76699+ KEY_TYPE const *find_key) \
76700+{ \
76701+ return PREFIX##_hash_find_index (hash, HASH_FUNC(hash, find_key), find_key); \
76702+} \
76703+ \
76704+static __inline__ ITEM_TYPE* \
76705+PREFIX##_hash_find_lru (PREFIX##_hash_table *hash, \
76706+ KEY_TYPE const *find_key) \
76707+{ \
76708+ return PREFIX##_hash_find_index_lru (hash, HASH_FUNC(hash, find_key), find_key); \
76709+} \
76710+ \
76711+static __inline__ int \
76712+PREFIX##_hash_remove (PREFIX##_hash_table *hash, \
76713+ ITEM_TYPE *del_item) \
76714+{ \
76715+ return PREFIX##_hash_remove_index (hash, \
76716+ HASH_FUNC(hash, &del_item->KEY_NAME), del_item); \
76717+} \
76718+ \
76719+static __inline__ int \
76720+PREFIX##_hash_remove_rcu (PREFIX##_hash_table *hash, \
76721+ ITEM_TYPE *del_item) \
76722+{ \
76723+ return PREFIX##_hash_remove (hash, del_item); \
76724+} \
76725+ \
76726+static __inline__ void \
76727+PREFIX##_hash_insert (PREFIX##_hash_table *hash, \
76728+ ITEM_TYPE *ins_item) \
76729+{ \
76730+ return PREFIX##_hash_insert_index (hash, \
76731+ HASH_FUNC(hash, &ins_item->KEY_NAME), ins_item); \
76732+} \
76733+ \
76734+static __inline__ void \
76735+PREFIX##_hash_insert_rcu (PREFIX##_hash_table *hash, \
76736+ ITEM_TYPE *ins_item) \
76737+{ \
76738+ return PREFIX##_hash_insert_index_rcu (hash, HASH_FUNC(hash, &ins_item->KEY_NAME), \
76739+ ins_item); \
76740+} \
76741+ \
76742+static __inline__ ITEM_TYPE * \
76743+PREFIX##_hash_first (PREFIX##_hash_table *hash, __u32 ind) \
76744+{ \
76745+ ITEM_TYPE *first; \
76746+ \
76747+ for (first = NULL; ind < hash->_buckets; ++ ind) { \
76748+ first = hash->_table[ind]; \
76749+ if (first != NULL) \
76750+ break; \
76751+ } \
76752+ return first; \
76753+} \
76754+ \
76755+static __inline__ ITEM_TYPE * \
76756+PREFIX##_hash_next (PREFIX##_hash_table *hash, \
76757+ ITEM_TYPE *item) \
76758+{ \
76759+ ITEM_TYPE *next; \
76760+ \
76761+ if (item == NULL) \
76762+ return NULL; \
76763+ next = item->LINK_NAME._next; \
76764+ if (next == NULL) \
76765+ next = PREFIX##_hash_first (hash, HASH_FUNC(hash, &item->KEY_NAME) + 1); \
76766+ return next; \
76767+} \
76768+ \
76769+typedef struct {} PREFIX##_hash_dummy
76770+
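+/* Editor's illustrative sketch of the three steps above, for a hypothetical
+   "foo" item keyed by a __u32 (none of these names exist in the reiser4
+   sources):
+
+	typedef struct foo foo_t;			// step 1
+	TYPE_SAFE_HASH_DECLARE(foo, foo_t);
+
+	struct foo {					// step 2
+		__u32 key;
+		foo_hash_link link;
+	};
+
+	#define foo_hash(hash, k) (*(k) % (hash)->_buckets)
+	#define foo_eq(k1, k2) (*(k1) == *(k2))
+
+	TYPE_SAFE_HASH_DEFINE(foo, foo_t, __u32, key, link, foo_hash, foo_eq);	// step 3
+
+	// usage: foo_hash_init(&table, 256); foo_hash_insert(&table, item);
+	//        foo_t *it = foo_hash_find(&table, &some_key);
+*/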
76771+#define for_all_ht_buckets(table, head) \
76772+for ((head) = &(table) -> _table[ 0 ] ; \
76773+ (head) != &(table) -> _table[ (table) -> _buckets ] ; ++ (head))
76774+
76775+#define for_all_in_bucket(bucket, item, next, field) \
76776+for ((item) = *(bucket), (next) = (item) ? (item) -> field._next : NULL ; \
76777+ (item) != NULL ; \
76778+ (item) = (next), (next) = (item) ? (item) -> field._next : NULL )
76779+
76780+#define for_all_in_htable(table, prefix, item, next) \
76781+for ((item) = prefix ## _hash_first ((table), 0), \
76782+ (next) = prefix ## _hash_next ((table), (item)) ; \
76783+ (item) != NULL ; \
76784+ (item) = (next), \
76785+ (next) = prefix ## _hash_next ((table), (item)))
76786+
76787+/* __REISER4_TYPE_SAFE_HASH_H__ */
76788+#endif
76789+
76790+/* Make Linus happy.
76791+ Local variables:
76792+ c-indentation-style: "K&R"
76793+ mode-name: "LC"
76794+ c-basic-offset: 8
76795+ tab-width: 8
76796+ fill-column: 120
76797+ End:
76798+*/
76799Index: linux-2.6.16/fs/reiser4/vfs_ops.c
76800===================================================================
76801--- /dev/null
76802+++ linux-2.6.16/fs/reiser4/vfs_ops.c
76803@@ -0,0 +1,267 @@
76804+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
76805+ * reiser4/README */
76806+
76807+/* Interface to VFS. Reiser4 {super|export|dentry}_operations are defined
76808+ here. */
76809+
76810+#include "forward.h"
76811+#include "debug.h"
76812+#include "dformat.h"
76813+#include "coord.h"
76814+#include "plugin/item/item.h"
76815+#include "plugin/file/file.h"
76816+#include "plugin/security/perm.h"
76817+#include "plugin/disk_format/disk_format.h"
76818+#include "plugin/plugin.h"
76819+#include "plugin/plugin_set.h"
76820+#include "plugin/object.h"
76821+#include "txnmgr.h"
76822+#include "jnode.h"
76823+#include "znode.h"
76824+#include "block_alloc.h"
76825+#include "tree.h"
76826+#include "vfs_ops.h"
76827+#include "inode.h"
76828+#include "page_cache.h"
76829+#include "ktxnmgrd.h"
76830+#include "super.h"
76831+#include "reiser4.h"
76832+#include "entd.h"
76833+#include "status_flags.h"
76834+#include "flush.h"
76835+#include "dscale.h"
76836+
76837+#include <linux/profile.h>
76838+#include <linux/types.h>
76839+#include <linux/mount.h>
76840+#include <linux/vfs.h>
76841+#include <linux/mm.h>
76842+#include <linux/buffer_head.h>
76843+#include <linux/dcache.h>
76844+#include <linux/list.h>
76845+#include <linux/pagemap.h>
76846+#include <linux/slab.h>
76847+#include <linux/seq_file.h>
76848+#include <linux/init.h>
76849+#include <linux/module.h>
76850+#include <linux/writeback.h>
76851+#include <linux/blkdev.h>
76852+#include <linux/quotaops.h>
76853+#include <linux/security.h>
76854+#include <linux/reboot.h>
76855+#include <linux/rcupdate.h>
76856+
76857+
76858+/* update inode stat-data by calling plugin */
76859+int reiser4_update_sd(struct inode *object)
76860+{
76861+ file_plugin *fplug;
76862+
76863+ assert("nikita-2338", object != NULL);
76864+ /* check for read-only file system. */
76865+ if (IS_RDONLY(object))
76866+ return 0;
76867+
76868+ fplug = inode_file_plugin(object);
76869+ assert("nikita-2339", fplug != NULL);
76870+ return fplug->write_sd_by_inode(object);
76871+}
76872+
76873+/* helper function: increase inode nlink count and call plugin method to save
76874+ updated stat-data.
76875+
76876+ Used by link/create and during creation of dot and dotdot in mkdir
76877+*/
76878+int reiser4_add_nlink(struct inode *object /* object to which link is added */ ,
76879+		      struct inode *parent /* parent where new entry will be */ ,
76880+		      int write_sd_p /* true if stat-data has to be
76881+				      * updated */ )
76883+{
76884+ file_plugin *fplug;
76885+ int result;
76886+
76887+ assert("nikita-1351", object != NULL);
76888+
76889+ fplug = inode_file_plugin(object);
76890+ assert("nikita-1445", fplug != NULL);
76891+
76892+ /* ask plugin whether it can add yet another link to this
76893+ object */
76894+ if (!fplug->can_add_link(object))
76895+ return RETERR(-EMLINK);
76896+
76897+ assert("nikita-2211", fplug->add_link != NULL);
76898+ /* call plugin to do actual addition of link */
76899+ result = fplug->add_link(object, parent);
76900+
76901+ /* optionally update stat data */
76902+ if (result == 0 && write_sd_p)
76903+ result = fplug->write_sd_by_inode(object);
76904+ return result;
76905+}
76906+
76907+/* helper function: decrease inode nlink count and call plugin method to save
76908+ updated stat-data.
76909+
76910+ Used by unlink/create
76911+*/
76912+int reiser4_del_nlink(struct inode *object /* object from which link is
76913+ * removed */ ,
76914+ struct inode *parent /* parent where entry was */ ,
76915+		      int write_sd_p /* true if stat-data has to be
76916+ * updated */ )
76917+{
76918+ file_plugin *fplug;
76919+ int result;
76920+
76921+ assert("nikita-1349", object != NULL);
76922+
76923+ fplug = inode_file_plugin(object);
76924+ assert("nikita-1350", fplug != NULL);
76925+ assert("nikita-1446", object->i_nlink > 0);
76926+ assert("nikita-2210", fplug->rem_link != NULL);
76927+
76928+ /* call plugin to do actual deletion of link */
76929+ result = fplug->rem_link(object, parent);
76930+
76931+ /* optionally update stat data */
76932+ if (result == 0 && write_sd_p)
76933+ result = fplug->write_sd_by_inode(object);
76934+ return result;
76935+}
76936+
76937+
76938+
76939+
76940+/* Release reiser4 dentry. This is d_op->d_release() method. */
76941+static void reiser4_d_release(struct dentry *dentry /* dentry released */ )
76942+{
76943+ reiser4_free_dentry_fsdata(dentry);
76944+}
76945+
76946+/*
76947+ * Called by reiser4_sync_inodes(), during speculative write-back (through
76948+ * pdflush, or balance_dirty_pages()).
76949+ */
76950+void writeout(struct super_block *sb, struct writeback_control *wbc)
76951+{
76952+ long written = 0;
76953+ int repeats = 0;
76954+ int result;
76955+ struct address_space *mapping;
76956+
76957+ /*
76958+ * Performs early flushing, trying to free some memory. If there is
76959+ * nothing to flush, commits some atoms.
76960+ */
76961+
76962+ /* Commit all atoms if reiser4_writepages() is called from sys_sync() or
76963+ sys_fsync(). */
76964+ if (wbc->sync_mode != WB_SYNC_NONE) {
76965+ txnmgr_force_commit_all(sb, 0);
76966+ return;
76967+ }
76968+
76969+ BUG_ON(get_super_fake(sb) == NULL);
76970+ mapping = get_super_fake(sb)->i_mapping;
76971+ do {
76972+ long nr_submitted = 0;
76973+ jnode *node = NULL;
76974+
76975+ /* do not put more requests to overload write queue */
76976+ if (wbc->nonblocking &&
76977+ bdi_write_congested(mapping->backing_dev_info)) {
76978+ blk_run_address_space(mapping);
76979+ wbc->encountered_congestion = 1;
76980+ break;
76981+ }
76982+ repeats++;
76983+ BUG_ON(wbc->nr_to_write <= 0);
76984+
76985+ if (get_current_context()->entd) {
76986+ entd_context *ent = get_entd_context(sb);
76987+
76988+ if (ent->cur_request->node)
76989+ /*
76990+ * this is ent thread and it managed to capture
76991+ * requested page itself - start flush from
76992+ * that page
76993+ */
76994+ node = jref(ent->cur_request->node);
76995+ }
76996+
76997+ result = flush_some_atom(node, &nr_submitted, wbc,
76998+ JNODE_FLUSH_WRITE_BLOCKS);
76999+ if (result != 0)
77000+ warning("nikita-31001", "Flush failed: %i", result);
77001+ if (node)
77002+ jput(node);
77003+ if (!nr_submitted)
77004+ break;
77005+
77006+ wbc->nr_to_write -= nr_submitted;
77007+ written += nr_submitted;
77008+ } while (wbc->nr_to_write > 0);
77009+}
77010+
77011+
77012+void reiser4_throttle_write(struct inode *inode)
77013+{
77014+ txn_restart_current();
77015+ balance_dirty_pages_ratelimited(inode->i_mapping);
77016+}
77017+
77018+const char *REISER4_SUPER_MAGIC_STRING = "ReIsEr4";
77019+const int REISER4_MAGIC_OFFSET = 16 * 4096; /* offset to magic string from the
77020+ * beginning of device */
77021+
77022+
77023+
77024+/*
77025+ * Reiser4 initialization/shutdown.
77026+ *
77027+ * Code below performs global reiser4 initialization that is done either as
77028+ * part of kernel initialization (when reiser4 is statically built-in), or
77029+ * during reiser4 module load (when compiled as module).
77030+ */
77031+
77032+
77033+void reiser4_handle_error(void)
77034+{
77035+ struct super_block *sb = reiser4_get_current_sb();
77036+
77037+ if (!sb)
77038+ return;
77039+ reiser4_status_write(REISER4_STATUS_DAMAGED, 0,
77040+ "Filesystem error occured");
77041+ switch (get_super_private(sb)->onerror) {
77042+ case 0:
77043+ reiser4_panic("foobar-42", "Filesystem error occured\n");
77044+ case 1:
77045+ default:
77046+ if (sb->s_flags & MS_RDONLY)
77047+ return;
77048+ sb->s_flags |= MS_RDONLY;
77049+ break;
77050+ }
77051+}
77052+
77053+struct dentry_operations reiser4_dentry_operations = {
77054+ .d_revalidate = NULL,
77055+ .d_hash = NULL,
77056+ .d_compare = NULL,
77057+ .d_delete = NULL,
77058+ .d_release = reiser4_d_release,
77059+ .d_iput = NULL,
77060+};
77061+
77062+/* Make Linus happy.
77063+ Local variables:
77064+ c-indentation-style: "K&R"
77065+ mode-name: "LC"
77066+ c-basic-offset: 8
77067+ tab-width: 8
77068+ fill-column: 120
77069+ End:
77070+*/
77071Index: linux-2.6.16/fs/reiser4/vfs_ops.h
77072===================================================================
77073--- /dev/null
77074+++ linux-2.6.16/fs/reiser4/vfs_ops.h
77075@@ -0,0 +1,58 @@
77076+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
77077+ * reiser4/README */
77078+
77079+/* vfs_ops.c's exported symbols */
77080+
77081+#if !defined( __FS_REISER4_VFS_OPS_H__ )
77082+#define __FS_REISER4_VFS_OPS_H__
77083+
77084+#include "forward.h"
77085+#include "coord.h"
77086+#include "seal.h"
77087+#include "plugin/file/file.h"
77088+#include "super.h"
77089+#include "readahead.h"
77090+
77091+#include <linux/types.h> /* for loff_t */
77092+#include <linux/fs.h> /* for struct address_space */
77093+#include <linux/dcache.h> /* for struct dentry */
77094+#include <linux/mm.h>
77095+#include <linux/backing-dev.h>
77096+
77097+/* address space operations */
77098+int reiser4_writepage(struct page *, struct writeback_control *);
77099+int reiser4_set_page_dirty(struct page *);
77100+int reiser4_readpages(struct file *, struct address_space *,
77101+ struct list_head *pages, unsigned nr_pages);
77102+int reiser4_invalidatepage(struct page *, unsigned long offset);
77103+int reiser4_releasepage(struct page *, gfp_t);
77104+
77105+extern int reiser4_update_sd(struct inode *);
77106+extern int reiser4_add_nlink(struct inode *, struct inode *, int);
77107+extern int reiser4_del_nlink(struct inode *, struct inode *, int);
77108+
77109+
77110+extern int reiser4_start_up_io(struct page *page);
77111+extern void reiser4_throttle_write(struct inode *);
77112+extern int jnode_is_releasable(jnode *);
77113+
77114+#define CAPTURE_APAGE_BURST (1024l)
77115+void writeout(struct super_block *, struct writeback_control *);
77116+
77117+
77118+extern void reiser4_handle_error(void);
77119+
77120+
77121+/* __FS_REISER4_VFS_OPS_H__ */
77122+#endif
77123+
77124+/* Make Linus happy.
77125+ Local variables:
77126+ c-indentation-style: "K&R"
77127+ mode-name: "LC"
77128+ c-basic-offset: 8
77129+ tab-width: 8
77130+ fill-column: 120
77131+ scroll-step: 1
77132+ End:
77133+*/
77134Index: linux-2.6.16/fs/reiser4/wander.c
77135===================================================================
77136--- /dev/null
77137+++ linux-2.6.16/fs/reiser4/wander.c
77138@@ -0,0 +1,1799 @@
77139+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
77140+ * reiser4/README */
77141+
77142+/* Reiser4 Wandering Log */
77143+
77144+/* You should read http://www.namesys.com/txn-doc.html
77145+
77146+ That describes how filesystem operations are performed as atomic
77147+ transactions, and how we try to arrange it so that we can write most of the
77148+ data only once while performing the operation atomically.
77149+
77150+   For the purposes of this code, it is enough to understand that it has
77151+   been told a given block should be written either once or twice (if
77152+   twice, then once to the wandered location and once to the real location).
77153+
77154+ This code guarantees that those blocks that are defined to be part of an
77155+ atom either all take effect or none of them take effect.
77156+
77157+ Relocate set nodes are submitted to write by the jnode_flush() routine, and
77158+ the overwrite set is submitted by reiser4_write_log(). This is because with
77159+ the overwrite set we seek to optimize writes, and with the relocate set we
77160+   seek to cause disk order to correlate with the parent-first pre-order.
77161+
77162+ reiser4_write_log() allocates and writes wandered blocks and maintains
77163+ additional on-disk structures of the atom as wander records (each wander
77164+ record occupies one block) for storing of the "wandered map" (a table which
77165+ contains a relation between wandered and real block numbers) and other
77166+ information which might be needed at transaction recovery time.
77167+
77168+ The wander records are unidirectionally linked into a circle: each wander
77169+ record contains a block number of the next wander record, the last wander
77170+ record points to the first one.
77171+
77172+ One wander record (named "tx head" in this file) has a format which is
77173+ different from the other wander records. The "tx head" has a reference to the
77174+ "tx head" block of the previously committed atom. Also, "tx head" contains
77175+ fs information (the free blocks counter, and the oid allocator state) which
77176+   is logged in a special way.
77177+
77178+ There are two journal control blocks, named journal header and journal
77179+ footer which have fixed on-disk locations. The journal header has a
77180+ reference to the "tx head" block of the last committed atom. The journal
77181+ footer points to the "tx head" of the last flushed atom. The atom is
77182+ "played" when all blocks from its overwrite set are written to disk the
77183+ second time (i.e. written to their real locations).
77184+
77185+   NOTE: People who know reiserfs internals and its journal structure might be
77186+   confused by the terms journal footer and journal header. There is a table
77187+   of terms with similar semantics in reiserfs (reiser3) and reiser4:
77188+
77189+ REISER3 TERM | REISER4 TERM | DESCRIPTION
77190+ --------------------+-----------------------+----------------------------
77191+ commit record | journal header | atomic write of this record
77192+ | | ends transaction commit
77193+ --------------------+-----------------------+----------------------------
77194+ journal header | journal footer | atomic write of this record
77195+ | | ends post-commit writes.
77196+ | | After successful
77197+                     |                       | writing of this record, journal
77198+ | | blocks (in reiser3) or
77199+ | | wandered blocks/records are
77200+ | | free for re-use.
77201+ --------------------+-----------------------+----------------------------
77202+
77203+ The atom commit process is the following:
77204+
77205+ 1. The overwrite set is taken from atom's clean list, and its size is
77206+ counted.
77207+
77208+ 2. The number of necessary wander records (including tx head) is calculated,
77209+ and the wander record blocks are allocated.
77210+
77211+   3. Allocate wandered blocks and populate wander records with the wandered map.
77212+
77213+   4. Submit write requests for wander records and wandered blocks.
77214+
77215+   5. Wait until submitted write requests complete.
77216+
77217+   6. Update journal header: change the pointer to the block number of the just
77218+   written tx head, submit an i/o for the modified journal header block and wait
77219+   for i/o completion.
77220+
77221+ NOTE: The special logging for bitmap blocks and some reiser4 super block
77222+ fields makes processes of atom commit, flush and recovering a bit more
77223+ complex (see comments in the source code for details).
77224+
77225+ The atom playing process is the following:
77226+
77227+ 1. Write atom's overwrite set in-place.
77228+
77229+ 2. Wait on i/o.
77230+
77231+   3. Update journal footer: change the pointer to the block number of the tx head
77232+   block of the atom we are currently flushing, submit an i/o, wait on i/o
77233+   completion.
77234+
77235+ 4. Free disk space which was used for wandered blocks and wander records.
77236+
77237+   After the freeing of wandered blocks and wander records, the journal
77238+   footer points to an on-disk structure which might be overwritten soon.
77239+   Neither the log writer nor the journal recovery procedure uses that pointer
77240+   for accessing the data. When the journal recovery procedure looks for the
77241+   oldest transaction it compares the journal footer pointer value with the
77242+   "prev_tx" pointer value in a tx head; if the values are equal, the oldest
77243+   not yet flushed transaction has been found.
77244+
77245+   NOTE on disk space leakage: the information about which blocks and how many
77246+   blocks are allocated for wandered blocks and wander records is not written
77247+   to the disk, because of the special logging for bitmaps and some super block
77248+   counters. After a system crash reiser4 does not remember those allocations,
77249+   thus there is no disk space leakage of this kind.
77250+*/
77251+
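+/* Editor's illustrative sketch (not the actual recovery code): walking the
+   circular wander-record list described above, using the tx_header and
+   wander_record_header layouts from wander.h. read_block() is a hypothetical
+   helper standing in for reading and mapping a disk block.
+
+	// @tx_head_blk comes from the journal header (last_committed_tx)
+	struct tx_header *tx = read_block(tx_head_blk);
+	__u32 total = le32_to_cpu(get_unaligned(&tx->total));
+	reiser4_block_nr next = le64_to_cpu(get_unaligned(&tx->next_block));
+	__u32 i;
+
+	for (i = 1; i < total; i++) {
+		struct wander_record_header *wr = read_block(next);
+		// ... replay the wander_entry pairs stored after the header ...
+		next = le64_to_cpu(get_unaligned(&wr->next_block));
+	}
+	// the ring closes: @next points back at the tx head
+*/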
77252+/* Special logging of reiser4 super block fields. */
77253+
77254+/* There are some reiser4 super block fields (free block count and OID allocator
77255+ state (number of files and next free OID) which are logged separately from
77256+ super block to avoid unnecessary atom fusion.
77257+
77258+   So, the reiser4 super block need not be captured by a transaction which
77259+   allocates/deallocates disk blocks or creates/deletes file objects. Moreover,
77260+ the reiser4 on-disk super block is not touched when such a transaction is
77261+ committed and flushed. Those "counters logged specially" are logged in "tx
77262+ head" blocks and in the journal footer block.
77263+
77264+ A step-by-step description of special logging:
77265+
77266+ 0. The per-atom information about deleted or created files and allocated or
77267+ freed blocks is collected during the transaction. The atom's
77268+ ->nr_objects_created and ->nr_objects_deleted are for object
77269+ deletion/creation tracking, the numbers of allocated and freed blocks are
77270+ calculated using atom's delete set and atom's capture list -- all new and
77271+ relocated nodes should be on atom's clean list and should have JNODE_RELOC
77272+ bit set.
77273+
77274+ 1. The "logged specially" reiser4 super block fields have their "committed"
77275+ versions in the reiser4 in-memory super block. They get modified only at
77276+ atom commit time. The atom's commit thread has an exclusive access to those
77277+ "committed" fields because the log writer implementation supports only one
77278+   atom commit at a time (there is a per-fs "commit" semaphore). At
77279+ that time "committed" counters are modified using per-atom information
77280+ collected during the transaction. These counters are stored on disk as a
77281+ part of tx head block when atom is committed.
77282+
77283+ 2. When the atom is flushed the value of the free block counter and the OID
77284+ allocator state get written to the journal footer block. A special journal
77285+ procedure (journal_recover_sb_data()) takes those values from the journal
77286+ footer and updates the reiser4 in-memory super block.
77287+
77288+ NOTE: That means free block count and OID allocator state are logged
77289+ separately from the reiser4 super block regardless of the fact that the
77290+ reiser4 super block has fields to store both the free block counter and the
77291+ OID allocator.
77292+
77293+   Writing the whole super block at commit time requires knowing the true values
77294+   of all its fields without the changes made by not yet committed transactions.
77295+   This would be possible by having a "committed" version of the super block, as
77296+   the reiser4 bitmap blocks have "committed" and "working" versions. However,
77297+   another scheme was implemented which stores the specially logged values in the
77298+   unused free space inside the transaction head block. In my opinion it has the
77299+   advantage of not writing the whole super block when only part of it was
77300+   modified. */
77301+
77302+#include "debug.h"
77303+#include "dformat.h"
77304+#include "txnmgr.h"
77305+#include "jnode.h"
77306+#include "znode.h"
77307+#include "block_alloc.h"
77308+#include "page_cache.h"
77309+#include "wander.h"
77310+#include "reiser4.h"
77311+#include "super.h"
77312+#include "vfs_ops.h"
77313+#include "writeout.h"
77314+#include "inode.h"
77315+#include "entd.h"
77316+
77317+#include <linux/types.h>
77318+#include <linux/fs.h> /* for struct super_block */
77319+#include <linux/mm.h> /* for struct page */
77320+#include <linux/pagemap.h>
77321+#include <linux/bio.h> /* for struct bio */
77322+#include <linux/blkdev.h>
77323+
77324+static int write_jnodes_to_disk_extent(
77325+ jnode *, int, const reiser4_block_nr *, flush_queue_t *, int);
77326+
77327+/* The commit_handle is a container for objects needed at atom commit time */
77328+struct commit_handle {
77329+ /* A pointer to atom's list of OVRWR nodes */
77330+ struct list_head *overwrite_set;
77331+ /* atom's overwrite set size */
77332+ int overwrite_set_size;
77333+ /* jnodes for wander record blocks */
77334+ struct list_head tx_list;
77335+ /* number of wander records */
77336+ __u32 tx_size;
77337+ /* 'committed' sb counters are saved here until atom is completely
77338+ flushed */
77339+ __u64 free_blocks;
77340+ __u64 nr_files;
77341+ __u64 next_oid;
77342+ /* A pointer to the atom which is being committed */
77343+ txn_atom *atom;
77344+ /* A pointer to current super block */
77345+ struct super_block *super;
77346+ /* The counter of modified bitmaps */
77347+ reiser4_block_nr nr_bitmap;
77348+};
77349+
77350+static void init_commit_handle(struct commit_handle *ch, txn_atom *atom)
77351+{
77352+ memset(ch, 0, sizeof(struct commit_handle));
77353+ INIT_LIST_HEAD(&ch->tx_list);
77354+
77355+ ch->atom = atom;
77356+ ch->super = reiser4_get_current_sb();
77357+}
77358+
77359+static void done_commit_handle(struct commit_handle *ch)
77360+{
77361+ assert("zam-690", list_empty(&ch->tx_list));
77362+}
77363+
77364+static inline int reiser4_use_write_barrier(struct super_block * s)
77365+{
77366+ return !reiser4_is_set(s, REISER4_NO_WRITE_BARRIER);
77367+}
77368+
77369+static void disable_write_barrier(struct super_block * s)
77370+{
77371+ notice("zam-1055", "%s does not support write barriers,"
77372+ " using synchronous write instead.", s->s_id);
77373+ set_bit((int)REISER4_NO_WRITE_BARRIER, &get_super_private(s)->fs_flags);
77374+}
77375+
77376+
77377+/* fill journal header block data */
77378+static void format_journal_header(struct commit_handle *ch)
77379+{
77380+ struct reiser4_super_info_data *sbinfo;
77381+ struct journal_header *header;
77382+ jnode *txhead;
77383+
77384+ sbinfo = get_super_private(ch->super);
77385+ assert("zam-479", sbinfo != NULL);
77386+ assert("zam-480", sbinfo->journal_header != NULL);
77387+
77388+ txhead = list_entry(ch->tx_list.next, jnode, capture_link);
77389+
77390+ jload(sbinfo->journal_header);
77391+
77392+ header = (struct journal_header *)jdata(sbinfo->journal_header);
77393+ assert("zam-484", header != NULL);
77394+
77395+ put_unaligned(cpu_to_le64(*jnode_get_block(txhead)),
77396+ &header->last_committed_tx);
77397+
77398+ jrelse(sbinfo->journal_header);
77399+}
77400+
77401+/* fill journal footer block data */
77402+static void format_journal_footer(struct commit_handle *ch)
77403+{
77404+ struct reiser4_super_info_data *sbinfo;
77405+ struct journal_footer *footer;
77406+ jnode *tx_head;
77407+
77408+ sbinfo = get_super_private(ch->super);
77409+
77410+ tx_head = list_entry(ch->tx_list.next, jnode, capture_link);
77411+
77412+ assert("zam-493", sbinfo != NULL);
77413+ assert("zam-494", sbinfo->journal_header != NULL);
77414+
77415+ check_me("zam-691", jload(sbinfo->journal_footer) == 0);
77416+
77417+ footer = (struct journal_footer *)jdata(sbinfo->journal_footer);
77418+ assert("zam-495", footer != NULL);
77419+
77420+ put_unaligned(cpu_to_le64(*jnode_get_block(tx_head)),
77421+ &footer->last_flushed_tx);
77422+ put_unaligned(cpu_to_le64(ch->free_blocks), &footer->free_blocks);
77423+
77424+ put_unaligned(cpu_to_le64(ch->nr_files), &footer->nr_files);
77425+ put_unaligned(cpu_to_le64(ch->next_oid), &footer->next_oid);
77426+
77427+ jrelse(sbinfo->journal_footer);
77428+}
77429+
77430+/* wander record capacity depends on current block size */
77431+static int wander_record_capacity(const struct super_block *super)
77432+{
77433+ return (super->s_blocksize -
77434+ sizeof(struct wander_record_header)) /
77435+ sizeof(struct wander_entry);
77436+}
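
A quick worked example of this arithmetic (editor's sketch, assuming the field sizes implied by the struct layouts in wander.h below: a 32-byte wander_record_header and a 16-byte wander_entry):

	/*
	 * For a 4096-byte block:
	 *   (4096 - sizeof(struct wander_record_header)) / sizeof(struct wander_entry)
	 *   == (4096 - 32) / 16 == 254 wandered-map entries per record.
	 */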
77437+
77438+/* Fill first wander record (tx head) in accordance with supplied data */
77439+static void format_tx_head(struct commit_handle *ch)
77440+{
77441+ jnode *tx_head;
77442+ jnode *next;
77443+ struct tx_header *header;
77444+
77445+ tx_head = list_entry(ch->tx_list.next, jnode, capture_link);
77446+ assert("zam-692", &ch->tx_list != &tx_head->capture_link);
77447+
77448+ next = list_entry(tx_head->capture_link.next, jnode, capture_link);
77449+ if (&ch->tx_list == &next->capture_link)
77450+ next = tx_head;
77451+
77452+ header = (struct tx_header *)jdata(tx_head);
77453+
77454+ assert("zam-460", header != NULL);
77455+ assert("zam-462", ch->super->s_blocksize >= sizeof(struct tx_header));
77456+
77457+ memset(jdata(tx_head), 0, (size_t) ch->super->s_blocksize);
77458+ memcpy(jdata(tx_head), TX_HEADER_MAGIC, TX_HEADER_MAGIC_SIZE);
77459+
77460+ put_unaligned(cpu_to_le32(ch->tx_size), &header->total);
77461+ put_unaligned(cpu_to_le64(get_super_private(ch->super)->last_committed_tx),
77462+ &header->prev_tx);
77463+ put_unaligned(cpu_to_le64(*jnode_get_block(next)), &header->next_block);
77464+ put_unaligned(cpu_to_le64(ch->free_blocks), &header->free_blocks);
77465+ put_unaligned(cpu_to_le64(ch->nr_files), &header->nr_files);
77466+ put_unaligned(cpu_to_le64(ch->next_oid), &header->next_oid);
77467+}
77468+
77469+/* prepare ordinary wander record block (fill all service fields) */
77470+static void
77471+format_wander_record(struct commit_handle *ch, jnode *node, __u32 serial)
77472+{
77473+ struct wander_record_header *LRH;
77474+ jnode *next;
77475+
77476+ assert("zam-464", node != NULL);
77477+
77478+ LRH = (struct wander_record_header *)jdata(node);
77479+ next = list_entry(node->capture_link.next, jnode, capture_link);
77480+
77481+ if (&ch->tx_list == &next->capture_link)
77482+ next = list_entry(ch->tx_list.next, jnode, capture_link);
77483+
77484+ assert("zam-465", LRH != NULL);
77485+ assert("zam-463",
77486+ ch->super->s_blocksize > sizeof(struct wander_record_header));
77487+
77488+ memset(jdata(node), 0, (size_t) ch->super->s_blocksize);
77489+ memcpy(jdata(node), WANDER_RECORD_MAGIC, WANDER_RECORD_MAGIC_SIZE);
77490+
77491+ put_unaligned(cpu_to_le32(ch->tx_size), &LRH->total);
77492+ put_unaligned(cpu_to_le32(serial), &LRH->serial);
77493+ put_unaligned(cpu_to_le64(*jnode_get_block(next)), &LRH->next_block);
77494+}
77495+
77496+/* add one wandered map entry to formatted wander record */
77497+static void
77498+store_entry(jnode * node, int index, const reiser4_block_nr * a,
77499+ const reiser4_block_nr * b)
77500+{
77501+ char *data;
77502+ struct wander_entry *pairs;
77503+
77504+ data = jdata(node);
77505+ assert("zam-451", data != NULL);
77506+
77507+ pairs =
77508+ (struct wander_entry *)(data + sizeof(struct wander_record_header));
77509+
77510+ put_unaligned(cpu_to_le64(*a), &pairs[index].original);
77511+ put_unaligned(cpu_to_le64(*b), &pairs[index].wandered);
77512+}
77513+
77514+/* currently, wander records contain only the wandered map, whose size depends
77515+   on the overwrite set size */
77516+static void get_tx_size(struct commit_handle *ch)
77517+{
77518+ assert("zam-440", ch->overwrite_set_size != 0);
77519+ assert("zam-695", ch->tx_size == 0);
77520+
77521+ /* count all ordinary wander records
77522+ (<overwrite_set_size> - 1) / <wander_record_capacity> + 1 and add one
77523+ for tx head block */
77524+ ch->tx_size =
77525+ (ch->overwrite_set_size - 1) / wander_record_capacity(ch->super) +
77526+ 2;
77527+}
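
Checking the formula with concrete numbers (editor's sketch, reusing the 254-entry capacity from the 4K-block example above):

	/*
	 * overwrite_set_size == 300, capacity == 254:
	 *   tx_size = (300 - 1) / 254 + 2 == 1 + 2 == 3,
	 * i.e. one tx head block plus ceil(300 / 254) == 2 ordinary
	 * wander records.
	 */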
77528+
77529+/* A special structure used in store_wmap_actor() for saving its state
77530+   between calls */
77531+struct store_wmap_params {
77532+ jnode *cur; /* jnode of current wander record to fill */
77533+ int idx; /* free element index in wander record */
77534+ int capacity; /* capacity */
77535+
77536+#if REISER4_DEBUG
77537+ struct list_head *tx_list;
77538+#endif
77539+};
77540+
77541+/* an actor for use in the blocknr_set_iterator routine which populates the
77542+   list of pre-formatted wander records with wandered map info */
77543+static int
77544+store_wmap_actor(txn_atom * atom UNUSED_ARG, const reiser4_block_nr * a,
77545+ const reiser4_block_nr * b, void *data)
77546+{
77547+ struct store_wmap_params *params = data;
77548+
77549+ if (params->idx >= params->capacity) {
77550+ /* a new wander record should be taken from the tx_list */
77551+ params->cur = list_entry(params->cur->capture_link.next, jnode, capture_link);
77552+ assert("zam-454",
77553+ params->tx_list != &params->cur->capture_link);
77554+
77555+ params->idx = 0;
77556+ }
77557+
77558+ store_entry(params->cur, params->idx, a, b);
77559+ params->idx++;
77560+
77561+ return 0;
77562+}
77563+
77564+/* This function is called after the Relocate set is written to disk, the
77565+   Overwrite set is written to wandered locations, and all wander records are
77566+   written as well. The updated journal header block contains a pointer (block
77567+   number) to the first wander record of the just written transaction */
77568+static int update_journal_header(struct commit_handle *ch, int use_barrier)
77569+{
77570+ struct reiser4_super_info_data *sbinfo = get_super_private(ch->super);
77571+ jnode *jh = sbinfo->journal_header;
77572+ jnode *head = list_entry(ch->tx_list.next, jnode, capture_link);
77573+ int ret;
77574+
77575+ format_journal_header(ch);
77576+
77577+ ret = write_jnodes_to_disk_extent(jh, 1, jnode_get_block(jh), NULL,
77578+ use_barrier ? WRITEOUT_BARRIER : 0);
77579+ if (ret)
77580+ return ret;
77581+
77582+ // blk_run_address_space(sbinfo->fake->i_mapping);
77583+ /*blk_run_queues(); */
77584+
77585+ ret = jwait_io(jh, WRITE);
77586+
77587+ if (ret)
77588+ return ret;
77589+
77590+ sbinfo->last_committed_tx = *jnode_get_block(head);
77591+
77592+ return 0;
77593+}
77594+
77595+/* This function is called after write-back is finished. We update journal
77596+ footer block and free blocks which were occupied by wandered blocks and
77597+ transaction wander records */
77598+static int update_journal_footer(struct commit_handle *ch, int use_barrier)
77599+{
77600+ reiser4_super_info_data *sbinfo = get_super_private(ch->super);
77601+
77602+ jnode *jf = sbinfo->journal_footer;
77603+
77604+ int ret;
77605+
77606+ format_journal_footer(ch);
77607+
77608+ ret = write_jnodes_to_disk_extent(jf, 1, jnode_get_block(jf), NULL,
77609+ use_barrier ? WRITEOUT_BARRIER : 0);
77610+ if (ret)
77611+ return ret;
77612+
77613+ // blk_run_address_space(sbinfo->fake->i_mapping);
77614+ /*blk_run_queue(); */
77615+
77616+ ret = jwait_io(jf, WRITE);
77617+ if (ret)
77618+ return ret;
77619+
77620+ return 0;
77621+}
77622+
77623+/* free block numbers of wander records of a transaction already written in place */
77624+static void dealloc_tx_list(struct commit_handle *ch)
77625+{
77626+ while (!list_empty(&ch->tx_list)) {
77627+ jnode *cur = list_entry(ch->tx_list.next, jnode, capture_link);
77628+ list_del(&cur->capture_link);
77629+ ON_DEBUG(INIT_LIST_HEAD(&cur->capture_link));
77630+ reiser4_dealloc_block(jnode_get_block(cur), BLOCK_NOT_COUNTED,
77631+ BA_FORMATTED);
77632+
77633+ unpin_jnode_data(cur);
77634+ drop_io_head(cur);
77635+ }
77636+}
77637+
77638+/* An actor for use in the blocknr_set_iterator() routine which frees wandered
77639+   blocks from atom's overwrite set. */
77640+static int
77641+dealloc_wmap_actor(txn_atom * atom UNUSED_ARG,
77642+ const reiser4_block_nr * a UNUSED_ARG,
77643+ const reiser4_block_nr * b, void *data UNUSED_ARG)
77644+{
77645+
77646+ assert("zam-499", b != NULL);
77647+ assert("zam-500", *b != 0);
77648+ assert("zam-501", !blocknr_is_fake(b));
77649+
77650+ reiser4_dealloc_block(b, BLOCK_NOT_COUNTED, BA_FORMATTED);
77651+ return 0;
77652+}
77653+
77654+/* free wandered block locations of a transaction already written in place */
77655+static void dealloc_wmap(struct commit_handle *ch)
77656+{
77657+ assert("zam-696", ch->atom != NULL);
77658+
77659+ blocknr_set_iterator(ch->atom, &ch->atom->wandered_map,
77660+ dealloc_wmap_actor, NULL, 1);
77661+}
77662+
77663+/* helper function for alloc_wandered_blocks(), which refills the set of block
77664+   numbers needed for wandered blocks */
77665+static int
77666+get_more_wandered_blocks(int count, reiser4_block_nr * start, int *len)
77667+{
77668+ reiser4_blocknr_hint hint;
77669+ int ret;
77670+
77671+ reiser4_block_nr wide_len = count;
77672+
77673+ /* FIXME-ZAM: A special policy needed for allocation of wandered blocks
77674+ ZAM-FIXME-HANS: yes, what happened to our discussion of using a fixed
77675+ reserved allocation area so as to get the best qualities of fixed
77676+ journals? */
77677+ blocknr_hint_init(&hint);
77678+ hint.block_stage = BLOCK_GRABBED;
77679+
77680+ ret = reiser4_alloc_blocks(&hint, start, &wide_len,
77681+ BA_FORMATTED | BA_USE_DEFAULT_SEARCH_START);
77682+ *len = (int)wide_len;
77683+
77684+ return ret;
77685+}
77686+
77687+/*
77688+ * roll back changes made before issuing BIO in the case of IO error.
77689+ */
77690+static void undo_bio(struct bio *bio)
77691+{
77692+ int i;
77693+
77694+ for (i = 0; i < bio->bi_vcnt; ++i) {
77695+ struct page *pg;
77696+ jnode *node;
77697+
77698+ pg = bio->bi_io_vec[i].bv_page;
77699+ ClearPageWriteback(pg);
77700+ node = jprivate(pg);
77701+ spin_lock_jnode(node);
77702+ JF_CLR(node, JNODE_WRITEBACK);
77703+ JF_SET(node, JNODE_DIRTY);
77704+ spin_unlock_jnode(node);
77705+ }
77706+ bio_put(bio);
77707+}
77708+
77709+/* put overwrite set back to atom's clean list */
77710+static void put_overwrite_set(struct commit_handle *ch)
77711+{
77712+ jnode *cur;
77713+
77714+ list_for_each_entry(cur, ch->overwrite_set, capture_link)
77715+ jrelse_tail(cur);
77716+}
77717+
77718+/* Count the overwrite set size and grab disk space for wandered block
77719+   allocation. Since we have a separate list for the atom's overwrite set we
77720+   just scan the list, counting bitmap and other non-leaf nodes for whose
77721+   wandered block allocation we have to grab space. */
77722+static int get_overwrite_set(struct commit_handle *ch)
77723+{
77724+ int ret;
77725+ jnode *cur;
77726+ __u64 nr_not_leaves = 0;
77727+#if REISER4_DEBUG
77728+ __u64 nr_formatted_leaves = 0;
77729+ __u64 nr_unformatted_leaves = 0;
77730+#endif
77731+
77732+ assert("zam-697", ch->overwrite_set_size == 0);
77733+
77734+ ch->overwrite_set = ATOM_OVRWR_LIST(ch->atom);
77735+ cur = list_entry(ch->overwrite_set->next, jnode, capture_link);
77736+
77737+ while (ch->overwrite_set != &cur->capture_link) {
77738+ jnode *next = list_entry(cur->capture_link.next, jnode, capture_link);
77739+
77740+		/* Count bitmap blocks for correct statistics on how many
77741+		 * blocks were cleared by the transaction commit. */
77742+ if (jnode_get_type(cur) == JNODE_BITMAP)
77743+ ch->nr_bitmap++;
77744+
77745+ assert("zam-939", JF_ISSET(cur, JNODE_OVRWR)
77746+ || jnode_get_type(cur) == JNODE_BITMAP);
77747+
77748+ if (jnode_is_znode(cur) && znode_above_root(JZNODE(cur))) {
77749+ /* we replace fake znode by another (real)
77750+ znode which is suggested by disk_layout
77751+ plugin */
77752+
77753+ /* FIXME: it looks like fake znode should be
77754+ replaced by jnode supplied by
77755+ disk_layout. */
77756+
77757+ struct super_block *s = reiser4_get_current_sb();
77758+ reiser4_super_info_data *sbinfo =
77759+ get_current_super_private();
77760+
77761+ if (sbinfo->df_plug->log_super) {
77762+ jnode *sj = sbinfo->df_plug->log_super(s);
77763+
77764+ assert("zam-593", sj != NULL);
77765+
77766+ if (IS_ERR(sj))
77767+ return PTR_ERR(sj);
77768+
77769+ spin_lock_jnode(sj);
77770+ JF_SET(sj, JNODE_OVRWR);
77771+ insert_into_atom_ovrwr_list(ch->atom, sj);
77772+ spin_unlock_jnode(sj);
77773+
77774+ /* jload it as the rest of overwrite set */
77775+ jload_gfp(sj, get_gfp_mask(), 0);
77776+
77777+ ch->overwrite_set_size++;
77778+ }
77779+ spin_lock_jnode(cur);
77780+ uncapture_block(cur);
77781+ jput(cur);
77782+
77783+ } else {
77784+ int ret;
77785+ ch->overwrite_set_size++;
77786+ ret = jload_gfp(cur, get_gfp_mask(), 0);
77787+ if (ret)
77788+ reiser4_panic("zam-783",
77789+ "cannot load e-flushed jnode back (ret = %d)\n",
77790+ ret);
77791+ }
77792+
77793+		/* Count non-leaves here because we have to grab disk space
77794+ * for wandered blocks. They were not counted as "flush
77795+ * reserved". Counting should be done _after_ nodes are pinned
77796+ * into memory by jload(). */
77797+ if (!jnode_is_leaf(cur))
77798+ nr_not_leaves++;
77799+ else {
77800+#if REISER4_DEBUG
77801+ /* at this point @cur either has JNODE_FLUSH_RESERVED
77802+ * or is eflushed. Locking is not strong enough to
77803+ * write an assertion checking for this. */
77804+ if (jnode_is_znode(cur))
77805+ nr_formatted_leaves++;
77806+ else
77807+ nr_unformatted_leaves++;
77808+#endif
77809+ JF_CLR(cur, JNODE_FLUSH_RESERVED);
77810+ }
77811+
77812+ cur = next;
77813+ }
77814+
77815+	/* Grab space for writing (wandered blocks) of non-leaves found in
77816+	 * the overwrite set. */
77817+ ret = reiser4_grab_space_force(nr_not_leaves, BA_RESERVED);
77818+ if (ret)
77819+ return ret;
77820+
77821+ /* Disk space for allocation of wandered blocks of leaf nodes already
77822+ * reserved as "flush reserved", move it to grabbed space counter. */
77823+ spin_lock_atom(ch->atom);
77824+ assert("zam-940",
77825+ nr_formatted_leaves + nr_unformatted_leaves <=
77826+ ch->atom->flush_reserved);
77827+ flush_reserved2grabbed(ch->atom, ch->atom->flush_reserved);
77828+ spin_unlock_atom(ch->atom);
77829+
77830+ return ch->overwrite_set_size;
77831+}
77832+
77833+/**
77834+ * write_jnodes_to_disk_extent - submit write request
77835+ * @head:
77836+ * @first: first jnode of the list
77837+ * @nr: number of jnodes on the list
77838+ * @block_p:
77839+ * @fq:
77840+ * @flags: used to decide whether page is to get PG_reclaim flag
77841+ *
77842+ * Submits a write request for @nr jnodes beginning from the @first, other
77843+ * jnodes are after the @first on the double-linked "capture" list. All jnodes
77844+ * will be written to the disk region of @nr blocks starting with @block_p block
77845+ * number. If @fq is not NULL it means that waiting for i/o completion will be
77846+ * done more efficiently by using flush_queue_t objects.
77847+ * This function is the one which writes a list of jnodes in batch mode. It does
77848+ * all the low-level work such as bio construction and page state manipulation.
77849+ *
77850+ * ZAM-FIXME-HANS: brief me on why this function exists, and why bios are
77851+ * aggregated in this function instead of being left to the layers below
77852+ *
77853+ * FIXME: ZAM->HANS: What layer are you talking about? Can you point me to that?
77854+ * Why that layer needed? Why BIOs cannot be constructed here?
77855+ */
77856+static int write_jnodes_to_disk_extent(
77857+ jnode *first, int nr, const reiser4_block_nr *block_p,
77858+ flush_queue_t *fq, int flags)
77859+{
77860+ struct super_block *super = reiser4_get_current_sb();
77861+ int write_op = ( flags & WRITEOUT_BARRIER ) ? WRITE_BARRIER : WRITE;
77862+ int max_blocks;
77863+ jnode *cur = first;
77864+ reiser4_block_nr block;
77865+
77866+ assert("zam-571", first != NULL);
77867+ assert("zam-572", block_p != NULL);
77868+ assert("zam-570", nr > 0);
77869+
77870+ block = *block_p;
77871+ max_blocks = min(bio_get_nr_vecs(super->s_bdev), BIO_MAX_PAGES);
77872+
77873+ while (nr > 0) {
77874+ struct bio *bio;
77875+ int nr_blocks = min(nr, max_blocks);
77876+ int i;
77877+ int nr_used;
77878+
77879+ bio = bio_alloc(GFP_NOIO, nr_blocks);
77880+ if (!bio)
77881+ return RETERR(-ENOMEM);
77882+
77883+ bio->bi_bdev = super->s_bdev;
77884+ bio->bi_sector = block * (super->s_blocksize >> 9);
77885+ for (nr_used = 0, i = 0; i < nr_blocks; i++) {
77886+ struct page *pg;
77887+
77888+ pg = jnode_page(cur);
77889+ assert("zam-573", pg != NULL);
77890+
77891+ page_cache_get(pg);
77892+
77893+ lock_and_wait_page_writeback(pg);
77894+
77895+ if (!bio_add_page(bio, pg, super->s_blocksize, 0)) {
77896+ /*
77897+				 * underlying device is saturated. Stop adding
77898+ * pages to the bio.
77899+ */
77900+ unlock_page(pg);
77901+ page_cache_release(pg);
77902+ break;
77903+ }
77904+
77905+ spin_lock_jnode(cur);
77906+ assert("nikita-3166",
77907+ pg->mapping == jnode_get_mapping(cur));
77908+ assert("zam-912", !JF_ISSET(cur, JNODE_WRITEBACK));
77909+#if REISER4_DEBUG
77910+ spin_lock(&cur->load);
77911+ assert("nikita-3165", !jnode_is_releasable(cur));
77912+ spin_unlock(&cur->load);
77913+#endif
77914+ JF_SET(cur, JNODE_WRITEBACK);
77915+ JF_CLR(cur, JNODE_DIRTY);
77916+ ON_DEBUG(cur->written++);
77917+ spin_unlock_jnode(cur);
77918+
77919+ ClearPageError(pg);
77920+ set_page_writeback(pg);
77921+
77922+ if (get_current_context()->entd) {
77923+ /* this is ent thread */
77924+ entd_context *ent = get_entd_context(super);
77925+ struct wbq *rq, *next;
77926+
77927+ spin_lock(&ent->guard);
77928+
77929+ if (pg == ent->cur_request->page) {
77930+ /*
77931+ * entd is called for this page. This
77932+					 * request is not in the todo list
77933+ */
77934+ ent->cur_request->written = 1;
77935+ } else {
77936+ /*
77937+ * if we have written a page for which writepage
77938+					 * was called - move the request to another list.
77939+ */
77940+ list_for_each_entry_safe(rq, next, &ent->todo_list, link) {
77941+ assert("", rq->magic == WBQ_MAGIC);
77942+ if (pg == rq->page) {
77943+ /*
77944+ * remove request from
77945+ * entd's queue, but do
77946+ * not wake up a thread
77947+ * which put this
77948+ * request
77949+ */
77950+ list_del_init(&rq->link);
77951+ ent->nr_todo_reqs --;
77952+ list_add_tail(&rq->link, &ent->done_list);
77953+ ent->nr_done_reqs ++;
77954+ rq->written = 1;
77955+ break;
77956+ }
77957+ }
77958+ }
77959+ spin_unlock(&ent->guard);
77960+ }
77961+
77962+ clear_page_dirty_for_io(pg);
77963+
77964+ unlock_page(pg);
77965+
77966+ cur = list_entry(cur->capture_link.next, jnode, capture_link);
77967+ nr_used++;
77968+ }
77969+ if (nr_used > 0) {
77970+ assert("nikita-3453",
77971+ bio->bi_size == super->s_blocksize * nr_used);
77972+ assert("nikita-3454", bio->bi_vcnt == nr_used);
77973+
77974+ /* Check if we are allowed to write at all */
77975+ if (super->s_flags & MS_RDONLY)
77976+ undo_bio(bio);
77977+ else {
77978+ int not_supported;
77979+
77980+ add_fq_to_bio(fq, bio);
77981+ bio_get(bio);
77982+ reiser4_submit_bio(write_op, bio);
77983+ not_supported = bio_flagged(bio, BIO_EOPNOTSUPP);
77984+ bio_put(bio);
77985+ if (not_supported)
77986+ return -EOPNOTSUPP;
77987+ }
77988+
77989+ block += nr_used - 1;
77990+ update_blocknr_hint_default(super, &block);
77991+ block += 1;
77992+ } else {
77993+ bio_put(bio);
77994+ }
77995+ nr -= nr_used;
77996+ }
77997+
77998+ return 0;
77999+}
78000+
78001+/* This is a procedure which recovers contiguous sequences of disk block
78002+   numbers in the given list of j-nodes and submits write requests on a
78003+   per-sequence basis */
78004+int
78005+write_jnode_list(struct list_head *head, flush_queue_t *fq,
78006+ long *nr_submitted, int flags)
78007+{
78008+ int ret;
78009+ jnode *beg = list_entry(head->next, jnode, capture_link);
78010+
78011+ while (head != &beg->capture_link) {
78012+ int nr = 1;
78013+ jnode *cur = list_entry(beg->capture_link.next, jnode, capture_link);
78014+
78015+ while (head != &cur->capture_link) {
78016+ if (*jnode_get_block(cur) != *jnode_get_block(beg) + nr)
78017+ break;
78018+ ++nr;
78019+ cur = list_entry(cur->capture_link.next, jnode, capture_link);
78020+ }
78021+
78022+ ret = write_jnodes_to_disk_extent(
78023+ beg, nr, jnode_get_block(beg), fq, flags);
78024+ if (ret)
78025+ return ret;
78026+
78027+ if (nr_submitted)
78028+ *nr_submitted += nr;
78029+
78030+ beg = cur;
78031+ }
78032+
78033+ return 0;
78034+}
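
To illustrate the batching (editor's sketch, not part of the patch): if the jnodes on the list carry block numbers 100, 101, 102, 200, 201 in list order, the contiguity scan breaks the list into two extents:

	/*
	 *   write_jnodes_to_disk_extent(beg = jnode@100, nr = 3, block 100, fq, flags);
	 *   write_jnodes_to_disk_extent(beg = jnode@200, nr = 2, block 200, fq, flags);
	 * and *nr_submitted advances by 5.
	 */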
78035+
78036+/* add given wandered mapping to atom's wandered map */
78037+static int
78038+add_region_to_wmap(jnode * cur, int len, const reiser4_block_nr * block_p)
78039+{
78040+ int ret;
78041+ blocknr_set_entry *new_bsep = NULL;
78042+ reiser4_block_nr block;
78043+
78044+ txn_atom *atom;
78045+
78046+ assert("zam-568", block_p != NULL);
78047+ block = *block_p;
78048+ assert("zam-569", len > 0);
78049+
78050+ while ((len--) > 0) {
78051+ do {
78052+ atom = get_current_atom_locked();
78053+ assert("zam-536",
78054+ !blocknr_is_fake(jnode_get_block(cur)));
78055+ ret =
78056+ blocknr_set_add_pair(atom, &atom->wandered_map,
78057+ &new_bsep,
78058+ jnode_get_block(cur), &block);
78059+ } while (ret == -E_REPEAT);
78060+
78061+ if (ret) {
78062+ /* deallocate blocks which were not added to wandered
78063+ map */
78064+ reiser4_block_nr wide_len = len;
78065+
78066+ reiser4_dealloc_blocks(&block, &wide_len,
78067+ BLOCK_NOT_COUNTED,
78068+ BA_FORMATTED
78069+ /* formatted, without defer */ );
78070+
78071+ return ret;
78072+ }
78073+
78074+ spin_unlock_atom(atom);
78075+
78076+ cur = list_entry(cur->capture_link.next, jnode, capture_link);
78077+ ++block;
78078+ }
78079+
78080+ return 0;
78081+}
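
A small example of the pairing performed here (editor's sketch): for a wandered region starting at block 500 with len == 3, covering jnodes whose original locations are 10, 42 and 77, the atom's wandered map gains

	/*
	 *   (original 10 -> wandered 500)
	 *   (original 42 -> wandered 501)
	 *   (original 77 -> wandered 502)
	 */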
78082+
78083+/* Allocate wandered blocks for the current atom's OVERWRITE SET and immediately
78084+   submit IO for the allocated blocks. We assume the current atom is in a stage
78085+   where atom fusion is impossible, so leaving the atom unlocked is safe. */
78086+static int alloc_wandered_blocks(struct commit_handle *ch, flush_queue_t *fq)
78087+{
78088+ reiser4_block_nr block;
78089+
78090+ int rest;
78091+ int len;
78092+ int ret;
78093+
78094+ jnode *cur;
78095+
78096+ assert("zam-534", ch->overwrite_set_size > 0);
78097+
78098+ rest = ch->overwrite_set_size;
78099+
78100+ cur = list_entry(ch->overwrite_set->next, jnode, capture_link);
78101+ while (ch->overwrite_set != &cur->capture_link) {
78102+ assert("zam-567", JF_ISSET(cur, JNODE_OVRWR));
78103+
78104+ ret = get_more_wandered_blocks(rest, &block, &len);
78105+ if (ret)
78106+ return ret;
78107+
78108+ rest -= len;
78109+
78110+ ret = add_region_to_wmap(cur, len, &block);
78111+ if (ret)
78112+ return ret;
78113+
78114+ ret = write_jnodes_to_disk_extent(cur, len, &block, fq, 0);
78115+ if (ret)
78116+ return ret;
78117+
78118+ while ((len--) > 0) {
78119+ assert("zam-604",
78120+ ch->overwrite_set != &cur->capture_link);
78121+ cur = list_entry(cur->capture_link.next, jnode, capture_link);
78122+ }
78123+ }
78124+
78125+ return 0;
78126+}
78127+
78128+/* allocate the given number of nodes over the journal area and link them into
78129+   a list; returns 0 on success */
78130+static int alloc_tx(struct commit_handle *ch, flush_queue_t * fq)
78131+{
78132+ reiser4_blocknr_hint hint;
78133+ reiser4_block_nr allocated = 0;
78134+ reiser4_block_nr first, len;
78135+ jnode *cur;
78136+ jnode *txhead;
78137+ int ret;
78138+ reiser4_context *ctx;
78139+ reiser4_super_info_data *sbinfo;
78140+
78141+ assert("zam-698", ch->tx_size > 0);
78142+ assert("zam-699", list_empty_careful(&ch->tx_list));
78143+
78144+ ctx = get_current_context();
78145+ sbinfo = get_super_private(ctx->super);
78146+
78147+ while (allocated < (unsigned)ch->tx_size) {
78148+ len = (ch->tx_size - allocated);
78149+
78150+ blocknr_hint_init(&hint);
78151+
78152+ hint.block_stage = BLOCK_GRABBED;
78153+
78154+ /* FIXME: there should be some block allocation policy for
78155+ nodes which contain wander records */
78156+
78157+ /* We assume that disk space for wandered record blocks can be
78158+ * taken from reserved area. */
78159+ ret = reiser4_alloc_blocks(&hint, &first, &len,
78160+ BA_FORMATTED | BA_RESERVED |
78161+ BA_USE_DEFAULT_SEARCH_START);
78162+ blocknr_hint_done(&hint);
78163+
78164+ if (ret)
78165+ return ret;
78166+
78167+ allocated += len;
78168+
78169+ /* create jnodes for all wander records */
78170+ while (len--) {
78171+ cur = alloc_io_head(&first);
78172+
78173+ if (cur == NULL) {
78174+ ret = RETERR(-ENOMEM);
78175+ goto free_not_assigned;
78176+ }
78177+
78178+ ret = jinit_new(cur, get_gfp_mask());
78179+
78180+ if (ret != 0) {
78181+ jfree(cur);
78182+ goto free_not_assigned;
78183+ }
78184+
78185+ pin_jnode_data(cur);
78186+
78187+ list_add_tail(&cur->capture_link, &ch->tx_list);
78188+
78189+ first++;
78190+ }
78191+ }
78192+
78193+	{ /* format an on-disk linked list of wander records */
78194+ int serial = 1;
78195+
78196+ txhead = list_entry(ch->tx_list.next, jnode, capture_link);
78197+ format_tx_head(ch);
78198+
78199+ cur = list_entry(txhead->capture_link.next, jnode, capture_link);
78200+ while (&ch->tx_list != &cur->capture_link) {
78201+ format_wander_record(ch, cur, serial++);
78202+ cur = list_entry(cur->capture_link.next, jnode, capture_link);
78203+ }
78204+ }
78205+
78206+ { /* Fill wander records with Wandered Set */
78207+ struct store_wmap_params params;
78208+ txn_atom *atom;
78209+
78210+ params.cur = list_entry(txhead->capture_link.next, jnode, capture_link);
78211+
78212+ params.idx = 0;
78213+ params.capacity =
78214+ wander_record_capacity(reiser4_get_current_sb());
78215+
78216+ atom = get_current_atom_locked();
78217+ blocknr_set_iterator(atom, &atom->wandered_map,
78218+ &store_wmap_actor, &params, 0);
78219+ spin_unlock_atom(atom);
78220+ }
78221+
78222+ { /* relse all jnodes from tx_list */
78223+ cur = list_entry(ch->tx_list.next, jnode, capture_link);
78224+ while (&ch->tx_list != &cur->capture_link) {
78225+ jrelse(cur);
78226+ cur = list_entry(cur->capture_link.next, jnode, capture_link);
78227+ }
78228+ }
78229+
78230+ ret = write_jnode_list(&ch->tx_list, fq, NULL, 0);
78231+
78232+ return ret;
78233+
78234+ free_not_assigned:
78235+ /* We deallocate blocks not yet assigned to jnodes on tx_list. The
78236+	   caller takes care of invalidating the tx list */
78237+ reiser4_dealloc_blocks(&first, &len, BLOCK_NOT_COUNTED, BA_FORMATTED);
78238+
78239+ return ret;
78240+}
78241+
78242+static int commit_tx(struct commit_handle *ch)
78243+{
78244+ flush_queue_t *fq;
78245+ int barrier;
78246+ int ret;
78247+
78248+ /* Grab more space for wandered records. */
78249+ ret = reiser4_grab_space_force((__u64) (ch->tx_size), BA_RESERVED);
78250+ if (ret)
78251+ return ret;
78252+
78253+ fq = get_fq_for_current_atom();
78254+ if (IS_ERR(fq))
78255+ return PTR_ERR(fq);
78256+
78257+ spin_unlock_atom(fq->atom);
78258+ do {
78259+ ret = alloc_wandered_blocks(ch, fq);
78260+ if (ret)
78261+ break;
78262+ ret = alloc_tx(ch, fq);
78263+ if (ret)
78264+ break;
78265+ } while (0);
78266+
78267+ fq_put(fq);
78268+ if (ret)
78269+ return ret;
78270+ repeat_wo_barrier:
78271+ barrier = reiser4_use_write_barrier(ch->super);
78272+ if (!barrier) {
78273+ ret = current_atom_finish_all_fq();
78274+ if (ret)
78275+ return ret;
78276+ }
78277+ ret = update_journal_header(ch, barrier);
78278+ if (barrier) {
78279+ if (ret) {
78280+ if (ret == -EOPNOTSUPP) {
78281+ disable_write_barrier(ch->super);
78282+ goto repeat_wo_barrier;
78283+ }
78284+ return ret;
78285+ }
78286+ ret = current_atom_finish_all_fq();
78287+ }
78288+ return ret;
78289+}
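
The barrier handling in commit_tx() above (and again in write_tx_back() below) follows one pattern; a hypothetical condensation of it for reference (editor's sketch, not part of the patch; the helper name and function-pointer shape are invented for illustration):

	static int write_with_barrier_fallback(struct commit_handle *ch,
					       int (*update)(struct commit_handle *, int))
	{
		int ret;
	retry:
		if (!reiser4_use_write_barrier(ch->super)) {
			/* no barriers available: drain all flush queues first */
			ret = current_atom_finish_all_fq();
			if (ret)
				return ret;
			return update(ch, 0);
		}
		ret = update(ch, 1);
		if (ret == -EOPNOTSUPP) {
			/* the device rejected the barrier: fall back for good */
			disable_write_barrier(ch->super);
			goto retry;
		}
		if (ret)
			return ret;
		/* the barrier ordered the IO; wait for completion afterwards */
		return current_atom_finish_all_fq();
	}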
78290+
78291+
78292+static int write_tx_back(struct commit_handle * ch)
78293+{
78294+ flush_queue_t *fq;
78295+ int ret;
78296+ int barrier;
78297+
78298+ post_commit_hook();
78299+ fq = get_fq_for_current_atom();
78300+ if (IS_ERR(fq))
78301+ return PTR_ERR(fq);
78302+ spin_unlock_atom(fq->atom);
78303+ ret = write_jnode_list(
78304+ ch->overwrite_set, fq, NULL, WRITEOUT_FOR_PAGE_RECLAIM);
78305+ fq_put(fq);
78306+ if (ret)
78307+ return ret;
78308+ repeat_wo_barrier:
78309+ barrier = reiser4_use_write_barrier(ch->super);
78310+ if (!barrier) {
78311+ ret = current_atom_finish_all_fq();
78312+ if (ret)
78313+ return ret;
78314+ }
78315+ ret = update_journal_footer(ch, barrier);
78316+ if (barrier) {
78317+ if (ret) {
78318+ if (ret == -EOPNOTSUPP) {
78319+ disable_write_barrier(ch->super);
78320+ goto repeat_wo_barrier;
78321+ }
78322+ return ret;
78323+ }
78324+ ret = current_atom_finish_all_fq();
78325+ }
78326+ if (ret)
78327+ return ret;
78328+ post_write_back_hook();
78329+ return 0;
78330+}
78331+
78332+/* We assume that at this moment all captured blocks are marked as RELOC or
78333+   WANDER (belong to the Relocate or Overwrite set), and all nodes from the
78334+   Relocate set have been submitted for write.
78335+*/
78336+
78337+int reiser4_write_logs(long *nr_submitted)
78338+{
78339+ txn_atom *atom;
78340+ struct super_block *super = reiser4_get_current_sb();
78341+ reiser4_super_info_data *sbinfo = get_super_private(super);
78342+ struct commit_handle ch;
78343+ int ret;
78344+
78345+ writeout_mode_enable();
78346+
78347+ /* block allocator may add j-nodes to the clean_list */
78348+ ret = pre_commit_hook();
78349+ if (ret)
78350+ return ret;
78351+
78352+	/* No locks are required if we take an atom whose stage >=
78353+ * ASTAGE_PRE_COMMIT */
78354+ atom = get_current_context()->trans->atom;
78355+ assert("zam-965", atom != NULL);
78356+
78357+ /* relocate set is on the atom->clean_nodes list after
78358+ * current_atom_complete_writes() finishes. It can be safely
78359+ * uncaptured after commit_semaphore is taken, because any atom that
78360+ * captures these nodes is guaranteed to commit after current one.
78361+ *
78362+ * This can only be done after pre_commit_hook(), because it is where
78363+ * early flushed jnodes with CREATED bit are transferred to the
78364+ * overwrite list. */
78365+ invalidate_list(ATOM_CLEAN_LIST(atom));
78366+ spin_lock_atom(atom);
78367+ /* There might be waiters for the relocate nodes which we have
78368+ * released, wake them up. */
78369+ atom_send_event(atom);
78370+ spin_unlock_atom(atom);
78371+
78372+ if (REISER4_DEBUG) {
78373+ int level;
78374+
78375+ for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; ++level)
78376+ assert("nikita-3352",
78377+ list_empty_careful(ATOM_DIRTY_LIST(atom, level)));
78378+ }
78379+
78380+ sbinfo->nr_files_committed += (unsigned)atom->nr_objects_created;
78381+ sbinfo->nr_files_committed -= (unsigned)atom->nr_objects_deleted;
78382+
78383+ init_commit_handle(&ch, atom);
78384+
78385+ ch.free_blocks = sbinfo->blocks_free_committed;
78386+ ch.nr_files = sbinfo->nr_files_committed;
78387+ /* ZAM-FIXME-HANS: email me what the contention level is for the super
78388+ * lock. */
78389+ ch.next_oid = oid_next(super);
78390+
78391+ /* count overwrite set and place it in a separate list */
78392+ ret = get_overwrite_set(&ch);
78393+
78394+ if (ret <= 0) {
78395+ /* It is possible that overwrite set is empty here, it means
78396+ all captured nodes are clean */
78397+ goto up_and_ret;
78398+ }
78399+
78400+ /* Inform the caller about what number of dirty pages will be
78401+ * submitted to disk. */
78402+ *nr_submitted += ch.overwrite_set_size - ch.nr_bitmap;
78403+
78404+ /* count all records needed for storing of the wandered set */
78405+ get_tx_size(&ch);
78406+
78407+ ret = commit_tx(&ch);
78408+ if (ret)
78409+ goto up_and_ret;
78410+
78411+ spin_lock_atom(atom);
78412+ atom_set_stage(atom, ASTAGE_POST_COMMIT);
78413+ spin_unlock_atom(atom);
78414+
78415+ ret = write_tx_back(&ch);
78416+ post_write_back_hook();
78417+
78418+ up_and_ret:
78419+ if (ret) {
78420+ /* there could be fq attached to current atom; the only way to
78421+ remove them is: */
78422+ current_atom_finish_all_fq();
78423+ }
78424+
78425+ /* free blocks of flushed transaction */
78426+ dealloc_tx_list(&ch);
78427+ dealloc_wmap(&ch);
78428+
78429+ put_overwrite_set(&ch);
78430+
78431+ done_commit_handle(&ch);
78432+
78433+ writeout_mode_disable();
78434+
78435+ return ret;
78436+}
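
Putting the pieces together, reiser4_write_logs() performs the classic wandered-log two-phase sequence (editor's summary of the calls above):

	/*
	 * 1. get_overwrite_set()  - pin the overwrite set, reserve space;
	 * 2. commit_tx()          - write the overwrite set to wandered
	 *                           locations, write the wander records,
	 *                           update the journal header (the commit
	 *                           point);
	 * 3. write_tx_back()      - write the overwrite set in place and
	 *                           update the journal footer;
	 * 4. dealloc_tx_list() / dealloc_wmap() - free the wander record
	 *                           blocks and the wandered locations.
	 */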
78437+
78438+/* consistency checks for journal data/control blocks: header, footer, log
78439+   records, transaction head blocks. All functions return zero on success. */
78440+
78441+static int check_journal_header(const jnode * node UNUSED_ARG)
78442+{
78443+ /* FIXME: journal header has no magic field yet. */
78444+ return 0;
78445+}
78446+
78447+/* wait for write completion of all jnodes from the given list; returns the number of pages with IO errors */
78448+static int wait_on_jnode_list(struct list_head *head)
78449+{
78450+ jnode *scan;
78451+ int ret = 0;
78452+
78453+ list_for_each_entry(scan, head, capture_link) {
78454+ struct page *pg = jnode_page(scan);
78455+
78456+ if (pg) {
78457+ if (PageWriteback(pg))
78458+ wait_on_page_writeback(pg);
78459+
78460+ if (PageError(pg))
78461+ ret++;
78462+ }
78463+ }
78464+
78465+ return ret;
78466+}
78467+
78468+static int check_journal_footer(const jnode * node UNUSED_ARG)
78469+{
78470+ /* FIXME: journal footer has no magic field yet. */
78471+ return 0;
78472+}
78473+
78474+static int check_tx_head(const jnode * node)
78475+{
78476+ struct tx_header *header = (struct tx_header *)jdata(node);
78477+
78478+ if (memcmp(&header->magic, TX_HEADER_MAGIC, TX_HEADER_MAGIC_SIZE) != 0) {
78479+ warning("zam-627", "tx head at block %s corrupted\n",
78480+ sprint_address(jnode_get_block(node)));
78481+ return RETERR(-EIO);
78482+ }
78483+
78484+ return 0;
78485+}
78486+
78487+static int check_wander_record(const jnode * node)
78488+{
78489+ struct wander_record_header *RH =
78490+ (struct wander_record_header *)jdata(node);
78491+
78492+ if (memcmp(&RH->magic, WANDER_RECORD_MAGIC, WANDER_RECORD_MAGIC_SIZE) !=
78493+ 0) {
78494+ warning("zam-628", "wander record at block %s corrupted\n",
78495+ sprint_address(jnode_get_block(node)));
78496+ return RETERR(-EIO);
78497+ }
78498+
78499+ return 0;
78500+}
78501+
78502+/* fill the commit_handle structure with everything needed for update_journal_footer */
78503+static int restore_commit_handle(struct commit_handle *ch, jnode *tx_head)
78504+{
78505+ struct tx_header *TXH;
78506+ int ret;
78507+
78508+ ret = jload(tx_head);
78509+ if (ret)
78510+ return ret;
78511+
78512+ TXH = (struct tx_header *)jdata(tx_head);
78513+
78514+ ch->free_blocks = le64_to_cpu(get_unaligned(&TXH->free_blocks));
78515+ ch->nr_files = le64_to_cpu(get_unaligned(&TXH->nr_files));
78516+ ch->next_oid = le64_to_cpu(get_unaligned(&TXH->next_oid));
78517+
78518+ jrelse(tx_head);
78519+
78520+ list_add(&tx_head->capture_link, &ch->tx_list);
78521+
78522+ return 0;
78523+}
78524+
78525+/* replay one transaction: restore and write overwrite set in place */
78526+static int replay_transaction(const struct super_block *s,
78527+ jnode * tx_head,
78528+ const reiser4_block_nr * log_rec_block_p,
78529+ const reiser4_block_nr * end_block,
78530+ unsigned int nr_wander_records)
78531+{
78532+ reiser4_block_nr log_rec_block = *log_rec_block_p;
78533+ struct commit_handle ch;
78534+ LIST_HEAD(overwrite_set);
78535+ jnode *log;
78536+ int ret;
78537+
78538+ init_commit_handle(&ch, NULL);
78539+ ch.overwrite_set = &overwrite_set;
78540+
78541+ restore_commit_handle(&ch, tx_head);
78542+
78543+ while (log_rec_block != *end_block) {
78544+ struct wander_record_header *header;
78545+ struct wander_entry *entry;
78546+
78547+ int i;
78548+
78549+ if (nr_wander_records == 0) {
78550+ warning("zam-631",
78551+ "number of wander records in the linked list"
78552+				" is greater than the number stored in tx head.\n");
78553+ ret = RETERR(-EIO);
78554+ goto free_ow_set;
78555+ }
78556+
78557+ log = alloc_io_head(&log_rec_block);
78558+ if (log == NULL)
78559+ return RETERR(-ENOMEM);
78560+
78561+ ret = jload(log);
78562+ if (ret < 0) {
78563+ drop_io_head(log);
78564+ return ret;
78565+ }
78566+
78567+ ret = check_wander_record(log);
78568+ if (ret) {
78569+ jrelse(log);
78570+ drop_io_head(log);
78571+ return ret;
78572+ }
78573+
78574+ header = (struct wander_record_header *)jdata(log);
78575+ log_rec_block = le64_to_cpu(get_unaligned(&header->next_block));
78576+
78577+ entry = (struct wander_entry *)(header + 1);
78578+
78579+ /* restore overwrite set from wander record content */
78580+ for (i = 0; i < wander_record_capacity(s); i++) {
78581+ reiser4_block_nr block;
78582+ jnode *node;
78583+
78584+ block = le64_to_cpu(get_unaligned(&entry->wandered));
78585+ if (block == 0)
78586+ break;
78587+
78588+ node = alloc_io_head(&block);
78589+ if (node == NULL) {
78590+ ret = RETERR(-ENOMEM);
78591+ /*
78592+ * FIXME-VS:???
78593+ */
78594+ jrelse(log);
78595+ drop_io_head(log);
78596+ goto free_ow_set;
78597+ }
78598+
78599+ ret = jload(node);
78600+
78601+ if (ret < 0) {
78602+ drop_io_head(node);
78603+ /*
78604+ * FIXME-VS:???
78605+ */
78606+ jrelse(log);
78607+ drop_io_head(log);
78608+ goto free_ow_set;
78609+ }
78610+
78611+ block = le64_to_cpu(get_unaligned(&entry->original));
78612+
78613+ assert("zam-603", block != 0);
78614+
78615+ jnode_set_block(node, &block);
78616+
78617+ list_add_tail(&node->capture_link, ch.overwrite_set);
78618+
78619+ ++entry;
78620+ }
78621+
78622+ jrelse(log);
78623+ drop_io_head(log);
78624+
78625+ --nr_wander_records;
78626+ }
78627+
78628+ if (nr_wander_records != 0) {
78629+ warning("zam-632", "number of wander records in the linked list"
78630+			" is less than the number stored in tx head.\n");
78631+ ret = RETERR(-EIO);
78632+ goto free_ow_set;
78633+ }
78634+
78635+ { /* write wandered set in place */
78636+ write_jnode_list(ch.overwrite_set, NULL, NULL, 0);
78637+ ret = wait_on_jnode_list(ch.overwrite_set);
78638+
78639+ if (ret) {
78640+ ret = RETERR(-EIO);
78641+ goto free_ow_set;
78642+ }
78643+ }
78644+
78645+ ret = update_journal_footer(&ch, 0);
78646+
78647+ free_ow_set:
78648+
78649+ while (!list_empty(ch.overwrite_set)) {
78650+ jnode *cur = list_entry(ch.overwrite_set->next, jnode, capture_link);
78651+ list_del_init(&cur->capture_link);
78652+ jrelse(cur);
78653+ drop_io_head(cur);
78654+ }
78655+
78656+ list_del_init(&tx_head->capture_link);
78657+
78658+ done_commit_handle(&ch);
78659+
78660+ return ret;
78661+}
78662+
78663+/* Find the oldest committed but not yet replayed transaction and replay it. The
78664+ * transaction was committed and the journal header block was updated, but
78665+ * writing the atom's overwrite set in place and updating the journal footer
78666+ * block were not completed. This function completes the process by recovering
78667+ * the atom's overwrite set from its wandered locations, writing it in place
78668+ * and updating the journal footer. */
78669+static int replay_oldest_transaction(struct super_block *s)
78670+{
78671+ reiser4_super_info_data *sbinfo = get_super_private(s);
78672+ jnode *jf = sbinfo->journal_footer;
78673+ unsigned int total;
78674+ struct journal_footer *F;
78675+ struct tx_header *T;
78676+
78677+ reiser4_block_nr prev_tx;
78678+ reiser4_block_nr last_flushed_tx;
78679+ reiser4_block_nr log_rec_block = 0;
78680+
78681+ jnode *tx_head;
78682+
78683+ int ret;
78684+
78685+ if ((ret = jload(jf)) < 0)
78686+ return ret;
78687+
78688+ F = (struct journal_footer *)jdata(jf);
78689+
78690+ last_flushed_tx = le64_to_cpu(get_unaligned(&F->last_flushed_tx));
78691+
78692+ jrelse(jf);
78693+
78694+ if (sbinfo->last_committed_tx == last_flushed_tx) {
78695+ /* all transactions are replayed */
78696+ return 0;
78697+ }
78698+
78699+ prev_tx = sbinfo->last_committed_tx;
78700+
78701+ /* searching for oldest not flushed transaction */
78702+ while (1) {
78703+ tx_head = alloc_io_head(&prev_tx);
78704+ if (!tx_head)
78705+ return RETERR(-ENOMEM);
78706+
78707+ ret = jload(tx_head);
78708+ if (ret < 0) {
78709+ drop_io_head(tx_head);
78710+ return ret;
78711+ }
78712+
78713+ ret = check_tx_head(tx_head);
78714+ if (ret) {
78715+ jrelse(tx_head);
78716+ drop_io_head(tx_head);
78717+ return ret;
78718+ }
78719+
78720+ T = (struct tx_header *)jdata(tx_head);
78721+
78722+ prev_tx = le64_to_cpu(get_unaligned(&T->prev_tx));
78723+
78724+ if (prev_tx == last_flushed_tx)
78725+ break;
78726+
78727+ jrelse(tx_head);
78728+ drop_io_head(tx_head);
78729+ }
78730+
78731+ total = le32_to_cpu(get_unaligned(&T->total));
78732+ log_rec_block = le64_to_cpu(get_unaligned(&T->next_block));
78733+
78734+ pin_jnode_data(tx_head);
78735+ jrelse(tx_head);
78736+
78737+ ret =
78738+ replay_transaction(s, tx_head, &log_rec_block,
78739+ jnode_get_block(tx_head), total - 1);
78740+
78741+ unpin_jnode_data(tx_head);
78742+ drop_io_head(tx_head);
78743+
78744+ if (ret)
78745+ return ret;
78746+ return -E_REPEAT;
78747+}
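
For example (editor's sketch): suppose the journal header points at transaction C, C's tx head records prev_tx == B, B's records prev_tx == A, and the footer's last_flushed_tx == A. The loop above walks C -> B, stops when prev_tx equals A, replays B (which moves last_flushed_tx to B) and returns -E_REPEAT; the next call replays C the same way, and the call after that finds last_committed_tx == last_flushed_tx and returns 0.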
78748+
78749+/* The current reiser4 journal implementation is optimized not to capture the
78750+   super block when certain super block fields are modified. Currently, that
78751+   set is (<free block count>, <OID allocator>). These fields are logged in a
78752+   special way which includes storing them in each transaction head block at
78753+   atom commit time and writing that information to the journal footer block at
78754+   atom flush time. For getting this info from the journal footer block into
78755+   the in-memory super block there is a special function,
78756+   reiser4_journal_recover_sb_data(), which should be called after the disk
78757+   format plugin re-reads the super block after journal replaying.
78758+*/
78759+
78760+/* get the information from the journal footer into the in-memory super block */
78761+int reiser4_journal_recover_sb_data(struct super_block *s)
78762+{
78763+ reiser4_super_info_data *sbinfo = get_super_private(s);
78764+ struct journal_footer *jf;
78765+ int ret;
78766+
78767+ assert("zam-673", sbinfo->journal_footer != NULL);
78768+
78769+ ret = jload(sbinfo->journal_footer);
78770+ if (ret != 0)
78771+ return ret;
78772+
78773+ ret = check_journal_footer(sbinfo->journal_footer);
78774+ if (ret != 0)
78775+ goto out;
78776+
78777+ jf = (struct journal_footer *)jdata(sbinfo->journal_footer);
78778+
78779+ /* was there at least one flushed transaction? */
78780+ if (jf->last_flushed_tx) {
78781+
78782+ /* restore free block counter logged in this transaction */
78783+ reiser4_set_free_blocks(s, le64_to_cpu(get_unaligned(&jf->free_blocks)));
78784+
78785+ /* restore oid allocator state */
78786+ oid_init_allocator(s,
78787+ le64_to_cpu(get_unaligned(&jf->nr_files)),
78788+ le64_to_cpu(get_unaligned(&jf->next_oid)));
78789+ }
78790+ out:
78791+ jrelse(sbinfo->journal_footer);
78792+ return ret;
78793+}
78794+
78795+/* reiser4 replay journal procedure */
78796+int reiser4_journal_replay(struct super_block *s)
78797+{
78798+ reiser4_super_info_data *sbinfo = get_super_private(s);
78799+ jnode *jh, *jf;
78800+ struct journal_header *header;
78801+ int nr_tx_replayed = 0;
78802+ int ret;
78803+
78804+ assert("zam-582", sbinfo != NULL);
78805+
78806+ jh = sbinfo->journal_header;
78807+ jf = sbinfo->journal_footer;
78808+
78809+ if (!jh || !jf) {
78810+ /* it is possible that disk layout does not support journal
78811+ structures, we just warn about this */
78812+ warning("zam-583",
78813+ "journal control blocks were not loaded by disk layout plugin. "
78814+ "journal replaying is not possible.\n");
78815+ return 0;
78816+ }
78817+
78818+ /* Take free block count from journal footer block. The free block
78819+	   counter value corresponds to the last flushed transaction state */
78820+ ret = jload(jf);
78821+ if (ret < 0)
78822+ return ret;
78823+
78824+ ret = check_journal_footer(jf);
78825+ if (ret) {
78826+ jrelse(jf);
78827+ return ret;
78828+ }
78829+
78830+ jrelse(jf);
78831+
78832+ /* store last committed transaction info in reiser4 in-memory super
78833+ block */
78834+ ret = jload(jh);
78835+ if (ret < 0)
78836+ return ret;
78837+
78838+ ret = check_journal_header(jh);
78839+ if (ret) {
78840+ jrelse(jh);
78841+ return ret;
78842+ }
78843+
78844+ header = (struct journal_header *)jdata(jh);
78845+ sbinfo->last_committed_tx = le64_to_cpu(get_unaligned(&header->last_committed_tx));
78846+
78847+ jrelse(jh);
78848+
78849+ /* replay committed transactions */
78850+ while ((ret = replay_oldest_transaction(s)) == -E_REPEAT)
78851+ nr_tx_replayed++;
78852+
78853+ return ret;
78854+}
78855+
78856+/* load journal control block (either journal header or journal footer block) */
78857+static int
78858+load_journal_control_block(jnode ** node, const reiser4_block_nr * block)
78859+{
78860+ int ret;
78861+
78862+ *node = alloc_io_head(block);
78863+ if (!(*node))
78864+ return RETERR(-ENOMEM);
78865+
78866+ ret = jload(*node);
78867+
78868+ if (ret) {
78869+ drop_io_head(*node);
78870+ *node = NULL;
78871+ return ret;
78872+ }
78873+
78874+ pin_jnode_data(*node);
78875+ jrelse(*node);
78876+
78877+ return 0;
78878+}
78879+
78880+/* unload journal header or footer and free jnode */
78881+static void unload_journal_control_block(jnode ** node)
78882+{
78883+ if (*node) {
78884+ unpin_jnode_data(*node);
78885+ drop_io_head(*node);
78886+ *node = NULL;
78887+ }
78888+}
78889+
78890+/* release journal control blocks */
78891+void done_journal_info(struct super_block *s)
78892+{
78893+ reiser4_super_info_data *sbinfo = get_super_private(s);
78894+
78895+ assert("zam-476", sbinfo != NULL);
78896+
78897+ unload_journal_control_block(&sbinfo->journal_header);
78898+ unload_journal_control_block(&sbinfo->journal_footer);
78899+ rcu_barrier();
78900+}
78901+
78902+/* load journal control blocks */
78903+int init_journal_info(struct super_block *s)
78904+{
78905+ reiser4_super_info_data *sbinfo = get_super_private(s);
78906+ journal_location *loc;
78907+ int ret;
78908+
78909+ loc = &sbinfo->jloc;
78910+
78911+ assert("zam-651", loc != NULL);
78912+ assert("zam-652", loc->header != 0);
78913+ assert("zam-653", loc->footer != 0);
78914+
78915+ ret = load_journal_control_block(&sbinfo->journal_header, &loc->header);
78916+
78917+ if (ret)
78918+ return ret;
78919+
78920+ ret = load_journal_control_block(&sbinfo->journal_footer, &loc->footer);
78921+
78922+ if (ret) {
78923+ unload_journal_control_block(&sbinfo->journal_header);
78924+ }
78925+
78926+ return ret;
78927+}
78928+
78929+/* Make Linus happy.
78930+ Local variables:
78931+ c-indentation-style: "K&R"
78932+ mode-name: "LC"
78933+ c-basic-offset: 8
78934+ tab-width: 8
78935+ fill-column: 80
78936+ End:
78937+*/
78938Index: linux-2.6.16/fs/reiser4/wander.h
78939===================================================================
78940--- /dev/null
78941+++ linux-2.6.16/fs/reiser4/wander.h
78942@@ -0,0 +1,135 @@
78943+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
78944+
78945+#if !defined (__FS_REISER4_WANDER_H__)
78946+#define __FS_REISER4_WANDER_H__
78947+
78948+#include "dformat.h"
78949+
78950+#include <linux/fs.h> /* for struct super_block */
78951+
78952+/* REISER4 JOURNAL ON-DISK DATA STRUCTURES */
78953+
78954+#define TX_HEADER_MAGIC "TxMagic4"
78955+#define WANDER_RECORD_MAGIC "LogMagc4"
78956+
78957+#define TX_HEADER_MAGIC_SIZE (8)
78958+#define WANDER_RECORD_MAGIC_SIZE (8)
78959+
78960+/* journal header block format */
78961+struct journal_header {
78962+ /* last written transaction head location */
78963+ d64 last_committed_tx;
78964+};
78965+
78966+typedef struct journal_location {
78967+ reiser4_block_nr footer;
78968+ reiser4_block_nr header;
78969+} journal_location;
78970+
78971+/* The wander.c head comment describes usage and semantics of all these structures */
78972+/* journal footer block format */
78973+struct journal_footer {
78974+ /* last flushed transaction location. */
78975+ /* This block number is no more valid after the transaction it points
78976+ to gets flushed, this number is used only at journal replaying time
78977+ for detection of the end of on-disk list of committed transactions
78978+ which were not flushed completely */
78979+ d64 last_flushed_tx;
78980+
78981+	/* free block counter is written in the journal footer at transaction
78982+	   flushing time, not in the super block, because the free blocks counter
78983+	   is logged in a different way than super block fields (root pointer,
78984+	   for example). */
78985+ d64 free_blocks;
78986+
78987+ /* number of used OIDs and maximal used OID are logged separately from
78988+ super block */
78989+ d64 nr_files;
78990+ d64 next_oid;
78991+};
78992+
78993+/* Each wander record (except the first one) has unified format with wander
78994+ record header followed by an array of log entries */
78995+struct wander_record_header {
78996+ /* when there is no predefined location for wander records, this magic
78997+ string should help reiser4fsck. */
78998+ char magic[WANDER_RECORD_MAGIC_SIZE];
78999+
79000+ /* transaction id */
79001+ d64 id;
79002+
79003+ /* total number of wander records in current transaction */
79004+ d32 total;
79005+
79006+ /* this block number in transaction */
79007+ d32 serial;
79008+
79009+	/* next wander record location in this transaction */
79010+ d64 next_block;
79011+};
79012+
79013+/* The first wander record (transaction head) of written transaction has the
79014+ special format */
79015+struct tx_header {
79016+ /* magic string makes first block in transaction different from other
79017+ logged blocks, it should help fsck. */
79018+ char magic[TX_HEADER_MAGIC_SIZE];
79019+
79020+ /* transaction id */
79021+ d64 id;
79022+
79023+ /* total number of records (including this first tx head) in the
79024+ transaction */
79025+ d32 total;
79026+
79027+	/* align next field to 8-byte boundary; this field is always zero */
79028+ d32 padding;
79029+
79030+ /* block number of previous transaction head */
79031+ d64 prev_tx;
79032+
79033+ /* next wander record location */
79034+ d64 next_block;
79035+
79036+ /* committed versions of free blocks counter */
79037+ d64 free_blocks;
79038+
79039+ /* number of used OIDs (nr_files) and maximal used OID are logged
79040+ separately from super block */
79041+ d64 nr_files;
79042+ d64 next_oid;
79043+};
79044+
79045+/* A transaction gets written to disk as a set of wander records (each wander
79046+   record is one fs block in size) */
79047+
79048+/* As described above, the rest of a wander record is filled with these log
79049+   entries; unused space is filled with zeroes */
79050+struct wander_entry {
79051+ d64 original; /* block original location */
79052+ d64 wandered; /* block wandered location */
79053+};
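
Taken together, the structures above give a committed transaction the following on-disk shape (editor's illustration):

	/*
	 *   [tx head] -> [wander record 1] -> ... -> [wander record N-1] --+
	 *      |  ^                                                        |
	 *      |  +----------- last record's next_block wraps back --------+
	 *      +-- prev_tx -> tx head of the previously committed transaction
	 *
	 * Each wander record body is an array of wander_entry pairs
	 * { original block nr, wandered block nr }; the journal header points
	 * at the newest tx head, the journal footer at the last flushed one.
	 */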
79054+
79055+/* REISER4 JOURNAL WRITER FUNCTIONS */
79056+
79057+extern int reiser4_write_logs(long *);
79058+extern int reiser4_journal_replay(struct super_block *);
79059+extern int reiser4_journal_recover_sb_data(struct super_block *);
79060+
79061+extern int init_journal_info(struct super_block *);
79062+extern void done_journal_info(struct super_block *);
79063+
79064+extern int write_jnode_list(struct list_head *, flush_queue_t *, long *, int);
79065+
79066+#endif /* __FS_REISER4_WANDER_H__ */
79067+
79068+/* Make Linus happy.
79069+ Local variables:
79070+ c-indentation-style: "K&R"
79071+ mode-name: "LC"
79072+ c-basic-offset: 8
79073+ tab-width: 8
79074+ fill-column: 80
79075+ scroll-step: 1
79076+ End:
79077+*/
79078Index: linux-2.6.16/fs/reiser4/writeout.h
79079===================================================================
79080--- /dev/null
79081+++ linux-2.6.16/fs/reiser4/writeout.h
79082@@ -0,0 +1,21 @@
79083+/* Copyright 2002, 2003, 2004 by Hans Reiser, licensing governed by reiser4/README */
79084+
79085+#if !defined (__FS_REISER4_WRITEOUT_H__)
79086+#define __FS_REISER4_WRITEOUT_H__
79087+#define WRITEOUT_SINGLE_STREAM (0x1)
79088+#define WRITEOUT_FOR_PAGE_RECLAIM (0x2)
79089+#define WRITEOUT_BARRIER (0x4)
79090+
79091+extern int get_writeout_flags(void);
79092+
79093+#endif /* __FS_REISER4_WRITEOUT_H__ */
79094+
79095+/* Make Linus happy.
79096+ Local variables:
79097+ c-indentation-style: "K&R"
79098+ mode-name: "LC"
79099+ c-basic-offset: 8
79100+ tab-width: 8
79101+ fill-column: 80
79102+ End:
79103+*/
79104Index: linux-2.6.16/fs/reiser4/znode.c
79105===================================================================
79106--- /dev/null
79107+++ linux-2.6.16/fs/reiser4/znode.c
79108@@ -0,0 +1,1028 @@
79109+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
79110+ * reiser4/README */
79111+/* Znode manipulation functions. */
79112+/* Znode is the in-memory header for a tree node. It is stored
79113+ separately from the node itself so that it does not get written to
79114+ disk. In this respect znode is like buffer head or page head. We
79115+ also use znodes for additional reiser4 specific purposes:
79116+
79117+ . they are organized into tree structure which is a part of whole
79118+ reiser4 tree.
79119+ . they are used to implement node grained locking
79120+ . they are used to keep additional state associated with a
79121+ node
79122+ . they contain links to lists used by the transaction manager
79123+
79124+   Znode is attached to some variable "block number" which is an instance of
79125+ fs/reiser4/tree.h:reiser4_block_nr type. Znode can exist without
79126+ appropriate node being actually loaded in memory. Existence of znode itself
79127+ is regulated by reference count (->x_count) in it. Each time thread
79128+ acquires reference to znode through call to zget(), ->x_count is
79129+ incremented and decremented on call to zput(). Data (content of node) are
79130+ brought in memory through call to zload(), which also increments ->d_count
79131+ reference counter. zload can block waiting on IO. Call to zrelse()
79132+ decreases this counter. Also, ->c_count keeps track of number of child
79133+ znodes and prevents parent znode from being recycled until all of its
79134+ children are. ->c_count is decremented whenever child goes out of existence
79135+ (being actually recycled in zdestroy()) which can be some time after last
79136+ reference to this child dies if we support some form of LRU cache for
79137+ znodes.
79138+
79139+*/
79140+/* EVERY ZNODE'S STORY
79141+
79142+ 1. His infancy.
79143+
79144+ Once upon a time, the znode was born deep inside of zget() by call to
79145+ zalloc(). At the return from zget() znode had:
79146+
79147+ . reference counter (x_count) of 1
79148+ . assigned block number, marked as used in bitmap
79149+ . pointer to parent znode. Root znode parent pointer points
79150+ to its father: "fake" znode. This, in turn, has NULL parent pointer.
79151+ . hash table linkage
79152+ . no data loaded from disk
79153+ . no node plugin
79154+ . no sibling linkage
79155+
79156+ 2. His childhood
79157+
79158+ Each node is either brought into memory as a result of tree traversal, or
79159+ created afresh, creation of the root being a special case of the latter. In
79160+ either case it's inserted into sibling list. This will typically require
79161+ some ancillary tree traversing, but ultimately both sibling pointers will
79162+ exist and JNODE_LEFT_CONNECTED and JNODE_RIGHT_CONNECTED will be true in
79163+ zjnode.state.
79164+
79165+ 3. His youth.
79166+
79167+ If znode is bound to already existing node in a tree, its content is read
79168+ from the disk by call to zload(). At that moment, JNODE_LOADED bit is set
79169+ in zjnode.state and zdata() function starts to return non null for this
79170+ znode. zload() further calls zparse() that determines which node layout
79171+ this node is rendered in, and sets ->nplug on success.
79172+
79173+ If znode is for new node just created, memory for it is allocated and
79174+ zinit_new() function is called to initialise data, according to selected
79175+ node layout.
79176+
79177+ 4. His maturity.
79178+
79179+ After this point, znode lingers in memory for some time. Threads can
79180+ acquire references to znode either by blocknr through call to zget(), or by
79181+ following a pointer to unallocated znode from internal item. Each time
79182+ reference to znode is obtained, x_count is increased. Thread can read/write
79183+ lock znode. Znode data can be loaded through calls to zload(), d_count will
79184+ be increased appropriately. If all references to znode are released
79185+ (x_count drops to 0), znode is not recycled immediately. Rather, it is
79186+ still cached in the hash table in the hope that it will be accessed
79187+ shortly.
79188+
79189+ There are two ways in which znode existence can be terminated:
79190+
79191+ . sudden death: node bound to this znode is removed from the tree
79192+ . overpopulation: znode is purged out of memory due to memory pressure
79193+
79194+ 5. His death.
79195+
79196+   Death is a complex process.
79197+
79198+ When we irrevocably commit ourselves to decision to remove node from the
79199+ tree, JNODE_HEARD_BANSHEE bit is set in zjnode.state of corresponding
79200+ znode. This is done either in ->kill_hook() of internal item or in
79201+ kill_root() function when tree root is removed.
79202+
79203+ At this moment znode still has:
79204+
79205+   . locks held on it, necessarily write ones
79206+ . references to it
79207+ . disk block assigned to it
79208+ . data loaded from the disk
79209+ . pending requests for lock
79210+
79211+ But once JNODE_HEARD_BANSHEE bit set, last call to unlock_znode() does node
79212+ deletion. Node deletion includes two phases. First all ways to get
79213+ references to that znode (sibling and parent links and hash lookup using
79214+ block number stored in parent node) should be deleted -- it is done through
79215+ sibling_list_remove(), also we assume that nobody uses down link from
79216+ parent node due to its nonexistence or proper parent node locking and
79217+ nobody uses parent pointers from children due to absence of them. Second we
79218+ invalidate all pending lock requests which still are on znode's lock
79219+ request queue, this is done by invalidate_lock(). Another JNODE_IS_DYING
79220+   znode status bit is used to invalidate pending lock requests. Once it is set,
79221+ all requesters are forced to return -EINVAL from
79222+ longterm_lock_znode(). Future locking attempts are not possible because all
79223+ ways to get references to that znode are removed already. Last, node is
79224+ uncaptured from transaction.
79225+
79226+ When the last reference to the dying znode is just about to be released,
79227+ the block number for this znode is released and the znode is removed from
79228+ the hash table.
79229+
79230+ Now znode can be recycled.
79231+
79232+ [it is possible to free the bitmap block and remove the znode from the
79233+ hash table when the last lock is released. This would result in having a
79234+ referenced but completely orphaned znode]
79235+
79236+ 6. Limbo
79237+
79238+ As has been mentioned above, znodes with reference counter 0 are
79239+ still cached in a hash table. Once memory pressure increases they are
79240+ purged out of there [this requires something like an LRU list for an
79241+ efficient implementation. An LRU list would also greatly simplify the
79242+ implementation of the coord cache, which would in this case morph to just
79243+ scanning some initial segment of the LRU list]. Data loaded into an
79244+ unreferenced znode are flushed back to the durable storage if
79245+ necessary and the memory is freed. Znodes themselves can be recycled at
79246+ this point too.
79247+
79248+*/
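+
+/* A minimal usage sketch (added for illustration, not part of the original
+ notes), showing how the stage-4 "maturity" protocol above maps onto the
+ API defined in this file; error handling is elided:
+
+ node = zget(tree, &blocknr, parent, level, GFP_KERNEL); // x_count++
+ zload(node); // d_count++, data now at zdata(node)
+ ... read or modify the node under an appropriate long term lock ...
+ zrelse(node); // d_count--
+ zput(node); // x_count--; the znode stays cached in the hash table
+*/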
79249+
79250+#include "debug.h"
79251+#include "dformat.h"
79252+#include "key.h"
79253+#include "coord.h"
79254+#include "plugin/plugin_header.h"
79255+#include "plugin/node/node.h"
79256+#include "plugin/plugin.h"
79257+#include "txnmgr.h"
79258+#include "jnode.h"
79259+#include "znode.h"
79260+#include "block_alloc.h"
79261+#include "tree.h"
79262+#include "tree_walk.h"
79263+#include "super.h"
79264+#include "reiser4.h"
79265+
79266+#include <linux/pagemap.h>
79267+#include <linux/spinlock.h>
79268+#include <linux/slab.h>
79269+#include <linux/err.h>
79270+
79271+static z_hash_table *get_htable(reiser4_tree *,
79272+ const reiser4_block_nr * const blocknr);
79273+static z_hash_table *znode_get_htable(const znode *);
79274+static void zdrop(znode *);
79275+
79276+/* hash table support */
79277+
79278+/* compare two block numbers for equality. Used by hash-table macros */
79279+static inline int
79280+blknreq(const reiser4_block_nr * b1, const reiser4_block_nr * b2)
79281+{
79282+ assert("nikita-534", b1 != NULL);
79283+ assert("nikita-535", b2 != NULL);
79284+
79285+ return *b1 == *b2;
79286+}
79287+
79288+/* Hash znode by block number. Used by hash-table macros */
79289+/* Audited by: umka (2002.06.11) */
79290+static inline __u32
79291+blknrhashfn(z_hash_table * table, const reiser4_block_nr * b)
79292+{
79293+ assert("nikita-536", b != NULL);
79294+
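+ /* masking works as a cheap modulo here only because
+ REISER4_ZNODE_HASH_TABLE_SIZE is assumed to be a power of two */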
79295+ return *b & (REISER4_ZNODE_HASH_TABLE_SIZE - 1);
79296+}
79297+
79298+/* The hash table definition */
79299+#define KMALLOC(size) kmalloc((size), GFP_KERNEL)
79300+#define KFREE(ptr, size) kfree(ptr)
79301+TYPE_SAFE_HASH_DEFINE(z, znode, reiser4_block_nr, zjnode.key.z, zjnode.link.z,
79302+ blknrhashfn, blknreq);
79303+#undef KFREE
79304+#undef KMALLOC
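+
+/* judging by the uses below, TYPE_SAFE_HASH_DEFINE() expands into the
+ z_hash_* helpers this file relies on: z_hash_init(), z_hash_done(),
+ z_hash_find_index(), z_hash_insert_rcu(), z_hash_remove_rcu() and
+ z_hash_prefetch_bucket() */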
79305+
79306+/* slab for znodes */
79307+static kmem_cache_t *znode_cache;
79308+
79309+int znode_shift_order;
79310+
79311+/**
79312+ * init_znodes - create znode cache
79313+ *
79314+ * Initializes slab cache of znodes. It is part of reiser4 module initialization.
79315+ */
79316+int init_znodes(void)
79317+{
79318+ znode_cache = kmem_cache_create("znode", sizeof(znode), 0,
79319+ SLAB_HWCACHE_ALIGN |
79320+ SLAB_RECLAIM_ACCOUNT, NULL, NULL);
79321+ if (znode_cache == NULL)
79322+ return RETERR(-ENOMEM);
79323+
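+ /* this loop leaves znode_shift_order as the largest order with
+ (1 << znode_shift_order) < sizeof(znode); e.g. a 512-byte znode
+ yields znode_shift_order == 8 */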
79324+ for (znode_shift_order = 0; (1 << znode_shift_order) < sizeof(znode);
79325+ ++znode_shift_order);
79326+ --znode_shift_order;
79327+ return 0;
79328+}
79329+
79330+/**
79331+ * done_znodes - delete znode cache
79332+ *
79333+ * This is called on reiser4 module unloading or system shutdown.
79334+ */
79335+void done_znodes(void)
79336+{
79337+ destroy_reiser4_cache(&znode_cache);
79338+}
79339+
79340+/* call this to initialise tree of znodes */
79341+int znodes_tree_init(reiser4_tree * tree /* tree to initialise znodes for */ )
79342+{
79343+ int result;
79344+ assert("umka-050", tree != NULL);
79345+
79346+ rwlock_init(&tree->dk_lock);
79347+
79348+ result = z_hash_init(&tree->zhash_table, REISER4_ZNODE_HASH_TABLE_SIZE);
79349+ if (result != 0)
79350+ return result;
79351+ result = z_hash_init(&tree->zfake_table, REISER4_ZNODE_HASH_TABLE_SIZE);
79352+ return result;
79353+}
79354+
79355+/* free this znode */
79356+void zfree(znode * node /* znode to free */ )
79357+{
79358+ assert("nikita-465", node != NULL);
79359+ assert("nikita-2120", znode_page(node) == NULL);
79360+ assert("nikita-2301", list_empty_careful(&node->lock.owners));
79361+ assert("nikita-2302", list_empty_careful(&node->lock.requestors));
79362+ assert("nikita-2663", (list_empty_careful(&ZJNODE(node)->capture_link) &&
79363+ NODE_LIST(ZJNODE(node)) == NOT_CAPTURED));
79364+ assert("nikita-3220", list_empty(&ZJNODE(node)->jnodes));
79365+ assert("nikita-3293", !znode_is_right_connected(node));
79366+ assert("nikita-3294", !znode_is_left_connected(node));
79367+ assert("nikita-3295", node->left == NULL);
79368+ assert("nikita-3296", node->right == NULL);
79369+
79370+ /* not yet phash_jnode_destroy(ZJNODE(node)); */
79371+
79372+ kmem_cache_free(znode_cache, node);
79373+}
79374+
79375+/* call this to free tree of znodes */
79376+void znodes_tree_done(reiser4_tree * tree /* tree to finish with znodes of */ )
79377+{
79378+ znode *node;
79379+ znode *next;
79380+ z_hash_table *ztable;
79381+
79382+ /* scan znode hash-tables and kill all znodes, then free hash tables
79383+ * themselves. */
79384+
79385+ assert("nikita-795", tree != NULL);
79386+
79387+ ztable = &tree->zhash_table;
79388+
79389+ if (ztable->_table != NULL) {
79390+ for_all_in_htable(ztable, z, node, next) {
79391+ node->c_count = 0;
79392+ node->in_parent.node = NULL;
79393+ assert("nikita-2179", atomic_read(&ZJNODE(node)->x_count) == 0);
79394+ zdrop(node);
79395+ }
79396+
79397+ z_hash_done(&tree->zhash_table);
79398+ }
79399+
79400+ ztable = &tree->zfake_table;
79401+
79402+ if (ztable->_table != NULL) {
79403+ for_all_in_htable(ztable, z, node, next) {
79404+ node->c_count = 0;
79405+ node->in_parent.node = NULL;
79406+ assert("nikita-2179", atomic_read(&ZJNODE(node)->x_count) == 0);
79407+ zdrop(node);
79408+ }
79409+
79410+ z_hash_done(&tree->zfake_table);
79411+ }
79412+}
79413+
79414+/* ZNODE STRUCTURES */
79415+
79416+/* allocate fresh znode */
79417+znode *zalloc(gfp_t gfp_flag /* allocation flag */ )
79418+{
79419+ znode *node;
79420+
79421+ node = kmem_cache_alloc(znode_cache, gfp_flag);
79422+ return node;
79423+}
79424+
79425+/* Initialize fields of znode
79426+ @node: znode to initialize;
79427+ @parent: parent znode;
79428+ @tree: tree we are in. */
79429+void zinit(znode * node, const znode * parent, reiser4_tree * tree)
79430+{
79431+ assert("nikita-466", node != NULL);
79432+ assert("umka-268", current_tree != NULL);
79433+
79434+ memset(node, 0, sizeof *node);
79435+
79436+ assert("umka-051", tree != NULL);
79437+
79438+ jnode_init(&node->zjnode, tree, JNODE_FORMATTED_BLOCK);
79439+ reiser4_init_lock(&node->lock);
79440+ init_parent_coord(&node->in_parent, parent);
79441+}
79442+
79443+/*
79444+ * remove znode from indices. This is called by jput() when the last
79445+ * reference to a znode is released.
79446+ */
79447+void znode_remove(znode * node /* znode to remove */ , reiser4_tree * tree)
79448+{
79449+ assert("nikita-2108", node != NULL);
79450+ assert("nikita-470", node->c_count == 0);
79451+ assert_rw_write_locked(&(tree->tree_lock));
79452+
79453+ /* remove reference to this znode from cbk cache */
79454+ cbk_cache_invalidate(node, tree);
79455+
79456+ /* update c_count of parent */
79457+ if (znode_parent(node) != NULL) {
79458+ assert("nikita-472", znode_parent(node)->c_count > 0);
79459+ /* father, onto your hands I forward my spirit... */
79460+ znode_parent(node)->c_count--;
79461+ node->in_parent.node = NULL;
79462+ } else {
79463+ /* orphaned znode?! Root? */
79464+ }
79465+
79466+ /* remove znode from hash-table */
79467+ z_hash_remove_rcu(znode_get_htable(node), node);
79468+}
79469+
79470+/* zdrop() -- Remove znode from the tree.
79471+
79472+ This is called when the znode is removed from memory. */
79473+static void zdrop(znode * node /* znode to finish with */ )
79474+{
79475+ jdrop(ZJNODE(node));
79476+}
79477+
79478+/*
79479+ * put znode into right place in the hash table. This is called by relocate
79480+ * code.
79481+ */
79482+int znode_rehash(znode * node /* node to rehash */ ,
79483+ const reiser4_block_nr * new_block_nr /* new block number */ )
79484+{
79485+ z_hash_table *oldtable;
79486+ z_hash_table *newtable;
79487+ reiser4_tree *tree;
79488+
79489+ assert("nikita-2018", node != NULL);
79490+
79491+ tree = znode_get_tree(node);
79492+ oldtable = znode_get_htable(node);
79493+ newtable = get_htable(tree, new_block_nr);
79494+
79495+ write_lock_tree(tree);
79496+ /* remove znode from hash-table */
79497+ z_hash_remove_rcu(oldtable, node);
79498+
79499+ /* assertion no longer valid due to RCU */
79500+ /* assert("nikita-2019", z_hash_find(newtable, new_block_nr) == NULL); */
79501+
79502+ /* update blocknr */
79503+ znode_set_block(node, new_block_nr);
79504+ node->zjnode.key.z = *new_block_nr;
79505+
79506+ /* insert it into hash */
79507+ z_hash_insert_rcu(newtable, node);
79508+ write_unlock_tree(tree);
79509+ return 0;
79510+}
79511+
79512+/* ZNODE LOOKUP, GET, PUT */
79513+
79514+/* zlook() - get znode with given block_nr in a hash table or return NULL
79515+
79516+ If the result is non-NULL then the znode's x_count is incremented. The
79517+ internal version accepts a pre-computed hash index. The hash table is
79518+ accessed under rcu_read_lock().
79519+*/
79520+znode *zlook(reiser4_tree * tree, const reiser4_block_nr * const blocknr)
79521+{
79522+ znode *result;
79523+ __u32 hash;
79524+ z_hash_table *htable;
79525+
79526+ assert("jmacd-506", tree != NULL);
79527+ assert("jmacd-507", blocknr != NULL);
79528+
79529+ htable = get_htable(tree, blocknr);
79530+ hash = blknrhashfn(htable, blocknr);
79531+
79532+ rcu_read_lock();
79533+ result = z_hash_find_index(htable, hash, blocknr);
79534+
79535+ if (result != NULL) {
79536+ add_x_ref(ZJNODE(result));
79537+ result = znode_rip_check(tree, result);
79538+ }
79539+ rcu_read_unlock();
79540+
79541+ return result;
79542+}
79543+
79544+/* return hash table where znode with block @blocknr is (or should be)
79545+ * stored */
79546+static z_hash_table *get_htable(reiser4_tree * tree,
79547+ const reiser4_block_nr * const blocknr)
79548+{
79549+ z_hash_table *table;
79550+ if (is_disk_addr_unallocated(blocknr))
79551+ table = &tree->zfake_table;
79552+ else
79553+ table = &tree->zhash_table;
79554+ return table;
79555+}
79556+
79557+/* return hash table where znode @node is (or should be) stored */
79558+static z_hash_table *znode_get_htable(const znode * node)
79559+{
79560+ return get_htable(znode_get_tree(node), znode_get_block(node));
79561+}
79562+
79563+/* zget() - get znode from hash table, allocating it if necessary.
79564+
79565+ First a hash lookup is done (as in zlook()), locating an x-referenced
79566+ znode if one exists. If no znode is found, a new one is allocated and
79567+ inserted. The result is returned with its x_count reference increased.
79568+
79569+ LOCKS TAKEN: TREE_LOCK, ZNODE_LOCK
79570+ LOCK ORDERING: NONE
79571+*/
79572+znode *zget(reiser4_tree * tree,
79573+ const reiser4_block_nr * const blocknr,
79574+ znode * parent, tree_level level, gfp_t gfp_flag)
79575+{
79576+ znode *result;
79577+ __u32 hashi;
79578+
79579+ z_hash_table *zth;
79580+
79581+ assert("jmacd-512", tree != NULL);
79582+ assert("jmacd-513", blocknr != NULL);
79583+ assert("jmacd-514", level < REISER4_MAX_ZTREE_HEIGHT);
79584+
79585+ zth = get_htable(tree, blocknr);
79586+ hashi = blknrhashfn(zth, blocknr);
79587+
79588+ /* NOTE-NIKITA address-as-unallocated-blocknr is still not
79589+ implemented. */
79590+
79591+ z_hash_prefetch_bucket(zth, hashi);
79592+
79593+ rcu_read_lock();
79594+ /* Find a matching BLOCKNR in the hash table. If the znode is found,
79595+ we obtain a reference (x_count) but the znode remains unlocked.
79596+ Have to worry about race conditions later. */
79597+ result = z_hash_find_index(zth, hashi, blocknr);
79598+ /* According to the current design, the hash table lock protects new
79599+ znode references. */
79600+ if (result != NULL) {
79601+ add_x_ref(ZJNODE(result));
79602+ /* NOTE-NIKITA it should be so, but the special case during
79603+ creation of a new root makes such an assertion highly
79604+ complicated. */
79605+ assert("nikita-2131", 1 || znode_parent(result) == parent ||
79606+ (ZF_ISSET(result, JNODE_ORPHAN)
79607+ && (znode_parent(result) == NULL)));
79608+ result = znode_rip_check(tree, result);
79609+ }
79610+
79611+ rcu_read_unlock();
79612+
79613+ if (!result) {
79614+ znode *shadow;
79615+
79616+ result = zalloc(gfp_flag);
79617+ if (!result) {
79618+ return ERR_PTR(RETERR(-ENOMEM));
79619+ }
79620+
79621+ zinit(result, parent, tree);
79622+ ZJNODE(result)->blocknr = *blocknr;
79623+ ZJNODE(result)->key.z = *blocknr;
79624+ result->level = level;
79625+
79626+ write_lock_tree(tree);
79627+
79628+ shadow = z_hash_find_index(zth, hashi, blocknr);
79629+ if (unlikely(shadow != NULL && !ZF_ISSET(shadow, JNODE_RIP))) {
79630+ jnode_list_remove(ZJNODE(result));
79631+ zfree(result);
79632+ result = shadow;
79633+ } else {
79634+ result->version = znode_build_version(tree);
79635+ z_hash_insert_index_rcu(zth, hashi, result);
79636+
79637+ if (parent != NULL)
79638+ ++parent->c_count;
79639+ }
79640+
79641+ add_x_ref(ZJNODE(result));
79642+
79643+ write_unlock_tree(tree);
79644+ }
79645+#if REISER4_DEBUG
79646+ if (!blocknr_is_fake(blocknr) && *blocknr != 0)
79647+ reiser4_check_block(blocknr, 1);
79648+#endif
79649+ /* Check for invalid tree level, return -EIO */
79650+ if (unlikely(znode_get_level(result) != level)) {
79651+ warning("jmacd-504",
79652+ "Wrong level for cached block %llu: %i expecting %i",
79653+ (unsigned long long)(*blocknr), znode_get_level(result),
79654+ level);
79655+ zput(result);
79656+ return ERR_PTR(RETERR(-EIO));
79657+ }
79658+
79659+ assert("nikita-1227", znode_invariant(result));
79660+
79661+ return result;
79662+}
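+
+/* An illustrative caller sketch (not from the original source): zget()
+ returns ERR_PTR() on failure, so results are checked with IS_ERR():
+
+ znode *node = zget(tree, &blocknr, parent, LEAF_LEVEL, GFP_KERNEL);
+ if (IS_ERR(node))
+ return PTR_ERR(node);
+ ... use node ...
+ zput(node);
+*/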
79663+
79664+/* ZNODE PLUGINS/DATA */
79665+
79666+/* "guess" plugin for node loaded from the disk. Plugin id of node plugin is
79667+ stored at the fixed offset from the beginning of the node. */
79668+static node_plugin *znode_guess_plugin(const znode * node /* znode to guess
79669+ * plugin of */ )
79670+{
79671+ reiser4_tree *tree;
79672+
79673+ assert("nikita-1053", node != NULL);
79674+ assert("nikita-1055", zdata(node) != NULL);
79675+
79676+ tree = znode_get_tree(node);
79677+ assert("umka-053", tree != NULL);
79678+
79679+ if (reiser4_is_set(tree->super, REISER4_ONE_NODE_PLUGIN)) {
79680+ return tree->nplug;
79681+ } else {
79682+ return node_plugin_by_disk_id
79683+ (tree, &((common_node_header *) zdata(node))->plugin_id);
79684+#ifdef GUESS_EXISTS
79685+ reiser4_plugin *plugin;
79686+
79687+ /* NOTE-NIKITA add locking here when dynamic plugins will be
79688+ * implemented */
79689+ for_all_plugins(REISER4_NODE_PLUGIN_TYPE, plugin) {
79690+ if ((plugin->u.node.guess != NULL)
79691+ && plugin->u.node.guess(node))
79692+ return plugin;
79693+ }
79694+ warning("nikita-1057", "Cannot guess node plugin");
79695+ print_znode("node", node);
79696+ return NULL;
79697+#endif
79698+ }
79699+}
79700+
79701+/* parse node header and install ->node_plugin */
79702+int zparse(znode * node /* znode to parse */ )
79703+{
79704+ int result;
79705+
79706+ assert("nikita-1233", node != NULL);
79707+ assert("nikita-2370", zdata(node) != NULL);
79708+
79709+ if (node->nplug == NULL) {
79710+ node_plugin *nplug;
79711+
79712+ nplug = znode_guess_plugin(node);
79713+ if (likely(nplug != NULL)) {
79714+ result = nplug->parse(node);
79715+ if (likely(result == 0))
79716+ node->nplug = nplug;
79717+ } else {
79718+ result = RETERR(-EIO);
79719+ }
79720+ } else
79721+ result = 0;
79722+ return result;
79723+}
79724+
79725+/* zload with readahead */
79726+int zload_ra(znode * node /* znode to load */ , ra_info_t * info)
79727+{
79728+ int result;
79729+
79730+ assert("nikita-484", node != NULL);
79731+ assert("nikita-1377", znode_invariant(node));
79732+ assert("jmacd-7771", !znode_above_root(node));
79733+ assert("nikita-2125", atomic_read(&ZJNODE(node)->x_count) > 0);
79734+ assert("nikita-3016", schedulable());
79735+
79736+ if (info)
79737+ formatted_readahead(node, info);
79738+
79739+ result = jload(ZJNODE(node));
79740+ assert("nikita-1378", znode_invariant(node));
79741+ return result;
79742+}
79743+
79744+/* load content of node into memory */
79745+int zload(znode * node)
79746+{
79747+ return zload_ra(node, NULL);
79748+}
79749+
79750+/* call node plugin to initialise newly allocated node. */
79751+int zinit_new(znode * node /* znode to initialise */ , gfp_t gfp_flags)
79752+{
79753+ return jinit_new(ZJNODE(node), gfp_flags);
79754+}
79755+
79756+/* drop reference to node data. When last reference is dropped, data are
79757+ unloaded. */
79758+void zrelse(znode * node /* znode to release references to */ )
79759+{
79760+ assert("nikita-1381", znode_invariant(node));
79761+
79762+ jrelse(ZJNODE(node));
79763+}
79764+
79765+/* returns free space in node */
79766+unsigned znode_free_space(znode * node /* znode to query */ )
79767+{
79768+ assert("nikita-852", node != NULL);
79769+ return node_plugin_by_node(node)->free_space(node);
79770+}
79771+
79772+/* right delimiting key of znode */
79773+reiser4_key *znode_get_rd_key(znode * node /* znode to query */ )
79774+{
79775+ assert("nikita-958", node != NULL);
79776+ assert_rw_locked(&(znode_get_tree(node)->dk_lock));
79777+ assert("nikita-3067", LOCK_CNT_GTZ(rw_locked_dk));
79778+ assert("nikita-30671", node->rd_key_version != 0);
79779+ return &node->rd_key;
79780+}
79781+
79782+/* left delimiting key of znode */
79783+reiser4_key *znode_get_ld_key(znode * node /* znode to query */ )
79784+{
79785+ assert("nikita-974", node != NULL);
79786+ assert_rw_locked(&(znode_get_tree(node)->dk_lock));
79787+ assert("nikita-3068", LOCK_CNT_GTZ(rw_locked_dk));
79788+ assert("nikita-30681", node->ld_key_version != 0);
79789+ return &node->ld_key;
79790+}
79791+
79792+ON_DEBUG(atomic_t delim_key_version = ATOMIC_INIT(0);
79793+ )
79794+
79795+/* update right-delimiting key of @node */
79796+reiser4_key *znode_set_rd_key(znode * node, const reiser4_key * key)
79797+{
79798+ assert("nikita-2937", node != NULL);
79799+ assert("nikita-2939", key != NULL);
79800+ assert_rw_write_locked(&(znode_get_tree(node)->dk_lock));
79801+ assert("nikita-3069", LOCK_CNT_GTZ(write_locked_dk));
79802+ assert("nikita-2944",
79803+ znode_is_any_locked(node) ||
79804+ znode_get_level(node) != LEAF_LEVEL ||
79805+ keyge(key, &node->rd_key) ||
79806+ keyeq(&node->rd_key, min_key()) ||
79807+ ZF_ISSET(node, JNODE_HEARD_BANSHEE));
79808+
79809+ node->rd_key = *key;
79810+ ON_DEBUG(node->rd_key_version = atomic_inc_return(&delim_key_version));
79811+ return &node->rd_key;
79812+}
79813+
79814+/* update left-delimiting key of @node */
79815+reiser4_key *znode_set_ld_key(znode * node, const reiser4_key * key)
79816+{
79817+ assert("nikita-2940", node != NULL);
79818+ assert("nikita-2941", key != NULL);
79819+ assert_rw_write_locked(&(znode_get_tree(node)->dk_lock));
79820+ assert("nikita-3070", LOCK_CNT_GTZ(write_locked_dk));
79821+ assert("nikita-2943",
79822+ znode_is_any_locked(node) || keyeq(&node->ld_key, min_key()));
79823+
79824+ node->ld_key = *key;
79825+ ON_DEBUG(node->ld_key_version = atomic_inc_return(&delim_key_version));
79826+ return &node->ld_key;
79827+}
79828+
79829+/* true if @key is inside key range for @node */
79830+int znode_contains_key(znode * node /* znode to look in */ ,
79831+ const reiser4_key * key /* key to look for */ )
79832+{
79833+ assert("nikita-1237", node != NULL);
79834+ assert("nikita-1238", key != NULL);
79835+
79836+ /* left_delimiting_key <= key <= right_delimiting_key */
79837+ return keyle(znode_get_ld_key(node), key)
79838+ && keyle(key, znode_get_rd_key(node));
79839+}
79840+
79841+/* same as znode_contains_key(), but takes the dk lock */
79842+int znode_contains_key_lock(znode * node /* znode to look in */ ,
79843+ const reiser4_key * key /* key to look for */ )
79844+{
79845+ int result;
79846+
79847+ assert("umka-056", node != NULL);
79848+ assert("umka-057", key != NULL);
79849+
79850+ read_lock_dk(znode_get_tree(node));
79851+ result = znode_contains_key(node, key);
79852+ read_unlock_dk(znode_get_tree(node));
79853+ return result;
79854+}
79855+
79856+/* get parent pointer, assuming tree is not locked */
79857+znode *znode_parent_nolock(const znode * node /* child znode */ )
79858+{
79859+ assert("nikita-1444", node != NULL);
79860+ return node->in_parent.node;
79861+}
79862+
79863+/* get parent pointer of znode */
79864+znode *znode_parent(const znode * node /* child znode */ )
79865+{
79866+ assert("nikita-1226", node != NULL);
79867+ assert("nikita-1406", LOCK_CNT_GTZ(rw_locked_tree));
79868+ return znode_parent_nolock(node);
79869+}
79870+
79871+/* detect uber znode used to protect in-superblock tree root pointer */
79872+int znode_above_root(const znode * node /* znode to query */ )
79873+{
79874+ assert("umka-059", node != NULL);
79875+
79876+ return disk_addr_eq(&ZJNODE(node)->blocknr, &UBER_TREE_ADDR);
79877+}
79878+
79879+/* check that @node is root---that its block number is recorded in the tree as
79880+ that of the root node */
79881+#if REISER4_DEBUG
79882+static int znode_is_true_root(const znode * node /* znode to query */ )
79883+{
79884+ assert("umka-060", node != NULL);
79885+ assert("umka-061", current_tree != NULL);
79886+
79887+ return disk_addr_eq(znode_get_block(node),
79888+ &znode_get_tree(node)->root_block);
79889+}
79890+#endif
79891+
79892+/* check that @node is root */
79893+int znode_is_root(const znode * node /* znode to query */ )
79894+{
79895+ assert("nikita-1206", node != NULL);
79896+
79897+ return znode_get_level(node) == znode_get_tree(node)->height;
79898+}
79899+
79900+/* Returns true if @node was just created by zget() and was never loaded
79901+ into memory. */
79902+/* NIKITA-HANS: yes */
79903+int znode_just_created(const znode * node)
79904+{
79905+ assert("nikita-2188", node != NULL);
79906+ return (znode_page(node) == NULL);
79907+}
79908+
79909+/* obtain updated ->znode_epoch. See seal.c for description. */
79910+__u64 znode_build_version(reiser4_tree * tree)
79911+{
79912+ __u64 result;
79913+
79914+ spin_lock(&tree->epoch_lock);
79915+ result = ++tree->znode_epoch;
79916+ spin_unlock(&tree->epoch_lock);
79917+ return result;
79918+}
79919+
79920+void init_load_count(load_count * dh)
79921+{
79922+ assert("nikita-2105", dh != NULL);
79923+ memset(dh, 0, sizeof *dh);
79924+}
79925+
79926+void done_load_count(load_count * dh)
79927+{
79928+ assert("nikita-2106", dh != NULL);
79929+ if (dh->node != NULL) {
79930+ for (; dh->d_ref > 0; --dh->d_ref)
79931+ zrelse(dh->node);
79932+ dh->node = NULL;
79933+ }
79934+}
79935+
79936+static int incr_load_count(load_count * dh)
79937+{
79938+ int result;
79939+
79940+ assert("nikita-2110", dh != NULL);
79941+ assert("nikita-2111", dh->node != NULL);
79942+
79943+ result = zload(dh->node);
79944+ if (result == 0)
79945+ ++dh->d_ref;
79946+ return result;
79947+}
79948+
79949+int incr_load_count_znode(load_count * dh, znode * node)
79950+{
79951+ assert("nikita-2107", dh != NULL);
79952+ assert("nikita-2158", node != NULL);
79953+ assert("nikita-2109",
79954+ ergo(dh->node != NULL, (dh->node == node) || (dh->d_ref == 0)));
79955+
79956+ dh->node = node;
79957+ return incr_load_count(dh);
79958+}
79959+
79960+int incr_load_count_jnode(load_count * dh, jnode * node)
79961+{
79962+ if (jnode_is_znode(node)) {
79963+ return incr_load_count_znode(dh, JZNODE(node));
79964+ }
79965+ return 0;
79966+}
79967+
79968+void copy_load_count(load_count * new, load_count * old)
79969+{
79970+ int ret = 0;
79971+ done_load_count(new);
79972+ new->node = old->node;
79973+ new->d_ref = 0;
79974+
79975+ while ((new->d_ref < old->d_ref) && (ret = incr_load_count(new)) == 0) {
79976+ }
79977+
79978+ assert("jmacd-87589", ret == 0);
79979+}
79980+
79981+void move_load_count(load_count * new, load_count * old)
79982+{
79983+ done_load_count(new);
79984+ new->node = old->node;
79985+ new->d_ref = old->d_ref;
79986+ old->node = NULL;
79987+ old->d_ref = 0;
79988+}
79989+
79990+/* convert parent pointer into coord */
79991+void parent_coord_to_coord(const parent_coord_t * pcoord, coord_t * coord)
79992+{
79993+ assert("nikita-3204", pcoord != NULL);
79994+ assert("nikita-3205", coord != NULL);
79995+
79996+ coord_init_first_unit_nocheck(coord, pcoord->node);
79997+ coord_set_item_pos(coord, pcoord->item_pos);
79998+ coord->between = AT_UNIT;
79999+}
80000+
80001+/* pack coord into parent_coord_t */
80002+void coord_to_parent_coord(const coord_t * coord, parent_coord_t * pcoord)
80003+{
80004+ assert("nikita-3206", pcoord != NULL);
80005+ assert("nikita-3207", coord != NULL);
80006+
80007+ pcoord->node = coord->node;
80008+ pcoord->item_pos = coord->item_pos;
80009+}
80010+
80011+/* Initialize a parent hint pointer. (parent hint pointer is a field in znode,
80012+ look for comments there) */
80013+void init_parent_coord(parent_coord_t * pcoord, const znode * node)
80014+{
80015+ pcoord->node = (znode *) node;
80016+ pcoord->item_pos = (unsigned short)~0;
80017+}
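+
+/* note that item_pos is deliberately set to (unsigned short)~0 above: the
+ position within the parent is not yet known and, being only a hint, will
+ be synced lazily (see find_child_ptr()) */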
80018+
80019+#if REISER4_DEBUG
80020+
80021+/* debugging aid: znode invariant */
80022+static int znode_invariant_f(const znode * node /* znode to check */ ,
80023+ char const **msg /* where to store error
80024+ * message, if any */ )
80025+{
80026+#define _ergo(ant, con) \
80027+ ((*msg) = "{" #ant "} ergo {" #con "}", ergo((ant), (con)))
80028+
80029+#define _equi(e1, e2) \
80030+ ((*msg) = "{" #e1 "} <=> {" #e2 "}", equi((e1), (e2)))
80031+
80032+#define _check(exp) ((*msg) = #exp, (exp))
80033+
80034+ return jnode_invariant_f(ZJNODE(node), msg) &&
80035+ /* [znode-fake] invariant */
80036+ /* fake znode doesn't have a parent, and */
80037+ _ergo(znode_get_level(node) == 0, znode_parent(node) == NULL) &&
80038+ /* there is another way to express this very check, and */
80039+ _ergo(znode_above_root(node), znode_parent(node) == NULL) &&
80040+ /* it has special block number, and */
80041+ _ergo(znode_get_level(node) == 0,
80042+ disk_addr_eq(znode_get_block(node), &UBER_TREE_ADDR)) &&
80043+ /* it is the only znode with such block number, and */
80044+ _ergo(!znode_above_root(node) && znode_is_loaded(node),
80045+ !disk_addr_eq(znode_get_block(node), &UBER_TREE_ADDR)) &&
80046+ /* it is parent of the tree root node */
80047+ _ergo(znode_is_true_root(node),
80048+ znode_above_root(znode_parent(node))) &&
80049+ /* [znode-level] invariant */
80050+ /* level of parent znode is one larger than that of child,
80051+ except for the fake znode, and */
80052+ _ergo(znode_parent(node) && !znode_above_root(znode_parent(node)),
80053+ znode_get_level(znode_parent(node)) ==
80054+ znode_get_level(node) + 1) &&
80055+ /* left neighbor is at the same level, and */
80056+ _ergo(znode_is_left_connected(node) && node->left != NULL,
80057+ znode_get_level(node) == znode_get_level(node->left)) &&
80058+ /* right neighbor is at the same level */
80059+ _ergo(znode_is_right_connected(node) && node->right != NULL,
80060+ znode_get_level(node) == znode_get_level(node->right)) &&
80061+ /* [znode-connected] invariant */
80062+ _ergo(node->left != NULL, znode_is_left_connected(node)) &&
80063+ _ergo(node->right != NULL, znode_is_right_connected(node)) &&
80064+ _ergo(!znode_is_root(node) && node->left != NULL,
80065+ znode_is_right_connected(node->left) &&
80066+ node->left->right == node) &&
80067+ _ergo(!znode_is_root(node) && node->right != NULL,
80068+ znode_is_left_connected(node->right) &&
80069+ node->right->left == node) &&
80070+ /* [znode-c_count] invariant */
80071+ /* for any znode, c_count of its parent is greater than 0 */
80072+ _ergo(znode_parent(node) != NULL &&
80073+ !znode_above_root(znode_parent(node)),
80074+ znode_parent(node)->c_count > 0) &&
80075+ /* leaves don't have children */
80076+ _ergo(znode_get_level(node) == LEAF_LEVEL,
80077+ node->c_count == 0) &&
80078+ _check(node->zjnode.jnodes.prev != NULL) &&
80079+ _check(node->zjnode.jnodes.next != NULL) &&
80080+ /* orphan doesn't have a parent */
80081+ _ergo(ZF_ISSET(node, JNODE_ORPHAN), znode_parent(node) == 0) &&
80082+ /* [znode-modify] invariant */
80083+ /* if znode is not write-locked, its checksum remains
80084+ * invariant */
80085+ /* unfortunately, zlock is unordered w.r.t. jnode_lock, so we
80086+ * cannot check this. */
80087+ /* [znode-refs] invariant */
80088+ /* only referenced znode can be long-term locked */
80089+ _ergo(znode_is_locked(node),
80090+ atomic_read(&ZJNODE(node)->x_count) != 0);
80091+}
80092+
80093+/* debugging aid: check znode invariant and panic if it doesn't hold */
80094+int znode_invariant(znode * node /* znode to check */ )
80095+{
80096+ char const *failed_msg;
80097+ int result;
80098+
80099+ assert("umka-063", node != NULL);
80100+ assert("umka-064", current_tree != NULL);
80101+
80102+ spin_lock_znode(node);
80103+ read_lock_tree(znode_get_tree(node));
80104+ result = znode_invariant_f(node, &failed_msg);
80105+ if (!result) {
80106+ /* print_znode("corrupted node", node); */
80107+ warning("jmacd-555", "Condition %s failed", failed_msg);
80108+ }
80109+ read_unlock_tree(znode_get_tree(node));
80110+ spin_unlock_znode(node);
80111+ return result;
80112+}
80113+
80114+/* return non-0 iff data are loaded into znode */
80115+int znode_is_loaded(const znode * node /* znode to query */ )
80116+{
80117+ assert("nikita-497", node != NULL);
80118+ return jnode_is_loaded(ZJNODE(node));
80119+}
80120+
80121+unsigned long znode_times_locked(const znode * z)
80122+{
80123+ return z->times_locked;
80124+}
80125+
80126+#endif /* REISER4_DEBUG */
80127+
80128+/* Make Linus happy.
80129+ Local variables:
80130+ c-indentation-style: "K&R"
80131+ mode-name: "LC"
80132+ c-basic-offset: 8
80133+ tab-width: 8
80134+ fill-column: 120
80135+ End:
80136+*/
80137Index: linux-2.6.16/fs/reiser4/znode.h
80138===================================================================
80139--- /dev/null
80140+++ linux-2.6.16/fs/reiser4/znode.h
80141@@ -0,0 +1,434 @@
80142+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
80143+ * reiser4/README */
80144+
80145+/* Declaration of znode (Zam's node). See znode.c for more details. */
80146+
80147+#ifndef __ZNODE_H__
80148+#define __ZNODE_H__
80149+
80150+#include "forward.h"
80151+#include "debug.h"
80152+#include "dformat.h"
80153+#include "key.h"
80154+#include "coord.h"
80155+#include "plugin/node/node.h"
80156+#include "jnode.h"
80157+#include "lock.h"
80158+#include "readahead.h"
80159+
80160+#include <linux/types.h>
80161+#include <linux/spinlock.h>
80162+#include <linux/pagemap.h> /* for PAGE_CACHE_SIZE */
80163+#include <asm/atomic.h>
80164+#include <asm/semaphore.h>
80165+
80166+/* znode tracks its position within its parent (the internal item in the
80167+ * parent node that contains the znode's block number). */
80168+typedef struct parent_coord {
80169+ znode *node;
80170+ pos_in_node_t item_pos;
80171+} parent_coord_t;
80172+
80173+/* &znode - node in a reiser4 tree.
80174+
80175+ NOTE-NIKITA fields in this struct have to be rearranged (later) to reduce
80176+ cacheline pressure.
80177+
80178+ Locking:
80179+
80180+ Long term: data in a disk node attached to this znode are protected
80181+ by long term, deadlock aware lock ->lock;
80182+
80183+ Spin lock: the following fields are protected by the spin lock:
80184+
80185+ ->lock
80186+
80187+ Following fields are protected by the global tree lock:
80188+
80189+ ->left
80190+ ->right
80191+ ->in_parent
80192+ ->c_count
80193+
80194+ Following fields are protected by the global delimiting key lock (dk_lock):
80195+
80196+ ->ld_key (to update ->ld_key long-term lock on the node is also required)
80197+ ->rd_key
80198+
80199+ Following fields are protected by the long term lock:
80200+
80201+ ->nr_items
80202+
80203+ ->node_plugin is never changed once set. This means that after code made
80204+ itself sure that field is valid it can be accessed without any additional
80205+ locking.
80206+
80207+ ->level is immutable.
80208+
80209+ Invariants involving this data-type:
80210+
80211+ [znode-fake]
80212+ [znode-level]
80213+ [znode-connected]
80214+ [znode-c_count]
80215+ [znode-refs]
80216+ [jnode-refs]
80217+ [jnode-queued]
80218+ [znode-modify]
80219+
80220+ For this to be made into a clustering or NUMA filesystem, we would want to eliminate all of the global locks.
80221+ Suggestions for how to do that are desired.*/
80222+struct znode {
80223+ /* Embedded jnode. */
80224+ jnode zjnode;
80225+
80226+ /* contains two subfields, node and item_pos.
80227+
80228+ item_pos is only a hint that is cached to
80229+ speed up lookups during balancing. It is not required to be up to
80230+ date. Synched in find_child_ptr().
80231+
80232+ This value allows us to avoid expensive binary searches.
80233+
80234+ in_parent->node points to the parent of this node, and is NOT a
80235+ hint.
80236+ */
80237+ parent_coord_t in_parent;
80238+
80239+ /*
80240+ * sibling list pointers
80241+ */
80242+
80243+ /* left-neighbor */
80244+ znode *left;
80245+ /* right-neighbor */
80246+ znode *right;
80247+
80248+ /* long term lock on node content. This lock supports deadlock
80249+ detection. See lock.c
80250+ */
80251+ zlock lock;
80252+
80253+ /* You cannot remove from memory a node that has children in
80254+ memory. This is because we rely on the fact that parent of given
80255+ node can always be reached without blocking for io. When reading a
80256+ node into memory you must increase the c_count of its parent, when
80257+ removing it from memory you must decrease the c_count. This makes
80258+ the code simpler, and the cases where it is suboptimal are truly
80259+ obscure.
80260+ */
80261+ int c_count;
80262+
80263+ /* plugin of node attached to this znode. NULL if znode is not
80264+ loaded. */
80265+ node_plugin *nplug;
80266+
80267+ /* version of znode data. This is increased on each modification. This
80268+ * is necessary to implement seals (see seal.[ch]) efficiently. */
80269+ __u64 version;
80270+
80271+ /* left delimiting key. Necessary to efficiently perform
80272+ balancing with node-level locking. Kept in memory only. */
80273+ reiser4_key ld_key;
80274+ /* right delimiting key. */
80275+ reiser4_key rd_key;
80276+
80277+ /* znode's tree level */
80278+ __u16 level;
80279+ /* number of items in this node. This field is modified by node
80280+ * plugin. */
80281+ __u16 nr_items;
80282+
80283+#if REISER4_DEBUG
80284+ void *creator;
80285+ reiser4_key first_key;
80286+ unsigned long times_locked;
80287+ int left_version; /* when node->left was updated */
80288+ int right_version; /* when node->right was updated */
80289+ int ld_key_version; /* when node->ld_key was updated */
80290+ int rd_key_version; /* when node->rd_key was updated */
80291+#endif
80292+
80293+} __attribute__ ((aligned(16)));
80294+
80295+ON_DEBUG(extern atomic_t delim_key_version;
80296+ )
80297+
80298+/* In general I think these macros should not be exposed. */
80299+#define znode_is_locked(node) (lock_is_locked(&node->lock))
80300+#define znode_is_rlocked(node) (lock_is_rlocked(&node->lock))
80301+#define znode_is_wlocked(node) (lock_is_wlocked(&node->lock))
80302+#define znode_is_wlocked_once(node) (lock_is_wlocked_once(&node->lock))
80303+#define znode_can_be_rlocked(node) (lock_can_be_rlocked(&node->lock))
80304+#define is_lock_compatible(node, mode) (lock_mode_compatible(&node->lock, mode))
80305+/* Macros for accessing the znode state. */
80306+#define ZF_CLR(p,f) JF_CLR (ZJNODE(p), (f))
80307+#define ZF_ISSET(p,f) JF_ISSET(ZJNODE(p), (f))
80308+#define ZF_SET(p,f) JF_SET (ZJNODE(p), (f))
80309+extern znode *zget(reiser4_tree * tree, const reiser4_block_nr * const block,
80310+ znode * parent, tree_level level, gfp_t gfp_flag);
80311+extern znode *zlook(reiser4_tree * tree, const reiser4_block_nr * const block);
80312+extern int zload(znode * node);
80313+extern int zload_ra(znode * node, ra_info_t * info);
80314+extern int zinit_new(znode * node, gfp_t gfp_flags);
80315+extern void zrelse(znode * node);
80316+extern void znode_change_parent(znode * new_parent, reiser4_block_nr * block);
80317+
80318+/* size of data in znode */
80319+static inline unsigned
80320+znode_size(const znode * node UNUSED_ARG /* znode to query */ )
80321+{
80322+ assert("nikita-1416", node != NULL);
80323+ return PAGE_CACHE_SIZE;
80324+}
80325+
80326+extern void parent_coord_to_coord(const parent_coord_t * pcoord,
80327+ coord_t * coord);
80328+extern void coord_to_parent_coord(const coord_t * coord,
80329+ parent_coord_t * pcoord);
80330+extern void init_parent_coord(parent_coord_t * pcoord, const znode * node);
80331+
80332+extern unsigned znode_free_space(znode * node);
80333+
80334+extern reiser4_key *znode_get_rd_key(znode * node);
80335+extern reiser4_key *znode_get_ld_key(znode * node);
80336+
80337+extern reiser4_key *znode_set_rd_key(znode * node, const reiser4_key * key);
80338+extern reiser4_key *znode_set_ld_key(znode * node, const reiser4_key * key);
80339+
80340+/* `connected' state checks */
80341+static inline int znode_is_right_connected(const znode * node)
80342+{
80343+ return ZF_ISSET(node, JNODE_RIGHT_CONNECTED);
80344+}
80345+
80346+static inline int znode_is_left_connected(const znode * node)
80347+{
80348+ return ZF_ISSET(node, JNODE_LEFT_CONNECTED);
80349+}
80350+
80351+static inline int znode_is_connected(const znode * node)
80352+{
80353+ return znode_is_right_connected(node) && znode_is_left_connected(node);
80354+}
80355+
80356+extern int znode_shift_order;
80357+extern int znode_rehash(znode * node, const reiser4_block_nr * new_block_nr);
80358+extern void znode_remove(znode *, reiser4_tree *);
80359+extern znode *znode_parent(const znode * node);
80360+extern znode *znode_parent_nolock(const znode * node);
80361+extern int znode_above_root(const znode * node);
80362+extern int init_znodes(void);
80363+extern void done_znodes(void);
80364+extern int znodes_tree_init(reiser4_tree * ztree);
80365+extern void znodes_tree_done(reiser4_tree * ztree);
80366+extern int znode_contains_key(znode * node, const reiser4_key * key);
80367+extern int znode_contains_key_lock(znode * node, const reiser4_key * key);
80368+extern unsigned znode_save_free_space(znode * node);
80369+extern unsigned znode_recover_free_space(znode * node);
80370+extern znode *zalloc(gfp_t gfp_flag);
80371+extern void zinit(znode *, const znode * parent, reiser4_tree *);
80372+extern int zparse(znode * node);
80373+
80374+
80375+extern int znode_just_created(const znode * node);
80376+
80377+extern void zfree(znode * node);
80378+
80379+#if REISER4_DEBUG
80380+extern void print_znode(const char *prefix, const znode * node);
80381+#else
80382+#define print_znode( p, n ) noop
80383+#endif
80384+
80385+/* Make it look like various znode functions exist instead of treating znodes as
80386+ jnodes in znode-specific code. */
80387+#define znode_page(x) jnode_page ( ZJNODE(x) )
80388+#define zdata(x) jdata ( ZJNODE(x) )
80389+#define znode_get_block(x) jnode_get_block ( ZJNODE(x) )
80390+#define znode_created(x) jnode_created ( ZJNODE(x) )
80391+#define znode_set_created(x) jnode_set_created ( ZJNODE(x) )
80392+#define znode_convertible(x) jnode_convertible (ZJNODE(x))
80393+#define znode_set_convertible(x) jnode_set_convertible (ZJNODE(x))
80394+
80395+#define znode_is_dirty(x) jnode_is_dirty ( ZJNODE(x) )
80396+#define znode_check_dirty(x) jnode_check_dirty ( ZJNODE(x) )
80397+#define znode_make_clean(x) jnode_make_clean ( ZJNODE(x) )
80398+#define znode_set_block(x, b) jnode_set_block ( ZJNODE(x), (b) )
80399+
80400+#define spin_lock_znode(x) spin_lock_jnode ( ZJNODE(x) )
80401+#define spin_unlock_znode(x) spin_unlock_jnode ( ZJNODE(x) )
80402+#define spin_trylock_znode(x) spin_trylock_jnode ( ZJNODE(x) )
80403+#define spin_znode_is_locked(x) spin_jnode_is_locked ( ZJNODE(x) )
80404+#define spin_znode_is_not_locked(x) spin_jnode_is_not_locked ( ZJNODE(x) )
80405+
80406+#if REISER4_DEBUG
80407+extern int znode_x_count_is_protected(const znode * node);
80408+extern int znode_invariant(znode * node);
80409+#endif
80410+
80411+/* acquire reference to @node */
80412+static inline znode *zref(znode * node)
80413+{
80414+ /* change of x_count from 0 to 1 is protected by tree spin-lock */
80415+ return JZNODE(jref(ZJNODE(node)));
80416+}
80417+
80418+/* release reference to @node */
80419+static inline void zput(znode * node)
80420+{
80421+ assert("nikita-3564", znode_invariant(node));
80422+ jput(ZJNODE(node));
80423+}
80424+
80425+/* get the level field for a znode */
80426+static inline tree_level znode_get_level(const znode * node)
80427+{
80428+ return node->level;
80429+}
80430+
80431+/* get the level field for a jnode */
80432+static inline tree_level jnode_get_level(const jnode * node)
80433+{
80434+ if (jnode_is_znode(node))
80435+ return znode_get_level(JZNODE(node));
80436+ else
80437+ /* unformatted nodes are all at the LEAF_LEVEL and for
80438+ "semi-formatted" nodes like bitmaps, level doesn't matter. */
80439+ return LEAF_LEVEL;
80440+}
80441+
80442+/* true if jnode is on leaf level */
80443+static inline int jnode_is_leaf(const jnode * node)
80444+{
80445+ if (jnode_is_znode(node))
80446+ return (znode_get_level(JZNODE(node)) == LEAF_LEVEL);
80447+ if (jnode_get_type(node) == JNODE_UNFORMATTED_BLOCK)
80448+ return 1;
80449+ return 0;
80450+}
80451+
80452+/* return znode's tree */
80453+static inline reiser4_tree *znode_get_tree(const znode * node)
80454+{
80455+ assert("nikita-2692", node != NULL);
80456+ return jnode_get_tree(ZJNODE(node));
80457+}
80458+
80459+/* resolve race with zput */
80460+static inline znode *znode_rip_check(reiser4_tree * tree, znode * node)
80461+{
80462+ jnode *j;
80463+
80464+ j = jnode_rip_sync(tree, ZJNODE(node));
80465+ if (likely(j != NULL))
80466+ node = JZNODE(j);
80467+ else
80468+ node = NULL;
80469+ return node;
80470+}
80471+
80472+#if defined(REISER4_DEBUG)
80473+int znode_is_loaded(const znode * node /* znode to query */ );
80474+#endif
80475+
80476+extern __u64 znode_build_version(reiser4_tree * tree);
80477+
80478+/* Data-handles. A data handle object manages pairing calls to zload() and zrelse(). We
80479+ must load the data for a node in many places. We could do this by simply calling
80480+ zload() everywhere; the difficulty arises when we must release the loaded data by
80481+ calling zrelse(). In a function with many possible error/return paths, it requires extra
80482+ work to figure out which exit paths must call zrelse() and which do not. The data
80483+ handle automatically calls zrelse() for every zload() that it is responsible for. In that
80484+ sense, it acts much like a lock_handle.
80485+*/
80486+typedef struct load_count {
80487+ znode *node;
80488+ int d_ref;
80489+} load_count;
80490+
80491+extern void init_load_count(load_count * lc); /* Initialize a load_count; set the current node to NULL. */
80492+extern void done_load_count(load_count * dh); /* Finalize a load_count: call zrelse() if necessary */
80493+extern int incr_load_count_znode(load_count * dh, znode * node); /* Set the argument znode to the current node, call zload(). */
80494+extern int incr_load_count_jnode(load_count * dh, jnode * node); /* If the argument jnode is formatted, do the same as
80495+ * incr_load_count_znode, otherwise do nothing (unformatted nodes
80496+ * don't require zload/zrelse treatment). */
80497+extern void move_load_count(load_count * new, load_count * old); /* Move the contents of a load_count. Old handle is released. */
80498+extern void copy_load_count(load_count * new, load_count * old); /* Copy the contents of a load_count. Old handle remains held. */
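+
+/* A hedged usage sketch (added for illustration): the data handle lets every
+ exit path share one release call instead of tracking zload()/zrelse() pairs
+ by hand:
+
+ load_count lh;
+ init_load_count(&lh);
+ ret = incr_load_count_znode(&lh, node); // calls zload(node)
+ if (ret == 0)
+ ... use zdata(node) ...
+ done_load_count(&lh); // calls zrelse() once per successful zload
+*/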
80499+
80500+/* Variable initializers for load_count. */
80501+#define INIT_LOAD_COUNT ( load_count * ){ .node = NULL, .d_ref = 0 }
80502+#define INIT_LOAD_COUNT_NODE( n ) ( load_count ){ .node = ( n ), .d_ref = 0 }
80503+/* A convenience macro for use in assertions or debug-only code, where loaded
80504+ data is only required to perform the debugging check. This macro
80505+ encapsulates an expression inside a pair of calls to zload()/zrelse(). */
80506+#define WITH_DATA( node, exp ) \
80507+({ \
80508+ long __with_dh_result; \
80509+ znode *__with_dh_node; \
80510+ \
80511+ __with_dh_node = ( node ); \
80512+ __with_dh_result = zload( __with_dh_node ); \
80513+ if( __with_dh_result == 0 ) { \
80514+ __with_dh_result = ( long )( exp ); \
80515+ zrelse( __with_dh_node ); \
80516+ } \
80517+ __with_dh_result; \
80518+})
80519+
80520+/* Same as above, but accepts a return value in case zload fails. */
80521+#define WITH_DATA_RET( node, ret, exp ) \
80522+({ \
80523+ int __with_dh_result; \
80524+ znode *__with_dh_node; \
80525+ \
80526+ __with_dh_node = ( node ); \
80527+ __with_dh_result = zload( __with_dh_node ); \
80528+ if( __with_dh_result == 0 ) { \
80529+ __with_dh_result = ( int )( exp ); \
80530+ zrelse( __with_dh_node ); \
80531+ } else \
80532+ __with_dh_result = ( ret ); \
80533+ __with_dh_result; \
80534+})
80535+
80536+#define WITH_COORD(coord, exp) \
80537+({ \
80538+ coord_t *__coord; \
80539+ \
80540+ __coord = (coord); \
80541+ coord_clear_iplug(__coord); \
80542+ WITH_DATA(__coord->node, exp); \
80543+})
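+
+/* Example (illustrative only; node_check() stands for some hypothetical
+ predicate that needs the node data loaded):
+
+ assert("xxx-1", WITH_DATA(node, node_check(node)));
+*/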
80544+
80545+#if REISER4_DEBUG
80546+#define STORE_COUNTERS \
80547+ lock_counters_info __entry_counters = *lock_counters()
80548+#define CHECK_COUNTERS \
80549+ON_DEBUG_CONTEXT( \
80550+({ \
80551+ __entry_counters.x_refs = lock_counters() -> x_refs; \
80552+ __entry_counters.t_refs = lock_counters() -> t_refs; \
80553+ __entry_counters.d_refs = lock_counters() -> d_refs; \
80554+ assert("nikita-2159", \
80555+ !memcmp(&__entry_counters, lock_counters(), \
80556+ sizeof __entry_counters)); \
80557+}) )
80558+
80559+#else
80560+#define STORE_COUNTERS
80561+#define CHECK_COUNTERS noop
80562+#endif
80563+
80564+/* __ZNODE_H__ */
80565+#endif
80566+
80567+/* Make Linus happy.
80568+ Local variables:
80569+ c-indentation-style: "K&R"
80570+ mode-name: "LC"
80571+ c-basic-offset: 8
80572+ tab-width: 8
80573+ fill-column: 120
80574+ End:
80575+*/
80576Index: linux-2.6.16/include/linux/fs.h
80577===================================================================
80578--- linux-2.6.16.orig/include/linux/fs.h
80579+++ linux-2.6.16/include/linux/fs.h
80580@@ -1085,6 +1085,8 @@ struct super_operations {
80581 void (*clear_inode) (struct inode *);
80582 void (*umount_begin) (struct super_block *);
80583
80584+ void (*sync_inodes) (struct super_block *sb,
80585+ struct writeback_control *wbc);
80586 int (*show_options)(struct seq_file *, struct vfsmount *);
80587
80588 ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t);
80589@@ -1449,6 +1451,7 @@ extern int invalidate_inode_pages2(struc
80590 extern int invalidate_inode_pages2_range(struct address_space *mapping,
80591 pgoff_t start, pgoff_t end);
80592 extern int write_inode_now(struct inode *, int);
80593+extern void generic_sync_sb_inodes(struct super_block *, struct writeback_control *);
80594 extern int filemap_fdatawrite(struct address_space *);
80595 extern int filemap_flush(struct address_space *);
80596 extern int filemap_fdatawait(struct address_space *);
80597Index: linux-2.6.16/lib/radix-tree.c
80598===================================================================
80599--- linux-2.6.16.orig/lib/radix-tree.c
80600+++ linux-2.6.16/lib/radix-tree.c
80601@@ -139,6 +139,7 @@ static inline void tag_set(struct radix_
80602 {
80603 __set_bit(offset, node->tags[tag]);
80604 }
80605+EXPORT_SYMBOL(radix_tree_preload);
80606
80607 static inline void tag_clear(struct radix_tree_node *node, int tag, int offset)
80608 {
80609Index: linux-2.6.16/mm/filemap.c
80610===================================================================
80611--- linux-2.6.16.orig/mm/filemap.c
80612+++ linux-2.6.16/mm/filemap.c
80613@@ -119,6 +119,7 @@ void __remove_from_page_cache(struct pag
80614 mapping->nrpages--;
80615 pagecache_acct(-1);
80616 }
80617+EXPORT_SYMBOL(__remove_from_page_cache);
80618
80619 void remove_from_page_cache(struct page *page)
80620 {
80621@@ -130,6 +131,7 @@ void remove_from_page_cache(struct page
80622 __remove_from_page_cache(page);
80623 write_unlock_irq(&mapping->tree_lock);
80624 }
80625+EXPORT_SYMBOL(remove_from_page_cache);
80626
80627 static int sync_page(void *word)
80628 {
80629@@ -272,6 +274,7 @@ static int wait_on_page_writeback_range(
80630
80631 return ret;
80632 }
80633+EXPORT_SYMBOL(add_to_page_cache_lru);
80634
80635 /*
80636 * Write and wait upon all the pages in the passed range. This is a "data
80637@@ -632,7 +635,6 @@ repeat:
80638 page_cache_release(cached_page);
80639 return page;
80640 }
80641-
80642 EXPORT_SYMBOL(find_or_create_page);
80643
80644 /**
80645@@ -665,6 +667,7 @@ unsigned find_get_pages(struct address_s
80646 read_unlock_irq(&mapping->tree_lock);
80647 return ret;
80648 }
80649+EXPORT_SYMBOL(find_get_pages);
80650
80651 /*
80652 * Like find_get_pages, except we only return pages which are tagged with
80653@@ -686,6 +689,7 @@ unsigned find_get_pages_tag(struct addre
80654 read_unlock_irq(&mapping->tree_lock);
80655 return ret;
80656 }
80657+EXPORT_SYMBOL(find_get_pages_tag);
80658
80659 /*
80660 * Same as grab_cache_page, but do not wait if the page is unavailable.
80661Index: linux-2.6.16/mm/page-writeback.c
80662===================================================================
80663--- linux-2.6.16.orig/mm/page-writeback.c
80664+++ linux-2.6.16/mm/page-writeback.c
80665@@ -187,7 +187,7 @@ get_dirty_limits(struct writeback_state
80666 * If we're over `background_thresh' then pdflush is woken to perform some
80667 * writeout.
80668 */
80669-static void balance_dirty_pages(struct address_space *mapping)
80670+void balance_dirty_pages(struct address_space *mapping)
80671 {
80672 struct writeback_state wbs;
80673 long nr_reclaimable;
80674@@ -253,6 +253,7 @@ static void balance_dirty_pages(struct a
80675 (!laptop_mode && (nr_reclaimable > background_thresh)))
80676 pdflush_operation(background_writeout, 0);
80677 }
80678+EXPORT_SYMBOL(balance_dirty_pages);
80679
80680 /**
80681 * balance_dirty_pages_ratelimited - balance dirty memory state
80682Index: linux-2.6.16/mm/readahead.c
80683===================================================================
80684--- linux-2.6.16.orig/mm/readahead.c
80685+++ linux-2.6.16/mm/readahead.c
80686@@ -541,6 +541,7 @@ page_cache_readahead(struct address_spac
80687 out:
80688 return ra->prev_page + 1;
80689 }
80690+EXPORT_SYMBOL_GPL(page_cache_readahead);
80691
80692 /*
80693 * handle_ra_miss() is called when it is known that a page which should have
80694@@ -558,6 +559,7 @@ void handle_ra_miss(struct address_space
80695 ra->flags &= ~RA_FLAG_INCACHE;
80696 ra->cache_hit = 0;
80697 }
80698+EXPORT_SYMBOL_GPL(handle_ra_miss);
80699
80700 /*
80701 * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a