1 Index: linux-2.6.16/Documentation/Changes
2 ===================================================================
3 --- linux-2.6.16.orig/Documentation/Changes
4 +++ linux-2.6.16/Documentation/Changes
5 @@ -54,6 +54,7 @@ o module-init-tools 0.9.10
6 o e2fsprogs 1.29 # tune2fs
7 o jfsutils 1.1.3 # fsck.jfs -V
8 o reiserfsprogs 3.6.3 # reiserfsck -V 2>&1|grep reiserfsprogs
9 +o reiser4progs 1.0.0 # fsck.reiser4 -V
10 o xfsprogs 2.6.0 # xfs_db -V
11 o pcmciautils 004
12 o pcmcia-cs 3.1.21 # cardmgr -V
13 @@ -163,6 +164,13 @@ The reiserfsprogs package should be used
14 versions of mkreiserfs, resize_reiserfs, debugreiserfs and
15 reiserfsck. These utils work on both i386 and alpha platforms.
16
17 +Reiser4progs
18 +------------
19 +
20 +The reiser4progs package contains utilities for the reiser4 file system.
21 +Detailed instructions are provided in the README file located at:
22 +<ftp://ftp.namesys.com/pub/reiser4progs/README>.
23 +
24 Xfsprogs
25 --------
26
27 @@ -344,6 +352,10 @@ Reiserfsprogs
28 -------------
29 o <http://www.namesys.com/pub/reiserfsprogs/reiserfsprogs-3.6.3.tar.gz>
30
31 +Reiser4progs
32 +------------
33 +o <ftp://ftp.namesys.com/pub/reiser4progs/>
34 +
35 Xfsprogs
36 --------
37 o <ftp://oss.sgi.com/projects/xfs/download/>
38 Index: linux-2.6.16/Documentation/filesystems/reiser4.txt
39 ===================================================================
40 --- /dev/null
41 +++ linux-2.6.16/Documentation/filesystems/reiser4.txt
42 @@ -0,0 +1,75 @@
43 +Reiser4 filesystem
44 +==================
45 +Reiser4 is a file system based on dancing tree algorithms, and is
46 +described at http://www.namesys.com
47 +
48 +
49 +References
50 +==========
51 +web page http://namesys.com/v4/v4.html
52 +source code ftp://ftp.namesys.com/pub/reiser4-for-2.6/
53 +userland tools ftp://ftp.namesys.com/pub/reiser4progs/
54 +install page http://www.namesys.com/install_v4.html
55 +
56 +Compile options
57 +===============
58 +Enable reiser4 debug mode
59 + This checks everything imaginable while reiser4
60 + runs
61 +
62 +Mount options
63 +=============
64 +tmgr.atom_max_size=N
65 + Atoms containing more than N blocks will be forced to commit.
66 + N is decimal.
67 + Default is nr_free_pagecache_pages() / 2 at mount time.
68 +
69 +tmgr.atom_max_age=N
70 + Atoms older than N seconds will be forced to commit. N is decimal.
71 + Default is 600.
72 +
73 +tmgr.atom_max_flushers=N
74 + Limit of concurrent flushers for one atom. 0 means no limit.
75 + Default is 0.
76 +
77 +tree.cbk_cache.nr_slots=N
78 + Number of slots in the cbk cache.
79 +
80 +flush.relocate_threshold=N
81 + If flush finds more than N adjacent dirty leaf-level blocks it
82 + will force them to be relocated.
83 + Default is 64.
84 +
85 +flush.relocate_distance=N
86 +	If flush can find a block allocation at most N blocks from
87 +	the preceder it will relocate to that position.
88 + Default is 64.
89 +
90 +flush.scan_maxnodes=N
91 + The maximum number of nodes to scan left on a level during
92 + flush.
93 + Default is 10000.
94 +
95 +optimal_io_size=N
96 + Preferred IO size. This value is used to set st_blksize of
97 + struct stat.
98 + Default is 65536.
99 +
100 +bsdgroups
101 + Turn on BSD-style gid assignment.
102 +
103 +32bittimes
104 +	By default files in reiser4 have 64 bit timestamps. Files
105 +	created when the filesystem is mounted with the 32bittimes mount
106 +	option will get 32 bit timestamps.
107 +
108 +mtflush
109 + Turn off concurrent flushing.
110 +
111 +nopseudo
112 + Disable pseudo files support. See
113 + http://namesys.com/v4/pseudo.html for more about pseudo files.
114 +
115 +dont_load_bitmap
116 +	Don't load all bitmap blocks at mount time; this is useful for
117 +	machines with tiny RAM and large disks.
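
As an illustration (a sketch, not part of the patch), the options above are
passed as a comma-separated string; a C program using the mount(2) system
call might look like this, with /dev/sda1 and /mnt as hypothetical device
and mount point:

	/* Sketch only: device, mount point and option values are examples. */
	#include <sys/mount.h>
	#include <stdio.h>

	int main(void)
	{
		if (mount("/dev/sda1", "/mnt", "reiser4", 0,
			  "tmgr.atom_max_age=300,dont_load_bitmap") != 0) {
			perror("mount");
			return 1;
		}
		return 0;
	}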
118 Index: linux-2.6.16/fs/Kconfig
119 ===================================================================
120 --- linux-2.6.16.orig/fs/Kconfig
121 +++ linux-2.6.16/fs/Kconfig
122 @@ -177,6 +177,8 @@ config FS_MBCACHE
123 default y if EXT2_FS=y || EXT3_FS=y
124 default m if EXT2_FS=m || EXT3_FS=m
125
126 +source "fs/reiser4/Kconfig"
127 +
128 config REISERFS_FS
129 tristate "Reiserfs support"
130 help
131 Index: linux-2.6.16/fs/Makefile
132 ===================================================================
133 --- linux-2.6.16.orig/fs/Makefile
134 +++ linux-2.6.16/fs/Makefile
135 @@ -51,6 +51,7 @@ obj-$(CONFIG_PROFILING) += dcookies.o
136
137 # Do not add any filesystems before this line
138 obj-$(CONFIG_REISERFS_FS) += reiserfs/
139 +obj-$(CONFIG_REISER4_FS) += reiser4/
140 obj-$(CONFIG_EXT3_FS) += ext3/ # Before ext2 so root fs can be ext3
141 obj-$(CONFIG_JBD) += jbd/
142 obj-$(CONFIG_EXT2_FS) += ext2/
143 Index: linux-2.6.16/fs/fs-writeback.c
144 ===================================================================
145 --- linux-2.6.16.orig/fs/fs-writeback.c
146 +++ linux-2.6.16/fs/fs-writeback.c
147 @@ -286,8 +286,6 @@ __writeback_single_inode(struct inode *i
148 * WB_SYNC_HOLD is a hack for sys_sync(): reattach the inode to sb->s_dirty so
149 * that it can be located for waiting on in __writeback_single_inode().
150 *
151 - * Called under inode_lock.
152 - *
153 * If `bdi' is non-zero then we're being asked to writeback a specific queue.
154 * This function assumes that the blockdev superblock's inodes are backed by
155 * a variety of queues, so all inodes are searched. For other superblocks,
156 @@ -303,11 +301,13 @@ __writeback_single_inode(struct inode *i
157 * on the writer throttling path, and we get decent balancing between many
158 * throttled threads: we don't want them all piling up on __wait_on_inode.
159 */
160 -static void
161 -sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
162 +void
163 +generic_sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
164 {
165 const unsigned long start = jiffies; /* livelock avoidance */
166
167 + spin_lock(&inode_lock);
168 +
169 if (!wbc->for_kupdate || list_empty(&sb->s_io))
170 list_splice_init(&sb->s_dirty, &sb->s_io);
171
172 @@ -387,8 +387,19 @@ sync_sb_inodes(struct super_block *sb, s
173 if (wbc->nr_to_write <= 0)
174 break;
175 }
176 + spin_unlock(&inode_lock);
177 return; /* Leave any unwritten inodes on s_io */
178 }
179 +EXPORT_SYMBOL(generic_sync_sb_inodes);
180 +
181 +static void
182 +sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
183 +{
184 + if (sb->s_op->sync_inodes)
185 + sb->s_op->sync_inodes(sb, wbc);
186 + else
187 + generic_sync_sb_inodes(sb, wbc);
188 +}
189
190 /*
191 * Start writeback of dirty pagecache data against all unlocked inodes.
192 @@ -429,11 +440,8 @@ restart:
193 * be unmounted by the time it is released.
194 */
195 if (down_read_trylock(&sb->s_umount)) {
196 - if (sb->s_root) {
197 - spin_lock(&inode_lock);
198 + if (sb->s_root)
199 sync_sb_inodes(sb, wbc);
200 - spin_unlock(&inode_lock);
201 - }
202 up_read(&sb->s_umount);
203 }
204 spin_lock(&sb_lock);
205 @@ -469,9 +477,7 @@ void sync_inodes_sb(struct super_block *
206 (inodes_stat.nr_inodes - inodes_stat.nr_unused) +
207 nr_dirty + nr_unstable;
208 wbc.nr_to_write += wbc.nr_to_write / 2; /* Bit more for luck */
209 - spin_lock(&inode_lock);
210 sync_sb_inodes(sb, &wbc);
211 - spin_unlock(&inode_lock);
212 }
213
214 /*
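
To illustrate the hook added above (a sketch, not part of the patch): a
filesystem can now override per-superblock inode writeback by supplying
->sync_inodes, assuming the corresponding field is added to struct
super_operations elsewhere in this patch; example_sync_inodes and
example_super_ops are hypothetical names, and a real implementation would do
filesystem-specific work around the generic helper:

	#include <linux/fs.h>
	#include <linux/writeback.h>

	/* Hypothetical: take over writeback, then delegate to the
	 * newly exported generic helper. */
	static void example_sync_inodes(struct super_block *sb,
					struct writeback_control *wbc)
	{
		generic_sync_sb_inodes(sb, wbc);
	}

	static struct super_operations example_super_ops = {
		.sync_inodes	= example_sync_inodes,
	};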
215 Index: linux-2.6.16/fs/reiser4/Kconfig
216 ===================================================================
217 --- /dev/null
218 +++ linux-2.6.16/fs/reiser4/Kconfig
219 @@ -0,0 +1,31 @@
220 +config REISER4_FS
221 + tristate "Reiser4 (EXPERIMENTAL)"
222 + depends on EXPERIMENTAL
223 + select ZLIB_INFLATE
224 + select ZLIB_DEFLATE
225 + help
226 + Reiser4 is a filesystem that performs all filesystem operations
227 + as atomic transactions, which means that it either performs a
228 + write, or it does not, and in the event of a crash it does not
229 + partially perform it or corrupt it.
230 +
231 + It stores files in dancing trees, which are like balanced trees but
232 + faster. It packs small files together so that they share blocks
233 + without wasting space. This means you can use it to store really
234 + small files. It also means that it saves you disk space. It avoids
235 + hassling you with anachronisms like having a maximum number of
236 + inodes, and wasting space if you use less than that number.
237 +
238 + Reiser4 is a distinct filesystem type from reiserfs (V3).
239 + It's therefore not possible to use reiserfs file systems
240 + with reiser4.
241 +
242 + To learn more about reiser4, go to http://www.namesys.com
243 +
244 +config REISER4_DEBUG
245 + bool "Enable reiser4 debug mode"
246 + depends on REISER4_FS
247 + help
248 + Don't use this unless you are debugging reiser4.
249 +
250 + If unsure, say N.
251 Index: linux-2.6.16/fs/reiser4/Makefile
252 ===================================================================
253 --- /dev/null
254 +++ linux-2.6.16/fs/reiser4/Makefile
255 @@ -0,0 +1,100 @@
256 +#
257 +# reiser4/Makefile
258 +#
259 +
260 +obj-$(CONFIG_REISER4_FS) += reiser4.o
261 +
262 +reiser4-y := \
263 + debug.o \
264 + jnode.o \
265 + znode.o \
266 + key.o \
267 + pool.o \
268 + tree_mod.o \
269 + estimate.o \
270 + carry.o \
271 + carry_ops.o \
272 + lock.o \
273 + tree.o \
274 + context.o \
275 + tap.o \
276 + coord.o \
277 + block_alloc.o \
278 + txnmgr.o \
279 + kassign.o \
280 + flush.o \
281 + wander.o \
282 + eottl.o \
283 + search.o \
284 + page_cache.o \
285 + seal.o \
286 + dscale.o \
287 + flush_queue.o \
288 + ktxnmgrd.o \
289 + blocknrset.o \
290 + super.o \
291 + super_ops.o \
292 + fsdata.o \
293 + export_ops.o \
294 + oid.o \
295 + tree_walk.o \
296 + inode.o \
297 + vfs_ops.o \
298 + as_ops.o \
299 + entd.o\
300 + readahead.o \
301 + status_flags.o \
302 + init_super.o \
303 + safe_link.o \
304 + \
305 + plugin/plugin.o \
306 + plugin/plugin_set.o \
307 + plugin/node/node.o \
308 + plugin/object.o \
309 + plugin/cluster.o \
310 + plugin/inode_ops.o \
311 + plugin/inode_ops_rename.o \
312 + plugin/file_ops.o \
313 + plugin/file_ops_readdir.o \
314 + plugin/file_plugin_common.o \
315 + plugin/file/file.o \
316 + plugin/file/tail_conversion.o \
317 + plugin/file/symlink.o \
318 + plugin/file/cryptcompress.o \
319 + plugin/dir_plugin_common.o \
320 + plugin/dir/hashed_dir.o \
321 + plugin/dir/seekable_dir.o \
322 + plugin/node/node40.o \
323 + \
324 + plugin/crypto/cipher.o \
325 + plugin/crypto/digest.o \
326 + \
327 + plugin/compress/minilzo.o \
328 + plugin/compress/compress.o \
329 + plugin/compress/compress_mode.o \
330 + \
331 + plugin/item/static_stat.o \
332 + plugin/item/sde.o \
333 + plugin/item/cde.o \
334 + plugin/item/blackbox.o \
335 + plugin/item/internal.o \
336 + plugin/item/tail.o \
337 + plugin/item/ctail.o \
338 + plugin/item/extent.o \
339 + plugin/item/extent_item_ops.o \
340 + plugin/item/extent_file_ops.o \
341 + plugin/item/extent_flush_ops.o \
342 + \
343 + plugin/hash.o \
344 + plugin/fibration.o \
345 + plugin/tail_policy.o \
346 + plugin/item/item.o \
347 + \
348 + plugin/security/perm.o \
349 + plugin/space/bitmap.o \
350 + \
351 + plugin/disk_format/disk_format40.o \
352 + plugin/disk_format/disk_format.o \
353 + \
354 + plugin/regular.o
355 +
356 Index: linux-2.6.16/fs/reiser4/README
357 ===================================================================
358 --- /dev/null
359 +++ linux-2.6.16/fs/reiser4/README
360 @@ -0,0 +1,125 @@
361 +[LICENSING]
362 +
363 +Reiser4 is hereby licensed under the GNU General
364 +Public License version 2.
365 +
366 +Source code files that contain the phrase "licensing governed by
367 +reiser4/README" are "governed files" throughout this file. Governed
368 +files are licensed under the GPL. The portions of them owned by Hans
369 +Reiser, or authorized to be licensed by him, have been in the past,
370 +and likely will be in the future, licensed to other parties under
371 +other licenses. If you add your code to governed files, and don't
372 +want it to be owned by Hans Reiser, put your copyright label on that
373 +code so the poor blight and his customers can keep things straight.
374 +All portions of governed files not labeled otherwise are owned by Hans
375 +Reiser, and by adding your code to it, widely distributing it to
376 +others or sending us a patch, and leaving the sentence in stating that
377 +licensing is governed by the statement in this file, you accept this.
378 +It will be a kindness if you identify whether Hans Reiser is allowed
379 +to license code labeled as owned by you on your behalf other than
380 +under the GPL, because he wants to know if it is okay to do so and put
381 +a check in the mail to you (for non-trivial improvements) when he
382 +makes his next sale. He makes no guarantees as to the amount if any,
383 +though he feels motivated to motivate contributors, and you can surely
384 +discuss this with him before or after contributing. You have the
385 +right to decline to allow him to license your code contribution other
386 +than under the GPL.
387 +
388 +Further licensing options are available for commercial and/or other
389 +interests directly from Hans Reiser: reiser@namesys.com. If you interpret
390 +the GPL as not allowing those additional licensing options, you read
391 +it wrongly, and Richard Stallman agrees with me, when carefully read
392 +you can see that those restrictions on additional terms do not apply
393 +to the owner of the copyright, and my interpretation of this shall
394 +govern for this license.
395 +
396 +[END LICENSING]
397 +
398 +Reiser4 is a file system based on dancing tree algorithms, and is
399 +described at http://www.namesys.com
400 +
401 +mkfs.reiser4 and other utilities are on our webpage or wherever your
402 +Linux provider put them. You really want to be running the latest
403 +version off the website if you use fsck.
404 +
405 +Yes, if you update your reiser4 kernel module you do have to
406 +recompile your kernel, most of the time. The errors you get will be
407 +quite cryptic if you forget to do so.
408 +
409 +Hideous Commercial Pitch: Spread your development costs across other OS
410 +vendors. Select from the best in the world, not the best in your
411 +building, by buying from third party OS component suppliers. Leverage
412 +the software component development power of the internet. Be the most
413 +aggressive in taking advantage of the commercial possibilities of
414 +decentralized internet development, and add value through your branded
415 +integration that you sell as an operating system. Let your competitors
416 +be the ones to compete against the entire internet by themselves. Be
417 +hip, get with the new economic trend, before your competitors do. Send
418 +email to reiser@namesys.com
419 +
420 +Hans Reiser was the primary architect of Reiser4, but a whole team
421 +chipped their ideas in. He invested everything he had into Namesys
422 +for 5.5 dark years of no money before Reiser3 finally started to work well
423 +enough to bring in money. He owns the copyright.
424 +
425 +DARPA was the primary sponsor of Reiser4. DARPA does not endorse
426 +Reiser4, it merely sponsors it. DARPA is, in solely Hans's personal
427 +opinion, unique in its willingness to invest into things more
428 +theoretical than the VC community can readily understand, and more
429 +longterm than allows them to be sure that they will be the ones to
430 +extract the economic benefits from. DARPA also integrated us into a
431 +security community that transformed our security worldview.
432 +
433 +Vladimir Saveliev is our lead programmer, with us from the beginning,
434 +and he worked long hours writing the cleanest code. This is why he is
435 +now the lead programmer after years of commitment to our work. He
436 +always made the effort to be the best he could be, and to make his
437 +code the best that it could be. What resulted was quite remarkable. I
438 +don't think that money can ever motivate someone to work the way he
439 +did; he is one of the most selfless men I know.
440 +
441 +Alexander Lyamin was our sysadmin, and helped to educate us in
442 +security issues. Moscow State University and IMT were very generous
443 +in the internet access they provided us, and in lots of other little
444 +ways that a generous institution can be.
445 +
446 +Alexander Zarochentcev (sometimes known as zam, or sasha) wrote the
447 +locking code, the block allocator, and finished the flushing code.
448 +His code is always crystal clean and well structured.
449 +
450 +Nikita Danilov wrote the core of the balancing code, the core of the
451 +plugins code, and the directory code. He worked at a steady pace of long
452 +hours that produced a whole lot of well abstracted code. He is our
453 +senior computer scientist.
454 +
455 +Vladimir Demidov wrote the parser. Writing an in-kernel parser is
456 +something very few persons have the skills for, and it is thanks to
457 +him that we can say that the parser is really not so big compared to
458 +various bits of our other code, and making a parser work in the kernel
459 +was not as complicated as everyone would imagine, mainly because it was
460 +him doing it...
461 +
462 +Joshua McDonald wrote the transaction manager, and the flush code.
463 +The flush code unexpectedly turned out to be extremely hairy for reasons
464 +you can read about on our web page, and he did a great job on an
465 +extremely difficult task.
466 +
467 +Nina Reiser handled our accounting, government relations, and much
468 +more.
469 +
470 +Ramon Reiser developed our website.
471 +
472 +Beverly Palmer drew our graphics.
473 +
474 +Vitaly Fertman developed librepair, userspace plugins repair code, fsck
475 +and worked with Umka on developing libreiser4 and userspace plugins.
476 +
477 +Yury Umanets (aka Umka) developed libreiser4, userspace plugins and
478 +userspace tools (reiser4progs).
479 +
480 +Oleg Drokin (aka Green) is the release manager who fixes everything.
481 +It is so nice to have someone like that on the team. He (plus Chris
482 +and Jeff) make it possible for the entire rest of the Namesys team to
483 +focus on Reiser4, and he fixed a whole lot of Reiser4 bugs also. It
484 +is just amazing to watch his talent for spotting bugs in action.
485 +
486 Index: linux-2.6.16/fs/reiser4/as_ops.c
487 ===================================================================
488 --- /dev/null
489 +++ linux-2.6.16/fs/reiser4/as_ops.c
490 @@ -0,0 +1,392 @@
491 +/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */
492 +
493 +/* Interface to VFS. Reiser4 address_space_operations are defined here. */
494 +
495 +#include "forward.h"
496 +#include "debug.h"
497 +#include "dformat.h"
498 +#include "coord.h"
499 +#include "plugin/item/item.h"
500 +#include "plugin/file/file.h"
501 +#include "plugin/security/perm.h"
502 +#include "plugin/disk_format/disk_format.h"
503 +#include "plugin/plugin.h"
504 +#include "plugin/plugin_set.h"
505 +#include "plugin/object.h"
506 +#include "txnmgr.h"
507 +#include "jnode.h"
508 +#include "znode.h"
509 +#include "block_alloc.h"
510 +#include "tree.h"
511 +#include "vfs_ops.h"
512 +#include "inode.h"
513 +#include "page_cache.h"
514 +#include "ktxnmgrd.h"
515 +#include "super.h"
516 +#include "reiser4.h"
517 +#include "entd.h"
518 +
519 +#include <linux/profile.h>
520 +#include <linux/types.h>
521 +#include <linux/mount.h>
522 +#include <linux/vfs.h>
523 +#include <linux/mm.h>
524 +#include <linux/buffer_head.h>
525 +#include <linux/dcache.h>
526 +#include <linux/list.h>
527 +#include <linux/pagemap.h>
528 +#include <linux/slab.h>
529 +#include <linux/seq_file.h>
530 +#include <linux/init.h>
531 +#include <linux/module.h>
532 +#include <linux/writeback.h>
533 +#include <linux/backing-dev.h>
534 +#include <linux/quotaops.h>
535 +#include <linux/security.h>
536 +
537 +/* address space operations */
538 +
539 +/**
540 + * reiser4_set_page_dirty - set dirty bit, tag in page tree, dirty accounting
541 + * @page: page to be dirtied
542 + *
543 + * Operation of struct address_space_operations. This implementation is used by
544 + * unix and crc file plugins.
545 + *
546 + * This is called when a reiser4 page gets dirtied outside of reiser4, for
547 + * example, when the dirty bit is moved from the pte to the physical page.
548 + *
549 + * Tags the page in the mapping's page tree with a special tag so that it is
550 + * possible to do all the reiser4 specific work wrt dirty pages (jnode
551 + * creation, capturing by an atom) later, because it cannot be done in the
552 + * contexts where set_page_dirty is called.
553 + */
554 +int reiser4_set_page_dirty(struct page *page)
555 +{
556 + /* this page can be unformatted only */
557 + assert("vs-1734", (page->mapping &&
558 + page->mapping->host &&
559 + get_super_fake(page->mapping->host->i_sb) !=
560 + page->mapping->host
561 + && get_cc_fake(page->mapping->host->i_sb) !=
562 + page->mapping->host
563 + && get_bitmap_fake(page->mapping->host->i_sb) !=
564 + page->mapping->host));
565 +
566 + if (!TestSetPageDirty(page)) {
567 + struct address_space *mapping = page->mapping;
568 +
569 + if (mapping) {
570 + write_lock_irq(&mapping->tree_lock);
571 +
572 + /* check for race with truncate */
573 + if (page->mapping) {
574 + assert("vs-1652", page->mapping == mapping);
575 + if (mapping_cap_account_dirty(mapping))
576 + inc_page_state(nr_dirty);
577 + radix_tree_tag_set(&mapping->page_tree,
578 + page->index,
579 + PAGECACHE_TAG_REISER4_MOVED);
580 + }
581 + write_unlock_irq(&mapping->tree_lock);
582 + __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
583 + }
584 + }
585 + return 0;
586 +}
587 +
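A sketch (not part of the patch) of how the tag set above can later be
consumed; radix_tree_gang_lookup_tag() is the stock 2.6.16 radix-tree API,
while scan_moved_pages_example is a hypothetical name, and the real deferred
work (jnode creation, atom capture) lives elsewhere in reiser4:

	/* Hypothetical scan: find pages tagged as "moved" so the deferred
	 * reiser4 work can run outside of set_page_dirty context. A real
	 * caller would also take page references before using the pages. */
	static unsigned scan_moved_pages_example(struct address_space *mapping,
						 struct page **pages,
						 unsigned nr)
	{
		unsigned found;

		read_lock_irq(&mapping->tree_lock);
		found = radix_tree_gang_lookup_tag(&mapping->page_tree,
						   (void **)pages, 0, nr,
						   PAGECACHE_TAG_REISER4_MOVED);
		read_unlock_irq(&mapping->tree_lock);
		return found;
	}
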
588 +static int filler(void *vp, struct page *page)
589 +{
590 + return page->mapping->a_ops->readpage(vp, page);
591 +}
592 +
593 +/**
594 + * reiser4_readpages - submit read for a set of pages
595 + * @file: file to read
596 + * @mapping: address space
597 + * @pages: list of pages to submit read for
598 + * @nr_pages: number of pages on the list
599 + *
600 + * Operation of struct address_space_operations. This implementation is used by
601 + * unix and crc file plugins.
602 + *
603 + * Calls read_cache_pages or readpages hook if it is set.
604 + */
605 +int
606 +reiser4_readpages(struct file *file, struct address_space *mapping,
607 + struct list_head *pages, unsigned nr_pages)
608 +{
609 + reiser4_context *ctx;
610 + reiser4_file_fsdata *fsdata;
611 +
612 + ctx = init_context(mapping->host->i_sb);
613 + if (IS_ERR(ctx))
614 + return PTR_ERR(ctx);
615 +
616 + fsdata = reiser4_get_file_fsdata(file);
617 + if (IS_ERR(fsdata)) {
618 + reiser4_exit_context(ctx);
619 + return PTR_ERR(fsdata);
620 + }
621 +
622 + if (fsdata->ra2.readpages)
623 + fsdata->ra2.readpages(mapping, pages, fsdata->ra2.data);
624 + else {
625 + /*
626 +		 * filler (the reiser4 readpage method) may involve a tree
627 +		 * search, which is not allowed when the lock stack is not
628 +		 * clean. If the lock stack is not clean, do nothing.
629 + */
630 + if (lock_stack_isclean(get_current_lock_stack()))
631 + read_cache_pages(mapping, pages, filler, file);
632 + else {
633 + while (!list_empty(pages)) {
634 + struct page *victim;
635 +
636 + victim = list_entry(pages->prev, struct page, lru);
637 + list_del(&victim->lru);
638 + page_cache_release(victim);
639 + }
640 + }
641 + }
642 + reiser4_exit_context(ctx);
643 + return 0;
644 +}
645 +
646 +/* ->invalidatepage method for reiser4 */
647 +
648 +/*
649 + * this is called for each truncated page from
650 + * truncate_inode_pages()->truncate_{complete,partial}_page().
651 + *
652 + * At the moment of call, page is under lock, and outstanding io (if any) has
653 + * completed.
654 + */
655 +
656 +/**
657 + * reiser4_invalidatepage
658 + * @page: page to invalidate
659 + * @offset: starting offset for partial invalidation
660 + *
661 + */
662 +int reiser4_invalidatepage(struct page *page, unsigned long offset)
663 +{
664 + int ret = 0;
665 + reiser4_context *ctx;
666 + struct inode *inode;
667 + jnode *node;
668 +
669 + /*
670 +	 * This is called to truncate a file's page.
671 + *
672 + * Originally, reiser4 implemented truncate in a standard way
673 + * (vmtruncate() calls ->invalidatepage() on all truncated pages
674 + * first, then file system ->truncate() call-back is invoked).
675 + *
676 +	 * This led to the problem where ->invalidatepage() was called on a
677 +	 * page with a jnode that was captured into an atom in the
678 +	 * ASTAGE_PRE_COMMIT stage. That is, truncate was bypassing transactions. To avoid
679 + * this, try_capture_page_to_invalidate() call was added here.
680 + *
681 + * After many troubles with vmtruncate() based truncate (including
682 + * races with flush, tail conversion, etc.) it was re-written in the
683 + * top-to-bottom style: items are killed in cut_tree_object() and
684 + * pages belonging to extent are invalidated in kill_hook_extent(). So
685 + * probably now additional call to capture is not needed here.
686 + */
687 +
688 + assert("nikita-3137", PageLocked(page));
689 + assert("nikita-3138", !PageWriteback(page));
690 + inode = page->mapping->host;
691 +
692 + /*
693 + * ->invalidatepage() should only be called for the unformatted
694 + * jnodes. Destruction of all other types of jnodes is performed
695 + * separately. But, during some corner cases (like handling errors
696 +	 * during mount) it is simpler to let ->invalidatepage() be called on
697 + * them. Check for this, and do nothing.
698 + */
699 + if (get_super_fake(inode->i_sb) == inode)
700 + return 0;
701 + if (get_cc_fake(inode->i_sb) == inode)
702 + return 0;
703 + if (get_bitmap_fake(inode->i_sb) == inode)
704 + return 0;
705 + assert("vs-1426", PagePrivate(page));
706 + assert("vs-1427",
707 + page->mapping == jnode_get_mapping(jnode_by_page(page)));
708 + assert("", jprivate(page) != NULL);
709 + assert("", ergo(inode_file_plugin(inode) !=
710 + file_plugin_by_id(CRC_FILE_PLUGIN_ID), offset == 0));
711 +
712 + ctx = init_context(inode->i_sb);
713 + if (IS_ERR(ctx))
714 + return PTR_ERR(ctx);
715 +
716 + node = jprivate(page);
717 + spin_lock_jnode(node);
718 + if (!(node->state & ((1 << JNODE_DIRTY) | (1<< JNODE_FLUSH_QUEUED) |
719 + (1 << JNODE_WRITEBACK) | (1 << JNODE_OVRWR)))) {
720 +		/* there is no need to capture */
721 + jref(node);
722 + JF_SET(node, JNODE_HEARD_BANSHEE);
723 + page_clear_jnode(page, node);
724 + uncapture_jnode(node);
725 + unhash_unformatted_jnode(node);
726 + jput(node);
727 + reiser4_exit_context(ctx);
728 + return 0;
729 + }
730 + spin_unlock_jnode(node);
731 +
732 + /* capture page being truncated. */
733 + ret = try_capture_page_to_invalidate(page);
734 + if (ret != 0)
735 + warning("nikita-3141", "Cannot capture: %i", ret);
736 +
737 + if (offset == 0) {
738 + /* remove jnode from transaction and detach it from page. */
739 + jref(node);
740 + JF_SET(node, JNODE_HEARD_BANSHEE);
741 + /* page cannot be detached from jnode concurrently, because it
742 + * is locked */
743 + uncapture_page(page);
744 +
745 + /* this detaches page from jnode, so that jdelete will not try
746 + * to lock page which is already locked */
747 + spin_lock_jnode(node);
748 + page_clear_jnode(page, node);
749 + spin_unlock_jnode(node);
750 + unhash_unformatted_jnode(node);
751 +
752 + jput(node);
753 + }
754 +
755 + reiser4_exit_context(ctx);
756 + return 0;
757 +}
758 +
759 +/* helper function called from reiser4_releasepage(). It returns true if a
760 + * jnode can be detached from its page and the page released. */
761 +int jnode_is_releasable(jnode * node /* node to check */ )
762 +{
763 + assert("nikita-2781", node != NULL);
764 + assert_spin_locked(&(node->guard));
765 + assert_spin_locked(&(node->load));
766 +
767 +	/* if some thread is currently using the jnode page, the latter cannot
768 +	 * be detached */
769 + if (atomic_read(&node->d_count) != 0) {
770 + return 0;
771 + }
772 +
773 + assert("vs-1214", !jnode_is_loaded(node));
774 +
775 + /*
776 +	 * can only release a page if a real block number is assigned to it. A
777 +	 * simple check for ->atom wouldn't do, because it is possible for a
778 +	 * node to be clean, not in an atom yet, and still have a fake block
779 +	 * number. For example, a node just created in jinit_new().
780 + */
781 + if (blocknr_is_fake(jnode_get_block(node)))
782 + return 0;
783 +
784 + /*
785 + * pages prepared for write can not be released anyway, so avoid
786 + * detaching jnode from the page
787 + */
788 + if (JF_ISSET(node, JNODE_WRITE_PREPARED))
789 + return 0;
790 +
791 + /*
792 + * dirty jnode cannot be released. It can however be submitted to disk
793 + * as part of early flushing, but only after getting flush-prepped.
794 + */
795 + if (JF_ISSET(node, JNODE_DIRTY))
796 + return 0;
797 +
798 + /* overwrite set is only written by log writer. */
799 + if (JF_ISSET(node, JNODE_OVRWR))
800 + return 0;
801 +
802 + /* jnode is already under writeback */
803 + if (JF_ISSET(node, JNODE_WRITEBACK))
804 + return 0;
805 +
806 + /* don't flush bitmaps or journal records */
807 + if (!jnode_is_znode(node) && !jnode_is_unformatted(node))
808 + return 0;
809 +
810 + return 1;
811 +}
812 +
813 +/*
814 + * ->releasepage method for reiser4
815 + *
816 + * This is called by the VM scanner when it comes across a clean page. We
817 + * check whether the page can really be released (freed, that is) and if so,
818 + * detach the jnode from it and remove the page from the page cache.
819 + *
820 + * The check for releasability is done by the releasable() function.
821 + */
822 +int reiser4_releasepage(struct page *page, gfp_t gfp UNUSED_ARG)
823 +{
824 + jnode *node;
825 +
826 + assert("nikita-2257", PagePrivate(page));
827 + assert("nikita-2259", PageLocked(page));
828 + assert("nikita-2892", !PageWriteback(page));
829 + assert("nikita-3019", schedulable());
830 +
831 + /* NOTE-NIKITA: this can be called in the context of reiser4 call. It
832 +	   is not clear what to do in this case. A lot of deadlocks seem
833 +	   possible. */
834 +
835 + node = jnode_by_page(page);
836 + assert("nikita-2258", node != NULL);
837 + assert("reiser4-4", page->mapping != NULL);
838 + assert("reiser4-5", page->mapping->host != NULL);
839 +
840 + if (PageDirty(page))
841 + return 0;
842 +
843 + if (page_count(page) > 3)
844 + return 0;
845 +
846 + /* releasable() needs jnode lock, because it looks at the jnode fields
847 + * and we need jload_lock here to avoid races with jload(). */
848 + spin_lock_jnode(node);
849 + spin_lock(&(node->load));
850 + if (jnode_is_releasable(node)) {
851 + struct address_space *mapping;
852 +
853 + mapping = page->mapping;
854 + jref(node);
855 + /* there is no need to synchronize against
856 + * jnode_extent_write() here, because pages seen by
857 + * jnode_extent_write() are !releasable(). */
858 + page_clear_jnode(page, node);
859 + spin_unlock(&(node->load));
860 + spin_unlock_jnode(node);
861 +
862 + /* we are under memory pressure so release jnode also. */
863 + jput(node);
864 +
865 + return 1;
866 + } else {
867 + spin_unlock(&(node->load));
868 + spin_unlock_jnode(node);
869 + assert("nikita-3020", schedulable());
870 + return 0;
871 + }
872 +}
873 +
874 +/* Make Linus happy.
875 + Local variables:
876 + c-indentation-style: "K&R"
877 + mode-name: "LC"
878 + c-basic-offset: 8
879 + tab-width: 8
880 + fill-column: 120
881 + End:
882 +*/
883 Index: linux-2.6.16/fs/reiser4/block_alloc.c
884 ===================================================================
885 --- /dev/null
886 +++ linux-2.6.16/fs/reiser4/block_alloc.c
887 @@ -0,0 +1,1139 @@
888 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
889 +
890 +#include "debug.h"
891 +#include "dformat.h"
892 +#include "plugin/plugin.h"
893 +#include "txnmgr.h"
894 +#include "znode.h"
895 +#include "block_alloc.h"
896 +#include "tree.h"
897 +#include "super.h"
898 +
899 +#include <linux/types.h> /* for __u?? */
900 +#include <linux/fs.h> /* for struct super_block */
901 +#include <linux/spinlock.h>
902 +
903 +/* THE REISER4 DISK SPACE RESERVATION SCHEME. */
904 +
905 +/* We need to be able to reserve enough disk space to ensure that an atomic
906 + operation will have enough disk space to flush (see flush.c and
907 + http://namesys.com/v4/v4.html) and commit it once it is started.
908 +
909 + In our design a call for reserving disk space may fail but not an actual
910 + block allocation.
911 +
912 + All free blocks, already allocated blocks, and all kinds of reserved blocks
913 + are counted in different per-fs block counters.
914 +
915 + A reiser4 super block's set of block counters currently is:
916 +
917 + free -- free blocks,
918 + used -- already allocated blocks,
919 +
920 + grabbed -- initially reserved for performing an fs operation, those blocks
921 + are taken from free blocks, then grabbed disk space leaks from grabbed
922 + blocks counter to other counters like "fake allocated", "flush
923 + reserved", "used", the rest of not used grabbed space is returned to
924 + free space at the end of fs operation;
925 +
926 +	fake allocated -- counts all nodes without real disk block numbers assigned;
927 + we have separate accounting for formatted and unformatted
928 + nodes (for easier debugging);
929 +
930 + flush reserved -- disk space needed for flushing and committing an atom.
931 + Each dirty already allocated block could be written as a
932 + part of atom's overwrite set or as a part of atom's
933 +		    relocate set. In both cases one additional block is needed:
934 +		    it is used as a wandered block if we do overwrite, or as a
935 +		    new location for a relocated block.
936 +
937 +   In addition, blocks in some states are counted on a per-thread and per-atom
938 + basis. A reiser4 context has a counter of blocks grabbed by this transaction
939 + and the sb's grabbed blocks counter is a sum of grabbed blocks counter values
940 + of each reiser4 context. Each reiser4 atom has a counter of "flush reserved"
941 + blocks, which are reserved for flush processing and atom commit. */
942 +
943 +/* AN EXAMPLE: suppose we insert a new item into the reiser4 tree. We
944 +   estimate the number of blocks to grab for the most expensive case of
945 +   balancing, when the leaf node we insert into gets split and a new leaf node is allocated.
946 +
947 + So, we need to grab blocks for
948 +
949 +   1) one block for possibly dirtying the node we insert an item into. That
950 +   block would be used for node relocation at flush time or for allocating a
951 +   wandered one, depending on the result (which set, relocate or overwrite,
952 +   the node gets assigned to) of the node's processing by the flush
953 +   algorithm.
954 +
955 +   2) one block for either allocating a new node, or dirtying the right or left
956 +   clean neighbor; only one of these cases may happen.
957 +
958 + VS-FIXME-HANS: why can only one case happen? I would expect to see dirtying of left neighbor, right neighbor, current
959 + node, and creation of new node. have I forgotten something? email me.
960 +
961 + These grabbed blocks are counted in both reiser4 context "grabbed blocks"
962 + counter and in the fs-wide one (both ctx->grabbed_blocks and
963 + sbinfo->blocks_grabbed get incremented by 2), sb's free blocks counter is
964 + decremented by 2.
965 +
966 +   Suppose both blocks were spent for dirtying an already allocated clean
967 +   node (one block went from "grabbed" to "flush reserved") and for new block
968 +   allocation (one block went from "grabbed" to "fake allocated formatted").
969 +
970 +   Inserting a child pointer into the parent node caused the parent node to be
971 +   split; the balancing code takes care of this by grabbing the necessary space
972 +   immediately, calling reiser4_grab with the BA_RESERVED flag set, which means
973 +   "can use the 5% reserved disk space".
974 +
975 + At this moment insertion completes and grabbed blocks (if they were not used)
976 + should be returned to the free space counter.
977 +
978 +   However, the atom life-cycle is not completed. The atom had one "flush
979 +   reserved" block added by our insertion and the new fake allocated node is
980 +   counted as a "fake allocated formatted" one. The atom has to be fully
981 +   processed by flush before commit. Suppose that the flush moved the first,
982 +   already allocated node to the atom's overwrite list; the new fake allocated
983 +   node, obviously, went into the atom's relocate set. The reiser4 flush
984 +   allocates the new node using one unit from the "fake allocated formatted"
985 +   counter; the log writer uses one from "flush reserved" for wandered block
986 +   allocation.
987 +
988 +   And that is not the end. When the wandered block is deallocated after the
989 +   atom gets fully played (see wander.c for a description of the term), the
990 +   disk space occupied by it is returned to free blocks. */
991 +
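As a reading aid (not part of the patch), the transitions described above map
onto the helper functions defined later in this file:

	free           -> grabbed          reiser4_grab_space()
	grabbed        -> fake allocated   grabbed2fake_allocated_{formatted,unformatted}()
	grabbed        -> used             grabbed2used()
	grabbed        -> cluster reserved grabbed2cluster_reserved()
	fake allocated -> used             fake_allocated2used()
	fake allocated -> grabbed          fake_allocated2grabbed()
	flush reserved -> used             flush_reserved2used()
	grabbed        -> free             grabbed2free()
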
992 +/* BLOCK NUMBERS */
993 +
994 +/* Any reiser4 node has a block number assigned to it. We use these numbers for
995 + indexing in hash tables, so if a block has not yet been assigned a location
996 + on disk we need to give it a temporary fake block number.
997 +
998 + Current implementation of reiser4 uses 64-bit integers for block numbers. We
999 + use highest bit in 64-bit block number to distinguish fake and real block
1000 +   numbers. So, only 63 bits may be used for addressing real device
1001 + blocks. That "fake" block numbers space is divided into subspaces of fake
1002 + block numbers for data blocks and for shadow (working) bitmap blocks.
1003 +
1004 + Fake block numbers for data blocks are generated by a cyclic counter, which
1005 +   gets incremented after each real block allocation. We assume that it is
1006 +   impossible to overflow this counter within the life of one transaction.
1007 +
1008 +/* Initialize a blocknr hint. */
1009 +void blocknr_hint_init(reiser4_blocknr_hint * hint)
1010 +{
1011 + memset(hint, 0, sizeof(reiser4_blocknr_hint));
1012 +}
1013 +
1014 +/* Release any resources of a blocknr hint. */
1015 +void blocknr_hint_done(reiser4_blocknr_hint * hint UNUSED_ARG)
1016 +{
1017 + /* No resources should be freed in current blocknr_hint implementation. */
1018 +}
1019 +
1020 +/* see above for explanation of fake block number. */
1021 +/* Audited by: green(2002.06.11) */
1022 +int blocknr_is_fake(const reiser4_block_nr * da)
1023 +{
1024 +	/* The reason for not simply returning the result of the '&' operation
1025 +	   is that while the return value is a (possibly 32bit) int, the
1026 +	   reiser4_block_nr is at least 64 bits long, and the high bit (the only
1027 +	   possible non-zero bit after the masking) would be stripped off
1028 + return (*da & REISER4_FAKE_BLOCKNR_BIT_MASK) ? 1 : 0;
1029 +}
1030 +
1031 +/* Static functions for <reiser4 super block>/<reiser4 context> block counters
1032 +   arithmetic. Mostly, they are isolated so that the same assertions are not
1033 +   coded in several places. */
1034 +static void sub_from_ctx_grabbed(reiser4_context * ctx, __u64 count)
1035 +{
1036 + BUG_ON(ctx->grabbed_blocks < count);
1037 + assert("zam-527", ctx->grabbed_blocks >= count);
1038 + ctx->grabbed_blocks -= count;
1039 +}
1040 +
1041 +static void add_to_ctx_grabbed(reiser4_context * ctx, __u64 count)
1042 +{
1043 + ctx->grabbed_blocks += count;
1044 +}
1045 +
1046 +static void sub_from_sb_grabbed(reiser4_super_info_data * sbinfo, __u64 count)
1047 +{
1048 + assert("zam-525", sbinfo->blocks_grabbed >= count);
1049 + sbinfo->blocks_grabbed -= count;
1050 +}
1051 +
1052 +/* Decrease the counter of block reserved for flush in super block. */
1053 +static void
1054 +sub_from_sb_flush_reserved(reiser4_super_info_data * sbinfo, __u64 count)
1055 +{
1056 + assert("vpf-291", sbinfo->blocks_flush_reserved >= count);
1057 + sbinfo->blocks_flush_reserved -= count;
1058 +}
1059 +
1060 +static void
1061 +sub_from_sb_fake_allocated(reiser4_super_info_data * sbinfo, __u64 count,
1062 + reiser4_ba_flags_t flags)
1063 +{
1064 + if (flags & BA_FORMATTED) {
1065 + assert("zam-806", sbinfo->blocks_fake_allocated >= count);
1066 + sbinfo->blocks_fake_allocated -= count;
1067 + } else {
1068 + assert("zam-528",
1069 + sbinfo->blocks_fake_allocated_unformatted >= count);
1070 + sbinfo->blocks_fake_allocated_unformatted -= count;
1071 + }
1072 +}
1073 +
1074 +static void sub_from_sb_used(reiser4_super_info_data * sbinfo, __u64 count)
1075 +{
1076 + assert("zam-530",
1077 + sbinfo->blocks_used >= count + sbinfo->min_blocks_used);
1078 + sbinfo->blocks_used -= count;
1079 +}
1080 +
1081 +static void
1082 +sub_from_cluster_reserved(reiser4_super_info_data * sbinfo, __u64 count)
1083 +{
1084 + assert("edward-501", sbinfo->blocks_clustered >= count);
1085 + sbinfo->blocks_clustered -= count;
1086 +}
1087 +
1088 +/* Increase the counter of block reserved for flush in atom. */
1089 +static void add_to_atom_flush_reserved_nolock(txn_atom * atom, __u32 count)
1090 +{
1091 + assert("zam-772", atom != NULL);
1092 + assert_spin_locked(&(atom->alock));
1093 + atom->flush_reserved += count;
1094 +}
1095 +
1096 +/* Decrease the counter of block reserved for flush in atom. */
1097 +static void sub_from_atom_flush_reserved_nolock(txn_atom * atom, __u32 count)
1098 +{
1099 + assert("zam-774", atom != NULL);
1100 + assert_spin_locked(&(atom->alock));
1101 + assert("nikita-2790", atom->flush_reserved >= count);
1102 + atom->flush_reserved -= count;
1103 +}
1104 +
1105 +/* super block has 6 counters: free, used, grabbed, fake allocated
1106 + (formatted and unformatted) and flush reserved. Their sum must be
1107 +   the number of blocks on the device. This function checks that. */
1108 +int check_block_counters(const struct super_block *super)
1109 +{
1110 + __u64 sum;
1111 +
1112 + sum = reiser4_grabbed_blocks(super) + reiser4_free_blocks(super) +
1113 + reiser4_data_blocks(super) + reiser4_fake_allocated(super) +
1114 + reiser4_fake_allocated_unformatted(super) + flush_reserved(super) +
1115 + reiser4_clustered_blocks(super);
1116 + if (reiser4_block_count(super) != sum) {
1117 + printk("super block counters: "
1118 + "used %llu, free %llu, "
1119 + "grabbed %llu, fake allocated (formatetd %llu, unformatted %llu), "
1120 + "reserved %llu, clustered %llu, sum %llu, must be (block count) %llu\n",
1121 + (unsigned long long)reiser4_data_blocks(super),
1122 + (unsigned long long)reiser4_free_blocks(super),
1123 + (unsigned long long)reiser4_grabbed_blocks(super),
1124 + (unsigned long long)reiser4_fake_allocated(super),
1125 + (unsigned long long)
1126 + reiser4_fake_allocated_unformatted(super),
1127 + (unsigned long long)flush_reserved(super),
1128 + (unsigned long long)reiser4_clustered_blocks(super),
1129 + (unsigned long long)sum,
1130 + (unsigned long long)reiser4_block_count(super));
1131 + return 0;
1132 + }
1133 + return 1;
1134 +}
1135 +
1136 +/* Adjust the "working" free blocks counter for the number of blocks we are
1137 +   going to allocate. Record the number of grabbed blocks in fs-wide and
1138 +   per-thread counters. This function should be called before bitmap scanning
1139 +   or allocating fake block numbers.
1140 +
1141 + @super -- pointer to reiser4 super block;
1142 + @count -- number of blocks we reserve;
1143 +
1144 +   @return -- 0 on success; -ENOSPC if all
1145 +   free blocks are reserved or already allocated.
1146 +*/
1147 +
1148 +static int
1149 +reiser4_grab(reiser4_context * ctx, __u64 count, reiser4_ba_flags_t flags)
1150 +{
1151 + __u64 free_blocks;
1152 + int ret = 0, use_reserved = flags & BA_RESERVED;
1153 + reiser4_super_info_data *sbinfo;
1154 +
1155 + assert("vs-1276", ctx == get_current_context());
1156 +
1157 + /* Do not grab anything on ro-mounted fs. */
1158 + if (rofs_super(ctx->super)) {
1159 + ctx->grab_enabled = 0;
1160 + return 0;
1161 + }
1162 +
1163 + sbinfo = get_super_private(ctx->super);
1164 +
1165 + spin_lock_reiser4_super(sbinfo);
1166 +
1167 + free_blocks = sbinfo->blocks_free;
1168 +
1169 + if ((use_reserved && free_blocks < count) ||
1170 + (!use_reserved && free_blocks < count + sbinfo->blocks_reserved)) {
1171 + ret = RETERR(-ENOSPC);
1172 + goto unlock_and_ret;
1173 + }
1174 +
1175 + add_to_ctx_grabbed(ctx, count);
1176 +
1177 + sbinfo->blocks_grabbed += count;
1178 + sbinfo->blocks_free -= count;
1179 +
1180 +#if REISER4_DEBUG
1181 + if (ctx->grabbed_initially == 0)
1182 + ctx->grabbed_initially = count;
1183 +#endif
1184 +
1185 + assert("nikita-2986", check_block_counters(ctx->super));
1186 +
1187 + /* disable grab space in current context */
1188 + ctx->grab_enabled = 0;
1189 +
1190 + unlock_and_ret:
1191 + spin_unlock_reiser4_super(sbinfo);
1192 +
1193 + return ret;
1194 +}
1195 +
1196 +int reiser4_grab_space(__u64 count, reiser4_ba_flags_t flags)
1197 +{
1198 + int ret;
1199 + reiser4_context *ctx;
1200 +
1201 + assert("nikita-2964", ergo(flags & BA_CAN_COMMIT,
1202 + lock_stack_isclean(get_current_lock_stack
1203 + ())));
1204 + ctx = get_current_context();
1205 + if (!(flags & BA_FORCE) && !is_grab_enabled(ctx)) {
1206 + return 0;
1207 + }
1208 +
1209 + ret = reiser4_grab(ctx, count, flags);
1210 + if (ret == -ENOSPC) {
1211 +
1212 +		/* Try to commit all transactions if the BA_CAN_COMMIT flag is present */
1213 + if (flags & BA_CAN_COMMIT) {
1214 + txnmgr_force_commit_all(ctx->super, 0);
1215 + ctx->grab_enabled = 1;
1216 + ret = reiser4_grab(ctx, count, flags);
1217 + }
1218 + }
1219 + /*
1220 +	 * allocation from the reserved pool cannot fail; failure here is a severe error.
1221 + */
1222 + assert("nikita-3005", ergo(flags & BA_RESERVED, ret == 0));
1223 + return ret;
1224 +}
1225 +
1226 +/*
1227 + * SPACE RESERVED FOR UNLINK/TRUNCATE
1228 + *
1229 + * Unlink and truncate require space in transaction (to update stat data, at
1230 + * least). But we don't want rm(1) to fail with "No space on device" error.
1231 + *
1232 + * The solution is to reserve 5% of disk space for truncates and
1233 + * unlinks. Specifically, normal space grabbing requests don't grab space from
1234 + * the reserved area. Only requests with the BA_RESERVED bit in flags are
1235 + * allowed to drain it. A per-super-block delete_sema semaphore is used to
1236 + * allow only one thread at a time to grab from the reserved area.
1237 + *
1238 + * Grabbing from reserved area should always be performed with BA_CAN_COMMIT
1239 + * flag.
1240 + *
1241 + */
1242 +
1243 +int reiser4_grab_reserved(struct super_block *super,
1244 + __u64 count, reiser4_ba_flags_t flags)
1245 +{
1246 + reiser4_super_info_data *sbinfo = get_super_private(super);
1247 +
1248 + assert("nikita-3175", flags & BA_CAN_COMMIT);
1249 +
1250 +	/* Check whether the delete semaphore is already taken by us; we assume
1251 +	 * that reading a machine word is atomic. */
1252 + if (sbinfo->delete_sema_owner == current) {
1253 + if (reiser4_grab_space
1254 + (count, (flags | BA_RESERVED) & ~BA_CAN_COMMIT)) {
1255 + warning("zam-1003",
1256 + "nested call of grab_reserved fails count=(%llu)",
1257 + (unsigned long long)count);
1258 + reiser4_release_reserved(super);
1259 + return RETERR(-ENOSPC);
1260 + }
1261 + return 0;
1262 + }
1263 +
1264 + if (reiser4_grab_space(count, flags)) {
1265 + down(&sbinfo->delete_sema);
1266 + assert("nikita-2929", sbinfo->delete_sema_owner == NULL);
1267 + sbinfo->delete_sema_owner = current;
1268 +
1269 + if (reiser4_grab_space(count, flags | BA_RESERVED)) {
1270 + warning("zam-833",
1271 + "reserved space is not enough (%llu)",
1272 + (unsigned long long)count);
1273 + reiser4_release_reserved(super);
1274 + return RETERR(-ENOSPC);
1275 + }
1276 + }
1277 + return 0;
1278 +}
1279 +
1280 +void reiser4_release_reserved(struct super_block *super)
1281 +{
1282 + reiser4_super_info_data *info;
1283 +
1284 + info = get_super_private(super);
1285 + if (info->delete_sema_owner == current) {
1286 + info->delete_sema_owner = NULL;
1287 + up(&info->delete_sema);
1288 + }
1289 +}
1290 +
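A minimal usage sketch (not part of the patch) of the convention above;
delete_object_example is a hypothetical caller and the error handling is
schematic:

	/* Hypothetical caller: grab from the 5% reserved area for an unlink,
	 * then drop the reservation when the operation completes. */
	static int delete_object_example(struct super_block *super, __u64 count)
	{
		int ret;

		/* BA_CAN_COMMIT is mandatory here (assertion nikita-3175). */
		ret = reiser4_grab_reserved(super, count, BA_CAN_COMMIT);
		if (ret != 0)
			return ret;	/* -ENOSPC: even the reserve is exhausted */

		/* ... perform the unlink/truncate work ... */

		reiser4_release_reserved(super);
		return 0;
	}
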
1291 +static reiser4_super_info_data *grabbed2fake_allocated_head(int count)
1292 +{
1293 + reiser4_context *ctx;
1294 + reiser4_super_info_data *sbinfo;
1295 +
1296 + ctx = get_current_context();
1297 + sub_from_ctx_grabbed(ctx, count);
1298 +
1299 + sbinfo = get_super_private(ctx->super);
1300 + spin_lock_reiser4_super(sbinfo);
1301 +
1302 + sub_from_sb_grabbed(sbinfo, count);
1303 + /* return sbinfo locked */
1304 + return sbinfo;
1305 +}
1306 +
1307 +/* is called after @count fake block numbers are allocated and pointers to
1308 +   those blocks are inserted into the tree. */
1309 +static void grabbed2fake_allocated_formatted(void)
1310 +{
1311 + reiser4_super_info_data *sbinfo;
1312 +
1313 + sbinfo = grabbed2fake_allocated_head(1);
1314 + sbinfo->blocks_fake_allocated++;
1315 +
1316 + assert("vs-922", check_block_counters(reiser4_get_current_sb()));
1317 +
1318 + spin_unlock_reiser4_super(sbinfo);
1319 +}
1320 +
1321 +/**
1322 + * grabbed2fake_allocated_unformatted
1323 + * @count:
1324 + *
1325 + */
1326 +static void grabbed2fake_allocated_unformatted(int count)
1327 +{
1328 + reiser4_super_info_data *sbinfo;
1329 +
1330 + sbinfo = grabbed2fake_allocated_head(count);
1331 + sbinfo->blocks_fake_allocated_unformatted += count;
1332 +
1333 + assert("vs-9221", check_block_counters(reiser4_get_current_sb()));
1334 +
1335 + spin_unlock_reiser4_super(sbinfo);
1336 +}
1337 +
1338 +void grabbed2cluster_reserved(int count)
1339 +{
1340 + reiser4_context *ctx;
1341 + reiser4_super_info_data *sbinfo;
1342 +
1343 + ctx = get_current_context();
1344 + sub_from_ctx_grabbed(ctx, count);
1345 +
1346 + sbinfo = get_super_private(ctx->super);
1347 + spin_lock_reiser4_super(sbinfo);
1348 +
1349 + sub_from_sb_grabbed(sbinfo, count);
1350 + sbinfo->blocks_clustered += count;
1351 +
1352 + assert("edward-504", check_block_counters(ctx->super));
1353 +
1354 + spin_unlock_reiser4_super(sbinfo);
1355 +}
1356 +
1357 +void cluster_reserved2grabbed(int count)
1358 +{
1359 + reiser4_context *ctx;
1360 + reiser4_super_info_data *sbinfo;
1361 +
1362 + ctx = get_current_context();
1363 +
1364 + sbinfo = get_super_private(ctx->super);
1365 + spin_lock_reiser4_super(sbinfo);
1366 +
1367 + sub_from_cluster_reserved(sbinfo, count);
1368 + sbinfo->blocks_grabbed += count;
1369 +
1370 + assert("edward-505", check_block_counters(ctx->super));
1371 +
1372 + spin_unlock_reiser4_super(sbinfo);
1373 + add_to_ctx_grabbed(ctx, count);
1374 +}
1375 +
1376 +void cluster_reserved2free(int count)
1377 +{
1378 + reiser4_context *ctx;
1379 + reiser4_super_info_data *sbinfo;
1380 +
1381 + assert("edward-503", get_current_context()->grabbed_blocks == 0);
1382 +
1383 + ctx = get_current_context();
1384 + sbinfo = get_super_private(ctx->super);
1385 + spin_lock_reiser4_super(sbinfo);
1386 +
1387 + sub_from_cluster_reserved(sbinfo, count);
1388 + sbinfo->blocks_free += count;
1389 +
1390 + assert("edward-502", check_block_counters(ctx->super));
1391 +
1392 + spin_unlock_reiser4_super(sbinfo);
1393 +}
1394 +
1395 +static DEFINE_SPINLOCK(fake_lock);
1396 +static reiser4_block_nr fake_gen = 0;
1397 +
1398 +/**
1399 + * assign_fake_blocknr
1400 + * @blocknr:
1401 + * @count:
1402 + *
1403 + * Obtain a fake block number for a new node, which will be used to refer to
1404 + * this newly allocated node until real allocation is done.
1405 + */
1406 +static void assign_fake_blocknr(reiser4_block_nr *blocknr, int count)
1407 +{
1408 + spin_lock(&fake_lock);
1409 + *blocknr = fake_gen;
1410 + fake_gen += count;
1411 + spin_unlock(&fake_lock);
1412 +
1413 + BUG_ON(*blocknr & REISER4_BLOCKNR_STATUS_BIT_MASK);
1414 + /**blocknr &= ~REISER4_BLOCKNR_STATUS_BIT_MASK;*/
1415 + *blocknr |= REISER4_UNALLOCATED_STATUS_VALUE;
1416 + assert("zam-394", zlook(current_tree, blocknr) == NULL);
1417 +}
1418 +
1419 +int assign_fake_blocknr_formatted(reiser4_block_nr * blocknr)
1420 +{
1421 + assign_fake_blocknr(blocknr, 1);
1422 + grabbed2fake_allocated_formatted();
1423 + return 0;
1424 +}
1425 +
1426 +/**
1427 + * fake_blocknr_unformatted
1428 + * @count: number of fake numbers to get
1429 + *
1430 + * Allocates @count fake block numbers which will be assigned to jnodes
1431 + */
1432 +reiser4_block_nr fake_blocknr_unformatted(int count)
1433 +{
1434 + reiser4_block_nr blocknr;
1435 +
1436 + assign_fake_blocknr(&blocknr, count);
1437 + grabbed2fake_allocated_unformatted(count);
1438 +
1439 + return blocknr;
1440 +}
1441 +
1442 +/* adjust sb block counters when real (on-disk) block allocation immediately
1443 +   follows grabbing of free disk space. */
1444 +void grabbed2used(reiser4_context *ctx, reiser4_super_info_data *sbinfo,
1445 + __u64 count)
1446 +{
1447 + sub_from_ctx_grabbed(ctx, count);
1448 +
1449 + spin_lock_reiser4_super(sbinfo);
1450 +
1451 + sub_from_sb_grabbed(sbinfo, count);
1452 + sbinfo->blocks_used += count;
1453 +
1454 + assert("nikita-2679", check_block_counters(ctx->super));
1455 +
1456 + spin_unlock_reiser4_super(sbinfo);
1457 +}
1458 +
1459 +/* adjust sb block counters when @count unallocated blocks get mapped to disk */
1460 +void fake_allocated2used(reiser4_super_info_data *sbinfo, __u64 count,
1461 + reiser4_ba_flags_t flags)
1462 +{
1463 + spin_lock_reiser4_super(sbinfo);
1464 +
1465 + sub_from_sb_fake_allocated(sbinfo, count, flags);
1466 + sbinfo->blocks_used += count;
1467 +
1468 + assert("nikita-2680", check_block_counters(reiser4_get_current_sb()));
1469 +
1470 + spin_unlock_reiser4_super(sbinfo);
1471 +}
1472 +
1473 +void flush_reserved2used(txn_atom * atom, __u64 count)
1474 +{
1475 + reiser4_super_info_data *sbinfo;
1476 +
1477 + assert("zam-787", atom != NULL);
1478 + assert_spin_locked(&(atom->alock));
1479 +
1480 + sub_from_atom_flush_reserved_nolock(atom, (__u32) count);
1481 +
1482 + sbinfo = get_current_super_private();
1483 + spin_lock_reiser4_super(sbinfo);
1484 +
1485 + sub_from_sb_flush_reserved(sbinfo, count);
1486 + sbinfo->blocks_used += count;
1487 +
1488 + assert("zam-789", check_block_counters(reiser4_get_current_sb()));
1489 +
1490 + spin_unlock_reiser4_super(sbinfo);
1491 +}
1492 +
1493 +/* update the per fs blocknr hint default value. */
1494 +void
1495 +update_blocknr_hint_default(const struct super_block *s,
1496 + const reiser4_block_nr * block)
1497 +{
1498 + reiser4_super_info_data *sbinfo = get_super_private(s);
1499 +
1500 + assert("nikita-3342", !blocknr_is_fake(block));
1501 +
1502 + spin_lock_reiser4_super(sbinfo);
1503 + if (*block < sbinfo->block_count) {
1504 + sbinfo->blocknr_hint_default = *block;
1505 + } else {
1506 + warning("zam-676",
1507 + "block number %llu is too large to be used in a blocknr hint\n",
1508 + (unsigned long long)*block);
1509 + dump_stack();
1510 + DEBUGON(1);
1511 + }
1512 + spin_unlock_reiser4_super(sbinfo);
1513 +}
1514 +
1515 +/* get current value of the default blocknr hint. */
1516 +void get_blocknr_hint_default(reiser4_block_nr * result)
1517 +{
1518 + reiser4_super_info_data *sbinfo = get_current_super_private();
1519 +
1520 + spin_lock_reiser4_super(sbinfo);
1521 + *result = sbinfo->blocknr_hint_default;
1522 + assert("zam-677", *result < sbinfo->block_count);
1523 + spin_unlock_reiser4_super(sbinfo);
1524 +}
1525 +
1526 +/* Allocate "real" disk blocks by calling a proper space allocation plugin
1527 + * method. Blocks are allocated in one contiguous disk region. The plugin
1528 + * independent part accounts for blocks by subtracting the allocated amount
1529 + * from the grabbed or fake block counter and adding the same amount to the
1530 + * counter of allocated blocks.
1531 + *
1532 + * @hint -- a reiser4 blocknr hint object which contains further block
1533 + * allocation hints and parameters (search start, a stage of block
1534 + * which will be mapped to disk, etc.),
1535 + * @blk -- an out parameter for the beginning of the allocated region,
1536 + * @len -- in/out parameter, it should contain the maximum number of allocated
1537 + * blocks, after block allocation completes, it contains the length of
1538 + * allocated disk region.
1539 + * @flags -- see reiser4_ba_flags_t description.
1540 + *
1541 + * @return -- 0 if success, error code otherwise.
1542 + */
1543 +int
1544 +reiser4_alloc_blocks(reiser4_blocknr_hint * hint, reiser4_block_nr * blk,
1545 + reiser4_block_nr * len, reiser4_ba_flags_t flags)
1546 +{
1547 + __u64 needed = *len;
1548 + reiser4_context *ctx;
1549 + reiser4_super_info_data *sbinfo;
1550 + int ret;
1551 +
1552 + assert("zam-986", hint != NULL);
1553 +
1554 + ctx = get_current_context();
1555 + sbinfo = get_super_private(ctx->super);
1556 +
1557 +	/* For write-optimized data we use the default search start value, which
1558 +	 * is close to the last write location. */
1559 + if (flags & BA_USE_DEFAULT_SEARCH_START) {
1560 + get_blocknr_hint_default(&hint->blk);
1561 + }
1562 +
1563 + /* VITALY: allocator should grab this for internal/tx-lists/similar only. */
1564 +/* VS-FIXME-HANS: why is this comment above addressed to vitaly (from vitaly)? */
1565 + if (hint->block_stage == BLOCK_NOT_COUNTED) {
1566 + ret = reiser4_grab_space_force(*len, flags);
1567 + if (ret != 0)
1568 + return ret;
1569 + }
1570 +
1571 + ret =
1572 + sa_alloc_blocks(get_space_allocator(ctx->super), hint, (int)needed,
1573 + blk, len);
1574 +
1575 + if (!ret) {
1576 + assert("zam-680", *blk < reiser4_block_count(ctx->super));
1577 + assert("zam-681",
1578 + *blk + *len <= reiser4_block_count(ctx->super));
1579 +
1580 + if (flags & BA_PERMANENT) {
1581 + /* we assume that current atom exists at this moment */
1582 + txn_atom *atom = get_current_atom_locked();
1583 + atom->nr_blocks_allocated += *len;
1584 + spin_unlock_atom(atom);
1585 + }
1586 +
1587 + switch (hint->block_stage) {
1588 + case BLOCK_NOT_COUNTED:
1589 + case BLOCK_GRABBED:
1590 + grabbed2used(ctx, sbinfo, *len);
1591 + break;
1592 + case BLOCK_UNALLOCATED:
1593 + fake_allocated2used(sbinfo, *len, flags);
1594 + break;
1595 + case BLOCK_FLUSH_RESERVED:
1596 + {
1597 + txn_atom *atom = get_current_atom_locked();
1598 + flush_reserved2used(atom, *len);
1599 + spin_unlock_atom(atom);
1600 + }
1601 + break;
1602 + default:
1603 + impossible("zam-531", "wrong block stage");
1604 + }
1605 + } else {
1606 + assert("zam-821",
1607 + ergo(hint->max_dist == 0
1608 + && !hint->backward, ret != -ENOSPC));
1609 + if (hint->block_stage == BLOCK_NOT_COUNTED)
1610 + grabbed2free(ctx, sbinfo, needed);
1611 + }
1612 +
1613 + return ret;
1614 +}
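+
+/* Editorial sketch, not part of the original patch: a minimal caller of
+ * reiser4_alloc_blocks() that maps an extent of already-grabbed space to
+ * disk. The extent length of 16 is an arbitrary illustration; everything
+ * else is declared above or in block_alloc.h. Kept under #if 0 because it
+ * is illustration only. */
+#if 0
+static int example_alloc_extent(reiser4_block_nr * start)
+{
+	reiser4_blocknr_hint hint;
+	reiser4_block_nr len = 16;
+	int ret;
+
+	blocknr_hint_init(&hint);
+	hint.block_stage = BLOCK_GRABBED;
+	ret = reiser4_alloc_blocks(&hint, start, &len,
+				   BA_USE_DEFAULT_SEARCH_START);
+	blocknr_hint_done(&hint);
+	/* on success, @len now holds the length actually allocated */
+	return ret;
+}
+#endif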
1615 +
1616 +/* used -> fake_allocated -> grabbed -> free */
1617 +
1618 +/* adjust sb block counters when @count unallocated blocks get unmapped from
1619 + disk */
1620 +static void
1621 +used2fake_allocated(reiser4_super_info_data * sbinfo, __u64 count,
1622 + int formatted)
1623 +{
1624 + spin_lock_reiser4_super(sbinfo);
1625 +
1626 + if (formatted)
1627 + sbinfo->blocks_fake_allocated += count;
1628 + else
1629 + sbinfo->blocks_fake_allocated_unformatted += count;
1630 +
1631 + sub_from_sb_used(sbinfo, count);
1632 +
1633 + assert("nikita-2681", check_block_counters(reiser4_get_current_sb()));
1634 +
1635 + spin_unlock_reiser4_super(sbinfo);
1636 +}
1637 +
1638 +static void
1639 +used2flush_reserved(reiser4_super_info_data * sbinfo, txn_atom * atom,
1640 + __u64 count, reiser4_ba_flags_t flags UNUSED_ARG)
1641 +{
1642 + assert("nikita-2791", atom != NULL);
1643 + assert_spin_locked(&(atom->alock));
1644 +
1645 + add_to_atom_flush_reserved_nolock(atom, (__u32) count);
1646 +
1647 + spin_lock_reiser4_super(sbinfo);
1648 +
1649 + sbinfo->blocks_flush_reserved += count;
1650 + /*add_to_sb_flush_reserved(sbinfo, count); */
1651 + sub_from_sb_used(sbinfo, count);
1652 +
1653 + assert("nikita-2681", check_block_counters(reiser4_get_current_sb()));
1654 +
1655 + spin_unlock_reiser4_super(sbinfo);
1656 +}
1657 +
1658 +/* disk space virtually used by fake block numbers is counted as "grabbed" again. */
1659 +static void
1660 +fake_allocated2grabbed(reiser4_context * ctx, reiser4_super_info_data * sbinfo,
1661 + __u64 count, reiser4_ba_flags_t flags)
1662 +{
1663 + add_to_ctx_grabbed(ctx, count);
1664 +
1665 + spin_lock_reiser4_super(sbinfo);
1666 +
1667 + assert("nikita-2682", check_block_counters(ctx->super));
1668 +
1669 + sbinfo->blocks_grabbed += count;
1670 + sub_from_sb_fake_allocated(sbinfo, count, flags & BA_FORMATTED);
1671 +
1672 + assert("nikita-2683", check_block_counters(ctx->super));
1673 +
1674 + spin_unlock_reiser4_super(sbinfo);
1675 +}
1676 +
1677 +void fake_allocated2free(__u64 count, reiser4_ba_flags_t flags)
1678 +{
1679 + reiser4_context *ctx;
1680 + reiser4_super_info_data *sbinfo;
1681 +
1682 + ctx = get_current_context();
1683 + sbinfo = get_super_private(ctx->super);
1684 +
1685 + fake_allocated2grabbed(ctx, sbinfo, count, flags);
1686 + grabbed2free(ctx, sbinfo, count);
1687 +}
1688 +
1689 +void grabbed2free_mark(__u64 mark)
1690 +{
1691 + reiser4_context *ctx;
1692 + reiser4_super_info_data *sbinfo;
1693 +
1694 + ctx = get_current_context();
1695 + sbinfo = get_super_private(ctx->super);
1696 +
1697 + assert("nikita-3007", (__s64) mark >= 0);
1698 + assert("nikita-3006", ctx->grabbed_blocks >= mark);
1699 + grabbed2free(ctx, sbinfo, ctx->grabbed_blocks - mark);
1700 +}
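+
+/* Editorial sketch, not part of the original patch: the save/restore
+ * pattern grabbed2free_mark() is meant for, assuming a current reiser4
+ * context (@mark is a hypothetical local variable):
+ *
+ *	__u64 mark = get_current_context()->grabbed_blocks;
+ *	... grab and consume some space ...
+ *	grabbed2free_mark(mark);
+ *
+ * The last call frees whatever is still grabbed above the remembered
+ * mark. */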
1701 +
1702 +/**
1703 + * grabbed2free - adjust grabbed and free block counters
1704 + * @ctx: context to update grabbed block counter of
1705 + * @sbinfo: super block to update grabbed and free block counters of
1706 + * @count: number of blocks to adjust counters by
1707 + *
1708 + * Decreases context's and per filesystem's counters of grabbed
1709 + * blocks. Increases per filesystem's counter of free blocks.
1710 + */
1711 +void grabbed2free(reiser4_context *ctx, reiser4_super_info_data *sbinfo,
1712 + __u64 count)
1713 +{
1714 + sub_from_ctx_grabbed(ctx, count);
1715 +
1716 + spin_lock_reiser4_super(sbinfo);
1717 +
1718 + sub_from_sb_grabbed(sbinfo, count);
1719 + sbinfo->blocks_free += count;
1720 + assert("nikita-2684", check_block_counters(ctx->super));
1721 +
1722 + spin_unlock_reiser4_super(sbinfo);
1723 +}
1724 +
1725 +void grabbed2flush_reserved_nolock(txn_atom * atom, __u64 count)
1726 +{
1727 + reiser4_context *ctx;
1728 + reiser4_super_info_data *sbinfo;
1729 +
1730 + assert("vs-1095", atom);
1731 +
1732 + ctx = get_current_context();
1733 + sbinfo = get_super_private(ctx->super);
1734 +
1735 + sub_from_ctx_grabbed(ctx, count);
1736 +
1737 + add_to_atom_flush_reserved_nolock(atom, count);
1738 +
1739 + spin_lock_reiser4_super(sbinfo);
1740 +
1741 + sbinfo->blocks_flush_reserved += count;
1742 + sub_from_sb_grabbed(sbinfo, count);
1743 +
1744 + assert("vpf-292", check_block_counters(ctx->super));
1745 +
1746 + spin_unlock_reiser4_super(sbinfo);
1747 +}
1748 +
1749 +void grabbed2flush_reserved(__u64 count)
1750 +{
1751 + txn_atom *atom = get_current_atom_locked();
1752 +
1753 + grabbed2flush_reserved_nolock(atom, count);
1754 +
1755 + spin_unlock_atom(atom);
1756 +}
1757 +
1758 +void flush_reserved2grabbed(txn_atom * atom, __u64 count)
1759 +{
1760 + reiser4_context *ctx;
1761 + reiser4_super_info_data *sbinfo;
1762 +
1763 + assert("nikita-2788", atom != NULL);
1764 + assert_spin_locked(&(atom->alock));
1765 +
1766 + ctx = get_current_context();
1767 + sbinfo = get_super_private(ctx->super);
1768 +
1769 + add_to_ctx_grabbed(ctx, count);
1770 +
1771 + sub_from_atom_flush_reserved_nolock(atom, (__u32) count);
1772 +
1773 + spin_lock_reiser4_super(sbinfo);
1774 +
1775 + sbinfo->blocks_grabbed += count;
1776 + sub_from_sb_flush_reserved(sbinfo, count);
1777 +
1778 + assert("vpf-292", check_block_counters(ctx->super));
1779 +
1780 + spin_unlock_reiser4_super(sbinfo);
1781 +}
1782 +
1783 +/**
1784 + * all_grabbed2free - releases all blocks grabbed in context
1785 + *
1786 + * Decreases context's and super block's grabbed block counters by number of
1787 + * blocks grabbed by current context and increases super block's free block
1788 + * counter correspondingly.
1789 + */
1790 +void all_grabbed2free(void)
1791 +{
1792 + reiser4_context *ctx = get_current_context();
1793 +
1794 + grabbed2free(ctx, get_super_private(ctx->super), ctx->grabbed_blocks);
1795 +}
1796 +
1797 +/* adjust sb block counters when real (on-disk) blocks do not become
1798 +   unallocated after freeing; the @count blocks become "grabbed" instead. */
1799 +static void
1800 +used2grabbed(reiser4_context * ctx, reiser4_super_info_data * sbinfo,
1801 + __u64 count)
1802 +{
1803 + add_to_ctx_grabbed(ctx, count);
1804 +
1805 + spin_lock_reiser4_super(sbinfo);
1806 +
1807 + sbinfo->blocks_grabbed += count;
1808 + sub_from_sb_used(sbinfo, count);
1809 +
1810 + assert("nikita-2685", check_block_counters(ctx->super));
1811 +
1812 + spin_unlock_reiser4_super(sbinfo);
1813 +}
1814 +
1815 +/* this used to be done through used2grabbed and grabbed2free */
1816 +static void used2free(reiser4_super_info_data * sbinfo, __u64 count)
1817 +{
1818 + spin_lock_reiser4_super(sbinfo);
1819 +
1820 + sbinfo->blocks_free += count;
1821 + sub_from_sb_used(sbinfo, count);
1822 +
1823 + assert("nikita-2685", check_block_counters(reiser4_get_current_sb()));
1824 +
1825 + spin_unlock_reiser4_super(sbinfo);
1826 +}
1827 +
1828 +#if REISER4_DEBUG
1829 +
1830 +/* check "allocated" state of given block range */
1831 +static void
1832 +reiser4_check_blocks(const reiser4_block_nr * start,
1833 + const reiser4_block_nr * len, int desired)
1834 +{
1835 + sa_check_blocks(start, len, desired);
1836 +}
1837 +
1838 +/* check "allocated" state of given block */
1839 +void reiser4_check_block(const reiser4_block_nr * block, int desired)
1840 +{
1841 + const reiser4_block_nr one = 1;
1842 +
1843 + reiser4_check_blocks(block, &one, desired);
1844 +}
1845 +
1846 +#endif
1847 +
1848 +/* The block deallocation function may either do an actual deallocation
1849 +   through the space plugin or store deleted block numbers in the atom's
1850 +   delete_set data structure, depending on the BA_DEFER flag. */
1851 +
1852 +/* if the BA_DEFER bit is not set, @target_stage means the stage of the blocks
1853 +   to be deleted from the WORKING bitmap: just unmapped from disk, freed while
1854 +   their disk space is still grabbed by the current thread, or not counted in
1855 +   any reiser4 sb block counters; see the block_stage_t comment */
1856 +
1857 +/* the BA_FORMATTED bit is only used when BA_DEFER is not present: it is used
1858 +   to distinguish blocks allocated for unformatted and formatted nodes */
1859 +
1860 +int
1861 +reiser4_dealloc_blocks(const reiser4_block_nr * start,
1862 + const reiser4_block_nr * len,
1863 + block_stage_t target_stage, reiser4_ba_flags_t flags)
1864 +{
1865 + txn_atom *atom = NULL;
1866 + int ret;
1867 + reiser4_context *ctx;
1868 + reiser4_super_info_data *sbinfo;
1869 +
1870 + ctx = get_current_context();
1871 + sbinfo = get_super_private(ctx->super);
1872 +
1873 + if (REISER4_DEBUG) {
1874 + assert("zam-431", *len != 0);
1875 + assert("zam-432", *start != 0);
1876 + assert("zam-558", !blocknr_is_fake(start));
1877 +
1878 + spin_lock_reiser4_super(sbinfo);
1879 + assert("zam-562", *start < sbinfo->block_count);
1880 + spin_unlock_reiser4_super(sbinfo);
1881 + }
1882 +
1883 + if (flags & BA_DEFER) {
1884 + blocknr_set_entry *bsep = NULL;
1885 +
1886 + /* storing deleted block numbers in a blocknr set
1887 +		   data structure for further actual deletion */
1888 + do {
1889 + atom = get_current_atom_locked();
1890 + assert("zam-430", atom != NULL);
1891 +
1892 + ret =
1893 + blocknr_set_add_extent(atom, &atom->delete_set,
1894 + &bsep, start, len);
1895 +
1896 + if (ret == -ENOMEM)
1897 + return ret;
1898 +
1899 + /* This loop might spin at most two times */
1900 + } while (ret == -E_REPEAT);
1901 +
1902 + assert("zam-477", ret == 0);
1903 + assert("zam-433", atom != NULL);
1904 +
1905 + spin_unlock_atom(atom);
1906 +
1907 + } else {
1908 + assert("zam-425", get_current_super_private() != NULL);
1909 + sa_dealloc_blocks(get_space_allocator(ctx->super), *start,
1910 + *len);
1911 +
1912 + if (flags & BA_PERMANENT) {
1913 +			/* These blocks were counted as allocated; we have to
1914 +			 * revert that if the allocation is discarded. */
1915 + txn_atom *atom = get_current_atom_locked();
1916 + atom->nr_blocks_allocated -= *len;
1917 + spin_unlock_atom(atom);
1918 + }
1919 +
1920 + switch (target_stage) {
1921 + case BLOCK_NOT_COUNTED:
1922 + assert("vs-960", flags & BA_FORMATTED);
1923 + /* VITALY: This is what was grabbed for internal/tx-lists/similar only */
1924 + used2free(sbinfo, *len);
1925 + break;
1926 +
1927 + case BLOCK_GRABBED:
1928 + used2grabbed(ctx, sbinfo, *len);
1929 + break;
1930 +
1931 + case BLOCK_UNALLOCATED:
1932 + used2fake_allocated(sbinfo, *len, flags & BA_FORMATTED);
1933 + break;
1934 +
1935 + case BLOCK_FLUSH_RESERVED:{
1936 + txn_atom *atom;
1937 +
1938 + atom = get_current_atom_locked();
1939 + used2flush_reserved(sbinfo, atom, *len,
1940 + flags & BA_FORMATTED);
1941 + spin_unlock_atom(atom);
1942 + break;
1943 + }
1944 + default:
1945 + impossible("zam-532", "wrong block stage");
1946 + }
1947 + }
1948 +
1949 + return 0;
1950 +}
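+
+/* Editorial sketch, not part of the original patch: the common deferred
+ * case for a single formatted block. With BA_DEFER the block number is
+ * only recorded in the atom's delete_set here; the actual bitmap update
+ * happens when post_commit_hook() applies that set after commit
+ * (@blocknr is a hypothetical variable):
+ *
+ *	ret = reiser4_dealloc_block(&blocknr, BLOCK_NOT_COUNTED,
+ *				    BA_DEFER | BA_FORMATTED);
+ */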
1951 +
1952 +/* wrappers for block allocator plugin methods */
1953 +int pre_commit_hook(void)
1954 +{
1955 + assert("zam-502", get_current_super_private() != NULL);
1956 + sa_pre_commit_hook();
1957 + return 0;
1958 +}
1959 +
1960 +/* an actor which applies delete set to block allocator data */
1961 +static int
1962 +apply_dset(txn_atom * atom UNUSED_ARG, const reiser4_block_nr * a,
1963 + const reiser4_block_nr * b, void *data UNUSED_ARG)
1964 +{
1965 + reiser4_context *ctx;
1966 + reiser4_super_info_data *sbinfo;
1967 +
1968 + __u64 len = 1;
1969 +
1970 + ctx = get_current_context();
1971 + sbinfo = get_super_private(ctx->super);
1972 +
1973 + assert("zam-877", atom->stage >= ASTAGE_PRE_COMMIT);
1974 + assert("zam-552", sbinfo != NULL);
1975 +
1976 + if (b != NULL)
1977 + len = *b;
1978 +
1979 + if (REISER4_DEBUG) {
1980 + spin_lock_reiser4_super(sbinfo);
1981 +
1982 + assert("zam-554", *a < reiser4_block_count(ctx->super));
1983 + assert("zam-555", *a + len <= reiser4_block_count(ctx->super));
1984 +
1985 + spin_unlock_reiser4_super(sbinfo);
1986 + }
1987 +
1988 + sa_dealloc_blocks(&sbinfo->space_allocator, *a, len);
1989 + /* adjust sb block counters */
1990 + used2free(sbinfo, len);
1991 + return 0;
1992 +}
1993 +
1994 +void post_commit_hook(void)
1995 +{
1996 + txn_atom *atom;
1997 +
1998 + atom = get_current_atom_locked();
1999 + assert("zam-452", atom->stage == ASTAGE_POST_COMMIT);
2000 + spin_unlock_atom(atom);
2001 +
2002 + /* do the block deallocation which was deferred
2003 + until commit is done */
2004 + blocknr_set_iterator(atom, &atom->delete_set, apply_dset, NULL, 1);
2005 +
2006 + assert("zam-504", get_current_super_private() != NULL);
2007 + sa_post_commit_hook();
2008 +}
2009 +
2010 +void post_write_back_hook(void)
2011 +{
2012 + assert("zam-504", get_current_super_private() != NULL);
2013 +
2014 + sa_post_commit_hook();
2015 +}
2016 +
2017 +/*
2018 + Local variables:
2019 + c-indentation-style: "K&R"
2020 + mode-name: "LC"
2021 + c-basic-offset: 8
2022 + tab-width: 8
2023 + fill-column: 120
2024 + scroll-step: 1
2025 + End:
2026 +*/
2027 Index: linux-2.6.16/fs/reiser4/block_alloc.h
2028 ===================================================================
2029 --- /dev/null
2030 +++ linux-2.6.16/fs/reiser4/block_alloc.h
2031 @@ -0,0 +1,175 @@
2032 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
2033 +
2034 +#if !defined (__FS_REISER4_BLOCK_ALLOC_H__)
2035 +#define __FS_REISER4_BLOCK_ALLOC_H__
2036 +
2037 +#include "dformat.h"
2038 +#include "forward.h"
2039 +
2040 +#include <linux/types.h> /* for __u?? */
2041 +#include <linux/fs.h>
2042 +
2043 +/* Mask which, when applied to a given block number, shows whether that block number is a fake one */
2044 +#define REISER4_FAKE_BLOCKNR_BIT_MASK   0x8000000000000000ULL
2045 +/* Mask which isolates the type of object this fake block number was assigned to */
2046 +#define REISER4_BLOCKNR_STATUS_BIT_MASK 0xC000000000000000ULL
2047 +
2048 +/* the result of applying REISER4_BLOCKNR_STATUS_BIT_MASK should be compared
2049 +   against these two values to tell whether the object is unallocated or a
2050 +   bitmap shadow object (a WORKING BITMAP block; see plugin/space/bitmap.c) */
2051 +#define REISER4_UNALLOCATED_STATUS_VALUE 0xC000000000000000ULL
2052 +#define REISER4_BITMAP_BLOCKS_STATUS_VALUE 0x8000000000000000ULL
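+
+/* Editorial example, not part of the original patch: a block number @blk
+ * is fake iff its top bit is set, and the two status values above tell
+ * the kinds of fake numbers apart:
+ *
+ *	if (blk & REISER4_FAKE_BLOCKNR_BIT_MASK) {
+ *		if ((blk & REISER4_BLOCKNR_STATUS_BIT_MASK) ==
+ *		    REISER4_UNALLOCATED_STATUS_VALUE)
+ *			... @blk numbers an unallocated node ...
+ *		else
+ *			... @blk is a bitmap shadow (WORKING bitmap) block ...
+ *	}
+ */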
2053 +
2054 +/* specification of how a block allocation is counted in sb block counters */
2055 +typedef enum {
2056 + BLOCK_NOT_COUNTED = 0, /* reiser4 has no info about this block yet */
2057 + BLOCK_GRABBED = 1, /* free space grabbed for further allocation
2058 + of this block */
2059 + BLOCK_FLUSH_RESERVED = 2, /* block is reserved for flush needs. */
2060 +	BLOCK_UNALLOCATED = 3,	/* block is used for an existing in-memory object
2061 +				   (an unallocated formatted or unformatted
2062 +				   node) */
2063 +	BLOCK_ALLOCATED = 4	/* block is mapped to disk, a real on-disk
2064 +				   block number has been assigned */
2065 +} block_stage_t;
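+
+/* Editorial note, not part of the original patch: these stages form a
+ * small state machine. The grabbed2used(), fake_allocated2used(),
+ * flush_reserved2used(), used2grabbed() and similar helpers in
+ * block_alloc.c each move @count blocks between a pair of stages while
+ * keeping the sb counters balanced (see check_block_counters()). */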
2066 +
2067 +/* a hint for block allocator */
2068 +struct reiser4_blocknr_hint {
2069 + /* FIXME: I think we want to add a longterm lock on the bitmap block here. This
2070 + is to prevent jnode_flush() calls from interleaving allocations on the same
2071 + bitmap, once a hint is established. */
2072 +
2073 + /* search start hint */
2074 + reiser4_block_nr blk;
2075 +	/* if not zero, it is the size of the region we search for free blocks in */
2076 + reiser4_block_nr max_dist;
2077 +	/* level for allocation; it may be useful to have branch-level and
2078 +	   higher write-optimized. */
2079 + tree_level level;
2080 + /* block allocator assumes that blocks, which will be mapped to disk,
2081 + are in this specified block_stage */
2082 + block_stage_t block_stage;
2083 +	/* If backward = 1, allocate blocks in the backward direction, from the
2084 +	 * end of the disk to its beginning. */
2085 + unsigned int backward:1;
2086 +
2087 +};
2088 +
2089 +/* These flags control block allocation/deallocation behavior */
2090 +enum reiser4_ba_flags {
2091 +	/* do allocations from the reserved (5%) area */
2092 +	BA_RESERVED = (1 << 0),
2093 +
2094 +	/* block allocator may commit a transaction, trying to recover free space */
2095 +	BA_CAN_COMMIT = (1 << 1),
2096 +
2097 +	/* the operation will be applied to a formatted block */
2098 +	BA_FORMATTED = (1 << 2),
2099 +
2100 +	/* defer actual block freeing until transaction commit */
2101 +	BA_DEFER = (1 << 3),
2102 +
2103 +	/* allocate blocks for permanent fs objects (formatted or unformatted), not
2104 +	   wandered or log blocks */
2105 +	BA_PERMANENT = (1 << 4),
2106 +
2107 +	/* grab space even if grabbing was disabled */
2108 +	BA_FORCE = (1 << 5),
2109 +
2110 + /* use default start value for free blocks search. */
2111 + BA_USE_DEFAULT_SEARCH_START = (1 << 6)
2112 +};
2113 +
2114 +typedef enum reiser4_ba_flags reiser4_ba_flags_t;
2115 +
2116 +extern void blocknr_hint_init(reiser4_blocknr_hint * hint);
2117 +extern void blocknr_hint_done(reiser4_blocknr_hint * hint);
2118 +extern void update_blocknr_hint_default(const struct super_block *,
2119 + const reiser4_block_nr *);
2120 +extern void get_blocknr_hint_default(reiser4_block_nr *);
2121 +
2122 +extern reiser4_block_nr reiser4_fs_reserved_space(struct super_block *super);
2123 +
2124 +int assign_fake_blocknr_formatted(reiser4_block_nr *);
2125 +reiser4_block_nr fake_blocknr_unformatted(int);
2126 +
2127 +/* free -> grabbed -> fake_allocated -> used */
2128 +
2129 +int reiser4_grab_space(__u64 count, reiser4_ba_flags_t flags);
2130 +void all_grabbed2free(void);
2131 +void grabbed2free(reiser4_context *, reiser4_super_info_data *, __u64 count);
2132 +void fake_allocated2free(__u64 count, reiser4_ba_flags_t flags);
2133 +void grabbed2flush_reserved_nolock(txn_atom * atom, __u64 count);
2134 +void grabbed2flush_reserved(__u64 count);
2135 +int reiser4_alloc_blocks(reiser4_blocknr_hint * hint,
2136 + reiser4_block_nr * start,
2137 + reiser4_block_nr * len, reiser4_ba_flags_t flags);
2138 +int reiser4_dealloc_blocks(const reiser4_block_nr *,
2139 + const reiser4_block_nr *,
2140 + block_stage_t, reiser4_ba_flags_t flags);
2141 +
2142 +static inline int reiser4_alloc_block(reiser4_blocknr_hint * hint,
2143 + reiser4_block_nr * start,
2144 + reiser4_ba_flags_t flags)
2145 +{
2146 + reiser4_block_nr one = 1;
2147 + return reiser4_alloc_blocks(hint, start, &one, flags);
2148 +}
2149 +
2150 +static inline int reiser4_dealloc_block(const reiser4_block_nr * block,
2151 + block_stage_t stage,
2152 + reiser4_ba_flags_t flags)
2153 +{
2154 + const reiser4_block_nr one = 1;
2155 + return reiser4_dealloc_blocks(block, &one, stage, flags);
2156 +}
2157 +
2158 +#define reiser4_grab_space_force(count, flags) \
2159 + reiser4_grab_space(count, flags | BA_FORCE)
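+
+/* Editorial sketch, not part of the original patch, of the
+ * free -> grabbed -> ... -> used path named above: grab space first, then
+ * map it to disk (@hint, @blk and @len are hypothetical variables):
+ *
+ *	ret = reiser4_grab_space(len, BA_CAN_COMMIT);
+ *	if (ret == 0) {
+ *		hint.block_stage = BLOCK_GRABBED;
+ *		ret = reiser4_alloc_blocks(&hint, &blk, &len, 0);
+ *	}
+ */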
2160 +
2161 +extern void grabbed2free_mark(__u64 mark);
2162 +extern int reiser4_grab_reserved(struct super_block *,
2163 + __u64, reiser4_ba_flags_t);
2164 +extern void reiser4_release_reserved(struct super_block *super);
2165 +
2166 +/* grabbed -> fake_allocated */
2167 +
2168 +/* fake_allocated -> used */
2169 +
2170 +/* used -> fake_allocated -> grabbed -> free */
2171 +
2172 +extern void flush_reserved2grabbed(txn_atom * atom, __u64 count);
2173 +
2174 +extern int blocknr_is_fake(const reiser4_block_nr * da);
2175 +
2176 +extern void grabbed2cluster_reserved(int count);
2177 +extern void cluster_reserved2grabbed(int count);
2178 +extern void cluster_reserved2free(int count);
2179 +
2180 +extern int check_block_counters(const struct super_block *);
2181 +
2182 +#if REISER4_DEBUG
2183 +
2184 +extern void reiser4_check_block(const reiser4_block_nr *, int);
2185 +
2186 +#else
2187 +
2188 +# define reiser4_check_block(beg, val) noop
2189 +
2190 +#endif
2191 +
2192 +extern int pre_commit_hook(void);
2193 +extern void post_commit_hook(void);
2194 +extern void post_write_back_hook(void);
2195 +
2196 +#endif /* __FS_REISER4_BLOCK_ALLOC_H__ */
2197 +
2198 +/* Make Linus happy.
2199 + Local variables:
2200 + c-indentation-style: "K&R"
2201 + mode-name: "LC"
2202 + c-basic-offset: 8
2203 + tab-width: 8
2204 + fill-column: 120
2205 + End:
2206 +*/
2207 Index: linux-2.6.16/fs/reiser4/blocknrset.c
2208 ===================================================================
2209 --- /dev/null
2210 +++ linux-2.6.16/fs/reiser4/blocknrset.c
2211 @@ -0,0 +1,368 @@
2212 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
2213 +
2214 +/* This file contains code for various block number sets used by the atom to
2215 + track the deleted set and wandered block mappings. */
2216 +
2217 +#include "debug.h"
2218 +#include "dformat.h"
2219 +#include "txnmgr.h"
2220 +#include "context.h"
2221 +
2222 +#include <linux/slab.h>
2223 +
2224 +/* The data structure for storing unordered block number sets is a list of
2225 +   elements, each of which contains an array of single block numbers and/or
2226 +   an array of block number pairs. Such an element, called blocknr_set_entry,
2227 +   stores single block numbers from the beginning of its entries[] field and
2228 +   pairs (used for extents) from the end. The ->nr_singles and ->nr_pairs
2229 +   fields count the numbers of single blocks and of pairs.
2230 +
2231 +   +----------------- blocknr_set_entry->entries -----------------+
2232 + |block1|block2| ... <free space> ... |pair3|pair2|pair1|
2233 + +------------------------------------------------------------+
2234 +
2235 + When current blocknr_set_entry is full, allocate a new one. */
2236 +
2237 +/* Usage examples: blocknr sets are used in reiser4 for storing atom's delete
2238 + * set (single blocks and block extents), in that case blocknr pair represent an
2239 + * extent; atom's wandered map is also stored as a blocknr set, blocknr pairs
2240 + * there represent a (real block) -> (wandered block) mapping. */
2241 +
2242 +typedef struct blocknr_pair blocknr_pair;
2243 +
2244 +/* The total size of a blocknr_set_entry. */
2245 +#define BLOCKNR_SET_ENTRY_SIZE 128
2246 +
2247 +/* The number of block numbers that fit in the entries[] area of one entry. */
2248 +#define BLOCKNR_SET_ENTRIES_NUMBER \
2249 + ((BLOCKNR_SET_ENTRY_SIZE - \
2250 + 2 * sizeof (unsigned) - \
2251 + sizeof(struct list_head)) / \
2252 + sizeof(reiser4_block_nr))
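+
+/* Editorial note, not part of the original patch: on a 64-bit kernel,
+ * with 4-byte unsigned, a 16-byte struct list_head and an 8-byte
+ * reiser4_block_nr, this works out to (128 - 8 - 16) / 8 = 13 slots per
+ * 128-byte entry. */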
2253 +
2254 +/* An entry of the blocknr_set */
2255 +struct blocknr_set_entry {
2256 + unsigned nr_singles;
2257 + unsigned nr_pairs;
2258 + struct list_head link;
2259 + reiser4_block_nr entries[BLOCKNR_SET_ENTRIES_NUMBER];
2260 +};
2261 +
2262 +/* A pair of blocks as recorded in the blocknr_set_entry data. */
2263 +struct blocknr_pair {
2264 + reiser4_block_nr a;
2265 + reiser4_block_nr b;
2266 +};
2267 +
2268 +/* Return the number of blocknr slots available in a blocknr_set_entry. */
2269 +/* Audited by: green(2002.06.11) */
2270 +static unsigned bse_avail(blocknr_set_entry * bse)
2271 +{
2272 + unsigned used = bse->nr_singles + 2 * bse->nr_pairs;
2273 +
2274 + assert("jmacd-5088", BLOCKNR_SET_ENTRIES_NUMBER >= used);
2275 + cassert(sizeof(blocknr_set_entry) == BLOCKNR_SET_ENTRY_SIZE);
2276 +
2277 + return BLOCKNR_SET_ENTRIES_NUMBER - used;
2278 +}
2279 +
2280 +/* Initialize a blocknr_set_entry. */
2281 +static void bse_init(blocknr_set_entry *bse)
2282 +{
2283 + bse->nr_singles = 0;
2284 + bse->nr_pairs = 0;
2285 + INIT_LIST_HEAD(&bse->link);
2286 +}
2287 +
2288 +/* Allocate and initialize a blocknr_set_entry. */
2289 +/* Audited by: green(2002.06.11) */
2290 +static blocknr_set_entry *bse_alloc(void)
2291 +{
2292 + blocknr_set_entry *e;
2293 +
2294 + if ((e = (blocknr_set_entry *) kmalloc(sizeof(blocknr_set_entry),
2295 + get_gfp_mask())) == NULL)
2296 + return NULL;
2297 +
2298 + bse_init(e);
2299 +
2300 + return e;
2301 +}
2302 +
2303 +/* Free a blocknr_set_entry. */
2304 +/* Audited by: green(2002.06.11) */
2305 +static void bse_free(blocknr_set_entry * bse)
2306 +{
2307 + kfree(bse);
2308 +}
2309 +
2310 +/* Add a block number to a blocknr_set_entry */
2311 +/* Audited by: green(2002.06.11) */
2312 +static void
2313 +bse_put_single(blocknr_set_entry * bse, const reiser4_block_nr * block)
2314 +{
2315 + assert("jmacd-5099", bse_avail(bse) >= 1);
2316 +
2317 + bse->entries[bse->nr_singles++] = *block;
2318 +}
2319 +
2320 +/* Get a pair of block numbers */
2321 +/* Audited by: green(2002.06.11) */
2322 +static inline blocknr_pair *bse_get_pair(blocknr_set_entry * bse, unsigned pno)
2323 +{
2324 + assert("green-1", BLOCKNR_SET_ENTRIES_NUMBER >= 2 * (pno + 1));
2325 +
2326 + return (blocknr_pair *) (bse->entries + BLOCKNR_SET_ENTRIES_NUMBER -
2327 + 2 * (pno + 1));
2328 +}
2329 +
2330 +/* Add a pair of block numbers to a blocknr_set_entry */
2331 +/* Audited by: green(2002.06.11) */
2332 +static void
2333 +bse_put_pair(blocknr_set_entry * bse, const reiser4_block_nr * a,
2334 + const reiser4_block_nr * b)
2335 +{
2336 + blocknr_pair *pair;
2337 +
2338 + assert("jmacd-5100", bse_avail(bse) >= 2 && a != NULL && b != NULL);
2339 +
2340 + pair = bse_get_pair(bse, bse->nr_pairs++);
2341 +
2342 + pair->a = *a;
2343 + pair->b = *b;
2344 +}
2345 +
2346 +/* Add either a block or pair of blocks to the block number set. The first
2347 + blocknr (@a) must be non-NULL. If @b is NULL a single blocknr is added, if
2348 + @b is non-NULL a pair is added. The block number set belongs to atom, and
2349 + the call is made with the atom lock held. There may not be enough space in
2350 + the current blocknr_set_entry. If new_bsep points to a non-NULL
2351 + blocknr_set_entry then it will be added to the blocknr_set and new_bsep
2352 + will be set to NULL. If new_bsep contains NULL then the atom lock will be
2353 + released and a new bse will be allocated in new_bsep. E_REPEAT will be
2354 + returned with the atom unlocked for the operation to be tried again. If
2355 + the operation succeeds, 0 is returned. If new_bsep is non-NULL and not
2356 + used during the call, it will be freed automatically. */
2357 +static int blocknr_set_add(txn_atom *atom, blocknr_set *bset,
2358 + blocknr_set_entry **new_bsep, const reiser4_block_nr *a,
2359 + const reiser4_block_nr *b)
2360 +{
2361 + blocknr_set_entry *bse;
2362 + unsigned entries_needed;
2363 +
2364 + assert("jmacd-5101", a != NULL);
2365 +
2366 + entries_needed = (b == NULL) ? 1 : 2;
2367 + if (list_empty(&bset->entries) ||
2368 + bse_avail(list_entry(bset->entries.next, blocknr_set_entry, link)) < entries_needed) {
2369 + /* See if a bse was previously allocated. */
2370 + if (*new_bsep == NULL) {
2371 + spin_unlock_atom(atom);
2372 + *new_bsep = bse_alloc();
2373 + return (*new_bsep != NULL) ? -E_REPEAT :
2374 + RETERR(-ENOMEM);
2375 + }
2376 +
2377 + /* Put it on the head of the list. */
2378 + list_add(&((*new_bsep)->link), &bset->entries);
2379 +
2380 + *new_bsep = NULL;
2381 + }
2382 +
2383 + /* Add the single or pair. */
2384 + bse = list_entry(bset->entries.next, blocknr_set_entry, link);
2385 + if (b == NULL) {
2386 + bse_put_single(bse, a);
2387 + } else {
2388 + bse_put_pair(bse, a, b);
2389 + }
2390 +
2391 + /* If new_bsep is non-NULL then there was an allocation race, free this copy. */
2392 + if (*new_bsep != NULL) {
2393 + bse_free(*new_bsep);
2394 + *new_bsep = NULL;
2395 + }
2396 +
2397 + return 0;
2398 +}
2399 +
2400 +/* Add an extent to the block set. If the length is 1, it is treated as a
2401 + single block (e.g., reiser4_set_add_block). */
2402 +/* Audited by: green(2002.06.11) */
2403 +/* Auditor note: Entire call chain cannot hold any spinlocks, because
2404 + kmalloc might schedule. The only exception is atom spinlock, which is
2405 + properly freed. */
2406 +int
2407 +blocknr_set_add_extent(txn_atom * atom,
2408 + blocknr_set * bset,
2409 + blocknr_set_entry ** new_bsep,
2410 + const reiser4_block_nr * start,
2411 + const reiser4_block_nr * len)
2412 +{
2413 + assert("jmacd-5102", start != NULL && len != NULL && *len > 0);
2414 + return blocknr_set_add(atom, bset, new_bsep, start,
2415 + *len == 1 ? NULL : len);
2416 +}
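+
+/* Editorial sketch, not part of the original patch, of the calling
+ * pattern implied by the -E_REPEAT contract above; compare
+ * reiser4_dealloc_blocks() in block_alloc.c (@start and @len are
+ * hypothetical variables):
+ *
+ *	blocknr_set_entry *bsep = NULL;
+ *	int ret;
+ *
+ *	do {
+ *		atom = get_current_atom_locked();
+ *		ret = blocknr_set_add_extent(atom, &atom->delete_set,
+ *					     &bsep, &start, &len);
+ *		if (ret == -ENOMEM)
+ *			return ret;
+ *	} while (ret == -E_REPEAT);
+ *	spin_unlock_atom(atom);
+ */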
2417 +
2418 +/* Add a block pair to the block set. It adds exactly a pair, which is checked
2419 + * by an assertion that both arguments are not null.*/
2420 +/* Audited by: green(2002.06.11) */
2421 +/* Auditor note: Entire call chain cannot hold any spinlocks, because
2422 + kmalloc might schedule. The only exception is atom spinlock, which is
2423 + properly freed. */
2424 +int
2425 +blocknr_set_add_pair(txn_atom * atom,
2426 + blocknr_set * bset,
2427 + blocknr_set_entry ** new_bsep, const reiser4_block_nr * a,
2428 + const reiser4_block_nr * b)
2429 +{
2430 + assert("jmacd-5103", a != NULL && b != NULL);
2431 + return blocknr_set_add(atom, bset, new_bsep, a, b);
2432 +}
2433 +
2434 +/* Initialize a blocknr_set. */
2435 +void blocknr_set_init(blocknr_set *bset)
2436 +{
2437 + INIT_LIST_HEAD(&bset->entries);
2438 +}
2439 +
2440 +/* Release the entries of a blocknr_set. */
2441 +void blocknr_set_destroy(blocknr_set *bset)
2442 +{
2443 + blocknr_set_entry *bse;
2444 +
2445 + while (!list_empty_careful(&bset->entries)) {
2446 + bse = list_entry(bset->entries.next, blocknr_set_entry, link);
2447 + list_del_init(&bse->link);
2448 + bse_free(bse);
2449 + }
2450 +}
2451 +
2452 +/* Merge blocknr_set entries out of @from into @into. */
2453 +/* Audited by: green(2002.06.11) */
2454 +/* Auditor comments: This merge does not know if the merged sets contain
2455 +   block pairs (as for wandered sets) or extents, so it cannot really merge
2456 +   overlapping ranges if there are any. So I believe it may lead to
2457 +   some blocks appearing several times in one blocknr_set. To help
2458 +   debug such problems it might help to check for duplicate entries on
2459 +   actual processing of this set. Testing this kind of thing right here is
2460 +   also complicated by the fact that these sets are not sorted, and going
2461 +   through the whole set on each element addition would be a CPU-heavy task */
2462 +void blocknr_set_merge(blocknr_set * from, blocknr_set * into)
2463 +{
2464 + blocknr_set_entry *bse_into = NULL;
2465 +
2466 + /* If @from is empty, no work to perform. */
2467 + if (list_empty_careful(&from->entries)) {
2468 + return;
2469 + }
2470 +
2471 + /* If @into is not empty, try merging partial-entries. */
2472 + if (!list_empty_careful(&into->entries)) {
2473 +
2474 +		/* Neither set is empty; pop the front two members and try to combine them. */
2475 + blocknr_set_entry *bse_from;
2476 + unsigned into_avail;
2477 +
2478 + bse_into = list_entry(into->entries.next, blocknr_set_entry, link);
2479 + list_del_init(&bse_into->link);
2480 + bse_from = list_entry(from->entries.next, blocknr_set_entry, link);
2481 + list_del_init(&bse_from->link);
2482 +
2483 + /* Combine singles. */
2484 + for (into_avail = bse_avail(bse_into);
2485 + into_avail != 0 && bse_from->nr_singles != 0;
2486 + into_avail -= 1) {
2487 + bse_put_single(bse_into,
2488 + &bse_from->entries[--bse_from->
2489 + nr_singles]);
2490 + }
2491 +
2492 + /* Combine pairs. */
2493 + for (; into_avail > 1 && bse_from->nr_pairs != 0;
2494 + into_avail -= 2) {
2495 + blocknr_pair *pair =
2496 + bse_get_pair(bse_from, --bse_from->nr_pairs);
2497 + bse_put_pair(bse_into, &pair->a, &pair->b);
2498 + }
2499 +
2500 + /* If bse_from is empty, delete it now. */
2501 + if (bse_avail(bse_from) == BLOCKNR_SET_ENTRIES_NUMBER) {
2502 + bse_free(bse_from);
2503 + } else {
2504 + /* Otherwise, bse_into is full or nearly full (e.g.,
2505 + it could have one slot avail and bse_from has one
2506 + pair left). Push it back onto the list. bse_from
2507 + becomes bse_into, which will be the new partial. */
2508 + list_add(&bse_into->link, &into->entries);
2509 + bse_into = bse_from;
2510 + }
2511 + }
2512 +
2513 + /* Splice lists together. */
2514 + list_splice_init(&from->entries, into->entries.prev);
2515 +
2516 + /* Add the partial entry back to the head of the list. */
2517 + if (bse_into != NULL) {
2518 + list_add(&bse_into->link, &into->entries);
2519 + }
2520 +}
2521 +
2522 +/* Iterate over all blocknr set elements. */
2523 +int blocknr_set_iterator(txn_atom *atom, blocknr_set *bset,
2524 + blocknr_set_actor_f actor, void *data, int delete)
2525 +{
2526 +
2527 + blocknr_set_entry *entry;
2528 +
2529 + assert("zam-429", atom != NULL);
2530 + assert("zam-430", atom_is_protected(atom));
2531 + assert("zam-431", bset != 0);
2532 + assert("zam-432", actor != NULL);
2533 +
2534 + entry = list_entry(bset->entries.next, blocknr_set_entry, link);
2535 + while (&bset->entries != &entry->link) {
2536 + blocknr_set_entry *tmp = list_entry(entry->link.next, blocknr_set_entry, link);
2537 + unsigned int i;
2538 + int ret;
2539 +
2540 + for (i = 0; i < entry->nr_singles; i++) {
2541 + ret = actor(atom, &entry->entries[i], NULL, data);
2542 +
2543 +			/* We can't break out of the loop if the delete flag is set. */
2544 + if (ret != 0 && !delete)
2545 + return ret;
2546 + }
2547 +
2548 + for (i = 0; i < entry->nr_pairs; i++) {
2549 + struct blocknr_pair *ab;
2550 +
2551 + ab = bse_get_pair(entry, i);
2552 +
2553 + ret = actor(atom, &ab->a, &ab->b, data);
2554 +
2555 + if (ret != 0 && !delete)
2556 + return ret;
2557 + }
2558 +
2559 + if (delete) {
2560 + list_del(&entry->link);
2561 + bse_free(entry);
2562 + }
2563 +
2564 + entry = tmp;
2565 + }
2566 +
2567 + return 0;
2568 +}
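+
+/* Editorial note, not part of the original patch: for a real actor see
+ * apply_dset() in block_alloc.c, which post_commit_hook() runs over the
+ * atom's delete_set with the delete flag set, so that entries are freed
+ * as they are processed. */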
2569 +
2570 +/*
2571 + * Local variables:
2572 + * c-indentation-style: "K&R"
2573 + * mode-name: "LC"
2574 + * c-basic-offset: 8
2575 + * tab-width: 8
2576 + * fill-column: 79
2577 + * scroll-step: 1
2578 + * End:
2579 + */
2580 Index: linux-2.6.16/fs/reiser4/carry.c
2581 ===================================================================
2582 --- /dev/null
2583 +++ linux-2.6.16/fs/reiser4/carry.c
2584 @@ -0,0 +1,1381 @@
2585 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
2586 +/* Functions to "carry" tree modification(s) upward. */
2587 +/* Tree is modified one level at a time. As we modify a level we accumulate a
2588 + set of changes that need to be propagated to the next level. We manage
2589 + node locking such that any searches that collide with carrying are
2590 + restarted, from the root if necessary.
2591 +
2592 + Insertion of a new item may result in items being moved among nodes and
2593 + this requires the delimiting key to be updated at the least common parent
2594 + of the nodes modified to preserve search tree invariants. Also, insertion
2595 + may require allocation of a new node. A pointer to the new node has to be
2596 + inserted into some node on the parent level, etc.
2597 +
2598 + Tree carrying is meant to be analogous to arithmetic carrying.
2599 +
2600 + A carry operation is always associated with some node (&carry_node).
2601 +
2602 + Carry process starts with some initial set of operations to be performed
2603 + and an initial set of already locked nodes. Operations are performed one
2604 +   by one. Performing each single operation has the following possible effects:
2605 +
2606 + - content of carry node associated with operation is modified
2607 + - new carry nodes are locked and involved into carry process on this level
2608 + - new carry operations are posted to the next level
2609 +
2610 + After all carry operations on this level are done, process is repeated for
2611 +   the accumulated sequence of carry operations for the next level. This
2612 + starts by trying to lock (in left to right order) all carry nodes
2613 + associated with carry operations on the parent level. After this, we decide
2614 + whether more nodes are required on the left of already locked set. If so,
2615 + all locks taken on the parent level are released, new carry nodes are
2616 + added, and locking process repeats.
2617 +
2618 +   It may happen that the balancing process fails owing to an unrecoverable
2619 +   error on some of the upper levels of the tree (possible causes are io
2620 +   errors, failure to allocate a new node, etc.). In this case we should
2621 +   unmount the filesystem, rebooting if it is the root, and possibly advise the use of fsck.
2622 +
2623 + USAGE:
2624 +
2625 + int some_tree_operation( znode *node, ... )
2626 + {
2627 + // Allocate on a stack pool of carry objects: operations and nodes.
2628 + // Most carry processes will only take objects from here, without
2629 + // dynamic allocation.
2630 +
2631 +I feel uneasy about this pool. It adds to code complexity, I understand why it exists, but.... -Hans
2632 +
2633 +       carry_pool *pool;
2634 +       carry_level *lowest_level;
2635 +       carry_op *op;
2636 +       pool = init_carry_pool( sizeof( carry_pool ) + 3 * sizeof( carry_level ) );
2637 +       lowest_level = ( carry_level * )( pool + 1 );
2638 +       init_carry_level( lowest_level, pool );
2639 +
2640 + // operation may be one of:
2641 + // COP_INSERT --- insert new item into node
2642 + // COP_CUT --- remove part of or whole node
2643 + // COP_PASTE --- increase size of item
2644 + // COP_DELETE --- delete pointer from parent node
2645 + // COP_UPDATE --- update delimiting key in least
2646 + // common ancestor of two
2647 +
2648 +       op = post_carry( lowest_level, operation, node, 0 );
2649 +       if( IS_ERR( op ) || ( op == NULL ) ) {
2650 +              handle error
2651 +       } else {
2652 +              // fill in remaining fields in @op, according to carry.h:carry_op
2653 +              result = carry( lowest_level, NULL );
2654 +       }
2655 +       done_carry_pool( pool );
2656 + }
2657 +
2658 + When you are implementing node plugin method that participates in carry
2659 + (shifting, insertion, deletion, etc.), do the following:
2660 +
2661 + int foo_node_method( znode *node, ..., carry_level *todo )
2662 + {
2663 + carry_op *op;
2664 +
2665 + ....
2666 +
2667 + // note, that last argument to post_carry() is non-null
2668 + // here, because @op is to be applied to the parent of @node, rather
2669 + // than to the @node itself as in the previous case.
2670 +
2671 + op = node_post_carry( todo, operation, node, 1 );
2672 + // fill in remaining fields in @op, according to carry.h:carry_op
2673 +
2674 + ....
2675 +
2676 + }
2677 +
2678 + BATCHING:
2679 +
2680 + One of the main advantages of level-by-level balancing implemented here is
2681 +   the ability to batch updates on a parent level and to perform them more
2682 + efficiently as a result.
2683 +
2684 + Description To Be Done (TBD).
2685 +
2686 + DIFFICULTIES AND SUBTLE POINTS:
2687 +
2688 + 1. complex plumbing is required, because:
2689 +
2690 + a. effective allocation through pools is needed
2691 +
2692 + b. target of operation is not exactly known when operation is
2693 + posted. This is worked around through bitfields in &carry_node and
2694 + logic in lock_carry_node()
2695 +
2696 + c. of interaction with locking code: node should be added into sibling
2697 + list when pointer to it is inserted into its parent, which is some time
2698 + after node was created. Between these moments, node is somewhat in
2699 + suspended state and is only registered in the carry lists
2700 +
2701 + 2. whole balancing logic is implemented here, in particular, insertion
2702 + logic is coded in make_space().
2703 +
2704 + 3. special cases like insertion (add_tree_root()) or deletion
2705 + (kill_tree_root()) of tree root and morphing of paste into insert
2706 + (insert_paste()) have to be handled.
2707 +
2708 + 4. there is non-trivial interdependency between allocation of new nodes
2709 + and almost everything else. This is mainly due to the (1.c) above. I shall
2710 + write about this later.
2711 +
2712 +*/
2713 +
2714 +#include "forward.h"
2715 +#include "debug.h"
2716 +#include "key.h"
2717 +#include "coord.h"
2718 +#include "plugin/item/item.h"
2719 +#include "plugin/item/extent.h"
2720 +#include "plugin/node/node.h"
2721 +#include "jnode.h"
2722 +#include "znode.h"
2723 +#include "tree_mod.h"
2724 +#include "tree_walk.h"
2725 +#include "block_alloc.h"
2726 +#include "pool.h"
2727 +#include "tree.h"
2728 +#include "carry.h"
2729 +#include "carry_ops.h"
2730 +#include "super.h"
2731 +#include "reiser4.h"
2732 +
2733 +#include <linux/types.h>
2734 +
2735 +/* level locking/unlocking */
2736 +static int lock_carry_level(carry_level * level);
2737 +static void unlock_carry_level(carry_level * level, int failure);
2738 +static void done_carry_level(carry_level * level);
2739 +static void unlock_carry_node(carry_level * level, carry_node * node, int fail);
2740 +
2741 +int lock_carry_node(carry_level * level, carry_node * node);
2742 +int lock_carry_node_tail(carry_node * node);
2743 +
2744 +/* carry processing proper */
2745 +static int carry_on_level(carry_level * doing, carry_level * todo);
2746 +
2747 +static carry_op *add_op(carry_level * level, pool_ordering order,
2748 + carry_op * reference);
2749 +
2750 +/* handlers for carry operations. */
2751 +
2752 +static void fatal_carry_error(carry_level * doing, int ecode);
2753 +static int add_new_root(carry_level * level, carry_node * node, znode * fake);
2754 +
2755 +
2756 +static void print_level(const char *prefix, carry_level * level);
2757 +
2758 +#if REISER4_DEBUG
2759 +typedef enum {
2760 + CARRY_TODO,
2761 + CARRY_DOING
2762 +} carry_queue_state;
2763 +static int carry_level_invariant(carry_level * level, carry_queue_state state);
2764 +#endif
2765 +
2766 +/* main entry point for tree balancing.
2767 +
2768 + Tree carry performs operations from @doing and while doing so accumulates
2769 + information about operations to be performed on the next level ("carried"
2770 + to the parent level). Carried operations are performed, causing possibly
2771 +   more operations to be carried upward, etc. carry() takes care of
2772 + locking and pinning znodes while operating on them.
2773 +
2774 + For usage, see comment at the top of fs/reiser4/carry.c
2775 +
2776 +*/
2777 +int carry(carry_level * doing /* set of carry operations to be performed */ ,
2778 + carry_level * done /* set of nodes, already performed at the
2779 + * previous level. NULL in most cases */ )
2780 +{
2781 + int result = 0;
2782 + /* queue of new requests */
2783 + carry_level *todo;
2784 + ON_DEBUG(STORE_COUNTERS);
2785 +
2786 + assert("nikita-888", doing != NULL);
2787 + BUG_ON(done != NULL);
2788 +
2789 + todo = doing + 1;
2790 + init_carry_level(todo, doing->pool);
2791 +
2792 +	/* queue of requests performed on the previous level */
2793 + done = todo + 1;
2794 + init_carry_level(done, doing->pool);
2795 +
2796 + /* iterate until there is nothing more to do */
2797 + while (result == 0 && doing->ops_num > 0) {
2798 + carry_level *tmp;
2799 +
2800 + /* at this point @done is locked. */
2801 + /* repeat lock/do/unlock while
2802 +
2803 + (1) lock_carry_level() fails due to deadlock avoidance, or
2804 +
2805 + (2) carry_on_level() decides that more nodes have to
2806 + be involved.
2807 +
2808 + (3) some unexpected error occurred while balancing on the
2809 + upper levels. In this case all changes are rolled back.
2810 +
2811 + */
2812 + while (1) {
2813 + result = lock_carry_level(doing);
2814 + if (result == 0) {
2815 + /* perform operations from @doing and
2816 + accumulate new requests in @todo */
2817 + result = carry_on_level(doing, todo);
2818 + if (result == 0)
2819 + break;
2820 + else if (result != -E_REPEAT ||
2821 + !doing->restartable) {
2822 + warning("nikita-1043",
2823 + "Fatal error during carry: %i",
2824 + result);
2825 + print_level("done", done);
2826 + print_level("doing", doing);
2827 + print_level("todo", todo);
2828 + /* do some rough stuff like aborting
2829 + all pending transcrashes and thus
2830 + pushing tree back to the consistent
2831 +					   state. Alternatively, just panic.
2832 + */
2833 + fatal_carry_error(doing, result);
2834 + return result;
2835 + }
2836 + } else if (result != -E_REPEAT) {
2837 + fatal_carry_error(doing, result);
2838 + return result;
2839 + }
2840 + unlock_carry_level(doing, 1);
2841 + }
2842 + /* at this point @done can be safely unlocked */
2843 + done_carry_level(done);
2844 +
2845 + /* cyclically shift queues */
2846 + tmp = done;
2847 + done = doing;
2848 + doing = todo;
2849 + todo = tmp;
2850 + init_carry_level(todo, doing->pool);
2851 +
2852 + /* give other threads chance to run */
2853 + preempt_point();
2854 + }
2855 + done_carry_level(done);
2856 +
2857 + /* all counters, but x_refs should remain the same. x_refs can change
2858 + owing to transaction manager */
2859 + ON_DEBUG(CHECK_COUNTERS);
2860 + return result;
2861 +}
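+
+/* Editorial note, not part of the original patch: carry() relies on the
+ * caller having allocated three consecutive carry_levels (this is what
+ * the size assertion in init_carry_pool() enforces): @doing itself,
+ * @doing + 1 used as "todo" and @doing + 2 used as "done". The three
+ * queues are rotated as balancing proceeds up the tree, one level per
+ * iteration of the main loop. */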
2862 +
2863 +/* perform carry operations on given level.
2864 +
2865 + Optimizations proposed by pooh:
2866 +
2867 + (1) don't lock all nodes from queue at the same time. Lock nodes lazily as
2868 + required;
2869 +
2870 + (2) unlock node if there are no more operations to be performed upon it and
2871 + node didn't add any operation to @todo. This can be implemented by
2872 +   attaching to each node two counters: a counter of operations working on
2873 +   this node and a counter of operations carried upward from this node.
2874 +
2875 +*/
2876 +static int carry_on_level(carry_level * doing /* queue of carry operations to
2877 + * do on this level */ ,
2878 + carry_level * todo /* queue where new carry
2879 + * operations to be performed on
2880 +					   * the parent level are
2881 + * accumulated during @doing
2882 + * processing. */ )
2883 +{
2884 + int result;
2885 + int (*f) (carry_op *, carry_level *, carry_level *);
2886 + carry_op *op;
2887 + carry_op *tmp_op;
2888 +
2889 + assert("nikita-1034", doing != NULL);
2890 + assert("nikita-1035", todo != NULL);
2891 +
2892 + /* @doing->nodes are locked. */
2893 +
2894 + /* This function can be split into two phases: analysis and modification.
2895 +
2896 + Analysis calculates precisely what items should be moved between
2897 + nodes. This information is gathered in some structures attached to
2898 + each carry_node in a @doing queue. Analysis also determines whether
2899 + new nodes are to be allocated etc.
2900 +
2901 + After analysis is completed, actual modification is performed. Here
2902 + we can take advantage of "batch modification": if there are several
2903 + operations acting on the same node, modifications can be performed
2904 + more efficiently when batched together.
2905 +
2906 + Above is an optimization left for the future.
2907 + */
2908 + /* Important, but delayed optimization: it's possible to batch
2909 + operations together and perform them more efficiently as a
2910 + result. For example, deletion of several neighboring items from a
2911 + node can be converted to a single ->cut() operation.
2912 +
2913 + Before processing queue, it should be scanned and "mergeable"
2914 + operations merged.
2915 + */
2916 + result = 0;
2917 + for_all_ops(doing, op, tmp_op) {
2918 + carry_opcode opcode;
2919 +
2920 + assert("nikita-1041", op != NULL);
2921 + opcode = op->op;
2922 + assert("nikita-1042", op->op < COP_LAST_OP);
2923 + f = op_dispatch_table[op->op].handler;
2924 + result = f(op, doing, todo);
2925 + /* locking can fail with -E_REPEAT. Any different error is fatal
2926 + and will be handled by fatal_carry_error() sledgehammer.
2927 + */
2928 + if (result != 0)
2929 + break;
2930 + }
2931 + if (result == 0) {
2932 + carry_plugin_info info;
2933 + carry_node *scan;
2934 + carry_node *tmp_scan;
2935 +
2936 + info.doing = doing;
2937 + info.todo = todo;
2938 +
2939 + assert("nikita-3002",
2940 + carry_level_invariant(doing, CARRY_DOING));
2941 + for_all_nodes(doing, scan, tmp_scan) {
2942 + znode *node;
2943 +
2944 + node = carry_real(scan);
2945 + assert("nikita-2547", node != NULL);
2946 + if (node_is_empty(node)) {
2947 + result =
2948 + node_plugin_by_node(node)->
2949 + prepare_removal(node, &info);
2950 + if (result != 0)
2951 + break;
2952 + }
2953 + }
2954 + }
2955 + return result;
2956 +}
2957 +
2958 +/* post carry operation
2959 +
2960 +   This is the main function used by external carry clients (node layout
2961 +   plugins and tree operations) to create a new carry operation to be
2962 +   performed on some level.
2963 +
2964 + New operation will be included in the @level queue. To actually perform it,
2965 + call carry( level, ... ). This function takes write lock on @node. Carry
2966 + manages all its locks by itself, don't worry about this.
2967 +
2968 + This function adds operation and node at the end of the queue. It is up to
2969 + caller to guarantee proper ordering of node queue.
2970 +
2971 +*/
2972 +carry_op *post_carry(carry_level * level /* queue where new operation is to
2973 + * be posted at */ ,
2974 + carry_opcode op /* opcode of operation */ ,
2975 + znode * node /* node on which this operation
2976 + * will operate */ ,
2977 + int apply_to_parent_p /* whether operation will operate
2978 + * directly on @node or on it
2979 + * parent. */ )
2980 +{
2981 + carry_op *result;
2982 + carry_node *child;
2983 +
2984 + assert("nikita-1046", level != NULL);
2985 + assert("nikita-1788", znode_is_write_locked(node));
2986 +
2987 + result = add_op(level, POOLO_LAST, NULL);
2988 + if (IS_ERR(result))
2989 + return result;
2990 + child = add_carry(level, POOLO_LAST, NULL);
2991 + if (IS_ERR(child)) {
2992 + reiser4_pool_free(&level->pool->op_pool, &result->header);
2993 + return (carry_op *) child;
2994 + }
2995 + result->node = child;
2996 + result->op = op;
2997 + child->parent = apply_to_parent_p;
2998 + if (ZF_ISSET(node, JNODE_ORPHAN))
2999 + child->left_before = 1;
3000 + child->node = node;
3001 + return result;
3002 +}
3003 +
3004 +/* initialize carry queue */
3005 +void init_carry_level(carry_level * level /* level to initialize */ ,
3006 + carry_pool * pool /* pool @level will allocate objects
3007 + * from */ )
3008 +{
3009 + assert("nikita-1045", level != NULL);
3010 + assert("nikita-967", pool != NULL);
3011 +
3012 + memset(level, 0, sizeof *level);
3013 + level->pool = pool;
3014 +
3015 + INIT_LIST_HEAD(&level->nodes);
3016 + INIT_LIST_HEAD(&level->ops);
3017 +}
3018 +
3019 +/* allocate carry pool and initialize pools within queue */
3020 +carry_pool *init_carry_pool(int size)
3021 +{
3022 + carry_pool *pool;
3023 +
3024 + assert("", size >= sizeof(carry_pool) + 3 * sizeof(carry_level));
3025 + pool = kmalloc(size, get_gfp_mask());
3026 + if (pool == NULL)
3027 + return ERR_PTR(RETERR(-ENOMEM));
3028 +
3029 + reiser4_init_pool(&pool->op_pool, sizeof(carry_op), CARRIES_POOL_SIZE,
3030 + (char *)pool->op);
3031 + reiser4_init_pool(&pool->node_pool, sizeof(carry_node),
3032 + NODES_LOCKED_POOL_SIZE, (char *)pool->node);
3033 + return pool;
3034 +}
3035 +
3036 +/* finish with queue pools */
3037 +void done_carry_pool(carry_pool * pool /* pool to destroy */ )
3038 +{
3039 + reiser4_done_pool(&pool->op_pool);
3040 + reiser4_done_pool(&pool->node_pool);
3041 + kfree(pool);
3042 +}
3043 +
3044 +/* add new carry node to the @level.
3045 +
3046 + Returns pointer to the new carry node allocated from pool. It's up to
3047 + callers to maintain proper order in the @level. Assumption is that if carry
3048 +   nodes on one level are already sorted and modifications are performed from
3049 + left to right, carry nodes added on the parent level will be ordered
3050 + automatically. To control ordering use @order and @reference parameters.
3051 +
3052 +*/
3053 +carry_node *add_carry_skip(carry_level * level /* &carry_level to add node
3054 + * to */ ,
3055 + pool_ordering order /* where to insert: at the
3056 + * beginning of @level,
3057 + * before @reference, after
3058 + * @reference, at the end
3059 + * of @level */ ,
3060 + carry_node * reference /* reference node for
3061 + * insertion */ )
3062 +{
3063 + ON_DEBUG(carry_node * orig_ref = reference);
3064 +
3065 + if (order == POOLO_BEFORE) {
3066 + reference = find_left_carry(reference, level);
3067 + if (reference == NULL)
3068 + reference = list_entry(level->nodes.next, carry_node,
3069 + header.level_linkage);
3070 + else
3071 + reference = list_entry(reference->header.level_linkage.next,
3072 + carry_node, header.level_linkage);
3073 + } else if (order == POOLO_AFTER) {
3074 + reference = find_right_carry(reference, level);
3075 + if (reference == NULL)
3076 + reference = list_entry(level->nodes.prev, carry_node,
3077 + header.level_linkage);
3078 + else
3079 + reference = list_entry(reference->header.level_linkage.prev,
3080 + carry_node, header.level_linkage);
3081 + }
3082 + assert("nikita-2209",
3083 + ergo(orig_ref != NULL,
3084 + carry_real(reference) == carry_real(orig_ref)));
3085 + return add_carry(level, order, reference);
3086 +}
3087 +
3088 +carry_node *add_carry(carry_level * level /* &carry_level to add node
3089 + * to */ ,
3090 + pool_ordering order /* where to insert: at the
3091 + * beginning of @level, before
3092 + * @reference, after @reference,
3093 + * at the end of @level */ ,
3094 + carry_node * reference /* reference node for
3095 + * insertion */ )
3096 +{
3097 + carry_node *result;
3098 +
3099 + result =
3100 + (carry_node *) add_obj(&level->pool->node_pool, &level->nodes,
3101 + order, &reference->header);
3102 + if (!IS_ERR(result) && (result != NULL))
3103 + ++level->nodes_num;
3104 + return result;
3105 +}
3106 +
3107 +/* add new carry operation to the @level.
3108 +
3109 +   Returns pointer to the new carry operation allocated from pool. It's up to
3110 + callers to maintain proper order in the @level. To control ordering use
3111 + @order and @reference parameters.
3112 +
3113 +*/
3114 +static carry_op *add_op(carry_level * level /* &carry_level to add node to */ ,
3115 + pool_ordering order /* where to insert: at the beginning of
3116 + * @level, before @reference, after
3117 + * @reference, at the end of @level */ ,
3118 + carry_op *
3119 + reference /* reference node for insertion */ )
3120 +{
3121 + carry_op *result;
3122 +
3123 + result =
3124 + (carry_op *) add_obj(&level->pool->op_pool, &level->ops, order,
3125 + &reference->header);
3126 + if (!IS_ERR(result) && (result != NULL))
3127 + ++level->ops_num;
3128 + return result;
3129 +}
3130 +
3131 +/* Return node on the right of which @node was created.
3132 +
3133 +   Each node is created on the right of some existing node (or it is a new
3134 +   root, which is a special case not handled here).
3135 +
3136 + @node is new node created on some level, but not yet inserted into its
3137 + parent, it has corresponding bit (JNODE_ORPHAN) set in zstate.
3138 +
3139 +*/
3140 +static carry_node *find_begetting_brother(carry_node * node /* node to start search
3141 + * from */ ,
3142 + carry_level * kin UNUSED_ARG /* level to
3143 + * scan */ )
3144 +{
3145 + carry_node *scan;
3146 +
3147 + assert("nikita-1614", node != NULL);
3148 + assert("nikita-1615", kin != NULL);
3149 + assert("nikita-1616", LOCK_CNT_GTZ(rw_locked_tree));
3150 + assert("nikita-1619", ergo(carry_real(node) != NULL,
3151 + ZF_ISSET(carry_real(node), JNODE_ORPHAN)));
3152 +
3153 + for (scan = node;;
3154 + scan = list_entry(scan->header.level_linkage.prev, carry_node,
3155 + header.level_linkage)) {
3156 + assert("nikita-1617", &kin->nodes != &scan->header.level_linkage);
3157 + if ((scan->node != node->node) &&
3158 + !ZF_ISSET(scan->node, JNODE_ORPHAN)) {
3159 + assert("nikita-1618", carry_real(scan) != NULL);
3160 + break;
3161 + }
3162 + }
3163 + return scan;
3164 +}
3165 +
3166 +static cmp_t
3167 +carry_node_cmp(carry_level * level, carry_node * n1, carry_node * n2)
3168 +{
3169 + assert("nikita-2199", n1 != NULL);
3170 + assert("nikita-2200", n2 != NULL);
3171 +
3172 + if (n1 == n2)
3173 + return EQUAL_TO;
3174 + while (1) {
3175 + n1 = carry_node_next(n1);
3176 + if (carry_node_end(level, n1))
3177 + return GREATER_THAN;
3178 + if (n1 == n2)
3179 + return LESS_THAN;
3180 + }
3181 + impossible("nikita-2201", "End of level reached");
3182 +}
3183 +
3184 +carry_node *find_carry_node(carry_level * level, const znode * node)
3185 +{
3186 + carry_node *scan;
3187 + carry_node *tmp_scan;
3188 +
3189 + assert("nikita-2202", level != NULL);
3190 + assert("nikita-2203", node != NULL);
3191 +
3192 + for_all_nodes(level, scan, tmp_scan) {
3193 + if (carry_real(scan) == node)
3194 + return scan;
3195 + }
3196 + return NULL;
3197 +}
3198 +
3199 +znode *carry_real(const carry_node * node)
3200 +{
3201 + assert("nikita-3061", node != NULL);
3202 +
3203 + return node->lock_handle.node;
3204 +}
3205 +
3206 +carry_node *insert_carry_node(carry_level * doing, carry_level * todo,
3207 + const znode * node)
3208 +{
3209 + carry_node *base;
3210 + carry_node *scan;
3211 + carry_node *tmp_scan;
3212 + carry_node *proj;
3213 +
3214 + base = find_carry_node(doing, node);
3215 + assert("nikita-2204", base != NULL);
3216 +
3217 + for_all_nodes(todo, scan, tmp_scan) {
3218 + proj = find_carry_node(doing, scan->node);
3219 + assert("nikita-2205", proj != NULL);
3220 + if (carry_node_cmp(doing, proj, base) != LESS_THAN)
3221 + break;
3222 + }
3223 + return scan;
3224 +}
3225 +
3226 +static carry_node *add_carry_atplace(carry_level * doing, carry_level * todo,
3227 + znode * node)
3228 +{
3229 + carry_node *reference;
3230 +
3231 + assert("nikita-2994", doing != NULL);
3232 + assert("nikita-2995", todo != NULL);
3233 + assert("nikita-2996", node != NULL);
3234 +
3235 + reference = insert_carry_node(doing, todo, node);
3236 + assert("nikita-2997", reference != NULL);
3237 +
3238 + return add_carry(todo, POOLO_BEFORE, reference);
3239 +}
3240 +
3241 +/* like post_carry(), but designed to be called from node plugin methods.
3242 +   This function is different from post_carry() in that it finds the proper
3243 +   place to insert the node in the queue. */
3244 +carry_op *node_post_carry(carry_plugin_info * info /* carry parameters
3245 + * passed down to node
3246 + * plugin */ ,
3247 + carry_opcode op /* opcode of operation */ ,
3248 + znode * node /* node on which this
3249 + * operation will operate */ ,
3250 + int apply_to_parent_p /* whether operation will
3251 + * operate directly on @node
3252 +					  * or on its parent. */ )
3253 +{
3254 + carry_op *result;
3255 + carry_node *child;
3256 +
3257 + assert("nikita-2207", info != NULL);
3258 + assert("nikita-2208", info->todo != NULL);
3259 +
3260 + if (info->doing == NULL)
3261 + return post_carry(info->todo, op, node, apply_to_parent_p);
3262 +
3263 + result = add_op(info->todo, POOLO_LAST, NULL);
3264 + if (IS_ERR(result))
3265 + return result;
3266 + child = add_carry_atplace(info->doing, info->todo, node);
3267 + if (IS_ERR(child)) {
3268 + reiser4_pool_free(&info->todo->pool->op_pool, &result->header);
3269 + return (carry_op *) child;
3270 + }
3271 + result->node = child;
3272 + result->op = op;
3273 + child->parent = apply_to_parent_p;
3274 + if (ZF_ISSET(node, JNODE_ORPHAN))
3275 + child->left_before = 1;
3276 + child->node = node;
3277 + return result;
3278 +}
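+
+/* A minimal usage sketch (illustrative only; the hook name below is
+   hypothetical): a node plugin method, handed a carry_plugin_info, can post
+   an operation against its node with node_post_carry() and then fill the
+   op->u branch matching the opcode (here, op->u.update.left for COP_UPDATE).
+
+	static int example_shift_hook(znode *node, carry_plugin_info *info)
+	{
+		carry_op *op;
+
+		op = node_post_carry(info, COP_UPDATE, node, 1);
+		if (IS_ERR(op))
+			return PTR_ERR(op);
+		op->u.update.left = NULL;
+		return 0;
+	}
+*/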
3279 +
3280 +/* lock all carry nodes in @level */
3281 +static int lock_carry_level(carry_level * level /* level to lock */ )
3282 +{
3283 + int result;
3284 + carry_node *node;
3285 + carry_node *tmp_node;
3286 +
3287 + assert("nikita-881", level != NULL);
3288 + assert("nikita-2229", carry_level_invariant(level, CARRY_TODO));
3289 +
3290 + /* lock nodes from left to right */
3291 + result = 0;
3292 + for_all_nodes(level, node, tmp_node) {
3293 + result = lock_carry_node(level, node);
3294 + if (result != 0)
3295 + break;
3296 + }
3297 + return result;
3298 +}
3299 +
3300 +/* Synchronize delimiting keys between @node and its left neighbor.
3301 +
3302 + To reduce contention on dk key and simplify carry code, we synchronize
3303 + delimiting keys only when carry ultimately leaves tree level (carrying
3304 + changes upward) and unlocks nodes at this level.
3305 +
3306 +   This function first finds the left neighbor of @node and then updates the
3307 +   left neighbor's right delimiting key to coincide with the least key in @node.
3308 +
3309 +*/
3310 +
3311 +ON_DEBUG(extern atomic_t delim_key_version;
3312 + )
3313 +
3314 +static void sync_dkeys(znode * spot /* node to update */ )
3315 +{
3316 + reiser4_key pivot;
3317 + reiser4_tree *tree;
3318 +
3319 + assert("nikita-1610", spot != NULL);
3320 + assert("nikita-1612", LOCK_CNT_NIL(rw_locked_dk));
3321 +
3322 + tree = znode_get_tree(spot);
3323 + read_lock_tree(tree);
3324 + write_lock_dk(tree);
3325 +
3326 + assert("nikita-2192", znode_is_loaded(spot));
3327 +
3328 + /* sync left delimiting key of @spot with key in its leftmost item */
3329 + if (node_is_empty(spot))
3330 + pivot = *znode_get_rd_key(spot);
3331 + else
3332 + leftmost_key_in_node(spot, &pivot);
3333 +
3334 + znode_set_ld_key(spot, &pivot);
3335 +
3336 +	/* there can be a sequence of empty nodes pending removal on the left of
3337 +	   @spot. Scan them and update their left and right delimiting keys to
3338 +	   match the left delimiting key of @spot. Also, update the right delimiting
3339 +	   key of the first non-empty left neighbor.
3340 +	 */
3341 + while (1) {
3342 + if (!ZF_ISSET(spot, JNODE_LEFT_CONNECTED))
3343 + break;
3344 +
3345 + spot = spot->left;
3346 + if (spot == NULL)
3347 + break;
3348 +
3349 + znode_set_rd_key(spot, &pivot);
3350 + /* don't sink into the domain of another balancing */
3351 + if (!znode_is_write_locked(spot))
3352 + break;
3353 + if (ZF_ISSET(spot, JNODE_HEARD_BANSHEE))
3354 + znode_set_ld_key(spot, &pivot);
3355 + else
3356 + break;
3357 + }
3358 +
3359 + write_unlock_dk(tree);
3360 + read_unlock_tree(tree);
3361 +}
3362 +
3363 +/* unlock all carry nodes in @level */
3364 +static void unlock_carry_level(carry_level * level /* level to unlock */ ,
3365 + int failure /* true if unlocking owing to
3366 + * failure */ )
3367 +{
3368 + carry_node *node;
3369 + carry_node *tmp_node;
3370 +
3371 + assert("nikita-889", level != NULL);
3372 +
3373 + if (!failure) {
3374 + znode *spot;
3375 +
3376 + spot = NULL;
3377 + /* update delimiting keys */
3378 + for_all_nodes(level, node, tmp_node) {
3379 + if (carry_real(node) != spot) {
3380 + spot = carry_real(node);
3381 + sync_dkeys(spot);
3382 + }
3383 + }
3384 + }
3385 +
3386 +	/* nodes can be unlocked in arbitrary order. In a preemptible
3387 +	   environment it's better to unlock in reverse order of locking,
3388 + though.
3389 + */
3390 + for_all_nodes_back(level, node, tmp_node) {
3391 + /* all allocated nodes should be already linked to their
3392 + parents at this moment. */
3393 + assert("nikita-1631", ergo(!failure, !ZF_ISSET(carry_real(node),
3394 + JNODE_ORPHAN)));
3395 + ON_DEBUG(check_dkeys(carry_real(node)));
3396 + unlock_carry_node(level, node, failure);
3397 + }
3398 + level->new_root = NULL;
3399 +}
3400 +
3401 +/* finish with @level
3402 +
3403 + Unlock nodes and release all allocated resources */
3404 +static void done_carry_level(carry_level * level /* level to finish */ )
3405 +{
3406 + carry_node *node;
3407 + carry_node *tmp_node;
3408 + carry_op *op;
3409 + carry_op *tmp_op;
3410 +
3411 + assert("nikita-1076", level != NULL);
3412 +
3413 + unlock_carry_level(level, 0);
3414 + for_all_nodes(level, node, tmp_node) {
3415 + assert("nikita-2113", list_empty_careful(&node->lock_handle.locks_link));
3416 + assert("nikita-2114", list_empty_careful(&node->lock_handle.owners_link));
3417 + reiser4_pool_free(&level->pool->node_pool, &node->header);
3418 + }
3419 + for_all_ops(level, op, tmp_op)
3420 + reiser4_pool_free(&level->pool->op_pool, &op->header);
3421 +}
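+
+/* A minimal sketch of the life cycle of a carry level, put together from
+   init_carry_pool(), init_carry_level(), post_carry() and carry() as declared
+   in carry.h. Illustrative only: the pool size argument passed to
+   init_carry_pool() is an assumption, and error paths are abbreviated.
+
+	static int example_carry_cycle(znode *node)
+	{
+		carry_pool *pool;
+		carry_level doing;
+		carry_op *op;
+		int result;
+
+		pool = init_carry_pool(sizeof(*pool));
+		if (IS_ERR(pool))
+			return PTR_ERR(pool);
+		init_carry_level(&doing, pool);
+		op = post_carry(&doing, COP_INSERT, node, 0 /* on @node itself */);
+		if (IS_ERR(op)) {
+			result = PTR_ERR(op);
+		} else {
+			/* fill op->u.insert here, then balance */
+			result = carry(&doing, NULL);
+		}
+		done_carry_pool(pool);
+		return result;
+	}
+*/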
3422 +
3423 +/* helper function to complete locking of carry node
3424 +
3425 +   Finish locking of carry node. There are several ways in which a new carry
3426 +   node can be added into a carry level and locked. The normal way is through
3427 +   lock_carry_node(), but it also happens from find_{left|right}_neighbor(). This
3428 +   function factors out the common final part of all locking scenarios. It
3429 +   assumes that @node->lock_handle is the lock handle for the lock just taken and
3430 +   fills ->real_node from this lock handle.
3431 +
3432 +*/
3433 +int lock_carry_node_tail(carry_node * node /* node to complete locking of */ )
3434 +{
3435 + assert("nikita-1052", node != NULL);
3436 + assert("nikita-1187", carry_real(node) != NULL);
3437 + assert("nikita-1188", !node->unlock);
3438 +
3439 + node->unlock = 1;
3440 + /* Load node content into memory and install node plugin by
3441 + looking at the node header.
3442 +
3443 + Most of the time this call is cheap because the node is
3444 + already in memory.
3445 +
3446 + Corresponding zrelse() is in unlock_carry_node()
3447 + */
3448 + return zload(carry_real(node));
3449 +}
3450 +
3451 +/* lock carry node
3452 +
3453 + "Resolve" node to real znode, lock it and mark as locked.
3454 + This requires recursive locking of znodes.
3455 +
3456 +   When an operation is posted to the parent level, the node it will be applied
3457 +   to is not yet known. For example, when shifting data between two nodes, the
3458 +   delimiting key has to be updated in the parent (or parents) of the nodes
3459 +   involved. But their parents are not yet locked and, moreover, said nodes can
3460 +   be reparented by concurrent balancing.
3461 +
3462 + To work around this, carry operation is applied to special "carry node"
3463 + rather than to the znode itself. Carry node consists of some "base" or
3464 + "reference" znode and flags indicating how to get to the target of carry
3465 + operation (->real_node field of carry_node) from base.
3466 +
3467 +*/
3468 +int lock_carry_node(carry_level * level /* level @node is in */ ,
3469 + carry_node * node /* node to lock */ )
3470 +{
3471 + int result;
3472 + znode *reference_point;
3473 + lock_handle lh;
3474 + lock_handle tmp_lh;
3475 + reiser4_tree *tree;
3476 +
3477 + assert("nikita-887", level != NULL);
3478 + assert("nikita-882", node != NULL);
3479 +
3480 + result = 0;
3481 + reference_point = node->node;
3482 + init_lh(&lh);
3483 + init_lh(&tmp_lh);
3484 + if (node->left_before) {
3485 + /* handling of new nodes, allocated on the previous level:
3486 +
3487 +		   some carry ops were probably posted from the new node, but
3488 +		   this node neither has its parent pointer set, nor is it
3489 +		   connected. This will be done in ->create_hook() for
3490 + internal item.
3491 +
3492 +		   Nonetheless, the parent of the new node has to be locked. To do
3493 +		   this, first go to the "left" in the carry order. This
3494 +		   depends on the decision to always allocate a new node on the
3495 +		   right of an existing one.
3496 +
3497 +		   The loop handles the case when multiple nodes, all orphans, were
3498 +		   inserted.
3499 +
3500 + Strictly speaking, taking tree lock is not necessary here,
3501 + because all nodes scanned by loop in
3502 + find_begetting_brother() are write-locked by this thread,
3503 + and thus, their sibling linkage cannot change.
3504 +
3505 + */
3506 + tree = znode_get_tree(reference_point);
3507 + read_lock_tree(tree);
3508 + reference_point = find_begetting_brother(node, level)->node;
3509 + read_unlock_tree(tree);
3510 + assert("nikita-1186", reference_point != NULL);
3511 + }
3512 + if (node->parent && (result == 0)) {
3513 + result =
3514 + reiser4_get_parent(&tmp_lh, reference_point,
3515 + ZNODE_WRITE_LOCK);
3516 + if (result != 0) {
3517 + ; /* nothing */
3518 + } else if (znode_get_level(tmp_lh.node) == 0) {
3519 + assert("nikita-1347", znode_above_root(tmp_lh.node));
3520 + result = add_new_root(level, node, tmp_lh.node);
3521 + if (result == 0) {
3522 + reference_point = level->new_root;
3523 + move_lh(&lh, &node->lock_handle);
3524 + }
3525 + } else if ((level->new_root != NULL)
3526 + && (level->new_root !=
3527 + znode_parent_nolock(reference_point))) {
3528 +			/* parent of node exists, but this level already
3529 +			   created a different new root, so */
3530 + warning("nikita-1109",
3531 + /* it should be "radicis", but tradition is
3532 + tradition. do banshees read latin? */
3533 + "hodie natus est radici frater");
3534 + result = -EIO;
3535 + } else {
3536 + move_lh(&lh, &tmp_lh);
3537 + reference_point = lh.node;
3538 + }
3539 + }
3540 + if (node->left && (result == 0)) {
3541 + assert("nikita-1183", node->parent);
3542 + assert("nikita-883", reference_point != NULL);
3543 + result =
3544 + reiser4_get_left_neighbor(&tmp_lh, reference_point,
3545 + ZNODE_WRITE_LOCK,
3546 + GN_CAN_USE_UPPER_LEVELS);
3547 + if (result == 0) {
3548 + done_lh(&lh);
3549 + move_lh(&lh, &tmp_lh);
3550 + reference_point = lh.node;
3551 + }
3552 + }
3553 + if (!node->parent && !node->left && !node->left_before) {
3554 + result =
3555 + longterm_lock_znode(&lh, reference_point, ZNODE_WRITE_LOCK,
3556 + ZNODE_LOCK_HIPRI);
3557 + }
3558 + if (result == 0) {
3559 + move_lh(&node->lock_handle, &lh);
3560 + result = lock_carry_node_tail(node);
3561 + }
3562 + done_lh(&tmp_lh);
3563 + done_lh(&lh);
3564 + return result;
3565 +}
3566 +
3567 +/* release a lock on &carry_node.
3568 +
3569 +   Release, if necessary, the lock on @node. This operation is the pair of
3570 +   lock_carry_node() and is idempotent: you can call it more than once on the
3571 +   same node.
3572 +
3573 +*/
3574 +static void
3575 +unlock_carry_node(carry_level * level,
3576 + carry_node * node /* node to be released */ ,
3577 +		  int failure /* non-0 if node is unlocked due
3578 +			       * to some error */ )
3579 +{
3580 + znode *real_node;
3581 +
3582 + assert("nikita-884", node != NULL);
3583 +
3584 + real_node = carry_real(node);
3585 + /* pair to zload() in lock_carry_node_tail() */
3586 + zrelse(real_node);
3587 + if (node->unlock && (real_node != NULL)) {
3588 + assert("nikita-899", real_node == node->lock_handle.node);
3589 + longterm_unlock_znode(&node->lock_handle);
3590 + }
3591 + if (failure) {
3592 + if (node->deallocate && (real_node != NULL)) {
3593 + /* free node in bitmap
3594 +
3595 + Prepare node for removal. Last zput() will finish
3596 + with it.
3597 + */
3598 + ZF_SET(real_node, JNODE_HEARD_BANSHEE);
3599 + }
3600 + if (node->free) {
3601 + assert("nikita-2177",
3602 + list_empty_careful(&node->lock_handle.locks_link));
3603 + assert("nikita-2112",
3604 + list_empty_careful(&node->lock_handle.owners_link));
3605 + reiser4_pool_free(&level->pool->node_pool,
3606 + &node->header);
3607 + }
3608 + }
3609 +}
3610 +
3611 +/* fatal_carry_error() - all-catching error handling function
3612 +
3613 +   It is possible that carry faces an unrecoverable error, like the inability to
3614 +   insert a pointer at the internal level. Our simple solution is just to panic
3615 +   in this situation. More sophisticated things, like an attempt to remount the
3616 +   file-system as read-only, can be implemented without much difficulty.
3617 +
3618 + It is believed, that:
3619 +
3620 +   1. instead of panicking, all current transactions can be aborted, rolling
3621 +   the system back to a consistent state.
3622 +
3623 +Umm, if you simply panic without doing anything more at all, then all current
3624 +transactions are aborted and the system is rolled back to a consistent state,
3625 +by virtue of the design of the transactional mechanism. Well, wait, let's be
3626 +precise. If an internal node is corrupted on disk due to hardware failure,
3627 +then there may be no consistent state that can be rolled back to, so instead
3628 +we should say that it will roll back the transactions, which barring other
3629 +factors means rolling back to a consistent state.
3630 +
3631 +# Nikita: there is a subtle difference between panic and aborting
3632 +# transactions: machine doesn't reboot. Processes aren't killed. Processes
3633 +# not using reiser4 (not that we care about such processes), or using other
3634 +# reiser4 mounts (about those we do care), will simply continue to run. With
3635 +# some luck, even an application using the aborted file system can survive: it
3636 +# will get some error, like EBADF, from each file descriptor on the failed file system,
3637 +# but applications that do care about tolerance will cope with this (squid
3638 +# will).
3639 +
3640 +It would be a nice feature though to support rollback without rebooting
3641 +followed by remount, but this can wait for later versions.
3642 +
3643 + 2. once isolated transactions will be implemented it will be possible to
3644 + roll back offending transaction.
3645 +
3646 +2. adds code complexity of questionable value (it implies that a broken tree should be kept in operation), so we must think
3647 +about it more before deciding whether it should be done. -Hans
3648 +
3649 +*/
3650 +static void fatal_carry_error(carry_level * doing UNUSED_ARG /* carry level
3651 + * where
3652 + * unrecoverable
3653 + * error
3654 + * occurred */ ,
3655 + int ecode /* error code */ )
3656 +{
3657 + assert("nikita-1230", doing != NULL);
3658 + assert("nikita-1231", ecode < 0);
3659 +
3660 + reiser4_panic("nikita-1232", "Carry failed: %i", ecode);
3661 +}
3662 +
3663 +/* add new root to the tree
3664 +
3665 +   This function itself only manages changes in carry structures and delegates
3666 +   all hard work (allocation of znode for new root, changes of parent and
3667 +   sibling pointers) to add_tree_root().
3668 +
3669 + Locking: old tree root is locked by carry at this point. Fake znode is also
3670 + locked.
3671 +
3672 +*/
3673 +static int add_new_root(carry_level * level /* carry level in context of which
3674 + * operation is performed */ ,
3675 + carry_node * node /* carry node for existing root */ ,
3676 + znode * fake /* "fake" znode already locked by
3677 + * us */ )
3678 +{
3679 + int result;
3680 +
3681 + assert("nikita-1104", level != NULL);
3682 + assert("nikita-1105", node != NULL);
3683 +
3684 + assert("nikita-1403", znode_is_write_locked(node->node));
3685 + assert("nikita-1404", znode_is_write_locked(fake));
3686 +
3687 + /* trying to create new root. */
3688 + /* @node is root and it's already locked by us. This
3689 + means that nobody else can be trying to add/remove
3690 + tree root right now.
3691 + */
3692 + if (level->new_root == NULL)
3693 + level->new_root = add_tree_root(node->node, fake);
3694 + if (!IS_ERR(level->new_root)) {
3695 + assert("nikita-1210", znode_is_root(level->new_root));
3696 + node->deallocate = 1;
3697 + result =
3698 + longterm_lock_znode(&node->lock_handle, level->new_root,
3699 + ZNODE_WRITE_LOCK, ZNODE_LOCK_LOPRI);
3700 + if (result == 0)
3701 + zput(level->new_root);
3702 + } else {
3703 + result = PTR_ERR(level->new_root);
3704 + level->new_root = NULL;
3705 + }
3706 + return result;
3707 +}
3708 +
3709 +/* allocate a new znode and add to the todo level the operation
3710 +   that inserts a pointer to it into the parent node
3711 +
3712 + Allocate new znode, add it into carry queue and post into @todo queue
3713 + request to add pointer to new node into its parent.
3714 +
3715 +   This is a carry-related routine that calls new_node() to allocate the
3716 +   new node.
3717 +*/
3718 +carry_node *add_new_znode(znode * brother /* existing left neighbor of new
3719 + * node */ ,
3720 + carry_node * ref /* carry node after which new
3721 + * carry node is to be inserted
3722 + * into queue. This affects
3723 + * locking. */ ,
3724 + carry_level * doing /* carry queue where new node is
3725 + * to be added */ ,
3726 + carry_level * todo /* carry queue where COP_INSERT
3727 + * operation to add pointer to
3728 +			   * new node will be added */ )
3729 +{
3730 + carry_node *fresh;
3731 + znode *new_znode;
3732 + carry_op *add_pointer;
3733 + carry_plugin_info info;
3734 +
3735 + assert("nikita-1048", brother != NULL);
3736 + assert("nikita-1049", todo != NULL);
3737 +
3738 +	/* There are many possible variations here: to what parent the
3739 +	   new node will be attached and where. For simplicity, always
3740 +	   do the following:
3741 +
3742 + (1) new node and @brother will have the same parent.
3743 +
3744 + (2) new node is added on the right of @brother
3745 +
3746 + */
3747 +
3748 + fresh = add_carry_skip(doing, ref ? POOLO_AFTER : POOLO_LAST, ref);
3749 + if (IS_ERR(fresh))
3750 + return fresh;
3751 +
3752 + fresh->deallocate = 1;
3753 + fresh->free = 1;
3754 +
3755 + new_znode = new_node(brother, znode_get_level(brother));
3756 + if (IS_ERR(new_znode))
3757 + /* @fresh will be deallocated automatically by error
3758 + handling code in the caller. */
3759 + return (carry_node *) new_znode;
3760 +
3761 + /* new_znode returned znode with x_count 1. Caller has to decrease
3762 + it. make_space() does. */
3763 +
3764 + ZF_SET(new_znode, JNODE_ORPHAN);
3765 + fresh->node = new_znode;
3766 +
3767 + while (ZF_ISSET(carry_real(ref), JNODE_ORPHAN)) {
3768 + ref = carry_node_prev(ref);
3769 + assert("nikita-1606", !carry_node_end(doing, ref));
3770 + }
3771 +
3772 + info.todo = todo;
3773 + info.doing = doing;
3774 + add_pointer = node_post_carry(&info, COP_INSERT, carry_real(ref), 1);
3775 + if (IS_ERR(add_pointer)) {
3776 + /* no need to deallocate @new_znode here: it will be
3777 + deallocated during carry error handling. */
3778 + return (carry_node *) add_pointer;
3779 + }
3780 +
3781 + add_pointer->u.insert.type = COPT_CHILD;
3782 + add_pointer->u.insert.child = fresh;
3783 + add_pointer->u.insert.brother = brother;
3784 +	/* initially new node spans an empty key range */
3785 + write_lock_dk(znode_get_tree(brother));
3786 + znode_set_ld_key(new_znode,
3787 + znode_set_rd_key(new_znode,
3788 + znode_get_rd_key(brother)));
3789 + write_unlock_dk(znode_get_tree(brother));
3790 + return fresh;
3791 +}
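+
+/* A minimal sketch (illustrative only; example_grow_right is hypothetical) of
+   a balancing step that allocates a fresh right neighbor for an existing node
+   with add_new_znode(); the COP_INSERT posted into @todo adds the parent
+   pointer when carry ascends. Error handling is abbreviated.
+
+	static int example_grow_right(carry_node *ref, carry_level *doing,
+				      carry_level *todo)
+	{
+		carry_node *fresh;
+
+		fresh = add_new_znode(carry_real(ref), ref, doing, todo);
+		if (IS_ERR(fresh))
+			return PTR_ERR(fresh);
+		/* fresh->node is the new empty znode; data can now be
+		   shifted into it before carry() processes @todo. */
+		return 0;
+	}
+*/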
3792 +
3793 +/* DEBUGGING FUNCTIONS.
3794 +
3795 +   Probably we should also leave them enabled even when
3796 +   debugging is turned off, to print dumps at errors.
3797 +*/
3798 +#if REISER4_DEBUG
3799 +static int carry_level_invariant(carry_level * level, carry_queue_state state)
3800 +{
3801 + carry_node *node;
3802 + carry_node *tmp_node;
3803 +
3804 + if (level == NULL)
3805 + return 0;
3806 +
3807 + if (level->track_type != 0 &&
3808 + level->track_type != CARRY_TRACK_NODE &&
3809 + level->track_type != CARRY_TRACK_CHANGE)
3810 + return 0;
3811 +
3812 + /* check that nodes are in ascending order */
3813 + for_all_nodes(level, node, tmp_node) {
3814 + znode *left;
3815 + znode *right;
3816 +
3817 + reiser4_key lkey;
3818 + reiser4_key rkey;
3819 +
3820 + if (node != carry_node_front(level)) {
3821 + if (state == CARRY_TODO) {
3822 + right = node->node;
3823 + left = carry_node_prev(node)->node;
3824 + } else {
3825 + right = carry_real(node);
3826 + left = carry_real(carry_node_prev(node));
3827 + }
3828 + if (right == NULL || left == NULL)
3829 + continue;
3830 + if (node_is_empty(right) || node_is_empty(left))
3831 + continue;
3832 + if (!keyle(leftmost_key_in_node(left, &lkey),
3833 + leftmost_key_in_node(right, &rkey))) {
3834 + warning("", "wrong key order");
3835 + return 0;
3836 + }
3837 + }
3838 + }
3839 + return 1;
3840 +}
3841 +#endif
3842 +
3843 +/* get symbolic name for boolean */
3844 +static const char *tf(int boolean /* truth value */ )
3845 +{
3846 + return boolean ? "t" : "f";
3847 +}
3848 +
3849 +/* symbolic name for carry operation */
3850 +static const char *carry_op_name(carry_opcode op /* carry opcode */ )
3851 +{
3852 + switch (op) {
3853 + case COP_INSERT:
3854 + return "COP_INSERT";
3855 + case COP_DELETE:
3856 + return "COP_DELETE";
3857 + case COP_CUT:
3858 + return "COP_CUT";
3859 + case COP_PASTE:
3860 + return "COP_PASTE";
3861 + case COP_UPDATE:
3862 + return "COP_UPDATE";
3863 + case COP_EXTENT:
3864 + return "COP_EXTENT";
3865 + case COP_INSERT_FLOW:
3866 + return "COP_INSERT_FLOW";
3867 + default:{
3868 + /* not mt safe, but who cares? */
3869 +			static char buf[32];
3870 +
3871 +			snprintf(buf, sizeof(buf), "unknown op: %x", op);
3872 + return buf;
3873 + }
3874 + }
3875 +}
3876 +
3877 +/* dump information about carry node */
3878 +static void print_carry(const char *prefix /* prefix to print */ ,
3879 + carry_node * node /* node to print */ )
3880 +{
3881 + if (node == NULL) {
3882 + printk("%s: null\n", prefix);
3883 + return;
3884 + }
3885 + printk
3886 + ("%s: %p parent: %s, left: %s, unlock: %s, free: %s, dealloc: %s\n",
3887 + prefix, node, tf(node->parent), tf(node->left), tf(node->unlock),
3888 + tf(node->free), tf(node->deallocate));
3889 +}
3890 +
3891 +/* dump information about carry operation */
3892 +static void print_op(const char *prefix /* prefix to print */ ,
3893 + carry_op * op /* operation to print */ )
3894 +{
3895 + if (op == NULL) {
3896 + printk("%s: null\n", prefix);
3897 + return;
3898 + }
3899 + printk("%s: %p carry_opcode: %s\n", prefix, op, carry_op_name(op->op));
3900 + print_carry("\tnode", op->node);
3901 + switch (op->op) {
3902 + case COP_INSERT:
3903 + case COP_PASTE:
3904 + print_coord("\tcoord",
3905 + op->u.insert.d ? op->u.insert.d->coord : NULL, 0);
3906 + print_key("\tkey", op->u.insert.d ? op->u.insert.d->key : NULL);
3907 + print_carry("\tchild", op->u.insert.child);
3908 + break;
3909 + case COP_DELETE:
3910 + print_carry("\tchild", op->u.delete.child);
3911 + break;
3912 + case COP_CUT:
3913 + if (op->u.cut_or_kill.is_cut) {
3914 + print_coord("\tfrom",
3915 + op->u.cut_or_kill.u.kill->params.from, 0);
3916 + print_coord("\tto", op->u.cut_or_kill.u.kill->params.to,
3917 + 0);
3918 + } else {
3919 + print_coord("\tfrom",
3920 + op->u.cut_or_kill.u.cut->params.from, 0);
3921 + print_coord("\tto", op->u.cut_or_kill.u.cut->params.to,
3922 + 0);
3923 + }
3924 + break;
3925 + case COP_UPDATE:
3926 + print_carry("\tleft", op->u.update.left);
3927 + break;
3928 + default:
3929 + /* do nothing */
3930 + break;
3931 + }
3932 +}
3933 +
3934 +/* dump information about all nodes and operations in a @level */
3935 +static void print_level(const char *prefix /* prefix to print */ ,
3936 + carry_level * level /* level to print */ )
3937 +{
3938 + carry_node *node;
3939 + carry_node *tmp_node;
3940 + carry_op *op;
3941 + carry_op *tmp_op;
3942 +
3943 + if (level == NULL) {
3944 + printk("%s: null\n", prefix);
3945 + return;
3946 + }
3947 + printk("%s: %p, restartable: %s\n",
3948 + prefix, level, tf(level->restartable));
3949 +
3950 + for_all_nodes(level, node, tmp_node)
3951 + print_carry("\tcarry node", node);
3952 + for_all_ops(level, op, tmp_op)
3953 + print_op("\tcarry op", op);
3954 +}
3955 +
3956 +/* Make Linus happy.
3957 + Local variables:
3958 + c-indentation-style: "K&R"
3959 + mode-name: "LC"
3960 + c-basic-offset: 8
3961 + tab-width: 8
3962 + fill-column: 120
3963 + scroll-step: 1
3964 + End:
3965 +*/
3966 Index: linux-2.6.16/fs/reiser4/carry.h
3967 ===================================================================
3968 --- /dev/null
3969 +++ linux-2.6.16/fs/reiser4/carry.h
3970 @@ -0,0 +1,442 @@
3971 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
3972 +
3973 +/* Functions and data types to "carry" tree modification(s) upward.
3974 + See fs/reiser4/carry.c for details. */
3975 +
3976 +#if !defined( __FS_REISER4_CARRY_H__ )
3977 +#define __FS_REISER4_CARRY_H__
3978 +
3979 +#include "forward.h"
3980 +#include "debug.h"
3981 +#include "pool.h"
3982 +#include "znode.h"
3983 +
3984 +#include <linux/types.h>
3985 +
3986 +/* &carry_node - "location" of carry node.
3987 +
3988 + "location" of node that is involved or going to be involved into
3989 + carry process. Node where operation will be carried to on the
3990 + parent level cannot be recorded explicitly. Operation will be carried
3991 + usually to the parent of some node (where changes are performed at
3992 + the current level) or, to the left neighbor of its parent. But while
3993 + modifications are performed at the current level, parent may
3994 +   change. So, we have to allow some indirection (or, positively,
3995 + flexibility) in locating carry nodes.
3996 +
3997 +*/
3998 +typedef struct carry_node {
3999 + /* pool linkage */
4000 + reiser4_pool_header header;
4001 +
4002 + /* base node from which real_node is calculated. See
4003 + fs/reiser4/carry.c:lock_carry_node(). */
4004 + znode *node;
4005 +
4006 + /* how to get ->real_node */
4007 + /* to get ->real_node obtain parent of ->node */
4008 + __u32 parent:1;
4009 + /* to get ->real_node obtain left neighbor of parent of
4010 + ->node */
4011 + __u32 left:1;
4012 + __u32 left_before:1;
4013 +
4014 + /* locking */
4015 +
4016 + /* this node was locked by carry process and should be
4017 + unlocked when carry leaves a level */
4018 + __u32 unlock:1;
4019 +
4020 + /* disk block for this node was allocated by carry process and
4021 + should be deallocated when carry leaves a level */
4022 + __u32 deallocate:1;
4023 + /* this carry node was allocated by carry process and should be
4024 + freed when carry leaves a level */
4025 + __u32 free:1;
4026 +
4027 + /* type of lock we want to take on this node */
4028 + lock_handle lock_handle;
4029 +} carry_node;
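+
+/* The location bits above combine as follows (see
+   fs/reiser4/carry.c:lock_carry_node() for the authoritative logic):
+
+	parent == 0, left == 0: real node is ->node itself;
+	parent == 1, left == 0: real node is the parent of ->node;
+	parent == 1, left == 1: real node is the left neighbor of the
+				parent of ->node.
+*/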
4030 +
4031 +/* &carry_opcode - elementary operations that can be carried upward
4032 +
4033 + Operations that carry() can handle. This list is supposed to be
4034 + expanded.
4035 +
4036 + Each carry operation (cop) is handled by appropriate function defined
4037 + in fs/reiser4/carry.c. For example COP_INSERT is handled by
4038 + fs/reiser4/carry.c:carry_insert() etc. These functions in turn
4039 + call plugins of nodes affected by operation to modify nodes' content
4040 + and to gather operations to be performed on the next level.
4041 +
4042 +*/
4043 +typedef enum {
4044 + /* insert new item into node. */
4045 + COP_INSERT,
4046 + /* delete pointer from parent node */
4047 + COP_DELETE,
4048 + /* remove part of or whole node. */
4049 + COP_CUT,
4050 + /* increase size of item. */
4051 + COP_PASTE,
4052 + /* insert extent (that is sequence of unformatted nodes). */
4053 + COP_EXTENT,
4054 + /* update delimiting key in least common ancestor of two
4055 + nodes. This is performed when items are moved between two
4056 + nodes.
4057 + */
4058 + COP_UPDATE,
4059 + /* insert flow */
4060 + COP_INSERT_FLOW,
4061 + COP_LAST_OP,
4062 +} carry_opcode;
4063 +
4064 +#define CARRY_FLOW_NEW_NODES_LIMIT 20
4065 +
4066 +/* mode (or subtype) of COP_{INSERT|PASTE} operation. Specifies how target
4067 + item is determined. */
4068 +typedef enum {
4069 + /* target item is one containing pointer to the ->child node */
4070 + COPT_CHILD,
4071 + /* target item is given explicitly by @coord */
4072 + COPT_ITEM_DATA,
4073 + /* target item is given by key */
4074 + COPT_KEY,
4075 + /* see insert_paste_common() for more comments on this. */
4076 + COPT_PASTE_RESTARTED,
4077 +} cop_insert_pos_type;
4078 +
4079 +/* flags to cut and delete */
4080 +typedef enum {
4081 +	/* don't kill node even if it became completely empty as a result of
4082 +	 * cut. This is needed for eottl handling. See carry_extent() for
4083 + * details. */
4084 + DELETE_RETAIN_EMPTY = (1 << 0)
4085 +} cop_delete_flag;
4086 +
4087 +/*
4088 + * carry() implements "lock handle tracking" feature.
4089 + *
4090 + * Callers supply carry with the node where the initial operation is to be
4091 + * performed and a lock handle on this node. Trying to optimize node
4092 + * utilization, carry may actually move the insertion point to a different
4093 + * node. Callers expect that the lock handle will be transferred to the new node also.
4094 + *
4095 + */
4096 +typedef enum {
4097 + /* transfer lock handle along with insertion point */
4098 + CARRY_TRACK_CHANGE = 1,
4099 + /* acquire new lock handle to the node where insertion point is. This
4100 + * is used when carry() client doesn't initially possess lock handle
4101 + * on the insertion point node, for example, by extent insertion
4102 + * code. See carry_extent(). */
4103 + CARRY_TRACK_NODE = 2
4104 +} carry_track_type;
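+
+/* A minimal sketch of lock handle tracking from the caller's side, using the
+   ->track_type and ->tracked fields of carry_level declared below.
+   Illustrative only: the function name is hypothetical, the pool size passed
+   to init_carry_pool() is an assumption, and op posting is elided.
+
+	static int example_tracked_insert(lock_handle *lh)
+	{
+		carry_pool *pool;
+		carry_level lowest_level;
+		int result;
+
+		pool = init_carry_pool(sizeof(*pool));
+		if (IS_ERR(pool))
+			return PTR_ERR(pool);
+		init_carry_level(&lowest_level, pool);
+		lowest_level.track_type = CARRY_TRACK_CHANGE;
+		lowest_level.tracked = lh;
+		//	... post operations here, then balance; if the
+		//	insertion point moves, @lh is switched to the node
+		//	that now holds it ...
+		result = carry(&lowest_level, NULL);
+		done_carry_pool(pool);
+		return result;
+	}
+*/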
4105 +
4106 +/* data supplied to COP_{INSERT|PASTE} by callers */
4107 +typedef struct carry_insert_data {
4108 + /* position where new item is to be inserted */
4109 + coord_t *coord;
4110 + /* new item description */
4111 + reiser4_item_data *data;
4112 + /* key of new item */
4113 + const reiser4_key *key;
4114 +} carry_insert_data;
4115 +
4116 +/* cut and kill are similar, so carry_cut_data and carry_kill_data share the below structure of parameters */
4117 +struct cut_kill_params {
4118 + /* coord where cut starts (inclusive) */
4119 + coord_t *from;
4120 + /* coord where cut stops (inclusive, this item/unit will also be
4121 + * cut) */
4122 + coord_t *to;
4123 + /* starting key. This is necessary when item and unit pos don't
4124 +	 * uniquely identify what portion of tree to remove. For example, this
4125 + * indicates what portion of extent unit will be affected. */
4126 + const reiser4_key *from_key;
4127 + /* exclusive stop key */
4128 + const reiser4_key *to_key;
4129 + /* if this is not NULL, smallest actually removed key is stored
4130 + * here. */
4131 + reiser4_key *smallest_removed;
4132 + /* kill_node_content() is called for file truncate */
4133 + int truncate;
4134 +};
4135 +
4136 +struct carry_cut_data {
4137 + struct cut_kill_params params;
4138 +};
4139 +
4140 +struct carry_kill_data {
4141 + struct cut_kill_params params;
4142 + /* parameter to be passed to the ->kill_hook() method of item
4143 + * plugin */
4144 + /*void *iplug_params; *//* FIXME: unused currently */
4145 + /* if not NULL---inode whose items are being removed. This is needed
4146 + * for ->kill_hook() of extent item to update VM structures when
4147 + * removing pages. */
4148 + struct inode *inode;
4149 + /* sibling list maintenance is complicated by existence of eottl. When
4150 + * eottl whose left and right neighbors are formatted leaves is
4151 + * removed, one has to connect said leaves in the sibling list. This
4152 + * cannot be done when extent removal is just started as locking rules
4153 + * require sibling list update to happen atomically with removal of
4154 + * extent item. Therefore: 1. pointers to left and right neighbors
4155 + * have to be passed down to the ->kill_hook() of extent item, and
4156 + * 2. said neighbors have to be locked. */
4157 + lock_handle *left;
4158 + lock_handle *right;
4159 + /* flags modifying behavior of kill. Currently, it may have DELETE_RETAIN_EMPTY set. */
4160 + unsigned flags;
4161 + char *buf;
4162 +};
4163 +
4164 +/* &carry_tree_op - operation to "carry" upward.
4165 +
4166 + Description of an operation we want to "carry" to the upper level of
4167 +   a tree: e.g., when we insert something and there is not enough space,
4168 +   we allocate a new node and "carry" the operation of inserting a
4169 +   pointer to the new node to the upper level; on removal of an empty node,
4170 +   we carry up the operation of removing the appropriate entry from its parent.
4171 +
4172 +   There are two types of carry ops: when adding or deleting a node, the
4173 +   node at the parent level where the appropriate modification has to be
4174 +   performed is known in advance. When shifting items between nodes
4175 +   (split, merge), the delimiting key should be changed in the least common
4176 +   parent of the nodes involved, which is not known in advance.
4177 +
4178 +   For operations of the first type we store in &carry_op a pointer to
4179 +   the &carry_node at the parent level. For operations of the second
4180 +   type we store &carry_nodes for the parents of the left and right nodes
4181 +   modified and keep track of them upward until they coincide.
4182 +
4183 +*/
4184 +typedef struct carry_op {
4185 + /* pool linkage */
4186 + reiser4_pool_header header;
4187 + carry_opcode op;
4188 + /* node on which operation is to be performed:
4189 +
4190 + for insert, paste: node where new item is to be inserted
4191 +
4192 + for delete: node where pointer is to be deleted
4193 +
4194 + for cut: node to cut from
4195 +
4196 + for update: node where delimiting key is to be modified
4197 +
4198 + for modify: parent of modified node
4199 +
4200 + */
4201 + carry_node *node;
4202 + union {
4203 + struct {
4204 + /* (sub-)type of insertion/paste. Taken from
4205 + cop_insert_pos_type. */
4206 + __u8 type;
4207 + /* various operation flags. Taken from
4208 + cop_insert_flag. */
4209 + __u8 flags;
4210 + carry_insert_data *d;
4211 + carry_node *child;
4212 + znode *brother;
4213 + } insert, paste, extent;
4214 +
4215 + struct {
4216 + int is_cut;
4217 + union {
4218 + carry_kill_data *kill;
4219 + carry_cut_data *cut;
4220 + } u;
4221 + } cut_or_kill;
4222 +
4223 + struct {
4224 + carry_node *left;
4225 + } update;
4226 + struct {
4227 + /* changed child */
4228 + carry_node *child;
4229 + /* bitmask of changes. See &cop_modify_flag */
4230 + __u32 flag;
4231 + } modify;
4232 + struct {
4233 + /* flags to deletion operation. Are taken from
4234 + cop_delete_flag */
4235 + __u32 flags;
4236 + /* child to delete from parent. If this is
4237 + NULL, delete op->node. */
4238 + carry_node *child;
4239 + } delete;
4240 + struct {
4241 + /* various operation flags. Taken from
4242 + cop_insert_flag. */
4243 + __u32 flags;
4244 + flow_t *flow;
4245 + coord_t *insert_point;
4246 + reiser4_item_data *data;
4247 + /* flow insertion is limited by number of new blocks
4248 + added in that operation which do not get any data
4249 + but part of flow. This limit is set by macro
4250 + CARRY_FLOW_NEW_NODES_LIMIT. This field stores number
4251 + of nodes added already during one carry_flow */
4252 + int new_nodes;
4253 + } insert_flow;
4254 + } u;
4255 +} carry_op;
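+
+/* A minimal sketch (illustrative only; example_post_cut is hypothetical) of
+   posting a cut operation and filling the matching branch of the union
+   above. post_carry() is declared below in this header; error handling is
+   left to the caller.
+
+	static carry_op *example_post_cut(carry_level *todo, znode *node,
+					  carry_cut_data *cut)
+	{
+		carry_op *op;
+
+		op = post_carry(todo, COP_CUT, node, 0);
+		if (!IS_ERR(op)) {
+			op->u.cut_or_kill.is_cut = 1;
+			op->u.cut_or_kill.u.cut = cut;
+		}
+		return op;
+	}
+*/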
4256 +
4257 +/* &carry_op_pool - preallocated pool of carry operations, and nodes */
4258 +typedef struct carry_pool {
4259 + carry_op op[CARRIES_POOL_SIZE];
4260 + reiser4_pool op_pool;
4261 + carry_node node[NODES_LOCKED_POOL_SIZE];
4262 + reiser4_pool node_pool;
4263 +} carry_pool;
4264 +
4265 +/* &carry_tree_level - carry process on given level
4266 +
4267 + Description of balancing process on the given level.
4268 +
4269 +   No need for locking here, as carry_tree_level is essentially a
4270 +   per-thread thing (for now).
4271 +
4272 +*/
4273 +struct carry_level {
4274 + /* this level may be restarted */
4275 + __u32 restartable:1;
4276 + /* list of carry nodes on this level, ordered by key order */
4277 + struct list_head nodes;
4278 + struct list_head ops;
4279 + /* pool where new objects are allocated from */
4280 + carry_pool *pool;
4281 + int ops_num;
4282 + int nodes_num;
4283 + /* new root created on this level, if any */
4284 + znode *new_root;
4285 + /* This is set by caller (insert_by_key(), resize_item(), etc.) when
4286 + they want ->tracked to automagically wander to the node where
4287 + insertion point moved after insert or paste.
4288 + */
4289 + carry_track_type track_type;
4290 + /* lock handle supplied by user that we are tracking. See
4291 + above. */
4292 + lock_handle *tracked;
4293 +};
4294 +
4295 +/* information carry passes to plugin methods that may add new operations to
4296 + the @todo queue */
4297 +struct carry_plugin_info {
4298 + carry_level *doing;
4299 + carry_level *todo;
4300 +};
4301 +
4302 +int carry(carry_level * doing, carry_level * done);
4303 +
4304 +carry_node *add_carry(carry_level * level, pool_ordering order,
4305 + carry_node * reference);
4306 +carry_node *add_carry_skip(carry_level * level, pool_ordering order,
4307 + carry_node * reference);
4308 +
4309 +extern carry_node *insert_carry_node(carry_level * doing,
4310 + carry_level * todo, const znode * node);
4311 +
4312 +extern carry_pool *init_carry_pool(int);
4313 +extern void done_carry_pool(carry_pool * pool);
4314 +
4315 +extern void init_carry_level(carry_level * level, carry_pool * pool);
4316 +
4317 +extern carry_op *post_carry(carry_level * level, carry_opcode op, znode * node,
4318 + int apply_to_parent);
4319 +extern carry_op *node_post_carry(carry_plugin_info * info, carry_opcode op,
4320 + znode * node, int apply_to_parent_p);
4321 +
4322 +carry_node *add_new_znode(znode * brother, carry_node * reference,
4323 + carry_level * doing, carry_level * todo);
4324 +
4325 +carry_node *find_carry_node(carry_level * level, const znode * node);
4326 +
4327 +extern znode *carry_real(const carry_node * node);
4328 +
4329 +/* helper macros to iterate over carry queues */
4330 +
4331 +#define carry_node_next( node ) \
4332 + list_entry((node)->header.level_linkage.next, carry_node, \
4333 + header.level_linkage)
4334 +
4335 +#define carry_node_prev( node ) \
4336 + list_entry((node)->header.level_linkage.prev, carry_node, \
4337 + header.level_linkage)
4338 +
4339 +#define carry_node_front( level ) \
4340 + list_entry((level)->nodes.next, carry_node, header.level_linkage)
4341 +
4342 +#define carry_node_back( level ) \
4343 + list_entry((level)->nodes.prev, carry_node, header.level_linkage)
4344 +
4345 +#define carry_node_end( level, node ) \
4346 + (&(level)->nodes == &(node)->header.level_linkage)
4347 +
4348 +/* macro to iterate over all operations in a @level */
4349 +#define for_all_ops( level /* carry level (of type carry_level *) */, \
4350 + op /* pointer to carry operation, modified by loop (of \
4351 + * type carry_op *) */, \
4352 + tmp /* pointer to carry operation (of type carry_op *), \
4353 + * used to make iterator stable in the face of \
4354 + * deletions from the level */ ) \
4355 +for (op = list_entry(level->ops.next, carry_op, header.level_linkage), \
4356 + tmp = list_entry(op->header.level_linkage.next, carry_op, header.level_linkage); \
4357 + &op->header.level_linkage != &level->ops; \
4358 + op = tmp, \
4359 + tmp = list_entry(op->header.level_linkage.next, carry_op, header.level_linkage))
4360 +
4361 +#if 0
4362 +for( op = ( carry_op * ) pool_level_list_front( &level -> ops ), \
4363 + tmp = ( carry_op * ) pool_level_list_next( &op -> header ) ; \
4364 + ! pool_level_list_end( &level -> ops, &op -> header ) ; \
4365 + op = tmp, tmp = ( carry_op * ) pool_level_list_next( &op -> header ) )
4366 +#endif
4367 +
4368 +/* macro to iterate over all nodes in a @level */
4369 +#define for_all_nodes( level /* carry level (of type carry_level *) */, \
4370 + node /* pointer to carry node, modified by loop (of \
4371 + * type carry_node *) */, \
4372 + tmp /* pointer to carry node (of type carry_node *), \
4373 +		       * used to make iterator stable in the face of \
4374 + * deletions from the level */ ) \
4375 +for (node = list_entry(level->nodes.next, carry_node, header.level_linkage), \
4376 + tmp = list_entry(node->header.level_linkage.next, carry_node, header.level_linkage); \
4377 + &node->header.level_linkage != &level->nodes; \
4378 + node = tmp, \
4379 + tmp = list_entry(node->header.level_linkage.next, carry_node, header.level_linkage))
4380 +
4381 +#if 0
4382 +for( node = carry_node_front( level ), \
4383 + tmp = carry_node_next( node ) ; ! carry_node_end( level, node ) ; \
4384 + node = tmp, tmp = carry_node_next( node ) )
4385 +#endif
4386 +
4387 +/* macro to iterate over all nodes in a @level in reverse order
4388 +
4389 +   This is used because nodes are unlocked in reverse order of locking */
4390 +#define for_all_nodes_back( level /* carry level (of type carry_level *) */, \
4391 + node /* pointer to carry node, modified by loop \
4392 + * (of type carry_node *) */, \
4393 + tmp /* pointer to carry node (of type carry_node \
4394 + * *), used to make iterator stable in the \
4395 + * face of deletions from the level */ ) \
4396 +for( node = carry_node_back( level ), \
4397 + tmp = carry_node_prev( node ) ; ! carry_node_end( level, node ) ; \
4398 + node = tmp, tmp = carry_node_prev( node ) )
4399 +
4400 +/* __FS_REISER4_CARRY_H__ */
4401 +#endif
4402 +
4403 +/* Make Linus happy.
4404 + Local variables:
4405 + c-indentation-style: "K&R"
4406 + mode-name: "LC"
4407 + c-basic-offset: 8
4408 + tab-width: 8
4409 + fill-column: 120
4410 + scroll-step: 1
4411 + End:
4412 +*/
4413 Index: linux-2.6.16/fs/reiser4/carry_ops.c
4414 ===================================================================
4415 --- /dev/null
4416 +++ linux-2.6.16/fs/reiser4/carry_ops.c
4417 @@ -0,0 +1,2103 @@
4418 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
4419 +
4420 +/* implementation of carry operations */
4421 +
4422 +#include "forward.h"
4423 +#include "debug.h"
4424 +#include "key.h"
4425 +#include "coord.h"
4426 +#include "plugin/item/item.h"
4427 +#include "plugin/node/node.h"
4428 +#include "jnode.h"
4429 +#include "znode.h"
4430 +#include "block_alloc.h"
4431 +#include "tree_walk.h"
4432 +#include "pool.h"
4433 +#include "tree_mod.h"
4434 +#include "carry.h"
4435 +#include "carry_ops.h"
4436 +#include "tree.h"
4437 +#include "super.h"
4438 +#include "reiser4.h"
4439 +
4440 +#include <linux/types.h>
4441 +#include <linux/err.h>
4442 +
4443 +static int carry_shift_data(sideof side, coord_t * insert_coord, znode * node,
4444 + carry_level * doing, carry_level * todo,
4445 + unsigned int including_insert_coord_p);
4446 +
4447 +extern int lock_carry_node(carry_level * level, carry_node * node);
4448 +extern int lock_carry_node_tail(carry_node * node);
4449 +
4450 +/* find left neighbor of a carry node
4451 +
4452 + Look for left neighbor of @node and add it to the @doing queue. See
4453 + comments in the body.
4454 +
4455 +*/
4456 +static carry_node *find_left_neighbor(carry_op * op /* node to find left
4457 + * neighbor of */ ,
4458 + carry_level * doing /* level to scan */ )
4459 +{
4460 + int result;
4461 + carry_node *node;
4462 + carry_node *left;
4463 + int flags;
4464 + reiser4_tree *tree;
4465 +
4466 + node = op->node;
4467 +
4468 + tree = current_tree;
4469 + read_lock_tree(tree);
4470 + /* first, check whether left neighbor is already in a @doing queue */
4471 + if (carry_real(node)->left != NULL) {
4472 + /* NOTE: there is locking subtlety here. Look into
4473 + * find_right_neighbor() for more info */
4474 + if (find_carry_node(doing, carry_real(node)->left) != NULL) {
4475 + read_unlock_tree(tree);
4476 + left = node;
4477 + do {
4478 + left = list_entry(left->header.level_linkage.prev,
4479 + carry_node, header.level_linkage);
4480 + assert("nikita-3408", !carry_node_end(doing,
4481 + left));
4482 + } while (carry_real(left) == carry_real(node));
4483 + return left;
4484 + }
4485 + }
4486 + read_unlock_tree(tree);
4487 +
4488 + left = add_carry_skip(doing, POOLO_BEFORE, node);
4489 + if (IS_ERR(left))
4490 + return left;
4491 +
4492 + left->node = node->node;
4493 + left->free = 1;
4494 +
4495 + flags = GN_TRY_LOCK;
4496 +	if (!(op->u.insert.flags & COPI_LOAD_LEFT))
4497 + flags |= GN_NO_ALLOC;
4498 +
4499 + /* then, feeling lucky, peek left neighbor in the cache. */
4500 + result = reiser4_get_left_neighbor(&left->lock_handle, carry_real(node),
4501 + ZNODE_WRITE_LOCK, flags);
4502 + if (result == 0) {
4503 + /* ok, node found and locked. */
4504 + result = lock_carry_node_tail(left);
4505 + if (result != 0)
4506 + left = ERR_PTR(result);
4507 + } else if (result == -E_NO_NEIGHBOR || result == -ENOENT) {
4508 + /* node is leftmost node in a tree, or neighbor wasn't in
4509 + cache, or there is an extent on the left. */
4510 + reiser4_pool_free(&doing->pool->node_pool, &left->header);
4511 + left = NULL;
4512 + } else if (doing->restartable) {
4513 + /* if left neighbor is locked, and level is restartable, add
4514 + new node to @doing and restart. */
4515 + assert("nikita-913", node->parent != 0);
4516 + assert("nikita-914", node->node != NULL);
4517 + left->left = 1;
4518 + left->free = 0;
4519 + left = ERR_PTR(-E_REPEAT);
4520 + } else {
4521 + /* left neighbor is locked, level cannot be restarted. Just
4522 + ignore left neighbor. */
4523 + reiser4_pool_free(&doing->pool->node_pool, &left->header);
4524 + left = NULL;
4525 + }
4526 + return left;
4527 +}
4528 +
4529 +/* find right neighbor of a carry node
4530 +
4531 + Look for right neighbor of @node and add it to the @doing queue. See
4532 + comments in the body.
4533 +
4534 +*/
4535 +static carry_node *find_right_neighbor(carry_op * op /* node to find right
4536 + * neighbor of */ ,
4537 + carry_level * doing /* level to scan */ )
4538 +{
4539 + int result;
4540 + carry_node *node;
4541 + carry_node *right;
4542 + lock_handle lh;
4543 + int flags;
4544 + reiser4_tree *tree;
4545 +
4546 + init_lh(&lh);
4547 +
4548 + node = op->node;
4549 +
4550 + tree = current_tree;
4551 + read_lock_tree(tree);
4552 + /* first, check whether right neighbor is already in a @doing queue */
4553 + if (carry_real(node)->right != NULL) {
4554 + /*
4555 + * Tree lock is taken here anyway, because, even if _outcome_
4556 +		 * of (find_carry_node() != NULL) doesn't depend on
4557 +		 * concurrent updates to ->right, find_carry_node() cannot
4558 +		 * work with a NULL second argument. Hence, the following
4559 +		 * comment is of historical importance only.
4560 + *
4561 + * Subtle:
4562 + *
4563 + * Q: why don't we need tree lock here, looking for the right
4564 + * neighbor?
4565 + *
4566 + * A: even if value of node->real_node->right were changed
4567 + * during find_carry_node() execution, outcome of execution
4568 + * wouldn't change, because (in short) other thread cannot add
4569 + * elements to the @doing, and if node->real_node->right
4570 + * already was in @doing, value of node->real_node->right
4571 + * couldn't change, because node cannot be inserted between
4572 + * locked neighbors.
4573 + */
4574 + if (find_carry_node(doing, carry_real(node)->right) != NULL) {
4575 + read_unlock_tree(tree);
4576 + /*
4577 + * What we are doing here (this is also applicable to
4578 + * the find_left_neighbor()).
4579 + *
4580 + * tree_walk.c code requires that insertion of a
4581 + * pointer to a child, modification of parent pointer
4582 + * in the child, and insertion of the child into
4583 + * sibling list are atomic (see
4584 + * plugin/item/internal.c:create_hook_internal()).
4585 + *
4586 + * carry allocates new node long before pointer to it
4587 + * is inserted into parent and, actually, long before
4588 + * parent is even known. Such allocated-but-orphaned
4589 + * nodes are only trackable through carry level lists.
4590 + *
4591 + * Situation that is handled here is following: @node
4592 + * has valid ->right pointer, but there is
4593 + * allocated-but-orphaned node in the carry queue that
4594 + * is logically between @node and @node->right. Here
4595 + * we are searching for it. Critical point is that
4596 + * this is only possible if @node->right is also in
4597 + * the carry queue (this is checked above), because
4598 + * this is the only way new orphaned node could be
4599 + * inserted between them (before inserting new node,
4600 + * make_space() first tries to shift to the right, so,
4601 + * right neighbor will be locked and queued).
4602 + *
4603 + */
4604 + right = node;
4605 + do {
4606 + right = list_entry(right->header.level_linkage.next,
4607 + carry_node, header.level_linkage);
4608 + assert("nikita-3408", !carry_node_end(doing,
4609 + right));
4610 + } while (carry_real(right) == carry_real(node));
4611 + return right;
4612 + }
4613 + }
4614 + read_unlock_tree(tree);
4615 +
4616 + flags = GN_CAN_USE_UPPER_LEVELS;
4617 +	if (!(op->u.insert.flags & COPI_LOAD_RIGHT))
4618 + flags = GN_NO_ALLOC;
4619 +
4620 + /* then, try to lock right neighbor */
4621 + init_lh(&lh);
4622 + result = reiser4_get_right_neighbor(&lh, carry_real(node),
4623 + ZNODE_WRITE_LOCK, flags);
4624 + if (result == 0) {
4625 + /* ok, node found and locked. */
4626 + right = add_carry_skip(doing, POOLO_AFTER, node);
4627 + if (!IS_ERR(right)) {
4628 + right->node = lh.node;
4629 + move_lh(&right->lock_handle, &lh);
4630 + right->free = 1;
4631 + result = lock_carry_node_tail(right);
4632 + if (result != 0)
4633 + right = ERR_PTR(result);
4634 + }
4635 + } else if ((result == -E_NO_NEIGHBOR) || (result == -ENOENT)) {
4636 + /* node is rightmost node in a tree, or neighbor wasn't in
4637 + cache, or there is an extent on the right. */
4638 + right = NULL;
4639 + } else
4640 + right = ERR_PTR(result);
4641 + done_lh(&lh);
4642 + return right;
4643 +}
4644 +
4645 +/* how much free space in a @node is needed for @op
4646 +
4647 + How much space in @node is required for completion of @op, where @op is
4648 + insert or paste operation.
4649 +*/
4650 +static unsigned int space_needed_for_op(znode * node /* znode data are
4651 + * inserted or
4652 + * pasted in */ ,
4653 + carry_op * op /* carry
4654 + operation */ )
4655 +{
4656 + assert("nikita-919", op != NULL);
4657 +
4658 + switch (op->op) {
4659 + default:
4660 + impossible("nikita-1701", "Wrong opcode");
4661 + case COP_INSERT:
4662 + return space_needed(node, NULL, op->u.insert.d->data, 1);
4663 + case COP_PASTE:
4664 + return space_needed(node, op->u.insert.d->coord,
4665 + op->u.insert.d->data, 0);
4666 + }
4667 +}
4668 +
4669 +/* how much space in @node is required to insert or paste @data at
4670 + @coord. */
4671 +unsigned int space_needed(const znode * node /* node data are inserted or
4672 + * pasted in */ ,
4673 + const coord_t * coord /* coord where data are
4674 + * inserted or pasted
4675 + * at */ ,
4676 + const reiser4_item_data * data /* data to insert or
4677 + * paste */ ,
4678 + int insertion /* non-0 is inserting, 0---paste */ )
4679 +{
4680 + int result;
4681 + item_plugin *iplug;
4682 +
4683 + assert("nikita-917", node != NULL);
4684 + assert("nikita-918", node_plugin_by_node(node) != NULL);
4685 + assert("vs-230", !insertion || (coord == NULL));
4686 +
4687 + result = 0;
4688 + iplug = data->iplug;
4689 + if (iplug->b.estimate != NULL) {
4690 + /* ask item plugin how much space is needed to insert this
4691 + item */
4692 + result += iplug->b.estimate(insertion ? NULL : coord, data);
4693 + } else {
4694 + /* reasonable default */
4695 + result += data->length;
4696 + }
4697 + if (insertion) {
4698 + node_plugin *nplug;
4699 +
4700 + nplug = node->nplug;
4701 + /* and add node overhead */
4702 + if (nplug->item_overhead != NULL) {
4703 + result += nplug->item_overhead(node, NULL);
4704 + }
4705 + }
4706 + return result;
4707 +}
4708 +
4709 +/* find &coord in parent where pointer to new child is to be stored. */
4710 +static int find_new_child_coord(carry_op * op /* COP_INSERT carry operation to
4711 + * insert pointer to new
4712 + * child */ )
4713 +{
4714 + int result;
4715 + znode *node;
4716 + znode *child;
4717 +
4718 + assert("nikita-941", op != NULL);
4719 + assert("nikita-942", op->op == COP_INSERT);
4720 +
4721 + node = carry_real(op->node);
4722 + assert("nikita-943", node != NULL);
4723 + assert("nikita-944", node_plugin_by_node(node) != NULL);
4724 +
4725 + child = carry_real(op->u.insert.child);
4726 + result =
4727 + find_new_child_ptr(node, child, op->u.insert.brother,
4728 + op->u.insert.d->coord);
4729 +
4730 + build_child_ptr_data(child, op->u.insert.d->data);
4731 + return result;
4732 +}
4733 +
4734 +/* additional amount of free space in @node required to complete @op */
4735 +static int free_space_shortage(znode * node /* node to check */ ,
4736 + carry_op * op /* operation being performed */ )
4737 +{
4738 + assert("nikita-1061", node != NULL);
4739 + assert("nikita-1062", op != NULL);
4740 +
4741 + switch (op->op) {
4742 + default:
4743 + impossible("nikita-1702", "Wrong opcode");
4744 + case COP_INSERT:
4745 + case COP_PASTE:
4746 + return space_needed_for_op(node, op) - znode_free_space(node);
4747 + case COP_EXTENT:
4748 + /* when inserting extent shift data around until insertion
4749 + point is utmost in the node. */
4750 + if (coord_wrt(op->u.insert.d->coord) == COORD_INSIDE)
4751 + return +1;
4752 + else
4753 + return -1;
4754 + }
4755 +}
4756 +
4757 +/* helper function: update node pointer in operation after insertion
4758 + point was probably shifted into @target. */
4759 +static znode *sync_op(carry_op * op, carry_node * target)
4760 +{
4761 + znode *insertion_node;
4762 +
4763 + /* reget node from coord: shift might move insertion coord to
4764 + the neighbor */
4765 + insertion_node = op->u.insert.d->coord->node;
4766 + /* if insertion point was actually moved into new node,
4767 + update carry node pointer in operation. */
4768 + if (insertion_node != carry_real(op->node)) {
4769 + op->node = target;
4770 + assert("nikita-2540", carry_real(target) == insertion_node);
4771 + }
4772 + assert("nikita-2541",
4773 + carry_real(op->node) == op->u.insert.d->coord->node);
4774 + return insertion_node;
4775 +}
4776 +
4777 +/*
4778 + * complete make_space() call: update tracked lock handle if necessary. See
4779 + * comments for fs/reiser4/carry.h:carry_track_type
4780 + */
4781 +static int
4782 +make_space_tail(carry_op * op, carry_level * doing, znode * orig_node)
4783 +{
4784 + int result;
4785 + carry_track_type tracking;
4786 + znode *node;
4787 +
4788 + tracking = doing->track_type;
4789 + node = op->u.insert.d->coord->node;
4790 +
4791 + if (tracking == CARRY_TRACK_NODE ||
4792 + (tracking == CARRY_TRACK_CHANGE && node != orig_node)) {
4793 + /* inserting or pasting into node different from
4794 + original. Update lock handle supplied by caller. */
4795 + assert("nikita-1417", doing->tracked != NULL);
4796 + done_lh(doing->tracked);
4797 + init_lh(doing->tracked);
4798 + result = longterm_lock_znode(doing->tracked, node,
4799 + ZNODE_WRITE_LOCK,
4800 + ZNODE_LOCK_HIPRI);
4801 + } else
4802 + result = 0;
4803 + return result;
4804 +}
4805 +
4806 +/* This is insertion policy function. It shifts data to the left and right
4807 + neighbors of insertion coord and allocates new nodes until there is enough
4808 + free space to complete @op.
4809 +
4810 + See comments in the body.
4811 +
4812 + Assumes that the node format favors insertions at the right end of the node
4813 + as node40 does.
4814 +
4815 +   See carry_flow() for details about flow insertion
4816 +*/
4817 +static int make_space(carry_op * op /* carry operation, insert or paste */ ,
4818 + carry_level * doing /* current carry queue */ ,
4819 + carry_level * todo /* carry queue on the parent level */ )
4820 +{
4821 + znode *node;
4822 + int result;
4823 + int not_enough_space;
4824 + int blk_alloc;
4825 + znode *orig_node;
4826 + __u32 flags;
4827 +
4828 + coord_t *coord;
4829 +
4830 + assert("nikita-890", op != NULL);
4831 + assert("nikita-891", todo != NULL);
4832 + assert("nikita-892",
4833 + op->op == COP_INSERT ||
4834 + op->op == COP_PASTE || op->op == COP_EXTENT);
4835 + assert("nikita-1607",
4836 + carry_real(op->node) == op->u.insert.d->coord->node);
4837 +
4838 + flags = op->u.insert.flags;
4839 +
4840 + /* NOTE check that new node can only be allocated after checking left
4841 + * and right neighbors. This is necessary for proper work of
4842 + * find_{left,right}_neighbor(). */
4843 + assert("nikita-3410", ergo(flags & COPI_DONT_ALLOCATE,
4844 + flags & COPI_DONT_SHIFT_LEFT));
4845 + assert("nikita-3411", ergo(flags & COPI_DONT_ALLOCATE,
4846 + flags & COPI_DONT_SHIFT_RIGHT));
4847 +
4848 + coord = op->u.insert.d->coord;
4849 + orig_node = node = coord->node;
4850 +
4851 + assert("nikita-908", node != NULL);
4852 + assert("nikita-909", node_plugin_by_node(node) != NULL);
4853 +
4854 + result = 0;
4855 + /* If there is not enough space in a node, try to shift something to
4856 + the left neighbor. This is a bit tricky, as locking to the left is
4857 + low priority. This is handled by restart logic in carry().
4858 + */
4859 + not_enough_space = free_space_shortage(node, op);
4860 + if (not_enough_space <= 0)
4861 + /* it is possible that carry was called when there actually
4862 + was enough space in the node. For example, when inserting
4863 + leftmost item so that delimiting keys have to be updated.
4864 + */
4865 + return make_space_tail(op, doing, orig_node);
4866 + if (!(flags & COPI_DONT_SHIFT_LEFT)) {
4867 + carry_node *left;
4868 + /* make note in statistics of an attempt to move
4869 + something into the left neighbor */
4870 + left = find_left_neighbor(op, doing);
4871 + if (unlikely(IS_ERR(left))) {
4872 + if (PTR_ERR(left) == -E_REPEAT)
4873 + return -E_REPEAT;
4874 + else {
4875 +			/* some error other than a restart request
4876 +			   occurred. This shouldn't happen. Issue a
4877 +			   warning and continue as if the left
4878 +			   neighbor did not exist.
4879 + */
4880 + warning("nikita-924",
4881 + "Error accessing left neighbor: %li",
4882 + PTR_ERR(left));
4883 + }
4884 + } else if (left != NULL) {
4885 +
4886 + /* shift everything possible on the left of and
4887 + including insertion coord into the left neighbor */
4888 + result = carry_shift_data(LEFT_SIDE, coord,
4889 + carry_real(left), doing, todo,
4890 + flags & COPI_GO_LEFT);
4891 +
4892 + /* reget node from coord: shift_left() might move
4893 + insertion coord to the left neighbor */
4894 + node = sync_op(op, left);
4895 +
4896 + not_enough_space = free_space_shortage(node, op);
4897 + /* There is not enough free space in @node, but
4898 +		   maybe there is enough free space in
4899 +		   @left. Various balancing decisions are valid here.
4900 +		   The same holds for shifting to the right.
4901 + */
4902 + }
4903 + }
4904 + /* If there still is not enough space, shift to the right */
4905 + if (not_enough_space > 0 && !(flags & COPI_DONT_SHIFT_RIGHT)) {
4906 + carry_node *right;
4907 +
4908 + right = find_right_neighbor(op, doing);
4909 + if (IS_ERR(right)) {
4910 + warning("nikita-1065",
4911 + "Error accessing right neighbor: %li",
4912 + PTR_ERR(right));
4913 + } else if (right != NULL) {
4914 + /* node containing insertion point, and its right
4915 + neighbor node are write locked by now.
4916 +
4917 + shift everything possible on the right of but
4918 + excluding insertion coord into the right neighbor
4919 + */
4920 + result = carry_shift_data(RIGHT_SIDE, coord,
4921 + carry_real(right),
4922 + doing, todo,
4923 + flags & COPI_GO_RIGHT);
4924 + /* reget node from coord: shift_right() might move
4925 + insertion coord to the right neighbor */
4926 + node = sync_op(op, right);
4927 + not_enough_space = free_space_shortage(node, op);
4928 + }
4929 + }
4930 + /* If there is still not enough space, allocate new node(s).
4931 +
4932 + We try to allocate new blocks if COPI_DONT_ALLOCATE is not set in
4933 + the carry operation flags (currently this is needed during flush
4934 + only).
4935 + */
4936 + for (blk_alloc = 0;
4937 + not_enough_space > 0 && result == 0 && blk_alloc < 2 &&
4938 + !(flags & COPI_DONT_ALLOCATE); ++blk_alloc) {
4939 + carry_node *fresh; /* new node we are allocating */
4940 + coord_t coord_shadow; /* remembered insertion point before
4941 + * shifting data into new node */
4942 + carry_node *node_shadow; /* remembered insertion node before
4943 + * shifting */
4944 + unsigned int gointo; /* whether insertion point should move
4945 + * into newly allocated node */
4946 +
4947 + /* allocate new node on the right of @node. Znode and disk
4948 + fake block number for new node are allocated.
4949 +
4950 + add_new_znode() posts carry operation COP_INSERT with
4951 + COPT_CHILD option to the parent level to add
4952 + pointer to newly created node to its parent.
4953 +
4954 + Subtle point: if several new nodes are required to complete
4955 + insertion operation at this level, they will be inserted
4956 + into their parents in the order of creation, which means
4957 + that @node will be valid "cookie" at the time of insertion.
4958 +
4959 + */
4960 + fresh = add_new_znode(node, op->node, doing, todo);
4961 + if (IS_ERR(fresh))
4962 + return PTR_ERR(fresh);
4963 +
4964 + /* Try to shift into new node. */
4965 + result = lock_carry_node(doing, fresh);
4966 + zput(carry_real(fresh));
4967 + if (result != 0) {
4968 + warning("nikita-947",
4969 + "Cannot lock new node: %i", result);
4970 + return result;
4971 + }
4972 +
4973 + /* both nodes are write locked by now.
4974 +
4975 + shift everything possible on the right of and
4976 + including insertion coord into the right neighbor.
4977 + */
4978 + coord_dup(&coord_shadow, op->u.insert.d->coord);
4979 + node_shadow = op->node;
4980 + /* move insertion point into newly created node if:
4981 +
4982 + . insertion point is rightmost in the source node, or
4983 + . this is not the first node we are allocating in a row.
4984 + */
4985 + gointo =
4986 + (blk_alloc > 0) ||
4987 + coord_is_after_rightmost(op->u.insert.d->coord);
4988 +
4989 + result = carry_shift_data(RIGHT_SIDE, coord, carry_real(fresh),
4990 + doing, todo, gointo);
4991 + /* if insertion point was actually moved into new node,
4992 + update carry node pointer in operation. */
4993 + node = sync_op(op, fresh);
4994 + not_enough_space = free_space_shortage(node, op);
4995 + if ((not_enough_space > 0) && (node != coord_shadow.node)) {
4996 +			/* there is not enough free space in the new node.
4997 +			   Shift the insertion point back to @node_shadow so
4998 +			   that the next new node will be inserted between
4999 +			   @node_shadow and @fresh.
5000 + */
5001 + coord_normalize(&coord_shadow);
5002 + coord_dup(coord, &coord_shadow);
5003 + node = coord->node;
5004 + op->node = node_shadow;
5005 + if (1 || (flags & COPI_STEP_BACK)) {
5006 + /* still not enough space?! Maybe there is
5007 + enough space in the source node (i.e., node
5008 + data are moved from) now.
5009 + */
5010 + not_enough_space =
5011 + free_space_shortage(node, op);
5012 + }
5013 + }
5014 + }
5015 + if (not_enough_space > 0) {
5016 + if (!(flags & COPI_DONT_ALLOCATE))
5017 + warning("nikita-948", "Cannot insert new item");
5018 + result = -E_NODE_FULL;
5019 + }
5020 + assert("nikita-1622", ergo(result == 0,
5021 + carry_real(op->node) == coord->node));
5022 + assert("nikita-2616", coord == op->u.insert.d->coord);
5023 + if (result == 0)
5024 + result = make_space_tail(op, doing, orig_node);
5025 + return result;
5026 +}
5027 +
5028 +/* insert_paste_common() - common part of insert and paste operations
5029 +
5030 + This function performs common part of COP_INSERT and COP_PASTE.
5031 +
5032 + There are two ways in which insertion/paste can be requested:
5033 +
5034 + . by directly supplying reiser4_item_data. In this case, op ->
5035 + u.insert.type is set to COPT_ITEM_DATA.
5036 +
5037 +   . by supplying a pointer to the child which is to be inserted into the parent. In this
5038 + case op -> u.insert.type == COPT_CHILD.
5039 +
5040 +   . by supplying the key of the new item/unit. This is currently only used
5041 +   during extent insertion.
5042 +
5043 + This is required, because when new node is allocated we don't know at what
5044 + position pointer to it is to be stored in the parent. Actually, we don't
5045 + even know what its parent will be, because parent can be re-balanced
5046 + concurrently and new node re-parented, and because parent can be full and
5047 + pointer to the new node will go into some other node.
5048 +
5049 + insert_paste_common() resolves pointer to child node into position in the
5050 + parent by calling find_new_child_coord(), that fills
5051 + reiser4_item_data. After this, insertion/paste proceeds uniformly.
5052 +
5053 + Another complication is with finding free space during pasting. It may
5054 + happen that while shifting items to the neighbors and newly allocated
5055 + nodes, insertion coord can no longer be in the item we wanted to paste
5056 + into. At this point, paste becomes (morphs) into insert. Moreover free
5057 + space analysis has to be repeated, because amount of space required for
5058 + insertion is different from that of paste (item header overhead, etc).
5059 +
5060 + This function "unifies" different insertion modes (by resolving child
5061 + pointer or key into insertion coord), and then calls make_space() to free
5062 + enough space in the node by shifting data to the left and right and by
5063 + allocating new nodes if necessary. Carry operation knows amount of space
5064 + required for its completion. After enough free space is obtained, caller of
5065 + this function (carry_{insert,paste,etc.}) performs actual insertion/paste
5066 + by calling item plugin method.
5067 +
5068 +*/
5069 +static int insert_paste_common(carry_op * op /* carry operation being
5070 + * performed */ ,
5071 + carry_level * doing /* current carry level */ ,
5072 + carry_level * todo /* next carry level */ ,
5073 + carry_insert_data * cdata /* pointer to
5074 + * cdata */ ,
5075 + coord_t * coord /* insertion/paste coord */ ,
5076 + reiser4_item_data * data /* data to be
5077 + * inserted/pasted */ )
5078 +{
5079 + assert("nikita-981", op != NULL);
5080 + assert("nikita-980", todo != NULL);
5081 + assert("nikita-979", (op->op == COP_INSERT) || (op->op == COP_PASTE)
5082 + || (op->op == COP_EXTENT));
5083 +
5084 + if (op->u.insert.type == COPT_PASTE_RESTARTED) {
5085 + /* nothing to do. Fall through to make_space(). */
5086 + ;
5087 + } else if (op->u.insert.type == COPT_KEY) {
5088 + node_search_result intra_node;
5089 + znode *node;
5090 +		/* The problem with doing batching at the lowest level is that
5091 + operations here are given by coords where modification is
5092 + to be performed, and one modification can invalidate coords
5093 + of all following operations.
5094 +
5095 +		   So, we implement yet another operation type that uses the
5096 +		   only "locator" stable across shifting of data between
5097 +		   nodes, etc.: the key (COPT_KEY).
5098 +
5099 + This clause resolves key to the coord in the node.
5100 +
5101 + But node can change also. Probably some pieces have to be
5102 + added to the lock_carry_node(), to lock node by its key.
5103 +
5104 + */
5105 + /* NOTE-NIKITA Lookup bias is fixed to FIND_EXACT. Complain
5106 + if you need something else. */
5107 + op->u.insert.d->coord = coord;
5108 + node = carry_real(op->node);
5109 + intra_node = node_plugin_by_node(node)->lookup
5110 + (node, op->u.insert.d->key, FIND_EXACT,
5111 + op->u.insert.d->coord);
5112 + if ((intra_node != NS_FOUND) && (intra_node != NS_NOT_FOUND)) {
5113 + warning("nikita-1715", "Intra node lookup failure: %i",
5114 + intra_node);
5115 + return intra_node;
5116 + }
5117 + } else if (op->u.insert.type == COPT_CHILD) {
5118 + /* if we are asked to insert pointer to the child into
5119 + internal node, first convert pointer to the child into
5120 + coord within parent node.
5121 + */
5122 + znode *child;
5123 + int result;
5124 +
5125 + op->u.insert.d = cdata;
5126 + op->u.insert.d->coord = coord;
5127 + op->u.insert.d->data = data;
5128 + op->u.insert.d->coord->node = carry_real(op->node);
5129 + result = find_new_child_coord(op);
5130 + child = carry_real(op->u.insert.child);
5131 + if (result != NS_NOT_FOUND) {
5132 + warning("nikita-993",
5133 + "Cannot find a place for child pointer: %i",
5134 + result);
5135 + return result;
5136 + }
5137 + /* This only happens when we did multiple insertions at
5138 +		   the previous level, trying to insert a single item, and
5139 +		   it so happened that insertion of pointers to all new
5140 +		   nodes before this one already caused the parent node to
5141 +		   split (maybe several times).
5142 +
5143 + I am going to come up with better solution.
5144 +
5145 + You are not expected to understand this.
5146 + -- v6root/usr/sys/ken/slp.c
5147 +
5148 + Basically, what happens here is the following: carry came
5149 + to the parent level and is about to insert internal item
5150 + pointing to the child node that it just inserted in the
5151 + level below. Position where internal item is to be inserted
5152 + was found by find_new_child_coord() above, but node of the
5153 + current carry operation (that is, parent node of child
5154 + inserted on the previous level), was determined earlier in
5155 + the lock_carry_level/lock_carry_node. It could so happen
5156 +		   that other carry operations performed on the parent level
5157 +		   have already split the parent node, so that the insertion
5158 +		   point moved into another node. Handle this by creating a new carry
5159 + node for insertion point if necessary.
5160 + */
5161 + if (carry_real(op->node) != op->u.insert.d->coord->node) {
5162 + pool_ordering direction;
5163 + znode *z1;
5164 + znode *z2;
5165 + reiser4_key k1;
5166 + reiser4_key k2;
5167 +
5168 + /*
5169 + * determine in what direction insertion point
5170 + * moved. Do this by comparing delimiting keys.
5171 + */
5172 + z1 = op->u.insert.d->coord->node;
5173 + z2 = carry_real(op->node);
5174 + if (keyle(leftmost_key_in_node(z1, &k1),
5175 + leftmost_key_in_node(z2, &k2)))
5176 + /* insertion point moved to the left */
5177 + direction = POOLO_BEFORE;
5178 + else
5179 + /* insertion point moved to the right */
5180 + direction = POOLO_AFTER;
5181 +
5182 + op->node = add_carry_skip(doing, direction, op->node);
5183 + if (IS_ERR(op->node))
5184 + return PTR_ERR(op->node);
5185 + op->node->node = op->u.insert.d->coord->node;
5186 + op->node->free = 1;
5187 + result = lock_carry_node(doing, op->node);
5188 + if (result != 0)
5189 + return result;
5190 + }
5191 +
5192 + /*
5193 +		 * set up the key of the item being inserted: we are inserting an
5194 +		 * internal item and its key is (by the very definition of
5195 +		 * a search tree) the leftmost key in the child node.
5196 + */
5197 + write_lock_dk(znode_get_tree(child));
5198 + op->u.insert.d->key = leftmost_key_in_node(child,
5199 + znode_get_ld_key(child));
5200 + write_unlock_dk(znode_get_tree(child));
5201 + op->u.insert.d->data->arg = op->u.insert.brother;
5202 + } else {
5203 + assert("vs-243", op->u.insert.d->coord != NULL);
5204 + op->u.insert.d->coord->node = carry_real(op->node);
5205 + }
5206 +
5207 + /* find free space. */
5208 + return make_space(op, doing, todo);
5209 +}
5210 +
5211 +/* handle carry COP_INSERT operation.
5212 +
5213 + Insert new item into node. New item can be given in one of two ways:
5214 +
5215 + - by passing &tree_coord and &reiser4_item_data as part of @op. This is
5216 + only applicable at the leaf/twig level.
5217 +
5218 + - by passing a child node pointer to which is to be inserted by this
5219 + operation.
5220 +
5221 +*/
5222 +static int carry_insert(carry_op * op /* operation to perform */ ,
5223 + carry_level * doing /* queue of operations @op
5224 + * is part of */ ,
5225 + carry_level * todo /* queue where new operations
5226 + * are accumulated */ )
5227 +{
5228 + znode *node;
5229 + carry_insert_data cdata;
5230 + coord_t coord;
5231 + reiser4_item_data data;
5232 + carry_plugin_info info;
5233 + int result;
5234 +
5235 + assert("nikita-1036", op != NULL);
5236 + assert("nikita-1037", todo != NULL);
5237 + assert("nikita-1038", op->op == COP_INSERT);
5238 +
5239 + coord_init_zero(&coord);
5240 +
5241 + /* perform common functionality of insert and paste. */
5242 + result = insert_paste_common(op, doing, todo, &cdata, &coord, &data);
5243 + if (result != 0)
5244 + return result;
5245 +
5246 + node = op->u.insert.d->coord->node;
5247 + assert("nikita-1039", node != NULL);
5248 + assert("nikita-1040", node_plugin_by_node(node) != NULL);
5249 +
5250 + assert("nikita-949",
5251 + space_needed_for_op(node, op) <= znode_free_space(node));
5252 +
5253 + /* ask node layout to create new item. */
5254 + info.doing = doing;
5255 + info.todo = todo;
5256 + result = node_plugin_by_node(node)->create_item
5257 + (op->u.insert.d->coord, op->u.insert.d->key, op->u.insert.d->data,
5258 + &info);
5259 + doing->restartable = 0;
5260 + znode_make_dirty(node);
5261 +
5262 + return result;
5263 +}
5264 +
5265 +/*
5266 + * Flow insertion code. COP_INSERT_FLOW is special tree operation that is
5267 + * supplied with a "flow" (that is, a stream of data) and inserts it into tree
5268 + * by slicing into multiple items.
5269 + */
5270 +
5271 +#define flow_insert_point(op) ( ( op ) -> u.insert_flow.insert_point )
5272 +#define flow_insert_flow(op) ( ( op ) -> u.insert_flow.flow )
5273 +#define flow_insert_data(op) ( ( op ) -> u.insert_flow.data )
5274 +
5275 +static size_t item_data_overhead(carry_op * op)
5276 +{
5277 + if (flow_insert_data(op)->iplug->b.estimate == NULL)
5278 + return 0;
5279 + return (flow_insert_data(op)->iplug->b.
5280 + estimate(NULL /* estimate insertion */ , flow_insert_data(op)) -
5281 + flow_insert_data(op)->length);
5282 +}
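+
+/* Editor's note, a hedged worked example: if an item plugin's
+   ->b.estimate() method reported, say, 40 bytes for a 32-byte
+   insertion (numbers invented for illustration), item_data_overhead()
+   would return 40 - 32 = 8 bytes of per-item bookkeeping. Plugins
+   without ->b.estimate() report no overhead here. */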
5283 +
5284 +/* FIXME-VS: this is called several times during one make_flow_for_insertion
5285 + and it will always return the same result. Some optimization could be made
5286 + by calculating this value once at the beginning and passing it around. That
5287 + would reduce some flexibility in future changes
5288 +*/
5289 +static int can_paste(coord_t *, const reiser4_key *, const reiser4_item_data *);
5290 +static size_t flow_insertion_overhead(carry_op * op)
5291 +{
5292 + znode *node;
5293 + size_t insertion_overhead;
5294 +
5295 + node = flow_insert_point(op)->node;
5296 + insertion_overhead = 0;
5297 + if (node->nplug->item_overhead &&
5298 + !can_paste(flow_insert_point(op), &flow_insert_flow(op)->key,
5299 + flow_insert_data(op)))
5300 + insertion_overhead =
5301 + node->nplug->item_overhead(node, NULL) +
5302 + item_data_overhead(op);
5303 + return insertion_overhead;
5304 +}
5305 +
5306 +/* how many bytes of the flow fit into the node */
5307 +static int what_can_fit_into_node(carry_op * op)
5308 +{
5309 + size_t free, overhead;
5310 +
5311 + overhead = flow_insertion_overhead(op);
5312 + free = znode_free_space(flow_insert_point(op)->node);
5313 + if (free <= overhead)
5314 + return 0;
5315 + free -= overhead;
5316 +	/* FIXME: flow->length is loff_t only to avoid overflow in case of an expanding truncate */
5317 + if (free < op->u.insert_flow.flow->length)
5318 + return free;
5319 + return (int)op->u.insert_flow.flow->length;
5320 +}
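+
+/* Editor's note, a hedged worked example of the computation above:
+   with 300 bytes free in the node, 24 bytes of insertion overhead and
+   1000 bytes of flow remaining (all numbers hypothetical), the result
+   is min(300 - 24, 1000) = 276, i.e. only 276 bytes of the flow are
+   written into this node on the current pass. */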
5321 +
5322 +/* in make_space_for_flow_insertion we need to check either whether the whole
5323 +   flow fits into a node or whether a minimal fraction of the flow does */
5324 +static int enough_space_for_whole_flow(carry_op * op)
5325 +{
5326 + return (unsigned)what_can_fit_into_node(op) ==
5327 + op->u.insert_flow.flow->length;
5328 +}
5329 +
5330 +#define MIN_FLOW_FRACTION 1
5331 +static int enough_space_for_min_flow_fraction(carry_op * op)
5332 +{
5333 + assert("vs-902", coord_is_after_rightmost(flow_insert_point(op)));
5334 +
5335 + return what_can_fit_into_node(op) >= MIN_FLOW_FRACTION;
5336 +}
5337 +
5338 +/* this returns 0 if the left neighbor was obtained successfully, everything
5339 +   up to and including the insertion point was shifted into it, and the left
5340 +   neighbor still has enough free space for a minimal fraction of the flow */
5341 +static int
5342 +make_space_by_shift_left(carry_op * op, carry_level * doing, carry_level * todo)
5343 +{
5344 + carry_node *left;
5345 + znode *orig;
5346 +
5347 + left = find_left_neighbor(op, doing);
5348 + if (unlikely(IS_ERR(left))) {
5349 + warning("vs-899",
5350 + "make_space_by_shift_left: "
5351 + "error accessing left neighbor: %li", PTR_ERR(left));
5352 + return 1;
5353 + }
5354 + if (left == NULL)
5355 + /* left neighbor either does not exist or is unformatted
5356 + node */
5357 + return 1;
5358 +
5359 + orig = flow_insert_point(op)->node;
5360 +	/* try to shift the content of node @orig from its head up to and
5361 +	   including the insertion point into the left neighbor */
5362 + carry_shift_data(LEFT_SIDE, flow_insert_point(op), carry_real(left), doing, todo, 1 /* including insert
5363 + * point */ );
5364 + if (carry_real(left) != flow_insert_point(op)->node) {
5365 + /* insertion point did not move */
5366 + return 1;
5367 + }
5368 +
5369 + /* insertion point is set after last item in the node */
5370 + assert("vs-900", coord_is_after_rightmost(flow_insert_point(op)));
5371 +
5372 + if (!enough_space_for_min_flow_fraction(op)) {
5373 + /* insertion point node does not have enough free space to put
5374 + even minimal portion of flow into it, therefore, move
5375 + insertion point back to orig node (before first item) */
5376 + coord_init_before_first_item(flow_insert_point(op), orig);
5377 + return 1;
5378 + }
5379 +
5380 + /* part of flow is to be written to the end of node */
5381 + op->node = left;
5382 + return 0;
5383 +}
5384 +
5385 +/* this returns 0 if the right neighbor was obtained successfully, everything
5386 +   to the right of the insertion point was shifted to it, and the node got
5387 +   enough free space for a minimal fraction of the flow */
5388 +static int
5389 +make_space_by_shift_right(carry_op * op, carry_level * doing,
5390 + carry_level * todo)
5391 +{
5392 + carry_node *right;
5393 +
5394 + right = find_right_neighbor(op, doing);
5395 + if (unlikely(IS_ERR(right))) {
5396 + warning("nikita-1065", "shift_right_excluding_insert_point: "
5397 + "error accessing right neighbor: %li", PTR_ERR(right));
5398 + return 1;
5399 + }
5400 + if (right) {
5401 + /* shift everything possible on the right of but excluding
5402 + insertion coord into the right neighbor */
5403 + carry_shift_data(RIGHT_SIDE, flow_insert_point(op), carry_real(right), doing, todo, 0 /* not
5404 + * including
5405 + * insert
5406 + * point */ );
5407 + } else {
5408 + /* right neighbor either does not exist or is unformatted
5409 + node */
5410 + ;
5411 + }
5412 + if (coord_is_after_rightmost(flow_insert_point(op))) {
5413 + if (enough_space_for_min_flow_fraction(op)) {
5414 + /* part of flow is to be written to the end of node */
5415 + return 0;
5416 + }
5417 + }
5418 +
5419 + /* new node is to be added if insert point node did not get enough
5420 + space for whole flow */
5421 + return 1;
5422 +}
5423 +
5424 +/* this returns 0 when insert coord is set at the node end and fraction of flow
5425 + fits into that node */
5426 +static int
5427 +make_space_by_new_nodes(carry_op * op, carry_level * doing, carry_level * todo)
5428 +{
5429 + int result;
5430 + znode *node;
5431 + carry_node *new;
5432 +
5433 + node = flow_insert_point(op)->node;
5434 +
5435 + if (op->u.insert_flow.new_nodes == CARRY_FLOW_NEW_NODES_LIMIT)
5436 + return RETERR(-E_NODE_FULL);
5437 + /* add new node after insert point node */
5438 + new = add_new_znode(node, op->node, doing, todo);
5439 + if (unlikely(IS_ERR(new))) {
5440 + return PTR_ERR(new);
5441 + }
5442 + result = lock_carry_node(doing, new);
5443 + zput(carry_real(new));
5444 + if (unlikely(result)) {
5445 + return result;
5446 + }
5447 + op->u.insert_flow.new_nodes++;
5448 + if (!coord_is_after_rightmost(flow_insert_point(op))) {
5449 + carry_shift_data(RIGHT_SIDE, flow_insert_point(op), carry_real(new), doing, todo, 0 /* not
5450 + * including
5451 + * insert
5452 + * point */ );
5453 +
5454 + assert("vs-901",
5455 + coord_is_after_rightmost(flow_insert_point(op)));
5456 +
5457 + if (enough_space_for_min_flow_fraction(op)) {
5458 + return 0;
5459 + }
5460 + if (op->u.insert_flow.new_nodes == CARRY_FLOW_NEW_NODES_LIMIT)
5461 + return RETERR(-E_NODE_FULL);
5462 +
5463 + /* add one more new node */
5464 + new = add_new_znode(node, op->node, doing, todo);
5465 + if (unlikely(IS_ERR(new))) {
5466 + return PTR_ERR(new);
5467 + }
5468 + result = lock_carry_node(doing, new);
5469 + zput(carry_real(new));
5470 + if (unlikely(result)) {
5471 + return result;
5472 + }
5473 + op->u.insert_flow.new_nodes++;
5474 + }
5475 +
5476 + /* move insertion point to new node */
5477 + coord_init_before_first_item(flow_insert_point(op), carry_real(new));
5478 + op->node = new;
5479 + return 0;
5480 +}
5481 +
5482 +static int
5483 +make_space_for_flow_insertion(carry_op * op, carry_level * doing,
5484 + carry_level * todo)
5485 +{
5486 + __u32 flags = op->u.insert_flow.flags;
5487 +
5488 + if (enough_space_for_whole_flow(op)) {
5489 + /* whole flow fits into insert point node */
5490 + return 0;
5491 + }
5492 +
5493 + if (!(flags & COPI_DONT_SHIFT_LEFT)
5494 + && (make_space_by_shift_left(op, doing, todo) == 0)) {
5495 + /* insert point is shifted to left neighbor of original insert
5496 + point node and is set after last unit in that node. It has
5497 + enough space to fit at least minimal fraction of flow. */
5498 + return 0;
5499 + }
5500 +
5501 + if (enough_space_for_whole_flow(op)) {
5502 + /* whole flow fits into insert point node */
5503 + return 0;
5504 + }
5505 +
5506 + if (!(flags & COPI_DONT_SHIFT_RIGHT)
5507 + && (make_space_by_shift_right(op, doing, todo) == 0)) {
5508 + /* insert point is still set to the same node, but there is
5509 + nothing to the right of insert point. */
5510 + return 0;
5511 + }
5512 +
5513 + if (enough_space_for_whole_flow(op)) {
5514 + /* whole flow fits into insert point node */
5515 + return 0;
5516 + }
5517 +
5518 + return make_space_by_new_nodes(op, doing, todo);
5519 +}
5520 +
5521 +/* implements COP_INSERT_FLOW operation */
5522 +static int
5523 +carry_insert_flow(carry_op * op, carry_level * doing, carry_level * todo)
5524 +{
5525 + int result;
5526 + flow_t *f;
5527 + coord_t *insert_point;
5528 + node_plugin *nplug;
5529 + carry_plugin_info info;
5530 + znode *orig_node;
5531 + lock_handle *orig_lh;
5532 +
5533 + f = op->u.insert_flow.flow;
5534 + result = 0;
5535 +
5536 + /* carry system needs this to work */
5537 + info.doing = doing;
5538 + info.todo = todo;
5539 +
5540 + orig_node = flow_insert_point(op)->node;
5541 + orig_lh = doing->tracked;
5542 +
5543 + while (f->length) {
5544 + result = make_space_for_flow_insertion(op, doing, todo);
5545 + if (result)
5546 + break;
5547 +
5548 + insert_point = flow_insert_point(op);
5549 + nplug = node_plugin_by_node(insert_point->node);
5550 +
5551 + /* compose item data for insertion/pasting */
5552 + flow_insert_data(op)->data = f->data;
5553 + flow_insert_data(op)->length = what_can_fit_into_node(op);
5554 +
5555 + if (can_paste(insert_point, &f->key, flow_insert_data(op))) {
5556 + /* insert point is set to item of file we are writing to and we have to append to it */
5557 + assert("vs-903", insert_point->between == AFTER_UNIT);
5558 + nplug->change_item_size(insert_point,
5559 + flow_insert_data(op)->length);
5560 + flow_insert_data(op)->iplug->b.paste(insert_point,
5561 + flow_insert_data
5562 + (op), &info);
5563 + } else {
5564 + /* new item must be inserted */
5565 + pos_in_node_t new_pos;
5566 + flow_insert_data(op)->length += item_data_overhead(op);
5567 +
5568 + /* FIXME-VS: this is because node40_create_item changes
5569 + insert_point for obscure reasons */
5570 + switch (insert_point->between) {
5571 + case AFTER_ITEM:
5572 + new_pos = insert_point->item_pos + 1;
5573 + break;
5574 + case EMPTY_NODE:
5575 + new_pos = 0;
5576 + break;
5577 + case BEFORE_ITEM:
5578 + assert("vs-905", insert_point->item_pos == 0);
5579 + new_pos = 0;
5580 + break;
5581 + default:
5582 + impossible("vs-906",
5583 + "carry_insert_flow: invalid coord");
5584 + new_pos = 0;
5585 + break;
5586 + }
5587 +
5588 + nplug->create_item(insert_point, &f->key,
5589 + flow_insert_data(op), &info);
5590 + coord_set_item_pos(insert_point, new_pos);
5591 + }
5592 + coord_init_after_item_end(insert_point);
5593 + doing->restartable = 0;
5594 + znode_make_dirty(insert_point->node);
5595 +
5596 + move_flow_forward(f, (unsigned)flow_insert_data(op)->length);
5597 + }
5598 +
5599 + if (orig_node != flow_insert_point(op)->node) {
5600 + /* move lock to new insert point */
5601 + done_lh(orig_lh);
5602 + init_lh(orig_lh);
5603 + result =
5604 + longterm_lock_znode(orig_lh, flow_insert_point(op)->node,
5605 + ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI);
5606 + }
5607 +
5608 + return result;
5609 +}
5610 +
5611 +/* implements COP_DELETE operation
5612 +
5613 +   Remove the pointer to @op -> u.delete.child from its parent.
5614 +
5615 +   This function also handles killing of the tree root if the last pointer
5616 +   was removed from it. This is complicated by our handling of the "twig"
5617 +   level: a root on the twig level is never killed.
5618 +
5619 +*/
5620 +static int carry_delete(carry_op * op /* operation to be performed */ ,
5621 + carry_level * doing UNUSED_ARG /* current carry
5622 + * level */ ,
5623 + carry_level * todo /* next carry level */ )
5624 +{
5625 + int result;
5626 + coord_t coord;
5627 + coord_t coord2;
5628 + znode *parent;
5629 + znode *child;
5630 + carry_plugin_info info;
5631 + reiser4_tree *tree;
5632 +
5633 + /*
5634 + * This operation is called to delete internal item pointing to the
5635 + * child node that was removed by carry from the tree on the previous
5636 + * tree level.
5637 + */
5638 +
5639 + assert("nikita-893", op != NULL);
5640 + assert("nikita-894", todo != NULL);
5641 + assert("nikita-895", op->op == COP_DELETE);
5642 +
5643 + coord_init_zero(&coord);
5644 + coord_init_zero(&coord2);
5645 +
5646 + parent = carry_real(op->node);
5647 + child = op->u.delete.child ?
5648 + carry_real(op->u.delete.child) : op->node->node;
5649 + tree = znode_get_tree(child);
5650 + read_lock_tree(tree);
5651 +
5652 + /*
5653 + * @parent was determined when carry entered parent level
5654 + * (lock_carry_level/lock_carry_node). Since then, actual parent of
5655 + * @child node could change due to other carry operations performed on
5656 + * the parent level. Check for this.
5657 + */
5658 +
5659 + if (znode_parent(child) != parent) {
5660 + /* NOTE-NIKITA add stat counter for this. */
5661 + parent = znode_parent(child);
5662 + assert("nikita-2581", find_carry_node(doing, parent));
5663 + }
5664 + read_unlock_tree(tree);
5665 +
5666 + assert("nikita-1213", znode_get_level(parent) > LEAF_LEVEL);
5667 +
5668 +	/* Twig level horrors: the tree should be of height at least 2. So, the
5669 +	   last pointer from the root at the twig level is preserved even if the
5670 +	   child is empty. This is ugly, but that is how it was architected.
5671 + */
5672 +
5673 + if (znode_is_root(parent) &&
5674 + znode_get_level(parent) <= REISER4_MIN_TREE_HEIGHT &&
5675 + node_num_items(parent) == 1) {
5676 + /* Delimiting key manipulations. */
5677 + write_lock_dk(tree);
5678 + znode_set_ld_key(child, znode_set_ld_key(parent, min_key()));
5679 + znode_set_rd_key(child, znode_set_rd_key(parent, max_key()));
5680 + ZF_SET(child, JNODE_DKSET);
5681 + write_unlock_dk(tree);
5682 +
5683 + /* @child escaped imminent death! */
5684 + ZF_CLR(child, JNODE_HEARD_BANSHEE);
5685 + return 0;
5686 + }
5687 +
5688 + /* convert child pointer to the coord_t */
5689 + result = find_child_ptr(parent, child, &coord);
5690 + if (result != NS_FOUND) {
5691 + warning("nikita-994", "Cannot find child pointer: %i", result);
5692 + print_coord_content("coord", &coord);
5693 + return result;
5694 + }
5695 +
5696 + coord_dup(&coord2, &coord);
5697 + info.doing = doing;
5698 + info.todo = todo;
5699 + {
5700 + /*
5701 + * Actually kill internal item: prepare structure with
5702 + * arguments for ->cut_and_kill() method...
5703 + */
5704 +
5705 + struct carry_kill_data kdata;
5706 + kdata.params.from = &coord;
5707 + kdata.params.to = &coord2;
5708 + kdata.params.from_key = NULL;
5709 + kdata.params.to_key = NULL;
5710 + kdata.params.smallest_removed = NULL;
5711 + kdata.params.truncate = 1;
5712 + kdata.flags = op->u.delete.flags;
5713 + kdata.inode = NULL;
5714 + kdata.left = NULL;
5715 + kdata.right = NULL;
5716 + kdata.buf = NULL;
5717 + /* ... and call it. */
5718 + result = node_plugin_by_node(parent)->cut_and_kill(&kdata,
5719 + &info);
5720 + }
5721 + doing->restartable = 0;
5722 +
5723 + /* check whether root should be killed violently */
5724 + if (znode_is_root(parent) &&
5725 + /* don't kill roots at and lower than twig level */
5726 + znode_get_level(parent) > REISER4_MIN_TREE_HEIGHT &&
5727 + node_num_items(parent) == 1) {
5728 + result = kill_tree_root(coord.node);
5729 + }
5730 +
5731 + return result < 0 ? : 0;
5732 +}
5733 +
5734 +/* implements COP_CUT operation
5735 +
5736 + Cuts part or whole content of node.
5737 +
5738 +*/
5739 +static int carry_cut(carry_op * op /* operation to be performed */ ,
5740 + carry_level * doing /* current carry level */ ,
5741 + carry_level * todo /* next carry level */ )
5742 +{
5743 + int result;
5744 + carry_plugin_info info;
5745 + node_plugin *nplug;
5746 +
5747 + assert("nikita-896", op != NULL);
5748 + assert("nikita-897", todo != NULL);
5749 + assert("nikita-898", op->op == COP_CUT);
5750 +
5751 + info.doing = doing;
5752 + info.todo = todo;
5753 +
5754 + nplug = node_plugin_by_node(carry_real(op->node));
5755 + if (op->u.cut_or_kill.is_cut)
5756 + result = nplug->cut(op->u.cut_or_kill.u.cut, &info);
5757 + else
5758 + result = nplug->cut_and_kill(op->u.cut_or_kill.u.kill, &info);
5759 +
5760 + doing->restartable = 0;
5761 + return result < 0 ? : 0;
5762 +}
5763 +
5764 +/* helper function for carry_paste(): returns true if @op can be continued as
5765 + paste */
5766 +static int
5767 +can_paste(coord_t * icoord, const reiser4_key * key,
5768 + const reiser4_item_data * data)
5769 +{
5770 + coord_t circa;
5771 + item_plugin *new_iplug;
5772 + item_plugin *old_iplug;
5773 + int result = 0; /* to keep gcc shut */
5774 +
5775 + assert("", icoord->between != AT_UNIT);
5776 +
5777 + /* obviously, one cannot paste when node is empty---there is nothing
5778 + to paste into. */
5779 + if (node_is_empty(icoord->node))
5780 + return 0;
5781 + /* if insertion point is at the middle of the item, then paste */
5782 + if (!coord_is_between_items(icoord))
5783 + return 1;
5784 + coord_dup(&circa, icoord);
5785 + circa.between = AT_UNIT;
5786 +
5787 + old_iplug = item_plugin_by_coord(&circa);
5788 + new_iplug = data->iplug;
5789 +
5790 + /* check whether we can paste to the item @icoord is "at" when we
5791 + ignore ->between field */
5792 + if (old_iplug == new_iplug && item_can_contain_key(&circa, key, data)) {
5793 + result = 1;
5794 + } else if (icoord->between == BEFORE_UNIT
5795 + || icoord->between == BEFORE_ITEM) {
5796 + /* otherwise, try to glue to the item at the left, if any */
5797 + coord_dup(&circa, icoord);
5798 + if (coord_set_to_left(&circa)) {
5799 + result = 0;
5800 + coord_init_before_item(icoord);
5801 + } else {
5802 + old_iplug = item_plugin_by_coord(&circa);
5803 + result = (old_iplug == new_iplug)
5804 + && item_can_contain_key(icoord, key, data);
5805 + if (result) {
5806 + coord_dup(icoord, &circa);
5807 + icoord->between = AFTER_UNIT;
5808 + }
5809 + }
5810 + } else if (icoord->between == AFTER_UNIT
5811 + || icoord->between == AFTER_ITEM) {
5812 + coord_dup(&circa, icoord);
5813 + /* otherwise, try to glue to the item at the right, if any */
5814 + if (coord_set_to_right(&circa)) {
5815 + result = 0;
5816 + coord_init_after_item(icoord);
5817 + } else {
5818 + int (*cck) (const coord_t *, const reiser4_key *,
5819 + const reiser4_item_data *);
5820 +
5821 + old_iplug = item_plugin_by_coord(&circa);
5822 +
5823 + cck = old_iplug->b.can_contain_key;
5824 + if (cck == NULL)
5825 + /* item doesn't define ->can_contain_key
5826 + method? So it is not expandable. */
5827 + result = 0;
5828 + else {
5829 + result = (old_iplug == new_iplug)
5830 + && cck(&circa /*icoord */ , key, data);
5831 + if (result) {
5832 + coord_dup(icoord, &circa);
5833 + icoord->between = BEFORE_UNIT;
5834 + }
5835 + }
5836 + }
5837 + } else
5838 + impossible("nikita-2513", "Nothing works");
5839 + if (result) {
5840 + if (icoord->between == BEFORE_ITEM) {
5841 + assert("vs-912", icoord->unit_pos == 0);
5842 + icoord->between = BEFORE_UNIT;
5843 + } else if (icoord->between == AFTER_ITEM) {
5844 + coord_init_after_item_end(icoord);
5845 + }
5846 + }
5847 + return result;
5848 +}
5849 +
5850 +/* implements COP_PASTE operation
5851 +
5852 + Paste data into existing item. This is complicated by the fact that after
5853 + we shifted something to the left or right neighbors trying to free some
5854 + space, item we were supposed to paste into can be in different node than
5855 + insertion coord. If so, we are no longer doing paste, but insert. See
5856 + comments in insert_paste_common().
5857 +
5858 +*/
5859 +static int carry_paste(carry_op * op /* operation to be performed */ ,
5860 + carry_level * doing UNUSED_ARG /* current carry
5861 + * level */ ,
5862 + carry_level * todo /* next carry level */ )
5863 +{
5864 + znode *node;
5865 + carry_insert_data cdata;
5866 + coord_t dcoord;
5867 + reiser4_item_data data;
5868 + int result;
5869 + int real_size;
5870 + item_plugin *iplug;
5871 + carry_plugin_info info;
5872 + coord_t *coord;
5873 +
5874 + assert("nikita-982", op != NULL);
5875 + assert("nikita-983", todo != NULL);
5876 + assert("nikita-984", op->op == COP_PASTE);
5877 +
5878 + coord_init_zero(&dcoord);
5879 +
5880 + result = insert_paste_common(op, doing, todo, &cdata, &dcoord, &data);
5881 + if (result != 0)
5882 + return result;
5883 +
5884 + coord = op->u.insert.d->coord;
5885 +
5886 +	/* handle the case when op -> u.insert.coord doesn't point to an item
5887 +	   of the required type. Restart as insert. */
5888 + if (!can_paste(coord, op->u.insert.d->key, op->u.insert.d->data)) {
5889 + op->op = COP_INSERT;
5890 + op->u.insert.type = COPT_PASTE_RESTARTED;
5891 + result = op_dispatch_table[COP_INSERT].handler(op, doing, todo);
5892 +
5893 + return result;
5894 + }
5895 +
5896 + node = coord->node;
5897 + iplug = item_plugin_by_coord(coord);
5898 + assert("nikita-992", iplug != NULL);
5899 +
5900 + assert("nikita-985", node != NULL);
5901 + assert("nikita-986", node_plugin_by_node(node) != NULL);
5902 +
5903 + assert("nikita-987",
5904 + space_needed_for_op(node, op) <= znode_free_space(node));
5905 +
5906 + assert("nikita-1286", coord_is_existing_item(coord));
5907 +
5908 + /*
5909 + * if item is expanded as a result of this operation, we should first
5910 +	 * change the item size, then call the ->b.paste item method. If the item
5911 +	 * is shrunk, it should be done the other way around: first call the
5912 +	 * ->b.paste method, then reduce the item size.
5913 + */
5914 +
5915 + real_size = space_needed_for_op(node, op);
5916 + if (real_size > 0)
5917 + node->nplug->change_item_size(coord, real_size);
5918 +
5919 + doing->restartable = 0;
5920 + info.doing = doing;
5921 + info.todo = todo;
5922 +
5923 + result = iplug->b.paste(coord, op->u.insert.d->data, &info);
5924 +
5925 + if (real_size < 0)
5926 + node->nplug->change_item_size(coord, real_size);
5927 +
5928 + /* if we pasted at the beginning of the item, update item's key. */
5929 + if (coord->unit_pos == 0 && coord->between != AFTER_UNIT)
5930 + node->nplug->update_item_key(coord, op->u.insert.d->key, &info);
5931 +
5932 + znode_make_dirty(node);
5933 + return result;
5934 +}
5935 +
5936 +/* handle carry COP_EXTENT operation. */
5937 +static int carry_extent(carry_op * op /* operation to perform */ ,
5938 + carry_level * doing /* queue of operations @op
5939 + * is part of */ ,
5940 + carry_level * todo /* queue where new operations
5941 + * are accumulated */ )
5942 +{
5943 + znode *node;
5944 + carry_insert_data cdata;
5945 + coord_t coord;
5946 + reiser4_item_data data;
5947 + carry_op *delete_dummy;
5948 + carry_op *insert_extent;
5949 + int result;
5950 + carry_plugin_info info;
5951 +
5952 + assert("nikita-1751", op != NULL);
5953 + assert("nikita-1752", todo != NULL);
5954 + assert("nikita-1753", op->op == COP_EXTENT);
5955 +
5956 + /* extent insertion overview:
5957 +
5958 + extents live on the TWIG LEVEL, which is level one above the leaf
5959 + one. This complicates extent insertion logic somewhat: it may
5960 +	   happen (and is going to happen all the time) that in logical key
5961 + ordering extent has to be placed between items I1 and I2, located
5962 + at the leaf level, but I1 and I2 are in the same formatted leaf
5963 + node N1. To insert extent one has to
5964 +
5965 + (1) reach node N1 and shift data between N1, its neighbors and
5966 + possibly newly allocated nodes until I1 and I2 fall into different
5967 + nodes. Since I1 and I2 are still neighboring items in logical key
5968 + order, they will be necessary utmost items in their respective
5969 + nodes.
5970 +
5971 + (2) After this new extent item is inserted into node on the twig
5972 + level.
5973 +
5974 + Fortunately this process can reuse almost all code from standard
5975 + insertion procedure (viz. make_space() and insert_paste_common()),
5976 + due to the following observation: make_space() only shifts data up
5977 + to and excluding or including insertion point. It never
5978 + "over-moves" through insertion point. Thus, one can use
5979 + make_space() to perform step (1). All required for this is just to
5980 + instruct free_space_shortage() to keep make_space() shifting data
5981 + until insertion point is at the node border.
5982 +
5983 + */
5984 +
5985 + /* perform common functionality of insert and paste. */
5986 + result = insert_paste_common(op, doing, todo, &cdata, &coord, &data);
5987 + if (result != 0)
5988 + return result;
5989 +
5990 + node = op->u.extent.d->coord->node;
5991 + assert("nikita-1754", node != NULL);
5992 + assert("nikita-1755", node_plugin_by_node(node) != NULL);
5993 + assert("nikita-1700", coord_wrt(op->u.extent.d->coord) != COORD_INSIDE);
5994 +
5995 + /* NOTE-NIKITA add some checks here. Not assertions, -EIO. Check that
5996 + extent fits between items. */
5997 +
5998 + info.doing = doing;
5999 + info.todo = todo;
6000 +
6001 + /* there is another complication due to placement of extents on the
6002 + twig level: extents are "rigid" in the sense that key-range
6003 + occupied by extent cannot grow indefinitely to the right as it is
6004 + for the formatted leaf nodes. Because of this when search finds two
6005 + adjacent extents on the twig level, it has to "drill" to the leaf
6006 + level, creating new node. Here we are removing this node.
6007 + */
6008 + if (node_is_empty(node)) {
6009 + delete_dummy = node_post_carry(&info, COP_DELETE, node, 1);
6010 + if (IS_ERR(delete_dummy))
6011 + return PTR_ERR(delete_dummy);
6012 + delete_dummy->u.delete.child = NULL;
6013 + delete_dummy->u.delete.flags = DELETE_RETAIN_EMPTY;
6014 + ZF_SET(node, JNODE_HEARD_BANSHEE);
6015 + }
6016 +
6017 + /* proceed with inserting extent item into parent. We are definitely
6018 + inserting rather than pasting if we get that far. */
6019 + insert_extent = node_post_carry(&info, COP_INSERT, node, 1);
6020 + if (IS_ERR(insert_extent))
6021 + /* @delete_dummy will be automatically destroyed on the level
6022 + exiting */
6023 + return PTR_ERR(insert_extent);
6024 + /* NOTE-NIKITA insertion by key is simplest option here. Another
6025 + possibility is to insert on the left or right of already existing
6026 + item.
6027 + */
6028 + insert_extent->u.insert.type = COPT_KEY;
6029 + insert_extent->u.insert.d = op->u.extent.d;
6030 + assert("nikita-1719", op->u.extent.d->key != NULL);
6031 + insert_extent->u.insert.d->data->arg = op->u.extent.d->coord;
6032 + insert_extent->u.insert.flags =
6033 + znode_get_tree(node)->carry.new_extent_flags;
6034 +
6035 + /*
6036 + * if carry was asked to track lock handle we should actually track
6037 + * lock handle on the twig node rather than on the leaf where
6038 + * operation was started from. Transfer tracked lock handle.
6039 + */
6040 + if (doing->track_type) {
6041 + assert("nikita-3242", doing->tracked != NULL);
6042 + assert("nikita-3244", todo->tracked == NULL);
6043 + todo->tracked = doing->tracked;
6044 + todo->track_type = CARRY_TRACK_NODE;
6045 + doing->tracked = NULL;
6046 + doing->track_type = 0;
6047 + }
6048 +
6049 + return 0;
6050 +}
6051 +
6052 +/* update key in @parent between pointers to @left and @right.
6053 +
6054 + Find coords of @left and @right and update delimiting key between them.
6055 + This is helper function called by carry_update(). Finds position of
6056 + internal item involved. Updates item key. Updates delimiting keys of child
6057 + nodes involved.
6058 +*/
6059 +static int update_delimiting_key(znode * parent /* node key is updated
6060 + * in */ ,
6061 + znode * left /* child of @parent */ ,
6062 + znode * right /* child of @parent */ ,
6063 + carry_level * doing /* current carry
6064 + * level */ ,
6065 + carry_level * todo /* parent carry
6066 + * level */ ,
6067 + const char **error_msg /* place to
6068 + * store error
6069 + * message */ )
6070 +{
6071 + coord_t left_pos;
6072 + coord_t right_pos;
6073 + int result;
6074 + reiser4_key ldkey;
6075 + carry_plugin_info info;
6076 +
6077 + assert("nikita-1177", right != NULL);
6078 +	/* find position of the right child in the parent */
6079 + result = find_child_ptr(parent, right, &right_pos);
6080 + if (result != NS_FOUND) {
6081 + *error_msg = "Cannot find position of right child";
6082 + return result;
6083 + }
6084 +
6085 + if ((left != NULL) && !coord_is_leftmost_unit(&right_pos)) {
6086 + /* find position of the left child in a parent */
6087 + result = find_child_ptr(parent, left, &left_pos);
6088 + if (result != NS_FOUND) {
6089 + *error_msg = "Cannot find position of left child";
6090 + return result;
6091 + }
6092 + assert("nikita-1355", left_pos.node != NULL);
6093 + } else
6094 + left_pos.node = NULL;
6095 +
6096 + /* check that they are separated by exactly one key and are basically
6097 + sane */
6098 + if (REISER4_DEBUG) {
6099 + if ((left_pos.node != NULL)
6100 + && !coord_is_existing_unit(&left_pos)) {
6101 + *error_msg = "Left child is bastard";
6102 + return RETERR(-EIO);
6103 + }
6104 + if (!coord_is_existing_unit(&right_pos)) {
6105 + *error_msg = "Right child is bastard";
6106 + return RETERR(-EIO);
6107 + }
6108 + if (left_pos.node != NULL &&
6109 + !coord_are_neighbors(&left_pos, &right_pos)) {
6110 + *error_msg = "Children are not direct siblings";
6111 + return RETERR(-EIO);
6112 + }
6113 + }
6114 + *error_msg = NULL;
6115 +
6116 + info.doing = doing;
6117 + info.todo = todo;
6118 +
6119 + /*
6120 + * If child node is not empty, new key of internal item is a key of
6121 + * leftmost item in the child node. If the child is empty, take its
6122 + * right delimiting key as a new key of the internal item. Precise key
6123 + * in the latter case is not important per se, because the child (and
6124 + * the internal item) are going to be killed shortly anyway, but we
6125 + * have to preserve correct order of keys in the parent node.
6126 + */
6127 +
6128 + if (!ZF_ISSET(right, JNODE_HEARD_BANSHEE))
6129 + leftmost_key_in_node(right, &ldkey);
6130 + else {
6131 + read_lock_dk(znode_get_tree(parent));
6132 + ldkey = *znode_get_rd_key(right);
6133 + read_unlock_dk(znode_get_tree(parent));
6134 + }
6135 + node_plugin_by_node(parent)->update_item_key(&right_pos, &ldkey, &info);
6136 + doing->restartable = 0;
6137 + znode_make_dirty(parent);
6138 + return 0;
6139 +}
6140 +
6141 +/* implements COP_UPDATE operation
6142 +
6143 + Update delimiting keys.
6144 +
6145 +*/
6146 +static int carry_update(carry_op * op /* operation to be performed */ ,
6147 + carry_level * doing /* current carry level */ ,
6148 + carry_level * todo /* next carry level */ )
6149 +{
6150 + int result;
6151 + carry_node *missing UNUSED_ARG;
6152 + znode *left;
6153 + znode *right;
6154 + carry_node *lchild;
6155 + carry_node *rchild;
6156 + const char *error_msg;
6157 + reiser4_tree *tree;
6158 +
6159 + /*
6160 +	 * This operation is called to update the key of an internal item. This is
6161 +	 * necessary when carry shifted or cut data on the child
6162 + * level. Arguments of this operation are:
6163 + *
6164 + * @right --- child node. Operation should update key of internal
6165 + * item pointing to @right.
6166 + *
6167 + * @left --- left neighbor of @right. This parameter is optional.
6168 + */
6169 +
6170 + assert("nikita-902", op != NULL);
6171 + assert("nikita-903", todo != NULL);
6172 + assert("nikita-904", op->op == COP_UPDATE);
6173 +
6174 + lchild = op->u.update.left;
6175 + rchild = op->node;
6176 +
6177 + if (lchild != NULL) {
6178 + assert("nikita-1001", lchild->parent);
6179 + assert("nikita-1003", !lchild->left);
6180 + left = carry_real(lchild);
6181 + } else
6182 + left = NULL;
6183 +
6184 + tree = znode_get_tree(rchild->node);
6185 + read_lock_tree(tree);
6186 + right = znode_parent(rchild->node);
6187 + read_unlock_tree(tree);
6188 +
6189 + if (right != NULL) {
6190 + result = update_delimiting_key(right,
6191 + lchild ? lchild->node : NULL,
6192 + rchild->node,
6193 + doing, todo, &error_msg);
6194 + } else {
6195 + error_msg = "Cannot find node to update key in";
6196 + result = RETERR(-EIO);
6197 + }
6198 + /* operation will be reposted to the next level by the
6199 + ->update_item_key() method of node plugin, if necessary. */
6200 +
6201 + if (result != 0) {
6202 + warning("nikita-999", "Error updating delimiting key: %s (%i)",
6203 + error_msg ? : "", result);
6204 + }
6205 + return result;
6206 +}
6207 +
6208 +/* move items from @node during carry */
6209 +static int carry_shift_data(sideof side /* in what direction to move data */ ,
6210 + coord_t * insert_coord /* coord where new item
6211 + * is to be inserted */ ,
6212 + znode * node /* node which data are moved from */ ,
6213 + carry_level * doing /* active carry queue */ ,
6214 + carry_level * todo /* carry queue where new
6215 + * operations are to be put
6216 + * in */ ,
6217 + unsigned int including_insert_coord_p /* true if
6218 + * @insertion_coord
6219 + * can be moved */ )
6220 +{
6221 + int result;
6222 + znode *source;
6223 + carry_plugin_info info;
6224 + node_plugin *nplug;
6225 +
6226 + source = insert_coord->node;
6227 +
6228 + info.doing = doing;
6229 + info.todo = todo;
6230 +
6231 + nplug = node_plugin_by_node(node);
6232 + result = nplug->shift(insert_coord, node,
6233 + (side == LEFT_SIDE) ? SHIFT_LEFT : SHIFT_RIGHT, 0,
6234 + (int)including_insert_coord_p, &info);
6235 + /* the only error ->shift() method of node plugin can return is
6236 + -ENOMEM due to carry node/operation allocation. */
6237 + assert("nikita-915", result >= 0 || result == -ENOMEM);
6238 + if (result > 0) {
6239 + /*
6240 + * if some number of bytes was actually shifted, mark nodes
6241 + * dirty, and carry level as non-restartable.
6242 + */
6243 + doing->restartable = 0;
6244 + znode_make_dirty(source);
6245 + znode_make_dirty(node);
6246 + }
6247 +
6248 + assert("nikita-2077", coord_check(insert_coord));
6249 + return 0;
6250 +}
6251 +
6252 +typedef carry_node *(*carry_iterator) (carry_node * node);
6253 +static carry_node *find_dir_carry(carry_node * node, carry_level * level,
6254 + carry_iterator iterator);
6255 +
6256 +static carry_node *pool_level_list_prev(carry_node *node)
6257 +{
6258 + return list_entry(node->header.level_linkage.prev, carry_node, header.level_linkage);
6259 +}
6260 +
6261 +/* look for the left neighbor of given carry node in a carry queue.
6262 +
6263 + This is used by find_left_neighbor(), but I am not sure that this
6264 + really gives any advantage. More statistics required.
6265 +
6266 +*/
6267 +carry_node *find_left_carry(carry_node * node /* node to find left neighbor
6268 + * of */ ,
6269 + carry_level * level /* level to scan */ )
6270 +{
6271 + return find_dir_carry(node, level,
6272 + (carry_iterator) pool_level_list_prev);
6273 +}
6274 +
6275 +static carry_node *pool_level_list_next(carry_node *node)
6276 +{
6277 + return list_entry(node->header.level_linkage.next, carry_node, header.level_linkage);
6278 +}
6279 +
6280 +/* look for the right neighbor of given carry node in a
6281 + carry queue.
6282 +
6283 + This is used by find_right_neighbor(), but I am not sure that this
6284 + really gives any advantage. More statistics required.
6285 +
6286 +*/
6287 +carry_node *find_right_carry(carry_node * node /* node to find right neighbor
6288 + * of */ ,
6289 + carry_level * level /* level to scan */ )
6290 +{
6291 + return find_dir_carry(node, level,
6292 + (carry_iterator) pool_level_list_next);
6293 +}
6294 +
6295 +/* look for the left or right neighbor of given carry node in a carry
6296 + queue.
6297 +
6298 + Helper function used by find_{left|right}_carry().
6299 +*/
6300 +static carry_node *find_dir_carry(carry_node * node /* node to start scanning
6301 + * from */ ,
6302 + carry_level * level /* level to scan */ ,
6303 + carry_iterator iterator /* operation to
6304 + * move to the next
6305 + * node */ )
6306 +{
6307 + carry_node *neighbor;
6308 +
6309 + assert("nikita-1059", node != NULL);
6310 + assert("nikita-1060", level != NULL);
6311 +
6312 +	/* scan the list of carry nodes on this level dir-ward, skipping all
6313 + carry nodes referencing the same znode. */
6314 + neighbor = node;
6315 + while (1) {
6316 + neighbor = iterator(neighbor);
6317 + if (carry_node_end(level, neighbor))
6318 + /* list head is reached */
6319 + return NULL;
6320 + if (carry_real(neighbor) != carry_real(node))
6321 + return neighbor;
6322 + }
6323 +}
6324 +
6325 +/*
6326 + * Memory reservation estimation.
6327 + *
6328 + * Carry process proceeds through tree levels upwards. Carry assumes that it
6329 + * takes tree in consistent state (e.g., that search tree invariants hold),
6330 + * and leaves tree consistent after it finishes. This means that when some
6331 + * error occurs carry cannot simply return if there are pending carry
6332 + * operations. Generic solution for this problem is carry-undo either as
6333 + * transaction manager feature (requiring checkpoints and isolation), or
6334 + * through some carry specific mechanism.
6335 + *
6336 + * Our current approach is to panic if carry hits an error while tree is
6337 + * inconsistent. Unfortunately -ENOMEM can easily be triggered. To work around
6338 + * this "memory reservation" mechanism was added.
6339 + *
6340 + * Memory reservation is implemented by perthread-pages.diff patch from
6341 + * core-patches. Its API is defined in <linux/gfp.h>
6342 + *
6343 + * int perthread_pages_reserve(int nrpages, gfp_t gfp);
6344 + * void perthread_pages_release(int nrpages);
6345 + * int perthread_pages_count(void);
6346 + *
6347 + * carry estimates its worst case memory requirements at the entry, reserves
6348 + * enough memory, and releases unused pages before returning.
6349 + *
6350 + * Code below estimates worst case memory requirements for a given carry
6351 + * queue. This is done by summing worst case memory requirements for each
6352 + * operation in the queue.
6353 + *
6354 + */
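+
+/* Editor's note: a minimal sketch of how the reservation API quoted
+ * above is meant to pair up around a carry call. carry_estimate() and
+ * do_carry() are hypothetical placeholders, and releasing the full
+ * reservation is a simplification; the real code releases only the
+ * pages left unused.
+ *
+ *	static int reserved_carry_sketch(int estimated_pages)
+ *	{
+ *		int ret;
+ *
+ *		ret = perthread_pages_reserve(estimated_pages, GFP_KERNEL);
+ *		if (ret != 0)
+ *			return ret;
+ *		ret = do_carry();
+ *		perthread_pages_release(estimated_pages);
+ *		return ret;
+ *	}
+ */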
6355 +
6356 +/*
6357 + * Memory requirements of many operations depend on the tree
6358 + * height. For example, item insertion requires a new node to be inserted at
6359 + * each tree level in the worst case. What tree height should be used for
6360 + * estimation? Current tree height is wrong, because tree height can change
6361 + * between the time when estimation was done and the time when operation is
6362 + * actually performed. Maximal possible tree height (REISER4_MAX_ZTREE_HEIGHT)
6363 + * is also not desirable, because it would lead to huge over-estimation
6364 + * all the time. Plausible solution is "capped tree height": if current tree
6365 + * height is less than some TREE_HEIGHT_CAP constant, capped tree height is
6366 + * TREE_HEIGHT_CAP, otherwise it's current tree height. Idea behind this is
6367 + * that if tree height is TREE_HEIGHT_CAP or larger, it's extremely unlikely
6368 + * to be increased even more during short interval of time.
6369 + */
6370 +#define TREE_HEIGHT_CAP (5)
6371 +
6372 +/* return capped tree height for the @tree. See comment above. */
6373 +static int cap_tree_height(reiser4_tree * tree)
6374 +{
6375 + return max_t(int, tree->height, TREE_HEIGHT_CAP);
6376 +}
6377 +
6378 +/* return capped tree height for the current tree. */
6379 +static int capped_height(void)
6380 +{
6381 + return cap_tree_height(current_tree);
6382 +}
6383 +
6384 +/* return number of pages required to store given number of bytes */
6385 +static int bytes_to_pages(int bytes)
6386 +{
6387 + return (bytes + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
6388 +}
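+
+/* Editor's note: on a system with 4KiB pages (PAGE_CACHE_SIZE == 4096),
+   bytes_to_pages(1) == 1, bytes_to_pages(4096) == 1 and
+   bytes_to_pages(4097) == 2: the shift above rounds up to whole pages. */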
6389 +
6390 +/* how many pages are required to allocate znodes during item insertion. */
6391 +static int carry_estimate_znodes(void)
6392 +{
6393 + /*
6394 +	 * Note that we have a problem here: there is no way to
6395 +	 * reserve pages specifically for a given slab. This means that
6396 +	 * these pages can be hijacked for some other purpose.
6397 + */
6398 +
6399 +	/* in the worst case we need 3 new znodes on each tree level */
6400 + return bytes_to_pages(capped_height() * sizeof(znode) * 3);
6401 +}
6402 +
6403 +/*
6404 + * how many pages are required to load bitmaps. One bitmap per level.
6405 + */
6406 +static int carry_estimate_bitmaps(void)
6407 +{
6408 + if (reiser4_is_set(reiser4_get_current_sb(), REISER4_DONT_LOAD_BITMAP)) {
6409 + int bytes;
6410 +
6411 +		bytes = capped_height() * (0 +	/* a bnode should be added, but it is
6412 +						 * private to bitmap.c; skip for now. */
6413 + 2 * sizeof(jnode)); /* working and commit jnodes */
6414 + return bytes_to_pages(bytes) + 2; /* and their contents */
6415 + } else
6416 + /* bitmaps were pre-loaded during mount */
6417 + return 0;
6418 +}
6419 +
6420 +/* worst case item insertion memory requirements */
6421 +static int carry_estimate_insert(carry_op * op, carry_level * level)
6422 +{
6423 + return carry_estimate_bitmaps() + carry_estimate_znodes() + 1 + /* new atom */
6424 + capped_height() + /* new block on each level */
6425 + 1 + /* and possibly extra new block at the leaf level */
6426 + 3; /* loading of leaves into memory */
6427 +}
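+
+/* worked example (illustrative): with bitmaps pre-loaded (estimate 0), a
+   capped height of 5 and 4096-byte pages, the insert estimate amounts to
+   bytes_to_pages(5 * sizeof(znode) * 3) + 1 + 5 + 1 + 3 pages. */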
6428 +
6429 +/* worst case item deletion memory requirements */
6430 +static int carry_estimate_delete(carry_op * op, carry_level * level)
6431 +{
6432 + return carry_estimate_bitmaps() + carry_estimate_znodes() + 1 + /* new atom */
6433 + 3; /* loading of leaves into memory */
6434 +}
6435 +
6436 +/* worst case tree cut memory requirements */
6437 +static int carry_estimate_cut(carry_op * op, carry_level * level)
6438 +{
6439 + return carry_estimate_bitmaps() + carry_estimate_znodes() + 1 + /* new atom */
6440 + 3; /* loading of leaves into memory */
6441 +}
6442 +
6443 +/* worst case memory requirements of pasting into item */
6444 +static int carry_estimate_paste(carry_op * op, carry_level * level)
6445 +{
6446 + return carry_estimate_bitmaps() + carry_estimate_znodes() + 1 + /* new atom */
6447 + capped_height() + /* new block on each level */
6448 + 1 + /* and possibly extra new block at the leaf level */
6449 + 3; /* loading of leaves into memory */
6450 +}
6451 +
6452 +/* worst case memory requirements of extent insertion */
6453 +static int carry_estimate_extent(carry_op * op, carry_level * level)
6454 +{
6455 + return carry_estimate_insert(op, level) + /* insert extent */
6456 + carry_estimate_delete(op, level); /* kill leaf */
6457 +}
6458 +
6459 +/* worst case memory requirements of key update */
6460 +static int carry_estimate_update(carry_op * op, carry_level * level)
6461 +{
6462 + return 0;
6463 +}
6464 +
6465 +/* worst case memory requirements of flow insertion */
6466 +static int carry_estimate_insert_flow(carry_op * op, carry_level * level)
6467 +{
6468 + int newnodes;
6469 +
6470 + newnodes = min(bytes_to_pages(op->u.insert_flow.flow->length),
6471 + CARRY_FLOW_NEW_NODES_LIMIT);
6472 + /*
6473 + * roughly estimate insert_flow as a sequence of insertions.
6474 + */
6475 + return newnodes * carry_estimate_insert(op, level);
6476 +}
6477 +
6478 +/* This is the dispatch table for carry operations. It could be trivially
6479 +   abstracted into a useful plugin: a tunable balancing policy is a good
6480 +   thing. */
6481 +carry_op_handler op_dispatch_table[COP_LAST_OP] = {
6482 +	[COP_INSERT] = {
6483 +		.handler = carry_insert,
6484 +		.estimate = carry_estimate_insert
6485 +	},
6486 +	[COP_DELETE] = {
6487 +		.handler = carry_delete,
6488 +		.estimate = carry_estimate_delete
6489 +	},
6490 +	[COP_CUT] = {
6491 +		.handler = carry_cut,
6492 +		.estimate = carry_estimate_cut
6493 +	},
6494 +	[COP_PASTE] = {
6495 +		.handler = carry_paste,
6496 +		.estimate = carry_estimate_paste
6497 +	},
6498 +	[COP_EXTENT] = {
6499 +		.handler = carry_extent,
6500 +		.estimate = carry_estimate_extent
6501 +	},
6502 +	[COP_UPDATE] = {
6503 +		.handler = carry_update,
6504 +		.estimate = carry_estimate_update
6505 +	},
6506 +	[COP_INSERT_FLOW] = {
6507 +		.handler = carry_insert_flow,
6508 +		.estimate = carry_estimate_insert_flow}
6509 +};
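+
+/*
+ * Usage sketch (an assumption, not code from this file): carry presumably
+ * consults the table by operation code, along the lines of
+ *
+ *	carry_op_handler *h = &op_dispatch_table[op->op];
+ *	reserve += h->estimate(op, level);
+ *	...
+ *	result = h->handler(op, doing, todo);
+ */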
6510 +
6511 +/* Make Linus happy.
6512 + Local variables:
6513 + c-indentation-style: "K&R"
6514 + mode-name: "LC"
6515 + c-basic-offset: 8
6516 + tab-width: 8
6517 + fill-column: 120
6518 + scroll-step: 1
6519 + End:
6520 +*/
6521 Index: linux-2.6.16/fs/reiser4/carry_ops.h
6522 ===================================================================
6523 --- /dev/null
6524 +++ linux-2.6.16/fs/reiser4/carry_ops.h
6525 @@ -0,0 +1,42 @@
6526 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
6527 +
6528 +/* implementation of carry operations. See carry_ops.c for details. */
6529 +
6530 +#if !defined( __CARRY_OPS_H__ )
6531 +#define __CARRY_OPS_H__
6532 +
6533 +#include "forward.h"
6534 +#include "znode.h"
6535 +#include "carry.h"
6536 +
6537 +/* carry operation handlers */
6538 +typedef struct carry_op_handler {
6539 + /* perform operation */
6540 + int (*handler) (carry_op * op, carry_level * doing, carry_level * todo);
6541 + /* estimate memory requirements for @op */
6542 + int (*estimate) (carry_op * op, carry_level * level);
6543 +} carry_op_handler;
6544 +
6545 +/* This is the dispatch table for carry operations. It could be trivially
6546 +   abstracted into a useful plugin: a tunable balancing policy is a good
6547 +   thing. */
6548 +extern carry_op_handler op_dispatch_table[COP_LAST_OP];
6549 +
6550 +unsigned int space_needed(const znode * node, const coord_t * coord,
6551 + const reiser4_item_data * data, int inserting);
6552 +extern carry_node *find_left_carry(carry_node * node, carry_level * level);
6553 +extern carry_node *find_right_carry(carry_node * node, carry_level * level);
6554 +
6555 +/* __CARRY_OPS_H__ */
6556 +#endif
6557 +
6558 +/* Make Linus happy.
6559 + Local variables:
6560 + c-indentation-style: "K&R"
6561 + mode-name: "LC"
6562 + c-basic-offset: 8
6563 + tab-width: 8
6564 + fill-column: 120
6565 + scroll-step: 1
6566 + End:
6567 +*/
6568 Index: linux-2.6.16/fs/reiser4/context.c
6569 ===================================================================
6570 --- /dev/null
6571 +++ linux-2.6.16/fs/reiser4/context.c
6572 @@ -0,0 +1,278 @@
6573 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
6574 +
6575 +/* Manipulation of reiser4_context */
6576 +
6577 +/*
6578 + * global context used during system call. Variable of this type is allocated
6579 + * on the stack at the beginning of the reiser4 part of the system call and
6580 + * pointer to it is stored in the current->fs_context. This allows us to avoid
6581 + * passing pointer to current transaction and current lockstack (both in
6582 + * one-to-one mapping with threads) all over the call chain.
6583 + *
6584 + * It's kind of like those global variables the prof used to tell you not to
6585 + * use in CS1, except thread-specific. ;-) Nikita, this was a good idea.
6586 + *
6587 + * In some situations it is desirable to have ability to enter reiser4_context
6588 + * more than once for the same thread (nested contexts). For example, there
6589 + * are some functions that can be called either directly from VFS/VM or from
6590 + * already active reiser4 context (->writepage, for example).
6591 + *
6592 + * In such situations the "child" context acts like a dummy: all activity is
6593 + * actually performed in the top level context, and get_current_context()
6594 + * always returns top level context. Of course, init_context()/done_context()
6595 + * have to be properly nested anyway.
6596 + *
6597 + * Note that there is an important difference between the way reiser4 uses
6598 + * ->fs_context and the way other file systems use it. Other file systems
6599 + * (ext3 and reiserfs) use ->fs_context only for the duration of _transaction_
6600 + * (this is why ->fs_context was initially called ->journal_info). This means,
6601 + * that when ext3 or reiserfs finds that ->fs_context is not NULL on the entry
6602 + * to the file system, they assume that some transaction is already underway,
6603 + * and usually bail out, because starting nested transaction would most likely
6604 + * lead to the deadlock. This gives false positives with reiser4, because we
6605 + * set ->fs_context before starting transaction.
6606 + */
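+
+/*
+ * Usage sketch (illustrative only): a typical reiser4 entry point brackets
+ * its work with context creation and destruction; do_reiser4_work() is a
+ * hypothetical placeholder.
+ *
+ *	reiser4_context *ctx = init_context(sb);
+ *	if (IS_ERR(ctx))
+ *		return PTR_ERR(ctx);
+ *	result = do_reiser4_work();
+ *	reiser4_exit_context(ctx);
+ *
+ * A nested call from the same thread on the same super block returns the
+ * same context with ->nr_children incremented, so init/exit pairs nest.
+ */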
6607 +
6608 +#include "debug.h"
6609 +#include "super.h"
6610 +#include "context.h"
6611 +
6612 +#include <linux/writeback.h> /* balance_dirty_pages() */
6613 +#include <linux/hardirq.h>
6614 +
6615 +
6616 +static void _init_context(reiser4_context * context, struct super_block *super)
6617 +{
6618 + memset(context, 0, sizeof(*context));
6619 +
6620 + context->super = super;
6621 + context->magic = context_magic;
6622 + context->outer = current->journal_info;
6623 + current->journal_info = (void *)context;
6624 + context->nr_children = 0;
6625 + context->gfp_mask = GFP_KERNEL;
6626 +
6627 + init_lock_stack(&context->stack);
6628 +
6629 + txn_begin(context);
6630 +
6631 + /* initialize head of tap list */
6632 + INIT_LIST_HEAD(&context->taps);
6633 +#if REISER4_DEBUG
6634 + context->task = current;
6635 +#endif
6636 + grab_space_enable();
6637 +}
6638 +
6639 +/* initialize context and bind it to the current thread
6640 +
6641 + This function should be called at the beginning of reiser4 part of
6642 + syscall.
6643 +*/
6644 +reiser4_context *init_context(struct super_block *super /* super block we are going to
6645 + * work with */ )
6646 +{
6647 + reiser4_context *context;
6648 +
6649 + assert("nikita-2662", !in_interrupt() && !in_irq());
6650 + assert("nikita-3357", super != NULL);
6651 + assert("nikita-3358", super->s_op == NULL || is_reiser4_super(super));
6652 +
6653 + context = get_current_context_check();
6654 + if (context && context->super == super) {
6655 + context = (reiser4_context *) current->journal_info;
6656 + context->nr_children++;
6657 + return context;
6658 + }
6659 +
6660 + context = kmalloc(sizeof(*context), GFP_KERNEL);
6661 + if (context == NULL)
6662 + return ERR_PTR(RETERR(-ENOMEM));
6663 +
6664 + _init_context(context, super);
6665 + return context;
6666 +}
6667 +
6668 +/* this is used in scan_mgr, which is called with a spinlock held, and in
6669 +   reiser4_fill_super magic */
6670 +void init_stack_context(reiser4_context *context, struct super_block *super)
6671 +{
6672 + assert("nikita-2662", !in_interrupt() && !in_irq());
6673 + assert("nikita-3357", super != NULL);
6674 + assert("nikita-3358", super->s_op == NULL || is_reiser4_super(super));
6675 + assert("vs-12", !is_in_reiser4_context());
6676 +
6677 + _init_context(context, super);
6678 + context->on_stack = 1;
6679 + return;
6680 +}
6681 +
6682 +/* cast lock stack embedded into reiser4 context up to its container */
6683 +reiser4_context *get_context_by_lock_stack(lock_stack * owner)
6684 +{
6685 + return container_of(owner, reiser4_context, stack);
6686 +}
6687 +
6688 +/* true if there is already _any_ reiser4 context for the current thread */
6689 +int is_in_reiser4_context(void)
6690 +{
6691 + reiser4_context *ctx;
6692 +
6693 + ctx = current->journal_info;
6694 + return ctx != NULL && ((unsigned long)ctx->magic) == context_magic;
6695 +}
6696 +
6697 +/*
6698 + * call balance dirty pages for the current context.
6699 + *
6700 + * File system is expected to call balance_dirty_pages_ratelimited() whenever
6701 + * it dirties a page. reiser4 does this for unformatted nodes (that is, during
6702 + * write---this covers the vast majority of all dirty traffic), but we cannot do
6703 + * this immediately when formatted node is dirtied, because long term lock is
6704 + * usually held at that time. To work around this, dirtying of formatted node
6705 + * simply increases ->nr_marked_dirty counter in the current reiser4
6706 + * context. When we are about to leave this context,
6707 + * balance_dirty_pages_ratelimited() is called, if necessary.
6708 + *
6709 + * This introduces another problem: sometimes we do not want to run
6710 + * balance_dirty_pages_ratelimited() when leaving a context, for example
6711 + * because some important lock (like ->i_mutex on the parent directory) is
6712 + * held. To achieve this, ->nobalance flag can be set in the current context.
6713 + */
6714 +static void balance_dirty_pages_at(reiser4_context *context)
6715 +{
6716 + reiser4_super_info_data *sbinfo = get_super_private(context->super);
6717 +
6718 + /*
6719 + * call balance_dirty_pages_ratelimited() to process formatted nodes
6720 + * dirtied during this system call. Do that only if we are not in mount
6721 + * and there were nodes dirtied in this context and we are not in
6722 + * writepage (to avoid deadlock) and not in pdflush
6723 + */
6724 + if (sbinfo != NULL && sbinfo->fake != NULL &&
6725 + context->nr_marked_dirty != 0 &&
6726 + !(current->flags & PF_MEMALLOC) &&
6727 + !current_is_pdflush())
6728 + balance_dirty_pages_ratelimited(sbinfo->fake->i_mapping);
6729 +}
6730 +
6731 +/* release resources associated with context.
6732 +
6733 + This function should be called at the end of "session" with reiser4,
6734 + typically just before leaving reiser4 driver back to VFS.
6735 +
6736 +   This is a good place to put some debugging consistency checks, e.g. that
6737 +   the thread has released all locks, closed its transcrash, etc.
6738 +
6739 +*/
6740 +static void done_context(reiser4_context * context /* context being released */ )
6741 +{
6742 + assert("nikita-860", context != NULL);
6743 + assert("nikita-859", context->magic == context_magic);
6744 + assert("vs-646", (reiser4_context *) current->journal_info == context);
6745 + assert("zam-686", !in_interrupt() && !in_irq());
6746 +
6747 + /* only do anything when leaving top-level reiser4 context. All nested
6748 + * contexts are just dummies. */
6749 + if (context->nr_children == 0) {
6750 + assert("jmacd-673", context->trans == NULL);
6751 + assert("jmacd-1002", lock_stack_isclean(&context->stack));
6752 + assert("nikita-1936", no_counters_are_held());
6753 + assert("nikita-2626", list_empty_careful(taps_list()));
6754 + assert("zam-1004", ergo(get_super_private(context->super),
6755 + get_super_private(context->super)->delete_sema_owner !=
6756 + current));
6757 +
6758 + /* release all grabbed but as yet unused blocks */
6759 + if (context->grabbed_blocks != 0)
6760 + all_grabbed2free();
6761 +
6762 + /*
6763 + * synchronize against longterm_unlock_znode():
6764 + * wake_up_requestor() wakes up requestors without holding
6765 + * zlock (otherwise they will immediately bump into that lock
6766 + * after wake up on another CPU). To work around (rare)
6767 + * situation where requestor has been woken up asynchronously
6768 + * and managed to run until completion (and destroy its
6769 + * context and lock stack) before wake_up_requestor() called
6770 +		 * wake_up() on it, wake_up_requestor() synchronizes on the lock
6771 + * stack spin lock. It has actually been observed that spin
6772 + * lock _was_ locked at this point, because
6773 +		 * wake_up_requestor() took an interrupt.
6774 + */
6775 + spin_lock_stack(&context->stack);
6776 + spin_unlock_stack(&context->stack);
6777 +
6778 + assert("zam-684", context->nr_children == 0);
6779 + /* restore original ->fs_context value */
6780 + current->journal_info = context->outer;
6781 + if (context->on_stack == 0)
6782 + kfree(context);
6783 + } else {
6784 + context->nr_children--;
6785 +#if REISER4_DEBUG
6786 + assert("zam-685", context->nr_children >= 0);
6787 +#endif
6788 + }
6789 +}
6790 +
6791 +/*
6792 + * exit reiser4 context. Call balance_dirty_pages_at() if necessary. Close
6793 + * transaction. Call done_context() to do context related book-keeping.
6794 + */
6795 +void reiser4_exit_context(reiser4_context * context)
6796 +{
6797 + assert("nikita-3021", schedulable());
6798 +
6799 + if (context->nr_children == 0) {
6800 + if (!context->nobalance) {
6801 + txn_restart(context);
6802 + balance_dirty_pages_at(context);
6803 + }
6804 +
6805 +		/* if the filesystem is mounted with -o sync or -o dirsync, commit
6806 +		   the transaction. FIXME: TXNH_DONT_COMMIT is used to avoid
6807 +		   committing on exit_context when the inode semaphore is held,
6808 +		   and to have ktxnmgrd do the commit instead, to get better
6809 +		   concurrent filesystem access. But someone who mounts with -o
6810 +		   sync cares more about reliability than about
6811 +		   performance. So, for now, we have this simple mount -o sync
6812 +		   support. */
6813 + if (context->super->s_flags & (MS_SYNCHRONOUS | MS_DIRSYNC)) {
6814 + txn_atom *atom;
6815 +
6816 + atom = get_current_atom_locked_nocheck();
6817 + if (atom) {
6818 + atom->flags |= ATOM_FORCE_COMMIT;
6819 + context->trans->flags &= ~TXNH_DONT_COMMIT;
6820 + spin_unlock_atom(atom);
6821 + }
6822 + }
6823 + txn_end(context);
6824 + }
6825 + done_context(context);
6826 +}
6827 +
6828 +void set_gfp_mask(void)
6829 +{
6830 + reiser4_context *ctx;
6831 +
6832 + ctx = get_current_context();
6833 + if (ctx->entd == 0 &&
6834 + list_empty(&ctx->stack.locks) &&
6835 + ctx->trans->atom == NULL)
6836 + ctx->gfp_mask = GFP_KERNEL;
6837 + else
6838 + ctx->gfp_mask = GFP_NOFS;
6839 +}
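+
+/* i.e., allocations are allowed to recurse into the filesystem (GFP_KERNEL)
+   only while the thread holds no long-term locks and is not attached to an
+   atom; otherwise recursion could deadlock, so GFP_NOFS is used. */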
6840 +
6841 +/*
6842 + * Local variables:
6843 + * c-indentation-style: "K&R"
6844 + * mode-name: "LC"
6845 + * c-basic-offset: 8
6846 + * tab-width: 8
6847 + * fill-column: 120
6848 + * scroll-step: 1
6849 + * End:
6850 + */
6851 Index: linux-2.6.16/fs/reiser4/context.h
6852 ===================================================================
6853 --- /dev/null
6854 +++ linux-2.6.16/fs/reiser4/context.h
6855 @@ -0,0 +1,228 @@
6856 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
6857 + * reiser4/README */
6858 +
6859 +/* Reiser4 context. See context.c for details. */
6860 +
6861 +#if !defined( __REISER4_CONTEXT_H__ )
6862 +#define __REISER4_CONTEXT_H__
6863 +
6864 +#include "forward.h"
6865 +#include "debug.h"
6866 +#include "dformat.h"
6867 +#include "tap.h"
6868 +#include "lock.h"
6869 +
6870 +#include <linux/types.h> /* for __u?? */
6871 +#include <linux/fs.h> /* for struct super_block */
6872 +#include <linux/spinlock.h>
6873 +#include <linux/sched.h> /* for struct task_struct */
6874 +
6875 +
6876 +/* reiser4 per-thread context */
6877 +struct reiser4_context {
6878 + /* magic constant. For identification of reiser4 contexts. */
6879 + __u32 magic;
6880 +
6881 + /* current lock stack. See lock.[ch]. This is where list of all
6882 + locks taken by current thread is kept. This is also used in
6883 + deadlock detection. */
6884 + lock_stack stack;
6885 +
6886 + /* current transcrash. */
6887 + txn_handle *trans;
6888 + /* transaction handle embedded into reiser4_context. ->trans points
6889 + * here by default. */
6890 + txn_handle trans_in_ctx;
6891 +
6892 + /* super block we are working with. To get the current tree
6893 + use &get_super_private (reiser4_get_current_sb ())->tree. */
6894 + struct super_block *super;
6895 +
6896 + /* parent fs activation */
6897 + struct fs_activation *outer;
6898 +
6899 + /* per-thread grabbed (for further allocation) blocks counter */
6900 + reiser4_block_nr grabbed_blocks;
6901 +
6902 + /* list of taps currently monitored. See tap.c */
6903 + struct list_head taps;
6904 +
6905 + /* grabbing space is enabled */
6906 + unsigned int grab_enabled:1;
6907 +	/* should be set when we write dirty nodes to disk in jnode_flush or
6908 + * reiser4_write_logs() */
6909 + unsigned int writeout_mode:1;
6910 + /* true, if current thread is an ent thread */
6911 + unsigned int entd:1;
6912 + /* true, if balance_dirty_pages() should not be run when leaving this
6913 +	 * context. This is used to avoid lengthy balance_dirty_pages()
6914 + * operation when holding some important resource, like directory
6915 + * ->i_mutex */
6916 + unsigned int nobalance:1;
6917 +
6918 + /* this bit is used on done_context to decide whether context is
6919 + kmalloc-ed and has to be kfree-ed */
6920 + unsigned int on_stack:1;
6921 +
6922 + /* count non-trivial jnode_set_dirty() calls */
6923 + unsigned long nr_marked_dirty;
6924 +
6925 +	/* reiser4_sync_inodes calls (via generic_sync_sb_inodes)
6926 +	 * reiser4_writepages for each dirty inode. reiser4_writepages
6927 +	 * captures pages. When the number of pages captured in one
6928 +	 * reiser4_sync_inodes call reaches some threshold, some atoms get
6929 +	 * flushed */
6930 + int nr_captured;
6931 + int nr_children; /* number of child contexts */
6932 +#if REISER4_DEBUG
6933 + /* debugging information about reiser4 locks held by the current
6934 + * thread */
6935 + lock_counters_info locks;
6936 + struct task_struct *task; /* so we can easily find owner of the stack */
6937 +
6938 + /*
6939 + * disk space grabbing debugging support
6940 + */
6941 + /* how many disk blocks were grabbed by the first call to
6942 + * reiser4_grab_space() in this context */
6943 + reiser4_block_nr grabbed_initially;
6944 +
6945 + /* list of all threads doing flush currently */
6946 + struct list_head flushers_link;
6947 + /* information about last error encountered by reiser4 */
6948 + err_site err;
6949 +#endif
6950 + void *vp;
6951 + gfp_t gfp_mask;
6952 +};
6953 +
6954 +extern reiser4_context *get_context_by_lock_stack(lock_stack *);
6955 +
6956 +/* Debugging helps. */
6957 +#if REISER4_DEBUG
6958 +extern void print_contexts(void);
6959 +#endif
6960 +
6961 +#define current_tree (&(get_super_private(reiser4_get_current_sb())->tree))
6962 +#define current_blocksize reiser4_get_current_sb()->s_blocksize
6963 +#define current_blocksize_bits reiser4_get_current_sb()->s_blocksize_bits
6964 +
6965 +extern reiser4_context *init_context(struct super_block *);
6966 +extern void init_stack_context(reiser4_context *, struct super_block *);
6967 +extern void reiser4_exit_context(reiser4_context *);
6968 +
6969 +/* magic constant we store in reiser4_context allocated on the stack. Used to
6970 +   catch accesses to stale or uninitialized contexts. */
6971 +#define context_magic ((__u32) 0x4b1b5d0b)
6972 +
6973 +extern int is_in_reiser4_context(void);
6974 +
6975 +/*
6976 + * return reiser4_context for the thread @tsk
6977 + */
6978 +static inline reiser4_context *get_context(const struct task_struct *tsk)
6979 +{
6980 + assert("vs-1682",
6981 + ((reiser4_context *) tsk->journal_info)->magic == context_magic);
6982 + return (reiser4_context *) tsk->journal_info;
6983 +}
6984 +
6985 +/*
6986 + * return reiser4 context of the current thread, or NULL if there is none.
6987 + */
6988 +static inline reiser4_context *get_current_context_check(void)
6989 +{
6990 + if (is_in_reiser4_context())
6991 + return get_context(current);
6992 + else
6993 + return NULL;
6994 +}
6995 +
6996 +static inline reiser4_context *get_current_context(void); /* __attribute__((const)); */
6997 +
6998 +/* return context associated with current thread */
6999 +static inline reiser4_context *get_current_context(void)
7000 +{
7001 + return get_context(current);
7002 +}
7003 +
7004 +static inline gfp_t get_gfp_mask(void)
7005 +{
7006 + reiser4_context *ctx;
7007 +
7008 + ctx = get_current_context_check();
7009 + return (ctx == NULL) ? GFP_KERNEL : ctx->gfp_mask;
7010 +}
7011 +
7012 +void set_gfp_mask(void);
7013 +
7014 +/*
7015 + * true if current thread is in the write-out mode. Thread enters write-out
7016 + * mode during jnode_flush and reiser4_write_logs().
7017 + */
7018 +static inline int is_writeout_mode(void)
7019 +{
7020 + return get_current_context()->writeout_mode;
7021 +}
7022 +
7023 +/*
7024 + * enter write-out mode
7025 + */
7026 +static inline void writeout_mode_enable(void)
7027 +{
7028 + assert("zam-941", !get_current_context()->writeout_mode);
7029 + get_current_context()->writeout_mode = 1;
7030 +}
7031 +
7032 +/*
7033 + * leave write-out mode
7034 + */
7035 +static inline void writeout_mode_disable(void)
7036 +{
7037 + assert("zam-942", get_current_context()->writeout_mode);
7038 + get_current_context()->writeout_mode = 0;
7039 +}
7040 +
7041 +static inline void grab_space_enable(void)
7042 +{
7043 + get_current_context()->grab_enabled = 1;
7044 +}
7045 +
7046 +static inline void grab_space_disable(void)
7047 +{
7048 + get_current_context()->grab_enabled = 0;
7049 +}
7050 +
7051 +static inline void grab_space_set_enabled(int enabled)
7052 +{
7053 + get_current_context()->grab_enabled = enabled;
7054 +}
7055 +
7056 +static inline int is_grab_enabled(reiser4_context * ctx)
7057 +{
7058 + return ctx->grab_enabled;
7059 +}
7060 +
7061 +/* mark transaction handle in @ctx as TXNH_DONT_COMMIT, so that no commit or
7062 + * flush would be performed when it is closed. This is necessary when handle
7063 + * has to be closed under some coarse semaphore, like i_mutex of
7064 + * directory. Commit will be performed by ktxnmgrd. */
7065 +static inline void context_set_commit_async(reiser4_context * context)
7066 +{
7067 + context->nobalance = 1;
7068 + context->trans->flags |= TXNH_DONT_COMMIT;
7069 +}
7070 +
7071 +/* __REISER4_CONTEXT_H__ */
7072 +#endif
7073 +
7074 +/* Make Linus happy.
7075 + Local variables:
7076 + c-indentation-style: "K&R"
7077 + mode-name: "LC"
7078 + c-basic-offset: 8
7079 + tab-width: 8
7080 + fill-column: 120
7081 + scroll-step: 1
7082 + End:
7083 +*/
7084 Index: linux-2.6.16/fs/reiser4/coord.c
7085 ===================================================================
7086 --- /dev/null
7087 +++ linux-2.6.16/fs/reiser4/coord.c
7088 @@ -0,0 +1,937 @@
7089 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
7090 +
7091 +#include "forward.h"
7092 +#include "debug.h"
7093 +#include "dformat.h"
7094 +#include "tree.h"
7095 +#include "plugin/item/item.h"
7096 +#include "znode.h"
7097 +#include "coord.h"
7098 +
7099 +/* Internal constructor. */
7100 +static inline void
7101 +coord_init_values(coord_t * coord, const znode * node, pos_in_node_t item_pos,
7102 + pos_in_node_t unit_pos, between_enum between)
7103 +{
7104 + coord->node = (znode *) node;
7105 + coord_set_item_pos(coord, item_pos);
7106 + coord->unit_pos = unit_pos;
7107 + coord->between = between;
7108 + ON_DEBUG(coord->plug_v = 0);
7109 + ON_DEBUG(coord->body_v = 0);
7110 +
7111 + /*ON_TRACE (TRACE_COORDS, "init coord %p node %p: %u %u %s\n", coord, node, item_pos, unit_pos, coord_tween_tostring (between)); */
7112 +}
7113 +
7114 +/* after node content has been shifted, a coord that was set properly may
7115 +   become invalid; try to "normalize" it. */
7116 +void coord_normalize(coord_t * coord)
7117 +{
7118 + znode *node;
7119 +
7120 + node = coord->node;
7121 + assert("vs-683", node);
7122 +
7123 + coord_clear_iplug(coord);
7124 +
7125 + if (node_is_empty(node)) {
7126 + coord_init_first_unit(coord, node);
7127 + } else if ((coord->between == AFTER_ITEM)
7128 + || (coord->between == AFTER_UNIT)) {
7129 + return;
7130 + } else if (coord->item_pos == coord_num_items(coord)
7131 + && coord->between == BEFORE_ITEM) {
7132 + coord_dec_item_pos(coord);
7133 + coord->between = AFTER_ITEM;
7134 + } else if (coord->unit_pos == coord_num_units(coord)
7135 + && coord->between == BEFORE_UNIT) {
7136 + coord->unit_pos--;
7137 + coord->between = AFTER_UNIT;
7138 + } else if (coord->item_pos == coord_num_items(coord)
7139 + && coord->unit_pos == 0 && coord->between == BEFORE_UNIT) {
7140 + coord_dec_item_pos(coord);
7141 + coord->unit_pos = 0;
7142 + coord->between = AFTER_ITEM;
7143 + }
7144 +}
7145 +
7146 +/* Copy a coordinate. */
7147 +void coord_dup(coord_t * coord, const coord_t * old_coord)
7148 +{
7149 + assert("jmacd-9800", coord_check(old_coord));
7150 + coord_dup_nocheck(coord, old_coord);
7151 +}
7152 +
7153 +/* Copy a coordinate without check. Useful when old_coord->node is not
7154 + loaded. As in cbk_tree_lookup -> connect_znode -> connect_one_side */
7155 +void coord_dup_nocheck(coord_t * coord, const coord_t * old_coord)
7156 +{
7157 + coord->node = old_coord->node;
7158 + coord_set_item_pos(coord, old_coord->item_pos);
7159 + coord->unit_pos = old_coord->unit_pos;
7160 + coord->between = old_coord->between;
7161 + coord->iplugid = old_coord->iplugid;
7162 + ON_DEBUG(coord->plug_v = old_coord->plug_v);
7163 + ON_DEBUG(coord->body_v = old_coord->body_v);
7164 +}
7165 +
7166 +/* Initialize an invalid coordinate. */
7167 +void coord_init_invalid(coord_t * coord, const znode * node)
7168 +{
7169 + coord_init_values(coord, node, 0, 0, INVALID_COORD);
7170 +}
7171 +
7172 +void coord_init_first_unit_nocheck(coord_t * coord, const znode * node)
7173 +{
7174 + coord_init_values(coord, node, 0, 0, AT_UNIT);
7175 +}
7176 +
7177 +/* Initialize a coordinate to point at the first unit of the first item. If the node is
7178 + empty, it is positioned at the EMPTY_NODE. */
7179 +void coord_init_first_unit(coord_t * coord, const znode * node)
7180 +{
7181 + int is_empty = node_is_empty(node);
7182 +
7183 + coord_init_values(coord, node, 0, 0, (is_empty ? EMPTY_NODE : AT_UNIT));
7184 +
7185 + assert("jmacd-9801", coord_check(coord));
7186 +}
7187 +
7188 +/* Initialize a coordinate to point at the last unit of the last item. If the node is
7189 + empty, it is positioned at the EMPTY_NODE. */
7190 +void coord_init_last_unit(coord_t * coord, const znode * node)
7191 +{
7192 + int is_empty = node_is_empty(node);
7193 +
7194 + coord_init_values(coord, node,
7195 + (is_empty ? 0 : node_num_items(node) - 1), 0,
7196 + (is_empty ? EMPTY_NODE : AT_UNIT));
7197 + if (!is_empty)
7198 + coord->unit_pos = coord_last_unit_pos(coord);
7199 + assert("jmacd-9802", coord_check(coord));
7200 +}
7201 +
7202 +/* Initialize a coordinate to before the first item. If the node is empty, it is
7203 + positioned at the EMPTY_NODE. */
7204 +void coord_init_before_first_item(coord_t * coord, const znode * node)
7205 +{
7206 + int is_empty = node_is_empty(node);
7207 +
7208 + coord_init_values(coord, node, 0, 0,
7209 + (is_empty ? EMPTY_NODE : BEFORE_UNIT));
7210 +
7211 + assert("jmacd-9803", coord_check(coord));
7212 +}
7213 +
7214 +/* Initialize a coordinate to after the last item. If the node is empty, it is positioned
7215 + at the EMPTY_NODE. */
7216 +void coord_init_after_last_item(coord_t * coord, const znode * node)
7217 +{
7218 + int is_empty = node_is_empty(node);
7219 +
7220 + coord_init_values(coord, node,
7221 + (is_empty ? 0 : node_num_items(node) - 1), 0,
7222 + (is_empty ? EMPTY_NODE : AFTER_ITEM));
7223 +
7224 + assert("jmacd-9804", coord_check(coord));
7225 +}
7226 +
7227 +/* Initialize a coordinate to after last unit in the item. Coord must be set
7228 + already to existing item */
7229 +void coord_init_after_item_end(coord_t * coord)
7230 +{
7231 + coord->between = AFTER_UNIT;
7232 + coord->unit_pos = coord_last_unit_pos(coord);
7233 +}
7234 +
7235 +/* Initialize a coordinate to before the item. Coord must be set already to existing item */
7236 +void coord_init_before_item(coord_t * coord)
7237 +{
7238 + coord->unit_pos = 0;
7239 + coord->between = BEFORE_ITEM;
7240 +}
7241 +
7242 +/* Initialize a coordinate to after the item. Coord must be set already to existing item */
7243 +void coord_init_after_item(coord_t * coord)
7244 +{
7245 + coord->unit_pos = 0;
7246 + coord->between = AFTER_ITEM;
7247 +}
7248 +
7249 +/* Initialize a coordinate with zeros. Used in places where init_coord was
7250 +   used and it was not clear how it should actually be set up. */
7251 +void coord_init_zero(coord_t * coord)
7252 +{
7253 + memset(coord, 0, sizeof(*coord));
7254 +}
7255 +
7256 +/* Return the number of units at the present item. Asserts coord_is_existing_item(). */
7257 +unsigned coord_num_units(const coord_t * coord)
7258 +{
7259 + assert("jmacd-9806", coord_is_existing_item(coord));
7260 +
7261 + return item_plugin_by_coord(coord)->b.nr_units(coord);
7262 +}
7263 +
7264 +/* Returns true if the coord was initialized by coord_init_invalid(). */
7265 +/* Audited by: green(2002.06.15) */
7266 +int coord_is_invalid(const coord_t * coord)
7267 +{
7268 + return coord->between == INVALID_COORD;
7269 +}
7270 +
7271 +/* Returns true if the coordinate is positioned at an existing item, not before or after
7272 + an item. It may be placed at, before, or after any unit within the item, whether
7273 + existing or not. */
7274 +int coord_is_existing_item(const coord_t * coord)
7275 +{
7276 + switch (coord->between) {
7277 + case EMPTY_NODE:
7278 + case BEFORE_ITEM:
7279 + case AFTER_ITEM:
7280 + case INVALID_COORD:
7281 + return 0;
7282 +
7283 + case BEFORE_UNIT:
7284 + case AT_UNIT:
7285 + case AFTER_UNIT:
7286 + return coord->item_pos < coord_num_items(coord);
7287 + }
7288 +
7289 + impossible("jmacd-9900", "unreachable coord: %p", coord);
7290 + return 0;
7291 +}
7292 +
7293 +/* Returns true if the coordinate is positioned at an existing unit, not before or after a
7294 + unit. */
7295 +/* Audited by: green(2002.06.15) */
7296 +int coord_is_existing_unit(const coord_t * coord)
7297 +{
7298 + switch (coord->between) {
7299 + case EMPTY_NODE:
7300 + case BEFORE_UNIT:
7301 + case AFTER_UNIT:
7302 + case BEFORE_ITEM:
7303 + case AFTER_ITEM:
7304 + case INVALID_COORD:
7305 + return 0;
7306 +
7307 + case AT_UNIT:
7308 + return (coord->item_pos < coord_num_items(coord)
7309 + && coord->unit_pos < coord_num_units(coord));
7310 + }
7311 +
7312 + impossible("jmacd-9902", "unreachable");
7313 + return 0;
7314 +}
7315 +
7316 +/* Returns true if the coordinate is positioned at the first unit of the first item. Not
7317 + true for empty nodes nor coordinates positioned before the first item. */
7318 +/* Audited by: green(2002.06.15) */
7319 +int coord_is_leftmost_unit(const coord_t * coord)
7320 +{
7321 + return (coord->between == AT_UNIT && coord->item_pos == 0
7322 + && coord->unit_pos == 0);
7323 +}
7324 +
7325 +#if REISER4_DEBUG
7326 +/* For assertions only, checks for a valid coordinate. */
7327 +int coord_check(const coord_t * coord)
7328 +{
7329 + if (coord->node == NULL) {
7330 + return 0;
7331 + }
7332 + if (znode_above_root(coord->node))
7333 + return 1;
7334 +
7335 + switch (coord->between) {
7336 + default:
7337 + case INVALID_COORD:
7338 + return 0;
7339 + case EMPTY_NODE:
7340 + if (!node_is_empty(coord->node)) {
7341 + return 0;
7342 + }
7343 + return coord->item_pos == 0 && coord->unit_pos == 0;
7344 +
7345 + case BEFORE_UNIT:
7346 + case AFTER_UNIT:
7347 + if (node_is_empty(coord->node) && (coord->item_pos == 0)
7348 + && (coord->unit_pos == 0))
7349 + return 1;
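+		/* FALLTHROUGH */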
7350 + case AT_UNIT:
7351 + break;
7352 + case AFTER_ITEM:
7353 + case BEFORE_ITEM:
7354 + /* before/after item should not set unit_pos. */
7355 + if (coord->unit_pos != 0) {
7356 + return 0;
7357 + }
7358 + break;
7359 + }
7360 +
7361 + if (coord->item_pos >= node_num_items(coord->node)) {
7362 + return 0;
7363 + }
7364 +
7365 + /* FIXME-VS: we are going to check unit_pos. This makes no sense when
7366 +	   between is set to either AFTER_ITEM or BEFORE_ITEM */
7367 + if (coord->between == AFTER_ITEM || coord->between == BEFORE_ITEM)
7368 + return 1;
7369 +
7370 + if (coord_is_iplug_set(coord) &&
7371 + coord->unit_pos >
7372 + item_plugin_by_coord(coord)->b.nr_units(coord) - 1) {
7373 + return 0;
7374 + }
7375 + return 1;
7376 +}
7377 +#endif
7378 +
7379 +/* Adjust coordinate boundaries based on the number of items prior to coord_next/prev.
7380 +   Returns 1 if the new position does not exist. */
7381 +static int coord_adjust_items(coord_t * coord, unsigned items, int is_next)
7382 +{
7383 + /* If the node is invalid, leave it. */
7384 + if (coord->between == INVALID_COORD) {
7385 + return 1;
7386 + }
7387 +
7388 + /* If the node is empty, set it appropriately. */
7389 + if (items == 0) {
7390 + coord->between = EMPTY_NODE;
7391 + coord_set_item_pos(coord, 0);
7392 + coord->unit_pos = 0;
7393 + return 1;
7394 + }
7395 +
7396 + /* If it was empty and it no longer is, set to BEFORE/AFTER_ITEM. */
7397 + if (coord->between == EMPTY_NODE) {
7398 + coord->between = (is_next ? BEFORE_ITEM : AFTER_ITEM);
7399 + coord_set_item_pos(coord, 0);
7400 + coord->unit_pos = 0;
7401 + return 0;
7402 + }
7403 +
7404 +	/* If the item_pos is out-of-range, set it appropriately. */
7405 + if (coord->item_pos >= items) {
7406 + coord->between = AFTER_ITEM;
7407 + coord_set_item_pos(coord, items - 1);
7408 + coord->unit_pos = 0;
7409 + /* If is_next, return 1 (can't go any further). */
7410 + return is_next;
7411 + }
7412 +
7413 + return 0;
7414 +}
7415 +
7416 +/* Advances the coordinate by one unit to the right. If empty, no change. If
7417 + coord_is_rightmost_unit, advances to AFTER THE LAST ITEM. Returns 0 if new position is an
7418 + existing unit. */
7419 +int coord_next_unit(coord_t * coord)
7420 +{
7421 + unsigned items = coord_num_items(coord);
7422 +
7423 + if (coord_adjust_items(coord, items, 1) == 1) {
7424 + return 1;
7425 + }
7426 +
7427 + switch (coord->between) {
7428 + case BEFORE_UNIT:
7429 + /* Now it is positioned at the same unit. */
7430 + coord->between = AT_UNIT;
7431 + return 0;
7432 +
7433 + case AFTER_UNIT:
7434 + case AT_UNIT:
7435 + /* If it was at or after a unit and there are more units in this item,
7436 + advance to the next one. */
7437 + if (coord->unit_pos < coord_last_unit_pos(coord)) {
7438 + coord->unit_pos += 1;
7439 + coord->between = AT_UNIT;
7440 + return 0;
7441 + }
7442 +
7443 + /* Otherwise, it is crossing an item boundary and treated as if it was
7444 + after the current item. */
7445 + coord->between = AFTER_ITEM;
7446 + coord->unit_pos = 0;
7447 + /* FALLTHROUGH */
7448 +
7449 + case AFTER_ITEM:
7450 + /* Check for end-of-node. */
7451 + if (coord->item_pos == items - 1) {
7452 + return 1;
7453 + }
7454 +
7455 + coord_inc_item_pos(coord);
7456 + coord->unit_pos = 0;
7457 + coord->between = AT_UNIT;
7458 + return 0;
7459 +
7460 + case BEFORE_ITEM:
7461 + /* The adjust_items checks ensure that we are valid here. */
7462 + coord->unit_pos = 0;
7463 + coord->between = AT_UNIT;
7464 + return 0;
7465 +
7466 + case INVALID_COORD:
7467 + case EMPTY_NODE:
7468 + /* Handled in coord_adjust_items(). */
7469 + break;
7470 + }
7471 +
7472 + impossible("jmacd-9902", "unreachable");
7473 + return 0;
7474 +}
7475 +
7476 +/* Advances the coordinate by one item to the right. If empty, no change. If
7477 + coord_is_rightmost_unit, advances to AFTER THE LAST ITEM. Returns 0 if new position is
7478 + an existing item. */
7479 +int coord_next_item(coord_t * coord)
7480 +{
7481 + unsigned items = coord_num_items(coord);
7482 +
7483 + if (coord_adjust_items(coord, items, 1) == 1) {
7484 + return 1;
7485 + }
7486 +
7487 + switch (coord->between) {
7488 + case AFTER_UNIT:
7489 + case AT_UNIT:
7490 + case BEFORE_UNIT:
7491 + case AFTER_ITEM:
7492 + /* Check for end-of-node. */
7493 + if (coord->item_pos == items - 1) {
7494 + coord->between = AFTER_ITEM;
7495 + coord->unit_pos = 0;
7496 + coord_clear_iplug(coord);
7497 + return 1;
7498 + }
7499 +
7500 + /* Anywhere in an item, go to the next one. */
7501 + coord->between = AT_UNIT;
7502 + coord_inc_item_pos(coord);
7503 + coord->unit_pos = 0;
7504 + return 0;
7505 +
7506 + case BEFORE_ITEM:
7507 + /* The out-of-range check ensures that we are valid here. */
7508 + coord->unit_pos = 0;
7509 + coord->between = AT_UNIT;
7510 + return 0;
7511 + case INVALID_COORD:
7512 + case EMPTY_NODE:
7513 + /* Handled in coord_adjust_items(). */
7514 + break;
7515 + }
7516 +
7517 + impossible("jmacd-9903", "unreachable");
7518 + return 0;
7519 +}
7520 +
7521 +/* Advances the coordinate by one unit to the left. If empty, no change. If
7522 + coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM. Returns 0 if new position
7523 + is an existing unit. */
7524 +int coord_prev_unit(coord_t * coord)
7525 +{
7526 + unsigned items = coord_num_items(coord);
7527 +
7528 + if (coord_adjust_items(coord, items, 0) == 1) {
7529 + return 1;
7530 + }
7531 +
7532 + switch (coord->between) {
7533 + case AT_UNIT:
7534 + case BEFORE_UNIT:
7535 + if (coord->unit_pos > 0) {
7536 + coord->unit_pos -= 1;
7537 + coord->between = AT_UNIT;
7538 + return 0;
7539 + }
7540 +
7541 + if (coord->item_pos == 0) {
7542 + coord->between = BEFORE_ITEM;
7543 + return 1;
7544 + }
7545 +
7546 + coord_dec_item_pos(coord);
7547 + coord->unit_pos = coord_last_unit_pos(coord);
7548 + coord->between = AT_UNIT;
7549 + return 0;
7550 +
7551 + case AFTER_UNIT:
7552 + /* What if unit_pos is out-of-range? */
7553 + assert("jmacd-5442",
7554 + coord->unit_pos <= coord_last_unit_pos(coord));
7555 + coord->between = AT_UNIT;
7556 + return 0;
7557 +
7558 + case BEFORE_ITEM:
7559 + if (coord->item_pos == 0) {
7560 + return 1;
7561 + }
7562 +
7563 + coord_dec_item_pos(coord);
7564 + /* FALLTHROUGH */
7565 +
7566 + case AFTER_ITEM:
7567 + coord->between = AT_UNIT;
7568 + coord->unit_pos = coord_last_unit_pos(coord);
7569 + return 0;
7570 +
7571 + case INVALID_COORD:
7572 + case EMPTY_NODE:
7573 + break;
7574 + }
7575 +
7576 + impossible("jmacd-9904", "unreachable");
7577 + return 0;
7578 +}
7579 +
7580 +/* Advances the coordinate by one item to the left. If empty, no change. If
7581 + coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM. Returns 0 if new position
7582 + is an existing item. */
7583 +int coord_prev_item(coord_t * coord)
7584 +{
7585 + unsigned items = coord_num_items(coord);
7586 +
7587 + if (coord_adjust_items(coord, items, 0) == 1) {
7588 + return 1;
7589 + }
7590 +
7591 + switch (coord->between) {
7592 + case AT_UNIT:
7593 + case AFTER_UNIT:
7594 + case BEFORE_UNIT:
7595 + case BEFORE_ITEM:
7596 +
7597 + if (coord->item_pos == 0) {
7598 + coord->between = BEFORE_ITEM;
7599 + coord->unit_pos = 0;
7600 + return 1;
7601 + }
7602 +
7603 + coord_dec_item_pos(coord);
7604 + coord->unit_pos = 0;
7605 + coord->between = AT_UNIT;
7606 + return 0;
7607 +
7608 + case AFTER_ITEM:
7609 + coord->between = AT_UNIT;
7610 + coord->unit_pos = 0;
7611 + return 0;
7612 +
7613 + case INVALID_COORD:
7614 + case EMPTY_NODE:
7615 + break;
7616 + }
7617 +
7618 + impossible("jmacd-9905", "unreachable");
7619 + return 0;
7620 +}
7621 +
7622 +/* Calls either coord_init_first_unit or coord_init_last_unit depending on sideof argument. */
7623 +void coord_init_sideof_unit(coord_t * coord, const znode * node, sideof dir)
7624 +{
7625 + assert("jmacd-9821", dir == LEFT_SIDE || dir == RIGHT_SIDE);
7626 + if (dir == LEFT_SIDE) {
7627 + coord_init_first_unit(coord, node);
7628 + } else {
7629 + coord_init_last_unit(coord, node);
7630 + }
7631 +}
7632 +
7633 +/* Calls either coord_is_before_leftmost or coord_is_after_rightmost depending on sideof
7634 + argument. */
7635 +/* Audited by: green(2002.06.15) */
7636 +int coord_is_after_sideof_unit(coord_t * coord, sideof dir)
7637 +{
7638 + assert("jmacd-9822", dir == LEFT_SIDE || dir == RIGHT_SIDE);
7639 + if (dir == LEFT_SIDE) {
7640 + return coord_is_before_leftmost(coord);
7641 + } else {
7642 + return coord_is_after_rightmost(coord);
7643 + }
7644 +}
7645 +
7646 +/* Calls either coord_next_unit or coord_prev_unit depending on sideof argument. */
7647 +/* Audited by: green(2002.06.15) */
7648 +int coord_sideof_unit(coord_t * coord, sideof dir)
7649 +{
7650 + assert("jmacd-9823", dir == LEFT_SIDE || dir == RIGHT_SIDE);
7651 + if (dir == LEFT_SIDE) {
7652 + return coord_prev_unit(coord);
7653 + } else {
7654 + return coord_next_unit(coord);
7655 + }
7656 +}
7657 +
7658 +#if REISER4_DEBUG
7659 +#define DEBUG_COORD_FIELDS (sizeof(c1->plug_v) + sizeof(c1->body_v))
7660 +#else
7661 +#define DEBUG_COORD_FIELDS (0)
7662 +#endif
7663 +
7664 +int coords_equal(const coord_t * c1, const coord_t * c2)
7665 +{
7666 + assert("nikita-2840", c1 != NULL);
7667 + assert("nikita-2841", c2 != NULL);
7668 +
7669 + return
7670 + c1->node == c2->node &&
7671 + c1->item_pos == c2->item_pos &&
7672 + c1->unit_pos == c2->unit_pos && c1->between == c2->between;
7673 +}
7674 +
7675 +/* If coord_is_after_rightmost return COORD_ON_THE_RIGHT, if coord_is_before_leftmost
7676 +   return COORD_ON_THE_LEFT, otherwise return COORD_INSIDE. */
7677 +/* Audited by: green(2002.06.15) */
7678 +coord_wrt_node coord_wrt(const coord_t * coord)
7679 +{
7680 + if (coord_is_before_leftmost(coord)) {
7681 + return COORD_ON_THE_LEFT;
7682 + }
7683 +
7684 + if (coord_is_after_rightmost(coord)) {
7685 + return COORD_ON_THE_RIGHT;
7686 + }
7687 +
7688 + return COORD_INSIDE;
7689 +}
7690 +
7691 +/* Returns true if the coordinate is positioned after the last item or after the last unit
7692 + of the last item or it is an empty node. */
7693 +/* Audited by: green(2002.06.15) */
7694 +int coord_is_after_rightmost(const coord_t * coord)
7695 +{
7696 + assert("jmacd-7313", coord_check(coord));
7697 +
7698 + switch (coord->between) {
7699 + case INVALID_COORD:
7700 + case AT_UNIT:
7701 + case BEFORE_UNIT:
7702 + case BEFORE_ITEM:
7703 + return 0;
7704 +
7705 + case EMPTY_NODE:
7706 + return 1;
7707 +
7708 + case AFTER_ITEM:
7709 + return (coord->item_pos == node_num_items(coord->node) - 1);
7710 +
7711 + case AFTER_UNIT:
7712 + return ((coord->item_pos == node_num_items(coord->node) - 1) &&
7713 + coord->unit_pos == coord_last_unit_pos(coord));
7714 + }
7715 +
7716 + impossible("jmacd-9908", "unreachable");
7717 + return 0;
7718 +}
7719 +
7720 +/* Returns true if the coordinate is positioned before the first item or it is an empty
7721 + node. */
7722 +int coord_is_before_leftmost(const coord_t * coord)
7723 +{
7724 + /* FIXME-VS: coord_check requires node to be loaded whereas it is not
7725 + necessary to check if coord is set before leftmost
7726 + assert ("jmacd-7313", coord_check (coord)); */
7727 + switch (coord->between) {
7728 + case INVALID_COORD:
7729 + case AT_UNIT:
7730 + case AFTER_ITEM:
7731 + case AFTER_UNIT:
7732 + return 0;
7733 +
7734 + case EMPTY_NODE:
7735 + return 1;
7736 +
7737 + case BEFORE_ITEM:
7738 + case BEFORE_UNIT:
7739 + return (coord->item_pos == 0) && (coord->unit_pos == 0);
7740 + }
7741 +
7742 + impossible("jmacd-9908", "unreachable");
7743 + return 0;
7744 +}
7745 +
7746 +/* Returns true if the coordinate is positioned after an item, before an item, after the
7747 + last unit of an item, before the first unit of an item, or at an empty node. */
7748 +/* Audited by: green(2002.06.15) */
7749 +int coord_is_between_items(const coord_t * coord)
7750 +{
7751 + assert("jmacd-7313", coord_check(coord));
7752 +
7753 + switch (coord->between) {
7754 + case INVALID_COORD:
7755 + case AT_UNIT:
7756 + return 0;
7757 +
7758 + case AFTER_ITEM:
7759 + case BEFORE_ITEM:
7760 + case EMPTY_NODE:
7761 + return 1;
7762 +
7763 + case BEFORE_UNIT:
7764 + return coord->unit_pos == 0;
7765 +
7766 + case AFTER_UNIT:
7767 + return coord->unit_pos == coord_last_unit_pos(coord);
7768 + }
7769 +
7770 + impossible("jmacd-9908", "unreachable");
7771 + return 0;
7772 +}
7773 +
7774 +/* Returns true if the coordinates are positioned at adjacent units, regardless of
7775 + before-after or item boundaries. */
7776 +int coord_are_neighbors(coord_t * c1, coord_t * c2)
7777 +{
7778 + coord_t *left;
7779 + coord_t *right;
7780 +
7781 + assert("nikita-1241", c1 != NULL);
7782 + assert("nikita-1242", c2 != NULL);
7783 + assert("nikita-1243", c1->node == c2->node);
7784 + assert("nikita-1244", coord_is_existing_unit(c1));
7785 + assert("nikita-1245", coord_is_existing_unit(c2));
7786 +
7787 + left = right = NULL;
7788 + switch (coord_compare(c1, c2)) {
7789 + case COORD_CMP_ON_LEFT:
7790 + left = c1;
7791 + right = c2;
7792 + break;
7793 + case COORD_CMP_ON_RIGHT:
7794 + left = c2;
7795 + right = c1;
7796 + break;
7797 + case COORD_CMP_SAME:
7798 + return 0;
7799 + default:
7800 + wrong_return_value("nikita-1246", "compare_coords()");
7801 + }
7802 + assert("vs-731", left && right);
7803 + if (left->item_pos == right->item_pos) {
7804 + return left->unit_pos + 1 == right->unit_pos;
7805 + } else if (left->item_pos + 1 == right->item_pos) {
7806 + return (left->unit_pos == coord_last_unit_pos(left))
7807 + && (right->unit_pos == 0);
7808 + } else {
7809 + return 0;
7810 + }
7811 +}
7812 +
7813 +/* Assuming two coordinates are positioned in the same node, return COORD_CMP_ON_RIGHT,
7814 + COORD_CMP_ON_LEFT, or COORD_CMP_SAME depending on c1's position relative to c2. */
7815 +/* Audited by: green(2002.06.15) */
7816 +coord_cmp coord_compare(coord_t * c1, coord_t * c2)
7817 +{
7818 + assert("vs-209", c1->node == c2->node);
7819 + assert("vs-194", coord_is_existing_unit(c1)
7820 + && coord_is_existing_unit(c2));
7821 +
7822 + if (c1->item_pos > c2->item_pos)
7823 + return COORD_CMP_ON_RIGHT;
7824 + if (c1->item_pos < c2->item_pos)
7825 + return COORD_CMP_ON_LEFT;
7826 + if (c1->unit_pos > c2->unit_pos)
7827 + return COORD_CMP_ON_RIGHT;
7828 + if (c1->unit_pos < c2->unit_pos)
7829 + return COORD_CMP_ON_LEFT;
7830 + return COORD_CMP_SAME;
7831 +}
7832 +
7833 +/* If the coordinate is between items, shifts it to the right. Returns 0 on success and
7834 + non-zero if there is no position to the right. */
7835 +int coord_set_to_right(coord_t * coord)
7836 +{
7837 + unsigned items = coord_num_items(coord);
7838 +
7839 + if (coord_adjust_items(coord, items, 1) == 1) {
7840 + return 1;
7841 + }
7842 +
7843 + switch (coord->between) {
7844 + case AT_UNIT:
7845 + return 0;
7846 +
7847 + case BEFORE_ITEM:
7848 + case BEFORE_UNIT:
7849 + coord->between = AT_UNIT;
7850 + return 0;
7851 +
7852 + case AFTER_UNIT:
7853 + if (coord->unit_pos < coord_last_unit_pos(coord)) {
7854 + coord->unit_pos += 1;
7855 + coord->between = AT_UNIT;
7856 + return 0;
7857 + } else {
7858 +
7859 + coord->unit_pos = 0;
7860 +
7861 + if (coord->item_pos == items - 1) {
7862 + coord->between = AFTER_ITEM;
7863 + return 1;
7864 + }
7865 +
7866 + coord_inc_item_pos(coord);
7867 + coord->between = AT_UNIT;
7868 + return 0;
7869 + }
7870 +
7871 + case AFTER_ITEM:
7872 + if (coord->item_pos == items - 1) {
7873 + return 1;
7874 + }
7875 +
7876 + coord_inc_item_pos(coord);
7877 + coord->unit_pos = 0;
7878 + coord->between = AT_UNIT;
7879 + return 0;
7880 +
7881 + case EMPTY_NODE:
7882 + return 1;
7883 +
7884 + case INVALID_COORD:
7885 + break;
7886 + }
7887 +
7888 + impossible("jmacd-9920", "unreachable");
7889 + return 0;
7890 +}
7891 +
7892 +/* If the coordinate is between items, shifts it to the left. Returns 0 on success and
7893 + non-zero if there is no position to the left. */
7894 +int coord_set_to_left(coord_t * coord)
7895 +{
7896 + unsigned items = coord_num_items(coord);
7897 +
7898 + if (coord_adjust_items(coord, items, 0) == 1) {
7899 + return 1;
7900 + }
7901 +
7902 + switch (coord->between) {
7903 + case AT_UNIT:
7904 + return 0;
7905 +
7906 + case AFTER_UNIT:
7907 + coord->between = AT_UNIT;
7908 + return 0;
7909 +
7910 + case AFTER_ITEM:
7911 + coord->between = AT_UNIT;
7912 + coord->unit_pos = coord_last_unit_pos(coord);
7913 + return 0;
7914 +
7915 + case BEFORE_UNIT:
7916 + if (coord->unit_pos > 0) {
7917 + coord->unit_pos -= 1;
7918 + coord->between = AT_UNIT;
7919 + return 0;
7920 + } else {
7921 +
7922 + if (coord->item_pos == 0) {
7923 + coord->between = BEFORE_ITEM;
7924 + return 1;
7925 + }
7926 +
7927 + coord->unit_pos = coord_last_unit_pos(coord);
7928 + coord_dec_item_pos(coord);
7929 + coord->between = AT_UNIT;
7930 + return 0;
7931 + }
7932 +
7933 + case BEFORE_ITEM:
7934 + if (coord->item_pos == 0) {
7935 + return 1;
7936 + }
7937 +
7938 + coord_dec_item_pos(coord);
7939 + coord->unit_pos = coord_last_unit_pos(coord);
7940 + coord->between = AT_UNIT;
7941 + return 0;
7942 +
7943 + case EMPTY_NODE:
7944 + return 1;
7945 +
7946 + case INVALID_COORD:
7947 + break;
7948 + }
7949 +
7950 + impossible("jmacd-9920", "unreachable");
7951 + return 0;
7952 +}
7953 +
7954 +static const char *coord_tween_tostring(between_enum n)
7955 +{
7956 + switch (n) {
7957 + case BEFORE_UNIT:
7958 + return "before unit";
7959 + case BEFORE_ITEM:
7960 + return "before item";
7961 + case AT_UNIT:
7962 + return "at unit";
7963 + case AFTER_UNIT:
7964 + return "after unit";
7965 + case AFTER_ITEM:
7966 + return "after item";
7967 + case EMPTY_NODE:
7968 + return "empty node";
7969 + case INVALID_COORD:
7970 + return "invalid";
7971 + default:
7972 + {
7973 + static char buf[30];
7974 +
7975 + sprintf(buf, "unknown: %i", n);
7976 + return buf;
7977 + }
7978 + }
7979 +}
7980 +
7981 +void print_coord(const char *mes, const coord_t * coord, int node)
7982 +{
7983 + if (coord == NULL) {
7984 + printk("%s: null\n", mes);
7985 + return;
7986 + }
7987 + printk("%s: item_pos = %d, unit_pos %d, tween=%s, iplug=%d\n",
7988 + mes, coord->item_pos, coord->unit_pos,
7989 + coord_tween_tostring(coord->between), coord->iplugid);
7990 +}
7991 +
7992 +int
7993 +item_utmost_child_real_block(const coord_t * coord, sideof side,
7994 + reiser4_block_nr * blk)
7995 +{
7996 + return item_plugin_by_coord(coord)->f.utmost_child_real_block(coord,
7997 + side,
7998 + blk);
7999 +}
8000 +
8001 +int item_utmost_child(const coord_t * coord, sideof side, jnode ** child)
8002 +{
8003 + return item_plugin_by_coord(coord)->f.utmost_child(coord, side, child);
8004 +}
8005 +
8006 +/* @count bytes of flow @f got written; update f->length, f->data and f->key
8007 +   correspondingly */
8008 +void move_flow_forward(flow_t * f, unsigned count)
8009 +{
8010 + if (f->data)
8011 + f->data += count;
8012 + f->length -= count;
8013 + set_key_offset(&f->key, get_key_offset(&f->key) + count);
8014 +}
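+
+/* e.g., after move_flow_forward(f, 10): f->data (if set) points 10 bytes
+   further, f->length is 10 less, and the key offset is 10 greater. */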
8015 +
8016 +/*
8017 + Local variables:
8018 + c-indentation-style: "K&R"
8019 + mode-name: "LC"
8020 + c-basic-offset: 8
8021 + tab-width: 8
8022 + fill-column: 120
8023 + scroll-step: 1
8024 + End:
8025 +*/
8026 Index: linux-2.6.16/fs/reiser4/coord.h
8027 ===================================================================
8028 --- /dev/null
8029 +++ linux-2.6.16/fs/reiser4/coord.h
8030 @@ -0,0 +1,389 @@
8031 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
8032 +
8033 +/* Coords */
8034 +
8035 +#if !defined( __REISER4_COORD_H__ )
8036 +#define __REISER4_COORD_H__
8037 +
8038 +#include "forward.h"
8039 +#include "debug.h"
8040 +#include "dformat.h"
8041 +#include "key.h"
8042 +
8043 +/* insertions happen between coords in the tree, so we need some means
8044 + of specifying the sense of betweenness. */
8045 +typedef enum {
8046 +	BEFORE_UNIT,		/* Note: coord initialization code depends on this value being zero. */
8047 + AT_UNIT,
8048 + AFTER_UNIT,
8049 + BEFORE_ITEM,
8050 + AFTER_ITEM,
8051 + INVALID_COORD,
8052 + EMPTY_NODE,
8053 +} between_enum;
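+
+/* Example (illustrative): in a node holding two items A and B, a coord with
+   item_pos == 0 and between == AFTER_ITEM denotes the insertion point between
+   A and B, while between == AT_UNIT addresses an existing unit of A. */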
8054 +
8055 +/* location of coord w.r.t. its node */
8056 +typedef enum {
8057 + COORD_ON_THE_LEFT = -1,
8058 + COORD_ON_THE_RIGHT = +1,
8059 + COORD_INSIDE = 0
8060 +} coord_wrt_node;
8061 +
8062 +typedef enum {
8063 + COORD_CMP_SAME = 0, COORD_CMP_ON_LEFT = -1, COORD_CMP_ON_RIGHT = +1
8064 +} coord_cmp;
8065 +
8066 +struct coord {
8067 + /* node in a tree */
8068 + /* 0 */ znode *node;
8069 +
8070 + /* position of item within node */
8071 + /* 4 */ pos_in_node_t item_pos;
8072 + /* position of unit within item */
8073 + /* 6 */ pos_in_node_t unit_pos;
8074 + /* optimization: plugin of item is stored in coord_t. Until this was
8075 + implemented, item_plugin_by_coord() was major CPU consumer. ->iplugid
8076 + is invalidated (set to 0xff) on each modification of ->item_pos,
8077 + and all such modifications are funneled through coord_*_item_pos()
8078 + functions below.
8079 + */
8080 + /* 8 */ char iplugid;
8081 + /* position of coord w.r.t. to neighboring items and/or units.
8082 + Values are taken from &between_enum above.
8083 + */
8084 + /* 9 */ char between;
8085 + /* padding. It will be added by the compiler anyway to conform to the
8086 + * C language alignment requirements. We keep it here to be on the
8087 + * safe side and to have a clear picture of the memory layout of this
8088 + * structure. */
8089 + /* 10 */ __u16 pad;
8090 + /* 12 */ int offset;
8091 +#if REISER4_DEBUG
8092 + unsigned long plug_v;
8093 + unsigned long body_v;
8094 +#endif
8095 +};
8096 +
8097 +#define INVALID_PLUGID ((char)((1 << 8) - 1))
8098 +#define INVALID_OFFSET -1
8099 +
8100 +static inline void coord_clear_iplug(coord_t * coord)
8101 +{
8102 + assert("nikita-2835", coord != NULL);
8103 + coord->iplugid = INVALID_PLUGID;
8104 + coord->offset = INVALID_OFFSET;
8105 +}
8106 +
8107 +static inline int coord_is_iplug_set(const coord_t * coord)
8108 +{
8109 + assert("nikita-2836", coord != NULL);
8110 + return coord->iplugid != INVALID_PLUGID;
8111 +}
8112 +
8113 +static inline void coord_set_item_pos(coord_t * coord, pos_in_node_t pos)
8114 +{
8115 + assert("nikita-2478", coord != NULL);
8116 + coord->item_pos = pos;
8117 + coord_clear_iplug(coord);
8118 +}
8119 +
8120 +static inline void coord_dec_item_pos(coord_t * coord)
8121 +{
8122 + assert("nikita-2480", coord != NULL);
8123 + --coord->item_pos;
8124 + coord_clear_iplug(coord);
8125 +}
8126 +
8127 +static inline void coord_inc_item_pos(coord_t * coord)
8128 +{
8129 + assert("nikita-2481", coord != NULL);
8130 + ++coord->item_pos;
8131 + coord_clear_iplug(coord);
8132 +}
8133 +
8134 +static inline void coord_add_item_pos(coord_t * coord, int delta)
8135 +{
8136 + assert("nikita-2482", coord != NULL);
8137 + coord->item_pos += delta;
8138 + coord_clear_iplug(coord);
8139 +}
8140 +
8141 +static inline void coord_invalid_item_pos(coord_t * coord)
8142 +{
8143 + assert("nikita-2832", coord != NULL);
8144 + coord->item_pos = (unsigned short)~0;
8145 + coord_clear_iplug(coord);
8146 +}
8147 +
8148 +/* Reverse a direction. */
8149 +static inline sideof sideof_reverse(sideof side)
8150 +{
8151 + return side == LEFT_SIDE ? RIGHT_SIDE : LEFT_SIDE;
8152 +}
8153 +
8154 +/* NOTE: There is a somewhat odd mixture of the following opposed terms:
8155 +
8156 + "first" and "last"
8157 + "next" and "prev"
8158 + "before" and "after"
8159 + "leftmost" and "rightmost"
8160 +
8161 + But I think the chosen names are decent the way they are.
8162 +*/
8163 +
8164 +/* COORD INITIALIZERS */
8165 +
8166 +/* Initialize an invalid coordinate. */
8167 +extern void coord_init_invalid(coord_t * coord, const znode * node);
8168 +
8169 +extern void coord_init_first_unit_nocheck(coord_t * coord, const znode * node);
8170 +
8171 +/* Initialize a coordinate to point at the first unit of the first item. If the node is
8172 + empty, it is positioned at the EMPTY_NODE. */
8173 +extern void coord_init_first_unit(coord_t * coord, const znode * node);
8174 +
8175 +/* Initialize a coordinate to point at the last unit of the last item. If the node is
8176 + empty, it is positioned at the EMPTY_NODE. */
8177 +extern void coord_init_last_unit(coord_t * coord, const znode * node);
8178 +
8179 +/* Initialize a coordinate to before the first item. If the node is empty, it is
8180 + positioned at the EMPTY_NODE. */
8181 +extern void coord_init_before_first_item(coord_t * coord, const znode * node);
8182 +
8183 +/* Initialize a coordinate to after the last item. If the node is empty, it is positioned
8184 + at the EMPTY_NODE. */
8185 +extern void coord_init_after_last_item(coord_t * coord, const znode * node);
8186 +
8187 +/* Initialize a coordinate to after the last unit in the item. Coord must
8188 +   already be set to an existing item */
8189 +void coord_init_after_item_end(coord_t * coord);
8190 +
8191 +/* Initialize a coordinate to before the item. Coord must already be set to an existing item */
8192 +void coord_init_before_item(coord_t *);
8193 +/* Initialize a coordinate to after the item. Coord must already be set to an existing item */
8194 +void coord_init_after_item(coord_t *);
8195 +
8196 +/* Calls either coord_init_first_unit or coord_init_last_unit depending on sideof argument. */
8197 +extern void coord_init_sideof_unit(coord_t * coord, const znode * node,
8198 + sideof dir);
8199 +
8200 +/* Initialize a coordinate by 0s. Used in places where init_coord was used
8201 +   and it was not clear how the coord actually should be initialized.
8202 +   FIXME-VS: added by vs (2002, june, 8) */
8203 +extern void coord_init_zero(coord_t * coord);
8204 +
8205 +/* COORD METHODS */
8206 +
8207 +/* after shifting of node content, a coord previously set properly may
8208 +   become invalid; try to "normalize" it. */
8209 +void coord_normalize(coord_t * coord);
8210 +
8211 +/* Copy a coordinate. */
8212 +extern void coord_dup(coord_t * coord, const coord_t * old_coord);
8213 +
8214 +/* Copy a coordinate without check. */
8215 +void coord_dup_nocheck(coord_t * coord, const coord_t * old_coord);
8216 +
8217 +unsigned coord_num_units(const coord_t * coord);
8218 +
8219 +/* Return the last valid unit number at the present item (i.e.,
8220 + coord_num_units() - 1). */
8221 +static inline unsigned coord_last_unit_pos(const coord_t * coord)
8222 +{
8223 + return coord_num_units(coord) - 1;
8224 +}
8225 +
8226 +#if REISER4_DEBUG
8227 +/* For assertions only, checks for a valid coordinate. */
8228 +extern int coord_check(const coord_t * coord);
8229 +
8230 +extern unsigned long znode_times_locked(const znode * z);
8231 +
8232 +static inline void coord_update_v(coord_t * coord)
8233 +{
8234 + coord->plug_v = coord->body_v = znode_times_locked(coord->node);
8235 +}
8236 +#endif
8237 +
8238 +extern int coords_equal(const coord_t * c1, const coord_t * c2);
8239 +
8240 +extern void print_coord(const char *mes, const coord_t * coord, int print_node);
8241 +
8242 +/* If coord_is_after_rightmost return COORD_ON_THE_RIGHT, if coord_is_before_leftmost
8243 +   return COORD_ON_THE_LEFT, otherwise return COORD_INSIDE. */
8244 +extern coord_wrt_node coord_wrt(const coord_t * coord);
8245 +
8246 +/* Returns true if the coordinates are positioned at adjacent units, regardless of
8247 + before-after or item boundaries. */
8248 +extern int coord_are_neighbors(coord_t * c1, coord_t * c2);
8249 +
8250 +/* Assuming two coordinates are positioned in the same node, return COORD_CMP_ON_RIGHT,
8251 +   COORD_CMP_ON_LEFT, or COORD_CMP_SAME depending on c1's position relative to c2. */
8252 +extern coord_cmp coord_compare(coord_t * c1, coord_t * c2);
8253 +
8254 +/* COORD PREDICATES */
8255 +
8256 +/* Returns true if the coord was initialized by coord_init_invalid(). */
8257 +extern int coord_is_invalid(const coord_t * coord);
8258 +
8259 +/* Returns true if the coordinate is positioned at an existing item, not before or after
8260 + an item. It may be placed at, before, or after any unit within the item, whether
8261 + existing or not. If this is true you can call methods of the item plugin. */
8262 +extern int coord_is_existing_item(const coord_t * coord);
8263 +
8264 +/* Returns true if the coordinate is positioned after an item, before an item, after the
8265 + last unit of an item, before the first unit of an item, or at an empty node. */
8266 +extern int coord_is_between_items(const coord_t * coord);
8267 +
8268 +/* Returns true if the coordinate is positioned at an existing unit, not before or after a
8269 + unit. */
8270 +extern int coord_is_existing_unit(const coord_t * coord);
8271 +
8272 +/* Returns true if the coordinate is positioned at an empty node. */
8273 +extern int coord_is_empty(const coord_t * coord);
8274 +
8275 +/* Returns true if the coordinate is positioned at the first unit of the first item. Not
8276 + true for empty nodes nor coordinates positioned before the first item. */
8277 +extern int coord_is_leftmost_unit(const coord_t * coord);
8278 +
8279 +/* Returns true if the coordinate is positioned after the last item or after the last unit
8280 + of the last item or it is an empty node. */
8281 +extern int coord_is_after_rightmost(const coord_t * coord);
8282 +
8283 +/* Returns true if the coordinate is positioned before the first item or it is an empty
8284 + node. */
8285 +extern int coord_is_before_leftmost(const coord_t * coord);
8286 +
8287 +/* Calls either coord_is_before_leftmost or coord_is_after_rightmost depending on sideof
8288 + argument. */
8289 +extern int coord_is_after_sideof_unit(coord_t * coord, sideof dir);
8290 +
8291 +/* COORD MODIFIERS */
8292 +
8293 +/* Advances the coordinate by one unit to the right. If empty, no change. If
8294 + coord_is_rightmost_unit, advances to AFTER THE LAST ITEM. Returns 0 if new position is
8295 + an existing unit. */
8296 +extern int coord_next_unit(coord_t * coord);
8297 +
8298 +/* Advances the coordinate by one item to the right. If empty, no change. If
8299 + coord_is_rightmost_unit, advances to AFTER THE LAST ITEM. Returns 0 if new position is
8300 + an existing item. */
8301 +extern int coord_next_item(coord_t * coord);
8302 +
8303 +/* Advances the coordinate by one unit to the left. If empty, no change. If
8304 + coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM. Returns 0 if new position
8305 + is an existing unit. */
8306 +extern int coord_prev_unit(coord_t * coord);
8307 +
8308 +/* Advances the coordinate by one item to the left. If empty, no change. If
8309 + coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM. Returns 0 if new position
8310 + is an existing item. */
8311 +extern int coord_prev_item(coord_t * coord);
8312 +
8313 +/* If the coordinate is between items, shifts it to the right. Returns 0 on success and
8314 + non-zero if there is no position to the right. */
8315 +extern int coord_set_to_right(coord_t * coord);
8316 +
8317 +/* If the coordinate is between items, shifts it to the left. Returns 0 on success and
8318 + non-zero if there is no position to the left. */
8319 +extern int coord_set_to_left(coord_t * coord);
8320 +
8321 +/* If the coordinate is at an existing unit, set to after that unit. Returns 0 on success
8322 + and non-zero if the unit did not exist. */
8323 +extern int coord_set_after_unit(coord_t * coord);
8324 +
8325 +/* Calls either coord_next_unit or coord_prev_unit depending on sideof argument. */
8326 +extern int coord_sideof_unit(coord_t * coord, sideof dir);
8327 +
8328 +/* iterate over all units in @node */
8329 +#define for_all_units( coord, node ) \
8330 + for( coord_init_before_first_item( ( coord ), ( node ) ) ; \
8331 + coord_next_unit( coord ) == 0 ; )
8332 +
8333 +/* iterate over all items in @node */
8334 +#define for_all_items( coord, node ) \
8335 + for( coord_init_before_first_item( ( coord ), ( node ) ) ; \
8336 + coord_next_item( coord ) == 0 ; )
8337 +
8338 +/* COORD/ITEM METHODS */
8339 +
8340 +extern int item_utmost_child_real_block(const coord_t * coord, sideof side,
8341 + reiser4_block_nr * blk);
8342 +extern int item_utmost_child(const coord_t * coord, sideof side,
8343 + jnode ** child);
8344 +
8345 +/* a flow is a sequence of bytes being written to or read from the tree. The
8346 + tree will slice the flow into items while storing it into nodes, but all of
8347 + that is hidden from anything outside the tree. */
8348 +
8349 +struct flow {
8350 + reiser4_key key; /* key of start of flow's sequence of bytes */
8351 + loff_t length; /* length of flow's sequence of bytes */
8352 + char *data; /* start of flow's sequence of bytes */
8353 + int user; /* if 1 data is user space, 0 - kernel space */
8354 + rw_op op; /* NIKITA-FIXME-HANS: comment is where? */
8355 +};
8356 +
8357 +void move_flow_forward(flow_t * f, unsigned count);
8358 +
8359 +/* &reiser4_item_data - description of data to be inserted or pasted
8360 +
8361 + Q: articulate the reasons for the difference between this and flow.
8362 +
8363 +   A: Besides flows we insert other things into the tree: stat data, directory
8364 +   entries, etc. To insert them into the tree one has to provide this structure.
8365 +   If one is going to insert a flow, insert_flow can be used instead, and this
8366 +   structure does not have to be created.
8367 +*/
8368 +struct reiser4_item_data {
8369 + /* actual data to be inserted. If NULL, ->create_item() will not
8370 + do xmemcpy itself, leaving this up to the caller. This can
8371 + save some amount of unnecessary memory copying, for example,
8372 + during insertion of stat data.
8373 +
8374 + */
8375 + char *data;
8376 + /* 1 if 'char * data' contains pointer to user space and 0 if it is
8377 + kernel space */
8378 + int user;
8379 + /* amount of data we are going to insert or paste */
8380 + int length;
8381 + /* "Arg" is opaque data that is passed down to the
8382 + ->create_item() method of node layout, which in turn
8383 + hands it to the ->create_hook() of item being created. This
8384 + arg is currently used by:
8385 +
8386 + . ->create_hook() of internal item
8387 + (fs/reiser4/plugin/item/internal.c:internal_create_hook()),
8388 + . ->paste() method of directory item.
8389 + . ->create_hook() of extent item
8390 +
8391 +	   For an internal item, this is the left "brother" of the new node
8392 +	   being inserted, and it is used to add the new node into the sibling
8393 +	   list after the pointer to it was just inserted into the parent.
8394 +
8395 +	   While ->arg does look like a somewhat unnecessary complication,
8396 +	   it actually saves a lot of headache in many places, because
8397 +	   all data necessary to insert or paste new data into the tree are
8398 +	   collected in one place, and this eliminates a lot of extra
8399 +	   argument passing and storing everywhere.
8400 +
8401 + */
8402 + void *arg;
8403 + /* plugin of item we are inserting */
8404 + item_plugin *iplug;
8405 +};
8406 +
8407 +/* __REISER4_COORD_H__ */
8408 +#endif
8409 +
8410 +/* Make Linus happy.
8411 + Local variables:
8412 + c-indentation-style: "K&R"
8413 + mode-name: "LC"
8414 + c-basic-offset: 8
8415 + tab-width: 8
8416 + fill-column: 120
8417 + scroll-step: 1
8418 + End:
8419 +*/
8420 Index: linux-2.6.16/fs/reiser4/debug.c
8421 ===================================================================
8422 --- /dev/null
8423 +++ linux-2.6.16/fs/reiser4/debug.c
8424 @@ -0,0 +1,300 @@
8425 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
8426 + * reiser4/README */
8427 +
8428 +/* Debugging facilities. */
8429 +
8430 +/*
8431 + * This file contains generic debugging functions used by reiser4. Roughly
8432 + * following:
8433 + *
8434 + * panicking: reiser4_do_panic(), reiser4_print_prefix().
8435 + *
8436 + * locking: schedulable(), lock_counters(), print_lock_counters(),
8437 + * no_counters_are_held(), commit_check_locks()
8438 + *
8439 + * error code monitoring (see comment before RETERR macro): return_err(),
8440 + * report_err().
8441 + *
8442 + * stack back-tracing: fill_backtrace()
8443 + *
8444 + * miscellaneous: preempt_point(), call_on_each_assert(), debugtrap().
8445 + *
8446 + */
8447 +
8448 +#include "reiser4.h"
8449 +#include "context.h"
8450 +#include "super.h"
8451 +#include "txnmgr.h"
8452 +#include "znode.h"
8453 +
8454 +#include <linux/sysfs.h>
8455 +#include <linux/slab.h>
8456 +#include <linux/types.h>
8457 +#include <linux/fs.h>
8458 +#include <linux/spinlock.h>
8459 +#include <linux/kallsyms.h>
8460 +#include <linux/vmalloc.h>
8461 +#include <linux/ctype.h>
8462 +#include <linux/sysctl.h>
8463 +#include <linux/hardirq.h>
8464 +
8465 +#if REISER4_DEBUG
8466 +static void report_err(void);
8467 +#else
8468 +#define report_err() noop
8469 +#endif
8470 +
8471 +/*
8472 + * global buffer where message given to reiser4_panic is formatted.
8473 + */
8474 +static char panic_buf[REISER4_PANIC_MSG_BUFFER_SIZE];
8475 +
8476 +/*
8477 + * lock protecting consistency of panic_buf under concurrent panics
8478 + */
8479 +static DEFINE_SPINLOCK(panic_guard);
8480 +
8481 +/* Your best friend. Call it on each occasion. This is called by
8482 + fs/reiser4/debug.h:reiser4_panic(). */
8483 +void reiser4_do_panic(const char *format /* format string */ , ... /* rest */ )
8484 +{
8485 + static int in_panic = 0;
8486 + va_list args;
8487 +
8488 + /*
8489 + * check for recursive panic.
8490 + */
8491 + if (in_panic == 0) {
8492 + in_panic = 1;
8493 +
8494 + spin_lock(&panic_guard);
8495 + va_start(args, format);
8496 + vsnprintf(panic_buf, sizeof(panic_buf), format, args);
8497 + va_end(args);
8498 + printk(KERN_EMERG "reiser4 panicked cowardly: %s", panic_buf);
8499 + spin_unlock(&panic_guard);
8500 +
8501 + /*
8502 + * if kernel debugger is configured---drop in. Early dropping
8503 + * into kgdb is not always convenient, because panic message
8504 + * is not yet printed most of the times. But:
8505 + *
8506 + * (1) message can be extracted from printk_buf[]
8507 + * (declared static inside of printk()), and
8508 + *
8509 + * (2) sometimes serial/kgdb combo dies while printing
8510 + * long panic message, so it's more prudent to break into
8511 + * debugger earlier.
8512 + *
8513 + */
8514 + DEBUGON(1);
8515 + }
8516 + /* to make gcc happy about noreturn attribute */
8517 + panic("%s", panic_buf);
8518 +}
8519 +
8520 +void
8521 +reiser4_print_prefix(const char *level, int reperr, const char *mid,
8522 + const char *function, const char *file, int lineno)
8523 +{
8524 + const char *comm;
8525 + int pid;
8526 +
8527 + if (unlikely(in_interrupt() || in_irq())) {
8528 + comm = "interrupt";
8529 + pid = 0;
8530 + } else {
8531 + comm = current->comm;
8532 + pid = current->pid;
8533 + }
8534 + printk("%sreiser4[%.16s(%i)]: %s (%s:%i)[%s]:\n",
8535 + level, comm, pid, function, file, lineno, mid);
8536 + if (reperr)
8537 + report_err();
8538 +}
8539 +
8540 +/* Preemption point: this should be called periodically during long running
8541 + operations (carry, allocate, and squeeze are best examples) */
8542 +int preempt_point(void)
8543 +{
8544 + assert("nikita-3008", schedulable());
8545 + cond_resched();
8546 + return signal_pending(current);
8547 +}
8548 +
8549 +#if REISER4_DEBUG
8550 +/* Debugging aid: return struct where information about locks taken by current
8551 + thread is accumulated. This can be used to formulate lock ordering
8552 + constraints and various assertions.
8553 +
8554 +*/
8555 +lock_counters_info *lock_counters(void)
8556 +{
8557 + reiser4_context *ctx = get_current_context();
8558 + assert("jmacd-1123", ctx != NULL);
8559 + return &ctx->locks;
8560 +}
8561 +
8562 +/*
8563 + * print human readable information about locks held by the reiser4 context.
8564 + */
8565 +static void print_lock_counters(const char *prefix,
8566 + const lock_counters_info * info)
8567 +{
8568 + printk("%s: jnode: %i, tree: %i (r:%i,w:%i), dk: %i (r:%i,w:%i)\n"
8569 + "jload: %i, "
8570 + "txnh: %i, atom: %i, stack: %i, txnmgr: %i, "
8571 + "ktxnmgrd: %i, fq: %i\n"
8572 + "inode: %i, "
8573 + "cbk_cache: %i (r:%i,w%i), "
8574 + "eflush: %i, "
8575 + "zlock: %i,\n"
8576 + "spin: %i, long: %i inode_sem: (r:%i,w:%i)\n"
8577 + "d: %i, x: %i, t: %i\n", prefix,
8578 + info->spin_locked_jnode,
8579 + info->rw_locked_tree, info->read_locked_tree,
8580 + info->write_locked_tree,
8581 + info->rw_locked_dk, info->read_locked_dk, info->write_locked_dk,
8582 + info->spin_locked_jload,
8583 + info->spin_locked_txnh,
8584 + info->spin_locked_atom, info->spin_locked_stack,
8585 + info->spin_locked_txnmgr, info->spin_locked_ktxnmgrd,
8586 + info->spin_locked_fq,
8587 + info->spin_locked_inode,
8588 + info->rw_locked_cbk_cache,
8589 + info->read_locked_cbk_cache,
8590 + info->write_locked_cbk_cache,
8591 + info->spin_locked_super_eflush,
8592 + info->spin_locked_zlock,
8593 + info->spin_locked,
8594 + info->long_term_locked_znode,
8595 + info->inode_sem_r, info->inode_sem_w,
8596 + info->d_refs, info->x_refs, info->t_refs);
8597 +}
8598 +
8599 +/* check that no spinlocks are held */
8600 +int schedulable(void)
8601 +{
8602 + if (get_current_context_check() != NULL) {
8603 + if (!LOCK_CNT_NIL(spin_locked)) {
8604 + print_lock_counters("in atomic", lock_counters());
8605 + return 0;
8606 + }
8607 + }
8608 + might_sleep();
8609 + return 1;
8610 +}
8611 +/*
8612 + * return true, iff no locks are held.
8613 + */
8614 +int no_counters_are_held(void)
8615 +{
8616 + lock_counters_info *counters;
8617 +
8618 + counters = lock_counters();
8619 + return
8620 + (counters->spin_locked_zlock == 0) &&
8621 + (counters->spin_locked_jnode == 0) &&
8622 + (counters->rw_locked_tree == 0) &&
8623 + (counters->read_locked_tree == 0) &&
8624 + (counters->write_locked_tree == 0) &&
8625 + (counters->rw_locked_dk == 0) &&
8626 + (counters->read_locked_dk == 0) &&
8627 + (counters->write_locked_dk == 0) &&
8628 + (counters->spin_locked_txnh == 0) &&
8629 + (counters->spin_locked_atom == 0) &&
8630 + (counters->spin_locked_stack == 0) &&
8631 + (counters->spin_locked_txnmgr == 0) &&
8632 + (counters->spin_locked_inode == 0) &&
8633 + (counters->spin_locked == 0) &&
8634 + (counters->long_term_locked_znode == 0) &&
8635 + (counters->inode_sem_r == 0) &&
8636 + (counters->inode_sem_w == 0) && (counters->d_refs == 0);
8637 +}
8638 +
8639 +/*
8640 + * return true, iff transaction commit can be done under locks held by the
8641 + * current thread.
8642 + */
8643 +int commit_check_locks(void)
8644 +{
8645 + lock_counters_info *counters;
8646 + int inode_sem_r;
8647 + int inode_sem_w;
8648 + int result;
8649 +
8650 + /*
8651 + * inode's read/write semaphore is the only reiser4 lock that can be
8652 + * held during commit.
8653 + */
8654 +
8655 + counters = lock_counters();
8656 + inode_sem_r = counters->inode_sem_r;
8657 + inode_sem_w = counters->inode_sem_w;
8658 +
8659 + counters->inode_sem_r = counters->inode_sem_w = 0;
8660 + result = no_counters_are_held();
8661 + counters->inode_sem_r = inode_sem_r;
8662 + counters->inode_sem_w = inode_sem_w;
8663 + return result;
8664 +}
8665 +
8666 +/*
8667 + * fill "error site" in the current reiser4 context. See comment before RETERR
8668 + * macro for more details.
8669 + */
8670 +void return_err(int code, const char *file, int line)
8671 +{
8672 + if (code < 0 && is_in_reiser4_context()) {
8673 + reiser4_context *ctx = get_current_context();
8674 +
8675 + if (ctx != NULL) {
8676 + ctx->err.code = code;
8677 + ctx->err.file = file;
8678 + ctx->err.line = line;
8679 + }
8680 + }
8681 +}
8682 +
8683 +/*
8684 + * report error information recorded by return_err().
8685 + */
8686 +static void report_err(void)
8687 +{
8688 + reiser4_context *ctx = get_current_context_check();
8689 +
8690 + if (ctx != NULL) {
8691 + if (ctx->err.code != 0) {
8692 + printk("code: %i at %s:%i\n",
8693 + ctx->err.code, ctx->err.file, ctx->err.line);
8694 + }
8695 + }
8696 +}
8697 +
8698 +#endif /* REISER4_DEBUG */
8699 +
8700 +#if KERNEL_DEBUGGER
8701 +
8702 +/*
8703 + * this function just drops into the kernel debugger. It is a convenient
8704 + * place to put a breakpoint.
8705 + */
8706 +void debugtrap(void)
8707 +{
8708 + /* do nothing. Put break point here. */
8709 +#if defined(CONFIG_KGDB) && !defined(CONFIG_REISER4_FS_MODULE)
8710 + extern void breakpoint(void);
8711 + breakpoint();
8712 +#endif
8713 +}
8714 +#endif
8715 +
8716 +/* Make Linus happy.
8717 + Local variables:
8718 + c-indentation-style: "K&R"
8719 + mode-name: "LC"
8720 + c-basic-offset: 8
8721 + tab-width: 8
8722 + fill-column: 120
8723 + End:
8724 +*/
8725 Index: linux-2.6.16/fs/reiser4/debug.h
8726 ===================================================================
8727 --- /dev/null
8728 +++ linux-2.6.16/fs/reiser4/debug.h
8729 @@ -0,0 +1,350 @@
8730 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
8731 +
8732 +/* Declarations of debug macros. */
8733 +
8734 +#if !defined( __FS_REISER4_DEBUG_H__ )
8735 +#define __FS_REISER4_DEBUG_H__
8736 +
8737 +#include "forward.h"
8738 +#include "reiser4.h"
8739 +
8740 +/* generic function to produce formatted output, decorating it with
8741 + whatever standard prefixes/postfixes we want. "Fun" is a function
8742 +   that will actually be called; it can be printk, panic, etc.
8743 + This is for use by other debugging macros, not by users. */
8744 +#define DCALL(lev, fun, reperr, label, format, ...) \
8745 +({ \
8746 + fun(lev "reiser4[%.16s(%i)]: %s (%s:%i)[%s]:\n" format "\n" , \
8747 + current->comm, current->pid, __FUNCTION__, \
8748 + __FILE__, __LINE__, label, ## __VA_ARGS__); \
8749 +})
8750 +
8751 +/*
8752 + * cause kernel to crash
8753 + */
8754 +#define reiser4_panic(mid, format, ...) \
8755 + DCALL("", reiser4_do_panic, 1, mid, format , ## __VA_ARGS__)
8756 +
8757 +/* print message with indication of current process, file, line and
8758 + function */
8759 +#define reiser4_log(label, format, ...) \
8760 + DCALL(KERN_DEBUG, printk, 0, label, format , ## __VA_ARGS__)
8761 +
8762 +/* Assertion checked during compilation.
8763 + If "cond" is false (0) we get duplicate case label in switch.
8764 +   Use this to check something like the famous
8765 +   cassert (sizeof(struct reiserfs_journal_commit) == 4096) ;
8766 +   in 3.x journal.c. If the cassertion fails you get a compiler error,
8767 + so no "maintainer-id".
8768 +*/
8769 +#define cassert(cond) ({ switch(-1) { case (cond): case 0: break; } })
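+
+/* A sketch of typical use (illustration only): verify an on-disk type has
+   the expected size at compile time:
+
+	cassert( sizeof( d16 ) == 2 );
+
+   If the condition were false, "case (cond):" would duplicate "case 0:"
+   and compilation would fail. */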
8770 +
8771 +#define noop do {;} while(0)
8772 +
8773 +#if REISER4_DEBUG
8774 +/* version of info that only actually prints anything when _d_ebugging
8775 + is on */
8776 +#define dinfo(format, ...) printk(format , ## __VA_ARGS__)
8777 +/* macro to catch logical errors. Put it into `default' clause of
8778 + switch() statement. */
8779 +#define impossible(label, format, ...) \
8780 + reiser4_panic(label, "impossible: " format , ## __VA_ARGS__)
8781 +/* assert assures that @cond is true. If it is not, reiser4_panic() is
8782 + called. Use this for checking logical consistency and _never_ call
8783 +   this to check correctness of external data: disk blocks and user input. */
8784 +#define assert(label, cond) \
8785 +({ \
8786 + /* call_on_each_assert(); */ \
8787 + if (cond) { \
8788 + /* put negated check to avoid using !(cond) that would lose \
8789 + * warnings for things like assert(a = b); */ \
8790 + ; \
8791 + } else { \
8792 + DEBUGON(1); \
8793 + reiser4_panic(label, "assertion failed: %s", #cond); \
8794 + } \
8795 +})
8796 +
8797 +/* like assertion, but @expr is evaluated even if REISER4_DEBUG is off. */
8798 +#define check_me( label, expr ) assert( label, ( expr ) )
8799 +
8800 +#define ON_DEBUG( exp ) exp
8801 +
8802 +extern int schedulable(void);
8803 +extern void call_on_each_assert(void);
8804 +
8805 +#else
8806 +
8807 +#define dinfo( format, args... ) noop
8808 +#define impossible( label, format, args... ) noop
8809 +#define assert( label, cond ) noop
8810 +#define check_me( label, expr ) ( ( void ) ( expr ) )
8811 +#define ON_DEBUG( exp )
8812 +#define schedulable() might_sleep()
8813 +
8814 +/* REISER4_DEBUG */
8815 +#endif
8816 +
8817 +#if REISER4_DEBUG
8818 +/* per-thread information about lock acquired by this thread. Used by lock
8819 + * ordering checking in spin_macros.h */
8820 +typedef struct lock_counters_info {
8821 + int rw_locked_tree;
8822 + int read_locked_tree;
8823 + int write_locked_tree;
8824 +
8825 + int rw_locked_dk;
8826 + int read_locked_dk;
8827 + int write_locked_dk;
8828 +
8829 + int rw_locked_cbk_cache;
8830 + int read_locked_cbk_cache;
8831 + int write_locked_cbk_cache;
8832 +
8833 + int spin_locked_zlock;
8834 + int spin_locked_jnode;
8835 + int spin_locked_jload;
8836 + int spin_locked_txnh;
8837 + int spin_locked_atom;
8838 + int spin_locked_stack;
8839 + int spin_locked_txnmgr;
8840 + int spin_locked_ktxnmgrd;
8841 + int spin_locked_fq;
8842 + int spin_locked_inode;
8843 + int spin_locked_super_eflush;
8844 + int spin_locked;
8845 + int long_term_locked_znode;
8846 +
8847 + int inode_sem_r;
8848 + int inode_sem_w;
8849 +
8850 + int d_refs;
8851 + int x_refs;
8852 + int t_refs;
8853 +} lock_counters_info;
8854 +
8855 +extern lock_counters_info *lock_counters(void);
8856 +#define IN_CONTEXT(a, b) (is_in_reiser4_context() ? (a) : (b))
8857 +
8858 +/* increment lock-counter @counter, if present */
8859 +#define LOCK_CNT_INC(counter) IN_CONTEXT(++(lock_counters()->counter), 0)
8860 +
8861 +/* decrement lock-counter @counter, if present */
8862 +#define LOCK_CNT_DEC(counter) IN_CONTEXT(--(lock_counters()->counter), 0)
8863 +
8864 +/* check that lock-counter is zero. This is for use in assertions */
8865 +#define LOCK_CNT_NIL(counter) IN_CONTEXT(lock_counters()->counter == 0, 1)
8866 +
8867 +/* check that lock-counter is greater than zero. This is for use in
8868 + * assertions */
8869 +#define LOCK_CNT_GTZ(counter) IN_CONTEXT(lock_counters()->counter > 0, 1)
8870 +#define LOCK_CNT_LT(counter,n) IN_CONTEXT(lock_counters()->counter < n, 1)
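+
+/* Usage sketch (illustration; the real wrappers live in spin_macros.h): a
+   spin-lock wrapper brackets the raw lock calls with counter updates so
+   that assertions like LOCK_CNT_NIL() can verify lock ordering:
+
+	spin_lock( &node->guard );
+	LOCK_CNT_INC( spin_locked_jnode );
+	LOCK_CNT_INC( spin_locked );
+	...
+	LOCK_CNT_DEC( spin_locked );
+	LOCK_CNT_DEC( spin_locked_jnode );
+	spin_unlock( &node->guard );
+*/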
8871 +
8872 +#else /* REISER4_DEBUG */
8873 +
8874 +/* no-op versions of the above */
8875 +
8876 +typedef struct lock_counters_info {
8877 +} lock_counters_info;
8878 +
8879 +#define lock_counters() ((lock_counters_info *)NULL)
8880 +#define LOCK_CNT_INC(counter) noop
8881 +#define LOCK_CNT_DEC(counter) noop
8882 +#define LOCK_CNT_NIL(counter) (1)
8883 +#define LOCK_CNT_GTZ(counter) (1)
8884 +#define LOCK_CNT_LT(counter,n) (1)
8885 +
8886 +#endif /* REISER4_DEBUG */
8887 +
8888 +#define assert_spin_not_locked(lock) BUG_ON(0)
8889 +#define assert_rw_write_locked(lock) BUG_ON(0)
8890 +#define assert_rw_read_locked(lock) BUG_ON(0)
8891 +#define assert_rw_locked(lock) BUG_ON(0)
8892 +#define assert_rw_not_write_locked(lock) BUG_ON(0)
8893 +#define assert_rw_not_read_locked(lock) BUG_ON(0)
8894 +#define assert_rw_not_locked(lock) BUG_ON(0)
8895 +
8896 +/* flags controlling debugging behavior. These are set through the debug_flags=N mount
8897 + option. */
8898 +typedef enum {
8899 + /* print a lot of information during panic. When this is on all jnodes
8900 + * are listed. This can be *very* large output. Usually you don't want
8901 + * this. Especially over serial line. */
8902 + REISER4_VERBOSE_PANIC = 0x00000001,
8903 + /* print a lot of information during umount */
8904 + REISER4_VERBOSE_UMOUNT = 0x00000002,
8905 + /* print gathered statistics on umount */
8906 + REISER4_STATS_ON_UMOUNT = 0x00000004,
8907 + /* check node consistency */
8908 + REISER4_CHECK_NODE = 0x00000008
8909 +} reiser4_debug_flags;
8910 +
8911 +extern int is_in_reiser4_context(void);
8912 +
8913 +/*
8914 + * evaluate expression @e only if with reiser4 context
8915 + */
8916 +#define ON_CONTEXT(e) do { \
8917 + if(is_in_reiser4_context()) { \
8918 + e; \
8919 + } } while(0)
8920 +
8921 +/*
8922 + * evaluate expression @e only when within reiser4_context and debugging is
8923 + * on.
8924 + */
8925 +#define ON_DEBUG_CONTEXT( e ) ON_DEBUG( ON_CONTEXT( e ) )
8926 +
8927 +/*
8928 + * complain about unexpected function result and crash. Used in "default"
8929 + * branches of switch statements and the like to assert that invalid results are
8930 + * not silently ignored.
8931 + */
8932 +#define wrong_return_value( label, function ) \
8933 + impossible( label, "wrong return value from " function )
8934 +
8935 +/* Issue different types of reiser4 messages to the console */
8936 +#define warning( label, format, ... ) \
8937 + DCALL( KERN_WARNING, \
8938 + printk, 1, label, "WARNING: " format , ## __VA_ARGS__ )
8939 +#define notice( label, format, ... ) \
8940 + DCALL( KERN_NOTICE, \
8941 + printk, 1, label, "NOTICE: " format , ## __VA_ARGS__ )
8942 +
8943 +/* mark not yet implemented functionality */
8944 +#define not_yet( label, format, ... ) \
8945 + reiser4_panic( label, "NOT YET IMPLEMENTED: " format , ## __VA_ARGS__ )
8946 +
8947 +extern void reiser4_do_panic(const char *format, ...)
8948 + __attribute__ ((noreturn, format(printf, 1, 2)));
8949 +
8950 +extern void reiser4_print_prefix(const char *level, int reperr, const char *mid,
8951 + const char *function,
8952 + const char *file, int lineno);
8953 +
8954 +extern int preempt_point(void);
8955 +extern void reiser4_print_stats(void);
8956 +
8957 +
8958 +#if REISER4_DEBUG
8959 +extern int no_counters_are_held(void);
8960 +extern int commit_check_locks(void);
8961 +#else
8962 +#define no_counters_are_held() (1)
8963 +#define commit_check_locks() (1)
8964 +#endif
8965 +
8966 +/* true if @i is power-of-two. Useful for rate-limited warnings, etc. */
8967 +#define IS_POW(i) \
8968 +({ \
8969 + typeof(i) __i; \
8970 + \
8971 + __i = (i); \
8972 + !(__i & (__i - 1)); \
8973 +})
8974 +
8975 +#define KERNEL_DEBUGGER (1)
8976 +
8977 +#if KERNEL_DEBUGGER
8978 +
8979 +extern void debugtrap(void);
8980 +
8981 +/*
8982 + * Check condition @cond and drop into kernel debugger (kgdb) if it's true. If
8983 + * kgdb is not compiled in, do nothing.
8984 + */
8985 +#define DEBUGON(cond) \
8986 +({ \
8987 + if (unlikely(cond)) \
8988 + debugtrap(); \
8989 +})
8990 +#else
8991 +#define DEBUGON(cond) noop
8992 +#endif
8993 +
8994 +/*
8995 + * Error code tracing facility. (Idea is borrowed from XFS code.)
8996 + *
8997 + * Suppose some strange and/or unexpected error code is returned from some
8998 + * function (for example, write(2) returns -EEXIST). It is possible to place
8999 + * a breakpoint in reiser4_write(), but by then it is too late. How do we
9000 + * find out in what particular place -EEXIST was generated first?
9001 + *
9002 + * In reiser4 all places where actual error codes are produced (that is,
9003 + * statements of the form
9004 + *
9005 + * return -EFOO; // (1), or
9006 + *
9007 + * result = -EFOO; // (2)
9008 + *
9009 + * are replaced with
9010 + *
9011 + * return RETERR(-EFOO); // (1a), and
9012 + *
9013 + * result = RETERR(-EFOO); // (2a) respectively
9014 + *
9015 + * The RETERR() macro records the error site in reiser4_context. This info is
9016 + * printed in error and warning messages. Moreover, it's possible to put a
9017 + * conditional breakpoint in return_err (low-level function called by RETERR()
9018 + * to do the actual work) to break into debugger immediately when particular
9019 + * error happens.
9020 + *
9021 + */
9022 +
9023 +#if REISER4_DEBUG
9024 +
9025 +/*
9026 + * data-type to store information about where error happened ("error site").
9027 + */
9028 +typedef struct err_site {
9029 + int code; /* error code */
9030 + const char *file; /* source file, filled by __FILE__ */
9031 + int line; /* source file line, filled by __LINE__ */
9032 +} err_site;
9033 +
9034 +extern void return_err(int code, const char *file, int line);
9035 +
9036 +/*
9037 + * fill &get_current_context()->err with error information.
9038 + */
9039 +#define RETERR(code) \
9040 +({ \
9041 + typeof(code) __code; \
9042 + \
9043 + __code = (code); \
9044 + return_err(__code, __FILE__, __LINE__); \
9045 + __code; \
9046 +})
9047 +
9048 +#else
9049 +
9050 +/*
9051 + * no-op versions of the above
9052 + */
9053 +
9054 +typedef struct err_site {
9055 +} err_site;
9056 +#define RETERR(code) code
9057 +#endif
9058 +
9059 +#if REISER4_LARGE_KEY
9060 +/*
9061 + * conditionally compile arguments only if REISER4_LARGE_KEY is on.
9062 + */
9063 +#define ON_LARGE_KEY(...) __VA_ARGS__
9064 +#else
9065 +#define ON_LARGE_KEY(...)
9066 +#endif
9067 +
9068 +/* __FS_REISER4_DEBUG_H__ */
9069 +#endif
9070 +
9071 +/* Make Linus happy.
9072 + Local variables:
9073 + c-indentation-style: "K&R"
9074 + mode-name: "LC"
9075 + c-basic-offset: 8
9076 + tab-width: 8
9077 + fill-column: 120
9078 + End:
9079 +*/
9080 Index: linux-2.6.16/fs/reiser4/dformat.h
9081 ===================================================================
9082 --- /dev/null
9083 +++ linux-2.6.16/fs/reiser4/dformat.h
9084 @@ -0,0 +1,71 @@
9085 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
9086 +
9087 +/* Formats of on-disk data and conversion functions. */
9088 +
9089 +/* put all item formats in the files describing the particular items. Our
9090 +   model is: everything you need to do to add an item to reiser4
9091 +   (excepting the changes to the plugin that uses the item, which go
9092 +   into the file defining that plugin) you put into one file. */
9093 +/* Data on disk are stored in little-endian format.
9094 + To declare fields of on-disk structures, use d8, d16, d32 and d64.
9095 +   Use d??tocpu() and cputod??() to convert. */
9096 +
9097 +#if !defined( __FS_REISER4_DFORMAT_H__ )
9098 +#define __FS_REISER4_DFORMAT_H__
9099 +
9100 +#include <asm/byteorder.h>
9101 +#include <asm/unaligned.h>
9102 +#include <linux/types.h>
9103 +
9104 +
9105 +typedef __u8 d8;
9106 +typedef __le16 d16;
9107 +typedef __le32 d32;
9108 +typedef __le64 d64;
9109 +
9110 +#define PACKED __attribute__((packed))
9111 +
9112 +/* data-type for block number */
9113 +typedef __u64 reiser4_block_nr;
9114 +
9115 +/* data-type for block number on disk, disk format */
9116 +typedef __le64 reiser4_dblock_nr;
9117 +
9118 +/**
9119 + * disk_addr_eq - compare disk addresses
9120 + * @b1: pointer to block number to compare
9121 + * @b2: pointer to block number to compare
9122 + *
9123 + * Returns true if the disk addresses are the same
9124 + */
9125 +static inline int disk_addr_eq(const reiser4_block_nr *b1,
9126 + const reiser4_block_nr * b2)
9127 +{
9128 + assert("nikita-1033", b1 != NULL);
9129 + assert("nikita-1266", b2 != NULL);
9130 +
9131 + return !memcmp(b1, b2, sizeof *b1);
9132 +}
9133 +
9134 +/* structure of master reiser4 super block */
9135 +typedef struct reiser4_master_sb {
9136 + char magic[16]; /* "ReIsEr4" */
9137 + __le16 disk_plugin_id; /* id of disk layout plugin */
9138 + __le16 blocksize;
9139 + char uuid[16]; /* unique id */
9140 + char label[16]; /* filesystem label */
9141 + __le64 diskmap; /* location of the diskmap. 0 if not present */
9142 +} reiser4_master_sb;
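+
+/* A mount-time sanity check would look roughly like this (sketch only; the
+   buffer name and error path are illustrative, not part of this patch):
+
+	reiser4_master_sb *master = (reiser4_master_sb *) bh->b_data;
+
+	if (strncmp(master->magic, "ReIsEr4", 7) != 0)
+		return -EINVAL;
+
+   i.e., anything whose magic does not read "ReIsEr4" is rejected. */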
9143 +
9144 +/* __FS_REISER4_DFORMAT_H__ */
9145 +#endif
9146 +
9147 +/*
9148 + * Local variables:
9149 + * c-indentation-style: "K&R"
9150 + * mode-name: "LC"
9151 + * c-basic-offset: 8
9152 + * tab-width: 8
9153 + * fill-column: 79
9154 + * End:
9155 + */
9156 Index: linux-2.6.16/fs/reiser4/dscale.c
9157 ===================================================================
9158 --- /dev/null
9159 +++ linux-2.6.16/fs/reiser4/dscale.c
9160 @@ -0,0 +1,174 @@
9161 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
9162 + * reiser4/README */
9163 +
9164 +/* Scalable on-disk integers */
9165 +
9166 +/*
9167 + * Various on-disk structures contain integer-like structures. Stat-data
9168 + * contain [yes, "data" is plural, check the dictionary] file size, link
9169 + * count; extent unit contains extent width etc. To accommodate the general
9170 + * case, enough space is reserved to keep the largest possible value: 64 bits
9171 + * in all cases above. But in the overwhelming majority of cases numbers actually
9172 + * stored in these fields will be comparatively small and reserving 8 bytes is
9173 + * a waste of precious disk bandwidth.
9174 + *
9175 + * Scalable integers are one way to solve this problem. dscale_write()
9176 + * function stores __u64 value in the given area consuming from 1 to 9 bytes,
9177 + * depending on the magnitude of the value supplied. dscale_read() reads value
9178 + * previously stored by dscale_write().
9179 + *
9180 + * dscale_write() produces a format not completely unlike UTF: the two highest
9181 + * bits of the first byte are used to store a "tag". One of 4 possible tag
9182 + * values is chosen depending on the number being encoded:
9183 + *
9184 + * 0 ... 0x3f => 0 [table 1]
9185 + * 0x40 ... 0x3fff => 1
9186 + * 0x4000 ... 0x3fffffff => 2
9187 + * 0x40000000 ... 0xffffffffffffffff => 3
9188 + *
9189 + * (see dscale_range() function)
9190 + *
9191 + * Values in the range 0x40000000 ... 0xffffffffffffffff require 8 full bytes
9192 + * to be stored, so in this case there is no place in the first byte to store
9193 + * tag. For such values tag is stored in an extra 9th byte.
9194 + *
9195 + * As the _highest_ bits are used for the tag (which is natural), scaled integers
9196 + * are stored in BIG-ENDIAN format, in contrast with the rest of reiser4, which
9197 + * uses LITTLE-ENDIAN.
9198 + *
9199 + */
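+
+/* Worked example (follows from table 1 and the code below): the value
+ * 0x1234 falls in the range 0x40 ... 0x3fff, so its tag is 1 and it is
+ * stored big-endian in (1 << 1) == 2 bytes, with the tag in the two
+ * highest bits of the first byte:
+ *
+ *	dscale_write(buf, 0x1234);	stores 0x52 0x34 (0x12 | (1 << 6),
+ *					then 0x34) and returns 2
+ *	dscale_read(buf, &v);		strips the tag: v == 0x1234, returns 2
+ */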
9200 +
9201 +#include "debug.h"
9202 +#include "dscale.h"
9203 +
9204 +/* return tag of scaled integer stored at @address */
9205 +static int gettag(const unsigned char *address)
9206 +{
9207 + /* tag is stored in two highest bits */
9208 + return (*address) >> 6;
9209 +}
9210 +
9211 +/* clear tag from value. Clear tag embedded into @value. */
9212 +static void cleartag(__u64 * value, int tag)
9213 +{
9214 + /*
9215 + * W-w-what ?!
9216 + *
9217 + * Actually, this is rather simple: @value passed here was read by
9218 + * dscale_read(), converted from BIG-ENDIAN, and padded to __u64 by
9219 + * zeroes. Tag is still stored in the highest (arithmetically)
9220 + * non-zero bits of @value, but relative position of tag within __u64
9221 + * depends on @tag.
9222 + *
9223 +	 * For example, if @tag is 0, it's stored in the 2 highest bits of the lowest
9224 +	 * byte, and its offset (counting from lowest bit) is 8 - 2 == 6 bits.
9225 +	 *
9226 +	 * If tag is 1, it's stored in the two highest bits of the 2nd lowest byte,
9227 +	 * and its offset is (2 * 8) - 2 == 14 bits.
9228 + *
9229 + * See table 1 above for details.
9230 + *
9231 + * All these cases are captured by the formula:
9232 + */
9233 + *value &= ~(3 << (((1 << tag) << 3) - 2));
9234 + /*
9235 + * That is, clear two (3 == 0t11) bits at the offset
9236 + *
9237 + * 8 * (2 ^ tag) - 2,
9238 + *
9239 + * that is, two highest bits of (2 ^ tag)-th byte of @value.
9240 + */
9241 +}
9242 +
9243 +/* return tag for @value. See table 1 above for details. */
9244 +static int dscale_range(__u64 value)
9245 +{
9246 + if (value > 0x3fffffff)
9247 + return 3;
9248 + if (value > 0x3fff)
9249 + return 2;
9250 + if (value > 0x3f)
9251 + return 1;
9252 + return 0;
9253 +}
9254 +
9255 +/* restore value stored at @address by dscale_write() and return number of
9256 + * bytes consumed */
9257 +int dscale_read(unsigned char *address, __u64 * value)
9258 +{
9259 + int tag;
9260 +
9261 + /* read tag */
9262 + tag = gettag(address);
9263 + switch (tag) {
9264 + case 3:
9265 + /* In this case tag is stored in an extra byte, skip this byte
9266 + * and decode value stored in the next 8 bytes.*/
9267 + *value = __be64_to_cpu(get_unaligned((__be64 *)(address + 1)));
9268 + /* worst case: 8 bytes for value itself plus one byte for
9269 + * tag. */
9270 + return 9;
9271 + case 0:
9272 + *value = get_unaligned(address);
9273 + break;
9274 + case 1:
9275 + *value = __be16_to_cpu(get_unaligned((__be16 *)address));
9276 + break;
9277 + case 2:
9278 + *value = __be32_to_cpu(get_unaligned((__be32 *)address));
9279 + break;
9280 + default:
9281 + return RETERR(-EIO);
9282 + }
9283 + /* clear tag embedded into @value */
9284 + cleartag(value, tag);
9285 + /* number of bytes consumed is (2 ^ tag)---see table 1. */
9286 + return 1 << tag;
9287 +}
9288 +
9289 +/* store @value at @address and return number of bytes consumed */
9290 +int dscale_write(unsigned char *address, __u64 value)
9291 +{
9292 + int tag;
9293 + int shift;
9294 + __be64 v;
9295 + unsigned char *valarr;
9296 +
9297 + tag = dscale_range(value);
9298 + v = __cpu_to_be64(value);
9299 + valarr = (unsigned char *)&v;
9300 + shift = (tag == 3) ? 1 : 0;
9301 + memcpy(address + shift, valarr + sizeof v - (1 << tag), 1 << tag);
9302 + *address |= (tag << 6);
9303 + return shift + (1 << tag);
9304 +}
9305 +
9306 +/* number of bytes required to store @value */
9307 +int dscale_bytes(__u64 value)
9308 +{
9309 + int bytes;
9310 +
9311 + bytes = 1 << dscale_range(value);
9312 + if (bytes == 8)
9313 + ++bytes;
9314 + return bytes;
9315 +}
9316 +
9317 +/* returns true if @value and @other require the same number of bytes to be
9318 + * stored. Used to detect when a data structure (like stat-data) has to be
9319 + * expanded or contracted. */
9320 +int dscale_fit(__u64 value, __u64 other)
9321 +{
9322 + return dscale_range(value) == dscale_range(other);
9323 +}
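+
+/* Usage sketch (hypothetical caller, illustration only): when a stat-data
+ * field changes from @old to @new, the item only has to be resized when the
+ * encoded widths differ:
+ *
+ *	if (!dscale_fit(old, new))
+ *		resize_by = dscale_bytes(new) - dscale_bytes(old);
+ */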
9324 +
9325 +/* Make Linus happy.
9326 + Local variables:
9327 + c-indentation-style: "K&R"
9328 + mode-name: "LC"
9329 + c-basic-offset: 8
9330 + tab-width: 8
9331 + fill-column: 120
9332 + scroll-step: 1
9333 + End:
9334 +*/
9335 Index: linux-2.6.16/fs/reiser4/dscale.h
9336 ===================================================================
9337 --- /dev/null
9338 +++ linux-2.6.16/fs/reiser4/dscale.h
9339 @@ -0,0 +1,27 @@
9340 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
9341 + * reiser4/README */
9342 +
9343 +/* Scalable on-disk integers. See dscale.c for details. */
9344 +
9345 +#if !defined( __FS_REISER4_DSCALE_H__ )
9346 +#define __FS_REISER4_DSCALE_H__
9347 +
9348 +#include "dformat.h"
9349 +
9350 +extern int dscale_read(unsigned char *address, __u64 * value);
9351 +extern int dscale_write(unsigned char *address, __u64 value);
9352 +extern int dscale_bytes(__u64 value);
9353 +extern int dscale_fit(__u64 value, __u64 other);
9354 +
9355 +/* __FS_REISER4_DSCALE_H__ */
9356 +#endif
9357 +
9358 +/* Make Linus happy.
9359 + Local variables:
9360 + c-indentation-style: "K&R"
9361 + mode-name: "LC"
9362 + c-basic-offset: 8
9363 + tab-width: 8
9364 + fill-column: 120
9365 + End:
9366 +*/
9367 Index: linux-2.6.16/fs/reiser4/entd.c
9368 ===================================================================
9369 --- /dev/null
9370 +++ linux-2.6.16/fs/reiser4/entd.c
9371 @@ -0,0 +1,356 @@
9372 +/* Copyright 2003, 2004 by Hans Reiser, licensing governed by
9373 + * reiser4/README */
9374 +
9375 +/* Ent daemon. */
9376 +
9377 +#include "debug.h"
9378 +#include "txnmgr.h"
9379 +#include "tree.h"
9380 +#include "entd.h"
9381 +#include "super.h"
9382 +#include "context.h"
9383 +#include "reiser4.h"
9384 +#include "vfs_ops.h"
9385 +#include "page_cache.h"
9386 +#include "inode.h"
9387 +
9388 +#include <linux/sched.h> /* struct task_struct */
9389 +#include <linux/suspend.h>
9390 +#include <linux/kernel.h>
9391 +#include <linux/writeback.h>
9392 +#include <linux/time.h> /* INITIAL_JIFFIES */
9393 +#include <linux/backing-dev.h> /* bdi_write_congested */
9394 +#include <linux/wait.h>
9395 +#include <linux/kthread.h>
9396 +
9397 +#define LLONG_MAX ((long long)(~0ULL>>1))
9398 +
9399 +#define DEF_PRIORITY 12
9400 +#define MAX_ENTD_ITERS 10
9401 +
9402 +static void entd_flush(struct super_block *, struct wbq *);
9403 +static int entd(void *arg);
9404 +
9405 +/*
9406 + * set ->comm field of ent thread to make its state visible to the user level
9407 + */
9408 +#define entd_set_comm(state) \
9409 + snprintf(current->comm, sizeof(current->comm), \
9410 + "ent:%s%s", super->s_id, (state))
9411 +
9412 +/**
9413 + * init_entd - initialize entd context and start kernel daemon
9414 + * @super: super block to start ent thread for
9415 + *
9416 + * Creates the entd context, starts the kernel thread and waits until it
9417 + * initializes.
9418 + */
9419 +int init_entd(struct super_block *super)
9420 +{
9421 + entd_context *ctx;
9422 +
9423 + assert("nikita-3104", super != NULL);
9424 +
9425 + ctx = get_entd_context(super);
9426 +
9427 + memset(ctx, 0, sizeof *ctx);
9428 + spin_lock_init(&ctx->guard);
9429 + init_waitqueue_head(&ctx->wait);
9430 +#if REISER4_DEBUG
9431 + INIT_LIST_HEAD(&ctx->flushers_list);
9432 +#endif
9433 + /* lists of writepage requests */
9434 + INIT_LIST_HEAD(&ctx->todo_list);
9435 + INIT_LIST_HEAD(&ctx->done_list);
9436 + /* start entd */
9437 + ctx->tsk = kthread_run(entd, super, "ent:%s", super->s_id);
9438 + if (IS_ERR(ctx->tsk))
9439 + return PTR_ERR(ctx->tsk);
9440 + return 0;
9441 +}
9442 +
9443 +static void __put_wbq(entd_context *ent, struct wbq *rq)
9444 +{
9445 + up(&rq->sem);
9446 +}
9447 +
9448 +/* ent should be locked */
9449 +static struct wbq *__get_wbq(entd_context * ent)
9450 +{
9451 + struct wbq *wbq;
9452 +
9453 + if (list_empty_careful(&ent->todo_list))
9454 + return NULL;
9455 +
9456 + ent->nr_todo_reqs --;
9457 + wbq = list_entry(ent->todo_list.next, struct wbq, link);
9458 + list_del_init(&wbq->link);
9459 + return wbq;
9460 +}
9461 +
9462 +static void wakeup_all_wbq(entd_context * ent)
9463 +{
9464 + struct wbq *rq;
9465 +
9466 + spin_lock(&ent->guard);
9467 + while ((rq = __get_wbq(ent)) != NULL)
9468 + __put_wbq(ent, rq);
9469 + spin_unlock(&ent->guard);
9470 +}
9471 +
9472 +/* ent thread function */
9473 +static int entd(void *arg)
9474 +{
9475 + struct super_block *super;
9476 + entd_context *ent;
9477 + int done = 0;
9478 +
9479 + super = arg;
9480 + /* do_fork() just copies task_struct into the new
9481 + thread. ->fs_context shouldn't be copied of course. This shouldn't
9482 + be a problem for the rest of the code though.
9483 + */
9484 + current->journal_info = NULL;
9485 +
9486 + ent = get_entd_context(super);
9487 +
9488 + while (!done) {
9489 + try_to_freeze();
9490 +
9491 + spin_lock(&ent->guard);
9492 + while (ent->nr_todo_reqs != 0) {
9493 + struct wbq *rq, *next;
9494 +
9495 + assert("", list_empty_careful(&ent->done_list));
9496 +
9497 + /* take request from the queue head */
9498 + rq = __get_wbq(ent);
9499 + assert("", rq != NULL);
9500 + ent->cur_request = rq;
9501 + spin_unlock(&ent->guard);
9502 +
9503 + entd_set_comm("!");
9504 + entd_flush(super, rq);
9505 +
9506 + iput(rq->mapping->host);
9507 + up(&(rq->sem));
9508 +
9509 + /*
9510 + * wakeup all requestors and iput their inodes
9511 + */
9512 + spin_lock(&ent->guard);
9513 + list_for_each_entry_safe(rq, next, &ent->done_list, link) {
9514 + list_del_init(&(rq->link));
9515 + ent->nr_done_reqs --;
9516 + spin_unlock(&ent->guard);
9517 +
9518 + assert("", rq->written == 1);
9519 + iput(rq->mapping->host);
9520 + up(&(rq->sem));
9521 + spin_lock(&ent->guard);
9522 + }
9523 + }
9524 + spin_unlock(&ent->guard);
9525 +
9526 + entd_set_comm(".");
9527 +
9528 + {
9529 + DEFINE_WAIT(__wait);
9530 +
9531 + do {
9532 + prepare_to_wait(&ent->wait, &__wait, TASK_INTERRUPTIBLE);
9533 + if (kthread_should_stop()) {
9534 + done = 1;
9535 + break;
9536 + }
9537 + if (ent->nr_todo_reqs != 0)
9538 + break;
9539 + schedule();
9540 + } while (0);
9541 + finish_wait(&ent->wait, &__wait);
9542 + }
9543 + }
9544 + spin_lock(&ent->guard);
9545 + BUG_ON(ent->nr_todo_reqs != 0);
9546 + spin_unlock(&ent->guard);
9547 + wakeup_all_wbq(ent);
9548 + return 0;
9549 +}
9550 +
9551 +/**
9552 + * done_entd - stop entd kernel thread
9553 + * @super: super block to stop ent thread for
9554 + *
9555 + * It is called on umount. Sends stop signal to entd and waits until it handles
9556 + * it.
9557 + */
9558 +void done_entd(struct super_block *super)
9559 +{
9560 + entd_context *ent;
9561 +
9562 + assert("nikita-3103", super != NULL);
9563 +
9564 + ent = get_entd_context(super);
9565 + assert("zam-1055", ent->tsk != NULL);
9566 + kthread_stop(ent->tsk);
9567 +}
9568 +
9569 +/* called at the beginning of jnode_flush to register flusher thread with ent
9570 + * daemon */
9571 +void enter_flush(struct super_block *super)
9572 +{
9573 + entd_context *ent;
9574 +
9575 + assert("zam-1029", super != NULL);
9576 + ent = get_entd_context(super);
9577 +
9578 + assert("zam-1030", ent != NULL);
9579 +
9580 + spin_lock(&ent->guard);
9581 + ent->flushers++;
9582 +#if REISER4_DEBUG
9583 + list_add(&get_current_context()->flushers_link, &ent->flushers_list);
9584 +#endif
9585 + spin_unlock(&ent->guard);
9586 +}
9587 +
9588 +/* called at the end of jnode_flush */
9589 +void leave_flush(struct super_block *super)
9590 +{
9591 + entd_context *ent;
9592 + int wake_up_ent;
9593 +
9594 + assert("zam-1027", super != NULL);
9595 + ent = get_entd_context(super);
9596 +
9597 + assert("zam-1028", ent != NULL);
9598 +
9599 + spin_lock(&ent->guard);
9600 + ent->flushers--;
9601 + wake_up_ent = (ent->flushers == 0 && ent->nr_todo_reqs != 0);
9602 +#if REISER4_DEBUG
9603 + list_del_init(&get_current_context()->flushers_link);
9604 +#endif
9605 + spin_unlock(&ent->guard);
9606 + if (wake_up_ent)
9607 + wake_up(&ent->wait);
9608 +}
9609 +
9610 +#define ENTD_CAPTURE_APAGE_BURST SWAP_CLUSTER_MAX
9611 +
9612 +static void entd_flush(struct super_block *super, struct wbq *rq)
9613 +{
9614 + reiser4_context ctx;
9615 + int tmp;
9616 +
9617 + init_stack_context(&ctx, super);
9618 + ctx.entd = 1;
9619 + ctx.gfp_mask = GFP_NOFS;
9620 +
9621 + rq->wbc->start = rq->page->index << PAGE_CACHE_SHIFT;
9622 + rq->wbc->end = (rq->page->index + ENTD_CAPTURE_APAGE_BURST) << PAGE_CACHE_SHIFT;
9623 + tmp = rq->wbc->nr_to_write;
9624 + rq->mapping->a_ops->writepages(rq->mapping, rq->wbc);
9625 +
9626 + if (rq->wbc->nr_to_write > 0) {
9627 + rq->wbc->start = 0;
9628 + rq->wbc->end = LLONG_MAX;
9629 + generic_sync_sb_inodes(super, rq->wbc);
9630 + }
9631 + rq->wbc->nr_to_write = ENTD_CAPTURE_APAGE_BURST;
9632 + writeout(super, rq->wbc);
9633 +
9634 + context_set_commit_async(&ctx);
9635 + reiser4_exit_context(&ctx);
9636 +}
9637 +
9638 +/**
9639 + * write_page_by_ent - ask entd thread to flush this page as part of slum
9640 + * @page: page to be written
9641 + * @wbc: writeback control passed to reiser4_writepage
9642 + *
9643 + * Creates a request, puts it on entd list of requests, wakes up entd if
9644 + * necessary, waits until entd completes with the request.
9645 + */
9646 +int write_page_by_ent(struct page *page, struct writeback_control *wbc)
9647 +{
9648 + struct super_block *sb;
9649 + struct inode *inode;
9650 + entd_context *ent;
9651 + struct wbq rq;
9652 +
9653 + assert("", PageLocked(page));
9654 + assert("", page->mapping != NULL);
9655 +
9656 + sb = page->mapping->host->i_sb;
9657 + ent = get_entd_context(sb);
9658 + assert("", ent && ent->done == 0);
9659 +
9660 + /*
9661 + * we are going to unlock page and ask ent thread to write the
9662 + * page. Re-dirty page before unlocking so that if ent thread fails to
9663 +	 * write it, it will remain dirty
9664 + */
9665 + set_page_dirty_internal(page);
9666 +
9667 + /*
9668 +	 * pin inode in memory, unlock page, entd_flush will iput. We cannot
9669 +	 * iput here because we cannot allow delete_inode to be called here
9670 + */
9671 + inode = igrab(page->mapping->host);
9672 + unlock_page(page);
9673 + if (inode == NULL)
9674 + /* inode is getting freed */
9675 + return 0;
9676 +
9677 + /* init wbq */
9678 + INIT_LIST_HEAD(&rq.link);
9679 + rq.magic = WBQ_MAGIC;
9680 + rq.wbc = wbc;
9681 + rq.page = page;
9682 + rq.mapping = inode->i_mapping;
9683 + rq.node = NULL;
9684 + rq.written = 0;
9685 + sema_init(&rq.sem, 0);
9686 +
9687 + /* add request to entd's list of writepage requests */
9688 + spin_lock(&ent->guard);
9689 + ent->nr_todo_reqs++;
9690 + list_add_tail(&rq.link, &ent->todo_list);
9691 + if (ent->nr_todo_reqs == 1)
9692 + wake_up(&ent->wait);
9693 +
9694 + spin_unlock(&ent->guard);
9695 +
9696 + /* wait until entd finishes */
9697 + down(&rq.sem);
9698 +
9699 + /*
9700 +	 * spin until the entd thread which did up(&rq.sem) no longer
9701 +	 * needs rq
9702 + */
9703 + spin_lock(&ent->guard);
9704 + spin_unlock(&ent->guard);
9705 +
9706 + if (rq.written)
9707 + /* Eventually ENTD has written the page to disk. */
9708 + return 0;
9709 + return 0;
9710 +}
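+
+/* Caller-side sketch (illustration only; the real caller is
+ * reiser4_writepage, which is not part of this hunk): a ->writepage
+ * implementation hands the locked, dirty page over to the ent thread:
+ *
+ *	static int reiser4_writepage(struct page *page,
+ *				     struct writeback_control *wbc)
+ *	{
+ *		return write_page_by_ent(page, wbc);
+ *	}
+ */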
9711 +
9712 +int wbq_available(void)
9713 +{
9714 + struct super_block *sb = reiser4_get_current_sb();
9715 + entd_context *ent = get_entd_context(sb);
9716 + return ent->nr_todo_reqs;
9717 +}
9718 +
9719 +/*
9720 + * Local variables:
9721 + * c-indentation-style: "K&R"
9722 + * mode-name: "LC"
9723 + * c-basic-offset: 8
9724 + * tab-width: 8
9725 + * fill-column: 79
9726 + * End:
9727 + */
9728 Index: linux-2.6.16/fs/reiser4/entd.h
9729 ===================================================================
9730 --- /dev/null
9731 +++ linux-2.6.16/fs/reiser4/entd.h
9732 @@ -0,0 +1,90 @@
9733 +/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */
9734 +
9735 +/* Ent daemon. */
9736 +
9737 +#ifndef __ENTD_H__
9738 +#define __ENTD_H__
9739 +
9740 +#include "context.h"
9741 +
9742 +#include <linux/fs.h>
9743 +#include <linux/completion.h>
9744 +#include <linux/wait.h>
9745 +#include <linux/spinlock.h>
9746 +#include <linux/sched.h> /* for struct task_struct */
9747 +
9748 +#define WBQ_MAGIC 0x7876dc76
9749 +
9750 +/* write-back request. */
9751 +struct wbq {
9752 + int magic;
9753 + struct list_head link; /* list head of this list is in entd context */
9754 + struct writeback_control *wbc;
9755 + struct page *page;
9756 + struct address_space *mapping;
9757 + struct semaphore sem;
9758 + jnode *node; /* set if ent thread captured requested page */
9759 + int written; /* set if ent thread wrote requested page */
9760 +};
9761 +
9762 +/* ent-thread context. This is used to synchronize starting/stopping ent
9763 + * threads. */
9764 +typedef struct entd_context {
9765 + /* wait queue that ent thread waits on for more work. It's
9766 + * signaled by write_page_by_ent(). */
9767 + wait_queue_head_t wait;
9768 + /* spinlock protecting other fields */
9769 + spinlock_t guard;
9770 + /* ent thread */
9771 + struct task_struct *tsk;
9772 + /* set to indicate that ent thread should leave. */
9773 + int done;
9774 + /* counter of active flushers */
9775 + int flushers;
9776 + /*
9777 + * when reiser4_writepage asks entd to write a page - it adds struct
9778 + * wbq to this list
9779 + */
9780 + struct list_head todo_list;
9781 + /* number of elements on the above list */
9782 + int nr_todo_reqs;
9783 +
9784 + struct wbq *cur_request;
9785 + /*
9786 + * when entd writes a page it moves write-back request from todo_list
9787 + * to done_list. This list is used at the end of entd iteration to
9788 +	 * wake up requestors and iput inodes.
9789 + */
9790 + struct list_head done_list;
9791 + /* number of elements on the above list */
9792 + int nr_done_reqs;
9793 +
9794 +#if REISER4_DEBUG
9795 + /* list of all active flushers */
9796 + struct list_head flushers_list;
9797 +#endif
9798 +} entd_context;
9799 +
9800 +extern int init_entd(struct super_block *);
9801 +extern void done_entd(struct super_block *);
9802 +
9803 +extern void enter_flush(struct super_block *);
9804 +extern void leave_flush(struct super_block *);
9805 +
9806 +extern int write_page_by_ent(struct page *, struct writeback_control *);
9807 +extern int wbq_available(void);
9808 +extern void ent_writes_page(struct super_block *, struct page *);
9809 +
9810 +extern jnode *get_jnode_by_wbq(struct super_block *, struct wbq *);
9811 +/* __ENTD_H__ */
9812 +#endif
9813 +
9814 +/* Make Linus happy.
9815 + Local variables:
9816 + c-indentation-style: "K&R"
9817 + mode-name: "LC"
9818 + c-basic-offset: 8
9819 + tab-width: 8
9820 + fill-column: 120
9821 + End:
9822 +*/
9823 Index: linux-2.6.16/fs/reiser4/eottl.c
9824 ===================================================================
9825 --- /dev/null
9826 +++ linux-2.6.16/fs/reiser4/eottl.c
9827 @@ -0,0 +1,510 @@
9828 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
9829 +
9830 +#include "forward.h"
9831 +#include "debug.h"
9832 +#include "key.h"
9833 +#include "coord.h"
9834 +#include "plugin/item/item.h"
9835 +#include "plugin/node/node.h"
9836 +#include "znode.h"
9837 +#include "block_alloc.h"
9838 +#include "tree_walk.h"
9839 +#include "tree_mod.h"
9840 +#include "carry.h"
9841 +#include "tree.h"
9842 +#include "super.h"
9843 +
9844 +#include <linux/types.h> /* for __u?? */
9845 +
9846 +/*
9847 + * Extents on the twig level (EOTTL) handling.
9848 + *
9849 + * EOTTL poses some problems to the tree traversal, that are better explained
9850 + * EOTTL poses some problems for tree traversal that are best explained
9851 + * by example.
9852 + * Suppose we have block B1 on the twig level with the following items:
9853 + *
9854 + * 0. internal item I0 with key (0:0:0:0) (locality, key-type, object-id,
9855 + * offset)
9856 + * 1. extent item E1 with key (1:4:100:0), having 10 blocks of 4k each
9857 + * 2. internal item I2 with key (10:0:0:0)
9858 + *
9859 + * We are trying to insert an item with key (5:0:0:0). Lookup finds node B1,
9860 + * and then an intra-node lookup is done. This lookup finishes on E1, because
9861 + * the key we are looking for is larger than the key of E1 and smaller than
9862 + * the key of I2.
9863 + *
9864 + * Here search is stuck.
9865 + *
9866 + * After some thought it is clear what is wrong here: extents on the twig
9867 + * level break a basic property of the *search* tree (on the pretext that they
9868 + * restore the balanced-tree property).
9869 + *
9870 + * Said property is the following: if in the internal node of the search tree
9871 + * we have [ ... Key1 Pointer Key2 ... ] then, all data that are or will be
9872 + * keyed in the tree with the Key such that Key1 <= Key < Key2 are accessible
9873 + * through the Pointer.
9874 + *
9875 + * This is not true when Pointer is an Extent-Pointer, simply because an extent
9876 + * cannot expand indefinitely to the right to include any item with
9877 + *
9878 + * Key1 <= Key <= Key2.
9879 + *
9880 + * For example, our E1 extent is only responsible for the data with keys
9881 + *
9882 + * (1:4:100:0) <= key <= (1:4:100:0xffffffffffffffff), and
9883 + *
9884 + * so, key range
9885 + *
9886 + * ( (1:4:100:0xffffffffffffffff), (10:0:0:0) )
9887 + *
9888 + * is orphaned: there is no way to get there from the tree root.
9889 + *
9890 + * In other words, extent pointers are different from normal child pointers as
9891 + * far as the search tree is concerned, and this creates such problems.
9892 + *
9893 + * A possible solution for this problem is to insert our item into the node
9894 + * pointed to by I2. There are some problems, though:
9895 + *
9896 + * (1) I2 can be in a different node.
9897 + * (2) E1 can be immediately followed by another extent E2.
9898 + *
9899 + * (1) is solved by calling reiser4_get_right_neighbor() and accounting
9900 + * for locks/coords as necessary.
9901 + *
9902 + * (2) is more complex. Solution here is to insert new empty leaf node and
9903 + * insert internal item between E1 and E2 pointing to said leaf node. This is
9904 + * further complicated by possibility that E2 is in a different node, etc.
9905 + *
9906 + * Problems:
9907 + *
9908 + * (1) if there is an internal item I2 immediately on the right of an extent
9909 + * E1 and we decide to insert a new item S1 into the node N2 pointed to by I2,
9910 + * then the key of S1 will be less than the smallest key in N2. Normally, the
9911 + * search checks that the key we are looking for is within the range of keys
9912 + * covered by the node being searched. To work around this situation, while
9913 + * preserving this useful consistency check, a new flag CBK_TRUST_DK was added
9914 + * to the cbk flags bitmask. This flag is automatically set on entrance to
9915 + * coord_by_key() and is only cleared when we are about to enter the situation
9916 + * described above.
9917 + *
9918 + * (2) If extent E1 is immediately followed by another extent E2 and we are
9919 + * searching for a key that is between E1 and E2, we only have to insert a new
9920 + * empty leaf node when coord_by_key was called for insertion, rather than
9921 + * just for lookup. To distinguish these cases, a new flag CBK_FOR_INSERT was
9922 + * added to the cbk flags bitmask. This flag is automatically set by
9923 + * coord_by_key calls performed by insert_by_key() and friends.
9924 + *
9925 + * (3) Insertion of a new empty leaf node (possibly) requires balancing. In
9926 + * any case it requires modification of node content, which is only possible
9927 + * under a write lock. It may well happen that we only have a read lock on the
9928 + * node where the new internal pointer is to be inserted (common case: lookup
9929 + * of a non-existent stat-data that falls between two extents). If only a read
9930 + * lock is held, tree traversal is restarted with lock_level modified so that
9931 + * the next time we hit this problem, a write lock will be held. Once we have
9932 + * a write lock, balancing will be performed.
9933 + */
9934 +
9935 +/**
9936 + * is_next_item_internal - check whether next item is internal
9937 + * @coord: coordinate of extent item in twig node
9938 + * @key: search key
9939 + * @lh: twig node lock handle
9940 + *
9941 + * Looks at the unit next to @coord. If it is an internal one, 1 is returned
9942 + * and @coord is set to that unit. If that unit is in the right neighbor, @lh
9943 + * is moved to that node and @coord is set to its first unit. If the next item
9944 + * is not internal or does not exist then 0 is returned and @coord and @lh are
9945 + * left unchanged. 2 is returned if a search restart has to be done.
9946 + */
9947 +static int
9948 +is_next_item_internal(coord_t *coord, const reiser4_key *key,
9949 + lock_handle *lh)
9950 +{
9951 + coord_t next;
9952 + lock_handle rn;
9953 + int result;
9954 +
9955 + coord_dup(&next, coord);
9956 + if (coord_next_unit(&next) == 0) {
9957 + /* next unit is in this node */
9958 + if (item_is_internal(&next)) {
9959 + coord_dup(coord, &next);
9960 + return 1;
9961 + }
9962 + assert("vs-3", item_is_extent(&next));
9963 + return 0;
9964 + }
9965 +
9966 + /*
9967 +	 * next unit either does not exist or is in the right neighbor. If it
9968 +	 * is in the right neighbor we have to check the right delimiting key
9969 +	 * because a concurrent thread could get there first and insert an item
9970 +	 * with a key smaller than @key
9971 + */
9972 + read_lock_dk(current_tree);
9973 + result = keycmp(key, znode_get_rd_key(coord->node));
9974 + read_unlock_dk(current_tree);
9975 + assert("vs-6", result != EQUAL_TO);
9976 + if (result == GREATER_THAN)
9977 + return 2;
9978 +
9979 + /* lock right neighbor */
9980 + init_lh(&rn);
9981 + result = reiser4_get_right_neighbor(&rn, coord->node,
9982 + znode_is_wlocked(coord->node) ?
9983 + ZNODE_WRITE_LOCK : ZNODE_READ_LOCK,
9984 + GN_CAN_USE_UPPER_LEVELS);
9985 + if (result == -E_NO_NEIGHBOR) {
9986 + /* we are on the rightmost edge of the tree */
9987 + done_lh(&rn);
9988 + return 0;
9989 + }
9990 +
9991 + if (result) {
9992 + assert("vs-4", result < 0);
9993 + done_lh(&rn);
9994 + return result;
9995 + }
9996 +
9997 + /*
9998 + * check whether concurrent thread managed to insert item with a key
9999 + * smaller than @key
10000 + */
10001 + read_lock_dk(current_tree);
10002 + result = keycmp(key, znode_get_ld_key(rn.node));
10003 + read_unlock_dk(current_tree);
10004 + assert("vs-6", result != EQUAL_TO);
10005 + if (result == GREATER_THAN) {
10006 + done_lh(&rn);
10007 + return 2;
10008 + }
10009 +
10010 + result = zload(rn.node);
10011 + if (result) {
10012 + assert("vs-5", result < 0);
10013 + done_lh(&rn);
10014 + return result;
10015 + }
10016 +
10017 + coord_init_first_unit(&next, rn.node);
10018 + if (item_is_internal(&next)) {
10019 + /*
10020 +		 * next unit is in the right neighbor and is a unit of an
10021 +		 * internal item. Unlock coord->node. Move @lh to the right
10022 +		 * neighbor. @coord is set to the first unit of the right neighbor.
10023 + */
10024 + coord_dup(coord, &next);
10025 + zrelse(rn.node);
10026 + done_lh(lh);
10027 + move_lh(lh, &rn);
10028 + return 1;
10029 + }
10030 +
10031 + /*
10032 +	 * next unit is a unit of an extent item. Return without changing @lh and
10033 + * @coord.
10034 + */
10035 + assert("vs-6", item_is_extent(&next));
10036 + zrelse(rn.node);
10037 + done_lh(&rn);
10038 + return 0;
10039 +}
10040 +
10041 +/**
10042 + * rd_key - calculate key of an item next to the given one
10043 + * @coord: position in a node
10044 + * @key: storage for result key
10045 + *
10046 + * @coord is set between items or after the last item in a node. Calculate key
10047 + * of item to the right of @coord.
10048 + */
10049 +static reiser4_key *rd_key(const coord_t *coord, reiser4_key *key)
10050 +{
10051 + coord_t dup;
10052 +
10053 + assert("nikita-2281", coord_is_between_items(coord));
10054 + coord_dup(&dup, coord);
10055 +
10056 + if (coord_set_to_right(&dup) == 0)
10057 + /* next item is in this node. Return its key. */
10058 + unit_key_by_coord(&dup, key);
10059 + else {
10060 + /*
10061 + * next item either does not exist or is in right
10062 + * neighbor. Return znode's right delimiting key.
10063 + */
10064 + read_lock_dk(current_tree);
10065 + *key = *znode_get_rd_key(coord->node);
10066 + read_unlock_dk(current_tree);
10067 + }
10068 + return key;
10069 +}
10070 +
10071 +/**
10072 + * add_empty_leaf - insert empty leaf between two extents
10073 + * @insert_coord: position in twig node between two extents
10074 + * @lh: twig node lock handle
10075 + * @key: left delimiting key of new node
10076 + * @rdkey: right delimiting key of new node
10077 + *
10078 + * Inserts empty leaf node between two extent items. It is necessary when we
10079 + * have to insert an item on leaf level between two extents (items on the twig
10080 + * level).
10081 + */
10082 +static int
10083 +add_empty_leaf(coord_t *insert_coord, lock_handle *lh,
10084 + const reiser4_key *key, const reiser4_key *rdkey)
10085 +{
10086 + int result;
10087 + carry_pool *pool;
10088 + carry_level *todo;
10089 + reiser4_item_data *item;
10090 + carry_insert_data *cdata;
10091 + carry_op *op;
10092 + znode *node;
10093 + reiser4_tree *tree;
10094 +
10095 + assert("vs-49827", znode_contains_key_lock(insert_coord->node, key));
10096 + tree = znode_get_tree(insert_coord->node);
10097 + node = new_node(insert_coord->node, LEAF_LEVEL);
10098 + if (IS_ERR(node))
10099 + return PTR_ERR(node);
10100 +
10101 + /* setup delimiting keys for node being inserted */
10102 + write_lock_dk(tree);
10103 + znode_set_ld_key(node, key);
10104 + znode_set_rd_key(node, rdkey);
10105 + ON_DEBUG(node->creator = current);
10106 + ON_DEBUG(node->first_key = *key);
10107 + write_unlock_dk(tree);
10108 +
10109 + ZF_SET(node, JNODE_ORPHAN);
10110 +
10111 + /*
10112 + * allocate carry_pool, 3 carry_level-s, reiser4_item_data and
10113 + * carry_insert_data
10114 + */
10115 + pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*todo) +
10116 + sizeof(*item) + sizeof(*cdata));
10117 + if (IS_ERR(pool))
10118 + return PTR_ERR(pool);
10119 + todo = (carry_level *) (pool + 1);
10120 + init_carry_level(todo, pool);
10121 +
10122 + item = (reiser4_item_data *) (todo + 3);
10123 + cdata = (carry_insert_data *) (item + 1);
10124 +
10125 + op = post_carry(todo, COP_INSERT, insert_coord->node, 0);
10126 + if (!IS_ERR(op)) {
10127 + cdata->coord = insert_coord;
10128 + cdata->key = key;
10129 + cdata->data = item;
10130 + op->u.insert.d = cdata;
10131 + op->u.insert.type = COPT_ITEM_DATA;
10132 + build_child_ptr_data(node, item);
10133 + item->arg = NULL;
10134 +		/* have @insert_coord be set at the inserted item after
10135 +		   insertion is done */
10136 + todo->track_type = CARRY_TRACK_CHANGE;
10137 + todo->tracked = lh;
10138 +
10139 + result = carry(todo, NULL);
10140 + if (result == 0) {
10141 + /*
10142 + * pin node in memory. This is necessary for
10143 + * znode_make_dirty() below.
10144 + */
10145 + result = zload(node);
10146 + if (result == 0) {
10147 + lock_handle local_lh;
10148 +
10149 + /*
10150 + * if we inserted new child into tree we have
10151 + * to mark it dirty so that flush will be able
10152 + * to process it.
10153 + */
10154 + init_lh(&local_lh);
10155 + result = longterm_lock_znode(&local_lh, node,
10156 + ZNODE_WRITE_LOCK,
10157 + ZNODE_LOCK_LOPRI);
10158 + if (result == 0) {
10159 + znode_make_dirty(node);
10160 +
10161 + /*
10162 + * when internal item pointing to @node
10163 + * was inserted into twig node
10164 + * create_hook_internal did not connect
10165 + * it properly because its right
10166 + * neighbor was not known. Do it
10167 + * here
10168 + */
10169 + write_lock_tree(tree);
10170 + assert("nikita-3312",
10171 + znode_is_right_connected(node));
10172 + assert("nikita-2984",
10173 + node->right == NULL);
10174 + ZF_CLR(node, JNODE_RIGHT_CONNECTED);
10175 + write_unlock_tree(tree);
10176 + result =
10177 + connect_znode(insert_coord, node);
10178 + if (result == 0)
10179 + ON_DEBUG(check_dkeys(node));
10180 +
10181 + done_lh(lh);
10182 + move_lh(lh, &local_lh);
10183 + assert("vs-1676", node_is_empty(node));
10184 + coord_init_first_unit(insert_coord,
10185 + node);
10186 + } else {
10187 + warning("nikita-3136",
10188 + "Cannot lock child");
10189 + }
10190 + done_lh(&local_lh);
10191 + zrelse(node);
10192 + }
10193 + }
10194 + } else
10195 + result = PTR_ERR(op);
10196 + zput(node);
10197 + done_carry_pool(pool);
10198 + return result;
10199 +}
10200 +
10201 +/**
10202 + * handle_eottl - handle extent-on-the-twig-level cases in tree traversal
10203 + * @h: search handle
10204 + * @outcome: flag saying whether search has to restart or is done
10205 + *
10206 + * Handles the search on the twig level. If this function completes the search
10207 + * itself then it returns 1. If the search has to go one level down then 0 is
10208 + * returned. If an error happens then LOOKUP_DONE is returned via @outcome and
10209 + * the error code is saved in @h->result.
10210 + */
10211 +int handle_eottl(cbk_handle *h, int *outcome)
10212 +{
10213 + int result;
10214 + reiser4_key key;
10215 + coord_t *coord;
10216 +
10217 + coord = h->coord;
10218 +
10219 + if (h->level != TWIG_LEVEL ||
10220 + (coord_is_existing_item(coord) && item_is_internal(coord))) {
10221 + /* Continue to traverse tree downward. */
10222 + return 0;
10223 + }
10224 +
10225 + /*
10226 + * make sure that @h->coord is set to twig node and that it is either
10227 + * set to extent item or after extent item
10228 + */
10229 + assert("vs-356", h->level == TWIG_LEVEL);
10230 + assert("vs-357", ( {
10231 + coord_t lcoord;
10232 + coord_dup(&lcoord, coord);
10233 + check_me("vs-733", coord_set_to_left(&lcoord) == 0);
10234 + item_is_extent(&lcoord);
10235 + }
10236 + ));
10237 +
10238 + if (*outcome == NS_FOUND) {
10239 + /* we have found desired key on twig level in extent item */
10240 + h->result = CBK_COORD_FOUND;
10241 + *outcome = LOOKUP_DONE;
10242 + return 1;
10243 + }
10244 +
10245 + if (!(h->flags & CBK_FOR_INSERT)) {
10246 + /* tree traversal is not for insertion. Just return
10247 + CBK_COORD_NOTFOUND. */
10248 + h->result = CBK_COORD_NOTFOUND;
10249 + *outcome = LOOKUP_DONE;
10250 + return 1;
10251 + }
10252 +
10253 + /* take a look at the item to the right of h -> coord */
10254 + result = is_next_item_internal(coord, h->key, h->active_lh);
10255 + if (unlikely(result < 0)) {
10256 + h->error = "get_right_neighbor failed";
10257 + h->result = result;
10258 + *outcome = LOOKUP_DONE;
10259 + return 1;
10260 + }
10261 + if (result == 0) {
10262 + /*
10263 + * item to the right is also an extent one. Allocate a new node
10264 + * and insert pointer to it after item h -> coord.
10265 + *
10266 + * This is a result of extents being located at the twig
10267 + * level. For explanation, see comment just above
10268 + * is_next_item_internal().
10269 + */
10270 + znode *loaded;
10271 +
10272 + if (cbk_lock_mode(h->level, h) != ZNODE_WRITE_LOCK) {
10273 + /*
10274 + * we got node read locked, restart coord_by_key to
10275 + * have write lock on twig level
10276 + */
10277 + h->lock_level = TWIG_LEVEL;
10278 + h->lock_mode = ZNODE_WRITE_LOCK;
10279 + *outcome = LOOKUP_REST;
10280 + return 1;
10281 + }
10282 +
10283 + loaded = coord->node;
10284 + result =
10285 + add_empty_leaf(coord, h->active_lh, h->key,
10286 + rd_key(coord, &key));
10287 + if (result) {
10288 + h->error = "could not add empty leaf";
10289 + h->result = result;
10290 + *outcome = LOOKUP_DONE;
10291 + return 1;
10292 + }
10293 + /* added empty leaf is locked (h->active_lh), its parent node
10294 + is unlocked, h->coord is set as EMPTY */
10295 + assert("vs-13", coord->between == EMPTY_NODE);
10296 + assert("vs-14", znode_is_write_locked(coord->node));
10297 + assert("vs-15",
10298 + WITH_DATA(coord->node, node_is_empty(coord->node)));
10299 + assert("vs-16", jnode_is_leaf(ZJNODE(coord->node)));
10300 + assert("vs-17", coord->node == h->active_lh->node);
10301 + *outcome = LOOKUP_DONE;
10302 + h->result = CBK_COORD_NOTFOUND;
10303 + return 1;
10304 + } else if (result == 1) {
10305 + /*
10306 +		 * this is the special case mentioned in the comment on
10307 + * tree.h:cbk_flags. We have found internal item immediately on
10308 + * the right of extent, and we are going to insert new item
10309 + * there. Key of item we are going to insert is smaller than
10310 + * leftmost key in the node pointed to by said internal item
10311 + * (otherwise search wouldn't come to the extent in the first
10312 + * place).
10313 + *
10314 + * This is a result of extents being located at the twig
10315 + * level. For explanation, see comment just above
10316 + * is_next_item_internal().
10317 + */
10318 + h->flags &= ~CBK_TRUST_DK;
10319 + } else {
10320 + assert("vs-8", result == 2);
10321 + *outcome = LOOKUP_REST;
10322 + return 1;
10323 + }
10324 + assert("vs-362", WITH_DATA(coord->node, item_is_internal(coord)));
10325 + return 0;
10326 +}
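The return-value contract above is easiest to see from the caller's side. The
following is a hypothetical caller-side sketch; the initialization of outcome and
the surrounding control flow are assumptions for illustration, not code from this
patch:

	/* hypothetical sketch of consuming the handle_eottl() contract */
	int outcome = NS_FOUND;	/* assumed: result of the intra-node lookup */

	if (handle_eottl(h, &outcome))
		/* search completed or must restart: @outcome now holds
		 * LOOKUP_DONE or LOOKUP_REST and h->result holds the answer */
		return outcome;
	/* otherwise h->coord is set to an internal item: descend one level */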
10327 +
10328 +/*
10329 + * Local variables:
10330 + * c-indentation-style: "K&R"
10331 + * mode-name: "LC"
10332 + * c-basic-offset: 8
10333 + * tab-width: 8
10334 + * fill-column: 120
10335 + * scroll-step: 1
10336 + * End:
10337 + */
10338 Index: linux-2.6.16/fs/reiser4/estimate.c
10339 ===================================================================
10340 --- /dev/null
10341 +++ linux-2.6.16/fs/reiser4/estimate.c
10342 @@ -0,0 +1,111 @@
10343 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
10344 +
10345 +#include "debug.h"
10346 +#include "dformat.h"
10347 +#include "tree.h"
10348 +#include "carry.h"
10349 +#include "inode.h"
10350 +#include "plugin/cluster.h"
10351 +#include "plugin/item/ctail.h"
10352 +
10353 +/* this returns how many nodes might get dirty or added if @children nodes are dirtied
10354 +
10355 +   The number of internal nodes which will get dirty or get allocated is estimated as roughly 10% of the children
10356 +   (103/1024; see ten_percent in the code) + 1 balancing. 1 balancing is 2 neighbours, 2 new blocks and the current
10357 +   block on the leaf level, 2 neighbour nodes + the current (or 1 neighbour and 1 new and the current) on the twig
10358 +   level, 2 neighbour nodes on upper levels and 1 for a new root. So 5 for the leaf level, 3 for the twig level, 2 on upper levels + 1 for the root.
10359 +
10360 +   Do not count the current node of the lowest level here - that is overhead only.
10361 +
10362 +   children is almost always 1 here. The exception is flow insertion
10363 +*/
10364 +static reiser4_block_nr
10365 +max_balance_overhead(reiser4_block_nr children, tree_level tree_height)
10366 +{
10367 + reiser4_block_nr ten_percent;
10368 +
10369 +	ten_percent = ((103 * children) >> 10);
10370 +
10371 +	/* If we have too many balancings at a time, the tree height can grow by
10372 +	   more than 1. Assume that if tree_height is 5 or more, it can grow by 1 only. */
10373 + return ((tree_height < 5 ? 5 : tree_height) * 2 + (4 + ten_percent));
10374 +}
10375 +
10376 +/* this returns the maximal possible number of nodes which can be modified plus the number of new nodes which may be
10377 +   required to perform insertion of one item into the tree */
10378 +/* it is only called when tree height changes, or gets initialized */
10379 +reiser4_block_nr calc_estimate_one_insert(tree_level height)
10380 +{
10381 + return 1 + max_balance_overhead(1, height);
10382 +}
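A worked instance of the arithmetic above (an editorial sketch, evaluating the two
functions by hand):

	/*
	 * max_balance_overhead(1, 5):
	 *	ten_percent = (103 * 1) >> 10 = 0
	 *	result = max(tree_height, 5) * 2 + (4 + 0) = 5 * 2 + 4 = 14
	 *
	 * hence calc_estimate_one_insert(5) = 1 + 14 = 15, and for a taller
	 * tree calc_estimate_one_insert(6) = 1 + (6 * 2 + 4 + 0) = 17.
	 */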
10383 +
10384 +reiser4_block_nr estimate_one_insert_item(reiser4_tree * tree)
10385 +{
10386 + return tree->estimate_one_insert;
10387 +}
10388 +
10389 +/* this returns the maximal possible number of nodes which can be modified plus the number of new nodes which may be
10390 +   required to perform insertion of one unit into an item in the tree */
10391 +reiser4_block_nr estimate_one_insert_into_item(reiser4_tree * tree)
10392 +{
10393 + /* estimate insert into item just like item insertion */
10394 + return tree->estimate_one_insert;
10395 +}
10396 +
10397 +reiser4_block_nr estimate_one_item_removal(reiser4_tree * tree)
10398 +{
10399 +	/* on item removal reiser4 does not try to pack nodes more compactly, so only one node may be dirtied on the
10400 +	   leaf level */
10401 + return tree->estimate_one_insert;
10402 +}
10403 +
10404 +/* on leaf level insert_flow may add CARRY_FLOW_NEW_NODES_LIMIT new nodes and dirty 3 existing nodes (insert point and
10405 + both its neighbors). Max_balance_overhead should estimate number of blocks which may change/get added on internal
10406 + levels */
10407 +reiser4_block_nr estimate_insert_flow(tree_level height)
10408 +{
10409 + return 3 + CARRY_FLOW_NEW_NODES_LIMIT + max_balance_overhead(3 +
10410 + CARRY_FLOW_NEW_NODES_LIMIT,
10411 + height);
10412 +}
10413 +
10414 +/* returns the maximal number of nodes which can be occupied by a disk cluster */
10415 +static reiser4_block_nr estimate_cluster(struct inode * inode, int unprepped)
10416 +{
10417 + int per_cluster;
10418 + per_cluster = (unprepped ? 1 : cluster_nrpages(inode));
10419 + return 3 + per_cluster +
10420 + max_balance_overhead(3 + per_cluster,
10421 + REISER4_MAX_ZTREE_HEIGHT);
10422 +}
10423 +
10424 +/* how many nodes might get dirty and added
10425 + during insertion of a disk cluster */
10426 +reiser4_block_nr estimate_insert_cluster(struct inode * inode)
10427 +{
10428 + return estimate_cluster(inode, 1); /* 24 */
10429 +}
10430 +
10431 +/* how many nodes might get dirty and added
10432 + during update of a (prepped or unprepped) disk cluster */
10433 +reiser4_block_nr estimate_update_cluster(struct inode * inode)
10434 +{
10435 + return estimate_cluster(inode, 0); /* 44, for 64K-cluster */
10436 +}
10437 +
10438 +/* how many nodes occupied by a disk cluster might get dirty */
10439 +reiser4_block_nr estimate_dirty_cluster(struct inode * inode)
10440 +{
10441 + return 2 + cluster_nrpages(inode);
10442 +}
10443 +
10444 +/* Make Linus happy.
10445 + Local variables:
10446 + c-indentation-style: "K&R"
10447 + mode-name: "LC"
10448 + c-basic-offset: 8
10449 + tab-width: 8
10450 + fill-column: 120
10451 + scroll-step: 1
10452 + End:
10453 +*/
10454 Index: linux-2.6.16/fs/reiser4/export_ops.c
10455 ===================================================================
10456 --- /dev/null
10457 +++ linux-2.6.16/fs/reiser4/export_ops.c
10458 @@ -0,0 +1,296 @@
10459 +/* Copyright 2005 by Hans Reiser, licensing governed by
10460 + * reiser4/README */
10461 +
10462 +#include "inode.h"
10463 +#include "plugin/plugin.h"
10464 +
10465 +
10466 +/*
10467 + * Supported file-handle types
10468 + */
10469 +typedef enum {
10470 + FH_WITH_PARENT = 0x10, /* file handle with parent */
10471 + FH_WITHOUT_PARENT = 0x11 /* file handle without parent */
10472 +} reiser4_fhtype;
10473 +
10474 +#define NFSERROR (255)
10475 +
10476 +/* initialize place-holder for object */
10477 +static void object_on_wire_init(reiser4_object_on_wire *o)
10478 +{
10479 + o->plugin = NULL;
10480 +}
10481 +
10482 +/* finish with @o */
10483 +static void object_on_wire_done(reiser4_object_on_wire *o)
10484 +{
10485 + if (o->plugin != NULL)
10486 + o->plugin->wire.done(o);
10487 +}
10488 +
10489 +/*
10490 + * read serialized object identity from @addr and store information about
10491 + * object in @obj. This is dual to encode_inode().
10492 + */
10493 +static char *decode_inode(struct super_block *s, char *addr,
10494 + reiser4_object_on_wire * obj)
10495 +{
10496 + file_plugin *fplug;
10497 +
10498 + /* identifier of object plugin is stored in the first two bytes,
10499 + * followed by... */
10500 + fplug = file_plugin_by_disk_id(get_tree(s), (d16 *) addr);
10501 + if (fplug != NULL) {
10502 + addr += sizeof(d16);
10503 + obj->plugin = fplug;
10504 + assert("nikita-3520", fplug->wire.read != NULL);
10505 + /* plugin specific encoding of object identity. */
10506 + addr = fplug->wire.read(addr, obj);
10507 + } else
10508 + addr = ERR_PTR(RETERR(-EINVAL));
10509 + return addr;
10510 +}
10511 +
10512 +/**
10513 + * reiser4_decode_fh - decode_fh of export operations
10514 + * @super: super block
10515 + * @fh: nfsd file handle
10516 + * @len: length of file handle
10517 + * @fhtype: type of file handle
10518 + * @acceptable: acceptability testing function
10519 + * @context: argument for @acceptable
10520 + *
10521 + * Returns dentry referring to the same file as @fh.
10522 + */
10523 +static struct dentry *reiser4_decode_fh(struct super_block *super, __u32 *fh,
10524 + int len, int fhtype,
10525 + int (*acceptable) (void *context,
10526 + struct dentry *de),
10527 + void *context)
10528 +{
10529 + reiser4_context *ctx;
10530 + reiser4_object_on_wire object;
10531 + reiser4_object_on_wire parent;
10532 + char *addr;
10533 + int with_parent;
10534 +
10535 + ctx = init_context(super);
10536 + if (IS_ERR(ctx))
10537 + return (struct dentry *)ctx;
10538 +
10539 + assert("vs-1482",
10540 + fhtype == FH_WITH_PARENT || fhtype == FH_WITHOUT_PARENT);
10541 +
10542 + with_parent = (fhtype == FH_WITH_PARENT);
10543 +
10544 + addr = (char *)fh;
10545 +
10546 + object_on_wire_init(&object);
10547 + object_on_wire_init(&parent);
10548 +
10549 + addr = decode_inode(super, addr, &object);
10550 + if (!IS_ERR(addr)) {
10551 + if (with_parent)
10552 + addr = decode_inode(super, addr, &parent);
10553 + if (!IS_ERR(addr)) {
10554 + struct dentry *d;
10555 + typeof(super->s_export_op->find_exported_dentry) fn;
10556 +
10557 + fn = super->s_export_op->find_exported_dentry;
10558 + assert("nikita-3521", fn != NULL);
10559 + d = fn(super, &object, with_parent ? &parent : NULL,
10560 + acceptable, context);
10561 + if (d != NULL && !IS_ERR(d))
10562 + /* FIXME check for -ENOMEM */
10563 + reiser4_get_dentry_fsdata(d)->stateless = 1;
10564 + addr = (char *)d;
10565 + }
10566 + }
10567 +
10568 + object_on_wire_done(&object);
10569 + object_on_wire_done(&parent);
10570 +
10571 + reiser4_exit_context(ctx);
10572 + return (void *)addr;
10573 +}
10574 +
10575 +/*
10576 + * Object serialization support.
10577 + *
10578 + * To support knfsd, the file system provides export_operations that are used to
10579 + * construct and interpret NFS file handles. As a generalization of this,
10580 + * reiser4 object plugins have serialization support: it provides methods to
10581 + * create on-wire representation of identity of reiser4 object, and
10582 + * re-create/locate object given its on-wire identity.
10583 + *
10584 + */
10585 +
10586 +/*
10587 + * return number of bytes that on-wire representation of @inode's identity
10588 + * consumes.
10589 + */
10590 +static int encode_inode_size(struct inode *inode)
10591 +{
10592 + assert("nikita-3514", inode != NULL);
10593 + assert("nikita-3515", inode_file_plugin(inode) != NULL);
10594 + assert("nikita-3516", inode_file_plugin(inode)->wire.size != NULL);
10595 +
10596 + return inode_file_plugin(inode)->wire.size(inode) + sizeof(d16);
10597 +}
10598 +
10599 +/*
10600 + * store on-wire representation of @inode's identity at the area beginning at
10601 + * @start.
10602 + */
10603 +static char *encode_inode(struct inode *inode, char *start)
10604 +{
10605 + assert("nikita-3517", inode != NULL);
10606 + assert("nikita-3518", inode_file_plugin(inode) != NULL);
10607 + assert("nikita-3519", inode_file_plugin(inode)->wire.write != NULL);
10608 +
10609 + /*
10610 + * first, store two-byte identifier of object plugin, then
10611 + */
10612 + save_plugin_id(file_plugin_to_plugin(inode_file_plugin(inode)),
10613 + (d16 *) start);
10614 + start += sizeof(d16);
10615 + /*
10616 + * call plugin to serialize object's identity
10617 + */
10618 + return inode_file_plugin(inode)->wire.write(inode, start);
10619 +}
10620 +
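Taken together, encode_inode() and decode_inode() fix the on-wire layout of a
reiser4 file handle. The following diagram is an editorial illustration derived
from those two functions and from reiser4_decode_fh() above:

	/*
	 * on-wire file handle layout:
	 *
	 *	+---------------+---------------------------+- - - - - - - - - -+
	 *	| d16 plugin id | plugin-specific identity  | optional parent   |
	 *	| (2 bytes)     | (fplug->wire.write/read)  | (same two fields) |
	 *	+---------------+---------------------------+- - - - - - - - - -+
	 *
	 * the parent part is present only in FH_WITH_PARENT handles.
	 */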
10621 +/* this stores in @lenp the number of 32 bit words used and returns the file
10622 + * handle type; 255 (NFSERROR) is returned if the file handle can not be stored */
10623 +/**
10624 + * reiser4_encode_fh - encode_fh of export operations
10625 + * @dentry: dentry of the object to encode
10626 + * @fh: buffer the file handle is written to
10627 + * @lenp: on entry, size of @fh in 32 bit words; on exit, number of words used
10628 + * @need_parent: set if the parent directory should be encoded as well
10629 + *
10630 + */
10631 +static int
10632 +reiser4_encode_fh(struct dentry *dentry, __u32 *fh, int *lenp,
10633 + int need_parent)
10634 +{
10635 + struct inode *inode;
10636 + struct inode *parent;
10637 + char *addr;
10638 + int need;
10639 + int delta;
10640 + int result;
10641 + reiser4_context *ctx;
10642 +
10643 + /*
10644 +	 * knfsd asks us to serialize the object in @dentry, and, optionally,
10645 +	 * its parent (if need_parent != 0).
10646 +	 *
10647 +	 * encode_inode() and encode_inode_size() are used to build the
10648 +	 * representation of the object and its parent. All hard work is done by
10649 + * object plugins.
10650 + */
10651 + inode = dentry->d_inode;
10652 + parent = dentry->d_parent->d_inode;
10653 +
10654 + addr = (char *)fh;
10655 +
10656 + need = encode_inode_size(inode);
10657 + if (need < 0)
10658 + return NFSERROR;
10659 + if (need_parent) {
10660 + delta = encode_inode_size(parent);
10661 + if (delta < 0)
10662 + return NFSERROR;
10663 + need += delta;
10664 + }
10665 +
10666 + ctx = init_context(dentry->d_inode->i_sb);
10667 + if (IS_ERR(ctx))
10668 + return PTR_ERR(ctx);
10669 +
10670 + if (need <= sizeof(__u32) * (*lenp)) {
10671 + addr = encode_inode(inode, addr);
10672 + if (need_parent)
10673 + addr = encode_inode(parent, addr);
10674 +
10675 + /* store in lenp number of 32bit words required for file
10676 + * handle. */
10677 + *lenp = (need + sizeof(__u32) - 1) >> 2;
10678 + result = need_parent ? FH_WITH_PARENT : FH_WITHOUT_PARENT;
10679 + } else
10680 +		/* not enough space in the file handle */
10681 + result = NFSERROR;
10682 + reiser4_exit_context(ctx);
10683 + return result;
10684 +}
10685 +
10686 +/**
10687 + * reiser4_get_dentry_parent - get_parent of export operations
10688 + * @child:
10689 + *
10690 + */
10691 +static struct dentry *reiser4_get_dentry_parent(struct dentry *child)
10692 +{
10693 + struct inode *dir;
10694 + dir_plugin *dplug;
10695 +
10696 + assert("nikita-3527", child != NULL);
10697 + /* see comment in reiser4_get_dentry() about following assertion */
10698 + assert("nikita-3528", is_in_reiser4_context());
10699 +
10700 + dir = child->d_inode;
10701 + assert("nikita-3529", dir != NULL);
10702 + dplug = inode_dir_plugin(dir);
10703 + assert("nikita-3531", ergo(dplug != NULL, dplug->get_parent != NULL));
10704 + if (dplug != NULL)
10705 + return dplug->get_parent(dir);
10706 + else
10707 + return ERR_PTR(RETERR(-ENOTDIR));
10708 +}
10709 +
10710 +/**
10711 + * reiser4_get_dentry - get_dentry of export operations
10712 + * @super:
10713 + * @data:
10714 + *
10715 + *
10716 + */
10717 +static struct dentry *reiser4_get_dentry(struct super_block *super, void *data)
10718 +{
10719 + reiser4_object_on_wire *o;
10720 +
10721 + assert("nikita-3522", super != NULL);
10722 + assert("nikita-3523", data != NULL);
10723 + /*
10724 + * this is only supposed to be called by
10725 + *
10726 + * reiser4_decode_fh->find_exported_dentry
10727 + *
10728 + * so, reiser4_context should be here already.
10729 + */
10730 + assert("nikita-3526", is_in_reiser4_context());
10731 +
10732 + o = (reiser4_object_on_wire *)data;
10733 + assert("nikita-3524", o->plugin != NULL);
10734 + assert("nikita-3525", o->plugin->wire.get != NULL);
10735 +
10736 + return o->plugin->wire.get(super, o);
10737 +}
10738 +
10739 +struct export_operations reiser4_export_operations = {
10740 + .encode_fh = reiser4_encode_fh,
10741 + .decode_fh = reiser4_decode_fh,
10742 + .get_parent = reiser4_get_dentry_parent,
10743 + .get_dentry = reiser4_get_dentry
10744 +};
10745 +
10746 +/*
10747 + * Local variables:
10748 + * c-indentation-style: "K&R"
10749 + * mode-name: "LC"
10750 + * c-basic-offset: 8
10751 + * tab-width: 8
10752 + * fill-column: 79
10753 + * End:
10754 + */
10755 Index: linux-2.6.16/fs/reiser4/flush.c
10756 ===================================================================
10757 --- /dev/null
10758 +++ linux-2.6.16/fs/reiser4/flush.c
10759 @@ -0,0 +1,3626 @@
10760 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
10761 +
10762 +/* The design document for this file is at http://www.namesys.com/v4/v4.html. */
10763 +
10764 +#include "forward.h"
10765 +#include "debug.h"
10766 +#include "dformat.h"
10767 +#include "key.h"
10768 +#include "coord.h"
10769 +#include "plugin/item/item.h"
10770 +#include "plugin/plugin.h"
10771 +#include "plugin/object.h"
10772 +#include "txnmgr.h"
10773 +#include "jnode.h"
10774 +#include "znode.h"
10775 +#include "block_alloc.h"
10776 +#include "tree_walk.h"
10777 +#include "carry.h"
10778 +#include "tree.h"
10779 +#include "vfs_ops.h"
10780 +#include "inode.h"
10781 +#include "page_cache.h"
10782 +#include "wander.h"
10783 +#include "super.h"
10784 +#include "entd.h"
10785 +#include "reiser4.h"
10786 +#include "flush.h"
10787 +#include "writeout.h"
10788 +
10789 +#include <asm/atomic.h>
10790 +#include <linux/fs.h> /* for struct super_block */
10791 +#include <linux/mm.h> /* for struct page */
10792 +#include <linux/bio.h> /* for struct bio */
10793 +#include <linux/pagemap.h>
10794 +#include <linux/blkdev.h>
10795 +
10796 +/* IMPLEMENTATION NOTES */
10797 +
10798 +/* PARENT-FIRST: Some terminology: A parent-first traversal is a way of assigning a total
10799 + order to the nodes of the tree in which the parent is placed before its children, which
10800 + are ordered (recursively) in left-to-right order. When we speak of a "parent-first preceder", it
10801 + describes the node that "came before in forward parent-first order". When we speak of a
10802 + "parent-first follower", it describes the node that "comes next in parent-first
10803 + order" (alternatively the node that "came before in reverse parent-first order").
10804 +
10805 + The following pseudo-code prints the nodes of a tree in forward parent-first order:
10806 +
10807 + void parent_first (node)
10808 + {
10809 + print_node (node);
10810 + if (node->level > leaf) {
10811 + for (i = 0; i < num_children; i += 1) {
10812 + parent_first (node->child[i]);
10813 + }
10814 + }
10815 + }
10816 +*/
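As a concrete illustration of this ordering (an editorial example, not from the
patch): for a height-3 tree with root R over internal nodes I1 (children A, B) and
I2 (child C), the pseudo-code above prints R, I1, A, B, I2, C: each parent comes
immediately before its subtree, and subtrees are visited left to right.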
10817 +
10818 +/* JUST WHAT ARE WE TRYING TO OPTIMIZE, HERE? The idea is to optimize block allocation so
10819 + that a left-to-right scan of the tree's data (i.e., the leaves in left-to-right order)
10820 + can be accomplished with sequential reads, which results in reading nodes in their
10821 + parent-first order. This is a read-optimization aspect of the flush algorithm, and
10822 + there is also a write-optimization aspect, which is that we wish to make large
10823 + sequential writes to the disk by allocating or reallocating blocks so that they can be
10824 + written in sequence. Sometimes the read-optimization and write-optimization goals
10825 + conflict with each other, as we discuss in more detail below.
10826 +*/
10827 +
10828 +/* STATE BITS: The flush code revolves around the state of the jnodes it covers. Here are
10829 +   the relevant jnode->state bits and their relevance to flush:
10830 +
10831 + JNODE_DIRTY: If a node is dirty, it must be flushed. But in order to be written it
10832 + must be allocated first. In order to be considered allocated, the jnode must have
10833 + exactly one of { JNODE_OVRWR, JNODE_RELOC } set. These two bits are exclusive, and
10834 + all dirtied jnodes eventually have one of these bits set during each transaction.
10835 +
10836 + JNODE_CREATED: The node was freshly created in its transaction and has no previous
10837 + block address, so it is unconditionally assigned to be relocated, although this is
10838 + mainly for code-convenience. It is not being 'relocated' from anything, but in
10839 + almost every regard it is treated as part of the relocate set. The JNODE_CREATED bit
10840 + remains set even after JNODE_RELOC is set, so the actual relocate can be
10841 + distinguished from the created-and-allocated set easily: relocate-set members
10842 + (belonging to the preserve-set) have (JNODE_RELOC) set and created-set members which
10843 + have no previous location to preserve have (JNODE_RELOC | JNODE_CREATED) set.
10844 +
10845 + JNODE_OVRWR: The node belongs to atom's overwrite set. The flush algorithm made the
10846 + decision to maintain the pre-existing location for this node and it will be written
10847 + to the wandered-log.
10848 +
10849 + JNODE_RELOC: The flush algorithm made the decision to relocate this block (if it was
10850 + not created, see note above). A block with JNODE_RELOC set is eligible for
10851 + early-flushing and may be submitted during flush_empty_queues. When the JNODE_RELOC
10852 + bit is set on a znode, the parent node's internal item is modified and the znode is
10853 + rehashed.
10854 +
10855 + JNODE_SQUEEZABLE: Before shifting everything left, the flush algorithm scans the node
10856 +   and calls the plugin->f.squeeze() method for its items. This is how disk clusters
10857 +   of cryptcompress objects are updated. Also, if the leftmost point found by the flush
10858 +   scan has this flag set (it races with write(); a rare case), the flush algorithm
10859 +   decides to pass it to squalloc() in spite of its flushprepped status, for squeezing,
10860 +   not for repeated allocation.
10861 +
10862 + JNODE_FLUSH_QUEUED: This bit is set when a call to flush enters the jnode into its
10863 + flush queue. This means the jnode is not on any clean or dirty list, instead it is
10864 + moved to one of the flush queue (see flush_queue.h) object private list. This
10865 + prevents multiple concurrent flushes from attempting to start flushing from the
10866 + same node.
10867 +
10868 + (DEAD STATE BIT) JNODE_FLUSH_BUSY: This bit was set during the bottom-up
10869 + squeeze-and-allocate on a node while its children are actively being squeezed and
10870 + allocated. This flag was created to avoid submitting a write request for a node
10871 + while its children are still being allocated and squeezed. Then flush queue was
10872 + re-implemented to allow unlimited number of nodes be queued. This flag support was
10873 + commented out in source code because we decided that there was no reason to submit
10874 + queued nodes before jnode_flush() finishes. However, current code calls fq_write()
10875 + during a slum traversal and may submit "busy nodes" to disk. Probably we can
10876 + re-enable the JNODE_FLUSH_BUSY bit support in future.
10877 +
10878 + With these state bits, we describe a test used frequently in the code below,
10879 + jnode_is_flushprepped() (and the spin-lock-taking jnode_check_flushprepped()). The
10880 + test for "flushprepped" returns true if any of the following are true:
10881 +
10882 + - The node is not dirty
10883 + - The node has JNODE_RELOC set
10884 + - The node has JNODE_OVRWR set
10885 +
10886 + If either the node is not dirty or it has already been processed by flush (and assigned
10887 + JNODE_OVRWR or JNODE_RELOC), then it is prepped. If jnode_is_flushprepped() returns
10888 + true then flush has work to do on that node.
10889 +*/
10890 +
10891 +/* FLUSH_PREP_ONCE_PER_TRANSACTION: Within a single transaction a node is never
10892 + flushprepped twice (unless an explicit call to flush_unprep is made as described in
10893 + detail below). For example a node is dirtied, allocated, and then early-flushed to
10894 + disk and set clean. Before the transaction commits, the page is dirtied again and, due
10895 + to memory pressure, the node is flushed again. The flush algorithm will not relocate
10896 + the node to a new disk location, it will simply write it to the same, previously
10897 + relocated position again.
10898 +*/
10899 +
10900 +/* THE BOTTOM-UP VS. TOP-DOWN ISSUE: This code implements a bottom-up algorithm where we
10901 + start at a leaf node and allocate in parent-first order by iterating to the right. At
10902 + each step of the iteration, we check for the right neighbor. Before advancing to the
10903 + right neighbor, we check if the current position and the right neighbor share the same
10904 + parent. If they do not share the same parent, the parent is allocated before the right
10905 + neighbor.
10906 +
10907 +   This process goes recursively up the tree and squeezes nodes level by level as long as
10908 + the right neighbor and the current position have different parents, then it allocates
10909 + the right-neighbors-with-different-parents on the way back down. This process is
10910 + described in more detail in flush_squalloc_changed_ancestor and the recursive function
10911 +   squalloc_one_changed_ancestor. But the purpose here is not so much to discuss the
10912 +   specifics of the bottom-up approach as it is to contrast the bottom-up and top-down
10913 + approaches.
10914 +
10915 + The top-down algorithm was implemented earlier (April-May 2002). In the top-down
10916 + approach, we find a starting point by scanning left along each level past dirty nodes,
10917 + then going up and repeating the process until the left node and the parent node are
10918 + clean. We then perform a parent-first traversal from the starting point, which makes
10919 + allocating in parent-first order trivial. After one subtree has been allocated in this
10920 + manner, we move to the right, try moving upward, then repeat the parent-first
10921 + traversal.
10922 +
10923 + Both approaches have problems that need to be addressed. Both are approximately the
10924 + same amount of code, but the bottom-up approach has advantages in the order it acquires
10925 + locks which, at the very least, make it the better approach. At first glance each one
10926 + makes the other one look simpler, so it is important to remember a few of the problems
10927 + with each one.
10928 +
10929 + Main problem with the top-down approach: When you encounter a clean child during the
10930 + parent-first traversal, what do you do? You would like to avoid searching through a
10931 + large tree of nodes just to find a few dirty leaves at the bottom, and there is not an
10932 + obvious solution. One of the advantages of the top-down approach is that during the
10933 + parent-first traversal you check every child of a parent to see if it is dirty. In
10934 + this way, the top-down approach easily handles the main problem of the bottom-up
10935 + approach: unallocated children.
10936 +
10937 + The unallocated children problem is that before writing a node to disk we must make
10938 +   sure that all of its children are allocated. Otherwise, writing the node means
10939 + extra I/O because the node will have to be written again when the child is finally
10940 + allocated.
10941 +
10942 + WE HAVE NOT YET ELIMINATED THE UNALLOCATED CHILDREN PROBLEM. Except for bugs, this
10943 + should not cause any file system corruption, it only degrades I/O performance because a
10944 + node may be written when it is sure to be written at least one more time in the same
10945 + transaction when the remaining children are allocated. What follows is a description
10946 + of how we will solve the problem.
10947 +*/
10948 +
10949 +/* HANDLING UNALLOCATED CHILDREN: During flush we may allocate a parent node then,
10950 + proceeding in parent first order, allocate some of its left-children, then encounter a
10951 + clean child in the middle of the parent. We do not allocate the clean child, but there
10952 + may remain unallocated (dirty) children to the right of the clean child. If we were to
10953 + stop flushing at this moment and write everything to disk, the parent might still
10954 + contain unallocated children.
10955 +
10956 + We could try to allocate all the descendents of every node that we allocate, but this
10957 + is not necessary. Doing so could result in allocating the entire tree: if the root
10958 + node is allocated then every unallocated node would have to be allocated before
10959 + flushing. Actually, we do not have to write a node just because we allocate it. It is
10960 + possible to allocate but not write a node during flush, when it still has unallocated
10961 + children. However, this approach is probably not optimal for the following reason.
10962 +
10963 + The flush algorithm is designed to allocate nodes in parent-first order in an attempt
10964 + to optimize reads that occur in the same order. Thus we are read-optimizing for a
10965 + left-to-right scan through all the leaves in the system, and we are hoping to
10966 + write-optimize at the same time because those nodes will be written together in batch.
10967 + What happens, however, if we assign a block number to a node in its read-optimized
10968 + order but then avoid writing it because it has unallocated children? In that
10969 + situation, we lose out on the write-optimization aspect because a node will have to be
10970 +   written again to its location on the device, later, which likely means seeking back
10971 + to that location.
10972 +
10973 + So there are tradeoffs. We can choose either:
10974 +
10975 + A. Allocate all unallocated children to preserve both write-optimization and
10976 + read-optimization, but this is not always desirable because it may mean having to
10977 + allocate and flush very many nodes at once.
10978 +
10979 + B. Defer writing nodes with unallocated children, keep their read-optimized locations,
10980 + but sacrifice write-optimization because those nodes will be written again.
10981 +
10982 + C. Defer writing nodes with unallocated children, but do not keep their read-optimized
10983 + locations. Instead, choose to write-optimize them later, when they are written. To
10984 + facilitate this, we "undo" the read-optimized allocation that was given to the node so
10985 + that later it can be write-optimized, thus "unpreparing" the flush decision. This is a
10986 + case where we disturb the FLUSH_PREP_ONCE_PER_TRANSACTION rule described above. By a
10987 + call to flush_unprep() we will: if the node was wandered, unset the JNODE_OVRWR bit;
10988 + if the node was relocated, unset the JNODE_RELOC bit, non-deferred-deallocate its block
10989 + location, and set the JNODE_CREATED bit, effectively setting the node back to an
10990 + unallocated state.
10991 +
10992 + We will take the following approach in v4.0: for twig nodes we will always finish
10993 + allocating unallocated children (A). For nodes with (level > TWIG) we will defer
10994 + writing and choose write-optimization (C).
10995 +
10996 + To summarize, there are several parts to a solution that avoids the problem with
10997 + unallocated children:
10998 +
10999 +   FIXME-ZAM: Still none of these approaches is implemented to eliminate the "UNALLOCATED
11000 +   CHILDREN" problem, because an experiment showed that we get only 1-2 nodes with
11001 +   unallocated children per thousands of written nodes. The experiment was simple,
11002 +   like copying / deleting the linux kernel sources. However, the problem can arise in
11003 +   more complex tests. I think we can use jnode_io_hook to insert a check for unallocated
11004 +   children and see what kind of problem we have.
11005 +
11006 + 1. When flush reaches a stopping point (e.g., a clean node), it should continue calling
11007 + squeeze-and-allocate on any remaining unallocated children. FIXME: Difficulty to
11008 + implement: should be simple -- amounts to adding a while loop to jnode_flush, see
11009 + comments in that function.
11010 +
11011 + 2. When flush reaches flush_empty_queue(), some of the (level > TWIG) nodes may still
11012 + have unallocated children. If the twig level has unallocated children it is an
11013 + assertion failure. If a higher-level node has unallocated children, then it should be
11014 + explicitly de-allocated by a call to flush_unprep(). FIXME: Difficulty to implement:
11015 + should be simple.
11016 +
11017 + 3. (CPU-Optimization) Checking whether a node has unallocated children may consume more
11018 + CPU cycles than we would like, and it is possible (but medium complexity) to optimize
11019 + this somewhat in the case where large sub-trees are flushed. The following observation
11020 + helps: if both the left- and right-neighbor of a node are processed by the flush
11021 + algorithm then the node itself is guaranteed to have all of its children allocated.
11022 + However, the cost of this check may not be so expensive after all: it is not needed for
11023 + leaves and flush can guarantee this property for twigs. That leaves only (level >
11024 + TWIG) nodes that have to be checked, so this optimization only helps if at least three
11025 + (level > TWIG) nodes are flushed in one pass, and the savings will be very small unless
11026 + there are many more (level > TWIG) nodes. But if there are many (level > TWIG) nodes
11027 + then the number of blocks being written will be very large, so the savings may be
11028 + insignificant. That said, the idea is to maintain both the left and right edges of
11029 + nodes that are processed in flush. When flush_empty_queue() is called, a relatively
11030 + simple test will tell whether the (level > TWIG) node is on the edge. If it is on the
11031 + edge, the slow check is necessary, but if it is in the interior then it can be assumed
11032 + to have all of its children allocated. FIXME: medium complexity to implement, but
11033 + simple to verify given that we must have a slow check anyway.
11034 +
11035 + 4. (Optional) This part is optional, not for v4.0--flush should work independently of
11036 + whether this option is used or not. Called RAPID_SCAN, the idea is to amend the
11037 + left-scan operation to take unallocated children into account. Normally, the left-scan
11038 + operation goes left as long as adjacent nodes are dirty up until some large maximum
11039 + value (FLUSH_SCAN_MAXNODES) at which point it stops and begins flushing. But scan-left
11040 + may stop at a position where there are unallocated children to the left with the same
11041 + parent. When RAPID_SCAN is enabled, the ordinary scan-left operation stops after
11042 +   FLUSH_RELOCATE_THRESHOLD, which is much smaller than FLUSH_SCAN_MAXNODES, then proceeds
11043 + with a rapid scan. The rapid scan skips all the interior children of a node--if the
11044 + leftmost child of a twig is dirty, check its left neighbor (the rightmost child of the
11045 + twig to the left). If the left neighbor of the leftmost child is also dirty, then
11046 + continue the scan at the left twig and repeat. This option will cause flush to
11047 + allocate more twigs in a single pass, but it also has the potential to write many more
11048 + nodes than would otherwise be written without the RAPID_SCAN option. RAPID_SCAN
11049 + was partially implemented, code removed August 12, 2002 by JMACD.
11050 +*/
11051 +
11052 +/* FLUSH CALLED ON NON-LEAF LEVEL. Most of our design considerations assume that the
11053 + starting point for flush is a leaf node, but actually the flush code cares very little
11054 + about whether or not this is true. It is possible that all the leaf nodes are flushed
11055 + and dirty parent nodes still remain, in which case jnode_flush() is called on a
11056 + non-leaf argument. Flush doesn't care--it treats the argument node as if it were a
11057 + leaf, even when it is not. This is a simple approach, and there may be a more optimal
11058 + policy but until a problem with this approach is discovered, simplest is probably best.
11059 +
11060 + NOTE: In this case, the ordering produced by flush is parent-first only if you ignore
11061 + the leaves. This is done as a matter of simplicity and there is only one (shaky)
11062 + justification. When an atom commits, it flushes all leaf level nodes first, followed
11063 + by twigs, and so on. With flushing done in this order, if flush is eventually called
11064 + on a non-leaf node it means that (somehow) we reached a point where all leaves are
11065 +   clean and only internal nodes need to be flushed. If that is the case, then it means
11066 + there were no leaves that were the parent-first preceder/follower of the parent. This
11067 + is expected to be a rare case, which is why we do nothing special about it. However,
11068 + memory pressure may pass an internal node to flush when there are still dirty leaf
11069 + nodes that need to be flushed, which could prove our original assumptions
11070 + "inoperative". If this needs to be fixed, then scan_left/right should have
11071 + special checks for the non-leaf levels. For example, instead of passing from a node to
11072 + the left neighbor, it should pass from the node to the left neighbor's rightmost
11073 + descendent (if dirty).
11074 +
11075 +*/
11076 +
11077 +/* UNIMPLEMENTED AS YET: REPACKING AND RESIZING. We walk the tree in 4MB-16MB chunks, dirtying everything and putting
11078 + it into a transaction. We tell the allocator to allocate the blocks as far as possible towards one end of the
11079 + logical device--the left (starting) end of the device if we are walking from left to right, the right end of the
11080 + device if we are walking from right to left. We then make passes in alternating directions, and as we do this the
11081 + device becomes sorted such that tree order and block number order fully correlate.
11082 +
11083 + Resizing is done by shifting everything either all the way to the left or all the way
11084 + to the right, and then reporting the last block.
11085 +*/
11086 +
11087 +/* RELOCATE DECISIONS: The code makes a decision to relocate in several places. This
11088 +   describes the policy from the highest level:
11089 +
11090 + The FLUSH_RELOCATE_THRESHOLD parameter: If we count this many consecutive nodes on the
11091 + leaf level during flush-scan (right, left), then we unconditionally decide to relocate
11092 + leaf nodes.
11093 +
11094 + Otherwise, there are two contexts in which we make a decision to relocate:
11095 +
11096 + 1. The REVERSE PARENT-FIRST context: Implemented in reverse_relocate_test().
11097 + During the initial stages of flush, after scan-right completes, we want to ask the
11098 + question: should we relocate this leaf node and thus dirty the parent node. Then if
11099 +   the node is a leftmost child, its parent is its parent-first preceder, and thus we repeat
11100 + the question at the next level up, and so on. In these cases we are moving in the
11101 + reverse-parent first direction.
11102 +
11103 + There is another case which is considered the reverse direction, which comes at the end
11104 + of a twig in reverse_relocate_end_of_twig(). As we finish processing a twig we may
11105 + reach a point where there is a clean twig to the right with a dirty leftmost child. In
11106 + this case, we may wish to relocate the child by testing if it should be relocated
11107 + relative to its parent.
11108 +
11109 + 2. The FORWARD PARENT-FIRST context: Testing for forward relocation is done in
11110 + allocate_znode. What distinguishes the forward parent-first case from the
11111 + reverse-parent first case is that the preceder has already been allocated in the
11112 + forward case, whereas in the reverse case we don't know what the preceder is until we
11113 + finish "going in reverse". That simplifies the forward case considerably, and there we
11114 + actually use the block allocator to determine whether, e.g., a block closer to the
11115 + preceder is available.
11116 +*/
11117 +
11118 +/* SQUEEZE_LEFT_EDGE: Unimplemented idea for future consideration. The idea is, once we
11119 + finish scan-left and find a starting point, if the parent's left neighbor is dirty then
11120 + squeeze the parent's left neighbor and the parent. This may change the
11121 + flush-starting-node's parent. Repeat until the child's parent is stable. If the child
11122 + is a leftmost child, repeat this left-edge squeezing operation at the next level up.
11123 + Note that we cannot allocate extents during this or they will be out of parent-first
11124 +   order. There are also some difficult coordinate maintenance issues. We can't do a tree
11125 + search to find coordinates again (because we hold locks), we have to determine them
11126 + from the two nodes being squeezed. Looks difficult, but has potential to increase
11127 + space utilization. */
11128 +
11129 +/* Flush-scan helper functions. */
11130 +static void scan_init(flush_scan * scan);
11131 +static void scan_done(flush_scan * scan);
11132 +
11133 +/* Flush-scan algorithm. */
11134 +static int scan_left(flush_scan * scan, flush_scan * right, jnode * node,
11135 + unsigned limit);
11136 +static int scan_right(flush_scan * scan, jnode * node, unsigned limit);
11137 +static int scan_common(flush_scan * scan, flush_scan * other);
11138 +static int scan_formatted(flush_scan * scan);
11139 +static int scan_unformatted(flush_scan * scan, flush_scan * other);
11140 +static int scan_by_coord(flush_scan * scan);
11141 +
11142 +/* Initial flush-point ancestor allocation. */
11143 +static int alloc_pos_and_ancestors(flush_pos_t * pos);
11144 +static int alloc_one_ancestor(const coord_t * coord, flush_pos_t * pos);
11145 +static int set_preceder(const coord_t * coord_in, flush_pos_t * pos);
11146 +
11147 +/* Main flush algorithm. Note on abbreviation: "squeeze and allocate" == "squalloc". */
11148 +static int squalloc(flush_pos_t * pos);
11149 +
11150 +/* Flush squeeze implementation. */
11151 +static int squeeze_right_non_twig(znode * left, znode * right);
11152 +static int shift_one_internal_unit(znode * left, znode * right);
11153 +
11154 +/* Flush reverse parent-first relocation routines. */
11155 +static int reverse_relocate_if_close_enough(const reiser4_block_nr * pblk,
11156 + const reiser4_block_nr * nblk);
11157 +static int reverse_relocate_test(jnode * node, const coord_t * parent_coord,
11158 + flush_pos_t * pos);
11159 +static int reverse_relocate_check_dirty_parent(jnode * node,
11160 + const coord_t * parent_coord,
11161 + flush_pos_t * pos);
11162 +
11163 +/* Flush allocate write-queueing functions: */
11164 +static int allocate_znode(znode * node, const coord_t * parent_coord,
11165 + flush_pos_t * pos);
11166 +static int allocate_znode_update(znode * node, const coord_t * parent_coord,
11167 + flush_pos_t * pos);
11168 +static int lock_parent_and_allocate_znode(znode *, flush_pos_t *);
11169 +
11170 +/* Flush helper functions: */
11171 +static int jnode_lock_parent_coord(jnode * node,
11172 + coord_t * coord,
11173 + lock_handle * parent_lh,
11174 + load_count * parent_zh,
11175 + znode_lock_mode mode, int try);
11176 +static int neighbor_in_slum(znode * node, lock_handle * right_lock, sideof side,
11177 + znode_lock_mode mode, int check_dirty);
11178 +static int znode_same_parents(znode * a, znode * b);
11179 +
11180 +static int znode_check_flushprepped(znode * node)
11181 +{
11182 + return jnode_check_flushprepped(ZJNODE(node));
11183 +}
11184 +
11185 +/* Flush position functions */
11186 +static void pos_init(flush_pos_t * pos);
11187 +static int pos_valid(flush_pos_t * pos);
11188 +static void pos_done(flush_pos_t * pos);
11189 +static int pos_stop(flush_pos_t * pos);
11190 +
11191 +/* check that scan->node is the first jnode of its extent unit if the extent is
11192 + * unallocated, because all jnodes of an unallocated extent are dirty and of the same atom. */
11193 +#define checkchild(scan) \
11194 +assert("nikita-3435", \
11195 + ergo(scan->direction == LEFT_SIDE && \
11196 + (scan->parent_coord.node->level == TWIG_LEVEL) && \
11197 + jnode_is_unformatted(scan->node) && \
11198 + extent_is_unallocated(&scan->parent_coord), \
11199 + extent_unit_index(&scan->parent_coord) == index_jnode(scan->node)))
11200 +
11201 +/* This flush_cnt variable is used to track the number of concurrent flush operations,
11202 + useful for debugging. It is initialized in txnmgr.c out of laziness (because flush has
11203 + no static initializer function...) */
11204 +ON_DEBUG(atomic_t flush_cnt;
11205 + )
11206 +
11207 +/* check fs backing device for write congestion */
11208 +static int check_write_congestion(void)
11209 +{
11210 + struct super_block *sb;
11211 + struct backing_dev_info *bdi;
11212 +
11213 + sb = reiser4_get_current_sb();
11214 + bdi = get_super_fake(sb)->i_mapping->backing_dev_info;
11215 + return bdi_write_congested(bdi);
11216 +}
11217 +
11218 +/* conditionally write flush queue */
11219 +static int write_prepped_nodes(flush_pos_t * pos)
11220 +{
11221 + int ret;
11222 +
11223 + assert("zam-831", pos);
11224 + assert("zam-832", pos->fq);
11225 +
11226 + if (!(pos->flags & JNODE_FLUSH_WRITE_BLOCKS))
11227 + return 0;
11228 +
11229 + if (check_write_congestion())
11230 + return 0;
11231 +
11232 + ret = write_fq(pos->fq, pos->nr_written,
11233 + WRITEOUT_SINGLE_STREAM | WRITEOUT_FOR_PAGE_RECLAIM);
11234 + return ret;
11235 +}
11236 +
11237 +/* Properly release all flush position resources, then move the flush position
11238 +   to the new locked node */
11239 +static void move_flush_pos(flush_pos_t * pos, lock_handle * new_lock,
11240 + load_count * new_load, const coord_t * new_coord)
11241 +{
11242 + assert("zam-857", new_lock->node == new_load->node);
11243 +
11244 + if (new_coord) {
11245 + assert("zam-858", new_coord->node == new_lock->node);
11246 + coord_dup(&pos->coord, new_coord);
11247 + } else {
11248 + coord_init_first_unit(&pos->coord, new_lock->node);
11249 + }
11250 +
11251 + if (pos->child) {
11252 + jput(pos->child);
11253 + pos->child = NULL;
11254 + }
11255 +
11256 + move_load_count(&pos->load, new_load);
11257 + done_lh(&pos->lock);
11258 + move_lh(&pos->lock, new_lock);
11259 +}
11260 +
11261 +/* delete an empty node whose link from the parent still exists. */
11262 +static int delete_empty_node(znode * node)
11263 +{
11264 + reiser4_key smallest_removed;
11265 +
11266 + assert("zam-1019", node != NULL);
11267 + assert("zam-1020", node_is_empty(node));
11268 + assert("zam-1023", znode_is_wlocked(node));
11269 +
11270 + return delete_node(node, &smallest_removed, NULL, 1);
11271 +}
11272 +
11273 +/* Prepare flush position for alloc_pos_and_ancestors() and squalloc() */
11274 +static int prepare_flush_pos(flush_pos_t * pos, jnode * org)
11275 +{
11276 + int ret;
11277 + load_count load;
11278 + lock_handle lock;
11279 +
11280 + init_lh(&lock);
11281 + init_load_count(&load);
11282 +
11283 + if (jnode_is_znode(org)) {
11284 + ret = longterm_lock_znode(&lock, JZNODE(org),
11285 + ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI);
11286 + if (ret)
11287 + return ret;
11288 +
11289 + ret = incr_load_count_znode(&load, JZNODE(org));
11290 + if (ret)
11291 + return ret;
11292 +
11293 + pos->state =
11294 + (jnode_get_level(org) ==
11295 + LEAF_LEVEL) ? POS_ON_LEAF : POS_ON_INTERNAL;
11296 + move_flush_pos(pos, &lock, &load, NULL);
11297 + } else {
11298 + coord_t parent_coord;
11299 + ret = jnode_lock_parent_coord(org, &parent_coord, &lock,
11300 + &load, ZNODE_WRITE_LOCK, 0);
11301 + if (ret)
11302 + goto done;
11303 + if (!item_is_extent(&parent_coord)) {
11304 +			/* file was converted to tail, @org became HEARD_BANSHEE,
11305 +			   we found an internal item */
11306 + ret = -EAGAIN;
11307 + goto done;
11308 + }
11309 +
11310 + pos->state = POS_ON_EPOINT;
11311 + move_flush_pos(pos, &lock, &load, &parent_coord);
11312 + pos->child = jref(org);
11313 + if (extent_is_unallocated(&parent_coord)
11314 + && extent_unit_index(&parent_coord) != index_jnode(org)) {
11315 +			/* @org is not the first child of its parent unit. This may happen
11316 +			   because the long-term lock on its parent node was released between
11317 +			   scan_left and scan_right. For now, work around this by having flush repeat */
11318 + ret = -EAGAIN;
11319 + }
11320 + }
11321 +
11322 + done:
11323 + done_load_count(&load);
11324 + done_lh(&lock);
11325 + return ret;
11326 +}
11327 +
11328 +/* TODO LIST (no particular order): */
11329 +/* I have labelled most of the legitimate FIXME comments in this file with letters to
11330 + indicate which issue they relate to. There are a few miscellaneous FIXMEs with
11331 + specific names mentioned instead that need to be inspected/resolved. */
11332 +/* B. There is an issue described in reverse_relocate_test having to do with an
11333 +   imprecise is_preceder? check related to partially-dirty extents.  The code that
11334 +   sets preceder hints and computes the preceder is basically untested.  Careful testing
11335 +   needs to be done to verify that preceder calculations are correct, since if the bug
11336 +   doesn't affect correctness we will not catch it during regular testing. */
11337 +/* C. EINVAL, E_DEADLOCK, E_NO_NEIGHBOR, ENOENT handling. It is unclear which of these are
11338 + considered expected but unlikely conditions. Flush currently returns 0 (i.e., success
11339 + but no progress, i.e., restart) whenever it receives any of these in jnode_flush().
11340 + Many of the calls that may produce one of these return values (i.e.,
11341 + longterm_lock_znode, reiser4_get_parent, reiser4_get_neighbor, ...) check some of these
11342 + values themselves and, for instance, stop flushing instead of resulting in a restart.
11343 + If any of these results are true error conditions then flush will go into a busy-loop,
11344 + as we noticed during testing when a corrupt tree caused find_child_ptr to return
11345 + ENOENT. It needs careful thought and testing of corner conditions.
11346 +*/
11347 +/* D. Atomicity of flush_prep against deletion and flush concurrency. Suppose a created
11348 + block is assigned a block number then early-flushed to disk. It is dirtied again and
11349 + flush is called again. Concurrently, that block is deleted, and the de-allocation of
11350 + its block number does not need to be deferred, since it is not part of the preserve set
11351 + (i.e., it didn't exist before the transaction). I think there may be a race condition
11352 + where flush writes the dirty, created block after the non-deferred deallocated block
11353 + number is re-allocated, making it possible to write deleted data on top of non-deleted
11354 +   data.  It's just a theory, but it needs to be thought out. */
11355 +/* F. bio_alloc() failure is not handled gracefully. */
11356 +/* G. Unallocated children. */
11357 +/* H. Add a WANDERED_LIST to the atom to clarify the placement of wandered blocks. */
11358 +/* I. Rename flush-scan to scan-point, (flush-pos to flush-point?) */
11359 +
11360 +/* JNODE_FLUSH: MAIN ENTRY POINT */
11361 +/* This is the main entry point for flushing a jnode and its dirty neighborhood (dirty
11362 +   neighborhood is named "slum").  Jnode_flush() is called when reiser4 has to write dirty
11363 +   blocks to disk, which happens when the Linux VM decides to reduce the number of dirty
11364 +   pages or as part of transaction commit.
11365 +
11366 + Our objective here is to prep and flush the slum the jnode belongs to. We want to
11367 + squish the slum together, and allocate the nodes in it as we squish because allocation
11368 + of children affects squishing of parents.
11369 +
11370 + The "argument" @node tells flush where to start. From there, flush finds the left edge
11371 + of the slum, and calls squalloc (in which nodes are squeezed and allocated). To find a
11372 + "better place" to start squalloc first we perform a flush_scan.
11373 +
11374 + Flush-scanning may be performed in both left and right directions, but for different
11375 + purposes. When scanning to the left, we are searching for a node that precedes a
11376 + sequence of parent-first-ordered nodes which we will then flush in parent-first order.
11377 + During flush-scanning, we also take the opportunity to count the number of consecutive
11378 + leaf nodes. If this number is past some threshold (FLUSH_RELOCATE_THRESHOLD), then we
11379 + make a decision to reallocate leaf nodes (thus favoring write-optimization).
11380 +
11381 + Since the flush argument node can be anywhere in a sequence of dirty leaves, there may
11382 + also be dirty nodes to the right of the argument. If the scan-left operation does not
11383 + count at least FLUSH_RELOCATE_THRESHOLD nodes then we follow it with a right-scan
11384 +   operation to see whether there are, in fact, enough nodes to meet the relocate
11385 + threshold. Each right- and left-scan operation uses a single flush_scan object.
11386 +
11387 + After left-scan and possibly right-scan, we prepare a flush_position object with the
11388 + starting flush point or parent coordinate, which was determined using scan-left.
11389 +
11390 + Next we call the main flush routine, squalloc, which iterates along the
11391 + leaf level, squeezing and allocating nodes (and placing them into the flush queue).
11392 +
11393 + After squalloc returns we take extra steps to ensure that all the children
11394 + of the final twig node are allocated--this involves repeating squalloc
11395 + until we finish at a twig with no unallocated children.
11396 +
11397 + Finally, we call flush_empty_queue to submit write-requests to disk. If we encounter
11398 + any above-twig nodes during flush_empty_queue that still have unallocated children, we
11399 + flush_unprep them.
11400 +
11401 + Flush treats several "failure" cases as non-failures, essentially causing them to start
11402 + over. E_DEADLOCK is one example. FIXME:(C) EINVAL, E_NO_NEIGHBOR, ENOENT: these should
11403 + probably be handled properly rather than restarting, but there are a bunch of cases to
11404 + audit.
11405 +*/
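+
+/* A compact, self-contained restatement of the phases described above, with
+   stub names that are illustrative only; the authoritative control flow,
+   locking and error handling are in jnode_flush() below. */
+enum { RELOCATE_THRESHOLD_SKETCH = 64 };
+static unsigned scan_left_stub(void)  { return 10; }   /* dirty leaves seen */
+static unsigned scan_right_stub(void) { return 54; }
+
+static int jnode_flush_phases_sketch(void)
+{
+        unsigned counted = scan_left_stub();    /* 1: find slum left edge */
+
+        if (counted < RELOCATE_THRESHOLD_SKETCH)
+                counted += scan_right_stub();   /* 2: maybe count rightward */
+        /* 3: decide the leaf-relocate policy from 'counted' */
+        /* 4: prepare the flush position at the leftmost dirty node */
+        /* 5: squalloc: squeeze and allocate rightward, queueing nodes */
+        /* 6: submit the flush queue to disk */
+        return counted >= RELOCATE_THRESHOLD_SKETCH;    /* relocate leaves? */
+}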
11406 +
11407 +static int
11408 +jnode_flush(jnode * node, long nr_to_write, long *nr_written,
11409 + flush_queue_t * fq, int flags)
11410 +{
11411 + long ret = 0;
11412 + flush_scan *right_scan;
11413 + flush_scan *left_scan;
11414 + flush_pos_t *flush_pos;
11415 + int todo;
11416 + struct super_block *sb;
11417 + reiser4_super_info_data *sbinfo;
11418 + jnode *leftmost_in_slum = NULL;
11419 +
11420 + assert("jmacd-76619", lock_stack_isclean(get_current_lock_stack()));
11421 + assert("nikita-3022", schedulable());
11422 +
11423 + /* lock ordering: delete_sema and flush_sema are unordered */
11424 + assert("nikita-3185",
11425 + get_current_super_private()->delete_sema_owner != current);
11426 +
11427 + /* allocate right_scan, left_scan and flush_pos */
11428 + right_scan =
11429 + kmalloc(2 * sizeof(*right_scan) + sizeof(*flush_pos), get_gfp_mask());
11430 + if (right_scan == NULL)
11431 + return RETERR(-ENOMEM);
11432 + left_scan = right_scan + 1;
11433 + flush_pos = (flush_pos_t *) (left_scan + 1);
11434 +
11435 + sb = reiser4_get_current_sb();
11436 + sbinfo = get_super_private(sb);
11437 + if (!reiser4_is_set(sb, REISER4_MTFLUSH)) {
11438 + down(&sbinfo->flush_sema);
11439 + }
11440 +
11441 + /* Flush-concurrency debug code */
11442 +#if REISER4_DEBUG
11443 + atomic_inc(&flush_cnt);
11444 +#endif
11445 +
11446 + enter_flush(sb);
11447 +
11448 + /* Initialize a flush position. */
11449 + pos_init(flush_pos);
11450 +
11451 + flush_pos->nr_written = nr_written;
11452 + flush_pos->fq = fq;
11453 + flush_pos->flags = flags;
11454 + flush_pos->nr_to_write = nr_to_write;
11455 +
11456 + scan_init(right_scan);
11457 + scan_init(left_scan);
11458 +
11459 + /* First scan left and remember the leftmost scan position. If the leftmost
11460 +	   position is unformatted we remember its parent_coord.  We scan until we have
11461 +	   counted FLUSH_SCAN_MAXNODES nodes.
11462 +
11463 + If starting @node is unformatted, at the beginning of left scan its
11464 + parent (twig level node, containing extent item) will be long term
11465 + locked and lock handle will be stored in the
11466 + @right_scan->parent_lock. This lock is used to start the rightward
11467 + scan without redoing the tree traversal (necessary to find parent)
11468 + and, hence, is kept during leftward scan. As a result, we have to
11469 + use try-lock when taking long term locks during the leftward scan.
11470 + */
11471 + ret = scan_left(left_scan, right_scan,
11472 + node, sbinfo->flush.scan_maxnodes);
11473 + if (ret != 0)
11474 + goto failed;
11475 +
11476 + leftmost_in_slum = jref(left_scan->node);
11477 + scan_done(left_scan);
11478 +
11479 + /* Then possibly go right to decide if we will use a policy of relocating leaves.
11480 + This is only done if we did not scan past (and count) enough nodes during the
11481 + leftward scan. If we do scan right, we only care to go far enough to establish
11482 + that at least FLUSH_RELOCATE_THRESHOLD number of nodes are being flushed. The
11483 + scan limit is the difference between left_scan.count and the threshold. */
11484 +
11485 + todo = sbinfo->flush.relocate_threshold - left_scan->count;
11486 + /* scan right is inherently deadlock prone, because we are
11487 + * (potentially) holding a lock on the twig node at this moment.
11488 + * FIXME: this is incorrect comment: lock is not held */
11489 + if (todo > 0) {
11490 + ret = scan_right(right_scan, node, (unsigned)todo);
11491 + if (ret != 0)
11492 + goto failed;
11493 + }
11494 +
11495 +	/* Only the right-scan count is needed; release any rightward locks right away. */
11496 + scan_done(right_scan);
11497 +
11498 + /* ... and the answer is: we should relocate leaf nodes if at least
11499 + FLUSH_RELOCATE_THRESHOLD nodes were found. */
11500 + flush_pos->leaf_relocate = JF_ISSET(node, JNODE_REPACK) ||
11501 + (left_scan->count + right_scan->count >=
11502 + sbinfo->flush.relocate_threshold);
11503 +
11504 +	/* Funny business here.  We set the 'point' in the flush_position prior to
11505 + starting squalloc regardless of whether the first point is
11506 + formatted or unformatted. Without this there would be an invariant, in the
11507 + rest of the code, that if the flush_position is unformatted then
11508 + flush_position->point is NULL and flush_position->parent_{lock,coord} is set,
11509 + and if the flush_position is formatted then flush_position->point is non-NULL
11510 + and no parent info is set.
11511 +
11512 + This seems lazy, but it makes the initial calls to reverse_relocate_test
11513 +	   (which ask "is pos->point the leftmost child of its parent?") much easier
11514 + because we know the first child already. Nothing is broken by this, but the
11515 + reasoning is subtle. Holding an extra reference on a jnode during flush can
11516 + cause us to see nodes with HEARD_BANSHEE during squalloc, because nodes are not
11517 + removed from sibling lists until they have zero reference count. Flush would
11518 + never observe a HEARD_BANSHEE node on the left-edge of flush, nodes are only
11519 + deleted to the right. So if nothing is broken, why fix it?
11520 +
11521 + NOTE-NIKITA actually, flush can meet HEARD_BANSHEE node at any
11522 + point and in any moment, because of the concurrent file system
11523 + activity (for example, truncate). */
11524 +
11525 + /* Check jnode state after flush_scan completed. Having a lock on this
11526 + node or its parent (in case of unformatted) helps us in case of
11527 + concurrent flushing. */
11528 + if (jnode_check_flushprepped(leftmost_in_slum)
11529 + && !jnode_convertible(leftmost_in_slum)) {
11530 + ret = 0;
11531 + goto failed;
11532 + }
11533 +
11534 + /* Now setup flush_pos using scan_left's endpoint. */
11535 + ret = prepare_flush_pos(flush_pos, leftmost_in_slum);
11536 + if (ret)
11537 + goto failed;
11538 +
11539 + if (znode_get_level(flush_pos->coord.node) == LEAF_LEVEL
11540 + && node_is_empty(flush_pos->coord.node)) {
11541 + znode *empty = flush_pos->coord.node;
11542 +
11543 + assert("zam-1022", !ZF_ISSET(empty, JNODE_HEARD_BANSHEE));
11544 + ret = delete_empty_node(empty);
11545 + goto failed;
11546 + }
11547 +
11548 + if (jnode_check_flushprepped(leftmost_in_slum)
11549 + && !jnode_convertible(leftmost_in_slum)) {
11550 + ret = 0;
11551 + goto failed;
11552 + }
11553 +
11554 + /* Set pos->preceder and (re)allocate pos and its ancestors if it is needed */
11555 + ret = alloc_pos_and_ancestors(flush_pos);
11556 + if (ret)
11557 + goto failed;
11558 +
11559 + /* Do the main rightward-bottom-up squeeze and allocate loop. */
11560 + ret = squalloc(flush_pos);
11561 + pos_stop(flush_pos);
11562 + if (ret)
11563 + goto failed;
11564 +
11565 + /* FIXME_NFQUCMPD: Here, handle the twig-special case for unallocated children.
11566 + First, the pos_stop() and pos_valid() routines should be modified
11567 + so that pos_stop() sets a flush_position->stop flag to 1 without
11568 + releasing the current position immediately--instead release it in
11569 + pos_done(). This is a better implementation than the current one anyway.
11570 +
11571 +	   It is not clear whether all fields of the flush_position should be released,
11572 +	   but at the very least the parent_lock, parent_coord, and parent_load should
11573 +	   remain held because they hold the last twig when pos_stop() is
11574 +	   called.
11575 +
11576 + When we reach this point in the code, if the parent_coord is set to after the
11577 + last item then we know that flush reached the end of a twig (and according to
11578 + the new flush queueing design, we will return now). If parent_coord is not
11579 + past the last item, we should check if the current twig has any unallocated
11580 + children to the right (we are not concerned with unallocated children to the
11581 + left--in that case the twig itself should not have been allocated). If the
11582 + twig has unallocated children to the right, set the parent_coord to that
11583 + position and then repeat the call to squalloc.
11584 +
11585 + Testing for unallocated children may be defined in two ways: if any internal
11586 + item has a fake block number, it is unallocated; if any extent item is
11587 + unallocated then all of its children are unallocated. But there is a more
11588 + aggressive approach: if there are any dirty children of the twig to the right
11589 + of the current position, we may wish to relocate those nodes now. Checking for
11590 + potential relocation is more expensive as it requires knowing whether there are
11591 + any dirty children that are not unallocated. The extent_needs_allocation
11592 + should be used after setting the correct preceder.
11593 +
11594 + When we reach the end of a twig at this point in the code, if the flush can
11595 + continue (when the queue is ready) it will need some information on the future
11596 + starting point. That should be stored away in the flush_handle using a seal, I
11597 + believe. Holding a jref() on the future starting point may break other code
11598 + that deletes that node.
11599 + */
11600 +
11601 + /* FIXME_NFQUCMPD: Also, we don't want to do any flushing when flush is called
11602 + above the twig level. If the VM calls flush above the twig level, do nothing
11603 + and return (but figure out why this happens). The txnmgr should be modified to
11604 + only flush its leaf-level dirty list. This will do all the necessary squeeze
11605 + and allocate steps but leave unallocated branches and possibly unallocated
11606 + twigs (when the twig's leftmost child is not dirty). After flushing the leaf
11607 + level, the remaining unallocated nodes should be given write-optimized
11608 + locations. (Possibly, the remaining unallocated twigs should be allocated just
11609 + before their leftmost child.)
11610 + */
11611 +
11612 + /* Any failure reaches this point. */
11613 + failed:
11614 +
11615 + switch (ret) {
11616 + case -E_REPEAT:
11617 + case -EINVAL:
11618 + case -E_DEADLOCK:
11619 + case -E_NO_NEIGHBOR:
11620 + case -ENOENT:
11621 + /* FIXME(C): Except for E_DEADLOCK, these should probably be handled properly
11622 + in each case. They already are handled in many cases. */
11623 + /* Something bad happened, but difficult to avoid... Try again! */
11624 + ret = 0;
11625 + }
11626 +
11627 + if (leftmost_in_slum)
11628 + jput(leftmost_in_slum);
11629 +
11630 + pos_done(flush_pos);
11631 + scan_done(left_scan);
11632 + scan_done(right_scan);
11633 + kfree(right_scan);
11634 +
11635 + ON_DEBUG(atomic_dec(&flush_cnt));
11636 +
11637 + leave_flush(sb);
11638 +
11639 + if (!reiser4_is_set(sb, REISER4_MTFLUSH))
11640 + up(&sbinfo->flush_sema);
11641 +
11642 + return ret;
11643 +}
11644 +
11645 +/* The reiser4 flush subsystem can be put into "rapid flush mode", which means
11646 + * that the flusher should submit all prepped nodes immediately, without keeping
11647 + * them in flush queues for a long time. The reason for rapid flush mode is to free
11648 + * memory as fast as possible. */
11649 +
11650 +#if REISER4_USE_RAPID_FLUSH
11651 +
11652 +/**
11653 + * submit all prepped nodes if rapid flush mode is set,
11654 + * turn rapid flush mode off.
11655 + */
11656 +
11657 +static int rapid_flush(flush_pos_t * pos)
11658 +{
11659 + if (!wbq_available())
11660 + return 0;
11661 +
11662 + return write_prepped_nodes(pos);
11663 +}
11664 +
11665 +#else
11666 +
11667 +#define rapid_flush(pos) (0)
11668 +
11669 +#endif /* REISER4_USE_RAPID_FLUSH */
11670 +
11671 +static jnode *find_flush_start_jnode(jnode *start, txn_atom *atom,
11672 + flush_queue_t *fq, int *nr_queued,
11673 + int flags)
11674 +{
11675 + jnode * node;
11676 +
11677 + if (start != NULL) {
11678 + spin_lock_jnode(start);
11679 + if (!jnode_is_flushprepped(start)) {
11680 + assert("zam-1056", start->atom == atom);
11681 + node = start;
11682 + goto enter;
11683 + }
11684 + spin_unlock_jnode(start);
11685 + }
11686 + /*
11687 +	 * In this loop we process all nodes that were already prepped (RELOC or OVRWR)
11688 +	 * and then dirtied again. The atom spin lock is not released until all dirty
11689 +	 * nodes are processed or a non-prepped node is found in the atom dirty lists.
11690 + */
11691 + while ((node = find_first_dirty_jnode(atom, flags))) {
11692 + spin_lock_jnode(node);
11693 + enter:
11694 + assert("zam-881", JF_ISSET(node, JNODE_DIRTY));
11695 + assert("zam-898", !JF_ISSET(node, JNODE_OVRWR));
11696 +
11697 + if (JF_ISSET(node, JNODE_WRITEBACK)) {
11698 + /* move node to the end of atom's writeback list */
11699 + list_move_tail(&node->capture_link, ATOM_WB_LIST(atom));
11700 +
11701 + /*
11702 + * jnode is not necessarily on dirty list: if it was dirtied when
11703 + * it was on flush queue - it does not get moved to dirty list
11704 + */
11705 + ON_DEBUG(count_jnode(atom, node, NODE_LIST(node),
11706 + WB_LIST, 1));
11707 +
11708 + } else if (jnode_is_znode(node)
11709 + && znode_above_root(JZNODE(node))) {
11710 + /*
11711 + * A special case for znode-above-root. The above-root (fake)
11712 + * znode is captured and dirtied when the tree height changes or
11713 + * when the root node is relocated. This causes atoms to fuse so
11714 + * that changes at the root are serialized. However, this node is
11715 + * never flushed. This special case used to be in lock.c to
11716 + * prevent the above-root node from ever being captured, but now
11717 + * that it is captured we simply prevent it from flushing. The
11718 + * log-writer code relies on this to properly log superblock
11719 + * modifications of the tree height.
11720 + */
11721 + jnode_make_wander_nolock(node);
11722 + } else if (JF_ISSET(node, JNODE_RELOC)) {
11723 + queue_jnode(fq, node);
11724 + ++(*nr_queued);
11725 + } else
11726 + break;
11727 +
11728 + spin_unlock_jnode(node);
11729 + }
11730 + return node;
11731 +}
11732 +
11733 +
11734 +/* Flush some nodes of the current atom, usually a slum. Return -E_REPEAT if there are
11735 + * more nodes to flush, return 0 if the atom's dirty lists are empty (keeping the current
11736 + * atom locked), and return other errors as they are. */
11737 +int
11738 +flush_current_atom(int flags, long nr_to_write, long *nr_submitted,
11739 + txn_atom ** atom, jnode *start)
11740 +{
11741 + reiser4_super_info_data *sinfo = get_current_super_private();
11742 + flush_queue_t *fq = NULL;
11743 + jnode *node;
11744 + int nr_queued;
11745 + int ret;
11746 +
11747 + assert("zam-889", atom != NULL && *atom != NULL);
11748 + assert_spin_locked(&((*atom)->alock));
11749 + assert("zam-892", get_current_context()->trans->atom == *atom);
11750 +
11751 + nr_to_write = LONG_MAX;
11752 + while (1) {
11753 + ret = fq_by_atom(*atom, &fq);
11754 + if (ret != -E_REPEAT)
11755 + break;
11756 + *atom = get_current_atom_locked();
11757 + }
11758 + if (ret)
11759 + return ret;
11760 +
11761 + assert_spin_locked(&((*atom)->alock));
11762 +
11763 + /* parallel flushers limit */
11764 + if (sinfo->tmgr.atom_max_flushers != 0) {
11765 + while ((*atom)->nr_flushers >= sinfo->tmgr.atom_max_flushers) {
11766 + /* An atom_send_event() call is inside fq_put_nolock() which is
11767 + called when flush is finished and nr_flushers is
11768 + decremented. */
11769 + atom_wait_event(*atom);
11770 + *atom = get_current_atom_locked();
11771 + }
11772 + }
11773 +
11774 + /* count ourself as a flusher */
11775 + (*atom)->nr_flushers++;
11776 +
11777 + writeout_mode_enable();
11778 +
11779 + nr_queued = 0;
11780 + node = find_flush_start_jnode(start, *atom, fq, &nr_queued, flags);
11781 +
11782 + if (node == NULL) {
11783 + if (nr_queued == 0) {
11784 + (*atom)->nr_flushers--;
11785 + fq_put_nolock(fq);
11786 + atom_send_event(*atom);
11787 + /* current atom remains locked */
11788 + writeout_mode_disable();
11789 + return 0;
11790 + }
11791 + spin_unlock_atom(*atom);
11792 + } else {
11793 + jref(node);
11794 + BUG_ON((*atom)->super != node->tree->super);
11795 + spin_unlock_atom(*atom);
11796 + spin_unlock_jnode(node);
11797 + BUG_ON(nr_to_write == 0);
11798 + ret = jnode_flush(node, nr_to_write, nr_submitted, fq, flags);
11799 + jput(node);
11800 + }
11801 +
11802 + ret =
11803 + write_fq(fq, nr_submitted,
11804 + WRITEOUT_SINGLE_STREAM | WRITEOUT_FOR_PAGE_RECLAIM);
11805 +
11806 + *atom = get_current_atom_locked();
11807 + (*atom)->nr_flushers--;
11808 + fq_put_nolock(fq);
11809 + atom_send_event(*atom);
11810 + spin_unlock_atom(*atom);
11811 +
11812 + writeout_mode_disable();
11813 +
11814 + if (ret == 0)
11815 + ret = -E_REPEAT;
11816 +
11817 + return ret;
11818 +}
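+
+/* A hedged, self-contained sketch of the caller contract documented above:
+   keep calling while the function reports "more to flush" (-E_REPEAT in the
+   real code), and stop on 0 (dirty lists empty; the real caller then still
+   holds the atom lock) or on any other error.  flush_more_stub() merely
+   stands in for flush_current_atom() here. */
+enum { E_REPEAT_SKETCH = 1 };
+
+static int flush_more_stub(int *passes_left)
+{
+        return (*passes_left)-- > 0 ? -E_REPEAT_SKETCH : 0;
+}
+
+static int drive_flush_sketch(void)
+{
+        int passes = 2;         /* pretend two slums need flushing */
+        int ret;
+
+        do {
+                ret = flush_more_stub(&passes);
+        } while (ret == -E_REPEAT_SKETCH);
+        return ret;     /* 0: nothing left to flush; <0: real error */
+}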
11819 +
11820 +/* REVERSE PARENT-FIRST RELOCATION POLICIES */
11821 +
11822 +/* This implements the is-it-close-enough-to-its-preceder? test for relocation in the
11823 + reverse parent-first relocate context. Here all we know is the preceder and the block
11824 + number. Since we are going in reverse, the preceder may still be relocated as well, so
11825 + we can't ask the block allocator "is there a closer block available to relocate?" here.
11826 + In the _forward_ parent-first relocate context (not here) we actually call the block
11827 + allocator to try and find a closer location. */
11828 +static int
11829 +reverse_relocate_if_close_enough(const reiser4_block_nr * pblk,
11830 + const reiser4_block_nr * nblk)
11831 +{
11832 + reiser4_block_nr dist;
11833 +
11834 + assert("jmacd-7710", *pblk != 0 && *nblk != 0);
11835 + assert("jmacd-7711", !blocknr_is_fake(pblk));
11836 + assert("jmacd-7712", !blocknr_is_fake(nblk));
11837 +
11838 + /* Distance is the absolute value. */
11839 + dist = (*pblk > *nblk) ? (*pblk - *nblk) : (*nblk - *pblk);
11840 +
11841 + /* If the block is less than FLUSH_RELOCATE_DISTANCE blocks away from its preceder
11842 + block, do not relocate. */
11843 + if (dist <= get_current_super_private()->flush.relocate_distance) {
11844 + return 0;
11845 + }
11846 +
11847 + return 1;
11848 +}
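+
+/* A worked example of the distance test above, as a standalone predicate
+   (not the kernel function).  With the default flush.relocate_distance of
+   64, a node at block 1040 whose preceder is block 1000 is 40 blocks away
+   and stays in place (overwrite set), while a node at block 1100 is 100
+   blocks away and gets relocated. */
+static int close_enough_example(unsigned long long pblk,
+                                unsigned long long nblk,
+                                unsigned long long relocate_distance)
+{
+        unsigned long long dist = pblk > nblk ? pblk - nblk : nblk - pblk;
+
+        return dist <= relocate_distance ? 0 /* keep */ : 1 /* relocate */;
+}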
11849 +
11850 +/* This function is a predicate that tests for relocation. Always called in the
11851 + reverse-parent-first context, when we are asking whether the current node should be
11852 + relocated in order to expand the flush by dirtying the parent level (and thus
11853 + proceeding to flush that level). When traversing in the forward parent-first direction
11854 + (not here), relocation decisions are handled in two places: allocate_znode() and
11855 + extent_needs_allocation(). */
11856 +static int
11857 +reverse_relocate_test(jnode * node, const coord_t * parent_coord,
11858 + flush_pos_t * pos)
11859 +{
11860 + reiser4_block_nr pblk = 0;
11861 + reiser4_block_nr nblk = 0;
11862 +
11863 + assert("jmacd-8989", !jnode_is_root(node));
11864 +
11865 + /*
11866 + * This function is called only from the
11867 + * reverse_relocate_check_dirty_parent() and only if the parent
11868 + * node is clean. This implies that the parent has the real (i.e., not
11869 + * fake) block number, and, so does the child, because otherwise the
11870 + * parent would be dirty.
11871 + */
11872 +
11873 + /* New nodes are treated as if they are being relocated. */
11874 + if (JF_ISSET (node, JNODE_CREATED) ||
11875 + (pos->leaf_relocate && jnode_get_level(node) == LEAF_LEVEL)) {
11876 + return 1;
11877 + }
11878 +
11879 + /* Find the preceder. FIXME(B): When the child is an unformatted, previously
11880 + existing node, the coord may be leftmost even though the child is not the
11881 + parent-first preceder of the parent. If the first dirty node appears somewhere
11882 + in the middle of the first extent unit, this preceder calculation is wrong.
11883 + Needs more logic in here. */
11884 + if (coord_is_leftmost_unit(parent_coord)) {
11885 + pblk = *znode_get_block(parent_coord->node);
11886 + } else {
11887 + pblk = pos->preceder.blk;
11888 + }
11889 + check_preceder(pblk);
11890 +
11891 + /* If (pblk == 0) then the preceder isn't allocated or isn't known: relocate. */
11892 + if (pblk == 0) {
11893 + return 1;
11894 + }
11895 +
11896 + nblk = *jnode_get_block(node);
11897 +
11898 + if (blocknr_is_fake(&nblk))
11899 + /* child is unallocated, mark parent dirty */
11900 + return 1;
11901 +
11902 + return reverse_relocate_if_close_enough(&pblk, &nblk);
11903 +}
11904 +
11905 +/* This function calls reverse_relocate_test to make a reverse-parent-first
11906 + relocation decision and then, if yes, it marks the parent dirty. */
11907 +static int
11908 +reverse_relocate_check_dirty_parent(jnode * node, const coord_t * parent_coord,
11909 + flush_pos_t * pos)
11910 +{
11911 + int ret;
11912 +
11913 + if (!JF_ISSET(ZJNODE(parent_coord->node), JNODE_DIRTY)) {
11914 +
11915 + ret = reverse_relocate_test(node, parent_coord, pos);
11916 + if (ret < 0) {
11917 + return ret;
11918 + }
11919 +
11920 + /* FIXME-ZAM
11921 + if parent is already relocated - we do not want to grab space, right? */
11922 + if (ret == 1) {
11923 + int grabbed;
11924 +
11925 + grabbed = get_current_context()->grabbed_blocks;
11926 + if (reiser4_grab_space_force((__u64) 1, BA_RESERVED) !=
11927 + 0)
11928 + reiser4_panic("umka-1250",
11929 + "No space left during flush.");
11930 +
11931 + assert("jmacd-18923",
11932 + znode_is_write_locked(parent_coord->node));
11933 + znode_make_dirty(parent_coord->node);
11934 + grabbed2free_mark(grabbed);
11935 + }
11936 + }
11937 +
11938 + return 0;
11939 +}
11940 +
11941 +/* INITIAL ALLOCATE ANCESTORS STEP (REVERSE PARENT-FIRST ALLOCATION BEFORE FORWARD
11942 + PARENT-FIRST LOOP BEGINS) */
11943 +
11944 +/* Get the leftmost child for given coord. */
11945 +static int get_leftmost_child_of_unit(const coord_t * coord, jnode ** child)
11946 +{
11947 + int ret;
11948 +
11949 + ret = item_utmost_child(coord, LEFT_SIDE, child);
11950 +
11951 + if (ret)
11952 + return ret;
11953 +
11954 + if (IS_ERR(*child))
11955 + return PTR_ERR(*child);
11956 +
11957 + return 0;
11958 +}
11959 +
11960 +/* This step occurs after the left- and right-scans are completed, before starting the
11961 + forward parent-first traversal. Here we attempt to allocate ancestors of the starting
11962 + flush point, which means continuing in the reverse parent-first direction to the
11963 + parent, grandparent, and so on (as long as the child is a leftmost child). This
11964 + routine calls a recursive process, alloc_one_ancestor, which does the real work,
11965 + except there is special-case handling here for the first ancestor, which may be a twig.
11966 + At each level (here and alloc_one_ancestor), we check for relocation and then, if
11967 + the child is a leftmost child, repeat at the next level. On the way back down (the
11968 + recursion), we allocate the ancestors in parent-first order. */
11969 +static int alloc_pos_and_ancestors(flush_pos_t * pos)
11970 +{
11971 + int ret = 0;
11972 + lock_handle plock;
11973 + load_count pload;
11974 + coord_t pcoord;
11975 +
11976 + if (znode_check_flushprepped(pos->lock.node))
11977 + return 0;
11978 +
11979 + coord_init_invalid(&pcoord, NULL);
11980 + init_lh(&plock);
11981 + init_load_count(&pload);
11982 +
11983 + if (pos->state == POS_ON_EPOINT) {
11984 + /* a special case for pos on twig level, where we already have
11985 + a lock on parent node. */
11986 + /* The parent may not be dirty, in which case we should decide
11987 + whether to relocate the child now. If decision is made to
11988 + relocate the child, the parent is marked dirty. */
11989 + ret =
11990 + reverse_relocate_check_dirty_parent(pos->child, &pos->coord,
11991 + pos);
11992 + if (ret)
11993 + goto exit;
11994 +
11995 + /* FIXME_NFQUCMPD: We only need to allocate the twig (if child
11996 + is leftmost) and the leaf/child, so recursion is not needed.
11997 + Levels above the twig will be allocated for
11998 + write-optimization before the transaction commits. */
11999 +
12000 + /* Do the recursive step, allocating zero or more of our
12001 + * ancestors. */
12002 + ret = alloc_one_ancestor(&pos->coord, pos);
12003 +
12004 + } else {
12005 + if (!znode_is_root(pos->lock.node)) {
12006 + /* all formatted nodes except tree root */
12007 + ret =
12008 + reiser4_get_parent(&plock, pos->lock.node,
12009 + ZNODE_WRITE_LOCK);
12010 + if (ret)
12011 + goto exit;
12012 +
12013 + ret = incr_load_count_znode(&pload, plock.node);
12014 + if (ret)
12015 + goto exit;
12016 +
12017 + ret =
12018 + find_child_ptr(plock.node, pos->lock.node, &pcoord);
12019 + if (ret)
12020 + goto exit;
12021 +
12022 +			ret = reverse_relocate_check_dirty_parent(
12023 +				ZJNODE(pos->lock.node), &pcoord, pos);
12027 + if (ret)
12028 + goto exit;
12029 +
12030 + ret = alloc_one_ancestor(&pcoord, pos);
12031 + if (ret)
12032 + goto exit;
12033 + }
12034 +
12035 + ret = allocate_znode(pos->lock.node, &pcoord, pos);
12036 + }
12037 + exit:
12038 + done_load_count(&pload);
12039 + done_lh(&plock);
12040 + return ret;
12041 +}
12042 +
12043 +/* This is the recursive step described in alloc_pos_and_ancestors, above. Ignoring the
12044 + call to set_preceder, which is the next function described, this checks if the
12045 + child is a leftmost child and returns if it is not. If the child is a leftmost child
12046 + it checks for relocation, possibly dirtying the parent. Then it performs the recursive
12047 + step. */
12048 +static int alloc_one_ancestor(const coord_t * coord, flush_pos_t * pos)
12049 +{
12050 + int ret = 0;
12051 + lock_handle alock;
12052 + load_count aload;
12053 + coord_t acoord;
12054 +
12055 + /* As we ascend at the left-edge of the region to flush, take this opportunity at
12056 + the twig level to find our parent-first preceder unless we have already set
12057 + it. */
12058 + if (pos->preceder.blk == 0) {
12059 + ret = set_preceder(coord, pos);
12060 + if (ret != 0)
12061 + return ret;
12062 + }
12063 +
12064 + /* If the ancestor is clean or already allocated, or if the child is not a
12065 + leftmost child, stop going up, even leaving coord->node not flushprepped. */
12066 + if (znode_check_flushprepped(coord->node)
12067 + || !coord_is_leftmost_unit(coord))
12068 + return 0;
12069 +
12070 + init_lh(&alock);
12071 + init_load_count(&aload);
12072 + coord_init_invalid(&acoord, NULL);
12073 +
12074 + /* Only ascend to the next level if it is a leftmost child, but write-lock the
12075 + parent in case we will relocate the child. */
12076 + if (!znode_is_root(coord->node)) {
12077 +
12078 + ret =
12079 + jnode_lock_parent_coord(ZJNODE(coord->node), &acoord,
12080 + &alock, &aload, ZNODE_WRITE_LOCK,
12081 + 0);
12082 + if (ret != 0) {
12083 + /* FIXME(C): check EINVAL, E_DEADLOCK */
12084 + goto exit;
12085 + }
12086 +
12087 + ret =
12088 + reverse_relocate_check_dirty_parent(ZJNODE(coord->node),
12089 + &acoord, pos);
12090 + if (ret != 0) {
12091 + goto exit;
12092 + }
12093 +
12094 + /* Recursive call. */
12095 + if (!znode_check_flushprepped(acoord.node)) {
12096 + ret = alloc_one_ancestor(&acoord, pos);
12097 + if (ret)
12098 + goto exit;
12099 + }
12100 + }
12101 +
12102 + /* Note: we call allocate with the parent write-locked (except at the root) in
12103 + case we relocate the child, in which case it will modify the parent during this
12104 + call. */
12105 + ret = allocate_znode(coord->node, &acoord, pos);
12106 +
12107 + exit:
12108 + done_load_count(&aload);
12109 + done_lh(&alock);
12110 + return ret;
12111 +}
12112 +
12113 +/* During the reverse parent-first alloc_pos_and_ancestors process described above there is
12114 + a call to this function at the twig level. During alloc_pos_and_ancestors we may ask:
12115 + should this node be relocated (in reverse parent-first context)? We repeat this
12116 + process as long as the child is the leftmost child, eventually reaching an ancestor of
12117 +   the flush point that is not a leftmost child.  The preceder of that ancestor, which is
12118 +   not a leftmost child, is actually on the leaf level: it is the left-neighbor of the
12119 +   flush point, and that left-neighbor is the rightmost child of
12120 +   the twig on the left.  So, when alloc_pos_and_ancestors passes upward through the twig
12121 +   level, it stops momentarily to remember the block of the rightmost child of the twig on
12122 +   the left and stores it in the flush_position's preceder hint.
12123 +
12124 + There is one other place where we may set the flush_position's preceder hint, which is
12125 + during scan-left.
12126 +*/
12127 +static int set_preceder(const coord_t * coord_in, flush_pos_t * pos)
12128 +{
12129 + int ret;
12130 + coord_t coord;
12131 + lock_handle left_lock;
12132 + load_count left_load;
12133 +
12134 + coord_dup(&coord, coord_in);
12135 +
12136 + init_lh(&left_lock);
12137 + init_load_count(&left_load);
12138 +
12139 + /* FIXME(B): Same FIXME as in "Find the preceder" in reverse_relocate_test.
12140 + coord_is_leftmost_unit is not the right test if the unformatted child is in the
12141 + middle of the first extent unit. */
12142 + if (!coord_is_leftmost_unit(&coord)) {
12143 + coord_prev_unit(&coord);
12144 + } else {
12145 + ret =
12146 + reiser4_get_left_neighbor(&left_lock, coord.node,
12147 + ZNODE_READ_LOCK, GN_SAME_ATOM);
12148 + if (ret) {
12149 + /* If we fail for any reason it doesn't matter because the
12150 + preceder is only a hint. We are low-priority at this point, so
12151 + this must be the case. */
12152 + if (ret == -E_REPEAT || ret == -E_NO_NEIGHBOR ||
12153 + ret == -ENOENT || ret == -EINVAL
12154 + || ret == -E_DEADLOCK) {
12155 + ret = 0;
12156 + }
12157 + goto exit;
12158 + }
12159 +
12160 + ret = incr_load_count_znode(&left_load, left_lock.node);
12161 + if (ret)
12162 + goto exit;
12163 +
12164 + coord_init_last_unit(&coord, left_lock.node);
12165 + }
12166 +
12167 + ret =
12168 + item_utmost_child_real_block(&coord, RIGHT_SIDE,
12169 + &pos->preceder.blk);
12170 + exit:
12171 + check_preceder(pos->preceder.blk);
12172 + done_load_count(&left_load);
12173 + done_lh(&left_lock);
12174 + return ret;
12175 +}
12176 +
12177 +/* MAIN SQUEEZE AND ALLOCATE LOOP (THREE BIG FUNCTIONS) */
12178 +
12179 +/* This procedure implements the outer loop of the flush algorithm. To put this in
12180 + context, here is the general list of steps taken by the flush routine as a whole:
12181 +
12182 + 1. Scan-left
12183 + 2. Scan-right (maybe)
12184 + 3. Allocate initial flush position and its ancestors
12185 + 4. <handle extents>
12186 +   5. <squeeze and allocate the next position and its ancestors to-the-right,
12187 +       then update position to-the-right>
12188 + 6. <repeat from #4 until flush is stopped>
12189 +
12190 + This procedure implements the loop in steps 4 through 6 in the above listing.
12191 +
12192 + Step 4: if the current flush position is an extent item (position on the twig level),
12193 + it allocates the extent (allocate_extent_item_in_place) then shifts to the next
12194 + coordinate. If the next coordinate's leftmost child needs flushprep, we will continue.
12195 + If the next coordinate is an internal item, we descend back to the leaf level,
12196 +   otherwise we repeat step #4 (labeled ALLOC_EXTENTS below).  If the "next coordinate"
12197 + brings us past the end of the twig level, then we call
12198 + reverse_relocate_end_of_twig to possibly dirty the next (right) twig, prior to
12199 + step #5 which moves to the right.
12200 +
12201 + Step 5: calls squalloc_changed_ancestors, which initiates a recursive call up the
12202 + tree to allocate any ancestors of the next-right flush position that are not also
12203 + ancestors of the current position. Those ancestors (in top-down order) are the next in
12204 + parent-first order. We squeeze adjacent nodes on the way up until the right node and
12205 + current node share the same parent, then allocate on the way back down. Finally, this
12206 + step sets the flush position to the next-right node. Then repeat steps 4 and 5.
12207 +*/
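+
+/* A self-contained toy rendering of steps 4-6 above; every name here is
+   illustrative, not the reiser4 implementation.  On the twig level we
+   allocate extents and advance; on an internal item we descend to the leaf
+   level; past the end of a twig we consider dirtying the right twig and
+   move right. */
+enum pos_kind_sketch { ON_EXTENT, ON_INTERNAL, END_OF_TWIG, STOPPED };
+
+static enum pos_kind_sketch advance_stub(int *steps_left)
+{
+        if ((*steps_left)-- <= 0)
+                return STOPPED;
+        return *steps_left % 2 ? ON_EXTENT : ON_INTERNAL;
+}
+
+static void squalloc_outer_loop_sketch(void)
+{
+        int steps = 4;
+        enum pos_kind_sketch kind = ON_EXTENT;
+
+        while (kind != STOPPED) {
+                switch (kind) {
+                case ON_EXTENT:
+                        /* step 4: allocate the extent in place, advance */
+                        break;
+                case ON_INTERNAL:
+                        /* step 4: descend back to the leaf level */
+                        break;
+                case END_OF_TWIG:
+                        /* step 5: maybe dirty the right twig, then move
+                           right and allocate the changed ancestors */
+                        break;
+                case STOPPED:
+                        break;
+                }
+                kind = advance_stub(&steps);    /* step 6: repeat */
+        }
+}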
12208 +
12209 +/* SQUEEZE CODE */
12210 +
12211 +/* squalloc_right_twig helper function: cut a range of extent items from
12212 +   node to->node, from the beginning up to coord @to. */
12213 +static int squalloc_right_twig_cut(coord_t * to, reiser4_key * to_key,
12214 + znode * left)
12215 +{
12216 + coord_t from;
12217 + reiser4_key from_key;
12218 +
12219 + coord_init_first_unit(&from, to->node);
12220 + item_key_by_coord(&from, &from_key);
12221 +
12222 + return cut_node_content(&from, to, &from_key, to_key, NULL);
12223 +}
12224 +
12225 +/* Copy as much of the leading extents from @right to @left, allocating
12226 + unallocated extents as they are copied. Returns SQUEEZE_TARGET_FULL or
12227 + SQUEEZE_SOURCE_EMPTY when no more can be shifted. If the next item is an
12228 + internal item it calls shift_one_internal_unit and may then return
12229 + SUBTREE_MOVED. */
12230 +static int squeeze_right_twig(znode * left, znode * right, flush_pos_t * pos)
12231 +{
12232 + int ret = SUBTREE_MOVED;
12233 + coord_t coord; /* used to iterate over items */
12234 + reiser4_key stop_key;
12235 +
12236 + assert("jmacd-2008", !node_is_empty(right));
12237 + coord_init_first_unit(&coord, right);
12238 +
12239 + /* FIXME: can be optimized to cut once */
12240 + while (!node_is_empty(coord.node) && item_is_extent(&coord)) {
12241 + ON_DEBUG(void *vp);
12242 +
12243 + assert("vs-1468", coord_is_leftmost_unit(&coord));
12244 + ON_DEBUG(vp = shift_check_prepare(left, coord.node));
12245 +
12246 + /* stop_key is used to find what was copied and what to cut */
12247 + stop_key = *min_key();
12248 + ret = squalloc_extent(left, &coord, pos, &stop_key);
12249 + if (ret != SQUEEZE_CONTINUE) {
12250 + ON_DEBUG(kfree(vp));
12251 + break;
12252 + }
12253 + assert("vs-1465", !keyeq(&stop_key, min_key()));
12254 +
12255 + /* Helper function to do the cutting. */
12256 + set_key_offset(&stop_key, get_key_offset(&stop_key) - 1);
12257 + check_me("vs-1466",
12258 + squalloc_right_twig_cut(&coord, &stop_key, left) == 0);
12259 +
12260 + ON_DEBUG(shift_check(vp, left, coord.node));
12261 + }
12262 +
12263 + if (node_is_empty(coord.node))
12264 + ret = SQUEEZE_SOURCE_EMPTY;
12265 +
12266 + if (ret == SQUEEZE_TARGET_FULL) {
12267 + goto out;
12268 + }
12269 +
12270 + if (node_is_empty(right)) {
12271 + /* The whole right node was copied into @left. */
12272 + assert("vs-464", ret == SQUEEZE_SOURCE_EMPTY);
12273 + goto out;
12274 + }
12275 +
12276 + coord_init_first_unit(&coord, right);
12277 +
12278 + if (!item_is_internal(&coord)) {
12279 + /* we do not want to squeeze anything else to left neighbor because "slum"
12280 + is over */
12281 + ret = SQUEEZE_TARGET_FULL;
12282 + goto out;
12283 + }
12284 + assert("jmacd-433", item_is_internal(&coord));
12285 +
12286 + /* Shift an internal unit. The child must be allocated before shifting any more
12287 + extents, so we stop here. */
12288 + ret = shift_one_internal_unit(left, right);
12289 +
12290 + out:
12291 + assert("jmacd-8612", ret < 0 || ret == SQUEEZE_TARGET_FULL
12292 + || ret == SUBTREE_MOVED || ret == SQUEEZE_SOURCE_EMPTY);
12293 +
12294 + if (ret == SQUEEZE_TARGET_FULL) {
12295 + /* We submit prepped nodes here and expect that this @left twig
12296 + * will not be modified again during this jnode_flush() call. */
12297 + int ret1;
12298 +
12299 + /* NOTE: seems like io is done under long term locks. */
12300 + ret1 = write_prepped_nodes(pos);
12301 + if (ret1 < 0)
12302 + return ret1;
12303 + }
12304 +
12305 + return ret;
12306 +}
12307 +
12308 +#if REISER4_DEBUG
12309 +static void item_convert_invariant(flush_pos_t * pos)
12310 +{
12311 + assert("edward-1225", coord_is_existing_item(&pos->coord));
12312 + if (chaining_data_present(pos)) {
12313 + item_plugin *iplug = item_convert_plug(pos);
12314 +
12315 + assert("edward-1000",
12316 + iplug == item_plugin_by_coord(&pos->coord));
12317 + assert("edward-1001", iplug->f.convert != NULL);
12318 + } else
12319 + assert("edward-1226", pos->child == NULL);
12320 +}
12321 +#else
12322 +
12323 +#define item_convert_invariant(pos) noop
12324 +
12325 +#endif
12326 +
12327 +/* Scan node items starting from the first one and apply to each
12328 +   item its flush ->convert() method (if any). This method may
12329 + resize/kill the item so the tree will be changed.
12330 +*/
12331 +static int convert_node(flush_pos_t * pos, znode * node)
12332 +{
12333 + int ret = 0;
12334 + item_plugin *iplug;
12335 +
12336 + assert("edward-304", pos != NULL);
12337 + assert("edward-305", pos->child == NULL);
12338 + assert("edward-475", znode_convertible(node));
12339 + assert("edward-669", znode_is_wlocked(node));
12340 + assert("edward-1210", !node_is_empty(node));
12341 +
12342 + if (znode_get_level(node) != LEAF_LEVEL)
12343 + /* unsupported */
12344 + goto exit;
12345 +
12346 + coord_init_first_unit(&pos->coord, node);
12347 +
12348 + while (1) {
12349 + ret = 0;
12350 + coord_set_to_left(&pos->coord);
12351 + item_convert_invariant(pos);
12352 +
12353 + iplug = item_plugin_by_coord(&pos->coord);
12354 + assert("edward-844", iplug != NULL);
12355 +
12356 + if (iplug->f.convert) {
12357 + ret = iplug->f.convert(pos);
12358 + if (ret)
12359 + goto exit;
12360 + }
12361 + assert("edward-307", pos->child == NULL);
12362 +
12363 + if (coord_next_item(&pos->coord)) {
12364 + /* node is over */
12365 +
12366 + if (!chaining_data_present(pos))
12367 + /* finished this node */
12368 + break;
12369 + if (should_chain_next_node(pos)) {
12370 + /* go to next node */
12371 + move_chaining_data(pos, 0 /* to next node */ );
12372 + break;
12373 + }
12374 + /* repeat this node */
12375 + move_chaining_data(pos, 1 /* this node */ );
12376 + continue;
12377 + }
12378 + /* Node is not over.
12379 + Check if there is attached convert data.
12380 +		   If so, roll one item position back and repeat
12381 +		   on this node.
12382 + */
12383 + if (chaining_data_present(pos)) {
12384 +
12385 + if (iplug != item_plugin_by_coord(&pos->coord))
12386 + set_item_convert_count(pos, 0);
12387 +
12388 + ret = coord_prev_item(&pos->coord);
12389 + assert("edward-1003", !ret);
12390 +
12391 + move_chaining_data(pos, 1 /* this node */ );
12392 + }
12393 + }
12394 + JF_CLR(ZJNODE(node), JNODE_CONVERTIBLE);
12395 + znode_make_dirty(node);
12396 + exit:
12397 + assert("edward-1004", !ret);
12398 + return ret;
12399 +}
12400 +
12401 +/* Squeeze and allocate the right neighbor. This is called after @left and
12402 + its current children have been squeezed and allocated already. This
12403 +   procedure's job is to squeeze and allocate items from @right to @left.
12404 +
12405 + If at the leaf level, use the shift_everything_left memcpy-optimized
12406 + version of shifting (squeeze_right_leaf).
12407 +
12408 + If at the twig level, extents are allocated as they are shifted from @right
12409 + to @left (squalloc_right_twig).
12410 +
12411 + At any other level, shift one internal item and return to the caller
12412 + (squalloc_parent_first) so that the shifted-subtree can be processed in
12413 + parent-first order.
12414 +
12415 + When unit of internal item is moved, squeezing stops and SUBTREE_MOVED is
12416 + returned. When all content of @right is squeezed, SQUEEZE_SOURCE_EMPTY is
12417 + returned. If nothing can be moved into @left anymore, SQUEEZE_TARGET_FULL
12418 + is returned.
12419 +*/
12420 +
12421 +static int squeeze_right_neighbor(flush_pos_t * pos, znode * left,
12422 + znode * right)
12423 +{
12424 + int ret;
12425 +
12426 + /* FIXME it is possible to see empty hasn't-heard-banshee node in a
12427 + * tree owing to error (for example, ENOSPC) in write */
12428 + /* assert("jmacd-9321", !node_is_empty(left)); */
12429 + assert("jmacd-9322", !node_is_empty(right));
12430 + assert("jmacd-9323", znode_get_level(left) == znode_get_level(right));
12431 +
12432 + switch (znode_get_level(left)) {
12433 + case TWIG_LEVEL:
12434 + /* Shift with extent allocating until either an internal item
12435 + is encountered or everything is shifted or no free space
12436 + left in @left */
12437 + ret = squeeze_right_twig(left, right, pos);
12438 + break;
12439 +
12440 + default:
12441 + /* All other levels can use shift_everything until we implement per-item
12442 + flush plugins. */
12443 + ret = squeeze_right_non_twig(left, right);
12444 + break;
12445 + }
12446 +
12447 + assert("jmacd-2011", (ret < 0 ||
12448 + ret == SQUEEZE_SOURCE_EMPTY
12449 + || ret == SQUEEZE_TARGET_FULL
12450 + || ret == SUBTREE_MOVED));
12451 + return ret;
12452 +}
12453 +
12454 +static int squeeze_right_twig_and_advance_coord(flush_pos_t * pos,
12455 + znode * right)
12456 +{
12457 + int ret;
12458 +
12459 + ret = squeeze_right_twig(pos->lock.node, right, pos);
12460 + if (ret < 0)
12461 + return ret;
12462 + if (ret > 0) {
12463 + coord_init_after_last_item(&pos->coord, pos->lock.node);
12464 + return ret;
12465 + }
12466 +
12467 + coord_init_last_unit(&pos->coord, pos->lock.node);
12468 + return 0;
12469 +}
12470 +
12471 +/* forward declaration */
12472 +static int squalloc_upper_levels(flush_pos_t *, znode *, znode *);
12473 +
12474 +/* do a fast check for "same parents" condition before calling
12475 + * squalloc_upper_levels() */
12476 +static inline int check_parents_and_squalloc_upper_levels(flush_pos_t * pos,
12477 + znode * left,
12478 + znode * right)
12479 +{
12480 + if (znode_same_parents(left, right))
12481 + return 0;
12482 +
12483 + return squalloc_upper_levels(pos, left, right);
12484 +}
12485 +
12486 +/* Check whether the parent of the given @right node needs to be processed
12487 +   ((re)allocated) prior to processing of the child. If @left and @right do not
12488 +   share a parent, then the parent of @right comes after @left but before
12489 +   @right in parent-first order, so we have to (re)allocate it before @right
12490 +   gets (re)allocated. */
12491 +static int squalloc_upper_levels(flush_pos_t * pos, znode * left, znode * right)
12492 +{
12493 + int ret;
12494 +
12495 + lock_handle left_parent_lock;
12496 + lock_handle right_parent_lock;
12497 +
12498 + load_count left_parent_load;
12499 + load_count right_parent_load;
12500 +
12501 + init_lh(&left_parent_lock);
12502 + init_lh(&right_parent_lock);
12503 +
12504 + init_load_count(&left_parent_load);
12505 + init_load_count(&right_parent_load);
12506 +
12507 + ret = reiser4_get_parent(&left_parent_lock, left, ZNODE_WRITE_LOCK);
12508 + if (ret)
12509 + goto out;
12510 +
12511 + ret = reiser4_get_parent(&right_parent_lock, right, ZNODE_WRITE_LOCK);
12512 + if (ret)
12513 + goto out;
12514 +
12515 + /* Check for same parents */
12516 + if (left_parent_lock.node == right_parent_lock.node)
12517 + goto out;
12518 +
12519 + if (znode_check_flushprepped(right_parent_lock.node)) {
12520 + /* Keep parent-first order. In the order, the right parent node stands
12521 + before the @right node. If it is already allocated, we set the
12522 + preceder (next block search start point) to its block number, @right
12523 + node should be allocated after it.
12524 +
12525 + However, preceder is set only if the right parent is on twig level.
12526 +		   The explanation is the following: new branch nodes are allocated over
12527 +		   already allocated children while the tree grows, so it is difficult to
12528 +		   keep the tree ordered; we assume that only leaves and twigs are correctly
12529 +		   allocated. So, only twigs are used as a preceder for allocating the
12530 +		   rest of the slum. */
12531 + if (znode_get_level(right_parent_lock.node) == TWIG_LEVEL) {
12532 + pos->preceder.blk =
12533 + *znode_get_block(right_parent_lock.node);
12534 + check_preceder(pos->preceder.blk);
12535 + }
12536 + goto out;
12537 + }
12538 +
12539 + ret = incr_load_count_znode(&left_parent_load, left_parent_lock.node);
12540 + if (ret)
12541 + goto out;
12542 +
12543 + ret = incr_load_count_znode(&right_parent_load, right_parent_lock.node);
12544 + if (ret)
12545 + goto out;
12546 +
12547 + ret =
12548 + squeeze_right_neighbor(pos, left_parent_lock.node,
12549 + right_parent_lock.node);
12550 +	/* We stop on error. We also stop if some items/units were shifted (ret == 0)
12551 +	 * and thus @right changed its parent; it means we did not process the
12552 +	 * right_parent node prior to processing @right. Positive return
12553 +	 * values say that no shifting of items happened, because of the "empty
12554 +	 * source" or "target full" conditions. */
12555 + if (ret <= 0)
12556 + goto out;
12557 +
12558 + /* parent(@left) and parent(@right) may have different parents also. We
12559 + * do a recursive call for checking that. */
12560 + ret =
12561 + check_parents_and_squalloc_upper_levels(pos, left_parent_lock.node,
12562 + right_parent_lock.node);
12563 + if (ret)
12564 + goto out;
12565 +
12566 + /* allocate znode when going down */
12567 + ret = lock_parent_and_allocate_znode(right_parent_lock.node, pos);
12568 +
12569 + out:
12570 + done_load_count(&left_parent_load);
12571 + done_load_count(&right_parent_load);
12572 +
12573 + done_lh(&left_parent_lock);
12574 + done_lh(&right_parent_lock);
12575 +
12576 + return ret;
12577 +}
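+
+/*
+ * Illustrative sketch of the ordering enforced above (hypothetical
+ * two-level slum; P1 and P2 are twig-level parents):
+ *
+ *            P1        P2
+ *           /  \      /  \
+ *        ...  left  right ...
+ *
+ * Parent-first order visits P1, left, P2, right.  Since P2 stands
+ * after left but before right in that order, it must be (re)allocated
+ * before right is, which is what squalloc_upper_levels() ensures.
+ */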
12578 +
12579 +/* Check the leftmost child's "flushprepped" status; also return true if the
12580 + * child node was not found in cache. */
12581 +static int leftmost_child_of_unit_check_flushprepped(const coord_t * coord)
12582 +{
12583 + int ret;
12584 + int prepped;
12585 +
12586 + jnode *child;
12587 +
12588 + ret = get_leftmost_child_of_unit(coord, &child);
12589 +
12590 + if (ret)
12591 + return ret;
12592 +
12593 + if (child) {
12594 + prepped = jnode_check_flushprepped(child);
12595 + jput(child);
12596 + } else {
12597 + /* We treat a non-existent child as a node to which slum
12598 + processing should not continue. A node that is not cached is
12599 + clean, so it is flushprepped. */
12600 + prepped = 1;
12601 + }
12602 +
12603 + return prepped;
12604 +}
12605 +
12606 +/* (re)allocate znode with automated getting parent node */
12607 +static int lock_parent_and_allocate_znode(znode * node, flush_pos_t * pos)
12608 +{
12609 + int ret;
12610 + lock_handle parent_lock;
12611 + load_count parent_load;
12612 + coord_t pcoord;
12613 +
12614 + assert("zam-851", znode_is_write_locked(node));
12615 +
12616 + init_lh(&parent_lock);
12617 + init_load_count(&parent_load);
12618 +
12619 + ret = reiser4_get_parent(&parent_lock, node, ZNODE_WRITE_LOCK);
12620 + if (ret)
12621 + goto out;
12622 +
12623 + ret = incr_load_count_znode(&parent_load, parent_lock.node);
12624 + if (ret)
12625 + goto out;
12626 +
12627 + ret = find_child_ptr(parent_lock.node, node, &pcoord);
12628 + if (ret)
12629 + goto out;
12630 +
12631 + ret = allocate_znode(node, &pcoord, pos);
12632 +
12633 + out:
12634 + done_load_count(&parent_load);
12635 + done_lh(&parent_lock);
12636 + return ret;
12637 +}
12638 +
12639 +/* Process formatted nodes on the current level until an unformatted node or
12640 + * the rightmost node in the slum is reached. */
12641 +static int handle_pos_on_formatted(flush_pos_t * pos)
12642 +{
12643 + int ret;
12644 + lock_handle right_lock;
12645 + load_count right_load;
12646 +
12647 + init_lh(&right_lock);
12648 + init_load_count(&right_load);
12649 +
12650 + if (should_convert_node(pos, pos->lock.node)) {
12651 + ret = convert_node(pos, pos->lock.node);
12652 + if (ret)
12653 + return ret;
12654 + }
12655 +
12656 + while (1) {
12657 + ret =
12658 + neighbor_in_slum(pos->lock.node, &right_lock, RIGHT_SIDE,
12659 + ZNODE_WRITE_LOCK,
12660 + !should_convert_next_node(pos,
12661 + right_lock.
12662 + node));
12663 + if (ret)
12664 + break;
12665 +
12666 + /* We don't prep (allocate) nodes for flushing twice. This can be
12667 + * suboptimal, or it can be optimal. For now we choose to live with
12668 + * the risk that it will be suboptimal, because it would be quite
12669 + * complex to code it to be smarter. */
12670 + if (znode_check_flushprepped(right_lock.node)
12671 + && !znode_convertible(right_lock.node)) {
12672 + assert("edward-1005",
12673 + !should_convert_next_node(pos, right_lock.node));
12674 + pos_stop(pos);
12675 + break;
12676 + }
12677 +
12678 + ret = incr_load_count_znode(&right_load, right_lock.node);
12679 + if (ret)
12680 + break;
12681 +
12682 + if (should_convert_node(pos, right_lock.node)) {
12683 + ret = convert_node(pos, right_lock.node);
12684 + if (ret)
12685 + break;
12686 + if (node_is_empty(right_lock.node)) {
12687 + /* node became empty after converting, repeat */
12688 + done_load_count(&right_load);
12689 + done_lh(&right_lock);
12690 + continue;
12691 + }
12692 + }
12693 +
12694 + /* squeeze _before_ going upward. */
12695 + ret =
12696 + squeeze_right_neighbor(pos, pos->lock.node,
12697 + right_lock.node);
12698 + if (ret < 0)
12699 + break;
12700 +
12701 + if (znode_check_flushprepped(right_lock.node)) {
12702 + if (should_convert_next_node(pos, right_lock.node)) {
12703 + /* in spite of flushprepped status of the node,
12704 + its right slum neighbor should be converted */
12705 + assert("edward-953", convert_data(pos));
12706 + assert("edward-954", item_convert_data(pos));
12707 +
12708 + if (node_is_empty(right_lock.node)) {
12709 + done_load_count(&right_load);
12710 + done_lh(&right_lock);
12711 + } else
12712 + move_flush_pos(pos, &right_lock,
12713 + &right_load, NULL);
12714 + continue;
12715 + }
12716 + pos_stop(pos);
12717 + break;
12718 + }
12719 +
12720 + if (node_is_empty(right_lock.node)) {
12721 + /* repeat if right node was squeezed completely */
12722 + done_load_count(&right_load);
12723 + done_lh(&right_lock);
12724 + continue;
12725 + }
12726 +
12727 + /* parent(right_lock.node) has to be processed before
12728 + * (right_lock.node) due to "parent-first" allocation order. */
12729 + ret =
12730 + check_parents_and_squalloc_upper_levels(pos, pos->lock.node,
12731 + right_lock.node);
12732 + if (ret)
12733 + break;
12734 + /* (re)allocate _after_ going upward */
12735 + ret = lock_parent_and_allocate_znode(right_lock.node, pos);
12736 + if (ret)
12737 + break;
12738 +
12739 + if (should_terminate_squalloc(pos)) {
12740 + set_item_convert_count(pos, 0);
12741 + break;
12742 + }
12743 +
12744 + /* advance the flush position to the right neighbor */
12745 + move_flush_pos(pos, &right_lock, &right_load, NULL);
12746 +
12747 + ret = rapid_flush(pos);
12748 + if (ret)
12749 + break;
12750 + }
12751 +
12752 + assert("edward-1006", !convert_data(pos) || !item_convert_data(pos));
12753 +
12754 + done_load_count(&right_load);
12755 + done_lh(&right_lock);
12756 +
12757 + /* This function indicates via @pos whether to stop, go to the twig
12758 + * level, or continue on the current level. */
12759 + return ret;
12760 +
12761 +}
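+
+/*
+ * To summarize the loop above: each iteration (optionally) converts the
+ * right neighbor, squeezes it into the current node, (re)allocates its
+ * ancestors in parent-first order via
+ * check_parents_and_squalloc_upper_levels(), (re)allocates the neighbor
+ * itself, and only then advances the flush position to it.
+ */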
12762 +
12763 +/* Process nodes on the leaf level until an unformatted node or the rightmost
12764 + * node in the slum is reached. */
12765 +static int handle_pos_on_leaf(flush_pos_t * pos)
12766 +{
12767 + int ret;
12768 +
12769 + assert("zam-845", pos->state == POS_ON_LEAF);
12770 +
12771 + ret = handle_pos_on_formatted(pos);
12772 +
12773 + if (ret == -E_NO_NEIGHBOR) {
12774 + /* cannot get right neighbor, go process extents. */
12775 + pos->state = POS_TO_TWIG;
12776 + return 0;
12777 + }
12778 +
12779 + return ret;
12780 +}
12781 +
12782 +/* Process slum on level > 1 */
12783 +static int handle_pos_on_internal(flush_pos_t * pos)
12784 +{
12785 + assert("zam-850", pos->state == POS_ON_INTERNAL);
12786 + return handle_pos_on_formatted(pos);
12787 +}
12788 +
12789 +/* check whether squalloc should stop before processing given extent */
12790 +static int squalloc_extent_should_stop(flush_pos_t * pos)
12791 +{
12792 + assert("zam-869", item_is_extent(&pos->coord));
12793 +
12794 + /* pos->child is the jnode handle_pos_on_extent() should start with,
12795 + * instead of the first child of the first extent unit. */
12796 + if (pos->child) {
12797 + int prepped;
12798 +
12799 + assert("vs-1383", jnode_is_unformatted(pos->child));
12800 + prepped = jnode_check_flushprepped(pos->child);
12801 + pos->pos_in_unit =
12802 + jnode_get_index(pos->child) -
12803 + extent_unit_index(&pos->coord);
12804 + assert("vs-1470",
12805 + pos->pos_in_unit < extent_unit_width(&pos->coord));
12806 + assert("nikita-3434",
12807 + ergo(extent_is_unallocated(&pos->coord),
12808 + pos->pos_in_unit == 0));
12809 + jput(pos->child);
12810 + pos->child = NULL;
12811 +
12812 + return prepped;
12813 + }
12814 +
12815 + pos->pos_in_unit = 0;
12816 + if (extent_is_unallocated(&pos->coord))
12817 + return 0;
12818 +
12819 + return leftmost_child_of_unit_check_flushprepped(&pos->coord);
12820 +}
12821 +
12822 +/* Handle the case when the regular reiser4 tree (znodes connected to their
12823 + * neighbors by sibling pointers) is interrupted on the leaf level by one or
12824 + * more unformatted nodes. By holding a lock on the twig level and using the
12825 + * extent code routines to process unformatted nodes, we swim around the
12826 + * irregular part of the reiser4 tree. */
12827 +static int handle_pos_on_twig(flush_pos_t * pos)
12828 +{
12829 + int ret;
12830 +
12831 + assert("zam-844", pos->state == POS_ON_EPOINT);
12832 + assert("zam-843", item_is_extent(&pos->coord));
12833 +
12834 + /* We decide whether to continue slum processing with the current
12835 + extent unit: if the leftmost child of the current extent unit is
12836 + flushprepped (i.e. clean or already processed by flush) we stop
12837 + squalloc(). There is a fast check for unallocated extents, which we
12838 + assume contain no flushprepped nodes. */
12839 + /* FIXME: Here we implement a simple check; we only look at the
12840 + leftmost child. */
12841 + ret = squalloc_extent_should_stop(pos);
12842 + if (ret != 0) {
12843 + pos_stop(pos);
12844 + return ret;
12845 + }
12846 +
12847 + while (pos_valid(pos) && coord_is_existing_unit(&pos->coord)
12848 + && item_is_extent(&pos->coord)) {
12849 + ret = alloc_extent(pos);
12850 + if (ret) {
12851 + break;
12852 + }
12853 + coord_next_unit(&pos->coord);
12854 + }
12855 +
12856 + if (coord_is_after_rightmost(&pos->coord)) {
12857 + pos->state = POS_END_OF_TWIG;
12858 + return 0;
12859 + }
12860 + if (item_is_internal(&pos->coord)) {
12861 + pos->state = POS_TO_LEAF;
12862 + return 0;
12863 + }
12864 +
12865 + assert("zam-860", item_is_extent(&pos->coord));
12866 +
12867 + /* "slum" is over */
12868 + pos->state = POS_INVALID;
12869 + return 0;
12870 +}
12871 +
12872 +/* When we are about to return the flush position from twig to leaf level, we
12873 + * can either process the right twig node or move the position to the leaf.
12874 + * This processes the right twig if possible and jumps to leaf level if not. */
12875 +static int handle_pos_end_of_twig(flush_pos_t * pos)
12876 +{
12877 + int ret;
12878 + lock_handle right_lock;
12879 + load_count right_load;
12880 + coord_t at_right;
12881 + jnode *child = NULL;
12882 +
12883 + assert("zam-848", pos->state == POS_END_OF_TWIG);
12884 + assert("zam-849", coord_is_after_rightmost(&pos->coord));
12885 +
12886 + init_lh(&right_lock);
12887 + init_load_count(&right_load);
12888 +
12889 + /* We take a lock on the right twig node even if it is not dirty: the
12890 + * slum continues or discontinues on the leaf level, not on the next
12891 + * twig. This lock on the right twig is needed to get its leftmost child. */
12892 + ret =
12893 + reiser4_get_right_neighbor(&right_lock, pos->lock.node,
12894 + ZNODE_WRITE_LOCK, GN_SAME_ATOM);
12895 + if (ret)
12896 + goto out;
12897 +
12898 + ret = incr_load_count_znode(&right_load, right_lock.node);
12899 + if (ret)
12900 + goto out;
12901 +
12902 + /* the right twig may be clean */
12903 + if (JF_ISSET(ZJNODE(right_lock.node), JNODE_DIRTY)) {
12904 + /* If the right twig node is dirty we always attempt to squeeze
12905 + * its content to the left... */
12906 + became_dirty:
12907 + ret =
12908 + squeeze_right_twig_and_advance_coord(pos, right_lock.node);
12909 + if (ret <= 0) {
12910 + /* pos->coord is on internal item, go to leaf level, or
12911 + * we have an error which will be caught in squalloc() */
12912 + pos->state = POS_TO_LEAF;
12913 + goto out;
12914 + }
12915 +
12916 + /* If the right twig was squeezed completely we have to re-lock
12917 + * the right twig. Now this is done through the top-level
12918 + * squalloc routine. */
12919 + if (node_is_empty(right_lock.node))
12920 + goto out;
12921 +
12922 + /* ... and prep it if it is not yet prepped */
12923 + if (!znode_check_flushprepped(right_lock.node)) {
12924 + /* As usual, process parent before ... */
12925 + ret =
12926 + check_parents_and_squalloc_upper_levels(pos,
12927 + pos->lock.
12928 + node,
12929 + right_lock.
12930 + node);
12931 + if (ret)
12932 + goto out;
12933 +
12934 + /* ... processing the child */
12935 + ret =
12936 + lock_parent_and_allocate_znode(right_lock.node,
12937 + pos);
12938 + if (ret)
12939 + goto out;
12940 + }
12941 + } else {
12942 + coord_init_first_unit(&at_right, right_lock.node);
12943 +
12944 + /* check first child of next twig, should we continue there ? */
12945 + ret = get_leftmost_child_of_unit(&at_right, &child);
12946 + if (ret || child == NULL || jnode_check_flushprepped(child)) {
12947 + pos_stop(pos);
12948 + goto out;
12949 + }
12950 +
12951 + /* check clean twig for possible relocation */
12952 + if (!znode_check_flushprepped(right_lock.node)) {
12953 + ret =
12954 + reverse_relocate_check_dirty_parent(child,
12955 + &at_right, pos);
12956 + if (ret)
12957 + goto out;
12958 + if (JF_ISSET(ZJNODE(right_lock.node), JNODE_DIRTY))
12959 + goto became_dirty;
12960 + }
12961 + }
12962 +
12963 + assert("zam-875", znode_check_flushprepped(right_lock.node));
12964 +
12965 + /* Update the preceder by a block number of just processed right twig
12966 + * node. The code above could miss the preceder updating because
12967 + * allocate_znode() could not be called for this node. */
12968 + pos->preceder.blk = *znode_get_block(right_lock.node);
12969 + check_preceder(pos->preceder.blk);
12970 +
12971 + coord_init_first_unit(&at_right, right_lock.node);
12972 + assert("zam-868", coord_is_existing_unit(&at_right));
12973 +
12974 + pos->state = item_is_extent(&at_right) ? POS_ON_EPOINT : POS_TO_LEAF;
12975 + move_flush_pos(pos, &right_lock, &right_load, &at_right);
12976 +
12977 + out:
12978 + done_load_count(&right_load);
12979 + done_lh(&right_lock);
12980 +
12981 + if (child)
12982 + jput(child);
12983 +
12984 + return ret;
12985 +}
12986 +
12987 +/* Move pos->lock to the leaf node pointed to by pos->coord, and check
12988 + * whether we should continue there. */
12989 +static int handle_pos_to_leaf(flush_pos_t * pos)
12990 +{
12991 + int ret;
12992 + lock_handle child_lock;
12993 + load_count child_load;
12994 + jnode *child;
12995 +
12996 + assert("zam-846", pos->state == POS_TO_LEAF);
12997 + assert("zam-847", item_is_internal(&pos->coord));
12998 +
12999 + init_lh(&child_lock);
13000 + init_load_count(&child_load);
13001 +
13002 + ret = get_leftmost_child_of_unit(&pos->coord, &child);
13003 + if (ret)
13004 + return ret;
13005 + if (child == NULL) {
13006 + pos_stop(pos);
13007 + return 0;
13008 + }
13009 +
13010 + if (jnode_check_flushprepped(child)) {
13011 + pos->state = POS_INVALID;
13012 + goto out;
13013 + }
13014 +
13015 + ret =
13016 + longterm_lock_znode(&child_lock, JZNODE(child), ZNODE_WRITE_LOCK,
13017 + ZNODE_LOCK_LOPRI);
13018 + if (ret)
13019 + goto out;
13020 +
13021 + ret = incr_load_count_znode(&child_load, JZNODE(child));
13022 + if (ret)
13023 + goto out;
13024 +
13025 + ret = allocate_znode(JZNODE(child), &pos->coord, pos);
13026 + if (ret)
13027 + goto out;
13028 +
13029 + /* move flush position to leaf level */
13030 + pos->state = POS_ON_LEAF;
13031 + move_flush_pos(pos, &child_lock, &child_load, NULL);
13032 +
13033 + if (node_is_empty(JZNODE(child))) {
13034 + ret = delete_empty_node(JZNODE(child));
13035 + pos->state = POS_INVALID;
13036 + }
13037 + out:
13038 + done_load_count(&child_load);
13039 + done_lh(&child_lock);
13040 + jput(child);
13041 +
13042 + return ret;
13043 +}
13044 +
13045 +/* Move pos from leaf to twig: that is, move pos->lock (and the flush
13046 + * position itself) to the upper (twig) level. */
13047 +static int handle_pos_to_twig(flush_pos_t * pos)
13048 +{
13049 + int ret;
13050 +
13051 + lock_handle parent_lock;
13052 + load_count parent_load;
13053 + coord_t pcoord;
13054 +
13055 + assert("zam-852", pos->state == POS_TO_TWIG);
13056 +
13057 + init_lh(&parent_lock);
13058 + init_load_count(&parent_load);
13059 +
13060 + ret =
13061 + reiser4_get_parent(&parent_lock, pos->lock.node, ZNODE_WRITE_LOCK);
13062 + if (ret)
13063 + goto out;
13064 +
13065 + ret = incr_load_count_znode(&parent_load, parent_lock.node);
13066 + if (ret)
13067 + goto out;
13068 +
13069 + ret = find_child_ptr(parent_lock.node, pos->lock.node, &pcoord);
13070 + if (ret)
13071 + goto out;
13072 +
13073 + assert("zam-870", item_is_internal(&pcoord));
13074 + coord_next_item(&pcoord);
13075 +
13076 + if (coord_is_after_rightmost(&pcoord))
13077 + pos->state = POS_END_OF_TWIG;
13078 + else if (item_is_extent(&pcoord))
13079 + pos->state = POS_ON_EPOINT;
13080 + else {
13081 + /* Here we understand that getting -E_NO_NEIGHBOR in
13082 + * handle_pos_on_leaf() was just because we reached the edge
13083 + * of the slum */
13084 + pos_stop(pos);
13085 + goto out;
13086 + }
13087 +
13088 + move_flush_pos(pos, &parent_lock, &parent_load, &pcoord);
13089 +
13090 + out:
13091 + done_load_count(&parent_load);
13092 + done_lh(&parent_lock);
13093 +
13094 + return ret;
13095 +}
13096 +
13097 +typedef int (*pos_state_handle_t) (flush_pos_t *);
13098 +static pos_state_handle_t flush_pos_handlers[] = {
13099 + /* process formatted nodes on leaf level, keep lock on a leaf node */
13100 + [POS_ON_LEAF] = handle_pos_on_leaf,
13101 + /* process unformatted nodes, keep lock on twig node, pos->coord points to extent currently
13102 + * being processed */
13103 + [POS_ON_EPOINT] = handle_pos_on_twig,
13104 + /* move a lock from leaf node to its parent for further processing of unformatted nodes */
13105 + [POS_TO_TWIG] = handle_pos_to_twig,
13106 + /* move a lock from twig to leaf level when a processing of unformatted nodes finishes,
13107 + * pos->coord points to the leaf node we jump to */
13108 + [POS_TO_LEAF] = handle_pos_to_leaf,
13109 + /* after processing the last extent in the twig node, attempt to shift
13110 + * items from the twig's right neighbor and process them while shifting */
13111 + [POS_END_OF_TWIG] = handle_pos_end_of_twig,
13112 + /* process formatted nodes on internal level, keep lock on an internal node */
13113 + [POS_ON_INTERNAL] = handle_pos_on_internal
13114 +};
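+
+/*
+ * Informative sketch of the state transitions implemented by the
+ * handlers above (assembled from their code; "stop" == pos_stop()):
+ *
+ *   POS_ON_LEAF     -> POS_TO_TWIG (on -E_NO_NEIGHBOR) | stop
+ *   POS_TO_TWIG     -> POS_END_OF_TWIG | POS_ON_EPOINT | stop
+ *   POS_ON_EPOINT   -> POS_END_OF_TWIG | POS_TO_LEAF | POS_INVALID | stop
+ *   POS_END_OF_TWIG -> POS_ON_EPOINT | POS_TO_LEAF | stop
+ *   POS_TO_LEAF     -> POS_ON_LEAF | POS_INVALID | stop
+ *   POS_ON_INTERNAL -> stop (via handle_pos_on_formatted())
+ */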
13115 +
13116 +/* Advance flush position horizontally, prepare for flushing ((re)allocate, squeeze,
13117 + * encrypt) nodes and their ancestors in "parent-first" order */
13118 +static int squalloc(flush_pos_t * pos)
13119 +{
13120 + int ret = 0;
13121 +
13122 + /* maybe needs to be made a case statement with handle_pos_on_leaf as first case, for
13123 + * greater CPU efficiency? Measure and see.... -Hans */
13124 + while (pos_valid(pos)) {
13125 + ret = flush_pos_handlers[pos->state] (pos);
13126 + if (ret < 0)
13127 + break;
13128 +
13129 + ret = rapid_flush(pos);
13130 + if (ret)
13131 + break;
13132 + }
13133 +
13134 + /* Any positive value or -E_NO_NEIGHBOR is a legal return code for the
13135 + handle_pos* routines; -E_NO_NEIGHBOR means that the slum edge was reached */
13136 + if (ret > 0 || ret == -E_NO_NEIGHBOR)
13137 + ret = 0;
13138 +
13139 + return ret;
13140 +}
13141 +
13142 +static void update_ldkey(znode * node)
13143 +{
13144 + reiser4_key ldkey;
13145 +
13146 + assert_rw_write_locked(&(znode_get_tree(node)->dk_lock));
13147 + if (node_is_empty(node))
13148 + return;
13149 +
13150 + znode_set_ld_key(node, leftmost_key_in_node(node, &ldkey));
13151 +}
13152 +
13153 +/* This is to be called after the node's shift method has shifted data from
13154 + @right to @left. It sets the left delimiting keys of @left and @right to the
13155 + keys of their first items, and the right delimiting key of @left to the first key of @right */
13156 +static void update_znode_dkeys(znode * left, znode * right)
13157 +{
13158 + assert_rw_write_locked(&(znode_get_tree(right)->dk_lock));
13159 + assert("vs-1629", (znode_is_write_locked(left) &&
13160 + znode_is_write_locked(right)));
13161 +
13162 + /* we need to update the left delimiting key of @left if it was empty before the shift */
13163 + update_ldkey(left);
13164 + update_ldkey(right);
13165 + if (node_is_empty(right))
13166 + znode_set_rd_key(left, znode_get_rd_key(right));
13167 + else
13168 + znode_set_rd_key(left, znode_get_ld_key(right));
13169 +}
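+
+/*
+ * Example with made-up keys: if @left holds keys [10..20] and @right
+ * holds [30..40], and the shift moved items 30..34 into @left, then
+ * afterwards ld(@left) is still 10, ld(@right) becomes 35, and
+ * rd(@left) is set to ld(@right) == 35.  If @right was emptied
+ * completely, rd(@left) takes over rd(@right) instead.
+ */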
13170 +
13171 +/* Try to shift everything from @right to @left. If everything was shifted,
13172 + @right is removed from the tree. The result is the number of bytes shifted. */
13173 +static int
13174 +shift_everything_left(znode * right, znode * left, carry_level * todo)
13175 +{
13176 + coord_t from;
13177 + node_plugin *nplug;
13178 + carry_plugin_info info;
13179 +
13180 + coord_init_after_last_item(&from, right);
13181 +
13182 + nplug = node_plugin_by_node(right);
13183 + info.doing = NULL;
13184 + info.todo = todo;
13185 + return nplug->shift(&from, left, SHIFT_LEFT,
13186 + 1 /* delete @right if it becomes empty */ ,
13187 + 1
13188 + /* move coord @from to node @left if everything will be shifted */
13189 + ,
13190 + &info);
13191 +}
13192 +
13193 +/* Shift as much as possible from @right to @left using the memcpy-optimized
13194 + shift_everything_left. @left and @right are formatted neighboring nodes on
13195 + leaf level. */
13196 +static int squeeze_right_non_twig(znode * left, znode * right)
13197 +{
13198 + int ret;
13199 + carry_pool *pool;
13200 + carry_level *todo;
13201 +
13202 + assert("nikita-2246", znode_get_level(left) == znode_get_level(right));
13203 +
13204 + if (!JF_ISSET(ZJNODE(left), JNODE_DIRTY) ||
13205 + !JF_ISSET(ZJNODE(right), JNODE_DIRTY))
13206 + return SQUEEZE_TARGET_FULL;
13207 +
13208 + pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*todo));
13209 + if (IS_ERR(pool))
13210 + return PTR_ERR(pool);
13211 + todo = (carry_level *) (pool + 1);
13212 + init_carry_level(todo, pool);
13213 +
13214 + ret = shift_everything_left(right, left, todo);
13215 + if (ret > 0) {
13216 + /* something was shifted */
13217 + reiser4_tree *tree;
13218 + __u64 grabbed;
13219 +
13220 + znode_make_dirty(left);
13221 + znode_make_dirty(right);
13222 +
13223 + /* update delimiting keys of nodes which participated in
13224 + shift. FIXME: it would be better to have this in shift
13225 + node's operation. But it can not be done there. Nobody
13226 + remembers why, though */
13227 + tree = znode_get_tree(left);
13228 + write_lock_dk(tree);
13229 + update_znode_dkeys(left, right);
13230 + write_unlock_dk(tree);
13231 +
13232 + /* Carry is called to update delimiting key and, maybe, to remove empty
13233 + node. */
13234 + grabbed = get_current_context()->grabbed_blocks;
13235 + ret = reiser4_grab_space_force(tree->height, BA_RESERVED);
13236 + assert("nikita-3003", ret == 0); /* reserved space is exhausted. Ask Hans. */
13237 + ret = carry(todo, NULL /* previous level */ );
13238 + grabbed2free_mark(grabbed);
13239 + } else {
13240 + /* Shifting impossible, we return appropriate result code */
13241 + ret =
13242 + node_is_empty(right) ? SQUEEZE_SOURCE_EMPTY :
13243 + SQUEEZE_TARGET_FULL;
13244 + }
13245 +
13246 + done_carry_pool(pool);
13247 +
13248 + return ret;
13249 +}
13250 +
13251 +#if REISER4_DEBUG
13252 +static int sibling_link_is_ok(const znode *left, const znode *right)
13253 +{
13254 + int result;
13255 +
13256 + read_lock_tree(znode_get_tree(left));
13257 + result = (left->right == right && left == right->left);
13258 + read_unlock_tree(znode_get_tree(left));
13259 + return result;
13260 +}
13261 +#endif
13262 +
13263 +/* Shift first unit of first item if it is an internal one. Return
13264 + SQUEEZE_TARGET_FULL if it fails to shift an item, otherwise return
13265 + SUBTREE_MOVED. */
13266 +static int shift_one_internal_unit(znode * left, znode * right)
13267 +{
13268 + int ret;
13269 + carry_pool *pool;
13270 + carry_level *todo;
13271 + coord_t *coord;
13272 + carry_plugin_info *info;
13273 + int size, moved;
13274 +
13275 + assert("nikita-2247", znode_get_level(left) == znode_get_level(right));
13276 + assert("nikita-2435", znode_is_write_locked(left));
13277 + assert("nikita-2436", znode_is_write_locked(right));
13278 + assert("nikita-2434", sibling_link_is_ok(left, right));
13279 +
13280 + pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*todo) +
13281 + sizeof(*coord) + sizeof(*info)
13282 +#if REISER4_DEBUG
13283 + + sizeof(*coord) + 2 * sizeof(reiser4_key)
13284 +#endif
13285 + );
13286 + if (IS_ERR(pool))
13287 + return PTR_ERR(pool);
13288 + todo = (carry_level *) (pool + 1);
13289 + init_carry_level(todo, pool);
13290 +
13291 + coord = (coord_t *) (todo + 3);
13292 + coord_init_first_unit(coord, right);
13293 + info = (carry_plugin_info *) (coord + 1);
13294 +
13295 +#if REISER4_DEBUG
13296 + if (!node_is_empty(left)) {
13297 + coord_t *last;
13298 + reiser4_key *right_key;
13299 + reiser4_key *left_key;
13300 +
13301 + last = (coord_t *) (info + 1);
13302 + right_key = (reiser4_key *) (last + 1);
13303 + left_key = right_key + 1;
13304 + coord_init_last_unit(last, left);
13305 +
13306 + assert("nikita-2463",
13307 + keyle(item_key_by_coord(last, left_key),
13308 + item_key_by_coord(coord, right_key)));
13309 + }
13310 +#endif
13311 +
13312 + assert("jmacd-2007", item_is_internal(coord));
13313 +
13314 + size = item_length_by_coord(coord);
13315 + info->todo = todo;
13316 + info->doing = NULL;
13317 +
13318 + ret = node_plugin_by_node(left)->shift(coord, left, SHIFT_LEFT,
13319 + 1
13320 + /* delete @right if it becomes empty */
13321 + ,
13322 + 0
13323 + /* do not move coord @coord to node @left */
13324 + ,
13325 + info);
13326 +
13327 + /* If shift returns positive, then we shifted the item. */
13328 + assert("vs-423", ret <= 0 || size == ret);
13329 + moved = (ret > 0);
13330 +
13331 + if (moved) {
13332 + /* something was moved */
13333 + reiser4_tree *tree;
13334 + int grabbed;
13335 +
13336 + znode_make_dirty(left);
13337 + znode_make_dirty(right);
13338 + tree = znode_get_tree(left);
13339 + write_lock_dk(tree);
13340 + update_znode_dkeys(left, right);
13341 + write_unlock_dk(tree);
13342 +
13343 + /* reserve space for delimiting keys after shifting */
13344 + grabbed = get_current_context()->grabbed_blocks;
13345 + ret = reiser4_grab_space_force(tree->height, BA_RESERVED);
13346 + assert("nikita-3003", ret == 0); /* reserved space is exhausted. Ask Hans. */
13347 +
13348 + ret = carry(todo, NULL /* previous level */ );
13349 + grabbed2free_mark(grabbed);
13350 + }
13351 +
13352 + done_carry_pool(pool);
13353 +
13354 + if (ret != 0) {
13355 + /* Shift or carry operation failed. */
13356 + assert("jmacd-7325", ret < 0);
13357 + return ret;
13358 + }
13359 +
13360 + return moved ? SUBTREE_MOVED : SQUEEZE_TARGET_FULL;
13361 +}
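+
+/*
+ * Note on the pool layout above: one carry-pool allocation is carved up
+ * by pointer arithmetic -- the pool header is followed by three carry
+ * levels, then the coord, then the carry_plugin_info (plus extra
+ * debugging scratch under REISER4_DEBUG).  Hence @todo, @coord and
+ * @info are derived from (pool + 1), (todo + 3) and (coord + 1) rather
+ * than being allocated separately.
+ */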
13362 +
13363 +/* Make the final relocate/wander decision during forward parent-first squalloc for a
13364 + znode. For unformatted nodes this is done in plugin/item/extent.c:extent_needs_allocation(). */
13365 +static int
13366 +allocate_znode_loaded(znode * node,
13367 + const coord_t * parent_coord, flush_pos_t * pos)
13368 +{
13369 + int ret;
13370 + reiser4_super_info_data *sbinfo = get_current_super_private();
13371 + /* FIXME(D): We have the node write-locked and should have checked for !
13372 + allocated() somewhere before reaching this point, but there can be a race, so
13373 + this assertion is bogus. */
13374 + assert("jmacd-7987", !jnode_check_flushprepped(ZJNODE(node)));
13375 + assert("jmacd-7988", znode_is_write_locked(node));
13376 + assert("jmacd-7989", coord_is_invalid(parent_coord)
13377 + || znode_is_write_locked(parent_coord->node));
13378 +
13379 + if (ZF_ISSET(node, JNODE_REPACK) || ZF_ISSET(node, JNODE_CREATED) ||
13380 + znode_is_root(node) ||
13381 + /* We have enough nodes to relocate no matter what. */
13382 + (pos->leaf_relocate != 0 && znode_get_level(node) == LEAF_LEVEL)) {
13383 + /* No need to decide for new nodes; they are treated the same as
13384 + relocate. If the root node is dirty, relocate. */
13385 + if (pos->preceder.blk == 0) {
13386 + /* The preceder is unknown and we have decided to relocate the
13387 + node -- using the default value for the search start is better
13388 + than searching from block #0. */
13389 + get_blocknr_hint_default(&pos->preceder.blk);
13390 + check_preceder(pos->preceder.blk);
13391 + }
13392 +
13393 + goto best_reloc;
13394 +
13395 + } else if (pos->preceder.blk == 0) {
13396 + /* If we don't know the preceder, leave it where it is. */
13397 + jnode_make_wander(ZJNODE(node));
13398 + } else {
13399 + /* Make a decision based on block distance. */
13400 + reiser4_block_nr dist;
13401 + reiser4_block_nr nblk = *znode_get_block(node);
13402 +
13403 + assert("jmacd-6172", !blocknr_is_fake(&nblk));
13404 + assert("jmacd-6173", !blocknr_is_fake(&pos->preceder.blk));
13405 + assert("jmacd-6174", pos->preceder.blk != 0);
13406 +
13407 + if (pos->preceder.blk == nblk - 1) {
13408 + /* Ideal. */
13409 + jnode_make_wander(ZJNODE(node));
13410 + } else {
13411 +
13412 + dist =
13413 + (nblk <
13414 + pos->preceder.blk) ? (pos->preceder.blk -
13415 + nblk) : (nblk -
13416 + pos->preceder.blk);
13417 +
13418 + /* See if we can find a closer block (forward direction only). */
13419 + pos->preceder.max_dist =
13420 + min((reiser4_block_nr) sbinfo->flush.
13421 + relocate_distance, dist);
13422 + pos->preceder.level = znode_get_level(node);
13423 +
13424 + ret = allocate_znode_update(node, parent_coord, pos);
13425 +
13426 + pos->preceder.max_dist = 0;
13427 +
13428 + if (ret && (ret != -ENOSPC))
13429 + return ret;
13430 +
13431 + if (ret == 0) {
13432 + /* Got a better allocation. */
13433 + znode_make_reloc(node, pos->fq);
13434 + } else if (dist < sbinfo->flush.relocate_distance) {
13435 + /* The present allocation is good enough. */
13436 + jnode_make_wander(ZJNODE(node));
13437 + } else {
13438 + /* Otherwise, try to relocate to the best position. */
13439 + best_reloc:
13440 + ret =
13441 + allocate_znode_update(node, parent_coord,
13442 + pos);
13443 + if (ret != 0)
13444 + return ret;
13445 +
13446 + /* set JNODE_RELOC bit _after_ node gets allocated */
13447 + znode_make_reloc(node, pos->fq);
13448 + }
13449 + }
13450 + }
13451 +
13452 + /* This is the new preceder. */
13453 + pos->preceder.blk = *znode_get_block(node);
13454 + check_preceder(pos->preceder.blk);
13455 + pos->alloc_cnt += 1;
13456 +
13457 + assert("jmacd-4277", !blocknr_is_fake(&pos->preceder.blk));
13458 +
13459 + return 0;
13460 +}
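+
+/*
+ * Worked example for the distance test above (made-up numbers):
+ * preceder.blk == 1000, nblk == 1300, relocate_distance == 64, so
+ * dist == 300.  allocate_znode_update() first searches within
+ * min(64, 300) == 64 blocks of the preceder; if that fails with
+ * -ENOSPC, dist >= relocate_distance, so we fall through to best_reloc
+ * and relocate to the best position available.  Had nblk been 1040
+ * (dist == 40 < 64), a failed close search would instead leave the
+ * node to wander in place.
+ */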
13461 +
13462 +static int
13463 +allocate_znode(znode * node, const coord_t * parent_coord, flush_pos_t * pos)
13464 +{
13465 + /*
13466 + * perform znode allocation with znode pinned in memory to avoid races
13467 + * with asynchronous emergency flush (which plays with
13468 + * JNODE_FLUSH_RESERVED bit).
13469 + */
13470 + return WITH_DATA(node, allocate_znode_loaded(node, parent_coord, pos));
13471 +}
13472 +
13473 +/* A subroutine of allocate_znode, this is called first to see if there is a
13474 + close position to relocate to. It may return -ENOSPC if there is no close
13475 + position, in which case the node is not relocated. This takes care of
13476 + updating the parent node with the relocated block address. */
13477 +static int
13478 +allocate_znode_update(znode * node, const coord_t * parent_coord,
13479 + flush_pos_t * pos)
13480 +{
13481 + int ret;
13482 + reiser4_block_nr blk;
13483 + lock_handle uber_lock;
13484 + int flush_reserved_used = 0;
13485 + int grabbed;
13486 + reiser4_context *ctx;
13487 + reiser4_super_info_data *sbinfo;
13488 +
13489 + init_lh(&uber_lock);
13490 +
13491 + ctx = get_current_context();
13492 + sbinfo = get_super_private(ctx->super);
13493 +
13494 + grabbed = ctx->grabbed_blocks;
13495 +
13496 + /* discard e-flush allocation */
13497 + ret = zload(node);
13498 + if (ret)
13499 + return ret;
13500 +
13501 + if (ZF_ISSET(node, JNODE_CREATED)) {
13502 + assert("zam-816", blocknr_is_fake(znode_get_block(node)));
13503 + pos->preceder.block_stage = BLOCK_UNALLOCATED;
13504 + } else {
13505 + pos->preceder.block_stage = BLOCK_GRABBED;
13506 +
13507 + /* The disk space for relocating @node is already reserved in the
13508 + * "flush reserved" counter if @node is a leaf; otherwise we grab space
13509 + * using BA_RESERVED (i.e. from the whole disk, not from only 95% of it). */
13510 + if (znode_get_level(node) == LEAF_LEVEL) {
13511 + /*
13512 + * earlier (during do_jnode_make_dirty()) we decided
13513 + * that @node can possibly go into overwrite set and
13514 + * reserved block for its wandering location.
13515 + */
13516 + txn_atom *atom = get_current_atom_locked();
13517 + assert("nikita-3449",
13518 + ZF_ISSET(node, JNODE_FLUSH_RESERVED));
13519 + flush_reserved2grabbed(atom, (__u64) 1);
13520 + spin_unlock_atom(atom);
13521 + /*
13522 + * we are trying to move node into relocate
13523 + * set. Allocation of relocated position "uses"
13524 + * reserved block.
13525 + */
13526 + ZF_CLR(node, JNODE_FLUSH_RESERVED);
13527 + flush_reserved_used = 1;
13528 + } else {
13529 + ret = reiser4_grab_space_force((__u64) 1, BA_RESERVED);
13530 + if (ret != 0)
13531 + goto exit;
13532 + }
13533 + }
13534 +
13535 + /* We may end up not using the 5% of reserved disk space here, in which case flush will not pack tightly. */
13536 + ret = reiser4_alloc_block(&pos->preceder, &blk,
13537 + BA_FORMATTED | BA_PERMANENT);
13538 + if (ret)
13539 + goto exit;
13540 +
13541 + if (!ZF_ISSET(node, JNODE_CREATED) &&
13542 + (ret =
13543 + reiser4_dealloc_block(znode_get_block(node), 0,
13544 + BA_DEFER | BA_FORMATTED)))
13545 + goto exit;
13546 +
13547 + if (likely(!znode_is_root(node))) {
13548 + item_plugin *iplug;
13549 +
13550 + iplug = item_plugin_by_coord(parent_coord);
13551 + assert("nikita-2954", iplug->f.update != NULL);
13552 + iplug->f.update(parent_coord, &blk);
13553 +
13554 + znode_make_dirty(parent_coord->node);
13555 +
13556 + } else {
13557 + reiser4_tree *tree = znode_get_tree(node);
13558 + znode *uber;
13559 +
13560 + /* We take a longterm lock on the fake node in order to change
13561 + the root block number. This may cause atom fusion. */
13562 + ret = get_uber_znode(tree, ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI,
13563 + &uber_lock);
13564 + /* The fake node cannot be deleted, and we must have priority
13565 + here, and the error must not be confused with -ENOSPC. */
13566 + assert("jmacd-74412",
13567 + ret != -EINVAL && ret != -E_DEADLOCK && ret != -ENOSPC);
13568 +
13569 + if (ret)
13570 + goto exit;
13571 +
13572 + uber = uber_lock.node;
13573 +
13574 + write_lock_tree(tree);
13575 + tree->root_block = blk;
13576 + write_unlock_tree(tree);
13577 +
13578 + znode_make_dirty(uber);
13579 + }
13580 +
13581 + ret = znode_rehash(node, &blk);
13582 + exit:
13583 + if (ret) {
13584 + /* Give the flush reserved block back if something fails, because
13585 + * callers assume that on error the block wasn't relocated and its
13586 + * flush reserved block wasn't used. */
13587 + if (flush_reserved_used) {
13588 + /*
13589 + * ok, we failed to move node into relocate
13590 + * set. Restore status quo.
13591 + */
13592 + grabbed2flush_reserved((__u64) 1);
13593 + ZF_SET(node, JNODE_FLUSH_RESERVED);
13594 + }
13595 + }
13596 + zrelse(node);
13597 + done_lh(&uber_lock);
13598 + grabbed2free_mark(grabbed);
13599 + return ret;
13600 +}
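+
+/*
+ * Bookkeeping invariant of the function above: for a leaf node one
+ * block moves flush_reserved -> grabbed before the allocation, and on
+ * failure grabbed -> flush_reserved again (with JNODE_FLUSH_RESERVED
+ * restored), so callers see an unchanged reservation on error.
+ */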
13601 +
13602 +/* JNODE INTERFACE */
13603 +
13604 +/* Lock a node (if formatted) and then get its parent locked, set the child's
13605 + coordinate in the parent. If the child is the root node, the above_root
13606 + znode is returned but the coord is not set. This function may cause atom
13607 + fusion, but it is only used for read locks (at this point) and therefore
13608 + fusion only occurs when the parent is already dirty. */
13609 +/* Hans adds this note: remember to ask how expensive this operation is vs. storing parent
13610 + pointer in jnodes. */
13611 +static int
13612 +jnode_lock_parent_coord(jnode * node,
13613 + coord_t * coord,
13614 + lock_handle * parent_lh,
13615 + load_count * parent_zh,
13616 + znode_lock_mode parent_mode, int try)
13617 +{
13618 + int ret;
13619 +
13620 + assert("edward-53", jnode_is_unformatted(node) || jnode_is_znode(node));
13621 + assert("edward-54", jnode_is_unformatted(node)
13622 + || znode_is_any_locked(JZNODE(node)));
13623 +
13624 + if (!jnode_is_znode(node)) {
13625 + reiser4_key key;
13626 + tree_level stop_level = TWIG_LEVEL;
13627 + lookup_bias bias = FIND_EXACT;
13628 +
13629 + assert("edward-168", !(jnode_get_type(node) == JNODE_BITMAP));
13630 +
13631 + /* The case when the node is not a znode but can have a parent
13632 + coord (an unformatted node, a node which represents a cluster
13633 + page, etc.). Generate a key for the appropriate entry, search
13634 + in the tree using coord_by_key, which handles locking for
13635 + us. */
13636 +
13637 + /*
13638 + * nothing is locked at this moment, so, nothing prevents
13639 + * concurrent truncate from removing jnode from inode. To
13640 + * prevent this spin-lock jnode. jnode can be truncated just
13641 + * after call to the jnode_build_key(), but this is ok,
13642 + * because coord_by_key() will just fail to find appropriate
13643 + * extent.
13644 + */
13645 + spin_lock_jnode(node);
13646 + if (!JF_ISSET(node, JNODE_HEARD_BANSHEE)) {
13647 + jnode_build_key(node, &key);
13648 + ret = 0;
13649 + } else
13650 + ret = RETERR(-ENOENT);
13651 + spin_unlock_jnode(node);
13652 +
13653 + if (ret != 0)
13654 + return ret;
13655 +
13656 + if (jnode_is_cluster_page(node))
13657 + stop_level = LEAF_LEVEL;
13658 +
13659 + assert("jmacd-1812", coord != NULL);
13660 +
13661 + ret = coord_by_key(jnode_get_tree(node), &key, coord, parent_lh,
13662 + parent_mode, bias, stop_level, stop_level,
13663 + CBK_UNIQUE, NULL /*ra_info */ );
13664 + switch (ret) {
13665 + case CBK_COORD_NOTFOUND:
13666 + assert("edward-1038",
13667 + ergo(jnode_is_cluster_page(node),
13668 + JF_ISSET(node, JNODE_HEARD_BANSHEE)));
13669 + if (!JF_ISSET(node, JNODE_HEARD_BANSHEE))
13670 + warning("nikita-3177", "Parent not found");
13671 + return ret;
13672 + case CBK_COORD_FOUND:
13673 + if (coord->between != AT_UNIT) {
13674 + /* FIXME: comment needed */
13675 + done_lh(parent_lh);
13676 + if (!JF_ISSET(node, JNODE_HEARD_BANSHEE)) {
13677 + warning("nikita-3178",
13678 + "Found but not happy: %i",
13679 + coord->between);
13680 + }
13681 + return RETERR(-ENOENT);
13682 + }
13683 + ret = incr_load_count_znode(parent_zh, parent_lh->node);
13684 + if (ret != 0)
13685 + return ret;
13686 + /* if (jnode_is_cluster_page(node)) {
13687 + races with write() are possible
13688 + check_child_cluster (parent_lh->node);
13689 + }
13690 + */
13691 + break;
13692 + default:
13693 + return ret;
13694 + }
13695 +
13696 + } else {
13697 + int flags;
13698 + znode *z;
13699 +
13700 + z = JZNODE(node);
13701 + /* Formatted node case: */
13702 + assert("jmacd-2061", !znode_is_root(z));
13703 +
13704 + flags = GN_ALLOW_NOT_CONNECTED;
13705 + if (try)
13706 + flags |= GN_TRY_LOCK;
13707 +
13708 + ret =
13709 + reiser4_get_parent_flags(parent_lh, z, parent_mode, flags);
13710 + if (ret != 0)
13711 + /* -E_REPEAT is ok here, it is handled by the caller. */
13712 + return ret;
13713 +
13714 + /* Make the child's position "hint" up-to-date. (Unless above
13715 + root, which caller must check.) */
13716 + if (coord != NULL) {
13717 +
13718 + ret = incr_load_count_znode(parent_zh, parent_lh->node);
13719 + if (ret != 0) {
13720 + warning("jmacd-976812386",
13721 + "incr_load_count_znode failed: %d",
13722 + ret);
13723 + return ret;
13724 + }
13725 +
13726 + ret = find_child_ptr(parent_lh->node, z, coord);
13727 + if (ret != 0) {
13728 + warning("jmacd-976812",
13729 + "find_child_ptr failed: %d", ret);
13730 + return ret;
13731 + }
13732 + }
13733 + }
13734 +
13735 + return 0;
13736 +}
13737 +
13738 +/* Get the (locked) next neighbor of a znode which is dirty and a member of the same atom.
13739 + If there is no next neighbor or the neighbor is not in memory or if there is a
13740 + neighbor but it is not dirty or not in the same atom, -E_NO_NEIGHBOR is returned.
13741 + In some cases the slum may include nodes which are not dirty; if so, @check_dirty should be 0 */
13742 +static int neighbor_in_slum(znode * node, /* starting point */
13743 + lock_handle * lock, /* lock on starting point */
13744 + sideof side, /* left or right direction we seek the next node in */
13745 + znode_lock_mode mode, /* kind of lock we want */
13746 + int check_dirty)
13747 +{ /* true if the neighbor should be dirty */
13748 + int ret;
13749 +
13750 + assert("jmacd-6334", znode_is_connected(node));
13751 +
13752 + ret =
13753 + reiser4_get_neighbor(lock, node, mode,
13754 + GN_SAME_ATOM | (side ==
13755 + LEFT_SIDE ? GN_GO_LEFT : 0));
13756 +
13757 + if (ret) {
13758 + /* May return -ENOENT or -E_NO_NEIGHBOR. */
13759 + /* FIXME(C): check EINVAL, E_DEADLOCK */
13760 + if (ret == -ENOENT) {
13761 + ret = RETERR(-E_NO_NEIGHBOR);
13762 + }
13763 +
13764 + return ret;
13765 + }
13766 + if (!check_dirty)
13767 + return 0;
13768 + /* Check dirty bit of locked znode, no races here */
13769 + if (JF_ISSET(ZJNODE(lock->node), JNODE_DIRTY))
13770 + return 0;
13771 +
13772 + done_lh(lock);
13773 + return RETERR(-E_NO_NEIGHBOR);
13774 +}
13775 +
13776 +/* Return true if two znodes have the same parent. This is called with both nodes
13777 + write-locked (for squeezing) so no tree lock is needed. */
13778 +static int znode_same_parents(znode * a, znode * b)
13779 +{
13780 + int result;
13781 +
13782 + assert("jmacd-7011", znode_is_write_locked(a));
13783 + assert("jmacd-7012", znode_is_write_locked(b));
13784 +
13785 + /* We lock the whole tree for this check.... I really don't like whole tree
13786 + * locks... -Hans */
13787 + read_lock_tree(znode_get_tree(a));
13788 + result = (znode_parent(a) == znode_parent(b));
13789 + read_unlock_tree(znode_get_tree(a));
13790 + return result;
13791 +}
13792 +
13793 +/* FLUSH SCAN */
13794 +
13795 +/* Initialize the flush_scan data structure. */
13796 +static void scan_init(flush_scan * scan)
13797 +{
13798 + memset(scan, 0, sizeof(*scan));
13799 + init_lh(&scan->node_lock);
13800 + init_lh(&scan->parent_lock);
13801 + init_load_count(&scan->parent_load);
13802 + init_load_count(&scan->node_load);
13803 + coord_init_invalid(&scan->parent_coord, NULL);
13804 +}
13805 +
13806 +/* Release any resources held by the flush scan, e.g., release locks, free memory, etc. */
13807 +static void scan_done(flush_scan * scan)
13808 +{
13809 + done_load_count(&scan->node_load);
13810 + if (scan->node != NULL) {
13811 + jput(scan->node);
13812 + scan->node = NULL;
13813 + }
13814 + done_load_count(&scan->parent_load);
13815 + done_lh(&scan->parent_lock);
13816 + done_lh(&scan->node_lock);
13817 +}
13818 +
13819 +/* Returns true if flush scanning is finished. */
13820 +int scan_finished(flush_scan * scan)
13821 +{
13822 + return scan->stop || (scan->direction == RIGHT_SIDE &&
13823 + scan->count >= scan->max_count);
13824 +}
13825 +
13826 +/* Return true if the scan should continue to @tonode, i.e. if the node meets
13827 + the same_slum_check condition. If not, deref @tonode and stop the scan. */
13828 +int scan_goto(flush_scan * scan, jnode * tonode)
13829 +{
13830 + int go = same_slum_check(scan->node, tonode, 1, 0);
13831 +
13832 + if (!go) {
13833 + scan->stop = 1;
13834 + jput(tonode);
13835 + }
13836 +
13837 + return go;
13838 +}
13839 +
13840 +/* Set the current scan->node, refcount it, increment count by the @add_count (number to
13841 + count, e.g., skipped unallocated nodes), deref previous current, and copy the current
13842 + parent coordinate. */
13843 +int
13844 +scan_set_current(flush_scan * scan, jnode * node, unsigned add_count,
13845 + const coord_t * parent)
13846 +{
13847 + /* Release the old references, take the new reference. */
13848 + done_load_count(&scan->node_load);
13849 +
13850 + if (scan->node != NULL) {
13851 + jput(scan->node);
13852 + }
13853 + scan->node = node;
13854 + scan->count += add_count;
13855 +
13856 + /* This next stmt is somewhat inefficient. The scan_extent_coord code could
13857 + delay this update step until it finishes and update the parent_coord only once.
13858 + It did that before, but there was a bug and this was the easiest way to make it
13859 + correct. */
13860 + if (parent != NULL) {
13861 + coord_dup(&scan->parent_coord, parent);
13862 + }
13863 +
13864 + /* Failure may happen at the incr_load_count call, but the caller can assume the reference
13865 + is safely taken. */
13866 + return incr_load_count_jnode(&scan->node_load, node);
13867 +}
13868 +
13869 +/* Return true if scanning in the leftward direction. */
13870 +int scanning_left(flush_scan * scan)
13871 +{
13872 + return scan->direction == LEFT_SIDE;
13873 +}
13874 +
13875 +/* Performs leftward scanning starting from either kind of node. Counts the starting
13876 + node. The right-scan object is passed in for the left-scan in order to copy the parent
13877 + of an unformatted starting position. This way we avoid searching for the unformatted
13878 + node's parent when scanning in each direction. If we search for the parent once it is
13879 + set in both scan objects. The limit parameter tells flush-scan when to stop.
13880 +
13881 + Rapid scanning is used only during scan_left, where we are interested in finding the
13882 + 'leftpoint' where we begin flushing. We are interested in stopping at the left child
13883 + of a twig that does not have a dirty left neighbor. THIS IS A SPECIAL CASE. The
13884 + problem is finding a way to flush only those nodes without unallocated children, and it
13885 + is difficult to solve in the bottom-up flushing algorithm we are currently using. The
13886 + problem can be solved by scanning left at every level as we go upward, but this would
13887 + basically bring us back to using a top-down allocation strategy, which we already tried
13888 + (see BK history from May 2002), and has a different set of problems. The top-down
13889 + strategy makes avoiding unallocated children easier, but makes it difficult to
13890 + properly flush dirty children with clean parents that would otherwise stop the
13891 + top-down flush, only later to dirty the parent once the children are flushed. So we
13892 + solve the problem in the bottom-up algorithm with a special case for twigs and leaves
13893 + only.
13894 +
13895 + The first step in solving the problem is this rapid leftward scan. After we determine
13896 + that there are at least enough nodes counted to qualify for FLUSH_RELOCATE_THRESHOLD we
13897 + are no longer interested in the exact count; we are only interested in finding the
13898 + best place to start the flush. We could choose one of two possibilities:
13899 +
13900 + 1. Stop at the leftmost child (of a twig) that does not have a dirty left neighbor.
13901 + This requires checking one leaf per rapid-scan twig
13902 +
13903 + 2. Stop at the leftmost child (of a twig) where there are no dirty children of the twig
13904 + to the left. This requires checking possibly all of the in-memory children of each
13905 + twig during the rapid scan.
13906 +
13907 + For now we implement the first policy.
13908 +*/
13909 +static int
13910 +scan_left(flush_scan * scan, flush_scan * right, jnode * node, unsigned limit)
13911 +{
13912 + int ret = 0;
13913 +
13914 + scan->max_count = limit;
13915 + scan->direction = LEFT_SIDE;
13916 +
13917 + ret = scan_set_current(scan, jref(node), 1, NULL);
13918 + if (ret != 0) {
13919 + return ret;
13920 + }
13921 +
13922 + ret = scan_common(scan, right);
13923 + if (ret != 0) {
13924 + return ret;
13925 + }
13926 +
13927 + /* Before rapid scanning, we need a lock on scan->node so that we can get its
13928 + parent, only if formatted. */
13929 + if (jnode_is_znode(scan->node)) {
13930 + ret = longterm_lock_znode(&scan->node_lock, JZNODE(scan->node),
13931 + ZNODE_WRITE_LOCK, ZNODE_LOCK_LOPRI);
13932 + }
13933 +
13934 + /* Rapid_scan would go here (with limit set to FLUSH_RELOCATE_THRESHOLD). */
13935 + return ret;
13936 +}
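+
+/*
+ * Illustration of policy 1 from the comment above (leaf level shown;
+ * D = dirty, C = clean, '|' marks a twig boundary):
+ *
+ *   ... C D D D | D D D  <- scan started here
+ *         ^
+ * Rapid scanning would stop at the leftmost dirty child of a twig
+ * whose left neighbor is clean; that leaf becomes the flush starting
+ * point.  (Rapid scan itself is not implemented here yet.)
+ */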
13937 +
13938 +/* Performs rightward scanning... Does not count the starting node. The limit parameter
13939 + is described in scan_left. If the starting node is unformatted then the
13940 + parent_coord was already set during scan_left. The rapid_after parameter is not used
13941 + during right-scanning.
13942 +
13943 + scan_right is only called if the scan_left operation does not count at least
13944 + FLUSH_RELOCATE_THRESHOLD nodes for flushing. Otherwise, the limit parameter is set to
13945 + the difference between scan-left's count and FLUSH_RELOCATE_THRESHOLD, meaning
13946 + scan-right counts as high as FLUSH_RELOCATE_THRESHOLD and then stops. */
13947 +static int scan_right(flush_scan * scan, jnode * node, unsigned limit)
13948 +{
13949 + int ret;
13950 +
13951 + scan->max_count = limit;
13952 + scan->direction = RIGHT_SIDE;
13953 +
13954 + ret = scan_set_current(scan, jref(node), 0, NULL);
13955 + if (ret != 0) {
13956 + return ret;
13957 + }
13958 +
13959 + return scan_common(scan, NULL);
13960 +}
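+
+/*
+ * Illustration with a hypothetical threshold of 64: if scan_left
+ * counted 25 nodes, scan_right is invoked with limit == 64 - 25 == 39;
+ * once the combined count reaches the threshold, the flush point
+ * qualifies for relocation and right-scanning stops.
+ */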
13961 +
13962 +/* Common code to perform left or right scanning. */
13963 +static int scan_common(flush_scan * scan, flush_scan * other)
13964 +{
13965 + int ret;
13966 +
13967 + assert("nikita-2376", scan->node != NULL);
13968 + assert("edward-54", jnode_is_unformatted(scan->node)
13969 + || jnode_is_znode(scan->node));
13970 +
13971 + /* Special case for starting at an unformatted node. Optimization: we only want
13972 + to search for the parent (which requires a tree traversal) once. Obviously, we
13973 + shouldn't have to call it once for the left scan and once for the right scan.
13974 + For this reason, if we search for the parent during scan-left we then duplicate
13975 + the coord/lock/load into the scan-right object. */
13976 + if (jnode_is_unformatted(scan->node)) {
13977 + ret = scan_unformatted(scan, other);
13978 + if (ret != 0)
13979 + return ret;
13980 + }
13981 + /* This loop expects to start at a formatted position and performs chaining of
13982 + formatted regions */
13983 + while (!scan_finished(scan)) {
13984 +
13985 + ret = scan_formatted(scan);
13986 + if (ret != 0) {
13987 + return ret;
13988 + }
13989 + }
13990 +
13991 + return 0;
13992 +}
13993 +
13994 +static int scan_unformatted(flush_scan * scan, flush_scan * other)
13995 +{
13996 + int ret = 0;
13997 + int try = 0;
13998 +
13999 + if (!coord_is_invalid(&scan->parent_coord))
14000 + goto scan;
14001 +
14002 + /* set the parent coord from the current scan position */
14003 + if (!jnode_is_unformatted(scan->node)) {
14004 + /* formatted position */
14005 +
14006 + lock_handle lock;
14007 + assert("edward-301", jnode_is_znode(scan->node));
14008 + init_lh(&lock);
14009 +
14010 + /*
14011 + * When flush starts from an unformatted node, the first thing
14012 + * it does is a tree traversal to find the formatted parent of
14013 + * the starting node. This parent is then kept locked across
14014 + * the scans to the left and to the right. This means that
14015 + * during a scan to the left we cannot take a left-ward lock,
14016 + * because this is dead-lock prone. So, if we are scanning to
14017 + * the left and there is already a lock held by this thread,
14018 + * jnode_lock_parent_coord() should use a try-lock.
14019 + */
14020 + try = scanning_left(scan)
14021 + && !lock_stack_isclean(get_current_lock_stack());
14022 + /* Need the node locked to get the parent lock. We have to
14023 + take a write lock since there is at least one call path
14024 + where this znode is already write-locked by us. */
14025 + ret =
14026 + longterm_lock_znode(&lock, JZNODE(scan->node),
14027 + ZNODE_WRITE_LOCK,
14028 + scanning_left(scan) ? ZNODE_LOCK_LOPRI :
14029 + ZNODE_LOCK_HIPRI);
14030 + if (ret != 0)
14031 + /* EINVAL or E_DEADLOCK here mean... try again! At this point we've
14032 + scanned too far and can't back out, just start over. */
14033 + return ret;
14034 +
14035 + ret = jnode_lock_parent_coord(scan->node,
14036 + &scan->parent_coord,
14037 + &scan->parent_lock,
14038 + &scan->parent_load,
14039 + ZNODE_WRITE_LOCK, try);
14040 +
14041 + /* FIXME(C): check EINVAL, E_DEADLOCK */
14042 + done_lh(&lock);
14043 + if (ret == -E_REPEAT) {
14044 + scan->stop = 1;
14045 + return 0;
14046 + }
14047 + if (ret)
14048 + return ret;
14049 +
14050 + } else {
14051 + /* unformatted position */
14052 +
14053 + ret =
14054 + jnode_lock_parent_coord(scan->node, &scan->parent_coord,
14055 + &scan->parent_lock,
14056 + &scan->parent_load,
14057 + ZNODE_WRITE_LOCK, try);
14058 +
14059 + if (IS_CBKERR(ret))
14060 + return ret;
14061 +
14062 + if (ret == CBK_COORD_NOTFOUND)
14063 + /* FIXME(C): check EINVAL, E_DEADLOCK */
14064 + return ret;
14065 +
14066 + /* parent was found */
14067 + assert("jmacd-8661", other != NULL);
14068 + /* Duplicate the reference into the other flush_scan. */
14069 + coord_dup(&other->parent_coord, &scan->parent_coord);
14070 + copy_lh(&other->parent_lock, &scan->parent_lock);
14071 + copy_load_count(&other->parent_load, &scan->parent_load);
14072 + }
14073 + scan:
14074 + return scan_by_coord(scan);
14075 +}
14076 +
14077 +/* Performs left- or rightward scanning starting from a formatted node. Follow left
14078 + pointers under tree lock as long as:
14079 +
14080 + - node->left/right is non-NULL
14081 + - node->left/right is connected, dirty
14082 + - node->left/right belongs to the same atom
14083 + - scan has not reached maximum count
14084 +*/
14085 +static int scan_formatted(flush_scan * scan)
14086 +{
14087 + int ret;
14088 + znode *neighbor = NULL;
14089 +
14090 + assert("jmacd-1401", !scan_finished(scan));
14091 +
14092 + do {
14093 + znode *node = JZNODE(scan->node);
14094 +
14095 + /* Node should be connected, but if not stop the scan. */
14096 + if (!znode_is_connected(node)) {
14097 + scan->stop = 1;
14098 + break;
14099 + }
14100 +
14101 + /* Lock the tree, check-for and reference the next sibling. */
14102 + read_lock_tree(znode_get_tree(node));
14103 +
14104 + /* It may be that a node is inserted or removed between a node and its
14105 + left sibling while the tree lock is released, but the flush-scan count
14106 + does not need to be precise. Thus, we release the tree lock as soon as
14107 + we get the neighboring node. */
14108 + neighbor = scanning_left(scan) ? node->left : node->right;
14109 + if (neighbor != NULL) {
14110 + zref(neighbor);
14111 + }
14112 +
14113 + read_unlock_tree(znode_get_tree(node));
14114 +
14115 + /* If neighbor is NULL at the leaf level, we need to check for an
14116 + unformatted sibling using the parent; break in any case. */
14117 + if (neighbor == NULL) {
14118 + break;
14119 + }
14120 +
14121 + /* Check the condition for going left, break if it is not met. This also
14122 + releases (jputs) the neighbor if false. */
14123 + if (!scan_goto(scan, ZJNODE(neighbor))) {
14124 + break;
14125 + }
14126 +
14127 + /* Advance the flush_scan state to the left, repeat. */
14128 + ret = scan_set_current(scan, ZJNODE(neighbor), 1, NULL);
14129 + if (ret != 0) {
14130 + return ret;
14131 + }
14132 +
14133 + } while (!scan_finished(scan));
14134 +
14135 + /* If neighbor is NULL then we reached the end of a formatted region, or
14136 + else the sibling is out of memory; now check for an extent to the left
14137 + (as long as we are at LEAF_LEVEL). */
14138 + if (neighbor != NULL || jnode_get_level(scan->node) != LEAF_LEVEL
14139 + || scan_finished(scan)) {
14140 + scan->stop = 1;
14141 + return 0;
14142 + }
14143 + /* Otherwise, call scan_by_coord for the right(left)most item of the
14144 + left(right) neighbor on the parent level, then possibly continue. */
14145 +
14146 + coord_init_invalid(&scan->parent_coord, NULL);
14147 + return scan_unformatted(scan, NULL);
14148 +}
14149 +
14150 +/* NOTE-EDWARD:
14151 + This scans adjacent items of the same type and calls the scan flush plugin
14152 + for each one. It performs left(right)ward scanning starting from a (possibly)
14153 + unformatted node. If we start from an unformatted node, then we continue only
14154 + if the next neighbor is also unformatted. When called from scan_formatted, we
14155 + skip the first iteration (to make sure that the right(left)most item of the
14156 + left(right) neighbor on the parent level is of the same type, and to set the appropriate coord). */
14157 +static int scan_by_coord(flush_scan * scan)
14158 +{
14159 + int ret = 0;
14160 + int scan_this_coord;
14161 + lock_handle next_lock;
14162 + load_count next_load;
14163 + coord_t next_coord;
14164 + jnode *child;
14165 + item_plugin *iplug;
14166 +
14167 + init_lh(&next_lock);
14168 + init_load_count(&next_load);
14169 + scan_this_coord = (jnode_is_unformatted(scan->node) ? 1 : 0);
14170 +
14171 + /* set initial item id */
14172 + iplug = item_plugin_by_coord(&scan->parent_coord);
14173 +
14174 + for (; !scan_finished(scan); scan_this_coord = 1) {
14175 + if (scan_this_coord) {
14176 + /* Here we expect the unit to be scannable. It might not be, due
14177 + * to a race with extent->tail conversion. */
14178 + if (iplug->f.scan == NULL) {
14179 + scan->stop = 1;
14180 + ret = -E_REPEAT;
14181 + /* skip the check at the end. */
14182 + goto race;
14183 + }
14184 +
14185 + ret = iplug->f.scan(scan);
14186 + if (ret != 0)
14187 + goto exit;
14188 +
14189 + if (scan_finished(scan)) {
14190 + checkchild(scan);
14191 + break;
14192 + }
14193 + } else {
14194 + /* the same race against truncate as above is possible
14195 + * here, it seems */
14196 +
14197 + /* NOTE-JMACD: In this case, apply the same end-of-node logic but don't scan
14198 + the first coordinate. */
14199 + assert("jmacd-1231",
14200 + item_is_internal(&scan->parent_coord));
14201 + }
14202 +
14203 + if (iplug->f.utmost_child == NULL
14204 + || znode_get_level(scan->parent_coord.node) != TWIG_LEVEL) {
14205 + /* stop this coord and continue on parent level */
14206 + ret =
14207 + scan_set_current(scan,
14208 + ZJNODE(zref
14209 + (scan->parent_coord.node)),
14210 + 1, NULL);
14211 + if (ret != 0)
14212 + goto exit;
14213 + break;
14214 + }
14215 +
14216 + /* Either way, the invariant is that scan->parent_coord is set to the
14217 + parent of scan->node. Now get the next unit. */
14218 + coord_dup(&next_coord, &scan->parent_coord);
14219 + coord_sideof_unit(&next_coord, scan->direction);
14220 +
14221 + /* If off-the-end of the twig, try the next twig. */
14222 + if (coord_is_after_sideof_unit(&next_coord, scan->direction)) {
14223 + /* We take the write lock because we may start flushing from this
14224 + * coordinate. */
14225 + ret =
14226 + neighbor_in_slum(next_coord.node, &next_lock,
14227 + scan->direction, ZNODE_WRITE_LOCK,
14228 + 1 /* check dirty */ );
14229 + if (ret == -E_NO_NEIGHBOR) {
14230 + scan->stop = 1;
14231 + ret = 0;
14232 + break;
14233 + }
14234 +
14235 + if (ret != 0) {
14236 + goto exit;
14237 + }
14238 +
14239 + ret = incr_load_count_znode(&next_load, next_lock.node);
14240 + if (ret != 0) {
14241 + goto exit;
14242 + }
14243 +
14244 + coord_init_sideof_unit(&next_coord, next_lock.node,
14245 + sideof_reverse(scan->direction));
14246 + }
14247 +
14248 + iplug = item_plugin_by_coord(&next_coord);
14249 +
14250 + /* Get the next child. */
14251 + ret =
14252 + iplug->f.utmost_child(&next_coord,
14253 + sideof_reverse(scan->direction),
14254 + &child);
14255 + if (ret != 0)
14256 + goto exit;
14257 + /* If the next child is not in memory, or item_utmost_child
14258 + failed (most probably due to a race with unlink), stop
14259 + here. */
14260 + if (child == NULL || IS_ERR(child)) {
14261 + scan->stop = 1;
14262 + checkchild(scan);
14263 + break;
14264 + }
14265 +
14266 + assert("nikita-2374", jnode_is_unformatted(child)
14267 + || jnode_is_znode(child));
14268 +
14269 + /* See if it is dirty, part of the same atom. */
14270 + if (!scan_goto(scan, child)) {
14271 + checkchild(scan);
14272 + break;
14273 + }
14274 +
14275 + /* If so, make this child current. */
14276 + ret = scan_set_current(scan, child, 1, &next_coord);
14277 + if (ret != 0)
14278 + goto exit;
14279 +
14280 + /* Now continue. If the child is formatted, break out of the loop; the
14281 + parent lock is released at exit and scanning proceeds from there. */
14282 + if (jnode_is_znode(child))
14283 + break;
14284 +
14285 + /* Otherwise, repeat the above loop with next_coord. */
14286 + if (next_load.node != NULL) {
14287 + done_lh(&scan->parent_lock);
14288 + move_lh(&scan->parent_lock, &next_lock);
14289 + move_load_count(&scan->parent_load, &next_load);
14290 + }
14291 + }
14292 +
14293 + assert("jmacd-6233", scan_finished(scan) || jnode_is_znode(scan->node));
14294 + exit:
14295 + checkchild(scan);
14296 + race: /* skip the above check */
14297 + if (jnode_is_znode(scan->node)) {
14298 + done_lh(&scan->parent_lock);
14299 + done_load_count(&scan->parent_load);
14300 + }
14301 +
14302 + done_load_count(&next_load);
14303 + done_lh(&next_lock);
14304 + return ret;
14305 +}
14306 +
14307 +/* FLUSH POS HELPERS */
14308 +
14309 +/* Initialize the fields of a flush_position. */
14310 +static void pos_init(flush_pos_t * pos)
14311 +{
14312 + memset(pos, 0, sizeof *pos);
14313 +
14314 + pos->state = POS_INVALID;
14315 + coord_init_invalid(&pos->coord, NULL);
14316 + init_lh(&pos->lock);
14317 + init_load_count(&pos->load);
14318 +
14319 + blocknr_hint_init(&pos->preceder);
14320 +}
14321 +
14322 +/* The flush loop inside squalloc periodically checks pos_valid to
14323 + determine when "enough flushing" has been performed. This will return true until one
14324 + of the following conditions is met:
14325 +
14326 + 1. the number of flush-queued nodes has reached the kernel-supplied "int *nr_to_flush"
14327 + parameter, meaning we have flushed as many blocks as the kernel requested. When
14328 + flushing to commit, this parameter is NULL.
14329 +
14330 +   2. pos_stop() is called because squalloc discovers that the "next" node in the
14331 +   flush order is either non-existent, not dirty, or not in the same atom.
14332 +*/
14333 +
14334 +static int pos_valid(flush_pos_t * pos)
14335 +{
14336 + return pos->state != POS_INVALID;
14337 +}
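+
+/* A hypothetical sketch (guarded out, not part of the patch) of the loop
+   shape described above: squalloc-style code keeps working while the
+   position is valid and terminates it with pos_stop(). The helper
+   squalloc_one_step() is invented purely for illustration. */
+#if 0
+static void squalloc_loop_sketch(flush_pos_t * pos)
+{
+ while (pos_valid(pos)) {
+ if (squalloc_one_step(pos) != 0)
+ /* the "next" node is missing, clean, or belongs
+ to another atom */
+ pos_stop(pos);
+ }
+}
+#endif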
14338 +
14339 +/* Release any resources of a flush_position. Called when jnode_flush finishes. */
14340 +static void pos_done(flush_pos_t * pos)
14341 +{
14342 + pos_stop(pos);
14343 + blocknr_hint_done(&pos->preceder);
14344 + if (convert_data(pos))
14345 + free_convert_data(pos);
14346 +}
14347 +
14348 +/* Reset the point and parent. Called during flush subroutines to terminate the
14349 + squalloc loop. */
14350 +static int pos_stop(flush_pos_t * pos)
14351 +{
14352 + pos->state = POS_INVALID;
14353 + done_lh(&pos->lock);
14354 + done_load_count(&pos->load);
14355 + coord_init_invalid(&pos->coord, NULL);
14356 +
14357 + if (pos->child) {
14358 + jput(pos->child);
14359 + pos->child = NULL;
14360 + }
14361 +
14362 + return 0;
14363 +}
14364 +
14365 +/* Return the flush_position's block allocator hint. */
14366 +reiser4_blocknr_hint *pos_hint(flush_pos_t * pos)
14367 +{
14368 + return &pos->preceder;
14369 +}
14370 +
14371 +flush_queue_t *pos_fq(flush_pos_t * pos)
14372 +{
14373 + return pos->fq;
14374 +}
14375 +
14376 +/* Make Linus happy.
14377 + Local variables:
14378 + c-indentation-style: "K&R"
14379 + mode-name: "LC"
14380 + c-basic-offset: 8
14381 + tab-width: 8
14382 + fill-column: 90
14383 + LocalWords: preceder
14384 + End:
14385 +*/
14386 Index: linux-2.6.16/fs/reiser4/flush.h
14387 ===================================================================
14388 --- /dev/null
14389 +++ linux-2.6.16/fs/reiser4/flush.h
14390 @@ -0,0 +1,274 @@
14391 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
14392 +
14393 +/* DECLARATIONS: */
14394 +
14395 +#if !defined(__REISER4_FLUSH_H__)
14396 +#define __REISER4_FLUSH_H__
14397 +
14398 +#include "plugin/cluster.h"
14399 +
14400 +/* The flush_scan data structure maintains the state of an in-progress flush-scan on a
14401 + single level of the tree. A flush-scan is used for counting the number of adjacent
14402 + nodes to flush, which is used to determine whether we should relocate, and it is also
14403 + used to find a starting point for flush. A flush-scan object can scan in both right
14404 + and left directions via the scan_left() and scan_right() interfaces. The
14405 + right- and left-variations are similar but perform different functions. When scanning
14406 + left we (optionally perform rapid scanning and then) longterm-lock the endpoint node.
14407 + When scanning right we are simply counting the number of adjacent, dirty nodes. */
14408 +struct flush_scan {
14409 +
14410 + /* The current number of nodes scanned on this level. */
14411 + unsigned count;
14412 +
14413 + /* There may be a maximum number of nodes for a scan on any single level. When
14414 + going leftward, max_count is determined by FLUSH_SCAN_MAXNODES (see reiser4.h). */
14415 + unsigned max_count;
14416 +
14417 + /* Direction: Set to one of the sideof enumeration: { LEFT_SIDE, RIGHT_SIDE }. */
14418 + sideof direction;
14419 +
14420 + /* Initially @stop is false; it is set true once some condition stops the
14421 + search (e.g., we found a clean node before reaching max_count or we found a
14422 + node belonging to another atom). */
14423 + int stop;
14424 +
14425 + /* The current scan position. If @node is non-NULL then its reference count has
14426 + been incremented to reflect this reference. */
14427 + jnode *node;
14428 +
14429 + /* A handle for zload/zrelse of current scan position node. */
14430 + load_count node_load;
14431 +
14432 + /* During left-scan, if the final position (a.k.a. endpoint node) is formatted the
14433 + node is locked using this lock handle. The endpoint needs to be locked for
14434 + transfer to the flush_position object after scanning finishes. */
14435 + lock_handle node_lock;
14436 +
14437 + /* When the position is unformatted, its parent, coordinate, and parent
14438 + zload/zrelse handle. */
14439 + lock_handle parent_lock;
14440 + coord_t parent_coord;
14441 + load_count parent_load;
14442 +
14443 + /* The block allocator preceder hint. Sometimes flush_scan determines what the
14444 + preceder is and if so it sets it here, after which it is copied into the
14445 + flush_position. Otherwise, the preceder is computed later. */
14446 + reiser4_block_nr preceder_blk;
14447 +};
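+
+/* A minimal illustration (hypothetical, guarded out) of how the fields above
+   cooperate: a scan is over once @stop is set or @count reaches @max_count.
+   The real predicate is scan_finished(), declared near the end of this file. */
+#if 0
+static inline int scan_is_over_sketch(const struct flush_scan *scan)
+{
+ return scan->stop || scan->count >= scan->max_count;
+}
+#endif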
14448 +
14449 +typedef struct convert_item_info {
14450 + dc_item_stat d_cur; /* disk cluster state of the current item */
14451 + dc_item_stat d_next; /* disk cluster state of the next slum item */
14452 + struct inode *inode;
14453 + flow_t flow;
14454 +} convert_item_info_t;
14455 +
14456 +typedef struct convert_info {
14457 + int count; /* for terminating squalloc */
14458 + reiser4_cluster_t clust; /* transform cluster */
14459 + item_plugin *iplug; /* current item plugin */
14460 + convert_item_info_t *itm; /* current item info */
14461 +} convert_info_t;
14462 +
14463 +typedef enum flush_position_state {
14464 + POS_INVALID, /* Invalid or stopped pos, do not continue slum
14465 + * processing */
14466 + POS_ON_LEAF, /* pos points to already prepped, locked formatted node at
14467 + * leaf level */
14468 + POS_ON_EPOINT, /* pos keeps a lock on twig level, "coord" field is used
14469 + * to traverse unformatted nodes */
14470 + POS_TO_LEAF, /* pos is being moved to leaf level */
14471 + POS_TO_TWIG, /* pos is being moved to twig level */
14472 + POS_END_OF_TWIG, /* special case of POS_ON_TWIG, when coord is after
14473 + * rightmost unit of the current twig */
14474 + POS_ON_INTERNAL /* same as POS_ON_LEAF, but points to internal node */
14475 +} flushpos_state_t;
14476 +
14477 +/* An encapsulation of the current flush point and all the parameters that are passed
14478 + through the entire squeeze-and-allocate stage of the flush routine. A single
14479 + flush_position object is constructed after left- and right-scanning finishes. */
14480 +struct flush_position {
14481 + flushpos_state_t state;
14482 +
14483 + coord_t coord; /* coord to traverse unformatted nodes */
14484 + lock_handle lock; /* current lock we hold */
14485 + load_count load; /* load status for current locked formatted node */
14486 +
14487 + jnode *child; /* for passing a reference to unformatted child
14488 + * across pos state changes */
14489 +
14490 + reiser4_blocknr_hint preceder; /* The flush 'hint' state. */
14491 + int leaf_relocate; /* True if enough leaf-level nodes were
14492 + * found to suggest a relocate policy. */
14493 + int alloc_cnt; /* The number of nodes allocated during squeeze and allocate. */
14494 + int prep_or_free_cnt; /* The number of nodes prepared for write (allocate) or squeezed and freed. */
14495 + flush_queue_t *fq;
14496 + long *nr_written; /* number of nodes submitted to disk */
14497 + int flags; /* a copy of jnode_flush flags argument */
14498 +
14499 + znode *prev_twig; /* previous parent pointer value, used to catch
14500 + * processing of new twig node */
14501 + convert_info_t *sq; /* convert info */
14502 +
14503 + unsigned long pos_in_unit; /* for extents only: position
14504 + within an extent unit of the
14505 + first jnode of the slum */
14506 + long nr_to_write; /* number of unformatted nodes to handle on flush */
14507 +};
14508 +
14509 +static inline int item_convert_count(flush_pos_t * pos)
14510 +{
14511 + return pos->sq->count;
14512 +}
14513 +static inline void inc_item_convert_count(flush_pos_t * pos)
14514 +{
14515 + pos->sq->count++;
14516 +}
14517 +static inline void set_item_convert_count(flush_pos_t * pos, int count)
14518 +{
14519 + pos->sq->count = count;
14520 +}
14521 +static inline item_plugin *item_convert_plug(flush_pos_t * pos)
14522 +{
14523 + return pos->sq->iplug;
14524 +}
14525 +
14526 +static inline convert_info_t *convert_data(flush_pos_t * pos)
14527 +{
14528 + return pos->sq;
14529 +}
14530 +
14531 +static inline convert_item_info_t *item_convert_data(flush_pos_t * pos)
14532 +{
14533 + assert("edward-955", convert_data(pos));
14534 + return pos->sq->itm;
14535 +}
14536 +
14537 +static inline tfm_cluster_t *tfm_cluster_sq(flush_pos_t * pos)
14538 +{
14539 + return &pos->sq->clust.tc;
14540 +}
14541 +
14542 +static inline tfm_stream_t *tfm_stream_sq(flush_pos_t * pos, tfm_stream_id id)
14543 +{
14544 + assert("edward-854", pos->sq != NULL);
14545 + return tfm_stream(tfm_cluster_sq(pos), id);
14546 +}
14547 +
14548 +static inline int chaining_data_present(flush_pos_t * pos)
14549 +{
14550 + return convert_data(pos) && item_convert_data(pos);
14551 +}
14552 +
14553 +/* Returns true if the next node contains the next item of the disk cluster,
14554 +   so the item convert data should be moved to the right slum neighbor.
14555 +*/
14556 +static inline int should_chain_next_node(flush_pos_t * pos)
14557 +{
14558 + int result = 0;
14559 +
14560 + assert("edward-1007", chaining_data_present(pos));
14561 +
14562 + switch (item_convert_data(pos)->d_next) {
14563 + case DC_CHAINED_ITEM:
14564 + result = 1;
14565 + break;
14566 + case DC_AFTER_CLUSTER:
14567 + break;
14568 + default:
14569 + impossible("edward-1009", "bad state of next slum item");
14570 + }
14571 + return result;
14572 +}
14573 +
14574 +/* update item state in a disk cluster to assign conversion mode */
14575 +static inline void
14576 +move_chaining_data(flush_pos_t * pos, int this_node /* where is next item */ )
14577 +{
14578 +
14579 + assert("edward-1010", chaining_data_present(pos));
14580 +
14581 + if (this_node == 0) {
14582 + /* next item is on the right neighbor */
14583 + assert("edward-1011",
14584 + item_convert_data(pos)->d_cur == DC_FIRST_ITEM ||
14585 + item_convert_data(pos)->d_cur == DC_CHAINED_ITEM);
14586 + assert("edward-1012",
14587 + item_convert_data(pos)->d_next == DC_CHAINED_ITEM);
14588 +
14589 + item_convert_data(pos)->d_cur = DC_CHAINED_ITEM;
14590 + item_convert_data(pos)->d_next = DC_INVALID_STATE;
14591 + } else {
14592 + /* next item is on the same node */
14593 + assert("edward-1013",
14594 + item_convert_data(pos)->d_cur == DC_FIRST_ITEM ||
14595 + item_convert_data(pos)->d_cur == DC_CHAINED_ITEM);
14596 + assert("edward-1227",
14597 + item_convert_data(pos)->d_next == DC_AFTER_CLUSTER ||
14598 + item_convert_data(pos)->d_next == DC_INVALID_STATE);
14599 +
14600 + item_convert_data(pos)->d_cur = DC_AFTER_CLUSTER;
14601 + item_convert_data(pos)->d_next = DC_INVALID_STATE;
14602 + }
14603 +}
14604 +
14605 +static inline int should_convert_node(flush_pos_t * pos, znode * node)
14606 +{
14607 + return znode_convertible(node);
14608 +}
14609 +
14610 +/* true if there is attached convert item info */
14611 +static inline int should_convert_next_node(flush_pos_t * pos, znode * node)
14612 +{
14613 + return convert_data(pos) && item_convert_data(pos);
14614 +}
14615 +
14616 +#define SQUALLOC_THRESHOLD 256
14617 +
14618 +static inline int should_terminate_squalloc(flush_pos_t * pos)
14619 +{
14620 + return convert_data(pos) &&
14621 + !item_convert_data(pos) &&
14622 + item_convert_count(pos) >= SQUALLOC_THRESHOLD;
14623 +}
14624 +
14625 +void free_convert_data(flush_pos_t * pos);
14626 +/* used in extent.c */
14627 +int scan_set_current(flush_scan * scan, jnode * node, unsigned add_size,
14628 + const coord_t * parent);
14629 +int scan_finished(flush_scan * scan);
14630 +int scanning_left(flush_scan * scan);
14631 +int scan_goto(flush_scan * scan, jnode * tonode);
14632 +txn_atom *atom_locked_by_fq(flush_queue_t * fq);
14633 +int alloc_extent(flush_pos_t *flush_pos);
14634 +squeeze_result squalloc_extent(znode *left, const coord_t *, flush_pos_t *,
14635 + reiser4_key *stop_key);
14636 +extern int init_fqs(void);
14637 +extern void done_fqs(void);
14638 +
14639 +#if REISER4_DEBUG
14640 +
14641 +extern void check_fq(const txn_atom *atom);
14642 +extern atomic_t flush_cnt;
14643 +
14644 +#define check_preceder(blk) \
14645 +assert("nikita-2588", blk < reiser4_block_count(reiser4_get_current_sb()));
14646 +extern void check_pos(flush_pos_t * pos);
14647 +#else
14648 +#define check_preceder(b) noop
14649 +#define check_pos(pos) noop
14650 +#endif
14651 +
14652 +/* __REISER4_FLUSH_H__ */
14653 +#endif
14654 +
14655 +/* Make Linus happy.
14656 + Local variables:
14657 + c-indentation-style: "K&R"
14658 + mode-name: "LC"
14659 + c-basic-offset: 8
14660 + tab-width: 8
14661 + fill-column: 90
14662 + LocalWords: preceder
14663 + End:
14664 +*/
14665 Index: linux-2.6.16/fs/reiser4/flush_queue.c
14666 ===================================================================
14667 --- /dev/null
14668 +++ linux-2.6.16/fs/reiser4/flush_queue.c
14669 @@ -0,0 +1,681 @@
14670 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
14671 +
14672 +#include "debug.h"
14673 +#include "super.h"
14674 +#include "txnmgr.h"
14675 +#include "jnode.h"
14676 +#include "znode.h"
14677 +#include "page_cache.h"
14678 +#include "wander.h"
14679 +#include "vfs_ops.h"
14680 +#include "writeout.h"
14681 +#include "flush.h"
14682 +
14683 +#include <linux/bio.h>
14684 +#include <linux/mm.h>
14685 +#include <linux/pagemap.h>
14686 +#include <linux/blkdev.h>
14687 +#include <linux/writeback.h>
14688 +
14689 +/* A flush queue object is an accumulator for keeping jnodes prepared
14690 + by the jnode_flush() function for writing to disk. Those "queued" jnodes are
14691 + kept on the flush queue until memory pressure or atom commit asks
14692 + flush queues to write some or all of their jnodes. */
14693 +
14694 +/*
14695 + LOCKING:
14696 +
14697 + fq->guard spin lock protects the fq->atom pointer and nothing else. The
14698 + fq->prepped list is protected by the atom spin lock and uses the following
14699 + locking:
14700 +
14701 + two ways to protect fq->prepped list for read-only list traversal:
14702 +
14703 + 1. spin-lock the atom.
14704 + 2. fq is IN_USE, atom->nr_running_queues increased.
14705 +
14706 + and one for list modification:
14707 +
14708 + 1. atom is spin-locked and one condition is true: fq is IN_USE or
14709 + atom->nr_running_queues == 0.
14710 +
14711 + The deadlock-safe order for flush queues and atoms is: first lock atom, then
14712 + lock flush queue, then lock jnode.
14713 +*/
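+
+/* A hypothetical illustration (guarded out) of the deadlock-safe ordering
+   stated above: lock the atom first, then the flush queue, then the jnode.
+   This function is not called anywhere; it only demonstrates the order. */
+#if 0
+static void lock_order_sketch(txn_atom * atom, flush_queue_t * fq, jnode * node)
+{
+ spin_lock_atom(atom);
+ spin_lock(&(fq->guard));
+ spin_lock_jnode(node);
+ /* ... work with the locked objects here ... */
+ spin_unlock_jnode(node);
+ spin_unlock(&(fq->guard));
+ spin_unlock_atom(atom);
+}
+#endif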
14714 +
14715 +#define fq_in_use(fq) ((fq)->state & FQ_IN_USE)
14716 +#define fq_ready(fq) (!fq_in_use(fq))
14717 +
14718 +#define mark_fq_in_use(fq) do { (fq)->state |= FQ_IN_USE; } while (0)
14719 +#define mark_fq_ready(fq) do { (fq)->state &= ~FQ_IN_USE; } while (0)
14720 +
14721 +/* get lock on atom from locked flush queue object */
14722 +static txn_atom *atom_locked_by_fq_nolock(flush_queue_t * fq)
14723 +{
14724 + /* This code is similar to jnode_get_atom(), look at it for the
14725 + * explanation. */
14726 + txn_atom *atom;
14727 +
14728 + assert_spin_locked(&(fq->guard));
14729 +
14730 + while (1) {
14731 + atom = fq->atom;
14732 + if (atom == NULL)
14733 + break;
14734 +
14735 + if (spin_trylock_atom(atom))
14736 + break;
14737 +
14738 + atomic_inc(&atom->refcount);
14739 + spin_unlock(&(fq->guard));
14740 + spin_lock_atom(atom);
14741 + spin_lock(&(fq->guard));
14742 +
14743 + if (fq->atom == atom) {
14744 + atomic_dec(&atom->refcount);
14745 + break;
14746 + }
14747 +
14748 + spin_unlock(&(fq->guard));
14749 + atom_dec_and_unlock(atom);
14750 + spin_lock(&(fq->guard));
14751 + }
14752 +
14753 + return atom;
14754 +}
14755 +
14756 +txn_atom *atom_locked_by_fq(flush_queue_t * fq)
14757 +{
14758 + txn_atom *atom;
14759 +
14760 + spin_lock(&(fq->guard));
14761 + atom = atom_locked_by_fq_nolock(fq);
14762 + spin_unlock(&(fq->guard));
14763 + return atom;
14764 +}
14765 +
14766 +static void init_fq(flush_queue_t * fq)
14767 +{
14768 + memset(fq, 0, sizeof *fq);
14769 +
14770 + atomic_set(&fq->nr_submitted, 0);
14771 +
14772 + INIT_LIST_HEAD(ATOM_FQ_LIST(fq));
14773 +
14774 + sema_init(&fq->io_sem, 0);
14775 + spin_lock_init(&fq->guard);
14776 +}
14777 +
14778 +/* slab for flush queues */
14779 +static kmem_cache_t *fq_slab;
14780 +
14781 +
14782 +/**
14783 + * init_fqs - create flush queue cache
14784 + *
14785 + * Initializes slab cache of flush queues. It is part of reiser4 module
14786 + * initialization.
14787 + */
14788 +int init_fqs(void)
14789 +{
14790 + fq_slab = kmem_cache_create("fq",
14791 + sizeof(flush_queue_t),
14792 + 0, SLAB_HWCACHE_ALIGN, NULL, NULL);
14793 + if (fq_slab == NULL)
14794 + return RETERR(-ENOMEM);
14795 + return 0;
14796 +}
14797 +
14798 +/**
14799 + * done_fqs - delete flush queue cache
14800 + *
14801 + * This is called on reiser4 module unloading or system shutdown.
14802 + */
14803 +void done_fqs(void)
14804 +{
14805 + destroy_reiser4_cache(&fq_slab);
14806 +}
14807 +
14808 +/* create new flush queue object */
14809 +static flush_queue_t *create_fq(gfp_t gfp)
14810 +{
14811 + flush_queue_t *fq;
14812 +
14813 + fq = kmem_cache_alloc(fq_slab, gfp);
14814 + if (fq)
14815 + init_fq(fq);
14816 +
14817 + return fq;
14818 +}
14819 +
14820 +/* adjust atom's and flush queue's counters of queued nodes */
14821 +static void count_enqueued_node(flush_queue_t * fq)
14822 +{
14823 + ON_DEBUG(fq->atom->num_queued++);
14824 +}
14825 +
14826 +static void count_dequeued_node(flush_queue_t * fq)
14827 +{
14828 + assert("zam-993", fq->atom->num_queued > 0);
14829 + ON_DEBUG(fq->atom->num_queued--);
14830 +}
14831 +
14832 +/* attach flush queue object to the atom */
14833 +static void attach_fq(txn_atom *atom, flush_queue_t *fq)
14834 +{
14835 + assert_spin_locked(&(atom->alock));
14836 + list_add(&fq->alink, &atom->flush_queues);
14837 + fq->atom = atom;
14838 + ON_DEBUG(atom->nr_flush_queues++);
14839 +}
14840 +
14841 +static void detach_fq(flush_queue_t * fq)
14842 +{
14843 + assert_spin_locked(&(fq->atom->alock));
14844 +
14845 + spin_lock(&(fq->guard));
14846 + list_del_init(&fq->alink);
14847 + assert("vs-1456", fq->atom->nr_flush_queues > 0);
14848 + ON_DEBUG(fq->atom->nr_flush_queues--);
14849 + fq->atom = NULL;
14850 + spin_unlock(&(fq->guard));
14851 +}
14852 +
14853 +/* destroy flush queue object */
14854 +static void done_fq(flush_queue_t * fq)
14855 +{
14856 + assert("zam-763", list_empty_careful(ATOM_FQ_LIST(fq)));
14857 + assert("zam-766", atomic_read(&fq->nr_submitted) == 0);
14858 +
14859 + kmem_cache_free(fq_slab, fq);
14860 +}
14861 +
14862 +/* Mark @node as flush-queued and count it among the atom's queued nodes. */
14863 +void mark_jnode_queued(flush_queue_t * fq, jnode * node)
14864 +{
14865 + JF_SET(node, JNODE_FLUSH_QUEUED);
14866 + count_enqueued_node(fq);
14867 +}
14868 +
14869 +/* Put a jnode onto the flush queue. Both the atom and the jnode must be
14870 +   spin-locked. */
14871 +void queue_jnode(flush_queue_t * fq, jnode * node)
14872 +{
14873 + assert_spin_locked(&(node->guard));
14874 + assert("zam-713", node->atom != NULL);
14875 + assert_spin_locked(&(node->atom->alock));
14876 + assert("zam-716", fq->atom != NULL);
14877 + assert("zam-717", fq->atom == node->atom);
14878 + assert("zam-907", fq_in_use(fq));
14879 +
14880 + assert("zam-714", JF_ISSET(node, JNODE_DIRTY));
14881 + assert("zam-826", JF_ISSET(node, JNODE_RELOC));
14882 + assert("vs-1481", !JF_ISSET(node, JNODE_FLUSH_QUEUED));
14883 + assert("vs-1481", NODE_LIST(node) != FQ_LIST);
14884 +
14885 + mark_jnode_queued(fq, node);
14886 + list_move_tail(&node->capture_link, ATOM_FQ_LIST(fq));
14887 +
14888 + ON_DEBUG(count_jnode(node->atom, node, NODE_LIST(node),
14889 + FQ_LIST, 1));
14890 +}
14891 +
14892 +/* repeatable process for waiting on I/O completion on a flush queue object */
14893 +static int wait_io(flush_queue_t * fq, int *nr_io_errors)
14894 +{
14895 + assert("zam-738", fq->atom != NULL);
14896 + assert_spin_locked(&(fq->atom->alock));
14897 + assert("zam-736", fq_in_use(fq));
14898 + assert("zam-911", list_empty_careful(ATOM_FQ_LIST(fq)));
14899 +
14900 + if (atomic_read(&fq->nr_submitted) != 0) {
14901 + struct super_block *super;
14902 +
14903 + spin_unlock_atom(fq->atom);
14904 +
14905 + assert("nikita-3013", schedulable());
14906 +
14907 + super = reiser4_get_current_sb();
14908 +
14909 + /* FIXME: this is instead of blk_run_queues() */
14910 + blk_run_address_space(get_super_fake(super)->i_mapping);
14911 +
14912 + if (!(super->s_flags & MS_RDONLY))
14913 + down(&fq->io_sem);
14914 +
14915 + /* Ask the caller to re-acquire the locks and call this
14916 + function again. Note: this technique is commonly used in
14917 + the txnmgr code. */
14918 + return -E_REPEAT;
14919 + }
14920 +
14921 + *nr_io_errors += atomic_read(&fq->nr_errors);
14922 + return 0;
14923 +}
14924 +
14925 +/* wait on I/O completion, re-submit dirty nodes to write */
14926 +static int finish_fq(flush_queue_t * fq, int *nr_io_errors)
14927 +{
14928 + int ret;
14929 + txn_atom *atom = fq->atom;
14930 +
14931 + assert("zam-801", atom != NULL);
14932 + assert_spin_locked(&(atom->alock));
14933 + assert("zam-762", fq_in_use(fq));
14934 +
14935 + ret = wait_io(fq, nr_io_errors);
14936 + if (ret)
14937 + return ret;
14938 +
14939 + detach_fq(fq);
14940 + done_fq(fq);
14941 +
14942 + atom_send_event(atom);
14943 +
14944 + return 0;
14945 +}
14946 +
14947 +/* Wait for all I/O for the given atom to complete; actually does one iteration
14948 +   and returns -E_REPEAT if more iterations are needed */
14949 +static int finish_all_fq(txn_atom * atom, int *nr_io_errors)
14950 +{
14951 + flush_queue_t *fq;
14952 +
14953 + assert_spin_locked(&(atom->alock));
14954 +
14955 + if (list_empty_careful(&atom->flush_queues))
14956 + return 0;
14957 +
14958 + list_for_each_entry(fq, &atom->flush_queues, alink) {
14959 + if (fq_ready(fq)) {
14960 + int ret;
14961 +
14962 + mark_fq_in_use(fq);
14963 + assert("vs-1247", fq->owner == NULL);
14964 + ON_DEBUG(fq->owner = current);
14965 + ret = finish_fq(fq, nr_io_errors);
14966 +
14967 + if (*nr_io_errors)
14968 + reiser4_handle_error();
14969 +
14970 + if (ret) {
14971 + fq_put(fq);
14972 + return ret;
14973 + }
14974 +
14975 + spin_unlock_atom(atom);
14976 +
14977 + return -E_REPEAT;
14978 + }
14979 + }
14980 +
14981 + /* All flush queues are in use; atom remains locked */
14982 + return -EBUSY;
14983 +}
14984 +
14985 +/* wait all i/o for current atom */
14986 +int current_atom_finish_all_fq(void)
14987 +{
14988 + txn_atom *atom;
14989 + int nr_io_errors = 0;
14990 + int ret = 0;
14991 +
14992 + do {
14993 + while (1) {
14994 + atom = get_current_atom_locked();
14995 + ret = finish_all_fq(atom, &nr_io_errors);
14996 + if (ret != -EBUSY)
14997 + break;
14998 + atom_wait_event(atom);
14999 + }
15000 + } while (ret == -E_REPEAT);
15001 +
15002 + /* we do not need the atom locked after this function finishes; SUCCESS
15003 + and -EBUSY are the two return codes for which the atom remains locked
15004 + after finish_all_fq */
15005 + if (!ret)
15006 + spin_unlock_atom(atom);
15007 +
15008 + assert_spin_not_locked(&(atom->alock));
15009 +
15010 + if (ret)
15011 + return ret;
15012 +
15013 + if (nr_io_errors)
15014 + return RETERR(-EIO);
15015 +
15016 + return 0;
15017 +}
15018 +
15019 +/* change the node->atom field for all jnodes on the given list */
15020 +static void
15021 +scan_fq_and_update_atom_ref(struct list_head *list, txn_atom *atom)
15022 +{
15023 + jnode *cur;
15024 +
15025 + list_for_each_entry(cur, list, capture_link) {
15026 + spin_lock_jnode(cur);
15027 + cur->atom = atom;
15028 + spin_unlock_jnode(cur);
15029 + }
15030 +}
15031 +
15032 +/* support for atom fusion operation */
15033 +void fuse_fq(txn_atom *to, txn_atom *from)
15034 +{
15035 + flush_queue_t *fq;
15036 +
15037 + assert_spin_locked(&(to->alock));
15038 + assert_spin_locked(&(from->alock));
15039 +
15040 + list_for_each_entry(fq, &from->flush_queues, alink) {
15041 + scan_fq_and_update_atom_ref(ATOM_FQ_LIST(fq), to);
15042 + spin_lock(&(fq->guard));
15043 + fq->atom = to;
15044 + spin_unlock(&(fq->guard));
15045 + }
15046 +
15047 + list_splice_init(&from->flush_queues, to->flush_queues.prev);
15048 +
15049 +#if REISER4_DEBUG
15050 + to->num_queued += from->num_queued;
15051 + to->nr_flush_queues += from->nr_flush_queues;
15052 + from->nr_flush_queues = 0;
15053 +#endif
15054 +}
15055 +
15056 +#if REISER4_DEBUG
15057 +int atom_fq_parts_are_clean(txn_atom * atom)
15058 +{
15059 + assert("zam-915", atom != NULL);
15060 + return list_empty_careful(&atom->flush_queues);
15061 +}
15062 +#endif
15063 +/* Bio i/o completion routine for reiser4 write operations. */
15064 +static int
15065 +end_io_handler(struct bio *bio, unsigned int bytes_done UNUSED_ARG,
15066 + int err)
15067 +{
15068 + int i;
15069 + int nr_errors = 0;
15070 + flush_queue_t *fq;
15071 +
15072 + assert("zam-958", bio->bi_rw & WRITE);
15073 +
15074 + /* i/o op. is not fully completed */
15075 + if (bio->bi_size != 0)
15076 + return 1;
15077 +
15078 + if (err == -EOPNOTSUPP)
15079 + set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
15080 +
15081 + /* we expect that bio->bi_private is set to NULL or to an fq object which
15082 + * is used for synchronization and error counting. */
15083 + fq = bio->bi_private;
15084 + /* Check all elements of io_vec for correct write completion. */
15085 + for (i = 0; i < bio->bi_vcnt; i += 1) {
15086 + struct page *pg = bio->bi_io_vec[i].bv_page;
15087 +
15088 + if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
15089 + SetPageError(pg);
15090 + nr_errors++;
15091 + }
15092 +
15093 + {
15094 + /* jnode WRITEBACK (the "write is in progress" bit) is
15095 + * atomically cleared here. */
15096 + jnode *node;
15097 +
15098 + assert("zam-736", pg != NULL);
15099 + assert("zam-736", PagePrivate(pg));
15100 + node = jprivate(pg);
15101 +
15102 + JF_CLR(node, JNODE_WRITEBACK);
15103 + }
15104 +
15105 + end_page_writeback(pg);
15106 + page_cache_release(pg);
15107 + }
15108 +
15109 + if (fq) {
15110 + /* count i/o error in fq object */
15111 + atomic_add(nr_errors, &fq->nr_errors);
15112 +
15113 + /* If all write requests registered in this "fq" are done we up
15114 + * the semaphore. */
15115 + if (atomic_sub_and_test(bio->bi_vcnt, &fq->nr_submitted))
15116 + up(&fq->io_sem);
15117 + }
15118 +
15119 + bio_put(bio);
15120 + return 0;
15121 +}
15122 +
15123 +/* Count I/O requests which will be submitted by @bio in the given flush
15124 +   queue @fq */
15125 +void add_fq_to_bio(flush_queue_t * fq, struct bio *bio)
15126 +{
15127 + bio->bi_private = fq;
15128 + bio->bi_end_io = end_io_handler;
15129 +
15130 + if (fq)
15131 + atomic_add(bio->bi_vcnt, &fq->nr_submitted);
15132 +}
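+
+/* A hypothetical usage sketch (guarded out): a caller would attach a bio to
+   @fq with add_fq_to_bio() right before submission, so that end_io_handler()
+   can account the completion against @fq. */
+#if 0
+static void submit_one_bio_sketch(flush_queue_t * fq, struct bio *bio)
+{
+ add_fq_to_bio(fq, bio);
+ submit_bio(WRITE, bio);
+}
+#endif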
15133 +
15134 +/* Move all queued nodes out from @fq->prepped list. */
15135 +static void release_prepped_list(flush_queue_t * fq)
15136 +{
15137 + txn_atom *atom;
15138 +
15139 + assert("zam-904", fq_in_use(fq));
15140 + atom = atom_locked_by_fq(fq);
15141 +
15142 + while (!list_empty(ATOM_FQ_LIST(fq))) {
15143 + jnode *cur;
15144 +
15145 + cur = list_entry(ATOM_FQ_LIST(fq)->next, jnode, capture_link);
15146 + list_del_init(&cur->capture_link);
15147 +
15148 + count_dequeued_node(fq);
15149 + spin_lock_jnode(cur);
15150 + assert("nikita-3154", !JF_ISSET(cur, JNODE_OVRWR));
15151 + assert("nikita-3154", JF_ISSET(cur, JNODE_RELOC));
15152 + assert("nikita-3154", JF_ISSET(cur, JNODE_FLUSH_QUEUED));
15153 + JF_CLR(cur, JNODE_FLUSH_QUEUED);
15154 +
15155 + if (JF_ISSET(cur, JNODE_DIRTY)) {
15156 + list_add_tail(&cur->capture_link,
15157 + ATOM_DIRTY_LIST(atom, jnode_get_level(cur)));
15158 + ON_DEBUG(count_jnode(atom, cur, FQ_LIST,
15159 + DIRTY_LIST, 1));
15160 + } else {
15161 + list_add_tail(&cur->capture_link, ATOM_CLEAN_LIST(atom));
15162 + ON_DEBUG(count_jnode(atom, cur, FQ_LIST,
15163 + CLEAN_LIST, 1));
15164 + }
15165 +
15166 + spin_unlock_jnode(cur);
15167 + }
15168 +
15169 + if (--atom->nr_running_queues == 0)
15170 + atom_send_event(atom);
15171 +
15172 + spin_unlock_atom(atom);
15173 +}
15174 +
15175 +/* Submit write requests for nodes on the already filled flush queue @fq.
15176 +
15177 + @fq: flush queue object which contains jnodes we can (and will) write.
15178 + @return: number of submitted blocks (>=0) if success, otherwise -- an error
15179 + code (<0). */
15180 +int write_fq(flush_queue_t * fq, long *nr_submitted, int flags)
15181 +{
15182 + int ret;
15183 + txn_atom *atom;
15184 +
15185 + while (1) {
15186 + atom = atom_locked_by_fq(fq);
15187 + assert("zam-924", atom);
15188 + /* do not write fq in parallel. */
15189 + if (atom->nr_running_queues == 0
15190 + || !(flags & WRITEOUT_SINGLE_STREAM))
15191 + break;
15192 + atom_wait_event(atom);
15193 + }
15194 +
15195 + atom->nr_running_queues++;
15196 + spin_unlock_atom(atom);
15197 +
15198 + ret = write_jnode_list(ATOM_FQ_LIST(fq), fq, nr_submitted, flags);
15199 + release_prepped_list(fq);
15200 +
15201 + return ret;
15202 +}
15203 +
15204 +/* Get a flush queue object for exclusive use by one thread. May require
15205 +   several iterations, which is indicated by the -E_REPEAT return code.
15206 +
15207 +   This function does not contain code for obtaining an atom lock because an
15208 +   atom lock is obtained in different ways in different parts of reiser4;
15209 +   usually it is the current atom, but we also need a way to get an fq for
15210 +   the atom of a given jnode. */
15211 +static int fq_by_atom_gfp(txn_atom *atom, flush_queue_t **new_fq, gfp_t gfp)
15212 +{
15213 + flush_queue_t *fq;
15214 +
15215 + assert_spin_locked(&(atom->alock));
15216 +
15217 + fq = list_entry(atom->flush_queues.next, flush_queue_t, alink);
15218 + while (&atom->flush_queues != &fq->alink) {
15219 + spin_lock(&(fq->guard));
15220 +
15221 + if (fq_ready(fq)) {
15222 + mark_fq_in_use(fq);
15223 + assert("vs-1246", fq->owner == NULL);
15224 + ON_DEBUG(fq->owner = current);
15225 + spin_unlock(&(fq->guard));
15226 +
15227 + if (*new_fq)
15228 + done_fq(*new_fq);
15229 +
15230 + *new_fq = fq;
15231 +
15232 + return 0;
15233 + }
15234 +
15235 + spin_unlock(&(fq->guard));
15236 +
15237 + fq = list_entry(fq->alink.next, flush_queue_t, alink);
15238 + }
15239 +
15240 + /* Use previously allocated fq object */
15241 + if (*new_fq) {
15242 + mark_fq_in_use(*new_fq);
15243 + assert("vs-1248", (*new_fq)->owner == 0);
15244 + ON_DEBUG((*new_fq)->owner = current);
15245 + attach_fq(atom, *new_fq);
15246 +
15247 + return 0;
15248 + }
15249 +
15250 + spin_unlock_atom(atom);
15251 +
15252 + *new_fq = create_fq(gfp);
15253 +
15254 + if (*new_fq == NULL)
15255 + return RETERR(-ENOMEM);
15256 +
15257 + return RETERR(-E_REPEAT);
15258 +}
15259 +
15260 +int fq_by_atom(txn_atom * atom, flush_queue_t ** new_fq)
15261 +{
15262 + return fq_by_atom_gfp(atom, new_fq, get_gfp_mask());
15263 +}
15264 +
15265 +/* A wrapper around fq_by_atom for getting a flush queue object for the
15266 + * current atom; on success fq->atom remains locked. */
15267 +flush_queue_t *get_fq_for_current_atom(void)
15268 +{
15269 + flush_queue_t *fq = NULL;
15270 + txn_atom *atom;
15271 + int ret;
15272 +
15273 + do {
15274 + atom = get_current_atom_locked();
15275 + ret = fq_by_atom(atom, &fq);
15276 + } while (ret == -E_REPEAT);
15277 +
15278 + if (ret)
15279 + return ERR_PTR(ret);
15280 + return fq;
15281 +}
15282 +
15283 +/* Releasing flush queue object after exclusive use */
15284 +void fq_put_nolock(flush_queue_t *fq)
15285 +{
15286 + assert("zam-747", fq->atom != NULL);
15287 + assert("zam-902", list_empty_careful(ATOM_FQ_LIST(fq)));
15288 + mark_fq_ready(fq);
15289 + assert("vs-1245", fq->owner == current);
15290 + ON_DEBUG(fq->owner = NULL);
15291 +}
15292 +
15293 +void fq_put(flush_queue_t * fq)
15294 +{
15295 + txn_atom *atom;
15296 +
15297 + spin_lock(&(fq->guard));
15298 + atom = atom_locked_by_fq_nolock(fq);
15299 +
15300 + assert("zam-746", atom != NULL);
15301 +
15302 + fq_put_nolock(fq);
15303 + atom_send_event(atom);
15304 +
15305 + spin_unlock(&(fq->guard));
15306 + spin_unlock_atom(atom);
15307 +}
15308 +
15309 +/* A part of atom object initialization related to the embedded flush queue
15310 + list head */
15311 +
15312 +void init_atom_fq_parts(txn_atom *atom)
15313 +{
15314 + INIT_LIST_HEAD(&atom->flush_queues);
15315 +}
15316 +
15317 +#if REISER4_DEBUG
15318 +
15319 +void check_fq(const txn_atom *atom)
15320 +{
15321 + /* check number of nodes on all atom's flush queues */
15322 + flush_queue_t *fq;
15323 + int count;
15324 + struct list_head *pos;
15325 +
15326 + count = 0;
15327 + list_for_each_entry(fq, &atom->flush_queues, alink) {
15328 + spin_lock(&(fq->guard));
15329 + /* calculate the number of jnodes on fq's list of prepped jnodes */
15330 + list_for_each(pos, ATOM_FQ_LIST(fq))
15331 + count++;
15332 + spin_unlock(&(fq->guard));
15333 + }
15334 + if (count != atom->fq)
15335 + warning("", "fq counter %d, real %d\n", atom->fq, count);
15336 +
15337 +}
15338 +
15339 +#endif
15340 +
15341 +/*
15342 + * Local variables:
15343 + * c-indentation-style: "K&R"
15344 + * mode-name: "LC"
15345 + * c-basic-offset: 8
15346 + * tab-width: 8
15347 + * fill-column: 79
15348 + * scroll-step: 1
15349 + * End:
15350 + */
15351 Index: linux-2.6.16/fs/reiser4/forward.h
15352 ===================================================================
15353 --- /dev/null
15354 +++ linux-2.6.16/fs/reiser4/forward.h
15355 @@ -0,0 +1,258 @@
15356 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
15357 +
15358 +/* Forward declarations. Thank you Kernighan. */
15359 +
15360 +#if !defined( __REISER4_FORWARD_H__ )
15361 +#define __REISER4_FORWARD_H__
15362 +
15363 +#include <asm/errno.h>
15364 +#include <linux/types.h>
15365 +
15366 +typedef struct zlock zlock;
15367 +typedef struct lock_stack lock_stack;
15368 +typedef struct lock_handle lock_handle;
15369 +typedef struct znode znode;
15370 +typedef struct flow flow_t;
15371 +typedef struct coord coord_t;
15372 +typedef struct tree_access_pointer tap_t;
15373 +typedef struct item_coord item_coord;
15374 +typedef struct shift_params shift_params;
15375 +typedef struct reiser4_object_create_data reiser4_object_create_data;
15376 +typedef union reiser4_plugin reiser4_plugin;
15377 +typedef __u16 reiser4_plugin_id;
15378 +typedef struct item_plugin item_plugin;
15379 +typedef struct jnode_plugin jnode_plugin;
15380 +typedef struct reiser4_item_data reiser4_item_data;
15381 +typedef union reiser4_key reiser4_key;
15382 +typedef struct reiser4_tree reiser4_tree;
15383 +typedef struct carry_cut_data carry_cut_data;
15384 +typedef struct carry_kill_data carry_kill_data;
15385 +typedef struct carry_tree_op carry_tree_op;
15386 +typedef struct carry_tree_node carry_tree_node;
15387 +typedef struct carry_plugin_info carry_plugin_info;
15388 +typedef struct reiser4_journal reiser4_journal;
15389 +typedef struct txn_atom txn_atom;
15390 +typedef struct txn_handle txn_handle;
15391 +typedef struct txn_mgr txn_mgr;
15392 +typedef struct reiser4_dir_entry_desc reiser4_dir_entry_desc;
15393 +typedef struct reiser4_context reiser4_context;
15394 +typedef struct carry_level carry_level;
15395 +typedef struct blocknr_set blocknr_set;
15396 +typedef struct blocknr_set_entry blocknr_set_entry;
15397 +/* super_block->s_fs_info points to this */
15398 +typedef struct reiser4_super_info_data reiser4_super_info_data;
15399 +/* next two objects are fields of reiser4_super_info_data */
15400 +typedef struct reiser4_oid_allocator reiser4_oid_allocator;
15401 +typedef struct reiser4_space_allocator reiser4_space_allocator;
15402 +
15403 +typedef struct flush_scan flush_scan;
15404 +typedef struct flush_position flush_pos_t;
15405 +
15406 +typedef unsigned short pos_in_node_t;
15407 +#define MAX_POS_IN_NODE 65535
15408 +
15409 +typedef struct jnode jnode;
15410 +typedef struct reiser4_blocknr_hint reiser4_blocknr_hint;
15411 +
15412 +typedef struct uf_coord uf_coord_t;
15413 +typedef struct hint hint_t;
15414 +
15415 +typedef struct ktxnmgrd_context ktxnmgrd_context;
15416 +
15417 +typedef struct reiser4_xattr_plugin reiser4_xattr_plugin;
15418 +
15419 +struct inode;
15420 +struct page;
15421 +struct file;
15422 +struct dentry;
15423 +struct super_block;
15424 +
15425 +/* return values of coord_by_key(). cbk == coord_by_key */
15426 +typedef enum {
15427 + CBK_COORD_FOUND = 0,
15428 + CBK_COORD_NOTFOUND = -ENOENT,
15429 +} lookup_result;
15430 +
15431 +/* results of lookup with directory file */
15432 +typedef enum {
15433 + FILE_NAME_FOUND = 0,
15434 + FILE_NAME_NOTFOUND = -ENOENT,
15435 + FILE_IO_ERROR = -EIO, /* FIXME: it seems silly to have special OOM, IO_ERROR return codes for each search. */
15436 + FILE_OOM = -ENOMEM /* FIXME: it seems silly to have special OOM, IO_ERROR return codes for each search. */
15437 +} file_lookup_result;
15438 +
15439 +/* behaviors of lookup. If the coord we are looking for is actually in the tree,
15440 + both coincide. */
15441 +typedef enum {
15442 + /* search exactly for the coord with key given */
15443 + FIND_EXACT,
15444 + /* search for coord with the maximal key not greater than one
15445 + given */
15446 + FIND_MAX_NOT_MORE_THAN /*LEFT_SLANT_BIAS */
15447 +} lookup_bias;
15448 +
15449 +typedef enum {
15450 + /* number of the leaf level of the tree.
15451 + The fake root has (tree_level=0). */
15452 + LEAF_LEVEL = 1,
15453 +
15454 + /* number of the level one above the leaf level of the tree.
15455 +
15456 + It is supposed that internal tree used by reiser4 to store file
15457 + system data and meta data will have height 2 initially (when
15458 + created by mkfs).
15459 + */
15460 + TWIG_LEVEL = 2,
15461 +} tree_level;
15462 +
15463 +/* The "real" maximum ztree height is the 0-origin size of any per-level
15464 + array, since the zero'th level is not used. */
15465 +#define REAL_MAX_ZTREE_HEIGHT (REISER4_MAX_ZTREE_HEIGHT-LEAF_LEVEL)
15466 +
15467 +/* enumeration of the possible mutual positions of item and coord. This enum is
15468 +   the return type of the ->is_in_item() item plugin method. */
15469 +typedef enum {
15470 + /* coord is on the left of an item */
15471 + IP_ON_THE_LEFT,
15472 + /* coord is inside item */
15473 + IP_INSIDE,
15474 + /* coord is inside item, but to the right of the rightmost unit of
15475 + this item */
15476 + IP_RIGHT_EDGE,
15477 + /* coord is on the right of an item */
15478 + IP_ON_THE_RIGHT
15479 +} interposition;
15480 +
15481 +/* type of lock to acquire on znode before returning it to caller */
15482 +typedef enum {
15483 + ZNODE_NO_LOCK = 0,
15484 + ZNODE_READ_LOCK = 1,
15485 + ZNODE_WRITE_LOCK = 2,
15486 +} znode_lock_mode;
15487 +
15488 +/* type of lock request */
15489 +typedef enum {
15490 + ZNODE_LOCK_LOPRI = 0,
15491 + ZNODE_LOCK_HIPRI = (1 << 0),
15492 +
15493 + /* By setting the ZNODE_LOCK_NONBLOCK flag in a lock request the call to longterm_lock_znode will not sleep
15494 + waiting for the lock to become available. If the lock is unavailable, reiser4_znode_lock will immediately
15495 + return the value -E_REPEAT. */
15496 + ZNODE_LOCK_NONBLOCK = (1 << 1),
15497 + /* An option for longterm_lock_znode which prevents atom fusion */
15498 + ZNODE_LOCK_DONT_FUSE = (1 << 2)
15499 +} znode_lock_request;
15500 +
15501 +typedef enum { READ_OP = 0, WRITE_OP = 1 } rw_op;
15502 +
15503 +/* used to specify direction of shift. These must be -1 and 1 */
15504 +typedef enum {
15505 + SHIFT_LEFT = 1,
15506 + SHIFT_RIGHT = -1
15507 +} shift_direction;
15508 +
15509 +typedef enum {
15510 + LEFT_SIDE,
15511 + RIGHT_SIDE
15512 +} sideof;
15513 +
15514 +#define round_up( value, order ) \
15515 + ( ( typeof( value ) )( ( ( long ) ( value ) + ( order ) - 1U ) & \
15516 + ~( ( order ) - 1 ) ) )
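+/* For example, assuming @order is a power of two as the mask arithmetic
+   requires: round_up(1000, 512) == 1024 and round_up(1024, 512) == 1024. */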
15517 +
15518 +/* values returned by squalloc_right_neighbor and its auxiliary functions */
15519 +typedef enum {
15520 + /* unit of internal item is moved */
15521 + SUBTREE_MOVED = 0,
15522 + /* nothing else can be squeezed into left neighbor */
15523 + SQUEEZE_TARGET_FULL = 1,
15524 + /* all content of node is squeezed into its left neighbor */
15525 + SQUEEZE_SOURCE_EMPTY = 2,
15526 + /* one more item is copied (this is only returned by
15527 + allocate_and_copy_extent to squalloc_twig) */
15528 + SQUEEZE_CONTINUE = 3
15529 +} squeeze_result;
15530 +
15531 +/* Do not change item ids. If you do, there will be a format change. */
15532 +typedef enum {
15533 + STATIC_STAT_DATA_ID = 0x0,
15534 + SIMPLE_DIR_ENTRY_ID = 0x1,
15535 + COMPOUND_DIR_ID = 0x2,
15536 + NODE_POINTER_ID = 0x3,
15537 + EXTENT_POINTER_ID = 0x5,
15538 + FORMATTING_ID = 0x6,
15539 + CTAIL_ID = 0x7,
15540 + BLACK_BOX_ID = 0x8,
15541 + LAST_ITEM_ID = 0x9
15542 +} item_id;
15543 +
15544 +/* Flags passed to jnode_flush() to allow it to distinguish default settings based on
15545 + whether commit() was called or VM memory pressure was applied. */
15546 +typedef enum {
15547 + /* submit flush queue to disk at jnode_flush completion */
15548 + JNODE_FLUSH_WRITE_BLOCKS = 1,
15549 +
15550 + /* flush is called for commit */
15551 + JNODE_FLUSH_COMMIT = 2,
15552 + /* not implemented */
15553 + JNODE_FLUSH_MEMORY_FORMATTED = 4,
15554 +
15555 + /* not implemented */
15556 + JNODE_FLUSH_MEMORY_UNFORMATTED = 8,
15557 +} jnode_flush_flags;
15558 +
15559 +/* Flags to insert/paste carry operations. Currently they are only used in the
15560 +   flushing code, but in the future they can be used to optimize for repetitive
15561 +   accesses. */
15562 +typedef enum {
15563 + /* carry is not allowed to shift data to the left when trying to find
15564 + free space */
15565 + COPI_DONT_SHIFT_LEFT = (1 << 0),
15566 + /* carry is not allowed to shift data to the right when trying to find
15567 + free space */
15568 + COPI_DONT_SHIFT_RIGHT = (1 << 1),
15569 + /* carry is not allowed to allocate new node(s) when trying to find
15570 + free space */
15571 + COPI_DONT_ALLOCATE = (1 << 2),
15572 + /* try to load the left neighbor if it is not in the cache */
15573 + COPI_LOAD_LEFT = (1 << 3),
15574 + /* try to load the right neighbor if it is not in the cache */
15575 + COPI_LOAD_RIGHT = (1 << 4),
15576 + /* shift insertion point to the left neighbor */
15577 + COPI_GO_LEFT = (1 << 5),
15578 + /* shift insertion point to the right neighbor */
15579 + COPI_GO_RIGHT = (1 << 6),
15580 + /* try to step back into original node if insertion into new node
15581 + fails after shifting data there. */
15582 + COPI_STEP_BACK = (1 << 7)
15583 +} cop_insert_flag;
15584 +
15585 +typedef enum {
15586 + SAFE_UNLINK, /* safe-link for unlink */
15587 + SAFE_TRUNCATE /* safe-link for truncate */
15588 +} reiser4_safe_link_t;
15589 +
15590 +/* this shows which of the atom's lists a jnode is on */
15591 +typedef enum {
15592 + NOT_CAPTURED,
15593 + DIRTY_LIST,
15594 + CLEAN_LIST,
15595 + FQ_LIST,
15596 + WB_LIST,
15597 + OVRWR_LIST
15598 +} atom_list;
15599 +
15600 +
15601 +
15602 +/* __REISER4_FORWARD_H__ */
15603 +#endif
15604 +
15605 +/* Make Linus happy.
15606 + Local variables:
15607 + c-indentation-style: "K&R"
15608 + mode-name: "LC"
15609 + c-basic-offset: 8
15610 + tab-width: 8
15611 + fill-column: 120
15612 + End:
15613 +*/
15614 Index: linux-2.6.16/fs/reiser4/fsdata.c
15615 ===================================================================
15616 --- /dev/null
15617 +++ linux-2.6.16/fs/reiser4/fsdata.c
15618 @@ -0,0 +1,803 @@
15619 +/* Copyright 2001, 2002, 2003, 2004, 2005 by Hans Reiser, licensing governed by
15620 + * reiser4/README */
15621 +
15622 +#include "fsdata.h"
15623 +#include "inode.h"
15624 +
15625 +
15626 +/* cache of dir_cursors */
15627 +static kmem_cache_t *d_cursor_cache;
15628 +static struct shrinker *d_cursor_shrinker;
15629 +
15630 +/* list of unused cursors */
15631 +static LIST_HEAD(cursor_cache);
15632 +
15633 +/* number of cursors on the list of unused cursors */
15634 +static unsigned long d_cursor_unused = 0;
15635 +
15636 +/* spinlock protecting manipulations with dir_cursor's hash table and lists */
15637 +DEFINE_SPINLOCK(d_lock);
15638 +
15639 +static reiser4_file_fsdata *create_fsdata(struct file *file);
15640 +static int file_is_stateless(struct file *file);
15641 +static void free_fsdata(reiser4_file_fsdata *fsdata);
15642 +static void kill_cursor(dir_cursor *);
15643 +
15644 +/**
15645 + * d_cursor_shrink - shrink callback for cache of dir_cursor-s
15646 + * @nr: number of objects to free
15647 + * @mask: GFP mask
15648 + *
15649 + * Shrinks d_cursor_cache: scans the LRU list of unused cursors, freeing the
15650 + * requested number. Returns the number of cursors that remain freeable.
15651 + */
15652 +static int d_cursor_shrink(int nr, gfp_t mask)
15653 +{
15654 + if (nr != 0) {
15655 + dir_cursor *scan;
15656 + int killed;
15657 +
15658 + killed = 0;
15659 + spin_lock(&d_lock);
15660 + while (!list_empty(&cursor_cache)) {
15661 + scan = list_entry(cursor_cache.next, dir_cursor, alist);
15662 + assert("nikita-3567", scan->ref == 0);
15663 + kill_cursor(scan);
15664 + ++killed;
15665 + --nr;
15666 + if (nr == 0)
15667 + break;
15668 + }
15669 + spin_unlock(&d_lock);
15670 + }
15671 + return d_cursor_unused;
15672 +}
15673 +
15674 +/**
15675 + * init_d_cursor - create d_cursor cache
15676 + *
15677 + * Initializes slab cache of d_cursors. It is part of reiser4 module
15678 + * initialization.
15679 + */
15680 +int init_d_cursor(void)
15681 +{
15682 + d_cursor_cache = kmem_cache_create("d_cursor", sizeof(dir_cursor), 0,
15683 + SLAB_HWCACHE_ALIGN, NULL, NULL);
15684 + if (d_cursor_cache == NULL)
15685 + return RETERR(-ENOMEM);
15686 +
15687 + /*
15688 + * actually, d_cursors are "priceless", because there is no way to
15689 + * recover information stored in them. On the other hand, we don't
15690 + * want to consume all kernel memory by them. As a compromise, just
15691 + * assign higher "seeks" value to d_cursor cache, so that it will be
15692 + * shrunk only if system is really tight on memory.
15693 + */
15694 + d_cursor_shrinker = set_shrinker(DEFAULT_SEEKS << 3,
15695 + d_cursor_shrink);
15696 + if (d_cursor_shrinker == NULL) {
15697 + destroy_reiser4_cache(&d_cursor_cache);
15698 + d_cursor_cache = NULL;
15699 + return RETERR(-ENOMEM);
15700 + }
15701 + return 0;
15702 +}
15703 +
15704 +/**
15705 + * done_d_cursor - delete d_cursor cache and d_cursor shrinker
15706 + *
15707 + * This is called on reiser4 module unloading or system shutdown.
15708 + */
15709 +void done_d_cursor(void)
15710 +{
15711 + BUG_ON(d_cursor_shrinker == NULL);
15712 + remove_shrinker(d_cursor_shrinker);
15713 + d_cursor_shrinker = NULL;
15714 +
15715 + destroy_reiser4_cache(&d_cursor_cache);
15716 +}
15717 +
15718 +#define D_CURSOR_TABLE_SIZE (256)
15719 +
15720 +static inline unsigned long
15721 +d_cursor_hash(d_cursor_hash_table *table, const d_cursor_key *key)
15722 +{
15723 + assert("nikita-3555", IS_POW(D_CURSOR_TABLE_SIZE));
15724 + return (key->oid + key->cid) & (D_CURSOR_TABLE_SIZE - 1);
15725 +}
15726 +
15727 +static inline int d_cursor_eq(const d_cursor_key *k1, const d_cursor_key *k2)
15728 +{
15729 + return k1->cid == k2->cid && k1->oid == k2->oid;
15730 +}
15731 +
15732 +/*
15733 + * define functions to manipulate reiser4 super block's hash table of
15734 + * dir_cursors
15735 + */
15736 +#define KMALLOC(size) kmalloc((size), get_gfp_mask())
15737 +#define KFREE(ptr, size) kfree(ptr)
15738 +TYPE_SAFE_HASH_DEFINE(d_cursor,
15739 + dir_cursor,
15740 + d_cursor_key, key, hash, d_cursor_hash, d_cursor_eq);
15741 +#undef KFREE
15742 +#undef KMALLOC
15743 +
15744 +/**
15745 + * init_super_d_info - initialize per-super-block d_cursor resources
15746 + * @super: super block to initialize
15747 + *
15748 + * Initializes per-super-block d_cursor's hash table and radix tree. It is part
15749 + * of mount.
15750 + */
15751 +int init_super_d_info(struct super_block *super)
15752 +{
15753 + d_cursor_info *p;
15754 +
15755 + p = &get_super_private(super)->d_info;
15756 +
15757 + INIT_RADIX_TREE(&p->tree, get_gfp_mask());
15758 + return d_cursor_hash_init(&p->table, D_CURSOR_TABLE_SIZE);
15759 +}
15760 +
15761 +/**
15762 + * done_super_d_info - release per-super-block d_cursor resources
15763 + * @super: super block being umounted
15764 + *
15765 + * It is called on umount. Kills all directory cursors attached to the super block.
15766 + */
15767 +void done_super_d_info(struct super_block *super)
15768 +{
15769 + d_cursor_info *d_info;
15770 + dir_cursor *cursor, *next;
15771 +
15772 + d_info = &get_super_private(super)->d_info;
15773 + for_all_in_htable(&d_info->table, d_cursor, cursor, next)
15774 + kill_cursor(cursor);
15775 +
15776 + BUG_ON(d_info->tree.rnode != NULL);
15777 + d_cursor_hash_done(&d_info->table);
15778 +}
15779 +
15780 +/**
15781 + * kill_cursor - free dir_cursor and reiser4_file_fsdata attached to it
15782 + * @cursor: cursor to free
15783 + *
15784 + * Removes reiser4_file_fsdata attached to @cursor from readdir list of
15785 + * reiser4_inode, frees that reiser4_file_fsdata. Removes @cursor from the
15786 + * index, the hash table, and the list of unused cursors, and frees it.
15787 + */
15788 +static void kill_cursor(dir_cursor *cursor)
15789 +{
15790 + unsigned long index;
15791 +
15792 + assert("nikita-3566", cursor->ref == 0);
15793 + assert("nikita-3572", cursor->fsdata != NULL);
15794 +
15795 + index = (unsigned long)cursor->key.oid;
15796 + list_del_init(&cursor->fsdata->dir.linkage);
15797 + free_fsdata(cursor->fsdata);
15798 + cursor->fsdata = NULL;
15799 +
15800 + if (list_empty_careful(&cursor->list))
15801 + /* this is last cursor for a file. Kill radix-tree entry */
15802 + radix_tree_delete(&cursor->info->tree, index);
15803 + else {
15804 + void **slot;
15805 +
15806 + /*
15807 + * there are other cursors for the same oid.
15808 + */
15809 +
15810 + /*
15811 + * if radix tree point to the cursor being removed, re-target
15812 + * radix tree slot to the next cursor in the (non-empty as was
15813 + * checked above) element of the circular list of all cursors
15814 + * for this oid.
15815 + */
15816 + slot = radix_tree_lookup_slot(&cursor->info->tree, index);
15817 + assert("nikita-3571", *slot != NULL);
15818 + if (*slot == cursor)
15819 + *slot = list_entry(cursor->list.next, dir_cursor, list);
15820 + /* remove cursor from circular list */
15821 + list_del_init(&cursor->list);
15822 + }
15823 + /* remove cursor from the list of unused cursors */
15824 + list_del_init(&cursor->alist);
15825 + /* remove cursor from the hash table */
15826 + d_cursor_hash_remove(&cursor->info->table, cursor);
15827 + /* and free it */
15828 + kmem_cache_free(d_cursor_cache, cursor);
15829 + --d_cursor_unused;
15830 +}
15831 +
15832 +/* possible actions that can be performed on all cursors for the given file */
15833 +enum cursor_action {
15834 + /*
15835 + * load all detached state: this is called when stat-data is loaded
15836 + * from the disk to recover information about all pending readdirs
15837 + */
15838 + CURSOR_LOAD,
15839 + /*
15840 + * detach all state from the inode, leaving it in the cache. This is
15841 + * called when the inode is removed from memory by memory pressure
15842 + */
15843 + CURSOR_DISPOSE,
15844 + /*
15845 + * detach cursors from the inode, and free them. This is called when
15846 + * inode is destroyed
15847 + */
15848 + CURSOR_KILL
15849 +};
15850 +
15851 +/*
15852 + * return d_cursor data for the file system @inode is in.
15853 + */
15854 +static inline d_cursor_info *d_info(struct inode *inode)
15855 +{
15856 + return &get_super_private(inode->i_sb)->d_info;
15857 +}
15858 +
15859 +/*
15860 + * lookup d_cursor in the per-super-block radix tree.
15861 + */
15862 +static inline dir_cursor *lookup(d_cursor_info * info, unsigned long index)
15863 +{
15864 + return (dir_cursor *) radix_tree_lookup(&info->tree, index);
15865 +}
15866 +
15867 +/*
15868 + * attach @cursor to the radix tree. There may be multiple cursors for the
15869 + * same oid, they are chained into circular list.
15870 + */
15871 +static void bind_cursor(dir_cursor * cursor, unsigned long index)
15872 +{
15873 + dir_cursor *head;
15874 +
15875 + head = lookup(cursor->info, index);
15876 + if (head == NULL) {
15877 + /* this is the first cursor for this index */
15878 + INIT_LIST_HEAD(&cursor->list);
15879 + radix_tree_insert(&cursor->info->tree, index, cursor);
15880 + } else {
15881 + /* some cursor already exists. Chain ours */
15882 + list_add(&cursor->list, &head->list);
15883 + }
15884 +}
15885 +
15886 +/*
15887 + * detach fsdata (if detachable) from the file descriptor, and put the cursor on
15888 + * the "unused" list. Called when the file descriptor is no longer in active use.
15889 + */
15890 +static void clean_fsdata(struct file *file)
15891 +{
15892 + dir_cursor *cursor;
15893 + reiser4_file_fsdata *fsdata;
15894 +
15895 + assert("nikita-3570", file_is_stateless(file));
15896 +
15897 + fsdata = (reiser4_file_fsdata *) file->private_data;
15898 + if (fsdata != NULL) {
15899 + cursor = fsdata->cursor;
15900 + if (cursor != NULL) {
15901 + spin_lock(&d_lock);
15902 + --cursor->ref;
15903 + if (cursor->ref == 0) {
15904 + list_add_tail(&cursor->alist, &cursor_cache);
15905 + ++d_cursor_unused;
15906 + }
15907 + spin_unlock(&d_lock);
15908 + file->private_data = NULL;
15909 + }
15910 + }
15911 +}
15912 +
15913 +/*
15914 + * global counter used to generate "client ids". These ids are encoded into
15915 + * high bits of fpos.
15916 + */
15917 +static __u32 cid_counter = 0;
15918 +#define CID_SHIFT (20)
15919 +#define CID_MASK (0xfffffull)
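+/*
+ * Illustrative sketch (a worked example of the encoding, not extra logic):
+ * with CID_SHIFT == 20, a readdir cookie is assembled and split as
+ *
+ *	cookie = ((__u64)cid << CID_SHIFT) | (pos & CID_MASK);
+ *	cid    = cookie >> CID_SHIFT;
+ *	pos    = cookie & CID_MASK;
+ *
+ * so the low 20 bits carry the position within the directory and the bits
+ * above CID_SHIFT carry the client id.
+ */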
15920 +
15921 +static void free_file_fsdata_nolock(struct file *);
15922 +
15923 +/**
15924 + * insert_cursor - allocate file_fsdata, insert cursor into tree and hash table
15925 + * @cursor: cursor to insert
15926 + * @file: file to attach detached readdir state to
15927 + * @inode: inode of the directory being read
15928 + *
15929 + * Allocates reiser4_file_fsdata, attaches it to @cursor, inserts the
15930 + * cursor into the reiser4 super block's hash table and radix tree, and
15931 + * adds detachable readdir state to @file, encoding the new client id
15932 + * into the high bits of @file->f_pos.
15933 + */
15934 +static int insert_cursor(dir_cursor *cursor, struct file *file,
15935 + struct inode *inode)
15936 +{
15937 + int result;
15938 + reiser4_file_fsdata *fsdata;
15939 +
15940 + memset(cursor, 0, sizeof *cursor);
15941 +
15942 +	/* this is either the first call to readdir, or a rewind. Either way,
15943 +	 * create a new cursor. */
15944 + fsdata = create_fsdata(NULL);
15945 + if (fsdata != NULL) {
15946 + result = radix_tree_preload(get_gfp_mask());
15947 + if (result == 0) {
15948 + d_cursor_info *info;
15949 + oid_t oid;
15950 +
15951 + info = d_info(inode);
15952 + oid = get_inode_oid(inode);
15953 +			/* cid occupies the bits of f->f_pos above CID_SHIFT.
15954 +			 * Keep it to 11 bits so that f_pos cannot become
15955 +			 * negative: that confuses nfsd_readdir() */
15956 + cursor->key.cid = (++cid_counter) & 0x7ff;
15957 + cursor->key.oid = oid;
15958 + cursor->fsdata = fsdata;
15959 + cursor->info = info;
15960 + cursor->ref = 1;
15961 +
15962 + spin_lock_inode(inode);
15963 +			/* install cursor as @file's private_data, discarding
15964 +			 * the old one if necessary */
15965 +#if REISER4_DEBUG
15966 + if (file->private_data)
15967 + warning("", "file has fsdata already");
15968 +#endif
15969 + clean_fsdata(file);
15970 + free_file_fsdata_nolock(file);
15971 + file->private_data = fsdata;
15972 + fsdata->cursor = cursor;
15973 + spin_unlock_inode(inode);
15974 + spin_lock(&d_lock);
15975 + /* insert cursor into hash table */
15976 + d_cursor_hash_insert(&info->table, cursor);
15977 + /* and chain it into radix-tree */
15978 + bind_cursor(cursor, (unsigned long)oid);
15979 + spin_unlock(&d_lock);
15980 + radix_tree_preload_end();
15981 + file->f_pos = ((__u64) cursor->key.cid) << CID_SHIFT;
15982 + }
15983 + } else
15984 + result = RETERR(-ENOMEM);
15985 + return result;
15986 +}
15987 +
15988 +/**
15989 + * process_cursors - do action on each cursor attached to inode
15990 + * @inode:
15991 + * @act: action to do
15992 + *
15993 + * Finds all cursors of @inode in reiser4's super block radix tree of cursors
15994 + * and performs action specified by @act on each of cursors.
15995 + */
15996 +static void process_cursors(struct inode *inode, enum cursor_action act)
15997 +{
15998 + oid_t oid;
15999 + dir_cursor *start;
16000 + struct list_head *head;
16001 + reiser4_context *ctx;
16002 + d_cursor_info *info;
16003 +
16004 + /* this can be called by
16005 + *
16006 + * kswapd->...->prune_icache->..reiser4_destroy_inode
16007 + *
16008 + * without reiser4_context
16009 + */
16010 + ctx = init_context(inode->i_sb);
16011 + if (IS_ERR(ctx)) {
16012 + warning("vs-23", "failed to init context");
16013 + return;
16014 + }
16015 +
16016 + assert("nikita-3558", inode != NULL);
16017 +
16018 + info = d_info(inode);
16019 + oid = get_inode_oid(inode);
16020 + spin_lock_inode(inode);
16021 + head = get_readdir_list(inode);
16022 + spin_lock(&d_lock);
16023 +	/* find any cursor for this oid: a reference to it hangs off the
16024 +	 * radix tree */
16025 + start = lookup(info, (unsigned long)oid);
16026 + if (start != NULL) {
16027 + dir_cursor *scan;
16028 + reiser4_file_fsdata *fsdata;
16029 +
16030 + /* process circular list of cursors for this oid */
16031 + scan = start;
16032 + do {
16033 + dir_cursor *next;
16034 +
16035 + next = list_entry(scan->list.next, dir_cursor, list);
16036 + fsdata = scan->fsdata;
16037 + assert("nikita-3557", fsdata != NULL);
16038 + if (scan->key.oid == oid) {
16039 + switch (act) {
16040 + case CURSOR_DISPOSE:
16041 + list_del_init(&fsdata->dir.linkage);
16042 + break;
16043 + case CURSOR_LOAD:
16044 + list_add(&fsdata->dir.linkage, head);
16045 + break;
16046 + case CURSOR_KILL:
16047 + kill_cursor(scan);
16048 + break;
16049 + }
16050 + }
16051 + if (scan == next)
16052 + /* last cursor was just killed */
16053 + break;
16054 + scan = next;
16055 + } while (scan != start);
16056 + }
16057 + spin_unlock(&d_lock);
16058 + /* check that we killed 'em all */
16059 + assert("nikita-3568",
16060 + ergo(act == CURSOR_KILL,
16061 + list_empty_careful(get_readdir_list(inode))));
16062 + assert("nikita-3569",
16063 + ergo(act == CURSOR_KILL, lookup(info, oid) == NULL));
16064 + spin_unlock_inode(inode);
16065 + reiser4_exit_context(ctx);
16066 +}
16067 +
16068 +/**
16069 + * dispose_cursors - removes cursors from inode's list
16070 + * @inode: inode to dispose cursors of
16071 + *
16072 + * For each cursor corresponding to @inode, removes the reiser4_file_fsdata
16073 + * attached to the cursor from the inode's readdir list. This is called when
16074 + * the inode is evicted from memory by memory pressure.
16075 + */
16076 +void dispose_cursors(struct inode *inode)
16077 +{
16078 + process_cursors(inode, CURSOR_DISPOSE);
16079 +}
16080 +
16081 +/**
16082 + * load_cursors - attach cursors to inode
16083 + * @inode: inode to load cursors to
16084 + *
16085 + * For each cursor corresponding to @inode, attaches the reiser4_file_fsdata
16086 + * of the cursor to the inode's readdir list. This is done when the inode is
16087 + * loaded into memory.
16088 + */
16089 +void load_cursors(struct inode *inode)
16090 +{
16091 + process_cursors(inode, CURSOR_LOAD);
16092 +}
16093 +
16094 +/**
16095 + * kill_cursors - kill all inode cursors
16096 + * @inode: inode to kill cursors of
16097 + *
16098 + * Frees all cursors for this inode. This is called when inode is destroyed.
16099 + */
16100 +void kill_cursors(struct inode *inode)
16101 +{
16102 + process_cursors(inode, CURSOR_KILL);
16103 +}
16104 +
16105 +/**
16106 + * file_is_stateless - check whether file was created by NFS on demand
16107 + * @file: file to check
16108 + *
16109 + * Returns true if file descriptor @file was created by the NFS server on
16110 + * demand to serve one file system operation. This means that there may be
16111 + * "detached state" for the underlying inode.
16112 + */
16113 +static int file_is_stateless(struct file *file)
16114 +{
16115 + return reiser4_get_dentry_fsdata(file->f_dentry)->stateless;
16116 +}
16117 +
16118 +/**
16119 + * get_dir_fpos - get logical position within directory
16120 + * @dir: directory file being read
16121 + *
16122 + * Calculates ->fpos from the user-supplied cookie. Normally it is dir->f_pos,
16123 + * but in the case of stateless directory operation (readdir-over-nfs) the
16124 + * client id was encoded in the high bits of the cookie and has to be masked off.
16125 + */
16126 +loff_t get_dir_fpos(struct file *dir)
16127 +{
16128 + if (file_is_stateless(dir))
16129 + return dir->f_pos & CID_MASK;
16130 + else
16131 + return dir->f_pos;
16132 +}
16133 +
16134 +/**
16135 + * try_to_attach_fsdata - attach detached readdir state to file
16136 + * @file: file to attach fsdata to
16137 + * @inode: inode of the directory being read
16138 + *
16139 + * Finds or creates cursor for readdir-over-nfs.
16140 + */
16141 +int try_to_attach_fsdata(struct file *file, struct inode *inode)
16142 +{
16143 + loff_t pos;
16144 + int result;
16145 + dir_cursor *cursor;
16146 +
16147 + /*
16148 + * we are serialized by inode->i_mutex
16149 + */
16150 + if (!file_is_stateless(file))
16151 + return 0;
16152 +
16153 + pos = file->f_pos;
16154 + result = 0;
16155 + if (pos == 0) {
16156 + /*
16157 + * first call to readdir (or rewind to the beginning of
16158 + * directory)
16159 + */
16160 + cursor = kmem_cache_alloc(d_cursor_cache, get_gfp_mask());
16161 + if (cursor != NULL)
16162 + result = insert_cursor(cursor, file, inode);
16163 + else
16164 + result = RETERR(-ENOMEM);
16165 + } else {
16166 + /* try to find existing cursor */
16167 + d_cursor_key key;
16168 +
16169 + key.cid = pos >> CID_SHIFT;
16170 + key.oid = get_inode_oid(inode);
16171 + spin_lock(&d_lock);
16172 + cursor = d_cursor_hash_find(&d_info(inode)->table, &key);
16173 + if (cursor != NULL) {
16174 + /* cursor was found */
16175 + if (cursor->ref == 0) {
16176 + /* move it from unused list */
16177 + list_del_init(&cursor->alist);
16178 + --d_cursor_unused;
16179 + }
16180 + ++cursor->ref;
16181 + }
16182 + spin_unlock(&d_lock);
16183 + if (cursor != NULL) {
16184 + spin_lock_inode(inode);
16185 + assert("nikita-3556", cursor->fsdata->back == NULL);
16186 + clean_fsdata(file);
16187 + free_file_fsdata_nolock(file);
16188 + file->private_data = cursor->fsdata;
16189 + spin_unlock_inode(inode);
16190 + }
16191 + }
16192 + return result;
16193 +}
16194 +
16195 +/**
16196 + * detach_fsdata - detach fsdata from stateless file
16197 + * @file: file to detach fsdata from
16198 + *
16199 + * Detaches fsdata, if @file is stateless (readdir-over-nfs).
16200 + */
16201 +void detach_fsdata(struct file *file)
16202 +{
16203 + struct inode *inode;
16204 +
16205 + if (!file_is_stateless(file))
16206 + return;
16207 +
16208 + inode = file->f_dentry->d_inode;
16209 + spin_lock_inode(inode);
16210 + clean_fsdata(file);
16211 + spin_unlock_inode(inode);
16212 +}
16213 +
16214 +/* slab for reiser4_dentry_fsdata */
16215 +static kmem_cache_t *dentry_fsdata_cache;
16216 +
16217 +/**
16218 + * init_dentry_fsdata - create cache of dentry_fsdata
16219 + *
16220 + * Initializes slab cache of structures attached to dentry->d_fsdata. It is
16221 + * part of reiser4 module initialization.
16222 + */
16223 +int init_dentry_fsdata(void)
16224 +{
16225 + dentry_fsdata_cache = kmem_cache_create("dentry_fsdata",
16226 + sizeof(reiser4_dentry_fsdata),
16227 + 0,
16228 + SLAB_HWCACHE_ALIGN |
16229 + SLAB_RECLAIM_ACCOUNT, NULL,
16230 + NULL);
16231 + if (dentry_fsdata_cache == NULL)
16232 + return RETERR(-ENOMEM);
16233 + return 0;
16234 +}
16235 +
16236 +/**
16237 + * done_dentry_fsdata - delete cache of dentry_fsdata
16238 + *
16239 + * This is called on reiser4 module unloading or system shutdown.
16240 + */
16241 +void done_dentry_fsdata(void)
16242 +{
16243 + destroy_reiser4_cache(&dentry_fsdata_cache);
16244 +}
16245 +
16246 +/**
16247 + * reiser4_get_dentry_fsdata - get fs-specific dentry data
16248 + * @dentry: queried dentry
16249 + *
16250 + * Allocates if necessary and returns per-dentry data that we attach to each
16251 + * dentry.
16252 + */
16253 +reiser4_dentry_fsdata *reiser4_get_dentry_fsdata(struct dentry *dentry)
16254 +{
16255 + assert("nikita-1365", dentry != NULL);
16256 +
16257 + if (dentry->d_fsdata == NULL) {
16258 + dentry->d_fsdata = kmem_cache_alloc(dentry_fsdata_cache,
16259 + get_gfp_mask());
16260 + if (dentry->d_fsdata == NULL)
16261 + return ERR_PTR(RETERR(-ENOMEM));
16262 + memset(dentry->d_fsdata, 0, sizeof(reiser4_dentry_fsdata));
16263 + }
16264 + return dentry->d_fsdata;
16265 +}
16266 +
16267 +/**
16268 + * reiser4_free_dentry_fsdata - detach and free dentry_fsdata
16269 + * @dentry: dentry to free fsdata of
16270 + *
16271 + * Detaches and frees fs-specific dentry data
16272 + */
16273 +void reiser4_free_dentry_fsdata(struct dentry *dentry)
16274 +{
16275 + if (dentry->d_fsdata != NULL) {
16276 + kmem_cache_free(dentry_fsdata_cache, dentry->d_fsdata);
16277 + dentry->d_fsdata = NULL;
16278 + }
16279 +}
16280 +
16281 +
16282 +/* slab for reiser4_file_fsdata */
16283 +static kmem_cache_t *file_fsdata_cache;
16284 +
16285 +/**
16286 + * init_file_fsdata - create cache of reiser4_file_fsdata
16287 + *
16288 + * Initializes slab cache of structures attached to file->private_data. It is
16289 + * part of reiser4 module initialization.
16290 + */
16291 +int init_file_fsdata(void)
16292 +{
16293 + file_fsdata_cache = kmem_cache_create("file_fsdata",
16294 + sizeof(reiser4_file_fsdata),
16295 + 0,
16296 + SLAB_HWCACHE_ALIGN |
16297 + SLAB_RECLAIM_ACCOUNT, NULL, NULL);
16298 + if (file_fsdata_cache == NULL)
16299 + return RETERR(-ENOMEM);
16300 + return 0;
16301 +}
16302 +
16303 +/**
16304 + * done_file_fsdata - delete cache of reiser4_file_fsdata
16305 + *
16306 + * This is called on reiser4 module unloading or system shutdown.
16307 + */
16308 +void done_file_fsdata(void)
16309 +{
16310 + destroy_reiser4_cache(&file_fsdata_cache);
16311 +}
16312 +
16313 +/**
16314 + * create_fsdata - allocate and initialize reiser4_file_fsdata
16315 + * @file: what to create file_fsdata for, may be NULL
16316 + *
16317 + * Allocates and initializes reiser4_file_fsdata structure.
16318 + */
16319 +static reiser4_file_fsdata *create_fsdata(struct file *file)
16320 +{
16321 + reiser4_file_fsdata *fsdata;
16322 +
16323 + fsdata = kmem_cache_alloc(file_fsdata_cache, get_gfp_mask());
16324 + if (fsdata != NULL) {
16325 + memset(fsdata, 0, sizeof *fsdata);
16326 + fsdata->ra1.max_window_size = VM_MAX_READAHEAD * 1024;
16327 + fsdata->back = file;
16328 + INIT_LIST_HEAD(&fsdata->dir.linkage);
16329 + }
16330 + return fsdata;
16331 +}
16332 +
16333 +/**
16334 + * free_fsdata - free reiser4_file_fsdata
16335 + * @fsdata: object to free
16336 + *
16337 + * Dual to create_fsdata(). Free reiser4_file_fsdata.
16338 + */
16339 +static void free_fsdata(reiser4_file_fsdata *fsdata)
16340 +{
16341 + BUG_ON(fsdata == NULL);
16342 + kmem_cache_free(file_fsdata_cache, fsdata);
16343 +}
16344 +
16345 +/**
16346 + * reiser4_get_file_fsdata - get fs-specific file data
16347 + * @file: queried file
16348 + *
16349 + * Returns fs-specific data of @file. If it is NULL, allocates it and attaches
16350 + * to @file.
16351 + */
16352 +reiser4_file_fsdata *reiser4_get_file_fsdata(struct file *file)
16353 +{
16354 + assert("nikita-1603", file != NULL);
16355 +
16356 + if (file->private_data == NULL) {
16357 + reiser4_file_fsdata *fsdata;
16358 + struct inode *inode;
16359 +
16360 + fsdata = create_fsdata(file);
16361 + if (fsdata == NULL)
16362 + return ERR_PTR(RETERR(-ENOMEM));
16363 +
16364 + inode = file->f_dentry->d_inode;
16365 + spin_lock_inode(inode);
16366 + if (file->private_data == NULL) {
16367 + file->private_data = fsdata;
16368 + fsdata = NULL;
16369 + }
16370 + spin_unlock_inode(inode);
16371 + if (fsdata != NULL)
16372 + /* other thread initialized ->fsdata */
16373 + kmem_cache_free(file_fsdata_cache, fsdata);
16374 + }
16375 + assert("nikita-2665", file->private_data != NULL);
16376 + return file->private_data;
16377 +}
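+/*
+ * note on the race handling above (illustrative): two threads may both see
+ * private_data == NULL and both allocate a reiser4_file_fsdata; the
+ * spin_lock_inode() section lets only one installation win, and the loser
+ * frees its now-unneeded allocation.
+ */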
16378 +
16379 +/**
16380 + * free_file_fsdata_nolock - detach and free reiser4_file_fsdata
16381 + * @file:
16382 + *
16383 + * Detaches reiser4_file_fsdata from @file, removes it from the readdir
16384 + * list, and frees it if it is not linked to a d_cursor object.
16385 + */
16386 +static void free_file_fsdata_nolock(struct file *file)
16387 +{
16388 + reiser4_file_fsdata *fsdata;
16389 +
16390 + assert("", spin_inode_is_locked(file->f_dentry->d_inode));
16391 + fsdata = file->private_data;
16392 + if (fsdata != NULL) {
16393 + list_del_init(&fsdata->dir.linkage);
16394 + if (fsdata->cursor == NULL)
16395 + free_fsdata(fsdata);
16396 + }
16397 + file->private_data = NULL;
16398 +}
16399 +
16400 +/**
16401 + * reiser4_free_file_fsdata - detach from struct file and free reiser4_file_fsdata
16402 + * @file:
16403 + *
16404 + * Spinlocks inode and calls free_file_fsdata_nolock to do the work.
16405 + */
16406 +void reiser4_free_file_fsdata(struct file *file)
16407 +{
16408 + spin_lock_inode(file->f_dentry->d_inode);
16409 + free_file_fsdata_nolock(file);
16410 + spin_unlock_inode(file->f_dentry->d_inode);
16411 +}
16412 +
16413 +/*
16414 + * Local variables:
16415 + * c-indentation-style: "K&R"
16416 + * mode-name: "LC"
16417 + * c-basic-offset: 8
16418 + * tab-width: 8
16419 + * fill-column: 79
16420 + * End:
16421 + */
16422 Index: linux-2.6.16/fs/reiser4/fsdata.h
16423 ===================================================================
16424 --- /dev/null
16425 +++ linux-2.6.16/fs/reiser4/fsdata.h
16426 @@ -0,0 +1,218 @@
16427 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
16428 + * reiser4/README */
16429 +
16430 +#if !defined( __REISER4_FSDATA_H__ )
16431 +#define __REISER4_FSDATA_H__
16432 +
16433 +#include "debug.h"
16434 +#include "kassign.h"
16435 +#include "seal.h"
16436 +#include "type_safe_hash.h"
16437 +#include "plugin/file/file.h"
16438 +#include "readahead.h"
16439 +
16440 +/*
16441 + * reiser4_dentry_fsdata is the reiser4-specific state hung off
16442 + * dentry->d_fsdata. It is allocated lazily on first use and released in
16443 + * ->d_release(); see the structure definition below.
16444 + */
16445 +
16446 +/*
16447 + * locking: the per-file-descriptor readdir_pos fields and ->f_pos are
16448 + * protected by ->i_mutex on the inode. Under this lock the following
16449 + * invariant holds:
16450 + *
16451 + *     the file descriptor is "looking" at the entry_no-th directory entry
16452 + *     from the beginning of the directory. This entry has key dir_entry_key
16453 + *     and is the pos-th entry within its duplicate-key sequence.
16454 + *
16455 + */
16456 +
16457 +/* logical position within directory */
16458 +typedef struct {
16459 + /* key of directory entry (actually, part of a key sufficient to
16460 + identify directory entry) */
16461 + de_id dir_entry_key;
16462 + /* ordinal number of directory entry among all entries with the same
16463 + key. (Starting from 0.) */
16464 + unsigned pos;
16465 +} dir_pos;
16466 +
16467 +typedef struct {
16468 + /* f_pos corresponding to this readdir position */
16469 + __u64 fpos;
16470 + /* logical position within directory */
16471 + dir_pos position;
16472 + /* logical number of directory entry within
16473 + directory */
16474 + __u64 entry_no;
16475 +} readdir_pos;
16476 +
16477 +/*
16478 + * this is used to speed up lookups for directory entry: on initial call to
16479 + * ->lookup() seal and coord of directory entry (if found, that is) are stored
16480 + * in struct dentry and reused later to avoid tree traversals.
16481 + */
16482 +typedef struct de_location {
16483 + /* seal covering directory entry */
16484 + seal_t entry_seal;
16485 + /* coord of directory entry */
16486 + coord_t entry_coord;
16487 + /* ordinal number of directory entry among all entries with the same
16488 + key. (Starting from 0.) */
16489 + int pos;
16490 +} de_location;
16491 +
16492 +/**
16493 + * reiser4_dentry_fsdata - reiser4-specific data attached to dentries
16494 + *
16495 + * This is allocated dynamically and released in d_op->d_release()
16496 + *
16497 + * Currently it only contains cached location (hint) of directory entry, but
16498 + * it is expected that other information will be accumulated here.
16499 + */
16500 +typedef struct reiser4_dentry_fsdata {
16501 + /*
16502 +	 * here will go fields filled by ->lookup() to speed up the next
16503 + * create/unlink, like blocknr of znode with stat-data, or key of
16504 + * stat-data.
16505 + */
16506 + de_location dec;
16507 + int stateless; /* created through reiser4_decode_fh, needs special
16508 + * treatment in readdir. */
16509 +} reiser4_dentry_fsdata;
16510 +
16511 +extern int init_dentry_fsdata(void);
16512 +extern void done_dentry_fsdata(void);
16513 +extern reiser4_dentry_fsdata *reiser4_get_dentry_fsdata(struct dentry *);
16514 +extern void reiser4_free_dentry_fsdata(struct dentry *dentry);
16515 +
16516 +
16517 +/**
16518 + * reiser4_file_fsdata - reiser4-specific data attached to file->private_data
16519 + *
16520 + * This is allocated dynamically and released in inode->i_fop->release
16521 + */
16522 +typedef struct reiser4_file_fsdata {
16523 + /*
16524 + * pointer back to the struct file which this reiser4_file_fsdata is
16525 + * part of
16526 + */
16527 + struct file *back;
16528 + /* detached cursor for stateless readdir. */
16529 + struct dir_cursor *cursor;
16530 + /*
16531 + * We need both directory and regular file parts here, because there
16532 +	 * are file system objects that are both files and directories.
16533 + */
16534 + struct {
16535 + /*
16536 + * position in directory. It is updated each time directory is
16537 + * modified
16538 + */
16539 + readdir_pos readdir;
16540 + /* head of this list is reiser4_inode->lists.readdir_list */
16541 + struct list_head linkage;
16542 + } dir;
16543 + /* hints to speed up operations with regular files: read and write. */
16544 + struct {
16545 + hint_t hint;
16546 + } reg;
16547 +	/* readahead hooks used by reiser4_readpages */
16548 + struct {
16549 + /* this is called by reiser4_readpages if set */
16550 + void (*readpages) (struct address_space *,
16551 + struct list_head * pages, void *data);
16552 +		/* data passed to ->readpages(): an extended coord. It is set
16553 +		   by read_extent before calling page_cache_readahead */
16554 + void *data;
16555 + } ra2;
16556 + struct reiser4_file_ra_state ra1;
16557 +
16558 +} reiser4_file_fsdata;
16559 +
16560 +extern int init_file_fsdata(void);
16561 +extern void done_file_fsdata(void);
16562 +extern reiser4_file_fsdata *reiser4_get_file_fsdata(struct file *);
16563 +extern void reiser4_free_file_fsdata(struct file *);
16564 +
16565 +
16566 +/*
16567 + * d_cursor is reiser4_file_fsdata not attached to struct file. d_cursors are
16568 + * used to address the problem reiser4 has with readdir accesses over NFS. See
16569 + * plugin/file_ops_readdir.c for more details.
16570 + */
16571 +typedef struct {
16572 + __u16 cid;
16573 + __u64 oid;
16574 +} d_cursor_key;
16575 +
16576 +/*
16577 + * define structures d_cursor_hash_table d_cursor_hash_link which are used to
16578 + * maintain hash table of dir_cursor-s in reiser4's super block
16579 + */
16580 +typedef struct dir_cursor dir_cursor;
16581 +TYPE_SAFE_HASH_DECLARE(d_cursor, dir_cursor);
16582 +
16583 +typedef struct d_cursor_info d_cursor_info;
16584 +
16585 +struct dir_cursor {
16586 + int ref;
16587 + reiser4_file_fsdata *fsdata;
16588 +
16589 + /* link to reiser4 super block hash table of cursors */
16590 + d_cursor_hash_link hash;
16591 +
16592 + /*
16593 + * this is to link cursors to reiser4 super block's radix tree of
16594 +	 * cursors if there is more than one cursor with the same objectid
16595 + */
16596 + struct list_head list;
16597 + d_cursor_key key;
16598 + d_cursor_info *info;
16599 + /* list of unused cursors */
16600 + struct list_head alist;
16601 +};
16602 +
16603 +extern int init_d_cursor(void);
16604 +extern void done_d_cursor(void);
16605 +
16606 +extern int init_super_d_info(struct super_block *);
16607 +extern void done_super_d_info(struct super_block *);
16608 +
16609 +extern loff_t get_dir_fpos(struct file *);
16610 +extern int try_to_attach_fsdata(struct file *, struct inode *);
16611 +extern void detach_fsdata(struct file *);
16612 +
16613 +
16614 +/* these are needed for "stateless" readdir. See plugin/file_ops_readdir.c for
16615 + more details */
16616 +void dispose_cursors(struct inode *inode);
16617 +void load_cursors(struct inode *inode);
16618 +void kill_cursors(struct inode *inode);
16619 +void adjust_dir_file(struct inode *dir, const struct dentry *de, int offset, int adj);
16620 +
16621 +/*
16622 + * this structure is embedded into reiser4_super_info_data. It maintains d_cursors
16623 + * (detached readdir state). See plugin/file_ops_readdir.c for more details.
16624 + */
16625 +struct d_cursor_info {
16626 + d_cursor_hash_table table;
16627 + struct radix_tree_root tree;
16628 +};
16629 +
16630 +/* spinlock protecting readdir cursors */
16631 +extern spinlock_t d_lock;
16632 +
16633 +/* __REISER4_FSDATA_H__ */
16634 +#endif
16635 +
16636 +/*
16637 + * Local variables:
16638 + * c-indentation-style: "K&R"
16639 + * mode-name: "LC"
16640 + * c-basic-offset: 8
16641 + * tab-width: 8
16642 + * fill-column: 120
16643 + * End:
16644 + */
16645 Index: linux-2.6.16/fs/reiser4/init_super.c
16646 ===================================================================
16647 --- /dev/null
16648 +++ linux-2.6.16/fs/reiser4/init_super.c
16649 @@ -0,0 +1,739 @@
16650 +/* Copyright by Hans Reiser, 2003 */
16651 +
16652 +#include "super.h"
16653 +#include "inode.h"
16654 +#include "plugin/plugin_set.h"
16655 +
16656 +#include <linux/swap.h>
16657 +
16658 +
16659 +/**
16660 + * init_fs_info - allocate reiser4 specific super block
16661 + * @super: super block of filesystem
16662 + *
16663 + * Allocates and initializes reiser4_super_info_data, attaches it to
16664 + * super->s_fs_info, initializes structures maintaining d_cursor-s.
16665 + */
16666 +int init_fs_info(struct super_block *super)
16667 +{
16668 + reiser4_super_info_data *sbinfo;
16669 +
16670 + sbinfo = kmalloc(sizeof(reiser4_super_info_data), get_gfp_mask());
16671 + if (!sbinfo)
16672 + return RETERR(-ENOMEM);
16673 +
16674 + super->s_fs_info = sbinfo;
16675 + super->s_op = NULL;
16676 + memset(sbinfo, 0, sizeof(*sbinfo));
16677 +
16678 + ON_DEBUG(INIT_LIST_HEAD(&sbinfo->all_jnodes));
16679 + ON_DEBUG(spin_lock_init(&sbinfo->all_guard));
16680 +
16681 + sema_init(&sbinfo->delete_sema, 1);
16682 + sema_init(&sbinfo->flush_sema, 1);
16683 + spin_lock_init(&(sbinfo->guard));
16684 +
16685 + /* initialize per-super-block d_cursor resources */
16686 + init_super_d_info(super);
16687 +
16688 + return 0;
16689 +}
16690 +
16691 +/**
16692 + * done_fs_info - free reiser4 specific super block
16693 + * @super: super block of filesystem
16694 + *
16695 + * Performs some sanity checks, releases structures maintaining d_cursor-s,
16696 + * frees reiser4_super_info_data.
16697 + */
16698 +void done_fs_info(struct super_block *super)
16699 +{
16700 + assert("zam-990", super->s_fs_info != NULL);
16701 +
16702 + /* release per-super-block d_cursor resources */
16703 + done_super_d_info(super);
16704 +
16705 + /* make sure that there are not jnodes already */
16706 +	/* make sure that no jnodes remain */
16707 + assert("", get_current_context()->trans->atom == NULL);
16708 + check_block_counters(super);
16709 + kfree(super->s_fs_info);
16710 + super->s_fs_info = NULL;
16711 +}
16712 +
16713 +/* type of option parseable by parse_option() */
16714 +typedef enum {
16715 + /* value of option is arbitrary string */
16716 + OPT_STRING,
16717 +
16718 + /*
16719 + * option specifies bit in a bitmask. When option is set - bit in
16720 + * sbinfo->fs_flags is set. Examples are bsdgroups, 32bittimes, mtflush,
16721 + * dont_load_bitmap, atomic_write.
16722 + */
16723 + OPT_BIT,
16724 +
16725 + /*
16726 + * value of option should conform to sprintf() format. Examples are
16727 + * tmgr.atom_max_size=N, tmgr.atom_max_age=N
16728 + */
16729 + OPT_FORMAT,
16730 +
16731 + /*
16732 + * option can take one of predefined values. Example is onerror=panic or
16733 + * onerror=remount-ro
16734 + */
16735 + OPT_ONEOF,
16736 +} opt_type_t;
16737 +
16738 +typedef struct opt_bitmask_bit {
16739 + const char *bit_name;
16740 + int bit_nr;
16741 +} opt_bitmask_bit;
16742 +
16743 +/* description of option parseable by parse_option() */
16744 +typedef struct opt_desc {
16745 + /* option name.
16746 +
16747 +	   the parsed portion of the string has the form "name=value".
16748 + */
16749 + const char *name;
16750 + /* type of option */
16751 + opt_type_t type;
16752 + union {
16753 + /* where to store value of string option (type == OPT_STRING) */
16754 + char **string;
16755 + /* description of bits for bit option (type == OPT_BIT) */
16756 + struct {
16757 + int nr;
16758 + void *addr;
16759 + } bit;
16760 + /* description of format and targets for format option (type
16761 + == OPT_FORMAT) */
16762 + struct {
16763 + const char *format;
16764 + int nr_args;
16765 + void *arg1;
16766 + void *arg2;
16767 + void *arg3;
16768 + void *arg4;
16769 + } f;
16770 + struct {
16771 + int *result;
16772 + const char *list[10];
16773 + } oneof;
16774 + struct {
16775 + void *addr;
16776 + int nr_bits;
16777 + opt_bitmask_bit *bits;
16778 + } bitmask;
16779 + } u;
16780 +} opt_desc_t;
16781 +
16782 +/**
16783 + * parse_option - parse one option
16784 + * @opt_string: starting point of parsing
16785 + * @opt: option description
16786 + *
16787 + * foo=bar,
16788 + * ^ ^ ^
16789 + * |   |  +-- replaced with '\0'
16790 + * | +-- val_start
16791 + * +-- opt_string
16792 + * Figures out the option type and handles the option accordingly.
16793 + */
16794 +static int parse_option(char *opt_string, opt_desc_t *opt)
16795 +{
16796 + char *val_start;
16797 + int result;
16798 + const char *err_msg;
16799 +
16800 + /* NOTE-NIKITA think about using lib/cmdline.c functions here. */
16801 +
16802 + val_start = strchr(opt_string, '=');
16803 + if (val_start != NULL) {
16804 + *val_start = '\0';
16805 + ++val_start;
16806 + }
16807 +
16808 + err_msg = NULL;
16809 + result = 0;
16810 + switch (opt->type) {
16811 + case OPT_STRING:
16812 + if (val_start == NULL) {
16813 + err_msg = "String arg missing";
16814 + result = RETERR(-EINVAL);
16815 + } else
16816 + *opt->u.string = val_start;
16817 + break;
16818 + case OPT_BIT:
16819 + if (val_start != NULL)
16820 + err_msg = "Value ignored";
16821 + else
16822 + set_bit(opt->u.bit.nr, opt->u.bit.addr);
16823 + break;
16824 + case OPT_FORMAT:
16825 + if (val_start == NULL) {
16826 + err_msg = "Formatted arg missing";
16827 + result = RETERR(-EINVAL);
16828 + break;
16829 + }
16830 + if (sscanf(val_start, opt->u.f.format,
16831 + opt->u.f.arg1, opt->u.f.arg2, opt->u.f.arg3,
16832 + opt->u.f.arg4) != opt->u.f.nr_args) {
16833 + err_msg = "Wrong conversion";
16834 + result = RETERR(-EINVAL);
16835 + }
16836 + break;
16837 + case OPT_ONEOF:
16838 + {
16839 + int i = 0;
16840 +
16841 + if (val_start == NULL) {
16842 + err_msg = "Value is missing";
16843 + result = RETERR(-EINVAL);
16844 + break;
16845 + }
16846 + err_msg = "Wrong option value";
16847 + result = RETERR(-EINVAL);
16848 + while (opt->u.oneof.list[i]) {
16849 + if (!strcmp(opt->u.oneof.list[i], val_start)) {
16850 + result = 0;
16851 + err_msg = NULL;
16852 + *opt->u.oneof.result = i;
16853 + break;
16854 + }
16855 + i++;
16856 + }
16857 + break;
16858 + }
16859 + default:
16860 + wrong_return_value("nikita-2100", "opt -> type");
16861 + break;
16862 + }
16863 + if (err_msg != NULL) {
16864 + warning("nikita-2496", "%s when parsing option \"%s%s%s\"",
16865 + err_msg, opt->name, val_start ? "=" : "",
16866 + val_start ? : "");
16867 + }
16868 + return result;
16869 +}
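+/*
+ * Usage sketch (hypothetical values, not part of the mount path): parsing
+ * the string "onerror=panic" against an OPT_ONEOF descriptor
+ *
+ *	int onerror;
+ *	opt_desc_t opt = {
+ *		.name = "onerror",
+ *		.type = OPT_ONEOF,
+ *		.u = { .oneof = { .result = &onerror,
+ *				  .list = { "panic", "remount-ro", NULL } } }
+ *	};
+ *	char buf[] = "onerror=panic";
+ *	parse_option(buf, &opt);
+ *
+ * splits the string at '=', matches "panic" as entry 0 of the list and
+ * stores 0 in onerror. Note the buffer must be writable: the '=' is
+ * overwritten with '\0'.
+ */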
16870 +
16871 +/**
16872 + * parse_options - parse reiser4 mount options
16873 + * @opt_string: starting point
16874 + * @opts: array of option description
16875 + * @nr_opts: number of elements in @opts
16876 + *
16877 + * Parses comma separated list of reiser4 mount options.
16878 + */
16879 +static int parse_options(char *opt_string, opt_desc_t *opts, int nr_opts)
16880 +{
16881 + int result;
16882 +
16883 + result = 0;
16884 + while ((result == 0) && opt_string && *opt_string) {
16885 + int j;
16886 + char *next;
16887 +
16888 + next = strchr(opt_string, ',');
16889 + if (next != NULL) {
16890 + *next = '\0';
16891 + ++next;
16892 + }
16893 + for (j = 0; j < nr_opts; ++j) {
16894 + if (!strncmp(opt_string, opts[j].name,
16895 + strlen(opts[j].name))) {
16896 + result = parse_option(opt_string, &opts[j]);
16897 + break;
16898 + }
16899 + }
16900 + if (j == nr_opts) {
16901 + warning("nikita-2307", "Unrecognized option: \"%s\"",
16902 + opt_string);
16903 + /* traditionally, -EINVAL is returned on wrong mount
16904 + option */
16905 + result = RETERR(-EINVAL);
16906 + }
16907 + opt_string = next;
16908 + }
16909 + return result;
16910 +}
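+/*
+ * Illustrative example (hypothetical option string): given
+ * "tmgr.atom_max_age=600,bsdgroups", parse_options() splits at the comma,
+ * matches "tmgr.atom_max_age" by name prefix and sscanf()s 600 into the
+ * corresponding sbinfo field, then matches "bsdgroups" and sets the
+ * REISER4_BSD_GID bit in sbinfo->fs_flags (both descriptors are pushed in
+ * init_super_data() below).
+ */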
16911 +
16912 +#define NUM_OPT( label, fmt, addr ) \
16913 + { \
16914 + .name = ( label ), \
16915 + .type = OPT_FORMAT, \
16916 + .u = { \
16917 + .f = { \
16918 + .format = ( fmt ), \
16919 + .nr_args = 1, \
16920 + .arg1 = ( addr ), \
16921 + .arg2 = NULL, \
16922 + .arg3 = NULL, \
16923 + .arg4 = NULL \
16924 + } \
16925 + } \
16926 + }
16927 +
16928 +#define SB_FIELD_OPT( field, fmt ) NUM_OPT( #field, fmt, &sbinfo -> field )
16929 +
16930 +#define BIT_OPT(label, bitnr) \
16931 + { \
16932 + .name = label, \
16933 + .type = OPT_BIT, \
16934 + .u = { \
16935 + .bit = { \
16936 + .nr = bitnr, \
16937 + .addr = &sbinfo->fs_flags \
16938 + } \
16939 + } \
16940 + }
16941 +
16942 +#define MAX_NR_OPTIONS (30)
16943 +
16944 +/**
16945 + * init_super_data - initialize reiser4 private super block
16946 + * @super: super block to initialize
16947 + * @opt_string: list of reiser4 mount options
16948 + *
16949 + * Sets various reiser4 parameters to default values. Parses mount options and
16950 + * overwrites default settings.
16951 + */
16952 +int init_super_data(struct super_block *super, char *opt_string)
16953 +{
16954 + int result;
16955 + opt_desc_t *opts, *p;
16956 + reiser4_super_info_data *sbinfo = get_super_private(super);
16957 +
16958 + /* initialize super, export, dentry operations */
16959 + sbinfo->ops.super = reiser4_super_operations;
16960 + sbinfo->ops.export = reiser4_export_operations;
16961 + sbinfo->ops.dentry = reiser4_dentry_operations;
16962 + super->s_op = &sbinfo->ops.super;
16963 + super->s_export_op = &sbinfo->ops.export;
16964 +
16965 + /* initialize transaction manager parameters to default values */
16966 + sbinfo->tmgr.atom_max_size = totalram_pages / 4;
16967 + sbinfo->tmgr.atom_max_age = REISER4_ATOM_MAX_AGE / HZ;
16968 + sbinfo->tmgr.atom_min_size = 256;
16969 + sbinfo->tmgr.atom_max_flushers = ATOM_MAX_FLUSHERS;
16970 +
16971 + /* initialize cbk cache parameter */
16972 + sbinfo->tree.cbk_cache.nr_slots = CBK_CACHE_SLOTS;
16973 +
16974 + /* initialize flush parameters */
16975 + sbinfo->flush.relocate_threshold = FLUSH_RELOCATE_THRESHOLD;
16976 + sbinfo->flush.relocate_distance = FLUSH_RELOCATE_DISTANCE;
16977 + sbinfo->flush.written_threshold = FLUSH_WRITTEN_THRESHOLD;
16978 + sbinfo->flush.scan_maxnodes = FLUSH_SCAN_MAXNODES;
16979 +
16980 + sbinfo->optimal_io_size = REISER4_OPTIMAL_IO_SIZE;
16981 +
16982 + /* preliminary tree initializations */
16983 + sbinfo->tree.super = super;
16984 + sbinfo->tree.carry.new_node_flags = REISER4_NEW_NODE_FLAGS;
16985 + sbinfo->tree.carry.new_extent_flags = REISER4_NEW_EXTENT_FLAGS;
16986 + sbinfo->tree.carry.paste_flags = REISER4_PASTE_FLAGS;
16987 + sbinfo->tree.carry.insert_flags = REISER4_INSERT_FLAGS;
16988 + rwlock_init(&(sbinfo->tree.tree_lock));
16989 + spin_lock_init(&(sbinfo->tree.epoch_lock));
16990 +
16991 + /* initialize default readahead params */
16992 + sbinfo->ra_params.max = num_physpages / 4;
16993 + sbinfo->ra_params.flags = 0;
16994 +
16995 + /* allocate memory for structure describing reiser4 mount options */
16996 + opts = kmalloc(sizeof(opt_desc_t) * MAX_NR_OPTIONS, get_gfp_mask());
16997 + if (opts == NULL)
16998 + return RETERR(-ENOMEM);
16999 +
17000 + /* initialize structure describing reiser4 mount options */
17001 + p = opts;
17002 +
17003 +#if REISER4_DEBUG
17004 +# define OPT_ARRAY_CHECK if ((p) >= (opts) + MAX_NR_OPTIONS) {		\
17005 + warning ("zam-1046", "opt array is overloaded"); break; \
17006 + }
17007 +#else
17008 +# define OPT_ARRAY_CHECK noop
17009 +#endif
17010 +
17011 +#define PUSH_OPT(...) \
17012 +do { \
17013 + opt_desc_t o = __VA_ARGS__; \
17014 + OPT_ARRAY_CHECK; \
17015 + *p ++ = o; \
17016 +} while (0)
17017 +
17018 +#define PUSH_SB_FIELD_OPT(field, format) PUSH_OPT(SB_FIELD_OPT(field, format))
17019 +#define PUSH_BIT_OPT(name, bit) PUSH_OPT(BIT_OPT(name, bit))
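+/*
+ * Expansion sketch (illustrative): PUSH_SB_FIELD_OPT(optimal_io_size, "%u")
+ * appends an OPT_FORMAT descriptor named "optimal_io_size" whose single
+ * sscanf() target is &sbinfo->optimal_io_size, and
+ * PUSH_BIT_OPT("bsdgroups", REISER4_BSD_GID) appends an OPT_BIT descriptor
+ * that sets bit REISER4_BSD_GID in sbinfo->fs_flags; both advance p by one
+ * entry after the (debug-only) bounds check.
+ */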
17020 +
17021 + /*
17022 + * tmgr.atom_max_size=N
17023 + * Atoms containing more than N blocks will be forced to commit. N is
17024 + * decimal.
17025 + */
17026 + PUSH_SB_FIELD_OPT(tmgr.atom_max_size, "%u");
17027 + /*
17028 + * tmgr.atom_max_age=N
17029 + * Atoms older than N seconds will be forced to commit. N is decimal.
17030 + */
17031 + PUSH_SB_FIELD_OPT(tmgr.atom_max_age, "%u");
17032 + /*
17033 + * tmgr.atom_min_size=N
17034 +	 * When committing an atom to free dirty pages, force an atom smaller
17035 +	 * than N blocks to fuse with another one.
17036 + */
17037 + PUSH_SB_FIELD_OPT(tmgr.atom_min_size, "%u");
17038 + /*
17039 + * tmgr.atom_max_flushers=N
17040 + * limit of concurrent flushers for one atom. 0 means no limit.
17041 + */
17042 + PUSH_SB_FIELD_OPT(tmgr.atom_max_flushers, "%u");
17043 + /*
17044 +	 * tree.cbk_cache.nr_slots=N
17045 + * Number of slots in the cbk cache.
17046 + */
17047 + PUSH_SB_FIELD_OPT(tree.cbk_cache.nr_slots, "%u");
17048 + /*
17049 + * If flush finds more than FLUSH_RELOCATE_THRESHOLD adjacent dirty
17050 + * leaf-level blocks it will force them to be relocated.
17051 + */
17052 + PUSH_SB_FIELD_OPT(flush.relocate_threshold, "%u");
17053 + /*
17054 +	 * If flush can find a block allocation closer than
17055 +	 * FLUSH_RELOCATE_DISTANCE to the preceder, it will relocate to that
17056 + * position.
17057 + */
17058 + PUSH_SB_FIELD_OPT(flush.relocate_distance, "%u");
17059 + /*
17060 +	 * If we have written this many or more blocks before encountering a busy
17061 +	 * jnode in the flush list, abort flushing hoping that next time we get
17062 + * called this jnode will be clean already, and we will save some
17063 + * seeks.
17064 + */
17065 + PUSH_SB_FIELD_OPT(flush.written_threshold, "%u");
17066 + /* The maximum number of nodes to scan left on a level during flush. */
17067 + PUSH_SB_FIELD_OPT(flush.scan_maxnodes, "%u");
17068 + /* preferred IO size */
17069 + PUSH_SB_FIELD_OPT(optimal_io_size, "%u");
17070 + /* carry flags used for insertion of new nodes */
17071 + PUSH_SB_FIELD_OPT(tree.carry.new_node_flags, "%u");
17072 + /* carry flags used for insertion of new extents */
17073 + PUSH_SB_FIELD_OPT(tree.carry.new_extent_flags, "%u");
17074 + /* carry flags used for paste operations */
17075 + PUSH_SB_FIELD_OPT(tree.carry.paste_flags, "%u");
17076 + /* carry flags used for insert operations */
17077 + PUSH_SB_FIELD_OPT(tree.carry.insert_flags, "%u");
17078 +
17079 +#ifdef CONFIG_REISER4_BADBLOCKS
17080 + /*
17081 +	 * Alternative master superblock location, in case its original
17082 +	 * location is not writeable/accessible. This is an offset in BYTES.
17083 + */
17084 + PUSH_SB_FIELD_OPT(altsuper, "%lu");
17085 +#endif
17086 +
17087 + /* turn on BSD-style gid assignment */
17088 + PUSH_BIT_OPT("bsdgroups", REISER4_BSD_GID);
17089 + /* turn on 32 bit times */
17090 + PUSH_BIT_OPT("32bittimes", REISER4_32_BIT_TIMES);
17091 +	/* turn on concurrent (multi-threaded) flushing */
17092 + PUSH_BIT_OPT("mtflush", REISER4_MTFLUSH);
17093 + /*
17094 + * Don't load all bitmap blocks at mount time, it is useful for
17095 + * machines with tiny RAM and large disks.
17096 + */
17097 + PUSH_BIT_OPT("dont_load_bitmap", REISER4_DONT_LOAD_BITMAP);
17098 + /* disable transaction commits during write() */
17099 + PUSH_BIT_OPT("atomic_write", REISER4_ATOMIC_WRITE);
17100 + /* disable use of write barriers in the reiser4 log writer. */
17101 + PUSH_BIT_OPT("no_write_barrier", REISER4_NO_WRITE_BARRIER);
17102 +
17103 + PUSH_OPT(
17104 + {
17105 + /*
17106 + * tree traversal readahead parameters:
17107 + * -o readahead:MAXNUM:FLAGS
17108 +		 * MAXNUM - max number of nodes to request readahead for: -1UL
17109 + * will set it to max_sane_readahead()
17110 + * FLAGS - combination of bits: RA_ADJCENT_ONLY, RA_ALL_LEVELS,
17111 + * CONTINUE_ON_PRESENT
17112 + */
17113 + .name = "readahead",
17114 + .type = OPT_FORMAT,
17115 + .u = {
17116 + .f = {
17117 + .format = "%u:%u",
17118 + .nr_args = 2,
17119 + .arg1 = &sbinfo->ra_params.max,
17120 + .arg2 = &sbinfo->ra_params.flags,
17121 + .arg3 = NULL,
17122 + .arg4 = NULL
17123 + }
17124 + }
17125 + }
17126 + );
17127 +
17128 + /* What to do in case of fs error */
17129 + PUSH_OPT(
17130 + {
17131 + .name = "onerror",
17132 + .type = OPT_ONEOF,
17133 + .u = {
17134 + .oneof = {
17135 + .result = &sbinfo->onerror,
17136 + .list = {
17137 + "panic", "remount-ro", NULL
17138 + },
17139 + }
17140 + }
17141 + }
17142 + );
17143 +
17144 + /* modify default settings to values set by mount options */
17145 + result = parse_options(opt_string, opts, p - opts);
17146 + kfree(opts);
17147 + if (result != 0)
17148 + return result;
17149 +
17150 +	/* correct settings to sane values */
17151 + sbinfo->tmgr.atom_max_age *= HZ;
17152 + if (sbinfo->tmgr.atom_max_age <= 0)
17153 + /* overflow */
17154 + sbinfo->tmgr.atom_max_age = REISER4_ATOM_MAX_AGE;
17155 +
17156 +	/* round optimal io size down to a multiple of 512 bytes */
17157 + sbinfo->optimal_io_size >>= VFS_BLKSIZE_BITS;
17158 + sbinfo->optimal_io_size <<= VFS_BLKSIZE_BITS;
17159 + if (sbinfo->optimal_io_size == 0) {
17160 + warning("nikita-2497", "optimal_io_size is too small");
17161 + return RETERR(-EINVAL);
17162 + }
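+	/*
+	 * worked example (assuming VFS_BLKSIZE_BITS == 9, i.e. 512-byte
+	 * units): optimal_io_size == 1000 shifts down to 1 and back up to
+	 * 512; any value below 512 truncates to 0 and is rejected above.
+	 */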
17163 +
17164 + /* disable single-threaded flush as it leads to deadlock */
17165 + sbinfo->fs_flags |= (1 << REISER4_MTFLUSH);
17166 + return result;
17167 +}
17168 +
17169 +/**
17170 + * init_read_super - read reiser4 master super block
17171 + * @super: super block to fill
17172 + * @silent: if 0 - print warnings
17173 + *
17174 + * Reads the reiser4 master super block either from its predefined location or
17175 + * from the location given by the altsuper mount option; initializes the disk format plugin.
17176 + */
17177 +int init_read_super(struct super_block *super, int silent)
17178 +{
17179 + struct buffer_head *super_bh;
17180 + struct reiser4_master_sb *master_sb;
17181 + reiser4_super_info_data *sbinfo = get_super_private(super);
17182 + unsigned long blocksize;
17183 +
17184 + read_super_block:
17185 +#ifdef CONFIG_REISER4_BADBLOCKS
17186 + if (sbinfo->altsuper)
17187 + /*
17188 + * read reiser4 master super block at position specified by
17189 + * mount option
17190 + */
17191 + super_bh = sb_bread(super,
17192 + (sector_t)(sbinfo->altsuper / super->s_blocksize));
17193 + else
17194 +#endif
17195 +		/* read reiser4 master super block at the 16th 4096-byte block */
17196 + super_bh = sb_bread(super,
17197 + (sector_t)(REISER4_MAGIC_OFFSET / super->s_blocksize));
17198 + if (!super_bh)
17199 + return RETERR(-EIO);
17200 +
17201 + master_sb = (struct reiser4_master_sb *)super_bh->b_data;
17202 + /* check reiser4 magic string */
17203 + if (!strncmp(master_sb->magic, REISER4_SUPER_MAGIC_STRING,
17204 + sizeof(REISER4_SUPER_MAGIC_STRING))) {
17205 + /* reiser4 master super block contains filesystem blocksize */
17206 + blocksize = le16_to_cpu(get_unaligned(&master_sb->blocksize));
17207 +
17208 + if (blocksize != PAGE_CACHE_SIZE) {
17209 + /*
17210 +			 * currently reiser4's blocksize must be equal to
17211 + * pagesize
17212 + */
17213 + if (!silent)
17214 + warning("nikita-2609",
17215 + "%s: wrong block size %ld\n", super->s_id,
17216 + blocksize);
17217 + brelse(super_bh);
17218 + return RETERR(-EINVAL);
17219 + }
17220 + if (blocksize != super->s_blocksize) {
17221 + /*
17222 + * filesystem uses different blocksize. Reread master
17223 + * super block with correct blocksize
17224 + */
17225 + brelse(super_bh);
17226 + if (!sb_set_blocksize(super, (int)blocksize))
17227 + return RETERR(-EINVAL);
17228 + goto read_super_block;
17229 + }
17230 +
17231 + sbinfo->df_plug =
17232 + disk_format_plugin_by_id(
17233 + le16_to_cpu(get_unaligned(&master_sb->disk_plugin_id)));
17234 + if (sbinfo->df_plug == NULL) {
17235 + if (!silent)
17236 + warning("nikita-26091",
17237 + "%s: unknown disk format plugin %d\n",
17238 + super->s_id,
17239 + le16_to_cpu(get_unaligned(&master_sb->disk_plugin_id)));
17240 + brelse(super_bh);
17241 + return RETERR(-EINVAL);
17242 + }
17243 + sbinfo->diskmap_block = le64_to_cpu(get_unaligned(&master_sb->diskmap));
17244 + brelse(super_bh);
17245 + return 0;
17246 + }
17247 +
17248 + /* there is no reiser4 on the device */
17249 + if (!silent)
17250 + warning("nikita-2608",
17251 + "%s: wrong master super block magic", super->s_id);
17252 + brelse(super_bh);
17253 + return RETERR(-EINVAL);
17254 +}
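+/*
+ * worked example (assuming REISER4_MAGIC_OFFSET == 16 * 4096, per the
+ * comment above): with an initial s_blocksize of 1024 the master block is
+ * read at block 64; if it reports a 4096-byte blocksize, the buffer is
+ * dropped, sb_set_blocksize() switches to 4096 and the read is retried at
+ * block 16.
+ */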
17255 +
17256 +static struct {
17257 + reiser4_plugin_type type;
17258 + reiser4_plugin_id id;
17259 +} default_plugins[PSET_LAST] = {
17260 + [PSET_FILE] = {
17261 + .type = REISER4_FILE_PLUGIN_TYPE,
17262 + .id = UNIX_FILE_PLUGIN_ID
17263 + },
17264 + [PSET_DIR] = {
17265 + .type = REISER4_DIR_PLUGIN_TYPE,
17266 + .id = HASHED_DIR_PLUGIN_ID
17267 + },
17268 + [PSET_HASH] = {
17269 + .type = REISER4_HASH_PLUGIN_TYPE,
17270 + .id = R5_HASH_ID
17271 + },
17272 + [PSET_FIBRATION] = {
17273 + .type = REISER4_FIBRATION_PLUGIN_TYPE,
17274 + .id = FIBRATION_DOT_O
17275 + },
17276 + [PSET_PERM] = {
17277 + .type = REISER4_PERM_PLUGIN_TYPE,
17278 + .id = NULL_PERM_ID
17279 + },
17280 + [PSET_FORMATTING] = {
17281 + .type = REISER4_FORMATTING_PLUGIN_TYPE,
17282 + .id = SMALL_FILE_FORMATTING_ID
17283 + },
17284 + [PSET_SD] = {
17285 + .type = REISER4_ITEM_PLUGIN_TYPE,
17286 + .id = STATIC_STAT_DATA_ID
17287 + },
17288 + [PSET_DIR_ITEM] = {
17289 + .type = REISER4_ITEM_PLUGIN_TYPE,
17290 + .id = COMPOUND_DIR_ID
17291 + },
17292 + [PSET_CIPHER] = {
17293 + .type = REISER4_CIPHER_PLUGIN_TYPE,
17294 + .id = NONE_CIPHER_ID
17295 + },
17296 + [PSET_DIGEST] = {
17297 + .type = REISER4_DIGEST_PLUGIN_TYPE,
17298 + .id = SHA256_32_DIGEST_ID
17299 + },
17300 + [PSET_COMPRESSION] = {
17301 + .type = REISER4_COMPRESSION_PLUGIN_TYPE,
17302 + .id = LZO1_COMPRESSION_ID
17303 + },
17304 + [PSET_COMPRESSION_MODE] = {
17305 + .type = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
17306 + .id = COL_16_COMPRESSION_MODE_ID
17307 + },
17308 + [PSET_CLUSTER] = {
17309 + .type = REISER4_CLUSTER_PLUGIN_TYPE,
17310 + .id = CLUSTER_64K_ID
17311 + },
17312 + [PSET_REGULAR_ENTRY] = {
17313 + .type = REISER4_REGULAR_PLUGIN_TYPE,
17314 + .id = UF_REGULAR_ID
17315 + }
17316 +};
17317 +
17318 +/* access to default plugin table */
17319 +static reiser4_plugin *get_default_plugin(pset_member memb)
17320 +{
17321 + return plugin_by_id(default_plugins[memb].type,
17322 + default_plugins[memb].id);
17323 +}
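+/*
+ * usage sketch: get_default_plugin(PSET_HASH) looks up the plugin
+ * registered under (REISER4_HASH_PLUGIN_TYPE, R5_HASH_ID) in the table
+ * above; init_root_inode() below grabs such defaults for the pset members
+ * that the on-disk root directory does not supply explicitly.
+ */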
17324 +
17325 +/**
17326 + * init_root_inode - obtain inode of root directory
17327 + * @super: super block of filesystem
17328 + *
17329 + * Obtains the inode of the root directory (reading it from disk) and
17330 + * initializes its plugin set if it was not initialized.
17331 + */
17332 +int init_root_inode(struct super_block *super)
17333 +{
17334 + reiser4_super_info_data *sbinfo = get_super_private(super);
17335 + struct inode *inode;
17336 + int result = 0;
17337 +
17338 + inode = reiser4_iget(super, sbinfo->df_plug->root_dir_key(super), 0);
17339 + if (IS_ERR(inode))
17340 + return RETERR(PTR_ERR(inode));
17341 +
17342 + super->s_root = d_alloc_root(inode);
17343 + if (!super->s_root) {
17344 + iput(inode);
17345 + return RETERR(-ENOMEM);
17346 + }
17347 +
17348 + super->s_root->d_op = &sbinfo->ops.dentry;
17349 +
17350 + if (!is_inode_loaded(inode)) {
17351 + pset_member memb;
17352 +
17353 + for (memb = 0; memb < PSET_LAST; ++memb) {
17354 + reiser4_plugin *plug;
17355 +
17356 + plug = get_default_plugin(memb);
17357 + result = grab_plugin_from(inode, memb, plug);
17358 + if (result != 0)
17359 + break;
17360 + }
17361 +
17362 + if (result == 0) {
17363 + if (REISER4_DEBUG) {
17364 + plugin_set *pset;
17365 +
17366 + pset = reiser4_inode_data(inode)->pset;
17367 + for (memb = 0; memb < PSET_LAST; ++memb)
17368 + assert("nikita-3500",
17369 + pset_get(pset, memb) != NULL);
17370 + }
17371 + } else
17372 + warning("nikita-3448", "Cannot set plugins of root: %i",
17373 + result);
17374 + reiser4_iget_complete(inode);
17375 + }
17376 + super->s_maxbytes = MAX_LFS_FILESIZE;
17377 + return result;
17378 +}
17379 +
17380 +/*
17381 + * Local variables:
17382 + * c-indentation-style: "K&R"
17383 + * mode-name: "LC"
17384 + * c-basic-offset: 8
17385 + * tab-width: 8
17386 + * fill-column: 79
17387 + * End:
17388 + */
17389 Index: linux-2.6.16/fs/reiser4/inode.c
17390 ===================================================================
17391 --- /dev/null
17392 +++ linux-2.6.16/fs/reiser4/inode.c
17393 @@ -0,0 +1,727 @@
17394 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
17395 +
17396 +/* Inode specific operations. */
17397 +
17398 +#include "forward.h"
17399 +#include "debug.h"
17400 +#include "key.h"
17401 +#include "kassign.h"
17402 +#include "coord.h"
17403 +#include "seal.h"
17404 +#include "dscale.h"
17405 +#include "plugin/item/item.h"
17406 +#include "plugin/security/perm.h"
17407 +#include "plugin/plugin.h"
17408 +#include "plugin/object.h"
17409 +#include "znode.h"
17410 +#include "vfs_ops.h"
17411 +#include "inode.h"
17412 +#include "super.h"
17413 +#include "reiser4.h"
17414 +
17415 +#include <linux/fs.h> /* for struct super_block, address_space */
17416 +
17417 +/* return reiser4 internal tree which inode belongs to */
17418 +/* Audited by: green(2002.06.17) */
17419 +reiser4_tree *tree_by_inode(const struct inode *inode /* inode queried */ )
17420 +{
17421 + assert("nikita-256", inode != NULL);
17422 + assert("nikita-257", inode->i_sb != NULL);
17423 + return get_tree(inode->i_sb);
17424 +}
17425 +
17426 +/* return reiser4-specific inode flags */
17427 +static inline unsigned long *inode_flags(const struct inode *const inode)
17428 +{
17429 + assert("nikita-2842", inode != NULL);
17430 + return &reiser4_inode_data(inode)->flags;
17431 +}
17432 +
17433 +/* set reiser4-specific flag @f in @inode */
17434 +void inode_set_flag(struct inode *inode, reiser4_file_plugin_flags f)
17435 +{
17436 + assert("nikita-2248", inode != NULL);
17437 + set_bit((int)f, inode_flags(inode));
17438 +}
17439 +
17440 +/* clear reiser4-specific flag @f in @inode */
17441 +void inode_clr_flag(struct inode *inode, reiser4_file_plugin_flags f)
17442 +{
17443 + assert("nikita-2250", inode != NULL);
17444 + clear_bit((int)f, inode_flags(inode));
17445 +}
17446 +
17447 +/* true if reiser4-specific flag @f is set in @inode */
17448 +int inode_get_flag(const struct inode *inode, reiser4_file_plugin_flags f)
17449 +{
17450 + assert("nikita-2251", inode != NULL);
17451 + return test_bit((int)f, inode_flags(inode));
17452 +}
17453 +
17454 +/* convert oid to inode number */
17455 +ino_t oid_to_ino(oid_t oid)
17456 +{
17457 + return (ino_t) oid;
17458 +}
17459 +
17460 +/* convert oid to user visible inode number */
17461 +ino_t oid_to_uino(oid_t oid)
17462 +{
17463 +	/* a reiser4 object is uniquely identified by its oid, which is a 64 bit
17464 +	   quantity. The kernel in-memory inode is indexed (in the hash table) by
17465 +	   the 32 bit i_ino field, but this is not a problem, because there is a
17466 +	   way to further distinguish inodes with identical inode numbers
17467 +	   (the find_actor supplied to iget()).
17468 +
17469 +	   But user space expects a unique 32 bit inode number. Obviously this
17470 +	   is impossible. The work-around is to somehow hash the oid into a user
17471 +	   visible inode number.
17472 + */
17473 + oid_t max_ino = (ino_t) ~ 0;
17474 +
17475 + if (REISER4_INO_IS_OID || (oid <= max_ino))
17476 + return oid;
17477 + else
17478 +		/* this is remotely similar to the algorithm used to find the
17479 +		   next pid for a process: after wrap-around, start from some
17480 +		   offset rather than from 0. The idea is that there are some
17481 +		   long-living objects with which we don't want to collide.
17482 + */
17483 + return REISER4_UINO_SHIFT + ((oid - max_ino) & (max_ino >> 1));
17484 +}
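+/*
+ * worked example (assuming 32-bit ino_t, so max_ino == 0xffffffff): an oid
+ * of 0x100000005 maps to REISER4_UINO_SHIFT + (6 & 0x7fffffff), i.e. the
+ * remapped numbers start at REISER4_UINO_SHIFT and wrap within half the
+ * ino space, away from the low oids that map to themselves.
+ */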
17485 +
17486 +/* check that "inode" is on reiser4 file-system */
17487 +int is_reiser4_inode(const struct inode *inode /* inode queried */ )
17488 +{
17489 + return inode != NULL && is_reiser4_super(inode->i_sb);
17490 +}
17491 +
17492 +/* Maximal length of a name that can be stored in directory @inode.
17493 +
17494 +   This is used in checks during file creation and lookup. */
17495 +int reiser4_max_filename_len(const struct inode *inode /* inode queried */ )
17496 +{
17497 + assert("nikita-287", is_reiser4_inode(inode));
17498 + assert("nikita-1710", inode_dir_item_plugin(inode));
17499 + if (inode_dir_item_plugin(inode)->s.dir.max_name_len)
17500 + return inode_dir_item_plugin(inode)->s.dir.max_name_len(inode);
17501 + else
17502 + return 255;
17503 +}
17504 +
17505 +#if REISER4_USE_COLLISION_LIMIT
17506 +/* Maximal number of hash collisions for this directory. */
17507 +int max_hash_collisions(const struct inode *dir /* inode queried */ )
17508 +{
17509 + assert("nikita-1711", dir != NULL);
17510 + return reiser4_inode_data(dir)->plugin.max_collisions;
17511 +}
17512 +#endif /* REISER4_USE_COLLISION_LIMIT */
17513 +
17514 +/* Install file, inode, and address_space operation on @inode, depending on
17515 + its mode. */
17516 +int setup_inode_ops(struct inode *inode /* inode to initialize */ ,
17517 + reiser4_object_create_data * data /* parameters to create
17518 + * object */ )
17519 +{
17520 + reiser4_super_info_data *sinfo;
17521 + file_plugin *fplug;
17522 + dir_plugin *dplug;
17523 +
17524 + fplug = inode_file_plugin(inode);
17525 + dplug = inode_dir_plugin(inode);
17526 +
17527 + sinfo = get_super_private(inode->i_sb);
17528 +
17529 + switch (inode->i_mode & S_IFMT) {
17530 + case S_IFSOCK:
17531 + case S_IFBLK:
17532 + case S_IFCHR:
17533 + case S_IFIFO:
17534 + {
17535 + dev_t rdev; /* to keep gcc happy */
17536 +
17537 + assert("vs-46", fplug != NULL);
17538 + /* ugly hack with rdev */
17539 + if (data == NULL) {
17540 + rdev = inode->i_rdev;
17541 + inode->i_rdev = 0;
17542 + } else
17543 + rdev = data->rdev;
17544 + inode->i_blocks = 0;
17545 + assert("vs-42", fplug->h.id == SPECIAL_FILE_PLUGIN_ID);
17546 + inode->i_op = &file_plugins[fplug->h.id].inode_ops;
17547 + /* initialize inode->i_fop and inode->i_rdev for block and char
17548 + devices */
17549 + init_special_inode(inode, inode->i_mode, rdev);
17550 + /* all address space operations are null */
17551 + inode->i_mapping->a_ops =
17552 + &file_plugins[fplug->h.id].as_ops;
17553 + break;
17554 + }
17555 + case S_IFLNK:
17556 + assert("vs-46", fplug != NULL);
17557 + assert("vs-42", fplug->h.id == SYMLINK_FILE_PLUGIN_ID);
17558 + inode->i_op = &file_plugins[fplug->h.id].inode_ops;
17559 + inode->i_fop = NULL;
17560 + /* all address space operations are null */
17561 + inode->i_mapping->a_ops = &file_plugins[fplug->h.id].as_ops;
17562 + break;
17563 + case S_IFDIR:
17564 + assert("vs-46", dplug != NULL);
17565 + assert("vs-43", (dplug->h.id == HASHED_DIR_PLUGIN_ID ||
17566 + dplug->h.id == SEEKABLE_HASHED_DIR_PLUGIN_ID));
17567 + inode->i_op = &dir_plugins[dplug->h.id].inode_ops;
17568 + inode->i_fop = &dir_plugins[dplug->h.id].file_ops;
17569 + inode->i_mapping->a_ops = &dir_plugins[dplug->h.id].as_ops;
17570 + break;
17571 + case S_IFREG:
17572 + assert("vs-46", fplug != NULL);
17573 + assert("vs-43", (fplug->h.id == UNIX_FILE_PLUGIN_ID ||
17574 + fplug->h.id == CRC_FILE_PLUGIN_ID));
17575 + inode->i_op = &file_plugins[fplug->h.id].inode_ops;
17576 + inode->i_fop = &file_plugins[fplug->h.id].file_ops;
17577 + inode->i_mapping->a_ops = &file_plugins[fplug->h.id].as_ops;
17578 + break;
17579 + default:
17580 + warning("nikita-291", "wrong file mode: %o for %llu",
17581 + inode->i_mode,
17582 + (unsigned long long)get_inode_oid(inode));
17583 + reiser4_make_bad_inode(inode);
17584 + return RETERR(-EINVAL);
17585 + }
17586 + return 0;
17587 +}
17588 +
17589 +/* initialize inode from disk data. Called with inode locked.
17590 + Return inode locked. */
17591 +static int init_inode(struct inode *inode /* inode to initialise */ ,
17592 + coord_t * coord /* coord of stat data */ )
17593 +{
17594 + int result;
17595 + item_plugin *iplug;
17596 + void *body;
17597 + int length;
17598 + reiser4_inode *state;
17599 +
17600 + assert("nikita-292", coord != NULL);
17601 + assert("nikita-293", inode != NULL);
17602 +
17603 + coord_clear_iplug(coord);
17604 + result = zload(coord->node);
17605 + if (result)
17606 + return result;
17607 + iplug = item_plugin_by_coord(coord);
17608 + body = item_body_by_coord(coord);
17609 + length = item_length_by_coord(coord);
17610 +
17611 + assert("nikita-295", iplug != NULL);
17612 + assert("nikita-296", body != NULL);
17613 + assert("nikita-297", length > 0);
17614 +
17615 + /* inode is under I_LOCK now */
17616 +
17617 + state = reiser4_inode_data(inode);
17618 + /* call stat-data plugin method to load sd content into inode */
17619 + result = iplug->s.sd.init_inode(inode, body, length);
17620 + plugin_set_sd(&state->pset, iplug);
17621 + if (result == 0) {
17622 + result = setup_inode_ops(inode, NULL);
17623 + if (result == 0 &&
17624 + inode->i_sb->s_root && inode->i_sb->s_root->d_inode) {
17625 + struct inode *root;
17626 + pset_member ind;
17627 +
17628 + /* take missing plugins from file-system defaults */
17629 + root = inode->i_sb->s_root->d_inode;
17630 + /* file and directory plugins are already initialized. */
17631 + for (ind = PSET_DIR + 1; ind < PSET_LAST; ++ind) {
17632 + result = grab_plugin(inode, root, ind);
17633 + if (result != 0)
17634 + break;
17635 + }
17636 + if (result != 0) {
17637 + warning("nikita-3447",
17638 + "Cannot set up plugins for %lli",
17639 + (unsigned long long)
17640 + get_inode_oid(inode));
17641 + }
17642 + }
17643 + }
17644 + zrelse(coord->node);
17645 + return result;
17646 +}
17647 +
17648 +/* read `inode' from the disk. This is what was previously in
17649 + reiserfs_read_inode2().
17650 +
17651 + Must be called with inode locked. Return inode still locked.
17652 +*/
17653 +static int read_inode(struct inode *inode /* inode to read from disk */ ,
17654 + const reiser4_key * key /* key of stat data */ ,
17655 + int silent)
17656 +{
17657 + int result;
17658 + lock_handle lh;
17659 + reiser4_inode *info;
17660 + coord_t coord;
17661 +
17662 + assert("nikita-298", inode != NULL);
17663 + assert("nikita-1945", !is_inode_loaded(inode));
17664 +
17665 + info = reiser4_inode_data(inode);
17666 + assert("nikita-300", info->locality_id != 0);
17667 +
17668 + coord_init_zero(&coord);
17669 + init_lh(&lh);
17670 + /* locate stat-data in a tree and return znode locked */
17671 + result = lookup_sd(inode, ZNODE_READ_LOCK, &coord, &lh, key, silent);
17672 + assert("nikita-301", !is_inode_loaded(inode));
17673 + if (result == 0) {
17674 + /* use stat-data plugin to load sd into inode. */
17675 + result = init_inode(inode, &coord);
17676 + if (result == 0) {
17677 + /* initialize stat-data seal */
17678 + spin_lock_inode(inode);
17679 + seal_init(&info->sd_seal, &coord, key);
17680 + info->sd_coord = coord;
17681 + spin_unlock_inode(inode);
17682 +
17683 + /* call file plugin's method to initialize plugin
17684 + * specific part of inode */
17685 + if (inode_file_plugin(inode)->init_inode_data)
17686 + inode_file_plugin(inode)->init_inode_data(inode,
17687 + NULL,
17688 + 0);
17689 + /* load detached directory cursors for stateless
17690 + * directory readers (NFS). */
17691 + load_cursors(inode);
17692 +
17693 + /* Check the opened inode for consistency. */
17694 + result =
17695 + get_super_private(inode->i_sb)->df_plug->
17696 + check_open(inode);
17697 + }
17698 + }
17699 + /* lookup_sd() doesn't release coord because we want the znode
17700 + to stay read-locked while stat-data fields are accessed in
17701 + init_inode() */
17702 + done_lh(&lh);
17703 +
17704 + if (result != 0)
17705 + reiser4_make_bad_inode(inode);
17706 + return result;
17707 +}
17708 +
17709 +/* initialise new reiser4 inode being inserted into hash table. */
17710 +static int init_locked_inode(struct inode *inode /* new inode */ ,
17711 + void *opaque /* key of stat data passed to the
17712 + * iget5_locked as cookie */ )
17713 +{
17714 + reiser4_key *key;
17715 +
17716 + assert("nikita-1995", inode != NULL);
17717 + assert("nikita-1996", opaque != NULL);
17718 + key = opaque;
17719 + set_inode_oid(inode, get_key_objectid(key));
17720 + reiser4_inode_data(inode)->locality_id = get_key_locality(key);
17721 + return 0;
17722 +}
17723 +
17724 +/* reiser4_inode_find_actor() - "find actor" supplied by reiser4 to iget5_locked().
17725 +
17726 + This function is called by iget5_locked() to distinguish reiser4 inodes
17727 + having the same inode numbers. Such inodes can only exist due to some error
17728 + condition. One of them should be bad. Inodes with identical inode numbers
17729 + (objectids) are distinguished by their packing locality.
17730 +
17731 +*/
17732 +static int reiser4_inode_find_actor(struct inode *inode /* inode from hash table to
17733 + * check */ ,
17734 + void *opaque /* "cookie" passed to
17735 + * iget5_locked(). This is stat data
17736 + * key */ )
17737 +{
17738 + reiser4_key *key;
17739 +
17740 + key = opaque;
17741 + return
17742 + /* oid is unique, so first term is enough, actually. */
17743 + get_inode_oid(inode) == get_key_objectid(key) &&
17744 + /*
17745 + * also, locality should be checked, but locality is stored in
17746 + * the reiser4-specific part of the inode, and actor can be
17747 + * called against arbitrary inode that happened to be in this
17748 + * hash chain. Hence we first have to check that this is
17749 + * reiser4 inode at least. is_reiser4_inode() is probably too
17750 + * early to call, as inode may have ->i_op not yet
17751 + * initialised.
17752 + */
17753 + is_reiser4_super(inode->i_sb) &&
17754 + /*
17755 + * usually objectid is unique, but pseudo files use counter to
17756 + * generate objectid. All pseudo files are placed into special
17757 + * (otherwise unused) locality.
17758 + */
17759 + reiser4_inode_data(inode)->locality_id == get_key_locality(key);
17760 +}
17761 +
17762 +/* hook for kmem_cache_create */
17763 +void loading_init_once(reiser4_inode * info)
17764 +{
17765 + sema_init(&info->loading, 1);
17766 +}
17767 +
17768 +/* for reiser4_alloc_inode */
17769 +void loading_alloc(reiser4_inode * info)
17770 +{
17771 +#if REISER4_DEBUG
17772 + assert("vs-1717", down_trylock(&info->loading) == 0);
17773 + up(&info->loading);
17774 +#endif
17775 +}
17776 +
17777 +/* for reiser4_destroy */
17778 +void loading_destroy(reiser4_inode * info)
17779 +{
17780 +#if REISER4_DEBUG
17781 + assert("vs-1717", down_trylock(&info->loading) == 0);
17782 + up(&info->loading);
17783 +#endif
17784 +}
17785 +
17786 +static void loading_down(reiser4_inode * info)
17787 +{
17788 + down(&info->loading);
17789 +}
17790 +
17791 +static void loading_up(reiser4_inode * info)
17792 +{
17793 + up(&info->loading);
17794 +}
17795 +
17796 +/**
17797 + * reiser4_iget - obtain inode via iget5_locked, read from disk if necessary
17798 + * @super: super block of filesystem
17799 + * @key: key of inode's stat-data
17800 + * @silent: when set, suppress warnings if the stat-data cannot be found
17801 + *
17802 + * This is our helper function a la iget(). It is called by
17803 + * reiser4_lookup() and reiser4_read_super(). Returns the inode locked,
17804 + * or an error.
17805 + */
17806 +struct inode *reiser4_iget(struct super_block *super, const reiser4_key *key,
17807 + int silent)
17808 +{
17809 + struct inode *inode;
17810 + int result;
17811 + reiser4_inode *info;
17812 +
17813 + assert("nikita-302", super != NULL);
17814 + assert("nikita-303", key != NULL);
17815 +
17816 + result = 0;
17817 +
17818 + /* call iget(). Our ->read_inode() is dummy, so this will either
17819 + find inode in cache or return uninitialised inode */
17820 + inode = iget5_locked(super,
17821 + (unsigned long)get_key_objectid(key),
17822 + reiser4_inode_find_actor,
17823 + init_locked_inode, (reiser4_key *) key);
17824 + if (inode == NULL)
17825 + return ERR_PTR(RETERR(-ENOMEM));
17826 + if (is_bad_inode(inode)) {
17827 + warning("nikita-304", "Bad inode found");
17828 + print_key("key", key);
17829 + iput(inode);
17830 + return ERR_PTR(RETERR(-EIO));
17831 + }
17832 +
17833 + info = reiser4_inode_data(inode);
17834 +
17835 + /* Reiser4 inode state bit REISER4_LOADED is used to distinguish fully
17836 + loaded and initialized inode from just allocated inode. If
17837 + REISER4_LOADED bit is not set, reiser4_iget() completes loading under
17838 + info->loading. The place in reiser4 that uses a not yet initialized
17839 + inode is the reiser4 repacker; see repacker-related functions in
17840 + plugin/item/extent.c */
17841 + if (!is_inode_loaded(inode)) {
17842 + loading_down(info);
17843 + if (!is_inode_loaded(inode)) {
17844 + /* locking: iget5_locked returns locked inode */
17845 + assert("nikita-1941", !is_inode_loaded(inode));
17846 + assert("nikita-1949",
17847 + reiser4_inode_find_actor(inode,
17848 + (reiser4_key *) key));
17849 + /* now, inode has objectid as ->i_ino and locality in
17850 + reiser4-specific part. This is enough for
17851 + read_inode() to read stat data from the disk */
17852 + result = read_inode(inode, key, silent);
17853 + } else
17854 + loading_up(info);
17855 + }
17856 +
17857 + if (inode->i_state & I_NEW)
17858 + unlock_new_inode(inode);
17859 +
17860 + if (is_bad_inode(inode)) {
17861 + assert("vs-1717", result != 0);
17862 + loading_up(info);
17863 + iput(inode);
17864 + inode = ERR_PTR(result);
17865 + } else if (REISER4_DEBUG) {
17866 + reiser4_key found_key;
17867 +
17868 + assert("vs-1717", result == 0);
17869 + build_sd_key(inode, &found_key);
17870 + if (!keyeq(&found_key, key)) {
17871 + warning("nikita-305", "Wrong key in sd");
17872 + print_key("sought for", key);
17873 + print_key("found", &found_key);
17874 + }
17875 + if (inode->i_nlink == 0) {
17876 + warning("nikita-3559", "Unlinked inode found: %llu\n",
17877 + (unsigned long long)get_inode_oid(inode));
17878 + }
17879 + }
17880 + return inode;
17881 +}
17882 +
17883 +/* reiser4_iget() may return a not fully initialized inode; this function
17884 + * should be called after reiser4 inode initialization is complete. */
17885 +void reiser4_iget_complete(struct inode *inode)
17886 +{
17887 + assert("zam-988", is_reiser4_inode(inode));
17888 +
17889 + if (!is_inode_loaded(inode)) {
17890 + inode_set_flag(inode, REISER4_LOADED);
17891 + loading_up(reiser4_inode_data(inode));
17892 + }
17893 +}
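+
+/* Usage sketch (names are illustrative, not taken from the call sites above):
+ * how a caller might pair reiser4_iget() with reiser4_iget_complete(). The
+ * inode returned by reiser4_iget() may still hold info->loading, so the caller
+ * must signal the end of its own initialization explicitly.
+ */
+#if 0
+static struct inode *example_get_inode(struct super_block *super,
+ const reiser4_key *key)
+{
+ struct inode *inode;
+
+ inode = reiser4_iget(super, key, 0 /* not silent */);
+ if (IS_ERR(inode))
+ return inode;
+ /* ... caller-specific initialization of the reiser4 inode ... */
+ reiser4_iget_complete(inode);
+ return inode;
+}
+#endif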
17894 +
17895 +void reiser4_make_bad_inode(struct inode *inode)
17896 +{
17897 + assert("nikita-1934", inode != NULL);
17898 +
17899 + /* clear LOADED bit */
17900 + inode_clr_flag(inode, REISER4_LOADED);
17901 + make_bad_inode(inode);
17902 + return;
17903 +}
17904 +
17905 +file_plugin *inode_file_plugin(const struct inode * inode)
17906 +{
17907 + assert("nikita-1997", inode != NULL);
17908 + return reiser4_inode_data(inode)->pset->file;
17909 +}
17910 +
17911 +dir_plugin *inode_dir_plugin(const struct inode * inode)
17912 +{
17913 + assert("nikita-1998", inode != NULL);
17914 + return reiser4_inode_data(inode)->pset->dir;
17915 +}
17916 +
17917 +#if 0
17918 +perm_plugin *inode_perm_plugin(const struct inode * inode)
17919 +{
17920 + assert("nikita-1999", inode != NULL);
17921 + return reiser4_inode_data(inode)->pset->perm;
17922 +}
17923 +#endif /* 0 */
17924 +
17925 +formatting_plugin *inode_formatting_plugin(const struct inode * inode)
17926 +{
17927 + assert("nikita-2000", inode != NULL);
17928 + return reiser4_inode_data(inode)->pset->formatting;
17929 +}
17930 +
17931 +hash_plugin *inode_hash_plugin(const struct inode * inode)
17932 +{
17933 + assert("nikita-2001", inode != NULL);
17934 + return reiser4_inode_data(inode)->pset->hash;
17935 +}
17936 +
17937 +fibration_plugin *inode_fibration_plugin(const struct inode * inode)
17938 +{
17939 + assert("nikita-2001", inode != NULL);
17940 + return reiser4_inode_data(inode)->pset->fibration;
17941 +}
17942 +
17943 +cipher_plugin *inode_cipher_plugin(const struct inode * inode)
17944 +{
17945 + assert("edward-36", inode != NULL);
17946 + return reiser4_inode_data(inode)->pset->cipher;
17947 +}
17948 +
17949 +compression_plugin *inode_compression_plugin(const struct inode * inode)
17950 +{
17951 + assert("edward-37", inode != NULL);
17952 + return reiser4_inode_data(inode)->pset->compression;
17953 +}
17954 +
17955 +compression_mode_plugin *inode_compression_mode_plugin(const struct inode *
17956 + inode)
17957 +{
17958 + assert("edward-1330", inode != NULL);
17959 + return reiser4_inode_data(inode)->pset->compression_mode;
17960 +}
17961 +
17962 +cluster_plugin *inode_cluster_plugin(const struct inode * inode)
17963 +{
17964 + assert("edward-1328", inode != NULL);
17965 + return reiser4_inode_data(inode)->pset->cluster;
17966 +}
17967 +
17968 +regular_plugin *inode_regular_plugin(const struct inode * inode)
17969 +{
17970 + assert("edward-1329", inode != NULL);
17971 + return reiser4_inode_data(inode)->pset->regular_entry;
17972 +}
17973 +
17974 +digest_plugin *inode_digest_plugin(const struct inode * inode)
17975 +{
17976 + assert("edward-86", inode != NULL);
17977 + return reiser4_inode_data(inode)->pset->digest;
17978 +}
17979 +
17980 +item_plugin *inode_sd_plugin(const struct inode * inode)
17981 +{
17982 + assert("vs-534", inode != NULL);
17983 + return reiser4_inode_data(inode)->pset->sd;
17984 +}
17985 +
17986 +item_plugin *inode_dir_item_plugin(const struct inode * inode)
17987 +{
17988 + assert("vs-534", inode != NULL);
17989 + return reiser4_inode_data(inode)->pset->dir_item;
17990 +}
17991 +
17992 +void inode_set_extension(struct inode *inode, sd_ext_bits ext)
17993 +{
17994 + reiser4_inode *state;
17995 +
17996 + assert("nikita-2716", inode != NULL);
17997 + assert("nikita-2717", ext < LAST_SD_EXTENSION);
17998 + assert("nikita-3491", spin_inode_is_locked(inode));
17999 +
18000 + state = reiser4_inode_data(inode);
18001 + state->extmask |= 1 << ext;
18002 + /* force re-calculation of stat-data length on next call to
18003 + update_sd(). */
18004 + inode_clr_flag(inode, REISER4_SDLEN_KNOWN);
18005 +}
18006 +
18007 +void
18008 +inode_set_plugin(struct inode *inode, reiser4_plugin * plug, pset_member memb)
18009 +{
18010 + assert("nikita-2718", inode != NULL);
18011 + assert("nikita-2719", plug != NULL);
18012 +
18013 + reiser4_inode_data(inode)->plugin_mask |= (1 << memb);
18014 +}
18015 +
18016 +void inode_check_scale_nolock(struct inode *inode, __u64 old, __u64 new)
18017 +{
18018 + assert("edward-1287", inode != NULL);
18019 + if (!dscale_fit(old, new))
18020 + inode_clr_flag(inode, REISER4_SDLEN_KNOWN);
18021 + return;
18022 +}
18023 +
18024 +void inode_check_scale(struct inode *inode, __u64 old, __u64 new)
18025 +{
18026 + assert("nikita-2875", inode != NULL);
18027 + spin_lock_inode(inode);
18028 + inode_check_scale_nolock(inode, old, new);
18029 + spin_unlock_inode(inode);
18030 +}
18031 +
18032 +/*
18033 + * initialize ->ordering field of inode. This field defines how file stat-data
18034 + * and body are ordered within a tree with respect to other objects within the
18035 + * same parent directory.
18036 + */
18037 +void
18038 +init_inode_ordering(struct inode *inode,
18039 + reiser4_object_create_data * crd, int create)
18040 +{
18041 + reiser4_key key;
18042 +
18043 + if (create) {
18044 + struct inode *parent;
18045 +
18046 + parent = crd->parent;
18047 + assert("nikita-3224", inode_dir_plugin(parent) != NULL);
18048 + inode_dir_plugin(parent)->build_entry_key(parent,
18049 + &crd->dentry->d_name,
18050 + &key);
18051 + } else {
18052 + coord_t *coord;
18053 +
18054 + coord = &reiser4_inode_data(inode)->sd_coord;
18055 + coord_clear_iplug(coord);
18056 + /* safe to use ->sd_coord, because node is under long term
18057 + * lock */
18058 + WITH_DATA(coord->node, item_key_by_coord(coord, &key));
18059 + }
18060 +
18061 + set_inode_ordering(inode, get_key_ordering(&key));
18062 +}
18063 +
18064 +znode *inode_get_vroot(struct inode *inode)
18065 +{
18066 + reiser4_block_nr blk;
18067 + znode *result;
18068 +
18069 + spin_lock_inode(inode);
18070 + blk = reiser4_inode_data(inode)->vroot;
18071 + spin_unlock_inode(inode);
18072 + if (!disk_addr_eq(&UBER_TREE_ADDR, &blk))
18073 + result = zlook(tree_by_inode(inode), &blk);
18074 + else
18075 + result = NULL;
18076 + return result;
18077 +}
18078 +
18079 +void inode_set_vroot(struct inode *inode, znode *vroot)
18080 +{
18081 + spin_lock_inode(inode);
18082 + reiser4_inode_data(inode)->vroot = *znode_get_block(vroot);
18083 + spin_unlock_inode(inode);
18084 +}
18085 +
18086 +#if REISER4_DEBUG
18087 +
18088 +void inode_invariant(const struct inode *inode)
18089 +{
18090 + assert("nikita-3077", spin_inode_is_locked(inode));
18091 +}
18092 +
18093 +int inode_has_no_jnodes(reiser4_inode * r4_inode)
18094 +{
18095 + return jnode_tree_by_reiser4_inode(r4_inode)->rnode == NULL &&
18096 + r4_inode->nr_jnodes == 0;
18097 +}
18098 +
18099 +#endif
18100 +
18101 +/* returns 0 if directory is empty (only contains dot and dotdot), -ENOTEMPTY otherwise */
18102 +/* FIXME: shouldn't it be dir plugin method? */
18103 +int is_dir_empty(const struct inode *dir)
18104 +{
18105 + assert("nikita-1976", dir != NULL);
18106 +
18107 + /* rely on our method to maintain directory i_size being equal to the
18108 + number of entries. */
18109 + return dir->i_size <= 2 ? 0 : RETERR(-ENOTEMPTY);
18110 +}
18111 +
18112 +/* Make Linus happy.
18113 + Local variables:
18114 + c-indentation-style: "K&R"
18115 + mode-name: "LC"
18116 + c-basic-offset: 8
18117 + tab-width: 8
18118 + fill-column: 120
18119 + End:
18120 +*/
18121 Index: linux-2.6.16/fs/reiser4/inode.h
18122 ===================================================================
18123 --- /dev/null
18124 +++ linux-2.6.16/fs/reiser4/inode.h
18125 @@ -0,0 +1,430 @@
18126 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
18127 +
18128 +/* Inode functions. */
18129 +
18130 +#if !defined( __REISER4_INODE_H__ )
18131 +#define __REISER4_INODE_H__
18132 +
18133 +#include "forward.h"
18134 +#include "debug.h"
18135 +#include "key.h"
18136 +#include "seal.h"
18137 +#include "plugin/plugin.h"
18138 +#include "plugin/file/cryptcompress.h"
18139 +#include "plugin/file/file.h"
18140 +#include "plugin/dir/dir.h"
18141 +#include "plugin/plugin_set.h"
18142 +#include "plugin/security/perm.h"
18143 +#include "vfs_ops.h"
18144 +#include "jnode.h"
18145 +#include "fsdata.h"
18146 +
18147 +#include <linux/types.h> /* for __u?? , ino_t */
18148 +#include <linux/fs.h> /* for struct super_block, struct
18149 + * rw_semaphore, etc */
18150 +#include <linux/spinlock.h>
18151 +#include <asm/types.h>
18152 +
18153 +/* reiser4-specific inode flags. They are "transient" and are not
18154 + supposed to be stored on disk. Used to track the "state" of the
18155 + inode.
18156 +*/
18157 +typedef enum {
18158 + /* this is light-weight inode, inheriting some state from its
18159 + parent */
18160 + REISER4_LIGHT_WEIGHT = 0,
18161 + /* stat data wasn't yet created */
18162 + REISER4_NO_SD = 1,
18163 + /* internal immutable flag. Currently is only used
18164 + to avoid race condition during file creation.
18165 + See comment in create_object(). */
18166 + REISER4_IMMUTABLE = 2,
18167 + /* inode was read from storage */
18168 + REISER4_LOADED = 3,
18169 + /* this bit is set for symlinks. inode->u.generic_ip points to target
18170 + name of symlink. */
18171 + REISER4_GENERIC_PTR_USED = 4,
18172 + /* set if size of stat-data item for this inode is known. If this is
18173 + * set we can avoid recalculating size of stat-data on each update. */
18174 + REISER4_SDLEN_KNOWN = 5,
18175 + /* reiser4_inode->crypt points to the crypto stat */
18176 + REISER4_CRYPTO_STAT_LOADED = 6,
18177 + /* cryptcompress_inode_data points to the secret key */
18178 + REISER4_SECRET_KEY_INSTALLED = 7,
18179 + /* File (possibly) has pages corresponding to the tail items that
18180 + * were created by ->readpage. It is set by mmap_unix_file() and
18181 + * sendfile_unix_file(). This bit is inspected by write_unix_file and
18182 + * kill-hook of tail items. It is never cleared once set. This bit is
18183 + * modified and inspected under i_mutex. */
18184 + REISER4_HAS_MMAP = 8,
18185 +
18186 + REISER4_PART_MIXED = 9,
18187 + REISER4_PART_IN_CONV = 10
18188 +} reiser4_file_plugin_flags;
18189 +
18190 +/* state associated with each inode.
18191 + reiser4 inode.
18192 +
18193 + NOTE-NIKITA In 2.5 kernels it is not necessary that all file-system inodes
18194 + be of the same size. File-system allocates inodes by itself through
18195 + s_op->allocate_inode() method. So, it is possible to adjust size of inode
18196 + at the time of its creation.
18197 +
18198 + Invariants involving parts of this data-type:
18199 +
18200 + [inode->eflushed]
18201 +
18202 +*/
18203 +
18204 +typedef struct reiser4_inode reiser4_inode;
18205 +/* return pointer to reiser4-specific part of inode */
18206 +static inline reiser4_inode *reiser4_inode_data(const struct inode *inode
18207 + /* inode queried */ );
18208 +
18209 +#if BITS_PER_LONG == 64
18210 +
18211 +#define REISER4_INO_IS_OID (1)
18212 +typedef struct {;
18213 +} oid_hi_t;
18214 +
18215 +/* BITS_PER_LONG == 64 */
18216 +#else
18217 +
18218 +#define REISER4_INO_IS_OID (0)
18219 +typedef __u32 oid_hi_t;
18220 +
18221 +/* BITS_PER_LONG == 64 */
18222 +#endif
18223 +
18224 +struct reiser4_inode {
18225 + /* spin lock protecting fields of this structure. */
18226 + spinlock_t guard;
18227 + /* object plugins */
18228 + plugin_set *pset;
18229 + /* plugins set for inheritance */
18230 + plugin_set *hset;
18231 + /* high 32 bits of object id */
18232 + oid_hi_t oid_hi;
18233 + /* seal for stat-data */
18234 + seal_t sd_seal;
18235 + /* locality id for this file */
18236 + oid_t locality_id;
18237 +#if REISER4_LARGE_KEY
18238 + __u64 ordering;
18239 +#endif
18240 + /* coord of stat-data in sealed node */
18241 + coord_t sd_coord;
18242 + /* bit-mask of stat-data extensions used by this file */
18243 + __u64 extmask;
18244 + /* bitmask of non-default plugins for this inode */
18245 + __u16 plugin_mask;
18246 + union {
18247 + struct list_head readdir_list;
18248 + struct list_head not_used;
18249 + } lists;
18250 + /* per-inode flags. Filled by values of reiser4_file_plugin_flags */
18251 + unsigned long flags;
18252 + union {
18253 + /* fields specific to unix_file plugin */
18254 + unix_file_info_t unix_file_info;
18255 + /* fields specific to cryptcompress plugin */
18256 + cryptcompress_info_t cryptcompress_info;
18257 + } file_plugin_data;
18258 +
18259 + /* tree of jnodes. Phantom jnodes (ones not attached to any atom) are
18260 + tagged in that tree by EFLUSH_TAG_ANONYMOUS */
18261 + struct radix_tree_root jnodes_tree;
18262 +#if REISER4_DEBUG
18263 + /* number of unformatted node jnodes of this file in jnode hash table */
18264 + unsigned long nr_jnodes;
18265 +#endif
18266 +
18267 + /* block number of virtual root for this object. See comment above
18268 + * fs/reiser4/search.c:handle_vroot() */
18269 + reiser4_block_nr vroot;
18270 + struct semaphore loading;
18271 +};
18272 +
18273 +void loading_init_once(reiser4_inode *);
18274 +void loading_alloc(reiser4_inode *);
18275 +void loading_destroy(reiser4_inode *);
18276 +
18277 +typedef struct reiser4_inode_object {
18278 + /* private part */
18279 + reiser4_inode p;
18280 + /* generic fields not specific to reiser4, but used by VFS */
18281 + struct inode vfs_inode;
18282 +} reiser4_inode_object;
18283 +
18284 +/* return pointer to the reiser4 specific portion of @inode */
18285 +static inline reiser4_inode *reiser4_inode_data(const struct inode *inode
18286 + /* inode queried */ )
18287 +{
18288 + assert("nikita-254", inode != NULL);
18289 + return &container_of(inode, reiser4_inode_object, vfs_inode)->p;
18290 +}
18291 +
18292 +static inline struct inode *inode_by_reiser4_inode(const reiser4_inode *
18293 + r4_inode /* inode queried */
18294 + )
18295 +{
18296 + return &container_of(r4_inode, reiser4_inode_object, p)->vfs_inode;
18297 +}
18298 +
18299 +/*
18300 + * reiser4 inodes are identified by 64bit object-id (oid_t), but in struct
18301 + * inode ->i_ino field is of type ino_t (long) that can be either 32 or 64
18302 + * bits.
18303 + *
18304 + * If ->i_ino is 32 bits we store the remaining 32 bits in the reiser4-specific
18305 + * part of the inode; otherwise the whole oid is stored in i_ino.
18306 + *
18307 + * Wrappers below ([sg]et_inode_oid()) are used to hide this difference.
18308 + */
18309 +
18310 +#define OID_HI_SHIFT (sizeof(ino_t) * 8)
18311 +
18312 +#if REISER4_INO_IS_OID
18313 +
18314 +static inline oid_t get_inode_oid(const struct inode *inode)
18315 +{
18316 + return inode->i_ino;
18317 +}
18318 +
18319 +static inline void set_inode_oid(struct inode *inode, oid_t oid)
18320 +{
18321 + inode->i_ino = oid;
18322 +}
18323 +
18324 +/* REISER4_INO_IS_OID */
18325 +#else
18326 +
18327 +static inline oid_t get_inode_oid(const struct inode *inode)
18328 +{
18329 + return
18330 + ((__u64) reiser4_inode_data(inode)->oid_hi << OID_HI_SHIFT) |
18331 + inode->i_ino;
18332 +}
18333 +
18334 +static inline void set_inode_oid(struct inode *inode, oid_t oid)
18335 +{
18336 + assert("nikita-2519", inode != NULL);
18337 + inode->i_ino = (ino_t) (oid);
18338 + reiser4_inode_data(inode)->oid_hi = (oid) >> OID_HI_SHIFT;
18339 + assert("nikita-2521", get_inode_oid(inode) == (oid));
18340 +}
18341 +
18342 +/* REISER4_INO_IS_OID */
18343 +#endif
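+
+/* Worked example (32-bit ino_t case, for illustration): for oid
+ * 0x0000000A00000001, set_inode_oid() stores 0x00000001 in ->i_ino and
+ * 0x0000000A in ->oid_hi; get_inode_oid() then reassembles
+ * ((__u64)0x0000000A << OID_HI_SHIFT) | 0x00000001 == 0x0000000A00000001.
+ */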
18344 +
18345 +static inline oid_t get_inode_locality(const struct inode *inode)
18346 +{
18347 + return reiser4_inode_data(inode)->locality_id;
18348 +}
18349 +
18350 +#if REISER4_LARGE_KEY
18351 +static inline __u64 get_inode_ordering(const struct inode *inode)
18352 +{
18353 + return reiser4_inode_data(inode)->ordering;
18354 +}
18355 +
18356 +static inline void set_inode_ordering(const struct inode *inode, __u64 ordering)
18357 +{
18358 + reiser4_inode_data(inode)->ordering = ordering;
18359 +}
18360 +
18361 +#else
18362 +
18363 +#define get_inode_ordering(inode) (0)
18364 +#define set_inode_ordering(inode, val) noop
18365 +
18366 +#endif
18367 +
18368 +/* return inode in which @uf_info is embedded */
18369 +static inline struct inode *unix_file_info_to_inode(const unix_file_info_t *
18370 + uf_info)
18371 +{
18372 + return &container_of(uf_info, reiser4_inode_object,
18373 + p.file_plugin_data.unix_file_info)->vfs_inode;
18374 +}
18375 +
18376 +
18377 +extern ino_t oid_to_ino(oid_t oid) __attribute__ ((const));
18378 +extern ino_t oid_to_uino(oid_t oid) __attribute__ ((const));
18379 +
18380 +extern reiser4_tree *tree_by_inode(const struct inode *inode);
18381 +
18382 +#if REISER4_DEBUG
18383 +extern void inode_invariant(const struct inode *inode);
18384 +extern int inode_has_no_jnodes(reiser4_inode *);
18385 +#else
18386 +#define inode_invariant(inode) noop
18387 +#endif
18388 +
18389 +static inline int spin_inode_is_locked(const struct inode *inode)
18390 +{
18391 + assert_spin_locked(&reiser4_inode_data(inode)->guard);
18392 + return 1;
18393 +}
18394 +
18395 +/**
18396 + * spin_lock_inode - lock reiser4_inode's embedded spinlock
18397 + * @inode: inode to lock
18398 + *
18399 + * In debug mode it checks that lower priority locks are not held and
18400 + * increments reiser4_context's lock counters on which lock ordering checking
18401 + * is based.
18402 + */
18403 +static inline void spin_lock_inode(struct inode *inode)
18404 +{
18405 + assert("", LOCK_CNT_NIL(spin_locked));
18406 + /* check lock ordering */
18407 + assert_spin_not_locked(&d_lock);
18408 +
18409 + spin_lock(&reiser4_inode_data(inode)->guard);
18410 +
18411 + LOCK_CNT_INC(spin_locked_inode);
18412 + LOCK_CNT_INC(spin_locked);
18413 +
18414 + inode_invariant(inode);
18415 +}
18416 +
18417 +/**
18418 + * spin_unlock_inode - unlock reiser4_inode's embedded spinlock
18419 + * @inode: inode to unlock
18420 + *
18421 + * In debug mode it checks that spinlock is held and decrements
18422 + * reiser4_context's lock counters on which lock ordering checking is based.
18423 + */
18424 +static inline void spin_unlock_inode(struct inode *inode)
18425 +{
18426 + assert_spin_locked(&reiser4_inode_data(inode)->guard);
18427 + assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_inode));
18428 + assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
18429 +
18430 + inode_invariant(inode);
18431 +
18432 + LOCK_CNT_DEC(spin_locked_inode);
18433 + LOCK_CNT_DEC(spin_locked);
18434 +
18435 + spin_unlock(&reiser4_inode_data(inode)->guard);
18436 +}
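+
+/* Usage sketch: fields of reiser4_inode are updated under the embedded
+ * spinlock; compare inode_set_extension() in inode.c, which asserts that the
+ * lock is held. A hypothetical caller would look like:
+ *
+ * spin_lock_inode(inode);
+ * reiser4_inode_data(inode)->extmask |= 1 << ext;
+ * spin_unlock_inode(inode);
+ */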
18437 +
18438 +
18439 +extern znode *inode_get_vroot(struct inode *inode);
18440 +extern void inode_set_vroot(struct inode *inode, znode * vroot);
18441 +
18442 +extern int reiser4_max_filename_len(const struct inode *inode);
18443 +extern int max_hash_collisions(const struct inode *dir);
18444 +extern void reiser4_unlock_inode(struct inode *inode);
18445 +extern int is_reiser4_inode(const struct inode *inode);
18446 +extern int setup_inode_ops(struct inode *inode, reiser4_object_create_data *);
18447 +extern struct inode *reiser4_iget(struct super_block *super,
18448 + const reiser4_key * key, int silent);
18449 +extern void reiser4_iget_complete(struct inode *inode);
18450 +extern void inode_set_flag(struct inode *inode, reiser4_file_plugin_flags f);
18451 +extern void inode_clr_flag(struct inode *inode, reiser4_file_plugin_flags f);
18452 +extern int inode_get_flag(const struct inode *inode,
18453 + reiser4_file_plugin_flags f);
18454 +
18455 +/* has inode been initialized? */
18456 +static inline int
18457 +is_inode_loaded(const struct inode *inode /* inode queried */ )
18458 +{
18459 + assert("nikita-1120", inode != NULL);
18460 + return inode_get_flag(inode, REISER4_LOADED);
18461 +}
18462 +
18463 +extern file_plugin *inode_file_plugin(const struct inode *inode);
18464 +extern dir_plugin *inode_dir_plugin(const struct inode *inode);
18465 +extern formatting_plugin *inode_formatting_plugin(const struct inode *inode);
18466 +extern hash_plugin *inode_hash_plugin(const struct inode *inode);
18467 +extern fibration_plugin *inode_fibration_plugin(const struct inode *inode);
18468 +extern cipher_plugin *inode_cipher_plugin(const struct inode *inode);
18469 +extern digest_plugin *inode_digest_plugin(const struct inode *inode);
18470 +extern compression_plugin *inode_compression_plugin(const struct inode *inode);
18471 +extern compression_mode_plugin *inode_compression_mode_plugin(const struct inode
18472 + *inode);
18473 +extern cluster_plugin *inode_cluster_plugin(const struct inode *inode);
18474 +extern regular_plugin *inode_regular_plugin(const struct inode *inode);
18475 +extern item_plugin *inode_sd_plugin(const struct inode *inode);
18476 +extern item_plugin *inode_dir_item_plugin(const struct inode *inode);
18477 +
18478 +extern void inode_set_plugin(struct inode *inode,
18479 + reiser4_plugin * plug, pset_member memb);
18480 +extern void reiser4_make_bad_inode(struct inode *inode);
18481 +
18482 +extern void inode_set_extension(struct inode *inode, sd_ext_bits ext);
18483 +extern void inode_check_scale(struct inode *inode, __u64 old, __u64 new);
18484 +extern void inode_check_scale_nolock(struct inode * inode, __u64 old, __u64 new);
18485 +
18486 +/*
18487 + * update field @field in inode @i to contain value @value.
18488 + */
18489 +#define INODE_SET_FIELD(i, field, value) \
18490 +({ \
18491 + struct inode *__i; \
18492 + typeof(value) __v; \
18493 + \
18494 + __i = (i); \
18495 + __v = (value); \
18496 + inode_check_scale(__i, __i->field, __v); \
18497 + __i->field = __v; \
18498 +})
18499 +
18500 +#define INODE_INC_FIELD(i, field) \
18501 +({ \
18502 + struct inode *__i; \
18503 + \
18504 + __i = (i); \
18505 + inode_check_scale(__i, __i->field, __i->field + 1); \
18506 + ++ __i->field; \
18507 +})
18508 +
18509 +#define INODE_DEC_FIELD(i, field) \
18510 +({ \
18511 + struct inode *__i; \
18512 + \
18513 + __i = (i); \
18514 + inode_check_scale(__i, __i->field, __i->field - 1); \
18515 + -- __i->field; \
18516 +})
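+
+/* Usage sketch (hypothetical call sites): these macros wrap a plain field
+ * update with inode_check_scale(), so that a change in the on-disk width of
+ * the value invalidates the cached stat-data length:
+ *
+ * INODE_SET_FIELD(inode, i_size, new_size);
+ * INODE_INC_FIELD(inode, i_nlink);
+ */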
18517 +
18518 +/* See comment before readdir_common() for description. */
18519 +static inline struct list_head *get_readdir_list(const struct inode *inode)
18520 +{
18521 + return &reiser4_inode_data(inode)->lists.readdir_list;
18522 +}
18523 +
18524 +extern void init_inode_ordering(struct inode *inode,
18525 + reiser4_object_create_data * crd, int create);
18526 +
18527 +static inline struct radix_tree_root *jnode_tree_by_inode(struct inode *inode)
18528 +{
18529 + return &reiser4_inode_data(inode)->jnodes_tree;
18530 +}
18531 +
18532 +static inline struct radix_tree_root *jnode_tree_by_reiser4_inode(reiser4_inode
18533 + * r4_inode)
18534 +{
18535 + return &r4_inode->jnodes_tree;
18536 +}
18537 +
18538 +#if REISER4_DEBUG
18539 +extern void print_inode(const char *prefix, const struct inode *i);
18540 +#endif
18541 +
18542 +int is_dir_empty(const struct inode *);
18543 +
18544 +/* __REISER4_INODE_H__ */
18545 +#endif
18546 +
18547 +/* Make Linus happy.
18548 + Local variables:
18549 + c-indentation-style: "K&R"
18550 + mode-name: "LC"
18551 + c-basic-offset: 8
18552 + tab-width: 8
18553 + fill-column: 120
18554 + End:
18555 +*/
18556 Index: linux-2.6.16/fs/reiser4/ioctl.h
18557 ===================================================================
18558 --- /dev/null
18559 +++ linux-2.6.16/fs/reiser4/ioctl.h
18560 @@ -0,0 +1,41 @@
18561 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
18562 + * reiser4/README */
18563 +
18564 +#if !defined( __REISER4_IOCTL_H__ )
18565 +#define __REISER4_IOCTL_H__
18566 +
18567 +#include <linux/fs.h>
18568 +
18569 +/*
18570 + * ioctl(2) command used to "unpack" reiser4 file, that is, convert it into
18571 + * extents and fix it in this state. This is used by applications that rely on
18572 + *
18573 + * . files being block aligned, and
18574 + *
18575 + * . files never migrating on disk
18576 + *
18577 + * for example, boot loaders (LILO) need this.
18578 + *
18579 + * This ioctl should be used as
18580 + *
18581 + * result = ioctl(fd, REISER4_IOC_UNPACK);
18582 + *
18583 + * The file behind the fd descriptor will be converted to extents (if
18584 + * necessary), and its stat-data will be updated so that it will never be
18585 + * converted back into tails again.
18586 + */
18587 +#define REISER4_IOC_UNPACK _IOW(0xCD,1,long)
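+
+/* User-space sketch of the call described above (file name is illustrative):
+ *
+ * int fd = open("/boot/vmlinuz", O_RDONLY);
+ * if (fd >= 0 && ioctl(fd, REISER4_IOC_UNPACK) != 0)
+ * perror("REISER4_IOC_UNPACK");
+ */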
18588 +
18589 +/* __REISER4_IOCTL_H__ */
18590 +#endif
18591 +
18592 +/* Make Linus happy.
18593 + Local variables:
18594 + c-indentation-style: "K&R"
18595 + mode-name: "LC"
18596 + c-basic-offset: 8
18597 + tab-width: 8
18598 + fill-column: 120
18599 + scroll-step: 1
18600 + End:
18601 +*/
18602 Index: linux-2.6.16/fs/reiser4/jnode.c
18603 ===================================================================
18604 --- /dev/null
18605 +++ linux-2.6.16/fs/reiser4/jnode.c
18606 @@ -0,0 +1,1921 @@
18607 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
18608 + * reiser4/README */
18609 +/* Jnode manipulation functions. */
18610 +/* Jnode is an entity used to track blocks with data and meta-data in reiser4.
18611 +
18612 + In particular, jnodes are used to track transactional information
18613 + associated with each block. Each znode contains jnode as ->zjnode field.
18614 +
18615 + Jnode stands for either Josh or Journal node.
18616 +*/
18617 +
18618 +/*
18619 + * Taxonomy.
18620 + *
18621 + * A jnode represents a block containing data or meta-data. There are jnodes
18622 + * for:
18623 + *
18624 + * unformatted blocks (jnodes proper). There are plans, however, to
18625 + * have a handle per extent unit rather than per each unformatted
18626 + * block, because there are so many of them.
18627 + *
18628 + * For bitmaps. Each bitmap is actually represented by two jnodes--one
18629 + * for working and another for "commit" data, together forming bnode.
18630 + *
18631 + * For io-heads. These are used by log writer.
18632 + *
18633 + * For formatted nodes (znode). See comment at the top of znode.c for
18634 + * details specific to the formatted nodes (znodes).
18635 + *
18636 + * Node data.
18637 + *
18638 + * A jnode provides access to the data of the node it represents. Data are
18639 + * stored in a page. The page is kept in the page cache. This means that jnodes
18640 + * are highly interconnected with page cache and VM internals.
18641 + *
18642 + * A jnode has a pointer to the page (->pg) containing its data. A pointer to
18643 + * the data itself is cached in the ->data field to avoid frequent calls to
18644 + * page_address().
18645 + *
18646 + * jnode and page are attached to each other by jnode_attach_page(). This
18647 + * function places pointer to jnode in set_page_private(), sets PG_private
18648 + * flag and increments page counter.
18649 + *
18650 + * Opposite operation is performed by page_clear_jnode().
18651 + *
18652 + * jnode->pg is protected by jnode spin lock, and page->private is
18653 + * protected by page lock. See comment at the top of page_cache.c for
18654 + * more.
18655 + *
18656 + * page can be detached from jnode for two reasons:
18657 + *
18658 + * . jnode is removed from a tree (file is truncated, or a formatted
18659 + * node is removed by balancing).
18660 + *
18661 + * . during memory pressure, VM calls ->releasepage() method
18662 + * (reiser4_releasepage()) to evict page from memory.
18663 + *
18664 + * (there, of course, is also umount, but this is special case we are not
18665 + * concerned with here).
18666 + *
18667 + * To protect jnode page from eviction, one calls jload() function that
18668 + * "pins" page in memory (loading it if necessary), increments
18669 + * jnode->d_count, and kmap()s page. Page is unpinned through call to
18670 + * jrelse().
18671 + *
18672 + * Jnode life cycle.
18673 + *
18674 + * jnode is created, placed in hash table, and, optionally, in per-inode
18675 + * radix tree. Page can be attached to jnode, pinned, released, etc.
18676 + *
18677 + * When jnode is captured into atom its reference counter is
18678 + * increased. While being part of an atom, jnode can be "early
18679 + * flushed". This means that as part of flush procedure, jnode is placed
18680 + * into "relocate set", and its page is submitted to the disk. After io
18681 + * completes, page can be detached, then loaded again, re-dirtied, etc.
18682 + *
18683 + * A thread acquires a reference to a jnode by calling jref() and releases it by
18684 + * jput(). When last reference is removed, jnode is still retained in
18685 + * memory (cached) if it has page attached, _unless_ it is scheduled for
18686 + * destruction (has JNODE_HEARD_BANSHEE bit set).
18687 + *
18688 + * Tree read-write lock was used as "existential" lock for jnodes. That is,
18689 + * jnode->x_count could be changed from 0 to 1 only under tree write lock,
18690 + * that is, tree lock protected unreferenced jnodes stored in the hash
18691 + * table, from recycling.
18692 + *
18693 + * This resulted in high contention on tree lock, because jref()/jput() is
18694 + * frequent operation. To ameliorate this problem, RCU is used: when jput()
18695 + * is just about to release last reference on jnode it sets JNODE_RIP bit
18696 + * on it, and then proceeds with jnode destruction (removing jnode from hash
18697 + * table, cbk_cache, detaching page, etc.). All places that change jnode
18698 + * reference counter from 0 to 1 (jlookup(), zlook(), zget(), and
18699 + * cbk_cache_scan_slots()) check for JNODE_RIP bit (this is done by
18700 + * jnode_rip_check() function), and pretend that nothing was found in hash
18701 + * table if bit is set.
18702 + *
18703 + * jput() defers the actual return of a jnode into the slab cache to some later
18704 + * time (by call_rcu()); this guarantees that other threads can safely continue
18705 + * working with JNODE_RIP-ped jnode.
18706 + *
18707 + */
18708 +
18709 +#include "reiser4.h"
18710 +#include "debug.h"
18711 +#include "dformat.h"
18712 +#include "jnode.h"
18713 +#include "plugin/plugin_header.h"
18714 +#include "plugin/plugin.h"
18715 +#include "txnmgr.h"
18717 +#include "znode.h"
18718 +#include "tree.h"
18719 +#include "tree_walk.h"
18720 +#include "super.h"
18721 +#include "inode.h"
18722 +#include "page_cache.h"
18723 +
18724 +#include <asm/uaccess.h> /* UML needs this for PAGE_OFFSET */
18725 +#include <linux/types.h>
18726 +#include <linux/slab.h>
18727 +#include <linux/pagemap.h>
18728 +#include <linux/vmalloc.h> /* for vmalloc(), vfree() */
18729 +#include <linux/swap.h>
18730 +#include <linux/fs.h> /* for struct address_space */
18731 +#include <linux/writeback.h> /* for inode_lock */
18732 +
18733 +static kmem_cache_t *_jnode_slab = NULL;
18734 +
18735 +static void jnode_set_type(jnode * node, jnode_type type);
18736 +static int jdelete(jnode * node);
18737 +static int jnode_try_drop(jnode * node);
18738 +
18739 +#if REISER4_DEBUG
18740 +static int jnode_invariant(const jnode * node, int tlocked, int jlocked);
18741 +#endif
18742 +
18743 +/* true if jnode's content has been parsed and verified (JNODE_PARSED is set) */
18744 +static inline int jnode_is_parsed(jnode * node)
18745 +{
18746 + return JF_ISSET(node, JNODE_PARSED);
18747 +}
18748 +
18749 +/* hash table support */
18750 +
18751 +/* compare two jnode keys for equality. Used by hash-table macros */
18752 +static inline int jnode_key_eq(const jnode_key_t * k1, const jnode_key_t * k2)
18753 +{
18754 + assert("nikita-2350", k1 != NULL);
18755 + assert("nikita-2351", k2 != NULL);
18756 +
18757 + return (k1->index == k2->index && k1->objectid == k2->objectid);
18758 +}
18759 +
18760 +/* Hash jnode by its key (inode plus offset). Used by hash-table macros */
18761 +static inline __u32
18762 +jnode_key_hashfn(j_hash_table * table, const jnode_key_t * key)
18763 +{
18764 + assert("nikita-2352", key != NULL);
18765 + assert("nikita-3346", IS_POW(table->_buckets));
18766 +
18767 + /* yes, this is a remarkably simple (if not stupid) hash function. */
18768 + return (key->objectid + key->index) & (table->_buckets - 1);
18769 +}
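+
+/* Worked example: with table->_buckets == 16384 (the size used by
+ * jnodes_tree_init() below, 16384 == 1 << 14), objectid 0x2a and index 5
+ * hash to (0x2a + 5) & 0x3fff == 0x2f. */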
18770 +
18771 +/* The hash table definition */
18772 +#define KMALLOC(size) vmalloc(size)
18773 +#define KFREE(ptr, size) vfree(ptr)
18774 +TYPE_SAFE_HASH_DEFINE(j, jnode, jnode_key_t, key.j, link.j, jnode_key_hashfn,
18775 + jnode_key_eq);
18776 +#undef KFREE
18777 +#undef KMALLOC
18778 +
18779 +/* call this to initialise jnode hash table */
18780 +int jnodes_tree_init(reiser4_tree * tree /* tree to initialise jnodes for */ )
18781 +{
18782 + assert("nikita-2359", tree != NULL);
18783 + return j_hash_init(&tree->jhash_table, 16384);
18784 +}
18785 +
18786 +/* call this to destroy jnode hash table. This is called during umount. */
18787 +int jnodes_tree_done(reiser4_tree * tree /* tree to destroy jnodes for */ )
18788 +{
18789 + j_hash_table *jtable;
18790 + jnode *node;
18791 + jnode *next;
18792 +
18793 + assert("nikita-2360", tree != NULL);
18794 +
18795 + /*
18796 + * Scan hash table and free all jnodes.
18797 + */
18798 + jtable = &tree->jhash_table;
18799 + if (jtable->_table) {
18800 + for_all_in_htable(jtable, j, node, next) {
18801 + assert("nikita-2361", !atomic_read(&node->x_count));
18802 + jdrop(node);
18803 + }
18804 +
18805 + j_hash_done(&tree->jhash_table);
18806 + }
18807 + return 0;
18808 +}
18809 +
18810 +/**
18811 + * init_jnodes - create jnode cache
18812 + *
18813 + * Initializes slab cache jnodes. It is part of reiser4 module initialization.
18814 + */
18815 +int init_jnodes(void)
18816 +{
18817 + assert("umka-168", _jnode_slab == NULL);
18818 +
18819 + _jnode_slab = kmem_cache_create("jnode", sizeof(jnode), 0,
18820 + SLAB_HWCACHE_ALIGN |
18821 + SLAB_RECLAIM_ACCOUNT, NULL, NULL);
18822 + if (_jnode_slab == NULL)
18823 + return RETERR(-ENOMEM);
18824 +
18825 + return 0;
18826 +}
18827 +
18828 +/**
18829 + * done_jnodes - delete jnode cache
18830 + *
18831 + * This is called on reiser4 module unloading or system shutdown.
18832 + */
18833 +void done_jnodes(void)
18834 +{
18835 + destroy_reiser4_cache(&_jnode_slab);
18836 +}
18837 +
18838 +/* Initialize a jnode. */
18839 +void jnode_init(jnode * node, reiser4_tree * tree, jnode_type type)
18840 +{
18841 + assert("umka-175", node != NULL);
18842 +
18843 + memset(node, 0, sizeof(jnode));
18844 + ON_DEBUG(node->magic = JMAGIC);
18845 + jnode_set_type(node, type);
18846 + atomic_set(&node->d_count, 0);
18847 + atomic_set(&node->x_count, 0);
18848 + spin_lock_init(&node->guard);
18849 + spin_lock_init(&node->load);
18850 + node->atom = NULL;
18851 + node->tree = tree;
18852 + INIT_LIST_HEAD(&node->capture_link);
18853 +
18854 + ASSIGN_NODE_LIST(node, NOT_CAPTURED);
18855 +
18856 + INIT_RCU_HEAD(&node->rcu);
18857 +
18858 +#if REISER4_DEBUG
18859 + {
18860 + reiser4_super_info_data *sbinfo;
18861 +
18862 + sbinfo = get_super_private(tree->super);
18863 + spin_lock_irq(&sbinfo->all_guard);
18864 + list_add(&node->jnodes, &sbinfo->all_jnodes);
18865 + spin_unlock_irq(&sbinfo->all_guard);
18866 + }
18867 +#endif
18868 +}
18869 +
18870 +#if REISER4_DEBUG
18871 +/*
18872 + * Remove jnode from ->all_jnodes list.
18873 + */
18874 +static void jnode_done(jnode * node, reiser4_tree * tree)
18875 +{
18876 + reiser4_super_info_data *sbinfo;
18877 +
18878 + sbinfo = get_super_private(tree->super);
18879 +
18880 + spin_lock_irq(&sbinfo->all_guard);
18881 + assert("nikita-2422", !list_empty(&node->jnodes));
18882 + list_del_init(&node->jnodes);
18883 + spin_unlock_irq(&sbinfo->all_guard);
18884 +}
18885 +#endif
18886 +
18887 +/* return already existing jnode of page */
18888 +jnode *jnode_by_page(struct page *pg)
18889 +{
18890 + assert("nikita-2066", pg != NULL);
18891 + assert("nikita-2400", PageLocked(pg));
18892 + assert("nikita-2068", PagePrivate(pg));
18893 + assert("nikita-2067", jprivate(pg) != NULL);
18894 + return jprivate(pg);
18895 +}
18896 +
18897 +/* exported functions to allocate/free jnode objects outside this file */
18898 +jnode *jalloc(void)
18899 +{
18900 + jnode *jal = kmem_cache_alloc(_jnode_slab, get_gfp_mask());
18901 + return jal;
18902 +}
18903 +
18904 +/* return jnode back to the slab allocator */
18905 +inline void jfree(jnode * node)
18906 +{
18907 + assert("zam-449", node != NULL);
18908 +
18909 + assert("nikita-2663", (list_empty_careful(&node->capture_link) &&
18910 + NODE_LIST(node) == NOT_CAPTURED));
18911 + assert("nikita-3222", list_empty(&node->jnodes));
18912 + assert("nikita-3221", jnode_page(node) == NULL);
18913 +
18914 + /* not yet phash_jnode_destroy(node); */
18915 +
18916 + kmem_cache_free(_jnode_slab, node);
18917 +}
18918 +
18919 +/*
18920 + * This function is supplied as an RCU callback. It actually frees the jnode when
18921 + * last reference to it is gone.
18922 + */
18923 +static void jnode_free_actor(struct rcu_head *head)
18924 +{
18925 + jnode *node;
18926 + jnode_type jtype;
18927 +
18928 + node = container_of(head, jnode, rcu);
18929 + jtype = jnode_get_type(node);
18930 +
18931 + ON_DEBUG(jnode_done(node, jnode_get_tree(node)));
18932 +
18933 + switch (jtype) {
18934 + case JNODE_IO_HEAD:
18935 + case JNODE_BITMAP:
18936 + case JNODE_UNFORMATTED_BLOCK:
18937 + jfree(node);
18938 + break;
18939 + case JNODE_FORMATTED_BLOCK:
18940 + zfree(JZNODE(node));
18941 + break;
18942 + case JNODE_INODE:
18943 + default:
18944 + wrong_return_value("nikita-3197", "Wrong jnode type");
18945 + }
18946 +}
18947 +
18948 +/*
18949 + * Free a jnode. Post a callback to be executed later through RCU when all
18950 + * references to @node are released.
18951 + */
18952 +static inline void jnode_free(jnode * node, jnode_type jtype)
18953 +{
18954 + if (jtype != JNODE_INODE) {
18955 + /*assert("nikita-3219", list_empty(&node->rcu.list)); */
18956 + call_rcu(&node->rcu, jnode_free_actor);
18957 + } else
18958 + jnode_list_remove(node);
18959 +}
18960 +
18961 +/* allocate new unformatted jnode */
18962 +static jnode *jnew_unformatted(void)
18963 +{
18964 + jnode *jal;
18965 +
18966 + jal = jalloc();
18967 + if (jal == NULL)
18968 + return NULL;
18969 +
18970 + jnode_init(jal, current_tree, JNODE_UNFORMATTED_BLOCK);
18971 + jal->key.j.mapping = NULL;
18972 + jal->key.j.index = (unsigned long)-1;
18973 + jal->key.j.objectid = 0;
18974 + return jal;
18975 +}
18976 +
18977 +/* look for jnode with given mapping and offset within hash table */
18978 +jnode *jlookup(reiser4_tree * tree, oid_t objectid, unsigned long index)
18979 +{
18980 + jnode_key_t jkey;
18981 + jnode *node;
18982 +
18983 + assert("nikita-2353", tree != NULL);
18984 +
18985 + jkey.objectid = objectid;
18986 + jkey.index = index;
18987 +
18988 + /*
18989 + * hash table is _not_ protected by any lock during lookups. All we
18990 + * have to do is to disable preemption to keep RCU happy.
18991 + */
18992 +
18993 + rcu_read_lock();
18994 + node = j_hash_find(&tree->jhash_table, &jkey);
18995 + if (node != NULL) {
18996 + /* protect @node from recycling */
18997 + jref(node);
18998 + assert("nikita-2955", jnode_invariant(node, 0, 0));
18999 + node = jnode_rip_check(tree, node);
19000 + }
19001 + rcu_read_unlock();
19002 + return node;
19003 +}
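+
+/* Usage sketch (hypothetical caller): the reference obtained through jlookup()
+ * is dropped with jput() when the caller is done; a NULL return means the
+ * block has no jnode, or its jnode is being recycled (JNODE_RIP):
+ *
+ * node = jlookup(tree, oid, index);
+ * if (node != NULL) {
+ * ... use node ...
+ * jput(node);
+ * }
+ */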
19004 +
19005 +/* per inode radix tree of jnodes is protected by tree's read write spin lock */
19006 +static jnode *jfind_nolock(struct address_space *mapping, unsigned long index)
19007 +{
19008 + assert("vs-1694", mapping->host != NULL);
19009 +
19010 + return radix_tree_lookup(jnode_tree_by_inode(mapping->host), index);
19011 +}
19012 +
19013 +jnode *jfind(struct address_space * mapping, unsigned long index)
19014 +{
19015 + reiser4_tree *tree;
19016 + jnode *node;
19017 +
19018 + assert("vs-1694", mapping->host != NULL);
19019 + tree = tree_by_inode(mapping->host);
19020 +
19021 + read_lock_tree(tree);
19022 + node = jfind_nolock(mapping, index);
19023 + if (node != NULL)
19024 + jref(node);
19025 + read_unlock_tree(tree);
19026 + return node;
19027 +}
19028 +
19029 +static void inode_attach_jnode(jnode * node)
19030 +{
19031 + struct inode *inode;
19032 + reiser4_inode *info;
19033 + struct radix_tree_root *rtree;
19034 +
19035 + assert_rw_write_locked(&(jnode_get_tree(node)->tree_lock));
19036 + assert("zam-1043", node->key.j.mapping != NULL);
19037 + inode = node->key.j.mapping->host;
19038 + info = reiser4_inode_data(inode);
19039 + rtree = jnode_tree_by_reiser4_inode(info);
19040 + if (rtree->rnode == NULL) {
19041 + /* prevent inode from being pruned when it has jnodes attached
19042 + to it */
19043 + write_lock_irq(&inode->i_data.tree_lock);
19044 + inode->i_data.nrpages++;
19045 + write_unlock_irq(&inode->i_data.tree_lock);
19046 + }
19047 + assert("zam-1049", equi(rtree->rnode != NULL, info->nr_jnodes != 0));
19048 + check_me("zam-1045",
19049 + !radix_tree_insert(rtree, node->key.j.index, node));
19050 + ON_DEBUG(info->nr_jnodes++);
19051 +}
19052 +
19053 +static void inode_detach_jnode(jnode * node)
19054 +{
19055 + struct inode *inode;
19056 + reiser4_inode *info;
19057 + struct radix_tree_root *rtree;
19058 +
19059 + assert_rw_write_locked(&(jnode_get_tree(node)->tree_lock));
19060 + assert("zam-1044", node->key.j.mapping != NULL);
19061 + inode = node->key.j.mapping->host;
19062 + info = reiser4_inode_data(inode);
19063 + rtree = jnode_tree_by_reiser4_inode(info);
19064 +
19065 + assert("zam-1051", info->nr_jnodes != 0);
19066 + assert("zam-1052", rtree->rnode != NULL);
19067 + ON_DEBUG(info->nr_jnodes--);
19068 +
19069 + /* delete jnode from inode's radix tree of jnodes */
19070 + check_me("zam-1046", radix_tree_delete(rtree, node->key.j.index));
19071 + if (rtree->rnode == NULL) {
19072 + /* inode can be pruned now */
19073 + write_lock_irq(&inode->i_data.tree_lock);
19074 + inode->i_data.nrpages--;
19075 + write_unlock_irq(&inode->i_data.tree_lock);
19076 + }
19077 +}
19078 +
19079 +/* put jnode into the hash table (where it can be found by flush, which does not
19080 + know the mapping) and into the inode's tree of jnodes (where it can be found,
19081 + hopefully faster, in places where the mapping is known). Currently it is used by
19082 + fs/reiser4/plugin/item/extent_file_ops.c:index_extent_jnode when new jnode is
19083 + created */
19084 +static void
19085 +hash_unformatted_jnode(jnode * node, struct address_space *mapping,
19086 + unsigned long index)
19087 +{
19088 + j_hash_table *jtable;
19089 +
19090 + assert("vs-1446", jnode_is_unformatted(node));
19091 + assert("vs-1442", node->key.j.mapping == 0);
19092 + assert("vs-1443", node->key.j.objectid == 0);
19093 + assert("vs-1444", node->key.j.index == (unsigned long)-1);
19094 + assert_rw_write_locked(&(jnode_get_tree(node)->tree_lock));
19095 +
19096 + node->key.j.mapping = mapping;
19097 + node->key.j.objectid = get_inode_oid(mapping->host);
19098 + node->key.j.index = index;
19099 +
19100 + jtable = &jnode_get_tree(node)->jhash_table;
19101 +
19102 + /* race with some other thread inserting jnode into the hash table is
19103 + * impossible, because we keep the page lock. */
19104 + /*
19105 + * following assertion no longer holds because of RCU: it is possible
19106 + * jnode is in the hash table, but with JNODE_RIP bit set.
19107 + */
19108 + /* assert("nikita-3211", j_hash_find(jtable, &node->key.j) == NULL); */
19109 + j_hash_insert_rcu(jtable, node);
19110 + inode_attach_jnode(node);
19111 +}
19112 +
19113 +static void unhash_unformatted_node_nolock(jnode * node)
19114 +{
19115 + assert("vs-1683", node->key.j.mapping != NULL);
19116 + assert("vs-1684",
19117 + node->key.j.objectid ==
19118 + get_inode_oid(node->key.j.mapping->host));
19119 +
19120 + /* remove jnode from hash-table */
19121 + j_hash_remove_rcu(&node->tree->jhash_table, node);
19122 + inode_detach_jnode(node);
19123 + node->key.j.mapping = NULL;
19124 + node->key.j.index = (unsigned long)-1;
19125 + node->key.j.objectid = 0;
19126 +
19127 +}
19128 +
19129 +/* remove jnode from hash table and from inode's tree of jnodes. This is used in
19130 + reiser4_invalidatepage and in kill_hook_extent -> truncate_inode_jnodes ->
19131 + uncapture_jnode */
19132 +void unhash_unformatted_jnode(jnode * node)
19133 +{
19134 + assert("vs-1445", jnode_is_unformatted(node));
19135 +
19136 + write_lock_tree(node->tree);
19137 + unhash_unformatted_node_nolock(node);
19138 + write_unlock_tree(node->tree);
19139 +}
19140 +
19141 +/*
19142 + * search hash table for a jnode with given oid and index. If not found,
19143 + * allocate new jnode, insert it, and also insert into radix tree for the
19144 + * given inode/mapping.
19145 + */
19146 +jnode *find_get_jnode(reiser4_tree * tree, struct address_space *mapping,
19147 + oid_t oid, unsigned long index)
19148 +{
19149 + jnode *result;
19150 + jnode *shadow;
19151 + int preload;
19152 +
19153 + result = jnew_unformatted();
19154 +
19155 + if (unlikely(result == NULL))
19156 + return ERR_PTR(RETERR(-ENOMEM));
19157 +
19158 + preload = radix_tree_preload(get_gfp_mask());
19159 + if (preload != 0) {
+ /* free the just-allocated jnode to avoid leaking it on error */
+ jnode_free(result, JNODE_UNFORMATTED_BLOCK);
19160 + return ERR_PTR(preload);
+ }
19161 +
19162 + write_lock_tree(tree);
19163 + shadow = jfind_nolock(mapping, index);
19164 + if (likely(shadow == NULL)) {
19165 + /* add new jnode to hash table and inode's radix tree of jnodes */
19166 + jref(result);
19167 + hash_unformatted_jnode(result, mapping, index);
19168 + } else {
19169 + /* jnode is found in inode's radix tree of jnodes */
19170 + jref(shadow);
19171 + jnode_free(result, JNODE_UNFORMATTED_BLOCK);
19172 + assert("vs-1498", shadow->key.j.mapping == mapping);
19173 + result = shadow;
19174 + }
19175 + write_unlock_tree(tree);
19176 +
19177 + assert("nikita-2955",
19178 + ergo(result != NULL, jnode_invariant(result, 0, 0)));
19179 + radix_tree_preload_end();
19180 + return result;
19181 +}
19182 +
19183 +/* jget() (a la zget() but for unformatted nodes). Returns (and possibly
19184 + creates) jnode corresponding to page @pg. jnode is attached to page and
19185 + inserted into jnode hash-table. */
19186 +static jnode *do_jget(reiser4_tree * tree, struct page *pg)
19187 +{
19188 + /*
19189 + * There are two ways to create jnode: starting with pre-existing page
19190 + * and without page.
19191 + *
19192 + * When page already exists, jnode is created
19193 + * (jnode_of_page()->do_jget()) under page lock. This is done in
19194 + * ->writepage(), or when capturing anonymous page dirtied through
19195 + * mmap.
19196 + *
19197 + * Jnode without page is created by index_extent_jnode().
19198 + *
19199 + */
19200 +
19201 + jnode *result;
19202 + oid_t oid = get_inode_oid(pg->mapping->host);
19203 +
19204 + assert("umka-176", pg != NULL);
19205 + assert("nikita-2394", PageLocked(pg));
19206 +
19207 + result = jprivate(pg);
19208 + if (likely(result != NULL))
19209 + return jref(result);
19210 +
19211 + tree = tree_by_page(pg);
19212 +
19213 + /* check hash-table first */
19214 + result = jfind(pg->mapping, pg->index);
19215 + if (unlikely(result != NULL)) {
19216 + spin_lock_jnode(result);
19217 + jnode_attach_page(result, pg);
19218 + spin_unlock_jnode(result);
19219 + result->key.j.mapping = pg->mapping;
19220 + return result;
19221 + }
19222 +
19223 + result = find_get_jnode(tree, pg->mapping, oid, pg->index);
19224 + if (unlikely(IS_ERR(result)))
19225 + return result;
19226 + /* attach jnode to page */
19227 + spin_lock_jnode(result);
19228 + jnode_attach_page(result, pg);
19229 + spin_unlock_jnode(result);
19230 + return result;
19231 +}
19232 +
19233 +/*
19234 + * return jnode for @pg, creating it if necessary.
19235 + */
19236 +jnode *jnode_of_page(struct page * pg)
19237 +{
19238 + jnode *result;
19239 +
19240 + assert("umka-176", pg != NULL);
19241 + assert("nikita-2394", PageLocked(pg));
19242 +
19243 + result = do_jget(tree_by_page(pg), pg);
19244 +
19245 + if (REISER4_DEBUG && !IS_ERR(result)) {
19246 + assert("nikita-3210", result == jprivate(pg));
19247 + assert("nikita-2046", jnode_page(jprivate(pg)) == pg);
19248 + if (jnode_is_unformatted(jprivate(pg))) {
19249 + assert("nikita-2364",
19250 + jprivate(pg)->key.j.index == pg->index);
19251 + assert("nikita-2367",
19252 + jprivate(pg)->key.j.mapping == pg->mapping);
19253 + assert("nikita-2365",
19254 + jprivate(pg)->key.j.objectid ==
19255 + get_inode_oid(pg->mapping->host));
19256 + assert("vs-1200",
19257 + jprivate(pg)->key.j.objectid ==
19258 + pg->mapping->host->i_ino);
19259 + assert("nikita-2356",
19260 + jnode_is_unformatted(jnode_by_page(pg)));
19261 + }
19262 + assert("nikita-2956", jnode_invariant(jprivate(pg), 0, 0));
19263 + }
19264 + return result;
19265 +}
19266 +
19267 +/* attach page to jnode: set ->pg pointer in jnode, and ->private one in the
19268 + * page.*/
19269 +void jnode_attach_page(jnode * node, struct page *pg)
19270 +{
19271 + assert("nikita-2060", node != NULL);
19272 + assert("nikita-2061", pg != NULL);
19273 +
19274 + assert("nikita-2050", jprivate(pg) == 0ul);
19275 + assert("nikita-2393", !PagePrivate(pg));
19276 + assert("vs-1741", node->pg == NULL);
19277 +
19278 + assert("nikita-2396", PageLocked(pg));
19279 + assert_spin_locked(&(node->guard));
19280 +
19281 + page_cache_get(pg);
19282 + set_page_private(pg, (unsigned long)node);
19283 + node->pg = pg;
19284 + SetPagePrivate(pg);
19285 +}
19286 +
19287 +/* Dual to jnode_attach_page: break a binding between page and jnode */
19288 +void page_clear_jnode(struct page *page, jnode * node)
19289 +{
19290 + assert("nikita-2424", page != NULL);
19291 + assert("nikita-2425", PageLocked(page));
19292 + assert("nikita-2426", node != NULL);
19293 + assert_spin_locked(&(node->guard));
19294 + assert("nikita-2428", PagePrivate(page));
19295 +
19296 + assert("nikita-3551", !PageWriteback(page));
19297 +
19298 + JF_CLR(node, JNODE_PARSED);
19299 + set_page_private(page, 0ul);
19300 + ClearPagePrivate(page);
19301 + node->pg = NULL;
19302 + page_cache_release(page);
19303 +}
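+
+/* jnode_attach_page() and page_clear_jnode() are strict duals: attach takes
+ * a page reference (page_cache_get()) and sets PG_private, detach clears
+ * PG_private and drops that reference (page_cache_release()). Both require
+ * the page lock and the jnode spin lock, e.g. (sketch):
+ *
+ *	spin_lock_jnode(node);
+ *	jnode_attach_page(node, page);
+ *	spin_unlock_jnode(node);
+ */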
19304 +
19305 +/* this is used in only one place, to handle an error */
19306 +void
19307 +page_detach_jnode(struct page *page, struct address_space *mapping,
19308 + unsigned long index)
19309 +{
19310 + assert("nikita-2395", page != NULL);
19311 +
19312 + lock_page(page);
19313 + if ((page->mapping == mapping) && (page->index == index)
19314 + && PagePrivate(page)) {
19315 + jnode *node;
19316 +
19317 + node = jprivate(page);
19318 + spin_lock_jnode(node);
19319 + page_clear_jnode(page, node);
19320 + spin_unlock_jnode(node);
19321 + }
19322 + unlock_page(page);
19323 +}
19324 +
19325 +/* return @node page locked.
19326 +
19327 +   Lock ordering requires that one first take the page lock and afterwards
19328 +   the spin lock on the jnode attached to this page. Sometimes it is
19329 +   necessary to go in the opposite direction; this is done through the
19330 +   standard trylock-and-release loop.
19331 +*/
19332 +static struct page *jnode_lock_page(jnode * node)
19333 +{
19334 + struct page *page;
19335 +
19336 + assert("nikita-2052", node != NULL);
19337 + assert("nikita-2401", LOCK_CNT_NIL(spin_locked_jnode));
19338 +
19339 + while (1) {
19340 +
19341 + spin_lock_jnode(node);
19342 + page = jnode_page(node);
19343 + if (page == NULL) {
19344 + break;
19345 + }
19346 +
19347 + /* no need to page_cache_get( page ) here, because page cannot
19348 + be evicted from memory without detaching it from jnode and
19349 + this requires spin lock on jnode that we already hold.
19350 + */
19351 + if (!TestSetPageLocked(page)) {
19352 + /* We won a lock on jnode page, proceed. */
19353 + break;
19354 + }
19355 +
19356 + /* Page is locked by someone else. */
19357 + page_cache_get(page);
19358 + spin_unlock_jnode(node);
19359 + wait_on_page_locked(page);
19360 + /* it is possible that page was detached from jnode and
19361 + returned to the free pool, or re-assigned while we were
19362 + waiting on locked bit. This will be rechecked on the next
19363 + loop iteration.
19364 + */
19365 + page_cache_release(page);
19366 +
19367 + /* try again */
19368 + }
19369 + return page;
19370 +}
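+
+/* The loop above is an instance of the generic trylock-and-release pattern
+ * for acquiring locks against their canonical order. A sketch of the
+ * pattern, for hypothetical locks A (outer) and B (inner), taking A while B
+ * is already held:
+ *
+ *	for (;;) {
+ *		lock(B);
+ *		if (trylock(A))
+ *			break;		(got both without deadlock)
+ *		unlock(B);
+ *		wait until A is free, then retry
+ *	}
+ */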
19371 +
19372 +/*
19373 + * if the JNODE_PARSED bit is not set, call the ->parse() method of the
19374 + * jnode to verify the validity of its content.
19375 + */
19376 +static inline int jparse(jnode * node)
19377 +{
19378 + int result;
19379 +
19380 + assert("nikita-2466", node != NULL);
19381 +
19382 + spin_lock_jnode(node);
19383 + if (likely(!jnode_is_parsed(node))) {
19384 + result = jnode_ops(node)->parse(node);
19385 + if (likely(result == 0))
19386 + JF_SET(node, JNODE_PARSED);
19387 + } else
19388 + result = 0;
19389 + spin_unlock_jnode(node);
19390 + return result;
19391 +}
19392 +
19393 +/* Lock the page attached to jnode; create and attach a page to the jnode
19394 + * if it had none. */
19395 +struct page *jnode_get_page_locked(jnode * node, gfp_t gfp_flags)
19396 +{
19397 + struct page *page;
19398 +
19399 + spin_lock_jnode(node);
19400 + page = jnode_page(node);
19401 +
19402 + if (page == NULL) {
19403 + spin_unlock_jnode(node);
19404 + page = find_or_create_page(jnode_get_mapping(node),
19405 + jnode_get_index(node), gfp_flags);
19406 + if (page == NULL)
19407 + return ERR_PTR(RETERR(-ENOMEM));
19408 + } else {
19409 + if (!TestSetPageLocked(page)) {
19410 + spin_unlock_jnode(node);
19411 + return page;
19412 + }
19413 + page_cache_get(page);
19414 + spin_unlock_jnode(node);
19415 + lock_page(page);
19416 + assert("nikita-3134", page->mapping == jnode_get_mapping(node));
19417 + }
19418 +
19419 + spin_lock_jnode(node);
19420 + if (!jnode_page(node))
19421 + jnode_attach_page(node, page);
19422 + spin_unlock_jnode(node);
19423 +
19424 + page_cache_release(page);
19425 + assert("zam-894", jnode_page(node) == page);
19426 + return page;
19427 +}
19428 +
19429 +/* Start read operation for jnode's page if page is not up-to-date. */
19430 +static int jnode_start_read(jnode * node, struct page *page)
19431 +{
19432 + assert("zam-893", PageLocked(page));
19433 +
19434 + if (PageUptodate(page)) {
19435 + unlock_page(page);
19436 + return 0;
19437 + }
19438 + return page_io(page, node, READ, get_gfp_mask());
19439 +}
19440 +
19441 +#if REISER4_DEBUG
19442 +static void check_jload(jnode * node, struct page *page)
19443 +{
19444 + if (jnode_is_znode(node)) {
19445 + node40_header *nh;
19446 + znode *z;
19447 +
19448 + z = JZNODE(node);
19449 + if (znode_is_any_locked(z)) {
19450 + nh = (node40_header *) kmap(page);
19451 + /* this only works for node40-only file systems. For
19452 + * debugging. */
19453 + assert("nikita-3253",
19454 + z->nr_items == le16_to_cpu(get_unaligned(&nh->nr_items)));
19455 + kunmap(page);
19456 + }
19457 + assert("nikita-3565", znode_invariant(z));
19458 + }
19459 +}
19460 +#else
19461 +#define check_jload(node, page) noop
19462 +#endif
19463 +
19464 +/* prefetch jnode to speed up the next call to jload(). Call this when you
19465 + * are about to call jload() shortly; it brings the appropriate portion of
19466 + * the jnode into the CPU cache. */
19467 +void jload_prefetch(jnode * node)
19468 +{
19469 + prefetchw(&node->x_count);
19470 +}
19471 +
19472 +/* load jnode's data into memory */
19473 +int jload_gfp(jnode * node /* node to load */ ,
19474 + gfp_t gfp_flags /* allocation flags */ ,
19475 + int do_kmap /* true if page should be kmapped */ )
19476 +{
19477 + struct page *page;
19478 + int result = 0;
19479 + int parsed;
19480 +
19481 + assert("nikita-3010", schedulable());
19482 +
19483 + prefetchw(&node->pg);
19484 +
19485 + /* taking d-reference implies taking x-reference. */
19486 + jref(node);
19487 +
19488 + /*
19489 + * acquiring d-reference to @jnode and check for JNODE_PARSED bit
19490 + * should be atomic, otherwise there is a race against
19491 + * reiser4_releasepage().
19492 + */
19493 + spin_lock(&(node->load));
19494 + add_d_ref(node);
19495 + parsed = jnode_is_parsed(node);
19496 + spin_unlock(&(node->load));
19497 +
19498 + if (unlikely(!parsed)) {
19499 + page = jnode_get_page_locked(node, gfp_flags);
19500 + if (unlikely(IS_ERR(page))) {
19501 + result = PTR_ERR(page);
19502 + goto failed;
19503 + }
19504 +
19505 + result = jnode_start_read(node, page);
19506 + if (unlikely(result != 0))
19507 + goto failed;
19508 +
19509 + wait_on_page_locked(page);
19510 + if (unlikely(!PageUptodate(page))) {
19511 + result = RETERR(-EIO);
19512 + goto failed;
19513 + }
19514 +
19515 + if (do_kmap)
19516 + node->data = kmap(page);
19517 +
19518 + result = jparse(node);
19519 + if (unlikely(result != 0)) {
19520 + if (do_kmap)
19521 + kunmap(page);
19522 + goto failed;
19523 + }
19524 + check_jload(node, page);
19525 + } else {
19526 + page = jnode_page(node);
19527 + check_jload(node, page);
19528 + if (do_kmap)
19529 + node->data = kmap(page);
19530 + }
19531 +
19532 + if (!is_writeout_mode())
19533 + /* We do not mark pages active if jload is called as part of
19534 + * jnode_flush() or reiser4_write_logs(). Neither jnode_flush()
19535 + * nor write_logs() adds value to the cached data; there is no
19536 + * sense in marking pages active when they go to disk. It just
19537 + * confuses vm scanning routines, because a clean page could be
19538 + * moved off the inactive list as a result of this
19539 + * mark_page_accessed() call. */
19540 + mark_page_accessed(page);
19541 +
19542 + return 0;
19543 +
19544 + failed:
19545 + jrelse_tail(node);
19546 + return result;
19547 +
19548 +}
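+
+/* A typical jload() usage sketch (hypothetical caller): jload() pins and
+ * kmaps the node's data, jdata() returns the mapped address, and jrelse()
+ * undoes the pin:
+ *
+ *	if (jload(node) == 0) {
+ *		char *data = jdata(node);
+ *		... read or modify data ...
+ *		jrelse(node);
+ *	}
+ */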
19549 +
19550 +/* start asynchronous reading for given jnode's page. */
19551 +int jstartio(jnode * node)
19552 +{
19553 + struct page *page;
19554 +
19555 + page = jnode_get_page_locked(node, get_gfp_mask());
19556 + if (IS_ERR(page))
19557 + return PTR_ERR(page);
19558 +
19559 + return jnode_start_read(node, page);
19560 +}
19561 +
19562 +/* Initialize a node by calling appropriate plugin instead of reading
19563 + * node from disk as in jload(). */
19564 +int jinit_new(jnode * node, gfp_t gfp_flags)
19565 +{
19566 + struct page *page;
19567 + int result;
19568 +
19569 + jref(node);
19570 + add_d_ref(node);
19571 +
19572 + page = jnode_get_page_locked(node, gfp_flags);
19573 + if (IS_ERR(page)) {
19574 + result = PTR_ERR(page);
19575 + goto failed;
19576 + }
19577 +
19578 + SetPageUptodate(page);
19579 + unlock_page(page);
19580 +
19581 + node->data = kmap(page);
19582 +
19583 + if (!jnode_is_parsed(node)) {
19584 + jnode_plugin *jplug = jnode_ops(node);
19585 + spin_lock_jnode(node);
19586 + result = jplug->init(node);
19587 + spin_unlock_jnode(node);
19588 + if (result) {
19589 + kunmap(page);
19590 + goto failed;
19591 + }
19592 + JF_SET(node, JNODE_PARSED);
19593 + }
19594 +
19595 + return 0;
19596 +
19597 + failed:
19598 + jrelse(node);
19599 + return result;
19600 +}
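+
+/* jinit_new() contrasts with jload(): it is for nodes whose on-disk content
+ * does not exist yet, so instead of reading the page it marks the page
+ * up to date and lets the plugin ->init() method format the node in memory.
+ * A sketch for a freshly allocated node (hypothetical caller):
+ *
+ *	result = jinit_new(node, get_gfp_mask());
+ *	if (result == 0) {
+ *		... fill in the new node ...
+ *		jrelse(node);
+ *	}
+ */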
19601 +
19602 +/* release a reference to jnode acquired by jload(), decrement ->d_count */
19603 +void jrelse_tail(jnode * node /* jnode to release references to */ )
19604 +{
19605 + assert("nikita-489", atomic_read(&node->d_count) > 0);
19606 + atomic_dec(&node->d_count);
19607 + /* release reference acquired in jload_gfp() or jinit_new() */
19608 + jput(node);
19609 + if (jnode_is_unformatted(node) || jnode_is_znode(node))
19610 + LOCK_CNT_DEC(d_refs);
19611 +}
19612 +
19613 +/* drop reference to node data. When last reference is dropped, data are
19614 + unloaded. */
19615 +void jrelse(jnode * node /* jnode to release references to */ )
19616 +{
19617 + struct page *page;
19618 +
19619 + assert("nikita-487", node != NULL);
19620 + assert_spin_not_locked(&(node->guard));
19621 +
19622 + page = jnode_page(node);
19623 + if (likely(page != NULL)) {
19624 + /*
19625 + * it is safe not to lock jnode here, because at this point
19626 + * @node->d_count is greater than zero (if jrelse() is used
19627 + * correctly, that is). JNODE_PARSED may not be set yet if,
19628 + * for example, we got here as a result of error handling path
19629 + * in jload(). Anyway, page cannot be detached by
19630 + * reiser4_releasepage(). truncate will invalidate page
19631 + * regardless, but this should not be a problem.
19632 + */
19633 + kunmap(page);
19634 + }
19635 + jrelse_tail(node);
19636 +}
19637 +
19638 +/* called from jput() to wait for io completion */
19639 +static void jnode_finish_io(jnode * node)
19640 +{
19641 + struct page *page;
19642 +
19643 + assert("nikita-2922", node != NULL);
19644 +
19645 + spin_lock_jnode(node);
19646 + page = jnode_page(node);
19647 + if (page != NULL) {
19648 + page_cache_get(page);
19649 + spin_unlock_jnode(node);
19650 + wait_on_page_writeback(page);
19651 + page_cache_release(page);
19652 + } else
19653 + spin_unlock_jnode(node);
19654 +}
19655 +
19656 +/*
19657 + * This is called by jput() when last reference to jnode is released. This is
19658 + * separate function, because we want fast path of jput() to be inline and,
19659 + * therefore, small.
19660 + */
19661 +void jput_final(jnode * node)
19662 +{
19663 + int r_i_p;
19664 +
19665 + /* A fast check for keeping node in cache. We always keep node in cache
19666 + * if its page is present and node was not marked for deletion */
19667 + if (jnode_page(node) != NULL && !JF_ISSET(node, JNODE_HEARD_BANSHEE)) {
19668 + rcu_read_unlock();
19669 + return;
19670 + }
19671 + assert("edward-1432", node->page_count == 0);
19672 +
19673 + r_i_p = !JF_TEST_AND_SET(node, JNODE_RIP);
19674 + /*
19675 + * if r_i_p is true, we were first to set JNODE_RIP on this node. In
19676 + * this case it is safe to access node after unlock.
19677 + */
19678 + rcu_read_unlock();
19679 + if (r_i_p) {
19680 + jnode_finish_io(node);
19681 + if (JF_ISSET(node, JNODE_HEARD_BANSHEE))
19682 + /* node is removed from the tree. */
19683 + jdelete(node);
19684 + else
19685 + jnode_try_drop(node);
19686 + }
19687 + /* if !r_i_p some other thread is already killing it */
19688 +}
19689 +
19690 +int jwait_io(jnode * node, int rw)
19691 +{
19692 + struct page *page;
19693 + int result;
19694 +
19695 + assert("zam-447", node != NULL);
19696 + assert("zam-448", jnode_page(node) != NULL);
19697 +
19698 + page = jnode_page(node);
19699 +
19700 + result = 0;
19701 + if (rw == READ) {
19702 + wait_on_page_locked(page);
19703 + } else {
19704 + assert("nikita-2227", rw == WRITE);
19705 + wait_on_page_writeback(page);
19706 + }
19707 + if (PageError(page))
19708 + result = RETERR(-EIO);
19709 +
19710 + return result;
19711 +}
19712 +
19713 +/*
19714 + * jnode types and plugins.
19715 + *
19716 + * jnode by itself is a "base type". There are several different jnode
19717 + * flavors, called "jnode types" (see jnode_type for a list). Sometimes code
19718 + * has to do different things based on jnode type. In the standard reiser4 way
19719 + * this is done by having jnode plugin (see fs/reiser4/plugin.h:jnode_plugin).
19720 + *
19721 + * Functions below deal with jnode types and define methods of jnode plugin.
19722 + *
19723 + */
19724 +
19725 +/* set jnode type. This is done during jnode initialization. */
19726 +static void jnode_set_type(jnode * node, jnode_type type)
19727 +{
19728 + static unsigned long type_to_mask[] = {
19729 + [JNODE_UNFORMATTED_BLOCK] = 1,
19730 + [JNODE_FORMATTED_BLOCK] = 0,
19731 + [JNODE_BITMAP] = 2,
19732 + [JNODE_IO_HEAD] = 6,
19733 + [JNODE_INODE] = 4
19734 + };
19735 +
19736 + assert("zam-647", type < LAST_JNODE_TYPE);
19737 + assert("nikita-2815", !jnode_is_loaded(node));
19738 + assert("nikita-3386", node->state == 0);
19739 +
19740 + node->state |= (type_to_mask[type] << JNODE_TYPE_1);
19741 +}
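+
+/* The masks above are the inverse of the mask_to_type[] table used by
+ * jnode_get_type() in jnode.h: bits JNODE_TYPE_1..JNODE_TYPE_3 of ->state
+ * encode the type. Worked example: JNODE_IO_HEAD has mask 6 (binary 110),
+ * so JNODE_TYPE_2 and JNODE_TYPE_3 end up set, and
+ * (state & mask) >> JNODE_TYPE_1 == 6 maps back to JNODE_IO_HEAD.
+ */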
19742 +
19743 +/* ->init() method of jnode plugin for jnodes that don't require plugin
19744 + * specific initialization. */
19745 +static int init_noinit(jnode * node UNUSED_ARG)
19746 +{
19747 + return 0;
19748 +}
19749 +
19750 +/* ->parse() method of jnode plugin for jnodes that don't require plugin
19751 + * specific parsing. */
19752 +static int parse_noparse(jnode * node UNUSED_ARG)
19753 +{
19754 + return 0;
19755 +}
19756 +
19757 +/* ->mapping() method for unformatted jnode */
19758 +struct address_space *mapping_jnode(const jnode * node)
19759 +{
19760 + struct address_space *map;
19761 +
19762 + assert("nikita-2713", node != NULL);
19763 +
19764 + /* mapping is stored in jnode */
19765 +
19766 + map = node->key.j.mapping;
19767 + assert("nikita-2714", map != NULL);
19768 + assert("nikita-2897", is_reiser4_inode(map->host));
19769 + assert("nikita-2715", get_inode_oid(map->host) == node->key.j.objectid);
19770 + return map;
19771 +}
19772 +
19773 +/* ->index() method for unformatted jnodes */
19774 +unsigned long index_jnode(const jnode * node)
19775 +{
19776 + /* index is stored in jnode */
19777 + return node->key.j.index;
19778 +}
19779 +
19780 +/* ->remove() method for unformatted jnodes */
19781 +static inline void remove_jnode(jnode * node, reiser4_tree * tree)
19782 +{
19783 + /* remove jnode from hash table and radix tree */
19784 + if (node->key.j.mapping)
19785 + unhash_unformatted_node_nolock(node);
19786 +}
19787 +
19788 +/* ->mapping() method for znodes */
19789 +static struct address_space *mapping_znode(const jnode * node)
19790 +{
19791 + /* all znodes belong to fake inode */
19792 + return get_super_fake(jnode_get_tree(node)->super)->i_mapping;
19793 +}
19794 +
19795 +/* ->index() method for znodes */
19796 +static unsigned long index_znode(const jnode * node)
19797 +{
19798 + unsigned long addr;
19799 + assert("nikita-3317", (1 << znode_shift_order) < sizeof(znode));
19800 +
19801 + /* index of znode is just its address (shifted) */
19802 + addr = (unsigned long)node;
19803 + return (addr - PAGE_OFFSET) >> znode_shift_order;
19804 +}
19805 +
19806 +/* ->mapping() method for bitmap jnode */
19807 +static struct address_space *mapping_bitmap(const jnode * node)
19808 +{
19809 + /* all bitmap blocks belong to special bitmap inode */
19810 + return get_super_private(jnode_get_tree(node)->super)->bitmap->
19811 + i_mapping;
19812 +}
19813 +
19814 +/* ->index() method for jnodes that are indexed by address */
19815 +static unsigned long index_is_address(const jnode * node)
19816 +{
19817 + unsigned long ind;
19818 +
19819 + ind = (unsigned long)node;
19820 + return ind - PAGE_OFFSET;
19821 +}
19822 +
19823 +/* resolve race with jput */
19824 +jnode *jnode_rip_sync(reiser4_tree *tree, jnode *node)
19825 +{
19826 + /*
19827 + * This is used as part of RCU-based jnode handling.
19828 + *
19829 + * jlookup(), zlook(), zget(), and cbk_cache_scan_slots() have to work
19830 + * with unreferenced jnodes (ones with ->x_count == 0). Hash table is
19831 + * not protected during this, so concurrent thread may execute
19832 + * zget-set-HEARD_BANSHEE-zput, or somehow else cause jnode to be
19833 + * freed in jput_final(). To avoid such races, jput_final() sets
19834 + * JNODE_RIP on jnode (under tree lock). All places that work with
19835 + * unreferenced jnodes call this function. It checks for the JNODE_RIP bit
19836 + * (first without taking the tree lock), and if this bit is set, releases the
19837 + * reference acquired by the current thread and returns NULL.
19838 + *
19839 + * As a result, if jnode is being concurrently freed, NULL is returned
19840 + * and caller should pretend that jnode wasn't found in the first
19841 + * place.
19842 + *
19843 + * Otherwise it's safe to release "rcu-read-lock" and continue with
19844 + * jnode.
19845 + */
19846 + if (unlikely(JF_ISSET(node, JNODE_RIP))) {
19847 + read_lock_tree(tree);
19848 + if (JF_ISSET(node, JNODE_RIP)) {
19849 + dec_x_ref(node);
19850 + node = NULL;
19851 + }
19852 + read_unlock_tree(tree);
19853 + }
19854 + return node;
19855 +}
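+
+/* A lookup under RCU would use jnode_rip_sync() roughly as follows (sketch;
+ * the concrete lookups are jlookup(), jfind(), zlook() and
+ * cbk_cache_scan_slots(), per the comment above):
+ *
+ *	rcu_read_lock();
+ *	node = ... find candidate in hash table, take x-reference ...
+ *	if (node != NULL)
+ *		node = jnode_rip_sync(tree, node);
+ *	rcu_read_unlock();
+ *	if (node == NULL)
+ *		... behave as if the jnode was never found ...
+ */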
19856 +
19857 +reiser4_key *jnode_build_key(const jnode * node, reiser4_key * key)
19858 +{
19859 + struct inode *inode;
19860 + item_plugin *iplug;
19861 + loff_t off;
19862 +
19863 + assert("nikita-3092", node != NULL);
19864 + assert("nikita-3093", key != NULL);
19865 + assert("nikita-3094", jnode_is_unformatted(node));
19866 +
19867 + off = ((loff_t) index_jnode(node)) << PAGE_CACHE_SHIFT;
19868 + inode = mapping_jnode(node)->host;
19869 +
19870 + if (node->parent_item_id != 0)
19871 + iplug = item_plugin_by_id(node->parent_item_id);
19872 + else
19873 + iplug = NULL;
19874 +
19875 + if (iplug != NULL && iplug->f.key_by_offset)
19876 + iplug->f.key_by_offset(inode, off, key);
19877 + else {
19878 + file_plugin *fplug;
19879 +
19880 + fplug = inode_file_plugin(inode);
19881 + assert("zam-1007", fplug != NULL);
19882 + assert("zam-1008", fplug->key_by_inode != NULL);
19883 +
19884 + fplug->key_by_inode(inode, off, key);
19885 + }
19886 +
19887 + return key;
19888 +}
19889 +
19890 +/* ->parse() method for formatted nodes */
19891 +static int parse_znode(jnode * node)
19892 +{
19893 + return zparse(JZNODE(node));
19894 +}
19895 +
19896 +/* ->delete() method for formatted nodes */
19897 +static void delete_znode(jnode * node, reiser4_tree * tree)
19898 +{
19899 + znode *z;
19900 +
19901 + assert_rw_write_locked(&(tree->tree_lock));
19902 + assert("vs-898", JF_ISSET(node, JNODE_HEARD_BANSHEE));
19903 +
19904 + z = JZNODE(node);
19905 + assert("vs-899", z->c_count == 0);
19906 +
19907 + /* delete znode from sibling list. */
19908 + sibling_list_remove(z);
19909 +
19910 + znode_remove(z, tree);
19911 +}
19912 +
19913 +/* ->remove() method for formatted nodes */
19914 +static int remove_znode(jnode * node, reiser4_tree * tree)
19915 +{
19916 + znode *z;
19917 +
19918 + assert_rw_write_locked(&(tree->tree_lock));
19919 + z = JZNODE(node);
19920 +
19921 + if (z->c_count == 0) {
19922 + /* detach znode from sibling list. */
19923 + sibling_list_drop(z);
19924 + /* this is called with tree spin-lock held, so call
19925 + znode_remove() directly (rather than znode_lock_remove()). */
19926 + znode_remove(z, tree);
19927 + return 0;
19928 + }
19929 + return RETERR(-EBUSY);
19930 +}
19931 +
19932 +/* ->init() method for formatted nodes */
19933 +static int init_znode(jnode * node)
19934 +{
19935 + znode *z;
19936 +
19937 + z = JZNODE(node);
19938 + /* call node plugin to do actual initialization */
19939 + return z->nplug->init(z);
19940 +}
19941 +
19942 +/* ->clone() method for formatted nodes */
19943 +static jnode *clone_formatted(jnode * node)
19944 +{
19945 + znode *clone;
19946 +
19947 + assert("vs-1430", jnode_is_znode(node));
19948 + clone = zalloc(get_gfp_mask());
19949 + if (clone == NULL)
19950 + return ERR_PTR(RETERR(-ENOMEM));
19951 + zinit(clone, NULL, current_tree);
19952 + jnode_set_block(ZJNODE(clone), jnode_get_block(node));
19953 + /* ZJNODE(clone)->key.z is not initialized */
19954 + clone->level = JZNODE(node)->level;
19955 +
19956 + return ZJNODE(clone);
19957 +}
19958 +
19959 +/* jplug->clone for unformatted nodes */
19960 +static jnode *clone_unformatted(jnode * node)
19961 +{
19962 + jnode *clone;
19963 +
19964 + assert("vs-1431", jnode_is_unformatted(node));
19965 + clone = jalloc();
19966 + if (clone == NULL)
19967 + return ERR_PTR(RETERR(-ENOMEM));
19968 +
19969 + jnode_init(clone, current_tree, JNODE_UNFORMATTED_BLOCK);
19970 + jnode_set_block(clone, jnode_get_block(node));
19971 +
19972 + return clone;
19973 +
19974 +}
19975 +
19976 +/*
19977 + * Setup jnode plugin methods for various jnode types.
19978 + */
19979 +jnode_plugin jnode_plugins[LAST_JNODE_TYPE] = {
19980 + [JNODE_UNFORMATTED_BLOCK] = {
19981 + .h = {
19982 + .type_id = REISER4_JNODE_PLUGIN_TYPE,
19983 + .id = JNODE_UNFORMATTED_BLOCK,
19984 + .pops = NULL,
19985 + .label = "unformatted",
19986 + .desc = "unformatted node",
19987 + .linkage = {NULL, NULL}
19988 + },
19989 + .init = init_noinit,
19990 + .parse = parse_noparse,
19991 + .mapping = mapping_jnode,
19992 + .index = index_jnode,
19993 + .clone = clone_unformatted
19994 + },
19995 + [JNODE_FORMATTED_BLOCK] = {
19996 + .h = {
19997 + .type_id = REISER4_JNODE_PLUGIN_TYPE,
19998 + .id = JNODE_FORMATTED_BLOCK,
19999 + .pops = NULL,
20000 + .label = "formatted",
20001 + .desc = "formatted tree node",
20002 + .linkage = {NULL, NULL}
20003 + },
20004 + .init = init_znode,
20005 + .parse = parse_znode,
20006 + .mapping = mapping_znode,
20007 + .index = index_znode,
20008 + .clone = clone_formatted
20009 + },
20010 + [JNODE_BITMAP] = {
20011 + .h = {
20012 + .type_id = REISER4_JNODE_PLUGIN_TYPE,
20013 + .id = JNODE_BITMAP,
20014 + .pops = NULL,
20015 + .label = "bitmap",
20016 + .desc = "bitmap node",
20017 + .linkage = {NULL, NULL}
20018 + },
20019 + .init = init_noinit,
20020 + .parse = parse_noparse,
20021 + .mapping = mapping_bitmap,
20022 + .index = index_is_address,
20023 + .clone = NULL
20024 + },
20025 + [JNODE_IO_HEAD] = {
20026 + .h = {
20027 + .type_id = REISER4_JNODE_PLUGIN_TYPE,
20028 + .id = JNODE_IO_HEAD,
20029 + .pops = NULL,
20030 + .label = "io head",
20031 + .desc = "io head",
20032 + .linkage = {NULL, NULL}
20033 + },
20034 + .init = init_noinit,
20035 + .parse = parse_noparse,
20036 + .mapping = mapping_bitmap,
20037 + .index = index_is_address,
20038 + .clone = NULL
20039 + },
20040 + [JNODE_INODE] = {
20041 + .h = {
20042 + .type_id = REISER4_JNODE_PLUGIN_TYPE,
20043 + .id = JNODE_INODE,
20044 + .pops = NULL,
20045 + .label = "inode",
20046 + .desc = "inode's builtin jnode",
20047 + .linkage = {NULL, NULL}
20048 + },
20049 + .init = NULL,
20050 + .parse = NULL,
20051 + .mapping = NULL,
20052 + .index = NULL,
20053 + .clone = NULL
20054 + }
20055 +};
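+
+/* All per-type behavior is dispatched uniformly through this table. Sketch:
+ *
+ *	struct address_space *map = jnode_ops(node)->mapping(node);
+ *
+ * resolves to mapping_jnode(), mapping_znode() or mapping_bitmap() depending
+ * on the jnode type. The JNODE_INODE methods are NULL, presumably because
+ * inode jnodes never go through these code paths (compare jnode_delete(),
+ * which treats JNODE_INODE as a wrong jnode type).
+ */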
20056 +
20057 +/*
20058 + * jnode destruction.
20059 + *
20060 + * A thread may use a jnode after it has acquired a reference to it. References are
20061 + * counted in ->x_count field. Reference protects jnode from being
20062 + * recycled. This is different from protecting jnode data (that are stored in
20063 + * jnode page) from being evicted from memory. Data are protected by jload()
20064 + * and released by jrelse().
20065 + *
20066 + * If thread already possesses a reference to the jnode it can acquire another
20067 + * one through jref(). Initial reference is obtained (usually) by locating
20068 + * jnode in some indexing structure that depends on jnode type: formatted
20069 + * nodes are kept in global hash table, where they are indexed by block
20070 + * number, and also in the cbk cache. Unformatted jnodes are also kept in hash
20071 + * table, which is indexed by oid and offset within file, and in per-inode
20072 + * radix tree.
20073 + *
20074 + * Reference to jnode is released by jput(). If last reference is released,
20075 + * jput_final() is called. This function determines whether jnode has to be
20076 + * deleted (this happens when corresponding node is removed from the file
20077 + * system, jnode is marked with JNODE_HEARD_BANSHEE bit in this case), or it
20078 + * should be just "removed" (deleted from memory).
20079 + *
20080 + * Jnode destruction is a singularly delicate dance because of locking and RCU.
20081 + */
20082 +
20083 +/*
20084 + * Returns true if jnode cannot be removed right now. This check is called
20085 + * under tree lock. If it returns false, the jnode is irrevocably committed
20086 + * to being deleted/removed.
20087 + */
20088 +static inline int jnode_is_busy(const jnode * node, jnode_type jtype)
20089 +{
20090 + /* if other thread managed to acquire a reference to this jnode, don't
20091 + * free it. */
20092 + if (atomic_read(&node->x_count) > 0)
20093 + return 1;
20094 + /* also, don't free znode that has children in memory */
20095 + if (jtype == JNODE_FORMATTED_BLOCK && JZNODE(node)->c_count > 0)
20096 + return 1;
20097 + return 0;
20098 +}
20099 +
20100 +/*
20101 + * this is called as part of removing jnode. Based on jnode type, call
20102 + * corresponding function that removes jnode from indices and returns it back
20103 + * to the appropriate slab (through RCU).
20104 + */
20105 +static inline void
20106 +jnode_remove(jnode * node, jnode_type jtype, reiser4_tree * tree)
20107 +{
20108 + switch (jtype) {
20109 + case JNODE_UNFORMATTED_BLOCK:
20110 + remove_jnode(node, tree);
20111 + break;
20112 + case JNODE_IO_HEAD:
20113 + case JNODE_BITMAP:
20114 + break;
20115 + case JNODE_INODE:
20116 + break;
20117 + case JNODE_FORMATTED_BLOCK:
20118 + remove_znode(node, tree);
20119 + break;
20120 + default:
20121 + wrong_return_value("nikita-3196", "Wrong jnode type");
20122 + }
20123 +}
20124 +
20125 +/*
20126 + * this is called as part of deleting jnode. Based on jnode type, call
20127 + * corresponding function that removes jnode from indices and returns it back
20128 + * to the appropriate slab (through RCU).
20129 + *
20130 + * This differs from jnode_remove() only for formatted nodes---for them
20131 + * sibling list handling is different for removal and deletion.
20132 + */
20133 +static inline void
20134 +jnode_delete(jnode * node, jnode_type jtype, reiser4_tree * tree UNUSED_ARG)
20135 +{
20136 + switch (jtype) {
20137 + case JNODE_UNFORMATTED_BLOCK:
20138 + remove_jnode(node, tree);
20139 + break;
20140 + case JNODE_IO_HEAD:
20141 + case JNODE_BITMAP:
20142 + break;
20143 + case JNODE_FORMATTED_BLOCK:
20144 + delete_znode(node, tree);
20145 + break;
20146 + case JNODE_INODE:
20147 + default:
20148 + wrong_return_value("nikita-3195", "Wrong jnode type");
20149 + }
20150 +}
20151 +
20152 +#if REISER4_DEBUG
20153 +/*
20154 + * remove jnode from the debugging list of all jnodes hanging off super-block.
20155 + */
20156 +void jnode_list_remove(jnode * node)
20157 +{
20158 + reiser4_super_info_data *sbinfo;
20159 +
20160 + sbinfo = get_super_private(jnode_get_tree(node)->super);
20161 +
20162 + spin_lock_irq(&sbinfo->all_guard);
20163 + assert("nikita-2422", !list_empty(&node->jnodes));
20164 + list_del_init(&node->jnodes);
20165 + spin_unlock_irq(&sbinfo->all_guard);
20166 +}
20167 +#endif
20168 +
20169 +/*
20170 + * this is called by jput_final() to remove jnode when last reference to it is
20171 + * released.
20172 + */
20173 +static int jnode_try_drop(jnode * node)
20174 +{
20175 + int result;
20176 + reiser4_tree *tree;
20177 + jnode_type jtype;
20178 +
20179 + assert("nikita-2491", node != NULL);
20180 + assert("nikita-2583", JF_ISSET(node, JNODE_RIP));
20181 +
20182 + tree = jnode_get_tree(node);
20183 + jtype = jnode_get_type(node);
20184 +
20185 + spin_lock_jnode(node);
20186 + write_lock_tree(tree);
20187 + /*
20188 + * if jnode has a page---leave it alone. Memory pressure will
20189 + * eventually kill page and jnode.
20190 + */
20191 + if (jnode_page(node) != NULL) {
20192 + write_unlock_tree(tree);
20193 + spin_unlock_jnode(node);
20194 + JF_CLR(node, JNODE_RIP);
20195 + return RETERR(-EBUSY);
20196 + }
20197 +
20198 + /* re-check ->x_count under tree lock. */
20199 + result = jnode_is_busy(node, jtype);
20200 + if (result == 0) {
20201 + assert("nikita-2582", !JF_ISSET(node, JNODE_HEARD_BANSHEE));
20202 + assert("jmacd-511/b", atomic_read(&node->d_count) == 0);
20203 +
20204 + spin_unlock_jnode(node);
20205 + /* no page and no references---despatch him. */
20206 + jnode_remove(node, jtype, tree);
20207 + write_unlock_tree(tree);
20208 + jnode_free(node, jtype);
20209 + } else {
20210 + /* busy check failed: reference was acquired by concurrent
20211 + * thread. */
20212 + write_unlock_tree(tree);
20213 + spin_unlock_jnode(node);
20214 + JF_CLR(node, JNODE_RIP);
20215 + }
20216 + return result;
20217 +}
20218 +
20219 +/* jdelete() -- Delete jnode from the tree and file system */
20220 +static int jdelete(jnode * node /* jnode to finish with */ )
20221 +{
20222 + struct page *page;
20223 + int result;
20224 + reiser4_tree *tree;
20225 + jnode_type jtype;
20226 +
20227 + assert("nikita-467", node != NULL);
20228 + assert("nikita-2531", JF_ISSET(node, JNODE_RIP));
20229 +
20230 + jtype = jnode_get_type(node);
20231 +
20232 + page = jnode_lock_page(node);
20233 + assert_spin_locked(&(node->guard));
20234 +
20235 + tree = jnode_get_tree(node);
20236 +
20237 + write_lock_tree(tree);
20238 + /* re-check ->x_count under tree lock. */
20239 + result = jnode_is_busy(node, jtype);
20240 + if (likely(!result)) {
20241 + assert("nikita-2123", JF_ISSET(node, JNODE_HEARD_BANSHEE));
20242 + assert("jmacd-511", atomic_read(&node->d_count) == 0);
20243 +
20244 + /* detach page */
20245 + if (page != NULL) {
20246 + /*
20247 + * FIXME this is racy against jnode_extent_write().
20248 + */
20249 + page_clear_jnode(page, node);
20250 + }
20251 + spin_unlock_jnode(node);
20252 + /* goodbye */
20253 + jnode_delete(node, jtype, tree);
20254 + write_unlock_tree(tree);
20255 + jnode_free(node, jtype);
20256 + /* @node is no longer valid pointer */
20257 + if (page != NULL)
20258 + drop_page(page);
20259 + } else {
20260 + /* busy check failed: reference was acquired by concurrent
20261 + * thread. */
20262 + JF_CLR(node, JNODE_RIP);
20263 + write_unlock_tree(tree);
20264 + spin_unlock_jnode(node);
20265 + if (page != NULL)
20266 + unlock_page(page);
20267 + }
20268 + return result;
20269 +}
20270 +
20271 +/* drop jnode on the floor.
20272 +
20273 + Return value:
20274 +
20275 + -EBUSY: failed to drop jnode, because there are still references to it
20276 +
20277 + 0: successfully dropped jnode
20278 +
20279 +*/
20280 +static int jdrop_in_tree(jnode * node, reiser4_tree * tree)
20281 +{
20282 + struct page *page;
20283 + jnode_type jtype;
20284 + int result;
20285 +
20286 + assert("zam-602", node != NULL);
20287 + assert_rw_not_read_locked(&(tree->tree_lock));
20288 + assert_rw_not_write_locked(&(tree->tree_lock));
20289 + assert("nikita-2403", !JF_ISSET(node, JNODE_HEARD_BANSHEE));
20290 +
20291 + jtype = jnode_get_type(node);
20292 +
20293 + page = jnode_lock_page(node);
20294 + assert_spin_locked(&(node->guard));
20295 +
20296 + write_lock_tree(tree);
20297 +
20298 + /* re-check ->x_count under tree lock. */
20299 + result = jnode_is_busy(node, jtype);
20300 + if (!result) {
20301 + assert("nikita-2488", page == jnode_page(node));
20302 + assert("nikita-2533", atomic_read(&node->d_count) == 0);
20303 + if (page != NULL) {
20304 + assert("nikita-2126", !PageDirty(page));
20305 + assert("nikita-2127", PageUptodate(page));
20306 + assert("nikita-2181", PageLocked(page));
20307 + page_clear_jnode(page, node);
20308 + }
20309 + spin_unlock_jnode(node);
20310 + jnode_remove(node, jtype, tree);
20311 + write_unlock_tree(tree);
20312 + jnode_free(node, jtype);
20313 + if (page != NULL) {
20314 + drop_page(page);
20315 + }
20316 + } else {
20317 + /* busy check failed: reference was acquired by concurrent
20318 + * thread. */
20319 + JF_CLR(node, JNODE_RIP);
20320 + write_unlock_tree(tree);
20321 + spin_unlock_jnode(node);
20322 + if (page != NULL)
20323 + unlock_page(page);
20324 + }
20325 + return result;
20326 +}
20327 +
20328 +/* This function frees jnode "if possible". In particular, [dcx]_count has to
20329 + be 0 (where applicable). */
20330 +void jdrop(jnode * node)
20331 +{
20332 + jdrop_in_tree(node, jnode_get_tree(node));
20333 +}
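+
+/* Summary of the three teardown paths above:
+ *
+ *	jnode_try_drop()  - last reference dropped; keep the jnode if its
+ *	                    page is still present;
+ *	jdelete()         - node was removed from the file system
+ *	                    (JNODE_HEARD_BANSHEE); its page is torn down too;
+ *	jdrop()           - free the jnode "if possible", page included.
+ *
+ * All three re-check jnode_is_busy() under the tree write lock and back off
+ * (clearing JNODE_RIP) if a concurrent thread acquired a reference.
+ */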
20334 +
20335 +/* IO head jnode implementation. IO heads are simple jnodes with limited
20336 +   functionality (these jnodes are not in any hash table), used just for
20337 +   reading from and writing to disk. */
20338 +
20339 +jnode *alloc_io_head(const reiser4_block_nr * block)
20340 +{
20341 + jnode *jal = jalloc();
20342 +
20343 + if (jal != NULL) {
20344 + jnode_init(jal, current_tree, JNODE_IO_HEAD);
20345 + jnode_set_block(jal, block);
20346 + }
20347 +
20348 + jref(jal);
20349 +
20350 + return jal;
20351 +}
20352 +
20353 +void drop_io_head(jnode * node)
20354 +{
20355 + assert("zam-648", jnode_get_type(node) == JNODE_IO_HEAD);
20356 +
20357 + jput(node);
20358 + jdrop(node);
20359 +}
20360 +
20361 +/* protect jnode data from reiser4_releasepage() */
20362 +void pin_jnode_data(jnode * node)
20363 +{
20364 + assert("zam-671", jnode_page(node) != NULL);
20365 + page_cache_get(jnode_page(node));
20366 +}
20367 +
20368 +/* make jnode data free-able again */
20369 +void unpin_jnode_data(jnode * node)
20370 +{
20371 + assert("zam-672", jnode_page(node) != NULL);
20372 + page_cache_release(jnode_page(node));
20373 +}
20374 +
20375 +struct address_space *jnode_get_mapping(const jnode * node)
20376 +{
20377 + assert("nikita-3162", node != NULL);
20378 + return jnode_ops(node)->mapping(node);
20379 +}
20380 +
20381 +#if REISER4_DEBUG
20382 +/* debugging aid: jnode invariant */
20383 +int jnode_invariant_f(const jnode * node, char const **msg)
20384 +{
20385 +#define _ergo(ant, con) \
20386 + ((*msg) = "{" #ant "} ergo {" #con "}", ergo((ant), (con)))
20387 +#define _check(exp) ((*msg) = #exp, (exp))
20388 +
20389 + return _check(node != NULL) &&
20390 + /* [jnode-queued] */
20391 + /* only relocated node can be queued, except that when znode
20392 + * is being deleted, its JNODE_RELOC bit is cleared */
20393 + _ergo(JF_ISSET(node, JNODE_FLUSH_QUEUED),
20394 + JF_ISSET(node, JNODE_RELOC) ||
20395 + JF_ISSET(node, JNODE_HEARD_BANSHEE)) &&
20396 + _check(node->jnodes.prev != NULL) &&
20397 + _check(node->jnodes.next != NULL) &&
20398 + /* [jnode-dirty] invariant */
20399 + /* a dirty jnode is part of an atom */
20400 + _ergo(JF_ISSET(node, JNODE_DIRTY), node->atom != NULL) &&
20401 + /* [jnode-oid] invariant */
20402 + /* for unformatted node ->objectid and ->mapping fields are
20403 + * consistent */
20404 + _ergo(jnode_is_unformatted(node) && node->key.j.mapping != NULL,
20405 + node->key.j.objectid ==
20406 + get_inode_oid(node->key.j.mapping->host)) &&
20407 + /* [jnode-atom-valid] invariant */
20408 + /* node atom has valid state */
20409 + _ergo(node->atom != NULL, node->atom->stage != ASTAGE_INVALID) &&
20410 + /* [jnode-page-binding] invariant */
20411 + /* if node points to page, it points back to node */
20412 + _ergo(node->pg != NULL, jprivate(node->pg) == node) &&
20413 + /* [jnode-refs] invariant */
20414 + /* only referenced jnode can be loaded */
20415 + _check(atomic_read(&node->x_count) >= atomic_read(&node->d_count));
20416 +
20417 +}
20418 +
20419 +static const char *jnode_type_name(jnode_type type)
20420 +{
20421 + switch (type) {
20422 + case JNODE_UNFORMATTED_BLOCK:
20423 + return "unformatted";
20424 + case JNODE_FORMATTED_BLOCK:
20425 + return "formatted";
20426 + case JNODE_BITMAP:
20427 + return "bitmap";
20428 + case JNODE_IO_HEAD:
20429 + return "io head";
20430 + case JNODE_INODE:
20431 + return "inode";
20432 + case LAST_JNODE_TYPE:
20433 + return "last";
20434 + default:{
20435 + static char unknown[30];
20436 +
20437 + sprintf(unknown, "unknown %i", type);
20438 + return unknown;
20439 + }
20440 + }
20441 +}
20442 +
20443 +#define jnode_state_name( node, flag ) \
20444 + ( JF_ISSET( ( node ), ( flag ) ) ? ((#flag "|")+6) : "" )
20445 +
20446 +/* debugging aid: output human readable information about @node */
20447 +static void info_jnode(const char *prefix /* prefix to print */ ,
20448 + const jnode * node /* node to print */ )
20449 +{
20450 + assert("umka-068", prefix != NULL);
20451 +
20452 + if (node == NULL) {
20453 + printk("%s: null\n", prefix);
20454 + return;
20455 + }
20456 +
20457 + printk
20458 + ("%s: %p: state: %lx: [%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s], level: %i,"
20459 + " block: %s, d_count: %d, x_count: %d, "
20460 + "pg: %p, atom: %p, lock: %i:%i, type: %s, ", prefix, node,
20461 + node->state,
20462 + jnode_state_name(node, JNODE_PARSED),
20463 + jnode_state_name(node, JNODE_HEARD_BANSHEE),
20464 + jnode_state_name(node, JNODE_LEFT_CONNECTED),
20465 + jnode_state_name(node, JNODE_RIGHT_CONNECTED),
20466 + jnode_state_name(node, JNODE_ORPHAN),
20467 + jnode_state_name(node, JNODE_CREATED),
20468 + jnode_state_name(node, JNODE_RELOC),
20469 + jnode_state_name(node, JNODE_OVRWR),
20470 + jnode_state_name(node, JNODE_DIRTY),
20471 + jnode_state_name(node, JNODE_IS_DYING),
20472 + jnode_state_name(node, JNODE_RIP),
20473 + jnode_state_name(node, JNODE_MISSED_IN_CAPTURE),
20474 + jnode_state_name(node, JNODE_WRITEBACK),
20475 + jnode_state_name(node, JNODE_NEW),
20476 + jnode_state_name(node, JNODE_DKSET),
20477 + jnode_state_name(node, JNODE_REPACK),
20478 + jnode_state_name(node, JNODE_CLUSTER_PAGE),
20479 + jnode_get_level(node), sprint_address(jnode_get_block(node)),
20480 + atomic_read(&node->d_count), atomic_read(&node->x_count),
20481 + jnode_page(node), node->atom, 0, 0,
20482 + jnode_type_name(jnode_get_type(node)));
20483 + if (jnode_is_unformatted(node)) {
20484 + printk("inode: %llu, index: %lu, ",
20485 + node->key.j.objectid, node->key.j.index);
20486 + }
20487 +}
20488 +
20489 +/* debugging aid: check znode invariant and panic if it doesn't hold */
20490 +static int jnode_invariant(const jnode * node, int tlocked, int jlocked)
20491 +{
20492 + char const *failed_msg;
20493 + int result;
20494 + reiser4_tree *tree;
20495 +
20496 + tree = jnode_get_tree(node);
20497 +
20498 + assert("umka-063312", node != NULL);
20499 + assert("umka-064321", tree != NULL);
20500 +
20501 + if (!jlocked && !tlocked)
20502 + spin_lock_jnode((jnode *) node);
20503 + if (!tlocked)
20504 + read_lock_tree(jnode_get_tree(node));
20505 + result = jnode_invariant_f(node, &failed_msg);
20506 + if (!result) {
20507 + info_jnode("corrupted node", node);
20508 + warning("jmacd-555", "Condition %s failed", failed_msg);
20509 + }
20510 + if (!tlocked)
20511 + read_unlock_tree(jnode_get_tree(node));
20512 + if (!jlocked && !tlocked)
20513 + spin_unlock_jnode((jnode *) node);
20514 + return result;
20515 +}
20516 +
20517 +#endif /* REISER4_DEBUG */
20518 +
20519 +/* Make Linus happy.
20520 + Local variables:
20521 + c-indentation-style: "K&R"
20522 + mode-name: "LC"
20523 + c-basic-offset: 8
20524 + tab-width: 8
20525 + fill-column: 80
20526 + End:
20527 +*/
20528 Index: linux-2.6.16/fs/reiser4/jnode.h
20529 ===================================================================
20530 --- /dev/null
20531 +++ linux-2.6.16/fs/reiser4/jnode.h
20532 @@ -0,0 +1,711 @@
20533 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
20534 + * reiser4/README */
20535 +
20536 +/* Declaration of jnode. See jnode.c for details. */
20537 +
20538 +#ifndef __JNODE_H__
20539 +#define __JNODE_H__
20540 +
20541 +#include "forward.h"
20542 +#include "type_safe_hash.h"
20543 +#include "txnmgr.h"
20544 +#include "key.h"
20545 +#include "debug.h"
20546 +#include "dformat.h"
20547 +#include "context.h"
20548 +
20549 +#include "plugin/plugin.h"
20550 +
20551 +#include <linux/fs.h>
20552 +#include <linux/mm.h>
20553 +#include <linux/spinlock.h>
20554 +#include <asm/atomic.h>
20555 +#include <asm/bitops.h>
20556 +#include <linux/list.h>
20557 +#include <linux/rcupdate.h>
20558 +
20559 +/* declare hash table of jnodes (jnodes proper, that is, unformatted
20560 + nodes) */
20561 +TYPE_SAFE_HASH_DECLARE(j, jnode);
20562 +
20563 +/* declare hash table of znodes */
20564 +TYPE_SAFE_HASH_DECLARE(z, znode);
20565 +
20566 +typedef struct {
20567 + __u64 objectid;
20568 + unsigned long index;
20569 + struct address_space *mapping;
20570 +} jnode_key_t;
20571 +
20572 +/*
20573 +   Jnode is the "base class" of other nodes in reiser4. It also happens to
20574 +   be exactly the node we use for unformatted tree nodes.
20575 +
20576 +   Jnode provides the following basic functionality:
20577 +
20578 + . reference counting and indexing.
20579 +
20580 + . integration with page cache. Jnode has ->pg reference to which page can
20581 + be attached.
20582 +
20583 + . interface to transaction manager. It is jnode that is kept in transaction
20584 + manager lists, attached to atoms, etc. (NOTE-NIKITA one may argue that this
20585 + means, there should be special type of jnode for inode.)
20586 +
20587 + Locking:
20588 +
20589 + Spin lock: the following fields are protected by the per-jnode spin lock:
20590 +
20591 + ->state
20592 + ->atom
20593 + ->capture_link
20594 +
20595 + Following fields are protected by the global tree lock:
20596 +
20597 + ->link
20598 + ->key.z (content of ->key.z is only changed in znode_rehash())
20599 + ->key.j
20600 +
20601 + Atomic counters
20602 +
20603 + ->x_count
20604 + ->d_count
20605 +
20606 + ->pg, and ->data are protected by spin lock for unused jnode and are
20607 + immutable for used jnode (one for which fs/reiser4/vfs_ops.c:releasable()
20608 + is false).
20609 +
20610 + ->tree is immutable after creation
20611 +
20612 + Unclear
20613 +
20614 + ->blocknr: should be under jnode spin-lock, but current interface is based
20615 + on passing of block address.
20616 +
20617 + If you ever need to spin lock two nodes at once, do this in "natural"
20618 + memory order: lock znode with lower address first. (See lock_two_nodes().)
20619 +
20620 + Invariants involving this data-type:
20621 +
20622 + [jnode-dirty]
20623 + [jnode-refs]
20624 + [jnode-oid]
20625 + [jnode-queued]
20626 + [jnode-atom-valid]
20627 + [jnode-page-binding]
20628 +*/
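+
+/* Sketch of the two-node locking rule above (the real helper is
+ * lock_two_nodes(), defined elsewhere): lock the node with the lower
+ * address first:
+ *
+ *	if (node1 < node2) {
+ *		spin_lock_jnode(node1);
+ *		spin_lock_jnode(node2);
+ *	} else {
+ *		spin_lock_jnode(node2);
+ *		spin_lock_jnode(node1);
+ *	}
+ */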
20629 +
20630 +struct jnode {
20631 +#if REISER4_DEBUG
20632 +#define JMAGIC 0x52654973 /* "ReIs" */
20633 + int magic;
20634 +#endif
20635 + /* FIRST CACHE LINE (16 bytes): data used by jload */
20636 +
20637 + /* jnode's state: bitwise flags from the reiser4_jnode_state enum. */
20638 + /* 0 */ unsigned long state;
20639 +
20640 + /* lock, protecting jnode's fields. */
20641 + /* 4 */ spinlock_t load;
20642 +
20643 + /* counter of references to jnode itself. Increased on jref().
20644 + Decreased on jput().
20645 + */
20646 + /* 8 */ atomic_t x_count;
20647 +
20648 + /* counter of references to jnode's data. Pin data page(s) in
20649 + memory while this is greater than 0. Increased on jload().
20650 + Decreased on jrelse().
20651 + */
20652 + /* 12 */ atomic_t d_count;
20653 +
20654 + /* SECOND CACHE LINE: data used by hash table lookups */
20655 +
20656 + /* 16 */ union {
20657 + /* znodes are hashed by block number */
20658 + reiser4_block_nr z;
20659 + /* unformatted nodes are hashed by mapping plus offset */
20660 + jnode_key_t j;
20661 + } key;
20662 +
20663 + /* THIRD CACHE LINE */
20664 +
20665 + /* 32 */ union {
20666 + /* pointers to maintain hash-table */
20667 + z_hash_link z;
20668 + j_hash_link j;
20669 + } link;
20670 +
20671 + /* pointer to jnode page. */
20672 + /* 36 */ struct page *pg;
20673 + /* pointer to node itself. This is page_address(node->pg) when page is
20674 + attached to the jnode
20675 + */
20676 + /* 40 */ void *data;
20677 +
20678 + /* 44 */ reiser4_tree *tree;
20679 +
20680 + /* FOURTH CACHE LINE: atom related fields */
20681 +
20682 + /* 48 */ spinlock_t guard;
20683 +
20684 + /* atom the block is in, if any */
20685 + /* 52 */ txn_atom *atom;
20686 +
20687 + /* capture list */
20688 + /* 56 */ struct list_head capture_link;
20689 +
20690 + /* FIFTH CACHE LINE */
20691 +
20692 + /* 64 */ struct rcu_head rcu;
20693 + /* crosses cache line */
20694 +
20695 + /* SIXTH CACHE LINE */
20696 +
20697 + /* the real blocknr (where io is going to/from) */
20698 + /* 80 */ reiser4_block_nr blocknr;
20699 + /* Parent item type; unformatted and CRC nodes need it for offset => key conversion. */
20700 + /* NOTE: this parent_item_id looks like jnode type. */
20701 + /* 88 */ reiser4_plugin_id parent_item_id;
20702 + /* 92 */
20703 +#if REISER4_DEBUG
20704 + /* number of pages referenced by the jnode (meaningful while capturing of
20705 + page clusters) */
20706 + int page_count;
20707 + /* list of all jnodes for debugging purposes. */
20708 + struct list_head jnodes;
20709 + /* how many times this jnode was written in one transaction */
20710 + int written;
20711 + /* this indicates which atom's list the jnode is on */
20712 + atom_list list;
20713 +#endif
20714 +} __attribute__ ((aligned(16)));
20715 +
20716 +/*
20717 + * jnode types. Enumeration of existing jnode types.
20718 + */
20719 +typedef enum {
20720 + JNODE_UNFORMATTED_BLOCK, /* unformatted block */
20721 + JNODE_FORMATTED_BLOCK, /* formatted block, znode */
20722 + JNODE_BITMAP, /* bitmap */
20723 + JNODE_IO_HEAD, /* jnode representing a block in the
20724 + * wandering log */
20725 + JNODE_INODE, /* jnode embedded into inode */
20726 + LAST_JNODE_TYPE
20727 +} jnode_type;
20728 +
20729 +/* jnode states */
20730 +typedef enum {
20731 + /* jnode's page is loaded and data checked */
20732 + JNODE_PARSED = 0,
20733 + /* node was deleted, but not all locks on it were released. This
20734 + node is empty and is going to be removed from the tree
20735 + shortly. */
20736 + JNODE_HEARD_BANSHEE = 1,
20737 + /* left sibling pointer is valid */
20738 + JNODE_LEFT_CONNECTED = 2,
20739 + /* right sibling pointer is valid */
20740 + JNODE_RIGHT_CONNECTED = 3,
20741 +
20742 + /* znode was just created and doesn't yet have a pointer from
20743 + its parent */
20744 + JNODE_ORPHAN = 4,
20745 +
20746 + /* this node was created by its transaction and has not been assigned
20747 + a block address. */
20748 + JNODE_CREATED = 5,
20749 +
20750 + /* this node is currently relocated */
20751 + JNODE_RELOC = 6,
20752 + /* this node is currently wandered */
20753 + JNODE_OVRWR = 7,
20754 +
20755 + /* this znode has been modified */
20756 + JNODE_DIRTY = 8,
20757 +
20758 + /* znode lock is being invalidated */
20759 + JNODE_IS_DYING = 9,
20760 +
20761 + /* THIS PLACE IS INTENTIONALLY LEFT BLANK */
20762 +
20763 + /* jnode is queued for flushing. */
20764 + JNODE_FLUSH_QUEUED = 12,
20765 +
20766 + /* In the following bits jnode type is encoded. */
20767 + JNODE_TYPE_1 = 13,
20768 + JNODE_TYPE_2 = 14,
20769 + JNODE_TYPE_3 = 15,
20770 +
20771 + /* jnode is being destroyed */
20772 + JNODE_RIP = 16,
20773 +
20774 + /* znode was not captured during locking (this might be because
20775 + ->level != LEAF_LEVEL and lock_mode == READ_LOCK) */
20776 + JNODE_MISSED_IN_CAPTURE = 17,
20777 +
20778 + /* write is in progress */
20779 + JNODE_WRITEBACK = 18,
20780 +
20781 + /* FIXME: now it is used by crypto-compress plugin only */
20782 + JNODE_NEW = 19,
20783 +
20784 + /* delimiting keys are already set for this znode. */
20785 + JNODE_DKSET = 20,
20786 +
20787 + /* when this bit is set, page and jnode cannot be disconnected */
20788 + JNODE_WRITE_PREPARED = 21,
20789 +
20790 + JNODE_CLUSTER_PAGE = 22,
20791 + /* Jnode is marked for repacking; that means the reiser4 flush and the
20792 + * block allocator should process this node in a special way */
20793 + JNODE_REPACK = 23,
20794 + /* node should be converted by flush in squalloc phase */
20795 + JNODE_CONVERTIBLE = 24,
20796 + /*
20797 + * When a jnode is dirtied for the first time in a given transaction,
20798 + * do_jnode_make_dirty() checks whether this jnode could possibly become
20799 + * a member of the overwrite set. If so, this bit is set, and one block is
20800 + * reserved in the ->flush_reserved space of atom.
20801 + *
20802 + * This block is "used" (and JNODE_FLUSH_RESERVED bit is cleared) when
20803 + *
20804 + * (1) flush decides that we want this block to go into relocate
20805 + * set after all.
20806 + *
20807 + * (2) wandering log is allocated (by log writer)
20808 + *
20809 + * (3) extent is allocated
20810 + *
20811 + */
20812 + JNODE_FLUSH_RESERVED = 29
20813 +} reiser4_jnode_state;
20814 +
20815 +/* Macros for accessing the jnode state. */
20816 +
20817 +static inline void JF_CLR(jnode * j, int f)
20818 +{
20819 + assert("unknown-1", j->magic == JMAGIC);
20820 + clear_bit(f, &j->state);
20821 +}
20822 +static inline int JF_ISSET(const jnode * j, int f)
20823 +{
20824 + assert("unknown-2", j->magic == JMAGIC);
20825 + return test_bit(f, &((jnode *) j)->state);
20826 +}
20827 +static inline void JF_SET(jnode * j, int f)
20828 +{
20829 + assert("unknown-3", j->magic == JMAGIC);
20830 + set_bit(f, &j->state);
20831 +}
20832 +
20833 +static inline int JF_TEST_AND_SET(jnode * j, int f)
20834 +{
20835 + assert("unknown-4", j->magic == JMAGIC);
20836 + return test_and_set_bit(f, &j->state);
20837 +}
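+
+/* Example use of the test-and-set variant: jput_final() (jnode.c) elects a
+ * single teardown thread with
+ *
+ *	r_i_p = !JF_TEST_AND_SET(node, JNODE_RIP);
+ *
+ * only the thread that flipped the bit from 0 to 1 proceeds to kill the
+ * jnode; all others see the bit already set and back off.
+ */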
20838 +
20839 +static inline void spin_lock_jnode(jnode *node)
20840 +{
20841 + /* check that spinlocks of lower priorities are not held */
20842 + assert("", (LOCK_CNT_NIL(rw_locked_tree) &&
20843 + LOCK_CNT_NIL(spin_locked_txnh) &&
20844 + LOCK_CNT_NIL(spin_locked_zlock) &&
20845 + LOCK_CNT_NIL(rw_locked_dk) &&
20846 + LOCK_CNT_LT(spin_locked_jnode, 2)));
20847 +
20848 + spin_lock(&(node->guard));
20849 +
20850 + LOCK_CNT_INC(spin_locked_jnode);
20851 + LOCK_CNT_INC(spin_locked);
20852 +}
20853 +
20854 +static inline void spin_unlock_jnode(jnode *node)
20855 +{
20856 + assert_spin_locked(&(node->guard));
20857 + assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_jnode));
20858 + assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
20859 +
20860 + LOCK_CNT_DEC(spin_locked_jnode);
20861 + LOCK_CNT_DEC(spin_locked);
20862 +
20863 + spin_unlock(&(node->guard));
20864 +}
20865 +
20866 +static inline int jnode_is_in_deleteset(const jnode * node)
20867 +{
20868 + return JF_ISSET(node, JNODE_RELOC);
20869 +}
20870 +
20871 +extern int init_jnodes(void);
20872 +extern void done_jnodes(void);
20873 +
20874 +/* Jnode routines */
20875 +extern jnode *jalloc(void);
20876 +extern void jfree(jnode * node) NONNULL;
20877 +extern jnode *jclone(jnode *);
20878 +extern jnode *jlookup(reiser4_tree * tree,
20879 + oid_t objectid, unsigned long ind) NONNULL;
20880 +extern jnode *jfind(struct address_space *, unsigned long index) NONNULL;
20881 +extern jnode *jnode_by_page(struct page *pg) NONNULL;
20882 +extern jnode *jnode_of_page(struct page *pg) NONNULL;
20883 +void jnode_attach_page(jnode * node, struct page *pg);
20884 +jnode *find_get_jnode(reiser4_tree * tree,
20885 + struct address_space *mapping, oid_t oid,
20886 + unsigned long index);
20887 +
20888 +void unhash_unformatted_jnode(jnode *);
20889 +struct page *jnode_get_page_locked(jnode *, gfp_t gfp_flags);
20890 +extern jnode *page_next_jnode(jnode * node) NONNULL;
20891 +extern void jnode_init(jnode * node, reiser4_tree * tree, jnode_type) NONNULL;
20892 +extern void jnode_make_dirty(jnode * node) NONNULL;
20893 +extern void jnode_make_clean(jnode * node) NONNULL;
20894 +extern void jnode_make_wander_nolock(jnode * node) NONNULL;
20895 +extern void jnode_make_wander(jnode *) NONNULL;
20896 +extern void znode_make_reloc(znode *, flush_queue_t *) NONNULL;
20897 +extern void unformatted_make_reloc(jnode *, flush_queue_t *) NONNULL;
20898 +extern struct address_space *jnode_get_mapping(const jnode * node) NONNULL;
20899 +
20900 +/**
20901 + * jnode_get_block
20902 + * @node: jnode to query
20903 + *
20904 + */
20905 +static inline const reiser4_block_nr *jnode_get_block(const jnode *node)
20906 +{
20907 + assert("nikita-528", node != NULL);
20908 +
20909 + return &node->blocknr;
20910 +}
20911 +
20912 +/**
20913 + * jnode_set_block
20914 + * @node: jnode to update
20915 + * @blocknr: new block nr
20916 + */
20917 +static inline void jnode_set_block(jnode *node, const reiser4_block_nr *blocknr)
20918 +{
20919 + assert("nikita-2020", node != NULL);
20920 + assert("umka-055", blocknr != NULL);
20921 + node->blocknr = *blocknr;
20922 +}
20923 +
20924 +
20925 +/* block number for IO. Usually this is the same as jnode_get_block(), unless
20926 + * jnode was emergency flushed---then block number chosen by eflush is
20927 + * used. */
20928 +static inline const reiser4_block_nr *jnode_get_io_block(jnode * node)
20929 +{
20930 + assert("nikita-2768", node != NULL);
20931 + assert_spin_locked(&(node->guard));
20932 +
20933 + return jnode_get_block(node);
20934 +}
20935 +
20936 +/* Jnode flush interface. */
20937 +extern reiser4_blocknr_hint *pos_hint(flush_pos_t * pos);
20938 +extern flush_queue_t *pos_fq(flush_pos_t * pos);
20939 +
20940 +/* FIXME-VS: these are used in plugin/item/extent.c */
20941 +
20942 +/* does extent_get_block have to be called */
20943 +#define jnode_mapped(node) JF_ISSET (node, JNODE_MAPPED)
20944 +#define jnode_set_mapped(node) JF_SET (node, JNODE_MAPPED)
20945 +
20946 +/* the node should be converted during flush squalloc phase */
20947 +#define jnode_convertible(node) JF_ISSET (node, JNODE_CONVERTIBLE)
20948 +#define jnode_set_convertible(node) JF_SET (node, JNODE_CONVERTIBLE)
20949 +
20950 +/* Macros to convert from jnode to znode, znode to jnode. These are macros
20951 + because C doesn't allow overloading of const prototypes. */
20952 +#define ZJNODE(x) (& (x) -> zjnode)
20953 +#define JZNODE(x) \
20954 +({ \
20955 + typeof (x) __tmp_x; \
20956 + \
20957 + __tmp_x = (x); \
20958 + assert ("jmacd-1300", jnode_is_znode (__tmp_x)); \
20959 + (znode*) __tmp_x; \
20960 +})
20961 +
20962 +extern int jnodes_tree_init(reiser4_tree * tree);
20963 +extern int jnodes_tree_done(reiser4_tree * tree);
20964 +
20965 +#if REISER4_DEBUG
20966 +
20967 +extern int znode_is_any_locked(const znode * node);
20968 +extern void jnode_list_remove(jnode * node);
20969 +
20970 +#else
20971 +
20972 +#define jnode_list_remove(node) noop
20973 +
20974 +#endif
20975 +
20976 +int znode_is_root(const znode * node) NONNULL;
20977 +
20978 +/* bump reference counter on @node */
20979 +static inline void add_x_ref(jnode * node /* node to increase x_count of */ )
20980 +{
20981 + assert("nikita-1911", node != NULL);
20982 +
20983 + atomic_inc(&node->x_count);
20984 + LOCK_CNT_INC(x_refs);
20985 +}
20986 +
20987 +static inline void dec_x_ref(jnode * node)
20988 +{
20989 + assert("nikita-3215", node != NULL);
20990 + assert("nikita-3216", atomic_read(&node->x_count) > 0);
20991 +
20992 + atomic_dec(&node->x_count);
20993 + assert("nikita-3217", LOCK_CNT_GTZ(x_refs));
20994 + LOCK_CNT_DEC(x_refs);
20995 +}
20996 +
20997 +/* jref() - increase counter of references to jnode/znode (x_count) */
20998 +static inline jnode *jref(jnode * node)
20999 +{
21000 + assert("jmacd-508", (node != NULL) && !IS_ERR(node));
21001 + add_x_ref(node);
21002 + return node;
21003 +}
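+
+/* Reference discipline note: every jref() must eventually be balanced by a
+ * jput() (defined elsewhere). jload()/jrelse() manage their own x-reference
+ * internally (jload_gfp() calls jref(), jrelse_tail() calls jput()), so a
+ * plain consumer of jnode data needs only the jload()/jrelse() pair.
+ */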
21004 +
21005 +/* get the page of jnode */
21006 +static inline struct page *jnode_page(const jnode * node)
21007 +{
21008 + return node->pg;
21009 +}
21010 +
21011 +/* return pointer to jnode data */
21012 +static inline char *jdata(const jnode * node)
21013 +{
21014 + assert("nikita-1415", node != NULL);
21015 + assert("nikita-3198", jnode_page(node) != NULL);
21016 + return node->data;
21017 +}
21018 +
21019 +static inline int jnode_is_loaded(const jnode * node)
21020 +{
21021 + assert("zam-506", node != NULL);
21022 + return atomic_read(&node->d_count) > 0;
21023 +}
21024 +
21025 +extern void page_detach_jnode(struct page *page,
21026 + struct address_space *mapping,
21027 + unsigned long index) NONNULL;
21028 +extern void page_clear_jnode(struct page *page, jnode * node) NONNULL;
21029 +
21030 +static inline void jnode_set_reloc(jnode * node)
21031 +{
21032 + assert("nikita-2431", node != NULL);
21033 + assert("nikita-2432", !JF_ISSET(node, JNODE_OVRWR));
21034 + JF_SET(node, JNODE_RELOC);
21035 +}
21036 +
21037 +/* jload/jwrite/junload provide bread/bwrite/brelse-style functionality for jnodes */
21038 +
21039 +extern int jload_gfp(jnode *, gfp_t, int do_kmap) NONNULL;
21040 +
21041 +static inline int jload(jnode *node)
21042 +{
21043 + return jload_gfp(node, get_gfp_mask(), 1);
21044 +}
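+
+/* Typical usage (a sketch, assuming the usual 0-on-success convention):
+   jload() pins and maps the node data, jdata() gives access to it, and
+   jrelse() releases it again:
+
+	if (jload(node) == 0) {
+		... access jdata(node) ...
+		jrelse(node);
+	}
+*/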
21045 +
21046 +extern int jinit_new(jnode *, gfp_t) NONNULL;
21047 +extern int jstartio(jnode *) NONNULL;
21048 +
21049 +extern void jdrop(jnode *) NONNULL;
21050 +extern int jwait_io(jnode *, int rw) NONNULL;
21051 +
21052 +void jload_prefetch(jnode *);
21053 +
21054 +extern jnode *alloc_io_head(const reiser4_block_nr * block) NONNULL;
21055 +extern void drop_io_head(jnode * node) NONNULL;
21056 +
21057 +static inline reiser4_tree *jnode_get_tree(const jnode * node)
21058 +{
21059 + assert("nikita-2691", node != NULL);
21060 + return node->tree;
21061 +}
21062 +
21063 +extern void pin_jnode_data(jnode *);
21064 +extern void unpin_jnode_data(jnode *);
21065 +
21066 +static inline jnode_type jnode_get_type(const jnode * node)
21067 +{
21068 + static const unsigned long state_mask =
21069 + (1 << JNODE_TYPE_1) | (1 << JNODE_TYPE_2) | (1 << JNODE_TYPE_3);
21070 +
21071 + static jnode_type mask_to_type[] = {
21072 + /* JNODE_TYPE_3 : JNODE_TYPE_2 : JNODE_TYPE_1 */
21073 +
21074 + /* 000 */
21075 + [0] = JNODE_FORMATTED_BLOCK,
21076 + /* 001 */
21077 + [1] = JNODE_UNFORMATTED_BLOCK,
21078 + /* 010 */
21079 + [2] = JNODE_BITMAP,
21080 + /* 011 */
21081 + [3] = LAST_JNODE_TYPE, /*invalid */
21082 + /* 100 */
21083 + [4] = JNODE_INODE,
21084 + /* 101 */
21085 +		[5] = LAST_JNODE_TYPE,	/* invalid */
21086 + /* 110 */
21087 + [6] = JNODE_IO_HEAD,
21088 + /* 111 */
21089 + [7] = LAST_JNODE_TYPE, /* invalid */
21090 + };
21091 +
21092 + return mask_to_type[(node->state & state_mask) >> JNODE_TYPE_1];
21093 +}
21094 +
21095 +/* returns true if node is a znode */
21096 +static inline int jnode_is_znode(const jnode * node)
21097 +{
21098 + return jnode_get_type(node) == JNODE_FORMATTED_BLOCK;
21099 +}
21100 +
21101 +static inline int jnode_is_flushprepped(jnode * node)
21102 +{
21103 + assert("jmacd-78212", node != NULL);
21104 + assert_spin_locked(&(node->guard));
21105 + return !JF_ISSET(node, JNODE_DIRTY) || JF_ISSET(node, JNODE_RELOC) ||
21106 + JF_ISSET(node, JNODE_OVRWR);
21107 +}
21108 +
21109 +/* Return true if @node has already been processed by the squeeze and allocate
21110 + process. This implies the block address has been finalized for the
21111 + duration of this atom (or it is clean and will remain in place). If this
21112 + returns true you may use the block number as a hint. */
21113 +static inline int jnode_check_flushprepped(jnode * node)
21114 +{
21115 + int result;
21116 +
21117 + /* It must be clean or relocated or wandered. New allocations are set to relocate. */
21118 + spin_lock_jnode(node);
21119 + result = jnode_is_flushprepped(node);
21120 + spin_unlock_jnode(node);
21121 + return result;
21122 +}
21123 +
21124 +/* returns true if node is unformatted */
21125 +static inline int jnode_is_unformatted(const jnode * node)
21126 +{
21127 + assert("jmacd-0123", node != NULL);
21128 + return jnode_get_type(node) == JNODE_UNFORMATTED_BLOCK;
21129 +}
21130 +
21131 +/* returns true if node represents a cluster cache page */
21132 +static inline int jnode_is_cluster_page(const jnode * node)
21133 +{
21134 + assert("edward-50", node != NULL);
21135 + return (JF_ISSET(node, JNODE_CLUSTER_PAGE));
21136 +}
21137 +
21138 +/* returns true if node is a built-in inode's jnode */
21139 +static inline int jnode_is_inode(const jnode * node)
21140 +{
21141 + assert("vs-1240", node != NULL);
21142 + return jnode_get_type(node) == JNODE_INODE;
21143 +}
21144 +
21145 +static inline jnode_plugin *jnode_ops_of(const jnode_type type)
21146 +{
21147 + assert("nikita-2367", type < LAST_JNODE_TYPE);
21148 + return jnode_plugin_by_id((reiser4_plugin_id) type);
21149 +}
21150 +
21151 +static inline jnode_plugin *jnode_ops(const jnode * node)
21152 +{
21153 + assert("nikita-2366", node != NULL);
21154 +
21155 + return jnode_ops_of(jnode_get_type(node));
21156 +}
21157 +
21158 +/* Get the index of a block. */
21159 +static inline unsigned long jnode_get_index(jnode * node)
21160 +{
21161 + return jnode_ops(node)->index(node);
21162 +}
21163 +
21164 +/* return true if "node" is the root */
21165 +static inline int jnode_is_root(const jnode * node)
21166 +{
21167 + return jnode_is_znode(node) && znode_is_root(JZNODE(node));
21168 +}
21169 +
21170 +extern struct address_space *mapping_jnode(const jnode * node);
21171 +extern unsigned long index_jnode(const jnode * node);
21172 +
21173 +static inline void jput(jnode * node);
21174 +extern void jput_final(jnode * node);
21175 +
21176 +/* bump data counter on @node */
21177 +static inline void add_d_ref(jnode * node /* node to increase d_count of */ )
21178 +{
21179 + assert("nikita-1962", node != NULL);
21180 +
21181 + atomic_inc(&node->d_count);
21182 + if (jnode_is_unformatted(node) || jnode_is_znode(node))
21183 + LOCK_CNT_INC(d_refs);
21184 +}
21185 +
21186 +/* jput() - decrement the x_count reference counter on a jnode/znode.
21187 +
21188 +   The count may drop to 0; the jnode then stays in the cache until memory
21189 +   pressure causes the eviction of its page. The c_count variable also
21190 +   ensures that children are pressured out of memory before the parent. The
21191 +   jnode remains hashed as long as the VM allows its page to stay in memory.
21192 +*/
21193 +static inline void jput(jnode * node)
21194 +{
21195 + assert("jmacd-509", node != NULL);
21196 + assert("jmacd-510", atomic_read(&node->x_count) > 0);
21197 + assert("zam-926", schedulable());
21198 + LOCK_CNT_DEC(x_refs);
21199 +
21200 + rcu_read_lock();
21201 + /*
21202 + * we don't need any kind of lock here--jput_final() uses RCU.
21203 + */
21204 + if (unlikely(atomic_dec_and_test(&node->x_count))) {
21205 + jput_final(node);
21206 + } else
21207 + rcu_read_unlock();
21208 + assert("nikita-3473", schedulable());
21209 +}
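+
+/* Typical usage (a sketch, not from the original source): every jref() is
+   balanced by a jput():
+
+	node = jref(node);
+	... use node ...
+	jput(node);
+*/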
21210 +
21211 +extern void jrelse(jnode * node);
21212 +extern void jrelse_tail(jnode * node);
21213 +
21214 +extern jnode *jnode_rip_sync(reiser4_tree * t, jnode * node);
21215 +
21216 +/* resolve race with jput */
21217 +static inline jnode *jnode_rip_check(reiser4_tree * tree, jnode * node)
21218 +{
21219 + if (unlikely(JF_ISSET(node, JNODE_RIP)))
21220 + node = jnode_rip_sync(tree, node);
21221 + return node;
21222 +}
21223 +
21224 +extern reiser4_key *jnode_build_key(const jnode *node, reiser4_key * key);
21225 +
21226 +#if REISER4_DEBUG
21227 +extern int jnode_invariant_f(const jnode *node, char const **msg);
21228 +#endif
21229 +
21230 +extern jnode_plugin jnode_plugins[LAST_JNODE_TYPE];
21231 +
21232 +/* __JNODE_H__ */
21233 +#endif
21234 +
21235 +/* Make Linus happy.
21236 + Local variables:
21237 + c-indentation-style: "K&R"
21238 + mode-name: "LC"
21239 + c-basic-offset: 8
21240 + tab-width: 8
21241 + fill-column: 120
21242 + End:
21243 +*/
21244 Index: linux-2.6.16/fs/reiser4/kassign.c
21245 ===================================================================
21246 --- /dev/null
21247 +++ linux-2.6.16/fs/reiser4/kassign.c
21248 @@ -0,0 +1,659 @@
21249 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
21250 + * reiser4/README */
21251 +
21252 +/* Key assignment policy implementation */
21253 +
21254 +/*
21255 + * In reiser4 every piece of file system data and meta-data has a key. Keys
21256 + * are used to store information in and retrieve it from reiser4 internal
21257 + * tree. In addition to this, keys define _ordering_ of all file system
21258 + * information: things having close keys are placed into the same or
21259 + * neighboring (in the tree order) nodes of the tree. As our block allocator
21260 + * tries to respect tree order (see flush.c), keys also define order in which
21261 + * things are laid out on the disk, and hence, affect performance directly.
21262 + *
21263 + * Obviously, assignment of keys to data and meta-data should be consistent
21264 + * across whole file system. Algorithm that calculates a key for a given piece
21265 + * of data or meta-data is referred to as "key assignment".
21266 + *
21267 + * Key assignment is too expensive to be implemented as a plugin (that is,
21268 + * with an ability to support different key assignment schemas in the same
21269 + * compiled kernel image). As a compromise, all key-assignment functions and
21270 + * data-structures are collected in this single file, so that modifications to
21271 + * key assignment algorithm can be localized. Additional changes may be
21272 + * required in key.[ch].
21273 + *
21274 + * Current default reiser4 key assignment algorithm is dubbed "Plan A". As one
21275 + * may guess, there is "Plan B" too.
21276 + *
21277 + */
21278 +
21279 +/*
21280 + * Additional complication with key assignment implementation is a requirement
21281 + * to support different key length.
21282 + */
21283 +
21284 +/*
21285 + * KEY ASSIGNMENT: PLAN A, LONG KEYS.
21286 + *
21287 + * DIRECTORY ITEMS
21288 + *
21289 + * | 60 | 4 | 7 |1| 56 | 64 | 64 |
21290 + * +--------------+---+---+-+-------------+------------------+-----------------+
21291 + * | dirid | 0 | F |H| prefix-1 | prefix-2 | prefix-3/hash |
21292 + * +--------------+---+---+-+-------------+------------------+-----------------+
21293 + * | | | | |
21294 + * | 8 bytes | 8 bytes | 8 bytes | 8 bytes |
21295 + *
21296 + * dirid objectid of directory this item is for
21297 + *
21298 + * F fibration, see fs/reiser4/plugin/fibration.[ch]
21299 + *
21300 + * H 1 if last 8 bytes of the key contain hash,
21301 + * 0 if last 8 bytes of the key contain prefix-3
21302 + *
21303 + * prefix-1 first 7 characters of file name.
21304 + * Padded by zeroes if name is not long enough.
21305 + *
21306 + * prefix-2 next 8 characters of the file name.
21307 + *
21308 + * prefix-3 next 8 characters of the file name.
21309 + *
21310 + * hash hash of the rest of file name (i.e., portion of file
21311 + * name not included into prefix-1 and prefix-2).
21312 + *
21313 + * File names of no more than 23 (== 7 + 8 + 8) characters are completely
21314 + * encoded in the key. Such file names are called "short". They are
21315 + * distinguished by the H bit being 0 in the key.
21316 + *
21317 + * Other file names are "long". For a long name, the H bit is 1, and the
21318 + * first 15 (== 7 + 8) characters are encoded in the prefix-1 and prefix-2
21319 + * portions of the key. The last 8 bytes of the key are occupied by a hash
21320 + * of the remaining characters of the name.
21321 + *
21322 + * This key assignment achieves the following important goals:
21323 + *
21324 + * (1) directory entries are sorted in approximately lexicographical
21325 + * order.
21326 + *
21327 + *     (2) collisions (when multiple directory items have the same key), while
21328 + *     unavoidable in principle in a tree with fixed-length keys, are rare.
21329 + *
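+ * An illustrative example (not from the original comment): with long keys
+ * the 9-character name "changelog" is "short": prefix-1 holds "changel",
+ * prefix-2 holds "og" padded with zeroes, prefix-3 is zero, and H is 0. A
+ * 30-character name keeps its first 15 characters in prefix-1/prefix-2,
+ * stores a hash of the remaining 15 characters in the last 8 bytes, and has
+ * H set to 1.
+ *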
21330 + * STAT DATA
21331 + *
21332 + * | 60 | 4 | 64 | 4 | 60 | 64 |
21333 + * +--------------+---+-----------------+---+--------------+-----------------+
21334 + * | locality id | 1 | ordering | 0 | objectid | 0 |
21335 + * +--------------+---+-----------------+---+--------------+-----------------+
21336 + * | | | | |
21337 + * | 8 bytes | 8 bytes | 8 bytes | 8 bytes |
21338 + *
21339 + * locality id object id of a directory where first name was created for
21340 + * the object
21341 + *
21342 + * ordering copy of second 8-byte portion of the key of directory
21343 + * entry for the first name of this object. Ordering has a form
21344 + * {
21345 + * fibration :7;
21346 + * h :1;
21347 + * prefix1 :56;
21348 + * }
21349 + * see description of key for directory entry above.
21350 + *
21351 + * objectid object id for this object
21352 + *
21353 + * This key assignment policy is designed to keep stat-data in the same order
21354 + * as corresponding directory items, thus speeding up readdir/stat types of
21355 + * workload.
21356 + *
21357 + * FILE BODY
21358 + *
21359 + * | 60 | 4 | 64 | 4 | 60 | 64 |
21360 + * +--------------+---+-----------------+---+--------------+-----------------+
21361 + * | locality id | 4 | ordering | 0 | objectid | offset |
21362 + * +--------------+---+-----------------+---+--------------+-----------------+
21363 + * | | | | |
21364 + * | 8 bytes | 8 bytes | 8 bytes | 8 bytes |
21365 + *
21366 + * locality id object id of a directory where first name was created for
21367 + * the object
21368 + *
21369 + * ordering the same as in the key of stat-data for this object
21370 + *
21371 + * objectid object id for this object
21372 + *
21373 + * offset logical offset from the beginning of this file.
21374 + * Measured in bytes.
21375 + *
21376 + *
21377 + * KEY ASSIGNMENT: PLAN A, SHORT KEYS.
21378 + *
21379 + * DIRECTORY ITEMS
21380 + *
21381 + * | 60 | 4 | 7 |1| 56 | 64 |
21382 + * +--------------+---+---+-+-------------+-----------------+
21383 + * | dirid | 0 | F |H| prefix-1 | prefix-2/hash |
21384 + * +--------------+---+---+-+-------------+-----------------+
21385 + * | | | |
21386 + * | 8 bytes | 8 bytes | 8 bytes |
21387 + *
21388 + * dirid objectid of directory this item is for
21389 + *
21390 + * F fibration, see fs/reiser4/plugin/fibration.[ch]
21391 + *
21392 + * H 1 if last 8 bytes of the key contain hash,
21393 + * 0 if last 8 bytes of the key contain prefix-2
21394 + *
21395 + * prefix-1 first 7 characters of file name.
21396 + * Padded by zeroes if name is not long enough.
21397 + *
21398 + * prefix-2 next 8 characters of the file name.
21399 + *
21400 + * hash hash of the rest of file name (i.e., portion of file
21401 + * name not included into prefix-1).
21402 + *
21403 + * File names of no more than 15 (== 7 + 8) characters are completely
21404 + * encoded in the key. Such file names are called "short". They are
21405 + * distinguished by the H bit being 0 in the key.
21406 + *
21407 + * Other file names are "long". For a long name, the H bit is 1, and the
21408 + * first 7 characters are encoded in the prefix-1 portion of the key. The
21409 + * last 8 bytes of the key are occupied by a hash of the remaining characters of the name.
21410 + *
21411 + * STAT DATA
21412 + *
21413 + * | 60 | 4 | 4 | 60 | 64 |
21414 + * +--------------+---+---+--------------+-----------------+
21415 + * | locality id | 1 | 0 | objectid | 0 |
21416 + * +--------------+---+---+--------------+-----------------+
21417 + * | | | |
21418 + * | 8 bytes | 8 bytes | 8 bytes |
21419 + *
21420 + * locality id object id of a directory where first name was created for
21421 + * the object
21422 + *
21423 + * objectid object id for this object
21424 + *
21425 + * FILE BODY
21426 + *
21427 + * | 60 | 4 | 4 | 60 | 64 |
21428 + * +--------------+---+---+--------------+-----------------+
21429 + * | locality id | 4 | 0 | objectid | offset |
21430 + * +--------------+---+---+--------------+-----------------+
21431 + * | | | |
21432 + * | 8 bytes | 8 bytes | 8 bytes |
21433 + *
21434 + * locality id object id of a directory where first name was created for
21435 + * the object
21436 + *
21437 + * objectid object id for this object
21438 + *
21439 + * offset logical offset from the beginning of this file.
21440 + * Measured in bytes.
21441 + *
21442 + *
21443 + */
21444 +
21445 +#include "debug.h"
21446 +#include "key.h"
21447 +#include "kassign.h"
21448 +#include "vfs_ops.h"
21449 +#include "inode.h"
21450 +#include "super.h"
21451 +#include "dscale.h"
21452 +
21453 +#include <linux/types.h> /* for __u?? */
21454 +#include <linux/fs.h> /* for struct super_block, etc */
21455 +
21456 +/* bitmask for the H bit (see comment at the beginning of this file) */
21457 +static const __u64 longname_mark = 0x0100000000000000ull;
21458 +/* bitmask for F and H portions of the key. */
21459 +static const __u64 fibration_mask = 0xff00000000000000ull;
21460 +
21461 +/* return true if name is not completely encoded in @key */
21462 +int is_longname_key(const reiser4_key * key)
21463 +{
21464 + __u64 highpart;
21465 +
21466 + assert("nikita-2863", key != NULL);
21467 + if (get_key_type(key) != KEY_FILE_NAME_MINOR)
21468 + print_key("oops", key);
21469 + assert("nikita-2864", get_key_type(key) == KEY_FILE_NAME_MINOR);
21470 +
21471 + if (REISER4_LARGE_KEY)
21472 + highpart = get_key_ordering(key);
21473 + else
21474 + highpart = get_key_objectid(key);
21475 +
21476 + return (highpart & longname_mark) ? 1 : 0;
21477 +}
21478 +
21479 +/* return true if @name is too long to be completely encoded in the key */
21480 +int is_longname(const char *name UNUSED_ARG, int len)
21481 +{
21482 + if (REISER4_LARGE_KEY)
21483 + return len > 23;
21484 + else
21485 + return len > 15;
21486 +}
21487 +
21488 +/* encode an ascii string into a __u64.
21489 +
21490 +   Put characters of @name into the result (@str) one after another, starting
21491 +   from the @start_idx-th highest (arithmetically) byte. This produces an
21492 +   endian-safe encoding. memcpy(3) will not do.
21493 +
21494 +*/
21495 +static __u64 pack_string(const char *name /* string to encode */ ,
21496 + int start_idx /* highest byte in result from
21497 + * which to start encoding */ )
21498 +{
21499 + unsigned i;
21500 + __u64 str;
21501 +
21502 + str = 0;
21503 + for (i = 0; (i < sizeof str - start_idx) && name[i]; ++i) {
21504 + str <<= 8;
21505 + str |= (unsigned char)name[i];
21506 + }
21507 + str <<= (sizeof str - i - start_idx) << 3;
21508 + return str;
21509 +}
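+
+/*
+ * Worked example (illustrative, traced from the code above):
+ * pack_string("rose", 1) leaves the highest byte zero (start_idx == 1
+ * reserves it for the F and H bits) and returns 0x00726f7365000000, i.e.
+ * 'r' 'o' 's' 'e' followed by zero padding; unpack_string() below recovers
+ * "rose" from that value.
+ */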
21510 +
21511 +/* opposite to pack_string(). Takes value produced by pack_string(), restores
21512 + * string encoded in it and stores result in @buf */
21513 +char *unpack_string(__u64 value, char *buf)
21514 +{
21515 + do {
21516 + *buf = value >> (64 - 8);
21517 + if (*buf)
21518 + ++buf;
21519 + value <<= 8;
21520 + } while (value != 0);
21521 + *buf = 0;
21522 + return buf;
21523 +}
21524 +
21525 +/* obtain name encoded in @key and store it in @buf */
21526 +char *extract_name_from_key(const reiser4_key * key, char *buf)
21527 +{
21528 + char *c;
21529 +
21530 + assert("nikita-2868", !is_longname_key(key));
21531 +
21532 + c = buf;
21533 + if (REISER4_LARGE_KEY) {
21534 + c = unpack_string(get_key_ordering(key) & ~fibration_mask, c);
21535 + c = unpack_string(get_key_fulloid(key), c);
21536 + } else
21537 + c = unpack_string(get_key_fulloid(key) & ~fibration_mask, c);
21538 + unpack_string(get_key_offset(key), c);
21539 + return buf;
21540 +}
21541 +
21542 +/**
21543 + * complete_entry_key - calculate entry key by name
21544 + * @dir: directory where entry is (or will be) in
21545 + * @name: name to calculate key of
21546 + * @len: length of name
21547 + * @result: place to store result in
21548 + *
21549 + * Sets fields of entry key @result which depend on file name.
21550 + * When REISER4_LARGE_KEY is defined three fields of @result are set: ordering,
21551 + * objectid and offset. Otherwise, objectid and offset are set.
21552 + */
21553 +void complete_entry_key(const struct inode *dir, const char *name,
21554 + int len, reiser4_key *result)
21555 +{
21556 +#if REISER4_LARGE_KEY
21557 + __u64 ordering;
21558 + __u64 objectid;
21559 + __u64 offset;
21560 +
21561 + assert("nikita-1139", dir != NULL);
21562 + assert("nikita-1142", result != NULL);
21563 + assert("nikita-2867", strlen(name) == len);
21564 +
21565 + /*
21566 +	 * key allocation algorithm for directory entries in the case of
21567 +	 * large keys:
21568 +	 *
21569 +	 * If the name is not longer than 7 + 8 + 8 = 23 characters, put the
21570 +	 * first 7 characters into the ordering field of the key, the next 8
21571 +	 * characters (if any) into the objectid field, and the next 8 (if
21572 +	 * any) into the offset field.
21573 +	 *
21574 +	 * If the file name is longer than 23 characters, put the first 7
21575 +	 * characters into the key's ordering, the next 8 into objectid, and
21576 +	 * a hash of the remaining characters into the offset field.
21577 +	 *
21578 +	 * To distinguish the above cases, the latter sets the otherwise
21579 +	 * unused high bit in the ordering field.
21580 + */
21581 +
21582 + /* [0-6] characters to ordering */
21583 + ordering = pack_string(name, 1);
21584 + if (len > 7) {
21585 + /* [7-14] characters to objectid */
21586 + objectid = pack_string(name + 7, 0);
21587 + if (len > 15) {
21588 + if (len <= 23) {
21589 +				/* [15-22] characters to offset */
21590 + offset = pack_string(name + 15, 0);
21591 + } else {
21592 + /* note in a key the fact that offset contains hash. */
21593 + ordering |= longname_mark;
21594 +
21595 + /* offset is the hash of the file name's tail. */
21596 + offset = inode_hash_plugin(dir)->hash(name + 15,
21597 + len - 15);
21598 + }
21599 + } else {
21600 + offset = 0ull;
21601 + }
21602 + } else {
21603 + objectid = 0ull;
21604 + offset = 0ull;
21605 + }
21606 +
21607 + assert("nikita-3480", inode_fibration_plugin(dir) != NULL);
21608 + ordering |= inode_fibration_plugin(dir)->fibre(dir, name, len);
21609 +
21610 + set_key_ordering(result, ordering);
21611 + set_key_fulloid(result, objectid);
21612 + set_key_offset(result, offset);
21613 + return;
21614 +
21615 +#else
21616 + __u64 objectid;
21617 + __u64 offset;
21618 +
21619 + assert("nikita-1139", dir != NULL);
21620 + assert("nikita-1142", result != NULL);
21621 + assert("nikita-2867", strlen(name) == len);
21622 +
21623 + /*
21624 +	 * key allocation algorithm for directory entries in the case of
21625 +	 * short (non-large) keys:
21626 +	 *
21627 +	 * If the name is not longer than 7 + 8 = 15 characters, put the
21628 +	 * first 7 characters into the objectid field of the key and the
21629 +	 * next 8 characters (if any) into the offset field.
21630 +	 *
21631 +	 * If the file name is longer than 15 characters, put the first 7
21632 +	 * characters into the key's objectid, and a hash of the remaining
21633 +	 * characters into the offset field.
21634 +	 *
21635 +	 * To distinguish the above cases, the latter sets the otherwise
21636 +	 * unused high bit in the objectid field.
21637 + */
21638 +
21639 + /* [0-6] characters to objectid */
21640 + objectid = pack_string(name, 1);
21641 + if (len > 7) {
21642 + if (len <= 15) {
21643 + /* [7-14] characters to offset */
21644 + offset = pack_string(name + 7, 0);
21645 + } else {
21646 + /* note in a key the fact that offset contains hash. */
21647 + objectid |= longname_mark;
21648 +
21649 + /* offset is the hash of the file name. */
21650 + offset = inode_hash_plugin(dir)->hash(name + 7,
21651 + len - 7);
21652 + }
21653 + } else
21654 + offset = 0ull;
21655 +
21656 + assert("nikita-3480", inode_fibration_plugin(dir) != NULL);
21657 + objectid |= inode_fibration_plugin(dir)->fibre(dir, name, len);
21658 +
21659 + set_key_fulloid(result, objectid);
21660 + set_key_offset(result, offset);
21661 + return;
21662 +#endif /* ! REISER4_LARGE_KEY */
21663 +}
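+
+/*
+ * An illustrative example (assuming REISER4_LARGE_KEY): for the 10-character
+ * name "transcript", ordering receives the packed "transcr" (with the
+ * fibration bits OR-ed in), objectid receives the packed "ipt", and offset
+ * is 0; the name is short, so longname_mark is not set.
+ */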
21664 +
21665 +/* true, if @key is the key of "." */
21666 +int is_dot_key(const reiser4_key * key /* key to check */ )
21667 +{
21668 + assert("nikita-1717", key != NULL);
21669 + assert("nikita-1718", get_key_type(key) == KEY_FILE_NAME_MINOR);
21670 + return
21671 + (get_key_ordering(key) == 0ull) &&
21672 + (get_key_objectid(key) == 0ull) && (get_key_offset(key) == 0ull);
21673 +}
21674 +
21675 +/* build key for stat-data.
21676 +
21677 +   return the key of the stat-data of this object. This should become an
21678 +   sd plugin method in the future. For now, let it be here.
21679 +
21680 +*/
21681 +reiser4_key *build_sd_key(const struct inode * target /* inode of an object */ ,
21682 + reiser4_key * result /* resulting key of @target
21683 + stat-data */ )
21684 +{
21685 + assert("nikita-261", result != NULL);
21686 +
21687 + reiser4_key_init(result);
21688 + set_key_locality(result, reiser4_inode_data(target)->locality_id);
21689 + set_key_ordering(result, get_inode_ordering(target));
21690 + set_key_objectid(result, get_inode_oid(target));
21691 + set_key_type(result, KEY_SD_MINOR);
21692 + set_key_offset(result, (__u64) 0);
21693 + return result;
21694 +}
21695 +
21696 +/* encode part of key into &obj_key_id
21697 +
21698 +   This encodes into @id the part of @key sufficient to restore @key later,
21699 +   given that the latter is the key of an object (a stat-data key).
21700 +
21701 + See &obj_key_id
21702 +*/
21703 +int build_obj_key_id(const reiser4_key * key /* key to encode */ ,
21704 + obj_key_id * id /* id where key is encoded in */ )
21705 +{
21706 + assert("nikita-1151", key != NULL);
21707 + assert("nikita-1152", id != NULL);
21708 +
21709 + memcpy(id, key, sizeof *id);
21710 + return 0;
21711 +}
21712 +
21713 +/* encode reference to @obj in @id.
21714 +
21715 + This is like build_obj_key_id() above, but takes inode as parameter. */
21716 +int build_inode_key_id(const struct inode *obj /* object to build key of */ ,
21717 + obj_key_id * id /* result */ )
21718 +{
21719 + reiser4_key sdkey;
21720 +
21721 + assert("nikita-1166", obj != NULL);
21722 + assert("nikita-1167", id != NULL);
21723 +
21724 + build_sd_key(obj, &sdkey);
21725 + build_obj_key_id(&sdkey, id);
21726 + return 0;
21727 +}
21728 +
21729 +/* decode @id back into @key
21730 +
21731 + Restore key of object stat-data from @id. This is dual to
21732 + build_obj_key_id() above.
21733 +*/
21734 +int extract_key_from_id(const obj_key_id * id /* object key id to extract key
21735 + * from */ ,
21736 + reiser4_key * key /* result */ )
21737 +{
21738 + assert("nikita-1153", id != NULL);
21739 + assert("nikita-1154", key != NULL);
21740 +
21741 + reiser4_key_init(key);
21742 + memcpy(key, id, sizeof *id);
21743 + return 0;
21744 +}
21745 +
21746 +/* extract objectid of directory from key of directory entry within said
21747 + directory.
21748 + */
21749 +oid_t extract_dir_id_from_key(const reiser4_key * de_key /* key of
21750 + * directory
21751 + * entry */ )
21752 +{
21753 + assert("nikita-1314", de_key != NULL);
21754 + return get_key_locality(de_key);
21755 +}
21756 +
21757 +/* encode into @id key of directory entry.
21758 +
21759 + Encode into @id information sufficient to later distinguish directory
21760 +   entries within the same directory. This is not the whole key, because all
21761 +   directory entries within a directory item share a locality which is equal
21762 +   to the objectid of their directory.
21763 +
21764 +*/
21765 +int build_de_id(const struct inode *dir /* inode of directory */ ,
21766 + const struct qstr *name /* name to be given to @obj by
21767 + * directory entry being
21768 + * constructed */ ,
21769 + de_id * id /* short key of directory entry */ )
21770 +{
21771 + reiser4_key key;
21772 +
21773 + assert("nikita-1290", dir != NULL);
21774 + assert("nikita-1292", id != NULL);
21775 +
21776 + /* NOTE-NIKITA this is suboptimal. */
21777 + inode_dir_plugin(dir)->build_entry_key(dir, name, &key);
21778 + return build_de_id_by_key(&key, id);
21779 +}
21780 +
21781 +/* encode into @id key of directory entry.
21782 +
21783 + Encode into @id information sufficient to later distinguish directory
21784 +   entries within the same directory. This is not the whole key, because all
21785 +   directory entries within a directory item share a locality which is equal
21786 +   to the objectid of their directory.
21787 +
21788 +*/
21789 +int build_de_id_by_key(const reiser4_key * entry_key /* full key of directory
21790 + * entry */ ,
21791 + de_id * id /* short key of directory entry */ )
21792 +{
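+	/*
+	 * copy everything past the first (locality) element of the key: a
+	 * de_id stores ordering (with large keys), objectid and offset, while
+	 * the locality is implied by the directory the entry lives in.
+	 */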
21793 + memcpy(id, ((__u64 *) entry_key) + 1, sizeof *id);
21794 + return 0;
21795 +}
21796 +
21797 +/* restore from @id key of directory entry.
21798 +
21799 + Function dual to build_de_id(): given @id and locality, build full
21800 + key of directory entry within directory item.
21801 +
21802 +*/
21803 +int extract_key_from_de_id(const oid_t locality /* locality of directory
21804 + * entry */ ,
21805 + const de_id * id /* directory entry id */ ,
21806 + reiser4_key * key /* result */ )
21807 +{
21808 + /* no need to initialise key here: all fields are overwritten */
21809 + memcpy(((__u64 *) key) + 1, id, sizeof *id);
21810 + set_key_locality(key, locality);
21811 + set_key_type(key, KEY_FILE_NAME_MINOR);
21812 + return 0;
21813 +}
21814 +
21815 +/* compare two &de_id's */
21816 +cmp_t de_id_cmp(const de_id * id1 /* first &de_id to compare */ ,
21817 + const de_id * id2 /* second &de_id to compare */ )
21818 +{
21819 + /* NOTE-NIKITA ugly implementation */
21820 + reiser4_key k1;
21821 + reiser4_key k2;
21822 +
21823 + extract_key_from_de_id((oid_t) 0, id1, &k1);
21824 + extract_key_from_de_id((oid_t) 0, id2, &k2);
21825 + return keycmp(&k1, &k2);
21826 +}
21827 +
21828 +/* compare &de_id with key */
21829 +cmp_t de_id_key_cmp(const de_id * id /* directory entry id to compare */ ,
21830 + const reiser4_key * key /* key to compare */ )
21831 +{
21832 + cmp_t result;
21833 + reiser4_key *k1;
21834 +
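+	/*
+	 * overlay a reiser4_key over @id: a de_id is a key without its first
+	 * (locality) element, so stepping back by one element aligns @id with
+	 * elements 1..3 of a key and lets KEY_DIFF_EL() compare in place.
+	 */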
21835 + k1 = (reiser4_key *) (((unsigned long)id) - sizeof key->el[0]);
21836 + result = KEY_DIFF_EL(k1, key, 1);
21837 + if (result == EQUAL_TO) {
21838 + result = KEY_DIFF_EL(k1, key, 2);
21839 + if (REISER4_LARGE_KEY && result == EQUAL_TO) {
21840 + result = KEY_DIFF_EL(k1, key, 3);
21841 + }
21842 + }
21843 + return result;
21844 +}
21845 +
21846 +/*
21847 + * return number of bytes necessary to encode @inode identity.
21848 + */
21849 +int inode_onwire_size(const struct inode *inode)
21850 +{
21851 + int result;
21852 +
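+	/*
+	 * dscale_bytes() returns the size of the variable-length encoding
+	 * used on the wire (see dscale.[ch]); smaller values take fewer
+	 * bytes.
+	 */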
21853 + result = dscale_bytes(get_inode_oid(inode));
21854 + result += dscale_bytes(get_inode_locality(inode));
21855 +
21856 + /*
21857 + * ordering is large (it usually has highest bits set), so it makes
21858 + * little sense to dscale it.
21859 + */
21860 + if (REISER4_LARGE_KEY)
21861 + result += sizeof(get_inode_ordering(inode));
21862 + return result;
21863 +}
21864 +
21865 +/*
21866 + * encode @inode identity at @start
21867 + */
21868 +char *build_inode_onwire(const struct inode *inode, char *start)
21869 +{
21870 + start += dscale_write(start, get_inode_locality(inode));
21871 + start += dscale_write(start, get_inode_oid(inode));
21872 +
21873 + if (REISER4_LARGE_KEY) {
21874 + put_unaligned(cpu_to_le64(get_inode_ordering(inode)), (__le64 *)start);
21875 + start += sizeof(get_inode_ordering(inode));
21876 + }
21877 + return start;
21878 +}
21879 +
21880 +/*
21881 + * extract key that was previously encoded by build_inode_onwire() at @addr
21882 + */
21883 +char *extract_obj_key_id_from_onwire(char *addr, obj_key_id * key_id)
21884 +{
21885 + __u64 val;
21886 +
21887 + addr += dscale_read(addr, &val);
21888 + val = (val << KEY_LOCALITY_SHIFT) | KEY_SD_MINOR;
21889 + put_unaligned(cpu_to_le64(val), (__le64 *)key_id->locality);
21890 + addr += dscale_read(addr, &val);
21891 + put_unaligned(cpu_to_le64(val), (__le64 *)key_id->objectid);
21892 +#if REISER4_LARGE_KEY
21893 + memcpy(&key_id->ordering, addr, sizeof key_id->ordering);
21894 + addr += sizeof key_id->ordering;
21895 +#endif
21896 + return addr;
21897 +}
21898 +
21899 +/* Make Linus happy.
21900 + Local variables:
21901 + c-indentation-style: "K&R"
21902 + mode-name: "LC"
21903 + c-basic-offset: 8
21904 + tab-width: 8
21905 + fill-column: 120
21906 + End:
21907 +*/
21908 Index: linux-2.6.16/fs/reiser4/kassign.h
21909 ===================================================================
21910 --- /dev/null
21911 +++ linux-2.6.16/fs/reiser4/kassign.h
21912 @@ -0,0 +1,110 @@
21913 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
21914 + * reiser4/README */
21915 +
21916 +/* Key assignment policy interface. See kassign.c for details. */
21917 +
21918 +#if !defined( __KASSIGN_H__ )
21919 +#define __KASSIGN_H__
21920 +
21921 +#include "forward.h"
21922 +#include "key.h"
21923 +#include "dformat.h"
21924 +
21925 +#include <linux/types.h> /* for __u?? */
21926 +#include <linux/fs.h> /* for struct super_block, etc */
21927 +#include <linux/dcache.h> /* for struct qstr */
21928 +
21929 +/* key assignment functions */
21930 +
21931 +/* Information from which the key of a file's stat-data can be uniquely
21932 +   restored. This depends on the key assignment policy for
21933 +   stat-data. Currently it is enough to store the object id and the locality
21934 +   id (60 + 60 == 120 bits), because the minor packing locality and the
21935 +   offset of a stat-data key are always known constants: KEY_SD_MINOR and 0
21936 +   respectively. For simplicity 4 bits are wasted in each id, and just
21937 +   two 64-bit integers are stored.
21938 +
21939 +   This field has to be byte-aligned, because we don't want to waste
21940 +   space in directory entries. There is another side to this coin, of
21941 +   course: we waste CPU and bus bandwidth instead, by copying data back
21942 +   and forth.
21943 +
21944 +   Next optimization: &obj_key_id is mainly used to address stat-data from
21945 +   directory entries. Under the assumption that the majority of files have
21946 +   only one name (one hard link), from *the* parent directory, it seems
21947 +   reasonable to store only the objectid of the stat-data and to take its
21948 +   locality from the key of the directory item.
21949 +
21950 +   This requires some flag to be added to &obj_key_id to distinguish
21951 +   between these two cases. The remaining bits of the flag byte could then
21952 +   be used to store the file type.
21953 +
21954 + This optimization requires changes in directory item handling code.
21955 +
21956 +*/
21957 +typedef struct obj_key_id {
21958 + d8 locality[sizeof(__u64)];
21959 + ON_LARGE_KEY(d8 ordering[sizeof(__u64)];
21960 + )
21961 + d8 objectid[sizeof(__u64)];
21962 +}
21963 +obj_key_id;
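+
+/* d8 arrays carry no padding, so sizeof(obj_key_id) is 16 bytes, or 24 when
+   REISER4_LARGE_KEY is enabled */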
21964 +
21965 +/* Information sufficient to uniquely identify directory entry within
21966 + compressed directory item.
21967 +
21968 + For alignment issues see &obj_key_id above.
21969 +*/
21970 +typedef struct de_id {
21971 + ON_LARGE_KEY(d8 ordering[sizeof(__u64)];)
21972 + d8 objectid[sizeof(__u64)];
21973 + d8 offset[sizeof(__u64)];
21974 +}
21975 +de_id;
21976 +
21977 +extern int inode_onwire_size(const struct inode *obj);
21978 +extern char *build_inode_onwire(const struct inode *obj, char *area);
21979 +extern char *extract_obj_key_id_from_onwire(char *area, obj_key_id * key_id);
21980 +
21981 +extern int build_inode_key_id(const struct inode *obj, obj_key_id * id);
21982 +extern int extract_key_from_id(const obj_key_id * id, reiser4_key * key);
21983 +extern int build_obj_key_id(const reiser4_key * key, obj_key_id * id);
21984 +extern oid_t extract_dir_id_from_key(const reiser4_key * de_key);
21985 +extern int build_de_id(const struct inode *dir, const struct qstr *name,
21986 + de_id * id);
21987 +extern int build_de_id_by_key(const reiser4_key * entry_key, de_id * id);
21988 +extern int extract_key_from_de_id(const oid_t locality, const de_id * id,
21989 + reiser4_key * key);
21990 +extern cmp_t de_id_cmp(const de_id * id1, const de_id * id2);
21991 +extern cmp_t de_id_key_cmp(const de_id * id, const reiser4_key * key);
21992 +
21993 +extern int build_readdir_key_common(struct file *dir, reiser4_key * result);
21994 +extern void build_entry_key_common(const struct inode *dir,
21995 + const struct qstr *name,
21996 + reiser4_key * result);
21997 +extern void build_entry_key_stable_entry(const struct inode *dir,
21998 + const struct qstr *name,
21999 + reiser4_key * result);
22000 +extern int is_dot_key(const reiser4_key * key);
22001 +extern reiser4_key *build_sd_key(const struct inode *target,
22002 + reiser4_key * result);
22003 +
22004 +extern int is_longname_key(const reiser4_key * key);
22005 +extern int is_longname(const char *name, int len);
22006 +extern char *extract_name_from_key(const reiser4_key * key, char *buf);
22007 +extern char *unpack_string(__u64 value, char *buf);
22008 +extern void complete_entry_key(const struct inode *dir, const char *name,
22009 + int len, reiser4_key *result);
22010 +
22011 +/* __KASSIGN_H__ */
22012 +#endif
22013 +
22014 +/* Make Linus happy.
22015 + Local variables:
22016 + c-indentation-style: "K&R"
22017 + mode-name: "LC"
22018 + c-basic-offset: 8
22019 + tab-width: 8
22020 + fill-column: 120
22021 + End:
22022 +*/
22023 Index: linux-2.6.16/fs/reiser4/key.c
22024 ===================================================================
22025 --- /dev/null
22026 +++ linux-2.6.16/fs/reiser4/key.c
22027 @@ -0,0 +1,137 @@
22028 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
22029 +
22030 +/* Key manipulations. */
22031 +
22032 +#include "debug.h"
22033 +#include "key.h"
22034 +#include "super.h"
22035 +#include "reiser4.h"
22036 +
22037 +#include <linux/types.h> /* for __u?? */
22038 +
22039 +/* Minimal possible key: all components are zero. It is presumed that this is
22040 + independent of key scheme. */
22041 +static const reiser4_key MINIMAL_KEY = {
22042 + .el = {
22043 + 0ull,
22044 + ON_LARGE_KEY(0ull,)
22045 + 0ull,
22046 + 0ull
22047 + }
22048 +};
22049 +
22050 +/* Maximal possible key: all components are ~0. It is presumed that this is
22051 + independent of key scheme. */
22052 +static const reiser4_key MAXIMAL_KEY = {
22053 + .el = {
22054 + __constant_cpu_to_le64(~0ull),
22055 + ON_LARGE_KEY(__constant_cpu_to_le64(~0ull),)
22056 + __constant_cpu_to_le64(~0ull),
22057 + __constant_cpu_to_le64(~0ull)
22058 + }
22059 +};
22060 +
22061 +/* Initialize key. */
22062 +void reiser4_key_init(reiser4_key * key /* key to init */ )
22063 +{
22064 + assert("nikita-1169", key != NULL);
22065 + memset(key, 0, sizeof *key);
22066 +}
22067 +
22068 +/* minimal possible key in the tree. Return pointer to the static storage. */
22069 +const reiser4_key *min_key(void)
22070 +{
22071 + return &MINIMAL_KEY;
22072 +}
22073 +
22074 +/* maximum possible key in the tree. Return pointer to the static storage. */
22075 +const reiser4_key *max_key(void)
22076 +{
22077 + return &MAXIMAL_KEY;
22078 +}
22079 +
22080 +#if REISER4_DEBUG
22081 +/* debugging aid: print symbolic name of key type */
22082 +static const char *type_name(unsigned int key_type /* key type */ )
22083 +{
22084 + switch (key_type) {
22085 + case KEY_FILE_NAME_MINOR:
22086 + return "file name";
22087 + case KEY_SD_MINOR:
22088 + return "stat data";
22089 + case KEY_ATTR_NAME_MINOR:
22090 + return "attr name";
22091 + case KEY_ATTR_BODY_MINOR:
22092 + return "attr body";
22093 + case KEY_BODY_MINOR:
22094 + return "file body";
22095 + default:
22096 + return "unknown";
22097 + }
22098 +}
22099 +
22100 +/* debugging aid: print human readable information about key */
22101 +void print_key(const char *prefix /* prefix to print */ ,
22102 + const reiser4_key * key /* key to print */ )
22103 +{
22104 + /* turn bold on */
22105 + /* printf ("\033[1m"); */
22106 + if (key == NULL)
22107 + printk("%s: null key\n", prefix);
22108 + else {
22109 + if (REISER4_LARGE_KEY)
22110 + printk("%s: (%Lx:%x:%Lx:%Lx:%Lx:%Lx)", prefix,
22111 + get_key_locality(key),
22112 + get_key_type(key),
22113 + get_key_ordering(key),
22114 + get_key_band(key),
22115 + get_key_objectid(key), get_key_offset(key));
22116 + else
22117 + printk("%s: (%Lx:%x:%Lx:%Lx:%Lx)", prefix,
22118 + get_key_locality(key),
22119 + get_key_type(key),
22120 + get_key_band(key),
22121 + get_key_objectid(key), get_key_offset(key));
22122 + /*
22123 + * if this is a key of directory entry, try to decode part of
22124 + * a name stored in the key, and output it.
22125 + */
22126 + if (get_key_type(key) == KEY_FILE_NAME_MINOR) {
22127 + char buf[DE_NAME_BUF_LEN];
22128 + char *c;
22129 +
22130 + c = buf;
22131 + c = unpack_string(get_key_ordering(key), c);
22132 + unpack_string(get_key_fulloid(key), c);
22133 + printk("[%s", buf);
22134 + if (is_longname_key(key))
22135 + /*
22136 + * only part of the name is stored in the key.
22137 + */
22138 + printk("...]\n");
22139 + else {
22140 + /*
22141 + * whole name is stored in the key.
22142 + */
22143 + unpack_string(get_key_offset(key), buf);
22144 + printk("%s]\n", buf);
22145 + }
22146 + } else {
22147 + printk("[%s]\n", type_name(get_key_type(key)));
22148 + }
22149 + }
22150 + /* turn bold off */
22151 + /* printf ("\033[m\017"); */
22152 +}
22153 +
22154 +#endif
22155 +
22156 +/* Make Linus happy.
22157 + Local variables:
22158 + c-indentation-style: "K&R"
22159 + mode-name: "LC"
22160 + c-basic-offset: 8
22161 + tab-width: 8
22162 + fill-column: 120
22163 + End:
22164 +*/
22165 Index: linux-2.6.16/fs/reiser4/key.h
22166 ===================================================================
22167 --- /dev/null
22168 +++ linux-2.6.16/fs/reiser4/key.h
22169 @@ -0,0 +1,384 @@
22170 +/* Copyright 2000, 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
22171 +
22172 +/* Declarations of key-related data-structures and operations on keys. */
22173 +
22174 +#if !defined( __REISER4_KEY_H__ )
22175 +#define __REISER4_KEY_H__
22176 +
22177 +#include "dformat.h"
22178 +#include "forward.h"
22179 +#include "debug.h"
22180 +
22181 +#include <linux/types.h> /* for __u?? */
22182 +
22183 +/* Operations on keys in reiser4 tree */
22184 +
22185 +/* No access to any of these fields shall be done except via a
22186 +   wrapping macro/function, and that wrapping macro/function shall
22187 +   convert to and from little-endian order. Key comparison is done in CPU byte order. */
22188 +
22189 +/* A storage-layer implementation difference between a regular unix file body and its attributes is in the typedef below,
22190 +   which causes all of the attributes of a file to be near in key to all of the other attributes for all of the files
22191 +   within that directory, and not near to the file itself. It is interesting to consider whether this is the wrong
22192 +   approach, and whether there should be no difference at all. For current usage patterns this choice is probably the
22193 +   right one. */
22194 +
22195 +/* possible values for minor packing locality (4 bits required) */
22196 +typedef enum {
22197 + /* file name */
22198 + KEY_FILE_NAME_MINOR = 0,
22199 + /* stat-data */
22200 + KEY_SD_MINOR = 1,
22201 + /* file attribute name */
22202 + KEY_ATTR_NAME_MINOR = 2,
22203 + /* file attribute value */
22204 + KEY_ATTR_BODY_MINOR = 3,
22205 + /* file body (tail or extent) */
22206 + KEY_BODY_MINOR = 4,
22207 +} key_minor_locality;
22208 +
22209 +/* everything stored in the tree has a unique key, which means that the tree is (logically) fully ordered by key.
22210 + Physical order is determined by dynamic heuristics that attempt to reflect key order when allocating available space,
22211 + and by the repacker. It is stylistically better to put aggregation information into the key. Thus, if you want to
22212 + segregate extents from tails, it is better to give them distinct minor packing localities rather than changing
22213 + block_alloc.c to check the node type when deciding where to allocate the node.
22214 +
22215 + The need to randomly displace new directories and large files disturbs this symmetry unfortunately. However, it
22216 + should be noted that this is a need that is not clearly established given the existence of a repacker. Also, in our
22217 + current implementation tails have a different minor packing locality from extents, and no files have both extents and
22218 + tails, so maybe symmetry can be had without performance cost after all. Symmetry is what we ship for now....
22219 +*/
22220 +
22221 +/* Arbitrary major packing localities can be assigned to objects using
22222 + the reiser4(filenameA/..packing<=some_number) system call.
22223 +
22224 + In reiser4, the creat() syscall creates a directory
22225 +
22226 + whose default flow (that which is referred to if the directory is
22227 + read as a file) is the traditional unix file body.
22228 +
22229 + whose directory plugin is the 'filedir'
22230 +
22231 + whose major packing locality is that of the parent of the object created.
22232 +
22233 + The static_stat item is a particular commonly used directory
22234 + compression (the one for normal unix files).
22235 +
22236 + The filedir plugin checks to see if the static_stat item exists.
22237 + There is a unique key for static_stat. If yes, then it uses the
22238 + static_stat item for all of the values that it contains. The
22239 + static_stat item contains a flag for each stat it contains which
22240 + indicates whether one should look outside the static_stat item for its
22241 + contents.
22242 +*/
22243 +
22244 +/* offsets of fields in reiser4_key. The value of each element of this enum
22245 +   is the index within the key (thought of as an array of __u64's) where
22246 +   that field resides. */
22247 +typedef enum {
22248 + /* major "locale", aka dirid. Sits in 1st element */
22249 + KEY_LOCALITY_INDEX = 0,
22250 + /* minor "locale", aka item type. Sits in 1st element */
22251 + KEY_TYPE_INDEX = 0,
22252 + ON_LARGE_KEY(KEY_ORDERING_INDEX,)
22253 + /* "object band". Sits in 2nd element */
22254 + KEY_BAND_INDEX,
22255 + /* objectid. Sits in 2nd element */
22256 + KEY_OBJECTID_INDEX = KEY_BAND_INDEX,
22257 + /* full objectid. Sits in 2nd element */
22258 + KEY_FULLOID_INDEX = KEY_BAND_INDEX,
22259 + /* Offset. Sits in 3rd element */
22260 + KEY_OFFSET_INDEX,
22261 + /* Name hash. Sits in 3rd element */
22262 + KEY_HASH_INDEX = KEY_OFFSET_INDEX,
22263 + KEY_CACHELINE_END = KEY_OFFSET_INDEX,
22264 + KEY_LAST_INDEX
22265 +} reiser4_key_field_index;
22266 +
22267 +/* key in reiser4 internal "balanced" tree. It is just an array of three
22268 +   (four, with large keys) 64-bit integers in disk byte order (little-endian
22269 +   by default). This array is actually indexed by reiser4_key_field_index.
22270 +   Each __u64 within this array is called an "element". Logical key
22271 +   components encoded within elements are called "fields".
22272 +
22273 +   We declare this as a union with a dummy second component to suppress
22274 +   inconvenient array<->pointer casts implied in C. */
22275 +union reiser4_key {
22276 + __le64 el[KEY_LAST_INDEX];
22277 + int pad;
22278 +};
22279 +
22280 +/* bitmasks showing where within reiser4_key particular key is stored. */
22281 +/* major locality occupies higher 60 bits of the first element */
22282 +#define KEY_LOCALITY_MASK 0xfffffffffffffff0ull
22283 +
22284 +/* minor locality occupies lower 4 bits of the first element */
22285 +#define KEY_TYPE_MASK 0xfull
22286 +
22287 +/* controversial band occupies higher 4 bits of the 2nd element */
22288 +#define KEY_BAND_MASK 0xf000000000000000ull
22289 +
22290 +/* objectid occupies lower 60 bits of the 2nd element */
22291 +#define KEY_OBJECTID_MASK 0x0fffffffffffffffull
22292 +
22293 +/* full 64bit objectid*/
22294 +#define KEY_FULLOID_MASK 0xffffffffffffffffull
22295 +
22296 +/* offset occupies the whole 3rd element */
22297 +#define KEY_OFFSET_MASK 0xffffffffffffffffull
22298 +
22299 +/* ordering is whole second element */
22300 +#define KEY_ORDERING_MASK 0xffffffffffffffffull
22301 +
22302 +/* how many bits a key element is shifted left to store a particular field (and right to extract it) */
22303 +typedef enum {
22304 + KEY_LOCALITY_SHIFT = 4,
22305 + KEY_TYPE_SHIFT = 0,
22306 + KEY_BAND_SHIFT = 60,
22307 + KEY_OBJECTID_SHIFT = 0,
22308 + KEY_FULLOID_SHIFT = 0,
22309 + KEY_OFFSET_SHIFT = 0,
22310 + KEY_ORDERING_SHIFT = 0,
22311 +} reiser4_key_field_shift;
22312 +
22313 +static inline __u64
22314 +get_key_el(const reiser4_key * key, reiser4_key_field_index off)
22315 +{
22316 + assert("nikita-753", key != NULL);
22317 + assert("nikita-754", off < KEY_LAST_INDEX);
22318 + return le64_to_cpu(get_unaligned(&key->el[off]));
22319 +}
22320 +
22321 +static inline void
22322 +set_key_el(reiser4_key * key, reiser4_key_field_index off, __u64 value)
22323 +{
22324 + assert("nikita-755", key != NULL);
22325 + assert("nikita-756", off < KEY_LAST_INDEX);
22326 + put_unaligned(cpu_to_le64(value), &key->el[off]);
22327 +}
22328 +
22329 +/* macro to define getter and setter functions for field L (uppercase name U) with type T */
22330 +#define DEFINE_KEY_FIELD( L, U, T ) \
22331 +static inline T get_key_ ## L ( const reiser4_key *key ) \
22332 +{ \
22333 + assert( "nikita-750", key != NULL ); \
22334 + return ( T ) ( get_key_el( key, KEY_ ## U ## _INDEX ) & \
22335 + KEY_ ## U ## _MASK ) >> KEY_ ## U ## _SHIFT; \
22336 +} \
22337 + \
22338 +static inline void set_key_ ## L ( reiser4_key *key, T loc ) \
22339 +{ \
22340 + __u64 el; \
22341 + \
22342 + assert( "nikita-752", key != NULL ); \
22343 + \
22344 + el = get_key_el( key, KEY_ ## U ## _INDEX ); \
22345 + /* clear field bits in the key */ \
22346 + el &= ~KEY_ ## U ## _MASK; \
22347 + /* actually it should be \
22348 + \
22349 + el |= ( loc << KEY_ ## U ## _SHIFT ) & KEY_ ## U ## _MASK; \
22350 + \
22351 + but we trust user to never pass values that wouldn't fit \
22352 + into field. Clearing extra bits is one operation, but this \
22353 + function is time-critical. \
22354 + But check this in assertion. */ \
22355 + assert( "nikita-759", ( ( loc << KEY_ ## U ## _SHIFT ) & \
22356 + ~KEY_ ## U ## _MASK ) == 0 ); \
22357 + el |= ( loc << KEY_ ## U ## _SHIFT ); \
22358 + set_key_el( key, KEY_ ## U ## _INDEX, el ); \
22359 +}
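+
+/* For instance, DEFINE_KEY_FIELD(band, BAND, __u64) below expands into
+   get_key_band(), which masks element KEY_BAND_INDEX with KEY_BAND_MASK and
+   shifts right by KEY_BAND_SHIFT (60), and into the matching
+   set_key_band(). */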
22360 +
22361 +typedef __u64 oid_t;
22362 +
22363 +/* define get_key_locality(), set_key_locality() */
22364 +DEFINE_KEY_FIELD(locality, LOCALITY, oid_t);
22365 +/* define get_key_type(), set_key_type() */
22366 +DEFINE_KEY_FIELD(type, TYPE, key_minor_locality);
22367 +/* define get_key_band(), set_key_band() */
22368 +DEFINE_KEY_FIELD(band, BAND, __u64);
22369 +/* define get_key_objectid(), set_key_objectid() */
22370 +DEFINE_KEY_FIELD(objectid, OBJECTID, oid_t);
22371 +/* define get_key_fulloid(), set_key_fulloid() */
22372 +DEFINE_KEY_FIELD(fulloid, FULLOID, oid_t);
22373 +/* define get_key_offset(), set_key_offset() */
22374 +DEFINE_KEY_FIELD(offset, OFFSET, __u64);
22375 +#if (REISER4_LARGE_KEY)
22376 +/* define get_key_ordering(), set_key_ordering() */
22377 +DEFINE_KEY_FIELD(ordering, ORDERING, __u64);
22378 +#else
22379 +static inline __u64 get_key_ordering(const reiser4_key * key)
22380 +{
22381 + return 0;
22382 +}
22383 +
22384 +static inline void set_key_ordering(reiser4_key * key, __u64 val)
22385 +{
22386 +}
22387 +#endif
22388 +
22389 +/* key comparison result */
22390 +typedef enum { LESS_THAN = -1, /* if first key is less than second */
22391 + EQUAL_TO = 0, /* if keys are equal */
22392 + GREATER_THAN = +1 /* if first key is greater than second */
22393 +} cmp_t;
22394 +
22395 +void reiser4_key_init(reiser4_key * key);
22396 +
22397 +/* minimal possible key in the tree. Return pointer to the static storage. */
22398 +extern const reiser4_key *min_key(void);
22399 +extern const reiser4_key *max_key(void);
22400 +
22401 +/* helper macro for keycmp() */
22402 +#define KEY_DIFF(k1, k2, field) \
22403 +({ \
22404 + typeof (get_key_ ## field (k1)) f1; \
22405 + typeof (get_key_ ## field (k2)) f2; \
22406 + \
22407 + f1 = get_key_ ## field (k1); \
22408 + f2 = get_key_ ## field (k2); \
22409 + \
22410 + (f1 < f2) ? LESS_THAN : ((f1 == f2) ? EQUAL_TO : GREATER_THAN); \
22411 +})
22412 +
22413 +/* helper macro for keycmp() */
22414 +#define KEY_DIFF_EL(k1, k2, off) \
22415 +({ \
22416 + __u64 e1; \
22417 + __u64 e2; \
22418 + \
22419 + e1 = get_key_el(k1, off); \
22420 + e2 = get_key_el(k2, off); \
22421 + \
22422 + (e1 < e2) ? LESS_THAN : ((e1 == e2) ? EQUAL_TO : GREATER_THAN); \
22423 +})
22424 +
22425 +/* compare `k1' and `k2'. This function is the heart of the "key allocation
22426 +   policy". All you need to do to implement a new policy is to add yet
22427 +   another clause here. */
22428 +static inline cmp_t keycmp(const reiser4_key * k1 /* first key to compare */ ,
22429 + const reiser4_key * k2 /* second key to compare */ )
22430 +{
22431 + cmp_t result;
22432 +
22433 + /*
22434 + * This function is the heart of reiser4 tree-routines. Key comparison
22435 + * is among most heavily used operations in the file system.
22436 + */
22437 +
22438 + assert("nikita-439", k1 != NULL);
22439 + assert("nikita-440", k2 != NULL);
22440 +
22441 +	/* there is no actual branch here: the condition is a compile-time
22442 +	 * constant, and constant folding and propagation ensure that only one
22443 +	 * branch is actually compiled in. */
22444 +
22445 + if (REISER4_PLANA_KEY_ALLOCATION) {
22446 + /* if physical order of fields in a key is identical
22447 + with logical order, we can implement key comparison
22448 + as three 64bit comparisons. */
22449 + /* logical order of fields in plan-a:
22450 + locality->type->objectid->offset. */
22451 + /* compare locality and type at once */
22452 + result = KEY_DIFF_EL(k1, k2, 0);
22453 + if (result == EQUAL_TO) {
22454 + /* compare objectid (and band if it's there) */
22455 + result = KEY_DIFF_EL(k1, k2, 1);
22456 + /* compare offset */
22457 + if (result == EQUAL_TO) {
22458 + result = KEY_DIFF_EL(k1, k2, 2);
22459 + if (REISER4_LARGE_KEY && result == EQUAL_TO) {
22460 + result = KEY_DIFF_EL(k1, k2, 3);
22461 + }
22462 + }
22463 + }
22464 + } else if (REISER4_3_5_KEY_ALLOCATION) {
22465 + result = KEY_DIFF(k1, k2, locality);
22466 + if (result == EQUAL_TO) {
22467 + result = KEY_DIFF(k1, k2, objectid);
22468 + if (result == EQUAL_TO) {
22469 + result = KEY_DIFF(k1, k2, type);
22470 + if (result == EQUAL_TO)
22471 + result = KEY_DIFF(k1, k2, offset);
22472 + }
22473 + }
22474 + } else
22475 + impossible("nikita-441", "Unknown key allocation scheme!");
22476 + return result;
22477 +}
22478 +
22479 +/* true if @k1 equals @k2 */
22480 +static inline int keyeq(const reiser4_key * k1 /* first key to compare */ ,
22481 + const reiser4_key * k2 /* second key to compare */ )
22482 +{
22483 + assert("nikita-1879", k1 != NULL);
22484 + assert("nikita-1880", k2 != NULL);
22485 + return !memcmp(k1, k2, sizeof *k1);
22486 +}
22487 +
22488 +/* true if @k1 is less than @k2 */
22489 +static inline int keylt(const reiser4_key * k1 /* first key to compare */ ,
22490 + const reiser4_key * k2 /* second key to compare */ )
22491 +{
22492 + assert("nikita-1952", k1 != NULL);
22493 + assert("nikita-1953", k2 != NULL);
22494 + return keycmp(k1, k2) == LESS_THAN;
22495 +}
22496 +
22497 +/* true if @k1 is less than or equal to @k2 */
22498 +static inline int keyle(const reiser4_key * k1 /* first key to compare */ ,
22499 + const reiser4_key * k2 /* second key to compare */ )
22500 +{
22501 + assert("nikita-1954", k1 != NULL);
22502 + assert("nikita-1955", k2 != NULL);
22503 + return keycmp(k1, k2) != GREATER_THAN;
22504 +}
22505 +
22506 +/* true if @k1 is greater than @k2 */
22507 +static inline int keygt(const reiser4_key * k1 /* first key to compare */ ,
22508 + const reiser4_key * k2 /* second key to compare */ )
22509 +{
22510 + assert("nikita-1959", k1 != NULL);
22511 + assert("nikita-1960", k2 != NULL);
22512 + return keycmp(k1, k2) == GREATER_THAN;
22513 +}
22514 +
22515 +/* true if @k1 is greater than or equal to @k2 */
22516 +static inline int keyge(const reiser4_key * k1 /* first key to compare */ ,
22517 + const reiser4_key * k2 /* second key to compare */ )
22518 +{
22519 + assert("nikita-1956", k1 != NULL);
22520 + assert("nikita-1957", k2 != NULL); /* October 4: sputnik launched
22521 + * November 3: Laika */
22522 + return keycmp(k1, k2) != LESS_THAN;
22523 +}
22524 +
22525 +static inline void prefetchkey(reiser4_key * key)
22526 +{
22527 + prefetch(key);
22528 + prefetch(&key->el[KEY_CACHELINE_END]);
22529 +}
22530 +
22531 +/* (%Lx:%x:%Lx:%Lx:%Lx:%Lx) =
22532 + 1 + 16 + 1 + 1 + 1 + 1 + 1 + 16 + 1 + 16 + 1 + 16 + 1 */
22533 +/* size of a buffer suitable to hold human readable key representation */
22534 +#define KEY_BUF_LEN (80)
22535 +
22536 +#if REISER4_DEBUG
22537 +extern void print_key(const char *prefix, const reiser4_key * key);
22538 +#else
22539 +#define print_key(p,k) noop
22540 +#endif
22541 +
22542 +/* __REISER4_KEY_H__ */
22543 +#endif
22544 +
22545 +/* Make Linus happy.
22546 + Local variables:
22547 + c-indentation-style: "K&R"
22548 + mode-name: "LC"
22549 + c-basic-offset: 8
22550 + tab-width: 8
22551 + fill-column: 120
22552 + End:
22553 +*/
22554 Index: linux-2.6.16/fs/reiser4/ktxnmgrd.c
22555 ===================================================================
22556 --- /dev/null
22557 +++ linux-2.6.16/fs/reiser4/ktxnmgrd.c
22558 @@ -0,0 +1,214 @@
22559 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
22560 +/* Transaction manager daemon. */
22561 +
22562 +/*
22563 + * ktxnmgrd is a kernel daemon responsible for committing transactions. It is
22564 + * needed/important for the following reasons:
22565 + *
22566 + * 1. in reiser4 an atom is not committed immediately when the last
22567 + * transaction handle closes, unless the atom is either too old or too large
22568 + * (see atom_should_commit()). This is done to avoid committing too
22569 + * frequently, and because:
22570 + *
22571 + * 2. sometimes we don't want to commit an atom when closing the last
22572 + * transaction handle even if it is old and fat enough. For example, because
22573 + * we are at this point under the directory semaphore, and committing would
22574 + * stall all accesses to this directory.
22575 + *
22576 + * ktxnmgrd bides its time sleeping on a condition variable. When it awakes,
22577 + * either due to a (tunable) timeout or because it was explicitly woken up by
22578 + * a call to ktxnmgrd_kick(), it scans the list of all atoms and commits the
22579 + * eligible ones.
22580 + *
22581 + */
22582 +
22583 +#include "debug.h"
22584 +#include "txnmgr.h"
22585 +#include "tree.h"
22586 +#include "ktxnmgrd.h"
22587 +#include "super.h"
22588 +#include "reiser4.h"
22589 +
22590 +#include <linux/sched.h> /* for struct task_struct */
22591 +#include <linux/wait.h>
22592 +#include <linux/suspend.h>
22593 +#include <linux/kernel.h>
22594 +#include <linux/writeback.h>
22595 +#include <linux/kthread.h>
22596 +
22597 +static int scan_mgr(struct super_block *);
22598 +
22599 +/*
22600 + * change current->comm so that ps, top, and friends will see changed
22601 + * state. This serves no useful purpose whatsoever, but also costs nothing.
22602 + * Maybe it will make a lonely system administrator feel less alone at 3 A.M.
22603 + */
22604 +#define set_comm( state ) \
22605 + snprintf( current -> comm, sizeof( current -> comm ), \
22606 + "%s:%s:%s", __FUNCTION__, (super)->s_id, ( state ) )
22607 +
22608 +/**
22609 + * ktxnmgrd - kernel txnmgr daemon
22610 + * @arg: pointer to super block
22611 + *
22612 + * The background transaction manager daemon, started as a kernel thread during
22613 + * reiser4 initialization.
22614 + */
22615 +static int ktxnmgrd(void *arg)
22616 +{
22617 + struct super_block *super;
22618 + ktxnmgrd_context *ctx;
22619 + txn_mgr *mgr;
22620 + int done = 0;
22621 +
22622 + super = arg;
22623 + mgr = &get_super_private(super)->tmgr;
22624 +
22625 + /*
22626 + * do_fork() just copies task_struct into the new thread. ->fs_context
22627 + * shouldn't be copied of course. This shouldn't be a problem for the
22628 + * rest of the code though.
22629 + */
22630 + current->journal_info = NULL;
22631 + ctx = mgr->daemon;
22632 + while (1) {
22633 + try_to_freeze();
22634 + set_comm("wait");
22635 + {
22636 + DEFINE_WAIT(__wait);
22637 +
22638 + prepare_to_wait(&ctx->wait, &__wait, TASK_INTERRUPTIBLE);
22639 + if (kthread_should_stop()) {
22640 + done = 1;
22641 + } else
22642 + schedule_timeout(ctx->timeout);
22643 + finish_wait(&ctx->wait, &__wait);
22644 + }
22645 + if (done)
22646 + break;
22647 + set_comm("run");
22648 + spin_lock(&ctx->guard);
22649 + /*
22650 + * wait timed out or ktxnmgrd was woken up by explicit request
22651 + * to commit something. Scan list of atoms in txnmgr and look
22652 + * for too old atoms.
22653 + */
22654 + do {
22655 + ctx->rescan = 0;
22655 + spin_unlock(&ctx->guard); /* guard must be dropped: scan_mgr() may sleep */
22656 + scan_mgr(super);
22657 + spin_lock(&ctx->guard);
22658 + if (ctx->rescan) {
22659 + /*
22660 + * the list could be modified while ctx
22661 + * spinlock was released, we have to repeat
22662 + * scanning from the beginning
22663 + */
22664 + break;
22665 + }
22666 + } while (ctx->rescan);
22667 + spin_unlock(&ctx->guard);
22668 + }
22669 + return 0;
22670 +}
22671 +
22672 +#undef set_comm
22673 +
22674 +/**
22675 + * init_ktxnmgrd - initialize ktxnmgrd context and start kernel daemon
22676 + * @super: pointer to super block
22677 + *
22678 + * Allocates and initializes ktxnmgrd_context, attaches it to transaction
22679 + * manager. Starts kernel txnmgr daemon. This is called on mount.
22680 + */
22681 +int init_ktxnmgrd(struct super_block *super)
22682 +{
22683 + txn_mgr *mgr;
22684 + ktxnmgrd_context *ctx;
22685 +
22686 + mgr = &get_super_private(super)->tmgr;
22687 +
22688 + assert("zam-1014", mgr->daemon == NULL);
22689 +
22690 + ctx = kmalloc(sizeof(ktxnmgrd_context), get_gfp_mask());
22691 + if (ctx == NULL)
22692 + return RETERR(-ENOMEM);
22693 +
22694 + assert("nikita-2442", ctx != NULL);
22695 +
22696 + memset(ctx, 0, sizeof *ctx);
22697 + init_waitqueue_head(&ctx->wait);
22698 +
22699 + /*kcond_init(&ctx->startup);*/
22700 + spin_lock_init(&ctx->guard);
22701 + ctx->timeout = REISER4_TXNMGR_TIMEOUT;
22702 + ctx->rescan = 1;
22703 + mgr->daemon = ctx;
22704 +
22705 + ctx->tsk = kthread_run(ktxnmgrd, super, "ktxnmgrd");
22706 + if (IS_ERR(ctx->tsk)) {
22707 + int ret = PTR_ERR(ctx->tsk);
22708 + mgr->daemon = NULL;
22709 + kfree(ctx);
22710 + return RETERR(ret);
22711 + }
22712 + return 0;
22713 +}
22714 +
22715 +void ktxnmgrd_kick(txn_mgr *mgr)
22716 +{
22717 + assert("nikita-3234", mgr != NULL);
22718 + assert("nikita-3235", mgr->daemon != NULL);
22719 + wake_up(&mgr->daemon->wait);
22720 +}
22721 +
22722 +int is_current_ktxnmgrd(void)
22723 +{
22724 + return (get_current_super_private()->tmgr.daemon->tsk == current);
22725 +}
22726 +
22727 +/**
22728 + * scan_mgr - commit atoms which are to be committed
22729 + * @super: super block to commit atoms of
22730 + *
22731 + * Commits old atoms.
22732 + */
22733 +static int scan_mgr(struct super_block *super)
22734 +{
22735 + int ret;
22736 + reiser4_context ctx;
22737 +
22738 + init_stack_context(&ctx, super);
22739 +
22740 + ret = commit_some_atoms(&get_super_private(super)->tmgr);
22741 +
22742 + reiser4_exit_context(&ctx);
22743 + return ret;
22744 +}
22745 +
22746 +/**
22747 + * done_ktxnmgrd - stop kernel thread and free ktxnmgrd context
22748 + * @super: super block whose ktxnmgrd is to be stopped
22749 + *
22750 + * This is called on umount. Stops ktxnmgrd and frees its context.
22751 + */
22752 +void done_ktxnmgrd(struct super_block *super)
22753 +{
22754 + txn_mgr *mgr;
22755 +
22756 + mgr = &get_super_private(super)->tmgr;
22757 + assert("zam-1012", mgr->daemon != NULL);
22758 +
22759 + kthread_stop(mgr->daemon->tsk);
22760 + kfree(mgr->daemon);
22761 + mgr->daemon = NULL;
22762 +}
22763 +
22764 +/*
22765 + * Local variables:
22766 + * c-indentation-style: "K&R"
22767 + * mode-name: "LC"
22768 + * c-basic-offset: 8
22769 + * tab-width: 8
22770 + * fill-column: 120
22771 + * End:
22772 + */
22773 Index: linux-2.6.16/fs/reiser4/ktxnmgrd.h
22774 ===================================================================
22775 --- /dev/null
22776 +++ linux-2.6.16/fs/reiser4/ktxnmgrd.h
22777 @@ -0,0 +1,52 @@
22778 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
22779 + * reiser4/README */
22780 +
22781 +/* Transaction manager daemon. See ktxnmgrd.c for comments. */
22782 +
22783 +#ifndef __KTXNMGRD_H__
22784 +#define __KTXNMGRD_H__
22785 +
22786 +#include "txnmgr.h"
22787 +
22788 +#include <linux/fs.h>
22789 +#include <linux/wait.h>
22790 +#include <linux/completion.h>
22791 +#include <linux/spinlock.h>
22792 +#include <asm/atomic.h>
22793 +#include <linux/sched.h> /* for struct task_struct */
22794 +
22795 +/* in this structure all data necessary to start up, shut down and communicate
22796 + * with ktxnmgrd are kept. */
22797 +struct ktxnmgrd_context {
22798 + /* wait queue head on which ktxnmgrd sleeps */
22799 + wait_queue_head_t wait;
22800 + /* spin lock protecting all fields of this structure */
22801 + spinlock_t guard;
22802 + /* timeout of sleeping on ->wait */
22803 + signed long timeout;
22804 + /* kernel thread running ktxnmgrd */
22805 + struct task_struct *tsk;
22806 + /* list of all file systems served by this ktxnmgrd */
22807 + struct list_head queue;
22808 + /* should ktxnmgrd repeat scanning of atoms? */
22809 + unsigned int rescan:1;
22810 +};
22811 +
22812 +extern int init_ktxnmgrd(struct super_block *);
22813 +extern void done_ktxnmgrd(struct super_block *);
22814 +
22815 +extern void ktxnmgrd_kick(txn_mgr * mgr);
22816 +extern int is_current_ktxnmgrd(void);
22817 +
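A minimal sketch of the intended mount/umount pairing, assuming only the
init_ktxnmgrd()/done_ktxnmgrd() entry points declared above (the surrounding
caller is hypothetical, error handling elided):

        int ret;

        ret = init_ktxnmgrd(super);     /* mount path: start the daemon */
        if (ret != 0)
                return ret;             /* fail the mount */
        /* ... filesystem lifetime ... */
        done_ktxnmgrd(super);           /* umount path: stop it, free ctx */
        return 0;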
22818 +/* __KTXNMGRD_H__ */
22819 +#endif
22820 +
22821 +/* Make Linus happy.
22822 + Local variables:
22823 + c-indentation-style: "K&R"
22824 + mode-name: "LC"
22825 + c-basic-offset: 8
22826 + tab-width: 8
22827 + fill-column: 120
22828 + End:
22829 +*/
22830 Index: linux-2.6.16/fs/reiser4/lock.c
22831 ===================================================================
22832 --- /dev/null
22833 +++ linux-2.6.16/fs/reiser4/lock.c
22834 @@ -0,0 +1,1261 @@
22835 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
22836 + * reiser4/README */
22837 +
22838 +/* Traditional deadlock avoidance is achieved by acquiring all locks in a single
22839 + order. V4 balances the tree from the bottom up, and searches the tree from
22840 + the top down, and that is really the way we want it, so tradition won't work
22841 + for us.
22842 +
22843 + Instead we have two lock orderings, a high priority lock ordering, and a low
22844 + priority lock ordering. Each node in the tree has a lock in its znode.
22845 +
22846 + Suppose we have a set of processes which lock (R/W) tree nodes. Each process
22847 + has a set (maybe empty) of already locked nodes ("process locked set"). Each
22848 + process may have a pending lock request to a node locked by another process.
22849 + Note: we lock and unlock, but do not transfer locks: it is possible
22850 + transferring locks instead would save some bus locking....
22851 +
22852 + Deadlock occurs when we have a loop constructed from process locked sets and
22853 + lock request vectors.
22854 +
22855 + NOTE: The reiser4 "tree" is a tree on disk, but its cached representation in
22856 + memory is extended with "znodes" with which we connect nodes with their left
22857 + and right neighbors using sibling pointers stored in the znodes. When we
22858 + perform balancing operations we often go from left to right and from right to
22859 + left.
22860 +
22861 + +-P1-+ +-P3-+
22862 + |+--+| V1 |+--+|
22863 + ||N1|| -------> ||N3||
22864 + |+--+| |+--+|
22865 + +----+ +----+
22866 + ^ |
22867 + |V2 |V3
22868 + | v
22869 + +---------P2---------+
22870 + |+--+ +--+|
22871 + ||N2| -------- |N4||
22872 + |+--+ +--+|
22873 + +--------------------+
22874 +
22875 + We solve this by ensuring that only low priority processes lock in top to
22876 + bottom order and from right to left, and high priority processes lock from
22877 + bottom to top and left to right.
22878 +
22879 + ZAM-FIXME-HANS: order not just node locks in this way, order atom locks, and
22880 + kill those damn busy loops.
22881 + ANSWER(ZAM): atom locks (which are introduced by ASTAGE_CAPTURE_WAIT atom
22882 + stage) cannot be ordered that way. There are no rules what nodes can belong
22883 + to the atom and what nodes cannot. We cannot define what is right or left
22884 + direction, what is top or bottom. We can take immediate parent or side
22885 + neighbor of one node, but nobody guarantees that, say, left neighbor node is
22886 + not a far right neighbor for other nodes from the same atom. It breaks
22887 + deadlock avoidance rules and hi-low priority locking cannot be applied for
22888 + atom locks.
22889 +
22890 + How does it help to avoid deadlocks?
22891 +
22892 + Suppose we have a deadlock with n processes. Processes from one priority
22893 + class never deadlock because they take locks in one consistent
22894 + order.
22895 +
22896 + So, any possible deadlock loop must have low priority as well as high
22897 + priority processes. There are no other lock priority levels except low and
22898 + high. We know that any deadlock loop contains at least one node locked by a
22899 + low priority process and requested by a high priority process. If this
22900 + situation is caught and resolved it is sufficient to avoid deadlocks.
22901 +
22902 + V4 DEADLOCK PREVENTION ALGORITHM IMPLEMENTATION.
22903 +
22904 + The deadlock prevention algorithm is based on comparing
22905 + priorities of node owners (processes which keep znode locked) and
22906 + requesters (processes which want to acquire a lock on znode). We
22907 + implement a scheme where low-priority owners yield locks to
22908 + high-priority requesters. We created a signal passing system that
22909 + is used to ask low-priority processes to yield one or more locked
22910 + znodes.
22911 +
22912 + The condition when a znode needs to change its owners is described by the
22913 + following formula:
22914 +
22915 + #############################################
22916 + # #
22917 + # (number of high-priority requesters) > 0 #
22918 + # AND #
22919 + # (numbers of high-priority owners) == 0 #
22920 + # #
22921 + #############################################
22922 +
22923 + Note that a low-priority process delays node releasing if another
22924 + high-priority process owns this node. So, slightly more strictly speaking,
22925 + to have a deadlock capable cycle you must have a loop in which a high
22926 + priority process is waiting on a low priority process to yield a node, which
22927 + is slightly different from saying a high priority process is waiting on a
22928 + node owned by a low priority process.
22929 +
22930 + It is enough to avoid deadlocks if we prevent any low-priority process from
22931 + falling asleep if its locked set contains a node which satisfies the
22932 + deadlock condition.
22933 +
22934 + That condition is implicitly or explicitly checked in all places where new
22935 + high-priority requests may be added or removed from node request queue or
22936 + high-priority process takes or releases a lock on node. The main
22937 + goal of these checks is to never lose the moment when node becomes "has
22938 + wrong owners" and send "must-yield-this-lock" signals to its low-pri owners
22939 + at that time.
22940 +
22941 + The information about received signals is stored in the per-process
22942 + structure (lock stack) and analyzed before a low-priority process goes to
22943 + sleep but after a "fast" attempt to lock a node fails. Any signal wakes
22944 + sleeping process up and forces it to re-check lock status and received
22945 + signal info. If "must-yield-this-lock" signals were received the locking
22946 + primitive (longterm_lock_znode()) fails with -E_DEADLOCK error code.
22947 +
22948 + V4 LOCKING DRAWBACKS
22949 +
22950 + If we have already balanced on one level, and we are propagating our changes
22951 + upward to a higher level, it could be very messy to surrender all locks on
22952 + the lower level because we put so much computational work into it, and
22953 + reverting them to their state before they were locked might be very complex.
22954 + We also don't want to acquire all locks before performing balancing because
22955 + that would either be almost as much work as the balancing, or it would be
22956 + too conservative and lock too much. We want balancing to be done only at
22957 + high priority. Yet, we might want to go to the left one node and use some
22958 + of its empty space... So we make one attempt at getting the node to the left
22959 + using try_lock, and if it fails we do without it, because we didn't really
22960 + need it, it was only a nice to have.
22961 +
22962 + LOCK STRUCTURES DESCRIPTION
22963 +
22964 + The following data structures are used in the reiser4 locking
22965 + implementation:
22966 +
22967 + All fields related to long-term locking are stored in znode->lock.
22968 +
22969 + The lock stack is a per thread object. It owns all znodes locked by the
22970 + thread. One znode may be locked by several threads in case of read lock or
22971 + one znode may be write locked by one thread several times. The special link
22972 + objects (lock handles) support n<->m relation between znodes and lock
22973 + owners.
22974 +
22975 + <Thread 1> <Thread 2>
22976 +
22977 + +---------+ +---------+
22978 + | LS1 | | LS2 |
22979 + +---------+ +---------+
22980 + ^ ^
22981 + |---------------+ +----------+
22982 + v v v v
22983 + +---------+ +---------+ +---------+ +---------+
22984 + | LH1 | | LH2 | | LH3 | | LH4 |
22985 + +---------+ +---------+ +---------+ +---------+
22986 + ^ ^ ^ ^
22987 + | +------------+ |
22988 + v v v
22989 + +---------+ +---------+ +---------+
22990 + | Z1 | | Z2 | | Z3 |
22991 + +---------+ +---------+ +---------+
22992 +
22993 + Thread 1 locked znodes Z1 and Z2, thread 2 locked znodes Z2 and Z3. The
22994 + picture above shows that lock stack LS1 has a list of 2 lock handles LH1 and
22995 + LH2, lock stack LS2 has a list with lock handles LH3 and LH4 on it. Znode
22996 + Z1 is locked by only one thread, znode has only one lock handle LH1 on its
22997 + list, similar situation is for Z3 which is locked by the thread 2 only. Z2
22998 + is locked (for read) twice by different threads and two lock handles are on
22999 + its list. Each lock handle represents a single relation of a locking of a
23000 + znode by a thread. Locking of a znode is an establishing of a locking
23001 + relation between the lock stack and the znode by adding of a new lock handle
23002 + to a list of lock handles, the lock stack. The lock stack links all lock
23003 + handles for all znodes locked by the lock stack. The znode list groups all
23004 + lock handles for all locks stacks which locked the znode.
23005 +
23006 + Yet another relation may exist between znode and lock owners. If lock
23007 + procedure cannot immediately take lock on an object it adds the lock owner
23008 + on the special `requestors' list belonging to the znode. That list
23009 + represents a queue of pending lock requests. Because one lock owner may
23010 + request only one lock object at a time, it is a 1->n relation between lock
23011 + objects and a lock owner, implemented as described above. Full information
23012 + (priority, pointers to lock and link objects) about each lock request is
23013 + stored in lock owner structure in `request' field.
23014 +
23015 + SHORT_TERM LOCKING
23016 +
23017 + This is a list of primitive operations over lock stacks / lock handles /
23018 + znodes and locking descriptions for them.
23019 +
23020 + 1. locking / unlocking which is done by two list insertion/deletion, one
23021 + to/from znode's list of lock handles, another one is to/from lock stack's
23022 + list of lock handles. The first insertion is protected by
23023 + znode->lock.guard spinlock. The list owned by the lock stack can be
23024 + modified only by thread who owns the lock stack and nobody else can
23025 + modify/read it. There is nothing to be protected by a spinlock or
23026 + something else.
23027 +
23028 + 2. adding/removing a lock request to/from znode requesters list. The rule is
23029 + that znode->lock.guard spinlock should be taken for this.
23030 +
23031 + 3. we can traverse list of lock handles and use references to lock stacks who
23032 + locked given znode if znode->lock.guard spinlock is taken.
23033 +
23034 + 4. If a lock stack is associated with a znode as a lock requestor or lock
23035 + owner, its existence is guaranteed by znode->lock.guard spinlock. Some of its
23036 + (lock stack's) fields should be protected from being accessed in parallel
23037 + by two or more threads. Please look at lock_stack structure definition
23038 + for the info how those fields are protected. */
23039 +
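To make the description above concrete, a minimal usage sketch for the
long-term locking primitives. It assumes the init_lh() lock-handle
initializer that the patch defines elsewhere (not visible in this hunk);
the surrounding caller is hypothetical:

        lock_handle lh;
        int ret;

        init_lh(&lh);           /* assumed helper: zeroes the handle */
        ret = longterm_lock_znode(&lh, node, ZNODE_READ_LOCK, ZNODE_LOCK_LOPRI);
        if (ret == 0) {
                /* ... use the node ... */
                longterm_unlock_znode(&lh);
        } else if (ret == -E_DEADLOCK) {
                /* a high-priority requester signaled us: release all other
                 * held locks and restart the operation from the top */
        }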
23040 +/* Znode lock and capturing intertwining. */
23041 +/* In current implementation we capture formatted nodes before locking
23042 + them. Take a look at longterm_lock_znode(): the try_capture() request precedes
23043 + locking requests. The longterm_lock_znode function unconditionally captures
23044 + the znode before even checking the locking conditions.
23045 +
23046 + Another variant is to capture znode after locking it. It was not tested, but
23047 + at least one deadlock condition is supposed to be there. One thread has
23048 + locked a znode (Node-1) and calls try_capture() for it. Try_capture() sleeps
23049 + because znode's atom has CAPTURE_WAIT state. Second thread is a flushing
23050 + thread, its current atom is the atom Node-1 belongs to. Second thread wants
23051 + to lock Node-1 and sleeps because Node-1 is locked by the first thread. The
23052 + described situation is a deadlock. */
23053 +
23054 +#include "debug.h"
23055 +#include "txnmgr.h"
23056 +#include "znode.h"
23057 +#include "jnode.h"
23058 +#include "tree.h"
23059 +#include "plugin/node/node.h"
23060 +#include "super.h"
23061 +
23062 +#include <linux/spinlock.h>
23063 +
23064 +#if REISER4_DEBUG
23065 +static int request_is_deadlock_safe(znode *, znode_lock_mode,
23066 + znode_lock_request);
23067 +#endif
23068 +
23069 +/* Returns a lock owner associated with current thread */
23070 +lock_stack *get_current_lock_stack(void)
23071 +{
23072 + return &get_current_context()->stack;
23073 +}
23074 +
23075 +/* Wakes up all low priority owners informing them about possible deadlock */
23076 +static void wake_up_all_lopri_owners(znode * node)
23077 +{
23078 + lock_handle *handle;
23079 +
23080 + assert_spin_locked(&(node->lock.guard));
23081 + list_for_each_entry(handle, &node->lock.owners, owners_link) {
23082 + assert("nikita-1832", handle->node == node);
23083 + /* count this signal in owner->nr_signaled */
23084 + if (!handle->signaled) {
23085 + handle->signaled = 1;
23086 + atomic_inc(&handle->owner->nr_signaled);
23087 + /* Wake up a single process */
23088 + reiser4_wake_up(handle->owner);
23089 + }
23090 + }
23091 +}
23092 +
23093 +/* Adds a lock to a lock owner, which means creating a link to the lock and
23094 + putting the link into the two lists all links are on (the doubly linked list
23095 + that forms the lock_stack, and the doubly linked list of links attached
23096 + to a lock.
23097 +*/
23098 +static inline void
23099 +link_object(lock_handle * handle, lock_stack * owner, znode * node)
23100 +{
23101 + assert("jmacd-810", handle->owner == NULL);
23102 + assert_spin_locked(&(node->lock.guard));
23103 +
23104 + handle->owner = owner;
23105 + handle->node = node;
23106 +
23107 + assert("reiser4-4",
23108 + ergo(list_empty_careful(&owner->locks), owner->nr_locks == 0));
23109 +
23110 + /* add lock handle to the end of lock_stack's list of locks */
23111 + list_add_tail(&handle->locks_link, &owner->locks);
23112 + ON_DEBUG(owner->nr_locks++);
23113 + set_gfp_mask();
23114 +
23115 + /* add lock handle to the head of znode's list of owners */
23116 + list_add(&handle->owners_link, &node->lock.owners);
23117 + handle->signaled = 0;
23118 +}
23119 +
23120 +/* Breaks a relation between a lock and its owner */
23121 +static inline void unlink_object(lock_handle * handle)
23122 +{
23123 + assert("zam-354", handle->owner != NULL);
23124 + assert("nikita-1608", handle->node != NULL);
23125 + assert_spin_locked(&(handle->node->lock.guard));
23126 + assert("nikita-1829", handle->owner == get_current_lock_stack());
23127 + assert("reiser4-5", handle->owner->nr_locks > 0);
23128 +
23129 + /* remove lock handle from lock_stack's list of locks */
23130 + list_del(&handle->locks_link);
23131 + ON_DEBUG(handle->owner->nr_locks--);
23132 + set_gfp_mask();
23133 + assert("reiser4-6",
23134 + ergo(list_empty_careful(&handle->owner->locks),
23135 + handle->owner->nr_locks == 0));
23136 + /* remove lock handle from znode's list of owners */
23137 + list_del(&handle->owners_link);
23138 + /* indicates that lock handle is free now */
23139 + handle->node = NULL;
23140 +#if REISER4_DEBUG
23141 + INIT_LIST_HEAD(&handle->locks_link);
23142 + INIT_LIST_HEAD(&handle->owners_link);
23143 + handle->owner = NULL;
23144 +#endif
23145 +}
23146 +
23147 +/* Actually locks an object knowing that we are able to do this */
23148 +static void lock_object(lock_stack * owner)
23149 +{
23150 + lock_request *request;
23151 + znode *node;
23152 +
23153 + request = &owner->request;
23154 + node = request->node;
23155 + assert_spin_locked(&(node->lock.guard));
23156 + if (request->mode == ZNODE_READ_LOCK) {
23157 + node->lock.nr_readers++;
23158 + } else {
23159 + /* check that we didn't switch from read to write lock */
23160 + assert("nikita-1840", node->lock.nr_readers <= 0);
23161 + /* We allow recursive locking; a node can be locked several
23162 + times for write by same process */
23163 + node->lock.nr_readers--;
23164 + }
23165 +
23166 + link_object(request->handle, owner, node);
23167 +
23168 + if (owner->curpri) {
23169 + node->lock.nr_hipri_owners++;
23170 + }
23171 +}
23172 +
23173 +/* Check for recursive write locking */
23174 +static int recursive(lock_stack * owner)
23175 +{
23176 + int ret;
23177 + znode *node;
23178 + lock_handle *lh;
23179 +
23180 + node = owner->request.node;
23181 +
23182 + /* Owners list is not empty for a locked node */
23183 + assert("zam-314", !list_empty_careful(&node->lock.owners));
23184 + assert("nikita-1841", owner == get_current_lock_stack());
23185 + assert_spin_locked(&(node->lock.guard));
23186 +
23187 +
23188 + lh = list_entry(node->lock.owners.next, lock_handle, owners_link);
23189 + ret = (lh->owner == owner);
23190 +
23191 + /* Recursive read locking should be done the usual way */
23192 + assert("zam-315", !ret || owner->request.mode == ZNODE_WRITE_LOCK);
23193 + /* mixing of read/write locks is not allowed */
23194 + assert("zam-341", !ret || znode_is_wlocked(node));
23195 +
23196 + return ret;
23197 +}
23198 +
23199 +#if REISER4_DEBUG
23200 +/* Returns true if the lock is held by the calling thread. */
23201 +int znode_is_any_locked(const znode * node)
23202 +{
23203 + lock_handle *handle;
23204 + lock_stack *stack;
23205 + int ret;
23206 +
23207 + if (!znode_is_locked(node)) {
23208 + return 0;
23209 + }
23210 +
23211 + stack = get_current_lock_stack();
23212 +
23213 + spin_lock_stack(stack);
23214 +
23215 + ret = 0;
23216 +
23217 + list_for_each_entry(handle, &stack->locks, locks_link) {
23218 + if (handle->node == node) {
23219 + ret = 1;
23220 + break;
23221 + }
23222 + }
23223 +
23224 + spin_unlock_stack(stack);
23225 +
23226 + return ret;
23227 +}
23228 +
23229 +#endif
23230 +
23231 +/* Returns true if a write lock is held by the calling thread. */
23232 +int znode_is_write_locked(const znode * node)
23233 +{
23234 + lock_stack *stack;
23235 + lock_handle *handle;
23236 +
23237 + assert("jmacd-8765", node != NULL);
23238 +
23239 + if (!znode_is_wlocked(node)) {
23240 + return 0;
23241 + }
23242 +
23243 + stack = get_current_lock_stack();
23244 +
23245 + /*
23246 + * When znode is write locked, all owner handles point to the same lock
23247 + * stack. Get pointer to lock stack from the first lock handle from
23248 + * znode's owner list
23249 + */
23250 + handle = list_entry(node->lock.owners.next, lock_handle, owners_link);
23251 +
23252 + return (handle->owner == stack);
23253 +}
23254 +
23255 +/* This "deadlock" condition is the essential part of reiser4 locking
23256 + implementation. This condition is checked explicitly by calling
23257 + check_deadlock_condition() or implicitly in all places where znode lock
23258 + state (set of owners and request queue) is changed. Locking code is
23259 + designed to use this condition to trigger procedure of passing object from
23260 + low priority owner(s) to high priority one(s).
23261 +
23262 + The procedure results in passing an event (setting lock_handle->signaled
23263 + flag) and counting this event in nr_signaled field of owner's lock stack
23264 + object and wakeup owner's process.
23265 +*/
23266 +static inline int check_deadlock_condition(znode * node)
23267 +{
23268 + assert_spin_locked(&(node->lock.guard));
23269 + return node->lock.nr_hipri_requests > 0
23270 + && node->lock.nr_hipri_owners == 0;
23271 +}
23272 +
23273 +static int check_livelock_condition(znode * node, znode_lock_mode mode)
23274 +{
23275 + zlock * lock = &node->lock;
23276 +
23277 + return mode == ZNODE_READ_LOCK &&
23278 + lock->nr_readers >= 0 && lock->nr_hipri_write_requests > 0;
23279 +}
23280 +
23281 +/* checks lock/request compatibility */
23282 +static int can_lock_object(lock_stack * owner)
23283 +{
23284 + znode *node = owner->request.node;
23285 +
23286 + assert_spin_locked(&(node->lock.guard));
23287 +
23288 + /* See if the node is disconnected. */
23289 + if (unlikely(ZF_ISSET(node, JNODE_IS_DYING)))
23290 + return RETERR(-EINVAL);
23291 +
23292 + /* Do not ever try to take a lock if we are going in low priority
23293 + direction and a node has a high priority request without high
23294 + priority owners. */
23295 + if (unlikely(!owner->curpri && check_deadlock_condition(node)))
23296 + return RETERR(-E_REPEAT);
23297 + if (unlikely(owner->curpri && check_livelock_condition(node, owner->request.mode)))
23298 + return RETERR(-E_REPEAT);
23299 + if (unlikely(!is_lock_compatible(node, owner->request.mode)))
23300 + return RETERR(-E_REPEAT);
23301 + return 0;
23302 +}
23303 +
23304 +/* Sets a high priority for the process. It clears "signaled" flags
23305 + because znode locked by high-priority process can't satisfy our "deadlock
23306 + condition". */
23307 +static void set_high_priority(lock_stack * owner)
23308 +{
23309 + assert("nikita-1846", owner == get_current_lock_stack());
23310 + /* Do nothing if current priority is already high */
23311 + if (!owner->curpri) {
23312 + /* We don't need locking for owner->locks list, because this
23313 + * function is only called with the lock stack of the current
23314 + * thread, and no other thread can play with owner->locks list
23315 + * and/or change ->node pointers of lock handles in this list.
23316 + *
23317 + * (Interrupts also are not involved.)
23318 + */
23319 + lock_handle *item = list_entry(owner->locks.next, lock_handle, locks_link);
23320 + while (&owner->locks != &item->locks_link) {
23321 + znode *node = item->node;
23322 +
23323 + spin_lock_zlock(&node->lock);
23324 +
23325 + node->lock.nr_hipri_owners++;
23326 +
23327 + /* we can safely set signaled to zero, because
23328 + previous statement (nr_hipri_owners ++) guarantees
23329 + that signaled will never be set again. */
23330 + item->signaled = 0;
23331 + spin_unlock_zlock(&node->lock);
23332 +
23333 + item = list_entry(item->locks_link.next, lock_handle, locks_link);
23334 + }
23335 + owner->curpri = 1;
23336 + atomic_set(&owner->nr_signaled, 0);
23337 + }
23338 +}
23339 +
23340 +/* Sets a low priority for the process. */
23341 +static void set_low_priority(lock_stack * owner)
23342 +{
23343 + assert("nikita-3075", owner == get_current_lock_stack());
23344 + /* Do nothing if current priority is already low */
23345 + if (owner->curpri) {
23346 + /* scan all locks (lock handles) held by @owner, which is
23347 + actually current thread, and check whether we are reaching
23348 + deadlock possibility anywhere.
23349 + */
23350 + lock_handle *handle = list_entry(owner->locks.next, lock_handle, locks_link);
23351 + while (&owner->locks != &handle->locks_link) {
23352 + znode *node = handle->node;
23353 + spin_lock_zlock(&node->lock);
23354 + /* this thread was just a hipri owner of @node, so
23355 + nr_hipri_owners has to be greater than zero. */
23356 + assert("nikita-1835", node->lock.nr_hipri_owners > 0);
23357 + node->lock.nr_hipri_owners--;
23358 + /* If we have deadlock condition, adjust a nr_signaled
23359 + field. It is enough to set "signaled" flag only for
23360 + current process, other low-pri owners will be
23361 + signaled and woken up after the current process unlocks
23362 + this object and any high-priority requestor takes
23363 + control. */
23364 + if (check_deadlock_condition(node)
23365 + && !handle->signaled) {
23366 + handle->signaled = 1;
23367 + atomic_inc(&owner->nr_signaled);
23368 + }
23369 + spin_unlock_zlock(&node->lock);
23370 + handle = list_entry(handle->locks_link.next, lock_handle, locks_link);
23371 + }
23372 + owner->curpri = 0;
23373 + }
23374 +}
23375 +
23376 +static void remove_lock_request(lock_stack * requestor)
23377 +{
23378 + zlock * lock = &requestor->request.node->lock;
23379 +
23380 + if (requestor->curpri) {
23381 + assert("nikita-1838", lock->nr_hipri_requests > 0);
23382 + lock->nr_hipri_requests--;
23383 + if (requestor->request.mode == ZNODE_WRITE_LOCK)
23384 + lock->nr_hipri_write_requests--;
23385 + }
23386 + list_del(&requestor->requestors_link);
23387 +}
23388 +
23389 +
23390 +static void invalidate_all_lock_requests(znode * node)
23391 +{
23392 + lock_stack *requestor, *tmp;
23393 +
23394 + assert_spin_locked(&(node->lock.guard));
23395 +
23396 + list_for_each_entry_safe(requestor, tmp, &node->lock.requestors, requestors_link) {
23397 + remove_lock_request(requestor);
23398 + requestor->request.ret_code = -EINVAL;
23399 + reiser4_wake_up(requestor);
23400 + requestor->request.mode = ZNODE_NO_LOCK;
23401 + }
23402 +}
23403 +
23404 +static void dispatch_lock_requests(znode * node)
23405 +{
23406 + lock_stack *requestor, *tmp;
23407 +
23408 + assert_spin_locked(&(node->lock.guard));
23409 +
23410 + list_for_each_entry_safe(requestor, tmp, &node->lock.requestors, requestors_link) {
23411 + if (znode_is_write_locked(node))
23412 + break;
23413 + if (!can_lock_object(requestor)) { /* 0 means request can be granted */
23414 + lock_object(requestor);
23415 + remove_lock_request(requestor);
23416 + requestor->request.ret_code = 0;
23417 + reiser4_wake_up(requestor);
23418 + requestor->request.mode = ZNODE_NO_LOCK;
23419 + }
23420 + }
23421 +}
23422 +
23423 +/* release long-term lock, acquired by longterm_lock_znode() */
23424 +void longterm_unlock_znode(lock_handle * handle)
23425 +{
23426 + znode *node = handle->node;
23427 + lock_stack *oldowner = handle->owner;
23428 + int hipri;
23429 + int readers;
23430 + int rdelta;
23431 + int youdie;
23432 +
23433 + /*
23434 + * this is time-critical and highly optimized code. Modify carefully.
23435 + */
23436 +
23437 + assert("jmacd-1021", handle != NULL);
23438 + assert("jmacd-1022", handle->owner != NULL);
23439 + assert("nikita-1392", LOCK_CNT_GTZ(long_term_locked_znode));
23440 +
23441 + assert("zam-130", oldowner == get_current_lock_stack());
23442 +
23443 + LOCK_CNT_DEC(long_term_locked_znode);
23444 +
23445 + /*
23446 + * to minimize the amount of work performed under the lock, pre-compute
23447 + * all variables used within the critical section. This makes the code
23448 + * obscure.
23449 + */
23450 +
23451 + /* was this lock of hi or lo priority */
23452 + hipri = oldowner->curpri ? -1 : 0;
23453 + /* number of readers */
23454 + readers = node->lock.nr_readers;
23455 + /* +1 if write lock, -1 if read lock */
23456 + rdelta = (readers > 0) ? -1 : +1;
23457 + /* true if node is to die and write lock is released */
23458 + youdie = ZF_ISSET(node, JNODE_HEARD_BANSHEE) && (readers < 0);
23459 +
23460 + spin_lock_zlock(&node->lock);
23461 +
23462 + assert("zam-101", znode_is_locked(node));
23463 +
23464 + /* Adjust a number of high priority owners of this lock */
23465 + node->lock.nr_hipri_owners += hipri;
23466 + assert("nikita-1836", node->lock.nr_hipri_owners >= 0);
23467 +
23468 + /* Handle znode deallocation on last write-lock release. */
23469 + if (znode_is_wlocked_once(node)) {
23470 + if (youdie) {
23471 + forget_znode(handle);
23472 + assert("nikita-2191", znode_invariant(node));
23473 + zput(node);
23474 + return;
23475 + }
23476 + }
23477 +
23478 + if (handle->signaled)
23479 + atomic_dec(&oldowner->nr_signaled);
23480 +
23481 + /* Unlocking means owner<->object link deletion */
23482 + unlink_object(handle);
23483 +
23484 + /* This is enough to determine whether an object is completely
23485 + unlocked. */
23486 + node->lock.nr_readers += rdelta;
23487 +
23488 + /* If the node is locked it must have an owners list. Likewise, if
23489 + the node is unlocked it must have an empty owners list. */
23490 + assert("zam-319", equi(znode_is_locked(node),
23491 + !list_empty_careful(&node->lock.owners)));
23492 +
23493 +#if REISER4_DEBUG
23494 + if (!znode_is_locked(node))
23495 + ++node->times_locked;
23496 +#endif
23497 +
23498 + /* If there are pending lock requests we wake up a requestor */
23499 + if (!znode_is_wlocked(node))
23500 + dispatch_lock_requests(node);
23501 + if (check_deadlock_condition(node))
23502 + wake_up_all_lopri_owners(node);
23503 + spin_unlock_zlock(&node->lock);
23504 +
23505 + /* minus one reference from handle->node */
23506 + assert("nikita-2190", znode_invariant(node));
23507 + ON_DEBUG(check_lock_data());
23508 + ON_DEBUG(check_lock_node_data(node));
23509 + zput(node);
23510 +}
23511 +
23512 +/* final portion of longterm-lock */
23513 +static int
23514 +lock_tail(lock_stack * owner, int ok, znode_lock_mode mode)
23515 +{
23516 + znode *node = owner->request.node;
23517 +
23518 + assert_spin_locked(&(node->lock.guard));
23519 +
23520 + /* If we broke with (ok == 0) it means we can_lock, now do it. */
23521 + if (ok == 0) {
23522 + lock_object(owner);
23523 + owner->request.mode = 0;
23524 + /* count a reference from lockhandle->node
23525 +
23526 + znode was already referenced at the entry to this function,
23527 + hence taking spin-lock here is not necessary (see comment
23528 + in the zref()).
23529 + */
23530 + zref(node);
23531 +
23532 + LOCK_CNT_INC(long_term_locked_znode);
23533 + }
23534 + spin_unlock_zlock(&node->lock);
23535 + ON_DEBUG(check_lock_data());
23536 + ON_DEBUG(check_lock_node_data(node));
23537 + return ok;
23538 +}
23539 +
23540 +/*
23541 + * version of longterm_znode_lock() optimized for the most common case: read
23542 + * lock without any special flags. This is the kind of lock that any tree
23543 + * traversal takes on the root node of the tree, which is very frequent.
23544 + */
23545 +static int longterm_lock_tryfast(lock_stack * owner)
23546 +{
23547 + int result;
23548 + znode *node;
23549 + zlock *lock;
23550 +
23551 + node = owner->request.node;
23552 + lock = &node->lock;
23553 +
23554 + assert("nikita-3340", schedulable());
23555 + assert("nikita-3341", request_is_deadlock_safe(node,
23556 + ZNODE_READ_LOCK,
23557 + ZNODE_LOCK_LOPRI));
23558 + spin_lock_zlock(lock);
23559 + result = can_lock_object(owner);
23560 + spin_unlock_zlock(lock);
23561 +
23562 + if (likely(result != -EINVAL)) {
23563 + spin_lock_znode(node);
23564 + result = try_capture(ZJNODE(node), ZNODE_READ_LOCK, 0);
23565 + spin_unlock_znode(node);
23566 + spin_lock_zlock(lock);
23567 + if (unlikely(result != 0)) {
23568 + owner->request.mode = 0;
23569 + } else {
23570 + result = can_lock_object(owner);
23571 + if (unlikely(result == -E_REPEAT)) {
23572 + /* fall back to longterm_lock_znode() */
23573 + spin_unlock_zlock(lock);
23574 + return 1;
23575 + }
23576 + }
23577 + return lock_tail(owner, result, ZNODE_READ_LOCK);
23578 + } else
23579 + return 1;
23580 +}
23581 +
23582 +/* locks given lock object */
23583 +int longterm_lock_znode(
23584 + /* local link object (allocated by lock owner thread, usually on its own
23585 + * stack) */
23586 + lock_handle * handle,
23587 + /* znode we want to lock. */
23588 + znode * node,
23589 + /* {ZNODE_READ_LOCK, ZNODE_WRITE_LOCK}; */
23590 + znode_lock_mode mode,
23591 + /* {0, -EINVAL, -E_DEADLOCK}, see return codes description. */
23592 + znode_lock_request request) {
23593 + int ret;
23594 + int hipri = (request & ZNODE_LOCK_HIPRI) != 0;
23595 + int non_blocking = 0;
23596 + int has_atom;
23597 + txn_capture cap_flags;
23598 + zlock *lock;
23599 + txn_handle *txnh;
23600 + tree_level level;
23601 +
23602 + /* Get current process context */
23603 + lock_stack *owner = get_current_lock_stack();
23604 +
23605 + /* Check that the lock handle is initialized and isn't already being
23606 + * used. */
23607 + assert("jmacd-808", handle->owner == NULL);
23608 + assert("nikita-3026", schedulable());
23609 + assert("nikita-3219", request_is_deadlock_safe(node, mode, request));
23610 + assert("zam-1056", atomic_read(&ZJNODE(node)->x_count) > 0);
23611 + /* long term locks are not allowed in the VM contexts (->writepage(),
23612 + * prune_{d,i}cache()).
23613 + *
23614 + * FIXME this doesn't work due to unused-dentry-with-unlinked-inode
23615 + * bug caused by d_splice_alias() only working for directories.
23616 + */
23617 + assert("nikita-3547", 1 || ((current->flags & PF_MEMALLOC) == 0));
23618 + assert("zam-1055", mode != ZNODE_NO_LOCK);
23619 +
23620 + cap_flags = 0;
23621 + if (request & ZNODE_LOCK_NONBLOCK) {
23622 + cap_flags |= TXN_CAPTURE_NONBLOCKING;
23623 + non_blocking = 1;
23624 + }
23625 +
23626 + if (request & ZNODE_LOCK_DONT_FUSE)
23627 + cap_flags |= TXN_CAPTURE_DONT_FUSE;
23628 +
23629 + /* If we are changing our process priority we must adjust a number
23630 + of high priority owners for each znode that we already lock */
23631 + if (hipri) {
23632 + set_high_priority(owner);
23633 + } else {
23634 + set_low_priority(owner);
23635 + }
23636 +
23637 + level = znode_get_level(node);
23638 +
23639 + /* Fill request structure with our values. */
23640 + owner->request.mode = mode;
23641 + owner->request.handle = handle;
23642 + owner->request.node = node;
23643 +
23644 + txnh = get_current_context()->trans;
23645 + lock = &node->lock;
23646 +
23647 + if (mode == ZNODE_READ_LOCK && request == 0) {
23648 + ret = longterm_lock_tryfast(owner);
23649 + if (ret <= 0)
23650 + return ret;
23651 + }
23652 +
23653 + has_atom = (txnh->atom != NULL);
23654 +
23655 + /* Synchronize on node's zlock guard lock. */
23656 + spin_lock_zlock(lock);
23657 +
23658 + if (znode_is_locked(node) &&
23659 + mode == ZNODE_WRITE_LOCK && recursive(owner))
23660 + return lock_tail(owner, 0, mode);
23661 +
23662 + for (;;) {
23663 + /* Check the lock's availability: if it is unavailable we get
23664 + E_REPEAT, 0 indicates "can_lock", otherwise the node is
23665 + invalid. */
23666 + ret = can_lock_object(owner);
23667 +
23668 + if (unlikely(ret == -EINVAL)) {
23669 + /* @node is dying. Leave it alone. */
23670 + break;
23671 + }
23672 +
23673 + if (unlikely(ret == -E_REPEAT && non_blocking)) {
23674 + /* either locking of @node by the current thread will
23675 + * lead to a deadlock, or lock modes are
23676 + * incompatible. */
23677 + break;
23678 + }
23679 +
23680 + assert("nikita-1844", (ret == 0)
23681 + || ((ret == -E_REPEAT) && !non_blocking));
23682 + /* If we can get the lock... Try to capture first before
23683 + taking the lock. */
23684 +
23685 + /* first handle commonest case where node and txnh are already
23686 + * in the same atom. */
23687 + /* safe to do without taking locks, because:
23688 + *
23689 + * 1. read of aligned word is atomic with respect to writes to
23690 + * this word
23691 + *
23692 + * 2. false negatives are handled in try_capture().
23693 + *
23694 + * 3. false positives are impossible.
23695 + *
23696 + * PROOF: left as an exercise to the curious reader.
23697 + *
23698 + * Just kidding. Here is one:
23699 + *
23700 + * At the time T0 txnh->atom is stored in txnh_atom.
23701 + *
23702 + * At the time T1 node->atom is stored in node_atom.
23703 + *
23704 + * At the time T2 we observe that
23705 + *
23706 + * txnh_atom != NULL && node_atom == txnh_atom.
23707 + *
23708 + * Imagine that at this moment we acquire node and txnh spin
23709 + * lock in this order. Suppose that under spin lock we have
23710 + *
23711 + * node->atom != txnh->atom, (S1)
23712 + *
23713 + * at the time T3.
23714 + *
23715 + * txnh->atom != NULL still, because txnh is open by the
23716 + * current thread.
23717 + *
23718 + * Suppose node->atom == NULL, that is, node was un-captured
23719 + * between T1, and T3. But un-capturing of formatted node is
23720 + * always preceded by the call to invalidate_lock(), which
23721 + * marks znode as JNODE_IS_DYING under zlock spin
23722 + * lock. Contradiction, because can_lock_object() above checks
23723 + * for JNODE_IS_DYING. Hence, node->atom != NULL at T3.
23724 + *
23725 + * Suppose that node->atom != node_atom, that is, atom, node
23726 + * belongs to was fused into another atom: node_atom was fused
23727 + * into node->atom. Atom of txnh was equal to node_atom at T2,
23728 + * which means that under spin lock, txnh->atom == node->atom,
23729 + * because txnh->atom can only follow fusion
23730 + * chain. Contradicts S1.
23731 + *
23732 + * The same for hypothesis txnh->atom != txnh_atom. Hence,
23733 + * node->atom == node_atom == txnh_atom == txnh->atom. Again
23734 + * contradicts S1. Hence S1 is false. QED.
23735 + *
23736 + */
23737 +
23738 + if (likely(has_atom && ZJNODE(node)->atom == txnh->atom)) {
23739 + ;
23740 + } else {
23741 + /*
23742 + * unlock zlock spin lock here. It is possible for
23743 + * longterm_unlock_znode() to sneak in here, but there
23744 + * is no harm: invalidate_lock() will mark znode as
23745 + * JNODE_IS_DYING and this will be noted by
23746 + * can_lock_object() below.
23747 + */
23748 + spin_unlock_zlock(lock);
23749 + spin_lock_znode(node);
23750 + ret = try_capture(ZJNODE(node), mode, cap_flags);
23751 + spin_unlock_znode(node);
23752 + spin_lock_zlock(lock);
23753 + if (unlikely(ret != 0)) {
23754 + /* In the failure case, the txnmgr releases
23755 + the znode's lock (or in some cases, it was
23756 + released a while ago). There's no need to
23757 + reacquire it, so we should return here to
23758 + avoid releasing the lock. */
23759 + owner->request.mode = 0;
23760 + break;
23761 + }
23762 +
23763 + /* Check the lock's availability again -- this is
23764 + because under some circumstances the capture code
23765 + has to release and reacquire the znode spinlock. */
23766 + ret = can_lock_object(owner);
23767 + }
23768 +
23769 + /* This time, a return of (ret == 0) means we can lock, so we
23770 + should break out of the loop. */
23771 + if (likely(ret != -E_REPEAT || non_blocking)) {
23772 + break;
23773 + }
23774 +
23775 + /* Lock is unavailable, we have to wait. */
23776 +
23777 + /* By having semaphore initialization here we cannot lose
23778 + wakeup signal even if it comes after `nr_signaled' field
23779 + check. */
23780 + ret = prepare_to_sleep(owner);
23781 + if (unlikely(ret != 0)) {
23782 + break;
23783 + }
23784 +
23785 + assert_spin_locked(&(node->lock.guard));
23786 + if (hipri) {
23787 + /* If we are going in high priority direction then
23788 + increase high priority requests counter for the
23789 + node */
23790 + lock->nr_hipri_requests++;
23791 + if (mode == ZNODE_WRITE_LOCK)
23792 + lock->nr_hipri_write_requests++;
23793 + /* If there are no high priority owners for a node,
23794 + then immediately wake up low priority owners, so
23795 + they can detect possible deadlock */
23796 + if (lock->nr_hipri_owners == 0)
23797 + wake_up_all_lopri_owners(node);
23798 + }
23799 + list_add_tail(&owner->requestors_link, &lock->requestors);
23800 +
23801 + /* Ok, here we have prepared a lock request, so unlock
23802 + a znode ... */
23803 + spin_unlock_zlock(lock);
23804 + /* ... and sleep */
23805 + go_to_sleep(owner);
23806 + if (owner->request.mode == ZNODE_NO_LOCK)
23807 + goto request_is_done;
23808 + spin_lock_zlock(lock);
23809 + if (owner->request.mode == ZNODE_NO_LOCK) {
23810 + spin_unlock_zlock(lock);
23811 + request_is_done:
23812 + if (owner->request.ret_code == 0) {
23813 + LOCK_CNT_INC(long_term_locked_znode);
23814 + zref(node);
23815 + }
23816 + return owner->request.ret_code;
23817 + }
23818 + remove_lock_request(owner);
23819 + }
23820 +
23821 + return lock_tail(owner, ret, mode);
23822 +}
23823 +
23824 +/* lock object invalidation means changing of lock object state to `INVALID'
23825 + and waiting for all other processes to cancel their lock requests. */
23826 +void invalidate_lock(lock_handle * handle /* path to lock
23827 + * owner and lock
23828 + * object is being
23829 + * invalidated. */ )
23830 +{
23831 + znode *node = handle->node;
23832 + lock_stack *owner = handle->owner;
23833 +
23834 + assert("zam-325", owner == get_current_lock_stack());
23835 + assert("zam-103", znode_is_write_locked(node));
23836 + assert("nikita-1393", !ZF_ISSET(node, JNODE_LEFT_CONNECTED));
23837 + assert("nikita-1793", !ZF_ISSET(node, JNODE_RIGHT_CONNECTED));
23838 + assert("nikita-1394", ZF_ISSET(node, JNODE_HEARD_BANSHEE));
23839 + assert("nikita-3097", znode_is_wlocked_once(node));
23840 + assert_spin_locked(&(node->lock.guard));
23841 +
23842 + if (handle->signaled)
23843 + atomic_dec(&owner->nr_signaled);
23844 +
23845 + ZF_SET(node, JNODE_IS_DYING);
23846 + unlink_object(handle);
23847 + node->lock.nr_readers = 0;
23848 +
23849 + invalidate_all_lock_requests(node);
23850 + spin_unlock_zlock(&node->lock);
23851 +}
23852 +
23853 +/* Initializes lock_stack. */
23854 +void init_lock_stack(lock_stack * owner /* pointer to
23855 + * allocated
23856 + * structure. */ )
23857 +{
23858 + INIT_LIST_HEAD(&owner->locks);
23859 + INIT_LIST_HEAD(&owner->requestors_link);
23860 + spin_lock_init(&owner->sguard);
23861 + owner->curpri = 1;
23862 + sema_init(&owner->sema, 0);
23863 +}
23864 +
23865 +/* Initializes lock object. */
23866 +void reiser4_init_lock(zlock * lock /* pointer on allocated
23867 + * uninitialized lock object
23868 + * structure. */ )
23869 +{
23870 + memset(lock, 0, sizeof(zlock));
23871 + spin_lock_init(&lock->guard);
23872 + INIT_LIST_HEAD(&lock->requestors);
23873 + INIT_LIST_HEAD(&lock->owners);
23874 +}
23875 +
23876 +/* Transfer a lock handle (presumably so that variables can be moved between stack and
23877 + heap locations). */
23878 +static void
23879 +move_lh_internal(lock_handle * new, lock_handle * old, int unlink_old)
23880 +{
23881 + znode *node = old->node;
23882 + lock_stack *owner = old->owner;
23883 + int signaled;
23884 +
23885 + /* locks_list, modified by link_object() is not protected by
23886 + anything. This is valid because only current thread ever modifies
23887 + locks_list of its lock_stack.
23888 + */
23889 + assert("nikita-1827", owner == get_current_lock_stack());
23890 + assert("nikita-1831", new->owner == NULL);
23891 +
23892 + spin_lock_zlock(&node->lock);
23893 +
23894 + signaled = old->signaled;
23895 + if (unlink_old) {
23896 + unlink_object(old);
23897 + } else {
23898 + if (node->lock.nr_readers > 0) {
23899 + node->lock.nr_readers += 1;
23900 + } else {
23901 + node->lock.nr_readers -= 1;
23902 + }
23903 + if (signaled) {
23904 + atomic_inc(&owner->nr_signaled);
23905 + }
23906 + if (owner->curpri) {
23907 + node->lock.nr_hipri_owners += 1;
23908 + }
23909 + LOCK_CNT_INC(long_term_locked_znode);
23910 +
23911 + zref(node);
23912 + }
23913 + link_object(new, owner, node);
23914 + new->signaled = signaled;
23915 +
23916 + spin_unlock_zlock(&node->lock);
23917 +}
23918 +
23919 +void move_lh(lock_handle * new, lock_handle * old)
23920 +{
23921 + move_lh_internal(new, old, /*unlink_old */ 1);
23922 +}
23923 +
23924 +void copy_lh(lock_handle * new, lock_handle * old)
23925 +{
23926 + move_lh_internal(new, old, /*unlink_old */ 0);
23927 +}
23928 +
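For example, a lock acquired through a handle on the kernel stack can be
handed over to a heap-allocated handle when it has to outlive the current
stack frame. A sketch, reusing get_gfp_mask() as seen earlier in this patch
(the caller and stack_lh are hypothetical):

        lock_handle *heap_lh;

        heap_lh = kmalloc(sizeof(*heap_lh), get_gfp_mask());
        if (heap_lh != NULL)
                move_lh(heap_lh, &stack_lh);    /* stack_lh is free afterwards;
                                                 * heap_lh now owns the lock */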
23929 +/* after getting -E_DEADLOCK we unlock znodes until this function returns false */
23930 +int check_deadlock(void)
23931 +{
23932 + lock_stack *owner = get_current_lock_stack();
23933 + return atomic_read(&owner->nr_signaled) != 0;
23934 +}
23935 +
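A sketch of the caller-side pattern described in the comment before
check_deadlock() (release_a_held_lock() is a hypothetical stand-in for
longterm_unlock_znode() calls on the handles the caller holds; releasing a
signaled handle decrements nr_signaled, so the loop terminates):

        while (check_deadlock())
                release_a_held_lock();
        /* all signals drained: safe to restart the operation from scratch */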
23936 +/* Before going to sleep we re-check "release lock" requests which might have
23937 + come from high-priority threads. */
23938 +int prepare_to_sleep(lock_stack * owner)
23939 +{
23940 + assert("nikita-1847", owner == get_current_lock_stack());
23941 + /* NOTE(Zam): We cannot reset the lock semaphore here because it may
23942 + clear wake-up signal. The initial design was to re-check all
23943 + conditions under which we continue locking, release locks or sleep
23944 + until conditions are changed. However, even lock.c does not follow
23945 + that design. So, a wake-up signal which is stored in the semaphore state
23946 + could be lost by a semaphore reset. The less complex scheme without
23947 + resetting the semaphore is enough to not lose wake-ups.
23948 +
23949 + if (0) {
23950 +
23951 + NOTE-NIKITA: I commented the call to sema_init() out, hoping
23952 + that it is the reason for a thread sleeping in
23953 + down(&owner->sema) without any other thread running.
23954 +
23955 + Anyway, it is just an optimization: if the semaphore is not
23956 + reinitialised at this point, in the worst case
23957 + longterm_lock_znode() would have to iterate its loop once
23958 + more.
23959 + spin_lock_stack(owner);
23960 + sema_init(&owner->sema, 0);
23961 + spin_unlock_stack(owner);
23962 + }
23963 + */
23964 +
23965 + /* We return -E_DEADLOCK if one or more "give me the lock" messages are
23966 + * counted in nr_signaled */
23967 + if (unlikely(atomic_read(&owner->nr_signaled) != 0)) {
23968 + assert("zam-959", !owner->curpri);
23969 + return RETERR(-E_DEADLOCK);
23970 + }
23971 + return 0;
23972 +}
23973 +
23974 +/* Wakes up a single thread */
23975 +void __reiser4_wake_up(lock_stack * owner)
23976 +{
23977 + up(&owner->sema);
23978 +}
23979 +
23980 +/* Puts a thread to sleep */
23981 +void go_to_sleep(lock_stack * owner)
23982 +{
23983 + /* Well, we might sleep here, so holding any spinlocks is a no-no */
23984 + assert("nikita-3027", schedulable());
23985 + /* return down_interruptible(&owner->sema); */
23986 + down(&owner->sema);
23987 +}
23988 +
23989 +int lock_stack_isclean(lock_stack * owner)
23990 +{
23991 + if (list_empty_careful(&owner->locks)) {
23992 + assert("zam-353", atomic_read(&owner->nr_signaled) == 0);
23993 + return 1;
23994 + }
23995 +
23996 + return 0;
23997 +}
23998 +
23999 +#if REISER4_DEBUG
24000 +
24001 +/*
24002 + * debugging functions
24003 + */
24004 +
24005 +static void list_check(struct list_head *head)
24006 +{
24007 + struct list_head *pos;
24008 +
24009 + list_for_each(pos, head)
24010 + assert("", (pos->prev != NULL && pos->next != NULL &&
24011 + pos->prev->next == pos && pos->next->prev == pos));
24012 +}
24013 +
24014 +/* check consistency of locking data-structures hanging off the @stack */
24015 +static void check_lock_stack(lock_stack * stack)
24016 +{
24017 + spin_lock_stack(stack);
24018 + /* check that stack->locks is not corrupted */
24019 + list_check(&stack->locks);
24020 + spin_unlock_stack(stack);
24021 +}
24022 +
24023 +/* check consistency of locking data structures */
24024 +void check_lock_data(void)
24025 +{
24026 + check_lock_stack(&get_current_context()->stack);
24027 +}
24028 +
24029 +/* check consistency of locking data structures for @node */
24030 +void check_lock_node_data(znode * node)
24031 +{
24032 + spin_lock_zlock(&node->lock);
24033 + list_check(&node->lock.owners);
24034 + list_check(&node->lock.requestors);
24035 + spin_unlock_zlock(&node->lock);
24036 +}
24037 +
24038 +/* check that given lock request is deadlock safe. This check is, of course,
24039 + * not exhaustive. */
24040 +static int
24041 +request_is_deadlock_safe(znode * node, znode_lock_mode mode,
24042 + znode_lock_request request)
24043 +{
24044 + lock_stack *owner;
24045 +
24046 + owner = get_current_lock_stack();
24047 + /*
24048 + * check that hipri lock request is not issued when there are locked
24049 + * nodes at the higher levels.
24050 + */
24051 + if (request & ZNODE_LOCK_HIPRI && !(request & ZNODE_LOCK_NONBLOCK) &&
24052 + znode_get_level(node) != 0) {
24053 + lock_handle *item;
24054 +
24055 + list_for_each_entry(item, &owner->locks, locks_link) {
24056 + znode *other;
24057 +
24058 + other = item->node;
24059 +
24060 + if (znode_get_level(other) == 0)
24061 + continue;
24062 + if (znode_get_level(other) > znode_get_level(node))
24063 + return 0;
24064 + }
24065 + }
24066 + return 1;
24067 +}
24068 +
24069 +#endif
24070 +
24071 +/* return pointer to static storage with name of lock_mode. For
24072 + debugging */
24073 +const char *lock_mode_name(znode_lock_mode lock /* lock mode to get name of */ )
24074 +{
24075 + if (lock == ZNODE_READ_LOCK)
24076 + return "read";
24077 + else if (lock == ZNODE_WRITE_LOCK)
24078 + return "write";
24079 + else {
24080 + static char buf[30];
24081 +
24082 + sprintf(buf, "unknown: %i", lock);
24083 + return buf;
24084 + }
24085 +}
24086 +
24087 +/* Make Linus happy.
24088 + Local variables:
24089 + c-indentation-style: "K&R"
24090 + mode-name: "LC"
24091 + c-basic-offset: 8
24092 + tab-width: 8
24093 + fill-column: 79
24094 + End:
24095 +*/
24096 Index: linux-2.6.16/fs/reiser4/lock.h
24097 ===================================================================
24098 --- /dev/null
24099 +++ linux-2.6.16/fs/reiser4/lock.h
24100 @@ -0,0 +1,272 @@
24101 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
24102 +
24103 +/* Long term locking data structures. See lock.c for details. */
24104 +
24105 +#ifndef __LOCK_H__
24106 +#define __LOCK_H__
24107 +
24108 +#include "forward.h"
24109 +#include "debug.h"
24110 +#include "dformat.h"
24111 +#include "key.h"
24112 +#include "coord.h"
24113 +#include "plugin/node/node.h"
24114 +#include "txnmgr.h"
24115 +#include "readahead.h"
24116 +
24117 +#include <linux/types.h>
24118 +#include <linux/spinlock.h>
24119 +#include <linux/pagemap.h> /* for PAGE_CACHE_SIZE */
24120 +#include <asm/atomic.h>
24121 +#include <asm/semaphore.h>
24122 +
24123 +/* Per-znode lock object */
24124 +struct zlock {
24125 + spinlock_t guard;
24126 + /* The number of readers if positive; the number of recursively taken
24127 + write locks if negative. Protected by zlock spin lock. */
24128 + int nr_readers;
24129 + /* A number of processes (lock_stacks) that have this object
24130 + locked with high priority */
24131 + unsigned nr_hipri_owners;
24132 + /* A number of attempts to lock znode in high priority direction */
24133 + unsigned nr_hipri_requests;
24134 + /* A number of write lock requests in high priority direction */
24135 + unsigned nr_hipri_write_requests;
24136 + /* A list of lock_handle objects of all lock_stacks which have this object locked */
24137 + struct list_head owners;
24138 + /* A linked list of lock_stacks that wait for this lock */
24139 + struct list_head requestors;
24140 +};
24141 +
24142 +static inline void spin_lock_zlock(zlock *lock)
24143 +{
24144 + /* check that zlock is not locked */
24145 + assert("", LOCK_CNT_NIL(spin_locked_zlock));
24146 + /* check that spinlocks of lower priorities are not held */
24147 + assert("", LOCK_CNT_NIL(spin_locked_stack));
24148 +
24149 + spin_lock(&lock->guard);
24150 +
24151 + LOCK_CNT_INC(spin_locked_zlock);
24152 + LOCK_CNT_INC(spin_locked);
24153 +}
24154 +
24155 +static inline void spin_unlock_zlock(zlock *lock)
24156 +{
24157 + assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_zlock));
24158 + assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
24159 +
24160 + LOCK_CNT_DEC(spin_locked_zlock);
24161 + LOCK_CNT_DEC(spin_locked);
24162 +
24163 + spin_unlock(&lock->guard);
24164 +}
24165 +
24166 +#define lock_is_locked(lock) ((lock)->nr_readers != 0)
24167 +#define lock_is_rlocked(lock) ((lock)->nr_readers > 0)
24168 +#define lock_is_wlocked(lock) ((lock)->nr_readers < 0)
24169 +#define lock_is_wlocked_once(lock) ((lock)->nr_readers == -1)
24170 +#define lock_can_be_rlocked(lock) ((lock)->nr_readers >= 0)
24171 +#define lock_mode_compatible(lock, mode) \
24172 + (((mode) == ZNODE_WRITE_LOCK && !lock_is_locked(lock)) || \
24173 + ((mode) == ZNODE_READ_LOCK && lock_can_be_rlocked(lock)))
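For concreteness, the nr_readers encoding behind these macros works out as follows (illustrative values, derived from the definitions above; not part of the patch):

	nr_readers ==  2  ->  two read holders; lock_is_rlocked() is true
	nr_readers == -1  ->  one write holder; lock_is_wlocked_once() is true
	nr_readers == -3  ->  the write lock is held recursively three times
	lock_mode_compatible(lock, ZNODE_READ_LOCK)  reduces to nr_readers >= 0
	lock_mode_compatible(lock, ZNODE_WRITE_LOCK) reduces to nr_readers == 0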
24174 +
24175 +/* Since we have R/W znode locks we need additional bidirectional `link'
24176 + objects to implement n<->m relationship between lock owners and lock
24177 + objects. We call them `lock handles'.
24178 +
24179 + Locking: see lock.c/"SHORT-TERM LOCKING"
24180 +*/
24181 +struct lock_handle {
24182 + /* This flag indicates that a signal to yield a lock was passed to
24183 + the lock owner and counted in owner->nr_signaled.
24184 +
24185 + Locking: this is accessed under spin lock on ->node.
24186 + */
24187 + int signaled;
24188 + /* A link to owner of a lock */
24189 + lock_stack *owner;
24190 + /* A link to znode locked */
24191 + znode *node;
24192 + /* A list of all locks for a process */
24193 + struct list_head locks_link;
24194 + /* A list of all owners for a znode */
24195 + struct list_head owners_link;
24196 +};
24197 +
24198 +typedef struct lock_request {
24199 + /* A pointer to uninitialized link object */
24200 + lock_handle *handle;
24201 + /* A pointer to the object we want to lock */
24202 + znode *node;
24203 + /* Lock mode (ZNODE_READ_LOCK or ZNODE_WRITE_LOCK) */
24204 + znode_lock_mode mode;
24205 + /* lock request result code, as set by dispatch_lock_requests() */
24206 + int ret_code;
24207 +} lock_request;
24208 +
24209 +/* A lock stack structure for accumulating locks owned by a process */
24210 +struct lock_stack {
24211 + /* A guard lock protecting a lock stack */
24212 + spinlock_t sguard;
24213 + /* number of znodes which were requested by high priority processes */
24214 + atomic_t nr_signaled;
24215 + /* Current priority of a process
24216 +
24217 + This is only accessed by the current thread and thus requires no
24218 + locking.
24219 + */
24220 + int curpri;
24221 + /* A list of all locks owned by this process. Elements can be added to
24222 + * this list only by the current thread. ->node pointers in this list
24223 + * can be only changed by the current thread. */
24224 + struct list_head locks;
24225 + /* When lock_stack waits for the lock, it puts itself on double-linked
24226 + requestors list of that lock */
24227 + struct list_head requestors_link;
24228 + /* Current lock request info.
24229 +
24230 + This is only accessed by the current thread and thus requires no
24231 + locking.
24232 + */
24233 + lock_request request;
24234 + /* The lock_stack's synchronization object. The process sleeps on it
24235 + when a lock it wishes to add to this lock_stack is not immediately
24236 + available. It is used instead of a wait_queue_t object due to
24237 + locking problems (lost wakeup): a "lost wakeup" occurs when a
24238 + process is woken up before it actually goes to sleep (through
24239 + sleep_on()). Using a semaphore object is the simplest way to avoid
24240 + that problem.
24241 +
24242 + A semaphore is used in the following way: only the process that is
24243 + the owner of the lock_stack initializes it (to zero) and calls
24244 + down(sema) on it. Usually this causes the process to sleep on the
24245 + semaphore. Other processes may wake him up by calling up(sema). The
24246 + advantage of a semaphore is that up() and down() calls are not
24247 + required to preserve order. Unlike a wait_queue, it works even when
24248 + the process is woken up before getting to sleep.
24249 +
24250 + NOTE-NIKITA: Transaction manager is going to have condition variables
24251 + (&kcondvar_t) anyway, so this probably will be replaced with
24252 + one in the future.
24253 +
24254 + After further discussion, Nikita has shown me that Zam's implementation is
24255 + exactly a condition variable. The znode's {zguard,requestors_list} represents
24256 + condition variable and the lock_stack's {sguard,semaphore} guards entry and
24257 + exit from the condition variable's wait queue. But the existing code can't
24258 + just be replaced with a more general abstraction, and I think its fine the way
24259 + it is. */
24260 + struct semaphore sema;
24261 +#if REISER4_DEBUG
24262 + int nr_locks; /* number of lock handles in the above list */
24263 +#endif
24264 +};
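The semaphore protocol described in the comment above can be sketched as follows. This is a hypothetical illustration, not the actual lock.c code; requestor_wait() and requestor_grant() are invented names, and the real entry points are prepare_to_sleep(), go_to_sleep() and __reiser4_wake_up(), declared below.

	/* Illustrative sketch of the lost-wakeup-safe pattern. */
	static void requestor_wait(lock_stack *owner)
	{
		/* only the owner of the lock_stack initializes the semaphore */
		sema_init(&owner->sema, 0);
		/* ... enqueue on the zlock's requestors list, drop spinlocks ... */
		down(&owner->sema);	/* sleeps unless up() has already run */
	}

	static void requestor_grant(lock_stack *requestor)
	{
		/* not lost even if it runs before down(): the wakeup is counted */
		up(&requestor->sema);
	}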
24265 +
24266 +
24267 +/*
24268 + User-visible znode locking functions
24269 +*/
24270 +
24271 +extern int longterm_lock_znode(lock_handle * handle,
24272 + znode * node,
24273 + znode_lock_mode mode,
24274 + znode_lock_request request);
24275 +
24276 +extern void longterm_unlock_znode(lock_handle * handle);
24277 +
24278 +extern int check_deadlock(void);
24279 +
24280 +extern lock_stack *get_current_lock_stack(void);
24281 +
24282 +extern void init_lock_stack(lock_stack * owner);
24283 +extern void reiser4_init_lock(zlock * lock);
24284 +
24285 +static inline void init_lh(lock_handle *lh)
24286 +{
24287 +#if REISER4_DEBUG
24288 + memset(lh, 0, sizeof *lh);
24289 + INIT_LIST_HEAD(&lh->locks_link);
24290 + INIT_LIST_HEAD(&lh->owners_link);
24291 +#else
24292 + lh->node = NULL;
24293 +#endif
24294 +}
24295 +
24296 +static inline void done_lh(lock_handle *lh)
24297 +{
24298 + assert("zam-342", lh != NULL);
24299 + if (lh->node != NULL)
24300 + longterm_unlock_znode(lh);
24301 +}
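Taken together, init_lh(), longterm_lock_znode() and done_lh() form the canonical lock/use/unlock sequence. A minimal sketch, assuming the surrounding reiser4 headers; reiser4_do_something() is a hypothetical caller, and error handling beyond the lock failure is elided:

	/* Hypothetical usage sketch, not part of the patch. */
	static int reiser4_do_something(znode *node)
	{
		lock_handle lh;
		int ret;

		init_lh(&lh);
		ret = longterm_lock_znode(&lh, node, ZNODE_READ_LOCK,
					  ZNODE_LOCK_HIPRI);
		if (ret == 0) {
			/* ... inspect the locked node ... */
			done_lh(&lh);	/* unlocks via longterm_unlock_znode() */
		}
		return ret;
	}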
24302 +
24303 +extern void move_lh(lock_handle * new, lock_handle * old);
24304 +extern void copy_lh(lock_handle * new, lock_handle * old);
24305 +
24306 +extern int prepare_to_sleep(lock_stack * owner);
24307 +extern void go_to_sleep(lock_stack * owner);
24308 +extern void __reiser4_wake_up(lock_stack * owner);
24309 +
24310 +extern int lock_stack_isclean(lock_stack * owner);
24311 +
24312 +/* zlock object state check macros: only used in assertions. Both forms imply that the
24313 + lock is held by the current thread. */
24314 +extern int znode_is_write_locked(const znode *);
24315 +extern void invalidate_lock(lock_handle *);
24316 +
24317 +/* lock ordering is: first take zlock spin lock, then lock stack spin lock */
24318 +#define spin_ordering_pred_stack(stack) \
24319 + (LOCK_CNT_NIL(spin_locked_stack) && \
24320 + LOCK_CNT_NIL(spin_locked_txnmgr) && \
24321 + LOCK_CNT_NIL(spin_locked_inode) && \
24322 + LOCK_CNT_NIL(rw_locked_cbk_cache) && \
24323 + LOCK_CNT_NIL(spin_locked_super_eflush) )
24324 +
24325 +static inline void spin_lock_stack(lock_stack *stack)
24326 +{
24327 + assert("", spin_ordering_pred_stack(stack));
24328 + spin_lock(&(stack->sguard));
24329 + LOCK_CNT_INC(spin_locked_stack);
24330 + LOCK_CNT_INC(spin_locked);
24331 +}
24332 +
24333 +static inline void spin_unlock_stack(lock_stack *stack)
24334 +{
24335 + assert_spin_locked(&(stack->sguard));
24336 + assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_stack));
24337 + assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
24338 + LOCK_CNT_DEC(spin_locked_stack);
24339 + LOCK_CNT_DEC(spin_locked);
24340 + spin_unlock(&(stack->sguard));
24341 +}
24342 +
24343 +
24344 +static inline void reiser4_wake_up(lock_stack * owner)
24345 +{
24346 + spin_lock_stack(owner);
24347 + __reiser4_wake_up(owner);
24348 + spin_unlock_stack(owner);
24349 +}
24350 +
24351 +const char *lock_mode_name(znode_lock_mode lock);
24352 +
24353 +#if REISER4_DEBUG
24354 +extern void check_lock_data(void);
24355 +extern void check_lock_node_data(znode * node);
24356 +#else
24357 +#define check_lock_data() noop
24358 +#define check_lock_node_data() noop
24359 +#endif
24360 +
24361 +/* __LOCK_H__ */
24362 +#endif
24363 +
24364 +/* Make Linus happy.
24365 + Local variables:
24366 + c-indentation-style: "K&R"
24367 + mode-name: "LC"
24368 + c-basic-offset: 8
24369 + tab-width: 8
24370 + fill-column: 120
24371 + End:
24372 +*/
24373 Index: linux-2.6.16/fs/reiser4/oid.c
24374 ===================================================================
24375 --- /dev/null
24376 +++ linux-2.6.16/fs/reiser4/oid.c
24377 @@ -0,0 +1,141 @@
24378 +/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */
24379 +
24380 +#include "debug.h"
24381 +#include "super.h"
24382 +#include "txnmgr.h"
24383 +
24384 +/* we used to have an oid allocation plugin. It was removed because it
24385 + was recognized as providing an unneeded level of abstraction. If one
24386 + ever finds it useful - look at yet_unneeded_abstractions/oid
24387 +*/
24388 +
24389 +/*
24390 + * initialize in-memory data for oid allocator at @super. @nr_files and @next
24391 + * are provided by disk format plugin that reads them from the disk during
24392 + * mount.
24393 + */
24394 +int oid_init_allocator(struct super_block *super, oid_t nr_files, oid_t next)
24395 +{
24396 + reiser4_super_info_data *sbinfo;
24397 +
24398 + sbinfo = get_super_private(super);
24399 +
24400 + sbinfo->next_to_use = next;
24401 + sbinfo->oids_in_use = nr_files;
24402 + return 0;
24403 +}
24404 +
24405 +/*
24406 + * allocate oid and return it. ABSOLUTE_MAX_OID is returned when allocator
24407 + * runs out of oids.
24408 + */
24409 +oid_t oid_allocate(struct super_block * super)
24410 +{
24411 + reiser4_super_info_data *sbinfo;
24412 + oid_t oid;
24413 +
24414 + sbinfo = get_super_private(super);
24415 +
24416 + spin_lock_reiser4_super(sbinfo);
24417 + if (sbinfo->next_to_use != ABSOLUTE_MAX_OID) {
24418 + oid = sbinfo->next_to_use++;
24419 + sbinfo->oids_in_use++;
24420 + } else
24421 + oid = ABSOLUTE_MAX_OID;
24422 + spin_unlock_reiser4_super(sbinfo);
24423 + return oid;
24424 +}
24425 +
24426 +/*
24427 + * Tell oid allocator that @oid is now free.
24428 + */
24429 +int oid_release(struct super_block *super, oid_t oid UNUSED_ARG)
24430 +{
24431 + reiser4_super_info_data *sbinfo;
24432 +
24433 + sbinfo = get_super_private(super);
24434 +
24435 + spin_lock_reiser4_super(sbinfo);
24436 + sbinfo->oids_in_use--;
24437 + spin_unlock_reiser4_super(sbinfo);
24438 + return 0;
24439 +}
24440 +
24441 +/*
24442 + * return next @oid that would be allocated (i.e., returned by oid_allocate())
24443 + * without actually allocating it. This is used by disk format plugin to save
24444 + * oid allocator state on the disk.
24445 + */
24446 +oid_t oid_next(const struct super_block * super)
24447 +{
24448 + reiser4_super_info_data *sbinfo;
24449 + oid_t oid;
24450 +
24451 + sbinfo = get_super_private(super);
24452 +
24453 + spin_lock_reiser4_super(sbinfo);
24454 + oid = sbinfo->next_to_use;
24455 + spin_unlock_reiser4_super(sbinfo);
24456 + return oid;
24457 +}
24458 +
24459 +/*
24460 + * returns number of currently used oids. This is used by statfs(2) to report
24461 + * number of "inodes" and by disk format plugin to save oid allocator state on
24462 + * the disk.
24463 + */
24464 +long oids_used(const struct super_block *super)
24465 +{
24466 + reiser4_super_info_data *sbinfo;
24467 + oid_t used;
24468 +
24469 + sbinfo = get_super_private(super);
24470 +
24471 + spin_lock_reiser4_super(sbinfo);
24472 + used = sbinfo->oids_in_use;
24473 + spin_unlock_reiser4_super(sbinfo);
24474 + if (used < (__u64) ((long)~0) >> 1)
24475 + return (long)used;
24476 + else
24477 + return (long)-1;
24478 +}
24479 +
24480 +/*
24481 + * Count oid as allocated in atom. This is done after call to oid_allocate()
24482 + * at the point when we are irrevocably committed to creation of the new file
24483 + * (i.e., when oid allocation cannot be any longer rolled back due to some
24484 + * error).
24485 + */
24486 +void oid_count_allocated(void)
24487 +{
24488 + txn_atom *atom;
24489 +
24490 + atom = get_current_atom_locked();
24491 + atom->nr_objects_created++;
24492 + spin_unlock_atom(atom);
24493 +}
24494 +
24495 +/*
24496 + * Count oid as free in atom. This is done after call to oid_release() at the
24497 + * point when we are irrevocably committed to the deletion of the file (i.e.,
24498 + * when oid release cannot be any longer rolled back due to some error).
24499 + */
24500 +void oid_count_released(void)
24501 +{
24502 + txn_atom *atom;
24503 +
24504 + atom = get_current_atom_locked();
24505 + atom->nr_objects_deleted++;
24506 + spin_unlock_atom(atom);
24507 +}
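Putting the two halves together, a hypothetical file-creation path would follow the protocol described above: allocate the oid first, and account it in the atom only once creation can no longer fail. A sketch, with the intermediate steps elided:

	/* Illustrative sketch, not part of the patch. */
	static int create_object_sketch(struct super_block *super)
	{
		oid_t oid;

		oid = oid_allocate(super);
		if (oid == ABSOLUTE_MAX_OID)
			return RETERR(-ENOSPC);	/* allocator exhausted */
		/* ... build the object; a failure before this point would
		   call oid_release() to roll the allocation back ... */
		oid_count_allocated();	/* irrevocably committed to creation */
		return 0;
	}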
24508 +
24509 +/*
24510 + Local variables:
24511 + c-indentation-style: "K&R"
24512 + mode-name: "LC"
24513 + c-basic-offset: 8
24514 + tab-width: 8
24515 + fill-column: 120
24516 + scroll-step: 1
24517 + End:
24518 +*/
24519 Index: linux-2.6.16/fs/reiser4/page_cache.c
24520 ===================================================================
24521 --- /dev/null
24522 +++ linux-2.6.16/fs/reiser4/page_cache.c
24523 @@ -0,0 +1,712 @@
24524 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
24525 + * reiser4/README */
24526 +
24527 +/* Memory pressure hooks. Fake inodes handling. */
24528 +/* We store all file system meta data (and data, of course) in the page cache.
24529 +
24530 + What does this mean? Instead of using bread/brelse we create a special
24531 + "fake" inode (one per super block) and store the content of formatted
24532 + nodes in pages bound to this inode in the page cache. In newer kernels
24533 + bread() already uses an inode attached to the block device (bd_inode).
24534 + The advantage of having our own fake inode is that we can install
24535 + appropriate methods in its address_space operations. Such methods are
24536 + called by the VM on memory pressure (or during background page
24537 + flushing) and we can use them to react appropriately.
24538 +
24539 + In initial version we only support one block per page. Support for multiple
24540 + blocks per page is complicated by relocation.
24541 +
24542 + To each page used by reiser4 a jnode is attached. A jnode is analogous to
24543 + a buffer head; the difference is that a jnode is bound to the page
24544 + permanently: it cannot be removed from memory until its backing page is.
24545 +
24546 + A jnode contains a pointer to its page (->pg field) and the page holds a
24547 + pointer to its jnode in the ->private field. The jnode-to-page pointer is
24548 + protected by the jnode's spinlock; the page-to-jnode pointer is protected
24549 + by the page lock (PG_locked bit). Lock ordering is: take the page lock
24550 + first, then the jnode spin lock. To go in the reverse direction use
24551 + jnode_lock_page(), which uses the standard try-lock-and-release device.
24552 +
24553 + Properties:
24554 +
24555 + 1. when jnode-to-page mapping is established (by jnode_attach_page()), page
24556 + reference counter is increased.
24557 +
24558 + 2. when jnode-to-page mapping is destroyed (by jnode_detach_page() and
24559 + page_detach_jnode()), page reference counter is decreased.
24560 +
24561 + 3. on jload() reference counter on jnode page is increased, page is
24562 + kmapped and `referenced'.
24563 +
24564 + 4. on jrelse() inverse operations are performed.
24565 +
24566 + 5. kmapping/kunmapping of unformatted pages is done by read/write methods.
24567 +
24568 + DEADLOCKS RELATED TO MEMORY PRESSURE. [OUTDATED. Only interesting
24569 + historically.]
24570 +
24571 + [In the following discussion, `lock' invariably means long term lock on
24572 + znode.] (What about page locks?)
24573 +
24574 + There is some special class of deadlock possibilities related to memory
24575 + pressure. Locks acquired by other reiser4 threads are accounted for in
24576 + deadlock prevention mechanism (lock.c), but when ->vm_writeback() is
24577 + invoked additional hidden arc is added to the locking graph: thread that
24578 + tries to allocate memory waits for ->vm_writeback() to finish. If this
24579 + thread keeps lock and ->vm_writeback() tries to acquire this lock, deadlock
24580 + prevention is useless.
24581 +
24582 + Another related problem is possibility for ->vm_writeback() to run out of
24583 + memory itself. This is not a problem for ext2 and friends, because their
24584 + ->vm_writeback() don't allocate much memory, but reiser4 flush is
24585 + definitely able to allocate huge amounts of memory.
24586 +
24587 + It seems that there is no reliable way to cope with the problems above.
24588 + Instead it was decided that ->vm_writeback() (as invoked in the kswapd
24589 + context) wouldn't perform any flushing itself, but rather should just wake
24590 + up some auxiliary thread dedicated for this purpose (or, the same thread
24591 + that does periodic commit of old atoms (ktxnmgrd.c)).
24592 +
24593 + Details:
24594 +
24595 + 1. Page is called `reclaimable' against particular reiser4 mount F if this
24596 + page can be ultimately released by try_to_free_pages() under presumptions
24597 + that:
24598 +
24599 + a. ->vm_writeback() for F is no-op, and
24600 +
24601 + b. none of the threads accessing F are making any progress, and
24602 +
24603 + c. other reiser4 mounts obey the same memory reservation protocol as F
24604 + (described below).
24605 +
24606 + For example, clean un-pinned page, or page occupied by ext2 data are
24607 + reclaimable against any reiser4 mount.
24608 +
24609 + When there is more than one reiser4 mount in a system, condition (c) makes
24610 + reclaim-ability not easily verifiable beyond trivial cases mentioned above.
24611 +
24612 + THIS COMMENT IS VALID FOR "MANY BLOCKS ON PAGE" CASE
24613 +
24614 + The fake inode is used to bind formatted nodes, and each node is indexed
24615 + within the fake inode by its block number. If the block size is smaller
24616 + than the page size, it may happen that a block mapped to the page with a
24617 + formatted node is occupied by an unformatted node or is unallocated.
24618 + This leads to some complications, because flushing the whole page can
24619 + incorrectly overwrite an unformatted node that, moreover, can be cached
24620 + in some other place as part of the file body. To avoid this, buffers for
24621 + unformatted nodes are never marked dirty. Also, pages in the fake inode
24622 + are never marked dirty. This rules out the usage of ->writepage() as a
24623 + memory pressure hook. Instead, ->releasepage() is used.
24624 +
24625 + Josh is concerned that page->buffer is going to die. This should not pose
24626 + a significant problem though, because we need to add some data structures
24627 + to the page anyway (jnode) and all necessary bookkeeping can be put there.
24628 +
24629 +*/
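The try-lock-and-release device mentioned above (for going from a jnode to its page against the page-lock-first ordering) can be sketched like this. This is one plausible shape of jnode_lock_page(), not the actual implementation; TestSetPageLocked() is the 2.6.16-era nonblocking page trylock:

	/* Illustrative sketch only. Returns with the jnode spinlock held
	 * and, if a page exists, with that page locked. */
	static struct page *jnode_lock_page_sketch(jnode *node)
	{
		struct page *page;

		for (;;) {
			spin_lock_jnode(node);
			page = node->pg;
			/* done if there is no page or the trylock succeeded */
			if (page == NULL || !TestSetPageLocked(page))
				break;
			page_cache_get(page);		/* pin across the retry */
			spin_unlock_jnode(node);	/* honor lock ordering */
			lock_page(page);		/* sleep for the page lock */
			unlock_page(page);
			page_cache_release(page);
		}
		return page;
	}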
24630 +
24631 +/* Life cycle of pages/nodes.
24632 +
24633 + jnode contains reference to page and page contains reference back to
24634 + jnode. This reference is counted in page ->count. Thus, page bound to jnode
24635 + cannot be released back into free pool.
24636 +
24637 + 1. Formatted nodes.
24638 +
24639 + 1. formatted node is represented by znode. When new znode is created its
24640 + ->pg pointer is NULL initially.
24641 +
24642 + 2. when node content is loaded into the znode (by a call to zload())
24643 + for the first time, the following happens (in a call to ->read_node()
24644 + or ->allocate_node()):
24645 +
24646 + 1. new page is added to the page cache.
24647 +
24648 + 2. this page is attached to znode and its ->count is increased.
24649 +
24650 + 3. page is kmapped.
24651 +
24652 + 3. if more calls to zload() follow (without corresponding zrelses), page
24653 + counter is left intact and in its stead ->d_count is increased in znode.
24654 +
24655 + 4. each call to zrelse() decreases ->d_count. When ->d_count drops to
24656 + zero, ->release_node() is called and the page is kunmapped as a result.
24657 +
24658 + 5. at some moment node can be captured by a transaction. Its ->x_count
24659 + is then increased by transaction manager.
24660 +
24661 + 6. if node is removed from the tree (empty node with JNODE_HEARD_BANSHEE
24662 + bit set) following will happen (also see comment at the top of znode.c):
24663 +
24664 + 1. when the last lock is released, the node will be uncaptured from
24665 + the transaction. This releases the reference that the transaction
24666 + manager acquired at step 5.
24667 +
24668 + 2. when last reference is released, zput() detects that node is
24669 + actually deleted and calls ->delete_node()
24670 + operation. page_cache_delete_node() implementation detaches jnode from
24671 + page and releases page.
24672 +
24673 + 7. otherwise (the node wasn't removed from the tree), the last
24674 + reference to the znode will be released after the transaction manager
24675 + has committed the transaction the node was in. This implies squallocing
24676 + of this node (see flush.c). Nothing special happens at this point. The
24677 + znode is still in the hash table and the page is still attached to it.
24678 +
24679 + 8. znode is actually removed from the memory because of the memory
24680 + pressure, or during umount (znodes_tree_done()). Anyway, znode is
24681 + removed by the call to zdrop(). At this moment, page is detached from
24682 + znode and removed from the inode address space.
24683 +
24684 +*/
24685 +
24686 +#include "debug.h"
24687 +#include "dformat.h"
24688 +#include "key.h"
24689 +#include "txnmgr.h"
24690 +#include "jnode.h"
24691 +#include "znode.h"
24692 +#include "block_alloc.h"
24693 +#include "tree.h"
24694 +#include "vfs_ops.h"
24695 +#include "inode.h"
24696 +#include "super.h"
24697 +#include "entd.h"
24698 +#include "page_cache.h"
24699 +#include "ktxnmgrd.h"
24700 +
24701 +#include <linux/types.h>
24702 +#include <linux/fs.h>
24703 +#include <linux/mm.h> /* for struct page */
24704 +#include <linux/swap.h> /* for struct page */
24705 +#include <linux/pagemap.h>
24706 +#include <linux/bio.h>
24707 +#include <linux/writeback.h>
24708 +#include <linux/blkdev.h>
24709 +
24710 +static struct bio *page_bio(struct page *, jnode *, int rw, gfp_t gfp);
24711 +
24712 +static struct address_space_operations formatted_fake_as_ops;
24713 +
24714 +static const oid_t fake_ino = 0x1;
24715 +static const oid_t bitmap_ino = 0x2;
24716 +static const oid_t cc_ino = 0x3;
24717 +
24718 +static void
24719 +init_fake_inode(struct super_block *super, struct inode *fake,
24720 + struct inode **pfake)
24721 +{
24722 + assert("nikita-2168", fake->i_state & I_NEW);
24723 + fake->i_mapping->a_ops = &formatted_fake_as_ops;
24724 + *pfake = fake;
24725 + /* NOTE-NIKITA something else? */
24726 + unlock_new_inode(fake);
24727 +}
24728 +
24729 +/**
24730 + * init_formatted_fake - iget inodes for formatted nodes and bitmaps
24731 + * @super: super block to init fake inode for
24732 + *
24733 + * Initializes fake inode to which formatted nodes are bound in the page cache
24734 + * and inode for bitmaps.
24735 + */
24736 +int init_formatted_fake(struct super_block *super)
24737 +{
24738 + struct inode *fake;
24739 + struct inode *bitmap;
24740 + struct inode *cc;
24741 + reiser4_super_info_data *sinfo;
24742 +
24743 + assert("nikita-1703", super != NULL);
24744 +
24745 + sinfo = get_super_private_nocheck(super);
24746 + fake = iget_locked(super, oid_to_ino(fake_ino));
24747 +
24748 + if (fake != NULL) {
24749 + init_fake_inode(super, fake, &sinfo->fake);
24750 +
24751 + bitmap = iget_locked(super, oid_to_ino(bitmap_ino));
24752 + if (bitmap != NULL) {
24753 + init_fake_inode(super, bitmap, &sinfo->bitmap);
24754 +
24755 + cc = iget_locked(super, oid_to_ino(cc_ino));
24756 + if (cc != NULL) {
24757 + init_fake_inode(super, cc, &sinfo->cc);
24758 + return 0;
24759 + } else {
24760 + iput(sinfo->fake);
24761 + iput(sinfo->bitmap);
24762 + sinfo->fake = NULL;
24763 + sinfo->bitmap = NULL;
24764 + }
24765 + } else {
24766 + iput(sinfo->fake);
24767 + sinfo->fake = NULL;
24768 + }
24769 + }
24770 + return RETERR(-ENOMEM);
24771 +}
24772 +
24773 +/**
24774 + * done_formatted_fake - release inode used by formatted nodes and bitmaps
24775 + * @super: super block to init fake inode for
24776 + *
24777 + * Releases inodes which were used as address spaces of bitmap and formatted
24778 + * nodes.
24779 + */
24780 +void done_formatted_fake(struct super_block *super)
24781 +{
24782 + reiser4_super_info_data *sinfo;
24783 +
24784 + sinfo = get_super_private_nocheck(super);
24785 +
24786 + if (sinfo->fake != NULL) {
24787 + assert("vs-1426", sinfo->fake->i_data.nrpages == 0);
24788 + iput(sinfo->fake);
24789 + sinfo->fake = NULL;
24790 + }
24791 +
24792 + if (sinfo->bitmap != NULL) {
24793 + iput(sinfo->bitmap);
24794 + sinfo->bitmap = NULL;
24795 + }
24796 +
24797 + if (sinfo->cc != NULL) {
24798 + iput(sinfo->cc);
24799 + sinfo->cc = NULL;
24800 + }
24801 + return;
24802 +}
24803 +
24804 +void reiser4_wait_page_writeback(struct page *page)
24805 +{
24806 + assert("zam-783", PageLocked(page));
24807 +
24808 + do {
24809 + unlock_page(page);
24810 + wait_on_page_writeback(page);
24811 + lock_page(page);
24812 + } while (PageWriteback(page));
24813 +}
24814 +
24815 +/* return tree @page is in */
24816 +reiser4_tree *tree_by_page(const struct page *page /* page to query */ )
24817 +{
24818 + assert("nikita-2461", page != NULL);
24819 + return &get_super_private(page->mapping->host->i_sb)->tree;
24820 +}
24821 +
24822 +/* completion handler for single page bio-based read.
24823 +
24824 + mpage_end_io_read() would also do. But it's static.
24825 +
24826 +*/
24827 +static int
24828 +end_bio_single_page_read(struct bio *bio, unsigned int bytes_done UNUSED_ARG,
24829 + int err UNUSED_ARG)
24830 +{
24831 + struct page *page;
24832 +
24833 + if (bio->bi_size != 0) {
24834 + warning("nikita-3332", "Truncated single page read: %i",
24835 + bio->bi_size);
24836 + return 1;
24837 + }
24838 +
24839 + page = bio->bi_io_vec[0].bv_page;
24840 +
24841 + if (test_bit(BIO_UPTODATE, &bio->bi_flags)) {
24842 + SetPageUptodate(page);
24843 + } else {
24844 + ClearPageUptodate(page);
24845 + SetPageError(page);
24846 + }
24847 + unlock_page(page);
24848 + bio_put(bio);
24849 + return 0;
24850 +}
24851 +
24852 +/* completion handler for single page bio-based write.
24853 +
24854 + mpage_end_io_write() would also do. But it's static.
24855 +
24856 +*/
24857 +static int
24858 +end_bio_single_page_write(struct bio *bio, unsigned int bytes_done UNUSED_ARG,
24859 + int err UNUSED_ARG)
24860 +{
24861 + struct page *page;
24862 +
24863 + if (bio->bi_size != 0) {
24864 + warning("nikita-3333", "Truncated single page write: %i",
24865 + bio->bi_size);
24866 + return 1;
24867 + }
24868 +
24869 + page = bio->bi_io_vec[0].bv_page;
24870 +
24871 + if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
24872 + SetPageError(page);
24873 + end_page_writeback(page);
24874 + bio_put(bio);
24875 + return 0;
24876 +}
24877 +
24878 +/* ->readpage() method for formatted nodes */
24879 +static int formatted_readpage(struct file *f UNUSED_ARG,
24880 + struct page *page /* page to read */ )
24881 +{
24882 + assert("nikita-2412", PagePrivate(page) && jprivate(page));
24883 + return page_io(page, jprivate(page), READ, get_gfp_mask());
24884 +}
24885 +
24886 +/**
24887 + * page_io - submit single-page bio request
24888 + * @page: page to perform io for
24889 + * @node: jnode of page
24890 + * @rw: read or write
24891 + * @gfp: gfp mask for bio allocation
24892 + *
24893 + * Submits single page read or write.
24894 + */
24895 +int page_io(struct page *page, jnode *node, int rw, gfp_t gfp)
24896 +{
24897 + struct bio *bio;
24898 + int result;
24899 +
24900 + assert("nikita-2094", page != NULL);
24901 + assert("nikita-2226", PageLocked(page));
24902 + assert("nikita-2634", node != NULL);
24903 + assert("nikita-2893", rw == READ || rw == WRITE);
24904 +
24905 + if (rw) {
24906 + if (unlikely(page->mapping->host->i_sb->s_flags & MS_RDONLY)) {
24907 + unlock_page(page);
24908 + return 0;
24909 + }
24910 + }
24911 +
24912 + bio = page_bio(page, node, rw, gfp);
24913 + if (!IS_ERR(bio)) {
24914 + if (rw == WRITE) {
24915 + SetPageWriteback(page);
24916 + unlock_page(page);
24917 + }
24918 + reiser4_submit_bio(rw, bio);
24919 + result = 0;
24920 + } else {
24921 + unlock_page(page);
24922 + result = PTR_ERR(bio);
24923 + }
24924 +
24925 + return result;
24926 +}
24927 +
24928 +/* helper function to construct bio for page */
24929 +static struct bio *page_bio(struct page *page, jnode * node, int rw, gfp_t gfp)
24930 +{
24931 + struct bio *bio;
24932 + assert("nikita-2092", page != NULL);
24933 + assert("nikita-2633", node != NULL);
24934 +
24935 + /* Simple implementation in the assumption that blocksize == pagesize.
24936 +
24937 + We only have to submit one block, but submit_bh() will allocate bio
24938 + anyway, so let's use all the bells-and-whistles of the bio code.
24939 + */
24940 +
24941 + bio = bio_alloc(gfp, 1);
24942 + if (bio != NULL) {
24943 + int blksz;
24944 + struct super_block *super;
24945 + reiser4_block_nr blocknr;
24946 +
24947 + super = page->mapping->host->i_sb;
24948 + assert("nikita-2029", super != NULL);
24949 + blksz = super->s_blocksize;
24950 + assert("nikita-2028", blksz == (int)PAGE_CACHE_SIZE);
24951 +
24952 + spin_lock_jnode(node);
24953 + blocknr = *jnode_get_io_block(node);
24954 + spin_unlock_jnode(node);
24955 +
24956 + assert("nikita-2275", blocknr != (reiser4_block_nr) 0);
24957 + assert("nikita-2276", !blocknr_is_fake(&blocknr));
24958 +
24959 + bio->bi_bdev = super->s_bdev;
24960 + /* fill bio->bi_sector before calling bio_add_page(), because
24961 + * q->merge_bvec_fn may want to inspect it (see
24962 + * drivers/md/linear.c:linear_mergeable_bvec() for example. */
24963 + bio->bi_sector = blocknr * (blksz >> 9);
24964 +
24965 + if (!bio_add_page(bio, page, blksz, 0)) {
24966 + warning("nikita-3452",
24967 + "Single page bio cannot be constructed");
24968 + bio_put(bio); /* do not leak the bio on failure */
24969 + return ERR_PTR(RETERR(-EINVAL));
24970 + }
24970 +
24971 + /* bio -> bi_idx is filled by bio_init() */
24972 + bio->bi_end_io = (rw == READ) ?
24973 + end_bio_single_page_read : end_bio_single_page_write;
24974 +
24975 + return bio;
24976 + } else
24977 + return ERR_PTR(RETERR(-ENOMEM));
24978 +}
24979 +
24980 +/* this function is internally called by jnode_make_dirty() */
24981 +int set_page_dirty_internal(struct page *page)
24982 +{
24983 + struct address_space *mapping;
24984 +
24985 + mapping = page->mapping;
24986 + BUG_ON(mapping == NULL);
24987 +
24988 + if (!TestSetPageDirty(page)) {
24989 + if (mapping_cap_account_dirty(mapping))
24990 + inc_page_state(nr_dirty);
24991 +
24992 + __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
24993 + }
24994 +
24995 + /* znode must be dirty ? */
24996 + if (mapping->host == get_super_fake(mapping->host->i_sb))
24997 + assert("", JF_ISSET(jprivate(page), JNODE_DIRTY));
24998 + return 0;
24999 +}
25000 +
25001 +#if REISER4_DEBUG
25002 +
25003 +/**
25004 + * can_hit_entd
25005 + *
25006 + * Used in an assertion in reiser4_writepage() to check that the page
25007 + * can safely be handed over to the ent daemon in the current context.
25007 + */
25008 +static int can_hit_entd(reiser4_context *ctx, struct super_block *s)
25009 +{
25010 + if (ctx == NULL || ((unsigned long)ctx->magic) != context_magic)
25011 + return 1;
25012 + if (ctx->super != s)
25013 + return 1;
25014 + if (get_super_private(s)->entd.tsk == current)
25015 + return 0;
25016 + if (!lock_stack_isclean(&ctx->stack))
25017 + return 0;
25018 + if (ctx->trans->atom != NULL)
25019 + return 0;
25020 + return 1;
25021 +}
25022 +
25023 +#endif
25024 +
25025 +/**
25026 + * reiser4_writepage - writepage of struct address_space_operations
25027 + * @page: page to write
25028 + * @wbc: writeback control passed by the VM
25029 + *
25030 + * Common memory pressure notification: delegates the actual write to
25031 + * the ent daemon via write_page_by_ent().
25032 + */
25033 +int reiser4_writepage(struct page *page,
25034 + struct writeback_control *wbc)
25035 +{
25036 + struct super_block *s;
25037 + reiser4_context *ctx;
25038 +
25039 + assert("vs-828", PageLocked(page));
25040 +
25041 + s = page->mapping->host->i_sb;
25042 + ctx = get_current_context_check();
25043 +
25044 + assert("", can_hit_entd(ctx, s));
25045 +
25046 + return write_page_by_ent(page, wbc);
25047 +}
25048 +
25049 +/* ->set_page_dirty() method of formatted address_space */
25050 +static int formatted_set_page_dirty(struct page *page)
25051 +{
25052 + assert("nikita-2173", page != NULL);
25053 + BUG();
25054 + return __set_page_dirty_nobuffers(page);
25055 +}
25056 +
25057 +/* the writepages method of address space operations in reiser4 is used to
25058 + capture into transactions pages which are dirtied via mmap. Only regular
25059 + files can have such pages. The fake inode is used to access formatted
25060 + nodes via the page cache. As formatted nodes can never be mmapped, the
25061 + fake inode's writepages has nothing to do */
25062 +static int
25063 +writepages_fake(struct address_space *mapping, struct writeback_control *wbc)
25064 +{
25065 + return 0;
25066 +}
25067 +
25068 +/* address space operations for the fake inode */
25069 +static struct address_space_operations formatted_fake_as_ops = {
25070 + /* Perform a writeback of a single page as a memory-freeing
25071 + * operation. */
25072 + .writepage = reiser4_writepage,
25073 + /* this is called to read formatted node */
25074 + .readpage = formatted_readpage,
25075 + /* ->sync_page() method of fake inode address space operations. Called
25076 + from wait_on_page() and lock_page().
25077 +
25078 + This is a most annoyingly misnamed method. Actually it is called
25079 + from wait_on_page_bit() and lock_page() and its purpose is to
25080 + actually start io by jabbing device drivers.
25081 + */
25082 + .sync_page = block_sync_page,
25083 + /* Write back some dirty pages from this mapping. Called from sync.
25084 + called during sync (pdflush) */
25085 + .writepages = writepages_fake,
25086 + /* Set a page dirty */
25087 + .set_page_dirty = formatted_set_page_dirty,
25088 + /* used for read-ahead. Not applicable */
25089 + .readpages = NULL,
25090 + .prepare_write = NULL,
25091 + .commit_write = NULL,
25092 + .bmap = NULL,
25093 + /* called just before page is being detached from inode mapping and
25094 + removed from memory. Called on truncate, cut/squeeze, and
25095 + umount. */
25096 + .invalidatepage = reiser4_invalidatepage,
25097 + /* this is called by shrink_cache() so that the file system can try to
25098 + release objects (jnodes, buffers, journal heads) attached to the page
25099 + and maybe make the page itself free-able.
25100 + */
25101 + .releasepage = reiser4_releasepage,
25102 + .direct_IO = NULL
25103 +};
25104 +
25105 +/* called just before page is released (no longer used by reiser4). Callers:
25106 + jdelete() and extent2tail(). */
25107 +void drop_page(struct page *page)
25108 +{
25109 + assert("nikita-2181", PageLocked(page));
25110 + clear_page_dirty_for_io(page);
25111 + ClearPageUptodate(page);
25112 +#if defined(PG_skipped)
25113 + ClearPageSkipped(page);
25114 +#endif
25115 + if (page->mapping != NULL) {
25116 + remove_from_page_cache(page);
25117 + unlock_page(page);
25118 + page_cache_release(page);
25119 + } else
25120 + unlock_page(page);
25121 +}
25122 +
25123 +/* this is called by truncate_jnodes_range which in its turn is always called
25124 + after truncate_mapping_pages_range. Therefore, here the jnode cannot have a
25125 + page. New pages cannot be created because truncate_jnodes_range runs with
25126 + exclusive access to the file, whereas new page creation requires
25127 + non-exclusive access */
25128 +static void invalidate_unformatted(jnode * node)
25129 +{
25130 + struct page *page;
25131 +
25132 + spin_lock_jnode(node);
25133 + page = node->pg;
25134 + if (page) {
25135 + loff_t from, to;
25136 +
25137 + page_cache_get(page);
25138 + spin_unlock_jnode(node);
25139 + /* FIXME: use truncate_complete_page instead */
25140 + from = (loff_t) page->index << PAGE_CACHE_SHIFT;
25141 + to = from + PAGE_CACHE_SIZE - 1;
25142 + truncate_inode_pages_range(page->mapping, from, to);
25143 + page_cache_release(page);
25144 + } else {
25145 + JF_SET(node, JNODE_HEARD_BANSHEE);
25146 + uncapture_jnode(node);
25147 + unhash_unformatted_jnode(node);
25148 + }
25149 +}
25150 +
25151 +#define JNODE_GANG_SIZE (16)
25152 +
25153 +/* find all eflushed jnodes from range specified and invalidate them */
25154 +static int
25155 +truncate_jnodes_range(struct inode *inode, pgoff_t from, pgoff_t count)
25156 +{
25157 + reiser4_inode *info;
25158 + int truncated_jnodes;
25159 + reiser4_tree *tree;
25160 + unsigned long index;
25161 + unsigned long end;
25162 +
25163 + truncated_jnodes = 0;
25164 +
25165 + info = reiser4_inode_data(inode);
25166 + tree = tree_by_inode(inode);
25167 +
25168 + index = from;
25169 + end = from + count;
25170 +
25171 + while (1) {
25172 + jnode *gang[JNODE_GANG_SIZE];
25173 + int taken;
25174 + int i;
25175 + jnode *node;
25176 +
25177 + assert("nikita-3466", index <= end);
25178 +
25179 + read_lock_tree(tree);
25180 + taken =
25181 + radix_tree_gang_lookup(jnode_tree_by_reiser4_inode(info),
25182 + (void **)gang, index,
25183 + JNODE_GANG_SIZE);
25184 + for (i = 0; i < taken; ++i) {
25185 + node = gang[i];
25186 + if (index_jnode(node) < end)
25187 + jref(node);
25188 + else
25189 + gang[i] = NULL;
25190 + }
25191 + read_unlock_tree(tree);
25192 +
25193 + for (i = 0; i < taken; ++i) {
25194 + node = gang[i];
25195 + if (node != NULL) {
25196 + index = max(index, index_jnode(node));
25197 + invalidate_unformatted(node);
25198 + truncated_jnodes++;
25199 + jput(node);
25200 + } else
25201 + break;
25202 + }
25203 + if (i != taken || taken == 0)
25204 + break;
25205 + }
25206 + return truncated_jnodes;
25207 +}
25208 +
25209 +void
25210 +reiser4_invalidate_pages(struct address_space *mapping, pgoff_t from,
25211 + unsigned long count, int even_cows)
25212 +{
25213 + loff_t from_bytes, count_bytes;
25214 +
25215 + if (count == 0)
25216 + return;
25217 + from_bytes = ((loff_t) from) << PAGE_CACHE_SHIFT;
25218 + count_bytes = ((loff_t) count) << PAGE_CACHE_SHIFT;
25219 +
25220 + unmap_mapping_range(mapping, from_bytes, count_bytes, even_cows);
25221 + truncate_inode_pages_range(mapping, from_bytes,
25222 + from_bytes + count_bytes - 1);
25223 + truncate_jnodes_range(mapping->host, from, count);
25224 +}
25225 +
25226 +/*
25227 + * Local variables:
25228 + * c-indentation-style: "K&R"
25229 + * mode-name: "LC"
25230 + * c-basic-offset: 8
25231 + * tab-width: 8
25232 + * fill-column: 120
25233 + * scroll-step: 1
25234 + * End:
25235 + */
25236 Index: linux-2.6.16/fs/reiser4/page_cache.h
25237 ===================================================================
25238 --- /dev/null
25239 +++ linux-2.6.16/fs/reiser4/page_cache.h
25240 @@ -0,0 +1,62 @@
25241 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
25242 + * reiser4/README */
25243 +/* Memory pressure hooks. Fake inodes handling. See page_cache.c. */
25244 +
25245 +#if !defined( __REISER4_PAGE_CACHE_H__ )
25246 +#define __REISER4_PAGE_CACHE_H__
25247 +
25248 +#include "forward.h"
25249 +#include "debug.h"
25250 +
25251 +#include <linux/fs.h> /* for struct super_block, address_space */
25252 +#include <linux/mm.h> /* for struct page */
25253 +#include <linux/pagemap.h> /* for lock_page() */
25254 +
25255 +
25256 +extern int init_formatted_fake(struct super_block *);
25257 +extern void done_formatted_fake(struct super_block *);
25258 +
25259 +extern reiser4_tree *tree_by_page(const struct page *);
25260 +
25261 +extern int set_page_dirty_internal(struct page *);
25262 +
25263 +#define reiser4_submit_bio(rw, bio) submit_bio((rw), (bio))
25264 +
25265 +extern void reiser4_wait_page_writeback(struct page *);
25266 +static inline void lock_and_wait_page_writeback(struct page *page)
25267 +{
25268 + lock_page(page);
25269 + if (unlikely(PageWriteback(page)))
25270 + reiser4_wait_page_writeback(page);
25271 +}
25272 +
25273 +#define jprivate(page) ((jnode *)page_private(page))
25274 +
25275 +extern int page_io(struct page *, jnode *, int rw, gfp_t);
25276 +extern void drop_page(struct page *);
25277 +extern void reiser4_invalidate_pages(struct address_space *, pgoff_t from,
25278 + unsigned long count, int even_cows);
25279 +extern void capture_reiser4_inodes(struct super_block *,
25280 + struct writeback_control *);
25281 +
25282 +#define PAGECACHE_TAG_REISER4_MOVED PAGECACHE_TAG_DIRTY
25283 +
25284 +#if REISER4_DEBUG
25285 +extern void print_page(const char *prefix, struct page *page);
25286 +#else
25287 +#define print_page(prf, p) noop
25288 +#endif
25289 +
25290 +/* __REISER4_PAGE_CACHE_H__ */
25291 +#endif
25292 +
25293 +/* Make Linus happy.
25294 + Local variables:
25295 + c-indentation-style: "K&R"
25296 + mode-name: "LC"
25297 + c-basic-offset: 8
25298 + tab-width: 8
25299 + fill-column: 120
25300 + scroll-step: 1
25301 + End:
25302 +*/
25303 Index: linux-2.6.16/fs/reiser4/plugin/Makefile
25304 ===================================================================
25305 --- /dev/null
25306 +++ linux-2.6.16/fs/reiser4/plugin/Makefile
25307 @@ -0,0 +1,26 @@
25308 +obj-$(CONFIG_REISER4_FS) += plugins.o
25309 +
25310 +plugins-objs := \
25311 + plugin.o \
25312 + plugin_set.o \
25313 + object.o \
25314 + inode_ops.o \
25315 + inode_ops_rename.o \
25316 + file_ops.o \
25317 + file_ops_readdir.o \
25318 + file_plugin_common.o \
25319 + dir_plugin_common.o \
25320 + digest.o \
25321 + hash.o \
25322 + fibration.o \
25323 + tail_policy.o \
25324 + regular.o
25325 +
25326 +obj-$(CONFIG_REISER4_FS) += item/
25327 +obj-$(CONFIG_REISER4_FS) += file/
25328 +obj-$(CONFIG_REISER4_FS) += dir/
25329 +obj-$(CONFIG_REISER4_FS) += node/
25330 +obj-$(CONFIG_REISER4_FS) += compress/
25331 +obj-$(CONFIG_REISER4_FS) += space/
25332 +obj-$(CONFIG_REISER4_FS) += disk_format/
25333 +obj-$(CONFIG_REISER4_FS) += security/
25334 Index: linux-2.6.16/fs/reiser4/plugin/cluster.c
25335 ===================================================================
25336 --- /dev/null
25337 +++ linux-2.6.16/fs/reiser4/plugin/cluster.c
25338 @@ -0,0 +1,66 @@
25339 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
25340 +
25341 +/* Contains reiser4 cluster plugins (see
25342 + http://www.namesys.com/cryptcompress_design.html
25343 + "Concepts of clustering" for details). */
25344 +
25345 +#include "plugin_header.h"
25346 +#include "plugin.h"
25347 +#include "../inode.h"
25348 +
25349 +static int change_cluster(struct inode *inode, reiser4_plugin * plugin)
25350 +{
25351 + int result = 0;
25352 +
25353 + assert("edward-1324", inode != NULL);
25354 + assert("edward-1325", plugin != NULL);
25355 + assert("edward-1326", is_reiser4_inode(inode));
25356 + assert("edward-1327", plugin->h.type_id == REISER4_CLUSTER_PLUGIN_TYPE);
25357 +
25358 + if (inode_file_plugin(inode)->h.id == DIRECTORY_FILE_PLUGIN_ID)
25359 + result = plugin_set_cluster(&reiser4_inode_data(inode)->pset,
25360 + &plugin->clust);
25361 + else
25362 + result = RETERR(-EINVAL);
25363 + return result;
25364 +}
25365 +
25366 +static reiser4_plugin_ops cluster_plugin_ops = {
25367 + .init = NULL,
25368 + .load = NULL,
25369 + .save_len = NULL,
25370 + .save = NULL,
25371 + .change = &change_cluster
25372 +};
25373 +
25374 +#define SUPPORT_CLUSTER(SHIFT, ID, LABEL, DESC) \
25375 + [CLUSTER_ ## ID ## _ID] = { \
25376 + .h = { \
25377 + .type_id = REISER4_CLUSTER_PLUGIN_TYPE, \
25378 + .id = CLUSTER_ ## ID ## _ID, \
25379 + .pops = &cluster_plugin_ops, \
25380 + .label = LABEL, \
25381 + .desc = DESC, \
25382 + .linkage = {NULL, NULL} \
25383 + }, \
25384 + .shift = SHIFT \
25385 + }
25386 +
25387 +cluster_plugin cluster_plugins[LAST_CLUSTER_ID] = {
25388 + SUPPORT_CLUSTER(16, 64K, "64K", "Large"),
25389 + SUPPORT_CLUSTER(15, 32K, "32K", "Big"),
25390 + SUPPORT_CLUSTER(14, 16K, "16K", "Average"),
25391 + SUPPORT_CLUSTER(13, 8K, "8K", "Small"),
25392 + SUPPORT_CLUSTER(12, 4K, "4K", "Minimal")
25393 +};
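For reference, the first entry of this table is a purely mechanical expansion of the SUPPORT_CLUSTER macro above:

	/* SUPPORT_CLUSTER(16, 64K, "64K", "Large") expands to: */
	[CLUSTER_64K_ID] = {
		.h = {
			.type_id = REISER4_CLUSTER_PLUGIN_TYPE,
			.id = CLUSTER_64K_ID,
			.pops = &cluster_plugin_ops,
			.label = "64K",
			.desc = "Large",
			.linkage = {NULL, NULL}
		},
		.shift = 16	/* cluster size = 1 << 16 = 64KiB */
	}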
25394 +
25395 +/*
25396 + Local variables:
25397 + c-indentation-style: "K&R"
25398 + mode-name: "LC"
25399 + c-basic-offset: 8
25400 + tab-width: 8
25401 + fill-column: 120
25402 + scroll-step: 1
25403 + End:
25404 +*/
25405 Index: linux-2.6.16/fs/reiser4/plugin/cluster.h
25406 ===================================================================
25407 --- /dev/null
25408 +++ linux-2.6.16/fs/reiser4/plugin/cluster.h
25409 @@ -0,0 +1,316 @@
25410 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
25411 +
25412 +/* This file contains page/cluster index translators and offset modulators.
25413 + See http://www.namesys.com/cryptcompress_design.html for details */
25414 +
25415 +#if !defined( __FS_REISER4_CLUSTER_H__ )
25416 +#define __FS_REISER4_CLUSTER_H__
25417 +
25418 +#include "../inode.h"
25419 +
25420 +static inline int inode_cluster_shift(struct inode *inode)
25421 +{
25422 + assert("edward-92", inode != NULL);
25423 + assert("edward-93", reiser4_inode_data(inode) != NULL);
25424 +
25425 + return inode_cluster_plugin(inode)->shift;
25426 +}
25427 +
25428 +static inline unsigned cluster_nrpages_shift(struct inode *inode)
25429 +{
25430 + return inode_cluster_shift(inode) - PAGE_CACHE_SHIFT;
25431 +}
25432 +
25433 +/* cluster size in page units */
25434 +static inline unsigned cluster_nrpages(struct inode *inode)
25435 +{
25436 + return 1U << cluster_nrpages_shift(inode);
25437 +}
25438 +
25439 +static inline size_t inode_cluster_size(struct inode *inode)
25440 +{
25441 + assert("edward-96", inode != NULL);
25442 +
25443 + return 1U << inode_cluster_shift(inode);
25444 +}
25445 +
25446 +static inline cloff_t pg_to_clust(pgoff_t idx, struct inode *inode)
25447 +{
25448 + return idx >> cluster_nrpages_shift(inode);
25449 +}
25450 +
25451 +static inline pgoff_t clust_to_pg(cloff_t idx, struct inode *inode)
25452 +{
25453 + return idx << cluster_nrpages_shift(inode);
25454 +}
25455 +
25456 +static inline pgoff_t pg_to_clust_to_pg(pgoff_t idx, struct inode *inode)
25457 +{
25458 + return clust_to_pg(pg_to_clust(idx, inode), inode);
25459 +}
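A worked example of the translators so far, assuming 4K pages (PAGE_CACHE_SHIFT == 12) and the 64K cluster plugin (inode_cluster_shift() == 16):

	cluster_nrpages_shift(inode) == 16 - 12 == 4
	cluster_nrpages(inode)       == 1 << 4  == 16 pages per cluster
	pg_to_clust(37, inode)       == 37 >> 4 == 2  (page 37 is in cluster 2)
	clust_to_pg(2, inode)        == 2 << 4  == 32 (first page of cluster 2)
	pg_to_clust_to_pg(37, inode) == 32 (page index rounded down to cluster start)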
25460 +
25461 +static inline pgoff_t off_to_pg(loff_t off)
25462 +{
25463 + return (off >> PAGE_CACHE_SHIFT);
25464 +}
25465 +
25466 +static inline loff_t pg_to_off(pgoff_t idx)
25467 +{
25468 + return ((loff_t) (idx) << PAGE_CACHE_SHIFT);
25469 +}
25470 +
25471 +static inline cloff_t off_to_clust(loff_t off, struct inode *inode)
25472 +{
25473 + return off >> inode_cluster_shift(inode);
25474 +}
25475 +
25476 +static inline loff_t clust_to_off(cloff_t idx, struct inode *inode)
25477 +{
25478 + return (loff_t) idx << inode_cluster_shift(inode);
25479 +}
25480 +
25481 +static inline unsigned long count_to_nr(loff_t count, unsigned shift)
25482 +{
25483 + return (count + (1UL << shift) - 1) >> shift;
25484 +}
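count_to_nr() is the usual round-up division by a power of two: with shift == 12 (4K units), count_to_nr(8193, 12) == (8193 + 4095) >> 12 == 3, since 8193 bytes need three 4K units, while count_to_nr(8192, 12) == 2 exactly.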
25485 +
25486 +/* number of pages occupied by @count bytes */
25487 +static inline pgoff_t count_to_nrpages(loff_t count)
25488 +{
25489 + return count_to_nr(count, PAGE_CACHE_SHIFT);
25490 +}
25491 +
25492 +/* number of clusters occupied by @count bytes */
25493 +static inline cloff_t count_to_nrclust(loff_t count, struct inode *inode)
25494 +{
25495 + return count_to_nr(count, inode_cluster_shift(inode));
25496 +}
25497 +
25498 +/* number of clusters occupied by @count pages */
25499 +static inline cloff_t pgcount_to_nrclust(pgoff_t count, struct inode *inode)
25500 +{
25501 + return count_to_nr(count, cluster_nrpages_shift(inode));
25502 +}
25503 +
25504 +static inline loff_t off_to_clust_to_off(loff_t off, struct inode *inode)
25505 +{
25506 + return clust_to_off(off_to_clust(off, inode), inode);
25507 +}
25508 +
25509 +static inline pgoff_t off_to_clust_to_pg(loff_t off, struct inode *inode)
25510 +{
25511 + return clust_to_pg(off_to_clust(off, inode), inode);
25512 +}
25513 +
25514 +static inline unsigned off_to_pgoff(loff_t off)
25515 +{
25516 + return off & (PAGE_CACHE_SIZE - 1);
25517 +}
25518 +
25519 +static inline unsigned off_to_cloff(loff_t off, struct inode *inode)
25520 +{
25521 + return off & ((loff_t) (inode_cluster_size(inode)) - 1);
25522 +}
25523 +
25524 +static inline unsigned
25525 +pg_to_off_to_cloff(unsigned long idx, struct inode *inode)
25526 +{
25527 + return off_to_cloff(pg_to_off(idx), inode);
25528 +}
25529 +
25530 +/* if @size != 0, returns index of the page
25531 + which contains the last byte of the file */
25532 +static inline pgoff_t size_to_pg(loff_t size)
25533 +{
25534 + return (size ? off_to_pg(size - 1) : 0);
25535 +}
25536 +
25537 +/* minimal index of the page which doesn't contain
25538 + file data */
25539 +static inline pgoff_t size_to_next_pg(loff_t size)
25540 +{
25541 + return (size ? off_to_pg(size - 1) + 1 : 0);
25542 +}
25543 +
25544 +/* how many bytes of file of size @cnt can be contained
25545 + in page of index @idx */
25546 +static inline unsigned cnt_to_pgcnt(loff_t cnt, pgoff_t idx)
25547 +{
25548 + if (idx > off_to_pg(cnt))
25549 + return 0;
25550 + if (idx < off_to_pg(cnt))
25551 + return PAGE_CACHE_SIZE;
25552 + return off_to_pgoff(cnt);
25553 +}
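A worked example, again assuming 4K pages: for a file of size cnt == 10000 bytes, off_to_pg(10000) == 2, so

	cnt_to_pgcnt(10000, 1) == PAGE_CACHE_SIZE (page 1 lies entirely inside)
	cnt_to_pgcnt(10000, 2) == off_to_pgoff(10000) == 10000 - 8192 == 1808
	cnt_to_pgcnt(10000, 3) == 0 (past the end of the file)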
25554 +
25555 +/* how many bytes of file of size @cnt can be contained
25556 + in logical cluster of index @idx */
25557 +static inline unsigned cnt_to_clcnt(loff_t cnt, cloff_t idx,
25558 + struct inode *inode)
25559 +{
25560 + if (idx > off_to_clust(cnt, inode))
25561 + return 0;
25562 + if (idx < off_to_clust(cnt, inode))
25563 + return inode_cluster_size(inode);
25564 + return off_to_cloff(cnt, inode);
25565 +}
25566 +
25567 +static inline unsigned
25568 +fsize_to_count(reiser4_cluster_t * clust, struct inode *inode)
25569 +{
25570 + assert("edward-288", clust != NULL);
25571 + assert("edward-289", inode != NULL);
25572 +
25573 + return cnt_to_clcnt(inode->i_size, clust->index, inode);
25574 +}
25575 +
25576 +static inline int
25577 +cluster_is_complete(reiser4_cluster_t * clust, struct inode * inode)
25578 +{
25579 + return clust->tc.lsize == inode_cluster_size(inode);
25580 +}
25581 +
25582 +static inline void reiser4_slide_init(reiser4_slide_t * win)
25583 +{
25584 + assert("edward-1084", win != NULL);
25585 + memset(win, 0, sizeof *win);
25586 +}
25587 +
25588 +static inline void
25589 +tfm_cluster_init_act(tfm_cluster_t * tc, tfm_action act)
25590 +{
25591 + assert("edward-1356", tc != NULL);
25592 + tc->act = act;
25593 +}
25594 +
25595 +static inline void
25596 +cluster_init_act (reiser4_cluster_t * clust, tfm_action act, reiser4_slide_t * window){
25597 + assert("edward-84", clust != NULL);
25598 + memset(clust, 0, sizeof *clust);
25599 + tfm_cluster_init_act(&clust->tc, act);
25600 + clust->dstat = INVAL_DISK_CLUSTER;
25601 + clust->win = window;
25602 +}
25603 +
25604 +static inline void
25605 +cluster_init_read(reiser4_cluster_t * clust, reiser4_slide_t * window)
25606 +{
25607 + cluster_init_act (clust, TFM_READ_ACT, window);
25608 +}
25609 +
25610 +static inline void
25611 +cluster_init_write(reiser4_cluster_t * clust, reiser4_slide_t * window)
25612 +{
25613 + cluster_init_act (clust, TFM_WRITE_ACT, window);
25614 +}
25615 +
25616 +static inline int dclust_get_extension(hint_t * hint)
25617 +{
25618 + return hint->ext_coord.extension.ctail.shift;
25619 +}
25620 +
25621 +static inline void dclust_set_extension(hint_t * hint)
25622 +{
25623 + assert("edward-1270",
25624 + item_id_by_coord(&hint->ext_coord.coord) == CTAIL_ID);
25625 + hint->ext_coord.extension.ctail.shift =
25626 + cluster_shift_by_coord(&hint->ext_coord.coord);
25627 +}
25628 +
25629 +static inline int hint_is_unprepped_dclust(hint_t * hint)
25630 +{
25631 + return dclust_get_extension(hint) == (int)UCTAIL_SHIFT;
25632 +}
25633 +
25634 +static inline void coord_set_between_clusters(coord_t * coord)
25635 +{
25636 +#if REISER4_DEBUG
25637 + int result;
25638 + result = zload(coord->node);
25639 + assert("edward-1296", !result);
25640 +#endif
25641 + if (!coord_is_between_items(coord)) {
25642 + coord->between = AFTER_ITEM;
25643 + coord->unit_pos = 0;
25644 + }
25645 +#if REISER4_DEBUG
25646 + zrelse(coord->node);
25647 +#endif
25648 +}
25649 +
25650 +int inflate_cluster(reiser4_cluster_t *, struct inode *);
25651 +int find_cluster(reiser4_cluster_t *, struct inode *, int read, int write);
25652 +void forget_cluster_pages(struct page **page, int nrpages);
25653 +int flush_cluster_pages(reiser4_cluster_t *, jnode *, struct inode *);
25654 +int deflate_cluster(reiser4_cluster_t *, struct inode *);
25655 +void truncate_page_cluster(struct inode *inode, cloff_t start);
25656 +void invalidate_hint_cluster(reiser4_cluster_t * clust);
25657 +void put_hint_cluster(reiser4_cluster_t * clust, struct inode *inode,
25658 + znode_lock_mode mode);
25659 +int get_disk_cluster_locked(reiser4_cluster_t * clust, struct inode *inode,
25660 + znode_lock_mode lock_mode);
25661 +void reset_cluster_params(reiser4_cluster_t * clust);
25662 +int set_cluster_by_page(reiser4_cluster_t * clust, struct page * page,
25663 + int count);
25664 +int prepare_page_cluster(struct inode *inode, reiser4_cluster_t * clust,
25665 + int capture);
25666 +void release_cluster_pages(reiser4_cluster_t *);
25667 +void put_cluster_handle(reiser4_cluster_t * clust);
25668 +int grab_tfm_stream(struct inode *inode, tfm_cluster_t * tc, tfm_stream_id id);
25669 +int tfm_cluster_is_uptodate(tfm_cluster_t * tc);
25670 +void tfm_cluster_set_uptodate(tfm_cluster_t * tc);
25671 +void tfm_cluster_clr_uptodate(tfm_cluster_t * tc);
25672 +
25673 +/* move cluster handle to the target position
25674 + specified by the page of index @pgidx
25675 +*/
25676 +static inline void
25677 +move_cluster_forward(reiser4_cluster_t * clust, struct inode *inode,
25678 + pgoff_t pgidx, int *progress)
25679 +{
25680 + assert("edward-1297", clust != NULL);
25681 + assert("edward-1298", inode != NULL);
25682 +
25683 + reset_cluster_params(clust);
25684 + if (*progress &&
25685 + /* Hole in the indices. The hint became invalid and cannot be
25686 + used by find_cluster_item() even if seal/node versions
25687 + coincide */
25688 + pg_to_clust(pgidx, inode) != clust->index + 1) {
25689 + unset_hint(clust->hint);
25690 + invalidate_hint_cluster(clust);
25691 + }
25692 + *progress = 1;
25693 + clust->index = pg_to_clust(pgidx, inode);
25694 +}
25695 +
25696 +static inline int
25697 +alloc_clust_pages(reiser4_cluster_t * clust, struct inode *inode)
25698 +{
25699 + assert("edward-791", clust != NULL);
25700 + assert("edward-792", inode != NULL);
25701 + clust->pages =
25702 + kmalloc(sizeof(*clust->pages) << inode_cluster_shift(inode),
25703 + GFP_KERNEL);
25704 + if (!clust->pages)
25705 + return -ENOMEM;
25706 + return 0;
25707 +}
25708 +
25709 +static inline void free_clust_pages(reiser4_cluster_t * clust)
25710 +{
25711 + kfree(clust->pages);
25712 +}
25713 +
25714 +#endif /* __FS_REISER4_CLUSTER_H__ */
25715 +
25716 +/* Make Linus happy.
25717 + Local variables:
25718 + c-indentation-style: "K&R"
25719 + mode-name: "LC"
25720 + c-basic-offset: 8
25721 + tab-width: 8
25722 + fill-column: 120
25723 + scroll-step: 1
25724 + End:
25725 +*/
25726 Index: linux-2.6.16/fs/reiser4/plugin/compress/Makefile
25727 ===================================================================
25728 --- /dev/null
25729 +++ linux-2.6.16/fs/reiser4/plugin/compress/Makefile
25730 @@ -0,0 +1,6 @@
25731 +obj-$(CONFIG_REISER4_FS) += compress_plugins.o
25732 +
25733 +compress_plugins-objs := \
25734 + compress.o \
25735 + minilzo.o \
25736 + compress_mode.o
25737 Index: linux-2.6.16/fs/reiser4/plugin/compress/compress.c
25738 ===================================================================
25739 --- /dev/null
25740 +++ linux-2.6.16/fs/reiser4/plugin/compress/compress.c
25741 @@ -0,0 +1,370 @@
25742 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
25743 +/* reiser4 compression transform plugins */
25744 +
25745 +#include "../../debug.h"
25746 +#include "../../inode.h"
25747 +#include "../plugin.h"
25748 +#include "minilzo.h"
25749 +
25750 +#include <linux/config.h>
25751 +#include <linux/zlib.h>
25752 +#include <linux/types.h>
25753 +#include <linux/hardirq.h>
25754 +
25755 +static int change_compression(struct inode *inode, reiser4_plugin * plugin)
25756 +{
25757 + assert("edward-1316", inode != NULL);
25758 + assert("edward-1317", plugin != NULL);
25759 + assert("edward-1318", is_reiser4_inode(inode));
25760 + assert("edward-1319",
25761 + plugin->h.type_id == REISER4_COMPRESSION_PLUGIN_TYPE);
25762 + /* cannot change compression plugin of already existing object */
25763 + return RETERR(-EINVAL);
25764 +}
25765 +
25766 +static reiser4_plugin_ops compression_plugin_ops = {
25767 + .init = NULL,
25768 + .load = NULL,
25769 + .save_len = NULL,
25770 + .save = NULL,
25771 + .change = &change_compression
25772 +};
25773 +
25774 +/******************************************************************************/
25775 +/* gzip1 compression */
25776 +/******************************************************************************/
25777 +
25778 +#define GZIP1_DEF_LEVEL Z_BEST_SPEED
25779 +#define GZIP1_DEF_WINBITS 15
25780 +#define GZIP1_DEF_MEMLEVEL MAX_MEM_LEVEL
25781 +
25782 +static int gzip1_init(void)
25783 +{
25784 + int ret = -EINVAL;
25785 +#if REISER4_ZLIB
25786 + ret = 0;
25787 +#endif
25788 + if (ret == -EINVAL)
25789 + warning("edward-1337", "Zlib not compiled into kernel");
25790 + return ret;
25791 +}
25792 +
25793 +static int gzip1_overrun(unsigned src_len UNUSED_ARG)
25794 +{
25795 + return 0;
25796 +}
25797 +
25798 +static coa_t gzip1_alloc(tfm_action act)
25799 +{
25800 + coa_t coa = NULL;
25801 +#if REISER4_ZLIB
25802 + int ret = 0;
25803 + switch (act) {
25804 + case TFM_WRITE_ACT: /* compress */
25805 + coa = vmalloc(zlib_deflate_workspacesize());
25806 + if (!coa) {
25807 + ret = -ENOMEM;
25808 + break;
25809 + }
25810 + memset(coa, 0, zlib_deflate_workspacesize());
25811 + break;
25812 + case TFM_READ_ACT: /* decompress */
25813 + coa = vmalloc(zlib_inflate_workspacesize());
25814 + if (!coa) {
25815 + ret = -ENOMEM;
25816 + break;
25817 + }
25818 + memset(coa, 0, zlib_inflate_workspacesize());
25819 + break;
25820 + default:
25821 + impossible("edward-767",
25822 + "trying to alloc workspace for unknown tfm action");
25823 + }
25824 + if (ret) {
25825 + warning("edward-768",
25826 + "alloc workspace for gzip1 (tfm action = %d) failed\n",
25827 + act);
25828 + return ERR_PTR(ret);
25829 + }
25830 +#endif
25831 + return coa;
25832 +}
25833 +
25834 +static void gzip1_free(coa_t coa, tfm_action act)
25835 +{
25836 + assert("edward-769", coa != NULL);
25837 +
25838 + switch (act) {
25839 + case TFM_WRITE_ACT: /* compress */
25840 + vfree(coa);
25841 + break;
25842 + case TFM_READ_ACT: /* decompress */
25843 + vfree(coa);
25844 + break;
25845 + default:
25846 + impossible("edward-770", "unknown tfm action");
25847 + }
25848 + return;
25849 +}
25850 +
25851 +static int gzip1_min_size_deflate(void)
25852 +{
25853 + return 64;
25854 +}
25855 +
25856 +static void
25857 +gzip1_compress(coa_t coa, __u8 * src_first, unsigned src_len,
25858 + __u8 * dst_first, unsigned *dst_len)
25859 +{
25860 +#if REISER4_ZLIB
25861 + int ret = 0;
25862 + struct z_stream_s stream;
25863 +
25864 + memset(&stream, 0, sizeof(stream));
25865 +
25866 + assert("edward-842", coa != NULL);
25867 + assert("edward-875", src_len != 0);
25868 +
25869 + stream.workspace = coa;
25870 + ret = zlib_deflateInit2(&stream, GZIP1_DEF_LEVEL, Z_DEFLATED,
25871 + -GZIP1_DEF_WINBITS, GZIP1_DEF_MEMLEVEL,
25872 + Z_DEFAULT_STRATEGY);
25873 + if (ret != Z_OK) {
25874 + warning("edward-771", "zlib_deflateInit2 returned %d\n", ret);
25875 + goto rollback;
25876 + }
25877 + ret = zlib_deflateReset(&stream);
25878 + if (ret != Z_OK) {
25879 + warning("edward-772", "zlib_deflateReset returned %d\n", ret);
25880 + goto rollback;
25881 + }
25882 + stream.next_in = src_first;
25883 + stream.avail_in = src_len;
25884 + stream.next_out = dst_first;
25885 + stream.avail_out = *dst_len;
25886 +
25887 + ret = zlib_deflate(&stream, Z_FINISH);
25888 + if (ret != Z_STREAM_END) {
25889 + if (ret != Z_OK)
25890 + warning("edward-773",
25891 + "zlib_deflate returned %d\n", ret);
25892 + goto rollback;
25893 + }
25894 + *dst_len = stream.total_out;
25895 + return;
25896 + rollback:
25897 + *dst_len = src_len;
25898 +#endif
25899 + return;
25900 +}
25901 +
25902 +static void
25903 +gzip1_decompress(coa_t coa, __u8 * src_first, unsigned src_len,
25904 + __u8 * dst_first, unsigned *dst_len)
25905 +{
25906 +#if REISER4_ZLIB
25907 + int ret = 0;
25908 + struct z_stream_s stream;
25909 +
25910 + memset(&stream, 0, sizeof(stream));
25911 +
25912 + assert("edward-843", coa != NULL);
25913 + assert("edward-876", src_len != 0);
25914 +
25915 + stream.workspace = coa;
25916 + ret = zlib_inflateInit2(&stream, -GZIP1_DEF_WINBITS);
25917 + if (ret != Z_OK) {
25918 + warning("edward-774", "zlib_inflateInit2 returned %d\n", ret);
25919 + return;
25920 + }
25921 + ret = zlib_inflateReset(&stream);
25922 + if (ret != Z_OK) {
25923 + warning("edward-775", "zlib_inflateReset returned %d\n", ret);
25924 + return;
25925 + }
25926 +
25927 + stream.next_in = src_first;
25928 + stream.avail_in = src_len;
25929 + stream.next_out = dst_first;
25930 + stream.avail_out = *dst_len;
25931 +
25932 + ret = zlib_inflate(&stream, Z_SYNC_FLUSH);
25933 + /*
25934 + * Work around a bug in zlib, which sometimes wants to taste an extra
25935 + * byte when being used in the (undocumented) raw deflate mode.
25936 + * (From USAGI).
25937 + */
25938 + if (ret == Z_OK && !stream.avail_in && stream.avail_out) {
25939 + u8 zerostuff = 0;
25940 + stream.next_in = &zerostuff;
25941 + stream.avail_in = 1;
25942 + ret = zlib_inflate(&stream, Z_FINISH);
25943 + }
25944 + if (ret != Z_STREAM_END) {
25945 + warning("edward-776", "zlib_inflate returned %d\n", ret);
25946 + return;
25947 + }
25948 + *dst_len = stream.total_out;
25949 +#endif
25950 + return;
25951 +}
25952 +
25953 +/******************************************************************************/
25954 +/* lzo1 compression */
25955 +/******************************************************************************/
25956 +
25957 +static int lzo1_init(void)
25958 +{
25959 + int ret;
25960 + ret = lzo_init();
25961 + if (ret != LZO_E_OK)
25962 + warning("edward-848", "lzo_init() failed with ret = %d\n", ret);
25963 + return ret;
25964 +}
25965 +
+/* extra output space to reserve beyond the source length, so that even
+   incompressible input fits into the destination buffer */
25966 +static int lzo1_overrun(unsigned in_len)
25967 +{
25968 +	return in_len / 64 + 16 + 3;
25969 +}
25970 +
25971 +#define LZO_HEAP_SIZE(size) \
25972 + sizeof(lzo_align_t) * (((size) + (sizeof(lzo_align_t) - 1)) / sizeof(lzo_align_t))
25973 +
25974 +static coa_t lzo1_alloc(tfm_action act)
25975 +{
25976 + int ret = 0;
25977 + coa_t coa = NULL;
25978 +
25979 + switch (act) {
25980 + case TFM_WRITE_ACT: /* compress */
25981 + coa = vmalloc(LZO_HEAP_SIZE(LZO1X_1_MEM_COMPRESS));
25982 + if (!coa) {
25983 + ret = -ENOMEM;
25984 + break;
25985 + }
25986 +		memset(coa, 0, LZO_HEAP_SIZE(LZO1X_1_MEM_COMPRESS));
+		/* fall through: decompression needs no workspace */
25987 +	case TFM_READ_ACT:	/* decompress */
25988 + break;
25989 + default:
25990 + impossible("edward-877",
25991 + "trying to alloc workspace for unknown tfm action");
25992 + }
25993 + if (ret) {
25994 + warning("edward-878",
25995 + "alloc workspace for lzo1 (tfm action = %d) failed\n",
25996 + act);
25997 + return ERR_PTR(ret);
25998 + }
25999 + return coa;
26000 +}
26001 +
26002 +static void lzo1_free(coa_t coa, tfm_action act)
26003 +{
26004 + assert("edward-879", coa != NULL);
26005 +
26006 + switch (act) {
26007 + case TFM_WRITE_ACT: /* compress */
26008 + vfree(coa);
26009 + break;
26010 + case TFM_READ_ACT: /* decompress */
26011 +		impossible("edward-1304",
26012 +			   "trying to free non-allocated workspace");
+		break;
26013 +	default:
26014 + impossible("edward-880", "unknown tfm action");
26015 + }
26016 + return;
26017 +}
26018 +
26019 +static int lzo1_min_size_deflate(void)
26020 +{
26021 + return 256;
26022 +}
26023 +
26024 +static void
26025 +lzo1_compress(coa_t coa, __u8 * src_first, unsigned src_len,
26026 + __u8 * dst_first, unsigned *dst_len)
26027 +{
26028 + int result;
26029 +
26030 + assert("edward-846", coa != NULL);
26031 + assert("edward-847", src_len != 0);
26032 +
26033 + result = lzo1x_1_compress(src_first, src_len, dst_first, dst_len, coa);
26034 + if (result != LZO_E_OK) {
26035 + warning("edward-849", "lzo1x_1_compress failed\n");
26036 + goto out;
26037 + }
26038 + if (*dst_len >= src_len) {
26039 + //warning("edward-850", "lzo1x_1_compress: incompressible data\n");
26040 + goto out;
26041 + }
26042 + return;
26043 + out:
26044 + *dst_len = src_len;
26045 + return;
26046 +}
26047 +
26048 +static void
26049 +lzo1_decompress(coa_t coa, __u8 * src_first, unsigned src_len,
26050 + __u8 * dst_first, unsigned *dst_len)
26051 +{
26052 + int result;
26053 +
26054 + assert("edward-851", coa == NULL);
26055 + assert("edward-852", src_len != 0);
26056 +
26057 + result = lzo1x_decompress(src_first, src_len, dst_first, dst_len, NULL);
26058 + if (result != LZO_E_OK)
26059 + warning("edward-853", "lzo1x_1_decompress failed\n");
26060 + return;
26061 +}
26062 +
26063 +compression_plugin compression_plugins[LAST_COMPRESSION_ID] = {
26064 + [LZO1_COMPRESSION_ID] = {
26065 + .h = {
26066 + .type_id = REISER4_COMPRESSION_PLUGIN_TYPE,
26067 + .id = LZO1_COMPRESSION_ID,
26068 + .pops = &compression_plugin_ops,
26069 + .label = "lzo1",
26070 + .desc = "lzo1 compression transform",
26071 + .linkage = {NULL, NULL}
26072 + },
26073 + .init = lzo1_init,
26074 + .overrun = lzo1_overrun,
26075 + .alloc = lzo1_alloc,
26076 + .free = lzo1_free,
26077 + .min_size_deflate = lzo1_min_size_deflate,
26078 + .checksum = reiser4_adler32,
26079 + .compress = lzo1_compress,
26080 + .decompress = lzo1_decompress
26081 + },
26082 + [GZIP1_COMPRESSION_ID] = {
26083 + .h = {
26084 + .type_id = REISER4_COMPRESSION_PLUGIN_TYPE,
26085 + .id = GZIP1_COMPRESSION_ID,
26086 + .pops = &compression_plugin_ops,
26087 + .label = "gzip1",
26088 + .desc = "gzip1 compression transform",
26089 + .linkage = {NULL, NULL}
26090 + },
26091 + .init = gzip1_init,
26092 + .overrun = gzip1_overrun,
26093 + .alloc = gzip1_alloc,
26094 + .free = gzip1_free,
26095 + .min_size_deflate = gzip1_min_size_deflate,
26096 + .checksum = NULL,
26097 + .compress = gzip1_compress,
26098 + .decompress = gzip1_decompress
26099 + }
26100 +};
26101 +
26102 +/*
26103 + Local variables:
26104 + c-indentation-style: "K&R"
26105 + mode-name: "LC"
26106 + c-basic-offset: 8
26107 + tab-width: 8
26108 + fill-column: 120
26109 + scroll-step: 1
26110 + End:
26111 +*/
26112 Index: linux-2.6.16/fs/reiser4/plugin/compress/compress.h
26113 ===================================================================
26114 --- /dev/null
26115 +++ linux-2.6.16/fs/reiser4/plugin/compress/compress.h
26116 @@ -0,0 +1,38 @@
26117 +#if !defined( __FS_REISER4_COMPRESS_H__ )
26118 +#define __FS_REISER4_COMPRESS_H__
26119 +
26120 +#include <linux/types.h>
26121 +#include <linux/string.h>
26122 +
26123 +typedef enum {
26124 + TFM_READ_ACT,
26125 + TFM_WRITE_ACT,
26126 + TFM_LAST_ACT
26127 +} tfm_action;
26128 +
26129 +/* builtin compression plugins */
26130 +
26131 +typedef enum {
26132 + LZO1_COMPRESSION_ID,
26133 + GZIP1_COMPRESSION_ID,
26134 + LAST_COMPRESSION_ID,
26135 +} reiser4_compression_id;
26136 +
26137 +typedef unsigned long cloff_t;
26138 +typedef void *coa_t;
26139 +typedef coa_t coa_set[LAST_COMPRESSION_ID][TFM_LAST_ACT];
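A coa_set is a two-dimensional table of workspace pointers, indexed first by compression plugin id and then by transform action. A hedged sketch of filling one row of such a table; it assumes the (static) lzo1_alloc() from compress.c were visible here, and setup_coa_set is a hypothetical name:

	/* illustrative only: populate the lzo1 row of a coa_set */
	static int setup_coa_set(coa_set set)
	{
		coa_t coa = lzo1_alloc(TFM_WRITE_ACT);

		if (IS_ERR(coa))
			return PTR_ERR(coa);
		set[LZO1_COMPRESSION_ID][TFM_WRITE_ACT] = coa;
		/* lzo1 decompression needs no workspace */
		set[LZO1_COMPRESSION_ID][TFM_READ_ACT] = NULL;
		return 0;
	}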
26140 +
26141 +__u32 reiser4_adler32(char *data, __u32 len);
26142 +
26143 +#endif /* __FS_REISER4_COMPRESS_H__ */
26144 +
26145 +/* Make Linus happy.
26146 + Local variables:
26147 + c-indentation-style: "K&R"
26148 + mode-name: "LC"
26149 + c-basic-offset: 8
26150 + tab-width: 8
26151 + fill-column: 120
26152 + scroll-step: 1
26153 + End:
26154 +*/
26155 Index: linux-2.6.16/fs/reiser4/plugin/compress/compress_mode.c
26156 ===================================================================
26157 --- /dev/null
26158 +++ linux-2.6.16/fs/reiser4/plugin/compress/compress_mode.c
26159 @@ -0,0 +1,163 @@
26160 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
26161 +/* This file contains Reiser4 compression mode plugins.
26162 +
26163 +   A compression mode plugin is a set of handlers called by the
26164 +   compressor at flush time. They implement heuristics, including
26165 +   ones that avoid compressing incompressible data; see
26166 + http://www.namesys.com/cryptcompress_design.html for more details.
26167 +*/
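As a rough illustration of the convention described above, here is a hedged sketch (not the actual cryptcompress code; maybe_deflate is a hypothetical name) of how a compressor could consult a mode plugin for one logical cluster:

	static int maybe_deflate(compression_mode_plugin *mplug,
				 struct inode *inode, cloff_t index)
	{
		/* a NULL hook means "always compress" (see "force" below) */
		if (mplug->should_deflate &&
		    !mplug->should_deflate(inode, index))
			return 0;	/* skip this cluster */
		return 1;		/* compress; the caller later fires
					   accept_hook or discard_hook
					   depending on the result */
	}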
26168 +#include "../../inode.h"
26169 +#include "../plugin.h"
26170 +
26171 +static int should_deflate_test(struct inode * inode, cloff_t index)
26172 +{
26173 + return !test_bit(0, &index);
26174 +}
26175 +
26176 +static int should_deflate_none(struct inode * inode, cloff_t index)
26177 +{
26178 + return 0;
26179 +}
26180 +
26181 +static int should_deflate_common(struct inode * inode, cloff_t index)
26182 +{
26183 + return compression_is_on(cryptcompress_inode_data(inode));
26184 +}
26185 +
26186 +static int turn_off_compression(struct inode *inode, cloff_t index)
26187 +{
26188 + toggle_compression(cryptcompress_inode_data(inode), 0);
26189 + return 0;
26190 +}
26191 +
26192 +static int turn_on_compression(struct inode *inode, cloff_t index)
26193 +{
26194 + toggle_compression(cryptcompress_inode_data(inode), 1);
26195 + return 0;
26196 +}
26197 +
26198 +static int turn_off_compression_on_zero(struct inode *inode, cloff_t index)
26199 +{
26200 + assert("edward-1308", inode != NULL);
26201 + if (index == 0)
26202 + toggle_compression(cryptcompress_inode_data(inode), 0);
26203 + return 0;
26204 +}
26205 +
26206 +/* Check on lattice (COL) of some sparseness factor:
26207 +   a family of adaptive compression modes that define
26208 +   the following behavior:
26209 +
26210 +   Compression is on: try to compress everything and turn
26211 +   it off whenever a cluster proves incompressible.
26212 +
26213 +   Compression is off: try to compress clusters with indexes
26214 +   k * FACTOR (k = 0, 1, 2, ...) and turn it back on if any of
26215 +   them is compressible. */
26216 +
26217 +/* check if @index belongs to the one-dimensional lattice
26218 +   of sparseness factor @factor */
26219 +static int check_on_lattice(cloff_t index, int factor)
26220 +{
26221 +	return (factor ? index % factor == 0 : index == 0);
26222 +}
26223 +
26224 +#define DEFINE_CHECK_ON_LATTICE(FACTOR) \
26225 + static int check_on_lattice_ ## FACTOR (struct inode * inode, \
26226 + cloff_t index) \
26227 +{ \
26228 + return should_deflate_common(inode, index) || \
26229 + check_on_lattice(index, FACTOR); \
26230 +}
26231 +
26232 +#define SUPPORT_COL_COMPRESSION_MODE(FACTOR, LABEL) \
26233 +[COL_ ## FACTOR ## _COMPRESSION_MODE_ID] = { \
26234 + .h = { \
26235 + .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE, \
26236 + .id = COL_ ## FACTOR ## _COMPRESSION_MODE_ID, \
26237 + .pops = NULL, \
26238 + .label = LABEL, \
26239 + .desc = LABEL, \
26240 + .linkage = {NULL, NULL} \
26241 + }, \
26242 + .should_deflate = check_on_lattice_ ## FACTOR, \
26243 + .accept_hook = turn_on_compression, \
26244 + .discard_hook = turn_off_compression \
26245 +}
26246 +
26247 +DEFINE_CHECK_ON_LATTICE(8)
26248 +DEFINE_CHECK_ON_LATTICE(16)
26249 +DEFINE_CHECK_ON_LATTICE(32)
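To make the lattice behavior concrete: with compression switched off and FACTOR = 8, only every eighth cluster is test-compressed. A standalone, user-space sketch of the same check (illustrative only):

	#include <stdio.h>

	static int check_on_lattice(unsigned long index, int factor)
	{
		return (factor ? index % factor == 0 : index == 0);
	}

	int main(void)
	{
		unsigned long i;

		for (i = 0; i < 32; i++)
			if (check_on_lattice(i, 8))
				printf("probe cluster %lu\n", i);
		/* prints clusters 0, 8, 16 and 24 */
		return 0;
	}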
26250 +
26251 +/* compression mode_plugins */
26252 +compression_mode_plugin compression_mode_plugins[LAST_COMPRESSION_MODE_ID] = {
26253 + [NONE_COMPRESSION_MODE_ID] = {
26254 + .h = {
26255 + .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
26256 + .id = NONE_COMPRESSION_MODE_ID,
26257 + .pops = NULL,
26258 + .label = "none",
26259 + .desc = "Don't compress",
26260 + .linkage = {NULL, NULL}
26261 + },
26262 + .should_deflate = should_deflate_none,
26263 + .accept_hook = NULL,
26264 + .discard_hook = NULL
26265 + },
26266 + /* Check-on-lattice adaptive compression modes */
26267 + SUPPORT_COL_COMPRESSION_MODE(8, "col8"),
26268 + SUPPORT_COL_COMPRESSION_MODE(16, "col16"),
26269 + SUPPORT_COL_COMPRESSION_MODE(32, "col32"),
26270 + /* Turn off compression if logical cluster of index == 0
26271 + is incompressible, then don't check anymore */
26272 + [COZ_COMPRESSION_MODE_ID] = {
26273 + .h = {
26274 + .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
26275 + .id = COZ_COMPRESSION_MODE_ID,
26276 + .pops = NULL,
26277 + .label = "coz",
26278 + .desc = "Check on zero",
26279 + .linkage = {NULL, NULL}
26280 + },
26281 + .should_deflate = should_deflate_common,
26282 + .accept_hook = NULL,
26283 + .discard_hook = turn_off_compression_on_zero
26284 + },
26285 + [FORCE_COMPRESSION_MODE_ID] = {
26286 + .h = {
26287 + .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
26288 + .id = FORCE_COMPRESSION_MODE_ID,
26289 + .pops = NULL,
26290 + .label = "force",
26291 + .desc = "Compress everything",
26292 + .linkage = {NULL, NULL}
26293 + },
26294 + .should_deflate = NULL,
26295 + .accept_hook = NULL,
26296 + .discard_hook = NULL
26297 + },
26298 + [TEST_COMPRESSION_MODE_ID] = {
26299 + .h = {
26300 + .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
26301 + .id = TEST_COMPRESSION_MODE_ID,
26302 + .pops = NULL,
26303 + .label = "test", /* This mode is for benchmarks only */
26304 + .desc = "Don't compress odd clusters",
26305 + .linkage = {NULL, NULL}
26306 + },
26307 + .should_deflate = should_deflate_test,
26308 + .accept_hook = NULL,
26309 + .discard_hook = NULL
26310 + }
26311 +};
26312 +
26313 +/*
26314 + Local variables:
26315 + c-indentation-style: "K&R"
26316 + mode-name: "LC"
26317 + c-basic-offset: 8
26318 + tab-width: 8
26319 + fill-column: 120
26320 + scroll-step: 1
26321 + End:
26322 +*/
26323 Index: linux-2.6.16/fs/reiser4/plugin/compress/lzoconf.h
26324 ===================================================================
26325 --- /dev/null
26326 +++ linux-2.6.16/fs/reiser4/plugin/compress/lzoconf.h
26327 @@ -0,0 +1,420 @@
26328 +/* lzoconf.h -- configuration for the LZO real-time data compression library
26329 +   adapted for the reiser4 compression transform plugin.
26330 +
26331 + This file is part of the LZO real-time data compression library
26332 + and not included in any proprietary licenses of reiser4.
26333 +
26334 + Copyright (C) 2002 Markus Franz Xaver Johannes Oberhumer
26335 + Copyright (C) 2001 Markus Franz Xaver Johannes Oberhumer
26336 + Copyright (C) 2000 Markus Franz Xaver Johannes Oberhumer
26337 + Copyright (C) 1999 Markus Franz Xaver Johannes Oberhumer
26338 + Copyright (C) 1998 Markus Franz Xaver Johannes Oberhumer
26339 + Copyright (C) 1997 Markus Franz Xaver Johannes Oberhumer
26340 + Copyright (C) 1996 Markus Franz Xaver Johannes Oberhumer
26341 + All Rights Reserved.
26342 +
26343 + The LZO library is free software; you can redistribute it and/or
26344 + modify it under the terms of the GNU General Public License as
26345 + published by the Free Software Foundation; either version 2 of
26346 + the License, or (at your option) any later version.
26347 +
26348 + The LZO library is distributed in the hope that it will be useful,
26349 + but WITHOUT ANY WARRANTY; without even the implied warranty of
26350 + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26351 + GNU General Public License for more details.
26352 +
26353 + You should have received a copy of the GNU General Public License
26354 + along with the LZO library; see the file COPYING.
26355 + If not, write to the Free Software Foundation, Inc.,
26356 + 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
26357 +
26358 + Markus F.X.J. Oberhumer
26359 + <markus@oberhumer.com>
26360 + http://www.oberhumer.com/opensource/lzo/
26361 + */
26362 +
26363 +#include <linux/kernel.h> /* for UINT_MAX, ULONG_MAX - edward */
26364 +
26365 +#ifndef __LZOCONF_H
26366 +#define __LZOCONF_H
26367 +
26368 +#define LZO_VERSION 0x1080
26369 +#define LZO_VERSION_STRING "1.08"
26370 +#define LZO_VERSION_DATE "Jul 12 2002"
26371 +
26372 +/* internal Autoconf configuration file - only used when building LZO */
26373 +#if defined(LZO_HAVE_CONFIG_H)
26374 +# include <config.h>
26375 +#endif
26376 +#ifdef __cplusplus
26377 +extern "C" {
26378 +#endif
26379 +
26380 +/***********************************************************************
26381 +// LZO requires a conforming <limits.h>
26382 +************************************************************************/
26383 +
26384 +#define CHAR_BIT 8
26385 +#define USHRT_MAX 0xffff
26386 +
26387 +/* workaround a cpp bug under hpux 10.20 */
26388 +#define LZO_0xffffffffL 4294967295ul
26389 +
26390 +/***********************************************************************
26391 +// architecture defines
26392 +************************************************************************/
26393 +
26394 +#if !defined(__LZO_WIN) && !defined(__LZO_DOS) && !defined(__LZO_OS2)
26395 +# if defined(__WINDOWS__) || defined(_WINDOWS) || defined(_Windows)
26396 +# define __LZO_WIN
26397 +# elif defined(__WIN32__) || defined(_WIN32) || defined(WIN32)
26398 +# define __LZO_WIN
26399 +# elif defined(__NT__) || defined(__NT_DLL__) || defined(__WINDOWS_386__)
26400 +# define __LZO_WIN
26401 +# elif defined(__DOS__) || defined(__MSDOS__) || defined(MSDOS)
26402 +# define __LZO_DOS
26403 +# elif defined(__OS2__) || defined(__OS2V2__) || defined(OS2)
26404 +# define __LZO_OS2
26405 +# elif defined(__palmos__)
26406 +# define __LZO_PALMOS
26407 +# elif defined(__TOS__) || defined(__atarist__)
26408 +# define __LZO_TOS
26409 +# endif
26410 +#endif
26411 +
26412 +#if (UINT_MAX < LZO_0xffffffffL)
26413 +# if defined(__LZO_WIN)
26414 +# define __LZO_WIN16
26415 +# elif defined(__LZO_DOS)
26416 +# define __LZO_DOS16
26417 +# elif defined(__LZO_PALMOS)
26418 +# define __LZO_PALMOS16
26419 +# elif defined(__LZO_TOS)
26420 +# define __LZO_TOS16
26421 +# elif defined(__C166__)
26422 +# else
26423 + /* porting hint: for pure 16-bit architectures try compiling
26424 + * everything with -D__LZO_STRICT_16BIT */
26425 +# error "16-bit target not supported - contact me for porting hints"
26426 +# endif
26427 +#endif
26428 +
26429 +#if !defined(__LZO_i386)
26430 +# if defined(__LZO_DOS) || defined(__LZO_WIN16)
26431 +# define __LZO_i386
26432 +# elif defined(__i386__) || defined(__386__) || defined(_M_IX86)
26433 +# define __LZO_i386
26434 +# endif
26435 +#endif
26436 +
26437 +#if defined(__LZO_STRICT_16BIT)
26438 +# if (UINT_MAX < LZO_0xffffffffL)
26439 +# include <lzo16bit.h>
26440 +# endif
26441 +#endif
26442 +
26443 +/* memory checkers */
26444 +#if !defined(__LZO_CHECKER)
26445 +# if defined(__BOUNDS_CHECKING_ON)
26446 +# define __LZO_CHECKER
26447 +# elif defined(__CHECKER__)
26448 +# define __LZO_CHECKER
26449 +# elif defined(__INSURE__)
26450 +# define __LZO_CHECKER
26451 +# elif defined(__PURIFY__)
26452 +# define __LZO_CHECKER
26453 +# endif
26454 +#endif
26455 +
26456 +/***********************************************************************
26457 +// integral and pointer types
26458 +************************************************************************/
26459 +
26460 +/* Integral types with 32 bits or more */
26461 +#if !defined(LZO_UINT32_MAX)
26462 +# if (UINT_MAX >= LZO_0xffffffffL)
26463 + typedef unsigned int lzo_uint32;
26464 + typedef int lzo_int32;
26465 +# define LZO_UINT32_MAX UINT_MAX
26466 +# define LZO_INT32_MAX INT_MAX
26467 +# define LZO_INT32_MIN INT_MIN
26468 +# elif (ULONG_MAX >= LZO_0xffffffffL)
26469 + typedef unsigned long lzo_uint32;
26470 + typedef long lzo_int32;
26471 +# define LZO_UINT32_MAX ULONG_MAX
26472 +# define LZO_INT32_MAX LONG_MAX
26473 +# define LZO_INT32_MIN LONG_MIN
26474 +# else
26475 +# error "lzo_uint32"
26476 +# endif
26477 +#endif
26478 +
26479 +/* lzo_uint is used like size_t */
26480 +#if !defined(LZO_UINT_MAX)
26481 +# if (UINT_MAX >= LZO_0xffffffffL)
26482 + typedef unsigned int lzo_uint;
26483 + typedef int lzo_int;
26484 +# define LZO_UINT_MAX UINT_MAX
26485 +# define LZO_INT_MAX INT_MAX
26486 +# define LZO_INT_MIN INT_MIN
26487 +# elif (ULONG_MAX >= LZO_0xffffffffL)
26488 + typedef unsigned long lzo_uint;
26489 + typedef long lzo_int;
26490 +# define LZO_UINT_MAX ULONG_MAX
26491 +# define LZO_INT_MAX LONG_MAX
26492 +# define LZO_INT_MIN LONG_MIN
26493 +# else
26494 +# error "lzo_uint"
26495 +# endif
26496 +#endif
26497 +
26498 + typedef int lzo_bool;
26499 +
26500 +/***********************************************************************
26501 +// memory models
26502 +************************************************************************/
26503 +
26504 +/* Memory model for the public code segment. */
26505 +#if !defined(__LZO_CMODEL)
26506 +# if defined(__LZO_DOS16) || defined(__LZO_WIN16)
26507 +# define __LZO_CMODEL __far
26508 +# elif defined(__LZO_i386) && defined(__WATCOMC__)
26509 +# define __LZO_CMODEL __near
26510 +# else
26511 +# define __LZO_CMODEL
26512 +# endif
26513 +#endif
26514 +
26515 +/* Memory model for the public data segment. */
26516 +#if !defined(__LZO_DMODEL)
26517 +# if defined(__LZO_DOS16) || defined(__LZO_WIN16)
26518 +# define __LZO_DMODEL __far
26519 +# elif defined(__LZO_i386) && defined(__WATCOMC__)
26520 +# define __LZO_DMODEL __near
26521 +# else
26522 +# define __LZO_DMODEL
26523 +# endif
26524 +#endif
26525 +
26526 +/* Memory model that allows accessing memory at offsets of lzo_uint. */
26527 +#if !defined(__LZO_MMODEL)
26528 +# if (LZO_UINT_MAX <= UINT_MAX)
26529 +# define __LZO_MMODEL
26530 +# elif defined(__LZO_DOS16) || defined(__LZO_WIN16)
26531 +# define __LZO_MMODEL __huge
26532 +# define LZO_999_UNSUPPORTED
26533 +# elif defined(__LZO_PALMOS16) || defined(__LZO_TOS16)
26534 +# define __LZO_MMODEL
26535 +# else
26536 +# error "__LZO_MMODEL"
26537 +# endif
26538 +#endif
26539 +
26540 +/* no typedef here because of const-pointer issues */
26541 +#define lzo_byte unsigned char __LZO_MMODEL
26542 +#define lzo_bytep unsigned char __LZO_MMODEL *
26543 +#define lzo_charp char __LZO_MMODEL *
26544 +#define lzo_voidp void __LZO_MMODEL *
26545 +#define lzo_shortp short __LZO_MMODEL *
26546 +#define lzo_ushortp unsigned short __LZO_MMODEL *
26547 +#define lzo_uint32p lzo_uint32 __LZO_MMODEL *
26548 +#define lzo_int32p lzo_int32 __LZO_MMODEL *
26549 +#define lzo_uintp lzo_uint __LZO_MMODEL *
26550 +#define lzo_intp lzo_int __LZO_MMODEL *
26551 +#define lzo_voidpp lzo_voidp __LZO_MMODEL *
26552 +#define lzo_bytepp lzo_bytep __LZO_MMODEL *
26553 +
26554 +#ifndef lzo_sizeof_dict_t
26555 +# define lzo_sizeof_dict_t sizeof(lzo_bytep)
26556 +#endif
26557 +
26558 +/***********************************************************************
26559 +// calling conventions and function types
26560 +************************************************************************/
26561 +
26562 +/* linkage */
26563 +#if !defined(__LZO_EXTERN_C)
26564 +# ifdef __cplusplus
26565 +# define __LZO_EXTERN_C extern "C"
26566 +# else
26567 +# define __LZO_EXTERN_C extern
26568 +# endif
26569 +#endif
26570 +
26571 +/* calling convention */
26572 +#if !defined(__LZO_CDECL)
26573 +# if defined(__LZO_DOS16) || defined(__LZO_WIN16)
26574 +# define __LZO_CDECL __LZO_CMODEL __cdecl
26575 +# elif defined(__LZO_i386) && defined(_MSC_VER)
26576 +# define __LZO_CDECL __LZO_CMODEL __cdecl
26577 +# elif defined(__LZO_i386) && defined(__WATCOMC__)
26578 +# define __LZO_CDECL __LZO_CMODEL __cdecl
26579 +# else
26580 +# define __LZO_CDECL __LZO_CMODEL
26581 +# endif
26582 +#endif
26583 +#if !defined(__LZO_ENTRY)
26584 +# define __LZO_ENTRY __LZO_CDECL
26585 +#endif
26586 +
26587 +/* C++ exception specification for extern "C" function types */
26588 +#if !defined(__cplusplus)
26589 +# undef LZO_NOTHROW
26590 +# define LZO_NOTHROW
26591 +#elif !defined(LZO_NOTHROW)
26592 +# define LZO_NOTHROW
26593 +#endif
26594 +
26595 + typedef int
26596 + (__LZO_ENTRY * lzo_compress_t) (const lzo_byte * src, lzo_uint src_len,
26597 + lzo_byte * dst, lzo_uintp dst_len,
26598 + lzo_voidp wrkmem);
26599 +
26600 + typedef int
26601 + (__LZO_ENTRY * lzo_decompress_t) (const lzo_byte * src,
26602 + lzo_uint src_len, lzo_byte * dst,
26603 + lzo_uintp dst_len, lzo_voidp wrkmem);
26604 +
26605 + typedef int
26606 + (__LZO_ENTRY * lzo_optimize_t) (lzo_byte * src, lzo_uint src_len,
26607 + lzo_byte * dst, lzo_uintp dst_len,
26608 + lzo_voidp wrkmem);
26609 +
26610 + typedef int
26611 + (__LZO_ENTRY * lzo_compress_dict_t) (const lzo_byte * src,
26612 + lzo_uint src_len, lzo_byte * dst,
26613 + lzo_uintp dst_len,
26614 + lzo_voidp wrkmem,
26615 + const lzo_byte * dict,
26616 + lzo_uint dict_len);
26617 +
26618 + typedef int
26619 + (__LZO_ENTRY * lzo_decompress_dict_t) (const lzo_byte * src,
26620 + lzo_uint src_len,
26621 + lzo_byte * dst,
26622 + lzo_uintp dst_len,
26623 + lzo_voidp wrkmem,
26624 + const lzo_byte * dict,
26625 + lzo_uint dict_len);
26626 +
26627 +/* assembler versions always use __cdecl */
26628 + typedef int
26629 + (__LZO_CDECL * lzo_compress_asm_t) (const lzo_byte * src,
26630 + lzo_uint src_len, lzo_byte * dst,
26631 + lzo_uintp dst_len,
26632 + lzo_voidp wrkmem);
26633 +
26634 + typedef int
26635 + (__LZO_CDECL * lzo_decompress_asm_t) (const lzo_byte * src,
26636 + lzo_uint src_len, lzo_byte * dst,
26637 + lzo_uintp dst_len,
26638 + lzo_voidp wrkmem);
26639 +
26640 +/* a progress indicator callback function */
26641 + typedef void (__LZO_ENTRY * lzo_progress_callback_t) (lzo_uint,
26642 + lzo_uint);
26643 +
26644 +/***********************************************************************
26645 +// export information
26646 +************************************************************************/
26647 +
26648 +/* DLL export information */
26649 +#if !defined(__LZO_EXPORT1)
26650 +# define __LZO_EXPORT1
26651 +#endif
26652 +#if !defined(__LZO_EXPORT2)
26653 +# define __LZO_EXPORT2
26654 +#endif
26655 +
26656 +/* exported calling convention for C functions */
26657 +#if !defined(LZO_PUBLIC)
26658 +# define LZO_PUBLIC(_rettype) \
26659 + __LZO_EXPORT1 _rettype __LZO_EXPORT2 __LZO_ENTRY
26660 +#endif
26661 +#if !defined(LZO_EXTERN)
26662 +# define LZO_EXTERN(_rettype) __LZO_EXTERN_C LZO_PUBLIC(_rettype)
26663 +#endif
26664 +#if !defined(LZO_PRIVATE)
26665 +# define LZO_PRIVATE(_rettype) static _rettype __LZO_ENTRY
26666 +#endif
26667 +
26668 +/* exported __cdecl calling convention for assembler functions */
26669 +#if !defined(LZO_PUBLIC_CDECL)
26670 +# define LZO_PUBLIC_CDECL(_rettype) \
26671 + __LZO_EXPORT1 _rettype __LZO_EXPORT2 __LZO_CDECL
26672 +#endif
26673 +#if !defined(LZO_EXTERN_CDECL)
26674 +# define LZO_EXTERN_CDECL(_rettype) __LZO_EXTERN_C LZO_PUBLIC_CDECL(_rettype)
26675 +#endif
26676 +
26677 +/* exported global variables (LZO currently uses no static variables and
26678 + * is fully thread safe) */
26679 +#if !defined(LZO_PUBLIC_VAR)
26680 +# define LZO_PUBLIC_VAR(_type) \
26681 + __LZO_EXPORT1 _type __LZO_EXPORT2 __LZO_DMODEL
26682 +#endif
26683 +#if !defined(LZO_EXTERN_VAR)
26684 +# define LZO_EXTERN_VAR(_type) extern LZO_PUBLIC_VAR(_type)
26685 +#endif
26686 +
26687 +/***********************************************************************
26688 +// error codes and prototypes
26689 +************************************************************************/
26690 +
26691 +/* Error codes for the compression/decompression functions. Negative
26692 + * values are errors, positive values will be used for special but
26693 + * normal events.
26694 + */
26695 +#define LZO_E_OK 0
26696 +#define LZO_E_ERROR (-1)
26697 +#define LZO_E_OUT_OF_MEMORY (-2) /* not used right now */
26698 +#define LZO_E_NOT_COMPRESSIBLE (-3) /* not used right now */
26699 +#define LZO_E_INPUT_OVERRUN (-4)
26700 +#define LZO_E_OUTPUT_OVERRUN (-5)
26701 +#define LZO_E_LOOKBEHIND_OVERRUN (-6)
26702 +#define LZO_E_EOF_NOT_FOUND (-7)
26703 +#define LZO_E_INPUT_NOT_CONSUMED (-8)
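A hedged sketch of checking these codes after decompression, in the spirit of lzo1_decompress() in compress.c earlier in this patch (try_decompress is a hypothetical name; lzo1x_decompress() is declared in minilzo.h):

	static int try_decompress(const lzo_byte *src, lzo_uint src_len,
				  lzo_byte *dst, lzo_uintp dst_len)
	{
		int ret = lzo1x_decompress(src, src_len, dst, dst_len, NULL);

		if (ret != LZO_E_OK)	/* e.g. LZO_E_INPUT_OVERRUN on
					   corrupt input */
			return -1;
		return 0;
	}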
26704 +
26705 +/* lzo_init() should be the first function you call.
26706 + * Check the return code!
26707 + *
26708 + * lzo_init() is a macro to allow checking that the library and the
26709 + * compiler's view of various types are consistent.
26710 + */
26711 +#define lzo_init() __lzo_init2(LZO_VERSION,(int)sizeof(short),(int)sizeof(int),\
26712 + (int)sizeof(long),(int)sizeof(lzo_uint32),(int)sizeof(lzo_uint),\
26713 + (int)lzo_sizeof_dict_t,(int)sizeof(char *),(int)sizeof(lzo_voidp),\
26714 + (int)sizeof(lzo_compress_t))
26715 + LZO_EXTERN(int) __lzo_init2(unsigned, int, int, int, int, int, int,
26716 + int, int, int);
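This is the same protocol the lzo1 plugin follows via lzo1_init() in compress.c earlier in this patch. A minimal sketch (lzo_setup_sketch is a hypothetical name):

	static int lzo_setup_sketch(void)
	{
		int ret = lzo_init();	/* verifies type sizes at runtime */

		if (ret != LZO_E_OK)
			return ret;	/* library/compiler mismatch */
		return 0;
	}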
26717 +
26718 +/* checksum functions */
26719 + LZO_EXTERN(lzo_uint32)
26720 + lzo_crc32(lzo_uint32 _c, const lzo_byte * _buf, lzo_uint _len);
26721 +
26722 +/* misc. */
26723 + typedef union {
26724 + lzo_bytep p;
26725 + lzo_uint u;
26726 + } __lzo_pu_u;
26727 + typedef union {
26728 + lzo_bytep p;
26729 + lzo_uint32 u32;
26730 + } __lzo_pu32_u;
26731 + typedef union {
26732 + void *vp;
26733 + lzo_bytep bp;
26734 + lzo_uint32 u32;
26735 + long l;
26736 + } lzo_align_t;
26737 +
26738 +#define LZO_PTR_ALIGN_UP(_ptr,_size) \
26739 + ((_ptr) + (lzo_uint) __lzo_align_gap((const lzo_voidp)(_ptr),(lzo_uint)(_size)))
26740 +
26741 +/* deprecated - only for backward compatibility */
26742 +#define LZO_ALIGN(_ptr,_size) LZO_PTR_ALIGN_UP(_ptr,_size)
26743 +
26744 +#ifdef __cplusplus
26745 +} /* extern "C" */
26746 +#endif
26747 +#endif /* already included */
26748 Index: linux-2.6.16/fs/reiser4/plugin/compress/minilzo.c
26749 ===================================================================
26750 --- /dev/null
26751 +++ linux-2.6.16/fs/reiser4/plugin/compress/minilzo.c
26752 @@ -0,0 +1,2155 @@
26753 +/* minilzo.c -- mini subset of the LZO real-time data compression library
26754 +   adapted for the reiser4 compression transform plugin.
26755 +
26756 + This file is part of the LZO real-time data compression library
26757 + and not included in any proprietary licenses of reiser4.
26758 +
26759 + Copyright (C) 2002 Markus Franz Xaver Johannes Oberhumer
26760 + Copyright (C) 2001 Markus Franz Xaver Johannes Oberhumer
26761 + Copyright (C) 2000 Markus Franz Xaver Johannes Oberhumer
26762 + Copyright (C) 1999 Markus Franz Xaver Johannes Oberhumer
26763 + Copyright (C) 1998 Markus Franz Xaver Johannes Oberhumer
26764 + Copyright (C) 1997 Markus Franz Xaver Johannes Oberhumer
26765 + Copyright (C) 1996 Markus Franz Xaver Johannes Oberhumer
26766 + All Rights Reserved.
26767 +
26768 + The LZO library is free software; you can redistribute it and/or
26769 + modify it under the terms of the GNU General Public License as
26770 + published by the Free Software Foundation; either version 2 of
26771 + the License, or (at your option) any later version.
26772 +
26773 + The LZO library is distributed in the hope that it will be useful,
26774 + but WITHOUT ANY WARRANTY; without even the implied warranty of
26775 + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26776 + GNU General Public License for more details.
26777 +
26778 + You should have received a copy of the GNU General Public License
26779 + along with the LZO library; see the file COPYING.
26780 + If not, write to the Free Software Foundation, Inc.,
26781 + 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
26782 +
26783 + Markus F.X.J. Oberhumer
26784 + <markus@oberhumer.com>
26785 + http://www.oberhumer.com/opensource/lzo/
26786 + */
26787 +
26788 +/*
26789 + * NOTE:
26790 + * the full LZO package can be found at
26791 + * http://www.oberhumer.com/opensource/lzo/
26792 + */
26793 +
26794 +#include "../../debug.h" /* for reiser4 assert macro -edward */
26795 +
26796 +#define __LZO_IN_MINILZO
26797 +#define LZO_BUILD
26798 +
26799 +#ifdef MINILZO_HAVE_CONFIG_H
26800 +# include <config.h>
26801 +#endif
26802 +
26803 +#undef LZO_HAVE_CONFIG_H
26804 +#include "minilzo.h"
26805 +
26806 +#if !defined(MINILZO_VERSION) || (MINILZO_VERSION != 0x1080)
26807 +# error "version mismatch in miniLZO source files"
26808 +#endif
26809 +
26810 +#ifdef MINILZO_HAVE_CONFIG_H
26811 +# define LZO_HAVE_CONFIG_H
26812 +#endif
26813 +
26814 +
26815 +#ifndef __LZO_CONF_H
26816 +#define __LZO_CONF_H
26817 +
26818 +#if !defined(__LZO_IN_MINILZO)
26819 +# ifndef __LZOCONF_H
26820 +# include <lzoconf.h>
26821 +# endif
26822 +#endif
26823 +
26824 +#if defined(__BOUNDS_CHECKING_ON)
26825 +# include <unchecked.h>
26826 +#else
26827 +# define BOUNDS_CHECKING_OFF_DURING(stmt) stmt
26828 +# define BOUNDS_CHECKING_OFF_IN_EXPR(expr) (expr)
26829 +#endif
26830 +
26831 +# define HAVE_MEMCMP
26832 +# define HAVE_MEMCPY
26833 +# define HAVE_MEMMOVE
26834 +# define HAVE_MEMSET
26835 +
26836 +#if defined(__LZO_DOS16) || defined(__LZO_WIN16)
26837 +# define HAVE_MALLOC_H
26838 +# define HAVE_HALLOC
26839 +#endif
26840 +
26841 +#undef NDEBUG
26842 +#if !defined(LZO_DEBUG)
26843 +# define NDEBUG
26844 +#endif
26845 +#if defined(LZO_DEBUG) || !defined(NDEBUG)
26846 +# if !defined(NO_STDIO_H)
26847 +# include <stdio.h>
26848 +# endif
26849 +#endif
26850 +# if 0 /* edward */
26851 +#include <assert.h>
26852 +#endif /* edward */
26853 +
26854 +#if !defined(LZO_COMPILE_TIME_ASSERT)
26855 +# define LZO_COMPILE_TIME_ASSERT(expr) \
26856 + { typedef int __lzo_compile_time_assert_fail[1 - 2 * !(expr)]; }
26857 +#endif
26858 +
26859 +#if !defined(LZO_UNUSED)
26860 +# if 1
26861 +# define LZO_UNUSED(var) ((void)&var)
26862 +# elif 0
26863 +# define LZO_UNUSED(var) { typedef int __lzo_unused[sizeof(var) ? 2 : 1]; }
26864 +# else
26865 +# define LZO_UNUSED(parm) (parm = parm)
26866 +# endif
26867 +#endif
26868 +
26869 +#if !defined(__inline__) && !defined(__GNUC__)
26870 +# if defined(__cplusplus)
26871 +# define __inline__ inline
26872 +# else
26873 +# define __inline__
26874 +# endif
26875 +#endif
26876 +
26877 +#if defined(NO_MEMCMP)
26878 +# undef HAVE_MEMCMP
26879 +#endif
26880 +
26881 +#if !defined(HAVE_MEMSET)
26882 +# undef memset
26883 +# define memset lzo_memset
26884 +#endif
26885 +
26886 +# define LZO_BYTE(x) ((unsigned char) ((x) & 0xff))
26887 +
26888 +#define LZO_MAX(a,b) ((a) >= (b) ? (a) : (b))
26889 +#define LZO_MIN(a,b) ((a) <= (b) ? (a) : (b))
26890 +#define LZO_MAX3(a,b,c) ((a) >= (b) ? LZO_MAX(a,c) : LZO_MAX(b,c))
26891 +#define LZO_MIN3(a,b,c) ((a) <= (b) ? LZO_MIN(a,c) : LZO_MIN(b,c))
26892 +
26893 +#define lzo_sizeof(type) ((lzo_uint) (sizeof(type)))
26894 +
26895 +#define LZO_HIGH(array) ((lzo_uint) (sizeof(array)/sizeof(*(array))))
26896 +
26897 +#define LZO_SIZE(bits) (1u << (bits))
26898 +#define LZO_MASK(bits) (LZO_SIZE(bits) - 1)
26899 +
26900 +#define LZO_LSIZE(bits) (1ul << (bits))
26901 +#define LZO_LMASK(bits) (LZO_LSIZE(bits) - 1)
26902 +
26903 +#define LZO_USIZE(bits) ((lzo_uint) 1 << (bits))
26904 +#define LZO_UMASK(bits) (LZO_USIZE(bits) - 1)
26905 +
26906 +#define LZO_STYPE_MAX(b) (((1l << (8*(b)-2)) - 1l) + (1l << (8*(b)-2)))
26907 +#define LZO_UTYPE_MAX(b) (((1ul << (8*(b)-1)) - 1ul) + (1ul << (8*(b)-1)))
26908 +
26909 +#if !defined(SIZEOF_UNSIGNED)
26910 +# if (UINT_MAX == 0xffff)
26911 +# define SIZEOF_UNSIGNED 2
26912 +# elif (UINT_MAX == LZO_0xffffffffL)
26913 +# define SIZEOF_UNSIGNED 4
26914 +# elif (UINT_MAX >= LZO_0xffffffffL)
26915 +# define SIZEOF_UNSIGNED 8
26916 +# else
26917 +# error "SIZEOF_UNSIGNED"
26918 +# endif
26919 +#endif
26920 +
26921 +#if !defined(SIZEOF_UNSIGNED_LONG)
26922 +# if (ULONG_MAX == LZO_0xffffffffL)
26923 +# define SIZEOF_UNSIGNED_LONG 4
26924 +# elif (ULONG_MAX >= LZO_0xffffffffL)
26925 +# define SIZEOF_UNSIGNED_LONG 8
26926 +# else
26927 +# error "SIZEOF_UNSIGNED_LONG"
26928 +# endif
26929 +#endif
26930 +
26931 +#if !defined(SIZEOF_SIZE_T)
26932 +# define SIZEOF_SIZE_T SIZEOF_UNSIGNED
26933 +#endif
26934 +#if !defined(SIZE_T_MAX)
26935 +# define SIZE_T_MAX LZO_UTYPE_MAX(SIZEOF_SIZE_T)
26936 +#endif
26937 +
26938 +#if 1 && defined(__LZO_i386) && (UINT_MAX == LZO_0xffffffffL)
26939 +# if !defined(LZO_UNALIGNED_OK_2) && (USHRT_MAX == 0xffff)
26940 +# define LZO_UNALIGNED_OK_2
26941 +# endif
26942 +# if !defined(LZO_UNALIGNED_OK_4) && (LZO_UINT32_MAX == LZO_0xffffffffL)
26943 +# define LZO_UNALIGNED_OK_4
26944 +# endif
26945 +#endif
26946 +
26947 +#if defined(LZO_UNALIGNED_OK_2) || defined(LZO_UNALIGNED_OK_4)
26948 +# if !defined(LZO_UNALIGNED_OK)
26949 +# define LZO_UNALIGNED_OK
26950 +# endif
26951 +#endif
26952 +
26953 +#if defined(__LZO_NO_UNALIGNED)
26954 +# undef LZO_UNALIGNED_OK
26955 +# undef LZO_UNALIGNED_OK_2
26956 +# undef LZO_UNALIGNED_OK_4
26957 +#endif
26958 +
26959 +#if defined(LZO_UNALIGNED_OK_2) && (USHRT_MAX != 0xffff)
26960 +# error "LZO_UNALIGNED_OK_2 must not be defined on this system"
26961 +#endif
26962 +#if defined(LZO_UNALIGNED_OK_4) && (LZO_UINT32_MAX != LZO_0xffffffffL)
26963 +# error "LZO_UNALIGNED_OK_4 must not be defined on this system"
26964 +#endif
26965 +
26966 +#if defined(__LZO_NO_ALIGNED)
26967 +# undef LZO_ALIGNED_OK_4
26968 +#endif
26969 +
26970 +#if defined(LZO_ALIGNED_OK_4) && (LZO_UINT32_MAX != LZO_0xffffffffL)
26971 +# error "LZO_ALIGNED_OK_4 must not be defined on this system"
26972 +#endif
26973 +
26974 +#define LZO_LITTLE_ENDIAN 1234
26975 +#define LZO_BIG_ENDIAN 4321
26976 +#define LZO_PDP_ENDIAN 3412
26977 +
26978 +#if !defined(LZO_BYTE_ORDER)
26979 +# if defined(MFX_BYTE_ORDER)
26980 +# define LZO_BYTE_ORDER MFX_BYTE_ORDER
26981 +# elif defined(__LZO_i386)
26982 +# define LZO_BYTE_ORDER LZO_LITTLE_ENDIAN
26983 +# elif defined(BYTE_ORDER)
26984 +# define LZO_BYTE_ORDER BYTE_ORDER
26985 +# elif defined(__BYTE_ORDER)
26986 +# define LZO_BYTE_ORDER __BYTE_ORDER
26987 +# endif
26988 +#endif
26989 +
26990 +#if defined(LZO_BYTE_ORDER)
26991 +# if (LZO_BYTE_ORDER != LZO_LITTLE_ENDIAN) && \
26992 + (LZO_BYTE_ORDER != LZO_BIG_ENDIAN)
26993 +# error "invalid LZO_BYTE_ORDER"
26994 +# endif
26995 +#endif
26996 +
26997 +#if defined(LZO_UNALIGNED_OK) && !defined(LZO_BYTE_ORDER)
26998 +# error "LZO_BYTE_ORDER is not defined"
26999 +#endif
27000 +
27001 +#define LZO_OPTIMIZE_GNUC_i386_IS_BUGGY
27002 +
27003 +#if defined(NDEBUG) && !defined(LZO_DEBUG) && !defined(__LZO_CHECKER)
27004 +# if defined(__GNUC__) && defined(__i386__)
27005 +# if !defined(LZO_OPTIMIZE_GNUC_i386_IS_BUGGY)
27006 +# define LZO_OPTIMIZE_GNUC_i386
27007 +# endif
27008 +# endif
27009 +#endif
27010 +
27011 +__LZO_EXTERN_C const lzo_uint32 _lzo_crc32_table[256];
27012 +
27013 +#define _LZO_STRINGIZE(x) #x
27014 +#define _LZO_MEXPAND(x) _LZO_STRINGIZE(x)
27015 +
27016 +#define _LZO_CONCAT2(a,b) a ## b
27017 +#define _LZO_CONCAT3(a,b,c) a ## b ## c
27018 +#define _LZO_CONCAT4(a,b,c,d) a ## b ## c ## d
27019 +#define _LZO_CONCAT5(a,b,c,d,e) a ## b ## c ## d ## e
27020 +
27021 +#define _LZO_ECONCAT2(a,b) _LZO_CONCAT2(a,b)
27022 +#define _LZO_ECONCAT3(a,b,c) _LZO_CONCAT3(a,b,c)
27023 +#define _LZO_ECONCAT4(a,b,c,d) _LZO_CONCAT4(a,b,c,d)
27024 +#define _LZO_ECONCAT5(a,b,c,d,e) _LZO_CONCAT5(a,b,c,d,e)
27025 +
27026 +#ifndef __LZO_PTR_H
27027 +#define __LZO_PTR_H
27028 +
27029 +#ifdef __cplusplus
27030 +extern "C" {
27031 +#endif
27032 +
27033 +#if defined(__LZO_DOS16) || defined(__LZO_WIN16)
27034 +# include <dos.h>
27035 +# if 1 && defined(__WATCOMC__)
27036 +# include <i86.h>
27037 + __LZO_EXTERN_C unsigned char _HShift;
27038 +# define __LZO_HShift _HShift
27039 +# elif 1 && defined(_MSC_VER)
27040 + __LZO_EXTERN_C unsigned short __near _AHSHIFT;
27041 +# define __LZO_HShift ((unsigned) &_AHSHIFT)
27042 +# elif defined(__LZO_WIN16)
27043 +# define __LZO_HShift 3
27044 +# else
27045 +# define __LZO_HShift 12
27046 +# endif
27047 +# if !defined(_FP_SEG) && defined(FP_SEG)
27048 +# define _FP_SEG FP_SEG
27049 +# endif
27050 +# if !defined(_FP_OFF) && defined(FP_OFF)
27051 +# define _FP_OFF FP_OFF
27052 +# endif
27053 +#endif
27054 +
27055 +#if !defined(lzo_ptrdiff_t)
27056 +# if (UINT_MAX >= LZO_0xffffffffL)
27057 + typedef ptrdiff_t lzo_ptrdiff_t;
27058 +# else
27059 + typedef long lzo_ptrdiff_t;
27060 +# endif
27061 +#endif
27062 +
27063 +#if !defined(__LZO_HAVE_PTR_T)
27064 +# if defined(lzo_ptr_t)
27065 +# define __LZO_HAVE_PTR_T
27066 +# endif
27067 +#endif
27068 +#if !defined(__LZO_HAVE_PTR_T)
27069 +# if defined(SIZEOF_CHAR_P) && defined(SIZEOF_UNSIGNED_LONG)
27070 +# if (SIZEOF_CHAR_P == SIZEOF_UNSIGNED_LONG)
27071 + typedef unsigned long lzo_ptr_t;
27072 + typedef long lzo_sptr_t;
27073 +# define __LZO_HAVE_PTR_T
27074 +# endif
27075 +# endif
27076 +#endif
27077 +#if !defined(__LZO_HAVE_PTR_T)
27078 +# if defined(SIZEOF_CHAR_P) && defined(SIZEOF_UNSIGNED)
27079 +# if (SIZEOF_CHAR_P == SIZEOF_UNSIGNED)
27080 + typedef unsigned int lzo_ptr_t;
27081 + typedef int lzo_sptr_t;
27082 +# define __LZO_HAVE_PTR_T
27083 +# endif
27084 +# endif
27085 +#endif
27086 +#if !defined(__LZO_HAVE_PTR_T)
27087 +# if defined(SIZEOF_CHAR_P) && defined(SIZEOF_UNSIGNED_SHORT)
27088 +# if (SIZEOF_CHAR_P == SIZEOF_UNSIGNED_SHORT)
27089 + typedef unsigned short lzo_ptr_t;
27090 + typedef short lzo_sptr_t;
27091 +# define __LZO_HAVE_PTR_T
27092 +# endif
27093 +# endif
27094 +#endif
27095 +#if !defined(__LZO_HAVE_PTR_T)
27096 +# if defined(LZO_HAVE_CONFIG_H) || defined(SIZEOF_CHAR_P)
27097 +# error "no suitable type for lzo_ptr_t"
27098 +# else
27099 + typedef unsigned long lzo_ptr_t;
27100 + typedef long lzo_sptr_t;
27101 +# define __LZO_HAVE_PTR_T
27102 +# endif
27103 +#endif
27104 +
27105 +#if defined(__LZO_DOS16) || defined(__LZO_WIN16)
27106 +#define PTR(a) ((lzo_bytep) (a))
27107 +#define PTR_ALIGNED_4(a) ((_FP_OFF(a) & 3) == 0)
27108 +#define PTR_ALIGNED2_4(a,b) (((_FP_OFF(a) | _FP_OFF(b)) & 3) == 0)
27109 +#else
27110 +#define PTR(a) ((lzo_ptr_t) (a))
27111 +#define PTR_LINEAR(a) PTR(a)
27112 +#define PTR_ALIGNED_4(a) ((PTR_LINEAR(a) & 3) == 0)
27113 +#define PTR_ALIGNED_8(a) ((PTR_LINEAR(a) & 7) == 0)
27114 +#define PTR_ALIGNED2_4(a,b) (((PTR_LINEAR(a) | PTR_LINEAR(b)) & 3) == 0)
27115 +#define PTR_ALIGNED2_8(a,b) (((PTR_LINEAR(a) | PTR_LINEAR(b)) & 7) == 0)
27116 +#endif
27117 +
27118 +#define PTR_LT(a,b) (PTR(a) < PTR(b))
27119 +#define PTR_GE(a,b) (PTR(a) >= PTR(b))
27120 +#define PTR_DIFF(a,b) ((lzo_ptrdiff_t) (PTR(a) - PTR(b)))
27121 +#define pd(a,b) ((lzo_uint) ((a)-(b)))
27122 +
27123 + typedef union {
27124 + char a_char;
27125 + unsigned char a_uchar;
27126 + short a_short;
27127 + unsigned short a_ushort;
27128 + int a_int;
27129 + unsigned int a_uint;
27130 + long a_long;
27131 + unsigned long a_ulong;
27132 + lzo_int a_lzo_int;
27133 + lzo_uint a_lzo_uint;
27134 + lzo_int32 a_lzo_int32;
27135 + lzo_uint32 a_lzo_uint32;
27136 + ptrdiff_t a_ptrdiff_t;
27137 + lzo_ptrdiff_t a_lzo_ptrdiff_t;
27138 + lzo_ptr_t a_lzo_ptr_t;
27139 + lzo_voidp a_lzo_voidp;
27140 + void *a_void_p;
27141 + lzo_bytep a_lzo_bytep;
27142 + lzo_bytepp a_lzo_bytepp;
27143 + lzo_uintp a_lzo_uintp;
27144 + lzo_uint *a_lzo_uint_p;
27145 + lzo_uint32p a_lzo_uint32p;
27146 + lzo_uint32 *a_lzo_uint32_p;
27147 + unsigned char *a_uchar_p;
27148 + char *a_char_p;
27149 + } lzo_full_align_t;
27150 +
27151 +#ifdef __cplusplus
27152 +}
27153 +#endif
27154 +#endif
27155 +#define LZO_DETERMINISTIC
27156 +#define LZO_DICT_USE_PTR
27157 +#if defined(__LZO_DOS16) || defined(__LZO_WIN16) || defined(__LZO_STRICT_16BIT)
27158 +# undef LZO_DICT_USE_PTR
27159 +#endif
27160 +#if defined(LZO_DICT_USE_PTR)
27161 +# define lzo_dict_t const lzo_bytep
27162 +# define lzo_dict_p lzo_dict_t __LZO_MMODEL *
27163 +#else
27164 +# define lzo_dict_t lzo_uint
27165 +# define lzo_dict_p lzo_dict_t __LZO_MMODEL *
27166 +#endif
27167 +#if !defined(lzo_moff_t)
27168 +#define lzo_moff_t lzo_uint
27169 +#endif
27170 +#endif
27171 +static lzo_ptr_t __lzo_ptr_linear(const lzo_voidp ptr)
27172 +{
27173 + lzo_ptr_t p;
27174 +
27175 +#if defined(__LZO_DOS16) || defined(__LZO_WIN16)
27176 + p = (((lzo_ptr_t) (_FP_SEG(ptr))) << (16 - __LZO_HShift)) +
27177 + (_FP_OFF(ptr));
27178 +#else
27179 + p = PTR_LINEAR(ptr);
27180 +#endif
27181 +
27182 + return p;
27183 +}
27184 +
27185 +static unsigned __lzo_align_gap(const lzo_voidp ptr, lzo_uint size)
27186 +{
27187 + lzo_ptr_t p, s, n;
27188 +
27189 + assert("lzo-01", size > 0);
27190 +
27191 + p = __lzo_ptr_linear(ptr);
27192 + s = (lzo_ptr_t) (size - 1);
27193 + n = (((p + s) / size) * size) - p;
27194 +
27195 + assert("lzo-02", (long)n >= 0);
27196 + assert("lzo-03", n <= s);
27197 +
27198 + return (unsigned)n;
27199 +}
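For illustration, the ptr_check() self-test further below uses __lzo_align_gap() through the LZO_PTR_ALIGN_UP() macro from lzoconf.h to align a scratch buffer. A minimal sketch of that idiom (aligned_scratch is a hypothetical name):

	static lzo_bytep aligned_scratch(char *raw)
	{
		/* step past at most sizeof(lzo_align_t) - 1 bytes so the
		   returned pointer is suitably aligned */
		return LZO_PTR_ALIGN_UP((lzo_byte *)raw,
					sizeof(lzo_align_t));
	}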
27200 +
27201 +#ifndef __LZO_UTIL_H
27202 +#define __LZO_UTIL_H
27203 +
27204 +#ifndef __LZO_CONF_H
27205 +#endif
27206 +
27207 +#ifdef __cplusplus
27208 +extern "C" {
27209 +#endif
27210 +
27211 +#if 1 && defined(HAVE_MEMCPY)
27212 +#if !defined(__LZO_DOS16) && !defined(__LZO_WIN16)
27213 +
27214 +#define MEMCPY8_DS(dest,src,len) \
27215 + memcpy(dest,src,len); \
27216 + dest += len; \
27217 + src += len
27218 +
27219 +#endif
27220 +#endif
27221 +
27222 +#if !defined(MEMCPY8_DS)
27223 +
27224 +#define MEMCPY8_DS(dest,src,len) \
27225 + { register lzo_uint __l = (len) / 8; \
27226 + do { \
27227 + *dest++ = *src++; \
27228 + *dest++ = *src++; \
27229 + *dest++ = *src++; \
27230 + *dest++ = *src++; \
27231 + *dest++ = *src++; \
27232 + *dest++ = *src++; \
27233 + *dest++ = *src++; \
27234 + *dest++ = *src++; \
27235 + } while (--__l > 0); }
27236 +
27237 +#endif
27238 +
27239 +#define MEMCPY_DS(dest,src,len) \
27240 + do *dest++ = *src++; \
27241 + while (--len > 0)
27242 +
27243 +#define MEMMOVE_DS(dest,src,len) \
27244 + do *dest++ = *src++; \
27245 + while (--len > 0)
27246 +
27247 +
27248 +#if (LZO_UINT_MAX <= SIZE_T_MAX) && defined(HAVE_MEMSET)
27249 +
27250 +#define BZERO8_PTR(s,l,n) memset((s),0,(lzo_uint)(l)*(n))
27251 +
27252 +#else
27253 +
27254 +#define BZERO8_PTR(s,l,n) \
27255 + lzo_memset((lzo_voidp)(s),0,(lzo_uint)(l)*(n))
27256 +
27257 +#endif
27258 +
27259 +#ifdef __cplusplus
27260 +}
27261 +#endif
27262 +
27263 +#endif
27264 +
27265 +/* If you use the LZO library in a product, you *must* keep this
27266 + * copyright string in the executable of your product.
27267 + */
27268 +
27269 +static const lzo_byte __lzo_copyright[] =
27270 +#if !defined(__LZO_IN_MINILZO)
27271 + LZO_VERSION_STRING;
27272 +#else
27273 + "\n\n\n"
27274 + "LZO real-time data compression library.\n"
27275 + "Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002 Markus Franz Xaver Johannes Oberhumer\n"
27276 + "<markus.oberhumer@jk.uni-linz.ac.at>\n"
27277 + "http://www.oberhumer.com/opensource/lzo/\n"
27278 + "\n"
27279 + "LZO version: v" LZO_VERSION_STRING ", " LZO_VERSION_DATE "\n"
27280 + "LZO build date: " __DATE__ " " __TIME__ "\n\n"
27281 + "LZO special compilation options:\n"
27282 +#ifdef __cplusplus
27283 + " __cplusplus\n"
27284 +#endif
27285 +#if defined(__PIC__)
27286 + " __PIC__\n"
27287 +#elif defined(__pic__)
27288 + " __pic__\n"
27289 +#endif
27290 +#if (UINT_MAX < LZO_0xffffffffL)
27291 + " 16BIT\n"
27292 +#endif
27293 +#if defined(__LZO_STRICT_16BIT)
27294 + " __LZO_STRICT_16BIT\n"
27295 +#endif
27296 +#if (UINT_MAX > LZO_0xffffffffL)
27297 + " UINT_MAX=" _LZO_MEXPAND(UINT_MAX) "\n"
27298 +#endif
27299 +#if (ULONG_MAX > LZO_0xffffffffL)
27300 + " ULONG_MAX=" _LZO_MEXPAND(ULONG_MAX) "\n"
27301 +#endif
27302 +#if defined(LZO_BYTE_ORDER)
27303 + " LZO_BYTE_ORDER=" _LZO_MEXPAND(LZO_BYTE_ORDER) "\n"
27304 +#endif
27305 +#if defined(LZO_UNALIGNED_OK_2)
27306 + " LZO_UNALIGNED_OK_2\n"
27307 +#endif
27308 +#if defined(LZO_UNALIGNED_OK_4)
27309 + " LZO_UNALIGNED_OK_4\n"
27310 +#endif
27311 +#if defined(LZO_ALIGNED_OK_4)
27312 + " LZO_ALIGNED_OK_4\n"
27313 +#endif
27314 +#if defined(LZO_DICT_USE_PTR)
27315 + " LZO_DICT_USE_PTR\n"
27316 +#endif
27317 +#if defined(__LZO_QUERY_COMPRESS)
27318 + " __LZO_QUERY_COMPRESS\n"
27319 +#endif
27320 +#if defined(__LZO_QUERY_DECOMPRESS)
27321 + " __LZO_QUERY_DECOMPRESS\n"
27322 +#endif
27323 +#if defined(__LZO_IN_MINILZO)
27324 + " __LZO_IN_MINILZO\n"
27325 +#endif
27326 + "\n\n" "$Id: LZO " LZO_VERSION_STRING " built " __DATE__ " " __TIME__
27327 +#if defined(__GNUC__) && defined(__VERSION__)
27328 + " by gcc " __VERSION__
27329 +#elif defined(__BORLANDC__)
27330 + " by Borland C " _LZO_MEXPAND(__BORLANDC__)
27331 +#elif defined(_MSC_VER)
27332 + " by Microsoft C " _LZO_MEXPAND(_MSC_VER)
27333 +#elif defined(__PUREC__)
27334 + " by Pure C " _LZO_MEXPAND(__PUREC__)
27335 +#elif defined(__SC__)
27336 + " by Symantec C " _LZO_MEXPAND(__SC__)
27337 +#elif defined(__TURBOC__)
27338 + " by Turbo C " _LZO_MEXPAND(__TURBOC__)
27339 +#elif defined(__WATCOMC__)
27340 + " by Watcom C " _LZO_MEXPAND(__WATCOMC__)
27341 +#endif
27342 + " $\n"
27343 + "$Copyright: LZO (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002 Markus Franz Xaver Johannes Oberhumer $\n";
27344 +#endif
27345 +
27346 +#define LZO_BASE 65521u
27347 +#define LZO_NMAX 5552
27348 +
27349 +#define LZO_DO1(buf,i) {s1 += buf[i]; s2 += s1;}
27350 +#define LZO_DO2(buf,i) LZO_DO1(buf,i); LZO_DO1(buf,i+1);
27351 +#define LZO_DO4(buf,i) LZO_DO2(buf,i); LZO_DO2(buf,i+2);
27352 +#define LZO_DO8(buf,i) LZO_DO4(buf,i); LZO_DO4(buf,i+4);
27353 +#define LZO_DO16(buf,i) LZO_DO8(buf,i); LZO_DO8(buf,i+8);
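LZO_BASE and the LZO_DO* macros above unroll an Adler-32 style checksum, the same family of checksum that reiser4_adler32 in compress.c provides for the lzo1 plugin. An unoptimized sketch of that checksum, assuming only LZO_BASE from above (adler32_sketch is a hypothetical name; the real code batches LZO_NMAX bytes between modulo reductions for speed):

	static lzo_uint32 adler32_sketch(lzo_uint32 adler,
					 const unsigned char *buf,
					 unsigned len)
	{
		lzo_uint32 s1 = adler & 0xffff;
		lzo_uint32 s2 = (adler >> 16) & 0xffff;

		while (len--) {
			s1 = (s1 + *buf++) % LZO_BASE;
			s2 = (s2 + s1) % LZO_BASE;
		}
		return (s2 << 16) | s1;
	}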
27354 +
27355 +# define IS_SIGNED(type) (((type) (-1)) < ((type) 0))
27356 +# define IS_UNSIGNED(type) (((type) (-1)) > ((type) 0))
27357 +
27358 +#define IS_POWER_OF_2(x) (((x) & ((x) - 1)) == 0)
27359 +
27360 +static lzo_bool schedule_insns_bug(void);
27361 +static lzo_bool strength_reduce_bug(int *);
27362 +
27363 +# define __lzo_assert(x) ((x) ? 1 : 0)
27364 +
27365 +#undef COMPILE_TIME_ASSERT
27366 +
27367 +# define COMPILE_TIME_ASSERT(expr) LZO_COMPILE_TIME_ASSERT(expr)
27368 +
27369 +static lzo_bool basic_integral_check(void)
27370 +{
27371 + lzo_bool r = 1;
27372 +
27373 + COMPILE_TIME_ASSERT(CHAR_BIT == 8);
27374 + COMPILE_TIME_ASSERT(sizeof(char) == 1);
27375 + COMPILE_TIME_ASSERT(sizeof(short) >= 2);
27376 + COMPILE_TIME_ASSERT(sizeof(long) >= 4);
27377 + COMPILE_TIME_ASSERT(sizeof(int) >= sizeof(short));
27378 + COMPILE_TIME_ASSERT(sizeof(long) >= sizeof(int));
27379 +
27380 + COMPILE_TIME_ASSERT(sizeof(lzo_uint) == sizeof(lzo_int));
27381 + COMPILE_TIME_ASSERT(sizeof(lzo_uint32) == sizeof(lzo_int32));
27382 +
27383 + COMPILE_TIME_ASSERT(sizeof(lzo_uint32) >= 4);
27384 + COMPILE_TIME_ASSERT(sizeof(lzo_uint32) >= sizeof(unsigned));
27385 +#if defined(__LZO_STRICT_16BIT)
27386 + COMPILE_TIME_ASSERT(sizeof(lzo_uint) == 2);
27387 +#else
27388 + COMPILE_TIME_ASSERT(sizeof(lzo_uint) >= 4);
27389 + COMPILE_TIME_ASSERT(sizeof(lzo_uint) >= sizeof(unsigned));
27390 +#endif
27391 +
27392 +#if (USHRT_MAX == 65535u)
27393 + COMPILE_TIME_ASSERT(sizeof(short) == 2);
27394 +#elif (USHRT_MAX == LZO_0xffffffffL)
27395 + COMPILE_TIME_ASSERT(sizeof(short) == 4);
27396 +#elif (USHRT_MAX >= LZO_0xffffffffL)
27397 + COMPILE_TIME_ASSERT(sizeof(short) > 4);
27398 +#endif
27399 +#if 0 /* to make gcc happy -edward */
27400 +#if (UINT_MAX == 65535u)
27401 + COMPILE_TIME_ASSERT(sizeof(int) == 2);
27402 +#elif (UINT_MAX == LZO_0xffffffffL)
27403 + COMPILE_TIME_ASSERT(sizeof(int) == 4);
27404 +#elif (UINT_MAX >= LZO_0xffffffffL)
27405 + COMPILE_TIME_ASSERT(sizeof(int) > 4);
27406 +#endif
27407 +#if (ULONG_MAX == 65535ul)
27408 + COMPILE_TIME_ASSERT(sizeof(long) == 2);
27409 +#elif (ULONG_MAX == LZO_0xffffffffL)
27410 + COMPILE_TIME_ASSERT(sizeof(long) == 4);
27411 +#elif (ULONG_MAX >= LZO_0xffffffffL)
27412 + COMPILE_TIME_ASSERT(sizeof(long) > 4);
27413 +#endif
27414 +#if defined(SIZEOF_UNSIGNED)
27415 + COMPILE_TIME_ASSERT(SIZEOF_UNSIGNED == sizeof(unsigned));
27416 +#endif
27417 +#if defined(SIZEOF_UNSIGNED_LONG)
27418 + COMPILE_TIME_ASSERT(SIZEOF_UNSIGNED_LONG == sizeof(unsigned long));
27419 +#endif
27420 +#if defined(SIZEOF_UNSIGNED_SHORT)
27421 + COMPILE_TIME_ASSERT(SIZEOF_UNSIGNED_SHORT == sizeof(unsigned short));
27422 +#endif
27423 +#if !defined(__LZO_IN_MINILZO)
27424 +#if defined(SIZEOF_SIZE_T)
27425 + COMPILE_TIME_ASSERT(SIZEOF_SIZE_T == sizeof(size_t));
27426 +#endif
27427 +#endif
27428 +#endif /* -edward */
27429 +
27430 + COMPILE_TIME_ASSERT(IS_UNSIGNED(unsigned char));
27431 + COMPILE_TIME_ASSERT(IS_UNSIGNED(unsigned short));
27432 + COMPILE_TIME_ASSERT(IS_UNSIGNED(unsigned));
27433 + COMPILE_TIME_ASSERT(IS_UNSIGNED(unsigned long));
27434 + COMPILE_TIME_ASSERT(IS_SIGNED(short));
27435 + COMPILE_TIME_ASSERT(IS_SIGNED(int));
27436 + COMPILE_TIME_ASSERT(IS_SIGNED(long));
27437 +
27438 + COMPILE_TIME_ASSERT(IS_UNSIGNED(lzo_uint32));
27439 + COMPILE_TIME_ASSERT(IS_UNSIGNED(lzo_uint));
27440 + COMPILE_TIME_ASSERT(IS_SIGNED(lzo_int32));
27441 + COMPILE_TIME_ASSERT(IS_SIGNED(lzo_int));
27442 +
27443 + COMPILE_TIME_ASSERT(INT_MAX == LZO_STYPE_MAX(sizeof(int)));
27444 + COMPILE_TIME_ASSERT(UINT_MAX == LZO_UTYPE_MAX(sizeof(unsigned)));
27445 + COMPILE_TIME_ASSERT(LONG_MAX == LZO_STYPE_MAX(sizeof(long)));
27446 + COMPILE_TIME_ASSERT(ULONG_MAX == LZO_UTYPE_MAX(sizeof(unsigned long)));
27447 + // COMPILE_TIME_ASSERT(SHRT_MAX == LZO_STYPE_MAX(sizeof(short))); /* edward */
27448 + COMPILE_TIME_ASSERT(USHRT_MAX == LZO_UTYPE_MAX(sizeof(unsigned short)));
27449 + COMPILE_TIME_ASSERT(LZO_UINT32_MAX ==
27450 + LZO_UTYPE_MAX(sizeof(lzo_uint32)));
27451 + COMPILE_TIME_ASSERT(LZO_UINT_MAX == LZO_UTYPE_MAX(sizeof(lzo_uint)));
27452 +#if !defined(__LZO_IN_MINILZO)
27453 + COMPILE_TIME_ASSERT(SIZE_T_MAX == LZO_UTYPE_MAX(sizeof(size_t)));
27454 +#endif
27455 +
27456 + r &= __lzo_assert(LZO_BYTE(257) == 1);
27457 +
27458 + return r;
27459 +}
27460 +
27461 +static lzo_bool basic_ptr_check(void)
27462 +{
27463 + lzo_bool r = 1;
27464 +
27465 + COMPILE_TIME_ASSERT(sizeof(char *) >= sizeof(int));
27466 + COMPILE_TIME_ASSERT(sizeof(lzo_byte *) >= sizeof(char *));
27467 +
27468 + COMPILE_TIME_ASSERT(sizeof(lzo_voidp) == sizeof(lzo_byte *));
27469 + COMPILE_TIME_ASSERT(sizeof(lzo_voidp) == sizeof(lzo_voidpp));
27470 + COMPILE_TIME_ASSERT(sizeof(lzo_voidp) == sizeof(lzo_bytepp));
27471 + COMPILE_TIME_ASSERT(sizeof(lzo_voidp) >= sizeof(lzo_uint));
27472 +
27473 + COMPILE_TIME_ASSERT(sizeof(lzo_ptr_t) == sizeof(lzo_voidp));
27474 + COMPILE_TIME_ASSERT(sizeof(lzo_ptr_t) == sizeof(lzo_sptr_t));
27475 + COMPILE_TIME_ASSERT(sizeof(lzo_ptr_t) >= sizeof(lzo_uint));
27476 +
27477 + COMPILE_TIME_ASSERT(sizeof(lzo_ptrdiff_t) >= 4);
27478 + COMPILE_TIME_ASSERT(sizeof(lzo_ptrdiff_t) >= sizeof(ptrdiff_t));
27479 +
27480 + COMPILE_TIME_ASSERT(sizeof(ptrdiff_t) >= sizeof(size_t));
27481 + COMPILE_TIME_ASSERT(sizeof(lzo_ptrdiff_t) >= sizeof(lzo_uint));
27482 +
27483 +#if defined(SIZEOF_CHAR_P)
27484 + COMPILE_TIME_ASSERT(SIZEOF_CHAR_P == sizeof(char *));
27485 +#endif
27486 +#if defined(SIZEOF_PTRDIFF_T)
27487 + COMPILE_TIME_ASSERT(SIZEOF_PTRDIFF_T == sizeof(ptrdiff_t));
27488 +#endif
27489 +
27490 + COMPILE_TIME_ASSERT(IS_SIGNED(ptrdiff_t));
27491 + COMPILE_TIME_ASSERT(IS_UNSIGNED(size_t));
27492 + COMPILE_TIME_ASSERT(IS_SIGNED(lzo_ptrdiff_t));
27493 + COMPILE_TIME_ASSERT(IS_SIGNED(lzo_sptr_t));
27494 + COMPILE_TIME_ASSERT(IS_UNSIGNED(lzo_ptr_t));
27495 + COMPILE_TIME_ASSERT(IS_UNSIGNED(lzo_moff_t));
27496 +
27497 + return r;
27498 +}
27499 +
27500 +static lzo_bool ptr_check(void)
27501 +{
27502 + lzo_bool r = 1;
27503 + int i;
27504 + char _wrkmem[10 * sizeof(lzo_byte *) + sizeof(lzo_full_align_t)];
27505 + lzo_bytep wrkmem;
27506 + lzo_bytepp dict;
27507 + unsigned char x[4 * sizeof(lzo_full_align_t)];
27508 + long d;
27509 + lzo_full_align_t a;
27510 + lzo_full_align_t u;
27511 +
27512 + for (i = 0; i < (int)sizeof(x); i++)
27513 + x[i] = LZO_BYTE(i);
27514 +
27515 + wrkmem =
27516 + LZO_PTR_ALIGN_UP((lzo_byte *) _wrkmem, sizeof(lzo_full_align_t));
27517 +
27518 + u.a_lzo_bytep = wrkmem;
27519 + dict = u.a_lzo_bytepp;
27520 +
27521 + d = (long)((const lzo_bytep)dict - (const lzo_bytep)_wrkmem);
27522 + r &= __lzo_assert(d >= 0);
27523 + r &= __lzo_assert(d < (long)sizeof(lzo_full_align_t));
27524 +
27525 + memset(&a, 0, sizeof(a));
27526 + r &= __lzo_assert(a.a_lzo_voidp == NULL);
27527 +
27528 + memset(&a, 0xff, sizeof(a));
27529 + r &= __lzo_assert(a.a_ushort == USHRT_MAX);
27530 + r &= __lzo_assert(a.a_uint == UINT_MAX);
27531 + r &= __lzo_assert(a.a_ulong == ULONG_MAX);
27532 + r &= __lzo_assert(a.a_lzo_uint == LZO_UINT_MAX);
27533 + r &= __lzo_assert(a.a_lzo_uint32 == LZO_UINT32_MAX);
27534 +
27535 + if (r == 1) {
27536 + for (i = 0; i < 8; i++)
27537 + r &= __lzo_assert((const lzo_voidp)(&dict[i]) ==
27538 + (const lzo_voidp)(&wrkmem[i * sizeof(lzo_byte *)]));
27542 + }
27543 +
27544 + memset(&a, 0, sizeof(a));
27545 + r &= __lzo_assert(a.a_char_p == NULL);
27546 + r &= __lzo_assert(a.a_lzo_bytep == NULL);
27547 + r &= __lzo_assert(NULL == (void *)0);
27548 + if (r == 1) {
27549 + for (i = 0; i < 10; i++)
27550 + dict[i] = wrkmem;
27551 + BZERO8_PTR(dict + 1, sizeof(dict[0]), 8);
27552 + r &= __lzo_assert(dict[0] == wrkmem);
27553 + for (i = 1; i < 9; i++)
27554 + r &= __lzo_assert(dict[i] == NULL);
27555 + r &= __lzo_assert(dict[9] == wrkmem);
27556 + }
27557 +
27558 + if (r == 1) {
27559 + unsigned k = 1;
27560 + const unsigned n = (unsigned)sizeof(lzo_uint32);
27561 + lzo_byte *p0;
27562 + lzo_byte *p1;
27563 +
27564 + k += __lzo_align_gap(&x[k], n);
27565 + p0 = (lzo_bytep) & x[k];
27566 +#if defined(PTR_LINEAR)
27567 + r &= __lzo_assert((PTR_LINEAR(p0) & (n - 1)) == 0);
27568 +#else
27569 + r &= __lzo_assert(n == 4);
27570 + r &= __lzo_assert(PTR_ALIGNED_4(p0));
27571 +#endif
27572 +
27573 + r &= __lzo_assert(k >= 1);
27574 + p1 = (lzo_bytep) & x[1];
27575 + r &= __lzo_assert(PTR_GE(p0, p1));
27576 +
27577 + r &= __lzo_assert(k < 1 + n);
27578 + p1 = (lzo_bytep) & x[1 + n];
27579 + r &= __lzo_assert(PTR_LT(p0, p1));
27580 +
27581 + if (r == 1) {
27582 + lzo_uint32 v0, v1;
27583 +
27584 + u.a_uchar_p = &x[k];
27585 + v0 = *u.a_lzo_uint32_p;
27586 + u.a_uchar_p = &x[k + n];
27587 + v1 = *u.a_lzo_uint32_p;
27588 +
27589 + r &= __lzo_assert(v0 > 0);
27590 + r &= __lzo_assert(v1 > 0);
27591 + }
27592 + }
27593 +
27594 + return r;
27595 +}
27596 +
27597 +static int _lzo_config_check(void)
27598 +{
27599 + lzo_bool r = 1;
27600 + int i;
27601 + union {
27602 + lzo_uint32 a;
27603 + unsigned short b;
27604 + lzo_uint32 aa[4];
27605 + unsigned char x[4 * sizeof(lzo_full_align_t)];
27606 + } u;
27608 +
27609 + COMPILE_TIME_ASSERT((int)((unsigned char)((signed char)-1)) == 255);
27610 + COMPILE_TIME_ASSERT((((unsigned char)128) << (int)(8 * sizeof(int) - 8))
27611 + < 0);
27612 +
27613 + r &= basic_integral_check();
27614 + r &= basic_ptr_check();
27615 + if (r != 1)
27616 + return LZO_E_ERROR;
27617 +
27618 + u.a = 0;
27619 + u.b = 0;
27620 + for (i = 0; i < (int)sizeof(u.x); i++)
27621 + u.x[i] = LZO_BYTE(i);
27622 +
27623 +#if defined(LZO_BYTE_ORDER)
27624 + if (r == 1) {
27625 +# if (LZO_BYTE_ORDER == LZO_LITTLE_ENDIAN)
27626 + lzo_uint32 a = (lzo_uint32) (u.a & LZO_0xffffffffL);
27627 + unsigned short b = (unsigned short)(u.b & 0xffff);
27628 + r &= __lzo_assert(a == 0x03020100L);
27629 + r &= __lzo_assert(b == 0x0100);
27630 +# elif (LZO_BYTE_ORDER == LZO_BIG_ENDIAN)
27631 + lzo_uint32 a = u.a >> (8 * sizeof(u.a) - 32);
27632 + unsigned short b = u.b >> (8 * sizeof(u.b) - 16);
27633 + r &= __lzo_assert(a == 0x00010203L);
27634 + r &= __lzo_assert(b == 0x0001);
27635 +# else
27636 +# error "invalid LZO_BYTE_ORDER"
27637 +# endif
27638 + }
27639 +#endif
27640 +
27641 +#if defined(LZO_UNALIGNED_OK_2)
27642 + COMPILE_TIME_ASSERT(sizeof(short) == 2);
27643 + if (r == 1) {
27644 + unsigned short b[4];
27645 +
27646 + for (i = 0; i < 4; i++)
27647 + b[i] = *(const unsigned short *)&u.x[i];
27648 +
27649 +# if (LZO_BYTE_ORDER == LZO_LITTLE_ENDIAN)
27650 + r &= __lzo_assert(b[0] == 0x0100);
27651 + r &= __lzo_assert(b[1] == 0x0201);
27652 + r &= __lzo_assert(b[2] == 0x0302);
27653 + r &= __lzo_assert(b[3] == 0x0403);
27654 +# elif (LZO_BYTE_ORDER == LZO_BIG_ENDIAN)
27655 + r &= __lzo_assert(b[0] == 0x0001);
27656 + r &= __lzo_assert(b[1] == 0x0102);
27657 + r &= __lzo_assert(b[2] == 0x0203);
27658 + r &= __lzo_assert(b[3] == 0x0304);
27659 +# endif
27660 + }
27661 +#endif
27662 +
27663 +#if defined(LZO_UNALIGNED_OK_4)
27664 + COMPILE_TIME_ASSERT(sizeof(lzo_uint32) == 4);
27665 + if (r == 1) {
27666 + lzo_uint32 a[4];
27667 +
27668 + for (i = 0; i < 4; i++)
27669 + a[i] = *(const lzo_uint32 *)&u.x[i];
27670 +
27671 +# if (LZO_BYTE_ORDER == LZO_LITTLE_ENDIAN)
27672 + r &= __lzo_assert(a[0] == 0x03020100L);
27673 + r &= __lzo_assert(a[1] == 0x04030201L);
27674 + r &= __lzo_assert(a[2] == 0x05040302L);
27675 + r &= __lzo_assert(a[3] == 0x06050403L);
27676 +# elif (LZO_BYTE_ORDER == LZO_BIG_ENDIAN)
27677 + r &= __lzo_assert(a[0] == 0x00010203L);
27678 + r &= __lzo_assert(a[1] == 0x01020304L);
27679 + r &= __lzo_assert(a[2] == 0x02030405L);
27680 + r &= __lzo_assert(a[3] == 0x03040506L);
27681 +# endif
27682 + }
27683 +#endif
27684 +
27685 +#if defined(LZO_ALIGNED_OK_4)
27686 + COMPILE_TIME_ASSERT(sizeof(lzo_uint32) == 4);
27687 +#endif
27688 +
27689 + COMPILE_TIME_ASSERT(lzo_sizeof_dict_t == sizeof(lzo_dict_t));
27690 +
27691 + if (r == 1) {
27692 + r &= __lzo_assert(!schedule_insns_bug());
27693 + }
27694 +
27695 + if (r == 1) {
27696 + static int x[3];
27697 + static unsigned xn = 3;
27698 + register unsigned j;
27699 +
27700 + for (j = 0; j < xn; j++)
27701 + x[j] = (int)j - 3;
27702 + r &= __lzo_assert(!strength_reduce_bug(x));
27703 + }
27704 +
27705 + if (r == 1) {
27706 + r &= ptr_check();
27707 + }
27708 +
27709 + return r == 1 ? LZO_E_OK : LZO_E_ERROR;
27710 +}
27711 +
27712 +static lzo_bool schedule_insns_bug(void)
27713 +{
27714 +#if defined(__LZO_CHECKER)
27715 + return 0;
27716 +#else
27717 + const int clone[] = { 1, 2, 0 };
27718 + const int *q;
27719 + q = clone;
27720 + return (*q) ? 0 : 1;
27721 +#endif
27722 +}
27723 +
27724 +static lzo_bool strength_reduce_bug(int *x)
27725 +{
27726 + return x[0] != -3 || x[1] != -2 || x[2] != -1;
27727 +}
27728 +
27729 +#undef COMPILE_TIME_ASSERT
27730 +
27731 +LZO_PUBLIC(int)
27732 + __lzo_init2(unsigned v, int s1, int s2, int s3, int s4, int s5,
27733 + int s6, int s7, int s8, int s9)
27734 +{
27735 + int r;
27736 +
27737 + if (v == 0)
27738 + return LZO_E_ERROR;
27739 +
27740 + r = (s1 == -1 || s1 == (int)sizeof(short)) &&
27741 + (s2 == -1 || s2 == (int)sizeof(int)) &&
27742 + (s3 == -1 || s3 == (int)sizeof(long)) &&
27743 + (s4 == -1 || s4 == (int)sizeof(lzo_uint32)) &&
27744 + (s5 == -1 || s5 == (int)sizeof(lzo_uint)) &&
27745 + (s6 == -1 || s6 == (int)lzo_sizeof_dict_t) &&
27746 + (s7 == -1 || s7 == (int)sizeof(char *)) &&
27747 + (s8 == -1 || s8 == (int)sizeof(lzo_voidp)) &&
27748 + (s9 == -1 || s9 == (int)sizeof(lzo_compress_t));
27749 + if (!r)
27750 + return LZO_E_ERROR;
27751 +
27752 + r = _lzo_config_check();
27753 + if (r != LZO_E_OK)
27754 + return r;
27755 +
27756 + return r;
27757 +}
27758 +
27759 +#if !defined(__LZO_IN_MINILZO)
27760 +
27761 +LZO_EXTERN(int)
27762 + __lzo_init(unsigned v, int s1, int s2, int s3, int s4, int s5, int s6, int s7);
27763 +
27764 +LZO_PUBLIC(int)
27765 +__lzo_init(unsigned v, int s1, int s2, int s3, int s4, int s5, int s6, int s7)
27766 +{
27767 + if (v == 0 || v > 0x1010)
27768 + return LZO_E_ERROR;
27769 + return __lzo_init2(v, s1, s2, s3, s4, s5, -1, -1, s6, s7);
27770 +}
27771 +
27772 +#endif
27773 +
27774 +#define do_compress _lzo1x_1_do_compress
27775 +
27776 +#define LZO_NEED_DICT_H
27777 +#define D_BITS 14
27778 +#define D_INDEX1(d,p) d = DM((0x21*DX3(p,5,5,6)) >> 5)
27779 +#define D_INDEX2(d,p) d = (d & (D_MASK & 0x7ff)) ^ (D_HIGH | 0x1f)
27780 +
27781 +#ifndef __LZO_CONFIG1X_H
27782 +#define __LZO_CONFIG1X_H
27783 +
27784 +#if !defined(LZO1X) && !defined(LZO1Y) && !defined(LZO1Z)
27785 +# define LZO1X
27786 +#endif
27787 +
27788 +#if !defined(__LZO_IN_MINILZO)
27789 +#include <lzo1x.h>
27790 +#endif
27791 +
27792 +#define LZO_EOF_CODE
27793 +#undef LZO_DETERMINISTIC
27794 +
27795 +#define M1_MAX_OFFSET 0x0400
27796 +#ifndef M2_MAX_OFFSET
27797 +#define M2_MAX_OFFSET 0x0800
27798 +#endif
27799 +#define M3_MAX_OFFSET 0x4000
27800 +#define M4_MAX_OFFSET 0xbfff
27801 +
27802 +#define MX_MAX_OFFSET (M1_MAX_OFFSET + M2_MAX_OFFSET)
27803 +
27804 +#define M1_MIN_LEN 2
27805 +#define M1_MAX_LEN 2
27806 +#define M2_MIN_LEN 3
27807 +#ifndef M2_MAX_LEN
27808 +#define M2_MAX_LEN 8
27809 +#endif
27810 +#define M3_MIN_LEN 3
27811 +#define M3_MAX_LEN 33
27812 +#define M4_MIN_LEN 3
27813 +#define M4_MAX_LEN 9
27814 +
27815 +#define M1_MARKER 0
27816 +#define M2_MARKER 64
27817 +#define M3_MARKER 32
27818 +#define M4_MARKER 16
27819 +
27820 +#ifndef MIN_LOOKAHEAD
27821 +#define MIN_LOOKAHEAD (M2_MAX_LEN + 1)
27822 +#endif
27823 +
27824 +#if defined(LZO_NEED_DICT_H)
27825 +
27826 +#ifndef LZO_HASH
27827 +#define LZO_HASH LZO_HASH_LZO_INCREMENTAL_B
27828 +#endif
27829 +#define DL_MIN_LEN M2_MIN_LEN
27830 +
27831 +#ifndef __LZO_DICT_H
27832 +#define __LZO_DICT_H
27833 +
27834 +#ifdef __cplusplus
27835 +extern "C" {
27836 +#endif
27837 +
27838 +#if !defined(D_BITS) && defined(DBITS)
27839 +# define D_BITS DBITS
27840 +#endif
27841 +#if !defined(D_BITS)
27842 +# error "D_BITS is not defined"
27843 +#endif
27844 +#if (D_BITS < 16)
27845 +# define D_SIZE LZO_SIZE(D_BITS)
27846 +# define D_MASK LZO_MASK(D_BITS)
27847 +#else
27848 +# define D_SIZE LZO_USIZE(D_BITS)
27849 +# define D_MASK LZO_UMASK(D_BITS)
27850 +#endif
27851 +#define D_HIGH ((D_MASK >> 1) + 1)
27852 +
27853 +#if !defined(DD_BITS)
27854 +# define DD_BITS 0
27855 +#endif
27856 +#define DD_SIZE LZO_SIZE(DD_BITS)
27857 +#define DD_MASK LZO_MASK(DD_BITS)
27858 +
27859 +#if !defined(DL_BITS)
27860 +# define DL_BITS (D_BITS - DD_BITS)
27861 +#endif
27862 +#if (DL_BITS < 16)
27863 +# define DL_SIZE LZO_SIZE(DL_BITS)
27864 +# define DL_MASK LZO_MASK(DL_BITS)
27865 +#else
27866 +# define DL_SIZE LZO_USIZE(DL_BITS)
27867 +# define DL_MASK LZO_UMASK(DL_BITS)
27868 +#endif
27869 +
27870 +#if (D_BITS != DL_BITS + DD_BITS)
27871 +# error "D_BITS does not match"
27872 +#endif
27873 +#if (D_BITS < 8 || D_BITS > 18)
27874 +# error "invalid D_BITS"
27875 +#endif
27876 +#if (DL_BITS < 8 || DL_BITS > 20)
27877 +# error "invalid DL_BITS"
27878 +#endif
27879 +#if (DD_BITS < 0 || DD_BITS > 6)
27880 +# error "invalid DD_BITS"
27881 +#endif
27882 +
27883 +#if !defined(DL_MIN_LEN)
27884 +# define DL_MIN_LEN 3
27885 +#endif
27886 +#if !defined(DL_SHIFT)
27887 +# define DL_SHIFT ((DL_BITS + (DL_MIN_LEN - 1)) / DL_MIN_LEN)
27888 +#endif
27889 +
27890 +#define LZO_HASH_GZIP 1
27891 +#define LZO_HASH_GZIP_INCREMENTAL 2
27892 +#define LZO_HASH_LZO_INCREMENTAL_A 3
27893 +#define LZO_HASH_LZO_INCREMENTAL_B 4
27894 +
27895 +#if !defined(LZO_HASH)
27896 +# error "choose a hashing strategy"
27897 +#endif
27898 +
27899 +#if (DL_MIN_LEN == 3)
27900 +# define _DV2_A(p,shift1,shift2) \
27901 + (((( (lzo_uint32)((p)[0]) << shift1) ^ (p)[1]) << shift2) ^ (p)[2])
27902 +# define _DV2_B(p,shift1,shift2) \
27903 + (((( (lzo_uint32)((p)[2]) << shift1) ^ (p)[1]) << shift2) ^ (p)[0])
27904 +# define _DV3_B(p,shift1,shift2,shift3) \
27905 + ((_DV2_B((p)+1,shift1,shift2) << (shift3)) ^ (p)[0])
27906 +#elif (DL_MIN_LEN == 2)
27907 +# define _DV2_A(p,shift1,shift2) \
27908 + (( (lzo_uint32)(p[0]) << shift1) ^ p[1])
27909 +# define _DV2_B(p,shift1,shift2) \
27910 + (( (lzo_uint32)(p[1]) << shift1) ^ p[2])
27911 +#else
27912 +# error "invalid DL_MIN_LEN"
27913 +#endif
27914 +#define _DV_A(p,shift) _DV2_A(p,shift,shift)
27915 +#define _DV_B(p,shift) _DV2_B(p,shift,shift)
27916 +#define DA2(p,s1,s2) \
27917 + (((((lzo_uint32)((p)[2]) << (s2)) + (p)[1]) << (s1)) + (p)[0])
27918 +#define DS2(p,s1,s2) \
27919 + (((((lzo_uint32)((p)[2]) << (s2)) - (p)[1]) << (s1)) - (p)[0])
27920 +#define DX2(p,s1,s2) \
27921 + (((((lzo_uint32)((p)[2]) << (s2)) ^ (p)[1]) << (s1)) ^ (p)[0])
27922 +#define DA3(p,s1,s2,s3) ((DA2((p)+1,s2,s3) << (s1)) + (p)[0])
27923 +#define DS3(p,s1,s2,s3) ((DS2((p)+1,s2,s3) << (s1)) - (p)[0])
27924 +#define DX3(p,s1,s2,s3) ((DX2((p)+1,s2,s3) << (s1)) ^ (p)[0])
27925 +#define DMS(v,s) ((lzo_uint) (((v) & (D_MASK >> (s))) << (s)))
27926 +#define DM(v) DMS(v,0)
27927 +
27928 +#if (LZO_HASH == LZO_HASH_GZIP)
27929 +# define _DINDEX(dv,p) (_DV_A((p),DL_SHIFT))
27930 +
27931 +#elif (LZO_HASH == LZO_HASH_GZIP_INCREMENTAL)
27932 +# define __LZO_HASH_INCREMENTAL
27933 +# define DVAL_FIRST(dv,p) dv = _DV_A((p),DL_SHIFT)
27934 +# define DVAL_NEXT(dv,p) dv = (((dv) << DL_SHIFT) ^ p[2])
27935 +# define _DINDEX(dv,p) (dv)
27936 +# define DVAL_LOOKAHEAD DL_MIN_LEN
27937 +
27938 +#elif (LZO_HASH == LZO_HASH_LZO_INCREMENTAL_A)
27939 +# define __LZO_HASH_INCREMENTAL
27940 +# define DVAL_FIRST(dv,p) dv = _DV_A((p),5)
27941 +# define DVAL_NEXT(dv,p) \
27942 + dv ^= (lzo_uint32)(p[-1]) << (2*5); dv = (((dv) << 5) ^ p[2])
27943 +# define _DINDEX(dv,p) ((0x9f5f * (dv)) >> 5)
27944 +# define DVAL_LOOKAHEAD DL_MIN_LEN
27945 +
27946 +#elif (LZO_HASH == LZO_HASH_LZO_INCREMENTAL_B)
27947 +# define __LZO_HASH_INCREMENTAL
27948 +# define DVAL_FIRST(dv,p) dv = _DV_B((p),5)
27949 +# define DVAL_NEXT(dv,p) \
27950 + dv ^= p[-1]; dv = (((dv) >> 5) ^ ((lzo_uint32)(p[2]) << (2*5)))
27951 +# define _DINDEX(dv,p) ((0x9f5f * (dv)) >> 5)
27952 +# define DVAL_LOOKAHEAD DL_MIN_LEN
27953 +
27954 +#else
27955 +# error "choose a hashing strategy"
27956 +#endif
27957 +
27958 +#ifndef DINDEX
27959 +#define DINDEX(dv,p) ((lzo_uint)((_DINDEX(dv,p)) & DL_MASK) << DD_BITS)
27960 +#endif
27961 +#if !defined(DINDEX1) && defined(D_INDEX1)
27962 +#define DINDEX1 D_INDEX1
27963 +#endif
27964 +#if !defined(DINDEX2) && defined(D_INDEX2)
27965 +#define DINDEX2 D_INDEX2
27966 +#endif
27967 +
27968 +#if !defined(__LZO_HASH_INCREMENTAL)
27969 +# define DVAL_FIRST(dv,p) ((void) 0)
27970 +# define DVAL_NEXT(dv,p) ((void) 0)
27971 +# define DVAL_LOOKAHEAD 0
27972 +#endif
27973 +
27974 +#if !defined(DVAL_ASSERT)
27975 +#if defined(__LZO_HASH_INCREMENTAL) && !defined(NDEBUG)
27976 + static void DVAL_ASSERT(lzo_uint32 dv, const lzo_byte * p) {
27977 + lzo_uint32 df;
27978 + DVAL_FIRST(df, (p));
27979 + assert(DINDEX(dv, p) == DINDEX(df, p));
27980 + }
27981 +#else
27982 +# define DVAL_ASSERT(dv,p) ((void) 0)
27983 +#endif
27984 +#endif
27985 +
27986 +#if defined(LZO_DICT_USE_PTR)
27987 +# define DENTRY(p,in) (p)
27988 +# define GINDEX(m_pos,m_off,dict,dindex,in) m_pos = dict[dindex]
27989 +#else
27990 +# define DENTRY(p,in) ((lzo_uint) ((p)-(in)))
27991 +# define GINDEX(m_pos,m_off,dict,dindex,in) m_off = dict[dindex]
27992 +#endif
27993 +
27994 +#if (DD_BITS == 0)
27995 +
27996 +# define UPDATE_D(dict,drun,dv,p,in) dict[ DINDEX(dv,p) ] = DENTRY(p,in)
27997 +# define UPDATE_I(dict,drun,index,p,in) dict[index] = DENTRY(p,in)
27998 +# define UPDATE_P(ptr,drun,p,in) (ptr)[0] = DENTRY(p,in)
27999 +
28000 +#else
28001 +
28002 +# define UPDATE_D(dict,drun,dv,p,in) \
28003 + dict[ DINDEX(dv,p) + drun++ ] = DENTRY(p,in); drun &= DD_MASK
28004 +# define UPDATE_I(dict,drun,index,p,in) \
28005 + dict[ (index) + drun++ ] = DENTRY(p,in); drun &= DD_MASK
28006 +# define UPDATE_P(ptr,drun,p,in) \
28007 + (ptr) [ drun++ ] = DENTRY(p,in); drun &= DD_MASK
28008 +
28009 +#endif
28010 +
28011 +#if defined(LZO_DICT_USE_PTR)
28012 +
28013 +#define LZO_CHECK_MPOS_DET(m_pos,m_off,in,ip,max_offset) \
28014 + (m_pos == NULL || (m_off = (lzo_moff_t) (ip - m_pos)) > max_offset)
28015 +
28016 +#define LZO_CHECK_MPOS_NON_DET(m_pos,m_off,in,ip,max_offset) \
28017 + (BOUNDS_CHECKING_OFF_IN_EXPR( \
28018 + (PTR_LT(m_pos,in) || \
28019 + (m_off = (lzo_moff_t) PTR_DIFF(ip,m_pos)) <= 0 || \
28020 + m_off > max_offset) ))
28021 +
28022 +#else
28023 +
28024 +#define LZO_CHECK_MPOS_DET(m_pos,m_off,in,ip,max_offset) \
28025 + (m_off == 0 || \
28026 + ((m_off = (lzo_moff_t) ((ip)-(in)) - m_off) > max_offset) || \
28027 + (m_pos = (ip) - (m_off), 0) )
28028 +
28029 +#define LZO_CHECK_MPOS_NON_DET(m_pos,m_off,in,ip,max_offset) \
28030 + ((lzo_moff_t) ((ip)-(in)) <= m_off || \
28031 + ((m_off = (lzo_moff_t) ((ip)-(in)) - m_off) > max_offset) || \
28032 + (m_pos = (ip) - (m_off), 0) )
28033 +
28034 +#endif
28035 +
28036 +#if defined(LZO_DETERMINISTIC)
28037 +# define LZO_CHECK_MPOS LZO_CHECK_MPOS_DET
28038 +#else
28039 +# define LZO_CHECK_MPOS LZO_CHECK_MPOS_NON_DET
28040 +#endif
28041 +
28042 +#ifdef __cplusplus
28043 +}
28044 +#endif
28045 +#endif
28046 +#endif
28047 +#endif
28048 +#define DO_COMPRESS lzo1x_1_compress
28049 +static
28050 +lzo_uint do_compress(const lzo_byte * in, lzo_uint in_len,
28051 + lzo_byte * out, lzo_uintp out_len, lzo_voidp wrkmem)
28052 +{
28053 + register const lzo_byte *ip;
28054 + lzo_byte *op;
28055 + const lzo_byte *const in_end = in + in_len;
28056 + const lzo_byte *const ip_end = in + in_len - M2_MAX_LEN - 5;
28057 + const lzo_byte *ii;
28058 + lzo_dict_p const dict = (lzo_dict_p) wrkmem;
28059 +
28060 + op = out;
28061 + ip = in;
28062 + ii = ip;
28063 +
28064 + ip += 4;
28065 + for (;;) {
28066 + register const lzo_byte *m_pos;
28067 +
28068 + lzo_moff_t m_off;
28069 + lzo_uint m_len;
28070 + lzo_uint dindex;
28071 +
28072 + DINDEX1(dindex, ip);
28073 + GINDEX(m_pos, m_off, dict, dindex, in);
28074 + if (LZO_CHECK_MPOS_NON_DET(m_pos, m_off, in, ip, M4_MAX_OFFSET))
28075 + goto literal;
28076 +#if 1
28077 + if (m_off <= M2_MAX_OFFSET || m_pos[3] == ip[3])
28078 + goto try_match;
28079 + DINDEX2(dindex, ip);
28080 +#endif
28081 + GINDEX(m_pos, m_off, dict, dindex, in);
28082 + if (LZO_CHECK_MPOS_NON_DET(m_pos, m_off, in, ip, M4_MAX_OFFSET))
28083 + goto literal;
28084 + if (m_off <= M2_MAX_OFFSET || m_pos[3] == ip[3])
28085 + goto try_match;
28086 + goto literal;
28087 +
28088 + try_match:
28089 +#if 1 && defined(LZO_UNALIGNED_OK_2)
28090 + if (*(const lzo_ushortp)m_pos != *(const lzo_ushortp)ip) {
28091 +#else
28092 + if (m_pos[0] != ip[0] || m_pos[1] != ip[1]) {
28093 +#endif
28094 + ;
28095 + } else {
28096 + if (m_pos[2] == ip[2]) {
28097 + goto match;
28098 + } else {
28099 + ;
28100 + }
28101 + }
28102 +
28103 + literal:
28104 + UPDATE_I(dict, 0, dindex, ip, in);
28105 + ++ip;
28106 + if (ip >= ip_end)
28107 + break;
28108 + continue;
28109 +
28110 + match:
28111 + UPDATE_I(dict, 0, dindex, ip, in);
28112 + if (pd(ip, ii) > 0) {
28113 + register lzo_uint t = pd(ip, ii);
28114 +
28115 + if (t <= 3) {
28116 + assert("lzo-04", op - 2 > out);
28117 + op[-2] |= LZO_BYTE(t);
28118 + } else if (t <= 18)
28119 + *op++ = LZO_BYTE(t - 3);
28120 + else {
28121 + register lzo_uint tt = t - 18;
28122 +
28123 + *op++ = 0;
28124 + while (tt > 255) {
28125 + tt -= 255;
28126 + *op++ = 0;
28127 + }
28128 + assert("lzo-05", tt > 0);
28129 + *op++ = LZO_BYTE(tt);
28130 + }
28131 + do
28132 + *op++ = *ii++;
28133 + while (--t > 0);
28134 + }
28135 +
28136 + assert("lzo-06", ii == ip);
28137 + ip += 3;
28138 + if (m_pos[3] != *ip++ || m_pos[4] != *ip++ || m_pos[5] != *ip++
28139 + || m_pos[6] != *ip++ || m_pos[7] != *ip++
28140 + || m_pos[8] != *ip++
28141 +#ifdef LZO1Y
28142 + || m_pos[9] != *ip++ || m_pos[10] != *ip++
28143 + || m_pos[11] != *ip++ || m_pos[12] != *ip++
28144 + || m_pos[13] != *ip++ || m_pos[14] != *ip++
28145 +#endif
28146 + ) {
28147 + --ip;
28148 + m_len = ip - ii;
28149 + assert("lzo-07", m_len >= 3);
28150 + assert("lzo-08", m_len <= M2_MAX_LEN);
28151 +
28152 + if (m_off <= M2_MAX_OFFSET) {
28153 + m_off -= 1;
28154 +#if defined(LZO1X)
28155 + *op++ = LZO_BYTE(((m_len - 1) << 5) | ((m_off & 7) << 2));
28158 + *op++ = LZO_BYTE(m_off >> 3);
28159 +#elif defined(LZO1Y)
28160 + *op++ = LZO_BYTE(((m_len + 1) << 4) | ((m_off & 3) << 2));
28163 + *op++ = LZO_BYTE(m_off >> 2);
28164 +#endif
28165 + } else if (m_off <= M3_MAX_OFFSET) {
28166 + m_off -= 1;
28167 + *op++ = LZO_BYTE(M3_MARKER | (m_len - 2));
28168 + goto m3_m4_offset;
28169 + } else
28170 +#if defined(LZO1X)
28171 + {
28172 + m_off -= 0x4000;
28173 + assert("lzo-09", m_off > 0);
28174 + assert("lzo-10", m_off <= 0x7fff);
28175 + *op++ = LZO_BYTE(M4_MARKER |
28176 + ((m_off & 0x4000) >> 11) |
28177 + (m_len - 2));
28178 + goto m3_m4_offset;
28179 + }
28180 +#elif defined(LZO1Y)
28181 + goto m4_match;
28182 +#endif
28183 + } else {
28184 + {
28185 + const lzo_byte *end = in_end;
28186 + const lzo_byte *m = m_pos + M2_MAX_LEN + 1;
28187 + while (ip < end && *m == *ip)
28188 + m++, ip++;
28189 + m_len = (ip - ii);
28190 + }
28191 + assert("lzo-11", m_len > M2_MAX_LEN);
28192 +
28193 + if (m_off <= M3_MAX_OFFSET) {
28194 + m_off -= 1;
28195 + if (m_len <= 33)
28196 + *op++ = LZO_BYTE(M3_MARKER | (m_len - 2));
28198 + else {
28199 + m_len -= 33;
28200 + *op++ = M3_MARKER | 0;
28201 + goto m3_m4_len;
28202 + }
28203 + } else {
28204 +#if defined(LZO1Y)
28205 + m4_match:
28206 +#endif
28207 + m_off -= 0x4000;
28208 + assert("lzo-12", m_off > 0);
28209 + assert("lzo-13", m_off <= 0x7fff);
28210 + if (m_len <= M4_MAX_LEN)
28211 + *op++ = LZO_BYTE(M4_MARKER |
28212 + ((m_off & 0x4000) >> 11) | (m_len - 2));
28214 + else {
28215 + m_len -= M4_MAX_LEN;
28216 + *op++ = LZO_BYTE(M4_MARKER | ((m_off & 0x4000) >> 11));
28219 + m3_m4_len:
28220 + while (m_len > 255) {
28221 + m_len -= 255;
28222 + *op++ = 0;
28223 + }
28224 + assert("lzo-14", m_len > 0);
28225 + *op++ = LZO_BYTE(m_len);
28226 + }
28227 + }
28228 +
28229 + m3_m4_offset:
28230 + *op++ = LZO_BYTE((m_off & 63) << 2);
28231 + *op++ = LZO_BYTE(m_off >> 6);
28232 + }
28233 +
28234 + ii = ip;
28235 + if (ip >= ip_end)
28236 + break;
28237 + }
28238 +
28239 + *out_len = op - out;
28240 + return pd(in_end, ii);
28241 +}
28242 +
28243 +LZO_PUBLIC(int)
28244 + DO_COMPRESS(const lzo_byte * in, lzo_uint in_len,
28245 + lzo_byte * out, lzo_uintp out_len, lzo_voidp wrkmem)
28246 +{
28247 + lzo_byte *op = out;
28248 + lzo_uint t;
28249 +
28250 +#if defined(__LZO_QUERY_COMPRESS)
28251 + if (__LZO_IS_COMPRESS_QUERY(in, in_len, out, out_len, wrkmem))
28252 + return __LZO_QUERY_COMPRESS(in, in_len, out, out_len, wrkmem,
28253 + D_SIZE, lzo_sizeof(lzo_dict_t));
28254 +#endif
28255 +
28256 + if (in_len <= M2_MAX_LEN + 5)
28257 + t = in_len;
28258 + else {
28259 + t = do_compress(in, in_len, op, out_len, wrkmem);
28260 + op += *out_len;
28261 + }
28262 +
28263 + if (t > 0) {
28264 + const lzo_byte *ii = in + in_len - t;
28265 +
28266 + if (op == out && t <= 238)
28267 + *op++ = LZO_BYTE(17 + t);
28268 + else if (t <= 3)
28269 + op[-2] |= LZO_BYTE(t);
28270 + else if (t <= 18)
28271 + *op++ = LZO_BYTE(t - 3);
28272 + else {
28273 + lzo_uint tt = t - 18;
28274 +
28275 + *op++ = 0;
28276 + while (tt > 255) {
28277 + tt -= 255;
28278 + *op++ = 0;
28279 + }
28280 + assert("lzo-15", tt > 0);
28281 + *op++ = LZO_BYTE(tt);
28282 + }
28283 + do
28284 + *op++ = *ii++;
28285 + while (--t > 0);
28286 + }
28287 +
28288 + *op++ = M4_MARKER | 1;
28289 + *op++ = 0;
28290 + *op++ = 0;
28291 +
28292 + *out_len = op - out;
28293 + return LZO_E_OK;
28294 +}
28295 +
28296 +#undef do_compress
28297 +#undef DO_COMPRESS
28298 +#undef LZO_HASH
28299 +
28300 +#undef LZO_TEST_DECOMPRESS_OVERRUN
28301 +#undef LZO_TEST_DECOMPRESS_OVERRUN_INPUT
28302 +#undef LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT
28303 +#undef LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND
28304 +#undef DO_DECOMPRESS
28305 +#define DO_DECOMPRESS lzo1x_decompress
28306 +
28307 +#if defined(LZO_TEST_DECOMPRESS_OVERRUN)
28308 +# if !defined(LZO_TEST_DECOMPRESS_OVERRUN_INPUT)
28309 +# define LZO_TEST_DECOMPRESS_OVERRUN_INPUT 2
28310 +# endif
28311 +# if !defined(LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT)
28312 +# define LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT 2
28313 +# endif
28314 +# if !defined(LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND)
28315 +# define LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND
28316 +# endif
28317 +#endif
28318 +
28319 +#undef TEST_IP
28320 +#undef TEST_OP
28321 +#undef TEST_LOOKBEHIND
28322 +#undef NEED_IP
28323 +#undef NEED_OP
28324 +#undef HAVE_TEST_IP
28325 +#undef HAVE_TEST_OP
28326 +#undef HAVE_NEED_IP
28327 +#undef HAVE_NEED_OP
28328 +#undef HAVE_ANY_IP
28329 +#undef HAVE_ANY_OP
28330 +
28331 +#if defined(LZO_TEST_DECOMPRESS_OVERRUN_INPUT)
28332 +# if (LZO_TEST_DECOMPRESS_OVERRUN_INPUT >= 1)
28333 +# define TEST_IP (ip < ip_end)
28334 +# endif
28335 +# if (LZO_TEST_DECOMPRESS_OVERRUN_INPUT >= 2)
28336 +# define NEED_IP(x) \
28337 + if ((lzo_uint)(ip_end - ip) < (lzo_uint)(x)) goto input_overrun
28338 +# endif
28339 +#endif
28340 +
28341 +#if defined(LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT)
28342 +# if (LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT >= 1)
28343 +# define TEST_OP (op <= op_end)
28344 +# endif
28345 +# if (LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT >= 2)
28346 +# undef TEST_OP
28347 +# define NEED_OP(x) \
28348 + if ((lzo_uint)(op_end - op) < (lzo_uint)(x)) goto output_overrun
28349 +# endif
28350 +#endif
28351 +
28352 +#if defined(LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND)
28353 +# define TEST_LOOKBEHIND(m_pos,out) if (m_pos < out) goto lookbehind_overrun
28354 +#else
28355 +# define TEST_LOOKBEHIND(m_pos,op) ((void) 0)
28356 +#endif
28357 +
28358 +#if !defined(LZO_EOF_CODE) && !defined(TEST_IP)
28359 +# define TEST_IP (ip < ip_end)
28360 +#endif
28361 +
28362 +#if defined(TEST_IP)
28363 +# define HAVE_TEST_IP
28364 +#else
28365 +# define TEST_IP 1
28366 +#endif
28367 +#if defined(TEST_OP)
28368 +# define HAVE_TEST_OP
28369 +#else
28370 +# define TEST_OP 1
28371 +#endif
28372 +
28373 +#if defined(NEED_IP)
28374 +# define HAVE_NEED_IP
28375 +#else
28376 +# define NEED_IP(x) ((void) 0)
28377 +#endif
28378 +#if defined(NEED_OP)
28379 +# define HAVE_NEED_OP
28380 +#else
28381 +# define NEED_OP(x) ((void) 0)
28382 +#endif
28383 +
28384 +#if defined(HAVE_TEST_IP) || defined(HAVE_NEED_IP)
28385 +# define HAVE_ANY_IP
28386 +#endif
28387 +#if defined(HAVE_TEST_OP) || defined(HAVE_NEED_OP)
28388 +# define HAVE_ANY_OP
28389 +#endif
28390 +
28391 +#undef __COPY4
28392 +#define __COPY4(dst,src) * (lzo_uint32p)(dst) = * (const lzo_uint32p)(src)
28393 +
28394 +#undef COPY4
28395 +#if defined(LZO_UNALIGNED_OK_4)
28396 +# define COPY4(dst,src) __COPY4(dst,src)
28397 +#elif defined(LZO_ALIGNED_OK_4)
28398 +# define COPY4(dst,src) __COPY4((lzo_ptr_t)(dst),(lzo_ptr_t)(src))
28399 +#endif
28400 +
28401 +#if defined(DO_DECOMPRESS)
28402 +LZO_PUBLIC(int)
28403 + DO_DECOMPRESS(const lzo_byte * in, lzo_uint in_len,
28404 + lzo_byte * out, lzo_uintp out_len, lzo_voidp wrkmem)
28405 +#endif
28406 +{
28407 + register lzo_byte *op;
28408 + register const lzo_byte *ip;
28409 + register lzo_uint t;
28410 +#if defined(COPY_DICT)
28411 + lzo_uint m_off;
28412 + const lzo_byte *dict_end;
28413 +#else
28414 + register const lzo_byte *m_pos;
28415 +#endif
28416 +
28417 + const lzo_byte *const ip_end = in + in_len;
28418 +#if defined(HAVE_ANY_OP)
28419 + lzo_byte *const op_end = out + *out_len;
28420 +#endif
28421 +#if defined(LZO1Z)
28422 + lzo_uint last_m_off = 0;
28423 +#endif
28424 +
28425 + LZO_UNUSED(wrkmem);
28426 +
28427 +#if defined(__LZO_QUERY_DECOMPRESS)
28428 + if (__LZO_IS_DECOMPRESS_QUERY(in, in_len, out, out_len, wrkmem))
28429 + return __LZO_QUERY_DECOMPRESS(in, in_len, out, out_len, wrkmem,
28430 + 0, 0);
28431 +#endif
28432 +
28433 +#if defined(COPY_DICT)
28434 + if (dict) {
28435 + if (dict_len > M4_MAX_OFFSET) {
28436 + dict += dict_len - M4_MAX_OFFSET;
28437 + dict_len = M4_MAX_OFFSET;
28438 + }
28439 + dict_end = dict + dict_len;
28440 + } else {
28441 + dict_len = 0;
28442 + dict_end = NULL;
28443 + }
28444 +#endif
28445 +
28446 + *out_len = 0;
28447 +
28448 + op = out;
28449 + ip = in;
28450 +
28451 + if (*ip > 17) {
28452 + t = *ip++ - 17;
28453 + if (t < 4)
28454 + goto match_next;
28455 + assert("lzo-16", t > 0);
28456 + NEED_OP(t);
28457 + NEED_IP(t + 1);
28458 + do
28459 + *op++ = *ip++;
28460 + while (--t > 0);
28461 + goto first_literal_run;
28462 + }
28463 +
28464 + while (TEST_IP && TEST_OP) {
28465 + t = *ip++;
28466 + if (t >= 16)
28467 + goto match;
28468 + if (t == 0) {
28469 + NEED_IP(1);
28470 + while (*ip == 0) {
28471 + t += 255;
28472 + ip++;
28473 + NEED_IP(1);
28474 + }
28475 + t += 15 + *ip++;
28476 + }
28477 + assert("lzo-17", t > 0);
28478 + NEED_OP(t + 3);
28479 + NEED_IP(t + 4);
28480 +#if defined(LZO_UNALIGNED_OK_4) || defined(LZO_ALIGNED_OK_4)
28481 +#if !defined(LZO_UNALIGNED_OK_4)
28482 + if (PTR_ALIGNED2_4(op, ip)) {
28483 +#endif
28484 + COPY4(op, ip);
28485 + op += 4;
28486 + ip += 4;
28487 + if (--t > 0) {
28488 + if (t >= 4) {
28489 + do {
28490 + COPY4(op, ip);
28491 + op += 4;
28492 + ip += 4;
28493 + t -= 4;
28494 + } while (t >= 4);
28495 + if (t > 0)
28496 + do
28497 + *op++ = *ip++;
28498 + while (--t > 0);
28499 + } else
28500 + do
28501 + *op++ = *ip++;
28502 + while (--t > 0);
28503 + }
28504 +#if !defined(LZO_UNALIGNED_OK_4)
28505 + } else
28506 +#endif
28507 +#endif
28508 +#if !defined(LZO_UNALIGNED_OK_4)
28509 + {
28510 + *op++ = *ip++;
28511 + *op++ = *ip++;
28512 + *op++ = *ip++;
28513 + do
28514 + *op++ = *ip++;
28515 + while (--t > 0);
28516 + }
28517 +#endif
28518 +
28519 + first_literal_run:
28520 +
28521 + t = *ip++;
28522 + if (t >= 16)
28523 + goto match;
28524 +#if defined(COPY_DICT)
28525 +#if defined(LZO1Z)
28526 + m_off = (1 + M2_MAX_OFFSET) + (t << 6) + (*ip++ >> 2);
28527 + last_m_off = m_off;
28528 +#else
28529 + m_off = (1 + M2_MAX_OFFSET) + (t >> 2) + (*ip++ << 2);
28530 +#endif
28531 + NEED_OP(3);
28532 + t = 3;
28533 + COPY_DICT(t, m_off)
28534 +#else
28535 +#if defined(LZO1Z)
28536 + t = (1 + M2_MAX_OFFSET) + (t << 6) + (*ip++ >> 2);
28537 + m_pos = op - t;
28538 + last_m_off = t;
28539 +#else
28540 + m_pos = op - (1 + M2_MAX_OFFSET);
28541 + m_pos -= t >> 2;
28542 + m_pos -= *ip++ << 2;
28543 +#endif
28544 + TEST_LOOKBEHIND(m_pos, out);
28545 + NEED_OP(3);
28546 + *op++ = *m_pos++;
28547 + *op++ = *m_pos++;
28548 + *op++ = *m_pos;
28549 +#endif
28550 + goto match_done;
28551 +
28552 + while (TEST_IP && TEST_OP) {
28553 + match:
28554 + if (t >= 64) {
28555 +#if defined(COPY_DICT)
28556 +#if defined(LZO1X)
28557 + m_off = 1 + ((t >> 2) & 7) + (*ip++ << 3);
28558 + t = (t >> 5) - 1;
28559 +#elif defined(LZO1Y)
28560 + m_off = 1 + ((t >> 2) & 3) + (*ip++ << 2);
28561 + t = (t >> 4) - 3;
28562 +#elif defined(LZO1Z)
28563 + m_off = t & 0x1f;
28564 + if (m_off >= 0x1c)
28565 + m_off = last_m_off;
28566 + else {
28567 + m_off = 1 + (m_off << 6) + (*ip++ >> 2);
28568 + last_m_off = m_off;
28569 + }
28570 + t = (t >> 5) - 1;
28571 +#endif
28572 +#else
28573 +#if defined(LZO1X)
28574 + m_pos = op - 1;
28575 + m_pos -= (t >> 2) & 7;
28576 + m_pos -= *ip++ << 3;
28577 + t = (t >> 5) - 1;
28578 +#elif defined(LZO1Y)
28579 + m_pos = op - 1;
28580 + m_pos -= (t >> 2) & 3;
28581 + m_pos -= *ip++ << 2;
28582 + t = (t >> 4) - 3;
28583 +#elif defined(LZO1Z)
28584 + {
28585 + lzo_uint off = t & 0x1f;
28586 + m_pos = op;
28587 + if (off >= 0x1c) {
28588 + assert(last_m_off > 0);
28589 + m_pos -= last_m_off;
28590 + } else {
28591 + off = 1 + (off << 6) + (*ip++ >> 2);
28594 + m_pos -= off;
28595 + last_m_off = off;
28596 + }
28597 + }
28598 + t = (t >> 5) - 1;
28599 +#endif
28600 + TEST_LOOKBEHIND(m_pos, out);
28601 + assert("lzo-18", t > 0);
28602 + NEED_OP(t + 3 - 1);
28603 + goto copy_match;
28604 +#endif
28605 + } else if (t >= 32) {
28606 + t &= 31;
28607 + if (t == 0) {
28608 + NEED_IP(1);
28609 + while (*ip == 0) {
28610 + t += 255;
28611 + ip++;
28612 + NEED_IP(1);
28613 + }
28614 + t += 31 + *ip++;
28615 + }
28616 +#if defined(COPY_DICT)
28617 +#if defined(LZO1Z)
28618 + m_off = 1 + (ip[0] << 6) + (ip[1] >> 2);
28619 + last_m_off = m_off;
28620 +#else
28621 + m_off = 1 + (ip[0] >> 2) + (ip[1] << 6);
28622 +#endif
28623 +#else
28624 +#if defined(LZO1Z)
28625 + {
28626 + lzo_uint off = 1 + (ip[0] << 6) + (ip[1] >> 2);
28628 + m_pos = op - off;
28629 + last_m_off = off;
28630 + }
28631 +#elif defined(LZO_UNALIGNED_OK_2) && (LZO_BYTE_ORDER == LZO_LITTLE_ENDIAN)
28632 + m_pos = op - 1;
28633 + m_pos -= (*(const lzo_ushortp)ip) >> 2;
28634 +#else
28635 + m_pos = op - 1;
28636 + m_pos -= (ip[0] >> 2) + (ip[1] << 6);
28637 +#endif
28638 +#endif
28639 + ip += 2;
28640 + } else if (t >= 16) {
28641 +#if defined(COPY_DICT)
28642 + m_off = (t & 8) << 11;
28643 +#else
28644 + m_pos = op;
28645 + m_pos -= (t & 8) << 11;
28646 +#endif
28647 + t &= 7;
28648 + if (t == 0) {
28649 + NEED_IP(1);
28650 + while (*ip == 0) {
28651 + t += 255;
28652 + ip++;
28653 + NEED_IP(1);
28654 + }
28655 + t += 7 + *ip++;
28656 + }
28657 +#if defined(COPY_DICT)
28658 +#if defined(LZO1Z)
28659 + m_off += (ip[0] << 6) + (ip[1] >> 2);
28660 +#else
28661 + m_off += (ip[0] >> 2) + (ip[1] << 6);
28662 +#endif
28663 + ip += 2;
28664 + if (m_off == 0)
28665 + goto eof_found;
28666 + m_off += 0x4000;
28667 +#if defined(LZO1Z)
28668 + last_m_off = m_off;
28669 +#endif
28670 +#else
28671 +#if defined(LZO1Z)
28672 + m_pos -= (ip[0] << 6) + (ip[1] >> 2);
28673 +#elif defined(LZO_UNALIGNED_OK_2) && (LZO_BYTE_ORDER == LZO_LITTLE_ENDIAN)
28674 + m_pos -= (*(const lzo_ushortp)ip) >> 2;
28675 +#else
28676 + m_pos -= (ip[0] >> 2) + (ip[1] << 6);
28677 +#endif
28678 + ip += 2;
28679 + if (m_pos == op)
28680 + goto eof_found;
28681 + m_pos -= 0x4000;
28682 +#if defined(LZO1Z)
28683 + last_m_off = op - m_pos;
28684 +#endif
28685 +#endif
28686 + } else {
28687 +#if defined(COPY_DICT)
28688 +#if defined(LZO1Z)
28689 + m_off = 1 + (t << 6) + (*ip++ >> 2);
28690 + last_m_off = m_off;
28691 +#else
28692 + m_off = 1 + (t >> 2) + (*ip++ << 2);
28693 +#endif
28694 + NEED_OP(2);
28695 + t = 2;
28696 + COPY_DICT(t, m_off)
28697 +#else
28698 +#if defined(LZO1Z)
28699 + t = 1 + (t << 6) + (*ip++ >> 2);
28700 + m_pos = op - t;
28701 + last_m_off = t;
28702 +#else
28703 + m_pos = op - 1;
28704 + m_pos -= t >> 2;
28705 + m_pos -= *ip++ << 2;
28706 +#endif
28707 + TEST_LOOKBEHIND(m_pos, out);
28708 + NEED_OP(2);
28709 + *op++ = *m_pos++;
28710 + *op++ = *m_pos;
28711 +#endif
28712 + goto match_done;
28713 + }
28714 +
28715 +#if defined(COPY_DICT)
28716 +
28717 + NEED_OP(t + 3 - 1);
28718 + t += 3 - 1;
28719 + COPY_DICT(t, m_off)
28720 +#else
28721 +
28722 + TEST_LOOKBEHIND(m_pos, out);
28723 + assert("lzo-19", t > 0);
28724 + NEED_OP(t + 3 - 1);
28725 +#if defined(LZO_UNALIGNED_OK_4) || defined(LZO_ALIGNED_OK_4)
28726 +#if !defined(LZO_UNALIGNED_OK_4)
28727 + if (t >= 2 * 4 - (3 - 1) && PTR_ALIGNED2_4(op, m_pos)) {
28728 + assert((op - m_pos) >= 4);
28729 +#else
28730 + if (t >= 2 * 4 - (3 - 1) && (op - m_pos) >= 4) {
28731 +#endif
28732 + COPY4(op, m_pos);
28733 + op += 4;
28734 + m_pos += 4;
28735 + t -= 4 - (3 - 1);
28736 + do {
28737 + COPY4(op, m_pos);
28738 + op += 4;
28739 + m_pos += 4;
28740 + t -= 4;
28741 + } while (t >= 4);
28742 + if (t > 0)
28743 + do
28744 + *op++ = *m_pos++;
28745 + while (--t > 0);
28746 + } else
28747 +#endif
28748 + {
28749 + copy_match:
28750 + *op++ = *m_pos++;
28751 + *op++ = *m_pos++;
28752 + do
28753 + *op++ = *m_pos++;
28754 + while (--t > 0);
28755 + }
28756 +
28757 +#endif
28758 +
28759 + match_done:
28760 +#if defined(LZO1Z)
28761 + t = ip[-1] & 3;
28762 +#else
28763 + t = ip[-2] & 3;
28764 +#endif
28765 + if (t == 0)
28766 + break;
28767 +
28768 + match_next:
28769 + assert("lzo-20", t > 0);
28770 + NEED_OP(t);
28771 + NEED_IP(t + 1);
28772 + do
28773 + *op++ = *ip++;
28774 + while (--t > 0);
28775 + t = *ip++;
28776 + }
28777 + }
28778 +
28779 +#if defined(HAVE_TEST_IP) || defined(HAVE_TEST_OP)
28780 + *out_len = op - out;
28781 + return LZO_E_EOF_NOT_FOUND;
28782 +#endif
28783 +
28784 + eof_found:
28785 + assert("lzo-21", t == 1);
28786 + *out_len = op - out;
28787 + return (ip == ip_end ? LZO_E_OK :
28788 + (ip < ip_end ? LZO_E_INPUT_NOT_CONSUMED : LZO_E_INPUT_OVERRUN));
28789 +
28790 +#if defined(HAVE_NEED_IP)
28791 + input_overrun:
28792 + *out_len = op - out;
28793 + return LZO_E_INPUT_OVERRUN;
28794 +#endif
28795 +
28796 +#if defined(HAVE_NEED_OP)
28797 + output_overrun:
28798 + *out_len = op - out;
28799 + return LZO_E_OUTPUT_OVERRUN;
28800 +#endif
28801 +
28802 +#if defined(LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND)
28803 + lookbehind_overrun:
28804 + *out_len = op - out;
28805 + return LZO_E_LOOKBEHIND_OVERRUN;
28806 +#endif
28807 +}
28808 +
28809 +#define LZO_TEST_DECOMPRESS_OVERRUN
28810 +#undef DO_DECOMPRESS
28811 +#define DO_DECOMPRESS lzo1x_decompress_safe
28812 +
28813 +#if defined(LZO_TEST_DECOMPRESS_OVERRUN)
28814 +# if !defined(LZO_TEST_DECOMPRESS_OVERRUN_INPUT)
28815 +# define LZO_TEST_DECOMPRESS_OVERRUN_INPUT 2
28816 +# endif
28817 +# if !defined(LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT)
28818 +# define LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT 2
28819 +# endif
28820 +# if !defined(LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND)
28821 +# define LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND
28822 +# endif
28823 +#endif
28824 +
28825 +#undef TEST_IP
28826 +#undef TEST_OP
28827 +#undef TEST_LOOKBEHIND
28828 +#undef NEED_IP
28829 +#undef NEED_OP
28830 +#undef HAVE_TEST_IP
28831 +#undef HAVE_TEST_OP
28832 +#undef HAVE_NEED_IP
28833 +#undef HAVE_NEED_OP
28834 +#undef HAVE_ANY_IP
28835 +#undef HAVE_ANY_OP
28836 +
28837 +#if defined(LZO_TEST_DECOMPRESS_OVERRUN_INPUT)
28838 +# if (LZO_TEST_DECOMPRESS_OVERRUN_INPUT >= 1)
28839 +# define TEST_IP (ip < ip_end)
28840 +# endif
28841 +# if (LZO_TEST_DECOMPRESS_OVERRUN_INPUT >= 2)
28842 +# define NEED_IP(x) \
28843 + if ((lzo_uint)(ip_end - ip) < (lzo_uint)(x)) goto input_overrun
28844 +# endif
28845 +#endif
28846 +
28847 +#if defined(LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT)
28848 +# if (LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT >= 1)
28849 +# define TEST_OP (op <= op_end)
28850 +# endif
28851 +# if (LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT >= 2)
28852 +# undef TEST_OP
28853 +# define NEED_OP(x) \
28854 + if ((lzo_uint)(op_end - op) < (lzo_uint)(x)) goto output_overrun
28855 +# endif
28856 +#endif
28857 +
28858 +#if defined(LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND)
28859 +# define TEST_LOOKBEHIND(m_pos,out) if (m_pos < out) goto lookbehind_overrun
28860 +#else
28861 +# define TEST_LOOKBEHIND(m_pos,op) ((void) 0)
28862 +#endif
28863 +
28864 +#if !defined(LZO_EOF_CODE) && !defined(TEST_IP)
28865 +# define TEST_IP (ip < ip_end)
28866 +#endif
28867 +
28868 +#if defined(TEST_IP)
28869 +# define HAVE_TEST_IP
28870 +#else
28871 +# define TEST_IP 1
28872 +#endif
28873 +#if defined(TEST_OP)
28874 +# define HAVE_TEST_OP
28875 +#else
28876 +# define TEST_OP 1
28877 +#endif
28878 +
28879 +#if defined(NEED_IP)
28880 +# define HAVE_NEED_IP
28881 +#else
28882 +# define NEED_IP(x) ((void) 0)
28883 +#endif
28884 +#if defined(NEED_OP)
28885 +# define HAVE_NEED_OP
28886 +#else
28887 +# define NEED_OP(x) ((void) 0)
28888 +#endif
28889 +
28890 +#if defined(HAVE_TEST_IP) || defined(HAVE_NEED_IP)
28891 +# define HAVE_ANY_IP
28892 +#endif
28893 +#if defined(HAVE_TEST_OP) || defined(HAVE_NEED_OP)
28894 +# define HAVE_ANY_OP
28895 +#endif
28896 +
28897 +#undef __COPY4
28898 +#define __COPY4(dst,src) * (lzo_uint32p)(dst) = * (const lzo_uint32p)(src)
28899 +
28900 +#undef COPY4
28901 +#if defined(LZO_UNALIGNED_OK_4)
28902 +# define COPY4(dst,src) __COPY4(dst,src)
28903 +#elif defined(LZO_ALIGNED_OK_4)
28904 +# define COPY4(dst,src) __COPY4((lzo_ptr_t)(dst),(lzo_ptr_t)(src))
28905 +#endif
28906 +
28907 +/***** End of minilzo.c *****/
28908 Index: linux-2.6.16/fs/reiser4/plugin/compress/minilzo.h
28909 ===================================================================
28910 --- /dev/null
28911 +++ linux-2.6.16/fs/reiser4/plugin/compress/minilzo.h
28912 @@ -0,0 +1,94 @@
28913 +/* minilzo.h -- mini subset of the LZO real-time data compression library
28914 + adapted for the reiser4 compression transform plugin.
28915 +
28916 + This file is part of the LZO real-time data compression library
28917 + and is not included in any proprietary licenses of reiser4.
28918 +
28919 + Copyright (C) 2002 Markus Franz Xaver Johannes Oberhumer
28920 + Copyright (C) 2001 Markus Franz Xaver Johannes Oberhumer
28921 + Copyright (C) 2000 Markus Franz Xaver Johannes Oberhumer
28922 + Copyright (C) 1999 Markus Franz Xaver Johannes Oberhumer
28923 + Copyright (C) 1998 Markus Franz Xaver Johannes Oberhumer
28924 + Copyright (C) 1997 Markus Franz Xaver Johannes Oberhumer
28925 + Copyright (C) 1996 Markus Franz Xaver Johannes Oberhumer
28926 + All Rights Reserved.
28927 +
28928 + The LZO library is free software; you can redistribute it and/or
28929 + modify it under the terms of the GNU General Public License as
28930 + published by the Free Software Foundation; either version 2 of
28931 + the License, or (at your option) any later version.
28932 +
28933 + The LZO library is distributed in the hope that it will be useful,
28934 + but WITHOUT ANY WARRANTY; without even the implied warranty of
28935 + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
28936 + GNU General Public License for more details.
28937 +
28938 + You should have received a copy of the GNU General Public License
28939 + along with the LZO library; see the file COPYING.
28940 + If not, write to the Free Software Foundation, Inc.,
28941 + 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
28942 +
28943 + Markus F.X.J. Oberhumer
28944 + <markus@oberhumer.com>
28945 + http://www.oberhumer.com/opensource/lzo/
28946 + */
28947 +
28948 +/*
28949 + * NOTE:
28950 + * the full LZO package can be found at
28951 + * http://www.oberhumer.com/opensource/lzo/
28952 + */
28953 +
28954 +#ifndef __MINILZO_H
28955 +#define __MINILZO_H
28956 +
28957 +#define MINILZO_VERSION 0x1080
28958 +
28959 +#ifdef __LZOCONF_H
28960 +# error "you cannot use both LZO and miniLZO"
28961 +#endif
28962 +
28963 +#undef LZO_HAVE_CONFIG_H
28964 +#include "lzoconf.h"
28965 +
28966 +#if !defined(LZO_VERSION) || (LZO_VERSION != MINILZO_VERSION)
28967 +# error "version mismatch in header files"
28968 +#endif
28969 +
28970 +#ifdef __cplusplus
28971 +extern "C" {
28972 +#endif
28973 +
28974 +/***********************************************************************
28975 +//
28976 +************************************************************************/
28977 +
28978 +/* Memory required for the wrkmem parameter.
28979 + * When the required size is 0, you can also pass a NULL pointer.
28980 + */
28981 +
28982 +#define LZO1X_MEM_COMPRESS LZO1X_1_MEM_COMPRESS
28983 +#define LZO1X_1_MEM_COMPRESS ((lzo_uint32) (16384L * lzo_sizeof_dict_t))
28984 +#define LZO1X_MEM_DECOMPRESS (0)
28985 +
28986 +/* compression */
28987 + LZO_EXTERN(int)
28988 + lzo1x_1_compress(const lzo_byte * src, lzo_uint src_len,
28989 + lzo_byte * dst, lzo_uintp dst_len, lzo_voidp wrkmem);
28990 +
28991 +/* decompression */
28992 + LZO_EXTERN(int)
28993 + lzo1x_decompress(const lzo_byte * src, lzo_uint src_len,
28994 + lzo_byte * dst, lzo_uintp dst_len,
28995 + lzo_voidp wrkmem /* NOT USED */ );
28996 +
28997 +/* safe decompression with overrun testing */
28998 + LZO_EXTERN(int)
28999 + lzo1x_decompress_safe(const lzo_byte * src, lzo_uint src_len,
29000 + lzo_byte * dst, lzo_uintp dst_len,
29001 + lzo_voidp wrkmem /* NOT USED */ );
29002 +
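+/* A minimal usage sketch (editorial illustration, not part of the patch):
+ compression needs LZO1X_1_MEM_COMPRESS bytes of work memory, while
+ decompression needs none, so its wrkmem may be NULL. The output buffer
+ sizing of src_len + src_len/16 + 64 + 3 and the lzo_init() entry point are
+ assumptions taken from the full LZO distribution, not from this header. */
+#if 0 /* illustration only */
+#include <linux/vmalloc.h>
+static int example_roundtrip(const lzo_byte * src, lzo_uint src_len,
+ lzo_byte * dst, lzo_byte * out, lzo_uint out_size)
+{
+ int ret;
+ lzo_uint dst_len, out_len = out_size;
+ lzo_byte *wrkmem = vmalloc(LZO1X_1_MEM_COMPRESS);
+
+ if (wrkmem == NULL)
+ return LZO_E_OUT_OF_MEMORY;
+ /* dst must hold at least src_len + src_len / 16 + 64 + 3 bytes */
+ ret = lzo1x_1_compress(src, src_len, dst, &dst_len, wrkmem);
+ vfree(wrkmem);
+ if (ret != LZO_E_OK)
+ return ret;
+ /* the safe variant checks input/output/lookbehind overruns */
+ return lzo1x_decompress_safe(dst, dst_len, out, &out_len, NULL);
+}
+#endif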
29003 +#ifdef __cplusplus
29004 +} /* extern "C" */
29005 +#endif
29006 +#endif /* already included */
29007 Index: linux-2.6.16/fs/reiser4/plugin/crypto/cipher.c
29008 ===================================================================
29009 --- /dev/null
29010 +++ linux-2.6.16/fs/reiser4/plugin/crypto/cipher.c
29011 @@ -0,0 +1,116 @@
29012 +/* Copyright 2001, 2002, 2003 by Hans Reiser,
29013 + licensing governed by reiser4/README */
29014 +/* Reiser4 cipher transform plugins */
29015 +
29016 +#include "../../debug.h"
29017 +#include "../plugin.h"
29018 +#include "../file/cryptcompress.h"
29019 +#include <linux/types.h>
29020 +#include <linux/random.h>
29021 +
29022 +#define MIN_CIPHER_BLOCKSIZE 8
29023 +#define MAX_CIPHER_BLOCKSIZE 128
29024 +
29025 +/*
29026 + Default align() method of the cipher plugin (see the description of this
29027 + method in plugin/plugin.h)
29028 +
29029 + 1) creates the aligning armored format of the input flow before encryption.
29030 + "armored" means that the padding carries no information about the flow
29031 + contents; here it is filled with a pseudo-random sequence of bytes.
29032 + 2) returns the length of the appended padding
29033 +
29034 + [ flow | aligning_padding ]
29035 + ^
29036 + |
29037 + @pad
29038 +*/
29039 +static int align_stream_common(__u8 * pad,
29040 + int flow_size /* size of non-aligned flow */,
29041 + int blocksize /* cipher block size */)
29042 +{
29043 + int pad_size;
29044 +
29045 + assert("edward-01", pad != NULL);
29046 + assert("edward-02", flow_size != 0);
29047 + assert("edward-03", blocksize != 0
29048 + || blocksize <= MAX_CIPHER_BLOCKSIZE);
29049 +
29050 + pad_size = blocksize - (flow_size % blocksize);
29051 + get_random_bytes(pad, pad_size);
29052 + return pad_size;
29053 +}
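+/* Worked example of the arithmetic above: for flow_size = 100 and
+ blocksize = 16, pad_size = 16 - (100 % 16) = 12, so the armored flow
+ occupies 112 bytes, a whole number of cipher blocks. Note that an
+ already aligned flow (flow_size % blocksize == 0) still receives a
+ full block of padding, so pad_size is always in [1, blocksize]. */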
29054 +
29055 +/* This is used for all the cipher algorithms which do not inflate
29056 + block-aligned data */
29057 +static loff_t scale_common(struct inode *inode, size_t blocksize,
29058 + loff_t src_off /* offset to scale */ )
29059 +{
29060 + return src_off;
29061 +}
29062 +
29063 +static void free_aes (struct crypto_tfm * tfm)
29064 +{
29065 +#if REISER4_AES
29066 + crypto_free_tfm(tfm);
29067 +#endif
29068 + return;
29069 +}
29070 +
29071 +static struct crypto_tfm * alloc_aes (void)
29072 +{
29073 +#if REISER4_AES
29074 + return crypto_alloc_tfm ("aes", 0);
29075 +#else
29076 + warning("edward-1417", "aes unsupported");
29077 + return ERR_PTR(-EINVAL);
29078 +#endif /* REISER4_AES */
29079 +}
29080 +
29081 +cipher_plugin cipher_plugins[LAST_CIPHER_ID] = {
29082 + [NONE_CIPHER_ID] = {
29083 + .h = {
29084 + .type_id = REISER4_CIPHER_PLUGIN_TYPE,
29085 + .id = NONE_CIPHER_ID,
29086 + .pops = NULL,
29087 + .label = "none",
29088 + .desc = "no cipher transform",
29089 + .linkage = {NULL, NULL}
29090 + },
29091 + .alloc = NULL,
29092 + .free = NULL,
29093 + .scale = NULL,
29094 + .align_stream = NULL,
29095 + .setkey = NULL,
29096 + .encrypt = NULL,
29097 + .decrypt = NULL
29098 + },
29099 + [AES_CIPHER_ID] = {
29100 + .h = {
29101 + .type_id = REISER4_CIPHER_PLUGIN_TYPE,
29102 + .id = AES_CIPHER_ID,
29103 + .pops = NULL,
29104 + .label = "aes",
29105 + .desc = "aes cipher transform",
29106 + .linkage = {NULL, NULL}
29107 + },
29108 + .alloc = alloc_aes,
29109 + .free = free_aes,
29110 + .scale = scale_common,
29111 + .align_stream = align_stream_common,
29112 + .setkey = NULL,
29113 + .encrypt = NULL,
29114 + .decrypt = NULL
29115 + }
29116 +};
29117 +
29118 +/* Make Linus happy.
29119 + Local variables:
29120 + c-indentation-style: "K&R"
29121 + mode-name: "LC"
29122 + c-basic-offset: 8
29123 + tab-width: 8
29124 + fill-column: 120
29125 + scroll-step: 1
29126 + End:
29127 +*/
29128 Index: linux-2.6.16/fs/reiser4/plugin/crypto/cipher.h
29129 ===================================================================
29130 --- /dev/null
29131 +++ linux-2.6.16/fs/reiser4/plugin/crypto/cipher.h
29132 @@ -0,0 +1,67 @@
29133 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
29134 +/* This file contains definitions for the objects operated on
29135 + by the reiser4 key manager, which is something like a keyring
29136 + wrapped by an appropriate reiser4 plugin */
29137 +
29138 +#if !defined( __FS_REISER4_CRYPT_H__ )
29139 +#define __FS_REISER4_CRYPT_H__
29140 +
29141 +#include <linux/crypto.h>
29142 +
29143 +
29144 +/* Transform actions involved in the ciphering process and
29145 + supported by reiser4 via appropriate transform plugins */
29146 +typedef enum {
29147 + CIPHER_TFM, /* cipher transform */
29148 + DIGEST_TFM, /* digest transform */
29149 + LAST_TFM
29150 +} reiser4_tfm;
29151 +
29152 +/* This represents a transform action in reiser4 */
29153 +typedef struct reiser4_tfma {
29154 + reiser4_plugin * plug; /* transform plugin */
29155 + struct crypto_tfm * tfm; /* low-level info, operated by
29156 + linux crypto-api (see linux/crypto) */
29157 +} reiser4_tfma_t;
29158 +
29159 +/* key info imported from user space */
29160 +typedef struct crypto_data {
29161 + int keysize; /* uninstantiated key size */
29162 + __u8 * key; /* uninstantiated key */
29163 + int keyid_size; /* size of passphrase */
29164 + __u8 * keyid; /* passphrase */
29165 +} crypto_data_t;
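+/* A hypothetical sketch (not part of reiser4) of filling this structure
+ when a key is imported from user space; only the field meanings above
+ are assumed: */
+#if 0 /* illustration only */
+static void fill_crypto_data(crypto_data_t * data,
+ __u8 * key, int keysize,
+ __u8 * passphrase, int phrase_len)
+{
+ data->keysize = keysize; /* uninstantiated key size */
+ data->key = key; /* uninstantiated key */
+ data->keyid_size = phrase_len; /* passphrase length */
+ data->keyid = passphrase; /* passphrase itself */
+}
+#endif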
29166 +
29167 +/* This object contains all the infrastructure needed to implement a
29168 + cipher transform. It is managed (allocated, inherited, validated,
29169 + bound to a host inode, etc.) by the reiser4 key manager.
29170 +
29171 + This info can be allocated in two cases:
29172 + 1. when importing a key from user space;
29173 + 2. when reading an inode from disk */
29174 +typedef struct crypto_stat {
29175 + reiser4_tfma_t tfma[LAST_TFM];
29176 +// cipher_key_plugin * kplug; /* key manager */
29177 + __u8 * keyid; /* key fingerprint, created by digest plugin,
29178 + using uninstantiated key and passphrase.
29179 + supposed to be stored in disk stat-data */
29180 + int inst; /* this indicates if the cipher key is
29181 + instantiated (case 1 above) */
29182 + int keysize; /* uninstantiated key size (bytes), supposed
29183 + to be stored in disk stat-data */
29184 + int keyload_count; /* number of objects which have this
29185 + crypto-stat attached */
29186 +} crypto_stat_t;
29187 +
29188 +#endif /* __FS_REISER4_CRYPT_H__ */
29189 +
29190 +/*
29191 + Local variables:
29192 + c-indentation-style: "K&R"
29193 + mode-name: "LC"
29194 + c-basic-offset: 8
29195 + tab-width: 8
29196 + fill-column: 120
29197 + scroll-step: 1
29198 + End:
29199 +*/
29200 Index: linux-2.6.16/fs/reiser4/plugin/crypto/digest.c
29201 ===================================================================
29202 --- /dev/null
29203 +++ linux-2.6.16/fs/reiser4/plugin/crypto/digest.c
29204 @@ -0,0 +1,58 @@
29205 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
29206 +
29207 +/* reiser4 digest transform plugin (used by the cryptcompress object plugin).
29208 + A digest is a fixed-size cryptographic hash of a data stream; reiser4 uses it to build key fingerprints (see keyid in plugin/crypto/cipher.h). */
29209 +#include "../../debug.h"
29210 +#include "../plugin_header.h"
29211 +#include "../plugin.h"
29212 +#include "../file/cryptcompress.h"
29213 +
29214 +#include <linux/types.h>
29215 +
29216 +extern digest_plugin digest_plugins[LAST_DIGEST_ID];
29217 +
29218 +static struct crypto_tfm * alloc_sha256 (void)
29219 +{
29220 +#if REISER4_SHA256
29221 + return crypto_alloc_tfm ("sha256", 0);
29222 +#else
29223 + warning("edward-1418", "sha256 unsupported");
29224 + return ERR_PTR(-EINVAL);
29225 +#endif
29226 +}
29227 +
29228 +static void free_sha256 (struct crypto_tfm * tfm)
29229 +{
29230 +#if REISER4_SHA256
29231 + crypto_free_tfm(tfm);
29232 +#endif
29233 + return;
29234 +}
29235 +
29236 +/* digest plugins */
29237 +digest_plugin digest_plugins[LAST_DIGEST_ID] = {
29238 + [SHA256_32_DIGEST_ID] = {
29239 + .h = {
29240 + .type_id = REISER4_DIGEST_PLUGIN_TYPE,
29241 + .id = SHA256_32_DIGEST_ID,
29242 + .pops = NULL,
29243 + .label = "sha256_32",
29244 + .desc = "sha256_32 digest transform",
29245 + .linkage = {NULL, NULL}
29246 + },
29247 + .fipsize = sizeof(__u32),
29248 + .alloc = alloc_sha256,
29249 + .free = free_sha256
29250 + }
29251 +};
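+/* A hedged sketch (not part of reiser4) of driving the digest tfm allocated
+ above with the 2.6-era crypto API to get the 32-bit fingerprint implied by
+ .fipsize; the helper name and error handling are illustrative: */
+#if 0 /* illustration only */
+#include <linux/scatterlist.h>
+static int example_fingerprint(const u8 * data, unsigned int len, __u32 * fip)
+{
+ u8 out[32]; /* sha256 yields a 256-bit digest */
+ struct scatterlist sg;
+ struct crypto_tfm *tfm = digest_plugins[SHA256_32_DIGEST_ID].alloc();
+
+ if (tfm == NULL || IS_ERR(tfm))
+ return tfm ? PTR_ERR(tfm) : -ENOMEM;
+ sg_init_one(&sg, (u8 *) data, len);
+ crypto_digest_init(tfm);
+ crypto_digest_update(tfm, &sg, 1);
+ crypto_digest_final(tfm, out);
+ digest_plugins[SHA256_32_DIGEST_ID].free(tfm);
+ memcpy(fip, out, sizeof(__u32)); /* keep .fipsize bytes as the keyid */
+ return 0;
+}
+#endif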
29252 +
29253 +/*
29254 + Local variables:
29255 + c-indentation-style: "K&R"
29256 + mode-name: "LC"
29257 + c-basic-offset: 8
29258 + tab-width: 8
29259 + fill-column: 120
29260 + scroll-step: 1
29261 + End:
29262 +*/
29263 Index: linux-2.6.16/fs/reiser4/plugin/dir/Makefile
29264 ===================================================================
29265 --- /dev/null
29266 +++ linux-2.6.16/fs/reiser4/plugin/dir/Makefile
29267 @@ -0,0 +1,5 @@
29268 +obj-$(CONFIG_REISER4_FS) += dir_plugins.o
29269 +
29270 +dir_plugins-objs := \
29271 + hashed_dir.o \
29272 + seekable_dir.o
29273 Index: linux-2.6.16/fs/reiser4/plugin/dir/dir.h
29274 ===================================================================
29275 --- /dev/null
29276 +++ linux-2.6.16/fs/reiser4/plugin/dir/dir.h
29277 @@ -0,0 +1,36 @@
29278 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
29279 + * reiser4/README */
29280 +
29281 +/* this file contains declarations of methods implementing directory plugins */
29282 +
29283 +#if !defined( __REISER4_DIR_H__ )
29284 +#define __REISER4_DIR_H__
29285 +
29286 +/*#include "../../key.h"
29287 +
29288 +#include <linux/fs.h>*/
29289 +
29290 +/* declarations of functions implementing HASHED_DIR_PLUGIN_ID dir plugin */
29291 +
29292 +/* "hashed" directory methods of dir plugin */
29293 +void build_entry_key_hashed(const struct inode *, const struct qstr *,
29294 + reiser4_key *);
29295 +
29296 +/* declarations of functions implementing SEEKABLE_HASHED_DIR_PLUGIN_ID dir plugin */
29297 +
29298 +/* "seekable" directory methods of dir plugin */
29299 +void build_entry_key_seekable(const struct inode *, const struct qstr *,
29300 + reiser4_key *);
29301 +
29302 +/* __REISER4_DIR_H__ */
29303 +#endif
29304 +
29305 +/*
29306 + Local variables:
29307 + c-indentation-style: "K&R"
29308 + mode-name: "LC"
29309 + c-basic-offset: 8
29310 + tab-width: 8
29311 + fill-column: 120
29312 + End:
29313 +*/
29314 Index: linux-2.6.16/fs/reiser4/plugin/dir/hashed_dir.c
29315 ===================================================================
29316 --- /dev/null
29317 +++ linux-2.6.16/fs/reiser4/plugin/dir/hashed_dir.c
29318 @@ -0,0 +1,81 @@
29319 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
29320 + * reiser4/README */
29321 +
29322 +/* Directory plugin using hashes (see fs/reiser4/plugin/hash.c) to map file
29323 + names to files. */
29324 +
29325 +/*
29326 + * A hashed directory logically consists of persistent directory
29327 + * entries. A directory entry is a pair of a file name and the stat-data key
29328 + * of the file that has this name in the given directory.
29329 + *
29330 + * Directory entries are stored in the tree in the form of directory
29331 + * items. A directory item should implement the dir_entry_ops portion of the
29332 + * item plugin interface (see plugin/item/item.h). The hashed directory
29333 + * interacts with the directory item plugin exclusively through dir_entry_ops.
29334 + *
29335 + * Currently there are two implementations of directory items: the "simple
29336 + * directory item" (plugin/item/sde.[ch]) and the "compound directory item"
29337 + * (plugin/item/cde.[ch]), with the latter being the default.
29338 + *
29339 + * There is, however, one delicate way in which the directory code interacts
29340 + * with the item plugin: key assignment policy. The key for a directory item
29341 + * is chosen by the directory code and, as described in kassign.c, contains a
29342 + * portion of the file name. The directory item uses this knowledge to avoid
29343 + * storing that portion of the file name twice: in the key and in the item body.
29344 + *
29345 + */
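+/*
+ * To illustrate the key assignment policy described above, a directory
+ * entry key built by build_entry_key_hashed() below has roughly this
+ * shape (the exact packing of the name-dependent fields lives in
+ * kassign.c and is only sketched here):
+ *
+ * locality = object id of the parent directory
+ * type = KEY_FILE_NAME_MINOR (constant minor packing locality)
+ * remainder = derived from the file name by complete_entry_key():
+ * a hash of the name plus as many leading name characters
+ * as fit, which is the "portion of file name" the item
+ * body then does not need to repeat.
+ */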
29346 +
29347 +#include "../../inode.h"
29348 +
29349 +void complete_entry_key(const struct inode *, const char *name,
29350 + int len, reiser4_key * result);
29351 +
29352 +/* this is the implementation of the build_entry_key method of the dir
29353 + plugin for HASHED_DIR_PLUGIN_ID
29354 + */
29355 +void build_entry_key_hashed(const struct inode *dir, /* directory where entry is
29356 + * (or will be) in.*/
29357 + const struct qstr *qname, /* name of file referenced
29358 + * by this entry */
29359 + reiser4_key * result /* resulting key of directory
29360 + * entry */ )
29361 +{
29362 + const char *name;
29363 + int len;
29364 +
29365 + assert("nikita-1139", dir != NULL);
29366 + assert("nikita-1140", qname != NULL);
29367 + assert("nikita-1141", qname->name != NULL);
29368 + assert("nikita-1142", result != NULL);
29369 +
29370 + name = qname->name;
29371 + len = qname->len;
29372 +
29373 + assert("nikita-2867", strlen(name) == len);
29374 +
29375 + reiser4_key_init(result);
29376 + /* locality of directory entry's key is objectid of parent
29377 + directory */
29378 + set_key_locality(result, get_inode_oid(dir));
29379 + /* minor packing locality is constant */
29380 + set_key_type(result, KEY_FILE_NAME_MINOR);
29381 + /* dot is a special case---we always want it to be the first entry
29382 + in a directory. Actually, we just want it to have the smallest
29383 + directory entry key.
29384 + */
29385 + if (len == 1 && name[0] == '.')
29386 + return;
29387 +
29388 + /* initialize part of entry key which depends on file name */
29389 + complete_entry_key(dir, name, len, result);
29390 +}
29391 +
29392 +/* Local variables:
29393 + c-indentation-style: "K&R"
29394 + mode-name: "LC"
29395 + c-basic-offset: 8
29396 + tab-width: 8
29397 + fill-column: 120
29398 + End:
29399 +*/
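A toy model may help here: build_entry_key_hashed() makes the parent directory's oid the key's locality, derives the objectid from the file name, and lets "." keep the smallest possible key. The hash and two-field key below are invented for illustration; the real encoding lives in fs/reiser4/kassign.c.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct toy_key {
	uint64_t locality;	/* oid of the parent directory */
	uint64_t objectid;	/* bits derived from the file name */
};

static uint64_t toy_hash(const char *name, size_t len)
{
	uint64_t h = 0;
	size_t i;

	for (i = 0; i < len; i++)
		h = h * 131 + (unsigned char)name[i];
	return h;
}

static void toy_build_entry_key(uint64_t dir_oid, const char *name,
				struct toy_key *key)
{
	key->locality = dir_oid;
	key->objectid = 0;	/* "." keeps the all-zero, smallest key */
	if (!(strlen(name) == 1 && name[0] == '.'))
		key->objectid = toy_hash(name, strlen(name));
}

int main(void)
{
	struct toy_key k;

	toy_build_entry_key(42, "foo.c", &k);
	printf("foo.c -> %llx:%llx\n", (unsigned long long)k.locality,
	       (unsigned long long)k.objectid);
	toy_build_entry_key(42, ".", &k);
	printf(".     -> %llx:%llx\n", (unsigned long long)k.locality,
	       (unsigned long long)k.objectid);
	return 0;
}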
29400 Index: linux-2.6.16/fs/reiser4/plugin/dir/seekable_dir.c
29401 ===================================================================
29402 --- /dev/null
29403 +++ linux-2.6.16/fs/reiser4/plugin/dir/seekable_dir.c
29404 @@ -0,0 +1,46 @@
29405 +/* Copyright 2005 by Hans Reiser, licensing governed by
29406 + * reiser4/README */
29407 +
29408 +#include "../../inode.h"
29409 +
29410 +/* this is the implementation of the build_entry_key method of the dir
29411 +   plugin for SEEKABLE_HASHED_DIR_PLUGIN_ID.
29412 +   This is for directories where we want repeatable and restartable readdir()
29413 +   even in the case of a 32-bit user-level struct dirent (readdir(3)).
29414 +*/
29415 +void
29416 +build_entry_key_seekable(const struct inode *dir, const struct qstr *name,
29417 + reiser4_key * result)
29418 +{
29419 + oid_t objectid;
29420 +
29421 + assert("nikita-2283", dir != NULL);
29422 + assert("nikita-2284", name != NULL);
29423 + assert("nikita-2285", name->name != NULL);
29424 + assert("nikita-2286", result != NULL);
29425 +
29426 + reiser4_key_init(result);
29427 + /* locality of directory entry's key is objectid of parent
29428 + directory */
29429 + set_key_locality(result, get_inode_oid(dir));
29430 + /* minor packing locality is constant */
29431 + set_key_type(result, KEY_FILE_NAME_MINOR);
29432 +	/* dot is a special case---we always want it to be the first entry
29433 +	   in a directory. Actually, we just want it to have the smallest
29434 +	   directory entry key.
29435 + */
29436 + if ((name->len == 1) && (name->name[0] == '.'))
29437 + return;
29438 +
29439 + /* objectid of key is 31 lowest bits of hash. */
29440 + objectid =
29441 + inode_hash_plugin(dir)->hash(name->name,
29442 + (int)name->len) & 0x7fffffff;
29443 +
29444 + assert("nikita-2303", !(objectid & ~KEY_OBJECTID_MASK));
29445 + set_key_objectid(result, objectid);
29446 +
29447 + /* offset is always 0. */
29448 + set_key_offset(result, (__u64) 0);
29449 + return;
29450 +}
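The 0x7fffffff mask above is what makes readdir() restartable for 32-bit userland: the entry's objectid doubles as the directory position, and a 32-bit off_t must stay non-negative, so only the low 31 bits of the hash may be used. A stand-alone sketch (the hash function is an invented stand-in, not reiser4's):

#include <stdint.h>
#include <stdio.h>

static uint64_t toy_hash(const char *name)
{
	uint64_t h = 0;

	while (*name)
		h = h * 131 + (unsigned char)*name++;
	return h;
}

int main(void)
{
	uint64_t h = toy_hash("some-long-file-name.txt");
	uint32_t objectid = (uint32_t)(h & 0x7fffffff);

	/* the masked value always fits a non-negative 32-bit offset */
	printf("hash=%llx objectid=%x top-bit-clear=%d\n",
	       (unsigned long long)h, objectid,
	       (objectid & 0x80000000u) == 0);
	return 0;
}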
29451 Index: linux-2.6.16/fs/reiser4/plugin/dir_plugin_common.c
29452 ===================================================================
29453 --- /dev/null
29454 +++ linux-2.6.16/fs/reiser4/plugin/dir_plugin_common.c
29455 @@ -0,0 +1,864 @@
29456 +/* Copyright 2005 by Hans Reiser, licensing governed by
29457 + reiser4/README */
29458 +
29459 +/* this file contains typical implementations for most of the methods of
29460 +   the directory plugin
29461 +*/
29462 +
29463 +#include "../inode.h"
29464 +
29465 +int find_entry(struct inode *dir, struct dentry *name,
29466 + lock_handle *, znode_lock_mode, reiser4_dir_entry_desc *);
29467 +int lookup_name(struct inode *parent, struct dentry *dentry, reiser4_key * key);
29468 +void check_light_weight(struct inode *inode, struct inode *parent);
29469 +
29470 +/* this is the common implementation of the get_parent method of the dir
29471 +   plugin; it is used by the NFS kernel server to "climb" up the directory
29472 +   tree to check permissions
29473 + */
29474 +struct dentry *get_parent_common(struct inode *child)
29475 +{
29476 + struct super_block *s;
29477 + struct inode *parent;
29478 + struct dentry dotdot;
29479 + struct dentry *dentry;
29480 + reiser4_key key;
29481 + int result;
29482 +
29483 + /*
29484 + * lookup dotdot entry.
29485 + */
29486 +
29487 + s = child->i_sb;
29488 + memset(&dotdot, 0, sizeof(dotdot));
29489 + dotdot.d_name.name = "..";
29490 + dotdot.d_name.len = 2;
29491 + dotdot.d_op = &get_super_private(s)->ops.dentry;
29492 +
29493 + result = lookup_name(child, &dotdot, &key);
29494 + if (result != 0)
29495 + return ERR_PTR(result);
29496 +
29497 + parent = reiser4_iget(s, &key, 1);
29498 + if (!IS_ERR(parent)) {
29499 + /*
29500 + * FIXME-NIKITA dubious: attributes are inherited from @child
29501 + * to @parent. But:
29502 + *
29503 +		 * (*) this is the only thing we can do
29504 + *
29505 + * (*) attributes of light-weight object are inherited
29506 + * from a parent through which object was looked up first,
29507 + * so it is ambiguous anyway.
29508 + *
29509 + */
29510 + check_light_weight(parent, child);
29511 + reiser4_iget_complete(parent);
29512 + dentry = d_alloc_anon(parent);
29513 + if (dentry == NULL) {
29514 + iput(parent);
29515 + dentry = ERR_PTR(RETERR(-ENOMEM));
29516 + } else
29517 + dentry->d_op = &get_super_private(s)->ops.dentry;
29518 + } else if (PTR_ERR(parent) == -ENOENT)
29519 + dentry = ERR_PTR(RETERR(-ESTALE));
29520 + else
29521 + dentry = (void *)parent;
29522 + return dentry;
29523 +}
29524 +
29525 +/* this is the common implementation of the is_name_acceptable method of the
29526 +   dir plugin
29527 + */
29528 +int is_name_acceptable_common(const struct inode *inode, /* directory to check */
29529 + const char *name UNUSED_ARG, /* name to check */
29530 + int len /* @name's length */ )
29531 +{
29532 + assert("nikita-733", inode != NULL);
29533 + assert("nikita-734", name != NULL);
29534 + assert("nikita-735", len > 0);
29535 +
29536 + return len <= reiser4_max_filename_len(inode);
29537 +}
29538 +
29539 +/* there is no common implementation of the build_entry_key method of the dir
29540 +   plugin. See plugin/dir/hashed_dir.c:build_entry_key_hashed() or
29541 +   plugin/dir/seekable_dir.c:build_entry_key_seekable() for examples
29542 +*/
29543 +
29544 +/* this is the common implementation of the build_readdir_key method of the
29545 +   dir plugin;
29546 + see readdir_common for more details
29547 +*/
29548 +int build_readdir_key_common(struct file *dir /* directory being read */ ,
29549 + reiser4_key * result /* where to store key */ )
29550 +{
29551 + reiser4_file_fsdata *fdata;
29552 + struct inode *inode;
29553 +
29554 + assert("nikita-1361", dir != NULL);
29555 + assert("nikita-1362", result != NULL);
29556 + assert("nikita-1363", dir->f_dentry != NULL);
29557 + inode = dir->f_dentry->d_inode;
29558 + assert("nikita-1373", inode != NULL);
29559 +
29560 + fdata = reiser4_get_file_fsdata(dir);
29561 + if (IS_ERR(fdata))
29562 + return PTR_ERR(fdata);
29563 + assert("nikita-1364", fdata != NULL);
29564 + return extract_key_from_de_id(get_inode_oid(inode),
29565 + &fdata->dir.readdir.position.
29566 + dir_entry_key, result);
29567 +
29568 +}
29569 +
29570 +void adjust_dir_file(struct inode *, const struct dentry *, int offset,
29571 + int adj);
29572 +
29573 +/* this is the common implementation of the add_entry method of the dir plugin
29574 +*/
29575 +int add_entry_common(struct inode *object, /* directory to add new name
29576 + * in */
29577 + struct dentry *where, /* new name */
29578 + reiser4_object_create_data * data UNUSED_ARG, /* parameters
29579 + * of new
29580 + * object */
29581 + reiser4_dir_entry_desc * entry /* parameters of new
29582 + * directory entry */ )
29583 +{
29584 + int result;
29585 + coord_t *coord;
29586 + lock_handle lh;
29587 + reiser4_dentry_fsdata *fsdata;
29588 + reiser4_block_nr reserve;
29589 +
29590 + assert("nikita-1114", object != NULL);
29591 + assert("nikita-1250", where != NULL);
29592 +
29593 + fsdata = reiser4_get_dentry_fsdata(where);
29594 + if (unlikely(IS_ERR(fsdata)))
29595 + return PTR_ERR(fsdata);
29596 +
29597 + reserve = inode_dir_plugin(object)->estimate.add_entry(object);
29598 + if (reiser4_grab_space(reserve, BA_CAN_COMMIT))
29599 + return RETERR(-ENOSPC);
29600 +
29601 + init_lh(&lh);
29602 + coord = &fsdata->dec.entry_coord;
29603 + coord_clear_iplug(coord);
29604 +
29605 +	/* check for this entry in the directory. This is a plugin method. */
29606 + result = find_entry(object, where, &lh, ZNODE_WRITE_LOCK, entry);
29607 + if (likely(result == -ENOENT)) {
29608 + /* add new entry. Just pass control to the directory
29609 + item plugin. */
29610 + assert("nikita-1709", inode_dir_item_plugin(object));
29611 + assert("nikita-2230", coord->node == lh.node);
29612 + seal_done(&fsdata->dec.entry_seal);
29613 + result =
29614 + inode_dir_item_plugin(object)->s.dir.add_entry(object,
29615 + coord, &lh,
29616 + where,
29617 + entry);
29618 + if (result == 0) {
29619 + adjust_dir_file(object, where, fsdata->dec.pos + 1, +1);
29620 + INODE_INC_FIELD(object, i_size);
29621 + }
29622 + } else if (result == 0) {
29623 + assert("nikita-2232", coord->node == lh.node);
29624 + result = RETERR(-EEXIST);
29625 + }
29626 + done_lh(&lh);
29627 +
29628 + return result;
29629 +}
29630 +
29631 +/**
29632 + * rem_entry - remove entry from directory item
29633 + * @dir: directory the entry is removed from
29634 + * @dentry: name that is being removed
29635 + * @entry: description of the entry being removed
29636 + * @coord: coordinate of the entry in the tree
29637 + * @lh: lock handle for the node @coord is set to
29638 + *
29639 + * Checks that coordinate @coord is set properly and calls item plugin
29640 + * method to cut entry.
29641 + */
29642 +static int
29643 +rem_entry(struct inode *dir, struct dentry *dentry,
29644 + reiser4_dir_entry_desc * entry, coord_t * coord, lock_handle * lh)
29645 +{
29646 + item_plugin *iplug;
29647 + struct inode *child;
29648 +
29649 + iplug = inode_dir_item_plugin(dir);
29650 + child = dentry->d_inode;
29651 + assert("nikita-3399", child != NULL);
29652 +
29653 + /* check that we are really destroying an entry for @child */
29654 + if (REISER4_DEBUG) {
29655 + int result;
29656 + reiser4_key key;
29657 +
29658 + result = iplug->s.dir.extract_key(coord, &key);
29659 + if (result != 0)
29660 + return result;
29661 + if (get_key_objectid(&key) != get_inode_oid(child)) {
29662 + warning("nikita-3397",
29663 + "rem_entry: %#llx != %#llx\n",
29664 + get_key_objectid(&key),
29665 + (unsigned long long)get_inode_oid(child));
29666 + return RETERR(-EIO);
29667 + }
29668 + }
29669 + return iplug->s.dir.rem_entry(dir, &dentry->d_name, coord, lh, entry);
29670 +}
29671 +
29672 +/**
29673 + * rem_entry_common - remove entry from a directory
29674 + * @dir: directory to remove entry from
29675 + * @dentry: name that is being removed
29676 + * @entry: description of the entry being removed
29677 + *
29678 + * This is the common implementation of the rem_entry method of the dir plugin.
29679 + */
29680 +int rem_entry_common(struct inode *dir,
29681 + struct dentry *dentry,
29682 + reiser4_dir_entry_desc *entry)
29683 +{
29684 + int result;
29685 + coord_t *coord;
29686 + lock_handle lh;
29687 + reiser4_dentry_fsdata *fsdata;
29688 + __u64 tograb;
29689 +
29690 + assert("nikita-1124", dir != NULL);
29691 + assert("nikita-1125", dentry != NULL);
29692 +
29693 + tograb = inode_dir_plugin(dir)->estimate.rem_entry(dir);
29694 + result = reiser4_grab_space(tograb, BA_CAN_COMMIT | BA_RESERVED);
29695 + if (result != 0)
29696 + return RETERR(-ENOSPC);
29697 +
29698 + init_lh(&lh);
29699 +
29700 +	/* check for this entry in the directory. This is a plugin method. */
29701 + result = find_entry(dir, dentry, &lh, ZNODE_WRITE_LOCK, entry);
29702 + fsdata = reiser4_get_dentry_fsdata(dentry);
29703 + if (IS_ERR(fsdata)) {
29704 + done_lh(&lh);
29705 + return PTR_ERR(fsdata);
29706 + }
29707 +
29708 + coord = &fsdata->dec.entry_coord;
29709 +
29710 + assert("nikita-3404",
29711 + get_inode_oid(dentry->d_inode) != get_inode_oid(dir) ||
29712 + dir->i_size <= 1);
29713 +
29714 + coord_clear_iplug(coord);
29715 + if (result == 0) {
29716 + /* remove entry. Just pass control to the directory item
29717 + plugin. */
29718 + assert("vs-542", inode_dir_item_plugin(dir));
29719 + seal_done(&fsdata->dec.entry_seal);
29720 + adjust_dir_file(dir, dentry, fsdata->dec.pos, -1);
29721 + result =
29722 + WITH_COORD(coord,
29723 + rem_entry(dir, dentry, entry, coord, &lh));
29724 + if (result == 0) {
29725 + if (dir->i_size >= 1)
29726 + INODE_DEC_FIELD(dir, i_size);
29727 + else {
29728 + warning("nikita-2509", "Dir %llu is runt",
29729 + (unsigned long long)
29730 + get_inode_oid(dir));
29731 + result = RETERR(-EIO);
29732 + }
29733 +
29734 + assert("nikita-3405", dentry->d_inode->i_nlink != 1 ||
29735 + dentry->d_inode->i_size != 2 ||
29736 + inode_dir_plugin(dentry->d_inode) == NULL);
29737 + }
29738 + }
29739 + done_lh(&lh);
29740 +
29741 + return result;
29742 +}
29743 +
29744 +static reiser4_block_nr estimate_init(struct inode *parent,
29745 + struct inode *object);
29746 +static int create_dot_dotdot(struct inode *object, struct inode *parent);
29747 +
29748 +/* this is the common implementation of the init method of the dir plugin:
29749 +   create the "." and ".." entries
29750 +*/
29751 +int init_common(struct inode *object, /* new directory */
29752 + struct inode *parent, /* parent directory */
29753 + reiser4_object_create_data * data UNUSED_ARG /* info passed
29754 + * to us, this
29755 + * is filled by
29756 + * reiser4()
29757 + * syscall in
29758 + * particular */ )
29759 +{
29760 + reiser4_block_nr reserve;
29761 +
29762 + assert("nikita-680", object != NULL);
29763 + assert("nikita-681", S_ISDIR(object->i_mode));
29764 + assert("nikita-682", parent != NULL);
29765 + assert("nikita-684", data != NULL);
29766 + assert("nikita-686", data->id == DIRECTORY_FILE_PLUGIN_ID);
29767 + assert("nikita-687", object->i_mode & S_IFDIR);
29768 +
29769 + reserve = estimate_init(parent, object);
29770 + if (reiser4_grab_space(reserve, BA_CAN_COMMIT))
29771 + return RETERR(-ENOSPC);
29772 +
29773 + return create_dot_dotdot(object, parent);
29774 +}
29775 +
29776 +/* this is the common implementation of the done method of the dir plugin:
29777 +   remove the "." entry
29778 +*/
29779 +int done_common(struct inode *object /* object being deleted */ )
29780 +{
29781 + int result;
29782 + reiser4_block_nr reserve;
29783 + struct dentry goodby_dots;
29784 + reiser4_dir_entry_desc entry;
29785 +
29786 + assert("nikita-1449", object != NULL);
29787 +
29788 + if (inode_get_flag(object, REISER4_NO_SD))
29789 + return 0;
29790 +
29791 + /* of course, this can be rewritten to sweep everything in one
29792 + cut_tree(). */
29793 + memset(&entry, 0, sizeof entry);
29794 +
29795 +	/* FIXME: this done method is called from delete_directory_common,
29796 +	 * which has already reserved space */
29797 + reserve = inode_dir_plugin(object)->estimate.rem_entry(object);
29798 + if (reiser4_grab_space(reserve, BA_CAN_COMMIT | BA_RESERVED))
29799 + return RETERR(-ENOSPC);
29800 +
29801 + memset(&goodby_dots, 0, sizeof goodby_dots);
29802 + entry.obj = goodby_dots.d_inode = object;
29803 + goodby_dots.d_name.name = ".";
29804 + goodby_dots.d_name.len = 1;
29805 + result = rem_entry_common(object, &goodby_dots, &entry);
29806 + reiser4_free_dentry_fsdata(&goodby_dots);
29807 + if (unlikely(result != 0 && result != -ENOMEM && result != -ENOENT))
29808 + /* only worth a warning
29809 +
29810 +	   "values of β will give rise to dom!\n"
29811 + -- v6src/s2/mv.c:89
29812 + */
29813 + warning("nikita-2252", "Cannot remove dot of %lli: %i",
29814 + (unsigned long long)get_inode_oid(object), result);
29815 + return 0;
29816 +}
29817 +
29818 +/* this is the common implementation of the attach method of the dir plugin
29819 +*/
29820 +int
29821 +attach_common(struct inode *child UNUSED_ARG, struct inode *parent UNUSED_ARG)
29822 +{
29823 + assert("nikita-2647", child != NULL);
29824 + assert("nikita-2648", parent != NULL);
29825 +
29826 + return 0;
29827 +}
29828 +
29829 +/* this is the common implementation of the detach method of the dir plugin:
29830 +   remove "..", decrease nlink on the parent
29831 +*/
29832 +int detach_common(struct inode *object, struct inode *parent)
29833 +{
29834 + int result;
29835 + struct dentry goodby_dots;
29836 + reiser4_dir_entry_desc entry;
29837 +
29838 + assert("nikita-2885", object != NULL);
29839 + assert("nikita-2886", !inode_get_flag(object, REISER4_NO_SD));
29840 +
29841 + memset(&entry, 0, sizeof entry);
29842 +
29843 + /* NOTE-NIKITA this only works if @parent is -the- parent of
29844 +	   @object, viz. the object whose key is stored in the dotdot
29845 +	   entry. This wouldn't work with hard links on directories. */
29846 + memset(&goodby_dots, 0, sizeof goodby_dots);
29847 + entry.obj = goodby_dots.d_inode = parent;
29848 + goodby_dots.d_name.name = "..";
29849 + goodby_dots.d_name.len = 2;
29850 + result = rem_entry_common(object, &goodby_dots, &entry);
29851 + reiser4_free_dentry_fsdata(&goodby_dots);
29852 + if (result == 0) {
29853 + /* the dot should be the only entry remaining at this time... */
29854 + assert("nikita-3400", object->i_size == 1 &&
29855 + (object->i_nlink >= 0 && object->i_nlink <= 2));
29856 +#if 0
29857 +		/* and, together with the only name a directory can have, they
29858 +		 * provide the last 2 remaining references. If we get
29859 +		 * here as part of error handling during mkdir, @object
29860 +		 * possibly has no name yet, so its nlink == 1. If we get here
29861 +		 * from rename (targeting an empty directory), it already has
29862 +		 * no name, so its nlink == 1. */
29863 + assert("nikita-3401",
29864 + object->i_nlink == 2 || object->i_nlink == 1);
29865 +#endif
29866 +
29867 +		/* decrement nlink of the directory that the removed ".."
29868 +		   pointed to */
29869 + reiser4_del_nlink(parent, NULL, 0);
29870 + }
29871 + return result;
29872 +}
29873 +
29874 +/* this is the common implementation of the estimate.add_entry method of the
29875 +   dir plugin;
29876 +   the estimate for adding an entry assumes that the entry is inserted as a
29877 +   unit into an existing item
29878 +*/
29879 +reiser4_block_nr estimate_add_entry_common(const struct inode * inode)
29880 +{
29881 + return estimate_one_insert_into_item(tree_by_inode(inode));
29882 +}
29883 +
29884 +/* this is the common implementation of the estimate.rem_entry method of the
29885 +   dir plugin
29886 +*/
29887 +reiser4_block_nr estimate_rem_entry_common(const struct inode * inode)
29888 +{
29889 + return estimate_one_item_removal(tree_by_inode(inode));
29890 +}
29891 +
29892 +/* this is the common implementation of the estimate.unlink method of the
29893 +   dir plugin
29894 +*/
29895 +reiser4_block_nr
29896 +dir_estimate_unlink_common(const struct inode * parent,
29897 + const struct inode * object)
29898 +{
29899 + reiser4_block_nr res;
29900 +
29901 + /* hashed_rem_entry(object) */
29902 + res = inode_dir_plugin(object)->estimate.rem_entry(object);
29903 + /* del_nlink(parent) */
29904 + res += 2 * inode_file_plugin(parent)->estimate.update(parent);
29905 +
29906 + return res;
29907 +}
29908 +
29909 +/*
29910 + * helper for inode_ops ->lookup() and dir plugin's ->get_parent()
29911 + * methods: if @inode is a light-weight file, set up its credentials,
29912 + * which are not stored in the stat-data in this case
29913 + */
29914 +void check_light_weight(struct inode *inode, struct inode *parent)
29915 +{
29916 + if (inode_get_flag(inode, REISER4_LIGHT_WEIGHT)) {
29917 + inode->i_uid = parent->i_uid;
29918 + inode->i_gid = parent->i_gid;
29919 +		/* clear the light-weight flag. If the inode were read via any
29920 +		   other name, the [ug]id wouldn't change. */
29921 + inode_clr_flag(inode, REISER4_LIGHT_WEIGHT);
29922 + }
29923 +}
29924 +
29925 +/* looks for the name specified in @dentry in directory @parent; if the name
29926 +   is found, the key of the object the entry points to is stored in @key */
29927 +int lookup_name(struct inode *parent, /* inode of directory to lookup for
29928 + * name in */
29929 + struct dentry *dentry, /* name to look for */
29930 + reiser4_key * key /* place to store key */ )
29931 +{
29932 + int result;
29933 + coord_t *coord;
29934 + lock_handle lh;
29935 + const char *name;
29936 + int len;
29937 + reiser4_dir_entry_desc entry;
29938 + reiser4_dentry_fsdata *fsdata;
29939 +
29940 + assert("nikita-1247", parent != NULL);
29941 + assert("nikita-1248", dentry != NULL);
29942 + assert("nikita-1123", dentry->d_name.name != NULL);
29943 + assert("vs-1486",
29944 + dentry->d_op == &get_super_private(parent->i_sb)->ops.dentry);
29945 +
29946 + name = dentry->d_name.name;
29947 + len = dentry->d_name.len;
29948 +
29949 + if (!inode_dir_plugin(parent)->is_name_acceptable(parent, name, len))
29950 + /* some arbitrary error code to return */
29951 + return RETERR(-ENAMETOOLONG);
29952 +
29953 + fsdata = reiser4_get_dentry_fsdata(dentry);
29954 + if (IS_ERR(fsdata))
29955 + return PTR_ERR(fsdata);
29956 +
29957 + coord = &fsdata->dec.entry_coord;
29958 + coord_clear_iplug(coord);
29959 + init_lh(&lh);
29960 +
29961 +	/* find the entry in the directory. This is a plugin method. */
29962 + result = find_entry(parent, dentry, &lh, ZNODE_READ_LOCK, &entry);
29963 + if (result == 0) {
29964 + /* entry was found, extract object key from it. */
29965 + result =
29966 + WITH_COORD(coord,
29967 + item_plugin_by_coord(coord)->s.dir.
29968 + extract_key(coord, key));
29969 + }
29970 + done_lh(&lh);
29971 + return result;
29972 +
29973 +}
29974 +
29975 +/* helper for init_common(): estimate number of blocks to reserve */
29976 +static reiser4_block_nr
29977 +estimate_init(struct inode *parent, struct inode *object)
29978 +{
29979 + reiser4_block_nr res = 0;
29980 +
29981 + assert("vpf-321", parent != NULL);
29982 + assert("vpf-322", object != NULL);
29983 +
29984 + /* hashed_add_entry(object) */
29985 + res += inode_dir_plugin(object)->estimate.add_entry(object);
29986 + /* reiser4_add_nlink(object) */
29987 + res += inode_file_plugin(object)->estimate.update(object);
29988 + /* hashed_add_entry(object) */
29989 + res += inode_dir_plugin(object)->estimate.add_entry(object);
29990 + /* reiser4_add_nlink(parent) */
29991 + res += inode_file_plugin(parent)->estimate.update(parent);
29992 +
29993 +	return res;
29994 +}
29995 +
29996 +/* helper function for init_common(). Create "." and ".." */
29997 +static int create_dot_dotdot(struct inode *object /* object to create dot and
29998 + * dotdot for */ ,
29999 + struct inode *parent /* parent of @object */ )
30000 +{
30001 + int result;
30002 + struct dentry dots_entry;
30003 + reiser4_dir_entry_desc entry;
30004 +
30005 + assert("nikita-688", object != NULL);
30006 + assert("nikita-689", S_ISDIR(object->i_mode));
30007 + assert("nikita-691", parent != NULL);
30008 +
30009 + /* We store dot and dotdot as normal directory entries. This is
30010 +	   not necessary, because almost all information stored in them
30011 +	   is already in the stat-data of the directory; the only thing
30012 +	   missing is the objectid of the grand-parent directory, which
30013 +	   could easily be added there as an extension.
30014 +
30015 + But it is done the way it is done, because not storing dot
30016 + and dotdot will lead to the following complications:
30017 +
30018 + . special case handling in ->lookup().
30019 + . addition of another extension to the sd.
30020 + . dependency on key allocation policy for stat data.
30021 +
30022 + */
30023 +
30024 + memset(&entry, 0, sizeof entry);
30025 + memset(&dots_entry, 0, sizeof dots_entry);
30026 + entry.obj = dots_entry.d_inode = object;
30027 + dots_entry.d_name.name = ".";
30028 + dots_entry.d_name.len = 1;
30029 + result = add_entry_common(object, &dots_entry, NULL, &entry);
30030 + reiser4_free_dentry_fsdata(&dots_entry);
30031 +
30032 + if (result == 0) {
30033 + result = reiser4_add_nlink(object, object, 0);
30034 + if (result == 0) {
30035 + entry.obj = dots_entry.d_inode = parent;
30036 + dots_entry.d_name.name = "..";
30037 + dots_entry.d_name.len = 2;
30038 + result = add_entry_common(object,
30039 + &dots_entry, NULL, &entry);
30040 + reiser4_free_dentry_fsdata(&dots_entry);
30041 + /* if creation of ".." failed, iput() will delete
30042 + object with ".". */
30043 + if (result == 0) {
30044 + result = reiser4_add_nlink(parent, object, 0);
30045 + if (result != 0)
30046 + /*
30047 + * if we failed to bump i_nlink, try
30048 + * to remove ".."
30049 + */
30050 + detach_common(object, parent);
30051 + }
30052 + }
30053 + }
30054 +
30055 + if (result != 0) {
30056 + /*
30057 +		 * in the case of error, at least update the stat-data so that
30058 +		 * ->i_nlink updates are not left lingering.
30059 + */
30060 + reiser4_update_sd(object);
30061 + reiser4_update_sd(parent);
30062 + }
30063 +
30064 + return result;
30065 +}
30066 +
30067 +/*
30068 + * return 0 iff @coord contains a directory entry for the file with the name
30069 + * @name.
30070 + */
30071 +static int
30072 +check_item(const struct inode *dir, const coord_t * coord, const char *name)
30073 +{
30074 + item_plugin *iplug;
30075 + char buf[DE_NAME_BUF_LEN];
30076 +
30077 + iplug = item_plugin_by_coord(coord);
30078 + if (iplug == NULL) {
30079 + warning("nikita-1135", "Cannot get item plugin");
30080 + print_coord("coord", coord, 1);
30081 + return RETERR(-EIO);
30082 + } else if (item_id_by_coord(coord) !=
30083 + item_id_by_plugin(inode_dir_item_plugin(dir))) {
30084 +		/* item id of the current item does not match the id of items
30085 +		   the directory is built of */
30086 + warning("nikita-1136", "Wrong item plugin");
30087 + print_coord("coord", coord, 1);
30088 + return RETERR(-EIO);
30089 + }
30090 + assert("nikita-1137", iplug->s.dir.extract_name);
30091 +
30092 +	/* Compare the name stored in this entry with the name we are
30093 +	   looking for.
30094 +	   NOTE-NIKITA code for support of something like unicode, code
30095 +	   tables, etc. should go here.
30096 + */
30097 + return !!strcmp(name, iplug->s.dir.extract_name(coord, buf));
30098 +}
30099 +
30100 +static int
30101 +check_entry(const struct inode *dir, coord_t * coord, const struct qstr *name)
30102 +{
30103 + return WITH_COORD(coord, check_item(dir, coord, name->name));
30104 +}
30105 +
30106 +/*
30107 + * argument package used by entry_actor to scan entries with identical keys.
30108 + */
30109 +typedef struct entry_actor_args {
30110 + /* name we are looking for */
30111 + const char *name;
30112 + /* key of directory entry. entry_actor() scans through sequence of
30113 + * items/units having the same key */
30114 + reiser4_key *key;
30115 +	/* how many entries with duplicate keys have been scanned so far. */
30116 + int non_uniq;
30117 +#if REISER4_USE_COLLISION_LIMIT
30118 + /* scan limit */
30119 + int max_non_uniq;
30120 +#endif
30121 + /* return parameter: set to true, if ->name wasn't found */
30122 + int not_found;
30123 + /* what type of lock to take when moving to the next node during
30124 + * scan */
30125 + znode_lock_mode mode;
30126 +
30127 + /* last coord that was visited during scan */
30128 + coord_t last_coord;
30129 + /* last node locked during scan */
30130 + lock_handle last_lh;
30131 + /* inode of directory */
30132 + const struct inode *inode;
30133 +} entry_actor_args;
30134 +
30135 +/* Function called by find_entry() to look for the given name in the directory. */
30136 +static int entry_actor(reiser4_tree * tree UNUSED_ARG /* tree being scanned */ ,
30137 + coord_t * coord /* current coord */ ,
30138 + lock_handle * lh /* current lock handle */ ,
30139 + void *entry_actor_arg /* argument to scan */ )
30140 +{
30141 + reiser4_key unit_key;
30142 + entry_actor_args *args;
30143 +
30144 + assert("nikita-1131", tree != NULL);
30145 + assert("nikita-1132", coord != NULL);
30146 + assert("nikita-1133", entry_actor_arg != NULL);
30147 +
30148 + args = entry_actor_arg;
30149 + ++args->non_uniq;
30150 +#if REISER4_USE_COLLISION_LIMIT
30151 + if (args->non_uniq > args->max_non_uniq) {
30152 + args->not_found = 1;
30153 + /* hash collision overflow. */
30154 + return RETERR(-EBUSY);
30155 + }
30156 +#endif
30157 +
30158 + /*
30159 + * did we just reach the end of the sequence of items/units with
30160 + * identical keys?
30161 + */
30162 + if (!keyeq(args->key, unit_key_by_coord(coord, &unit_key))) {
30163 + assert("nikita-1791",
30164 + keylt(args->key, unit_key_by_coord(coord, &unit_key)));
30165 + args->not_found = 1;
30166 + args->last_coord.between = AFTER_UNIT;
30167 + return 0;
30168 + }
30169 +
30170 + coord_dup(&args->last_coord, coord);
30171 + /*
30172 +	 * did the scan just move to the next node?
30173 + */
30174 + if (args->last_lh.node != lh->node) {
30175 + int lock_result;
30176 +
30177 + /*
30178 + * if so, lock new node with the mode requested by the caller
30179 + */
30180 + done_lh(&args->last_lh);
30181 + assert("nikita-1896", znode_is_any_locked(lh->node));
30182 + lock_result = longterm_lock_znode(&args->last_lh, lh->node,
30183 + args->mode, ZNODE_LOCK_HIPRI);
30184 + if (lock_result != 0)
30185 + return lock_result;
30186 + }
30187 + return check_item(args->inode, coord, args->name);
30188 +}
30189 +
30190 +/* Look for given @name within directory @dir.
30191 +
30192 +   This is called during lookup, creation and removal of directory
30193 +   entries and on rename_common.
30194 +
30195 +   First calculate the key that the directory entry for @name would have.
30196 +   Search for this key in the tree. If such a key is found, scan all items
30197 +   with the same key, checking the name in each directory entry along the way.
30198 +*/
30199 +int find_entry(struct inode *dir, /* directory to scan */
30200 + struct dentry *de, /* name to search for */
30201 + lock_handle * lh, /* resulting lock handle */
30202 + znode_lock_mode mode, /* required lock mode */
30203 + reiser4_dir_entry_desc * entry /* parameters of found directory
30204 + * entry */ )
30205 +{
30206 + const struct qstr *name;
30207 + seal_t *seal;
30208 + coord_t *coord;
30209 + int result;
30210 + __u32 flags;
30211 + de_location *dec;
30212 + reiser4_dentry_fsdata *fsdata;
30213 +
30214 + assert("nikita-1130", lh != NULL);
30215 + assert("nikita-1128", dir != NULL);
30216 +
30217 + name = &de->d_name;
30218 + assert("nikita-1129", name != NULL);
30219 +
30220 +	/* dentry private data doesn't require a lock, because dentry
30221 +	   manipulations are protected by i_mutex on the parent.
30222 +
30223 +	   This is not so for inodes, because there is no single parent
30224 +	   in the inode case.
30225 + */
30226 + fsdata = reiser4_get_dentry_fsdata(de);
30227 + if (IS_ERR(fsdata))
30228 + return PTR_ERR(fsdata);
30229 + dec = &fsdata->dec;
30230 +
30231 + coord = &dec->entry_coord;
30232 + coord_clear_iplug(coord);
30233 + seal = &dec->entry_seal;
30234 + /* compose key of directory entry for @name */
30235 + inode_dir_plugin(dir)->build_entry_key(dir, name, &entry->key);
30236 +
30237 + if (seal_is_set(seal)) {
30238 + /* check seal */
30239 + result = seal_validate(seal, coord, &entry->key,
30240 + lh, mode, ZNODE_LOCK_LOPRI);
30241 + if (result == 0) {
30242 +			/* key was found. Check that it is really the item we
30243 +			   are looking for. */
30244 + result = check_entry(dir, coord, name);
30245 + if (result == 0)
30246 + return 0;
30247 + }
30248 + }
30249 + flags = (mode == ZNODE_WRITE_LOCK) ? CBK_FOR_INSERT : 0;
30250 + /*
30251 +	 * find the place in the tree where the directory item should be located.
30252 + */
30253 + result = object_lookup(dir, &entry->key, coord, lh, mode,
30254 + FIND_EXACT, LEAF_LEVEL, LEAF_LEVEL, flags,
30255 + NULL /*ra_info */ );
30256 + if (result == CBK_COORD_FOUND) {
30257 + entry_actor_args arg;
30258 +
30259 + /* fast path: no hash collisions */
30260 + result = check_entry(dir, coord, name);
30261 + if (result == 0) {
30262 + seal_init(seal, coord, &entry->key);
30263 + dec->pos = 0;
30264 + } else if (result > 0) {
30265 + /* Iterate through all units with the same keys. */
30266 + arg.name = name->name;
30267 + arg.key = &entry->key;
30268 + arg.not_found = 0;
30269 + arg.non_uniq = 0;
30270 +#if REISER4_USE_COLLISION_LIMIT
30271 + arg.max_non_uniq = max_hash_collisions(dir);
30272 + assert("nikita-2851", arg.max_non_uniq > 1);
30273 +#endif
30274 + arg.mode = mode;
30275 + arg.inode = dir;
30276 + coord_init_zero(&arg.last_coord);
30277 + init_lh(&arg.last_lh);
30278 +
30279 + result = iterate_tree(tree_by_inode(dir), coord, lh,
30280 + entry_actor, &arg, mode, 1);
30281 +			/* if the end of the tree or an extent was reached
30282 +			   during scanning. */
30283 + if (arg.not_found || (result == -E_NO_NEIGHBOR)) {
30284 + /* step back */
30285 + done_lh(lh);
30286 +
30287 + result = zload(arg.last_coord.node);
30288 + if (result == 0) {
30289 + coord_clear_iplug(&arg.last_coord);
30290 + coord_dup(coord, &arg.last_coord);
30291 + move_lh(lh, &arg.last_lh);
30292 + result = RETERR(-ENOENT);
30293 + zrelse(arg.last_coord.node);
30294 + --arg.non_uniq;
30295 + }
30296 + }
30297 +
30298 + done_lh(&arg.last_lh);
30299 + if (result == 0)
30300 + seal_init(seal, coord, &entry->key);
30301 +
30302 + if (result == 0 || result == -ENOENT) {
30303 + assert("nikita-2580", arg.non_uniq > 0);
30304 + dec->pos = arg.non_uniq - 1;
30305 + }
30306 + }
30307 + } else
30308 + dec->pos = -1;
30309 + return result;
30310 +}
30311 +
30312 +/* Local variables:
30313 + c-indentation-style: "K&R"
30314 + mode-name: "LC"
30315 + c-basic-offset: 8
30316 + tab-width: 8
30317 + fill-column: 120
30318 + End:
30319 +*/
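The find_entry()/entry_actor() pair above implements a duplicate-key scan: names whose keys collide are stored consecutively in the tree, and lookup walks them in order until the name matches or the key changes. A stand-alone sketch of the same control flow over a flat array (the data layout is illustrative only):

#include <stdio.h>
#include <string.h>

struct toy_entry {
	unsigned key;		/* collision group: equal keys sit together */
	const char *name;
};

static int toy_find(const struct toy_entry *e, int n,
		    unsigned key, const char *name)
{
	int i;

	for (i = 0; i < n; i++) {
		if (e[i].key != key)
			continue;
		/* scan all consecutive entries sharing this key */
		for (; i < n && e[i].key == key; i++)
			if (strcmp(e[i].name, name) == 0)
				return i;
		return -1;	/* key group exhausted: not found */
	}
	return -1;
}

int main(void)
{
	static const struct toy_entry dir[] = {
		{1, "aaa"}, {2, "bbb"}, {2, "collides-with-bbb"}, {3, "ccc"},
	};

	/* prints "found at 2" */
	printf("found at %d\n", toy_find(dir, 4, 2, "collides-with-bbb"));
	return 0;
}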
30320 Index: linux-2.6.16/fs/reiser4/plugin/disk_format/Makefile
30321 ===================================================================
30322 --- /dev/null
30323 +++ linux-2.6.16/fs/reiser4/plugin/disk_format/Makefile
30324 @@ -0,0 +1,5 @@
30325 +obj-$(CONFIG_REISER4_FS) += df_plugins.o
30326 +
30327 +df_plugins-objs := \
30328 + disk_format40.o \
30329 + disk_format.o
30330 Index: linux-2.6.16/fs/reiser4/plugin/disk_format/disk_format.c
30331 ===================================================================
30332 --- /dev/null
30333 +++ linux-2.6.16/fs/reiser4/plugin/disk_format/disk_format.c
30334 @@ -0,0 +1,37 @@
30335 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
30336 +
30337 +#include "../../debug.h"
30338 +#include "../plugin_header.h"
30339 +#include "disk_format40.h"
30340 +#include "disk_format.h"
30341 +#include "../plugin.h"
30342 +
30343 +/* initialization of disk layout plugins */
30344 +disk_format_plugin format_plugins[LAST_FORMAT_ID] = {
30345 + [FORMAT40_ID] = {
30346 + .h = {
30347 + .type_id = REISER4_FORMAT_PLUGIN_TYPE,
30348 + .id = FORMAT40_ID,
30349 + .pops = NULL,
30350 + .label = "reiser40",
30351 + .desc = "standard disk layout for reiser40",
30352 + .linkage = {NULL, NULL}
30353 + },
30354 + .init_format = init_format_format40,
30355 + .root_dir_key = root_dir_key_format40,
30356 + .release = release_format40,
30357 + .log_super = log_super_format40,
30358 + .check_open = check_open_format40
30359 + }
30360 +};
30361 +
30362 +/* Make Linus happy.
30363 + Local variables:
30364 + c-indentation-style: "K&R"
30365 + mode-name: "LC"
30366 + c-basic-offset: 8
30367 + tab-width: 8
30368 + fill-column: 120
30369 + scroll-step: 1
30370 + End:
30371 +*/
30372 Index: linux-2.6.16/fs/reiser4/plugin/disk_format/disk_format.h
30373 ===================================================================
30374 --- /dev/null
30375 +++ linux-2.6.16/fs/reiser4/plugin/disk_format/disk_format.h
30376 @@ -0,0 +1,27 @@
30377 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
30378 +
30379 +/* identifiers for disk layouts, they are also used as indexes in array of disk
30380 + plugins */
30381 +
30382 +#if !defined( __REISER4_DISK_FORMAT_H__ )
30383 +#define __REISER4_DISK_FORMAT_H__
30384 +
30385 +typedef enum {
30386 + /* standard reiser4 disk layout plugin id */
30387 + FORMAT40_ID,
30388 + LAST_FORMAT_ID
30389 +} disk_format_id;
30390 +
30391 +/* __REISER4_DISK_FORMAT_H__ */
30392 +#endif
30393 +
30394 +/* Make Linus happy.
30395 + Local variables:
30396 + c-indentation-style: "K&R"
30397 + mode-name: "LC"
30398 + c-basic-offset: 8
30399 + tab-width: 8
30400 + fill-column: 120
30401 + scroll-step: 1
30402 + End:
30403 +*/
30404 Index: linux-2.6.16/fs/reiser4/plugin/disk_format/disk_format40.c
30405 ===================================================================
30406 --- /dev/null
30407 +++ linux-2.6.16/fs/reiser4/plugin/disk_format/disk_format40.c
30408 @@ -0,0 +1,556 @@
30409 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
30410 +
30411 +#include "../../debug.h"
30412 +#include "../../dformat.h"
30413 +#include "../../key.h"
30414 +#include "../node/node.h"
30415 +#include "../space/space_allocator.h"
30416 +#include "disk_format40.h"
30417 +#include "../plugin.h"
30418 +#include "../../txnmgr.h"
30419 +#include "../../jnode.h"
30420 +#include "../../tree.h"
30421 +#include "../../super.h"
30422 +#include "../../wander.h"
30423 +#include "../../inode.h"
30424 +#include "../../ktxnmgrd.h"
30425 +#include "../../status_flags.h"
30426 +
30427 +#include <linux/types.h> /* for __u?? */
30428 +#include <linux/fs.h> /* for struct super_block */
30429 +#include <linux/buffer_head.h>
30430 +
30431 +/* reiser 4.0 default disk layout */
30432 +
30433 +/* Number of free blocks needed to perform release_format40 when the fs gets
30434 + mounted RW: 1 for SB, 1 for non-leaves in overwrite set, 2 for tx header
30435 + & tx record. */
30436 +#define RELEASE_RESERVED 4
30437 +
30438 +/* functions to access fields of format40_disk_super_block */
30439 +static __u64 get_format40_block_count(const format40_disk_super_block * sb)
30440 +{
30441 + return le64_to_cpu(get_unaligned(&sb->block_count));
30442 +}
30443 +
30444 +static __u64 get_format40_free_blocks(const format40_disk_super_block * sb)
30445 +{
30446 + return le64_to_cpu(get_unaligned(&sb->free_blocks));
30447 +}
30448 +
30449 +static __u64 get_format40_root_block(const format40_disk_super_block * sb)
30450 +{
30451 + return le64_to_cpu(get_unaligned(&sb->root_block));
30452 +}
30453 +
30454 +static __u16 get_format40_tree_height(const format40_disk_super_block * sb)
30455 +{
30456 + return le16_to_cpu(get_unaligned(&sb->tree_height));
30457 +}
30458 +
30459 +static __u64 get_format40_file_count(const format40_disk_super_block * sb)
30460 +{
30461 + return le64_to_cpu(get_unaligned(&sb->file_count));
30462 +}
30463 +
30464 +static __u64 get_format40_oid(const format40_disk_super_block * sb)
30465 +{
30466 + return le64_to_cpu(get_unaligned(&sb->oid));
30467 +}
30468 +
30469 +static __u32 get_format40_mkfs_id(const format40_disk_super_block * sb)
30470 +{
30471 + return le32_to_cpu(get_unaligned(&sb->mkfs_id));
30472 +}
30473 +
30474 +static __u64 get_format40_flags(const format40_disk_super_block * sb)
30475 +{
30476 + return le64_to_cpu(get_unaligned(&sb->flags));
30477 +}
30478 +
30479 +static format40_super_info *get_sb_info(struct super_block *super)
30480 +{
30481 + return &get_super_private(super)->u.format40;
30482 +}
30483 +
30484 +static int consult_diskmap(struct super_block *s)
30485 +{
30486 + format40_super_info *info;
30487 + journal_location *jloc;
30488 +
30489 + info = get_sb_info(s);
30490 + jloc = &get_super_private(s)->jloc;
30491 + /* Default format-specific locations, if there is nothing in
30492 + * diskmap */
30493 + jloc->footer = FORMAT40_JOURNAL_FOOTER_BLOCKNR;
30494 + jloc->header = FORMAT40_JOURNAL_HEADER_BLOCKNR;
30495 + info->loc.super = FORMAT40_OFFSET / s->s_blocksize;
30496 +#ifdef CONFIG_REISER4_BADBLOCKS
30497 + reiser4_get_diskmap_value(FORMAT40_PLUGIN_DISKMAP_ID, FORMAT40_JF,
30498 + &jloc->footer);
30499 + reiser4_get_diskmap_value(FORMAT40_PLUGIN_DISKMAP_ID, FORMAT40_JH,
30500 + &jloc->header);
30501 + reiser4_get_diskmap_value(FORMAT40_PLUGIN_DISKMAP_ID, FORMAT40_SUPER,
30502 + &info->loc.super);
30503 +#endif
30504 + return 0;
30505 +}
30506 +
30507 +/* find any valid super block of disk_format40 (even if the first
30508 +   super block is destroyed); this will change the block numbers of the
30509 +   actual journal header/footer (jh/jf) if needed */
30510 +static struct buffer_head *find_a_disk_format40_super_block(struct super_block
30511 + *s)
30512 +{
30513 + struct buffer_head *super_bh;
30514 + format40_disk_super_block *disk_sb;
30515 + format40_super_info *info;
30516 +
30517 + assert("umka-487", s != NULL);
30518 +
30519 + info = get_sb_info(s);
30520 +
30521 + super_bh = sb_bread(s, info->loc.super);
30522 + if (super_bh == NULL)
30523 + return ERR_PTR(RETERR(-EIO));
30524 +
30525 + disk_sb = (format40_disk_super_block *) super_bh->b_data;
30526 + if (strncmp(disk_sb->magic, FORMAT40_MAGIC, sizeof(FORMAT40_MAGIC))) {
30527 + brelse(super_bh);
30528 + return ERR_PTR(RETERR(-EINVAL));
30529 + }
30530 +
30531 + reiser4_set_block_count(s, le64_to_cpu(get_unaligned(&disk_sb->block_count)));
30532 + reiser4_set_data_blocks(s, le64_to_cpu(get_unaligned(&disk_sb->block_count)) -
30533 + le64_to_cpu(get_unaligned(&disk_sb->free_blocks)));
30534 + reiser4_set_free_blocks(s, le64_to_cpu(get_unaligned(&disk_sb->free_blocks)));
30535 +
30536 + return super_bh;
30537 +}
30538 +
30539 +/* find the most recent version of the super block. This is called after the
30540 +   journal is replayed */
30541 +static struct buffer_head *read_super_block(struct super_block *s UNUSED_ARG)
30542 +{
30543 + /* Here the most recent superblock copy has to be read. However, as
30544 +	   journal replay isn't complete yet, we use the
30545 +	   find_a_disk_format40_super_block() function. */
30546 + return find_a_disk_format40_super_block(s);
30547 +}
30548 +
30549 +static int get_super_jnode(struct super_block *s)
30550 +{
30551 + reiser4_super_info_data *sbinfo = get_super_private(s);
30552 + jnode *sb_jnode;
30553 + int ret;
30554 +
30555 + sb_jnode = alloc_io_head(&get_sb_info(s)->loc.super);
30556 +
30557 + ret = jload(sb_jnode);
30558 +
30559 + if (ret) {
30560 + drop_io_head(sb_jnode);
30561 + return ret;
30562 + }
30563 +
30564 + pin_jnode_data(sb_jnode);
30565 + jrelse(sb_jnode);
30566 +
30567 + sbinfo->u.format40.sb_jnode = sb_jnode;
30568 +
30569 + return 0;
30570 +}
30571 +
30572 +static void done_super_jnode(struct super_block *s)
30573 +{
30574 + jnode *sb_jnode = get_super_private(s)->u.format40.sb_jnode;
30575 +
30576 + if (sb_jnode) {
30577 + unpin_jnode_data(sb_jnode);
30578 + drop_io_head(sb_jnode);
30579 + }
30580 +}
30581 +
30582 +typedef enum format40_init_stage {
30583 + NONE_DONE = 0,
30584 + CONSULT_DISKMAP,
30585 + FIND_A_SUPER,
30586 + INIT_JOURNAL_INFO,
30587 + INIT_STATUS,
30588 + JOURNAL_REPLAY,
30589 + READ_SUPER,
30590 + KEY_CHECK,
30591 + INIT_OID,
30592 + INIT_TREE,
30593 + JOURNAL_RECOVER,
30594 + INIT_SA,
30595 + INIT_JNODE,
30596 + ALL_DONE
30597 +} format40_init_stage;
30598 +
30599 +static format40_disk_super_block *copy_sb(const struct buffer_head *super_bh)
30600 +{
30601 + format40_disk_super_block *sb_copy;
30602 +
30603 + sb_copy = kmalloc(sizeof(format40_disk_super_block), get_gfp_mask());
30604 + if (sb_copy == NULL)
30605 + return ERR_PTR(RETERR(-ENOMEM));
30606 + memcpy(sb_copy, ((format40_disk_super_block *) super_bh->b_data),
30607 + sizeof(format40_disk_super_block));
30608 + return sb_copy;
30609 +}
30610 +
30611 +static int check_key_format(const format40_disk_super_block *sb_copy)
30612 +{
30613 + if (!equi(REISER4_LARGE_KEY,
30614 + get_format40_flags(sb_copy) & (1 << FORMAT40_LARGE_KEYS))) {
30615 + warning("nikita-3228", "Key format mismatch. "
30616 + "Only %s keys are supported.",
30617 + REISER4_LARGE_KEY ? "large" : "small");
30618 + return RETERR(-EINVAL);
30619 + }
30620 + return 0;
30621 +}
30622 +
30623 +/**
30624 + * try_init_format40
30625 + * @super:
30626 + * @stage:
30627 + *
30628 + */
30629 +static int try_init_format40(struct super_block *super,
30630 + format40_init_stage *stage)
30631 +{
30632 + int result;
30633 + struct buffer_head *super_bh;
30634 + reiser4_super_info_data *sbinfo;
30635 + format40_disk_super_block *sb_copy;
30636 + tree_level height;
30637 + reiser4_block_nr root_block;
30638 + node_plugin *nplug;
30639 +
30640 + assert("vs-475", super != NULL);
30641 + assert("vs-474", get_super_private(super));
30642 +
30643 + *stage = NONE_DONE;
30644 +
30645 + result = consult_diskmap(super);
30646 + if (result)
30647 + return result;
30648 + *stage = CONSULT_DISKMAP;
30649 +
30650 + super_bh = find_a_disk_format40_super_block(super);
30651 + if (IS_ERR(super_bh))
30652 + return PTR_ERR(super_bh);
30653 + brelse(super_bh);
30654 + *stage = FIND_A_SUPER;
30655 +
30656 + /* map jnodes for journal control blocks (header, footer) to disk */
30657 + result = init_journal_info(super);
30658 + if (result)
30659 + return result;
30660 + *stage = INIT_JOURNAL_INFO;
30661 +
30662 +	/* ok, we are sure that the filesystem format is format40 */
30663 +	/* Now check its state */
30664 + result = reiser4_status_init(FORMAT40_STATUS_BLOCKNR);
30665 + if (result != 0 && result != -EINVAL)
30666 + /* -EINVAL means there is no magic, so probably just old
30667 + * fs. */
30668 + return result;
30669 + *stage = INIT_STATUS;
30670 +
30671 + result = reiser4_status_query(NULL, NULL);
30672 + if (result == REISER4_STATUS_MOUNT_WARN)
30673 + printk("Warning, mounting filesystem with errors\n");
30674 + if (result == REISER4_STATUS_MOUNT_RO) {
30675 + printk
30676 + ("Warning, mounting filesystem with fatal errors, forcing read-only mount\n");
30677 + /* FIXME: here we should actually enforce read-only mount,
30678 + * only it is unsupported yet. */
30679 + }
30680 +
30681 + result = reiser4_journal_replay(super);
30682 + if (result)
30683 + return result;
30684 + *stage = JOURNAL_REPLAY;
30685 +
30686 + super_bh = read_super_block(super);
30687 + if (IS_ERR(super_bh))
30688 + return PTR_ERR(super_bh);
30689 + *stage = READ_SUPER;
30690 +
30691 + /* allocate and make a copy of format40_disk_super_block */
30692 + sb_copy = copy_sb(super_bh);
30693 + brelse(super_bh);
30694 + if (IS_ERR(sb_copy))
30695 + return PTR_ERR(sb_copy);
30696 +
30697 +	/* make sure that the key formats of kernel and filesystem match */
30698 + result = check_key_format(sb_copy);
30699 + if (result) {
30700 + kfree(sb_copy);
30701 + return result;
30702 + }
30703 + *stage = KEY_CHECK;
30704 +
30705 + result = oid_init_allocator(super, get_format40_file_count(sb_copy),
30706 + get_format40_oid(sb_copy));
30707 + if (result) {
30708 + kfree(sb_copy);
30709 + return result;
30710 + }
30711 + *stage = INIT_OID;
30712 +
30713 + /* get things necessary to init reiser4_tree */
30714 + root_block = get_format40_root_block(sb_copy);
30715 + height = get_format40_tree_height(sb_copy);
30716 + nplug = node_plugin_by_id(NODE40_ID);
30717 +
30718 +
30719 + /* initialize reiser4_super_info_data */
30720 + sbinfo = get_super_private(super);
30721 + assert("", sbinfo->tree.super == super);
30722 + /* init reiser4_tree for the filesystem */
30723 + result = init_tree(&sbinfo->tree, &root_block, height, nplug);
30724 + if (result) {
30725 + kfree(sb_copy);
30726 + return result;
30727 + }
30728 + *stage = INIT_TREE;
30729 +
30730 + /*
30731 + * initialize reiser4_super_info_data with data from format40 super
30732 + * block
30733 + */
30734 + sbinfo->default_uid = 0;
30735 + sbinfo->default_gid = 0;
30736 + sbinfo->mkfs_id = get_format40_mkfs_id(sb_copy);
30737 + /* number of blocks in filesystem and reserved space */
30738 + reiser4_set_block_count(super, get_format40_block_count(sb_copy));
30739 + sbinfo->blocks_free = get_format40_free_blocks(sb_copy);
30740 + kfree(sb_copy);
30741 +
30742 + sbinfo->fsuid = 0;
30743 + sbinfo->fs_flags |= (1 << REISER4_ADG); /* hard links for directories
30744 + * are not supported */
30745 + sbinfo->fs_flags |= (1 << REISER4_ONE_NODE_PLUGIN); /* all nodes in
30746 + * layout 40 are
30747 + * of one
30748 + * plugin */
30749 + /* sbinfo->tmgr is initialized already */
30750 +
30751 + /* recover sb data which were logged separately from sb block */
30752 +
30753 + /* NOTE-NIKITA: reiser4_journal_recover_sb_data() calls
30754 + * oid_init_allocator() and reiser4_set_free_blocks() with new
30755 + * data. What's the reason to call them above? */
30756 + result = reiser4_journal_recover_sb_data(super);
30757 + if (result != 0)
30758 + return result;
30759 + *stage = JOURNAL_RECOVER;
30760 +
30761 + /*
30762 +	 * Set the number of used blocks. The number of used blocks is stored
30763 +	 * neither in the on-disk super block nor in the journal footer blocks. At
30764 + * this moment actual values of total blocks and free block counters
30765 + * are set in the reiser4 super block (in-memory structure) and we can
30766 + * calculate number of used blocks from them.
30767 + */
30768 + reiser4_set_data_blocks(super,
30769 + reiser4_block_count(super) -
30770 + reiser4_free_blocks(super));
30771 +
30772 +#if REISER4_DEBUG
30773 + sbinfo->min_blocks_used = 16 /* reserved area */ +
30774 + 2 /* super blocks */ +
30775 + 2 /* journal footer and header */ ;
30776 +#endif
30777 +
30778 + /* init disk space allocator */
30779 + result = sa_init_allocator(get_space_allocator(super), super, NULL);
30780 + if (result)
30781 + return result;
30782 + *stage = INIT_SA;
30783 +
30784 + result = get_super_jnode(super);
30785 + if (result == 0)
30786 + *stage = ALL_DONE;
30787 + return result;
30788 +}
30789 +
30790 +/* plugin->u.format.get_ready */
30791 +int init_format_format40(struct super_block *s, void *data UNUSED_ARG)
30792 +{
30793 + int result;
30794 + format40_init_stage stage;
30795 +
30796 + result = try_init_format40(s, &stage);
30797 + switch (stage) {
30798 + case ALL_DONE:
30799 + assert("nikita-3458", result == 0);
30800 + break;
30801 + case INIT_JNODE:
30802 + done_super_jnode(s);
30803 + case INIT_SA:
30804 + sa_destroy_allocator(get_space_allocator(s), s);
30805 + case JOURNAL_RECOVER:
30806 + case INIT_TREE:
30807 + done_tree(&get_super_private(s)->tree);
30808 + case INIT_OID:
30809 + case KEY_CHECK:
30810 + case READ_SUPER:
30811 + case JOURNAL_REPLAY:
30812 + case INIT_STATUS:
30813 + reiser4_status_finish();
30814 + case INIT_JOURNAL_INFO:
30815 + done_journal_info(s);
30816 + case FIND_A_SUPER:
30817 + case CONSULT_DISKMAP:
30818 + case NONE_DONE:
30819 + break;
30820 + default:
30821 + impossible("nikita-3457", "init stage: %i", stage);
30822 + }
30823 +
30824 + if (!rofs_super(s) && reiser4_free_blocks(s) < RELEASE_RESERVED)
30825 + return RETERR(-ENOSPC);
30826 +
30827 + return result;
30828 +}
30829 +
30830 +static void pack_format40_super(const struct super_block *s, char *data)
30831 +{
30832 + format40_disk_super_block *super_data =
30833 + (format40_disk_super_block *) data;
30834 + reiser4_super_info_data *sbinfo = get_super_private(s);
30835 +
30836 + assert("zam-591", data != NULL);
30837 +
30838 + put_unaligned(cpu_to_le64(reiser4_free_committed_blocks(s)),
30839 + &super_data->free_blocks);
30840 + put_unaligned(cpu_to_le64(sbinfo->tree.root_block), &super_data->root_block);
30841 +
30842 + put_unaligned(cpu_to_le64(oid_next(s)), &super_data->oid);
30843 + put_unaligned(cpu_to_le64(oids_used(s)), &super_data->file_count);
30844 +
30845 + put_unaligned(cpu_to_le16(sbinfo->tree.height), &super_data->tree_height);
30846 +}
30847 +
30848 +/* plugin->u.format.log_super
30849 + return a jnode which should be added to transaction when the super block
30850 + gets logged */
30851 +jnode *log_super_format40(struct super_block *s)
30852 +{
30853 + jnode *sb_jnode;
30854 +
30855 + sb_jnode = get_super_private(s)->u.format40.sb_jnode;
30856 +
30857 + jload(sb_jnode);
30858 +
30859 + pack_format40_super(s, jdata(sb_jnode));
30860 +
30861 + jrelse(sb_jnode);
30862 +
30863 + return sb_jnode;
30864 +}
30865 +
30866 +/* plugin->u.format.release */
30867 +int release_format40(struct super_block *s)
30868 +{
30869 + int ret;
30870 + reiser4_super_info_data *sbinfo;
30871 +
30872 + sbinfo = get_super_private(s);
30873 + assert("zam-579", sbinfo != NULL);
30874 +
30875 + if (!rofs_super(s)) {
30876 + ret = capture_super_block(s);
30877 + if (ret != 0)
30878 + warning("vs-898", "capture_super_block failed: %d",
30879 + ret);
30880 +
30881 + ret = txnmgr_force_commit_all(s, 1);
30882 + if (ret != 0)
30883 + warning("jmacd-74438", "txn_force failed: %d", ret);
30884 +
30885 + all_grabbed2free();
30886 + }
30887 +
30888 + sa_destroy_allocator(&sbinfo->space_allocator, s);
30889 + done_journal_info(s);
30890 + done_super_jnode(s);
30891 +
30892 + rcu_barrier();
30893 + done_tree(&sbinfo->tree);
30894 + /* call finish_rcu(), because some znode were "released" in
30895 + * done_tree(). */
30896 + rcu_barrier();
30897 +
30898 + return 0;
30899 +}
30900 +
30901 +#define FORMAT40_ROOT_LOCALITY 41
30902 +#define FORMAT40_ROOT_OBJECTID 42
30903 +
30904 +/* plugin->u.format.root_dir_key */
30905 +const reiser4_key *root_dir_key_format40(const struct super_block *super
30906 + UNUSED_ARG)
30907 +{
30908 + static const reiser4_key FORMAT40_ROOT_DIR_KEY = {
30909 + .el = {
30910 + __constant_cpu_to_le64((FORMAT40_ROOT_LOCALITY << 4) | KEY_SD_MINOR),
30911 +#if REISER4_LARGE_KEY
30912 + ON_LARGE_KEY(0ull,)
30913 +#endif
30914 + __constant_cpu_to_le64(FORMAT40_ROOT_OBJECTID),
30915 + 0ull
30916 + }
30917 + };
30918 +
30919 + return &FORMAT40_ROOT_DIR_KEY;
30920 +}
30921 +
30922 +/* plugin->u.format.check_open.
30923 +   Check the opened object for validity. For now it checks only for a valid
30924 +   oid & locality; this can be improved later, and its work may depend on the
30925 +   mount options. */
30926 +int check_open_format40(const struct inode *object)
30927 +{
30928 + oid_t max, oid;
30929 +
30930 + max = oid_next(object->i_sb) - 1;
30931 +
30932 + /* Check the oid. */
30933 + oid = get_inode_oid(object);
30934 + if (oid > max) {
30935 + warning("vpf-1360", "The object with the oid %llu "
30936 +			"greater than the max used oid %llu found.",
30937 + (unsigned long long)oid, (unsigned long long)max);
30938 +
30939 + return RETERR(-EIO);
30940 + }
30941 +
30942 + /* Check the locality. */
30943 + oid = reiser4_inode_data(object)->locality_id;
30944 + if (oid > max) {
30945 +		warning("vpf-1361", "The object with the locality %llu "
30946 +			"greater than the max used oid %llu found.",
30947 + (unsigned long long)oid, (unsigned long long)max);
30948 +
30949 + return RETERR(-EIO);
30950 + }
30951 +
30952 + return 0;
30953 +}
30954 +
30955 +/* Make Linus happy.
30956 + Local variables:
30957 + c-indentation-style: "K&R"
30958 + mode-name: "LC"
30959 + c-basic-offset: 8
30960 + tab-width: 8
30961 + fill-column: 120
30962 + scroll-step: 1
30963 + End:
30964 +*/
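try_init_format40() records the last completed stage and init_format_format40() unwinds with a deliberately fall-through switch, so each stage is torn down at most once and only if it was actually reached. A stand-alone sketch of this staged-init pattern (the stage names are invented for illustration):

#include <stdio.h>

enum stage { NONE, STAGE_A, STAGE_B, ALL };

static int try_init(enum stage *stage, int fail_at)
{
	*stage = NONE;
	if (fail_at == 1)
		return -1;
	*stage = STAGE_A;	/* resource A acquired */
	if (fail_at == 2)
		return -1;
	*stage = STAGE_B;	/* resource B acquired */
	if (fail_at == 3)
		return -1;
	*stage = ALL;
	return 0;
}

static int do_init(int fail_at)
{
	enum stage stage;
	int ret = try_init(&stage, fail_at);

	switch (stage) {
	case ALL:
		break;		/* success: nothing to undo */
	case STAGE_B:
		printf("undo B\n");	/* fall through */
	case STAGE_A:
		printf("undo A\n");	/* fall through */
	case NONE:
		break;
	}
	return ret;
}

int main(void)
{
	/* fails after STAGE_B: prints "undo B" then "undo A" */
	return do_init(3) ? 1 : 0;
}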
30965 Index: linux-2.6.16/fs/reiser4/plugin/disk_format/disk_format40.h
30966 ===================================================================
30967 --- /dev/null
30968 +++ linux-2.6.16/fs/reiser4/plugin/disk_format/disk_format40.h
30969 @@ -0,0 +1,99 @@
30970 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
30971 +
30972 +/* this file contains:
30973 +   - definition of the on-disk super block of the standard disk layout for
30974 + reiser 4.0 (layout 40)
30975 + - definition of layout 40 specific portion of in-core super block
30976 + - declarations of functions implementing methods of layout plugin
30977 + for layout 40
30978 + - declarations of functions used to get/set fields in layout 40 super block
30979 +*/
30980 +
30981 +#ifndef __DISK_FORMAT40_H__
30982 +#define __DISK_FORMAT40_H__
30983 +
30984 +/* magic for default reiser4 layout */
30985 +#define FORMAT40_MAGIC "ReIsEr40FoRmAt"
30986 +#define FORMAT40_OFFSET (REISER4_MASTER_OFFSET + PAGE_CACHE_SIZE)
30987 +
30988 +#include "../../dformat.h"
30989 +
30990 +#include <linux/fs.h> /* for struct super_block */
30991 +
30992 +typedef enum {
30993 + FORMAT40_LARGE_KEYS
30994 +} format40_flags;
30995 +
30996 +/* ondisk super block for format 40. It is 512 bytes long */
30997 +typedef struct format40_disk_super_block {
30998 + /* 0 */ d64 block_count;
30999 +	/* number of blocks in the filesystem */
31000 + /* 8 */ d64 free_blocks;
31001 + /* number of free blocks */
31002 + /* 16 */ d64 root_block;
31003 + /* filesystem tree root block */
31004 + /* 24 */ d64 oid;
31005 + /* smallest free objectid */
31006 + /* 32 */ d64 file_count;
31007 + /* number of files in a filesystem */
31008 + /* 40 */ d64 flushes;
31009 + /* number of times super block was
31010 +				   flushed. Needed if format 40
31011 +				   ever has several super blocks */
31012 + /* 48 */ d32 mkfs_id;
31013 + /* unique identifier of fs */
31014 + /* 52 */ char magic[16];
31015 + /* magic string ReIsEr40FoRmAt */
31016 + /* 68 */ d16 tree_height;
31017 + /* height of filesystem tree */
31018 + /* 70 */ d16 formatting_policy;
31019 + /* 72 */ d64 flags;
31020 +	/* 80 */ char not_used[432];
31021 +} format40_disk_super_block;
31022 +
31023 +/* format 40 specific part of reiser4_super_info_data */
31024 +typedef struct format40_super_info {
31025 +/* format40_disk_super_block actual_sb; */
31026 + jnode *sb_jnode;
31027 + struct {
31028 + reiser4_block_nr super;
31029 + } loc;
31030 +} format40_super_info;
31031 +
31032 +/* Defines for journal header and footer respectively. */
31033 +#define FORMAT40_JOURNAL_HEADER_BLOCKNR \
31034 + ((REISER4_MASTER_OFFSET / PAGE_CACHE_SIZE) + 3)
31035 +
31036 +#define FORMAT40_JOURNAL_FOOTER_BLOCKNR \
31037 + ((REISER4_MASTER_OFFSET / PAGE_CACHE_SIZE) + 4)
31038 +
31039 +#define FORMAT40_STATUS_BLOCKNR \
31040 + ((REISER4_MASTER_OFFSET / PAGE_CACHE_SIZE) + 5)
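/*
 * Example (assuming REISER4_MASTER_OFFSET == 65536 and 4096-byte pages,
 * i.e. the master super block in block 16): the journal header, journal
 * footer and status blocks land in blocks 19, 20 and 21 respectively.
 */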
31041 +
31042 +/* Diskmap declarations */
31043 +#define FORMAT40_PLUGIN_DISKMAP_ID ((REISER4_FORMAT_PLUGIN_TYPE<<16) | (FORMAT40_ID))
31044 +#define FORMAT40_SUPER 1
31045 +#define FORMAT40_JH 2
31046 +#define FORMAT40_JF 3
31047 +
31048 +/* declarations of functions implementing methods of layout plugin for
31049 +   format 40. The functions themselves are in disk_format40.c */
31050 +int init_format_format40(struct super_block *, void *data);
31051 +const reiser4_key *root_dir_key_format40(const struct super_block *);
31052 +int release_format40(struct super_block *s);
31053 +jnode *log_super_format40(struct super_block *s);
31054 +int check_open_format40(const struct inode *object);
31055 +
31056 +/* __DISK_FORMAT40_H__ */
31057 +#endif
31058 +
31059 +/* Make Linus happy.
31060 + Local variables:
31061 + c-indentation-style: "K&R"
31062 + mode-name: "LC"
31063 + c-basic-offset: 8
31064 + tab-width: 8
31065 + fill-column: 120
31066 + scroll-step: 1
31067 + End:
31068 +*/
31069 Index: linux-2.6.16/fs/reiser4/plugin/fibration.c
31070 ===================================================================
31071 --- /dev/null
31072 +++ linux-2.6.16/fs/reiser4/plugin/fibration.c
31073 @@ -0,0 +1,174 @@
31074 +/* Copyright 2004 by Hans Reiser, licensing governed by
31075 + * reiser4/README */
31076 +
31077 +/* Directory fibrations */
31078 +
31079 +/*
31080 + * Suppose we have a directory tree with sources of some project. During
31081 + * compilation .o files are created within this tree. This makes access
31082 + * to the original source files less efficient, because source files are
31083 + * now "diluted" by object files: default directory plugin uses prefix
31084 + * of a file name as a part of the key for directory entry (and this
31085 + * part is also inherited by the key of file body). This means that
31086 + * foo.o will be located close to foo.c and foo.h in the tree.
31087 + *
31088 + * To avoid this effect the directory plugin fills the highest 7
31089 + * (originally unused) bits of the second component of the directory
31090 + * entry key with a bit-pattern that depends on the file name (see
31091 + * fs/reiser4/kassign.c:build_entry_key_common()). These bits are called
31092 + * the "fibre". The fibre of the file name key is inherited by the key
31093 + * of stat data and the keys of the file body (with REISER4_LARGE_KEY).
31094 + *
31095 + * The fibre for a given file is chosen by the per-directory fibration
31096 + * plugin. Names within a given fibre are ordered lexicographically.
31097 + */
31098 +
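To make the ordering effect concrete, here is a small userspace sketch
(an illustration only, not part of the kernel code; the FIBRE_NO macro
and the shift of 57 are reproduced from the definitions below):

	#include <stdio.h>

	typedef unsigned long long u64;

	/* top 7 bits of a 64-bit key component hold the fibre */
	#define FIBRE_NO(n) (((u64)(n)) << 57)

	/* toy copy of the dot-o fibration: .o names go into fibre 1 */
	static u64 toy_fibre(const char *name, int len)
	{
		if (len > 2 && name[len - 1] == 'o' && name[len - 2] == '.')
			return FIBRE_NO(1);
		return FIBRE_NO(0);
	}

	int main(void)
	{
		/* foo.c and foo.h get fibre 0 while foo.o gets fibre 1, so
		   foo.o sorts after every fibre-0 name despite the shared
		   "foo" prefix */
		printf("foo.c -> %#llx\n", toy_fibre("foo.c", 5));
		printf("foo.o -> %#llx\n", toy_fibre("foo.o", 5));
		return 0;
	}

Since the fibre forms the most significant bits of the key component,
ordering by the full key groups names by fibre first and only then
lexicographically within each fibre.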
31099 +#include "../debug.h"
31100 +#include "plugin_header.h"
31101 +#include "plugin.h"
31102 +#include "../super.h"
31103 +#include "../inode.h"
31104 +
31105 +#include <linux/types.h>
31106 +
31107 +static const int fibre_shift = 57;
31108 +
31109 +#define FIBRE_NO(n) (((__u64)(n)) << fibre_shift)
31110 +
31111 +/*
31112 + * Trivial fibration: all files of the directory are just ordered
31113 + * lexicographically.
31114 + */
31115 +static __u64 fibre_trivial(const struct inode *dir, const char *name, int len)
31116 +{
31117 + return FIBRE_NO(0);
31118 +}
31119 +
31120 +/*
31121 + * dot-o fibration: place .o files after all others.
31122 + */
31123 +static __u64 fibre_dot_o(const struct inode *dir, const char *name, int len)
31124 +{
31125 + /* special treatment for .*\.o */
31126 + if (len > 2 && name[len - 1] == 'o' && name[len - 2] == '.')
31127 + return FIBRE_NO(1);
31128 + else
31129 + return FIBRE_NO(0);
31130 +}
31131 +
31132 +/*
31133 + * ext.1 fibration: subdivide the directory into 128 fibres, one for
31134 + * each 7-bit extension character (file "foo.h" goes into fibre "h"),
31135 + * plus a default fibre for the rest.
31136 + */
31137 +static __u64 fibre_ext_1(const struct inode *dir, const char *name, int len)
31138 +{
31139 + if (len > 2 && name[len - 2] == '.')
31140 + return FIBRE_NO(name[len - 1]);
31141 + else
31142 + return FIBRE_NO(0);
31143 +}
31144 +
31145 +/*
31146 + * ext.3 fibration: try to separate files with different 3-character
31147 + * extensions from each other.
31148 + */
31149 +static __u64 fibre_ext_3(const struct inode *dir, const char *name, int len)
31150 +{
31151 + if (len > 4 && name[len - 4] == '.')
31152 + return FIBRE_NO(name[len - 3] + name[len - 2] + name[len - 1]);
31153 + else
31154 + return FIBRE_NO(0);
31155 +}
31156 +
31157 +static int change_fibration(struct inode *inode, reiser4_plugin * plugin)
31158 +{
31159 + int result;
31160 +
31161 + assert("nikita-3503", inode != NULL);
31162 + assert("nikita-3504", plugin != NULL);
31163 +
31164 + assert("nikita-3505", is_reiser4_inode(inode));
31165 + assert("nikita-3506", inode_dir_plugin(inode) != NULL);
31166 + assert("nikita-3507",
31167 + plugin->h.type_id == REISER4_FIBRATION_PLUGIN_TYPE);
31168 +
31169 + result = 0;
31170 + if (inode_fibration_plugin(inode) == NULL ||
31171 + inode_fibration_plugin(inode)->h.id != plugin->h.id) {
31172 + if (is_dir_empty(inode) == 0)
31173 + result =
31174 + plugin_set_fibration(&reiser4_inode_data(inode)->
31175 + pset, &plugin->fibration);
31176 + else
31177 + result = RETERR(-ENOTEMPTY);
31178 +
31179 + }
31180 + return result;
31181 +}
31182 +
31183 +static reiser4_plugin_ops fibration_plugin_ops = {
31184 + .init = NULL,
31185 + .load = NULL,
31186 + .save_len = NULL,
31187 + .save = NULL,
31188 + .change = change_fibration
31189 +};
31190 +
31191 +/* fibration plugins */
31192 +fibration_plugin fibration_plugins[LAST_FIBRATION_ID] = {
31193 + [FIBRATION_LEXICOGRAPHIC] = {
31194 + .h = {
31195 + .type_id = REISER4_FIBRATION_PLUGIN_TYPE,
31196 + .id = FIBRATION_LEXICOGRAPHIC,
31197 + .pops = &fibration_plugin_ops,
31198 + .label = "lexicographic",
31199 + .desc = "no fibration",
31200 + .linkage = {NULL, NULL}
31201 + },
31202 + .fibre = fibre_trivial
31203 + },
31204 + [FIBRATION_DOT_O] = {
31205 + .h = {
31206 + .type_id = REISER4_FIBRATION_PLUGIN_TYPE,
31207 + .id = FIBRATION_DOT_O,
31208 + .pops = &fibration_plugin_ops,
31209 + .label = "dot-o",
31210 + .desc = "fibrate .o files separately",
31211 + .linkage = {NULL, NULL}
31212 + },
31213 + .fibre = fibre_dot_o
31214 + },
31215 + [FIBRATION_EXT_1] = {
31216 + .h = {
31217 + .type_id = REISER4_FIBRATION_PLUGIN_TYPE,
31218 + .id = FIBRATION_EXT_1,
31219 + .pops = &fibration_plugin_ops,
31220 + .label = "ext-1",
31221 + .desc = "fibrate file by single character extension",
31222 + .linkage = {NULL, NULL}
31223 + },
31224 + .fibre = fibre_ext_1
31225 + },
31226 + [FIBRATION_EXT_3] = {
31227 + .h = {
31228 + .type_id = REISER4_FIBRATION_PLUGIN_TYPE,
31229 + .id = FIBRATION_EXT_3,
31230 + .pops = &fibration_plugin_ops,
31231 + .label = "ext-3",
31232 + .desc = "fibrate file by three character extension",
31233 + .linkage = {NULL, NULL}
31234 + },
31235 + .fibre = fibre_ext_3
31236 + }
31237 +};
31238 +
31239 +/*
31240 + * Local variables:
31241 + * c-indentation-style: "K&R"
31242 + * mode-name: "LC"
31243 + * c-basic-offset: 8
31244 + * tab-width: 8
31245 + * fill-column: 79
31246 + * End:
31247 + */
31248 Index: linux-2.6.16/fs/reiser4/plugin/fibration.h
31249 ===================================================================
31250 --- /dev/null
31251 +++ linux-2.6.16/fs/reiser4/plugin/fibration.h
31252 @@ -0,0 +1,37 @@
31253 +/* Copyright 2004 by Hans Reiser, licensing governed by reiser4/README */
31254 +
31255 +/* Fibration plugin used by the hashed directory plugin to segment the
31256 + * content of a directory. See fs/reiser4/plugin/fibration.c for more. */
31257 +
31258 +#if !defined( __FS_REISER4_PLUGIN_FIBRATION_H__ )
31259 +#define __FS_REISER4_PLUGIN_FIBRATION_H__
31260 +
31261 +#include "plugin_header.h"
31262 +
31263 +typedef struct fibration_plugin {
31264 + /* generic fields */
31265 + plugin_header h;
31266 +
31267 + __u64(*fibre) (const struct inode * dir, const char *name, int len);
31268 +} fibration_plugin;
31269 +
31270 +typedef enum {
31271 + FIBRATION_LEXICOGRAPHIC,
31272 + FIBRATION_DOT_O,
31273 + FIBRATION_EXT_1,
31274 + FIBRATION_EXT_3,
31275 + LAST_FIBRATION_ID
31276 +} reiser4_fibration_id;
31277 +
31278 +/* __FS_REISER4_PLUGIN_FIBRATION_H__ */
31279 +#endif
31280 +
31281 +/* Make Linus happy.
31282 + Local variables:
31283 + c-indentation-style: "K&R"
31284 + mode-name: "LC"
31285 + c-basic-offset: 8
31286 + tab-width: 8
31287 + fill-column: 120
31288 + End:
31289 +*/
31290 Index: linux-2.6.16/fs/reiser4/plugin/file/Makefile
31291 ===================================================================
31292 --- /dev/null
31293 +++ linux-2.6.16/fs/reiser4/plugin/file/Makefile
31294 @@ -0,0 +1,7 @@
31295 +obj-$(CONFIG_REISER4_FS) += file_plugins.o
31296 +
31297 +file_plugins-objs := \
31298 + file.o \
31299 + tail_conversion.o \
31300 + symlink.o \
31301 + cryptcompress.o
31302 Index: linux-2.6.16/fs/reiser4/plugin/file/cryptcompress.c
31303 ===================================================================
31304 --- /dev/null
31305 +++ linux-2.6.16/fs/reiser4/plugin/file/cryptcompress.c
31306 @@ -0,0 +1,3817 @@
31307 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
31308 + reiser4/README */
31309 +
31310 +/* This file contains implementations of inode/file/address_space plugin
31311 + * operations specific to the cryptcompress file plugin, which manages
31312 + * files with compressed and encrypted bodies. A "cryptcompress file" is
31313 + * built of items of CTAIL_ID (see http://www.namesys.com/cryptcompress_design.html for details).
31314 + */
31315 +
31316 +#include "../../page_cache.h"
31317 +#include "../../inode.h"
31318 +#include "../cluster.h"
31319 +#include "../object.h"
31320 +#include "../../tree_walk.h"
31321 +#include "cryptcompress.h"
31322 +
31323 +#include <asm/scatterlist.h>
31324 +#include <linux/pagevec.h>
31325 +#include <asm/uaccess.h>
31326 +#include <linux/swap.h>
31327 +#include <linux/writeback.h>
31328 +#include <linux/random.h>
31329 +
31330 +/* get cryptcompress specific portion of inode */
31331 +cryptcompress_info_t *cryptcompress_inode_data(const struct inode *inode)
31332 +{
31333 + return &reiser4_inode_data(inode)->file_plugin_data.cryptcompress_info;
31334 +}
31335 +
31336 +/* plugin->u.file.init_inode_data */
31337 +void
31338 +init_inode_data_cryptcompress(struct inode *inode,
31339 + reiser4_object_create_data * crd, int create)
31340 +{
31341 + cryptcompress_info_t *data;
31342 +
31343 + data = cryptcompress_inode_data(inode);
31344 + assert("edward-685", data != NULL);
31345 +
31346 + memset(data, 0, sizeof(*data));
31347 +
31348 + init_rwsem(&data->lock);
31349 + toggle_compression(data, 1);
31350 + init_inode_ordering(inode, crd, create);
31351 +}
31352 +
31353 +#if REISER4_DEBUG
31354 +int crc_inode_ok(struct inode *inode)
31355 +{
31356 + if (cluster_shift_ok(inode_cluster_shift(inode)))
31357 + return 1;
31358 + assert("edward-686", 0);
31359 + return 0;
31360 +}
31361 +#endif
31362 +
31363 +static int check_cryptcompress(struct inode *inode)
31364 +{
31365 + int result = 0;
31366 + assert("edward-1307", inode_compression_plugin(inode) != NULL);
31367 +
31368 + if (inode_cluster_size(inode) < PAGE_CACHE_SIZE) {
31369 + warning("edward-1331",
31370 + "%s clusters are unsupported",
31371 + inode_cluster_plugin(inode)->h.label);
31372 + return RETERR(-EINVAL);
31373 + }
31374 +
31375 + /* FIXME-EDWARD: init? or check? */
31376 + if (inode_compression_plugin(inode)->init)
31377 + result = inode_compression_plugin(inode)->init();
31378 + return result;
31379 +}
31380 +
31381 +/* The following is a part of reiser4 cipher key manager
31382 + which is called when opening/creating a cryptcompress file */
31383 +
31384 +/* get/set cipher key info */
31385 +crypto_stat_t * inode_crypto_stat (struct inode * inode)
31386 +{
31387 + assert("edward-90", inode != NULL);
31388 + assert("edward-91", reiser4_inode_data(inode) != NULL);
31389 + return cryptcompress_inode_data(inode)->crypt;
31390 +}
31391 +
31392 +static void set_inode_crypto_stat (struct inode * inode, crypto_stat_t * stat)
31393 +{
31394 + cryptcompress_inode_data(inode)->crypt = stat;
31395 +}
31396 +
31397 +/* allocate a cipher key info */
31398 +crypto_stat_t * alloc_crypto_stat (struct inode * inode)
31399 +{
31400 + crypto_stat_t * info;
31401 + int fipsize;
31402 +
31403 + assert("edward-1421", 0);
31404 + info = kmalloc(sizeof(*info), GFP_KERNEL);
31405 + if (!info)
31406 + return ERR_PTR(-ENOMEM);
31407 + memset(info, 0, sizeof (*info));
31408 + fipsize = inode_digest_plugin(inode)->fipsize;
31409 + info->keyid = kmalloc(fipsize, GFP_KERNEL);
31410 + if (!info->keyid) {
31411 + kfree(info);
31412 + return ERR_PTR(-ENOMEM);
31413 + }
31414 + return info;
31415 +}
31416 +
31417 +#if 0
31418 +/* allocate/free low-level info for cipher and digest
31419 + transforms */
31420 +static int
31421 +alloc_crypto_tfms(plugin_set * pset, crypto_stat_t * info)
31422 +{
31423 + struct crypto_tfm * ret = NULL;
31424 + cipher_plugin * cplug = pset->cipher;
31425 + digest_plugin * dplug = pset->digest;
31426 +
31427 + assert("edward-1363", info != NULL);
31428 + assert("edward-414", cplug != NULL);
31429 + assert("edward-415", dplug != NULL);
31430 +
31431 + if (cplug->alloc) {
31432 + ret = cplug->alloc();
31433 + if (ret == NULL) {
31434 + warning("edward-1364",
31435 + "Can not allocate info for %s\n",
31436 + cplug->h.desc);
31437 + return RETERR(-EINVAL);
31438 + }
31439 + }
31440 + info_set_tfm(info, CIPHER_TFM, ret);
31441 + if (dplug->alloc) {
31442 + ret = dplug->alloc();
31443 + if (ret == NULL) {
31444 + warning("edward-1365",
31445 + "Can not allocate info for %s\n",
31446 + dplug->h.desc);
31447 + goto err;
31448 + }
31449 + }
31450 + info_set_tfm(info, DIGEST_TFM, ret);
31451 + return 0;
31452 + err:
31453 + if (cplug->free) {
31454 + cplug->free(info->tfma[CIPHER_TFM].tfm);
31455 + info_set_tfm(info, CIPHER_TFM, NULL);
31456 + }
31457 + return RETERR(-EINVAL);
31458 +}
31459 +#endif
31460 +
31461 +static void
31462 +free_crypto_tfms(crypto_stat_t * info)
31463 +{
31464 + assert("edward-1366", info != NULL);
31465 + if (!info_cipher_tfm(info))
31466 + return;
31467 + info_cipher_plugin(info)->free(info_cipher_tfm(info));
31468 + info_set_tfm(info, CIPHER_TFM, NULL);
31469 + info_digest_plugin(info)->free(info_digest_tfm(info));
31470 + info_set_tfm(info, DIGEST_TFM, NULL);
31471 + return;
31472 +}
31473 +
31474 +#if 0
31475 +/* create a key fingerprint for disk stat-data */
31476 +static int create_keyid (crypto_stat_t * info, crypto_data_t * data)
31477 +{
31478 + int ret = -ENOMEM;
31479 + size_t blk, pad;
31480 + __u8 * dmem;
31481 + __u8 * cmem;
31482 + struct crypto_tfm * dtfm;
31483 + struct crypto_tfm * ctfm;
31484 + struct scatterlist sg;
31485 +
31486 + assert("edward-1422", 0);
31487 + assert("edward-1367", info != NULL);
31488 + assert("edward-1368", info->keyid != NULL);
31489 +
31490 + dtfm = info_digest_tfm(info);
31491 + ctfm = info_cipher_tfm(info);
31492 +
31493 + dmem = kmalloc((size_t)crypto_tfm_alg_digestsize(dtfm),
31494 + GFP_KERNEL);
31495 + if (!dmem)
31496 + goto exit1;
31497 +
31498 + blk = crypto_tfm_alg_blocksize(ctfm);
31499 +
31500 + pad = data->keyid_size % blk;
31501 + pad = (pad ? blk - pad : 0);
31502 +
31503 + cmem = kmalloc((size_t)data->keyid_size + pad, GFP_KERNEL);
31504 + if (!cmem)
31505 + goto exit2;
31506 + memcpy(cmem, data->keyid, data->keyid_size);
31507 + memset(cmem + data->keyid_size, 0, pad);
31508 +
31509 + sg.page = virt_to_page(cmem);
31510 + sg.offset = offset_in_page(cmem);
31511 + sg.length = data->keyid_size + pad;
31512 +
31513 + ret = crypto_cipher_encrypt(ctfm, &sg, &sg, data->keyid_size + pad);
31514 + if (ret) {
31515 + warning("edward-1369",
31516 + "encryption failed flags=%x\n", ctfm->crt_flags);
31517 + goto exit3;
31518 + }
31519 + crypto_digest_init (dtfm);
31520 + crypto_digest_update (dtfm, &sg, 1);
31521 + crypto_digest_final (dtfm, dmem);
31522 + memcpy(info->keyid, dmem, info_digest_plugin(info)->fipsize);
31523 + exit3:
31524 + kfree(cmem);
31525 + exit2:
31526 + kfree(dmem);
31527 + exit1:
31528 + return ret;
31529 +}
31530 +#endif
31531 +
31532 +static void destroy_keyid(crypto_stat_t * info)
31533 +{
31534 + assert("edward-1370", info != NULL);
31535 + assert("edward-1371", info->keyid != NULL);
31536 + kfree(info->keyid);
31537 + return;
31538 +}
31539 +
31540 +static void free_crypto_stat (crypto_stat_t * info)
31541 +{
31542 + assert("edward-1372", info != NULL);
31543 +
31544 + free_crypto_tfms(info);
31545 + destroy_keyid(info);
31546 + kfree(info);
31547 +}
31548 +
31549 +#if 0
31550 +static void instantiate_crypto_stat(crypto_stat_t * info)
31551 +{
31552 + assert("edward-1373", info != NULL);
31553 + assert("edward-1374", info->inst == 0);
31554 + info->inst = 1;
31555 +}
31556 +#endif
31557 +
31558 +static void uninstantiate_crypto_stat(crypto_stat_t * info)
31559 +{
31560 + assert("edward-1375", info != NULL);
31561 + info->inst = 0;
31562 +}
31563 +
31564 +static int crypto_stat_instantiated(crypto_stat_t * info)
31565 +{
31566 + return info->inst;
31567 +}
31568 +
31569 +static int inode_has_cipher_key(struct inode * inode)
31570 +{
31571 + assert("edward-1376", inode != NULL);
31572 + return inode_crypto_stat(inode) &&
31573 + crypto_stat_instantiated(inode_crypto_stat(inode));
31574 +}
31575 +
31576 +static void inode_free_crypto_stat (struct inode * inode)
31577 +{
31578 + uninstantiate_crypto_stat(inode_crypto_stat(inode));
31579 + free_crypto_stat(inode_crypto_stat(inode));
31580 +}
31581 +
31582 +static int need_cipher(struct inode * inode)
31583 +{
31584 + return inode_cipher_plugin(inode) !=
31585 + cipher_plugin_by_id(NONE_CIPHER_ID);
31586 +}
31587 +
31588 +/* Create a crypto-stat and attach the result to @object.
31589 + On success, the low-level cipher info contains
31590 + an instantiated key */
31591 +#if 0
31592 +crypto_stat_t *
31593 +create_crypto_stat(struct inode * object,
31594 + crypto_data_t * data /* this contains an (uninstantiated)
31595 + cipher key imported from user
31596 + space */)
31597 +{
31598 + int ret;
31599 + crypto_stat_t * info;
31600 +
31601 + assert("edward-1377", data != NULL);
31602 + assert("edward-1378", need_cipher(object));
31603 +
31604 + if (inode_file_plugin(object) !=
31605 + file_plugin_by_id(DIRECTORY_FILE_PLUGIN_ID))
31606 + return ERR_PTR(-EINVAL);
31607 +
31608 + info = alloc_crypto_stat(object);
31609 + if (IS_ERR(info))
31610 + return info;
31611 + ret = alloc_crypto_tfms(reiser4_inode_data(object)->pset, info);
31612 + if (ret)
31613 + goto err;
31614 + /* Someone can change plugins of the host (for example if
31615 + the host is a directory), so we keep the original ones
31616 + in the crypto-stat. */
31617 + info_set_cipher_plugin(info, inode_cipher_plugin(object));
31618 + info_set_digest_plugin(info, inode_digest_plugin(object));
31619 + /* instantiating a key */
31620 + ret = crypto_cipher_setkey(info_cipher_tfm(info),
31621 + data->key,
31622 + data->keysize);
31623 + if (ret) {
31624 + warning("edward-1379",
31625 + "setkey failed flags=%x\n",
31626 + info_cipher_tfm(info)->crt_flags);
31627 + goto err;
31628 + }
31629 + info->keysize = data->keysize;
31630 + ret = create_keyid(info, data);
31631 + if (ret)
31632 + goto err;
31633 + instantiate_crypto_stat(info);
31634 + return info;
31635 + err:
31636 + free_crypto_stat(info);
31637 + return ERR_PTR(ret);
31638 +}
31639 +#endif
31640 +
31641 +/* increment/decrement a load counter when
31642 + attaching/detaching the crypto-stat to any object */
31643 +static void load_crypto_stat(crypto_stat_t * info)
31644 +{
31645 + assert("edward-1380", info != NULL);
31646 + inc_keyload_count(info);
31647 +}
31648 +
31649 +static void unload_crypto_stat(struct inode * inode)
31650 +{
31651 + crypto_stat_t * info = inode_crypto_stat(inode);
31652 + assert("edward-1381", info->keyload_count > 0);
31653 +
31654 + dec_keyload_count(inode_crypto_stat(inode));
31655 + if (info->keyload_count == 0)
31656 + /* final release */
31657 + inode_free_crypto_stat(inode);
31658 +}
31659 +
31660 +/* attach/detach an existing crypto-stat */
31661 +void attach_crypto_stat(struct inode * inode, crypto_stat_t * info)
31662 +{
31663 + assert("edward-1382", inode != NULL);
31664 + assert("edward-1383", info != NULL);
31665 + assert("edward-1384", inode_crypto_stat(inode) == NULL);
31666 +
31667 + set_inode_crypto_stat(inode, info);
31668 + load_crypto_stat(info);
31669 +}
31670 +
31671 +/* returns true if a crypto stat can be attached to the @host */
31672 +#if REISER4_DEBUG
31673 +static int host_allows_crypto_stat(struct inode * host)
31674 +{
31675 + int ret;
31676 + file_plugin * fplug = inode_file_plugin(host);
31677 +
31678 + switch (fplug->h.id) {
31679 + case CRC_FILE_PLUGIN_ID:
31680 + ret = 1;
31681 + break;
31682 + default:
31683 + ret = 0;
31684 + }
31685 + return ret;
31686 +}
31687 +#endif /* REISER4_DEBUG */
31688 +
31689 +void detach_crypto_stat(struct inode * inode)
31690 +{
31691 + assert("edward-1385", inode != NULL);
31692 + assert("edward-1386", host_allows_crypto_stat(inode));
31693 +
31694 + if (inode_crypto_stat(inode))
31695 + unload_crypto_stat(inode);
31696 + set_inode_crypto_stat(inode, NULL);
31697 +}
31698 +
31699 +#if 0
31700 +
31701 +/* compare fingerprints of @child and @parent */
31702 +static int keyid_eq(crypto_stat_t * child, crypto_stat_t * parent)
31703 +{
31704 + return !memcmp(child->keyid, parent->keyid, info_digest_plugin(parent)->fipsize);
31705 +}
31706 +
31707 +/* check if a crypto-stat (which is bound to @parent) can be inherited */
31708 +int can_inherit_crypto_crc(struct inode *child, struct inode *parent)
31709 +{
31710 + if (!need_cipher(child))
31711 + return 0;
31712 + /* the child is created */
31713 + if (!inode_crypto_stat(child))
31714 + return 1;
31715 + /* the child is looked up */
31716 + if (!inode_crypto_stat(parent))
31717 + return 0;
31718 + return (inode_cipher_plugin(child) == inode_cipher_plugin(parent) &&
31719 + inode_digest_plugin(child) == inode_digest_plugin(parent) &&
31720 + inode_crypto_stat(child)->keysize == inode_crypto_stat(parent)->keysize &&
31721 + keyid_eq(inode_crypto_stat(child), inode_crypto_stat(parent)));
31722 +}
31723 +#endif
31724 +
31725 +/* helper functions for ->create() method of the cryptcompress plugin */
31726 +static int inode_set_crypto(struct inode * object)
31727 +{
31728 + reiser4_inode * info;
31729 + if (!inode_crypto_stat(object)) {
31730 + if (need_cipher(object))
31731 + return RETERR(-EINVAL);
31732 + /* the file is not to be encrypted */
31733 + return 0;
31734 + }
31735 + info = reiser4_inode_data(object);
31736 + info->extmask |= (1 << CRYPTO_STAT);
31737 + info->plugin_mask |= (1 << PSET_CIPHER) | (1 << PSET_DIGEST);
31738 + return 0;
31739 +}
31740 +
31741 +static int
31742 +inode_set_compression(struct inode * object)
31743 +{
31744 + int result = 0;
31745 + compression_plugin * cplug;
31746 + reiser4_inode * info = reiser4_inode_data(object);
31747 +
31748 + cplug = inode_compression_plugin(object);
31749 +
31750 + if (cplug->init != NULL) {
31751 + result = cplug->init();
31752 + if (result)
31753 + return result;
31754 + }
31755 + info->plugin_mask |= (1 << PSET_COMPRESSION);
31756 +
31757 + return 0;
31758 +}
31759 +
31760 +static void
31761 +inode_set_compression_mode(struct inode * object)
31762 +{
31763 + reiser4_inode * info = reiser4_inode_data(object);
31764 +
31765 + info->plugin_mask |= (1 << PSET_COMPRESSION_MODE);
31766 + return;
31767 +}
31768 +
31769 +static int inode_set_cluster(struct inode *object)
31770 +{
31771 + reiser4_inode *info;
31772 + cluster_plugin *cplug;
31773 +
31774 + assert("edward-696", object != NULL);
31775 +
31776 + info = reiser4_inode_data(object);
31777 + cplug = inode_cluster_plugin(object);
31778 +
31779 + if (cplug->shift < PAGE_CACHE_SHIFT) {
31780 + warning("edward-1320",
31781 + "Can not support %p clusters (less then page size)",
31782 + cplug->h.label);
31783 + return RETERR(-EINVAL);
31784 + }
31785 + info->plugin_mask |= (1 << PSET_CLUSTER);
31786 + return 0;
31787 +}
31788 +
31789 +/* ->destroy_inode() method of the cryptcompress plugin */
31790 +void destroy_inode_cryptcompress(struct inode * inode)
31791 +{
31792 + assert("edward-23", cryptcompress_inode_data(inode)->pgcount == 0);
31793 + detach_crypto_stat(inode);
31794 + return;
31795 +}
31796 +
31797 +/* ->create() method of the cryptcompress plugin
31798 +
31799 +. install plugins
31800 +. attach crypto info if specified
31801 +. attach compression info if specified
31802 +. attach cluster info
31803 +*/
31804 +int
31805 +create_cryptcompress(struct inode *object, struct inode *parent,
31806 + reiser4_object_create_data * data)
31807 +{
31808 + int result;
31809 + reiser4_inode *info;
31810 +
31811 + assert("edward-23", object != NULL);
31812 + assert("edward-24", parent != NULL);
31813 + assert("edward-30", data != NULL);
31814 + assert("edward-26", inode_get_flag(object, REISER4_NO_SD));
31815 + assert("edward-27", data->id == CRC_FILE_PLUGIN_ID);
31816 +
31817 + info = reiser4_inode_data(object);
31818 +
31819 + assert("edward-29", info != NULL);
31820 +
31821 + /* set file bit */
31822 + info->plugin_mask |= (1 << PSET_FILE);
31823 +
31824 + /* set crypto */
31825 + result = inode_set_crypto(object);
31826 + if (result)
31827 + goto error;
31828 + /* set compression */
31829 + result = inode_set_compression(object);
31830 + if (result)
31831 + goto error;
31832 + inode_set_compression_mode(object);
31833 +
31834 + /* set cluster info */
31835 + result = inode_set_cluster(object);
31836 + if (result)
31837 + goto error;
31838 + /* set plugin mask */
31839 + info->extmask |= (1 << PLUGIN_STAT);
31840 +
31841 + /* save everything in disk stat-data */
31842 + result = write_sd_by_inode_common(object);
31843 + if (!result)
31844 + return 0;
31845 + error:
31846 + detach_crypto_stat(object);
31847 + return result;
31848 +}
31849 +
31850 +/* ->open() method of the cryptcompress plugin */
31851 +int open_cryptcompress(struct inode * inode, struct file * file)
31852 +{
31853 + struct inode * parent;
31854 +
31855 + assert("edward-1394", inode != NULL);
31856 + assert("edward-1395", file != NULL);
31857 + assert("edward-1396", file != NULL);
31858 + assert("edward-1397", file->f_dentry->d_inode == inode);
31859 + assert("edward-1398", file->f_dentry->d_parent != NULL);
31860 + assert("edward-1399", file->f_dentry->d_parent->d_inode != NULL);
31861 + assert("edward-698",
31862 + inode_file_plugin(inode) ==
31863 + file_plugin_by_id(CRC_FILE_PLUGIN_ID));
31864 +
31865 + if (!need_cipher(inode))
31866 + /* the file is not to be ciphered */
31867 + return 0;
31868 + parent = file->f_dentry->d_parent->d_inode;
31869 + if (!inode_has_cipher_key(inode))
31870 + return RETERR(-EINVAL);
31871 + return 0;
31872 +}
31873 +
31874 +/* returns the block size, an attribute of the cipher algorithm */
31875 +static unsigned int
31876 +cipher_blocksize(struct inode * inode)
31877 +{
31878 + assert("edward-758", need_cipher(inode));
31879 + assert("edward-1400", inode_crypto_stat(inode) != NULL);
31880 + return crypto_tfm_alg_blocksize
31881 + (info_cipher_tfm(inode_crypto_stat(inode)));
31882 +}
31883 +
31884 +/* returns offset translated by scale factor of the crypto-algorithm */
31885 +static loff_t inode_scaled_offset (struct inode * inode,
31886 + const loff_t src_off /* input offset */)
31887 +{
31888 + assert("edward-97", inode != NULL);
31889 +
31890 + if (!need_cipher(inode) ||
31891 + src_off == get_key_offset(min_key()) ||
31892 + src_off == get_key_offset(max_key()))
31893 + return src_off;
31894 +
31895 + return inode_cipher_plugin(inode)->scale(inode,
31896 + cipher_blocksize(inode),
31897 + src_off);
31898 +}
31899 +
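The ->scale() method itself is cipher-plugin specific. As a hedged
sketch only (example_scale is a hypothetical helper, not a function
from this patch), an aligning cipher would typically round the offset
up to a whole number of cipher blocks:

	/* hypothetical scale method of an aligning cipher */
	static loff_t example_scale(struct inode *inode, size_t blocksize,
				    loff_t src_off)
	{
		/* e.g. blocksize 16, src_off 1000 -> 1008 (63 blocks) */
		return ((src_off + blocksize - 1) / blocksize) * blocksize;
	}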
31900 +/* returns disk cluster size */
31901 +size_t inode_scaled_cluster_size(struct inode * inode)
31902 +{
31903 + assert("edward-110", inode != NULL);
31904 +
31905 + return inode_scaled_offset(inode, inode_cluster_size(inode));
31906 +}
31907 +
31908 +static int new_cluster(reiser4_cluster_t * clust, struct inode *inode)
31909 +{
31910 + return (clust_to_off(clust->index, inode) >= inode->i_size);
31911 +}
31912 +
31913 +/* set number of cluster pages */
31914 +static void set_cluster_nrpages(reiser4_cluster_t * clust, struct inode *inode)
31915 +{
31916 + reiser4_slide_t *win;
31917 +
31918 + assert("edward-180", clust != NULL);
31919 + assert("edward-1040", inode != NULL);
31920 +
31921 + win = clust->win;
31922 + if (!win) {
31923 + /* NOTE-EDWARD: i_size should be protected */
31924 + clust->nr_pages =
31925 + count_to_nrpages(fsize_to_count(clust, inode));
31926 + return;
31927 + }
31928 + assert("edward-1176", clust->op != PCL_UNKNOWN);
31929 + assert("edward-1064", win->off + win->count + win->delta != 0);
31930 +
31931 + if (win->stat == HOLE_WINDOW &&
31932 + win->off == 0 && win->count == inode_cluster_size(inode)) {
31933 + /* special case: we start writing a hole from a fake cluster */
31934 + clust->nr_pages = 0;
31935 + return;
31936 + }
31937 + clust->nr_pages =
31938 + count_to_nrpages(max_count(win->off + win->count + win->delta,
31939 + fsize_to_count(clust, inode)));
31940 + return;
31941 +}
31942 +
31943 +/* ->key_by_inode() method of the cryptcompress plugin */
31944 +/* see plugin/plugin.h for details */
31945 +int
31946 +key_by_inode_cryptcompress(struct inode *inode, loff_t off, reiser4_key * key)
31947 +{
31948 + loff_t clust_off;
31949 +
31950 + assert("edward-64", inode != 0);
31951 + // assert("edward-112", ergo(off != get_key_offset(max_key()), !off_to_cloff(off, inode)));
31952 + /* don't come here with other offsets */
31953 +
31954 + clust_off =
31955 + (off ==
31956 + get_key_offset(max_key())? get_key_offset(max_key()) :
31957 + off_to_clust_to_off(off, inode));
31958 +
31959 + key_by_inode_and_offset_common(inode, 0, key);
31960 + set_key_offset(key,
31961 + (__u64) (!inode_crypto_stat(inode) ? clust_off :
31962 + inode_scaled_offset(inode, clust_off)));
31963 + return 0;
31964 +}
31965 +
31966 +/* plugin->flow_by_inode */
31967 +int
31968 +flow_by_inode_cryptcompress(struct inode *inode /* file to build flow for */ ,
31969 + const char __user *buf /* user level buffer */ ,
31970 + int user /* 1 if @buf is of user space, 0 - if it is
31971 + kernel space */ ,
31972 + loff_t size /* buffer size */ ,
31973 + loff_t off /* offset to start io from */ ,
31974 + rw_op op /* READ or WRITE */ ,
31975 + flow_t * f /* resulting flow */ )
31976 +{
31977 + assert("edward-436", f != NULL);
31978 + assert("edward-149", inode != NULL);
31979 + assert("edward-150", inode_file_plugin(inode) != NULL);
31980 + assert("edward-151",
31981 + inode_file_plugin(inode)->key_by_inode ==
31982 + key_by_inode_cryptcompress);
31983 +
31984 + f->length = size;
31985 + memcpy(&f->data, &buf, sizeof(buf));
31986 + f->user = user;
31987 + f->op = op;
31988 +
31989 + if (op == WRITE_OP && user == 1)
31990 + return 0;
31991 + return key_by_inode_cryptcompress(inode, off, &f->key);
31992 +}
31993 +
31994 +static int
31995 +crc_hint_validate(hint_t * hint, const reiser4_key * key,
31996 + znode_lock_mode lock_mode)
31997 +{
31998 + coord_t *coord;
31999 +
32000 + assert("edward-704", hint != NULL);
32001 + assert("edward-1089", !hint->ext_coord.valid);
32002 + assert("edward-706", hint->lh.owner == NULL);
32003 +
32004 + coord = &hint->ext_coord.coord;
32005 +
32006 + if (!hint || !hint_is_set(hint) || hint->mode != lock_mode)
32007 + /* hint either not set or set by different operation */
32008 + return RETERR(-E_REPEAT);
32009 +
32010 + if (get_key_offset(key) != hint->offset)
32011 + /* hint is set for different key */
32012 + return RETERR(-E_REPEAT);
32013 +
32014 + assert("edward-707", schedulable());
32015 +
32016 + return seal_validate(&hint->seal, &hint->ext_coord.coord,
32017 + key, &hint->lh, lock_mode, ZNODE_LOCK_LOPRI);
32018 +}
32019 +
32020 +/* reserve disk space when writing a logical cluster */
32021 +static int reserve4cluster(struct inode *inode, reiser4_cluster_t *clust)
32022 +{
32023 + int result = 0;
32024 +
32025 + assert("edward-965", schedulable());
32026 + assert("edward-439", inode != NULL);
32027 + assert("edward-440", clust != NULL);
32028 + assert("edward-441", clust->pages != NULL);
32029 + assert("edward-1261", get_current_context()->grabbed_blocks == 0);
32030 +
32031 + if (clust->nr_pages == 0) {
32032 + assert("edward-1152", clust->win != NULL);
32033 + assert("edward-1153", clust->win->stat == HOLE_WINDOW);
32034 + /* don't reserve space for a fake disk cluster */
32035 + return 0;
32036 + }
32037 + assert("edward-442", jprivate(clust->pages[0]) != NULL);
32038 +
32039 + result = reiser4_grab_space_force(estimate_insert_cluster(inode) +
32040 + estimate_update_cluster(inode),
32041 + BA_CAN_COMMIT);
32042 + if (result)
32043 + return result;
32044 + clust->reserved = 1;
32045 + grabbed2cluster_reserved(estimate_insert_cluster(inode) +
32046 + estimate_update_cluster(inode));
32047 +#if REISER4_DEBUG
32048 + clust->reserved_prepped = estimate_update_cluster(inode);
32049 + clust->reserved_unprepped = estimate_insert_cluster(inode);
32050 +#endif
32051 + /* there can be space grabbed by txnmgr_force_commit_all */
32052 + all_grabbed2free();
32053 + return 0;
32054 +}
32055 +
32056 +/* free reserved disk space if writing a logical cluster fails */
32057 +static void
32058 +free_reserved4cluster(struct inode *inode, reiser4_cluster_t * clust, int count)
32059 +{
32060 + assert("edward-967", clust->reserved == 1);
32061 +
32062 + cluster_reserved2free(count);
32063 + clust->reserved = 0;
32064 +}
32065 +
32066 +/* The core search procedure of the cryptcompress plugin.
32067 + If the returned value is not cbk_errored, the current znode is locked */
32068 +static int find_cluster_item(hint_t * hint,
32069 + const reiser4_key * key, /* key of the item we are
32070 + looking for */
32071 + znode_lock_mode lock_mode /* which lock */ ,
32072 + ra_info_t * ra_info, lookup_bias bias, __u32 flags)
32073 +{
32074 + int result;
32075 + reiser4_key ikey;
32076 + coord_t *coord = &hint->ext_coord.coord;
32077 + coord_t orig = *coord;
32078 +
32079 + assert("edward-152", hint != NULL);
32080 +
32081 + if (hint->ext_coord.valid == 0) {
32082 + result = crc_hint_validate(hint, key, lock_mode);
32083 + if (result == -E_REPEAT)
32084 + goto traverse_tree;
32085 + else if (result) {
32086 + assert("edward-1216", 0);
32087 + return result;
32088 + }
32089 + hint->ext_coord.valid = 1;
32090 + }
32091 + assert("edward-709", znode_is_any_locked(coord->node));
32092 +
32093 + /* An in-place lookup is going on here: we just need to
32094 + check if the next item at @coord matches the @key hint */
32095 +
32096 + if (equal_to_rdk(coord->node, key)) {
32097 + result = goto_right_neighbor(coord, &hint->lh);
32098 + if (result == -E_NO_NEIGHBOR) {
32099 + assert("edward-1217", 0);
32100 + return RETERR(-EIO);
32101 + }
32102 + if (result)
32103 + return result;
32104 + assert("edward-1218", equal_to_ldk(coord->node, key));
32105 + } else {
32106 + coord->item_pos++;
32107 + coord->unit_pos = 0;
32108 + coord->between = AT_UNIT;
32109 + }
32110 + result = zload(coord->node);
32111 + if (result)
32112 + return result;
32113 + assert("edward-1219", !node_is_empty(coord->node));
32114 +
32115 + if (!coord_is_existing_item(coord)) {
32116 + zrelse(coord->node);
32117 + goto not_found;
32118 + }
32119 + item_key_by_coord(coord, &ikey);
32120 + zrelse(coord->node);
32121 + if (!keyeq(key, &ikey))
32122 + goto not_found;
32123 + return CBK_COORD_FOUND;
32124 +
32125 + not_found:
32126 + assert("edward-1220", coord->item_pos > 0);
32127 + //coord->item_pos--;
32128 + /* roll back */
32129 + *coord = orig;
32130 + ON_DEBUG(coord_update_v(coord));
32131 + return CBK_COORD_NOTFOUND;
32132 +
32133 + traverse_tree:
32134 + assert("edward-713", hint->lh.owner == NULL);
32135 + assert("edward-714", schedulable());
32136 +
32137 + unset_hint(hint);
32138 + coord_init_zero(coord);
32139 + result = coord_by_key(current_tree, key, coord, &hint->lh,
32140 + lock_mode, bias, LEAF_LEVEL, LEAF_LEVEL,
32141 + CBK_UNIQUE | flags, ra_info);
32142 + if (cbk_errored(result))
32143 + return result;
32144 + hint->ext_coord.valid = 1;
32145 + return result;
32146 +}
32147 +
32148 +/* This function is called by deflate[inflate] manager when
32149 + creating a transformed/plain stream to check if we should
32150 + create/cut some overhead. If this returns true, then @oh
32151 + contains the size of this overhead.
32152 + */
32153 +static int
32154 +need_cut_or_align(struct inode * inode, reiser4_cluster_t * clust,
32155 + rw_op rw, int * oh)
32156 +{
32157 + tfm_cluster_t * tc = &clust->tc;
32158 + switch (rw) {
32159 + case WRITE_OP: /* estimate align */
32160 + *oh = tc->len % cipher_blocksize(inode);
32161 + if (*oh != 0)
32162 + return 1;
32163 + break;
32164 + case READ_OP: /* estimate cut */
32165 + *oh = *(tfm_output_data(clust) + tc->len - 1);
32166 + break;
32167 + default:
32168 + impossible("edward-1401", "bad option");
32169 + }
32170 + return (tc->len != tc->lsize);
32171 +}
32172 +
32173 +/* create/cut an overhead of transformed/plain stream */
32174 +static void
32175 +align_or_cut_overhead(struct inode * inode, reiser4_cluster_t * clust, rw_op rw)
32176 +{
32177 + int oh;
32178 + cipher_plugin * cplug = inode_cipher_plugin(inode);
32179 +
32180 + assert("edward-1402", need_cipher(inode));
32181 +
32182 + if (!need_cut_or_align(inode, clust, rw, &oh))
32183 + return;
32184 + switch (rw) {
32185 + case WRITE_OP: /* do align */
32186 + clust->tc.len +=
32187 + cplug->align_stream(tfm_input_data(clust) +
32188 + clust->tc.len, clust->tc.len,
32189 + cipher_blocksize(inode));
32190 + *(tfm_input_data(clust) + clust->tc.len - 1) =
32191 + cipher_blocksize(inode) - oh;
32192 + break;
32193 + case READ_OP: /* do cut */
32194 + assert("edward-1403", oh <= cipher_blocksize(inode));
32195 + clust->tc.len -= oh;
32196 + break;
32197 + default:
32198 + impossible("edward-1404", "bad option");
32199 + }
32200 + return;
32201 +}
32202 +
32203 +/* the following functions are used to evaluate the results
32204 + of the compression transform */
32205 +static unsigned
32206 +max_cipher_overhead(struct inode * inode)
32207 +{
32208 + if (!need_cipher(inode) || !inode_cipher_plugin(inode)->align_stream)
32209 + return 0;
32210 + return cipher_blocksize(inode);
32211 +}
32212 +
32213 +static int deflate_overhead(struct inode *inode)
32214 +{
32215 + return (inode_compression_plugin(inode)->
32216 + checksum ? DC_CHECKSUM_SIZE : 0);
32217 +}
32218 +
32219 +static unsigned deflate_overrun(struct inode * inode, int ilen)
32220 +{
32221 + return coa_overrun(inode_compression_plugin(inode), ilen);
32222 +}
32223 +
32224 +/* Estimate the compressibility of a logical cluster by the various
32225 + policies represented by the compression mode plugin.
32226 + If this returns false, the compressor won't be called for
32227 + the cluster of index @index.
32228 +*/
32229 +static int should_compress(tfm_cluster_t * tc, cloff_t index,
32230 + struct inode *inode)
32231 +{
32232 + compression_plugin *cplug = inode_compression_plugin(inode);
32233 + compression_mode_plugin *mplug = inode_compression_mode_plugin(inode);
32234 +
32235 + assert("edward-1321", tc->len != 0);
32236 + assert("edward-1322", cplug != NULL);
32237 + assert("edward-1323", mplug != NULL);
32238 +
32239 + return /* estimate by size */
32240 + (cplug->min_size_deflate ?
32241 + tc->len >= cplug->min_size_deflate() :
32242 + 1) &&
32243 + /* estimate by compression mode plugin */
32244 + (mplug->should_deflate ?
32245 + mplug->should_deflate(inode, index) :
32246 + 1);
32247 +}
32248 +
32249 +/* Evaluate the results of the compression transform.
32250 + Returns true if we need to accept these results */
32251 +static int
32252 +save_compressed(int size_before, int size_after, struct inode * inode)
32253 +{
32254 + return (size_after + deflate_overhead(inode) +
32255 + max_cipher_overhead(inode) < size_before);
32256 +}
32257 +
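A worked example with illustrative numbers: for a cluster that
compresses from 65536 bytes to 65000, with a 4-byte checksum and at
most 16 bytes of cipher padding, 65000 + 4 + 16 = 65020 < 65536, so
the result is accepted; a result of 65530 bytes would be discarded,
since 65530 + 4 + 16 = 65550 >= 65536.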
32258 +/* Guess the result of the evaluation above */
32259 +static int
32260 +need_inflate(reiser4_cluster_t * clust, struct inode *inode,
32261 + int encrypted /* is cluster encrypted */ )
32262 +{
32263 + tfm_cluster_t *tc = &clust->tc;
32264 +
32265 + assert("edward-142", tc != 0);
32266 + assert("edward-143", inode != NULL);
32267 +
32268 + return tc->len <
32269 + (encrypted ?
32270 + inode_scaled_offset(inode, tc->lsize) :
32271 + tc->lsize);
32272 +}
32273 +
32274 +/* If the results of compression were accepted, then we add
32275 + a checksum to catch possible disk cluster corruption.
32276 + The following is the format of the data stored in disk clusters:
32277 +
32278 + data This is (transformed) logical cluster.
32279 + cipher_overhead This is created by ->align() method
32280 + of cipher plugin. May be absent.
32281 + checksum (4) This is created by ->checksum method
32282 + of compression plugin to check
32283 + integrity. May be absent.
32284 +
32285 + Crypto overhead format:
32286 +
32287 + data
32288 + control_byte (1) contains aligned overhead size:
32289 + 1 <= overhead <= cipher_blksize
32290 +*/
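A worked example of this layout (illustrative sizes, assuming a
16-byte cipher block and the 4-byte checksum): a 1000-byte transformed
stream gets 8 bytes of alignment so that its length (1008) is a
multiple of 16; the control byte, i.e. the last pad byte, stores the
overhead size 8; appending the checksum brings the disk cluster
payload to 1012 bytes. On read the checksum is checked and stripped
first; after decryption the control byte tells how many pad bytes to
cut.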
32291 +/* Append a checksum at the end of a transformed stream */
32292 +static void dc_set_checksum(compression_plugin * cplug, tfm_cluster_t * tc)
32293 +{
32294 + __u32 checksum;
32295 +
32296 + assert("edward-1309", tc != NULL);
32297 + assert("edward-1310", tc->len > 0);
32298 + assert("edward-1311", cplug->checksum != NULL);
32299 +
32300 + checksum = cplug->checksum(tfm_stream_data(tc, OUTPUT_STREAM), tc->len);
32301 + put_unaligned(cpu_to_le32(checksum),
32302 + (d32 *)(tfm_stream_data(tc, OUTPUT_STREAM) + tc->len));
32303 + tc->len += (int)DC_CHECKSUM_SIZE;
32304 +}
32305 +
32306 +/* Check a disk cluster checksum.
32307 + Returns 0 if checksum is correct, otherwise returns 1 */
32308 +static int dc_check_checksum(compression_plugin * cplug, tfm_cluster_t * tc)
32309 +{
32310 + assert("edward-1312", tc != NULL);
32311 + assert("edward-1313", tc->len > (int)DC_CHECKSUM_SIZE);
32312 + assert("edward-1314", cplug->checksum != NULL);
32313 +
32314 + if (cplug->checksum(tfm_stream_data(tc, INPUT_STREAM),
32315 + tc->len - (int)DC_CHECKSUM_SIZE) !=
32316 + le32_to_cpu(get_unaligned((d32 *)
32317 + (tfm_stream_data(tc, INPUT_STREAM)
32318 + + tc->len - (int)DC_CHECKSUM_SIZE)))) {
32319 + warning("edward-156",
32320 + "Bad disk cluster checksum %d, (should be %d) Fsck?\n",
32321 + (int)le32_to_cpu
32322 + (get_unaligned((d32 *)
32323 + (tfm_stream_data(tc, INPUT_STREAM) +
32324 + tc->len - (int)DC_CHECKSUM_SIZE))),
32325 + (int)cplug->checksum
32326 + (tfm_stream_data(tc, INPUT_STREAM),
32327 + tc->len - (int)DC_CHECKSUM_SIZE));
32328 + return 1;
32329 + }
32330 + tc->len -= (int)DC_CHECKSUM_SIZE;
32331 + return 0;
32332 +}
32333 +
32334 +/* get input/output stream for some transform action */
32335 +int grab_tfm_stream(struct inode * inode, tfm_cluster_t * tc,
32336 + tfm_stream_id id)
32337 +{
32338 + size_t size = inode_scaled_cluster_size(inode);
32339 +
32340 + assert("edward-901", tc != NULL);
32341 + assert("edward-1027", inode_compression_plugin(inode) != NULL);
32342 +
32343 + if (tc->act == TFM_WRITE_ACT)
32344 + size += deflate_overrun(inode, inode_cluster_size(inode));
32345 +
32346 + if (!tfm_stream(tc, id) && id == INPUT_STREAM)
32347 + alternate_streams(tc);
32348 + if (!tfm_stream(tc, id))
32349 + return alloc_tfm_stream(tc, size, id);
32350 +
32351 + assert("edward-902", tfm_stream_is_set(tc, id));
32352 +
32353 + if (tfm_stream_size(tc, id) < size)
32354 + return realloc_tfm_stream(tc, size, id);
32355 + return 0;
32356 +}
32357 +
32358 +/* Common deflate manager */
32359 +int deflate_cluster(reiser4_cluster_t * clust, struct inode * inode)
32360 +{
32361 + int result = 0;
32362 + int compressed = 0;
32363 + int encrypted = 0;
32364 + tfm_cluster_t * tc = &clust->tc;
32365 + compression_plugin * coplug;
32366 +
32367 + assert("edward-401", inode != NULL);
32368 + assert("edward-903", tfm_stream_is_set(tc, INPUT_STREAM));
32369 + assert("edward-1348", tc->act == TFM_WRITE_ACT);
32370 + assert("edward-498", !tfm_cluster_is_uptodate(tc));
32371 +
32372 + coplug = inode_compression_plugin(inode);
32373 + if (should_compress(tc, clust->index, inode)) {
32374 + /* try to compress, discard bad results */
32375 + __u32 dst_len;
32376 + compression_mode_plugin * mplug =
32377 + inode_compression_mode_plugin(inode);
32378 + assert("edward-602", coplug != NULL);
32379 + assert("edward-1423", coplug->compress != NULL);
32380 +
32381 + result = grab_coa(tc, coplug);
32382 + if (result) {
32383 + warning("edward-1424",
32384 + "alloc_coa failed with ret=%d, skipped compression",
32385 + result);
32386 + goto cipher;
32387 + }
32388 + result = grab_tfm_stream(inode, tc, OUTPUT_STREAM);
32389 + if (result) {
32390 + warning("edward-1425",
32391 + "alloc stream failed with ret=%d, skipped compression",
32392 + result);
32393 + goto cipher;
32394 + }
32395 + dst_len = tfm_stream_size(tc, OUTPUT_STREAM);
32396 + coplug->compress(get_coa(tc, coplug->h.id, tc->act),
32397 + tfm_input_data(clust), tc->len,
32398 + tfm_output_data(clust), &dst_len);
32399 + /* make sure we didn't overwrite extra bytes */
32400 + assert("edward-603",
32401 + dst_len <= tfm_stream_size(tc, OUTPUT_STREAM));
32402 +
32403 + /* evaluate results of compression transform */
32404 + if (save_compressed(tc->len, dst_len, inode)) {
32405 + /* good result, accept */
32406 + tc->len = dst_len;
32407 + if (mplug->accept_hook != NULL) {
32408 + result = mplug->accept_hook(inode, clust->index);
32409 + if (result)
32410 + warning("edward-1426",
32411 + "accept_hook failed with ret=%d",
32412 + result);
32413 + }
32414 + compressed = 1;
32415 + }
32416 + else {
32417 + /* bad result, discard */
32418 +#if REISER4_DEBUG
32419 + if (cluster_is_complete(clust, inode))
32420 + warning("edward-1338",
32421 + "incompressible cluster %lu (inode %llu)",
32422 + clust->index,
32423 + (unsigned long long)get_inode_oid(inode));
32424 +#endif
32425 + if (mplug->discard_hook != NULL &&
32426 + cluster_is_complete(clust, inode)) {
32427 + result = mplug->discard_hook(inode,
32428 + clust->index);
32429 + if (result)
32430 + warning("edward-1427",
32431 + "discard_hook failed with ret=%d",
32432 + result);
32433 + }
32434 + }
32435 + }
32436 + cipher:
32437 + if (need_cipher(inode)) {
32438 + cipher_plugin * ciplug;
32439 + struct crypto_tfm * tfm;
32440 + struct scatterlist src;
32441 + struct scatterlist dst;
32442 +
32443 + ciplug = inode_cipher_plugin(inode);
32444 + tfm = info_cipher_tfm(inode_crypto_stat(inode));
32445 + if (compressed)
32446 + alternate_streams(tc);
32447 + result = grab_tfm_stream(inode, tc, OUTPUT_STREAM);
32448 + if (result)
32449 + return result;
32450 +
32451 + align_or_cut_overhead(inode, clust, WRITE_OP);
32452 + src.page = virt_to_page(tfm_input_data(clust));
32453 + src.offset = offset_in_page(tfm_input_data(clust));
32454 + src.length = tc->len;
32455 +
32456 + dst.page = virt_to_page(tfm_output_data(clust));
32457 + dst.offset = offset_in_page(tfm_output_data(clust));
32458 + dst.length = tc->len;
32459 +
32460 + result = crypto_cipher_encrypt(tfm, &dst, &src, tc->len);
32461 + if (result) {
32462 + warning("edward-1405",
32463 + "encryption failed flags=%x\n", tfm->crt_flags);
32464 + return result;
32465 + }
32466 + encrypted = 1;
32467 + }
32468 + if (compressed && coplug->checksum != NULL)
32469 + dc_set_checksum(coplug, tc);
32470 + if (!compressed && !encrypted)
32471 + alternate_streams(tc);
32472 + return result;
32473 +}
32474 +
32475 +/* Common inflate manager. */
32476 +int inflate_cluster(reiser4_cluster_t * clust, struct inode * inode)
32477 +{
32478 + int result = 0;
32479 + int transformed = 0;
32480 + tfm_cluster_t * tc = &clust->tc;
32481 + compression_plugin * coplug;
32482 +
32483 + assert("edward-905", inode != NULL);
32484 + assert("edward-1178", clust->dstat == PREP_DISK_CLUSTER);
32485 + assert("edward-906", tfm_stream_is_set(&clust->tc, INPUT_STREAM));
32486 + assert("edward-1349", tc->act == TFM_READ_ACT);
32487 + assert("edward-907", !tfm_cluster_is_uptodate(tc));
32488 +
32489 + /* Handle a checksum (if any) */
32490 + coplug = inode_compression_plugin(inode);
32491 + if (need_inflate(clust, inode, need_cipher(inode)) &&
32492 + coplug->checksum != NULL) {
32493 + result = dc_check_checksum(coplug, tc);
32494 + if (result)
32495 + return RETERR(-EIO);
32496 + }
32497 + if (need_cipher(inode)) {
32498 + cipher_plugin * ciplug;
32499 + struct crypto_tfm * tfm;
32500 + struct scatterlist src;
32501 + struct scatterlist dst;
32502 +
32503 + ciplug = inode_cipher_plugin(inode);
32504 + tfm = info_cipher_tfm(inode_crypto_stat(inode));
32505 + result = grab_tfm_stream(inode, tc, OUTPUT_STREAM);
32506 + if (result)
32507 + return result;
32508 + assert("edward-909", tfm_cluster_is_set(tc));
32509 +
32510 + src.page = virt_to_page(tfm_input_data(clust));
32511 + src.offset = offset_in_page(tfm_input_data(clust));
32512 + src.length = tc->len;
32513 +
32514 + dst.page = virt_to_page(tfm_output_data(clust));
32515 + dst.offset = offset_in_page(tfm_output_data(clust));
32516 + dst.length = tc->len;
32517 +
32518 + result = crypto_cipher_decrypt(tfm, &dst, &src, tc->len);
32519 + if (result)
32520 + return result;
32521 + align_or_cut_overhead(inode, clust, READ_OP);
32522 + transformed = 1;
32523 + }
32524 + if (need_inflate(clust, inode, 0)) {
32525 + unsigned dst_len = inode_cluster_size(inode);
32526 + if(transformed)
32527 + alternate_streams(tc);
32528 +
32529 + result = grab_tfm_stream(inode, tc, OUTPUT_STREAM);
32530 + if (result)
32531 + return result;
32532 + assert("edward-1305", coplug->decompress != NULL);
32533 + assert("edward-910", tfm_cluster_is_set(tc));
32534 +
32535 + coplug->decompress(get_coa(tc, coplug->h.id, tc->act),
32536 + tfm_input_data(clust), tc->len,
32537 + tfm_output_data(clust), &dst_len);
32538 + /* check length */
32539 + tc->len = dst_len;
32540 + assert("edward-157", dst_len == tc->lsize);
32541 + transformed = 1;
32542 + }
32543 + if (!transformed)
32544 + alternate_streams(tc);
32545 + return result;
32546 +}
32547 +
32548 +/* This is the implementation of the readpage method of struct
32549 + address_space_operations for the cryptcompress plugin. */
32550 +int readpage_cryptcompress(struct file *file, struct page *page)
32551 +{
32552 + reiser4_context *ctx;
32553 + reiser4_cluster_t clust;
32554 + item_plugin *iplug;
32555 + int result;
32556 +
32557 + assert("edward-88", PageLocked(page));
32558 + assert("vs-976", !PageUptodate(page));
32559 + assert("edward-89", page->mapping && page->mapping->host);
32560 +
32561 + ctx = init_context(page->mapping->host->i_sb);
32562 + if (IS_ERR(ctx))
32563 + return PTR_ERR(ctx);
32564 + result = check_cryptcompress(page->mapping->host);
32565 + if (result) {
32566 + unlock_page(page);
32567 + reiser4_exit_context(ctx);
32568 + return result;
32569 + }
32570 + assert("edward-113",
32571 + ergo(file != NULL,
32572 + page->mapping == file->f_dentry->d_inode->i_mapping));
32573 +
32574 + if (PageUptodate(page)) {
32575 + warning("edward-1338", "page is already uptodate\n");
32576 + reiser4_exit_context(ctx);
32577 + return 0;
32578 + }
32579 + cluster_init_read(&clust, NULL);
32580 + clust.file = file;
32581 + iplug = item_plugin_by_id(CTAIL_ID);
32582 + if (!iplug->s.file.readpage) {
32583 + unlock_page(page);
32584 + put_cluster_handle(&clust);
32585 + reiser4_exit_context(ctx);
32586 + return -EINVAL;
32587 + }
32588 + result = iplug->s.file.readpage(&clust, page);
32589 + if (result)
32590 + unlock_page(page);
32591 + assert("edward-64",
32592 + ergo(result == 0, (PageLocked(page) || PageUptodate(page))));
32593 + put_cluster_handle(&clust);
32594 + reiser4_exit_context(ctx);
32595 + return result;
32596 +}
32597 +
32598 +/* how many pages will be captured */
32599 +static int cluster_nrpages_to_capture(reiser4_cluster_t * clust)
32600 +{
32601 + switch (clust->op) {
32602 + case PCL_APPEND:
32603 + return clust->nr_pages;
32604 + case PCL_TRUNCATE:
32605 + assert("edward-1179", clust->win != NULL);
32606 + return count_to_nrpages(clust->win->off + clust->win->count);
32607 + default:
32608 + impossible("edward-1180", "bad page cluster option");
32609 + return 0;
32610 + }
32611 +}
32612 +
32613 +static void set_cluster_pages_dirty(reiser4_cluster_t * clust)
32614 +{
32615 + int i;
32616 + struct page *pg;
32617 + int nrpages = cluster_nrpages_to_capture(clust);
32618 +
32619 + for (i = 0; i < nrpages; i++) {
32620 +
32621 + pg = clust->pages[i];
32622 + assert("edward-968", pg != NULL);
32623 + lock_page(pg);
32624 + assert("edward-1065", PageUptodate(pg));
32625 + set_page_dirty_internal(pg);
32626 + unlock_page(pg);
32627 + mark_page_accessed(pg);
32628 + }
32629 +}
32630 +
32631 +static void clear_cluster_pages_dirty(reiser4_cluster_t * clust)
32632 +{
32633 + int i;
32634 + assert("edward-1275", clust != NULL);
32635 +
32636 + for (i = 0; i < clust->nr_pages; i++) {
32637 + assert("edward-1276", clust->pages[i] != NULL);
32638 +
32639 + lock_page(clust->pages[i]);
32640 + if (PageDirty(clust->pages[i])) {
32641 + assert("edward-1277", PageUptodate(clust->pages[i]));
32642 + clear_page_dirty_for_io(clust->pages[i]);
32643 + }
32644 +#if REISER4_DEBUG
32645 + else
32646 + /* Race between flush and write:
32647 + some pages became clean when write() (or another
32648 + process which modifies data) captured the cluster. */
32649 + warning("edward-985", "Page of index %lu (inode %llu)"
32650 + " is not dirty\n", clust->pages[i]->index,
32651 + (unsigned long long)get_inode_oid(clust->
32652 + pages[i]->
32653 + mapping->
32654 + host));
32655 +#endif
32656 + unlock_page(clust->pages[i]);
32657 + }
32658 +}
32659 +
32660 +/* update i_size by window */
32661 +static void inode_set_new_size(reiser4_cluster_t * clust, struct inode *inode)
32662 +{
32663 + loff_t size;
32664 + reiser4_slide_t *win;
32665 +
32666 + assert("edward-1181", clust != NULL);
32667 + assert("edward-1182", inode != NULL);
32668 +
32669 + win = clust->win;
32670 + assert("edward-1183", win != NULL);
32671 +
32672 + size = clust_to_off(clust->index, inode) + win->off;
32673 +
32674 + switch (clust->op) {
32675 + case PCL_APPEND:
32676 + if (size + win->count <= inode->i_size)
32677 + /* overwrite only */
32678 + return;
32679 + size += win->count;
32680 + break;
32681 + case PCL_TRUNCATE:
32682 + break;
32683 + default:
32684 + impossible("edward-1184", "bad page cluster option");
32685 + break;
32686 + }
32687 + inode_check_scale_nolock(inode, inode->i_size, size);
32688 + inode->i_size = size;
32689 + return;
32690 +}
32691 +
32692 +/* Check in page cluster modifications.
32693 + . Make the jnode dirty, if it wasn't;
32694 + . Reserve space for a disk cluster update by the flush algorithm, if needed;
32695 + . Clean up old references (if any);
32696 + . Put pages (grabbed in this thread) which will be truncated.
32697 +*/
32698 +static void
32699 +make_cluster_jnode_dirty_locked(reiser4_cluster_t * clust, jnode * node,
32700 + loff_t * old_isize, struct inode *inode)
32701 +{
32702 + int i;
32703 + int old_nrpages;
32704 + int new_nrpages = cluster_nrpages_to_capture(clust);
32705 +
32706 + assert("edward-973", new_nrpages > 0);
32707 + assert("edward-221", node != NULL);
32708 + assert("edward-971", clust->reserved == 1);
32709 + assert_spin_locked(&(node->guard));
32710 + assert("edward-972", node->page_count < cluster_nrpages(inode));
32711 + assert("edward-1263",
32712 + clust->reserved_prepped == estimate_update_cluster(inode));
32713 + assert("edward-1264", clust->reserved_unprepped == 0);
32714 +
32715 + if (JF_ISSET(node, JNODE_DIRTY)) {
32716 + /* someone has modified this cluster, but
32717 + the modifications are not committed yet */
32718 + old_nrpages =
32719 + count_to_nrpages(cnt_to_clcnt(*old_isize,
32720 + clust->index, inode));
32721 + /* free space which is already reserved */
32722 + free_reserved4cluster(inode, clust,
32723 + estimate_update_cluster(inode));
32724 + /* put old references */
32725 + for (i = 0; i < old_nrpages; i++) {
32726 + assert("edward-975", clust->pages[i]);
32727 + assert("edward-1185", PageUptodate(clust->pages[i]));
32728 +
32729 + page_cache_release(clust->pages[i]);
32730 +#if REISER4_DEBUG
32731 + cryptcompress_inode_data(inode)->pgcount --;
32732 +#endif
32733 + }
32734 + } else {
32735 + /* no captured pages */
32736 + assert("edward-1043", node->page_count == 0);
32737 + jnode_make_dirty_locked(node);
32738 + clust->reserved = 0;
32739 + }
32740 + /* put pages that will be truncated (if any) */
32741 + for (i = new_nrpages; i < clust->nr_pages; i++) {
32742 + assert("edward-1433", clust->pages[i]);
32743 + assert("edward-1434", PageUptodate(clust->pages[i]));
32744 + page_cache_release(clust->pages[i]);
32745 +#if REISER4_DEBUG
32746 + cryptcompress_inode_data(inode)->pgcount --;
32747 +#endif
32748 + }
32749 +#if REISER4_DEBUG
32750 + clust->reserved_prepped -= estimate_update_cluster(inode);
32751 + node->page_count = new_nrpages - 1;
32752 +#endif
32753 + return;
32754 +}
32755 +
32756 +/* This function spawns a transaction and
32757 + is called by any thread as a final step in page cluster modification.
32758 +*/
32759 +static int try_capture_cluster(reiser4_cluster_t * clust, struct inode *inode)
32760 +{
32761 + int result = 0;
32762 + loff_t old_size;
32763 + jnode *node;
32764 +
32765 + assert("edward-1029", clust != NULL);
32766 + assert("edward-1030", clust->reserved == 1);
32767 + assert("edward-1031", clust->nr_pages != 0);
32768 + assert("edward-1032", clust->pages != NULL);
32769 + assert("edward-1033", clust->pages[0] != NULL);
32770 +
32771 + node = jprivate(clust->pages[0]);
32772 +
32773 + assert("edward-1035", node != NULL);
32774 +
32775 + spin_lock_jnode(node);
32776 + old_size = inode->i_size;
32777 + if (clust->win)
32778 + inode_set_new_size(clust, inode);
32779 +
32780 + result = try_capture(node, ZNODE_WRITE_LOCK, 0);
32781 + if (result)
32782 + goto exit;
32783 + make_cluster_jnode_dirty_locked(clust, node, &old_size, inode);
32784 + exit:
32785 + assert("edward-1034", !result);
32786 + spin_unlock_jnode(node);
32787 + jput(node);
32788 + return result;
32789 +}
32790 +
32791 +/* Collect unlocked cluster pages for any modifications and attach a jnode.
32792 + We allocate only one jnode per cluster; this jnode is bound to the first
32793 + page of the cluster, so we hold an extra reference that lives as long as
32794 + the jnode does; the other references are cleaned up at flush time.
32795 +*/
32796 +static int
32797 +grab_cluster_pages_jnode(struct inode *inode, reiser4_cluster_t * clust)
32798 +{
32799 + int i;
32800 + int result = 0;
32801 + jnode *node = NULL;
32802 +
32803 + assert("edward-182", clust != NULL);
32804 + assert("edward-183", clust->pages != NULL);
32805 + assert("edward-184", clust->nr_pages <= cluster_nrpages(inode));
32806 +
32807 + if (clust->nr_pages == 0)
32808 + return 0;
32809 +
32810 + for (i = 0; i < clust->nr_pages; i++) {
32811 +
32812 + assert("edward-1044", clust->pages[i] == NULL);
32813 +
32814 + clust->pages[i] =
32815 + grab_cache_page(inode->i_mapping,
32816 + clust_to_pg(clust->index, inode) + i);
32817 + if (!clust->pages[i]) {
32818 + result = RETERR(-ENOMEM);
32819 + break;
32820 + }
32821 + if (i == 0) {
32822 + node = jnode_of_page(clust->pages[i]);
32823 + if (IS_ERR(node)) {
32824 + result = PTR_ERR(node);
32825 + unlock_page(clust->pages[i]);
32826 + break;
32827 + }
32828 + JF_SET(node, JNODE_CLUSTER_PAGE);
32829 + unlock_page(clust->pages[i]);
32830 + assert("edward-919", node);
32831 + continue;
32832 + }
32833 + unlock_page(clust->pages[i]);
32834 + }
32835 + if (result) {
32836 + while (i)
32837 + page_cache_release(clust->pages[--i]);
32838 + if (node && !IS_ERR(node))
32839 + jput(node);
32840 + return result;
32841 + }
32842 + assert("edward-920", jprivate(clust->pages[0]));
32843 +#if REISER4_DEBUG
32844 + cryptcompress_inode_data(inode)->pgcount += clust->nr_pages;
32845 +#endif
32846 + return 0;
32847 +}
32848 +
32849 +/* Collect unlocked cluster pages for read only (not for modification) */
32850 +static int grab_cluster_pages(struct inode *inode, reiser4_cluster_t * clust)
32851 +{
32852 + int i;
32853 + int result = 0;
32854 +
32855 + assert("edward-1428", inode != NULL);
32856 + assert("edward-1429", inode->i_mapping != NULL);
32857 + assert("edward-787", clust != NULL);
32858 + assert("edward-788", clust->pages != NULL);
32859 + assert("edward-789", clust->nr_pages != 0);
32860 + assert("edward-790", clust->nr_pages <= cluster_nrpages(inode));
32861 +
32862 + for (i = 0; i < clust->nr_pages; i++) {
32863 + clust->pages[i] =
32864 + grab_cache_page(inode->i_mapping,
32865 + clust_to_pg(clust->index, inode) + i);
32866 + if (!clust->pages[i]) {
32867 + result = RETERR(-ENOMEM);
32868 + break;
32869 + }
32870 + unlock_page(clust->pages[i]);
32871 + }
32872 + if (result)
32873 + while (i)
32874 + page_cache_release(clust->pages[--i]);
32875 + return result;
32876 +}
32877 +
32878 +/* @node might be attached by reiser4_writepage(), not by
32879 + cryptcompress plugin code, but emergency flush should
32880 + understand that pages of cryptcompress files are not
32881 + flushable.
32882 +*/
32883 +int jnode_of_cluster(const jnode * node, struct page * page)
32884 +{
32885 + assert("edward-1339", node != NULL);
32886 + assert("edward-1340", page != NULL);
32887 + assert("edward-1341", page->mapping != NULL);
32888 + assert("edward-1342", page->mapping->host != NULL);
32889 + assert("edward-1343",
32890 + ergo(jnode_is_unformatted(node),
32891 + get_inode_oid(page->mapping->host) ==
32892 + node->key.j.objectid));
32893 + if (inode_file_plugin(page->mapping->host) ==
32894 + file_plugin_by_id(CRC_FILE_PLUGIN_ID)) {
32895 +#if REISER4_DEBUG
32896 + if (!jnode_is_cluster_page(node))
32897 + warning("edward-1345",
32898 + "inode %llu: cluster page of index %lu became private",
32899 + (unsigned long long)get_inode_oid(page->mapping->host),
32900 + page->index);
32901 +#endif
32902 + return 1;
32903 + }
32904 + return 0;
32905 +}
32906 +
32907 +/* put cluster pages */
32908 +void release_cluster_pages(reiser4_cluster_t * clust)
32909 +{
32910 + int i;
32911 +
32912 + assert("edward-447", clust != NULL);
32913 + for (i = 0; i < clust->nr_pages; i++) {
32914 +
32915 + assert("edward-449", clust->pages[i] != NULL);
32916 +
32917 + page_cache_release(clust->pages[i]);
32918 + }
32919 +}
32920 +
32921 +/* this is called when something has failed */
32922 +static void release_cluster_pages_and_jnode(reiser4_cluster_t * clust)
32923 +{
32924 + jnode *node;
32925 +
32926 + assert("edward-445", clust != NULL);
32927 + assert("edward-922", clust->pages != NULL);
32928 + assert("edward-446", clust->pages[0] != NULL);
32929 +
32930 + node = jprivate(clust->pages[0]);
32931 +
32932 + assert("edward-447", node != NULL);
32933 +
32934 + release_cluster_pages(clust);
32935 + jput(node);
32936 +}
32937 +
32938 +#if REISER4_DEBUG
32939 +static int window_ok(reiser4_slide_t * win, struct inode *inode)
32940 +{
32941 + assert("edward-1115", win != NULL);
32942 + assert("edward-1116", ergo(win->delta, win->stat == HOLE_WINDOW));
32943 +
32944 + return (win->off != inode_cluster_size(inode)) &&
32945 + (win->off + win->count + win->delta <= inode_cluster_size(inode));
32946 +}
32947 +
32948 +static int cluster_ok(reiser4_cluster_t * clust, struct inode *inode)
32949 +{
32950 + assert("edward-279", clust != NULL);
32951 +
32952 + if (!clust->pages)
32953 + return 0;
32954 + return (clust->win ? window_ok(clust->win, inode) : 1);
32955 +}
32956 +#endif
32957 +
32958 +/* guess next window stat */
32959 +static inline window_stat next_window_stat(reiser4_slide_t * win)
32960 +{
32961 + assert("edward-1130", win != NULL);
32962 + return ((win->stat == HOLE_WINDOW && win->delta == 0) ?
32963 + HOLE_WINDOW : DATA_WINDOW);
32964 +}
32965 +
32966 +/* guess next cluster index and window params */
32967 +static void
32968 +update_cluster(struct inode *inode, reiser4_cluster_t * clust, loff_t file_off,
32969 + loff_t to_file)
32970 +{
32971 + reiser4_slide_t *win;
32972 +
32973 + assert("edward-185", clust != NULL);
32974 + assert("edward-438", clust->pages != NULL);
32975 + assert("edward-281", cluster_ok(clust, inode));
32976 +
32977 + win = clust->win;
32978 + if (!win)
32979 + return;
32980 +
32981 + switch (win->stat) {
32982 + case DATA_WINDOW:
32983 + /* increment window position */
32984 + clust->index++;
32985 + win->stat = DATA_WINDOW;
32986 + win->off = 0;
32987 + win->count = min_count(inode_cluster_size(inode), to_file);
32988 + break;
32989 + case HOLE_WINDOW:
32990 + switch (next_window_stat(win)) {
32991 + case HOLE_WINDOW:
32992 + /* set window to fit the offset we start write from */
32993 + clust->index = off_to_clust(file_off, inode);
32994 + win->stat = HOLE_WINDOW;
32995 + win->off = 0;
32996 + win->count = off_to_cloff(file_off, inode);
32997 + win->delta =
32998 + min_count(inode_cluster_size(inode) - win->count,
32999 + to_file);
33000 + break;
33001 + case DATA_WINDOW:
33002 + /* do not move the window, just change its state
33003 + (off + count + delta stays invariant) */
33004 + win->stat = DATA_WINDOW;
33005 + win->off = win->off + win->count;
33006 + win->count = win->delta;
33007 + win->delta = 0;
33008 + break;
33009 + default:
33010 + impossible("edward-282", "wrong next window state");
33011 + }
33012 + break;
33013 + default:
33014 + impossible("edward-283", "wrong current window state");
33015 + }
33016 + assert("edward-1068", cluster_ok(clust, inode));
33017 +}
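To make the window state machine in update_cluster() concrete, here is a
minimal standalone sketch (not part of the patch; the struct and all numeric
values are invented for illustration) of the HOLE_WINDOW -> DATA_WINDOW step,
showing that the slide preserves the off + count + delta invariant mentioned
in the comment above:

    #include <assert.h>

    /* Hypothetical stand-in for reiser4_slide_t, just for this sketch. */
    struct slide { unsigned off, count, delta; };

    int main(void)
    {
            unsigned cluster_size = 65536;          /* 64KiB logical cluster */
            struct slide win = { 0, 40960, 8192 };  /* hole, then user data  */
            unsigned inv = win.off + win.count + win.delta;

            /* next_window_stat() yields DATA_WINDOW because delta != 0,
               so the window is slid in place over the appended bytes,
               as in the HOLE_WINDOW/DATA_WINDOW branch above: */
            win.off += win.count;
            win.count = win.delta;
            win.delta = 0;

            assert(win.off + win.count + win.delta == inv); /* invariant kept */
            assert(inv <= cluster_size);                    /* cf. window_ok() */
            return 0;
    }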
33018 +
33019 +static int update_sd_cryptcompress(struct inode *inode)
33020 +{
33021 + int result = 0;
33022 +
33023 + assert("edward-978", schedulable());
33024 + assert("edward-1265", get_current_context()->grabbed_blocks == 0);
33025 +
33026 + result = reiser4_grab_space_force( /* one for stat data update */
33027 + estimate_update_common(inode),
33028 + BA_CAN_COMMIT);
33029 + assert("edward-979", !result);
33030 + if (result)
33031 + return result;
33032 + inode->i_ctime = inode->i_mtime = CURRENT_TIME;
33033 + result = reiser4_update_sd(inode);
33034 +
33035 + all_grabbed2free();
33036 + return result;
33037 +}
33038 +
33039 +
33040 +/* NOTE-Edward: this is too similar to reiser4/txnmgr.c:uncapture_jnode() */
33041 +static void uncapture_cluster_jnode(jnode * node)
33042 +{
33043 + txn_atom *atom;
33044 +
33045 + assert_spin_locked(&(node->guard));
33046 +
33047 + /*jnode_make_clean(node); */
33048 + atom = jnode_get_atom(node);
33049 + if (atom == NULL) {
33050 + assert("jmacd-7111", !JF_ISSET(node, JNODE_DIRTY));
33051 + spin_unlock_jnode(node);
33052 + return;
33053 + }
33054 +
33055 + uncapture_block(node);
33056 + spin_unlock_atom(atom);
33057 + jput(node);
33058 +}
33059 +
33060 +void forget_cluster_pages(struct page **pages, int nr)
33061 +{
33062 + int i;
33063 + for (i = 0; i < nr; i++) {
33064 +
33065 + assert("edward-1045", pages[i] != NULL);
33066 + page_cache_release(pages[i]);
33067 + }
33068 +}
33069 +
33070 +/* Check out the last modifications we are about to commit,
33071 + and prepare the input stream for transform operations.
33072 +*/
33073 +int
33074 +flush_cluster_pages(reiser4_cluster_t * clust, jnode * node,
33075 + struct inode *inode)
33076 +{
33077 + int result = 0;
33078 + int i;
33079 + int nr_pages = 0;
33080 + tfm_cluster_t *tc = &clust->tc;
33081 +
33082 + assert("edward-980", node != NULL);
33083 + assert("edward-236", inode != NULL);
33084 + assert("edward-237", clust != NULL);
33085 + assert("edward-240", !clust->win);
33086 + assert("edward-241", schedulable());
33087 + assert("edward-718", crc_inode_ok(inode));
33088 +
33089 + result = grab_tfm_stream(inode, tc, INPUT_STREAM);
33090 + if (result) {
33091 + warning("edward-1430",
33092 + "alloc stream failed with ret=%d", result);
33093 + return result;
33094 + }
33095 + spin_lock_jnode(node);
33096 + assert("edward-1435", JF_ISSET(node, JNODE_DIRTY));
33097 +
33098 + /* Determine the size of the logical cluster and
33099 + the number of cluster pages to commit. */
33100 + tc->len = tc->lsize = fsize_to_count(clust, inode);
33101 + clust->nr_pages = count_to_nrpages(tc->len);
33102 +
33103 + assert("edward-983", clust->nr_pages == node->page_count + 1);
33104 +#if REISER4_DEBUG
33105 + node->page_count = 0;
33106 +#endif
33107 + cluster_reserved2grabbed(estimate_update_cluster(inode));
33108 + uncapture_cluster_jnode(node);
33109 +
33110 + assert("edward-1224", schedulable());
33111 + /* Check out cluster pages to commit */
33112 + nr_pages =
33113 + find_get_pages(inode->i_mapping, clust_to_pg(clust->index, inode),
33114 + clust->nr_pages, clust->pages);
33115 +
33116 + assert("edward-1280", nr_pages == clust->nr_pages);
33117 + /* Construct input stream from the checked out pages */
33118 + for (i = 0; i < clust->nr_pages; i++) {
33119 + char *data;
33120 +
33121 + assert("edward-242", clust->pages[i] != NULL);
33122 + assert("edward-1436", clust->pages[i]->index ==
33123 + clust_to_pg(clust->index, inode) + i);
33124 + assert("edward-1437", PageUptodate(clust->pages[i]));
33125 + /* flush the page into the input stream */
33126 + lock_page(clust->pages[i]);
33127 + data = kmap(clust->pages[i]);
33128 +
33129 + assert("edward-986", cnt_to_pgcnt(tc->len, i) != 0);
33130 +
33131 + memcpy(tfm_stream_data(tc, INPUT_STREAM) + pg_to_off(i),
33132 + data, cnt_to_pgcnt(tc->len, i));
33133 + kunmap(clust->pages[i]);
33134 + unlock_page(clust->pages[i]);
33135 + }
33136 + clear_cluster_pages_dirty(clust);
33137 + release_cluster_pages(clust);
33138 +#if REISER4_DEBUG
33139 + cryptcompress_inode_data(inode)->pgcount -= clust->nr_pages;
33140 +#endif
33141 + /* put pages that were found here */
33142 + release_cluster_pages(clust);
33143 + return result;
33144 +}
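A rough, self-contained model of the page arithmetic used above, assuming
PAGE_CACHE_SIZE == 4096; the helper bodies below are guesses that mirror how
the memcpy loop consumes tc->len, not the patch's actual implementations:

    #include <assert.h>

    /* Round a byte count up to whole pages (assumed count_to_nrpages()). */
    static unsigned count_to_nrpages(unsigned count)
    {
            return (count + 4095) / 4096;
    }

    /* Bytes of a stream of length @len that land in page @i
       (assumed cnt_to_pgcnt()). */
    static unsigned cnt_to_pgcnt(unsigned len, unsigned i)
    {
            unsigned done = i * 4096;
            return (len - done >= 4096) ? 4096 : len - done;
    }

    int main(void)
    {
            unsigned len = 10000;                 /* e.g. tc->len */
            assert(count_to_nrpages(len) == 3);   /* clust->nr_pages */
            assert(cnt_to_pgcnt(len, 0) == 4096); /* full first page */
            assert(cnt_to_pgcnt(len, 2) == 1808); /* 10000 - 8192 tail */
            return 0;
    }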
33145 +
33146 +/* set hint for the cluster of index @index */
33147 +static void set_hint_cluster(struct inode *inode, hint_t * hint,
33148 + cloff_t index, znode_lock_mode mode)
33149 +{
33150 + reiser4_key key;
33151 + assert("edward-722", crc_inode_ok(inode));
33152 + assert("edward-723",
33153 + inode_file_plugin(inode) ==
33154 + file_plugin_by_id(CRC_FILE_PLUGIN_ID));
33155 +
33156 + inode_file_plugin(inode)->key_by_inode(inode,
33157 + clust_to_off(index, inode),
33158 + &key);
33159 +
33160 + seal_init(&hint->seal, &hint->ext_coord.coord, &key);
33161 + hint->offset = get_key_offset(&key);
33162 + hint->mode = mode;
33163 +}
33164 +
33165 +void invalidate_hint_cluster(reiser4_cluster_t * clust)
33166 +{
33167 + assert("edward-1291", clust != NULL);
33168 + assert("edward-1292", clust->hint != NULL);
33169 +
33170 + done_lh(&clust->hint->lh);
33171 + clust->hint->ext_coord.valid = 0;
33172 +}
33173 +
33174 +void put_hint_cluster(reiser4_cluster_t * clust, struct inode *inode,
33175 + znode_lock_mode mode)
33176 +{
33177 + assert("edward-1286", clust != NULL);
33178 + assert("edward-1287", clust->hint != NULL);
33179 +
33180 + set_hint_cluster(inode, clust->hint, clust->index + 1, mode);
33181 + invalidate_hint_cluster(clust);
33182 +}
33183 +
33184 +static int
33185 +balance_dirty_page_cluster(reiser4_cluster_t * clust, struct inode *inode,
33186 + loff_t off, loff_t to_file)
33187 +{
33188 + int result;
33189 +
33190 + assert("edward-724", inode != NULL);
33191 + assert("edward-725", crc_inode_ok(inode));
33192 + assert("edward-1272", get_current_context()->grabbed_blocks == 0);
33193 +
33194 + /* set next window params */
33195 + update_cluster(inode, clust, off, to_file);
33196 +
33197 + result = update_sd_cryptcompress(inode);
33198 + assert("edward-988", !result);
33199 + if (result)
33200 + return result;
33201 + assert("edward-726", clust->hint->lh.owner == NULL);
33202 +
33203 + reiser4_throttle_write(inode);
33204 + all_grabbed2free();
33205 + return 0;
33206 +}
33207 +
33208 +/* write zeroes to the cluster, update it, and maybe try to capture its pages */
33209 +static int
33210 +write_hole(struct inode *inode, reiser4_cluster_t * clust, loff_t file_off,
33211 + loff_t to_file)
33212 +{
33213 + char *data;
33214 + int result = 0;
33215 + unsigned cl_off, cl_count = 0;
33216 + unsigned to_pg, pg_off;
33217 + reiser4_slide_t *win;
33218 +
33219 + assert("edward-190", clust != NULL);
33220 + assert("edward-1069", clust->win != NULL);
33221 + assert("edward-191", inode != NULL);
33222 + assert("edward-727", crc_inode_ok(inode));
33223 + assert("edward-1171", clust->dstat != INVAL_DISK_CLUSTER);
33224 + assert("edward-1154",
33225 + ergo(clust->dstat != FAKE_DISK_CLUSTER, clust->reserved == 1));
33226 +
33227 + win = clust->win;
33228 +
33229 + assert("edward-1070", win != NULL);
33230 + assert("edward-201", win->stat == HOLE_WINDOW);
33231 + assert("edward-192", cluster_ok(clust, inode));
33232 +
33233 + if (win->off == 0 && win->count == inode_cluster_size(inode)) {
33234 + /* the hole will be represented by fake disk cluster */
33235 + update_cluster(inode, clust, file_off, to_file);
33236 + return 0;
33237 + }
33238 + cl_count = win->count; /* number of zeroes to write */
33239 + cl_off = win->off;
33240 + pg_off = off_to_pgoff(win->off);
33241 +
33242 + while (cl_count) {
33243 + struct page *page;
33244 + page = clust->pages[off_to_pg(cl_off)];
33245 +
33246 + assert("edward-284", page != NULL);
33247 +
33248 + to_pg = min_count(PAGE_CACHE_SIZE - pg_off, cl_count);
33249 + lock_page(page);
33250 + data = kmap_atomic(page, KM_USER0);
33251 + memset(data + pg_off, 0, to_pg);
33252 + flush_dcache_page(page);
33253 + kunmap_atomic(data, KM_USER0);
33254 + SetPageUptodate(page);
33255 + unlock_page(page);
33256 +
33257 + cl_off += to_pg;
33258 + cl_count -= to_pg;
33259 + pg_off = 0;
33260 + }
33261 + if (!win->delta) {
33262 + /* only zeroes, try to capture */
33263 +
33264 + set_cluster_pages_dirty(clust);
33265 + result = try_capture_cluster(clust, inode);
33266 + if (result)
33267 + return result;
33268 + put_hint_cluster(clust, inode, ZNODE_WRITE_LOCK);
33269 + result =
33270 + balance_dirty_page_cluster(clust, inode, file_off, to_file);
33271 + } else
33272 + update_cluster(inode, clust, file_off, to_file);
33273 + return result;
33274 +}
33275 +
33276 +/*
33277 + The main disk search procedure for cryptcompress plugins, which
33278 + . scans all items of the disk cluster;
33279 + . maybe reads each one (if @read != 0);
33280 + . maybe makes its znode dirty (if @write != 0).
33281 +
33282 + NOTE-EDWARD: Callers should handle the case when disk cluster
33283 + is incomplete (-EIO)
33284 +*/
33285 +int
33286 +find_cluster(reiser4_cluster_t * clust,
33287 + struct inode *inode, int read, int write)
33288 +{
33289 + flow_t f;
33290 + hint_t *hint;
33291 + int result = 0;
33292 + unsigned long cl_idx;
33293 + ra_info_t ra_info;
33294 + file_plugin *fplug;
33295 + item_plugin *iplug;
33296 + tfm_cluster_t *tc;
33297 + int was_grabbed;
33298 +
33299 + assert("edward-138", clust != NULL);
33300 + assert("edward-728", clust->hint != NULL);
33301 + assert("edward-225", read || write);
33302 + assert("edward-226", schedulable());
33303 + assert("edward-137", inode != NULL);
33304 + assert("edward-729", crc_inode_ok(inode));
33305 +
33306 + hint = clust->hint;
33307 + cl_idx = clust->index;
33308 + fplug = inode_file_plugin(inode);
33309 + was_grabbed = get_current_context()->grabbed_blocks;
33310 + tc = &clust->tc;
33311 +
33312 + assert("edward-462", !tfm_cluster_is_uptodate(tc));
33313 + assert("edward-461", ergo(read, tfm_stream_is_set(tc, INPUT_STREAM)));
33314 +
33315 + /* set key of the first disk cluster item */
33316 + fplug->flow_by_inode(inode,
33317 + (read ? (char __user *)tfm_stream_data(tc, INPUT_STREAM) : NULL),
33318 + 0 /* kernel space */ ,
33319 + inode_scaled_cluster_size(inode),
33320 + clust_to_off(cl_idx, inode), READ_OP, &f);
33321 + if (write) {
33322 + /* reserve for flush to make dirty all the leaf nodes
33323 + which contain disk cluster */
33324 + result =
33325 + reiser4_grab_space_force(estimate_dirty_cluster(inode),
33326 + BA_CAN_COMMIT);
33327 + assert("edward-990", !result);
33328 + if (result)
33329 + goto out;
33330 + }
33331 +
33332 + ra_info.key_to_stop = f.key;
33333 + set_key_offset(&ra_info.key_to_stop, get_key_offset(max_key()));
33334 +
33335 + while (f.length) {
33336 + result = find_cluster_item(hint,
33337 + &f.key,
33338 + (write ? ZNODE_WRITE_LOCK :
33339 + ZNODE_READ_LOCK), NULL, FIND_EXACT,
33340 + (write ? CBK_FOR_INSERT : 0));
33341 + switch (result) {
33342 + case CBK_COORD_NOTFOUND:
33343 + result = 0;
33344 + if (inode_scaled_offset
33345 + (inode,
33346 + clust_to_off(cl_idx,
33347 + inode)) == get_key_offset(&f.key)) {
33348 + /* the first item was not found; this is
33349 + treated as the disk cluster being absent */
33350 + clust->dstat = FAKE_DISK_CLUSTER;
33351 + goto out;
33352 + }
33353 + /* we are outside the cluster, stop search here */
33354 + assert("edward-146",
33355 + f.length != inode_scaled_cluster_size(inode));
33356 + goto ok;
33357 + case CBK_COORD_FOUND:
33358 + assert("edward-148",
33359 + hint->ext_coord.coord.between == AT_UNIT);
33360 + assert("edward-460",
33361 + hint->ext_coord.coord.unit_pos == 0);
33362 +
33363 + coord_clear_iplug(&hint->ext_coord.coord);
33364 + result = zload_ra(hint->ext_coord.coord.node, &ra_info);
33365 + if (unlikely(result))
33366 + goto out;
33367 + iplug = item_plugin_by_coord(&hint->ext_coord.coord);
33368 + assert("edward-147",
33369 + item_id_by_coord(&hint->ext_coord.coord) ==
33370 + CTAIL_ID);
33371 +
33372 + result = iplug->s.file.read(NULL, &f, hint);
33373 + if (result) {
33374 + zrelse(hint->ext_coord.coord.node);
33375 + goto out;
33376 + }
33377 + if (write) {
33378 + znode_make_dirty(hint->ext_coord.coord.node);
33379 + znode_set_convertible(hint->ext_coord.coord.
33380 + node);
33381 + }
33382 + zrelse(hint->ext_coord.coord.node);
33383 + break;
33384 + default:
33385 + goto out;
33386 + }
33387 + }
33388 + ok:
33389 + /* at least one item was found */
33390 + /* NOTE-EDWARD: Callers should handle the case
33391 + when disk cluster is incomplete (-EIO) */
33392 + tc->len = inode_scaled_cluster_size(inode) - f.length;
33393 + tc->lsize = fsize_to_count(clust, inode);
33394 + assert("edward-1196", tc->len > 0);
33395 + assert("edward-1406", tc->lsize > 0);
33396 +
33397 + if (hint_is_unprepped_dclust(clust->hint))
33398 + clust->dstat = UNPR_DISK_CLUSTER;
33399 + else
33400 + clust->dstat = PREP_DISK_CLUSTER;
33401 + out:
33402 + assert("edward-1339",
33403 + get_current_context()->grabbed_blocks >= was_grabbed);
33404 + grabbed2free(get_current_context(),
33405 + get_current_super_private(),
33406 + get_current_context()->grabbed_blocks - was_grabbed);
33407 + return result;
33408 +}
33409 +
33410 +int
33411 +get_disk_cluster_locked(reiser4_cluster_t * clust, struct inode *inode,
33412 + znode_lock_mode lock_mode)
33413 +{
33414 + reiser4_key key;
33415 + ra_info_t ra_info;
33416 +
33417 + assert("edward-730", schedulable());
33418 + assert("edward-731", clust != NULL);
33419 + assert("edward-732", inode != NULL);
33420 +
33421 + if (clust->hint->ext_coord.valid) {
33422 + assert("edward-1293", clust->dstat != INVAL_DISK_CLUSTER);
33423 + assert("edward-1294",
33424 + znode_is_write_locked(clust->hint->lh.node));
33425 + /* already have a valid locked position */
33426 + return (clust->dstat ==
33427 + FAKE_DISK_CLUSTER ? CBK_COORD_NOTFOUND :
33428 + CBK_COORD_FOUND);
33429 + }
33430 + key_by_inode_cryptcompress(inode, clust_to_off(clust->index, inode),
33431 + &key);
33432 + ra_info.key_to_stop = key;
33433 + set_key_offset(&ra_info.key_to_stop, get_key_offset(max_key()));
33434 +
33435 + return find_cluster_item(clust->hint, &key, lock_mode, NULL, FIND_EXACT,
33436 + CBK_FOR_INSERT);
33437 +}
33438 +
33439 +/* Read the needed cluster pages before modifying.
33440 + On success, @clust->hint contains a locked position in the tree.
33441 + Also:
33442 + . find and set the disk cluster state;
33443 + . make the disk cluster dirty if its state is not FAKE_DISK_CLUSTER.
33444 +*/
33445 +static int
33446 +read_some_cluster_pages(struct inode *inode, reiser4_cluster_t * clust)
33447 +{
33448 + int i;
33449 + int result = 0;
33450 + item_plugin *iplug;
33451 + reiser4_slide_t *win = clust->win;
33452 +
33453 + iplug = item_plugin_by_id(CTAIL_ID);
33454 +
33455 + assert("edward-733", get_current_context()->grabbed_blocks == 0);
33456 + assert("edward-924", !tfm_cluster_is_uptodate(&clust->tc));
33457 +
33458 +#if REISER4_DEBUG
33459 + if (clust->nr_pages == 0) {
33460 + /* start write hole from fake disk cluster */
33461 + assert("edward-1117", win != NULL);
33462 + assert("edward-1118", win->stat == HOLE_WINDOW);
33463 + assert("edward-1119", new_cluster(clust, inode));
33464 + }
33465 +#endif
33466 + if (new_cluster(clust, inode)) {
33467 + /*
33468 + a new page cluster is about to be written, nothing to read
33469 + */
33470 + assert("edward-734", schedulable());
33471 + assert("edward-735", clust->hint->lh.owner == NULL);
33472 +
33473 + if (clust->nr_pages) {
33474 + int off;
33475 + char *data;
33476 + struct page * pg;
33477 + assert("edward-1419", clust->pages != NULL);
33478 + pg = clust->pages[clust->nr_pages - 1];
33479 + assert("edward-1420", pg != NULL);
33480 + off = off_to_pgoff(win->off+win->count+win->delta);
33481 + if (off) {
33482 + lock_page(pg);
33483 + data = kmap_atomic(pg, KM_USER0);
33484 + memset(data + off, 0, PAGE_CACHE_SIZE - off);
33485 + flush_dcache_page(pg);
33486 + kunmap_atomic(data, KM_USER0);
33487 + unlock_page(pg);
33488 + }
33489 + }
33490 + clust->dstat = FAKE_DISK_CLUSTER;
33491 + return 0;
33492 + }
33493 + /*
33494 + Here we should search for the disk cluster to figure out its real state.
33495 + There is also one more important reason to do the disk search: we need
33496 + to make the disk cluster _dirty_ if it exists
33497 + */
33498 +
33499 + /* if a window is specified, read only the pages
33500 + that will be partially modified */
33501 +
33502 + for (i = 0; i < clust->nr_pages; i++) {
33503 + struct page *pg = clust->pages[i];
33504 +
33505 + lock_page(pg);
33506 + if (PageUptodate(pg)) {
33507 + unlock_page(pg);
33508 + continue;
33509 + }
33510 + unlock_page(pg);
33511 +
33512 + if (win &&
33513 + i >= count_to_nrpages(win->off) &&
33514 + i < off_to_pg(win->off + win->count + win->delta))
33515 + /* page will be completely overwritten */
33516 + continue;
33517 +
33518 + if (win && (i == clust->nr_pages - 1) &&
33519 + /* the last page is
33520 + partially modified,
33521 + not uptodate .. */
33522 + (count_to_nrpages(inode->i_size) <= pg->index)) {
33523 + /* .. and appended,
33524 + so set zeroes to the rest */
33525 + char *data;
33526 + int offset;
33527 + lock_page(pg);
33528 + data = kmap_atomic(pg, KM_USER0);
33529 +
33530 + assert("edward-1260",
33531 + count_to_nrpages(win->off + win->count +
33532 + win->delta) - 1 == i);
33533 +
33534 + offset =
33535 + off_to_pgoff(win->off + win->count + win->delta);
33536 + memset(data + offset, 0, PAGE_CACHE_SIZE - offset);
33537 + flush_dcache_page(pg);
33538 + kunmap_atomic(data, KM_USER0);
33539 + unlock_page(pg);
33540 + /* still not uptodate */
33541 + break;
33542 + }
33543 + if (!tfm_cluster_is_uptodate(&clust->tc)) {
33544 + result = ctail_read_disk_cluster(clust, inode, 1);
33545 + assert("edward-992", !result);
33546 + if (result)
33547 + goto out;
33548 + assert("edward-925",
33549 + tfm_cluster_is_uptodate(&clust->tc));
33550 + }
33551 + lock_page(pg);
33552 + result = do_readpage_ctail(inode, clust, pg);
33553 + unlock_page(pg);
33554 + assert("edward-993", !result);
33555 + if (result) {
33556 + impossible("edward-219",
33557 + "do_readpage_ctail returned crap");
33558 + goto out;
33559 + }
33560 + }
33561 + if (!tfm_cluster_is_uptodate(&clust->tc)) {
33562 + /* the disk cluster is unclaimed, but we need to make its znodes
33563 + dirty so that flush update will convert its content */
33564 + result =
33565 + find_cluster(clust, inode, 0 /* do not read */ ,
33566 + 1 /* write */ );
33567 + assert("edward-994", !result);
33568 + }
33569 + out:
33570 + tfm_cluster_clr_uptodate(&clust->tc);
33571 + return result;
33572 +}
33573 +
33574 +static int
33575 +should_create_unprepped_cluster(reiser4_cluster_t * clust, struct inode *inode)
33576 +{
33577 + assert("edward-737", clust != NULL);
33578 +
33579 + switch (clust->dstat) {
33580 + case PREP_DISK_CLUSTER:
33581 + case UNPR_DISK_CLUSTER:
33582 + return 0;
33583 + case FAKE_DISK_CLUSTER:
33584 + if (clust->win &&
33585 + clust->win->stat == HOLE_WINDOW && clust->nr_pages == 0) {
33586 + assert("edward-1172", new_cluster(clust, inode));
33587 + return 0;
33588 + }
33589 + return 1;
33590 + default:
33591 + impossible("edward-1173", "bad disk cluster state");
33592 + return 0;
33593 + }
33594 +}
33595 +
33596 +static int
33597 +crc_make_unprepped_cluster(reiser4_cluster_t * clust, struct inode *inode)
33598 +{
33599 + int result;
33600 +
33601 + assert("edward-1123", schedulable());
33602 + assert("edward-737", clust != NULL);
33603 + assert("edward-738", inode != NULL);
33604 + assert("edward-739", crc_inode_ok(inode));
33605 + assert("edward-1053", clust->hint != NULL);
33606 + assert("edward-1266", get_current_context()->grabbed_blocks == 0);
33607 +
33608 + if (clust->reserved) {
33609 + cluster_reserved2grabbed(estimate_insert_cluster(inode));
33610 +#if REISER4_DEBUG
33611 + assert("edward-1267",
33612 + clust->reserved_unprepped ==
33613 + estimate_insert_cluster(inode));
33614 + clust->reserved_unprepped -= estimate_insert_cluster(inode);
33615 +#endif
33616 + }
33617 + if (!should_create_unprepped_cluster(clust, inode)) {
33618 + all_grabbed2free();
33619 + return 0;
33620 + } else {
33621 + assert("edward-1268", clust->reserved == 1);
33622 + }
33623 + result = ctail_insert_unprepped_cluster(clust, inode);
33624 + all_grabbed2free();
33625 + if (result)
33626 + return result;
33627 +
33628 + assert("edward-743", crc_inode_ok(inode));
33629 + assert("edward-1269", get_current_context()->grabbed_blocks == 0);
33630 + assert("edward-744", znode_is_write_locked(clust->hint->lh.node));
33631 +
33632 + clust->dstat = UNPR_DISK_CLUSTER;
33633 + return 0;
33634 +}
33635 +
33636 +#if REISER4_DEBUG
33637 +static int jnode_truncate_ok(struct inode *inode, cloff_t index)
33638 +{
33639 + jnode *node;
33640 + node =
33641 + jlookup(current_tree, get_inode_oid(inode),
33642 + clust_to_pg(index, inode));
33643 + if (likely(!node))
33644 + return 1;
33645 + /* someone got this jnode */
33646 + warning("edward-1315", "jnode %p is untruncated\n", node);
33647 + jput(node);
33648 + return (atomic_read(&node->x_count));
33649 +}
33650 +#endif
33651 +
33652 +/* Collect unlocked cluster pages and a jnode (the latter only
33653 + when the page cluster will be modified and captured) */
33654 +int
33655 +prepare_page_cluster(struct inode *inode, reiser4_cluster_t * clust,
33656 + int capture)
33657 +{
33658 + assert("edward-177", inode != NULL);
33659 + assert("edward-741", crc_inode_ok(inode));
33660 + assert("edward-740", clust->pages != NULL);
33661 +
33662 + set_cluster_nrpages(clust, inode);
33663 + reset_cluster_pgset(clust, cluster_nrpages(inode));
33664 + return (capture ?
33665 + grab_cluster_pages_jnode(inode, clust) :
33666 + grab_cluster_pages(inode, clust));
33667 +}
33668 +
33669 +/* Truncate all pages of the cluster of index @index.
33670 + This is called by ->kill_hook() method of item plugin */
33671 +void truncate_page_cluster(struct inode *inode, cloff_t index)
33672 +{
33673 + int i;
33674 + int found = 0;
33675 + int nr_pages;
33676 + jnode *node;
33677 + struct page *pages[MAX_CLUSTER_NRPAGES];
33678 +
33679 + node =
33680 + jlookup(current_tree, get_inode_oid(inode),
33681 + clust_to_pg(index, inode));
33682 + /* the jnode is absent; just drop the pages, which cannot
33683 + acquire a jnode while we hold exclusive access */
33684 + if (!node) {
33685 + truncate_inode_pages_range(inode->i_mapping,
33686 + clust_to_off(index, inode),
33687 + clust_to_off(index,
33688 + inode) +
33689 + inode_cluster_size(inode) - 1);
33690 + return;
33691 + }
33692 + /* jnode is present and may be dirty */
33693 + nr_pages = count_to_nrpages(cnt_to_clcnt(inode->i_size, index, inode));
33694 +
33695 + found = find_get_pages(inode->i_mapping, clust_to_pg(index, inode),
33696 + nr_pages, pages);
33697 + spin_lock_jnode(node);
33698 + if (JF_ISSET(node, JNODE_DIRTY)) {
33699 + /* someone has done modifications which are not
33700 + yet committed, so we need to release some resources */
33701 +
33702 + /* free disk space grabbed for disk cluster converting */
33703 + cluster_reserved2grabbed(estimate_update_cluster(inode));
33704 + grabbed2free(get_current_context(),
33705 + get_current_super_private(),
33706 + estimate_update_cluster(inode));
33707 +
33708 + assert("edward-1198", found == nr_pages);
33709 + assert("edward-1199", node->page_count + 1 == nr_pages);
33710 +#if REISER4_DEBUG
33711 + node->page_count = 0;
33712 +#endif
33713 + /* This will clear dirty bit */
33714 + uncapture_cluster_jnode(node);
33715 +
33716 + /* put pages grabbed for last uncommitted modifications */
33717 + for (i = 0; i < nr_pages; i++) {
33718 + assert("edward-1200", PageUptodate(pages[i]));
33719 + page_cache_release(pages[i]);
33720 +#if REISER4_DEBUG
33721 + cryptcompress_inode_data(inode)->pgcount --;
33722 +#endif
33723 + }
33724 + } else
33725 + spin_unlock_jnode(node);
33726 + /* FIXME-EDWARD: Use truncate_complete_page in the loop above instead */
33727 +
33728 + jput(node);
33729 + /* put pages found here */
33730 + forget_cluster_pages(pages, found);
33731 + truncate_inode_pages_range(inode->i_mapping,
33732 + clust_to_off(index, inode),
33733 + clust_to_off(index,
33734 + inode) +
33735 + inode_cluster_size(inode) - 1);
33736 + assert("edward-1201", jnode_truncate_ok(inode, index));
33737 + return;
33738 +}
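One plausible reading of the nr_pages computation above, as a standalone
sketch: cnt_to_clcnt(i_size, index, inode) is taken to mean "bytes of the
file that fall into cluster @index" (inferred from usage here, not confirmed
elsewhere in the patch), with 4KiB pages and a 64KiB cluster assumed:

    #include <assert.h>

    int main(void)
    {
            unsigned long long i_size = 200000, csize = 65536, index = 3;
            unsigned long long start = index * csize;
            unsigned long long in_cluster = (i_size > start) ? i_size - start : 0;

            if (in_cluster > csize)
                    in_cluster = csize;
            /* count_to_nrpages(): round the byte count up to pages */
            unsigned nr_pages = (unsigned)((in_cluster + 4095) / 4096);

            assert(in_cluster == 3392); /* 200000 - 3 * 65536 */
            assert(nr_pages == 1);      /* one partially filled page */
            return 0;
    }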
33739 +
33740 +/* Prepare a cluster handle before (or after) modifications
33741 + which are supposed to be committed:
33742 +
33743 + . grab cluster pages;
33744 + . reserve disk space;
33745 + . maybe read pages from disk and set the disk cluster dirty;
33746 + . maybe write a hole;
33747 + . maybe create an 'unprepped' disk cluster if the last one is fake
33748 + (i.e. is not represented by any items)
33749 +*/
33750 +
33751 +static int
33752 +prepare_cluster(struct inode *inode,
33753 + loff_t file_off /* write position in the file */ ,
33754 + loff_t to_file, /* bytes of users data to write to the file */
33755 + reiser4_cluster_t * clust, page_cluster_op op)
33756 +{
33757 + int result = 0;
33758 + reiser4_slide_t *win = clust->win;
33759 +
33760 + assert("edward-1273", get_current_context()->grabbed_blocks == 0);
33761 + reset_cluster_params(clust);
33762 +#if REISER4_DEBUG
33763 + clust->ctx = get_current_context();
33764 +#endif
33765 + assert("edward-1190", op != PCL_UNKNOWN);
33766 +
33767 + clust->op = op;
33768 +
33769 + result = prepare_page_cluster(inode, clust, 1);
33770 + if (result)
33771 + return result;
33772 + result = reserve4cluster(inode, clust);
33773 + if (result)
33774 + goto err1;
33775 + result = read_some_cluster_pages(inode, clust);
33776 + if (result) {
33777 + free_reserved4cluster(inode,
33778 + clust,
33779 + estimate_update_cluster(inode) +
33780 + estimate_insert_cluster(inode));
33781 + goto err1;
33782 + }
33783 + assert("edward-1124", clust->dstat != INVAL_DISK_CLUSTER);
33784 +
33785 + result = crc_make_unprepped_cluster(clust, inode);
33786 + if (result)
33787 + goto err2;
33788 + if (win && win->stat == HOLE_WINDOW) {
33789 + result = write_hole(inode, clust, file_off, to_file);
33790 + if (result)
33791 + goto err2;
33792 + }
33793 + return 0;
33794 + err2:
33795 + free_reserved4cluster(inode, clust,
33796 + estimate_update_cluster(inode));
33797 + err1:
33798 + release_cluster_pages_and_jnode(clust);
33799 + assert("edward-1125", result == -ENOSPC);
33800 + return result;
33801 +}
33802 +
33803 +/* set window by two offsets */
33804 +static void
33805 +set_window(reiser4_cluster_t * clust, reiser4_slide_t * win,
33806 + struct inode *inode, loff_t o1, loff_t o2)
33807 +{
33808 + assert("edward-295", clust != NULL);
33809 + assert("edward-296", inode != NULL);
33810 + assert("edward-1071", win != NULL);
33811 + assert("edward-297", o1 <= o2);
33812 +
33813 + clust->index = off_to_clust(o1, inode);
33814 +
33815 + win->off = off_to_cloff(o1, inode);
33816 + win->count = min_count(inode_cluster_size(inode) - win->off, o2 - o1);
33817 + win->delta = 0;
33818 +
33819 + clust->win = win;
33820 +}
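A worked example of the set_window() arithmetic, assuming a 64KiB
power-of-two cluster size so that off_to_clust()/off_to_cloff() reduce to
plain division and remainder; the byte offsets are invented:

    #include <stdio.h>

    int main(void)
    {
            unsigned long long o1 = 200000, o2 = 230000; /* write [o1, o2) */
            unsigned long long csize = 65536;

            unsigned long long index = o1 / csize;  /* off_to_clust() */
            unsigned long long off = o1 % csize;    /* off_to_cloff() */
            unsigned long long rest = o2 - o1;
            unsigned long long count =
                    (csize - off < rest) ? csize - off : rest; /* min_count() */

            /* prints: cluster 3, window off=3392 count=30000 */
            printf("cluster %llu, window off=%llu count=%llu\n",
                   index, off, count);
            return 0;
    }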
33821 +
33822 +static int
33823 +set_cluster_by_window(struct inode *inode, reiser4_cluster_t * clust,
33824 + reiser4_slide_t * win, flow_t * f, loff_t file_off)
33825 +{
33826 + int result;
33827 +
33828 + assert("edward-197", clust != NULL);
33829 + assert("edward-1072", win != NULL);
33830 + assert("edward-198", inode != NULL);
33831 +
33832 + result = alloc_cluster_pgset(clust, cluster_nrpages(inode));
33833 + if (result)
33834 + return result;
33835 +
33836 + if (file_off > inode->i_size) {
33837 + /* Uhmm, hole in cryptcompress file... */
33838 + loff_t hole_size;
33839 + hole_size = file_off - inode->i_size;
33840 +
33841 + set_window(clust, win, inode, inode->i_size, file_off);
33842 + win->stat = HOLE_WINDOW;
33843 + if (win->off + hole_size < inode_cluster_size(inode))
33844 + /* there is also user's data to append to the hole */
33845 + win->delta =
33846 + min_count(inode_cluster_size(inode) -
33847 + (win->off + win->count), f->length);
33848 + return 0;
33849 + }
33850 + set_window(clust, win, inode, file_off, file_off + f->length);
33851 + win->stat = DATA_WINDOW;
33852 + return 0;
33853 +}
33854 +
33855 +int set_cluster_by_page(reiser4_cluster_t * clust, struct page * page,
33856 + int count)
33857 +{
33858 + int result = 0;
33859 + int (*setting_actor)(reiser4_cluster_t * clust, int count);
33860 +
33861 + assert("edward-1358", clust != NULL);
33862 + assert("edward-1359", page != NULL);
33863 + assert("edward-1360", page->mapping != NULL);
33864 + assert("edward-1361", page->mapping->host != NULL);
33865 +
33866 + setting_actor = (clust->pages ? reset_cluster_pgset : alloc_cluster_pgset);
33867 + result = setting_actor(clust, count);
33868 + clust->index = pg_to_clust(page->index, page->mapping->host);
33869 + return result;
33870 +}
33871 +
33872 +/* reset all the params that do not get updated */
33873 +void reset_cluster_params(reiser4_cluster_t * clust)
33874 +{
33875 + assert("edward-197", clust != NULL);
33876 +
33877 + clust->dstat = INVAL_DISK_CLUSTER;
33878 + clust->tc.uptodate = 0;
33879 + clust->tc.len = 0;
33880 +}
33881 +
33882 +/* Core write procedure of the cryptcompress plugin, which slices the
33883 + user's flow into logical clusters, maps those to the appropriate
33884 + page clusters, and tries to capture them.
33885 + If @buf != NULL, returns the number of successfully written bytes,
33886 + otherwise returns an error
33887 +*/
33888 +static loff_t
33889 +write_cryptcompress_flow(struct file *file, struct inode *inode,
33890 + const char __user *buf, size_t count, loff_t pos)
33891 +{
33892 + int i;
33893 + flow_t f;
33894 + hint_t *hint;
33895 + int result = 0;
33896 + size_t to_write = 0;
33897 + loff_t file_off;
33898 + reiser4_slide_t win;
33899 + reiser4_cluster_t clust;
33900 +
33901 + assert("edward-161", schedulable());
33902 + assert("edward-748", crc_inode_ok(inode));
33903 + assert("edward-159", current_blocksize == PAGE_CACHE_SIZE);
33904 + assert("edward-1274", get_current_context()->grabbed_blocks == 0);
33905 +
33906 + result = check_cryptcompress(inode);
33907 + if (result)
33908 + return result;
33909 + hint = kmalloc(sizeof(*hint), GFP_KERNEL);
33910 + if (hint == NULL)
33911 + return RETERR(-ENOMEM);
33912 +
33913 + result = load_file_hint(file, hint);
33914 + if (result) {
33915 + kfree(hint);
33916 + return result;
33917 + }
33918 +
33919 + result =
33920 + flow_by_inode_cryptcompress(inode, buf, 1 /* user space */ ,
33921 + count, pos, WRITE_OP, &f);
33922 + if (result)
33923 + goto out;
33924 + to_write = f.length;
33925 +
33926 + /* current write position in file */
33927 + file_off = pos;
33928 + reiser4_slide_init(&win);
33929 + cluster_init_read(&clust, &win);
33930 + clust.hint = hint;
33931 +
33932 + result = set_cluster_by_window(inode, &clust, &win, &f, file_off);
33933 + if (result)
33934 + goto out;
33935 +
33936 + if (next_window_stat(&win) == HOLE_WINDOW) {
33937 + result =
33938 + prepare_cluster(inode, file_off, f.length, &clust,
33939 + PCL_APPEND);
33940 + if (result)
33941 + goto out;
33942 + }
33943 + do {
33944 + char *src;
33945 + unsigned page_off, page_count;
33946 +
33947 + assert("edward-750", schedulable());
33948 +
33949 + result =
33950 + prepare_cluster(inode, file_off, f.length, &clust,
33951 + PCL_APPEND);
33952 + if (result)
33953 + goto out;
33954 +
33955 + assert("edward-751", crc_inode_ok(inode));
33956 + assert("edward-204", win.stat == DATA_WINDOW);
33957 + assert("edward-1288", clust.hint->ext_coord.valid);
33958 + assert("edward-752",
33959 + znode_is_write_locked(hint->ext_coord.coord.node));
33960 +
33961 + put_hint_cluster(&clust, inode, ZNODE_WRITE_LOCK);
33962 +
33963 + /* set write position in page */
33964 + page_off = off_to_pgoff(win.off);
33965 +
33966 + /* copy user's data to cluster pages */
33967 + for (i = off_to_pg(win.off), src = f.data;
33968 + i < count_to_nrpages(win.off + win.count);
33969 + i++, src += page_count) {
33970 + page_count =
33971 + cnt_to_pgcnt(win.off + win.count, i) - page_off;
33972 +
33973 + assert("edward-1039",
33974 + page_off + page_count <= PAGE_CACHE_SIZE);
33975 + assert("edward-287", clust.pages[i] != NULL);
33976 +
33977 + lock_page(clust.pages[i]);
33978 + result =
33979 + __copy_from_user((char *)kmap(clust.pages[i]) +
33980 + page_off, (char __user *)src, page_count);
33981 + kunmap(clust.pages[i]);
33982 + if (unlikely(result)) {
33983 + unlock_page(clust.pages[i]);
33984 + result = -EFAULT;
33985 + goto err2;
33986 + }
33987 + SetPageUptodate(clust.pages[i]);
33988 + unlock_page(clust.pages[i]);
33989 + page_off = 0;
33990 + }
33991 + assert("edward-753", crc_inode_ok(inode));
33992 +
33993 + set_cluster_pages_dirty(&clust);
33994 +
33995 + result = try_capture_cluster(&clust, inode);
33996 + if (result)
33997 + goto err2;
33998 +
33999 + assert("edward-998", f.user == 1);
34000 +
34001 + move_flow_forward(&f, win.count);
34002 +
34003 + /* the disk cluster may already be clean at this point */
34004 +
34005 + /* . update cluster
34006 + . set hint for new offset
34007 + . unlock znode
34008 + . update inode
34009 + . balance dirty pages
34010 + */
34011 + result = balance_dirty_page_cluster(&clust, inode, 0, f.length);
34012 + if (result)
34013 + goto err1;
34014 + assert("edward-755", hint->lh.owner == NULL);
34015 + reset_cluster_params(&clust);
34016 + continue;
34017 + err2:
34018 + release_cluster_pages_and_jnode(&clust);
34019 + err1:
34020 + if (clust.reserved)
34021 + free_reserved4cluster(inode,
34022 + &clust,
34023 + estimate_update_cluster(inode));
34024 + break;
34025 + } while (f.length);
34026 + out:
34027 + done_lh(&hint->lh);
34028 + if (result == -EEXIST)
34029 + warning("edward-1407", "write returns EEXIST!\n");
34030 +
34031 + put_cluster_handle(&clust);
34032 + save_file_hint(file, hint);
34033 + kfree(hint);
34034 + if (buf) {
34035 + /* if nothing was written, there must be an error */
34036 + assert("edward-195", ergo((to_write == f.length), result < 0));
34037 + return (to_write - f.length) ? (to_write - f.length) : result;
34038 + }
34039 + return result;
34040 +}
34041 +
34042 +static ssize_t write_crc_file(struct file *file, /* file to write to */
34043 + struct inode *inode, /* inode */
34044 + const char __user *buf, /* address of user-space buffer */
34045 + size_t count, /* number of bytes to write */
34046 + loff_t * off /* position in file to write to */ )
34047 +{
34048 +
34049 + int result;
34050 + loff_t pos;
34051 + ssize_t written;
34052 + cryptcompress_info_t *info = cryptcompress_inode_data(inode);
34053 +
34054 + assert("edward-196", crc_inode_ok(inode));
34055 +
34056 + result = generic_write_checks(file, off, &count, 0);
34057 + if (unlikely(result != 0))
34058 + return result;
34059 +
34060 + if (unlikely(count == 0))
34061 + return 0;
34062 +
34063 + down_write(&info->lock);
34064 + LOCK_CNT_INC(inode_sem_w);
34065 +
34066 + pos = *off;
34067 + written =
34068 + write_cryptcompress_flow(file, inode, buf, count, pos);
34069 +
34070 + up_write(&info->lock);
34071 + LOCK_CNT_DEC(inode_sem_w);
34072 +
34073 + if (written < 0) {
34074 + if (written == -EEXIST)
34075 + printk("write_crc_file returns EEXIST!\n");
34076 + return written;
34077 + }
34078 + /* update position in a file */
34079 + *off = pos + written;
34080 + /* return number of written bytes */
34081 + return written;
34082 +}
34083 +
34084 +/**
34085 + * write_cryptcompress - write of struct file_operations
34086 + * @file: file to write to
34087 + * @buf: address of user-space buffer
34088 + * @count: number of bytes to write
34089 + * @off: position in file to write to
34090 + *
34091 + * This is implementation of vfs's write method of struct file_operations for
34092 + * cryptcompress plugin.
34093 + */
34094 +ssize_t write_cryptcompress(struct file *file, const char __user *buf,
34095 + size_t count, loff_t *off)
34096 +{
34097 + ssize_t result;
34098 + struct inode *inode;
34099 + reiser4_context *ctx;
34100 +
34101 + inode = file->f_dentry->d_inode;
34102 +
34103 + ctx = init_context(inode->i_sb);
34104 + if (IS_ERR(ctx))
34105 + return PTR_ERR(ctx);
34106 +
34107 + mutex_lock(&inode->i_mutex);
34108 +
34109 + result = write_crc_file(file, inode, buf, count, off);
34110 +
34111 + mutex_unlock(&inode->i_mutex);
34112 +
34113 + context_set_commit_async(ctx);
34114 + reiser4_exit_context(ctx);
34115 + return result;
34116 +}
34117 +
34118 +static void
34119 +readpages_crc(struct address_space *mapping, struct list_head *pages,
34120 + void *data)
34121 +{
34122 + file_plugin *fplug;
34123 + item_plugin *iplug;
34124 +
34125 + assert("edward-1112", mapping != NULL);
34126 + assert("edward-1113", mapping->host != NULL);
34127 +
34128 + fplug = inode_file_plugin(mapping->host);
34129 + assert("edward-1114", fplug == file_plugin_by_id(CRC_FILE_PLUGIN_ID));
34130 + iplug = item_plugin_by_id(CTAIL_ID);
34131 +
34132 + iplug->s.file.readpages(data, mapping, pages);
34133 +
34134 + return;
34135 +}
34136 +
34137 +static reiser4_block_nr cryptcompress_estimate_read(struct inode *inode)
34138 +{
34139 + /* reserve one block to update stat data item */
34140 + assert("edward-1193",
34141 + inode_file_plugin(inode)->estimate.update ==
34142 + estimate_update_common);
34143 + return estimate_update_common(inode);
34144 +}
34145 +
34146 +/**
34147 + * read_cryptcompress - read of struct file_operations
34148 + * @file: file to read from
34149 + * @buf: address of user-space buffer
34150 + * @size: number of bytes to read
34151 + * @off: position in file to read from
34152 + *
34153 + * This is implementation of vfs's read method of struct file_operations for
34154 + * cryptcompress plugin.
34155 + */
34156 +ssize_t read_cryptcompress(struct file * file, char __user *buf, size_t size,
34157 + loff_t * off)
34158 +{
34159 + ssize_t result;
34160 + struct inode *inode;
34161 + reiser4_context *ctx;
34162 + reiser4_file_fsdata *fsdata;
34163 + cryptcompress_info_t *info;
34164 + reiser4_block_nr needed;
34165 +
34166 + inode = file->f_dentry->d_inode;
34167 + assert("edward-1194", !inode_get_flag(inode, REISER4_NO_SD));
34168 +
34169 + ctx = init_context(inode->i_sb);
34170 + if (IS_ERR(ctx))
34171 + return PTR_ERR(ctx);
34172 +
34173 + info = cryptcompress_inode_data(inode);
34174 + needed = cryptcompress_estimate_read(inode);
34175 +
34176 + /* FIXME-EDWARD:
34177 + Grab space for sd_update so find_cluster will be happy */
34178 + result = reiser4_grab_space(needed, BA_CAN_COMMIT);
34179 + if (result != 0) {
34180 + reiser4_exit_context(ctx);
34181 + return result;
34182 + }
34183 + fsdata = reiser4_get_file_fsdata(file);
34184 + fsdata->ra2.data = file;
34185 + fsdata->ra2.readpages = readpages_crc;
34186 +
34187 + down_read(&info->lock);
34188 + LOCK_CNT_INC(inode_sem_r);
34189 +
34190 + result = generic_file_read(file, buf, size, off);
34191 +
34192 + up_read(&info->lock);
34193 + LOCK_CNT_DEC(inode_sem_r);
34194 +
34195 + context_set_commit_async(ctx);
34196 + reiser4_exit_context(ctx);
34197 +
34198 + return result;
34199 +}
34200 +
34201 +/* If @index > 0, find the real disk cluster of index (@index - 1).
34202 + If @index == 0, find the real disk cluster of maximal index in the object.
34203 + Keep the incremented index of the result in @found.
34204 + If success was returned:
34205 + (@index == 0 && @found == 0) means that the object doesn't have real disk
34206 + clusters;
34207 + (@index != 0 && @found == 0) means that the disk cluster of (@index - 1)
34208 + doesn't exist.
34209 +*/
34210 +static int
34211 +find_real_disk_cluster(struct inode *inode, cloff_t * found, cloff_t index)
34212 +{
34213 + int result;
34214 + reiser4_key key;
34215 + loff_t offset;
34216 + hint_t *hint;
34217 + lock_handle *lh;
34218 + lookup_bias bias;
34219 + coord_t *coord;
34220 + item_plugin *iplug;
34221 +
34222 + assert("edward-1131", inode != NULL);
34223 + assert("edward-95", crc_inode_ok(inode));
34224 +
34225 + hint = kmalloc(sizeof(*hint), GFP_KERNEL);
34226 + if (hint == NULL)
34227 + return RETERR(-ENOMEM);
34228 + hint_init_zero(hint);
34229 + lh = &hint->lh;
34230 +
34231 + bias = (index ? FIND_EXACT : FIND_MAX_NOT_MORE_THAN);
34232 + offset =
34233 + (index ? clust_to_off(index, inode) -
34234 + 1 : get_key_offset(max_key()));
34235 +
34236 + key_by_inode_cryptcompress(inode, offset, &key);
34237 +
34238 + /* find the last item of this object */
34239 + result =
34240 + find_cluster_item(hint, &key, ZNODE_READ_LOCK, NULL /* ra_info */,
34241 + bias, 0);
34242 + if (cbk_errored(result)) {
34243 + done_lh(lh);
34244 + kfree(hint);
34245 + return result;
34246 + }
34247 + if (result == CBK_COORD_NOTFOUND) {
34248 + /* no real disk clusters */
34249 + done_lh(lh);
34250 + kfree(hint);
34251 + *found = 0;
34252 + return 0;
34253 + }
34254 + /* disk cluster is found */
34255 + coord = &hint->ext_coord.coord;
34256 + coord_clear_iplug(coord);
34257 + result = zload(coord->node);
34258 + if (unlikely(result)) {
34259 + done_lh(lh);
34260 + kfree(hint);
34261 + return result;
34262 + }
34263 + iplug = item_plugin_by_coord(coord);
34264 + assert("edward-277", iplug == item_plugin_by_id(CTAIL_ID));
34265 + assert("edward-1202", ctail_ok(coord));
34266 +
34267 + item_key_by_coord(coord, &key);
34268 + *found = off_to_clust(get_key_offset(&key), inode) + 1;
34269 +
34270 + assert("edward-1132", ergo(index, index == *found));
34271 +
34272 + zrelse(coord->node);
34273 + done_lh(lh);
34274 + kfree(hint);
34275 + return 0;
34276 +}
34277 +
34278 +static int find_fake_appended(struct inode *inode, cloff_t * index)
34279 +{
34280 + return find_real_disk_cluster(inode, index,
34281 + 0 /* find last real one */ );
34282 +}
34283 +
34284 +/* Set the left coord when a unit is not found after node_lookup().
34285 + This takes into account that there can be holes in a sequence
34286 + of disk clusters */
34287 +
34288 +static void adjust_left_coord(coord_t * left_coord)
34289 +{
34290 + switch (left_coord->between) {
34291 + case AFTER_UNIT:
34292 + left_coord->between = AFTER_ITEM; /* fall through */
34293 + case AFTER_ITEM:
34294 + case BEFORE_UNIT:
34295 + break;
34296 + default:
34297 + impossible("edward-1204", "bad left coord to cut");
34298 + }
34299 + return;
34300 +}
34301 +
34302 +#define CRC_CUT_TREE_MIN_ITERATIONS 64
34303 +int
34304 +cut_tree_worker_cryptcompress(tap_t * tap, const reiser4_key * from_key,
34305 + const reiser4_key * to_key,
34306 + reiser4_key * smallest_removed,
34307 + struct inode *object, int truncate, int *progress)
34308 +{
34309 + lock_handle next_node_lock;
34310 + coord_t left_coord;
34311 + int result;
34312 +
34313 + assert("edward-1158", tap->coord->node != NULL);
34314 + assert("edward-1159", znode_is_write_locked(tap->coord->node));
34315 + assert("edward-1160", znode_get_level(tap->coord->node) == LEAF_LEVEL);
34316 +
34317 + *progress = 0;
34318 + init_lh(&next_node_lock);
34319 +
34320 + while (1) {
34321 + znode *node; /* node from which items are cut */
34322 + node_plugin *nplug; /* node plugin for @node */
34323 +
34324 + node = tap->coord->node;
34325 +
34326 + /* Move next_node_lock to the next node on the left. */
34327 + result =
34328 + reiser4_get_left_neighbor(&next_node_lock, node,
34329 + ZNODE_WRITE_LOCK,
34330 + GN_CAN_USE_UPPER_LEVELS);
34331 + if (result != 0 && result != -E_NO_NEIGHBOR)
34332 + break;
34333 + /* FIXME-EDWARD: Check can we delete the node as a whole. */
34334 + result = tap_load(tap);
34335 + if (result)
34336 + return result;
34337 +
34338 + /* Prepare the second (right) point for cut_node() */
34339 + if (*progress)
34340 + coord_init_last_unit(tap->coord, node);
34341 +
34342 + else if (item_plugin_by_coord(tap->coord)->b.lookup == NULL)
34343 + /* set rightmost unit for the items without lookup method */
34344 + tap->coord->unit_pos = coord_last_unit_pos(tap->coord);
34345 +
34346 + nplug = node->nplug;
34347 +
34348 + assert("edward-1161", nplug);
34349 + assert("edward-1162", nplug->lookup);
34350 +
34351 + /* left_coord is leftmost unit cut from @node */
34352 + result = nplug->lookup(node, from_key, FIND_EXACT, &left_coord);
34353 +
34354 + if (IS_CBKERR(result))
34355 + break;
34356 +
34357 + if (result == CBK_COORD_NOTFOUND)
34358 + adjust_left_coord(&left_coord);
34359 +
34360 + /* adjust coordinates so that they are set to existing units */
34361 + if (coord_set_to_right(&left_coord)
34362 + || coord_set_to_left(tap->coord)) {
34363 + result = 0;
34364 + break;
34365 + }
34366 +
34367 + if (coord_compare(&left_coord, tap->coord) ==
34368 + COORD_CMP_ON_RIGHT) {
34369 + /* keys from @from_key to @to_key are not in the tree */
34370 + result = 0;
34371 + break;
34372 + }
34373 +
34374 + /* cut data from one node */
34375 + *smallest_removed = *min_key();
34376 + result = kill_node_content(&left_coord,
34377 + tap->coord,
34378 + from_key,
34379 + to_key,
34380 + smallest_removed,
34381 + next_node_lock.node,
34382 + object, truncate);
34383 +#if REISER4_DEBUG
34384 + /*node_check(node, ~0U); */
34385 +#endif
34386 + tap_relse(tap);
34387 +
34388 + if (result)
34389 + break;
34390 +
34391 + ++(*progress);
34392 +
34393 + /* Check whether all items with keys >= from_key were removed
34394 + * from the tree. */
34395 + if (keyle(smallest_removed, from_key))
34396 + /* result = 0; */
34397 + break;
34398 +
34399 + if (next_node_lock.node == NULL)
34400 + break;
34401 +
34402 + result = tap_move(tap, &next_node_lock);
34403 + done_lh(&next_node_lock);
34404 + if (result)
34405 + break;
34406 +
34407 + /* Break long cut_tree operation (deletion of a large file) if
34408 + * atom requires commit. */
34409 + if (*progress > CRC_CUT_TREE_MIN_ITERATIONS
34410 + && current_atom_should_commit()) {
34411 + result = -E_REPEAT;
34412 + break;
34413 + }
34414 + }
34415 + done_lh(&next_node_lock);
34416 + return result;
34417 +}
34418 +
34419 +/* Append or expand a hole in two steps (exclusive access should be acquired!)
34420 + 1) write zeroes to the current real cluster,
34421 + 2) expand the hole via fake clusters (just increase i_size) */
34422 +static int
34423 +cryptcompress_append_hole(struct inode *inode /*contains old i_size */ ,
34424 + loff_t new_size)
34425 +{
34426 + int result = 0;
34427 + hint_t *hint;
34428 + lock_handle *lh;
34429 + loff_t hole_size;
34430 + int nr_zeroes;
34431 + reiser4_slide_t win;
34432 + reiser4_cluster_t clust;
34433 +
34434 + assert("edward-1133", inode->i_size < new_size);
34435 + assert("edward-1134", schedulable());
34436 + assert("edward-1135", crc_inode_ok(inode));
34437 + assert("edward-1136", current_blocksize == PAGE_CACHE_SIZE);
34438 + assert("edward-1333", off_to_cloff(inode->i_size, inode) != 0);
34439 +
34440 + hint = kmalloc(sizeof(*hint), GFP_KERNEL);
34441 + if (hint == NULL)
34442 + return RETERR(-ENOMEM);
34443 + hint_init_zero(hint);
34444 + lh = &hint->lh;
34445 +
34446 + reiser4_slide_init(&win);
34447 + cluster_init_read(&clust, &win);
34448 + clust.hint = hint;
34449 +
34450 + result = alloc_cluster_pgset(&clust, cluster_nrpages(inode));
34451 + if (result)
34452 + goto out;
34453 + if (off_to_cloff(inode->i_size, inode) == 0)
34454 + goto fake_append;
34455 + hole_size = new_size - inode->i_size;
34456 + nr_zeroes =
34457 + inode_cluster_size(inode) - off_to_cloff(inode->i_size, inode);
34458 + if (hole_size < nr_zeroes)
34459 + nr_zeroes = hole_size;
34460 + set_window(&clust, &win, inode, inode->i_size,
34461 + inode->i_size + nr_zeroes);
34462 + win.stat = HOLE_WINDOW;
34463 +
34464 + assert("edward-1137",
34465 + clust.index == off_to_clust(inode->i_size, inode));
34466 +
34467 + result = prepare_cluster(inode, 0, 0, &clust, PCL_APPEND);
34468 +
34469 + assert("edward-1271", !result || result == -ENOSPC);
34470 + if (result)
34471 + goto out;
34472 + assert("edward-1139",
34473 + clust.dstat == PREP_DISK_CLUSTER ||
34474 + clust.dstat == UNPR_DISK_CLUSTER);
34475 +
34476 + assert("edward-1431", hole_size >= nr_zeroes);
34477 + if (hole_size == nr_zeroes)
34478 + /* nothing to append anymore */
34479 + goto out;
34480 + fake_append:
34481 + INODE_SET_FIELD(inode, i_size, new_size);
34482 + out:
34483 + done_lh(lh);
34484 + kfree(hint);
34485 + put_cluster_handle(&clust);
34486 + return result;
34487 +}
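The nr_zeroes split above can be checked with a small standalone computation
(64KiB cluster assumed, all sizes invented): step 1 zeroes only the tail of
the last real cluster, and the rest of the hole becomes fake clusters when
i_size is raised in step 2:

    #include <assert.h>

    int main(void)
    {
            unsigned long long csize = 65536;
            unsigned long long i_size = 200000;   /* old file size  */
            unsigned long long new_size = 500000; /* requested size */

            unsigned long long hole = new_size - i_size;          /* 300000 */
            unsigned long long zeroes = csize - (i_size % csize); /* tail   */
            if (hole < zeroes)
                    zeroes = hole;

            assert(zeroes == 62144); /* 65536 - 3392, zeroed in step 1 */
            assert(hole > zeroes);   /* remainder appended as fake clusters */
            return 0;
    }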
34488 +
34489 +#if REISER4_DEBUG
34490 +static int
34491 +pages_truncate_ok(struct inode *inode, loff_t old_size, pgoff_t start)
34492 +{
34493 + struct pagevec pvec;
34494 + int i;
34495 + int count;
34496 + int rest;
34497 +
34498 + rest = count_to_nrpages(old_size) - start;
34499 +
34500 + pagevec_init(&pvec, 0);
34502 +
34503 + while (rest) {
34504 + count = min_count(pagevec_space(&pvec), rest);
34505 + pvec.nr = find_get_pages(inode->i_mapping, start,
34506 + count, pvec.pages);
34507 + for (i = 0; i < pagevec_count(&pvec); i++) {
34508 + if (PageUptodate(pvec.pages[i])) {
34509 + warning("edward-1205",
34510 + "truncated page of index %lu is uptodate",
34511 + pvec.pages[i]->index);
34512 + return 0;
34513 + }
34514 + }
34515 + start += count;
34516 + rest -= count;
34517 + pagevec_release(&pvec);
34518 + }
34519 + return 1;
34520 +}
34521 +
34522 +static int body_truncate_ok(struct inode *inode, cloff_t aidx)
34523 +{
34524 + int result;
34525 + cloff_t raidx;
34526 +
34527 + result = find_fake_appended(inode, &raidx);
34528 + return !result && (aidx == raidx);
34529 +}
34530 +#endif
34531 +
34532 +static int
34533 +update_cryptcompress_size(struct inode *inode, reiser4_key * key, int update_sd)
34534 +{
34535 + return (get_key_offset(key) & ((loff_t) (inode_cluster_size(inode)) - 1)
34536 + ? 0 : update_file_size(inode, key, update_sd));
34537 +}
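+
+/* A worked example of the alignment test above (illustrative only, assuming
+   inode_cluster_size() == 65536):
+
+	offset 131072: 131072 & 65535 == 0 -> cluster-aligned cut, the file
+		size in the stat data is updated;
+	offset 131073: 131073 & 65535 != 0 -> mid-cluster cut, the update is
+		skipped until a cluster-aligned key is cut. */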
34538 +
34539 +/* prune cryptcompress file in two steps (exclusive access should be acquired!)
34540 +   1) cut all disk clusters except the last, partially truncated one;
34541 +   2) write zeroes to and capture the last partially truncated page cluster
34542 +   if it exists, otherwise truncate via a fake-cluster prune (just decrease i_size)
34543 +*/
34544 +static int
34545 +prune_cryptcompress(struct inode *inode, loff_t new_size, int update_sd,
34546 + cloff_t aidx)
34547 +{
34548 + int result = 0;
34549 + unsigned nr_zeroes;
34550 + loff_t to_prune;
34551 + loff_t old_size;
34552 + cloff_t ridx;
34553 +
34554 + hint_t *hint;
34555 + lock_handle *lh;
34556 + reiser4_slide_t win;
34557 + reiser4_cluster_t clust;
34558 +
34559 + assert("edward-1140", inode->i_size >= new_size);
34560 + assert("edward-1141", schedulable());
34561 + assert("edward-1142", crc_inode_ok(inode));
34562 + assert("edward-1143", current_blocksize == PAGE_CACHE_SIZE);
34563 +
34564 + hint = kmalloc(sizeof(*hint), GFP_KERNEL);
34565 + if (hint == NULL)
34566 + return RETERR(-ENOMEM);
34567 + hint_init_zero(hint);
34568 + lh = &hint->lh;
34569 +
34570 + reiser4_slide_init(&win);
34571 + cluster_init_read(&clust, &win);
34572 + clust.hint = hint;
34573 +
34574 + /* rightmost completely truncated cluster */
34575 + ridx = count_to_nrclust(new_size, inode);
34576 +
34577 + assert("edward-1174", ridx <= aidx);
34578 + old_size = inode->i_size;
34579 + if (ridx != aidx) {
34580 + result = cut_file_items(inode,
34581 + clust_to_off(ridx, inode),
34582 + update_sd,
34583 + clust_to_off(aidx, inode),
34584 + update_cryptcompress_size);
34585 + if (result)
34586 + goto out;
34587 + }
34588 + if (!off_to_cloff(new_size, inode)) {
34589 + /* no partially truncated clusters */
34590 + assert("edward-1145", inode->i_size == new_size);
34591 + goto finish;
34592 + }
34593 + assert("edward-1146", new_size < inode->i_size);
34594 +
34595 + to_prune = inode->i_size - new_size;
34596 +
34597 + /* partial truncate of leftmost cluster,
34598 + first check if it is fake */
34599 + result = find_real_disk_cluster(inode, &aidx, ridx);
34600 + if (result)
34601 + goto out;
34602 + if (!aidx)
34603 + /* yup, this is fake one */
34604 + goto finish;
34605 +
34606 + assert("edward-1148", aidx == ridx);
34607 +
34608 + /* do partial truncate of the leftmost page cluster,
34609 + then try to capture this one */
34610 + result = alloc_cluster_pgset(&clust, cluster_nrpages(inode));
34611 + if (result)
34612 + goto out;
34613 + nr_zeroes = (off_to_pgoff(new_size) ?
34614 + PAGE_CACHE_SIZE - off_to_pgoff(new_size) : 0);
34615 + set_window(&clust, &win, inode, new_size, new_size + nr_zeroes);
34616 + win.stat = HOLE_WINDOW;
34617 +
34618 + assert("edward-1149", clust.index == ridx - 1);
34619 +
34620 + result = prepare_cluster(inode, 0, 0, &clust, PCL_TRUNCATE);
34621 + if (result)
34622 + goto out;
34623 + assert("edward-1151",
34624 + clust.dstat == PREP_DISK_CLUSTER ||
34625 + clust.dstat == UNPR_DISK_CLUSTER);
34626 +
34627 + assert("edward-1191", inode->i_size == new_size);
34628 + assert("edward-1206", body_truncate_ok(inode, ridx));
34629 + finish:
34630 +	/* drop all the pages that don't have jnodes (i.e. pages
34631 +	   which cannot be truncated by cut_file_items() because
34632 +	   of holes represented by fake disk clusters), including
34633 +	   the pages of the partially truncated cluster which was
34634 +	   released by prepare_cluster() */
34635 + truncate_inode_pages(inode->i_mapping, new_size);
34636 + INODE_SET_FIELD(inode, i_size, new_size);
34637 + out:
34638 + assert("edward-1334", !result || result == -ENOSPC);
34639 + assert("edward-1209",
34640 + pages_truncate_ok(inode, old_size, count_to_nrpages(new_size)));
34641 + done_lh(lh);
34642 + kfree(hint);
34643 + put_cluster_handle(&clust);
34644 + return result;
34645 +}
34646 +
34647 +/* Prepare cryptcompress file for truncate:
34648 + prune or append rightmost fake logical clusters (if any)
34649 +*/
34650 +static int
34651 +start_truncate_fake(struct inode *inode, cloff_t aidx, loff_t new_size,
34652 + int update_sd)
34653 +{
34654 + int result = 0;
34655 + int bytes;
34656 +
34657 + if (new_size > inode->i_size) {
34658 + /* append */
34659 + if (inode->i_size < clust_to_off(aidx, inode))
34660 + /* no fake bytes */
34661 + return 0;
34662 + bytes = new_size - inode->i_size;
34663 + INODE_SET_FIELD(inode, i_size, inode->i_size + bytes);
34664 + } else {
34665 + /* prune */
34666 + if (inode->i_size <= clust_to_off(aidx, inode))
34667 + /* no fake bytes */
34668 + return 0;
34669 + bytes =
34670 + inode->i_size - max_count(new_size,
34671 + clust_to_off(aidx, inode));
34672 + if (!bytes)
34673 + return 0;
34674 + INODE_SET_FIELD(inode, i_size, inode->i_size - bytes);
34675 +		/* In the case of a fake prune we need to drop the page
34676 +		   cluster. There are only 2 cases for a partially truncated
34677 +		   page:
34678 +		   1. If it is dirty, then it is anonymous
34679 +		      (was dirtied via mmap), and will be captured
34680 +		      later via ->capture().
34681 +		   2. If it is clean, then it is filled with zeroes.
34682 +		   In both cases we don't need to make it dirty and
34683 +		   capture it here.
34684 +		*/
34684 + truncate_inode_pages(inode->i_mapping, inode->i_size);
34685 + }
34686 + if (update_sd)
34687 + result = update_sd_cryptcompress(inode);
34688 + return result;
34689 +}
34690 +
34691 +/* This is called in setattr_cryptcompress when it is used to truncate,
34692 + and in delete_cryptcompress */
34693 +static int cryptcompress_truncate(struct inode *inode, /* old size */
34694 + loff_t new_size, /* new size */
34695 + int update_sd)
34696 +{
34697 + int result;
34698 + cloff_t aidx;
34699 +
34700 + result = find_fake_appended(inode, &aidx);
34701 + if (result)
34702 + return result;
34703 + assert("edward-1208",
34704 + ergo(aidx > 0, inode->i_size > clust_to_off(aidx - 1, inode)));
34705 +
34706 + result = start_truncate_fake(inode, aidx, new_size, update_sd);
34707 + if (result)
34708 + return result;
34709 + if (inode->i_size == new_size)
34710 + /* nothing to truncate anymore */
34711 + return 0;
34712 + return (inode->i_size < new_size ?
34713 + cryptcompress_append_hole(inode, new_size) :
34714 + prune_cryptcompress(inode, new_size, update_sd, aidx));
34715 +}
34716 +
34717 +static void clear_moved_tag_cluster(struct address_space * mapping,
34718 + reiser4_cluster_t * clust)
34719 +{
34720 + int i;
34721 + void * ret;
34722 + read_lock_irq(&mapping->tree_lock);
34723 + for (i = 0; i < clust->nr_pages; i++) {
34724 + assert("edward-1438", clust->pages[i] != NULL);
34725 + ret = radix_tree_tag_clear(&mapping->page_tree,
34726 + clust->pages[i]->index,
34727 + PAGECACHE_TAG_REISER4_MOVED);
34728 + assert("edward-1439", ret == clust->pages[i]);
34729 + }
34730 + read_unlock_irq(&mapping->tree_lock);
34731 +}
34732 +
34733 +/* Capture an anonymous page cluster. (A page cluster is
34734 +   anonymous if it contains at least one anonymous page.) */
34735 +static int
34736 +capture_page_cluster(reiser4_cluster_t * clust, struct inode *inode)
34737 +{
34738 + int result;
34739 +
34740 + assert("edward-1073", clust != NULL);
34741 + assert("edward-1074", inode != NULL);
34742 + assert("edward-1075", clust->dstat == INVAL_DISK_CLUSTER);
34743 +
34744 + result = prepare_cluster(inode, 0, 0, clust, PCL_APPEND);
34745 + if (result)
34746 + return result;
34747 + set_cluster_pages_dirty(clust);
34748 + clear_moved_tag_cluster(inode->i_mapping, clust);
34749 +
34750 + result = try_capture_cluster(clust, inode);
34751 + put_hint_cluster(clust, inode, ZNODE_WRITE_LOCK);
34752 + if (unlikely(result)) {
34753 +		/* set the cleared tag back, so it will be
34754 +		   possible to capture the cluster again later */
34755 + read_lock_irq(&inode->i_mapping->tree_lock);
34756 + radix_tree_tag_set(&inode->i_mapping->page_tree,
34757 + clust_to_pg(clust->index, inode),
34758 + PAGECACHE_TAG_REISER4_MOVED);
34759 + read_unlock_irq(&inode->i_mapping->tree_lock);
34760 +
34761 + release_cluster_pages_and_jnode(clust);
34762 + }
34763 + return result;
34764 +}
34765 +
34766 +#define MAX_CLUSTERS_TO_CAPTURE(inode) (1024 >> cluster_nrpages_shift(inode))
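+
+/* A worked instance of the macro above (illustrative only, assuming 4 KiB
+   pages and the default 64 KiB logical cluster, i.e.
+   cluster_nrpages_shift() == 4):
+
+	MAX_CLUSTERS_TO_CAPTURE = 1024 >> 4 = 64 clusters,
+
+   which caps a single capture pass at 64 * 16 == 1024 pages. */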
34767 +
34768 +/* read lock should be acquired */
34769 +static int
34770 +capture_anonymous_clusters(struct address_space *mapping, pgoff_t * index,
34771 + int to_capture)
34772 +{
34773 + int result = 0;
34774 + int found;
34775 + int progress = 0;
34776 + struct page *page = NULL;
34777 + hint_t *hint;
34778 + lock_handle *lh;
34779 + reiser4_cluster_t clust;
34780 +
34781 + assert("edward-1127", mapping != NULL);
34782 + assert("edward-1128", mapping->host != NULL);
34783 + assert("edward-1440", mapping->host->i_mapping == mapping);
34784 +
34785 + hint = kmalloc(sizeof(*hint), GFP_KERNEL);
34786 + if (hint == NULL)
34787 + return RETERR(-ENOMEM);
34788 + hint_init_zero(hint);
34789 + lh = &hint->lh;
34790 +
34791 + cluster_init_read(&clust, NULL);
34792 + clust.hint = hint;
34793 +
34794 + result = alloc_cluster_pgset(&clust, cluster_nrpages(mapping->host));
34795 + if (result)
34796 + goto out;
34797 +
34798 + while (to_capture > 0) {
34799 + found =
34800 + find_get_pages_tag(mapping, index,
34801 + PAGECACHE_TAG_REISER4_MOVED, 1, &page);
34802 + if (!found) {
34803 + *index = (pgoff_t) - 1;
34804 + break;
34805 + }
34806 + assert("edward-1109", page != NULL);
34807 +
34808 + move_cluster_forward(&clust, mapping->host, page->index,
34809 + &progress);
34810 + result = capture_page_cluster(&clust, mapping->host);
34811 + page_cache_release(page);
34812 + if (result)
34813 + break;
34814 + to_capture--;
34815 + }
34816 + if (result) {
34817 + warning("edward-1077",
34818 + "Cannot capture anon pages: result=%i (captured=%d)\n",
34819 + result,
34820 + ((__u32) MAX_CLUSTERS_TO_CAPTURE(mapping->host)) -
34821 + to_capture);
34822 + } else {
34823 + /* something had to be found */
34824 + assert("edward-1078",
34825 + to_capture <= MAX_CLUSTERS_TO_CAPTURE(mapping->host));
34826 + if (to_capture <= 0)
34827 +			/* there may be more pages left */
34828 + __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
34829 + }
34830 + out:
34831 + done_lh(lh);
34832 + kfree(hint);
34833 + put_cluster_handle(&clust);
34834 + return result;
34835 +}
34836 +
34837 +/* Check mapping for existence of uncaptured dirty pages.
34838 +   This returns non-zero if the page tree contains pages tagged
34839 +   PAGECACHE_TAG_REISER4_MOVED */
34840 +static int crc_inode_has_anon_pages(struct inode *inode)
34841 +{
34842 + return mapping_tagged(inode->i_mapping, PAGECACHE_TAG_REISER4_MOVED);
34843 +}
34844 +
34845 +/* this is implementation of vfs's writepages method of struct
34846 + address_space_operations */
34847 +int
34848 +writepages_cryptcompress(struct address_space *mapping,
34849 + struct writeback_control *wbc)
34850 +{
34851 + int result;
34852 + int to_capture;
34853 + pgoff_t nrpages;
34854 + pgoff_t index = 0;
34855 + cryptcompress_info_t *info;
34856 + struct inode *inode;
34857 +
34858 + inode = mapping->host;
34859 + if (!crc_inode_has_anon_pages(inode)) {
34860 + result = 0;
34861 + goto end;
34862 + }
34863 +
34864 + info = cryptcompress_inode_data(inode);
34865 + nrpages = count_to_nrpages(i_size_read(inode));
34866 +
34867 + if (wbc->sync_mode != WB_SYNC_ALL)
34868 + to_capture =
34869 + min_count(wbc->nr_to_write, MAX_CLUSTERS_TO_CAPTURE(inode));
34870 + else
34871 + to_capture = MAX_CLUSTERS_TO_CAPTURE(inode);
34872 + do {
34873 + reiser4_context *ctx;
34874 +
34875 + if (is_in_reiser4_context()) {
34876 + /* FIXME-EDWARD: REMOVEME */
34877 + all_grabbed2free();
34878 +
34879 + /* It can be in the context of write system call from
34880 + balance_dirty_pages() */
34881 + if (down_read_trylock(&info->lock) == 0) {
34882 + result = RETERR(-EBUSY);
34883 + break;
34884 + }
34885 + } else
34886 + down_read(&info->lock);
34887 +
34888 + ctx = init_context(inode->i_sb);
34889 + if (IS_ERR(ctx)) {
34890 + result = PTR_ERR(ctx);
34891 + break;
34892 + }
34893 + ctx->nobalance = 1;
34894 +
34895 + assert("edward-1079",
34896 + lock_stack_isclean(get_current_lock_stack()));
34897 +
34898 + LOCK_CNT_INC(inode_sem_r);
34899 +
34900 + result =
34901 + capture_anonymous_clusters(inode->i_mapping, &index,
34902 + to_capture);
34903 +
34904 + up_read(&info->lock);
34905 +
34906 + LOCK_CNT_DEC(inode_sem_r);
34907 +
34908 + if (result != 0 || wbc->sync_mode != WB_SYNC_ALL) {
34909 + reiser4_exit_context(ctx);
34910 + break;
34911 + }
34912 + result = txnmgr_force_commit_all(inode->i_sb, 0);
34913 + reiser4_exit_context(ctx);
34914 + } while (result == 0 && index < nrpages);
34915 +
34916 + end:
34917 + if (is_in_reiser4_context()) {
34918 + if (get_current_context()->nr_captured >= CAPTURE_APAGE_BURST) {
34919 + /* there are already pages to flush, flush them out, do
34920 + not delay until end of reiser4_sync_inodes */
34921 + writeout(inode->i_sb, wbc);
34922 + get_current_context()->nr_captured = 0;
34923 + }
34924 + }
34925 + return result;
34926 +}
34927 +
34928 +/* plugin->u.file.mmap */
34929 +int mmap_cryptcompress(struct file *file, struct vm_area_struct *vma)
34930 +{
34931 + //return -ENOSYS;
34932 + return generic_file_mmap(file, vma);
34933 +}
34934 +
34935 +/* plugin->u.file.release */
34936 +/* plugin->u.file.get_block */
34937 +
34938 +/* this is implementation of delete method of file plugin for
34939 + cryptcompress objects */
34940 +int delete_cryptcompress(struct inode *inode)
34941 +{
34942 + int result;
34943 +
34944 + assert("edward-429", inode->i_nlink == 0);
34945 +
34946 + if (inode->i_size) {
34947 + result = cryptcompress_truncate(inode, 0, 0);
34948 + if (result) {
34949 + warning("edward-430",
34950 + "cannot truncate cryptcompress file %lli: %i",
34951 + (unsigned long long)get_inode_oid(inode),
34952 + result);
34953 + return result;
34954 + }
34955 + }
34956 + /* and remove stat data */
34957 + return delete_object_common(inode);
34958 +}
34959 +
34960 +/* plugin->u.file.setattr method
34961 + see plugin.h for description */
34962 +int setattr_cryptcompress(struct dentry *dentry, /* Object to change attributes */
34963 + struct iattr *attr /* change description */ )
34964 +{
34965 + int result;
34966 + struct inode *inode;
34967 +
34968 + inode = dentry->d_inode;
34969 + result = check_cryptcompress(inode);
34970 + if (result)
34971 + return result;
34972 + if (attr->ia_valid & ATTR_SIZE) {
34973 + /* EDWARD-FIXME-HANS: VS-FIXME-HANS:
34974 + Q: this case occurs when? truncate?
34975 + A: yes
34976 +
34977 + Q: If so, why isn't this code in truncate itself instead of here?
34978 +
34979 + A: because vfs calls fs's truncate after it has called truncate_inode_pages to get rid of pages
34980 + corresponding to part of file being truncated. In reiser4 it may cause existence of unallocated
34981 + extents which do not have jnodes. Flush code does not expect that. Solution of this problem is
34982 + straightforward. As vfs's truncate is implemented using setattr operation (common implementation of
34983 + which calls truncate_inode_pages and fs's truncate in case when size of file changes) - it seems
34984 + reasonable to have reiser4_setattr which will take care of removing pages, jnodes and extents
34985 + simultaneously in case of truncate.
34986 + Q: do you think implementing truncate using setattr is ugly,
34987 + and vfs needs improving, or is there some sense in which this is a good design?
34988 +
34989 + A: VS-FIXME-HANS:
34990 + */
34991 +
34992 + /* truncate does reservation itself and requires exclusive access obtained */
34993 + if (inode->i_size != attr->ia_size) {
34994 + reiser4_context *ctx;
34995 + loff_t old_size;
34996 + cryptcompress_info_t *info =
34997 + cryptcompress_inode_data(inode);
34998 +
34999 + ctx = init_context(dentry->d_inode->i_sb);
35000 + if (IS_ERR(ctx))
35001 + return PTR_ERR(ctx);
35002 +
35003 + down_write(&info->lock);
35004 + LOCK_CNT_INC(inode_sem_w);
35005 +
35006 + inode_check_scale(inode, inode->i_size, attr->ia_size);
35007 +
35008 + old_size = inode->i_size;
35009 +
35010 + result =
35011 + cryptcompress_truncate(inode, attr->ia_size,
35012 + 1 /* update stat data */ );
35013 + if (result) {
35014 + warning("edward-1192",
35015 + "truncate_cryptcompress failed: oid %lli, "
35016 + "old size %lld, new size %lld, retval %d",
35017 + (unsigned long long)
35018 + get_inode_oid(inode), old_size,
35019 + attr->ia_size, result);
35020 + }
35021 + up_write(&info->lock);
35022 + LOCK_CNT_DEC(inode_sem_w);
35023 + context_set_commit_async(ctx);
35024 + reiser4_exit_context(ctx);
35025 + } else
35026 + result = 0;
35027 + } else
35028 + result = setattr_common(dentry, attr);
35029 + return result;
35030 +}
35031 +
35032 +/* sendfile_cryptcompress - sendfile of struct file_operations */
35033 +ssize_t
35034 +sendfile_cryptcompress(struct file *file, loff_t *ppos, size_t count,
35035 + read_actor_t actor, void *target)
35036 +{
35037 + reiser4_context *ctx;
35038 + ssize_t result;
35039 + struct inode *inode;
35040 + cryptcompress_info_t *info;
35041 +
35042 + inode = file->f_dentry->d_inode;
35043 + ctx = init_context(inode->i_sb);
35044 + if (IS_ERR(ctx))
35045 + return PTR_ERR(ctx);
35046 + /*
35047 +	 * generic_file_sendfile may want to call update_atime. Grab space for
35048 + * stat data update
35049 + */
35050 + result = reiser4_grab_space(estimate_update_common(inode),
35051 + BA_CAN_COMMIT);
35052 + if (result)
35053 + goto exit;
35054 + info = cryptcompress_inode_data(inode);
35055 + down_read(&info->lock);
35056 + result = generic_file_sendfile(file, ppos, count, actor, target);
35057 + up_read(&info->lock);
35058 + exit:
35059 + reiser4_exit_context(ctx);
35060 + return result;
35061 +}
35062 +
35063 +/*
35064 + * release_cryptcompress - release of struct file_operations
35065 + * @inode: inode of released file
35066 + * @file: file to release
35067 + */
35068 +int release_cryptcompress(struct inode *inode, struct file *file)
35069 +{
35070 + reiser4_context *ctx = init_context(inode->i_sb);
35071 +
35072 + if (IS_ERR(ctx))
35073 + return PTR_ERR(ctx);
35074 + reiser4_free_file_fsdata(file);
35075 + reiser4_exit_context(ctx);
35076 + return 0;
35077 +}
35078 +
35079 +static int
35080 +save_len_cryptcompress_plugin(struct inode *inode, reiser4_plugin * plugin)
35081 +{
35082 + assert("edward-457", inode != NULL);
35083 + assert("edward-458", plugin != NULL);
35084 + assert("edward-459", plugin->h.id == CRC_FILE_PLUGIN_ID);
35085 + return 0;
35086 +}
35087 +
35088 +static int
35089 +load_cryptcompress_plugin(struct inode *inode, reiser4_plugin * plugin,
35090 + char **area, int *len)
35091 +{
35092 + assert("edward-455", inode != NULL);
35093 + assert("edward-456", (reiser4_inode_data(inode)->pset != NULL));
35094 +
35095 + plugin_set_file(&reiser4_inode_data(inode)->pset,
35096 + file_plugin_by_id(CRC_FILE_PLUGIN_ID));
35097 + return 0;
35098 +}
35099 +
35100 +static int change_cryptcompress(struct inode *inode, reiser4_plugin * plugin)
35101 +{
35102 + /* cannot change object plugin of already existing object */
35103 + return RETERR(-EINVAL);
35104 +}
35105 +
35106 +struct reiser4_plugin_ops cryptcompress_plugin_ops = {
35107 + .load = load_cryptcompress_plugin,
35108 + .save_len = save_len_cryptcompress_plugin,
35109 + .save = NULL,
35110 + .alignment = 8,
35111 + .change = change_cryptcompress
35112 +};
35113 +
35114 +/*
35115 + Local variables:
35116 + c-indentation-style: "K&R"
35117 + mode-name: "LC"
35118 + c-basic-offset: 8
35119 + tab-width: 8
35120 + fill-column: 80
35121 + scroll-step: 1
35122 + End:
35123 +*/
35124 Index: linux-2.6.16/fs/reiser4/plugin/file/cryptcompress.h
35125 ===================================================================
35126 --- /dev/null
35127 +++ linux-2.6.16/fs/reiser4/plugin/file/cryptcompress.h
35128 @@ -0,0 +1,551 @@
35129 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
35130 +/* See http://www.namesys.com/cryptcompress_design.html */
35131 +
35132 +#if !defined( __FS_REISER4_CRYPTCOMPRESS_H__ )
35133 +#define __FS_REISER4_CRYPTCOMPRESS_H__
35134 +
35135 +#include "../compress/compress.h"
35136 +#include "../crypto/cipher.h"
35137 +
35138 +#include <linux/pagemap.h>
35139 +#include <linux/vmalloc.h>
35140 +
35141 +#define MIN_CLUSTER_SIZE PAGE_CACHE_SIZE
35142 +#define MIN_CLUSTER_SHIFT PAGE_CACHE_SHIFT
35143 +#define MAX_CLUSTER_SHIFT 16
35144 +#define MAX_CLUSTER_NRPAGES (1U << MAX_CLUSTER_SHIFT >> PAGE_CACHE_SHIFT)
35145 +#define DC_CHECKSUM_SIZE 4
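+
+/* For example (illustrative only), with 4 KiB pages (PAGE_CACHE_SHIFT == 12)
+   the largest supported logical cluster is 1 << MAX_CLUSTER_SHIFT == 64 KiB,
+   so MAX_CLUSTER_NRPAGES == (1U << 16) >> 12 == 16 pages per cluster. */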
35146 +
35147 +static inline loff_t min_count(loff_t a, loff_t b)
35148 +{
35149 + return (a < b ? a : b);
35150 +}
35151 +
35152 +static inline loff_t max_count(loff_t a, loff_t b)
35153 +{
35154 + return (a > b ? a : b);
35155 +}
35156 +
35157 +#if REISER4_DEBUG
35158 +static inline int cluster_shift_ok(int shift)
35159 +{
35160 + return (shift >= MIN_CLUSTER_SHIFT) && (shift <= MAX_CLUSTER_SHIFT);
35161 +}
35162 +#endif
35163 +
35164 +typedef struct tfm_stream {
35165 + __u8 *data;
35166 + size_t size;
35167 +} tfm_stream_t;
35168 +
35169 +typedef enum {
35170 + INPUT_STREAM,
35171 + OUTPUT_STREAM,
35172 + LAST_STREAM
35173 +} tfm_stream_id;
35174 +
35175 +typedef tfm_stream_t *tfm_unit[LAST_STREAM];
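+
+/* A tfm_unit is the pair of scratch buffers used by the transform path:
+   tun[INPUT_STREAM] holds the assembled page cluster and tun[OUTPUT_STREAM]
+   receives the transform result (see alternate_streams() below for how the
+   two are chained between transforms). */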
35176 +
35177 +static inline __u8 *ts_data(tfm_stream_t * stm)
35178 +{
35179 + assert("edward-928", stm != NULL);
35180 + return stm->data;
35181 +}
35182 +
35183 +static inline size_t ts_size(tfm_stream_t * stm)
35184 +{
35185 + assert("edward-929", stm != NULL);
35186 + return stm->size;
35187 +}
35188 +
35189 +static inline void set_ts_size(tfm_stream_t * stm, size_t size)
35190 +{
35191 + assert("edward-930", stm != NULL);
35192 +
35193 + stm->size = size;
35194 +}
35195 +
35196 +static inline int alloc_ts(tfm_stream_t ** stm)
35197 +{
35198 + assert("edward-931", stm);
35199 + assert("edward-932", *stm == NULL);
35200 +
35201 + *stm = kmalloc(sizeof **stm, GFP_KERNEL);
35202 + if (*stm == NULL)
35203 + return -ENOMEM;
35204 + memset(*stm, 0, sizeof **stm);
35205 + return 0;
35206 +}
35207 +
35208 +static inline void free_ts(tfm_stream_t * stm)
35209 +{
35210 + assert("edward-933", !ts_data(stm));
35211 + assert("edward-934", !ts_size(stm));
35212 +
35213 + kfree(stm);
35214 +}
35215 +
35216 +static inline int alloc_ts_data(tfm_stream_t * stm, size_t size)
35217 +{
35218 + assert("edward-935", !ts_data(stm));
35219 + assert("edward-936", !ts_size(stm));
35220 + assert("edward-937", size != 0);
35221 +
35222 + stm->data = vmalloc(size);
35223 + if (!stm->data)
35224 + return -ENOMEM;
35225 + set_ts_size(stm, size);
35226 + return 0;
35227 +}
35228 +
35229 +static inline void free_ts_data(tfm_stream_t * stm)
35230 +{
35231 + assert("edward-938", equi(ts_data(stm), ts_size(stm)));
35232 +
35233 + if (ts_data(stm))
35234 + vfree(ts_data(stm));
35235 + memset(stm, 0, sizeof *stm);
35236 +}
35237 +
35238 +/* Write modes for item conversion in flush convert phase */
35239 +typedef enum {
35240 + CRC_APPEND_ITEM = 1,
35241 + CRC_OVERWRITE_ITEM = 2,
35242 + CRC_CUT_ITEM = 3
35243 +} crc_write_mode_t;
35244 +
35245 +typedef enum {
35246 + PCL_UNKNOWN = 0, /* invalid option */
35247 + PCL_APPEND = 1, /* append and/or overwrite */
35248 + PCL_TRUNCATE = 2 /* truncate */
35249 +} page_cluster_op;
35250 +
35251 +/* Reiser4 file write/read transforms a page cluster into a disk cluster (and
35252 +   back) using crypto/compression transforms implemented by reiser4 transform
35253 +   plugins. Before each transform we allocate a pair of streams (tfm_unit) and
35254 +   assemble the page cluster into the input one. After the transform we split
35255 +   the output stream into a set of items (the disk cluster).
35256 +*/
35257 +typedef struct tfm_cluster {
35258 + coa_set coa;
35259 + tfm_unit tun;
35260 + tfm_action act;
35261 + int uptodate;
35262 + int lsize; /* size of the logical cluster */
35263 + int len; /* length of the transform stream */
35264 +} tfm_cluster_t;
35265 +
35266 +static inline coa_t get_coa(tfm_cluster_t * tc, reiser4_compression_id id, tfm_action act)
35267 +{
35268 + return tc->coa[id][act];
35269 +}
35270 +
35271 +static inline void
35272 +set_coa(tfm_cluster_t * tc, reiser4_compression_id id, tfm_action act, coa_t coa)
35273 +{
35274 + tc->coa[id][act] = coa;
35275 +}
35276 +
35277 +static inline int
35278 +alloc_coa(tfm_cluster_t * tc, compression_plugin * cplug)
35279 +{
35280 + coa_t coa;
35281 +
35282 + coa = cplug->alloc(tc->act);
35283 + if (IS_ERR(coa))
35284 + return PTR_ERR(coa);
35285 + set_coa(tc, cplug->h.id, tc->act, coa);
35286 + return 0;
35287 +}
35288 +
35289 +static inline int
35290 +grab_coa(tfm_cluster_t * tc, compression_plugin * cplug)
35291 +{
35292 + return (cplug->alloc && !get_coa(tc, cplug->h.id, tc->act) ?
35293 + alloc_coa(tc, cplug) : 0);
35294 +}
35295 +
35296 +static inline void free_coa_set(tfm_cluster_t * tc)
35297 +{
35298 + tfm_action j;
35299 + reiser4_compression_id i;
35300 + compression_plugin *cplug;
35301 +
35302 + assert("edward-810", tc != NULL);
35303 +
35304 + for (j = 0; j < LAST_TFM; j++)
35305 + for (i = 0; i < LAST_COMPRESSION_ID; i++) {
35306 + if (!get_coa(tc, i, j))
35307 + continue;
35308 + cplug = compression_plugin_by_id(i);
35309 + assert("edward-812", cplug->free != NULL);
35310 + cplug->free(get_coa(tc, i, j), j);
35311 + set_coa(tc, i, j, 0);
35312 + }
35313 + return;
35314 +}
35315 +
35316 +static inline tfm_stream_t *tfm_stream(tfm_cluster_t * tc, tfm_stream_id id)
35317 +{
35318 + return tc->tun[id];
35319 +}
35320 +
35321 +static inline void
35322 +set_tfm_stream(tfm_cluster_t * tc, tfm_stream_id id, tfm_stream_t * ts)
35323 +{
35324 + tc->tun[id] = ts;
35325 +}
35326 +
35327 +static inline __u8 *tfm_stream_data(tfm_cluster_t * tc, tfm_stream_id id)
35328 +{
35329 + return ts_data(tfm_stream(tc, id));
35330 +}
35331 +
35332 +static inline void
35333 +set_tfm_stream_data(tfm_cluster_t * tc, tfm_stream_id id, __u8 * data)
35334 +{
35335 + tfm_stream(tc, id)->data = data;
35336 +}
35337 +
35338 +static inline size_t tfm_stream_size(tfm_cluster_t * tc, tfm_stream_id id)
35339 +{
35340 + return ts_size(tfm_stream(tc, id));
35341 +}
35342 +
35343 +static inline void
35344 +set_tfm_stream_size(tfm_cluster_t * tc, tfm_stream_id id, size_t size)
35345 +{
35346 + tfm_stream(tc, id)->size = size;
35347 +}
35348 +
35349 +static inline int
35350 +alloc_tfm_stream(tfm_cluster_t * tc, size_t size, tfm_stream_id id)
35351 +{
35352 + assert("edward-939", tc != NULL);
35353 + assert("edward-940", !tfm_stream(tc, id));
35354 +
35355 + tc->tun[id] = kmalloc(sizeof(tfm_stream_t), GFP_KERNEL);
35356 + if (!tc->tun[id])
35357 + return -ENOMEM;
35358 + memset(tfm_stream(tc, id), 0, sizeof(tfm_stream_t));
35359 + return alloc_ts_data(tfm_stream(tc, id), size);
35360 +}
35361 +
35362 +static inline int
35363 +realloc_tfm_stream(tfm_cluster_t * tc, size_t size, tfm_stream_id id)
35364 +{
35365 + assert("edward-941", tfm_stream_size(tc, id) < size);
35366 + free_ts_data(tfm_stream(tc, id));
35367 + return alloc_ts_data(tfm_stream(tc, id), size);
35368 +}
35369 +
35370 +static inline void free_tfm_stream(tfm_cluster_t * tc, tfm_stream_id id)
35371 +{
35372 + free_ts_data(tfm_stream(tc, id));
35373 + free_ts(tfm_stream(tc, id));
35374 + set_tfm_stream(tc, id, 0);
35375 +}
35376 +
35377 +static inline unsigned coa_overrun(compression_plugin * cplug, int ilen)
35378 +{
35379 + return (cplug->overrun != NULL ? cplug->overrun(ilen) : 0);
35380 +}
35381 +
35382 +static inline void free_tfm_unit(tfm_cluster_t * tc)
35383 +{
35384 + tfm_stream_id id;
35385 + for (id = 0; id < LAST_STREAM; id++) {
35386 + if (!tfm_stream(tc, id))
35387 + continue;
35388 + free_tfm_stream(tc, id);
35389 + }
35390 +}
35391 +
35392 +static inline void put_tfm_cluster(tfm_cluster_t * tc)
35393 +{
35394 + assert("edward-942", tc != NULL);
35395 + free_coa_set(tc);
35396 + free_tfm_unit(tc);
35397 +}
35398 +
35399 +static inline int tfm_cluster_is_uptodate(tfm_cluster_t * tc)
35400 +{
35401 + assert("edward-943", tc != NULL);
35402 + assert("edward-944", tc->uptodate == 0 || tc->uptodate == 1);
35403 + return (tc->uptodate == 1);
35404 +}
35405 +
35406 +static inline void tfm_cluster_set_uptodate(tfm_cluster_t * tc)
35407 +{
35408 + assert("edward-945", tc != NULL);
35409 + assert("edward-946", tc->uptodate == 0 || tc->uptodate == 1);
35410 + tc->uptodate = 1;
35411 + return;
35412 +}
35413 +
35414 +static inline void tfm_cluster_clr_uptodate(tfm_cluster_t * tc)
35415 +{
35416 + assert("edward-947", tc != NULL);
35417 + assert("edward-948", tc->uptodate == 0 || tc->uptodate == 1);
35418 + tc->uptodate = 0;
35419 + return;
35420 +}
35421 +
35422 +static inline int tfm_stream_is_set(tfm_cluster_t * tc, tfm_stream_id id)
35423 +{
35424 + return (tfm_stream(tc, id) &&
35425 + tfm_stream_data(tc, id) && tfm_stream_size(tc, id));
35426 +}
35427 +
35428 +static inline int tfm_cluster_is_set(tfm_cluster_t * tc)
35429 +{
35430 + int i;
35431 + for (i = 0; i < LAST_STREAM; i++)
35432 + if (!tfm_stream_is_set(tc, i))
35433 + return 0;
35434 + return 1;
35435 +}
35436 +
35437 +static inline void alternate_streams(tfm_cluster_t * tc)
35438 +{
35439 + tfm_stream_t *tmp = tfm_stream(tc, INPUT_STREAM);
35440 +
35441 + set_tfm_stream(tc, INPUT_STREAM, tfm_stream(tc, OUTPUT_STREAM));
35442 + set_tfm_stream(tc, OUTPUT_STREAM, tmp);
35443 +}
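+
+/* A minimal usage sketch of the swap above (illustrative only and compiled
+   out; do_compress() and do_encrypt() are hypothetical stand-ins for the
+   attached transform plugins): */
+#if 0
+static void chain_two_transforms(tfm_cluster_t * tc)
+{
+	/* stage 1 reads INPUT_STREAM and fills OUTPUT_STREAM */
+	do_compress(tfm_stream_data(tc, INPUT_STREAM),
+		    tfm_stream_data(tc, OUTPUT_STREAM));
+	/* swap the buffers so stage 2 reads the compressed result
+	   as its input without copying */
+	alternate_streams(tc);
+	do_encrypt(tfm_stream_data(tc, INPUT_STREAM),
+		   tfm_stream_data(tc, OUTPUT_STREAM));
+}
+#endif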
35444 +
35445 +/* a kind of data that we can write to the window */
35446 +typedef enum {
35447 +	DATA_WINDOW,		/* the data we copy from user space */
35448 +	HOLE_WINDOW		/* zeroes if we write a hole */
35449 +} window_stat;
35450 +
35451 +/* Sliding window of cluster size which should be set to the appropriate
35452 +   position (defined by cluster index) in a file before page cluster
35453 +   modification by file_write. Then we translate the file size, the offset
35454 +   to write from, the number of bytes to write, etc. into the following
35455 +   configuration needed to estimate the number of pages to read before write.
35456 +*/
35457 +typedef struct reiser4_slide {
35458 + unsigned off; /* offset we start to write/truncate from */
35459 + unsigned count; /* number of bytes (zeroes) to write/truncate */
35460 + unsigned delta; /* number of bytes to append to the hole */
35461 + window_stat stat; /* a kind of data to write to the window */
35462 +} reiser4_slide_t;
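+
+/* Worked example (illustrative only, 64 KiB clusters): appending a hole at
+   i_size == 70000 yields off == off_to_cloff(70000) == 4464, so up to
+   count == 65536 - 4464 == 61072 zeroes (capped by the hole size) are
+   written into the current cluster with stat == HOLE_WINDOW, as done in
+   cryptcompress_append_hole(). */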
35463 +
35464 +/* The following is a set of possible disk cluster states */
35465 +typedef enum {
35466 + INVAL_DISK_CLUSTER, /* unknown state */
35467 + PREP_DISK_CLUSTER, /* disk cluster got converted by flush
35468 + at least 1 time */
35469 + UNPR_DISK_CLUSTER, /* disk cluster just created and should be
35470 + converted by flush */
35471 + FAKE_DISK_CLUSTER /* disk cluster doesn't exist neither in memory
35472 + nor on disk */
35473 +} disk_cluster_stat;
35474 +
35475 +/*
35476 +   While implementing all transforms (from page to disk cluster, and back)
35477 +   the reiser4 cluster manager fills the following structure encapsulating
35478 +   pointers to all the clusters for the same index, including the sliding window above
35479 +*/
35480 +typedef struct reiser4_cluster {
35481 + tfm_cluster_t tc; /* transform cluster */
35482 + int nr_pages; /* number of pages */
35483 + struct page **pages; /* page cluster */
35484 + page_cluster_op op; /* page cluster operation */
35485 + struct file *file;
35486 + hint_t *hint; /* disk cluster item for traversal */
35487 + disk_cluster_stat dstat; /* state of the current disk cluster */
35488 + cloff_t index; /* offset in the units of cluster size */
35489 + reiser4_slide_t *win; /* sliding window of cluster size */
35490 + int reserved; /* this indicates that space for disk
35491 + cluster modification is reserved */
35492 +#if REISER4_DEBUG
35493 + reiser4_context *ctx;
35494 + int reserved_prepped;
35495 + int reserved_unprepped;
35496 +#endif
35497 +
35498 +} reiser4_cluster_t;
35499 +
35500 +static inline __u8 * tfm_input_data (reiser4_cluster_t * clust)
35501 +{
35502 + return tfm_stream_data(&clust->tc, INPUT_STREAM);
35503 +}
35504 +
35505 +static inline __u8 * tfm_output_data (reiser4_cluster_t * clust)
35506 +{
35507 + return tfm_stream_data(&clust->tc, OUTPUT_STREAM);
35508 +}
35509 +
35510 +static inline int reset_cluster_pgset(reiser4_cluster_t * clust, int nrpages)
35511 +{
35512 + assert("edward-1057", clust->pages != NULL);
35513 + memset(clust->pages, 0, sizeof(*clust->pages) * nrpages);
35514 + return 0;
35515 +}
35516 +
35517 +static inline int alloc_cluster_pgset(reiser4_cluster_t * clust, int nrpages)
35518 +{
35519 + assert("edward-949", clust != NULL);
35520 + assert("edward-1362", clust->pages == NULL);
35521 + assert("edward-950", nrpages != 0 && nrpages <= MAX_CLUSTER_NRPAGES);
35522 +
35523 + clust->pages =
35524 + kmalloc(sizeof(*clust->pages) * nrpages, GFP_KERNEL);
35525 + if (!clust->pages)
35526 + return RETERR(-ENOMEM);
35527 + reset_cluster_pgset(clust, nrpages);
35528 + return 0;
35529 +}
35530 +
35531 +static inline void free_cluster_pgset(reiser4_cluster_t * clust)
35532 +{
35533 + assert("edward-951", clust->pages != NULL);
35534 + kfree(clust->pages);
35535 + clust->pages = NULL;
35536 +}
35537 +
35538 +static inline void put_cluster_handle(reiser4_cluster_t * clust)
35539 +{
35540 + assert("edward-435", clust != NULL);
35541 +
35542 + put_tfm_cluster(&clust->tc);
35543 + if (clust->pages)
35544 + free_cluster_pgset(clust);
35545 + memset(clust, 0, sizeof *clust);
35546 +}
35547 +
35548 +static inline void inc_keyload_count(crypto_stat_t * data)
35549 +{
35550 + assert("edward-1410", data != NULL);
35551 + data->keyload_count++;
35552 +}
35553 +
35554 +static inline void dec_keyload_count(crypto_stat_t * data)
35555 +{
35556 + assert("edward-1411", data != NULL);
35557 + assert("edward-1412", data->keyload_count > 0);
35558 + data->keyload_count--;
35559 +}
35560 +
35561 +/* cryptcompress specific part of reiser4_inode */
35562 +typedef struct cryptcompress_info {
35563 + struct rw_semaphore lock;
35564 + crypto_stat_t *crypt;
35565 + int compress_toggle; /* current status of compressibility
35566 + is set by compression mode plugin */
35567 +#if REISER4_DEBUG
35568 + int pgcount; /* number of captured pages */
35569 +#endif
35570 +} cryptcompress_info_t;
35571 +
35572 +
35573 +static inline void toggle_compression (cryptcompress_info_t * info, int val)
35574 +{
35575 + info->compress_toggle = val;
35576 +}
35577 +
35578 +static inline int compression_is_on (cryptcompress_info_t * info)
35579 +{
35580 + return info->compress_toggle;
35581 +}
35582 +
35583 +cryptcompress_info_t *cryptcompress_inode_data(const struct inode *);
35584 +int equal_to_rdk(znode *, const reiser4_key *);
35585 +int goto_right_neighbor(coord_t *, lock_handle *);
35586 +int load_file_hint(struct file *, hint_t *);
35587 +void save_file_hint(struct file *, const hint_t *);
35588 +void hint_init_zero(hint_t *);
35589 +int crc_inode_ok(struct inode *inode);
35590 +int jnode_of_cluster(const jnode * node, struct page * page);
35591 +extern int ctail_read_disk_cluster (reiser4_cluster_t *, struct inode *, int);
35592 +extern int do_readpage_ctail(struct inode *, reiser4_cluster_t *,
35593 + struct page * page);
35594 +extern int ctail_insert_unprepped_cluster(reiser4_cluster_t * clust,
35595 + struct inode * inode);
35596 +int bind_cryptcompress(struct inode *child, struct inode *parent);
35597 +void destroy_inode_cryptcompress(struct inode * inode);
35598 +crypto_stat_t * inode_crypto_stat (struct inode * inode);
35599 +void inherit_crypto_stat_common(struct inode * parent, struct inode * object,
35600 + int (*can_inherit)(struct inode * child,
35601 + struct inode * parent));
35602 +void attach_crypto_stat(struct inode * inode, crypto_stat_t * info);
35603 +void detach_crypto_stat(struct inode * inode);
35604 +void change_crypto_stat(struct inode * inode, crypto_stat_t * new);
35605 +crypto_stat_t * alloc_crypto_stat (struct inode * inode);
35606 +
35607 +
35608 +static inline reiser4_tfma_t *
35609 +info_get_tfma (crypto_stat_t * info, reiser4_tfm id)
35610 +{
35611 + return &info->tfma[id];
35612 +}
35613 +
35614 +static inline struct crypto_tfm *
35615 +info_get_tfm (crypto_stat_t * info, reiser4_tfm id)
35616 +{
35617 + return info_get_tfma(info, id)->tfm;
35618 +}
35619 +
35620 +static inline void
35621 +info_set_tfm (crypto_stat_t * info, reiser4_tfm id, struct crypto_tfm * tfm)
35622 +{
35623 + info_get_tfma(info, id)->tfm = tfm;
35624 +}
35625 +
35626 +static inline struct crypto_tfm *
35627 +info_cipher_tfm (crypto_stat_t * info)
35628 +{
35629 + return info_get_tfm(info, CIPHER_TFM);
35630 +}
35631 +
35632 +static inline struct crypto_tfm *
35633 +info_digest_tfm (crypto_stat_t * info)
35634 +{
35635 + return info_get_tfm(info, DIGEST_TFM);
35636 +}
35637 +
35638 +static inline cipher_plugin *
35639 +info_cipher_plugin (crypto_stat_t * info)
35640 +{
35641 + return &info_get_tfma(info, CIPHER_TFM)->plug->cipher;
35642 +}
35643 +
35644 +static inline digest_plugin *
35645 +info_digest_plugin (crypto_stat_t * info)
35646 +{
35647 + return &info_get_tfma(info, DIGEST_TFM)->plug->digest;
35648 +}
35649 +
35650 +static inline void
35651 +info_set_plugin(crypto_stat_t * info, reiser4_tfm id, reiser4_plugin * plugin)
35652 +{
35653 + info_get_tfma(info, id)->plug = plugin;
35654 +}
35655 +
35656 +static inline void
35657 +info_set_cipher_plugin(crypto_stat_t * info, cipher_plugin * cplug)
35658 +{
35659 + info_set_plugin(info, CIPHER_TFM, cipher_plugin_to_plugin(cplug));
35660 +}
35661 +
35662 +static inline void
35663 +info_set_digest_plugin(crypto_stat_t * info, digest_plugin * plug)
35664 +{
35665 + info_set_plugin(info, DIGEST_TFM, digest_plugin_to_plugin(plug));
35666 +}
35667 +
35668 +#endif /* __FS_REISER4_CRYPTCOMPRESS_H__ */
35669 +
35670 +/* Make Linus happy.
35671 + Local variables:
35672 + c-indentation-style: "K&R"
35673 + mode-name: "LC"
35674 + c-basic-offset: 8
35675 + tab-width: 8
35676 + fill-column: 120
35677 + scroll-step: 1
35678 + End:
35679 +*/
35680 Index: linux-2.6.16/fs/reiser4/plugin/file/file.c
35681 ===================================================================
35682 --- /dev/null
35683 +++ linux-2.6.16/fs/reiser4/plugin/file/file.c
35684 @@ -0,0 +1,2712 @@
35685 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
35686 + * reiser4/README */
35687 +
35688 +/*
35689 + * this file contains implementations of inode/file/address_space/file plugin
35690 + * operations specific for "unix file plugin" (plugin id is
35691 + * UNIX_FILE_PLUGIN_ID). "Unix file" is either built of tail items only
35692 + * (FORMATTING_ID) or of extent items only (EXTENT_POINTER_ID) or empty (have
35693 + * no items but stat data)
35694 + */
35695 +
35696 +#include "../../inode.h"
35697 +#include "../../super.h"
35698 +#include "../../tree_walk.h"
35699 +#include "../../carry.h"
35700 +#include "../../page_cache.h"
35701 +#include "../../ioctl.h"
35702 +#include "../object.h"
35703 +#include "../../safe_link.h"
35704 +
35705 +#include <linux/writeback.h>
35706 +#include <linux/pagevec.h>
35707 +#include <linux/syscalls.h>
35708 +
35709 +
35710 +static int unpack(struct file *file, struct inode *inode, int forever);
35711 +
35712 +/* get unix file plugin specific portion of inode */
35713 +unix_file_info_t *unix_file_inode_data(const struct inode *inode)
35714 +{
35715 + return &reiser4_inode_data(inode)->file_plugin_data.unix_file_info;
35716 +}
35717 +
35718 +/**
35719 + * equal_to_rdk - compare key and znode's right delimiting key
35720 + * @node: node whose right delimiting key to compare with @key
35721 + * @key: key to compare with @node's right delimiting key
35722 + *
35723 + * Returns true if @key is equal to right delimiting key of @node.
35724 + */
35725 +int equal_to_rdk(znode *node, const reiser4_key *key)
35726 +{
35727 + int result;
35728 +
35729 + read_lock_dk(znode_get_tree(node));
35730 + result = keyeq(key, znode_get_rd_key(node));
35731 + read_unlock_dk(znode_get_tree(node));
35732 + return result;
35733 +}
35734 +
35735 +#if REISER4_DEBUG
35736 +
35737 +/**
35738 + * equal_to_ldk - compare key and znode's left delimiting key
35739 + * @node: node whose left delimiting key to compare with @key
35740 + * @key: key to compare with @node's left delimiting key
35741 + *
35742 + * Returns true if @key is equal to left delimiting key of @node.
35743 + */
35744 +int equal_to_ldk(znode *node, const reiser4_key *key)
35745 +{
35746 + int result;
35747 +
35748 + read_lock_dk(znode_get_tree(node));
35749 + result = keyeq(key, znode_get_ld_key(node));
35750 + read_unlock_dk(znode_get_tree(node));
35751 + return result;
35752 +}
35753 +
35754 +/**
35755 + * check_coord - check whether coord corresponds to key
35756 + * @coord: coord to check
35757 + * @key: key @coord has to correspond to
35758 + *
35759 + * Returns true if @coord is set as if it was set as result of lookup with @key
35760 + * in coord->node.
35761 + */
35762 +static int check_coord(const coord_t *coord, const reiser4_key *key)
35763 +{
35764 + coord_t twin;
35765 +
35766 + node_plugin_by_node(coord->node)->lookup(coord->node, key,
35767 + FIND_MAX_NOT_MORE_THAN, &twin);
35768 + return coords_equal(coord, &twin);
35769 +}
35770 +
35771 +#endif /* REISER4_DEBUG */
35772 +
35773 +/**
35774 + * init_uf_coord - initialize extended coord
35775 + * @uf_coord:
35776 + * @lh:
35777 + *
35778 + *
35779 + */
35780 +void init_uf_coord(uf_coord_t *uf_coord, lock_handle *lh)
35781 +{
35782 + coord_init_zero(&uf_coord->coord);
35783 + coord_clear_iplug(&uf_coord->coord);
35784 + uf_coord->lh = lh;
35785 + init_lh(lh);
35786 + memset(&uf_coord->extension, 0, sizeof(uf_coord->extension));
35787 + uf_coord->valid = 0;
35788 +}
35789 +
35790 +void validate_extended_coord(uf_coord_t *uf_coord, loff_t offset)
35791 +{
35792 + assert("vs-1333", uf_coord->valid == 0);
35793 +
35794 + if (coord_is_between_items(&uf_coord->coord))
35795 + return;
35796 +
35797 + assert("vs-1348",
35798 + item_plugin_by_coord(&uf_coord->coord)->s.file.
35799 + init_coord_extension);
35800 +
35801 + item_body_by_coord(&uf_coord->coord);
35802 + item_plugin_by_coord(&uf_coord->coord)->s.file.
35803 + init_coord_extension(uf_coord, offset);
35804 +}
35805 +
35806 +/**
35807 + * goto_right_neighbor - lock right neighbor, drop current node lock
35808 + * @coord:
35809 + * @lh:
35810 + *
35811 + * Obtain lock on right neighbor and drop lock on current node.
35812 + */
35813 +int goto_right_neighbor(coord_t *coord, lock_handle *lh)
35814 +{
35815 + int result;
35816 + lock_handle lh_right;
35817 +
35818 + assert("vs-1100", znode_is_locked(coord->node));
35819 +
35820 + init_lh(&lh_right);
35821 + result = reiser4_get_right_neighbor(&lh_right, coord->node,
35822 + znode_is_wlocked(coord->node) ?
35823 + ZNODE_WRITE_LOCK : ZNODE_READ_LOCK,
35824 + GN_CAN_USE_UPPER_LEVELS);
35825 + if (result) {
35826 + done_lh(&lh_right);
35827 + return result;
35828 + }
35829 +
35830 + /*
35831 + * we hold two longterm locks on neighboring nodes. Unlock left of
35832 + * them
35833 + */
35834 + done_lh(lh);
35835 +
35836 + coord_init_first_unit_nocheck(coord, lh_right.node);
35837 + move_lh(lh, &lh_right);
35838 +
35839 + return 0;
35840 +
35841 +}
35842 +
35843 +/**
35844 + * set_file_state
35845 + * @uf_info:
35846 + * @cbk_result:
35847 + * @level:
35848 + *
35849 + * This is used by find_file_item and find_file_state to
35850 + * determine the real state of the file
35851 + */
35852 +static void set_file_state(unix_file_info_t *uf_info, int cbk_result,
35853 + tree_level level)
35854 +{
35855 + if (cbk_errored(cbk_result))
35856 + /* error happened in find_file_item */
35857 + return;
35858 +
35859 + assert("vs-1164", level == LEAF_LEVEL || level == TWIG_LEVEL);
35860 +
35861 + if (uf_info->container == UF_CONTAINER_UNKNOWN) {
35862 + /*
35863 + * container is unknown, therefore conversion can not be in
35864 + * progress
35865 + */
35866 + assert("", !inode_get_flag(unix_file_info_to_inode(uf_info),
35867 + REISER4_PART_IN_CONV));
35868 + if (cbk_result == CBK_COORD_NOTFOUND)
35869 + uf_info->container = UF_CONTAINER_EMPTY;
35870 + else if (level == LEAF_LEVEL)
35871 + uf_info->container = UF_CONTAINER_TAILS;
35872 + else
35873 + uf_info->container = UF_CONTAINER_EXTENTS;
35874 + } else {
35875 + /*
35876 + * file state is known, check whether it is set correctly if
35877 + * file is not being tail converted
35878 + */
35879 + if (!inode_get_flag(unix_file_info_to_inode(uf_info),
35880 + REISER4_PART_IN_CONV)) {
35881 + assert("vs-1162",
35882 + ergo(level == LEAF_LEVEL &&
35883 + cbk_result == CBK_COORD_FOUND,
35884 + uf_info->container == UF_CONTAINER_TAILS));
35885 + assert("vs-1165",
35886 + ergo(level == TWIG_LEVEL &&
35887 + cbk_result == CBK_COORD_FOUND,
35888 + uf_info->container == UF_CONTAINER_EXTENTS));
35889 + }
35890 + }
35891 +}
35892 +
35893 +int find_file_item_nohint(coord_t *coord, lock_handle *lh,
35894 + const reiser4_key *key, znode_lock_mode lock_mode,
35895 + struct inode *inode)
35896 +{
35897 + return object_lookup(inode, key, coord, lh, lock_mode,
35898 + FIND_MAX_NOT_MORE_THAN,
35899 + TWIG_LEVEL, LEAF_LEVEL,
35900 + (lock_mode == ZNODE_READ_LOCK) ? CBK_UNIQUE :
35901 + (CBK_UNIQUE | CBK_FOR_INSERT),
35902 + NULL /* ra_info */ );
35903 +}
35904 +
35905 +/**
35906 + * find_file_item - look for file item in the tree
35907 + * @hint: provides coordinate, lock handle, seal
35908 + * @key: key for search
35909 + * @mode: mode of lock to put on returned node
35910 + * @ra_info:
35911 + * @inode:
35912 + *
35913 + * This finds position in the tree corresponding to @key. It first tries to use
35914 + * @hint's seal if it is set.
35915 + */
35916 +int find_file_item(hint_t *hint, const reiser4_key *key,
35917 + znode_lock_mode lock_mode,
35918 + struct inode *inode)
35919 +{
35920 + int result;
35921 + coord_t *coord;
35922 + lock_handle *lh;
35923 +
35924 + assert("nikita-3030", schedulable());
35925 + assert("vs-1707", hint != NULL);
35926 + assert("vs-47", inode != NULL);
35927 +
35928 + coord = &hint->ext_coord.coord;
35929 + lh = hint->ext_coord.lh;
35930 + init_lh(lh);
35931 +
35932 + result = hint_validate(hint, key, 1 /* check key */, lock_mode);
35933 + if (!result) {
35934 + if (coord->between == AFTER_UNIT &&
35935 + equal_to_rdk(coord->node, key)) {
35936 + result = goto_right_neighbor(coord, lh);
35937 + if (result == -E_NO_NEIGHBOR)
35938 + return RETERR(-EIO);
35939 + if (result)
35940 + return result;
35941 + assert("vs-1152", equal_to_ldk(coord->node, key));
35942 + /*
35943 + * we moved to different node. Invalidate coord
35944 + * extension, zload is necessary to init it again
35945 + */
35946 + hint->ext_coord.valid = 0;
35947 + }
35948 +
35949 + set_file_state(unix_file_inode_data(inode), CBK_COORD_FOUND,
35950 + znode_get_level(coord->node));
35951 +
35952 + return CBK_COORD_FOUND;
35953 + }
35954 +
35955 + coord_init_zero(coord);
35956 + result = find_file_item_nohint(coord, lh, key, lock_mode, inode);
35957 + set_file_state(unix_file_inode_data(inode), result,
35958 + znode_get_level(coord->node));
35959 +
35960 + /* FIXME: we might already have coord extension initialized */
35961 + hint->ext_coord.valid = 0;
35962 + return result;
35963 +}
35964 +
35965 +/* plugin->u.file.write_flow = NULL
35966 + plugin->u.file.read_flow = NULL */
35967 +
35968 +void hint_init_zero(hint_t * hint)
35969 +{
35970 + memset(hint, 0, sizeof(*hint));
35971 + init_lh(&hint->lh);
35972 + hint->ext_coord.lh = &hint->lh;
35973 +}
35974 +
35975 +static int find_file_state(struct inode *inode, unix_file_info_t *uf_info)
35976 +{
35977 + int result;
35978 + reiser4_key key;
35979 + coord_t coord;
35980 + lock_handle lh;
35981 +
35982 + assert("vs-1628", ea_obtained(uf_info));
35983 +
35984 + if (uf_info->container == UF_CONTAINER_UNKNOWN) {
35985 + key_by_inode_and_offset_common(inode, 0, &key);
35986 + init_lh(&lh);
35987 + result = find_file_item_nohint(&coord, &lh, &key,
35988 + ZNODE_READ_LOCK, inode);
35989 + set_file_state(uf_info, result, znode_get_level(coord.node));
35990 + done_lh(&lh);
35991 + if (!cbk_errored(result))
35992 + result = 0;
35993 + } else
35994 + result = 0;
35995 + assert("vs-1074",
35996 + ergo(result == 0, uf_info->container != UF_CONTAINER_UNKNOWN));
35997 + txn_restart_current();
35998 + return result;
35999 +}
36000 +
36001 +/* estimate and reserve space needed to truncate a page which gets partially truncated: one block for the page
36002 +   itself, a stat data update (estimate_one_insert_into_item) and one item insertion (estimate_one_insert_into_item)
36003 +   which may happen if the page corresponds to a hole extent and an unallocated one has to be created */
36004 +static int reserve_partial_page(reiser4_tree * tree)
36005 +{
36006 + grab_space_enable();
36007 + return reiser4_grab_reserved(reiser4_get_current_sb(),
36008 + 1 +
36009 + 2 * estimate_one_insert_into_item(tree),
36010 + BA_CAN_COMMIT);
36011 +}
36012 +
36013 +/* estimate and reserve space needed to cut one item and update one stat data */
36014 +static int reserve_cut_iteration(reiser4_tree * tree)
36015 +{
36016 + __u64 estimate = estimate_one_item_removal(tree)
36017 + + estimate_one_insert_into_item(tree);
36018 +
36019 + assert("nikita-3172", lock_stack_isclean(get_current_lock_stack()));
36020 +
36021 + grab_space_enable();
36022 + /* We need to double our estimate now that we can delete more than one
36023 + node. */
36024 + return reiser4_grab_reserved(reiser4_get_current_sb(), estimate * 2,
36025 + BA_CAN_COMMIT);
36026 +}
36027 +
36028 +int update_file_size(struct inode *inode, reiser4_key * key, int update_sd)
36029 +{
36030 + int result = 0;
36031 +
36032 + INODE_SET_FIELD(inode, i_size, get_key_offset(key));
36033 + if (update_sd) {
36034 + inode->i_ctime = inode->i_mtime = CURRENT_TIME;
36035 + result = reiser4_update_sd(inode);
36036 + }
36037 + return result;
36038 +}
36039 +
36040 +/* cut file items one by one starting from the last one until new file size (inode->i_size) is reached. Reserve space
36041 + and update file stat data on every single cut from the tree */
36042 +int
36043 +cut_file_items(struct inode *inode, loff_t new_size, int update_sd,
36044 + loff_t cur_size, int (*update_actor) (struct inode *,
36045 + reiser4_key *, int))
36046 +{
36047 + reiser4_key from_key, to_key;
36048 + reiser4_key smallest_removed;
36049 + file_plugin *fplug = inode_file_plugin(inode);
36050 + int result;
36051 + int progress = 0;
36052 +
36053 + assert("vs-1248",
36054 + fplug == file_plugin_by_id(UNIX_FILE_PLUGIN_ID) ||
36055 + fplug == file_plugin_by_id(CRC_FILE_PLUGIN_ID));
36056 +
36057 + fplug->key_by_inode(inode, new_size, &from_key);
36058 + to_key = from_key;
36059 + set_key_offset(&to_key, cur_size - 1 /*get_key_offset(max_key()) */ );
36060 + /* this loop normally runs just once */
36061 + while (1) {
36062 + result = reserve_cut_iteration(tree_by_inode(inode));
36063 + if (result)
36064 + break;
36065 +
36066 + result = cut_tree_object(current_tree, &from_key, &to_key,
36067 + &smallest_removed, inode, 1,
36068 + &progress);
36069 + if (result == -E_REPEAT) {
36070 + /* -E_REPEAT is a signal to interrupt a long file truncation process */
36071 + if (progress) {
36072 + result =
36073 + update_actor(inode, &smallest_removed,
36074 + update_sd);
36075 + if (result)
36076 + break;
36077 + }
36078 +
36079 +			/* the below does up(sbinfo->delete_sema). Do not get fooled */
36080 + reiser4_release_reserved(inode->i_sb);
36081 +
36082 + /* cut_tree_object() was interrupted probably because
36083 + * current atom requires commit, we have to release
36084 + * transaction handle to allow atom commit. */
36085 + txn_restart_current();
36086 + continue;
36087 + }
36088 + if (result
36089 + && !(result == CBK_COORD_NOTFOUND && new_size == 0
36090 + && inode->i_size == 0))
36091 + break;
36092 +
36093 + set_key_offset(&smallest_removed, new_size);
36094 + /* Final sd update after the file gets its correct size */
36095 + result = update_actor(inode, &smallest_removed, update_sd);
36096 + break;
36097 + }
36098 +
36099 +	/* the below does up(sbinfo->delete_sema). Do not get fooled */
36100 + reiser4_release_reserved(inode->i_sb);
36101 +
36102 + return result;
36103 +}
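+
+/* Control-flow sketch of the loop above (illustrative only): a truncate too
+   large for a single transaction proceeds as
+
+	reserve -> cut_tree_object() == -E_REPEAT
+		-> update sd to smallest_removed, restart transaction, loop;
+	reserve -> ... -> cut_tree_object() == 0
+		-> final sd update with the key offset set to new_size;
+
+   so the recorded file size shrinks in steps across atom commits rather
+   than in one long-lived atom. */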
36104 +
36105 +int find_or_create_extent(struct page *page);
36106 +
36107 +static int filler(void *vp, struct page *page)
36108 +{
36109 + return readpage_unix_file_nolock(vp, page);
36110 +}
36111 +
36112 +/* part of truncate_file_body: it is called when truncate is used to make the
36113 +   file shorter */
36114 +static int shorten_file(struct inode *inode, loff_t new_size)
36115 +{
36116 + int result;
36117 + struct page *page;
36118 + int padd_from;
36119 + unsigned long index;
36120 + char *kaddr;
36121 + unix_file_info_t *uf_info;
36122 +
36123 + /*
36124 + * all items of ordinary reiser4 file are grouped together. That is why
36125 +	 * we can use cut_tree. Plan B files (for instance) cannot be
36126 + * truncated that simply
36127 + */
36128 + result = cut_file_items(inode, new_size, 1 /*update_sd */ ,
36129 + get_key_offset(max_key()), update_file_size);
36130 + if (result)
36131 + return result;
36132 +
36133 + uf_info = unix_file_inode_data(inode);
36134 + assert("vs-1105", new_size == inode->i_size);
36135 + if (new_size == 0) {
36136 + uf_info->container = UF_CONTAINER_EMPTY;
36137 + return 0;
36138 + }
36139 +
36140 + result = find_file_state(inode, uf_info);
36141 + if (result)
36142 + return result;
36143 + if (uf_info->container == UF_CONTAINER_TAILS)
36144 + /*
36145 + * No need to worry about zeroing last page after new file
36146 + * end
36147 + */
36148 + return 0;
36149 +
36150 + padd_from = inode->i_size & (PAGE_CACHE_SIZE - 1);
36151 + if (!padd_from)
36152 + /* file is truncated to page boundary */
36153 + return 0;
36154 +
36155 + result = reserve_partial_page(tree_by_inode(inode));
36156 + if (result) {
36157 + reiser4_release_reserved(inode->i_sb);
36158 + return result;
36159 + }
36160 +
36161 + /* last page is partially truncated - zero its content */
36162 + index = (inode->i_size >> PAGE_CACHE_SHIFT);
36163 + page = read_cache_page(inode->i_mapping, index, filler, NULL);
36164 + if (IS_ERR(page)) {
36165 + /*
36166 + * the below does up(sbinfo->delete_sema). Do not get
36167 + * confused
36168 + */
36169 + reiser4_release_reserved(inode->i_sb);
36170 + if (likely(PTR_ERR(page) == -EINVAL)) {
36171 + /* looks like file is built of tail items */
36172 + return 0;
36173 + }
36174 + return PTR_ERR(page);
36175 + }
36176 + wait_on_page_locked(page);
36177 + if (!PageUptodate(page)) {
36178 + page_cache_release(page);
36179 + /*
36180 + * the below does up(sbinfo->delete_sema). Do not get
36181 + * confused
36182 + */
36183 + reiser4_release_reserved(inode->i_sb);
36184 + return RETERR(-EIO);
36185 + }
36186 +
36187 + /*
36188 +	 * if the page corresponds to a hole extent unit, an unallocated one
36189 +	 * will be created here. This is not necessary
36190 + */
36191 + result = find_or_create_extent(page);
36192 +
36193 + /*
36194 + * FIXME: cut_file_items has already updated inode. Probably it would
36195 + * be better to update it here when file is really truncated
36196 + */
36197 + if (result) {
36198 + page_cache_release(page);
36199 + /*
36200 + * the below does up(sbinfo->delete_sema). Do not get
36201 + * confused
36202 + */
36203 + reiser4_release_reserved(inode->i_sb);
36204 + return result;
36205 + }
36206 +
36207 + lock_page(page);
36208 + assert("vs-1066", PageLocked(page));
36209 + kaddr = kmap_atomic(page, KM_USER0);
36210 + memset(kaddr + padd_from, 0, PAGE_CACHE_SIZE - padd_from);
36211 + flush_dcache_page(page);
36212 + kunmap_atomic(kaddr, KM_USER0);
36213 + unlock_page(page);
36214 + page_cache_release(page);
36215 + /* the below does up(sbinfo->delete_sema). Do not get confused */
36216 + reiser4_release_reserved(inode->i_sb);
36217 + return 0;
36218 +}
36219 +
36220 +/**
36221 + * should_have_notail
36222 + * @uf_info:
36223 + * @new_size:
36224 + *
36225 + * Calls formatting plugin to see whether file of size @new_size has to be
36226 + * stored in unformatted nodes or in tail items. 0 is returned for later case.
36227 + */
36228 +static int should_have_notail(const unix_file_info_t *uf_info, loff_t new_size)
36229 +{
36230 + if (!uf_info->tplug)
36231 + return 1;
36232 + return !uf_info->tplug->have_tail(unix_file_info_to_inode(uf_info),
36233 + new_size);
36234 +
36235 +}
36236 +
36237 +/**
36238 + * truncate_file_body - change length of file
36239 + * @inode: inode of file
36240 + * @new_size: new file length
36241 + *
36242 + * Adjusts items file @inode is built of to match @new_size. It may either cut
36243 + * items or add them to represent a hole at the end of file. The caller has to
36244 + * obtain exclusive access to the file.
36245 + */
36246 +static int truncate_file_body(struct inode *inode, loff_t new_size)
36247 +{
36248 + int result;
36249 +
36250 + if (inode->i_size < new_size) {
36251 + /* expanding truncate */
36252 + struct dentry dentry;
36253 + struct file file;
36254 + unix_file_info_t *uf_info;
36255 +
36256 + dentry.d_inode = inode;
36257 + file.f_dentry = &dentry;
36258 +		file.private_data = NULL;
36259 +		file.f_pos = new_size;
36261 + uf_info = unix_file_inode_data(inode);
36262 + result = find_file_state(inode, uf_info);
36263 + if (result)
36264 + return result;
36265 +
36266 + if (should_have_notail(uf_info, new_size)) {
36267 + /*
36268 + * file of size @new_size has to be built of
36269 + * extents. If it is built of tails - convert to
36270 + * extents
36271 + */
36272 + if (uf_info->container == UF_CONTAINER_TAILS) {
36273 + /*
36274 +				 * if file is being converted by another process
36275 + * - wait until it completes
36276 + */
36277 + while (1) {
36278 + if (inode_get_flag(inode, REISER4_PART_IN_CONV)) {
36279 + drop_exclusive_access(uf_info);
36280 + schedule();
36281 + get_exclusive_access(uf_info);
36282 + continue;
36283 + }
36284 + break;
36285 + }
36286 +
36287 + if (uf_info->container == UF_CONTAINER_TAILS) {
36288 + result = tail2extent(uf_info);
36289 + if (result)
36290 + return result;
36291 + }
36292 + }
36293 + result = write_extent(&file, NULL, 0, &new_size);
36294 + if (result)
36295 + return result;
36296 + uf_info->container = UF_CONTAINER_EXTENTS;
36297 + } else {
36298 + if (uf_info->container == UF_CONTAINER_EXTENTS) {
36299 + result = write_extent(&file, NULL, 0, &new_size);
36300 + if (result)
36301 + return result;
36302 + } else {
36303 + result = write_tail(&file, NULL, 0, &new_size);
36304 + if (result)
36305 + return result;
36306 + uf_info->container = UF_CONTAINER_TAILS;
36307 + }
36308 + }
36309 + BUG_ON(result > 0);
36310 + INODE_SET_FIELD(inode, i_size, new_size);
36311 + file_update_time(&file);
36312 + result = reiser4_update_sd(inode);
36313 + BUG_ON(result != 0);
36314 + reiser4_free_file_fsdata(&file);
36315 + } else
36316 + result = shorten_file(inode, new_size);
36317 + return result;
36318 +}
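+
+/*
+ * Condensed view of the dispatch above (a reading of the code, for
+ * orientation only):
+ *
+ *	new_size > i_size: expanding truncate. Make the container match what
+ *	the formatting plugin wants for @new_size (tail2extent first if
+ *	needed), write zero bytes at @new_size so the hole is represented,
+ *	then update i_size and stat data.
+ *
+ *	new_size <= i_size: shorten_file() cuts items and zeroes the tail of
+ *	the last remaining page.
+ */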
36319 +
36320 +/* plugin->u.write_sd_by_inode = write_sd_by_inode_common */
36321 +
36322 +/**
36323 + * load_file_hint - copy hint from struct file to local variable
36324 + * @file: file to get hint from
36325 + * @hint: structure to fill
36326 + *
36327 + * Reiser4 specific portion of struct file may contain information (hint)
36328 + * stored on exiting from previous read or write. That information includes
36329 + * seal of znode and coord within that znode where previous read or write
36330 + * stopped. This function copies that information to @hint if it was stored or
36331 + * initializes @hint by 0s otherwise.
36332 + */
36333 +int load_file_hint(struct file *file, hint_t *hint)
36334 +{
36335 + reiser4_file_fsdata *fsdata;
36336 +
36337 + if (file) {
36338 + fsdata = reiser4_get_file_fsdata(file);
36339 + if (IS_ERR(fsdata))
36340 + return PTR_ERR(fsdata);
36341 +
36342 + spin_lock_inode(file->f_dentry->d_inode);
36343 + if (seal_is_set(&fsdata->reg.hint.seal)) {
36344 + *hint = fsdata->reg.hint;
36345 + init_lh(&hint->lh);
36346 + hint->ext_coord.lh = &hint->lh;
36347 + spin_unlock_inode(file->f_dentry->d_inode);
36348 + /*
36349 + * force re-validation of the coord on the first
36350 + * iteration of the read/write loop.
36351 + */
36352 + hint->ext_coord.valid = 0;
36353 + assert("nikita-19892", coords_equal(&hint->seal.coord1,
36354 + &hint->ext_coord.
36355 + coord));
36356 + return 0;
36357 + }
36358 + memset(&fsdata->reg.hint, 0, sizeof(hint_t));
36359 + spin_unlock_inode(file->f_dentry->d_inode);
36360 + }
36361 + hint_init_zero(hint);
36362 + return 0;
36363 +}
36364 +
36365 +/**
36366 + * save_file_hint - copy hint to reiser4 private struct file's part
36367 + * @file: file to save hint in
36368 + * @hint: hint to save
36369 + *
36370 + * This copies @hint to reiser4 private part of struct file. It can help
36371 + * speedup future accesses to the file.
36372 + */
36373 +void save_file_hint(struct file *file, const hint_t *hint)
36374 +{
36375 + reiser4_file_fsdata *fsdata;
36376 +
36377 + assert("edward-1337", hint != NULL);
36378 +
36379 + if (!file || !seal_is_set(&hint->seal))
36380 + return;
36381 + fsdata = reiser4_get_file_fsdata(file);
36382 + assert("vs-965", !IS_ERR(fsdata));
36383 + assert("nikita-19891",
36384 + coords_equal(&hint->seal.coord1, &hint->ext_coord.coord));
36385 + assert("vs-30", hint->lh.owner == NULL);
36386 + spin_lock_inode(file->f_dentry->d_inode);
36387 + fsdata->reg.hint = *hint;
36388 + spin_unlock_inode(file->f_dentry->d_inode);
36389 + return;
36390 +}
36391 +
36392 +void unset_hint(hint_t * hint)
36393 +{
36394 + assert("vs-1315", hint);
36395 + hint->ext_coord.valid = 0;
36396 + seal_done(&hint->seal);
36397 + done_lh(&hint->lh);
36398 +}
36399 +
36400 +/* the coord must already be set properly, so set_hint has nothing to check */
36401 +void set_hint(hint_t * hint, const reiser4_key * key, znode_lock_mode mode)
36402 +{
36403 + ON_DEBUG(coord_t * coord = &hint->ext_coord.coord);
36404 + assert("vs-1207", WITH_DATA(coord->node, check_coord(coord, key)));
36405 +
36406 + seal_init(&hint->seal, &hint->ext_coord.coord, key);
36407 + hint->offset = get_key_offset(key);
36408 + hint->mode = mode;
36409 + done_lh(&hint->lh);
36410 +}
36411 +
36412 +int hint_is_set(const hint_t * hint)
36413 +{
36414 + return seal_is_set(&hint->seal);
36415 +}
36416 +
36417 +#if REISER4_DEBUG
36418 +static int all_but_offset_key_eq(const reiser4_key * k1, const reiser4_key * k2)
36419 +{
36420 + return (get_key_locality(k1) == get_key_locality(k2) &&
36421 + get_key_type(k1) == get_key_type(k2) &&
36422 + get_key_band(k1) == get_key_band(k2) &&
36423 + get_key_ordering(k1) == get_key_ordering(k2) &&
36424 + get_key_objectid(k1) == get_key_objectid(k2));
36425 +}
36426 +#endif
36427 +
36428 +int
36429 +hint_validate(hint_t * hint, const reiser4_key * key, int check_key,
36430 + znode_lock_mode lock_mode)
36431 +{
36432 + if (!hint || !hint_is_set(hint) || hint->mode != lock_mode)
36433 + /* hint either not set or set by different operation */
36434 + return RETERR(-E_REPEAT);
36435 +
36436 + assert("vs-1277", all_but_offset_key_eq(key, &hint->seal.key));
36437 +
36438 + if (check_key && get_key_offset(key) != hint->offset)
36439 + /* hint is set for different key */
36440 + return RETERR(-E_REPEAT);
36441 +
36442 + assert("vs-31", hint->ext_coord.lh == &hint->lh);
36443 + return seal_validate(&hint->seal, &hint->ext_coord.coord, key,
36444 + hint->ext_coord.lh, lock_mode, ZNODE_LOCK_LOPRI);
36445 +}
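+
+/*
+ * A minimal usage sketch (not from the original code): how a caller combines
+ * the hint machinery with a full lookup. find_file_item() is the helper used
+ * by the read/write paths below; the wrapper itself is hypothetical.
+ *
+ *	static int coord_by_hint_or_search(hint_t *hint,
+ *					   const reiser4_key *key,
+ *					   struct inode *inode)
+ *	{
+ *		if (hint_validate(hint, key, 1, ZNODE_READ_LOCK) == 0)
+ *			return 0;
+ *		return find_file_item(hint, key, ZNODE_READ_LOCK, inode);
+ *	}
+ *
+ * On -E_REPEAT from hint_validate() the seal was broken - the tree changed
+ * since the hint was saved - and the caller pays for a full tree traversal.
+ */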
36446 +
36447 +int xversion;
36448 +
36449 +/**
36450 + * find_or_create_extent - make sure extent corresponding to page exists
36451 + * @page: page to create extent for
36452 + *
36453 + * Looks for a place at the twig level for the extent corresponding to @page,
36454 + * creates an unallocated extent if it does not exist yet, initializes the
36455 + * jnode and captures the page.
36456 + */
36457 +int find_or_create_extent(struct page *page)
36458 +{
36459 + int result;
36460 + struct inode *inode;
36461 + int plugged_hole;
36462 +
36463 + jnode *node;
36464 +
36465 + assert("vs-1065", page->mapping && page->mapping->host);
36466 + inode = page->mapping->host;
36467 +
36468 + lock_page(page);
36469 + node = jnode_of_page(page);
36470 + unlock_page(page);
36471 + if (IS_ERR(node))
36472 + return PTR_ERR(node);
36473 +
36474 + if (node->blocknr == 0) {
36475 + plugged_hole = 0;
36476 + result = update_extent(inode, node,
36477 + (loff_t)page->index << PAGE_CACHE_SHIFT,
36478 + &plugged_hole);
36479 + if (result) {
36480 + jput(node);
36481 + warning("", "update_extent failed: %d", result);
36482 + return result;
36483 + }
36484 + if (plugged_hole)
36485 + reiser4_update_sd(inode);
36486 + } else {
36487 + spin_lock_jnode(node);
36488 + result = try_capture(node, ZNODE_WRITE_LOCK, 0);
36489 + BUG_ON(result != 0);
36490 + jnode_make_dirty_locked(node);
36491 + spin_unlock_jnode(node);
36492 + }
36493 +
36494 + BUG_ON(node->atom == NULL);
36495 + jput(node);
36496 +
36497 + if (get_current_context()->entd) {
36498 + entd_context *ent = get_entd_context(node->tree->super);
36499 +
36500 + if (ent->cur_request->page == page)
36501 + ent->cur_request->node = node;
36502 + }
36503 + return 0;
36504 +}
36505 +
36506 +/**
36507 + * has_anonymous_pages - check whether inode has pages dirtied via mmap
36508 + * @inode: inode to check
36509 + *
36510 + * Returns true if inode's mapping has dirty pages which do not belong to any
36511 + * atom. Those are either tagged PAGECACHE_TAG_REISER4_MOVED in mapping's page
36512 + * tree or were eflushed and can be found via jnodes tagged
36513 + * EFLUSH_TAG_ANONYMOUS in radix tree of jnodes.
36514 + */
36515 +static int has_anonymous_pages(struct inode *inode)
36516 +{
36517 + int result;
36518 +
36519 + read_lock_irq(&inode->i_mapping->tree_lock);
36520 + result = radix_tree_tagged(&inode->i_mapping->page_tree, PAGECACHE_TAG_REISER4_MOVED);
36521 + read_unlock_irq(&inode->i_mapping->tree_lock);
36522 + return result;
36523 +}
36524 +
36525 +/**
36526 + * capture_page_and_create_extent -
36527 + * @page: page to be captured
36528 + *
36529 + * Grabs space for extent creation and stat data update and calls function to
36530 + * do actual work.
36531 + */
36532 +static int capture_page_and_create_extent(struct page *page)
36533 +{
36534 + int result;
36535 + struct inode *inode;
36536 +
36537 + assert("vs-1084", page->mapping && page->mapping->host);
36538 + inode = page->mapping->host;
36539 + assert("vs-1139",
36540 + unix_file_inode_data(inode)->container == UF_CONTAINER_EXTENTS);
36541 + /* page belongs to file */
36542 + assert("vs-1393",
36543 + inode->i_size > ((loff_t) page->index << PAGE_CACHE_SHIFT));
36544 +
36545 + /* page capture may require extent creation (if it does not exist yet)
36546 + and stat data's update (number of blocks changes on extent
36547 + creation) */
36548 + grab_space_enable();
36549 +	result = reiser4_grab_space(
36550 +		2 * estimate_one_insert_into_item(tree_by_inode(inode)),
36551 +		BA_CAN_COMMIT);
36554 + if (likely(!result))
36555 + result = find_or_create_extent(page);
36556 +
36557 + if (result != 0)
36558 + SetPageError(page);
36559 + return result;
36560 +}
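+
+/*
+ * The factor of 2 in the reservation above covers the two tree modifications
+ * named in the comment: one possible item insertion for the extent and one
+ * for the stat data update.
+ */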
36561 +
36562 +/* this is implementation of method commit_write of struct
36563 + address_space_operations for unix file plugin */
36564 +int
36565 +commit_write_unix_file(struct file *file, struct page *page,
36566 + unsigned from, unsigned to)
36567 +{
36568 + reiser4_context *ctx;
36569 + struct inode *inode;
36570 + int result;
36571 +
36572 + assert("umka-3101", file != NULL);
36573 + assert("umka-3102", page != NULL);
36574 + assert("umka-3093", PageLocked(page));
36575 +
36576 + SetPageUptodate(page);
36577 +
36578 + inode = page->mapping->host;
36579 + ctx = init_context(page->mapping->host->i_sb);
36580 + if (IS_ERR(ctx))
36581 + return PTR_ERR(ctx);
36582 + page_cache_get(page);
36583 + unlock_page(page);
36584 + result = capture_page_and_create_extent(page);
36585 + lock_page(page);
36586 + page_cache_release(page);
36587 +
36588 + /* don't commit transaction under inode semaphore */
36589 + context_set_commit_async(ctx);
36590 + reiser4_exit_context(ctx);
36591 + return result;
36592 +}
36593 +
36594 +/*
36595 + * Support for "anonymous" pages and jnodes.
36596 + *
36597 + * When file is write-accessed through mmap pages can be dirtied from the user
36598 + * level. In this case the kernel is not notified until one of the following happens:
36599 + *
36600 + * (1) msync()
36601 + *
36602 + * (2) truncate() (either explicit or through unlink)
36603 + *
36604 + * (3) VM scanner starts reclaiming mapped pages, dirtying them before
36605 + * starting write-back.
36606 + *
36607 + * As a result of (3) ->writepage may be called on a dirty page without
36608 + * jnode. Such page is called "anonymous" in reiser4. Certain work-loads
36609 + * (iozone) generate a huge number of anonymous pages. Emergency flush handles
36610 + * this situation by creating jnode for anonymous page, starting IO on the
36611 + * page, and marking jnode with JNODE_KEEPME bit so that it's not thrown out of
36612 + * memory. Such jnode is also called anonymous.
36613 + *
36614 + * reiser4_sync_sb() method tries to insert anonymous pages and jnodes into
36615 + * tree. This is done by capture_anonymous_*() functions below.
36616 + */
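+
+/*
+ * Sketch of the call chains that end up here; the reiser4 entry points are
+ * from this patch, the VFS part is standard 2.6 behaviour:
+ *
+ *	msync()/sys_fsync()
+ *	 -> filemap_fdatawrite()
+ *	  -> do_writepages()
+ *	   -> reiser4_writepages()
+ *	    -> writepages_unix_file()
+ *	     -> capture_anonymous_pages() / capture_anonymous_jnodes()
+ */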
36617 +
36618 +/**
36619 + * capture_anonymous_page - involve page into transaction
36620 + * @page: page to deal with
36621 + *
36622 + * Takes care that @page has corresponding metadata in the tree, creates jnode
36623 + * for @page and captures it. On success 1 is returned.
36624 + */
36625 +static int capture_anonymous_page(struct page *page)
36626 +{
36627 + int result;
36628 +
36629 + if (PageWriteback(page))
36630 + /* FIXME: do nothing? */
36631 + return 0;
36632 +
36633 + result = capture_page_and_create_extent(page);
36634 + if (result == 0) {
36635 + result = 1;
36636 + } else
36637 + warning("nikita-3329",
36638 + "Cannot capture anon page: %i", result);
36639 +
36640 + return result;
36641 +}
36642 +
36643 +/**
36644 + * capture_anonymous_pages - find and capture pages dirtied via mmap
36645 + * @mapping: address space where to look for pages
36646 + * @index: start index
36647 + * @to_capture: maximum number of pages to capture
36648 + *
36649 + * Looks for pages tagged REISER4_MOVED starting from the *@index-th page,
36650 + * captures them (involves them into an atom), returns number of captured pages,
36651 + * updates @index to next page after the last captured one.
36652 + */
36653 +static int
36654 +capture_anonymous_pages(struct address_space *mapping, pgoff_t *index,
36655 + unsigned int to_capture)
36656 +{
36657 + int result;
36658 + struct pagevec pvec;
36659 + unsigned int i, count;
36660 + int nr;
36661 +
36662 + pagevec_init(&pvec, 0);
36663 + count = min(pagevec_space(&pvec), to_capture);
36664 + nr = 0;
36665 +
36666 + /* find pages tagged MOVED */
36667 + write_lock_irq(&mapping->tree_lock);
36668 + pvec.nr = radix_tree_gang_lookup_tag(&mapping->page_tree,
36669 + (void **)pvec.pages, *index, count,
36670 + PAGECACHE_TAG_REISER4_MOVED);
36671 + if (pagevec_count(&pvec) == 0) {
36672 + /*
36673 + * there are no pages tagged MOVED in mapping->page_tree
36674 + * starting from *index
36675 + */
36676 + write_unlock_irq(&mapping->tree_lock);
36677 + *index = (pgoff_t)-1;
36678 + return 0;
36679 + }
36680 +
36681 + /* clear MOVED tag for all found pages */
36682 + for (i = 0; i < pagevec_count(&pvec); i++) {
36683 + void *p;
36684 +
36685 + page_cache_get(pvec.pages[i]);
36686 + p = radix_tree_tag_clear(&mapping->page_tree, pvec.pages[i]->index,
36687 + PAGECACHE_TAG_REISER4_MOVED);
36688 + assert("vs-49", p == pvec.pages[i]);
36689 + }
36690 + write_unlock_irq(&mapping->tree_lock);
36691 +
36692 +
36693 + *index = pvec.pages[i - 1]->index + 1;
36694 +
36695 + for (i = 0; i < pagevec_count(&pvec); i++) {
36696 + /*
36697 + * tag PAGECACHE_TAG_REISER4_MOVED will be cleared by
36698 + * set_page_dirty_internal which is called when jnode is
36699 + * captured
36700 + */
36701 + result = capture_anonymous_page(pvec.pages[i]);
36702 + if (result == 1)
36703 + nr++;
36704 + else {
36705 + if (result < 0) {
36706 + warning("vs-1454",
36707 + "failed to capture page: "
36708 + "result=%d, captured=%d)\n",
36709 + result, i);
36710 +
36711 + /*
36712 +				 * set MOVED tag on all pages that were left
36713 +				 * uncaptured
36714 + */
36715 + write_lock_irq(&mapping->tree_lock);
36716 + for (; i < pagevec_count(&pvec); i ++) {
36717 + radix_tree_tag_set(&mapping->page_tree,
36718 + pvec.pages[i]->index,
36719 + PAGECACHE_TAG_REISER4_MOVED);
36720 + }
36721 + write_unlock_irq(&mapping->tree_lock);
36722 +
36723 + pagevec_release(&pvec);
36724 + return result;
36725 + } else {
36726 + /*
36727 +				 * result == 0: capture_anonymous_page returns
36728 +				 * 0 for a page under writeback. Set MOVED tag
36729 +				 * back on that page
36730 + */
36731 + write_lock_irq(&mapping->tree_lock);
36732 + radix_tree_tag_set(&mapping->page_tree,
36733 + pvec.pages[i]->index,
36734 + PAGECACHE_TAG_REISER4_MOVED);
36735 + write_unlock_irq(&mapping->tree_lock);
36736 + if (i == 0)
36737 + *index = pvec.pages[0]->index;
36738 + else
36739 + *index = pvec.pages[i - 1]->index + 1;
36740 + }
36741 + }
36742 + }
36743 + pagevec_release(&pvec);
36744 + return nr;
36745 +}
36746 +
36747 +/**
36748 + * capture_anonymous_jnodes - find and capture anonymous jnodes
36749 + * @mapping: address space where to look for jnodes
36750 + * @from: start index
36751 + * @to: end index
36752 + * @to_capture: maximum number of jnodes to capture
36753 + *
36754 + * Looks for jnodes tagged EFLUSH_TAG_ANONYMOUS in inode's tree of jnodes in
36755 + * the range of indexes @from-@to and captures them, returns number of captured
36756 + * jnodes, updates @from to next jnode after the last captured one.
36757 + */
36758 +static int
36759 +capture_anonymous_jnodes(struct address_space *mapping,
36760 + pgoff_t *from, pgoff_t to, int to_capture)
36761 +{
36762 + *from = to;
36763 + return 0;
36764 +}
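+
+/*
+ * Note: in this version of the patch capture_anonymous_jnodes() is a stub -
+ * it advances *from to @to and reports zero captured jnodes, so only the
+ * page-tree side (PAGECACHE_TAG_REISER4_MOVED) is actually scanned and the
+ * EFLUSH_TAG_ANONYMOUS case described above is not.
+ */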
36765 +
36766 +/*
36767 + * Commit atom of the jnode of a page.
36768 + */
36769 +static int sync_page(struct page *page)
36770 +{
36771 + int result;
36772 + do {
36773 + jnode *node;
36774 + txn_atom *atom;
36775 +
36776 + lock_page(page);
36777 + node = jprivate(page);
36778 + if (node != NULL) {
36779 + spin_lock_jnode(node);
36780 + atom = jnode_get_atom(node);
36781 + spin_unlock_jnode(node);
36782 + } else
36783 + atom = NULL;
36784 + unlock_page(page);
36785 + result = sync_atom(atom);
36786 + } while (result == -E_REPEAT);
36787 + /*
36788 + * ZAM-FIXME-HANS: document the logic of this loop, is it just to
36789 + * handle the case where more pages get added to the atom while we are
36790 + * syncing it?
36791 + */
36792 + assert("nikita-3485", ergo(result == 0,
36793 + get_current_context()->trans->atom == NULL));
36794 + return result;
36795 +}
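+
+/*
+ * Re the FIXME above, a plausible reading: sync_atom() returns -E_REPEAT
+ * when the atom it was asked to commit fused into another one (or otherwise
+ * changed) while committing, so the loop re-reads the page's atom via
+ * jnode_get_atom() and retries until the commit sticks.
+ */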
36796 +
36797 +/*
36798 + * Commit atoms of all pages of an inode:
36799 + * call sync_page for each page found in the mapping's page tree.
36800 + */
36801 +static int sync_page_list(struct inode *inode)
36802 +{
36803 + int result;
36804 + struct address_space *mapping;
36805 + unsigned long from; /* start index for radix_tree_gang_lookup */
36806 + unsigned int found; /* return value for radix_tree_gang_lookup */
36807 +
36808 + mapping = inode->i_mapping;
36809 + from = 0;
36810 + result = 0;
36811 + read_lock_irq(&mapping->tree_lock);
36812 + while (result == 0) {
36813 + struct page *page;
36814 +
36815 + found =
36816 + radix_tree_gang_lookup(&mapping->page_tree, (void **)&page,
36817 + from, 1);
36818 + assert("", found < 2);
36819 + if (found == 0)
36820 + break;
36821 +
36822 +		/* page cannot leave the radix tree because it is protected from
36823 +		   truncation by inode->i_mutex taken by sys_fsync */
36824 + page_cache_get(page);
36825 + read_unlock_irq(&mapping->tree_lock);
36826 +
36827 + from = page->index + 1;
36828 +
36829 + result = sync_page(page);
36830 +
36831 + page_cache_release(page);
36832 + read_lock_irq(&mapping->tree_lock);
36833 + }
36834 +
36835 + read_unlock_irq(&mapping->tree_lock);
36836 + return result;
36837 +}
36838 +
36839 +static int commit_file_atoms(struct inode *inode)
36840 +{
36841 + int result;
36842 + unix_file_info_t *uf_info;
36843 +
36844 + uf_info = unix_file_inode_data(inode);
36845 +
36846 + get_exclusive_access(uf_info);
36847 + /*
36848 + * find what items file is made from
36849 + */
36850 + result = find_file_state(inode, uf_info);
36851 + drop_exclusive_access(uf_info);
36852 + if (result != 0)
36853 + return result;
36854 +
36855 + /*
36856 + * file state cannot change because we are under ->i_mutex
36857 + */
36858 + switch (uf_info->container) {
36859 + case UF_CONTAINER_EXTENTS:
36860 +		/* find_file_state might open or join an atom */
36861 + txn_restart_current();
36862 + result =
36863 + /*
36864 + * when we are called by
36865 + * filemap_fdatawrite->
36866 + * do_writepages()->
36867 + * reiser4_writepages()
36868 + *
36869 +		     * inode->i_mapping->dirty_pages are spliced into
36870 + * ->io_pages, leaving ->dirty_pages dirty.
36871 + *
36872 + * When we are called from
36873 + * reiser4_fsync()->sync_unix_file(), we have to
36874 + * commit atoms of all pages on the ->dirty_list.
36875 + *
36876 + * So for simplicity we just commit ->io_pages and
36877 + * ->dirty_pages.
36878 + */
36879 + sync_page_list(inode);
36880 + break;
36881 + case UF_CONTAINER_TAILS:
36882 + /*
36883 + * NOTE-NIKITA probably we can be smarter for tails. For now
36884 + * just commit all existing atoms.
36885 + */
36886 + result = txnmgr_force_commit_all(inode->i_sb, 0);
36887 + break;
36888 + case UF_CONTAINER_EMPTY:
36889 + result = 0;
36890 + break;
36891 + case UF_CONTAINER_UNKNOWN:
36892 + default:
36893 + result = -EIO;
36894 + break;
36895 + }
36896 +
36897 + /*
36898 + * commit current transaction: there can be captured nodes from
36899 + * find_file_state() and finish_conversion().
36900 + */
36901 + txn_restart_current();
36902 + return result;
36903 +}
36904 +
36905 +/**
36906 + * writepages_unix_file - writepages of struct address_space_operations
36907 + * @mapping: address space to write pages of
36908 + * @wbc: writeback control
36909 + *
36910 + * This captures anonymous pages and anonymous jnodes. Anonymous pages are
36911 + * pages which are dirtied via mmap. Anonymous jnodes are ones which were
36912 + * created by reiser4_writepage.
36913 + */
36914 +int writepages_unix_file(struct address_space *mapping,
36915 + struct writeback_control *wbc)
36916 +{
36917 + int result;
36918 + unix_file_info_t *uf_info;
36919 + pgoff_t pindex, jindex, nr_pages;
36920 + long to_capture;
36921 + struct inode *inode;
36922 +
36923 + inode = mapping->host;
36924 + if (!has_anonymous_pages(inode)) {
36925 + result = 0;
36926 + goto end;
36927 + }
36928 + jindex = pindex = wbc->start >> PAGE_CACHE_SHIFT;
36929 + result = 0;
36930 + nr_pages =
36931 + (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
36932 + uf_info = unix_file_inode_data(inode);
36933 +
36934 + do {
36935 + reiser4_context *ctx;
36936 +
36937 + if (wbc->sync_mode != WB_SYNC_ALL)
36938 + to_capture = min(wbc->nr_to_write, CAPTURE_APAGE_BURST);
36939 + else
36940 + to_capture = CAPTURE_APAGE_BURST;
36941 +
36942 + ctx = init_context(inode->i_sb);
36943 + if (IS_ERR(ctx)) {
36944 + result = PTR_ERR(ctx);
36945 + break;
36946 + }
36947 + /* avoid recursive calls to ->sync_inodes */
36948 + ctx->nobalance = 1;
36949 + assert("zam-760", lock_stack_isclean(get_current_lock_stack()));
36950 + assert("", LOCK_CNT_NIL(inode_sem_w));
36951 + assert("", LOCK_CNT_NIL(inode_sem_r));
36952 +
36953 + txn_restart_current();
36954 +
36955 + /* we have to get nonexclusive access to the file */
36956 + if (get_current_context()->entd) {
36957 + /*
36958 + * use nonblocking version of nonexclusive_access to
36959 + * avoid deadlock which might look like the following:
36960 + * process P1 holds NEA on file F1 and called entd to
36961 + * reclaim some memory. Entd works for P1 and is going
36962 + * to capture pages of file F2. To do that entd has to
36963 + * get NEA to F2. F2 is held by process P2 which also
36964 + * called entd. But entd is serving P1 at the moment
36965 +			 * and P2 has to wait. Process P3 is trying to get EA
36966 +			 * to file F2. The pending EA request to file F2 makes
36967 +			 * it impossible for entd to get NEA to file F2. None
36968 +			 * of these processes can continue. Using the
36969 +			 * nonblocking version of getting NEA is supposed to
36970 +			 * avoid this deadlock.
36971 + */
36972 + if (try_to_get_nonexclusive_access(uf_info) == 0) {
36973 + result = RETERR(-EBUSY);
36974 + reiser4_exit_context(ctx);
36975 + break;
36976 + }
36977 + } else
36978 + get_nonexclusive_access(uf_info);
36979 +
36980 + while (to_capture > 0) {
36981 + pgoff_t start;
36982 +
36983 + assert("vs-1727", jindex <= pindex);
36984 + if (pindex == jindex) {
36985 + start = pindex;
36986 + result =
36987 + capture_anonymous_pages(inode->i_mapping,
36988 + &pindex,
36989 + to_capture);
36990 + if (result <= 0)
36991 + break;
36992 + to_capture -= result;
36993 + wbc->nr_to_write -= result;
36994 + if (start + result == pindex) {
36995 + jindex = pindex;
36996 + continue;
36997 + }
36998 + if (to_capture <= 0)
36999 + break;
37000 + }
37001 + /* deal with anonymous jnodes between jindex and pindex */
37002 + result =
37003 + capture_anonymous_jnodes(inode->i_mapping, &jindex,
37004 + pindex, to_capture);
37005 + if (result < 0)
37006 + break;
37007 + to_capture -= result;
37008 + get_current_context()->nr_captured += result;
37009 +
37010 + if (jindex == (pgoff_t) - 1) {
37011 + assert("vs-1728", pindex == (pgoff_t) - 1);
37012 + break;
37013 + }
37014 + }
37015 + if (to_capture <= 0)
37016 +			/* there may be more pages left */
37017 + __mark_inode_dirty(inode, I_DIRTY_PAGES);
37018 +
37019 + drop_nonexclusive_access(uf_info);
37020 + if (result < 0) {
37021 + /* error happened */
37022 + reiser4_exit_context(ctx);
37023 + return result;
37024 + }
37025 + if (wbc->sync_mode != WB_SYNC_ALL) {
37026 + reiser4_exit_context(ctx);
37027 + return 0;
37028 + }
37029 + result = commit_file_atoms(inode);
37030 + reiser4_exit_context(ctx);
37031 + if (pindex >= nr_pages && jindex == pindex)
37032 + break;
37033 + } while (1);
37034 +
37035 + end:
37036 + if (is_in_reiser4_context()) {
37037 + if (get_current_context()->nr_captured >= CAPTURE_APAGE_BURST) {
37038 + /*
37039 + * there are already pages to flush, flush them out, do
37040 + * not delay until end of reiser4_sync_inodes
37041 + */
37042 + writeout(inode->i_sb, wbc);
37043 + get_current_context()->nr_captured = 0;
37044 + }
37045 + }
37046 + return result;
37047 +}
37048 +
37049 +/*
37050 + * ->sync() method for unix file.
37051 + *
37052 + * We are trying to be smart here. Instead of committing all atoms (original
37053 + * solution), we scan dirty pages of this file and commit all atoms they are
37054 + * part of.
37055 + *
37056 + * Situation is complicated by anonymous pages: i.e., extent-less pages
37057 + * dirtied through mmap. Fortunately sys_fsync() first calls
37058 + * filemap_fdatawrite() that will ultimately call reiser4_writepages(), insert
37059 + * all missing extents and capture anonymous pages.
37060 + */
37061 +int sync_unix_file(struct file *file, struct dentry *dentry, int datasync)
37062 +{
37063 + reiser4_context *ctx;
37064 + txn_atom *atom;
37065 + reiser4_block_nr reserve;
37066 +
37067 + ctx = init_context(dentry->d_inode->i_sb);
37068 + if (IS_ERR(ctx))
37069 + return PTR_ERR(ctx);
37070 +
37071 + reserve = estimate_update_common(dentry->d_inode);
37072 + if (reiser4_grab_space(reserve, BA_CAN_COMMIT)) {
37073 + reiser4_exit_context(ctx);
37074 + return RETERR(-ENOSPC);
37075 + }
37076 + write_sd_by_inode_common(dentry->d_inode);
37077 +
37078 + atom = get_current_atom_locked();
37079 + spin_lock_txnh(ctx->trans);
37080 + force_commit_atom(ctx->trans);
37081 + reiser4_exit_context(ctx);
37082 + return 0;
37083 +}
37084 +
37085 +/**
37086 + * readpage_unix_file_nolock - readpage of struct address_space_operations
37087 + * @file: file to read from
37088 + * @page: page to read
37089 + *
37090 + * Compose a key and search for item containing information about @page
37091 + * data. If item is found - its readpage method is called.
37092 + */
37093 +int readpage_unix_file_nolock(struct file *file, struct page *page)
37094 +{
37095 + reiser4_context *ctx;
37096 + int result;
37097 + struct inode *inode;
37098 + reiser4_key key;
37099 + item_plugin *iplug;
37100 + hint_t *hint;
37101 + lock_handle *lh;
37102 + coord_t *coord;
37103 +
37104 + assert("vs-1062", PageLocked(page));
37105 + assert("vs-976", !PageUptodate(page));
37106 + assert("vs-1061", page->mapping && page->mapping->host);
37107 +
37108 + if ((page->mapping->host->i_size <=
37109 + ((loff_t) page->index << PAGE_CACHE_SHIFT))) {
37110 + /* page is out of file already */
37111 + unlock_page(page);
37112 + return -EINVAL;
37113 + }
37114 +
37115 + inode = page->mapping->host;
37116 + ctx = init_context(inode->i_sb);
37117 + if (IS_ERR(ctx)) {
37118 + unlock_page(page);
37119 + return PTR_ERR(ctx);
37120 + }
37121 +
37122 + hint = kmalloc(sizeof(*hint), get_gfp_mask());
37123 + if (hint == NULL) {
37124 + unlock_page(page);
37125 + reiser4_exit_context(ctx);
37126 + return RETERR(-ENOMEM);
37127 + }
37128 +
37129 + result = load_file_hint(file, hint);
37130 + if (result) {
37131 + kfree(hint);
37132 + unlock_page(page);
37133 + reiser4_exit_context(ctx);
37134 + return result;
37135 + }
37136 + lh = &hint->lh;
37137 +
37138 + /* get key of first byte of the page */
37139 + key_by_inode_and_offset_common(inode,
37140 + (loff_t) page->index << PAGE_CACHE_SHIFT,
37141 + &key);
37142 +
37143 + /* look for file metadata corresponding to first byte of page */
37144 + page_cache_get(page);
37145 + unlock_page(page);
37146 + result = find_file_item(hint, &key, ZNODE_READ_LOCK, inode);
37147 + lock_page(page);
37148 + page_cache_release(page);
37149 +
37150 + if (page->mapping == NULL) {
37151 + /*
37152 + * readpage allows truncate to run concurrently. Page was
37153 + * truncated while it was not locked
37154 + */
37155 + done_lh(lh);
37156 + kfree(hint);
37157 + unlock_page(page);
37158 + txn_restart(ctx);
37159 + reiser4_exit_context(ctx);
37160 + return -EINVAL;
37161 + }
37162 +
37163 + if (result != CBK_COORD_FOUND || hint->ext_coord.coord.between != AT_UNIT) {
37164 + if (result == CBK_COORD_FOUND &&
37165 + hint->ext_coord.coord.between != AT_UNIT)
37166 + /* file is truncated */
37167 + result = -EINVAL;
37168 + done_lh(lh);
37169 + kfree(hint);
37170 + unlock_page(page);
37171 + txn_restart(ctx);
37172 + reiser4_exit_context(ctx);
37173 + return result;
37174 + }
37175 +
37176 + /*
37177 + * item corresponding to page is found. It can not be removed because
37178 + * znode lock is held
37179 + */
37180 + if (PageUptodate(page)) {
37181 + done_lh(lh);
37182 + kfree(hint);
37183 + unlock_page(page);
37184 + txn_restart(ctx);
37185 + reiser4_exit_context(ctx);
37186 + return 0;
37187 + }
37188 +
37189 + coord = &hint->ext_coord.coord;
37190 + result = zload(coord->node);
37191 + if (result) {
37192 + done_lh(lh);
37193 + kfree(hint);
37194 + unlock_page(page);
37195 + txn_restart(ctx);
37196 + reiser4_exit_context(ctx);
37197 + return result;
37198 + }
37199 +
37200 + validate_extended_coord(&hint->ext_coord,
37201 + (loff_t) page->index << PAGE_CACHE_SHIFT);
37202 +
37203 + if (!coord_is_existing_unit(coord)) {
37204 + /* this indicates corruption */
37205 + warning("vs-280",
37206 + "Looking for page %lu of file %llu (size %lli). "
37207 + "No file items found (%d). File is corrupted?\n",
37208 + page->index, (unsigned long long)get_inode_oid(inode),
37209 + inode->i_size, result);
37210 + zrelse(coord->node);
37211 + done_lh(lh);
37212 + kfree(hint);
37213 + unlock_page(page);
37214 + txn_restart(ctx);
37215 + reiser4_exit_context(ctx);
37216 + return RETERR(-EIO);
37217 + }
37218 +
37219 + /*
37220 +	 * get the plugin of the found item; its readpage method, if present,
37221 +	 * does the actual read
37222 + */
37223 + iplug = item_plugin_by_coord(coord);
37224 + if (iplug->s.file.readpage)
37225 + result = iplug->s.file.readpage(coord, page);
37226 + else
37227 + result = RETERR(-EINVAL);
37228 +
37229 + if (!result) {
37230 + set_key_offset(&key,
37231 + (loff_t) (page->index + 1) << PAGE_CACHE_SHIFT);
37232 + /* FIXME should call set_hint() */
37233 + unset_hint(hint);
37234 + } else {
37235 + unlock_page(page);
37236 + unset_hint(hint);
37237 + }
37238 + assert("vs-979",
37239 + ergo(result == 0, (PageLocked(page) || PageUptodate(page))));
37240 + assert("vs-9791", ergo(result != 0, !PageLocked(page)));
37241 +
37242 + zrelse(coord->node);
37243 + done_lh(lh);
37244 +
37245 + save_file_hint(file, hint);
37246 + kfree(hint);
37247 +
37248 + /*
37249 + * FIXME: explain why it is needed. HINT: page allocation in write can
37250 + * not be done when atom is not NULL because reiser4_writepage can not
37251 + * kick entd and have to eflush
37252 + */
37253 + txn_restart(ctx);
37254 + reiser4_exit_context(ctx);
37255 + return result;
37256 +}
37257 +
37258 +/**
37259 + * readpage_unix_file - readpage of struct address_space_operations
37260 + * @file: file @page belongs to
37261 + * @page: page to read
37262 + *
37263 + * Get non exclusive access to a file to avoid races with truncate. If page is
37264 + * out of file - return error. Call readpage_unix_file_nolock to do the rest.
37265 + */
37266 +int readpage_unix_file(struct file *file, struct page *page)
37267 +{
37268 + return readpage_unix_file_nolock(file, page);
37269 +}
37270 +
37271 +static reiser4_block_nr unix_file_estimate_read(struct inode *inode,
37272 + loff_t count UNUSED_ARG)
37273 +{
37274 +	/* We should reserve one block, because the stat data item gets
37275 +	   updated */
37276 + assert("vs-1249",
37277 + inode_file_plugin(inode)->estimate.update ==
37278 + estimate_update_common);
37279 + return estimate_update_common(inode);
37280 +}
37281 +
37282 +/* this is called with nonexclusive access obtained, file's container can not change */
37283 +static size_t read_file(hint_t * hint, struct file *file,	/* file to read from */
37284 + char __user *buf, /* address of user-space buffer */
37285 + size_t count, /* number of bytes to read */
37286 + loff_t * off)
37287 +{
37288 + int result;
37289 + struct inode *inode;
37290 + flow_t flow;
37291 + int (*read_f) (struct file *, flow_t *, hint_t *);
37292 + coord_t *coord;
37293 + znode *loaded;
37294 +
37295 + inode = file->f_dentry->d_inode;
37296 +
37297 + /* build flow */
37298 + assert("vs-1250",
37299 + inode_file_plugin(inode)->flow_by_inode ==
37300 + flow_by_inode_unix_file);
37301 + result =
37302 + flow_by_inode_unix_file(inode, buf, 1 /* user space */ , count,
37303 + *off, READ_OP, &flow);
37304 + if (unlikely(result))
37305 + return result;
37306 +
37307 + /* get seal and coord sealed with it from reiser4 private data
37308 + of struct file. The coord will tell us where our last read
37309 + of this file finished, and the seal will help to determine
37310 + if that location is still valid.
37311 + */
37312 + coord = &hint->ext_coord.coord;
37313 + while (flow.length && result == 0) {
37314 + result =
37315 + find_file_item(hint, &flow.key, ZNODE_READ_LOCK, inode);
37316 + if (cbk_errored(result))
37317 + /* error happened */
37318 + break;
37319 +
37320 + if (coord->between != AT_UNIT) {
37321 + /* there were no items corresponding to given offset */
37322 + done_lh(hint->ext_coord.lh);
37323 + break;
37324 + }
37325 +
37326 + loaded = coord->node;
37327 + result = zload(loaded);
37328 + if (unlikely(result)) {
37329 + done_lh(hint->ext_coord.lh);
37330 + break;
37331 + }
37332 +
37333 + if (hint->ext_coord.valid == 0)
37334 + validate_extended_coord(&hint->ext_coord,
37335 + get_key_offset(&flow.key));
37336 +
37337 + assert("vs-4", hint->ext_coord.valid == 1);
37338 + assert("vs-33", hint->ext_coord.lh == &hint->lh);
37339 + /* call item's read method */
37340 + read_f = item_plugin_by_coord(coord)->s.file.read;
37341 + result = read_f(file, &flow, hint);
37342 + zrelse(loaded);
37343 + done_lh(hint->ext_coord.lh);
37344 + }
37345 +
37346 + return (count - flow.length) ? (count - flow.length) : result;
37347 +}
37348 +
37349 +/**
37350 + * read_unix_file - read of struct file_operations
37351 + * @file: file to read from
37352 + * @buf: address of user-space buffer
37353 + * @read_amount: number of bytes to read
37354 + * @off: position in file to read from
37355 + *
37356 + * This is implementation of vfs's read method of struct file_operations for
37357 + * unix file plugin.
37358 + */
37359 +ssize_t read_unix_file(struct file *file, char __user *buf, size_t read_amount,
37360 + loff_t *off)
37361 +{
37362 + reiser4_context *ctx;
37363 + int result;
37364 + struct inode *inode;
37365 + hint_t *hint;
37366 + unix_file_info_t *uf_info;
37367 + size_t count, read, left;
37368 + reiser4_block_nr needed;
37369 + loff_t size;
37370 +
37371 + if (unlikely(read_amount == 0))
37372 + return 0;
37373 +
37374 + assert("umka-072", file != NULL);
37375 + assert("umka-074", off != NULL);
37376 + inode = file->f_dentry->d_inode;
37377 + assert("vs-972", !inode_get_flag(inode, REISER4_NO_SD));
37378 +
37379 + ctx = init_context(inode->i_sb);
37380 + if (IS_ERR(ctx))
37381 + return PTR_ERR(ctx);
37382 +
37383 + hint = kmalloc(sizeof(*hint), get_gfp_mask());
37384 + if (hint == NULL) {
37385 + context_set_commit_async(ctx);
37386 + reiser4_exit_context(ctx);
37387 + return RETERR(-ENOMEM);
37388 + }
37389 +
37390 + result = load_file_hint(file, hint);
37391 + if (result) {
37392 + kfree(hint);
37393 + context_set_commit_async(ctx);
37394 + reiser4_exit_context(ctx);
37395 + return result;
37396 + }
37397 +
37398 + left = read_amount;
37399 + count = 0;
37400 + uf_info = unix_file_inode_data(inode);
37401 + while (left > 0) {
37402 + txn_restart_current();
37403 +
37404 + get_nonexclusive_access(uf_info);
37405 +
37406 + size = i_size_read(inode);
37407 + if (*off >= size) {
37408 + /* position to read from is past the end of file */
37409 + drop_nonexclusive_access(uf_info);
37410 + break;
37411 + }
37412 + if (*off + left > size)
37413 + left = size - *off;
37414 +
37415 + /* faultin user page */
37416 + if(fault_in_pages_writeable(buf, left > PAGE_CACHE_SIZE ? PAGE_CACHE_SIZE : left)) {
37417 + drop_nonexclusive_access(uf_info);
37418 + result = RETERR(-EFAULT);
37419 + break;
37420 + }
37421 +
37422 + read = read_file(hint, file, buf,
37423 + left > PAGE_CACHE_SIZE ? PAGE_CACHE_SIZE : left,
37424 + off);
37425 +
37426 + drop_nonexclusive_access(uf_info);
37427 +
37428 + if (read < 0) {
37429 + result = read;
37430 + break;
37431 + }
37432 + left -= read;
37433 + buf += read;
37434 +
37435 + /* update position in a file */
37436 + *off += read;
37437 + /* total number of read bytes */
37438 + count += read;
37439 + }
37440 + save_file_hint(file, hint);
37441 + done_lh(&hint->lh);
37442 + kfree(hint);
37443 +
37444 + if (count) {
37445 + /*
37446 + * something was read. Grab space for stat data update and
37447 + * update atime
37448 + */
37449 + needed = unix_file_estimate_read(inode, read_amount);
37450 + result = reiser4_grab_space_force(needed, BA_CAN_COMMIT);
37451 + if (result == 0)
37452 + file_accessed(file);
37453 + else
37454 + warning("", "failed to grab space for atime update");
37455 + }
37456 +
37457 + context_set_commit_async(ctx);
37458 + reiser4_exit_context(ctx);
37459 +
37460 + /* return number of read bytes or error code if nothing is read */
37461 + return count ? count : result;
37462 +}
37463 +
37464 +/* This function takes care of @file's pages. First of all it checks if the
37465 +   filesystem is readonly and if so bails out. Otherwise, it throws out all
37466 +   pages of the file if it was mapped for read, is going to be mapped for
37467 +   write, and consists of tails. This is done in order not to keep two copies
37468 +   of the data (one in the page cache and the other in the tails themselves)
37469 +   when mapping files that consist of tails.
37470 +
37471 +   Tail2extent conversion is also performed here if it is allowed and the
37472 +   file is going to be written to or mapped for write. This function may be
37473 +   called from write_unix_file() or mmap_unix_file(). */
37474 +static int check_pages_unix_file(struct file *file, struct inode *inode)
37475 +{
37476 + reiser4_invalidate_pages(inode->i_mapping, 0,
37477 + (inode->i_size + PAGE_CACHE_SIZE -
37478 + 1) >> PAGE_CACHE_SHIFT, 0);
37479 + return unpack(file, inode, 0 /* not forever */ );
37480 +}
37481 +
37482 +/**
37483 + * mmap_unix_file - mmap of struct file_operations
37484 + * @file: file to mmap
37485 + * @vma:
37486 + *
37487 + * This is implementation of vfs's mmap method of struct file_operations for
37488 + * unix file plugin. It converts file to extent if necessary. Sets
37489 + * reiser4_inode's flag - REISER4_HAS_MMAP.
37490 + */
37491 +int mmap_unix_file(struct file *file, struct vm_area_struct *vma)
37492 +{
37493 + reiser4_context *ctx;
37494 + int result;
37495 + struct inode *inode;
37496 + unix_file_info_t *uf_info;
37497 + reiser4_block_nr needed;
37498 +
37499 + inode = file->f_dentry->d_inode;
37500 + ctx = init_context(inode->i_sb);
37501 + if (IS_ERR(ctx))
37502 + return PTR_ERR(ctx);
37503 +
37504 + uf_info = unix_file_inode_data(inode);
37505 +
37506 + down(&uf_info->write);
37507 + get_exclusive_access(uf_info);
37508 +
37509 + if (!IS_RDONLY(inode) && (vma->vm_flags & (VM_MAYWRITE | VM_SHARED))) {
37510 + /*
37511 + * we need file built of extent items. If it is still built of
37512 + * tail items we have to convert it. Find what items the file
37513 + * is built of
37514 + */
37515 + result = find_file_state(inode, uf_info);
37516 + if (result != 0) {
37517 + drop_exclusive_access(uf_info);
37518 + up(&uf_info->write);
37519 + reiser4_exit_context(ctx);
37520 + return result;
37521 + }
37522 +
37523 + assert("vs-1648", (uf_info->container == UF_CONTAINER_TAILS ||
37524 + uf_info->container == UF_CONTAINER_EXTENTS ||
37525 + uf_info->container == UF_CONTAINER_EMPTY));
37526 + if (uf_info->container == UF_CONTAINER_TAILS) {
37527 + /*
37528 + * invalidate all pages and convert file from tails to
37529 + * extents
37530 + */
37531 + result = check_pages_unix_file(file, inode);
37532 + if (result) {
37533 + drop_exclusive_access(uf_info);
37534 + up(&uf_info->write);
37535 + reiser4_exit_context(ctx);
37536 + return result;
37537 + }
37538 + }
37539 + }
37540 +
37541 + /*
37542 + * generic_file_mmap will do update_atime. Grab space for stat data
37543 + * update.
37544 + */
37545 + needed = inode_file_plugin(inode)->estimate.update(inode);
37546 + result = reiser4_grab_space_force(needed, BA_CAN_COMMIT);
37547 + if (result) {
37548 + drop_exclusive_access(uf_info);
37549 + up(&uf_info->write);
37550 + reiser4_exit_context(ctx);
37551 + return result;
37552 + }
37553 +
37554 + result = generic_file_mmap(file, vma);
37555 + if (result == 0) {
37556 + /* mark file as having mapping. */
37557 + inode_set_flag(inode, REISER4_HAS_MMAP);
37558 + }
37559 +
37560 + drop_exclusive_access(uf_info);
37561 + up(&uf_info->write);
37562 + reiser4_exit_context(ctx);
37563 + return result;
37564 +}
37565 +
37566 +/**
37567 + * find_first_item
37568 + * @inode: inode of file to look in
37569 + *
37570 + * Finds file item which is responsible for first byte in the file.
37571 + */
37572 +static int find_first_item(struct inode *inode)
37573 +{
37574 + coord_t coord;
37575 + lock_handle lh;
37576 + reiser4_key key;
37577 + int result;
37578 +
37579 + coord_init_zero(&coord);
37580 + init_lh(&lh);
37581 + inode_file_plugin(inode)->key_by_inode(inode, 0, &key);
37582 + result = find_file_item_nohint(&coord, &lh, &key, ZNODE_READ_LOCK,
37583 + inode);
37584 + if (result == CBK_COORD_FOUND) {
37585 + if (coord.between == AT_UNIT) {
37586 + result = zload(coord.node);
37587 + if (result == 0) {
37588 + result = item_id_by_coord(&coord);
37589 + zrelse(coord.node);
37590 + if (result != EXTENT_POINTER_ID &&
37591 + result != FORMATTING_ID)
37592 + result = RETERR(-EIO);
37593 + }
37594 + } else
37595 + result = RETERR(-EIO);
37596 + }
37597 + done_lh(&lh);
37598 + return result;
37599 +}
37600 +
37601 +/**
37602 + * open_unix_file
37603 + * @inode: inode of file to open
37604 + * @file: file to open
37605 + *
37606 + * If the filesystem is not readonly - complete an interrupted tail
37607 + * conversion if there was one
37608 + */
37609 +int open_unix_file(struct inode *inode, struct file *file)
37610 +{
37611 + int result;
37612 + reiser4_context *ctx;
37613 + unix_file_info_t *uf_info;
37614 +
37615 + if (IS_RDONLY(inode))
37616 + return 0;
37617 +
37618 + if (!inode_get_flag(inode, REISER4_PART_MIXED))
37619 + return 0;
37620 +
37621 + ctx = init_context(inode->i_sb);
37622 + if (IS_ERR(ctx))
37623 + return PTR_ERR(ctx);
37624 +
37625 + uf_info = unix_file_inode_data(inode);
37626 + get_exclusive_access(uf_info);
37627 +
37628 + /*
37629 + * it may happen that another process is doing tail conversion. Wait
37630 + * until it completes
37631 + */
37632 + while (1) {
37633 + if (inode_get_flag(inode, REISER4_PART_IN_CONV)) {
37634 + drop_exclusive_access(uf_info);
37635 + schedule();
37636 + get_exclusive_access(uf_info);
37637 + continue;
37638 + }
37639 + break;
37640 + }
37641 +
37642 + if (!inode_get_flag(inode, REISER4_PART_MIXED)) {
37643 + /*
37644 + * other process completed the conversion
37645 + */
37646 + drop_exclusive_access(uf_info);
37647 + reiser4_exit_context(ctx);
37648 + return 0;
37649 + }
37650 +
37651 + /*
37652 +	 * file was left in a semi-converted state after an unclean shutdown,
37653 +	 * or another thread was doing the conversion and dropped exclusive
37654 +	 * access while balancing dirty pages. Complete the conversion
37655 + */
37656 + result = find_first_item(inode);
37657 + if (result == EXTENT_POINTER_ID)
37658 + /*
37659 + * first item is extent, therefore there was incomplete
37660 + * tail2extent conversion. Complete it
37661 + */
37662 + result = tail2extent(unix_file_inode_data(inode));
37663 + else if (result == FORMATTING_ID)
37664 + /*
37665 + * first item is formatting item, therefore there was
37666 + * incomplete extent2tail conversion. Complete it
37667 + */
37668 + result = extent2tail(unix_file_inode_data(inode));
37669 + else
37670 + result = -EIO;
37671 +
37672 + assert("vs-1712",
37673 + ergo(result == 0, (!inode_get_flag(inode, REISER4_PART_MIXED) &&
37674 + !inode_get_flag(inode, REISER4_PART_IN_CONV))));
37675 + drop_exclusive_access(uf_info);
37676 + reiser4_exit_context(ctx);
37677 + return result;
37678 +}
37679 +
37680 +#define NEITHER_OBTAINED 0
37681 +#define EA_OBTAINED 1
37682 +#define NEA_OBTAINED 2
37683 +
37684 +static void drop_access(unix_file_info_t *uf_info)
37685 +{
37686 + if (uf_info->exclusive_use)
37687 + drop_exclusive_access(uf_info);
37688 + else
37689 + drop_nonexclusive_access(uf_info);
37690 +}
37691 +
37692 +#define debug_wuf(format, ...) printk("%s: %d: %s: " format "\n", \
37693 + __FILE__, __LINE__, __FUNCTION__, ## __VA_ARGS__)
37694 +
37695 +void balance_dirty_pages(struct address_space *mapping);
37696 +
37697 +/**
37698 + * write_unix_file - write of struct file_operations
37699 + * @file: file to write to
37700 + * @buf: address of user-space buffer
37701 + * @count: number of bytes to write
37702 + * @pos: position in file to write to
37703 + *
37704 + * This is implementation of vfs's write method of struct file_operations for
37705 + * unix file plugin.
37706 + */
37707 +ssize_t write_unix_file(struct file *file, const char __user *buf,
37708 + size_t count, loff_t *pos)
37709 +{
37710 + int result;
37711 + reiser4_context *ctx;
37712 + struct inode *inode;
37713 + unix_file_info_t *uf_info;
37714 + ssize_t written;
37715 + int try_free_space;
37716 + int to_write = PAGE_CACHE_SIZE * WRITE_GRANULARITY;
37717 + size_t left;
37718 + ssize_t (*write_op)(struct file *, const char __user *, size_t,
37719 + loff_t *pos);
37720 + int ea;
37721 + loff_t new_size;
37722 +
37723 + inode = file->f_dentry->d_inode;
37724 + ctx = init_context(inode->i_sb);
37725 + if (IS_ERR(ctx))
37726 + return PTR_ERR(ctx);
37727 +
37728 + mutex_lock(&inode->i_mutex);
37729 +
37730 + assert("vs-947", !inode_get_flag(inode, REISER4_NO_SD));
37731 + assert("vs-9471", (!inode_get_flag(inode, REISER4_PART_MIXED)));
37732 +
37733 + /* check amount of bytes to write and writing position */
37734 + result = generic_write_checks(file, pos, &count, 0);
37735 + if (result) {
37736 + mutex_unlock(&inode->i_mutex);
37737 + context_set_commit_async(ctx);
37738 + reiser4_exit_context(ctx);
37739 + return result;
37740 + }
37741 +
37742 + result = remove_suid(file->f_dentry);
37743 + if (result) {
37744 + mutex_unlock(&inode->i_mutex);
37745 + context_set_commit_async(ctx);
37746 + reiser4_exit_context(ctx);
37747 + return result;
37748 + }
37749 +
37750 + uf_info = unix_file_inode_data(inode);
37751 +
37752 + current->backing_dev_info = inode->i_mapping->backing_dev_info;
37753 + written = 0;
37754 + try_free_space = 0;
37755 + left = count;
37756 + ea = NEITHER_OBTAINED;
37757 +
37758 + new_size = i_size_read(inode);
37759 + if (*pos + count > new_size)
37760 + new_size = *pos + count;
37761 +
37762 + while (left) {
37763 + if (left < to_write)
37764 + to_write = left;
37765 +
37766 + if (uf_info->container == UF_CONTAINER_EMPTY) {
37767 + get_exclusive_access(uf_info);
37768 + ea = EA_OBTAINED;
37769 + if (uf_info->container != UF_CONTAINER_EMPTY) {
37770 + /* file is made not empty by another process */
37771 + drop_exclusive_access(uf_info);
37772 + ea = NEITHER_OBTAINED;
37773 + continue;
37774 + }
37775 + } else if (uf_info->container == UF_CONTAINER_UNKNOWN) {
37776 + /*
37777 + * get exclusive access directly just to not have to
37778 + * re-obtain it if file will appear empty
37779 + */
37780 + get_exclusive_access(uf_info);
37781 + ea = EA_OBTAINED;
37782 + result = find_file_state(inode, uf_info);
37783 + if (result) {
37784 + drop_exclusive_access(uf_info);
37785 + ea = NEITHER_OBTAINED;
37786 + break;
37787 + }
37788 + } else {
37789 + get_nonexclusive_access(uf_info);
37790 + ea = NEA_OBTAINED;
37791 + }
37792 +
37793 + /* either EA or NEA is obtained. Choose item write method */
37794 + if (uf_info->container == UF_CONTAINER_EXTENTS) {
37795 + /* file is built of extent items */
37796 + write_op = write_extent;
37797 + } else if (uf_info->container == UF_CONTAINER_EMPTY) {
37798 + /* file is empty */
37799 + if (should_have_notail(uf_info, new_size))
37800 + write_op = write_extent;
37801 + else
37802 + write_op = write_tail;
37803 + } else {
37804 + /* file is built of tail items */
37805 + if (should_have_notail(uf_info, new_size)) {
37806 + if (ea == NEA_OBTAINED) {
37807 + drop_nonexclusive_access(uf_info);
37808 + get_exclusive_access(uf_info);
37809 + ea = EA_OBTAINED;
37810 + }
37811 + if (uf_info->container == UF_CONTAINER_TAILS) {
37812 + /*
37813 +					 * if file is being converted by another
37814 + * process - wait until it completes
37815 + */
37816 + while (1) {
37817 + if (inode_get_flag(inode, REISER4_PART_IN_CONV)) {
37818 + drop_exclusive_access(uf_info);
37819 + schedule();
37820 + get_exclusive_access(uf_info);
37821 + continue;
37822 + }
37823 + break;
37824 + }
37825 + if (uf_info->container == UF_CONTAINER_TAILS) {
37826 + result = tail2extent(uf_info);
37827 + if (result)
37828 + break;
37829 + }
37830 + }
37831 + drop_exclusive_access(uf_info);
37832 + ea = NEITHER_OBTAINED;
37833 + continue;
37834 + }
37835 + write_op = write_tail;
37836 + }
37837 +
37838 + written = write_op(file, buf, to_write, pos);
37839 + if (written == -ENOSPC && try_free_space) {
37840 + drop_access(uf_info);
37841 + txnmgr_force_commit_all(inode->i_sb, 0);
37842 + try_free_space = 0;
37843 + continue;
37844 + }
37845 + if (written < 0) {
37846 + drop_access(uf_info);
37847 + result = written;
37848 + break;
37849 + }
37850 + /* something is written. */
37851 + if (uf_info->container == UF_CONTAINER_EMPTY) {
37852 + assert("", ea == EA_OBTAINED);
37853 + uf_info->container = (write_op == write_extent) ?
37854 + UF_CONTAINER_EXTENTS : UF_CONTAINER_TAILS;
37855 + } else {
37856 + assert("", ergo(uf_info->container == UF_CONTAINER_EXTENTS,
37857 + write_op == write_extent));
37858 + assert("", ergo(uf_info->container == UF_CONTAINER_TAILS,
37859 + write_op == write_tail));
37860 + }
37861 + if (*pos + written > inode->i_size)
37862 + INODE_SET_FIELD(inode, i_size, *pos + written);
37863 + file_update_time(file);
37864 + result = reiser4_update_sd(inode);
37865 + if (result) {
37866 + mutex_unlock(&inode->i_mutex);
37867 + current->backing_dev_info = NULL;
37868 + drop_access(uf_info);
37869 + context_set_commit_async(ctx);
37870 + reiser4_exit_context(ctx);
37871 + return result;
37872 + }
37873 + drop_access(uf_info);
37874 + ea = NEITHER_OBTAINED;
37875 + txn_restart(ctx);
37876 + current->journal_info = NULL;
37877 + /*
37878 + * tell VM how many pages were dirtied. Maybe number of pages
37879 + * which were dirty already should not be counted
37880 + */
37881 + balance_dirty_pages(inode->i_mapping);
37882 + current->journal_info = ctx;
37883 +
37884 + left -= written;
37885 + buf += written;
37886 + *pos += written;
37887 + }
37888 +
37889 + mutex_unlock(&inode->i_mutex);
37890 +
37891 + if (result == 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
37892 + txn_restart_current();
37893 + grab_space_enable();
37894 + result = sync_unix_file(file, file->f_dentry,
37895 + 0 /* data and stat data */ );
37896 + if (result)
37897 + warning("reiser4-7", "failed to sync file %llu",
37898 + (unsigned long long)get_inode_oid(inode));
37899 + }
37900 +
37901 + current->backing_dev_info = NULL;
37902 +
37903 + reiser4_exit_context(ctx);
37904 +
37905 + /*
37906 + * return number of written bytes or error code if nothing is
37907 +	 * written. Note that this does not work correctly when
37908 +	 * sync_unix_file returns an error
37909 + */
37910 + return (count - left) ? (count - left) : result;
37911 +}
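+
+/*
+ * Summary of the write_op dispatch in the loop above (derived from the code,
+ * for orientation):
+ *
+ *	container at entry        access taken   write_op
+ *	UF_CONTAINER_EXTENTS      NEA            write_extent
+ *	UF_CONTAINER_EMPTY        EA             write_extent or write_tail,
+ *	                                         per should_have_notail()
+ *	UF_CONTAINER_TAILS and    EA             tail2extent first, then
+ *	 should_have_notail()                    retry the loop
+ *	UF_CONTAINER_TAILS        NEA            write_tail
+ */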
37912 +
37913 +/**
37914 + * release_unix_file - release of struct file_operations
37915 + * @inode: inode of released file
37916 + * @file: file to release
37917 + *
37918 + * Implementation of release method of struct file_operations for unix file
37919 + * plugin. If the last reference to the inode is released - convert all extent items
37920 + * into tail items if necessary. Frees reiser4 specific file data.
37921 + */
37922 +int release_unix_file(struct inode *inode, struct file *file)
37923 +{
37924 + reiser4_context *ctx;
37925 + unix_file_info_t *uf_info;
37926 + int result;
37927 + int in_reiser4;
37928 +
37929 + in_reiser4 = is_in_reiser4_context();
37930 +
37931 + ctx = init_context(inode->i_sb);
37932 + if (IS_ERR(ctx))
37933 + return PTR_ERR(ctx);
37934 +
37935 + result = 0;
37936 + if (in_reiser4 == 0) {
37937 + uf_info = unix_file_inode_data(inode);
37938 +
37939 + down(&uf_info->write);
37940 + get_exclusive_access(uf_info);
37941 + if (atomic_read(&file->f_dentry->d_count) == 1 &&
37942 + uf_info->container == UF_CONTAINER_EXTENTS &&
37943 + !should_have_notail(uf_info, inode->i_size) &&
37944 + !rofs_inode(inode)) {
37945 + result = extent2tail(uf_info);
37946 + if (result != 0) {
37947 + warning("nikita-3233",
37948 + "Failed (%d) to convert in %s (%llu)",
37949 + result, __FUNCTION__,
37950 + (unsigned long long)
37951 + get_inode_oid(inode));
37952 + }
37953 + }
37954 + drop_exclusive_access(uf_info);
37955 + up(&uf_info->write);
37956 + } else {
37957 + /*
37958 +		   we are within reiser4 context already. How is that
37959 +		   possible? Simple:
37960 +
37961 + (gdb) bt
37962 + #0 get_exclusive_access ()
37963 + #2 0xc01e56d3 in release_unix_file ()
37964 + #3 0xc01c3643 in reiser4_release ()
37965 + #4 0xc014cae0 in __fput ()
37966 + #5 0xc013ffc3 in remove_vm_struct ()
37967 + #6 0xc0141786 in exit_mmap ()
37968 + #7 0xc0118480 in mmput ()
37969 + #8 0xc0133205 in oom_kill ()
37970 + #9 0xc01332d1 in out_of_memory ()
37971 + #10 0xc013bc1d in try_to_free_pages ()
37972 + #11 0xc013427b in __alloc_pages ()
37973 + #12 0xc013f058 in do_anonymous_page ()
37974 + #13 0xc013f19d in do_no_page ()
37975 + #14 0xc013f60e in handle_mm_fault ()
37976 + #15 0xc01131e5 in do_page_fault ()
37977 + #16 0xc0104935 in error_code ()
37978 + #17 0xc025c0c6 in __copy_to_user_ll ()
37979 + #18 0xc01d496f in read_tail ()
37980 + #19 0xc01e4def in read_unix_file ()
37981 + #20 0xc01c3504 in reiser4_read ()
37982 + #21 0xc014bd4f in vfs_read ()
37983 + #22 0xc014bf66 in sys_read ()
37984 + */
37985 + warning("vs-44", "out of memory?");
37986 + }
37987 +
37988 + reiser4_free_file_fsdata(file);
37989 +
37990 + reiser4_exit_context(ctx);
37991 + return result;
37992 +}
37993 +
37994 +static void set_file_notail(struct inode *inode)
37995 +{
37996 + reiser4_inode *state;
37997 + formatting_plugin *tplug;
37998 +
37999 + state = reiser4_inode_data(inode);
38000 + tplug = formatting_plugin_by_id(NEVER_TAILS_FORMATTING_ID);
38001 + plugin_set_formatting(&state->pset, tplug);
38002 + inode_set_plugin(inode,
38003 + formatting_plugin_to_plugin(tplug), PSET_FORMATTING);
38004 +}
38005 +
38006 +/* if file is built of tails - convert it to extents */
38007 +static int unpack(struct file *filp, struct inode *inode, int forever)
38008 +{
38009 + int result = 0;
38010 + unix_file_info_t *uf_info;
38011 +
38012 + uf_info = unix_file_inode_data(inode);
38013 + assert("vs-1628", ea_obtained(uf_info));
38014 +
38015 + result = find_file_state(inode, uf_info);
38016 + if (result)
38017 + return result;
38018 + assert("vs-1074", uf_info->container != UF_CONTAINER_UNKNOWN);
38019 +
38020 + if (uf_info->container == UF_CONTAINER_TAILS) {
38021 + /*
38022 + * if the file is being converted by another process, wait until
38023 + * the conversion completes
38024 + */
38025 + while (1) {
38026 + if (inode_get_flag(inode, REISER4_PART_IN_CONV)) {
38027 + drop_exclusive_access(uf_info);
38028 + schedule();
38029 + get_exclusive_access(uf_info);
38030 + continue;
38031 + }
38032 + break;
38033 + }
38034 + if (uf_info->container == UF_CONTAINER_TAILS) {
38035 + result = tail2extent(uf_info);
38036 + if (result)
38037 + return result;
38038 + }
38039 + }
38040 + if (forever) {
38041 + /* save the new formatting plugin in the stat data */
38042 + __u64 tograb;
38043 +
38044 + set_file_notail(inode);
38045 +
38046 + grab_space_enable();
38047 + tograb = inode_file_plugin(inode)->estimate.update(inode);
38048 + result = reiser4_grab_space(tograb, BA_CAN_COMMIT);
38049 + if (result == 0) result = reiser4_update_sd(inode);
38050 + }
38051 +
38052 + return result;
38053 +}
38054 +
38055 +/* implementation of the vfs ioctl method of struct file_operations for the
38056 + unix file plugin
38057 +*/
38058 +int
38059 +ioctl_unix_file(struct inode *inode, struct file *filp,
38060 + unsigned int cmd, unsigned long arg UNUSED_ARG)
38061 +{
38062 + reiser4_context *ctx;
38063 + int result;
38064 +
38065 + ctx = init_context(inode->i_sb);
38066 + if (IS_ERR(ctx))
38067 + return PTR_ERR(ctx);
38068 +
38069 + switch (cmd) {
38070 + case REISER4_IOC_UNPACK:
38071 + get_exclusive_access(unix_file_inode_data(inode));
38072 + result = unpack(filp, inode, 1 /* forever */ );
38073 + drop_exclusive_access(unix_file_inode_data(inode));
38074 + break;
38075 +
38076 + default:
38077 + result = RETERR(-ENOSYS);
38078 + break;
38079 + }
38080 + reiser4_exit_context(ctx);
38081 + return result;
38082 +}
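+
+/*
+ * Userspace sketch (not part of the patch): how the unpack ioctl above
+ * would typically be driven. It assumes REISER4_IOC_UNPACK is exported to
+ * userspace via the reiser4 headers; the ioctl argument is unused.
+ */
+#if 0
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <unistd.h>
+
+static int unpack_file(const char *path)
+{
+ int err, fd;
+
+ fd = open(path, O_RDONLY);
+ if (fd < 0)
+ return -1;
+ /* convert a tail-packed file to extents and pin it in that state */
+ err = ioctl(fd, REISER4_IOC_UNPACK, 0);
+ close(fd);
+ return err;
+}
+#endif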
38083 +
38084 +/* implementation of the vfs bmap method of struct address_space_operations
38085 + for the unix file plugin
38086 +*/
38087 +sector_t bmap_unix_file(struct address_space * mapping, sector_t lblock)
38088 +{
38089 + reiser4_context *ctx;
38090 + sector_t result;
38091 + reiser4_key key;
38092 + coord_t coord;
38093 + lock_handle lh;
38094 + struct inode *inode;
38095 + item_plugin *iplug;
38096 + sector_t block;
38097 +
38098 + inode = mapping->host;
38099 +
38100 + ctx = init_context(inode->i_sb);
38101 + if (IS_ERR(ctx))
38102 + return PTR_ERR(ctx);
38103 + key_by_inode_and_offset_common(inode,
38104 + (loff_t) lblock * current_blocksize,
38105 + &key);
38106 +
38107 + init_lh(&lh);
38108 + result =
38109 + find_file_item_nohint(&coord, &lh, &key, ZNODE_READ_LOCK, inode);
38110 + if (cbk_errored(result)) {
38111 + done_lh(&lh);
38112 + reiser4_exit_context(ctx);
38113 + return result;
38114 + }
38115 +
38116 + result = zload(coord.node);
38117 + if (result) {
38118 + done_lh(&lh);
38119 + reiser4_exit_context(ctx);
38120 + return result;
38121 + }
38122 +
38123 + iplug = item_plugin_by_coord(&coord);
38124 + if (iplug->s.file.get_block) {
38125 + result = iplug->s.file.get_block(&coord, lblock, &block);
38126 + if (result == 0)
38127 + result = block;
38128 + } else
38129 + result = RETERR(-EINVAL);
38130 +
38131 + zrelse(coord.node);
38132 + done_lh(&lh);
38133 + reiser4_exit_context(ctx);
38134 + return result;
38135 +}
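+
+/*
+ * Userspace sketch (not part of the patch): ->bmap is what backs the
+ * standard FIBMAP ioctl, which maps a logical block of a file to a disk
+ * block number. FIBMAP requires CAP_SYS_RAWIO.
+ */
+#if 0
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <linux/fs.h> /* FIBMAP */
+
+static long disk_block_of(int fd, int logical_block)
+{
+ int blk = logical_block;
+
+ /* the kernel routes this to bmap_unix_file() via aops->bmap */
+ if (ioctl(fd, FIBMAP, &blk) < 0)
+ return -1;
+ return blk;
+}
+#endif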
38136 +
38137 +/**
38138 + * flow_by_inode_unix_file - initialize structure flow
38139 + * @inode: inode of file for which read or write is about to be performed
38140 + * @buf: buffer to read into or write from
38141 + * @user: flag showing whether @buf is user space or kernel space
38142 + * @size: size of buffer @buf
38143 + * @off: start offset for read or write
38144 + * @op: READ or WRITE
38145 + * @flow: flow to initialize
38146 + *
38147 + * Initializes fields of @flow: key, size of data, i/o mode (read or write).
38148 + */
38149 +int flow_by_inode_unix_file(struct inode *inode,
38150 + const char __user *buf, int user,
38151 + loff_t size, loff_t off,
38152 + rw_op op, flow_t *flow)
38153 +{
38154 + assert("nikita-1100", inode != NULL);
38155 +
38156 + flow->length = size;
38157 + memcpy(&flow->data, &buf, sizeof(buf));
38158 + flow->user = user;
38159 + flow->op = op;
38160 + assert("nikita-1931", inode_file_plugin(inode) != NULL);
38161 + assert("nikita-1932",
38162 + inode_file_plugin(inode)->key_by_inode ==
38163 + key_by_inode_and_offset_common);
38164 + /* calculate key of write position and insert it into flow->key */
38165 + return key_by_inode_and_offset_common(inode, off, &flow->key);
38166 +}
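+
+/*
+ * Sketch (not part of the patch) of how a read path could fill a flow
+ * with the helper above; READ_OP is assumed from reiser4's rw_op enum.
+ */
+#if 0
+static int build_read_flow(struct inode *inode, char __user *buf,
+ size_t count, loff_t pos, flow_t *f)
+{
+ /* @f gets length, data pointer, user-space flag and the key of @pos */
+ return flow_by_inode_unix_file(inode, buf, 1 /* user space */,
+ count, pos, READ_OP, f);
+}
+#endif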
38167 +
38168 +/* plugin->u.file.set_plug_in_sd = NULL
38169 + plugin->u.file.set_plug_in_inode = NULL
38170 + plugin->u.file.create_blank_sd = NULL */
38171 +/* plugin->u.file.delete */
38172 +/*
38173 + plugin->u.file.add_link = add_link_common
38174 + plugin->u.file.rem_link = NULL */
38175 +
38176 +/* plugin->u.file.owns_item
38177 + this is common_file_owns_item with assertion */
38178 +/* Audited by: green(2002.06.15) */
38179 +int
38180 +owns_item_unix_file(const struct inode *inode /* object to check against */ ,
38181 + const coord_t * coord /* coord to check */ )
38182 +{
38183 + int result;
38184 +
38185 + result = owns_item_common(inode, coord);
38186 + if (!result)
38187 + return 0;
38188 + if (item_type_by_coord(coord) != UNIX_FILE_METADATA_ITEM_TYPE)
38189 + return 0;
38190 + assert("vs-547",
38191 + item_id_by_coord(coord) == EXTENT_POINTER_ID ||
38192 + item_id_by_coord(coord) == FORMATTING_ID);
38193 + return 1;
38194 +}
38195 +
38196 +static int setattr_truncate(struct inode *inode, struct iattr *attr)
38197 +{
38198 + int result;
38199 + int s_result;
38200 + loff_t old_size;
38201 + reiser4_tree *tree;
38202 +
38203 + inode_check_scale(inode, inode->i_size, attr->ia_size);
38204 +
38205 + old_size = inode->i_size;
38206 + tree = tree_by_inode(inode);
38207 +
38208 + result = safe_link_grab(tree, BA_CAN_COMMIT);
38209 + if (result == 0)
38210 + result = safe_link_add(inode, SAFE_TRUNCATE);
38211 + if (result == 0)
38212 + result = truncate_file_body(inode, attr->ia_size);
38213 + if (result)
38214 + warning("vs-1588", "truncate_file failed: oid %lli, "
38215 + "old size %lld, new size %lld, retval %d",
38216 + (unsigned long long)get_inode_oid(inode),
38217 + old_size, attr->ia_size, result);
38218 +
38219 + s_result = safe_link_grab(tree, BA_CAN_COMMIT);
38220 + if (s_result == 0)
38221 + s_result =
38222 + safe_link_del(tree, get_inode_oid(inode), SAFE_TRUNCATE);
38223 + if (s_result != 0) {
38224 + warning("nikita-3417", "Cannot kill safelink %lli: %i",
38225 + (unsigned long long)get_inode_oid(inode), s_result);
38226 + }
38227 + safe_link_release(tree);
38228 + return result;
38229 +}
38230 +
38231 +/* plugin->u.file.setattr method */
38232 +/* This calls inode_setattr and if truncate is in effect it also takes
38233 + exclusive inode access to avoid races */
38234 +int setattr_unix_file(struct dentry *dentry, /* Object to change attributes */
38235 + struct iattr *attr /* change description */ )
38236 +{
38237 + int result;
38238 +
38239 + if (attr->ia_valid & ATTR_SIZE) {
38240 + reiser4_context *ctx;
38241 + unix_file_info_t *uf_info;
38242 +
38243 + /* truncate does its own space reservation and requires
38244 + exclusive access */
38245 + ctx = init_context(dentry->d_inode->i_sb);
38246 + if (IS_ERR(ctx))
38247 + return PTR_ERR(ctx);
38248 +
38249 + uf_info = unix_file_inode_data(dentry->d_inode);
38250 + down(&uf_info->write);
38251 + get_exclusive_access(uf_info);
38252 + result = setattr_truncate(dentry->d_inode, attr);
38253 + drop_exclusive_access(uf_info);
38254 + up(&uf_info->write);
38255 + context_set_commit_async(ctx);
38256 + reiser4_exit_context(ctx);
38257 + } else
38258 + result = setattr_common(dentry, attr);
38259 +
38260 + return result;
38261 +}
38262 +
38263 +/* plugin->u.file.init_inode_data */
38264 +void
38265 +init_inode_data_unix_file(struct inode *inode,
38266 + reiser4_object_create_data * crd, int create)
38267 +{
38268 + unix_file_info_t *data;
38269 +
38270 + data = unix_file_inode_data(inode);
38271 + data->container = create ? UF_CONTAINER_EMPTY : UF_CONTAINER_UNKNOWN;
38272 + init_rwsem(&data->latch);
38273 + sema_init(&data->write, 1);
38274 + data->tplug = inode_formatting_plugin(inode);
38275 + data->exclusive_use = 0;
38276 +
38277 +#if REISER4_DEBUG
38278 + data->ea_owner = NULL;
38279 + atomic_set(&data->nr_neas, 0);
38280 +#endif
38281 + init_inode_ordering(inode, crd, create);
38282 +}
38283 +
38284 +/**
38285 + * delete_object_unix_file - delete_object of file_plugin
38286 + * @inode: inode to be deleted
38287 + *
38288 + * Truncates file to length 0, removes stat data and safe link.
38289 + */
38290 +int delete_object_unix_file(struct inode *inode)
38291 +{
38292 + unix_file_info_t *uf_info;
38293 + int result;
38294 +
38295 + if (inode_get_flag(inode, REISER4_NO_SD))
38296 + return 0;
38297 +
38298 + /* truncate the file body first */
38299 + uf_info = unix_file_inode_data(inode);
38300 + get_exclusive_access(uf_info);
38301 + result = truncate_file_body(inode, 0 /* size */ );
38302 + drop_exclusive_access(uf_info);
38303 +
38304 + if (result)
38305 + warning("", "failed to truncate file (%llu) on removal: %d",
38306 + (unsigned long long)get_inode_oid(inode), result);
38307 +
38308 + /* remove stat data and safe link */
38309 + return delete_object_common(inode);
38310 +}
38311 +
38312 +/**
38313 + * sendfile_unix_file - sendfile of struct file_operations
38314 + * @file: file to be sent
38315 + * @ppos: position to start from
38316 + * @count: number of bytes to send
38317 + * @actor: function to copy data
38318 + * @target: where to copy read data
38319 + *
38320 + * Reads @count bytes from @file and calls @actor for every page read. This is
38321 + * needed for loopback device support.
38322 + */
38323 +ssize_t
38324 +sendfile_unix_file(struct file *file, loff_t *ppos, size_t count,
38325 + read_actor_t actor, void *target)
38326 +{
38327 + reiser4_context *ctx;
38328 + ssize_t result;
38329 + struct inode *inode;
38330 + unix_file_info_t *uf_info;
38331 +
38332 + inode = file->f_dentry->d_inode;
38333 + ctx = init_context(inode->i_sb);
38334 + if (IS_ERR(ctx))
38335 + return PTR_ERR(ctx);
38336 +
38337 + /*
38338 + * generic_file_sendfile may want to call update_atime. Grab space for
38339 + * stat data update
38340 + */
38341 + result = reiser4_grab_space(estimate_update_common(inode),
38342 + BA_CAN_COMMIT);
38343 + if (result)
38344 + goto error;
38345 + mutex_lock(&inode->i_mutex);
38346 + inode_set_flag(inode, REISER4_HAS_MMAP);
38347 + mutex_unlock(&inode->i_mutex);
38348 +
38349 + uf_info = unix_file_inode_data(inode);
38350 + get_nonexclusive_access(uf_info);
38351 + result = generic_file_sendfile(file, ppos, count, actor, target);
38352 + drop_nonexclusive_access(uf_info);
38353 + error:
38354 + reiser4_exit_context(ctx);
38355 + return result;
38356 +}
38357 +
38358 +int
38359 +prepare_write_unix_file(struct file *file, struct page *page,
38360 + unsigned from, unsigned to)
38361 +{
38362 + reiser4_context *ctx;
38363 + unix_file_info_t *uf_info;
38364 + int ret;
38365 +
38366 + ctx = init_context(file->f_dentry->d_inode->i_sb);
38367 + if (IS_ERR(ctx))
38368 + return PTR_ERR(ctx);
38369 +
38370 + uf_info = unix_file_inode_data(file->f_dentry->d_inode);
38371 + get_exclusive_access(uf_info);
38372 + ret = find_file_state(file->f_dentry->d_inode, uf_info);
38373 + if (ret == 0) {
38374 + if (uf_info->container == UF_CONTAINER_TAILS)
38375 + ret = -EINVAL;
38376 + else
38377 + ret = do_prepare_write(file, page, from, to);
38378 + }
38379 + drop_exclusive_access(uf_info);
38380 +
38381 + /* don't commit transaction under inode semaphore */
38382 + context_set_commit_async(ctx);
38383 + reiser4_exit_context(ctx);
38384 + return ret;
38385 +}
38386 +
38387 +/*
38388 + * Local variables:
38389 + * c-indentation-style: "K&R"
38390 + * mode-name: "LC"
38391 + * c-basic-offset: 8
38392 + * tab-width: 8
38393 + * fill-column: 79
38394 + * scroll-step: 1
38395 + * End:
38396 + */
38397 Index: linux-2.6.16/fs/reiser4/plugin/file/file.h
38398 ===================================================================
38399 --- /dev/null
38400 +++ linux-2.6.16/fs/reiser4/plugin/file/file.h
38401 @@ -0,0 +1,257 @@
38402 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
38403 + * reiser4/README */
38404 +
38405 +/* this file contains declarations of methods implementing file plugins
38406 + (UNIX_FILE_PLUGIN_ID, SYMLINK_FILE_PLUGIN_ID and CRC_FILE_PLUGIN_ID) */
38407 +
38408 +#if !defined( __REISER4_FILE_H__ )
38409 +#define __REISER4_FILE_H__
38410 +
38411 +/* declarations of functions implementing UNIX_FILE_PLUGIN_ID file plugin */
38412 +
38413 +/* inode operations */
38414 +int setattr_unix_file(struct dentry *, struct iattr *);
38415 +
38416 +/* file operations */
38417 +ssize_t read_unix_file(struct file *, char __user *buf, size_t read_amount,
38418 + loff_t *off);
38419 +ssize_t write_unix_file(struct file *, const char __user *buf, size_t write_amount,
38420 + loff_t * off);
38421 +int ioctl_unix_file(struct inode *, struct file *, unsigned int cmd,
38422 + unsigned long arg);
38423 +int mmap_unix_file(struct file *, struct vm_area_struct *);
38424 +int open_unix_file(struct inode *, struct file *);
38425 +int release_unix_file(struct inode *, struct file *);
38426 +int sync_unix_file(struct file *, struct dentry *, int datasync);
38427 +ssize_t sendfile_unix_file(struct file *, loff_t *ppos, size_t count,
38428 + read_actor_t, void *target);
38429 +
38430 +/* address space operations */
38431 +int readpage_unix_file(struct file *, struct page *);
38432 +int readpage_unix_file_nolock(struct file *, struct page *);
38433 +int writepages_unix_file(struct address_space *, struct writeback_control *);
38434 +int prepare_write_unix_file(struct file *, struct page *, unsigned from,
38435 + unsigned to);
38436 +int commit_write_unix_file(struct file *, struct page *, unsigned from,
38437 + unsigned to);
38438 +sector_t bmap_unix_file(struct address_space *, sector_t lblock);
38439 +
38440 +/* file plugin operations */
38441 +int flow_by_inode_unix_file(struct inode *, const char __user *buf,
38442 + int user, loff_t, loff_t, rw_op, flow_t *);
38443 +int owns_item_unix_file(const struct inode *, const coord_t *);
38444 +void init_inode_data_unix_file(struct inode *, reiser4_object_create_data *,
38445 + int create);
38446 +int delete_object_unix_file(struct inode *);
38447 +
38448 +/*
38449 + * all writes into a unix file are performed by the item write method. The
38450 + * write method of the unix file plugin only decides which item plugin (extent
38451 + * or tail) to call, and in which mode (one of the enum below; see the sketch after it)
38452 + */
38453 +typedef enum {
38454 + FIRST_ITEM = 1,
38455 + APPEND_ITEM = 2,
38456 + OVERWRITE_ITEM = 3
38457 +} write_mode_t;
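+
+/*
+ * Illustrative sketch only (not the actual reiser4 decision logic): one
+ * plausible way a write mode could be derived from the write position
+ * relative to the data already stored in the file.
+ */
+#if 0
+static write_mode_t pick_write_mode(loff_t file_size, loff_t pos)
+{
+ if (file_size == 0)
+ return FIRST_ITEM; /* no items on disk yet */
+ if (pos >= file_size)
+ return APPEND_ITEM; /* extending past the last byte */
+ return OVERWRITE_ITEM; /* rewriting existing bytes */
+}
+#endif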
38458 +
38459 +/* a unix file may be in one of the following states */
38460 +typedef enum {
38461 + UF_CONTAINER_UNKNOWN = 0,
38462 + UF_CONTAINER_TAILS = 1,
38463 + UF_CONTAINER_EXTENTS = 2,
38464 + UF_CONTAINER_EMPTY = 3
38465 +} file_container_t;
38466 +
38467 +struct formatting_plugin;
38468 +struct inode;
38469 +
38470 +/* unix file plugin specific part of reiser4 inode */
38471 +typedef struct unix_file_info {
38472 + /*
38473 + * this read-write lock protects file containerization change. Accesses
38474 + * which do not change file containerization (see file_container_t)
38475 + * (read, readpage, writepage, write (until tail conversion is
38476 + * involved)) take read-lock. Accesses which modify file
38477 + * containerization (truncate, conversion from tail to extent and back)
38478 + * take write-lock.
38479 + */
38480 + struct rw_semaphore latch;
38481 + /*
38482 + * this semaphore is used to serialize writes instead of inode->i_mutex,
38483 + * because write_unix_file uses get_user_pages, which must be called
38484 + * under mm->mmap_sem, and mm->mmap_sem must be taken before
38485 + * inode->i_mutex; inode->i_mutex would therefore have to be unlocked
38486 + * before calling get_user_pages, which is unacceptable
38487 + */
38488 + struct semaphore write;
38489 + /* this enum specifies which items are used to build the file */
38490 + file_container_t container;
38491 + /*
38492 + * plugin which controls when file is to be converted to extents and
38493 + * back to tail
38494 + */
38495 + struct formatting_plugin *tplug;
38496 + /* if this is set, file is in exclusive use */
38497 + int exclusive_use;
38498 +#if REISER4_DEBUG
38499 + /* pointer to task struct of thread owning exclusive access to file */
38500 + void *ea_owner;
38501 + atomic_t nr_neas;
38502 + void *last_reader;
38503 +#endif
38504 +} unix_file_info_t;
38505 +
38506 +struct unix_file_info *unix_file_inode_data(const struct inode *inode);
38507 +void get_exclusive_access(unix_file_info_t *);
38508 +void drop_exclusive_access(unix_file_info_t *);
38509 +void get_nonexclusive_access(unix_file_info_t *);
38510 +void drop_nonexclusive_access(unix_file_info_t *);
38511 +int try_to_get_nonexclusive_access(unix_file_info_t *);
38512 +int find_file_item(hint_t *, const reiser4_key *, znode_lock_mode,
38513 + struct inode *);
38514 +int find_file_item_nohint(coord_t *, lock_handle *,
38515 + const reiser4_key *, znode_lock_mode,
38516 + struct inode *);
38517 +
38518 +void validate_extended_coord(uf_coord_t *, loff_t offset);
38519 +int load_file_hint(struct file *, hint_t *);
38520 +void save_file_hint(struct file *, const hint_t *);
38521 +
38522 +
38523 +#include "../item/extent.h"
38524 +#include "../item/tail.h"
38525 +#include "../item/ctail.h"
38526 +
38527 +struct uf_coord {
38528 + coord_t coord;
38529 + lock_handle *lh;
38530 + int valid;
38531 + union {
38532 + extent_coord_extension_t extent;
38533 + tail_coord_extension_t tail;
38534 + ctail_coord_extension_t ctail;
38535 + } extension;
38536 +};
38537 +
38538 +#include "../../forward.h"
38539 +#include "../../seal.h"
38540 +#include "../../lock.h"
38541 +
38542 +/*
38543 + * This structure is used to speed up file operations (reads and writes). A
38544 + * hint is a suggestion about where a key resolved to last time. A seal
38545 + * indicates whether a node has been modified since a hint was last recorded.
38546 + * You check the seal, and if the seal is still valid, you can use the hint
38547 + * without traversing the tree again.
38548 + */
38549 +struct hint {
38550 + seal_t seal; /* a seal over last file item accessed */
38551 + uf_coord_t ext_coord;
38552 + loff_t offset;
38553 + znode_lock_mode mode;
38554 + lock_handle lh;
38555 +};
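+
+/*
+ * Minimal usage sketch (not part of the patch) for the seal/hint pair
+ * above, using the helpers declared below. Return-value conventions are
+ * assumed: hint_validate() returning 0 is taken to mean the seal held.
+ */
+#if 0
+static int lookup_with_hint(hint_t *hint, const reiser4_key *key)
+{
+ if (hint_is_set(hint) &&
+ hint_validate(hint, key, 1 /* check key */, ZNODE_READ_LOCK) == 0)
+ /* seal still valid: reuse hint->ext_coord, skip the tree walk */
+ return 0;
+ /* the node changed since the hint was recorded: drop the stale hint;
+ the caller re-traverses the tree and re-arms it via set_hint() */
+ unset_hint(hint);
+ return 1;
+}
+#endif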
38556 +
38557 +void set_hint(hint_t *, const reiser4_key *, znode_lock_mode);
38558 +int hint_is_set(const hint_t *);
38559 +void unset_hint(hint_t *);
38560 +int hint_validate(hint_t *, const reiser4_key *, int check_key,
38561 + znode_lock_mode);
38562 +void hint_init_zero(hint_t *);
38563 +
38564 +int update_file_size(struct inode *, reiser4_key *, int update_sd);
38565 +int cut_file_items(struct inode *, loff_t new_size, int update_sd,
38566 + loff_t cur_size, int (*update_actor) (struct inode *,
38567 + reiser4_key *, int));
38568 +
38569 +
38570 +#if REISER4_DEBUG
38571 +
38572 +/* return 1 if exclusive access is obtained, 0 otherwise */
38573 +static inline int ea_obtained(unix_file_info_t * uf_info)
38574 +{
38575 + int ret;
38576 +
38577 + ret = down_read_trylock(&uf_info->latch);
38578 + if (ret)
38579 + up_read(&uf_info->latch);
38580 + return !ret;
38581 +}
38582 +
38583 +#endif
38584 +
38585 +/* declarations of functions implementing SYMLINK_FILE_PLUGIN_ID file plugin */
38586 +int create_symlink(struct inode *symlink, struct inode *dir,
38587 + reiser4_object_create_data *);
38588 +void destroy_inode_symlink(struct inode *);
38589 +
38590 +/* declarations of functions implementing CRC_FILE_PLUGIN_ID file plugin */
38591 +
38592 +/* inode operations */
38593 +int setattr_cryptcompress(struct dentry *, struct iattr *);
38594 +
38595 +/* file operations */
38596 +ssize_t read_cryptcompress(struct file *, char __user *buf, size_t read_amount,
38597 + loff_t * off);
38598 +ssize_t write_cryptcompress(struct file *, const char __user *buf, size_t write_amount,
38599 + loff_t * off);
38600 +int mmap_cryptcompress(struct file *, struct vm_area_struct *);
38601 +ssize_t sendfile_cryptcompress(struct file *file, loff_t *ppos, size_t count,
38602 + read_actor_t actor, void *target);
38603 +int release_cryptcompress(struct inode *, struct file *);
38604 +
38605 +/* address space operations */
38606 +extern int readpage_cryptcompress(struct file *, struct page *);
38607 +extern int writepages_cryptcompress(struct address_space *,
38608 + struct writeback_control *);
38609 +
38610 +
38611 +/* file plugin operations */
38612 +int flow_by_inode_cryptcompress(struct inode *, const char __user *buf,
38613 + int user, loff_t, loff_t, rw_op, flow_t *);
38614 +int key_by_inode_cryptcompress(struct inode *, loff_t off, reiser4_key *);
38615 +int create_cryptcompress(struct inode *, struct inode *,
38616 + reiser4_object_create_data *);
38617 +int delete_cryptcompress(struct inode *);
38618 +void init_inode_data_cryptcompress(struct inode *, reiser4_object_create_data *,
38619 + int create);
38620 +int cut_tree_worker_cryptcompress(tap_t *, const reiser4_key * from_key,
38621 + const reiser4_key * to_key,
38622 + reiser4_key * smallest_removed,
38623 + struct inode *object, int truncate,
38624 + int *progress);
38625 +void destroy_inode_cryptcompress(struct inode *);
38626 +
38627 +extern reiser4_plugin_ops cryptcompress_plugin_ops;
38628 +
38629 +#define WRITE_GRANULARITY 32
38630 +
38631 +
38632 +int tail2extent(unix_file_info_t *);
38633 +int extent2tail(unix_file_info_t *);
38634 +
38635 +int goto_right_neighbor(coord_t *, lock_handle *);
38636 +int find_or_create_extent(struct page *);
38637 +int equal_to_ldk(znode *, const reiser4_key *);
38638 +
38639 +
38640 +extern inline int cbk_errored(int cbk_result)
38641 +{
38642 + return (cbk_result != CBK_COORD_NOTFOUND
38643 + && cbk_result != CBK_COORD_FOUND);
38644 +}
38645 +
38646 +/* __REISER4_FILE_H__ */
38647 +#endif
38648 +
38649 +/*
38650 + * Local variables:
38651 + * c-indentation-style: "K&R"
38652 + * mode-name: "LC"
38653 + * c-basic-offset: 8
38654 + * tab-width: 8
38655 + * fill-column: 79
38656 + * scroll-step: 1
38657 + * End:
38658 +*/
38659 Index: linux-2.6.16/fs/reiser4/plugin/file/invert.c
38660 ===================================================================
38661 --- /dev/null
38662 +++ linux-2.6.16/fs/reiser4/plugin/file/invert.c
38663 @@ -0,0 +1,493 @@
38664 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
38665 +
38666 +/* Suppose you want to conveniently read and write a large variety of small files within a single emacs
38667 + buffer, without having a separate buffer for each 8 byte or so file. Inverts are the way to do that. An invert
38668 + provides you with the contents of a set of subfiles plus its own contents. It is a file which inherits other files
38669 + when you read it, and allows you to write to it and through it to the files that it inherits from. In order for it
38670 + to know which subfiles each part of your write should go into, there must be delimiters indicating that. It tries to
38671 + make that easy for you by providing those delimiters in what you read from it.
38672 +
38673 + When you read it, an invert performs an inverted assignment. Instead of taking an assignment command and writing a
38674 + bunch of files, it takes a bunch of files and composes an assignment command for you to read from it that if executed
38675 + would create those files. But which files? Well, that must be specified in the body of the invert using a special
38676 + syntax, and that specification is called the invert of the assignment.
38677 +
38678 + When written to, an invert performs the assignment command that is written
38679 + to it, and modifies its own body to contain the invert of that
38680 + assignment.
38681 +
38682 + In other words, writing to an invert file what you have read from it
38683 + is the identity operation.
38684 +
38685 + Malformed assignments cause write errors. Partial writes are not
38686 + supported in v4.0, but will be.
38687 +
38688 + Example:
38689 +
38690 + If an invert contains:
38691 +
38692 + /filenameA/<>+"(some text stored in the invert)+/filenameB/<>
38693 +
38694 +======================
38695 +Each element in this definition should itself be an invert, and all
38696 +files should be resolved recursively as well. This is bad. If one of
38697 +the included files is not a regular or invert file, then we can't read
38698 +the main file.
38699 +
38700 +I think it is possible to make this easier:
38701 +
38702 +the internal structure of an invert file should be like that of a symlink
38703 +file, but the read and write method should be explicitly indicated in the
38704 +i/o operation. By default we read and write (where possible) as a symlink;
38705 +just as we can specify ..invert at read time, we can specify it at write
38706 +time.
38707 +
38708 +example:
38709 +/my_invert_file/..invert<- ( (/filenameA<-"(The contents of filenameA))+"(some text stored in the invert)+(/filenameB<-"(The contents of filenameB) ) )
38710 +will create /my_invert_file as an invert, and will create /filenameA and /filenameB with the specified bodies.
38711 +
38712 +read of /my_invert_file/..invert will be
38713 +/filenameA<-"(The contents of filenameA)+"(some text stored in the invert)+/filenameB<-"(The contents of filenameB)
38714 +
38715 +but read of /my_invert_file/ will be
38716 +The contents of filenameAsome text stored in the invertThe contents of filenameB
38717 +
38718 +we can also create this file as
38719 +/my_invert_file/<-/filenameA+"(some text stored in the invert)+/filenameB
38720 +which will create /my_invert_file and use the existing files /filenameA and /filenameB.
38721 +
38722 +and when we read it, the result will be as for the invert file above.
38723 +
38724 +Is this correct?
38725 +
38726 + vv
38727 +DEMIDOV-FIXME-HANS:
38728 +
38729 +Maybe you are right, but then you must disable writes to /my_invert_file/ and only allow writes to /my_invert_file/..invert
38730 +
38731 +Do you agree? Discuss it on reiserfs-list....
38732 +
38733 +-Hans
38734 +=======================
38735 +
38736 + Then a read will return:
38737 +
38738 + /filenameA<-"(The contents of filenameA)+"(some text stored in the invert)+/filenameB<-"(The contents of filenameB)
38739 +
38740 + and a write of the line above to the invert will set the contents of
38741 + the invert and filenameA and filenameB to their original values.
38742 +
38743 + Note that the contents of an invert have no influence on the effect
38744 + of a write unless the write is a partial write (and a write of a
38745 + shorter file without using truncate first is a partial write).
38746 +
38747 + truncate() has no effect on filenameA and filenameB; it merely
38748 + resets the value of the invert.
38749 +
38750 + Writes to subfiles via the invert are implemented by preceding them
38751 + with truncates.
38752 +
38753 + Parse failures cause write failures.
38754 +
38755 + Questions to ponder: should the invert be acted on prior to file
38756 + close when writing to an open file descriptor?
38757 +
38758 + Example:
38759 +
38760 + If an invert contains:
38761 +
38762 + "(This text and a pair of quotes are all that is here.)
38763 +
38764 +Then a read will return:
38765 +
38766 + "(This text and a pair of quotes are all that is here.)
38767 +
38768 +*/
38769 +
38770 +/* The OPEN method places a struct file in memory associated with the invert
38771 + body and returns something like a file descriptor to the user for future
38772 + access to the invert file.
38773 + During opening we parse the body of the invert and get a list of 'entries'
38774 + (which describe all its subfiles), and place a pointer to the first struct
38775 + in the reiser4-specific part of the invert inode (arbitrary decision).
38776 +
38777 + Each subfile is described by a struct inv_entry that has a pointer @sd to
38778 + an in-core stat-data and a pointer to a struct file @f (if we find that the
38779 + subfile uses more than one unformatted node (arbitrary decision), we load
38780 + a struct file in memory; otherwise we load the base stat-data (and maybe
38781 + 1-2 bytes of some other information we need)
38782 +
38783 + Since READ and WRITE methods for inverts were formulated in assignment
38784 + language, they don't contain arguments 'size' and 'offset' that make sense
38785 + only in ordinary read/write methods.
38786 +
38787 + The READ method is a combination of two methods:
38788 + 1) the ordinary read method (with offset=0, length = @f->...->i_size) for
38789 + entries with @f != 0; this method takes a pointer to struct file as an argument
38790 + 2) the read method for inode-less files with @sd != 0; this method uses the
38791 + in-core stat-data instead of a struct file as an argument.
38792 + In the first case we don't use the pagecache; we just copy the data we got
38793 + after cbk() into userspace.
38794 +
38795 + The WRITE method for invert files is more complex.
38796 + Besides the WRITE interface declared in the assignment language above, we
38797 + need a way to edit the unwrapped body of an invert file with some text
38798 + editor; this means we need a GENERIC WRITE METHOD for the invert file:
38799 +
38800 + my_invert_file/..invert <- "string"
38801 +
38802 + this method parses "string" and looks for correct subfile signatures; the
38803 + parsing process also splits this "string" into a set of flows in accordance
38804 + with the set of subfiles specified by this signature.
38805 + The found list of signatures #S is compared with the opened one #I of the
38806 + invert file. If the latter is absent (#I==0, as is the case for instance if
38807 + we have just created this invert file), the write method assigns the found
38808 + signature (#I=#S;) to the invert file. Then if #I==#S, the generic write
38809 + method splits itself into write methods for ordinary or light-weight files,
38810 + or calls itself recursively for invert files with the corresponding flows.
38811 + I am not sure, but the list of signatures looks like what Mr. Demidov means
38812 + by 'delimiters'.
38813 +
38814 + The cases where #S<#I (#I<#S) (in the set-theoretic sense) are also allowed
38815 + and cause subfiles to be deleted (created) (an arbitrary decision - it may
38816 + look too complex, but this interface will be the most complete). The order
38817 + of entries of list #S (#I) and the inherited order on #I (#S) must coincide.
38818 + Any other parsing result yields a malformed signature, which aborts the
38819 + method and releases all resources.
38820 +
38821 + Format of subfile (entry) signature:
38822 +
38823 + "START_MAGIC"<>(TYPE="...",LOOKUP_ARG="...")SUBFILE_BODY"END_MAGIC"
38824 +
38825 + Legend:
38826 +
38827 + START_MAGIC - keyword indicates the start of subfile signature;
38828 +
38829 + <> indicates the start of 'subfile metadata', that is the pair
38830 + (TYPE="...",LOOKUP_ARG="...") in parenthesis separated by comma.
38831 +
38832 + TYPE - the string "type" indicates the start of one of the three words:
38833 + - ORDINARY_FILE,
38834 + - LIGHT_WEIGHT_FILE,
38835 + - INVERT_FILE;
38836 +
38837 + LOOKUP_ARG - lookup argument depends on previous type:
38838 + */
38839 +
38840 + /************************************************************/
38841 + /* TYPE * LOOKUP ARGUMENT */
38842 + /************************************************************/
38843 + /* LIGHT_WEIGHT_FILE * stat-data key */
38844 + /************************************************************/
38845 + /* ORDINARY_FILE * filename */
38846 + /************************************************************/
38847 + /* INVERT_FILE * filename */
38848 + /************************************************************/
38849 +
38850 + /* where:
38851 + *stat-data key - a string containing the stat data key of this subfile; it will
38852 + be passed to the fast-access lookup method for light-weight files;
38853 + *filename - the pathname of this subfile; it will be passed to the VFS lookup
38854 + methods for ordinary and invert files;
38855 +
38856 + SUBFILE_BODY - data of this subfile (it will go to the flow)
38857 + END_MAGIC - the keyword indicates the end of subfile signature.
38858 +
38859 + The other symbols inside the signature are interpreted as 'unformatted content',
38860 + which is available with VFS's read_link() (arbitrary decision).
38861 +
38862 + NOTE: The parse method for the body of an invert file uses the signatures above
38863 + _without_ the subfile bodies.
38864 +
38865 + Now the only unclear thing is a WRITE into a regular light-weight subfile A,
38866 + which we can describe only in assignment language:
38867 +
38868 + A <- "some_string"
38869 +
38870 + I guess we don't want to change the stat-data and body items of file A if
38871 + this file exists and size(A) != size("some_string"), because this operation
38872 + is expensive; so we only do a partial write if size(A) > size("some_string"),
38873 + and truncate "some_string" and then do A <- "truncated string" if
38874 + size(A) < size("some_string"). This decision is also arbitrary..
38875 + */
38876 +
38877 +/* here is the infrastructure for formatted flows */
38878 +
38879 +#define SUBFILE_HEADER_MAGIC 0x19196605
38880 +#define FLOW_HEADER_MAGIC 0x01194304
38881 +
38882 +#include "../plugin.h"
38883 +#include "../../debug.h"
38884 +#include "../../forward.h"
38885 +#include "../object.h"
38886 +#include "../item/item.h"
38887 +#include "../item/static_stat.h"
38888 +#include "../../dformat.h"
38889 +#include "../znode.h"
38890 +#include "../inode.h"
38891 +
38892 +#include <linux/types.h>
38893 +#include <linux/fs.h> /* for struct file */
38894 +#include <linux/list.h> /* for struct list_head */
38895 +
38896 +typedef enum {
38897 + LIGHT_WEIGHT_FILE,
38898 + ORDINARY_FILE,
38899 + INVERT_FILE
38900 +} inv_entry_type;
38901 +
38902 +typedef struct flow_header {
38903 + d32 fh_magic;
38904 + d16 fh_nr; /* number of subfiles in the flow */
38905 +} flow_header;
38906 +
38907 +typedef struct subfile_header {
38908 + d32 sh_magic; /* subfile magic */
38909 + d16 sh_type; /* type of subfile: light-weight, ordinary, invert */
38910 + d16 sh_arg_len; /* length of lookup argument (filename, key) */
38911 + d32 sh_body_len; /* length of subfile body */
38912 +} subfile_header;
38913 +
38914 +/* functions to get/set fields of flow header */
38915 +
38916 +static void fl_set_magic(flow_header * fh, __u32 value)
38917 +{
38918 + cputod32(value, &fh->fh_magic);
38919 +}
38920 +
38921 +static __u32 fl_get_magic(flow_header * fh)
38922 +{
38923 + return d32tocpu(&fh->fh_magic);
38924 +}
38925 +static void fl_set_number(flow_header * fh, __u16 value)
38926 +{
38927 + cputod16(value, &fh->fh_nr);
38928 +}
38929 +static unsigned fl_get_number(flow_header * fh)
38930 +{
38931 + return d16tocpu(&fh->fh_nr);
38932 +}
38933 +
38934 +/* functions to get/set fields of subfile header */
38935 +
38936 +static void sh_set_magic(subfile_header * sh, __u32 value)
38937 +{
38938 + cputod32(value, &sh->sh_magic);
38939 +}
38940 +
38941 +static __u32 sh_get_magic(subfile_header * sh)
38942 +{
38943 + return d32tocpu(&sh->sh_magic);
38944 +}
38945 +static void sh_set_type(subfile_header * sh, __u16 value)
38946 +{
38947 + cputod16(value, &sh->sh_type);
38948 +}
38949 +static unsigned sh_get_type(subfile_header * sh)
38950 +{
38951 + return d16tocpu(&sh->sh_type);
38952 +}
38953 +static void sh_set_arg_len(subfile_header * sh, __u16 value)
38954 +{
38955 + cputod16(value, &sh->sh_arg_len);
38956 +}
38957 +static unsigned sh_get_arg_len(subfile_header * sh)
38958 +{
38959 + return d16tocpu(&sh->sh_arg_len);
38960 +}
38961 +static void sh_set_body_len(subfile_header * sh, __u32 value)
38962 +{
38963 + cputod32(value, &sh->sh_body_len);
38964 +}
38965 +
38966 +static __u32 sh_get_body_len(subfile_header * sh)
38967 +{
38968 + return d32tocpu(&sh->sh_body_len);
38969 +}
38970 +
38971 +/* in-core minimal stat-data, light-weight analog of inode */
38972 +
38973 +struct incore_sd_base {
38974 + umode_t isd_mode;
38975 + nlink_t isd_nlink;
38976 + loff_t isd_size;
38977 + char *isd_data; /* 'subflow' to write */
38978 +};
38979 +
38980 +/* opening an invert creates a list of invert entries,
38981 + each entry represented by struct inv_entry */
38982 +
38983 +struct inv_entry {
38984 + struct list_head ie_list;
38985 + struct file *ie_file; /* this is NULL if the file doesn't
38986 + have unformatted nodes */
38987 + struct incore_sd_base *ie_sd; /* inode-less analog of struct file */
38988 +};
38989 +
38990 +/* allocate and init invert entry */
38991 +
38992 +static struct inv_entry *allocate_inv_entry(void)
38993 +{
38994 + struct inv_entry *inv_entry;
38995 +
38996 + inv_entry = reiser4_kmalloc(sizeof(struct inv_entry), GFP_KERNEL);
38997 + if (!inv_entry)
38998 + return ERR_PTR(RETERR(-ENOMEM));
38999 + inv_entry->ie_file = NULL;
39000 + inv_entry->ie_sd = NULL;
39001 + INIT_LIST_HEAD(&inv_entry->ie_list);
39002 + return inv_entry;
39003 +}
39004 +
39005 +static int put_inv_entry(struct inv_entry *ientry)
39006 +{
39007 + int result = 0;
39008 +
39009 + assert("edward-96", ientry != NULL);
39010 + assert("edward-97", ientry->ie_list != NULL);
39011 +
39012 + list_del(ientry->ie_list);
39013 + if (ientry->ie_sd != NULL) {
39014 + kfree(ientry->ie_sd);
39015 + kfree(ientry);
39016 + }
39017 + if (ientry->ie_file != NULL)
39018 + result = filp_close(ientry->file, NULL);
39019 + return result;
39020 +}
39021 +
39022 +static int allocate_incore_sd_base(struct inv_entry *inv_entry)
39023 +{
39024 + struct incore_sd_base *isd_base;
39025 +
39026 + assert("edward-98", inv_entry != NULL);
39027 + assert("edward-99", inv_entry->ie_file == NULL);
39028 + assert("edward-100", inv_entry->ie_sd == NULL);
39027 +
39028 + isd_base = reiser4_kmalloc(sizeof(struct incore_sd_base), GFP_KERNEL);
39029 + if (!isd_base)
39030 + return RETERR(-ENOMEM);
39031 + inv_entry->ie_sd = isd_base;
39032 + return 0;
39033 +}
39034 +
39035 +/* this can be installed as ->init_inv_entry () method of
39036 + item_plugins[ STATIC_STAT_DATA_IT ] (fs/reiser4/plugin/item/item.c).
39037 + Copies data from the on-disk stat-data format into the light-weight analog
39038 + of an inode. Doesn't handle stat-data extensions. */
39039 +
39040 +static void sd_base_load(struct inv_entry *inv_entry, char *sd)
39041 +{
39042 + reiser4_stat_data_base *sd_base;
39043 +
39044 + assert("edward-101", inv_entry != NULL);
39045 + assert("edward-101", inv_entry->ie_sd != NULL);
39046 + assert("edward-102", sd != NULL);
39047 +
39048 + sd_base = (reiser4_stat_data_base *) sd;
39049 + inv_entry->ie_sd->isd_mode = d16tocpu(&sd_base->mode);
39050 + inv_entry->ie_sd->isd_nlink = d32tocpu(&sd_base->nlink);
39051 + inv_entry->ie_sd->isd_size = d64tocpu(&sd_base->size);
39052 + inv_entry->ie_sd->isd_data = NULL;
39053 +}
39054 +
39055 +/* initialise incore stat-data */
39056 +
39057 +static void init_incore_sd_base(struct inv_entry *inv_entry, coord_t * coord)
39058 +{
39059 + item_plugin *plugin = item_plugin_by_coord(coord);
39060 + void *body = item_body_by_coord(coord);
39061 +
39062 + assert("edward-103", inv_entry != NULL);
39063 + assert("edward-104", plugin != NULL);
39064 + assert("edward-105", body != NULL);
39065 +
39066 + sd_base_load(inv_entry, body);
39067 +}
39068 +
39069 +/* takes a key or filename, allocates a new invert_entry,
39070 + initializes it and adds it into the list;
39071 + we use lookup_sd_by_key() for light-weight files and VFS lookup by filename */
39072 +
39073 +int get_inv_entry(struct inode *invert_inode, /* inode of invert's body */
39074 + inv_entry_type type, /* LIGHT-WEIGHT or ORDINARY */
39075 + const reiser4_key * key, /* key of invert entry stat-data */
39076 + char *filename, /* filename of the file to be opened */
39077 + int flags, int mode)
39078 +{
39079 + int result;
39080 + struct inv_entry *ientry;
39081 +
39082 + assert("edward-107", invert_inode != NULL);
39083 +
39084 + ientry = allocate_inv_entry();
39085 + if (IS_ERR(ientry))
39086 + return (PTR_ERR(ientry));
39087 +
39088 + if (type == LIGHT_WEIGHT_FILE) {
39089 + coord_t coord;
39090 + lock_handle lh;
39091 +
39092 + assert("edward-108", key != NULL);
39093 +
39094 + init_coord(&coord);
39095 + init_lh(&lh);
39096 + result =
39097 + lookup_sd_by_key(tree_by_inode(invert_inode),
39098 + ZNODE_READ_LOCK, &coord, &lh, key);
39099 + if (result == 0)
39100 + init_incore_sd_base(ientry, &coord);
39101 +
39102 + done_lh(&lh);
39103 + done_coord(&coord);
39104 + return (result);
39105 + } else {
39106 + struct file *file = filp_open(filename, flags, mode);
39107 + /* FIXME_EDWARD here we need to check if we
39108 + didn't follow a mount point */
39109 +
39110 + assert("edward-108", filename != NULL);
39111 +
39112 + if (IS_ERR(file))
39113 + return (PTR_ERR(file));
39114 + ientry->ie_file = file;
39115 + return 0;
39116 + }
39117 +}
39118 +
39119 +/* takes the inode of an invert, reads the body of this invert, parses it,
39120 + opens all invert entries and returns a pointer to the first inv_entry */
39121 +
39122 +struct inv_entry *open_invert(struct file *invert_file)
39123 +{
39124 + /* stub: not yet implemented */
39125 + return NULL;
39126 +}
39127 +
39128 +ssize_t subfile_read(struct inv_entry *invert_entry, flow_t * f)
39129 +{
39130 + /* stub: not yet implemented */
39131 + return RETERR(-ENOSYS);
39132 +}
39133 +
39134 +ssize_t subfile_write(struct inv_entry *invert_entry, flow_t * f)
39135 +{
39136 + /* stub: not yet implemented */
39137 + return RETERR(-ENOSYS);
39138 +}
39139 +
39140 +ssize_t invert_read(struct file *file, flow_t * f)
39141 +{
39142 + /* stub: not yet implemented */
39143 + return RETERR(-ENOSYS);
39144 +}
39145 +
39146 +ssize_t invert_write(struct file *file, flow_t * f)
39147 +{
39148 + /* stub: not yet implemented */
39149 + return RETERR(-ENOSYS);
39150 +}
39146 +
39147 +/* Make Linus happy.
39148 + Local variables:
39149 + c-indentation-style: "K&R"
39150 + mode-name: "LC"
39151 + c-basic-offset: 8
39152 + tab-width: 8
39153 + fill-column: 120
39154 + scroll-step: 1
39155 + End:
39156 +*/
39157 Index: linux-2.6.16/fs/reiser4/plugin/file/symfile.c
39158 ===================================================================
39159 --- /dev/null
39160 +++ linux-2.6.16/fs/reiser4/plugin/file/symfile.c
39161 @@ -0,0 +1,87 @@
39162 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
39163 +
39164 +/* Symfiles are a generalization of Unix symlinks.
39165 +
39166 + A symfile when read behaves as though you took its contents and
39167 + substituted them into the reiser4 naming system as the right hand side
39168 + of an assignment, and then read that which you had assigned to it.
39169 +
39170 + A key issue for symfiles is how to implement writes through to
39171 + subfiles. In general, one must have some method of determining what
39172 + of that which is written to the symfile is written to what subfile.
39173 + This can be done by use of custom plugin methods written by users, or
39174 + by using a few general methods we provide for those willing to endure
39175 + the insertion of delimiters into what is read.
39176 +
39177 + Writing to symfiles without delimiters to denote what is written to
39178 + what subfile is not supported by any plugins we provide in this
39179 + release. Our most sophisticated support for writes is that embodied
39180 + by the invert plugin (see invert.c).
39181 +
39182 + A read only version of the /etc/passwd file might be
39183 + constructed as a symfile whose contents are as follows:
39184 +
39185 + /etc/passwd/userlines/*
39186 +
39187 + or
39188 +
39189 + /etc/passwd/userlines/demidov+/etc/passwd/userlines/edward+/etc/passwd/userlines/reiser+/etc/passwd/userlines/root
39190 +
39191 + or
39192 +
39193 + /etc/passwd/userlines/(demidov+edward+reiser+root)
39194 +
39195 + A symfile with contents
39196 +
39197 + /filenameA+"(some text stored in the uninvertable symfile)+/filenameB
39198 +
39199 + will return when read
39200 +
39201 + The contents of filenameAsome text stored in the uninvertable symfileThe contents of filenameB
39202 +
39203 + and write of what has been read will not be possible to implement as
39204 + an identity operation because there are no delimiters denoting the
39205 + boundaries of what is to be written to what subfile.
39206 +
39207 + Note that one could make this a read/write symfile if one specified
39208 + delimiters, and the write method understood those delimiters delimited
39209 + what was written to subfiles.
39210 +
39211 + So, specifying the symfile in a manner that allows writes:
39212 +
39213 + /etc/passwd/userlines/demidov+"(
39214 + )+/etc/passwd/userlines/edward+"(
39215 + )+/etc/passwd/userlines/reiser+"(
39216 + )+/etc/passwd/userlines/root+"(
39217 + )
39218 +
39219 + or
39220 +
39221 + /etc/passwd/userlines/(demidov+"(
39222 + )+edward+"(
39223 + )+reiser+"(
39224 + )+root+"(
39225 + ))
39226 +
39227 + and the file demidov might be specified as:
39228 +
39229 + /etc/passwd/userlines/demidov/username+"(:)+/etc/passwd/userlines/demidov/password+"(:)+/etc/passwd/userlines/demidov/userid+"(:)+/etc/passwd/userlines/demidov/groupid+"(:)+/etc/passwd/userlines/demidov/gecos+"(:)+/etc/passwd/userlines/demidov/home+"(:)+/etc/passwd/userlines/demidov/shell
39230 +
39231 + or
39232 +
39233 + /etc/passwd/userlines/demidov/(username+"(:)+password+"(:)+userid+"(:)+groupid+"(:)+gecos+"(:)+home+"(:)+shell)
39234 +
39235 + Notice that if the file demidov has a carriage return in it, the
39236 + parsing fails, but then if you put carriage returns in the wrong place
39237 + in a normal /etc/passwd file it breaks things also.
39238 +
39239 + Note that it is forbidden to have no text between two interpolations
39240 + if one wants to be able to define what parts of a write go to what
39241 + subfiles referenced in an interpolation.
39242 +
39243 + If one wants to be able to add new lines by writing to the file, one
39244 + must either write a custom plugin for /etc/passwd that knows how to
39245 + name an added line, or one must use an invert, or one must use a more
39246 + sophisticated symfile syntax that we are not planning to write for
39247 + version 4.0.
39248 +*/
39249 Index: linux-2.6.16/fs/reiser4/plugin/file/symlink.c
39250 ===================================================================
39251 --- /dev/null
39252 +++ linux-2.6.16/fs/reiser4/plugin/file/symlink.c
39253 @@ -0,0 +1,92 @@
39254 +/* Copyright 2002, 2003, 2005 by Hans Reiser, licensing governed by reiser4/README */
39255 +
39256 +#include "../../inode.h"
39257 +
39258 +#include <linux/types.h>
39259 +#include <linux/fs.h>
39260 +
39261 +/* file plugin methods specific for symlink files
39262 + (SYMLINK_FILE_PLUGIN_ID) */
39263 +
39264 +/* this is implementation of create_object method of file plugin for
39265 + SYMLINK_FILE_PLUGIN_ID
39266 + */
39267 +
39268 +/**
39269 + * create_symlink - create_object of file plugin for SYMLINK_FILE_PLUGIN_ID
39270 + * @symlink: inode of symlink object
39271 + * @dir: inode of parent directory
39272 + * @info: parameters of new object
39273 + *
39274 + * Inserts stat data with a symlink extension into the tree.
39275 + */
39276 +int create_symlink(struct inode *symlink,
39277 + struct inode *dir UNUSED_ARG,
39278 + reiser4_object_create_data *data /* info passed to us,
39279 + * this is filled by
39280 + * reiser4() syscall
39281 + * in particular */ )
39282 +{
39283 + int result;
39284 +
39285 + assert("nikita-680", symlink != NULL);
39286 + assert("nikita-681", S_ISLNK(symlink->i_mode));
39287 + assert("nikita-685", inode_get_flag(symlink, REISER4_NO_SD));
39288 + assert("nikita-682", dir != NULL);
39289 + assert("nikita-684", data != NULL);
39290 + assert("nikita-686", data->id == SYMLINK_FILE_PLUGIN_ID);
39291 +
39292 + /*
39293 + * the stat data of a symlink has a symlink extension in which we store
39294 + * the symlink content, that is, the path the symlink points to.
39295 + */
39296 + reiser4_inode_data(symlink)->extmask |= (1 << SYMLINK_STAT);
39297 +
39298 + assert("vs-838", symlink->u.generic_ip == NULL);
39299 + symlink->u.generic_ip = (void *)data->name;
39300 +
39301 + assert("vs-843", symlink->i_size == 0);
39302 + INODE_SET_FIELD(symlink, i_size, strlen(data->name));
39303 +
39304 + /* insert stat data appended with data->name */
39305 + result = inode_file_plugin(symlink)->write_sd_by_inode(symlink);
39306 + if (result) {
39307 + /* FIXME-VS: Make sure that symlink->u.generic_ip is not attached
39308 + to kmalloced data */
39309 + INODE_SET_FIELD(symlink, i_size, 0);
39310 + } else {
39311 + assert("vs-849", symlink->u.generic_ip
39312 + && inode_get_flag(symlink, REISER4_GENERIC_PTR_USED));
39313 + assert("vs-850",
39314 + !memcmp((char *)symlink->u.generic_ip, data->name,
39315 + (size_t) symlink->i_size + 1));
39316 + }
39317 + return result;
39318 +}
39319 +
39320 +/* this is implementation of destroy_inode method of file plugin for
39321 + SYMLINK_FILE_PLUGIN_ID
39322 + */
39323 +void destroy_inode_symlink(struct inode *inode)
39324 +{
39325 + assert("edward-799",
39326 + inode_file_plugin(inode) ==
39327 + file_plugin_by_id(SYMLINK_FILE_PLUGIN_ID));
39328 + assert("edward-800", !is_bad_inode(inode) && is_inode_loaded(inode));
39329 + assert("edward-801", inode_get_flag(inode, REISER4_GENERIC_PTR_USED));
39330 + assert("vs-839", S_ISLNK(inode->i_mode));
39331 +
39332 + kfree(inode->u.generic_ip);
39333 + inode->u.generic_ip = NULL;
39334 + inode_clr_flag(inode, REISER4_GENERIC_PTR_USED);
39335 +}
39336 +
39337 +/* Local variables:
39338 + c-indentation-style: "K&R"
39339 + mode-name: "LC"
39340 + c-basic-offset: 8
39341 + tab-width: 8
39342 + fill-column: 120
39343 + scroll-step: 1
39344 + End:
39345 +*/
39346 Index: linux-2.6.16/fs/reiser4/plugin/file/tail_conversion.c
39347 ===================================================================
39348 --- /dev/null
39349 +++ linux-2.6.16/fs/reiser4/plugin/file/tail_conversion.c
39350 @@ -0,0 +1,728 @@
39351 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
39352 +
39353 +#include "../../inode.h"
39354 +#include "../../super.h"
39355 +#include "../../page_cache.h"
39356 +#include "../../carry.h"
39357 +#include "../../safe_link.h"
39358 +#include "../../vfs_ops.h"
39359 +
39360 +#include <linux/writeback.h>
39361 +
39362 +/* this file contains:
39363 + tail2extent and extent2tail */
39364 +
39365 +/* exclusive access to a file is acquired when file state changes: tail2extent, empty2tail, extent2tail, etc */
39366 +void get_exclusive_access(unix_file_info_t * uf_info)
39367 +{
39368 + assert("nikita-3028", schedulable());
39369 + assert("nikita-3047", LOCK_CNT_NIL(inode_sem_w));
39370 + assert("nikita-3048", LOCK_CNT_NIL(inode_sem_r));
39371 + /*
39372 + * "deadlock avoidance": sometimes we commit a transaction under
39373 + * rw-semaphore on a file. Such commit can deadlock with another
39374 + * thread that captured some block (hence preventing atom from being
39375 + * committed) and waits on rw-semaphore.
39376 + */
39377 + txn_restart_current();
39378 + LOCK_CNT_INC(inode_sem_w);
39379 + down_write(&uf_info->latch);
39380 + uf_info->exclusive_use = 1;
39381 + assert("vs-1713", uf_info->ea_owner == NULL);
39382 + assert("vs-1713", atomic_read(&uf_info->nr_neas) == 0);
39383 + ON_DEBUG(uf_info->ea_owner = current);
39384 +}
39385 +
39386 +void drop_exclusive_access(unix_file_info_t * uf_info)
39387 +{
39388 + assert("vs-1714", uf_info->ea_owner == current);
39389 + assert("vs-1715", atomic_read(&uf_info->nr_neas) == 0);
39390 + ON_DEBUG(uf_info->ea_owner = NULL);
39391 + uf_info->exclusive_use = 0;
39392 + up_write(&uf_info->latch);
39393 + assert("nikita-3049", LOCK_CNT_NIL(inode_sem_r));
39394 + assert("nikita-3049", LOCK_CNT_GTZ(inode_sem_w));
39395 + LOCK_CNT_DEC(inode_sem_w);
39396 + txn_restart_current();
39397 +}
39398 +
39399 +/**
39400 + * nea_grabbed - do something when file semaphore is down_read-ed
39401 + * @uf_info:
39402 + *
39403 + * This is called when nonexclusive access is obtained on a file. Everything
39404 + * it does is for debugging purposes.
39405 + */
39406 +static void nea_grabbed(unix_file_info_t *uf_info)
39407 +{
39408 +#if REISER4_DEBUG
39409 + LOCK_CNT_INC(inode_sem_r);
39410 + assert("vs-1716", uf_info->ea_owner == NULL);
39411 + atomic_inc(&uf_info->nr_neas);
39412 + uf_info->last_reader = current;
39413 +#endif
39414 +}
39415 +
39416 +/**
39417 + * get_nonexclusive_access - get nonexclusive access to a file
39418 + * @uf_info: unix file specific part of inode to obtain access to
39419 + *
39420 + * Nonexclusive access is obtained on a file before read, write, readpage.
39421 + */
39422 +void get_nonexclusive_access(unix_file_info_t *uf_info)
39423 +{
39424 + assert("nikita-3029", schedulable());
39425 + assert("nikita-3361", get_current_context()->trans->atom == NULL);
39426 +
39427 + down_read(&uf_info->latch);
39428 + nea_grabbed(uf_info);
39429 +}
39430 +
39431 +/**
39432 + * try_to_get_nonexclusive_access - try to get nonexclusive access to a file
39433 + * @uf_info: unix file specific part of inode to obtain access to
39434 + *
39435 + * Non-blocking version of obtaining nonexclusive access.
39436 + */
39437 +int try_to_get_nonexclusive_access(unix_file_info_t *uf_info)
39438 +{
39439 + int result;
39440 +
39441 + result = down_read_trylock(&uf_info->latch);
39442 + if (result)
39443 + nea_grabbed(uf_info);
39444 + return result;
39445 +}
39446 +
39447 +void drop_nonexclusive_access(unix_file_info_t * uf_info)
39448 +{
39449 + assert("vs-1718", uf_info->ea_owner == NULL);
39450 + assert("vs-1719", atomic_read(&uf_info->nr_neas) > 0);
39451 + ON_DEBUG(atomic_dec(&uf_info->nr_neas));
39452 +
39453 + up_read(&uf_info->latch);
39454 +
39455 + LOCK_CNT_DEC(inode_sem_r);
39456 + txn_restart_current();
39457 +}
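+
+/*
+ * Usage pattern sketch (not part of the patch), mirroring what the read,
+ * write and conversion paths in file.c actually do: readers take
+ * nonexclusive access, container changes take exclusive access.
+ */
+#if 0
+static void ea_nea_pattern(unix_file_info_t *uf_info)
+{
+ get_nonexclusive_access(uf_info);
+ /* ... read the file body; no container change can happen here ... */
+ drop_nonexclusive_access(uf_info);
+
+ get_exclusive_access(uf_info);
+ /* ... e.g. tail2extent()/extent2tail() may run here ... */
+ drop_exclusive_access(uf_info);
+}
+#endif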
39458 +
39459 +/* part of tail2extent. Cut all items covering @count bytes starting from
39460 + @offset */
39461 +/* Audited by: green(2002.06.15) */
39462 +static int cut_formatting_items(struct inode *inode, loff_t offset, int count)
39463 +{
39464 + reiser4_key from, to;
39465 +
39466 + /* AUDIT: How about putting an assertion here, what would check
39467 + all provided range is covered by tail items only? */
39468 + /* key of first byte in the range to be cut */
39469 + inode_file_plugin(inode)->key_by_inode(inode, offset, &from);
39470 +
39471 + /* key of last byte in that range */
39472 + to = from;
39473 + set_key_offset(&to, (__u64) (offset + count - 1));
39474 +
39475 + /* cut everything between those keys */
39476 + return cut_tree(tree_by_inode(inode), &from, &to, inode, 0);
39477 +}
39478 +
39479 +static void release_all_pages(struct page **pages, unsigned nr_pages)
39480 +{
39481 + unsigned i;
39482 +
39483 + for (i = 0; i < nr_pages; i++) {
39484 + if (pages[i] == NULL) {
39485 + unsigned j;
39486 + for (j = i + 1; j < nr_pages; j++)
39487 + assert("vs-1620", pages[j] == NULL);
39488 + break;
39489 + }
39490 + page_cache_release(pages[i]);
39491 + pages[i] = NULL;
39492 + }
39493 +}
39494 +
39495 +/* part of tail2extent. Replaces tail items with an extent item. The content
39496 + of the tail items (@count bytes) being cut has already been copied into
39497 + pages. The extent_writepage method is called to create extents corresponding
39498 + to those pages */
39499 +static int replace(struct inode *inode, struct page **pages, unsigned nr_pages, int count)
39500 +{
39501 + int result;
39502 + unsigned i;
39503 + STORE_COUNTERS;
39504 +
39505 + if (nr_pages == 0)
39506 + return 0;
39507 +
39508 + assert("vs-596", pages[0]);
39509 +
39510 + /* cut copied items */
39511 + result =
39512 + cut_formatting_items(inode,
39513 + (loff_t) pages[0]->index << PAGE_CACHE_SHIFT,
39514 + count);
39515 + if (result)
39516 + return result;
39517 +
39518 + CHECK_COUNTERS;
39519 +
39520 + /* put into tree replacement for just removed items: extent item, namely */
39521 + for (i = 0; i < nr_pages; i++) {
39522 + result = add_to_page_cache_lru(pages[i], inode->i_mapping,
39523 + pages[i]->index,
39524 + mapping_gfp_mask(inode->
39525 + i_mapping));
39526 + if (result)
39527 + break;
39528 + unlock_page(pages[i]);
39529 + result = find_or_create_extent(pages[i]);
39530 + if (result)
39531 + break;
39532 + SetPageUptodate(pages[i]);
39533 + }
39534 + return result;
39535 +}
39536 +
39537 +#define TAIL2EXTENT_PAGE_NUM 3 /* number of pages to fill before cutting tail
39538 + * items */
39539 +
39540 +static int reserve_tail2extent_iteration(struct inode *inode)
39541 +{
39542 + reiser4_block_nr unformatted_nodes;
39543 + reiser4_tree *tree;
39544 +
39545 + tree = tree_by_inode(inode);
39546 +
39547 + /* number of unformatted nodes which will be created */
39548 + unformatted_nodes = TAIL2EXTENT_PAGE_NUM;
39549 +
39550 + /*
39551 + * space required for one iteration of tail->extent conversion:
39552 + *
39553 + * 1. kill N tail items
39554 + *
39555 + * 2. insert TAIL2EXTENT_PAGE_NUM unformatted nodes
39556 + *
39557 + * 3. insert TAIL2EXTENT_PAGE_NUM (worst-case single-block
39558 + * extents) extent units.
39559 + *
39560 + * 4. drilling to the leaf level by coord_by_key()
39561 + *
39562 + * 5. possible update of stat-data
39563 + *
39564 + */
39565 + grab_space_enable();
39566 + return reiser4_grab_space
39567 + (2 * tree->height +
39568 + TAIL2EXTENT_PAGE_NUM +
39569 + TAIL2EXTENT_PAGE_NUM * estimate_one_insert_into_item(tree) +
39570 + 1 + estimate_one_insert_item(tree) +
39571 + inode_file_plugin(inode)->estimate.update(inode), BA_CAN_COMMIT);
39572 +}
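+
+/* a worked instance of the grab above (all figures are illustrative
+ assumptions, not values from the patch): with tree->height == 4 and both
+ estimate_one_insert_into_item() and estimate_one_insert_item() returning
+ 26, one iteration reserves 2*4 + 3 + 3*26 + 1 + 26 == 116 blocks plus the
+ stat-data update estimate */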
39573 +
39574 + /* clear the stat-data flag indicating that the file is only partially converted */
39575 +static int complete_conversion(struct inode *inode)
39576 +{
39577 + int result;
39578 +
39579 + grab_space_enable();
39580 + result =
39581 + reiser4_grab_space(inode_file_plugin(inode)->estimate.update(inode),
39582 + BA_CAN_COMMIT);
39583 + if (result == 0) {
39584 + inode_clr_flag(inode, REISER4_PART_MIXED);
39585 + result = reiser4_update_sd(inode);
39586 + }
39587 + if (result)
39588 + warning("vs-1696", "Failed to clear converting bit of %llu: %i",
39589 + (unsigned long long)get_inode_oid(inode), result);
39590 + return 0;
39591 +}
39592 +
39593 +/**
39594 + * find_start
39595 + * @inode: inode of the file being converted
39596 + * @id: item id (FORMATTING_ID or EXTENT_POINTER_ID) to look for
39597 + * @offset: result - offset at which the previous conversion stopped
39598 + *
39599 + * this is used by tail2extent and extent2tail to detect where a previous
39600 + * incomplete conversion stopped
39601 + */
39602 +static int find_start(struct inode *inode, reiser4_plugin_id id, __u64 *offset)
39603 +{
39604 + int result;
39605 + lock_handle lh;
39606 + coord_t coord;
39607 + unix_file_info_t *ufo;
39608 + int found;
39609 + reiser4_key key;
39610 +
39611 + ufo = unix_file_inode_data(inode);
39612 + init_lh(&lh);
39613 + result = 0;
39614 + found = 0;
39615 + inode_file_plugin(inode)->key_by_inode(inode, *offset, &key);
39616 + do {
39617 + init_lh(&lh);
39618 + result = find_file_item_nohint(&coord, &lh, &key,
39619 + ZNODE_READ_LOCK, inode);
39620 +
39621 + if (result == CBK_COORD_FOUND) {
39622 + if (coord.between == AT_UNIT) {
39623 + /*coord_clear_iplug(&coord); */
39624 + result = zload(coord.node);
39625 + if (result == 0) {
39626 + if (item_id_by_coord(&coord) == id)
39627 + found = 1;
39628 + else
39629 + item_plugin_by_coord(&coord)->
39630 + s.file.append_key(&coord, &key);
39632 + zrelse(coord.node);
39633 + }
39634 + } else
39635 + result = RETERR(-ENOENT);
39636 + }
39637 + done_lh(&lh);
39638 + } while (result == 0 && !found);
39639 + *offset = get_key_offset(&key);
39640 + return result;
39641 +}
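+
+#if 0
+/* sketch of how the converters below resume an interrupted conversion
+ (illustrative only, modeled on tail2extent() and extent2tail()) */
+static int resume_offset_example(struct inode *inode, __u64 *offset)
+{
+ int result;
+
+ /* tail2extent looks for leftover tail items; extent2tail would pass
+ EXTENT_POINTER_ID instead */
+ result = find_start(inode, FORMATTING_ID, offset);
+ if (result == -ENOENT)
+ /* no items of that type left: conversion already finished */
+ return 0;
+ return result;
+}
+#endif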
39642 +
39643 +/**
39644 + * tail2extent
39645 + * @uf_info: unix file specific part of the inode to convert
39646 + *
39647 + * Converts the file body from tail items to extent items,
39648 + * TAIL2EXTENT_PAGE_NUM pages at a time.
39648 + */
39649 +int tail2extent(unix_file_info_t *uf_info)
39650 +{
39651 + int result;
39652 + reiser4_key key; /* key of next byte to be moved to page */
39653 + char *p_data; /* data of page */
39654 + unsigned page_off = 0, /* offset within the page where to copy data */
39655 + count; /* number of bytes of item which can be
39656 + * copied to page */
39657 + struct page *pages[TAIL2EXTENT_PAGE_NUM];
39658 + struct page *page;
39659 + int done; /* set to 1 when all file is read */
39660 + char *item;
39661 + int i;
39662 + struct inode *inode;
39663 + int first_iteration;
39664 + int bytes;
39665 + __u64 offset;
39666 +
39667 + assert("nikita-3362", ea_obtained(uf_info));
39668 + inode = unix_file_info_to_inode(uf_info);
39669 + assert("nikita-3412", !IS_RDONLY(inode));
39670 + assert("vs-1649", uf_info->container != UF_CONTAINER_EXTENTS);
39671 + assert("", !inode_get_flag(inode, REISER4_PART_IN_CONV));
39672 +
39673 + offset = 0;
39674 + first_iteration = 1;
39675 + result = 0;
39676 + if (inode_get_flag(inode, REISER4_PART_MIXED)) {
39677 + /*
39678 + * file is marked on disk as there was a conversion which did
39679 + * not complete due to either crash or some error. Find which
39680 + * not complete due to either crash or some error. Find the
39681 + * offset at which the tail conversion stopped
39682 + result = find_start(inode, FORMATTING_ID, &offset);
39683 + if (result == -ENOENT) {
39684 + /* no tail items found, everything is converted */
39685 + uf_info->container = UF_CONTAINER_EXTENTS;
39686 + complete_conversion(inode);
39687 + return 0;
39688 + } else if (result != 0)
39689 + /* some other error */
39690 + return result;
39691 + first_iteration = 0;
39692 + }
39693 +
39694 + inode_set_flag(inode, REISER4_PART_IN_CONV);
39695 +
39696 + /* get key of the first byte still to be converted */
39697 + inode_file_plugin(inode)->key_by_inode(inode, offset, &key);
39698 +
39699 + done = 0;
39700 + while (done == 0) {
39701 + memset(pages, 0, sizeof(pages));
39702 + result = reserve_tail2extent_iteration(inode);
39703 + if (result != 0)
39704 + goto out;
39705 + if (first_iteration) {
39706 + inode_set_flag(inode, REISER4_PART_MIXED);
39707 + reiser4_update_sd(inode);
39708 + first_iteration = 0;
39709 + }
39710 + bytes = 0;
39711 + for (i = 0; i < sizeof_array(pages) && done == 0; i++) {
39712 + assert("vs-598",
39713 + (get_key_offset(&key) & ~PAGE_CACHE_MASK) == 0);
39714 + page = alloc_page(get_gfp_mask());
39715 + if (!page) {
39716 + result = RETERR(-ENOMEM);
39717 + goto error;
39718 + }
39719 +
39720 + page->index =
39721 + (unsigned long)(get_key_offset(&key) >>
39722 + PAGE_CACHE_SHIFT);
39723 + /*
39724 + * usually one who is going to longterm lock a znode (as
39725 + * find_file_item does, for instance) must not hold locked
39726 + * pages. However, tail2extent is an exception: the pages
39727 + * appearing here are not yet reachable by anyone else,
39728 + * they are clean and have no jnodes attached, so keeping
39729 + * them locked does not risk deadlock
39730 + */
39732 + assert("vs-983", !PagePrivate(page));
39733 + reiser4_invalidate_pages(inode->i_mapping, page->index,
39734 + 1, 0);
39735 +
39736 + for (page_off = 0; page_off < PAGE_CACHE_SIZE;) {
39737 + coord_t coord;
39738 + lock_handle lh;
39739 +
39740 + /* get next item */
39741 + /* FIXME: we might want to readahead here */
39742 + init_lh(&lh);
39743 + result =
39744 + find_file_item_nohint(&coord, &lh, &key,
39745 + ZNODE_READ_LOCK,
39746 + inode);
39747 + if (result != CBK_COORD_FOUND) {
39748 + /*
39749 + * either an error happened or no items of
39750 + * the file were found
39751 + */
39752 + done_lh(&lh);
39753 + page_cache_release(page);
39754 + goto error;
39755 + }
39756 +
39757 + if (coord.between == AFTER_UNIT) {
39758 + /*
39759 + * end of file is reached. Pad the page
39760 + * with zeros
39761 + */
39762 + done_lh(&lh);
39763 + done = 1;
39764 + p_data = kmap_atomic(page, KM_USER0);
39765 + memset(p_data + page_off, 0,
39766 + PAGE_CACHE_SIZE - page_off);
39767 + kunmap_atomic(p_data, KM_USER0);
39768 + break;
39769 + }
39770 +
39771 + result = zload(coord.node);
39772 + if (result) {
39773 + page_cache_release(page);
39774 + done_lh(&lh);
39775 + goto error;
39776 + }
39777 + assert("vs-856", coord.between == AT_UNIT);
39778 + item = ((char *)item_body_by_coord(&coord)) +
39779 + coord.unit_pos;
39780 +
39781 + /* how many bytes to copy */
39782 + count =
39783 + item_length_by_coord(&coord) -
39784 + coord.unit_pos;
39785 + /* limit length of copy to end of page */
39786 + if (count > PAGE_CACHE_SIZE - page_off)
39787 + count = PAGE_CACHE_SIZE - page_off;
39788 +
39789 + /*
39790 + * copy item (as much as will fit starting from
39791 + * the beginning of the item) into the page
39792 + */
39793 + p_data = kmap_atomic(page, KM_USER0);
39794 + memcpy(p_data + page_off, item, count);
39795 + kunmap_atomic(p_data, KM_USER0);
39796 +
39797 + page_off += count;
39798 + bytes += count;
39799 + set_key_offset(&key,
39800 + get_key_offset(&key) + count);
39801 +
39802 + zrelse(coord.node);
39803 + done_lh(&lh);
39804 + } /* end of loop which fills one page with the content
39805 + * of formatting items */
39806 +
39807 + if (page_off) {
39808 + /* something was copied into page */
39809 + pages[i] = page;
39810 + } else {
39811 + page_cache_release(page);
39812 + assert("vs-1648", done == 1);
39813 + break;
39814 + }
39815 + } /* end of loop through pages of one conversion iteration */
39816 +
39817 + if (i > 0) {
39818 + result = replace(inode, pages, i, bytes);
39819 + release_all_pages(pages, sizeof_array(pages));
39820 + if (result)
39821 + goto error;
39822 + /*
39823 + * we have to drop exclusive access to avoid a deadlock
39824 + * which may happen because capture_unix_file, called by
39825 + * reiser4_writepages, requires non-exclusive access to
39826 + * the file. It is safe to drop EA in the middle of
39827 + * tail2extent conversion because
39828 + * write_unix_file/unix_setattr(truncate)/release_unix_file(extent2tail)
39829 + * are serialized by the uf_info->write semaphore and
39830 + * because read_unix_file works (or at least should) on
39831 + * partially converted files
39832 + */
39833 + drop_exclusive_access(uf_info);
39834 + /* throttle the conversion */
39835 + reiser4_throttle_write(inode);
39836 + get_exclusive_access(uf_info);
39837 +
39838 + /*
39839 + * nobody is allowed to complete conversion but a
39840 + * process which started it
39841 + */
39842 + assert("", inode_get_flag(inode, REISER4_PART_MIXED));
39843 + }
39844 + }
39845 +
39846 + inode_clr_flag(inode, REISER4_PART_IN_CONV);
39847 +
39848 + if (result == 0) {
39849 + /* file is converted to extent items */
39850 + assert("vs-1697", inode_get_flag(inode, REISER4_PART_MIXED));
39851 +
39852 + uf_info->container = UF_CONTAINER_EXTENTS;
39853 + complete_conversion(inode);
39854 + } else {
39855 + /*
39856 + * conversion is not complete. Inode was already marked as
39857 + * REISER4_PART_MIXED and its stat-data was updated at the first
39858 + * iteration of the loop above.
39859 + */
39860 + error:
39861 + release_all_pages(pages, sizeof_array(pages));
39862 + warning("nikita-2282", "Partial conversion of %llu: %i",
39863 + (unsigned long long)get_inode_oid(inode), result);
39864 + }
39865 +
39866 + out:
39867 + return result;
39868 +}
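+
+#if 0
+/* illustrative caller (an assumption about usage, inferred from the asserts
+ and comments above): conversion runs under exclusive access, which
+ tail2extent may temporarily drop between iterations */
+static int convert_example(struct inode *inode)
+{
+ unix_file_info_t *uf_info = unix_file_inode_data(inode);
+ int result;
+
+ get_exclusive_access(uf_info);
+ result = tail2extent(uf_info);
+ drop_exclusive_access(uf_info);
+ return result;
+}
+#endif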
39869 +
39870 +static int reserve_extent2tail_iteration(struct inode *inode)
39871 +{
39872 + reiser4_tree *tree;
39873 +
39874 + tree = tree_by_inode(inode);
39875 + /*
39876 + * reserve blocks for (in this order):
39877 + *
39878 + * 1. removal of extent item
39879 + *
39880 + * 2. insertion of tail by insert_flow()
39881 + *
39882 + * 3. drilling to the leaf level by coord_by_key()
39883 + *
39884 + * 4. possible update of stat-data
39885 + */
39886 + grab_space_enable();
39887 + return reiser4_grab_space
39888 + (estimate_one_item_removal(tree) +
39889 + estimate_insert_flow(tree->height) +
39890 + 1 + estimate_one_insert_item(tree) +
39891 + inode_file_plugin(inode)->estimate.update(inode), BA_CAN_COMMIT);
39892 +}
39893 +
39894 +static int filler(void *vp, struct page *page)
39895 +{
39896 + return readpage_unix_file_nolock(vp, page);
39897 +}
39898 +
39899 +/* for every page of file: read page, cut part of extent pointing to this page,
39900 + put data of the page into the tree as tail items */
39901 +int extent2tail(unix_file_info_t *uf_info)
39902 +{
39903 + int result;
39904 + struct inode *inode;
39905 + struct page *page;
39906 + unsigned long num_pages, i;
39907 + unsigned long start_page;
39908 + reiser4_key from;
39909 + reiser4_key to;
39910 + unsigned count;
39911 + __u64 offset;
39912 +
39913 + assert("nikita-3362", ea_obtained(uf_info));
39914 + inode = unix_file_info_to_inode(uf_info);
39915 + assert("nikita-3412", !IS_RDONLY(inode));
39916 + assert("vs-1649", uf_info->container != UF_CONTAINER_TAILS);
39917 + assert("", !inode_get_flag(inode, REISER4_PART_IN_CONV));
39918 +
39919 + offset = 0;
39920 + if (inode_get_flag(inode, REISER4_PART_MIXED)) {
39921 + /*
39922 + * file is marked on disk as there was a conversion which did
39923 + * not complete due to either crash or some error. Find the
39924 + * offset at which the conversion stopped
39925 + */
39926 + result = find_start(inode, EXTENT_POINTER_ID, &offset);
39927 + if (result == -ENOENT) {
39928 + /* no extent found, everything is converted */
39929 + uf_info->container = UF_CONTAINER_TAILS;
39930 + complete_conversion(inode);
39931 + return 0;
39932 + } else if (result != 0)
39933 + /* some other error */
39934 + return result;
39935 + }
39936 +
39937 + inode_set_flag(inode, REISER4_PART_IN_CONV);
39938 +
39939 + /* number of pages left to convert */
39940 + num_pages =
39941 + (inode->i_size - offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
39942 + start_page = offset >> PAGE_CACHE_SHIFT;
39943 +
39944 + inode_file_plugin(inode)->key_by_inode(inode, offset, &from);
39945 + to = from;
39946 +
39947 + result = 0;
39948 + for (i = 0; i < num_pages; i++) {
39949 + __u64 start_byte;
39950 +
39951 + result = reserve_extent2tail_iteration(inode);
39952 + if (result != 0)
39953 + break;
39954 + if (i == 0 && offset == 0) {
39955 + inode_set_flag(inode, REISER4_PART_MIXED);
39956 + reiser4_update_sd(inode);
39957 + }
39958 +
39959 + page = read_cache_page(inode->i_mapping,
39960 + (unsigned)(i + start_page), filler, NULL);
39961 + if (IS_ERR(page)) {
39962 + result = PTR_ERR(page);
39963 + break;
39964 + }
39965 +
39966 + wait_on_page_locked(page);
39967 +
39968 + if (!PageUptodate(page)) {
39969 + page_cache_release(page);
39970 + result = RETERR(-EIO);
39971 + break;
39972 + }
39973 +
39974 + /* cut part of file we have read */
39975 + start_byte = (__u64) (i + start_page) << PAGE_CACHE_SHIFT;
39976 + set_key_offset(&from, start_byte);
39977 + set_key_offset(&to, start_byte + PAGE_CACHE_SIZE - 1);
39978 + /*
39979 + * cut_tree_object() returns -E_REPEAT to allow atom
39980 + * commits during over-long truncates. But
39981 + * extent->tail conversion should be performed in one
39982 + * transaction.
39983 + */
39984 + result = cut_tree(tree_by_inode(inode), &from, &to, inode, 0);
39985 +
39986 + if (result) {
39987 + page_cache_release(page);
39988 + break;
39989 + }
39990 +
39991 + /* put page data into tree via tail_write */
39992 + count = PAGE_CACHE_SIZE;
39993 + if ((i == (num_pages - 1)) &&
39994 + (inode->i_size & ~PAGE_CACHE_MASK))
39995 + /* last page may be incomplete */
39996 + count = (inode->i_size & ~PAGE_CACHE_MASK);
39997 + while (count) {
39998 + struct dentry dentry;
39999 + struct file file;
40000 + loff_t pos;
40001 +
40002 + dentry.d_inode = inode;
40003 + file.f_dentry = &dentry;
40004 + file.private_data = NULL;
40005 + file.f_pos = start_byte;
40007 + pos = start_byte;
40008 + result = write_tail(&file, (char __user *)kmap(page),
40009 + count, &pos);
40010 + reiser4_free_file_fsdata(&file);
40011 + if (result <= 0) {
40012 + warning("", "write_tail failed");
40013 + page_cache_release(page);
40014 + inode_clr_flag(inode, REISER4_PART_IN_CONV);
40015 + return result;
40016 + }
40017 + count -= result;
40018 + }
40019 +
40020 + /* release page */
40021 + lock_page(page);
40022 + /* page is already detached from jnode and mapping. */
40023 + assert("vs-1086", page->mapping == NULL);
40024 + assert("nikita-2690",
40025 + (!PagePrivate(page) && jprivate(page) == 0));
40026 + /* waiting for writeback completion with page lock held is
40027 + * perfectly valid. */
40028 + wait_on_page_writeback(page);
40029 + drop_page(page);
40030 + /* release reference taken by read_cache_page() above */
40031 + page_cache_release(page);
40032 +
40033 + drop_exclusive_access(uf_info);
40034 + /* throttle the conversion */
40035 + reiser4_throttle_write(inode);
40036 + get_exclusive_access(uf_info);
40037 + /*
40038 + * nobody is allowed to complete conversion but a process which
40039 + * started it
40040 + */
40041 + assert("", inode_get_flag(inode, REISER4_PART_MIXED));
40042 + }
40043 +
40044 + inode_clr_flag(inode, REISER4_PART_IN_CONV);
40045 +
40046 + if (i == num_pages) {
40047 + /* file is converted to formatted items */
40048 + assert("vs-1698", inode_get_flag(inode, REISER4_PART_MIXED));
40049 + assert("vs-1260",
40050 + inode_has_no_jnodes(reiser4_inode_data(inode)));
40051 +
40052 + uf_info->container = UF_CONTAINER_TAILS;
40053 + complete_conversion(inode);
40054 + return 0;
40055 + }
40056 + /*
40057 + * conversion is not complete. Inode was already marked as
40058 + * REISER4_PART_MIXED and its stat-data was updated at the first
40059 + * iteration of the loop above.
40060 + */
40061 + warning("nikita-2282",
40062 + "Partial conversion of %llu: %lu of %lu: %i",
40063 + (unsigned long long)get_inode_oid(inode), i,
40064 + num_pages, result);
40065 +
40066 + return result;
40067 +}
40068 +
40069 +/*
40070 + * Local variables:
40071 + * c-indentation-style: "K&R"
40072 + * mode-name: "LC"
40073 + * c-basic-offset: 8
40074 + * tab-width: 8
40075 + * fill-column: 79
40076 + * scroll-step: 1
40077 + * End:
40078 + */
40079 Index: linux-2.6.16/fs/reiser4/plugin/file_ops.c
40080 ===================================================================
40081 --- /dev/null
40082 +++ linux-2.6.16/fs/reiser4/plugin/file_ops.c
40083 @@ -0,0 +1,167 @@
40084 +/* Copyright 2005 by Hans Reiser, licensing governed by
40085 + reiser4/README */
40086 +
40087 +/* this file contains typical implementations for some of methods of
40088 + struct file_operations and of struct address_space_operations
40089 +*/
40090 +
40091 +#include "../inode.h"
40092 +#include "object.h"
40093 +
40094 +/* file operations */
40095 +
40096 +/* implementation of vfs's llseek method of struct file_operations for
40097 + typical directory can be found in file_ops_readdir.c
40098 +*/
40099 +loff_t llseek_common_dir(struct file *, loff_t, int origin);
40100 +
40101 +/* implementation of vfs's readdir method of struct file_operations for
40102 + typical directory can be found in readdir_common.c
40103 + typical directory can be found in file_ops_readdir.c
40104 +int readdir_common(struct file *, void *dirent, filldir_t);
40105 +
40106 +/**
40107 + * release_dir_common - release method of struct file_operations
40108 + * @inode: inode of released file
40109 + * @file: file to release
40110 + *
40111 + * Implementation of release method of struct file_operations for typical
40112 + * directory. All it does is free reiser4-specific file data.
40113 +*/
40114 +int release_dir_common(struct inode *inode, struct file *file)
40115 +{
40116 + reiser4_context *ctx;
40117 +
40118 + ctx = init_context(inode->i_sb);
40119 + if (IS_ERR(ctx))
40120 + return PTR_ERR(ctx);
40121 + reiser4_free_file_fsdata(file);
40122 + reiser4_exit_context(ctx);
40123 + return 0;
40124 +}
40125 +
40126 +/* this is common implementation of vfs's fsync method of struct
40127 + file_operations
40128 +*/
40129 +int sync_common(struct file *file, struct dentry *dentry, int datasync)
40130 +{
40131 + reiser4_context *ctx;
40132 + int result;
40133 +
40134 + ctx = init_context(dentry->d_inode->i_sb);
40135 + if (IS_ERR(ctx))
40136 + return PTR_ERR(ctx);
40137 + result = txnmgr_force_commit_all(dentry->d_inode->i_sb, 0);
40138 +
40139 + context_set_commit_async(ctx);
40140 + reiser4_exit_context(ctx);
40141 + return result;
40142 +}
40143 +
40144 +/* this is common implementation of vfs's sendfile method of struct
40145 + file_operations
40146 +
40147 + Reads @count bytes from @file and calls @actor for every page read. This is
40148 + needed for loopback device support.
40149 +*/
40150 +#if 0
40151 +ssize_t
40152 +sendfile_common(struct file *file, loff_t *ppos, size_t count,
40153 + read_actor_t actor, void *target)
40154 +{
40155 + reiser4_context *ctx;
40156 + ssize_t result;
40157 +
40158 + ctx = init_context(file->f_dentry->d_inode->i_sb);
40159 + if (IS_ERR(ctx))
40160 + return PTR_ERR(ctx);
40161 + result = generic_file_sendfile(file, ppos, count, actor, target);
40162 + reiser4_exit_context(ctx);
40163 + return result;
40164 +}
40165 +#endif /* 0 */
40166 +
40167 +/* address space operations */
40168 +
40169 +/* this is common implementation of vfs's prepare_write method of struct
40170 + address_space_operations
40171 +*/
40172 +int
40173 +prepare_write_common(struct file *file, struct page *page, unsigned from,
40174 + unsigned to)
40175 +{
40176 + reiser4_context *ctx;
40177 + int result;
40178 +
40179 + ctx = init_context(page->mapping->host->i_sb);
40180 + if (IS_ERR(ctx))
40181 + return PTR_ERR(ctx);
40182 + result = do_prepare_write(file, page, from, to);
40181 +
40182 + /* don't commit transaction under inode semaphore */
40183 + context_set_commit_async(ctx);
40184 + reiser4_exit_context(ctx);
40185 +
40186 + return result;
40187 +}
40188 +
40189 +/* this is helper for prepare_write_common and prepare_write_unix_file
40190 + */
40191 +int
40192 +do_prepare_write(struct file *file, struct page *page, unsigned from,
40193 + unsigned to)
40194 +{
40195 + int result;
40196 + file_plugin *fplug;
40197 + struct inode *inode;
40198 +
40199 + assert("umka-3099", file != NULL);
40200 + assert("umka-3100", page != NULL);
40201 + assert("umka-3095", PageLocked(page));
40202 +
40203 + if (to - from == PAGE_CACHE_SIZE || PageUptodate(page))
40204 + return 0;
40205 +
40206 + inode = page->mapping->host;
40207 + fplug = inode_file_plugin(inode);
40208 +
40209 + if (page->mapping->a_ops->readpage == NULL)
40210 + return RETERR(-EINVAL);
40211 +
40212 + result = page->mapping->a_ops->readpage(file, page);
40213 + if (result != 0) {
40214 + SetPageError(page);
40215 + ClearPageUptodate(page);
40216 + /* All reiser4 readpage() implementations should return the
40217 + * page locked in case of error. */
40218 + assert("nikita-3472", PageLocked(page));
40219 + } else {
40220 + /*
40221 + * ->readpage() either:
40222 + *
40223 + * 1. starts IO against @page. @page is locked for IO in
40224 + * this case.
40225 + *
40226 + * 2. doesn't start IO. @page is unlocked.
40227 + *
40228 + * In either case, the page must be locked again before we proceed.
40229 + */
40230 + lock_page(page);
40231 + /*
40232 + * IO (if any) is completed at this point. Check for IO
40233 + * errors.
40234 + */
40235 + if (!PageUptodate(page))
40236 + result = RETERR(-EIO);
40237 + }
40238 + assert("umka-3098", PageLocked(page));
40239 + return result;
40240 +}
40241 +
40242 +/*
40243 + * Local variables:
40244 + * c-indentation-style: "K&R"
40245 + * mode-name: "LC"
40246 + * c-basic-offset: 8
40247 + * tab-width: 8
40248 + * fill-column: 79
40249 + * End:
40250 + */
40251 Index: linux-2.6.16/fs/reiser4/plugin/file_ops_readdir.c
40252 ===================================================================
40253 --- /dev/null
40254 +++ linux-2.6.16/fs/reiser4/plugin/file_ops_readdir.c
40255 @@ -0,0 +1,654 @@
40256 +/* Copyright 2005 by Hans Reiser, licensing governed by
40257 + * reiser4/README */
40258 +
40259 +#include "../inode.h"
40260 +
40261 +/* return true iff @coord points to a valid directory item that is part of
40262 + * the @inode directory. */
40263 +static int is_valid_dir_coord(struct inode *inode, coord_t * coord)
40264 +{
40265 + return
40266 + item_type_by_coord(coord) == DIR_ENTRY_ITEM_TYPE &&
40267 + inode_file_plugin(inode)->owns_item(inode, coord);
40268 +}
40269 +
40270 +/* compare two logical positions within the same directory */
40271 +static cmp_t dir_pos_cmp(const dir_pos * p1, const dir_pos * p2)
40272 +{
40273 + cmp_t result;
40274 +
40275 + assert("nikita-2534", p1 != NULL);
40276 + assert("nikita-2535", p2 != NULL);
40277 +
40278 + result = de_id_cmp(&p1->dir_entry_key, &p2->dir_entry_key);
40279 + if (result == EQUAL_TO) {
40280 + int diff;
40281 +
40282 + diff = p1->pos - p2->pos;
40283 + result =
40284 + (diff < 0) ? LESS_THAN : (diff ? GREATER_THAN : EQUAL_TO);
40285 + }
40286 + return result;
40287 +}
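+
+/* example (assumed numbers): two positions with equal dir_entry_key and
+ pos 2 and pos 5 compare as LESS_THAN, because within a run of duplicate
+ keys the ordering falls back to the ordinal position */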
40288 +
40289 +
40290 +/* see comment before readdir_common() for overview of why "adjustment" is
40291 + * necessary. */
40292 +static void
40293 +adjust_dir_pos(struct file *dir,
40294 + readdir_pos * readdir_spot, const dir_pos * mod_point, int adj)
40295 +{
40296 + dir_pos *pos;
40297 +
40298 + /*
40299 + * new directory entry was added (adj == +1) or removed (adj == -1) at
40300 + * the @mod_point. Directory file descriptor @dir is doing readdir and
40301 + * is currently positioned at @readdir_spot. Latter has to be updated
40302 + * to maintain stable readdir.
40303 + */
40304 + /* directory is positioned to the beginning. */
40305 + if (readdir_spot->entry_no == 0)
40306 + return;
40307 +
40308 + pos = &readdir_spot->position;
40309 + switch (dir_pos_cmp(mod_point, pos)) {
40310 + case LESS_THAN:
40311 + /* @mod_point is _before_ @readdir_spot, that is, entry was
40312 + * added/removed on the left (in key order) of current
40313 + * position. */
40314 + /* logical number of directory entry readdir is "looking" at
40315 + * changes */
40316 + readdir_spot->entry_no += adj;
40317 + assert("nikita-2577",
40318 + ergo(dir != NULL, get_dir_fpos(dir) + adj >= 0));
40319 + if (de_id_cmp(&pos->dir_entry_key,
40320 + &mod_point->dir_entry_key) == EQUAL_TO) {
40321 + assert("nikita-2575", mod_point->pos < pos->pos);
40322 + /*
40323 + * if entry added/removed has the same key as current
40324 + * for readdir, update counter of duplicate keys in
40325 + * @readdir_spot.
40326 + */
40327 + pos->pos += adj;
40328 + }
40329 + break;
40330 + case GREATER_THAN:
40331 + /* directory is modified after @pos: nothing to do. */
40332 + break;
40333 + case EQUAL_TO:
40334 + /* cannot insert an entry readdir is looking at, because it
40335 + already exists. */
40336 + assert("nikita-2576", adj < 0);
40337 + /* directory entry to which @pos points is being
40338 + removed.
40339 +
40340 + NOTE-NIKITA: Right thing to do is to update @pos to point
40341 + to the next entry. This is complex (we are under spin-lock
40342 + for one thing). Just rewind it to the beginning. Next
40343 + readdir will have to scan the beginning of
40344 + directory. Proper solution is to use semaphore in
40345 + spin lock's stead and use rewind_right() here.
40346 +
40347 + NOTE-NIKITA: now, semaphore is used, so...
40348 + */
40349 + memset(readdir_spot, 0, sizeof *readdir_spot);
40350 + }
40351 +}
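+
+/* worked example (assumed numbers): a readdir cursor at entry_no == 10
+ sees a name inserted at a position comparing LESS_THAN; with adj == +1
+ its entry_no becomes 11, so the next readdir resumes at the same name as
+ before */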
40352 +
40353 +/* scan all file-descriptors for this directory and adjust their
40354 + positions accordingly. Should be used by implementations of
40355 + add_entry and rem_entry of dir plugin */
40356 +void
40357 +adjust_dir_file(struct inode *dir, const struct dentry *de, int offset, int adj)
40358 +{
40359 + reiser4_file_fsdata *scan;
40360 + dir_pos mod_point;
40361 +
40362 + assert("nikita-2536", dir != NULL);
40363 + assert("nikita-2538", de != NULL);
40364 + assert("nikita-2539", adj != 0);
40365 +
40366 + build_de_id(dir, &de->d_name, &mod_point.dir_entry_key);
40367 + mod_point.pos = offset;
40368 +
40369 + spin_lock_inode(dir);
40370 +
40371 + /*
40372 + * new entry was added/removed in directory @dir. Scan all file
40373 + * descriptors for @dir that are currently involved into @readdir and
40374 + * update them.
40375 + */
40376 +
40377 + list_for_each_entry(scan, get_readdir_list(dir), dir.linkage)
40378 + adjust_dir_pos(scan->back, &scan->dir.readdir, &mod_point, adj);
40379 +
40380 + spin_unlock_inode(dir);
40381 +}
40382 +
40383 +/*
40384 + * traverse tree to start/continue readdir from the readdir position @pos.
40385 + */
40386 +static int dir_go_to(struct file *dir, readdir_pos * pos, tap_t * tap)
40387 +{
40388 + reiser4_key key;
40389 + int result;
40390 + struct inode *inode;
40391 +
40392 + assert("nikita-2554", pos != NULL);
40393 +
40394 + inode = dir->f_dentry->d_inode;
40395 + result = inode_dir_plugin(inode)->build_readdir_key(dir, &key);
40396 + if (result != 0)
40397 + return result;
40398 + result = object_lookup(inode,
40399 + &key,
40400 + tap->coord,
40401 + tap->lh,
40402 + tap->mode,
40403 + FIND_EXACT,
40404 + LEAF_LEVEL, LEAF_LEVEL, 0, &tap->ra_info);
40405 + if (result == CBK_COORD_FOUND)
40406 + result = rewind_right(tap, (int)pos->position.pos);
40407 + else {
40408 + tap->coord->node = NULL;
40409 + done_lh(tap->lh);
40410 + result = RETERR(-EIO);
40411 + }
40412 + return result;
40413 +}
40414 +
40415 +/*
40416 + * handling of non-unique keys: calculate at what ordinal position within
40417 + * handling of non-unique keys: calculate the ordinal position of @pos
40418 + * within the sequence of directory items with identical keys.
40419 +static int set_pos(struct inode *inode, readdir_pos * pos, tap_t * tap)
40420 +{
40421 + int result;
40422 + coord_t coord;
40423 + lock_handle lh;
40424 + tap_t scan;
40425 + de_id *did;
40426 + reiser4_key de_key;
40427 +
40428 + coord_init_zero(&coord);
40429 + init_lh(&lh);
40430 + tap_init(&scan, &coord, &lh, ZNODE_READ_LOCK);
40431 + tap_copy(&scan, tap);
40432 + tap_load(&scan);
40433 + pos->position.pos = 0;
40434 +
40435 + did = &pos->position.dir_entry_key;
40436 +
40437 + if (is_valid_dir_coord(inode, scan.coord)) {
40438 +
40439 + build_de_id_by_key(unit_key_by_coord(scan.coord, &de_key), did);
40440 +
40441 + while (1) {
40442 +
40443 + result = go_prev_unit(&scan);
40444 + if (result != 0)
40445 + break;
40446 +
40447 + if (!is_valid_dir_coord(inode, scan.coord)) {
40448 + result = -EINVAL;
40449 + break;
40450 + }
40451 +
40452 + /* get key of directory entry */
40453 + unit_key_by_coord(scan.coord, &de_key);
40454 + if (de_id_key_cmp(did, &de_key) != EQUAL_TO) {
40455 + /* duplicate-sequence is over */
40456 + break;
40457 + }
40458 + pos->position.pos++;
40459 + }
40460 + } else
40461 + result = RETERR(-ENOENT);
40462 + tap_relse(&scan);
40463 + tap_done(&scan);
40464 + return result;
40465 +}
40466 +
40467 +/*
40468 + * "rewind" directory to @offset, i.e., set @pos and @tap correspondingly.
40469 + */
40470 +static int dir_rewind(struct file *dir, readdir_pos * pos, tap_t * tap)
40471 +{
40472 + __u64 destination;
40473 + __s64 shift;
40474 + int result;
40475 + struct inode *inode;
40476 + loff_t dirpos;
40477 +
40478 + assert("nikita-2553", dir != NULL);
40479 + assert("nikita-2548", pos != NULL);
40480 + assert("nikita-2551", tap->coord != NULL);
40481 + assert("nikita-2552", tap->lh != NULL);
40482 +
40483 + dirpos = get_dir_fpos(dir);
40484 + shift = dirpos - pos->fpos;
40485 + /* this is logical directory entry within @dir which we are rewinding
40486 + * to */
40487 + destination = pos->entry_no + shift;
40488 +
40489 + inode = dir->f_dentry->d_inode;
40490 + if (dirpos < 0)
40491 + return RETERR(-EINVAL);
40492 + else if (destination == 0ll || dirpos == 0) {
40493 + /* rewind to the beginning of directory */
40494 + memset(pos, 0, sizeof *pos);
40495 + return dir_go_to(dir, pos, tap);
40496 + } else if (destination >= inode->i_size)
40497 + return RETERR(-ENOENT);
40498 +
40499 + if (shift < 0) {
40500 + /* I am afraid of negative numbers */
40501 + shift = -shift;
40502 + /* rewinding to the left */
40503 + if (shift <= (int)pos->position.pos) {
40504 + /* destination is within sequence of entries with
40505 + duplicate keys. */
40506 + result = dir_go_to(dir, pos, tap);
40507 + } else {
40508 + shift -= pos->position.pos;
40509 + while (1) {
40510 + /* repetitions: deadlock is possible when
40511 + going to the left. */
40512 + result = dir_go_to(dir, pos, tap);
40513 + if (result == 0) {
40514 + result = rewind_left(tap, shift);
40515 + if (result == -E_DEADLOCK) {
40516 + tap_done(tap);
40517 + continue;
40518 + }
40519 + }
40520 + break;
40521 + }
40522 + }
40523 + } else {
40524 + /* rewinding to the right */
40525 + result = dir_go_to(dir, pos, tap);
40526 + if (result == 0)
40527 + result = rewind_right(tap, shift);
40528 + }
40529 + if (result == 0) {
40530 + result = set_pos(inode, pos, tap);
40531 + if (result == 0) {
40532 + /* update logical position of the readdir cursor */
40533 + pos->entry_no = destination;
40534 + pos->fpos = dirpos;
40535 + }
40536 + }
40537 + return result;
40538 +}
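+
+/* worked example (assumed numbers): with cached fpos == 7 at
+ entry_no == 7 and a user seek to dirpos == 10, shift is 3 and destination
+ is 10, so the code above rewinds right by three entries */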
40539 +
40540 +/*
40541 + * Function that is called by common_readdir() on each directory entry while
40542 + * doing readdir. ->filldir callback may block, so we have to release the long-term
40543 + * lock while calling it. To avoid repeating tree traversal, seal is used. If
40544 + * seal is broken, we return -E_REPEAT. Node is unlocked in this case.
40545 + *
40546 + * Whether node is unlocked in case of any other error is undefined. It is
40547 + * guaranteed to be still locked if success (0) is returned.
40548 + *
40549 + * When ->filldir() wants no more, feed_entry() returns 1, and node is
40550 + * unlocked.
40551 + */
40552 +static int
40553 +feed_entry(struct file *f,
40554 + readdir_pos * pos, tap_t * tap, filldir_t filldir, void *dirent)
40555 +{
40556 + item_plugin *iplug;
40557 + char *name;
40558 + reiser4_key sd_key;
40559 + int result;
40560 + char buf[DE_NAME_BUF_LEN];
40561 + char name_buf[32];
40562 + char *local_name;
40563 + unsigned file_type;
40564 + seal_t seal;
40565 + coord_t *coord;
40566 + reiser4_key entry_key;
40567 +
40568 + coord = tap->coord;
40569 + iplug = item_plugin_by_coord(coord);
40570 +
40571 + /* pointer to name within the node */
40572 + name = iplug->s.dir.extract_name(coord, buf);
40573 + assert("nikita-1371", name != NULL);
40574 +
40575 + /* key of object the entry points to */
40576 + if (iplug->s.dir.extract_key(coord, &sd_key) != 0)
40577 + return RETERR(-EIO);
40578 +
40579 + /* we must release longterm znode lock before calling filldir to avoid
40580 + deadlock which may happen if filldir causes a page fault. So, copy
40581 + the name to an intermediate buffer */
40582 + if (strlen(name) + 1 > sizeof(name_buf)) {
40583 + local_name = kmalloc(strlen(name) + 1, get_gfp_mask());
40584 + if (local_name == NULL)
40585 + return RETERR(-ENOMEM);
40586 + } else
40587 + local_name = name_buf;
40588 +
40589 + strcpy(local_name, name);
40590 + file_type = iplug->s.dir.extract_file_type(coord);
40591 +
40592 + unit_key_by_coord(coord, &entry_key);
40593 + seal_init(&seal, coord, &entry_key);
40594 +
40595 + longterm_unlock_znode(tap->lh);
40596 +
40597 + /*
40598 + * send information about directory entry to the ->filldir() filler
40599 + * supplied to us by caller (VFS).
40600 + *
40601 + * ->filldir is entitled to do weird things. For example, ->filldir
40602 + * supplied by knfsd re-enters file system. Make sure no locks are
40603 + * held.
40604 + */
40605 + assert("nikita-3436", lock_stack_isclean(get_current_lock_stack()));
40606 +
40607 + result = filldir(dirent, name, (int)strlen(name),
40608 + /* offset of this entry */
40609 + f->f_pos,
40610 + /* inode number of object bound by this entry */
40611 + oid_to_uino(get_key_objectid(&sd_key)), file_type);
40612 + if (local_name != name_buf)
40613 + kfree(local_name);
40614 + if (result < 0)
40615 + /* ->filldir() is satisfied. (no space in buffer, IOW) */
40616 + result = 1;
40617 + else
40618 + result = seal_validate(&seal, coord, &entry_key,
40619 + tap->lh, tap->mode, ZNODE_LOCK_HIPRI);
40620 + return result;
40621 +}
40622 +
40623 +static void move_entry(readdir_pos * pos, coord_t * coord)
40624 +{
40625 + reiser4_key de_key;
40626 + de_id *did;
40627 +
40628 + /* update @pos */
40629 + ++pos->entry_no;
40630 + did = &pos->position.dir_entry_key;
40631 +
40632 + /* get key of directory entry */
40633 + unit_key_by_coord(coord, &de_key);
40634 +
40635 + if (de_id_key_cmp(did, &de_key) == EQUAL_TO)
40636 + /* we are within sequence of directory entries
40637 + with duplicate keys. */
40638 + ++pos->position.pos;
40639 + else {
40640 + pos->position.pos = 0;
40641 + build_de_id_by_key(&de_key, did);
40642 + }
40643 + ++pos->fpos;
40644 +}
40645 +
40646 +/*
40647 + * STATELESS READDIR
40648 + *
40649 + * readdir support in reiser4 relies on ability to update readdir_pos embedded
40650 + * into reiser4_file_fsdata on each directory modification (name insertion and
40651 + * removal), see readdir_common() function below. This obviously doesn't work
40652 + * when reiser4 is accessed over NFS, because NFS doesn't keep any state
40653 + * across client READDIR requests for the same directory.
40654 + *
40655 + * To address this we maintain a "pool" of detached reiser4_file_fsdata
40656 + * (d_cursor). Whenever NFS readdir request comes, we detect this, and try to
40657 + * find detached reiser4_file_fsdata corresponding to previous readdir
40658 + * request. In other words, additional state is maintained on the
40659 + * server. (This is somewhat contrary to the design goals of NFS protocol.)
40660 + *
40661 + * To efficiently detect when our ->readdir() method is called by NFS server,
40662 + * dentry is marked as "stateless" in reiser4_decode_fh() (this is checked by
40663 + * file_is_stateless() function).
40664 + *
40665 + * To find out d_cursor in the pool, we encode client id (cid) in the highest
40666 + * bits of NFS readdir cookie: when first readdir request comes to the given
40667 + * directory from the given client, cookie is set to 0. This situation is
40668 + * detected, global cid_counter is incremented, and stored in highest bits of
40669 + * all direntry offsets returned to the client, including last one. As the
40670 + * only valid readdir cookie is one obtained as direntry->offset, we are
40671 + * guaranteed that next readdir request (continuing current one) will have
40672 + * current cid in the highest bits of starting readdir cookie. All d_cursors
40673 + * are hashed into per-super-block hash table by (oid, cid) key.
40674 + *
40675 + * In addition d_cursors are placed into per-super-block radix tree where they
40676 + * are keyed by oid alone. This is necessary to efficiently remove them during
40677 + * rmdir.
40678 + *
40679 + * At last, currently unused d_cursors are linked into special list. This list
40680 + * is used by d_cursor_shrink to reclaim d_cursors on memory pressure.
40681 + *
40682 + */
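+
+#if 0
+/* sketch of the cookie encoding described above; the shift value and helper
+ names are illustrative assumptions, the real helpers live in the d_cursor
+ code */
+#define EXAMPLE_CID_SHIFT 48
+
+static __u64 pack_cookie_example(__u64 cid, __u64 entry_no)
+{
+ /* client id in the highest bits, readdir position below it */
+ return (cid << EXAMPLE_CID_SHIFT) | entry_no;
+}
+
+static __u64 cookie_to_cid_example(__u64 cookie)
+{
+ return cookie >> EXAMPLE_CID_SHIFT;
+}
+#endif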
40683 +
40684 +
40685 +/*
40686 + * prepare for readdir.
40687 + */
40688 +static int dir_readdir_init(struct file *f, tap_t * tap, readdir_pos ** pos)
40689 +{
40690 + struct inode *inode;
40691 + reiser4_file_fsdata *fsdata;
40692 + int result;
40693 +
40694 + assert("nikita-1359", f != NULL);
40695 + inode = f->f_dentry->d_inode;
40696 + assert("nikita-1360", inode != NULL);
40697 +
40698 + if (!S_ISDIR(inode->i_mode))
40699 + return RETERR(-ENOTDIR);
40700 +
40701 + /* try to find detached readdir state */
40702 + result = try_to_attach_fsdata(f, inode);
40703 + if (result != 0)
40704 + return result;
40705 +
40706 + fsdata = reiser4_get_file_fsdata(f);
40707 + assert("nikita-2571", fsdata != NULL);
40708 + if (IS_ERR(fsdata))
40709 + return PTR_ERR(fsdata);
40710 +
40711 + /* add file descriptor to the readdir list hanging off the directory
40712 + * inode. This list is used to scan "readdirs-in-progress" while
40713 + * inserting or removing names in the directory. */
40714 + spin_lock_inode(inode);
40715 + if (list_empty_careful(&fsdata->dir.linkage))
40716 + list_add(&fsdata->dir.linkage, get_readdir_list(inode));
40717 + *pos = &fsdata->dir.readdir;
40718 + spin_unlock_inode(inode);
40719 +
40720 + /* move @tap to the current position */
40721 + return dir_rewind(f, *pos, tap);
40722 +}
40723 +
40724 +/* this is implementation of vfs's llseek method of struct file_operations for
40725 + typical directory
40726 + See comment before readdir_common() for explanation.
40727 +*/
40728 +loff_t llseek_common_dir(struct file * file, loff_t off, int origin)
40729 +{
40730 + reiser4_context *ctx;
40731 + loff_t result;
40732 + struct inode *inode;
40733 +
40734 + inode = file->f_dentry->d_inode;
40735 +
40736 + ctx = init_context(inode->i_sb);
40737 + if (IS_ERR(ctx))
40738 + return PTR_ERR(ctx);
40739 +
40740 + mutex_lock(&inode->i_mutex);
40741 +
40742 + /* update ->f_pos */
40743 + result = default_llseek(file, off, origin);
40744 + if (result >= 0) {
40745 + int ff;
40746 + coord_t coord;
40747 + lock_handle lh;
40748 + tap_t tap;
40749 + readdir_pos *pos;
40750 +
40751 + coord_init_zero(&coord);
40752 + init_lh(&lh);
40753 + tap_init(&tap, &coord, &lh, ZNODE_READ_LOCK);
40754 +
40755 + ff = dir_readdir_init(file, &tap, &pos);
40756 + detach_fsdata(file);
40757 + if (ff != 0)
40758 + result = (loff_t) ff;
40759 + tap_done(&tap);
40760 + }
40761 + detach_fsdata(file);
40762 + mutex_unlock(&inode->i_mutex);
40763 +
40764 + reiser4_exit_context(ctx);
40765 + return result;
40766 +}
40767 +
40768 +/* this is common implementation of vfs's readdir method of struct
40769 + file_operations
40770 +
40771 + readdir problems:
40772 +
40773 + readdir(2)/getdents(2) interface is based on implicit assumption that
40774 + readdir can be restarted from any particular point by supplying file system
40775 + with off_t-full of data. That is, file system fills ->d_off field in struct
40776 + dirent and later user passes ->d_off to the seekdir(3), which is, actually,
40777 + implemented by glibc as lseek(2) on directory.
40778 +
40779 + Reiser4 cannot restart readdir from 64 bits of data, because the two last
40780 + components of the key of a directory entry are unknown, and the key is 128
40781 + bits: the locality and type fields in the key of a directory entry are
40782 + always known, but to start readdir() from a given point the objectid and
40783 + offset fields have to be filled in.
40784 +
40785 + Traditional UNIX API for scanning through directory
40786 + (readdir/seekdir/telldir/opendir/closedir/rewindir/getdents) is based on the
40787 + assumption that directory is structured very much like regular file, in
40788 + particular, it is implied that each name within given directory (directory
40789 + entry) can be uniquely identified by scalar offset and that such offset is
40790 + stable across the life-time of the name it identifies.
40791 +
40792 + This is manifestly not so for reiser4. In reiser4 the only stable unique
40793 + identifier of a directory entry is its key, which doesn't fit into the
40794 + seekdir/telldir API.
40795 +
40796 + solution:
40797 +
40798 + Within each file descriptor participating in readdir-ing of directory
40799 + plugin/dir/dir.h:readdir_pos is maintained. This structure keeps track of
40800 + the "current" directory entry that file descriptor looks at. It contains a
40801 + key of directory entry (plus some additional info to deal with non-unique
40802 + keys that we won't dwell on here) and a logical position of this
40803 + directory entry starting from the beginning of the directory, that is
40804 + ordinal number of this entry in the readdir order.
40805 +
40806 + Obviously this logical position is not stable in the face of directory
40807 + modifications. To work around this, on each addition or removal of directory
40808 + entry all file descriptors for directory inode are scanned and their
40809 + readdir_pos are updated accordingly (adjust_dir_pos()).
40810 +*/
40811 +int readdir_common(struct file *f /* directory file being read */ ,
40812 + void *dirent /* opaque data passed to us by VFS */ ,
40813 + filldir_t filld /* filler function passed to us by VFS */ )
40814 +{
40815 + reiser4_context *ctx;
40816 + int result;
40817 + struct inode *inode;
40818 + coord_t coord;
40819 + lock_handle lh;
40820 + tap_t tap;
40821 + readdir_pos *pos;
40822 +
40823 + assert("nikita-1359", f != NULL);
40824 + inode = f->f_dentry->d_inode;
40825 + assert("nikita-1360", inode != NULL);
40826 +
40827 + if (!S_ISDIR(inode->i_mode))
40828 + return RETERR(-ENOTDIR);
40829 +
40830 + ctx = init_context(inode->i_sb);
40831 + if (IS_ERR(ctx))
40832 + return PTR_ERR(ctx);
40833 +
40834 + coord_init_zero(&coord);
40835 + init_lh(&lh);
40836 + tap_init(&tap, &coord, &lh, ZNODE_READ_LOCK);
40837 +
40838 + reiser4_readdir_readahead_init(inode, &tap);
40839 +
40840 + repeat:
40841 + result = dir_readdir_init(f, &tap, &pos);
40842 + if (result == 0) {
40843 + result = tap_load(&tap);
40844 + /* scan entries one by one feeding them to @filld */
40845 + while (result == 0) {
40846 + coord_t *coord;
40847 +
40848 + coord = tap.coord;
40849 + assert("nikita-2572", coord_is_existing_unit(coord));
40850 + assert("nikita-3227", is_valid_dir_coord(inode, coord));
40851 +
40852 + result = feed_entry(f, pos, &tap, filld, dirent);
40853 + if (result > 0) {
40854 + break;
40855 + } else if (result == 0) {
40856 + ++f->f_pos;
40857 + result = go_next_unit(&tap);
40858 + if (result == -E_NO_NEIGHBOR ||
40859 + result == -ENOENT) {
40860 + result = 0;
40861 + break;
40862 + } else if (result == 0) {
40863 + if (is_valid_dir_coord(inode, coord))
40864 + move_entry(pos, coord);
40865 + else
40866 + break;
40867 + }
40868 + } else if (result == -E_REPEAT) {
40869 + /* feed_entry() had to restart. */
40870 + ++f->f_pos;
40871 + tap_relse(&tap);
40872 + goto repeat;
40873 + } else
40874 + warning("vs-1617",
40875 + "readdir_common: unexpected error %d",
40876 + result);
40877 + }
40878 + tap_relse(&tap);
40879 +
40880 + if (result >= 0)
40881 + f->f_version = inode->i_version;
40882 + } else if (result == -E_NO_NEIGHBOR || result == -ENOENT)
40883 + result = 0;
40884 + tap_done(&tap);
40885 + detach_fsdata(f);
40886 +
40887 + /* try to update directory's atime */
40888 + if (reiser4_grab_space(inode_file_plugin(inode)->estimate.update(inode),
40889 + BA_CAN_COMMIT) != 0)
40890 + warning("", "failed to update atime on readdir: %llu",
40891 + (unsigned long long)get_inode_oid(inode));
40892 + else
40893 + file_accessed(f);
40894 +
40895 + context_set_commit_async(ctx);
40896 + reiser4_exit_context(ctx);
40897 +
40898 + return (result <= 0) ? result : 0;
40899 +}
40900 +
40901 +/*
40902 + * Local variables:
40903 + * c-indentation-style: "K&R"
40904 + * mode-name: "LC"
40905 + * c-basic-offset: 8
40906 + * tab-width: 8
40907 + * fill-column: 79
40908 + * End:
40909 + */
40910 Index: linux-2.6.16/fs/reiser4/plugin/file_plugin_common.c
40911 ===================================================================
40912 --- /dev/null
40913 +++ linux-2.6.16/fs/reiser4/plugin/file_plugin_common.c
40914 @@ -0,0 +1,929 @@
40915 +/* Copyright 2005 by Hans Reiser, licensing governed by
40916 + reiser4/README */
40917 +
40918 +/* this file contains typical implementations for most of methods of
40919 + file plugin
40920 +*/
40921 +
40922 +#include "../inode.h"
40923 +#include "object.h"
40924 +#include "../safe_link.h"
40925 +
40926 +#include <linux/quotaops.h>
40927 +
40928 +static int insert_new_sd(struct inode *inode);
40929 +static int update_sd(struct inode *inode);
40930 +
40931 +/* this is common implementation of write_sd_by_inode method of file plugin
40932 + either insert stat data or update it
40933 + */
40934 +int write_sd_by_inode_common(struct inode *inode /* object to save */ )
40935 +{
40936 + int result;
40937 +
40938 + assert("nikita-730", inode != NULL);
40939 +
40940 + if (inode_get_flag(inode, REISER4_NO_SD))
40941 + /* object doesn't have stat-data yet */
40942 + result = insert_new_sd(inode);
40943 + else
40944 + result = update_sd(inode);
40945 + if (result != 0 && result != -ENAMETOOLONG && result != -ENOMEM)
40946 + /* don't issue warnings about "name is too long" or lack of memory */
40947 + warning("nikita-2221", "Failed to save sd for %llu: %i",
40948 + (unsigned long long)get_inode_oid(inode), result);
40949 + return result;
40950 +}
40951 +
40952 +/* this is common implementation of key_by_inode method of file plugin
40953 + */
40954 +int
40955 +key_by_inode_and_offset_common(struct inode *inode, loff_t off,
40956 + reiser4_key * key)
40957 +{
40958 + reiser4_key_init(key);
40959 + set_key_locality(key, reiser4_inode_data(inode)->locality_id);
40960 + set_key_ordering(key, get_inode_ordering(inode));
40961 + set_key_objectid(key, get_inode_oid(inode)); /*FIXME: inode->i_ino */
40962 + set_key_type(key, KEY_BODY_MINOR);
40963 + set_key_offset(key, (__u64) off);
40964 + return 0;
40965 +}
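+
+/* example (assumed numbers): for a file with oid 42, offset 8192 yields a
+ body key (locality, ordering, 42, KEY_BODY_MINOR, 8192); only the offset
+ field varies as the file body is scanned */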
40966 +
40967 +/* this is common implementation of set_plug_in_inode method of file plugin
40968 + */
40969 +int set_plug_in_inode_common(struct inode *object /* inode to set plugin on */ ,
40970 + struct inode *parent /* parent object */ ,
40971 + reiser4_object_create_data * data /* creational
40972 + * data */ )
40973 +{
40974 + __u64 mask;
40975 +
40976 + object->i_mode = data->mode;
40977 + /* this should be plugin decision */
40978 + object->i_uid = current->fsuid;
40979 + object->i_mtime = object->i_atime = object->i_ctime = CURRENT_TIME;
40980 +
40981 + /* support for BSD style group-id assignment. See mount's manual page
40982 + description of the bsdgroups ext2 mount option for more details */
40983 + if (reiser4_is_set(object->i_sb, REISER4_BSD_GID))
40984 + object->i_gid = parent->i_gid;
40985 + else if (parent->i_mode & S_ISGID) {
40986 + /* parent directory has the sgid bit set */
40987 + object->i_gid = parent->i_gid;
40988 + if (S_ISDIR(object->i_mode))
40989 + /* sgid is inherited by sub-directories */
40990 + object->i_mode |= S_ISGID;
40991 + } else
40992 + object->i_gid = current->fsgid;
40993 +
40994 + /* this object doesn't have stat-data yet */
40995 + inode_set_flag(object, REISER4_NO_SD);
40996 +#if 0
40997 + /* this is now called after all inode plugins are initialized:
40998 + do_create_vfs_child after adjust_to_parent */
40999 + /* setup inode and file-operations for this inode */
41000 + setup_inode_ops(object, data);
41001 +#endif
41002 + object->i_nlink = 0;
41003 + seal_init(&reiser4_inode_data(object)->sd_seal, NULL, NULL);
41004 + mask = (1 << UNIX_STAT) | (1 << LIGHT_WEIGHT_STAT);
41005 + if (!reiser4_is_set(object->i_sb, REISER4_32_BIT_TIMES))
41006 + mask |= (1 << LARGE_TIMES_STAT);
41007 +
41008 + reiser4_inode_data(object)->extmask = mask;
41009 + return 0;
41010 +}
41011 +
41012 +/* this is common implementation of adjust_to_parent method of file plugin for
41013 + regular files
41014 + */
41015 +int adjust_to_parent_common(struct inode *object /* new object */ ,
41016 + struct inode *parent /* parent directory */ ,
41017 + struct inode *root /* root directory */ )
41018 +{
41019 + assert("nikita-2165", object != NULL);
41020 + if (parent == NULL)
41021 + parent = root;
41022 + assert("nikita-2069", parent != NULL);
41023 +
41024 + /*
41025 + * inherit missing plugins from parent
41026 + */
41027 +
41028 + grab_plugin(object, parent, PSET_FILE);
41029 + grab_plugin(object, parent, PSET_SD);
41030 + grab_plugin(object, parent, PSET_FORMATTING);
41031 + grab_plugin(object, parent, PSET_PERM);
41032 + return 0;
41033 +}
41034 +
41035 +/* this is common implementation of adjust_to_parent method of file plugin for
41036 + typical directories
41037 + */
41038 +int adjust_to_parent_common_dir(struct inode *object /* new object */ ,
41039 + struct inode *parent /* parent directory */ ,
41040 + struct inode *root /* root directory */ )
41041 +{
41042 + int result = 0;
41043 + pset_member memb;
41044 +
41045 + assert("nikita-2166", object != NULL);
41046 + if (parent == NULL)
41047 + parent = root;
41048 + assert("nikita-2167", parent != NULL);
41049 +
41050 + /*
41051 + * inherit missing plugins from parent
41052 + */
41053 + for (memb = 0; memb < PSET_LAST; ++memb) {
41054 + result = grab_plugin(object, parent, memb);
41055 + if (result != 0)
41056 + break;
41057 + }
41058 + return result;
41059 +}
41060 +
41061 +int adjust_to_parent_cryptcompress(struct inode *object /* new object */ ,
41062 + struct inode *parent /* parent directory */,
41063 + struct inode *root /* root directory */)
41064 +{
41065 + int result;
41066 + result = adjust_to_parent_common(object, parent, root);
41067 + if (result)
41068 + return result;
41069 + assert("edward-1416", parent != NULL);
41070 +
41071 + grab_plugin(object, parent, PSET_CLUSTER);
41072 + grab_plugin(object, parent, PSET_CIPHER);
41073 + grab_plugin(object, parent, PSET_DIGEST);
41074 + grab_plugin(object, parent, PSET_COMPRESSION);
41075 + grab_plugin(object, parent, PSET_COMPRESSION_MODE);
41076 +
41077 + return 0;
41078 +}
41079 +
41080 +/* this is common implementation of create_object method of file plugin
41081 + */
41082 +int
41083 +create_object_common(struct inode *object, struct inode *parent UNUSED_ARG,
41084 + reiser4_object_create_data * data UNUSED_ARG)
41085 +{
41086 + reiser4_block_nr reserve;
41087 + assert("nikita-744", object != NULL);
41088 + assert("nikita-745", parent != NULL);
41089 + assert("nikita-747", data != NULL);
41090 + assert("nikita-748", inode_get_flag(object, REISER4_NO_SD));
41091 +
41092 + reserve = estimate_create_common(object);
41093 + if (reiser4_grab_space(reserve, BA_CAN_COMMIT))
41094 + return RETERR(-ENOSPC);
41095 + return write_sd_by_inode_common(object);
41096 +}
41097 +
41098 +static int common_object_delete_no_reserve(struct inode *inode);
41099 +
41100 +/**
41101 + * delete_object_common - delete_object of file_plugin
41102 + * @inode: inode to be deleted
41103 + *
41104 + * This is common implementation of delete_object method of file_plugin. It
41105 + * applies to an object whose deletion consists of removing two items: stat data
41106 + * and safe-link.
41107 + */
41108 +int delete_object_common(struct inode *inode)
41109 +{
41110 + int result;
41111 +
41112 + assert("nikita-1477", inode != NULL);
41113 + /* FIXME: if file body deletion failed (i/o error, for instance),
41114 + inode->i_size can be != 0 here */
41115 + assert("nikita-3420", inode->i_size == 0 || S_ISLNK(inode->i_mode));
41116 + assert("nikita-3421", inode->i_nlink == 0);
41117 +
41118 +
41119 + if (!inode_get_flag(inode, REISER4_NO_SD)) {
41120 + reiser4_block_nr reserve;
41121 +
41122 + /* grab space which is needed to remove 2 items from the tree:
41123 + stat data and safe-link */
41124 + reserve = 2 * estimate_one_item_removal(tree_by_inode(inode));
41125 + if (reiser4_grab_space_force(reserve,
41126 + BA_RESERVED | BA_CAN_COMMIT))
41127 + return RETERR(-ENOSPC);
41128 + result = common_object_delete_no_reserve(inode);
41129 + } else
41130 + result = 0;
41131 + return result;
41132 +}
41133 +
41134 +/**
41135 + * delete_directory_common - delete_object of file_plugin
41136 + * @inode: inode to be deleted
41137 + *
41138 + * This is common implementation of delete_object method of file_plugin for
41139 + * typical directory. It calls done method of dir_plugin to remove "." and
41140 + * removes stat data and safe-link.
41141 + */
41142 +int delete_directory_common(struct inode *inode)
41143 +{
41144 + int result;
41145 + dir_plugin *dplug;
41146 +
41147 + assert("", (get_current_context() &&
41148 + get_current_context()->trans->atom == NULL));
41149 +
41150 + dplug = inode_dir_plugin(inode);
41151 + assert("vs-1101", dplug && dplug->done);
41152 +
41153 + /* kill cursors which might be attached to inode */
41154 + kill_cursors(inode);
41155 +
41156 + /* grab space enough for removing two items */
41157 + if (reiser4_grab_space
41158 + (2 * estimate_one_item_removal(tree_by_inode(inode)),
41159 + BA_RESERVED | BA_CAN_COMMIT))
41160 + return RETERR(-ENOSPC);
41161 +
41162 + result = dplug->done(inode);
41163 + if (!result)
41164 + result = common_object_delete_no_reserve(inode);
41165 + return result;
41166 +}
41167 +
41168 +/* this is common implementation of add_link method of file plugin
41169 + */
41170 +int add_link_common(struct inode *object, struct inode *parent UNUSED_ARG)
41171 +{
41172 + /*
41173 + * increment ->i_nlink and update ->i_ctime
41174 + */
41175 +
41176 + INODE_INC_FIELD(object, i_nlink);
41177 + object->i_ctime = CURRENT_TIME;
41178 + return 0;
41179 +}
41180 +
41181 +/* this is common implementation of rem_link method of file plugin
41182 + */
41183 +int rem_link_common(struct inode *object, struct inode *parent UNUSED_ARG)
41184 +{
41185 + assert("nikita-2021", object != NULL);
41186 + assert("nikita-2163", object->i_nlink > 0);
41187 +
41188 + /*
41189 + * decrement ->i_nlink and update ->i_ctime
41190 + */
41191 +
41192 + INODE_DEC_FIELD(object, i_nlink);
41193 + object->i_ctime = CURRENT_TIME;
41194 + return 0;
41195 +}
41196 +
41197 +/* this is common implementation of rem_link method of file plugin for typical
41198 + directory
41199 +*/
41200 +int rem_link_common_dir(struct inode *object, struct inode *parent UNUSED_ARG)
41201 +{
41202 + assert("nikita-20211", object != NULL);
41203 + assert("nikita-21631", object->i_nlink > 0);
41204 +
41205 + /*
41206 + * decrement ->i_nlink and update ->i_ctime
41207 + */
41208 + INODE_DEC_FIELD(object, i_nlink);
41209 + if (object->i_nlink == 1)
41210 + INODE_DEC_FIELD(object, i_nlink);
41211 + object->i_ctime = CURRENT_TIME;
41212 + return 0;
41213 +}
41214 +
41215 +/* this is common implementation of owns_item method of file plugin
41216 + compare objectids of keys in inode and coord */
41217 +int owns_item_common(const struct inode *inode, /* object to check
41218 + * against */
41219 + const coord_t * coord /* coord to check */ )
41220 +{
41221 + reiser4_key item_key;
41222 + reiser4_key file_key;
41223 +
41224 + assert("nikita-760", inode != NULL);
41225 + assert("nikita-761", coord != NULL);
41226 +
41227 + return coord_is_existing_item(coord) &&
41228 + (get_key_objectid(build_sd_key(inode, &file_key)) ==
41229 + get_key_objectid(item_key_by_coord(coord, &item_key)));
41230 +}
41231 +
41232 +/* this is common implementation of owns_item method of file plugin
41233 + for typical directory
41234 +*/
41235 +int owns_item_common_dir(const struct inode *inode, /* object to check against */
41236 + const coord_t * coord /* coord of item to check */ )
41237 +{
41238 + reiser4_key item_key;
41239 +
41240 + assert("nikita-1335", inode != NULL);
41241 + assert("nikita-1334", coord != NULL);
41242 +
41243 + if (item_type_by_coord(coord) == DIR_ENTRY_ITEM_TYPE)
41244 + return get_key_locality(item_key_by_coord(coord, &item_key)) ==
41245 + get_inode_oid(inode);
41246 + else
41247 + return owns_item_common(inode, coord);
41248 +}
41249 +
41250 +/* this is the common implementation of the can_add_link method of the file
41251 +   plugin: checks whether yet another hard link to this object can be added
41252 +*/
41253 +int can_add_link_common(const struct inode *object /* object to check */ )
41254 +{
41255 + assert("nikita-732", object != NULL);
41256 +
41257 + /* inode->i_nlink is unsigned int, so just check for integer
41258 + overflow */
41259 + return object->i_nlink + 1 != 0;
41260 +}
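+/* Illustrative example (annotation, assuming a 32-bit unsigned i_nlink as
+ * noted above): the check refuses a new link only when the counter would
+ * wrap around:
+ *
+ *	i_nlink == 5		-> 5 + 1 == 6, != 0: link allowed
+ *	i_nlink == 0xffffffff	-> 0xffffffff + 1 wraps to 0: link refused
+ */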
41261 +
41262 +/* this is the common implementation of the can_rem_link method of the file
41263 +   plugin for a typical directory
41264 +*/
41265 +int can_rem_link_common_dir(const struct inode *inode)
41266 +{
41267 +	/* is_dir_empty() returns 0 if dir is empty */
41268 + return !is_dir_empty(inode);
41269 +}
41270 +
41271 +/* this is the common implementation of the detach method of the file plugin
41272 +   for a typical directory
41273 +*/
41274 +int detach_common_dir(struct inode *child, struct inode *parent)
41275 +{
41276 + dir_plugin *dplug;
41277 +
41278 + dplug = inode_dir_plugin(child);
41279 + assert("nikita-2883", dplug != NULL);
41280 + assert("nikita-2884", dplug->detach != NULL);
41281 + return dplug->detach(child, parent);
41282 +}
41283 +
41284 +/* this is the common implementation of the bind method of the file plugin
41285 +   for a typical directory
41286 +*/
41287 +int bind_common_dir(struct inode *child, struct inode *parent)
41288 +{
41289 + dir_plugin *dplug;
41290 +
41291 + dplug = inode_dir_plugin(child);
41292 + assert("nikita-2646", dplug != NULL);
41293 + return dplug->attach(child, parent);
41294 +}
41295 +
41296 +static int process_truncate(struct inode *, __u64 size);
41297 +
41298 +/* this is the common implementation of the safelink method of the file plugin
41299 + */
41300 +int safelink_common(struct inode *object, reiser4_safe_link_t link, __u64 value)
41301 +{
41302 + int result;
41303 +
41304 + assert("vs-1705", get_current_context()->trans->atom == NULL);
41305 + if (link == SAFE_UNLINK)
41306 + /* nothing to do. iput() in the caller (process_safelink) will
41307 + * finish with file */
41308 + result = 0;
41309 + else if (link == SAFE_TRUNCATE)
41310 + result = process_truncate(object, value);
41311 + else {
41312 + warning("nikita-3438", "Unrecognized safe-link type: %i", link);
41313 + result = RETERR(-EIO);
41314 + }
41315 + return result;
41316 +}
41317 +
41318 +/* this is the common implementation of the estimate.create method of the file
41319 +   plugin; can be used when object creation involves insertion of one item
41320 +   (usually stat data) into the tree
41321 +*/
41322 +reiser4_block_nr estimate_create_common(const struct inode * object)
41323 +{
41324 + return estimate_one_insert_item(tree_by_inode(object));
41325 +}
41326 +
41327 +/* this is the common implementation of the estimate.create method of the file
41328 +   plugin for a typical directory;
41329 +   can be used when directory creation involves insertion of two items (usually
41330 +   stat data and the item containing "." and "..") into the tree
41331 +*/
41332 +reiser4_block_nr estimate_create_common_dir(const struct inode * object)
41333 +{
41334 + return 2 * estimate_one_insert_item(tree_by_inode(object));
41335 +}
41336 +
41337 +/* this is the common implementation of the estimate.update method of the file
41338 +   plugin; can be used when a stat data update does no more than inserting a
41339 +   unit into a stat data item, which is probably true in most cases
41340 +*/
41341 +reiser4_block_nr estimate_update_common(const struct inode * inode)
41342 +{
41343 + return estimate_one_insert_into_item(tree_by_inode(inode));
41344 +}
41345 +
41346 +/* this is the common implementation of the estimate.unlink method of the file plugin
41347 + */
41348 +reiser4_block_nr
41349 +estimate_unlink_common(const struct inode * object UNUSED_ARG,
41350 + const struct inode * parent UNUSED_ARG)
41351 +{
41352 + return 0;
41353 +}
41354 +
41355 +/* this is the common implementation of the estimate.unlink method of the file
41356 +   plugin for a typical directory
41357 +*/
41358 +reiser4_block_nr
41359 +estimate_unlink_common_dir(const struct inode * object,
41360 + const struct inode * parent)
41361 +{
41362 + dir_plugin *dplug;
41363 +
41364 + dplug = inode_dir_plugin(object);
41365 + assert("nikita-2888", dplug != NULL);
41366 + assert("nikita-2887", dplug->estimate.unlink != NULL);
41367 + return dplug->estimate.unlink(object, parent);
41368 +}
41369 +
41370 +char *wire_write_common(struct inode *inode, char *start)
41371 +{
41372 + return build_inode_onwire(inode, start);
41373 +}
41374 +
41375 +char *wire_read_common(char *addr, reiser4_object_on_wire * obj)
41376 +{
41377 + return extract_obj_key_id_from_onwire(addr, &obj->u.std.key_id);
41378 +}
41379 +
41380 +struct dentry *wire_get_common(struct super_block *sb,
41381 + reiser4_object_on_wire * obj)
41382 +{
41383 + struct inode *inode;
41384 + struct dentry *dentry;
41385 + reiser4_key key;
41386 +
41387 + extract_key_from_id(&obj->u.std.key_id, &key);
41388 + inode = reiser4_iget(sb, &key, 1);
41389 + if (!IS_ERR(inode)) {
41390 + reiser4_iget_complete(inode);
41391 + dentry = d_alloc_anon(inode);
41392 + if (dentry == NULL) {
41393 + iput(inode);
41394 + dentry = ERR_PTR(-ENOMEM);
41395 + } else
41396 + dentry->d_op = &get_super_private(sb)->ops.dentry;
41397 + } else if (PTR_ERR(inode) == -ENOENT)
41398 + /*
41399 + * inode wasn't found at the key encoded in the file
41400 + * handle. Hence, file handle is stale.
41401 + */
41402 + dentry = ERR_PTR(RETERR(-ESTALE));
41403 + else
41404 + dentry = (void *)inode;
41405 + return dentry;
41406 +}
41407 +
41408 +int wire_size_common(struct inode *inode)
41409 +{
41410 + return inode_onwire_size(inode);
41411 +}
41412 +
41413 +void wire_done_common(reiser4_object_on_wire * obj)
41414 +{
41415 + /* nothing to do */
41416 +}
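+/* Annotation: taken together, the wire_*_common() methods above implement
+ * an NFS-style file-handle round trip. A sketch of the expected call order
+ * (hypothetical caller, not from this patch):
+ *
+ *	size = wire_size_common(inode);
+ *	wire_write_common(inode, buf);		encode key into handle
+ *	...					handle leaves the host
+ *	wire_read_common(buf, &obj);		decode key id
+ *	dentry = wire_get_common(sb, &obj);	iget by key, or -ESTALE
+ *	wire_done_common(&obj);
+ */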
41417 +
41418 +/* helper function to print errors */
41419 +static void key_warning(const reiser4_key * key /* key to print */ ,
41420 + const struct inode *inode,
41421 + int code /* error code to print */ )
41422 +{
41423 + assert("nikita-716", key != NULL);
41424 +
41425 + if (code != -ENOMEM) {
41426 + warning("nikita-717", "Error for inode %llu (%i)",
41427 + (unsigned long long)get_key_objectid(key), code);
41428 + print_key("for key", key);
41429 + }
41430 +}
41431 +
41432 +/* NIKITA-FIXME-HANS: perhaps this function belongs in another file? */
41433 +#if REISER4_DEBUG
41434 +static void
41435 +check_inode_seal(const struct inode *inode,
41436 + const coord_t * coord, const reiser4_key * key)
41437 +{
41438 + reiser4_key unit_key;
41439 +
41440 + unit_key_by_coord(coord, &unit_key);
41441 + assert("nikita-2752",
41442 + WITH_DATA_RET(coord->node, 1, keyeq(key, &unit_key)));
41443 + assert("nikita-2753", get_inode_oid(inode) == get_key_objectid(key));
41444 +}
41445 +
41446 +static void check_sd_coord(coord_t * coord, const reiser4_key * key)
41447 +{
41448 + reiser4_key ukey;
41449 +
41450 + coord_clear_iplug(coord);
41451 + if (zload(coord->node))
41452 + return;
41453 +
41454 + if (!coord_is_existing_unit(coord) ||
41455 + !item_plugin_by_coord(coord) ||
41456 + !keyeq(unit_key_by_coord(coord, &ukey), key) ||
41457 + (znode_get_level(coord->node) != LEAF_LEVEL) ||
41458 + !item_is_statdata(coord)) {
41459 + warning("nikita-1901", "Conspicuous seal");
41460 + print_key("key", key);
41461 + print_coord("coord", coord, 1);
41462 + impossible("nikita-2877", "no way");
41463 + }
41464 + zrelse(coord->node);
41465 +}
41466 +
41467 +#else
41468 +#define check_inode_seal(inode, coord, key) noop
41469 +#define check_sd_coord(coord, key) noop
41470 +#endif
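+/* Annotation: with REISER4_DEBUG disabled, the two checks above compile to
+ * noop, so the callers below can invoke check_inode_seal() and
+ * check_sd_coord() without #ifdef guards at no cost in production builds.
+ */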
41471 +
41472 +/* insert new stat-data into tree. Called with inode state
41473 + locked. Return inode state locked. */
41474 +static int insert_new_sd(struct inode *inode /* inode to create sd for */ )
41475 +{
41476 + int result;
41477 + reiser4_key key;
41478 + coord_t coord;
41479 + reiser4_item_data data;
41480 + char *area;
41481 + reiser4_inode *ref;
41482 + lock_handle lh;
41483 + oid_t oid;
41484 +
41485 + assert("nikita-723", inode != NULL);
41486 + assert("nikita-3406", inode_get_flag(inode, REISER4_NO_SD));
41487 +
41488 + ref = reiser4_inode_data(inode);
41489 + spin_lock_inode(inode);
41490 +
41491 + if (ref->plugin_mask != 0)
41492 + /* inode has non-standard plugins */
41493 + inode_set_extension(inode, PLUGIN_STAT);
41494 + /*
41495 + * prepare specification of new item to be inserted
41496 + */
41497 +
41498 + data.iplug = inode_sd_plugin(inode);
41499 + data.length = data.iplug->s.sd.save_len(inode);
41500 + spin_unlock_inode(inode);
41501 +
41502 + data.data = NULL;
41503 + data.user = 0;
41504 +/* could be optimized for the case where there is only one node format in
41505 + * use in the filesystem; there are probably lots of such
41506 + * places we could optimize for only one node layout.... -Hans */
41507 + if (data.length > tree_by_inode(inode)->nplug->max_item_size()) {
41508 + /* This is silly check, but we don't know actual node where
41509 +		/* This is a crude check, but we don't know the actual node
41510 +		   the insertion will go into. */
41511 + }
41512 + oid = oid_allocate(inode->i_sb);
41513 +/* NIKITA-FIXME-HANS: what is your opinion on whether this error check should be encapsulated into oid_allocate? */
41514 + if (oid == ABSOLUTE_MAX_OID)
41515 + return RETERR(-EOVERFLOW);
41516 +
41517 + set_inode_oid(inode, oid);
41518 +
41519 + coord_init_zero(&coord);
41520 + init_lh(&lh);
41521 +
41522 + result = insert_by_key(tree_by_inode(inode),
41523 + build_sd_key(inode, &key), &data, &coord, &lh,
41524 + /* stat data lives on a leaf level */
41525 + LEAF_LEVEL, CBK_UNIQUE);
41526 +
41527 +	/* we don't want to re-check that somebody didn't insert
41528 +	   stat-data while we were doing io, because if somebody did,
41529 +	   insert_by_key() returned an error. */
41530 +	/* but what _is_ possible is that the plugin for the inode's
41531 +	   stat-data, the list of non-standard plugins or their state
41532 +	   could change during io, so that the stat-data would no longer
41533 +	   fit. To avoid this race we keep the inode_state lock, which
41534 +	   has to be taken each time the inode is accessed in a way that
41535 +	   could change the sd size: changing plugins etc.
41536 +	 */
41537 +
41538 + if (result == IBK_INSERT_OK) {
41539 + coord_clear_iplug(&coord);
41540 + result = zload(coord.node);
41541 + if (result == 0) {
41542 + /* have we really inserted stat data? */
41543 + assert("nikita-725", item_is_statdata(&coord));
41544 +
41545 + /* inode was just created. It is inserted into hash
41546 + table, but no directory entry was yet inserted into
41547 + parent. So, inode is inaccessible through
41548 + ->lookup(). All places that directly grab inode
41549 + from hash-table (like old knfsd), should check
41550 + IMMUTABLE flag that is set by common_create_child.
41551 + */
41552 + assert("nikita-3240", data.iplug != NULL);
41553 + assert("nikita-3241", data.iplug->s.sd.save != NULL);
41554 + area = item_body_by_coord(&coord);
41555 + result = data.iplug->s.sd.save(inode, &area);
41556 + znode_make_dirty(coord.node);
41557 + if (result == 0) {
41558 + /* object has stat-data now */
41559 + inode_clr_flag(inode, REISER4_NO_SD);
41560 + inode_set_flag(inode, REISER4_SDLEN_KNOWN);
41561 + /* initialise stat-data seal */
41562 + seal_init(&ref->sd_seal, &coord, &key);
41563 + ref->sd_coord = coord;
41564 + check_inode_seal(inode, &coord, &key);
41565 + } else if (result != -ENOMEM)
41566 + /*
41567 + * convert any other error code to -EIO to
41568 + * avoid confusing user level with unexpected
41569 + * errors.
41570 + */
41571 + result = RETERR(-EIO);
41572 + zrelse(coord.node);
41573 + }
41574 + }
41575 + done_lh(&lh);
41576 +
41577 + if (result != 0)
41578 + key_warning(&key, inode, result);
41579 + else
41580 + oid_count_allocated();
41581 +
41582 + return result;
41583 +}
41584 +
41585 +/* find sd of inode in a tree, deal with errors */
41586 +int lookup_sd(struct inode *inode /* inode to look sd for */ ,
41587 + znode_lock_mode lock_mode /* lock mode */ ,
41588 + coord_t * coord /* resulting coord */ ,
41589 + lock_handle * lh /* resulting lock handle */ ,
41590 + const reiser4_key * key /* resulting key */ ,
41591 + int silent)
41592 +{
41593 + int result;
41594 + __u32 flags;
41595 +
41596 + assert("nikita-1692", inode != NULL);
41597 + assert("nikita-1693", coord != NULL);
41598 + assert("nikita-1694", key != NULL);
41599 +
41600 +	/* look for the object's stat data in the tree.
41601 +	   This fills "coord" with the position of the item found and
41602 +	   "lh" with a handle on the locked znode. Both are only valid
41603 +	   if the lookup succeeds. */
41604 + flags = (lock_mode == ZNODE_WRITE_LOCK) ? CBK_FOR_INSERT : 0;
41605 + flags |= CBK_UNIQUE;
41606 + /*
41607 + * traverse tree to find stat data. We cannot use vroot here, because
41608 +	 * it only covers the _body_ of the file, and stat data doesn't belong
41609 + * there.
41610 + */
41611 + result = coord_by_key(tree_by_inode(inode),
41612 + key,
41613 + coord,
41614 + lh,
41615 + lock_mode,
41616 + FIND_EXACT, LEAF_LEVEL, LEAF_LEVEL, flags, NULL);
41617 + if (REISER4_DEBUG && result == 0)
41618 + check_sd_coord(coord, key);
41619 +
41620 + if (result != 0 && !silent)
41621 + key_warning(key, inode, result);
41622 + return result;
41623 +}
41624 +
41625 +static int
41626 +locate_inode_sd(struct inode *inode,
41627 + reiser4_key * key, coord_t * coord, lock_handle * lh)
41628 +{
41629 + reiser4_inode *state;
41630 + seal_t seal;
41631 + int result;
41632 +
41633 + assert("nikita-3483", inode != NULL);
41634 +
41635 + state = reiser4_inode_data(inode);
41636 + spin_lock_inode(inode);
41637 + *coord = state->sd_coord;
41638 + coord_clear_iplug(coord);
41639 + seal = state->sd_seal;
41640 + spin_unlock_inode(inode);
41641 +
41642 + build_sd_key(inode, key);
41643 + if (seal_is_set(&seal)) {
41644 + /* first, try to use seal */
41645 + result = seal_validate(&seal,
41646 + coord,
41647 + key,
41648 + lh, ZNODE_WRITE_LOCK, ZNODE_LOCK_LOPRI);
41649 + if (result == 0)
41650 + check_sd_coord(coord, key);
41651 + } else
41652 + result = -E_REPEAT;
41653 +
41654 + if (result != 0) {
41655 + coord_init_zero(coord);
41656 + result = lookup_sd(inode, ZNODE_WRITE_LOCK, coord, lh, key, 0);
41657 + }
41658 + return result;
41659 +}
41660 +
41661 +/* update stat-data at @coord */
41662 +static int
41663 +update_sd_at(struct inode *inode, coord_t * coord, reiser4_key * key,
41664 + lock_handle * lh)
41665 +{
41666 + int result;
41667 + reiser4_item_data data;
41668 + char *area;
41669 + reiser4_inode *state;
41670 + znode *loaded;
41671 +
41672 + state = reiser4_inode_data(inode);
41673 +
41674 + coord_clear_iplug(coord);
41675 + result = zload(coord->node);
41676 + if (result != 0)
41677 + return result;
41678 + loaded = coord->node;
41679 +
41680 + spin_lock_inode(inode);
41681 + assert("nikita-728", inode_sd_plugin(inode) != NULL);
41682 + data.iplug = inode_sd_plugin(inode);
41683 +
41684 + /* if inode has non-standard plugins, add appropriate stat data
41685 + * extension */
41686 + if (state->plugin_mask != 0)
41687 + inode_set_extension(inode, PLUGIN_STAT);
41688 +
41689 + /* data.length is how much space to add to (or remove
41690 + from if negative) sd */
41691 + if (!inode_get_flag(inode, REISER4_SDLEN_KNOWN)) {
41692 + /* recalculate stat-data length */
41693 + data.length =
41694 + data.iplug->s.sd.save_len(inode) -
41695 + item_length_by_coord(coord);
41696 + inode_set_flag(inode, REISER4_SDLEN_KNOWN);
41697 + } else
41698 + data.length = 0;
41699 + spin_unlock_inode(inode);
41700 +
41701 + /* if on-disk stat data is of different length than required
41702 + for this inode, resize it */
41703 + if (data.length != 0) {
41704 + data.data = NULL;
41705 + data.user = 0;
41706 +
41707 + /* insertion code requires that insertion point (coord) was
41708 + * between units. */
41709 + coord->between = AFTER_UNIT;
41710 + result = resize_item(coord,
41711 + &data, key, lh, COPI_DONT_SHIFT_LEFT);
41712 + if (result != 0) {
41713 + key_warning(key, inode, result);
41714 + zrelse(loaded);
41715 + return result;
41716 + }
41717 + if (loaded != coord->node) {
41718 + /* resize_item moved coord to another node. Zload it */
41719 + zrelse(loaded);
41720 + coord_clear_iplug(coord);
41721 + result = zload(coord->node);
41722 + if (result != 0)
41723 + return result;
41724 + loaded = coord->node;
41725 + }
41726 + }
41727 +
41728 + area = item_body_by_coord(coord);
41729 + spin_lock_inode(inode);
41730 + result = data.iplug->s.sd.save(inode, &area);
41731 + znode_make_dirty(coord->node);
41732 +
41733 + /* re-initialise stat-data seal */
41734 +
41735 + /*
41736 + * coord.between was possibly skewed from AT_UNIT when stat-data size
41737 + * was changed and new extensions were pasted into item.
41738 + */
41739 + coord->between = AT_UNIT;
41740 + seal_init(&state->sd_seal, coord, key);
41741 + state->sd_coord = *coord;
41742 + spin_unlock_inode(inode);
41743 + check_inode_seal(inode, coord, key);
41744 + zrelse(loaded);
41745 + return result;
41746 +}
41747 +
41748 +/* Update existing stat-data in a tree. Called with inode state locked. Return
41749 + inode state locked. */
41750 +static int update_sd(struct inode *inode /* inode to update sd for */ )
41751 +{
41752 + int result;
41753 + reiser4_key key;
41754 + coord_t coord;
41755 + lock_handle lh;
41756 +
41757 + assert("nikita-726", inode != NULL);
41758 +
41759 + /* no stat-data, nothing to update?! */
41760 + assert("nikita-3482", !inode_get_flag(inode, REISER4_NO_SD));
41761 +
41762 + init_lh(&lh);
41763 +
41764 + result = locate_inode_sd(inode, &key, &coord, &lh);
41765 + if (result == 0)
41766 + result = update_sd_at(inode, &coord, &key, &lh);
41767 + done_lh(&lh);
41768 +
41769 + return result;
41770 +}
41771 +
41772 +/* helper for delete_object_common and delete_directory_common. Removes object
41773 +   stat data. Space for that must be reserved by the caller beforehand
41774 +*/
41775 +static int
41776 +common_object_delete_no_reserve(struct inode *inode /* object to remove */ )
41777 +{
41778 + int result;
41779 +
41780 + assert("nikita-1477", inode != NULL);
41781 +
41782 + if (!inode_get_flag(inode, REISER4_NO_SD)) {
41783 + reiser4_key sd_key;
41784 +
41785 + DQUOT_FREE_INODE(inode);
41786 + DQUOT_DROP(inode);
41787 +
41788 + build_sd_key(inode, &sd_key);
41789 + result =
41790 + cut_tree(tree_by_inode(inode), &sd_key, &sd_key, NULL, 0);
41791 + if (result == 0) {
41792 + inode_set_flag(inode, REISER4_NO_SD);
41793 + result = oid_release(inode->i_sb, get_inode_oid(inode));
41794 + if (result == 0) {
41795 + oid_count_released();
41796 +
41797 + result = safe_link_del(tree_by_inode(inode),
41798 + get_inode_oid(inode),
41799 + SAFE_UNLINK);
41800 + }
41801 + }
41802 + } else
41803 + result = 0;
41804 + return result;
41805 +}
41806 +
41807 +/* helper for safelink_common */
41808 +static int process_truncate(struct inode *inode, __u64 size)
41809 +{
41810 + int result;
41811 + struct iattr attr;
41812 + file_plugin *fplug;
41813 + reiser4_context *ctx;
41814 + struct dentry dentry;
41815 +
41816 + assert("vs-21", is_in_reiser4_context());
41817 + ctx = init_context(inode->i_sb);
41818 + assert("vs-22", !IS_ERR(ctx));
41819 +
41820 + attr.ia_size = size;
41821 + attr.ia_valid = ATTR_SIZE | ATTR_CTIME;
41822 + fplug = inode_file_plugin(inode);
41823 +
41824 + mutex_lock(&inode->i_mutex);
41825 + assert("vs-1704", get_current_context()->trans->atom == NULL);
41826 + dentry.d_inode = inode;
41827 + result = inode->i_op->setattr(&dentry, &attr);
41828 + mutex_unlock(&inode->i_mutex);
41829 +
41830 + context_set_commit_async(ctx);
41831 + reiser4_exit_context(ctx);
41832 +
41833 + return result;
41834 +}
41835 +
41836 +/* Local variables:
41837 + c-indentation-style: "K&R"
41838 + mode-name: "LC"
41839 + c-basic-offset: 8
41840 + tab-width: 8
41841 + fill-column: 120
41842 + End:
41843 +*/
41844 Index: linux-2.6.16/fs/reiser4/plugin/hash.c
41845 ===================================================================
41846 --- /dev/null
41847 +++ linux-2.6.16/fs/reiser4/plugin/hash.c
41848 @@ -0,0 +1,350 @@
41849 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
41850 + * reiser4/README */
41851 +
41852 +/* Hash functions */
41853 +
41854 +#include "../debug.h"
41855 +#include "plugin_header.h"
41856 +#include "plugin.h"
41857 +#include "../super.h"
41858 +#include "../inode.h"
41859 +
41860 +#include <linux/types.h>
41861 +
41862 +/* old rupasov (yura) hash */
41863 +static __u64 hash_rupasov(const unsigned char *name /* name to hash */ ,
41864 + int len /* @name's length */ )
41865 +{
41866 + int i;
41867 + int j;
41868 + int pow;
41869 + __u64 a;
41870 + __u64 c;
41871 +
41872 + assert("nikita-672", name != NULL);
41873 + assert("nikita-673", len >= 0);
41874 +
41875 + for (pow = 1, i = 1; i < len; ++i)
41876 + pow = pow * 10;
41877 +
41878 + if (len == 1)
41879 + a = name[0] - 48;
41880 + else
41881 + a = (name[0] - 48) * pow;
41882 +
41883 + for (i = 1; i < len; ++i) {
41884 + c = name[i] - 48;
41885 + for (pow = 1, j = i; j < len - 1; ++j)
41886 + pow = pow * 10;
41887 + a = a + c * pow;
41888 + }
41889 + for (; i < 40; ++i) {
41890 + c = '0' - 48;
41891 + for (pow = 1, j = i; j < len - 1; ++j)
41892 + pow = pow * 10;
41893 + a = a + c * pow;
41894 + }
41895 +
41896 + for (; i < 256; ++i) {
41897 + c = i;
41898 + for (pow = 1, j = i; j < len - 1; ++j)
41899 + pow = pow * 10;
41900 + a = a + c * pow;
41901 + }
41902 +
41903 + a = a << 7;
41904 + return a;
41905 +}
41906 +
41907 +/* r5 hash */
41908 +static __u64 hash_r5(const unsigned char *name /* name to hash */ ,
41909 + int len UNUSED_ARG /* @name's length */ )
41910 +{
41911 + __u64 a = 0;
41912 +
41913 + assert("nikita-674", name != NULL);
41914 + assert("nikita-675", len >= 0);
41915 +
41916 + while (*name) {
41917 + a += *name << 4;
41918 + a += *name >> 4;
41919 + a *= 11;
41920 + name++;
41921 + }
41922 + return a;
41923 +}
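+/* Minimal usage sketch (hypothetical caller, not from this patch):
+ *
+ *	__u64 h = hash_r5((const unsigned char *)"lost+found", 10);
+ *
+ * Each octet is added shifted left and right by four bits, and the
+ * accumulator is multiplied by 11. Note that the loop keys off the NUL
+ * terminator rather than the len argument (hence UNUSED_ARG), so a
+ * NUL-terminated name is assumed.
+ */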
41924 +
41925 +/* Keyed 32-bit hash function using TEA in a Davies-Meyer construction
41926 + H0 = Key
41927 + Hi = E Mi(Hi-1) + Hi-1
41928 +
41929 + (see Applied Cryptography, 2nd edition, p448).
41930 +
41931 + Jeremy Fitzhardinge <jeremy@zip.com.au> 1998
41932 +
41933 + Jeremy has agreed to the contents of reiserfs/README. -Hans
41934 +
41935 + This code was blindly upgraded to __u64 by s/__u32/__u64/g.
41936 +*/
41937 +static __u64 hash_tea(const unsigned char *name /* name to hash */ ,
41938 + int len /* @name's length */ )
41939 +{
41940 + __u64 k[] = { 0x9464a485u, 0x542e1a94u, 0x3e846bffu, 0xb75bcfc3u };
41941 +
41942 + __u64 h0 = k[0], h1 = k[1];
41943 + __u64 a, b, c, d;
41944 + __u64 pad;
41945 + int i;
41946 +
41947 + assert("nikita-676", name != NULL);
41948 + assert("nikita-677", len >= 0);
41949 +
41950 +#define DELTA 0x9E3779B9u
41951 +#define FULLROUNDS 10 /* 32 is overkill, 16 is strong crypto */
41952 +#define PARTROUNDS 6 /* 6 gets complete mixing */
41953 +
41954 +/* a, b, c, d - data; h0, h1 - accumulated hash */
41955 +#define TEACORE(rounds) \
41956 + do { \
41957 + __u64 sum = 0; \
41958 + int n = rounds; \
41959 + __u64 b0, b1; \
41960 + \
41961 + b0 = h0; \
41962 + b1 = h1; \
41963 + \
41964 + do \
41965 + { \
41966 + sum += DELTA; \
41967 + b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b); \
41968 + b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d); \
41969 + } while(--n); \
41970 + \
41971 + h0 += b0; \
41972 + h1 += b1; \
41973 + } while(0)
41974 +
41975 + pad = (__u64) len | ((__u64) len << 8);
41976 + pad |= pad << 16;
41977 +
41978 + while (len >= 16) {
41979 + a = (__u64) name[0] | (__u64) name[1] << 8 | (__u64) name[2] <<
41980 + 16 | (__u64) name[3] << 24;
41981 + b = (__u64) name[4] | (__u64) name[5] << 8 | (__u64) name[6] <<
41982 + 16 | (__u64) name[7] << 24;
41983 + c = (__u64) name[8] | (__u64) name[9] << 8 | (__u64) name[10] <<
41984 + 16 | (__u64) name[11] << 24;
41985 + d = (__u64) name[12] | (__u64) name[13] << 8 | (__u64) name[14]
41986 + << 16 | (__u64) name[15] << 24;
41987 +
41988 + TEACORE(PARTROUNDS);
41989 +
41990 + len -= 16;
41991 + name += 16;
41992 + }
41993 +
41994 + if (len >= 12) {
41995 + //assert(len < 16);
41996 + if (len >= 16)
41997 + *(int *)0 = 0;
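+		/* annotation: the NULL-pointer store above is a deliberate
+		 * stand-in for the commented-out assert: it forces an
+		 * immediate oops if the length bookkeeping is ever wrong */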
41998 +
41999 + a = (__u64) name[0] | (__u64) name[1] << 8 | (__u64) name[2] <<
42000 + 16 | (__u64) name[3] << 24;
42001 + b = (__u64) name[4] | (__u64) name[5] << 8 | (__u64) name[6] <<
42002 + 16 | (__u64) name[7] << 24;
42003 + c = (__u64) name[8] | (__u64) name[9] << 8 | (__u64) name[10] <<
42004 + 16 | (__u64) name[11] << 24;
42005 +
42006 + d = pad;
42007 + for (i = 12; i < len; i++) {
42008 + d <<= 8;
42009 + d |= name[i];
42010 + }
42011 + } else if (len >= 8) {
42012 + //assert(len < 12);
42013 + if (len >= 12)
42014 + *(int *)0 = 0;
42015 + a = (__u64) name[0] | (__u64) name[1] << 8 | (__u64) name[2] <<
42016 + 16 | (__u64) name[3] << 24;
42017 + b = (__u64) name[4] | (__u64) name[5] << 8 | (__u64) name[6] <<
42018 + 16 | (__u64) name[7] << 24;
42019 +
42020 + c = d = pad;
42021 + for (i = 8; i < len; i++) {
42022 + c <<= 8;
42023 + c |= name[i];
42024 + }
42025 + } else if (len >= 4) {
42026 + //assert(len < 8);
42027 + if (len >= 8)
42028 + *(int *)0 = 0;
42029 + a = (__u64) name[0] | (__u64) name[1] << 8 | (__u64) name[2] <<
42030 + 16 | (__u64) name[3] << 24;
42031 +
42032 + b = c = d = pad;
42033 + for (i = 4; i < len; i++) {
42034 + b <<= 8;
42035 + b |= name[i];
42036 + }
42037 + } else {
42038 + //assert(len < 4);
42039 + if (len >= 4)
42040 + *(int *)0 = 0;
42041 + a = b = c = d = pad;
42042 + for (i = 0; i < len; i++) {
42043 + a <<= 8;
42044 + a |= name[i];
42045 + }
42046 + }
42047 +
42048 + TEACORE(FULLROUNDS);
42049 +
42050 +/* return 0;*/
42051 + return h0 ^ h1;
42052 +
42053 +}
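+/* Annotation: unlike hash_r5() and hash_fnv1() below, hash_tea() honours
+ * the len argument: full 16-byte blocks are mixed in with PARTROUNDS, the
+ * remaining tail is packed little-endian into a..d together with the
+ * length-derived pad word, and a final FULLROUNDS pass yields h0 ^ h1.
+ */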
42054 +
42055 +/* classical 64 bit Fowler/Noll/Vo-1 (FNV-1) hash.
42056 +
42057 + See http://www.isthe.com/chongo/tech/comp/fnv/ for details.
42058 +
42059 + Excerpts:
42060 +
42061 + FNV hashes are designed to be fast while maintaining a low collision
42062 + rate.
42063 +
42064 + [This version also seems to preserve lexicographical order locally.]
42065 +
42066 + FNV hash algorithms and source code have been released into the public
42067 + domain.
42068 +
42069 +*/
42070 +static __u64 hash_fnv1(const unsigned char *name /* name to hash */ ,
42071 + int len UNUSED_ARG /* @name's length */ )
42072 +{
42073 + unsigned long long a = 0xcbf29ce484222325ull;
42074 + const unsigned long long fnv_64_prime = 0x100000001b3ull;
42075 +
42076 + assert("nikita-678", name != NULL);
42077 + assert("nikita-679", len >= 0);
42078 +
42079 + /* FNV-1 hash each octet in the buffer */
42080 + for (; *name; ++name) {
42081 + /* multiply by the 32 bit FNV magic prime mod 2^64 */
42082 + a *= fnv_64_prime;
42083 + /* xor the bottom with the current octet */
42084 + a ^= (unsigned long long)(*name);
42085 + }
42086 + /* return our new hash value */
42087 + return a;
42088 +}
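+/* Annotation: the constants above are the standard 64-bit FNV-1 offset
+ * basis (0xcbf29ce484222325) and prime (0x100000001b3); each octet is
+ * folded in as a = (a * prime) ^ octet. Like hash_r5(), the loop stops at
+ * the NUL terminator, so len goes unused here as well.
+ */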
42089 +
42090 +/* degenerate hash function used to simplify testing of non-unique key
42091 + handling */
42092 +static __u64 hash_deg(const unsigned char *name UNUSED_ARG /* name to hash */ ,
42093 + int len UNUSED_ARG /* @name's length */ )
42094 +{
42095 + return 0xc0c0c0c010101010ull;
42096 +}
42097 +
42098 +static int change_hash(struct inode *inode, reiser4_plugin * plugin)
42099 +{
42100 + int result;
42101 +
42102 + assert("nikita-3503", inode != NULL);
42103 + assert("nikita-3504", plugin != NULL);
42104 +
42105 + assert("nikita-3505", is_reiser4_inode(inode));
42106 + assert("nikita-3506", inode_dir_plugin(inode) != NULL);
42107 + assert("nikita-3507", plugin->h.type_id == REISER4_HASH_PLUGIN_TYPE);
42108 +
42109 + result = 0;
42110 + if (inode_hash_plugin(inode) == NULL ||
42111 + inode_hash_plugin(inode)->h.id != plugin->h.id) {
42112 + if (is_dir_empty(inode) == 0)
42113 + result =
42114 + plugin_set_hash(&reiser4_inode_data(inode)->pset,
42115 + &plugin->hash);
42116 + else
42117 + result = RETERR(-ENOTEMPTY);
42118 +
42119 + }
42120 + return result;
42121 +}
42122 +
42123 +static reiser4_plugin_ops hash_plugin_ops = {
42124 + .init = NULL,
42125 + .load = NULL,
42126 + .save_len = NULL,
42127 + .save = NULL,
42128 + .change = change_hash
42129 +};
42130 +
42131 +/* hash plugins */
42132 +hash_plugin hash_plugins[LAST_HASH_ID] = {
42133 + [RUPASOV_HASH_ID] = {
42134 + .h = {
42135 + .type_id = REISER4_HASH_PLUGIN_TYPE,
42136 + .id = RUPASOV_HASH_ID,
42137 + .pops = &hash_plugin_ops,
42138 + .label = "rupasov",
42139 + .desc = "Original Yura's hash",
42140 + .linkage = {NULL, NULL}
42141 + },
42142 + .hash = hash_rupasov
42143 + },
42144 + [R5_HASH_ID] = {
42145 + .h = {
42146 + .type_id = REISER4_HASH_PLUGIN_TYPE,
42147 + .id = R5_HASH_ID,
42148 + .pops = &hash_plugin_ops,
42149 + .label = "r5",
42150 + .desc = "r5 hash",
42151 + .linkage = {NULL, NULL}
42152 + },
42153 + .hash = hash_r5
42154 + },
42155 + [TEA_HASH_ID] = {
42156 + .h = {
42157 + .type_id = REISER4_HASH_PLUGIN_TYPE,
42158 + .id = TEA_HASH_ID,
42159 + .pops = &hash_plugin_ops,
42160 + .label = "tea",
42161 + .desc = "tea hash",
42162 + .linkage = {NULL, NULL}
42163 + },
42164 + .hash = hash_tea
42165 + },
42166 + [FNV1_HASH_ID] = {
42167 + .h = {
42168 + .type_id = REISER4_HASH_PLUGIN_TYPE,
42169 + .id = FNV1_HASH_ID,
42170 + .pops = &hash_plugin_ops,
42171 + .label = "fnv1",
42172 + .desc = "fnv1 hash",
42173 + .linkage = {NULL, NULL}
42174 + },
42175 + .hash = hash_fnv1
42176 + },
42177 + [DEGENERATE_HASH_ID] = {
42178 + .h = {
42179 + .type_id = REISER4_HASH_PLUGIN_TYPE,
42180 + .id = DEGENERATE_HASH_ID,
42181 + .pops = &hash_plugin_ops,
42182 + .label = "degenerate hash",
42183 + .desc = "Degenerate hash: only for testing",
42184 + .linkage = {NULL, NULL}
42185 + },
42186 + .hash = hash_deg
42187 + }
42188 +};
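+/* Usage sketch (hypothetical, identifiers per this file): a directory
+ * plugin would pick a hash by id and apply it to a name, e.g.
+ *
+ *	hash_plugin *hp = &hash_plugins[R5_HASH_ID];
+ *	__u64 h = hp->hash(name, len);
+ *
+ * where name/len describe the directory entry being resolved.
+ */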
42189 +
42190 +/* Make Linus happy.
42191 + Local variables:
42192 + c-indentation-style: "K&R"
42193 + mode-name: "LC"
42194 + c-basic-offset: 8
42195 + tab-width: 8
42196 + fill-column: 120
42197 + End:
42198 +*/
42199 Index: linux-2.6.16/fs/reiser4/plugin/inode_ops.c
42200 ===================================================================
42201 --- /dev/null
42202 +++ linux-2.6.16/fs/reiser4/plugin/inode_ops.c
42203 @@ -0,0 +1,886 @@
42204 +/*
42205 + * Copyright 2005 by Hans Reiser, licensing governed by reiser4/README
42206 + */
42207 +
42208 +/*
42209 + * this file contains typical implementations for most methods of struct
42210 + * inode_operations
42211 + */
42212 +
42213 +#include "../inode.h"
42214 +#include "../safe_link.h"
42215 +
42216 +#include <linux/quotaops.h>
42217 +#include <linux/namei.h>
42218 +
42219 +
42220 +static int create_vfs_object(struct inode *parent, struct dentry *dentry,
42221 + reiser4_object_create_data *data);
42222 +
42223 +/**
42224 + * create_common - create of inode operations
42225 + * @parent: inode of parent directory
42226 + * @dentry: dentry of new object to create
42227 + * @mode: the permissions to use
42228 + * @nameidata:
42229 + *
42230 + * This is the common implementation of vfs's create method of struct
42231 + * inode_operations.
42232 + * Creates regular file using file plugin from parent directory plugin set.
42233 + */
42234 +int create_common(struct inode *parent, struct dentry *dentry,
42235 + int mode, struct nameidata *nameidata)
42236 +{
42237 + reiser4_object_create_data data;
42238 +
42239 + memset(&data, 0, sizeof data);
42240 + data.mode = S_IFREG | mode;
42241 + data.id = inode_regular_plugin(parent)->id;
42242 + return create_vfs_object(parent, dentry, &data);
42243 +}
42244 +
42245 +int lookup_name(struct inode *dir, struct dentry *, reiser4_key *);
42246 +void check_light_weight(struct inode *inode, struct inode *parent);
42247 +
42248 +/**
42249 + * lookup_common - lookup of inode operations
42250 + * @parent: inode of directory to lookup into
42251 + * @dentry: name to look for
42252 + * @nameidata:
42253 + *
42254 + * This is the common implementation of vfs's lookup method of struct
42255 + * inode_operations.
42256 + */
42257 +struct dentry *lookup_common(struct inode *parent, struct dentry *dentry,
42258 + struct nameidata *nameidata)
42259 +{
42260 + reiser4_context *ctx;
42261 + int result;
42262 + struct dentry *new;
42263 + struct inode *inode;
42264 + reiser4_dir_entry_desc entry;
42265 +
42266 + ctx = init_context(parent->i_sb);
42267 + if (IS_ERR(ctx))
42268 + return (struct dentry *)ctx;
42269 +
42270 + /* set up operations on dentry. */
42271 + dentry->d_op = &get_super_private(parent->i_sb)->ops.dentry;
42272 +
42273 + result = lookup_name(parent, dentry, &entry.key);
42274 + if (result) {
42275 + context_set_commit_async(ctx);
42276 + reiser4_exit_context(ctx);
42277 + if (result == -ENOENT) {
42278 + /* object not found */
42279 + if (!IS_DEADDIR(parent))
42280 + d_add(dentry, NULL);
42281 + return NULL;
42282 + }
42283 + return ERR_PTR(result);
42284 + }
42285 +
42286 + inode = reiser4_iget(parent->i_sb, &entry.key, 0);
42287 + if (IS_ERR(inode)) {
42288 + context_set_commit_async(ctx);
42289 + reiser4_exit_context(ctx);
42290 + return ERR_PTR(PTR_ERR(inode));
42291 + }
42292 +
42293 + /* success */
42294 + check_light_weight(inode, parent);
42295 + new = d_splice_alias(inode, dentry);
42296 + reiser4_iget_complete(inode);
42297 +
42298 + /* prevent balance_dirty_pages() from being called: we don't want to
42299 + * do this under directory i_mutex. */
42300 + context_set_commit_async(ctx);
42301 + reiser4_exit_context(ctx);
42302 + return new;
42303 +}
42304 +
42305 +static reiser4_block_nr common_estimate_link(struct inode *parent,
42306 + struct inode *object);
42307 +int reiser4_update_dir(struct inode *);
42308 +
42309 +/**
42310 + * link_common - link of inode operations
42311 + * @existing: dentry of object which is to get new name
42312 + * @parent: directory where new name is to be created
42313 + * @newname: new name
42314 + *
42315 + * This is the common implementation of vfs's link method of struct
42316 + * inode_operations.
42317 + */
42318 +int link_common(struct dentry *existing, struct inode *parent,
42319 + struct dentry *newname)
42320 +{
42321 + reiser4_context *ctx;
42322 + int result;
42323 + struct inode *object;
42324 + dir_plugin *parent_dplug;
42325 + reiser4_dir_entry_desc entry;
42326 + reiser4_object_create_data data;
42327 + reiser4_block_nr reserve;
42328 +
42329 + ctx = init_context(parent->i_sb);
42330 + if (IS_ERR(ctx))
42331 + return PTR_ERR(ctx);
42332 +
42333 + assert("nikita-1431", existing != NULL);
42334 + assert("nikita-1432", parent != NULL);
42335 + assert("nikita-1433", newname != NULL);
42336 +
42337 + object = existing->d_inode;
42338 + assert("nikita-1434", object != NULL);
42339 +
42340 + /* check for race with create_object() */
42341 + if (inode_get_flag(object, REISER4_IMMUTABLE)) {
42342 + context_set_commit_async(ctx);
42343 + reiser4_exit_context(ctx);
42344 + return RETERR(-E_REPEAT);
42345 + }
42346 +
42347 + parent_dplug = inode_dir_plugin(parent);
42348 +
42349 + memset(&entry, 0, sizeof entry);
42350 + entry.obj = object;
42351 +
42352 + data.mode = object->i_mode;
42353 + data.id = inode_file_plugin(object)->h.id;
42354 +
42355 + reserve = common_estimate_link(parent, existing->d_inode);
42356 + if ((__s64) reserve < 0) {
42357 + context_set_commit_async(ctx);
42358 + reiser4_exit_context(ctx);
42359 + return reserve;
42360 + }
42361 +
42362 + if (reiser4_grab_space(reserve, BA_CAN_COMMIT)) {
42363 + context_set_commit_async(ctx);
42364 + reiser4_exit_context(ctx);
42365 + return RETERR(-ENOSPC);
42366 + }
42367 +
42368 + /*
42369 + * Subtle race handling: sys_link() doesn't take i_mutex on @parent. It
42370 + * means that link(2) can race against unlink(2) or rename(2), and
42371 + * inode is dead (->i_nlink == 0) when reiser4_link() is entered.
42372 + *
42373 + * For such inode we have to undo special processing done in
42374 + * reiser4_unlink() viz. creation of safe-link.
42375 + */
42376 + if (unlikely(object->i_nlink == 0)) {
42377 + result = safe_link_del(tree_by_inode(object),
42378 + get_inode_oid(object), SAFE_UNLINK);
42379 + if (result != 0) {
42380 + context_set_commit_async(ctx);
42381 + reiser4_exit_context(ctx);
42382 + return result;
42383 + }
42384 + }
42385 +
42386 + /* increment nlink of @existing and update its stat data */
42387 + result = reiser4_add_nlink(object, parent, 1);
42388 + if (result == 0) {
42389 + /* add entry to the parent */
42390 + result =
42391 + parent_dplug->add_entry(parent, newname, &data, &entry);
42392 + if (result != 0) {
42393 + /* failed to add entry to the parent, decrement nlink
42394 + of @existing */
42395 + reiser4_del_nlink(object, parent, 1);
42396 + /*
42397 +			 * now, if that failed, we have a file whose nlink is
42398 +			 * too big: a space leak, much better than a directory
42399 +			 * entry pointing to nowhere
42400 + */
42401 + }
42402 + }
42403 + if (result == 0) {
42404 + atomic_inc(&object->i_count);
42405 + /*
42406 + * Upon successful completion, link() shall mark for update
42407 + * the st_ctime field of the file. Also, the st_ctime and
42408 + * st_mtime fields of the directory that contains the new
42409 + * entry shall be marked for update. --SUS
42410 + */
42411 + result = reiser4_update_dir(parent);
42412 + }
42413 + if (result == 0)
42414 + d_instantiate(newname, existing->d_inode);
42415 +
42416 + context_set_commit_async(ctx);
42417 + reiser4_exit_context(ctx);
42418 + return result;
42419 +}
42420 +
42421 +static int unlink_check_and_grab(struct inode *parent, struct dentry *victim);
42422 +
42423 +/**
42424 + * unlink_common - unlink of inode operations
42425 + * @parent: inode of directory to remove name from
42426 + * @victim: name to be removed
42427 + *
42428 + * This is the common implementation of vfs's unlink method of struct
42429 + * inode_operations.
42430 + */
42431 +int unlink_common(struct inode *parent, struct dentry *victim)
42432 +{
42433 + reiser4_context *ctx;
42434 + int result;
42435 + struct inode *object;
42436 + file_plugin *fplug;
42437 +
42438 + ctx = init_context(parent->i_sb);
42439 + if (IS_ERR(ctx))
42440 + return PTR_ERR(ctx);
42441 +
42442 + object = victim->d_inode;
42443 + fplug = inode_file_plugin(object);
42444 + assert("nikita-2882", fplug->detach != NULL);
42445 +
42446 + result = unlink_check_and_grab(parent, victim);
42447 + if (result != 0) {
42448 + context_set_commit_async(ctx);
42449 + reiser4_exit_context(ctx);
42450 + return result;
42451 + }
42452 +
42453 + result = fplug->detach(object, parent);
42454 + if (result == 0) {
42455 + dir_plugin *parent_dplug;
42456 + reiser4_dir_entry_desc entry;
42457 +
42458 + parent_dplug = inode_dir_plugin(parent);
42459 + memset(&entry, 0, sizeof entry);
42460 +
42461 + /* first, delete directory entry */
42462 + result = parent_dplug->rem_entry(parent, victim, &entry);
42463 + if (result == 0) {
42464 + /*
42465 + * if name was removed successfully, we _have_ to
42466 + * return 0 from this function, because upper level
42467 + * caller (vfs_{rmdir,unlink}) expect this.
42468 + *
42469 + * now that directory entry is removed, update
42470 + * stat-data
42471 + */
42472 + reiser4_del_nlink(object, parent, 1);
42473 + /*
42474 + * Upon successful completion, unlink() shall mark for
42475 + * update the st_ctime and st_mtime fields of the
42476 + * parent directory. Also, if the file's link count is
42477 + * not 0, the st_ctime field of the file shall be
42478 + * marked for update. --SUS
42479 + */
42480 + reiser4_update_dir(parent);
42481 + /* add safe-link for this file */
42482 + if (object->i_nlink == 0)
42483 + safe_link_add(object, SAFE_UNLINK);
42484 + }
42485 + }
42486 +
42487 + if (unlikely(result != 0)) {
42488 + if (result != -ENOMEM)
42489 + warning("nikita-3398", "Cannot unlink %llu (%i)",
42490 + (unsigned long long)get_inode_oid(object),
42491 + result);
42492 + /* if operation failed commit pending inode modifications to
42493 + * the stat-data */
42494 + reiser4_update_sd(object);
42495 + reiser4_update_sd(parent);
42496 + }
42497 +
42498 + reiser4_release_reserved(object->i_sb);
42499 +
42500 +	/* @object's i_ctime was updated by the ->rem_link() method. */
42501 +
42502 +	/* @victim may already have been removed from the disk by this time. The
42503 +	   inode is then marked so that iput() won't try to remove its stat data,
42504 +	   but the inode itself is still there.
42505 + */
42506 +
42507 + /*
42508 + * we cannot release directory semaphore here, because name has
42509 + * already been deleted, but dentry (@victim) still exists. Prevent
42510 + * balance_dirty_pages() from being called on exiting this context: we
42511 + * don't want to do this under directory i_mutex.
42512 + */
42513 + context_set_commit_async(ctx);
42514 + reiser4_exit_context(ctx);
42515 + return result;
42516 +}
42517 +
42518 +/**
42519 + * symlink_common - symlink of inode operations
42520 + * @parent: inode of parent directory
42521 + * @dentry: dentry of object to be created
42522 + * @linkname: string symlink is to contain
42523 + *
42524 + * This is the common implementation of vfs's symlink method of struct
42525 + * inode_operations.
42526 + * Creates object using file plugin SYMLINK_FILE_PLUGIN_ID.
42527 + */
42528 +int symlink_common(struct inode *parent, struct dentry *dentry,
42529 + const char *linkname)
42530 +{
42531 + reiser4_object_create_data data;
42532 +
42533 + memset(&data, 0, sizeof data);
42534 + data.name = linkname;
42535 + data.id = SYMLINK_FILE_PLUGIN_ID;
42536 + data.mode = S_IFLNK | S_IRWXUGO;
42537 + return create_vfs_object(parent, dentry, &data);
42538 +}
42539 +
42540 +/**
42541 + * mkdir_common - mkdir of inode operations
42542 + * @parent: inode of parent directory
42543 + * @dentry: dentry of object to be created
42544 + * @mode: the permissions to use
42545 + *
42546 + * This is the common implementation of vfs's mkdir method of struct
42547 + * inode_operations.
42548 + * Creates object using file plugin DIRECTORY_FILE_PLUGIN_ID.
42549 + */
42550 +int mkdir_common(struct inode *parent, struct dentry *dentry, int mode)
42551 +{
42552 + reiser4_object_create_data data;
42553 +
42554 + memset(&data, 0, sizeof data);
42555 + data.mode = S_IFDIR | mode;
42556 + data.id = DIRECTORY_FILE_PLUGIN_ID;
42557 + return create_vfs_object(parent, dentry, &data);
42558 +}
42559 +
42560 +/**
42561 + * mknod_common - mknod of inode operations
42562 + * @parent: inode of parent directory
42563 + * @dentry: dentry of object to be created
42564 + * @mode: the permissions to use and file type
42565 + * @rdev: minor and major of new device file
42566 + *
42567 + * This is the common implementation of vfs's mknod method of struct
42568 + * inode_operations.
42569 + * Creates object using file plugin SPECIAL_FILE_PLUGIN_ID.
42570 + */
42571 +int mknod_common(struct inode *parent, struct dentry *dentry,
42572 + int mode, dev_t rdev)
42573 +{
42574 + reiser4_object_create_data data;
42575 +
42576 + memset(&data, 0, sizeof data);
42577 + data.mode = mode;
42578 + data.rdev = rdev;
42579 + data.id = SPECIAL_FILE_PLUGIN_ID;
42580 + return create_vfs_object(parent, dentry, &data);
42581 +}
42582 +
42583 +/*
42584 + * implementation of vfs's rename method of struct inode_operations for typical
42585 + * directory is in inode_ops_rename.c
42586 + */
42587 +
42588 +/**
42589 + * follow_link_common - follow_link of inode operations
42590 + * @dentry: dentry of symlink
42591 + * @nd:
42592 + *
42593 + * This is the common implementation of vfs's follow_link method of struct
42594 + * inode_operations.
42595 + * Assumes that inode's generic_ip points to the content of symbolic link.
42596 + */
42597 +void *follow_link_common(struct dentry *dentry, struct nameidata *nd)
42598 +{
42599 + assert("vs-851", S_ISLNK(dentry->d_inode->i_mode));
42600 +
42601 + if (!dentry->d_inode->u.generic_ip
42602 + || !inode_get_flag(dentry->d_inode, REISER4_GENERIC_PTR_USED))
42603 + return ERR_PTR(RETERR(-EINVAL));
42604 + nd_set_link(nd, dentry->d_inode->u.generic_ip);
42605 + return NULL;
42606 +}
42607 +
42608 +/**
42609 + * permission_common - permission of inode operations
42610 + * @inode: inode to check permissions for
42611 + * @mask: mode bits to check permissions for
42612 + * @nameidata:
42613 + *
42614 + * Uses generic function to check for rwx permissions.
42615 + */
42616 +int permission_common(struct inode *inode, int mask,
42617 + struct nameidata *nameidata)
42618 +{
42619 + return generic_permission(inode, mask, NULL);
42620 +}
42621 +
42622 +static int setattr_reserve(reiser4_tree *);
42623 +
42624 +/* this is the common implementation of vfs's setattr method of struct
42625 + inode_operations
42626 +*/
42627 +int setattr_common(struct dentry *dentry, struct iattr *attr)
42628 +{
42629 + reiser4_context *ctx;
42630 + struct inode *inode;
42631 + int result;
42632 +
42633 + inode = dentry->d_inode;
42634 + result = inode_change_ok(inode, attr);
42635 + if (result)
42636 + return result;
42637 +
42638 + ctx = init_context(inode->i_sb);
42639 + if (IS_ERR(ctx))
42640 + return PTR_ERR(ctx);
42641 +
42642 + assert("nikita-3119", !(attr->ia_valid & ATTR_SIZE));
42643 +
42644 + /*
42645 + * grab disk space and call standard inode_setattr().
42646 + */
42647 + result = setattr_reserve(tree_by_inode(inode));
42648 + if (!result) {
42649 + if ((attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid)
42650 + || (attr->ia_valid & ATTR_GID
42651 + && attr->ia_gid != inode->i_gid)) {
42652 + result = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0;
42653 + if (result) {
42654 + context_set_commit_async(ctx);
42655 + reiser4_exit_context(ctx);
42656 + return result;
42657 + }
42658 + }
42659 + result = inode_setattr(inode, attr);
42660 + if (!result)
42661 + reiser4_update_sd(inode);
42662 + }
42663 +
42664 + context_set_commit_async(ctx);
42665 + reiser4_exit_context(ctx);
42666 + return result;
42667 +}
42668 +
42669 +/* this is the common implementation of vfs's getattr method of struct
42670 + inode_operations
42671 +*/
42672 +int
42673 +getattr_common(struct vfsmount *mnt UNUSED_ARG, struct dentry *dentry,
42674 + struct kstat *stat)
42675 +{
42676 + struct inode *obj;
42677 +
42678 + assert("nikita-2298", dentry != NULL);
42679 + assert("nikita-2299", stat != NULL);
42680 + assert("nikita-2300", dentry->d_inode != NULL);
42681 +
42682 + obj = dentry->d_inode;
42683 +
42684 + stat->dev = obj->i_sb->s_dev;
42685 + stat->ino = oid_to_uino(get_inode_oid(obj));
42686 + stat->mode = obj->i_mode;
42687 +	/* don't confuse userland with a huge nlink. This is not entirely
42688 +	 * correct, because nlink_t is not necessarily 16 bit signed. */
42689 + stat->nlink = min(obj->i_nlink, (typeof(obj->i_nlink)) 0x7fff);
42690 + stat->uid = obj->i_uid;
42691 + stat->gid = obj->i_gid;
42692 + stat->rdev = obj->i_rdev;
42693 + stat->atime = obj->i_atime;
42694 + stat->mtime = obj->i_mtime;
42695 + stat->ctime = obj->i_ctime;
42696 + stat->size = obj->i_size;
42697 + stat->blocks =
42698 + (inode_get_bytes(obj) + VFS_BLKSIZE - 1) >> VFS_BLKSIZE_BITS;
42699 + /* "preferred" blocksize for efficient file system I/O */
42700 + stat->blksize = get_super_private(obj->i_sb)->optimal_io_size;
42701 +
42702 + return 0;
42703 +}
42704 +
42705 +/* Estimate the maximum number of nodes which might be allocated or changed on
42706 +   typical new object creation. Typical creation consists of calling the create
42707 +   method of the file plugin, adding a directory entry to the parent and
42708 +   updating the parent directory's stat data.
42709 +*/
42710 +static reiser4_block_nr estimate_create_vfs_object(struct inode *parent, /* parent object */
42711 + struct inode *object
42712 + /* object */ )
42713 +{
42714 + assert("vpf-309", parent != NULL);
42715 + assert("vpf-307", object != NULL);
42716 +
42717 + return
42718 + /* object creation estimation */
42719 + inode_file_plugin(object)->estimate.create(object) +
42720 + /* stat data of parent directory estimation */
42721 + inode_file_plugin(parent)->estimate.update(parent) +
42722 + /* adding entry estimation */
42723 + inode_dir_plugin(parent)->estimate.add_entry(parent) +
42724 + /* to undo in the case of failure */
42725 + inode_dir_plugin(parent)->estimate.rem_entry(parent);
42726 +}
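+/* Worked example (illustrative, hypothetical costs): if each estimate term
+ * above costs one tree insertion, the reservation is four insertions'
+ * worth of blocks. The rem_entry() term is reserved up front so that a
+ * create which fails after add_entry() can be undone without -ENOSPC.
+ */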
42727 +
42728 +/* Create child in directory.
42729 +
42730 + . get object's plugin
42731 + . get fresh inode
42732 + . initialize inode
42733 + . add object's stat-data
42734 + . initialize object's directory
42735 + . add entry to the parent
42736 + . instantiate dentry
42737 +
42738 +*/
42739 +static int do_create_vfs_child(reiser4_object_create_data * data, /* parameters of new
42740 + object */
42741 + struct inode **retobj)
42742 +{
42743 + int result;
42744 +
42745 +	struct dentry *dentry;	/* new name */
42746 +	struct inode *parent;	/* parent object */
42747 +
42748 + dir_plugin *par_dir; /* directory plugin on the parent */
42749 + dir_plugin *obj_dir; /* directory plugin on the new object */
42750 + file_plugin *obj_plug; /* object plugin on the new object */
42751 + struct inode *object; /* new object */
42752 + reiser4_block_nr reserve;
42753 +
42754 + reiser4_dir_entry_desc entry; /* new directory entry */
42755 +
42756 + assert("nikita-1420", data != NULL);
42757 + parent = data->parent;
42758 + dentry = data->dentry;
42759 +
42760 + assert("nikita-1418", parent != NULL);
42761 + assert("nikita-1419", dentry != NULL);
42762 +
42763 +	/* check that the name is acceptable for the parent */
42764 + par_dir = inode_dir_plugin(parent);
42765 + if (par_dir->is_name_acceptable &&
42766 + !par_dir->is_name_acceptable(parent,
42767 + dentry->d_name.name,
42768 + (int)dentry->d_name.len))
42769 + return RETERR(-ENAMETOOLONG);
42770 +
42771 + result = 0;
42772 + obj_plug = file_plugin_by_id((int)data->id);
42773 + if (obj_plug == NULL) {
42774 + warning("nikita-430", "Cannot find plugin %i", data->id);
42775 + return RETERR(-ENOENT);
42776 + }
42777 + object = new_inode(parent->i_sb);
42778 + if (object == NULL)
42779 + return RETERR(-ENOMEM);
42780 + /* we'll update i_nlink below */
42781 + object->i_nlink = 0;
42782 + /* new_inode() initializes i_ino to "arbitrary" value. Reset it to 0,
42783 + * to simplify error handling: if some error occurs before i_ino is
42784 + * initialized with oid, i_ino should already be set to some
42785 + * distinguished value. */
42786 + object->i_ino = 0;
42787 +
42788 + /* So that on error iput will be called. */
42789 + *retobj = object;
42790 +
42791 + if (DQUOT_ALLOC_INODE(object)) {
42792 + DQUOT_DROP(object);
42793 + object->i_flags |= S_NOQUOTA;
42794 + return RETERR(-EDQUOT);
42795 + }
42796 +
42797 + memset(&entry, 0, sizeof entry);
42798 + entry.obj = object;
42799 +
42800 + plugin_set_file(&reiser4_inode_data(object)->pset, obj_plug);
42801 + result = obj_plug->set_plug_in_inode(object, parent, data);
42802 + if (result) {
42803 + warning("nikita-431", "Cannot install plugin %i on %llx",
42804 + data->id, (unsigned long long)get_inode_oid(object));
42805 + DQUOT_FREE_INODE(object);
42806 + object->i_flags |= S_NOQUOTA;
42807 + return result;
42808 + }
42809 +
42810 + /* reget plugin after installation */
42811 + obj_plug = inode_file_plugin(object);
42812 +
42813 + if (obj_plug->create_object == NULL) {
42814 + DQUOT_FREE_INODE(object);
42815 + object->i_flags |= S_NOQUOTA;
42816 + return RETERR(-EPERM);
42817 + }
42818 +
42819 +	/* if any of the hash, tail, sd or permission plugins for the newly
42820 +	   created object are not set yet, set them here, inheriting them from
42821 +	   the parent directory
42822 + */
42823 + assert("nikita-2070", obj_plug->adjust_to_parent != NULL);
42824 + result = obj_plug->adjust_to_parent(object,
42825 + parent,
42826 + object->i_sb->s_root->d_inode);
42827 + if (result != 0) {
42828 + warning("nikita-432", "Cannot inherit from %llx to %llx",
42829 + (unsigned long long)get_inode_oid(parent),
42830 + (unsigned long long)get_inode_oid(object));
42831 + DQUOT_FREE_INODE(object);
42832 + object->i_flags |= S_NOQUOTA;
42833 + return result;
42834 + }
42835 +
42836 + /* setup inode and file-operations for this inode */
42837 + setup_inode_ops(object, data);
42838 +
42839 + /* call file plugin's method to initialize plugin specific part of
42840 + * inode */
42841 + if (obj_plug->init_inode_data)
42842 + obj_plug->init_inode_data(object, data, 1 /*create */ );
42843 +
42844 + /* obtain directory plugin (if any) for new object. */
42845 + obj_dir = inode_dir_plugin(object);
42846 + if (obj_dir != NULL && obj_dir->init == NULL) {
42847 + DQUOT_FREE_INODE(object);
42848 + object->i_flags |= S_NOQUOTA;
42849 + return RETERR(-EPERM);
42850 + }
42851 +
42852 + reiser4_inode_data(object)->locality_id = get_inode_oid(parent);
42853 +
42854 + reserve = estimate_create_vfs_object(parent, object);
42855 + if (reiser4_grab_space(reserve, BA_CAN_COMMIT)) {
42856 + DQUOT_FREE_INODE(object);
42857 + object->i_flags |= S_NOQUOTA;
42858 + return RETERR(-ENOSPC);
42859 + }
42860 +
42861 +	/* mark inode `immutable'. We disable changes to the file being
42862 +	   created until a valid directory entry for it is inserted.
42863 +	   Otherwise, if the file were expanded and insertion of the directory
42864 +	   entry failed, we would have to remove the file, but we only allotted
42865 +	   enough space in the transaction to remove an _empty_ file. 3.x code
42866 +	   used to remove stat data in a different transaction, thus possibly
42867 +	   leaking disk space on crash. This all only matters if it's possible
42868 +	   to access a file without a name, for example, by inode number
42869 +	 */
42870 + inode_set_flag(object, REISER4_IMMUTABLE);
42871 +
42872 +	/* create an empty object; this includes allocation of a new objectid.
42873 +	   For directories this implies creation of dot and dotdot */
42874 + assert("nikita-2265", inode_get_flag(object, REISER4_NO_SD));
42875 +
42876 + /* mark inode as `loaded'. From this point onward
42877 + reiser4_delete_inode() will try to remove its stat-data. */
42878 + inode_set_flag(object, REISER4_LOADED);
42879 +
42880 + result = obj_plug->create_object(object, parent, data);
42881 + if (result != 0) {
42882 + inode_clr_flag(object, REISER4_IMMUTABLE);
42883 + if (result != -ENAMETOOLONG && result != -ENOMEM)
42884 + warning("nikita-2219",
42885 + "Failed to create sd for %llu",
42886 + (unsigned long long)get_inode_oid(object));
42887 + DQUOT_FREE_INODE(object);
42888 + object->i_flags |= S_NOQUOTA;
42889 + return result;
42890 + }
42891 +
42892 + if (obj_dir != NULL)
42893 + result = obj_dir->init(object, parent, data);
42894 + if (result == 0) {
42895 + assert("nikita-434", !inode_get_flag(object, REISER4_NO_SD));
42896 + /* insert inode into VFS hash table */
42897 + insert_inode_hash(object);
42898 + /* create entry */
42899 + result = par_dir->add_entry(parent, dentry, data, &entry);
42900 + if (result == 0) {
42901 + result = reiser4_add_nlink(object, parent, 0);
42902 + /* If O_CREAT is set and the file did not previously
42903 + exist, upon successful completion, open() shall
42904 + mark for update the st_atime, st_ctime, and
42905 + st_mtime fields of the file and the st_ctime and
42906 + st_mtime fields of the parent directory. --SUS
42907 + */
42908 + /* @object times are already updated by
42909 + reiser4_add_nlink() */
42910 + if (result == 0)
42911 + reiser4_update_dir(parent);
42912 + if (result != 0)
42913 + /* cleanup failure to add nlink */
42914 + par_dir->rem_entry(parent, dentry, &entry);
42915 + }
42916 + if (result != 0)
42917 + /* cleanup failure to add entry */
42918 + obj_plug->detach(object, parent);
42919 + } else if (result != -ENOMEM)
42920 + warning("nikita-2219", "Failed to initialize dir for %llu: %i",
42921 + (unsigned long long)get_inode_oid(object), result);
42922 +
42923 + /*
42924 + * update stat-data, committing all pending modifications to the inode
42925 + * fields.
42926 + */
42927 + reiser4_update_sd(object);
42928 + if (result != 0) {
42929 + DQUOT_FREE_INODE(object);
42930 + object->i_flags |= S_NOQUOTA;
42931 + /* if everything was ok (result == 0), parent stat-data is
42932 + * already updated above (update_parent_dir()) */
42933 + reiser4_update_sd(parent);
42934 + /* failure to create entry, remove object */
42935 + obj_plug->delete_object(object);
42936 + }
42937 +
42938 + /* file has name now, clear immutable flag */
42939 + inode_clr_flag(object, REISER4_IMMUTABLE);
42940 +
42941 + /* on error, iput() will call ->delete_inode(). We should keep track
42942 +	   of the existence of stat-data for this inode and avoid attempting to
42943 +	   remove it in reiser4_delete_inode(). This is accomplished through the
42944 +	   REISER4_NO_SD bit in inode.u.reiser4_i.plugin.flags
42945 + */
42946 + return result;
42947 +}
42948 +
42949 +/* this is a helper for the common implementations of reiser4_mkdir,
42950 +   reiser4_create, reiser4_mknod and reiser4_symlink
42951 +*/
42952 +static int
42953 +create_vfs_object(struct inode *parent,
42954 + struct dentry *dentry, reiser4_object_create_data * data)
42955 +{
42956 + reiser4_context *ctx;
42957 + int result;
42958 + struct inode *child;
42959 +
42960 + ctx = init_context(parent->i_sb);
42961 + if (IS_ERR(ctx))
42962 + return PTR_ERR(ctx);
42963 + context_set_commit_async(ctx);
42964 +
42965 + data->parent = parent;
42966 + data->dentry = dentry;
42967 + child = NULL;
42968 + result = do_create_vfs_child(data, &child);
42969 + if (unlikely(result != 0)) {
42970 + if (child != NULL) {
42971 + reiser4_make_bad_inode(child);
42972 + iput(child);
42973 + }
42974 + } else
42975 + d_instantiate(dentry, child);
42976 +
42977 + reiser4_exit_context(ctx);
42978 + return result;
42979 +}
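For orientation, callers of create_vfs_object() are expected to fill a reiser4_object_create_data describing the object to create and then delegate. A minimal sketch of such a wrapper follows; the data.mode and data.id field names and the DIRECTORY_FILE_PLUGIN_ID constant are assumptions for illustration, not guaranteed by this hunk:

	/* hypothetical ->mkdir() wrapper, sketched for illustration only */
	static int sketch_mkdir(struct inode *parent, struct dentry *dentry,
				int mode)
	{
		reiser4_object_create_data data;

		memset(&data, 0, sizeof data);
		data.mode = S_IFDIR | mode;		/* assumed field name */
		data.id = DIRECTORY_FILE_PLUGIN_ID;	/* assumed plugin id */
		/* data.parent and data.dentry are filled in by the callee */
		return create_vfs_object(parent, dentry, &data);
	}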
42980 +
42981 +/* helper for link_common. Estimate disk space necessary to add a link
42982 + from @parent to @object
42983 +*/
42984 +static reiser4_block_nr common_estimate_link(struct inode *parent, /* parent directory */
42985 + struct inode *object
42986 +					     /* object to which the new link is being created */
42987 + )
42988 +{
42989 + reiser4_block_nr res = 0;
42990 + file_plugin *fplug;
42991 + dir_plugin *dplug;
42992 +
42993 + assert("vpf-317", object != NULL);
42994 + assert("vpf-318", parent != NULL);
42995 +
42996 + fplug = inode_file_plugin(object);
42997 + dplug = inode_dir_plugin(parent);
42998 + /* VS-FIXME-HANS: why do we do fplug->estimate.update(object) twice instead of multiplying by 2? */
42999 + /* reiser4_add_nlink(object) */
43000 + res += fplug->estimate.update(object);
43001 + /* add_entry(parent) */
43002 + res += dplug->estimate.add_entry(parent);
43003 + /* reiser4_del_nlink(object) */
43004 + res += fplug->estimate.update(object);
43005 + /* update_dir(parent) */
43006 + res += inode_file_plugin(parent)->estimate.update(parent);
43007 + /* safe-link */
43008 + res += estimate_one_item_removal(tree_by_inode(object));
43009 +
43010 + return res;
43011 +}
43012 +
43013 +/* Estimate disk space necessary to remove a link between @parent and
43014 + @object.
43015 +*/
43016 +static reiser4_block_nr estimate_unlink(struct inode *parent, /* parent directory */
43017 + struct inode *object
43018 +					/* object whose link is being removed */
43019 + )
43020 +{
43021 + reiser4_block_nr res = 0;
43022 + file_plugin *fplug;
43023 + dir_plugin *dplug;
43024 +
43025 + assert("vpf-317", object != NULL);
43026 + assert("vpf-318", parent != NULL);
43027 +
43028 + fplug = inode_file_plugin(object);
43029 + dplug = inode_dir_plugin(parent);
43030 +
43031 + /* rem_entry(parent) */
43032 + res += dplug->estimate.rem_entry(parent);
43033 + /* reiser4_del_nlink(object) */
43034 + res += fplug->estimate.update(object);
43035 + /* update_dir(parent) */
43036 + res += inode_file_plugin(parent)->estimate.update(parent);
43037 + /* fplug->unlink */
43038 + res += fplug->estimate.unlink(object, parent);
43039 + /* safe-link */
43040 + res += estimate_one_insert_item(tree_by_inode(object));
43041 +
43042 + return res;
43043 +}
43044 +
43045 +/* helper for unlink_common. Estimate and grab space for unlink. */
43046 +static int unlink_check_and_grab(struct inode *parent, struct dentry *victim)
43047 +{
43048 + file_plugin *fplug;
43049 + struct inode *child;
43050 + int result;
43051 +
43052 + result = 0;
43053 + child = victim->d_inode;
43054 + fplug = inode_file_plugin(child);
43055 +
43056 + /* check for race with create_object() */
43057 + if (inode_get_flag(child, REISER4_IMMUTABLE))
43058 + return RETERR(-E_REPEAT);
43059 + /* object being deleted should have stat data */
43060 + assert("vs-949", !inode_get_flag(child, REISER4_NO_SD));
43061 +
43062 + /* ask object plugin */
43063 + if (fplug->can_rem_link != NULL && !fplug->can_rem_link(child))
43064 + return RETERR(-ENOTEMPTY);
43065 +
43066 + result = (int)estimate_unlink(parent, child);
43067 + if (result < 0)
43068 + return result;
43069 +
43070 + return reiser4_grab_reserved(child->i_sb, result, BA_CAN_COMMIT);
43071 +}
43072 +
43073 +/* helper for setattr_common */
43074 +static int setattr_reserve(reiser4_tree * tree)
43075 +{
43076 + assert("vs-1096", is_grab_enabled(get_current_context()));
43077 + return reiser4_grab_space(estimate_one_insert_into_item(tree),
43078 + BA_CAN_COMMIT);
43079 +}
43080 +
43081 +/* helper function. Standards require that for many file-system operations
43082 +   on success the ctime and mtime of the parent directory are to be updated. */
43083 +int reiser4_update_dir(struct inode *dir)
43084 +{
43085 + assert("nikita-2525", dir != NULL);
43086 +
43087 + dir->i_ctime = dir->i_mtime = CURRENT_TIME;
43088 + return reiser4_update_sd(dir);
43089 +}
43090 Index: linux-2.6.16/fs/reiser4/plugin/inode_ops_rename.c
43091 ===================================================================
43092 --- /dev/null
43093 +++ linux-2.6.16/fs/reiser4/plugin/inode_ops_rename.c
43094 @@ -0,0 +1,904 @@
43095 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
43096 + * reiser4/README */
43097 +
43098 +#include "../inode.h"
43099 +#include "../safe_link.h"
43100 +
43101 +static const char *possible_leak = "Possible disk space leak.";
43102 +
43103 +/* re-bind existing name at @from_coord in @from_dir to point to @to_inode.
43104 +
43105 + Helper function called from hashed_rename() */
43106 +static int replace_name(struct inode *to_inode, /* inode where @from_coord is
43107 + * to be re-targeted at */
43108 + struct inode *from_dir, /* directory where @from_coord
43109 + * lives */
43110 + struct inode *from_inode, /* inode @from_coord
43111 +					 * originally points to */
43112 + coord_t * from_coord, /* where directory entry is in
43113 + * the tree */
43114 + lock_handle * from_lh /* lock handle on @from_coord */ )
43115 +{
43116 + item_plugin *from_item;
43117 + int result;
43118 + znode *node;
43119 +
43120 + coord_clear_iplug(from_coord);
43121 + node = from_coord->node;
43122 + result = zload(node);
43123 + if (result != 0)
43124 + return result;
43125 + from_item = item_plugin_by_coord(from_coord);
43126 + if (item_type_by_coord(from_coord) == DIR_ENTRY_ITEM_TYPE) {
43127 + reiser4_key to_key;
43128 +
43129 + build_sd_key(to_inode, &to_key);
43130 +
43131 + /* everything is found and prepared to change directory entry
43132 + at @from_coord to point to @to_inode.
43133 +
43134 +		   @to_inode is just about to get a new name, so bump its link
43135 + counter.
43136 +
43137 + */
43138 + result = reiser4_add_nlink(to_inode, from_dir, 0);
43139 + if (result != 0) {
43140 + /* Don't issue warning: this may be plain -EMLINK */
43141 + zrelse(node);
43142 + return result;
43143 + }
43144 +
43145 + result =
43146 + from_item->s.dir.update_key(from_coord, &to_key, from_lh);
43147 + if (result != 0) {
43148 + reiser4_del_nlink(to_inode, from_dir, 0);
43149 + zrelse(node);
43150 + return result;
43151 + }
43152 +
43153 +		/* @from_inode just lost its name, he-he.
43154 +
43155 +		   If @from_inode was a directory, it contained a dotdot entry
43156 +		   pointing to @from_dir. @from_dir's i_nlink will be decreased
43157 +		   when iput() is called on @from_inode.
43158 +
43159 +		   If the file-system is not ADG (hard-links are supported on
43160 +		   directories), iput(from_inode) will not remove @from_inode,
43161 +		   and thus the above is incorrect, but hard-links on
43162 +		   directories are problematic in many other respects.
43163 +		 */
43164 + result = reiser4_del_nlink(from_inode, from_dir, 0);
43165 + if (result != 0) {
43166 + warning("nikita-2330",
43167 + "Cannot remove link from source: %i. %s",
43168 + result, possible_leak);
43169 + }
43170 + /* Has to return success, because entry is already
43171 + * modified. */
43172 + result = 0;
43173 +
43174 +		/* NOTE-NIKITA consider calling a plugin method instead of
43175 +		   accessing inode fields directly. */
43176 + from_dir->i_mtime = CURRENT_TIME;
43177 + } else {
43178 + warning("nikita-2326", "Unexpected item type");
43179 + result = RETERR(-EIO);
43180 + }
43181 + zrelse(node);
43182 + return result;
43183 +}
43184 +
43185 +/* add new entry pointing to @inode into @dir at @coord, locked by @lh
43186 +
43187 + Helper function used by hashed_rename(). */
43188 +static int add_name(struct inode *inode, /* inode where @coord is to be
43189 + * re-targeted at */
43190 + struct inode *dir, /* directory where @coord lives */
43191 + struct dentry *name, /* new name */
43192 + coord_t * coord, /* where directory entry is in the tree */
43193 + lock_handle * lh, /* lock handle on @coord */
43194 + int is_dir /* true, if @inode is directory */ )
43195 +{
43196 + int result;
43197 + reiser4_dir_entry_desc entry;
43198 +
43199 + assert("nikita-2333", lh->node == coord->node);
43200 + assert("nikita-2334", is_dir == S_ISDIR(inode->i_mode));
43201 +
43202 + memset(&entry, 0, sizeof entry);
43203 + entry.obj = inode;
43204 + /* build key of directory entry description */
43205 + inode_dir_plugin(dir)->build_entry_key(dir, &name->d_name, &entry.key);
43206 +
43207 +	/* ext2 does this in a different order: it first inserts the new
43208 +	   entry, then increases the directory nlink. We don't want to do
43209 +	   this, because reiser4_add_nlink() calls the ->add_link() plugin
43210 +	   method, which can fail for whatever reason, leaving us with
43211 +	   cleanup problems.
43212 +	 */
43213 + /* @inode is getting new name */
43214 + reiser4_add_nlink(inode, dir, 0);
43215 + /* create @new_name in @new_dir pointing to
43216 + @old_inode */
43217 + result = WITH_COORD(coord,
43218 + inode_dir_item_plugin(dir)->s.dir.add_entry(dir,
43219 + coord,
43220 + lh,
43221 + name,
43222 + &entry));
43223 + if (result != 0) {
43224 + int result2;
43225 + result2 = reiser4_del_nlink(inode, dir, 0);
43226 + if (result2 != 0) {
43227 + warning("nikita-2327",
43228 + "Cannot drop link on %lli %i. %s",
43229 + (unsigned long long)get_inode_oid(inode),
43230 + result2, possible_leak);
43231 + }
43232 + } else
43233 + INODE_INC_FIELD(dir, i_size);
43234 + return result;
43235 +}
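The ordering argument in the comment inside add_name() can be spelled out as a sketch:

	/*
	 * reiser4 ordering (what add_name() does):
	 *
	 *	reiser4_add_nlink(inode, dir, 0);   can run an ->add_link() hook
	 *	result = ...add_entry(...);         insert the directory entry
	 *	if (result != 0)
	 *		reiser4_del_nlink(inode, dir, 0);   cheap, local undo
	 *
	 * With the entry-first ordering, a failure in the later nlink update
	 * would require removing an already inserted entry, a much heavier
	 * undo path involving a second tree modification.
	 */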
43236 +
43237 +static reiser4_block_nr estimate_rename(struct inode *old_dir, /* directory where @old is located */
43238 + struct dentry *old_name, /* old name */
43239 + struct inode *new_dir, /* directory where @new is located */
43240 + struct dentry *new_name /* new name */ )
43241 +{
43242 + reiser4_block_nr res1, res2;
43243 + dir_plugin *p_parent_old, *p_parent_new;
43244 + file_plugin *p_child_old, *p_child_new;
43245 +
43246 + assert("vpf-311", old_dir != NULL);
43247 + assert("vpf-312", new_dir != NULL);
43248 + assert("vpf-313", old_name != NULL);
43249 + assert("vpf-314", new_name != NULL);
43250 +
43251 + p_parent_old = inode_dir_plugin(old_dir);
43252 + p_parent_new = inode_dir_plugin(new_dir);
43253 + p_child_old = inode_file_plugin(old_name->d_inode);
43254 + if (new_name->d_inode)
43255 + p_child_new = inode_file_plugin(new_name->d_inode);
43256 + else
43257 + p_child_new = NULL;
43258 +
43259 + /* find_entry - can insert one leaf. */
43260 + res1 = res2 = 1;
43261 +
43262 + /* replace_name */
43263 + {
43264 + /* reiser4_add_nlink(p_child_old) and reiser4_del_nlink(p_child_old) */
43265 + res1 += 2 * p_child_old->estimate.update(old_name->d_inode);
43266 + /* update key */
43267 + res1 += 1;
43268 + /* reiser4_del_nlink(p_child_new) */
43269 + if (p_child_new)
43270 + res1 += p_child_new->estimate.update(new_name->d_inode);
43271 + }
43272 +
43273 + /* else add_name */
43274 + {
43275 + /* reiser4_add_nlink(p_parent_new) and reiser4_del_nlink(p_parent_new) */
43276 + res2 +=
43277 + 2 * inode_file_plugin(new_dir)->estimate.update(new_dir);
43278 + /* reiser4_add_nlink(p_parent_old) */
43279 + res2 += p_child_old->estimate.update(old_name->d_inode);
43280 + /* add_entry(p_parent_new) */
43281 + res2 += p_parent_new->estimate.add_entry(new_dir);
43282 + /* reiser4_del_nlink(p_parent_old) */
43283 + res2 += p_child_old->estimate.update(old_name->d_inode);
43284 + }
43285 +
43286 + res1 = res1 < res2 ? res2 : res1;
43287 +
43288 + /* reiser4_write_sd(p_parent_new) */
43289 + res1 += inode_file_plugin(new_dir)->estimate.update(new_dir);
43290 +
43291 + /* reiser4_write_sd(p_child_new) */
43292 + if (p_child_new)
43293 + res1 += p_child_new->estimate.update(new_name->d_inode);
43294 +
43295 + /* hashed_rem_entry(p_parent_old) */
43296 + res1 += p_parent_old->estimate.rem_entry(old_dir);
43297 +
43298 + /* reiser4_del_nlink(p_child_old) */
43299 + res1 += p_child_old->estimate.update(old_name->d_inode);
43300 +
43301 + /* replace_name */
43302 + {
43303 + /* reiser4_add_nlink(p_parent_dir_new) */
43304 + res1 += inode_file_plugin(new_dir)->estimate.update(new_dir);
43305 + /* update_key */
43306 + res1 += 1;
43307 + /* reiser4_del_nlink(p_parent_new) */
43308 + res1 += inode_file_plugin(new_dir)->estimate.update(new_dir);
43309 + /* reiser4_del_nlink(p_parent_old) */
43310 + res1 += inode_file_plugin(old_dir)->estimate.update(old_dir);
43311 + }
43312 +
43313 + /* reiser4_write_sd(p_parent_old) */
43314 + res1 += inode_file_plugin(old_dir)->estimate.update(old_dir);
43315 +
43316 + /* reiser4_write_sd(p_child_old) */
43317 + res1 += p_child_old->estimate.update(old_name->d_inode);
43318 +
43319 + return res1;
43320 +}
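Condensed, the reservation computed above amounts to the following (a reading aid, not normative; note that the dotdot replace_name cost is charged unconditionally, as a worst case, even when the renamed object is not a directory):

	reserve = 1				/* find_entry() may insert a leaf */
		+ max(cost(replace_name), cost(add_name))
		+ sd_update(new_dir)
		+ sd_update(new_inode)		/* only if the target exists */
		+ rem_entry(old_dir)
		+ nlink_update(old_inode)
		+ cost(dotdot replace_name)	/* worst case, always charged */
		+ sd_update(old_dir)
		+ sd_update(old_inode);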
43321 +
43322 +static int hashed_rename_estimate_and_grab(struct inode *old_dir, /* directory where @old is located */
43323 + struct dentry *old_name, /* old name */
43324 + struct inode *new_dir, /* directory where @new is located */
43325 + struct dentry *new_name
43326 + /* new name */ )
43327 +{
43328 + reiser4_block_nr reserve;
43329 +
43330 + reserve = estimate_rename(old_dir, old_name, new_dir, new_name);
43331 +
43332 + if (reiser4_grab_space(reserve, BA_CAN_COMMIT))
43333 + return RETERR(-ENOSPC);
43334 +
43335 + return 0;
43336 +}
43337 +
43338 +/* check whether @old_inode and @new_inode can be moved within file system
43339 + * tree. This singles out attempts to rename pseudo-files, for example. */
43340 +static int can_rename(struct inode *old_dir, struct inode *old_inode,
43341 + struct inode *new_dir, struct inode *new_inode)
43342 +{
43343 + file_plugin *fplug;
43344 + dir_plugin *dplug;
43345 +
43346 + assert("nikita-3370", old_inode != NULL);
43347 +
43348 + dplug = inode_dir_plugin(new_dir);
43349 + fplug = inode_file_plugin(old_inode);
43350 +
43351 + if (dplug == NULL)
43352 + return RETERR(-ENOTDIR);
43353 + else if (new_dir->i_op->create == NULL)
43354 + return RETERR(-EPERM);
43355 + else if (!fplug->can_add_link(old_inode))
43356 + return RETERR(-EMLINK);
43357 + else if (new_inode != NULL) {
43358 + fplug = inode_file_plugin(new_inode);
43359 + if (fplug->can_rem_link != NULL &&
43360 + !fplug->can_rem_link(new_inode))
43361 + return RETERR(-EBUSY);
43362 + }
43363 + return 0;
43364 +}
43365 +
43366 +int find_entry(struct inode *, struct dentry *, lock_handle *,
43367 + znode_lock_mode, reiser4_dir_entry_desc *);
43368 +int reiser4_update_dir(struct inode *);
43369 +
43370 +/* this is the common implementation of the rename method of struct
43371 +   inode_operations.
43372 +   See comments in the body.
43373 +
43374 +   It is arguable that this function could be made generic enough to be
43375 +   applicable to any kind of directory plugin that deals with
43376 +   directories composed out of directory entries. The only obstacle
43377 +   here is that we don't have any data-type to represent a directory
43378 +   entry. This should be reconsidered when more than one directory
43379 +   plugin is implemented.
43380 +*/
43381 +int rename_common(struct inode *old_dir /* directory where @old is located */ ,
43382 + struct dentry *old_name /* old name */ ,
43383 + struct inode *new_dir /* directory where @new is located */ ,
43384 + struct dentry *new_name /* new name */ )
43385 +{
43386 + /* From `The Open Group Base Specifications Issue 6'
43387 +
43388 + If either the old or new argument names a symbolic link, rename()
43389 + shall operate on the symbolic link itself, and shall not resolve
43390 + the last component of the argument. If the old argument and the new
43391 + argument resolve to the same existing file, rename() shall return
43392 + successfully and perform no other action.
43393 +
43394 + [this is done by VFS: vfs_rename()]
43395 +
43396 + If the old argument points to the pathname of a file that is not a
43397 + directory, the new argument shall not point to the pathname of a
43398 + directory.
43399 +
43400 + [checked by VFS: vfs_rename->may_delete()]
43401 +
43402 + If the link named by the new argument exists, it shall
43403 + be removed and old renamed to new. In this case, a link named new
43404 + shall remain visible to other processes throughout the renaming
43405 + operation and refer either to the file referred to by new or old
43406 + before the operation began.
43407 +
43408 + [we should assure this]
43409 +
43410 + Write access permission is required for
43411 + both the directory containing old and the directory containing new.
43412 +
43413 + [checked by VFS: vfs_rename->may_delete(), may_create()]
43414 +
43415 + If the old argument points to the pathname of a directory, the new
43416 + argument shall not point to the pathname of a file that is not a
43417 + directory.
43418 +
43419 + [checked by VFS: vfs_rename->may_delete()]
43420 +
43421 + If the directory named by the new argument exists, it
43422 + shall be removed and old renamed to new. In this case, a link named
43423 + new shall exist throughout the renaming operation and shall refer
43424 + either to the directory referred to by new or old before the
43425 + operation began.
43426 +
43427 + [we should assure this]
43428 +
43429 + If new names an existing directory, it shall be
43430 + required to be an empty directory.
43431 +
43432 + [we should check this]
43433 +
43434 + If the old argument points to a pathname of a symbolic link, the
43435 + symbolic link shall be renamed. If the new argument points to a
43436 + pathname of a symbolic link, the symbolic link shall be removed.
43437 +
43438 + The new pathname shall not contain a path prefix that names
43439 + old. Write access permission is required for the directory
43440 + containing old and the directory containing new. If the old
43441 + argument points to the pathname of a directory, write access
43442 + permission may be required for the directory named by old, and, if
43443 + it exists, the directory named by new.
43444 +
43445 + [checked by VFS: vfs_rename(), vfs_rename_dir()]
43446 +
43447 + If the link named by the new argument exists and the file's link
43448 + count becomes 0 when it is removed and no process has the file
43449 + open, the space occupied by the file shall be freed and the file
43450 + shall no longer be accessible. If one or more processes have the
43451 + file open when the last link is removed, the link shall be removed
43452 + before rename() returns, but the removal of the file contents shall
43453 + be postponed until all references to the file are closed.
43454 +
43455 + [iput() handles this, but we can do this manually, a la
43456 + reiser4_unlink()]
43457 +
43458 + Upon successful completion, rename() shall mark for update the
43459 + st_ctime and st_mtime fields of the parent directory of each file.
43460 +
43461 + [N/A]
43462 +
43463 + */
43464 + reiser4_context *ctx;
43465 + int result;
43466 + int is_dir; /* is @old_name directory */
43467 +
43468 + struct inode *old_inode;
43469 + struct inode *new_inode;
43470 + coord_t *new_coord;
43471 +
43472 + reiser4_dentry_fsdata *new_fsdata;
43473 + dir_plugin *dplug;
43474 + file_plugin *fplug;
43475 +
43476 + reiser4_dir_entry_desc *old_entry, *new_entry, *dotdot_entry;
43477 + lock_handle *new_lh, *dotdot_lh;
43478 + struct dentry *dotdot_name;
43479 + reiser4_dentry_fsdata *dataonstack;
43480 +
43481 + ctx = init_context(old_dir->i_sb);
43482 + if (IS_ERR(ctx))
43483 + return PTR_ERR(ctx);
43484 +
43485 + old_entry = kmalloc(3 * sizeof(*old_entry) + 2 * sizeof(*new_lh) +
43486 + sizeof(*dotdot_name) + sizeof(*dataonstack),
43487 + GFP_KERNEL);
43488 + if (old_entry == NULL) {
43489 + context_set_commit_async(ctx);
43490 + reiser4_exit_context(ctx);
43491 + return RETERR(-ENOMEM);
43492 + }
43493 + memset(old_entry, 0, 3 * sizeof(*old_entry) + 2 * sizeof(*new_lh) +
43494 + sizeof(*dotdot_name) + sizeof(*dataonstack));
43495 +
43496 + new_entry = old_entry + 1;
43497 + dotdot_entry = old_entry + 2;
43498 + new_lh = (lock_handle *)(old_entry + 3);
43499 + dotdot_lh = new_lh + 1;
43500 + dotdot_name = (struct dentry *)(new_lh + 2);
43501 + dataonstack = (reiser4_dentry_fsdata *)(dotdot_name + 1);
43502 +
43503 + assert("nikita-2318", old_dir != NULL);
43504 + assert("nikita-2319", new_dir != NULL);
43505 + assert("nikita-2320", old_name != NULL);
43506 + assert("nikita-2321", new_name != NULL);
43507 +
43508 + old_inode = old_name->d_inode;
43509 + new_inode = new_name->d_inode;
43510 +
43511 + dplug = inode_dir_plugin(old_dir);
43512 + fplug = NULL;
43513 +
43514 + new_fsdata = reiser4_get_dentry_fsdata(new_name);
43515 + if (IS_ERR(new_fsdata)) {
43516 + kfree(old_entry);
43517 + context_set_commit_async(ctx);
43518 + reiser4_exit_context(ctx);
43519 + return PTR_ERR(new_fsdata);
43520 + }
43521 +
43522 + new_coord = &new_fsdata->dec.entry_coord;
43523 + coord_clear_iplug(new_coord);
43524 +
43525 + is_dir = S_ISDIR(old_inode->i_mode);
43526 +
43527 + assert("nikita-3461", old_inode->i_nlink >= 1 + !!is_dir);
43528 +
43529 +	/* if the target is an existing directory and it is not empty,
43530 +	   return an error.
43531 +
43532 +	   This check is done separately, because is_dir_empty() requires a
43533 +	   tree traversal and has to be done before locks are taken. */
43534 + if (is_dir && new_inode != NULL && is_dir_empty(new_inode) != 0) {
43535 + kfree(old_entry);
43536 + context_set_commit_async(ctx);
43537 + reiser4_exit_context(ctx);
43538 + return RETERR(-ENOTEMPTY);
43539 + }
43540 +
43541 + result = can_rename(old_dir, old_inode, new_dir, new_inode);
43542 + if (result != 0) {
43543 + kfree(old_entry);
43544 + context_set_commit_async(ctx);
43545 + reiser4_exit_context(ctx);
43546 + return result;
43547 + }
43548 +
43549 + result = hashed_rename_estimate_and_grab(old_dir, old_name,
43550 + new_dir, new_name);
43551 + if (result != 0) {
43552 + kfree(old_entry);
43553 + context_set_commit_async(ctx);
43554 + reiser4_exit_context(ctx);
43555 + return result;
43556 + }
43557 +
43558 + init_lh(new_lh);
43559 +
43560 + /* find entry for @new_name */
43561 + result = find_entry(new_dir,
43562 + new_name, new_lh, ZNODE_WRITE_LOCK, new_entry);
43563 +
43564 + if (IS_CBKERR(result)) {
43565 + done_lh(new_lh);
43566 + kfree(old_entry);
43567 + context_set_commit_async(ctx);
43568 + reiser4_exit_context(ctx);
43569 + return result;
43570 + }
43571 +
43572 + seal_done(&new_fsdata->dec.entry_seal);
43573 +
43574 + /* add or replace name for @old_inode as @new_name */
43575 + if (new_inode != NULL) {
43576 + /* target (@new_name) exists. */
43577 + /* Not clear what to do with objects that are
43578 + both directories and files at the same time. */
43579 + if (result == CBK_COORD_FOUND) {
43580 + result = replace_name(old_inode,
43581 + new_dir,
43582 + new_inode, new_coord, new_lh);
43583 + if (result == 0)
43584 + fplug = inode_file_plugin(new_inode);
43585 + } else if (result == CBK_COORD_NOTFOUND) {
43586 +			/* VFS told us that @new_name is bound to an existing
43587 +			   inode, but we failed to find the directory entry. */
43588 + warning("nikita-2324", "Target not found");
43589 + result = RETERR(-ENOENT);
43590 + }
43591 + } else {
43592 +		/* target (@new_name) doesn't exist. */
43593 + if (result == CBK_COORD_NOTFOUND)
43594 + result = add_name(old_inode,
43595 + new_dir,
43596 + new_name, new_coord, new_lh, is_dir);
43597 + else if (result == CBK_COORD_FOUND) {
43598 +			/* VFS told us that @new_name is a "negative" dentry,
43599 +			   but we found a directory entry. */
43600 + warning("nikita-2331", "Target found unexpectedly");
43601 + result = RETERR(-EIO);
43602 + }
43603 + }
43604 +
43605 + assert("nikita-3462", ergo(result == 0,
43606 + old_inode->i_nlink >= 2 + !!is_dir));
43607 +
43608 + /* We are done with all modifications to the @new_dir, release lock on
43609 + node. */
43610 + done_lh(new_lh);
43611 +
43612 + if (fplug != NULL) {
43613 + /* detach @new_inode from name-space */
43614 + result = fplug->detach(new_inode, new_dir);
43615 + if (result != 0)
43616 + warning("nikita-2330", "Cannot detach %lli: %i. %s",
43617 + (unsigned long long)get_inode_oid(new_inode),
43618 + result, possible_leak);
43619 + }
43620 +
43621 + if (new_inode != NULL)
43622 + reiser4_update_sd(new_inode);
43623 +
43624 + if (result == 0) {
43625 + old_entry->obj = old_inode;
43626 +
43627 + dplug->build_entry_key(old_dir,
43628 + &old_name->d_name, &old_entry->key);
43629 +
43630 +		/* At this stage a new name has been introduced for
43631 +		   @old_inode, and the @old_inode, @new_dir, and @new_inode
43632 +		   i_nlink counters have been updated.
43633 +
43634 +		   We want to remove @old_name now. If @old_inode wasn't a
43635 +		   directory, this is simple.
43636 +		 */
43637 + result = dplug->rem_entry(old_dir, old_name, old_entry);
43638 + if (result != 0 && result != -ENOMEM) {
43639 + warning("nikita-2335",
43640 + "Cannot remove old name: %i", result);
43641 + } else {
43642 + result = reiser4_del_nlink(old_inode, old_dir, 0);
43643 + if (result != 0 && result != -ENOMEM) {
43644 + warning("nikita-2337",
43645 + "Cannot drop link on old: %i", result);
43646 + }
43647 + }
43648 +
43649 + if (result == 0 && is_dir) {
43650 + /* @old_inode is directory. We also have to update
43651 + dotdot entry. */
43652 + coord_t *dotdot_coord;
43653 +
43654 +			/* these are pointers into the kmalloc'ed buffer, so
43655 +			 * sizeof must be applied to the pointed-to objects */
43654 +			memset(dataonstack, 0, sizeof *dataonstack);
43655 +			memset(dotdot_entry, 0, sizeof *dotdot_entry);
43656 +			dotdot_entry->obj = old_dir;
43657 +			memset(dotdot_name, 0, sizeof *dotdot_name);
43658 + dotdot_name->d_name.name = "..";
43659 + dotdot_name->d_name.len = 2;
43660 +			/*
43661 +			 * use the preallocated ->d_fsdata to avoid calling
43662 +			 * reiser4_get_dentry_fsdata(). Locking is not needed,
43663 +			 * because the dentry is private to the current thread.
43664 +			 */
43665 + dotdot_name->d_fsdata = dataonstack;
43666 + init_lh(dotdot_lh);
43667 +
43668 + dotdot_coord = &dataonstack->dec.entry_coord;
43669 + coord_clear_iplug(dotdot_coord);
43670 +
43671 + result = find_entry(old_inode, dotdot_name, dotdot_lh,
43672 + ZNODE_WRITE_LOCK, dotdot_entry);
43673 + if (result == 0) {
43674 + /* replace_name() decreases i_nlink on
43675 + * @old_dir */
43676 + result = replace_name(new_dir,
43677 + old_inode,
43678 + old_dir,
43679 + dotdot_coord, dotdot_lh);
43680 + } else
43681 + result = RETERR(-EIO);
43682 + done_lh(dotdot_lh);
43683 + }
43684 + }
43685 + reiser4_update_dir(new_dir);
43686 + reiser4_update_dir(old_dir);
43687 + reiser4_update_sd(old_inode);
43688 + if (result == 0) {
43689 + file_plugin *fplug;
43690 +
43691 + if (new_inode != NULL) {
43692 +			/* add safe-link for target file (in case we removed
43693 +			 * the last reference to the poor fellow) */
43694 + fplug = inode_file_plugin(new_inode);
43695 + if (new_inode->i_nlink == 0)
43696 + result = safe_link_add(new_inode, SAFE_UNLINK);
43697 + }
43698 + }
43699 + kfree(old_entry);
43700 + context_set_commit_async(ctx);
43701 + reiser4_exit_context(ctx);
43702 + return result;
43703 +}
43704 +
43705 +#if 0
43706 +int rename_common(struct inode *old_dir /* directory where @old is located */ ,
43707 + struct dentry *old_name /* old name */ ,
43708 + struct inode *new_dir /* directory where @new is located */ ,
43709 + struct dentry *new_name /* new name */ )
43710 +{
43711 + /* From `The Open Group Base Specifications Issue 6'
43712 +
43713 + If either the old or new argument names a symbolic link, rename()
43714 + shall operate on the symbolic link itself, and shall not resolve
43715 + the last component of the argument. If the old argument and the new
43716 + argument resolve to the same existing file, rename() shall return
43717 + successfully and perform no other action.
43718 +
43719 + [this is done by VFS: vfs_rename()]
43720 +
43721 + If the old argument points to the pathname of a file that is not a
43722 + directory, the new argument shall not point to the pathname of a
43723 + directory.
43724 +
43725 + [checked by VFS: vfs_rename->may_delete()]
43726 +
43727 + If the link named by the new argument exists, it shall
43728 + be removed and old renamed to new. In this case, a link named new
43729 + shall remain visible to other processes throughout the renaming
43730 + operation and refer either to the file referred to by new or old
43731 + before the operation began.
43732 +
43733 + [we should assure this]
43734 +
43735 + Write access permission is required for
43736 + both the directory containing old and the directory containing new.
43737 +
43738 + [checked by VFS: vfs_rename->may_delete(), may_create()]
43739 +
43740 + If the old argument points to the pathname of a directory, the new
43741 + argument shall not point to the pathname of a file that is not a
43742 + directory.
43743 +
43744 + [checked by VFS: vfs_rename->may_delete()]
43745 +
43746 + If the directory named by the new argument exists, it
43747 + shall be removed and old renamed to new. In this case, a link named
43748 + new shall exist throughout the renaming operation and shall refer
43749 + either to the directory referred to by new or old before the
43750 + operation began.
43751 +
43752 + [we should assure this]
43753 +
43754 + If new names an existing directory, it shall be
43755 + required to be an empty directory.
43756 +
43757 + [we should check this]
43758 +
43759 + If the old argument points to a pathname of a symbolic link, the
43760 + symbolic link shall be renamed. If the new argument points to a
43761 + pathname of a symbolic link, the symbolic link shall be removed.
43762 +
43763 + The new pathname shall not contain a path prefix that names
43764 + old. Write access permission is required for the directory
43765 + containing old and the directory containing new. If the old
43766 + argument points to the pathname of a directory, write access
43767 + permission may be required for the directory named by old, and, if
43768 + it exists, the directory named by new.
43769 +
43770 + [checked by VFS: vfs_rename(), vfs_rename_dir()]
43771 +
43772 + If the link named by the new argument exists and the file's link
43773 + count becomes 0 when it is removed and no process has the file
43774 + open, the space occupied by the file shall be freed and the file
43775 + shall no longer be accessible. If one or more processes have the
43776 + file open when the last link is removed, the link shall be removed
43777 + before rename() returns, but the removal of the file contents shall
43778 + be postponed until all references to the file are closed.
43779 +
43780 + [iput() handles this, but we can do this manually, a la
43781 + reiser4_unlink()]
43782 +
43783 + Upon successful completion, rename() shall mark for update the
43784 + st_ctime and st_mtime fields of the parent directory of each file.
43785 +
43786 + [N/A]
43787 +
43788 + */
43789 + reiser4_context *ctx;
43790 + int result;
43791 + int is_dir; /* is @old_name directory */
43792 + struct inode *old_inode;
43793 + struct inode *new_inode;
43794 + reiser4_dir_entry_desc old_entry;
43795 + reiser4_dir_entry_desc new_entry;
43796 + coord_t *new_coord;
43797 + reiser4_dentry_fsdata *new_fsdata;
43798 + lock_handle new_lh;
43799 + dir_plugin *dplug;
43800 + file_plugin *fplug;
43801 +
43802 + ctx = init_context(old_dir->i_sb);
43803 + if (IS_ERR(ctx))
43804 + return PTR_ERR(ctx);
43805 +
43806 + assert("nikita-2318", old_dir != NULL);
43807 + assert("nikita-2319", new_dir != NULL);
43808 + assert("nikita-2320", old_name != NULL);
43809 + assert("nikita-2321", new_name != NULL);
43810 +
43811 + old_inode = old_name->d_inode;
43812 + new_inode = new_name->d_inode;
43813 +
43814 + dplug = inode_dir_plugin(old_dir);
43815 + fplug = NULL;
43816 +
43817 + new_fsdata = reiser4_get_dentry_fsdata(new_name);
43818 + if (IS_ERR(new_fsdata)) {
43819 + result = PTR_ERR(new_fsdata);
43820 + goto exit;
43821 + }
43822 +
43823 + new_coord = &new_fsdata->dec.entry_coord;
43824 + coord_clear_iplug(new_coord);
43825 +
43826 + is_dir = S_ISDIR(old_inode->i_mode);
43827 +
43828 + assert("nikita-3461", old_inode->i_nlink >= 1 + !!is_dir);
43829 +
43830 +	/* if the target is an existing directory and it is not empty,
43831 +	   return an error.
43832 +
43833 +	   This check is done separately, because is_dir_empty() requires a
43834 +	   tree traversal and has to be done before locks are taken. */
43835 + if (is_dir && new_inode != NULL && is_dir_empty(new_inode) != 0)
43836 + return RETERR(-ENOTEMPTY);
43837 +
43838 + result = can_rename(old_dir, old_inode, new_dir, new_inode);
43839 + if (result != 0)
43840 + goto exit;
43841 +
43842 + result = hashed_rename_estimate_and_grab(old_dir, old_name,
43843 + new_dir, new_name);
43844 + if (result != 0)
43845 + goto exit;
43846 +
43847 + init_lh(&new_lh);
43848 +
43849 + /* find entry for @new_name */
43850 + result = find_entry(new_dir,
43851 + new_name, &new_lh, ZNODE_WRITE_LOCK, &new_entry);
43852 +
43853 + if (IS_CBKERR(result)) {
43854 + done_lh(&new_lh);
43855 + goto exit;
43856 + }
43857 +
43858 + seal_done(&new_fsdata->dec.entry_seal);
43859 +
43860 + /* add or replace name for @old_inode as @new_name */
43861 + if (new_inode != NULL) {
43862 + /* target (@new_name) exists. */
43863 + /* Not clear what to do with objects that are
43864 + both directories and files at the same time. */
43865 + if (result == CBK_COORD_FOUND) {
43866 + result = replace_name(old_inode,
43867 + new_dir,
43868 + new_inode, new_coord, &new_lh);
43869 + if (result == 0)
43870 + fplug = inode_file_plugin(new_inode);
43871 + } else if (result == CBK_COORD_NOTFOUND) {
43872 +			/* VFS told us that @new_name is bound to an existing
43873 +			   inode, but we failed to find the directory entry. */
43874 + warning("nikita-2324", "Target not found");
43875 + result = RETERR(-ENOENT);
43876 + }
43877 + } else {
43878 +		/* target (@new_name) doesn't exist. */
43879 + if (result == CBK_COORD_NOTFOUND)
43880 + result = add_name(old_inode,
43881 + new_dir,
43882 + new_name, new_coord, &new_lh, is_dir);
43883 + else if (result == CBK_COORD_FOUND) {
43884 +			/* VFS told us that @new_name is a "negative" dentry,
43885 +			   but we found a directory entry. */
43886 + warning("nikita-2331", "Target found unexpectedly");
43887 + result = RETERR(-EIO);
43888 + }
43889 + }
43890 +
43891 + assert("nikita-3462", ergo(result == 0,
43892 + old_inode->i_nlink >= 2 + !!is_dir));
43893 +
43894 + /* We are done with all modifications to the @new_dir, release lock on
43895 + node. */
43896 + done_lh(&new_lh);
43897 +
43898 + if (fplug != NULL) {
43899 + /* detach @new_inode from name-space */
43900 + result = fplug->detach(new_inode, new_dir);
43901 + if (result != 0)
43902 + warning("nikita-2330", "Cannot detach %lli: %i. %s",
43903 + (unsigned long long)get_inode_oid(new_inode),
43904 + result, possible_leak);
43905 + }
43906 +
43907 + if (new_inode != NULL)
43908 + reiser4_update_sd(new_inode);
43909 +
43910 + if (result == 0) {
43911 + memset(&old_entry, 0, sizeof old_entry);
43912 + old_entry.obj = old_inode;
43913 +
43914 + dplug->build_entry_key(old_dir,
43915 + &old_name->d_name, &old_entry.key);
43916 +
43917 +		/* At this stage a new name has been introduced for
43918 +		   @old_inode, and the @old_inode, @new_dir, and @new_inode
43919 +		   i_nlink counters have been updated.
43920 +
43921 +		   We want to remove @old_name now. If @old_inode wasn't a
43922 +		   directory, this is simple.
43923 +		 */
43924 + result = dplug->rem_entry(old_dir, old_name, &old_entry);
43925 + /*result = rem_entry_hashed(old_dir, old_name, &old_entry); */
43926 + if (result != 0 && result != -ENOMEM) {
43927 + warning("nikita-2335",
43928 + "Cannot remove old name: %i", result);
43929 + } else {
43930 + result = reiser4_del_nlink(old_inode, old_dir, 0);
43931 + if (result != 0 && result != -ENOMEM) {
43932 + warning("nikita-2337",
43933 + "Cannot drop link on old: %i", result);
43934 + }
43935 + }
43936 +
43937 + if (result == 0 && is_dir) {
43938 + /* @old_inode is directory. We also have to update
43939 + dotdot entry. */
43940 + coord_t *dotdot_coord;
43941 + lock_handle dotdot_lh;
43942 + struct dentry dotdot_name;
43943 + reiser4_dir_entry_desc dotdot_entry;
43944 + reiser4_dentry_fsdata dataonstack;
43945 + reiser4_dentry_fsdata *fsdata;
43946 +
43947 + memset(&dataonstack, 0, sizeof dataonstack);
43948 + memset(&dotdot_entry, 0, sizeof dotdot_entry);
43949 + dotdot_entry.obj = old_dir;
43950 + memset(&dotdot_name, 0, sizeof dotdot_name);
43951 + dotdot_name.d_name.name = "..";
43952 + dotdot_name.d_name.len = 2;
43953 + /*
43954 + * allocate ->d_fsdata on the stack to avoid using
43955 + * reiser4_get_dentry_fsdata(). Locking is not needed,
43956 + * because dentry is private to the current thread.
43957 + */
43958 + dotdot_name.d_fsdata = &dataonstack;
43959 + init_lh(&dotdot_lh);
43960 +
43961 + fsdata = &dataonstack;
43962 + dotdot_coord = &fsdata->dec.entry_coord;
43963 + coord_clear_iplug(dotdot_coord);
43964 +
43965 + result = find_entry(old_inode, &dotdot_name, &dotdot_lh,
43966 + ZNODE_WRITE_LOCK, &dotdot_entry);
43967 + if (result == 0) {
43968 + /* replace_name() decreases i_nlink on
43969 + * @old_dir */
43970 + result = replace_name(new_dir,
43971 + old_inode,
43972 + old_dir,
43973 + dotdot_coord, &dotdot_lh);
43974 + } else
43975 + result = RETERR(-EIO);
43976 + done_lh(&dotdot_lh);
43977 + }
43978 + }
43979 + reiser4_update_dir(new_dir);
43980 + reiser4_update_dir(old_dir);
43981 + reiser4_update_sd(old_inode);
43982 + if (result == 0) {
43983 + file_plugin *fplug;
43984 +
43985 + if (new_inode != NULL) {
43986 +			/* add safe-link for target file (in case we removed
43987 +			 * the last reference to the poor fellow) */
43988 + fplug = inode_file_plugin(new_inode);
43989 + if (new_inode->i_nlink == 0)
43990 + result = safe_link_add(new_inode, SAFE_UNLINK);
43991 + }
43992 + }
43993 + exit:
43994 + context_set_commit_async(ctx);
43995 + reiser4_exit_context(ctx);
43996 + return result;
43997 +}
43998 +#endif
43999 Index: linux-2.6.16/fs/reiser4/plugin/item/Makefile
44000 ===================================================================
44001 --- /dev/null
44002 +++ linux-2.6.16/fs/reiser4/plugin/item/Makefile
44003 @@ -0,0 +1,18 @@
44004 +obj-$(CONFIG_REISER4_FS) += item_plugins.o
44005 +
44006 +item_plugins-objs := \
44007 + item.o \
44008 + static_stat.o \
44009 + sde.o \
44010 + cde.o \
44011 + blackbox.o \
44012 + internal.o \
44013 + tail.o \
44014 + ctail.o \
44015 + extent.o \
44016 + extent_item_ops.o \
44017 + extent_file_ops.o \
44018 + extent_flush_ops.o
44019 +
44020 +
44021 +
44022 Index: linux-2.6.16/fs/reiser4/plugin/item/acl.h
44023 ===================================================================
44024 --- /dev/null
44025 +++ linux-2.6.16/fs/reiser4/plugin/item/acl.h
44026 @@ -0,0 +1,66 @@
44027 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
44028 +
44029 +/* Directory entry. */
44030 +
44031 +#if !defined( __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ )
44032 +#define __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__
44033 +
44034 +#include "../../forward.h"
44035 +#include "../../dformat.h"
44036 +#include "../../kassign.h"
44037 +#include "../../key.h"
44038 +
44039 +#include <linux/fs.h>
44040 +#include <linux/dcache.h> /* for struct dentry */
44041 +
44042 +typedef struct directory_entry_format {
44043 +	/* key of object stat-data. It's not necessary to store the whole
44044 +	   key here, because it's always a stat-data key, so the minor
44045 +	   packing locality and the offset could be omitted here. But this
44046 +	   relies on a particular key allocation scheme for stat-data, so,
44047 +	   for extensibility's sake, the whole key can be stored here.
44048 +
44049 +	   We store the key as an array of bytes, because we don't want
44050 +	   8-byte alignment of dir entries.
44051 +	 */
44052 + obj_key_id id;
44053 + /* file name. Null terminated string. */
44054 + d8 name[0];
44055 +} directory_entry_format;
44056 +
44057 +void print_de(const char *prefix, coord_t * coord);
44058 +int extract_key_de(const coord_t * coord, reiser4_key * key);
44059 +int update_key_de(const coord_t * coord, const reiser4_key * key,
44060 + lock_handle * lh);
44061 +char *extract_name_de(const coord_t * coord, char *buf);
44062 +unsigned extract_file_type_de(const coord_t * coord);
44063 +int add_entry_de(struct inode *dir, coord_t * coord,
44064 + lock_handle * lh, const struct dentry *name,
44065 + reiser4_dir_entry_desc * entry);
44066 +int rem_entry_de(struct inode *dir, const struct qstr *name, coord_t * coord,
44067 + lock_handle * lh, reiser4_dir_entry_desc * entry);
44068 +int max_name_len_de(const struct inode *dir);
44069 +
44070 +int de_rem_and_shrink(struct inode *dir, coord_t * coord, int length);
44071 +
44072 +char *extract_dent_name(const coord_t * coord,
44073 + directory_entry_format * dent, char *buf);
44074 +
44075 +#if REISER4_LARGE_KEY
44076 +#define DE_NAME_BUF_LEN (24)
44077 +#else
44078 +#define DE_NAME_BUF_LEN (16)
44079 +#endif
44080 +
44081 +/* __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ */
44082 +#endif
44083 +
44084 +/* Make Linus happy.
44085 + Local variables:
44086 + c-indentation-style: "K&R"
44087 + mode-name: "LC"
44088 + c-basic-offset: 8
44089 + tab-width: 8
44090 + fill-column: 120
44091 + End:
44092 +*/
44093 Index: linux-2.6.16/fs/reiser4/plugin/item/blackbox.c
44094 ===================================================================
44095 --- /dev/null
44096 +++ linux-2.6.16/fs/reiser4/plugin/item/blackbox.c
44097 @@ -0,0 +1,142 @@
44098 +/* Copyright 2003 by Hans Reiser, licensing governed by
44099 + * reiser4/README */
44100 +
44101 +/* Black box item implementation */
44102 +
44103 +#include "../../forward.h"
44104 +#include "../../debug.h"
44105 +#include "../../dformat.h"
44106 +#include "../../kassign.h"
44107 +#include "../../coord.h"
44108 +#include "../../tree.h"
44109 +#include "../../lock.h"
44110 +
44111 +#include "blackbox.h"
44112 +#include "item.h"
44113 +#include "../plugin.h"
44114 +
44115 +int
44116 +store_black_box(reiser4_tree * tree,
44117 + const reiser4_key * key, void *data, int length)
44118 +{
44119 + int result;
44120 + reiser4_item_data idata;
44121 + coord_t coord;
44122 + lock_handle lh;
44123 +
44124 + memset(&idata, 0, sizeof idata);
44125 +
44126 + idata.data = data;
44127 + idata.user = 0;
44128 + idata.length = length;
44129 + idata.iplug = item_plugin_by_id(BLACK_BOX_ID);
44130 +
44131 + init_lh(&lh);
44132 + result = insert_by_key(tree, key,
44133 + &idata, &coord, &lh, LEAF_LEVEL, CBK_UNIQUE);
44134 +
44135 + assert("nikita-3413",
44136 + ergo(result == 0,
44137 + WITH_COORD(&coord,
44138 + item_length_by_coord(&coord) == length)));
44139 +
44140 + done_lh(&lh);
44141 + return result;
44142 +}
44143 +
44144 +int
44145 +load_black_box(reiser4_tree * tree,
44146 + reiser4_key * key, void *data, int length, int exact)
44147 +{
44148 + int result;
44149 + coord_t coord;
44150 + lock_handle lh;
44151 +
44152 + init_lh(&lh);
44153 + result = coord_by_key(tree, key,
44154 + &coord, &lh, ZNODE_READ_LOCK,
44155 + exact ? FIND_EXACT : FIND_MAX_NOT_MORE_THAN,
44156 + LEAF_LEVEL, LEAF_LEVEL, CBK_UNIQUE, NULL);
44157 +
44158 + if (result == 0) {
44159 + int ilen;
44160 +
44161 + result = zload(coord.node);
44162 + if (result == 0) {
44163 + ilen = item_length_by_coord(&coord);
44164 + if (ilen <= length) {
44165 + memcpy(data, item_body_by_coord(&coord), ilen);
44166 + unit_key_by_coord(&coord, key);
44167 + } else if (exact) {
44168 + /*
44169 + * item is larger than buffer provided by the
44170 + * user. Only issue a warning if @exact is
44171 + * set. If @exact is false, we are iterating
44172 + * over all safe-links and here we are reaching
44173 + * the end of the iteration.
44174 + */
44175 + warning("nikita-3415",
44176 + "Wrong black box length: %i > %i",
44177 + ilen, length);
44178 + result = RETERR(-EIO);
44179 + }
44180 + zrelse(coord.node);
44181 + }
44182 + }
44183 +
44184 + done_lh(&lh);
44185 + return result;
44186 +
44187 +}
44188 +
44189 +int
44190 +update_black_box(reiser4_tree * tree,
44191 + const reiser4_key * key, void *data, int length)
44192 +{
44193 + int result;
44194 + coord_t coord;
44195 + lock_handle lh;
44196 +
44197 + init_lh(&lh);
44198 + result = coord_by_key(tree, key,
44199 + &coord, &lh, ZNODE_READ_LOCK,
44200 + FIND_EXACT,
44201 + LEAF_LEVEL, LEAF_LEVEL, CBK_UNIQUE, NULL);
44202 + if (result == 0) {
44203 + int ilen;
44204 +
44205 + result = zload(coord.node);
44206 + if (result == 0) {
44207 + ilen = item_length_by_coord(&coord);
44208 + if (length <= ilen) {
44209 + memcpy(item_body_by_coord(&coord), data,
44210 + length);
44211 + } else {
44212 + warning("nikita-3437",
44213 + "Wrong black box length: %i < %i",
44214 + ilen, length);
44215 + result = RETERR(-EIO);
44216 + }
44217 + zrelse(coord.node);
44218 + }
44219 + }
44220 +
44221 + done_lh(&lh);
44222 + return result;
44223 +
44224 +}
44225 +
44226 +int kill_black_box(reiser4_tree * tree, const reiser4_key * key)
44227 +{
44228 + return cut_tree(tree, key, key, NULL, 1);
44229 +}
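For context, the black-box API above round-trips like this; the comment in load_black_box() indicates that safe-links are the in-tree user, and the key construction is elided because it depends on the caller's assignment scheme (a hedged usage sketch, not reiser4 code):

	/* hypothetical round-trip, for illustration only */
	static void blackbox_roundtrip(reiser4_tree *tree, reiser4_key *key)
	{
		char payload[16] = "example";
		char readback[16];

		/* @key must have been built by some caller-specific scheme */
		if (store_black_box(tree, key, payload, sizeof payload) != 0)
			return;
		if (load_black_box(tree, key, readback, sizeof readback,
				   1 /* exact match required */) == 0)
			kill_black_box(tree, key);	/* cut it back out */
	}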
44230 +
44231 +/* Make Linus happy.
44232 + Local variables:
44233 + c-indentation-style: "K&R"
44234 + mode-name: "LC"
44235 + c-basic-offset: 8
44236 + tab-width: 8
44237 + fill-column: 120
44238 + End:
44239 +*/
44240 Index: linux-2.6.16/fs/reiser4/plugin/item/blackbox.h
44241 ===================================================================
44242 --- /dev/null
44243 +++ linux-2.6.16/fs/reiser4/plugin/item/blackbox.h
44244 @@ -0,0 +1,33 @@
44245 +/* Copyright 2003 by Hans Reiser, licensing governed by
44246 + * reiser4/README */
44247 +
44248 +/* "Black box" entry to fixed-width contain user supplied data */
44249 +
44250 +#if !defined( __FS_REISER4_BLACK_BOX_H__ )
44251 +#define __FS_REISER4_BLACK_BOX_H__
44252 +
44253 +#include "../../forward.h"
44254 +#include "../../dformat.h"
44255 +#include "../../kassign.h"
44256 +#include "../../key.h"
44257 +
44258 +extern int store_black_box(reiser4_tree * tree,
44259 + const reiser4_key * key, void *data, int length);
44260 +extern int load_black_box(reiser4_tree * tree,
44261 + reiser4_key * key, void *data, int length, int exact);
44262 +extern int kill_black_box(reiser4_tree * tree, const reiser4_key * key);
44263 +extern int update_black_box(reiser4_tree * tree,
44264 + const reiser4_key * key, void *data, int length);
44265 +
44266 +/* __FS_REISER4_BLACK_BOX_H__ */
44267 +#endif
44268 +
44269 +/* Make Linus happy.
44270 + Local variables:
44271 + c-indentation-style: "K&R"
44272 + mode-name: "LC"
44273 + c-basic-offset: 8
44274 + tab-width: 8
44275 + fill-column: 120
44276 + End:
44277 +*/
44278 Index: linux-2.6.16/fs/reiser4/plugin/item/cde.c
44279 ===================================================================
44280 --- /dev/null
44281 +++ linux-2.6.16/fs/reiser4/plugin/item/cde.c
44282 @@ -0,0 +1,1007 @@
44283 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
44284 +
44285 +/* Directory entry implementation */
44286 +
44287 +/* DESCRIPTION:
44288 +
44289 + This is "compound" directory item plugin implementation. This directory
44290 + item type is compound (as opposed to the "simple directory item" in
44291 + fs/reiser4/plugin/item/sde.[ch]), because it consists of several directory
44292 + entries.
44293 +
44294 +   The reason behind this decision is disk space efficiency: all directory
44295 +   entries inside the same directory have an identical fragment in their
44296 +   keys. This, of course, depends on the key assignment policy. In our
44297 +   default key assignment policy, all directory entries have the same
44298 +   locality, which is equal to the object id of their directory.
44299 +
44300 +   Composing a directory item out of several directory entries for the same
44301 +   directory allows us to store said key fragment only once. That is, an
44302 +   ad hoc form of key compression (stem compression) is implemented here,
44303 +   because general key compression is not supposed to be implemented in
44304 +   v4.0.
44305 +
44306 +   Another decision that was made regarding all directory item plugins is
44307 +   that they will store entry keys unaligned. This is for the sake of disk
44308 +   space efficiency again.
44309 +
44310 +   It should be noted that storing keys unaligned increases CPU consumption,
44311 +   at least on some architectures.
44312 +
44313 + Internal on-disk structure of the compound directory item is the following:
44314 +
44315 + HEADER cde_item_format. Here number of entries is stored.
44316 + ENTRY_HEADER_0 cde_unit_header. Here part of entry key and
44317 + ENTRY_HEADER_1 offset of entry body are stored.
44318 + ENTRY_HEADER_2 (basically two last parts of key)
44319 + ...
44320 + ENTRY_HEADER_N
44321 +   ENTRY_BODY_0		directory_entry_format. Here part of the stat-data
44322 +   ENTRY_BODY_1		key and the NUL-terminated name are stored.
44323 +   ENTRY_BODY_2		(part of the stat-data key in the
44324 +				sense that, since all SDs have
44325 +				zero offset, the offset is not
44326 +				stored on disk).
44327 + ...
44328 + ENTRY_BODY_N
44329 +
44330 +   When it comes to balancing, each directory entry in a compound
44331 +   directory item is a unit, that is, something that can be cut from one
44332 +   item and pasted into another item of the same type. Handling of unit
44333 +   cut and paste is the major reason for the complexity of the code below.
44334 +
44335 +*/
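The layout described above corresponds to on-disk structures of roughly the following shape, reconstructed from the accessors below (units(), header_at(), offset_of()); the authoritative definitions live in cde.h, which is outside this hunk:

	typedef struct cde_unit_header {
		/* last components of the entry key, compared by
		   de_id_key_cmp() during lookup */
		de_id hash;
		/* offset of this entry's body within the item */
		d16 offset;
	} cde_unit_header;

	typedef struct cde_item_format {
		/* number of entries in this item */
		d16 num_of_entries;
		/* entry headers; the variable-length bodies follow them */
		cde_unit_header entry[0];
	} cde_item_format;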
44336 +
44337 +#include "../../forward.h"
44338 +#include "../../debug.h"
44339 +#include "../../dformat.h"
44340 +#include "../../kassign.h"
44341 +#include "../../key.h"
44342 +#include "../../coord.h"
44343 +#include "sde.h"
44344 +#include "cde.h"
44345 +#include "item.h"
44346 +#include "../node/node.h"
44347 +#include "../plugin.h"
44348 +#include "../../znode.h"
44349 +#include "../../carry.h"
44350 +#include "../../tree.h"
44351 +#include "../../inode.h"
44352 +
44353 +#include <linux/fs.h> /* for struct inode */
44354 +#include <linux/dcache.h> /* for struct dentry */
44355 +#include <linux/quotaops.h>
44356 +
44357 +#if 0
44358 +#define CHECKME(coord) \
44359 +({ \
44360 + const char *message; \
44361 + coord_t dup; \
44362 + \
44363 + coord_dup_nocheck(&dup, (coord)); \
44364 + dup.unit_pos = 0; \
44365 + assert("nikita-2871", cde_check(&dup, &message) == 0); \
44366 +})
44367 +#else
44368 +#define CHECKME(coord) noop
44369 +#endif
44370 +
44371 +/* return body of compound directory item at @coord */
44372 +static inline cde_item_format *formatted_at(const coord_t * coord)
44373 +{
44374 + assert("nikita-1282", coord != NULL);
44375 + return item_body_by_coord(coord);
44376 +}
44377 +
44378 +/* return entry header at @coord */
44379 +static inline cde_unit_header *header_at(const coord_t *
44380 + coord /* coord of item */ ,
44381 + int idx /* index of unit */ )
44382 +{
44383 + assert("nikita-1283", coord != NULL);
44384 + return &formatted_at(coord)->entry[idx];
44385 +}
44386 +
44387 +/* return number of units in compound directory item at @coord */
44388 +static int units(const coord_t * coord /* coord of item */ )
44389 +{
44390 + return le16_to_cpu(get_unaligned(&formatted_at(coord)->num_of_entries));
44391 +}
44392 +
44393 +/* return offset of the body of @idx-th entry in @coord */
44394 +static unsigned int offset_of(const coord_t * coord /* coord of item */ ,
44395 + int idx /* index of unit */ )
44396 +{
44397 + if (idx < units(coord))
44398 + return le16_to_cpu(get_unaligned(&header_at(coord, idx)->offset));
44399 + else if (idx == units(coord))
44400 + return item_length_by_coord(coord);
44401 + else
44402 + impossible("nikita-1308", "Wrong idx");
44403 + return 0;
44404 +}
44405 +
44406 +/* set offset of the body of @idx-th entry in @coord */
44407 +static void set_offset(const coord_t * coord /* coord of item */ ,
44408 + int idx /* index of unit */ ,
44409 + unsigned int offset /* new offset */ )
44410 +{
44411 + put_unaligned(cpu_to_le16((__u16) offset), &header_at(coord, idx)->offset);
44412 +}
44413 +
44414 +static void adj_offset(const coord_t * coord /* coord of item */ ,
44415 + int idx /* index of unit */ ,
44416 + int delta /* offset change */ )
44417 +{
44418 + d16 *doffset;
44419 + __u16 offset;
44420 +
44421 + doffset = &header_at(coord, idx)->offset;
44422 + offset = le16_to_cpu(get_unaligned(doffset));
44423 + offset += delta;
44424 + put_unaligned(cpu_to_le16((__u16) offset), doffset);
44425 +}
44426 +
44427 +/* return pointer to @offset-th byte from the beginning of @coord */
44428 +static char *address(const coord_t * coord /* coord of item */ ,
44429 + int offset)
44430 +{
44431 + return ((char *)item_body_by_coord(coord)) + offset;
44432 +}
44433 +
44434 +/* return pointer to the body of @idx-th entry in @coord */
44435 +static directory_entry_format *entry_at(const coord_t * coord /* coord of
44436 + * item */ ,
44437 + int idx /* index of unit */ )
44438 +{
44439 + return (directory_entry_format *) address(coord,
44440 + (int)offset_of(coord, idx));
44441 +}
44442 +
44443 +/* return number of unit referenced by @coord */
44444 +static int idx_of(const coord_t * coord /* coord of item */ )
44445 +{
44446 + assert("nikita-1285", coord != NULL);
44447 + return coord->unit_pos;
44448 +}
44449 +
44450 +/* find position where entry with @entry_key would be inserted into @coord */
44451 +static int find(const coord_t * coord /* coord of item */ ,
44452 + const reiser4_key * entry_key /* key to look for */ ,
44453 + cmp_t * last /* result of last comparison */ )
44454 +{
44455 + int entries;
44456 +
44457 + int left;
44458 + int right;
44459 +
44460 + cde_unit_header *header;
44461 +
44462 + assert("nikita-1295", coord != NULL);
44463 + assert("nikita-1296", entry_key != NULL);
44464 + assert("nikita-1297", last != NULL);
44465 +
44466 + entries = units(coord);
44467 + left = 0;
44468 + right = entries - 1;
44469 + while (right - left >= REISER4_SEQ_SEARCH_BREAK) {
44470 + int median;
44471 +
44472 + median = (left + right) >> 1;
44473 +
44474 + header = header_at(coord, median);
44475 + *last = de_id_key_cmp(&header->hash, entry_key);
44476 + switch (*last) {
44477 + case LESS_THAN:
44478 + left = median;
44479 + break;
44480 + case GREATER_THAN:
44481 + right = median;
44482 + break;
44483 + case EQUAL_TO:{
44484 + do {
44485 + median--;
44486 + header--;
44487 + } while (median >= 0 &&
44488 + de_id_key_cmp(&header->hash,
44489 + entry_key) == EQUAL_TO);
44490 + return median + 1;
44491 + }
44492 + }
44493 + }
44494 + header = header_at(coord, left);
44495 + for (; left < entries; ++left, ++header) {
44496 + prefetch(header + 1);
44497 + *last = de_id_key_cmp(&header->hash, entry_key);
44498 + if (*last != LESS_THAN)
44499 + break;
44500 + }
44501 + if (left < entries)
44502 + return left;
44503 + else
44504 + return RETERR(-ENOENT);
44505 +
44506 +}
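find() is a hybrid search: it bisects while the window is wide and switches to a prefetching linear scan once the window is narrower than REISER4_SEQ_SEARCH_BREAK, which is cheaper on the short, cache-hot tail. In isolation the pattern looks like this (a generic, self-contained sketch, not reiser4 code):

	/* return index of the first element >= key, or -1 if none */
	static int hybrid_lower_bound(const int *arr, int n, int key)
	{
		int left = 0, right = n - 1;

		/* bisect while the window is large; 8 stands in for
		   REISER4_SEQ_SEARCH_BREAK */
		while (right - left >= 8) {
			int mid = (left + right) >> 1;

			if (arr[mid] < key)
				left = mid;
			else
				right = mid;
		}
		/* finish with a linear scan over the small remainder */
		for (; left < n; ++left)
			if (arr[left] >= key)
				return left;
		return -1;
	}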
44507 +
44508 +/* expand @coord so as to accommodate insertion of @no new entries
44509 +   starting at @pos, with total bodies size @size. */
44510 +static int expand_item(const coord_t * coord /* coord of item */ ,
44511 + int pos /* unit position */ , int no /* number of new
44512 + * units*/ ,
44513 + int size /* total size of new units' data */ ,
44514 + unsigned int data_size /* free space already reserved
44515 + * in the item for insertion */ )
44516 +{
44517 + int entries;
44518 + cde_unit_header *header;
44519 + char *dent;
44520 + int i;
44521 +
44522 + assert("nikita-1310", coord != NULL);
44523 + assert("nikita-1311", pos >= 0);
44524 + assert("nikita-1312", no > 0);
44525 + assert("nikita-1313", data_size >= no * sizeof(directory_entry_format));
44526 + assert("nikita-1343",
44527 + item_length_by_coord(coord) >=
44528 + (int)(size + data_size + no * sizeof *header));
44529 +
44530 + entries = units(coord);
44531 +
44532 + if (pos == entries)
44533 + dent = address(coord, size);
44534 + else
44535 + dent = (char *)entry_at(coord, pos);
44536 +	/* place where the new header will be */
44537 + header = header_at(coord, pos);
44538 + /* free space for new entry headers */
44539 + memmove(header + no, header,
44540 + (unsigned)(address(coord, size) - (char *)header));
44541 +	/* if adding to the end, initialise the first new header */
44542 + if (pos == entries) {
44543 + set_offset(coord, pos, (unsigned)size);
44544 + }
44545 +
44546 + /* adjust entry pointer and size */
44547 + dent = dent + no * sizeof *header;
44548 + size += no * sizeof *header;
44549 + /* free space for new entries */
44550 + memmove(dent + data_size, dent,
44551 + (unsigned)(address(coord, size) - dent));
44552 +
44553 + /* increase counter */
44554 + entries += no;
44555 + put_unaligned(cpu_to_le16((__u16) entries), &formatted_at(coord)->num_of_entries);
44556 +
44557 + /* [ 0 ... pos ] entries were shifted by no * ( sizeof *header )
44558 + bytes. */
44559 + for (i = 0; i <= pos; ++i)
44560 + adj_offset(coord, i, no * sizeof *header);
44561 + /* [ pos + no ... +\infty ) entries were shifted by ( no *
44562 + sizeof *header + data_size ) bytes */
44563 + for (i = pos + no; i < entries; ++i)
44564 + adj_offset(coord, i, no * sizeof *header + data_size);
44565 + return 0;
44566 +}
44567 +
44568 +/* insert new @entry into item */
44569 +static int expand(const coord_t * coord /* coord of item */ ,
44570 + cde_entry * entry /* entry to insert */ ,
44571 + int len /* length of @entry data */ ,
44572 + int *pos /* position to insert */ ,
44573 + reiser4_dir_entry_desc * dir_entry /* parameters for new
44574 + * entry */ )
44575 +{
44576 + cmp_t cmp_res;
44577 + int datasize;
44578 +
44579 + *pos = find(coord, &dir_entry->key, &cmp_res);
44580 + if (*pos < 0)
44581 + *pos = units(coord);
44582 +
44583 + datasize = sizeof(directory_entry_format);
44584 + if (is_longname(entry->name->name, entry->name->len))
44585 + datasize += entry->name->len + 1;
44586 +
44587 + expand_item(coord, *pos, 1, item_length_by_coord(coord) - len,
44588 + datasize);
44589 + return 0;
44590 +}
44591 +
44592 +/* paste body of @entry into item */
44593 +static int paste_entry(const coord_t * coord /* coord of item */ ,
44594 + cde_entry * entry /* new entry */ ,
44595 + int pos /* position to insert */ ,
44596 + reiser4_dir_entry_desc * dir_entry /* parameters for
44597 + * new entry */ )
44598 +{
44599 + cde_unit_header *header;
44600 + directory_entry_format *dent;
44601 + const char *name;
44602 + int len;
44603 +
44604 + header = header_at(coord, pos);
44605 + dent = entry_at(coord, pos);
44606 +
44607 + build_de_id_by_key(&dir_entry->key, &header->hash);
44608 + build_inode_key_id(entry->obj, &dent->id);
44609 +	/* AUDIT unsafe strcpy() operation! It should be replaced with the
44610 +	   much less CPU-hungry
44611 +	   memcpy((char *)dent->name, entry->name->name, entry->name->len);
44612 +
44613 +	   More importantly, there should be a way to figure out the amount
44614 +	   of space available in dent->name, so that we can check that we
44615 +	   are not going to overwrite more than we are supposed to */
44616 + name = entry->name->name;
44617 + len = entry->name->len;
44618 + if (is_longname(name, len)) {
44619 +		strcpy((char *)dent->name, name);
44620 + put_unaligned(0, &dent->name[len]);
44621 + }
44622 + return 0;
44623 +}
44624 +
44625 +/* estimate how much space is necessary in item to insert/paste set of entries
44626 + described in @data. */
44627 +int estimate_cde(const coord_t * coord /* coord of item */ ,
44628 + const reiser4_item_data * data /* parameters for new item */ )
44629 +{
44630 + cde_entry_data *e;
44631 + int result;
44632 + int i;
44633 +
44634 + e = (cde_entry_data *) data->data;
44635 +
44636 + assert("nikita-1288", e != NULL);
44637 + assert("nikita-1289", e->num_of_entries >= 0);
44638 +
44639 + if (coord == NULL)
44640 + /* insert */
44641 + result = sizeof(cde_item_format);
44642 + else
44643 + /* paste */
44644 + result = 0;
44645 +
44646 + result += e->num_of_entries *
44647 + (sizeof(cde_unit_header) + sizeof(directory_entry_format));
44648 + for (i = 0; i < e->num_of_entries; ++i) {
44649 + const char *name;
44650 + int len;
44651 +
44652 + name = e->entry[i].name->name;
44653 + len = e->entry[i].name->len;
44654 + assert("nikita-2054", strlen(name) == len);
44655 + if (is_longname(name, len))
44656 + result += len + 1;
44657 + }
44658 + ((reiser4_item_data *) data)->length = result;
44659 + return result;
44660 +}
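+
+/* Worked form of the estimate above:
+
+	length = [sizeof(cde_item_format) if a new item is created]
+	       + num_of_entries * (sizeof(cde_unit_header) +
+				   sizeof(directory_entry_format))
+	       + sum of (len + 1) over entries with "long" names;
+
+   only long names contribute their characters plus a NUL byte - the
+   is_longname() test suggests that short names are recovered from the
+   entry key itself and cost no body space. */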
44661 +
44662 +/* ->nr_units() method for this item plugin. */
44663 +pos_in_node_t nr_units_cde(const coord_t * coord /* coord of item */ )
44664 +{
44665 + return units(coord);
44666 +}
44667 +
44668 +/* ->unit_key() method for this item plugin. */
44669 +reiser4_key *unit_key_cde(const coord_t * coord /* coord of item */ ,
44670 + reiser4_key * key /* resulting key */ )
44671 +{
44672 + assert("nikita-1452", coord != NULL);
44673 + assert("nikita-1345", idx_of(coord) < units(coord));
44674 + assert("nikita-1346", key != NULL);
44675 +
44676 + item_key_by_coord(coord, key);
44677 + extract_key_from_de_id(extract_dir_id_from_key(key),
44678 + &header_at(coord, idx_of(coord))->hash, key);
44679 + return key;
44680 +}
44681 +
44682 +/* mergeable_cde(): implementation of ->mergeable() item method.
44683 +
44684 + Two directory items are mergeable iff they are from the same
44685 +   directory. It's that simple.
44686 +
44687 +*/
44688 +int mergeable_cde(const coord_t * p1 /* coord of first item */ ,
44689 + const coord_t * p2 /* coord of second item */ )
44690 +{
44691 + reiser4_key k1;
44692 + reiser4_key k2;
44693 +
44694 + assert("nikita-1339", p1 != NULL);
44695 + assert("nikita-1340", p2 != NULL);
44696 +
44697 + return
44698 + (item_plugin_by_coord(p1) == item_plugin_by_coord(p2)) &&
44699 + (extract_dir_id_from_key(item_key_by_coord(p1, &k1)) ==
44700 + extract_dir_id_from_key(item_key_by_coord(p2, &k2)));
44701 +
44702 +}
44703 +
44704 +/* ->max_key_inside() method for this item plugin. */
44705 +reiser4_key *max_key_inside_cde(const coord_t * coord /* coord of item */ ,
44706 + reiser4_key * result /* resulting key */ )
44707 +{
44708 + assert("nikita-1342", coord != NULL);
44709 +
44710 + item_key_by_coord(coord, result);
44711 + set_key_ordering(result, get_key_ordering(max_key()));
44712 + set_key_fulloid(result, get_key_fulloid(max_key()));
44713 + set_key_offset(result, get_key_offset(max_key()));
44714 + return result;
44715 +}
44716 +
44717 +/* @data contains data which are to be put into tree */
44718 +int can_contain_key_cde(const coord_t * coord /* coord of item */ ,
44719 + const reiser4_key * key /* key to check */ ,
44720 + const reiser4_item_data * data /* parameters of new
44721 + * item/unit being
44722 + * created */ )
44723 +{
44724 + reiser4_key item_key;
44725 +
44726 + /* FIXME-VS: do not rely on anything but iplug field of @data. Only
44727 + data->iplug is initialized */
44728 + assert("vs-457", data && data->iplug);
44729 +/* assert( "vs-553", data -> user == 0 );*/
44730 + item_key_by_coord(coord, &item_key);
44731 +
44732 + return (item_plugin_by_coord(coord) == data->iplug) &&
44733 + (extract_dir_id_from_key(&item_key) ==
44734 + extract_dir_id_from_key(key));
44735 +}
44736 +
44737 +#if REISER4_DEBUG
44738 +/* check_cde(): ->check() method for compressed directory items.
44739 +
44740 +   Used for debugging; every item should provide here the most complete
44741 +   consistency check of the item that its author can construct.
44742 +
44743 +*/
44744 +int check_cde(const coord_t * coord /* coord of item to check */ ,
44745 + const char **error /* where to store error message */ )
44746 +{
44747 + int i;
44748 + int result;
44749 + char *item_start;
44750 + char *item_end;
44751 + reiser4_key key;
44752 +
44753 + coord_t c;
44754 +
44755 + assert("nikita-1357", coord != NULL);
44756 + assert("nikita-1358", error != NULL);
44757 +
44758 + if (!ergo(coord->item_pos != 0,
44759 + is_dot_key(item_key_by_coord(coord, &key)))) {
44760 + *error = "CDE doesn't start with dot";
44761 + return -1;
44762 + }
44763 + item_start = item_body_by_coord(coord);
44764 + item_end = item_start + item_length_by_coord(coord);
44765 +
44766 + coord_dup(&c, coord);
44767 + result = 0;
44768 + for (i = 0; i < units(coord); ++i) {
44769 + directory_entry_format *entry;
44770 +
44771 + if ((char *)(header_at(coord, i) + 1) >
44772 + item_end - units(coord) * sizeof *entry) {
44773 + *error = "CDE header is out of bounds";
44774 + result = -1;
44775 + break;
44776 + }
44777 + entry = entry_at(coord, i);
44778 + if ((char *)entry < item_start + sizeof(cde_item_format)) {
44779 + *error = "CDE header is too low";
44780 + result = -1;
44781 + break;
44782 + }
44783 + if ((char *)(entry + 1) > item_end) {
44784 + *error = "CDE header is too high";
44785 + result = -1;
44786 + break;
44787 + }
44788 + }
44789 +
44790 + return result;
44791 +}
44792 +#endif
44793 +
44794 +/* ->init() method for this item plugin. */
44795 +int init_cde(coord_t * coord /* coord of item */ ,
44796 + coord_t * from UNUSED_ARG, reiser4_item_data * data /* structure used for insertion */
44797 + UNUSED_ARG)
44798 +{
44799 + put_unaligned(cpu_to_le16(0), &formatted_at(coord)->num_of_entries);
44800 + return 0;
44801 +}
44802 +
44803 +/* ->lookup() method for this item plugin. */
44804 +lookup_result lookup_cde(const reiser4_key * key /* key to search for */ ,
44805 + lookup_bias bias /* search bias */ ,
44806 + coord_t * coord /* coord of item to lookup in */ )
44807 +{
44808 + cmp_t last_comp;
44809 + int pos;
44810 +
44811 + reiser4_key utmost_key;
44812 +
44813 + assert("nikita-1293", coord != NULL);
44814 + assert("nikita-1294", key != NULL);
44815 +
44816 + CHECKME(coord);
44817 +
44818 + if (keygt(item_key_by_coord(coord, &utmost_key), key)) {
44819 + coord->unit_pos = 0;
44820 + coord->between = BEFORE_UNIT;
44821 + return CBK_COORD_NOTFOUND;
44822 + }
44823 + pos = find(coord, key, &last_comp);
44824 + if (pos >= 0) {
44825 + coord->unit_pos = (int)pos;
44826 + switch (last_comp) {
44827 + case EQUAL_TO:
44828 + coord->between = AT_UNIT;
44829 + return CBK_COORD_FOUND;
44830 + case GREATER_THAN:
44831 + coord->between = BEFORE_UNIT;
44832 + return RETERR(-ENOENT);
44833 + case LESS_THAN:
44834 + default:
44835 + impossible("nikita-1298", "Broken find");
44836 + return RETERR(-EIO);
44837 + }
44838 + } else {
44839 + coord->unit_pos = units(coord) - 1;
44840 + coord->between = AFTER_UNIT;
44841 + return (bias ==
44842 + FIND_MAX_NOT_MORE_THAN) ? CBK_COORD_FOUND :
44843 + CBK_COORD_NOTFOUND;
44844 + }
44845 +}
44846 +
44847 +/* ->paste() method for this item plugin. */
44848 +int paste_cde(coord_t * coord /* coord of item */ ,
44849 + reiser4_item_data * data /* parameters of new unit being
44850 + * inserted */ ,
44851 + carry_plugin_info * info UNUSED_ARG /* todo carry queue */ )
44852 +{
44853 + cde_entry_data *e;
44854 + int result;
44855 + int i;
44856 +
44857 + CHECKME(coord);
44858 + e = (cde_entry_data *) data->data;
44859 +
44860 + result = 0;
44861 + for (i = 0; i < e->num_of_entries; ++i) {
44862 + int pos;
44863 + int phantom_size;
44864 +
44865 + phantom_size = data->length;
44866 + if (units(coord) == 0)
44867 + phantom_size -= sizeof(cde_item_format);
44868 +
44869 + result =
44870 + expand(coord, e->entry + i, phantom_size, &pos, data->arg);
44871 + if (result != 0)
44872 + break;
44873 + result = paste_entry(coord, e->entry + i, pos, data->arg);
44874 + if (result != 0)
44875 + break;
44876 + }
44877 + CHECKME(coord);
44878 + return result;
44879 +}
44880 +
44881 +/* amount of space occupied by the first (@idx + 1) entries, both headers
44882 +   and bodies. */
44883 +static unsigned int part_size(const coord_t * coord /* coord of item */ ,
44884 + int idx /* index of unit */ )
44885 +{
44886 + assert("nikita-1299", coord != NULL);
44887 + assert("nikita-1300", idx < (int)units(coord));
44888 +
44889 + return sizeof(cde_item_format) +
44890 + (idx + 1) * sizeof(cde_unit_header) + offset_of(coord,
44891 + idx + 1) -
44892 + offset_of(coord, 0);
44893 +}
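+
+/* In other words:
+
+	part_size(idx) = sizeof(cde_item_format)		item header
+		       + (idx + 1) * sizeof(cde_unit_header)	unit headers
+		       + offset_of(idx + 1) - offset_of(0)	entry bodies
+
+   i.e. the space consumed by the first (idx + 1) entries; can_shift_cde()
+   below uses this to find how many leading or trailing units fit into the
+   available free space. */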
44894 +
44895 +/* how many units of @source (but not more than @want) can be merged with
44896 +   the item in the @target node. If pend == append, we try to append the
44897 +   last item of @target with the first units of @source. If pend ==
44898 +   prepend, we try to "prepend" the first item of @target with the last
44899 +   units of @source. The @target node has @free_space bytes of free
44900 +   space. The total size of those units is returned via @size */
44901 +int can_shift_cde(unsigned free_space /* free space in item */ ,
44902 + coord_t * coord /* coord of source item */ ,
44903 + znode * target /* target node */ ,
44904 + shift_direction pend /* shift direction */ ,
44905 + unsigned *size /* resulting number of shifted bytes */ ,
44906 + unsigned want /* maximal number of bytes to shift */ )
44907 +{
44908 + int shift;
44909 +
44910 + CHECKME(coord);
44911 + if (want == 0) {
44912 + *size = 0;
44913 + return 0;
44914 + }
44915 +
44916 + /* pend == SHIFT_LEFT <==> shifting to the left */
44917 + if (pend == SHIFT_LEFT) {
44918 + for (shift = min((int)want - 1, units(coord)); shift >= 0;
44919 + --shift) {
44920 + *size = part_size(coord, shift);
44921 + if (target != NULL)
44922 + *size -= sizeof(cde_item_format);
44923 + if (*size <= free_space)
44924 + break;
44925 + }
44926 + shift = shift + 1;
44927 + } else {
44928 + int total_size;
44929 +
44930 + assert("nikita-1301", pend == SHIFT_RIGHT);
44931 +
44932 + total_size = item_length_by_coord(coord);
44933 + for (shift = units(coord) - want - 1; shift < units(coord) - 1;
44934 + ++shift) {
44935 + *size = total_size - part_size(coord, shift);
44936 + if (target == NULL)
44937 + *size += sizeof(cde_item_format);
44938 + if (*size <= free_space)
44939 + break;
44940 + }
44941 + shift = units(coord) - shift - 1;
44942 + }
44943 + if (shift == 0)
44944 + *size = 0;
44945 + CHECKME(coord);
44946 + return shift;
44947 +}
44948 +
44949 +/* ->copy_units() method for this item plugin. */
44950 +void copy_units_cde(coord_t * target /* coord of target item */ ,
44951 + coord_t * source /* coord of source item */ ,
44952 + unsigned from /* starting unit */ ,
44953 + unsigned count /* how many units to copy */ ,
44954 + shift_direction where_is_free_space /* shift direction */ ,
44955 + unsigned free_space /* free space in item */ )
44956 +{
44957 + char *header_from;
44958 + char *header_to;
44959 +
44960 + char *entry_from;
44961 + char *entry_to;
44962 +
44963 + int pos_in_target;
44964 + int data_size;
44965 + int data_delta;
44966 + int i;
44967 +
44968 + assert("nikita-1303", target != NULL);
44969 + assert("nikita-1304", source != NULL);
44970 + assert("nikita-1305", (int)from < units(source));
44971 + assert("nikita-1307", (int)(from + count) <= units(source));
44972 +
44973 + if (where_is_free_space == SHIFT_LEFT) {
44974 + assert("nikita-1453", from == 0);
44975 + pos_in_target = units(target);
44976 + } else {
44977 + assert("nikita-1309", (int)(from + count) == units(source));
44978 + pos_in_target = 0;
44979 + memmove(item_body_by_coord(target),
44980 + (char *)item_body_by_coord(target) + free_space,
44981 + item_length_by_coord(target) - free_space);
44982 + }
44983 +
44984 + CHECKME(target);
44985 + CHECKME(source);
44986 +
44987 + /* expand @target */
44988 + data_size =
44989 + offset_of(source, (int)(from + count)) - offset_of(source,
44990 + (int)from);
44991 +
44992 + if (units(target) == 0)
44993 + free_space -= sizeof(cde_item_format);
44994 +
44995 + expand_item(target, pos_in_target, (int)count,
44996 + (int)(item_length_by_coord(target) - free_space),
44997 + (unsigned)data_size);
44998 +
44999 + /* copy first @count units of @source into @target */
45000 + data_delta =
45001 + offset_of(target, pos_in_target) - offset_of(source, (int)from);
45002 +
45003 + /* copy entries */
45004 + entry_from = (char *)entry_at(source, (int)from);
45005 + entry_to = (char *)entry_at(source, (int)(from + count));
45006 + memmove(entry_at(target, pos_in_target), entry_from,
45007 + (unsigned)(entry_to - entry_from));
45008 +
45009 + /* copy headers */
45010 + header_from = (char *)header_at(source, (int)from);
45011 + header_to = (char *)header_at(source, (int)(from + count));
45012 + memmove(header_at(target, pos_in_target), header_from,
45013 + (unsigned)(header_to - header_from));
45014 +
45015 + /* update offsets */
45016 + for (i = pos_in_target; i < (int)(pos_in_target + count); ++i)
45017 + adj_offset(target, i, data_delta);
45018 + CHECKME(target);
45019 + CHECKME(source);
45020 +}
45021 +
45022 +/* ->cut_units() method for this item plugin. */
45023 +int cut_units_cde(coord_t * coord /* coord of item */ ,
45024 + pos_in_node_t from /* start unit pos */ ,
45025 + pos_in_node_t to /* stop unit pos */ ,
45026 + struct carry_cut_data *cdata UNUSED_ARG,
45027 + reiser4_key * smallest_removed, reiser4_key * new_first)
45028 +{
45029 + char *header_from;
45030 + char *header_to;
45031 +
45032 + char *entry_from;
45033 + char *entry_to;
45034 +
45035 + int size;
45036 + int entry_delta;
45037 + int header_delta;
45038 + int i;
45039 +
45040 + unsigned count;
45041 +
45042 + CHECKME(coord);
45043 +
45044 + count = to - from + 1;
45045 +
45046 + assert("nikita-1454", coord != NULL);
45047 + assert("nikita-1455", (int)(from + count) <= units(coord));
45048 +
45049 + if (smallest_removed)
45050 + unit_key_by_coord(coord, smallest_removed);
45051 +
45052 + if (new_first) {
45053 + coord_t next;
45054 +
45055 + /* not everything is cut from item head */
45056 + assert("vs-1527", from == 0);
45057 + assert("vs-1528", to < units(coord) - 1);
45058 +
45059 + coord_dup(&next, coord);
45060 + next.unit_pos++;
45061 + unit_key_by_coord(&next, new_first);
45062 + }
45063 +
45064 + size = item_length_by_coord(coord);
45065 + if (count == (unsigned)units(coord)) {
45066 + return size;
45067 + }
45068 +
45069 + header_from = (char *)header_at(coord, (int)from);
45070 + header_to = (char *)header_at(coord, (int)(from + count));
45071 +
45072 + entry_from = (char *)entry_at(coord, (int)from);
45073 + entry_to = (char *)entry_at(coord, (int)(from + count));
45074 +
45075 + /* move headers */
45076 + memmove(header_from, header_to,
45077 + (unsigned)(address(coord, size) - header_to));
45078 +
45079 + header_delta = header_to - header_from;
45080 +
45081 + entry_from -= header_delta;
45082 + entry_to -= header_delta;
45083 + size -= header_delta;
45084 +
45085 + /* copy entries */
45086 + memmove(entry_from, entry_to,
45087 + (unsigned)(address(coord, size) - entry_to));
45088 +
45089 + entry_delta = entry_to - entry_from;
45090 + size -= entry_delta;
45091 +
45092 + /* update offsets */
45093 +
45094 + for (i = 0; i < (int)from; ++i)
45095 + adj_offset(coord, i, -header_delta);
45096 +
45097 + for (i = from; i < units(coord) - (int)count; ++i)
45098 + adj_offset(coord, i, -header_delta - entry_delta);
45099 +
45100 + put_unaligned(cpu_to_le16((__u16) units(coord) - count),
45101 + &formatted_at(coord)->num_of_entries);
45102 +
45103 + if (from == 0) {
45104 +		/* entries were removed from the item head - move the remaining ones to the right */
45105 + memmove((char *)item_body_by_coord(coord) +
45106 + header_delta + entry_delta, item_body_by_coord(coord),
45107 + (unsigned)size);
45108 + if (REISER4_DEBUG)
45109 + memset(item_body_by_coord(coord), 0,
45110 + (unsigned)header_delta + entry_delta);
45111 + } else {
45112 + /* freed space is already at the end of item */
45113 + if (REISER4_DEBUG)
45114 + memset((char *)item_body_by_coord(coord) + size, 0,
45115 + (unsigned)header_delta + entry_delta);
45116 + }
45117 +
45118 + return header_delta + entry_delta;
45119 +}
45120 +
45121 +int kill_units_cde(coord_t * coord /* coord of item */ ,
45122 + pos_in_node_t from /* start unit pos */ ,
45123 + pos_in_node_t to /* stop unit pos */ ,
45124 + struct carry_kill_data *kdata UNUSED_ARG,
45125 + reiser4_key * smallest_removed, reiser4_key * new_first)
45126 +{
45127 + return cut_units_cde(coord, from, to, NULL, smallest_removed, new_first);
45128 +}
45129 +
45130 +/* ->s.dir.extract_key() method for this item plugin. */
45131 +int extract_key_cde(const coord_t * coord /* coord of item */ ,
45132 + reiser4_key * key /* resulting key */ )
45133 +{
45134 + directory_entry_format *dent;
45135 +
45136 + assert("nikita-1155", coord != NULL);
45137 + assert("nikita-1156", key != NULL);
45138 +
45139 + dent = entry_at(coord, idx_of(coord));
45140 + return extract_key_from_id(&dent->id, key);
45141 +}
45142 +
45143 +int
45144 +update_key_cde(const coord_t * coord, const reiser4_key * key,
45145 + lock_handle * lh UNUSED_ARG)
45146 +{
45147 + directory_entry_format *dent;
45148 + obj_key_id obj_id;
45149 + int result;
45150 +
45151 + assert("nikita-2344", coord != NULL);
45152 + assert("nikita-2345", key != NULL);
45153 +
45154 + dent = entry_at(coord, idx_of(coord));
45155 + result = build_obj_key_id(key, &obj_id);
45156 + if (result == 0) {
45157 + dent->id = obj_id;
45158 + znode_make_dirty(coord->node);
45159 + }
45160 + return 0;
45161 +}
45162 +
45163 +/* ->s.dir.extract_name() method for this item plugin. */
45164 +char *extract_name_cde(const coord_t * coord /* coord of item */ , char *buf)
45165 +{
45166 + directory_entry_format *dent;
45167 +
45168 + assert("nikita-1157", coord != NULL);
45169 +
45170 + dent = entry_at(coord, idx_of(coord));
45171 + return extract_dent_name(coord, dent, buf);
45172 +}
45173 +
45174 +static int cde_bytes(int pasting, const reiser4_item_data * data)
45175 +{
45176 + int result;
45177 +
45178 + result = data->length;
45179 + if (!pasting)
45180 + result -= sizeof(cde_item_format);
45181 + return result;
45182 +}
45183 +
45184 +/* ->s.dir.add_entry() method for this item plugin */
45185 +int add_entry_cde(struct inode *dir /* directory object */ ,
45186 + coord_t * coord /* coord of item */ ,
45187 + lock_handle * lh /* lock handle for insertion */ ,
45188 + const struct dentry *name /* name to insert */ ,
45189 + reiser4_dir_entry_desc * dir_entry /* parameters of new
45190 + * directory entry */ )
45191 +{
45192 + reiser4_item_data data;
45193 + cde_entry entry;
45194 + cde_entry_data edata;
45195 + int result;
45196 +
45197 + assert("nikita-1656", coord->node == lh->node);
45198 + assert("nikita-1657", znode_is_write_locked(coord->node));
45199 +
45200 + edata.num_of_entries = 1;
45201 + edata.entry = &entry;
45202 +
45203 + entry.dir = dir;
45204 + entry.obj = dir_entry->obj;
45205 + entry.name = &name->d_name;
45206 +
45207 + data.data = (char *)&edata;
45208 + data.user = 0; /* &edata is not user space */
45209 + data.iplug = item_plugin_by_id(COMPOUND_DIR_ID);
45210 + data.arg = dir_entry;
45211 + assert("nikita-1302", data.iplug != NULL);
45212 +
45213 + result = is_dot_key(&dir_entry->key);
45214 + data.length = estimate_cde(result ? coord : NULL, &data);
45215 +
45216 + /* NOTE-NIKITA quota plugin? */
45217 + if (DQUOT_ALLOC_SPACE_NODIRTY(dir, cde_bytes(result, &data)))
45218 + return RETERR(-EDQUOT);
45219 +
45220 + if (result)
45221 + result = insert_by_coord(coord, &data, &dir_entry->key, lh, 0);
45222 + else
45223 + result = resize_item(coord, &data, &dir_entry->key, lh, 0);
45224 + return result;
45225 +}
45226 +
45227 +/* ->s.dir.rem_entry() */
45228 +int rem_entry_cde(struct inode *dir /* directory of item */ ,
45229 + const struct qstr *name, coord_t * coord /* coord of item */ ,
45230 + lock_handle * lh UNUSED_ARG /* lock handle for
45231 + * removal */ ,
45232 + reiser4_dir_entry_desc * entry UNUSED_ARG /* parameters of
45233 + * directory entry
45234 + * being removed */ )
45235 +{
45236 + coord_t shadow;
45237 + int result;
45238 + int length;
45239 + ON_DEBUG(char buf[DE_NAME_BUF_LEN]);
45240 +
45241 + assert("nikita-2870", strlen(name->name) == name->len);
45242 + assert("nikita-2869",
45243 + !strcmp(name->name, extract_name_cde(coord, buf)));
45244 +
45245 + length = sizeof(directory_entry_format) + sizeof(cde_unit_header);
45246 + if (is_longname(name->name, name->len))
45247 + length += name->len + 1;
45248 +
45249 + if (inode_get_bytes(dir) < length) {
45250 +		warning("nikita-2628", "Dir is broken: %llu: %llu",
45251 + (unsigned long long)get_inode_oid(dir),
45252 + inode_get_bytes(dir));
45253 +
45254 + return RETERR(-EIO);
45255 + }
45256 +
45257 + /* cut_node() is supposed to take pointers to _different_
45258 +	   coords, because it will modify them with no regard for
45259 + possible aliasing. To work around this, create temporary copy
45260 + of @coord.
45261 + */
45262 + coord_dup(&shadow, coord);
45263 + result =
45264 + kill_node_content(coord, &shadow, NULL, NULL, NULL, NULL, NULL, 0);
45265 + if (result == 0) {
45266 + /* NOTE-NIKITA quota plugin? */
45267 + DQUOT_FREE_SPACE_NODIRTY(dir, length);
45268 + }
45269 + return result;
45270 +}
45271 +
45272 +/* ->s.dir.max_name_len() method for this item plugin */
45273 +int max_name_len_cde(const struct inode *dir /* directory */ )
45274 +{
45275 + return
45276 + tree_by_inode(dir)->nplug->max_item_size() -
45277 + sizeof(directory_entry_format) - sizeof(cde_item_format) -
45278 + sizeof(cde_unit_header) - 2;
45279 +}
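+
+/* I.e., the longest name is the largest item the node plugin can hold,
+   minus the fixed overhead of a single-entry item (one cde_item_format,
+   one cde_unit_header, one directory_entry_format), minus 2 - presumably
+   one byte for the terminating NUL of a long name plus one byte of
+   slack. */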
45280 +
45281 +/* Make Linus happy.
45282 + Local variables:
45283 + c-indentation-style: "K&R"
45284 + mode-name: "LC"
45285 + c-basic-offset: 8
45286 + tab-width: 8
45287 + fill-column: 120
45288 + End:
45289 +*/
45290 Index: linux-2.6.16/fs/reiser4/plugin/item/cde.h
45291 ===================================================================
45292 --- /dev/null
45293 +++ linux-2.6.16/fs/reiser4/plugin/item/cde.h
45294 @@ -0,0 +1,87 @@
45295 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
45296 +
45297 +/* Compound directory item. See cde.c for description. */
45298 +
45299 +#if !defined( __FS_REISER4_PLUGIN_COMPRESSED_DE_H__ )
45300 +#define __FS_REISER4_PLUGIN_COMPRESSED_DE_H__
45301 +
45302 +#include "../../forward.h"
45303 +#include "../../kassign.h"
45304 +#include "../../dformat.h"
45305 +
45306 +#include <linux/fs.h> /* for struct inode */
45307 +#include <linux/dcache.h> /* for struct dentry, etc */
45308 +
45309 +typedef struct cde_unit_header {
45310 + de_id hash;
45311 + d16 offset;
45312 +} cde_unit_header;
45313 +
45314 +typedef struct cde_item_format {
45315 + d16 num_of_entries;
45316 + cde_unit_header entry[0];
45317 +} cde_item_format;
45318 +
45319 +typedef struct cde_entry {
45320 + const struct inode *dir;
45321 + const struct inode *obj;
45322 + const struct qstr *name;
45323 +} cde_entry;
45324 +
45325 +typedef struct cde_entry_data {
45326 + int num_of_entries;
45327 + cde_entry *entry;
45328 +} cde_entry_data;
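+
+/* Judging from these structures and from expand_item() in cde.c, a cde
+   item is laid out as
+
+	[ cde_item_format | cde_unit_header 0 .. N-1 | entry body 0 .. N-1 ]
+
+   where each cde_unit_header holds the entry's hash and the offset of its
+   body within the item. */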
45329 +
45330 +/* plugin->item.b.* */
45331 +reiser4_key *max_key_inside_cde(const coord_t * coord, reiser4_key * result);
45332 +int can_contain_key_cde(const coord_t * coord, const reiser4_key * key,
45333 + const reiser4_item_data *);
45334 +int mergeable_cde(const coord_t * p1, const coord_t * p2);
45335 +pos_in_node_t nr_units_cde(const coord_t * coord);
45336 +reiser4_key *unit_key_cde(const coord_t * coord, reiser4_key * key);
45337 +int estimate_cde(const coord_t * coord, const reiser4_item_data * data);
45338 +void print_cde(const char *prefix, coord_t * coord);
45339 +int init_cde(coord_t * coord, coord_t * from, reiser4_item_data * data);
45340 +lookup_result lookup_cde(const reiser4_key * key, lookup_bias bias,
45341 + coord_t * coord);
45342 +int paste_cde(coord_t * coord, reiser4_item_data * data,
45343 + carry_plugin_info * info UNUSED_ARG);
45344 +int can_shift_cde(unsigned free_space, coord_t * coord, znode * target,
45345 + shift_direction pend, unsigned *size, unsigned want);
45346 +void copy_units_cde(coord_t * target, coord_t * source, unsigned from,
45347 + unsigned count, shift_direction where_is_free_space,
45348 + unsigned free_space);
45349 +int cut_units_cde(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
45350 + struct carry_cut_data *, reiser4_key * smallest_removed,
45351 + reiser4_key * new_first);
45352 +int kill_units_cde(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
45353 + struct carry_kill_data *, reiser4_key * smallest_removed,
45354 + reiser4_key * new_first);
45356 +int check_cde(const coord_t * coord, const char **error);
45357 +
45358 +/* plugin->u.item.s.dir.* */
45359 +int extract_key_cde(const coord_t * coord, reiser4_key * key);
45360 +int update_key_cde(const coord_t * coord, const reiser4_key * key,
45361 + lock_handle * lh);
45362 +char *extract_name_cde(const coord_t * coord, char *buf);
45363 +int add_entry_cde(struct inode *dir, coord_t * coord,
45364 + lock_handle * lh, const struct dentry *name,
45365 + reiser4_dir_entry_desc * entry);
45366 +int rem_entry_cde(struct inode *dir, const struct qstr *name, coord_t * coord,
45367 + lock_handle * lh, reiser4_dir_entry_desc * entry);
45368 +int max_name_len_cde(const struct inode *dir);
45369 +
45370 +/* __FS_REISER4_PLUGIN_COMPRESSED_DE_H__ */
45371 +#endif
45372 +
45373 +/* Make Linus happy.
45374 + Local variables:
45375 + c-indentation-style: "K&R"
45376 + mode-name: "LC"
45377 + c-basic-offset: 8
45378 + tab-width: 8
45379 + fill-column: 120
45380 + End:
45381 +*/
45382 Index: linux-2.6.16/fs/reiser4/plugin/item/ctail.c
45383 ===================================================================
45384 --- /dev/null
45385 +++ linux-2.6.16/fs/reiser4/plugin/item/ctail.c
45386 @@ -0,0 +1,1588 @@
45387 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
45388 +
45389 +/* ctails (aka "clustered tails") are items for cryptcompress objects */
45390 +
45391 +/* DESCRIPTION:
45392 +
45393 +Each cryptcompress object is stored on disk as a set of clusters sliced
45394 +into ctails.
45395 +
45396 +Internal on-disk structure:
45397 +
45398 +   HEADER   (1 byte)   the disk cluster shift is stored here
45399 +   BODY
45400 +*/
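+
+/* Example (sizes hypothetical - the real cluster size is per-inode): a
+   logical cluster of 4 pages is compressed as one unit, and the resulting
+   stream is sliced into ctail items stored in the tree like ordinary
+   tails; reading a single page therefore means reading and inflating the
+   whole disk cluster first (see ctail_read_disk_cluster() below). */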
45401 +
45402 +#include "../../forward.h"
45403 +#include "../../debug.h"
45404 +#include "../../dformat.h"
45405 +#include "../../kassign.h"
45406 +#include "../../key.h"
45407 +#include "../../coord.h"
45408 +#include "item.h"
45409 +#include "../node/node.h"
45410 +#include "../plugin.h"
45411 +#include "../object.h"
45412 +#include "../../znode.h"
45413 +#include "../../carry.h"
45414 +#include "../../tree.h"
45415 +#include "../../inode.h"
45416 +#include "../../super.h"
45417 +#include "../../context.h"
45418 +#include "../../page_cache.h"
45419 +#include "../cluster.h"
45420 +#include "../../flush.h"
45421 +#include "../../tree_walk.h"
45422 +
45423 +#include <linux/pagevec.h>
45424 +#include <linux/swap.h>
45425 +#include <linux/fs.h>
45426 +
45427 +/* return body of ctail item at @coord */
45428 +static ctail_item_format *ctail_formatted_at(const coord_t * coord)
45429 +{
45430 + assert("edward-60", coord != NULL);
45431 + return item_body_by_coord(coord);
45432 +}
45433 +
45434 +int cluster_shift_by_coord(const coord_t * coord)
45435 +{
45436 + return get_unaligned(&ctail_formatted_at(coord)->cluster_shift);
45437 +}
45438 +
45439 +static loff_t off_by_coord(const coord_t * coord)
45440 +{
45441 + reiser4_key key;
45442 + return get_key_offset(item_key_by_coord(coord, &key));
45443 +}
45444 +
45445 +static int coord_is_unprepped_ctail(const coord_t * coord)
45446 +{
45447 + assert("edward-1233", coord != NULL);
45448 + assert("edward-1234", item_id_by_coord(coord) == CTAIL_ID);
45449 + assert("edward-1235",
45450 + ergo((int)cluster_shift_by_coord(coord) == (int)UCTAIL_SHIFT,
45451 + nr_units_ctail(coord) == (pos_in_node_t) UCTAIL_NR_UNITS));
45452 +
45453 + return (int)cluster_shift_by_coord(coord) == (int)UCTAIL_SHIFT;
45454 +}
45455 +
45456 +static cloff_t clust_by_coord(const coord_t * coord, struct inode *inode)
45457 +{
45458 + int shift;
45459 +
45460 + if (inode != NULL) {
45461 + shift = inode_cluster_shift(inode);
45462 + assert("edward-1236",
45463 + ergo(!coord_is_unprepped_ctail(coord),
45464 + shift == cluster_shift_by_coord(coord)));
45465 + } else {
45466 + assert("edward-1237", !coord_is_unprepped_ctail(coord));
45467 + shift = cluster_shift_by_coord(coord);
45468 + }
45469 + return off_by_coord(coord) >> shift;
45470 +}
45471 +
45472 +static int disk_cluster_size(const coord_t * coord)
45473 +{
45474 + assert("edward-1156",
45475 + item_plugin_by_coord(coord) == item_plugin_by_id(CTAIL_ID));
45476 +	/* calculation of the disk cluster size
45477 +	   is meaningless if the ctail is unprepped */
45478 + assert("edward-1238", !coord_is_unprepped_ctail(coord));
45479 +
45480 + return 1 << cluster_shift_by_coord(coord);
45481 +}
45482 +
45483 +/* true if the key is of first disk cluster item */
45484 +static int is_disk_cluster_key(const reiser4_key * key, const coord_t * coord)
45485 +{
45486 + assert("edward-1239", item_id_by_coord(coord) == CTAIL_ID);
45487 +
45488 + return coord_is_unprepped_ctail(coord) ||
45489 + ((get_key_offset(key) &
45490 + ((loff_t) disk_cluster_size(coord) - 1)) == 0);
45491 +}
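+
+/* Example: with a disk cluster size of 65536 bytes (hypothetical) the
+   test above accepts offsets 0, 65536, 131072, ..., since
+   (offset & 65535) == 0 exactly at cluster boundaries; unprepped ctails
+   are accepted unconditionally. */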
45492 +
45493 +static char *first_unit(coord_t * coord)
45494 +{
45495 + /* FIXME: warning: pointer of type `void *' used in arithmetic */
45496 + return (char *)item_body_by_coord(coord) + sizeof(ctail_item_format);
45497 +}
45498 +
45499 +/* plugin->u.item.b.max_key_inside :
45500 + tail_max_key_inside */
45501 +
45502 +/* plugin->u.item.b.can_contain_key */
45503 +int
45504 +can_contain_key_ctail(const coord_t * coord, const reiser4_key * key,
45505 + const reiser4_item_data * data)
45506 +{
45507 + reiser4_key item_key;
45508 +
45509 + if (item_plugin_by_coord(coord) != data->iplug)
45510 + return 0;
45511 +
45512 + item_key_by_coord(coord, &item_key);
45513 + if (get_key_locality(key) != get_key_locality(&item_key) ||
45514 + get_key_objectid(key) != get_key_objectid(&item_key))
45515 + return 0;
45516 + if (get_key_offset(&item_key) + nr_units_ctail(coord) !=
45517 + get_key_offset(key))
45518 + return 0;
45519 + if (is_disk_cluster_key(key, coord))
45520 + return 0;
45521 + return 1;
45522 +}
45523 +
45524 +/* plugin->u.item.b.mergeable
45525 + c-tails of different clusters are not mergeable */
45526 +int mergeable_ctail(const coord_t * p1, const coord_t * p2)
45527 +{
45528 + reiser4_key key1, key2;
45529 +
45530 + assert("edward-62", item_id_by_coord(p1) == CTAIL_ID);
45531 + assert("edward-61",
45532 + item_type_by_coord(p1) == UNIX_FILE_METADATA_ITEM_TYPE);
45533 +
45534 + if (item_id_by_coord(p2) != CTAIL_ID) {
45535 + /* second item is of another type */
45536 + return 0;
45537 + }
45538 +
45539 + item_key_by_coord(p1, &key1);
45540 + item_key_by_coord(p2, &key2);
45541 + if (get_key_locality(&key1) != get_key_locality(&key2) ||
45542 + get_key_objectid(&key1) != get_key_objectid(&key2) ||
45543 + get_key_type(&key1) != get_key_type(&key2)) {
45544 + /* items of different objects */
45545 + return 0;
45546 + }
45547 + if (get_key_offset(&key1) + nr_units_ctail(p1) != get_key_offset(&key2))
45548 + /* not adjacent items */
45549 + return 0;
45550 + if (is_disk_cluster_key(&key2, p2))
45551 + return 0;
45552 + return 1;
45553 +}
45554 +
45555 +/* plugin->u.item.b.nr_units */
45556 +pos_in_node_t nr_units_ctail(const coord_t * coord)
45557 +{
45558 + return (item_length_by_coord(coord) -
45559 + sizeof(ctail_formatted_at(coord)->cluster_shift));
45560 +}
45561 +
45562 +/* plugin->u.item.b.estimate:
45563 + estimate how much space is needed to insert/paste @data->length bytes
45564 + into ctail at @coord */
45565 +int estimate_ctail(const coord_t * coord /* coord of item */ ,
45566 + const reiser4_item_data *
45567 + data /* parameters for new item */ )
45568 +{
45569 + if (coord == NULL)
45570 + /* insert */
45571 + return (sizeof(ctail_item_format) + data->length);
45572 + else
45573 + /* paste */
45574 + return data->length;
45575 +}
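+
+/* Note: every unit of a ctail is one byte of (transformed) data, so
+   inserting a new item costs data->length plus the one-byte
+   ctail_item_format header (cf. nr_units_ctail() above), while pasting
+   into an existing item costs exactly data->length. */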
45576 +
45577 +/* ->init() method for this item plugin. */
45578 +int init_ctail(coord_t * to /* coord of item */ ,
45579 + coord_t * from /* old_item */ ,
45580 + reiser4_item_data * data /* structure used for insertion */ )
45581 +{
45582 + int cluster_shift; /* cpu value to convert */
45583 +
45584 + if (data) {
45585 + assert("edward-463", data->length > sizeof(ctail_item_format));
45586 + cluster_shift = *((int *)(data->arg));
45587 + data->length -= sizeof(ctail_item_format);
45588 + } else {
45589 + assert("edward-464", from != NULL);
45590 + assert("edward-855", ctail_ok(from));
45591 + cluster_shift = (int)(cluster_shift_by_coord(from));
45592 + }
45593 + put_unaligned((d8)cluster_shift, &ctail_formatted_at(to)->cluster_shift);
45594 + assert("edward-856", ctail_ok(to));
45595 + return 0;
45596 +}
45597 +
45598 +/* plugin->u.item.b.lookup:
45599 + NULL: We are looking for item keys only */
45600 +
45601 +#if REISER4_DEBUG
45602 +int ctail_ok(const coord_t * coord)
45603 +{
45604 + return coord_is_unprepped_ctail(coord) ||
45605 + cluster_shift_ok(cluster_shift_by_coord(coord));
45606 +}
45607 +
45608 +/* plugin->u.item.b.check */
45609 +int check_ctail(const coord_t * coord, const char **error)
45610 +{
45611 + if (!ctail_ok(coord)) {
45612 + if (error)
45613 + *error = "bad cluster shift in ctail";
45614 + return 1;
45615 + }
45616 + return 0;
45617 +}
45618 +#endif
45619 +
45620 +/* plugin->u.item.b.paste */
45621 +int
45622 +paste_ctail(coord_t * coord, reiser4_item_data * data,
45623 + carry_plugin_info * info UNUSED_ARG)
45624 +{
45625 + unsigned old_nr_units;
45626 +
45627 + assert("edward-268", data->data != NULL);
45628 + /* copy only from kernel space */
45629 + assert("edward-66", data->user == 0);
45630 +
45631 + old_nr_units =
45632 + item_length_by_coord(coord) - sizeof(ctail_item_format) -
45633 + data->length;
45634 +
45635 + /* ctail items never get pasted in the middle */
45636 +
45637 + if (coord->unit_pos == 0 && coord->between == AT_UNIT) {
45638 +
45639 + /* paste at the beginning when create new item */
45640 + assert("edward-450",
45641 + item_length_by_coord(coord) ==
45642 + data->length + sizeof(ctail_item_format));
45643 + assert("edward-451", old_nr_units == 0);
45644 + } else if (coord->unit_pos == old_nr_units - 1
45645 + && coord->between == AFTER_UNIT) {
45646 +
45647 + /* paste at the end */
45648 + coord->unit_pos++;
45649 + } else
45650 + impossible("edward-453", "bad paste position");
45651 +
45652 + memcpy(first_unit(coord) + coord->unit_pos, data->data, data->length);
45653 +
45654 + assert("edward-857", ctail_ok(coord));
45655 +
45656 + return 0;
45657 +}
45658 +
45659 +/* plugin->u.item.b.fast_paste */
45660 +
45661 +/* plugin->u.item.b.can_shift
45662 + number of units is returned via return value, number of bytes via @size. For
45663 + ctail items they coincide */
45664 +int
45665 +can_shift_ctail(unsigned free_space, coord_t * source,
45666 + znode * target, shift_direction direction UNUSED_ARG,
45667 + unsigned *size /* number of bytes */ , unsigned want)
45668 +{
45669 +	/* make sure that we do not want to shift more than we have */
45670 + assert("edward-68", want > 0 && want <= nr_units_ctail(source));
45671 +
45672 + *size = min(want, free_space);
45673 +
45674 + if (!target) {
45675 + /* new item will be created */
45676 + if (*size <= sizeof(ctail_item_format)) {
45677 + *size = 0;
45678 + return 0;
45679 + }
45680 + return *size - sizeof(ctail_item_format);
45681 + }
45682 + return *size;
45683 +}
45684 +
45685 +/* plugin->u.item.b.copy_units
45686 + cooperates with ->can_shift() */
45687 +void
45688 +copy_units_ctail(coord_t * target, coord_t * source,
45689 + unsigned from, unsigned count /* units */ ,
45690 + shift_direction where_is_free_space,
45691 + unsigned free_space /* bytes */ )
45692 +{
45693 + /* make sure that item @target is expanded already */
45694 + assert("edward-69", (unsigned)item_length_by_coord(target) >= count);
45695 + assert("edward-70", free_space == count || free_space == count + 1);
45696 +
45697 + assert("edward-858", ctail_ok(source));
45698 +
45699 + if (where_is_free_space == SHIFT_LEFT) {
45700 +		/* append the first @count bytes of @source to item @target:
45701 +		   this restriction came from ordinary tails */
45702 + assert("edward-71", from == 0);
45703 + assert("edward-860", ctail_ok(target));
45704 +
45705 + memcpy(first_unit(target) + nr_units_ctail(target) - count,
45706 + first_unit(source), count);
45707 + } else {
45708 + /* target item is moved to right already */
45709 + reiser4_key key;
45710 +
45711 + assert("edward-72", nr_units_ctail(source) == from + count);
45712 +
45713 + if (free_space == count) {
45714 + init_ctail(target, source, NULL);
45715 + } else {
45716 + /* new item has been created */
45717 + assert("edward-862", ctail_ok(target));
45718 + }
45719 + memcpy(first_unit(target), first_unit(source) + from, count);
45720 +
45721 + assert("edward-863", ctail_ok(target));
45722 +
45723 + /* new units are inserted before first unit in an item,
45724 + therefore, we have to update item key */
45725 + item_key_by_coord(source, &key);
45726 + set_key_offset(&key, get_key_offset(&key) + from);
45727 +
45728 + node_plugin_by_node(target->node)->update_item_key(target, &key,
45729 + NULL /*info */);
45730 + }
45731 +}
45732 +
45733 +/* plugin->u.item.b.create_hook */
45734 +int create_hook_ctail(const coord_t * coord, void *arg)
45735 +{
45736 + assert("edward-864", znode_is_loaded(coord->node));
45737 +
45738 + znode_set_convertible(coord->node);
45739 + return 0;
45740 +}
45741 +
45742 +/* plugin->u.item.b.kill_hook */
45743 +int
45744 +kill_hook_ctail(const coord_t * coord, pos_in_node_t from, pos_in_node_t count,
45745 + carry_kill_data * kdata)
45746 +{
45747 + struct inode *inode;
45748 +
45749 + assert("edward-1157", item_id_by_coord(coord) == CTAIL_ID);
45750 + assert("edward-291", znode_is_write_locked(coord->node));
45751 +
45752 + inode = kdata->inode;
45753 + if (inode) {
45754 + reiser4_key key;
45755 + item_key_by_coord(coord, &key);
45756 +
45757 + if (from == 0 && is_disk_cluster_key(&key, coord)) {
45758 + cloff_t start =
45759 + off_to_clust(get_key_offset(&key), inode);
45760 + truncate_page_cluster(inode, start);
45761 + }
45762 + }
45763 + return 0;
45764 +}
45765 +
45766 +/* for shift_hook_ctail(),
45767 +   return true if the first disk cluster item has a dirty child
45768 +*/
45769 +static int ctail_convertible(const coord_t * coord)
45770 +{
45771 + int result;
45772 + reiser4_key key;
45773 + jnode *child = NULL;
45774 +
45775 + assert("edward-477", coord != NULL);
45776 + assert("edward-478", item_id_by_coord(coord) == CTAIL_ID);
45777 +
45778 + if (coord_is_unprepped_ctail(coord))
45779 + /* unprepped ctail should be converted */
45780 + return 1;
45781 +
45782 + item_key_by_coord(coord, &key);
45783 + child = jlookup(current_tree,
45784 + get_key_objectid(&key),
45785 + off_to_pg(off_by_coord(coord)));
45786 + if (!child)
45787 + return 0;
45788 + result = JF_ISSET(child, JNODE_DIRTY);
45789 + jput(child);
45790 + return result;
45791 +}
45792 +
45793 +/* FIXME-EDWARD */
45794 +/* plugin->u.item.b.shift_hook */
45795 +int shift_hook_ctail(const coord_t * item /* coord of item */ ,
45796 + unsigned from UNUSED_ARG /* start unit */ ,
45797 + unsigned count UNUSED_ARG /* stop unit */ ,
45798 + znode * old_node /* old parent */ )
45799 +{
45800 + assert("edward-479", item != NULL);
45801 + assert("edward-480", item->node != old_node);
45802 +
45803 + if (!znode_convertible(old_node) || znode_convertible(item->node))
45804 + return 0;
45805 + if (ctail_convertible(item))
45806 + znode_set_convertible(item->node);
45807 + return 0;
45808 +}
45809 +
45810 +static int
45811 +cut_or_kill_ctail_units(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
45812 + int cut, void *p, reiser4_key * smallest_removed,
45813 + reiser4_key * new_first)
45814 +{
45815 + pos_in_node_t count; /* number of units to cut */
45816 + char *item;
45817 +
45818 + count = to - from + 1;
45819 + item = item_body_by_coord(coord);
45820 +
45821 + assert("edward-74", ergo(from != 0, to == coord_last_unit_pos(coord)));
45822 +
45823 + if (smallest_removed) {
45824 + /* store smallest key removed */
45825 + item_key_by_coord(coord, smallest_removed);
45826 + set_key_offset(smallest_removed,
45827 + get_key_offset(smallest_removed) + from);
45828 + }
45829 +
45830 + if (new_first) {
45831 + assert("vs-1531", from == 0);
45832 +
45833 + item_key_by_coord(coord, new_first);
45834 + set_key_offset(new_first,
45835 + get_key_offset(new_first) + from + count);
45836 + }
45837 +
45838 + if (!cut)
45839 + kill_hook_ctail(coord, from, 0, (struct carry_kill_data *)p);
45840 +
45841 + if (from == 0) {
45842 + if (count != nr_units_ctail(coord)) {
45843 +			/* part of the item is removed, so move the free space
45844 +			   to the beginning of the item and update the item key */
45845 + reiser4_key key;
45846 + memcpy(item + to + 1, item, sizeof(ctail_item_format));
45847 + item_key_by_coord(coord, &key);
45848 + set_key_offset(&key, get_key_offset(&key) + count);
45849 + node_plugin_by_node(coord->node)->update_item_key(coord,
45850 + &key,
45851 + NULL);
45852 + } else {
45853 +			/* cut_units should not be called to cut everything */
45854 +			assert("vs-1532", ergo(cut, 0));
45855 +			/* the whole item is cut, so more than the amount of
45856 +			   space occupied by the units gets freed */
45857 + count += sizeof(ctail_item_format);
45858 + }
45859 + if (REISER4_DEBUG)
45860 + memset(item, 0, count);
45861 + } else if (REISER4_DEBUG)
45862 + memset(item + sizeof(ctail_item_format) + from, 0, count);
45863 + return count;
45864 +}
45865 +
45866 +/* plugin->u.item.b.cut_units */
45867 +int
45868 +cut_units_ctail(coord_t * item, pos_in_node_t from, pos_in_node_t to,
45869 + carry_cut_data * cdata, reiser4_key * smallest_removed,
45870 + reiser4_key * new_first)
45871 +{
45872 + return cut_or_kill_ctail_units(item, from, to, 1, NULL,
45873 + smallest_removed, new_first);
45874 +}
45875 +
45876 +/* plugin->u.item.b.kill_units */
45877 +int
45878 +kill_units_ctail(coord_t * item, pos_in_node_t from, pos_in_node_t to,
45879 + struct carry_kill_data *kdata, reiser4_key * smallest_removed,
45880 + reiser4_key * new_first)
45881 +{
45882 + return cut_or_kill_ctail_units(item, from, to, 0, kdata,
45883 + smallest_removed, new_first);
45884 +}
45885 +
45886 +/* plugin->u.item.s.file.read */
45887 +int read_ctail(struct file *file UNUSED_ARG, flow_t * f, hint_t * hint)
45888 +{
45889 + uf_coord_t *uf_coord;
45890 + coord_t *coord;
45891 +
45892 + uf_coord = &hint->ext_coord;
45893 + coord = &uf_coord->coord;
45894 + assert("edward-127", f->user == 0);
45895 + assert("edward-129", coord && coord->node);
45896 + assert("edward-130", coord_is_existing_unit(coord));
45897 + assert("edward-132", znode_is_loaded(coord->node));
45898 +
45899 + /* start read only from the beginning of ctail */
45900 + assert("edward-133", coord->unit_pos == 0);
45901 + /* read only whole ctails */
45902 + assert("edward-135", nr_units_ctail(coord) <= f->length);
45903 +
45904 + assert("edward-136", schedulable());
45905 + assert("edward-886", ctail_ok(coord));
45906 +
45907 + if (f->data)
45908 + memcpy(f->data, (char *)first_unit(coord),
45909 + (size_t) nr_units_ctail(coord));
45910 +
45911 + dclust_set_extension(hint);
45912 + mark_page_accessed(znode_page(coord->node));
45913 + move_flow_forward(f, nr_units_ctail(coord));
45914 +
45915 + return 0;
45916 +}
45917 +
45918 +/* Reads a disk cluster consisting of ctail items and
45919 +   attaches a transform stream with the plain text */
45920 +int ctail_read_disk_cluster(reiser4_cluster_t * clust, struct inode *inode,
45921 + int write)
45922 +{
45923 + int result;
45924 + assert("edward-671", clust->hint != NULL);
45925 + assert("edward-140", clust->dstat == INVAL_DISK_CLUSTER);
45926 + assert("edward-672", crc_inode_ok(inode));
45927 +
45928 + /* set input stream */
45929 + result = grab_tfm_stream(inode, &clust->tc, INPUT_STREAM);
45930 + if (result)
45931 + return result;
45932 +
45933 + result = find_cluster(clust, inode, 1 /* read */ , write);
45934 + assert("edward-1340", !result);
45935 + if (result)
45936 + return result;
45937 + if (!write)
45938 +		/* write still needs the lock to insert unprepped
45939 +		   items, etc... */
45940 + put_hint_cluster(clust, inode, ZNODE_READ_LOCK);
45941 +
45942 + assert("edward-673",
45943 + ergo(write, znode_is_write_locked(clust->hint->lh.node)));
45944 +
45945 + if (clust->dstat == FAKE_DISK_CLUSTER ||
45946 + clust->dstat == UNPR_DISK_CLUSTER) {
45947 + tfm_cluster_set_uptodate(&clust->tc);
45948 + return 0;
45949 + }
45950 + result = grab_coa(&clust->tc, inode_compression_plugin(inode));
45951 + if (result)
45952 + return result;
45953 + result = inflate_cluster(clust, inode);
45954 + if (result)
45955 + return result;
45956 + tfm_cluster_set_uptodate(&clust->tc);
45957 + return 0;
45958 +}
45959 +
45960 +/* read one locked page */
45961 +int do_readpage_ctail(struct inode * inode, reiser4_cluster_t * clust,
45962 + struct page *page)
45963 +{
45964 + int ret;
45965 + unsigned cloff;
45966 + char *data;
45967 + size_t pgcnt;
45968 + tfm_cluster_t *tc = &clust->tc;
45969 +
45970 + assert("edward-212", PageLocked(page));
45971 +
45972 + if (PageUptodate(page))
45973 + goto exit;
45974 +
45975 + if (!tfm_cluster_is_uptodate(&clust->tc)) {
45976 + clust->index = pg_to_clust(page->index, inode);
45977 + unlock_page(page);
45978 + ret = ctail_read_disk_cluster(clust, inode, 0 /* read */ );
45979 + lock_page(page);
45980 + if (ret)
45981 + return ret;
45982 + }
45983 + if (PageUptodate(page))
45984 + /* races with another read/write */
45985 + goto exit;
45986 +
45987 + /* bytes in the page */
45988 + pgcnt = cnt_to_pgcnt(i_size_read(inode), page->index);
45989 +
45990 + if (pgcnt == 0) {
45991 + assert("edward-1290", 0);
45992 + return RETERR(-EINVAL);
45993 + }
45994 + assert("edward-119", tfm_cluster_is_uptodate(tc));
45995 +
45996 + switch (clust->dstat) {
45997 + case UNPR_DISK_CLUSTER:
45998 + assert("edward-1285", 0);
45999 +#if REISER4_DEBUG
46000 + warning("edward-1168",
46001 + "page %lu is not uptodate and disk cluster %lu (inode %llu) is unprepped\n",
46002 + page->index, clust->index,
46003 + (unsigned long long)get_inode_oid(inode));
46004 +#endif
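+		/* the fall-through into FAKE_DISK_CLUSTER is apparently
+		   intentional - in both cases the page is filled with
+		   zeroes */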
46005 + case FAKE_DISK_CLUSTER:
46006 +		/* fill the page with zeroes */
46007 + data = kmap_atomic(page, KM_USER0);
46008 +
46009 + memset(data, 0, PAGE_CACHE_SIZE);
46010 + flush_dcache_page(page);
46011 + kunmap_atomic(data, KM_USER0);
46012 + SetPageUptodate(page);
46013 + break;
46014 + case PREP_DISK_CLUSTER:
46015 +		/* fill the page with transformed data */
46016 + assert("edward-1058", !PageUptodate(page));
46017 + assert("edward-120", tc->len <= inode_cluster_size(inode));
46018 +
46019 + /* start page offset in the cluster */
46020 + cloff = pg_to_off_to_cloff(page->index, inode);
46021 +
46022 + data = kmap(page);
46023 + memcpy(data, tfm_stream_data(tc, OUTPUT_STREAM) + cloff, pgcnt);
46024 + memset(data + pgcnt, 0, (size_t) PAGE_CACHE_SIZE - pgcnt);
46025 + flush_dcache_page(page);
46026 + kunmap(page);
46027 + SetPageUptodate(page);
46028 + break;
46029 + default:
46030 + impossible("edward-1169", "bad disk cluster state");
46031 + }
46032 + exit:
46033 + return 0;
46034 +}
46035 +
46036 +/* plugin->u.item.s.file.readpage */
46037 +int readpage_ctail(void *vp, struct page *page)
46038 +{
46039 + int result;
46040 + hint_t *hint;
46041 + reiser4_cluster_t *clust = vp;
46042 +
46043 + assert("edward-114", clust != NULL);
46044 + assert("edward-115", PageLocked(page));
46045 + assert("edward-116", !PageUptodate(page));
46046 + assert("edward-117", !jprivate(page) && !PagePrivate(page));
46047 + assert("edward-118", page->mapping && page->mapping->host);
46048 + assert("edward-867", !tfm_cluster_is_uptodate(&clust->tc));
46049 +
46050 + hint = kmalloc(sizeof(*hint), GFP_KERNEL);
46051 + if (hint == NULL)
46052 + return RETERR(-ENOMEM);
46053 + clust->hint = hint;
46054 + result = load_file_hint(clust->file, hint);
46055 + if (result) {
46056 + kfree(hint);
46057 + return result;
46058 + }
46059 + assert("vs-25", hint->ext_coord.lh == &hint->lh);
46060 + result = do_readpage_ctail(page->mapping->host, clust, page);
46061 +
46062 + assert("edward-213", PageLocked(page));
46063 + assert("edward-1163", ergo(!result, PageUptodate(page)));
46064 + assert("edward-868",
46065 + ergo(!result, tfm_cluster_is_uptodate(&clust->tc)));
46066 +
46067 + unlock_page(page);
46068 + done_lh(&hint->lh);
46069 + hint->ext_coord.valid = 0;
46070 + save_file_hint(clust->file, hint);
46071 + kfree(hint);
46072 + tfm_cluster_clr_uptodate(&clust->tc);
46073 +
46074 + return result;
46075 +}
46076 +
46077 +/* This unconditionally reads a disk cluster.
46078 + Helper function for ->readpages() */
46079 +static int
46080 +ctail_read_page_cluster(reiser4_cluster_t * clust, struct inode *inode)
46081 +{
46082 + int i;
46083 + int result;
46084 + assert("edward-779", clust != NULL);
46085 + assert("edward-1059", clust->win == NULL);
46086 + assert("edward-780", inode != NULL);
46087 +
46088 + result = prepare_page_cluster(inode, clust, 0 /* do not capture */ );
46089 + if (result)
46090 + return result;
46091 + result = ctail_read_disk_cluster(clust, inode, 0 /* read */ );
46092 + if (result)
46093 + goto out;
46094 + /* at this point stream with valid plain text is attached */
46095 + assert("edward-781", tfm_cluster_is_uptodate(&clust->tc));
46096 +
46097 + for (i = 0; i < clust->nr_pages; i++) {
46098 + struct page *page = clust->pages[i];
46099 + lock_page(page);
46100 + result = do_readpage_ctail(inode, clust, page);
46101 + unlock_page(page);
46102 + if (result)
46103 + break;
46104 + }
46105 + tfm_cluster_clr_uptodate(&clust->tc);
46106 + out:
46107 + release_cluster_pages(clust);
46108 + return result;
46109 +}
46110 +
46111 +#define list_to_page(head) (list_entry((head)->prev, struct page, lru))
46112 +#define list_to_next_page(head) (list_entry((head)->prev->prev, struct page, lru))
46113 +
46114 +#if REISER4_DEBUG
46115 +#define check_order(pages) \
46116 +assert("edward-214", ergo(!list_empty(pages) && pages->next != pages->prev, \
46117 + list_to_page(pages)->index < list_to_next_page(pages)->index))
46118 +#endif
46119 +
46120 +/* plugin->u.item.s.file.readpages
46121 + Populate an address space with some page clusters,
46122 + and start reads against them.
46123 + FIXME-EDWARD: this function should return errors?
46124 +*/
46125 +void
46126 +readpages_ctail(void *vp, struct address_space *mapping,
46127 + struct list_head *pages)
46128 +{
46129 + int ret = 0;
46130 + hint_t *hint;
46131 + reiser4_cluster_t clust;
46132 + struct page *page;
46133 + struct pagevec lru_pvec;
46134 + struct inode *inode = mapping->host;
46135 + int progress = 0;
46136 +
46137 + assert("edward-214", ergo(!list_empty(pages) &&
46138 + pages->next != pages->prev,
46139 + list_to_page(pages)->index <
46140 + list_to_next_page(pages)->index));
46141 + pagevec_init(&lru_pvec, 0);
46142 + cluster_init_read(&clust, NULL);
46143 + clust.file = vp;
46144 + hint = kmalloc(sizeof(*hint), GFP_KERNEL);
46145 + if (hint == NULL) {
46146 + warning("vs-28", "failed to allocate hint");
46147 + goto exit1;
46148 + }
46149 + clust.hint = hint;
46150 + ret = load_file_hint(clust.file, hint);
46151 + if (ret)
46152 + goto exit2;
46153 + ret = alloc_cluster_pgset(&clust, cluster_nrpages(inode));
46154 + if (ret)
46155 + goto exit3;
46156 + assert("vs-26", hint->ext_coord.lh == &hint->lh);
46157 +
46158 +	/* address_space-level file readahead doesn't know about the
46159 +	   reiser4 concept of clustering, so we work around this
46160 +	   fact: for each page of the list @pages the address space
46161 +	   will be populated with the whole page cluster.
46162 +	 */
46163 + while (!list_empty(pages)) {
46164 + page = list_to_page(pages);
46165 + list_del(&page->lru);
46166 + if (add_to_page_cache(page, mapping, page->index, GFP_KERNEL)) {
46167 + page_cache_release(page);
46168 + continue;
46169 + }
46170 + if (PageUptodate(page)) {
46171 + if (!pagevec_add(&lru_pvec, page))
46172 + __pagevec_lru_add(&lru_pvec);
46173 + unlock_page(page);
46174 + continue;
46175 + }
46176 + unlock_page(page);
46177 +
46178 + move_cluster_forward(&clust, inode, page->index, &progress);
46179 + ret = ctail_read_page_cluster(&clust, inode);
46180 + if (ret)
46181 + break;
46182 + assert("edward-869", !tfm_cluster_is_uptodate(&clust.tc));
46183 + lock_page(page);
46184 +
46185 + ret = do_readpage_ctail(inode, &clust, page);
46186 + if (!pagevec_add(&lru_pvec, page))
46187 + __pagevec_lru_add(&lru_pvec);
46188 + if (ret) {
46189 + warning("edward-215", "do_readpage_ctail failed");
46190 + unlock_page(page);
46191 + break;
46192 + }
46193 + assert("edward-1061", PageUptodate(page));
46194 +
46195 + unlock_page(page);
46196 + }
46197 + assert("edward-870", !tfm_cluster_is_uptodate(&clust.tc));
46198 + exit3:
46199 + done_lh(&hint->lh);
46200 + save_file_hint(clust.file, hint);
46201 + hint->ext_coord.valid = 0;
46202 + exit2:
46203 + kfree(hint);
46204 + exit1:
46205 + while (!list_empty(pages)) {
46206 + struct page *victim;
46207 + victim = list_to_page(pages);
46208 + list_del(&victim->lru);
46209 + page_cache_release(victim);
46210 + }
46211 + put_cluster_handle(&clust);
46212 + pagevec_lru_add(&lru_pvec);
46213 + return;
46214 +}
46215 +
46216 +/*
46217 + plugin->u.item.s.file.append_key
46218 + key of the first item of the next disk cluster
46219 +*/
46220 +reiser4_key *append_key_ctail(const coord_t * coord, reiser4_key * key)
46221 +{
46222 + assert("edward-1241", item_id_by_coord(coord) == CTAIL_ID);
46223 + assert("edward-1242", cluster_shift_ok(cluster_shift_by_coord(coord)));
46224 +
46225 + item_key_by_coord(coord, key);
46226 + set_key_offset(key,
46227 + ((__u64) (clust_by_coord(coord, NULL)) +
46228 + 1) << cluster_shift_by_coord(coord));
46229 + return key;
46230 +}
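+
+/* Example: for an item whose key offset lies in disk cluster number C,
+   the code above yields offset (C + 1) << cluster_shift, i.e. the first
+   byte of the next disk cluster, regardless of where inside cluster C
+   this item begins. */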
46231 +
46232 +static int
46233 +insert_unprepped_ctail(reiser4_cluster_t * clust, struct inode *inode)
46234 +{
46235 + int result;
46236 + char buf[UCTAIL_NR_UNITS];
46237 + reiser4_item_data data;
46238 + reiser4_key key;
46239 + int shift = (int)UCTAIL_SHIFT;
46240 +
46241 + memset(buf, 0, (size_t) UCTAIL_NR_UNITS);
46242 + result = key_by_inode_cryptcompress(inode,
46243 + clust_to_off(clust->index, inode),
46244 + &key);
46245 + if (result)
46246 + return result;
46247 + data.user = 0;
46248 + data.iplug = item_plugin_by_id(CTAIL_ID);
46249 + data.arg = &shift;
46250 + data.length = sizeof(ctail_item_format) + (size_t) UCTAIL_NR_UNITS;
46251 + data.data = buf;
46252 +
46253 + result = insert_by_coord(&clust->hint->ext_coord.coord,
46254 + &data, &key, clust->hint->ext_coord.lh, 0);
46255 + return result;
46256 +}
46257 +
46258 +static int
46259 +insert_crc_flow(coord_t * coord, lock_handle * lh, flow_t * f,
46260 + struct inode *inode)
46261 +{
46262 + int result;
46263 + carry_pool *pool;
46264 + carry_level *lowest_level;
46265 + reiser4_item_data *data;
46266 + carry_op *op;
46267 + int cluster_shift = inode_cluster_shift(inode);
46268 +
46269 + pool =
46270 + init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
46271 + sizeof(*data));
46272 + if (IS_ERR(pool))
46273 + return PTR_ERR(pool);
46274 + lowest_level = (carry_level *) (pool + 1);
46275 + init_carry_level(lowest_level, pool);
46276 + data = (reiser4_item_data *) (lowest_level + 3);
46277 +
46278 + assert("edward-466", coord->between == AFTER_ITEM
46279 + || coord->between == AFTER_UNIT || coord->between == BEFORE_ITEM
46280 + || coord->between == EMPTY_NODE
46281 + || coord->between == BEFORE_UNIT);
46282 +
46283 + if (coord->between == AFTER_UNIT) {
46284 + coord->unit_pos = 0;
46285 + coord->between = AFTER_ITEM;
46286 + }
46287 + op = post_carry(lowest_level, COP_INSERT_FLOW, coord->node,
46288 + 0 /* operate directly on coord -> node */ );
46289 + if (IS_ERR(op) || (op == NULL)) {
46290 + done_carry_pool(pool);
46291 + return RETERR(op ? PTR_ERR(op) : -EIO);
46292 + }
46293 + data->user = 0;
46294 + data->iplug = item_plugin_by_id(CTAIL_ID);
46295 + data->arg = &cluster_shift;
46296 +
46297 + data->length = 0;
46298 + data->data = NULL;
46299 +
46300 + op->u.insert_flow.flags = COPI_DONT_SHIFT_LEFT | COPI_DONT_SHIFT_RIGHT;
46301 + op->u.insert_flow.insert_point = coord;
46302 + op->u.insert_flow.flow = f;
46303 + op->u.insert_flow.data = data;
46304 + op->u.insert_flow.new_nodes = 0;
46305 +
46306 + lowest_level->track_type = CARRY_TRACK_CHANGE;
46307 + lowest_level->tracked = lh;
46308 +
46309 + result = carry(lowest_level, NULL);
46310 + done_carry_pool(pool);
46311 +
46312 + return result;
46313 +}
46314 +
46315 +/* Implementation of CRC_APPEND_ITEM mode of ctail conversion */
46316 +static int
46317 +insert_crc_flow_in_place(coord_t * coord, lock_handle * lh, flow_t * f,
46318 + struct inode *inode)
46319 +{
46320 + int ret;
46321 + coord_t pos;
46322 + lock_handle lock;
46323 +
46324 + assert("edward-674", f->length <= inode_scaled_cluster_size(inode));
46325 + assert("edward-484", coord->between == AT_UNIT
46326 + || coord->between == AFTER_ITEM);
46327 + assert("edward-485", item_id_by_coord(coord) == CTAIL_ID);
46328 +
46329 + coord_dup(&pos, coord);
46330 + pos.unit_pos = 0;
46331 + pos.between = AFTER_ITEM;
46332 +
46333 + init_lh(&lock);
46334 + copy_lh(&lock, lh);
46335 +
46336 + ret = insert_crc_flow(&pos, &lock, f, inode);
46337 + done_lh(&lock);
46338 + assert("edward-1347", znode_is_write_locked(lh->node));
46339 + assert("edward-1228", !ret);
46340 + return ret;
46341 +}
46342 +
46343 +/* Implementation of CRC_OVERWRITE_ITEM mode of ctail conversion */
46344 +static int overwrite_ctail(coord_t * coord, flow_t * f)
46345 +{
46346 + unsigned count;
46347 +
46348 + assert("edward-269", f->user == 0);
46349 + assert("edward-270", f->data != NULL);
46350 + assert("edward-271", f->length > 0);
46351 + assert("edward-272", coord_is_existing_unit(coord));
46352 + assert("edward-273", coord->unit_pos == 0);
46353 + assert("edward-274", znode_is_write_locked(coord->node));
46354 + assert("edward-275", schedulable());
46355 + assert("edward-467", item_id_by_coord(coord) == CTAIL_ID);
46356 + assert("edward-1243", ctail_ok(coord));
46357 +
46358 + count = nr_units_ctail(coord);
46359 +
46360 + if (count > f->length)
46361 + count = f->length;
46362 + memcpy(first_unit(coord), f->data, count);
46363 + move_flow_forward(f, count);
46364 + coord->unit_pos += count;
46365 + return 0;
46366 +}
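+/* A note on how the function above is driven (sketch): each call consumes
+ at most nr_units_ctail(coord) bytes of @f; the conversion code feeds
+ successive items of the disk cluster to overwrite_ctail() until f->length
+ reaches 0, after which CRC_CUT_ITEM takes care of any leftover units
+ (see do_convert_ctail() below). */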
46367 +
46368 +/* Implementation of CRC_CUT_ITEM mode of ctail conversion:
46369 + cut ctail (part or whole) starting from next unit position */
46370 +static int cut_ctail(coord_t * coord)
46371 +{
46372 + coord_t stop;
46373 +
46374 + assert("edward-435", coord->between == AT_UNIT &&
46375 + coord->item_pos < coord_num_items(coord) &&
46376 + coord->unit_pos <= coord_num_units(coord));
46377 +
46378 + if (coord->unit_pos == coord_num_units(coord))
46379 + /* nothing to cut */
46380 + return 0;
46381 + coord_dup(&stop, coord);
46382 + stop.unit_pos = coord_last_unit_pos(coord);
46383 +
46384 + return cut_node_content(coord, &stop, NULL, NULL, NULL);
46385 +}
46386 +
46387 +int
46388 +ctail_insert_unprepped_cluster(reiser4_cluster_t * clust, struct inode *inode)
46389 +{
46390 + int result;
46391 + assert("edward-1244", inode != NULL);
46392 + assert("edward-1245", clust->hint != NULL);
46393 + assert("edward-1246", clust->dstat == FAKE_DISK_CLUSTER);
46394 + assert("edward-1247", clust->reserved == 1);
46395 + assert("edward-1248", get_current_context()->grabbed_blocks ==
46396 + estimate_insert_cluster(inode));
46397 +
46398 + result = get_disk_cluster_locked(clust, inode, ZNODE_WRITE_LOCK);
46399 + if (cbk_errored(result))
46400 + return result;
46401 + assert("edward-1249", result == CBK_COORD_NOTFOUND);
46402 + assert("edward-1250", znode_is_write_locked(clust->hint->lh.node));
46403 +
46404 + assert("edward-1295",
46405 + clust->hint->ext_coord.lh->node ==
46406 + clust->hint->ext_coord.coord.node);
46407 +
46408 + coord_set_between_clusters(&clust->hint->ext_coord.coord);
46409 +
46410 + result = insert_unprepped_ctail(clust, inode);
46411 + all_grabbed2free();
46412 +
46413 + assert("edward-1251", !result);
46414 + assert("edward-1252", crc_inode_ok(inode));
46415 + assert("edward-1253", znode_is_write_locked(clust->hint->lh.node));
46416 + assert("edward-1254",
46417 + reiser4_clustered_blocks(reiser4_get_current_sb()));
46418 + assert("edward-1255",
46419 + znode_convertible(clust->hint->ext_coord.coord.node));
46420 +
46421 + return result;
46422 +}
46423 +
46424 +static int do_convert_ctail(flush_pos_t * pos, crc_write_mode_t mode)
46425 +{
46426 + int result = 0;
46427 + convert_item_info_t *info;
46428 +
46429 + assert("edward-468", pos != NULL);
46430 + assert("edward-469", pos->sq != NULL);
46431 + assert("edward-845", item_convert_data(pos) != NULL);
46432 +
46433 + info = item_convert_data(pos);
46434 + assert("edward-679", info->flow.data != NULL);
46435 +
46436 + switch (mode) {
46437 + case CRC_APPEND_ITEM:
46438 + assert("edward-1229", info->flow.length != 0);
46439 + assert("edward-1256",
46440 + cluster_shift_ok(cluster_shift_by_coord(&pos->coord)));
46441 + result =
46442 + insert_crc_flow_in_place(&pos->coord, &pos->lock,
46443 + &info->flow, info->inode);
46444 + break;
46445 + case CRC_OVERWRITE_ITEM:
46446 + assert("edward-1230", info->flow.length != 0);
46447 + overwrite_ctail(&pos->coord, &info->flow);
46448 + if (info->flow.length != 0)
46449 + break;
46450 + case CRC_CUT_ITEM:
46451 + assert("edward-1231", info->flow.length == 0);
46452 + result = cut_ctail(&pos->coord);
46453 + break;
46454 + default:
46455 + result = RETERR(-EIO);
46456 + impossible("edward-244", "bad convert mode");
46457 + }
46458 + return result;
46459 +}
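+/* The missing break after CRC_OVERWRITE_ITEM above is deliberate: once
+ overwrite_ctail() has consumed the whole flow (info->flow.length == 0),
+ control falls through to CRC_CUT_ITEM so that the remaining units of the
+ overwritten item are cut in the same pass. */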
46460 +
46461 +/* plugin->u.item.f.scan */
46462 +int scan_ctail(flush_scan * scan)
46463 +{
46464 + int result = 0;
46465 + struct page *page;
46466 + struct inode *inode;
46467 + jnode *node = scan->node;
46468 +
46469 + assert("edward-227", scan->node != NULL);
46470 + assert("edward-228", jnode_is_cluster_page(scan->node));
46471 + assert("edward-639", znode_is_write_locked(scan->parent_lock.node));
46472 +
46473 + page = jnode_page(node);
46474 + inode = page->mapping->host;
46475 +
46476 + if (!scanning_left(scan))
46477 + return result;
46478 + if (!ZF_ISSET(scan->parent_lock.node, JNODE_DIRTY))
46479 + znode_make_dirty(scan->parent_lock.node);
46480 +
46481 + if (!znode_convertible(scan->parent_lock.node)) {
46482 + if (JF_ISSET(scan->node, JNODE_DIRTY))
46483 + znode_set_convertible(scan->parent_lock.node);
46484 + else {
46485 + warning("edward-681",
46486 + "cluster page is already processed");
46487 + return -EAGAIN;
46488 + }
46489 + }
46490 + return result;
46491 +}
46492 +
46493 +/* Returns nonzero and sets pos->child iff the leftmost child exists, is dirty and belongs to the same atom */
46494 +static int should_attach_convert_idata(flush_pos_t * pos)
46495 +{
46496 + int result;
46497 + assert("edward-431", pos != NULL);
46498 + assert("edward-432", pos->child == NULL);
46499 + assert("edward-619", znode_is_write_locked(pos->coord.node));
46500 + assert("edward-470",
46501 + item_plugin_by_coord(&pos->coord) ==
46502 + item_plugin_by_id(CTAIL_ID));
46503 +
46504 + /* check for leftmost child */
46505 + utmost_child_ctail(&pos->coord, LEFT_SIDE, &pos->child);
46506 +
46507 + if (!pos->child)
46508 + return 0;
46509 + spin_lock_jnode(pos->child);
46510 + result = (JF_ISSET(pos->child, JNODE_DIRTY) &&
46511 + pos->child->atom == ZJNODE(pos->coord.node)->atom);
46512 + spin_unlock_jnode(pos->child);
46513 + if (!result && pos->child) {
46514 + /* the existing child is not to be attached, release it */
46515 + jput(pos->child);
46516 + pos->child = NULL;
46517 + }
46518 + return result;
46519 +}
46520 +
46521 +/* plugin->init_convert_data() */
46522 +static int
46523 +init_convert_data_ctail(convert_item_info_t * idata, struct inode *inode)
46524 +{
46525 + assert("edward-813", idata != NULL);
46526 + assert("edward-814", inode != NULL);
46527 +
46528 + idata->inode = inode;
46529 + idata->d_cur = DC_FIRST_ITEM;
46530 + idata->d_next = DC_INVALID_STATE;
46531 +
46532 + return 0;
46533 +}
46534 +
46535 +static int alloc_item_convert_data(convert_info_t * sq)
46536 +{
46537 + assert("edward-816", sq != NULL);
46538 + assert("edward-817", sq->itm == NULL);
46539 +
46540 + sq->itm = kmalloc(sizeof(*sq->itm), GFP_KERNEL);
46541 + if (sq->itm == NULL)
46542 + return RETERR(-ENOMEM);
46543 + return 0;
46544 +}
46545 +
46546 +static void free_item_convert_data(convert_info_t * sq)
46547 +{
46548 + assert("edward-818", sq != NULL);
46549 + assert("edward-819", sq->itm != NULL);
46550 + assert("edward-820", sq->iplug != NULL);
46551 +
46552 + kfree(sq->itm);
46553 + sq->itm = NULL;
46554 + return;
46555 +}
46556 +
46557 +static int alloc_convert_data(flush_pos_t * pos)
46558 +{
46559 + assert("edward-821", pos != NULL);
46560 + assert("edward-822", pos->sq == NULL);
46561 +
46562 + pos->sq = kmalloc(sizeof(*pos->sq), GFP_KERNEL);
46563 + if (!pos->sq)
46564 + return RETERR(-ENOMEM);
46565 + memset(pos->sq, 0, sizeof(*pos->sq));
46566 + cluster_init_write(&pos->sq->clust, 0);
46567 + return 0;
46568 +}
46569 +
46570 +void free_convert_data(flush_pos_t * pos)
46571 +{
46572 + convert_info_t *sq;
46573 +
46574 + assert("edward-823", pos != NULL);
46575 + assert("edward-824", pos->sq != NULL);
46576 +
46577 + sq = pos->sq;
46578 + if (sq->itm)
46579 + free_item_convert_data(sq);
46580 + put_cluster_handle(&sq->clust);
46581 + kfree(pos->sq);
46582 + pos->sq = NULL;
46583 + return;
46584 +}
46585 +
46586 +static int init_item_convert_data(flush_pos_t * pos, struct inode *inode)
46587 +{
46588 + convert_info_t *sq;
46589 +
46590 + assert("edward-825", pos != NULL);
46591 + assert("edward-826", pos->sq != NULL);
46592 + assert("edward-827", item_convert_data(pos) != NULL);
46593 + assert("edward-828", inode != NULL);
46594 +
46595 + sq = pos->sq;
46596 +
46597 + memset(sq->itm, 0, sizeof(*sq->itm));
46598 +
46599 + /* iplug->init_convert_data() */
46600 + return init_convert_data_ctail(sq->itm, inode);
46601 +}
46602 +
46603 +/* create and attach disk cluster info used by 'convert' phase of the flush
46604 + squalloc() */
46605 +static int attach_convert_idata(flush_pos_t * pos, struct inode *inode)
46606 +{
46607 + int ret = 0;
46608 + convert_item_info_t *info;
46609 + reiser4_cluster_t *clust;
46610 + file_plugin *fplug = inode_file_plugin(inode);
46611 + compression_plugin *cplug = inode_compression_plugin(inode);
46612 +
46613 + assert("edward-248", pos != NULL);
46614 + assert("edward-249", pos->child != NULL);
46615 + assert("edward-251", inode != NULL);
46616 + assert("edward-682", crc_inode_ok(inode));
46617 + assert("edward-252", fplug == file_plugin_by_id(CRC_FILE_PLUGIN_ID));
46618 + assert("edward-473",
46619 + item_plugin_by_coord(&pos->coord) ==
46620 + item_plugin_by_id(CTAIL_ID));
46621 +
46622 + if (!pos->sq) {
46623 + ret = alloc_convert_data(pos);
46624 + if (ret)
46625 + return ret;
46626 + }
46627 + clust = &pos->sq->clust;
46628 + ret = grab_coa(&clust->tc, cplug);
46629 + if (ret)
46630 + goto err;
46631 + ret = set_cluster_by_page(clust,
46632 + jnode_page(pos->child),
46633 + MAX_CLUSTER_NRPAGES);
46634 + if (ret)
46635 + goto err;
46636 +
46637 + assert("edward-829", pos->sq != NULL);
46638 + assert("edward-250", item_convert_data(pos) == NULL);
46639 +
46640 + pos->sq->iplug = item_plugin_by_id(CTAIL_ID);
46641 +
46642 + ret = alloc_item_convert_data(pos->sq);
46643 + if (ret)
46644 + goto err;
46645 + ret = init_item_convert_data(pos, inode);
46646 + if (ret)
46647 + goto err;
46648 + info = item_convert_data(pos);
46649 +
46650 + ret = flush_cluster_pages(clust, pos->child, inode);
46651 + if (ret)
46652 + goto err;
46653 +
46654 + deflate_cluster(clust, inode);
46655 + inc_item_convert_count(pos);
46656 +
46657 + /* build a flow from the transformed stream */
46658 + fplug->flow_by_inode(info->inode,
46659 + (const char __user *)tfm_stream_data(&clust->tc, OUTPUT_STREAM),
46660 + 0 /* kernel space */ ,
46661 + clust->tc.len,
46662 + clust_to_off(clust->index, inode),
46663 + WRITE_OP, &info->flow);
46664 + jput(pos->child);
46665 +
46666 + assert("edward-683", crc_inode_ok(inode));
46667 + return 0;
46668 + err:
46669 + jput(pos->child);
46670 + free_convert_data(pos);
46671 + return ret;
46672 +}
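+/* The attach path above, condensed (a sketch of the calls actually made,
+ in order, not a separate code path):
+
+ alloc_convert_data(pos); // once per flush position
+ grab_coa(&clust->tc, cplug); // compression workspace
+ set_cluster_by_page(clust, jnode_page(pos->child), MAX_CLUSTER_NRPAGES);
+ alloc_item_convert_data(pos->sq);
+ init_item_convert_data(pos, inode);
+ flush_cluster_pages(clust, pos->child, inode);
+ deflate_cluster(clust, inode); // transform (compress)
+ fplug->flow_by_inode(...); // flow over the result
+*/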
46673 +
46674 +/* clear up disk cluster info */
46675 +static void detach_convert_idata(convert_info_t * sq)
46676 +{
46677 + convert_item_info_t *info;
46678 +
46679 + assert("edward-253", sq != NULL);
46680 + assert("edward-840", sq->itm != NULL);
46681 +
46682 + info = sq->itm;
46683 + assert("edward-255", info->inode != NULL);
46684 + assert("edward-1212", info->flow.length == 0);
46685 +
46686 + free_item_convert_data(sq);
46687 + return;
46688 +}
46689 +
46690 +/* plugin->u.item.f.utmost_child */
46691 +
46692 +/* This function sets @child to the leftmost child of the first cluster
46693 + item if such a child exists, and to NULL otherwise.
46694 + NOTE-EDWARD: Do not call this for RIGHT_SIDE */
46695 +
46696 +int utmost_child_ctail(const coord_t * coord, sideof side, jnode ** child)
46697 +{
46698 + reiser4_key key;
46699 +
46700 + item_key_by_coord(coord, &key);
46701 +
46702 + assert("edward-257", coord != NULL);
46703 + assert("edward-258", child != NULL);
46704 + assert("edward-259", side == LEFT_SIDE);
46705 + assert("edward-260",
46706 + item_plugin_by_coord(coord) == item_plugin_by_id(CTAIL_ID));
46707 +
46708 + if (!is_disk_cluster_key(&key, coord))
46709 + *child = NULL;
46710 + else
46711 + *child = jlookup(current_tree,
46712 + get_key_objectid(item_key_by_coord
46713 + (coord, &key)),
46714 + off_to_pg(get_key_offset(&key)));
46715 + return 0;
46716 +}
46717 +
46718 +/* Returns true if @p2 is the item that follows @p1
46719 + in the _same_ disk cluster.
46720 + A disk cluster is a set of items. If ->clustered() != NULL,
46721 + the whole disk cluster should be read/modified together with each item.
46722 +*/
46723 +static int clustered_ctail(const coord_t * p1, const coord_t * p2)
46724 +{
46725 + return mergeable_ctail(p1, p2);
46726 +}
46727 +
46728 +/* Go rightward and check for the next disk cluster item; set
46729 + d_next to DC_CHAINED_ITEM if such an item exists.
46730 + If the current position is the last item, go to the right neighbor.
46731 + Skip empty nodes. Note that right neighbors may not be in
46732 + the slum because of races. If so, make them dirty and
46733 + convertible.
46734 +*/
46735 +static int next_item_dc_stat(flush_pos_t * pos)
46736 +{
46737 + int ret = 0;
46738 + int stop = 0;
46739 + znode *cur;
46740 + coord_t coord;
46741 + lock_handle lh;
46742 + lock_handle right_lock;
46743 +
46744 + assert("edward-1232", !node_is_empty(pos->coord.node));
46745 + assert("edward-1014",
46746 + pos->coord.item_pos < coord_num_items(&pos->coord));
46747 + assert("edward-1015", chaining_data_present(pos));
46748 + assert("edward-1017",
46749 + item_convert_data(pos)->d_next == DC_INVALID_STATE);
46750 +
46751 + item_convert_data(pos)->d_next = DC_AFTER_CLUSTER;
46752 +
46753 + if (item_convert_data(pos)->d_cur == DC_AFTER_CLUSTER)
46754 + return ret;
46755 + if (pos->coord.item_pos < coord_num_items(&pos->coord) - 1)
46756 + return ret;
46757 +
46758 + /* check next slum item */
46759 + init_lh(&right_lock);
46760 + cur = pos->coord.node;
46761 +
46762 + while (!stop) {
46763 + init_lh(&lh);
46764 + ret = reiser4_get_right_neighbor(&lh,
46765 + cur,
46766 + ZNODE_WRITE_LOCK,
46767 + GN_CAN_USE_UPPER_LEVELS);
46768 + if (ret)
46769 + break;
46770 + ret = zload(lh.node);
46771 + if (ret) {
46772 + done_lh(&lh);
46773 + break;
46774 + }
46775 + coord_init_before_first_item(&coord, lh.node);
46776 +
46777 + if (node_is_empty(lh.node)) {
46778 + znode_make_dirty(lh.node);
46779 + znode_set_convertible(lh.node);
46780 + stop = 0;
46781 + } else if (clustered_ctail(&pos->coord, &coord)) {
46782 +
46783 + item_convert_data(pos)->d_next = DC_CHAINED_ITEM;
46784 +
46785 + if (!ZF_ISSET(lh.node, JNODE_DIRTY)) {
46786 + /*
46787 + warning("edward-1024",
46788 + "next slum item mergeable, "
46789 + "but znode %p isn't dirty\n",
46790 + lh.node);
46791 + */
46792 + znode_make_dirty(lh.node);
46793 + }
46794 + if (!znode_convertible(lh.node)) {
46795 + /*
46796 + warning("edward-1272",
46797 + "next slum item mergeable, "
46798 + "but znode %p isn't convertible\n",
46799 + lh.node);
46800 + */
46801 + znode_set_convertible(lh.node);
46802 + }
46803 + stop = 1;
46804 + } else
46805 + stop = 1;
46806 + zrelse(lh.node);
46807 + done_lh(&right_lock);
46808 + copy_lh(&right_lock, &lh);
46809 + done_lh(&lh);
46810 + cur = right_lock.node;
46811 + }
46812 + done_lh(&right_lock);
46813 +
46814 + if (ret == -E_NO_NEIGHBOR)
46815 + ret = 0;
46816 + return ret;
46817 +}
46818 +
46819 +static int
46820 +assign_convert_mode(convert_item_info_t * idata, crc_write_mode_t * mode)
46821 +{
46822 + int result = 0;
46823 +
46824 + assert("edward-1025", idata != NULL);
46825 +
46826 + if (idata->flow.length) {
46827 + /* append or overwrite */
46828 + switch (idata->d_cur) {
46829 + case DC_FIRST_ITEM:
46830 + case DC_CHAINED_ITEM:
46831 + *mode = CRC_OVERWRITE_ITEM;
46832 + break;
46833 + case DC_AFTER_CLUSTER:
46834 + *mode = CRC_APPEND_ITEM;
46835 + break;
46836 + default:
46837 + impossible("edward-1018", "wrong current item state");
46838 + }
46839 + } else {
46840 + /* cut or invalidate */
46841 + switch (idata->d_cur) {
46842 + case DC_FIRST_ITEM:
46843 + case DC_CHAINED_ITEM:
46844 + *mode = CRC_CUT_ITEM;
46845 + break;
46846 + case DC_AFTER_CLUSTER:
46847 + result = 1;
46848 + break;
46849 + default:
46850 + impossible("edward-1019", "wrong current item state");
46851 + }
46852 + }
46853 + return result;
46854 +}
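+/* The mode assignment above, tabulated (a restatement of the code, for
+ reference):
+
+ flow.length d_cur result
+ ----------- ---------------- ------------------------------
+ > 0 DC_FIRST_ITEM *mode = CRC_OVERWRITE_ITEM
+ > 0 DC_CHAINED_ITEM *mode = CRC_OVERWRITE_ITEM
+ > 0 DC_AFTER_CLUSTER *mode = CRC_APPEND_ITEM
+ == 0 DC_FIRST_ITEM *mode = CRC_CUT_ITEM
+ == 0 DC_CHAINED_ITEM *mode = CRC_CUT_ITEM
+ == 0 DC_AFTER_CLUSTER returns 1 (disk cluster is over)
+*/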
46855 +
46856 +/* plugin->u.item.f.convert */
46857 +/* write ctail in guessed mode */
46858 +int convert_ctail(flush_pos_t * pos)
46859 +{
46860 + int result;
46861 + int nr_items;
46862 + crc_write_mode_t mode = CRC_OVERWRITE_ITEM;
46863 +
46864 + assert("edward-1020", pos != NULL);
46865 + assert("edward-1213", coord_num_items(&pos->coord) != 0);
46866 + assert("edward-1257", item_id_by_coord(&pos->coord) == CTAIL_ID);
46867 + assert("edward-1258", ctail_ok(&pos->coord));
46868 + assert("edward-261", pos->coord.node != NULL);
46869 +
46870 + nr_items = coord_num_items(&pos->coord);
46871 + if (!chaining_data_present(pos)) {
46872 + if (should_attach_convert_idata(pos)) {
46873 + /* attach convert item info */
46874 + struct inode *inode;
46875 +
46876 + assert("edward-264", pos->child != NULL);
46877 + assert("edward-265", jnode_page(pos->child) != NULL);
46878 + assert("edward-266",
46879 + jnode_page(pos->child)->mapping != NULL);
46880 +
46881 + inode = jnode_page(pos->child)->mapping->host;
46882 +
46883 + assert("edward-267", inode != NULL);
46884 +
46885 + /* attach item convert info by the child, then put the child */
46886 + result = attach_convert_idata(pos, inode);
46887 + pos->child = NULL;
46888 + if (result == -E_REPEAT) {
46889 + /* the jnode became clean, or there are no dirty
46890 + pages (nothing to update in the disk cluster) */
46891 + warning("edward-1021",
46892 + "convert_ctail: nothing to attach");
46893 + return 0;
46894 + }
46895 + if (result != 0)
46896 + return result;
46897 + } else
46898 + /* unconvertible */
46899 + return 0;
46900 + } else {
46901 + /* use old convert info */
46902 +
46903 + convert_item_info_t *idata;
46904 +
46905 + idata = item_convert_data(pos);
46906 +
46907 + result = assign_convert_mode(idata, &mode);
46908 + if (result) {
46909 + /* disk cluster is over,
46910 + nothing to update anymore */
46911 + detach_convert_idata(pos->sq);
46912 + return 0;
46913 + }
46914 + }
46915 +
46916 + assert("edward-433", chaining_data_present(pos));
46917 + assert("edward-1022",
46918 + pos->coord.item_pos < coord_num_items(&pos->coord));
46919 +
46920 + result = next_item_dc_stat(pos);
46921 + if (result) {
46922 + detach_convert_idata(pos->sq);
46923 + return result;
46924 + }
46925 + result = do_convert_ctail(pos, mode);
46926 + if (result) {
46927 + detach_convert_idata(pos->sq);
46928 + return result;
46929 + }
46930 + switch (mode) {
46931 + case CRC_CUT_ITEM:
46932 + assert("edward-1214", item_convert_data(pos)->flow.length == 0);
46933 + assert("edward-1215",
46934 + coord_num_items(&pos->coord) == nr_items ||
46935 + coord_num_items(&pos->coord) == nr_items - 1);
46936 + if (item_convert_data(pos)->d_next == DC_CHAINED_ITEM)
46937 + break;
46938 + if (coord_num_items(&pos->coord) != nr_items) {
46939 + /* the item was killed, no more chained items */
46940 + detach_convert_idata(pos->sq);
46941 + if (!node_is_empty(pos->coord.node))
46942 + /* make sure the next item will be scanned */
46943 + coord_init_before_item(&pos->coord);
46944 + break;
46945 + }
46946 + case CRC_APPEND_ITEM:
46947 + assert("edward-434", item_convert_data(pos)->flow.length == 0);
46948 + detach_convert_idata(pos->sq);
46949 + break;
46950 + case CRC_OVERWRITE_ITEM:
46951 + if (coord_is_unprepped_ctail(&pos->coord)) {
46952 + /* convert unprepped ctail to prepped one */
46953 + int shift;
46954 + shift =
46955 + inode_cluster_shift(item_convert_data(pos)->inode);
46956 + assert("edward-1259", cluster_shift_ok(shift));
46957 + put_unaligned((d8)shift,
46958 + &ctail_formatted_at(&pos->coord)->
46959 + cluster_shift);
46960 + }
46961 + break;
46962 + }
46963 + return result;
46964 +}
46965 +
46966 +/* Make Linus happy.
46967 + Local variables:
46968 + c-indentation-style: "K&R"
46969 + mode-name: "LC"
46970 + c-basic-offset: 8
46971 + tab-width: 8
46972 + fill-column: 120
46973 + End:
46974 +*/
46975 Index: linux-2.6.16/fs/reiser4/plugin/item/ctail.h
46976 ===================================================================
46977 --- /dev/null
46978 +++ linux-2.6.16/fs/reiser4/plugin/item/ctail.h
46979 @@ -0,0 +1,89 @@
46980 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
46981 +
46982 +#if !defined( __FS_REISER4_CTAIL_H__ )
46983 +#define __FS_REISER4_CTAIL_H__
46984 +
46985 +/* cryptcompress object item. See ctail.c for description. */
46986 +
46987 +#define UCTAIL_NR_UNITS 1
46988 +#define UCTAIL_SHIFT 0xff
46989 +
46990 +typedef struct ctail_item_format {
46991 + /* cluster shift */
46992 + d8 cluster_shift;
46993 + /* ctail body */
46994 + d8 body[0];
46995 +} __attribute__ ((packed)) ctail_item_format;
46996 +
46997 +/* The following is a set of possible item states in a disk cluster.
46998 + A disk cluster is a set of items whose keys belong to the interval
46999 + [dc_key , dc_key + disk_cluster_size - 1] */
47000 +typedef enum {
47001 + DC_INVALID_STATE = 0,
47002 + DC_FIRST_ITEM = 1,
47003 + DC_CHAINED_ITEM = 2,
47004 + DC_AFTER_CLUSTER = 3
47005 +} dc_item_stat;
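+/* How these states are used by the conversion code in ctail.c (sketch):
+ init_convert_data_ctail() starts with d_cur == DC_FIRST_ITEM and
+ d_next == DC_INVALID_STATE; next_item_dc_stat() then sets d_next to
+ DC_CHAINED_ITEM when the item to the right belongs to the same disk
+ cluster, and to DC_AFTER_CLUSTER otherwise. */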
47006 +
47007 +typedef struct {
47008 + int shift; /* we keep here the CPU-order value of the cluster_shift
47009 + field of ctail_item_format (see above) */
47010 +} ctail_coord_extension_t;
47011 +
47012 +struct cut_list;
47013 +
47014 +/* plugin->item.b.* */
47015 +int can_contain_key_ctail(const coord_t *, const reiser4_key *,
47016 + const reiser4_item_data *);
47017 +int mergeable_ctail(const coord_t * p1, const coord_t * p2);
47018 +pos_in_node_t nr_units_ctail(const coord_t * coord);
47019 +int estimate_ctail(const coord_t * coord, const reiser4_item_data * data);
47020 +void print_ctail(const char *prefix, coord_t * coord);
47021 +lookup_result lookup_ctail(const reiser4_key *, lookup_bias, coord_t *);
47022 +
47023 +int paste_ctail(coord_t * coord, reiser4_item_data * data,
47024 + carry_plugin_info * info UNUSED_ARG);
47025 +int init_ctail(coord_t *, coord_t *, reiser4_item_data *);
47026 +int can_shift_ctail(unsigned free_space, coord_t * coord,
47027 + znode * target, shift_direction pend, unsigned *size,
47028 + unsigned want);
47029 +void copy_units_ctail(coord_t * target, coord_t * source, unsigned from,
47030 + unsigned count, shift_direction where_is_free_space,
47031 + unsigned free_space);
47032 +int cut_units_ctail(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
47033 + carry_cut_data *, reiser4_key * smallest_removed,
47034 + reiser4_key * new_first);
47035 +int kill_units_ctail(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
47036 + carry_kill_data *, reiser4_key * smallest_removed,
47037 + reiser4_key * new_first);
47038 +int ctail_ok(const coord_t * coord);
47039 +int check_ctail(const coord_t * coord, const char **error);
47040 +
47041 +/* plugin->u.item.s.* */
47042 +int read_ctail(struct file *, flow_t *, hint_t *);
47043 +int readpage_ctail(void *, struct page *);
47044 +void readpages_ctail(void *, struct address_space *, struct list_head *);
47045 +reiser4_key *append_key_ctail(const coord_t *, reiser4_key *);
47046 +int create_hook_ctail(const coord_t * coord, void *arg);
47047 +int kill_hook_ctail(const coord_t *, pos_in_node_t, pos_in_node_t,
47048 + carry_kill_data *);
47049 +int shift_hook_ctail(const coord_t *, unsigned, unsigned, znode *);
47050 +
47051 +/* plugin->u.item.f */
47052 +int utmost_child_ctail(const coord_t *, sideof, jnode **);
47053 +int scan_ctail(flush_scan *);
47054 +int convert_ctail(flush_pos_t *);
47055 +size_t inode_scaled_cluster_size(struct inode *);
47056 +int cluster_shift_by_coord(const coord_t * coord);
47057 +
47058 +#endif /* __FS_REISER4_CTAIL_H__ */
47059 +
47060 +/* Make Linus happy.
47061 + Local variables:
47062 + c-indentation-style: "K&R"
47063 + mode-name: "LC"
47064 + c-basic-offset: 8
47065 + tab-width: 8
47066 + fill-column: 120
47067 + End:
47068 +*/
47069 Index: linux-2.6.16/fs/reiser4/plugin/item/extent.c
47070 ===================================================================
47071 --- /dev/null
47072 +++ linux-2.6.16/fs/reiser4/plugin/item/extent.c
47073 @@ -0,0 +1,197 @@
47074 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
47075 +
47076 +#include "item.h"
47077 +#include "../../key.h"
47078 +#include "../../super.h"
47079 +#include "../../carry.h"
47080 +#include "../../inode.h"
47081 +#include "../../page_cache.h"
47082 +#include "../../flush.h"
47083 +#include "../object.h"
47084 +
47085 +/* prepare a reiser4_item_data structure; it is used to put one extent unit into the tree */
47086 +/* Audited by: green(2002.06.13) */
47087 +reiser4_item_data *init_new_extent(reiser4_item_data * data, void *ext_unit,
47088 + int nr_extents)
47089 +{
47090 + data->data = ext_unit;
47091 + /* data->data is kernel space */
47092 + data->user = 0;
47093 + data->length = sizeof(reiser4_extent) * nr_extents;
47094 + data->arg = NULL;
47095 + data->iplug = item_plugin_by_id(EXTENT_POINTER_ID);
47096 + return data;
47097 +}
47098 +
47099 +/* how many bytes are addressed by @nr first extents of the extent item */
47100 +reiser4_block_nr extent_size(const coord_t * coord, pos_in_node_t nr)
47101 +{
47102 + pos_in_node_t i;
47103 + reiser4_block_nr blocks;
47104 + reiser4_extent *ext;
47105 +
47106 + ext = item_body_by_coord(coord);
47107 + assert("vs-263", nr <= nr_units_extent(coord));
47108 +
47109 + blocks = 0;
47110 + for (i = 0; i < nr; i++, ext++) {
47111 + blocks += extent_get_width(ext);
47112 + }
47113 +
47114 + return blocks * current_blocksize;
47115 +}
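+/* Worked example (illustrative, assuming a 4KiB blocksize): for an item
+ holding two extents of widths 3 and 2, extent_size(coord, 2) returns
+ (3 + 2) * 4096 == 20480 bytes. */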
47116 +
47117 +extent_state state_of_extent(reiser4_extent * ext)
47118 +{
47119 + switch ((int)extent_get_start(ext)) {
47120 + case 0:
47121 + return HOLE_EXTENT;
47122 + case 1:
47123 + return UNALLOCATED_EXTENT;
47124 + default:
47125 + break;
47126 + }
47127 + return ALLOCATED_EXTENT;
47128 +}
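+/* The encoding relied upon above (cf. HOLE_EXTENT_START and
+ UNALLOCATED_EXTENT_START in extent.h): a start of 0 marks a hole, a start
+ of 1 an unallocated extent, and any other start value is the first disk
+ block of an allocated extent. */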
47129 +
47130 +int extent_is_unallocated(const coord_t * item)
47131 +{
47132 + assert("jmacd-5133", item_is_extent(item));
47133 +
47134 + return state_of_extent(extent_by_coord(item)) == UNALLOCATED_EXTENT;
47135 +}
47136 +
47137 +/* set extent's start and width */
47138 +void
47139 +set_extent(reiser4_extent * ext, reiser4_block_nr start, reiser4_block_nr width)
47140 +{
47141 + extent_set_start(ext, start);
47142 + extent_set_width(ext, width);
47143 +}
47144 +
47145 +
47146 +/**
47147 + * replace_extent - overwrite an extent unit and paste 1 or 2 more after it
47148 + * @h: replace_handle carrying the operation state: the coordinate and
47149 + * lock handle of the unit to be overwritten (h->coord, h->lh), the
47150 + * replacement extent (h->overwrite), the 1 or 2 new extents to be
47151 + * pasted (h->new_extents, h->nr_new_extents) and the insertion
47152 + * flags (h->flags); see struct replace_handle in extent.h
47153 + * @return_inserted_position: selects what @h->coord and @h->lh are set to
47154 + * on successful return
47155 + *
47156 + * Overwrites one extent, pastes one or two more after the overwritten one.
47157 + * If @return_inserted_position is 1, @h->coord and @h->lh are returned set
47158 + * to the first of the newly inserted units; if it is 0, they are returned
47159 + * set to the extent which was overwritten.
47160 + */
47161 +int replace_extent(struct replace_handle *h, int return_inserted_position)
47162 +{
47163 + int result;
47164 + znode *orig_znode;
47165 + /*ON_DEBUG(reiser4_extent orig_ext);*/ /* this is for debugging */
47166 +
47167 + assert("vs-990", coord_is_existing_unit(h->coord));
47168 + assert("vs-1375", znode_is_write_locked(h->coord->node));
47169 + assert("vs-1426", extent_get_width(&h->overwrite) != 0);
47170 + assert("vs-1427", extent_get_width(&h->new_extents[0]) != 0);
47171 + assert("vs-1427", ergo(h->nr_new_extents == 2,
47172 + extent_get_width(&h->new_extents[1]) != 0));
47173 +
47174 + /* compose structure for paste */
47175 + init_new_extent(&h->item, &h->new_extents[0], h->nr_new_extents);
47176 +
47177 + coord_dup(&h->coord_after, h->coord);
47178 + init_lh(&h->lh_after);
47179 + copy_lh(&h->lh_after, h->lh);
47180 + tap_init(&h->watch, &h->coord_after, &h->lh_after, ZNODE_WRITE_LOCK);
47181 + tap_monitor(&h->watch);
47182 +
47183 + ON_DEBUG(h->orig_ext = *extent_by_coord(h->coord));
47184 + orig_znode = h->coord->node;
47185 +
47186 +#if REISER4_DEBUG
47187 + /* make sure that key is set properly */
47188 + unit_key_by_coord(h->coord, &h->tmp);
47189 + set_key_offset(&h->tmp,
47190 + get_key_offset(&h->tmp) +
47191 + extent_get_width(&h->overwrite) * current_blocksize);
47192 + assert("vs-1080", keyeq(&h->tmp, &h->paste_key));
47193 +#endif
47194 +
47195 + /* set insert point after unit to be replaced */
47196 + h->coord->between = AFTER_UNIT;
47197 +
47198 + result = insert_into_item(h->coord, return_inserted_position ? h->lh : NULL,
47199 + &h->paste_key, &h->item, h->flags);
47200 + if (!result) {
47201 + /* now we have to replace the unit after which new units were
47202 + inserted. Its position is tracked by @watch */
47203 + reiser4_extent *ext;
47204 + znode *node;
47205 +
47206 + node = h->coord_after.node;
47207 + if (node != orig_znode) {
47208 + coord_clear_iplug(&h->coord_after);
47209 + result = zload(node);
47210 + }
47211 +
47212 + if (likely(!result)) {
47213 + ext = extent_by_coord(&h->coord_after);
47214 +
47215 + assert("vs-987", znode_is_loaded(node));
47216 + assert("vs-988", !memcmp(ext, &h->orig_ext, sizeof(*ext)));
47217 +
47218 + /* overwrite extent unit */
47219 + memcpy(ext, &h->overwrite, sizeof(reiser4_extent));
47220 + znode_make_dirty(node);
47221 +
47222 + if (node != orig_znode)
47223 + zrelse(node);
47224 +
47225 + if (return_inserted_position == 0) {
47226 + /* coord and lh are to be set to overwritten
47227 + extent */
47228 + assert("vs-1662",
47229 + WITH_DATA(node, !memcmp(&h->overwrite,
47230 + extent_by_coord(
47231 + &h->coord_after),
47232 + sizeof(reiser4_extent))));
47233 +
47234 + *h->coord = h->coord_after;
47235 + done_lh(h->lh);
47236 + copy_lh(h->lh, &h->lh_after);
47237 + } else {
47238 + /* h->coord and h->lh are to be set to first of
47239 + inserted units */
47240 + assert("vs-1663",
47241 + WITH_DATA(h->coord->node,
47242 + !memcmp(&h->new_extents[0],
47243 + extent_by_coord(h->coord),
47244 + sizeof(reiser4_extent))));
47245 + assert("vs-1664", h->lh->node == h->coord->node);
47246 + }
47247 + }
47248 + }
47249 + tap_done(&h->watch);
47250 +
47251 + return result;
47252 +}
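+/* A minimal usage sketch (illustrative only: the variable names and the
+ extent values below are hypothetical, and real callers such as
+ plug_hole() set further fields of struct replace_handle):
+
+ struct replace_handle rh;
+
+ rh.coord = coord; // unit to be overwritten
+ rh.lh = lh; // longterm lock on its node
+ rh.flags = 0;
+ set_extent(&rh.overwrite, UNALLOCATED_EXTENT_START, 1);
+ set_extent(&rh.new_extents[0], HOLE_EXTENT_START, width - 1);
+ rh.nr_new_extents = 1;
+ result = replace_extent(&rh, 0); // 0: stay on the overwritten unit
+*/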
47253 +
47254 +lock_handle *znode_lh(znode *node)
47255 +{
47256 + assert("vs-1371", znode_is_write_locked(node));
47257 + assert("vs-1372", znode_is_wlocked_once(node));
47258 + return list_entry(node->lock.owners.next, lock_handle, owners_link);
47259 +}
47260 +
47261 +/*
47262 + * Local variables:
47263 + * c-indentation-style: "K&R"
47264 + * mode-name: "LC"
47265 + * c-basic-offset: 8
47266 + * tab-width: 8
47267 + * fill-column: 79
47268 + * scroll-step: 1
47269 + * End:
47270 + */
47271 Index: linux-2.6.16/fs/reiser4/plugin/item/extent.h
47272 ===================================================================
47273 --- /dev/null
47274 +++ linux-2.6.16/fs/reiser4/plugin/item/extent.h
47275 @@ -0,0 +1,228 @@
47276 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
47277 +
47278 +#ifndef __REISER4_EXTENT_H__
47279 +#define __REISER4_EXTENT_H__
47280 +
47281 +/* on disk extent */
47282 +typedef struct {
47283 + reiser4_dblock_nr start;
47284 + reiser4_dblock_nr width;
47285 +} reiser4_extent;
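+/* On disk, an extent unit is thus two little-endian 64-bit fields, 16 bytes
+ in total; extent_get_start()/extent_get_width() below wrap the
+ le64_to_cpu() conversion. */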
47286 +
47287 +typedef struct extent_stat {
47288 + int unallocated_units;
47289 + int unallocated_blocks;
47290 + int allocated_units;
47291 + int allocated_blocks;
47292 + int hole_units;
47293 + int hole_blocks;
47294 +} extent_stat;
47295 +
47296 +/* extents in an extent item can be either holes, or unallocated or allocated
47297 + extents */
47298 +typedef enum {
47299 + HOLE_EXTENT,
47300 + UNALLOCATED_EXTENT,
47301 + ALLOCATED_EXTENT
47302 +} extent_state;
47303 +
47304 +#define HOLE_EXTENT_START 0
47305 +#define UNALLOCATED_EXTENT_START 1
47306 +#define UNALLOCATED_EXTENT_START2 2
47307 +
47308 +typedef struct {
47309 + reiser4_block_nr pos_in_unit;
47310 + reiser4_block_nr width; /* width of current unit */
47311 + pos_in_node_t nr_units; /* number of units */
47312 + int ext_offset; /* offset from the beginning of zdata() */
47313 + unsigned long expected_page;
47314 +#if REISER4_DEBUG
47315 + reiser4_extent extent;
47316 +#endif
47317 +} extent_coord_extension_t;
47318 +
47319 +/* macros to set/get fields of on-disk extent */
47320 +static inline reiser4_block_nr extent_get_start(const reiser4_extent * ext)
47321 +{
47322 + return le64_to_cpu(ext->start);
47323 +}
47324 +
47325 +static inline reiser4_block_nr extent_get_width(const reiser4_extent * ext)
47326 +{
47327 + return le64_to_cpu(ext->width);
47328 +}
47329 +
47330 +extern __u64 reiser4_current_block_count(void);
47331 +
47332 +static inline void
47333 +extent_set_start(reiser4_extent * ext, reiser4_block_nr start)
47334 +{
47335 + cassert(sizeof(ext->start) == 8);
47336 + assert("nikita-2510",
47337 + ergo(start > 1, start < reiser4_current_block_count()));
47338 + put_unaligned(cpu_to_le64(start), &ext->start);
47339 +}
47340 +
47341 +static inline void
47342 +extent_set_width(reiser4_extent * ext, reiser4_block_nr width)
47343 +{
47344 + cassert(sizeof(ext->width) == 8);
47345 + assert("", width > 0);
47346 + put_unaligned(cpu_to_le64(width), &ext->width);
47347 + assert("nikita-2511",
47348 + ergo(extent_get_start(ext) > 1,
47349 + extent_get_start(ext) + width <=
47350 + reiser4_current_block_count()));
47351 +}
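+/* Example (illustrative; assumes a mounted volume with more than 1010
+ blocks, since the debug asserts above consult
+ reiser4_current_block_count()): building a 10-block allocated extent
+ starting at disk block 1000:
+
+ reiser4_extent ext;
+
+ extent_set_start(&ext, 1000);
+ extent_set_width(&ext, 10);
+
+ which is what set_extent(&ext, 1000, 10) from extent.c does in one call. */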
47352 +
47353 +#define extent_item(coord) \
47354 +({ \
47355 + assert("nikita-3143", item_is_extent(coord)); \
47356 + ((reiser4_extent *)item_body_by_coord (coord)); \
47357 +})
47358 +
47359 +#define extent_by_coord(coord) \
47360 +({ \
47361 + assert("nikita-3144", item_is_extent(coord)); \
47362 + (extent_item (coord) + (coord)->unit_pos); \
47363 +})
47364 +
47365 +#define width_by_coord(coord) \
47366 +({ \
47367 + assert("nikita-3145", item_is_extent(coord)); \
47368 + extent_get_width (extent_by_coord(coord)); \
47369 +})
47370 +
47371 +struct carry_cut_data;
47372 +struct carry_kill_data;
47373 +
47374 +/* plugin->u.item.b.* */
47375 +reiser4_key *max_key_inside_extent(const coord_t *, reiser4_key *);
47376 +int can_contain_key_extent(const coord_t * coord, const reiser4_key * key,
47377 + const reiser4_item_data *);
47378 +int mergeable_extent(const coord_t * p1, const coord_t * p2);
47379 +pos_in_node_t nr_units_extent(const coord_t *);
47380 +lookup_result lookup_extent(const reiser4_key *, lookup_bias, coord_t *);
47381 +void init_coord_extent(coord_t *);
47382 +int init_extent(coord_t *, reiser4_item_data *);
47383 +int paste_extent(coord_t *, reiser4_item_data *, carry_plugin_info *);
47384 +int can_shift_extent(unsigned free_space,
47385 + coord_t * source, znode * target, shift_direction,
47386 + unsigned *size, unsigned want);
47387 +void copy_units_extent(coord_t * target, coord_t * source, unsigned from,
47388 + unsigned count, shift_direction where_is_free_space,
47389 + unsigned free_space);
47390 +int kill_hook_extent(const coord_t *, pos_in_node_t from, pos_in_node_t count,
47391 + struct carry_kill_data *);
47392 +int create_hook_extent(const coord_t * coord, void *arg);
47393 +int cut_units_extent(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
47394 + struct carry_cut_data *, reiser4_key * smallest_removed,
47395 + reiser4_key * new_first);
47396 +int kill_units_extent(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
47397 + struct carry_kill_data *, reiser4_key * smallest_removed,
47398 + reiser4_key * new_first);
47399 +reiser4_key *unit_key_extent(const coord_t *, reiser4_key *);
47400 +reiser4_key *max_unit_key_extent(const coord_t *, reiser4_key *);
47401 +void print_extent(const char *, coord_t *);
47402 +int utmost_child_extent(const coord_t * coord, sideof side, jnode ** child);
47403 +int utmost_child_real_block_extent(const coord_t * coord, sideof side,
47404 + reiser4_block_nr * block);
47405 +void item_stat_extent(const coord_t * coord, void *vp);
47406 +int check_extent(const coord_t * coord, const char **error);
47407 +
47408 +/* plugin->u.item.s.file.* */
47409 +ssize_t write_extent(struct file *, const char __user *, size_t, loff_t *);
47410 +int read_extent(struct file *, flow_t *, hint_t *);
47411 +int readpage_extent(void *, struct page *);
47412 +void readpages_extent(void *, struct address_space *, struct list_head *pages);
47413 +reiser4_key *append_key_extent(const coord_t *, reiser4_key *);
47414 +void init_coord_extension_extent(uf_coord_t *, loff_t offset);
47415 +int get_block_address_extent(const coord_t *, sector_t block,
47416 + sector_t * result);
47417 +
47418 +/* these are used in flush.c
47419 + FIXME-VS: should they be somewhere in item_plugin? */
47420 +int allocate_extent_item_in_place(coord_t *, lock_handle *, flush_pos_t * pos);
47421 +int allocate_and_copy_extent(znode * left, coord_t * right, flush_pos_t * pos,
47422 + reiser4_key * stop_key);
47423 +
47424 +int extent_is_unallocated(const coord_t * item); /* True if this extent is unallocated (i.e., not a hole, not allocated). */
47425 +__u64 extent_unit_index(const coord_t * item); /* Block offset of this unit. */
47426 +__u64 extent_unit_width(const coord_t * item); /* Number of blocks in this unit. */
47427 +
47428 +/* plugin->u.item.f. */
47429 +int scan_extent(flush_scan * scan);
47430 +extern int key_by_offset_extent(struct inode *, loff_t, reiser4_key *);
47431 +
47432 +reiser4_item_data *init_new_extent(reiser4_item_data * data, void *ext_unit,
47433 + int nr_extents);
47434 +reiser4_block_nr extent_size(const coord_t * coord, pos_in_node_t nr);
47435 +extent_state state_of_extent(reiser4_extent * ext);
47436 +void set_extent(reiser4_extent *, reiser4_block_nr start,
47437 + reiser4_block_nr width);
47438 +int update_extent(struct inode *, jnode *, loff_t pos, int *plugged_hole);
47439 +
47440 +#include "../../coord.h"
47441 +#include "../../lock.h"
47442 +#include "../../tap.h"
47443 +
47444 +struct replace_handle {
47445 + /* these are to be set before calling replace_extent */
47446 + coord_t *coord;
47447 + lock_handle *lh;
47448 + reiser4_key key;
47449 + reiser4_key *pkey;
47450 + reiser4_extent overwrite;
47451 + reiser4_extent new_extents[2];
47452 + int nr_new_extents;
47453 + unsigned flags;
47454 +
47455 + /* these are used by replace_extent */
47456 + reiser4_item_data item;
47457 + coord_t coord_after;
47458 + lock_handle lh_after;
47459 + tap_t watch;
47460 + reiser4_key paste_key;
47461 +#if REISER4_DEBUG
47462 + reiser4_extent orig_ext;
47463 + reiser4_key tmp;
47464 +#endif
47465 +};
47466 +
47467 +/* this structure is kmalloced before calling make_extent to avoid excessive
47468 + stack consumption on plug_hole->replace_extent */
47469 +struct make_extent_handle {
47470 + uf_coord_t *uf_coord;
47471 + reiser4_block_nr blocknr;
47472 + int created;
47473 + struct inode *inode;
47474 + union {
47475 + struct {
47476 + } append;
47477 + struct replace_handle replace;
47478 + } u;
47479 +};
47480 +
47481 +int replace_extent(struct replace_handle *, int return_inserted_position);
47482 +lock_handle *znode_lh(znode *);
47483 +
47484 +/* the reiser4 repacker support */
47485 +struct repacker_cursor;
47486 +extern int process_extent_backward_for_repacking(tap_t *,
47487 + struct repacker_cursor *);
47488 +extern int mark_extent_for_repacking(tap_t *, int);
47489 +
47490 +#define coord_by_uf_coord(uf_coord) (&((uf_coord)->coord))
47491 +#define ext_coord_by_uf_coord(uf_coord) (&((uf_coord)->extension.extent))
47492 +
47493 +/* __REISER4_EXTENT_H__ */
47494 +#endif
47495 +/*
47496 + Local variables:
47497 + c-indentation-style: "K&R"
47498 + mode-name: "LC"
47499 + c-basic-offset: 8
47500 + tab-width: 8
47501 + fill-column: 120
47502 + End:
47503 +*/
47504 Index: linux-2.6.16/fs/reiser4/plugin/item/extent_file_ops.c
47505 ===================================================================
47506 --- /dev/null
47507 +++ linux-2.6.16/fs/reiser4/plugin/item/extent_file_ops.c
47508 @@ -0,0 +1,1712 @@
47509 +/* COPYRIGHT 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
47510 +
47511 +#include "item.h"
47512 +#include "../../inode.h"
47513 +#include "../../page_cache.h"
47514 +#include "../object.h"
47515 +
47516 +#include <linux/quotaops.h>
47517 +#include <linux/swap.h>
47518 +#include "../../../../mm/filemap.h"
47519 +
47520 +
47521 +static inline reiser4_extent *ext_by_offset(const znode *node, int offset)
47522 +{
47523 + reiser4_extent *ext;
47524 +
47525 + ext = (reiser4_extent *) (zdata(node) + offset);
47526 + return ext;
47527 +}
47528 +
47529 +/**
47530 + * check_uf_coord - verify coord extension
47531 + * @uf_coord: coord extension to verify
47532 + * @key: expected key of the position, or NULL to skip the key check
47533 + *
47534 + * Makes sure that all fields of @uf_coord are set properly. If @key is
47535 + * specified, checks that @uf_coord corresponds to it.
47536 + */
47537 +static void check_uf_coord(const uf_coord_t *uf_coord, const reiser4_key *key)
47538 +{
47539 +#if REISER4_DEBUG
47540 + const coord_t *coord;
47541 + const extent_coord_extension_t *ext_coord;
47542 + reiser4_extent *ext;
47543 +
47544 + coord = &uf_coord->coord;
47545 + ext_coord = &uf_coord->extension.extent;
47546 + ext = ext_by_offset(coord->node, uf_coord->extension.extent.ext_offset);
47547 +
47548 + assert("",
47549 + WITH_DATA(coord->node,
47550 + (uf_coord->valid == 1 &&
47551 + coord_is_iplug_set(coord) &&
47552 + item_is_extent(coord) &&
47553 + ext_coord->nr_units == nr_units_extent(coord) &&
47554 + ext == extent_by_coord(coord) &&
47555 + ext_coord->width == extent_get_width(ext) &&
47556 + coord->unit_pos < ext_coord->nr_units &&
47557 + ext_coord->pos_in_unit < ext_coord->width &&
47558 + memcmp(ext, &ext_coord->extent,
47559 + sizeof(reiser4_extent)) == 0)));
47560 + if (key) {
47561 + reiser4_key coord_key;
47562 +
47563 + unit_key_by_coord(&uf_coord->coord, &coord_key);
47564 + set_key_offset(&coord_key,
47565 + get_key_offset(&coord_key) +
47566 + (uf_coord->extension.extent.
47567 + pos_in_unit << PAGE_CACHE_SHIFT));
47568 + assert("", keyeq(key, &coord_key));
47569 + }
47570 +#endif
47571 +}
47572 +
47573 +static inline reiser4_extent *ext_by_ext_coord(const uf_coord_t *uf_coord)
47574 +{
47575 + check_uf_coord(uf_coord, NULL);
47576 +
47577 + return ext_by_offset(uf_coord->coord.node,
47578 + uf_coord->extension.extent.ext_offset);
47579 +}
47580 +
47581 +#if REISER4_DEBUG
47582 +
47583 +/**
47584 + * offset_is_in_unit - check whether an offset falls into an extent unit
47585 + * @coord: coord of the extent unit
47586 + * @off: file offset to check
47587 + *
47588 + * Returns 1 if offset @off is inside of the extent unit pointed to by
47589 + * @coord, and 0 otherwise.
47590 + */
47591 +static int offset_is_in_unit(const coord_t *coord, loff_t off)
47592 +{
47593 + reiser4_key unit_key;
47594 + __u64 unit_off;
47595 + reiser4_extent *ext;
47596 +
47597 + ext = extent_by_coord(coord);
47598 +
47599 + unit_key_extent(coord, &unit_key);
47600 + unit_off = get_key_offset(&unit_key);
47601 + if (off < unit_off)
47602 + return 0;
47603 + if (off >= (unit_off + (current_blocksize * extent_get_width(ext))))
47604 + return 0;
47605 + return 1;
47606 +}
47607 +
47608 +static int
47609 +coord_matches_key_extent(const coord_t * coord, const reiser4_key * key)
47610 +{
47611 + reiser4_key item_key;
47612 +
47613 + assert("vs-771", coord_is_existing_unit(coord));
47614 + assert("vs-1258", keylt(key, append_key_extent(coord, &item_key)));
47615 + assert("vs-1259", keyge(key, item_key_by_coord(coord, &item_key)));
47616 +
47617 + return offset_is_in_unit(coord, get_key_offset(key));
47618 +}
47619 +
47620 +#endif
47621 +
47622 +/**
47623 + * can_append - check whether a key continues an extent item
47624 + * @key: key of the data to be written
47625 + * @coord: coord of an existing extent item of the file
47626 + *
47627 + * Returns 1 if @key is equal to the append key of the item @coord is set to
47628 + */
47629 +static int can_append(const reiser4_key *key, const coord_t *coord)
47630 +{
47631 + reiser4_key append_key;
47632 +
47633 + return keyeq(key, append_key_extent(coord, &append_key));
47634 +}
47635 +
47636 +/**
47637 + * append_hole - append the last file item with a hole extent
47638 + * @coord: coord of the last item of the file
47639 + * @lh: longterm lock handle used for the insertion
47640 + * @key: key of the first byte of the data to be written; the hole
47641 + * will extend up to this key
47642 + */
47643 +static int append_hole(coord_t *coord, lock_handle *lh,
47644 + const reiser4_key *key)
47645 +{
47646 + reiser4_key append_key;
47647 + reiser4_block_nr hole_width;
47648 + reiser4_extent *ext, new_ext;
47649 + reiser4_item_data idata;
47650 +
47651 + /* last item of file may have to be appended with hole */
47652 + assert("vs-708", znode_get_level(coord->node) == TWIG_LEVEL);
47653 + assert("vs-714", item_id_by_coord(coord) == EXTENT_POINTER_ID);
47654 +
47655 + /* key of first byte which is not addressed by this extent */
47656 + append_key_extent(coord, &append_key);
47657 +
47658 + assert("", keyle(&append_key, key));
47659 +
47660 + /*
47661 + * extent item has to be appended with hole. Calculate length of that
47662 + * hole
47663 + */
47664 + hole_width = ((get_key_offset(key) - get_key_offset(&append_key) +
47665 + current_blocksize - 1) >> current_blocksize_bits);
47666 + assert("vs-954", hole_width > 0);
47667 +
47668 + /* set coord after last unit */
47669 + coord_init_after_item_end(coord);
47670 +
47671 + /* get last extent in the item */
47672 + ext = extent_by_coord(coord);
47673 + if (state_of_extent(ext) == HOLE_EXTENT) {
47674 + /*
47675 + * last extent of a file is hole extent. Widen that extent by
47676 + * @hole_width blocks. Note that we do not worry about
47677 + * overflowing - extent width is 64 bits
47678 + */
47679 + set_extent(ext, HOLE_EXTENT_START,
47680 + extent_get_width(ext) + hole_width);
47681 + znode_make_dirty(coord->node);
47682 + return 0;
47683 + }
47684 +
47685 + /* append last item of the file with hole extent unit */
47686 + assert("vs-713", (state_of_extent(ext) == ALLOCATED_EXTENT ||
47687 + state_of_extent(ext) == UNALLOCATED_EXTENT));
47688 +
47689 + set_extent(&new_ext, HOLE_EXTENT_START, hole_width);
47690 + init_new_extent(&idata, &new_ext, 1);
47691 + return insert_into_item(coord, lh, &append_key, &idata, 0);
47692 +}
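+/* Worked example of the hole_width arithmetic above (illustrative,
+ assuming 4KiB blocks): if the file's last extent addresses bytes
+ [0, 8192) and the write starts at offset 20480, then
+
+ hole_width = (20480 - 8192 + 4096 - 1) >> 12 == 3
+
+ i.e. three hole blocks covering bytes [8192, 20480). */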
47693 +
47694 +/**
47695 + * check_jnodes - verify that a twig node covers a range of jnodes
47696 + * @twig: longterm locked twig node
47697 + * @key: key of the first of @count consecutive pages
47698 + * @count: number of pages whose keys must fall within @twig
47699 + */
47700 +static void check_jnodes(znode *twig, const reiser4_key *key, int count)
47701 +{
47702 +#if REISER4_DEBUG
47703 + coord_t c;
47704 + reiser4_key node_key, jnode_key;
47705 +
47706 + jnode_key = *key;
47707 +
47708 + assert("", twig != NULL);
47709 + assert("", znode_get_level(twig) == TWIG_LEVEL);
47710 + assert("", znode_is_write_locked(twig));
47711 +
47712 + zload(twig);
47713 + /* get the smallest key in twig node */
47714 + coord_init_first_unit(&c, twig);
47715 + unit_key_by_coord(&c, &node_key);
47716 + assert("", keyle(&node_key, &jnode_key));
47717 +
47718 + coord_init_last_unit(&c, twig);
47719 + unit_key_by_coord(&c, &node_key);
47720 + if (item_plugin_by_coord(&c)->s.file.append_key)
47721 + item_plugin_by_coord(&c)->s.file.append_key(&c, &node_key);
47722 + set_key_offset(&jnode_key,
47723 + get_key_offset(&jnode_key) + (loff_t)count * PAGE_CACHE_SIZE - 1);
47724 + assert("", keylt(&jnode_key, &node_key));
47725 + zrelse(twig);
47726 +#endif
47727 +}
47728 +
47729 +/**
47730 + * append_last_extent - append last file item
47731 + * @uf_coord: coord to start insertion from
47732 + * @jnodes: array of jnodes
47733 + * @count: number of jnodes in the array
47734 + *
47735 + * There is already at least one extent item of file @inode in the tree. Append
47736 + * the last of them with unallocated extent unit of width @count. Assign
47737 + * fake block numbers to jnodes corresponding to the inserted extent.
47738 + */
47739 +static int append_last_extent(uf_coord_t *uf_coord, const reiser4_key *key,
47740 + jnode **jnodes, int count)
47741 +{
47742 + int result;
47743 + reiser4_extent new_ext;
47744 + reiser4_item_data idata;
47745 + coord_t *coord;
47746 + extent_coord_extension_t *ext_coord;
47747 + reiser4_extent *ext;
47748 + reiser4_block_nr block;
47749 + jnode *node;
47750 + int i;
47751 +
47752 + coord = &uf_coord->coord;
47753 + ext_coord = &uf_coord->extension.extent;
47754 + ext = ext_by_ext_coord(uf_coord);
47755 +
47756 + /* check correctness of position in the item */
47757 + assert("vs-228", coord->unit_pos == coord_last_unit_pos(coord));
47758 + assert("vs-1311", coord->between == AFTER_UNIT);
47759 + assert("vs-1302", ext_coord->pos_in_unit == ext_coord->width - 1);
47760 +
47761 + if (!can_append(key, coord)) {
47762 + /* hole extent has to be inserted */
47763 + result = append_hole(coord, uf_coord->lh, key);
47764 + uf_coord->valid = 0;
47765 + return result;
47766 + }
47767 +
47768 + if (count == 0)
47769 + return 0;
47770 +
47771 + assert("", get_key_offset(key) == (loff_t)index_jnode(jnodes[0]) * PAGE_CACHE_SIZE);
47772 +
47773 + result = DQUOT_ALLOC_BLOCK_NODIRTY(mapping_jnode(jnodes[0])->host,
47774 + count);
47775 + BUG_ON(result != 0);
47776 +
47777 + switch (state_of_extent(ext)) {
47778 + case UNALLOCATED_EXTENT:
47779 + /*
47780 + * last extent unit of the file is unallocated one. Increase
47781 + * its width by @count
47782 + */
47783 + set_extent(ext, UNALLOCATED_EXTENT_START,
47784 + extent_get_width(ext) + count);
47785 + znode_make_dirty(coord->node);
47786 +
47787 + /* update coord extension */
47788 + ext_coord->width += count;
47789 + ON_DEBUG(extent_set_width
47790 + (&uf_coord->extension.extent.extent,
47791 + ext_coord->width));
47792 + break;
47793 +
47794 + case HOLE_EXTENT:
47795 + case ALLOCATED_EXTENT:
47796 + /*
47797 + * last extent unit of the file is either hole or allocated
47798 + * one. Append one unallocated extent of width @count
47799 + */
47800 + set_extent(&new_ext, UNALLOCATED_EXTENT_START, count);
47801 + init_new_extent(&idata, &new_ext, 1);
47802 + result = insert_into_item(coord, uf_coord->lh, key, &idata, 0);
47803 + uf_coord->valid = 0;
47804 + if (result)
47805 + return result;
47806 + break;
47807 +
47808 + default:
47809 + return RETERR(-EIO);
47810 + }
47811 +
47812 + /*
47813 + * make sure that we hold long term locked twig node containing all
47814 + * jnodes we are about to capture
47815 + */
47816 + check_jnodes(uf_coord->lh->node, key, count);
47817 +
47818 + /*
47819 + * assign fake block numbers to all jnodes. FIXME: make sure whether
47820 + * twig node containing inserted extent item is locked
47821 + */
47822 + block = fake_blocknr_unformatted(count);
47823 + for (i = 0; i < count; i ++, block ++) {
47824 + node = jnodes[i];
47825 + spin_lock_jnode(node);
47826 + JF_SET(node, JNODE_CREATED);
47827 + jnode_set_block(node, &block);
47828 + result = try_capture(node, ZNODE_WRITE_LOCK, 0);
47829 + BUG_ON(result != 0);
47830 + jnode_make_dirty_locked(node);
47831 + spin_unlock_jnode(node);
47832 + }
47833 + return count;
47834 +}
47835 +
47836 +/**
47837 + * insert_first_hole - insert hole extent into tree
47838 + * @coord: coord to insert the new item at
47839 + * @lh: longterm lock handle used for the insertion
47840 + * @key: key of the first byte of the data to be written; the hole
47841 + * will cover the file from offset 0 up to this key
47842 + *
47843 + */
47844 +static int insert_first_hole(coord_t *coord, lock_handle *lh,
47845 + const reiser4_key *key)
47846 +{
47847 + reiser4_extent new_ext;
47848 + reiser4_item_data idata;
47849 + reiser4_key item_key;
47850 + reiser4_block_nr hole_width;
47851 +
47852 + /* @coord must be set for inserting of new item */
47853 + assert("vs-711", coord_is_between_items(coord));
47854 +
47855 + item_key = *key;
47856 + set_key_offset(&item_key, 0ull);
47857 +
47858 + hole_width = ((get_key_offset(key) + current_blocksize - 1) >>
47859 + current_blocksize_bits);
47860 + assert("vs-710", hole_width > 0);
47861 +
47862 + /* compose body of hole extent and insert item into tree */
47863 + set_extent(&new_ext, HOLE_EXTENT_START, hole_width);
47864 + init_new_extent(&idata, &new_ext, 1);
47865 + return insert_extent_by_coord(coord, &idata, &item_key, lh);
47866 +}
47867 +
47868 +
47869 +/**
47870 + * insert_first_extent - insert first file item
47871 + * @uf_coord: coord to start insertion from
47872 + * @key: key of the first byte of the data to be written
47873 + * @jnodes: array of jnodes
47874 + * @count: number of jnodes in the array
47875 + * @inode: inode of the file
47876 + *
47877 + * There are no items of file @inode in the tree yet. Insert an unallocated
47878 + * extent of width @count into the tree, or a hole extent if the write does
47879 + * not start at the beginning of the file. Assign fake block numbers to the
47880 + * jnodes corresponding to the inserted extent. Returns the number of jnodes or an error code.
47881 + */
47882 +static int insert_first_extent(uf_coord_t *uf_coord, const reiser4_key *key,
47883 + jnode **jnodes, int count,
47884 + struct inode *inode)
47885 +{
47886 + int result;
47887 + int i;
47888 + reiser4_extent new_ext;
47889 + reiser4_item_data idata;
47890 + reiser4_block_nr block;
47891 + unix_file_info_t *uf_info;
47892 + jnode *node;
47893 +
47894 + /* first extent insertion starts at leaf level */
47895 + assert("vs-719", znode_get_level(uf_coord->coord.node) == LEAF_LEVEL);
47896 + assert("vs-711", coord_is_between_items(&uf_coord->coord));
47897 +
47898 + if (get_key_offset(key) != 0) {
47899 + result = insert_first_hole(&uf_coord->coord, uf_coord->lh, key);
47900 + uf_coord->valid = 0;
47901 + uf_info = unix_file_inode_data(inode);
47902 +
47903 + /*
47904 + * first item insertion is only possible when writing to empty
47905 + * file or performing tail conversion
47906 + */
47907 + assert("", (uf_info->container == UF_CONTAINER_EMPTY ||
47908 + (inode_get_flag(inode, REISER4_PART_MIXED) &&
47909 + inode_get_flag(inode, REISER4_PART_IN_CONV))));
47910 +
47911 + /* if file was empty - update its state */
47912 + if (result == 0 && uf_info->container == UF_CONTAINER_EMPTY)
47913 + uf_info->container = UF_CONTAINER_EXTENTS;
47914 + return result;
47915 + }
47916 +
47917 + if (count == 0)
47918 + return 0;
47919 +
47920 + result = DQUOT_ALLOC_BLOCK_NODIRTY(mapping_jnode(jnodes[0])->host, count);
47921 + BUG_ON(result != 0);
47922 +
47923 + /*
47924 + * prepare for tree modification: compose body of item and item data
47925 + * structure needed for insertion
47926 + */
47927 + set_extent(&new_ext, UNALLOCATED_EXTENT_START, count);
47928 + init_new_extent(&idata, &new_ext, 1);
47929 +
47930 + /* insert extent item into the tree */
47931 + result = insert_extent_by_coord(&uf_coord->coord, &idata, key,
47932 + uf_coord->lh);
47933 + if (result)
47934 + return result;
47935 +
47936 + /*
47937 + * make sure that we hold long term locked twig node containing all
47938 + * jnodes we are about to capture
47939 + */
47940 + check_jnodes(uf_coord->lh->node, key, count);
47941 + /*
47942 + * assign fake block numbers to all jnodes, capture and mark them dirty
47943 + */
47944 + block = fake_blocknr_unformatted(count);
47945 + for (i = 0; i < count; i ++, block ++) {
47946 + node = jnodes[i];
47947 + spin_lock_jnode(node);
47948 + JF_SET(node, JNODE_CREATED);
47949 + jnode_set_block(node, &block);
47950 + result = try_capture(node, ZNODE_WRITE_LOCK, 0);
47951 + BUG_ON(result != 0);
47952 + jnode_make_dirty_locked(node);
47953 + spin_unlock_jnode(node);
47954 + }
47955 +
47956 + /*
47957 + * invalidate coordinate, research must be performed to continue
47958 + * because write will continue on twig level
47959 + */
47960 + uf_coord->valid = 0;
47961 + return count;
47962 +}
47963 +
47964 +/**
47965 + * plug_hole - replace part of a hole extent with an unallocated extent
47966 + * @uf_coord: extended coord set to the hole unit
47967 + * @key: key of the block to be plugged
47968 + * @how: out parameter recording which of the cases below was taken
47969 + * (see the summary comment after this function)
47970 + *
47971 + * Creates an unallocated extent of width 1 within a hole. In worst case two
47972 + * additional extents can be created.
47973 + */
47974 +static int plug_hole(uf_coord_t *uf_coord, const reiser4_key *key, int *how)
47975 +{
47976 + struct replace_handle rh;
47977 + reiser4_extent *ext;
47978 + reiser4_block_nr width, pos_in_unit;
47979 + coord_t *coord;
47980 + extent_coord_extension_t *ext_coord;
47981 + int return_inserted_position;
47982 +
47983 + check_uf_coord(uf_coord, key);
47984 +
47985 + rh.coord = coord_by_uf_coord(uf_coord);
47986 + rh.lh = uf_coord->lh;
47987 + rh.flags = 0;
47988 +
47989 + coord = coord_by_uf_coord(uf_coord);
47990 + ext_coord = ext_coord_by_uf_coord(uf_coord);
47991 + ext = ext_by_ext_coord(uf_coord);
47992 +
47993 + width = ext_coord->width;
47994 + pos_in_unit = ext_coord->pos_in_unit;
47995 +
47996 + *how = 0;
47997 + if (width == 1) {
47998 + set_extent(ext, UNALLOCATED_EXTENT_START, 1);
47999 + znode_make_dirty(coord->node);
48000 + /* update uf_coord */
48001 + ON_DEBUG(ext_coord->extent = *ext);
48002 + *how = 1;
48003 + return 0;
48004 + } else if (pos_in_unit == 0) {
48005 + /* we deal with first element of extent */
48006 + if (coord->unit_pos) {
48007 + /* there is an extent to the left */
48008 + if (state_of_extent(ext - 1) == UNALLOCATED_EXTENT) {
48009 + /*
48010 + * left neighboring unit is an unallocated
48011 + * extent. Increase its width and decrease
48012 + * width of hole
48013 + */
48014 + extent_set_width(ext - 1,
48015 + extent_get_width(ext - 1) + 1);
48016 + extent_set_width(ext, width - 1);
48017 + znode_make_dirty(coord->node);
48018 +
48019 + /* update coord extension */
48020 + coord->unit_pos--;
48021 + ext_coord->width = extent_get_width(ext - 1);
48022 + ext_coord->pos_in_unit = ext_coord->width - 1;
48023 + ext_coord->ext_offset -= sizeof(reiser4_extent);
48024 + ON_DEBUG(ext_coord->extent =
48025 + *extent_by_coord(coord));
48026 + *how = 2;
48027 + return 0;
48028 + }
48029 + }
48030 + /* extent for replace */
48031 + set_extent(&rh.overwrite, UNALLOCATED_EXTENT_START, 1);
48032 + /* extent to be inserted */
48033 + set_extent(&rh.new_extents[0], HOLE_EXTENT_START, width - 1);
48034 + rh.nr_new_extents = 1;
48035 +
48036 + /* have replace_extent return with @coord and @uf_coord->lh
48037 + set to the unit which was replaced */
48038 + return_inserted_position = 0;
48039 + *how = 3;
48040 + } else if (pos_in_unit == width - 1) {
48041 + /* we deal with last element of extent */
48042 + if (coord->unit_pos < nr_units_extent(coord) - 1) {
48043 + /* there is an extent unit to the right */
48044 + if (state_of_extent(ext + 1) == UNALLOCATED_EXTENT) {
48045 + /*
48046 + * right neighboring unit is an unallocated
48047 + * extent. Increase its width and decrease
48048 + * width of hole
48049 + */
48050 + extent_set_width(ext + 1,
48051 + extent_get_width(ext + 1) + 1);
48052 + extent_set_width(ext, width - 1);
48053 + znode_make_dirty(coord->node);
48054 +
48055 + /* update coord extension */
48056 + coord->unit_pos++;
48057 + ext_coord->width = extent_get_width(ext + 1);
48058 + ext_coord->pos_in_unit = 0;
48059 + ext_coord->ext_offset += sizeof(reiser4_extent);
48060 + ON_DEBUG(ext_coord->extent =
48061 + *extent_by_coord(coord));
48062 + *how = 4;
48063 + return 0;
48064 + }
48065 + }
48066 + /* extent for replace */
48067 + set_extent(&rh.overwrite, HOLE_EXTENT_START, width - 1);
48068 + /* extent to be inserted */
48069 + set_extent(&rh.new_extents[0], UNALLOCATED_EXTENT_START, 1);
48070 + rh.nr_new_extents = 1;
48071 +
48072 + /* have replace_extent return with @coord and @uf_coord->lh
48073 + set to the unit which was inserted */
48074 + return_inserted_position = 1;
48075 + *how = 5;
48076 + } else {
48077 + /* extent for replace */
48078 + set_extent(&rh.overwrite, HOLE_EXTENT_START, pos_in_unit);
48079 + /* extents to be inserted */
48080 + set_extent(&rh.new_extents[0], UNALLOCATED_EXTENT_START, 1);
48081 + set_extent(&rh.new_extents[1], HOLE_EXTENT_START,
48082 + width - pos_in_unit - 1);
48083 + rh.nr_new_extents = 2;
48084 +
48085 + /* have replace_extent return with @coord and @uf_coord->lh
48086 + set to the first of the units which were inserted */
48087 + return_inserted_position = 1;
48088 + *how = 6;
48089 + }
48090 + unit_key_by_coord(coord, &rh.paste_key);
48091 + set_key_offset(&rh.paste_key, get_key_offset(&rh.paste_key) +
48092 + extent_get_width(&rh.overwrite) * current_blocksize);
48093 +
48094 + uf_coord->valid = 0;
48095 + return replace_extent(&rh, return_inserted_position);
48096 +}
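+
+/*
+ * Summary of the @how values set above: 1 - hole unit of width 1
+ * overwritten in place; 2 - first block glued to an unallocated unit on
+ * the left; 3 - first block of a wider hole split off; 4 - last block
+ * glued to an unallocated unit on the right; 5 - last block of a wider
+ * hole split off; 6 - block in the middle, hole split into three units.
+ */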
48097 +
48098 +/**
48099 + * overwrite_one_block - assign block number to one jnode
48100 + * @uf_coord: extended coord set to the extent unit
48101 + * @key: key of the block
48102 + * @node: jnode of the page
48103 + * @hole_plugged: set to 1 if a hole is plugged
48104 + *
48105 + * If @node corresponds to a hole extent, plug the hole and assign a fake
48106 + * block number; if to an allocated extent, assign the extent's block number.
48107 + */
48108 +static int overwrite_one_block(uf_coord_t *uf_coord, const reiser4_key *key,
48109 + jnode *node, int *hole_plugged)
48110 +{
48111 + int result;
48112 + extent_coord_extension_t *ext_coord;
48113 + reiser4_extent *ext;
48114 + reiser4_block_nr block;
48115 + int how;
48116 +
48117 + assert("vs-1312", uf_coord->coord.between == AT_UNIT);
48118 +
48119 + result = 0;
48120 + ext_coord = ext_coord_by_uf_coord(uf_coord);
48121 + ext = ext_by_ext_coord(uf_coord);
48122 + assert("", state_of_extent(ext) != UNALLOCATED_EXTENT);
48123 +
48124 + switch (state_of_extent(ext)) {
48125 + case ALLOCATED_EXTENT:
48126 + block = extent_get_start(ext) + ext_coord->pos_in_unit;
48127 + break;
48128 +
48129 + case HOLE_EXTENT:
48130 + result = DQUOT_ALLOC_BLOCK_NODIRTY(mapping_jnode(node)->host, 1);
48131 + BUG_ON(result != 0);
48132 + result = plug_hole(uf_coord, key, &how);
48133 + if (result)
48134 + return result;
48135 + block = fake_blocknr_unformatted(1);
48136 + if (hole_plugged)
48137 + *hole_plugged = 1;
48138 + JF_SET(node, JNODE_CREATED);
48139 + break;
48140 +
48141 + default:
48142 + return RETERR(-EIO);
48143 + }
48144 +
48145 + jnode_set_block(node, &block);
48146 + return 0;
48147 +}
48148 +
48149 +/**
48150 + * move_coord - move coordinate forward
48151 + * @uf_coord: extended coord to move
48152 + *
48153 + * Move the coordinate one data block pointer forward. Return 1 if the coord
48154 + * is invalid or the end of the item was reached, 0 otherwise.
48155 + */
48156 +static int move_coord(uf_coord_t *uf_coord)
48157 +{
48158 + extent_coord_extension_t *ext_coord;
48159 +
48160 + if (uf_coord->valid == 0)
48161 + return 1;
48162 + ext_coord = &uf_coord->extension.extent;
48163 + ext_coord->pos_in_unit ++;
48164 + if (ext_coord->pos_in_unit < ext_coord->width)
48165 + /* coordinate moved within the unit */
48166 + return 0;
48167 +
48168 + /* end of unit is reached. Try to move to next unit */
48169 + ext_coord->pos_in_unit = 0;
48170 + uf_coord->coord.unit_pos ++;
48171 + if (uf_coord->coord.unit_pos < ext_coord->nr_units) {
48172 + /* coordinate moved to next unit */
48173 + ext_coord->ext_offset += sizeof(reiser4_extent);
48174 + ext_coord->width =
48175 + extent_get_width(ext_by_offset
48176 + (uf_coord->coord.node,
48177 + ext_coord->ext_offset));
48178 + ON_DEBUG(ext_coord->extent =
48179 + *ext_by_offset(uf_coord->coord.node,
48180 + ext_coord->ext_offset));
48181 + return 0;
48182 + }
48183 + /* end of item is reached */
48184 + uf_coord->valid = 0;
48185 + return 1;
48186 +}
48187 +
48188 +/**
48189 + * overwrite_extent - assign block numbers to jnodes over existing extents
48190 + * @uf_coord: extended coord set to the first extent unit
48191 + * @key: key of the first block
48192 + * @jnodes: array of jnodes; @count: their number
48193 + * @plugged_hole: set to 1 if a hole is plugged
48194 + *
48195 + * Returns number of handled jnodes or error code.
48196 + */
48194 +static int overwrite_extent(uf_coord_t *uf_coord, const reiser4_key *key,
48195 + jnode **jnodes, int count, int *plugged_hole)
48196 +{
48197 + int result;
48198 + reiser4_key k;
48199 + int i;
48200 + jnode *node;
48201 +
48202 + k = *key;
48203 + for (i = 0; i < count; i ++) {
48204 + node = jnodes[i];
48205 + if (*jnode_get_block(node) == 0) {
48206 + result = overwrite_one_block(uf_coord, &k, node, plugged_hole);
48207 + if (result)
48208 + return result;
48209 + }
48210 + /*
48211 + * make sure that we hold the long term locked twig node
48212 + * containing the jnode we are about to capture
48213 + */
48214 + check_jnodes(uf_coord->lh->node, &k, 1);
48215 + /*
48216 + * capture the jnode and mark it dirty; its block number was
48217 + * assigned above by overwrite_one_block if it had none
48218 + */
48219 + spin_lock_jnode(node);
48220 + result = try_capture(node, ZNODE_WRITE_LOCK, 0);
48221 + BUG_ON(result != 0);
48222 + jnode_make_dirty_locked(node);
48223 + spin_unlock_jnode(node);
48224 +
48225 + if (uf_coord->valid == 0)
48226 + return i + 1;
48227 +
48228 + check_uf_coord(uf_coord, &k);
48229 +
48230 + if (move_coord(uf_coord)) {
48231 + /*
48232 + * failed to move to the next node pointer. Either end
48233 + * of file or end of twig node is reached. In the latter
48234 + * case we might go to the right neighbor.
48235 + */
48236 + uf_coord->valid = 0;
48237 + return i + 1;
48238 + }
48239 + set_key_offset(&k, get_key_offset(&k) + PAGE_CACHE_SIZE);
48240 + }
48241 +
48242 + return count;
48243 +}
48244 +
48245 +void init_uf_coord(uf_coord_t *uf_coord, lock_handle *lh);
48246 +
48247 +/**
48248 + * update_extent - map one jnode of the file into the tree
48249 + * @inode: inode of file
48250 + * @node: jnode of the page to be mapped
48251 + * @pos: offset in the file
48252 + * @plugged_hole: set to 1 if a hole is plugged
48253 + *
48254 + */
48255 +int update_extent(struct inode *inode, jnode *node, loff_t pos,
48256 + int *plugged_hole)
48257 +{
48258 + int result;
48259 + znode *loaded;
48260 + uf_coord_t uf_coord;
48261 + coord_t *coord;
48262 + lock_handle lh;
48263 + reiser4_key key;
48264 +
48265 + assert("", lock_counters()->d_refs == 0);
48266 +
48267 + key_by_inode_and_offset_common(inode, pos, &key);
48268 +
48269 + init_uf_coord(&uf_coord, &lh);
48270 + coord = &uf_coord.coord;
48271 + result = find_file_item_nohint(coord, &lh, &key,
48272 + ZNODE_WRITE_LOCK, inode);
48273 + if (IS_CBKERR(result)) {
48274 + assert("", lock_counters()->d_refs == 0);
48275 + return result;
48276 + }
48277 +
48278 + result = zload(coord->node);
48279 + BUG_ON(result != 0);
48280 + loaded = coord->node;
48281 +
48282 + if (coord->between == AFTER_UNIT) {
48283 + /*
48284 + * append existing extent item with unallocated extent of width
48285 + * nr_jnodes
48286 + */
48287 + init_coord_extension_extent(&uf_coord,
48288 + get_key_offset(&key));
48289 + result = append_last_extent(&uf_coord, &key,
48290 + &node, 1);
48291 + } else if (coord->between == AT_UNIT) {
48292 + /*
48293 + * overwrite
48294 + * not optimal yet. Will be optimized if new write will show
48295 + * performance win.
48296 + */
48297 + init_coord_extension_extent(&uf_coord,
48298 + get_key_offset(&key));
48299 + result = overwrite_extent(&uf_coord, &key,
48300 + &node, 1, plugged_hole);
48301 + } else {
48302 + /*
48303 + * there are no items of this file in the tree yet. Create
48304 + * first item of the file inserting one unallocated extent of
48305 + * width nr_jnodes
48306 + */
48307 + result = insert_first_extent(&uf_coord, &key, &node, 1, inode);
48308 + }
48309 + assert("", result == 1 || result < 0);
48310 + zrelse(loaded);
48311 + done_lh(&lh);
48312 + assert("", lock_counters()->d_refs == 0);
48313 + return (result == 1) ? 0 : result;
48314 +}
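+
+/*
+ * A hypothetical usage sketch of update_extent() (the calling context is
+ * illustrative, not taken from this patch): to map one dirty page of
+ * @inode into the tree a caller could do
+ *
+ *	jnode *j = jnode_of_page(page);
+ *	int plugged = 0;
+ *	int ret = update_extent(inode, j,
+ *				(loff_t)page->index << PAGE_CACHE_SHIFT,
+ *				&plugged);
+ *
+ * On success exactly one jnode is handled: the internal result of 1 is
+ * translated to the returned 0.
+ */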
48315 +
48316 +/**
48317 + * update_extents - map an array of jnodes of the file into the tree
48318 + * @file: file to write to
48319 + * @jnodes: array of jnodes
48320 + * @count: number of jnodes in the array
48321 + * @pos: offset in the file
48322 + *
48323 + */
48324 +static int update_extents(struct file *file, jnode **jnodes, int count, loff_t pos)
48325 +{
48326 + struct inode *inode;
48327 + struct hint hint;
48328 + reiser4_key key;
48329 + int result;
48330 + znode *loaded;
48331 +
48332 + result = load_file_hint(file, &hint);
48333 + BUG_ON(result != 0);
48334 +
48335 + inode = file->f_dentry->d_inode;
48336 + if (count != 0)
48337 + /*
48338 + * count == 0 is special case: expanding truncate
48339 + */
48340 + pos = (loff_t)index_jnode(jnodes[0]) << PAGE_CACHE_SHIFT;
48341 + key_by_inode_and_offset_common(inode, pos, &key);
48342 +
48343 + assert("", lock_counters()->d_refs == 0);
48344 +
48345 + do {
48346 + result = find_file_item(&hint, &key, ZNODE_WRITE_LOCK, inode);
48347 + if (IS_CBKERR(result)) {
48348 + assert("", lock_counters()->d_refs == 0);
48349 + return result;
48350 + }
48351 +
48352 + result = zload(hint.ext_coord.coord.node);
48353 + BUG_ON(result != 0);
48354 + loaded = hint.ext_coord.coord.node;
48355 +
48356 + if (hint.ext_coord.coord.between == AFTER_UNIT) {
48357 + /*
48358 + * append existing extent item with unallocated extent
48359 + * of width nr_jnodes
48360 + */
48361 + if (hint.ext_coord.valid == 0)
48362 + /* NOTE: get statistics on this */
48363 + init_coord_extension_extent(&hint.ext_coord,
48364 + get_key_offset(&key));
48365 + result = append_last_extent(&hint.ext_coord, &key,
48366 + jnodes, count);
48367 + } else if (hint.ext_coord.coord.between == AT_UNIT) {
48368 + /*
48369 + * overwrite
48370 + * not optimal yet. Will be optimized if new write will
48371 + * show performance win.
48372 + */
48373 + if (hint.ext_coord.valid == 0)
48374 + /* NOTE: get statistics on this */
48375 + init_coord_extension_extent(&hint.ext_coord,
48376 + get_key_offset(&key));
48377 + result = overwrite_extent(&hint.ext_coord, &key,
48378 + jnodes, count, NULL);
48379 + } else {
48380 + /*
48381 + * there are no items of this file in the tree
48382 + * yet. Create the first item of the file by inserting one
48383 + * unallocated extent of width nr_jnodes
48384 + */
48385 + result = insert_first_extent(&hint.ext_coord, &key,
48386 + jnodes, count, inode);
48387 + }
48388 + zrelse(loaded);
48389 + if (result < 0) {
48390 + done_lh(hint.ext_coord.lh);
48391 + break;
48392 + }
48393 +
48394 + jnodes += result;
48395 + count -= result;
48396 + set_key_offset(&key, get_key_offset(&key) + result * PAGE_CACHE_SIZE);
48397 +
48398 + /* seal and unlock znode */
48399 + if (hint.ext_coord.valid)
48400 + set_hint(&hint, &key, ZNODE_WRITE_LOCK);
48401 + else
48402 + unset_hint(&hint);
48403 +
48404 + } while (count > 0);
48405 +
48406 + save_file_hint(file, &hint);
48407 + assert("", lock_counters()->d_refs == 0);
48408 + return result;
48409 +}
48410 +
48411 +/**
48412 + * write_extent_reserve_space - reserve space for extent write operation
48413 + * @inode: inode of file to be written to
48414 + *
48415 + * Estimates and reserves space which may be required for writing
48416 + * WRITE_GRANULARITY pages of file.
48417 + */
48418 +static int write_extent_reserve_space(struct inode *inode)
48419 +{
48420 + __u64 count;
48421 + reiser4_tree *tree;
48422 +
48423 + /*
48424 + * to write WRITE_GRANULARITY pages to a file by extents we have to
48425 + * reserve disk space for:
48426 +
48427 + * 1. find_file_item may have to insert empty node to the tree (empty
48428 + * leaf node between two extent items). This requires 1 block and
48429 + * number of blocks which are necessary to perform insertion of an
48430 + * internal item into twig level.
48431 +
48432 + * 2. for each of written pages there might be needed 1 block and
48433 + * number of blocks which might be necessary to perform insertion of or
48434 + * paste to an extent item.
48435 +
48436 + * 3. stat data update
48437 + */
48438 + tree = tree_by_inode(inode);
48439 + count = estimate_one_insert_item(tree) +
48440 + WRITE_GRANULARITY * (1 + estimate_one_insert_into_item(tree)) +
48441 + estimate_one_insert_item(tree);
48442 + grab_space_enable();
48443 + return reiser4_grab_space(count, 0 /* flags */);
48444 +}
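+
+/*
+ * The estimate above reads as count = I + G * (1 + P) + I, where I =
+ * estimate_one_insert_item() (the empty leaf insertion of step 1 and the
+ * stat data update of step 3), G = WRITE_GRANULARITY and P =
+ * estimate_one_insert_into_item() (the per-page insert or paste of step 2).
+ */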
48445 +
48446 +/**
48447 + * write_extent - write method of extent item plugin
48448 + * @file: file to write to
48449 + * @buf: address of user-space buffer
48450 + * @count: number of bytes to write
48451 + * @pos: position in file to write to
48452 + *
48453 + */
48454 +ssize_t write_extent(struct file *file, const char __user *buf, size_t count,
48455 + loff_t *pos)
48456 +{
48457 + int have_to_update_extent;
48458 + int nr_pages;
48459 + struct page *page;
48460 + jnode *jnodes[WRITE_GRANULARITY + 1];
48461 + struct inode *inode;
48462 + unsigned long index;
48463 + unsigned long end;
48464 + int i;
48465 + int to_page, page_off;
48466 + size_t left, written;
48467 + int result;
48468 +
48469 + inode = file->f_dentry->d_inode;
48470 + if (write_extent_reserve_space(inode))
48471 + return RETERR(-ENOSPC);
48472 +
48473 + if (count == 0) {
48474 + /* truncate case */
48475 + update_extents(file, jnodes, 0, *pos);
48476 + return 0;
48477 + }
48478 +
48479 + BUG_ON(get_current_context()->trans->atom != NULL);
48480 +
48481 + index = *pos >> PAGE_CACHE_SHIFT;
48482 + /* calculate number of pages which are to be written */
48483 + end = ((*pos + count - 1) >> PAGE_CACHE_SHIFT);
48484 + nr_pages = end - index + 1;
48485 + assert("", nr_pages <= WRITE_GRANULARITY + 1);
48486 +
48487 + /* get pages and jnodes */
48488 + for (i = 0; i < nr_pages; i ++) {
48489 + page = find_or_create_page(inode->i_mapping, index + i, get_gfp_mask());
48490 + if (page == NULL) {
48491 + while(i --) {
48492 + unlock_page(jnode_page(jnodes[i]));
48493 + page_cache_release(jnode_page(jnodes[i]));
48494 + }
48495 + return RETERR(-ENOMEM);
48496 + }
48497 +
48498 + jnodes[i] = jnode_of_page(page);
48499 + if (IS_ERR(jnodes[i])) {
48500 + unlock_page(page);
48501 + page_cache_release(page);
48502 + while (i --) {
48503 + jput(jnodes[i]);
48504 + page_cache_release(jnode_page(jnodes[i]));
48505 + }
48506 + return RETERR(-ENOMEM);
48507 + }
48508 + /* prevent jnode and page from disconnecting */
48509 + JF_SET(jnodes[i], JNODE_WRITE_PREPARED);
48510 + unlock_page(page);
48511 + }
48512 +
48513 + BUG_ON(get_current_context()->trans->atom != NULL);
48514 +
48515 + have_to_update_extent = 0;
48516 +
48517 + left = count;
48518 + page_off = (*pos & (PAGE_CACHE_SIZE - 1));
48519 + for (i = 0; i < nr_pages; i ++) {
48520 + to_page = PAGE_CACHE_SIZE - page_off;
48521 + if (to_page > left)
48522 + to_page = left;
48523 + page = jnode_page(jnodes[i]);
48524 + if (((loff_t)page->index << PAGE_CACHE_SHIFT) < inode->i_size &&
48525 + !PageUptodate(page) && to_page != PAGE_CACHE_SIZE) {
48526 + /*
48527 + * the above is not optimal for a partial write to the
48528 + * last page of a file whose size is not at a page
48529 + * boundary
48530 + */
48531 + lock_page(page);
48532 + if (!PageUptodate(page)) {
48533 + result = readpage_unix_file(NULL, page);
48534 + BUG_ON(result != 0);
48535 + /* wait for read completion */
48536 + lock_page(page);
48537 + BUG_ON(!PageUptodate(page));
48538 + unlock_page(page);
48539 + } else
48540 + result = 0;
48541 + }
48542 +
48543 + BUG_ON(get_current_context()->trans->atom != NULL);
48544 + fault_in_pages_readable(buf, to_page);
48545 + BUG_ON(get_current_context()->trans->atom != NULL);
48546 +
48547 + lock_page(page);
48548 + if (!PageUptodate(page) && to_page != PAGE_CACHE_SIZE) {
48549 + void *kaddr;
48550 +
48551 + kaddr = kmap_atomic(page, KM_USER0);
48552 + memset(kaddr, 0, page_off);
48553 + memset(kaddr + page_off + to_page, 0,
48554 + PAGE_CACHE_SIZE - (page_off + to_page));
48555 + flush_dcache_page(page);
48556 + kunmap_atomic(kaddr, KM_USER0);
48557 + }
48558 +
48559 + written = filemap_copy_from_user(page, page_off, buf, to_page);
48560 + if (written != to_page) {
48561 + unlock_page(page);
48562 + page_cache_release(page);
48563 + nr_pages = i;
48564 + jput(jnodes[i]);
48565 + result = RETERR(-EFAULT);
48566 + break;
48567 + }
48568 + flush_dcache_page(page);
48569 + set_page_dirty_internal(page);
48570 + unlock_page(page);
48571 + mark_page_accessed(page);
48572 + SetPageUptodate(page);
48573 + page_cache_release(page);
48574 +
48575 + if (jnodes[i]->blocknr == 0)
48576 + have_to_update_extent ++;
48577 +
48578 + page_off = 0;
48579 + buf += to_page;
48580 + left -= to_page;
48581 + BUG_ON(get_current_context()->trans->atom != NULL);
48582 + }
48583 +
48584 + if (have_to_update_extent) {
48585 + update_extents(file, jnodes, nr_pages, *pos);
48586 + } else {
48587 + for (i = 0; i < nr_pages; i ++) {
48588 + spin_lock_jnode(jnodes[i]);
48589 + result = try_capture(jnodes[i], ZNODE_WRITE_LOCK, 0);
48590 + BUG_ON(result != 0);
48591 + jnode_make_dirty_locked(jnodes[i]);
48592 + spin_unlock_jnode(jnodes[i]);
48593 + }
48594 + }
48595 +
48596 + for (i = 0; i < nr_pages; i ++) {
48597 + JF_CLR(jnodes[i], JNODE_WRITE_PREPARED);
48598 + jput(jnodes[i]);
48599 + }
48600 +
48601 + /* the only error handled so far is EFAULT on copy_from_user */
48602 + return (count - left) ? (count - left) : -EFAULT;
48603 +}
48604 +
48605 +static inline void zero_page(struct page *page)
48606 +{
48607 + char *kaddr = kmap_atomic(page, KM_USER0);
48608 +
48609 + memset(kaddr, 0, PAGE_CACHE_SIZE);
48610 + flush_dcache_page(page);
48611 + kunmap_atomic(kaddr, KM_USER0);
48612 + SetPageUptodate(page);
48613 + unlock_page(page);
48614 +}
48615 +
48616 +static int
48617 +do_readpage_extent(reiser4_extent * ext, reiser4_block_nr pos,
48618 + struct page *page)
48619 +{
48620 + jnode *j;
48621 + struct address_space *mapping;
48622 + unsigned long index;
48623 + oid_t oid;
48624 + reiser4_block_nr block;
48625 +
48626 + mapping = page->mapping;
48627 + oid = get_inode_oid(mapping->host);
48628 + index = page->index;
48629 +
48630 + switch (state_of_extent(ext)) {
48631 + case HOLE_EXTENT:
48632 + /*
48633 + * it is possible to have hole page with jnode, if page was
48634 + * eflushed previously.
48635 + */
48636 + j = jfind(mapping, index);
48637 + if (j == NULL) {
48638 + zero_page(page);
48639 + return 0;
48640 + }
48641 + spin_lock_jnode(j);
48642 + if (!jnode_page(j)) {
48643 + jnode_attach_page(j, page);
48644 + } else {
48645 + BUG_ON(jnode_page(j) != page);
48646 + assert("vs-1504", jnode_page(j) == page);
48647 + }
48648 + block = *jnode_get_io_block(j);
48649 + spin_unlock_jnode(j);
48650 + if (block == 0) {
48651 + zero_page(page);
48652 + jput(j);
48653 + return 0;
48654 + }
48655 + break;
48656 +
48657 + case ALLOCATED_EXTENT:
48658 + j = jnode_of_page(page);
48659 + if (IS_ERR(j))
48660 + return PTR_ERR(j);
48661 + if (*jnode_get_block(j) == 0) {
48662 + reiser4_block_nr blocknr;
48663 +
48664 + blocknr = extent_get_start(ext) + pos;
48665 + jnode_set_block(j, &blocknr);
48666 + } else
48667 + assert("vs-1403",
48668 + j->blocknr == extent_get_start(ext) + pos);
48669 + break;
48670 +
48671 + case UNALLOCATED_EXTENT:
48672 + j = jfind(mapping, index);
48673 + assert("nikita-2688", j);
48674 + assert("vs-1426", jnode_page(j) == NULL);
48675 +
48676 + spin_lock_jnode(j);
48677 + jnode_attach_page(j, page);
48678 + spin_unlock_jnode(j);
48679 + break;
48680 +
48681 + default:
48682 + warning("vs-957", "wrong extent\n");
48683 + return RETERR(-EIO);
48684 + }
48685 +
48686 + BUG_ON(j == 0);
48687 + page_io(page, j, READ, get_gfp_mask());
48688 + jput(j);
48689 + return 0;
48690 +}
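+
+/*
+ * To summarize the switch above: a hole page is zeroed unless a
+ * previously eflushed jnode with an assigned io block exists for it;
+ * allocated and unallocated extents always fall through to
+ * page_io(page, j, READ, ...) against the block recorded in the jnode.
+ */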
48691 +
48692 +static int
48693 +move_coord_pages(coord_t * coord, extent_coord_extension_t * ext_coord,
48694 + unsigned count)
48695 +{
48696 + reiser4_extent *ext;
48697 +
48698 + ext_coord->expected_page += count;
48699 +
48700 + ext = ext_by_offset(coord->node, ext_coord->ext_offset);
48701 +
48702 + do {
48703 + if (ext_coord->pos_in_unit + count < ext_coord->width) {
48704 + ext_coord->pos_in_unit += count;
48705 + break;
48706 + }
48707 +
48708 + if (coord->unit_pos == ext_coord->nr_units - 1) {
48709 + coord->between = AFTER_UNIT;
48710 + return 1;
48711 + }
48712 +
48713 + /* shift to next unit */
48714 + count -= (ext_coord->width - ext_coord->pos_in_unit);
48715 + coord->unit_pos++;
48716 + ext_coord->pos_in_unit = 0;
48717 + ext_coord->ext_offset += sizeof(reiser4_extent);
48718 + ext++;
48719 + ON_DEBUG(ext_coord->extent = *ext);
48720 + ext_coord->width = extent_get_width(ext);
48721 + } while (1);
48722 +
48723 + return 0;
48724 +}
48725 +
48726 +static int readahead_readpage_extent(void *vp, struct page *page)
48727 +{
48728 + int result;
48729 + uf_coord_t *uf_coord;
48730 + coord_t *coord;
48731 + extent_coord_extension_t *ext_coord;
48732 +
48733 + uf_coord = vp;
48734 + coord = &uf_coord->coord;
48735 +
48736 + if (coord->between != AT_UNIT) {
48737 + unlock_page(page);
48738 + return RETERR(-EINVAL);
48739 + }
48740 +
48741 + ext_coord = &uf_coord->extension.extent;
48742 + if (ext_coord->expected_page != page->index) {
48743 + /* read_cache_pages skipped a few pages. Try to adjust the coord to the page */
48744 + assert("vs-1269", page->index > ext_coord->expected_page);
48745 + if (move_coord_pages
48746 + (coord, ext_coord,
48747 + page->index - ext_coord->expected_page)) {
48748 + /* extent pointing to this page is not here */
48749 + unlock_page(page);
48750 + return RETERR(-EINVAL);
48751 + }
48752 +
48753 + assert("vs-1274", offset_is_in_unit(coord,
48754 + (loff_t) page->
48755 + index << PAGE_CACHE_SHIFT));
48756 + ext_coord->expected_page = page->index;
48757 + }
48758 +
48759 + assert("vs-1281", page->index == ext_coord->expected_page);
48760 + result =
48761 + do_readpage_extent(ext_by_ext_coord(uf_coord),
48762 + ext_coord->pos_in_unit, page);
48763 + if (!result)
48764 + move_coord_pages(coord, ext_coord, 1);
48765 + return result;
48766 +}
48767 +
48768 +static int move_coord_forward(uf_coord_t *ext_coord)
48769 +{
48770 + coord_t *coord;
48771 + extent_coord_extension_t *extension;
48772 +
48773 + check_uf_coord(ext_coord, NULL);
48774 +
48775 + extension = &ext_coord->extension.extent;
48776 + extension->pos_in_unit++;
48777 + if (extension->pos_in_unit < extension->width)
48778 + /* stay within the same extent unit */
48779 + return 0;
48780 +
48781 + coord = &ext_coord->coord;
48782 +
48783 + /* try to move to the next extent unit */
48784 + coord->unit_pos++;
48785 + if (coord->unit_pos < extension->nr_units) {
48786 + /* went to the next extent unit */
48787 + reiser4_extent *ext;
48788 +
48789 + extension->pos_in_unit = 0;
48790 + extension->ext_offset += sizeof(reiser4_extent);
48791 + ext = ext_by_offset(coord->node, extension->ext_offset);
48792 + ON_DEBUG(extension->extent = *ext);
48793 + extension->width = extent_get_width(ext);
48794 + return 0;
48795 + }
48796 +
48797 + /* there are no units left in the item */
48798 + return 1;
48799 +}
48800 +
48801 +/* this is called by read_cache_pages for each of readahead pages */
48802 +static int extent_readpage_filler(void *data, struct page *page)
48803 +{
48804 + hint_t *hint;
48805 + loff_t offset;
48806 + reiser4_key key;
48807 + uf_coord_t *ext_coord;
48808 + int result;
48809 +
48810 + offset = (loff_t) page->index << PAGE_CACHE_SHIFT;
48811 + key_by_inode_and_offset_common(page->mapping->host, offset, &key);
48812 +
48813 + hint = (hint_t *) data;
48814 + ext_coord = &hint->ext_coord;
48815 +
48816 + BUG_ON(PageUptodate(page));
48817 + unlock_page(page);
48818 +
48819 + if (hint_validate(hint, &key, 1 /* check key */ , ZNODE_READ_LOCK) != 0) {
48820 + result = coord_by_key(current_tree, &key, &ext_coord->coord,
48821 + ext_coord->lh, ZNODE_READ_LOCK,
48822 + FIND_EXACT, TWIG_LEVEL,
48823 + TWIG_LEVEL, CBK_UNIQUE, NULL);
48824 + if (result != CBK_COORD_FOUND) {
48825 + unset_hint(hint);
48826 + return result;
48827 + }
48828 + ext_coord->valid = 0;
48829 + }
48830 +
48831 + if (zload(ext_coord->coord.node)) {
48832 + unset_hint(hint);
48833 + return RETERR(-EIO);
48834 + }
48835 + if (!item_is_extent(&ext_coord->coord)) {
48836 + /* tail conversion is running in parallel */
48837 + zrelse(ext_coord->coord.node);
48838 + unset_hint(hint);
48839 + return RETERR(-EIO);
48840 + }
48841 +
48842 + if (ext_coord->valid == 0)
48843 + init_coord_extension_extent(ext_coord, offset);
48844 +
48845 + check_uf_coord(ext_coord, &key);
48846 +
48847 + lock_page(page);
48848 + if (!PageUptodate(page)) {
48849 + result = do_readpage_extent(ext_by_ext_coord(ext_coord),
48850 + ext_coord->extension.extent.
48851 + pos_in_unit, page);
48852 + if (result)
48853 + unlock_page(page);
48854 + } else {
48855 + unlock_page(page);
48856 + result = 0;
48857 + }
48858 + if (!result && move_coord_forward(ext_coord) == 0) {
48859 + set_key_offset(&key, offset + PAGE_CACHE_SIZE);
48860 + set_hint(hint, &key, ZNODE_READ_LOCK);
48861 + } else
48862 + unset_hint(hint);
48863 + zrelse(ext_coord->coord.node);
48864 + return result;
48865 +}
48866 +
48867 +/* this is called by reiser4_readpages */
48868 +static void
48869 +extent_readpages_hook(struct address_space *mapping, struct list_head *pages,
48870 + void *data)
48871 +{
48872 + /* FIXME: try whether having reiser4_read_cache_pages improves anything */
48873 + read_cache_pages(mapping, pages, extent_readpage_filler, data);
48874 +}
48875 +
48876 +static int
48877 +call_page_cache_readahead(struct address_space *mapping, struct file *file,
48878 + hint_t * hint,
48879 + unsigned long page_nr,
48880 + unsigned long ra_pages, struct file_ra_state *ra)
48881 +{
48882 + reiser4_file_fsdata *fsdata;
48883 + int result;
48884 +
48885 + fsdata = reiser4_get_file_fsdata(file);
48886 + if (IS_ERR(fsdata))
48887 + return page_nr;
48888 + fsdata->ra2.data = hint;
48889 + fsdata->ra2.readpages = extent_readpages_hook;
48890 +
48891 + result = page_cache_readahead(mapping, ra, file, page_nr, ra_pages);
48892 + fsdata->ra2.readpages = NULL;
48893 + return result;
48894 +}
48895 +
48896 +/* this is called when readahead did not bring the page uptodate */
48897 +static int call_readpage(struct file *file, struct page *page)
48898 +{
48899 + int result;
48900 +
48901 + result = readpage_unix_file_nolock(file, page);
48902 + if (result)
48903 + return result;
48904 +
48905 + lock_page(page);
48906 + if (!PageUptodate(page)) {
48907 + unlock_page(page);
48908 + page_detach_jnode(page, page->mapping, page->index);
48909 + warning("jmacd-97178", "page is not up to date");
48910 + return RETERR(-EIO);
48911 + }
48912 + unlock_page(page);
48913 + return 0;
48914 +}
48915 +
48916 +static int filler(void *vp, struct page *page)
48917 +{
48918 + return readpage_unix_file_nolock(vp, page);
48919 +}
48920 +
48921 +/* Implements plugin->u.item.s.file.read operation for extent items. */
48922 +int read_extent(struct file *file, flow_t *flow, hint_t *hint)
48923 +{
48924 + int result;
48925 + struct page *page;
48926 + unsigned long cur_page, next_page;
48927 + unsigned long page_off, count;
48928 + struct address_space *mapping;
48929 + loff_t file_off;
48930 + uf_coord_t *uf_coord;
48931 + coord_t *coord;
48932 + extent_coord_extension_t *ext_coord;
48933 + unsigned long nr_pages, prev_page;
48934 + struct file_ra_state ra;
48935 + char *kaddr;
48936 +
48937 + assert("vs-1353", current_blocksize == PAGE_CACHE_SIZE);
48938 + assert("vs-572", flow->user == 1);
48939 + assert("vs-1351", flow->length > 0);
48940 +
48941 + uf_coord = &hint->ext_coord;
48942 +
48943 + check_uf_coord(uf_coord, NULL);
48944 + assert("vs-33", uf_coord->lh == &hint->lh);
48945 +
48946 + coord = &uf_coord->coord;
48947 + assert("vs-1119", znode_is_rlocked(coord->node));
48948 + assert("vs-1120", znode_is_loaded(coord->node));
48949 + assert("vs-1256", coord_matches_key_extent(coord, &flow->key));
48950 +
48951 + mapping = file->f_dentry->d_inode->i_mapping;
48952 + ext_coord = &uf_coord->extension.extent;
48953 +
48954 + /* offset in a file to start read from */
48955 + file_off = get_key_offset(&flow->key);
48956 + /* offset within the page to start read from */
48957 + page_off = (unsigned long)(file_off & (PAGE_CACHE_SIZE - 1));
48958 + /* bytes which can be read from the page which contains file_off */
48959 + count = PAGE_CACHE_SIZE - page_off;
48960 +
48961 + /* index of the page containing the offset the read starts from */
48962 + cur_page = (unsigned long)(file_off >> PAGE_CACHE_SHIFT);
48963 + next_page = cur_page;
48964 + /* number of pages flow spans over */
48965 + nr_pages =
48966 + ((file_off + flow->length + PAGE_CACHE_SIZE -
48967 + 1) >> PAGE_CACHE_SHIFT) - cur_page;
48968 +
48969 + /* we start with the twig node read locked. However, we do not want to
48970 + keep that lock all the time readahead works. So, set a seal and
48971 + release the twig node. */
48972 + set_hint(hint, &flow->key, ZNODE_READ_LOCK);
48973 + /* &hint->lh is done-ed */
48974 +
48975 + ra = file->f_ra;
48976 + prev_page = ra.prev_page;
48977 + do {
48978 + txn_restart_current();
48979 + if (next_page == cur_page)
48980 + next_page =
48981 + call_page_cache_readahead(mapping, file, hint,
48982 + cur_page, nr_pages, &ra);
48983 +
48984 + page = find_get_page(mapping, cur_page);
48985 + if (unlikely(page == NULL)) {
48986 + handle_ra_miss(mapping, &ra, cur_page);
48987 + page = read_cache_page(mapping, cur_page, filler, file);
48988 + if (IS_ERR(page))
48989 + return PTR_ERR(page);
48990 + lock_page(page);
48991 + if (!PageUptodate(page)) {
48992 + unlock_page(page);
48993 + page_detach_jnode(page, mapping, cur_page);
48994 + page_cache_release(page);
48995 + warning("jmacd-97178",
48996 + "extent_read: page is not up to date");
48997 + return RETERR(-EIO);
48998 + }
48999 + unlock_page(page);
49000 + } else {
49001 + if (!PageUptodate(page)) {
49002 + lock_page(page);
49003 +
49004 + assert("", page->mapping == mapping);
49005 + if (PageUptodate(page))
49006 + unlock_page(page);
49007 + else {
49008 + result = call_readpage(file, page);
49009 + if (result) {
49010 + page_cache_release(page);
49011 + return RETERR(result);
49012 + }
49013 + }
49014 + }
49015 + if (prev_page != cur_page)
49016 + mark_page_accessed(page);
49017 + prev_page = cur_page;
49018 + }
49019 +
49020 + /* If users can be writing to this page using arbitrary virtual
49021 + addresses, take care about potential aliasing before reading
49022 + the page on the kernel side.
49023 + */
49024 + if (mapping_writably_mapped(mapping))
49025 + flush_dcache_page(page);
49026 +
49027 + assert("nikita-3034", schedulable());
49028 +
49029 + /* number of bytes which are to be read from the page */
49030 + if (count > flow->length)
49031 + count = flow->length;
49032 +
49033 + result = fault_in_pages_writeable(flow->data, count);
49034 + if (result) {
49035 + page_cache_release(page);
49036 + return RETERR(-EFAULT);
49037 + }
49038 +
49039 + kaddr = kmap_atomic(page, KM_USER0);
49040 + result = __copy_to_user_inatomic(flow->data,
49041 + kaddr + page_off, count);
49042 + kunmap_atomic(kaddr, KM_USER0);
49043 + if (result != 0) {
49044 + kaddr = kmap(page);
49045 + result = __copy_to_user(flow->data, kaddr + page_off, count);
49046 + kunmap(page);
49047 + if (unlikely(result))
49048 + return RETERR(-EFAULT);
49049 + }
49050 +
49051 + page_cache_release(page);
49052 +
49053 + /* increase key (flow->key), update user area pointer (flow->data) */
49054 + move_flow_forward(flow, count);
49055 +
49056 + page_off = 0;
49057 + cur_page ++;
49058 + count = PAGE_CACHE_SIZE;
49059 + nr_pages--;
49060 + } while (flow->length);
49061 +
49062 + file->f_ra = ra;
49063 + return 0;
49064 +}
49065 +
49066 +/*
49067 + plugin->u.item.s.file.readpages
49068 +*/
49069 +void
49070 +readpages_extent(void *vp, struct address_space *mapping,
49071 + struct list_head *pages)
49072 +{
49073 + assert("vs-1739", 0);
49074 + if (vp)
49075 + read_cache_pages(mapping, pages, readahead_readpage_extent, vp);
49076 +}
49077 +
49078 +/*
49079 + plugin->s.file.readpage
49080 + reiser4_read->unix_file_read->page_cache_readahead->reiser4_readpage->unix_file_readpage->extent_readpage
49081 + or
49082 + filemap_nopage->reiser4_readpage->readpage_unix_file->readpage_extent
49083 +
49084 + At the beginning: coord->node is read locked, zloaded, page is
49085 + locked, coord is set to an existing unit inside of the extent item (coord need not match page->index)
49086 +*/
49087 +int readpage_extent(void *vp, struct page *page)
49088 +{
49089 + uf_coord_t *uf_coord = vp;
49090 + ON_DEBUG(coord_t * coord = &uf_coord->coord);
49091 + ON_DEBUG(reiser4_key key);
49092 +
49093 + assert("vs-1040", PageLocked(page));
49094 + assert("vs-1050", !PageUptodate(page));
49095 + assert("vs-1039", page->mapping && page->mapping->host);
49096 +
49097 + assert("vs-1044", znode_is_loaded(coord->node));
49098 + assert("vs-758", item_is_extent(coord));
49099 + assert("vs-1046", coord_is_existing_unit(coord));
49100 + assert("vs-1045", znode_is_rlocked(coord->node));
49101 + assert("vs-1047",
49102 + page->mapping->host->i_ino ==
49103 + get_key_objectid(item_key_by_coord(coord, &key)));
49104 + check_uf_coord(uf_coord, NULL);
49105 +
49106 + return do_readpage_extent(ext_by_ext_coord(uf_coord),
49107 + uf_coord->extension.extent.pos_in_unit, page);
49108 +}
49109 +
49110 +/**
49111 + * get_block_address_extent - translate file block to disk block
49112 + * @coord: coord of the extent unit
49113 + * @block: file-relative block number
49114 + * @result: resulting disk block number
49115 + *
49116 + * Returns 0 on success or error code.
49117 + */
49118 +int get_block_address_extent(const coord_t *coord, sector_t block,
49119 + sector_t *result)
49120 +{
49121 + reiser4_extent *ext;
49122 +
49123 + if (!coord_is_existing_unit(coord))
49124 + return RETERR(-EINVAL);
49125 +
49126 + ext = extent_by_coord(coord);
49127 +
49128 + if (state_of_extent(ext) != ALLOCATED_EXTENT)
49129 + /* FIXME: bad things may happen if it is unallocated extent */
49130 + *result = 0;
49131 + else {
49132 + reiser4_key key;
49133 +
49134 + unit_key_by_coord(coord, &key);
49135 + assert("vs-1645",
49136 + block >= get_key_offset(&key) >> current_blocksize_bits);
49137 + assert("vs-1646",
49138 + block <
49139 + (get_key_offset(&key) >> current_blocksize_bits) +
49140 + extent_get_width(ext));
49141 + *result =
49142 + extent_get_start(ext) + (block -
49143 + (get_key_offset(&key) >>
49144 + current_blocksize_bits));
49145 + }
49146 + return 0;
49147 +}
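+
+/*
+ * Worked example (a sketch, assuming 4096-byte blocks): if the unit's key
+ * offset is 327680, the unit covers file blocks starting at
+ * 327680 >> 12 = 80; for an allocated extent starting at block 1000,
+ * file block 82 maps to *result = 1000 + (82 - 80) = 1002.
+ */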
49148 +
49149 +/*
49150 + plugin->u.item.s.file.append_key
49151 + key of the first byte after the last byte addressed by this extent
49152 +*/
49153 +reiser4_key *append_key_extent(const coord_t * coord, reiser4_key * key)
49154 +{
49155 + item_key_by_coord(coord, key);
49156 + set_key_offset(key,
49157 + get_key_offset(key) + extent_size(coord,
49158 + nr_units_extent
49159 + (coord)));
49160 +
49161 + assert("vs-610", get_key_offset(key)
49162 + && (get_key_offset(key) & (current_blocksize - 1)) == 0);
49163 + return key;
49164 +}
49165 +
49166 +/* plugin->u.item.s.file.init_coord_extension */
49167 +void init_coord_extension_extent(uf_coord_t * uf_coord, loff_t lookuped)
49168 +{
49169 + coord_t *coord;
49170 + extent_coord_extension_t *ext_coord;
49171 + reiser4_key key;
49172 + loff_t offset;
49173 +
49174 + assert("vs-1295", uf_coord->valid == 0);
49175 +
49176 + coord = &uf_coord->coord;
49177 + assert("vs-1288", coord_is_iplug_set(coord));
49178 + assert("vs-1327", znode_is_loaded(coord->node));
49179 +
49180 + if (coord->between != AFTER_UNIT && coord->between != AT_UNIT)
49181 + return;
49182 +
49183 + ext_coord = &uf_coord->extension.extent;
49184 + ext_coord->nr_units = nr_units_extent(coord);
49185 + ext_coord->ext_offset =
49186 + (char *)extent_by_coord(coord) - zdata(coord->node);
49187 + ext_coord->width = extent_get_width(extent_by_coord(coord));
49188 + ON_DEBUG(ext_coord->extent = *extent_by_coord(coord));
49189 + uf_coord->valid = 1;
49190 +
49191 + /* pos_in_unit is the only uninitialized field in extended coord */
49192 + if (coord->between == AFTER_UNIT) {
49193 + assert("vs-1330",
49194 + coord->unit_pos == nr_units_extent(coord) - 1);
49195 +
49196 + ext_coord->pos_in_unit = ext_coord->width - 1;
49197 + } else {
49198 + /* AT_UNIT */
49199 + unit_key_by_coord(coord, &key);
49200 + offset = get_key_offset(&key);
49201 +
49202 + assert("vs-1328", offset <= lookuped);
49203 + assert("vs-1329",
49204 + lookuped <
49205 + offset + ext_coord->width * current_blocksize);
49206 + ext_coord->pos_in_unit =
49207 + ((lookuped - offset) >> current_blocksize_bits);
49208 + }
49209 +}
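+
+/*
+ * Example of the pos_in_unit calculation above (a sketch, assuming
+ * 4096-byte blocks): for an AT_UNIT coord whose unit key offset is 40960
+ * and @lookuped == 49152, pos_in_unit = (49152 - 40960) >> 12 = 2.
+ */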
49210 +
49211 +/*
49212 + * Local variables:
49213 + * c-indentation-style: "K&R"
49214 + * mode-name: "LC"
49215 + * c-basic-offset: 8
49216 + * tab-width: 8
49217 + * fill-column: 79
49218 + * scroll-step: 1
49219 + * End:
49220 + */
49221 Index: linux-2.6.16/fs/reiser4/plugin/item/extent_flush_ops.c
49222 ===================================================================
49223 --- /dev/null
49224 +++ linux-2.6.16/fs/reiser4/plugin/item/extent_flush_ops.c
49225 @@ -0,0 +1,1018 @@
49226 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
49227 +
49228 +#include "item.h"
49229 +#include "../../tree.h"
49230 +#include "../../jnode.h"
49231 +#include "../../super.h"
49232 +#include "../../flush.h"
49233 +#include "../../carry.h"
49234 +#include "../object.h"
49235 +
49236 +#include <linux/pagemap.h>
49237 +
49238 +static reiser4_block_nr extent_unit_start(const coord_t * item);
49239 +
49240 +/* Return either first or last extent (depending on @side) of the item
49241 + @coord is set to. Set @pos_in_unit either to first or to last block
49242 + of extent. */
49243 +static reiser4_extent *extent_utmost_ext(const coord_t * coord, sideof side,
49244 + reiser4_block_nr * pos_in_unit)
49245 +{
49246 + reiser4_extent *ext;
49247 +
49248 + if (side == LEFT_SIDE) {
49249 + /* get first extent of item */
49250 + ext = extent_item(coord);
49251 + *pos_in_unit = 0;
49252 + } else {
49253 + /* get last extent of item and last position within it */
49254 + assert("vs-363", side == RIGHT_SIDE);
49255 + ext = extent_item(coord) + coord_last_unit_pos(coord);
49256 + *pos_in_unit = extent_get_width(ext) - 1;
49257 + }
49258 +
49259 + return ext;
49260 +}
49261 +
49262 +/* item_plugin->f.utmost_child */
49263 +/* Return the child. Coord is set to extent item. Find jnode corresponding
49264 + either to first or to last unformatted node pointed by the item */
49265 +int utmost_child_extent(const coord_t * coord, sideof side, jnode ** childp)
49266 +{
49267 + reiser4_extent *ext;
49268 + reiser4_block_nr pos_in_unit;
49269 +
49270 + ext = extent_utmost_ext(coord, side, &pos_in_unit);
49271 +
49272 + switch (state_of_extent(ext)) {
49273 + case HOLE_EXTENT:
49274 + *childp = NULL;
49275 + return 0;
49276 + case ALLOCATED_EXTENT:
49277 + case UNALLOCATED_EXTENT:
49278 + break;
49279 + default:
49280 + /* this should never happen */
49281 + assert("vs-1417", 0);
49282 + }
49283 +
49284 + {
49285 + reiser4_key key;
49286 + reiser4_tree *tree;
49287 + unsigned long index;
49288 +
49289 + if (side == LEFT_SIDE) {
49290 + /* get key of first byte addressed by the extent */
49291 + item_key_by_coord(coord, &key);
49292 + } else {
49293 + /* get key of the byte which follows the last byte addressed by the extent */
49294 + append_key_extent(coord, &key);
49295 + }
49296 +
49297 + assert("vs-544",
49298 + (get_key_offset(&key) >> PAGE_CACHE_SHIFT) < ~0ul);
49299 + /* index of first or last (depending on @side) page addressed
49300 + by the extent */
49301 + index =
49302 + (unsigned long)(get_key_offset(&key) >> PAGE_CACHE_SHIFT);
49303 + if (side == RIGHT_SIDE)
49304 + index--;
49305 +
49306 + tree = coord->node->zjnode.tree;
49307 + *childp = jlookup(tree, get_key_objectid(&key), index);
49308 + }
49309 +
49310 + return 0;
49311 +}
49312 +
49313 +/* item_plugin->f.utmost_child_real_block */
49314 +/* Return the child's block, if allocated. */
49315 +int
49316 +utmost_child_real_block_extent(const coord_t * coord, sideof side,
49317 + reiser4_block_nr * block)
49318 +{
49319 + reiser4_extent *ext;
49320 +
49321 + ext = extent_by_coord(coord);
49322 +
49323 + switch (state_of_extent(ext)) {
49324 + case ALLOCATED_EXTENT:
49325 + *block = extent_get_start(ext);
49326 + if (side == RIGHT_SIDE)
49327 + *block += extent_get_width(ext) - 1;
49328 + break;
49329 + case HOLE_EXTENT:
49330 + case UNALLOCATED_EXTENT:
49331 + *block = 0;
49332 + break;
49333 + default:
49334 + /* this should never happen */
49335 + assert("vs-1418", 0);
49336 + }
49337 +
49338 + return 0;
49339 +}
49340 +
49341 +/* item_plugin->f.scan */
49342 +/* Performs leftward scanning starting from an unformatted node and its parent coordinate.
49343 + This scan continues, advancing the parent coordinate, until either it encounters a
49344 + formatted child or it finishes scanning this node.
49345 +
49346 + If unallocated, the entire extent must be dirty and in the same atom. (Actually, I'm
49347 + not sure this last property (same atom) is enforced, but it should be the case since
49348 + one atom must write the parent and the others must read the parent, thus fusing?). In
49349 + any case, the code below asserts this case for unallocated extents. Unallocated
49350 + extents are thus optimized because we can skip to the endpoint when scanning.
49351 +
49352 + Control then returns to the caller of scan_extent, which handles these terminating conditions, e.g., by
49353 + loading the next twig.
49354 +*/
49355 +int scan_extent(flush_scan * scan)
49356 +{
49357 + coord_t coord;
49358 + jnode *neighbor;
49359 + unsigned long scan_index, unit_index, unit_width, scan_max, scan_dist;
49360 + reiser4_block_nr unit_start;
49361 + __u64 oid;
49362 + reiser4_key key;
49363 + int ret = 0, allocated, incr;
49364 + reiser4_tree *tree;
49365 +
49366 + if (!JF_ISSET(scan->node, JNODE_DIRTY)) {
49367 + scan->stop = 1;
49368 + return 0; /* Race with truncate, this node is already
49369 + * truncated. */
49370 + }
49371 +
49372 + coord_dup(&coord, &scan->parent_coord);
49373 +
49374 + assert("jmacd-1404", !scan_finished(scan));
49375 + assert("jmacd-1405", jnode_get_level(scan->node) == LEAF_LEVEL);
49376 + assert("jmacd-1406", jnode_is_unformatted(scan->node));
49377 +
49378 + /* The scan_index variable corresponds to the current page index of the
49379 + unformatted block scan position. */
49380 + scan_index = index_jnode(scan->node);
49381 +
49382 + assert("jmacd-7889", item_is_extent(&coord));
49383 +
49384 + repeat:
49385 + /* objectid of file */
49386 + oid = get_key_objectid(item_key_by_coord(&coord, &key));
49387 +
49388 + allocated = !extent_is_unallocated(&coord);
49389 + /* Get the values of this extent unit: */
49390 + unit_index = extent_unit_index(&coord);
49391 + unit_width = extent_unit_width(&coord);
49392 + unit_start = extent_unit_start(&coord);
49393 +
49394 + assert("jmacd-7187", unit_width > 0);
49395 + assert("jmacd-7188", scan_index >= unit_index);
49396 + assert("jmacd-7189", scan_index <= unit_index + unit_width - 1);
49397 +
49398 + /* Depending on the scan direction, we set different maximum values for scan_index
49399 + (scan_max) and the number of nodes that would be passed if the scan goes the
49400 + entire way (scan_dist). Incr is an integer reflecting the incremental
49401 + direction of scan_index. */
49402 + if (scanning_left(scan)) {
49403 + scan_max = unit_index;
49404 + scan_dist = scan_index - unit_index;
49405 + incr = -1;
49406 + } else {
49407 + scan_max = unit_index + unit_width - 1;
49408 + scan_dist = scan_max - scan_index;
49409 + incr = +1;
49410 + }
49411 +
49412 + tree = coord.node->zjnode.tree;
49413 +
49414 + /* If the extent is allocated we have to check each of its blocks. If the extent
49415 + is unallocated we can skip to the scan_max. */
49416 + if (allocated) {
49417 + do {
49418 + neighbor = jlookup(tree, oid, scan_index);
49419 + if (neighbor == NULL)
49420 + goto stop_same_parent;
49421 +
49422 + if (scan->node != neighbor
49423 + && !scan_goto(scan, neighbor)) {
49424 + /* @neighbor was jput() by scan_goto(). */
49425 + goto stop_same_parent;
49426 + }
49427 +
49428 + ret = scan_set_current(scan, neighbor, 1, &coord);
49429 + if (ret != 0) {
49430 + goto exit;
49431 + }
49432 +
49433 + /* reference to @neighbor is stored in @scan, no need
49434 + to jput(). */
49435 + scan_index += incr;
49436 +
49437 + } while (incr + scan_max != scan_index);
49438 +
49439 + } else {
49440 + /* Optimized case for unallocated extents, skip to the end. */
49441 + neighbor = jlookup(tree, oid, scan_max /*index */ );
49442 + if (neighbor == NULL) {
49443 + /* Race with truncate */
49444 + scan->stop = 1;
49445 + ret = 0;
49446 + goto exit;
49447 + }
49448 +
49449 + assert("zam-1043", blocknr_is_fake(jnode_get_block(neighbor)));
49450 +
49451 + ret = scan_set_current(scan, neighbor, scan_dist, &coord);
49452 + if (ret != 0) {
49453 + goto exit;
49454 + }
49455 + }
49456 +
49457 + if (coord_sideof_unit(&coord, scan->direction) == 0
49458 + && item_is_extent(&coord)) {
49459 + /* Continue as long as there are more extent units. */
49460 +
49461 + scan_index =
49462 + extent_unit_index(&coord) +
49463 + (scanning_left(scan) ? extent_unit_width(&coord) - 1 : 0);
49464 + goto repeat;
49465 + }
49466 +
49467 + if (0) {
49468 + stop_same_parent:
49469 +
49470 + /* If we are scanning left and we stop in the middle of an allocated
49471 + extent, we know the preceder immediately. */
49472 + /* middle of extent is (scan_index - unit_index) != 0. */
49473 + if (scanning_left(scan) && (scan_index - unit_index) != 0) {
49474 + /* FIXME(B): Someone should step-through and verify that this preceder
49475 + calculation is indeed correct. */
49476 + /* @unit_start is starting block (number) of extent
49477 + unit. Flush stopped at the @scan_index block from
49478 + the beginning of the file, which is (scan_index -
49479 + unit_index) block within extent.
49480 + */
49481 + if (unit_start) {
49482 + /* skip preceder update when we are at hole */
49483 + scan->preceder_blk =
49484 + unit_start + scan_index - unit_index;
49485 + check_preceder(scan->preceder_blk);
49486 + }
49487 + }
49488 +
49489 + /* In this case, we leave coord set to the parent of scan->node. */
49490 + scan->stop = 1;
49491 +
49492 + } else {
49493 + /* In this case, we are still scanning, coord is set to the next item which is
49494 + either off-the-end of the node or not an extent. */
49495 + assert("jmacd-8912", scan->stop == 0);
49496 + assert("jmacd-7812",
49497 + (coord_is_after_sideof_unit(&coord, scan->direction)
49498 + || !item_is_extent(&coord)));
49499 + }
49500 +
49501 + ret = 0;
49502 + exit:
49503 + return ret;
49504 +}
49505 +
49506 +/* ask block allocator for some blocks */
49507 +static void extent_allocate_blocks(reiser4_blocknr_hint *preceder,
49508 + reiser4_block_nr wanted_count,
49509 + reiser4_block_nr *first_allocated,
49510 + reiser4_block_nr *allocated,
49511 + block_stage_t block_stage)
49512 +{
49513 + *allocated = wanted_count;
49514 + preceder->max_dist = 0; /* scan whole disk, if needed */
49515 +
49516 + /* that number of blocks (wanted_count) is either in UNALLOCATED or in GRABBED */
49517 + preceder->block_stage = block_stage;
49518 +
49519 + /* FIXME: we do not handle errors here now */
49520 + check_me("vs-420",
49521 + reiser4_alloc_blocks(preceder, first_allocated, allocated,
49522 + BA_PERMANENT) == 0);
49523 + /* update flush_pos's preceder to last allocated block number */
49524 + preceder->blk = *first_allocated + *allocated - 1;
49525 +}
49526 +
49527 +/* when, at flush time, an unallocated extent is to be replaced with an allocated one, it may happen that one
49528 + unallocated extent will have to be replaced with a set of allocated extents. In this case insert_into_item will be
49529 + called, which may have to add new nodes to the tree. Space for that is taken from the inviolable reserve (5%). */
49530 +static reiser4_block_nr reserve_replace(void)
49531 +{
49532 + reiser4_block_nr grabbed, needed;
49533 +
49534 + grabbed = get_current_context()->grabbed_blocks;
49535 + needed = estimate_one_insert_into_item(current_tree);
49536 + check_me("vpf-340", !reiser4_grab_space_force(needed, BA_RESERVED));
49537 + return grabbed;
49538 +}
49539 +
49540 +static void free_replace_reserved(reiser4_block_nr grabbed)
49541 +{
49542 + reiser4_context *ctx;
49543 +
49544 + ctx = get_current_context();
49545 + grabbed2free(ctx, get_super_private(ctx->super),
49546 + ctx->grabbed_blocks - grabbed);
49547 +}
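+
+/*
+ * reserve_replace() and free_replace_reserved() bracket replace_extent()
+ * calls in this file:
+ *
+ *	grabbed = reserve_replace();
+ *	result = replace_extent(h, 0);
+ *	free_replace_reserved(grabbed);
+ */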
49548 +
49549 +/* Block offset of first block addressed by unit */
49550 +__u64 extent_unit_index(const coord_t * item)
49551 +{
49552 + reiser4_key key;
49553 +
49554 + assert("vs-648", coord_is_existing_unit(item));
49555 + unit_key_by_coord(item, &key);
49556 + return get_key_offset(&key) >> current_blocksize_bits;
49557 +}
49558 +
49559 +/* AUDIT: shouldn't the return value be of reiser4_block_nr type?
49560 + Josh's answer: who knows? Is a "number of blocks" the same type as "block offset"? */
49561 +__u64 extent_unit_width(const coord_t * item)
49562 +{
49563 + assert("vs-649", coord_is_existing_unit(item));
49564 + return width_by_coord(item);
49565 +}
49566 +
49567 +/* Starting block location of this unit */
49568 +static reiser4_block_nr extent_unit_start(const coord_t * item)
49569 +{
49570 + return extent_get_start(extent_by_coord(item));
49571 +}
49572 +
49573 +/**
49574 + * split_allocated_extent - split allocated extent at given position
49575 + * @coord: coord of the allocated extent to split
49576 + * @pos_in_unit: position within the unit to split at
49577 + *
49578 + * replace allocated extent with two allocated extents
49579 + */
49580 +static int split_allocated_extent(coord_t *coord, reiser4_block_nr pos_in_unit)
49581 +{
49582 + int result;
49583 + struct replace_handle *h;
49584 + reiser4_extent *ext;
49585 + reiser4_block_nr grabbed;
49586 +
49587 + ext = extent_by_coord(coord);
49588 + assert("vs-1410", state_of_extent(ext) == ALLOCATED_EXTENT);
49589 + assert("vs-1411", extent_get_width(ext) > pos_in_unit);
49590 +
49591 + h = kmalloc(sizeof(*h), get_gfp_mask());
49592 + if (h == NULL)
49593 + return RETERR(-ENOMEM);
49594 + h->coord = coord;
49595 + h->lh = znode_lh(coord->node);
49596 + h->pkey = &h->key;
49597 + unit_key_by_coord(coord, h->pkey);
49598 + set_key_offset(h->pkey,
49599 + (get_key_offset(h->pkey) +
49600 + pos_in_unit * current_blocksize));
49601 + set_extent(&h->overwrite, extent_get_start(ext), pos_in_unit);
49602 + set_extent(&h->new_extents[0], extent_get_start(ext) + pos_in_unit,
49603 + extent_get_width(ext) - pos_in_unit);
49604 + h->nr_new_extents = 1;
49605 + h->flags = COPI_DONT_SHIFT_LEFT;
49606 + h->paste_key = h->key;
49607 +
49608 + /* reserve space for extent unit paste, @grabbed is reserved before */
49609 + grabbed = reserve_replace();
49610 + result = replace_extent(h, 0 /* leave @coord set to overwritten
49611 + extent */);
49612 + /* restore reserved */
49613 + free_replace_reserved(grabbed);
49614 + kfree(h);
49615 + return result;
49616 +}
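+
+/*
+ * Worked example: splitting an allocated extent (start 200, width 10) at
+ * pos_in_unit 4 overwrites it with (start 200, width 4) and pastes
+ * (start 204, width 6) right after it, leaving @coord at the overwritten
+ * unit.
+ */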
49617 +
49618 +/* replace extent @ext by extent @replace. Try to merge @replace with the previous extent of the item (if there is
49619 + one). Return 1 if it succeeded, 0 otherwise */
49620 +static int try_to_merge_with_left(coord_t *coord, reiser4_extent *ext,
49621 + reiser4_extent *replace)
49622 +{
49623 + assert("vs-1415", extent_by_coord(coord) == ext);
49624 +
49625 + if (coord->unit_pos == 0
49626 + || state_of_extent(ext - 1) != ALLOCATED_EXTENT)
49627 + /* the left neighbor either does not exist or is not an allocated extent */
49628 + return 0;
49629 + if (extent_get_start(ext - 1) + extent_get_width(ext - 1) !=
49630 + extent_get_start(replace))
49631 + return 0;
49632 +
49633 + /* we can glue, widen previous unit */
49634 + extent_set_width(ext - 1,
49635 + extent_get_width(ext - 1) + extent_get_width(replace));
49636 +
49637 + if (extent_get_width(ext) != extent_get_width(replace)) {
49638 + /* make current extent narrower */
49639 + if (state_of_extent(ext) == ALLOCATED_EXTENT)
49640 + extent_set_start(ext,
49641 + extent_get_start(ext) +
49642 + extent_get_width(replace));
49643 + extent_set_width(ext,
49644 + extent_get_width(ext) -
49645 + extent_get_width(replace));
49646 + } else {
49647 + /* current extent completely glued with its left neighbor, remove it */
49648 + coord_t from, to;
49649 +
49650 + coord_dup(&from, coord);
49651 + from.unit_pos = nr_units_extent(coord) - 1;
49652 + coord_dup(&to, &from);
49653 +
49654 + /* cutting from an extent item can currently be done only from its beginning or its end. Shift the
49655 + following units left so the unit being removed ends up at the end of the item, then cut it */
49656 + memmove(ext, ext + 1,
49657 + (from.unit_pos -
49658 + coord->unit_pos) * sizeof(reiser4_extent));
49659 +		/* wipe the part of the item which is going to be cut, so that node_check will not be confused */
49660 + cut_node_content(&from, &to, NULL, NULL, NULL);
49661 + }
49662 + znode_make_dirty(coord->node);
49663 + /* move coord back */
49664 + coord->unit_pos--;
49665 + return 1;
49666 +}
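+
+/*
+ * The glue test above boils down to a contiguity check; roughly (names
+ * illustrative):
+ *
+ *	int glueable = prev_start + prev_width == replace_start;
+ *
+ * For example, a previous unit (500, 20) and a replacement starting at
+ * block 520 can be merged into (500, 20 + replace_width); a replacement
+ * starting at 521 cannot.
+ */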
49667 +
49668 +/**
49669 + * conv_extent - replace extent with one or two extents
49670 + * @coord: coordinate of extent to be replaced
49671 + * @replace: extent to overwrite the one @coord is set to
49672 + *
49673 + * Overwrites the extent @coord is set to and pastes one extent unit after
49674 + * the overwritten one if @replace is shorter than the initial extent
49675 + */
49676 +static int conv_extent(coord_t *coord, reiser4_extent *replace)
49677 +{
49678 + int result;
49679 + struct replace_handle *h;
49680 + reiser4_extent *ext;
49681 + reiser4_block_nr start, width, new_width;
49682 + reiser4_block_nr grabbed;
49683 + extent_state state;
49684 +
49685 + ext = extent_by_coord(coord);
49686 + state = state_of_extent(ext);
49687 + start = extent_get_start(ext);
49688 + width = extent_get_width(ext);
49689 + new_width = extent_get_width(replace);
49690 +
49691 + assert("vs-1458", (state == UNALLOCATED_EXTENT ||
49692 + state == ALLOCATED_EXTENT));
49693 + assert("vs-1459", width >= new_width);
49694 +
49695 + if (try_to_merge_with_left(coord, ext, replace)) {
49696 + /* merged @replace with left neighbor. Current unit is either
49697 + removed or narrowed */
49698 + return 0;
49699 + }
49700 +
49701 + if (width == new_width) {
49702 + /* replace current extent with @replace */
49703 + *ext = *replace;
49704 + znode_make_dirty(coord->node);
49705 + return 0;
49706 + }
49707 +
49708 + h = kmalloc(sizeof(*h), get_gfp_mask());
49709 + if (h == NULL)
49710 + return RETERR(-ENOMEM);
49711 + h->coord = coord;
49712 + h->lh = znode_lh(coord->node);
49713 + h->pkey = &h->key;
49714 + unit_key_by_coord(coord, h->pkey);
49715 + set_key_offset(h->pkey,
49716 + (get_key_offset(h->pkey) + new_width * current_blocksize));
49717 + h->overwrite = *replace;
49718 +
49719 + /* replace @ext with @replace and padding extent */
49720 + set_extent(&h->new_extents[0],
49721 + (state == ALLOCATED_EXTENT) ? (start + new_width) : UNALLOCATED_EXTENT_START,
49722 + width - new_width);
49723 + h->nr_new_extents = 1;
49724 + h->flags = COPI_DONT_SHIFT_LEFT;
49725 + h->paste_key = h->key;
49726 +
49727 +	/* reserve space for the extent unit paste; what was grabbed before is saved in @grabbed */
49728 + grabbed = reserve_replace();
49729 + result = replace_extent(h, 0 /* leave @coord set to overwritten
49730 + extent */);
49731 +
49732 + /* restore reserved */
49733 + free_replace_reserved(grabbed);
49734 + kfree(h);
49735 + return result;
49736 +}
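+
+/*
+ * Worked example for conv_extent(): replacing an allocated unit
+ * (start=1000, width=10) with @replace = (2000, 6) overwrites the unit
+ * with (2000, 6) and pastes a padding unit (1006, 4) after it, so the
+ * not-yet-converted tail keeps pointing at its original blocks. For an
+ * unallocated unit the padding start stays UNALLOCATED_EXTENT_START.
+ */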
49737 +
49738 +/**
49739 + * assign_real_blocknrs
49740 + * @flush_pos: flush position
49741 + * @oid: objectid of the file the jnodes belong to
49742 + * @index: index of the first jnode in the range
49743 + * @count: number of jnodes to assign block numbers to
49744 + * @first: start of the allocated block range
49745 + *
49746 + * Assigns consecutive block numbers to each of @count jnodes. The index of
49747 + * the first jnode is @index. Jnodes are looked up with jlookup().
49748 + */
49749 +static void assign_real_blocknrs(flush_pos_t *flush_pos, oid_t oid,
49750 + unsigned long index, reiser4_block_nr count,
49751 + reiser4_block_nr first)
49752 +{
49753 + unsigned long i;
49754 + reiser4_tree *tree;
49755 + txn_atom *atom;
49756 + int nr;
49757 +
49758 + atom = atom_locked_by_fq(flush_pos->fq);
49759 + assert("vs-1468", atom);
49760 + BUG_ON(atom == NULL);
49761 +
49762 + nr = 0;
49763 + tree = current_tree;
49764 + for (i = 0; i < count; ++i, ++index) {
49765 + jnode *node;
49766 +
49767 + node = jlookup(tree, oid, index);
49768 + assert("", node != NULL);
49769 + BUG_ON(node == NULL);
49770 +
49771 + spin_lock_jnode(node);
49772 + assert("", !jnode_is_flushprepped(node));
49773 + assert("vs-1475", node->atom == atom);
49774 + assert("vs-1476", atomic_read(&node->x_count) > 0);
49775 +
49776 + JF_CLR(node, JNODE_FLUSH_RESERVED);
49777 + jnode_set_block(node, &first);
49778 + unformatted_make_reloc(node, flush_pos->fq);
49779 + ON_DEBUG(count_jnode(node->atom, node, NODE_LIST(node),
49780 + FQ_LIST, 0));
49781 + spin_unlock_jnode(node);
49782 + first++;
49783 +
49784 + atomic_dec(&node->x_count);
49785 +		nr++;
49786 + }
49787 +
49788 + spin_unlock_atom(atom);
49789 + return;
49790 +}
49791 +
49792 +/**
49793 + * make_node_ovrwr - assign node to overwrite set
49794 + * @jnodes: overwrite set list head
49795 + * @node: jnode to belong to overwrite set
49796 + *
49797 + * Sets OVRWR jnode state bit and puts @node to the end of list head @jnodes
49798 + * which is an accumulator for nodes before they get to overwrite set list of
49799 + * atom.
49800 + */
49801 +static void make_node_ovrwr(struct list_head *jnodes, jnode *node)
49802 +{
49803 + spin_lock_jnode(node);
49804 +
49805 + assert("zam-917", !JF_ISSET(node, JNODE_RELOC));
49806 + assert("zam-918", !JF_ISSET(node, JNODE_OVRWR));
49807 +
49808 + JF_SET(node, JNODE_OVRWR);
49809 + list_move_tail(&node->capture_link, jnodes);
49810 + ON_DEBUG(count_jnode(node->atom, node, DIRTY_LIST, OVRWR_LIST, 0));
49811 +
49812 + spin_unlock_jnode(node);
49813 +}
49814 +
49815 +/**
49816 + * mark_jnodes_overwrite - put bunch of jnodes to overwrite set
49817 + * @flush_pos: flush position
49818 + * @oid: objectid of file jnodes belong to
49819 + * @index: starting index
49820 + * @width: extent width
49821 + *
49822 + * Puts nodes of one extent (file objectid @oid, extent width @width) to atom's
49823 + * overwrite set. Starting from the one with index @index. If end of slum is
49824 + * detected (node is not found or flushprepped) - stop iterating and set flush
49825 + * position's state to POS_INVALID.
49826 + */
49827 +static void mark_jnodes_overwrite(flush_pos_t *flush_pos, oid_t oid,
49828 + unsigned long index, reiser4_block_nr width)
49829 +{
49830 + unsigned long i;
49831 + reiser4_tree *tree;
49832 + jnode *node;
49833 + txn_atom *atom;
49834 + LIST_HEAD(jnodes);
49835 +
49836 + tree = current_tree;
49837 +
49838 + atom = atom_locked_by_fq(pos_fq(flush_pos));
49839 + assert("vs-1478", atom);
49840 +
49841 + for (i = flush_pos->pos_in_unit; i < width; i++, index++) {
49842 + node = jlookup(tree, oid, index);
49843 + if (!node) {
49844 + flush_pos->state = POS_INVALID;
49845 + break;
49846 + }
49847 + if (jnode_check_flushprepped(node)) {
49848 + flush_pos->state = POS_INVALID;
49849 + atomic_dec(&node->x_count);
49850 + break;
49851 + }
49852 + if (node->atom != atom) {
49853 + flush_pos->state = POS_INVALID;
49854 + atomic_dec(&node->x_count);
49855 + break;
49856 + }
49857 + make_node_ovrwr(&jnodes, node);
49858 + atomic_dec(&node->x_count);
49859 + }
49860 +
49861 + list_splice_init(&jnodes, ATOM_OVRWR_LIST(atom)->prev);
49862 + spin_unlock_atom(atom);
49863 +}
49864 +
49865 +/**
49866 + * allocated_extent_slum_size
49867 + * @flush_pos: flush position
49868 + * @oid: objectid of the file the jnodes belong to
49869 + * @index: index of the first jnode to check
49870 + * @count: maximal number of jnodes to check
49871 + *
49872 + * Returns the number of leading jnodes in the range which belong to the atom and are not flushprepped yet.
49873 + */
49874 +static int allocated_extent_slum_size(flush_pos_t *flush_pos, oid_t oid,
49875 + unsigned long index, unsigned long count)
49876 +{
49877 + unsigned long i;
49878 + reiser4_tree *tree;
49879 + txn_atom *atom;
49880 + int nr;
49881 +
49882 + atom = atom_locked_by_fq(pos_fq(flush_pos));
49883 + assert("vs-1468", atom);
49884 +
49885 + nr = 0;
49886 + tree = current_tree;
49887 + for (i = 0; i < count; ++i, ++index) {
49888 + jnode *node;
49889 +
49890 + node = jlookup(tree, oid, index);
49891 + if (!node)
49892 + break;
49893 +
49894 + if (jnode_check_flushprepped(node)) {
49895 + atomic_dec(&node->x_count);
49896 + break;
49897 + }
49898 +
49899 + if (node->atom != atom) {
49900 + /*
49901 + * this is possible on overwrite: extent_write may
49902 + * capture several unformatted nodes without capturing
49903 + * any formatted nodes.
49904 + */
49905 + atomic_dec(&node->x_count);
49906 + break;
49907 + }
49908 +
49909 + assert("vs-1476", atomic_read(&node->x_count) > 1);
49910 + atomic_dec(&node->x_count);
49911 +		nr++;
49912 + }
49913 +
49914 + spin_unlock_atom(atom);
49915 + return nr;
49916 +}
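+
+/*
+ * Example of the counting above: with @count == 8, if jnodes 0..4 of the
+ * range are members of the atom and not flushprepped, jnode 5 is already
+ * flushprepped and jnodes 6..7 are absent, the loop stops at the sixth
+ * jnode and 5 is returned: only that leading run is protected.
+ */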
49917 +
49918 +/**
49919 + * alloc_extent
49920 + * @flush_pos: flush position
49921 + *
49922 + *
49923 + * This is called by handle_pos_on_twig() to process the extent unit
49924 + * flush_pos->coord is set to. It prepares for flushing a sequence of not
49925 + * flushprepped nodes (a slum). It assumes that the slum starts at position
49926 + * flush_pos->pos_in_unit within the extent. The slum goes to the relocate set
49927 + * if flush_pos->leaf_relocate is set to 1, and to the overwrite set otherwise
49928 + */
49929 +int alloc_extent(flush_pos_t *flush_pos)
49930 +{
49931 + coord_t *coord;
49932 + reiser4_extent *ext;
49933 + reiser4_extent replace_ext;
49934 + oid_t oid;
49935 + reiser4_block_nr protected;
49936 + reiser4_block_nr start;
49937 + __u64 index;
49938 + __u64 width;
49939 + extent_state state;
49940 + int result;
49941 + reiser4_block_nr first_allocated;
49942 + __u64 allocated;
49943 + reiser4_key key;
49944 + block_stage_t block_stage;
49945 +
49946 + assert("vs-1468", flush_pos->state == POS_ON_EPOINT);
49947 + assert("vs-1469", coord_is_existing_unit(&flush_pos->coord)
49948 + && item_is_extent(&flush_pos->coord));
49949 +
49950 + coord = &flush_pos->coord;
49951 +
49952 + ext = extent_by_coord(coord);
49953 + state = state_of_extent(ext);
49954 + if (state == HOLE_EXTENT) {
49955 + flush_pos->state = POS_INVALID;
49956 + return 0;
49957 + }
49958 +
49959 + item_key_by_coord(coord, &key);
49960 + oid = get_key_objectid(&key);
49961 + index = extent_unit_index(coord) + flush_pos->pos_in_unit;
49962 + start = extent_get_start(ext);
49963 + width = extent_get_width(ext);
49964 +
49965 + assert("vs-1457", width > flush_pos->pos_in_unit);
49966 +
49967 + if (flush_pos->leaf_relocate || state == UNALLOCATED_EXTENT) {
49968 + /* relocate */
49969 + if (flush_pos->pos_in_unit) {
49970 + /* split extent unit into two */
49971 + result =
49972 + split_allocated_extent(coord,
49973 + flush_pos->pos_in_unit);
49974 + flush_pos->pos_in_unit = 0;
49975 + return result;
49976 + }
49977 +
49978 + /* limit number of nodes to allocate */
49979 + if (flush_pos->nr_to_write < width)
49980 + width = flush_pos->nr_to_write;
49981 +
49982 + if (state == ALLOCATED_EXTENT) {
49983 + /*
49984 + * all protected nodes are not flushprepped, therefore
49985 + * they are counted as flush_reserved
49986 + */
49987 + block_stage = BLOCK_FLUSH_RESERVED;
49988 + protected = allocated_extent_slum_size(flush_pos, oid,
49989 + index, width);
49990 + if (protected == 0) {
49991 + flush_pos->state = POS_INVALID;
49992 + flush_pos->pos_in_unit = 0;
49993 + return 0;
49994 + }
49995 + } else {
49996 + block_stage = BLOCK_UNALLOCATED;
49997 + protected = width;
49998 + }
49999 +
50000 + /*
50001 + * look at previous unit if possible. If it is allocated, make
50002 + * preceder more precise
50003 + */
50004 + if (coord->unit_pos &&
50005 + (state_of_extent(ext - 1) == ALLOCATED_EXTENT))
50006 + pos_hint(flush_pos)->blk = extent_get_start(ext - 1) +
50007 + extent_get_width(ext - 1);
50008 +
50009 + /* allocate new block numbers for protected nodes */
50010 + extent_allocate_blocks(pos_hint(flush_pos), protected,
50011 + &first_allocated, &allocated,
50012 + block_stage);
50013 +
50014 + if (state == ALLOCATED_EXTENT)
50015 + /*
50016 + * on relocating - free nodes which are going to be
50017 + * relocated
50018 + */
50019 + reiser4_dealloc_blocks(&start, &allocated,
50020 + BLOCK_ALLOCATED, BA_DEFER);
50021 +
50022 + /* assign new block numbers to protected nodes */
50023 + assign_real_blocknrs(flush_pos, oid, index, allocated, first_allocated);
50024 +
50025 +
50026 + /* prepare extent which will replace current one */
50027 + set_extent(&replace_ext, first_allocated, allocated);
50028 +
50029 + /* adjust extent item */
50030 + result = conv_extent(coord, &replace_ext);
50031 + if (result != 0 && result != -ENOMEM) {
50032 + warning("vs-1461",
50033 + "Failed to allocate extent. Should not happen\n");
50034 + return result;
50035 + }
50036 +
50037 + /*
50038 + * break flush: we prepared for flushing as many blocks as we
50039 + * were asked for
50040 + */
50041 + if (flush_pos->nr_to_write == allocated)
50042 + flush_pos->state = POS_INVALID;
50043 + } else {
50044 + /* overwrite */
50045 + mark_jnodes_overwrite(flush_pos, oid, index, width);
50046 + }
50047 + flush_pos->pos_in_unit = 0;
50048 + return 0;
50049 +}
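+
+/*
+ * Worked example of the relocate branch above: take an allocated unit
+ * (start=1000, width=10) with pos_in_unit == 0, leaf_relocate set and
+ * nr_to_write >= 10. If the slum size is 10 and extent_allocate_blocks()
+ * returns first_allocated=2000, allocated=10, then blocks 1000..1009 are
+ * deferred-freed, the ten jnodes get block numbers 2000..2009 and
+ * conv_extent() rewrites the unit as (2000, 10). Had the allocator
+ * returned only 6 blocks, the unit would become (2000, 6) followed by a
+ * padding unit (1006, 4).
+ */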
50050 +
50051 +/* return 0 if @key is glueable to the item @coord is set to, 1 if a new item has to be inserted */
50052 +static int must_insert(const coord_t *coord, const reiser4_key *key)
50053 +{
50054 + reiser4_key last;
50055 +
50056 + if (item_id_by_coord(coord) == EXTENT_POINTER_ID
50057 + && keyeq(append_key_extent(coord, &last), key))
50058 + return 0;
50059 + return 1;
50060 +}
50061 +
50062 +/* copy extent @copy_ext to the end of @node. It may have to either insert a new item after the last one, append to the
50063 +   last item, or widen the last unit of the last item */
50064 +static int put_unit_to_end(znode *node, const reiser4_key *key,
50065 + reiser4_extent *copy_ext)
50066 +{
50067 + int result;
50068 + coord_t coord;
50069 + cop_insert_flag flags;
50070 + reiser4_extent *last_ext;
50071 + reiser4_item_data data;
50072 +
50073 + /* set coord after last unit in an item */
50074 + coord_init_last_unit(&coord, node);
50075 + coord.between = AFTER_UNIT;
50076 +
50077 + flags =
50078 + COPI_DONT_SHIFT_LEFT | COPI_DONT_SHIFT_RIGHT | COPI_DONT_ALLOCATE;
50079 + if (must_insert(&coord, key)) {
50080 + result =
50081 + insert_by_coord(&coord, init_new_extent(&data, copy_ext, 1),
50082 + key, NULL /*lh */ , flags);
50083 +
50084 + } else {
50085 + /* try to glue with last unit */
50086 + last_ext = extent_by_coord(&coord);
50087 + if (state_of_extent(last_ext) &&
50088 + extent_get_start(last_ext) + extent_get_width(last_ext) ==
50089 + extent_get_start(copy_ext)) {
50090 + /* widen last unit of node */
50091 + extent_set_width(last_ext,
50092 + extent_get_width(last_ext) +
50093 + extent_get_width(copy_ext));
50094 + znode_make_dirty(node);
50095 + return 0;
50096 + }
50097 +
50098 +		/* FIXME: put an assertion here that the last unit of @node and the new unit cannot be merged */
50099 + result =
50100 + insert_into_item(&coord, NULL /*lh */ , key,
50101 + init_new_extent(&data, copy_ext, 1),
50102 + flags);
50103 + }
50104 +
50105 + assert("vs-438", result == 0 || result == -E_NODE_FULL);
50106 + return result;
50107 +}
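+
+/*
+ * Example: if the last unit of @left is (2000, 6) and @copy_ext is
+ * (2006, 4) with a glueable key, the last unit is simply widened to
+ * (2000, 10). If @copy_ext started at 2007 instead, a new unit (or a
+ * new item, when must_insert() says the keys do not match) would be
+ * inserted, possibly failing with -E_NODE_FULL.
+ */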
50108 +
50109 +/* @coord is set to extent unit */
50110 +squeeze_result squalloc_extent(znode *left, const coord_t *coord,
50111 + flush_pos_t *flush_pos,
50112 + reiser4_key *stop_key)
50113 +{
50114 + reiser4_extent *ext;
50115 + __u64 index;
50116 + __u64 width;
50117 + reiser4_block_nr start;
50118 + extent_state state;
50119 + oid_t oid;
50120 + reiser4_block_nr first_allocated;
50121 + __u64 allocated;
50122 + __u64 protected;
50123 + reiser4_extent copy_extent;
50124 + reiser4_key key;
50125 + int result;
50126 + block_stage_t block_stage;
50127 +
50128 + assert("vs-1457", flush_pos->pos_in_unit == 0);
50129 + assert("vs-1467", coord_is_leftmost_unit(coord));
50130 + assert("vs-1467", item_is_extent(coord));
50131 +
50132 + ext = extent_by_coord(coord);
50133 + index = extent_unit_index(coord);
50134 + start = extent_get_start(ext);
50135 + width = extent_get_width(ext);
50136 + state = state_of_extent(ext);
50137 + unit_key_by_coord(coord, &key);
50138 + oid = get_key_objectid(&key);
50139 +
50140 + if ((flush_pos->leaf_relocate && state == ALLOCATED_EXTENT) ||
50141 + (state == UNALLOCATED_EXTENT)) {
50142 + /* relocate */
50143 + if (state == ALLOCATED_EXTENT) {
50144 + /* all protected nodes are not flushprepped, therefore
50145 + * they are counted as flush_reserved */
50146 + block_stage = BLOCK_FLUSH_RESERVED;
50147 + protected = allocated_extent_slum_size(flush_pos, oid,
50148 + index, width);
50149 + if (protected == 0) {
50150 + flush_pos->state = POS_INVALID;
50151 + flush_pos->pos_in_unit = 0;
50152 + return 0;
50153 + }
50154 + } else {
50155 + block_stage = BLOCK_UNALLOCATED;
50156 + protected = width;
50157 + }
50158 +
50159 + /*
50160 + * look at previous unit if possible. If it is allocated, make
50161 + * preceder more precise
50162 + */
50163 + if (coord->unit_pos &&
50164 + (state_of_extent(ext - 1) == ALLOCATED_EXTENT))
50165 + pos_hint(flush_pos)->blk = extent_get_start(ext - 1) +
50166 + extent_get_width(ext - 1);
50167 +
50168 + /* allocate new block numbers for protected nodes */
50169 + extent_allocate_blocks(pos_hint(flush_pos), protected,
50170 + &first_allocated, &allocated,
50171 + block_stage);
50172 +
50173 + /* prepare extent which will be copied to left */
50174 + set_extent(&copy_extent, first_allocated, allocated);
50175 +
50176 + result = put_unit_to_end(left, &key, &copy_extent);
50177 + if (result == -E_NODE_FULL) {
50178 + int target_block_stage;
50179 +
50180 + /* free blocks which were just allocated */
50181 + target_block_stage =
50182 + (state ==
50183 + ALLOCATED_EXTENT) ? BLOCK_FLUSH_RESERVED :
50184 + BLOCK_UNALLOCATED;
50185 + reiser4_dealloc_blocks(&first_allocated, &allocated,
50186 + target_block_stage,
50187 + BA_PERMANENT);
50188 +
50189 + /* rewind the preceder. */
50190 + flush_pos->preceder.blk = first_allocated;
50191 + check_preceder(flush_pos->preceder.blk);
50192 +
50193 + return SQUEEZE_TARGET_FULL;
50194 + }
50195 +
50196 + if (state == ALLOCATED_EXTENT) {
50197 + /* free nodes which were relocated */
50198 + reiser4_dealloc_blocks(&start, &allocated,
50199 + BLOCK_ALLOCATED, BA_DEFER);
50200 + }
50201 +
50202 + /* assign new block numbers to protected nodes */
50203 + assign_real_blocknrs(flush_pos, oid, index, allocated,
50204 + first_allocated);
50205 +
50206 + set_key_offset(&key,
50207 + get_key_offset(&key) +
50208 + (allocated << current_blocksize_bits));
50209 + } else {
50210 + /*
50211 + * overwrite: try to copy unit as it is to left neighbor and
50212 +		 * make all leading not-flushprepped nodes overwrite nodes
50213 + */
50214 + set_extent(&copy_extent, start, width);
50215 + result = put_unit_to_end(left, &key, &copy_extent);
50216 + if (result == -E_NODE_FULL)
50217 + return SQUEEZE_TARGET_FULL;
50218 +
50219 + if (state != HOLE_EXTENT)
50220 + mark_jnodes_overwrite(flush_pos, oid, index, width);
50221 + set_key_offset(&key,
50222 + get_key_offset(&key) +
50223 + (width << current_blocksize_bits));
50224 + }
50225 + *stop_key = key;
50226 + return SQUEEZE_CONTINUE;
50227 +}
50228 +
50229 +int key_by_offset_extent(struct inode *inode, loff_t off, reiser4_key * key)
50230 +{
50231 + return key_by_inode_and_offset_common(inode, off, key);
50232 +}
50233 +
50234 +/*
50235 + * Local variables:
50236 + * c-indentation-style: "K&R"
50237 + * mode-name: "LC"
50238 + * c-basic-offset: 8
50239 + * tab-width: 8
50240 + * fill-column: 79
50241 + * scroll-step: 1
50242 + * End:
50243 + */
50244 Index: linux-2.6.16/fs/reiser4/plugin/item/extent_item_ops.c
50245 ===================================================================
50246 --- /dev/null
50247 +++ linux-2.6.16/fs/reiser4/plugin/item/extent_item_ops.c
50248 @@ -0,0 +1,882 @@
50249 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
50250 +
50251 +#include "item.h"
50252 +#include "../../inode.h"
50253 +#include "../../tree_walk.h" /* check_sibling_list() */
50254 +#include "../../page_cache.h"
50255 +#include "../../carry.h"
50256 +
50257 +#include <linux/quotaops.h>
50258 +
50259 +/* item_plugin->b.max_key_inside */
50260 +reiser4_key *max_key_inside_extent(const coord_t * coord, reiser4_key * key)
50261 +{
50262 + item_key_by_coord(coord, key);
50263 + set_key_offset(key, get_key_offset(max_key()));
50264 + return key;
50265 +}
50266 +
50267 +/* item_plugin->b.can_contain_key
50268 + this checks whether @key of @data is matching to position set by @coord */
50269 +int
50270 +can_contain_key_extent(const coord_t * coord, const reiser4_key * key,
50271 + const reiser4_item_data * data)
50272 +{
50273 + reiser4_key item_key;
50274 +
50275 + if (item_plugin_by_coord(coord) != data->iplug)
50276 + return 0;
50277 +
50278 + item_key_by_coord(coord, &item_key);
50279 + if (get_key_locality(key) != get_key_locality(&item_key) ||
50280 + get_key_objectid(key) != get_key_objectid(&item_key) ||
50281 + get_key_ordering(key) != get_key_ordering(&item_key))
50282 + return 0;
50283 +
50284 + return 1;
50285 +}
50286 +
50287 +/* item_plugin->b.mergeable
50288 + first item is of extent type */
50289 +/* Audited by: green(2002.06.13) */
50290 +int mergeable_extent(const coord_t * p1, const coord_t * p2)
50291 +{
50292 + reiser4_key key1, key2;
50293 +
50294 + assert("vs-299", item_id_by_coord(p1) == EXTENT_POINTER_ID);
50295 + /* FIXME-VS: Which is it? Assert or return 0 */
50296 + if (item_id_by_coord(p2) != EXTENT_POINTER_ID) {
50297 + return 0;
50298 + }
50299 +
50300 + item_key_by_coord(p1, &key1);
50301 + item_key_by_coord(p2, &key2);
50302 + if (get_key_locality(&key1) != get_key_locality(&key2) ||
50303 + get_key_objectid(&key1) != get_key_objectid(&key2) ||
50304 + get_key_ordering(&key1) != get_key_ordering(&key2) ||
50305 + get_key_type(&key1) != get_key_type(&key2))
50306 + return 0;
50307 + if (get_key_offset(&key1) + extent_size(p1, nr_units_extent(p1)) !=
50308 + get_key_offset(&key2))
50309 + return 0;
50310 + return 1;
50311 +}
50312 +
50313 +/* item_plugin->b.nr_units */
50314 +pos_in_node_t nr_units_extent(const coord_t * coord)
50315 +{
50316 + /* length of extent item has to be multiple of extent size */
50317 + assert("vs-1424",
50318 + (item_length_by_coord(coord) % sizeof(reiser4_extent)) == 0);
50319 + return item_length_by_coord(coord) / sizeof(reiser4_extent);
50320 +}
50321 +
50322 +/* item_plugin->b.lookup */
50323 +lookup_result
50324 +lookup_extent(const reiser4_key * key, lookup_bias bias UNUSED_ARG,
50325 + coord_t * coord)
50326 +{ /* znode and item_pos are
50327 + set to an extent item to
50328 + look through */
50329 + reiser4_key item_key;
50330 + reiser4_block_nr lookuped, offset;
50331 + unsigned i, nr_units;
50332 + reiser4_extent *ext;
50333 + unsigned blocksize;
50334 + unsigned char blocksize_bits;
50335 +
50336 + item_key_by_coord(coord, &item_key);
50337 + offset = get_key_offset(&item_key);
50338 +
50339 + /* key we are looking for must be greater than key of item @coord */
50340 + assert("vs-414", keygt(key, &item_key));
50341 +
50342 + assert("umka-99945",
50343 + !keygt(key, max_key_inside_extent(coord, &item_key)));
50344 +
50345 + ext = extent_item(coord);
50346 + assert("vs-1350", (char *)ext == (zdata(coord->node) + coord->offset));
50347 +
50348 + blocksize = current_blocksize;
50349 + blocksize_bits = current_blocksize_bits;
50350 +
50351 + /* offset we are looking for */
50352 + lookuped = get_key_offset(key);
50353 +
50354 + nr_units = nr_units_extent(coord);
50355 +	/* go through all extents until the one which addresses the given offset */
50356 + for (i = 0; i < nr_units; i++, ext++) {
50357 + offset += (extent_get_width(ext) << blocksize_bits);
50358 + if (offset > lookuped) {
50359 + /* desired byte is somewhere in this extent */
50360 + coord->unit_pos = i;
50361 + coord->between = AT_UNIT;
50362 + return CBK_COORD_FOUND;
50363 + }
50364 + }
50365 +
50366 + /* set coord after last unit */
50367 + coord->unit_pos = nr_units - 1;
50368 + coord->between = AFTER_UNIT;
50369 + return CBK_COORD_FOUND;
50370 +}
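+
+/*
+ * Worked example of the loop above: an item with key offset 0 and units
+ * of widths 2, 3 and 1 on a 4096-byte blocksize covers bytes [0, 6*4096).
+ * Looking up byte 3*4096: after unit 0 offset is 2*4096 (not past the
+ * target), after unit 1 it is 5*4096 > 3*4096, so unit_pos = 1 is
+ * returned with between = AT_UNIT.
+ */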
50371 +
50372 +/* item_plugin->b.paste
50373 +   the item @coord is set to has been appended with @data->length bytes of free
50374 + space. data->data contains data to be pasted into the item in position
50375 + @coord->in_item.unit_pos. It must fit into that free space.
50376 + @coord must be set between units.
50377 +*/
50378 +int
50379 +paste_extent(coord_t * coord, reiser4_item_data * data,
50380 + carry_plugin_info * info UNUSED_ARG)
50381 +{
50382 + unsigned old_nr_units;
50383 + reiser4_extent *ext;
50384 + int item_length;
50385 +
50386 + ext = extent_item(coord);
50387 + item_length = item_length_by_coord(coord);
50388 + old_nr_units = (item_length - data->length) / sizeof(reiser4_extent);
50389 +
50390 + /* this is also used to copy extent into newly created item, so
50391 + old_nr_units could be 0 */
50392 + assert("vs-260", item_length >= data->length);
50393 +
50394 + /* make sure that coord is set properly */
50395 + assert("vs-35",
50396 + ((!coord_is_existing_unit(coord))
50397 + || (!old_nr_units && !coord->unit_pos)));
50398 +
50399 + /* first unit to be moved */
50400 + switch (coord->between) {
50401 + case AFTER_UNIT:
50402 + coord->unit_pos++;
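+		/* fall through */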
50403 + case BEFORE_UNIT:
50404 + coord->between = AT_UNIT;
50405 + break;
50406 + case AT_UNIT:
50407 + assert("vs-331", !old_nr_units && !coord->unit_pos);
50408 + break;
50409 + default:
50410 + impossible("vs-330", "coord is set improperly");
50411 + }
50412 +
50413 + /* prepare space for new units */
50414 + memmove(ext + coord->unit_pos + data->length / sizeof(reiser4_extent),
50415 + ext + coord->unit_pos,
50416 + (old_nr_units - coord->unit_pos) * sizeof(reiser4_extent));
50417 +
50418 + /* copy new data from kernel space */
50419 + assert("vs-556", data->user == 0);
50420 + memcpy(ext + coord->unit_pos, data->data, (unsigned)data->length);
50421 +
50422 + /* after paste @coord is set to first of pasted units */
50423 + assert("vs-332", coord_is_existing_unit(coord));
50424 + assert("vs-333",
50425 + !memcmp(data->data, extent_by_coord(coord),
50426 + (unsigned)data->length));
50427 + return 0;
50428 +}
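+
+/*
+ * Example of the memmove() above: pasting one unit at unit_pos 1 of an
+ * item that had units [A B C] and has just been grown by
+ * sizeof(reiser4_extent) first yields [A _ B C], then the new unit is
+ * copied into the hole from data->data.
+ */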
50429 +
50430 +/* item_plugin->b.can_shift */
50431 +int
50432 +can_shift_extent(unsigned free_space, coord_t * source,
50433 + znode * target UNUSED_ARG, shift_direction pend UNUSED_ARG,
50434 + unsigned *size, unsigned want)
50435 +{
50436 + *size = item_length_by_coord(source);
50437 + if (*size > free_space)
50438 + /* never split a unit of extent item */
50439 + *size = free_space - free_space % sizeof(reiser4_extent);
50440 +
50441 +	/* we can shift *size bytes; calculate how many we actually want to shift */
50442 + if (*size > want * sizeof(reiser4_extent))
50443 + *size = want * sizeof(reiser4_extent);
50444 +
50445 + if (*size % sizeof(reiser4_extent) != 0)
50446 + impossible("vs-119", "Wrong extent size: %i %zd", *size,
50447 + sizeof(reiser4_extent));
50448 + return *size / sizeof(reiser4_extent);
50449 +
50450 +}
50451 +
50452 +/* item_plugin->b.copy_units */
50453 +void
50454 +copy_units_extent(coord_t * target, coord_t * source,
50455 + unsigned from, unsigned count,
50456 + shift_direction where_is_free_space, unsigned free_space)
50457 +{
50458 + char *from_ext, *to_ext;
50459 +
50460 + assert("vs-217", free_space == count * sizeof(reiser4_extent));
50461 +
50462 + from_ext = item_body_by_coord(source);
50463 + to_ext = item_body_by_coord(target);
50464 +
50465 + if (where_is_free_space == SHIFT_LEFT) {
50466 + assert("vs-215", from == 0);
50467 +
50468 + /* At this moment, item length was already updated in the item
50469 + header by shifting code, hence nr_units_extent() will
50470 + return "new" number of units---one we obtain after copying
50471 + units.
50472 + */
50473 + to_ext +=
50474 + (nr_units_extent(target) - count) * sizeof(reiser4_extent);
50475 + } else {
50476 + reiser4_key key;
50477 + coord_t coord;
50478 +
50479 + assert("vs-216",
50480 + from + count == coord_last_unit_pos(source) + 1);
50481 +
50482 + from_ext += item_length_by_coord(source) - free_space;
50483 +
50484 + /* new units are inserted before first unit in an item,
50485 + therefore, we have to update item key */
50486 + coord = *source;
50487 + coord.unit_pos = from;
50488 + unit_key_extent(&coord, &key);
50489 +
50490 + node_plugin_by_node(target->node)->update_item_key(target, &key,
50491 + NULL /*info */);
50492 + }
50493 +
50494 + memcpy(to_ext, from_ext, free_space);
50495 +}
50496 +
50497 +/* item_plugin->b.create_hook
50498 + @arg is znode of leaf node for which we need to update right delimiting key */
50499 +int create_hook_extent(const coord_t * coord, void *arg)
50500 +{
50501 + coord_t *child_coord;
50502 + znode *node;
50503 + reiser4_key key;
50504 + reiser4_tree *tree;
50505 +
50506 + if (!arg)
50507 + return 0;
50508 +
50509 + child_coord = arg;
50510 + tree = znode_get_tree(coord->node);
50511 +
50512 + assert("nikita-3246", znode_get_level(child_coord->node) == LEAF_LEVEL);
50513 +
50514 + write_lock_tree(tree);
50515 + write_lock_dk(tree);
50516 + /* find a node on the left level for which right delimiting key has to
50517 + be updated */
50518 + if (coord_wrt(child_coord) == COORD_ON_THE_LEFT) {
50519 + assert("vs-411", znode_is_left_connected(child_coord->node));
50520 + node = child_coord->node->left;
50521 + } else {
50522 + assert("vs-412", coord_wrt(child_coord) == COORD_ON_THE_RIGHT);
50523 + node = child_coord->node;
50524 + assert("nikita-3314", node != NULL);
50525 + }
50526 +
50527 + if (node != NULL) {
50528 + znode_set_rd_key(node, item_key_by_coord(coord, &key));
50529 +
50530 + assert("nikita-3282", check_sibling_list(node));
50531 + /* break sibling links */
50532 + if (ZF_ISSET(node, JNODE_RIGHT_CONNECTED) && node->right) {
50533 + ON_DEBUG(node->right->left_version =
50534 + atomic_inc_return(&delim_key_version);
50535 + node->right_version =
50536 + atomic_inc_return(&delim_key_version););
50537 +
50538 + node->right->left = NULL;
50539 + node->right = NULL;
50540 + }
50541 + }
50542 + write_unlock_dk(tree);
50543 + write_unlock_tree(tree);
50544 + return 0;
50545 +}
50546 +
50547 +#define ITEM_TAIL_KILLED 0
50548 +#define ITEM_HEAD_KILLED 1
50549 +#define ITEM_KILLED 2
50550 +
50551 +/* item_plugin->b.kill_hook
50552 + this is called when @count units starting from @from-th one are going to be removed
50553 + */
50554 +int
50555 +kill_hook_extent(const coord_t * coord, pos_in_node_t from, pos_in_node_t count,
50556 + struct carry_kill_data *kdata)
50557 +{
50558 + reiser4_extent *ext;
50559 + reiser4_block_nr start, length;
50560 + const reiser4_key *pfrom_key, *pto_key;
50561 + struct inode *inode;
50562 + reiser4_tree *tree;
50563 + pgoff_t from_off, to_off, offset, skip;
50564 + int retval;
50565 +
50566 + /* these are located in memory kmalloc-ed by kill_node_content */
50567 + reiser4_key *min_item_key, *max_item_key, *from_key, *to_key, *key;
50568 + coord_t *dup, *next;
50569 +
50570 + assert("zam-811", znode_is_write_locked(coord->node));
50571 + assert("nikita-3315", kdata != NULL);
50572 + assert("vs-34", kdata->buf != NULL);
50573 +
50574 + /* map structures to kdata->buf */
50575 + min_item_key = (reiser4_key *) (kdata->buf);
50576 + max_item_key = min_item_key + 1;
50577 + from_key = max_item_key + 1;
50578 + to_key = from_key + 1;
50579 + key = to_key + 1;
50580 + dup = (coord_t *) (key + 1);
50581 + next = dup + 1;
50582 +
50583 + item_key_by_coord(coord, min_item_key);
50584 + max_item_key_by_coord(coord, max_item_key);
50585 +
50586 + if (kdata->params.from_key) {
50587 + pfrom_key = kdata->params.from_key;
50588 + pto_key = kdata->params.to_key;
50589 + } else {
50590 + assert("vs-1549", from == coord->unit_pos);
50591 + unit_key_by_coord(coord, from_key);
50592 + pfrom_key = from_key;
50593 +
50594 + coord_dup(dup, coord);
50595 + dup->unit_pos = from + count - 1;
50596 + max_unit_key_by_coord(dup, to_key);
50597 + pto_key = to_key;
50598 + }
50599 +
50600 + if (!keylt(pto_key, max_item_key)) {
50601 + if (!keygt(pfrom_key, min_item_key)) {
50602 + znode *left, *right;
50603 +
50604 + /* item is to be removed completely */
50605 + assert("nikita-3316", kdata->left != NULL
50606 + && kdata->right != NULL);
50607 +
50608 + left = kdata->left->node;
50609 + right = kdata->right->node;
50610 +
50611 + tree = current_tree;
50612 + /* we have to do two things:
50613 + *
50614 + * 1. link left and right formatted neighbors of
50615 + * extent being removed, and
50616 + *
50617 + * 2. update their delimiting keys.
50618 + *
50619 + * atomicity of these operations is protected by
50620 + * taking dk-lock and tree-lock.
50621 + */
50622 + /* if neighbors of item being removed are znodes -
50623 + * link them */
50624 + write_lock_tree(tree);
50625 + write_lock_dk(tree);
50626 + link_left_and_right(left, right);
50627 + if (left) {
50628 + /* update right delimiting key of left
50629 + * neighbor of extent item */
50630 + /*coord_t next;
50631 + reiser4_key key; */
50632 +
50633 + coord_dup(next, coord);
50634 +
50635 + if (coord_next_item(next))
50636 + *key = *znode_get_rd_key(coord->node);
50637 + else
50638 + item_key_by_coord(next, key);
50639 + znode_set_rd_key(left, key);
50640 + }
50641 + write_unlock_dk(tree);
50642 + write_unlock_tree(tree);
50643 +
50644 + from_off =
50645 + get_key_offset(min_item_key) >> PAGE_CACHE_SHIFT;
50646 + to_off =
50647 + (get_key_offset(max_item_key) +
50648 + 1) >> PAGE_CACHE_SHIFT;
50649 + retval = ITEM_KILLED;
50650 + } else {
50651 + /* tail of item is to be removed */
50652 + from_off =
50653 + (get_key_offset(pfrom_key) + PAGE_CACHE_SIZE -
50654 + 1) >> PAGE_CACHE_SHIFT;
50655 + to_off =
50656 + (get_key_offset(max_item_key) +
50657 + 1) >> PAGE_CACHE_SHIFT;
50658 + retval = ITEM_TAIL_KILLED;
50659 + }
50660 + } else {
50661 + /* head of item is to be removed */
50662 + assert("vs-1571", keyeq(pfrom_key, min_item_key));
50663 + assert("vs-1572",
50664 + (get_key_offset(pfrom_key) & (PAGE_CACHE_SIZE - 1)) ==
50665 + 0);
50666 + assert("vs-1573",
50667 + ((get_key_offset(pto_key) + 1) & (PAGE_CACHE_SIZE -
50668 + 1)) == 0);
50669 +
50670 + if (kdata->left->node) {
50671 + /* update right delimiting key of left neighbor of extent item */
50672 + /*reiser4_key key; */
50673 +
50674 + *key = *pto_key;
50675 + set_key_offset(key, get_key_offset(pto_key) + 1);
50676 +
50677 + write_lock_dk(current_tree);
50678 + znode_set_rd_key(kdata->left->node, key);
50679 + write_unlock_dk(current_tree);
50680 + }
50681 +
50682 + from_off = get_key_offset(pfrom_key) >> PAGE_CACHE_SHIFT;
50683 + to_off = (get_key_offset(pto_key) + 1) >> PAGE_CACHE_SHIFT;
50684 + retval = ITEM_HEAD_KILLED;
50685 + }
50686 +
50687 + inode = kdata->inode;
50688 + assert("vs-1545", inode != NULL);
50689 + if (inode != NULL)
50690 + /* take care of pages and jnodes corresponding to part of item being killed */
50691 + reiser4_invalidate_pages(inode->i_mapping, from_off,
50692 + to_off - from_off,
50693 + kdata->params.truncate);
50694 +
50695 + ext = extent_item(coord) + from;
50696 + offset =
50697 + (get_key_offset(min_item_key) +
50698 + extent_size(coord, from)) >> PAGE_CACHE_SHIFT;
50699 +
50700 + assert("vs-1551", from_off >= offset);
50701 + assert("vs-1552", from_off - offset <= extent_get_width(ext));
50702 + skip = from_off - offset;
50703 + offset = from_off;
50704 +
50705 + while (offset < to_off) {
50706 + length = extent_get_width(ext) - skip;
50707 + if (state_of_extent(ext) == HOLE_EXTENT) {
50708 + skip = 0;
50709 + offset += length;
50710 + ext++;
50711 + continue;
50712 + }
50713 +
50714 + if (offset + length > to_off) {
50715 + length = to_off - offset;
50716 + }
50717 +
50718 + DQUOT_FREE_BLOCK_NODIRTY(inode, length);
50719 +
50720 + if (state_of_extent(ext) == UNALLOCATED_EXTENT) {
50721 +			/* free fake-allocated space of jnodes corresponding to this unallocated extent */
50722 + fake_allocated2free(length, 0 /* unformatted */ );
50723 +
50724 + skip = 0;
50725 + offset += length;
50726 + ext++;
50727 + continue;
50728 + }
50729 +
50730 + assert("vs-1218", state_of_extent(ext) == ALLOCATED_EXTENT);
50731 +
50732 + if (length != 0) {
50733 + start = extent_get_start(ext) + skip;
50734 +
50735 + /* BA_DEFER bit parameter is turned on because blocks which get freed are not safe to be freed
50736 + immediately */
50737 + reiser4_dealloc_blocks(&start, &length,
50738 + 0 /* not used */ ,
50739 + BA_DEFER
50740 + /* unformatted with defer */ );
50741 + }
50742 + skip = 0;
50743 + offset += length;
50744 + ext++;
50745 + }
50746 + return retval;
50747 +}
50748 +
50749 +/* item_plugin->b.kill_units */
50750 +int
50751 +kill_units_extent(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
50752 + struct carry_kill_data *kdata, reiser4_key * smallest_removed,
50753 + reiser4_key * new_first)
50754 +{
50755 + reiser4_extent *ext;
50756 + reiser4_key item_key;
50757 + pos_in_node_t count;
50758 + reiser4_key from_key, to_key;
50759 + const reiser4_key *pfrom_key, *pto_key;
50760 + loff_t off;
50761 + int result;
50762 +
50763 + assert("vs-1541",
50764 + ((kdata->params.from_key == NULL && kdata->params.to_key == NULL)
50765 + || (kdata->params.from_key != NULL
50766 + && kdata->params.to_key != NULL)));
50767 +
50768 + if (kdata->params.from_key) {
50769 + pfrom_key = kdata->params.from_key;
50770 + pto_key = kdata->params.to_key;
50771 + } else {
50772 + coord_t dup;
50773 +
50774 + /* calculate key range of kill */
50775 + assert("vs-1549", from == coord->unit_pos);
50776 + unit_key_by_coord(coord, &from_key);
50777 + pfrom_key = &from_key;
50778 +
50779 + coord_dup(&dup, coord);
50780 + dup.unit_pos = to;
50781 + max_unit_key_by_coord(&dup, &to_key);
50782 + pto_key = &to_key;
50783 + }
50784 +
50785 + item_key_by_coord(coord, &item_key);
50786 +
50787 +#if REISER4_DEBUG
50788 + {
50789 + reiser4_key max_item_key;
50790 +
50791 + max_item_key_by_coord(coord, &max_item_key);
50792 +
50793 + if (new_first) {
50794 + /* head of item is to be cut */
50795 + assert("vs-1542", keyeq(pfrom_key, &item_key));
50796 + assert("vs-1538", keylt(pto_key, &max_item_key));
50797 + } else {
50798 + /* tail of item is to be cut */
50799 + assert("vs-1540", keygt(pfrom_key, &item_key));
50800 + assert("vs-1543", !keylt(pto_key, &max_item_key));
50801 + }
50802 + }
50803 +#endif
50804 +
50805 + if (smallest_removed)
50806 + *smallest_removed = *pfrom_key;
50807 +
50808 + if (new_first) {
50809 + /* item head is cut. Item key will change. This new key is calculated here */
50810 + assert("vs-1556",
50811 + (get_key_offset(pto_key) & (PAGE_CACHE_SIZE - 1)) ==
50812 + (PAGE_CACHE_SIZE - 1));
50813 + *new_first = *pto_key;
50814 + set_key_offset(new_first, get_key_offset(new_first) + 1);
50815 + }
50816 +
50817 + count = to - from + 1;
50818 + result = kill_hook_extent(coord, from, count, kdata);
50819 + if (result == ITEM_TAIL_KILLED) {
50820 + assert("vs-1553",
50821 + get_key_offset(pfrom_key) >=
50822 + get_key_offset(&item_key) + extent_size(coord, from));
50823 + off =
50824 + get_key_offset(pfrom_key) - (get_key_offset(&item_key) +
50825 + extent_size(coord, from));
50826 + if (off) {
50827 + /* unit @from is to be cut partially. Its width decreases */
50828 + ext = extent_item(coord) + from;
50829 + extent_set_width(ext,
50830 + (off + PAGE_CACHE_SIZE -
50831 + 1) >> PAGE_CACHE_SHIFT);
50832 + count--;
50833 + }
50834 + } else {
50835 + __u64 max_to_offset;
50836 + __u64 rest;
50837 +
50838 + assert("vs-1575", result == ITEM_HEAD_KILLED);
50839 + assert("", from == 0);
50840 + assert("",
50841 + ((get_key_offset(pto_key) + 1) & (PAGE_CACHE_SIZE -
50842 + 1)) == 0);
50843 + assert("",
50844 + get_key_offset(pto_key) + 1 >
50845 + get_key_offset(&item_key) + extent_size(coord, to));
50846 + max_to_offset =
50847 + get_key_offset(&item_key) + extent_size(coord, to + 1) - 1;
50848 + assert("", get_key_offset(pto_key) <= max_to_offset);
50849 +
50850 + rest =
50851 + (max_to_offset -
50852 + get_key_offset(pto_key)) >> PAGE_CACHE_SHIFT;
50853 + if (rest) {
50854 + /* unit @to is to be cut partially */
50855 + ext = extent_item(coord) + to;
50856 +
50857 + assert("", extent_get_width(ext) > rest);
50858 +
50859 + if (state_of_extent(ext) == ALLOCATED_EXTENT)
50860 + extent_set_start(ext,
50861 + extent_get_start(ext) +
50862 + (extent_get_width(ext) -
50863 + rest));
50864 +
50865 + extent_set_width(ext, rest);
50866 + count--;
50867 + }
50868 + }
50869 + return count * sizeof(reiser4_extent);
50870 +}
50871 +
50872 +/* item_plugin->b.cut_units
50873 + this is too similar to kill_units_extent */
50874 +int
50875 +cut_units_extent(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
50876 + struct carry_cut_data *cdata, reiser4_key * smallest_removed,
50877 + reiser4_key * new_first)
50878 +{
50879 + reiser4_extent *ext;
50880 + reiser4_key item_key;
50881 + pos_in_node_t count;
50882 + reiser4_key from_key, to_key;
50883 + const reiser4_key *pfrom_key, *pto_key;
50884 + loff_t off;
50885 +
50886 + assert("vs-1541",
50887 + ((cdata->params.from_key == NULL && cdata->params.to_key == NULL)
50888 + || (cdata->params.from_key != NULL
50889 + && cdata->params.to_key != NULL)));
50890 +
50891 + if (cdata->params.from_key) {
50892 + pfrom_key = cdata->params.from_key;
50893 + pto_key = cdata->params.to_key;
50894 + } else {
50895 + coord_t dup;
50896 +
50897 + /* calculate key range of kill */
50898 + coord_dup(&dup, coord);
50899 + dup.unit_pos = from;
50900 + unit_key_by_coord(&dup, &from_key);
50901 +
50902 + dup.unit_pos = to;
50903 + max_unit_key_by_coord(&dup, &to_key);
50904 +
50905 + pfrom_key = &from_key;
50906 + pto_key = &to_key;
50907 + }
50908 +
50909 + assert("vs-1555",
50910 + (get_key_offset(pfrom_key) & (PAGE_CACHE_SIZE - 1)) == 0);
50911 + assert("vs-1556",
50912 + (get_key_offset(pto_key) & (PAGE_CACHE_SIZE - 1)) ==
50913 + (PAGE_CACHE_SIZE - 1));
50914 +
50915 + item_key_by_coord(coord, &item_key);
50916 +
50917 +#if REISER4_DEBUG
50918 + {
50919 + reiser4_key max_item_key;
50920 +
50921 + assert("vs-1584",
50922 + get_key_locality(pfrom_key) ==
50923 + get_key_locality(&item_key));
50924 + assert("vs-1585",
50925 + get_key_type(pfrom_key) == get_key_type(&item_key));
50926 + assert("vs-1586",
50927 + get_key_objectid(pfrom_key) ==
50928 + get_key_objectid(&item_key));
50929 + assert("vs-1587",
50930 + get_key_ordering(pfrom_key) ==
50931 + get_key_ordering(&item_key));
50932 +
50933 + max_item_key_by_coord(coord, &max_item_key);
50934 +
50935 + if (new_first != NULL) {
50936 + /* head of item is to be cut */
50937 + assert("vs-1542", keyeq(pfrom_key, &item_key));
50938 + assert("vs-1538", keylt(pto_key, &max_item_key));
50939 + } else {
50940 + /* tail of item is to be cut */
50941 + assert("vs-1540", keygt(pfrom_key, &item_key));
50942 + assert("vs-1543", keyeq(pto_key, &max_item_key));
50943 + }
50944 + }
50945 +#endif
50946 +
50947 + if (smallest_removed)
50948 + *smallest_removed = *pfrom_key;
50949 +
50950 + if (new_first) {
50951 + /* item head is cut. Item key will change. This new key is calculated here */
50952 + *new_first = *pto_key;
50953 + set_key_offset(new_first, get_key_offset(new_first) + 1);
50954 + }
50955 +
50956 + count = to - from + 1;
50957 +
50958 + assert("vs-1553",
50959 + get_key_offset(pfrom_key) >=
50960 + get_key_offset(&item_key) + extent_size(coord, from));
50961 + off =
50962 + get_key_offset(pfrom_key) - (get_key_offset(&item_key) +
50963 + extent_size(coord, from));
50964 + if (off) {
50965 + /* tail of unit @from is to be cut partially. Its width decreases */
50966 + assert("vs-1582", new_first == NULL);
50967 + ext = extent_item(coord) + from;
50968 + extent_set_width(ext, off >> PAGE_CACHE_SHIFT);
50969 + count--;
50970 + }
50971 +
50972 + assert("vs-1554",
50973 + get_key_offset(pto_key) <=
50974 + get_key_offset(&item_key) + extent_size(coord, to + 1) - 1);
50975 + off =
50976 + (get_key_offset(&item_key) + extent_size(coord, to + 1) - 1) -
50977 + get_key_offset(pto_key);
50978 + if (off) {
50979 + /* @to_key is smaller than max key of unit @to. Unit @to will not be removed. It gets start increased
50980 + and width decreased. */
50981 + assert("vs-1583", (off & (PAGE_CACHE_SIZE - 1)) == 0);
50982 + ext = extent_item(coord) + to;
50983 + if (state_of_extent(ext) == ALLOCATED_EXTENT)
50984 + extent_set_start(ext,
50985 + extent_get_start(ext) +
50986 + (extent_get_width(ext) -
50987 + (off >> PAGE_CACHE_SHIFT)));
50988 +
50989 + extent_set_width(ext, (off >> PAGE_CACHE_SHIFT));
50990 + count--;
50991 + }
50992 + return count * sizeof(reiser4_extent);
50993 +}
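+
+/*
+ * Worked example of the partial-cut arithmetic above: with item key
+ * offset 0, 4096-byte blocks and unit @from = (1000, 10), cutting from
+ * byte offset 3*4096 gives off == 3*4096, so the unit is narrowed to
+ * width 3 and not counted as removed. Symmetrically, a partially cut
+ * unit @to keeps its tail: its start grows and its width shrinks by the
+ * number of cut blocks.
+ */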
50994 +
50995 +/* item_plugin->b.unit_key */
50996 +reiser4_key *unit_key_extent(const coord_t * coord, reiser4_key * key)
50997 +{
50998 + assert("vs-300", coord_is_existing_unit(coord));
50999 +
51000 + item_key_by_coord(coord, key);
51001 + set_key_offset(key,
51002 + (get_key_offset(key) +
51003 + extent_size(coord, coord->unit_pos)));
51004 +
51005 + return key;
51006 +}
51007 +
51008 +/* item_plugin->b.max_unit_key */
51009 +reiser4_key *max_unit_key_extent(const coord_t * coord, reiser4_key * key)
51010 +{
51011 + assert("vs-300", coord_is_existing_unit(coord));
51012 +
51013 + item_key_by_coord(coord, key);
51014 + set_key_offset(key,
51015 + (get_key_offset(key) +
51016 + extent_size(coord, coord->unit_pos + 1) - 1));
51017 + return key;
51018 +}
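+
+/*
+ * Example: with item key offset 0, a 4096-byte blocksize and units of
+ * widths 16 and 8, unit_key_extent() for unit 1 yields offset 16*4096
+ * and max_unit_key_extent() yields offset 24*4096 - 1, i.e. the first
+ * and the last byte addressed by that unit.
+ */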
51019 +
51020 +/* item_plugin->b.estimate
51021 + item_plugin->b.item_data_by_flow */
51022 +
51023 +#if REISER4_DEBUG
51024 +
51025 +/* item_plugin->b.check
51026 + used for debugging, every item should have here the most complete
51027 + possible check of the consistency of the item that the inventor can
51028 + construct
51029 +*/
51030 +int check_extent(const coord_t * coord /* coord of item to check */ ,
51031 + const char **error /* where to store error message */ )
51032 +{
51033 + reiser4_extent *ext, *first;
51034 + unsigned i, j;
51035 + reiser4_block_nr start, width, blk_cnt;
51036 + unsigned num_units;
51037 + reiser4_tree *tree;
51038 + oid_t oid;
51039 + reiser4_key key;
51040 + coord_t scan;
51041 +
51042 + assert("vs-933", REISER4_DEBUG);
51043 +
51044 + if (znode_get_level(coord->node) != TWIG_LEVEL) {
51045 + *error = "Extent on the wrong level";
51046 + return -1;
51047 + }
51048 + if (item_length_by_coord(coord) % sizeof(reiser4_extent) != 0) {
51049 + *error = "Wrong item size";
51050 + return -1;
51051 + }
51052 + ext = first = extent_item(coord);
51053 + blk_cnt = reiser4_block_count(reiser4_get_current_sb());
51054 + num_units = coord_num_units(coord);
51055 + tree = znode_get_tree(coord->node);
51056 + item_key_by_coord(coord, &key);
51057 + oid = get_key_objectid(&key);
51058 + coord_dup(&scan, coord);
51059 +
51060 + for (i = 0; i < num_units; ++i, ++ext) {
51061 + __u64 index;
51062 +
51063 + scan.unit_pos = i;
51064 + index = extent_unit_index(&scan);
51065 +
51066 +#if 0
51067 + /* check that all jnodes are present for the unallocated
51068 + * extent */
51069 + if (state_of_extent(ext) == UNALLOCATED_EXTENT) {
51070 + for (j = 0; j < extent_get_width(ext); j++) {
51071 + jnode *node;
51072 +
51073 + node = jlookup(tree, oid, index + j);
51074 + if (node == NULL) {
51075 + print_coord("scan", &scan, 0);
51076 + *error = "Jnode missing";
51077 + return -1;
51078 + }
51079 + jput(node);
51080 + }
51081 + }
51082 +#endif
51083 +
51084 + start = extent_get_start(ext);
51085 + if (start < 2)
51086 + continue;
51087 +		/* this is an allocated extent */
51088 + width = extent_get_width(ext);
51089 + if (start >= blk_cnt) {
51090 + *error = "Start too large";
51091 + return -1;
51092 + }
51093 + if (start + width > blk_cnt) {
51094 + *error = "End too large";
51095 + return -1;
51096 + }
51097 + /* make sure that this extent does not overlap with other
51098 +		   allocated extents */
51099 + for (j = 0; j < i; j++) {
51100 + if (state_of_extent(first + j) != ALLOCATED_EXTENT)
51101 + continue;
51102 + if (!
51103 + ((extent_get_start(ext) >=
51104 + extent_get_start(first + j) +
51105 + extent_get_width(first + j))
51106 + || (extent_get_start(ext) +
51107 + extent_get_width(ext) <=
51108 + extent_get_start(first + j)))) {
51109 + *error = "Extent overlaps with others";
51110 + return -1;
51111 + }
51112 + }
51113 +
51114 + }
51115 +
51116 + return 0;
51117 +}
51118 +
51119 +#endif /* REISER4_DEBUG */
51120 +
51121 +/*
51122 + Local variables:
51123 + c-indentation-style: "K&R"
51124 + mode-name: "LC"
51125 + c-basic-offset: 8
51126 + tab-width: 8
51127 + fill-column: 120
51128 + scroll-step: 1
51129 + End:
51130 +*/
51131 Index: linux-2.6.16/fs/reiser4/plugin/item/internal.c
51132 ===================================================================
51133 --- /dev/null
51134 +++ linux-2.6.16/fs/reiser4/plugin/item/internal.c
51135 @@ -0,0 +1,392 @@
51136 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
51137 +
51138 +/* Implementation of internal-item plugin methods. */
51139 +
51140 +#include "../../forward.h"
51141 +#include "../../debug.h"
51142 +#include "../../dformat.h"
51143 +#include "../../key.h"
51144 +#include "../../coord.h"
51145 +#include "internal.h"
51146 +#include "item.h"
51147 +#include "../node/node.h"
51148 +#include "../plugin.h"
51149 +#include "../../jnode.h"
51150 +#include "../../znode.h"
51151 +#include "../../tree_walk.h"
51152 +#include "../../tree_mod.h"
51153 +#include "../../tree.h"
51154 +#include "../../super.h"
51155 +#include "../../block_alloc.h"
51156 +
51157 +/* see internal.h for explanation */
51158 +
51159 +/* plugin->u.item.b.mergeable */
51160 +int mergeable_internal(const coord_t * p1 UNUSED_ARG /* first item */ ,
51161 + const coord_t * p2 UNUSED_ARG /* second item */ )
51162 +{
51163 + /* internal items are not mergeable */
51164 + return 0;
51165 +}
51166 +
51167 +/* ->lookup() method for internal items */
51168 +lookup_result lookup_internal(const reiser4_key * key /* key to look up */ ,
51169 + lookup_bias bias UNUSED_ARG /* lookup bias */ ,
51170 + coord_t * coord /* coord of item */ )
51171 +{
51172 + reiser4_key ukey;
51173 +
51174 + switch (keycmp(unit_key_by_coord(coord, &ukey), key)) {
51175 + default:
51176 + impossible("", "keycmp()?!");
51177 + case LESS_THAN:
51178 + /* FIXME-VS: AFTER_ITEM used to be here. But with new coord
51179 + item plugin can not be taken using coord set this way */
51180 + assert("vs-681", coord->unit_pos == 0);
51181 + coord->between = AFTER_UNIT;
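+		/* fall through */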
51182 + case EQUAL_TO:
51183 + return CBK_COORD_FOUND;
51184 + case GREATER_THAN:
51185 + return CBK_COORD_NOTFOUND;
51186 + }
51187 +}
51188 +
51189 +/* return body of internal item at @coord */
51190 +static internal_item_layout *internal_at(const coord_t * coord /* coord of
51191 + * item */ )
51192 +{
51193 + assert("nikita-607", coord != NULL);
51194 + assert("nikita-1650",
51195 + item_plugin_by_coord(coord) ==
51196 + item_plugin_by_id(NODE_POINTER_ID));
51197 + return (internal_item_layout *) item_body_by_coord(coord);
51198 +}
51199 +
51200 +void update_internal(const coord_t * coord, const reiser4_block_nr * blocknr)
51201 +{
51202 + internal_item_layout *item = internal_at(coord);
51203 + assert("nikita-2959", reiser4_blocknr_is_sane(blocknr));
51204 +
51205 + put_unaligned(cpu_to_le64(*blocknr), &item->pointer);
51206 +}
51207 +
51208 +/* return child block number stored in the internal item at @coord */
51209 +static reiser4_block_nr pointer_at(const coord_t * coord /* coord of item */ )
51210 +{
51211 + assert("nikita-608", coord != NULL);
51212 + return le64_to_cpu(get_unaligned(&internal_at(coord)->pointer));
51213 +}
51214 +
51215 +/* get znode pointed to by internal @item */
51216 +static znode *znode_at(const coord_t * item /* coord of item */ ,
51217 + znode * parent /* parent node */ )
51218 +{
51219 + return child_znode(item, parent, 1, 0);
51220 +}
51221 +
51222 +/* store pointer from internal item into "block". Implementation of
51223 + ->down_link() method */
51224 +void down_link_internal(const coord_t * coord /* coord of item */ ,
51225 + const reiser4_key * key UNUSED_ARG /* key to get
51226 + * pointer for */ ,
51227 + reiser4_block_nr * block /* resulting block number */ )
51228 +{
51229 + ON_DEBUG(reiser4_key item_key);
51230 +
51231 + assert("nikita-609", coord != NULL);
51232 + assert("nikita-611", block != NULL);
51233 + assert("nikita-612", (key == NULL) ||
51234 + /* twig horrors */
51235 + (znode_get_level(coord->node) == TWIG_LEVEL)
51236 + || keyle(item_key_by_coord(coord, &item_key), key));
51237 +
51238 + *block = pointer_at(coord);
51239 + assert("nikita-2960", reiser4_blocknr_is_sane(block));
51240 +}
51241 +
51242 +/* Get the child's block number, or 0 if the block is unallocated. */
51243 +int
51244 +utmost_child_real_block_internal(const coord_t * coord, sideof side UNUSED_ARG,
51245 + reiser4_block_nr * block)
51246 +{
51247 + assert("jmacd-2059", coord != NULL);
51248 +
51249 + *block = pointer_at(coord);
51250 + assert("nikita-2961", reiser4_blocknr_is_sane(block));
51251 +
51252 + if (blocknr_is_fake(block)) {
51253 + *block = 0;
51254 + }
51255 +
51256 + return 0;
51257 +}
51258 +
51259 +/* Return the child. */
51260 +int
51261 +utmost_child_internal(const coord_t * coord, sideof side UNUSED_ARG,
51262 + jnode ** childp)
51263 +{
51264 + reiser4_block_nr block = pointer_at(coord);
51265 + znode *child;
51266 +
51267 + assert("jmacd-2059", childp != NULL);
51268 + assert("nikita-2962", reiser4_blocknr_is_sane(&block));
51269 +
51270 + child = zlook(znode_get_tree(coord->node), &block);
51271 +
51272 + if (IS_ERR(child)) {
51273 + return PTR_ERR(child);
51274 + }
51275 +
51276 + *childp = ZJNODE(child);
51277 +
51278 + return 0;
51279 +}
51280 +
51281 +static void check_link(znode * left, znode * right)
51282 +{
51283 + znode *scan;
51284 +
51285 + for (scan = left; scan != right; scan = scan->right) {
51286 + if (ZF_ISSET(scan, JNODE_RIP))
51287 + break;
51288 + if (znode_is_right_connected(scan) && scan->right != NULL) {
51289 + if (ZF_ISSET(scan->right, JNODE_RIP))
51290 + break;
51291 + assert("nikita-3285",
51292 + znode_is_left_connected(scan->right));
51293 + assert("nikita-3265",
51294 + ergo(scan != left,
51295 + ZF_ISSET(scan, JNODE_HEARD_BANSHEE)));
51296 + assert("nikita-3284", scan->right->left == scan);
51297 + } else
51298 + break;
51299 + }
51300 +}
51301 +
51302 +int check__internal(const coord_t * coord, const char **error)
51303 +{
51304 + reiser4_block_nr blk;
51305 + znode *child;
51306 + coord_t cpy;
51307 +
51308 + blk = pointer_at(coord);
51309 + if (!reiser4_blocknr_is_sane(&blk)) {
51310 + *error = "Invalid pointer";
51311 + return -1;
51312 + }
51313 + coord_dup(&cpy, coord);
51314 + child = znode_at(&cpy, cpy.node);
51315 + if (child != NULL) {
51316 + znode *left_child;
51317 + znode *right_child;
51318 +
51319 + left_child = right_child = NULL;
51320 +
51321 + assert("nikita-3256", znode_invariant(child));
51322 + if (coord_prev_item(&cpy) == 0 && item_is_internal(&cpy)) {
51323 + left_child = znode_at(&cpy, cpy.node);
51324 + if (left_child != NULL) {
51325 + read_lock_tree(znode_get_tree(child));
51326 + check_link(left_child, child);
51327 + read_unlock_tree(znode_get_tree(child));
51328 + zput(left_child);
51329 + }
51330 + }
51331 + coord_dup(&cpy, coord);
51332 + if (coord_next_item(&cpy) == 0 && item_is_internal(&cpy)) {
51333 + right_child = znode_at(&cpy, cpy.node);
51334 + if (right_child != NULL) {
51335 + read_lock_tree(znode_get_tree(child));
51336 + check_link(child, right_child);
51337 + read_unlock_tree(znode_get_tree(child));
51338 + zput(right_child);
51339 + }
51340 + }
51341 + zput(child);
51342 + }
51343 + return 0;
51344 +}
51345 +
51346 +/* return true only if this item really points to "block" */
51347 +/* Audited by: green(2002.06.14) */
51348 +int has_pointer_to_internal(const coord_t * coord /* coord of item */ ,
51349 + const reiser4_block_nr * block /* block number to
51350 + * check */ )
51351 +{
51352 + assert("nikita-613", coord != NULL);
51353 + assert("nikita-614", block != NULL);
51354 +
51355 + return pointer_at(coord) == *block;
51356 +}
51357 +
51358 +/* hook called by the ->create_item() method of the node plugin after a new
51359 +   internal item was just created.
51360 +
51361 +   This is the point where the pointer to the new node is inserted into the tree.
51362 +   Initialize the parent pointer in the child znode, insert the child into the sibling list and the slum.
51363 +
51364 +*/
51365 +int create_hook_internal(const coord_t * item /* coord of item */ ,
51366 + void *arg /* child's left neighbor, if any */ )
51367 +{
51368 + znode *child;
51369 + __u64 child_ptr;
51370 +
51371 + assert("nikita-1252", item != NULL);
51372 + assert("nikita-1253", item->node != NULL);
51373 + assert("nikita-1181", znode_get_level(item->node) > LEAF_LEVEL);
51374 + assert("nikita-1450", item->unit_pos == 0);
51375 +
51376 + /*
51377 +	 * While preparing for item insertion, build_child_ptr_data() points the
51378 +	 * data to be inserted at the jnode's block number, which is in CPU byte
51379 +	 * order. The node's create_item() simply copies those data. As a result we
51380 +	 * have the child pointer in CPU byte order. Convert the content of the
51381 +	 * internal item to little-endian byte order.
51382 + */
51383 + child_ptr = get_unaligned((__u64 *)item_body_by_coord(item));
51384 + update_internal(item, &child_ptr);
51385 +
51386 + child = znode_at(item, item->node);
51387 + if (child != NULL && !IS_ERR(child)) {
51388 + znode *left;
51389 + int result = 0;
51390 + reiser4_tree *tree;
51391 +
51392 + left = arg;
51393 + tree = znode_get_tree(item->node);
51394 + write_lock_tree(tree);
51395 + write_lock_dk(tree);
51396 + assert("nikita-1400", (child->in_parent.node == NULL)
51397 + || (znode_above_root(child->in_parent.node)));
51398 + ++item->node->c_count;
51399 + coord_to_parent_coord(item, &child->in_parent);
51400 + sibling_list_insert_nolock(child, left);
51401 +
51402 + assert("nikita-3297", ZF_ISSET(child, JNODE_ORPHAN));
51403 + ZF_CLR(child, JNODE_ORPHAN);
51404 +
51405 + if ((left != NULL) && !keyeq(znode_get_rd_key(left),
51406 + znode_get_rd_key(child))) {
51407 + znode_set_rd_key(child, znode_get_rd_key(left));
51408 + }
51409 + write_unlock_dk(tree);
51410 + write_unlock_tree(tree);
51411 + zput(child);
51412 + return result;
51413 + } else {
51414 + if (child == NULL)
51415 + child = ERR_PTR(-EIO);
51416 + return PTR_ERR(child);
51417 + }
51418 +}
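The byte-order conversion above is easy to get wrong, so here is a minimal standalone sketch of the same idea: a block number produced in CPU byte order is stored in little-endian form via explicit per-byte shifts, making the layout independent of the host. The helper names store_le64/load_le64 are illustrative, not the kernel's cpu_to_le64/put_unaligned API.

#include <stdint.h>
#include <stdio.h>

/* Store a 64-bit block number in little-endian byte order, one byte at a
   time, so the result does not depend on the host CPU's endianness. */
static void store_le64(uint8_t *dst, uint64_t v)
{
	int i;

	for (i = 0; i < 8; i++)
		dst[i] = (uint8_t)(v >> (8 * i));
}

/* Inverse operation: rebuild the CPU-order value from the stored bytes. */
static uint64_t load_le64(const uint8_t *src)
{
	uint64_t v = 0;
	int i;

	for (i = 0; i < 8; i++)
		v |= (uint64_t)src[i] << (8 * i);
	return v;
}

int main(void)
{
	uint8_t item_body[8];	/* stands in for the internal item layout */
	uint64_t child_blocknr = 0x0102030405060708ULL;

	store_le64(item_body, child_blocknr);
	printf("round trip ok: %d\n", load_le64(item_body) == child_blocknr);
	return 0;
}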
51419 +
51420 +/* hook called by the ->cut_and_kill() method of the node plugin just before
51421 +   an internal item is removed.
51422 +
51423 +   This is the point where an empty node is removed from the tree. Clear the
51424 +   parent pointer in the child, and mark the node for pending deletion.
51425 +
51426 +   The node will actually be deleted later, in several stages:
51427 +
51428 +   . when the last lock on this node is released, the node is removed from
51429 +   the sibling list and its lock is invalidated;
51430 +
51431 +   . when the last reference to this node is dropped, the bitmap is updated
51432 +   and the node is actually removed from memory.
51433 +
51434 +   (A toy model of this staged teardown follows the function below.)
51435 +*/
51436 +int kill_hook_internal(const coord_t * item /* coord of item */ ,
51437 + pos_in_node_t from UNUSED_ARG /* start unit */ ,
51438 + pos_in_node_t count UNUSED_ARG /* stop unit */ ,
51439 + struct carry_kill_data *p UNUSED_ARG)
51440 +{
51441 + znode *child;
51442 +
51443 + assert("nikita-1222", item != NULL);
51444 + assert("nikita-1224", from == 0);
51445 + assert("nikita-1225", count == 1);
51446 +
51447 + child = znode_at(item, item->node);
51448 + if (IS_ERR(child))
51449 + return PTR_ERR(child);
51450 + else if (node_is_empty(child)) {
51451 + reiser4_tree *tree;
51452 +
51453 + assert("nikita-1397", znode_is_write_locked(child));
51454 + assert("nikita-1398", child->c_count == 0);
51455 + assert("nikita-2546", ZF_ISSET(child, JNODE_HEARD_BANSHEE));
51456 +
51457 + tree = znode_get_tree(item->node);
51458 + write_lock_tree(tree);
51459 + init_parent_coord(&child->in_parent, NULL);
51460 + --item->node->c_count;
51461 + write_unlock_tree(tree);
51462 + zput(child);
51463 + return 0;
51464 + } else {
51465 + warning("nikita-1223",
51466 + "Cowardly refuse to remove link to non-empty node");
51467 + zput(child);
51468 + return RETERR(-EIO);
51469 + }
51470 +}
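The comment above describes deletion in stages; the following toy model, with invented names (toy_node, unlock_node, put_node) and none of the real locking, shows the shape of that protocol: a node marked dead is unlinked from its sibling list when its last lock goes away, and its memory is released only when the last reference is dropped.

#include <stdio.h>
#include <stdlib.h>

struct toy_node {
	struct toy_node *left, *right;
	int refs;		/* references pinning the memory */
	int locks;		/* lock-like users of the node */
	int dead;		/* marked for pending deletion */
};

static void unlink_from_siblings(struct toy_node *n)
{
	if (n->left)
		n->left->right = n->right;
	if (n->right)
		n->right->left = n->left;
	n->left = n->right = NULL;
}

static void unlock_node(struct toy_node *n)
{
	if (--n->locks == 0 && n->dead)
		unlink_from_siblings(n);	/* stage 1 */
}

static void put_node(struct toy_node *n)
{
	if (--n->refs == 0)
		free(n);	/* stage 2: a real FS would also update the bitmap */
}

int main(void)
{
	struct toy_node *n = calloc(1, sizeof(*n));

	n->refs = 1;
	n->locks = 1;
	n->dead = 1;		/* kill hook ran: pending deletion */
	unlock_node(n);		/* last lock gone: unlinked from siblings */
	put_node(n);		/* last reference gone: memory released */
	puts("staged teardown done");
	return 0;
}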
51471 +
51472 +/* hook called by the ->shift() node plugin method when an internal item has
51473 +   just been moved from one node to another.
51474 +
51475 +   Update the parent pointer in the child and the c_counts in the old and
51476 +   new parents.
51477 +*/
51478 +int shift_hook_internal(const coord_t * item /* coord of item */ ,
51479 + unsigned from UNUSED_ARG /* start unit */ ,
51480 + unsigned count UNUSED_ARG /* stop unit */ ,
51481 + znode * old_node /* old parent */ )
51482 +{
51483 + znode *child;
51484 + znode *new_node;
51485 + reiser4_tree *tree;
51486 +
51487 + assert("nikita-1276", item != NULL);
51488 + assert("nikita-1277", from == 0);
51489 + assert("nikita-1278", count == 1);
51490 + assert("nikita-1451", item->unit_pos == 0);
51491 +
51492 + new_node = item->node;
51493 + assert("nikita-2132", new_node != old_node);
51494 + tree = znode_get_tree(item->node);
51495 + child = child_znode(item, old_node, 1, 0);
51496 + if (child == NULL)
51497 + return 0;
51498 + if (!IS_ERR(child)) {
51499 + write_lock_tree(tree);
51500 + ++new_node->c_count;
51501 + assert("nikita-1395", znode_parent(child) == old_node);
51502 + assert("nikita-1396", old_node->c_count > 0);
51503 + coord_to_parent_coord(item, &child->in_parent);
51504 + assert("nikita-1781", znode_parent(child) == new_node);
51505 + assert("nikita-1782",
51506 + check_tree_pointer(item, child) == NS_FOUND);
51507 + --old_node->c_count;
51508 + write_unlock_tree(tree);
51509 + zput(child);
51510 + return 0;
51511 + } else
51512 + return PTR_ERR(child);
51513 +}
51514 +
51515 +/* plugin->u.item.b.max_key_inside - not defined */
51516 +
51517 +/* plugin->u.item.b.nr_units - item.c:single_unit */
51518 +
51519 +/* Make Linus happy.
51520 + Local variables:
51521 + c-indentation-style: "K&R"
51522 + mode-name: "LC"
51523 + c-basic-offset: 8
51524 + tab-width: 8
51525 + fill-column: 120
51526 + End:
51527 +*/
51528 Index: linux-2.6.16/fs/reiser4/plugin/item/internal.h
51529 ===================================================================
51530 --- /dev/null
51531 +++ linux-2.6.16/fs/reiser4/plugin/item/internal.h
51532 @@ -0,0 +1,57 @@
51533 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
51534 +/* An internal item contains a down-link to the child of an internal/twig
51535 +   node in the tree. It is internal items that are actually used during
51536 +   tree traversal. */
51537 +
51538 +#if !defined( __FS_REISER4_PLUGIN_ITEM_INTERNAL_H__ )
51539 +#define __FS_REISER4_PLUGIN_ITEM_INTERNAL_H__
51540 +
51541 +#include "../../forward.h"
51542 +#include "../../dformat.h"
51543 +
51544 +/* on-disk layout of internal item */
51545 +typedef struct internal_item_layout {
51546 + /* 0 */ reiser4_dblock_nr pointer;
51547 + /* 4 */
51548 +} internal_item_layout;
51549 +
51550 +struct cut_list;
51551 +
51552 +int mergeable_internal(const coord_t * p1, const coord_t * p2);
51553 +lookup_result lookup_internal(const reiser4_key * key, lookup_bias bias,
51554 + coord_t * coord);
51555 +/* store pointer from internal item into "block". Implementation of
51556 + ->down_link() method */
51557 +extern void down_link_internal(const coord_t * coord, const reiser4_key * key,
51558 + reiser4_block_nr * block);
51559 +extern int has_pointer_to_internal(const coord_t * coord,
51560 + const reiser4_block_nr * block);
51561 +extern int create_hook_internal(const coord_t * item, void *arg);
51562 +extern int kill_hook_internal(const coord_t * item, pos_in_node_t from,
51563 + pos_in_node_t count, struct carry_kill_data *);
51564 +extern int shift_hook_internal(const coord_t * item, unsigned from,
51565 + unsigned count, znode * old_node);
51566 +extern void print_internal(const char *prefix, coord_t * coord);
51567 +
51568 +extern int utmost_child_internal(const coord_t * coord, sideof side,
51569 + jnode ** child);
51570 +int utmost_child_real_block_internal(const coord_t * coord, sideof side,
51571 + reiser4_block_nr * block);
51572 +
51573 +extern void update_internal(const coord_t * coord,
51574 + const reiser4_block_nr * blocknr);
51575 +/* FIXME: reiserfs has check_internal */
51576 +extern int check__internal(const coord_t * coord, const char **error);
51577 +
51578 +/* __FS_REISER4_PLUGIN_ITEM_INTERNAL_H__ */
51579 +#endif
51580 +
51581 +/* Make Linus happy.
51582 + Local variables:
51583 + c-indentation-style: "K&R"
51584 + mode-name: "LC"
51585 + c-basic-offset: 8
51586 + tab-width: 8
51587 + fill-column: 120
51588 + End:
51589 +*/
51590 Index: linux-2.6.16/fs/reiser4/plugin/item/item.c
51591 ===================================================================
51592 --- /dev/null
51593 +++ linux-2.6.16/fs/reiser4/plugin/item/item.c
51594 @@ -0,0 +1,727 @@
51595 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
51596 +
51597 +/* definition of item plugins. */
51598 +
51599 +#include "../../forward.h"
51600 +#include "../../debug.h"
51601 +#include "../../key.h"
51602 +#include "../../coord.h"
51603 +#include "../plugin_header.h"
51604 +#include "sde.h"
51605 +#include "internal.h"
51606 +#include "item.h"
51607 +#include "static_stat.h"
51608 +#include "../plugin.h"
51609 +#include "../../znode.h"
51610 +#include "../../tree.h"
51611 +#include "../../context.h"
51612 +#include "ctail.h"
51613 +
51614 +/* locate the item body: compute and cache its offset within the node */
51615 +void item_body_by_coord_hard(coord_t * coord /* coord to query */ )
51616 +{
51617 + assert("nikita-324", coord != NULL);
51618 + assert("nikita-325", coord->node != NULL);
51619 + assert("nikita-326", znode_is_loaded(coord->node));
51620 + assert("nikita-3200", coord->offset == INVALID_OFFSET);
51621 +
51622 + coord->offset =
51623 + node_plugin_by_node(coord->node)->item_by_coord(coord) -
51624 + zdata(coord->node);
51625 + ON_DEBUG(coord->body_v = coord->node->times_locked);
51626 +}
51627 +
51628 +void *item_body_by_coord_easy(const coord_t * coord /* coord to query */ )
51629 +{
51630 + return zdata(coord->node) + coord->offset;
51631 +}
51632 +
51633 +#if REISER4_DEBUG
51634 +
51635 +int item_body_is_valid(const coord_t * coord)
51636 +{
51637 + return
51638 + coord->offset ==
51639 + node_plugin_by_node(coord->node)->item_by_coord(coord) -
51640 + zdata(coord->node);
51641 +}
51642 +
51643 +#endif
51644 +
51645 +/* return length of item at @coord */
51646 +pos_in_node_t item_length_by_coord(const coord_t * coord /* coord to query */ )
51647 +{
51648 + int len;
51649 +
51650 + assert("nikita-327", coord != NULL);
51651 + assert("nikita-328", coord->node != NULL);
51652 + assert("nikita-329", znode_is_loaded(coord->node));
51653 +
51654 + len = node_plugin_by_node(coord->node)->length_by_coord(coord);
51655 + return len;
51656 +}
51657 +
51658 +void obtain_item_plugin(const coord_t * coord)
51659 +{
51660 + assert("nikita-330", coord != NULL);
51661 + assert("nikita-331", coord->node != NULL);
51662 + assert("nikita-332", znode_is_loaded(coord->node));
51663 +
51664 + coord_set_iplug((coord_t *) coord,
51665 + node_plugin_by_node(coord->node)->
51666 + plugin_by_coord(coord));
51667 + assert("nikita-2479",
51668 + coord_iplug(coord) ==
51669 + node_plugin_by_node(coord->node)->plugin_by_coord(coord));
51670 +}
51671 +
51672 +/* return type of item at @coord */
51673 +item_type_id item_type_by_coord(const coord_t * coord /* coord to query */ )
51674 +{
51675 + assert("nikita-333", coord != NULL);
51676 + assert("nikita-334", coord->node != NULL);
51677 + assert("nikita-335", znode_is_loaded(coord->node));
51678 + assert("nikita-336", item_plugin_by_coord(coord) != NULL);
51679 +
51680 + return item_plugin_by_coord(coord)->b.item_type;
51681 +}
51682 +
51683 +/* return id of item */
51684 +/* Audited by: green(2002.06.15) */
51685 +item_id item_id_by_coord(const coord_t * coord /* coord to query */ )
51686 +{
51687 + assert("vs-539", coord != NULL);
51688 + assert("vs-538", coord->node != NULL);
51689 + assert("vs-537", znode_is_loaded(coord->node));
51690 + assert("vs-536", item_plugin_by_coord(coord) != NULL);
51691 + assert("vs-540",
51692 + item_id_by_plugin(item_plugin_by_coord(coord)) < LAST_ITEM_ID);
51693 +
51694 + return item_id_by_plugin(item_plugin_by_coord(coord));
51695 +}
51696 +
51697 +/* return key of item at @coord */
51698 +/* Audited by: green(2002.06.15) */
51699 +reiser4_key *item_key_by_coord(const coord_t * coord /* coord to query */ ,
51700 + reiser4_key * key /* result */ )
51701 +{
51702 + assert("nikita-338", coord != NULL);
51703 + assert("nikita-339", coord->node != NULL);
51704 + assert("nikita-340", znode_is_loaded(coord->node));
51705 +
51706 + return node_plugin_by_node(coord->node)->key_at(coord, key);
51707 +}
51708 +
51709 +/* this returns max key in the item */
51710 +reiser4_key *max_item_key_by_coord(const coord_t * coord /* coord to query */ ,
51711 + reiser4_key * key /* result */ )
51712 +{
51713 + coord_t last;
51714 +
51715 + assert("nikita-338", coord != NULL);
51716 + assert("nikita-339", coord->node != NULL);
51717 + assert("nikita-340", znode_is_loaded(coord->node));
51718 +
51719 + /* make coord point to the last unit of the item */
51720 + coord_dup(&last, coord);
51721 + last.unit_pos = coord_num_units(&last) - 1;
51722 + assert("vs-1560", coord_is_existing_unit(&last));
51723 +
51724 + max_unit_key_by_coord(&last, key);
51725 + return key;
51726 +}
51727 +
51728 +/* return key of unit at @coord */
51729 +reiser4_key *unit_key_by_coord(const coord_t * coord /* coord to query */ ,
51730 + reiser4_key * key /* result */ )
51731 +{
51732 + assert("nikita-772", coord != NULL);
51733 + assert("nikita-774", coord->node != NULL);
51734 + assert("nikita-775", znode_is_loaded(coord->node));
51735 +
51736 + if (item_plugin_by_coord(coord)->b.unit_key != NULL)
51737 + return item_plugin_by_coord(coord)->b.unit_key(coord, key);
51738 + else
51739 + return item_key_by_coord(coord, key);
51740 +}
51741 +
51742 +/* return the biggest key contained in the unit at @coord */
51743 +reiser4_key *max_unit_key_by_coord(const coord_t * coord /* coord to query */ ,
51744 + reiser4_key * key /* result */ )
51745 +{
51746 + assert("nikita-772", coord != NULL);
51747 + assert("nikita-774", coord->node != NULL);
51748 + assert("nikita-775", znode_is_loaded(coord->node));
51749 +
51750 + if (item_plugin_by_coord(coord)->b.max_unit_key != NULL)
51751 + return item_plugin_by_coord(coord)->b.max_unit_key(coord, key);
51752 + else
51753 + return unit_key_by_coord(coord, key);
51754 +}
51755 +
51756 +/* ->max_key_inside() method for items consisting of exactly one key (like
51757 + stat-data) */
51758 +static reiser4_key *max_key_inside_single_key(const coord_t *
51759 + coord /* coord of item */ ,
51760 + reiser4_key *
51761 + result /* resulting key */ )
51762 +{
51763 + assert("nikita-604", coord != NULL);
51764 +
51765 + /* the key at @coord is the starting key of this item and it has to be
51766 + already filled in */
51767 + return unit_key_by_coord(coord, result);
51768 +}
51769 +
51770 +/* ->nr_units() method for items always consisting of exactly one unit */
51771 +static pos_in_node_t
51772 +nr_units_single_unit(const coord_t * coord UNUSED_ARG /* coord of item */ )
51773 +{
51774 + return 1;
51775 +}
51776 +
51777 +static int
51778 +paste_no_paste(coord_t * coord UNUSED_ARG,
51779 + reiser4_item_data * data UNUSED_ARG,
51780 + carry_plugin_info * info UNUSED_ARG)
51781 +{
51782 + return 0;
51783 +}
51784 +
51785 +/* default ->fast_paste() method */
51786 +static int
51787 +agree_to_fast_op(const coord_t * coord UNUSED_ARG /* coord of item */ )
51788 +{
51789 + return 1;
51790 +}
51791 +
51792 +int item_can_contain_key(const coord_t * item /* coord of item */ ,
51793 + const reiser4_key * key /* key to check */ ,
51794 + const reiser4_item_data * data /* parameters of item
51795 + * being created */ )
51796 +{
51797 + item_plugin *iplug;
51798 + reiser4_key min_key_in_item;
51799 + reiser4_key max_key_in_item;
51800 +
51801 + assert("nikita-1658", item != NULL);
51802 + assert("nikita-1659", key != NULL);
51803 +
51804 + iplug = item_plugin_by_coord(item);
51805 + if (iplug->b.can_contain_key != NULL)
51806 + return iplug->b.can_contain_key(item, key, data);
51807 + else {
51808 + assert("nikita-1681", iplug->b.max_key_inside != NULL);
51809 + item_key_by_coord(item, &min_key_in_item);
51810 + iplug->b.max_key_inside(item, &max_key_in_item);
51811 +
51812 + /* can contain key if
51813 + min_key_in_item <= key &&
51814 + key <= max_key_in_item
51815 + */
51816 + return keyle(&min_key_in_item, key)
51817 + && keyle(key, &max_key_in_item);
51818 + }
51819 +}
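A sketch of the containment test used in the fallback branch above, with a simplified three-component key (real reiser4 keys carry more fields): key_cmp orders keys lexicographically, and can_contain is the min <= key <= max check. The names toy_key/can_contain are invented for illustration.

#include <stdint.h>
#include <stdio.h>

struct toy_key {
	uint64_t locality;
	uint64_t objectid;
	uint64_t offset;
};

/* lexicographic comparison, highest-order component first */
static int key_cmp(const struct toy_key *a, const struct toy_key *b)
{
	if (a->locality != b->locality)
		return a->locality < b->locality ? -1 : 1;
	if (a->objectid != b->objectid)
		return a->objectid < b->objectid ? -1 : 1;
	if (a->offset != b->offset)
		return a->offset < b->offset ? -1 : 1;
	return 0;
}

static int keyle(const struct toy_key *a, const struct toy_key *b)
{
	return key_cmp(a, b) <= 0;
}

static int can_contain(const struct toy_key *min, const struct toy_key *max,
		       const struct toy_key *key)
{
	return keyle(min, key) && keyle(key, max);
}

int main(void)
{
	struct toy_key min = {40, 7, 0}, max = {40, 7, ~0ULL};
	struct toy_key key = {40, 7, 4096};

	printf("contained: %d\n", can_contain(&min, &max, &key));
	return 0;
}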
51820 +
51821 +/* ->mergeable() method for non-mergeable items */
51822 +static int
51823 +not_mergeable(const coord_t * i1 UNUSED_ARG, const coord_t * i2 UNUSED_ARG)
51824 +{
51825 + return 0;
51826 +}
51827 +
51828 +/* return 0 if @i1 and @i2 are not mergeable, !0 otherwise */
51829 +int are_items_mergeable(const coord_t * i1 /* coord of first item */ ,
51830 + const coord_t * i2 /* coord of second item */ )
51831 +{
51832 + item_plugin *iplug;
51833 + reiser4_key k1;
51834 + reiser4_key k2;
51835 +
51836 + assert("nikita-1336", i1 != NULL);
51837 + assert("nikita-1337", i2 != NULL);
51838 +
51839 + iplug = item_plugin_by_coord(i1);
51840 + assert("nikita-1338", iplug != NULL);
51841 +
51842 + /* NOTE-NIKITA are_items_mergeable() is also called by assertions in
51843 + shifting code when nodes are in "suspended" state. */
51844 + assert("nikita-1663",
51845 + keyle(item_key_by_coord(i1, &k1), item_key_by_coord(i2, &k2)));
51846 +
51847 + if (iplug->b.mergeable != NULL) {
51848 + return iplug->b.mergeable(i1, i2);
51849 + } else if (iplug->b.max_key_inside != NULL) {
51850 + iplug->b.max_key_inside(i1, &k1);
51851 + item_key_by_coord(i2, &k2);
51852 +
51853 + /* mergeable if ->max_key_inside() >= key of i2; */
51854 + return keyge(iplug->b.max_key_inside(i1, &k1),
51855 + item_key_by_coord(i2, &k2));
51856 + } else {
51857 + item_key_by_coord(i1, &k1);
51858 + item_key_by_coord(i2, &k2);
51859 +
51860 + return
51861 + (get_key_locality(&k1) == get_key_locality(&k2)) &&
51862 + (get_key_objectid(&k1) == get_key_objectid(&k2))
51863 + && (iplug == item_plugin_by_coord(i2));
51864 + }
51865 +}
51866 +
51867 +int item_is_extent(const coord_t * item)
51868 +{
51869 + assert("vs-482", coord_is_existing_item(item));
51870 + return item_id_by_coord(item) == EXTENT_POINTER_ID;
51871 +}
51872 +
51873 +int item_is_tail(const coord_t * item)
51874 +{
51875 + assert("vs-482", coord_is_existing_item(item));
51876 + return item_id_by_coord(item) == FORMATTING_ID;
51877 +}
51878 +
51879 +int item_is_statdata(const coord_t * item)
51880 +{
51881 + assert("vs-516", coord_is_existing_item(item));
51882 + return item_type_by_coord(item) == STAT_DATA_ITEM_TYPE;
51883 +}
51884 +
51885 +int item_is_ctail(const coord_t * item)
51886 +{
51887 + assert("edward-xx", coord_is_existing_item(item));
51888 + return item_id_by_coord(item) == CTAIL_ID;
51889 +}
51890 +
51891 +static int change_item(struct inode *inode, reiser4_plugin * plugin)
51892 +{
51893 + /* cannot change constituent item (sd, or dir_item) */
51894 + return RETERR(-EINVAL);
51895 +}
51896 +
51897 +static reiser4_plugin_ops item_plugin_ops = {
51898 + .init = NULL,
51899 + .load = NULL,
51900 + .save_len = NULL,
51901 + .save = NULL,
51902 + .change = change_item
51903 +};
51904 +
51905 +item_plugin item_plugins[LAST_ITEM_ID] = {
51906 + [STATIC_STAT_DATA_ID] = {
51907 + .h = {
51908 + .type_id = REISER4_ITEM_PLUGIN_TYPE,
51909 + .id = STATIC_STAT_DATA_ID,
51910 + .pops = &item_plugin_ops,
51911 + .label = "sd",
51912 + .desc = "stat-data",
51913 + .linkage = {NULL, NULL}
51914 + },
51915 + .b = {
51916 + .item_type = STAT_DATA_ITEM_TYPE,
51917 + .max_key_inside = max_key_inside_single_key,
51918 + .can_contain_key = NULL,
51919 + .mergeable = not_mergeable,
51920 + .nr_units = nr_units_single_unit,
51921 + .lookup = NULL,
51922 + .init = NULL,
51923 + .paste = paste_no_paste,
51924 + .fast_paste = NULL,
51925 + .can_shift = NULL,
51926 + .copy_units = NULL,
51927 + .create_hook = NULL,
51928 + .kill_hook = NULL,
51929 + .shift_hook = NULL,
51930 + .cut_units = NULL,
51931 + .kill_units = NULL,
51932 + .unit_key = NULL,
51933 + .max_unit_key = NULL,
51934 + .estimate = NULL,
51935 + .item_data_by_flow = NULL,
51936 +#if REISER4_DEBUG
51937 + .check = NULL
51938 +#endif
51939 + },
51940 + .f = {
51941 + .utmost_child = NULL,
51942 + .utmost_child_real_block = NULL,
51943 + .update = NULL,
51944 + .scan = NULL,
51945 + .convert = NULL
51946 + },
51947 + .s = {
51948 + .sd = {
51949 + .init_inode = init_inode_static_sd,
51950 + .save_len = save_len_static_sd,
51951 + .save = save_static_sd
51952 + }
51953 + }
51954 + },
51955 + [SIMPLE_DIR_ENTRY_ID] = {
51956 + .h = {
51957 + .type_id = REISER4_ITEM_PLUGIN_TYPE,
51958 + .id = SIMPLE_DIR_ENTRY_ID,
51959 + .pops = &item_plugin_ops,
51960 + .label = "de",
51961 + .desc = "directory entry",
51962 + .linkage = {NULL, NULL}
51963 + },
51964 + .b = {
51965 + .item_type = DIR_ENTRY_ITEM_TYPE,
51966 + .max_key_inside = max_key_inside_single_key,
51967 + .can_contain_key = NULL,
51968 + .mergeable = NULL,
51969 + .nr_units = nr_units_single_unit,
51970 + .lookup = NULL,
51971 + .init = NULL,
51972 + .paste = NULL,
51973 + .fast_paste = NULL,
51974 + .can_shift = NULL,
51975 + .copy_units = NULL,
51976 + .create_hook = NULL,
51977 + .kill_hook = NULL,
51978 + .shift_hook = NULL,
51979 + .cut_units = NULL,
51980 + .kill_units = NULL,
51981 + .unit_key = NULL,
51982 + .max_unit_key = NULL,
51983 + .estimate = NULL,
51984 + .item_data_by_flow = NULL,
51985 +#if REISER4_DEBUG
51986 + .check = NULL
51987 +#endif
51988 + },
51989 + .f = {
51990 + .utmost_child = NULL,
51991 + .utmost_child_real_block = NULL,
51992 + .update = NULL,
51993 + .scan = NULL,
51994 + .convert = NULL
51995 + },
51996 + .s = {
51997 + .dir = {
51998 + .extract_key = extract_key_de,
51999 + .update_key = update_key_de,
52000 + .extract_name = extract_name_de,
52001 + .extract_file_type = extract_file_type_de,
52002 + .add_entry = add_entry_de,
52003 + .rem_entry = rem_entry_de,
52004 + .max_name_len = max_name_len_de
52005 + }
52006 + }
52007 + },
52008 + [COMPOUND_DIR_ID] = {
52009 + .h = {
52010 + .type_id = REISER4_ITEM_PLUGIN_TYPE,
52011 + .id = COMPOUND_DIR_ID,
52012 + .pops = &item_plugin_ops,
52013 + .label = "cde",
52014 + .desc = "compressed directory entry",
52015 + .linkage = {NULL, NULL}
52016 + },
52017 + .b = {
52018 + .item_type = DIR_ENTRY_ITEM_TYPE,
52019 + .max_key_inside = max_key_inside_cde,
52020 + .can_contain_key = can_contain_key_cde,
52021 + .mergeable = mergeable_cde,
52022 + .nr_units = nr_units_cde,
52023 + .lookup = lookup_cde,
52024 + .init = init_cde,
52025 + .paste = paste_cde,
52026 + .fast_paste = agree_to_fast_op,
52027 + .can_shift = can_shift_cde,
52028 + .copy_units = copy_units_cde,
52029 + .create_hook = NULL,
52030 + .kill_hook = NULL,
52031 + .shift_hook = NULL,
52032 + .cut_units = cut_units_cde,
52033 + .kill_units = kill_units_cde,
52034 + .unit_key = unit_key_cde,
52035 + .max_unit_key = unit_key_cde,
52036 + .estimate = estimate_cde,
52037 + .item_data_by_flow = NULL,
52038 +#if REISER4_DEBUG
52039 + .check = check_cde
52040 +#endif
52041 + },
52042 + .f = {
52043 + .utmost_child = NULL,
52044 + .utmost_child_real_block = NULL,
52045 + .update = NULL,
52046 + .scan = NULL,
52047 + .convert = NULL
52048 + },
52049 + .s = {
52050 + .dir = {
52051 + .extract_key = extract_key_cde,
52052 + .update_key = update_key_cde,
52053 + .extract_name = extract_name_cde,
52054 + .extract_file_type = extract_file_type_de,
52055 + .add_entry = add_entry_cde,
52056 + .rem_entry = rem_entry_cde,
52057 + .max_name_len = max_name_len_cde
52058 + }
52059 + }
52060 + },
52061 + [NODE_POINTER_ID] = {
52062 + .h = {
52063 + .type_id = REISER4_ITEM_PLUGIN_TYPE,
52064 + .id = NODE_POINTER_ID,
52065 + .pops = NULL,
52066 + .label = "internal",
52067 + .desc = "internal item",
52068 + .linkage = {NULL, NULL}
52069 + },
52070 + .b = {
52071 + .item_type = INTERNAL_ITEM_TYPE,
52072 + .max_key_inside = NULL,
52073 + .can_contain_key = NULL,
52074 + .mergeable = mergeable_internal,
52075 + .nr_units = nr_units_single_unit,
52076 + .lookup = lookup_internal,
52077 + .init = NULL,
52078 + .paste = NULL,
52079 + .fast_paste = NULL,
52080 + .can_shift = NULL,
52081 + .copy_units = NULL,
52082 + .create_hook = create_hook_internal,
52083 + .kill_hook = kill_hook_internal,
52084 + .shift_hook = shift_hook_internal,
52085 + .cut_units = NULL,
52086 + .kill_units = NULL,
52087 + .unit_key = NULL,
52088 + .max_unit_key = NULL,
52089 + .estimate = NULL,
52090 + .item_data_by_flow = NULL,
52091 +#if REISER4_DEBUG
52092 + .check = check__internal
52093 +#endif
52094 + },
52095 + .f = {
52096 + .utmost_child = utmost_child_internal,
52097 + .utmost_child_real_block =
52098 + utmost_child_real_block_internal,
52099 + .update = update_internal,
52100 + .scan = NULL,
52101 + .convert = NULL
52102 + },
52103 + .s = {
52104 + .internal = {
52105 + .down_link = down_link_internal,
52106 + .has_pointer_to = has_pointer_to_internal
52107 + }
52108 + }
52109 + },
52110 + [EXTENT_POINTER_ID] = {
52111 + .h = {
52112 + .type_id = REISER4_ITEM_PLUGIN_TYPE,
52113 + .id = EXTENT_POINTER_ID,
52114 + .pops = NULL,
52115 + .label = "extent",
52116 + .desc = "extent item",
52117 + .linkage = {NULL, NULL}
52118 + },
52119 + .b = {
52120 + .item_type = UNIX_FILE_METADATA_ITEM_TYPE,
52121 + .max_key_inside = max_key_inside_extent,
52122 + .can_contain_key = can_contain_key_extent,
52123 + .mergeable = mergeable_extent,
52124 + .nr_units = nr_units_extent,
52125 + .lookup = lookup_extent,
52126 + .init = NULL,
52127 + .paste = paste_extent,
52128 + .fast_paste = agree_to_fast_op,
52129 + .can_shift = can_shift_extent,
52130 + .create_hook = create_hook_extent,
52131 + .copy_units = copy_units_extent,
52132 + .kill_hook = kill_hook_extent,
52133 + .shift_hook = NULL,
52134 + .cut_units = cut_units_extent,
52135 + .kill_units = kill_units_extent,
52136 + .unit_key = unit_key_extent,
52137 + .max_unit_key = max_unit_key_extent,
52138 + .estimate = NULL,
52139 + .item_data_by_flow = NULL,
52140 +#if REISER4_DEBUG
52141 + .check = check_extent
52142 +#endif
52143 + },
52144 + .f = {
52145 + .utmost_child = utmost_child_extent,
52146 + .utmost_child_real_block =
52147 + utmost_child_real_block_extent,
52148 + .update = NULL,
52149 + .scan = scan_extent,
52150 + .convert = NULL,
52151 + .key_by_offset = key_by_offset_extent
52152 + },
52153 + .s = {
52154 + .file = {
52155 + .write = write_extent,
52156 + .read = read_extent,
52157 + .readpage = readpage_extent,
52158 + .get_block = get_block_address_extent,
52159 + .readpages = readpages_extent,
52160 + .append_key = append_key_extent,
52161 + .init_coord_extension =
52162 + init_coord_extension_extent
52163 + }
52164 + }
52165 + },
52166 + [FORMATTING_ID] = {
52167 + .h = {
52168 + .type_id = REISER4_ITEM_PLUGIN_TYPE,
52169 + .id = FORMATTING_ID,
52170 + .pops = NULL,
52171 + .label = "body",
52172 + .desc = "body (or tail?) item",
52173 + .linkage = {NULL, NULL}
52174 + },
52175 + .b = {
52176 + .item_type = UNIX_FILE_METADATA_ITEM_TYPE,
52177 + .max_key_inside = max_key_inside_tail,
52178 + .can_contain_key = can_contain_key_tail,
52179 + .mergeable = mergeable_tail,
52180 + .nr_units = nr_units_tail,
52181 + .lookup = lookup_tail,
52182 + .init = NULL,
52183 + .paste = paste_tail,
52184 + .fast_paste = agree_to_fast_op,
52185 + .can_shift = can_shift_tail,
52186 + .create_hook = NULL,
52187 + .copy_units = copy_units_tail,
52188 + .kill_hook = kill_hook_tail,
52189 + .shift_hook = NULL,
52190 + .cut_units = cut_units_tail,
52191 + .kill_units = kill_units_tail,
52192 + .unit_key = unit_key_tail,
52193 + .max_unit_key = unit_key_tail,
52194 + .estimate = NULL,
52195 + .item_data_by_flow = NULL,
52196 +#if REISER4_DEBUG
52197 + .check = NULL
52198 +#endif
52199 + },
52200 + .f = {
52201 + .utmost_child = NULL,
52202 + .utmost_child_real_block = NULL,
52203 + .update = NULL,
52204 + .scan = NULL,
52205 + .convert = NULL
52206 + },
52207 + .s = {
52208 + .file = {
52209 + .write = write_tail,
52210 + .read = read_tail,
52211 + .readpage = readpage_tail,
52212 + .get_block = NULL,
52213 + .readpages = NULL,
52214 + .append_key = append_key_tail,
52215 + .init_coord_extension =
52216 + init_coord_extension_tail
52217 + }
52218 + }
52219 + },
52220 + [CTAIL_ID] = {
52221 + .h = {
52222 + .type_id = REISER4_ITEM_PLUGIN_TYPE,
52223 + .id = CTAIL_ID,
52224 + .pops = NULL,
52225 + .label = "ctail",
52226 + .desc = "cryptcompress tail item",
52227 + .linkage = {NULL, NULL}
52228 + },
52229 + .b = {
52230 + .item_type = UNIX_FILE_METADATA_ITEM_TYPE,
52231 + .max_key_inside = max_key_inside_tail,
52232 + .can_contain_key = can_contain_key_ctail,
52233 + .mergeable = mergeable_ctail,
52234 + .nr_units = nr_units_ctail,
52235 + .lookup = NULL,
52236 + .init = init_ctail,
52237 + .paste = paste_ctail,
52238 + .fast_paste = agree_to_fast_op,
52239 + .can_shift = can_shift_ctail,
52240 + .create_hook = create_hook_ctail,
52241 + .copy_units = copy_units_ctail,
52242 + .kill_hook = kill_hook_ctail,
52243 + .shift_hook = shift_hook_ctail,
52244 + .cut_units = cut_units_ctail,
52245 + .kill_units = kill_units_ctail,
52246 + .unit_key = unit_key_tail,
52247 + .max_unit_key = unit_key_tail,
52248 + .estimate = estimate_ctail,
52249 + .item_data_by_flow = NULL,
52250 +#if REISER4_DEBUG
52251 + .check = check_ctail
52252 +#endif
52253 + },
52254 + .f = {
52255 + .utmost_child = utmost_child_ctail,
52256 + /* FIXME-EDWARD: write this */
52257 + .utmost_child_real_block = NULL,
52258 + .update = NULL,
52259 + .scan = scan_ctail,
52260 + .convert = convert_ctail
52261 + },
52262 + .s = {
52263 + .file = {
52264 + .write = NULL,
52265 + .read = read_ctail,
52266 + .readpage = readpage_ctail,
52267 + .get_block = get_block_address_tail,
52268 + .readpages = readpages_ctail,
52269 + .append_key = append_key_ctail,
52270 + .init_coord_extension =
52271 + init_coord_extension_tail
52272 + }
52273 + }
52274 + },
52275 + [BLACK_BOX_ID] = {
52276 + .h = {
52277 + .type_id = REISER4_ITEM_PLUGIN_TYPE,
52278 + .id = BLACK_BOX_ID,
52279 + .pops = NULL,
52280 + .label = "blackbox",
52281 + .desc = "black box item",
52282 + .linkage = {NULL, NULL}
52283 + },
52284 + .b = {
52285 + .item_type = OTHER_ITEM_TYPE,
52286 + .max_key_inside = NULL,
52287 + .can_contain_key = NULL,
52288 + .mergeable = not_mergeable,
52289 + .nr_units = nr_units_single_unit,
52290 + /* no need for a ->lookup method */
52291 + .lookup = NULL,
52292 + .init = NULL,
52293 + .paste = NULL,
52294 + .fast_paste = NULL,
52295 + .can_shift = NULL,
52296 + .copy_units = NULL,
52297 + .create_hook = NULL,
52298 + .kill_hook = NULL,
52299 + .shift_hook = NULL,
52300 + .cut_units = NULL,
52301 + .kill_units = NULL,
52302 + .unit_key = NULL,
52303 + .max_unit_key = NULL,
52304 + .estimate = NULL,
52305 + .item_data_by_flow = NULL,
52306 +#if REISER4_DEBUG
52307 + .check = NULL
52308 +#endif
52309 + }
52310 + }
52311 +};
52312 +
52313 +/* Make Linus happy.
52314 + Local variables:
52315 + c-indentation-style: "K&R"
52316 + mode-name: "LC"
52317 + c-basic-offset: 8
52318 + tab-width: 8
52319 + fill-column: 120
52320 + End:
52321 +*/
52322 Index: linux-2.6.16/fs/reiser4/plugin/item/item.h
52323 ===================================================================
52324 --- /dev/null
52325 +++ linux-2.6.16/fs/reiser4/plugin/item/item.h
52326 @@ -0,0 +1,399 @@
52327 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
52328 +
52329 +/* first read balance.c comments before reading this */
52330 +
52331 +/* An item_plugin implements all of the operations required for
52332 + balancing that are item specific. */
52333 +
52334 +/* an item plugin also implements other operations that are specific to that
52335 + item. These go into the item specific operations portion of the item
52336 + handler, and all of the item specific portions of the item handler are put
52337 + into a union. */
52338 +
52339 +#if !defined( __REISER4_ITEM_H__ )
52340 +#define __REISER4_ITEM_H__
52341 +
52342 +#include "../../forward.h"
52343 +#include "../plugin_header.h"
52344 +#include "../../dformat.h"
52345 +#include "../../seal.h"
52346 +#include "../../plugin/file/file.h"
52347 +
52348 +#include <linux/fs.h> /* for struct file, struct inode */
52349 +#include <linux/mm.h> /* for struct page */
52350 +#include <linux/dcache.h> /* for struct dentry */
52351 +
52352 +typedef enum {
52353 + STAT_DATA_ITEM_TYPE,
52354 + DIR_ENTRY_ITEM_TYPE,
52355 + INTERNAL_ITEM_TYPE,
52356 + UNIX_FILE_METADATA_ITEM_TYPE,
52357 + OTHER_ITEM_TYPE
52358 +} item_type_id;
52359 +
52360 +/* this is the part of each item plugin that all items are expected to
52361 + support or at least explicitly fail to support by setting the
52362 + pointer to null. */
52363 +typedef struct {
52364 + item_type_id item_type;
52365 +
52366 + /* operations called by balancing
52367 +
52368 + It is interesting to consider that some of these item
52369 + operations could be given sources or targets that are not
52370 + really items in nodes. This could be ok/useful.
52371 +
52372 + */
52373 + /* maximal key that can _possibly_ be occupied by this item
52374 +
52375 + When inserting, the node's ->lookup() method (called by
52376 + coord_by_key()) reaches an item after binary search; the
52377 + ->max_key_inside() item plugin method is then used to determine
52378 + whether the new data should be pasted into the existing item
52379 + (new_key <= max_key_inside()) or a new item has to be created
52380 + (new_key > max_key_inside()).
52381 +
52382 + For items that occupy exactly one key (like stat-data)
52383 + this method should return this key. For items that can
52384 + grow indefinitely (extent, directory item) this should
52385 + return max_key().
52386 +
52387 + For example, for an extent with the key
52388 +
52389 + (LOCALITY,4,OBJID,STARTING-OFFSET), and length BLK blocks,
52390 +
52391 + ->max_key_inside is (LOCALITY,4,OBJID,0xffffffffffffffff).
52392 + */
52393 + reiser4_key *(*max_key_inside) (const coord_t *, reiser4_key *);
52394 +
52395 + /* true if item @coord can merge data at @key. */
52396 + int (*can_contain_key) (const coord_t *, const reiser4_key *,
52397 + const reiser4_item_data *);
52398 + /* mergeable() - check items for mergeability
52399 +
52400 + Optional method. Returns true if two items can be merged.
52401 +
52402 + */
52403 + int (*mergeable) (const coord_t *, const coord_t *);
52404 +
52405 + /* number of atomic things in an item */
52406 + pos_in_node_t(*nr_units) (const coord_t *);
52407 +
52408 + /* search within the item for a unit, and return a pointer to
52409 + it. This can be used to calculate how many bytes to shrink
52410 + an item: use pointer arithmetic and compare against the
52411 + start of the item body, provided the item's data are
52412 + contiguous in the node. If the item's data are not
52413 + contiguous in the node, all sorts of other things are
52414 + probably going to break as well. */
52415 + lookup_result(*lookup) (const reiser4_key *, lookup_bias, coord_t *);
52416 + /* method called by node_plugin->create_item() to initialise a
52417 + new item */
52418 + int (*init) (coord_t * target, coord_t * from,
52419 + reiser4_item_data * data);
52420 + /* method called (e.g., by resize_item()) to place new data into
52421 + item when it grows */
52422 + int (*paste) (coord_t *, reiser4_item_data *, carry_plugin_info *);
52423 + /* return true if a paste into @coord is allowed to skip
52424 + carry; that is, if such a paste would not require any
52425 + changes at the parent level
52426 + */
52427 + int (*fast_paste) (const coord_t *);
52428 + /* how many units of @source, but not more than @want, can be
52429 + shifted into the @target node. If pend == append, we try to
52430 + append the last item of @target with the first units of
52431 + @source; if pend == prepend, we try to "prepend" the first
52432 + item in @target with the last units of @source. The @target
52433 + node has @free_space bytes of free space. The total size of
52434 + those units is returned via @size.
52435 +
52436 + @target is not NULL if shifting into a mergeable item, and
52437 + NULL if a new item will be created during shifting.
52438 + */
52439 + int (*can_shift) (unsigned free_space, coord_t *,
52440 + znode *, shift_direction, unsigned *size,
52441 + unsigned want);
52442 +
52443 + /* starting with the @from-th unit of item @source, append or
52444 + prepend @count units to @target. @target has already been
52445 + expanded by @free_space bytes; that must be exactly what is
52446 + needed for those units in @target. If @where_is_free_space
52447 + == SHIFT_LEFT, the free space is at the end of the @target
52448 + item; otherwise it is at the beginning of it. */
52449 + void (*copy_units) (coord_t *, coord_t *,
52450 + unsigned from, unsigned count,
52451 + shift_direction where_is_free_space,
52452 + unsigned free_space);
52453 +
52454 + int (*create_hook) (const coord_t *, void *);
52455 + /* do whatever is necessary to do when @count units starting
52456 + from @from-th one are removed from the tree */
52457 + /* FIXME-VS: this used to be here for, in particular, extents
52458 + and items of internal type, to free the blocks they point
52459 + to at the same time as removing the items from the
52460 + tree. Problems start, however, when dealloc_block fails for
52461 + some reason: the item gets removed, but the blocks it
52462 + pointed to are not freed. It is not clear how to fix this
52463 + for items of internal type, because the need to remove an
52464 + internal item may appear in the middle of balancing, and
52465 + there is no way to undo the changes made. OTOH, if the
52466 + space allocator involves balancing to perform dealloc_block,
52467 + this will probably break balancing due to deadlock issues
52468 + */
52469 + int (*kill_hook) (const coord_t *, pos_in_node_t from,
52470 + pos_in_node_t count, struct carry_kill_data *);
52471 + int (*shift_hook) (const coord_t *, unsigned from, unsigned count,
52472 + znode * _node);
52473 +
52474 + /* unit @*from contains @from_key and unit @*to contains @to_key. Cut all keys between @from_key and @to_key,
52475 + including the boundaries. When units are cut from the beginning of the item, move the freed space to the
52476 + head of the item; when units are cut from the end, move it to the end; when units are cut from the middle,
52477 + move it to the head. Return the amount of space that was freed. Save the smallest removed key in
52478 + @smallest_removed if it is not 0, and the new first item key in @new_first_key if it is not 0.
52479 + */
52480 + int (*cut_units) (coord_t *, pos_in_node_t from, pos_in_node_t to,
52481 + struct carry_cut_data *,
52482 + reiser4_key * smallest_removed,
52483 + reiser4_key * new_first_key);
52484 +
52485 + /* like cut_units, except that these units are removed from the
52486 + tree, not only from a node */
52487 + int (*kill_units) (coord_t *, pos_in_node_t from, pos_in_node_t to,
52488 + struct carry_kill_data *,
52489 + reiser4_key * smallest_removed,
52490 + reiser4_key * new_first);
52491 +
52492 + /* if @key_of_coord == 1, the key of the coord is returned;
52493 + otherwise the key of the unit is returned. If @coord is not
52494 + set to a certain unit, ERR_PTR(-ENOENT) is returned */
52495 + reiser4_key *(*unit_key) (const coord_t *, reiser4_key *);
52496 + reiser4_key *(*max_unit_key) (const coord_t *, reiser4_key *);
52497 + /* estimate how much space is needed to paste @data into the item
52498 + at @coord. If @coord == 0, estimate insertion; otherwise
52499 + estimate pasting
52500 + */
52501 + int (*estimate) (const coord_t *, const reiser4_item_data *);
52502 +
52503 + /* converts flow @f to item data. @coord == 0 on insert */
52504 + int (*item_data_by_flow) (const coord_t *, const flow_t *,
52505 + reiser4_item_data *);
52506 +
52507 + /*void (*show) (struct seq_file *, coord_t *); */
52508 +
52509 +#if REISER4_DEBUG
52510 + /* used for debugging, every item should have here the most
52511 + complete possible check of the consistency of the item that
52512 + the inventor can construct */
52513 + int (*check) (const coord_t *, const char **error);
52514 +#endif
52515 +
52516 +} balance_ops;
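Many slots in balance_ops are optional: a NULL pointer means the caller falls back to generic behaviour, as are_items_mergeable() does in item.c above. The sketch below shows that dispatch pattern with invented names (toy_ops, call_mergeable, never_mergeable); it illustrates the pattern, not the reiser4 API.

#include <stdio.h>

struct toy_ops {
	int (*mergeable)(int a, int b);	/* optional method slot */
};

/* generic rule used when the plugin has no opinion of its own */
static int default_mergeable(int a, int b)
{
	return a == b;
}

/* dispatch through the optional slot, falling back to the default */
static int call_mergeable(const struct toy_ops *ops, int a, int b)
{
	if (ops->mergeable != NULL)
		return ops->mergeable(a, b);
	return default_mergeable(a, b);
}

static int never_mergeable(int a, int b)
{
	(void)a; (void)b;
	return 0;
}

int main(void)
{
	struct toy_ops plain = { .mergeable = NULL };
	struct toy_ops strict = { .mergeable = never_mergeable };

	printf("%d %d\n", call_mergeable(&plain, 1, 1),
	       call_mergeable(&strict, 1, 1));	/* prints "1 0" */
	return 0;
}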
52517 +
52518 +typedef struct {
52519 + /* return the right or left child of @coord, only if it is in memory */
52520 + int (*utmost_child) (const coord_t *, sideof side, jnode ** child);
52521 +
52522 + /* return whether the right or left child of @coord has a non-fake
52523 + block number. */
52524 + int (*utmost_child_real_block) (const coord_t *, sideof side,
52525 + reiser4_block_nr *);
52526 + /* relocate child at @coord to the @block */
52527 + void (*update) (const coord_t *, const reiser4_block_nr *);
52528 + /* count unformatted nodes per item for the leaf relocation policy, etc. */
52529 + int (*scan) (flush_scan * scan);
52530 + /* convert item by flush */
52531 + int (*convert) (flush_pos_t * pos);
52532 + /* backward mapping from jnode offset to a key. */
52533 + int (*key_by_offset) (struct inode *, loff_t, reiser4_key *);
52534 +} flush_ops;
52535 +
52536 +/* operations specific to the directory item */
52537 +typedef struct {
52538 + /* extract stat-data key from directory entry at @coord and place it
52539 + into @key. */
52540 + int (*extract_key) (const coord_t *, reiser4_key * key);
52541 + /* update object key in item. */
52542 + int (*update_key) (const coord_t *, const reiser4_key *, lock_handle *);
52543 + /* extract name from directory entry at @coord and return it */
52544 + char *(*extract_name) (const coord_t *, char *buf);
52545 + /* extract file type (DT_* stuff) from directory entry at @coord and
52546 + return it */
52547 + unsigned (*extract_file_type) (const coord_t *);
52548 + int (*add_entry) (struct inode * dir,
52549 + coord_t *, lock_handle *,
52550 + const struct dentry * name,
52551 + reiser4_dir_entry_desc * entry);
52552 + int (*rem_entry) (struct inode * dir, const struct qstr * name,
52553 + coord_t *, lock_handle *,
52554 + reiser4_dir_entry_desc * entry);
52555 + int (*max_name_len) (const struct inode * dir);
52556 +} dir_entry_ops;
52557 +
52558 +/* operations specific to the items that regular (unix) file metadata are built of */
52559 +typedef struct {
52560 + int (*write) (struct file *, const char __user *, size_t, loff_t *pos);
52561 + int (*read) (struct file *, flow_t *, hint_t *);
52562 + int (*readpage) (void *, struct page *);
52563 + int (*get_block) (const coord_t *, sector_t, sector_t *);
52564 + void (*readpages) (void *, struct address_space *,
52565 + struct list_head * pages);
52566 + /*
52567 + * key of the first byte which is not addressed by the item that
52568 + * @coord is set to.
52569 + * For example, for an extent item with the key
52570 + *
52571 + * (LOCALITY,4,OBJID,STARTING-OFFSET), and length BLK blocks,
52572 + *
52573 + * ->append_key is
52574 + *
52575 + * (LOCALITY,4,OBJID,STARTING-OFFSET + BLK * block_size)
52576 + */
52577 + reiser4_key *(*append_key) (const coord_t *, reiser4_key *);
52578 +
52579 + void (*init_coord_extension) (uf_coord_t *, loff_t);
52580 +} file_ops;
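The ->append_key() example in the comment above reduces to simple arithmetic; here it is as a standalone sketch, with plain integers standing in for the key components (the function name append_offset is invented).

#include <stdint.h>
#include <stdio.h>

/* first byte offset not covered by an extent starting at byte offset
   "start" and spanning "blk" blocks of "block_size" bytes each */
static uint64_t append_offset(uint64_t start, uint64_t blk,
			      uint64_t block_size)
{
	return start + blk * block_size;
}

int main(void)
{
	/* extent at offset 8192, 16 blocks of 4096 bytes: prints 73728 */
	printf("append key offset: %llu\n",
	       (unsigned long long)append_offset(8192, 16, 4096));
	return 0;
}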
52581 +
52582 +/* operations specific to items of stat data type */
52583 +typedef struct {
52584 + int (*init_inode) (struct inode * inode, char *sd, int len);
52585 + int (*save_len) (struct inode * inode);
52586 + int (*save) (struct inode * inode, char **area);
52587 +} sd_ops;
52588 +
52589 +/* operations specific to internal item */
52590 +typedef struct {
52591 + /* all that tree traversal wants to know from an internal item
52592 + is where to go next. */
52593 + void (*down_link) (const coord_t * coord,
52594 + const reiser4_key * key, reiser4_block_nr * block);
52595 + /* check that given internal item contains given pointer. */
52596 + int (*has_pointer_to) (const coord_t * coord,
52597 + const reiser4_block_nr * block);
52598 +} internal_item_ops;
52599 +
52600 +struct item_plugin {
52601 + /* generic fields */
52602 + plugin_header h;
52603 +
52604 + /* methods common for all item types */
52605 + balance_ops b;
52606 + /* methods used during flush */
52607 + flush_ops f;
52608 +
52609 + /* methods specific to particular type of item */
52610 + union {
52611 + dir_entry_ops dir;
52612 + file_ops file;
52613 + sd_ops sd;
52614 + internal_item_ops internal;
52615 + } s;
52616 +
52617 +};
52618 +
52619 +static inline item_id item_id_by_plugin(item_plugin * plugin)
52620 +{
52621 + return plugin->h.id;
52622 +}
52623 +
52624 +static inline char get_iplugid(item_plugin * iplug)
52625 +{
52626 + assert("nikita-2838", iplug != NULL);
52627 + assert("nikita-2839", iplug->h.id < 0xff);
52628 + return (char)item_id_by_plugin(iplug);
52629 +}
52630 +
52631 +extern unsigned long znode_times_locked(const znode * z);
52632 +
52633 +static inline void coord_set_iplug(coord_t * coord, item_plugin * iplug)
52634 +{
52635 + assert("nikita-2837", coord != NULL);
52636 + assert("nikita-2838", iplug != NULL);
52637 + coord->iplugid = get_iplugid(iplug);
52638 + ON_DEBUG(coord->plug_v = znode_times_locked(coord->node));
52639 +}
52640 +
52641 +static inline item_plugin *coord_iplug(const coord_t * coord)
52642 +{
52643 + assert("nikita-2833", coord != NULL);
52644 + assert("nikita-2834", coord->iplugid != INVALID_PLUGID);
52645 + assert("nikita-3549", coord->plug_v == znode_times_locked(coord->node));
52646 + return (item_plugin *) plugin_by_id(REISER4_ITEM_PLUGIN_TYPE,
52647 + coord->iplugid);
52648 +}
52649 +
52650 +extern int item_can_contain_key(const coord_t * item, const reiser4_key * key,
52651 + const reiser4_item_data *);
52652 +extern int are_items_mergeable(const coord_t * i1, const coord_t * i2);
52653 +extern int item_is_extent(const coord_t *);
52654 +extern int item_is_tail(const coord_t *);
52655 +extern int item_is_statdata(const coord_t * item);
52656 +extern int item_is_ctail(const coord_t *);
52657 +
52658 +extern pos_in_node_t item_length_by_coord(const coord_t * coord);
52659 +extern item_type_id item_type_by_coord(const coord_t * coord);
52660 +extern item_id item_id_by_coord(const coord_t * coord /* coord to query */ );
52661 +extern reiser4_key *item_key_by_coord(const coord_t * coord, reiser4_key * key);
52662 +extern reiser4_key *max_item_key_by_coord(const coord_t *, reiser4_key *);
52663 +extern reiser4_key *unit_key_by_coord(const coord_t * coord, reiser4_key * key);
52664 +extern reiser4_key *max_unit_key_by_coord(const coord_t * coord,
52665 + reiser4_key * key);
52666 +
52667 +extern void obtain_item_plugin(const coord_t * coord);
52668 +
52669 +#if defined(REISER4_DEBUG)
52670 +extern int znode_is_loaded(const znode * node);
52671 +#endif
52672 +
52673 +/* return plugin of item at @coord */
52674 +static inline item_plugin *item_plugin_by_coord(const coord_t *
52675 + coord /* coord to query */ )
52676 +{
52677 + assert("nikita-330", coord != NULL);
52678 + assert("nikita-331", coord->node != NULL);
52679 + assert("nikita-332", znode_is_loaded(coord->node));
52680 +
52681 + if (unlikely(!coord_is_iplug_set(coord)))
52682 + obtain_item_plugin(coord);
52683 + return coord_iplug(coord);
52684 +}
52685 +
52686 +/* this returns true if item is of internal type */
52687 +static inline int item_is_internal(const coord_t * item)
52688 +{
52689 + assert("vs-483", coord_is_existing_item(item));
52690 + return item_type_by_coord(item) == INTERNAL_ITEM_TYPE;
52691 +}
52692 +
52693 +extern void item_body_by_coord_hard(coord_t * coord);
52694 +extern void *item_body_by_coord_easy(const coord_t * coord);
52695 +#if REISER4_DEBUG
52696 +extern int item_body_is_valid(const coord_t * coord);
52697 +#endif
52698 +
52699 +/* return pointer to item body */
52700 +static inline void *item_body_by_coord(const coord_t *
52701 + coord /* coord to query */ )
52702 +{
52703 + assert("nikita-324", coord != NULL);
52704 + assert("nikita-325", coord->node != NULL);
52705 + assert("nikita-326", znode_is_loaded(coord->node));
52706 +
52707 + if (coord->offset == INVALID_OFFSET)
52708 + item_body_by_coord_hard((coord_t *) coord);
52709 + assert("nikita-3201", item_body_is_valid(coord));
52710 + assert("nikita-3550", coord->body_v == znode_times_locked(coord->node));
52711 + return item_body_by_coord_easy(coord);
52712 +}
52713 +
52714 +/* __REISER4_ITEM_H__ */
52715 +#endif
52716 +/* Make Linus happy.
52717 + Local variables:
52718 + c-indentation-style: "K&R"
52719 + mode-name: "LC"
52720 + c-basic-offset: 8
52721 + tab-width: 8
52722 + fill-column: 120
52723 + scroll-step: 1
52724 + End:
52725 +*/
52726 Index: linux-2.6.16/fs/reiser4/plugin/item/sde.c
52727 ===================================================================
52728 --- /dev/null
52729 +++ linux-2.6.16/fs/reiser4/plugin/item/sde.c
52730 @@ -0,0 +1,190 @@
52731 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
52732 +
52733 +/* Directory entry implementation */
52734 +#include "../../forward.h"
52735 +#include "../../debug.h"
52736 +#include "../../dformat.h"
52737 +#include "../../kassign.h"
52738 +#include "../../coord.h"
52739 +#include "sde.h"
52740 +#include "item.h"
52741 +#include "../plugin.h"
52742 +#include "../../znode.h"
52743 +#include "../../carry.h"
52744 +#include "../../tree.h"
52745 +#include "../../inode.h"
52746 +
52747 +#include <linux/fs.h> /* for struct inode */
52748 +#include <linux/dcache.h> /* for struct dentry */
52749 +#include <linux/quotaops.h>
52750 +
52751 +/* ->extract_key() method of simple directory item plugin. */
52752 +int extract_key_de(const coord_t * coord /* coord of item */ ,
52753 + reiser4_key * key /* resulting key */ )
52754 +{
52755 + directory_entry_format *dent;
52756 +
52757 + assert("nikita-1458", coord != NULL);
52758 + assert("nikita-1459", key != NULL);
52759 +
52760 + dent = (directory_entry_format *) item_body_by_coord(coord);
52761 + assert("nikita-1158", item_length_by_coord(coord) >= (int)sizeof *dent);
52762 + return extract_key_from_id(&dent->id, key);
52763 +}
52764 +
52765 +int
52766 +update_key_de(const coord_t * coord, const reiser4_key * key,
52767 + lock_handle * lh UNUSED_ARG)
52768 +{
52769 + directory_entry_format *dent;
52770 + obj_key_id obj_id;
52771 + int result;
52772 +
52773 + assert("nikita-2342", coord != NULL);
52774 + assert("nikita-2343", key != NULL);
52775 +
52776 + dent = (directory_entry_format *) item_body_by_coord(coord);
52777 + result = build_obj_key_id(key, &obj_id);
52778 + if (result == 0) {
52779 + dent->id = obj_id;
52780 + znode_make_dirty(coord->node);
52781 + }
52782 + return result; /* propagate build_obj_key_id() failure */
52783 +}
52784 +
52785 +char *extract_dent_name(const coord_t * coord, directory_entry_format * dent,
52786 + char *buf)
52787 +{
52788 + reiser4_key key;
52789 +
52790 + unit_key_by_coord(coord, &key);
52791 + if (get_key_type(&key) != KEY_FILE_NAME_MINOR)
52792 + reiser4_print_address("oops", znode_get_block(coord->node));
52793 + if (!is_longname_key(&key)) {
52794 + if (is_dot_key(&key))
52795 + return (char *)".";
52796 + else
52797 + return extract_name_from_key(&key, buf);
52798 + } else
52799 + return (char *)dent->name;
52800 +}
52801 +
52802 +/* ->extract_name() method of simple directory item plugin. */
52803 +char *extract_name_de(const coord_t * coord /* coord of item */ , char *buf)
52804 +{
52805 + directory_entry_format *dent;
52806 +
52807 + assert("nikita-1460", coord != NULL);
52808 +
52809 + dent = (directory_entry_format *) item_body_by_coord(coord);
52810 + return extract_dent_name(coord, dent, buf);
52811 +}
52812 +
52813 +/* ->extract_file_type() method of simple directory item plugin. */
52814 +unsigned extract_file_type_de(const coord_t * coord UNUSED_ARG /* coord of
52815 + * item */ )
52816 +{
52817 + assert("nikita-1764", coord != NULL);
52818 + /* we don't store file type in the directory entry yet.
52819 +
52820 + But see comments at kassign.h:obj_key_id
52821 + */
52822 + return DT_UNKNOWN;
52823 +}
52824 +
52825 +int add_entry_de(struct inode *dir /* directory of item */ ,
52826 + coord_t * coord /* coord of item */ ,
52827 + lock_handle * lh /* insertion lock handle */ ,
52828 + const struct dentry *de /* name to add */ ,
52829 + reiser4_dir_entry_desc * entry /* parameters of new directory
52830 + * entry */ )
52831 +{
52832 + reiser4_item_data data;
52833 + directory_entry_format *dent;
52834 + int result;
52835 + const char *name;
52836 + int len;
52837 + int longname;
52838 +
52839 + name = de->d_name.name;
52840 + len = de->d_name.len;
52841 + assert("nikita-1163", strlen(name) == len);
52842 +
52843 + longname = is_longname(name, len);
52844 +
52845 + data.length = sizeof *dent;
52846 + if (longname)
52847 + data.length += len + 1;
52848 + data.data = NULL;
52849 + data.user = 0;
52850 + data.iplug = item_plugin_by_id(SIMPLE_DIR_ENTRY_ID);
52851 +
52852 + /* NOTE-NIKITA quota plugin */
52853 + if (DQUOT_ALLOC_SPACE_NODIRTY(dir, data.length))
52854 + return -EDQUOT;
52855 +
52856 + result = insert_by_coord(coord, &data, &entry->key, lh, 0 /*flags */ );
52857 + if (result != 0)
52858 + return result;
52859 +
52860 + dent = (directory_entry_format *) item_body_by_coord(coord);
52861 + build_inode_key_id(entry->obj, &dent->id);
52862 + if (longname) {
52863 + memcpy(dent->name, name, len);
52864 + put_unaligned(0, &dent->name[len]);
52865 + }
52866 + return 0;
52867 +}
52868 +
52869 +int rem_entry_de(struct inode *dir /* directory of item */ ,
52870 + const struct qstr *name UNUSED_ARG,
52871 + coord_t * coord /* coord of item */ ,
52872 + lock_handle * lh UNUSED_ARG /* lock handle for
52873 + * removal */ ,
52874 + reiser4_dir_entry_desc * entry UNUSED_ARG /* parameters of
52875 + * directory entry
52876 + * being removed */ )
52877 +{
52878 + coord_t shadow;
52879 + int result;
52880 + int length;
52881 +
52882 + length = item_length_by_coord(coord);
52883 + if (inode_get_bytes(dir) < length) {
52884 + warning("nikita-2627", "Dir is broke: %llu: %llu",
52885 + (unsigned long long)get_inode_oid(dir),
52886 + inode_get_bytes(dir));
52887 +
52888 + return RETERR(-EIO);
52889 + }
52890 +
52891 + /* cut_node() is supposed to take pointers to _different_
52892 + coords, because it will modify them without regard to
52893 + possible aliasing. To work around this, create a temporary
52894 + copy of @coord.
52895 + */
52896 + coord_dup(&shadow, coord);
52897 + result =
52898 + kill_node_content(coord, &shadow, NULL, NULL, NULL, NULL, NULL, 0);
52899 + if (result == 0) {
52900 + /* NOTE-NIKITA quota plugin */
52901 + DQUOT_FREE_SPACE_NODIRTY(dir, length);
52902 + }
52903 + return result;
52904 +}
52905 +
52906 +int max_name_len_de(const struct inode *dir)
52907 +{
52908 + return tree_by_inode(dir)->nplug->max_item_size() -
52909 + sizeof(directory_entry_format) - 2;
52910 +}
52911 +
52912 +/* Make Linus happy.
52913 + Local variables:
52914 + c-indentation-style: "K&R"
52915 + mode-name: "LC"
52916 + c-basic-offset: 8
52917 + tab-width: 8
52918 + fill-column: 120
52919 + End:
52920 +*/
52921 Index: linux-2.6.16/fs/reiser4/plugin/item/sde.h
52922 ===================================================================
52923 --- /dev/null
52924 +++ linux-2.6.16/fs/reiser4/plugin/item/sde.h
52925 @@ -0,0 +1,66 @@
52926 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
52927 +
52928 +/* Directory entry. */
52929 +
52930 +#if !defined( __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ )
52931 +#define __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__
52932 +
52933 +#include "../../forward.h"
52934 +#include "../../dformat.h"
52935 +#include "../../kassign.h"
52936 +#include "../../key.h"
52937 +
52938 +#include <linux/fs.h>
52939 +#include <linux/dcache.h> /* for struct dentry */
52940 +
52941 +typedef struct directory_entry_format {
52942 + /* key of the object's stat-data. It is not necessary to store the
52943 + whole key here, because it is always a stat-data key, so the minor
52944 + packing locality and the offset can be omitted. But this relies on
52945 + a particular key allocation scheme for stat-data, so, for
52946 + extensibility's sake, the whole key can be stored here.
52947 +
52948 + We store the key as an array of bytes, because we don't want 8-byte
52949 + alignment of dir entries.
52950 + */
52951 + obj_key_id id;
52952 + /* file name. Null terminated string. */
52953 + d8 name[0];
52954 +} directory_entry_format;
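directory_entry_format ends in a zero-length name array, the classic variable-length-record layout. The sketch below shows the same sizing rule add_entry_de() uses for long names (header plus name plus terminating NUL), using a C99 flexible array member; toy_dirent and its dummy header are invented stand-ins for the on-disk structure.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct toy_dirent {
	unsigned char id[16];	/* stands in for obj_key_id */
	char name[];		/* C99 flexible array member */
};

static struct toy_dirent *make_dirent(const char *name)
{
	size_t len = strlen(name);
	/* header plus name plus terminating NUL, as add_entry_de()
	   sizes the entry for long names */
	struct toy_dirent *de = malloc(sizeof(*de) + len + 1);

	if (de != NULL) {
		memset(de->id, 0, sizeof(de->id));
		memcpy(de->name, name, len + 1);
	}
	return de;
}

int main(void)
{
	struct toy_dirent *de = make_dirent("a-rather-long-file-name");

	if (de != NULL) {
		printf("%s\n", de->name);
		free(de);
	}
	return 0;
}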
52955 +
52956 +void print_de(const char *prefix, coord_t * coord);
52957 +int extract_key_de(const coord_t * coord, reiser4_key * key);
52958 +int update_key_de(const coord_t * coord, const reiser4_key * key,
52959 + lock_handle * lh);
52960 +char *extract_name_de(const coord_t * coord, char *buf);
52961 +unsigned extract_file_type_de(const coord_t * coord);
52962 +int add_entry_de(struct inode *dir, coord_t * coord,
52963 + lock_handle * lh, const struct dentry *name,
52964 + reiser4_dir_entry_desc * entry);
52965 +int rem_entry_de(struct inode *dir, const struct qstr *name, coord_t * coord,
52966 + lock_handle * lh, reiser4_dir_entry_desc * entry);
52967 +int max_name_len_de(const struct inode *dir);
52968 +
52969 +int de_rem_and_shrink(struct inode *dir, coord_t * coord, int length);
52970 +
52971 +char *extract_dent_name(const coord_t * coord,
52972 + directory_entry_format * dent, char *buf);
52973 +
52974 +#if REISER4_LARGE_KEY
52975 +#define DE_NAME_BUF_LEN (24)
52976 +#else
52977 +#define DE_NAME_BUF_LEN (16)
52978 +#endif
52979 +
52980 +/* __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ */
52981 +#endif
52982 +
52983 +/* Make Linus happy.
52984 + Local variables:
52985 + c-indentation-style: "K&R"
52986 + mode-name: "LC"
52987 + c-basic-offset: 8
52988 + tab-width: 8
52989 + fill-column: 120
52990 + End:
52991 +*/
52992 Index: linux-2.6.16/fs/reiser4/plugin/item/static_stat.c
52993 ===================================================================
52994 --- /dev/null
52995 +++ linux-2.6.16/fs/reiser4/plugin/item/static_stat.c
52996 @@ -0,0 +1,1040 @@
52997 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
52998 +
52999 +/* stat data manipulation. */
53000 +
53001 +#include "../../forward.h"
53002 +#include "../../super.h"
53003 +#include "../../vfs_ops.h"
53004 +#include "../../inode.h"
53005 +#include "../../debug.h"
53006 +#include "../../dformat.h"
53007 +#include "../object.h"
53008 +#include "../plugin.h"
53009 +#include "../plugin_header.h"
53010 +#include "static_stat.h"
53011 +#include "item.h"
53012 +
53013 +#include <linux/types.h>
53014 +#include <linux/fs.h>
53015 +
53016 +/* see static_stat.h for explanation */
53017 +
53018 +/* helper function used while we are dumping/loading inode/plugin state
53019 + to/from the stat-data. */
53020 +
53021 +static void move_on(int *length /* space remaining in stat-data */ ,
53022 + char **area /* current coord in stat data */ ,
53023 + int size_of /* how many bytes to move forward */ )
53024 +{
53025 + assert("nikita-615", length != NULL);
53026 + assert("nikita-616", area != NULL);
53027 +
53028 + *length -= size_of;
53029 + *area += size_of;
53030 +
53031 + assert("nikita-617", *length >= 0);
53032 +}
53033 +
53034 +/* helper function used while loading inode/plugin state from stat-data.
53035 + Complain if there is less space in stat-data than was expected.
53036 + Can only happen on disk corruption. */
53037 +static int not_enough_space(struct inode *inode /* object being processed */ ,
53038 + const char *where /* error message */ )
53039 +{
53040 + assert("nikita-618", inode != NULL);
53041 +
53042 + warning("nikita-619", "Not enough space in %llu while loading %s",
53043 + (unsigned long long)get_inode_oid(inode), where);
53044 +
53045 + return RETERR(-EINVAL);
53046 +}
53047 +
53048 +/* helper function used while loading inode/plugin state from
53049 + stat-data. Call it if invalid plugin id was found. */
53050 +static int unknown_plugin(reiser4_plugin_id id /* invalid id */ ,
53051 + struct inode *inode /* object being processed */ )
53052 +{
53053 + warning("nikita-620", "Unknown plugin %i in %llu",
53054 + id, (unsigned long long)get_inode_oid(inode));
53055 +
53056 + return RETERR(-EINVAL);
53057 +}
53058 +
53059 +/* this is installed as ->init_inode() method of
53060 + item_plugins[ STATIC_STAT_DATA_IT ] (fs/reiser4/plugin/item/item.c).
53061 + Copies data from on-disk stat-data format into inode.
53062 + Handles stat-data extensions. */
53063 +/* was sd_load */
53064 +int init_inode_static_sd(struct inode *inode /* object being processed */ ,
53065 + char *sd /* stat-data body */ ,
53066 + int len /* length of stat-data */ )
53067 +{
53068 + int result;
53069 + int bit;
53070 + int chunk;
53071 + __u16 mask;
53072 + __u64 bigmask;
53073 + reiser4_stat_data_base *sd_base;
53074 + reiser4_inode *state;
53075 +
53076 + assert("nikita-625", inode != NULL);
53077 + assert("nikita-626", sd != NULL);
53078 +
53079 + result = 0;
53080 + sd_base = (reiser4_stat_data_base *) sd;
53081 + state = reiser4_inode_data(inode);
53082 + mask = le16_to_cpu(get_unaligned(&sd_base->extmask));
53083 + bigmask = mask;
53084 + inode_set_flag(inode, REISER4_SDLEN_KNOWN);
53085 +
53086 + move_on(&len, &sd, sizeof *sd_base);
53087 + for (bit = 0, chunk = 0;
53088 + mask != 0 || bit <= LAST_IMPORTANT_SD_EXTENSION;
53089 + ++bit, mask >>= 1) {
53090 + if (((bit + 1) % 16) != 0) {
53091 + /* handle extension */
53092 + sd_ext_plugin *sdplug;
53093 +
53094 + if (bit >= LAST_SD_EXTENSION) {
53095 + warning("vpf-1904",
53096 + "No such extension %i in inode %llu",
53097 + bit,
53098 + (unsigned long long)
53099 + get_inode_oid(inode));
53100 +
53101 + result = RETERR(-EINVAL);
53102 + break;
53103 + }
53104 +
53105 + sdplug = sd_ext_plugin_by_id(bit);
53106 + if (sdplug == NULL) {
53107 + warning("nikita-627",
53108 + "No such extension %i in inode %llu",
53109 + bit,
53110 + (unsigned long long)
53111 + get_inode_oid(inode));
53112 +
53113 + result = RETERR(-EINVAL);
53114 + break;
53115 + }
53116 + if (mask & 1) {
53117 + assert("nikita-628", sdplug->present);
53118 + /* alignment is not supported in node layout
53119 + plugin yet.
53120 + result = align( inode, &len, &sd,
53121 + sdplug -> alignment );
53122 + if( result != 0 )
53123 + return result; */
53124 + result = sdplug->present(inode, &sd, &len);
53125 + } else if (sdplug->absent != NULL)
53126 + result = sdplug->absent(inode);
53127 + if (result)
53128 + break;
53129 + /* else, we are looking at the last bit in 16-bit
53130 + portion of bitmask */
53131 + } else if (mask & 1) {
53132 + /* next portion of bitmask */
53133 + if (len < (int)sizeof(d16)) {
53134 + warning("nikita-629",
53135 + "No space for bitmap in inode %llu",
53136 + (unsigned long long)
53137 + get_inode_oid(inode));
53138 +
53139 + result = RETERR(-EINVAL);
53140 + break;
53141 + }
53142 + mask = le16_to_cpu(get_unaligned((d16 *)sd));
53143 + bigmask <<= 16;
53144 + bigmask |= mask;
53145 + move_on(&len, &sd, sizeof(d16));
53146 + ++chunk;
53147 + if (chunk == 3) {
53148 + if (!(mask & 0x8000)) {
53149 + /* clear last bit */
53150 + mask &= ~0x8000;
53151 + continue;
53152 + }
53153 + /* too much */
53154 + warning("nikita-630",
53155 + "Too many extensions in %llu",
53156 + (unsigned long long)
53157 + get_inode_oid(inode));
53158 +
53159 + result = RETERR(-EINVAL);
53160 + break;
53161 + }
53162 + } else
53163 + /* bitmask exhausted */
53164 + break;
53165 + }
53166 + state->extmask = bigmask;
53167 + /* common initialisations */
53168 + inode->i_blksize = get_super_private(inode->i_sb)->optimal_io_size;
53169 + if (len - (bit / 16 * sizeof(d16)) > 0) {
53170 + /* alignment in save_len_static_sd() is taken into account
53171 + -edward */
53172 + warning("nikita-631", "unused space in inode %llu",
53173 + (unsigned long long)get_inode_oid(inode));
53174 + }
53175 +
53176 + return result;
53177 +}
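+
+/*
+ * A minimal sketch (illustrative; not used by the code above) of the
+ * on-disk layout that init_inode_static_sd() decodes: the stat-data body
+ * starts with a 16-bit extension mask, followed by the payloads of the
+ * present extensions in bit order; bit 15 of each 16-bit mask chunk, when
+ * set, announces another mask chunk after the current chunk's payloads.
+ */
+#if 0
+static void example_build_minimal_sd(char *buf)
+{
+	reiser4_stat_data_base *base = (reiser4_stat_data_base *) buf;
+	reiser4_light_weight_stat *lw =
+	    (reiser4_light_weight_stat *) (buf + sizeof(*base));
+
+	/* only LIGHT_WEIGHT_STAT is present; bit 15 is clear, so no further
+	   mask chunks follow */
+	put_unaligned(cpu_to_le16(1 << LIGHT_WEIGHT_STAT), &base->extmask);
+	put_unaligned(cpu_to_le16(S_IFREG | 0644), &lw->mode);
+	put_unaligned(cpu_to_le32(1), &lw->nlink);
+	put_unaligned(cpu_to_le64(0), &lw->size);
+}
+#endif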
53178 +
53179 +/* estimates size of stat-data required to store inode.
53180 + Installed as ->save_len() method of
53181 + item_plugins[ STATIC_STAT_DATA_IT ] (fs/reiser4/plugin/item/item.c). */
53182 +/* was sd_len */
53183 +int save_len_static_sd(struct inode *inode /* object being processed */ )
53184 +{
53185 + unsigned int result;
53186 + __u64 mask;
53187 + int bit;
53188 +
53189 + assert("nikita-632", inode != NULL);
53190 +
53191 + result = sizeof(reiser4_stat_data_base);
53192 + mask = reiser4_inode_data(inode)->extmask;
53193 + for (bit = 0; mask != 0; ++bit, mask >>= 1) {
53194 + if (mask & 1) {
53195 + sd_ext_plugin *sdplug;
53196 +
53197 + sdplug = sd_ext_plugin_by_id(bit);
53198 + assert("nikita-633", sdplug != NULL);
53199 + /* no alignment support
53200 + result +=
53201 + round_up( result, sdplug -> alignment ) - result; */
53202 + result += sdplug->save_len(inode);
53203 + }
53204 + }
53205 + result += bit / 16 * sizeof(d16);
53206 + return result;
53207 +}
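+
+/*
+ * A worked example of the computation above: with LIGHT_WEIGHT_STAT and
+ * UNIX_STAT set in the extension mask, the loop exits with bit == 2, so
+ * the estimate is
+ *
+ *	sizeof(reiser4_stat_data_base)        (the leading 16-bit mask)
+ *	+ sizeof(reiser4_light_weight_stat)
+ *	+ sizeof(reiser4_unix_stat)
+ *	+ (2 / 16) * sizeof(d16)              (== 0: no extra mask chunks)
+ */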
53208 +
53209 +/* saves inode into stat-data.
53210 + Installed as ->save() method of
53211 + item_plugins[ STATIC_STAT_DATA_IT ] (fs/reiser4/plugin/item/item.c). */
53212 +/* was sd_save */
53213 +int save_static_sd(struct inode *inode /* object being processed */ ,
53214 + char **area /* where to save stat-data */ )
53215 +{
53216 + int result;
53217 + __u64 emask;
53218 + int bit;
53219 + unsigned int len;
53220 + reiser4_stat_data_base *sd_base;
53221 +
53222 + assert("nikita-634", inode != NULL);
53223 + assert("nikita-635", area != NULL);
53224 +
53225 + result = 0;
53226 + emask = reiser4_inode_data(inode)->extmask;
53227 + sd_base = (reiser4_stat_data_base *) * area;
53228 + put_unaligned(cpu_to_le16((__u16)(emask & 0xffff)), &sd_base->extmask);
53229 + /*cputod16((unsigned)(emask & 0xffff), &sd_base->extmask);*/
53230 +
53231 + *area += sizeof *sd_base;
53232 + len = 0xffffffffu;
53233 + for (bit = 0; emask != 0; ++bit, emask >>= 1) {
53234 + if (emask & 1) {
53235 + if ((bit + 1) % 16 != 0) {
53236 + sd_ext_plugin *sdplug;
53237 + sdplug = sd_ext_plugin_by_id(bit);
53238 + assert("nikita-636", sdplug != NULL);
53239 + /* no alignment support yet
53240 + align( inode, &len, area,
53241 + sdplug -> alignment ); */
53242 + result = sdplug->save(inode, area);
53243 + if (result)
53244 + break;
53245 + } else {
53246 + put_unaligned(cpu_to_le16((__u16)(emask & 0xffff)),
53247 + (d16 *)(*area));
53248 + /*cputod16((unsigned)(emask & 0xffff),
53249 + (d16 *) * area);*/
53250 + *area += sizeof(d16);
53251 + }
53252 + }
53253 + }
53254 + return result;
53255 +}
53256 +
53257 +/* stat-data extension handling functions. */
53258 +
53259 +static int present_lw_sd(struct inode *inode /* object being processed */ ,
53260 + char **area /* position in stat-data */ ,
53261 + int *len /* remaining length */ )
53262 +{
53263 + if (*len >= (int)sizeof(reiser4_light_weight_stat)) {
53264 + reiser4_light_weight_stat *sd_lw;
53265 +
53266 + sd_lw = (reiser4_light_weight_stat *) * area;
53267 +
53268 + inode->i_mode = le16_to_cpu(get_unaligned(&sd_lw->mode));
53269 + inode->i_nlink = le32_to_cpu(get_unaligned(&sd_lw->nlink));
53270 + inode->i_size = le64_to_cpu(get_unaligned(&sd_lw->size));
53271 + if ((inode->i_mode & S_IFMT) == (S_IFREG | S_IFIFO)) {
53272 + inode->i_mode &= ~S_IFIFO;
53273 + warning("", "partially converted file encountered");
53274 + inode_set_flag(inode, REISER4_PART_MIXED);
53275 + }
53276 + move_on(len, area, sizeof *sd_lw);
53277 + return 0;
53278 + } else
53279 + return not_enough_space(inode, "lw sd");
53280 +}
53281 +
53282 +static int save_len_lw_sd(struct inode *inode UNUSED_ARG /* object being
53283 + * processed */ )
53284 +{
53285 + return sizeof(reiser4_light_weight_stat);
53286 +}
53287 +
53288 +static int save_lw_sd(struct inode *inode /* object being processed */ ,
53289 + char **area /* position in stat-data */ )
53290 +{
53291 + reiser4_light_weight_stat *sd;
53292 + mode_t delta;
53293 +
53294 + assert("nikita-2705", inode != NULL);
53295 + assert("nikita-2706", area != NULL);
53296 + assert("nikita-2707", *area != NULL);
53297 +
53298 + sd = (reiser4_light_weight_stat *) * area;
53299 +
53300 + delta = (inode_get_flag(inode, REISER4_PART_MIXED) ? S_IFIFO : 0);
53301 + put_unaligned(cpu_to_le16(inode->i_mode | delta), &sd->mode);
53302 + put_unaligned(cpu_to_le32(inode->i_nlink), &sd->nlink);
53303 + put_unaligned(cpu_to_le64((__u64) inode->i_size), &sd->size);
53304 + *area += sizeof *sd;
53305 + return 0;
53306 +}
53307 +
53308 +static int present_unix_sd(struct inode *inode /* object being processed */ ,
53309 + char **area /* position in stat-data */ ,
53310 + int *len /* remaining length */ )
53311 +{
53312 + assert("nikita-637", inode != NULL);
53313 + assert("nikita-638", area != NULL);
53314 + assert("nikita-639", *area != NULL);
53315 + assert("nikita-640", len != NULL);
53316 + assert("nikita-641", *len > 0);
53317 +
53318 + if (*len >= (int)sizeof(reiser4_unix_stat)) {
53319 + reiser4_unix_stat *sd;
53320 +
53321 + sd = (reiser4_unix_stat *) * area;
53322 +
53323 + inode->i_uid = le32_to_cpu(get_unaligned(&sd->uid));
53324 + inode->i_gid = le32_to_cpu(get_unaligned(&sd->gid));
53325 + inode->i_atime.tv_sec = le32_to_cpu(get_unaligned(&sd->atime));
53326 + inode->i_mtime.tv_sec = le32_to_cpu(get_unaligned(&sd->mtime));
53327 + inode->i_ctime.tv_sec = le32_to_cpu(get_unaligned(&sd->ctime));
53328 + if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode))
53329 + inode->i_rdev = le64_to_cpu(get_unaligned(&sd->u.rdev));
53330 + else
53331 + inode_set_bytes(inode, (loff_t) le64_to_cpu(get_unaligned(&sd->u.bytes)));
53332 + move_on(len, area, sizeof *sd);
53333 + return 0;
53334 + } else
53335 + return not_enough_space(inode, "unix sd");
53336 +}
53337 +
53338 +static int absent_unix_sd(struct inode *inode /* object being processed */ )
53339 +{
53340 + inode->i_uid = get_super_private(inode->i_sb)->default_uid;
53341 + inode->i_gid = get_super_private(inode->i_sb)->default_gid;
53342 + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
53343 + inode_set_bytes(inode, inode->i_size);
53344 + /* mark inode as lightweight, so that caller (reiser4_lookup) will
53345 + complete initialisation by copying [ug]id from a parent. */
53346 + inode_set_flag(inode, REISER4_LIGHT_WEIGHT);
53347 + return 0;
53348 +}
53349 +
53350 +/* Audited by: green(2002.06.14) */
53351 +static int save_len_unix_sd(struct inode *inode UNUSED_ARG /* object being
53352 + * processed */ )
53353 +{
53354 + return sizeof(reiser4_unix_stat);
53355 +}
53356 +
53357 +static int save_unix_sd(struct inode *inode /* object being processed */ ,
53358 + char **area /* position in stat-data */ )
53359 +{
53360 + reiser4_unix_stat *sd;
53361 +
53362 + assert("nikita-642", inode != NULL);
53363 + assert("nikita-643", area != NULL);
53364 + assert("nikita-644", *area != NULL);
53365 +
53366 + sd = (reiser4_unix_stat *) * area;
53367 + put_unaligned(cpu_to_le32(inode->i_uid), &sd->uid);
53368 + put_unaligned(cpu_to_le32(inode->i_gid), &sd->gid);
53369 + put_unaligned(cpu_to_le32((__u32) inode->i_atime.tv_sec), &sd->atime);
53370 + put_unaligned(cpu_to_le32((__u32) inode->i_ctime.tv_sec), &sd->ctime);
53371 + put_unaligned(cpu_to_le32((__u32) inode->i_mtime.tv_sec), &sd->mtime);
53372 + if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode))
53373 + put_unaligned(cpu_to_le64(inode->i_rdev), &sd->u.rdev);
53374 + else
53375 + put_unaligned(cpu_to_le64((__u64) inode_get_bytes(inode)), &sd->u.bytes);
53376 + *area += sizeof *sd;
53377 + return 0;
53378 +}
53379 +
53380 +static int
53381 +present_large_times_sd(struct inode *inode /* object being processed */ ,
53382 + char **area /* position in stat-data */ ,
53383 + int *len /* remaining length */ )
53384 +{
53385 + if (*len >= (int)sizeof(reiser4_large_times_stat)) {
53386 + reiser4_large_times_stat *sd_lt;
53387 +
53388 + sd_lt = (reiser4_large_times_stat *) * area;
53389 +
53390 + inode->i_atime.tv_nsec = le32_to_cpu(get_unaligned(&sd_lt->atime));
53391 + inode->i_mtime.tv_nsec = le32_to_cpu(get_unaligned(&sd_lt->mtime));
53392 + inode->i_ctime.tv_nsec = le32_to_cpu(get_unaligned(&sd_lt->ctime));
53393 +
53394 + move_on(len, area, sizeof *sd_lt);
53395 + return 0;
53396 + } else
53397 + return not_enough_space(inode, "large times sd");
53398 +}
53399 +
53400 +static int
53401 +save_len_large_times_sd(struct inode *inode UNUSED_ARG
53402 + /* object being processed */ )
53403 +{
53404 + return sizeof(reiser4_large_times_stat);
53405 +}
53406 +
53407 +static int
53408 +save_large_times_sd(struct inode *inode /* object being processed */ ,
53409 + char **area /* position in stat-data */ )
53410 +{
53411 + reiser4_large_times_stat *sd;
53412 +
53413 + assert("nikita-2817", inode != NULL);
53414 + assert("nikita-2818", area != NULL);
53415 + assert("nikita-2819", *area != NULL);
53416 +
53417 + sd = (reiser4_large_times_stat *) * area;
53418 +
53419 + put_unaligned(cpu_to_le32((__u32) inode->i_atime.tv_nsec), &sd->atime);
53420 + put_unaligned(cpu_to_le32((__u32) inode->i_ctime.tv_nsec), &sd->ctime);
53421 + put_unaligned(cpu_to_le32((__u32) inode->i_mtime.tv_nsec), &sd->mtime);
53422 +
53423 + *area += sizeof *sd;
53424 + return 0;
53425 +}
53426 +
53427 +/* symlink stat data extension */
53428 +
53429 +/* allocate memory for symlink target and attach it to inode->u.generic_ip */
53430 +static int
53431 +symlink_target_to_inode(struct inode *inode, const char *target, int len)
53432 +{
53433 + assert("vs-845", inode->u.generic_ip == NULL);
53434 + assert("vs-846", !inode_get_flag(inode, REISER4_GENERIC_PTR_USED));
53435 +
53436 + /* FIXME-VS: this is prone to deadlock. Not more than other similar
53437 + places, though */
53438 + inode->u.generic_ip = kmalloc((size_t) len + 1, get_gfp_mask());
53439 + if (!inode->u.generic_ip)
53440 + return RETERR(-ENOMEM);
53441 +
53442 + memcpy((char *)(inode->u.generic_ip), target, (size_t) len);
53443 + ((char *)(inode->u.generic_ip))[len] = 0;
53444 + inode_set_flag(inode, REISER4_GENERIC_PTR_USED);
53445 + return 0;
53446 +}
53447 +
53448 +/* this is called on read_inode. There is nothing to do here, actually,
53449 + apart from some sanity checks */
53450 +static int present_symlink_sd(struct inode *inode, char **area, int *len)
53451 +{
53452 + int result;
53453 + int length;
53454 + reiser4_symlink_stat *sd;
53455 +
53456 + length = (int)inode->i_size;
53457 + /*
53458 + * *len is the number of bytes in the stat-data item from *area to the
53459 + * end of the item. It must be at least the symlink size + 1 for the terminating 0
53460 + */
53461 + if (length > *len)
53462 + return not_enough_space(inode, "symlink");
53463 +
53464 + if (*(*area + length) != 0) {
53465 + warning("vs-840", "Symlink is not zero terminated");
53466 + return RETERR(-EIO);
53467 + }
53468 +
53469 + sd = (reiser4_symlink_stat *) * area;
53470 + result = symlink_target_to_inode(inode, sd->body, length);
53471 +
53472 + move_on(len, area, length + 1);
53473 + return result;
53474 +}
53475 +
53476 +static int save_len_symlink_sd(struct inode *inode)
53477 +{
53478 + return inode->i_size + 1;
53479 +}
53480 +
53481 +/* this is called on create and on stat-data update. On update there is
53482 + nothing to do except advance @area */
53483 +static int save_symlink_sd(struct inode *inode, char **area)
53484 +{
53485 + int result;
53486 + int length;
53487 + reiser4_symlink_stat *sd;
53488 +
53489 + length = (int)inode->i_size;
53490 + /* inode->i_size must be set already */
53491 + assert("vs-841", length);
53492 +
53493 + result = 0;
53494 + sd = (reiser4_symlink_stat *) * area;
53495 + if (!inode_get_flag(inode, REISER4_GENERIC_PTR_USED)) {
53496 + const char *target;
53497 +
53498 + target = (const char *)(inode->u.generic_ip);
53499 + inode->u.generic_ip = NULL;
53500 +
53501 + result = symlink_target_to_inode(inode, target, length);
53502 +
53503 + /* copy symlink to stat data */
53504 + memcpy(sd->body, target, (size_t) length);
53505 + (*area)[length] = 0;
53506 + } else {
53507 + /* there is nothing to do in update but move area */
53508 + assert("vs-844",
53509 + !memcmp(inode->u.generic_ip, sd->body,
53510 + (size_t) length + 1));
53511 + }
53512 +
53513 + *area += (length + 1);
53514 + return result;
53515 +}
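+
+/*
+ * Example of the symlink extension layout handled above: for a symlink
+ * with target "a/b" (i_size == 3) the extension body is the four bytes
+ * 'a', '/', 'b', '\0', and save_len_symlink_sd() accordingly returns
+ * i_size + 1 == 4.
+ */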
53516 +
53517 +static int present_flags_sd(struct inode *inode /* object being processed */ ,
53518 + char **area /* position in stat-data */ ,
53519 + int *len /* remaining length */ )
53520 +{
53521 + assert("nikita-645", inode != NULL);
53522 + assert("nikita-646", area != NULL);
53523 + assert("nikita-647", *area != NULL);
53524 + assert("nikita-648", len != NULL);
53525 + assert("nikita-649", *len > 0);
53526 +
53527 + if (*len >= (int)sizeof(reiser4_flags_stat)) {
53528 + reiser4_flags_stat *sd;
53529 +
53530 + sd = (reiser4_flags_stat *) * area;
53531 + inode->i_flags = le32_to_cpu(get_unaligned(&sd->flags));
53532 + move_on(len, area, sizeof *sd);
53533 + return 0;
53534 + } else
53535 + return not_enough_space(inode, "generation and attrs");
53536 +}
53537 +
53538 +/* Audited by: green(2002.06.14) */
53539 +static int save_len_flags_sd(struct inode *inode UNUSED_ARG /* object being
53540 + * processed */ )
53541 +{
53542 + return sizeof(reiser4_flags_stat);
53543 +}
53544 +
53545 +static int save_flags_sd(struct inode *inode /* object being processed */ ,
53546 + char **area /* position in stat-data */ )
53547 +{
53548 + reiser4_flags_stat *sd;
53549 +
53550 + assert("nikita-650", inode != NULL);
53551 + assert("nikita-651", area != NULL);
53552 + assert("nikita-652", *area != NULL);
53553 +
53554 + sd = (reiser4_flags_stat *) * area;
53555 + put_unaligned(cpu_to_le32(inode->i_flags), &sd->flags);
53556 + *area += sizeof *sd;
53557 + return 0;
53558 +}
53559 +
53560 +static int absent_plugin_sd(struct inode *inode);
53561 +static int present_plugin_sd(struct inode *inode /* object being processed */ ,
53562 + char **area /* position in stat-data */ ,
53563 + int *len /* remaining length */ )
53564 +{
53565 + reiser4_plugin_stat *sd;
53566 + reiser4_plugin *plugin;
53567 + int i;
53568 + __u16 mask;
53569 + int result;
53570 + int num_of_plugins;
53571 +
53572 + assert("nikita-653", inode != NULL);
53573 + assert("nikita-654", area != NULL);
53574 + assert("nikita-655", *area != NULL);
53575 + assert("nikita-656", len != NULL);
53576 + assert("nikita-657", *len > 0);
53577 +
53578 + if (*len < (int)sizeof(reiser4_plugin_stat))
53579 + return not_enough_space(inode, "plugin");
53580 +
53581 + sd = (reiser4_plugin_stat *) * area;
53582 +
53583 + mask = 0;
53584 + num_of_plugins = le16_to_cpu(get_unaligned(&sd->plugins_no));
53585 + move_on(len, area, sizeof *sd);
53586 + result = 0;
53587 + for (i = 0; i < num_of_plugins; ++i) {
53588 + reiser4_plugin_slot *slot;
53589 + reiser4_plugin_type type;
53590 + pset_member memb;
53591 +
53592 + slot = (reiser4_plugin_slot *) * area;
53593 + if (*len < (int)sizeof *slot)
53594 + return not_enough_space(inode, "additional plugin");
53595 +
53596 + memb = le16_to_cpu(get_unaligned(&slot->pset_memb));
53597 + type = pset_member_to_type_unsafe(memb);
53598 + if (type == REISER4_PLUGIN_TYPES) {
53599 + warning("nikita-3502",
53600 + "wrong pset member (%i) for %llu", memb,
53601 + (unsigned long long)get_inode_oid(inode));
53602 + return RETERR(-EINVAL);
53603 + }
53604 + plugin = plugin_by_disk_id(tree_by_inode(inode),
53605 + type, &slot->id);
53606 + if (plugin == NULL)
53607 + return unknown_plugin(le16_to_cpu(get_unaligned(&slot->id)), inode);
53608 +
53609 + /* plugin is loaded into inode, mark this into inode's
53610 + bitmask of loaded non-standard plugins */
53611 + if (!(mask & (1 << memb))) {
53612 + mask |= (1 << memb);
53613 + } else {
53614 + warning("nikita-658", "duplicate plugin for %llu",
53615 + (unsigned long long)get_inode_oid(inode));
53616 + return RETERR(-EINVAL);
53617 + }
53618 + move_on(len, area, sizeof *slot);
53619 + /* load plugin data, if any */
53620 + if (plugin->h.pops != NULL && plugin->h.pops->load) {
53621 + result = plugin->h.pops->load(inode, plugin, area, len);
53622 + if (result != 0)
53623 + return result;
53624 + } else
53625 + result = grab_plugin_from(inode, memb, plugin);
53626 + }
53627 + /* if object plugin wasn't loaded from stat-data, guess it by
53628 + mode bits */
53629 + plugin = file_plugin_to_plugin(inode_file_plugin(inode));
53630 + if (plugin == NULL)
53631 + result = absent_plugin_sd(inode);
53632 +
53633 + reiser4_inode_data(inode)->plugin_mask = mask;
53634 + return result;
53635 +}
53636 +
53637 +/* Determine object plugin for @inode based on i_mode.
53638 +
53639 + Many objects in reiser4 file system are controlled by standard object
53640 + plugins that emulate traditional unix objects: unix file, directory, symlink, fifo, and so on.
53641 +
53642 + For such files we don't explicitly store plugin id in object stat
53643 + data. Rather required plugin is guessed from mode bits, where file "type"
53644 + is encoded (see stat(2)).
53645 +*/
53646 +static int
53647 +guess_plugin_by_mode(struct inode *inode /* object to guess plugins for */ )
53648 +{
53649 + int fplug_id;
53650 + int dplug_id;
53651 + reiser4_inode *info;
53652 +
53653 + assert("nikita-736", inode != NULL);
53654 +
53655 + dplug_id = fplug_id = -1;
53656 +
53657 + switch (inode->i_mode & S_IFMT) {
53658 + case S_IFSOCK:
53659 + case S_IFBLK:
53660 + case S_IFCHR:
53661 + case S_IFIFO:
53662 + fplug_id = SPECIAL_FILE_PLUGIN_ID;
53663 + break;
53664 + case S_IFLNK:
53665 + fplug_id = SYMLINK_FILE_PLUGIN_ID;
53666 + break;
53667 + case S_IFDIR:
53668 + fplug_id = DIRECTORY_FILE_PLUGIN_ID;
53669 + dplug_id = HASHED_DIR_PLUGIN_ID;
53670 + break;
53671 + default:
53672 + warning("nikita-737", "wrong file mode: %o", inode->i_mode);
53673 + return RETERR(-EIO);
53674 + case S_IFREG:
53675 + fplug_id = UNIX_FILE_PLUGIN_ID;
53676 + break;
53677 + }
53678 + info = reiser4_inode_data(inode);
53679 + plugin_set_file(&info->pset,
53680 + (fplug_id >= 0) ? file_plugin_by_id(fplug_id) : NULL);
53681 + plugin_set_dir(&info->pset,
53682 + (dplug_id >= 0) ? dir_plugin_by_id(dplug_id) : NULL);
53683 + return 0;
53684 +}
53685 +
53686 +/* Audited by: green(2002.06.14) */
53687 +static int absent_plugin_sd(struct inode *inode /* object being processed */ )
53688 +{
53689 + int result;
53690 +
53691 + assert("nikita-659", inode != NULL);
53692 +
53693 + result = guess_plugin_by_mode(inode);
53694 + /* if mode was wrong, guess_plugin_by_mode() returns "regular file",
53695 + but setup_inode_ops() will call make_bad_inode().
53696 + Another, more logical but a bit more complex, solution is to add
53697 + "bad-file plugin". */
53698 + /* FIXME-VS: activate was called here */
53699 + return result;
53700 +}
53701 +
53702 +/* helper function for save_len_plugin_sd(): calculate how much space is
53703 + required to save the state of a given plugin */
53704 +/* Audited by: green(2002.06.14) */
53705 +static int len_for(reiser4_plugin * plugin /* plugin to save */ ,
53706 + struct inode *inode /* object being processed */ ,
53707 + pset_member memb, int len)
53708 +{
53709 + reiser4_inode *info;
53710 + assert("nikita-661", inode != NULL);
53711 +
53712 + info = reiser4_inode_data(inode);
53713 + if (plugin != NULL && (info->plugin_mask & (1 << memb))) {
53714 + len += sizeof(reiser4_plugin_slot);
53715 + if (plugin->h.pops && plugin->h.pops->save_len != NULL) {
53716 + /* non-standard plugin, call method */
53717 + /* commented as it is incompatible with alignment
53718 + * policy in save_plug() -edward */
53719 + /* len = round_up(len, plugin->h.pops->alignment); */
53720 + len += plugin->h.pops->save_len(inode, plugin);
53721 + }
53722 + }
53723 + return len;
53724 +}
53725 +
53726 +/* calculate how much space is required to save state of all plugins,
53727 + associated with inode */
53728 +static int save_len_plugin_sd(struct inode *inode /* object being processed */ )
53729 +{
53730 + int len;
53731 + reiser4_inode *state;
53732 + pset_member memb;
53733 +
53734 + assert("nikita-663", inode != NULL);
53735 +
53736 + state = reiser4_inode_data(inode);
53737 + /* common case: no non-standard plugins */
53738 + if (state->plugin_mask == 0)
53739 + return 0;
53740 + len = sizeof(reiser4_plugin_stat);
53741 + for (memb = 0; memb < PSET_LAST; ++memb)
53742 + len = len_for(pset_get(state->pset, memb), inode, memb, len);
53743 + assert("nikita-664", len > (int)sizeof(reiser4_plugin_stat));
53744 + return len;
53745 +}
53746 +
53747 +/* helper function for save_plugin_sd(): save a plugin associated with
53748 + the inode. */
53749 +static int save_plug(reiser4_plugin * plugin /* plugin to save */ ,
53750 + struct inode *inode /* object being processed */ ,
53751 + pset_member memb /* what element of pset is saved */ ,
53752 + char **area /* position in stat-data */ ,
53753 + int *count /* incremented if the plugin was actually
53754 + * saved. */ )
53755 +{
53756 + reiser4_plugin_slot *slot;
53757 + int fake_len;
53758 + int result;
53759 +
53760 + assert("nikita-665", inode != NULL);
53761 + assert("nikita-666", area != NULL);
53762 + assert("nikita-667", *area != NULL);
53763 +
53764 + if (plugin == NULL)
53765 + return 0;
53766 + if (!(reiser4_inode_data(inode)->plugin_mask & (1 << memb)))
53767 + return 0;
53768 + slot = (reiser4_plugin_slot *) * area;
53769 + put_unaligned(cpu_to_le16(memb), &slot->pset_memb);
53770 + put_unaligned(cpu_to_le16(plugin->h.id), &slot->id);
53771 + fake_len = (int)0xffff;
53772 + move_on(&fake_len, area, sizeof *slot);
53773 + ++*count;
53774 + result = 0;
53775 + if (plugin->h.pops != NULL) {
53776 + if (plugin->h.pops->save != NULL)
53777 + result = plugin->h.pops->save(inode, plugin, area);
53778 + }
53779 + return result;
53780 +}
53781 +
53782 +/* save state of all non-standard plugins associated with inode */
53783 +static int save_plugin_sd(struct inode *inode /* object being processed */ ,
53784 + char **area /* position in stat-data */ )
53785 +{
53786 + int result = 0;
53787 + int num_of_plugins;
53788 + reiser4_plugin_stat *sd;
53789 + reiser4_inode *state;
53790 + int fake_len;
53791 + pset_member memb;
53792 +
53793 + assert("nikita-669", inode != NULL);
53794 + assert("nikita-670", area != NULL);
53795 + assert("nikita-671", *area != NULL);
53796 +
53797 + state = reiser4_inode_data(inode);
53798 + if (state->plugin_mask == 0)
53799 + return 0;
53800 + sd = (reiser4_plugin_stat *) * area;
53801 + fake_len = (int)0xffff;
53802 + move_on(&fake_len, area, sizeof *sd);
53803 +
53804 + num_of_plugins = 0;
53805 + for (memb = 0; memb < PSET_LAST; ++memb) {
53806 + result = save_plug(pset_get(state->pset, memb),
53807 + inode, memb, area, &num_of_plugins);
53808 + if (result != 0)
53809 + break;
53810 + }
53811 +
53812 + put_unaligned(cpu_to_le16((__u16)num_of_plugins), &sd->plugins_no);
53813 + return result;
53814 +}
53815 +
53816 +/* helper function for present_crypto_sd() and save_crypto_sd().
53817 + Allocates memory for the crypto stat and keyid and attaches it to the inode */
53818 +static int extract_crypto_stat (struct inode * inode,
53819 + reiser4_crypto_stat * sd)
53820 +{
53821 + crypto_stat_t * info;
53822 + assert("edward-11", !inode_crypto_stat(inode));
53823 + assert("edward-1413",
53824 + !inode_get_flag(inode, REISER4_CRYPTO_STAT_LOADED));
53825 + /* create and attach a crypto-stat without secret key loaded */
53826 + info = alloc_crypto_stat(inode);
53827 + if (IS_ERR(info))
53828 + return PTR_ERR(info);
53829 + info->keysize = le16_to_cpu(get_unaligned(&sd->keysize));
53830 + memcpy(info->keyid, sd->keyid, inode_digest_plugin(inode)->fipsize);
53831 + attach_crypto_stat(inode, info);
53832 + inode_set_flag(inode, REISER4_CRYPTO_STAT_LOADED);
53833 + return 0;
53834 +}
53835 +
53836 +/* crypto stat-data extension */
53837 +
53838 +static int present_crypto_sd(struct inode *inode, char **area, int *len)
53839 +{
53840 + int result;
53841 + reiser4_crypto_stat *sd;
53842 + digest_plugin *dplug = inode_digest_plugin(inode);
53843 +
53844 + assert("edward-06", dplug != NULL);
53845 + assert("edward-684", dplug->fipsize);
53846 + assert("edward-07", area != NULL);
53847 + assert("edward-08", *area != NULL);
53848 + assert("edward-09", len != NULL);
53849 + assert("edward-10", *len > 0);
53850 +
53851 + if (*len < (int)sizeof(reiser4_crypto_stat)) {
53852 + return not_enough_space(inode, "crypto-sd");
53853 + }
53854 + /* *len is the number of bytes in the stat-data item from *area to the
53855 + end of the item. It must be at least the size of this extension */
53856 + assert("edward-75", sizeof(*sd) + dplug->fipsize <= *len);
53857 +
53858 + sd = (reiser4_crypto_stat *) * area;
53859 + result = extract_crypto_stat(inode, sd);
53860 + move_on(len, area, sizeof(*sd) + dplug->fipsize);
53861 +
53862 + return result;
53863 +}
53864 +
53865 +static int save_len_crypto_sd(struct inode *inode)
53866 +{
53867 + return sizeof(reiser4_crypto_stat) +
53868 + inode_digest_plugin(inode)->fipsize;
53869 +}
53870 +
53871 +static int save_crypto_sd(struct inode *inode, char **area)
53872 +{
53873 + int result = 0;
53874 + reiser4_crypto_stat *sd;
53875 + crypto_stat_t * info = inode_crypto_stat(inode);
53876 + digest_plugin *dplug = inode_digest_plugin(inode);
53877 +
53878 + assert("edward-12", dplug != NULL);
53879 + assert("edward-13", area != NULL);
53880 + assert("edward-14", *area != NULL);
53881 + assert("edward-15", info != NULL);
53882 + assert("edward-1414", info->keyid != NULL);
53883 + assert("edward-1415", info->keysize != 0);
53884 + assert("edward-76", reiser4_inode_data(inode) != NULL);
53885 +
53886 + if (!inode_get_flag(inode, REISER4_CRYPTO_STAT_LOADED)) {
53887 + /* file is just created */
53888 + sd = (reiser4_crypto_stat *) *area;
53889 + /* copy everything but private key to the disk stat-data */
53890 + put_unaligned(cpu_to_le16(info->keysize), &sd->keysize);
53891 + memcpy(sd->keyid, info->keyid, (size_t) dplug->fipsize);
53892 + inode_set_flag(inode, REISER4_CRYPTO_STAT_LOADED);
53893 + }
53894 + *area += (sizeof(*sd) + dplug->fipsize);
53895 + return result;
53896 +}
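+
+/*
+ * Example of the crypto extension layout handled above (the fipsize value
+ * is an assumption for illustration): with a digest plugin whose fipsize
+ * is 16, the extension occupies sizeof(reiser4_crypto_stat) + 16 bytes: a
+ * 16-bit key size in bits, followed immediately by the 16-byte key id.
+ */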
53897 +
53898 +static int eio(struct inode *inode, char **area, int *len)
53899 +{
53900 + return RETERR(-EIO);
53901 +}
53902 +
53903 +sd_ext_plugin sd_ext_plugins[LAST_SD_EXTENSION] = {
53904 + [LIGHT_WEIGHT_STAT] = {
53905 + .h = {
53906 + .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
53907 + .id = LIGHT_WEIGHT_STAT,
53908 + .pops = NULL,
53909 + .label = "light-weight sd",
53910 + .desc = "sd for light-weight files",
53911 + .linkage = {NULL,NULL}
53912 + },
53913 + .present = present_lw_sd,
53914 + .absent = NULL,
53915 + .save_len = save_len_lw_sd,
53916 + .save = save_lw_sd,
53917 + .alignment = 8
53918 + },
53919 + [UNIX_STAT] = {
53920 + .h = {
53921 + .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
53922 + .id = UNIX_STAT,
53923 + .pops = NULL,
53924 + .label = "unix-sd",
53925 + .desc = "unix stat-data fields",
53926 + .linkage = {NULL,NULL}
53927 + },
53928 + .present = present_unix_sd,
53929 + .absent = absent_unix_sd,
53930 + .save_len = save_len_unix_sd,
53931 + .save = save_unix_sd,
53932 + .alignment = 8
53933 + },
53934 + [LARGE_TIMES_STAT] = {
53935 + .h = {
53936 + .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
53937 + .id = LARGE_TIMES_STAT,
53938 + .pops = NULL,
53939 + .label = "64time-sd",
53940 + .desc = "nanosecond resolution for times",
53941 + .linkage = {NULL,NULL}
53942 + },
53943 + .present = present_large_times_sd,
53944 + .absent = NULL,
53945 + .save_len = save_len_large_times_sd,
53946 + .save = save_large_times_sd,
53947 + .alignment = 8
53948 + },
53949 + [SYMLINK_STAT] = {
53950 + /* stat data of symlink has this extension */
53951 + .h = {
53952 + .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
53953 + .id = SYMLINK_STAT,
53954 + .pops = NULL,
53955 + .label = "symlink-sd",
53956 + .desc =
53957 + "stat data is appended with symlink name",
53958 + .linkage = {NULL,NULL}
53959 + },
53960 + .present = present_symlink_sd,
53961 + .absent = NULL,
53962 + .save_len = save_len_symlink_sd,
53963 + .save = save_symlink_sd,
53964 + .alignment = 8
53965 + },
53966 + [PLUGIN_STAT] = {
53967 + .h = {
53968 + .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
53969 + .id = PLUGIN_STAT,
53970 + .pops = NULL,
53971 + .label = "plugin-sd",
53972 + .desc = "plugin stat-data fields",
53973 + .linkage = {NULL,NULL}
53974 + },
53975 + .present = present_plugin_sd,
53976 + .absent = absent_plugin_sd,
53977 + .save_len = save_len_plugin_sd,
53978 + .save = save_plugin_sd,
53979 + .alignment = 8
53980 + },
53981 + [FLAGS_STAT] = {
53982 + .h = {
53983 + .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
53984 + .id = FLAGS_STAT,
53985 + .pops = NULL,
53986 + .label = "flags-sd",
53987 + .desc = "inode bit flags",
53988 + .linkage = {NULL, NULL}
53989 + },
53990 + .present = present_flags_sd,
53991 + .absent = NULL,
53992 + .save_len = save_len_flags_sd,
53993 + .save = save_flags_sd,
53994 + .alignment = 8
53995 + },
53996 + [CAPABILITIES_STAT] = {
53997 + .h = {
53998 + .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
53999 + .id = CAPABILITIES_STAT,
54000 + .pops = NULL,
54001 + .label = "capabilities-sd",
54002 + .desc = "capabilities",
54003 + .linkage = {NULL, NULL}
54004 + },
54005 + .present = eio,
54006 + .absent = NULL,
54007 + .save_len = save_len_flags_sd,
54008 + .save = save_flags_sd,
54009 + .alignment = 8
54010 + },
54011 + [CRYPTO_STAT] = {
54012 + .h = {
54013 + .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
54014 + .id = CRYPTO_STAT,
54015 + .pops = NULL,
54016 + .label = "crypto-sd",
54017 + .desc = "secret key size and id",
54018 + .linkage = {NULL, NULL}
54019 + },
54020 + .present = present_crypto_sd,
54021 + .absent = NULL,
54022 + .save_len = save_len_crypto_sd,
54023 + .save = save_crypto_sd,
54024 + .alignment = 8
54025 + }
54026 +};
54027 +
54028 +/* Make Linus happy.
54029 + Local variables:
54030 + c-indentation-style: "K&R"
54031 + mode-name: "LC"
54032 + c-basic-offset: 8
54033 + tab-width: 8
54034 + fill-column: 120
54035 + End:
54036 +*/
54037 Index: linux-2.6.16/fs/reiser4/plugin/item/static_stat.h
54038 ===================================================================
54039 --- /dev/null
54040 +++ linux-2.6.16/fs/reiser4/plugin/item/static_stat.h
54041 @@ -0,0 +1,219 @@
54042 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
54043 +
54044 +/* This describes the static_stat item, used to hold all information needed by the stat() syscall.
54045 +
54046 +When every file has at least the fields needed by the
54047 +stat() syscall, it is more compact to store those fields in this
54048 +statically laid out struct.
54049 +
54050 +If this item does not exist, then all stats are dynamically resolved.
54051 +At the moment, we either resolve all stats dynamically or all of them
54052 +statically. If you think this is not fully optimal, and the rest of
54053 +reiser4 is working, then fix it...:-)
54054 +
54055 +*/
54056 +
54057 +#if !defined( __FS_REISER4_PLUGIN_ITEM_STATIC_STAT_H__ )
54058 +#define __FS_REISER4_PLUGIN_ITEM_STATIC_STAT_H__
54059 +
54060 +#include "../../forward.h"
54061 +#include "../../dformat.h"
54062 +
54063 +#include <linux/fs.h> /* for struct inode */
54064 +
54065 +/* Stat data layout: goals and implementation.
54066 +
54067 + We want to be able to have lightweight files which have complete flexibility in what semantic metadata is attached to
54068 + them, including not having semantic metadata attached to them.
54069 +
54070 + There is one problem with doing that, which is that if in fact you have exactly the same metadata for most files you
54071 + want to store, then it takes more space to store that metadata in a dynamically sized structure than in a statically
54072 + sized structure because the statically sized structure knows without recording it what the names and lengths of the
54073 + attributes are.
54074 +
54075 + This leads to a natural compromise, which is to special case those files which have simply the standard unix file
54076 + attributes, and only employ the full dynamic stat data mechanism for those files that differ from the standard unix
54077 + file in their use of file attributes.
54078 +
54079 + Yet this compromise deserves to be compromised a little.
54080 +
54081 + We accommodate the case where you have no more than the standard unix file attributes by using an "extension
54082 + bitmask": each bit in it indicates presence or absence of or particular stat data extension (see sd_ext_bits enum).
54083 +
54084 + If the first bit of the extension bitmask is 0, we have a light-weight file whose attributes are either inherited
54085 + from the parent directory (such as uid and gid) or initialised to some sane values.
54086 +
54087 + To capitalize on existing code infrastructure, extensions are
54088 + implemented as plugins of type REISER4_SD_EXT_PLUGIN_TYPE.
54089 + Each stat-data extension plugin implements four methods:
54090 +
54091 + ->present() called by sd_load() when this extension is found in stat-data
54092 + ->absent() called by sd_load() when this extension is not found in stat-data
54093 + ->save_len() called by sd_len() to calculate total length of stat-data
54094 + ->save() called by sd_save() to store extension data into stat-data
54095 +
54096 + Implementation is in fs/reiser4/plugin/item/static_stat.c
54097 +*/
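+
+/* A condensed sketch of the control flow described above (method names as
+   implemented by the sd_ext_plugins[] table in static_stat.c):
+
+	sd_load: for each bit set in the mask, plug->present(inode, &area, &len);
+	         for each clear bit up to LAST_IMPORTANT_SD_EXTENSION,
+	         plug->absent(inode), if defined
+	sd_len:  sizeof(mask) plus the sum of plug->save_len(inode) over set bits
+	sd_save: write the mask, then plug->save(inode, &area) for each set bit
+*/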
54098 +
54099 +/* stat-data extension. Please order this by presumed frequency of use */
54100 +typedef enum {
54101 + /* support for light-weight files */
54102 + LIGHT_WEIGHT_STAT,
54103 + /* data required to implement unix stat(2) call. Layout is in
54104 + reiser4_unix_stat. If this is not present, file is light-weight */
54105 + UNIX_STAT,
54106 + /* this contains an additional set of 32-bit [amc]time fields to implement
54107 + nanosecond resolution. Layout is in reiser4_large_times_stat. Usage
54108 + of this extension is governed by the 32bittimes mount option. */
54109 + LARGE_TIMES_STAT,
54110 + /* stat data has link name included */
54111 + SYMLINK_STAT,
54112 + /* if this is present, file is controlled by non-standard
54113 + plugin (that is, plugin that cannot be deduced from file
54114 + mode bits), for example, aggregation, interpolation etc. */
54115 + PLUGIN_STAT,
54116 + /* this extension contains persistent inode flags. These flags are
54117 + single bits: immutable, append-only, etc. Layout is in
54118 + reiser4_flags_stat. */
54119 + FLAGS_STAT,
54120 + /* this extension contains capabilities sets, associated with this
54121 + file. Layout is in reiser4_capabilities_stat */
54122 + CAPABILITIES_STAT,
54123 + /* this extension contains size and public id of the secret key.
54124 + Layout is in reiser4_crypto_stat */
54125 + CRYPTO_STAT,
54126 + LAST_SD_EXTENSION,
54127 + /*
54128 + * init_inode_static_sd() iterates over extension mask until all
54129 + * non-zero bits are processed. This means that neither the ->present()
54130 + * nor the ->absent() method will be called for stat-data extensions that
54131 + * go after the last present extension. But for some basic extensions we
54132 + * want either the ->absent() or the ->present() method to be called, because these
54133 + * extensions set up something in the inode even when they are not
54134 + * present. This is what LAST_IMPORTANT_SD_EXTENSION is for: for all
54135 + * extensions before and including LAST_IMPORTANT_SD_EXTENSION either
54136 + * ->present(), or ->absent() method will be called, independently of
54137 + * what other extensions are present.
54138 + */
54139 + LAST_IMPORTANT_SD_EXTENSION = PLUGIN_STAT,
54140 +} sd_ext_bits;
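+
+/* Example (illustrative): an ordinary unix file with nanosecond timestamps
+   would carry
+
+	extmask = (1 << LIGHT_WEIGHT_STAT) | (1 << UNIX_STAT) |
+		  (1 << LARGE_TIMES_STAT)
+
+   and its stat-data body would be the 16-bit mask followed by the three
+   extension payloads in enum order. */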
54141 +
54142 +/* minimal stat-data. This makes it possible to support light-weight files. */
54143 +typedef struct reiser4_stat_data_base {
54144 + /* 0 */ __le16 extmask;
54145 + /* 2 */
54146 +} PACKED reiser4_stat_data_base;
54147 +
54148 +typedef struct reiser4_light_weight_stat {
54149 + /* 0 */ __le16 mode;
54150 + /* 2 */ __le32 nlink;
54151 + /* 6 */ __le64 size;
54152 + /* size in bytes */
54153 + /* 14 */
54154 +} PACKED reiser4_light_weight_stat;
54155 +
54156 +typedef struct reiser4_unix_stat {
54157 + /* owner id */
54158 + /* 0 */ __le32 uid;
54159 + /* group id */
54160 + /* 4 */ __le32 gid;
54161 + /* access time */
54162 + /* 8 */ __le32 atime;
54163 + /* modification time */
54164 + /* 12 */ __le32 mtime;
54165 + /* change time */
54166 + /* 16 */ __le32 ctime;
54167 + union {
54168 + /* minor:major for device files */
54169 + /* 20 */ __le64 rdev;
54170 + /* bytes used by file */
54171 + /* 20 */ __le64 bytes;
54172 + } u;
54173 + /* 28 */
54174 +} PACKED reiser4_unix_stat;
54175 +
54176 +/* symlink stored as part of inode */
54177 +typedef struct reiser4_symlink_stat {
54178 + char body[0];
54179 +} PACKED reiser4_symlink_stat;
54180 +
54181 +typedef struct reiser4_plugin_slot {
54182 + /* 0 */ __le16 pset_memb;
54183 + /* 2 */ __le16 id;
54184 + /* 4 *//* here plugin stores its persistent state */
54185 +} PACKED reiser4_plugin_slot;
54186 +
54187 +/* stat-data extension for files with non-standard plugin. */
54188 +typedef struct reiser4_plugin_stat {
54189 + /* number of additional plugins, associated with this object */
54190 + /* 0 */ __le16 plugins_no;
54191 + /* 2 */ reiser4_plugin_slot slot[0];
54192 + /* 2 */
54193 +} PACKED reiser4_plugin_stat;
54194 +
54195 +/* stat-data extension for inode flags. Currently it is just fixed-width 32
54196 + * bit mask. If need arise, this can be replaced with variable width
54197 + * bitmask. */
54198 +typedef struct reiser4_flags_stat {
54199 + /* 0 */ __le32 flags;
54200 + /* 4 */
54201 +} PACKED reiser4_flags_stat;
54202 +
54203 +typedef struct reiser4_capabilities_stat {
54204 + /* 0 */ __le32 effective;
54205 + /* 4 */ __le32 permitted;
54206 + /* 8 */
54207 +} PACKED reiser4_capabilities_stat;
54208 +
54209 +typedef struct reiser4_cluster_stat {
54210 +/* this defines the cluster size (an attribute of cryptcompress objects) as PAGE_SIZE << cluster_shift */
54211 + /* 0 */ d8 cluster_shift;
54212 + /* 1 */
54213 +} PACKED reiser4_cluster_stat;
54214 +
54215 +typedef struct reiser4_crypto_stat {
54216 + /* secret key size, bits */
54217 + /* 0 */ d16 keysize;
54218 + /* secret key id */
54219 + /* 2 */ d8 keyid[0];
54220 + /* 2 */
54221 +} PACKED reiser4_crypto_stat;
54222 +
54223 +typedef struct reiser4_large_times_stat {
54224 + /* access time */
54225 + /* 0 */ d32 atime;
54226 + /* modification time */
54227 + /* 8 */ d32 mtime;
54228 + /* change time */
54229 + /* 16 */ d32 ctime;
54230 + /* 24 */
54231 +} PACKED reiser4_large_times_stat;
54232 +
54233 +/* this structure is filled by sd_item_stat */
54234 +typedef struct sd_stat {
54235 + int dirs;
54236 + int files;
54237 + int others;
54238 +} sd_stat;
54239 +
54240 +/* plugin->item.common.* */
54241 +extern void print_sd(const char *prefix, coord_t * coord);
54242 +extern void item_stat_static_sd(const coord_t * coord, void *vp);
54243 +
54244 +/* plugin->item.s.sd.* */
54245 +extern int init_inode_static_sd(struct inode *inode, char *sd, int len);
54246 +extern int save_len_static_sd(struct inode *inode);
54247 +extern int save_static_sd(struct inode *inode, char **area);
54248 +
54249 +/* __FS_REISER4_PLUGIN_ITEM_STATIC_STAT_H__ */
54250 +#endif
54251 +
54252 +/* Make Linus happy.
54253 + Local variables:
54254 + c-indentation-style: "K&R"
54255 + mode-name: "LC"
54256 + c-basic-offset: 8
54257 + tab-width: 8
54258 + fill-column: 120
54259 + End:
54260 +*/
54261 Index: linux-2.6.16/fs/reiser4/plugin/item/tail.c
54262 ===================================================================
54263 --- /dev/null
54264 +++ linux-2.6.16/fs/reiser4/plugin/item/tail.c
54265 @@ -0,0 +1,805 @@
54266 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
54267 +
54268 +#include "item.h"
54269 +#include "../../inode.h"
54270 +#include "../../page_cache.h"
54271 +#include "../../carry.h"
54272 +#include "../../vfs_ops.h"
54273 +
54274 +#include <linux/quotaops.h>
54275 +#include <asm/uaccess.h>
54276 +#include <linux/swap.h>
54277 +#include <linux/writeback.h>
54278 +
54279 +/* plugin->u.item.b.max_key_inside */
54280 +reiser4_key *max_key_inside_tail(const coord_t *coord, reiser4_key *key)
54281 +{
54282 + item_key_by_coord(coord, key);
54283 + set_key_offset(key, get_key_offset(max_key()));
54284 + return key;
54285 +}
54286 +
54287 +/* plugin->u.item.b.can_contain_key */
54288 +int can_contain_key_tail(const coord_t *coord, const reiser4_key *key,
54289 + const reiser4_item_data *data)
54290 +{
54291 + reiser4_key item_key;
54292 +
54293 + if (item_plugin_by_coord(coord) != data->iplug)
54294 + return 0;
54295 +
54296 + item_key_by_coord(coord, &item_key);
54297 + if (get_key_locality(key) != get_key_locality(&item_key) ||
54298 + get_key_objectid(key) != get_key_objectid(&item_key))
54299 + return 0;
54300 +
54301 + return 1;
54302 +}
54303 +
54304 +/* plugin->u.item.b.mergeable
54305 + first item is of tail type */
54306 +/* Audited by: green(2002.06.14) */
54307 +int mergeable_tail(const coord_t *p1, const coord_t *p2)
54308 +{
54309 + reiser4_key key1, key2;
54310 +
54311 + assert("vs-535",
54312 + item_type_by_coord(p1) == UNIX_FILE_METADATA_ITEM_TYPE);
54313 + assert("vs-365", item_id_by_coord(p1) == FORMATTING_ID);
54314 +
54315 + if (item_id_by_coord(p2) != FORMATTING_ID) {
54316 + /* second item is of another type */
54317 + return 0;
54318 + }
54319 +
54320 + item_key_by_coord(p1, &key1);
54321 + item_key_by_coord(p2, &key2);
54322 + if (get_key_locality(&key1) != get_key_locality(&key2) ||
54323 + get_key_objectid(&key1) != get_key_objectid(&key2)
54324 + || get_key_type(&key1) != get_key_type(&key2)) {
54325 + /* items of different objects */
54326 + return 0;
54327 + }
54328 + if (get_key_offset(&key1) + nr_units_tail(p1) != get_key_offset(&key2)) {
54329 + /* not adjacent items */
54330 + return 0;
54331 + }
54332 + return 1;
54333 +}
54334 +
54335 +/* plugin->u.item.b.print
54336 + plugin->u.item.b.check */
54337 +
54338 +/* plugin->u.item.b.nr_units */
54339 +pos_in_node_t nr_units_tail(const coord_t * coord)
54340 +{
54341 + return item_length_by_coord(coord);
54342 +}
54343 +
54344 +/* plugin->u.item.b.lookup */
54345 +lookup_result
54346 +lookup_tail(const reiser4_key * key, lookup_bias bias, coord_t * coord)
54347 +{
54348 + reiser4_key item_key;
54349 + __u64 lookuped, offset;
54350 + unsigned nr_units;
54351 +
54352 + offset = get_key_offset(item_key_by_coord(coord, &item_key));
54354 + nr_units = nr_units_tail(coord);
54355 +
54356 + /* key we are looking for must be greater than key of item @coord */
54357 + assert("vs-416", keygt(key, &item_key));
54358 +
54359 + /* offset we are looking for */
54360 + lookuped = get_key_offset(key);
54361 +
54362 + if (lookuped >= offset && lookuped < offset + nr_units) {
54363 + /* byte we are looking for is in this item */
54364 + coord->unit_pos = lookuped - offset;
54365 + coord->between = AT_UNIT;
54366 + return CBK_COORD_FOUND;
54367 + }
54368 +
54369 + /* set coord after last unit */
54370 + coord->unit_pos = nr_units - 1;
54371 + coord->between = AFTER_UNIT;
54372 + return bias ==
54373 + FIND_MAX_NOT_MORE_THAN ? CBK_COORD_FOUND : CBK_COORD_NOTFOUND;
54374 +}
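+
+/*
+ * A worked example of the lookup above: for a tail item whose key offset
+ * is 100 and which holds 50 units (bytes), looking up offset 120 yields
+ * unit_pos == 20 and CBK_COORD_FOUND; looking up offset 160 leaves the
+ * coord AFTER_UNIT at position 49 and returns CBK_COORD_FOUND only under
+ * the FIND_MAX_NOT_MORE_THAN bias, CBK_COORD_NOTFOUND otherwise.
+ */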
54375 +
54376 +/* plugin->u.item.b.paste */
54377 +int
54378 +paste_tail(coord_t *coord, reiser4_item_data *data,
54379 + carry_plugin_info *info UNUSED_ARG)
54380 +{
54381 + unsigned old_item_length;
54382 + char *item;
54383 +
54384 + /* length the item had before resizing has been performed */
54385 + old_item_length = item_length_by_coord(coord) - data->length;
54386 +
54387 + /* tail items never get pasted in the middle */
54388 + assert("vs-363",
54389 + (coord->unit_pos == 0 && coord->between == BEFORE_UNIT) ||
54390 + (coord->unit_pos == old_item_length - 1 &&
54391 + coord->between == AFTER_UNIT) ||
54392 + (coord->unit_pos == 0 && old_item_length == 0
54393 + && coord->between == AT_UNIT));
54394 +
54395 + item = item_body_by_coord(coord);
54396 + if (coord->unit_pos == 0)
54397 + /* make space for pasted data when pasting at the beginning of
54398 + the item */
54399 + memmove(item + data->length, item, old_item_length);
54400 +
54401 + if (coord->between == AFTER_UNIT)
54402 + coord->unit_pos++;
54403 +
54404 + if (data->data) {
54405 + assert("vs-554", data->user == 0 || data->user == 1);
54406 + if (data->user) {
54407 + assert("nikita-3035", schedulable());
54408 + /* copy from user space */
54409 + if (__copy_from_user(item + coord->unit_pos,
54410 + (const char __user *)data->data,
54411 + (unsigned)data->length))
54412 + return RETERR(-EFAULT);
54413 + } else
54414 + /* copy from kernel space */
54415 + memcpy(item + coord->unit_pos, data->data,
54416 + (unsigned)data->length);
54417 + } else {
54418 + memset(item + coord->unit_pos, 0, (unsigned)data->length);
54419 + }
54420 + return 0;
54421 +}
54422 +
54423 +/* plugin->u.item.b.fast_paste */
54424 +
54425 +/* plugin->u.item.b.can_shift
54426 + number of units is returned via return value, number of bytes via @size. For
54427 + tail items they coincide */
54428 +int
54429 +can_shift_tail(unsigned free_space, coord_t * source UNUSED_ARG,
54430 + znode * target UNUSED_ARG, shift_direction direction UNUSED_ARG,
54431 + unsigned *size, unsigned want)
54432 +{
54433 + /* make sure that we do not want to shift more than we have */
54434 + assert("vs-364", want > 0
54435 + && want <= (unsigned)item_length_by_coord(source));
54436 +
54437 + *size = min(want, free_space);
54438 + return *size;
54439 +}
54440 +
54441 +/* plugin->u.item.b.copy_units */
54442 +void
54443 +copy_units_tail(coord_t * target, coord_t * source,
54444 + unsigned from, unsigned count,
54445 + shift_direction where_is_free_space,
54446 + unsigned free_space UNUSED_ARG)
54447 +{
54448 + /* make sure that item @target is expanded already */
54449 + assert("vs-366", (unsigned)item_length_by_coord(target) >= count);
54450 + assert("vs-370", free_space >= count);
54451 +
54452 + if (where_is_free_space == SHIFT_LEFT) {
54453 + /* append item @target with @count first bytes of @source */
54454 + assert("vs-365", from == 0);
54455 +
54456 + memcpy((char *)item_body_by_coord(target) +
54457 + item_length_by_coord(target) - count,
54458 + (char *)item_body_by_coord(source), count);
54459 + } else {
54460 + /* target item is moved to right already */
54461 + reiser4_key key;
54462 +
54463 + assert("vs-367",
54464 + (unsigned)item_length_by_coord(source) == from + count);
54465 +
54466 + memcpy((char *)item_body_by_coord(target),
54467 + (char *)item_body_by_coord(source) + from, count);
54468 +
54469 + /* new units are inserted before first unit in an item,
54470 + therefore, we have to update item key */
54471 + item_key_by_coord(source, &key);
54472 + set_key_offset(&key, get_key_offset(&key) + from);
54473 +
54474 + node_plugin_by_node(target->node)->update_item_key(target, &key,
54475 + NULL /*info */);
54476 + }
54477 +}
54478 +
54479 +/* plugin->u.item.b.create_hook */
54480 +
54481 +/* item_plugin->b.kill_hook
54482 + this is called when @count units starting from @from-th one are going to be removed
54483 + */
54484 +int
54485 +kill_hook_tail(const coord_t * coord, pos_in_node_t from,
54486 + pos_in_node_t count, struct carry_kill_data *kdata)
54487 +{
54488 + reiser4_key key;
54489 + loff_t start, end;
54490 +
54491 + assert("vs-1577", kdata);
54492 + assert("vs-1579", kdata->inode);
54493 +
54494 + item_key_by_coord(coord, &key);
54495 + start = get_key_offset(&key) + from;
54496 + end = start + count;
54497 + fake_kill_hook_tail(kdata->inode, start, end, kdata->params.truncate);
54498 + return 0;
54499 +}
54500 +
54501 +/* plugin->u.item.b.shift_hook */
54502 +
54503 +/* helper for kill_units_tail and cut_units_tail */
54504 +static int
54505 +do_cut_or_kill(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
54506 + reiser4_key * smallest_removed, reiser4_key * new_first)
54507 +{
54508 + pos_in_node_t count;
54509 +
54510 + /* this method is only called to remove part of item */
54511 + assert("vs-374", (to - from + 1) < item_length_by_coord(coord));
54512 + /* tail items are never cut from the middle of an item */
54513 + assert("vs-396", ergo(from != 0, to == coord_last_unit_pos(coord)));
54514 + assert("vs-1558", ergo(from == 0, to < coord_last_unit_pos(coord)));
54515 +
54516 + count = to - from + 1;
54517 +
54518 + if (smallest_removed) {
54519 + /* store smallest key removed */
54520 + item_key_by_coord(coord, smallest_removed);
54521 + set_key_offset(smallest_removed,
54522 + get_key_offset(smallest_removed) + from);
54523 + }
54524 + if (new_first) {
54525 + /* head of item is cut */
54526 + assert("vs-1529", from == 0);
54527 +
54528 + item_key_by_coord(coord, new_first);
54529 + set_key_offset(new_first,
54530 + get_key_offset(new_first) + from + count);
54531 + }
54532 +
54533 + if (REISER4_DEBUG)
54534 + memset((char *)item_body_by_coord(coord) + from, 0, count);
54535 + return count;
54536 +}
54537 +
54538 +/* plugin->u.item.b.cut_units */
54539 +int
54540 +cut_units_tail(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
54541 + struct carry_cut_data *cdata UNUSED_ARG,
54542 + reiser4_key * smallest_removed, reiser4_key * new_first)
54543 +{
54544 + return do_cut_or_kill(coord, from, to, smallest_removed, new_first);
54545 +}
54546 +
54547 +/* plugin->u.item.b.kill_units */
54548 +int
54549 +kill_units_tail(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
54550 + struct carry_kill_data *kdata, reiser4_key * smallest_removed,
54551 + reiser4_key * new_first)
54552 +{
54553 + kill_hook_tail(coord, from, to - from + 1, kdata);
54554 + return do_cut_or_kill(coord, from, to, smallest_removed, new_first);
54555 +}
54556 +
54557 +/* plugin->u.item.b.unit_key */
54558 +reiser4_key *unit_key_tail(const coord_t * coord, reiser4_key * key)
54559 +{
54560 + assert("vs-375", coord_is_existing_unit(coord));
54561 +
54562 + item_key_by_coord(coord, key);
54563 + set_key_offset(key, (get_key_offset(key) + coord->unit_pos));
54564 +
54565 + return key;
54566 +}
54567 +
54568 +/* plugin->u.item.b.estimate
54569 + plugin->u.item.b.item_data_by_flow */
54570 +
54571 +/* tail readpage function. It is called from readpage_tail(). */
54572 +static int do_readpage_tail(uf_coord_t *uf_coord, struct page *page)
54573 +{
54574 + tap_t tap;
54575 + int result;
54576 + coord_t coord;
54577 + lock_handle lh;
54578 + int count, mapped;
54579 + struct inode *inode;
54580 + char *pagedata;
54581 +
54582 + /* save the passed coord so that the tap does not move it. */
54583 + init_lh(&lh);
54584 + copy_lh(&lh, uf_coord->lh);
54585 + inode = page->mapping->host;
54586 + coord_dup(&coord, &uf_coord->coord);
54587 +
54588 + tap_init(&tap, &coord, &lh, ZNODE_READ_LOCK);
54589 +
54590 + if ((result = tap_load(&tap)))
54591 + goto out_tap_done;
54592 +
54593 + /* lookup until page is filled up. */
54594 + for (mapped = 0; mapped < PAGE_CACHE_SIZE; ) {
54595 + /* number of bytes to be copied to page */
54596 + count = item_length_by_coord(&coord) - coord.unit_pos;
54597 + if (count > PAGE_CACHE_SIZE - mapped)
54598 + count = PAGE_CACHE_SIZE - mapped;
54599 +
54600 + /* map @page into kernel address space and get data address */
54601 + pagedata = kmap_atomic(page, KM_USER0);
54602 +
54603 + /* copy tail item to page */
54604 + memcpy(pagedata + mapped,
54605 + ((char *)item_body_by_coord(&coord) + coord.unit_pos),
54606 + count);
54607 + mapped += count;
54608 +
54609 + flush_dcache_page(page);
54610 +
54611 +		/* unmap the page from kernel address space */
54612 + kunmap_atomic(pagedata, KM_USER0);
54613 +
54614 + /* Getting next tail item. */
54615 + if (mapped < PAGE_CACHE_SIZE) {
54616 + /*
54617 +			 * unlock the page to avoid keeping it locked
54618 +			 * during tree lookup, which takes long-term locks
54619 + */
54620 + unlock_page(page);
54621 +
54622 + /* getting right neighbour. */
54623 + result = go_dir_el(&tap, RIGHT_SIDE, 0);
54624 +
54625 + /* lock page back */
54626 + lock_page(page);
54627 + if (PageUptodate(page)) {
54628 + /*
54629 + * another thread read the page, we have
54630 + * nothing to do
54631 + */
54632 + result = 0;
54633 + goto out_unlock_page;
54634 + }
54635 +
54636 + if (result) {
54637 + if (result == -E_NO_NEIGHBOR) {
54638 + /*
54639 +					 * right neighbor is not a formatted
54640 + * node
54641 + */
54642 + result = 0;
54643 + goto done;
54644 + } else {
54645 + goto out_tap_relse;
54646 + }
54647 + } else {
54648 + if (!inode_file_plugin(inode)->
54649 + owns_item(inode, &coord)) {
54650 + /* item of another file is found */
54651 + result = 0;
54652 + goto done;
54653 + }
54654 + }
54655 + }
54656 + }
54657 +
54658 + done:
54659 + if (mapped != PAGE_CACHE_SIZE) {
54660 + pagedata = kmap_atomic(page, KM_USER0);
54661 + memset(pagedata + mapped, 0, PAGE_CACHE_SIZE - mapped);
54662 + flush_dcache_page(page);
54663 + kunmap_atomic(pagedata, KM_USER0);
54664 + }
54665 + SetPageUptodate(page);
54666 + out_unlock_page:
54667 + unlock_page(page);
54668 + out_tap_relse:
54669 + tap_relse(&tap);
54670 + out_tap_done:
54671 + tap_done(&tap);
54672 + return result;
54673 +}
54674 +
54675 +/*
54676 + plugin->s.file.readpage
54677 + reiser4_read->unix_file_read->page_cache_readahead->reiser4_readpage->unix_file_readpage->readpage_tail
54678 + or
54679 + filemap_nopage->reiser4_readpage->readpage_unix_file->readpage_tail
54680 +
54681 + At the beginning: coord->node is read locked, zloaded, page is locked, coord is set to an existing unit inside
54682 + the tail item. */
54683 +int readpage_tail(void *vp, struct page *page)
54684 +{
54685 + uf_coord_t *uf_coord = vp;
54686 + ON_DEBUG(coord_t * coord = &uf_coord->coord);
54687 + ON_DEBUG(reiser4_key key);
54688 +
54689 + assert("umka-2515", PageLocked(page));
54690 + assert("umka-2516", !PageUptodate(page));
54691 + assert("umka-2517", !jprivate(page) && !PagePrivate(page));
54692 + assert("umka-2518", page->mapping && page->mapping->host);
54693 +
54694 + assert("umka-2519", znode_is_loaded(coord->node));
54695 + assert("umka-2520", item_is_tail(coord));
54696 + assert("umka-2521", coord_is_existing_unit(coord));
54697 + assert("umka-2522", znode_is_rlocked(coord->node));
54698 + assert("umka-2523",
54699 + page->mapping->host->i_ino ==
54700 + get_key_objectid(item_key_by_coord(coord, &key)));
54701 +
54702 + return do_readpage_tail(uf_coord, page);
54703 +}
54704 +
54705 +/**
54706 + * overwrite_tail
54707 + * @flow: flow with user data to write
54708 + * @coord: coord of the tail item unit to overwrite
54709 + *
54710 + * Overwrites a tail item or a part of it with user data. Returns number of
54711 + * bytes written or an error code.
54712 + */
54713 +static int overwrite_tail(flow_t *flow, coord_t *coord)
54714 +{
54715 + unsigned count;
54716 +
54717 + assert("vs-570", flow->user == 1);
54718 + assert("vs-946", flow->data);
54719 + assert("vs-947", coord_is_existing_unit(coord));
54720 + assert("vs-948", znode_is_write_locked(coord->node));
54721 + assert("nikita-3036", schedulable());
54722 +
54723 + count = item_length_by_coord(coord) - coord->unit_pos;
54724 + if (count > flow->length)
54725 + count = flow->length;
54726 +
54727 + if (__copy_from_user((char *)item_body_by_coord(coord) + coord->unit_pos,
54728 + (const char __user *)flow->data, count))
54729 + return RETERR(-EFAULT);
54730 +
54731 + znode_make_dirty(coord->node);
54732 + return count;
54733 +}
54734 +
54735 +/**
54736 + * insert_first_tail
54737 + * @inode: inode of the file to write to
54738 + * @flow: flow with user data to write
54739 + * @coord: coord determined by find_file_item
54740 + * @lh: lock handle of the node @coord is set to
54741 + *
54742 + * Returns number of bytes written or an error code.
54743 + */
54744 +static ssize_t insert_first_tail(struct inode *inode, flow_t *flow,
54745 + coord_t *coord, lock_handle *lh)
54746 +{
54747 + int result;
54748 + loff_t to_write;
54749 + unix_file_info_t *uf_info;
54750 +
54751 + if (get_key_offset(&flow->key) != 0) {
54752 + /*
54753 +		 * the file is empty and the write does not start at the
54754 +		 * beginning of the file, so create a hole at the start of
54755 +		 * the file. On success insert_flow returns 0 as the number of
54756 +		 * written bytes, which is what we return when padding with holes
54757 + */
54758 + flow->data = NULL;
54759 + flow->length = get_key_offset(&flow->key);
54760 + set_key_offset(&flow->key, 0);
54761 + /*
54762 +		 * holes in files built of tails are stored as if they were
54763 +		 * real data consisting of all zeros. Therefore we have to
54764 + * allocate quota here as well
54765 + */
54766 + if (DQUOT_ALLOC_SPACE_NODIRTY(inode, flow->length))
54767 + return RETERR(-EDQUOT);
54768 + result = insert_flow(coord, lh, flow);
54769 + if (flow->length)
54770 + DQUOT_FREE_SPACE_NODIRTY(inode, flow->length);
54771 +
54772 + uf_info = unix_file_inode_data(inode);
54773 +
54774 + /*
54775 + * first item insertion is only possible when writing to empty
54776 + * file or performing tail conversion
54777 + */
54778 + assert("", (uf_info->container == UF_CONTAINER_EMPTY ||
54779 + (inode_get_flag(inode, REISER4_PART_MIXED) &&
54780 + inode_get_flag(inode, REISER4_PART_IN_CONV))));
54781 +
54782 + /* if file was empty - update its state */
54783 + if (result == 0 && uf_info->container == UF_CONTAINER_EMPTY)
54784 + uf_info->container = UF_CONTAINER_TAILS;
54785 + return result;
54786 + }
54787 +
54788 + /* check quota before appending data */
54789 + if (DQUOT_ALLOC_SPACE_NODIRTY(inode, flow->length))
54790 + return RETERR(-EDQUOT);
54791 +
54792 + to_write = flow->length;
54793 + result = insert_flow(coord, lh, flow);
54794 + if (flow->length)
54795 + DQUOT_FREE_SPACE_NODIRTY(inode, flow->length);
54796 + return (to_write - flow->length) ? (to_write - flow->length) : result;
54797 +}
54798 +
54799 +/**
54800 + * append_tail
54801 + * @inode: inode of the file to append to
54802 + * @flow: flow with user data to write
54803 + * @coord: coord determined by find_file_item
54804 + * @lh: lock handle of the node @coord is set to
54805 + *
54806 + * Returns number of bytes written or an error code.
54807 + */
54808 +static ssize_t append_tail(struct inode *inode,
54809 + flow_t *flow, coord_t *coord, lock_handle *lh)
54810 +{
54811 + int result;
54812 + reiser4_key append_key;
54813 + loff_t to_write;
54814 +
54815 + if (!keyeq(&flow->key, append_key_tail(coord, &append_key))) {
54816 + flow->data = NULL;
54817 + flow->length = get_key_offset(&flow->key) - get_key_offset(&append_key);
54818 + set_key_offset(&flow->key, get_key_offset(&append_key));
54819 + /*
54820 +		 * holes in files built of tails are stored as if they were
54821 +		 * real data consisting of all zeros. Therefore we have to
54822 + * allocate quota here as well
54823 + */
54824 + if (DQUOT_ALLOC_SPACE_NODIRTY(inode, flow->length))
54825 + return RETERR(-EDQUOT);
54826 + result = insert_flow(coord, lh, flow);
54827 + if (flow->length)
54828 + DQUOT_FREE_SPACE_NODIRTY(inode, flow->length);
54829 + return result;
54830 + }
54831 +
54832 + /* check quota before appending data */
54833 + if (DQUOT_ALLOC_SPACE_NODIRTY(inode, flow->length))
54834 + return RETERR(-EDQUOT);
54835 +
54836 + to_write = flow->length;
54837 + result = insert_flow(coord, lh, flow);
54838 + if (flow->length)
54839 + DQUOT_FREE_SPACE_NODIRTY(inode, flow->length);
54840 + return (to_write - flow->length) ? (to_write - flow->length) : result;
54841 +}
54842 +
54843 +/**
54844 + * write_tail_reserve_space - reserve space for tail write operation
54845 + * @inode: inode of the file being written to
54846 + *
54847 + * Estimates and reserves space which may be required for writing one flow to a
54848 + * file
54849 + */
54850 +static int write_tail_reserve_space(struct inode *inode)
54851 +{
54852 + __u64 count;
54853 + reiser4_tree *tree;
54854 +
54855 + /*
54856 +	 * to write one flow to a file built of tails we have to reserve disk space for:
54857 +	 *
54858 +	 * 1. find_file_item may have to insert an empty node into the tree (an
54859 +	 * empty leaf node between two extent items). This requires 1 block plus
54860 +	 * the number of blocks needed to insert an internal item into the twig
54861 +	 * level.
54862 + *
54863 + * 2. flow insertion
54864 + *
54865 + * 3. stat data update
54866 + */
54867 + tree = tree_by_inode(inode);
54868 + count = estimate_one_insert_item(tree) +
54869 + estimate_insert_flow(tree->height) +
54870 + estimate_one_insert_item(tree);
54871 + grab_space_enable();
54872 + return reiser4_grab_space(count, 0 /* flags */);
54873 +}
54874 +
54875 +#define PAGE_PER_FLOW 4
54876 +
54877 +static loff_t faultin_user_pages(const char __user *buf, size_t count)
54878 +{
54879 + loff_t faulted;
54880 + int to_fault;
54881 +
54882 + if (count > PAGE_PER_FLOW * PAGE_CACHE_SIZE)
54883 + count = PAGE_PER_FLOW * PAGE_CACHE_SIZE;
54884 + faulted = 0;
54885 + while (count > 0) {
54886 + to_fault = PAGE_CACHE_SIZE;
54887 + if (count < to_fault)
54888 + to_fault = count;
54889 + fault_in_pages_readable(buf + faulted, to_fault);
54890 + count -= to_fault;
54891 + faulted += to_fault;
54892 + }
54893 + return faulted;
54894 +}
54895 +
54896 +/**
54897 + * write_tail - write method of tail item plugin
54898 + * @file: file to write to
54899 + * @buf: address of user-space buffer
54900 + * @count: number of bytes to write
54901 + * @pos: position in file to write to
54902 + *
54903 + * Returns number of written bytes or error code.
54904 + */
54905 +ssize_t write_tail(struct file *file, const char __user *buf, size_t count,
54906 + loff_t *pos)
54907 +{
54908 + struct inode *inode;
54909 + struct hint hint;
54910 + int result;
54911 + flow_t flow;
54912 + coord_t *coord;
54913 + lock_handle *lh;
54914 + znode *loaded;
54915 +
54916 + inode = file->f_dentry->d_inode;
54917 +
54918 +	if (write_tail_reserve_space(inode))
54919 + return RETERR(-ENOSPC);
54920 +
54921 + result = load_file_hint(file, &hint);
54922 + BUG_ON(result != 0);
54923 +
54924 + flow.length = faultin_user_pages(buf, count);
54925 + flow.user = 1;
54926 + memcpy(&flow.data, &buf, sizeof(buf));
54927 + flow.op = WRITE_OP;
54928 + key_by_inode_and_offset_common(inode, *pos, &flow.key);
54929 +
54930 + result = find_file_item(&hint, &flow.key, ZNODE_WRITE_LOCK, inode);
54931 + if (IS_CBKERR(result))
54932 + return result;
54933 +
54934 + coord = &hint.ext_coord.coord;
54935 + lh = hint.ext_coord.lh;
54936 +
54937 + result = zload(coord->node);
54938 + BUG_ON(result != 0);
54939 + loaded = coord->node;
54940 +
54941 + if (coord->between == AFTER_UNIT) {
54942 + /* append with data or hole */
54943 + result = append_tail(inode, &flow, coord, lh);
54944 + } else if (coord->between == AT_UNIT) {
54945 + /* overwrite */
54946 + result = overwrite_tail(&flow, coord);
54947 + } else {
54948 + /* no items of this file yet. insert data or hole */
54949 + result = insert_first_tail(inode, &flow, coord, lh);
54950 + }
54951 + zrelse(loaded);
54952 + if (result < 0) {
54953 + done_lh(lh);
54954 + return result;
54955 + }
54956 +
54957 +	/* unlock znode; valid is forced to 0 so set_hint() is never taken */
54958 +	hint.ext_coord.valid = 0;
54959 + if (hint.ext_coord.valid)
54960 + set_hint(&hint, &flow.key, ZNODE_WRITE_LOCK);
54961 + else
54962 + unset_hint(&hint);
54963 +
54964 + save_file_hint(file, &hint);
54965 + return result;
54966 +}
54967 +
54968 +#if REISER4_DEBUG
54969 +
54970 +static int
54971 +coord_matches_key_tail(const coord_t * coord, const reiser4_key * key)
54972 +{
54973 + reiser4_key item_key;
54974 +
54975 + assert("vs-1356", coord_is_existing_unit(coord));
54976 + assert("vs-1354", keylt(key, append_key_tail(coord, &item_key)));
54977 + assert("vs-1355", keyge(key, item_key_by_coord(coord, &item_key)));
54978 + return get_key_offset(key) ==
54979 + get_key_offset(&item_key) + coord->unit_pos;
54980 +
54981 +}
54982 +
54983 +#endif
54984 +
54985 +/* plugin->u.item.s.file.read */
54986 +int read_tail(struct file *file UNUSED_ARG, flow_t *f, hint_t *hint)
54987 +{
54988 + unsigned count;
54989 + int item_length;
54990 + coord_t *coord;
54991 + uf_coord_t *uf_coord;
54992 +
54993 + uf_coord = &hint->ext_coord;
54994 + coord = &uf_coord->coord;
54995 +
54996 + assert("vs-571", f->user == 1);
54997 + assert("vs-571", f->data);
54998 + assert("vs-967", coord && coord->node);
54999 + assert("vs-1117", znode_is_rlocked(coord->node));
55000 + assert("vs-1118", znode_is_loaded(coord->node));
55001 +
55002 + assert("nikita-3037", schedulable());
55003 + assert("vs-1357", coord_matches_key_tail(coord, &f->key));
55004 +
55005 + /* calculate number of bytes to read off the item */
55006 + item_length = item_length_by_coord(coord);
55007 + count = item_length_by_coord(coord) - coord->unit_pos;
55008 + if (count > f->length)
55009 + count = f->length;
55010 +
55011 + /* user page has to be brought in so that major page fault does not
55012 +	 * occur here while a long-term lock is held */
55013 + if (__copy_to_user((char __user *)f->data,
55014 + ((char *)item_body_by_coord(coord) + coord->unit_pos),
55015 + count))
55016 + return RETERR(-EFAULT);
55017 +
55018 + /* probably mark_page_accessed() should only be called if
55019 + * coord->unit_pos is zero. */
55020 + mark_page_accessed(znode_page(coord->node));
55021 + move_flow_forward(f, count);
55022 +
55023 + coord->unit_pos += count;
55024 + if (item_length == coord->unit_pos) {
55025 + coord->unit_pos--;
55026 + coord->between = AFTER_UNIT;
55027 + }
55028 +
55029 + return 0;
55030 +}
55031 +
55032 +/*
55033 + plugin->u.item.s.file.append_key
55034 +  key of the first byte just past the last byte addressed by this item
55035 +*/
55036 +reiser4_key *append_key_tail(const coord_t * coord, reiser4_key * key)
55037 +{
55038 + item_key_by_coord(coord, key);
55039 + set_key_offset(key, get_key_offset(key) + item_length_by_coord(coord));
55040 + return key;
55041 +}
55042 +
55043 +/* plugin->u.item.s.file.init_coord_extension */
55044 +void init_coord_extension_tail(uf_coord_t * uf_coord, loff_t lookuped)
55045 +{
55046 + uf_coord->valid = 1;
55047 +}
55048 +
55049 +/*
55050 + plugin->u.item.s.file.get_block
55051 +*/
55052 +int
55053 +get_block_address_tail(const coord_t * coord, sector_t lblock, sector_t * block)
55054 +{
55055 + assert("nikita-3252", znode_get_level(coord->node) == LEAF_LEVEL);
55056 +
55057 + *block = *znode_get_block(coord->node);
55058 + return 0;
55059 +}
55060 +
55061 +/*
55062 + * Local variables:
55063 + * c-indentation-style: "K&R"
55064 + * mode-name: "LC"
55065 + * c-basic-offset: 8
55066 + * tab-width: 8
55067 + * fill-column: 79
55068 + * scroll-step: 1
55069 + * End:
55070 + */
55071 Index: linux-2.6.16/fs/reiser4/plugin/item/tail.h
55072 ===================================================================
55073 --- /dev/null
55074 +++ linux-2.6.16/fs/reiser4/plugin/item/tail.h
55075 @@ -0,0 +1,58 @@
55076 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
55077 +
55078 +#if !defined( __REISER4_TAIL_H__ )
55079 +#define __REISER4_TAIL_H__
55080 +
55081 +typedef struct {
55082 + int not_used;
55083 +} tail_coord_extension_t;
55084 +
55085 +struct cut_list;
55086 +
55087 +/* plugin->u.item.b.* */
55088 +reiser4_key *max_key_inside_tail(const coord_t *, reiser4_key *);
55089 +int can_contain_key_tail(const coord_t * coord, const reiser4_key * key,
55090 + const reiser4_item_data *);
55091 +int mergeable_tail(const coord_t * p1, const coord_t * p2);
55092 +pos_in_node_t nr_units_tail(const coord_t *);
55093 +lookup_result lookup_tail(const reiser4_key *, lookup_bias, coord_t *);
55094 +int paste_tail(coord_t *, reiser4_item_data *, carry_plugin_info *);
55095 +int can_shift_tail(unsigned free_space, coord_t * source,
55096 + znode * target, shift_direction, unsigned *size,
55097 + unsigned want);
55098 +void copy_units_tail(coord_t * target, coord_t * source, unsigned from,
55099 + unsigned count, shift_direction, unsigned free_space);
55100 +int kill_hook_tail(const coord_t *, pos_in_node_t from, pos_in_node_t count,
55101 + struct carry_kill_data *);
55102 +int cut_units_tail(coord_t *, pos_in_node_t from, pos_in_node_t to,
55103 + struct carry_cut_data *, reiser4_key * smallest_removed,
55104 + reiser4_key * new_first);
55105 +int kill_units_tail(coord_t *, pos_in_node_t from, pos_in_node_t to,
55106 + struct carry_kill_data *, reiser4_key * smallest_removed,
55107 + reiser4_key * new_first);
55108 +reiser4_key *unit_key_tail(const coord_t *, reiser4_key *);
55109 +
55110 +/* plugin->u.item.s.* */
55111 +ssize_t write_tail(struct file *file, const char __user *buf, size_t count,
55112 + loff_t *pos);
55113 +int read_tail(struct file *, flow_t *, hint_t *);
55114 +int readpage_tail(void *vp, struct page *page);
55115 +reiser4_key *append_key_tail(const coord_t *, reiser4_key *);
55116 +void init_coord_extension_tail(uf_coord_t *, loff_t offset);
55117 +int get_block_address_tail(const coord_t *, sector_t, sector_t *);
55118 +int item_balance_dirty_pages(struct address_space *, const flow_t *,
55119 + hint_t *, int back_to_dirty, int set_hint);
55120 +
55121 +/* __REISER4_TAIL_H__ */
55122 +#endif
55123 +
55124 +/* Make Linus happy.
55125 + Local variables:
55126 + c-indentation-style: "K&R"
55127 + mode-name: "LC"
55128 + c-basic-offset: 8
55129 + tab-width: 8
55130 + fill-column: 120
55131 + scroll-step: 1
55132 + End:
55133 +*/
55134 Index: linux-2.6.16/fs/reiser4/plugin/node/Makefile
55135 ===================================================================
55136 --- /dev/null
55137 +++ linux-2.6.16/fs/reiser4/plugin/node/Makefile
55138 @@ -0,0 +1,5 @@
55139 +obj-$(CONFIG_REISER4_FS) += node_plugins.o
55140 +
55141 +node_plugins-objs := \
55142 + node.o \
55143 + node40.o
55144 Index: linux-2.6.16/fs/reiser4/plugin/node/node.c
55145 ===================================================================
55146 --- /dev/null
55147 +++ linux-2.6.16/fs/reiser4/plugin/node/node.c
55148 @@ -0,0 +1,131 @@
55149 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
55150 +
55151 +/* Node plugin interface.
55152 +
55153 + Description: The tree provides the abstraction of flows, which it
55154 + internally fragments into items which it stores in nodes.
55155 +
55156 + A key_atom is a piece of data bound to a single key.
55157 +
55158 + For reasonable space efficiency to be achieved it is often
55159 + necessary to store key_atoms in the nodes in the form of items, where
55160 + an item is a sequence of key_atoms of the same or similar type. It is
55161 + more space-efficient, because the item can implement (very)
55162 + efficient compression of key_atom's bodies using internal knowledge
55163 + about their semantics, and it can often avoid having a key for each
55164 + key_atom. Each type of item has specific operations implemented by its
55165 + item handler (see balance.c).
55166 +
55167 + Rationale: the rest of the code (specifically balancing routines)
55168 + accesses leaf level nodes through this interface. This way we can
55169 + implement various block layouts and even combine various layouts
55170 + within the same tree. Balancing/allocating algorithms should not
55171 + care about peculiarities of splitting/merging specific item types,
55172 + but rather should leave that to the item's item handler.
55173 +
55174 + Items, including those that provide the abstraction of flows, have
55175 + the property that if you move them in part or in whole to another
55176 + node, the balancing code invokes their is_left_mergeable()
55177 + item_operation to determine if they are mergeable with their new
55178 + neighbor in the node you have moved them to. For some items the
55179 + is_left_mergeable() function always returns false.
55180 +
55181 + When moving the bodies of items from one node to another:
55182 +
55183 + if a partial item is shifted to another node the balancing code invokes
55184 + an item handler method to handle the item splitting.
55185 +
55186 + if the balancing code needs to merge with an item in the node it
55187 + is shifting to, it will invoke an item handler method to handle
55188 + the item merging.
55189 +
55190 + if it needs to move whole item bodies unchanged, the balancing code uses xmemcpy()
55191 + adjusting the item headers after the move is done using the node handler.
55192 +*/
55193 +
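+/* Illustrative aside, not part of this patch: the "access through this
+   interface" point above is plain function-pointer dispatch. A caller
+   never hard-codes a node layout; it goes through the node's plugin,
+   roughly like this hypothetical helper (znode's nplug field assumed):
+
+	static int node_item_count(const znode *node)
+	{
+		return node->nplug->num_of_items(node);
+	}
+
+   so a tree mixing node layouts still balances through one code path. */
+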
55194 +#include "../../forward.h"
55195 +#include "../../debug.h"
55196 +#include "../../key.h"
55197 +#include "../../coord.h"
55198 +#include "../plugin_header.h"
55199 +#include "../item/item.h"
55200 +#include "node.h"
55201 +#include "../plugin.h"
55202 +#include "../../znode.h"
55203 +#include "../../tree.h"
55204 +#include "../../super.h"
55205 +#include "../../reiser4.h"
55206 +
55207 +/**
55208 + * leftmost_key_in_node - get the smallest key in node
55209 + * @node:
55210 + * @key: store result here
55211 + *
55212 + * Stores the leftmost key of @node in @key.
55213 + */
55214 +reiser4_key *leftmost_key_in_node(const znode *node, reiser4_key *key)
55215 +{
55216 + assert("nikita-1634", node != NULL);
55217 + assert("nikita-1635", key != NULL);
55218 +
55219 + if (!node_is_empty(node)) {
55220 + coord_t first_item;
55221 +
55222 + coord_init_first_unit(&first_item, (znode *) node);
55223 + item_key_by_coord(&first_item, key);
55224 + } else
55225 + *key = *max_key();
55226 + return key;
55227 +}
55228 +
55229 +node_plugin node_plugins[LAST_NODE_ID] = {
55230 + [NODE40_ID] = {
55231 + .h = {
55232 + .type_id = REISER4_NODE_PLUGIN_TYPE,
55233 + .id = NODE40_ID,
55234 + .pops = NULL,
55235 + .label = "unified",
55236 + .desc = "unified node layout",
55237 + .linkage = {NULL, NULL}
55238 + },
55239 + .item_overhead = item_overhead_node40,
55240 + .free_space = free_space_node40,
55241 + .lookup = lookup_node40,
55242 + .num_of_items = num_of_items_node40,
55243 + .item_by_coord = item_by_coord_node40,
55244 + .length_by_coord = length_by_coord_node40,
55245 + .plugin_by_coord = plugin_by_coord_node40,
55246 + .key_at = key_at_node40,
55247 + .estimate = estimate_node40,
55248 + .check = check_node40,
55249 + .parse = parse_node40,
55250 + .init = init_node40,
55251 +#ifdef GUESS_EXISTS
55252 + .guess = guess_node40,
55253 +#endif
55254 + .change_item_size = change_item_size_node40,
55255 + .create_item = create_item_node40,
55256 + .update_item_key = update_item_key_node40,
55257 + .cut_and_kill = kill_node40,
55258 + .cut = cut_node40,
55259 + .shift = shift_node40,
55260 + .shrink_item = shrink_item_node40,
55261 + .fast_insert = fast_insert_node40,
55262 + .fast_paste = fast_paste_node40,
55263 + .fast_cut = fast_cut_node40,
55264 + .max_item_size = max_item_size_node40,
55265 + .prepare_removal = prepare_removal_node40,
55266 + .set_item_plugin = set_item_plugin_node40
55267 + }
55268 +};
55269 +
55270 +/*
55271 + Local variables:
55272 + c-indentation-style: "K&R"
55273 + mode-name: "LC"
55274 + c-basic-offset: 8
55275 + tab-width: 8
55276 + fill-column: 120
55277 + scroll-step: 1
55278 + End:
55279 +*/
55280 Index: linux-2.6.16/fs/reiser4/plugin/node/node.h
55281 ===================================================================
55282 --- /dev/null
55283 +++ linux-2.6.16/fs/reiser4/plugin/node/node.h
55284 @@ -0,0 +1,272 @@
55285 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
55286 +
55287 +/* We need a definition of the default node layout here. */
55288 +
55289 +/* Generally speaking, it is best to have free space in the middle of the
55290 + node so that two sets of things can grow towards it, and to have the
55291 + item bodies on the left so that the last one of them grows into free
55292 + space. We optimize for the case where we append new items to the end
55293 + of the node, or grow the last item, because it hurts nothing to so
55294 + optimize and it is a common special case to do massive insertions in
55295 + increasing key order (one of the cases where a real user is more
55296 + likely to notice the delay).
55297 +
55298 + formatted leaf default layout: (leaf1)
55299 +
55300 + |node header:item bodies:free space:key + pluginid + item offset|
55301 +
55302 + We grow towards the middle, optimizing layout for the case where we
55303 + append new items to the end of the node. The node header is fixed
55304 + length. Keys, and item offsets plus pluginids for the items
55305 + corresponding to them are in increasing key order, and are fixed
55306 + length. Item offsets are relative to start of node (16 bits creating
55307 + a node size limit of 64k, 12 bits might be a better choice....). Item
55308 + bodies are in decreasing key order. Item bodies have a variable size.
55309 + There is a one to one to one mapping of keys to item offsets to item
55310 + bodies. Item offsets consist of pointers to the zeroth byte of the
55311 + item body. Item length equals the start of the next item minus the
55312 + start of this item, except the zeroth item whose length equals the end
55313 + of the node minus the start of that item (plus a byte). In other
55314 + words, the item length is not recorded anywhere, and it does not need
55315 + to be since it is computable.
55316 +
55317 + Leaf variable length items and keys layout : (lvar)
55318 +
55319 + |node header:key offset + item offset + pluginid triplets:free space:key bodies:item bodies|
55320 +
55321 + We grow towards the middle, optimizing layout for the case where we
55322 + append new items to the end of the node. The node header is fixed
55323 + length. Keys and item offsets for the items corresponding to them are
55324 + in increasing key order, and keys are variable length. Item offsets
55325 + are relative to start of node (16 bits). Item bodies are in
55326 + decreasing key order. Item bodies have a variable size. There is a
55327 + one to one to one mapping of keys to item offsets to item bodies.
55328 + Item offsets consist of pointers to the zeroth byte of the item body.
55329 + Item length equals the start of the next item's key minus the start of
55330 + this item, except the zeroth item whose length equals the end of the
55331 + node minus the start of that item (plus a byte).
55332 +
55333 + leaf compressed keys layout: (lcomp)
55334 +
55335 + |node header:key offset + key inherit + item offset pairs:free space:key bodies:item bodies|
55336 +
55337 + We grow towards the middle, optimizing layout for the case where we
55338 + append new items to the end of the node. The node header is fixed
55339 + length. Keys and item offsets for the items corresponding to them are
55340 + in increasing key order, and keys are variable length. The "key
55341 + inherit" field indicates how much of the key prefix is identical to
55342 + the previous key (stem compression as described in "Managing
55343 + Gigabytes" is used). key_inherit is a one byte integer. The
55344 + intra-node searches performed through this layout are linear searches,
55345 + and this is theorized to not hurt performance much due to the high
55346 + cost of processor stalls on modern CPUs, and the small number of keys
55347 + in a single node. Item offsets are relative to start of node (16
55348 + bits). Item bodies are in decreasing key order. Item bodies have a
55349 + variable size. There is a one to one to one mapping of keys to item
55350 + offsets to item bodies. Item offsets consist of pointers to the
55351 + zeroth byte of the item body. Item length equals the start of the
55352 + next item minus the start of this item, except the zeroth item whose
55353 + length equals the end of the node minus the start of that item (plus a
55354 + byte). In other words, item length and key length is not recorded
55355 + anywhere, and it does not need to be since it is computable.
55356 +
55357 + internal node default layout: (idef1)
55358 +
55359 + just like the default leaf layout (leaf1) except that item bodies are either blocknrs of
55360 + children or extents, and moving them may require updating parent
55361 + pointers in the nodes that they point to.
55362 +*/
55363 +
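+/* Worked example of the "length is computable" point above; a sketch
+   with made-up numbers, not part of the format definition. It mirrors
+   what length_by_coord_node40() in node40.c computes: the last item
+   ends where free space starts, every other item ends where the next
+   one begins. */
+#if 0	/* documentation sketch, not compiled */
+static unsigned item_length(const unsigned *offset, unsigned nr_items,
+			    unsigned free_space_start, unsigned i)
+{
+	if (i == nr_items - 1)
+		return free_space_start - offset[i];
+	return offset[i + 1] - offset[i];
+}
+/* e.g. offsets {40, 50, 70}, free_space_start 75 give lengths 10, 20, 5 */
+#endif
+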
55364 +/* There is an inherent 3-way tradeoff between optimizing and
55365 + exchanging disks between different architectures and code
55366 + complexity. This is optimal and simple and inexchangeable.
55367 + Someone else can do the code for exchanging disks and make it
55368 + complex. It would not be that hard. Using other than the PAGE_SIZE
55369 + might be suboptimal.
55370 +*/
55371 +
55372 +#if !defined( __REISER4_NODE_H__ )
55373 +#define __REISER4_NODE_H__
55374 +
55375 +#define LEAF40_NODE_SIZE PAGE_CACHE_SIZE
55376 +
55377 +#include "../../dformat.h"
55378 +#include "../plugin_header.h"
55379 +
55380 +#include <linux/types.h>
55381 +
55382 +typedef enum {
55383 + NS_FOUND = 0,
55384 + NS_NOT_FOUND = -ENOENT
55385 +} node_search_result;
55386 +
55387 +/* Maximal possible space overhead for creation of new item in a node */
55388 +#define REISER4_NODE_MAX_OVERHEAD ( sizeof( reiser4_key ) + 32 )
55389 +
55390 +typedef enum {
55391 + REISER4_NODE_DKEYS = (1 << 0),
55392 + REISER4_NODE_TREE_STABLE = (1 << 1)
55393 +} reiser4_node_check_flag;
55394 +
55395 +/* cut and cut_and_kill have too long a list of parameters. This structure is just to save some space on the stack */
55396 +struct cut_list {
55397 + coord_t *from;
55398 + coord_t *to;
55399 + const reiser4_key *from_key;
55400 + const reiser4_key *to_key;
55401 + reiser4_key *smallest_removed;
55402 + carry_plugin_info *info;
55403 + __u32 flags;
55404 + struct inode *inode; /* this is to pass list of eflushed jnodes down to extent_kill_hook */
55405 + lock_handle *left;
55406 + lock_handle *right;
55407 +};
55408 +
55409 +struct carry_cut_data;
55410 +struct carry_kill_data;
55411 +
55412 +/* The responsibility of the node plugin is to store and give access
55413 + to the sequence of items within the node. */
55414 +typedef struct node_plugin {
55415 + /* generic plugin fields */
55416 + plugin_header h;
55417 +
55418 + /* calculates the amount of space that will be required to store an
55419 + item which is in addition to the space consumed by the item body.
55420 + (the space consumed by the item body can be gotten by calling
55421 + item->estimate) */
55422 + size_t(*item_overhead) (const znode * node, flow_t * f);
55423 +
55424 + /* returns free space by looking into node (i.e., without using
55425 + znode->free_space). */
55426 + size_t(*free_space) (znode * node);
55427 + /* search within the node for the one item which might
55428 + contain the key, invoking item->search_within to search within
55429 + that item to see if it is in there */
55430 + node_search_result(*lookup) (znode * node, const reiser4_key * key,
55431 + lookup_bias bias, coord_t * coord);
55432 + /* number of items in node */
55433 + int (*num_of_items) (const znode * node);
55434 +
55435 + /* store information about item in @coord in @data */
55436 + /* break into several node ops, don't add any more uses of this before doing so */
55437 + /*int ( *item_at )( const coord_t *coord, reiser4_item_data *data ); */
55438 + char *(*item_by_coord) (const coord_t * coord);
55439 + int (*length_by_coord) (const coord_t * coord);
55440 + item_plugin *(*plugin_by_coord) (const coord_t * coord);
55441 +
55442 + /* store item key in @key */
55443 + reiser4_key *(*key_at) (const coord_t * coord, reiser4_key * key);
55444 + /* conservatively estimate whether unit of what size can fit
55445 + into node. This estimation should be performed without
55446 + actually looking into the node's content (free space is saved in
55447 + znode). */
55448 + size_t(*estimate) (znode * node);
55449 +
55450 + /* performs every consistency check the node plugin author could
55451 + imagine. Optional. */
55452 + int (*check) (const znode * node, __u32 flags, const char **error);
55453 +
55454 + /* Called when node is read into memory and node plugin is
55455 + already detected. This should read some data into znode (like free
55456 + space counter) and, optionally, check data consistency.
55457 + */
55458 + int (*parse) (znode * node);
55459 + /* This method is called on a new node to initialise plugin specific
55460 + data (header, etc.) */
55461 + int (*init) (znode * node);
55462 + /* Check whether @node content conforms to this plugin format.
55463 + Probably only useful after support for old V3.x formats is added.
55464 + Uncomment after 4.0 only.
55465 + */
55466 + /* int ( *guess )( const znode *node ); */
55467 +#if REISER4_DEBUG
55468 + void (*print) (const char *prefix, const znode * node, __u32 flags);
55469 +#endif
55470 + /* change size of @item by @by bytes. @item->node has enough free
55471 + space. When @by > 0 - free space is appended to end of item. When
55472 +	   @by < 0 - item is truncated - it is assumed that the last @by bytes
55473 +	   of the item are freed already */
55474 + void (*change_item_size) (coord_t * item, int by);
55475 +
55476 + /* create new item @length bytes long in coord @target */
55477 + int (*create_item) (coord_t * target, const reiser4_key * key,
55478 + reiser4_item_data * data, carry_plugin_info * info);
55479 +
55480 + /* update key of item. */
55481 + void (*update_item_key) (coord_t * target, const reiser4_key * key,
55482 + carry_plugin_info * info);
55483 +
55484 + int (*cut_and_kill) (struct carry_kill_data *, carry_plugin_info *);
55485 + int (*cut) (struct carry_cut_data *, carry_plugin_info *);
55486 +
55487 + /*
55488 + * shrink item pointed to by @coord by @delta bytes.
55489 + */
55490 + int (*shrink_item) (coord_t * coord, int delta);
55491 +
55492 + /* copy as much as possible but not more than up to @stop from
55493 + @stop->node to @target. If (pend == append) then data from beginning of
55494 + @stop->node are copied to the end of @target. If (pend == prepend) then
55495 + data from the end of @stop->node are copied to the beginning of
55496 + @target. Copied data are removed from @stop->node. Information
55497 + about what to do on upper level is stored in @todo */
55498 + int (*shift) (coord_t * stop, znode * target, shift_direction pend,
55499 + int delete_node, int including_insert_coord,
55500 + carry_plugin_info * info);
55501 +	/* return true if this node allows skipping carry() in some
55502 +	   situations (see fs/reiser4/tree.c:insert_by_coord()). Reiser3.x
55503 +	   format emulation doesn't.
55504 +
55505 +	   This will speed up insertions that don't require updates to the
55506 +	   parent by bypassing initialisation of carry() structures. It is
55507 +	   believed that the majority of insertions will fit there.
55508 +
55509 + */
55510 + int (*fast_insert) (const coord_t * coord);
55511 + int (*fast_paste) (const coord_t * coord);
55512 + int (*fast_cut) (const coord_t * coord);
55513 + /* this limits max size of item which can be inserted into a node and
55514 + number of bytes item in a node may be appended with */
55515 + int (*max_item_size) (void);
55516 + int (*prepare_removal) (znode * empty, carry_plugin_info * info);
55517 +	/* change plugin id of items which are already in a node. Currently
55518 +	 * used in tail conversion for regular files */
55519 + int (*set_item_plugin) (coord_t * coord, item_id);
55520 +} node_plugin;
55521 +
55522 +typedef enum {
55523 + /* standard unified node layout used for both leaf and internal
55524 + nodes */
55525 + NODE40_ID,
55526 + LAST_NODE_ID
55527 +} reiser4_node_id;
55528 +
55529 +extern reiser4_key *leftmost_key_in_node(const znode * node, reiser4_key * key);
55530 +#if REISER4_DEBUG
55531 +extern void print_node_content(const char *prefix, const znode * node,
55532 + __u32 flags);
55533 +#endif
55534 +
55535 +extern void indent_znode(const znode * node);
55536 +
55537 +typedef struct common_node_header {
55538 + /*
55539 + * identifier of node plugin. Must be located at the very beginning of
55540 + * a node.
55541 + */
55542 + __le16 plugin_id;
55543 +} common_node_header;
55544 +
55545 +/* __REISER4_NODE_H__ */
55546 +#endif
55547 +/*
55548 + * Local variables:
55549 + * c-indentation-style: "K&R"
55550 + * mode-name: "LC"
55551 + * c-basic-offset: 8
55552 + * tab-width: 8
55553 + * fill-column: 79
55554 + * scroll-step: 1
55555 + * End:
55556 + */
55557 Index: linux-2.6.16/fs/reiser4/plugin/node/node40.c
55558 ===================================================================
55559 --- /dev/null
55560 +++ linux-2.6.16/fs/reiser4/plugin/node/node40.c
55561 @@ -0,0 +1,2924 @@
55562 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
55563 +
55564 +#include "../../debug.h"
55565 +#include "../../key.h"
55566 +#include "../../coord.h"
55567 +#include "../plugin_header.h"
55568 +#include "../item/item.h"
55569 +#include "node.h"
55570 +#include "node40.h"
55571 +#include "../plugin.h"
55572 +#include "../../jnode.h"
55573 +#include "../../znode.h"
55574 +#include "../../pool.h"
55575 +#include "../../carry.h"
55576 +#include "../../tap.h"
55577 +#include "../../tree.h"
55578 +#include "../../super.h"
55579 +#include "../../reiser4.h"
55580 +
55581 +#include <asm/uaccess.h>
55582 +#include <linux/types.h>
55583 +#include <linux/prefetch.h>
55584 +
55585 +/* leaf 40 format:
55586 +
55587 + [node header | item 0, item 1, .., item N-1 | free space | item_head N-1, .. item_head 1, item head 0 ]
55588 +   plugin_id (16)              key
55589 +   free_space (16)             pluginid (16)
55590 +   free_space_start (16)       offset (16)
55591 +   level (8)
55592 +   num_items (16)
55593 +   magic (32)
55594 +   flush_time (32)
55595 +*/
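+
+/* Sketch of the struct the table above implies; illustration only. The
+   authoritative definition is in node40.h (not shown in this hunk), so
+   the field order below is a guess inferred from the nh40_* accessors
+   that follow: */
+#if 0	/* documentation sketch, not compiled */
+typedef struct node40_header_sketch {
+	common_node_header common;	/* 16-bit node plugin id */
+	d16 nr_items;
+	d16 free_space;
+	d16 free_space_start;
+	d32 magic;
+	d32 mkfs_id;
+	d64 flush_id;
+	d8 level;
+} node40_header_sketch;
+#endif
+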
55596 +/* NIKITA-FIXME-HANS: I told you guys not less than 10 times to not call it r4fs. Change to "ReIs". */
55597 +/* magic number that is stored in ->magic field of node header */
55598 +static const __u32 REISER4_NODE_MAGIC = 0x52344653; /* (*(__u32 *)"R4FS"); */
55599 +
55600 +static int prepare_for_update(znode * left, znode * right,
55601 + carry_plugin_info * info);
55602 +
55603 +/* header of node of reiser40 format is at the beginning of node */
55604 +static inline node40_header *node40_node_header(const znode * node /* node to
55605 + * query */ )
55606 +{
55607 + assert("nikita-567", node != NULL);
55608 + assert("nikita-568", znode_page(node) != NULL);
55609 + assert("nikita-569", zdata(node) != NULL);
55610 + return (node40_header *) zdata(node);
55611 +}
55612 +
55613 +/* functions to get/set fields of node40_header */
55614 +#define nh40_get_magic(nh) le32_to_cpu(get_unaligned(&(nh)->magic))
55615 +#define nh40_get_free_space(nh) le16_to_cpu(get_unaligned(&(nh)->free_space))
55616 +#define nh40_get_free_space_start(nh) le16_to_cpu(get_unaligned(&(nh)->free_space_start))
55617 +#define nh40_get_level(nh) get_unaligned(&(nh)->level)
55618 +#define nh40_get_num_items(nh) le16_to_cpu(get_unaligned(&(nh)->nr_items))
55619 +#define nh40_get_flush_id(nh) le64_to_cpu(get_unaligned(&(nh)->flush_id))
55620 +
55621 +#define nh40_set_magic(nh, value) put_unaligned(cpu_to_le32(value), &(nh)->magic)
55622 +#define nh40_set_free_space(nh, value) put_unaligned(cpu_to_le16(value), &(nh)->free_space)
55623 +#define nh40_set_free_space_start(nh, value) put_unaligned(cpu_to_le16(value), &(nh)->free_space_start)
55624 +#define nh40_set_level(nh, value) put_unaligned(value, &(nh)->level)
55625 +#define nh40_set_num_items(nh, value) put_unaligned(cpu_to_le16(value), &(nh)->nr_items)
55626 +#define nh40_set_mkfs_id(nh, value) put_unaligned(cpu_to_le32(value), &(nh)->mkfs_id)
55627 +
55628 +
55629 +/* plugin field of node header should be read/set by
55630 + plugin_by_disk_id/save_disk_plugin */
55631 +
55632 +/* array of item headers is at the end of node */
55633 +static inline item_header40 *node40_ih_at(const znode * node, unsigned pos)
55634 +{
55635 + return (item_header40 *) (zdata(node) + znode_size(node)) - pos - 1;
55636 +}
55637 +
55638 +/* ( page_address( node -> pg ) + PAGE_CACHE_SIZE ) - pos - 1
55639 + */
55640 +static inline item_header40 *node40_ih_at_coord(const coord_t * coord)
55641 +{
55642 + return (item_header40 *) (zdata(coord->node) +
55643 + znode_size(coord->node)) - (coord->item_pos) -
55644 + 1;
55645 +}
55646 +
55647 +/* functions to get/set fields of item_header40 */
55648 +#define ih40_get_offset(ih) le16_to_cpu(get_unaligned(&(ih)->offset))
55649 +
55650 +#define ih40_set_offset(ih, value) put_unaligned(cpu_to_le16(value), &(ih)->offset)
55651 +
55652 +/* plugin field of item header should be read/set by
55653 + plugin_by_disk_id/save_disk_plugin */
55654 +
55655 +/* plugin methods */
55656 +
55657 +/* plugin->u.node.item_overhead
55658 + look for description of this method in plugin/node/node.h */
55659 +size_t
55660 +item_overhead_node40(const znode * node UNUSED_ARG, flow_t * f UNUSED_ARG)
55661 +{
55662 + return sizeof(item_header40);
55663 +}
55664 +
55665 +/* plugin->u.node.free_space
55666 + look for description of this method in plugin/node/node.h */
55667 +size_t free_space_node40(znode * node)
55668 +{
55669 + assert("nikita-577", node != NULL);
55670 + assert("nikita-578", znode_is_loaded(node));
55671 + assert("nikita-579", zdata(node) != NULL);
55672 +
55673 + return nh40_get_free_space(node40_node_header(node));
55674 +}
55675 +
55676 +/* private inline version of node40_num_of_items() for use in this file. This
55677 + is necessary, because address of node40_num_of_items() is taken and it is
55678 + never inlined as a result. */
55679 +static inline short node40_num_of_items_internal(const znode * node)
55680 +{
55681 + return nh40_get_num_items(node40_node_header(node));
55682 +}
55683 +
55684 +#if REISER4_DEBUG
55685 +static inline void check_num_items(const znode * node)
55686 +{
55687 + assert("nikita-2749",
55688 + node40_num_of_items_internal(node) == node->nr_items);
55689 + assert("nikita-2746", znode_is_write_locked(node));
55690 +}
55691 +#else
55692 +#define check_num_items(node) noop
55693 +#endif
55694 +
55695 +/* plugin->u.node.num_of_items
55696 + look for description of this method in plugin/node/node.h */
55697 +int num_of_items_node40(const znode * node)
55698 +{
55699 + return node40_num_of_items_internal(node);
55700 +}
55701 +
55702 +static void
55703 +node40_set_num_items(znode * node, node40_header * nh, unsigned value)
55704 +{
55705 + assert("nikita-2751", node != NULL);
55706 + assert("nikita-2750", nh == node40_node_header(node));
55707 +
55708 + check_num_items(node);
55709 + nh40_set_num_items(nh, value);
55710 + node->nr_items = value;
55711 + check_num_items(node);
55712 +}
55713 +
55714 +/* plugin->u.node.item_by_coord
55715 + look for description of this method in plugin/node/node.h */
55716 +char *item_by_coord_node40(const coord_t * coord)
55717 +{
55718 + item_header40 *ih;
55719 + char *p;
55720 +
55721 + /* @coord is set to existing item */
55722 + assert("nikita-596", coord != NULL);
55723 + assert("vs-255", coord_is_existing_item(coord));
55724 +
55725 + ih = node40_ih_at_coord(coord);
55726 + p = zdata(coord->node) + ih40_get_offset(ih);
55727 + return p;
55728 +}
55729 +
55730 +/* plugin->u.node.length_by_coord
55731 + look for description of this method in plugin/node/node.h */
55732 +int length_by_coord_node40(const coord_t * coord)
55733 +{
55734 + item_header40 *ih;
55735 + int result;
55736 +
55737 + /* @coord is set to existing item */
55738 + assert("vs-256", coord != NULL);
55739 + assert("vs-257", coord_is_existing_item(coord));
55740 +
55741 + ih = node40_ih_at_coord(coord);
55742 + if ((int)coord->item_pos ==
55743 + node40_num_of_items_internal(coord->node) - 1)
55744 + result =
55745 + nh40_get_free_space_start(node40_node_header(coord->node)) -
55746 + ih40_get_offset(ih);
55747 + else
55748 + result = ih40_get_offset(ih - 1) - ih40_get_offset(ih);
55749 +
55750 + return result;
55751 +}
55752 +
55753 +static pos_in_node_t
55754 +node40_item_length(const znode * node, pos_in_node_t item_pos)
55755 +{
55756 + item_header40 *ih;
55757 + pos_in_node_t result;
55758 +
55759 + /* @coord is set to existing item */
55760 + assert("vs-256", node != NULL);
55761 + assert("vs-257", node40_num_of_items_internal(node) > item_pos);
55762 +
55763 + ih = node40_ih_at(node, item_pos);
55764 + if (item_pos == node40_num_of_items_internal(node) - 1)
55765 + result =
55766 + nh40_get_free_space_start(node40_node_header(node)) -
55767 + ih40_get_offset(ih);
55768 + else
55769 + result = ih40_get_offset(ih - 1) - ih40_get_offset(ih);
55770 +
55771 + return result;
55772 +}
55773 +
55774 +/* plugin->u.node.plugin_by_coord
55775 + look for description of this method in plugin/node/node.h */
55776 +item_plugin *plugin_by_coord_node40(const coord_t * coord)
55777 +{
55778 + item_header40 *ih;
55779 + item_plugin *result;
55780 +
55781 + /* @coord is set to existing item */
55782 + assert("vs-258", coord != NULL);
55783 + assert("vs-259", coord_is_existing_item(coord));
55784 +
55785 + ih = node40_ih_at_coord(coord);
55786 +	/* pass NULL instead of the current tree. This is a time-critical call. */
55787 + result = item_plugin_by_disk_id(NULL, &ih->plugin_id);
55788 + return result;
55789 +}
55790 +
55791 +/* plugin->u.node.key_at
55792 + look for description of this method in plugin/node/node.h */
55793 +reiser4_key *key_at_node40(const coord_t * coord, reiser4_key * key)
55794 +{
55795 + item_header40 *ih;
55796 +
55797 + assert("nikita-1765", coord_is_existing_item(coord));
55798 +
55799 + /* @coord is set to existing item */
55800 + ih = node40_ih_at_coord(coord);
55801 + memcpy(key, &ih->key, sizeof(reiser4_key));
55802 + return key;
55803 +}
55804 +
55805 +/* VS-FIXME-HANS: please review whether the below are properly disabled when debugging is disabled */
55806 +
55807 +#define NODE_INCSTAT(n, counter) \
55808 + reiser4_stat_inc_at_level(znode_get_level(n), node.lookup.counter)
55809 +
55810 +#define NODE_ADDSTAT(n, counter, val) \
55811 + reiser4_stat_add_at_level(znode_get_level(n), node.lookup.counter, val)
55812 +
55813 +/* plugin->u.node.lookup
55814 + look for description of this method in plugin/node/node.h */
55815 +node_search_result lookup_node40(znode * node /* node to query */ ,
55816 + const reiser4_key * key /* key to look for */ ,
55817 + lookup_bias bias /* search bias */ ,
55818 + coord_t * coord /* resulting coord */ )
55819 +{
55820 + int left;
55821 + int right;
55822 + int found;
55823 + int items;
55824 +
55825 + item_header40 *lefth;
55826 + item_header40 *righth;
55827 +
55828 + item_plugin *iplug;
55829 + item_header40 *bstop;
55830 + item_header40 *ih;
55831 + cmp_t order;
55832 +
55833 + assert("nikita-583", node != NULL);
55834 + assert("nikita-584", key != NULL);
55835 + assert("nikita-585", coord != NULL);
55836 + assert("nikita-2693", znode_is_any_locked(node));
55837 + cassert(REISER4_SEQ_SEARCH_BREAK > 2);
55838 +
55839 + items = node_num_items(node);
55840 +
55841 + if (unlikely(items == 0)) {
55842 + coord_init_first_unit(coord, node);
55843 + return NS_NOT_FOUND;
55844 + }
55845 +
55846 + /* binary search for item that can contain given key */
55847 + left = 0;
55848 + right = items - 1;
55849 + coord->node = node;
55850 + coord_clear_iplug(coord);
55851 + found = 0;
55852 +
55853 + lefth = node40_ih_at(node, left);
55854 + righth = node40_ih_at(node, right);
55855 +
55856 + /* It is known that for small arrays sequential search is on average
55857 + more efficient than binary. This is because sequential search is
55858 + coded as tight loop that can be better optimized by compilers and
55859 + for small array size gain from this optimization makes sequential
55860 + search the winner. Another, maybe more important, reason for this
55861 + is that sequential access is more CPU-cache friendly, whereas binary
55862 + search effectively destroys CPU caching.
55863 +
55864 + Critical here is the notion of "smallness". Reasonable value of
55865 + REISER4_SEQ_SEARCH_BREAK can be found by playing with code in
55866 + fs/reiser4/ulevel/ulevel.c:test_search().
55867 +
55868 + Don't try to further optimize sequential search by scanning from
55869 + right to left in an attempt to use a more efficient loop termination
55870 + condition (comparison with 0). This doesn't work.
55871 +
55872 + */
55873 +
55874 + while (right - left >= REISER4_SEQ_SEARCH_BREAK) {
55875 + int median;
55876 + item_header40 *medianh;
55877 +
55878 + median = (left + right) / 2;
55879 + medianh = node40_ih_at(node, median);
55880 +
55881 + assert("nikita-1084", median >= 0);
55882 + assert("nikita-1085", median < items);
55883 + switch (keycmp(key, &medianh->key)) {
55884 + case LESS_THAN:
55885 + right = median;
55886 + righth = medianh;
55887 + break;
55888 + default:
55889 + wrong_return_value("nikita-586", "keycmp");
55890 + case GREATER_THAN:
55891 + left = median;
55892 + lefth = medianh;
55893 + break;
55894 + case EQUAL_TO:
55895 + do {
55896 + --median;
55897 + /* headers are ordered from right to left */
55898 + ++medianh;
55899 + } while (median >= 0 && keyeq(key, &medianh->key));
55900 + right = left = median + 1;
55901 + ih = lefth = righth = medianh - 1;
55902 + found = 1;
55903 + break;
55904 + }
55905 + }
55906 + /* sequential scan. Item headers, and, therefore, keys are stored at
55907 + the rightmost part of a node from right to left. We are trying to
55908 + access memory from left to right, and hence, scan in _descending_
55909 + order of item numbers.
55910 + */
55911 + if (!found) {
55912 + for (left = right, ih = righth; left >= 0; ++ih, --left) {
55913 + cmp_t comparison;
55914 +
55915 + prefetchkey(&(ih + 1)->key);
55916 + comparison = keycmp(&ih->key, key);
55917 + if (comparison == GREATER_THAN)
55918 + continue;
55919 + if (comparison == EQUAL_TO) {
55920 + found = 1;
55921 + do {
55922 + --left;
55923 + ++ih;
55924 + } while (left >= 0 && keyeq(&ih->key, key));
55925 + ++left;
55926 + --ih;
55927 + } else {
55928 + assert("nikita-1256", comparison == LESS_THAN);
55929 + }
55930 + break;
55931 + }
55932 + if (unlikely(left < 0))
55933 + left = 0;
55934 + }
55935 +
55936 + assert("nikita-3212", right >= left);
55937 + assert("nikita-3214",
55938 + equi(found, keyeq(&node40_ih_at(node, left)->key, key)));
55939 +
55940 + coord_set_item_pos(coord, left);
55941 + coord->unit_pos = 0;
55942 + coord->between = AT_UNIT;
55943 +
55944 +	/* key < leftmost key in the node, or the node is corrupted and
55945 +	   keys are not sorted */
55946 + bstop = node40_ih_at(node, (unsigned)left);
55947 + order = keycmp(&bstop->key, key);
55948 + if (unlikely(order == GREATER_THAN)) {
55949 + if (unlikely(left != 0)) {
55950 + /* screw up */
55951 + warning("nikita-587", "Key less than %i key in a node",
55952 + left);
55953 + print_key("key", key);
55954 + print_key("min", &bstop->key);
55955 + print_coord_content("coord", coord);
55956 + return RETERR(-EIO);
55957 + } else {
55958 + coord->between = BEFORE_UNIT;
55959 + return NS_NOT_FOUND;
55960 + }
55961 + }
55962 + /* left <= key, ok */
55963 + iplug = item_plugin_by_disk_id(znode_get_tree(node), &bstop->plugin_id);
55964 +
55965 + if (unlikely(iplug == NULL)) {
55966 + warning("nikita-588", "Unknown plugin %i",
55967 + le16_to_cpu(get_unaligned(&bstop->plugin_id)));
55968 + print_key("key", key);
55969 + print_coord_content("coord", coord);
55970 + return RETERR(-EIO);
55971 + }
55972 +
55973 + coord_set_iplug(coord, iplug);
55974 +
55975 + /* if exact key from item header was found by binary search, no
55976 + further checks are necessary. */
55977 + if (found) {
55978 + assert("nikita-1259", order == EQUAL_TO);
55979 + return NS_FOUND;
55980 + }
55981 + if (iplug->b.max_key_inside != NULL) {
55982 + reiser4_key max_item_key;
55983 +
55984 + /* key > max_item_key --- outside of an item */
55985 + if (keygt(key, iplug->b.max_key_inside(coord, &max_item_key))) {
55986 + coord->unit_pos = 0;
55987 + coord->between = AFTER_ITEM;
55988 + /* FIXME-VS: key we are looking for does not fit into
55989 + found item. Return NS_NOT_FOUND then. Without that
55990 + the following case does not work: there is extent of
55991 + file 10000, 10001. File 10000, 10002 has been just
55992 + created. When writing to position 0 in that file -
55993 + traverse_tree will stop here on twig level. When we
55994 + want it to go down to leaf level
55995 + */
55996 + return NS_NOT_FOUND;
55997 + }
55998 + }
55999 +
56000 + if (iplug->b.lookup != NULL) {
56001 + return iplug->b.lookup(key, bias, coord);
56002 + } else {
56003 + assert("nikita-1260", order == LESS_THAN);
56004 + coord->between = AFTER_UNIT;
56005 + return (bias == FIND_EXACT) ? NS_NOT_FOUND : NS_FOUND;
56006 + }
56007 +}
56008 +
56009 +#undef NODE_ADDSTAT
56010 +#undef NODE_INCSTAT
56011 +
56012 +/* plugin->u.node.estimate
56013 + look for description of this method in plugin/node/node.h */
56014 +size_t estimate_node40(znode * node)
56015 +{
56016 + size_t result;
56017 +
56018 + assert("nikita-597", node != NULL);
56019 +
56020 + result = free_space_node40(node) - sizeof(item_header40);
56021 +
56022 + return (result > 0) ? result : 0;
56023 +}
56024 +
56025 +/* plugin->u.node.check
56026 + look for description of this method in plugin/node/node.h */
56027 +int check_node40(const znode * node /* node to check */ ,
56028 + __u32 flags /* check flags */ ,
56029 + const char **error /* where to store error message */ )
56030 +{
56031 + int nr_items;
56032 + int i;
56033 + reiser4_key prev;
56034 + unsigned old_offset;
56035 + tree_level level;
56036 + coord_t coord;
56037 + int result;
56038 +
56039 + assert("nikita-580", node != NULL);
56040 + assert("nikita-581", error != NULL);
56041 + assert("nikita-2948", znode_is_loaded(node));
56042 +
56043 + if (ZF_ISSET(node, JNODE_HEARD_BANSHEE))
56044 + return 0;
56045 +
56046 + assert("nikita-582", zdata(node) != NULL);
56047 +
56048 + nr_items = node40_num_of_items_internal(node);
56049 + if (nr_items < 0) {
56050 + *error = "Negative number of items";
56051 + return -1;
56052 + }
56053 +
56054 + if (flags & REISER4_NODE_DKEYS)
56055 + prev = *znode_get_ld_key((znode *) node);
56056 + else
56057 + prev = *min_key();
56058 +
56059 + old_offset = 0;
56060 + coord_init_zero(&coord);
56061 + coord.node = (znode *) node;
56062 + coord.unit_pos = 0;
56063 + coord.between = AT_UNIT;
56064 + level = znode_get_level(node);
56065 + for (i = 0; i < nr_items; i++) {
56066 + item_header40 *ih;
56067 + reiser4_key unit_key;
56068 + unsigned j;
56069 +
56070 + ih = node40_ih_at(node, (unsigned)i);
56071 + coord_set_item_pos(&coord, i);
56072 + if ((ih40_get_offset(ih) >=
56073 + znode_size(node) - nr_items * sizeof(item_header40)) ||
56074 + (ih40_get_offset(ih) < sizeof(node40_header))) {
56075 + *error = "Offset is out of bounds";
56076 + return -1;
56077 + }
56078 + if (ih40_get_offset(ih) <= old_offset) {
56079 + *error = "Offsets are in wrong order";
56080 + return -1;
56081 + }
56082 + if ((i == 0) && (ih40_get_offset(ih) != sizeof(node40_header))) {
56083 + *error = "Wrong offset of first item";
56084 + return -1;
56085 + }
56086 + old_offset = ih40_get_offset(ih);
56087 +
56088 + if (keygt(&prev, &ih->key)) {
56089 + *error = "Keys are in wrong order";
56090 + return -1;
56091 + }
56092 + if (!keyeq(&ih->key, unit_key_by_coord(&coord, &unit_key))) {
56093 + *error = "Wrong key of first unit";
56094 + return -1;
56095 + }
56096 + prev = ih->key;
56097 + for (j = 0; j < coord_num_units(&coord); ++j) {
56098 + coord.unit_pos = j;
56099 + unit_key_by_coord(&coord, &unit_key);
56100 + if (keygt(&prev, &unit_key)) {
56101 + *error = "Unit keys are in wrong order";
56102 + return -1;
56103 + }
56104 + prev = unit_key;
56105 + }
56106 + coord.unit_pos = 0;
56107 + if (level != TWIG_LEVEL && item_is_extent(&coord)) {
56108 + *error = "extent on the wrong level";
56109 + return -1;
56110 + }
56111 + if (level == LEAF_LEVEL && item_is_internal(&coord)) {
56112 + *error = "internal item on the wrong level";
56113 + return -1;
56114 + }
56115 + if (level != LEAF_LEVEL &&
56116 + !item_is_internal(&coord) && !item_is_extent(&coord)) {
56117 + *error = "wrong item on the internal level";
56118 + return -1;
56119 + }
56120 + if (level > TWIG_LEVEL && !item_is_internal(&coord)) {
56121 + *error = "non-internal item on the internal level";
56122 + return -1;
56123 + }
56124 +#if REISER4_DEBUG
56125 + if (item_plugin_by_coord(&coord)->b.check
56126 + && item_plugin_by_coord(&coord)->b.check(&coord, error))
56127 + return -1;
56128 +#endif
56129 + if (i) {
56130 + coord_t prev_coord;
56131 + /* two neighboring items can not be mergeable */
56132 + coord_dup(&prev_coord, &coord);
56133 + coord_prev_item(&prev_coord);
56134 + if (are_items_mergeable(&prev_coord, &coord)) {
56135 + *error = "mergeable items in one node";
56136 + return -1;
56137 + }
56138 +
56139 + }
56140 + }
56141 +
56142 + if ((flags & REISER4_NODE_DKEYS) && !node_is_empty(node)) {
56143 + coord_t coord;
56144 + item_plugin *iplug;
56145 +
56146 + coord_init_last_unit(&coord, node);
56147 + iplug = item_plugin_by_coord(&coord);
56148 + if ((item_is_extent(&coord) || item_is_tail(&coord)) &&
56149 + iplug->s.file.append_key != NULL) {
56150 + reiser4_key mkey;
56151 +
56152 + iplug->s.file.append_key(&coord, &mkey);
56153 + set_key_offset(&mkey, get_key_offset(&mkey) - 1);
56154 + read_lock_dk(current_tree);
56155 + result = keygt(&mkey, znode_get_rd_key((znode *) node));
56156 + read_unlock_dk(current_tree);
56157 + if (result) {
56158 + *error = "key of rightmost item is too large";
56159 + return -1;
56160 + }
56161 + }
56162 + }
56163 + if (flags & REISER4_NODE_DKEYS) {
56164 + read_lock_tree(current_tree);
56165 + read_lock_dk(current_tree);
56166 +
56167 + flags |= REISER4_NODE_TREE_STABLE;
56168 +
56169 + if (keygt(&prev, znode_get_rd_key((znode *) node))) {
56170 + if (flags & REISER4_NODE_TREE_STABLE) {
56171 + *error = "Last key is greater than rdkey";
56172 + read_unlock_dk(current_tree);
56173 + read_unlock_tree(current_tree);
56174 + return -1;
56175 + }
56176 + }
56177 + if (keygt
56178 + (znode_get_ld_key((znode *) node),
56179 + znode_get_rd_key((znode *) node))) {
56180 + *error = "ldkey is greater than rdkey";
56181 + read_unlock_dk(current_tree);
56182 + read_unlock_tree(current_tree);
56183 + return -1;
56184 + }
56185 + if (ZF_ISSET(node, JNODE_LEFT_CONNECTED) &&
56186 + (node->left != NULL) &&
56187 + !ZF_ISSET(node->left, JNODE_HEARD_BANSHEE) &&
56188 + ergo(flags & REISER4_NODE_TREE_STABLE,
56189 + !keyeq(znode_get_rd_key(node->left),
56190 + znode_get_ld_key((znode *) node)))
56191 + && ergo(!(flags & REISER4_NODE_TREE_STABLE),
56192 + keygt(znode_get_rd_key(node->left),
56193 + znode_get_ld_key((znode *) node)))) {
56194 + *error = "left rdkey or ldkey is wrong";
56195 + read_unlock_dk(current_tree);
56196 + read_unlock_tree(current_tree);
56197 + return -1;
56198 + }
56199 + if (ZF_ISSET(node, JNODE_RIGHT_CONNECTED) &&
56200 + (node->right != NULL) &&
56201 + !ZF_ISSET(node->right, JNODE_HEARD_BANSHEE) &&
56202 + ergo(flags & REISER4_NODE_TREE_STABLE,
56203 + !keyeq(znode_get_rd_key((znode *) node),
56204 + znode_get_ld_key(node->right)))
56205 + && ergo(!(flags & REISER4_NODE_TREE_STABLE),
56206 + keygt(znode_get_rd_key((znode *) node),
56207 + znode_get_ld_key(node->right)))) {
56208 + *error = "rdkey or right ldkey is wrong";
56209 + read_unlock_dk(current_tree);
56210 + read_unlock_tree(current_tree);
56211 + return -1;
56212 + }
56213 +
56214 + read_unlock_dk(current_tree);
56215 + read_unlock_tree(current_tree);
56216 + }
56217 +
56218 + return 0;
56219 +}
56220 +
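+/*
+ * Editorial sketch (not part of the original patch): the node40 layout
+ * assumed by the offset checks in check_node40() above.  Item bodies
+ * grow upward from the end of the node header; item headers grow
+ * downward from the end of the node:
+ *
+ *	0                                                  znode_size
+ *	+---------------+--------+--------+ ... +------+-----+-----+
+ *	| node40_header | body 0 | body 1 |     | free | ih1 | ih0 |
+ *	+---------------+--------+--------+ ... +------+-----+-----+
+ *
+ * Hence a valid item offset must lie in the half-open range
+ * [sizeof(node40_header), znode_size - nr_items * sizeof(item_header40)),
+ * which is exactly what the "Offset is out of bounds" check verifies.
+ */
+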
56221 +/* plugin->u.node.parse
56222 + look for description of this method in plugin/node/node.h */
56223 +int parse_node40(znode * node /* node to parse */ )
56224 +{
56225 + node40_header *header;
56226 + int result;
56227 + d8 level;
56228 +
56229 + header = node40_node_header((znode *) node);
56230 + result = -EIO;
56231 + level = nh40_get_level(header);
56232 + if (unlikely(((__u8) znode_get_level(node)) != level))
56233 + warning("nikita-494", "Wrong level found in node: %i != %i",
56234 + znode_get_level(node), level);
56235 + else if (unlikely(nh40_get_magic(header) != REISER4_NODE_MAGIC))
56236 + warning("nikita-495",
56237 + "Wrong magic in tree node: want %x, got %x",
56238 + REISER4_NODE_MAGIC, nh40_get_magic(header));
56239 + else {
56240 + node->nr_items = node40_num_of_items_internal(node);
56241 + result = 0;
56242 + }
56243 + if (unlikely(result != 0))
56244 + /* print_znode("node", node) */ ;
56245 + return RETERR(result);
56246 +}
56247 +
56248 +/* plugin->u.node.init
56249 + look for description of this method in plugin/node/node.h */
56250 +int init_node40(znode * node /* node to initialise */ )
56251 +{
56252 + node40_header *header;
56253 +
56254 + assert("nikita-570", node != NULL);
56255 + assert("nikita-572", zdata(node) != NULL);
56256 +
56257 + header = node40_node_header(node);
56258 + memset(header, 0, sizeof(node40_header));
56259 + nh40_set_free_space(header, znode_size(node) - sizeof(node40_header));
56260 + nh40_set_free_space_start(header, sizeof(node40_header));
56261 + /* sane hypothesis: 0 in CPU format is 0 in disk format */
56262 + /* items: 0 */
56263 + save_plugin_id(node_plugin_to_plugin(node->nplug),
56264 + &header->common_header.plugin_id);
56265 + nh40_set_level(header, znode_get_level(node));
56266 + nh40_set_magic(header, REISER4_NODE_MAGIC);
56267 + node->nr_items = 0;
56268 + nh40_set_mkfs_id(header, reiser4_mkfs_id(reiser4_get_current_sb()));
56269 +
56270 + /* flags: 0 */
56271 + return 0;
56272 +}
56273 +
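+/*
+ * Editorial illustration (not part of the original patch), assuming a
+ * 4096-byte node: after init_node40() above the header fields read
+ *
+ *	free_space       = 4096 - sizeof(node40_header)
+ *	free_space_start = sizeof(node40_header)
+ *	num_items        = 0
+ *
+ * i.e. everything past the node header is free, and the free area
+ * starts immediately after it.
+ */
+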
56274 +#ifdef GUESS_EXISTS
56275 +int guess_node40(const znode * node /* node to guess plugin of */ )
56276 +{
56277 + node40_header *nethack;
56278 +
56279 + assert("nikita-1058", node != NULL);
56280 + nethack = node40_node_header(node);
56281 + return
56282 + (nh40_get_magic(nethack) == REISER4_NODE_MAGIC) &&
56283 + (plugin_by_disk_id(znode_get_tree(node),
56284 + REISER4_NODE_PLUGIN_TYPE,
56285 + &nethack->common_header.plugin_id)->h.id ==
56286 + NODE40_ID);
56287 +}
56288 +#endif
56289 +
56290 +/* plugin->u.node.change_item_size
56291 + look for description of this method in plugin/node/node.h */
56292 +void change_item_size_node40(coord_t * coord, int by)
56293 +{
56294 + node40_header *nh;
56295 + item_header40 *ih;
56296 + char *item_data;
56297 + int item_length;
56298 + unsigned i;
56299 +
56300 + /* make sure that @item is coord of existing item */
56301 + assert("vs-210", coord_is_existing_item(coord));
56302 +
56303 + nh = node40_node_header(coord->node);
56304 +
56305 + item_data = item_by_coord_node40(coord);
56306 + item_length = length_by_coord_node40(coord);
56307 +
56308 + /* move item bodies */
56309 + ih = node40_ih_at_coord(coord);
56310 + memmove(item_data + item_length + by, item_data + item_length,
56311 + nh40_get_free_space_start(node40_node_header(coord->node)) -
56312 + (ih40_get_offset(ih) + item_length));
56313 +
56314 + /* update offsets of moved items */
56315 + for (i = coord->item_pos + 1; i < nh40_get_num_items(nh); i++) {
56316 + ih = node40_ih_at(coord->node, i);
56317 + ih40_set_offset(ih, ih40_get_offset(ih) + by);
56318 + }
56319 +
56320 + /* update node header */
56321 + nh40_set_free_space(nh, nh40_get_free_space(nh) - by);
56322 + nh40_set_free_space_start(nh, nh40_get_free_space_start(nh) + by);
56323 +}
56324 +
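+/*
+ * Editorial illustration (not part of the original patch): growing the
+ * item at @coord by @by bytes with change_item_size_node40() above
+ * memmove()s every following item body @by bytes to the right, adds
+ * @by to their header offsets, and shrinks free_space accordingly.
+ * E.g. with by == 8, an item of length 40 at offset 100 followed by an
+ * item at offset 140: afterwards the second item sits at offset 148,
+ * free_space_start has grown by 8 and free_space has shrunk by 8.
+ */
+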
56325 +static int should_notify_parent(const znode * node)
56326 +{
56327 + /* FIXME_JMACD This looks equivalent to znode_is_root(), right? -josh */
56328 + return !disk_addr_eq(znode_get_block(node),
56329 + &znode_get_tree(node)->root_block);
56330 +}
56331 +
56332 +/* plugin->u.node.create_item
56333 + look for description of this method in plugin/node/node.h */
56334 +int
56335 +create_item_node40(coord_t *target, const reiser4_key *key,
56336 + reiser4_item_data *data, carry_plugin_info *info)
56337 +{
56338 + node40_header *nh;
56339 + item_header40 *ih;
56340 + unsigned offset;
56341 + unsigned i;
56342 +
56343 + nh = node40_node_header(target->node);
56344 +
56345 + assert("vs-212", coord_is_between_items(target));
56346 + /* node must have enough free space */
56347 + assert("vs-254",
56348 + free_space_node40(target->node) >=
56349 + data->length + sizeof(item_header40));
56350 + assert("vs-1410", data->length >= 0);
56351 +
56352 + if (coord_set_to_right(target))
56353 +		/* there are no items to the right of @target, so the new
56354 +		   item will be inserted after the last one */
56355 + coord_set_item_pos(target, nh40_get_num_items(nh));
56356 +
56357 + if (target->item_pos < nh40_get_num_items(nh)) {
56358 + /* there are items to be moved to prepare space for new
56359 + item */
56360 + ih = node40_ih_at_coord(target);
56361 + /* new item will start at this offset */
56362 + offset = ih40_get_offset(ih);
56363 +
56364 + memmove(zdata(target->node) + offset + data->length,
56365 + zdata(target->node) + offset,
56366 + nh40_get_free_space_start(nh) - offset);
56367 + /* update headers of moved items */
56368 + for (i = target->item_pos; i < nh40_get_num_items(nh); i++) {
56369 + ih = node40_ih_at(target->node, i);
56370 + ih40_set_offset(ih, ih40_get_offset(ih) + data->length);
56371 + }
56372 +
56373 + /* @ih is set to item header of the last item, move item headers */
56374 + memmove(ih - 1, ih,
56375 + sizeof(item_header40) * (nh40_get_num_items(nh) -
56376 + target->item_pos));
56377 + } else {
56378 + /* new item will start at this offset */
56379 + offset = nh40_get_free_space_start(nh);
56380 + }
56381 +
56382 + /* make item header for the new item */
56383 + ih = node40_ih_at_coord(target);
56384 + memcpy(&ih->key, key, sizeof(reiser4_key));
56385 + ih40_set_offset(ih, offset);
56386 + save_plugin_id(item_plugin_to_plugin(data->iplug), &ih->plugin_id);
56387 +
56388 + /* update node header */
56389 + nh40_set_free_space(nh,
56390 + nh40_get_free_space(nh) - data->length -
56391 + sizeof(item_header40));
56392 + nh40_set_free_space_start(nh,
56393 + nh40_get_free_space_start(nh) + data->length);
56394 + node40_set_num_items(target->node, nh, nh40_get_num_items(nh) + 1);
56395 +
56396 +	/* FIXME: check how create_item works when between is set to BEFORE_UNIT */
56397 + target->unit_pos = 0;
56398 + target->between = AT_UNIT;
56399 + coord_clear_iplug(target);
56400 +
56401 + /* initialize item */
56402 + if (data->iplug->b.init != NULL) {
56403 + data->iplug->b.init(target, NULL, data);
56404 + }
56405 + /* copy item body */
56406 + if (data->iplug->b.paste != NULL) {
56407 + data->iplug->b.paste(target, data, info);
56408 + } else if (data->data != NULL) {
56409 + if (data->user) {
56410 +			/* AUDIT: Should we really not check that the pointer
56411 +			   from userspace was valid and that the data bytes
56412 +			   were available? How would we return -EFAULT of some
56413 +			   kind without this check? */
56414 + assert("nikita-3038", schedulable());
56415 + /* copy data from user space */
56416 + __copy_from_user(zdata(target->node) + offset,
56417 + (const char __user *)data->data,
56418 + (unsigned)data->length);
56419 + } else
56420 + /* copy from kernel space */
56421 + memcpy(zdata(target->node) + offset, data->data,
56422 + (unsigned)data->length);
56423 + }
56424 +
56425 + if (target->item_pos == 0) {
56426 + /* left delimiting key has to be updated */
56427 + prepare_for_update(NULL, target->node, info);
56428 + }
56429 +
56430 + if (item_plugin_by_coord(target)->b.create_hook != NULL) {
56431 + item_plugin_by_coord(target)->b.create_hook(target, data->arg);
56432 + }
56433 +
56434 + return 0;
56435 +}
56436 +
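+/*
+ * Editorial illustration (not part of the original patch): the free
+ * space bookkeeping of create_item_node40() above.  Inserting an item
+ * of data->length == L bytes consumes L bytes of body space plus one
+ * item header, so
+ *
+ *	free_space       -= L + sizeof(item_header40)
+ *	free_space_start += L
+ *	num_items        += 1
+ *
+ * which is why the "vs-254" assertion demands
+ * free_space >= L + sizeof(item_header40) on entry.
+ */
+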
56437 +/* plugin->u.node.update_item_key
56438 + look for description of this method in plugin/node/node.h */
56439 +void
56440 +update_item_key_node40(coord_t * target, const reiser4_key * key,
56441 + carry_plugin_info * info)
56442 +{
56443 + item_header40 *ih;
56444 +
56445 + ih = node40_ih_at_coord(target);
56446 + memcpy(&ih->key, key, sizeof(reiser4_key));
56447 +
56448 + if (target->item_pos == 0) {
56449 + prepare_for_update(NULL, target->node, info);
56450 + }
56451 +}
56452 +
56453 +/* these bits encode the cut mode */
56454 +#define CMODE_TAIL 1
56455 +#define CMODE_WHOLE 2
56456 +#define CMODE_HEAD 4
56457 +
56458 +struct cut40_info {
56459 + int mode;
56460 + pos_in_node_t tail_removed; /* position of item which gets tail removed */
56461 +	pos_in_node_t first_removed;	/* position of the leftmost item among items removed completely */
56462 + pos_in_node_t removed_count; /* number of items removed completely */
56463 + pos_in_node_t head_removed; /* position of item which gets head removed */
56464 +
56465 + pos_in_node_t freed_space_start;
56466 + pos_in_node_t freed_space_end;
56467 + pos_in_node_t first_moved;
56468 + pos_in_node_t head_removed_location;
56469 +};
56470 +
56471 +static void init_cinfo(struct cut40_info *cinfo)
56472 +{
56473 + cinfo->mode = 0;
56474 + cinfo->tail_removed = MAX_POS_IN_NODE;
56475 + cinfo->first_removed = MAX_POS_IN_NODE;
56476 + cinfo->removed_count = MAX_POS_IN_NODE;
56477 + cinfo->head_removed = MAX_POS_IN_NODE;
56478 + cinfo->freed_space_start = MAX_POS_IN_NODE;
56479 + cinfo->freed_space_end = MAX_POS_IN_NODE;
56480 + cinfo->first_moved = MAX_POS_IN_NODE;
56481 + cinfo->head_removed_location = MAX_POS_IN_NODE;
56482 +}
56483 +
56484 +/* complete cut_node40/kill_node40 content by removing the gap created by the cut/kill */
56485 +static void compact(znode * node, struct cut40_info *cinfo)
56486 +{
56487 + node40_header *nh;
56488 + item_header40 *ih;
56489 + pos_in_node_t freed;
56490 + pos_in_node_t pos, nr_items;
56491 +
56492 + assert("vs-1526", (cinfo->freed_space_start != MAX_POS_IN_NODE &&
56493 + cinfo->freed_space_end != MAX_POS_IN_NODE &&
56494 + cinfo->first_moved != MAX_POS_IN_NODE));
56495 + assert("vs-1523", cinfo->freed_space_end >= cinfo->freed_space_start);
56496 +
56497 + nh = node40_node_header(node);
56498 + nr_items = nh40_get_num_items(nh);
56499 +
56500 +	/* remove the gap left by the removal */
56501 + memmove(zdata(node) + cinfo->freed_space_start,
56502 + zdata(node) + cinfo->freed_space_end,
56503 + nh40_get_free_space_start(nh) - cinfo->freed_space_end);
56504 +
56505 + /* update item headers of moved items - change their locations */
56506 + pos = cinfo->first_moved;
56507 + ih = node40_ih_at(node, pos);
56508 + if (cinfo->head_removed_location != MAX_POS_IN_NODE) {
56509 + assert("vs-1580", pos == cinfo->head_removed);
56510 + ih40_set_offset(ih, cinfo->head_removed_location);
56511 + pos++;
56512 + ih--;
56513 + }
56514 +
56515 + freed = cinfo->freed_space_end - cinfo->freed_space_start;
56516 + for (; pos < nr_items; pos++, ih--) {
56517 + assert("vs-1581", ih == node40_ih_at(node, pos));
56518 + ih40_set_offset(ih, ih40_get_offset(ih) - freed);
56519 + }
56520 +
56521 +	/* free space start moved to the left */
56522 + nh40_set_free_space_start(nh, nh40_get_free_space_start(nh) - freed);
56523 +
56524 + if (cinfo->removed_count != MAX_POS_IN_NODE) {
56525 + /* number of items changed. Remove item headers of those items */
56526 + ih = node40_ih_at(node, nr_items - 1);
56527 + memmove(ih + cinfo->removed_count, ih,
56528 + sizeof(item_header40) * (nr_items -
56529 + cinfo->removed_count -
56530 + cinfo->first_removed));
56531 + freed += sizeof(item_header40) * cinfo->removed_count;
56532 + node40_set_num_items(node, nh, nr_items - cinfo->removed_count);
56533 + }
56534 +
56535 + /* total amount of free space increased */
56536 + nh40_set_free_space(nh, nh40_get_free_space(nh) + freed);
56537 +}
56538 +
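+/*
+ * Editorial sketch (not part of the original patch) of what compact()
+ * above does, assuming items 1 and 2 were removed completely:
+ *
+ *	before: | nh | body0 | body1 | body2 | body3 | free | ih3 ih2 ih1 ih0 |
+ *	after:  | nh | body0 | body3 |        free          | ih3' ih0        |
+ *
+ * The first memmove() closes the [freed_space_start, freed_space_end)
+ * gap, the loop subtracts the freed byte count from the offsets of the
+ * moved bodies (body3 here, whose updated header is ih3'), and the
+ * second memmove() drops the headers of the removed items before
+ * node40_set_num_items() decreases the item count.
+ */
+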
56539 +int shrink_item_node40(coord_t * coord, int delta)
56540 +{
56541 + node40_header *nh;
56542 + item_header40 *ih;
56543 + pos_in_node_t pos;
56544 + pos_in_node_t nr_items;
56545 + char *end;
56546 + znode *node;
56547 + int off;
56548 +
56549 + assert("nikita-3487", coord != NULL);
56550 + assert("nikita-3488", delta >= 0);
56551 +
56552 + node = coord->node;
56553 + nh = node40_node_header(node);
56554 + nr_items = nh40_get_num_items(nh);
56555 +
56556 + ih = node40_ih_at_coord(coord);
56557 + assert("nikita-3489", delta <= length_by_coord_node40(coord));
56558 + off = ih40_get_offset(ih) + length_by_coord_node40(coord);
56559 + end = zdata(node) + off;
56560 +
56561 +	/* remove the gap left by the removal */
56562 + memmove(end - delta, end, nh40_get_free_space_start(nh) - off);
56563 +
56564 + /* update item headers of moved items - change their locations */
56565 + pos = coord->item_pos + 1;
56566 + ih = node40_ih_at(node, pos);
56567 + for (; pos < nr_items; pos++, ih--) {
56568 + assert("nikita-3490", ih == node40_ih_at(node, pos));
56569 + ih40_set_offset(ih, ih40_get_offset(ih) - delta);
56570 + }
56571 +
56572 + /* free space start moved to left */
56573 + nh40_set_free_space_start(nh, nh40_get_free_space_start(nh) - delta);
56574 + /* total amount of free space increased */
56575 + nh40_set_free_space(nh, nh40_get_free_space(nh) + delta);
56576 + /*
56577 +	 * This method does _not_ change the number of items. Hence, it cannot
56578 + * make node empty. Also it doesn't remove items at all, which means
56579 + * that no keys have to be updated either.
56580 + */
56581 + return 0;
56582 +}
56583 +
56584 +/* this is used by cut_node40 and kill_node40. It analyses the input parameters and calculates the cut mode. There
56585 +   are two types of cut. The first is when a unit is removed from the middle of an item; in this case the function
56586 +   returns 1. Everything else falls into the second case: 0 or 1 items getting their tail cut, 0 or more items
56587 +   removed completely, and 0 or 1 item getting its head cut. The function returns 0 in this case */
56588 +static int
56589 +parse_cut(struct cut40_info *cinfo, const struct cut_kill_params *params)
56590 +{
56591 + reiser4_key left_key, right_key;
56592 + reiser4_key min_from_key, max_to_key;
56593 + const reiser4_key *from_key, *to_key;
56594 +
56595 + init_cinfo(cinfo);
56596 +
56597 + /* calculate minimal key stored in first item of items to be cut (params->from) */
56598 + item_key_by_coord(params->from, &min_from_key);
56599 + /* and max key stored in last item of items to be cut (params->to) */
56600 + max_item_key_by_coord(params->to, &max_to_key);
56601 +
56602 + /* if cut key range is not defined in input parameters - define it using cut coord range */
56603 + if (params->from_key == NULL) {
56604 + assert("vs-1513", params->to_key == NULL);
56605 + unit_key_by_coord(params->from, &left_key);
56606 + from_key = &left_key;
56607 + max_unit_key_by_coord(params->to, &right_key);
56608 + to_key = &right_key;
56609 + } else {
56610 + from_key = params->from_key;
56611 + to_key = params->to_key;
56612 + }
56613 +
56614 + if (params->from->item_pos == params->to->item_pos) {
56615 + if (keylt(&min_from_key, from_key)
56616 + && keylt(to_key, &max_to_key))
56617 + return 1;
56618 +
56619 + if (keygt(from_key, &min_from_key)) {
56620 +			/* tail of item is to be cut */
56621 + cinfo->tail_removed = params->from->item_pos;
56622 + cinfo->mode |= CMODE_TAIL;
56623 + } else if (keylt(to_key, &max_to_key)) {
56624 + /* head of item is to be cut */
56625 + cinfo->head_removed = params->from->item_pos;
56626 + cinfo->mode |= CMODE_HEAD;
56627 + } else {
56628 + /* item is removed completely */
56629 + cinfo->first_removed = params->from->item_pos;
56630 + cinfo->removed_count = 1;
56631 + cinfo->mode |= CMODE_WHOLE;
56632 + }
56633 + } else {
56634 + cinfo->first_removed = params->from->item_pos + 1;
56635 + cinfo->removed_count =
56636 + params->to->item_pos - params->from->item_pos - 1;
56637 +
56638 + if (keygt(from_key, &min_from_key)) {
56639 + /* first item is not cut completely */
56640 + cinfo->tail_removed = params->from->item_pos;
56641 + cinfo->mode |= CMODE_TAIL;
56642 + } else {
56643 + cinfo->first_removed--;
56644 + cinfo->removed_count++;
56645 + }
56646 + if (keylt(to_key, &max_to_key)) {
56647 + /* last item is not cut completely */
56648 + cinfo->head_removed = params->to->item_pos;
56649 + cinfo->mode |= CMODE_HEAD;
56650 + } else {
56651 + cinfo->removed_count++;
56652 + }
56653 + if (cinfo->removed_count)
56654 + cinfo->mode |= CMODE_WHOLE;
56655 + }
56656 +
56657 + return 0;
56658 +}
56659 +
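+/*
+ * Editorial worked example for parse_cut() above (not part of the
+ * original patch).  Suppose params->from sits inside item 1 (so the
+ * cut starts past item 1's minimal key) and params->to covers item 3
+ * completely (to_key reaches item 3's maximal key).  Then:
+ *
+ *	first_removed = 2, removed_count = 3 - 1 - 1 = 1
+ *	from_key > min_from_key  -> tail_removed = 1, mode |= CMODE_TAIL
+ *	to_key == max_to_key     -> removed_count++ (now 2)
+ *	removed_count != 0       -> mode |= CMODE_WHOLE
+ *
+ * i.e. item 1 loses its tail and items 2 and 3 go away completely,
+ * which is the CMODE_TAIL | CMODE_WHOLE case of prepare_for_compact().
+ */
+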
56660 +static void
56661 +call_kill_hooks(znode * node, pos_in_node_t from, pos_in_node_t count,
56662 + carry_kill_data * kdata)
56663 +{
56664 + coord_t coord;
56665 + item_plugin *iplug;
56666 + pos_in_node_t pos;
56667 +
56668 + coord.node = node;
56669 + coord.unit_pos = 0;
56670 + coord.between = AT_UNIT;
56671 + for (pos = 0; pos < count; pos++) {
56672 + coord_set_item_pos(&coord, from + pos);
56673 + coord.unit_pos = 0;
56674 + coord.between = AT_UNIT;
56675 + iplug = item_plugin_by_coord(&coord);
56676 + if (iplug->b.kill_hook) {
56677 + iplug->b.kill_hook(&coord, 0, coord_num_units(&coord),
56678 + kdata);
56679 + }
56680 + }
56681 +}
56682 +
56683 +/* this is used to kill item partially */
56684 +static pos_in_node_t
56685 +kill_units(coord_t * coord, pos_in_node_t from, pos_in_node_t to, void *data,
56686 + reiser4_key * smallest_removed, reiser4_key * new_first_key)
56687 +{
56688 + struct carry_kill_data *kdata;
56689 + item_plugin *iplug;
56690 +
56691 + kdata = data;
56692 + iplug = item_plugin_by_coord(coord);
56693 +
56694 + assert("vs-1524", iplug->b.kill_units);
56695 + return iplug->b.kill_units(coord, from, to, kdata, smallest_removed,
56696 + new_first_key);
56697 +}
56698 +
56699 +/* call item plugin to cut tail of file */
56700 +static pos_in_node_t
56701 +kill_tail(coord_t * coord, void *data, reiser4_key * smallest_removed)
56702 +{
56703 + struct carry_kill_data *kdata;
56704 + pos_in_node_t to;
56705 +
56706 + kdata = data;
56707 + to = coord_last_unit_pos(coord);
56708 + return kill_units(coord, coord->unit_pos, to, kdata, smallest_removed,
56709 + NULL);
56710 +}
56711 +
56712 +/* call item plugin to cut head of item */
56713 +static pos_in_node_t
56714 +kill_head(coord_t * coord, void *data, reiser4_key * smallest_removed,
56715 + reiser4_key * new_first_key)
56716 +{
56717 + return kill_units(coord, 0, coord->unit_pos, data, smallest_removed,
56718 + new_first_key);
56719 +}
56720 +
56721 +/* this is used to cut item partially */
56722 +static pos_in_node_t
56723 +cut_units(coord_t * coord, pos_in_node_t from, pos_in_node_t to, void *data,
56724 + reiser4_key * smallest_removed, reiser4_key * new_first_key)
56725 +{
56726 + carry_cut_data *cdata;
56727 + item_plugin *iplug;
56728 +
56729 + cdata = data;
56730 + iplug = item_plugin_by_coord(coord);
56731 + assert("vs-302", iplug->b.cut_units);
56732 + return iplug->b.cut_units(coord, from, to, cdata, smallest_removed,
56733 + new_first_key);
56734 +}
56735 +
56736 +/* call item plugin to cut tail of file */
56737 +static pos_in_node_t
56738 +cut_tail(coord_t * coord, void *data, reiser4_key * smallest_removed)
56739 +{
56740 + carry_cut_data *cdata;
56741 + pos_in_node_t to;
56742 +
56743 + cdata = data;
56744 + to = coord_last_unit_pos(cdata->params.from);
56745 + return cut_units(coord, coord->unit_pos, to, data, smallest_removed, NULL);
56746 +}
56747 +
56748 +/* call item plugin to cut head of item */
56749 +static pos_in_node_t
56750 +cut_head(coord_t * coord, void *data, reiser4_key * smallest_removed,
56751 + reiser4_key * new_first_key)
56752 +{
56753 + return cut_units(coord, 0, coord->unit_pos, data, smallest_removed,
56754 + new_first_key);
56755 +}
56756 +
56757 +/* this returns 1 if the key of the first item changed, 0 if it did not */
56758 +static int
56759 +prepare_for_compact(struct cut40_info *cinfo,
56760 + const struct cut_kill_params *params, int is_cut,
56761 + void *data, carry_plugin_info * info)
56762 +{
56763 + znode *node;
56764 + item_header40 *ih;
56765 + pos_in_node_t freed;
56766 + pos_in_node_t item_pos;
56767 + coord_t coord;
56768 + reiser4_key new_first_key;
56769 + pos_in_node_t(*kill_units_f) (coord_t *, pos_in_node_t, pos_in_node_t,
56770 + void *, reiser4_key *, reiser4_key *);
56771 + pos_in_node_t(*kill_tail_f) (coord_t *, void *, reiser4_key *);
56772 + pos_in_node_t(*kill_head_f) (coord_t *, void *, reiser4_key *,
56773 + reiser4_key *);
56774 + int retval;
56775 +
56776 + retval = 0;
56777 +
56778 + node = params->from->node;
56779 +
56780 + assert("vs-184", node == params->to->node);
56781 + assert("vs-312", !node_is_empty(node));
56782 + assert("vs-297",
56783 + coord_compare(params->from, params->to) != COORD_CMP_ON_RIGHT);
56784 +
56785 + if (is_cut) {
56786 + kill_units_f = cut_units;
56787 + kill_tail_f = cut_tail;
56788 + kill_head_f = cut_head;
56789 + } else {
56790 + kill_units_f = kill_units;
56791 + kill_tail_f = kill_tail;
56792 + kill_head_f = kill_head;
56793 + }
56794 +
56795 + if (parse_cut(cinfo, params) == 1) {
56796 + /* cut from the middle of item */
56797 + freed =
56798 + kill_units_f(params->from, params->from->unit_pos,
56799 + params->to->unit_pos, data,
56800 + params->smallest_removed, NULL);
56801 +
56802 + item_pos = params->from->item_pos;
56803 + ih = node40_ih_at(node, item_pos);
56804 + cinfo->freed_space_start =
56805 + ih40_get_offset(ih) + node40_item_length(node,
56806 + item_pos) - freed;
56807 + cinfo->freed_space_end = cinfo->freed_space_start + freed;
56808 + cinfo->first_moved = item_pos + 1;
56809 + } else {
56810 + assert("vs-1521", (cinfo->tail_removed != MAX_POS_IN_NODE ||
56811 + cinfo->first_removed != MAX_POS_IN_NODE ||
56812 + cinfo->head_removed != MAX_POS_IN_NODE));
56813 +
56814 + switch (cinfo->mode) {
56815 + case CMODE_TAIL:
56816 + /* one item gets cut partially from its end */
56817 + assert("vs-1562",
56818 + cinfo->tail_removed == params->from->item_pos);
56819 +
56820 + freed =
56821 + kill_tail_f(params->from, data,
56822 + params->smallest_removed);
56823 +
56824 + item_pos = cinfo->tail_removed;
56825 + ih = node40_ih_at(node, item_pos);
56826 + cinfo->freed_space_start =
56827 + ih40_get_offset(ih) + node40_item_length(node,
56828 + item_pos) -
56829 + freed;
56830 + cinfo->freed_space_end =
56831 + cinfo->freed_space_start + freed;
56832 + cinfo->first_moved = cinfo->tail_removed + 1;
56833 + break;
56834 +
56835 + case CMODE_WHOLE:
56836 + /* one or more items get removed completely */
56837 + assert("vs-1563",
56838 + cinfo->first_removed == params->from->item_pos);
56839 + assert("vs-1564", cinfo->removed_count > 0
56840 + && cinfo->removed_count != MAX_POS_IN_NODE);
56841 +
56842 + /* call kill hook for all items removed completely */
56843 + if (is_cut == 0)
56844 + call_kill_hooks(node, cinfo->first_removed,
56845 + cinfo->removed_count, data);
56846 +
56847 + item_pos = cinfo->first_removed;
56848 + ih = node40_ih_at(node, item_pos);
56849 +
56850 + if (params->smallest_removed)
56851 + memcpy(params->smallest_removed, &ih->key,
56852 + sizeof(reiser4_key));
56853 +
56854 + cinfo->freed_space_start = ih40_get_offset(ih);
56855 +
56856 + item_pos += (cinfo->removed_count - 1);
56857 + ih -= (cinfo->removed_count - 1);
56858 + cinfo->freed_space_end =
56859 + ih40_get_offset(ih) + node40_item_length(node,
56860 + item_pos);
56861 + cinfo->first_moved = item_pos + 1;
56862 + if (cinfo->first_removed == 0)
56863 + /* key of first item of the node changes */
56864 + retval = 1;
56865 + break;
56866 +
56867 + case CMODE_HEAD:
56868 + /* one item gets cut partially from its head */
56869 + assert("vs-1565",
56870 + cinfo->head_removed == params->from->item_pos);
56871 +
56872 + freed =
56873 + kill_head_f(params->to, data,
56874 + params->smallest_removed,
56875 + &new_first_key);
56876 +
56877 + item_pos = cinfo->head_removed;
56878 + ih = node40_ih_at(node, item_pos);
56879 + cinfo->freed_space_start = ih40_get_offset(ih);
56880 + cinfo->freed_space_end = ih40_get_offset(ih) + freed;
56881 + cinfo->first_moved = cinfo->head_removed + 1;
56882 +
56883 + /* item head is removed, therefore, item key changed */
56884 + coord.node = node;
56885 + coord_set_item_pos(&coord, item_pos);
56886 + coord.unit_pos = 0;
56887 + coord.between = AT_UNIT;
56888 + update_item_key_node40(&coord, &new_first_key, NULL);
56889 + if (item_pos == 0)
56890 + /* key of first item of the node changes */
56891 + retval = 1;
56892 + break;
56893 +
56894 + case CMODE_TAIL | CMODE_WHOLE:
56895 + /* one item gets cut from its end and one or more items get removed completely */
56896 + assert("vs-1566",
56897 + cinfo->tail_removed == params->from->item_pos);
56898 + assert("vs-1567",
56899 + cinfo->first_removed == cinfo->tail_removed + 1);
56900 + assert("vs-1564", cinfo->removed_count > 0
56901 + && cinfo->removed_count != MAX_POS_IN_NODE);
56902 +
56903 + freed =
56904 + kill_tail_f(params->from, data,
56905 + params->smallest_removed);
56906 +
56907 + item_pos = cinfo->tail_removed;
56908 + ih = node40_ih_at(node, item_pos);
56909 + cinfo->freed_space_start =
56910 + ih40_get_offset(ih) + node40_item_length(node,
56911 + item_pos) -
56912 + freed;
56913 +
56914 + /* call kill hook for all items removed completely */
56915 + if (is_cut == 0)
56916 + call_kill_hooks(node, cinfo->first_removed,
56917 + cinfo->removed_count, data);
56918 +
56919 + item_pos += cinfo->removed_count;
56920 + ih -= cinfo->removed_count;
56921 + cinfo->freed_space_end =
56922 + ih40_get_offset(ih) + node40_item_length(node,
56923 + item_pos);
56924 + cinfo->first_moved = item_pos + 1;
56925 + break;
56926 +
56927 + case CMODE_WHOLE | CMODE_HEAD:
56928 + /* one or more items get removed completely and one item gets cut partially from its head */
56929 + assert("vs-1568",
56930 + cinfo->first_removed == params->from->item_pos);
56931 + assert("vs-1564", cinfo->removed_count > 0
56932 + && cinfo->removed_count != MAX_POS_IN_NODE);
56933 + assert("vs-1569",
56934 + cinfo->head_removed ==
56935 + cinfo->first_removed + cinfo->removed_count);
56936 +
56937 + /* call kill hook for all items removed completely */
56938 + if (is_cut == 0)
56939 + call_kill_hooks(node, cinfo->first_removed,
56940 + cinfo->removed_count, data);
56941 +
56942 + item_pos = cinfo->first_removed;
56943 + ih = node40_ih_at(node, item_pos);
56944 +
56945 + if (params->smallest_removed)
56946 + memcpy(params->smallest_removed, &ih->key,
56947 + sizeof(reiser4_key));
56948 +
56949 + freed =
56950 + kill_head_f(params->to, data, NULL, &new_first_key);
56951 +
56952 + cinfo->freed_space_start = ih40_get_offset(ih);
56953 +
56954 + ih = node40_ih_at(node, cinfo->head_removed);
56955 +			/* this is the most complex case. The item which got its head removed and the items which are
56956 +			   to be moved intact change their locations differently. */
56957 + cinfo->freed_space_end = ih40_get_offset(ih) + freed;
56958 + cinfo->first_moved = cinfo->head_removed;
56959 + cinfo->head_removed_location = cinfo->freed_space_start;
56960 +
56961 + /* item head is removed, therefore, item key changed */
56962 + coord.node = node;
56963 + coord_set_item_pos(&coord, cinfo->head_removed);
56964 + coord.unit_pos = 0;
56965 + coord.between = AT_UNIT;
56966 + update_item_key_node40(&coord, &new_first_key, NULL);
56967 +
56968 + assert("vs-1579", cinfo->first_removed == 0);
56969 + /* key of first item of the node changes */
56970 + retval = 1;
56971 + break;
56972 +
56973 + case CMODE_TAIL | CMODE_HEAD:
56974 +			/* one item gets its tail cut and its neighbor gets its head cut */
56975 + impossible("vs-1576", "this can not happen currently");
56976 + break;
56977 +
56978 + case CMODE_TAIL | CMODE_WHOLE | CMODE_HEAD:
56979 + impossible("vs-1577", "this can not happen currently");
56980 + break;
56981 + default:
56982 + impossible("vs-1578", "unexpected cut mode");
56983 + break;
56984 + }
56985 + }
56986 + return retval;
56987 +}
56988 +
56989 +/* plugin->u.node.kill
56990 + return value is number of items removed completely */
56991 +int kill_node40(struct carry_kill_data *kdata, carry_plugin_info * info)
56992 +{
56993 + znode *node;
56994 + struct cut40_info cinfo;
56995 + int first_key_changed;
56996 +
56997 + node = kdata->params.from->node;
56998 +
56999 + first_key_changed =
57000 + prepare_for_compact(&cinfo, &kdata->params, 0 /* not cut */ , kdata,
57001 + info);
57002 + compact(node, &cinfo);
57003 +
57004 + if (info) {
57005 + /* it is not called by node40_shift, so we have to take care
57006 + of changes on upper levels */
57007 + if (node_is_empty(node)
57008 + && !(kdata->flags & DELETE_RETAIN_EMPTY))
57009 +			/* all contents of the node are deleted */
57010 + prepare_removal_node40(node, info);
57011 + else if (first_key_changed) {
57012 + prepare_for_update(NULL, node, info);
57013 + }
57014 + }
57015 +
57016 + coord_clear_iplug(kdata->params.from);
57017 + coord_clear_iplug(kdata->params.to);
57018 +
57019 + znode_make_dirty(node);
57020 + return cinfo.removed_count == MAX_POS_IN_NODE ? 0 : cinfo.removed_count;
57021 +}
57022 +
57023 +/* plugin->u.node.cut
57024 + return value is number of items removed completely */
57025 +int cut_node40(struct carry_cut_data *cdata, carry_plugin_info * info)
57026 +{
57027 + znode *node;
57028 + struct cut40_info cinfo;
57029 + int first_key_changed;
57030 +
57031 + node = cdata->params.from->node;
57032 +
57033 + first_key_changed =
57034 +		prepare_for_compact(&cinfo, &cdata->params, 1 /* cut */ , cdata,
57035 + info);
57036 + compact(node, &cinfo);
57037 +
57038 + if (info) {
57039 + /* it is not called by node40_shift, so we have to take care
57040 + of changes on upper levels */
57041 + if (node_is_empty(node))
57042 +			/* all contents of the node are deleted */
57043 + prepare_removal_node40(node, info);
57044 + else if (first_key_changed) {
57045 + prepare_for_update(NULL, node, info);
57046 + }
57047 + }
57048 +
57049 + coord_clear_iplug(cdata->params.from);
57050 + coord_clear_iplug(cdata->params.to);
57051 +
57052 + znode_make_dirty(node);
57053 + return cinfo.removed_count == MAX_POS_IN_NODE ? 0 : cinfo.removed_count;
57054 +}
57055 +
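+/*
+ * Editorial note (not part of the original patch): kill_node40() and
+ * cut_node40() above differ only in the is_cut flag they pass to
+ * prepare_for_compact(); with is_cut == 0 the kill hooks of completely
+ * removed items are run (see call_kill_hooks()), with is_cut == 1 they
+ * are not.  A minimal caller sketch, mirroring delete_copied() below:
+ *
+ *	struct carry_cut_data cdata;
+ *	coord_t from, to;
+ *	int removed;
+ *
+ *	coord_init_first_unit(&from, node);
+ *	coord_init_last_unit(&to, node);
+ *	cdata.params.from = &from;
+ *	cdata.params.to = &to;
+ *	cdata.params.from_key = NULL;	(key range taken from the coords)
+ *	cdata.params.to_key = NULL;
+ *	cdata.params.smallest_removed = NULL;
+ *	removed = cut_node40(&cdata, NULL);	(NULL: queue no carry ops)
+ */
+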
57056 +/* this structure is used by shift method of node40 plugin */
57057 +struct shift_params {
57058 + shift_direction pend; /* when @pend == append - we are shifting to
57059 + left, when @pend == prepend - to right */
57060 + coord_t wish_stop; /* when shifting to left this is last unit we
57061 + want shifted, when shifting to right - this
57062 + is set to unit we want to start shifting
57063 + from */
57064 + znode *target;
57065 + int everything; /* it is set to 1 if everything we have to shift is
57066 +				   shifted, 0 otherwise */
57067 +
57068 + /* FIXME-VS: get rid of read_stop */
57069 +
57070 + /* these are set by estimate_shift */
57071 + coord_t real_stop; /* this will be set to last unit which will be
57072 + really shifted */
57073 +
57074 +	/* coordinate in source node, before the operation, of the unit which
57075 +	   becomes first after shift to left or last after shift to right */
57076 + union {
57077 + coord_t future_first;
57078 + coord_t future_last;
57079 + } u;
57080 +
57081 + unsigned merging_units; /* number of units of first item which have to
57082 + be merged with last item of target node */
57083 + unsigned merging_bytes; /* number of bytes in those units */
57084 +
57085 + unsigned entire; /* items shifted in their entirety */
57086 + unsigned entire_bytes; /* number of bytes in those items */
57087 +
57088 + unsigned part_units; /* number of units of partially copied item */
57089 + unsigned part_bytes; /* number of bytes in those units */
57090 +
57091 + unsigned shift_bytes; /* total number of bytes in items shifted (item
57092 + headers not included) */
57093 +
57094 +};
57095 +
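+/*
+ * Editorial illustration (not part of the original patch) of the
+ * shift_params accounting filled in by estimate_shift() below: if 2
+ * units (30 bytes) merge into the boundary item of @target, 3 items
+ * (500 bytes) move in their entirety and 4 units (100 bytes) of one
+ * more item are split off, then
+ *
+ *	merging_units = 2,  merging_bytes = 30
+ *	entire        = 3,  entire_bytes  = 500
+ *	part_units    = 4,  part_bytes    = 100
+ *	shift_bytes   = 30 + 500 + 100 = 630
+ *
+ * matching the "vs-185" assertion in copy() that shift_bytes ==
+ * merging_bytes + entire_bytes + part_bytes.
+ */
+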
57096 +static int item_creation_overhead(coord_t *item)
57097 +{
57098 + return node_plugin_by_coord(item)->item_overhead(item->node, NULL);
57099 +}
57100 +
57101 +/* how many units are there in @source starting from source->unit_pos
57102 + but not further than @stop_coord */
57103 +static int
57104 +wanted_units(coord_t *source, coord_t *stop_coord, shift_direction pend)
57105 +{
57106 + if (pend == SHIFT_LEFT) {
57107 + assert("vs-181", source->unit_pos == 0);
57108 + } else {
57109 + assert("vs-182",
57110 + source->unit_pos == coord_last_unit_pos(source));
57111 + }
57112 +
57113 + if (source->item_pos != stop_coord->item_pos) {
57114 + /* @source and @stop_coord are different items */
57115 + return coord_last_unit_pos(source) + 1;
57116 + }
57117 +
57118 + if (pend == SHIFT_LEFT) {
57119 + return stop_coord->unit_pos + 1;
57120 + } else {
57121 + return source->unit_pos - stop_coord->unit_pos + 1;
57122 + }
57123 +}
57124 +
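+/*
+ * Editorial worked example for wanted_units() above (not part of the
+ * original patch): shifting left (@source->unit_pos == 0) with
+ * @stop_coord in the same item at unit_pos == 3 yields 3 + 1 = 4
+ * units; with @stop_coord in a later item it yields the whole item,
+ * coord_last_unit_pos(source) + 1 units.
+ */
+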
57125 +/* this calculates what can be copied from @shift->wish_stop.node to
57126 + @shift->target */
57127 +static void
57128 +estimate_shift(struct shift_params *shift, const reiser4_context * ctx)
57129 +{
57130 + unsigned target_free_space, size;
57131 + pos_in_node_t stop_item; /* item which estimating should not consider */
57132 + unsigned want; /* number of units of item we want shifted */
57133 + coord_t source; /* item being estimated */
57134 + item_plugin *iplug;
57135 +
57136 + /* shifting to left/right starts from first/last units of
57137 + @shift->wish_stop.node */
57138 + if (shift->pend == SHIFT_LEFT) {
57139 + coord_init_first_unit(&source, shift->wish_stop.node);
57140 + } else {
57141 + coord_init_last_unit(&source, shift->wish_stop.node);
57142 + }
57143 + shift->real_stop = source;
57144 +
57145 + /* free space in target node and number of items in source */
57146 + target_free_space = znode_free_space(shift->target);
57147 +
57148 + shift->everything = 0;
57149 + if (!node_is_empty(shift->target)) {
57150 + /* target node is not empty, check for boundary items
57151 + mergeability */
57152 + coord_t to;
57153 +
57154 + /* item we try to merge @source with */
57155 + if (shift->pend == SHIFT_LEFT) {
57156 + coord_init_last_unit(&to, shift->target);
57157 + } else {
57158 + coord_init_first_unit(&to, shift->target);
57159 + }
57160 +
57161 + if ((shift->pend == SHIFT_LEFT) ? are_items_mergeable(&to,
57162 + &source) :
57163 + are_items_mergeable(&source, &to)) {
57164 + /* how many units of @source do we want to merge to
57165 + item @to */
57166 + want =
57167 + wanted_units(&source, &shift->wish_stop,
57168 + shift->pend);
57169 +
57170 + /* how many units of @source we can merge to item
57171 + @to */
57172 + iplug = item_plugin_by_coord(&source);
57173 + if (iplug->b.can_shift != NULL)
57174 + shift->merging_units =
57175 + iplug->b.can_shift(target_free_space,
57176 + &source, shift->target,
57177 + shift->pend, &size,
57178 + want);
57179 + else {
57180 + shift->merging_units = 0;
57181 + size = 0;
57182 + }
57183 + shift->merging_bytes = size;
57184 + shift->shift_bytes += size;
57185 + /* update stop coord to be set to last unit of @source
57186 + we can merge to @target */
57187 + if (shift->merging_units)
57188 + /* at least one unit can be shifted */
57189 + shift->real_stop.unit_pos =
57190 + (shift->merging_units - source.unit_pos -
57191 + 1) * shift->pend;
57192 + else {
57193 + /* nothing can be shifted */
57194 + if (shift->pend == SHIFT_LEFT)
57195 + coord_init_before_first_item(&shift->
57196 + real_stop,
57197 + source.
57198 + node);
57199 + else
57200 + coord_init_after_last_item(&shift->
57201 + real_stop,
57202 + source.node);
57203 + }
57204 + assert("nikita-2081", shift->real_stop.unit_pos + 1);
57205 +
57206 + if (shift->merging_units != want) {
57207 + /* we could not copy as many as we want, so,
57208 + there is no reason for estimating any
57209 + longer */
57210 + return;
57211 + }
57212 +
57213 + target_free_space -= size;
57214 + coord_add_item_pos(&source, shift->pend);
57215 + }
57216 + }
57217 +
57218 +	/* position of the item nothing of which we want to shift */
57219 + stop_item = shift->wish_stop.item_pos + shift->pend;
57220 +
57221 + /* calculate how many items can be copied into given free
57222 + space as whole */
57223 + for (; source.item_pos != stop_item;
57224 + coord_add_item_pos(&source, shift->pend)) {
57225 + if (shift->pend == SHIFT_RIGHT)
57226 + source.unit_pos = coord_last_unit_pos(&source);
57227 +
57228 + /* how many units of @source do we want to copy */
57229 + want = wanted_units(&source, &shift->wish_stop, shift->pend);
57230 +
57231 + if (want == coord_last_unit_pos(&source) + 1) {
57232 + /* we want this item to be copied entirely */
57233 + size =
57234 + item_length_by_coord(&source) +
57235 + item_creation_overhead(&source);
57236 + if (size <= target_free_space) {
57237 + /* item fits into target node as whole */
57238 + target_free_space -= size;
57239 + shift->shift_bytes +=
57240 + size - item_creation_overhead(&source);
57241 + shift->entire_bytes +=
57242 + size - item_creation_overhead(&source);
57243 + shift->entire++;
57244 +
57245 + /* update shift->real_stop coord to be set to
57246 + last unit of @source we can merge to
57247 + @target */
57248 + shift->real_stop = source;
57249 + if (shift->pend == SHIFT_LEFT)
57250 + shift->real_stop.unit_pos =
57251 + coord_last_unit_pos(&shift->
57252 + real_stop);
57253 + else
57254 + shift->real_stop.unit_pos = 0;
57255 + continue;
57256 + }
57257 + }
57258 +
57259 + /* we reach here only for an item which does not fit into
57260 + target node in its entirety. This item may be either
57261 + partially shifted, or not shifted at all. We will have to
57262 +		   create new item in target node, so decrease amount of free
57263 + space by an item creation overhead. We can reach here also
57264 + if stop coord is in this item */
57265 + if (target_free_space >=
57266 + (unsigned)item_creation_overhead(&source)) {
57267 + target_free_space -= item_creation_overhead(&source);
57268 + iplug = item_plugin_by_coord(&source);
57269 + if (iplug->b.can_shift) {
57270 + shift->part_units = iplug->b.can_shift(target_free_space,
57271 + &source,
57272 + NULL, /* target */
57273 + shift->pend,
57274 + &size,
57275 + want);
57276 + } else {
57277 + target_free_space = 0;
57278 + shift->part_units = 0;
57279 + size = 0;
57280 + }
57281 + } else {
57282 + target_free_space = 0;
57283 + shift->part_units = 0;
57284 + size = 0;
57285 + }
57286 + shift->part_bytes = size;
57287 + shift->shift_bytes += size;
57288 +
57289 + /* set @shift->real_stop to last unit of @source we can merge
57290 + to @shift->target */
57291 + if (shift->part_units) {
57292 + shift->real_stop = source;
57293 + shift->real_stop.unit_pos =
57294 + (shift->part_units - source.unit_pos -
57295 + 1) * shift->pend;
57296 + assert("nikita-2082", shift->real_stop.unit_pos + 1);
57297 + }
57298 +
57299 + if (want != shift->part_units)
57300 +			/* not everything wanted was shifted */
57301 + return;
57302 + break;
57303 + }
57304 +
57305 + shift->everything = 1;
57306 +}
57307 +
57308 +static void
57309 +copy_units(coord_t * target, coord_t * source, unsigned from, unsigned count,
57310 + shift_direction dir, unsigned free_space)
57311 +{
57312 + item_plugin *iplug;
57313 +
57314 + assert("nikita-1463", target != NULL);
57315 + assert("nikita-1464", source != NULL);
57316 + assert("nikita-1465", from + count <= coord_num_units(source));
57317 +
57318 + iplug = item_plugin_by_coord(source);
57319 + assert("nikita-1468", iplug == item_plugin_by_coord(target));
57320 + iplug->b.copy_units(target, source, from, count, dir, free_space);
57321 +
57322 + if (dir == SHIFT_RIGHT) {
57323 + /* FIXME-VS: this looks not necessary. update_item_key was
57324 + called already by copy_units method */
57325 + reiser4_key split_key;
57326 +
57327 + assert("nikita-1469", target->unit_pos == 0);
57328 +
57329 + unit_key_by_coord(target, &split_key);
57330 + node_plugin_by_coord(target)->update_item_key(target,
57331 + &split_key, NULL);
57332 + }
57333 +}
57334 +
57335 +/* copy part of @shift->real_stop.node starting either from its beginning or
57336 + from its end and ending at @shift->real_stop to either the end or the
57337 + beginning of @shift->target */
57338 +static void copy(struct shift_params *shift)
57339 +{
57340 + node40_header *nh;
57341 + coord_t from;
57342 + coord_t to;
57343 + item_header40 *from_ih, *to_ih;
57344 + int free_space_start;
57345 + int new_items;
57346 + unsigned old_items;
57347 + int old_offset;
57348 + unsigned i;
57349 +
57350 + nh = node40_node_header(shift->target);
57351 + free_space_start = nh40_get_free_space_start(nh);
57352 + old_items = nh40_get_num_items(nh);
57353 + new_items = shift->entire + (shift->part_units ? 1 : 0);
57354 + assert("vs-185",
57355 + shift->shift_bytes ==
57356 + shift->merging_bytes + shift->entire_bytes + shift->part_bytes);
57357 +
57358 + from = shift->wish_stop;
57359 +
57360 + coord_init_first_unit(&to, shift->target);
57361 +
57362 + /* NOTE:NIKITA->VS not sure what I am doing: shift->target is empty,
57363 + hence to.between is set to EMPTY_NODE above. Looks like we want it
57364 + to be AT_UNIT.
57365 +
57366 + Oh, wonders of ->betweeness...
57367 +
57368 + */
57369 + to.between = AT_UNIT;
57370 +
57371 + if (shift->pend == SHIFT_LEFT) {
57372 + /* copying to left */
57373 +
57374 + coord_set_item_pos(&from, 0);
57375 + from_ih = node40_ih_at(from.node, 0);
57376 +
57377 + coord_set_item_pos(&to,
57378 + node40_num_of_items_internal(to.node) - 1);
57379 + if (shift->merging_units) {
57380 + /* expand last item, so that plugin methods will see
57381 + correct data */
57382 + free_space_start += shift->merging_bytes;
57383 + nh40_set_free_space_start(nh,
57384 + (unsigned)free_space_start);
57385 + nh40_set_free_space(nh,
57386 + nh40_get_free_space(nh) -
57387 + shift->merging_bytes);
57388 +
57389 + /* appending last item of @target */
57390 + copy_units(&to, &from, 0, /* starting from 0-th unit */
57391 + shift->merging_units, SHIFT_LEFT,
57392 + shift->merging_bytes);
57393 + coord_inc_item_pos(&from);
57394 + from_ih--;
57395 + coord_inc_item_pos(&to);
57396 + }
57397 +
57398 + to_ih = node40_ih_at(shift->target, old_items);
57399 + if (shift->entire) {
57400 + /* copy @entire items entirely */
57401 +
57402 + /* copy item headers */
57403 + memcpy(to_ih - shift->entire + 1,
57404 + from_ih - shift->entire + 1,
57405 + shift->entire * sizeof(item_header40));
57406 + /* update item header offset */
57407 + old_offset = ih40_get_offset(from_ih);
57408 + /* AUDIT: Looks like if we calculate old_offset + free_space_start here instead of just old_offset, we can perform one "add" operation less per each iteration */
57409 + for (i = 0; i < shift->entire; i++, to_ih--, from_ih--)
57410 + ih40_set_offset(to_ih,
57411 + ih40_get_offset(from_ih) -
57412 + old_offset + free_space_start);
57413 +
57414 + /* copy item bodies */
57415 + memcpy(zdata(shift->target) + free_space_start, zdata(from.node) + old_offset, /*ih40_get_offset (from_ih), */
57416 + shift->entire_bytes);
57417 +
57418 + coord_add_item_pos(&from, (int)shift->entire);
57419 + coord_add_item_pos(&to, (int)shift->entire);
57420 + }
57421 +
57422 + nh40_set_free_space_start(nh,
57423 + free_space_start +
57424 + shift->shift_bytes -
57425 + shift->merging_bytes);
57426 + nh40_set_free_space(nh,
57427 + nh40_get_free_space(nh) -
57428 + (shift->shift_bytes - shift->merging_bytes +
57429 + sizeof(item_header40) * new_items));
57430 +
57431 + /* update node header */
57432 + node40_set_num_items(shift->target, nh, old_items + new_items);
57433 + assert("vs-170",
57434 + nh40_get_free_space(nh) < znode_size(shift->target));
57435 +
57436 + if (shift->part_units) {
57437 + /* copy heading part (@part units) of @source item as
57438 + a new item into @target->node */
57439 +
57440 + /* copy item header of partially copied item */
57441 + coord_set_item_pos(&to,
57442 + node40_num_of_items_internal(to.node)
57443 + - 1);
57444 + memcpy(to_ih, from_ih, sizeof(item_header40));
57445 + ih40_set_offset(to_ih,
57446 + nh40_get_free_space_start(nh) -
57447 + shift->part_bytes);
57448 + if (item_plugin_by_coord(&to)->b.init)
57449 + item_plugin_by_coord(&to)->b.init(&to, &from,
57450 + NULL);
57451 + copy_units(&to, &from, 0, shift->part_units, SHIFT_LEFT,
57452 + shift->part_bytes);
57453 + }
57454 +
57455 + } else {
57456 + /* copying to right */
57457 +
57458 + coord_set_item_pos(&from,
57459 + node40_num_of_items_internal(from.node) - 1);
57460 + from_ih = node40_ih_at_coord(&from);
57461 +
57462 + coord_set_item_pos(&to, 0);
57463 +
57464 + /* prepare space for new items */
57465 + memmove(zdata(to.node) + sizeof(node40_header) +
57466 + shift->shift_bytes,
57467 + zdata(to.node) + sizeof(node40_header),
57468 + free_space_start - sizeof(node40_header));
57469 + /* update item headers of moved items */
57470 + to_ih = node40_ih_at(to.node, 0);
57471 + /* first item gets @merging_bytes longer. free space appears
57472 + at its beginning */
57473 + if (!node_is_empty(to.node))
57474 + ih40_set_offset(to_ih,
57475 + ih40_get_offset(to_ih) +
57476 + shift->shift_bytes -
57477 + shift->merging_bytes);
57478 +
57479 + for (i = 1; i < old_items; i++)
57480 + ih40_set_offset(to_ih - i,
57481 + ih40_get_offset(to_ih - i) +
57482 + shift->shift_bytes);
57483 +
57484 + /* move item headers to make space for new items */
57485 + memmove(to_ih - old_items + 1 - new_items,
57486 + to_ih - old_items + 1,
57487 + sizeof(item_header40) * old_items);
57488 + to_ih -= (new_items - 1);
57489 +
57490 + nh40_set_free_space_start(nh,
57491 + free_space_start +
57492 + shift->shift_bytes);
57493 + nh40_set_free_space(nh,
57494 + nh40_get_free_space(nh) -
57495 + (shift->shift_bytes +
57496 + sizeof(item_header40) * new_items));
57497 +
57498 + /* update node header */
57499 + node40_set_num_items(shift->target, nh, old_items + new_items);
57500 + assert("vs-170",
57501 + nh40_get_free_space(nh) < znode_size(shift->target));
57502 +
57503 + if (shift->merging_units) {
57504 + coord_add_item_pos(&to, new_items);
57505 + to.unit_pos = 0;
57506 + to.between = AT_UNIT;
57507 + /* prepend first item of @to */
57508 + copy_units(&to, &from,
57509 + coord_last_unit_pos(&from) -
57510 + shift->merging_units + 1,
57511 + shift->merging_units, SHIFT_RIGHT,
57512 + shift->merging_bytes);
57513 + coord_dec_item_pos(&from);
57514 + from_ih++;
57515 + }
57516 +
57517 + if (shift->entire) {
57518 + /* copy @entire items entirely */
57519 +
57520 + /* copy item headers */
57521 + memcpy(to_ih, from_ih,
57522 + shift->entire * sizeof(item_header40));
57523 +
57524 + /* update item header offset */
57525 + old_offset =
57526 + ih40_get_offset(from_ih + shift->entire - 1);
57527 + /* AUDIT: old_offset + sizeof (node40_header) + shift->part_bytes calculation can be taken off the loop. */
57528 + for (i = 0; i < shift->entire; i++, to_ih++, from_ih++)
57529 + ih40_set_offset(to_ih,
57530 + ih40_get_offset(from_ih) -
57531 + old_offset +
57532 + sizeof(node40_header) +
57533 + shift->part_bytes);
57534 + /* copy item bodies */
57535 + coord_add_item_pos(&from, -(int)(shift->entire - 1));
57536 + memcpy(zdata(to.node) + sizeof(node40_header) +
57537 + shift->part_bytes, item_by_coord_node40(&from),
57538 + shift->entire_bytes);
57539 + coord_dec_item_pos(&from);
57540 + }
57541 +
57542 + if (shift->part_units) {
57543 + coord_set_item_pos(&to, 0);
57544 + to.unit_pos = 0;
57545 + to.between = AT_UNIT;
57546 + /* copy heading part (@part units) of @source item as
57547 + a new item into @target->node */
57548 +
57549 + /* copy item header of partially copied item */
57550 + memcpy(to_ih, from_ih, sizeof(item_header40));
57551 + ih40_set_offset(to_ih, sizeof(node40_header));
57552 + if (item_plugin_by_coord(&to)->b.init)
57553 + item_plugin_by_coord(&to)->b.init(&to, &from,
57554 + NULL);
57555 + copy_units(&to, &from,
57556 + coord_last_unit_pos(&from) -
57557 + shift->part_units + 1, shift->part_units,
57558 + SHIFT_RIGHT, shift->part_bytes);
57559 + }
57560 + }
57561 +}
57562 +
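+/*
+ * Editorial note (not part of the original patch): the two branches of
+ * copy() above are asymmetric because of the node40 layout.  Shifting
+ * left appends at free_space_start of @target, so existing data stays
+ * put; shifting right inserts at the front, so copy() must first
+ * memmove() all existing bodies of @target up by shift_bytes and move
+ * their item headers down to make room for the new ones.
+ */
+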
57563 +/* remove everything either before or after @shift->real_stop. The number
57564 +   of items removed completely is returned */
57565 +static int delete_copied(struct shift_params *shift)
57566 +{
57567 + coord_t from;
57568 + coord_t to;
57569 + struct carry_cut_data cdata;
57570 +
57571 + if (shift->pend == SHIFT_LEFT) {
57572 + /* we were shifting to left, remove everything from the
57573 +		   beginning of @shift->wish_stop->node up to
57574 + @shift->wish_stop */
57575 + coord_init_first_unit(&from, shift->real_stop.node);
57576 + to = shift->real_stop;
57577 +
57578 + /* store old coordinate of unit which will be first after
57579 + shift to left */
57580 + shift->u.future_first = to;
57581 + coord_next_unit(&shift->u.future_first);
57582 + } else {
57583 + /* we were shifting to right, remove everything from
57584 +		   @shift->real_stop up to the end of
57585 +		   @shift->real_stop.node */
57586 + from = shift->real_stop;
57587 + coord_init_last_unit(&to, from.node);
57588 +
57589 + /* store old coordinate of unit which will be last after
57590 + shift to right */
57591 + shift->u.future_last = from;
57592 + coord_prev_unit(&shift->u.future_last);
57593 + }
57594 +
57595 + cdata.params.from = &from;
57596 + cdata.params.to = &to;
57597 + cdata.params.from_key = NULL;
57598 + cdata.params.to_key = NULL;
57599 + cdata.params.smallest_removed = NULL;
57600 + return cut_node40(&cdata, NULL);
57601 +}
57602 +
57603 +/* something was moved between @left and @right. Add carry operation to @info
57604 + list to have carry to update delimiting key between them */
57605 +static int
57606 +prepare_for_update(znode * left, znode * right, carry_plugin_info * info)
57607 +{
57608 + carry_op *op;
57609 + carry_node *cn;
57610 +
57611 + if (info == NULL)
57612 + /* nowhere to send operation to. */
57613 + return 0;
57614 +
57615 + if (!should_notify_parent(right))
57616 + return 0;
57617 +
57618 + op = node_post_carry(info, COP_UPDATE, right, 1);
57619 + if (IS_ERR(op) || op == NULL)
57620 + return op ? PTR_ERR(op) : -EIO;
57621 +
57622 + if (left != NULL) {
57623 + carry_node *reference;
57624 +
57625 + if (info->doing)
57626 + reference = insert_carry_node(info->doing,
57627 + info->todo, left);
57628 + else
57629 + reference = op->node;
57630 + assert("nikita-2992", reference != NULL);
57631 + cn = add_carry(info->todo, POOLO_BEFORE, reference);
57632 + if (IS_ERR(cn))
57633 + return PTR_ERR(cn);
57634 + cn->parent = 1;
57635 + cn->node = left;
57636 + if (ZF_ISSET(left, JNODE_ORPHAN))
57637 + cn->left_before = 1;
57638 + op->u.update.left = cn;
57639 + } else
57640 + op->u.update.left = NULL;
57641 + return 0;
57642 +}
57643 +
57644 +/* plugin->u.node.prepare_removal
57645 +   to delete a pointer to @empty from the tree, add the corresponding
57646 +   carry operation (delete) to the @info list */
57647 +int prepare_removal_node40(znode * empty, carry_plugin_info * info)
57648 +{
57649 + carry_op *op;
57650 + reiser4_tree *tree;
57651 +
57652 + if (!should_notify_parent(empty))
57653 + return 0;
57654 + /* already on a road to Styx */
57655 + if (ZF_ISSET(empty, JNODE_HEARD_BANSHEE))
57656 + return 0;
57657 + op = node_post_carry(info, COP_DELETE, empty, 1);
57658 + if (IS_ERR(op) || op == NULL)
57659 + return RETERR(op ? PTR_ERR(op) : -EIO);
57660 +
57661 + op->u.delete.child = NULL;
57662 + op->u.delete.flags = 0;
57663 +
57664 + /* fare thee well */
57665 + tree = znode_get_tree(empty);
57666 + read_lock_tree(tree);
57667 + write_lock_dk(tree);
57668 + znode_set_ld_key(empty, znode_get_rd_key(empty));
57669 + if (znode_is_left_connected(empty) && empty->left)
57670 + znode_set_rd_key(empty->left, znode_get_rd_key(empty));
57671 + write_unlock_dk(tree);
57672 + read_unlock_tree(tree);
57673 +
57674 + ZF_SET(empty, JNODE_HEARD_BANSHEE);
57675 + return 0;
57676 +}
57677 +
57678 +/* something was shifted from @insert_coord->node to @shift->target, update
57679 + @insert_coord correspondingly */
57680 +static void
57681 +adjust_coord(coord_t * insert_coord, struct shift_params *shift, int removed,
57682 + int including_insert_coord)
57683 +{
57684 + /* item plugin was invalidated by shifting */
57685 + coord_clear_iplug(insert_coord);
57686 +
57687 + if (node_is_empty(shift->wish_stop.node)) {
57688 + assert("vs-242", shift->everything);
57689 + if (including_insert_coord) {
57690 + if (shift->pend == SHIFT_RIGHT) {
57691 + /* set @insert_coord before first unit of
57692 + @shift->target node */
57693 + coord_init_before_first_item(insert_coord,
57694 + shift->target);
57695 + } else {
57696 + /* set @insert_coord after last in target node */
57697 + coord_init_after_last_item(insert_coord,
57698 + shift->target);
57699 + }
57700 + } else {
57701 + /* set @insert_coord inside of empty node. There is
57702 + only one possible coord within an empty
57703 + node. init_first_unit will set that coord */
57704 + coord_init_first_unit(insert_coord,
57705 + shift->wish_stop.node);
57706 + }
57707 + return;
57708 + }
57709 +
57710 + if (shift->pend == SHIFT_RIGHT) {
57711 + /* there was shifting to right */
57712 + if (shift->everything) {
57713 + /* everything wanted was shifted */
57714 + if (including_insert_coord) {
57715 + /* @insert_coord is set before first unit of
57716 + @to node */
57717 + coord_init_before_first_item(insert_coord,
57718 + shift->target);
57719 + insert_coord->between = BEFORE_UNIT;
57720 + } else {
57721 + /* @insert_coord is set after last unit of
57722 + @insert->node */
57723 + coord_init_last_unit(insert_coord,
57724 + shift->wish_stop.node);
57725 + insert_coord->between = AFTER_UNIT;
57726 + }
57727 + }
57728 + return;
57729 + }
57730 +
57731 + /* there was shifting to left */
57732 + if (shift->everything) {
57733 + /* everything wanted was shifted */
57734 + if (including_insert_coord) {
57735 + /* @insert_coord is set after last unit in @to node */
57736 + coord_init_after_last_item(insert_coord, shift->target);
57737 + } else {
57738 + /* @insert_coord is set before first unit in the same
57739 + node */
57740 + coord_init_before_first_item(insert_coord,
57741 + shift->wish_stop.node);
57742 + }
57743 + return;
57744 + }
57745 +
57746 + /* FIXME-VS: the code below is complicated because with between ==
57747 + AFTER_ITEM unit_pos is set to 0 */
57748 +
57749 + if (!removed) {
57750 + /* no items were shifted entirely */
57751 + assert("vs-195", shift->merging_units == 0
57752 + || shift->part_units == 0);
57753 +
57754 + if (shift->real_stop.item_pos == insert_coord->item_pos) {
57755 + if (shift->merging_units) {
57756 + if (insert_coord->between == AFTER_UNIT) {
57757 + assert("nikita-1441",
57758 + insert_coord->unit_pos >=
57759 + shift->merging_units);
57760 + insert_coord->unit_pos -=
57761 + shift->merging_units;
57762 + } else if (insert_coord->between == BEFORE_UNIT) {
57763 + assert("nikita-2090",
57764 + insert_coord->unit_pos >
57765 + shift->merging_units);
57766 + insert_coord->unit_pos -=
57767 + shift->merging_units;
57768 + }
57769 +
57770 + assert("nikita-2083",
57771 + insert_coord->unit_pos + 1);
57772 + } else {
57773 + if (insert_coord->between == AFTER_UNIT) {
57774 + assert("nikita-1442",
57775 + insert_coord->unit_pos >=
57776 + shift->part_units);
57777 + insert_coord->unit_pos -=
57778 + shift->part_units;
57779 + } else if (insert_coord->between == BEFORE_UNIT) {
57780 + assert("nikita-2089",
57781 + insert_coord->unit_pos >
57782 + shift->part_units);
57783 + insert_coord->unit_pos -=
57784 + shift->part_units;
57785 + }
57786 +
57787 + assert("nikita-2084",
57788 + insert_coord->unit_pos + 1);
57789 + }
57790 + }
57791 + return;
57792 + }
57793 +
57794 + /* we shifted to the left and there was not enough space for everything */
57795 + switch (insert_coord->between) {
57796 + case AFTER_UNIT:
57797 + case BEFORE_UNIT:
57798 + if (shift->real_stop.item_pos == insert_coord->item_pos)
57799 + insert_coord->unit_pos -= shift->part_units;
57800 + case AFTER_ITEM: /* fall through from the cases above is intentional */
57801 + coord_add_item_pos(insert_coord, -removed);
57802 + break;
57803 + default:
57804 + impossible("nikita-2087", "not ready");
57805 + }
57806 + assert("nikita-2085", insert_coord->unit_pos + 1);
57807 +}
57808 +
57809 +static int call_shift_hooks(struct shift_params *shift)
57810 +{
57811 + unsigned i, shifted;
57812 + coord_t coord;
57813 + item_plugin *iplug;
57814 +
57815 + assert("vs-275", !node_is_empty(shift->target));
57816 +
57817 + /* number of items shift touches */
57818 + shifted =
57819 + shift->entire + (shift->merging_units ? 1 : 0) +
57820 + (shift->part_units ? 1 : 0);
57821 +
57822 + if (shift->pend == SHIFT_LEFT) {
57823 + /* moved items are at the end */
57824 + coord_init_last_unit(&coord, shift->target);
57825 + coord.unit_pos = 0;
57826 +
57827 + assert("vs-279", shift->pend == 1);
57828 + for (i = 0; i < shifted; i++) {
57829 + unsigned from, count;
57830 +
57831 + iplug = item_plugin_by_coord(&coord);
57832 + if (i == 0 && shift->part_units) {
57833 + assert("vs-277",
57834 + coord_num_units(&coord) ==
57835 + shift->part_units);
57836 + count = shift->part_units;
57837 + from = 0;
57838 + } else if (i == shifted - 1 && shift->merging_units) {
57839 + count = shift->merging_units;
57840 + from = coord_num_units(&coord) - count;
57841 + } else {
57842 + count = coord_num_units(&coord);
57843 + from = 0;
57844 + }
57845 +
57846 + if (iplug->b.shift_hook) {
57847 + iplug->b.shift_hook(&coord, from, count,
57848 + shift->wish_stop.node);
57849 + }
57850 + coord_add_item_pos(&coord, -shift->pend);
57851 + }
57852 + } else {
57853 + /* moved items are at the beginning */
57854 + coord_init_first_unit(&coord, shift->target);
57855 +
57856 + assert("vs-278", shift->pend == -1);
57857 + for (i = 0; i < shifted; i++) {
57858 + unsigned from, count;
57859 +
57860 + iplug = item_plugin_by_coord(&coord);
57861 + if (i == 0 && shift->part_units) {
57862 + assert("vs-277",
57863 + coord_num_units(&coord) ==
57864 + shift->part_units);
57865 + count = coord_num_units(&coord);
57866 + from = 0;
57867 + } else if (i == shifted - 1 && shift->merging_units) {
57868 + count = shift->merging_units;
57869 + from = 0;
57870 + } else {
57871 + count = coord_num_units(&coord);
57872 + from = 0;
57873 + }
57874 +
57875 + if (iplug->b.shift_hook) {
57876 + iplug->b.shift_hook(&coord, from, count,
57877 + shift->wish_stop.node);
57878 + }
57879 + coord_add_item_pos(&coord, -shift->pend);
57880 + }
57881 + }
57882 +
57883 + return 0;
57884 +}
57885 +
57886 +/* shift to left is completed. Return 1 if unit @old was moved to left neighbor */
57887 +static int
57888 +unit_moved_left(const struct shift_params *shift, const coord_t * old)
57889 +{
57890 + assert("vs-944", shift->real_stop.node == old->node);
57891 +
57892 + if (shift->real_stop.item_pos < old->item_pos)
57893 + return 0;
57894 + if (shift->real_stop.item_pos == old->item_pos) {
57895 + if (shift->real_stop.unit_pos < old->unit_pos)
57896 + return 0;
57897 + }
57898 + return 1;
57899 +}
57900 +
57901 +/* shift to right is completed. Return 1 if unit @old was moved to right
57902 + neighbor */
57903 +static int
57904 +unit_moved_right(const struct shift_params *shift, const coord_t * old)
57905 +{
57906 + assert("vs-944", shift->real_stop.node == old->node);
57907 +
57908 + if (shift->real_stop.item_pos > old->item_pos)
57909 + return 0;
57910 + if (shift->real_stop.item_pos == old->item_pos) {
57911 + if (shift->real_stop.unit_pos > old->unit_pos)
57912 + return 0;
57913 + }
57914 + return 1;
57915 +}
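Both predicates above are a single lexicographic comparison: a unit moved if its (item_pos, unit_pos) lies on the shifted side of shift->real_stop. A minimal self-contained sketch of the left-shift case, with hypothetical types (not reiser4 code):

    /* unit_moved_left reduces to: did @old end up at or before @stop in
       (item, unit) order? */
    #include <assert.h>

    struct pos { int item_pos; int unit_pos; };

    /* returns 1 when @old lies at or before @stop in (item, unit) order */
    static int moved_left(struct pos stop, struct pos old)
    {
            if (stop.item_pos != old.item_pos)
                    return stop.item_pos > old.item_pos;
            return stop.unit_pos >= old.unit_pos;
    }

    int main(void)
    {
            struct pos stop = { 3, 2 };

            assert(moved_left(stop, (struct pos){ 2, 9 }));  /* earlier item: moved */
            assert(moved_left(stop, (struct pos){ 3, 2 }));  /* the stop unit itself */
            assert(!moved_left(stop, (struct pos){ 3, 3 })); /* past the stop: stayed */
            return 0;
    }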
57916 +
57917 +/* coord @old was set in node from which shift was performed. What was shifted
57918 + is stored in @shift. Update @old to reflect the performed shift */
57919 +static coord_t *adjust_coord2(const struct shift_params *shift,
57920 + const coord_t * old, coord_t * new)
57921 +{
57922 + coord_clear_iplug(new);
57923 + new->between = old->between;
57924 +
57926 + if (old->node == shift->target) {
57927 + if (shift->pend == SHIFT_LEFT) {
57928 + /* coord which is set inside of left neighbor does not
57929 + change during shift to left */
57930 + coord_dup(new, old);
57931 + return new;
57932 + }
57933 + new->node = old->node;
57934 + coord_set_item_pos(new,
57935 + old->item_pos + shift->entire +
57936 + (shift->part_units ? 1 : 0));
57937 + new->unit_pos = old->unit_pos;
57938 + if (old->item_pos == 0 && shift->merging_units)
57939 + new->unit_pos += shift->merging_units;
57940 + return new;
57941 + }
57942 +
57943 + assert("vs-977", old->node == shift->wish_stop.node);
57944 + if (shift->pend == SHIFT_LEFT) {
57945 + if (unit_moved_left(shift, old)) {
57946 + /* unit @old moved to left neighbor. Calculate its
57947 + coordinate there */
57948 + new->node = shift->target;
57949 + coord_set_item_pos(new,
57950 + node_num_items(shift->target) -
57951 + shift->entire -
57952 + (shift->part_units ? 1 : 0) +
57953 + old->item_pos);
57954 +
57955 + new->unit_pos = old->unit_pos;
57956 + if (shift->merging_units) {
57957 + coord_dec_item_pos(new);
57958 + if (old->item_pos == 0) {
57959 + /* unit_pos only changes if item got
57960 + merged */
57961 + new->unit_pos =
57962 + coord_num_units(new) -
57963 + (shift->merging_units -
57964 + old->unit_pos);
57965 + }
57966 + }
57967 + } else {
57968 + /* unit @old did not move to left neighbor.
57969 +
57970 + Use _nocheck, because @old is outside of its node.
57971 + */
57972 + coord_dup_nocheck(new, old);
57973 + coord_add_item_pos(new,
57974 + -shift->u.future_first.item_pos);
57975 + if (new->item_pos == 0)
57976 + new->unit_pos -= shift->u.future_first.unit_pos;
57977 + }
57978 + } else {
57979 + if (unit_moved_right(shift, old)) {
57980 + /* unit @old moved to right neighbor */
57981 + new->node = shift->target;
57982 + coord_set_item_pos(new,
57983 + old->item_pos -
57984 + shift->real_stop.item_pos);
57985 + if (new->item_pos == 0) {
57986 + /* unit @old might change unit pos */
57987 + new->unit_pos =
57988 + old->unit_pos -
57989 + shift->real_stop.unit_pos;
57990 + }
57991 + } else {
57992 + /* unit @old did not move to right neighbor, therefore
57993 + it did not change */
57994 + coord_dup(new, old);
57995 + }
57996 + }
57997 + coord_set_iplug(new, item_plugin_by_coord(new));
57998 + return new;
57999 +}
58000 +
58001 +/* this is called when a shift is completed (part of the source node has been
58002 + copied to the target and deleted in the source) to update all taps set in
58003 + the current context */
58004 +static void update_taps(const struct shift_params *shift)
58005 +{
58006 + tap_t *tap;
58007 + coord_t new;
58008 +
58009 + for_all_taps(tap) {
58010 + /* update only taps set to nodes participating in shift */
58011 + if (tap->coord->node == shift->wish_stop.node
58012 + || tap->coord->node == shift->target)
58013 + tap_to_coord(tap,
58014 + adjust_coord2(shift, tap->coord, &new));
58015 + }
58016 +}
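update_taps() walks every tap (a persistent cursor registered in the current context) and remaps the ones pointing into either node through adjust_coord2(). The toy sketch below shows the same remapping idea on flat arrays; all names are hypothetical:

    /* After @shifted elements move from the front of a source buffer to the
       end of a target buffer, saved cursors must be re-expressed. */
    #include <stdio.h>

    struct cursor { int buf; int pos; };    /* buf 0 = target, 1 = source */

    /* @target_len is how many elements the target held before the shift */
    static void remap(struct cursor *c, int shifted, int target_len)
    {
            if (c->buf == 1 && c->pos < shifted) {
                    c->buf = 0;                     /* now lives in target */
                    c->pos += target_len;
            } else if (c->buf == 1) {
                    c->pos -= shifted;              /* stayed, but slid forward */
            }
    }

    int main(void)
    {
            struct cursor a = { 1, 2 }, b = { 1, 7 };

            remap(&a, 5, 10);       /* 5 elements shifted into a 10-element target */
            remap(&b, 5, 10);
            printf("a: buf=%d pos=%d\n", a.buf, a.pos);     /* buf=0 pos=12 */
            printf("b: buf=%d pos=%d\n", b.buf, b.pos);     /* buf=1 pos=2 */
            return 0;
    }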
58017 +
58018 +#if REISER4_DEBUG
58019 +
58020 +struct shift_check {
58021 + reiser4_key key;
58022 + __u16 plugin_id;
58023 + union {
58024 + __u64 bytes;
58025 + __u64 entries;
58026 + void *unused;
58027 + } u;
58028 +};
58029 +
58030 +void *shift_check_prepare(const znode * left, const znode * right)
58031 +{
58032 + pos_in_node_t i, nr_items;
58033 + int mergeable;
58034 + struct shift_check *data;
58035 + item_header40 *ih;
58036 +
58037 + if (node_is_empty(left) || node_is_empty(right))
58038 + mergeable = 0;
58039 + else {
58040 + coord_t l, r;
58041 +
58042 + coord_init_last_unit(&l, left);
58043 + coord_init_first_unit(&r, right);
58044 + mergeable = are_items_mergeable(&l, &r);
58045 + }
58046 + nr_items =
58047 + node40_num_of_items_internal(left) +
58048 + node40_num_of_items_internal(right) - (mergeable ? 1 : 0);
58049 + data =
58050 + kmalloc(sizeof(struct shift_check) * nr_items, get_gfp_mask());
58051 + if (data != NULL) {
58052 + coord_t coord;
58053 + pos_in_node_t item_pos;
58054 +
58055 + coord_init_first_unit(&coord, left);
58056 + i = 0;
58057 +
58058 + for (item_pos = 0;
58059 + item_pos < node40_num_of_items_internal(left);
58060 + item_pos++) {
58061 +
58062 + coord_set_item_pos(&coord, item_pos);
58063 + ih = node40_ih_at_coord(&coord);
58064 +
58065 + data[i].key = ih->key;
58066 + data[i].plugin_id = le16_to_cpu(get_unaligned(&ih->plugin_id));
58067 + switch (data[i].plugin_id) {
58068 + case CTAIL_ID:
58069 + case FORMATTING_ID:
58070 + data[i].u.bytes = coord_num_units(&coord);
58071 + break;
58072 + case EXTENT_POINTER_ID:
58073 + data[i].u.bytes =
58074 + extent_size(&coord,
58075 + coord_num_units(&coord));
58076 + break;
58077 + case COMPOUND_DIR_ID:
58078 + data[i].u.entries = coord_num_units(&coord);
58079 + break;
58080 + default:
58081 + data[i].u.unused = NULL;
58082 + break;
58083 + }
58084 + i++;
58085 + }
58086 +
58087 + coord_init_first_unit(&coord, right);
58088 +
58089 + if (mergeable) {
58090 + assert("vs-1609", i != 0);
58091 +
58092 + ih = node40_ih_at_coord(&coord);
58093 +
58094 + assert("vs-1589",
58095 + data[i - 1].plugin_id ==
58096 + le16_to_cpu(get_unaligned(&ih->plugin_id)));
58097 + switch (data[i - 1].plugin_id) {
58098 + case CTAIL_ID:
58099 + case FORMATTING_ID:
58100 + data[i - 1].u.bytes += coord_num_units(&coord);
58101 + break;
58102 + case EXTENT_POINTER_ID:
58103 + data[i - 1].u.bytes +=
58104 + extent_size(&coord,
58105 + coord_num_units(&coord));
58106 + break;
58107 + case COMPOUND_DIR_ID:
58108 + data[i - 1].u.entries +=
58109 + coord_num_units(&coord);
58110 + break;
58111 + default:
58112 + impossible("vs-1605", "wrong mergeable item");
58113 + break;
58114 + }
58115 + item_pos = 1;
58116 + } else
58117 + item_pos = 0;
58118 + for (; item_pos < node40_num_of_items_internal(right);
58119 + item_pos++) {
58120 +
58121 + assert("vs-1604", i < nr_items);
58122 + coord_set_item_pos(&coord, item_pos);
58123 + ih = node40_ih_at_coord(&coord);
58124 +
58125 + data[i].key = ih->key;
58126 + data[i].plugin_id = le16_to_cpu(get_unaligned(&ih->plugin_id));
58127 + switch (data[i].plugin_id) {
58128 + case CTAIL_ID:
58129 + case FORMATTING_ID:
58130 + data[i].u.bytes = coord_num_units(&coord);
58131 + break;
58132 + case EXTENT_POINTER_ID:
58133 + data[i].u.bytes =
58134 + extent_size(&coord,
58135 + coord_num_units(&coord));
58136 + break;
58137 + case COMPOUND_DIR_ID:
58138 + data[i].u.entries = coord_num_units(&coord);
58139 + break;
58140 + default:
58141 + data[i].u.unused = NULL;
58142 + break;
58143 + }
58144 + i++;
58145 + }
58146 + assert("vs-1606", i == nr_items);
58147 + }
58148 + return data;
58149 +}
58150 +
58151 +void shift_check(void *vp, const znode * left, const znode * right)
58152 +{
58153 + pos_in_node_t i, nr_items;
58154 + coord_t coord;
58155 + __u64 last_bytes;
58156 + int mergeable;
58157 + item_header40 *ih;
58158 + pos_in_node_t item_pos;
58159 + struct shift_check *data;
58160 +
58161 + data = (struct shift_check *)vp;
58162 +
58163 + if (data == NULL)
58164 + return;
58165 +
58166 + if (node_is_empty(left) || node_is_empty(right))
58167 + mergeable = 0;
58168 + else {
58169 + coord_t l, r;
58170 +
58171 + coord_init_last_unit(&l, left);
58172 + coord_init_first_unit(&r, right);
58173 + mergeable = are_items_mergeable(&l, &r);
58174 + }
58175 +
58176 + nr_items =
58177 + node40_num_of_items_internal(left) +
58178 + node40_num_of_items_internal(right) - (mergeable ? 1 : 0);
58179 +
58180 + i = 0;
58181 + last_bytes = 0;
58182 +
58183 + coord_init_first_unit(&coord, left);
58184 +
58185 + for (item_pos = 0; item_pos < node40_num_of_items_internal(left);
58186 + item_pos++) {
58187 +
58188 + coord_set_item_pos(&coord, item_pos);
58189 + ih = node40_ih_at_coord(&coord);
58190 +
58191 + assert("vs-1611", i == item_pos);
58192 + assert("vs-1590", keyeq(&ih->key, &data[i].key));
58193 + assert("vs-1591",
58194 + le16_to_cpu(get_unaligned(&ih->plugin_id)) == data[i].plugin_id);
58195 + if ((i < (node40_num_of_items_internal(left) - 1))
58196 + || !mergeable) {
58197 + switch (data[i].plugin_id) {
58198 + case CTAIL_ID:
58199 + case FORMATTING_ID:
58200 + assert("vs-1592",
58201 + data[i].u.bytes ==
58202 + coord_num_units(&coord));
58203 + break;
58204 + case EXTENT_POINTER_ID:
58205 + assert("vs-1593",
58206 + data[i].u.bytes == extent_size(&coord,
58207 + coord_num_units
58208 + (&coord)));
58209 + break;
58210 + case COMPOUND_DIR_ID:
58211 + assert("vs-1594",
58212 + data[i].u.entries ==
58213 + coord_num_units(&coord));
58214 + break;
58215 + default:
58216 + break;
58217 + }
58218 + }
58219 + if (item_pos == (node40_num_of_items_internal(left) - 1)
58220 + && mergeable) {
58221 + switch (data[i].plugin_id) {
58222 + case CTAIL_ID:
58223 + case FORMATTING_ID:
58224 + last_bytes = coord_num_units(&coord);
58225 + break;
58226 + case EXTENT_POINTER_ID:
58227 + last_bytes =
58228 + extent_size(&coord,
58229 + coord_num_units(&coord));
58230 + break;
58231 + case COMPOUND_DIR_ID:
58232 + last_bytes = coord_num_units(&coord);
58233 + break;
58234 + default:
58235 + impossible("vs-1595", "wrong mergeable item");
58236 + break;
58237 + }
58238 + }
58239 + i++;
58240 + }
58241 +
58242 + coord_init_first_unit(&coord, right);
58243 + if (mergeable) {
58244 + ih = node40_ih_at_coord(&coord);
58245 +
58246 + assert("vs-1589",
58247 + data[i - 1].plugin_id == le16_to_cpu(get_unaligned(&ih->plugin_id)));
58248 + assert("vs-1608", last_bytes != 0);
58249 + switch (data[i - 1].plugin_id) {
58250 + case CTAIL_ID:
58251 + case FORMATTING_ID:
58252 + assert("vs-1596",
58253 + data[i - 1].u.bytes ==
58254 + last_bytes + coord_num_units(&coord));
58255 + break;
58256 +
58257 + case EXTENT_POINTER_ID:
58258 + assert("vs-1597",
58259 + data[i - 1].u.bytes ==
58260 + last_bytes + extent_size(&coord,
58261 + coord_num_units
58262 + (&coord)));
58263 + break;
58264 +
58265 + case COMPOUND_DIR_ID:
58266 + assert("vs-1598",
58267 + data[i - 1].u.bytes ==
58268 + last_bytes + coord_num_units(&coord));
58269 + break;
58270 + default:
58271 + impossible("vs-1599", "wrong mergeable item");
58272 + break;
58273 + }
58274 + item_pos = 1;
58275 + } else
58276 + item_pos = 0;
58277 +
58278 + for (; item_pos < node40_num_of_items_internal(right); item_pos++) {
58279 +
58280 + coord_set_item_pos(&coord, item_pos);
58281 + ih = node40_ih_at_coord(&coord);
58282 +
58283 + assert("vs-1612", keyeq(&ih->key, &data[i].key));
58284 + assert("vs-1613",
58285 + le16_to_cpu(get_unaligned(&ih->plugin_id)) == data[i].plugin_id);
58286 + switch (data[i].plugin_id) {
58287 + case CTAIL_ID:
58288 + case FORMATTING_ID:
58289 + assert("vs-1600",
58290 + data[i].u.bytes == coord_num_units(&coord));
58291 + break;
58292 + case EXTENT_POINTER_ID:
58293 + assert("vs-1601",
58294 + data[i].u.bytes == extent_size(&coord,
58295 + coord_num_units
58296 + (&coord)));
58297 + break;
58298 + case COMPOUND_DIR_ID:
58299 + assert("vs-1602",
58300 + data[i].u.entries == coord_num_units(&coord));
58301 + break;
58302 + default:
58303 + break;
58304 + }
58305 + i++;
58306 + }
58307 +
58308 + assert("vs-1603", i == nr_items);
58309 + kfree(data);
58310 +}
58311 +
58312 +#endif
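shift_check_prepare()/shift_check() above implement a capture-then-verify debug pattern: snapshot per-item invariants (key, plugin id, unit or byte counts) before the shift, recompute and assert them afterwards, and tolerate a failed snapshot allocation by skipping the check. A generic standalone sketch of the same pattern, under the assumption that the guarded operation must preserve a computable invariant:

    /* Record an invariant before a destructive operation, assert it after.
       Hypothetical example, not reiser4 code. */
    #include <assert.h>
    #include <stdlib.h>

    struct snapshot { long total; };

    static long sum(const int *a, int n)
    {
            long s = 0;

            for (int i = 0; i < n; i++)
                    s += a[i];
            return s;
    }

    static struct snapshot *check_prepare(const int *a, int n)
    {
            struct snapshot *sn = malloc(sizeof(*sn));

            if (sn != NULL)
                    sn->total = sum(a, n);  /* shifting moves content, never loses it */
            return sn;
    }

    static void check(struct snapshot *sn, const int *a, int n)
    {
            if (sn == NULL)
                    return;                 /* allocation failed: skip the check */
            assert(sn->total == sum(a, n));
            free(sn);
    }

    int main(void)
    {
            int v[4] = { 1, 2, 3, 4 };
            struct snapshot *sn = check_prepare(v, 4);

            /* a "shift": rearrange without losing content */
            int t = v[0]; v[0] = v[3]; v[3] = t;

            check(sn, v, 4);
            return 0;
    }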
58313 +
58314 +/* plugin->u.node.shift
58315 + look for description of this method in plugin/node/node.h */
58316 +int shift_node40(coord_t * from, znode * to, shift_direction pend, int delete_child, /* if @from->node becomes empty - it will be
58317 + deleted from the tree if this is set to 1 */
58318 + int including_stop_coord, carry_plugin_info * info)
58319 +{
58320 + struct shift_params shift;
58321 + int result;
58322 + znode *left, *right;
58323 + znode *source;
58324 + int target_empty;
58325 +
58326 + assert("nikita-2161", coord_check(from));
58327 +
58328 + memset(&shift, 0, sizeof(shift));
58329 + shift.pend = pend;
58330 + shift.wish_stop = *from;
58331 + shift.target = to;
58332 +
58333 + assert("nikita-1473", znode_is_write_locked(from->node));
58334 + assert("nikita-1474", znode_is_write_locked(to));
58335 +
58336 + source = from->node;
58337 +
58338 + /* set @shift.wish_stop to rightmost/leftmost unit among units we want
58339 + shifted */
58340 + if (pend == SHIFT_LEFT) {
58341 + result = coord_set_to_left(&shift.wish_stop);
58342 + left = to;
58343 + right = from->node;
58344 + } else {
58345 + result = coord_set_to_right(&shift.wish_stop);
58346 + left = from->node;
58347 + right = to;
58348 + }
58349 +
58350 + if (result) {
58351 + /* move insertion coord even if there is nothing to move */
58352 + if (including_stop_coord) {
58353 + /* move insertion coord (@from) */
58354 + if (pend == SHIFT_LEFT) {
58355 + /* after last item in target node */
58356 + coord_init_after_last_item(from, to);
58357 + } else {
58358 + /* before first item in target node */
58359 + coord_init_before_first_item(from, to);
58360 + }
58361 + }
58362 +
58363 + if (delete_child && node_is_empty(shift.wish_stop.node))
58364 + result =
58365 + prepare_removal_node40(shift.wish_stop.node, info);
58366 + else
58367 + result = 0;
58368 + /* there is nothing to shift */
58369 + assert("nikita-2078", coord_check(from));
58370 + return result;
58371 + }
58372 +
58373 + target_empty = node_is_empty(to);
58374 +
58375 + /* when first node plugin with item body compression is implemented,
58376 + this must be changed to call node specific plugin */
58377 +
58378 + /* shift->stop_coord is updated to last unit which really will be
58379 + shifted */
58380 + estimate_shift(&shift, get_current_context());
58381 + if (!shift.shift_bytes) {
58382 + /* we could not shift anything */
58383 + assert("nikita-2079", coord_check(from));
58384 + return 0;
58385 + }
58386 +
58387 + copy(&shift);
58388 +
58389 + /* result value of this is important. It is used by adjust_coord below */
58390 + result = delete_copied(&shift);
58391 +
58392 + assert("vs-1610", result >= 0);
58393 + assert("vs-1471",
58394 + ((reiser4_context *) current->journal_info)->magic ==
58395 + context_magic);
58396 +
58397 + /* an item which has been moved from one node to another might want
58398 + to react to that event. This can be done by the item's shift_hook
58399 + method, which is now called for every moved item */
58400 + call_shift_hooks(&shift);
58401 +
58402 + assert("vs-1472",
58403 + ((reiser4_context *) current->journal_info)->magic ==
58404 + context_magic);
58405 +
58406 + update_taps(&shift);
58407 +
58408 + assert("vs-1473",
58409 + ((reiser4_context *) current->journal_info)->magic ==
58410 + context_magic);
58411 +
58412 + /* adjust @from pointer in accordance with @including_stop_coord flag
58413 + and amount of data which was really shifted */
58414 + adjust_coord(from, &shift, result, including_stop_coord);
58415 +
58416 + if (target_empty)
58417 + /*
58418 + * items were shifted into empty node. Update delimiting key.
58419 + */
58420 + result = prepare_for_update(NULL, left, info);
58421 +
58422 + /* add update operation to @info, which is the list of operations to
58423 + be performed on a higher level */
58424 + result = prepare_for_update(left, right, info);
58425 + if (!result && node_is_empty(source) && delete_child) {
58426 + /* all contents of @from->node are moved to @to and @from->node
58427 + has to be removed from the tree, so on a higher level we
58428 + will be removing the pointer to node @from->node */
58429 + result = prepare_removal_node40(source, info);
58430 + }
58431 + assert("nikita-2080", coord_check(from));
58432 + return result ? result : (int)shift.shift_bytes;
58433 +}
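shift_node40() runs in fixed phases: estimate how much fits, copy it, delete the copied part from the source, run per-item shift hooks, remap taps, adjust the caller's coordinate, and queue delimiting-key/removal updates for the level above. The toy model below compresses the estimate/copy/delete/adjust core into array operations; it is an illustration with invented names, not the reiser4 algorithm itself:

    #include <stdio.h>
    #include <string.h>

    #define CAP 8

    struct node { int item[CAP]; int count; };

    /* shift as many leading items of @src into @dst as capacity allows;
       returns how many were moved */
    static int shift_left(struct node *dst, struct node *src, int *coord)
    {
            int room = CAP - dst->count;
            int n = src->count < room ? src->count : room;          /* estimate */

            memcpy(dst->item + dst->count, src->item, n * sizeof(int)); /* copy */
            dst->count += n;

            memmove(src->item, src->item + n,
                    (src->count - n) * sizeof(int));                /* delete copied */
            src->count -= n;

            *coord -= n;                                            /* adjust coord */
            return n;
    }

    int main(void)
    {
            struct node left = { { 1, 2, 3, 4, 5, 6 }, 6 };
            struct node right = { { 7, 8, 9, 10 }, 4 };
            int coord = 3;          /* index into right */

            int moved = shift_left(&left, &right, &coord);
            /* a negative coord means the unit now lives in the left node */
            printf("moved=%d, coord now %d in %s node\n", moved,
                   coord < 0 ? coord + left.count : coord,
                   coord < 0 ? "left" : "right");
            return 0;
    }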
58434 +
58435 +/* plugin->u.node.fast_insert()
58436 + look for description of this method in plugin/node/node.h */
58437 +int fast_insert_node40(const coord_t * coord UNUSED_ARG /* node to query */ )
58438 +{
58439 + return 1;
58440 +}
58441 +
58442 +/* plugin->u.node.fast_paste()
58443 + look for description of this method in plugin/node/node.h */
58444 +int fast_paste_node40(const coord_t * coord UNUSED_ARG /* node to query */ )
58445 +{
58446 + return 1;
58447 +}
58448 +
58449 +/* plugin->u.node.fast_cut()
58450 + look for description of this method in plugin/node/node.h */
58451 +int fast_cut_node40(const coord_t * coord UNUSED_ARG /* node to query */ )
58452 +{
58453 + return 1;
58454 +}
58455 +
58456 +/* plugin->u.node.modify - not defined */
58457 +
58458 +/* plugin->u.node.max_item_size */
58459 +int max_item_size_node40(void)
58460 +{
58461 + return reiser4_get_current_sb()->s_blocksize - sizeof(node40_header) -
58462 + sizeof(item_header40);
58463 +}
58464 +
58465 +/* plugin->u.node.set_item_plugin */
58466 +int set_item_plugin_node40(coord_t *coord, item_id id)
58467 +{
58468 + item_header40 *ih;
58469 +
58470 + ih = node40_ih_at_coord(coord);
58471 + put_unaligned(cpu_to_le16(id), &ih->plugin_id);
58472 + coord->iplugid = id;
58473 + return 0;
58474 +}
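set_item_plugin_node40() stores a 16-bit plugin id into an on-disk item header, which may sit at any byte offset; hence cpu_to_le16() plus put_unaligned(). The standalone sketch below shows what that pair accomplishes, using plain byte stores so it is endianness- and alignment-safe on any host:

    #include <stdio.h>
    #include <stdint.h>

    static void put_le16(uint8_t *p, uint16_t v)
    {
            p[0] = v & 0xff;                /* least significant byte first */
            p[1] = v >> 8;
    }

    static uint16_t get_le16(const uint8_t *p)
    {
            return (uint16_t)(p[0] | (p[1] << 8));
    }

    int main(void)
    {
            uint8_t buf[3];

            put_le16(buf + 1, 0x1234);      /* deliberately unaligned offset */
            printf("read back: 0x%04x\n", get_le16(buf + 1));
            return 0;
    }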
58475 +
58476 +/*
58477 + Local variables:
58478 + c-indentation-style: "K&R"
58479 + mode-name: "LC"
58480 + c-basic-offset: 8
58481 + tab-width: 8
58482 + fill-column: 120
58483 + scroll-step: 1
58484 + End:
58485 +*/
58486 Index: linux-2.6.16/fs/reiser4/plugin/node/node40.h
58487 ===================================================================
58488 --- /dev/null
58489 +++ linux-2.6.16/fs/reiser4/plugin/node/node40.h
58490 @@ -0,0 +1,125 @@
58491 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
58492 +
58493 +#if !defined( __REISER4_NODE40_H__ )
58494 +#define __REISER4_NODE40_H__
58495 +
58496 +#include "../../forward.h"
58497 +#include "../../dformat.h"
58498 +#include "node.h"
58499 +
58500 +#include <linux/types.h>
58501 +
58502 +/* format of the node header for the node40 layout. Keep bloat out of this struct. */
58503 +typedef struct node40_header {
58504 + /* identifier of node plugin. Must be located at the very beginning
58505 + of a node. */
58506 + common_node_header common_header; /* this is 16 bits */
58507 + /* number of items. Should be the first element in the node header,
58508 + because we have not yet finally decided whether it should go into
58509 + common_header.
58510 + */
58511 +/* NIKITA-FIXME-HANS: Create a macro such that if there is only one
58512 + * node format at compile time, and it is this one, accesses do not go through a function-pointer dereference when
58513 + * accessing these fields (and otherwise they do). Probably 80% of users will only have one node format at a time throughout the life of reiser4. */
58514 + d16 nr_items;
58515 + /* free space in node measured in bytes */
58516 + d16 free_space;
58517 + /* offset to start of free space in node */
58518 + d16 free_space_start;
58519 + /* for reiser4_fsck. When information about what is a free
58520 + block is corrupted, and we try to recover everything even
58521 + if marked as freed, then old versions of data may
58522 + duplicate newer versions, and this field allows us to
58523 + restore the newer version. Also useful for when users
58524 + who don't have the new trashcan installed on their linux distro
58525 + delete the wrong files and send us desperate emails
58526 + offering $25 for them back. */
58527 +
58528 + /* magic number used to recognize formatted nodes. NIKITA-FIXME-HANS: improve this comment */
58529 + d32 magic;
58530 + /* flushstamp is made of mk_id and write_counter. mk_id is an
58531 + id generated randomly at mkreiserfs time. So we can just
58532 + skip all nodes with different mk_id. write_counter is a d64
58533 + incrementing counter of writes to disk. It is used for
58534 + choosing the newest data at fsck time. NIKITA-FIXME-HANS: why was field name changed but not comment? */
58535 +
58536 + d32 mkfs_id;
58537 + d64 flush_id;
58538 + /* node flags to be used by fsck (reiser4ck or reiser4fsck?)
58539 + and repacker NIKITA-FIXME-HANS: say more or reference elsewhere that says more */
58540 + d16 flags;
58541 +
58542 + /* 1 is leaf level, 2 is twig level, root is the numerically
58543 + largest level */
58544 + d8 level;
58545 +
58546 + d8 pad;
58547 +} PACKED node40_header;
58548 +
58549 +/* item headers are not standard across all node layouts, pass
58550 + pos_in_node to functions instead */
58551 +typedef struct item_header40 {
58552 + /* key of item */
58553 + /* 0 */ reiser4_key key;
58554 + /* offset from start of a node measured in 8-byte chunks */
58555 + /* 24 */ d16 offset;
58556 + /* 26 */ d16 flags;
58557 + /* 28 */ d16 plugin_id;
58558 +} PACKED item_header40;
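Taken together, these two structures determine the per-node overhead used by max_item_size_node40() earlier in this patch. Assuming the PACKED layouts introduce no padding, node40_header occupies 2 + 2 + 2 + 2 + 4 + 4 + 8 + 2 + 1 + 1 = 28 bytes and item_header40 occupies 24 + 2 + 2 + 2 = 30 bytes (consistent with the byte offsets in its comments), so on a 4096-byte block the largest possible single item is 4096 - 28 - 30 = 4038 bytes.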
58559 +
58560 +size_t item_overhead_node40(const znode * node, flow_t * aflow);
58561 +size_t free_space_node40(znode * node);
58562 +node_search_result lookup_node40(znode * node, const reiser4_key * key,
58563 + lookup_bias bias, coord_t * coord);
58564 +int num_of_items_node40(const znode * node);
58565 +char *item_by_coord_node40(const coord_t * coord);
58566 +int length_by_coord_node40(const coord_t * coord);
58567 +item_plugin *plugin_by_coord_node40(const coord_t * coord);
58568 +reiser4_key *key_at_node40(const coord_t * coord, reiser4_key * key);
58569 +size_t estimate_node40(znode * node);
58570 +int check_node40(const znode * node, __u32 flags, const char **error);
58571 +int parse_node40(znode * node);
58572 +int init_node40(znode * node);
58573 +#ifdef GUESS_EXISTS
58574 +int guess_node40(const znode * node);
58575 +#endif
58576 +void change_item_size_node40(coord_t * coord, int by);
58577 +int create_item_node40(coord_t * target, const reiser4_key * key,
58578 + reiser4_item_data * data, carry_plugin_info * info);
58579 +void update_item_key_node40(coord_t * target, const reiser4_key * key,
58580 + carry_plugin_info * info);
58581 +int kill_node40(struct carry_kill_data *, carry_plugin_info *);
58582 +int cut_node40(struct carry_cut_data *, carry_plugin_info *);
58583 +int shift_node40(coord_t * from, znode * to, shift_direction pend,
58584 + /* if @from->node becomes
58585 + empty - it will be deleted from
58586 + the tree if this is set to 1
58587 + */
58588 + int delete_child, int including_stop_coord,
58589 + carry_plugin_info * info);
58590 +
58591 +int fast_insert_node40(const coord_t * coord);
58592 +int fast_paste_node40(const coord_t * coord);
58593 +int fast_cut_node40(const coord_t * coord);
58594 +int max_item_size_node40(void);
58595 +int prepare_removal_node40(znode * empty, carry_plugin_info * info);
58596 +int set_item_plugin_node40(coord_t * coord, item_id id);
58597 +int shrink_item_node40(coord_t * coord, int delta);
58598 +
58599 +#if REISER4_DEBUG
58600 +void *shift_check_prepare(const znode *left, const znode *right);
58601 +void shift_check(void *vp, const znode *left, const znode *right);
58602 +#endif
58603 +
58604 +/* __REISER4_NODE40_H__ */
58605 +#endif
58606 +/*
58607 + Local variables:
58608 + c-indentation-style: "K&R"
58609 + mode-name: "LC"
58610 + c-basic-offset: 8
58611 + tab-width: 8
58612 + fill-column: 120
58613 + scroll-step: 1
58614 + End:
58615 +*/
58616 Index: linux-2.6.16/fs/reiser4/plugin/object.c
58617 ===================================================================
58618 --- /dev/null
58619 +++ linux-2.6.16/fs/reiser4/plugin/object.c
58620 @@ -0,0 +1,501 @@
58621 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
58622 + * reiser4/README */
58623 +
58624 +/*
58625 + * Examples of object plugins: file, directory, symlink, special file.
58626 + *
58627 + * Plugins associated with inode:
58628 + *
58629 + * Plugin of inode is plugin referenced by plugin-id field of on-disk
58630 + * stat-data. How we store this plugin in in-core inode is not
58631 + * important. Currently pointers are used, another variant is to store offsets
58632 + * and do array lookup on each access.
58633 + *
58634 + * Now, each inode has one selected plugin: object plugin that
58635 + * determines what type of file this object is: directory, regular etc.
58636 + *
58637 + * This main plugin can use other plugins that are thus subordinated to
58638 + * it. Directory instance of object plugin uses hash; regular file
58639 + * instance uses tail policy plugin.
58640 + *
58641 + * Object plugin is either taken from id in stat-data or guessed from
58642 + * i_mode bits. Once it is established we ask it to install its
58643 + * subordinate plugins, by looking again in stat-data or inheriting them
58644 + * from parent.
58645 + *
58646 + * How new inode is initialized during ->read_inode():
58647 + * 1 read stat-data and initialize inode fields: i_size, i_mode,
58648 + * i_generation, capabilities etc.
58649 + * 2 read plugin id from stat data or try to guess plugin id
58650 + * from inode->i_mode bits if plugin id is missing.
58651 + * 3 Call ->init_inode() method of stat-data plugin to initialise inode fields.
58652 + *
58653 + * NIKITA-FIXME-HANS: can you say a little about 1 being done before 3? What
58654 + * if stat data does contain i_size, etc., due to it being an unusual plugin?
58655 + *
58656 + * 4 Call ->activate() method of object's plugin. Plugin is either read
58657 + * from stat-data or guessed from mode bits
58658 + * 5 Call ->inherit() method of object plugin to inherit as-yet-uninitialized
58659 + * plugins from parent.
58660 + *
58661 + * A simple induction shows that after the last step all plugins of the
58662 + * inode are initialized.
58663 + *
58664 + * When creating new object:
58665 + * 1 obtain object plugin id (see next period)
58666 + * NIKITA-FIXME-HANS: period?
58667 + * 2 ->install() this plugin
58668 + * 3 ->inherit() the rest from the parent
58669 + *
58670 + * We need some examples of creating an object with default and non-default
58671 + * plugin ids. Nikita, please create them.
58672 + */
58673 +
58674 +#include "../inode.h"
58675 +
58676 +static int _bugop(void)
58677 +{
58678 + BUG_ON(1);
58679 + return 0;
58680 +}
58681 +
58682 +#define bugop ((void *)_bugop)
58683 +
58684 +static int _dummyop(void)
58685 +{
58686 + return 0;
58687 +}
58688 +
58689 +#define dummyop ((void *)_dummyop)
58690 +
58691 +static int change_file(struct inode *inode, reiser4_plugin * plugin)
58692 +{
58693 + /* cannot change object plugin of already existing object */
58694 + return RETERR(-EINVAL);
58695 +}
58696 +
58697 +static reiser4_plugin_ops file_plugin_ops = {
58698 + .change = change_file
58699 +};
58700 +
58701 +/*
58702 + * Definitions of object plugins.
58703 + */
58704 +
58705 +file_plugin file_plugins[LAST_FILE_PLUGIN_ID] = {
58706 + [UNIX_FILE_PLUGIN_ID] = {
58707 + .h = {
58708 + .type_id = REISER4_FILE_PLUGIN_TYPE,
58709 + .id = UNIX_FILE_PLUGIN_ID,
58710 + .pops = &file_plugin_ops,
58711 + .label = "reg",
58712 + .desc = "regular file",
58713 + .linkage = {NULL, NULL},
58714 + },
58715 + .inode_ops = {
58716 + .permission = permission_common,
58717 + .setattr = setattr_unix_file,
58718 + .getattr = getattr_common
58719 + },
58720 + .file_ops = {
58721 + .llseek = generic_file_llseek,
58722 + .read = read_unix_file,
58723 + .write = write_unix_file,
58724 + .ioctl = ioctl_unix_file,
58725 + .mmap = mmap_unix_file,
58726 + .open = open_unix_file,
58727 + .release = release_unix_file,
58728 + .fsync = sync_unix_file,
58729 + .sendfile = sendfile_unix_file
58730 + },
58731 + .as_ops = {
58732 + .writepage = reiser4_writepage,
58733 + .readpage = readpage_unix_file,
58734 + .sync_page = block_sync_page,
58735 + .writepages = writepages_unix_file,
58736 + .set_page_dirty = reiser4_set_page_dirty,
58737 + .readpages = reiser4_readpages,
58738 + .prepare_write = prepare_write_unix_file,
58739 + .commit_write = commit_write_unix_file,
58740 + .bmap = bmap_unix_file,
58741 + .invalidatepage = reiser4_invalidatepage,
58742 + .releasepage = reiser4_releasepage
58743 + },
58744 + .write_sd_by_inode = write_sd_by_inode_common,
58745 + .flow_by_inode = flow_by_inode_unix_file,
58746 + .key_by_inode = key_by_inode_and_offset_common,
58747 + .set_plug_in_inode = set_plug_in_inode_common,
58748 + .adjust_to_parent = adjust_to_parent_common,
58749 + .create_object = create_object_common, /* this is not inode_operations's create */
58750 + .delete_object = delete_object_unix_file,
58751 + .add_link = add_link_common,
58752 + .rem_link = rem_link_common,
58753 + .owns_item = owns_item_unix_file,
58754 + .can_add_link = can_add_link_common,
58755 + .detach = dummyop,
58756 + .bind = dummyop,
58757 + .safelink = safelink_common,
58758 + .estimate = {
58759 + .create = estimate_create_common,
58760 + .update = estimate_update_common,
58761 + .unlink = estimate_unlink_common
58762 + },
58763 + .init_inode_data = init_inode_data_unix_file,
58764 + .cut_tree_worker = cut_tree_worker_common,
58765 + .wire = {
58766 + .write = wire_write_common,
58767 + .read = wire_read_common,
58768 + .get = wire_get_common,
58769 + .size = wire_size_common,
58770 + .done = wire_done_common
58771 + }
58772 + },
58773 + [DIRECTORY_FILE_PLUGIN_ID] = {
58774 + .h = {
58775 + .type_id = REISER4_FILE_PLUGIN_TYPE,
58776 + .id = DIRECTORY_FILE_PLUGIN_ID,
58777 + .pops = &file_plugin_ops,
58778 + .label = "dir",
58779 + .desc = "directory",
58780 + .linkage = {NULL, NULL}
58781 + },
58782 + .inode_ops = {NULL,},
58783 + .file_ops = {NULL,},
58784 + .as_ops = {NULL,},
58785 +
58786 + .write_sd_by_inode = write_sd_by_inode_common,
58787 + .flow_by_inode = bugop,
58788 + .key_by_inode = bugop,
58789 + .set_plug_in_inode = set_plug_in_inode_common,
58790 + .adjust_to_parent = adjust_to_parent_common_dir,
58791 + .create_object = create_object_common,
58792 + .delete_object = delete_directory_common,
58793 + .add_link = add_link_common,
58794 + .rem_link = rem_link_common_dir,
58795 + .owns_item = owns_item_common_dir,
58796 + .can_add_link = can_add_link_common,
58797 + .can_rem_link = can_rem_link_common_dir,
58798 + .detach = detach_common_dir,
58799 + .bind = bind_common_dir,
58800 + .safelink = safelink_common,
58801 + .estimate = {
58802 + .create = estimate_create_common_dir,
58803 + .update = estimate_update_common,
58804 + .unlink = estimate_unlink_common_dir
58805 + },
58806 + .wire = {
58807 + .write = wire_write_common,
58808 + .read = wire_read_common,
58809 + .get = wire_get_common,
58810 + .size = wire_size_common,
58811 + .done = wire_done_common
58812 + },
58813 + .init_inode_data = init_inode_ordering,
58814 + .cut_tree_worker = cut_tree_worker_common,
58815 + },
58816 + [SYMLINK_FILE_PLUGIN_ID] = {
58817 + .h = {
58818 + .type_id = REISER4_FILE_PLUGIN_TYPE,
58819 + .id = SYMLINK_FILE_PLUGIN_ID,
58820 + .pops = &file_plugin_ops,
58821 + .label = "symlink",
58822 + .desc = "symbolic link",
58823 + .linkage = {NULL,NULL}
58824 + },
58825 + .inode_ops = {
58826 + .readlink = generic_readlink,
58827 + .follow_link = follow_link_common,
58828 + .permission = permission_common,
58829 + .setattr = setattr_common,
58830 + .getattr = getattr_common
58831 + },
58832 + /* inode->i_fop of symlink is initialized to NULL in setup_inode_ops */
58833 + .file_ops = {NULL,},
58834 + .as_ops = {NULL,},
58835 +
58836 + .write_sd_by_inode = write_sd_by_inode_common,
58837 + .set_plug_in_inode = set_plug_in_inode_common,
58838 + .adjust_to_parent = adjust_to_parent_common,
58839 + .create_object = create_symlink,
58840 + .delete_object = delete_object_common,
58841 + .add_link = add_link_common,
58842 + .rem_link = rem_link_common,
58843 + .can_add_link = can_add_link_common,
58844 + .detach = dummyop,
58845 + .bind = dummyop,
58846 + .safelink = safelink_common,
58847 + .estimate = {
58848 + .create = estimate_create_common,
58849 + .update = estimate_update_common,
58850 + .unlink = estimate_unlink_common
58851 + },
58852 + .init_inode_data = init_inode_ordering,
58853 + .cut_tree_worker = cut_tree_worker_common,
58854 + .destroy_inode = destroy_inode_symlink,
58855 + .wire = {
58856 + .write = wire_write_common,
58857 + .read = wire_read_common,
58858 + .get = wire_get_common,
58859 + .size = wire_size_common,
58860 + .done = wire_done_common
58861 + }
58862 + },
58863 + [SPECIAL_FILE_PLUGIN_ID] = {
58864 + .h = {
58865 + .type_id = REISER4_FILE_PLUGIN_TYPE,
58866 + .id = SPECIAL_FILE_PLUGIN_ID,
58867 + .pops = &file_plugin_ops,
58868 + .label = "special",
58869 + .desc =
58870 + "special: fifo, device or socket",
58871 + .linkage = {NULL, NULL}
58872 + },
58873 + .inode_ops = {
58874 + .permission = permission_common,
58875 + .setattr = setattr_common,
58876 + .getattr = getattr_common
58877 + },
58878 + /* file_ops of special files (sockets, block, char, fifo) are
58879 + initialized by init_special_inode. */
58880 + .file_ops = {NULL,},
58881 + .as_ops = {NULL,},
58882 +
58883 + .write_sd_by_inode = write_sd_by_inode_common,
58884 + .set_plug_in_inode = set_plug_in_inode_common,
58885 + .adjust_to_parent = adjust_to_parent_common,
58886 + .create_object = create_object_common,
58887 + .delete_object = delete_object_common,
58888 + .add_link = add_link_common,
58889 + .rem_link = rem_link_common,
58890 + .owns_item = owns_item_common,
58891 + .can_add_link = can_add_link_common,
58892 + .detach = dummyop,
58893 + .bind = dummyop,
58894 + .safelink = safelink_common,
58895 + .estimate = {
58896 + .create = estimate_create_common,
58897 + .update = estimate_update_common,
58898 + .unlink = estimate_unlink_common
58899 + },
58900 + .init_inode_data = init_inode_ordering,
58901 + .cut_tree_worker = cut_tree_worker_common,
58902 + .wire = {
58903 + .write = wire_write_common,
58904 + .read = wire_read_common,
58905 + .get = wire_get_common,
58906 + .size = wire_size_common,
58907 + .done = wire_done_common
58908 + }
58909 + },
58910 + [CRC_FILE_PLUGIN_ID] = {
58911 + .h = {
58912 + .type_id = REISER4_FILE_PLUGIN_TYPE,
58913 + .id = CRC_FILE_PLUGIN_ID,
58914 + .pops = &cryptcompress_plugin_ops,
58915 + .label = "cryptcompress",
58916 + .desc = "cryptcompress file",
58917 + .linkage = {NULL, NULL}
58918 + },
58919 + .inode_ops = {
58920 + .permission = permission_common,
58921 + .setattr = setattr_cryptcompress,
58922 + .getattr = getattr_common
58923 + },
58924 + .file_ops = {
58925 + .llseek = generic_file_llseek,
58926 + .read = read_cryptcompress,
58927 + .write = write_cryptcompress,
58928 + .mmap = mmap_cryptcompress,
58929 + .release = release_cryptcompress,
58930 + .fsync = sync_common,
58931 + .sendfile = sendfile_cryptcompress
58932 + },
58933 + .as_ops = {
58934 + .writepage = reiser4_writepage,
58935 + .readpage = readpage_cryptcompress,
58936 + .sync_page = block_sync_page,
58937 + .writepages = writepages_cryptcompress,
58938 + .set_page_dirty = reiser4_set_page_dirty,
58939 + .readpages = reiser4_readpages,
58940 + .prepare_write = prepare_write_common,
58941 + .invalidatepage = reiser4_invalidatepage,
58942 + .releasepage = reiser4_releasepage
58943 + },
58944 + .write_sd_by_inode = write_sd_by_inode_common,
58945 + .flow_by_inode = flow_by_inode_cryptcompress,
58946 + .key_by_inode = key_by_inode_cryptcompress,
58947 + .set_plug_in_inode = set_plug_in_inode_common,
58948 + .adjust_to_parent = adjust_to_parent_cryptcompress,
58949 + .create_object = create_cryptcompress,
58950 + .open_object = open_cryptcompress,
58951 + .delete_object = delete_cryptcompress,
58952 + .add_link = add_link_common,
58953 + .rem_link = rem_link_common,
58954 + .owns_item = owns_item_common,
58955 + .can_add_link = can_add_link_common,
58956 + .detach = dummyop,
58957 + .bind = dummyop,
58958 + .safelink = safelink_common,
58959 + .estimate = {
58960 + .create = estimate_create_common,
58961 + .update = estimate_update_common,
58962 + .unlink = estimate_unlink_common
58963 + },
58964 + .init_inode_data = init_inode_data_cryptcompress,
58965 + .cut_tree_worker = cut_tree_worker_cryptcompress,
58966 + .destroy_inode = destroy_inode_cryptcompress,
58967 + .wire = {
58968 + .write = wire_write_common,
58969 + .read = wire_read_common,
58970 + .get = wire_get_common,
58971 + .size = wire_size_common,
58972 + .done = wire_done_common
58973 + }
58974 + }
58975 +};
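file_plugins[] above is a classic C99 designated-initializer vtable table: one operation vector per plugin id, and callers dispatch through the entry selected by the id stored in the object's stat-data. A minimal self-contained sketch of the pattern (hypothetical plugin set, not the reiser4 types):

    #include <stdio.h>

    enum demo_plugin_id { DEMO_REG, DEMO_DIR, DEMO_LAST };

    struct demo_plugin {
            const char *label;
            int (*can_add_link)(void);
    };

    static int yes(void) { return 1; }
    static int no(void)  { return 0; }

    /* designated initializers keep the table readable and id-indexed */
    static struct demo_plugin demo_plugins[DEMO_LAST] = {
            [DEMO_REG] = { .label = "reg", .can_add_link = yes },
            [DEMO_DIR] = { .label = "dir", .can_add_link = no },
    };

    int main(void)
    {
            /* dispatch through the table, as reiser4 does via file_plugins[] */
            for (int id = 0; id < DEMO_LAST; id++)
                    printf("%s: can_add_link=%d\n", demo_plugins[id].label,
                           demo_plugins[id].can_add_link());
            return 0;
    }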
58976 +
58977 +static int change_dir(struct inode *inode, reiser4_plugin * plugin)
58978 +{
58979 + /* cannot change dir plugin of already existing object */
58980 + return RETERR(-EINVAL);
58981 +}
58982 +
58983 +static reiser4_plugin_ops dir_plugin_ops = {
58984 + .change = change_dir
58985 +};
58986 +
58987 +/*
58988 + * definition of directory plugins
58989 + */
58990 +
58991 +dir_plugin dir_plugins[LAST_DIR_ID] = {
58992 + /* standard hashed directory plugin */
58993 + [HASHED_DIR_PLUGIN_ID] = {
58994 + .h = {
58995 + .type_id = REISER4_DIR_PLUGIN_TYPE,
58996 + .id = HASHED_DIR_PLUGIN_ID,
58997 + .pops = &dir_plugin_ops,
58998 + .label = "dir",
58999 + .desc = "hashed directory",
59000 + .linkage = {NULL, NULL}
59001 + },
59002 + .inode_ops = {
59003 + .create = create_common,
59004 + .lookup = lookup_common,
59005 + .link = link_common,
59006 + .unlink = unlink_common,
59007 + .symlink = symlink_common,
59008 + .mkdir = mkdir_common,
59009 + .rmdir = unlink_common,
59010 + .mknod = mknod_common,
59011 + .rename = rename_common,
59012 + .permission = permission_common,
59013 + .setattr = setattr_common,
59014 + .getattr = getattr_common
59015 + },
59016 + .file_ops = {
59017 + .llseek = llseek_common_dir,
59018 + .read = generic_read_dir,
59019 + .readdir = readdir_common,
59020 + .release = release_dir_common,
59021 + .fsync = sync_common
59022 + },
59023 + .as_ops = {
59024 + .writepage = bugop,
59025 + .sync_page = bugop,
59026 + .writepages = dummyop,
59027 + .set_page_dirty = bugop,
59028 + .readpages = bugop,
59029 + .prepare_write = bugop,
59030 + .commit_write = bugop,
59031 + .bmap = bugop,
59032 + .invalidatepage = bugop,
59033 + .releasepage = bugop
59034 + },
59035 + .get_parent = get_parent_common,
59036 + .is_name_acceptable = is_name_acceptable_common,
59037 + .build_entry_key = build_entry_key_hashed,
59038 + .build_readdir_key = build_readdir_key_common,
59039 + .add_entry = add_entry_common,
59040 + .rem_entry = rem_entry_common,
59041 + .init = init_common,
59042 + .done = done_common,
59043 + .attach = attach_common,
59044 + .detach = detach_common,
59045 + .estimate = {
59046 + .add_entry = estimate_add_entry_common,
59047 + .rem_entry = estimate_rem_entry_common,
59048 + .unlink = dir_estimate_unlink_common
59049 + }
59050 + },
59051 + /* hashed directory for which seekdir/telldir are guaranteed to
59052 + * work. Brain-damage. */
59053 + [SEEKABLE_HASHED_DIR_PLUGIN_ID] = {
59054 + .h = {
59055 + .type_id = REISER4_DIR_PLUGIN_TYPE,
59056 + .id = SEEKABLE_HASHED_DIR_PLUGIN_ID,
59057 + .pops = &dir_plugin_ops,
59058 + .label = "dir32",
59059 + .desc = "directory hashed with 31 bit hash",
59060 + .linkage = {NULL, NULL}
59061 + },
59062 + .inode_ops = {
59063 + .create = create_common,
59064 + .lookup = lookup_common,
59065 + .link = link_common,
59066 + .unlink = unlink_common,
59067 + .symlink = symlink_common,
59068 + .mkdir = mkdir_common,
59069 + .rmdir = unlink_common,
59070 + .mknod = mknod_common,
59071 + .rename = rename_common,
59072 + .permission = permission_common,
59073 + .setattr = setattr_common,
59074 + .getattr = getattr_common
59075 + },
59076 + .file_ops = {
59077 + .llseek = llseek_common_dir,
59078 + .read = generic_read_dir,
59079 + .readdir = readdir_common,
59080 + .release = release_dir_common,
59081 + .fsync = sync_common
59082 + },
59083 + .as_ops = {
59084 + .writepage = bugop,
59085 + .sync_page = bugop,
59086 + .writepages = dummyop,
59087 + .set_page_dirty = bugop,
59088 + .readpages = bugop,
59089 + .prepare_write = bugop,
59090 + .commit_write = bugop,
59091 + .bmap = bugop,
59092 + .invalidatepage = bugop,
59093 + .releasepage = bugop
59094 + },
59095 + .get_parent = get_parent_common,
59096 + .is_name_acceptable = is_name_acceptable_common,
59097 + .build_entry_key = build_entry_key_seekable,
59098 + .build_readdir_key = build_readdir_key_common,
59099 + .add_entry = add_entry_common,
59100 + .rem_entry = rem_entry_common,
59101 + .init = init_common,
59102 + .done = done_common,
59103 + .attach = attach_common,
59104 + .detach = detach_common,
59105 + .estimate = {
59106 + .add_entry = estimate_add_entry_common,
59107 + .rem_entry = estimate_rem_entry_common,
59108 + .unlink = dir_estimate_unlink_common
59109 + }
59110 + }
59111 +};
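Note that the seekable variant differs from the standard hashed directory only in its label, description and .build_entry_key (build_entry_key_seekable versus build_entry_key_hashed); presumably the 31-bit hash keeps directory positions representable in the cookies that seekdir/telldir expose to userspace, at the cost of a smaller hash space.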
59112 +
59113 +/* Make Linus happy.
59114 + Local variables:
59115 + c-indentation-style: "K&R"
59116 + mode-name: "LC"
59117 + c-basic-offset: 8
59118 + tab-width: 8
59119 + fill-column: 120
59120 + End:
59121 +*/
59122 Index: linux-2.6.16/fs/reiser4/plugin/object.h
59123 ===================================================================
59124 --- /dev/null
59125 +++ linux-2.6.16/fs/reiser4/plugin/object.h
59126 @@ -0,0 +1,121 @@
59127 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by
59128 + * reiser4/README */
59129 +
59130 +/* Declaration of object plugin functions. */
59131 +
59132 +#if !defined( __FS_REISER4_PLUGIN_OBJECT_H__ )
59133 +#define __FS_REISER4_PLUGIN_OBJECT_H__
59134 +
59135 +#include "../type_safe_hash.h"
59136 +
59137 +/* common implementations of inode operations */
59138 +int create_common(struct inode *parent, struct dentry *dentry,
59139 + int mode, struct nameidata *);
59140 +struct dentry *lookup_common(struct inode *parent, struct dentry *dentry,
59141 + struct nameidata *nameidata);
59142 +int link_common(struct dentry *existing, struct inode *parent,
59143 + struct dentry *newname);
59144 +int unlink_common(struct inode *parent, struct dentry *victim);
59145 +int mkdir_common(struct inode *parent, struct dentry *dentry, int mode);
59146 +int symlink_common(struct inode *parent, struct dentry *dentry,
59147 + const char *linkname);
59148 +int mknod_common(struct inode *parent, struct dentry *dentry,
59149 + int mode, dev_t rdev);
59150 +int rename_common(struct inode *old_dir, struct dentry *old_name,
59151 + struct inode *new_dir, struct dentry *new_name);
59152 +void *follow_link_common(struct dentry *, struct nameidata *data);
59153 +int permission_common(struct inode *, int mask, /* mode bits to check permissions for */
59154 + struct nameidata *nameidata);
59155 +int setattr_common(struct dentry *, struct iattr *);
59156 +int getattr_common(struct vfsmount *mnt, struct dentry *, struct kstat *);
59157 +
59158 +/* common implementations of file operations */
59159 +loff_t llseek_common_dir(struct file *, loff_t off, int origin);
59160 +int readdir_common(struct file *, void *dirent, filldir_t);
59161 +int release_dir_common(struct inode *, struct file *);
59162 +int sync_common(struct file *, struct dentry *, int datasync);
59163 +
59164 +/* common implementations of address space operations */
59165 +int prepare_write_common(struct file *, struct page *, unsigned from,
59166 + unsigned to);
59167 +
59168 +/* file plugin operations: common implementations */
59169 +int write_sd_by_inode_common(struct inode *);
59170 +int key_by_inode_and_offset_common(struct inode *, loff_t, reiser4_key *);
59171 +int set_plug_in_inode_common(struct inode *object, struct inode *parent,
59172 + reiser4_object_create_data *);
59173 +int adjust_to_parent_common(struct inode *object, struct inode *parent,
59174 + struct inode *root);
59175 +int adjust_to_parent_common_dir(struct inode *object, struct inode *parent,
59176 + struct inode *root);
59177 +int adjust_to_parent_cryptcompress(struct inode *object, struct inode *parent,
59178 + struct inode *root);
59179 +int create_object_common(struct inode *object, struct inode *parent,
59180 + reiser4_object_create_data *);
59181 +int delete_object_common(struct inode *);
59182 +int delete_directory_common(struct inode *);
59183 +int add_link_common(struct inode *object, struct inode *parent);
59184 +int rem_link_common(struct inode *object, struct inode *parent);
59185 +int rem_link_common_dir(struct inode *object, struct inode *parent);
59186 +int owns_item_common(const struct inode *, const coord_t *);
59187 +int owns_item_common_dir(const struct inode *, const coord_t *);
59188 +int can_add_link_common(const struct inode *);
59189 +int can_rem_link_common_dir(const struct inode *);
59190 +int detach_common_dir(struct inode *child, struct inode *parent);
59191 +int open_cryptcompress(struct inode * inode, struct file * file);
59192 +int bind_common_dir(struct inode *child, struct inode *parent);
59193 +int safelink_common(struct inode *, reiser4_safe_link_t, __u64 value);
59194 +reiser4_block_nr estimate_create_common(const struct inode *);
59195 +reiser4_block_nr estimate_create_common_dir(const struct inode *);
59196 +reiser4_block_nr estimate_update_common(const struct inode *);
59197 +reiser4_block_nr estimate_unlink_common(const struct inode *,
59198 + const struct inode *);
59199 +reiser4_block_nr estimate_unlink_common_dir(const struct inode *,
59200 + const struct inode *);
59201 +char *wire_write_common(struct inode *, char *start);
59202 +char *wire_read_common(char *addr, reiser4_object_on_wire *);
59203 +struct dentry *wire_get_common(struct super_block *, reiser4_object_on_wire *);
59204 +int wire_size_common(struct inode *);
59205 +void wire_done_common(reiser4_object_on_wire *);
59206 +
59207 +/* dir plugin operations: common implementations */
59208 +struct dentry *get_parent_common(struct inode *child);
59209 +int is_name_acceptable_common(const struct inode *, const char *name, int len);
59210 +void build_entry_key_common(const struct inode *,
59211 + const struct qstr *qname, reiser4_key *);
59212 +int build_readdir_key_common(struct file *dir, reiser4_key *);
59213 +int add_entry_common(struct inode *object, struct dentry *where,
59214 + reiser4_object_create_data *, reiser4_dir_entry_desc *);
59215 +int rem_entry_common(struct inode *object, struct dentry *where,
59216 + reiser4_dir_entry_desc *);
59217 +int init_common(struct inode *object, struct inode *parent,
59218 + reiser4_object_create_data *);
59219 +int done_common(struct inode *);
59220 +int attach_common(struct inode *child, struct inode *parent);
59221 +int detach_common(struct inode *object, struct inode *parent);
59222 +reiser4_block_nr estimate_add_entry_common(const struct inode *);
59223 +reiser4_block_nr estimate_rem_entry_common(const struct inode *);
59224 +reiser4_block_nr dir_estimate_unlink_common(const struct inode *,
59225 + const struct inode *);
59226 +
59227 +/* these are essential parts of common implementations, they are to make
59228 + customized implementations easier */
59229 +int do_prepare_write(struct file *, struct page *, unsigned from, unsigned to);
59230 +
59231 +/* merely useful functions */
59232 +int lookup_sd(struct inode *, znode_lock_mode, coord_t *, lock_handle *,
59233 + const reiser4_key *, int silent);
59234 +
59235 +
59236 +/* __FS_REISER4_PLUGIN_OBJECT_H__ */
59237 +#endif
59238 +
59239 +/* Make Linus happy.
59240 + Local variables:
59241 + c-indentation-style: "K&R"
59242 + mode-name: "LC"
59243 + c-basic-offset: 8
59244 + tab-width: 8
59245 + fill-column: 120
59246 + End:
59247 +*/
59248 Index: linux-2.6.16/fs/reiser4/plugin/plugin.c
59249 ===================================================================
59250 --- /dev/null
59251 +++ linux-2.6.16/fs/reiser4/plugin/plugin.c
59252 @@ -0,0 +1,533 @@
59253 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
59254 + * reiser4/README */
59255 +
59256 +/* Basic plugin infrastructure, lookup etc. */
59257 +
59258 +/* PLUGINS:
59259 +
59260 + Plugins are internal Reiser4 "modules" or "objects" used to increase
59261 + extensibility and allow external users to easily adapt reiser4 to
59262 + their needs.
59263 +
59264 + Plugins are classified into several disjoint "types". Plugins
59265 + belonging to the particular plugin type are termed "instances" of
59266 + this type. Currently the following types are present:
59267 +
59268 + . object plugin
59269 + . hash plugin
59270 + . tail plugin
59271 + . perm plugin
59272 + . item plugin
59273 + . node layout plugin
59274 +
59275 +NIKITA-FIXME-HANS: update this list, and review this entire comment for currency
59276 +
59277 + The object (file) plugin determines how a given file-system object serves
59278 + standard VFS requests for read, write, seek, mmap etc. Instances of
59279 + file plugins are: regular file, directory, symlink. Another example
59280 + of a file plugin is the audit plugin, which optionally records accesses to
59281 + the underlying object and forwards requests to it.
59282 +
59283 + Hash plugins compute hashes used by reiser4 to store and locate
59284 + files within directories. Instances of hash plugin type are: r5,
59285 + tea, rupasov.
59286 +
59287 + Tail plugins (or, more precisely, tail policy plugins) determine
59288 + when the last part of the file should be stored in a formatted item.
59289 +
59290 + Perm plugins control permissions granted for a process accessing a file.
59291 +
59292 + Scope and lookup:
59293 +
59294 + Each plugin type and each plugin has a label such that the pair ( type_label, plugin_label ) is unique. This
59295 + pair is a globally persistent and user-visible plugin
59296 + identifier. Internally kernel maintains plugins and plugin types in
59297 + arrays using an index into those arrays as plugin and plugin type
59298 + identifiers. The file-system, in turn, also maintains a persistent
59299 + "dictionary" mapping plugin labels to the numerical
59300 + identifiers stored in file-system objects. That is, we
59301 + store the offset into the plugin array for that plugin type as the
59302 + plugin id in the stat data of the filesystem object.
59303 +
59304 + plugin_labels have meaning for the user interface that assigns
59305 + plugins to files, and may someday have meaning for dynamic loading of
59306 + plugins and for copying of plugins from one fs instance to
59307 + another by utilities like cp and tar.
59308 +
59309 + Internal kernel plugin type identifier (index in plugins[] array) is
59310 + of type reiser4_plugin_type. Set of available plugin types is
59311 + currently static, but dynamic loading doesn't seem to pose
59312 + insurmountable problems.
59313 +
59314 + Within each type plugins are addressed by the identifiers of type
59315 + reiser4_plugin_id (indices in
59316 + reiser4_plugin_type_data.builtin[]). Such identifiers are only
59317 + required to be unique within one type, not globally.
59318 +
59319 + Thus, plugin in memory is uniquely identified by the pair (type_id,
59320 + id).
59321 +
59322 + Usage:
59323 +
59324 + There exists only one instance of each plugin, but this
59325 + single instance can be associated with many entities (file-system
59326 + objects, items, nodes, transactions, file-descriptors etc.). The entity
59327 + to which a plugin of a given type is attached is termed (due to the lack of
59328 + imagination) the "subject" of this plugin type and, by abuse of
59329 + terminology, the subject of the particular instance of this type to which
59330 + it is currently attached. For example, an inode is the subject of the object
59331 + plugin type. Inode representing directory is subject of directory
59332 + plugin, hash plugin type and some particular instance of hash plugin
59333 + type. Inode, representing regular file is subject of "regular file"
59334 + plugin, tail-policy plugin type etc.
59335 +
59336 + With each subject the plugin possibly stores some state. For example,
59337 + the state of a directory plugin (instance of object plugin type) is pointer
59338 + to hash plugin (if directories always use hashing that is). State of
59339 + audit plugin is file descriptor (struct file) of log file or some
59340 + magic value to do logging through printk().
59341 +
59342 + Interface:
59343 +
59344 + In addition to a scalar identifier, each plugin type and each plugin
59345 + proper has a "label" (a short string) and a "description" (a longer
59346 + descriptive string). Labels and descriptions of plugin types are
59347 + hard-coded into the plugins[] array, declared and defined in
59348 + plugin.c. The label and description of a plugin are stored in the
59349 + .label and .desc fields of its plugin_header respectively. It is
59350 + possible to locate a plugin by the pair of labels.
59351 +
59352 + Features:
59353 +
59354 + . user-level plugin manipulations (see the sketch after this list):
59355 + + reiser4("filename/..file_plugin<='audit'");
59356 + + write(open("filename/..file_plugin"), "audit", 5);
59357 +
59358 + . user-level utilities lsplug and chplug to manipulate plugins.
59359 + The utilities are not of primary priority; possibly they will not
59360 + work on v4.0.
59361 +
59362 +NIKITA-FIXME-HANS: this should be a mkreiserfs option not a mount option, do you agree? I don't think that specifying it at mount time, and then changing it with each mount, is a good model for usage.
59363 +
59364 + . mount option "plug" to set-up plugins of root-directory.
59365 + "plug=foo:bar" will set "bar" as default plugin of type "foo".
59366 +
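+ A user-space sketch of the write() form above (illustrative only: it
+ assumes the "..file_plugin" pseudo-file is present and writable,
+ which may not hold on v4.0; 5 is strlen("audit")):
+
+ int fd = open("filename/..file_plugin", O_WRONLY);
+
+ if (fd >= 0) {
+ write(fd, "audit", 5);
+ close(fd);
+ }
+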
59367 + Limitations:
59368 +
59369 + . each plugin type has to provide at least one builtin
59370 + plugin. This is a technical limitation and it can be lifted in
59371 + the future.
59372 +
59373 + TODO:
59374 +
59375 + New plugin types/plugins:
59376 + Things we should be able to separately choose to inherit:
59377 +
59378 + security plugins
59379 +
59380 + stat data
59381 +
59382 + file bodies
59383 +
59384 + file plugins
59385 +
59386 + dir plugins
59387 +
59388 + . perm:acl
59389 +
59390 + . audi---audit plugin intercepting and possibly logging all
59391 + accesses to an object. Requires putting stub functions in
59392 + file_operations instead of generic_file_*.
59393 +
59394 +NIKITA-FIXME-HANS: why make overflows a plugin?
59395 + . over---handle hash overflows
59396 +
59397 + . sqnt---handle different access patterns and instruments read-ahead
59398 +
59399 +NIKITA-FIXME-HANS: describe the line below in more detail.
59400 +
59401 + . hier---handle inheritance of plugins along file-system hierarchy
59402 +
59403 + Different kinds of inheritance: on creation vs. on access.
59404 + Compatible/incompatible plugins.
59405 + Inheritance for multi-linked files.
59406 + Layered plugins.
59407 + Notion of plugin context is abandoned.
59408 +
59409 + Each file is associated
59410 + with one plugin, and dependent plugins (hash, etc.) are stored as
59411 + main plugin state. Now, if we have plugins used for regular files
59412 + but not for directories, how would such plugins be inherited?
59413 + . always store them with directories also
59414 +
59415 +NIKITA-FIXME-HANS: Do the line above. It is not exclusive of doing the line below, which is also useful.
59416 +
59417 + . use inheritance hierarchy, independent of file-system namespace
59418 +
59419 +*/
59420 +
59421 +#include "../debug.h"
59422 +#include "../dformat.h"
59423 +#include "plugin_header.h"
59424 +#include "item/static_stat.h"
59425 +#include "node/node.h"
59426 +#include "security/perm.h"
59427 +#include "space/space_allocator.h"
59428 +#include "disk_format/disk_format.h"
59429 +#include "plugin.h"
59430 +#include "../reiser4.h"
59431 +#include "../jnode.h"
59432 +#include "../inode.h"
59433 +
59434 +#include <linux/fs.h> /* for struct super_block */
59435 +
59436 +/* public interface */
59437 +
59438 +/* initialise plugin sub-system. Just call this once on reiser4 startup. */
59439 +int init_plugins(void);
59440 +int setup_plugins(struct super_block *super, reiser4_plugin ** area);
59441 +int locate_plugin(struct inode *inode, plugin_locator * loc);
59442 +
59443 +
59444 +/**
59445 + * init_plugins - initialize plugins
59446 + *
59447 + * Initializes the plugin sub-system. It is part of reiser4 module
59448 + * initialization. For each plugin of each type the init method is called
59449 + * and each plugin is put into the list of plugins of its type.
59450 + */
59451 +int init_plugins(void)
59452 +{
59453 + reiser4_plugin_type type_id;
59454 +
59455 + for (type_id = 0; type_id < REISER4_PLUGIN_TYPES; ++type_id) {
59456 + reiser4_plugin_type_data *ptype;
59457 + int i;
59458 +
59459 + ptype = &plugins[type_id];
59460 + assert("nikita-3508", ptype->label != NULL);
59461 + assert("nikita-3509", ptype->type_id == type_id);
59462 +
59463 + INIT_LIST_HEAD(&ptype->plugins_list);
59464 +/* NIKITA-FIXME-HANS: change builtin_num to some other name lacking the term builtin. */
59465 + for (i = 0; i < ptype->builtin_num; ++i) {
59466 + reiser4_plugin *plugin;
59467 +
59468 + plugin = plugin_at(ptype, i);
59469 +
59470 + if (plugin->h.label == NULL)
59471 + /* uninitialized slot encountered */
59472 + continue;
59473 + assert("nikita-3445", plugin->h.type_id == type_id);
59474 + plugin->h.id = i;
59475 + if (plugin->h.pops != NULL &&
59476 + plugin->h.pops->init != NULL) {
59477 + int result;
59478 +
59479 + result = plugin->h.pops->init(plugin);
59480 + if (result != 0)
59481 + return result;
59482 + }
59483 + INIT_LIST_HEAD(&plugin->h.linkage);
59484 + list_add_tail(&plugin->h.linkage, &ptype->plugins_list);
59485 + }
59486 + }
59487 + return 0;
59488 +}
59489 +
59490 +/* true if plugin type id is valid */
59491 +int is_type_id_valid(reiser4_plugin_type type_id /* plugin type id */ )
59492 +{
59493 + /* "type_id" is unsigned, so no comparison with 0 is
59494 + necessary */
59495 + return (type_id < REISER4_PLUGIN_TYPES);
59496 +}
59497 +
59498 +/* true if plugin id is valid */
59499 +int is_plugin_id_valid(reiser4_plugin_type type_id /* plugin type id */ ,
59500 + reiser4_plugin_id id /* plugin id */ )
59501 +{
59502 + assert("nikita-1653", is_type_id_valid(type_id));
59503 + return id < plugins[type_id].builtin_num;
59504 +}
59505 +
59506 +/* return plugin by its @type_id and @id.
59507 +
59508 + Both arguments are checked for validity: this is supposed to be called
59509 + with ids that come from user level.
59510 +
59511 +NIKITA-FIXME-HANS: Do you instead mean that this checks ids created in
59512 +user space, and passed to the filesystem by use of method files? Your
59513 +comment really confused me on the first reading....
59514 +
59515 +*/
59516 +reiser4_plugin *plugin_by_unsafe_id(reiser4_plugin_type type_id /* plugin
59517 + * type id,
59518 + * unchecked */ ,
59519 + reiser4_plugin_id id /* plugin id,
59520 + * unchecked */ )
59521 +{
59522 + if (is_type_id_valid(type_id)) {
59523 + if (is_plugin_id_valid(type_id, id))
59524 + return plugin_at(&plugins[type_id], id);
59525 + else
59526 + /* id out of bounds */
59527 + warning("nikita-2913",
59528 + "Invalid plugin id: [%i:%i]", type_id, id);
59529 + } else
59530 + /* type_id out of bounds */
59531 + warning("nikita-2914", "Invalid type_id: %i", type_id);
59532 + return NULL;
59533 +}
59534 +
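+/*
+ * Example (a sketch, not code from this file): validating a (type, id)
+ * pair received from user space, e.g. out of a plugin_locator declared
+ * in plugin.h, before using it:
+ *
+ *     reiser4_plugin *plug;
+ *
+ *     plug = plugin_by_unsafe_id(loc->type_id, loc->id);
+ *     if (plug == NULL)
+ *             return RETERR(-EINVAL);
+ *
+ * (RETERR() is the error-return helper used elsewhere in reiser4.)
+ */
+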
59535 +/**
59536 + * save_plugin_id - store plugin id in disk format
59537 + * @plugin: plugin to convert
59538 + * @area: where to store result
59539 + *
59540 + * Puts id of @plugin in little endian format to address @area.
59541 + */
59542 +int save_plugin_id(reiser4_plugin *plugin /* plugin to convert */ ,
59543 + d16 *area /* where to store result */ )
59544 +{
59545 + assert("nikita-1261", plugin != NULL);
59546 + assert("nikita-1262", area != NULL);
59547 +
59548 + put_unaligned(cpu_to_le16(plugin->h.id), area);
59549 + return 0;
59550 +}
59551 +
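+/*
+ * A counterpart sketch (an assumption, not a function from this file):
+ * the id stored by save_plugin_id() is read back with the matching
+ * unaligned little-endian load before being resolved via plugin lookup:
+ *
+ *     reiser4_plugin_id id = le16_to_cpu(get_unaligned(area));
+ */
+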
59552 +/* list of all plugins of given type */
59553 +struct list_head *get_plugin_list(reiser4_plugin_type type_id /* plugin type
59554 + * id */ )
59555 +{
59556 + assert("nikita-1056", is_type_id_valid(type_id));
59557 + return &plugins[type_id].plugins_list;
59558 +}
59559 +
59560 +int grab_plugin(struct inode *self, struct inode *ancestor, pset_member memb)
59561 +{
59562 + reiser4_plugin *plug;
59563 + reiser4_inode *parent;
59564 +
59565 + parent = reiser4_inode_data(ancestor);
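+ /* the heir set (hset), when present, takes precedence over the
+ ancestor's own plugin set (pset); presumably this is what lets a
+ directory dictate plugins for objects created within it */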
59566 + plug = pset_get(parent->hset, memb) ? : pset_get(parent->pset, memb);
59567 + return grab_plugin_from(self, memb, plug);
59568 +}
59569 +
59570 +static void update_plugin_mask(reiser4_inode * info, pset_member memb)
59571 +{
59572 + struct dentry *rootdir;
59573 + reiser4_inode *root;
59574 +
59575 + rootdir = inode_by_reiser4_inode(info)->i_sb->s_root;
59576 + if (rootdir != NULL) {
59577 + root = reiser4_inode_data(rootdir->d_inode);
59578 + /*
59579 + * if the inode's plugin differs from the default (the root's) one,
59580 + * or we are changing a plugin of the root directory itself, update plugin_mask
59581 + */
59582 + if (pset_get(info->pset, memb) != pset_get(root->pset, memb) ||
59583 + info == root)
59584 + info->plugin_mask |= (1 << memb);
59585 + }
59586 +}
59587 +
59588 +int
59589 +grab_plugin_from(struct inode *self, pset_member memb, reiser4_plugin * plug)
59590 +{
59591 + reiser4_inode *info;
59592 + int result = 0;
59593 +
59594 + info = reiser4_inode_data(self);
59595 + if (pset_get(info->pset, memb) == NULL) {
59596 + result = pset_set(&info->pset, memb, plug);
59597 + if (result == 0)
59598 + update_plugin_mask(info, memb);
59599 + }
59600 + return result;
59601 +}
59602 +
59603 +int force_plugin(struct inode *self, pset_member memb, reiser4_plugin * plug)
59604 +{
59605 + reiser4_inode *info;
59606 + int result = 0;
59607 +
59608 + info = reiser4_inode_data(self);
59609 + if (plug->h.pops != NULL && plug->h.pops->change != NULL)
59610 + result = plug->h.pops->change(self, plug);
59611 + else
59612 + result = pset_set(&info->pset, memb, plug);
59613 + if (result == 0)
59614 + update_plugin_mask(info, memb);
59615 + return result;
59616 +}
59617 +
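+/*
+ * Note: grab_plugin_from() above fills a pset slot only when it is still
+ * empty, while force_plugin() overrides unconditionally, delegating to
+ * the plugin's ->change() method when one is provided.
+ */
+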
59618 +reiser4_plugin_type_data plugins[REISER4_PLUGIN_TYPES] = {
59619 + /* C90 initializers */
59620 + [REISER4_FILE_PLUGIN_TYPE] = {
59621 + .type_id = REISER4_FILE_PLUGIN_TYPE,
59622 + .label = "file",
59623 + .desc = "Object plugins",
59624 + .builtin_num = sizeof_array(file_plugins),
59625 + .builtin = file_plugins,
59626 + .plugins_list = {NULL, NULL},
59627 + .size = sizeof(file_plugin)
59628 + },
59629 + [REISER4_DIR_PLUGIN_TYPE] = {
59630 + .type_id = REISER4_DIR_PLUGIN_TYPE,
59631 + .label = "dir",
59632 + .desc = "Directory plugins",
59633 + .builtin_num = sizeof_array(dir_plugins),
59634 + .builtin = dir_plugins,
59635 + .plugins_list = {NULL, NULL},
59636 + .size = sizeof(dir_plugin)
59637 + },
59638 + [REISER4_HASH_PLUGIN_TYPE] = {
59639 + .type_id = REISER4_HASH_PLUGIN_TYPE,
59640 + .label = "hash",
59641 + .desc = "Directory hashes",
59642 + .builtin_num = sizeof_array(hash_plugins),
59643 + .builtin = hash_plugins,
59644 + .plugins_list = {NULL, NULL},
59645 + .size = sizeof(hash_plugin)
59646 + },
59647 + [REISER4_FIBRATION_PLUGIN_TYPE] = {
59648 + .type_id =
59649 + REISER4_FIBRATION_PLUGIN_TYPE,
59650 + .label = "fibration",
59651 + .desc = "Directory fibrations",
59652 + .builtin_num = sizeof_array(fibration_plugins),
59653 + .builtin = fibration_plugins,
59654 + .plugins_list = {NULL, NULL},
59655 + .size = sizeof(fibration_plugin)
59656 + },
59657 + [REISER4_CIPHER_PLUGIN_TYPE] = {
59658 + .type_id = REISER4_CIPHER_PLUGIN_TYPE,
59659 + .label = "cipher",
59660 + .desc = "Cipher plugins",
59661 + .builtin_num = sizeof_array(cipher_plugins),
59662 + .builtin = cipher_plugins,
59663 + .plugins_list = {NULL, NULL},
59664 + .size = sizeof(cipher_plugin)
59665 + },
59666 + [REISER4_DIGEST_PLUGIN_TYPE] = {
59667 + .type_id = REISER4_DIGEST_PLUGIN_TYPE,
59668 + .label = "digest",
59669 + .desc = "Digest plugins",
59670 + .builtin_num = sizeof_array(digest_plugins),
59671 + .builtin = digest_plugins,
59672 + .plugins_list = {NULL, NULL},
59673 + .size = sizeof(digest_plugin)
59674 + },
59675 + [REISER4_COMPRESSION_PLUGIN_TYPE] = {
59676 + .type_id = REISER4_COMPRESSION_PLUGIN_TYPE,
59677 + .label = "compression",
59678 + .desc = "Compression plugins",
59679 + .builtin_num = sizeof_array(compression_plugins),
59680 + .builtin = compression_plugins,
59681 + .plugins_list = {NULL, NULL},
59682 + .size = sizeof(compression_plugin)
59683 + },
59684 + [REISER4_FORMATTING_PLUGIN_TYPE] = {
59685 + .type_id = REISER4_FORMATTING_PLUGIN_TYPE,
59686 + .label = "formatting",
59687 + .desc = "Tail inlining policies",
59688 + .builtin_num = sizeof_array(formatting_plugins),
59689 + .builtin = formatting_plugins,
59690 + .plugins_list = {NULL, NULL},
59691 + .size = sizeof(formatting_plugin)
59692 + },
59693 + [REISER4_PERM_PLUGIN_TYPE] = {
59694 + .type_id = REISER4_PERM_PLUGIN_TYPE,
59695 + .label = "perm",
59696 + .desc = "Permission checks",
59697 + .builtin_num = sizeof_array(perm_plugins),
59698 + .builtin = perm_plugins,
59699 + .plugins_list = {NULL, NULL},
59700 + .size = sizeof(perm_plugin)
59701 + },
59702 + [REISER4_ITEM_PLUGIN_TYPE] = {
59703 + .type_id = REISER4_ITEM_PLUGIN_TYPE,
59704 + .label = "item",
59705 + .desc = "Item handlers",
59706 + .builtin_num = sizeof_array(item_plugins),
59707 + .builtin = item_plugins,
59708 + .plugins_list = {NULL, NULL},
59709 + .size = sizeof(item_plugin)
59710 + },
59711 + [REISER4_NODE_PLUGIN_TYPE] = {
59712 + .type_id = REISER4_NODE_PLUGIN_TYPE,
59713 + .label = "node",
59714 + .desc = "node layout handlers",
59715 + .builtin_num = sizeof_array(node_plugins),
59716 + .builtin = node_plugins,
59717 + .plugins_list = {NULL, NULL},
59718 + .size = sizeof(node_plugin)
59719 + },
59720 + [REISER4_SD_EXT_PLUGIN_TYPE] = {
59721 + .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
59722 + .label = "sd_ext",
59723 + .desc = "Parts of stat-data",
59724 + .builtin_num = sizeof_array(sd_ext_plugins),
59725 + .builtin = sd_ext_plugins,
59726 + .plugins_list = {NULL, NULL},
59727 + .size = sizeof(sd_ext_plugin)
59728 + },
59729 + [REISER4_FORMAT_PLUGIN_TYPE] = {
59730 + .type_id = REISER4_FORMAT_PLUGIN_TYPE,
59731 + .label = "disk_layout",
59732 + .desc = "defines filesystem on disk layout",
59733 + .builtin_num = sizeof_array(format_plugins),
59734 + .builtin = format_plugins,
59735 + .plugins_list = {NULL, NULL},
59736 + .size = sizeof(disk_format_plugin)
59737 + },
59738 + [REISER4_JNODE_PLUGIN_TYPE] = {
59739 + .type_id = REISER4_JNODE_PLUGIN_TYPE,
59740 + .label = "jnode",
59741 + .desc = "defines kind of jnode",
59742 + .builtin_num = sizeof_array(jnode_plugins),
59743 + .builtin = jnode_plugins,
59744 + .plugins_list = {NULL, NULL},
59745 + .size = sizeof(jnode_plugin)
59746 + },
59747 + [REISER4_COMPRESSION_MODE_PLUGIN_TYPE] = {
59748 + .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
59749 + .label = "compression_mode",
59750 + .desc = "Defines compression mode",
59751 + .builtin_num = sizeof_array(compression_mode_plugins),
59752 + .builtin = compression_mode_plugins,
59753 + .plugins_list = {NULL, NULL},
59754 + .size = sizeof(compression_mode_plugin)
59755 + },
59756 + [REISER4_CLUSTER_PLUGIN_TYPE] = {
59757 + .type_id = REISER4_CLUSTER_PLUGIN_TYPE,
59758 + .label = "cluster",
59759 + .desc = "Defines cluster size",
59760 + .builtin_num = sizeof_array(cluster_plugins),
59761 + .builtin = cluster_plugins,
59762 + .plugins_list = {NULL, NULL},
59763 + .size = sizeof(cluster_plugin)
59764 + },
59765 + [REISER4_REGULAR_PLUGIN_TYPE] = {
59766 + .type_id = REISER4_REGULAR_PLUGIN_TYPE,
59767 + .label = "regular",
59768 + .desc = "Defines kind of regular file",
59769 + .builtin_num =
59770 + sizeof_array(regular_plugins),
59771 + .builtin = regular_plugins,
59772 + .plugins_list = {NULL, NULL},
59773 + .size = sizeof(regular_plugin)
59774 + }
59775 +};
59776 +
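+/*
+ * A sketch (not in the original source) of locating a plugin type by its
+ * label, as the "Interface" notes above suggest is possible:
+ *
+ *     static reiser4_plugin_type_data *find_type_by_label(const char *label)
+ *     {
+ *             reiser4_plugin_type i;
+ *
+ *             for (i = 0; i < REISER4_PLUGIN_TYPES; ++i)
+ *                     if (!strcmp(plugins[i].label, label))
+ *                             return &plugins[i];
+ *             return NULL;
+ *     }
+ */
+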
59777 +/*
59778 + * Local variables:
59779 + * c-indentation-style: "K&R"
59780 + * mode-name: "LC"
59781 + * c-basic-offset: 8
59782 + * tab-width: 8
59783 + * fill-column: 120
59784 + * End:
59785 + */
59786 Index: linux-2.6.16/fs/reiser4/plugin/plugin.h
59787 ===================================================================
59788 --- /dev/null
59789 +++ linux-2.6.16/fs/reiser4/plugin/plugin.h
59790 @@ -0,0 +1,936 @@
59791 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
59792 +
59793 +/* Basic plugin data-types.
59794 + see fs/reiser4/plugin/plugin.c for details */
59795 +
59796 +#if !defined( __FS_REISER4_PLUGIN_TYPES_H__ )
59797 +#define __FS_REISER4_PLUGIN_TYPES_H__
59798 +
59799 +#include "../forward.h"
59800 +#include "../debug.h"
59801 +#include "../dformat.h"
59802 +#include "../key.h"
59803 +#include "compress/compress.h"
59804 +#include "crypto/cipher.h"
59805 +#include "plugin_header.h"
59806 +#include "item/static_stat.h"
59807 +#include "item/internal.h"
59808 +#include "item/sde.h"
59809 +#include "item/cde.h"
59810 +#include "item/item.h"
59811 +#include "node/node.h"
59812 +#include "node/node40.h"
59813 +#include "security/perm.h"
59814 +#include "fibration.h"
59815 +
59816 +#include "space/bitmap.h"
59817 +#include "space/space_allocator.h"
59818 +
59819 +#include "disk_format/disk_format40.h"
59820 +#include "disk_format/disk_format.h"
59821 +
59822 +#include <linux/fs.h> /* for struct super_block, address_space */
59823 +#include <linux/mm.h> /* for struct page */
59824 +#include <linux/buffer_head.h> /* for struct buffer_head */
59825 +#include <linux/dcache.h> /* for struct dentry */
59826 +#include <linux/types.h>
59827 +#include <linux/crypto.h>
59828 +
59829 +typedef struct reiser4_object_on_wire reiser4_object_on_wire;
59830 +
59831 +/*
59832 + * File plugin. Defines the set of methods that file plugins implement, some
59833 + * of which are optional.
59834 + *
59835 + * A file plugin offers the caller an interface for IO (writing to and/or
59836 + * reading from) on what the caller sees as one sequence of bytes. An IO to it
59837 + * may affect more than one physical sequence of bytes, or no physical sequence
59838 + * of bytes, it may affect sequences of bytes offered by other file plugins to
59839 + * the semantic layer, and the file plugin may invoke other plugins and
59840 + * delegate work to them, but its interface is structured for offering the
59841 + * caller the ability to read and/or write what the caller sees as being a
59842 + * single sequence of bytes.
59843 + *
59844 + * The file plugin must present a sequence of bytes to the caller, but it does
59845 + * not necessarily have to store a sequence of bytes, nor does it necessarily
59846 + * have to support efficient tree traversal to any offset in the sequence of
59847 + * bytes (tail and extent items, whose keys contain offsets, do however provide
59848 + * efficient non-sequential lookup of any offset in the sequence of bytes).
59849 + *
59850 + * Directory plugins provide methods for selecting file plugins by resolving a
59851 + * name for them.
59852 + *
59853 + * The functionality that other filesystems call an attribute, and rigidly tie
59854 + * together, we decompose into orthogonal selectable features of files. Using
59855 + * the terminology we will define next, an attribute is a perhaps constrained,
59856 + * perhaps static length, file whose parent has a uni-count-intra-link to it,
59857 + * which might be grandparent-major-packed, and whose parent has a deletion
59858 + * method that deletes it.
59859 + *
59860 + * File plugins can implement constraints.
59861 + *
59862 + * Files can be of variable length (e.g. regular unix files), or of static
59863 + * length (e.g. static sized attributes).
59864 + *
59865 + * An object may have many sequences of bytes, and many file plugins, but, it
59866 + * has exactly one objectid. It is usually desirable that an object has a
59867 + * deletion method which deletes every item with that objectid. Items cannot
59868 + * in general be found by just their objectids. This means that an object must
59869 + * have either a method built into its deletion plugin method for knowing what
59870 + * items need to be deleted, or links stored with the object that provide the
59871 + * plugin with a method for finding those items. Deleting a file within an
59872 + * object may or may not have the effect of deleting the entire object,
59873 + * depending on the file plugin's deletion method.
59874 + *
59875 + * LINK TAXONOMY:
59876 + *
59877 + * Many objects have a reference count, and when the reference count reaches 0
59878 + * the object's deletion method is invoked. Some links embody a reference
59879 + * count increase ("countlinks"), and others do not ("nocountlinks").
59880 + *
59881 + * Some links are bi-directional links ("bilinks"), and some are
59882 + * uni-directional("unilinks").
59883 + *
59884 + * Some links are between parts of the same object ("intralinks"), and some are
59885 + * between different objects ("interlinks").
59886 + *
59887 + * PACKING TAXONOMY:
59888 + *
59889 + * Some items of an object are stored with a major packing locality based on
59890 + * their object's objectid (e.g. unix directory items in plan A), and these are
59891 + * called "self-major-packed".
59892 + *
59893 + * Some items of an object are stored with a major packing locality based on
59894 + * their semantic parent object's objectid (e.g. unix file bodies in plan A),
59895 + * and these are called "parent-major-packed".
59896 + *
59897 + * Some items of an object are stored with a major packing locality based on
59898 + * their semantic grandparent, and these are called "grandparent-major-packed".
59899 + * Now carefully notice that we run into trouble with key length if we have to
59900 + * store a 8 byte major+minor grandparent based packing locality, an 8 byte
59901 + * parent objectid, an 8 byte attribute objectid, and an 8 byte offset, all in
59902 + * a 24 byte key. One of these fields must be sacrificed if an item is to be
59903 + * grandparent-major-packed, and which to sacrifice is left to the item author
59904 + * choosing to make the item grandparent-major-packed. You cannot make tail
59905 + * items and extent items grandparent-major-packed, though you could make them
59906 + * self-major-packed (usually they are parent-major-packed).
59907 + *
59908 + * In the case of ACLs (which are composed of fixed length ACEs which consist
59909 + * of {subject-type, subject, and permission bitmask} triples), it makes sense
59910 + * to not have an offset field in the ACE item key, and to allow duplicate keys
59911 + * for ACEs. Thus, the set of ACEs for a given file is found by looking for a
59912 + * key consisting of the objectid of the grandparent (thus grouping all ACLs in
59913 + * a directory together), the minor packing locality of ACE, the objectid of
59914 + * the file, and 0.
59915 + *
59916 + * IO involves moving data from one location to another, which means that two
59917 + * locations must be specified, source and destination.
59918 + *
59919 + * This source and destination can be in the filesystem, or they can be a
59920 + * pointer in the user process address space plus a byte count.
59921 + *
59922 + * If both source and destination are in the filesystem, then at least one of
59923 + * them must be representable as a pure stream of bytes (which we call a flow,
59924 + * and define as a struct containing a key, a data pointer, and a length).
59925 + * This may mean converting one of them into a flow. We provide a generic
59926 + * cast_into_flow() method, which will work for any plugin supporting
59927 + * read_flow(), though it is inefficiently implemented in that it temporarily
59928 + * stores the flow in a buffer (Question: what to do with huge flows that
59929 + * cannot fit into memory? Answer: we must not convert them all at once. )
59930 + *
59931 + * Performing a write requires resolving the write request into a flow defining
59932 + * the source, and a method that performs the write, and a key that defines
59933 + * where in the tree the write is to go.
59934 + *
59935 + * Performing a read requires resolving the read request into a flow defining
59936 + * the target, and a method that performs the read, and a key that defines
59937 + * where in the tree the read is to come from.
59938 + *
59939 + * There will exist file plugins which have no pluginid stored on the disk for
59940 + * them, and which are only invoked by other plugins.
59941 + */
59942 +
59943 +/* builtin file-plugins */
59944 +typedef enum {
59945 + /* regular file */
59946 + UNIX_FILE_PLUGIN_ID,
59947 + /* directory */
59948 + DIRECTORY_FILE_PLUGIN_ID,
59949 + /* symlink */
59950 + SYMLINK_FILE_PLUGIN_ID,
59951 + /* for objects completely handled by the VFS: fifos, devices,
59952 + sockets */
59953 + SPECIAL_FILE_PLUGIN_ID,
59954 + /* regular cryptcompress file */
59955 + CRC_FILE_PLUGIN_ID,
59956 + /* number of file plugins. Used as size of arrays to hold
59957 + file plugins. */
59958 + LAST_FILE_PLUGIN_ID
59959 +} reiser4_file_id;
59960 +
59961 +typedef struct file_plugin {
59962 +
59963 + /* generic fields */
59964 + plugin_header h;
59965 +
59966 + struct inode_operations inode_ops;
59967 + struct file_operations file_ops;
59968 + struct address_space_operations as_ops;
59969 +
59970 + /* save inode cached stat-data onto disk. It was called
59971 + reiserfs_update_sd() in 3.x */
59972 + int (*write_sd_by_inode) (struct inode *);
59973 +
59974 + /*
59975 + * private methods: These are optional. If used they will allow you to
59976 + * minimize the amount of code needed to implement a deviation from
59977 + * some other method that also uses them.
59978 + */
59979 +
59980 + /*
59981 + * Construct flow into @flow according to user-supplied data.
59982 + *
59983 + * This is used by read/write methods to construct a flow to
59984 + * write/read. ->flow_by_inode() is a plugin method, rather than a
59985 + * single global implementation, because the key in a flow used by a
59986 + * plugin may depend on the data in @buf.
59987 + *
59988 + * NIKITA-FIXME-HANS: please create statistics on what functions are
59989 + * dereferenced how often for the mongo benchmark. You can supervise
59990 + * Elena doing this for you if that helps. Email me the list of the
59991 + * top 10, with their counts, and an estimate of the total number of
59992 + * CPU cycles spent dereferencing as a percentage of CPU cycles spent
59993 + * processing (non-idle processing). If the total percent is, say,
59994 + * less than 1%, it will make our coding discussions much easier, and
59995 + * keep me from questioning whether functions like the below are too
59996 + * frequently called to be dereferenced. If the total percent is more
59997 + * than 1%, perhaps private methods should be listed in a "required"
59998 + * comment at the top of each plugin (with stern language about how if
59999 + * the comment is missing it will not be accepted by the maintainer),
60000 + * and implemented using macros not dereferenced functions. How about
60001 + * replacing this whole private methods part of the struct with a
60002 + * thorough documentation of what the standard helper functions are for
60003 + * use in constructing plugins? I think users have been asking for
60004 + * that, though not in so many words.
60005 + */
60006 + int (*flow_by_inode) (struct inode *, const char __user *buf,
60007 + int user, loff_t size,
60008 + loff_t off, rw_op op, flow_t *);
60009 +
60010 + /*
60011 + * Return the key used to retrieve an offset of a file. It is used by
60012 + * default implementation of ->flow_by_inode() method
60013 + * (common_build_flow()) and, among other things, to get to the extent
60014 + * from jnode of unformatted node.
60015 + */
60016 + int (*key_by_inode) (struct inode *, loff_t off, reiser4_key *);
60017 +
60018 + /* NIKITA-FIXME-HANS: this comment is not as clear to others as you think.... */
60019 + /*
60020 + * set the plugin for a file. Called during file creation in creat()
60021 + * but not reiser4() unless an inode already exists for the file.
60022 + */
60023 + int (*set_plug_in_inode) (struct inode *inode, struct inode *parent,
60024 + reiser4_object_create_data *);
60025 +
60026 + /* NIKITA-FIXME-HANS: comment and name seem to say different things,
60027 + * are you setting up the object itself also or just adjusting the
60028 + * parent?.... */
60029 + /* set up plugins for new @object created in @parent. @root is root
60030 + directory. */
60031 + int (*adjust_to_parent) (struct inode *object, struct inode *parent,
60032 + struct inode *root);
60033 + /*
60034 + * this does whatever is necessary when an object is created. For
60035 + * instance, for unix files stat data is inserted. It is supposed to
60036 + * be called by the create method of struct inode_operations.
60037 + */
60038 + int (*create_object) (struct inode *object, struct inode *parent,
60039 + reiser4_object_create_data *);
60040 +
60041 + /* this does whatever is necessary when an object is opened */
60042 + int (*open_object) (struct inode * inode, struct file * file);
60043 + /*
60044 + * this method should check REISER4_NO_SD and set REISER4_NO_SD on
60045 + * success. Deletion of an object usually includes removal of items
60046 + * building file body (for directories this is removal of "." and "..")
60047 + * and removal of stat-data item.
60048 + */
60049 + int (*delete_object) (struct inode *);
60050 +
60051 + /* add link from @parent to @object */
60052 + int (*add_link) (struct inode *object, struct inode *parent);
60053 +
60054 + /* remove link from @parent to @object */
60055 + int (*rem_link) (struct inode *object, struct inode *parent);
60056 +
60057 + /*
60058 + * return true if item addressed by @coord belongs to @inode. This is
60059 + * used by read/write to properly slice flow into items in presence of
60060 + * multiple key assignment policies, because items of a file are not
60061 + * necessarily contiguous in a key space, for example, in a plan-b.
60062 + */
60063 + int (*owns_item) (const struct inode *, const coord_t *);
60064 +
60065 + /* checks whether yet another hard link to this object can be
60066 + added */
60067 + int (*can_add_link) (const struct inode *);
60068 +
60069 + /* checks whether hard links to this object can be removed */
60070 + int (*can_rem_link) (const struct inode *);
60071 +
60072 + /* currently non-empty only for DIRECTORY_FILE_PLUGIN_ID. It calls
60073 + detach of the directory plugin to remove ".." */
60074 + int (*detach) (struct inode * child, struct inode * parent);
60075 +
60076 + /* called when @child was just looked up in the @parent. It is
60077 + non-empty only for DIRECTORY_FILE_PLUGIN_ID, where it calls attach
60078 + of the directory plugin */
60079 + int (*bind) (struct inode * child, struct inode * parent);
60080 +
60081 + /* process safe-link during mount */
60082 + int (*safelink) (struct inode * object, reiser4_safe_link_t link,
60083 + __u64 value);
60084 +
60085 + /* a set of estimate methods for file operations */
60086 + struct {
60087 + reiser4_block_nr(*create) (const struct inode *);
60088 + reiser4_block_nr(*update) (const struct inode *);
60089 + reiser4_block_nr(*unlink) (const struct inode *,
60090 + const struct inode *);
60091 + } estimate;
60092 +
60093 + /*
60094 + * reiser4 specific part of inode has a union of structures which are
60095 + * specific to a plugin. This method is called when inode is read
60096 + * (read_inode) and when file is created (common_create_child) so that
60097 + * file plugin could initialize its inode data
60098 + */
60099 + void (*init_inode_data) (struct inode *, reiser4_object_create_data *,
60100 + int);
60101 +
60102 + /*
60103 + * This method performs progressive deletion of items and whole nodes
60104 + * from right to left.
60105 + *
60106 + * @tap: the point deletion process begins from,
60107 + * @from_key: the beginning of the deleted key range,
60108 + * @to_key: the end of the deleted key range,
60109 + * @smallest_removed: the smallest removed key,
60110 + *
60111 + * @return: 0 on success, error code otherwise; -E_REPEAT means that a long
60112 + * cut_tree operation was interrupted to allow an atom commit.
60113 + */
60114 + int (*cut_tree_worker) (tap_t *, const reiser4_key * from_key,
60115 + const reiser4_key * to_key,
60116 + reiser4_key * smallest_removed, struct inode *,
60117 + int, int *);
60118 +
60119 + /* called from ->destroy_inode() */
60120 + void (*destroy_inode) (struct inode *);
60121 +
60122 + /*
60123 + * methods to serialize object identity. This is used, for example, by
60124 + * reiser4_{en,de}code_fh().
60125 + */
60126 + struct {
60127 + /* store object's identity at @area */
60128 + char *(*write) (struct inode * inode, char *area);
60129 + /* parse object from wire to the @obj */
60130 + char *(*read) (char *area, reiser4_object_on_wire * obj);
60131 + /* given object identity in @obj, find or create its dentry */
60132 + struct dentry *(*get) (struct super_block * s,
60133 + reiser4_object_on_wire * obj);
60134 + /* how many bytes ->wire.write() consumes */
60135 + int (*size) (struct inode * inode);
60136 + /* finish with object identity */
60137 + void (*done) (reiser4_object_on_wire * obj);
60138 + } wire;
60139 +} file_plugin;
60140 +
60141 +extern file_plugin file_plugins[LAST_FILE_PLUGIN_ID];
60142 +
60143 +struct reiser4_object_on_wire {
60144 + file_plugin *plugin;
60145 + union {
60146 + struct {
60147 + obj_key_id key_id;
60148 + } std;
60149 + void *generic;
60150 + } u;
60151 +};
60152 +
60153 +/* builtin dir-plugins */
60154 +typedef enum {
60155 + HASHED_DIR_PLUGIN_ID,
60156 + SEEKABLE_HASHED_DIR_PLUGIN_ID,
60157 + LAST_DIR_ID
60158 +} reiser4_dir_id;
60159 +
60160 +typedef struct dir_plugin {
60161 + /* generic fields */
60162 + plugin_header h;
60163 +
60164 + struct inode_operations inode_ops;
60165 + struct file_operations file_ops;
60166 + struct address_space_operations as_ops;
60167 +
60168 + /*
60169 + * private methods: These are optional. If used they will allow you to
60170 + * minimize the amount of code needed to implement a deviation from
60171 + * some other method that uses them. You could logically argue that
60172 + * they should be a separate type of plugin.
60173 + */
60174 +
60175 + struct dentry *(*get_parent) (struct inode * childdir);
60176 +
60177 + /*
60178 + * check whether "name" is an acceptable name to be inserted into this
60179 + * object. Optionally implemented by directory-like objects. Can check
60180 + * for maximal length, reserved symbols etc
60181 + */
60182 + int (*is_name_acceptable) (const struct inode * inode, const char *name,
60183 + int len);
60184 +
60185 + void (*build_entry_key) (const struct inode * dir /* directory where
60186 + * entry is (or will
60187 + * be) in.*/ ,
60188 + const struct qstr * name /* name of file
60189 + * referenced by this
60190 + * entry */ ,
60191 + reiser4_key * result /* resulting key of
60192 + * directory entry */ );
60193 + int (*build_readdir_key) (struct file * dir, reiser4_key * result);
60194 + int (*add_entry) (struct inode * object, struct dentry * where,
60195 + reiser4_object_create_data * data,
60196 + reiser4_dir_entry_desc * entry);
60197 + int (*rem_entry) (struct inode * object, struct dentry * where,
60198 + reiser4_dir_entry_desc * entry);
60199 +
60200 + /*
60201 + * initialize directory structure for newly created object. For normal
60202 + * unix directories, insert dot and dotdot.
60203 + */
60204 + int (*init) (struct inode * object, struct inode * parent,
60205 + reiser4_object_create_data * data);
60206 +
60207 + /* destroy directory */
60208 + int (*done) (struct inode * child);
60209 +
60210 + /* called when @subdir was just looked up in the @dir */
60211 + int (*attach) (struct inode * subdir, struct inode * dir);
60212 + int (*detach) (struct inode * subdir, struct inode * dir);
60213 +
60214 + struct {
60215 + reiser4_block_nr(*add_entry) (const struct inode *);
60216 + reiser4_block_nr(*rem_entry) (const struct inode *);
60217 + reiser4_block_nr(*unlink) (const struct inode *,
60218 + const struct inode *);
60219 + } estimate;
60220 +} dir_plugin;
60221 +
60222 +extern dir_plugin dir_plugins[LAST_DIR_ID];
60223 +
60224 +typedef struct formatting_plugin {
60225 + /* generic fields */
60226 + plugin_header h;
60227 + /* returns non-zero iff file's tail has to be stored
60228 + in a direct item. */
60229 + int (*have_tail) (const struct inode * inode, loff_t size);
60230 +} formatting_plugin;
60231 +
60232 +typedef struct hash_plugin {
60233 + /* generic fields */
60234 + plugin_header h;
60235 + /* computes hash of the given name */
60236 + __u64(*hash) (const unsigned char *name, int len);
60237 +} hash_plugin;
60238 +
60239 +typedef struct cipher_plugin {
60240 + /* generic fields */
60241 + plugin_header h;
60242 + struct crypto_tfm * (*alloc) (void);
60243 + void (*free) (struct crypto_tfm * tfm);
60244 + /* Offset translator. For each offset this returns (k * offset), where
60245 + k (k >= 1) is an expansion factor of the cipher algorithm.
60246 + For all symmetric algorithms k == 1. For asymmetric algorithms (which
60247 + inflate data) offset translation guarantees that all of a disk
60248 + cluster's units will have keys smaller than the next cluster's ones.
60249 + */
60250 + loff_t(*scale) (struct inode * inode, size_t blocksize, loff_t src);
60251 + /* Cipher algorithms can accept data only by chunks of cipher block
60252 + size. This method is to align any flow up to cipher block size when
60253 + we pass it to cipher algorithm. To align means to append padding of
60254 + special format specific to the cipher algorithm */
60255 + int (*align_stream) (__u8 * tail, int clust_size, int blocksize);
60256 + /* low-level key manager (check, install, etc..) */
60257 + int (*setkey) (struct crypto_tfm * tfm, const __u8 * key,
60258 + unsigned int keylen);
60259 + /* main text processing procedures */
60260 + void (*encrypt) (__u32 * expkey, __u8 * dst, const __u8 * src);
60261 + void (*decrypt) (__u32 * expkey, __u8 * dst, const __u8 * src);
60262 +} cipher_plugin;
60263 +
60264 +typedef struct digest_plugin {
60265 + /* generic fields */
60266 + plugin_header h;
60267 + /* fingerprint size in bytes */
60268 + int fipsize;
60269 + struct crypto_tfm * (*alloc) (void);
60270 + void (*free) (struct crypto_tfm * tfm);
60271 +} digest_plugin;
60272 +
60273 +typedef struct compression_plugin {
60274 + /* generic fields */
60275 + plugin_header h;
60276 + int (*init) (void);
60277 + /* the maximum number of bytes by which the size of the "compressed"
60278 + * data can exceed that of the uncompressed data */
60279 + int (*overrun) (unsigned src_len);
60280 + coa_t(*alloc) (tfm_action act);
60281 + void (*free) (coa_t coa, tfm_action act);
60282 + /* minimal size of the flow we still try to compress */
60283 + int (*min_size_deflate) (void);
60284 + __u32(*checksum) (char *data, __u32 length);
60285 + /* main transform procedures */
60286 + void (*compress) (coa_t coa, __u8 * src_first, unsigned src_len,
60287 + __u8 * dst_first, unsigned *dst_len);
60288 + void (*decompress) (coa_t coa, __u8 * src_first, unsigned src_len,
60289 + __u8 * dst_first, unsigned *dst_len);
60290 +} compression_plugin;
60291 +
60292 +typedef struct compression_mode_plugin {
60293 + /* generic fields */
60294 + plugin_header h;
60295 + /* this is called when estimating compressibility
60296 + of a logical cluster by its content */
60297 + int (*should_deflate) (struct inode * inode, cloff_t index);
60298 + /* this is called when results of compression should be saved */
60299 + int (*accept_hook) (struct inode * inode, cloff_t index);
60300 + /* this is called when results of compression should be discarded */
60301 + int (*discard_hook) (struct inode * inode, cloff_t index);
60302 +} compression_mode_plugin;
60303 +
60304 +typedef struct regular_plugin {
60305 + /* generic fields */
60306 + plugin_header h;
60307 + /* file plugin id which implements regular file */
60308 + reiser4_file_id id;
60309 +} regular_plugin;
60310 +
60311 +typedef struct cluster_plugin {
60312 + /* generic fields */
60313 + plugin_header h;
60314 + int shift;
60315 +} cluster_plugin;
60316 +
60317 +typedef struct sd_ext_plugin {
60318 + /* generic fields */
60319 + plugin_header h;
60320 + int (*present) (struct inode * inode, char **area, int *len);
60321 + int (*absent) (struct inode * inode);
60322 + int (*save_len) (struct inode * inode);
60323 + int (*save) (struct inode * inode, char **area);
60324 + /* alignment requirement for this stat-data part */
60325 + int alignment;
60326 +} sd_ext_plugin;
60327 +
60328 +/* this plugin contains methods to allocate objectid for newly created files,
60329 + to deallocate objectid when file gets removed, to report number of used and
60330 + free objectids */
60331 +typedef struct oid_allocator_plugin {
60332 + /* generic fields */
60333 + plugin_header h;
60334 + int (*init_oid_allocator) (reiser4_oid_allocator * map, __u64 nr_files,
60335 + __u64 oids);
60336 + /* used to report statfs->f_files */
60337 + __u64(*oids_used) (reiser4_oid_allocator * map);
60338 + /* get next oid to use */
60339 + __u64(*next_oid) (reiser4_oid_allocator * map);
60340 + /* used to report statfs->f_ffree */
60341 + __u64(*oids_free) (reiser4_oid_allocator * map);
60342 + /* allocate new objectid */
60343 + int (*allocate_oid) (reiser4_oid_allocator * map, oid_t *);
60344 + /* release objectid */
60345 + int (*release_oid) (reiser4_oid_allocator * map, oid_t);
60346 + /* how many pages to reserve in transaction for allocation of new
60347 + objectid */
60348 + int (*oid_reserve_allocate) (reiser4_oid_allocator * map);
60349 + /* how many pages to reserve in transaction for freeing of an
60350 + objectid */
60351 + int (*oid_reserve_release) (reiser4_oid_allocator * map);
60352 + void (*print_info) (const char *, reiser4_oid_allocator *);
60353 +} oid_allocator_plugin;
60354 +
60355 +/* disk layout plugin: this specifies super block, journal, bitmap (if there
60356 + are any) locations, etc */
60357 +typedef struct disk_format_plugin {
60358 + /* generic fields */
60359 + plugin_header h;
60360 + /* replay journal, initialize super_info_data, etc */
60361 + int (*init_format) (struct super_block *, void *data);
60362 +
60363 + /* key of root directory stat data */
60364 + const reiser4_key *(*root_dir_key) (const struct super_block *);
60365 +
60366 + int (*release) (struct super_block *);
60367 + jnode *(*log_super) (struct super_block *);
60368 + int (*check_open) (const struct inode * object);
60369 +} disk_format_plugin;
60370 +
60371 +struct jnode_plugin {
60372 + /* generic fields */
60373 + plugin_header h;
60374 + int (*init) (jnode * node);
60375 + int (*parse) (jnode * node);
60376 + struct address_space *(*mapping) (const jnode * node);
60377 + unsigned long (*index) (const jnode * node);
60378 + jnode *(*clone) (jnode * node);
60379 +};
60380 +
60381 +/* plugin instance. */
60382 +/* */
60383 +/* This is a "wrapper" union for all types of plugins. Most of the code */
60384 +/* uses plugins of a particular type (file_plugin, dir_plugin, etc.) */
60385 +/* rather than operating on pointers to reiser4_plugin. This union is */
60386 +/* only used in some generic code in plugin/plugin.c that operates on all */
60387 +/* plugins. Technically speaking, the purpose of this union is to add */
60388 +/* type safety to said generic code: each plugin type (file_plugin, for */
60389 +/* example) contains plugin_header as its first member. This first member */
60390 +/* is located at the same place in memory as the .h member of */
60391 +/* reiser4_plugin. Generic code obtains a pointer to reiser4_plugin and */
60392 +/* looks at .h, which is the header of the plugin type located in the */
60393 +/* union. This makes it possible to avoid type-casts. */
60394 +union reiser4_plugin {
60395 + /* generic fields */
60396 + plugin_header h;
60397 + /* file plugin */
60398 + file_plugin file;
60399 + /* directory plugin */
60400 + dir_plugin dir;
60401 + /* hash plugin, used by directory plugin */
60402 + hash_plugin hash;
60403 + /* fibration plugin used by directory plugin */
60404 + fibration_plugin fibration;
60405 + /* cipher transform plugin, used by file plugin */
60406 + cipher_plugin cipher;
60407 + /* digest transform plugin, used by file plugin */
60408 + digest_plugin digest;
60409 + /* compression transform plugin, used by file plugin */
60410 + compression_plugin compression;
60411 + /* tail plugin, used by file plugin */
60412 + formatting_plugin formatting;
60413 + /* permission plugin */
60414 + perm_plugin perm;
60415 + /* node plugin */
60416 + node_plugin node;
60417 + /* item plugin */
60418 + item_plugin item;
60419 + /* stat-data extension plugin */
60420 + sd_ext_plugin sd_ext;
60421 + /* disk layout plugin */
60422 + disk_format_plugin format;
60423 + /* object id allocator plugin */
60424 + oid_allocator_plugin oid_allocator;
60425 + /* plugin for different jnode types */
60426 + jnode_plugin jnode;
60427 + /* compression mode plugin, used by object plugin */
60428 + compression_mode_plugin compression_mode;
60429 + /* cluster plugin, used by object plugin */
60430 + cluster_plugin clust;
60431 + /* regular plugin, used by directory plugin */
60432 + regular_plugin regular;
60433 + /* place-holder for new plugin types that can be registered
60434 + dynamically, and used by other dynamically loaded plugins. */
60435 + void *generic;
60436 +};
60437 +
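+/*
+ * Illustration (a sketch): because plugin_header is the first member of
+ * every plugin type, for any reiser4_plugin *plug the header is reachable
+ * both generically and through any typed member, without casts:
+ *
+ *     assert("label", (void *)&plug->h == (void *)&plug->file.h);
+ */
+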
60438 +struct reiser4_plugin_ops {
60439 + /* called when plugin is initialized */
60440 + int (*init) (reiser4_plugin * plugin);
60441 + /* called when plugin is unloaded */
60442 + int (*done) (reiser4_plugin * plugin);
60443 + /* load given plugin from disk */
60444 + int (*load) (struct inode * inode,
60445 + reiser4_plugin * plugin, char **area, int *len);
60446 + /* how much space is required to store this plugin's state
60447 + in stat-data */
60448 + int (*save_len) (struct inode * inode, reiser4_plugin * plugin);
60449 + /* save persistent plugin-data to disk */
60450 + int (*save) (struct inode * inode, reiser4_plugin * plugin,
60451 + char **area);
60452 + /* alignment requirement for on-disk state of this plugin
60453 + in number of bytes */
60454 + int alignment;
60455 + /* install itself into given inode. This can return error
60456 + (e.g., you cannot change hash of non-empty directory). */
60457 + int (*change) (struct inode * inode, reiser4_plugin * plugin);
60458 + /* install itself into given inode as inherited from @parent
60459 + (cf. ->change() above); can also return an error */
60460 + int (*inherit) (struct inode * inode, struct inode * parent,
60461 + reiser4_plugin * plugin);
60462 +};
60463 +
60464 +/* functions implemented in fs/reiser4/plugin/plugin.c */
60465 +
60466 +/* stores plugin reference in reiser4-specific part of inode */
60467 +extern int set_object_plugin(struct inode *inode, reiser4_plugin_id id);
60468 +extern int setup_plugins(struct super_block *super, reiser4_plugin ** area);
60469 +extern int init_plugins(void);
60470 +
60471 +/* builtin plugins */
60472 +
60473 +/* builtin hash-plugins */
60474 +
60475 +typedef enum {
60476 + RUPASOV_HASH_ID,
60477 + R5_HASH_ID,
60478 + TEA_HASH_ID,
60479 + FNV1_HASH_ID,
60480 + DEGENERATE_HASH_ID,
60481 + LAST_HASH_ID
60482 +} reiser4_hash_id;
60483 +
60484 +/* builtin cipher plugins */
60485 +
60486 +typedef enum {
60487 + NONE_CIPHER_ID,
60488 + AES_CIPHER_ID,
60489 + LAST_CIPHER_ID
60490 +} reiser4_cipher_id;
60491 +
60492 +/* builtin digest plugins */
60493 +
60494 +typedef enum {
60495 + SHA256_32_DIGEST_ID,
60496 + LAST_DIGEST_ID
60497 +} reiser4_digest_id;
60498 +
60499 +/* builtin compression mode plugins */
60500 +typedef enum {
60501 + NONE_COMPRESSION_MODE_ID,
60502 + COL_8_COMPRESSION_MODE_ID,
60503 + COL_16_COMPRESSION_MODE_ID,
60504 + COL_32_COMPRESSION_MODE_ID,
60505 + COZ_COMPRESSION_MODE_ID,
60506 + FORCE_COMPRESSION_MODE_ID,
60507 + TEST_COMPRESSION_MODE_ID,
60508 + LAST_COMPRESSION_MODE_ID
60509 +} reiser4_compression_mode_id;
60510 +
60511 +/* builtin cluster plugins */
60512 +typedef enum {
60513 + CLUSTER_64K_ID,
60514 + CLUSTER_32K_ID,
60515 + CLUSTER_16K_ID,
60516 + CLUSTER_8K_ID,
60517 + CLUSTER_4K_ID,
60518 + LAST_CLUSTER_ID
60519 +} reiser4_cluster_id;
60520 +
60521 +/* builtin regular plugins */
60522 +typedef enum {
60523 + UF_REGULAR_ID,
60524 + CRC_REGULAR_ID,
60525 + LAST_REGULAR_ID
60526 +} reiser4_regular_id;
60527 +
60528 +/* builtin tail-plugins */
60529 +
60530 +typedef enum {
60531 + NEVER_TAILS_FORMATTING_ID,
60532 + ALWAYS_TAILS_FORMATTING_ID,
60533 + SMALL_FILE_FORMATTING_ID,
60534 + LAST_TAIL_FORMATTING_ID
60535 +} reiser4_formatting_id;
60536 +
60537 +/* compression/clustering specific data */
60538 +typedef struct compression_data {
60539 + reiser4_compression_id coa; /* id of the compression algorithm */
60540 +} compression_data_t;
60541 +
60542 +typedef __u8 cluster_data_t; /* cluster info */
60543 +
60544 +/* data type used to pack parameters that we pass to vfs object creation
60545 + function create_object() */
60546 +struct reiser4_object_create_data {
60547 + /* plugin to control created object */
60548 + reiser4_file_id id;
60549 + /* mode of regular file, directory or special file */
60550 +/* what happens if some other sort of perm plugin is in use? */
60551 + int mode;
60552 + /* rdev of special file */
60553 + dev_t rdev;
60554 + /* symlink target */
60555 + const char *name;
60556 + /* add here something for non-standard objects you invent, like
60557 + query for interpolation file etc. */
60558 +
60559 + crypto_stat_t * crypto;
60560 + compression_data_t *compression;
60561 + cluster_data_t *cluster;
60562 +
60563 + struct inode *parent;
60564 + struct dentry *dentry;
60565 +};
60566 +
60567 +/* description of directory entry being created/destroyed/sought for
60568 +
60569 + It is passed down to the directory plugin and farther to the
60570 + directory item plugin methods. Creation of a new directory entry is
60571 + done in several stages: first we search for an entry with the same
60572 + name, then create a new one. reiser4_dir_entry_desc is used to store
60573 + information collected at some stage of this process and required
60574 + later: the key of the item that we want to insert/delete and a
60575 + pointer to the object that will be bound by the new directory entry.
60576 + Probably some more fields will be added there.
60577 +
60578 +*/
60579 +struct reiser4_dir_entry_desc {
60580 + /* key of directory entry */
60581 + reiser4_key key;
60582 + /* object bound by this entry. */
60583 + struct inode *obj;
60584 +};
60585 +
60586 +#define MAX_PLUGIN_TYPE_LABEL_LEN 32
60587 +#define MAX_PLUGIN_PLUG_LABEL_LEN 32
60588 +
60589 +/* used for interface with user-land: table-driven parsing in
60590 + reiser4(). */
60591 +typedef struct plugin_locator {
60592 + reiser4_plugin_type type_id;
60593 + reiser4_plugin_id id;
60594 + char type_label[MAX_PLUGIN_TYPE_LABEL_LEN];
60595 + char plug_label[MAX_PLUGIN_PLUG_LABEL_LEN];
60596 +} plugin_locator;
60597 +
60598 +extern int locate_plugin(struct inode *inode, plugin_locator * loc);
60599 +
60600 +
60601 +#define PLUGIN_BY_ID(TYPE,ID,FIELD) \
60602 +static inline TYPE *TYPE ## _by_id( reiser4_plugin_id id ) \
60603 +{ \
60604 + reiser4_plugin *plugin = plugin_by_id ( ID, id ); \
60605 + return plugin ? & plugin -> FIELD : NULL; \
60606 +} \
60607 +static inline TYPE *TYPE ## _by_disk_id( reiser4_tree *tree, d16 *id ) \
60608 +{ \
60609 + reiser4_plugin *plugin = plugin_by_disk_id ( tree, ID, id ); \
60610 + return plugin ? & plugin -> FIELD : NULL; \
60611 +} \
60612 +static inline TYPE *TYPE ## _by_unsafe_id( reiser4_plugin_id id ) \
60613 +{ \
60614 + reiser4_plugin *plugin = plugin_by_unsafe_id ( ID, id ); \
60615 + return plugin ? & plugin -> FIELD : NULL; \
60616 +} \
60617 +static inline reiser4_plugin* TYPE ## _to_plugin( TYPE* plugin ) \
60618 +{ \
60619 + return ( reiser4_plugin * ) plugin; \
60620 +} \
60621 +static inline reiser4_plugin_id TYPE ## _id( TYPE* plugin ) \
60622 +{ \
60623 + return TYPE ## _to_plugin (plugin) -> h.id; \
60624 +} \
60625 +typedef struct { int foo; } TYPE ## _plugin_dummy
60626 +
60627 +PLUGIN_BY_ID(item_plugin, REISER4_ITEM_PLUGIN_TYPE, item);
60628 +PLUGIN_BY_ID(file_plugin, REISER4_FILE_PLUGIN_TYPE, file);
60629 +PLUGIN_BY_ID(dir_plugin, REISER4_DIR_PLUGIN_TYPE, dir);
60630 +PLUGIN_BY_ID(node_plugin, REISER4_NODE_PLUGIN_TYPE, node);
60631 +PLUGIN_BY_ID(sd_ext_plugin, REISER4_SD_EXT_PLUGIN_TYPE, sd_ext);
60632 +PLUGIN_BY_ID(perm_plugin, REISER4_PERM_PLUGIN_TYPE, perm);
60633 +PLUGIN_BY_ID(hash_plugin, REISER4_HASH_PLUGIN_TYPE, hash);
60634 +PLUGIN_BY_ID(fibration_plugin, REISER4_FIBRATION_PLUGIN_TYPE, fibration);
60635 +PLUGIN_BY_ID(cipher_plugin, REISER4_CIPHER_PLUGIN_TYPE, cipher);
60636 +PLUGIN_BY_ID(digest_plugin, REISER4_DIGEST_PLUGIN_TYPE, digest);
60637 +PLUGIN_BY_ID(compression_plugin, REISER4_COMPRESSION_PLUGIN_TYPE, compression);
60638 +PLUGIN_BY_ID(formatting_plugin, REISER4_FORMATTING_PLUGIN_TYPE, formatting);
60639 +PLUGIN_BY_ID(disk_format_plugin, REISER4_FORMAT_PLUGIN_TYPE, format);
60640 +PLUGIN_BY_ID(jnode_plugin, REISER4_JNODE_PLUGIN_TYPE, jnode);
60641 +PLUGIN_BY_ID(compression_mode_plugin, REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
60642 + compression_mode);
60643 +PLUGIN_BY_ID(cluster_plugin, REISER4_CLUSTER_PLUGIN_TYPE, clust);
60644 +PLUGIN_BY_ID(regular_plugin, REISER4_REGULAR_PLUGIN_TYPE, regular);
60645 +
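+/*
+ * Usage sketch: each PLUGIN_BY_ID() instantiation above generates typed
+ * helpers, so a builtin plugin can be fetched without casts, e.g.:
+ *
+ *     hash_plugin *hplug = hash_plugin_by_id(R5_HASH_ID);
+ */
+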
60646 +extern int save_plugin_id(reiser4_plugin * plugin, d16 * area);
60647 +
60648 +extern struct list_head *get_plugin_list(reiser4_plugin_type type_id);
60649 +
60650 +#define for_all_plugins(ptype, plugin) \
60651 +for (plugin = list_entry(get_plugin_list(ptype)->next, reiser4_plugin, h.linkage); \
60652 + get_plugin_list(ptype) != &plugin->h.linkage; \
60653 + plugin = list_entry(plugin->h.linkage.next, reiser4_plugin, h.linkage))
60654 +
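+/*
+ * Example (a sketch): iterating over all registered hash plugins and
+ * printing their labels:
+ *
+ *     reiser4_plugin *plug;
+ *
+ *     for_all_plugins(REISER4_HASH_PLUGIN_TYPE, plug)
+ *             printk("%s\n", plug->h.label);
+ */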
60655 +
60656 +/* enumeration of fields within plugin_set */
60657 +typedef enum {
60658 + PSET_FILE,
60659 + PSET_DIR, /* PSET_FILE and PSET_DIR should be first elements:
60660 + * inode.c:read_inode() depends on this. */
60661 + PSET_PERM,
60662 + PSET_FORMATTING,
60663 + PSET_HASH,
60664 + PSET_FIBRATION,
60665 + PSET_SD,
60666 + PSET_DIR_ITEM,
60667 + PSET_CIPHER,
60668 + PSET_DIGEST,
60669 + PSET_COMPRESSION,
60670 + PSET_COMPRESSION_MODE,
60671 + PSET_CLUSTER,
60672 + PSET_REGULAR_ENTRY,
60673 + PSET_LAST
60674 +} pset_member;
60675 +
60676 +int grab_plugin(struct inode *self, struct inode *ancestor, pset_member memb);
60677 +int grab_plugin_from(struct inode *self, pset_member memb,
60678 + reiser4_plugin * plug);
60679 +int force_plugin(struct inode *self, pset_member memb, reiser4_plugin * plug);
60680 +
60681 +/* defined in fs/reiser4/plugin/object.c */
60682 +extern file_plugin file_plugins[LAST_FILE_PLUGIN_ID];
60683 +/* defined in fs/reiser4/plugin/object.c */
60684 +extern dir_plugin dir_plugins[LAST_DIR_ID];
60685 +/* defined in fs/reiser4/plugin/item/static_stat.c */
60686 +extern sd_ext_plugin sd_ext_plugins[LAST_SD_EXTENSION];
60687 +/* defined in fs/reiser4/plugin/hash.c */
60688 +extern hash_plugin hash_plugins[LAST_HASH_ID];
60689 +/* defined in fs/reiser4/plugin/fibration.c */
60690 +extern fibration_plugin fibration_plugins[LAST_FIBRATION_ID];
60691 +/* defined in fs/reiser4/plugin/crypt.c */
60692 +extern cipher_plugin cipher_plugins[LAST_CIPHER_ID];
60693 +/* defined in fs/reiser4/plugin/digest.c */
60694 +extern digest_plugin digest_plugins[LAST_DIGEST_ID];
60695 +/* defined in fs/reiser4/plugin/compress/compress.c */
60696 +extern compression_plugin compression_plugins[LAST_COMPRESSION_ID];
60697 +/* defined in fs/reiser4/plugin/compress/compression_mode.c */
60698 +extern compression_mode_plugin
60699 +compression_mode_plugins[LAST_COMPRESSION_MODE_ID];
60700 +/* defined in fs/reiser4/plugin/cluster.c */
60701 +extern cluster_plugin cluster_plugins[LAST_CLUSTER_ID];
60702 +/* defined in fs/reiser4/plugin/regular.c */
60703 +extern regular_plugin regular_plugins[LAST_REGULAR_ID];
60704 +/* defined in fs/reiser4/plugin/tail.c */
60705 +extern formatting_plugin formatting_plugins[LAST_TAIL_FORMATTING_ID];
60706 +/* defined in fs/reiser4/plugin/security/security.c */
60707 +extern perm_plugin perm_plugins[LAST_PERM_ID];
60708 +/* defined in fs/reiser4/plugin/item/item.c */
60709 +extern item_plugin item_plugins[LAST_ITEM_ID];
60710 +/* defined in fs/reiser4/plugin/node/node.c */
60711 +extern node_plugin node_plugins[LAST_NODE_ID];
60712 +/* defined in fs/reiser4/plugin/disk_format/disk_format.c */
60713 +extern disk_format_plugin format_plugins[LAST_FORMAT_ID];
60714 +
60715 +/* __FS_REISER4_PLUGIN_TYPES_H__ */
60716 +#endif
60717 +
60718 +/* Make Linus happy.
60719 + Local variables:
60720 + c-indentation-style: "K&R"
60721 + mode-name: "LC"
60722 + c-basic-offset: 8
60723 + tab-width: 8
60724 + fill-column: 120
60725 + End:
60726 +*/
60727 Index: linux-2.6.16/fs/reiser4/plugin/plugin_header.h
60728 ===================================================================
60729 --- /dev/null
60730 +++ linux-2.6.16/fs/reiser4/plugin/plugin_header.h
60731 @@ -0,0 +1,136 @@
60732 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
60733 +
60734 +/* plugin header. Data structures required by all plugin types. */
60735 +
60736 +#if !defined( __PLUGIN_HEADER_H__ )
60737 +#define __PLUGIN_HEADER_H__
60738 +
60739 +/* plugin data-types and constants */
60740 +
60741 +#include "../debug.h"
60742 +#include "../dformat.h"
60743 +
60744 +typedef enum {
60745 + REISER4_FILE_PLUGIN_TYPE,
60746 + REISER4_DIR_PLUGIN_TYPE,
60747 + REISER4_ITEM_PLUGIN_TYPE,
60748 + REISER4_NODE_PLUGIN_TYPE,
60749 + REISER4_HASH_PLUGIN_TYPE,
60750 + REISER4_FIBRATION_PLUGIN_TYPE,
60751 + REISER4_FORMATTING_PLUGIN_TYPE,
60752 + REISER4_PERM_PLUGIN_TYPE,
60753 + REISER4_SD_EXT_PLUGIN_TYPE,
60754 + REISER4_FORMAT_PLUGIN_TYPE,
60755 + REISER4_JNODE_PLUGIN_TYPE,
60756 + REISER4_CIPHER_PLUGIN_TYPE,
60757 + REISER4_DIGEST_PLUGIN_TYPE,
60758 + REISER4_COMPRESSION_PLUGIN_TYPE,
60759 + REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
60760 + REISER4_CLUSTER_PLUGIN_TYPE,
60761 + REISER4_REGULAR_PLUGIN_TYPE,
60762 + REISER4_PLUGIN_TYPES
60763 +} reiser4_plugin_type;
60764 +
60765 +struct reiser4_plugin_ops;
60766 +/* generic plugin operations, supported by each
60767 + plugin type. */
60768 +typedef struct reiser4_plugin_ops reiser4_plugin_ops;
60769 +
60770 +/* the common part of all plugin instances. */
60771 +typedef struct plugin_header {
60772 + /* plugin type */
60773 + reiser4_plugin_type type_id;
60774 + /* id of this plugin */
60775 + reiser4_plugin_id id;
60776 + /* plugin operations */
60777 + reiser4_plugin_ops *pops;
60778 +/* NIKITA-FIXME-HANS: usage of and access to label and desc is not commented and defined. */
60779 + /* short label of this plugin */
60780 + const char *label;
60781 +	/* descriptive string */
60782 + const char *desc;
60783 + /* list linkage */
60784 + struct list_head linkage;
60785 +} plugin_header;
60786 +
60787 +/* PRIVATE INTERFACES */
60788 +/* NIKITA-FIXME-HANS: what is this for and why does it duplicate what is in plugin_header? */
60789 +/* plugin type representation. */
60790 +typedef struct reiser4_plugin_type_data {
60791 + /* internal plugin type identifier. Should coincide with
60792 + index of this item in plugins[] array. */
60793 + reiser4_plugin_type type_id;
60794 + /* short symbolic label of this plugin type. Should be no longer
60795 + than MAX_PLUGIN_TYPE_LABEL_LEN characters including '\0'. */
60796 + const char *label;
60797 + /* plugin type description longer than .label */
60798 + const char *desc;
60799 +
60800 +/* NIKITA-FIXME-HANS: define built-in */
60801 + /* number of built-in plugin instances of this type */
60802 + int builtin_num;
60803 + /* array of built-in plugins */
60804 + void *builtin;
60805 + struct list_head plugins_list;
60806 + size_t size;
60807 +} reiser4_plugin_type_data;
60808 +
60809 +extern reiser4_plugin_type_data plugins[REISER4_PLUGIN_TYPES];
60810 +
60811 +int is_type_id_valid(reiser4_plugin_type type_id);
60812 +int is_plugin_id_valid(reiser4_plugin_type type_id, reiser4_plugin_id id);
60813 +
60814 +static inline reiser4_plugin *plugin_at(reiser4_plugin_type_data * ptype, int i)
60815 +{
60816 + char *builtin;
60817 +
60818 + builtin = ptype->builtin;
60819 + return (reiser4_plugin *) (builtin + i * ptype->size);
60820 +}
60821 +
60822 +/* return plugin by its @type_id and @id */
60823 +static inline reiser4_plugin *plugin_by_id(reiser4_plugin_type type_id
60824 + /* plugin type id */ ,
60825 + reiser4_plugin_id id /* plugin id */
60826 + )
60827 +{
60828 + assert("nikita-1651", is_type_id_valid(type_id));
60829 + assert("nikita-1652", is_plugin_id_valid(type_id, id));
60830 + return plugin_at(&plugins[type_id], id);
60831 +}
60832 +
60833 +extern reiser4_plugin *plugin_by_unsafe_id(reiser4_plugin_type type_id,
60834 + reiser4_plugin_id id);
60835 +
60836 +/**
60837 + * plugin_by_disk_id - get reiser4_plugin
60838 + * @type_id: plugin type id
60839 + * @did: plugin id in disk format
60840 + *
60841 + * Returns reiser4_plugin by plugin type id and plugin id.
60842 + */
60843 +static inline reiser4_plugin *plugin_by_disk_id(reiser4_tree * tree UNUSED_ARG,
60844 + reiser4_plugin_type type_id,
60845 + __le16 *plugin_id)
60846 +{
60847 + /*
60848 + * what we should do properly is to maintain within each file-system a
60849 + * dictionary that maps on-disk plugin ids to "universal" ids. This
60850 +	 * dictionary would be resolved at mount time, so that this function
60851 + * will perform just one additional array lookup.
60852 + */
60853 + return plugin_by_unsafe_id(type_id, le16_to_cpu(*plugin_id));
60854 +}
60855 +
60856 +/* __PLUGIN_HEADER_H__ */
60857 +#endif
60858 +
60859 +/*
60860 + * Local variables:
60861 + * c-indentation-style: "K&R"
60862 + * mode-name: "LC"
60863 + * c-basic-offset: 8
60864 + * tab-width: 8
60865 + * fill-column: 79
60866 + * End:
60867 + */
60868 Index: linux-2.6.16/fs/reiser4/plugin/plugin_set.c
60869 ===================================================================
60870 --- /dev/null
60871 +++ linux-2.6.16/fs/reiser4/plugin/plugin_set.c
60872 @@ -0,0 +1,378 @@
60873 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
60874 + * reiser4/README */
60875 +/* NIKITA-FIXME-HANS: you didn't discuss this with me before coding it did you? Remove plugin-sets from code by March 15th, 2004 */
60876 +/* plugin-sets */
60877 +
60878 +/*
60879 + * Each inode comes with a whole set of plugins: file plugin, directory
60880 + * plugin, hash plugin, tail policy plugin, security plugin, etc.
60881 + *
60882 + * Storing them (pointers to them, that is) in each inode is a waste of
60883 + * space, especially given that on an average file system the plugins of
60884 + * the vast majority of files will belong to a few sets (e.g., one set for
60885 + * regular files, another for standard directories, etc.).
60886 + *
60887 + * A plugin set (pset) is an object containing pointers to all plugins
60888 + * required by an inode. The inode only stores a pointer to its pset. psets
60889 + * are "interned", that is, different inodes with the same set of plugins
60890 + * point to the same pset. This is achieved by storing psets in a global
60891 + * hash table. Races are avoided by the simple (and so far efficient)
60892 + * solution of never recycling psets, even when the last inode pointing to one is destroyed.
60893 + *
60894 + */
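+/*
+ * A minimal sketch of the interning behaviour described above, assuming a
+ * formatting plugin @fplug (any other pset member behaves the same way):
+ *
+ *	plugin_set *a = plugin_set_get_empty();
+ *	plugin_set *b = plugin_set_get_empty();
+ *
+ *	plugin_set_formatting(&a, fplug);
+ *	plugin_set_formatting(&b, fplug);
+ *
+ * Now a == b holds: both point to the single interned pset for this
+ * combination of plugins.
+ */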
60895 +
60896 +#include "../debug.h"
60897 +#include "../super.h"
60898 +#include "plugin_set.h"
60899 +
60900 +#include <linux/slab.h>
60901 +#include <linux/stddef.h>
60902 +
60903 +/* slab for plugin sets */
60904 +static kmem_cache_t *plugin_set_slab;
60905 +
60906 +static spinlock_t plugin_set_lock[8] __cacheline_aligned_in_smp = {
60907 + [0 ... 7] = SPIN_LOCK_UNLOCKED
60908 +};
60909 +
60910 +/* hash table support */
60911 +
60912 +#define PS_TABLE_SIZE (32)
60913 +
60914 +static inline plugin_set *cast_to(const unsigned long *a)
60915 +{
60916 + return container_of(a, plugin_set, hashval);
60917 +}
60918 +
60919 +static inline int pseq(const unsigned long *a1, const unsigned long *a2)
60920 +{
60921 + plugin_set *set1;
60922 + plugin_set *set2;
60923 +
60924 + /* make sure fields are not missed in the code below */
60925 + cassert(sizeof *set1 ==
60926 + sizeof set1->hashval +
60927 + sizeof set1->link +
60928 + sizeof set1->file +
60929 + sizeof set1->dir +
60930 + sizeof set1->perm +
60931 + sizeof set1->formatting +
60932 + sizeof set1->hash +
60933 + sizeof set1->fibration +
60934 + sizeof set1->sd +
60935 + sizeof set1->dir_item +
60936 + sizeof set1->cipher +
60937 + sizeof set1->digest +
60938 + sizeof set1->compression +
60939 + sizeof set1->compression_mode +
60940 + sizeof set1->cluster + sizeof set1->regular_entry);
60941 +
60942 + set1 = cast_to(a1);
60943 + set2 = cast_to(a2);
60944 + return
60945 + set1->hashval == set2->hashval &&
60946 + set1->file == set2->file &&
60947 + set1->dir == set2->dir &&
60948 + set1->perm == set2->perm &&
60949 + set1->formatting == set2->formatting &&
60950 + set1->hash == set2->hash &&
60951 + set1->fibration == set2->fibration &&
60952 + set1->sd == set2->sd &&
60953 + set1->dir_item == set2->dir_item &&
60954 + set1->cipher == set2->cipher &&
60955 + set1->digest == set2->digest &&
60956 + set1->compression == set2->compression &&
60957 + set1->compression_mode == set2->compression_mode &&
60958 + set1->cluster == set2->cluster &&
60959 + set1->regular_entry == set2->regular_entry;
60960 +}
60961 +
60962 +#define HASH_FIELD(hash, set, field) \
60963 +({ \
60964 + (hash) += (unsigned long)(set)->field >> 2; \
60965 +})
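+/* plugin instances are word-aligned objects, so the two lowest pointer bits
+   carry no information and are shifted out before summing */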
60966 +
60967 +static inline unsigned long calculate_hash(const plugin_set * set)
60968 +{
60969 + unsigned long result;
60970 +
60971 + result = 0;
60972 + HASH_FIELD(result, set, file);
60973 + HASH_FIELD(result, set, dir);
60974 + HASH_FIELD(result, set, perm);
60975 + HASH_FIELD(result, set, formatting);
60976 + HASH_FIELD(result, set, hash);
60977 + HASH_FIELD(result, set, fibration);
60978 + HASH_FIELD(result, set, sd);
60979 + HASH_FIELD(result, set, dir_item);
60980 + HASH_FIELD(result, set, cipher);
60981 + HASH_FIELD(result, set, digest);
60982 + HASH_FIELD(result, set, compression);
60983 + HASH_FIELD(result, set, compression_mode);
60984 + HASH_FIELD(result, set, cluster);
60985 + HASH_FIELD(result, set, regular_entry);
60986 + return result & (PS_TABLE_SIZE - 1);
60987 +}
60988 +
60989 +static inline unsigned long
60990 +pshash(ps_hash_table * table, const unsigned long *a)
60991 +{
60992 + return *a;
60993 +}
60994 +
60995 +/* The hash table definition */
60996 +#define KMALLOC(size) kmalloc((size), get_gfp_mask())
60997 +#define KFREE(ptr, size) kfree(ptr)
60998 +TYPE_SAFE_HASH_DEFINE(ps, plugin_set, unsigned long, hashval, link, pshash,
60999 + pseq);
61000 +#undef KFREE
61001 +#undef KMALLOC
61002 +
61003 +static ps_hash_table ps_table;
61004 +static plugin_set empty_set = {
61005 + .hashval = 0,
61006 + .file = NULL,
61007 + .dir = NULL,
61008 + .perm = NULL,
61009 + .formatting = NULL,
61010 + .hash = NULL,
61011 + .fibration = NULL,
61012 + .sd = NULL,
61013 + .dir_item = NULL,
61014 + .cipher = NULL,
61015 + .digest = NULL,
61016 + .compression = NULL,
61017 + .compression_mode = NULL,
61018 + .cluster = NULL,
61019 + .regular_entry = NULL,
61020 + .link = {NULL}
61021 +};
61022 +
61023 +plugin_set *plugin_set_get_empty(void)
61024 +{
61025 + return &empty_set;
61026 +}
61027 +
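+/* psets are never recycled (see the comment at the top of this file), so
+   releasing one is deliberately a no-op */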
61028 +void plugin_set_put(plugin_set * set)
61029 +{
61030 +}
61031 +
61032 +static inline unsigned long *pset_field(plugin_set * set, int offset)
61033 +{
61034 + return (unsigned long *)(((char *)set) + offset);
61035 +}
61036 +
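+/* Replace one member of the pset *@set points to: build a modified copy on
+   the stack, look it up in the hash table under rcu_read_lock(), and either
+   repoint *@set at an already interned pset or insert a freshly allocated
+   copy, re-checking under the per-bucket spinlock to resolve insertion
+   races. Existing psets are never modified in place. */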
61037 +static int plugin_set_field(plugin_set ** set, const unsigned long val,
61038 + const int offset)
61039 +{
61040 + unsigned long *spot;
61041 + spinlock_t *lock;
61042 + plugin_set replica;
61043 + plugin_set *twin;
61044 + plugin_set *psal;
61045 + plugin_set *orig;
61046 +
61047 + assert("nikita-2902", set != NULL);
61048 + assert("nikita-2904", *set != NULL);
61049 +
61050 + spot = pset_field(*set, offset);
61051 + if (unlikely(*spot == val))
61052 + return 0;
61053 +
61054 + replica = *(orig = *set);
61055 + *pset_field(&replica, offset) = val;
61056 + replica.hashval = calculate_hash(&replica);
61057 + rcu_read_lock();
61058 + twin = ps_hash_find(&ps_table, &replica.hashval);
61059 + if (unlikely(twin == NULL)) {
61060 + rcu_read_unlock();
61061 + psal = kmem_cache_alloc(plugin_set_slab, get_gfp_mask());
61062 + if (psal == NULL)
61063 + return RETERR(-ENOMEM);
61064 + *psal = replica;
61065 + lock = &plugin_set_lock[replica.hashval & 7];
61066 + spin_lock(lock);
61067 + twin = ps_hash_find(&ps_table, &replica.hashval);
61068 + if (likely(twin == NULL)) {
61069 + *set = psal;
61070 + ps_hash_insert_rcu(&ps_table, psal);
61071 + } else {
61072 + *set = twin;
61073 + kmem_cache_free(plugin_set_slab, psal);
61074 + }
61075 + spin_unlock(lock);
61076 + } else {
61077 + rcu_read_unlock();
61078 + *set = twin;
61079 + }
61080 + return 0;
61081 +}
61082 +
61083 +static struct {
61084 + int offset;
61085 + reiser4_plugin_type type;
61086 +} pset_descr[PSET_LAST] = {
61087 + [PSET_FILE] = {
61088 + .offset = offsetof(plugin_set, file),
61089 + .type = REISER4_FILE_PLUGIN_TYPE
61090 + },
61091 + [PSET_DIR] = {
61092 + .offset = offsetof(plugin_set, dir),
61093 + .type = REISER4_DIR_PLUGIN_TYPE
61094 + },
61095 + [PSET_PERM] = {
61096 + .offset = offsetof(plugin_set, perm),
61097 + .type = REISER4_PERM_PLUGIN_TYPE
61098 + },
61099 + [PSET_FORMATTING] = {
61100 + .offset = offsetof(plugin_set, formatting),
61101 + .type = REISER4_FORMATTING_PLUGIN_TYPE
61102 + },
61103 + [PSET_HASH] = {
61104 + .offset = offsetof(plugin_set, hash),
61105 + .type = REISER4_HASH_PLUGIN_TYPE
61106 + },
61107 + [PSET_FIBRATION] = {
61108 + .offset = offsetof(plugin_set, fibration),
61109 + .type = REISER4_FIBRATION_PLUGIN_TYPE
61110 + },
61111 + [PSET_SD] = {
61112 + .offset = offsetof(plugin_set, sd),
61113 + .type = REISER4_ITEM_PLUGIN_TYPE
61114 + },
61115 + [PSET_DIR_ITEM] = {
61116 + .offset = offsetof(plugin_set, dir_item),
61117 + .type = REISER4_ITEM_PLUGIN_TYPE
61118 + },
61119 + [PSET_CIPHER] = {
61120 + .offset = offsetof(plugin_set, cipher),
61121 + .type = REISER4_CIPHER_PLUGIN_TYPE
61122 + },
61123 + [PSET_DIGEST] = {
61124 + .offset = offsetof(plugin_set, digest),
61125 + .type = REISER4_DIGEST_PLUGIN_TYPE
61126 + },
61127 + [PSET_COMPRESSION] = {
61128 + .offset = offsetof(plugin_set, compression),
61129 + .type = REISER4_COMPRESSION_PLUGIN_TYPE
61130 + },
61131 + [PSET_COMPRESSION_MODE] = {
61132 + .offset = offsetof(plugin_set, compression_mode),
61133 + .type = REISER4_COMPRESSION_MODE_PLUGIN_TYPE
61134 + },
61135 + [PSET_CLUSTER] = {
61136 + .offset = offsetof(plugin_set, cluster),
61137 + .type = REISER4_CLUSTER_PLUGIN_TYPE
61138 + },
61139 + [PSET_REGULAR_ENTRY] = {
61140 + .offset = offsetof(plugin_set, regular_entry),
61141 + .type = REISER4_REGULAR_PLUGIN_TYPE
61142 + }
61143 +};
61144 +
61145 +#if REISER4_DEBUG
61146 +static reiser4_plugin_type pset_member_to_type(pset_member memb)
61147 +{
61148 + assert("nikita-3501", 0 <= memb && memb < PSET_LAST);
61149 + return pset_descr[memb].type;
61150 +}
61151 +#endif
61152 +
61153 +reiser4_plugin_type pset_member_to_type_unsafe(pset_member memb)
61154 +{
61155 + if (0 <= memb && memb < PSET_LAST)
61156 + return pset_descr[memb].type;
61157 + else
61158 + return REISER4_PLUGIN_TYPES;
61159 +}
61160 +
61161 +int pset_set(plugin_set ** set, pset_member memb, reiser4_plugin * plugin)
61162 +{
61163 + assert("nikita-3492", set != NULL);
61164 + assert("nikita-3493", *set != NULL);
61165 + assert("nikita-3494", plugin != NULL);
61166 + assert("nikita-3495", 0 <= memb && memb < PSET_LAST);
61167 + assert("nikita-3496", plugin->h.type_id == pset_member_to_type(memb));
61168 +
61169 + return plugin_set_field(set,
61170 + (unsigned long)plugin, pset_descr[memb].offset);
61171 +}
61172 +
61173 +reiser4_plugin *pset_get(plugin_set * set, pset_member memb)
61174 +{
61175 + assert("nikita-3497", set != NULL);
61176 + assert("nikita-3498", 0 <= memb && memb < PSET_LAST);
61177 +
61178 + return *(reiser4_plugin **) (((char *)set) + pset_descr[memb].offset);
61179 +}
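+/*
+ * A minimal usage sketch, assuming @set is an interned pset and @hplug is a
+ * hash plugin (its header type is REISER4_HASH_PLUGIN_TYPE):
+ *
+ *	pset_set(&set, PSET_HASH, (reiser4_plugin *)hplug);
+ *
+ * After this, pset_get(set, PSET_HASH) returns (reiser4_plugin *)hplug.
+ */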
61180 +
61181 +#define DEFINE_PLUGIN_SET(type, field) \
61182 +int plugin_set_ ## field(plugin_set **set, type *val) \
61183 +{ \
61184 + cassert(sizeof val == sizeof(unsigned long)); \
61185 + return plugin_set_field(set, (unsigned long)val, \
61186 + offsetof(plugin_set, field)); \
61187 +}
61188 +
61189 +DEFINE_PLUGIN_SET(file_plugin, file)
61190 +DEFINE_PLUGIN_SET(dir_plugin, dir)
61191 +DEFINE_PLUGIN_SET(formatting_plugin, formatting)
61192 +DEFINE_PLUGIN_SET(hash_plugin, hash)
61193 +DEFINE_PLUGIN_SET(fibration_plugin, fibration)
61194 +DEFINE_PLUGIN_SET(item_plugin, sd)
61195 +/* DEFINE_PLUGIN_SET(cipher_plugin, cipher) */
61196 +/* DEFINE_PLUGIN_SET(digest_plugin, digest) */
61197 +DEFINE_PLUGIN_SET(compression_plugin, compression)
61198 +/* DEFINE_PLUGIN_SET(compression_mode_plugin, compression_mode) */
61199 +DEFINE_PLUGIN_SET(cluster_plugin, cluster)
61200 +/* DEFINE_PLUGIN_SET(regular_plugin, regular_entry) */
61201 +
61202 +
61203 +/**
61204 + * init_plugin_set - create pset cache and hash table
61205 + *
61206 + * Initializes slab cache of plugin_set-s and their hash table. It is part of
61207 + * reiser4 module initialization.
61208 + */
61209 +int init_plugin_set(void)
61210 +{
61211 + int result;
61212 +
61213 + result = ps_hash_init(&ps_table, PS_TABLE_SIZE);
61214 + if (result == 0) {
61215 + plugin_set_slab = kmem_cache_create("plugin_set",
61216 + sizeof(plugin_set), 0,
61217 + SLAB_HWCACHE_ALIGN,
61218 + NULL, NULL);
61219 + if (plugin_set_slab == NULL)
61220 + result = RETERR(-ENOMEM);
61221 + }
61222 + return result;
61223 +}
61224 +
61225 +/**
61226 + * done_plugin_set - delete plugin_set cache and plugin_set hash table
61227 + *
61228 + * This is called on reiser4 module unloading or system shutdown.
61229 + */
61230 +void done_plugin_set(void)
61231 +{
61232 + plugin_set *cur, *next;
61233 +
61234 + for_all_in_htable(&ps_table, ps, cur, next) {
61235 + ps_hash_remove(&ps_table, cur);
61236 + kmem_cache_free(plugin_set_slab, cur);
61237 + }
61238 + destroy_reiser4_cache(&plugin_set_slab);
61239 + ps_hash_done(&ps_table);
61240 +}
61241 +
61242 +/*
61243 + * Local variables:
61244 + * c-indentation-style: "K&R"
61245 + * mode-name: "LC"
61246 + * c-basic-offset: 8
61247 + * tab-width: 8
61248 + * fill-column: 120
61249 + * End:
61250 + */
61251 Index: linux-2.6.16/fs/reiser4/plugin/plugin_set.h
61252 ===================================================================
61253 --- /dev/null
61254 +++ linux-2.6.16/fs/reiser4/plugin/plugin_set.h
61255 @@ -0,0 +1,83 @@
61256 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
61257 +
61258 +/* plugin-sets. see fs/reiser4/plugin/plugin_set.c for details */
61259 +
61260 +#if !defined( __PLUGIN_SET_H__ )
61261 +#define __PLUGIN_SET_H__
61262 +
61263 +#include "../type_safe_hash.h"
61264 +#include "plugin.h"
61265 +
61266 +#include <linux/rcupdate.h>
61267 +
61268 +struct plugin_set;
61269 +typedef struct plugin_set plugin_set;
61270 +
61271 +TYPE_SAFE_HASH_DECLARE(ps, plugin_set);
61272 +
61273 +struct plugin_set {
61274 + unsigned long hashval;
61275 + /* plugin of file */
61276 + file_plugin *file;
61277 + /* plugin of dir */
61278 + dir_plugin *dir;
61279 + /* perm plugin for this file */
61280 + perm_plugin *perm;
61281 + /* tail policy plugin. Only meaningful for regular files */
61282 + formatting_plugin *formatting;
61283 + /* hash plugin. Only meaningful for directories. */
61284 + hash_plugin *hash;
61285 + /* fibration plugin. Only meaningful for directories. */
61286 + fibration_plugin *fibration;
61287 + /* plugin of stat-data */
61288 + item_plugin *sd;
61289 + /* plugin of items a directory is built of */
61290 + item_plugin *dir_item;
61291 + /* cipher plugin */
61292 + cipher_plugin *cipher;
61293 + /* digest plugin */
61294 + digest_plugin *digest;
61295 + /* compression plugin */
61296 + compression_plugin *compression;
61297 + /* compression mode plugin */
61298 + compression_mode_plugin *compression_mode;
61299 + /* cluster plugin */
61300 + cluster_plugin *cluster;
61301 +	/* plugin which regular children should be created with */
61302 + regular_plugin *regular_entry;
61303 + ps_hash_link link;
61304 +};
61305 +
61306 +extern plugin_set *plugin_set_get_empty(void);
61307 +extern void plugin_set_put(plugin_set * set);
61308 +
61309 +extern int plugin_set_file(plugin_set ** set, file_plugin * plug);
61310 +extern int plugin_set_dir(plugin_set ** set, dir_plugin * plug);
61311 +extern int plugin_set_formatting(plugin_set ** set, formatting_plugin * plug);
61312 +extern int plugin_set_hash(plugin_set ** set, hash_plugin * plug);
61313 +extern int plugin_set_fibration(plugin_set ** set, fibration_plugin * plug);
61314 +extern int plugin_set_sd(plugin_set ** set, item_plugin * plug);
61315 +extern int plugin_set_compression(plugin_set ** set, compression_plugin * plug);
61316 +extern int plugin_set_cluster(plugin_set ** set, cluster_plugin * plug);
61317 +
61318 +extern int init_plugin_set(void);
61319 +extern void done_plugin_set(void);
61320 +
61321 +extern int pset_set(plugin_set ** set, pset_member memb,
61322 + reiser4_plugin * plugin);
61323 +extern reiser4_plugin *pset_get(plugin_set * set, pset_member memb);
61324 +
61325 +extern reiser4_plugin_type pset_member_to_type_unsafe(pset_member memb);
61326 +
61327 +/* __PLUGIN_SET_H__ */
61328 +#endif
61329 +
61330 +/* Make Linus happy.
61331 + Local variables:
61332 + c-indentation-style: "K&R"
61333 + mode-name: "LC"
61334 + c-basic-offset: 8
61335 + tab-width: 8
61336 + fill-column: 120
61337 + End:
61338 +*/
61339 Index: linux-2.6.16/fs/reiser4/plugin/regular.c
61340 ===================================================================
61341 --- /dev/null
61342 +++ linux-2.6.16/fs/reiser4/plugin/regular.c
61343 @@ -0,0 +1,44 @@
61344 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
61345 +
61346 +/* Contains Reiser4 regular plugins which:
61347 +   . specify a set of reiser4 regular object plugins,
61348 +   . are used by the directory plugin to create entries powered by the
61349 +     specified regular plugins */
61350 +
61351 +#include "plugin.h"
61352 +
61353 +regular_plugin regular_plugins[LAST_REGULAR_ID] = {
61354 + [UF_REGULAR_ID] = {
61355 + .h = {
61356 + .type_id = REISER4_REGULAR_PLUGIN_TYPE,
61357 + .id = UF_REGULAR_ID,
61358 + .pops = NULL,
61359 + .label = "unixfile",
61360 + .desc = "Unix file regular plugin",
61361 + .linkage = {NULL, NULL}
61362 + },
61363 + .id = UNIX_FILE_PLUGIN_ID
61364 + },
61365 + [CRC_REGULAR_ID] = {
61366 + .h = {
61367 + .type_id = REISER4_REGULAR_PLUGIN_TYPE,
61368 + .id = CRC_REGULAR_ID,
61369 + .pops = NULL,
61370 + .label = "cryptcompress",
61371 + .desc = "Cryptcompress regular plugin",
61372 + .linkage = {NULL, NULL}
61373 + },
61374 + .id = CRC_FILE_PLUGIN_ID
61375 + }
61376 +};
61377 +
61378 +/*
61379 + Local variables:
61380 + c-indentation-style: "K&R"
61381 + mode-name: "LC"
61382 + c-basic-offset: 8
61383 + tab-width: 8
61384 + fill-column: 120
61385 + scroll-step: 1
61386 + End:
61387 +*/
61388 Index: linux-2.6.16/fs/reiser4/plugin/security/Makefile
61389 ===================================================================
61390 --- /dev/null
61391 +++ linux-2.6.16/fs/reiser4/plugin/security/Makefile
61392 @@ -0,0 +1,4 @@
61393 +obj-$(CONFIG_REISER4_FS) += security_plugins.o
61394 +
61395 +security_plugins-objs := \
61396 + perm.o
61397 Index: linux-2.6.16/fs/reiser4/plugin/security/perm.c
61398 ===================================================================
61399 --- /dev/null
61400 +++ linux-2.6.16/fs/reiser4/plugin/security/perm.c
61401 @@ -0,0 +1,44 @@
61402 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
61403 +
61404 +/*
61405 + * this file contains the implementation of permission plugins. Currently,
61406 + * only NULL_PERM_ID (a stub that performs no checks) is implemented
61407 + */
61408 +
61409 +#include "../plugin.h"
61410 +#include "../plugin_header.h"
61411 +#include "../../debug.h"
61412 +
61413 +perm_plugin perm_plugins[LAST_PERM_ID] = {
61414 + [NULL_PERM_ID] = {
61415 + .h = {
61416 + .type_id = REISER4_PERM_PLUGIN_TYPE,
61417 + .id = NULL_PERM_ID,
61418 + .pops = NULL,
61419 + .label = "null",
61420 + .desc = "stub permission plugin",
61421 + .linkage = {NULL, NULL}
61422 + },
61423 + .read_ok = NULL,
61424 + .write_ok = NULL,
61425 + .lookup_ok = NULL,
61426 + .create_ok = NULL,
61427 + .link_ok = NULL,
61428 + .unlink_ok = NULL,
61429 + .delete_ok = NULL,
61430 + .mask_ok = NULL,
61431 + .setattr_ok = NULL,
61432 + .getattr_ok = NULL,
61433 + .rename_ok = NULL,
61434 + }
61435 +};
61436 +
61437 +/*
61438 + * Local variables:
61439 + * c-indentation-style: "K&R"
61440 + * mode-name: "LC"
61441 + * c-basic-offset: 8
61442 + * tab-width: 8
61443 + * fill-column: 79
61444 + * End:
61445 + */
61446 Index: linux-2.6.16/fs/reiser4/plugin/security/perm.h
61447 ===================================================================
61448 --- /dev/null
61449 +++ linux-2.6.16/fs/reiser4/plugin/security/perm.h
61450 @@ -0,0 +1,82 @@
61451 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
61452 +
61453 +/* Perm (short for "permissions") plugins common stuff. */
61454 +
61455 +#if !defined( __REISER4_PERM_H__ )
61456 +#define __REISER4_PERM_H__
61457 +
61458 +#include "../../forward.h"
61459 +#include "../plugin_header.h"
61460 +
61461 +#include <linux/types.h>
61462 +#include <linux/fs.h> /* for struct file */
61463 +#include <linux/dcache.h> /* for struct dentry */
61464 +
61465 +/* interface for perm plugin.
61466 +
61467 +   Perm plugin methods can be implemented through:
61468 +
61469 + 1. consulting ->i_mode bits in stat data
61470 +
61471 + 2. obtaining acl from the tree and inspecting it
61472 +
61473 + 3. asking some kernel module or user-level program to authorize access.
61474 +
61475 + This allows for integration with things like capabilities, SELinux-style
61476 +   security contexts, etc.
61477 +
61478 +*/
61479 +/* NIKITA-FIXME-HANS: define what this is targeted for. It does not seem to be intended for use with sys_reiser4. Explain. */
61480 +typedef struct perm_plugin {
61481 + /* generic plugin fields */
61482 + plugin_header h;
61483 +
61484 + /* check permissions for read/write */
61485 + int (*read_ok) (struct file *file, const char __user *buf,
61486 + size_t size, loff_t *off);
61487 + int (*write_ok) (struct file *file, const char __user *buf,
61488 + size_t size, loff_t *off);
61489 +
61490 + /* check permissions for lookup */
61491 + int (*lookup_ok) (struct inode * parent, struct dentry * dentry);
61492 +
61493 + /* check permissions for create */
61494 + int (*create_ok) (struct inode * parent, struct dentry * dentry,
61495 + reiser4_object_create_data * data);
61496 +
61497 + /* check permissions for linking @where to @existing */
61498 + int (*link_ok) (struct dentry * existing, struct inode * parent,
61499 + struct dentry * where);
61500 +
61501 + /* check permissions for unlinking @victim from @parent */
61502 + int (*unlink_ok) (struct inode * parent, struct dentry * victim);
61503 +
61504 + /* check permissions for deletion of @object whose last reference is
61505 + by @parent */
61506 + int (*delete_ok) (struct inode * parent, struct dentry * victim);
61507 + int (*mask_ok) (struct inode * inode, int mask);
61508 + /* check whether attribute change is acceptable */
61509 + int (*setattr_ok) (struct dentry * dentry, struct iattr * attr);
61510 +
61511 + /* check whether stat(2) is allowed */
61512 + int (*getattr_ok) (struct vfsmount * mnt UNUSED_ARG,
61513 + struct dentry * dentry, struct kstat * stat);
61514 + /* check whether rename(2) is allowed */
61515 + int (*rename_ok) (struct inode * old_dir, struct dentry * old,
61516 + struct inode * new_dir, struct dentry * new);
61517 +} perm_plugin;
61518 +
61519 +typedef enum { NULL_PERM_ID, LAST_PERM_ID } reiser4_perm_id;
61520 +
61521 +/* __REISER4_PERM_H__ */
61522 +#endif
61523 +
61524 +/* Make Linus happy.
61525 + Local variables:
61526 + c-indentation-style: "K&R"
61527 + mode-name: "LC"
61528 + c-basic-offset: 8
61529 + tab-width: 8
61530 + fill-column: 120
61531 + End:
61532 +*/
61533 Index: linux-2.6.16/fs/reiser4/plugin/space/Makefile
61534 ===================================================================
61535 --- /dev/null
61536 +++ linux-2.6.16/fs/reiser4/plugin/space/Makefile
61537 @@ -0,0 +1,4 @@
61538 +obj-$(CONFIG_REISER4_FS) += space_plugins.o
61539 +
61540 +space_plugins-objs := \
61541 + bitmap.o
61542 Index: linux-2.6.16/fs/reiser4/plugin/space/bitmap.c
61543 ===================================================================
61544 --- /dev/null
61545 +++ linux-2.6.16/fs/reiser4/plugin/space/bitmap.c
61546 @@ -0,0 +1,1592 @@
61547 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
61548 +
61549 +#include "../../debug.h"
61550 +#include "../../dformat.h"
61551 +#include "../../txnmgr.h"
61552 +#include "../../jnode.h"
61553 +#include "../../block_alloc.h"
61554 +#include "../../tree.h"
61555 +#include "../../super.h"
61556 +#include "../plugin.h"
61557 +#include "space_allocator.h"
61558 +#include "bitmap.h"
61559 +
61560 +#include <linux/types.h>
61561 +#include <linux/fs.h> /* for struct super_block */
61562 +#include <asm/semaphore.h>
61563 +#include <linux/vmalloc.h>
61564 +#include <asm/div64.h>
61565 +
61566 +/* Proposed (but discarded) optimization: dynamic loading/unloading of bitmap
61567 + * blocks
61568 +
61569 +   A useful optimization of reiser4 bitmap handling would be dynamic loading
61570 +   and unloading of bitmap blocks, unlike v3.x where all bitmap blocks are
61571 +   loaded at mount time.
61572 +
61573 +   To implement bitmap block unloading we need to count bitmap block usage
61574 + and detect currently unused blocks allowing them to be unloaded. It is not
61575 + a simple task since we allow several threads to modify one bitmap block
61576 + simultaneously.
61577 +
61578 +   Briefly speaking, the following scheme is proposed: keep a usage counter
61579 +   in a special variable associated with each bitmap block, counting the
61580 +   block alloc/dealloc operations on that bitmap block. With the deferred
61581 +   block deallocation feature of reiser4, all those operations will be
61582 +   represented in atom dirty/deleted lists as jnodes for freshly allocated
61583 +   or deleted nodes.
61584 +
61585 +   So, we increment the usage counter for each new node allocated or deleted,
61586 +   and decrement it at atom commit time, once for each node on the atom's
61587 +   dirty/deleted lists. Of course, deletion of a freshly allocated node and
61588 +   node reuse from the atom's deleted list (if we do so) should also
61589 +   decrement the bitmap usage counter.
61590 +
61591 +   This scheme seems workable, but such reference counting is not easy to
61592 +   debug. I think we should agree with Hans and not implement it in v4.0.
61593 +   The current code implements "on-demand" bitmap block loading only.
61594 +
61595 +   For simplicity, all bitmap nodes (both commit and working bitmap blocks)
61596 +   are either loaded into memory at fs mount time or each bitmap node is
61597 +   loaded on first access to it; the "dont_load_bitmap" mount option controls
61598 +   whether bitmap nodes should be loaded at mount time. Dynamic unloading of
61599 +   bitmap nodes is currently not supported. */
61600 +
61601 +#define CHECKSUM_SIZE 4
61602 +
61603 +#define BYTES_PER_LONG (sizeof(long))
61604 +
61605 +#if BITS_PER_LONG == 64
61606 +# define LONG_INT_SHIFT (6)
61607 +#else
61608 +# define LONG_INT_SHIFT (5)
61609 +#endif
61610 +
61611 +#define LONG_INT_MASK (BITS_PER_LONG - 1UL)
61612 +
61613 +typedef unsigned long ulong_t;
61614 +
61615 +#define bmap_size(blocksize) ((blocksize) - CHECKSUM_SIZE)
61616 +#define bmap_bit_count(blocksize) (bmap_size(blocksize) << 3)
61617 +
61618 +/* Block allocation/deallocation are done through special bitmap objects which
61619 +   are allocated in an array at fs mount time. */
61620 +struct bitmap_node {
61621 + struct semaphore sema; /* long term lock object */
61622 +
61623 + jnode *wjnode; /* j-nodes for WORKING ... */
61624 + jnode *cjnode; /* ... and COMMIT bitmap blocks */
61625 +
61626 + bmap_off_t first_zero_bit; /* for skip_busy option implementation */
61627 +
61628 + atomic_t loaded; /* a flag which shows that bnode is loaded
61629 + * already */
61630 +};
61631 +
61632 +static inline char *bnode_working_data(struct bitmap_node *bnode)
61633 +{
61634 + char *data;
61635 +
61636 + data = jdata(bnode->wjnode);
61637 + assert("zam-429", data != NULL);
61638 +
61639 + return data + CHECKSUM_SIZE;
61640 +}
61641 +
61642 +static inline char *bnode_commit_data(const struct bitmap_node *bnode)
61643 +{
61644 + char *data;
61645 +
61646 + data = jdata(bnode->cjnode);
61647 + assert("zam-430", data != NULL);
61648 +
61649 + return data + CHECKSUM_SIZE;
61650 +}
61651 +
61652 +static inline __u32 bnode_commit_crc(const struct bitmap_node *bnode)
61653 +{
61654 + char *data;
61655 +
61656 + data = jdata(bnode->cjnode);
61657 + assert("vpf-261", data != NULL);
61658 +
61659 + return le32_to_cpu(get_unaligned((d32 *)data));
61660 +}
61661 +
61662 +static inline void bnode_set_commit_crc(struct bitmap_node *bnode, __u32 crc)
61663 +{
61664 + char *data;
61665 +
61666 + data = jdata(bnode->cjnode);
61667 + assert("vpf-261", data != NULL);
61668 +
61669 + put_unaligned(cpu_to_le32(crc), (d32 *)data);
61670 +}
61671 +
61672 +/* ZAM-FIXME-HANS: is the idea that this might be a union someday? having
61673 + * written the code, does this added abstraction still have */
61674 +/* ANSWER(Zam): No, the abstraction is at the level above (the exact place is
61675 + * the reiser4_space_allocator structure) */
61676 +/* ZAM-FIXME-HANS: I don't understand your english in comment above. */
61677 +/* FIXME-HANS(Zam): I don't understand the questions like "might be a union
61678 + * someday?". What are they about? If there is a reason to have a union, it
61679 + * be a union, if not, it should not be a union. "..might be someday" means no
61680 + * reason. */
61681 +struct bitmap_allocator_data {
61682 + /* an array for bitmap blocks direct access */
61683 + struct bitmap_node *bitmap;
61684 +};
61685 +
61686 +#define get_barray(super) \
61687 +(((struct bitmap_allocator_data *)(get_super_private(super)->space_allocator.u.generic)) -> bitmap)
61688 +
61689 +#define get_bnode(super, i) (get_barray(super) + i)
61690 +
61691 +/* allocate and initialize jnode with JNODE_BITMAP type */
61692 +static jnode *bnew(void)
61693 +{
61694 + jnode *jal = jalloc();
61695 +
61696 + if (jal)
61697 + jnode_init(jal, current_tree, JNODE_BITMAP);
61698 +
61699 + return jal;
61700 +}
61701 +
61702 +/* this file contains:
61703 + - bitmap based implementation of space allocation plugin
61704 + - all the helper functions like set bit, find_first_zero_bit, etc */
61705 +
61706 +/* Audited by: green(2002.06.12) */
61707 +static int find_next_zero_bit_in_word(ulong_t word, int start_bit)
61708 +{
61709 + ulong_t mask = 1UL << start_bit;
61710 + int i = start_bit;
61711 +
61712 + while ((word & mask) != 0) {
61713 + mask <<= 1;
61714 + if (++i >= BITS_PER_LONG)
61715 + break;
61716 + }
61717 +
61718 + return i;
61719 +}
61720 +
61721 +#include <asm/bitops.h>
61722 +
61723 +#if BITS_PER_LONG == 64
61724 +
61725 +#define OFF(addr) (((ulong_t)(addr) & (BYTES_PER_LONG - 1)) << 3)
61726 +#define BASE(addr) ((ulong_t*) ((ulong_t)(addr) & ~(BYTES_PER_LONG - 1)))
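+/* the ext2_*_bit() helpers expect a long-aligned base address on 64-bit
+   platforms; OFF()/BASE() split an arbitrary byte address into an aligned
+   base and a compensating bit offset */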
61727 +
61728 +static inline void reiser4_set_bit(int nr, void *addr)
61729 +{
61730 + ext2_set_bit(nr + OFF(addr), BASE(addr));
61731 +}
61732 +
61733 +static inline void reiser4_clear_bit(int nr, void *addr)
61734 +{
61735 + ext2_clear_bit(nr + OFF(addr), BASE(addr));
61736 +}
61737 +
61738 +static inline int reiser4_test_bit(int nr, void *addr)
61739 +{
61740 + return ext2_test_bit(nr + OFF(addr), BASE(addr));
61741 +}
61742 +static inline int reiser4_find_next_zero_bit(void *addr, int maxoffset,
61743 + int offset)
61744 +{
61745 + int off = OFF(addr);
61746 +
61747 + return ext2_find_next_zero_bit(BASE(addr), maxoffset + off,
61748 + offset + off) - off;
61749 +}
61750 +
61751 +#else
61752 +
61753 +#define reiser4_set_bit(nr, addr) ext2_set_bit(nr, addr)
61754 +#define reiser4_clear_bit(nr, addr) ext2_clear_bit(nr, addr)
61755 +#define reiser4_test_bit(nr, addr) ext2_test_bit(nr, addr)
61756 +
61757 +#define reiser4_find_next_zero_bit(addr, maxoffset, offset) \
61758 +ext2_find_next_zero_bit(addr, maxoffset, offset)
61759 +#endif
61760 +
61761 +/* Search for a set bit in the bit array [@start_offset, @max_offset);
61762 + * offsets are counted from @addr. Return the offset of the first set bit if
61763 + * one is found, @max_offset otherwise. */
61764 +static bmap_off_t __reiser4_find_next_set_bit(void *addr, bmap_off_t max_offset,
61765 + bmap_off_t start_offset)
61766 +{
61767 + ulong_t *base = addr;
61768 +	/* start_offset is in bits, convert it to a word index within the bitmap. */
61769 + int word_nr = start_offset >> LONG_INT_SHIFT;
61770 +	/* bit number within the word. */
61771 + int bit_nr = start_offset & LONG_INT_MASK;
61772 + int max_word_nr = (max_offset - 1) >> LONG_INT_SHIFT;
61773 +
61774 + assert("zam-387", max_offset != 0);
61775 +
61776 + /* Unaligned @start_offset case. */
61777 + if (bit_nr != 0) {
61778 + bmap_nr_t nr;
61779 +
61780 + nr = find_next_zero_bit_in_word(~(base[word_nr]), bit_nr);
61781 +
61782 + if (nr < BITS_PER_LONG)
61783 + return (word_nr << LONG_INT_SHIFT) + nr;
61784 +
61785 + ++word_nr;
61786 + }
61787 +
61788 +	/* Fast scan through aligned words. */
61789 + while (word_nr <= max_word_nr) {
61790 + if (base[word_nr] != 0) {
61791 + return (word_nr << LONG_INT_SHIFT)
61792 + + find_next_zero_bit_in_word(~(base[word_nr]), 0);
61793 + }
61794 +
61795 + ++word_nr;
61796 + }
61797 +
61798 + return max_offset;
61799 +}
61800 +
61801 +#if BITS_PER_LONG == 64
61802 +
61803 +static bmap_off_t reiser4_find_next_set_bit(void *addr, bmap_off_t max_offset,
61804 + bmap_off_t start_offset)
61805 +{
61806 + bmap_off_t off = OFF(addr);
61807 +
61808 + return __reiser4_find_next_set_bit(BASE(addr), max_offset + off,
61809 + start_offset + off) - off;
61810 +}
61811 +
61812 +#else
61813 +#define reiser4_find_next_set_bit(addr, max_offset, start_offset) \
61814 + __reiser4_find_next_set_bit(addr, max_offset, start_offset)
61815 +#endif
61816 +
61817 +/* search a single word backward for the highest set bit at or below @start_bit. */
61818 +static int find_last_set_bit_in_word(ulong_t word, int start_bit)
61819 +{
61820 + ulong_t bit_mask;
61821 + int nr = start_bit;
61822 +
61823 + assert("zam-965", start_bit < BITS_PER_LONG);
61824 + assert("zam-966", start_bit >= 0);
61825 +
61826 + bit_mask = (1UL << nr);
61827 +
61828 + while (bit_mask != 0) {
61829 + if (bit_mask & word)
61830 + return nr;
61831 + bit_mask >>= 1;
61832 + nr--;
61833 + }
61834 + return BITS_PER_LONG;
61835 +}
61836 +
61837 +/* Search bitmap for a set bit in backward direction from the end to the
61838 + * beginning of given region
61839 + *
61840 + * @result: result offset of the last set bit
61841 + * @addr: base memory address,
61842 + * @low_off: low end of the search region, edge bit included into the region,
61843 + * @high_off: high end of the search region, edge bit included into the region,
61844 + *
61845 + * @return: 0 - set bit was found, -1 otherwise.
61846 + */
61847 +static int
61848 +reiser4_find_last_set_bit(bmap_off_t * result, void *addr, bmap_off_t low_off,
61849 + bmap_off_t high_off)
61850 +{
61851 + ulong_t *base = addr;
61852 + int last_word;
61853 + int first_word;
61854 + int last_bit;
61855 + int nr;
61856 +
61857 + assert("zam-961", high_off >= 0);
61858 + assert("zam-962", high_off >= low_off);
61859 +
61860 + last_word = high_off >> LONG_INT_SHIFT;
61861 + last_bit = high_off & LONG_INT_MASK;
61862 + first_word = low_off >> LONG_INT_SHIFT;
61863 +
61864 + if (last_bit < BITS_PER_LONG) {
61865 + nr = find_last_set_bit_in_word(base[last_word], last_bit);
61866 + if (nr < BITS_PER_LONG) {
61867 + *result = (last_word << LONG_INT_SHIFT) + nr;
61868 + return 0;
61869 + }
61870 + --last_word;
61871 + }
61872 + while (last_word >= first_word) {
61873 + if (base[last_word] != 0x0) {
61874 + last_bit =
61875 + find_last_set_bit_in_word(base[last_word],
61876 + BITS_PER_LONG - 1);
61877 + assert("zam-972", last_bit < BITS_PER_LONG);
61878 + *result = (last_word << LONG_INT_SHIFT) + last_bit;
61879 + return 0;
61880 + }
61881 + --last_word;
61882 + }
61883 +
61884 + return -1; /* set bit not found */
61885 +}
61886 +
61887 +/* Search bitmap for a clear bit in backward direction from the end to the
61888 + * beginning of given region */
61889 +static int
61890 +reiser4_find_last_zero_bit(bmap_off_t * result, void *addr, bmap_off_t low_off,
61891 + bmap_off_t high_off)
61892 +{
61893 + ulong_t *base = addr;
61894 + int last_word;
61895 + int first_word;
61896 + int last_bit;
61897 + int nr;
61898 +
61899 + last_word = high_off >> LONG_INT_SHIFT;
61900 + last_bit = high_off & LONG_INT_MASK;
61901 + first_word = low_off >> LONG_INT_SHIFT;
61902 +
61903 + if (last_bit < BITS_PER_LONG) {
61904 + nr = find_last_set_bit_in_word(~base[last_word], last_bit);
61905 + if (nr < BITS_PER_LONG) {
61906 + *result = (last_word << LONG_INT_SHIFT) + nr;
61907 + return 0;
61908 + }
61909 + --last_word;
61910 + }
61911 + while (last_word >= first_word) {
61912 + if (base[last_word] != (ulong_t) (-1)) {
61913 + *result = (last_word << LONG_INT_SHIFT) +
61914 + find_last_set_bit_in_word(~base[last_word],
61915 + BITS_PER_LONG - 1);
61916 + return 0;
61917 + }
61918 + --last_word;
61919 + }
61920 +
61921 + return -1; /* zero bit not found */
61922 +}
61923 +
61924 +/* Audited by: green(2002.06.12) */
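+/* Clear all bits in the range [@start, @end) of the bitmap at @addr: whole
+   bytes strictly inside the range are zeroed with memset(), the partial
+   first and last bytes are masked individually */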
61925 +static void reiser4_clear_bits(char *addr, bmap_off_t start, bmap_off_t end)
61926 +{
61927 + int first_byte;
61928 + int last_byte;
61929 +
61930 + unsigned char first_byte_mask = 0xFF;
61931 + unsigned char last_byte_mask = 0xFF;
61932 +
61933 + assert("zam-410", start < end);
61934 +
61935 + first_byte = start >> 3;
61936 + last_byte = (end - 1) >> 3;
61937 +
61938 + if (last_byte > first_byte + 1)
61939 + memset(addr + first_byte + 1, 0,
61940 + (size_t) (last_byte - first_byte - 1));
61941 +
61942 + first_byte_mask >>= 8 - (start & 0x7);
61943 + last_byte_mask <<= ((end - 1) & 0x7) + 1;
61944 +
61945 + if (first_byte == last_byte) {
61946 + addr[first_byte] &= (first_byte_mask | last_byte_mask);
61947 + } else {
61948 + addr[first_byte] &= first_byte_mask;
61949 + addr[last_byte] &= last_byte_mask;
61950 + }
61951 +}
61952 +
61953 +/* Audited by: green(2002.06.12) */
61954 +/* ZAM-FIXME-HANS: comment this */
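+/* Set all bits in the range [@start, @end) of the bitmap at @addr: whole
+   bytes strictly inside the range are filled with memset(0xFF), the partial
+   first and last bytes are patched with masks */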
61955 +static void reiser4_set_bits(char *addr, bmap_off_t start, bmap_off_t end)
61956 +{
61957 + int first_byte;
61958 + int last_byte;
61959 +
61960 + unsigned char first_byte_mask = 0xFF;
61961 + unsigned char last_byte_mask = 0xFF;
61962 +
61963 + assert("zam-386", start < end);
61964 +
61965 + first_byte = start >> 3;
61966 + last_byte = (end - 1) >> 3;
61967 +
61968 + if (last_byte > first_byte + 1)
61969 + memset(addr + first_byte + 1, 0xFF,
61970 + (size_t) (last_byte - first_byte - 1));
61971 +
61972 + first_byte_mask <<= start & 0x7;
61973 + last_byte_mask >>= 7 - ((end - 1) & 0x7);
61974 +
61975 + if (first_byte == last_byte) {
61976 + addr[first_byte] |= (first_byte_mask & last_byte_mask);
61977 + } else {
61978 + addr[first_byte] |= first_byte_mask;
61979 + addr[last_byte] |= last_byte_mask;
61980 + }
61981 +}
61982 +
61983 +#define ADLER_BASE 65521
61984 +#define ADLER_NMAX 5552
61985 +
61986 +/* Calculates the adler32 checksum for the data pointed by `data` of the
61987 + length `len`. This function was originally taken from zlib, version 1.1.3,
61988 + July 9th, 1998.
61989 +
61990 + Copyright (C) 1995-1998 Jean-loup Gailly and Mark Adler
61991 +
61992 + This software is provided 'as-is', without any express or implied
61993 + warranty. In no event will the authors be held liable for any damages
61994 + arising from the use of this software.
61995 +
61996 + Permission is granted to anyone to use this software for any purpose,
61997 + including commercial applications, and to alter it and redistribute it
61998 + freely, subject to the following restrictions:
61999 +
62000 + 1. The origin of this software must not be misrepresented; you must not
62001 + claim that you wrote the original software. If you use this software
62002 + in a product, an acknowledgment in the product documentation would be
62003 + appreciated but is not required.
62004 + 2. Altered source versions must be plainly marked as such, and must not be
62005 + misrepresented as being the original software.
62006 + 3. This notice may not be removed or altered from any source distribution.
62007 +
62008 + Jean-loup Gailly Mark Adler
62009 + jloup@gzip.org madler@alumni.caltech.edu
62010 +
62011 + The above comment applies only to the reiser4_adler32 function.
62012 +*/
62013 +
62014 +__u32 reiser4_adler32(char *data, __u32 len)
62015 +{
62016 + unsigned char *t = data;
62017 + __u32 s1 = 1;
62018 + __u32 s2 = 0;
62019 + int k;
62020 +
62021 + while (len > 0) {
62022 + k = len < ADLER_NMAX ? len : ADLER_NMAX;
62023 + len -= k;
62024 +
62025 + while (k--) {
62026 + s1 += *t++;
62027 + s2 += s1;
62028 + }
62029 +
62030 + s1 %= ADLER_BASE;
62031 + s2 %= ADLER_BASE;
62032 + }
62033 + return (s2 << 16) | s1;
62034 +}
62035 +
62036 +#define sb_by_bnode(bnode) \
62037 + ((struct super_block *)jnode_get_tree(bnode->wjnode)->super)
62038 +
62039 +static __u32 bnode_calc_crc(const struct bitmap_node *bnode, unsigned long size)
62040 +{
62041 + return reiser4_adler32(bnode_commit_data(bnode), bmap_size(size));
62042 +}
62043 +
62044 +static int
62045 +bnode_check_adler32(const struct bitmap_node *bnode, unsigned long size)
62046 +{
62047 + if (bnode_calc_crc(bnode, size) != bnode_commit_crc(bnode)) {
62048 + bmap_nr_t bmap;
62049 +
62050 + bmap = bnode - get_bnode(sb_by_bnode(bnode), 0);
62051 +
62052 + warning("vpf-263",
62053 + "Checksum for the bitmap block %llu is incorrect",
62054 + bmap);
62055 +
62056 + return RETERR(-EIO);
62057 + }
62058 +
62059 + return 0;
62060 +}
62061 +
62062 +#define REISER4_CHECK_BMAP_CRC (0)
62063 +
62064 +#if REISER4_CHECK_BMAP_CRC
62065 +static int bnode_check_crc(const struct bitmap_node *bnode)
62066 +{
62067 +	/* bnode_check_adler32() applies bmap_size() internally, so it must be
62068 +	 * passed the raw blocksize here */
62069 +	return bnode_check_adler32(bnode,
62070 +				   sb_by_bnode(bnode)->s_blocksize);
62069 +}
62070 +
62071 +/* REISER4_CHECK_BMAP_CRC */
62072 +#else
62073 +
62074 +#define bnode_check_crc(bnode) (0)
62075 +
62076 +/* REISER4_CHECK_BMAP_CRC */
62077 +#endif
62078 +
62079 +/* Recalculates the adler32 checksum for only 1 byte change.
62080 + adler - previous adler checksum
62081 + old_data, data - old, new byte values.
62082 +   tail == (chunk - offset), where chunk is the length the checksum was
62083 +   calculated over and offset is the offset of the changed byte within it.
62084 + This function can be used for checksum calculation optimisation.
62085 +*/
62086 +
62087 +static __u32
62088 +adler32_recalc(__u32 adler, unsigned char old_data, unsigned char data,
62089 + __u32 tail)
62090 +{
62091 + __u32 delta = data - old_data + 2 * ADLER_BASE;
62092 + __u32 s1 = adler & 0xffff;
62093 + __u32 s2 = (adler >> 16) & 0xffff;
62094 +
62095 + s1 = (delta + s1) % ADLER_BASE;
62096 + s2 = (delta * tail + s2) % ADLER_BASE;
62097 +
62098 + return (s2 << 16) | s1;
62099 +}
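+/*
+ * A minimal usage sketch, assuming a buffer @data of length @len whose byte
+ * at offset @off is about to change from @old to @new:
+ *
+ *	__u32 sum = reiser4_adler32(data, len);
+ *	data[off] = new;
+ *	sum = adler32_recalc(sum, old, new, len - off);
+ *
+ * sum now equals what reiser4_adler32(data, len) would recompute from
+ * scratch, at the cost of a few arithmetic operations.
+ */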
62100 +
62101 +#define LIMIT(val, boundary) ((val) > (boundary) ? (boundary) : (val))
62102 +
62103 +/**
62104 + * get_nr_bmap - calculate number of bitmap blocks
62105 + * @super: super block with initialized blocksize and block count
62106 + *
62107 + * Calculates the number of bitmap blocks of a filesystem which uses bitmaps
62108 + * to maintain free disk space. It assumes that each bitmap addresses the
62109 + * same number of blocks, which is calculated by the bmap_bit_count macro
62110 + * defined above. The number of blocks in the filesystem has to be already
62111 + * initialized in the reiser4 private data of the super block so that it can
62112 + * be obtained via reiser4_block_count(). Unfortunately, the number of blocks
62113 + * addressed by a bitmap is not a power of 2 because 4 bytes are used for the
62114 + * checksum. Therefore, we have to use special functions to divide and modulo
62115 + * 64-bit filesystem block counters.
62116 + *
62117 + * Example: suppose a filesystem has 32768 blocks. Blocksize is 4096. Each bitmap
62118 + * block addresses (4096 - 4) * 8 = 32736 blocks. Number of bitmaps to address
62119 + * all 32768 blocks is calculated as (32768 - 1) / 32736 + 1 = 2.
62120 + */
62121 +static bmap_nr_t get_nr_bmap(const struct super_block *super)
62122 +{
62123 + u64 quotient;
62124 +
62125 + assert("zam-393", reiser4_block_count(super) != 0);
62126 +
62127 + quotient = reiser4_block_count(super) - 1;
62128 + do_div(quotient, bmap_bit_count(super->s_blocksize));
62129 + return quotient + 1;
62130 +}
62131 +
62132 +/**
62133 + * parse_blocknr - calculate bitmap number and offset in it by block number
62134 + * @block: pointer to block number to calculate location in bitmap of
62135 + * @bmap: pointer where to store bitmap block number
62136 + * @offset: pointer where to store offset within bitmap block
62137 + *
62138 + * Calculates location of bit which is responsible for allocation/freeing of
62139 + * block @*block. That location is represented by bitmap block number and offset
62140 + * within that bitmap block.
62141 + */
62142 +static void
62143 +parse_blocknr(const reiser4_block_nr *block, bmap_nr_t *bmap,
62144 + bmap_off_t *offset)
62145 +{
62146 + struct super_block *super = get_current_context()->super;
62147 + u64 quotient = *block;
62148 +
62149 + *offset = do_div(quotient, bmap_bit_count(super->s_blocksize));
62150 + *bmap = quotient;
62151 +
62152 + assert("zam-433", *bmap < get_nr_bmap(super));
62153 + assert("", *offset < bmap_bit_count(super->s_blocksize));
62154 +}
62155 +
62156 +#if REISER4_DEBUG
62157 +/* Audited by: green(2002.06.12) */
62158 +static void
62159 +check_block_range(const reiser4_block_nr * start, const reiser4_block_nr * len)
62160 +{
62161 + struct super_block *sb = reiser4_get_current_sb();
62162 +
62163 + assert("zam-436", sb != NULL);
62164 +
62165 + assert("zam-455", start != NULL);
62166 + assert("zam-437", *start != 0);
62167 + assert("zam-541", !blocknr_is_fake(start));
62168 + assert("zam-441", *start < reiser4_block_count(sb));
62169 +
62170 + if (len != NULL) {
62171 + assert("zam-438", *len != 0);
62172 + assert("zam-442", *start + *len <= reiser4_block_count(sb));
62173 + }
62174 +}
62175 +
62176 +static void check_bnode_loaded(const struct bitmap_node *bnode)
62177 +{
62178 + assert("zam-485", bnode != NULL);
62179 + assert("zam-483", jnode_page(bnode->wjnode) != NULL);
62180 + assert("zam-484", jnode_page(bnode->cjnode) != NULL);
62181 + assert("nikita-2820", jnode_is_loaded(bnode->wjnode));
62182 + assert("nikita-2821", jnode_is_loaded(bnode->cjnode));
62183 +}
62184 +
62185 +#else
62186 +
62187 +# define check_block_range(start, len) do { /* nothing */} while(0)
62188 +# define check_bnode_loaded(bnode) do { /* nothing */} while(0)
62189 +
62190 +#endif
62191 +
62192 +/* modify bnode->first_zero_bit (if we free bits before); bnode should be
62193 + spin-locked */
62194 +static inline void
62195 +adjust_first_zero_bit(struct bitmap_node *bnode, bmap_off_t offset)
62196 +{
62197 + if (offset < bnode->first_zero_bit)
62198 + bnode->first_zero_bit = offset;
62199 +}
62200 +
62201 +/* return a physical disk address for logical bitmap number @bmap */
62202 +/* FIXME-VS: this is somehow related to disk layout? */
62203 +/* ZAM-FIXME-HANS: your answer is? Use not more than one function dereference
62204 + * per block allocation so that performance is not affected. Probably this
62205 + * whole file should be considered part of the disk layout plugin, and other
62206 + * disk layouts can use other defines and efficiency will not be significantly
62207 + * affected. */
62208 +
62209 +#define REISER4_FIRST_BITMAP_BLOCK \
62210 + ((REISER4_MASTER_OFFSET / PAGE_CACHE_SIZE) + 2)
62211 +
62212 +/* Audited by: green(2002.06.12) */
62213 +static void
62214 +get_bitmap_blocknr(struct super_block *super, bmap_nr_t bmap,
62215 + reiser4_block_nr * bnr)
62216 +{
62217 +
62218 + assert("zam-390", bmap < get_nr_bmap(super));
62219 +
62220 +#ifdef CONFIG_REISER4_BADBLOCKS
62221 +#define BITMAP_PLUGIN_DISKMAP_ID ((0xc0e1<<16) | (0xe0ff))
62222 + /* Check if the diskmap have this already, first. */
62223 + if (reiser4_get_diskmap_value(BITMAP_PLUGIN_DISKMAP_ID, bmap, bnr) == 0)
62224 + return; /* Found it in diskmap */
62225 +#endif
62226 +	/* FIXME_ZAM: pending discussion of disk layouts and disk format
62227 +	   plugins I implement a bitmap location scheme which is close to the
62228 +	   scheme used in reiser 3.6 */
62229 + if (bmap == 0) {
62230 + *bnr = REISER4_FIRST_BITMAP_BLOCK;
62231 + } else {
62232 + *bnr = bmap * bmap_bit_count(super->s_blocksize);
62233 + }
62234 +}
62235 +
62236 +/* construct a fake block number for shadow bitmap (WORKING BITMAP) block */
62237 +/* Audited by: green(2002.06.12) */
62238 +static void get_working_bitmap_blocknr(bmap_nr_t bmap, reiser4_block_nr * bnr)
62239 +{
62240 + *bnr =
62241 + (reiser4_block_nr) ((bmap & ~REISER4_BLOCKNR_STATUS_BIT_MASK) |
62242 + REISER4_BITMAP_BLOCKS_STATUS_VALUE);
62243 +}
62244 +
62245 +/* bnode structure initialization */
62246 +static void
62247 +init_bnode(struct bitmap_node *bnode,
62248 + struct super_block *super UNUSED_ARG, bmap_nr_t bmap UNUSED_ARG)
62249 +{
62250 + memset(bnode, 0, sizeof(struct bitmap_node));
62251 +
62252 + sema_init(&bnode->sema, 1);
62253 + atomic_set(&bnode->loaded, 0);
62254 +}
62255 +
62256 +static void release(jnode * node)
62257 +{
62258 + jrelse(node);
62259 + JF_SET(node, JNODE_HEARD_BANSHEE);
62260 + jput(node);
62261 +}
62262 +
62263 +/* This function is for internal bitmap.c use because it assumes that the
62264 +   jnode is under full control of this thread */
62265 +static void done_bnode(struct bitmap_node *bnode)
62266 +{
62267 + if (bnode) {
62268 + atomic_set(&bnode->loaded, 0);
62269 + if (bnode->wjnode != NULL)
62270 + release(bnode->wjnode);
62271 + if (bnode->cjnode != NULL)
62272 + release(bnode->cjnode);
62273 + bnode->wjnode = bnode->cjnode = NULL;
62274 + }
62275 +}
62276 +
62277 +/* ZAM-FIXME-HANS: comment this. Called only by load_and_lock_bnode()*/
62278 +static int
62279 +prepare_bnode(struct bitmap_node *bnode, jnode ** cjnode_ret,
62280 + jnode ** wjnode_ret)
62281 +{
62282 + struct super_block *super;
62283 + jnode *cjnode;
62284 + jnode *wjnode;
62285 + bmap_nr_t bmap;
62286 + int ret;
62287 +
62288 + super = reiser4_get_current_sb();
62289 +
62290 + *wjnode_ret = wjnode = bnew();
62291 + if (wjnode == NULL) {
62292 + *cjnode_ret = NULL;
62293 + return RETERR(-ENOMEM);
62294 + }
62295 +
62296 + *cjnode_ret = cjnode = bnew();
62297 + if (cjnode == NULL)
62298 + return RETERR(-ENOMEM);
62299 +
62300 + bmap = bnode - get_bnode(super, 0);
62301 +
62302 + get_working_bitmap_blocknr(bmap, &wjnode->blocknr);
62303 + get_bitmap_blocknr(super, bmap, &cjnode->blocknr);
62304 +
62305 + jref(cjnode);
62306 + jref(wjnode);
62307 +
62308 + /* load commit bitmap */
62309 + ret = jload_gfp(cjnode, GFP_NOFS, 1);
62310 +
62311 + if (ret)
62312 + goto error;
62313 +
62314 + /* allocate memory for working bitmap block. Note that for
62315 +	 * bitmaps jinit_new() doesn't actually modify node content,
62316 + * so parallel calls to this are ok. */
62317 + ret = jinit_new(wjnode, GFP_NOFS);
62318 +
62319 + if (ret != 0) {
62320 + jrelse(cjnode);
62321 + goto error;
62322 + }
62323 +
62324 + return 0;
62325 +
62326 + error:
62327 + jput(cjnode);
62328 + jput(wjnode);
62329 + *wjnode_ret = *cjnode_ret = NULL;
62330 + return ret;
62331 +
62332 +}
62333 +
62334 +/* Check the bnode data on read. */
62335 +static int check_struct_bnode(struct bitmap_node *bnode, __u32 blksize)
62336 +{
62337 + void *data;
62338 + int ret;
62339 +
62340 + /* Check CRC */
62341 + ret = bnode_check_adler32(bnode, blksize);
62342 +
62343 + if (ret) {
62344 + return ret;
62345 + }
62346 +
62347 + data = jdata(bnode->cjnode) + CHECKSUM_SIZE;
62348 +
62349 + /* Check the very first bit -- it must be busy. */
62350 + if (!reiser4_test_bit(0, data)) {
62351 + warning("vpf-1362", "The allocator block %llu is not marked "
62352 + "as used.", (unsigned long long)bnode->cjnode->blocknr);
62353 +
62354 + return -EINVAL;
62355 + }
62356 +
62357 + return 0;
62358 +}
62359 +
62360 +/* load bitmap blocks "on-demand" */
62361 +static int load_and_lock_bnode(struct bitmap_node *bnode)
62362 +{
62363 + int ret;
62364 +
62365 + jnode *cjnode;
62366 + jnode *wjnode;
62367 +
62368 + assert("nikita-3040", schedulable());
62369 +
62370 +/* ZAM-FIXME-HANS: since bitmaps are never unloaded, this does not
62371 + * need to be atomic, right? Just leave a comment that if bitmaps were
62372 + * unloadable, this would need to be atomic. */
62373 + if (atomic_read(&bnode->loaded)) {
62374 + /* bitmap is already loaded, nothing to do */
62375 + check_bnode_loaded(bnode);
62376 + down(&bnode->sema);
62377 + assert("nikita-2827", atomic_read(&bnode->loaded));
62378 + return 0;
62379 + }
62380 +
62381 + ret = prepare_bnode(bnode, &cjnode, &wjnode);
62382 + if (ret == 0) {
62383 + down(&bnode->sema);
62384 +
62385 + if (!atomic_read(&bnode->loaded)) {
62386 + assert("nikita-2822", cjnode != NULL);
62387 + assert("nikita-2823", wjnode != NULL);
62388 + assert("nikita-2824", jnode_is_loaded(cjnode));
62389 + assert("nikita-2825", jnode_is_loaded(wjnode));
62390 +
62391 + bnode->wjnode = wjnode;
62392 + bnode->cjnode = cjnode;
62393 +
62394 + ret = check_struct_bnode(bnode, current_blocksize);
62395 + if (!ret) {
62396 + cjnode = wjnode = NULL;
62397 + atomic_set(&bnode->loaded, 1);
62398 + /* working bitmap is initialized by on-disk
62399 + * commit bitmap. This should be performed
62400 + * under semaphore. */
62401 + memcpy(bnode_working_data(bnode),
62402 + bnode_commit_data(bnode),
62403 + bmap_size(current_blocksize));
62404 + } else {
62405 + up(&bnode->sema);
62406 + }
62407 + } else
62408 + /* race: someone already loaded bitmap while we were
62409 + * busy initializing data. */
62410 + check_bnode_loaded(bnode);
62411 + }
62412 +
62413 + if (wjnode != NULL) {
62414 + release(wjnode);
62415 + bnode->wjnode = NULL;
62416 + }
62417 + if (cjnode != NULL) {
62418 + release(cjnode);
62419 + bnode->cjnode = NULL;
62420 + }
62421 +
62422 + return ret;
62423 +}
62424 +
62425 +static void release_and_unlock_bnode(struct bitmap_node *bnode)
62426 +{
62427 + check_bnode_loaded(bnode);
62428 + up(&bnode->sema);
62429 +}
62430 +
62431 +/* This function does all block allocation work but only for one bitmap
62432 + block.*/
62433 +/* FIXME_ZAM: It does not allow us to allocate block ranges across bitmap
62434 + block responsibility zone boundaries. This made no sense in v3.6 but may
62435 + make sense in v4.x */
62436 +/* ZAM-FIXME-HANS: do you mean search one bitmap block forward? */
62437 +static int
62438 +search_one_bitmap_forward(bmap_nr_t bmap, bmap_off_t * offset,
62439 + bmap_off_t max_offset, int min_len, int max_len)
62440 +{
62441 + struct super_block *super = get_current_context()->super;
62442 + struct bitmap_node *bnode = get_bnode(super, bmap);
62443 +
62444 + char *data;
62445 +
62446 + bmap_off_t search_end;
62447 + bmap_off_t start;
62448 + bmap_off_t end;
62449 +
62450 + int set_first_zero_bit = 0;
62451 +
62452 + int ret;
62453 +
62454 + assert("zam-364", min_len > 0);
62455 + assert("zam-365", max_len >= min_len);
62456 + assert("zam-366", *offset <= max_offset);
62457 +
62458 + ret = load_and_lock_bnode(bnode);
62459 +
62460 + if (ret)
62461 + return ret;
62462 +
62463 + data = bnode_working_data(bnode);
62464 +
62465 + start = *offset;
62466 +
62467 + if (bnode->first_zero_bit >= start) {
62468 + start = bnode->first_zero_bit;
62469 + set_first_zero_bit = 1;
62470 + }
62471 +
62472 + while (start + min_len < max_offset) {
62473 +
62474 + start =
62475 + reiser4_find_next_zero_bit((long *)data, max_offset, start);
62476 + if (set_first_zero_bit) {
62477 + bnode->first_zero_bit = start;
62478 + set_first_zero_bit = 0;
62479 + }
62480 + if (start >= max_offset)
62481 + break;
62482 +
62483 + search_end = LIMIT(start + max_len, max_offset);
62484 + end =
62485 + reiser4_find_next_set_bit((long *)data, search_end, start);
62486 + if (end >= start + min_len) {
62487 + /* we can't trust the find_next_set_bit result if no
62488 + set bit was found; the result may be bigger than
62489 + max_offset */
62490 + if (end > search_end)
62491 + end = search_end;
62492 +
62493 + ret = end - start;
62494 + *offset = start;
62495 +
62496 + reiser4_set_bits(data, start, end);
62497 +
62498 + /* FIXME: we may advance first_zero_bit if [start,
62499 + end] region overlaps the first_zero_bit point */
62500 +
62501 + break;
62502 + }
62503 +
62504 + start = end + 1;
62505 + }
62506 +
62507 + release_and_unlock_bnode(bnode);
62508 +
62509 + return ret;
62510 +}
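+
+/* The search above is easier to follow on a flat byte array. Below is a
+ * hedged, stand-alone restatement (a hypothetical helper, not used by the
+ * allocator): return the first offset at or after @start that begins a run
+ * of at least @min_len (and at most @max_len) zero bits in @map, or -1 if
+ * no such run ends before @nbits.
+ *
+ *	static int sketch_find_zero_run(const unsigned char *map, int nbits,
+ *					int start, int min_len, int max_len)
+ *	{
+ *		int i, run = 0;	// zero bits ending just before @i
+ *
+ *		for (i = start; i < nbits; i++) {
+ *			if (map[i >> 3] & (1 << (i & 7))) {
+ *				if (run >= min_len)
+ *					break;	// run that just ended is usable
+ *				run = 0;	// too short, forget it
+ *			} else if (++run == max_len) {
+ *				i++;		// run now ends at i - 1
+ *				break;
+ *			}
+ *		}
+ *		return run >= min_len ? i - run : -1;	// first bit of run
+ *	}
+ */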
62511 +
62512 +static int
62513 +search_one_bitmap_backward(bmap_nr_t bmap, bmap_off_t * start_offset,
62514 + bmap_off_t end_offset, int min_len, int max_len)
62515 +{
62516 + struct super_block *super = get_current_context()->super;
62517 + struct bitmap_node *bnode = get_bnode(super, bmap);
62518 + char *data;
62519 + bmap_off_t start;
62520 + int ret;
62521 +
62522 + assert("zam-958", min_len > 0);
62523 + assert("zam-959", max_len >= min_len);
62524 + assert("zam-960", *start_offset >= end_offset);
62525 +
62526 + ret = load_and_lock_bnode(bnode);
62527 + if (ret)
62528 + return ret;
62529 +
62530 + data = bnode_working_data(bnode);
62531 + start = *start_offset;
62532 +
62533 + while (1) {
62534 + bmap_off_t end, search_end;
62535 +
62536 + /* Find the beginning of the zero filled region */
62537 + if (reiser4_find_last_zero_bit(&start, data, end_offset, start))
62538 + break;
62539 + /* Are there at least `min_len' bits from `end_offset' to
62540 + * `start'? */
62541 + if (start < end_offset + min_len - 1)
62542 + break;
62543 +
62544 + /* Do not search to `end_offset' if we need to find less than
62545 + * `max_len' zero bits. */
62546 + if (end_offset + max_len - 1 < start)
62547 + search_end = start - max_len + 1;
62548 + else
62549 + search_end = end_offset;
62550 +
62551 + if (reiser4_find_last_set_bit(&end, data, search_end, start))
62552 + end = search_end;
62553 + else
62554 + end++;
62555 +
62556 + if (end + min_len <= start + 1) {
62557 + if (end < search_end)
62558 + end = search_end;
62559 + ret = start - end + 1;
62560 + *start_offset = end; /* `end' is lowest offset */
62561 + assert("zam-987",
62562 + reiser4_find_next_set_bit(data, start + 1,
62563 + end) >= start + 1);
62564 + reiser4_set_bits(data, end, start + 1);
62565 + break;
62566 + }
62567 +
62568 + if (end <= end_offset)
62569 + /* left search boundary reached. */
62570 + break;
62571 + start = end - 1;
62572 + }
62573 +
62574 + release_and_unlock_bnode(bnode);
62575 + return ret;
62576 +}
62577 +
62578 +/* allocate contiguous range of blocks in bitmap */
62579 +static int bitmap_alloc_forward(reiser4_block_nr * start,
62580 + const reiser4_block_nr * end, int min_len,
62581 + int max_len)
62582 +{
62583 + bmap_nr_t bmap, end_bmap;
62584 + bmap_off_t offset, end_offset;
62585 + int len;
62586 +
62587 + reiser4_block_nr tmp;
62588 +
62589 + struct super_block *super = get_current_context()->super;
62590 + const bmap_off_t max_offset = bmap_bit_count(super->s_blocksize);
62591 +
62592 + parse_blocknr(start, &bmap, &offset);
62593 +
62594 + tmp = *end - 1;
62595 + parse_blocknr(&tmp, &end_bmap, &end_offset);
62596 + ++end_offset;
62597 +
62598 + assert("zam-358", end_bmap >= bmap);
62599 + assert("zam-359", ergo(end_bmap == bmap, end_offset >= offset));
62600 +
62601 + for (; bmap < end_bmap; bmap++, offset = 0) {
62602 + len =
62603 + search_one_bitmap_forward(bmap, &offset, max_offset,
62604 + min_len, max_len);
62605 + if (len != 0)
62606 + goto out;
62607 + }
62608 +
62609 + len =
62610 + search_one_bitmap_forward(bmap, &offset, end_offset, min_len,
62611 + max_len);
62612 + out:
62613 + *start = bmap * max_offset + offset;
62614 + return len;
62615 +}
62616 +
62617 +/* allocate contiguous range of blocks in bitmap (from @start to @end in
62618 + * backward direction) */
62619 +static int bitmap_alloc_backward(reiser4_block_nr * start,
62620 + const reiser4_block_nr * end, int min_len,
62621 + int max_len)
62622 +{
62623 + bmap_nr_t bmap, end_bmap;
62624 + bmap_off_t offset, end_offset;
62625 + int len;
62626 + struct super_block *super = get_current_context()->super;
62627 + const bmap_off_t max_offset = bmap_bit_count(super->s_blocksize);
62628 +
62629 + parse_blocknr(start, &bmap, &offset);
62630 + parse_blocknr(end, &end_bmap, &end_offset);
62631 +
62632 + assert("zam-961", end_bmap <= bmap);
62633 + assert("zam-962", ergo(end_bmap == bmap, end_offset <= offset));
62634 +
62635 + for (; bmap > end_bmap; bmap--, offset = max_offset - 1) {
62636 + len =
62637 + search_one_bitmap_backward(bmap, &offset, 0, min_len,
62638 + max_len);
62639 + if (len != 0)
62640 + goto out;
62641 + }
62642 +
62643 + len =
62644 + search_one_bitmap_backward(bmap, &offset, end_offset, min_len,
62645 + max_len);
62646 + out:
62647 + *start = bmap * max_offset + offset;
62648 + return len;
62649 +}
62650 +
62651 +/* plugin->u.space_allocator.alloc_blocks() */
62652 +static int alloc_blocks_forward(reiser4_blocknr_hint *hint, int needed,
62653 + reiser4_block_nr *start, reiser4_block_nr *len)
62654 +{
62655 + struct super_block *super = get_current_context()->super;
62656 + int actual_len;
62657 +
62658 + reiser4_block_nr search_start;
62659 + reiser4_block_nr search_end;
62660 +
62661 + assert("zam-398", super != NULL);
62662 + assert("zam-412", hint != NULL);
62663 + assert("zam-397", hint->blk <= reiser4_block_count(super));
62664 +
62665 + if (hint->max_dist == 0)
62666 + search_end = reiser4_block_count(super);
62667 + else
62668 + search_end =
62669 + LIMIT(hint->blk + hint->max_dist,
62670 + reiser4_block_count(super));
62671 +
62672 + /* We use @hint->blk as the search start and search from it to the end
62673 + of the disk, or within the given region if @hint->max_dist is not zero */
62674 + search_start = hint->blk;
62675 +
62676 + actual_len =
62677 + bitmap_alloc_forward(&search_start, &search_end, 1, needed);
62678 +
62679 + /* Only one bitmap search pass is done if max_dist was specified or if
62680 + the first pass already started from the beginning of the bitmap; a
62681 + single pass is likewise used when scanning the bitmap backward. */
62682 + if (!(actual_len != 0 || hint->max_dist != 0 || search_start == 0)) {
62683 + /* next step is a scanning from 0 to search_start */
62684 + search_end = search_start;
62685 + search_start = 0;
62686 + actual_len =
62687 + bitmap_alloc_forward(&search_start, &search_end, 1, needed);
62688 + }
62689 + if (actual_len == 0)
62690 + return RETERR(-ENOSPC);
62691 + if (actual_len < 0)
62692 + return RETERR(actual_len);
62693 + *len = actual_len;
62694 + *start = search_start;
62695 + return 0;
62696 +}
62697 +
62698 +static int alloc_blocks_backward(reiser4_blocknr_hint * hint, int needed,
62699 + reiser4_block_nr * start,
62700 + reiser4_block_nr * len)
62701 +{
62702 + reiser4_block_nr search_start;
62703 + reiser4_block_nr search_end;
62704 + int actual_len;
62705 +
62706 + ON_DEBUG(struct super_block *super = reiser4_get_current_sb());
62707 +
62708 + assert("zam-969", super != NULL);
62709 + assert("zam-970", hint != NULL);
62710 + assert("zam-971", hint->blk <= reiser4_block_count(super));
62711 +
62712 + search_start = hint->blk;
62713 + if (hint->max_dist == 0 || search_start <= hint->max_dist)
62714 + search_end = 0;
62715 + else
62716 + search_end = search_start - hint->max_dist;
62717 +
62718 + actual_len =
62719 + bitmap_alloc_backward(&search_start, &search_end, 1, needed);
62720 + if (actual_len == 0)
62721 + return RETERR(-ENOSPC);
62722 + if (actual_len < 0)
62723 + return RETERR(actual_len);
62724 + *len = actual_len;
62725 + *start = search_start;
62726 + return 0;
62727 +}
62728 +
62729 +/* plugin->u.space_allocator.alloc_blocks() */
62730 +int
62731 +alloc_blocks_bitmap(reiser4_space_allocator * allocator UNUSED_ARG,
62732 + reiser4_blocknr_hint * hint, int needed,
62733 + reiser4_block_nr * start, reiser4_block_nr * len)
62734 +{
62735 + if (hint->backward)
62736 + return alloc_blocks_backward(hint, needed, start, len);
62737 + return alloc_blocks_forward(hint, needed, start, len);
62738 +}
62739 +
62740 +/* plugin->u.space_allocator.dealloc_blocks(). */
62741 +/* It just frees blocks in the WORKING BITMAP. Usually formatted and unformatted
62742 + node deletion is deferred until transaction commit. However, deallocation
62743 + of temporary objects like wandered blocks and transaction commit records
62744 + requires immediate node deletion from WORKING BITMAP.*/
62745 +void
62746 +dealloc_blocks_bitmap(reiser4_space_allocator * allocator UNUSED_ARG,
62747 + reiser4_block_nr start, reiser4_block_nr len)
62748 +{
62749 + struct super_block *super = reiser4_get_current_sb();
62750 +
62751 + bmap_nr_t bmap;
62752 + bmap_off_t offset;
62753 +
62754 + struct bitmap_node *bnode;
62755 + int ret;
62756 +
62757 + assert("zam-468", len != 0);
62758 + check_block_range(&start, &len);
62759 +
62760 + parse_blocknr(&start, &bmap, &offset);
62761 +
62762 + assert("zam-469", offset + len <= bmap_bit_count(super->s_blocksize));
62763 +
62764 + bnode = get_bnode(super, bmap);
62765 +
62766 + assert("zam-470", bnode != NULL);
62767 +
62768 + ret = load_and_lock_bnode(bnode);
62769 + assert("zam-481", ret == 0);
62770 +
62771 + reiser4_clear_bits(bnode_working_data(bnode), offset,
62772 + (bmap_off_t) (offset + len));
62773 +
62774 + adjust_first_zero_bit(bnode, offset);
62775 +
62776 + release_and_unlock_bnode(bnode);
62777 +}
62778 +
62779 +/* plugin->u.space_allocator.check_blocks(). */
62780 +void
62781 +check_blocks_bitmap(const reiser4_block_nr * start,
62782 + const reiser4_block_nr * len, int desired)
62783 +{
62784 +#if REISER4_DEBUG
62785 + struct super_block *super = reiser4_get_current_sb();
62786 +
62787 + bmap_nr_t bmap;
62788 + bmap_off_t start_offset;
62789 + bmap_off_t end_offset;
62790 +
62791 + struct bitmap_node *bnode;
62792 + int ret;
62793 +
62794 + assert("zam-622", len != NULL);
62795 + check_block_range(start, len);
62796 + parse_blocknr(start, &bmap, &start_offset);
62797 +
62798 + end_offset = start_offset + *len;
62799 + assert("nikita-2214", end_offset <= bmap_bit_count(super->s_blocksize));
62800 +
62801 + bnode = get_bnode(super, bmap);
62802 +
62803 + assert("nikita-2215", bnode != NULL);
62804 +
62805 + ret = load_and_lock_bnode(bnode);
62806 + assert("zam-626", ret == 0);
62807 +
62808 + assert("nikita-2216", jnode_is_loaded(bnode->wjnode));
62809 +
62810 + if (desired) {
62811 + assert("zam-623",
62812 + reiser4_find_next_zero_bit(bnode_working_data(bnode),
62813 + end_offset, start_offset)
62814 + >= end_offset);
62815 + } else {
62816 + assert("zam-624",
62817 + reiser4_find_next_set_bit(bnode_working_data(bnode),
62818 + end_offset, start_offset)
62819 + >= end_offset);
62820 + }
62821 +
62822 + release_and_unlock_bnode(bnode);
62823 +#endif
62824 +}
62825 +
62826 +/* conditionally insert @node into the atom's overwrite set if it is not already there */
62827 +static void cond_add_to_overwrite_set(txn_atom * atom, jnode * node)
62828 +{
62829 + assert("zam-546", atom != NULL);
62830 + assert("zam-547", atom->stage == ASTAGE_PRE_COMMIT);
62831 + assert("zam-548", node != NULL);
62832 +
62833 + spin_lock_atom(atom);
62834 + spin_lock_jnode(node);
62835 +
62836 + if (node->atom == NULL) {
62837 + JF_SET(node, JNODE_OVRWR);
62838 + insert_into_atom_ovrwr_list(atom, node);
62839 + } else {
62840 + assert("zam-549", node->atom == atom);
62841 + }
62842 +
62843 + spin_unlock_jnode(node);
62844 + spin_unlock_atom(atom);
62845 +}
62846 +
62847 +/* an actor which applies the delete set to COMMIT bitmap pages and links
62848 + modified pages into a singly linked list */
62849 +static int
62850 +apply_dset_to_commit_bmap(txn_atom * atom, const reiser4_block_nr * start,
62851 + const reiser4_block_nr * len, void *data)
62852 +{
62853 +
62854 + bmap_nr_t bmap;
62855 + bmap_off_t offset;
62856 + int ret;
62857 +
62858 + long long *blocks_freed_p = data;
62859 +
62860 + struct bitmap_node *bnode;
62861 +
62862 + struct super_block *sb = reiser4_get_current_sb();
62863 +
62864 + check_block_range(start, len);
62865 +
62866 + parse_blocknr(start, &bmap, &offset);
62867 +
62868 + /* FIXME-ZAM: we assume that all block ranges are allocated by this
62869 + bitmap-based allocator and each block range can't go over a zone of
62870 + responsibility of one bitmap block; same assumption is used in
62871 + other journal hooks in bitmap code. */
62872 + bnode = get_bnode(sb, bmap);
62873 + assert("zam-448", bnode != NULL);
62874 +
62875 + /* it is safe to unlock an atom which is in ASTAGE_PRE_COMMIT */
62876 + assert("zam-767", atom->stage == ASTAGE_PRE_COMMIT);
62877 + ret = load_and_lock_bnode(bnode);
62878 + if (ret)
62879 + return ret;
62880 +
62881 + /* put bnode into atom's overwrite set */
62882 + cond_add_to_overwrite_set(atom, bnode->cjnode);
62883 +
62884 + data = bnode_commit_data(bnode);
62885 +
62886 + ret = bnode_check_crc(bnode);
62887 + if (ret != 0)
62888 + return ret;
62889 +
62890 + if (len != NULL) {
62891 + /* FIXME-ZAM: a check that all bits are set should be there */
62892 + assert("zam-443",
62893 + offset + *len <= bmap_bit_count(sb->s_blocksize));
62894 + reiser4_clear_bits(data, offset, (bmap_off_t) (offset + *len));
62895 +
62896 + (*blocks_freed_p) += *len;
62897 + } else {
62898 + reiser4_clear_bit(offset, data);
62899 + (*blocks_freed_p)++;
62900 + }
62901 +
62902 + bnode_set_commit_crc(bnode, bnode_calc_crc(bnode, sb->s_blocksize));
62903 +
62904 + release_and_unlock_bnode(bnode);
62905 +
62906 + return 0;
62907 +}
62908 +
62909 +/* plugin->u.space_allocator.pre_commit_hook(). */
62910 +/* It just applies transaction changes to fs-wide COMMIT BITMAP, hoping the
62911 + rest is done by the transaction manager (allocating wandered locations for
62912 + COMMIT BITMAP blocks and copying COMMIT BITMAP block data). */
62913 +/* Only one instance of this function can be running at any given time, because
62914 + only one transaction can be committed at a time; therefore it is safe to
62915 + access some global variables without any locking */
62916 +
62917 +int pre_commit_hook_bitmap(void)
62918 +{
62919 + struct super_block *super = reiser4_get_current_sb();
62920 + txn_atom *atom;
62921 +
62922 + long long blocks_freed = 0;
62923 +
62924 + atom = get_current_atom_locked();
62925 + assert("zam-876", atom->stage == ASTAGE_PRE_COMMIT);
62926 + spin_unlock_atom(atom);
62927 +
62928 + { /* scan atom's captured list and find all freshly allocated nodes,
62929 + * mark the corresponding bits in the COMMIT BITMAP as used */
62930 + struct list_head *head = ATOM_CLEAN_LIST(atom);
62931 + jnode *node = list_entry(head->next, jnode, capture_link);
62932 +
62933 + while (head != &node->capture_link) {
62934 + /* we detect freshly allocated jnodes */
62935 + if (JF_ISSET(node, JNODE_RELOC)) {
62936 + int ret;
62937 + bmap_nr_t bmap;
62938 +
62939 + bmap_off_t offset;
62940 + bmap_off_t index;
62941 + struct bitmap_node *bn;
62942 + __u32 size = bmap_size(super->s_blocksize);
62943 + __u32 crc;
62944 + char byte;
62945 +
62946 + assert("zam-559", !JF_ISSET(node, JNODE_OVRWR));
62947 + assert("zam-460",
62948 + !blocknr_is_fake(&node->blocknr));
62949 +
62950 + parse_blocknr(&node->blocknr, &bmap, &offset);
62951 + bn = get_bnode(super, bmap);
62952 +
62953 + index = offset >> 3;
62954 + assert("vpf-276", index < size);
62955 +
62956 + ret = bnode_check_crc(bn);
62957 + if (ret != 0)
62958 + return ret;
62959 +
62960 + check_bnode_loaded(bn);
62961 + load_and_lock_bnode(bn);
62962 +
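+ /* update the commit-side checksum incrementally: remember the
+ * old byte, set the bit, then fold the old/new byte pair into
+ * the stored adler32 instead of re-checksumming the whole block */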
62963 + byte = *(bnode_commit_data(bn) + index);
62964 + reiser4_set_bit(offset, bnode_commit_data(bn));
62965 +
62966 + crc = adler32_recalc(bnode_commit_crc(bn), byte,
62967 + *(bnode_commit_data(bn) +
62968 + index),
62969 + size - index);
62970 + bnode_set_commit_crc(bn, crc);
62971 +
62972 + release_and_unlock_bnode(bn);
62973 +
62974 + ret = bnode_check_crc(bn);
62975 + if (ret != 0)
62976 + return ret;
62977 +
62978 + /* the correctness of this depends on how the new
62979 + jnode is inserted into the clean list, because we
62980 + are scanning that same list now. It is OK if
62981 + insertion is done at the list front */
62982 + cond_add_to_overwrite_set(atom, bn->cjnode);
62983 + }
62984 +
62985 + node = list_entry(node->capture_link.next, jnode, capture_link);
62986 + }
62987 + }
62988 +
62989 + blocknr_set_iterator(atom, &atom->delete_set, apply_dset_to_commit_bmap,
62990 + &blocks_freed, 0);
62991 +
62992 + blocks_freed -= atom->nr_blocks_allocated;
62993 +
62994 + {
62995 + reiser4_super_info_data *sbinfo;
62996 +
62997 + sbinfo = get_super_private(super);
62998 +
62999 + spin_lock_reiser4_super(sbinfo);
63000 + sbinfo->blocks_free_committed += blocks_freed;
63001 + spin_unlock_reiser4_super(sbinfo);
63002 + }
63003 +
63004 + return 0;
63005 +}
63006 +
63007 +/* plugin->u.space_allocator.init_allocator
63008 + constructor of reiser4_space_allocator object. It is called on fs mount */
63009 +int
63010 +init_allocator_bitmap(reiser4_space_allocator * allocator,
63011 + struct super_block *super, void *arg UNUSED_ARG)
63012 +{
63013 + struct bitmap_allocator_data *data = NULL;
63014 + bmap_nr_t bitmap_blocks_nr;
63015 + bmap_nr_t i;
63016 +
63017 + assert("nikita-3039", schedulable());
63018 +
63019 + /* getting memory for bitmap allocator private data holder */
63020 + data =
63021 + kmalloc(sizeof(struct bitmap_allocator_data), GFP_KERNEL);
63022 +
63023 + if (data == NULL)
63024 + return RETERR(-ENOMEM);
63025 +
63026 + /* allocation and initialization for the array of bnodes */
63027 + bitmap_blocks_nr = get_nr_bmap(super);
63028 +
63029 + /* FIXME-ZAM: it is not clear what to do with a huge number of bitmaps,
63030 + bigger than 2^32 (= 8 * 4096 * 4096 * 2^32 bytes = 5.76e+17,
63031 + may I never meet someone who still uses the ia32 architecture when
63032 + storage devices of that size enter the market, and wants to use ia32
63033 + with that storage device, much less reiser4. ;-) -Hans). Kmalloc is not possible and,
63034 + probably, another dynamic data structure should replace a static
63035 + array of bnodes. */
63036 + /*data->bitmap = reiser4_kmalloc((size_t) (sizeof (struct bitmap_node) * bitmap_blocks_nr), GFP_KERNEL); */
63037 + data->bitmap = vmalloc(sizeof(struct bitmap_node) * bitmap_blocks_nr);
63038 + if (data->bitmap == NULL) {
63039 + kfree(data);
63040 + return RETERR(-ENOMEM);
63041 + }
63042 +
63043 + for (i = 0; i < bitmap_blocks_nr; i++)
63044 + init_bnode(data->bitmap + i, super, i);
63045 +
63046 + allocator->u.generic = data;
63047 +
63048 +#if REISER4_DEBUG
63049 + get_super_private(super)->min_blocks_used += bitmap_blocks_nr;
63050 +#endif
63051 +
63052 + /* Load all bitmap blocks at mount time. */
63053 + if (!test_bit
63054 + (REISER4_DONT_LOAD_BITMAP, &get_super_private(super)->fs_flags)) {
63055 + __u64 start_time, elapsed_time;
63056 + struct bitmap_node *bnode;
63057 + int ret;
63058 +
63059 + if (REISER4_DEBUG)
63060 + printk(KERN_INFO "loading reiser4 bitmap...");
63061 + start_time = jiffies;
63062 +
63063 + for (i = 0; i < bitmap_blocks_nr; i++) {
63064 + bnode = data->bitmap + i;
63065 + ret = load_and_lock_bnode(bnode);
63066 + if (ret) {
63067 + destroy_allocator_bitmap(allocator, super);
63068 + return ret;
63069 + }
63070 + release_and_unlock_bnode(bnode);
63071 + }
63072 +
63073 + elapsed_time = jiffies - start_time;
63074 + if (REISER4_DEBUG)
63075 + printk("...done (%llu jiffies)\n",
63076 + (unsigned long long)elapsed_time);
63077 + }
63078 +
63079 + return 0;
63080 +}
63081 +
63082 +/* plugin->u.space_allocator.destroy_allocator
63083 + destructor. It is called on fs unmount */
63084 +int
63085 +destroy_allocator_bitmap(reiser4_space_allocator * allocator,
63086 + struct super_block *super)
63087 +{
63088 + bmap_nr_t bitmap_blocks_nr;
63089 + bmap_nr_t i;
63090 +
63091 + struct bitmap_allocator_data *data = allocator->u.generic;
63092 +
63093 + assert("zam-414", data != NULL);
63094 + assert("zam-376", data->bitmap != NULL);
63095 +
63096 + bitmap_blocks_nr = get_nr_bmap(super);
63097 +
63098 + for (i = 0; i < bitmap_blocks_nr; i++) {
63099 + struct bitmap_node *bnode = data->bitmap + i;
63100 +
63101 + down(&bnode->sema);
63102 +
63103 +#if REISER4_DEBUG
63104 + if (atomic_read(&bnode->loaded)) {
63105 + jnode *wj = bnode->wjnode;
63106 + jnode *cj = bnode->cjnode;
63107 +
63108 + assert("zam-480", jnode_page(cj) != NULL);
63109 + assert("zam-633", jnode_page(wj) != NULL);
63110 +
63111 + assert("zam-634",
63112 + memcmp(jdata(wj), jdata(cj),
63113 + bmap_size(super->s_blocksize)) == 0);
63114 +
63115 + }
63116 +#endif
63117 + done_bnode(bnode);
63118 + up(&bnode->sema);
63119 + }
63120 +
63121 + vfree(data->bitmap);
63122 + kfree(data);
63123 +
63124 + allocator->u.generic = NULL;
63125 +
63126 + return 0;
63127 +}
63128 +
63129 +/*
63130 + Local variables:
63131 + c-indentation-style: "K&R"
63132 + mode-name: "LC"
63133 + c-basic-offset: 8
63134 + tab-width: 8
63135 + fill-column: 80
63136 + scroll-step: 1
63137 + End:
63138 +*/
63139 Index: linux-2.6.16/fs/reiser4/plugin/space/bitmap.h
63140 ===================================================================
63141 --- /dev/null
63142 +++ linux-2.6.16/fs/reiser4/plugin/space/bitmap.h
63143 @@ -0,0 +1,47 @@
63144 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
63145 +
63146 +#if !defined (__REISER4_PLUGIN_SPACE_BITMAP_H__)
63147 +#define __REISER4_PLUGIN_SPACE_BITMAP_H__
63148 +
63149 +#include "../../dformat.h"
63150 +#include "../../block_alloc.h"
63151 +
63152 +#include <linux/types.h> /* for __u?? */
63153 +#include <linux/fs.h> /* for struct super_block */
63154 +/* EDWARD-FIXME-HANS: write something as informative as the below for every .h file lacking it. */
63155 +/* declarations of functions implementing methods of space allocator plugin for
63156 + bitmap based allocator. The functions themselves are in bitmap.c */
63157 +extern int init_allocator_bitmap(reiser4_space_allocator *,
63158 + struct super_block *, void *);
63159 +extern int destroy_allocator_bitmap(reiser4_space_allocator *,
63160 + struct super_block *);
63161 +extern int alloc_blocks_bitmap(reiser4_space_allocator *,
63162 + reiser4_blocknr_hint *, int needed,
63163 + reiser4_block_nr * start,
63164 + reiser4_block_nr * len);
63165 +extern void check_blocks_bitmap(const reiser4_block_nr *,
63166 + const reiser4_block_nr *, int);
63167 +
63168 +extern void dealloc_blocks_bitmap(reiser4_space_allocator *, reiser4_block_nr,
63169 + reiser4_block_nr);
63170 +extern int pre_commit_hook_bitmap(void);
63171 +
63172 +#define post_commit_hook_bitmap() do { } while (0)
63173 +#define post_write_back_hook_bitmap() do { } while (0)
63174 +#define print_info_bitmap(pref, al) do { } while (0)
63175 +
63176 +typedef __u64 bmap_nr_t;
63177 +typedef __u32 bmap_off_t;
63178 +
63179 +#endif /* __REISER4_PLUGIN_SPACE_BITMAP_H__ */
63180 +
63181 +/* Make Linus happy.
63182 + Local variables:
63183 + c-indentation-style: "K&R"
63184 + mode-name: "LC"
63185 + c-basic-offset: 8
63186 + tab-width: 8
63187 + fill-column: 120
63188 + scroll-step: 1
63189 + End:
63190 +*/
63191 Index: linux-2.6.16/fs/reiser4/plugin/space/space_allocator.h
63192 ===================================================================
63193 --- /dev/null
63194 +++ linux-2.6.16/fs/reiser4/plugin/space/space_allocator.h
63195 @@ -0,0 +1,80 @@
63196 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
63197 +
63198 +#ifndef __SPACE_ALLOCATOR_H__
63199 +#define __SPACE_ALLOCATOR_H__
63200 +
63201 +#include "../../forward.h"
63202 +#include "bitmap.h"
63203 +/* NIKITA-FIXME-HANS: surely this could use a comment. Something about how bitmap is the only space allocator for now,
63204 + * but... */
63205 +#define DEF_SPACE_ALLOCATOR(allocator) \
63206 + \
63207 +static inline int sa_init_allocator (reiser4_space_allocator * al, struct super_block *s, void * opaque) \
63208 +{ \
63209 + return init_allocator_##allocator (al, s, opaque); \
63210 +} \
63211 + \
63212 +static inline void sa_destroy_allocator (reiser4_space_allocator *al, struct super_block *s) \
63213 +{ \
63214 + destroy_allocator_##allocator (al, s); \
63215 +} \
63216 + \
63217 +static inline int sa_alloc_blocks (reiser4_space_allocator *al, reiser4_blocknr_hint * hint, \
63218 + int needed, reiser4_block_nr * start, reiser4_block_nr * len) \
63219 +{ \
63220 + return alloc_blocks_##allocator (al, hint, needed, start, len); \
63221 +} \
63222 +static inline void sa_dealloc_blocks (reiser4_space_allocator * al, reiser4_block_nr start, reiser4_block_nr len) \
63223 +{ \
63224 + dealloc_blocks_##allocator (al, start, len); \
63225 +} \
63226 + \
63227 +static inline void sa_check_blocks (const reiser4_block_nr * start, const reiser4_block_nr * end, int desired) \
63228 +{ \
63229 + check_blocks_##allocator (start, end, desired); \
63230 +} \
63231 + \
63232 +static inline void sa_pre_commit_hook (void) \
63233 +{ \
63234 + pre_commit_hook_##allocator (); \
63235 +} \
63236 + \
63237 +static inline void sa_post_commit_hook (void) \
63238 +{ \
63239 + post_commit_hook_##allocator (); \
63240 +} \
63241 + \
63242 +static inline void sa_post_write_back_hook (void) \
63243 +{ \
63244 + post_write_back_hook_##allocator(); \
63245 +} \
63246 + \
63247 +static inline void sa_print_info(const char * prefix, reiser4_space_allocator * al) \
63248 +{ \
63249 + print_info_##allocator (prefix, al); \
63250 +}
63251 +
63252 +DEF_SPACE_ALLOCATOR(bitmap)
63253 +
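+/* For reference, with "bitmap" substituted the first wrapper above expands
+ * (reconstructed by hand, modulo whitespace) to:
+ *
+ *	static inline int sa_init_allocator(reiser4_space_allocator *al,
+ *					    struct super_block *s, void *opaque)
+ *	{
+ *		return init_allocator_bitmap(al, s, opaque);
+ *	}
+ */
+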
63254 +/* this object is part of reiser4 private in-core super block */
63255 +struct reiser4_space_allocator {
63256 + union {
63257 + /* space allocators might use this pointer to reference their
63258 + * data. */
63259 + void *generic;
63260 + } u;
63261 +};
63262 +
63263 +/* __SPACE_ALLOCATOR_H__ */
63264 +#endif
63265 +
63266 +/* Make Linus happy.
63267 + Local variables:
63268 + c-indentation-style: "K&R"
63269 + mode-name: "LC"
63270 + c-basic-offset: 8
63271 + tab-width: 8
63272 + fill-column: 120
63273 + scroll-step: 1
63274 + End:
63275 +*/
63276 Index: linux-2.6.16/fs/reiser4/plugin/tail_policy.c
63277 ===================================================================
63278 --- /dev/null
63279 +++ linux-2.6.16/fs/reiser4/plugin/tail_policy.c
63280 @@ -0,0 +1,113 @@
63281 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
63282 + * reiser4/README */
63283 +
63284 +/* Formatting policy plugins */
63285 +
63286 +/*
63287 + * A formatting policy plugin is used by the object plugin (of a regular
63288 + * file) to convert a file between two representations.
63289 + *
63290 + * Currently the following policies are implemented:
63291 + * never store file in formatted nodes
63292 + * always store file in formatted nodes
63293 + * store file in formatted nodes if file is smaller than 4 blocks (default)
63294 + */
63295 +
63296 +#include "../tree.h"
63297 +#include "../inode.h"
63298 +#include "../super.h"
63299 +#include "object.h"
63300 +#include "plugin.h"
63301 +#include "node/node.h"
63302 +#include "plugin_header.h"
63303 +
63304 +#include <linux/pagemap.h>
63305 +#include <linux/fs.h> /* For struct inode */
63306 +
63307 +/**
63308 + * have_formatting_never - formatting policy that never uses tail items
63309 + * @inode: inode to operate on
63310 + * @size: new object size
63311 + *
63312 + * Always returns 0: the file body is never stored in direct (tail) items.
63313 + */
63314 +/* Never store file's tail as direct item */
63315 +/* Audited by: green(2002.06.12) */
63316 +static int have_formatting_never(const struct inode *inode UNUSED_ARG
63317 + /* inode to operate on */ ,
63318 + loff_t size UNUSED_ARG /* new object size */ )
63319 +{
63320 + return 0;
63321 +}
63322 +
63323 +/* Always store file's tail as direct item */
63324 +/* Audited by: green(2002.06.12) */
63325 +static int
63326 +have_formatting_always(const struct inode *inode UNUSED_ARG
63327 + /* inode to operate on */ ,
63328 + loff_t size UNUSED_ARG /* new object size */ )
63329 +{
63330 + return 1;
63331 +}
63332 +
63333 +/* This function tests whether the file denoted by @inode should be stored
63334 + as tails only or as extents only. */
63335 +static int
63336 +have_formatting_default(const struct inode *inode UNUSED_ARG
63337 + /* inode to operate on */ ,
63338 + loff_t size /* new object size */ )
63339 +{
63340 + assert("umka-1253", inode != NULL);
63341 +
63342 + if (size > inode->i_sb->s_blocksize * 4)
63343 + return 0;
63344 +
63345 + return 1;
63346 +}
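+
+/* Example: with a 4096-byte block size, have_formatting_default() above
+ * keeps files of up to 4 * 4096 = 16384 bytes in formatted (tail) items
+ * and switches anything larger to extents. */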
63347 +
63348 +/* tail plugins */
63349 +formatting_plugin formatting_plugins[LAST_TAIL_FORMATTING_ID] = {
63350 + [NEVER_TAILS_FORMATTING_ID] = {
63351 + .h = {
63352 + .type_id = REISER4_FORMATTING_PLUGIN_TYPE,
63353 + .id = NEVER_TAILS_FORMATTING_ID,
63354 + .pops = NULL,
63355 + .label = "never",
63356 + .desc = "Never store file's tail",
63357 + .linkage = {NULL, NULL}
63358 + },
63359 + .have_tail = have_formatting_never
63360 + },
63361 + [ALWAYS_TAILS_FORMATTING_ID] = {
63362 + .h = {
63363 + .type_id = REISER4_FORMATTING_PLUGIN_TYPE,
63364 + .id = ALWAYS_TAILS_FORMATTING_ID,
63365 + .pops = NULL,
63366 + .label = "always",
63367 + .desc = "Always store file's tail",
63368 + .linkage = {NULL, NULL}
63369 + },
63370 + .have_tail = have_formatting_always
63371 + },
63372 + [SMALL_FILE_FORMATTING_ID] = {
63373 + .h = {
63374 + .type_id = REISER4_FORMATTING_PLUGIN_TYPE,
63375 + .id = SMALL_FILE_FORMATTING_ID,
63376 + .pops = NULL,
63377 + .label = "4blocks",
63378 + .desc = "Store files shorter than 4 blocks in tail items",
63379 + .linkage = {NULL, NULL}
63380 + },
63381 + .have_tail = have_formatting_default
63382 + }
63383 +};
63384 +
63385 +/*
63386 + * Local variables:
63387 + * c-indentation-style: "K&R"
63388 + * mode-name: "LC"
63389 + * c-basic-offset: 8
63390 + * tab-width: 8
63391 + * fill-column: 79
63392 + * End:
63393 + */
63394 Index: linux-2.6.16/fs/reiser4/pool.c
63395 ===================================================================
63396 --- /dev/null
63397 +++ linux-2.6.16/fs/reiser4/pool.c
63398 @@ -0,0 +1,236 @@
63399 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
63400 + * reiser4/README */
63401 +
63402 +/* Fast pool allocation.
63403 +
63404 + There are situations when some sub-system normally asks the memory
63405 + allocator for only a few objects, but under some circumstances could
63406 + require many more. The typical and actually motivating example is tree
63407 + balancing. It needs to keep track of the nodes involved, and it is well
63408 + known that in a reasonably packed balanced tree most (92.938121%) of all
63409 + balancings end up touching only a few nodes (3.141592 on
63410 + average). But in rare cases balancing can involve many more nodes
63411 + (3*tree_height+1 in the extremal situation).
63412 +
63413 + On the one hand, we don't want to resort to dynamic allocation (slab,
63414 + malloc(), etc.) to allocate data structures required to keep track of
63415 + nodes during balancing. On the other hand, we cannot statically allocate
63416 + the required amount of space on the stack, because, first, it is a useless
63417 + waste of a precious resource and, second, this amount is unknown in advance
63418 + (tree height can change).
63419 +
63420 + Pools, implemented in this file, are a solution to this problem:
63421 +
63422 + - a configurable number of objects is statically preallocated on the
63423 + stack
63424 +
63425 + - if this preallocated pool is exhausted and more objects are requested,
63426 + they are allocated dynamically.
63427 +
63428 + Pools encapsulate the distinction between statically and dynamically
63429 + allocated objects. Both allocation and recycling look exactly the same.
63430 +
63431 + To keep track of dynamically allocated objects, the pool adds its own
63432 + linkage to each object.
63433 +
63434 + NOTE-NIKITA This linkage also contains some balancing-specific data. This
63435 + is not perfect. On the other hand, balancing is currently the only client
63436 + of pool code.
63437 +
63438 + NOTE-NIKITA Another desirable feature is to rewrite all pool manipulation
63439 + functions in the style of tslist/tshash, i.e., make them unreadable, but
63440 + type-safe.
63441 +
63442 +
63443 +*/
63444 +
63445 +#include "debug.h"
63446 +#include "pool.h"
63447 +#include "super.h"
63448 +
63449 +#include <linux/types.h>
63450 +#include <linux/err.h>
63451 +
63452 +/* initialize new pool object */
63453 +static void reiser4_init_pool_obj(reiser4_pool_header * h /* pool object to
63454 + * initialize */ )
63455 +{
63456 + INIT_LIST_HEAD(&h->usage_linkage);
63457 + INIT_LIST_HEAD(&h->level_linkage);
63458 + INIT_LIST_HEAD(&h->extra_linkage);
63459 +}
63460 +
63461 +/* initialize new pool */
63462 +void reiser4_init_pool(reiser4_pool * pool /* pool to initialize */ ,
63463 + size_t obj_size /* size of objects in @pool */ ,
63464 + int num_of_objs /* number of preallocated objects */ ,
63465 + char *data /* area for preallocated objects */ )
63466 +{
63467 + reiser4_pool_header *h;
63468 + int i;
63469 +
63470 + assert("nikita-955", pool != NULL);
63471 + assert("nikita-1044", obj_size > 0);
63472 + assert("nikita-956", num_of_objs >= 0);
63473 + assert("nikita-957", data != NULL);
63474 +
63475 + memset(pool, 0, sizeof *pool);
63476 + pool->obj_size = obj_size;
63477 + pool->data = data;
63478 + INIT_LIST_HEAD(&pool->free);
63479 + INIT_LIST_HEAD(&pool->used);
63480 + INIT_LIST_HEAD(&pool->extra);
63481 + memset(data, 0, obj_size * num_of_objs);
63482 + for (i = 0; i < num_of_objs; ++i) {
63483 + h = (reiser4_pool_header *) (data + i * obj_size);
63484 + reiser4_init_pool_obj(h);
63485 + /* add pool header to the end of pool's free list */
63486 + list_add_tail(&h->usage_linkage, &pool->free);
63487 + }
63488 +}
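+
+/* A hedged usage sketch (hypothetical caller, mirroring the way carry uses
+ * pools): embed reiser4_pool_header at the start of the object type
+ * (reiser4_pool_alloc() zeroes everything after the header, so the header
+ * must come first), preallocate a few objects in automatic storage, and
+ * let the pool fall back to kmalloc() only when that area is exhausted.
+ *
+ *	struct my_obj {
+ *		reiser4_pool_header header;	// linkage must come first
+ *		int payload;
+ *	};
+ *
+ *	reiser4_pool pool;
+ *	char area[sizeof(struct my_obj) * 5];	// room for 5 objects
+ *
+ *	reiser4_init_pool(&pool, sizeof(struct my_obj), 5, area);
+ *	// objects are then handed out via add_obj() and returned with
+ *	// reiser4_pool_free()
+ */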
63489 +
63490 +/* release pool resources
63491 +
63492 + Release all resources acquired by this pool, specifically, dynamically
63493 + allocated objects.
63494 +
63495 +*/
63496 +void reiser4_done_pool(reiser4_pool * pool UNUSED_ARG /* pool to destroy */ )
63497 +{
63498 +}
63499 +
63500 +/* allocate carry object from pool
63501 +
63502 + First, try to get preallocated object. If this fails, resort to dynamic
63503 + allocation.
63504 +
63505 +*/
63506 +static void *reiser4_pool_alloc(reiser4_pool * pool /* pool to allocate object
63507 + * from */ )
63508 +{
63509 + reiser4_pool_header *result;
63510 +
63511 + assert("nikita-959", pool != NULL);
63512 +
63513 + if (!list_empty(&pool->free)) {
63514 + struct list_head *linkage;
63515 +
63516 + linkage = pool->free.next;
63517 + list_del(linkage);
63518 + INIT_LIST_HEAD(linkage);
63519 + result = list_entry(linkage, reiser4_pool_header, usage_linkage);
63520 + BUG_ON(!list_empty(&result->level_linkage) ||
63521 + !list_empty(&result->extra_linkage));
63522 + } else {
63523 + /* pool is empty. Extra allocations don't deserve a dedicated
63524 + slab to be served from, as they are expected to be rare. */
63525 + result = kmalloc(pool->obj_size, get_gfp_mask());
63526 + if (result != NULL) {
63527 + reiser4_init_pool_obj(result);
63528 + list_add(&result->extra_linkage, &pool->extra);
63529 + } else
63530 + return ERR_PTR(RETERR(-ENOMEM));
63531 + BUG_ON(!list_empty(&result->usage_linkage) ||
63532 + !list_empty(&result->level_linkage));
63533 + }
63534 + ++pool->objs;
63535 + list_add(&result->usage_linkage, &pool->used);
63536 + memset(result + 1, 0, pool->obj_size - sizeof *result);
63537 + return result;
63538 +}
63539 +
63540 +/* return object back to the pool */
63541 +void reiser4_pool_free(reiser4_pool * pool /* pool to return object back
63542 + * into */ , reiser4_pool_header * h /* object to free */ )
63543 +{
63544 + assert("nikita-961", h != NULL);
63545 + assert("nikita-962", pool != NULL);
63546 +
63547 + --pool->objs;
63548 + assert("nikita-963", pool->objs >= 0);
63549 +
63550 + list_del_init(&h->usage_linkage);
63551 + list_del_init(&h->level_linkage);
63552 +
63553 + if (list_empty(&h->extra_linkage))
63554 + /*
63555 + * pool header is not an extra one. Push it onto free list
63556 + * using usage_linkage
63557 + */
63558 + list_add(&h->usage_linkage, &pool->free);
63559 + else {
63560 + /* remove pool header from pool's extra list and kfree it */
63561 + list_del(&h->extra_linkage);
63562 + kfree(h);
63563 + }
63564 +}
63565 +
63566 +/* add new object to the carry level list
63567 +
63568 + The carry level is FIFO most of the time, but not always. Complications
63569 + arise when the make_space() function tries to go to the left neighbor and
63570 + thus adds a carry node before existing nodes, and also, when updating
63571 + delimiting keys after moving data between two nodes, we want the left node
63572 + to be locked before the right node.
63573 +
63574 + The latter case is confusing at first glance. The problem is that the
63575 + COP_UPDATE operation that updates delimiting keys is sometimes called with
63576 + two nodes (when data are moved between two nodes) and sometimes with only
63577 + one node (when the leftmost item is deleted in a node). In any case the
63578 + operation is supplied with at least the node whose left delimiting key is
63579 + to be updated (that is, the "right" node).
63580 +
63581 +*/
63582 +reiser4_pool_header *add_obj(reiser4_pool * pool /* pool from which to
63583 + * allocate new object */ ,
63584 + struct list_head *list, /* list where to add
63585 + * object */
63586 + pool_ordering order /* where to add */ ,
63587 + reiser4_pool_header * reference /* after (or
63588 + * before) which
63589 + * existing
63590 + * object to
63591 + * add */ )
63592 +{
63593 + reiser4_pool_header *result;
63594 +
63595 + assert("nikita-972", pool != NULL);
63596 +
63597 + result = reiser4_pool_alloc(pool);
63598 + if (IS_ERR(result))
63599 + return result;
63600 +
63601 + assert("nikita-973", result != NULL);
63602 +
63603 + switch (order) {
63604 + case POOLO_BEFORE:
63605 + __list_add(&result->level_linkage,
63606 + reference->level_linkage.prev,
63607 + &reference->level_linkage);
63608 + break;
63609 + case POOLO_AFTER:
63610 + __list_add(&result->level_linkage,
63611 + &reference->level_linkage,
63612 + reference->level_linkage.next);
63613 + break;
63614 + case POOLO_LAST:
63615 + list_add_tail(&result->level_linkage, list);
63616 + break;
63617 + case POOLO_FIRST:
63618 + list_add(&result->level_linkage, list);
63619 + break;
63620 + default:
63621 + wrong_return_value("nikita-927", "order");
63622 + }
63623 + return result;
63624 +}
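+
+/* e.g. add_obj(pool, &level->nodes, POOLO_AFTER, ref) (a hypothetical
+ * list and reference) links the new object immediately after @ref on the
+ * level list; POOLO_FIRST and POOLO_LAST ignore @reference and use the
+ * head or tail of @list instead. */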
63625 +
63626 +/* Make Linus happy.
63627 + Local variables:
63628 + c-indentation-style: "K&R"
63629 + mode-name: "LC"
63630 + c-basic-offset: 8
63631 + tab-width: 8
63632 + fill-column: 120
63633 + End:
63634 +*/
63635 Index: linux-2.6.16/fs/reiser4/pool.h
63636 ===================================================================
63637 --- /dev/null
63638 +++ linux-2.6.16/fs/reiser4/pool.h
63639 @@ -0,0 +1,54 @@
63640 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
63641 +
63642 +/* Fast pool allocation */
63643 +
63644 +#ifndef __REISER4_POOL_H__
63645 +#define __REISER4_POOL_H__
63646 +
63647 +#include <linux/types.h>
63648 +
63649 +typedef struct reiser4_pool {
63650 + size_t obj_size;
63651 + int objs;
63652 + char *data;
63653 + struct list_head free;
63654 + struct list_head used;
63655 + struct list_head extra;
63656 +} reiser4_pool;
63657 +
63658 +typedef struct reiser4_pool_header {
63659 + /* object is either on free or "used" lists */
63660 + struct list_head usage_linkage;
63661 + struct list_head level_linkage;
63662 + struct list_head extra_linkage;
63663 +} reiser4_pool_header;
63664 +
63665 +typedef enum {
63666 + POOLO_BEFORE,
63667 + POOLO_AFTER,
63668 + POOLO_LAST,
63669 + POOLO_FIRST
63670 +} pool_ordering;
63671 +
63672 +/* pool manipulation functions */
63673 +
63674 +extern void reiser4_init_pool(reiser4_pool * pool, size_t obj_size,
63675 + int num_of_objs, char *data);
63676 +extern void reiser4_done_pool(reiser4_pool * pool);
63677 +extern void reiser4_pool_free(reiser4_pool * pool, reiser4_pool_header * h);
63678 +reiser4_pool_header *add_obj(reiser4_pool * pool, struct list_head * list,
63679 + pool_ordering order,
63680 + reiser4_pool_header * reference);
63681 +
63682 +/* __REISER4_POOL_H__ */
63683 +#endif
63684 +
63685 +/* Make Linus happy.
63686 + Local variables:
63687 + c-indentation-style: "K&R"
63688 + mode-name: "LC"
63689 + c-basic-offset: 8
63690 + tab-width: 8
63691 + fill-column: 120
63692 + End:
63693 +*/
63694 Index: linux-2.6.16/fs/reiser4/readahead.c
63695 ===================================================================
63696 --- /dev/null
63697 +++ linux-2.6.16/fs/reiser4/readahead.c
63698 @@ -0,0 +1,138 @@
63699 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
63700 + * reiser4/README */
63701 +
63702 +#include "forward.h"
63703 +#include "tree.h"
63704 +#include "tree_walk.h"
63705 +#include "super.h"
63706 +#include "inode.h"
63707 +#include "key.h"
63708 +#include "znode.h"
63709 +
63710 +#include <linux/swap.h> /* for totalram_pages */
63711 +
63712 +void init_ra_info(ra_info_t * rai)
63713 +{
63714 + rai->key_to_stop = *min_key();
63715 +}
63716 +
63717 +/* global formatted node readahead parameter. It can be set by mount option -o readahead:NUM:1 */
63718 +static inline int ra_adjacent_only(int flags)
63719 +{
63720 + return flags & RA_ADJACENT_ONLY;
63721 +}
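+
+/* e.g. mounting with "-o readahead:16:1" would limit formatted readahead
+ * to 16 nodes and, via RA_ADJACENT_ONLY, to physically adjacent blocks
+ * only (a reading of the option format described above; the actual
+ * parsing lives in the mount code). */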
63722 +
63723 +/* this is used by formatted_readahead to decide whether a read for the right neighbor of a node is to be issued. It
63724 + returns 1 if the right neighbor's first key is less than or equal to the readahead's stop key */
63725 +static int should_readahead_neighbor(znode * node, ra_info_t * info)
63726 +{
63727 + int result;
63728 +
63729 + read_lock_dk(znode_get_tree(node));
63730 + result = keyle(znode_get_rd_key(node), &info->key_to_stop);
63731 + read_unlock_dk(znode_get_tree(node));
63732 + return result;
63733 +}
63734 +
63735 +#define LOW_MEM_PERCENTAGE (5)
63736 +
63737 +static int low_on_memory(void)
63738 +{
63739 + unsigned int freepages;
63740 +
63741 + freepages = nr_free_pages();
63742 + return freepages < (totalram_pages * LOW_MEM_PERCENTAGE / 100);
63743 +}
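+
+/* e.g. on a machine with 4 KiB pages and 1 GiB of RAM, totalram_pages is
+ * 262144, so readahead is skipped once fewer than 262144 * 5 / 100 = 13107
+ * pages (about 51 MiB) are free. */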
63744 +
63745 +/* start read for @node and for a few of its right neighbors */
63746 +void formatted_readahead(znode * node, ra_info_t * info)
63747 +{
63748 + ra_params_t *ra_params;
63749 + znode *cur;
63750 + int i;
63751 + int grn_flags;
63752 + lock_handle next_lh;
63753 +
63754 + /* do nothing if no real block number has been assigned to the node yet (it exists only in cache). */
63755 + if (blocknr_is_fake(znode_get_block(node)))
63756 + return;
63757 +
63758 + ra_params = get_current_super_ra_params();
63759 +
63760 + if (znode_page(node) == NULL)
63761 + jstartio(ZJNODE(node));
63762 +
63763 + if (znode_get_level(node) != LEAF_LEVEL)
63764 + return;
63765 +
63766 + /* don't waste memory for read-ahead when low on memory */
63767 + if (low_on_memory())
63768 + return;
63769 +
63770 + /* We can have locked nodes on upper tree levels; in this situation lock
63771 + priorities do not help to resolve deadlocks, so we have to use TRY_LOCK
63772 + here. */
63773 + grn_flags = (GN_CAN_USE_UPPER_LEVELS | GN_TRY_LOCK);
63774 +
63775 + i = 0;
63776 + cur = zref(node);
63777 + init_lh(&next_lh);
63778 + while (i < ra_params->max) {
63779 + const reiser4_block_nr *nextblk;
63780 +
63781 + if (!should_readahead_neighbor(cur, info))
63782 + break;
63783 +
63784 + if (reiser4_get_right_neighbor
63785 + (&next_lh, cur, ZNODE_READ_LOCK, grn_flags))
63786 + break;
63787 +
63788 + nextblk = znode_get_block(next_lh.node);
63789 + if (blocknr_is_fake(nextblk) ||
63790 + (ra_adjacent_only(ra_params->flags)
63791 + && *nextblk != *znode_get_block(cur) + 1)) {
63792 + break;
63793 + }
63794 +
63795 + zput(cur);
63796 + cur = zref(next_lh.node);
63797 + done_lh(&next_lh);
63798 + if (znode_page(cur) == NULL)
63799 + jstartio(ZJNODE(cur));
63800 + else
63801 + /* Do not scan read-ahead window if pages already
63802 + * allocated (and i/o already started). */
63803 + break;
63804 +
63805 + i++;
63806 + }
63807 + zput(cur);
63808 + done_lh(&next_lh);
63809 +}
63810 +
63811 +void reiser4_readdir_readahead_init(struct inode *dir, tap_t * tap)
63812 +{
63813 + reiser4_key *stop_key;
63814 +
63815 + assert("nikita-3542", dir != NULL);
63816 + assert("nikita-3543", tap != NULL);
63817 +
63818 + stop_key = &tap->ra_info.key_to_stop;
63819 + /* initialize readdir readahead information: include into the readahead
63820 + * window the stat data of all files of the directory */
63821 + set_key_locality(stop_key, get_inode_oid(dir));
63822 + set_key_type(stop_key, KEY_SD_MINOR);
63823 + set_key_ordering(stop_key, get_key_ordering(max_key()));
63824 + set_key_objectid(stop_key, get_key_objectid(max_key()));
63825 + set_key_offset(stop_key, get_key_offset(max_key()));
63826 +}
63827 +
63828 +/*
63829 + Local variables:
63830 + c-indentation-style: "K&R"
63831 + mode-name: "LC"
63832 + c-basic-offset: 8
63833 + tab-width: 8
63834 + fill-column: 80
63835 + End:
63836 +*/
63837 Index: linux-2.6.16/fs/reiser4/readahead.h
63838 ===================================================================
63839 --- /dev/null
63840 +++ linux-2.6.16/fs/reiser4/readahead.h
63841 @@ -0,0 +1,48 @@
63842 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
63843 +
63844 +#ifndef __READAHEAD_H__
63845 +#define __READAHEAD_H__
63846 +
63847 +#include "key.h"
63848 +
63849 +typedef enum {
63850 + RA_ADJACENT_ONLY = 1, /* only requests nodes which are adjacent. Default is NO (not only adjacent) */
63851 +} ra_global_flags;
63852 +
63853 +/* reiser4 super block has a field of this type. It controls readahead during tree traversals */
63854 +typedef struct formatted_read_ahead_params {
63855 + unsigned long max; /* request no more than this number of nodes. Default is totalram_pages / 4 */
63856 + int flags;
63857 +} ra_params_t;
63858 +
63859 +typedef struct {
63860 + reiser4_key key_to_stop;
63861 +} ra_info_t;
63862 +
63863 +void formatted_readahead(znode *, ra_info_t *);
63864 +void init_ra_info(ra_info_t * rai);
63865 +
63866 +struct reiser4_file_ra_state {
63867 + loff_t start; /* Current window */
63868 + loff_t size;
63869 + loff_t next_size; /* Next window size */
63870 + loff_t ahead_start; /* Ahead window */
63871 + loff_t ahead_size;
63872 + loff_t max_window_size; /* Maximum readahead window */
63873 + loff_t slow_start; /* enlarging r/a size algorithm. */
63874 +};
63875 +
63876 +extern void reiser4_readdir_readahead_init(struct inode *dir, tap_t * tap);
63877 +
63878 +/* __READAHEAD_H__ */
63879 +#endif
63880 +
63881 +/*
63882 + Local variables:
63883 + c-indentation-style: "K&R"
63884 + mode-name: "LC"
63885 + c-basic-offset: 8
63886 + tab-width: 8
63887 + fill-column: 120
63888 + End:
63889 +*/
63890 Index: linux-2.6.16/fs/reiser4/reiser4.h
63891 ===================================================================
63892 --- /dev/null
63893 +++ linux-2.6.16/fs/reiser4/reiser4.h
63894 @@ -0,0 +1,276 @@
63895 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
63896 + * reiser4/README */
63897 +
63898 +/* definitions of common constants used by reiser4 */
63899 +
63900 +#if !defined( __REISER4_H__ )
63901 +#define __REISER4_H__
63902 +
63903 +#include <linux/config.h>
63904 +#include <asm/param.h> /* for HZ */
63905 +#include <linux/errno.h>
63906 +#include <linux/types.h>
63907 +#include <linux/fs.h>
63908 +#include <asm/hardirq.h>
63909 +#include <linux/sched.h>
63910 +
63911 +/*
63912 + * reiser4 compilation options.
63913 + */
63914 +
63915 +#if defined(CONFIG_REISER4_DEBUG)
63916 +/* turn on assertion checks */
63917 +#define REISER4_DEBUG (1)
63918 +#else
63919 +#define REISER4_DEBUG (0)
63920 +#endif
63921 +
63922 +#if defined(CONFIG_ZLIB_INFLATE)
63923 +/* turn on zlib */
63924 +#define REISER4_ZLIB (1)
63925 +#else
63926 +#define REISER4_ZLIB (0)
63927 +#endif
63928 +
63929 +#if defined(CONFIG_CRYPTO_SHA256)
63930 +#define REISER4_SHA256 (1)
63931 +#else
63932 +#define REISER4_SHA256 (0)
63933 +#endif
63934 +
63935 +#if defined(CONFIG_CRYPTO_AES_586)
63936 +#define REISER4_AES (1)
63937 +#else
63938 +#define REISER4_AES (0)
63939 +#endif
63940 +
63941 +/*
63942 + * Turn on large keys mode. In this mode (which is the default), a reiser4
63943 + * key has 4 8-byte components. In the old "small key" mode, it has 3 8-byte
63944 + * components. The additional component, referred to as "ordering", is used
63945 + * to order the items of which a given object is composed. As such, ordering
63946 + * is placed between locality and objectid. For a directory item, ordering
63947 + * contains an initial prefix of the file name this item is for. This sorts
63948 + * all directory items within a given directory lexicographically (but see
63949 + * fibration.[ch]). For file body and stat-data, ordering contains an initial
63950 + * prefix of the name the file was initially created with. In the common case
63951 + * (files with a single name) this orders file bodies and stat-datas in
63952 + * the same order as their respective directory entries, thus speeding up
63953 + * readdir.
63954 + *
63955 + * Note that the kernel can only mount a file system with the same key size
63956 + * as the one it is compiled for, so flipping this option may render your
63957 + * data inaccessible.
63958 + */
63959 +#define REISER4_LARGE_KEY (1)
63960 +/*#define REISER4_LARGE_KEY (0)*/
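+
+/*
+ * A sketch of the layout described above (the authoritative definitions
+ * live in key.h); each field is one 8-byte word:
+ *
+ *	large keys:	| locality | ordering | objectid | offset |
+ *	small keys:	| locality | objectid | offset |
+ */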
63961 +
63962 +/*#define GUESS_EXISTS 1*/
63963 +
63964 +/*
63965 + * PLEASE update fs/reiser4/kattr.c:show_options() when adding new compilation
63966 + * option
63967 + */
63968 +
63969 +extern const char *REISER4_SUPER_MAGIC_STRING;
63970 +extern const int REISER4_MAGIC_OFFSET; /* offset to magic string from the
63971 + * beginning of device */
63972 +
63973 +/* here go tunable parameters that are not worth special entry in kernel
63974 + configuration */
63975 +
63976 +/* default number of slots in coord-by-key caches */
63977 +#define CBK_CACHE_SLOTS (16)
63978 +/* how many elementary tree operation to carry on the next level */
63979 +#define CARRIES_POOL_SIZE (5)
63980 +/* size of pool of preallocated nodes for carry process. */
63981 +#define NODES_LOCKED_POOL_SIZE (5)
63982 +
63983 +#define REISER4_NEW_NODE_FLAGS (COPI_LOAD_LEFT | COPI_LOAD_RIGHT | COPI_GO_LEFT)
63984 +#define REISER4_NEW_EXTENT_FLAGS (COPI_LOAD_LEFT | COPI_LOAD_RIGHT | COPI_GO_LEFT)
63985 +#define REISER4_PASTE_FLAGS (COPI_GO_LEFT)
63986 +#define REISER4_INSERT_FLAGS (COPI_GO_LEFT)
63987 +
63988 +/* we are supporting reservation of disk space on uid basis */
63989 +#define REISER4_SUPPORT_UID_SPACE_RESERVATION (0)
63990 +/* we are supporting reservation of disk space for groups */
63991 +#define REISER4_SUPPORT_GID_SPACE_RESERVATION (0)
63992 +/* we are supporting reservation of disk space for root */
63993 +#define REISER4_SUPPORT_ROOT_SPACE_RESERVATION (0)
63994 +/* we use rapid flush mode, see flush.c for comments. */
63995 +#define REISER4_USE_RAPID_FLUSH (1)
63996 +
63997 +/*
63998 + * set this to 0 if you don't want to use wait-for-flush in ->writepage().
63999 + */
64000 +#define REISER4_USE_ENTD (1)
64001 +
64002 +/* key allocation is Plan-A */
64003 +#define REISER4_PLANA_KEY_ALLOCATION (1)
64004 +/* key allocation follows good old 3.x scheme */
64005 +#define REISER4_3_5_KEY_ALLOCATION (0)
64006 +
64007 +/* size of hash-table for znodes */
64008 +#define REISER4_ZNODE_HASH_TABLE_SIZE (1 << 13)
64009 +
64010 +/* number of buckets in lnode hash-table */
64011 +#define LNODE_HTABLE_BUCKETS (1024)
64012 +
64013 +/* some ridiculously high maximal limit on the height of the znode tree. This
64014 + is used in declarations of various per-level arrays and
64015 + to allocate a statistics-gathering array for per-level stats. */
64016 +#define REISER4_MAX_ZTREE_HEIGHT (8)
64017 +
64018 +#define REISER4_PANIC_MSG_BUFFER_SIZE (1024)
64019 +
64020 +/* If an array contains fewer than REISER4_SEQ_SEARCH_BREAK elements, then
64021 + sequential search is on average faster than binary. This is because
64022 + of better optimization and because sequential search is more CPU
64023 + cache friendly. This number (25) was found by experiments on a dual AMD
64024 + Athlon(tm), 1400MHz.
64025 +
64026 + NOTE: testing in the kernel has shown that binary search is more effective
64027 + than implied by the results of the user-level benchmarking. Probably because
64028 + in a node, keys are separated by other data. So the value was adjusted after
64029 + a few tests. More thorough tuning is needed.
64030 +*/
64031 +#define REISER4_SEQ_SEARCH_BREAK (3)
64032 +
64033 +/* don't allow tree to be lower than this */
64034 +#define REISER4_MIN_TREE_HEIGHT (TWIG_LEVEL)
64035 +
64036 +/* NOTE NIKITA this is no longer used: maximal atom size is auto-adjusted to
64037 + * available memory. */
64038 +/* Default value of maximal atom size. Can be overridden by the
64039 + tmgr.atom_max_size mount option. By default infinity. */
64040 +#define REISER4_ATOM_MAX_SIZE ((unsigned)(~0))
64041 +
64042 +/* Default value of maximal atom age (in jiffies). After reaching this age
64043 + atom will be forced to commit, either synchronously or asynchronously. Can
64044 + be overridden by the tmgr.atom_max_age mount option. */
64045 +#define REISER4_ATOM_MAX_AGE (600 * HZ)
64046 +
64047 +/* sleeping period for ktxnmgrd */
64048 +#define REISER4_TXNMGR_TIMEOUT (5 * HZ)
64049 +
64050 +/* timeout to wait for ent thread in writepage. Default: 3 milliseconds. */
64051 +#define REISER4_ENTD_TIMEOUT (3 * HZ / 1000)
64052 +
64053 +/* start complaining after that many restarts in coord_by_key().
64054 +
64055 + This either means incredibly heavy contention for this part of a tree, or
64056 + some corruption or bug.
64057 +*/
64058 +#define REISER4_CBK_ITERATIONS_LIMIT (100)
64059 +
64060 +/* return -EIO after that many iterations in coord_by_key().
64061 +
64062 + I have witnessed more than 800 iterations (in 30 thread test) before cbk
64063 + finished. --nikita
64064 +*/
64065 +#define REISER4_MAX_CBK_ITERATIONS 500000
64066 +
64067 +/* put a per-inode limit on maximal number of directory entries with identical
64068 + keys in hashed directory.
64069 +
64070 + Disable this until inheritance interfaces stabilize: we need some way to
64071 + set per directory limit.
64072 +*/
64073 +#define REISER4_USE_COLLISION_LIMIT (0)
64074 +
64075 +/* If flush finds more than FLUSH_RELOCATE_THRESHOLD adjacent dirty leaf-level blocks it
64076 + will force them to be relocated. */
64077 +#define FLUSH_RELOCATE_THRESHOLD 64
64078 +/* If flush can find a block allocation within FLUSH_RELOCATE_DISTANCE
64079 + of the preceder, it will relocate to that position. */
64080 +#define FLUSH_RELOCATE_DISTANCE 64
64081 +
64082 +/* If we have written this many or more blocks before encountering a busy
64083 + jnode in the flush list, abort flushing, hoping that next time we are called
64084 + this jnode will already be clean and we will save some seeks. */
64085 +#define FLUSH_WRITTEN_THRESHOLD 50
64086 +
64087 +/* The maximum number of nodes to scan left on a level during flush. */
64088 +#define FLUSH_SCAN_MAXNODES 10000
64089 +
64090 +/* per-atom limit of flushers */
64091 +#define ATOM_MAX_FLUSHERS (1)
64092 +
64093 +/* default tracing buffer size */
64094 +#define REISER4_TRACE_BUF_SIZE (1 << 15)
64095 +
64096 +/* what size units of IO we would like cp, etc., to use, in writing to
64097 + reiser4. In bytes.
64098 +
64099 + Can be overwritten by optimal_io_size mount option.
64100 +*/
64101 +#define REISER4_OPTIMAL_IO_SIZE (64 * 1024)
64102 +
64103 +/* see comments in inode.c:oid_to_uino() */
64104 +#define REISER4_UINO_SHIFT (1 << 30)
64105 +
64106 +/* Mark function argument as unused to avoid compiler warnings. */
64107 +#define UNUSED_ARG __attribute__((unused))
64108 +
64109 +#if ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 3)) || (__GNUC__ > 3)
64110 +#define NONNULL __attribute__((nonnull))
64111 +#else
64112 +#define NONNULL
64113 +#endif
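Both annotations attach in the standard GCC attribute positions; a hypothetical declaration using them might look like:

/* Hypothetical declaration: the pointer argument must be non-NULL;
 * @flags is kept for interface compatibility but deliberately unused. */
static int probe_key(const reiser4_key * key,
		     int flags UNUSED_ARG) NONNULL;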
64114 +
64115 +/* master super block offset in bytes.*/
64116 +#define REISER4_MASTER_OFFSET 65536
64117 +
64118 +/* size of VFS block */
64119 +#define VFS_BLKSIZE 512
64120 +/* number of bits in size of VFS block (512==2^9) */
64121 +#define VFS_BLKSIZE_BITS 9
64122 +
64123 +#define REISER4_I reiser4_inode_data
64124 +
64125 +/* implication */
64126 +#define ergo( antecedent, consequent ) ( !( antecedent ) || ( consequent ) )
64127 +/* logical equivalence */
64128 +#define equi( p1, p2 ) ( ergo( ( p1 ), ( p2 ) ) && ergo( ( p2 ), ( p1 ) ) )
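These read naturally in assertions; for example, the lookup code later in this patch uses ergo() to state a conditional post-condition:

/* from coord_by_key(): on success, the coord must point into the locked node */
assert("nikita-3247", ergo(!IS_CBKERR(result), coord->node == lh->node));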
64129 +
64130 +#define sizeof_array(x) ((int) (sizeof(x) / sizeof(x[0])))
64131 +
64132 +#define NOT_YET (0)
64133 +
64134 +/** Reiser4 specific error codes **/
64135 +
64136 +#define REISER4_ERROR_CODE_BASE 500
64137 +
64138 +/* Neighbor is not available (side neighbor or parent) */
64139 +#define E_NO_NEIGHBOR (REISER4_ERROR_CODE_BASE)
64140 +
64141 +/* Node was not found in cache */
64142 +#define E_NOT_IN_CACHE (REISER4_ERROR_CODE_BASE + 1)
64143 +
64144 +/* node does not have enough free space to complete the balancing operation */
64145 +#define E_NODE_FULL (REISER4_ERROR_CODE_BASE + 2)
64146 +
64147 +/* repeat operation */
64148 +#define E_REPEAT (REISER4_ERROR_CODE_BASE + 3)
64149 +
64150 +/* a deadlock was detected */
64151 +#define E_DEADLOCK (REISER4_ERROR_CODE_BASE + 4)
64152 +
64153 +/* operation cannot be performed, because it would block and non-blocking mode
64154 + * was requested. */
64155 +#define E_BLOCK (REISER4_ERROR_CODE_BASE + 5)
64156 +
64157 +/* wait for some event (depends on context), then repeat */
64158 +#define E_WAIT (REISER4_ERROR_CODE_BASE + 6)
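Callers receive these codes negated, like ordinary errno values, and branch on specific ones; the tree traversal later in this patch, for instance, treats -E_DEADLOCK as an expected outcome of deadlock avoidance rather than a failure. A condensed sketch of the usual retry pattern (try_operation() is hypothetical):

/* -E_REPEAT means "release everything and restart", not a hard error */
do {
	result = try_operation();	/* returns 0 or a negated error code */
} while (result == -E_REPEAT);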
64159 +
64160 +#endif /* __REISER4_H__ */
64161 +
64162 +/* Make Linus happy.
64163 + Local variables:
64164 + c-indentation-style: "K&R"
64165 + mode-name: "LC"
64166 + c-basic-offset: 8
64167 + tab-width: 8
64168 + fill-column: 120
64169 + End:
64170 +*/
64171 Index: linux-2.6.16/fs/reiser4/safe_link.c
64172 ===================================================================
64173 --- /dev/null
64174 +++ linux-2.6.16/fs/reiser4/safe_link.c
64175 @@ -0,0 +1,351 @@
64176 +/* Copyright 2003, 2004 by Hans Reiser, licensing governed by
64177 + * reiser4/README */
64178 +
64179 +/* Safe-links. */
64180 +
64181 +/*
64182 + * Safe-links are used to maintain file system consistency during operations
64183 + * that spawn multiple transactions. For example:
64184 + *
64185 + * 1. Unlink. UNIX supports "open-but-unlinked" files, that is, files
64186 + * without user-visible names in the file system, but still opened by some
64187 + * active process. What happens here is that unlink proper (i.e., removal
64188 + * of the last file name) and file deletion (truncate of the file body to
64189 + * zero and deletion of stat-data, which happens when the last file
64190 + * descriptor is closed) may belong to different transactions T1 and T2. If
64191 + * a crash happens after T1 commits, but before T2 commits, the on-disk
64192 + * file system has a file without a name, that is, a disk space leak.
64193 + *
64194 + * 2. Truncate. Truncate of a large file may spawn multiple transactions.
64195 + * If the system crashes while truncate was in progress, the file is left
64196 + * partially truncated, which violates the "atomicity guarantees" of
64197 + * reiser4, viz. that every system call is atomic.
64198 + *
64199 + * Safe-links address both cases above. Basically, a safe-link is a way to
64200 + * post some operation to be executed during commit of some transaction
64201 + * other than the current one. (Another way to look at the safe-link is to
64202 + * interpret it as logical logging.)
64203 + *
64204 + * Specifically, at the beginning of unlink a safe-link is inserted into
64205 + * the tree. This safe-link is normally removed by the file deletion code
64206 + * (during transaction T2 in the above terms). Truncate also inserts a
64207 + * safe-link that is normally removed when the truncate operation finishes.
64208 + *
64209 + * This means that in the case of a "clean umount" there are no safe-links
64210 + * in the tree. If safe-links are observed during mount, it means that (a)
64211 + * the system was terminated abnormally, and (b) the safe-links correspond
64212 + * to the "pending" (i.e., not finished) operations that were in progress
64213 + * during system termination. Each safe-link records enough information to
64214 + * complete the corresponding operation, and mount simply "replays" them
64215 + * (hence the analogy with logical logging).
64216 + *
64217 + * Safe-links are implemented as blackbox items (see
64218 + * plugin/item/blackbox.[ch]).
64219 + *
64220 + * For reference: ext3 has a similar mechanism, called an "orphan list"
64221 + * there.
64222 + */
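To make the protocol concrete, here is a sketch of how unlink could bracket the two transactions using this file's API (not the actual unlink code; a SAFE_UNLINK link type is assumed here, while only SAFE_TRUNCATE appears in this file):

/* Sketch of the two-transaction protocol described above. */
static int unlink_sketch(struct inode *inode)
{
	reiser4_tree *tree = tree_by_inode(inode);
	int result;

	/* T1: removal of the last name posts a safe-link first */
	result = safe_link_grab(tree, BA_CAN_COMMIT);
	if (result == 0)
		result = safe_link_add(inode, SAFE_UNLINK);
	safe_link_release(tree);
	if (result != 0)
		return result;

	/* ... T1 commits; a crash from here on leaves the safe-link in
	 * the tree, and process_safelinks() finishes the job at the
	 * next mount ... */

	/* T2: actual deletion on last close removes the safe-link */
	result = safe_link_grab(tree, BA_CAN_COMMIT);
	if (result == 0)
		result = safe_link_del(tree, get_inode_oid(inode),
				       SAFE_UNLINK);
	safe_link_release(tree);
	return result;
}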
64223 +
64224 +#include "safe_link.h"
64225 +#include "debug.h"
64226 +#include "inode.h"
64227 +
64228 +#include "plugin/item/blackbox.h"
64229 +
64230 +#include <linux/fs.h>
64231 +
64232 +/*
64233 + * On-disk format of safe-link.
64234 + */
64235 +typedef struct safelink {
64236 + reiser4_key sdkey; /* key of stat-data for the file safe-link is
64237 + * for */
64238 + d64 size; /* size to which file should be truncated */
64239 +} safelink_t;
64240 +
64241 +/*
64242 + * locality where safe-link items are stored. Next to the objectid of root
64243 + * directory.
64244 + */
64245 +static oid_t safe_link_locality(reiser4_tree * tree)
64246 +{
64247 + return get_key_objectid(get_super_private(tree->super)->df_plug->
64248 + root_dir_key(tree->super)) + 1;
64249 +}
64250 +
64251 +/*
64252 + Construct a key for the safe-link. Key has the following format:
64253 +
64254 +| 60 | 4 | 64 | 4 | 60 | 64 |
64255 ++---------------+---+------------------+---+---------------+------------------+
64256 +| locality | 0 | 0 | 0 | objectid | link type |
64257 ++---------------+---+------------------+---+---------------+------------------+
64258 +| | | | |
64259 +| 8 bytes | 8 bytes | 8 bytes | 8 bytes |
64260 +
64261 + This is in the large-keys format. In the small-keys format the second
64262 + 8-byte chunk is omitted. Locality is a constant returned by
64263 + safe_link_locality(). objectid is the oid of the file on which the
64264 + operation protected by this safe-link is performed. link-type is used to
64265 + distinguish safe-links for different operations.
64266 +
64267 + */
64268 +static reiser4_key *build_link_key(reiser4_tree * tree, oid_t oid,
64269 + reiser4_safe_link_t link, reiser4_key * key)
64270 +{
64271 + reiser4_key_init(key);
64272 + set_key_locality(key, safe_link_locality(tree));
64273 + set_key_objectid(key, oid);
64274 + set_key_offset(key, link);
64275 + return key;
64276 +}
64277 +
64278 +/*
64279 + * how much disk space is necessary to insert and remove (in the
64280 + * error-handling path) safe-link.
64281 + */
64282 +static __u64 safe_link_tograb(reiser4_tree * tree)
64283 +{
64284 + return
64285 + /* insert safe link */
64286 + estimate_one_insert_item(tree) +
64287 + /* remove safe link */
64288 + estimate_one_item_removal(tree) +
64289 + /* drill to the leaf level during insertion */
64290 + 1 + estimate_one_insert_item(tree) +
64291 + /*
64292 + * possible update of existing safe-link. Actually, if
64293 + * safe-link existed already (we failed to remove it), then no
64294 + * insertion is necessary, so this term is already "covered",
64295 + * but for simplicity let's leave it.
64296 + */
64297 + 1;
64298 +}
64299 +
64300 +/*
64301 + * grab enough disk space to insert and remove (in the error-handling path)
64302 + * safe-link.
64303 + */
64304 +int safe_link_grab(reiser4_tree * tree, reiser4_ba_flags_t flags)
64305 +{
64306 + int result;
64307 +
64308 + grab_space_enable();
64309 + /* The sbinfo->delete semaphore can be taken here.
64310 + * safe_link_release() should be called before leaving reiser4
64311 + * context. */
64312 + result =
64313 + reiser4_grab_reserved(tree->super, safe_link_tograb(tree), flags);
64314 + grab_space_enable();
64315 + return result;
64316 +}
64317 +
64318 +/*
64319 + * release unused disk space reserved by safe_link_grab().
64320 + */
64321 +void safe_link_release(reiser4_tree * tree)
64322 +{
64323 + reiser4_release_reserved(tree->super);
64324 +}
64325 +
64326 +/*
64327 + * insert into the tree a safe-link for operation @link on inode @inode.
64328 + */
64329 +int safe_link_add(struct inode *inode, reiser4_safe_link_t link)
64330 +{
64331 + reiser4_key key;
64332 + safelink_t sl;
64333 + int length;
64334 + int result;
64335 + reiser4_tree *tree;
64336 +
64337 + build_sd_key(inode, &sl.sdkey);
64338 + length = sizeof sl.sdkey;
64339 +
64340 + if (link == SAFE_TRUNCATE) {
64341 + /*
64342 + * for truncate we have to store final file length also,
64343 + * expand item.
64344 + */
64345 + length += sizeof(sl.size);
64346 + put_unaligned(cpu_to_le64(inode->i_size), &sl.size);
64347 + }
64348 + tree = tree_by_inode(inode);
64349 + build_link_key(tree, get_inode_oid(inode), link, &key);
64350 +
64351 + result = store_black_box(tree, &key, &sl, length);
64352 + if (result == -EEXIST)
64353 + result = update_black_box(tree, &key, &sl, length);
64354 + return result;
64355 +}
64356 +
64357 +/*
64358 + * remove safe-link corresponding to the operation @link on inode @inode from
64359 + * the tree.
64360 + */
64361 +int safe_link_del(reiser4_tree * tree, oid_t oid, reiser4_safe_link_t link)
64362 +{
64363 + reiser4_key key;
64364 +
64365 + return kill_black_box(tree, build_link_key(tree, oid, link, &key));
64366 +}
64367 +
64368 +/*
64369 + * in-memory structure to keep information extracted from safe-link. This is
64370 + * used to iterate over all safe-links.
64371 + */
64372 +typedef struct {
64373 + reiser4_tree *tree; /* internal tree */
64374 + reiser4_key key; /* safe-link key */
64375 + reiser4_key sdkey; /* key of object stat-data */
64376 + reiser4_safe_link_t link; /* safe-link type */
64377 + oid_t oid; /* object oid */
64378 + __u64 size; /* final size for truncate */
64379 +} safe_link_context;
64380 +
64381 +/*
64382 + * start iterating over all safe-links.
64383 + */
64384 +static void safe_link_iter_begin(reiser4_tree * tree, safe_link_context * ctx)
64385 +{
64386 + ctx->tree = tree;
64387 + reiser4_key_init(&ctx->key);
64388 + set_key_locality(&ctx->key, safe_link_locality(tree));
64389 + set_key_objectid(&ctx->key, get_key_objectid(max_key()));
64390 + set_key_offset(&ctx->key, get_key_offset(max_key()));
64391 +}
64392 +
64393 +/*
64394 + * return next safe-link.
64395 + */
64396 +static int safe_link_iter_next(safe_link_context * ctx)
64397 +{
64398 + int result;
64399 + safelink_t sl;
64400 +
64401 + result = load_black_box(ctx->tree, &ctx->key, &sl, sizeof sl, 0);
64402 + if (result == 0) {
64403 + ctx->oid = get_key_objectid(&ctx->key);
64404 + ctx->link = get_key_offset(&ctx->key);
64405 + ctx->sdkey = sl.sdkey;
64406 + if (ctx->link == SAFE_TRUNCATE)
64407 + ctx->size = le64_to_cpu(get_unaligned(&sl.size));
64408 + }
64409 + return result;
64410 +}
64411 +
64412 +/*
64413 + * check whether there are any more safe-links left in the tree.
64414 + */
64415 +static int safe_link_iter_finished(safe_link_context * ctx)
64416 +{
64417 + return get_key_locality(&ctx->key) != safe_link_locality(ctx->tree);
64418 +}
64419 +
64420 +/*
64421 + * finish safe-link iteration.
64422 + */
64423 +static void safe_link_iter_end(safe_link_context * ctx)
64424 +{
64425 + /* nothing special */
64426 +}
64427 +
64428 +/*
64429 + * process a single safe-link.
64430 + */
64431 +static int process_safelink(struct super_block *super, reiser4_safe_link_t link,
64432 + reiser4_key * sdkey, oid_t oid, __u64 size)
64433 +{
64434 + struct inode *inode;
64435 + int result;
64436 +
64437 + /*
64438 + * obtain object inode by reiser4_iget(), then call object plugin
64439 + * ->safelink() method to do actual work, then delete safe-link on
64440 + * success.
64441 + */
64442 + inode = reiser4_iget(super, sdkey, 1);
64443 + if (!IS_ERR(inode)) {
64444 + file_plugin *fplug;
64445 +
64446 + fplug = inode_file_plugin(inode);
64447 + assert("nikita-3428", fplug != NULL);
64448 + assert("", oid == get_inode_oid(inode));
64449 + if (fplug->safelink != NULL) {
64450 + /* txn_restart_current is not necessary because
64451 + * mounting is single-threaded. However, without it
64452 + * the deadlock detection code will complain (see
64453 + * nikita-3361). */
64454 + txn_restart_current();
64455 + result = fplug->safelink(inode, link, size);
64456 + } else {
64457 + warning("nikita-3430",
64458 + "Cannot handle safelink for %lli",
64459 + (unsigned long long)oid);
64460 + print_key("key", sdkey);
64461 + result = 0;
64462 + }
64463 + if (result != 0) {
64464 + warning("nikita-3431",
64465 + "Error processing safelink for %lli: %i",
64466 + (unsigned long long)oid, result);
64467 + }
64468 + reiser4_iget_complete(inode);
64469 + iput(inode);
64470 + if (result == 0) {
64471 + result = safe_link_grab(get_tree(super), BA_CAN_COMMIT);
64472 + if (result == 0)
64473 + result =
64474 + safe_link_del(get_tree(super), oid, link);
64475 + safe_link_release(get_tree(super));
64476 + /*
64477 + * restart transaction: if there was large number of
64478 + * safe-links, their processing may fail to fit into
64479 + * single transaction.
64480 + */
64481 + if (result == 0)
64482 + txn_restart_current();
64483 + }
64484 + } else
64485 + result = PTR_ERR(inode);
64486 + return result;
64487 +}
64488 +
64489 +/*
64490 + * iterate over all safe-links in the file-system processing them one by one.
64491 + */
64492 +int process_safelinks(struct super_block *super)
64493 +{
64494 + safe_link_context ctx;
64495 + int result;
64496 +
64497 + if (rofs_super(super))
64498 + /* do nothing on the read-only file system */
64499 + return 0;
64500 + safe_link_iter_begin(&get_super_private(super)->tree, &ctx);
64501 + result = 0;
64502 + do {
64503 + result = safe_link_iter_next(&ctx);
64504 + if (safe_link_iter_finished(&ctx) || result == -ENOENT) {
64505 + result = 0;
64506 + break;
64507 + }
64508 + if (result == 0)
64509 + result = process_safelink(super, ctx.link,
64510 + &ctx.sdkey, ctx.oid,
64511 + ctx.size);
64512 + } while (result == 0);
64513 + safe_link_iter_end(&ctx);
64514 + return result;
64515 +}
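On the mount side, replay then reduces to a single call once the tree is usable; a hypothetical caller:

/* Hypothetical mount-time hook: replay operations left pending by an
 * abnormal termination (a no-op on read-only mounts, see above). */
static int finish_pending_operations(struct super_block *super)
{
	return process_safelinks(super);
}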
64516 +
64517 +/* Make Linus happy.
64518 + Local variables:
64519 + c-indentation-style: "K&R"
64520 + mode-name: "LC"
64521 + c-basic-offset: 8
64522 + tab-width: 8
64523 + fill-column: 120
64524 + scroll-step: 1
64525 + End:
64526 +*/
64527 Index: linux-2.6.16/fs/reiser4/safe_link.h
64528 ===================================================================
64529 --- /dev/null
64530 +++ linux-2.6.16/fs/reiser4/safe_link.h
64531 @@ -0,0 +1,29 @@
64532 +/* Copyright 2003 by Hans Reiser, licensing governed by
64533 + * reiser4/README */
64534 +
64535 +/* Safe-links. See safe_link.c for details. */
64536 +
64537 +#if !defined( __FS_SAFE_LINK_H__ )
64538 +#define __FS_SAFE_LINK_H__
64539 +
64540 +#include "tree.h"
64541 +
64542 +int safe_link_grab(reiser4_tree * tree, reiser4_ba_flags_t flags);
64543 +void safe_link_release(reiser4_tree * tree);
64544 +int safe_link_add(struct inode *inode, reiser4_safe_link_t link);
64545 +int safe_link_del(reiser4_tree *, oid_t oid, reiser4_safe_link_t link);
64546 +
64547 +int process_safelinks(struct super_block *super);
64548 +
64549 +/* __FS_SAFE_LINK_H__ */
64550 +#endif
64551 +
64552 +/* Make Linus happy.
64553 + Local variables:
64554 + c-indentation-style: "K&R"
64555 + mode-name: "LC"
64556 + c-basic-offset: 8
64557 + tab-width: 8
64558 + fill-column: 120
64559 + End:
64560 +*/
64561 Index: linux-2.6.16/fs/reiser4/seal.c
64562 ===================================================================
64563 --- /dev/null
64564 +++ linux-2.6.16/fs/reiser4/seal.c
64565 @@ -0,0 +1,217 @@
64566 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
64567 +/* Seals implementation. */
64568 +/* Seals are "weak" tree pointers. They are analogous to tree coords in
64569 + allowing one to bypass tree traversal. But normal usage of coords implies
64570 + that the node pointed to by a coord is locked, whereas seals don't keep a
64571 + lock (or even a reference) to the znode. Instead, each znode contains a
64572 + version number, increased on each znode modification. This version number
64573 + is copied into a seal when the seal is created. Later, one can "validate" a
64574 + seal by calling seal_validate(). If the znode is in cache and its version
64575 + number is still the same, the seal is "pristine" and the coord associated
64576 + with it can be re-used immediately.
64577 +
64578 + If, on the other hand, the znode is out of cache, or is obviously a
64579 + different one from the znode the seal was initially attached to (e.g., it
64580 + is on a different level, or is being removed from the tree), the seal is
64581 + irreparably invalid ("burned") and tree traversal has to be repeated.
64582 +
64583 + Otherwise, there is some hope that, while the znode was modified (and the
64584 + seal was "broken" as a result), the key attached to the seal is still in
64585 + the node. This is checked by first comparing this key with the delimiting
64586 + keys of the node and, if the key is ok, doing an intra-node lookup.
64587 +
64588 + Znode versions are maintained in the following way:
64589 +
64590 + there is a reiser4_tree.znode_epoch counter. Whenever a new znode is
64591 + created, znode_epoch is incremented and its new value is stored in the
64592 + ->version field of the new znode. Whenever a znode is dirtied (which means
64593 + it was probably modified), znode_epoch is also incremented and its new
64594 + value is stored in znode->version. This is done because just incrementing
64595 + znode->version on each update is not enough: it may so happen that a znode
64596 + gets deleted, a new znode is allocated for the same disk block and gets the
64597 + same version counter, tricking the seal code into a false positive.
64598 +*/
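The intended calling pattern is therefore: remember a seal right after a successful lookup, then on the next access try the seal first and fall back to a full traversal only when validation fails. A condensed sketch built from the functions declared in this patch (the wrapper function and its fixed lookup parameters are illustrative):

/* Sketch: fast path through the seal, slow path through coord_by_key(). */
static int lookup_with_seal(seal_t * seal, coord_t * coord,
			    const reiser4_key * key, lock_handle * lh)
{
	int result;

	if (seal_is_set(seal)) {
		/* fast path: re-use the remembered position */
		result = seal_validate(seal, coord, key, lh,
				       ZNODE_READ_LOCK, ZNODE_LOCK_LOPRI);
		if (result == 0)
			return CBK_COORD_FOUND; /* sealed coord re-used */
		if (result != -E_REPEAT)
			return result;
		/* seal was broken or burned: fall through to full lookup */
	}
	/* slow path: top-to-bottom tree traversal */
	result = coord_by_key(current_tree, key, coord, lh,
			      ZNODE_READ_LOCK, FIND_EXACT,
			      LEAF_LEVEL, LEAF_LEVEL, CBK_UNIQUE, NULL);
	if (result == CBK_COORD_FOUND)
		seal_init(seal, coord, key);	/* remember for next time */
	return result;
}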
64599 +
64600 +#include "forward.h"
64601 +#include "debug.h"
64602 +#include "key.h"
64603 +#include "coord.h"
64604 +#include "seal.h"
64605 +#include "plugin/item/item.h"
64606 +#include "plugin/node/node.h"
64607 +#include "jnode.h"
64608 +#include "znode.h"
64609 +#include "super.h"
64610 +
64611 +static znode *seal_node(const seal_t * seal);
64612 +static int seal_matches(const seal_t * seal, znode * node);
64613 +
64614 +/* initialise seal. This can be called several times on the same seal. @coord
64615 + and @key can be NULL. */
64616 +void seal_init(seal_t * seal /* seal to initialise */ ,
64617 + const coord_t * coord /* coord @seal will be attached to */ ,
64618 + const reiser4_key * key UNUSED_ARG /* key @seal will be
64619 + * attached to */ )
64620 +{
64621 + assert("nikita-1886", seal != NULL);
64622 + memset(seal, 0, sizeof *seal);
64623 + if (coord != NULL) {
64624 + znode *node;
64625 +
64626 + node = coord->node;
64627 + assert("nikita-1987", node != NULL);
64628 + spin_lock_znode(node);
64629 + seal->version = node->version;
64630 + assert("nikita-1988", seal->version != 0);
64631 + seal->block = *znode_get_block(node);
64632 +#if REISER4_DEBUG
64633 + seal->coord1 = *coord;
64634 + if (key != NULL)
64635 + seal->key = *key;
64636 +#endif
64637 + spin_unlock_znode(node);
64638 + }
64639 +}
64640 +
64641 +/* finish with seal */
64642 +void seal_done(seal_t * seal /* seal to clear */ )
64643 +{
64644 + assert("nikita-1887", seal != NULL);
64645 + seal->version = 0;
64646 +}
64647 +
64648 +/* true if seal was initialised */
64649 +int seal_is_set(const seal_t * seal /* seal to query */ )
64650 +{
64651 + assert("nikita-1890", seal != NULL);
64652 + return seal->version != 0;
64653 +}
64654 +
64655 +#if REISER4_DEBUG
64656 +/* helper function for seal_validate(). It checks that item at @coord has
64657 + * expected key. This is to detect cases where node was modified but wasn't
64658 + * marked dirty. */
64659 +static inline int check_seal_match(const coord_t * coord /* coord to check */ ,
64660 + const reiser4_key * k /* expected key */ )
64661 +{
64662 + reiser4_key ukey;
64663 +
64664 + return (coord->between != AT_UNIT) ||
64665 + /* FIXME-VS: we only can compare keys for items whose units
64666 + represent exactly one key */
64667 + ((coord_is_existing_unit(coord))
64668 + && (item_is_extent(coord)
64669 + || keyeq(k, unit_key_by_coord(coord, &ukey))))
64670 + || ((coord_is_existing_unit(coord)) && (item_is_ctail(coord))
64671 + && keyge(k, unit_key_by_coord(coord, &ukey)));
64672 +}
64673 +#endif
64674 +
64675 +/* this is used by seal_validate. It accepts the return value of
64676 + * longterm_lock_znode and returns 1 if it can be interpreted as a seal
64677 + * validation failure. For instance, when longterm_lock_znode returns -EINVAL,
64678 + * seal_validate returns -E_REPEAT and the caller will redo the tree search. We cannot
64679 + * do this in longterm_lock_znode(), because sometimes we want to distinguish
64680 + * between -EINVAL and -E_REPEAT. */
64681 +static int should_repeat(int return_code)
64682 +{
64683 + return return_code == -EINVAL;
64684 +}
64685 +
64686 +/* (re-)validate seal.
64687 +
64688 + Checks whether the seal is pristine, and tries to revalidate it if possible.
64689 +
64690 + If the seal was burned, or broken irreparably, return -E_REPEAT.
64691 +
64692 + NOTE-NIKITA currently seal_validate() returns -E_REPEAT if the key we are
64693 + looking for is in the range of keys covered by the sealed node, but the item
64694 + wasn't found by the node's ->lookup() method. An alternative is to return
64695 + -ENOENT in this case, but that would complicate the caller's logic.
64696 +
64697 +*/
64698 +int seal_validate(seal_t * seal /* seal to validate */ ,
64699 + coord_t * coord /* coord to validate against */ ,
64700 + const reiser4_key * key /* key to validate against */ ,
64701 + lock_handle * lh /* resulting lock handle */ ,
64702 + znode_lock_mode mode /* lock node */ ,
64703 + znode_lock_request request /* locking priority */ )
64704 +{
64705 + znode *node;
64706 + int result;
64707 +
64708 + assert("nikita-1889", seal != NULL);
64709 + assert("nikita-1881", seal_is_set(seal));
64710 + assert("nikita-1882", key != NULL);
64711 + assert("nikita-1883", coord != NULL);
64712 + assert("nikita-1884", lh != NULL);
64713 + assert("nikita-1885", keyeq(&seal->key, key));
64714 + assert("nikita-1989", coords_equal(&seal->coord1, coord));
64715 +
64716 + /* obtain znode by block number */
64717 + node = seal_node(seal);
64718 + if (node != NULL) {
64719 + /* znode was in cache, lock it */
64720 + result = longterm_lock_znode(lh, node, mode, request);
64721 + zput(node);
64722 + if (result == 0) {
64723 + if (seal_matches(seal, node)) {
64724 + /* if seal version and znode version
64725 + coincide */
64726 + ON_DEBUG(coord_update_v(coord));
64727 + assert("nikita-1990",
64728 + node == seal->coord1.node);
64729 + assert("nikita-1898",
64730 + WITH_DATA_RET(coord->node, 1,
64731 + check_seal_match(coord,
64732 + key)));
64733 + } else
64734 + result = RETERR(-E_REPEAT);
64735 + }
64736 + if (result != 0) {
64737 + if (should_repeat(result))
64738 + result = RETERR(-E_REPEAT);
64739 + /* unlock node on failure */
64740 + done_lh(lh);
64741 + }
64742 + } else {
64743 + /* znode wasn't in cache */
64744 + result = RETERR(-E_REPEAT);
64745 + }
64746 + return result;
64747 +}
64748 +
64749 +/* helper functions */
64750 +
64751 +/* obtain reference to znode seal points to, if in cache */
64752 +static znode *seal_node(const seal_t * seal /* seal to query */ )
64753 +{
64754 + assert("nikita-1891", seal != NULL);
64755 + return zlook(current_tree, &seal->block);
64756 +}
64757 +
64758 +/* true if @seal version and @node version coincide */
64759 +static int seal_matches(const seal_t * seal /* seal to check */ ,
64760 + znode * node /* node to check */ )
64761 +{
64762 + int result;
64763 +
64764 + assert("nikita-1991", seal != NULL);
64765 + assert("nikita-1993", node != NULL);
64766 +
64767 + spin_lock_znode(node);
64768 + result = (seal->version == node->version);
64769 + spin_unlock_znode(node);
64770 + return result;
64771 +}
64772 +
64773 +/* Make Linus happy.
64774 + Local variables:
64775 + c-indentation-style: "K&R"
64776 + mode-name: "LC"
64777 + c-basic-offset: 8
64778 + tab-width: 8
64779 + fill-column: 120
64780 + scroll-step: 1
64781 + End:
64782 +*/
64783 Index: linux-2.6.16/fs/reiser4/seal.h
64784 ===================================================================
64785 --- /dev/null
64786 +++ linux-2.6.16/fs/reiser4/seal.h
64787 @@ -0,0 +1,49 @@
64788 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
64789 +
64790 +/* Declaration of seals: "weak" tree pointers. See seal.c for comments. */
64791 +
64792 +#ifndef __SEAL_H__
64793 +#define __SEAL_H__
64794 +
64795 +#include "forward.h"
64796 +#include "debug.h"
64797 +#include "dformat.h"
64798 +#include "key.h"
64799 +#include "coord.h"
64800 +
64801 +/* for __u?? types */
64802 +/*#include <linux/types.h>*/
64803 +
64804 +/* seal. See comment at the top of seal.c */
64805 +typedef struct seal_s {
64806 + /* version of znode recorder at the time of seal creation */
64807 + __u64 version;
64808 + /* block number of znode attached to this seal */
64809 + reiser4_block_nr block;
64810 +#if REISER4_DEBUG
64811 + /* coord this seal is attached to. For debugging. */
64812 + coord_t coord1;
64813 + /* key this seal is attached to. For debugging. */
64814 + reiser4_key key;
64815 +#endif
64816 +} seal_t;
64817 +
64818 +extern void seal_init(seal_t *, const coord_t *, const reiser4_key *);
64819 +extern void seal_done(seal_t *);
64820 +extern int seal_is_set(const seal_t *);
64821 +extern int seal_validate(seal_t *, coord_t *,
64822 + const reiser4_key *, lock_handle *,
64823 + znode_lock_mode mode, znode_lock_request request);
64824 +
64825 +/* __SEAL_H__ */
64826 +#endif
64827 +
64828 +/* Make Linus happy.
64829 + Local variables:
64830 + c-indentation-style: "K&R"
64831 + mode-name: "LC"
64832 + c-basic-offset: 8
64833 + tab-width: 8
64834 + fill-column: 120
64835 + End:
64836 +*/
64837 Index: linux-2.6.16/fs/reiser4/search.c
64838 ===================================================================
64839 --- /dev/null
64840 +++ linux-2.6.16/fs/reiser4/search.c
64841 @@ -0,0 +1,1611 @@
64842 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
64843 + * reiser4/README */
64844 +
64845 +#include "forward.h"
64846 +#include "debug.h"
64847 +#include "dformat.h"
64848 +#include "key.h"
64849 +#include "coord.h"
64850 +#include "seal.h"
64851 +#include "plugin/item/item.h"
64852 +#include "plugin/node/node.h"
64853 +#include "plugin/plugin.h"
64854 +#include "jnode.h"
64855 +#include "znode.h"
64856 +#include "block_alloc.h"
64857 +#include "tree_walk.h"
64858 +#include "tree.h"
64859 +#include "reiser4.h"
64860 +#include "super.h"
64861 +#include "inode.h"
64862 +
64863 +#include <linux/slab.h>
64864 +
64865 +static const char *bias_name(lookup_bias bias);
64866 +
64867 +/* tree searching algorithm, intranode searching algorithms are in
64868 + plugin/node/ */
64869 +
64870 +/* tree lookup cache
64871 + *
64872 + * The coord-by-key cache consists of a small list of recently accessed nodes
64873 + * maintained according to the LRU discipline. Before doing real top-to-down
64874 + * tree traversal this cache is scanned for nodes that can contain key
64875 + * requested.
64876 + *
64877 + * The efficiency of coord cache depends heavily on locality of reference for
64878 + * tree accesses. Our user level simulations show reasonably good hit ratios
64879 + * for coord cache under most loads so far.
64880 + */
64881 +
64882 +/* Initialise coord cache slot */
64883 +static void cbk_cache_init_slot(cbk_cache_slot *slot)
64884 +{
64885 + assert("nikita-345", slot != NULL);
64886 +
64887 + INIT_LIST_HEAD(&slot->lru);
64888 + slot->node = NULL;
64889 +}
64890 +
64891 +/* Initialize coord cache */
64892 +int cbk_cache_init(cbk_cache *cache /* cache to init */ )
64893 +{
64894 + int i;
64895 +
64896 + assert("nikita-346", cache != NULL);
64897 +
64898 + cache->slot =
64899 + kmalloc(sizeof(cbk_cache_slot) * cache->nr_slots, GFP_KERNEL);
64900 + if (cache->slot == NULL)
64901 + return RETERR(-ENOMEM);
64902 +
64903 + INIT_LIST_HEAD(&cache->lru);
64904 + for (i = 0; i < cache->nr_slots; ++i) {
64905 + cbk_cache_init_slot(cache->slot + i);
64906 + list_add_tail(&((cache->slot + i)->lru), &cache->lru);
64907 + }
64908 + rwlock_init(&cache->guard);
64909 + return 0;
64910 +}
64911 +
64912 +/* free cbk cache data */
64913 +void cbk_cache_done(cbk_cache * cache /* cache to release */ )
64914 +{
64915 + assert("nikita-2493", cache != NULL);
64916 + if (cache->slot != NULL) {
64917 + kfree(cache->slot);
64918 + cache->slot = NULL;
64919 + }
64920 +}
64921 +
64922 +/* macro to iterate over all cbk cache slots */
64923 +#define for_all_slots(cache, slot) \
64924 + for ((slot) = list_entry((cache)->lru.next, cbk_cache_slot, lru); \
64925 + &(cache)->lru != &(slot)->lru; \
64926 + (slot) = list_entry(slot->lru.next, cbk_cache_slot, lru))
64927 +
64928 +
64929 +#if REISER4_DEBUG
64930 +/* this function assures that [cbk-cache-invariant] invariant holds */
64931 +static int cbk_cache_invariant(const cbk_cache *cache)
64932 +{
64933 + cbk_cache_slot *slot;
64934 + int result;
64935 + int unused;
64936 +
64937 + if (cache->nr_slots == 0)
64938 + return 1;
64939 +
64940 + assert("nikita-2469", cache != NULL);
64941 + unused = 0;
64942 + result = 1;
64943 + read_lock(&((cbk_cache *)cache)->guard);
64944 + for_all_slots(cache, slot) {
64945 + /* in LRU first go all `used' slots followed by `unused' */
64946 + if (unused && (slot->node != NULL))
64947 + result = 0;
64948 + if (slot->node == NULL)
64949 + unused = 1;
64950 + else {
64951 + cbk_cache_slot *scan;
64952 +
64953 + /* all cached nodes are different */
64954 + scan = slot;
64955 + while (result) {
64956 + scan = list_entry(scan->lru.next, cbk_cache_slot, lru);
64957 + if (&cache->lru == &scan->lru)
64958 + break;
64959 + if (slot->node == scan->node)
64960 + result = 0;
64961 + }
64962 + }
64963 + if (!result)
64964 + break;
64965 + }
64966 + read_unlock(&((cbk_cache *)cache)->guard);
64967 + return result;
64968 +}
64969 +
64970 +#endif
64971 +
64972 +/* Remove references, if any, to @node from coord cache */
64973 +void cbk_cache_invalidate(const znode * node /* node to remove from cache */ ,
64974 + reiser4_tree * tree /* tree to remove node from */ )
64975 +{
64976 + cbk_cache_slot *slot;
64977 + cbk_cache *cache;
64978 + int i;
64979 +
64980 + assert("nikita-350", node != NULL);
64981 + assert("nikita-1479", LOCK_CNT_GTZ(rw_locked_tree));
64982 +
64983 + cache = &tree->cbk_cache;
64984 + assert("nikita-2470", cbk_cache_invariant(cache));
64985 +
64986 + write_lock(&(cache->guard));
64987 + for (i = 0, slot = cache->slot; i < cache->nr_slots; ++i, ++slot) {
64988 + if (slot->node == node) {
64989 + list_move_tail(&slot->lru, &cache->lru);
64990 + slot->node = NULL;
64991 + break;
64992 + }
64993 + }
64994 + write_unlock(&(cache->guard));
64995 + assert("nikita-2471", cbk_cache_invariant(cache));
64996 +}
64997 +
64998 +/* add information about "node" to the cbk-cache of the "tree". This
64999 + can actually be an update of an existing slot in the cache. */
65000 +static void cbk_cache_add(const znode *node /* node to add to the cache */ )
65001 +{
65002 + cbk_cache *cache;
65003 + cbk_cache_slot *slot;
65004 + int i;
65005 +
65006 + assert("nikita-352", node != NULL);
65007 +
65008 + cache = &znode_get_tree(node)->cbk_cache;
65009 + assert("nikita-2472", cbk_cache_invariant(cache));
65010 +
65011 + if (cache->nr_slots == 0)
65012 + return;
65013 +
65014 + write_lock(&(cache->guard));
65015 + /* find slot to update/add */
65016 + for (i = 0, slot = cache->slot; i < cache->nr_slots; ++i, ++slot) {
65017 + /* oops, this node is already in a cache */
65018 + if (slot->node == node)
65019 + break;
65020 + }
65021 + /* if all slots are used, reuse least recently used one */
65022 + if (i == cache->nr_slots) {
65023 + slot = list_entry(cache->lru.prev, cbk_cache_slot, lru);
65024 + slot->node = (znode *) node;
65025 + }
65026 + list_move(&slot->lru, &cache->lru);
65027 + write_unlock(&(cache->guard));
65028 + assert("nikita-2473", cbk_cache_invariant(cache));
65029 +}
65030 +
65031 +static int setup_delimiting_keys(cbk_handle * h);
65032 +static lookup_result coord_by_handle(cbk_handle * handle);
65033 +static lookup_result traverse_tree(cbk_handle * h);
65034 +static int cbk_cache_search(cbk_handle * h);
65035 +
65036 +static level_lookup_result cbk_level_lookup(cbk_handle * h);
65037 +static level_lookup_result cbk_node_lookup(cbk_handle * h);
65038 +
65039 +/* helper functions */
65040 +
65041 +static void update_stale_dk(reiser4_tree * tree, znode * node);
65042 +
65043 +/* release parent node during traversal */
65044 +static void put_parent(cbk_handle * h);
65045 +/* check consistency of fields */
65046 +static int sanity_check(cbk_handle * h);
65047 +/* release resources in handle */
65048 +static void hput(cbk_handle * h);
65049 +
65050 +static level_lookup_result search_to_left(cbk_handle * h);
65051 +
65052 +/* pack numerous (numberous I should say) arguments of coord_by_key() into
65053 + * cbk_handle */
65054 +static cbk_handle *cbk_pack(cbk_handle * handle,
65055 + reiser4_tree * tree,
65056 + const reiser4_key * key,
65057 + coord_t * coord,
65058 + lock_handle * active_lh,
65059 + lock_handle * parent_lh,
65060 + znode_lock_mode lock_mode,
65061 + lookup_bias bias,
65062 + tree_level lock_level,
65063 + tree_level stop_level,
65064 + __u32 flags, ra_info_t * info)
65065 +{
65066 + memset(handle, 0, sizeof *handle);
65067 +
65068 + handle->tree = tree;
65069 + handle->key = key;
65070 + handle->lock_mode = lock_mode;
65071 + handle->bias = bias;
65072 + handle->lock_level = lock_level;
65073 + handle->stop_level = stop_level;
65074 + handle->coord = coord;
65075 + /* set flags. See comment in tree.h:cbk_flags */
65076 + handle->flags = flags | CBK_TRUST_DK | CBK_USE_CRABLOCK;
65077 +
65078 + handle->active_lh = active_lh;
65079 + handle->parent_lh = parent_lh;
65080 + handle->ra_info = info;
65081 + return handle;
65082 +}
65083 +
65084 +/* main tree lookup procedure
65085 +
65086 + Check coord cache. If key we are looking for is not found there, call cbk()
65087 + to do real tree traversal.
65088 +
65089 + As we have extents on the twig level, @lock_level and @stop_level can
65090 + be different from LEAF_LEVEL and each other.
65091 +
65092 + Thread cannot keep any reiser4 locks (tree, znode, dk spin-locks, or znode
65093 + long term locks) while calling this.
65094 +*/
65095 +lookup_result coord_by_key(reiser4_tree * tree /* tree to perform search
65096 + * in. Usually this tree is
65097 + * part of file-system
65098 + * super-block */ ,
65099 + const reiser4_key * key /* key to look for */ ,
65100 + coord_t * coord /* where to store found
65101 + * position in a tree. Fields
65102 + * in "coord" are only valid if
65103 + * coord_by_key() returned
65104 + * "CBK_COORD_FOUND" */ ,
65105 + lock_handle * lh, /* resulting lock handle */
65106 + znode_lock_mode lock_mode /* type of lookup we
65107 + * want on node. Pass
65108 + * ZNODE_READ_LOCK here
65109 + * if you only want to
65110 + * read item found and
65111 + * ZNODE_WRITE_LOCK if
65112 + * you want to modify
65113 + * it */ ,
65114 + lookup_bias bias /* what to return if coord
65115 + * with exactly the @key is
65116 + * not in the tree */ ,
65117 + tree_level lock_level /* tree level where to start
65118 + * taking @lock type of
65119 + * locks */ ,
65120 + tree_level stop_level /* tree level to stop. Pass
65121 + * LEAF_LEVEL or TWIG_LEVEL
65122 + * here Item being looked
65123 + * for has to be between
65124 + * @lock_level and
65125 + * @stop_level, inclusive */ ,
65126 + __u32 flags /* search flags */ ,
65127 + ra_info_t *
65128 + info
65129 + /* information about desired tree traversal readahead */
65130 + )
65131 +{
65132 + cbk_handle handle;
65133 + lock_handle parent_lh;
65134 + lookup_result result;
65135 +
65136 + init_lh(lh);
65137 + init_lh(&parent_lh);
65138 +
65139 + assert("nikita-3023", schedulable());
65140 +
65141 + assert("nikita-353", tree != NULL);
65142 + assert("nikita-354", key != NULL);
65143 + assert("nikita-355", coord != NULL);
65144 + assert("nikita-356", (bias == FIND_EXACT)
65145 + || (bias == FIND_MAX_NOT_MORE_THAN));
65146 + assert("nikita-357", stop_level >= LEAF_LEVEL);
65147 + /* no locks can be held during tree traversal */
65148 + assert("nikita-2104", lock_stack_isclean(get_current_lock_stack()));
65149 +
65150 + cbk_pack(&handle,
65151 + tree,
65152 + key,
65153 + coord,
65154 + lh,
65155 + &parent_lh,
65156 + lock_mode, bias, lock_level, stop_level, flags, info);
65157 +
65158 + result = coord_by_handle(&handle);
65159 + assert("nikita-3247",
65160 + ergo(!IS_CBKERR(result), coord->node == lh->node));
65161 + return result;
65162 +}
65163 +
65164 +/* like coord_by_key(), but starts traversal from vroot of @object rather than
65165 + * from tree root. */
65166 +lookup_result
65167 +object_lookup(struct inode * object,
65168 + const reiser4_key * key,
65169 + coord_t * coord,
65170 + lock_handle * lh,
65171 + znode_lock_mode lock_mode,
65172 + lookup_bias bias,
65173 + tree_level lock_level,
65174 + tree_level stop_level, __u32 flags, ra_info_t * info)
65175 +{
65176 + cbk_handle handle;
65177 + lock_handle parent_lh;
65178 + lookup_result result;
65179 +
65180 + init_lh(lh);
65181 + init_lh(&parent_lh);
65182 +
65183 + assert("nikita-3023", schedulable());
65184 +
65185 + assert("nikita-354", key != NULL);
65186 + assert("nikita-355", coord != NULL);
65187 + assert("nikita-356", (bias == FIND_EXACT)
65188 + || (bias == FIND_MAX_NOT_MORE_THAN));
65189 + assert("nikita-357", stop_level >= LEAF_LEVEL);
65190 + /* no locks can be held during tree search by key */
65191 + assert("nikita-2104", lock_stack_isclean(get_current_lock_stack()));
65192 +
65193 + cbk_pack(&handle,
65194 + object != NULL ? tree_by_inode(object) : current_tree,
65195 + key,
65196 + coord,
65197 + lh,
65198 + &parent_lh,
65199 + lock_mode, bias, lock_level, stop_level, flags, info);
65200 + handle.object = object;
65201 +
65202 + result = coord_by_handle(&handle);
65203 + assert("nikita-3247",
65204 + ergo(!IS_CBKERR(result), coord->node == lh->node));
65205 + return result;
65206 +}
65207 +
65208 +/* lookup by cbk_handle. Common part of coord_by_key() and object_lookup(). */
65209 +static lookup_result coord_by_handle(cbk_handle * handle)
65210 +{
65211 + /*
65212 + * first check the cbk_cache (which is a look-aside cache for our tree) and
65213 + * if this fails, start traversal.
65214 + */
65215 + /* first check whether "key" is in cache of recent lookups. */
65216 + if (cbk_cache_search(handle) == 0)
65217 + return handle->result;
65218 + else
65219 + return traverse_tree(handle);
65220 +}
65221 +
65222 +/* Execute actor for each item (or unit, depending on @through_units_p),
65223 + starting from @coord, right-ward, until either:
65224 +
65225 + - end of the tree is reached
65226 + - unformatted node is met
65227 + - error occurred
65228 + - @actor returns 0 or less
65229 +
65230 + Error code, or last actor return value is returned.
65231 +
65232 + This is used by plugin/dir/hashed_dir.c:find_entry() to move through a
65233 + sequence of entries with identical keys and the like.
65234 +*/
65235 +int iterate_tree(reiser4_tree * tree /* tree to scan */ ,
65236 + coord_t * coord /* coord to start from */ ,
65237 + lock_handle * lh /* lock handle to start with and to
65238 + * update along the way */ ,
65239 + tree_iterate_actor_t actor /* function to call on each
65240 + * item/unit */ ,
65241 + void *arg /* argument to pass to @actor */ ,
65242 + znode_lock_mode mode /* lock mode on scanned nodes */ ,
65243 + int through_units_p /* call @actor on each item or on each
65244 + * unit */ )
65245 +{
65246 + int result;
65247 +
65248 + assert("nikita-1143", tree != NULL);
65249 + assert("nikita-1145", coord != NULL);
65250 + assert("nikita-1146", lh != NULL);
65251 + assert("nikita-1147", actor != NULL);
65252 +
65253 + result = zload(coord->node);
65254 + coord_clear_iplug(coord);
65255 + if (result != 0)
65256 + return result;
65257 + if (!coord_is_existing_unit(coord)) {
65258 + zrelse(coord->node);
65259 + return -ENOENT;
65260 + }
65261 + while ((result = actor(tree, coord, lh, arg)) > 0) {
65262 + /* move further */
65263 + if ((through_units_p && coord_next_unit(coord)) ||
65264 + (!through_units_p && coord_next_item(coord))) {
65265 + do {
65266 + lock_handle couple;
65267 +
65268 + /* move to the next node */
65269 + init_lh(&couple);
65270 + result =
65271 + reiser4_get_right_neighbor(&couple,
65272 + coord->node,
65273 + (int)mode,
65274 + GN_CAN_USE_UPPER_LEVELS);
65275 + zrelse(coord->node);
65276 + if (result == 0) {
65277 +
65278 + result = zload(couple.node);
65279 + if (result != 0) {
65280 + done_lh(&couple);
65281 + return result;
65282 + }
65283 +
65284 + coord_init_first_unit(coord,
65285 + couple.node);
65286 + done_lh(lh);
65287 + move_lh(lh, &couple);
65288 + } else
65289 + return result;
65290 + } while (node_is_empty(coord->node));
65291 + }
65292 +
65293 + assert("nikita-1149", coord_is_existing_unit(coord));
65294 + }
65295 + zrelse(coord->node);
65296 + return result;
65297 +}
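A trivial actor, for illustration (hypothetical; it assumes the tree_iterate_actor_t signature matches the call site above):

/* Hypothetical actor: count items; a positive return keeps iterate_tree()
 * walking, a return <= 0 stops it. */
static int count_actor(reiser4_tree * tree, coord_t * coord,
		       lock_handle * lh, void *arg)
{
	int *count = arg;

	++*count;
	return *count < 100 ? 1 : 0;	/* stop after 100 items */
}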
65298 +
65299 +/* return locked uber znode for @tree */
65300 +int get_uber_znode(reiser4_tree * tree, znode_lock_mode mode,
65301 + znode_lock_request pri, lock_handle * lh)
65302 +{
65303 + int result;
65304 +
65305 + result = longterm_lock_znode(lh, tree->uber, mode, pri);
65306 + return result;
65307 +}
65308 +
65309 +/* true if @key is strictly within @node
65310 +
65311 + we are looking for a possibly non-unique key and its item is at the edge of
65312 + @node. Maybe it is in the neighbor.
65313 +*/
65314 +static int znode_contains_key_strict(znode * node /* node to check key
65315 + * against */ ,
65316 + const reiser4_key *
65317 + key /* key to check */ ,
65318 + int isunique)
65319 +{
65320 + int answer;
65321 +
65322 + assert("nikita-1760", node != NULL);
65323 + assert("nikita-1722", key != NULL);
65324 +
65325 + if (keyge(key, &node->rd_key))
65326 + return 0;
65327 +
65328 + answer = keycmp(&node->ld_key, key);
65329 +
65330 + if (isunique)
65331 + return answer != GREATER_THAN;
65332 + else
65333 + return answer == LESS_THAN;
65334 +}
65335 +
65336 +/*
65337 + * Virtual Root (vroot) code.
65338 + *
65339 + * For a given file system object (e.g., a regular file or directory) let's
65340 + * define its "virtual root" as the lowest node in the tree (that is, the
65341 + * furthest from the tree root) such that all body items of said object are
65342 + * located in the sub-tree rooted at this node.
65343 + *
65344 + * Once the vroot of an object is found, all tree lookups for items within
65345 + * the body of this object ("object lookups") can be started from its vroot
65346 + * rather than from the real root. This has the following advantages:
65347 + *
65348 + * 1. the number of nodes traversed during lookup (and, hence, the number
65349 + * of key comparisons made) decreases, and
65350 + *
65351 + * 2. contention on the tree root is decreased. The latter was actually
65352 + * the motivating reason behind vroot, because the spin lock of the root
65353 + * node, which is taken when acquiring a long-term lock on the root node,
65354 + * is the hottest lock in reiser4.
65355 + *
65356 + * How to find the vroot.
65357 + *
65358 + * When the vroot of object F is not yet determined, all object lookups
65359 + * start from the root of the tree. At each tree level during traversal we
65360 + * have a node N such that the key we are looking for (which is a key
65361 + * inside the object's body) is located within N. In the function
65362 + * handle_vroot(), called from cbk_level_lookup(), we check whether N is a
65363 + * possible vroot for F. The check is trivial---if neither the leftmost nor
65364 + * the rightmost item of N belongs to F (and we already have the helpful
65365 + * ->owns_item() method of the object plugin for this), then N is a
65366 + * possible vroot of F. This, of course, relies on the assumption that
65367 + * each object occupies a contiguous range of keys in the tree.
65368 + *
65369 + * Thus, traversing the tree downward and checking each node as we go, we
65370 + * can find the lowest such node, which, by definition, is the vroot.
65371 + *
65372 + * How to track the vroot.
65373 + *
65374 + * Nohow. If the actual vroot changes, the next object lookup will just
65375 + * restart from the actual tree root, refreshing the object's vroot along the way.
65376 + *
65377 + */
65378 +
65379 +/*
65380 + * Check whether @node is possible vroot of @object.
65381 + */
65382 +static void handle_vroot(struct inode *object, znode * node)
65383 +{
65384 + file_plugin *fplug;
65385 + coord_t coord;
65386 +
65387 + fplug = inode_file_plugin(object);
65388 + assert("nikita-3353", fplug != NULL);
65389 + assert("nikita-3354", fplug->owns_item != NULL);
65390 +
65391 + if (unlikely(node_is_empty(node)))
65392 + return;
65393 +
65394 + coord_init_first_unit(&coord, node);
65395 + /*
65396 + * if leftmost item of @node belongs to @object, we cannot be sure
65397 + * that @node is vroot of @object, because, some items of @object are
65398 + * probably in the sub-tree rooted at the left neighbor of @node.
65399 + */
65400 + if (fplug->owns_item(object, &coord))
65401 + return;
65402 + coord_init_last_unit(&coord, node);
65403 + /* mutatis mutandis for the rightmost item */
65404 + if (fplug->owns_item(object, &coord))
65405 + return;
65406 + /* otherwise, @node is possible vroot of @object */
65407 + inode_set_vroot(object, node);
65408 +}
65409 +
65410 +/*
65411 + * helper function used by traverse_tree() to start tree traversal not from
65412 + * the tree root, but from @h->object's vroot, if possible.
65413 + */
65414 +static int prepare_object_lookup(cbk_handle * h)
65415 +{
65416 + znode *vroot;
65417 + int result;
65418 +
65419 + vroot = inode_get_vroot(h->object);
65420 + if (vroot == NULL) {
65421 + /*
65422 + * object doesn't have known vroot, start from real tree root.
65423 + */
65424 + return LOOKUP_CONT;
65425 + }
65426 +
65427 + h->level = znode_get_level(vroot);
65428 + /* take a long-term lock on vroot */
65429 + h->result = longterm_lock_znode(h->active_lh, vroot,
65430 + cbk_lock_mode(h->level, h),
65431 + ZNODE_LOCK_LOPRI);
65432 + result = LOOKUP_REST;
65433 + if (h->result == 0) {
65434 + int isunique;
65435 + int inside;
65436 +
65437 + isunique = h->flags & CBK_UNIQUE;
65438 + /* check that key is inside vroot */
65439 + read_lock_dk(h->tree);
65440 + inside = (znode_contains_key_strict(vroot, h->key, isunique) &&
65441 + !ZF_ISSET(vroot, JNODE_HEARD_BANSHEE));
65442 + read_unlock_dk(h->tree);
65443 + if (inside) {
65444 + h->result = zload(vroot);
65445 + if (h->result == 0) {
65446 + /* search for key in vroot. */
65447 + result = cbk_node_lookup(h);
65448 + zrelse(vroot); /*h->active_lh->node); */
65449 + if (h->active_lh->node != vroot) {
65450 + result = LOOKUP_REST;
65451 + } else if (result == LOOKUP_CONT) {
65452 + move_lh(h->parent_lh, h->active_lh);
65453 + h->flags &= ~CBK_DKSET;
65454 + }
65455 + }
65456 + }
65457 + } else
65458 + /* long-term locking failed. Restart. */
65459 + ;
65460 +
65461 + zput(vroot);
65462 +
65463 + if (IS_CBKERR(h->result) || result == LOOKUP_REST)
65464 + hput(h);
65465 + return result;
65466 +}
65467 +
65468 +/* main function that handles common parts of tree traversal: starting
65469 + (fake znode handling), restarts, error handling, completion */
65470 +static lookup_result traverse_tree(cbk_handle * h /* search handle */ )
65471 +{
65472 + int done;
65473 + int iterations;
65474 + int vroot_used;
65475 +
65476 + assert("nikita-365", h != NULL);
65477 + assert("nikita-366", h->tree != NULL);
65478 + assert("nikita-367", h->key != NULL);
65479 + assert("nikita-368", h->coord != NULL);
65480 + assert("nikita-369", (h->bias == FIND_EXACT)
65481 + || (h->bias == FIND_MAX_NOT_MORE_THAN));
65482 + assert("nikita-370", h->stop_level >= LEAF_LEVEL);
65483 + assert("nikita-2949", !(h->flags & CBK_DKSET));
65484 + assert("zam-355", lock_stack_isclean(get_current_lock_stack()));
65485 +
65486 + done = 0;
65487 + iterations = 0;
65488 + vroot_used = 0;
65489 +
65490 + /* loop for restarts */
65491 + restart:
65492 +
65493 + assert("nikita-3024", schedulable());
65494 +
65495 + h->result = CBK_COORD_FOUND;
65496 + /* connect_znode() needs it */
65497 + h->ld_key = *min_key();
65498 + h->rd_key = *max_key();
65499 + h->flags |= CBK_DKSET;
65500 + h->error = NULL;
65501 +
65502 + if (!vroot_used && h->object != NULL) {
65503 + vroot_used = 1;
65504 + done = prepare_object_lookup(h);
65505 + if (done == LOOKUP_REST) {
65506 + goto restart;
65507 + } else if (done == LOOKUP_DONE)
65508 + return h->result;
65509 + }
65510 + if (h->parent_lh->node == NULL) {
65511 + done =
65512 + get_uber_znode(h->tree, ZNODE_READ_LOCK, ZNODE_LOCK_LOPRI,
65513 + h->parent_lh);
65514 +
65515 + assert("nikita-1637", done != -E_DEADLOCK);
65516 +
65517 + h->block = h->tree->root_block;
65518 + h->level = h->tree->height;
65519 + h->coord->node = h->parent_lh->node;
65520 +
65521 + if (done != 0)
65522 + return done;
65523 + }
65524 +
65525 + /* loop descending a tree */
65526 + while (!done) {
65527 +
65528 + if (unlikely((iterations > REISER4_CBK_ITERATIONS_LIMIT) &&
65529 + IS_POW(iterations))) {
65530 + warning("nikita-1481", "Too many iterations: %i",
65531 + iterations);
65532 + print_key("key", h->key);
65533 + ++iterations;
65534 + } else if (unlikely(iterations > REISER4_MAX_CBK_ITERATIONS)) {
65535 + h->error =
65536 + "reiser-2018: Too many iterations. Tree corrupted, or (less likely) starvation occurring.";
65537 + h->result = RETERR(-EIO);
65538 + break;
65539 + }
65540 + switch (cbk_level_lookup(h)) {
65541 + case LOOKUP_CONT:
65542 + move_lh(h->parent_lh, h->active_lh);
65543 + continue;
65544 + default:
65545 + wrong_return_value("nikita-372", "cbk_level");
65546 + case LOOKUP_DONE:
65547 + done = 1;
65548 + break;
65549 + case LOOKUP_REST:
65550 + hput(h);
65551 + /* deadlock avoidance is the normal case. */
65552 + if (h->result != -E_DEADLOCK)
65553 + ++iterations;
65554 + preempt_point();
65555 + goto restart;
65556 + }
65557 + }
65558 + /* that's all. The rest is error handling */
65559 + if (unlikely(h->error != NULL)) {
65560 + warning("nikita-373", "%s: level: %i, "
65561 + "lock_level: %i, stop_level: %i "
65562 + "lock_mode: %s, bias: %s",
65563 + h->error, h->level, h->lock_level, h->stop_level,
65564 + lock_mode_name(h->lock_mode), bias_name(h->bias));
65565 + reiser4_print_address("block", &h->block);
65566 + print_key("key", h->key);
65567 + print_coord_content("coord", h->coord);
65568 + }
65569 + /* `unlikely' error case */
65570 + if (unlikely(IS_CBKERR(h->result))) {
65571 + /* failure. do cleanup */
65572 + hput(h);
65573 + } else {
65574 + assert("nikita-1605", WITH_DATA_RET
65575 + (h->coord->node, 1,
65576 + ergo((h->result == CBK_COORD_FOUND) &&
65577 + (h->bias == FIND_EXACT) &&
65578 + (!node_is_empty(h->coord->node)),
65579 + coord_is_existing_item(h->coord))));
65580 + }
65581 + return h->result;
65582 +}
65583 +
65584 +/* find delimiting keys of child
65585 +
65586 + Determine left and right delimiting keys for child pointed to by
65587 + @parent_coord.
65588 +
65589 +*/
65590 +static void find_child_delimiting_keys(znode * parent /* parent znode, passed
65591 + * locked */ ,
65592 + const coord_t * parent_coord /* coord where
65593 + * pointer to
65594 + * child is
65595 + * stored */ ,
65596 + reiser4_key * ld /* where to store left
65597 + * delimiting key */ ,
65598 + reiser4_key * rd /* where to store right
65599 + * delimiting key */ )
65600 +{
65601 + coord_t neighbor;
65602 +
65603 + assert("nikita-1484", parent != NULL);
65604 + assert_rw_locked(&(znode_get_tree(parent)->dk_lock));
65605 +
65606 + coord_dup(&neighbor, parent_coord);
65607 +
65608 + if (neighbor.between == AT_UNIT)
65609 + /* imitate item ->lookup() behavior. */
65610 + neighbor.between = AFTER_UNIT;
65611 +
65612 + if (coord_set_to_left(&neighbor) == 0)
65613 + unit_key_by_coord(&neighbor, ld);
65614 + else {
65615 + assert("nikita-14851", 0);
65616 + *ld = *znode_get_ld_key(parent);
65617 + }
65618 +
65619 + coord_dup(&neighbor, parent_coord);
65620 + if (neighbor.between == AT_UNIT)
65621 + neighbor.between = AFTER_UNIT;
65622 + if (coord_set_to_right(&neighbor) == 0)
65623 + unit_key_by_coord(&neighbor, rd);
65624 + else
65625 + *rd = *znode_get_rd_key(parent);
65626 +}
65627 +
65628 +/*
65629 + * setup delimiting keys for a child
65630 + *
65631 + * @parent parent node
65632 + *
65633 + * @coord location in @parent where pointer to @child is
65634 + *
65635 + * @child child node
65636 + */
65637 +int
65638 +set_child_delimiting_keys(znode * parent, const coord_t * coord, znode * child)
65639 +{
65640 + reiser4_tree *tree;
65641 +
65642 + assert("nikita-2952",
65643 + znode_get_level(parent) == znode_get_level(coord->node));
65644 +
65645 + /* fast check without taking dk lock. This is safe, because
65646 + * JNODE_DKSET is never cleared once set. */
65647 + if (!ZF_ISSET(child, JNODE_DKSET)) {
65648 + tree = znode_get_tree(parent);
65649 + write_lock_dk(tree);
65650 + if (likely(!ZF_ISSET(child, JNODE_DKSET))) {
65651 + find_child_delimiting_keys(parent, coord,
65652 + &child->ld_key,
65653 + &child->rd_key);
65654 + ON_DEBUG(child->ld_key_version =
65655 + atomic_inc_return(&delim_key_version);
65656 + child->rd_key_version =
65657 + atomic_inc_return(&delim_key_version););
65658 + ZF_SET(child, JNODE_DKSET);
65659 + }
65660 + write_unlock_dk(tree);
65661 + return 1;
65662 + }
65663 + return 0;
65664 +}
65665 +
65666 +/* Perform tree lookup at one level. This is called from traverse_tree(),
65667 + the function that drives lookup through the tree and calls cbk_node_lookup()
65668 + to perform lookup within one node.
65669 +
65670 + See comments in a code.
65671 +*/
65672 +static level_lookup_result cbk_level_lookup(cbk_handle * h /* search handle */ )
65673 +{
65674 + int ret;
65675 + int setdk;
65676 + int ldkeyset = 0;
65677 + reiser4_key ldkey;
65678 + reiser4_key key;
65679 + znode *active;
65680 +
65681 + assert("nikita-3025", schedulable());
65682 +
65683 + /* acquire reference to @active node */
65684 + active =
65685 + zget(h->tree, &h->block, h->parent_lh->node, h->level, get_gfp_mask());
65686 +
65687 + if (IS_ERR(active)) {
65688 + h->result = PTR_ERR(active);
65689 + return LOOKUP_DONE;
65690 + }
65691 +
65692 + /* lock @active */
65693 + h->result = longterm_lock_znode(h->active_lh,
65694 + active,
65695 + cbk_lock_mode(h->level, h),
65696 + ZNODE_LOCK_LOPRI);
65697 + /* longterm_lock_znode() acquires additional reference to znode (which
65698 + will be later released by longterm_unlock_znode()). Release
65699 + reference acquired by zget().
65700 + */
65701 + zput(active);
65702 + if (unlikely(h->result != 0))
65703 + goto fail_or_restart;
65704 +
65705 + setdk = 0;
65706 + /* if @active is accessed for the first time, setup delimiting keys on
65707 + it. Delimiting keys are taken from the parent node. See
65708 + setup_delimiting_keys() for details.
65709 + */
65710 + if (h->flags & CBK_DKSET) {
65711 + setdk = setup_delimiting_keys(h);
65712 + h->flags &= ~CBK_DKSET;
65713 + } else {
65714 + znode *parent;
65715 +
65716 + parent = h->parent_lh->node;
65717 + h->result = zload(parent);
65718 + if (unlikely(h->result != 0))
65719 + goto fail_or_restart;
65720 +
65721 + if (!ZF_ISSET(active, JNODE_DKSET))
65722 + setdk = set_child_delimiting_keys(parent,
65723 + h->coord, active);
65724 + else {
65725 + read_lock_dk(h->tree);
65726 + find_child_delimiting_keys(parent, h->coord, &ldkey,
65727 + &key);
65728 + read_unlock_dk(h->tree);
65729 + ldkeyset = 1;
65730 + }
65731 + zrelse(parent);
65732 + }
65733 +
65734 + /* this is an ugly kludge. Reminder: this is necessary because the
65735 + ->lookup() method returns a coord with the ->between field possibly set
65736 + to something different from AT_UNIT.
65737 + */
65738 + h->coord->between = AT_UNIT;
65739 +
65740 + if (znode_just_created(active) && (h->coord->node != NULL)) {
65741 + write_lock_tree(h->tree);
65742 + /* if we are going to load znode right now, setup
65743 + ->in_parent: coord where pointer to this node is stored in
65744 + parent.
65745 + */
65746 + coord_to_parent_coord(h->coord, &active->in_parent);
65747 + write_unlock_tree(h->tree);
65748 + }
65749 +
65750 + /* check connectedness without holding tree lock---false negatives
65751 + * will be re-checked by connect_znode(), and false positives are
65752 + * impossible---@active cannot suddenly turn into unconnected
65753 + * state. */
65754 + if (!znode_is_connected(active)) {
65755 + h->result = connect_znode(h->coord, active);
65756 + if (unlikely(h->result != 0)) {
65757 + put_parent(h);
65758 + goto fail_or_restart;
65759 + }
65760 + }
65761 +
65762 + jload_prefetch(ZJNODE(active));
65763 +
65764 + if (setdk)
65765 + update_stale_dk(h->tree, active);
65766 +
65767 +	/* put_parent() cannot be called earlier, because connect_znode()
65768 +	   assumes the parent node is referenced */
65769 + put_parent(h);
65770 +
65771 + if ((!znode_contains_key_lock(active, h->key) &&
65772 + (h->flags & CBK_TRUST_DK))
65773 + || ZF_ISSET(active, JNODE_HEARD_BANSHEE)) {
65774 +		/* 1. the key was moved out of this node while this thread was
65775 +		   waiting for the lock. Restart. A more elaborate solution would
65776 +		   be to determine where the key moved (to the left, or to the
65777 +		   right) and to follow it through sibling pointers.
65778 +
65779 +		   2. or, the node itself is about to be removed from the
65780 +		   tree. Release the lock and restart.
65781 +		 */
65782 + h->result = -E_REPEAT;
65783 + }
65784 + if (h->result == -E_REPEAT)
65785 + return LOOKUP_REST;
65786 +
65787 + h->result = zload_ra(active, h->ra_info);
65788 + if (h->result) {
65789 + return LOOKUP_DONE;
65790 + }
65791 +
65792 + /* sanity checks */
65793 + if (sanity_check(h)) {
65794 + zrelse(active);
65795 + return LOOKUP_DONE;
65796 + }
65797 +
65798 + /* check that key of leftmost item in the @active is the same as in
65799 + * its parent */
65800 + if (ldkeyset && !node_is_empty(active) &&
65801 + !keyeq(leftmost_key_in_node(active, &key), &ldkey)) {
65802 + warning("vs-3533", "Keys are inconsistent. Fsck?");
65803 + print_key("inparent", &ldkey);
65804 + print_key("inchild", &key);
65805 + h->result = RETERR(-EIO);
65806 + zrelse(active);
65807 + return LOOKUP_DONE;
65808 + }
65809 +
65810 + if (h->object != NULL)
65811 + handle_vroot(h->object, active);
65812 +
65813 + ret = cbk_node_lookup(h);
65814 +
65815 + /* h->active_lh->node might change, but active is yet to be zrelsed */
65816 + zrelse(active);
65817 +
65818 + return ret;
65819 +
65820 + fail_or_restart:
65821 + if (h->result == -E_DEADLOCK)
65822 + return LOOKUP_REST;
65823 + return LOOKUP_DONE;
65824 +}
65825 +
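+/* A rough sketch of the per-level protocol implemented above (descriptive
+   only): zget() + longterm_lock_znode() pin and lock the child; delimiting
+   keys are installed from the parent when needed; the parent is released
+   only after connect_znode() has run; stale delimiting keys or a node that
+   "heard banshee" result in -E_REPEAT and a restart of the whole traversal. */
+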
65826 +#if REISER4_DEBUG
65827 +/* check left and right delimiting keys of a znode */
65828 +void check_dkeys(znode * node)
65829 +{
65830 + znode *left;
65831 + znode *right;
65832 +
65833 + read_lock_tree(current_tree);
65834 + read_lock_dk(current_tree);
65835 +
65836 + assert("vs-1710", znode_is_any_locked(node));
65837 + assert("vs-1197",
65838 + !keygt(znode_get_ld_key(node), znode_get_rd_key(node)));
65839 +
65840 + left = node->left;
65841 + right = node->right;
65842 +
65843 + if (ZF_ISSET(node, JNODE_LEFT_CONNECTED) && ZF_ISSET(node, JNODE_DKSET)
65844 + && left != NULL && ZF_ISSET(left, JNODE_DKSET))
65845 +		/* check the left neighbor. Note that the left neighbor is not
65846 +		   locked, so its delimiting keys may be stale */
65847 + assert("vs-1198",
65848 + (keyeq(znode_get_rd_key(left), znode_get_ld_key(node))
65849 + || ZF_ISSET(left, JNODE_HEARD_BANSHEE)));
65850 +
65851 + if (ZF_ISSET(node, JNODE_RIGHT_CONNECTED) && ZF_ISSET(node, JNODE_DKSET)
65852 + && right != NULL && ZF_ISSET(right, JNODE_DKSET))
65853 +		/* check the right neighbor. Note that the right neighbor is
65854 +		   not locked, so its delimiting keys may be stale */
65855 + assert("vs-1199",
65856 + (keyeq(znode_get_rd_key(node), znode_get_ld_key(right))
65857 + || ZF_ISSET(right, JNODE_HEARD_BANSHEE)));
65858 +
65859 + read_unlock_dk(current_tree);
65860 + read_unlock_tree(current_tree);
65861 +}
65862 +#endif
65863 +
65864 +/* true if @key is left delimiting key of @node */
65865 +static int key_is_ld(znode * node, const reiser4_key * key)
65866 +{
65867 + int ld;
65868 +
65869 + assert("nikita-1716", node != NULL);
65870 + assert("nikita-1758", key != NULL);
65871 +
65872 + read_lock_dk(znode_get_tree(node));
65873 + assert("nikita-1759", znode_contains_key(node, key));
65874 + ld = keyeq(znode_get_ld_key(node), key);
65875 + read_unlock_dk(znode_get_tree(node));
65876 + return ld;
65877 +}
65878 +
65879 +/* Process one node during tree traversal.
65880 +
65881 + This is called by cbk_level_lookup(). */
65882 +static level_lookup_result cbk_node_lookup(cbk_handle * h /* search handle */ )
65883 +{
65884 + /* node plugin of @active */
65885 + node_plugin *nplug;
65886 + /* item plugin of item that was found */
65887 + item_plugin *iplug;
65888 + /* search bias */
65889 + lookup_bias node_bias;
65890 + /* node we are operating upon */
65891 + znode *active;
65892 + /* tree we are searching in */
65893 + reiser4_tree *tree;
65894 + /* result */
65895 + int result;
65896 +
65897 + assert("nikita-379", h != NULL);
65898 +
65899 + active = h->active_lh->node;
65900 + tree = h->tree;
65901 +
65902 + nplug = active->nplug;
65903 + assert("nikita-380", nplug != NULL);
65904 +
65905 + ON_DEBUG(check_dkeys(active));
65906 +
65907 + /* return item from "active" node with maximal key not greater than
65908 + "key" */
65909 + node_bias = h->bias;
65910 + result = nplug->lookup(active, h->key, node_bias, h->coord);
65911 + if (unlikely(result != NS_FOUND && result != NS_NOT_FOUND)) {
65912 + /* error occurred */
65913 + h->result = result;
65914 + return LOOKUP_DONE;
65915 + }
65916 + if (h->level == h->stop_level) {
65917 + /* welcome to the stop level */
65918 + assert("nikita-381", h->coord->node == active);
65919 + if (result == NS_FOUND) {
65920 + /* success of tree lookup */
65921 + if (!(h->flags & CBK_UNIQUE)
65922 + && key_is_ld(active, h->key)) {
65923 + return search_to_left(h);
65924 + } else
65925 + h->result = CBK_COORD_FOUND;
65926 + } else {
65927 + h->result = CBK_COORD_NOTFOUND;
65928 + }
65929 + if (!(h->flags & CBK_IN_CACHE))
65930 + cbk_cache_add(active);
65931 + return LOOKUP_DONE;
65932 + }
65933 +
65934 + if (h->level > TWIG_LEVEL && result == NS_NOT_FOUND) {
65935 + h->error = "not found on internal node";
65936 + h->result = result;
65937 + return LOOKUP_DONE;
65938 + }
65939 +
65940 + assert("vs-361", h->level > h->stop_level);
65941 +
65942 + if (handle_eottl(h, &result)) {
65943 + assert("vs-1674", (result == LOOKUP_DONE ||
65944 + result == LOOKUP_REST));
65945 + return result;
65946 + }
65947 +
65948 + /* go down to next level */
65949 + check_me("vs-12", zload(h->coord->node) == 0);
65950 + assert("nikita-2116", item_is_internal(h->coord));
65951 + iplug = item_plugin_by_coord(h->coord);
65952 + iplug->s.internal.down_link(h->coord, h->key, &h->block);
65953 + zrelse(h->coord->node);
65954 + --h->level;
65955 + return LOOKUP_CONT; /* continue */
65956 +}
65957 +
65958 +/* scan cbk_cache slots looking for a match for @h */
65959 +static int cbk_cache_scan_slots(cbk_handle * h /* cbk handle */ )
65960 +{
65961 + level_lookup_result llr;
65962 + znode *node;
65963 + reiser4_tree *tree;
65964 + cbk_cache_slot *slot;
65965 + cbk_cache *cache;
65966 + tree_level level;
65967 + int isunique;
65968 + const reiser4_key *key;
65969 + int result;
65970 +
65971 + assert("nikita-1317", h != NULL);
65972 + assert("nikita-1315", h->tree != NULL);
65973 + assert("nikita-1316", h->key != NULL);
65974 +
65975 + tree = h->tree;
65976 + cache = &tree->cbk_cache;
65977 + if (cache->nr_slots == 0)
65978 + /* size of cbk cache was set to 0 by mount time option. */
65979 + return RETERR(-ENOENT);
65980 +
65981 + assert("nikita-2474", cbk_cache_invariant(cache));
65982 + node = NULL; /* to keep gcc happy */
65983 + level = h->level;
65984 + key = h->key;
65985 + isunique = h->flags & CBK_UNIQUE;
65986 + result = RETERR(-ENOENT);
65987 +
65988 +	/*
65989 +	 * this is a time-critical function and dragons have, hence, been
65990 +	 * settled here.
65991 +	 *
65992 +	 * The loop below scans cbk cache slots trying to find a matching node
65993 +	 * with a suitable range of delimiting keys, located at h->level.
65994 +	 *
65995 +	 * The scan is done under the cbk cache spin lock that protects
65996 +	 * slot->node pointers. If a suitable node is found we want to pin it
65997 +	 * in memory. But slot->node can point to a node with x_count 0
65998 +	 * (unreferenced). Such a node can be recycled at any moment, or can
65999 +	 * already be in the process of being recycled (within jput()).
66000 +	 *
66001 +	 * As we found the node in the cbk cache, it means that jput() hasn't
66002 +	 * yet called cbk_cache_invalidate().
66003 +	 *
66004 +	 * We acquire a reference to the node without holding the tree lock,
66005 +	 * and later check the node's RIP bit. This avoids races with jput().
66006 +	 */
66007 +
66008 + rcu_read_lock();
66009 + read_lock(&((cbk_cache *)cache)->guard);
66010 +
66011 + slot = list_entry(cache->lru.next, cbk_cache_slot, lru);
66012 + slot = list_entry(slot->lru.prev, cbk_cache_slot, lru);
66013 +	BUG_ON(&slot->lru != &cache->lru); /* slot->lru now aliases the list head */
66014 + while (1) {
66015 +
66016 + slot = list_entry(slot->lru.next, cbk_cache_slot, lru);
66017 +
66018 + if (&cache->lru != &slot->lru)
66019 + node = slot->node;
66020 + else
66021 + node = NULL;
66022 +
66023 + if (unlikely(node == NULL))
66024 + break;
66025 +
66026 + /*
66027 + * this is (hopefully) the only place in the code where we are
66028 + * working with delimiting keys without holding dk lock. This
66029 + * is fine here, because this is only "guess" anyway---keys
66030 + * are rechecked under dk lock below.
66031 + */
66032 + if (znode_get_level(node) == level &&
66033 + /* min_key < key < max_key */
66034 + znode_contains_key_strict(node, key, isunique)) {
66035 + zref(node);
66036 + result = 0;
66037 + spin_lock_prefetch(&tree->tree_lock);
66038 + break;
66039 + }
66040 + }
66041 + read_unlock(&((cbk_cache *)cache)->guard);
66042 +
66043 + assert("nikita-2475", cbk_cache_invariant(cache));
66044 +
66045 + if (unlikely(result == 0 && ZF_ISSET(node, JNODE_RIP)))
66046 + result = -ENOENT;
66047 +
66048 + rcu_read_unlock();
66049 +
66050 + if (result != 0) {
66051 + h->result = CBK_COORD_NOTFOUND;
66052 + return RETERR(-ENOENT);
66053 + }
66054 +
66055 + result =
66056 + longterm_lock_znode(h->active_lh, node, cbk_lock_mode(level, h),
66057 + ZNODE_LOCK_LOPRI);
66058 + zput(node);
66059 + if (result != 0)
66060 + return result;
66061 + result = zload(node);
66062 + if (result != 0)
66063 + return result;
66064 +
66065 + /* recheck keys */
66066 + read_lock_dk(tree);
66067 + result = (znode_contains_key_strict(node, key, isunique) &&
66068 + !ZF_ISSET(node, JNODE_HEARD_BANSHEE));
66069 + read_unlock_dk(tree);
66070 + if (result) {
66071 + /* do lookup inside node */
66072 + llr = cbk_node_lookup(h);
66073 + /* if cbk_node_lookup() wandered to another node (due to eottl
66074 + or non-unique keys), adjust @node */
66075 + /*node = h->active_lh->node; */
66076 +
66077 + if (llr != LOOKUP_DONE) {
66078 + /* restart or continue on the next level */
66079 + result = RETERR(-ENOENT);
66080 + } else if (IS_CBKERR(h->result))
66081 + /* io or oom */
66082 + result = RETERR(-ENOENT);
66083 + else {
66084 + /* good. Either item found or definitely not found. */
66085 + result = 0;
66086 +
66087 + write_lock(&(cache->guard));
66088 + if (slot->node == h->active_lh->node /*node */ ) {
66089 + /* if this node is still in cbk cache---move
66090 + its slot to the head of the LRU list. */
66091 + list_move(&slot->lru, &cache->lru);
66092 + }
66093 + write_unlock(&(cache->guard));
66094 + }
66095 + } else {
66096 +		/* race. While this thread was waiting for the lock, the node
66097 +		   was rebalanced and the item we are looking for was shifted
66098 +		   out of it (if it ever was here).
66099 +
66100 +		   Continuing the scan is almost hopeless: the node the key
66101 +		   range was moved to is almost certainly at the beginning of
66102 +		   the LRU list at this time, because it is hot, but restarting
66103 +		   the scan from the very beginning is complex. Just return, so
66104 +		   that cbk() will be performed. This is not that important,
66105 +		   because such races should be rare. Are they?
66106 +		 */
66107 + result = RETERR(-ENOENT); /* -ERAUGHT */
66108 + }
66109 + zrelse(node);
66110 + assert("nikita-2476", cbk_cache_invariant(cache));
66111 + return result;
66112 +}
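+
+/* The scan above follows a common RCU lookup pattern (a sketch, not a
+   verbatim copy):
+
+       rcu_read_lock();
+       ... scan, zref() a candidate node ...
+       if (ZF_ISSET(node, JNODE_RIP))  // node is being freed
+               treat it as not found;
+       rcu_read_unlock();
+
+   Re-checking the RIP bit after taking the reference is what closes the race
+   with jput(), as the comment inside the function explains. */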
66113 +
66114 +/* look for an item with the given key in the coord cache
66115 +
66116 +   This function, called by coord_by_key(), scans the "coord cache"
66117 +   (&cbk_cache), which is a small LRU list of recently accessed znodes. For
66118 +   each znode in this list, it checks whether the key we are looking for fits
66119 +   into the key range covered by that node. If so, and in addition the node
66120 +   lies at an allowed level (this is to handle extents on a twig level), the
66121 +   node is locked, and a lookup inside it is performed.
66122 +
66123 +   We need a measurement of the cost of this cache search compared to the
66124 +   cost of coord_by_key.
66125 +
66126 +*/
66127 +static int cbk_cache_search(cbk_handle * h /* cbk handle */ )
66128 +{
66129 + int result = 0;
66130 + tree_level level;
66131 +
66132 + /* add CBK_IN_CACHE to the handle flags. This means that
66133 + * cbk_node_lookup() assumes that cbk_cache is scanned and would add
66134 + * found node to the cache. */
66135 + h->flags |= CBK_IN_CACHE;
66136 + for (level = h->stop_level; level <= h->lock_level; ++level) {
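+		/* The cache is probed bottom-up: first the stop level, then
+		   each level up to the lock level; as the comment above notes,
+		   this is what lets extents on the twig level be found. */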
66137 + h->level = level;
66138 + result = cbk_cache_scan_slots(h);
66139 + if (result != 0) {
66140 + done_lh(h->active_lh);
66141 + done_lh(h->parent_lh);
66142 + } else {
66143 + assert("nikita-1319", !IS_CBKERR(h->result));
66144 + break;
66145 + }
66146 + }
66147 + h->flags &= ~CBK_IN_CACHE;
66148 + return result;
66149 +}
66150 +
66151 +/* type of lock we want to obtain during tree traversal. On stop level
66152 + we want type of lock user asked for, on upper levels: read lock. */
66153 +znode_lock_mode cbk_lock_mode(tree_level level, cbk_handle * h)
66154 +{
66155 + assert("nikita-382", h != NULL);
66156 +
66157 + return (level <= h->lock_level) ? h->lock_mode : ZNODE_READ_LOCK;
66158 +}
66159 +
66160 +/* update outdated delimiting keys */
66161 +static void stale_dk(reiser4_tree * tree, znode * node)
66162 +{
66163 + znode *right;
66164 +
66165 + read_lock_tree(tree);
66166 + write_lock_dk(tree);
66167 + right = node->right;
66168 +
66169 + if (ZF_ISSET(node, JNODE_RIGHT_CONNECTED) &&
66170 + right && ZF_ISSET(right, JNODE_DKSET) &&
66171 + !keyeq(znode_get_rd_key(node), znode_get_ld_key(right)))
66172 + znode_set_rd_key(node, znode_get_ld_key(right));
66173 +
66174 + write_unlock_dk(tree);
66175 + read_unlock_tree(tree);
66176 +}
66177 +
66178 +/* check for possibly outdated delimiting keys, and update them if
66179 + * necessary. */
66180 +static void update_stale_dk(reiser4_tree * tree, znode * node)
66181 +{
66182 + znode *right;
66183 + reiser4_key rd;
66184 +
66185 + read_lock_tree(tree);
66186 + read_lock_dk(tree);
66187 + rd = *znode_get_rd_key(node);
66188 + right = node->right;
66189 + if (unlikely(ZF_ISSET(node, JNODE_RIGHT_CONNECTED) &&
66190 + right && ZF_ISSET(right, JNODE_DKSET) &&
66191 + !keyeq(&rd, znode_get_ld_key(right)))) {
66192 + assert("nikita-38211", ZF_ISSET(node, JNODE_DKSET));
66193 + read_unlock_dk(tree);
66194 + read_unlock_tree(tree);
66195 + stale_dk(tree, node);
66196 + return;
66197 + }
66198 + read_unlock_dk(tree);
66199 + read_unlock_tree(tree);
66200 +}
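+
+/* Note on the pair above (descriptive only): update_stale_dk() performs a
+   cheap check under the read dk lock and, only when a mismatch is seen, drops
+   its locks and calls stale_dk(), which re-takes the dk lock for writing and
+   re-checks the keys before updating. The condition is thus verified twice:
+   once optimistically, once authoritatively. */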
66201 +
66202 +/*
66203 + * handle searches for a non-unique key.
66204 + *
66205 + * Suppose that we are looking for an item with possibly non-unique key 100.
66206 + *
66207 + * The root node contains two pointers: one to a node with left delimiting
66208 + * key 0, and another to a node with left delimiting key 100. The item we are
66209 + * interested in may well be in the sub-tree rooted at the first pointer.
66210 + *
66211 + * To handle this, search_to_left() is called when the search reaches the
66212 + * stop level. It checks whether it is _possible_ that the item we are
66213 + * looking for is in the left neighbor (by comparing delimiting keys), and
66214 + * if so, tries to lock the left neighbor (this is a low priority lock, so
66215 + * it can deadlock; the tree traversal is simply restarted if it did) and
66216 + * then checks whether the left neighbor actually contains items with our
66217 + * key.
66218 + *
66219 + * Note that this is done on the stop level only. It is possible to try such
66220 + * a left-check on each level, but as duplicate keys are supposed to be rare
66221 + * (it is very unlikely that more than one node is completely filled with
66222 + * items with duplicate keys), it is cheaper to scan left on the stop level
66223 + * once.
66224 + */
66224 +static level_lookup_result search_to_left(cbk_handle * h /* search handle */ )
66225 +{
66226 + level_lookup_result result;
66227 + coord_t *coord;
66228 + znode *node;
66229 + znode *neighbor;
66230 +
66231 + lock_handle lh;
66232 +
66233 + assert("nikita-1761", h != NULL);
66234 + assert("nikita-1762", h->level == h->stop_level);
66235 +
66236 + init_lh(&lh);
66237 + coord = h->coord;
66238 + node = h->active_lh->node;
66239 + assert("nikita-1763", coord_is_leftmost_unit(coord));
66240 +
66241 + h->result =
66242 + reiser4_get_left_neighbor(&lh, node, (int)h->lock_mode,
66243 + GN_CAN_USE_UPPER_LEVELS);
66244 + neighbor = NULL;
66245 + switch (h->result) {
66246 + case -E_DEADLOCK:
66247 + result = LOOKUP_REST;
66248 + break;
66249 + case 0:{
66250 + node_plugin *nplug;
66251 + coord_t crd;
66252 + lookup_bias bias;
66253 +
66254 + neighbor = lh.node;
66255 + h->result = zload(neighbor);
66256 + if (h->result != 0) {
66257 + result = LOOKUP_DONE;
66258 + break;
66259 + }
66260 +
66261 + nplug = neighbor->nplug;
66262 +
66263 + coord_init_zero(&crd);
66264 + bias = h->bias;
66265 + h->bias = FIND_EXACT;
66266 + h->result =
66267 + nplug->lookup(neighbor, h->key, h->bias, &crd);
66268 + h->bias = bias;
66269 +
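+			/* note: the case and default labels below sit inside
+			   the if-branch; like Duff's device, the enclosing
+			   switch can jump into the middle of this block for
+			   -E_NO_NEIGHBOR and for any other error code. */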
66270 + if (h->result == NS_NOT_FOUND) {
66271 + case -E_NO_NEIGHBOR:
66272 + h->result = CBK_COORD_FOUND;
66273 + if (!(h->flags & CBK_IN_CACHE))
66274 + cbk_cache_add(node);
66275 + default: /* some other error */
66276 + result = LOOKUP_DONE;
66277 + } else if (h->result == NS_FOUND) {
66278 + read_lock_dk(znode_get_tree(neighbor));
66279 + h->rd_key = *znode_get_ld_key(node);
66280 + leftmost_key_in_node(neighbor, &h->ld_key);
66281 + read_unlock_dk(znode_get_tree(neighbor));
66282 + h->flags |= CBK_DKSET;
66283 +
66284 + h->block = *znode_get_block(neighbor);
66285 + /* clear coord -> node so that cbk_level_lookup()
66286 + wouldn't overwrite parent hint in neighbor.
66287 +
66288 + Parent hint was set up by
66289 + reiser4_get_left_neighbor()
66290 + */
66291 + /* FIXME: why do we have to spinlock here? */
66292 + write_lock_tree(znode_get_tree(neighbor));
66293 + h->coord->node = NULL;
66294 + write_unlock_tree(znode_get_tree(neighbor));
66295 + result = LOOKUP_CONT;
66296 + } else {
66297 + result = LOOKUP_DONE;
66298 + }
66299 + if (neighbor != NULL)
66300 + zrelse(neighbor);
66301 + }
66302 + }
66303 + done_lh(&lh);
66304 + return result;
66305 +}
66306 +
66307 +/* debugging aid: return symbolic name of search bias */
66308 +static const char *bias_name(lookup_bias bias /* bias to get name of */ )
66309 +{
66310 + if (bias == FIND_EXACT)
66311 + return "exact";
66312 + else if (bias == FIND_MAX_NOT_MORE_THAN)
66313 + return "left-slant";
66314 +/* else if( bias == RIGHT_SLANT_BIAS ) */
66315 +/* return "right-bias"; */
66316 + else {
66317 + static char buf[30];
66318 +
66319 + sprintf(buf, "unknown: %i", bias);
66320 + return buf;
66321 + }
66322 +}
66323 +
66324 +#if REISER4_DEBUG
66325 +/* debugging aid: print human readable information about @p */
66326 +void print_coord_content(const char *prefix /* prefix to print */ ,
66327 + coord_t * p /* coord to print */ )
66328 +{
66329 + reiser4_key key;
66330 +
66331 + if (p == NULL) {
66332 + printk("%s: null\n", prefix);
66333 + return;
66334 + }
66335 + if ((p->node != NULL) && znode_is_loaded(p->node)
66336 + && coord_is_existing_item(p))
66337 + printk("%s: data: %p, length: %i\n", prefix,
66338 + item_body_by_coord(p), item_length_by_coord(p));
66339 +	if (p->node != NULL && znode_is_loaded(p->node)) {
66340 + item_key_by_coord(p, &key);
66341 + print_key(prefix, &key);
66342 + }
66343 +}
66344 +
66345 +/* debugging aid: print human readable information about @block */
66346 +void reiser4_print_address(const char *prefix /* prefix to print */ ,
66347 + const reiser4_block_nr * block /* block number to print */ )
66348 +{
66349 + printk("%s: %s\n", prefix, sprint_address(block));
66350 +}
66351 +#endif
66352 +
66353 +/* return string containing human readable representation of @block */
66354 +char *sprint_address(const reiser4_block_nr *
66355 + block /* block number to print */ )
66356 +{
66357 + static char address[30];
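+	/* note: the static buffer makes this non-reentrant; like bias_name()
+	   above, it is meant for debugging output only */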
66358 +
66359 + if (block == NULL)
66360 + sprintf(address, "null");
66361 + else if (blocknr_is_fake(block))
66362 + sprintf(address, "%llx", (unsigned long long)(*block));
66363 + else
66364 + sprintf(address, "%llu", (unsigned long long)(*block));
66365 + return address;
66366 +}
66367 +
66368 +/* release parent node during traversal */
66369 +static void put_parent(cbk_handle * h /* search handle */ )
66370 +{
66371 + assert("nikita-383", h != NULL);
66372 + if (h->parent_lh->node != NULL) {
66373 + longterm_unlock_znode(h->parent_lh);
66374 + }
66375 +}
66376 +
66377 +/* helper function used by coord_by_key(): release the lock handles (parent
66378 +   and active) stored in the handle. */
66379 +static void hput(cbk_handle * h /* search handle */ )
66380 +{
66381 + assert("nikita-385", h != NULL);
66382 + done_lh(h->parent_lh);
66383 + done_lh(h->active_lh);
66384 +}
66385 +
66386 +/* Helper function used by cbk(): update the delimiting keys of the child node
66387 +   (stored in h->active_lh->node) using keys taken from the parent level. */
66388 +static int setup_delimiting_keys(cbk_handle * h /* search handle */ )
66389 +{
66390 + znode *active;
66391 + reiser4_tree *tree;
66392 +
66393 + assert("nikita-1088", h != NULL);
66394 +
66395 + active = h->active_lh->node;
66396 +
66397 + /* fast check without taking dk lock. This is safe, because
66398 + * JNODE_DKSET is never cleared once set. */
66399 + if (!ZF_ISSET(active, JNODE_DKSET)) {
66400 + tree = znode_get_tree(active);
66401 + write_lock_dk(tree);
66402 + if (!ZF_ISSET(active, JNODE_DKSET)) {
66403 + znode_set_ld_key(active, &h->ld_key);
66404 + znode_set_rd_key(active, &h->rd_key);
66405 + ZF_SET(active, JNODE_DKSET);
66406 + }
66407 + write_unlock_dk(tree);
66408 + return 1;
66409 + }
66410 + return 0;
66411 +}
66412 +
66413 +/* true if @block makes sense for the @tree. Used to detect corrupted node
66414 + * pointers */
66415 +static int
66416 +block_nr_is_correct(reiser4_block_nr * block /* block number to check */ ,
66417 + reiser4_tree * tree /* tree to check against */ )
66418 +{
66419 + assert("nikita-757", block != NULL);
66420 + assert("nikita-758", tree != NULL);
66421 +
66422 + /* check to see if it exceeds the size of the device. */
66423 + return reiser4_blocknr_is_sane_for(tree->super, block);
66424 +}
66425 +
66426 +/* check consistency of fields */
66427 +static int sanity_check(cbk_handle * h /* search handle */ )
66428 +{
66429 + assert("nikita-384", h != NULL);
66430 +
66431 + if (h->level < h->stop_level) {
66432 + h->error = "Buried under leaves";
66433 + h->result = RETERR(-EIO);
66434 + return LOOKUP_DONE;
66435 + } else if (!block_nr_is_correct(&h->block, h->tree)) {
66436 + h->error = "bad block number";
66437 + h->result = RETERR(-EIO);
66438 + return LOOKUP_DONE;
66439 + } else
66440 + return 0;
66441 +}
66442 +
66443 +/* Make Linus happy.
66444 + Local variables:
66445 + c-indentation-style: "K&R"
66446 + mode-name: "LC"
66447 + c-basic-offset: 8
66448 + tab-width: 8
66449 + fill-column: 120
66450 + scroll-step: 1
66451 + End:
66452 +*/
66453 Index: linux-2.6.16/fs/reiser4/status_flags.c
66454 ===================================================================
66455 --- /dev/null
66456 +++ linux-2.6.16/fs/reiser4/status_flags.c
66457 @@ -0,0 +1,176 @@
66458 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
66459 + * reiser4/README */
66460 +
66461 +/* Functions that deal with the reiser4 status block: query the status and update it if needed */
66462 +
66463 +#include <linux/bio.h>
66464 +#include <linux/highmem.h>
66465 +#include <linux/fs.h>
66466 +#include <linux/blkdev.h>
66467 +#include "debug.h"
66468 +#include "dformat.h"
66469 +#include "status_flags.h"
66470 +#include "super.h"
66471 +
66472 +/* This is our end I/O handler that marks the page uptodate if the I/O was
66473 +   successful. It also unconditionally unlocks the page, so that we can see
66474 +   that the I/O is done. We do not free the bio, because we hope to reuse it. */
66475 +static int reiser4_status_endio(struct bio *bio, unsigned int bytes_done,
66476 + int err)
66477 +{
66478 + if (bio->bi_size)
66479 + return 1;
66480 +
66481 + if (test_bit(BIO_UPTODATE, &bio->bi_flags)) {
66482 + SetPageUptodate(bio->bi_io_vec->bv_page);
66483 + } else {
66484 + ClearPageUptodate(bio->bi_io_vec->bv_page);
66485 + SetPageError(bio->bi_io_vec->bv_page);
66486 + }
66487 + unlock_page(bio->bi_io_vec->bv_page);
66488 + return 0;
66489 +}
66490 +
66491 +/* Initialise status block handling. This is expected to be called from the
66492 +   disk format code. The block parameter is where the status block lives. */
66493 +int reiser4_status_init(reiser4_block_nr block)
66494 +{
66495 + struct super_block *sb = reiser4_get_current_sb();
66496 + struct reiser4_status *statuspage;
66497 + struct bio *bio;
66498 + struct page *page;
66499 +
66500 +
66501 + get_super_private(sb)->status_page = NULL;
66502 + get_super_private(sb)->status_bio = NULL;
66503 +
66504 + page = alloc_pages(GFP_KERNEL, 0);
66505 + if (!page)
66506 + return -ENOMEM;
66507 +
66508 + bio = bio_alloc(GFP_KERNEL, 1);
66509 + if (bio != NULL) {
66510 + bio->bi_sector = block * (sb->s_blocksize >> 9);
66511 + bio->bi_bdev = sb->s_bdev;
66512 + bio->bi_io_vec[0].bv_page = page;
66513 + bio->bi_io_vec[0].bv_len = sb->s_blocksize;
66514 + bio->bi_io_vec[0].bv_offset = 0;
66515 + bio->bi_vcnt = 1;
66516 + bio->bi_size = sb->s_blocksize;
66517 + bio->bi_end_io = reiser4_status_endio;
66518 + } else {
66519 + __free_pages(page, 0);
66520 + return -ENOMEM;
66521 + }
66522 + lock_page(page);
66523 + submit_bio(READ, bio);
66524 + blk_run_address_space(get_super_fake(sb)->i_mapping);
66525 + wait_on_page_locked(page);
66526 +	if (!PageUptodate(page)) {
66527 +		warning("green-2007",
66528 +			"I/O error while trying to read status page\n");
66529 +		__free_pages(page, 0);
66530 +		bio_put(bio);
66531 +		return -EIO;
66532 +	}
66531 +
66532 + statuspage = (struct reiser4_status *)kmap_atomic(page, KM_USER0);
66533 + if (memcmp
66534 + (statuspage->magic, REISER4_STATUS_MAGIC,
66535 + sizeof(REISER4_STATUS_MAGIC))) {
66536 + /* Magic does not match. */
66537 + kunmap_atomic((char *)statuspage, KM_USER0);
66538 + warning("green-2008", "Wrong magic in status block\n");
66539 + __free_pages(page, 0);
66540 + bio_put(bio);
66541 + return -EINVAL;
66542 + }
66543 + kunmap_atomic((char *)statuspage, KM_USER0);
66544 +
66545 + get_super_private(sb)->status_page = page;
66546 + get_super_private(sb)->status_bio = bio;
66547 + return 0;
66548 +}
66549 +
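+/* An illustrative (hypothetical) lifecycle as seen from a disk format plugin;
+   the exact call sites live in the format code, not here:
+
+       reiser4_status_init(status_block_nr);      // at mount, once layout is known
+       reiser4_status_query(&status, &extended);  // decide whether mount is safe
+       ...
+       reiser4_status_write(REISER4_STATUS_DAMAGED, 0, "...");  // on trouble
+       reiser4_status_finish();                   // at umount
+
+   status_block_nr, status and extended are placeholder variables. */
+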
66550 +/* Query the status of the fs. Returns whether the FS can be safely mounted.
66551 +   Also, if the "status" and "extended" parameters are given, the actual
66552 +   status fields read from disk are stored there. */
66553 +int reiser4_status_query(u64 * status, u64 * extended)
66554 +{
66555 + struct super_block *sb = reiser4_get_current_sb();
66556 + struct reiser4_status *statuspage;
66557 + int retval;
66558 +
66559 + if (!get_super_private(sb)->status_page) { // No status page?
66560 + return REISER4_STATUS_MOUNT_UNKNOWN;
66561 + }
66562 + statuspage = (struct reiser4_status *)
66563 + kmap_atomic(get_super_private(sb)->status_page, KM_USER0);
66564 + switch ((long)le64_to_cpu(get_unaligned(&statuspage->status))) { // FIXME: this cast is a hack for 32 bit arches to work.
66565 + case REISER4_STATUS_OK:
66566 + retval = REISER4_STATUS_MOUNT_OK;
66567 + break;
66568 + case REISER4_STATUS_CORRUPTED:
66569 + retval = REISER4_STATUS_MOUNT_WARN;
66570 + break;
66571 + case REISER4_STATUS_DAMAGED:
66572 + case REISER4_STATUS_DESTROYED:
66573 + case REISER4_STATUS_IOERROR:
66574 + retval = REISER4_STATUS_MOUNT_RO;
66575 + break;
66576 + default:
66577 + retval = REISER4_STATUS_MOUNT_UNKNOWN;
66578 + break;
66579 + }
66580 +
66581 + if (status)
66582 + *status = le64_to_cpu(get_unaligned(&statuspage->status));
66583 + if (extended)
66584 + *extended = le64_to_cpu(get_unaligned(&statuspage->extended_status));
66585 +
66586 + kunmap_atomic((char *)statuspage, KM_USER0);
66587 + return retval;
66588 +}
66589 +
66590 +/* This function should be called when something bad happens (e.g. from reiser4_panic).
66591 + It fills the status structure and tries to push it to disk. */
66592 +int reiser4_status_write(__u64 status, __u64 extended_status, char *message)
66593 +{
66594 + struct super_block *sb = reiser4_get_current_sb();
66595 + struct reiser4_status *statuspage;
66596 + struct bio *bio = get_super_private(sb)->status_bio;
66597 +
66598 + if (!get_super_private(sb)->status_page) { // No status page?
66599 + return -1;
66600 + }
66601 + statuspage = (struct reiser4_status *)
66602 + kmap_atomic(get_super_private(sb)->status_page, KM_USER0);
66603 +
66604 + put_unaligned(cpu_to_le64(status), &statuspage->status);
66605 + put_unaligned(cpu_to_le64(extended_status), &statuspage->extended_status);
66606 + strncpy(statuspage->texterror, message, REISER4_TEXTERROR_LEN);
66607 +
66608 + kunmap_atomic((char *)statuspage, KM_USER0);
66609 + bio->bi_bdev = sb->s_bdev;
66610 + bio->bi_io_vec[0].bv_page = get_super_private(sb)->status_page;
66611 + bio->bi_io_vec[0].bv_len = sb->s_blocksize;
66612 + bio->bi_io_vec[0].bv_offset = 0;
66613 + bio->bi_vcnt = 1;
66614 + bio->bi_size = sb->s_blocksize;
66615 + bio->bi_end_io = reiser4_status_endio;
66616 + lock_page(get_super_private(sb)->status_page); // Safe as nobody should touch our page.
66617 + /* We can block now, but we have no other choice anyway */
66618 + submit_bio(WRITE, bio);
66619 + blk_run_address_space(get_super_fake(sb)->i_mapping);
66620 +	return 0;		// We do not wait for the I/O to finish.
66621 +}
66622 +
66623 +/* Frees the status page and the bio structure. Should be called by the disk format at umount time */
66624 +int reiser4_status_finish(void)
66625 +{
66626 + struct super_block *sb = reiser4_get_current_sb();
66627 +
66628 + __free_pages(get_super_private(sb)->status_page, 0);
66629 + get_super_private(sb)->status_page = NULL;
66630 + bio_put(get_super_private(sb)->status_bio);
66631 + get_super_private(sb)->status_bio = NULL;
66632 + return 0;
66633 +}
66634 Index: linux-2.6.16/fs/reiser4/status_flags.h
66635 ===================================================================
66636 --- /dev/null
66637 +++ linux-2.6.16/fs/reiser4/status_flags.h
66638 @@ -0,0 +1,43 @@
66639 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
66640 + * reiser4/README */
66641 +
66642 +/* Here we declare structures and flags that store the reiser4 status on disk.
66643 +   The status helps us to find out whether the filesystem is valid or whether
66644 +   it contains some critical, or not so critical, errors */
66645 +
66646 +#if !defined( __REISER4_STATUS_FLAGS_H__ )
66647 +#define __REISER4_STATUS_FLAGS_H__
66648 +
66649 +#include "dformat.h"
66650 +/* These are major status flags */
66651 +#define REISER4_STATUS_OK 0
66652 +#define REISER4_STATUS_CORRUPTED 0x1
66653 +#define REISER4_STATUS_DAMAGED 0x2
66654 +#define REISER4_STATUS_DESTROYED 0x4
66655 +#define REISER4_STATUS_IOERROR 0x8
66656 +
66657 +/* Return values for reiser4_status_query() */
66658 +#define REISER4_STATUS_MOUNT_OK 0
66659 +#define REISER4_STATUS_MOUNT_WARN 1
66660 +#define REISER4_STATUS_MOUNT_RO 2
66661 +#define REISER4_STATUS_MOUNT_UNKNOWN -1
66662 +
66663 +#define REISER4_TEXTERROR_LEN 256
66664 +
66665 +#define REISER4_STATUS_MAGIC "ReiSeR4StATusBl"
66666 +/* We probably need to keep its size under the sector size, which is 512 bytes */
66667 +struct reiser4_status {
66668 + char magic[16];
66669 + d64 status; /* Current FS state */
66670 +	d64 extended_status;	/* Any additional info that might make sense in addition to "status". E.g.
66671 +				   the last sector where an I/O error happened if status is "io error encountered" */
66672 +	d64 stacktrace[10];	/* Last ten function calls made (addresses) */
66673 + char texterror[REISER4_TEXTERROR_LEN]; /* Any error message if appropriate, otherwise filled with zeroes */
66674 +};
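+
+/* A quick size check (assuming d64 is 8 bytes wide): 16 (magic) + 8 (status)
+   + 8 (extended_status) + 80 (stacktrace) + 256 (texterror) = 368 bytes,
+   comfortably under the 512-byte sector mentioned above. */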
66675 +
66676 +int reiser4_status_init(reiser4_block_nr block);
66677 +int reiser4_status_query(u64 * status, u64 * extended);
66678 +int reiser4_status_write(u64 status, u64 extended_status, char *message);
66679 +int reiser4_status_finish(void);
66680 +
66681 +#endif
66682 Index: linux-2.6.16/fs/reiser4/super.c
66683 ===================================================================
66684 --- /dev/null
66685 +++ linux-2.6.16/fs/reiser4/super.c
66686 @@ -0,0 +1,313 @@
66687 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
66688 + * reiser4/README */
66689 +
66690 +/* Super-block manipulations. */
66691 +
66692 +#include "debug.h"
66693 +#include "dformat.h"
66694 +#include "key.h"
66695 +#include "plugin/security/perm.h"
66696 +#include "plugin/space/space_allocator.h"
66697 +#include "plugin/plugin.h"
66698 +#include "tree.h"
66699 +#include "vfs_ops.h"
66700 +#include "super.h"
66701 +#include "reiser4.h"
66702 +
66703 +#include <linux/types.h> /* for __u?? */
66704 +#include <linux/fs.h> /* for struct super_block */
66705 +
66706 +
66707 +static __u64 reserved_for_gid(const struct super_block *super, gid_t gid);
66708 +static __u64 reserved_for_uid(const struct super_block *super, uid_t uid);
66709 +static __u64 reserved_for_root(const struct super_block *super);
66710 +
66711 +/* Return reiser4-specific part of super block */
66712 +reiser4_super_info_data *get_super_private_nocheck(const struct super_block *super /* super block
66713 + * queried */ )
66714 +{
66715 + return (reiser4_super_info_data *) super->s_fs_info;
66716 +}
66717 +
66718 +/* Return reiser4 fstype: value that is returned in ->f_type field by statfs() */
66719 +long statfs_type(const struct super_block *super UNUSED_ARG /* super block
66720 + * queried */ )
66721 +{
66722 + assert("nikita-448", super != NULL);
66723 + assert("nikita-449", is_reiser4_super(super));
66724 + return (long)REISER4_SUPER_MAGIC;
66725 +}
66726 +
66727 +/* functions to read/modify fields of reiser4_super_info_data */
66728 +
66729 +/* get number of blocks in file system */
66730 +__u64 reiser4_block_count(const struct super_block *super /* super block
66731 + queried */ )
66732 +{
66733 + assert("vs-494", super != NULL);
66734 + assert("vs-495", is_reiser4_super(super));
66735 + return get_super_private(super)->block_count;
66736 +}
66737 +
66738 +/*
66739 + * number of blocks in the current file system
66740 + */
66741 +__u64 reiser4_current_block_count(void)
66742 +{
66743 + return get_current_super_private()->block_count;
66744 +}
66745 +
66746 +/* set number of blocks in filesystem */
66747 +void reiser4_set_block_count(const struct super_block *super, __u64 nr)
66748 +{
66749 + assert("vs-501", super != NULL);
66750 + assert("vs-502", is_reiser4_super(super));
66751 + get_super_private(super)->block_count = nr;
66752 +	/*
66753 +	 * The proper calculation of the reserved space counter (5% of the
66754 +	 * device block count) would need a 64 bit division, which is missing
66755 +	 * in Linux on the i386 platform. Because we do not need a precise
66756 +	 * calculation here, we can replace the div64 operation by this
66757 +	 * combination of multiplication and shift: 51 / (2^10) == 0.0498.
66758 +	 * FIXME: this is a bug. It comes up only for very small filesystems,
66759 +	 * which probably are never used. Nevertheless, it is a bug. The number
66760 +	 * of reserved blocks must be no less than the maximal number of blocks
66761 +	 * which get grabbed with BA_RESERVED.
66762 +	 */
66763 + get_super_private(super)->blocks_reserved = ((nr * 51) >> 10);
66764 +}
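+
+/* Worked example of the approximation above: for nr == 1000000 blocks,
+   (1000000 * 51) >> 10 == 51000000 / 1024 == 49804 blocks reserved, i.e.
+   about 4.98% rather than an exact 5% (50000 blocks). */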
66765 +
66766 +/* amount of blocks used (allocated for data) in file system */
66767 +__u64 reiser4_data_blocks(const struct super_block *super /* super block
66768 + queried */ )
66769 +{
66770 + assert("nikita-452", super != NULL);
66771 + assert("nikita-453", is_reiser4_super(super));
66772 + return get_super_private(super)->blocks_used;
66773 +}
66774 +
66775 +/* set number of blocks used in filesystem */
66776 +void reiser4_set_data_blocks(const struct super_block *super, __u64 nr)
66777 +{
66778 + assert("vs-503", super != NULL);
66779 + assert("vs-504", is_reiser4_super(super));
66780 + get_super_private(super)->blocks_used = nr;
66781 +}
66782 +
66783 +/* amount of free blocks in file system */
66784 +__u64 reiser4_free_blocks(const struct super_block *super /* super block
66785 + queried */ )
66786 +{
66787 + assert("nikita-454", super != NULL);
66788 + assert("nikita-455", is_reiser4_super(super));
66789 + return get_super_private(super)->blocks_free;
66790 +}
66791 +
66792 +/* set number of blocks free in filesystem */
66793 +void reiser4_set_free_blocks(const struct super_block *super, __u64 nr)
66794 +{
66795 + assert("vs-505", super != NULL);
66796 + assert("vs-506", is_reiser4_super(super));
66797 + get_super_private(super)->blocks_free = nr;
66798 +}
66799 +
66800 +/* get mkfs unique identifier */
66801 +__u32 reiser4_mkfs_id(const struct super_block *super /* super block
66802 + queried */ )
66803 +{
66804 + assert("vpf-221", super != NULL);
66805 + assert("vpf-222", is_reiser4_super(super));
66806 + return get_super_private(super)->mkfs_id;
66807 +}
66808 +
66809 +/* amount of free blocks in the committed state of the file system */
66810 +__u64 reiser4_free_committed_blocks(const struct super_block *super)
66811 +{
66812 + assert("vs-497", super != NULL);
66813 + assert("vs-498", is_reiser4_super(super));
66814 + return get_super_private(super)->blocks_free_committed;
66815 +}
66816 +
66817 +/* amount of blocks in the file system reserved for @uid and @gid */
66818 +long reiser4_reserved_blocks(const struct super_block *super /* super block
66819 + queried */ ,
66820 + uid_t uid /* user id */ ,
66821 + gid_t gid /* group id */ )
66822 +{
66823 + long reserved;
66824 +
66825 + assert("nikita-456", super != NULL);
66826 + assert("nikita-457", is_reiser4_super(super));
66827 +
66828 + reserved = 0;
66829 + if (REISER4_SUPPORT_GID_SPACE_RESERVATION)
66830 + reserved += reserved_for_gid(super, gid);
66831 + if (REISER4_SUPPORT_UID_SPACE_RESERVATION)
66832 + reserved += reserved_for_uid(super, uid);
66833 + if (REISER4_SUPPORT_ROOT_SPACE_RESERVATION && (uid == 0))
66834 + reserved += reserved_for_root(super);
66835 + return reserved;
66836 +}
66837 +
66838 +/* get/set value of/to grabbed blocks counter */
66839 +__u64 reiser4_grabbed_blocks(const struct super_block * super)
66840 +{
66841 + assert("zam-512", super != NULL);
66842 + assert("zam-513", is_reiser4_super(super));
66843 +
66844 + return get_super_private(super)->blocks_grabbed;
66845 +}
66846 +
66847 +__u64 flush_reserved(const struct super_block * super)
66848 +{
66849 + assert("vpf-285", super != NULL);
66850 + assert("vpf-286", is_reiser4_super(super));
66851 +
66852 + return get_super_private(super)->blocks_flush_reserved;
66853 +}
66854 +
66855 +/* get/set value of/to counter of fake allocated formatted blocks */
66856 +__u64 reiser4_fake_allocated(const struct super_block * super)
66857 +{
66858 + assert("zam-516", super != NULL);
66859 + assert("zam-517", is_reiser4_super(super));
66860 +
66861 + return get_super_private(super)->blocks_fake_allocated;
66862 +}
66863 +
66864 +/* get/set value of/to counter of fake allocated unformatted blocks */
66865 +__u64 reiser4_fake_allocated_unformatted(const struct super_block * super)
66866 +{
66867 + assert("zam-516", super != NULL);
66868 + assert("zam-517", is_reiser4_super(super));
66869 +
66870 + return get_super_private(super)->blocks_fake_allocated_unformatted;
66871 +}
66872 +
66873 +/* get/set value of/to counter of clustered blocks */
66874 +__u64 reiser4_clustered_blocks(const struct super_block * super)
66875 +{
66876 + assert("edward-601", super != NULL);
66877 + assert("edward-602", is_reiser4_super(super));
66878 +
66879 + return get_super_private(super)->blocks_clustered;
66880 +}
66881 +
66882 +/* space allocator used by this file system */
66883 +reiser4_space_allocator *get_space_allocator(const struct super_block * super)
66884 +{
66885 + assert("nikita-1965", super != NULL);
66886 + assert("nikita-1966", is_reiser4_super(super));
66887 + return &get_super_private(super)->space_allocator;
66888 +}
66889 +
66890 +/* return fake inode used to bind formatted nodes in the page cache */
66891 +struct inode *get_super_fake(const struct super_block *super /* super block
66892 + queried */ )
66893 +{
66894 + assert("nikita-1757", super != NULL);
66895 + return get_super_private(super)->fake;
66896 +}
66897 +
66898 +/* return fake inode used to bind copied on capture nodes in the page cache */
66899 +struct inode *get_cc_fake(const struct super_block *super /* super block
66900 + queried */ )
66901 +{
66902 + assert("nikita-1757", super != NULL);
66903 + return get_super_private(super)->cc;
66904 +}
66905 +
66906 +/* return fake inode used to bind bitmaps and journal heads */
66907 +struct inode *get_bitmap_fake(const struct super_block *super)
66908 +{
66909 + assert("nikita-17571", super != NULL);
66910 + return get_super_private(super)->bitmap;
66911 +}
66912 +
66913 +/* tree used by this file system */
66914 +reiser4_tree *get_tree(const struct super_block * super /* super block
66915 + * queried */ )
66916 +{
66917 + assert("nikita-460", super != NULL);
66918 + assert("nikita-461", is_reiser4_super(super));
66919 + return &get_super_private(super)->tree;
66920 +}
66921 +
66922 +/* Check that @super is (looks like) reiser4 super block. This is mainly for
66923 + use in assertions. */
66924 +int is_reiser4_super(const struct super_block *super /* super block
66925 + * queried */ )
66926 +{
66927 + return
66928 + super != NULL &&
66929 + get_super_private(super) != NULL &&
66930 + super->s_op == &(get_super_private(super)->ops.super);
66931 +}
66932 +
66933 +int reiser4_is_set(const struct super_block *super, reiser4_fs_flag f)
66934 +{
66935 + return test_bit((int)f, &get_super_private(super)->fs_flags);
66936 +}
66937 +
66938 +/* amount of blocks reserved for given group in file system */
66939 +static __u64 reserved_for_gid(const struct super_block *super UNUSED_ARG /* super
66940 + * block
66941 + * queried */ ,
66942 + gid_t gid UNUSED_ARG /* group id */ )
66943 +{
66944 + return 0;
66945 +}
66946 +
66947 +/* amount of blocks reserved for given user in file system */
66948 +static __u64 reserved_for_uid(const struct super_block *super UNUSED_ARG /* super
66949 + block
66950 + queried */ ,
66951 + uid_t uid UNUSED_ARG /* user id */ )
66952 +{
66953 + return 0;
66954 +}
66955 +
66956 +/* amount of blocks reserved for super user in file system */
66957 +static __u64 reserved_for_root(const struct super_block *super UNUSED_ARG /* super
66958 + block
66959 + queried */ )
66960 +{
66961 + return 0;
66962 +}
66963 +
66964 +/*
66965 + * true if block number @blk makes sense for the file system at @super.
66966 + */
66967 +int
66968 +reiser4_blocknr_is_sane_for(const struct super_block *super,
66969 + const reiser4_block_nr * blk)
66970 +{
66971 + reiser4_super_info_data *sbinfo;
66972 +
66973 + assert("nikita-2957", super != NULL);
66974 + assert("nikita-2958", blk != NULL);
66975 +
66976 + if (blocknr_is_fake(blk))
66977 + return 1;
66978 +
66979 + sbinfo = get_super_private(super);
66980 + return *blk < sbinfo->block_count;
66981 +}
66982 +
66983 +/*
66984 + * true, if block number @blk makes sense for the current file system
66985 + */
66986 +int reiser4_blocknr_is_sane(const reiser4_block_nr * blk)
66987 +{
66988 + return reiser4_blocknr_is_sane_for(reiser4_get_current_sb(), blk);
66989 +}
66990 +
66991 +/* Make Linus happy.
66992 + Local variables:
66993 + c-indentation-style: "K&R"
66994 + mode-name: "LC"
66995 + c-basic-offset: 8
66996 + tab-width: 8
66997 + fill-column: 120
66998 + End:
66999 +*/
67000 Index: linux-2.6.16/fs/reiser4/super.h
67001 ===================================================================
67002 --- /dev/null
67003 +++ linux-2.6.16/fs/reiser4/super.h
67004 @@ -0,0 +1,468 @@
67005 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
67006 + * reiser4/README */
67007 +
67008 +/* Super-block functions. See super.c for details. */
67009 +
67010 +#if !defined( __REISER4_SUPER_H__ )
67011 +#define __REISER4_SUPER_H__
67012 +
67013 +#include "tree.h"
67014 +#include "entd.h"
67015 +#include "wander.h"
67016 +#include "fsdata.h"
67017 +#include "plugin/object.h"
67018 +#include "plugin/space/space_allocator.h"
67019 +
67020 +/*
67021 + * Flush algorithms parameters.
67022 + */
67023 +typedef struct {
67024 + unsigned relocate_threshold;
67025 + unsigned relocate_distance;
67026 + unsigned written_threshold;
67027 + unsigned scan_maxnodes;
67028 +} flush_params;
67029 +
67030 +typedef enum {
67031 +	/*
67032 +	 * True if this file system doesn't support hard-links (multiple names)
67033 +	 * for directories: this is the default UNIX behavior.
67034 +	 *
67035 +	 * If hard-links on directories are not allowed, the file system is an
67036 +	 * Acyclic Directed Graph (modulo dot, and dotdot, of course).
67037 +	 *
67038 +	 * This is used by reiser4_link().
67039 +	 */
67040 + REISER4_ADG = 0,
67041 +	/*
67042 +	 * set if all nodes in the internal tree have the same node layout
67043 +	 * plugin. If so, znode_guess_plugin() will return tree->node_plugin
67044 +	 * instead of guessing the plugin by the plugin id stored in the node.
67045 +	 */
67046 + REISER4_ONE_NODE_PLUGIN = 1,
67047 + /* if set, bsd gid assignment is supported. */
67048 + REISER4_BSD_GID = 2,
67049 + /* [mac]_time are 32 bit in inode */
67050 + REISER4_32_BIT_TIMES = 3,
67051 + /* allow concurrent flushes */
67052 + REISER4_MTFLUSH = 4,
67053 +	/* if set, bitmap blocks are loaded on demand instead of all at mount time */
67054 + REISER4_DONT_LOAD_BITMAP = 5,
67055 + /* enforce atomicity during write(2) */
67056 + REISER4_ATOMIC_WRITE = 6,
67057 + /* don't use write barriers in the log writer code. */
67058 + REISER4_NO_WRITE_BARRIER = 7
67059 +
67060 +} reiser4_fs_flag;
67061 +
67062 +/*
67063 + * VFS related operation vectors.
67064 + */
67065 +typedef struct object_ops {
67066 + struct super_operations super;
67067 + struct dentry_operations dentry;
67068 + struct export_operations export;
67069 +} object_ops;
67070 +
67071 +/* reiser4-specific part of super block
67072 +
67073 + Locking
67074 +
67075 + Fields immutable after mount:
67076 +
67077 + ->oid*
67078 + ->space*
67079 + ->default_[ug]id
67080 + ->mkfs_id
67081 + ->trace_flags
67082 + ->debug_flags
67083 + ->fs_flags
67084 + ->df_plug
67085 + ->optimal_io_size
67086 + ->plug
67087 + ->flush
67088 + ->u (bad name)
67089 + ->txnmgr
67090 + ->ra_params
67091 + ->fsuid
67092 + ->journal_header
67093 + ->journal_footer
67094 +
67095 + Fields protected by ->lnode_guard
67096 +
67097 + ->lnode_htable
67098 +
67099 + Fields protected by per-super block spin lock
67100 +
67101 + ->block_count
67102 + ->blocks_used
67103 + ->blocks_free
67104 + ->blocks_free_committed
67105 + ->blocks_grabbed
67106 + ->blocks_fake_allocated_unformatted
67107 + ->blocks_fake_allocated
67108 + ->blocks_flush_reserved
67109 + ->eflushed
67110 + ->blocknr_hint_default
67111 +
67112 + After journal replaying during mount,
67113 +
67114 + ->last_committed_tx
67115 +
67116 + is protected by ->tmgr.commit_semaphore
67117 +
67118 + Invariants involving this data-type:
67119 +
67120 + [sb-block-counts]
67121 + [sb-grabbed]
67122 + [sb-fake-allocated]
67123 +*/
67124 +struct reiser4_super_info_data {
67125 + /*
67126 + * guard spinlock which protects reiser4 super block fields (currently
67127 + * blocks_free, blocks_free_committed)
67128 + */
67129 + spinlock_t guard;
67130 +
67131 + /* next oid that will be returned by oid_allocate() */
67132 + oid_t next_to_use;
67133 + /* total number of used oids */
67134 + oid_t oids_in_use;
67135 +
67136 + /* space manager plugin */
67137 + reiser4_space_allocator space_allocator;
67138 +
67139 + /* reiser4 internal tree */
67140 + reiser4_tree tree;
67141 +
67142 + /*
67143 + * default user id used for light-weight files without their own
67144 + * stat-data.
67145 + */
67146 + uid_t default_uid;
67147 +
67148 + /*
67149 + * default group id used for light-weight files without their own
67150 + * stat-data.
67151 + */
67152 + gid_t default_gid;
67153 +
67154 + /* mkfs identifier generated at mkfs time. */
67155 + __u32 mkfs_id;
67156 + /* amount of blocks in a file system */
67157 + __u64 block_count;
67158 +
67159 + /* inviolable reserve */
67160 + __u64 blocks_reserved;
67161 +
67162 + /* amount of blocks used by file system data and meta-data. */
67163 + __u64 blocks_used;
67164 +
67165 +	/*
67166 +	 * amount of free blocks. This is the "working" free blocks counter. It
67167 +	 * is like the "working" bitmap; see block_alloc.c for a description.
67168 +	 */
67169 + __u64 blocks_free;
67170 +
67171 + /*
67172 + * free block count for fs committed state. This is "commit" version of
67173 + * free block counter.
67174 + */
67175 + __u64 blocks_free_committed;
67176 +
67177 + /*
67178 + * number of blocks reserved for further allocation, for all
67179 + * threads.
67180 + */
67181 + __u64 blocks_grabbed;
67182 +
67183 + /* number of fake allocated unformatted blocks in tree. */
67184 + __u64 blocks_fake_allocated_unformatted;
67185 +
67186 + /* number of fake allocated formatted blocks in tree. */
67187 + __u64 blocks_fake_allocated;
67188 +
67189 + /* number of blocks reserved for flush operations. */
67190 + __u64 blocks_flush_reserved;
67191 +
67192 + /* number of blocks reserved for cluster operations. */
67193 + __u64 blocks_clustered;
67194 +
67195 + /* unique file-system identifier */
67196 + __u32 fsuid;
67197 +
67198 + /* file-system wide flags. See reiser4_fs_flag enum */
67199 + unsigned long fs_flags;
67200 +
67201 + /* transaction manager */
67202 + txn_mgr tmgr;
67203 +
67204 + /* ent thread */
67205 + entd_context entd;
67206 +
67207 + /* fake inode used to bind formatted nodes */
67208 + struct inode *fake;
67209 + /* inode used to bind bitmaps (and journal heads) */
67210 + struct inode *bitmap;
67211 + /* inode used to bind copied on capture nodes */
67212 + struct inode *cc;
67213 +
67214 + /* disk layout plugin */
67215 + disk_format_plugin *df_plug;
67216 +
67217 + /* disk layout specific part of reiser4 super info data */
67218 + union {
67219 + format40_super_info format40;
67220 + } u;
67221 +
67222 + /* value we return in st_blksize on stat(2) */
67223 + unsigned long optimal_io_size;
67224 +
67225 + /* parameters for the flush algorithm */
67226 + flush_params flush;
67227 +
67228 + /* pointers to jnodes for journal header and footer */
67229 + jnode *journal_header;
67230 + jnode *journal_footer;
67231 +
67232 + journal_location jloc;
67233 +
67234 + /* head block number of last committed transaction */
67235 + __u64 last_committed_tx;
67236 +
67237 + /*
67238 + * we remember last written location for using as a hint for new block
67239 + * allocation
67240 + */
67241 + __u64 blocknr_hint_default;
67242 +
67243 + /* committed number of files (oid allocator state variable ) */
67244 + __u64 nr_files_committed;
67245 +
67246 + ra_params_t ra_params;
67247 +
67248 +	/*
67249 +	 * A semaphore for serializing the cut tree operation when out of free
67250 +	 * space: only one cut_tree thread is allowed to grab space from the
67251 +	 * reserved area (5% of the disk space)
67252 +	 */
67253 + struct semaphore delete_sema;
67254 + /* task owning ->delete_sema */
67255 + struct task_struct *delete_sema_owner;
67256 +
67257 + /* serialize semaphore */
67258 + struct semaphore flush_sema;
67259 +
67260 + /* Diskmap's blocknumber */
67261 + __u64 diskmap_block;
67262 +
67263 + /* What to do in case of error */
67264 + int onerror;
67265 +
67266 + /* operations for objects on this file system */
67267 + object_ops ops;
67268 +
67269 + /*
67270 + * structure to maintain d_cursors. See plugin/file_ops_readdir.c for
67271 + * more details
67272 + */
67273 + d_cursor_info d_info;
67274 +
67275 +#ifdef CONFIG_REISER4_BADBLOCKS
67276 + /* Alternative master superblock offset (in bytes) */
67277 + unsigned long altsuper;
67278 +#endif
67279 + struct repacker *repacker;
67280 + struct page *status_page;
67281 + struct bio *status_bio;
67282 +
67283 +#if REISER4_DEBUG
67284 + /*
67285 + * minimum used blocks value (includes super blocks, bitmap blocks and
67286 + * other fs reserved areas), depends on fs format and fs size.
67287 + */
67288 + __u64 min_blocks_used;
67289 +
67290 + /*
67291 + * when debugging is on, all jnodes (including znodes, bitmaps, etc.)
67292 + * are kept on a list anchored at sbinfo->all_jnodes. This list is
67293 + * protected by sbinfo->all_guard spin lock. This lock should be taken
67294 + * with _irq modifier, because it is also modified from interrupt
67295 + * contexts (by RCU).
67296 + */
67297 + spinlock_t all_guard;
67298 + /* list of all jnodes */
67299 + struct list_head all_jnodes;
67300 +#endif
67301 + struct dentry *debugfs_root;
67302 +};
67303 +
67304 +extern reiser4_super_info_data *get_super_private_nocheck(const struct
67305 + super_block *super);
67306 +
67307 +
67308 +/* Return reiser4-specific part of super block */
67309 +static inline reiser4_super_info_data *get_super_private(const struct
67310 + super_block *super)
67311 +{
67312 + assert("nikita-447", super != NULL);
67313 +
67314 + return (reiser4_super_info_data *) super->s_fs_info;
67315 +}
67316 +
67317 +/* get ent context for the @super */
67318 +static inline entd_context *get_entd_context(struct super_block *super)
67319 +{
67320 + return &get_super_private(super)->entd;
67321 +}
67322 +
67323 +
67324 +/* "Current" super-block: main super block used during current system
67325 + call. Reference to this super block is stored in reiser4_context. */
67326 +static inline struct super_block *reiser4_get_current_sb(void)
67327 +{
67328 + return get_current_context()->super;
67329 +}
67330 +
67331 +/* Reiser4-specific part of "current" super-block: main super block used
67332 + during current system call. Reference to this super block is stored in
67333 + reiser4_context. */
67334 +static inline reiser4_super_info_data *get_current_super_private(void)
67335 +{
67336 + return get_super_private(reiser4_get_current_sb());
67337 +}
67338 +
67339 +static inline ra_params_t *get_current_super_ra_params(void)
67340 +{
67341 + return &(get_current_super_private()->ra_params);
67342 +}
67343 +
67344 +/*
67345 + * true, if file system on @super is read-only
67346 + */
67347 +static inline int rofs_super(struct super_block *super)
67348 +{
67349 + return super->s_flags & MS_RDONLY;
67350 +}
67351 +
67352 +/*
67353 + * true, if @tree represents read-only file system
67354 + */
67355 +static inline int rofs_tree(reiser4_tree * tree)
67356 +{
67357 + return rofs_super(tree->super);
67358 +}
67359 +
67360 +/*
67361 + * true, if file system where @inode lives on, is read-only
67362 + */
67363 +static inline int rofs_inode(struct inode *inode)
67364 +{
67365 + return rofs_super(inode->i_sb);
67366 +}
67367 +
67368 +/*
67369 + * true, if file system where @node lives on, is read-only
67370 + */
67371 +static inline int rofs_jnode(jnode * node)
67372 +{
67373 + return rofs_tree(jnode_get_tree(node));
67374 +}
67375 +
67376 +extern __u64 reiser4_current_block_count(void);
67377 +
67378 +extern void build_object_ops(struct super_block *super, object_ops * ops);
67379 +
67380 +#define REISER4_SUPER_MAGIC 0x52345362 /* (*(__u32 *)"R4Sb"); */
67381 +
67382 +static inline void spin_lock_reiser4_super(reiser4_super_info_data *sbinfo)
67383 +{
67384 + spin_lock(&(sbinfo->guard));
67385 +}
67386 +
67387 +static inline void spin_unlock_reiser4_super(reiser4_super_info_data *sbinfo)
67388 +{
67389 + assert_spin_locked(&(sbinfo->guard));
67390 + spin_unlock(&(sbinfo->guard));
67391 +}
67392 +
67393 +extern __u64 flush_reserved(const struct super_block *);
67394 +extern int reiser4_is_set(const struct super_block *super, reiser4_fs_flag f);
67395 +extern long statfs_type(const struct super_block *super);
67396 +extern __u64 reiser4_block_count(const struct super_block *super);
67397 +extern void reiser4_set_block_count(const struct super_block *super, __u64 nr);
67398 +extern __u64 reiser4_data_blocks(const struct super_block *super);
67399 +extern void reiser4_set_data_blocks(const struct super_block *super, __u64 nr);
67400 +extern __u64 reiser4_free_blocks(const struct super_block *super);
67401 +extern void reiser4_set_free_blocks(const struct super_block *super, __u64 nr);
67402 +extern __u32 reiser4_mkfs_id(const struct super_block *super);
67403 +
67404 +extern __u64 reiser4_free_committed_blocks(const struct super_block *super);
67405 +
67406 +extern __u64 reiser4_grabbed_blocks(const struct super_block *);
67407 +extern __u64 reiser4_fake_allocated(const struct super_block *);
67408 +extern __u64 reiser4_fake_allocated_unformatted(const struct super_block *);
67409 +extern __u64 reiser4_clustered_blocks(const struct super_block *);
67410 +
67411 +extern long reiser4_reserved_blocks(const struct super_block *super, uid_t uid,
67412 + gid_t gid);
67413 +
67414 +extern reiser4_space_allocator *get_space_allocator(const struct super_block
67415 + *super);
67416 +extern reiser4_oid_allocator *get_oid_allocator(const struct super_block
67417 + *super);
67418 +extern struct inode *get_super_fake(const struct super_block *super);
67419 +extern struct inode *get_cc_fake(const struct super_block *super);
67420 +extern struct inode *get_bitmap_fake(const struct super_block *super);
67421 +extern reiser4_tree *get_tree(const struct super_block *super);
67422 +extern int is_reiser4_super(const struct super_block *super);
67423 +
67424 +extern int reiser4_blocknr_is_sane(const reiser4_block_nr * blk);
67425 +extern int reiser4_blocknr_is_sane_for(const struct super_block *super,
67426 + const reiser4_block_nr * blk);
67427 +extern int reiser4_fill_super(struct super_block *s, void *data, int silent);
67428 +extern int reiser4_done_super(struct super_block *s);
67429 +
67430 +/* step of fill super */
67431 +extern int init_fs_info(struct super_block *);
67432 +extern void done_fs_info(struct super_block *);
67433 +extern int init_super_data(struct super_block *, char *opt_string);
67434 +extern int init_read_super(struct super_block *, int silent);
67435 +extern int init_root_inode(struct super_block *);
67436 +
67437 +
67438 +/* Maximal possible object id. */
67439 +#define ABSOLUTE_MAX_OID ((oid_t)~0)
67440 +
67441 +#define OIDS_RESERVED ( 1 << 16 )
67442 +int oid_init_allocator(struct super_block *, oid_t nr_files, oid_t next);
67443 +oid_t oid_allocate(struct super_block *);
67444 +int oid_release(struct super_block *, oid_t);
67445 +oid_t oid_next(const struct super_block *);
67446 +void oid_count_allocated(void);
67447 +void oid_count_released(void);
67448 +long oids_used(const struct super_block *);
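+
+/*
+ * Typical use of the oid allocator (illustrative sketch added here, not part
+ * of the original patch; error handling elided):
+ *
+ *	oid_t oid = oid_allocate(super);
+ *
+ *	... create an object keyed by @oid ...
+ *	if (creation failed)
+ *		oid_release(super, oid);
+ */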
67449 +
67450 +#if REISER4_DEBUG
67451 +void print_fs_info(const char *prefix, const struct super_block *);
67452 +#endif
67453 +
67454 +extern void destroy_reiser4_cache(kmem_cache_t **);
67455 +
67456 +extern struct super_operations reiser4_super_operations;
67457 +extern struct export_operations reiser4_export_operations;
67458 +extern struct dentry_operations reiser4_dentry_operations;
67459 +extern struct dentry *reiser4_debugfs_root;
67460 +
67461 +/* __REISER4_SUPER_H__ */
67462 +#endif
67463 +
67464 +/*
67465 + * Local variables:
67466 + * c-indentation-style: "K&R"
67467 + * mode-name: "LC"
67468 + * c-basic-offset: 8
67469 + * tab-width: 8
67470 + * fill-column: 120
67471 + * End:
67472 + */
67473 Index: linux-2.6.16/fs/reiser4/super_ops.c
67474 ===================================================================
67475 --- /dev/null
67476 +++ linux-2.6.16/fs/reiser4/super_ops.c
67477 @@ -0,0 +1,721 @@
67478 +/* Copyright 2005 by Hans Reiser, licensing governed by
67479 + * reiser4/README */
67480 +
67481 +#include "inode.h"
67482 +#include "page_cache.h"
67483 +#include "ktxnmgrd.h"
67484 +#include "flush.h"
67485 +#include "safe_link.h"
67486 +
67487 +#include <linux/vfs.h>
67488 +#include <linux/writeback.h>
67489 +#include <linux/mount.h>
67490 +#include <linux/seq_file.h>
67491 +#include <linux/debugfs.h>
67492 +
67493 +/* slab cache for inodes */
67494 +static kmem_cache_t *inode_cache;
67495 +
67496 +/**
67497 + * init_once - constructor for reiser4 inodes
67498 + * @obj: inode to be initialized
67499 + * @cache: cache @obj belongs to
67500 + * @flags: SLAB flags
67501 + *
67502 + * Initialization function called for each new object allocated by the
67503 + * reiser4 inode cache. It is set at inode cache creation.
67504 + */
67505 +static void init_once(void *obj, kmem_cache_t *cache, unsigned long flags)
67506 +{
67507 + reiser4_inode_object *info;
67508 +
67509 + info = obj;
67510 +
67511 + if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) ==
67512 + SLAB_CTOR_CONSTRUCTOR) {
67513 + /* initialize vfs inode */
67514 + inode_init_once(&info->vfs_inode);
67515 +
67516 + /*
67517 + * initialize the reiser4 specific part of the inode.
67518 + * NOTE-NIKITA add here initializations for locks, list heads,
67519 + * etc. that will be added to our private inode part.
67520 + */
67521 + INIT_LIST_HEAD(get_readdir_list(&info->vfs_inode));
67522 + /* init semaphore which is used during inode loading */
67523 + loading_init_once(&info->p);
67524 + INIT_RADIX_TREE(jnode_tree_by_reiser4_inode(&info->p),
67525 + GFP_ATOMIC);
67526 +#if REISER4_DEBUG
67527 + info->p.nr_jnodes = 0;
67528 +#endif
67529 + }
67530 +}
67531 +
67532 +/**
67533 + * init_inodes - create inode cache
67534 + *
67535 + * Initializes slab cache of inodes. It is part of reiser4 module initialization.
67536 + */
67537 +static int init_inodes(void)
67538 +{
67539 + inode_cache = kmem_cache_create("reiser4_inode",
67540 + sizeof(reiser4_inode_object),
67541 + 0,
67542 + SLAB_HWCACHE_ALIGN |
67543 + SLAB_RECLAIM_ACCOUNT, init_once, NULL);
67544 + if (inode_cache == NULL)
67545 + return RETERR(-ENOMEM);
67546 + return 0;
67547 +}
67548 +
67549 +/**
67550 + * done_inodes - delete inode cache
67551 + *
67552 + * This is called on reiser4 module unloading or system shutdown.
67553 + */
67554 +static void done_inodes(void)
67555 +{
67556 + destroy_reiser4_cache(&inode_cache);
67557 +}
67558 +
67559 +/**
67560 + * reiser4_alloc_inode - alloc_inode of super operations
67561 + * @super: super block new inode is allocated for
67562 + *
67563 + * Allocates new inode, initializes reiser4 specific part of it.
67564 + */
67565 +static struct inode *reiser4_alloc_inode(struct super_block *super)
67566 +{
67567 + reiser4_inode_object *obj;
67568 +
67569 + assert("nikita-1696", super != NULL);
67570 + obj = kmem_cache_alloc(inode_cache, SLAB_KERNEL);
67571 + if (obj != NULL) {
67572 + reiser4_inode *info;
67573 +
67574 + info = &obj->p;
67575 +
67576 + info->hset = info->pset = plugin_set_get_empty();
67577 + info->extmask = 0;
67578 + info->locality_id = 0ull;
67579 + info->plugin_mask = 0;
67580 +#if !REISER4_INO_IS_OID
67581 + info->oid_hi = 0;
67582 +#endif
67583 + seal_init(&info->sd_seal, NULL, NULL);
67584 + coord_init_invalid(&info->sd_coord, NULL);
67585 + info->flags = 0;
67586 + spin_lock_init(&info->guard);
67587 + /* this deals with info's loading semaphore */
67588 + loading_alloc(info);
67589 + info->vroot = UBER_TREE_ADDR;
67590 + return &obj->vfs_inode;
67591 + } else
67592 + return NULL;
67593 +}
67594 +
67595 +/**
67596 + * reiser4_destroy_inode - destroy_inode of super operations
67597 + * @inode: inode being destroyed
67598 + *
67599 + * Puts reiser4 specific portion of inode, frees memory occupied by inode.
67600 + */
67601 +static void reiser4_destroy_inode(struct inode *inode)
67602 +{
67603 + reiser4_inode *info;
67604 +
67605 + info = reiser4_inode_data(inode);
67606 +
67607 + assert("vs-1220", inode_has_no_jnodes(info));
67608 +
67609 + if (!is_bad_inode(inode) && is_inode_loaded(inode)) {
67610 + file_plugin *fplug = inode_file_plugin(inode);
67611 + if (fplug->destroy_inode != NULL)
67612 + fplug->destroy_inode(inode);
67613 + }
67614 + dispose_cursors(inode);
67615 + if (info->pset)
67616 + plugin_set_put(info->pset);
67617 +
67618 + /*
67619 + * cannot add a similar assertion about ->i_list, as prune_icache returns
67620 + * inodes into the slab with dangling ->list.{next,prev}. This is safe,
67621 + * because they are re-initialized in new_inode().
67622 + */
67623 + assert("nikita-2895", list_empty(&inode->i_dentry));
67624 + assert("nikita-2896", hlist_unhashed(&inode->i_hash));
67625 + assert("nikita-2898", list_empty_careful(get_readdir_list(inode)));
67626 +
67627 + /* this deals with info's loading semaphore */
67628 + loading_destroy(info);
67629 +
67630 + kmem_cache_free(inode_cache,
67631 + container_of(info, reiser4_inode_object, p));
67632 +}
67633 +
67634 +/**
67635 + * reiser4_dirty_inode - dirty_inode of super operations
67636 + * @inode: inode being dirtied
67637 + *
67638 + * Updates stat data.
67639 + */
67640 +static void reiser4_dirty_inode(struct inode *inode)
67641 +{
67642 + int result;
67643 +
67644 + if (!is_in_reiser4_context())
67645 + return;
67646 + assert("", !IS_RDONLY(inode));
67647 + assert("", (inode_file_plugin(inode)->estimate.update(inode) <=
67648 + get_current_context()->grabbed_blocks));
67649 +
67650 + result = reiser4_update_sd(inode);
67651 + if (result)
67652 + warning("", "failed to dirty inode for %llu: %d",
67653 + get_inode_oid(inode), result);
67654 +}
67655 +
67656 +/**
67657 + * reiser4_delete_inode - delete_inode of super operations
67658 + * @inode: inode to delete
67659 + *
67660 + * Calls file plugin's delete_object method to delete object items from
67661 + * filesystem tree and calls clear_inode.
67662 + */
67663 +static void reiser4_delete_inode(struct inode *inode)
67664 +{
67665 + reiser4_context *ctx;
67666 + file_plugin *fplug;
67667 +
67668 + ctx = init_context(inode->i_sb);
67669 + if (IS_ERR(ctx)) {
67670 + warning("vs-15", "failed to init context");
67671 + return;
67672 + }
67673 +
67674 + if (is_inode_loaded(inode)) {
67675 + fplug = inode_file_plugin(inode);
67676 + if (fplug != NULL && fplug->delete_object != NULL)
67677 + fplug->delete_object(inode);
67678 + }
67679 +
67680 + inode->i_blocks = 0;
67681 + clear_inode(inode);
67682 + reiser4_exit_context(ctx);
67683 +}
67684 +
67685 +/**
67686 + * reiser4_put_super - put_super of super operations
67687 + * @super: super block to free
67688 + *
67689 + * Stops daemons and releases resources; in short, unmounts.
67690 + */
67691 +static void reiser4_put_super(struct super_block *super)
67692 +{
67693 + reiser4_super_info_data *sbinfo;
67694 + reiser4_context *ctx;
67695 +
67696 + sbinfo = get_super_private(super);
67697 + assert("vs-1699", sbinfo);
67698 +
67699 + debugfs_remove(sbinfo->tmgr.debugfs_atom_count);
67700 + debugfs_remove(sbinfo->tmgr.debugfs_id_count);
67701 + debugfs_remove(sbinfo->debugfs_root);
67702 +
67703 + ctx = init_context(super);
67704 + if (IS_ERR(ctx)) {
67705 + warning("vs-17", "failed to init context");
67706 + return;
67707 + }
67708 +
67709 +	/* let the disk format plugin free its resources */
67710 + if (get_super_private(super)->df_plug->release)
67711 + get_super_private(super)->df_plug->release(super);
67712 +
67713 + done_formatted_fake(super);
67714 +
67715 + /* stop daemons: ktxnmgr and entd */
67716 + done_entd(super);
67717 + done_ktxnmgrd(super);
67718 + done_txnmgr(&sbinfo->tmgr);
67719 +
67720 + done_fs_info(super);
67721 + reiser4_exit_context(ctx);
67722 +}
67723 +
67724 +/**
67725 + * reiser4_write_super - write_super of super operations
67726 + * @super: super block to write
67727 + *
67728 + * Captures the znode associated with the super block, commits all transactions.
67729 + */
67730 +static void reiser4_write_super(struct super_block *super)
67731 +{
67732 + int ret;
67733 + reiser4_context *ctx;
67734 +
67735 + assert("vs-1700", !rofs_super(super));
67736 +
67737 + ctx = init_context(super);
67738 + if (IS_ERR(ctx)) {
67739 + warning("vs-16", "failed to init context");
67740 + return;
67741 + }
67742 +
67743 + ret = capture_super_block(super);
67744 + if (ret != 0)
67745 + warning("vs-1701",
67746 + "capture_super_block failed in write_super: %d", ret);
67747 + ret = txnmgr_force_commit_all(super, 0);
67748 + if (ret != 0)
67749 + warning("jmacd-77113",
67750 + "txn_force failed in write_super: %d", ret);
67751 +
67752 + super->s_dirt = 0;
67753 +
67754 + reiser4_exit_context(ctx);
67755 +}
67756 +
67757 +/**
67758 + * reiser4_statfs - statfs of super operations
67759 + * @super: super block of the file system being queried
67760 + * @statfs: buffer to fill with statistics
67761 + *
67762 + * Returns information about filesystem.
67763 + */
67764 +static int reiser4_statfs(struct super_block *super, struct kstatfs *statfs)
67765 +{
67766 + sector_t total;
67767 + sector_t reserved;
67768 + sector_t free;
67769 + sector_t forroot;
67770 + sector_t deleted;
67771 + reiser4_context *ctx;
67772 +
67773 + assert("nikita-408", super != NULL);
67774 + assert("nikita-409", statfs != NULL);
67775 +
67776 + ctx = init_context(super);
67777 + if (IS_ERR(ctx))
67778 + return PTR_ERR(ctx);
67779 +
67780 + statfs->f_type = statfs_type(super);
67781 + statfs->f_bsize = super->s_blocksize;
67782 +
67783 + /*
67784 + * 5% of total block space is reserved. This is needed for flush and
67785 + * for truncates (so that we are able to perform truncate/unlink even
67786 + * on the otherwise completely full file system). If this reservation
67787 + * is hidden from statfs(2), users will mistakenly guess that they
67788 + * have enough free space to complete some operation, which is
67789 + * frustrating.
67790 + *
67791 + * Another possible solution is to subtract ->blocks_reserved from
67792 + * ->f_bfree, but changing available space seems less intrusive than
67793 + * letting the user see 5% of disk space in use directly after
67794 + * mkfs.
67795 + */
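+	/*
+	 * Illustrative numbers (added note, not from the original patch):
+	 * with total = 1000, reserved = 50 and free = 300 blocks, the code
+	 * below reports f_blocks = 950 and f_bfree = 250, so the reserve
+	 * never shows up as available space.
+	 */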
67796 + total = reiser4_block_count(super);
67797 + reserved = get_super_private(super)->blocks_reserved;
67798 + deleted = txnmgr_count_deleted_blocks();
67799 + free = reiser4_free_blocks(super) + deleted;
67800 + forroot = reiser4_reserved_blocks(super, 0, 0);
67801 +
67802 + /*
67803 + * These counters may be in inconsistent state because we take the
67804 + * values without keeping any global spinlock. Here we do a sanity
67805 + * check that free block counter does not exceed the number of all
67806 + * blocks.
67807 + */
67808 + if (free > total)
67809 + free = total;
67810 + statfs->f_blocks = total - reserved;
67811 + /* make sure statfs->f_bfree is never larger than statfs->f_blocks */
67812 + if (free > reserved)
67813 + free -= reserved;
67814 + else
67815 + free = 0;
67816 + statfs->f_bfree = free;
67817 +
67818 + if (free > forroot)
67819 + free -= forroot;
67820 + else
67821 + free = 0;
67822 + statfs->f_bavail = free;
67823 +
67824 + statfs->f_files = 0;
67825 + statfs->f_ffree = 0;
67826 +
67827 + /* maximal acceptable name length depends on directory plugin. */
67828 + assert("nikita-3351", super->s_root->d_inode != NULL);
67829 + statfs->f_namelen = reiser4_max_filename_len(super->s_root->d_inode);
67830 + reiser4_exit_context(ctx);
67831 + return 0;
67832 +}
67833 +
67834 +/**
67835 + * reiser4_clear_inode - clear_inode of super operation
67836 + * @inode: inode about to destroy
67837 + *
67838 + * Does sanity checks: the inode being destroyed should have all jnodes detached.
67839 + */
67840 +static void reiser4_clear_inode(struct inode *inode)
67841 +{
67842 +#if REISER4_DEBUG
67843 + reiser4_inode *r4_inode;
67844 +
67845 + r4_inode = reiser4_inode_data(inode);
67846 + if (!inode_has_no_jnodes(r4_inode))
67847 + warning("vs-1732", "reiser4 inode has %ld jnodes\n",
67848 + r4_inode->nr_jnodes);
67849 +#endif
67850 +}
67851 +
67852 +/**
67853 + * reiser4_sync_inodes - sync_inodes of super operations
67854 + * @super: super block to sync inodes of
67855 + * @wbc: writeback control passed by the caller
67856 + *
67857 + * This method is called by background and non-background writeback. Reiser4's
67858 + * implementation uses generic_sync_sb_inodes to call reiser4_writepages for
67859 + * each dirty inode. reiser4_writepages handles pages dirtied via shared
67860 + * mappings - dirty pages get into atoms. writeout is called to flush some
67861 + * atoms.
67862 + */
67863 +static void reiser4_sync_inodes(struct super_block *super,
67864 + struct writeback_control *wbc)
67865 +{
67866 + reiser4_context *ctx;
67867 + long to_write;
67868 +
67869 + if (wbc->for_kupdate)
67870 + /* reiser4 has its own means of periodical write-out */
67871 + return;
67872 +
67873 + to_write = wbc->nr_to_write;
67874 + assert("vs-49", wbc->older_than_this == NULL);
67875 +
67876 + ctx = init_context(super);
67877 + if (IS_ERR(ctx)) {
67878 + warning("vs-13", "failed to init context");
67879 + return;
67880 + }
67881 +
67882 + /*
67883 +	 * call reiser4_writepages for each dirty inode to turn dirty pages
67884 +	 * into transactions if they have not been already.
67885 + */
67886 + generic_sync_sb_inodes(super, wbc);
67887 +
67888 + /* flush goes here */
67889 + wbc->nr_to_write = to_write;
67890 + writeout(super, wbc);
67891 +
67892 + /* avoid recursive calls to ->sync_inodes */
67893 + context_set_commit_async(ctx);
67894 + reiser4_exit_context(ctx);
67895 +}
67896 +
67897 +/**
67898 + * reiser4_show_options - show_options of super operations
67899 + * @m: file where to write information
67900 + * @mnt: mount structure
67901 + *
67902 + * Makes reiser4 mount options visible in /proc/mounts.
67903 + */
67904 +static int reiser4_show_options(struct seq_file *m, struct vfsmount *mnt)
67905 +{
67906 + struct super_block *super;
67907 + reiser4_super_info_data *sbinfo;
67908 +
67909 + super = mnt->mnt_sb;
67910 + sbinfo = get_super_private(super);
67911 +
67912 + seq_printf(m, ",atom_max_size=0x%x", sbinfo->tmgr.atom_max_size);
67913 + seq_printf(m, ",atom_max_age=0x%x", sbinfo->tmgr.atom_max_age);
67914 + seq_printf(m, ",atom_min_size=0x%x", sbinfo->tmgr.atom_min_size);
67915 + seq_printf(m, ",atom_max_flushers=0x%x",
67916 + sbinfo->tmgr.atom_max_flushers);
67917 + seq_printf(m, ",cbk_cache_slots=0x%x",
67918 + sbinfo->tree.cbk_cache.nr_slots);
67919 +
67920 + return 0;
67921 +}
67922 +
67923 +struct super_operations reiser4_super_operations = {
67924 + .alloc_inode = reiser4_alloc_inode,
67925 + .destroy_inode = reiser4_destroy_inode,
67926 + .dirty_inode = reiser4_dirty_inode,
67927 + .delete_inode = reiser4_delete_inode,
67928 + .put_super = reiser4_put_super,
67929 + .write_super = reiser4_write_super,
67930 + .statfs = reiser4_statfs,
67931 + .clear_inode = reiser4_clear_inode,
67932 + .sync_inodes = reiser4_sync_inodes,
67933 + .show_options = reiser4_show_options
67934 +};
67935 +
67936 +/**
67937 + * fill_super - initialize super block on mount
67938 + * @super: super block to fill
67939 + * @data: reiser4 specific mount option
67940 + * @silent: whether to suppress error messages
67941 + *
67942 + * This is to be called by reiser4_get_sb. Mounts the filesystem.
67943 + */
67944 +static int fill_super(struct super_block *super, void *data, int silent)
67945 +{
67946 + reiser4_context ctx;
67947 + int result;
67948 + reiser4_super_info_data *sbinfo;
67949 +
67950 + assert("zam-989", super != NULL);
67951 +
67952 + super->s_op = NULL;
67953 + init_stack_context(&ctx, super);
67954 +
67955 + /* allocate reiser4 specific super block */
67956 + if ((result = init_fs_info(super)) != 0)
67957 + goto failed_init_sinfo;
67958 +
67959 + sbinfo = get_super_private(super);
67960 + /* initialize various reiser4 parameters, parse mount options */
67961 + if ((result = init_super_data(super, data)) != 0)
67962 + goto failed_init_super_data;
67963 +
67964 + /* read reiser4 master super block, initialize disk format plugin */
67965 + if ((result = init_read_super(super, silent)) != 0)
67966 + goto failed_init_read_super;
67967 +
67968 + /* initialize transaction manager */
67969 + init_txnmgr(&sbinfo->tmgr);
67970 +
67971 + /* initialize ktxnmgrd context and start kernel thread ktxnmrgd */
67972 + if ((result = init_ktxnmgrd(super)) != 0)
67973 + goto failed_init_ktxnmgrd;
67974 +
67975 + /* initialize entd context and start kernel thread entd */
67976 + if ((result = init_entd(super)) != 0)
67977 + goto failed_init_entd;
67978 +
67979 + /* initialize address spaces for formatted nodes and bitmaps */
67980 + if ((result = init_formatted_fake(super)) != 0)
67981 + goto failed_init_formatted_fake;
67982 +
67983 + /* initialize disk format plugin */
67984 + if ((result = get_super_private(super)->df_plug->init_format(super, data)) != 0 )
67985 + goto failed_init_disk_format;
67986 +
67987 + /*
67988 + * There are some 'committed' versions of reiser4 super block counters,
67989 + * which correspond to reiser4 on-disk state. These counters are
67990 + * initialized here
67991 + */
67992 + sbinfo->blocks_free_committed = sbinfo->blocks_free;
67993 + sbinfo->nr_files_committed = oids_used(super);
67994 +
67995 + /* get inode of root directory */
67996 + if ((result = init_root_inode(super)) != 0)
67997 + goto failed_init_root_inode;
67998 +
67999 + process_safelinks(super);
68000 + reiser4_exit_context(&ctx);
68001 +
68002 + sbinfo->debugfs_root = debugfs_create_dir(super->s_id,
68003 + reiser4_debugfs_root);
68004 + if (sbinfo->debugfs_root) {
68005 + sbinfo->tmgr.debugfs_atom_count =
68006 + debugfs_create_u32("atom_count", S_IFREG|S_IRUSR,
68007 + sbinfo->debugfs_root,
68008 + &sbinfo->tmgr.atom_count);
68009 + sbinfo->tmgr.debugfs_id_count =
68010 + debugfs_create_u32("id_count", S_IFREG|S_IRUSR,
68011 + sbinfo->debugfs_root,
68012 + &sbinfo->tmgr.id_count);
68013 + }
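+	/*
+	 * Note (added; assuming debugfs is mounted at /sys/kernel/debug):
+	 * the two counters created above then appear as
+	 * /sys/kernel/debug/reiser4/<s_id>/atom_count and .../id_count.
+	 */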
68014 + return 0;
68015 +
68016 + failed_init_root_inode:
68017 + if (sbinfo->df_plug->release)
68018 + sbinfo->df_plug->release(super);
68019 + failed_init_disk_format:
68020 + done_formatted_fake(super);
68021 + failed_init_formatted_fake:
68022 + done_entd(super);
68023 + failed_init_entd:
68024 + done_ktxnmgrd(super);
68025 + failed_init_ktxnmgrd:
68026 + done_txnmgr(&sbinfo->tmgr);
68027 + failed_init_read_super:
68028 + failed_init_super_data:
68029 + done_fs_info(super);
68030 + failed_init_sinfo:
68031 + reiser4_exit_context(&ctx);
68032 + return result;
68033 +}
68034 +
68035 +/**
68036 + * reiser4_get_sb - get_sb of file_system_type operations
68037 + * @fs_type: file system type (here, reiser4_fs_type)
68038 + * @flags: mount flags MS_RDONLY, MS_VERBOSE, etc
68039 + * @dev_name: block device file name
68040 + * @data: specific mount options
68041 + *
68042 + * Reiser4 mount entry.
68043 + */
68044 +static struct super_block *reiser4_get_sb(struct file_system_type *fs_type,
68045 + int flags,
68046 + const char *dev_name,
68047 + void *data)
68048 +{
68049 + return get_sb_bdev(fs_type, flags, dev_name, data, fill_super);
68050 +}
68051 +
68052 +/* structure describing the reiser4 filesystem implementation */
68053 +static struct file_system_type reiser4_fs_type = {
68054 + .owner = THIS_MODULE,
68055 + .name = "reiser4",
68056 + .fs_flags = FS_REQUIRES_DEV,
68057 + .get_sb = reiser4_get_sb,
68058 + .kill_sb = kill_block_super,
68059 + .next = NULL
68060 +};
68061 +
68062 +void destroy_reiser4_cache(kmem_cache_t **cachep)
68063 +{
68064 + int result;
68065 +
68066 + BUG_ON(*cachep == NULL);
68067 + result = kmem_cache_destroy(*cachep);
68068 + BUG_ON(result != 0);
68069 + *cachep = NULL;
68070 +}
68071 +
68072 +struct dentry *reiser4_debugfs_root = NULL;
68073 +
68074 +/**
68075 + * init_reiser4 - reiser4 initialization entry point
68076 + *
68077 + * Initializes reiser4 slabs, registers reiser4 filesystem type. It is called
68078 + * on kernel initialization or during reiser4 module load.
68079 + */
68080 +static int __init init_reiser4(void)
68081 +{
68082 + int result;
68083 +
68084 + printk(KERN_INFO
68085 + "Loading Reiser4. "
68086 + "See www.namesys.com for a description of Reiser4.\n");
68087 +
68088 + /* initialize slab cache of inodes */
68089 + if ((result = init_inodes()) != 0)
68090 + goto failed_inode_cache;
68091 +
68092 + /* initialize cache of znodes */
68093 + if ((result = init_znodes()) != 0)
68094 + goto failed_init_znodes;
68095 +
68096 + /* initialize all plugins */
68097 + if ((result = init_plugins()) != 0)
68098 + goto failed_init_plugins;
68099 +
68100 + /* initialize cache of plugin_set-s and plugin_set's hash table */
68101 + if ((result = init_plugin_set()) != 0)
68102 + goto failed_init_plugin_set;
68103 +
68104 + /* initialize caches of txn_atom-s and txn_handle-s */
68105 + if ((result = init_txnmgr_static()) != 0)
68106 + goto failed_init_txnmgr_static;
68107 +
68108 + /* initialize cache of jnodes */
68109 + if ((result = init_jnodes()) != 0)
68110 + goto failed_init_jnodes;
68111 +
68112 + /* initialize cache of flush queues */
68113 + if ((result = init_fqs()) != 0)
68114 + goto failed_init_fqs;
68115 +
68116 + /* initialize cache of structures attached to dentry->d_fsdata */
68117 + if ((result = init_dentry_fsdata()) != 0)
68118 + goto failed_init_dentry_fsdata;
68119 +
68120 + /* initialize cache of structures attached to file->private_data */
68121 + if ((result = init_file_fsdata()) != 0)
68122 + goto failed_init_file_fsdata;
68123 +
68124 + /*
68125 + * initialize cache of d_cursors. See plugin/file_ops_readdir.c for
68126 + * more details
68127 + */
68128 + if ((result = init_d_cursor()) != 0)
68129 + goto failed_init_d_cursor;
68130 +
68131 + if ((result = register_filesystem(&reiser4_fs_type)) == 0) {
68132 + reiser4_debugfs_root = debugfs_create_dir("reiser4", NULL);
68133 + return 0;
68134 + }
68135 +
68136 + done_d_cursor();
68137 + failed_init_d_cursor:
68138 + done_file_fsdata();
68139 + failed_init_file_fsdata:
68140 + done_dentry_fsdata();
68141 + failed_init_dentry_fsdata:
68142 + done_fqs();
68143 + failed_init_fqs:
68144 + done_jnodes();
68145 + failed_init_jnodes:
68146 + done_txnmgr_static();
68147 + failed_init_txnmgr_static:
68148 + done_plugin_set();
68149 + failed_init_plugin_set:
68150 + failed_init_plugins:
68151 + done_znodes();
68152 + failed_init_znodes:
68153 + done_inodes();
68154 + failed_inode_cache:
68155 + return result;
68156 +}
68157 +
68158 +/**
68159 + * done_reiser4 - reiser4 exit entry point
68160 + *
68161 + * Unregister reiser4 filesystem type, deletes caches. It is called on shutdown
68162 + * or at module unload.
68163 + */
68164 +static void __exit done_reiser4(void)
68165 +{
68166 + int result;
68167 +
68168 + debugfs_remove(reiser4_debugfs_root);
68169 + result = unregister_filesystem(&reiser4_fs_type);
68170 + BUG_ON(result != 0);
68171 + done_d_cursor();
68172 + done_file_fsdata();
68173 + done_dentry_fsdata();
68174 + done_fqs();
68175 + done_jnodes();
68176 + done_txnmgr_static();
68177 + done_plugin_set();
68178 + done_znodes();
68179 + destroy_reiser4_cache(&inode_cache);
68180 +}
68181 +
68182 +module_init(init_reiser4);
68183 +module_exit(done_reiser4);
68184 +
68185 +MODULE_DESCRIPTION("Reiser4 filesystem");
68186 +MODULE_AUTHOR("Hans Reiser <Reiser@Namesys.COM>");
68187 +
68188 +MODULE_LICENSE("GPL");
68189 +
68190 +/*
68191 + * Local variables:
68192 + * c-indentation-style: "K&R"
68193 + * mode-name: "LC"
68194 + * c-basic-offset: 8
68195 + * tab-width: 8
68196 + * fill-column: 79
68197 + * End:
68198 + */
68199 Index: linux-2.6.16/fs/reiser4/tap.c
68200 ===================================================================
68201 --- /dev/null
68202 +++ linux-2.6.16/fs/reiser4/tap.c
68203 @@ -0,0 +1,377 @@
68204 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
68205 + * reiser4/README */
68206 +
68207 +/*
68208 + Tree Access Pointer (tap).
68209 +
68210 +   A tap is a data structure combining a coord and a lock handle (mostly). It
68211 +   is useful when one has to scan tree nodes (for example, in readdir or
68212 +   flush), because tap functions allow moving a tap in either direction,
68213 +   transparently crossing unit/item/node borders.
68214 +
68215 +   A tap doesn't provide automatic synchronization of its fields, as it is
68216 +   supposed to be a per-thread object.
68217 +*/
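+
+/*
+   Illustrative life-cycle of a tap (added sketch, not part of the original
+   patch; locking setup and error handling are elided):
+
+	coord_t coord;
+	lock_handle lh;
+	tap_t tap;
+
+	init_lh(&lh);
+	... lock some node and position @coord in it ...
+	tap_init(&tap, &coord, &lh, ZNODE_READ_LOCK);
+	if (tap_load(&tap) == 0) {
+		while (go_next_unit(&tap) == 0)
+			... process the unit at tap.coord ...
+		tap_relse(&tap);
+	}
+	tap_done(&tap);
+*/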
68218 +
68219 +#include "forward.h"
68220 +#include "debug.h"
68221 +#include "coord.h"
68222 +#include "tree.h"
68223 +#include "context.h"
68224 +#include "tap.h"
68225 +#include "znode.h"
68226 +#include "tree_walk.h"
68227 +
68228 +#if REISER4_DEBUG
68229 +static int tap_invariant(const tap_t * tap);
68230 +static void tap_check(const tap_t * tap);
68231 +#else
68232 +#define tap_check(tap) noop
68233 +#endif
68234 +
68235 +/** load node tap is pointing to, if not loaded already */
68236 +int tap_load(tap_t * tap)
68237 +{
68238 + tap_check(tap);
68239 + if (tap->loaded == 0) {
68240 + int result;
68241 +
68242 + result = zload_ra(tap->coord->node, &tap->ra_info);
68243 + if (result != 0)
68244 + return result;
68245 + coord_clear_iplug(tap->coord);
68246 + }
68247 + ++tap->loaded;
68248 + tap_check(tap);
68249 + return 0;
68250 +}
68251 +
68252 +/** release node tap is pointing to. Dual to tap_load() */
68253 +void tap_relse(tap_t * tap)
68254 +{
68255 + tap_check(tap);
68256 + if (tap->loaded > 0) {
68257 + --tap->loaded;
68258 + if (tap->loaded == 0) {
68259 + zrelse(tap->coord->node);
68260 + }
68261 + }
68262 + tap_check(tap);
68263 +}
68264 +
68265 +/**
68266 + * init tap to consist of @coord and @lh. Locks on nodes will be acquired with
68267 + * @mode
68268 + */
68269 +void
68270 +tap_init(tap_t * tap, coord_t * coord, lock_handle * lh, znode_lock_mode mode)
68271 +{
68272 + tap->coord = coord;
68273 + tap->lh = lh;
68274 + tap->mode = mode;
68275 + tap->loaded = 0;
68276 + INIT_LIST_HEAD(&tap->linkage);
68277 + init_ra_info(&tap->ra_info);
68278 +}
68279 +
68280 +/** add @tap to the per-thread list of all taps */
68281 +void tap_monitor(tap_t * tap)
68282 +{
68283 + assert("nikita-2623", tap != NULL);
68284 + tap_check(tap);
68285 + list_add(&tap->linkage, taps_list());
68286 + tap_check(tap);
68287 +}
68288 +
68289 +/* duplicate @src into @dst. Copy lock handle. @dst is not initially
68290 + * loaded. */
68291 +void tap_copy(tap_t * dst, tap_t * src)
68292 +{
68293 + assert("nikita-3193", src != NULL);
68294 + assert("nikita-3194", dst != NULL);
68295 +
68296 + *dst->coord = *src->coord;
68297 + if (src->lh->node)
68298 + copy_lh(dst->lh, src->lh);
68299 + dst->mode = src->mode;
68300 + dst->loaded = 0;
68301 + INIT_LIST_HEAD(&dst->linkage);
68302 + dst->ra_info = src->ra_info;
68303 +}
68304 +
68305 +/** finish with @tap */
68306 +void tap_done(tap_t * tap)
68307 +{
68308 + assert("nikita-2565", tap != NULL);
68309 + tap_check(tap);
68310 + if (tap->loaded > 0)
68311 + zrelse(tap->coord->node);
68312 + done_lh(tap->lh);
68313 + tap->loaded = 0;
68314 + list_del_init(&tap->linkage);
68315 + tap->coord->node = NULL;
68316 +}
68317 +
68318 +/**
68319 + * move @tap to the new node, locked with @target. Load @target, if @tap was
68320 + * already loaded.
68321 + */
68322 +int tap_move(tap_t * tap, lock_handle * target)
68323 +{
68324 + int result = 0;
68325 +
68326 + assert("nikita-2567", tap != NULL);
68327 + assert("nikita-2568", target != NULL);
68328 + assert("nikita-2570", target->node != NULL);
68329 + assert("nikita-2569", tap->coord->node == tap->lh->node);
68330 +
68331 + tap_check(tap);
68332 + if (tap->loaded > 0)
68333 + result = zload_ra(target->node, &tap->ra_info);
68334 +
68335 + if (result == 0) {
68336 + if (tap->loaded > 0)
68337 + zrelse(tap->coord->node);
68338 + done_lh(tap->lh);
68339 + copy_lh(tap->lh, target);
68340 + tap->coord->node = target->node;
68341 + coord_clear_iplug(tap->coord);
68342 + }
68343 + tap_check(tap);
68344 + return result;
68345 +}
68346 +
68347 +/**
68348 + * move @tap to @target. Acquire lock on @target, if @tap was already
68349 + * loaded.
68350 + */
68351 +static int tap_to(tap_t * tap, znode * target)
68352 +{
68353 + int result;
68354 +
68355 + assert("nikita-2624", tap != NULL);
68356 + assert("nikita-2625", target != NULL);
68357 +
68358 + tap_check(tap);
68359 + result = 0;
68360 + if (tap->coord->node != target) {
68361 + lock_handle here;
68362 +
68363 + init_lh(&here);
68364 + result = longterm_lock_znode(&here, target,
68365 + tap->mode, ZNODE_LOCK_HIPRI);
68366 + if (result == 0) {
68367 + result = tap_move(tap, &here);
68368 + done_lh(&here);
68369 + }
68370 + }
68371 + tap_check(tap);
68372 + return result;
68373 +}
68374 +
68375 +/**
68376 + * move @tap to given @target, loading and locking @target->node if
68377 + * necessary
68378 + */
68379 +int tap_to_coord(tap_t * tap, coord_t * target)
68380 +{
68381 + int result;
68382 +
68383 + tap_check(tap);
68384 + result = tap_to(tap, target->node);
68385 + if (result == 0)
68386 + coord_dup(tap->coord, target);
68387 + tap_check(tap);
68388 + return result;
68389 +}
68390 +
68391 +/** return list of all taps */
68392 +struct list_head *taps_list(void)
68393 +{
68394 + return &get_current_context()->taps;
68395 +}
68396 +
68397 +/** helper function for go_{next,prev}_{item,unit,node}() */
68398 +int go_dir_el(tap_t * tap, sideof dir, int units_p)
68399 +{
68400 + coord_t dup;
68401 + coord_t *coord;
68402 + int result;
68403 +
68404 + int (*coord_dir) (coord_t *);
68405 + int (*get_dir_neighbor) (lock_handle *, znode *, int, int);
68406 + void (*coord_init) (coord_t *, const znode *);
68407 + ON_DEBUG(int (*coord_check) (const coord_t *));
68408 +
68409 + assert("nikita-2556", tap != NULL);
68410 + assert("nikita-2557", tap->coord != NULL);
68411 + assert("nikita-2558", tap->lh != NULL);
68412 + assert("nikita-2559", tap->coord->node != NULL);
68413 +
68414 + tap_check(tap);
68415 + if (dir == LEFT_SIDE) {
68416 + coord_dir = units_p ? coord_prev_unit : coord_prev_item;
68417 + get_dir_neighbor = reiser4_get_left_neighbor;
68418 + coord_init = coord_init_last_unit;
68419 + } else {
68420 + coord_dir = units_p ? coord_next_unit : coord_next_item;
68421 + get_dir_neighbor = reiser4_get_right_neighbor;
68422 + coord_init = coord_init_first_unit;
68423 + }
68424 + ON_DEBUG(coord_check =
68425 + units_p ? coord_is_existing_unit : coord_is_existing_item);
68426 + assert("nikita-2560", coord_check(tap->coord));
68427 +
68428 + coord = tap->coord;
68429 + coord_dup(&dup, coord);
68430 + if (coord_dir(&dup) != 0) {
68431 + do {
68432 +			/* move to the neighboring node in the scan direction */
68433 + lock_handle dup;
68434 +
68435 + init_lh(&dup);
68436 + result =
68437 + get_dir_neighbor(&dup, coord->node, (int)tap->mode,
68438 + GN_CAN_USE_UPPER_LEVELS);
68439 + if (result == 0) {
68440 + result = tap_move(tap, &dup);
68441 + if (result == 0)
68442 + coord_init(tap->coord, dup.node);
68443 + done_lh(&dup);
68444 + }
68445 + /* skip empty nodes */
68446 + } while ((result == 0) && node_is_empty(coord->node));
68447 + } else {
68448 + result = 0;
68449 + coord_dup(coord, &dup);
68450 + }
68451 + assert("nikita-2564", ergo(!result, coord_check(tap->coord)));
68452 + tap_check(tap);
68453 + return result;
68454 +}
68455 +
68456 +/**
68457 + * move @tap to the next unit, transparently crossing item and node
68458 + * boundaries
68459 + */
68460 +int go_next_unit(tap_t * tap)
68461 +{
68462 + return go_dir_el(tap, RIGHT_SIDE, 1);
68463 +}
68464 +
68465 +/**
68466 + * move @tap to the previous unit, transparently crossing item and node
68467 + * boundaries
68468 + */
68469 +int go_prev_unit(tap_t * tap)
68470 +{
68471 + return go_dir_el(tap, LEFT_SIDE, 1);
68472 +}
68473 +
68474 +/**
68475 + * @shift times apply @actor to the @tap. This is used to move @tap by
68476 + * @shift units (or items, or nodes) in either direction.
68477 + */
68478 +static int rewind_to(tap_t * tap, go_actor_t actor, int shift)
68479 +{
68480 + int result;
68481 +
68482 + assert("nikita-2555", shift >= 0);
68483 + assert("nikita-2562", tap->coord->node == tap->lh->node);
68484 +
68485 + tap_check(tap);
68486 + result = tap_load(tap);
68487 + if (result != 0)
68488 + return result;
68489 +
68490 + for (; shift > 0; --shift) {
68491 + result = actor(tap);
68492 + assert("nikita-2563", tap->coord->node == tap->lh->node);
68493 + if (result != 0)
68494 + break;
68495 + }
68496 + tap_relse(tap);
68497 + tap_check(tap);
68498 + return result;
68499 +}
68500 +
68501 +/** move @tap @shift units rightward */
68502 +int rewind_right(tap_t * tap, int shift)
68503 +{
68504 + return rewind_to(tap, go_next_unit, shift);
68505 +}
68506 +
68507 +/** move @tap @shift units leftward */
68508 +int rewind_left(tap_t * tap, int shift)
68509 +{
68510 + return rewind_to(tap, go_prev_unit, shift);
68511 +}
68512 +
68513 +#if REISER4_DEBUG
68514 +/** debugging function: print @tap content in human readable form */
68515 +static void print_tap(const char *prefix, const tap_t * tap)
68516 +{
68517 + if (tap == NULL) {
68518 + printk("%s: null tap\n", prefix);
68519 + return;
68520 + }
68521 + printk("%s: loaded: %i, in-list: %i, node: %p, mode: %s\n", prefix,
68522 + tap->loaded, (&tap->linkage == tap->linkage.next &&
68523 + &tap->linkage == tap->linkage.prev),
68524 + tap->lh->node,
68525 + lock_mode_name(tap->mode));
68526 + print_coord("\tcoord", tap->coord, 0);
68527 +}
68528 +
68529 +/** check [tap-sane] invariant */
68530 +static int tap_invariant(const tap_t * tap)
68531 +{
68532 + /* [tap-sane] invariant */
68533 +
68534 + if (tap == NULL)
68535 + return 1;
68536 + /* tap->mode is one of
68537 + *
68538 + * {ZNODE_NO_LOCK, ZNODE_READ_LOCK, ZNODE_WRITE_LOCK}, and
68539 + */
68540 + if (tap->mode != ZNODE_NO_LOCK &&
68541 + tap->mode != ZNODE_READ_LOCK && tap->mode != ZNODE_WRITE_LOCK)
68542 + return 2;
68543 + /* tap->coord != NULL, and */
68544 + if (tap->coord == NULL)
68545 + return 3;
68546 + /* tap->lh != NULL, and */
68547 + if (tap->lh == NULL)
68548 + return 4;
68549 + /* tap->loaded > 0 => znode_is_loaded(tap->coord->node), and */
68550 + if (!ergo(tap->loaded, znode_is_loaded(tap->coord->node)))
68551 + return 5;
68552 + /* tap->coord->node == tap->lh->node if tap->lh->node is not 0 */
68553 + if (tap->lh->node != NULL && tap->coord->node != tap->lh->node)
68554 + return 6;
68555 + return 0;
68556 +}
68557 +
68558 +/** debugging function: check internal @tap consistency */
68559 +static void tap_check(const tap_t * tap)
68560 +{
68561 + int result;
68562 +
68563 + result = tap_invariant(tap);
68564 + if (result != 0) {
68565 + print_tap("broken", tap);
68566 + reiser4_panic("nikita-2831", "tap broken: %i\n", result);
68567 + }
68568 +}
68569 +#endif
68570 +
68571 +/* Make Linus happy.
68572 + Local variables:
68573 + c-indentation-style: "K&R"
68574 + mode-name: "LC"
68575 + c-basic-offset: 8
68576 + tab-width: 8
68577 + fill-column: 120
68578 + scroll-step: 1
68579 + End:
68580 +*/
68581 Index: linux-2.6.16/fs/reiser4/tap.h
68582 ===================================================================
68583 --- /dev/null
68584 +++ linux-2.6.16/fs/reiser4/tap.h
68585 @@ -0,0 +1,69 @@
68586 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
68587 +
68588 +/* Tree Access Pointers. See tap.c for more details. */
68589 +
68590 +#if !defined( __REISER4_TAP_H__ )
68591 +#define __REISER4_TAP_H__
68592 +
68593 +#include "forward.h"
68594 +#include "readahead.h"
68595 +
68596 +/**
68597 + tree_access_pointer aka tap. Data structure combining coord_t and lock
68598 + handle.
68599 +   For invariants involving this data type, see doc/lock-ordering:
68600 +
68601 + [tap-sane]
68602 + */
68603 +struct tree_access_pointer {
68604 + /* coord tap is at */
68605 + coord_t *coord;
68606 + /* lock handle on ->coord->node */
68607 + lock_handle *lh;
68608 + /* mode of lock acquired by this tap */
68609 + znode_lock_mode mode;
68610 + /* incremented by tap_load(). Decremented by tap_relse(). */
68611 + int loaded;
68612 + /* list of taps */
68613 + struct list_head linkage;
68614 + /* read-ahead hint */
68615 + ra_info_t ra_info;
68616 +};
68617 +
68618 +typedef int (*go_actor_t) (tap_t * tap);
68619 +
68620 +extern int tap_load(tap_t * tap);
68621 +extern void tap_relse(tap_t * tap);
68622 +extern void tap_init(tap_t * tap, coord_t * coord, lock_handle * lh,
68623 + znode_lock_mode mode);
68624 +extern void tap_monitor(tap_t * tap);
68625 +extern void tap_copy(tap_t * dst, tap_t * src);
68626 +extern void tap_done(tap_t * tap);
68627 +extern int tap_move(tap_t * tap, lock_handle * target);
68628 +extern int tap_to_coord(tap_t * tap, coord_t * target);
68629 +
68630 +extern int go_dir_el(tap_t * tap, sideof dir, int units_p);
68631 +extern int go_next_unit(tap_t * tap);
68632 +extern int go_prev_unit(tap_t * tap);
68633 +extern int rewind_right(tap_t * tap, int shift);
68634 +extern int rewind_left(tap_t * tap, int shift);
68635 +
68636 +extern struct list_head *taps_list(void);
68637 +
68638 +#define for_all_taps(tap) \
68639 + for (tap = list_entry(taps_list()->next, tap_t, linkage); \
68640 + taps_list() != &tap->linkage; \
68641 + tap = list_entry(tap->linkage.next, tap_t, linkage))
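+
+/*
+ * Example (added, illustrative): walk all taps monitored for the current
+ * thread:
+ *
+ *	tap_t *scan;
+ *
+ *	for_all_taps(scan)
+ *		... inspect @scan ...
+ */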
68642 +
68643 +/* __REISER4_TAP_H__ */
68644 +#endif
68645 +/* Make Linus happy.
68646 + Local variables:
68647 + c-indentation-style: "K&R"
68648 + mode-name: "LC"
68649 + c-basic-offset: 8
68650 + tab-width: 8
68651 + fill-column: 120
68652 + scroll-step: 1
68653 + End:
68654 +*/
68655 Index: linux-2.6.16/fs/reiser4/tree.c
68656 ===================================================================
68657 --- /dev/null
68658 +++ linux-2.6.16/fs/reiser4/tree.c
68659 @@ -0,0 +1,1875 @@
68660 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
68661 + * reiser4/README */
68662 +
68663 +/*
68664 + * KEYS IN A TREE.
68665 + *
68666 + * The tree consists of nodes located on the disk. A node in the tree is
68667 + * either formatted or unformatted. A formatted node is one whose structure is
68668 + * understood by the tree balancing and traversal code. Formatted nodes are
68669 + * further classified into leaf and internal nodes. The latter distinction is
68670 + * (almost) of only historical importance: the general structure of leaves and
68671 + * internal nodes is the same in Reiser4. Unformatted nodes contain raw data
68672 + * that are part of bodies of ordinary files and attributes.
68673 + *
68674 + * Each node in the tree spans some interval in the key space. Key ranges for
68675 + * all nodes in the tree are disjoint. Actually, this only holds in some weak
68676 + * sense, because of non-unique keys: the intersection of key ranges for
68677 + * different nodes is either empty, or consists of exactly one key.
68678 + *
68679 + * A formatted node consists of a sequence of items. Each item spans some
68680 + * interval in key space. Key ranges for all items in a tree are disjoint,
68681 + * modulo non-unique keys again. Items within nodes are ordered in the key
68682 + * order of the smallest key in an item.
68683 + *
68684 + * A particular type of item can be further split into units. A unit is a piece
68685 + * of an item that can be cut from it and moved into another item of the same
68686 + * type. Units are used by the balancing code to repack data during balancing.
68687 + *
68688 + * Unit can be further split into smaller entities (for example, extent unit
68689 + * represents several pages, and it is natural for extent code to operate on
68690 + * particular pages and even bytes within one unit), but this is of no
68691 + * relevance to the generic balancing and lookup code.
68692 + *
68693 + * Although an item is said to "span" a range or interval of keys, it is not
68694 + * necessary that the item contains a piece of data addressable by each and
68695 + * every key in this range. For example, a compound directory item, consisting
68696 + * of units corresponding to directory entries and keyed by hashes of file
68697 + * names, looks more like having a "discrete spectrum": only some disjoint
68698 + * keys inside the range occupied by this item really address data.
68699 + *
68700 + * Nonetheless, each item always has a well-defined least (minimal) key, which
68701 + * is recorded in the item header, stored in the node this item is in. Also,
68702 + * an item plugin can optionally define a method ->max_key_inside() returning
68703 + * the maximal key that can _possibly_ be located within this item. This method
68704 + * is used (mainly) to determine when a given piece of data should be merged
68705 + * into an existing item, instead of creating a new one. Because of this, even
68706 + * though ->max_key_inside() can be larger than any key actually located in the item,
68707 + * intervals
68708 + *
68709 + * [ min_key( item ), ->max_key_inside( item ) ]
68710 + *
68711 + * are still disjoint for all items within the _same_ node.
68712 + *
68713 + * In memory, a node is represented by a znode. It plays several roles:
68714 + *
68715 + * . something locks are taken on
68716 + *
68717 + * . something tracked by transaction manager (this is going to change)
68718 + *
68719 + * . something used to access node data
68720 + *
68721 + * . something used to maintain tree structure in memory: sibling and
68722 + * parental linkage.
68723 + *
68724 + * . something used to organize nodes into "slums"
68725 + *
68726 + * For more on znodes, see znode.[ch]
68727 + *
68728 + * DELIMITING KEYS
68729 + *
68730 + * To simplify balancing, allow some flexibility in locking and speed up an
68731 + * important coord cache optimization, we keep delimiting keys of nodes in
68732 + * memory. Depending on the disk format (implemented by the appropriate node
68733 + * plugin), a node on disk can record both left and right delimiting keys,
68734 + * only one of them, or none. Still, our balancing and tree traversal code
68735 + * keeps both delimiting keys of an in-memory node stored in its znode. When a
68736 + * node is first brought into memory during tree traversal, its left
68737 + * delimiting key is taken from its parent, and its right delimiting key is
68738 + * either the next key in its parent, or the right delimiting key of the
68739 + * parent if the node is the rightmost child of the parent.
68740 + *
68741 + * Physical consistency of delimiting keys is protected by a special dk
68742 + * read-write lock. That is, delimiting keys can only be inspected or
68743 + * modified under this lock. But the dk lock is only sufficient for a fast
68744 + * "pessimistic" check, because to simplify code and to decrease lock
68745 + * contention, balancing (carry) only updates delimiting keys right before
68746 + * unlocking all locked nodes on the given tree level. For example, the
68747 + * coord-by-key cache scans the LRU list of recently accessed znodes. For each
68748 + * node it first does a fast check under the dk lock. If the key looked for is
68749 + * not between the delimiting keys of this node, the next node is inspected and
68750 + * so on. If the key is inside the key range, a long term lock is taken on the
68751 + * node and the key range is rechecked.
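+ *
+ * A sketch of that fast check (added illustration; the dk helpers named
+ * below are assumptions based on other files of this patch):
+ *
+ *	read_lock_dk(tree);
+ *	matches = keyle(znode_get_ld_key(node), key) &&
+ *		  keylt(key, znode_get_rd_key(node));
+ *	read_unlock_dk(tree);
+ *	if (matches)
+ *		... take a long term lock and recheck ...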
68752 + *
68753 + * COORDINATES
68754 + *
68755 + * To find something in the tree, you supply a key, and the key is resolved
68756 + * by coord_by_key() into a coord (coordinate) that is valid as long as the
68757 + * node the coord points to remains locked. As mentioned above trees
68758 + * consist of nodes that consist of items that consist of units. A unit is
68759 + * the smallest and indivisible piece of tree as far as balancing and tree
68760 + * search are concerned. Each node, item, and unit can be addressed by
68761 + * giving its level in the tree and the key occupied by this entity. A node
68762 + * knows what the key ranges are of the items within it, and how to find its
68763 + * items and invoke their item handlers, but it does not know how to access
68764 + * individual units within its items except through the item handlers.
68765 + * coord is a structure containing a pointer to the node, the ordinal number
68766 + * of the item within this node (a sort of item offset), and the ordinal
68767 + * number of the unit within this item.
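+ *
+ * For example (added note): a coord with .node == N, .item_pos == 2 and
+ * .unit_pos == 0 addresses the first unit of the third item in node N.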
68768 + *
68769 + * TREE LOOKUP
68770 + *
68771 + * There are two types of access to the tree: lookup and modification.
68772 + *
68773 + * Lookup is a search for the key in the tree. Search can look for either
68774 + * exactly the key given to it, or for the largest key that is not greater
68775 + * than the key given to it. This distinction is determined by "bias"
68776 + * parameter of search routine (coord_by_key()). coord_by_key() either
68777 + * returns error (key is not in the tree, or some kind of external error
68778 + * occurred), or successfully resolves key into coord.
68779 + *
68780 + * This resolution is done by traversing the tree top-to-bottom from the root
68781 + * level to the desired level. On levels above the twig level (the level one
68782 + * above the leaf level) nodes consist exclusively of internal items. An
68783 + * internal item is nothing more than a pointer to the tree node on the child
68784 + * level. On the twig level nodes consist of internal items intermixed with
68785 + * extent items. Internal items form the normal search tree structure used by
68786 + * traversal to descend through the tree.
68787 + *
68788 + * TREE LOOKUP OPTIMIZATIONS
68789 + *
68790 + * The tree lookup described above is expensive even if all nodes traversed
68791 + * are already in memory: a binary search has to be performed within each
68792 + * node, and binary searches are CPU consuming and tend to destroy CPU
68793 + * caches.
68794 + *
68795 + * Several optimizations are used to work around this:
68796 + *
68797 + * . cbk_cache (look-aside cache for tree traversals, see search.c for
68798 + * details)
68799 + *
68800 + * . seals (see seal.[ch])
68801 + *
68802 + * . vroot (see search.c)
68803 + *
68804 + * General search-by-key is layered thusly:
68805 + *
68806 + * [check seal, if any] --ok--> done
68807 + * |
68808 + * failed
68809 + * |
68810 + * V
68811 + * [vroot defined] --no--> node = tree_root
68812 + * | |
68813 + * yes |
68814 + * | |
68815 + * V |
68816 + * node = vroot |
68817 + * | |
68818 + * | |
68819 + * | |
68820 + * V V
68821 + * [check cbk_cache for key] --ok--> done
68822 + * |
68823 + * failed
68824 + * |
68825 + * V
68826 + * [start tree traversal from node]
68827 + *
68828 + */
68829 +
68830 +#include "forward.h"
68831 +#include "debug.h"
68832 +#include "dformat.h"
68833 +#include "key.h"
68834 +#include "coord.h"
68835 +#include "plugin/item/static_stat.h"
68836 +#include "plugin/item/item.h"
68837 +#include "plugin/node/node.h"
68838 +#include "plugin/plugin.h"
68839 +#include "txnmgr.h"
68840 +#include "jnode.h"
68841 +#include "znode.h"
68842 +#include "block_alloc.h"
68843 +#include "tree_walk.h"
68844 +#include "carry.h"
68845 +#include "carry_ops.h"
68846 +#include "tap.h"
68847 +#include "tree.h"
68848 +#include "vfs_ops.h"
68849 +#include "page_cache.h"
68850 +#include "super.h"
68851 +#include "reiser4.h"
68852 +#include "inode.h"
68853 +
68854 +#include <linux/fs.h> /* for struct super_block */
68855 +#include <linux/spinlock.h>
68856 +
68857 +/* Disk address (block number) that is never used for any real tree node; it
68858 +   is used as the block number of the "uber" znode.
68859 +
68860 + Invalid block addresses are 0 by tradition.
68861 +
68862 +*/
68863 +const reiser4_block_nr UBER_TREE_ADDR = 0ull;
68864 +
68865 +#define CUT_TREE_MIN_ITERATIONS 64
68866 +
68867 +static int find_child_by_addr(znode * parent, znode * child, coord_t * result);
68868 +
68869 +/* return node plugin of coord->node */
68870 +node_plugin *node_plugin_by_coord(const coord_t * coord)
68871 +{
68872 + assert("vs-1", coord != NULL);
68873 + assert("vs-2", coord->node != NULL);
68874 +
68875 + return coord->node->nplug;
68876 +}
68877 +
68878 +/* insert item into tree. Fields of @coord are updated so that they can be
68879 + * used by a subsequent insert operation. */
68880 +insert_result insert_by_key(reiser4_tree * tree /* tree to insert new item
68881 + * into */ ,
68882 + const reiser4_key * key /* key of new item */ ,
68883 + reiser4_item_data * data /* parameters for item
68884 + * creation */ ,
68885 + coord_t * coord /* resulting insertion coord */ ,
68886 + lock_handle * lh /* resulting lock
68887 + * handle */ ,
68888 + tree_level stop_level /** level where to insert */ ,
68889 + __u32 flags /* insertion flags */ )
68890 +{
68891 + int result;
68892 +
68893 + assert("nikita-358", tree != NULL);
68894 + assert("nikita-360", coord != NULL);
68895 +
68896 + result = coord_by_key(tree, key, coord, lh, ZNODE_WRITE_LOCK,
68897 + FIND_EXACT, stop_level, stop_level,
68898 + flags | CBK_FOR_INSERT, NULL /*ra_info */ );
68899 + switch (result) {
68900 + default:
68901 + break;
68902 + case CBK_COORD_FOUND:
68903 + result = IBK_ALREADY_EXISTS;
68904 + break;
68905 + case CBK_COORD_NOTFOUND:
68906 + assert("nikita-2017", coord->node != NULL);
68907 + result = insert_by_coord(coord, data, key, lh, 0 /*flags */ );
68908 + break;
68909 + }
68910 + return result;
68911 +}
68912 +
68913 +/* insert item by calling carry. Helper function called if short-cut
68914 + insertion failed */
68915 +static insert_result insert_with_carry_by_coord(coord_t * coord, /* coord where to insert */
68916 + lock_handle * lh, /* lock handle of insertion
68917 + * node */
68918 + reiser4_item_data * data, /* parameters of new
68919 + * item */
68920 + const reiser4_key * key, /* key of new item */
68921 + carry_opcode cop, /* carry operation to perform */
68922 + cop_insert_flag flags
68923 + /* carry flags */ )
68924 +{
68925 + int result;
68926 + carry_pool *pool;
68927 + carry_level *lowest_level;
68928 + carry_insert_data *cdata;
68929 + carry_op *op;
68930 +
68931 + assert("umka-314", coord != NULL);
68932 +
68933 + /* allocate carry_pool and 3 carry_level-s */
68934 + pool =
68935 + init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
68936 + sizeof(*cdata));
68937 + if (IS_ERR(pool))
68938 + return PTR_ERR(pool);
68939 + lowest_level = (carry_level *) (pool + 1);
68940 + init_carry_level(lowest_level, pool);
68941 +
68942 + op = post_carry(lowest_level, cop, coord->node, 0);
68943 + if (IS_ERR(op) || (op == NULL)) {
68944 + done_carry_pool(pool);
68945 + return RETERR(op ? PTR_ERR(op) : -EIO);
68946 + }
68947 + cdata = (carry_insert_data *) (lowest_level + 3);
68948 + cdata->coord = coord;
68949 + cdata->data = data;
68950 + cdata->key = key;
68951 + op->u.insert.d = cdata;
68952 + if (flags == 0)
68953 + flags = znode_get_tree(coord->node)->carry.insert_flags;
68954 + op->u.insert.flags = flags;
68955 + op->u.insert.type = COPT_ITEM_DATA;
68956 + op->u.insert.child = NULL;
68957 + if (lh != NULL) {
68958 + assert("nikita-3245", lh->node == coord->node);
68959 + lowest_level->track_type = CARRY_TRACK_CHANGE;
68960 + lowest_level->tracked = lh;
68961 + }
68962 +
68963 + result = carry(lowest_level, NULL);
68964 + done_carry_pool(pool);
68965 +
68966 + return result;
68967 +}
68968 +
68969 +/* form carry queue to perform paste of @data with @key at @coord, and launch
68970 + its execution by calling carry().
68971 +
68972 +   Instruct carry to update @lh if, after balancing, the insertion coord
68973 +   moves into a different block.
68974 +
68975 +*/
68976 +static int paste_with_carry(coord_t * coord, /* coord of paste */
68977 + lock_handle * lh, /* lock handle of node
68978 + * where item is
68979 + * pasted */
68980 + reiser4_item_data * data, /* parameters of new
68981 + * item */
68982 + const reiser4_key * key, /* key of new item */
68983 + unsigned flags /* paste flags */ )
68984 +{
68985 + int result;
68986 + carry_pool *pool;
68987 + carry_level *lowest_level;
68988 + carry_insert_data *cdata;
68989 + carry_op *op;
68990 +
68991 + assert("umka-315", coord != NULL);
68992 + assert("umka-316", key != NULL);
68993 +
68994 + pool =
68995 + init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
68996 + sizeof(*cdata));
68997 + if (IS_ERR(pool))
68998 + return PTR_ERR(pool);
68999 + lowest_level = (carry_level *) (pool + 1);
69000 + init_carry_level(lowest_level, pool);
69001 +
69002 + op = post_carry(lowest_level, COP_PASTE, coord->node, 0);
69003 + if (IS_ERR(op) || (op == NULL)) {
69004 + done_carry_pool(pool);
69005 + return RETERR(op ? PTR_ERR(op) : -EIO);
69006 + }
69007 + cdata = (carry_insert_data *) (lowest_level + 3);
69008 + cdata->coord = coord;
69009 + cdata->data = data;
69010 + cdata->key = key;
69011 + op->u.paste.d = cdata;
69012 + if (flags == 0)
69013 + flags = znode_get_tree(coord->node)->carry.paste_flags;
69014 + op->u.paste.flags = flags;
69015 + op->u.paste.type = COPT_ITEM_DATA;
69016 + if (lh != NULL) {
69017 + lowest_level->track_type = CARRY_TRACK_CHANGE;
69018 + lowest_level->tracked = lh;
69019 + }
69020 +
69021 + result = carry(lowest_level, NULL);
69022 + done_carry_pool(pool);
69023 +
69024 + return result;
69025 +}
69026 +
69027 +/* insert item at the given coord.
69028 +
69029 + First try to skip carry by directly calling ->create_item() method of node
69030 + plugin. If this is impossible (there is not enough free space in the node,
69031 +   or the item would become the leftmost in the node), call insert_with_carry_by_coord()
69032 + that will do full carry().
69033 +
69034 +*/
69035 +insert_result insert_by_coord(coord_t * coord /* coord where to
69036 + * insert. coord->node has
69037 + * to be write locked by
69038 + * caller */ ,
69039 + reiser4_item_data * data /* data to be
69040 + * inserted */ ,
69041 + const reiser4_key * key /* key of new item */ ,
69042 + lock_handle * lh /* lock handle of write
69043 + * lock on node */ ,
69044 + __u32 flags /* insertion flags */ )
69045 +{
69046 + unsigned item_size;
69047 + int result;
69048 + znode *node;
69049 +
69050 + assert("vs-247", coord != NULL);
69051 + assert("vs-248", data != NULL);
69052 + assert("vs-249", data->length >= 0);
69053 + assert("nikita-1191", znode_is_write_locked(coord->node));
69054 +
69055 + node = coord->node;
69056 + coord_clear_iplug(coord);
69057 + result = zload(node);
69058 + if (result != 0)
69059 + return result;
69060 +
69061 + item_size = space_needed(node, NULL, data, 1);
69062 + if (item_size > znode_free_space(node) &&
69063 + (flags & COPI_DONT_SHIFT_LEFT) && (flags & COPI_DONT_SHIFT_RIGHT)
69064 + && (flags & COPI_DONT_ALLOCATE)) {
69065 + /* we are forced to use free space of coord->node and new item
69066 + does not fit into it.
69067 +
69068 + Currently we get here only when we allocate and copy units
69069 + of extent item from a node to its left neighbor during
69070 + "squalloc"-ing. If @node (this is left neighbor) does not
69071 + have enough free space - we do not want to attempt any
69072 + shifting and allocations because we are in squeezing and
69073 + everything to the left of @node is tightly packed.
69074 + */
69075 + result = -E_NODE_FULL;
69076 + } else if ((item_size <= znode_free_space(node)) &&
69077 + !coord_is_before_leftmost(coord) &&
69078 + (node_plugin_by_node(node)->fast_insert != NULL)
69079 + && node_plugin_by_node(node)->fast_insert(coord)) {
69080 + /* shortcut insertion without carry() overhead.
69081 +
69082 + Only possible if:
69083 +
69084 + - there is enough free space
69085 +
69086 + - insertion is not into the leftmost position in a node
69087 + (otherwise it would require updating of delimiting key in a
69088 + parent)
69089 +
69090 + - node plugin agrees with this
69091 +
69092 + */
69093 + result =
69094 + node_plugin_by_node(node)->create_item(coord, key, data,
69095 + NULL);
69096 + znode_make_dirty(node);
69097 + } else {
69098 + /* otherwise do full-fledged carry(). */
69099 + result =
69100 + insert_with_carry_by_coord(coord, lh, data, key, COP_INSERT,
69101 + flags);
69102 + }
69103 + zrelse(node);
69104 + return result;
69105 +}
69106 +
69107 +/* @coord is set to leaf level and @data is to be inserted to twig level */
69108 +insert_result
69109 +insert_extent_by_coord(coord_t * coord
69110 +		       /* coord where to insert. coord->node has to be
69111 +			* write locked by caller */
69112 +		       ,
69113 +		       reiser4_item_data * data /* data to be inserted */ ,
69114 +		       const reiser4_key * key /* key of new item */ ,
69115 +		       lock_handle * lh
69116 +		       /* lock handle of write lock on node */ )
69117 +{
69118 + assert("vs-405", coord != NULL);
69119 + assert("vs-406", data != NULL);
69120 + assert("vs-407", data->length > 0);
69121 + assert("vs-408", znode_is_write_locked(coord->node));
69122 + assert("vs-409", znode_get_level(coord->node) == LEAF_LEVEL);
69123 +
69124 + return insert_with_carry_by_coord(coord, lh, data, key, COP_EXTENT,
69125 + 0 /*flags */ );
69126 +}
69127 +
69128 +/* Insert into the item at the given coord.
69129 +
69130 + First try to skip carry by directly calling ->paste() method of item
69131 + plugin. If this is impossible (there is not enough free space in the node,
69132 +   or we are pasting into the leftmost position in the node), call
69133 + paste_with_carry() that will do full carry().
69134 +
69135 +*/
69136 +/* paste_into_item */
69137 +int insert_into_item(coord_t * coord /* coord of pasting */ ,
69138 + lock_handle * lh /* lock handle on node involved */ ,
69139 + const reiser4_key * key /* key of unit being pasted */ ,
69140 + reiser4_item_data * data /* parameters for new unit */ ,
69141 + unsigned flags /* insert/paste flags */ )
69142 +{
69143 + int result;
69144 + int size_change;
69145 + node_plugin *nplug;
69146 + item_plugin *iplug;
69147 +
69148 + assert("umka-317", coord != NULL);
69149 + assert("umka-318", key != NULL);
69150 +
69151 + iplug = item_plugin_by_coord(coord);
69152 + nplug = node_plugin_by_coord(coord);
69153 +
69154 + assert("nikita-1480", iplug == data->iplug);
69155 +
69156 + size_change = space_needed(coord->node, coord, data, 0);
69157 + if (size_change > (int)znode_free_space(coord->node) &&
69158 + (flags & COPI_DONT_SHIFT_LEFT) && (flags & COPI_DONT_SHIFT_RIGHT)
69159 + && (flags & COPI_DONT_ALLOCATE)) {
69160 + /* we are forced to use free space of coord->node and new data
69161 + does not fit into it. */
69162 + return -E_NODE_FULL;
69163 + }
69164 +
69165 + /* shortcut paste without carry() overhead.
69166 +
69167 + Only possible if:
69168 +
69169 + - there is enough free space
69170 +
69171 + - paste is not into the leftmost unit in a node (otherwise
69172 + it would require updating of delimiting key in a parent)
69173 +
69174 + - node plugin agrees with this
69175 +
69176 + - item plugin agrees with us
69177 + */
69178 + if (size_change <= (int)znode_free_space(coord->node) &&
69179 + (coord->item_pos != 0 ||
69180 + coord->unit_pos != 0 || coord->between == AFTER_UNIT) &&
69181 + coord->unit_pos != 0 && nplug->fast_paste != NULL &&
69182 + nplug->fast_paste(coord) &&
69183 + iplug->b.fast_paste != NULL && iplug->b.fast_paste(coord)) {
69184 + if (size_change > 0)
69185 + nplug->change_item_size(coord, size_change);
69186 + /* NOTE-NIKITA: huh? where @key is used? */
69187 + result = iplug->b.paste(coord, data, NULL);
69188 + if (size_change < 0)
69189 + nplug->change_item_size(coord, size_change);
69190 + znode_make_dirty(coord->node);
69191 + } else
69192 + /* otherwise do full-fledged carry(). */
69193 + result = paste_with_carry(coord, lh, data, key, flags);
69194 + return result;
69195 +}
69196 +
69197 +/* this either appends or truncates item @coord */
69198 +int resize_item(coord_t * coord /* coord of item being resized */ ,
69199 + reiser4_item_data * data /* parameters of resize */ ,
69200 + reiser4_key * key /* key of new unit */ ,
69201 + lock_handle * lh /* lock handle of node
69202 + * being modified */ ,
69203 + cop_insert_flag flags /* carry flags */ )
69204 +{
69205 + int result;
69206 + znode *node;
69207 +
69208 + assert("nikita-362", coord != NULL);
69209 + assert("nikita-363", data != NULL);
69210 + assert("vs-245", data->length != 0);
69211 +
69212 + node = coord->node;
69213 + coord_clear_iplug(coord);
69214 + result = zload(node);
69215 + if (result != 0)
69216 + return result;
69217 +
69218 + if (data->length < 0)
69219 + result = node_plugin_by_coord(coord)->shrink_item(coord,
69220 + -data->length);
69221 + else
69222 + result = insert_into_item(coord, lh, key, data, flags);
69223 +
69224 + zrelse(node);
69225 + return result;
69226 +}
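/* Editorial sketch (not part of the patch): resize_item() dispatches on
 * the sign of data->length, so a hypothetical caller growing an item by
 * 40 bytes, or shrinking it by 16, differs only in that field: */
#if 0	/* illustration only */
static int resize_item_usage_sketch(coord_t *coord, reiser4_item_data *data,
				    reiser4_key *key, lock_handle *lh)
{
	int result;

	data->length = 40;	/* positive: grows via insert_into_item() */
	result = resize_item(coord, data, key, lh, 0 /* flags */);
	if (result != 0)
		return result;

	data->length = -16;	/* negative: nplug->shrink_item(coord, 16) */
	return resize_item(coord, data, key, lh, 0 /* flags */);
}
#endif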
69227 +
69228 +/* insert flow @f */
69229 +int insert_flow(coord_t * coord, lock_handle * lh, flow_t * f)
69230 +{
69231 + int result;
69232 + carry_pool *pool;
69233 + carry_level *lowest_level;
69234 + reiser4_item_data *data;
69235 + carry_op *op;
69236 +
69237 + pool =
69238 + init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
69239 + sizeof(*data));
69240 + if (IS_ERR(pool))
69241 + return PTR_ERR(pool);
69242 + lowest_level = (carry_level *) (pool + 1);
69243 + init_carry_level(lowest_level, pool);
69244 +
69245 + op = post_carry(lowest_level, COP_INSERT_FLOW, coord->node,
69246 + 0 /* operate directly on coord -> node */ );
69247 + if (IS_ERR(op) || (op == NULL)) {
69248 + done_carry_pool(pool);
69249 + return RETERR(op ? PTR_ERR(op) : -EIO);
69250 + }
69251 +
69252 + /* these are permanent during insert_flow */
69253 + data = (reiser4_item_data *) (lowest_level + 3);
69254 + data->user = 1;
69255 + data->iplug = item_plugin_by_id(FORMATTING_ID);
69256 + data->arg = NULL;
69257 + /* data.length and data.data will be set before calling paste or
69258 + insert */
69259 + data->length = 0;
69260 + data->data = NULL;
69261 +
69262 + op->u.insert_flow.flags = 0;
69263 + op->u.insert_flow.insert_point = coord;
69264 + op->u.insert_flow.flow = f;
69265 + op->u.insert_flow.data = data;
69266 + op->u.insert_flow.new_nodes = 0;
69267 +
69268 + lowest_level->track_type = CARRY_TRACK_CHANGE;
69269 + lowest_level->tracked = lh;
69270 +
69271 + result = carry(lowest_level, NULL);
69272 + done_carry_pool(pool);
69273 +
69274 + return result;
69275 +}
69276 +
69277 +/* Given a coord in parent node, obtain a znode for the corresponding child */
69278 +znode *child_znode(const coord_t * parent_coord /* coord of pointer to
69279 + * child */ ,
69280 + znode * parent /* parent of child */ ,
69281 + int incore_p /* if !0 only return child if already in
69282 + * memory */ ,
69283 + int setup_dkeys_p /* if !0 update delimiting keys of
69284 + * child */ )
69285 +{
69286 + znode *child;
69287 +
69288 + assert("nikita-1374", parent_coord != NULL);
69289 + assert("nikita-1482", parent != NULL);
69290 +#if REISER4_DEBUG
69291 + if (setup_dkeys_p)
69292 + assert_rw_not_locked(&(znode_get_tree(parent)->dk_lock));
69293 +#endif
69294 + assert("nikita-2947", znode_is_any_locked(parent));
69295 +
69296 + if (znode_get_level(parent) <= LEAF_LEVEL) {
69297 + /* trying to get child of leaf node */
69298 + warning("nikita-1217", "Child of maize?");
69299 + return ERR_PTR(RETERR(-EIO));
69300 + }
69301 + if (item_is_internal(parent_coord)) {
69302 + reiser4_block_nr addr;
69303 + item_plugin *iplug;
69304 + reiser4_tree *tree;
69305 +
69306 + iplug = item_plugin_by_coord(parent_coord);
69307 + assert("vs-512", iplug->s.internal.down_link);
69308 + iplug->s.internal.down_link(parent_coord, NULL, &addr);
69309 +
69310 + tree = znode_get_tree(parent);
69311 + if (incore_p)
69312 + child = zlook(tree, &addr);
69313 + else
69314 + child =
69315 + zget(tree, &addr, parent,
69316 + znode_get_level(parent) - 1, get_gfp_mask());
69317 + if ((child != NULL) && !IS_ERR(child) && setup_dkeys_p)
69318 + set_child_delimiting_keys(parent, parent_coord, child);
69319 + } else {
69320 + warning("nikita-1483", "Internal item expected");
69321 + child = ERR_PTR(RETERR(-EIO));
69322 + }
69323 + return child;
69324 +}
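/* Editorial note (not part of the patch): child_znode() thus has a
 * three-way return contract -- a valid znode, NULL (apparently possible
 * when incore_p is set and the child is not in memory), or ERR_PTR().
 * Callers such as prepare_twig_kill() below check IS_ERR() before use. */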
69325 +
69326 +/* remove znode from transaction */
69327 +static void uncapture_znode(znode * node)
69328 +{
69329 + struct page *page;
69330 +
69331 + assert("zam-1001", ZF_ISSET(node, JNODE_HEARD_BANSHEE));
69332 +
69333 + if (!blocknr_is_fake(znode_get_block(node))) {
69334 + int ret;
69335 +
69336 + /* An already allocated block goes right to the atom's delete set. */
69337 + ret =
69338 + reiser4_dealloc_block(znode_get_block(node), 0,
69339 + BA_DEFER | BA_FORMATTED);
69340 + if (ret)
69341 + warning("zam-942",
69342 +			    "can't add block number (%llu) to atom's delete set\n",
69343 + (unsigned long long)(*znode_get_block(node)));
69344 +
69345 + spin_lock_znode(node);
69346 + /* Here we return flush reserved block which was reserved at the
69347 + * moment when this allocated node was marked dirty and still
69348 + * not used by flush in node relocation procedure. */
69349 + if (ZF_ISSET(node, JNODE_FLUSH_RESERVED)) {
69350 + txn_atom *atom;
69351 +
69352 + atom = jnode_get_atom(ZJNODE(node));
69353 + assert("zam-939", atom != NULL);
69354 + spin_unlock_znode(node);
69355 + flush_reserved2grabbed(atom, (__u64) 1);
69356 + spin_unlock_atom(atom);
69357 + } else
69358 + spin_unlock_znode(node);
69359 + } else {
69360 +		/* znode has an assigned block which is counted as "fake
69361 +		   allocated". Return it back to "free blocks". */
69362 + fake_allocated2free((__u64) 1, BA_FORMATTED);
69363 + }
69364 +
69365 + /*
69366 + * uncapture page from transaction. There is a possibility of a race
69367 + * with ->releasepage(): reiser4_releasepage() detaches page from this
69368 + * jnode and we have nothing to uncapture. To avoid this, get
69369 + * reference of node->pg under jnode spin lock. uncapture_page() will
69370 + * deal with released page itself.
69371 + */
69372 + spin_lock_znode(node);
69373 + page = znode_page(node);
69374 + if (likely(page != NULL)) {
69375 + /*
69376 + * uncapture_page() can only be called when we are sure that
69377 + * znode is pinned in memory, which we are, because
69378 + * forget_znode() is only called from longterm_unlock_znode().
69379 + */
69380 + page_cache_get(page);
69381 + spin_unlock_znode(node);
69382 + lock_page(page);
69383 + uncapture_page(page);
69384 + unlock_page(page);
69385 + page_cache_release(page);
69386 + } else {
69387 + txn_atom *atom;
69388 +
69389 + /* handle "flush queued" znodes */
69390 + while (1) {
69391 + atom = jnode_get_atom(ZJNODE(node));
69392 + assert("zam-943", atom != NULL);
69393 +
69394 + if (!ZF_ISSET(node, JNODE_FLUSH_QUEUED)
69395 + || !atom->nr_running_queues)
69396 + break;
69397 +
69398 + spin_unlock_znode(node);
69399 + atom_wait_event(atom);
69400 + spin_lock_znode(node);
69401 + }
69402 +
69403 + uncapture_block(ZJNODE(node));
69404 + spin_unlock_atom(atom);
69405 + zput(node);
69406 + }
69407 +}
69408 +
69409 +/* This is called from longterm_unlock_znode() when last lock is released from
69410 + the node that has been removed from the tree. At this point node is removed
69411 + from sibling list and its lock is invalidated. */
69412 +void forget_znode(lock_handle * handle)
69413 +{
69414 + znode *node;
69415 + reiser4_tree *tree;
69416 +
69417 + assert("umka-319", handle != NULL);
69418 +
69419 + node = handle->node;
69420 + tree = znode_get_tree(node);
69421 +
69422 + assert("vs-164", znode_is_write_locked(node));
69423 + assert("nikita-1280", ZF_ISSET(node, JNODE_HEARD_BANSHEE));
69424 + assert_rw_locked(&(node->lock.guard));
69425 +
69426 +	/* We assume that this node was detached from its parent before
69427 +	 * unlocking, so there is no way to reach this node from its parent
69428 +	 * through a down link. The node should have no children and thus
69429 +	 * can't be reached from them through their parent pointers. The only
69430 +	 * way left to obtain a reference to the node is through the sibling
69431 +	 * pointers of its left and right neighbors. In the next several
69432 +	 * lines we remove the node from the sibling list. */
69433 +
69434 + write_lock_tree(tree);
69435 + sibling_list_remove(node);
69436 + znode_remove(node, tree);
69437 + write_unlock_tree(tree);
69438 +
69439 +	/* Here we set JNODE_DYING and cancel all pending lock requests. This
69440 +	 * forces all lock requestor threads to retry getting a lock on a
69441 +	 * child, neighbor or parent node. But those threads can't come to
69442 +	 * this node again, because this node is no longer a child, neighbor
69443 +	 * or parent of any other node. This order of znode invalidation does
69444 +	 * not allow other threads to waste cpu time in a busy loop trying to
69445 +	 * lock a dying object. The exception is in the flush code, when we
69446 +	 * take the node directly from the atom's capture list. */
69447 + invalidate_lock(handle);
69448 + uncapture_znode(node);
69449 +}
69450 +
69451 +/* Check that internal item at @pointer really contains pointer to @child. */
69452 +int check_tree_pointer(const coord_t * pointer /* would-be pointer to
69453 + * @child */ ,
69454 + const znode * child /* child znode */ )
69455 +{
69456 + assert("nikita-1016", pointer != NULL);
69457 + assert("nikita-1017", child != NULL);
69458 + assert("nikita-1018", pointer->node != NULL);
69459 +
69460 + assert("nikita-1325", znode_is_any_locked(pointer->node));
69461 +
69462 + assert("nikita-2985",
69463 + znode_get_level(pointer->node) == znode_get_level(child) + 1);
69464 +
69465 + coord_clear_iplug((coord_t *) pointer);
69466 +
69467 + if (coord_is_existing_unit(pointer)) {
69468 + item_plugin *iplug;
69469 + reiser4_block_nr addr;
69470 +
69471 + if (item_is_internal(pointer)) {
69472 + iplug = item_plugin_by_coord(pointer);
69473 + assert("vs-513", iplug->s.internal.down_link);
69474 + iplug->s.internal.down_link(pointer, NULL, &addr);
69475 + /* check that cached value is correct */
69476 + if (disk_addr_eq(&addr, znode_get_block(child))) {
69477 + return NS_FOUND;
69478 + }
69479 + }
69480 + }
69481 + /* warning ("jmacd-1002", "tree pointer incorrect"); */
69482 + return NS_NOT_FOUND;
69483 +}
69484 +
69485 +/* find coord of pointer to new @child in @parent.
69486 +
69487 +   Find the &coord_t in the @parent where the pointer to a given @child
69488 +   will be.
69489 +
69490 +*/
69491 +int find_new_child_ptr(znode * parent /* parent znode, passed locked */ ,
69492 + znode *
69493 + child UNUSED_ARG /* child znode, passed locked */ ,
69494 + znode * left /* left brother of new node */ ,
69495 + coord_t * result /* where result is stored in */ )
69496 +{
69497 + int ret;
69498 +
69499 + assert("nikita-1486", parent != NULL);
69500 + assert("nikita-1487", child != NULL);
69501 + assert("nikita-1488", result != NULL);
69502 +
69503 + ret = find_child_ptr(parent, left, result);
69504 + if (ret != NS_FOUND) {
69505 + warning("nikita-1489", "Cannot find brother position: %i", ret);
69506 + return RETERR(-EIO);
69507 + } else {
69508 + result->between = AFTER_UNIT;
69509 + return RETERR(NS_NOT_FOUND);
69510 + }
69511 +}
69512 +
69513 +/* find coord of pointer to @child in @parent.
69514 +
69515 +   Find the &coord_t in the @parent where the pointer to a given @child is.
69516 +
69517 +*/
69518 +int find_child_ptr(znode * parent /* parent znode, passed locked */ ,
69519 + znode * child /* child znode, passed locked */ ,
69520 + coord_t * result /* where result is stored in */ )
69521 +{
69522 + int lookup_res;
69523 + node_plugin *nplug;
69524 + /* left delimiting key of a child */
69525 + reiser4_key ld;
69526 + reiser4_tree *tree;
69527 +
69528 + assert("nikita-934", parent != NULL);
69529 + assert("nikita-935", child != NULL);
69530 + assert("nikita-936", result != NULL);
69531 + assert("zam-356", znode_is_loaded(parent));
69532 +
69533 + coord_init_zero(result);
69534 + result->node = parent;
69535 +
69536 + nplug = parent->nplug;
69537 + assert("nikita-939", nplug != NULL);
69538 +
69539 + tree = znode_get_tree(parent);
69540 + /* NOTE-NIKITA taking read-lock on tree here assumes that @result is
69541 + * not aliased to ->in_parent of some znode. Otherwise,
69542 + * parent_coord_to_coord() below would modify data protected by tree
69543 + * lock. */
69544 + read_lock_tree(tree);
69545 + /* fast path. Try to use cached value. Lock tree to keep
69546 + node->pos_in_parent and pos->*_blocknr consistent. */
69547 + if (child->in_parent.item_pos + 1 != 0) {
69548 + parent_coord_to_coord(&child->in_parent, result);
69549 + if (check_tree_pointer(result, child) == NS_FOUND) {
69550 + read_unlock_tree(tree);
69551 + return NS_FOUND;
69552 + }
69553 +
69554 + child->in_parent.item_pos = (unsigned short)~0;
69555 + }
69556 + read_unlock_tree(tree);
69557 +
69558 +	/* if the above failed, find some key from @child. We are looking for
69559 +	   the least key in a child. */
69560 + read_lock_dk(tree);
69561 + ld = *znode_get_ld_key(child);
69562 + read_unlock_dk(tree);
69563 + /*
69564 + * now, lookup parent with key just found. Note, that left delimiting
69565 +	 * key doesn't identify a node uniquely, because (in an extremely rare
69566 +	 * case) two nodes can have equal left delimiting keys, if one of them
69567 +	 * is completely filled with directory entries that all happened to be
69568 +	 * hash collisions. But we check the block number in check_tree_pointer()
69569 + * and, so, are safe.
69570 + */
69571 + lookup_res = nplug->lookup(parent, &ld, FIND_EXACT, result);
69572 + /* update cached pos_in_node */
69573 + if (lookup_res == NS_FOUND) {
69574 + write_lock_tree(tree);
69575 + coord_to_parent_coord(result, &child->in_parent);
69576 + write_unlock_tree(tree);
69577 + lookup_res = check_tree_pointer(result, child);
69578 + }
69579 + if (lookup_res == NS_NOT_FOUND)
69580 + lookup_res = find_child_by_addr(parent, child, result);
69581 + return lookup_res;
69582 +}
69583 +
69584 +/* find coord of pointer to @child in @parent by scanning
69585 +
69586 + Find the &coord_t in the @parent where pointer to a given @child
69587 + is in by scanning all internal items in @parent and comparing block
69588 + numbers in them with that of @child.
69589 +
69590 +*/
69591 +static int find_child_by_addr(znode * parent /* parent znode, passed locked */ ,
69592 + znode * child /* child znode, passed locked */ ,
69593 + coord_t * result /* where result is stored in */ )
69594 +{
69595 + int ret;
69596 +
69597 + assert("nikita-1320", parent != NULL);
69598 + assert("nikita-1321", child != NULL);
69599 + assert("nikita-1322", result != NULL);
69600 +
69601 + ret = NS_NOT_FOUND;
69602 +
69603 + for_all_units(result, parent) {
69604 + if (check_tree_pointer(result, child) == NS_FOUND) {
69605 + write_lock_tree(znode_get_tree(parent));
69606 + coord_to_parent_coord(result, &child->in_parent);
69607 + write_unlock_tree(znode_get_tree(parent));
69608 + ret = NS_FOUND;
69609 + break;
69610 + }
69611 + }
69612 + return ret;
69613 +}
69614 +
69615 +/* true, if @addr is an "unallocated block number", which is just an
69616 +   address with the highest bit set. */
69617 +int is_disk_addr_unallocated(const reiser4_block_nr * addr /* address to
69618 + * check */ )
69619 +{
69620 + assert("nikita-1766", addr != NULL);
69621 + cassert(sizeof(reiser4_block_nr) == 8);
69622 + return (*addr & REISER4_BLOCKNR_STATUS_BIT_MASK) ==
69623 + REISER4_UNALLOCATED_STATUS_VALUE;
69624 +}
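/* Editorial worked example (illustrative only; "STATUS_MASK" and
 * "UNALLOCATED_VALUE" stand in for the real REISER4_* constants defined
 * elsewhere in this patch). With the status encoded in the top bit(s) of
 * the 64-bit block number:
 *
 *	0x8000000000001234 & STATUS_MASK == UNALLOCATED_VALUE  -> unallocated
 *	0x0000000000001234 & STATUS_MASK == 0                  -> real address
 */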
69625 +
69626 +/* returns true if removing bytes in the key range [from_key, to_key]
69627 +   causes removal of the whole item @from */
69628 +static int
69629 +item_removed_completely(coord_t * from, const reiser4_key * from_key,
69630 + const reiser4_key * to_key)
69631 +{
69632 + item_plugin *iplug;
69633 + reiser4_key key_in_item;
69634 +
69635 + assert("umka-325", from != NULL);
69636 + assert("", item_is_extent(from));
69637 +
69638 +	/* check first key just in case */
69639 + item_key_by_coord(from, &key_in_item);
69640 + if (keygt(from_key, &key_in_item))
69641 + return 0;
69642 +
69643 + /* check last key */
69644 + iplug = item_plugin_by_coord(from);
69645 + assert("vs-611", iplug && iplug->s.file.append_key);
69646 +
69647 + iplug->s.file.append_key(from, &key_in_item);
69648 + set_key_offset(&key_in_item, get_key_offset(&key_in_item) - 1);
69649 +
69650 + if (keylt(to_key, &key_in_item))
69651 + /* last byte is not removed */
69652 + return 0;
69653 + return 1;
69654 +}
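/* Editorial worked example (illustrative, assuming the keys differ only in
 * their offset component): let the extent item at @from cover file offsets
 * [4096, 16384), so its first key has offset 4096 and append_key() yields
 * offset 16384 (the last covered byte is 16383). Then:
 *
 *	from_key off. 4096, to_key off. 16383  -> 1 (whole item removed)
 *	from_key off. 4096, to_key off. 12287  -> 0 (tail survives)
 *	from_key off. 8192, to_key off. 16383  -> 0 (head survives)
 */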
69655 +
69656 +/* helper function for prepare_twig_kill(): @left and @right are formatted
69657 + * neighbors of extent item being completely removed. Load and lock neighbors
69658 + * and store lock handles into @cdata for later use by kill_hook_extent() */
69659 +static int
69660 +prepare_children(znode * left, znode * right, carry_kill_data * kdata)
69661 +{
69662 + int result;
69663 + int left_loaded;
69664 + int right_loaded;
69665 +
69666 + result = 0;
69667 + left_loaded = right_loaded = 0;
69668 +
69669 + if (left != NULL) {
69670 + result = zload(left);
69671 + if (result == 0) {
69672 + left_loaded = 1;
69673 + result = longterm_lock_znode(kdata->left, left,
69674 + ZNODE_READ_LOCK,
69675 + ZNODE_LOCK_LOPRI);
69676 + }
69677 + }
69678 + if (result == 0 && right != NULL) {
69679 + result = zload(right);
69680 + if (result == 0) {
69681 + right_loaded = 1;
69682 + result = longterm_lock_znode(kdata->right, right,
69683 + ZNODE_READ_LOCK,
69684 + ZNODE_LOCK_HIPRI |
69685 + ZNODE_LOCK_NONBLOCK);
69686 + }
69687 + }
69688 + if (result != 0) {
69689 + done_lh(kdata->left);
69690 + done_lh(kdata->right);
69691 + if (left_loaded != 0)
69692 + zrelse(left);
69693 + if (right_loaded != 0)
69694 + zrelse(right);
69695 + }
69696 + return result;
69697 +}
69698 +
69699 +static void done_children(carry_kill_data * kdata)
69700 +{
69701 + if (kdata->left != NULL && kdata->left->node != NULL) {
69702 + zrelse(kdata->left->node);
69703 + done_lh(kdata->left);
69704 + }
69705 + if (kdata->right != NULL && kdata->right->node != NULL) {
69706 + zrelse(kdata->right->node);
69707 + done_lh(kdata->right);
69708 + }
69709 +}
69710 +
69711 +/* part of cut_node. It is called when cut_node is called to remove or cut
69712 +   part of an extent item. When the head of that item is removed, we have to
69713 +   update the right delimiting key of the extent's left neighbor. When the
69714 +   item is removed completely, we have to set a sibling link between the
69715 +   left and right neighbors of the removed extent. This may return
69716 +   -E_DEADLOCK while trying to get the left neighbor locked, so the caller
69717 +   should repeat the attempt. */
69718 +/* Audited by: umka (2002.06.16) */
69719 +static int
69720 +prepare_twig_kill(carry_kill_data * kdata, znode * locked_left_neighbor)
69721 +{
69722 + int result;
69723 + reiser4_key key;
69724 + lock_handle left_lh;
69725 + lock_handle right_lh;
69726 + coord_t left_coord;
69727 + coord_t *from;
69728 + znode *left_child;
69729 + znode *right_child;
69730 + reiser4_tree *tree;
69731 + int left_zloaded_here, right_zloaded_here;
69732 +
69733 + from = kdata->params.from;
69734 + assert("umka-326", from != NULL);
69735 + assert("umka-327", kdata->params.to != NULL);
69736 +
69737 + /* for one extent item only yet */
69738 + assert("vs-591", item_is_extent(from));
69739 + assert("vs-592", from->item_pos == kdata->params.to->item_pos);
69740 +
69741 + if ((kdata->params.from_key
69742 + && keygt(kdata->params.from_key, item_key_by_coord(from, &key)))
69743 + || from->unit_pos != 0) {
69744 + /* head of item @from is not removed, there is nothing to
69745 + worry about */
69746 + return 0;
69747 + }
69748 +
69749 + result = 0;
69750 + left_zloaded_here = 0;
69751 + right_zloaded_here = 0;
69752 +
69753 + left_child = right_child = NULL;
69754 +
69755 + coord_dup(&left_coord, from);
69756 + init_lh(&left_lh);
69757 + init_lh(&right_lh);
69758 + if (coord_prev_unit(&left_coord)) {
69759 + /* @from is leftmost item in its node */
69760 + if (!locked_left_neighbor) {
69761 + result =
69762 + reiser4_get_left_neighbor(&left_lh, from->node,
69763 + ZNODE_READ_LOCK,
69764 + GN_CAN_USE_UPPER_LEVELS);
69765 + switch (result) {
69766 + case 0:
69767 + break;
69768 + case -E_NO_NEIGHBOR:
69769 + /* there is no formatted node to the left of
69770 + from->node */
69771 + warning("vs-605",
69772 + "extent item has smallest key in "
69773 + "the tree and it is about to be removed");
69774 + return 0;
69775 + case -E_DEADLOCK:
69776 + /* need to restart */
69777 + default:
69778 + return result;
69779 + }
69780 +
69781 + /* we have acquired left neighbor of from->node */
69782 + result = zload(left_lh.node);
69783 + if (result)
69784 + goto done;
69785 +
69786 + locked_left_neighbor = left_lh.node;
69787 + } else {
69788 + /* squalloc_right_twig_cut should have supplied locked
69789 + * left neighbor */
69790 + assert("vs-834",
69791 + znode_is_write_locked(locked_left_neighbor));
69792 + result = zload(locked_left_neighbor);
69793 + if (result)
69794 + return result;
69795 + }
69796 +
69797 + left_zloaded_here = 1;
69798 + coord_init_last_unit(&left_coord, locked_left_neighbor);
69799 + }
69800 +
69801 + if (!item_is_internal(&left_coord)) {
69802 + /* what else but extent can be on twig level */
69803 + assert("vs-606", item_is_extent(&left_coord));
69804 +
69805 + /* there is no left formatted child */
69806 + if (left_zloaded_here)
69807 + zrelse(locked_left_neighbor);
69808 + done_lh(&left_lh);
69809 + return 0;
69810 + }
69811 +
69812 + tree = znode_get_tree(left_coord.node);
69813 + left_child = child_znode(&left_coord, left_coord.node, 1, 0);
69814 +
69815 + if (IS_ERR(left_child)) {
69816 + result = PTR_ERR(left_child);
69817 + goto done;
69818 + }
69819 +
69820 + /* left child is acquired, calculate new right delimiting key for it
69821 + and get right child if it is necessary */
69822 + if (item_removed_completely
69823 + (from, kdata->params.from_key, kdata->params.to_key)) {
69824 + /* try to get right child of removed item */
69825 + coord_t right_coord;
69826 +
69827 + assert("vs-607",
69828 + kdata->params.to->unit_pos ==
69829 + coord_last_unit_pos(kdata->params.to));
69830 + coord_dup(&right_coord, kdata->params.to);
69831 + if (coord_next_unit(&right_coord)) {
69832 + /* @to is rightmost unit in the node */
69833 + result =
69834 + reiser4_get_right_neighbor(&right_lh, from->node,
69835 + ZNODE_READ_LOCK,
69836 + GN_CAN_USE_UPPER_LEVELS);
69837 + switch (result) {
69838 + case 0:
69839 + result = zload(right_lh.node);
69840 + if (result)
69841 + goto done;
69842 +
69843 + right_zloaded_here = 1;
69844 + coord_init_first_unit(&right_coord,
69845 + right_lh.node);
69846 + item_key_by_coord(&right_coord, &key);
69847 + break;
69848 +
69849 + case -E_NO_NEIGHBOR:
69850 + /* there is no formatted node to the right of
69851 + from->node */
69852 + read_lock_dk(tree);
69853 + key = *znode_get_rd_key(from->node);
69854 + read_unlock_dk(tree);
69855 + right_coord.node = NULL;
69856 + result = 0;
69857 + break;
69858 + default:
69859 + /* real error */
69860 + goto done;
69861 + }
69862 + } else {
69863 + /* there is an item to the right of @from - take its key */
69864 + item_key_by_coord(&right_coord, &key);
69865 + }
69866 +
69867 + /* try to get right child of @from */
69868 + if (right_coord.node && /* there is right neighbor of @from */
69869 + item_is_internal(&right_coord)) { /* it is internal item */
69870 + right_child = child_znode(&right_coord,
69871 + right_coord.node, 1, 0);
69872 +
69873 + if (IS_ERR(right_child)) {
69874 + result = PTR_ERR(right_child);
69875 + goto done;
69876 + }
69877 +
69878 + }
69879 +		/* the whole extent is removed between znodes left_child and right_child. Prepare them for
69880 +		   linking and for the update of the right delimiting key of left_child */
69881 + result = prepare_children(left_child, right_child, kdata);
69882 + } else {
69883 +		/* head of item @to is removed. left_child has to get a right delimiting key update. Prepare it for that */
69884 + result = prepare_children(left_child, NULL, kdata);
69885 + }
69886 +
69887 + done:
69888 + if (right_child)
69889 + zput(right_child);
69890 + if (right_zloaded_here)
69891 + zrelse(right_lh.node);
69892 + done_lh(&right_lh);
69893 +
69894 + if (left_child)
69895 + zput(left_child);
69896 + if (left_zloaded_here)
69897 + zrelse(locked_left_neighbor);
69898 + done_lh(&left_lh);
69899 + return result;
69900 +}
69901 +
69902 +/* this is used to remove part of the node content between coordinates @from and @to. The units to which @from and
69903 +   @to are set are cut completely */
69904 +/* for try_to_merge_with_left, delete_copied, delete_node */
69905 +int cut_node_content(coord_t * from, coord_t * to, const reiser4_key * from_key, /* first key to be removed */
69906 + const reiser4_key * to_key, /* last key to be removed */
69907 + reiser4_key *
69908 + smallest_removed /* smallest key actually removed */ )
69909 +{
69910 + int result;
69911 + carry_pool *pool;
69912 + carry_level *lowest_level;
69913 + carry_cut_data *cut_data;
69914 + carry_op *op;
69915 +
69916 + assert("vs-1715", coord_compare(from, to) != COORD_CMP_ON_RIGHT);
69917 +
69918 + pool =
69919 + init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
69920 + sizeof(*cut_data));
69921 + if (IS_ERR(pool))
69922 + return PTR_ERR(pool);
69923 + lowest_level = (carry_level *) (pool + 1);
69924 + init_carry_level(lowest_level, pool);
69925 +
69926 + op = post_carry(lowest_level, COP_CUT, from->node, 0);
69927 + assert("vs-1509", op != 0);
69928 + if (IS_ERR(op)) {
69929 + done_carry_pool(pool);
69930 + return PTR_ERR(op);
69931 + }
69932 +
69933 + cut_data = (carry_cut_data *) (lowest_level + 3);
69934 + cut_data->params.from = from;
69935 + cut_data->params.to = to;
69936 + cut_data->params.from_key = from_key;
69937 + cut_data->params.to_key = to_key;
69938 + cut_data->params.smallest_removed = smallest_removed;
69939 +
69940 + op->u.cut_or_kill.is_cut = 1;
69941 + op->u.cut_or_kill.u.cut = cut_data;
69942 +
69943 + result = carry(lowest_level, NULL);
69944 + done_carry_pool(pool);
69945 +
69946 + return result;
69947 +}
69948 +
69949 +/* cut part of the node
69950 +
69951 +   Cut part or the whole content of a node.
69952 +
69953 +   Cut data between @from and @to of @from->node and call carry() to make
69954 +   the corresponding changes in the tree. @from->node may become empty. If
69955 +   so, the pointer to it will be removed. Neighboring nodes are not
69956 +   changed. The smallest removed key is stored in @smallest_removed.
69957 +
69958 +*/
69959 +int kill_node_content(coord_t * from, /* coord of the first unit/item that will be eliminated */
69960 + coord_t * to, /* coord of the last unit/item that will be eliminated */
69961 + const reiser4_key * from_key, /* first key to be removed */
69962 + const reiser4_key * to_key, /* last key to be removed */
69963 + reiser4_key * smallest_removed, /* smallest key actually removed */
69964 + znode * locked_left_neighbor, /* this is set when kill_node_content is called with left neighbor
69965 + * locked (in squalloc_right_twig_cut, namely) */
69966 + struct inode *inode, /* inode of file whose item (or its part) is to be killed. This is necessary to
69967 + invalidate pages together with item pointing to them */
69968 + int truncate)
69969 +{ /* true if this call is made for file truncate */
69970 + int result;
69971 + carry_pool *pool;
69972 + carry_level *lowest_level;
69973 + carry_kill_data *kdata;
69974 + lock_handle *left_child;
69975 + lock_handle *right_child;
69976 + carry_op *op;
69977 +
69978 + assert("umka-328", from != NULL);
69979 + assert("vs-316", !node_is_empty(from->node));
69980 + assert("nikita-1812", coord_is_existing_unit(from)
69981 + && coord_is_existing_unit(to));
69982 +
69983 + /* allocate carry_pool, 3 carry_level-s, carry_kill_data and structures for kill_hook_extent */
69984 + pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
69985 + sizeof(carry_kill_data) +
69986 + 2 * sizeof(lock_handle) +
69987 + 5 * sizeof(reiser4_key) + 2 * sizeof(coord_t));
69988 + if (IS_ERR(pool))
69989 + return PTR_ERR(pool);
69990 +
69991 + lowest_level = (carry_level *) (pool + 1);
69992 + init_carry_level(lowest_level, pool);
69993 +
69994 + kdata = (carry_kill_data *) (lowest_level + 3);
69995 + left_child = (lock_handle *) (kdata + 1);
69996 + right_child = left_child + 1;
69997 +
69998 + init_lh(left_child);
69999 + init_lh(right_child);
70000 +
70001 + kdata->params.from = from;
70002 + kdata->params.to = to;
70003 + kdata->params.from_key = from_key;
70004 + kdata->params.to_key = to_key;
70005 + kdata->params.smallest_removed = smallest_removed;
70006 + kdata->params.truncate = truncate;
70007 + kdata->flags = 0;
70008 + kdata->inode = inode;
70009 + kdata->left = left_child;
70010 + kdata->right = right_child;
70011 + /* memory for 5 reiser4_key and 2 coord_t will be used in kill_hook_extent */
70012 + kdata->buf = (char *)(right_child + 1);
70013 +
70014 + if (znode_get_level(from->node) == TWIG_LEVEL && item_is_extent(from)) {
70015 + /* left child of extent item may have to get updated right
70016 + delimiting key and to get linked with right child of extent
70017 + @from if it will be removed completely */
70018 + result = prepare_twig_kill(kdata, locked_left_neighbor);
70019 + if (result) {
70020 + done_children(kdata);
70021 + done_carry_pool(pool);
70022 + return result;
70023 + }
70024 + }
70025 +
70026 + op = post_carry(lowest_level, COP_CUT, from->node, 0);
70027 + if (IS_ERR(op) || (op == NULL)) {
70028 + done_children(kdata);
70029 + done_carry_pool(pool);
70030 + return RETERR(op ? PTR_ERR(op) : -EIO);
70031 + }
70032 +
70033 + op->u.cut_or_kill.is_cut = 0;
70034 + op->u.cut_or_kill.u.kill = kdata;
70035 +
70036 + result = carry(lowest_level, NULL);
70037 +
70038 + done_children(kdata);
70039 + done_carry_pool(pool);
70040 + return result;
70041 +}
70042 +
70043 +void
70044 +fake_kill_hook_tail(struct inode *inode, loff_t start, loff_t end, int truncate)
70045 +{
70046 + if (inode_get_flag(inode, REISER4_HAS_MMAP)) {
70047 + pgoff_t start_pg, end_pg;
70048 +
70049 + start_pg = start >> PAGE_CACHE_SHIFT;
70050 + end_pg = (end - 1) >> PAGE_CACHE_SHIFT;
70051 +
70052 + if ((start & (PAGE_CACHE_SIZE - 1)) == 0) {
70053 + /*
70054 + * kill up to the page boundary.
70055 + */
70056 + assert("vs-123456", start_pg == end_pg);
70057 + reiser4_invalidate_pages(inode->i_mapping, start_pg, 1,
70058 + truncate);
70059 + } else if (start_pg != end_pg) {
70060 + /*
70061 + * page boundary is within killed portion of node.
70062 + */
70063 + assert("vs-654321", end_pg - start_pg == 1);
70064 + reiser4_invalidate_pages(inode->i_mapping, end_pg,
70065 + end_pg - start_pg, 1);
70066 + }
70067 + }
70068 + inode_sub_bytes(inode, end - start);
70069 +}
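/* Editorial worked example (illustrative, with 4096-byte pages): killing
 * [8192, 12288) starts on a page boundary, so start_pg == end_pg == 2 and
 * page 2 is invalidated.  Killing [6144, 12288) starts mid-page: start_pg
 * == 1, end_pg == 2, and only the fully covered page 2 is invalidated --
 * page 1 still holds live data below offset 6144. */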
70070 +
70071 +/**
70072 + * Delete whole @node from the reiser4 tree without loading it.
70073 + *
70074 + * @left: locked left neighbor,
70075 + * @node: node to be deleted,
70076 + * @smallest_removed: leftmost key of deleted node,
70077 + * @object: inode pointer, if we truncate a file body.
70078 + * @truncate: true if called for file truncate.
70079 + *
70080 + * @return: 0 if success, error code otherwise.
70081 + *
70082 + * NOTE: if @object!=NULL we assume that @smallest_removed != NULL and it
70083 + * contains the right value of the smallest removed key from the previous
70084 + * cut_worker() iteration. This is needed for proper accounting of
70085 + * "i_blocks" and "i_bytes" fields of the @object.
70086 + */
70087 +int delete_node(znode * node, reiser4_key * smallest_removed,
70088 + struct inode *object, int truncate)
70089 +{
70090 + lock_handle parent_lock;
70091 + coord_t cut_from;
70092 + coord_t cut_to;
70093 + reiser4_tree *tree;
70094 + int ret;
70095 +
70096 + assert("zam-937", node != NULL);
70097 + assert("zam-933", znode_is_write_locked(node));
70098 + assert("zam-999", smallest_removed != NULL);
70099 +
70100 + init_lh(&parent_lock);
70101 +
70102 + ret = reiser4_get_parent(&parent_lock, node, ZNODE_WRITE_LOCK);
70103 + if (ret)
70104 + return ret;
70105 +
70106 + assert("zam-934", !znode_above_root(parent_lock.node));
70107 +
70108 + ret = zload(parent_lock.node);
70109 + if (ret)
70110 + goto failed_nozrelse;
70111 +
70112 + ret = find_child_ptr(parent_lock.node, node, &cut_from);
70113 + if (ret)
70114 + goto failed;
70115 +
70116 + /* decrement child counter and set parent pointer to NULL before
70117 +	   deleting the link from the parent node because of checks in
70118 + internal_kill_item_hook (we can delete the last item from the parent
70119 + node, the parent node is going to be deleted and its c_count should
70120 + be zero). */
70121 +
70122 + tree = znode_get_tree(node);
70123 + write_lock_tree(tree);
70124 + init_parent_coord(&node->in_parent, NULL);
70125 + --parent_lock.node->c_count;
70126 + write_unlock_tree(tree);
70127 +
70128 + assert("zam-989", item_is_internal(&cut_from));
70129 +
70130 + /* @node should be deleted after unlocking. */
70131 + ZF_SET(node, JNODE_HEARD_BANSHEE);
70132 +
70133 + /* remove a pointer from the parent node to the node being deleted. */
70134 + coord_dup(&cut_to, &cut_from);
70135 + /* FIXME: shouldn't this be kill_node_content */
70136 + ret = cut_node_content(&cut_from, &cut_to, NULL, NULL, NULL);
70137 + if (ret)
70138 + /* FIXME(Zam): Should we re-connect the node to its parent if
70139 + * cut_node fails? */
70140 + goto failed;
70141 +
70142 + {
70143 + reiser4_tree *tree = current_tree;
70144 + __u64 start_offset = 0, end_offset = 0;
70145 +
70146 + read_lock_tree(tree);
70147 + write_lock_dk(tree);
70148 + if (object) {
70149 +			/* We use @smallest_removed and the left delimiting key of
70150 + * the current node for @object->i_blocks, i_bytes
70151 + * calculation. We assume that the items after the
70152 + * *@smallest_removed key have been deleted from the
70153 + * file body. */
70154 + start_offset = get_key_offset(znode_get_ld_key(node));
70155 + end_offset = get_key_offset(smallest_removed);
70156 + }
70157 +
70158 + assert("zam-1021", znode_is_connected(node));
70159 + if (node->left)
70160 + znode_set_rd_key(node->left, znode_get_rd_key(node));
70161 +
70162 + *smallest_removed = *znode_get_ld_key(node);
70163 +
70164 + write_unlock_dk(tree);
70165 + read_unlock_tree(tree);
70166 +
70167 + if (object) {
70168 +			/* actions to be performed on items on their removal from the tree normally live in a
70169 +			   special item method - kill_hook. Here, for optimization reasons, we avoid reading the node
70170 +			   containing the item we remove and so cannot call the item's kill hook. Instead we call a
70171 +			   function which does exactly what the tail kill hook does, under the assumption that the node
70172 +			   we avoid reading contains only one item and that item is a tail one. */
70173 + fake_kill_hook_tail(object, start_offset, end_offset,
70174 + truncate);
70175 + }
70176 + }
70177 + failed:
70178 + zrelse(parent_lock.node);
70179 + failed_nozrelse:
70180 + done_lh(&parent_lock);
70181 +
70182 + return ret;
70183 +}
70184 +
70185 +static int can_delete(const reiser4_key *key, znode *node)
70186 +{
70187 + int result;
70188 +
70189 + read_lock_dk(current_tree);
70190 + result = keyle(key, znode_get_ld_key(node));
70191 + read_unlock_dk(current_tree);
70192 + return result;
70193 +}
70194 +
70195 +/**
70196 + * This subroutine is not optimal, but its implementation seems to
70197 + * be easier this way.
70198 + *
70199 + * @tap: the point deletion process begins from,
70200 + * @from_key: the beginning of the deleted key range,
70201 + * @to_key: the end of the deleted key range,
70202 + * @smallest_removed: the smallest removed key,
70203 + * @truncate: true if called for file truncate.
70204 + * @progress: returns true if progress in deleting file items was made;
70205 + *            the @smallest_removed value is valid in that case.
70206 + *
70207 + * @return: 0 on success, error code otherwise; -E_REPEAT means that a long
70208 + * cut_tree operation was interrupted to allow an atom commit.
70209 + */
70210 +int
70211 +cut_tree_worker_common(tap_t * tap, const reiser4_key * from_key,
70212 + const reiser4_key * to_key,
70213 + reiser4_key * smallest_removed, struct inode *object,
70214 + int truncate, int *progress)
70215 +{
70216 + lock_handle next_node_lock;
70217 + coord_t left_coord;
70218 + int result;
70219 +
70220 + assert("zam-931", tap->coord->node != NULL);
70221 + assert("zam-932", znode_is_write_locked(tap->coord->node));
70222 +
70223 + *progress = 0;
70224 + init_lh(&next_node_lock);
70225 +
70226 + while (1) {
70227 + znode *node; /* node from which items are cut */
70228 + node_plugin *nplug; /* node plugin for @node */
70229 +
70230 + node = tap->coord->node;
70231 +
70232 + /* Move next_node_lock to the next node on the left. */
70233 + result =
70234 + reiser4_get_left_neighbor(&next_node_lock, node,
70235 + ZNODE_WRITE_LOCK,
70236 + GN_CAN_USE_UPPER_LEVELS);
70237 + if (result != 0 && result != -E_NO_NEIGHBOR)
70238 + break;
70239 +		/* Check whether we can delete the node as a whole. */
70240 + if (*progress && znode_get_level(node) == LEAF_LEVEL &&
70241 + can_delete(from_key, node)) {
70242 + result = delete_node(node, smallest_removed, object,
70243 + truncate);
70244 + } else {
70245 + result = tap_load(tap);
70246 + if (result)
70247 + return result;
70248 +
70249 + /* Prepare the second (right) point for cut_node() */
70250 + if (*progress)
70251 + coord_init_last_unit(tap->coord, node);
70252 +
70253 + else if (item_plugin_by_coord(tap->coord)->b.lookup ==
70254 + NULL)
70255 + /* set rightmost unit for the items without lookup method */
70256 + tap->coord->unit_pos =
70257 + coord_last_unit_pos(tap->coord);
70258 +
70259 + nplug = node->nplug;
70260 +
70261 + assert("vs-686", nplug);
70262 + assert("vs-687", nplug->lookup);
70263 +
70264 + /* left_coord is leftmost unit cut from @node */
70265 + result = nplug->lookup(node, from_key,
70266 + FIND_MAX_NOT_MORE_THAN,
70267 + &left_coord);
70268 +
70269 + if (IS_CBKERR(result))
70270 + break;
70271 +
70272 + /* adjust coordinates so that they are set to existing units */
70273 + if (coord_set_to_right(&left_coord)
70274 + || coord_set_to_left(tap->coord)) {
70275 + result = 0;
70276 + break;
70277 + }
70278 +
70279 + if (coord_compare(&left_coord, tap->coord) ==
70280 + COORD_CMP_ON_RIGHT) {
70281 + /* keys from @from_key to @to_key are not in the tree */
70282 + result = 0;
70283 + break;
70284 + }
70285 +
70286 + if (left_coord.item_pos != tap->coord->item_pos) {
70287 +				/* do not allow cutting more than one item. This was added to solve the problem of
70288 +				   truncating partially converted files. If a file is partially converted, there may exist
70289 +				   a twig node containing both internal items (pointing to leaf nodes with formatting
70290 +				   items) and an extent item. We do not want to kill internal items at the twig node here
70291 +				   because cut_tree_worker assumes they are killed from the leaf level */
70292 + coord_dup(&left_coord, tap->coord);
70293 + assert("vs-1652",
70294 + coord_is_existing_unit(&left_coord));
70295 + left_coord.unit_pos = 0;
70296 + }
70297 +
70298 + /* cut data from one node */
70299 + // *smallest_removed = *min_key();
70300 + result =
70301 + kill_node_content(&left_coord, tap->coord, from_key,
70302 + to_key, smallest_removed,
70303 + next_node_lock.node, object,
70304 + truncate);
70305 + tap_relse(tap);
70306 + }
70307 + if (result)
70308 + break;
70309 +
70310 + ++(*progress);
70311 +
70312 + /* Check whether all items with keys >= from_key were removed
70313 + * from the tree. */
70314 + if (keyle(smallest_removed, from_key))
70315 + /* result = 0; */
70316 + break;
70317 +
70318 + if (next_node_lock.node == NULL)
70319 + break;
70320 +
70321 + result = tap_move(tap, &next_node_lock);
70322 + done_lh(&next_node_lock);
70323 + if (result)
70324 + break;
70325 +
70326 + /* Break long cut_tree operation (deletion of a large file) if
70327 + * atom requires commit. */
70328 + if (*progress > CUT_TREE_MIN_ITERATIONS
70329 + && current_atom_should_commit()) {
70330 + result = -E_REPEAT;
70331 + break;
70332 + }
70333 + }
70334 + done_lh(&next_node_lock);
70335 + // assert("vs-301", !keyeq(&smallest_removed, min_key()));
70336 + return result;
70337 +}
70338 +
70339 +/* there is a fundamental problem with optimizing deletes: VFS does it
70340 + one file at a time. Another problem is that if an item can be
70341 + anything, then deleting items must be done one at a time. It just
70342 +   seems clean to write this to specify a from and a to key, and cut
70343 + everything between them though. */
70344 +
70345 +/* use this function with care if deleting more than what is part of a single file. */
70346 +/* do not use this when cutting a single item, it is suboptimal for that */
70347 +
70348 +/* You are encouraged to write plugin specific versions of this. It
70349 + cannot be optimal for all plugins because it works item at a time,
70350 + and some plugins could sometimes work node at a time. Regular files
70351 + however are not optimizable to work node at a time because of
70352 + extents needing to free the blocks they point to.
70353 +
70354 + Optimizations compared to v3 code:
70355 +
70356 + It does not balance (that task is left to memory pressure code).
70357 +
70358 + Nodes are deleted only if empty.
70359 +
70360 + Uses extents.
70361 +
70362 + Performs read-ahead of formatted nodes whose contents are part of
70363 + the deletion.
70364 +*/
70365 +
70366 +/**
70367 + * Delete everything from the reiser4 tree between two keys: @from_key and
70368 + * @to_key.
70369 + *
70370 + * @from_key: the beginning of the deleted key range,
70371 + * @to_key: the end of the deleted key range,
70372 + * @smallest_removed: the smallest removed key,
70373 + * @object: owner of cutting items.
70374 + * @truncate: true if called for file truncate.
70375 + * @progress: returns true if progress in deleting file items was made;
70376 + *            the @smallest_removed value is valid in that case.
70377 + *
70378 + * @return: 0 on success, error code otherwise; -E_REPEAT means that a long
70379 + * cut_tree operation was interrupted to allow an atom commit.
70380 + */
70381 +
70382 +int
70383 +cut_tree_object(reiser4_tree * tree, const reiser4_key * from_key,
70384 + const reiser4_key * to_key, reiser4_key * smallest_removed_p,
70385 + struct inode *object, int truncate, int *progress)
70386 +{
70387 + lock_handle lock;
70388 + int result;
70389 + tap_t tap;
70390 + coord_t right_coord;
70391 + reiser4_key smallest_removed;
70392 + int (*cut_tree_worker) (tap_t *, const reiser4_key *,
70393 + const reiser4_key *, reiser4_key *,
70394 + struct inode *, int, int *);
70395 + STORE_COUNTERS;
70396 +
70397 + assert("umka-329", tree != NULL);
70398 + assert("umka-330", from_key != NULL);
70399 + assert("umka-331", to_key != NULL);
70400 + assert("zam-936", keyle(from_key, to_key));
70401 +
70402 + if (smallest_removed_p == NULL)
70403 + smallest_removed_p = &smallest_removed;
70404 +
70405 + init_lh(&lock);
70406 +
70407 + do {
70408 + /* Find rightmost item to cut away from the tree. */
70409 + result = object_lookup(object, to_key, &right_coord, &lock,
70410 + ZNODE_WRITE_LOCK, FIND_MAX_NOT_MORE_THAN,
70411 + TWIG_LEVEL, LEAF_LEVEL, CBK_UNIQUE,
70412 + NULL /*ra_info */ );
70413 + if (result != CBK_COORD_FOUND)
70414 + break;
70415 + if (object == NULL
70416 + || inode_file_plugin(object)->cut_tree_worker == NULL)
70417 + cut_tree_worker = cut_tree_worker_common;
70418 + else
70419 + cut_tree_worker =
70420 + inode_file_plugin(object)->cut_tree_worker;
70421 + tap_init(&tap, &right_coord, &lock, ZNODE_WRITE_LOCK);
70422 + result =
70423 + cut_tree_worker(&tap, from_key, to_key, smallest_removed_p,
70424 + object, truncate, progress);
70425 + tap_done(&tap);
70426 +
70427 + preempt_point();
70428 +
70429 + } while (0);
70430 +
70431 + done_lh(&lock);
70432 +
70433 + if (result) {
70434 + switch (result) {
70435 + case -E_NO_NEIGHBOR:
70436 + result = 0;
70437 + break;
70438 + case -E_DEADLOCK:
70439 +			result = -E_REPEAT;	/* fall through */
70440 + case -E_REPEAT:
70441 + case -ENOMEM:
70442 + case -ENOENT:
70443 + break;
70444 + default:
70445 + warning("nikita-2861", "failure: %i", result);
70446 + }
70447 + }
70448 +
70449 + CHECK_COUNTERS;
70450 + return result;
70451 +}
70452 +
70453 +/* repeat cut_tree_object until everything is deleted. Unlike cut_file_items,
70454 + * it does not end the current transaction if -E_REPEAT is returned by
70455 + * cut_tree_object. */
70456 +int
70457 +cut_tree(reiser4_tree * tree, const reiser4_key * from, const reiser4_key * to,
70458 + struct inode *inode, int truncate)
70459 +{
70460 + int result;
70461 + int progress;
70462 +
70463 + do {
70464 + result =
70465 + cut_tree_object(tree, from, to, NULL, inode, truncate,
70466 + &progress);
70467 + } while (result == -E_REPEAT);
70468 +
70469 + return result;
70470 +}
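/* Editorial sketch (not part of the patch): a hypothetical truncate-style
 * caller would build boundary keys for [new_size, end of file] and let
 * cut_tree() loop until -E_REPEAT stops being returned. build_body_key()
 * is a made-up helper, not necessarily what this patch uses: */
#if 0	/* illustration only */
static int truncate_body_sketch(reiser4_tree *tree, struct inode *inode,
				loff_t new_size)
{
	reiser4_key from, to;

	build_body_key(inode, new_size, &from);	/* hypothetical helper */
	to = from;
	set_key_offset(&to, get_key_offset(max_key()));

	/* cut_tree() retries internally while -E_REPEAT is returned */
	return cut_tree(tree, &from, &to, inode, 1 /* truncate */);
}
#endif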
70471 +
70472 +/* finishing reiser4 initialization */
70473 +int init_tree(reiser4_tree * tree /* pointer to structure being
70474 + * initialized */ ,
70475 + const reiser4_block_nr * root_block /* address of a root block
70476 + * on a disk */ ,
70477 + tree_level height /* height of a tree */ ,
70478 + node_plugin * nplug /* default node plugin */ )
70479 +{
70480 + int result;
70481 +
70482 + assert("nikita-306", tree != NULL);
70483 + assert("nikita-307", root_block != NULL);
70484 + assert("nikita-308", height > 0);
70485 + assert("nikita-309", nplug != NULL);
70486 + assert("zam-587", tree->super != NULL);
70487 +
70488 + tree->root_block = *root_block;
70489 + tree->height = height;
70490 + tree->estimate_one_insert = calc_estimate_one_insert(height);
70491 + tree->nplug = nplug;
70492 +
70493 + tree->znode_epoch = 1ull;
70494 +
70495 + cbk_cache_init(&tree->cbk_cache);
70496 +
70497 + result = znodes_tree_init(tree);
70498 + if (result == 0)
70499 + result = jnodes_tree_init(tree);
70500 + if (result == 0) {
70501 + tree->uber = zget(tree, &UBER_TREE_ADDR, NULL, 0, get_gfp_mask());
70502 + if (IS_ERR(tree->uber)) {
70503 + result = PTR_ERR(tree->uber);
70504 + tree->uber = NULL;
70505 + }
70506 + }
70507 + return result;
70508 +}
70509 +
70510 +/* release resources associated with @tree */
70511 +void done_tree(reiser4_tree * tree /* tree to release */ )
70512 +{
70513 + if (tree == NULL)
70514 + return;
70515 +
70516 + if (tree->uber != NULL) {
70517 + zput(tree->uber);
70518 + tree->uber = NULL;
70519 + }
70520 + znodes_tree_done(tree);
70521 + jnodes_tree_done(tree);
70522 + cbk_cache_done(&tree->cbk_cache);
70523 +}
70524 +
70525 +/* Make Linus happy.
70526 + Local variables:
70527 + c-indentation-style: "K&R"
70528 + mode-name: "LC"
70529 + c-basic-offset: 8
70530 + tab-width: 8
70531 + fill-column: 120
70532 + scroll-step: 1
70533 + End:
70534 +*/
70535 Index: linux-2.6.16/fs/reiser4/tree.h
70536 ===================================================================
70537 --- /dev/null
70538 +++ linux-2.6.16/fs/reiser4/tree.h
70539 @@ -0,0 +1,579 @@
70540 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
70541 + * reiser4/README */
70542 +
70543 +/* Tree operations. See fs/reiser4/tree.c for comments */
70544 +
70545 +#if !defined( __REISER4_TREE_H__ )
70546 +#define __REISER4_TREE_H__
70547 +
70548 +#include "forward.h"
70549 +#include "debug.h"
70550 +#include "dformat.h"
70551 +#include "plugin/node/node.h"
70552 +#include "plugin/plugin.h"
70553 +#include "znode.h"
70554 +#include "tap.h"
70555 +
70556 +#include <linux/types.h> /* for __u?? */
70557 +#include <linux/fs.h> /* for struct super_block */
70558 +#include <linux/spinlock.h>
70559 +#include <linux/sched.h> /* for struct task_struct */
70560 +
70561 +/* fictive block number never actually used */
70562 +extern const reiser4_block_nr UBER_TREE_ADDR;
70563 +
70564 +/* &cbk_cache_slot - entry in a coord cache.
70565 +
70566 + This is entry in a coord_by_key (cbk) cache, represented by
70567 + &cbk_cache.
70568 +
70569 +*/
70570 +typedef struct cbk_cache_slot {
70571 + /* cached node */
70572 + znode *node;
70573 + /* linkage to the next cbk cache slot in a LRU order */
70574 + struct list_head lru;
70575 +} cbk_cache_slot;
70576 +
70577 +/* &cbk_cache - coord cache. This is part of reiser4_tree.
70578 +
70579 + cbk_cache is supposed to speed up tree lookups by caching results of recent
70580 + successful lookups (we don't cache negative results as dentry cache
70581 + does). Cache consists of relatively small number of entries kept in a LRU
70582 +   order. Each entry (&cbk_cache_slot) contains a pointer to a znode, from
70583 +   which we can obtain the range of keys covered by this znode. Before
70584 +   embarking on a real tree traversal we scan the cbk_cache slot by slot,
70585 +   and for each slot check whether the key we are looking for is between
70586 +   the minimal and maximal keys for the node pointed to by this slot (a
70587 +   lookup sketch follows the struct definition below). If no match is
70588 +   found, a real tree traversal is performed and, if successful, an entry
70589 +   is inserted into the cache, possibly evicting the least recently used one.
70590 +
70591 +   The tree spin lock is used to protect the coord cache. If contention for
70592 +   this lock proves to be too high, finer grained locking can be added.
70593 +
70594 + Invariants involving parts of this data-type:
70595 +
70596 + [cbk-cache-invariant]
70597 +*/
70598 +typedef struct cbk_cache {
70599 + /* serializator */
70600 + rwlock_t guard;
70601 + int nr_slots;
70602 + /* head of LRU list of cache slots */
70603 + struct list_head lru;
70604 + /* actual array of slots */
70605 + cbk_cache_slot *slot;
70606 +} cbk_cache;
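/* Editorial sketch (not part of the patch): the fast path described above
 * amounts to an LRU walk of the slots, testing whether the wanted key lies
 * in the key range covered by each cached znode.  znode_min_key() and
 * znode_max_key() are hypothetical helpers standing in for the real
 * delimiting-key accessors: */
#if 0	/* illustration only */
static znode *cbk_cache_scan_sketch(cbk_cache *cache, const reiser4_key *key)
{
	cbk_cache_slot *slot;

	list_for_each_entry(slot, &cache->lru, lru) {
		znode *node = slot->node;

		if (node != NULL &&
		    keyle(znode_min_key(node), key) &&	/* hypothetical */
		    keylt(key, znode_max_key(node)))	/* hypothetical */
			return node;	/* hit: skip the full traversal */
	}
	return NULL;	/* miss: do a real top-down traversal */
}
#endif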
70607 +
70608 +
70609 +/* level_lookup_result - possible outcome of looking up key at some level.
70610 + This is used by coord_by_key when traversing tree downward. */
70611 +typedef enum {
70612 + /* continue to the next level */
70613 + LOOKUP_CONT,
70614 + /* done. Either required item was found, or we can prove it
70615 + doesn't exist, or some error occurred. */
70616 + LOOKUP_DONE,
70617 + /* restart traversal from the root. Infamous "repetition". */
70618 + LOOKUP_REST
70619 +} level_lookup_result;
70620 +
70621 +/* This is representation of internal reiser4 tree where all file-system
70622 + data and meta-data are stored. This structure is passed to all tree
70623 + manipulation functions. It's different from the super block because:
70624 +   we don't want to limit ourselves to a strictly one-to-one mapping
70625 +   between super blocks and trees, and because they are logically
70626 + different: there are things in a super block that have no relation to
70627 + the tree (bitmaps, journalling area, mount options, etc.) and there
70628 + are things in a tree that bear no relation to the super block, like
70629 + tree of znodes.
70630 +
70631 + At this time, there is only one tree
70632 + per filesystem, and this struct is part of the super block. We only
70633 + call the super block the super block for historical reasons (most
70634 + other filesystems call the per filesystem metadata the super block).
70635 +*/
70636 +
70637 +struct reiser4_tree {
70638 + /* block_nr == 0 is the fake znode. Write-lock it while changing
70639 + the tree height. */
70640 + /* disk address of root node of a tree */
70641 + reiser4_block_nr root_block;
70642 +
70643 + /* level of the root node. If this is 1, tree consists of root
70644 + node only */
70645 + tree_level height;
70646 +
70647 + /*
70648 + * this is cached here to avoid calling plugins through a function
70649 + * dereference all the time.
70650 + */
70651 + __u64 estimate_one_insert;
70652 +
70653 + /* cache of recent tree lookup results */
70654 + cbk_cache cbk_cache;
70655 +
70656 + /* hash table to look up znodes by block number. */
70657 + z_hash_table zhash_table;
70658 + z_hash_table zfake_table;
70659 + /* hash table to look up jnodes by inode and offset. */
70660 + j_hash_table jhash_table;
70661 +
70662 + /* lock protecting:
70663 + - parent pointers,
70664 + - sibling pointers,
70665 + - znode hash table
70666 + - coord cache
70667 + */
70668 + /* NOTE: The "giant" tree lock could be replaced by more spin locks,
70669 + hoping they will be less contended. We could use one spin lock per
70670 + znode hash bucket. At the cost of some code complexity, sibling
70671 + pointers could be protected by both znodes' spin locks. However,
70672 + before concluding that this is more SMP scalable, we should test the
70673 + change on n-way (n > 4) SMP machines. Current tests on a 4-way machine
70674 + do not show that the tree lock is contended or a bottleneck (2003.07.25). */
70675 +
70676 + rwlock_t tree_lock;
70677 +
70678 + /* lock protecting delimiting keys */
70679 + rwlock_t dk_lock;
70680 +
70681 + /* spin lock protecting znode_epoch */
70682 + spinlock_t epoch_lock;
70683 + /* version stamp used to mark znode updates. See seal.[ch] for more
70684 + * information. */
70685 + __u64 znode_epoch;
70686 +
70687 + znode *uber;
70688 + node_plugin *nplug;
70689 + struct super_block *super;
70690 + struct {
70691 + /* carry flags used for insertion of new nodes */
70692 + __u32 new_node_flags;
70693 + /* carry flags used for insertion of new extents */
70694 + __u32 new_extent_flags;
70695 + /* carry flags used for paste operations */
70696 + __u32 paste_flags;
70697 + /* carry flags used for insert operations */
70698 + __u32 insert_flags;
70699 + } carry;
70700 +};
70701 +
70702 +extern int init_tree(reiser4_tree * tree,
70703 + const reiser4_block_nr * root_block, tree_level height,
70704 + node_plugin * default_plugin);
70705 +extern void done_tree(reiser4_tree * tree);
70706 +
70707 +/* cbk flags: options for coord_by_key() */
70708 +typedef enum {
70709 + /* coord_by_key() is called for insertion. This is necessary because
70710 + of extents being located at the twig level. For explanation, see
70711 + comment just above is_next_item_internal().
70712 + */
70713 + CBK_FOR_INSERT = (1 << 0),
70714 + /* coord_by_key() is called with key that is known to be unique */
70715 + CBK_UNIQUE = (1 << 1),
70716 + /* coord_by_key() can trust delimiting keys. This option is not user
70717 + accessible. coord_by_key() will set it automatically. It will only
70718 + be cleared by the special case in extents-on-the-twig-level handling
70719 + where it is necessary to insert an item with a key smaller than the
70720 + leftmost key in a node. This is necessary because of extents being
70721 + located at the twig level. For explanation, see the comment just above
70722 + is_next_item_internal().
70723 + */
70724 + CBK_TRUST_DK = (1 << 2),
70725 + CBK_READA = (1 << 3), /* original: readahead leaves which contain items of certain file */
70726 + CBK_READDIR_RA = (1 << 4), /* readdir: readahead whole directory and all its stat datas */
70727 + CBK_DKSET = (1 << 5),
70728 + CBK_EXTENDED_COORD = (1 << 6), /* coord_t is actually */
70729 + CBK_IN_CACHE = (1 << 7), /* node is already in cache */
70730 + CBK_USE_CRABLOCK = (1 << 8) /* use crab_lock instead of a long-term
70731 + * lock */
70732 +} cbk_flags;
70733 +
70734 +/* insertion outcome. IBK = insert by key */
70735 +typedef enum {
70736 + IBK_INSERT_OK = 0,
70737 + IBK_ALREADY_EXISTS = -EEXIST,
70738 + IBK_IO_ERROR = -EIO,
70739 + IBK_NO_SPACE = -E_NODE_FULL,
70740 + IBK_OOM = -ENOMEM
70741 +} insert_result;
70742 +
70743 +#define IS_CBKERR(err) ((err) != CBK_COORD_FOUND && (err) != CBK_COORD_NOTFOUND)
70744 +
70745 +typedef int (*tree_iterate_actor_t) (reiser4_tree * tree, coord_t * coord,
70746 + lock_handle * lh, void *arg);
70747 +extern int iterate_tree(reiser4_tree * tree, coord_t * coord, lock_handle * lh,
70748 + tree_iterate_actor_t actor, void *arg,
70749 + znode_lock_mode mode, int through_units_p);
70750 +extern int get_uber_znode(reiser4_tree * tree, znode_lock_mode mode,
70751 + znode_lock_request pri, lock_handle * lh);
70752 +
70753 +/* return node plugin of @node */
70754 +static inline node_plugin *node_plugin_by_node(const znode *
70755 + node /* node to query */ )
70756 +{
70757 + assert("vs-213", node != NULL);
70758 + assert("vs-214", znode_is_loaded(node));
70759 +
70760 + return node->nplug;
70761 +}
70762 +
70763 +/* number of items in @node */
70764 +static inline pos_in_node_t node_num_items(const znode * node)
70765 +{
70766 + assert("nikita-2754", znode_is_loaded(node));
70767 + assert("nikita-2468",
70768 + node_plugin_by_node(node)->num_of_items(node) == node->nr_items);
70769 +
70770 + return node->nr_items;
70771 +}
70772 +
70773 +/* Return the number of items at the present node. Asserts coord->node !=
70774 + NULL. */
70775 +static inline unsigned coord_num_items(const coord_t * coord)
70776 +{
70777 + assert("jmacd-9805", coord->node != NULL);
70778 +
70779 + return node_num_items(coord->node);
70780 +}
70781 +
70782 +/* true if @node is empty */
70783 +static inline int node_is_empty(const znode * node)
70784 +{
70785 + return node_num_items(node) == 0;
70786 +}
70787 +
70788 +typedef enum {
70789 + SHIFTED_SOMETHING = 0,
70790 + SHIFT_NO_SPACE = -E_NODE_FULL,
70791 + SHIFT_IO_ERROR = -EIO,
70792 + SHIFT_OOM = -ENOMEM,
70793 +} shift_result;
70794 +
70795 +extern node_plugin *node_plugin_by_coord(const coord_t * coord);
70796 +extern int is_coord_in_node(const coord_t * coord);
70797 +extern int key_in_node(const reiser4_key *, const coord_t *);
70798 +extern void coord_item_move_to(coord_t * coord, int items);
70799 +extern void coord_unit_move_to(coord_t * coord, int units);
70800 +
70801 +/* there are two types of repetitive accesses (ra): intra-syscall
70802 + (local) and inter-syscall (global). Local ra is used when,
70803 + during a single syscall, we add/delete several items and units in the
70804 + same place in a tree. Note that plan-A fragments local ra by
70805 + separating stat-data and file body in key-space. Global ra is
70806 + used when the user makes repetitive modifications in the same place
70807 + in a tree.
70808 +
70809 + Our ra implementation serves the following purposes:
70810 + 1 it affects balancing decisions so that the next operation in a row
70811 + can be performed faster;
70812 + 2 it affects lower-level read-ahead in the page-cache;
70813 + 3 it allows us to avoid unnecessary lookups by maintaining some state
70814 + across several operations (this is only for local ra);
70815 + 4 it leaves room for lazy micro-balancing: when we start a sequence of
70816 + operations they are performed without actually doing any intra-node
70817 + shifts, until we finish the sequence or its scope leaves the
70818 + current node; only then do we really pack the node (local ra only).
70819 +*/
70820 +
70821 +/* another thing that can be useful is to keep per-tree and/or
70822 + per-process cache of recent lookups. This cache can be organised as a
70823 + list of block numbers of formatted nodes sorted by starting key in
70824 + this node. Balancings should invalidate appropriate parts of this
70825 + cache.
70826 +*/
70827 +
70828 +lookup_result coord_by_key(reiser4_tree * tree, const reiser4_key * key,
70829 + coord_t * coord, lock_handle * handle,
70830 + znode_lock_mode lock, lookup_bias bias,
70831 + tree_level lock_level, tree_level stop_level,
70832 + __u32 flags, ra_info_t *);
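+
+/* Illustrative usage sketch (assumptions flagged; not code from the patch):
+   a read-only lookup of a known-unique key down to the leaf level could look
+   like this, where FIND_EXACT and LEAF_LEVEL are the bias and level
+   constants used elsewhere in reiser4:
+
+	coord_t coord;
+	lock_handle lh;
+	lookup_result ret;
+
+	init_lh(&lh);
+	ret = coord_by_key(current_tree, &key, &coord, &lh,
+			   ZNODE_READ_LOCK, FIND_EXACT,
+			   LEAF_LEVEL, LEAF_LEVEL, CBK_UNIQUE, NULL);
+	if (ret == CBK_COORD_FOUND)
+		;	(coord/lh now point at the item, under a read lock)
+	done_lh(&lh);
+*/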
70833 +
70834 +lookup_result object_lookup(struct inode *object,
70835 + const reiser4_key * key,
70836 + coord_t * coord,
70837 + lock_handle * lh,
70838 + znode_lock_mode lock_mode,
70839 + lookup_bias bias,
70840 + tree_level lock_level,
70841 + tree_level stop_level,
70842 + __u32 flags, ra_info_t * info);
70843 +
70844 +insert_result insert_by_key(reiser4_tree * tree, const reiser4_key * key,
70845 + reiser4_item_data * data, coord_t * coord,
70846 + lock_handle * lh,
70847 + tree_level stop_level, __u32 flags);
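+
+/* Note (editorially added, hedged): apart from IBK_INSERT_OK, insert_result
+   values are plain negative errnos, so a caller may branch on them directly;
+   a sketch, assuming key/data/coord/lh were set up as for coord_by_key():
+
+	insert_result ret;
+
+	ret = insert_by_key(tree, &key, &data, &coord, &lh,
+			    LEAF_LEVEL, CBK_UNIQUE);
+	if (ret == IBK_ALREADY_EXISTS)
+		;	(same as -EEXIST: the key is already in the tree)
+	else if (ret != IBK_INSERT_OK)
+		;	(propagate the error code)
+*/
+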
70848 +insert_result insert_by_coord(coord_t * coord,
70849 + reiser4_item_data * data, const reiser4_key * key,
70850 + lock_handle * lh, __u32);
70851 +insert_result insert_extent_by_coord(coord_t * coord,
70852 + reiser4_item_data * data,
70853 + const reiser4_key * key, lock_handle * lh);
70854 +int cut_node_content(coord_t * from, coord_t * to, const reiser4_key * from_key,
70855 + const reiser4_key * to_key,
70856 + reiser4_key * smallest_removed);
70857 +int kill_node_content(coord_t * from, coord_t * to,
70858 + const reiser4_key * from_key, const reiser4_key * to_key,
70859 + reiser4_key * smallest_removed,
70860 + znode * locked_left_neighbor, struct inode *inode,
70861 + int truncate);
70862 +
70863 +int resize_item(coord_t * coord, reiser4_item_data * data,
70864 + reiser4_key * key, lock_handle * lh, cop_insert_flag);
70865 +int insert_into_item(coord_t * coord, lock_handle * lh, const reiser4_key * key,
70866 + reiser4_item_data * data, unsigned);
70867 +int insert_flow(coord_t * coord, lock_handle * lh, flow_t * f);
70868 +int find_new_child_ptr(znode * parent, znode * child, znode * left,
70869 + coord_t * result);
70870 +
70871 +int shift_right_of_but_excluding_insert_coord(coord_t * insert_coord);
70872 +int shift_left_of_and_including_insert_coord(coord_t * insert_coord);
70873 +
70874 +void fake_kill_hook_tail(struct inode *, loff_t start, loff_t end, int);
70875 +
70876 +extern int cut_tree_worker_common(tap_t *, const reiser4_key *,
70877 + const reiser4_key *, reiser4_key *,
70878 + struct inode *, int, int *);
70879 +extern int cut_tree_object(reiser4_tree *, const reiser4_key *,
70880 + const reiser4_key *, reiser4_key *, struct inode *,
70881 + int, int *);
70882 +extern int cut_tree(reiser4_tree * tree, const reiser4_key * from,
70883 + const reiser4_key * to, struct inode *, int);
70884 +
70885 +extern int delete_node(znode * node, reiser4_key *, struct inode *, int);
70886 +extern int check_tree_pointer(const coord_t * pointer, const znode * child);
70887 +extern int find_new_child_ptr(znode * parent, znode * child UNUSED_ARG,
70888 + znode * left, coord_t * result);
70889 +extern int find_child_ptr(znode * parent, znode * child, coord_t * result);
70890 +extern int set_child_delimiting_keys(znode * parent, const coord_t * in_parent,
70891 + znode * child);
70892 +extern znode *child_znode(const coord_t * in_parent, znode * parent,
70893 + int incore_p, int setup_dkeys_p);
70894 +
70895 +extern int cbk_cache_init(cbk_cache * cache);
70896 +extern void cbk_cache_done(cbk_cache * cache);
70897 +extern void cbk_cache_invalidate(const znode * node, reiser4_tree * tree);
70898 +
70899 +extern char *sprint_address(const reiser4_block_nr * block);
70900 +
70901 +#if REISER4_DEBUG
70902 +extern void print_coord_content(const char *prefix, coord_t * p);
70903 +extern void reiser4_print_address(const char *prefix,
70904 + const reiser4_block_nr * block);
70905 +extern void print_tree_rec(const char *prefix, reiser4_tree * tree,
70906 + __u32 flags);
70907 +extern void check_dkeys(znode *node);
70908 +#else
70909 +#define print_coord_content(p, c) noop
70910 +#define reiser4_print_address(p, b) noop
70911 +#endif
70912 +
70913 +extern void forget_znode(lock_handle * handle);
70914 +extern int deallocate_znode(znode * node);
70915 +
70916 +extern int is_disk_addr_unallocated(const reiser4_block_nr * addr);
70917 +
70918 +/* struct used internally to pack the numerous arguments of a tree lookup.
70919 + Used to avoid passing a lot of arguments to helper functions. */
70920 +typedef struct cbk_handle {
70921 + /* tree we are in */
70922 + reiser4_tree *tree;
70923 + /* key we are going after */
70924 + const reiser4_key *key;
70925 + /* coord we will store result in */
70926 + coord_t *coord;
70927 + /* type of lock to take on target node */
70928 + znode_lock_mode lock_mode;
70929 + /* lookup bias. See comments at the declaration of lookup_bias */
70930 + lookup_bias bias;
70931 + /* lock level: level starting from which tree traversal takes
70932 + * write locks. */
70933 + tree_level lock_level;
70934 + /* level where search will stop. Either item will be found between
70935 + lock_level and stop_level, or CBK_COORD_NOTFOUND will be
70936 + returned.
70937 + */
70938 + tree_level stop_level;
70939 + /* level we are currently at */
70940 + tree_level level;
70941 + /* block number of @active node. Tree traversal operates on two
70942 + nodes: active and parent. */
70943 + reiser4_block_nr block;
70944 + /* put here error message to be printed by caller */
70945 + const char *error;
70946 + /* result passed back to caller */
70947 + lookup_result result;
70948 + /* lock handles for active and parent */
70949 + lock_handle *parent_lh;
70950 + lock_handle *active_lh;
70951 + reiser4_key ld_key;
70952 + reiser4_key rd_key;
70953 + /* flags, passed to the cbk routine. Bits of this bitmask are defined
70954 + in tree.h:cbk_flags enum. */
70955 + __u32 flags;
70956 + ra_info_t *ra_info;
70957 + struct inode *object;
70958 +} cbk_handle;
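+
+/* Illustrative sketch (not from the patch): coord_by_key() presumably packs
+   its numerous parameters into a cbk_handle before the per-level helpers
+   run, roughly:
+
+	cbk_handle handle;
+
+	memset(&handle, 0, sizeof handle);
+	handle.tree = tree;
+	handle.key = key;
+	handle.coord = coord;
+	handle.lock_mode = lock_mode;
+	handle.bias = bias;
+	handle.lock_level = lock_level;
+	handle.stop_level = stop_level;
+	handle.flags = flags;
+
+   after which traversal starts at the root, updating handle.level and
+   handle.block on the way down. */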
70959 +
70960 +extern znode_lock_mode cbk_lock_mode(tree_level level, cbk_handle * h);
70961 +
70962 +/* eottl.c */
70963 +extern int handle_eottl(cbk_handle *h, int *outcome);
70964 +
70965 +int lookup_multikey(cbk_handle * handle, int nr_keys);
70966 +int lookup_couple(reiser4_tree * tree,
70967 + const reiser4_key * key1, const reiser4_key * key2,
70968 + coord_t * coord1, coord_t * coord2,
70969 + lock_handle * lh1, lock_handle * lh2,
70970 + znode_lock_mode lock_mode, lookup_bias bias,
70971 + tree_level lock_level, tree_level stop_level, __u32 flags,
70972 + int *result1, int *result2);
70973 +
70974 +
70975 +static inline void read_lock_tree(reiser4_tree *tree)
70976 +{
70977 + /* check that tree is not locked */
70978 + assert("", (LOCK_CNT_NIL(rw_locked_tree) &&
70979 + LOCK_CNT_NIL(read_locked_tree) &&
70980 + LOCK_CNT_NIL(write_locked_tree)));
70981 + /* check that spinlocks of lower priorities are not held */
70982 + assert("", (LOCK_CNT_NIL(spin_locked_txnh) &&
70983 + LOCK_CNT_NIL(rw_locked_dk) &&
70984 + LOCK_CNT_NIL(spin_locked_stack)));
70985 +
70986 + read_lock(&(tree->tree_lock));
70987 +
70988 + LOCK_CNT_INC(read_locked_tree);
70989 + LOCK_CNT_INC(rw_locked_tree);
70990 + LOCK_CNT_INC(spin_locked);
70991 +}
70992 +
70993 +static inline void read_unlock_tree(reiser4_tree *tree)
70994 +{
70995 + assert("nikita-1375", LOCK_CNT_GTZ(read_locked_tree));
70996 + assert("nikita-1376", LOCK_CNT_GTZ(rw_locked_tree));
70997 + assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
70998 +
70999 + LOCK_CNT_DEC(read_locked_tree);
71000 + LOCK_CNT_DEC(rw_locked_tree);
71001 + LOCK_CNT_DEC(spin_locked);
71002 +
71003 + read_unlock(&(tree->tree_lock));
71004 +}
71005 +
71006 +static inline void write_lock_tree(reiser4_tree *tree)
71007 +{
71008 + /* check that tree is not locked */
71009 + assert("", (LOCK_CNT_NIL(rw_locked_tree) &&
71010 + LOCK_CNT_NIL(read_locked_tree) &&
71011 + LOCK_CNT_NIL(write_locked_tree)));
71012 + /* check that spinlocks of lower priorities are not held */
71013 + assert("", (LOCK_CNT_NIL(spin_locked_txnh) &&
71014 + LOCK_CNT_NIL(rw_locked_dk) &&
71015 + LOCK_CNT_NIL(spin_locked_stack)));
71016 +
71017 + write_lock(&(tree->tree_lock));
71018 +
71019 + LOCK_CNT_INC(write_locked_tree);
71020 + LOCK_CNT_INC(rw_locked_tree);
71021 + LOCK_CNT_INC(spin_locked);
71022 +}
71023 +
71024 +static inline void write_unlock_tree(reiser4_tree *tree)
71025 +{
71026 + assert("nikita-1375", LOCK_CNT_GTZ(write_locked_tree));
71027 + assert("nikita-1376", LOCK_CNT_GTZ(rw_locked_tree));
71028 + assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
71029 +
71030 + LOCK_CNT_DEC(write_locked_tree);
71031 + LOCK_CNT_DEC(rw_locked_tree);
71032 + LOCK_CNT_DEC(spin_locked);
71033 +
71034 + write_unlock(&(tree->tree_lock));
71035 +}
71036 +
71037 +static inline void read_lock_dk(reiser4_tree *tree)
71038 +{
71039 + /* check that dk is not locked */
71040 + assert("", (LOCK_CNT_NIL(rw_locked_dk) &&
71041 + LOCK_CNT_NIL(read_locked_dk) &&
71042 + LOCK_CNT_NIL(write_locked_dk)));
71043 + /* check that spinlocks of lower priorities are not held */
71044 + assert("", LOCK_CNT_NIL(spin_locked_stack));
71045 +
71046 + read_lock(&((tree)->dk_lock));
71047 +
71048 + LOCK_CNT_INC(read_locked_dk);
71049 + LOCK_CNT_INC(rw_locked_dk);
71050 + LOCK_CNT_INC(spin_locked);
71051 +}
71052 +
71053 +static inline void read_unlock_dk(reiser4_tree *tree)
71054 +{
71055 + assert("nikita-1375", LOCK_CNT_GTZ(read_locked_dk));
71056 + assert("nikita-1376", LOCK_CNT_GTZ(rw_locked_dk));
71057 + assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
71058 +
71059 + LOCK_CNT_DEC(read_locked_dk);
71060 + LOCK_CNT_DEC(rw_locked_dk);
71061 + LOCK_CNT_DEC(spin_locked);
71062 +
71063 + read_unlock(&(tree->dk_lock));
71064 +}
71065 +
71066 +static inline void write_lock_dk(reiser4_tree *tree)
71067 +{
71068 + /* check that dk is not locked */
71069 + assert("", (LOCK_CNT_NIL(rw_locked_dk) &&
71070 + LOCK_CNT_NIL(read_locked_dk) &&
71071 + LOCK_CNT_NIL(write_locked_dk)));
71072 + /* check that spinlocks of lower priorities are not held */
71073 + assert("", LOCK_CNT_NIL(spin_locked_stack));
71074 +
71075 + write_lock(&((tree)->dk_lock));
71076 +
71077 + LOCK_CNT_INC(write_locked_dk);
71078 + LOCK_CNT_INC(rw_locked_dk);
71079 + LOCK_CNT_INC(spin_locked);
71080 +}
71081 +
71082 +static inline void write_unlock_dk(reiser4_tree *tree)
71083 +{
71084 + assert("nikita-1375", LOCK_CNT_GTZ(write_locked_dk));
71085 + assert("nikita-1376", LOCK_CNT_GTZ(rw_locked_dk));
71086 + assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
71087 +
71088 + LOCK_CNT_DEC(write_locked_dk);
71089 + LOCK_CNT_DEC(rw_locked_dk);
71090 + LOCK_CNT_DEC(spin_locked);
71091 +
71092 + write_unlock(&(tree->dk_lock));
71093 +}
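+
+/* Note on the ordering implied by the assertions above (editorially added):
+   the tree lock ranks above the dk lock, which ranks above the lock-stack
+   spinlock, so a correct nesting is:
+
+	read_lock_tree(tree);	(outermost)
+	read_lock_dk(tree);	(allowed: dk may be taken under tree_lock)
+	...
+	read_unlock_dk(tree);
+	read_unlock_tree(tree);
+
+   taking them in the opposite order would trip the LOCK_CNT_NIL(rw_locked_dk)
+   check in read_lock_tree()/write_lock_tree(). */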
71094 +
71095 +/* estimate api. Implementation is in estimate.c */
71096 +reiser4_block_nr estimate_one_insert_item(reiser4_tree *);
71097 +reiser4_block_nr estimate_one_insert_into_item(reiser4_tree *);
71098 +reiser4_block_nr estimate_insert_flow(tree_level);
71099 +reiser4_block_nr estimate_one_item_removal(reiser4_tree *);
71100 +reiser4_block_nr calc_estimate_one_insert(tree_level);
71101 +reiser4_block_nr estimate_dirty_cluster(struct inode *);
71102 +reiser4_block_nr estimate_insert_cluster(struct inode *);
71103 +reiser4_block_nr estimate_update_cluster(struct inode *);
71104 +
71105 +
71106 +/* __REISER4_TREE_H__ */
71107 +#endif
71108 +
71109 +/* Make Linus happy.
71110 + Local variables:
71111 + c-indentation-style: "K&R"
71112 + mode-name: "LC"
71113 + c-basic-offset: 8
71114 + tab-width: 8
71115 + fill-column: 120
71116 + scroll-step: 1
71117 + End:
71118 +*/
71119 Index: linux-2.6.16/fs/reiser4/tree_mod.c
71120 ===================================================================
71121 --- /dev/null
71122 +++ linux-2.6.16/fs/reiser4/tree_mod.c
71123 @@ -0,0 +1,383 @@
71124 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
71125 + * reiser4/README */
71126 +
71127 +/*
71128 + * Functions to add/delete new nodes to/from the tree.
71129 + *
71130 + * Functions from this file are used by carry (see carry*) to handle:
71131 + *
71132 + * . insertion of new formatted node into tree
71133 + *
71134 + * . addition of new tree root, increasing tree height
71135 + *
71136 + * . removing tree root, decreasing tree height
71137 + *
71138 + */
71139 +
71140 +#include "forward.h"
71141 +#include "debug.h"
71142 +#include "dformat.h"
71143 +#include "key.h"
71144 +#include "coord.h"
71145 +#include "plugin/plugin.h"
71146 +#include "jnode.h"
71147 +#include "znode.h"
71148 +#include "tree_mod.h"
71149 +#include "block_alloc.h"
71150 +#include "tree_walk.h"
71151 +#include "tree.h"
71152 +#include "super.h"
71153 +
71154 +#include <linux/err.h>
71155 +
71156 +static int add_child_ptr(znode * parent, znode * child);
71157 +/* warning only issued if error is not -E_REPEAT */
71158 +#define ewarning( error, ... ) \
71159 + if( ( error ) != -E_REPEAT ) \
71160 + warning( __VA_ARGS__ )
71161 +
71162 +/* allocate a new node at @level, immediately to the right of @brother. */
71163 +znode *new_node(znode * brother /* existing left neighbor of new node */ ,
71164 + tree_level level /* tree level at which new node is to
71165 + * be allocated */ )
71166 +{
71167 + znode *result;
71168 + int retcode;
71169 + reiser4_block_nr blocknr;
71170 +
71171 + assert("nikita-930", brother != NULL);
71172 + assert("umka-264", level < REAL_MAX_ZTREE_HEIGHT);
71173 +
71174 + retcode = assign_fake_blocknr_formatted(&blocknr);
71175 + if (retcode == 0) {
71176 + result =
71177 + zget(znode_get_tree(brother), &blocknr, NULL, level,
71178 + get_gfp_mask());
71179 + if (IS_ERR(result)) {
71180 + ewarning(PTR_ERR(result), "nikita-929",
71181 + "Cannot allocate znode for carry: %li",
71182 + PTR_ERR(result));
71183 + return result;
71184 + }
71185 + /* cheap test, can be executed even when debugging is off */
71186 + if (!znode_just_created(result)) {
71187 + warning("nikita-2213",
71188 + "Allocated already existing block: %llu",
71189 + (unsigned long long)blocknr);
71190 + zput(result);
71191 + return ERR_PTR(RETERR(-EIO));
71192 + }
71193 +
71194 + assert("nikita-931", result != NULL);
71195 + result->nplug = znode_get_tree(brother)->nplug;
71196 + assert("nikita-933", result->nplug != NULL);
71197 +
71198 + retcode = zinit_new(result, get_gfp_mask());
71199 + if (retcode == 0) {
71200 + ZF_SET(result, JNODE_CREATED);
71201 + zrelse(result);
71202 + } else {
71203 + zput(result);
71204 + result = ERR_PTR(retcode);
71205 + }
71206 + } else {
71207 + /* failure to allocate a new node during balancing.
71208 + This should never happen. Ever. Returning -E_REPEAT
71209 + is not a viable solution, because "out of disk space"
71210 + is not a transient error that will go away by itself.
71211 + */
71212 + ewarning(retcode, "nikita-928",
71213 + "Cannot allocate block for carry: %i", retcode);
71214 + result = ERR_PTR(retcode);
71215 + }
71216 + assert("nikita-1071", result != NULL);
71217 + return result;
71218 +}
71219 +
71220 +/* allocate new root and add it to the tree
71221 +
71222 + This helper function is called by add_new_root().
71223 +
71224 +*/
71225 +znode *add_tree_root(znode * old_root /* existing tree root */ ,
71226 + znode * fake /* "fake" znode */ )
71227 +{
71228 + reiser4_tree *tree = znode_get_tree(old_root);
71229 + znode *new_root = NULL; /* to shut gcc up */
71230 + int result;
71231 +
71232 + assert("nikita-1069", old_root != NULL);
71233 + assert("umka-262", fake != NULL);
71234 + assert("umka-263", tree != NULL);
71235 +
71236 + /* The "fake" znode---one always hanging just above the current
71237 + root. This node is locked when a new root is created or the existing
71238 + root is deleted. Downward tree traversal takes a lock on it before
71239 + taking a lock on the root node. This avoids race conditions with
71240 + root manipulations.
71241 +
71242 + */
71243 + assert("nikita-1348", znode_above_root(fake));
71244 + assert("nikita-1211", znode_is_root(old_root));
71245 +
71246 + result = 0;
71247 + if (tree->height >= REAL_MAX_ZTREE_HEIGHT) {
71248 + warning("nikita-1344", "Tree is too tall: %i", tree->height);
71249 + /* ext2 returns -ENOSPC when it runs out of free inodes with a
71250 + following comment (fs/ext2/ialloc.c:441): Is it really
71251 + ENOSPC?
71252 +
71253 + -EXFULL? -EINVAL?
71254 + */
71255 + result = RETERR(-ENOSPC);
71256 + } else {
71257 + /* Allocate a block for the new root. It's not that
71258 + important where it will be allocated, as the root is
71259 + almost always in memory. Moreover, allocate-on-flush
71260 + may be going on here.
71261 + */
71262 + assert("nikita-1448", znode_is_root(old_root));
71263 + new_root = new_node(fake, tree->height + 1);
71264 + if (!IS_ERR(new_root) && (result = zload(new_root)) == 0) {
71265 + lock_handle rlh;
71266 +
71267 + init_lh(&rlh);
71268 + result =
71269 + longterm_lock_znode(&rlh, new_root,
71270 + ZNODE_WRITE_LOCK,
71271 + ZNODE_LOCK_LOPRI);
71272 + if (result == 0) {
71273 + parent_coord_t *in_parent;
71274 +
71275 + znode_make_dirty(fake);
71276 +
71277 + /* new root is a child of "fake" node */
71278 + write_lock_tree(tree);
71279 +
71280 + ++tree->height;
71281 +
71282 + /* recalculate max balance overhead */
71283 + tree->estimate_one_insert =
71284 + estimate_one_insert_item(tree);
71285 +
71286 + tree->root_block = *znode_get_block(new_root);
71287 + in_parent = &new_root->in_parent;
71288 + init_parent_coord(in_parent, fake);
71289 + /* manually insert the new root into the sibling
71290 + * list. With this, all nodes involved in
71291 + * balancing are connected after balancing is
71292 + * done---a useful invariant to check. */
71293 + sibling_list_insert_nolock(new_root, NULL);
71294 + write_unlock_tree(tree);
71295 +
71296 + /* insert into new root pointer to the
71297 + @old_root. */
71298 + assert("nikita-1110",
71299 + WITH_DATA(new_root,
71300 + node_is_empty(new_root)));
71301 + write_lock_dk(tree);
71302 + znode_set_ld_key(new_root, min_key());
71303 + znode_set_rd_key(new_root, max_key());
71304 + write_unlock_dk(tree);
71305 + if (REISER4_DEBUG) {
71306 + ZF_CLR(old_root, JNODE_LEFT_CONNECTED);
71307 + ZF_CLR(old_root, JNODE_RIGHT_CONNECTED);
71308 + ZF_SET(old_root, JNODE_ORPHAN);
71309 + }
71310 + result = add_child_ptr(new_root, old_root);
71311 + done_lh(&rlh);
71312 + }
71313 + zrelse(new_root);
71314 + }
71315 + }
71316 + if (result != 0)
71317 + new_root = ERR_PTR(result);
71318 + return new_root;
71319 +}
71320 +
71321 +/* build &reiser4_item_data for inserting child pointer
71322 +
71323 + Build &reiser4_item_data that can be later used to insert pointer to @child
71324 + in its parent.
71325 +
71326 +*/
71327 +void build_child_ptr_data(znode * child /* node pointer to which will be
71328 + * inserted */ ,
71329 + reiser4_item_data * data /* where to store result */ )
71330 +{
71331 + assert("nikita-1116", child != NULL);
71332 + assert("nikita-1117", data != NULL);
71333 +
71334 + /*
71335 + * NOTE: use the address of the child's blocknr as the address of the
71336 + * data to be inserted. As a result the data gets into the on-disk
71337 + * structure in cpu byte order; internal's create_hook converts it to
71338 + * little-endian byte order.
71339 + */
71340 + data->data = (char *)znode_get_block(child);
71341 + /* data->data is in kernel space */
71342 + data->user = 0;
71343 + data->length = sizeof(reiser4_block_nr);
71344 + /* FIXME-VS: hardcoded internal item? */
71345 +
71346 + /* AUDIT: Is it possible that "item_plugin_by_id" may find nothing? */
71347 + data->iplug = item_plugin_by_id(NODE_POINTER_ID);
71348 +}
71349 +
71350 +/* add pointer to @child into empty @parent.
71351 +
71352 + This is used when pointer to old root is inserted into new root which is
71353 + empty.
71354 +*/
71355 +static int add_child_ptr(znode * parent, znode * child)
71356 +{
71357 + coord_t coord;
71358 + reiser4_item_data data;
71359 + int result;
71360 + reiser4_key key;
71361 +
71362 + assert("nikita-1111", parent != NULL);
71363 + assert("nikita-1112", child != NULL);
71364 + assert("nikita-1115",
71365 + znode_get_level(parent) == znode_get_level(child) + 1);
71366 +
71367 + result = zload(parent);
71368 + if (result != 0)
71369 + return result;
71370 + assert("nikita-1113", node_is_empty(parent));
71371 + coord_init_first_unit(&coord, parent);
71372 +
71373 + build_child_ptr_data(child, &data);
71374 + data.arg = NULL;
71375 +
71376 + read_lock_dk(znode_get_tree(parent));
71377 + key = *znode_get_ld_key(child);
71378 + read_unlock_dk(znode_get_tree(parent));
71379 +
71380 + result = node_plugin_by_node(parent)->create_item(&coord, &key, &data,
71381 + NULL);
71382 + znode_make_dirty(parent);
71383 + zrelse(parent);
71384 + return result;
71385 +}
71386 +
71387 +/* actually remove tree root */
71388 +static int kill_root(reiser4_tree * tree /* tree from which root is being
71389 + * removed */ ,
71390 + znode * old_root /* root node that is being removed */ ,
71391 + znode * new_root /* new root---sole child of *
71392 + * @old_root */ ,
71393 + const reiser4_block_nr * new_root_blk /* disk address of
71394 + * @new_root */ )
71395 +{
71396 + znode *uber;
71397 + int result;
71398 + lock_handle handle_for_uber;
71399 +
71400 + assert("umka-265", tree != NULL);
71401 + assert("nikita-1198", new_root != NULL);
71402 + assert("nikita-1199",
71403 + znode_get_level(new_root) + 1 == znode_get_level(old_root));
71404 +
71405 + assert("nikita-1201", znode_is_write_locked(old_root));
71406 +
71407 + assert("nikita-1203",
71408 + disk_addr_eq(new_root_blk, znode_get_block(new_root)));
71409 +
71410 + init_lh(&handle_for_uber);
71411 + /* obtain and lock "fake" znode protecting changes in tree height. */
71412 + result = get_uber_znode(tree, ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI,
71413 + &handle_for_uber);
71414 + if (result == 0) {
71415 + uber = handle_for_uber.node;
71416 +
71417 + znode_make_dirty(uber);
71418 +
71419 + /* don't take a long-term lock on @new_root; take the spinlock. */
71420 +
71421 + write_lock_tree(tree);
71422 +
71423 + tree->root_block = *new_root_blk;
71424 + --tree->height;
71425 +
71426 + /* recalculate max balance overhead */
71427 + tree->estimate_one_insert = estimate_one_insert_item(tree);
71428 +
71429 + assert("nikita-1202",
71430 + tree->height == znode_get_level(new_root));
71431 +
71432 + /* new root is a child of the "fake" node */
71433 + init_parent_coord(&new_root->in_parent, uber);
71434 + ++uber->c_count;
71435 +
71436 + /* sibling_list_insert_nolock(new_root, NULL); */
71437 + write_unlock_tree(tree);
71438 +
71439 + /* reinitialise old root. */
71440 + result = node_plugin_by_node(old_root)->init(old_root);
71441 + znode_make_dirty(old_root);
71442 + if (result == 0) {
71443 + assert("nikita-1279", node_is_empty(old_root));
71444 + ZF_SET(old_root, JNODE_HEARD_BANSHEE);
71445 + old_root->c_count = 0;
71446 + }
71447 + }
71448 + done_lh(&handle_for_uber);
71449 +
71450 + return result;
71451 +}
71452 +
71453 +/* remove tree root
71454 +
71455 + This function removes the tree root, decreasing the tree height by one.
71456 + The tree root and its only child (which is going to become the new tree
71457 + root) are write locked at entry.
71458 +
71459 + To remove the tree root we need to take a lock on the special "fake" znode
71460 + that protects changes of tree height. See comments in add_tree_root() for
71461 + more on this.
71462 +
71463 + Also, parent pointers have to be updated in the
71464 + old and new roots. To simplify the code, the function is split in two: the
71465 + outer kill_tree_root() collects all necessary arguments and calls
71466 + kill_root() to do the actual job.
71467 +
71468 +*/
71469 +int kill_tree_root(znode * old_root /* tree root that we are removing */ )
71470 +{
71471 + int result;
71472 + coord_t down_link;
71473 + znode *new_root;
71474 + reiser4_tree *tree;
71475 +
71476 + assert("umka-266", current_tree != NULL);
71477 + assert("nikita-1194", old_root != NULL);
71478 + assert("nikita-1196", znode_is_root(old_root));
71479 + assert("nikita-1200", node_num_items(old_root) == 1);
71480 + assert("nikita-1401", znode_is_write_locked(old_root));
71481 +
71482 + coord_init_first_unit(&down_link, old_root);
71483 +
71484 + tree = znode_get_tree(old_root);
71485 + new_root = child_znode(&down_link, old_root, 0, 1);
71486 + if (!IS_ERR(new_root)) {
71487 + result =
71488 + kill_root(tree, old_root, new_root,
71489 + znode_get_block(new_root));
71490 + zput(new_root);
71491 + } else
71492 + result = PTR_ERR(new_root);
71493 +
71494 + return result;
71495 +}
71496 +
71497 +/* Make Linus happy.
71498 + Local variables:
71499 + c-indentation-style: "K&R"
71500 + mode-name: "LC"
71501 + c-basic-offset: 8
71502 + tab-width: 8
71503 + fill-column: 120
71504 + scroll-step: 1
71505 + End:
71506 +*/
71507 Index: linux-2.6.16/fs/reiser4/tree_mod.h
71508 ===================================================================
71509 --- /dev/null
71510 +++ linux-2.6.16/fs/reiser4/tree_mod.h
71511 @@ -0,0 +1,29 @@
71512 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
71513 + * reiser4/README */
71514 +
71515 +/* Functions to add/delete new nodes to/from the tree. See tree_mod.c for
71516 + * comments. */
71517 +
71518 +#if !defined( __REISER4_TREE_MOD_H__ )
71519 +#define __REISER4_TREE_MOD_H__
71520 +
71521 +#include "forward.h"
71522 +
71523 +znode *new_node(znode * brother, tree_level level);
71524 +znode *add_tree_root(znode * old_root, znode * fake);
71525 +int kill_tree_root(znode * old_root);
71526 +void build_child_ptr_data(znode * child, reiser4_item_data * data);
71527 +
71528 +/* __REISER4_TREE_MOD_H__ */
71529 +#endif
71530 +
71531 +/* Make Linus happy.
71532 + Local variables:
71533 + c-indentation-style: "K&R"
71534 + mode-name: "LC"
71535 + c-basic-offset: 8
71536 + tab-width: 8
71537 + fill-column: 120
71538 + scroll-step: 1
71539 + End:
71540 +*/
71541 Index: linux-2.6.16/fs/reiser4/tree_walk.c
71542 ===================================================================
71543 --- /dev/null
71544 +++ linux-2.6.16/fs/reiser4/tree_walk.c
71545 @@ -0,0 +1,926 @@
71546 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
71547 + * reiser4/README */
71548 +
71549 +/* Routines and macros to:
71550 +
71551 + get_left_neighbor()
71552 +
71553 + get_right_neighbor()
71554 +
71555 + get_parent()
71556 +
71557 + get_first_child()
71558 +
71559 + get_last_child()
71560 +
71561 + various routines to walk the whole tree and do things to it like
71562 + repack it, or move it to tertiary storage. Please make them as
71563 + generic as is reasonable.
71564 +
71565 +*/
71566 +
71567 +#include "forward.h"
71568 +#include "debug.h"
71569 +#include "dformat.h"
71570 +#include "coord.h"
71571 +#include "plugin/item/item.h"
71572 +#include "jnode.h"
71573 +#include "znode.h"
71574 +#include "tree_walk.h"
71575 +#include "tree.h"
71576 +#include "super.h"
71577 +
71578 +/* These macros are used internally in tree_walk.c in an attempt to make the
71579 + lock_neighbor() code usable to build lock_parent(), lock_right_neighbor()
71580 + and lock_left_neighbor() */
71581 +#define GET_NODE_BY_PTR_OFFSET(node, off) (*(znode**)(((unsigned long)(node)) + (off)))
71582 +#define FIELD_OFFSET(name) offsetof(znode, name)
71583 +#define PARENT_PTR_OFFSET FIELD_OFFSET(in_parent.node)
71584 +#define LEFT_PTR_OFFSET FIELD_OFFSET(left)
71585 +#define RIGHT_PTR_OFFSET FIELD_OFFSET(right)
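+
+/* Illustration (editorially added): with these definitions,
+
+	GET_NODE_BY_PTR_OFFSET(node, LEFT_PTR_OFFSET)   == node->left
+	GET_NODE_BY_PTR_OFFSET(node, PARENT_PTR_OFFSET) == node->in_parent.node
+
+   so the single lock_neighbor() body below serves the parent, left and
+   right cases by varying only its ptr_offset argument. */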
71586 +
71587 +/* This is the generic procedure to get and lock a `generic' neighbor (left
71588 + or right neighbor or parent). It implements the common algorithm for all
71589 + cases of getting a lock on a neighbor node; only the znode structure field
71590 + differs in each case. It is parameterized by the ptr_offset argument: the
71591 + byte offset of the pointer to the desired neighbor within the current
71592 + node's znode structure. Call this function with the tree lock held. */
71593 +static int lock_neighbor(
71594 + /* resulting lock handle */
71595 + lock_handle * result,
71596 + /* znode to lock */
71597 + znode * node,
71598 + /* pointer to neighbor (or parent) znode field offset, in bytes from
71599 + the base address of znode structure */
71600 + int ptr_offset,
71601 + /* lock mode for longterm_lock_znode call */
71602 + znode_lock_mode mode,
71603 + /* lock request for longterm_lock_znode call */
71604 + znode_lock_request req,
71605 + /* GN_* flags */
71606 + int flags, int rlocked)
71607 +{
71608 + reiser4_tree *tree = znode_get_tree(node);
71609 + znode *neighbor;
71610 + int ret;
71611 +
71612 + assert("umka-236", node != NULL);
71613 + assert("umka-237", tree != NULL);
71614 + assert_rw_locked(&(tree->tree_lock));
71615 +
71616 + if (flags & GN_TRY_LOCK)
71617 + req |= ZNODE_LOCK_NONBLOCK;
71618 + if (flags & GN_SAME_ATOM)
71619 + req |= ZNODE_LOCK_DONT_FUSE;
71620 +
71621 + /* get the neighbor's address by using the sibling link; quit the
71622 + while loop (and return) if the link is not available. */
71623 + while (1) {
71624 + neighbor = GET_NODE_BY_PTR_OFFSET(node, ptr_offset);
71625 +
71626 + /* return -E_NO_NEIGHBOR if parent or side pointer is NULL or if
71627 + * node pointed by it is not connected.
71628 + *
71629 + * However, GN_ALLOW_NOT_CONNECTED option masks "connected"
71630 + * check and allows passing reference to not connected znode to
71631 + * subsequent longterm_lock_znode() call. This kills possible
71632 + * busy loop if we are trying to get longterm lock on locked but
71633 + * not yet connected parent node. */
71634 + if (neighbor == NULL || !((flags & GN_ALLOW_NOT_CONNECTED)
71635 + || znode_is_connected(neighbor))) {
71636 + return RETERR(-E_NO_NEIGHBOR);
71637 + }
71638 +
71639 + /* protect it from deletion. */
71640 + zref(neighbor);
71641 +
71642 + rlocked ? read_unlock_tree(tree) : write_unlock_tree(tree);
71643 +
71644 + ret = longterm_lock_znode(result, neighbor, mode, req);
71645 +
71646 + /* The lock handle obtains its own reference, release the one from above. */
71647 + zput(neighbor);
71648 +
71649 + rlocked ? read_lock_tree(tree) : write_lock_tree(tree);
71650 +
71651 + /* restart if the node we got a reference to is being
71652 + invalidated. We should not get a reference to this node
71653 + again. */
71654 + if (ret == -EINVAL)
71655 + continue;
71656 + if (ret)
71657 + return ret;
71658 +
71659 + /* check if neighbor link still points to just locked znode;
71660 + the link could have been changed while the process slept. */
71661 + if (neighbor == GET_NODE_BY_PTR_OFFSET(node, ptr_offset))
71662 + return 0;
71663 +
71664 + /* znode was locked by mistake; unlock it and restart locking
71665 + process from beginning. */
71666 + rlocked ? read_unlock_tree(tree) : write_unlock_tree(tree);
71667 + longterm_unlock_znode(result);
71668 + rlocked ? read_lock_tree(tree) : write_lock_tree(tree);
71669 + }
71670 +}
71671 +
71672 +/* get parent node with longterm lock, accepts GN* flags. */
71673 +int reiser4_get_parent_flags(lock_handle * lh /* resulting lock handle */ ,
71674 + znode * node /* child node */ ,
71675 + znode_lock_mode mode
71676 + /* type of lock: read or write */ ,
71677 + int flags /* GN_* flags */ )
71678 +{
71679 + int result;
71680 +
71681 + read_lock_tree(znode_get_tree(node));
71682 + result = lock_neighbor(lh, node, PARENT_PTR_OFFSET, mode,
71683 + ZNODE_LOCK_HIPRI, flags, 1);
71684 + read_unlock_tree(znode_get_tree(node));
71685 + return result;
71686 +}
71687 +
71688 +/* wrapper function to lock right or left neighbor depending on GN_GO_LEFT
71689 + bit in @flags parameter */
71690 +/* Audited by: umka (2002.06.14) */
71691 +static inline int
71692 +lock_side_neighbor(lock_handle * result,
71693 + znode * node, znode_lock_mode mode, int flags, int rlocked)
71694 +{
71695 + int ret;
71696 + int ptr_offset;
71697 + znode_lock_request req;
71698 +
71699 + if (flags & GN_GO_LEFT) {
71700 + ptr_offset = LEFT_PTR_OFFSET;
71701 + req = ZNODE_LOCK_LOPRI;
71702 + } else {
71703 + ptr_offset = RIGHT_PTR_OFFSET;
71704 + req = ZNODE_LOCK_HIPRI;
71705 + }
71706 +
71707 + ret =
71708 + lock_neighbor(result, node, ptr_offset, mode, req, flags, rlocked);
71709 +
71710 + if (ret == -E_NO_NEIGHBOR) /* if we walk left or right, -E_NO_NEIGHBOR
71711 + * does not guarantee that the neighbor is
71712 + * absent from the tree; in this case we
71713 + * return -ENOENT, meaning the neighbor was
71714 + * at least not found in the cache */
71715 + return RETERR(-ENOENT);
71716 +
71717 + return ret;
71718 +}
71719 +
71720 +#if REISER4_DEBUG
71721 +
71722 +int check_sibling_list(znode * node)
71723 +{
71724 + znode *scan;
71725 + znode *next;
71726 +
71727 + assert("nikita-3283", LOCK_CNT_GTZ(write_locked_tree));
71728 +
71729 + if (node == NULL)
71730 + return 1;
71731 +
71732 + if (ZF_ISSET(node, JNODE_RIP))
71733 + return 1;
71734 +
71735 + assert("nikita-3270", node != NULL);
71736 + assert_rw_write_locked(&(znode_get_tree(node)->tree_lock));
71737 +
71738 + for (scan = node; znode_is_left_connected(scan); scan = next) {
71739 + next = scan->left;
71740 + if (next != NULL && !ZF_ISSET(next, JNODE_RIP)) {
71741 + assert("nikita-3271", znode_is_right_connected(next));
71742 + assert("nikita-3272", next->right == scan);
71743 + } else
71744 + break;
71745 + }
71746 + for (scan = node; znode_is_right_connected(scan); scan = next) {
71747 + next = scan->right;
71748 + if (next != NULL && !ZF_ISSET(next, JNODE_RIP)) {
71749 + assert("nikita-3273", znode_is_left_connected(next));
71750 + assert("nikita-3274", next->left == scan);
71751 + } else
71752 + break;
71753 + }
71754 + return 1;
71755 +}
71756 +
71757 +#endif
71758 +
71759 +/* Znode sibling pointers maintenance. */
71760 +
71761 +/* Znode sibling pointers are established between any neighboring nodes which
71762 + are in cache. There are two znode state bits (JNODE_LEFT_CONNECTED,
71763 + JNODE_RIGHT_CONNECTED): if the left or right sibling pointer contains an
71764 + actual value (even NULL), the corresponding JNODE_*_CONNECTED bit is set.
71765 +
71766 + Reiser4 tree operations which may allocate new znodes (CBK, tree balancing)
71767 + take care of searching for znode neighbors (a hash table lookup may be
71768 + required), establishing sibling pointers between them and setting the
71769 + JNODE_*_CONNECTED state bits. */
71770 +
71771 +/* adjusting of sibling pointers and `connected' states for two
71772 + neighbors; works if one neighbor is NULL (was not found). */
71773 +
71774 +/* FIXME-VS: this is unstatic-ed to use in tree.c in prepare_twig_cut */
71775 +void link_left_and_right(znode * left, znode * right)
71776 +{
71777 + assert("nikita-3275", check_sibling_list(left));
71778 + assert("nikita-3275", check_sibling_list(right));
71779 +
71780 + if (left != NULL) {
71781 + if (left->right == NULL) {
71782 + left->right = right;
71783 + ZF_SET(left, JNODE_RIGHT_CONNECTED);
71784 +
71785 + ON_DEBUG(left->right_version =
71786 + atomic_inc_return(&delim_key_version);
71787 + );
71788 +
71789 + } else if (ZF_ISSET(left->right, JNODE_HEARD_BANSHEE)
71790 + && left->right != right) {
71791 +
71792 + ON_DEBUG(left->right->left_version =
71793 + atomic_inc_return(&delim_key_version);
71794 + left->right_version =
71795 + atomic_inc_return(&delim_key_version););
71796 +
71797 + left->right->left = NULL;
71798 + left->right = right;
71799 + ZF_SET(left, JNODE_RIGHT_CONNECTED);
71800 + } else
71801 + /*
71802 + * there is a race condition in renew_sibling_link()
71803 + * and the assertions below check that it is the only
71804 + * one there. Thread T1 calls renew_sibling_link()
71805 + * without the GN_NO_ALLOC flag. zlook() doesn't find
71806 + * the neighbor node, but before T1 gets to
71807 + * link_left_and_right(), another thread T2 creates
71808 + * the neighbor node and connects it. The check for
71809 + * left->right == NULL above protects T1 from
71810 + * overwriting the correct left->right pointer
71811 + * installed by T2.
71812 + */
71813 + assert("nikita-3302",
71814 + right == NULL || left->right == right);
71815 + }
71816 + if (right != NULL) {
71817 + if (right->left == NULL) {
71818 + right->left = left;
71819 + ZF_SET(right, JNODE_LEFT_CONNECTED);
71820 +
71821 + ON_DEBUG(right->left_version =
71822 + atomic_inc_return(&delim_key_version);
71823 + );
71824 +
71825 + } else if (ZF_ISSET(right->left, JNODE_HEARD_BANSHEE)
71826 + && right->left != left) {
71827 +
71828 + ON_DEBUG(right->left->right_version =
71829 + atomic_inc_return(&delim_key_version);
71830 + right->left_version =
71831 + atomic_inc_return(&delim_key_version););
71832 +
71833 + right->left->right = NULL;
71834 + right->left = left;
71835 + ZF_SET(right, JNODE_LEFT_CONNECTED);
71836 +
71837 + } else
71838 + assert("nikita-3303",
71839 + left == NULL || right->left == left);
71840 + }
71841 + assert("nikita-3275", check_sibling_list(left));
71842 + assert("nikita-3275", check_sibling_list(right));
71843 +}
71844 +
71845 +/* Audited by: umka (2002.06.14) */
71846 +static void link_znodes(znode * first, znode * second, int to_left)
71847 +{
71848 + if (to_left)
71849 + link_left_and_right(second, first);
71850 + else
71851 + link_left_and_right(first, second);
71852 +}
71853 +
71854 +/* get the next unit position of a coord (to the left or to the right,
71855 + depending on the GN_GO_LEFT bit in flags) in the horizontal direction,
71856 + even across a node boundary. Should be called under the tree lock; it
71857 + protects the nonexistence of a sibling link on the parent level if
71858 + lock_side_neighbor() fails with -ENOENT. */
71859 +static int far_next_coord(coord_t * coord, lock_handle * handle, int flags)
71860 +{
71861 + int ret;
71862 + znode *node;
71863 + reiser4_tree *tree;
71864 +
71865 + assert("umka-243", coord != NULL);
71866 + assert("umka-244", handle != NULL);
71867 + assert("zam-1069", handle->node == NULL);
71868 +
71869 + ret =
71870 + (flags & GN_GO_LEFT) ? coord_prev_unit(coord) :
71871 + coord_next_unit(coord);
71872 + if (!ret)
71873 + return 0;
71874 +
71875 + ret =
71876 + lock_side_neighbor(handle, coord->node, ZNODE_READ_LOCK, flags, 0);
71877 + if (ret)
71878 + return ret;
71879 +
71880 + node = handle->node;
71881 + tree = znode_get_tree(node);
71882 + write_unlock_tree(tree);
71883 +
71884 + coord_init_zero(coord);
71885 +
71886 + /* We avoid a synchronous read here if the flag specifies so. */
71887 + if ((flags & GN_ASYNC) && znode_page(handle->node) == NULL) {
71888 + ret = jstartio(ZJNODE(handle->node));
71889 + if (!ret)
71890 + ret = -E_REPEAT;
71891 + goto error_locked;
71892 + }
71893 +
71894 + /* the corresponding zrelse() should be called by the clients of
71895 + far_next_coord(), at the place where this node gets unlocked. */
71896 + ret = zload(handle->node);
71897 + if (ret)
71898 + goto error_locked;
71899 +
71900 + if (flags & GN_GO_LEFT)
71901 + coord_init_last_unit(coord, node);
71902 + else
71903 + coord_init_first_unit(coord, node);
71904 +
71905 + if (0) {
71906 + error_locked:
71907 + longterm_unlock_znode(handle);
71908 + }
71909 + write_lock_tree(tree);
71910 + return ret;
71911 +}
71912 +
71913 +/* Very significant function which performs a step in the horizontal
71914 + direction when a sibling pointer is not available. Actually, it is the
71915 + only function which does this.
71916 + Note: this function does not restore locking status at exit; the
71917 + caller must take care of proper unlocking and zrelse-ing */
71918 +static int
71919 +renew_sibling_link(coord_t * coord, lock_handle * handle, znode * child,
71920 + tree_level level, int flags, int *nr_locked)
71921 +{
71922 + int ret;
71923 + int to_left = flags & GN_GO_LEFT;
71924 + reiser4_block_nr da;
71925 + /* parent of the neighbor node; we set it to the child's parent until
71926 + we detect that child and neighbor do not share a parent */
71927 + znode *side_parent = coord->node;
71928 + reiser4_tree *tree = znode_get_tree(child);
71929 + znode *neighbor = NULL;
71930 +
71931 + assert("umka-245", coord != NULL);
71932 + assert("umka-246", handle != NULL);
71933 + assert("umka-247", child != NULL);
71934 + assert("umka-303", tree != NULL);
71935 +
71936 + init_lh(handle);
71937 + write_lock_tree(tree);
71938 + ret = far_next_coord(coord, handle, flags);
71939 +
71940 + if (ret) {
71941 + if (ret != -ENOENT) {
71942 + write_unlock_tree(tree);
71943 + return ret;
71944 + }
71945 + } else {
71946 + item_plugin *iplug;
71947 +
71948 + if (handle->node != NULL) {
71949 + (*nr_locked)++;
71950 + side_parent = handle->node;
71951 + }
71952 +
71953 + /* does the coord object point to an internal item? We do not
71954 + support sibling pointers between znodes for formatted and
71955 + unformatted nodes and return -E_NO_NEIGHBOR in that case. */
71956 + iplug = item_plugin_by_coord(coord);
71957 + if (!item_is_internal(coord)) {
71958 + link_znodes(child, NULL, to_left);
71959 + write_unlock_tree(tree);
71960 + /* we know there can't be formatted neighbor */
71961 + return RETERR(-E_NO_NEIGHBOR);
71962 + }
71963 + write_unlock_tree(tree);
71964 +
71965 + iplug->s.internal.down_link(coord, NULL, &da);
71966 +
71967 + if (flags & GN_NO_ALLOC) {
71968 + neighbor = zlook(tree, &da);
71969 + } else {
71970 + neighbor =
71971 + zget(tree, &da, side_parent, level, get_gfp_mask());
71972 + }
71973 +
71974 + if (IS_ERR(neighbor)) {
71975 + ret = PTR_ERR(neighbor);
71976 + return ret;
71977 + }
71978 +
71979 + if (neighbor)
71980 + /* update delimiting keys */
71981 + set_child_delimiting_keys(coord->node, coord, neighbor);
71982 +
71983 + write_lock_tree(tree);
71984 + }
71985 +
71986 + if (likely(neighbor == NULL ||
71987 + (znode_get_level(child) == znode_get_level(neighbor)
71988 + && child != neighbor)))
71989 + link_znodes(child, neighbor, to_left);
71990 + else {
71991 + warning("nikita-3532",
71992 + "Sibling nodes on the different levels: %i != %i\n",
71993 + znode_get_level(child), znode_get_level(neighbor));
71994 + ret = RETERR(-EIO);
71995 + }
71996 +
71997 + write_unlock_tree(tree);
71998 +
71999 + /* if GN_NO_ALLOC isn't set we keep reference to neighbor znode */
72000 + if (neighbor != NULL && (flags & GN_NO_ALLOC))
72001 + /* atomic_dec(&ZJNODE(neighbor)->x_count); */
72002 + zput(neighbor);
72003 +
72004 + return ret;
72005 +}
72006 +
72007 +/* This function is for establishing a one-side relation. */
72008 +/* Audited by: umka (2002.06.14) */
72009 +static int connect_one_side(coord_t * coord, znode * node, int flags)
72010 +{
72011 + coord_t local;
72012 + lock_handle handle;
72013 + int nr_locked;
72014 + int ret;
72015 +
72016 + assert("umka-248", coord != NULL);
72017 + assert("umka-249", node != NULL);
72018 +
72019 + coord_dup_nocheck(&local, coord);
72020 +
72021 + init_lh(&handle);
72022 +
72023 + ret =
72024 + renew_sibling_link(&local, &handle, node, znode_get_level(node),
72025 + flags | GN_NO_ALLOC, &nr_locked);
72026 +
72027 + if (handle.node != NULL) {
72028 + /* complementary operations for zload() and lock() in far_next_coord() */
72029 + zrelse(handle.node);
72030 + longterm_unlock_znode(&handle);
72031 + }
72032 +
72033 + /* we catch error codes which are not interesting for us because we
72034 + run renew_sibling_link() only for znode connection. */
72035 + if (ret == -ENOENT || ret == -E_NO_NEIGHBOR)
72036 + return 0;
72037 +
72038 + return ret;
72039 +}
72040 +
72041 +/* if @child is not in `connected' state, performs hash searches for left and
72042 + right neighbor nodes and establishes horizontal sibling links */
72043 +/* Audited by: umka (2002.06.14), umka (2002.06.15) */
72044 +int connect_znode(coord_t * parent_coord, znode * child)
72045 +{
72046 + reiser4_tree *tree = znode_get_tree(child);
72047 + int ret = 0;
72048 +
72049 + assert("zam-330", parent_coord != NULL);
72050 + assert("zam-331", child != NULL);
72051 + assert("zam-332", parent_coord->node != NULL);
72052 + assert("umka-305", tree != NULL);
72053 +
72054 + /* it is trivial to `connect' root znode because it can't have
72055 + neighbors */
72056 + if (znode_above_root(parent_coord->node)) {
72057 + child->left = NULL;
72058 + child->right = NULL;
72059 + ZF_SET(child, JNODE_LEFT_CONNECTED);
72060 + ZF_SET(child, JNODE_RIGHT_CONNECTED);
72061 +
72062 + ON_DEBUG(child->left_version =
72063 + atomic_inc_return(&delim_key_version);
72064 + child->right_version =
72065 + atomic_inc_return(&delim_key_version););
72066 +
72067 + return 0;
72068 + }
72069 +
72070 + /* load parent node */
72071 + coord_clear_iplug(parent_coord);
72072 + ret = zload(parent_coord->node);
72073 +
72074 + if (ret != 0)
72075 + return ret;
72076 +
72077 + /* protect `connected' state check by tree_lock */
72078 + read_lock_tree(tree);
72079 +
72080 + if (!znode_is_right_connected(child)) {
72081 + read_unlock_tree(tree);
72082 + /* connect right (default is right) */
72083 + ret = connect_one_side(parent_coord, child, GN_NO_ALLOC);
72084 + if (ret)
72085 + goto zrelse_and_ret;
72086 +
72087 + read_lock_tree(tree);
72088 + }
72089 +
72090 + ret = znode_is_left_connected(child);
72091 +
72092 + read_unlock_tree(tree);
72093 +
72094 + if (!ret) {
72095 + ret =
72096 + connect_one_side(parent_coord, child,
72097 + GN_NO_ALLOC | GN_GO_LEFT);
72098 + } else
72099 + ret = 0;
72100 +
72101 + zrelse_and_ret:
72102 + zrelse(parent_coord->node);
72103 +
72104 + return ret;
72105 +}
72106 +
72107 +/* this function is like renew_sibling_link() but allocates the neighbor
72108 + node if it doesn't exist and `connects' it. It may require making two
72109 + steps in the horizontal direction: the first one finds/allocates the
72110 + neighbor node, the second finds the neighbor of the neighbor, to connect
72111 + the freshly allocated znode. */
72112 +/* Audited by: umka (2002.06.14), umka (2002.06.15) */
72113 +static int
72114 +renew_neighbor(coord_t * coord, znode * node, tree_level level, int flags)
72115 +{
72116 + coord_t local;
72117 + lock_handle empty[2];
72118 + reiser4_tree *tree = znode_get_tree(node);
72119 + znode *neighbor = NULL;
72120 + int nr_locked = 0;
72121 + int ret;
72122 +
72123 + assert("umka-250", coord != NULL);
72124 + assert("umka-251", node != NULL);
72125 + assert("umka-307", tree != NULL);
72126 + assert("umka-308", level <= tree->height);
72127 +
72128 + /* umka (2002.06.14)
72129 + Here probably should be a check for given "level" validness.
72130 + Something like assert("xxx-yyy", level < REAL_MAX_ZTREE_HEIGHT);
72131 + */
72132 +
72133 + coord_dup(&local, coord);
72134 +
72135 + ret =
72136 + renew_sibling_link(&local, &empty[0], node, level,
72137 + flags & ~GN_NO_ALLOC, &nr_locked);
72138 + if (ret)
72139 + goto out;
72140 +
72141 + /* tree lock is not needed here because we keep parent node(s) locked
72142 + and reference to neighbor znode incremented */
72143 + neighbor = (flags & GN_GO_LEFT) ? node->left : node->right;
72144 +
72145 + read_lock_tree(tree);
72146 + ret = znode_is_connected(neighbor);
72147 + read_unlock_tree(tree);
72148 + if (ret) {
72149 + ret = 0;
72150 + goto out;
72151 + }
72152 +
72153 + ret =
72154 + renew_sibling_link(&local, &empty[nr_locked], neighbor, level,
72155 + flags | GN_NO_ALLOC, &nr_locked);
72156 + /* second renew_sibling_link() call is used for znode connection only,
72157 + so we can live with these errors */
72158 + if (-ENOENT == ret || -E_NO_NEIGHBOR == ret)
72159 + ret = 0;
72160 +
72161 + out:
72162 +
72163 + for (--nr_locked; nr_locked >= 0; --nr_locked) {
72164 + zrelse(empty[nr_locked].node);
72165 + longterm_unlock_znode(&empty[nr_locked]);
72166 + }
72167 +
72168 + if (neighbor != NULL)
72169 + /* decrement znode reference counter without actually
72170 + releasing it. */
72171 + atomic_dec(&ZJNODE(neighbor)->x_count);
72172 +
72173 + return ret;
72174 +}
72175 +
72176 +/*
72177 + reiser4_get_neighbor() -- lock node's neighbor.
72178 +
72179 + reiser4_get_neighbor() locks the node's neighbor (left or right, depending
72180 + on the given parameter) using the sibling link to it. If the sibling link
72181 + is not available (i.e. the neighbor znode is not in cache) and flags allow
72182 + reading blocks, we go one level up for information about the neighbor's
72183 + disk address. We lock the node's parent; if it is the common parent of
72184 + both 'node' and its neighbor, the neighbor's disk address is in the next
72185 + (to the left or to the right) down link from the link that points to the
72186 + original node. If not, we need to lock the parent's neighbor, read its
72187 + content and take the first (last) downlink with the neighbor's disk
72188 + address. That locking could be done using the sibling link and
72189 + lock_neighbor(), if the sibling link exists. Otherwise we have to go a
72190 + level up again until we find a common parent or a valid sibling link, then
72191 + go down allocating/connecting/locking/reading nodes until the neighbor of the first one is locked.
72192 +
72193 + @neighbor: result lock handle,
72194 + @node: a node which we lock neighbor of,
72195 + @lock_mode: lock mode {LM_READ, LM_WRITE},
72196 + @flags: logical OR of {GN_*} (see description above) subset.
72197 +
72198 + @return: 0 on success; a negative value if locking was impossible due to an
72199 + error or lack of a neighbor node.
72200 +*/
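+
+/* Illustrative usage sketch (not from the patch): locking the right neighbor
+   for reading, allowing ascent to upper levels when the sibling pointer is
+   not cached:
+
+	lock_handle right;
+	int ret;
+
+	init_lh(&right);
+	ret = reiser4_get_neighbor(&right, node, ZNODE_READ_LOCK,
+				   GN_CAN_USE_UPPER_LEVELS | GN_LOAD_NEIGHBOR);
+	if (ret == 0) {
+		(right.node is locked and, via GN_LOAD_NEIGHBOR, loaded here)
+		zrelse(right.node);
+	}
+	done_lh(&right);
+*/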
72201 +
72202 +/* Audited by: umka (2002.06.14), umka (2002.06.15) */
72203 +int
72204 +reiser4_get_neighbor(lock_handle * neighbor, znode * node,
72205 + znode_lock_mode lock_mode, int flags)
72206 +{
72207 + reiser4_tree *tree = znode_get_tree(node);
72208 + lock_handle path[REAL_MAX_ZTREE_HEIGHT];
72209 +
72210 + coord_t coord;
72211 +
72212 + tree_level base_level;
72213 + tree_level h = 0;
72214 + int ret;
72215 +
72216 + assert("umka-252", tree != NULL);
72217 + assert("umka-253", neighbor != NULL);
72218 + assert("umka-254", node != NULL);
72219 +
72220 + base_level = znode_get_level(node);
72221 +
72222 + assert("umka-310", base_level <= tree->height);
72223 +
72224 + coord_init_zero(&coord);
72225 +
72226 + again:
72227 + /* first, we try to use simple lock_neighbor() which requires sibling
72228 + link existence */
72229 + read_lock_tree(tree);
72230 + ret = lock_side_neighbor(neighbor, node, lock_mode, flags, 1);
72231 + read_unlock_tree(tree);
72232 + if (!ret) {
72233 + /* load znode content if it was specified */
72234 + if (flags & GN_LOAD_NEIGHBOR) {
72235 + ret = zload(node);
72236 + if (ret)
72237 + longterm_unlock_znode(neighbor);
72238 + }
72239 + return ret;
72240 + }
72241 +
72242 + /* only -ENOENT means we may look upward and try to connect
72243 + @node with its neighbor (if @flags allow us to do it) */
72244 + if (ret != -ENOENT || !(flags & GN_CAN_USE_UPPER_LEVELS))
72245 + return ret;
72246 +
72247 +	/* before establishing the sibling link we lock the parent node;
72248 +	   renew_neighbor() requires this to work. */
72249 + init_lh(&path[0]);
72250 + ret = reiser4_get_parent(&path[0], node, ZNODE_READ_LOCK);
72251 + if (ret)
72252 + return ret;
72253 + if (znode_above_root(path[0].node)) {
72254 + longterm_unlock_znode(&path[0]);
72255 + return RETERR(-E_NO_NEIGHBOR);
72256 + }
72257 +
72258 + while (1) {
72259 + znode *child = (h == 0) ? node : path[h - 1].node;
72260 + znode *parent = path[h].node;
72261 +
72262 + ret = zload(parent);
72263 + if (ret)
72264 + break;
72265 +
72266 + ret = find_child_ptr(parent, child, &coord);
72267 +
72268 + if (ret) {
72269 + zrelse(parent);
72270 + break;
72271 + }
72272 +
72273 + /* try to establish missing sibling link */
72274 + ret = renew_neighbor(&coord, child, h + base_level, flags);
72275 +
72276 + zrelse(parent);
72277 +
72278 + switch (ret) {
72279 + case 0:
72280 + /* unlocking of parent znode prevents simple
72281 + deadlock situation */
72282 + done_lh(&path[h]);
72283 +
72284 +			/* depending on the tree level we are at, we repeat
72285 +			   the first locking attempt ... */
72286 + if (h == 0)
72287 + goto again;
72288 +
72289 +			/* ... or retry establishing the sibling link one
72290 +			   level below. */
72291 + --h;
72292 + break;
72293 +
72294 + case -ENOENT:
72295 + /* sibling link is not available -- we go
72296 + upward. */
72297 + init_lh(&path[h + 1]);
72298 + ret =
72299 + reiser4_get_parent(&path[h + 1], parent,
72300 + ZNODE_READ_LOCK);
72301 + if (ret)
72302 + goto fail;
72303 + ++h;
72304 + if (znode_above_root(path[h].node)) {
72305 + ret = RETERR(-E_NO_NEIGHBOR);
72306 + goto fail;
72307 + }
72308 + break;
72309 +
72310 + case -E_DEADLOCK:
72311 +			/* there was a lock request from a high-priority
72312 +			   locker. If possible, we unlock the last parent
72313 +			   node and re-lock it. */
72314 + for (; check_deadlock(); h--) {
72315 + done_lh(&path[h]);
72316 + if (h == 0)
72317 + goto fail;
72318 + }
72319 +
72320 + break;
72321 +
72322 + default: /* other errors. */
72323 + goto fail;
72324 + }
72325 + }
72326 + fail:
72327 + ON_DEBUG(check_lock_node_data(node));
72328 + ON_DEBUG(check_lock_data());
72329 +
72330 + /* unlock path */
72331 + do {
72332 + /* FIXME-Zam: when we get here from case -E_DEADLOCK's goto
72333 + fail; path[0] is already done_lh-ed, therefore
72334 + longterm_unlock_znode(&path[h]); is not applicable */
72335 + done_lh(&path[h]);
72336 + --h;
72337 + } while (h + 1 != 0);
72338 +
72339 + return ret;
72340 +}
72341 +
72342 +/* remove node from sibling list */
72343 +/* Audited by: umka (2002.06.14) */
72344 +void sibling_list_remove(znode * node)
72345 +{
72346 + reiser4_tree *tree;
72347 +
72348 + tree = znode_get_tree(node);
72349 + assert("umka-255", node != NULL);
72350 + assert_rw_write_locked(&(tree->tree_lock));
72351 + assert("nikita-3275", check_sibling_list(node));
72352 +
72353 + write_lock_dk(tree);
72354 + if (znode_is_right_connected(node) && node->right != NULL &&
72355 + znode_is_left_connected(node) && node->left != NULL) {
72356 + assert("zam-32245",
72357 + keyeq(znode_get_rd_key(node),
72358 + znode_get_ld_key(node->right)));
72359 + znode_set_rd_key(node->left, znode_get_ld_key(node->right));
72360 + }
72361 + write_unlock_dk(tree);
72362 +
72363 + if (znode_is_right_connected(node) && node->right != NULL) {
72364 + assert("zam-322", znode_is_left_connected(node->right));
72365 + node->right->left = node->left;
72366 + ON_DEBUG(node->right->left_version =
72367 + atomic_inc_return(&delim_key_version);
72368 + );
72369 + }
72370 + if (znode_is_left_connected(node) && node->left != NULL) {
72371 + assert("zam-323", znode_is_right_connected(node->left));
72372 + node->left->right = node->right;
72373 + ON_DEBUG(node->left->right_version =
72374 + atomic_inc_return(&delim_key_version);
72375 + );
72376 + }
72377 +
72378 + ZF_CLR(node, JNODE_LEFT_CONNECTED);
72379 + ZF_CLR(node, JNODE_RIGHT_CONNECTED);
72380 + ON_DEBUG(node->left = node->right = NULL;
72381 + node->left_version = atomic_inc_return(&delim_key_version);
72382 + node->right_version = atomic_inc_return(&delim_key_version););
72383 + assert("nikita-3276", check_sibling_list(node));
72384 +}
72385 +
72386 +/* disconnect node from sibling list */
72387 +void sibling_list_drop(znode * node)
72388 +{
72389 + znode *right;
72390 + znode *left;
72391 +
72392 + assert("nikita-2464", node != NULL);
72393 + assert("nikita-3277", check_sibling_list(node));
72394 +
72395 + right = node->right;
72396 + if (right != NULL) {
72397 + assert("nikita-2465", znode_is_left_connected(right));
72398 + right->left = NULL;
72399 + ON_DEBUG(right->left_version =
72400 + atomic_inc_return(&delim_key_version);
72401 + );
72402 + }
72403 + left = node->left;
72404 + if (left != NULL) {
72405 + assert("zam-323", znode_is_right_connected(left));
72406 + left->right = NULL;
72407 + ON_DEBUG(left->right_version =
72408 + atomic_inc_return(&delim_key_version);
72409 + );
72410 + }
72411 + ZF_CLR(node, JNODE_LEFT_CONNECTED);
72412 + ZF_CLR(node, JNODE_RIGHT_CONNECTED);
72413 + ON_DEBUG(node->left = node->right = NULL;
72414 + node->left_version = atomic_inc_return(&delim_key_version);
72415 + node->right_version = atomic_inc_return(&delim_key_version););
72416 +}
72417 +
72418 +/* Insert a new node into the sibling list. Regular balancing inserts the new
72419 +   node after (at the right side of) an existing and locked node (@before),
72420 +   except when adding a new tree root node; @before should be NULL then. */
72421 +void sibling_list_insert_nolock(znode * new, znode * before)
72422 +{
72423 + assert("zam-334", new != NULL);
72424 + assert("nikita-3298", !znode_is_left_connected(new));
72425 + assert("nikita-3299", !znode_is_right_connected(new));
72426 + assert("nikita-3300", new->left == NULL);
72427 + assert("nikita-3301", new->right == NULL);
72428 + assert("nikita-3278", check_sibling_list(new));
72429 + assert("nikita-3279", check_sibling_list(before));
72430 +
72431 + if (before != NULL) {
72432 + assert("zam-333", znode_is_connected(before));
72433 + new->right = before->right;
72434 + new->left = before;
72435 + ON_DEBUG(new->right_version =
72436 + atomic_inc_return(&delim_key_version);
72437 + new->left_version =
72438 + atomic_inc_return(&delim_key_version););
72439 + if (before->right != NULL) {
72440 + before->right->left = new;
72441 + ON_DEBUG(before->right->left_version =
72442 + atomic_inc_return(&delim_key_version);
72443 + );
72444 + }
72445 + before->right = new;
72446 + ON_DEBUG(before->right_version =
72447 + atomic_inc_return(&delim_key_version);
72448 + );
72449 + } else {
72450 + new->right = NULL;
72451 + new->left = NULL;
72452 + ON_DEBUG(new->right_version =
72453 + atomic_inc_return(&delim_key_version);
72454 + new->left_version =
72455 + atomic_inc_return(&delim_key_version););
72456 + }
72457 + ZF_SET(new, JNODE_LEFT_CONNECTED);
72458 + ZF_SET(new, JNODE_RIGHT_CONNECTED);
72459 + assert("nikita-3280", check_sibling_list(new));
72460 + assert("nikita-3281", check_sibling_list(before));
72461 +}
72462 +
72463 +/*
72464 + Local variables:
72465 + c-indentation-style: "K&R"
72466 + mode-name: "LC"
72467 + c-basic-offset: 8
72468 + tab-width: 8
72469 + fill-column: 80
72470 + End:
72471 +*/
72472 Index: linux-2.6.16/fs/reiser4/tree_walk.h
72473 ===================================================================
72474 --- /dev/null
72475 +++ linux-2.6.16/fs/reiser4/tree_walk.h
72476 @@ -0,0 +1,125 @@
72477 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
72478 +
72479 +/* definitions of reiser4 tree walk functions */
72480 +
72481 +#ifndef __FS_REISER4_TREE_WALK_H__
72482 +#define __FS_REISER4_TREE_WALK_H__
72483 +
72484 +#include "debug.h"
72485 +#include "forward.h"
72486 +
72487 +/* establishes horizontal links between cached znodes */
72488 +int connect_znode(coord_t * coord, znode * node);
72489 +
72490 +/* tree traversal functions (reiser4_get_parent(), reiser4_get_neighbor())
72491 + have the following common arguments:
72492 +
72493 + return codes:
72494 +
72495 + @return : 0 - OK,
72496 +
72497 +ZAM-FIXME-HANS: wrong return code name. Change them all.
72498 +   -ENOENT - neighbor is not in cache, which is detected by the
72499 +	     absence of a sibling link.
72500 +
72501 +   -E_NO_NEIGHBOR - we are sure that the neighbor (or parent) node cannot
72502 +	     be found (because we are the left-/right-most node of the
72503 +	     tree, for example). Also, this return code is returned by
72504 +	     reiser4_get_parent() when we see no parent link -- it means
72505 +	     that our node is the root node.
72506 +
72507 +   -E_DEADLOCK - deadlock detected (request from a high-priority process
72508 +	     received); other error codes conform to
72509 +	     /usr/include/asm/errno.h .
72510 +*/
72511 +
72512 +int
72513 +reiser4_get_parent_flags(lock_handle * result, znode * node,
72514 + znode_lock_mode mode, int flags);
72515 +
72516 +/* bits definition for reiser4_get_neighbor function `flags' arg. */
72517 +typedef enum {
72518 +	/* If the sibling pointer is NULL, this flag allows get_neighbor() to
72519 +	 * try to find an unallocated, unconnected neighbor by going through
72520 +	 * upper levels */
72521 + GN_CAN_USE_UPPER_LEVELS = 0x1,
72522 + /* locking left neighbor instead of right one */
72523 + GN_GO_LEFT = 0x2,
72524 + /* automatically load neighbor node content */
72525 + GN_LOAD_NEIGHBOR = 0x4,
72526 + /* return -E_REPEAT if can't lock */
72527 + GN_TRY_LOCK = 0x8,
72528 + /* used internally in tree_walk.c, causes renew_sibling to not
72529 + allocate neighbor znode, but only search for it in znode cache */
72530 + GN_NO_ALLOC = 0x10,
72531 + /* do not go across atom boundaries */
72532 + GN_SAME_ATOM = 0x20,
72533 +	/* allow locking of not-connected nodes */
72534 + GN_ALLOW_NOT_CONNECTED = 0x40,
72535 + /* Avoid synchronous jload, instead, call jstartio() and return -E_REPEAT. */
72536 + GN_ASYNC = 0x80
72537 +} znode_get_neigbor_flags;
72538 +
72539 +/* A commonly used wrapper for reiser4_get_parent_flags(). */
72540 +static inline int reiser4_get_parent(lock_handle * result, znode * node,
72541 + znode_lock_mode mode)
72542 +{
72543 + return reiser4_get_parent_flags(result, node, mode,
72544 + GN_ALLOW_NOT_CONNECTED);
72545 +}
72546 +
72547 +int reiser4_get_neighbor(lock_handle * neighbor, znode * node,
72548 + znode_lock_mode lock_mode, int flags);
72549 +
72550 +/* these are wrappers for the most common usages of reiser4_get_neighbor() */
72551 +static inline int
72552 +reiser4_get_left_neighbor(lock_handle * result, znode * node, int lock_mode,
72553 + int flags)
72554 +{
72555 + return reiser4_get_neighbor(result, node, lock_mode,
72556 + flags | GN_GO_LEFT);
72557 +}
72558 +
72559 +static inline int
72560 +reiser4_get_right_neighbor(lock_handle * result, znode * node, int lock_mode,
72561 + int flags)
72562 +{
72563 + ON_DEBUG(check_lock_node_data(node));
72564 + ON_DEBUG(check_lock_data());
72565 + return reiser4_get_neighbor(result, node, lock_mode,
72566 + flags & (~GN_GO_LEFT));
72567 +}
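+
+/* A minimal usage sketch (hypothetical caller, not part of the patch; error
+ * handling is abbreviated):
+ *
+ *	lock_handle lh;
+ *	int ret;
+ *
+ *	init_lh(&lh);
+ *	ret = reiser4_get_right_neighbor(&lh, node, ZNODE_READ_LOCK,
+ *					 GN_CAN_USE_UPPER_LEVELS);
+ *	if (ret == 0) {
+ *		... lh.node is the locked right neighbor ...
+ *	} else if (ret == -E_NO_NEIGHBOR) {
+ *		... @node is the rightmost node on its level ...
+ *	}
+ *	done_lh(&lh);
+ */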
72568 +
72569 +extern void sibling_list_remove(znode * node);
72570 +extern void sibling_list_drop(znode * node);
72571 +extern void sibling_list_insert_nolock(znode * new, znode * before);
72572 +extern void link_left_and_right(znode * left, znode * right);
72573 +
72574 +/* Functions called by tree_walk() when tree_walk() ... */
72575 +struct tree_walk_actor {
72576 + /* ... meets a formatted node, */
72577 + int (*process_znode) (tap_t *, void *);
72578 + /* ... meets an extent, */
72579 + int (*process_extent) (tap_t *, void *);
72580 + /* ... begins tree traversal or repeats it after -E_REPEAT was returned by
72581 + * node or extent processing functions. */
72582 + int (*before) (void *);
72583 +};
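+
+/* A hypothetical actor (sketch only, not part of the patch; count_znode(),
+ * count_extent() and walk_prologue() are assumed names). Per the comment
+ * above, returning -E_REPEAT from a processing callback restarts the
+ * traversal through ->before():
+ *
+ *	static int count_znode(tap_t * tap, void *counter)
+ *	{
+ *		(*(long *)counter)++;
+ *		return 0;
+ *	}
+ *
+ *	struct tree_walk_actor actor = {
+ *		.process_znode = count_znode,
+ *		.process_extent = count_extent,
+ *		.before = walk_prologue,
+ *	};
+ */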
72584 +
72585 +#if REISER4_DEBUG
72586 +int check_sibling_list(znode * node);
72587 +#else
72588 +#define check_sibling_list(n) (1)
72589 +#endif
72590 +
72591 +#endif /* __FS_REISER4_TREE_WALK_H__ */
72592 +
72593 +/*
72594 + Local variables:
72595 + c-indentation-style: "K&R"
72596 + mode-name: "LC"
72597 + c-basic-offset: 8
72598 + tab-width: 8
72599 + fill-column: 120
72600 + End:
72601 +*/
72602 Index: linux-2.6.16/fs/reiser4/txnmgr.c
72603 ===================================================================
72604 --- /dev/null
72605 +++ linux-2.6.16/fs/reiser4/txnmgr.c
72606 @@ -0,0 +1,3158 @@
72607 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
72608 + * reiser4/README */
72609 +
72610 +/* Joshua MacDonald wrote the first draft of this code. */
72611 +
72612 +/* ZAM-LONGTERM-FIXME-HANS: The locking in this file is badly designed, and a
72613 +filesystem scales only as well as its worst locking design. You need to
72614 +substantially restructure this code. Josh was not as experienced a programmer
72615 +as you. Particularly review how the locking style differs from what you did
72616 +for znodes using hi-lo priority locking, and present to me an opinion on
72617 +whether the differences are well founded. */
72618 +
72619 +/* I cannot help but disagree with the sentiment above. The locking of the
72620 + * transaction manager is _not_ badly designed, and, at the very least, is not
72621 + * the scaling bottleneck. The scaling bottleneck is _exactly_ hi-lo priority
72622 + * locking on znodes, especially on the root node of the tree. --nikita,
72623 + * 2003.10.13 */
72624 +
72625 +/* The txnmgr is a set of interfaces that keep track of atoms and transcrash handles. The
72626 + txnmgr processes capture_block requests and manages the relationship between jnodes and
72627 + atoms through the various stages of a transcrash, and it also oversees the fusion and
72628 + capture-on-copy processes. The main difficulty with this task is maintaining a
72629 + deadlock-free lock ordering between atoms and jnodes/handles. The reason for the
72630 +   difficulty is that jnodes, handles, and atoms contain pointer cycles, and the cycle
72631 +   must be broken. The main requirement is that atom-fusion be deadlock free, so once you
72632 +   hold the atom_lock you may then wait to acquire any jnode or handle lock. This implies
72633 +   that any time you check the atom-pointer of a jnode or handle and then try to lock that
72634 +   atom, you must use trylock() and possibly reverse the order (see the sketch below).
72635 +
72636 + This code implements the design documented at:
72637 +
72638 + http://namesys.com/txn-doc.html
72639 +
72640 +ZAM-FIXME-HANS: update v4.html to contain all of the information present in the above (but updated), and then remove the
72641 +above document and reference the new. Be sure to provide some credit to Josh. I already have some writings on this
72642 +topic in v4.html, but they are lacking in details present in the above. Cure that. Remember to write for the bright 12
72643 +year old --- define all technical terms used.
72644 +
72645 +*/
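+
+/* The trylock rule above reduces to the following retry pattern (a distilled
+ * sketch; txnh_get_atom() and jnode_get_atom() below implement it with the
+ * additional reference counting needed to keep the atom from being freed
+ * while its lock is dropped):
+ *
+ *	spin_lock_jnode(node);
+ *	while ((atom = node->atom) != NULL && !spin_trylock_atom(atom)) {
+ *		spin_unlock_jnode(node);
+ *		spin_lock_atom(atom);	... locks in atom->jnode order ...
+ *		spin_lock_jnode(node);
+ *		if (node->atom == atom)
+ *			break;		... the pointer is still valid ...
+ *		spin_unlock_atom(atom);	... raced with fusion, retry ...
+ *	}
+ */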
72646 +
72647 +/* Thoughts on the external transaction interface:
72648 +
72649 + In the current code, a TRANSCRASH handle is created implicitly by init_context() (which
72650 + creates state that lasts for the duration of a system call and is called at the start
72651 + of ReiserFS methods implementing VFS operations), and closed by reiser4_exit_context(),
72652 + occupying the scope of a single system call. We wish to give certain applications an
72653 + interface to begin and close (commit) transactions. Since our implementation of
72654 + transactions does not yet support isolation, allowing an application to open a
72655 + transaction implies trusting it to later close the transaction. Part of the
72656 + transaction interface will be aimed at enabling that trust, but the interface for
72657 + actually using transactions is fairly narrow.
72658 +
72659 + BEGIN_TRANSCRASH: Returns a transcrash identifier. It should be possible to translate
72660 + this identifier into a string that a shell-script could use, allowing you to start a
72661 + transaction by issuing a command. Once open, the transcrash should be set in the task
72662 + structure, and there should be options (I suppose) to allow it to be carried across
72663 + fork/exec. A transcrash has several options:
72664 +
72665 + - READ_FUSING or WRITE_FUSING: The default policy is for txn-capture to capture only
72666 + on writes (WRITE_FUSING) and allow "dirty reads". If the application wishes to
72667 + capture on reads as well, it should set READ_FUSING.
72668 +
72669 + - TIMEOUT: Since a non-isolated transcrash cannot be undone, every transcrash must
72670 + eventually close (or else the machine must crash). If the application dies an
72671 + unexpected death with an open transcrash, for example, or if it hangs for a long
72672 + duration, one solution (to avoid crashing the machine) is to simply close it anyway.
72673 + This is a dangerous option, but it is one way to solve the problem until isolated
72674 + transcrashes are available for untrusted applications.
72675 +
72676 + It seems to be what databases do, though it is unclear how one avoids a DoS attack
72677 + creating a vulnerability based on resource starvation. Guaranteeing that some
72678 + minimum amount of computational resources are made available would seem more correct
72679 + than guaranteeing some amount of time. When we again have someone to code the work,
72680 + this issue should be considered carefully. -Hans
72681 +
72682 + RESERVE_BLOCKS: A running transcrash should indicate to the transaction manager how
72683 + many dirty blocks it expects. The reserve_blocks interface should be called at a point
72684 + where it is safe for the application to fail, because the system may not be able to
72685 + grant the allocation and the application must be able to back-out. For this reason,
72686 + the number of reserve-blocks can also be passed as an argument to BEGIN_TRANSCRASH, but
72687 + the application may also wish to extend the allocation after beginning its transcrash.
72688 +
72689 + CLOSE_TRANSCRASH: The application closes the transcrash when it is finished making
72690 + modifications that require transaction protection. When isolated transactions are
72691 + supported the CLOSE operation is replaced by either COMMIT or ABORT. For example, if a
72692 + RESERVE_BLOCKS call fails for the application, it should "abort" by calling
72693 + CLOSE_TRANSCRASH, even though it really commits any changes that were made (which is
72694 + why, for safety, the application should call RESERVE_BLOCKS before making any changes).
72695 +
72696 +   For actually implementing these out-of-system-call-scoped transcrashes, the
72697 + reiser4_context has a "txn_handle *trans" pointer that may be set to an open
72698 + transcrash. Currently there are no dynamically-allocated transcrashes, but there is a
72699 + "kmem_cache_t *_txnh_slab" created for that purpose in this file.
72700 +*/
72701 +
72702 +/* Extending the other system call interfaces for future transaction features:
72703 +
72704 + Specialized applications may benefit from passing flags to the ordinary system call
72705 + interface such as read(), write(), or stat(). For example, the application specifies
72706 + WRITE_FUSING by default but wishes to add that a certain read() command should be
72707 + treated as READ_FUSING. But which read? Is it the directory-entry read, the stat-data
72708 +   read, or the file-data read? These issues are straightforward, but there are a lot of
72709 + them and adding the necessary flags-passing code will be tedious.
72710 +
72711 + When supporting isolated transactions, there is a corresponding READ_MODIFY_WRITE (RMW)
72712 + flag, which specifies that although it is a read operation being requested, a
72713 + write-lock should be taken. The reason is that read-locks are shared while write-locks
72714 + are exclusive, so taking a read-lock when a later-write is known in advance will often
72715 +   lead to deadlock. If a reader knows it will write later, it should issue read
72716 + requests with the RMW flag set.
72717 +*/
72718 +
72719 +/*
72720 + The znode/atom deadlock avoidance.
72721 +
72722 + FIXME(Zam): writing of this comment is in progress.
72723 +
72724 +   The atom's special stage ASTAGE_CAPTURE_WAIT introduces a kind of long-term
72725 +   locking of atoms, which makes the reiser4 locking scheme more complex. It
72726 +   had deadlocks until we implemented deadlock avoidance algorithms. Those
72727 +   deadlocks looked like the following: one stopped thread waits for a
72728 +   long-term lock on a znode, while the thread that owns that lock waits until
72729 +   fusion with another atom is allowed.
72730 +
72731 +   The source of the deadlocks is the optimization of not capturing index
72732 +   nodes for read. Let's prove it. Suppose we had a dumb node capturing
72733 +   scheme which unconditionally captured each block before locking it.
72734 +
72735 +   That scheme has no deadlocks. Let's begin with the thread whose atom is in
72736 +   the ASTAGE_CAPTURE_WAIT stage and which waits for a znode lock. The thread
72737 +   can't wait for a capture because its stage allows fusion with any atom
72738 +   except those currently being committed. A process of atom commit can't
72739 +   deadlock because the atom commit procedure does not acquire locks and does
72740 +   not fuse with other atoms. Reiser4 does capturing right before going to
72741 +   sleep inside the longterm_lock_znode() function, which means the znode we
72742 +   want to lock is already captured and its atom is in the ASTAGE_CAPTURE_WAIT
72743 +   stage. If we continue the analysis we see that no process in the sequence
72744 +   may wait for atom fusion. Thereby there are no deadlocks of the described kind.
72745 +
72746 +   The capturing optimization makes the deadlocks possible. A thread can wait
72747 +   for a lock whose owner did not capture that node. The lock owner's current
72748 +   atom is not fused with the first atom and it does not enter the
72749 +   ASTAGE_CAPTURE_WAIT state. A deadlock is possible when that atom meets
72750 +   another one which is already in ASTAGE_CAPTURE_WAIT.
72751 +
72752 + The deadlock avoidance scheme includes two algorithms:
72753 +
72754 +   The first algorithm is used when a thread captures a node which is locked
72755 +   but not captured by another thread. Such nodes are marked MISSED_IN_CAPTURE
72756 +   at the moment we skip their capturing. If such a node is being captured by
72757 +   a thread whose current atom is in ASTAGE_CAPTURE_WAIT, the routine which
72758 +   forces all lock owners to join the current atom is executed.
72759 +
72760 +   The second algorithm does not allow skipping the capture of already captured nodes.
72761 +
72762 +   Together, both algorithms prevent waiting for a long-term lock without
72763 +   fusing with the atoms of all lock owners, which is the key ingredient of
72764 +   the atom/znode locking deadlocks described above.
72765 +*/
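+
+/* The first algorithm in pseudocode (a sketch; the real logic lives in the
+ * capture path, cf. fuse_not_fused_lock_owners() below, and the flag name
+ * is abbreviated here):
+ *
+ *	... while capturing, when a locked but uncaptured node is skipped ...
+ *	mark the node MISSED_IN_CAPTURE;
+ *
+ *	... later, when some thread captures that node ...
+ *	if (the node is marked MISSED_IN_CAPTURE &&
+ *	    the capturing thread's atom is in ASTAGE_CAPTURE_WAIT)
+ *		force all lock owners of the node to fuse with that atom;
+ */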
72766 +
72767 +/*
72768 + * Transactions and mmap(2).
72769 + *
72770 + * 1. Transactions are not supported for accesses through mmap(2), because
72771 + * this would effectively amount to user-level transactions whose duration
72772 + * is beyond control of the kernel.
72773 + *
72774 + * 2. That said, we still want to preserve some decency with regard to
72775 + * mmap(2). During a normal write(2) call, the following sequence of events
72776 + * happens:
72777 + *
72778 + * 1. page is created;
72779 + *
72780 + * 2. jnode is created, dirtied and captured into current atom.
72781 + *
72782 + * 3. extent is inserted and modified.
72783 + *
72784 + * Steps (2) and (3) take place under long term lock on the twig node.
72785 + *
72786 + * When a file is accessed through mmap(2), the page is always created
72787 + * during a page fault. After this (in reiser4_readpage()->readpage_extent()):
72788 + *
72789 + * 1. if access is made to a non-hole page, a new jnode is created (if
72790 + * necessary)
72791 + *
72792 + * 2. if access is made to a hole page, a jnode is not created (XXX
72793 + * not clear why).
72794 + *
72795 + * Also, even if the page is created by a write page fault, it is not
72796 + * marked dirty immediately by handle_mm_fault(). Probably this is to
72797 + * avoid races with page write-out.
72798 + *
72799 + * Dirty bit installed by hardware is only transferred to the struct page
72800 + * later, when page is unmapped (in zap_pte_range(), or
72801 + * try_to_unmap_one()).
72802 + *
72803 + * So, with mmap(2) we have to handle the following irksome situations:
72804 + *
72805 + * 1. there exists a modified page (clean or dirty) without a jnode
72806 + *
72807 + * 2. there exists a modified page (clean or dirty) with a clean jnode
72808 + *
72809 + * 3. a clean page which is part of an atom can be transparently modified
72810 + * at any moment through its mapping without becoming dirty.
72811 + *
72812 + * (1) and (2) can lead to the out-of-memory situation: ->writepage()
72813 + * doesn't know what to do with such pages and ->sync_sb()/->writepages()
72814 + * don't see them, because these methods operate on atoms.
72815 + *
72816 + * (3) can lead to loss of data: suppose we have a dirty page whose dirty
72817 + * jnode is captured by some atom. As part of early flush (for example) the
72818 + * page was written out, and the dirty bit was cleared on both the page and
72819 + * the jnode. After this the page is modified through the mapping, but the
72820 + * kernel doesn't notice and just discards the page and jnode as part of
72821 + * commit. (XXX actually it doesn't, because to reclaim the page
72822 + * ->releasepage() has to be called, and before that the dirty bit will be
72823 + * transferred to the struct page).
72824 + *
72825 + */
72826 +
72827 +#include "debug.h"
72828 +#include "txnmgr.h"
72829 +#include "jnode.h"
72830 +#include "znode.h"
72831 +#include "block_alloc.h"
72832 +#include "tree.h"
72833 +#include "wander.h"
72834 +#include "ktxnmgrd.h"
72835 +#include "super.h"
72836 +#include "page_cache.h"
72837 +#include "reiser4.h"
72838 +#include "vfs_ops.h"
72839 +#include "inode.h"
72840 +#include "flush.h"
72841 +
72842 +#include <asm/atomic.h>
72843 +#include <linux/types.h>
72844 +#include <linux/fs.h>
72845 +#include <linux/mm.h>
72846 +#include <linux/slab.h>
72847 +#include <linux/pagemap.h>
72848 +#include <linux/writeback.h>
72849 +#include <linux/swap.h> /* for totalram_pages */
72850 +
72851 +static void atom_free(txn_atom * atom);
72852 +
72853 +static int commit_txnh(txn_handle * txnh);
72854 +
72855 +static void wakeup_atom_waitfor_list(txn_atom * atom);
72856 +static void wakeup_atom_waiting_list(txn_atom * atom);
72857 +
72858 +static void capture_assign_txnh_nolock(txn_atom * atom, txn_handle * txnh);
72859 +
72860 +static void capture_assign_block_nolock(txn_atom * atom, jnode * node);
72861 +
72862 +static void fuse_not_fused_lock_owners(txn_handle * txnh, znode * node);
72863 +
72864 +static int capture_init_fusion(jnode * node, txn_handle * txnh,
72865 + txn_capture mode);
72866 +
72867 +static int capture_fuse_wait(txn_handle *, txn_atom *, txn_atom *, txn_capture);
72868 +
72869 +static void capture_fuse_into(txn_atom * small, txn_atom * large);
72870 +
72871 +void invalidate_list(struct list_head *);
72872 +
72873 +/* GENERIC STRUCTURES */
72874 +
72875 +typedef struct _txn_wait_links txn_wait_links;
72876 +
72877 +struct _txn_wait_links {
72878 + lock_stack *_lock_stack;
72879 + struct list_head _fwaitfor_link;
72880 + struct list_head _fwaiting_link;
72881 + int (*waitfor_cb) (txn_atom * atom, struct _txn_wait_links * wlinks);
72882 + int (*waiting_cb) (txn_atom * atom, struct _txn_wait_links * wlinks);
72883 +};
72884 +
72885 +/* FIXME: In theory, we should be using the slab cache init & destructor
72886 + methods instead of, e.g., jnode_init, etc. */
72887 +static kmem_cache_t *_atom_slab = NULL;
72888 +/* this is for user-visible, cross system-call transactions. */
72889 +static kmem_cache_t *_txnh_slab = NULL;
72890 +
72891 +/**
72892 + * init_txnmgr_static - create transaction manager slab caches
72893 + *
72894 + * Initializes caches of txn-atoms and txn_handle. It is part of reiser4 module
72895 + * initialization.
72896 + */
72897 +int init_txnmgr_static(void)
72898 +{
72899 + assert("jmacd-600", _atom_slab == NULL);
72900 + assert("jmacd-601", _txnh_slab == NULL);
72901 +
72902 + ON_DEBUG(atomic_set(&flush_cnt, 0));
72903 +
72904 + _atom_slab = kmem_cache_create("txn_atom", sizeof(txn_atom), 0,
72905 + SLAB_HWCACHE_ALIGN |
72906 + SLAB_RECLAIM_ACCOUNT, NULL, NULL);
72907 + if (_atom_slab == NULL)
72908 + return RETERR(-ENOMEM);
72909 +
72910 + _txnh_slab = kmem_cache_create("txn_handle", sizeof(txn_handle), 0,
72911 + SLAB_HWCACHE_ALIGN, NULL, NULL);
72912 + if (_txnh_slab == NULL) {
72913 + kmem_cache_destroy(_atom_slab);
72914 + _atom_slab = NULL;
72915 + return RETERR(-ENOMEM);
72916 + }
72917 +
72918 + return 0;
72919 +}
72920 +
72921 +/**
72922 + * done_txnmgr_static - delete txn_atom and txn_handle caches
72923 + *
72924 + * This is called on reiser4 module unloading or system shutdown.
72925 + */
72926 +void done_txnmgr_static(void)
72927 +{
72928 + destroy_reiser4_cache(&_atom_slab);
72929 + destroy_reiser4_cache(&_txnh_slab);
72930 +}
72931 +
72932 +/**
72933 + * init_txnmgr - initialize a new transaction manager
72934 + * @mgr: pointer to transaction manager embedded in reiser4 super block
72935 + *
72936 + * This is called on mount. Makes necessary initializations.
72937 + */
72938 +void init_txnmgr(txn_mgr *mgr)
72939 +{
72940 + assert("umka-169", mgr != NULL);
72941 +
72942 + mgr->atom_count = 0;
72943 + mgr->id_count = 1;
72944 + INIT_LIST_HEAD(&mgr->atoms_list);
72945 + spin_lock_init(&mgr->tmgr_lock);
72946 + sema_init(&mgr->commit_semaphore, 1);
72947 +}
72948 +
72949 +/**
72950 + * done_txnmgr - stop transaction manager
72951 + * @mgr: pointer to transaction manager embedded in reiser4 super block
72952 + *
72953 + * This is called on umount. Does sanity checks.
72954 + */
72955 +void done_txnmgr(txn_mgr *mgr)
72956 +{
72957 + assert("umka-170", mgr != NULL);
72958 + assert("umka-1701", list_empty_careful(&mgr->atoms_list));
72959 + assert("umka-1702", mgr->atom_count == 0);
72960 +}
72961 +
72962 +/* Initialize a transaction handle. */
72963 +/* Audited by: umka (2002.06.13) */
72964 +static void txnh_init(txn_handle * txnh, txn_mode mode)
72965 +{
72966 + assert("umka-171", txnh != NULL);
72967 +
72968 + txnh->mode = mode;
72969 + txnh->atom = NULL;
72970 + set_gfp_mask();
72971 + txnh->flags = 0;
72972 + spin_lock_init(&txnh->hlock);
72973 + INIT_LIST_HEAD(&txnh->txnh_link);
72974 +}
72975 +
72976 +#if REISER4_DEBUG
72977 +/* Check if a transaction handle is clean. */
72978 +static int txnh_isclean(txn_handle * txnh)
72979 +{
72980 + assert("umka-172", txnh != NULL);
72981 + return txnh->atom == NULL &&
72982 + LOCK_CNT_NIL(spin_locked_txnh);
72983 +}
72984 +#endif
72985 +
72986 +/* Initialize an atom. */
72987 +static void atom_init(txn_atom * atom)
72988 +{
72989 + int level;
72990 +
72991 + assert("umka-173", atom != NULL);
72992 +
72993 + memset(atom, 0, sizeof(txn_atom));
72994 +
72995 + atom->stage = ASTAGE_FREE;
72996 + atom->start_time = jiffies;
72997 +
72998 + for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1)
72999 + INIT_LIST_HEAD(ATOM_DIRTY_LIST(atom, level));
73000 +
73001 + INIT_LIST_HEAD(ATOM_CLEAN_LIST(atom));
73002 + INIT_LIST_HEAD(ATOM_OVRWR_LIST(atom));
73003 + INIT_LIST_HEAD(ATOM_WB_LIST(atom));
73004 + INIT_LIST_HEAD(&atom->inodes);
73005 + spin_lock_init(&atom->alock);
73006 + /* list of transaction handles */
73007 + INIT_LIST_HEAD(&atom->txnh_list);
73008 + /* link to transaction manager's list of atoms */
73009 + INIT_LIST_HEAD(&atom->atom_link);
73010 + INIT_LIST_HEAD(&atom->fwaitfor_list);
73011 + INIT_LIST_HEAD(&atom->fwaiting_list);
73012 + blocknr_set_init(&atom->delete_set);
73013 + blocknr_set_init(&atom->wandered_map);
73014 +
73015 + init_atom_fq_parts(atom);
73016 +}
73017 +
73018 +#if REISER4_DEBUG
73019 +/* Check if an atom is clean. */
73020 +static int atom_isclean(txn_atom * atom)
73021 +{
73022 + int level;
73023 +
73024 + assert("umka-174", atom != NULL);
73025 +
73026 + for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1) {
73027 + if (!list_empty_careful(ATOM_DIRTY_LIST(atom, level))) {
73028 + return 0;
73029 + }
73030 + }
73031 +
73032 + return atom->stage == ASTAGE_FREE &&
73033 + atom->txnh_count == 0 &&
73034 + atom->capture_count == 0 &&
73035 + atomic_read(&atom->refcount) == 0 &&
73036 + (&atom->atom_link == atom->atom_link.next &&
73037 + &atom->atom_link == atom->atom_link.prev) &&
73038 + list_empty_careful(&atom->txnh_list) &&
73039 + list_empty_careful(ATOM_CLEAN_LIST(atom)) &&
73040 + list_empty_careful(ATOM_OVRWR_LIST(atom)) &&
73041 + list_empty_careful(ATOM_WB_LIST(atom)) &&
73042 + list_empty_careful(&atom->fwaitfor_list) &&
73043 + list_empty_careful(&atom->fwaiting_list) &&
73044 + atom_fq_parts_are_clean(atom);
73045 +}
73046 +#endif
73047 +
73048 +/* Begin a transaction in this context. Currently this uses the reiser4_context's
73049 + trans_in_ctx, which means that transaction handles are stack-allocated. Eventually
73050 + this will be extended to allow transaction handles to span several contexts. */
73051 +/* Audited by: umka (2002.06.13) */
73052 +void txn_begin(reiser4_context * context)
73053 +{
73054 + assert("jmacd-544", context->trans == NULL);
73055 +
73056 + context->trans = &context->trans_in_ctx;
73057 +
73058 + /* FIXME_LATER_JMACD Currently there's no way to begin a TXN_READ_FUSING
73059 + transcrash. Default should be TXN_WRITE_FUSING. Also, the _trans variable is
73060 + stack allocated right now, but we would like to allow for dynamically allocated
73061 + transcrashes that span multiple system calls.
73062 + */
73063 + txnh_init(context->trans, TXN_WRITE_FUSING);
73064 +}
73065 +
73066 +/* Finish a transaction handle context. */
73067 +int txn_end(reiser4_context * context)
73068 +{
73069 + long ret = 0;
73070 + txn_handle *txnh;
73071 +
73072 + assert("umka-283", context != NULL);
73073 + assert("nikita-3012", schedulable());
73074 + assert("vs-24", context == get_current_context());
73075 + assert("nikita-2967", lock_stack_isclean(get_current_lock_stack()));
73076 +
73077 + txnh = context->trans;
73078 + if (txnh != NULL) {
73079 + if (txnh->atom != NULL)
73080 + ret = commit_txnh(txnh);
73081 + assert("jmacd-633", txnh_isclean(txnh));
73082 + context->trans = NULL;
73083 + }
73084 + return ret;
73085 +}
73086 +
73087 +void txn_restart(reiser4_context * context)
73088 +{
73089 + txn_end(context);
73090 + preempt_point();
73091 + txn_begin(context);
73092 +}
73093 +
73094 +void txn_restart_current(void)
73095 +{
73096 + txn_restart(get_current_context());
73097 +}
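+
+/* The resulting per-system-call life cycle of a handle (sketch; in the real
+ * code init_context() and reiser4_exit_context() issue these calls, see the
+ * header comment of this file):
+ *
+ *	txn_begin(ctx);		... opens ctx->trans_in_ctx ...
+ *	... capture and modify blocks ...
+ *	ret = txn_end(ctx);	... commits the atom if required ...
+ */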
73098 +
73099 +/* TXN_ATOM */
73100 +
73101 +/* Get the atom belonging to a txnh, which is not locked. Returns with the txnh locked,
73102 +   and with the atom locked if the atom is not NULL. This performs the necessary
73103 +   spin_trylock to break the lock-ordering cycle. May return NULL. */
73104 +static txn_atom *txnh_get_atom(txn_handle * txnh)
73105 +{
73106 + txn_atom *atom;
73107 +
73108 + assert("umka-180", txnh != NULL);
73109 + assert_spin_not_locked(&(txnh->hlock));
73110 +
73111 + while (1) {
73112 + spin_lock_txnh(txnh);
73113 + atom = txnh->atom;
73114 +
73115 + if (atom == NULL)
73116 + break;
73117 +
73118 + if (spin_trylock_atom(atom))
73119 + break;
73120 +
73121 + atomic_inc(&atom->refcount);
73122 +
73123 + spin_unlock_txnh(txnh);
73124 + spin_lock_atom(atom);
73125 + spin_lock_txnh(txnh);
73126 +
73127 + if (txnh->atom == atom) {
73128 + atomic_dec(&atom->refcount);
73129 + break;
73130 + }
73131 +
73132 + spin_unlock_txnh(txnh);
73133 + atom_dec_and_unlock(atom);
73134 + }
73135 +
73136 + return atom;
73137 +}
73138 +
73139 +/* Get the current atom and spinlock it if the current atom is present. May return NULL */
73140 +txn_atom *get_current_atom_locked_nocheck(void)
73141 +{
73142 + reiser4_context *cx;
73143 + txn_atom *atom;
73144 + txn_handle *txnh;
73145 +
73146 + cx = get_current_context();
73147 + assert("zam-437", cx != NULL);
73148 +
73149 + txnh = cx->trans;
73150 + assert("zam-435", txnh != NULL);
73151 +
73152 + atom = txnh_get_atom(txnh);
73153 +
73154 + spin_unlock_txnh(txnh);
73155 + return atom;
73156 +}
73157 +
73158 +/* Get the atom belonging to a jnode, which is initially locked. Return with
73159 + both jnode and atom locked. This performs the necessary spin_trylock to
73160 + break the lock-ordering cycle. Assumes the jnode is already locked, and
73161 + returns NULL if atom is not set. */
73162 +txn_atom *jnode_get_atom(jnode * node)
73163 +{
73164 + txn_atom *atom;
73165 +
73166 + assert("umka-181", node != NULL);
73167 +
73168 + while (1) {
73169 + assert_spin_locked(&(node->guard));
73170 +
73171 + atom = node->atom;
73172 + /* node is not in any atom */
73173 + if (atom == NULL)
73174 + break;
73175 +
73176 + /* If atom is not locked, grab the lock and return */
73177 + if (spin_trylock_atom(atom))
73178 + break;
73179 +
73180 +		/* At least one jnode belongs to this atom; this guarantees
73181 +		 * that atom->refcount > 0, so we can safely increment it. */
73182 + atomic_inc(&atom->refcount);
73183 + spin_unlock_jnode(node);
73184 +
73185 + /* re-acquire spin locks in the right order */
73186 + spin_lock_atom(atom);
73187 + spin_lock_jnode(node);
73188 +
73189 + /* check if node still points to the same atom. */
73190 + if (node->atom == atom) {
73191 + atomic_dec(&atom->refcount);
73192 + break;
73193 + }
73194 +
73195 + /* releasing of atom lock and reference requires not holding
73196 + * locks on jnodes. */
73197 + spin_unlock_jnode(node);
73198 +
73199 +		/* We are not sure that this atom has extra references other
73200 +		 * than ours, so we call the proper function, which may free
73201 +		 * the atom if the last reference is released. */
73202 + atom_dec_and_unlock(atom);
73203 +
73204 + /* lock jnode again for getting valid node->atom pointer
73205 + * value. */
73206 + spin_lock_jnode(node);
73207 + }
73208 +
73209 + return atom;
73210 +}
73211 +
73212 +/* Returns true if @node is dirty and part of the same atom as one of its neighbors. Used
73213 + by flush code to indicate whether the next node (in some direction) is suitable for
73214 + flushing. */
73215 +int
73216 +same_slum_check(jnode * node, jnode * check, int alloc_check, int alloc_value)
73217 +{
73218 + int compat;
73219 + txn_atom *atom;
73220 +
73221 + assert("umka-182", node != NULL);
73222 + assert("umka-183", check != NULL);
73223 +
73224 + /* Not sure what this function is supposed to do if supplied with @check that is
73225 + neither formatted nor unformatted (bitmap or so). */
73226 + assert("nikita-2373", jnode_is_znode(check)
73227 + || jnode_is_unformatted(check));
73228 +
73229 + /* Need a lock on CHECK to get its atom and to check various state bits.
73230 + Don't need a lock on NODE once we get the atom lock. */
73231 +	/* It is not enough to lock two nodes and check (node->atom ==
73232 +	   check->atom) because the atom could be locked and being fused at
73233 +	   that moment; jnodes of an atom in that state (being fused) can point to
73234 + different objects, but the atom is the same. */
73235 + spin_lock_jnode(check);
73236 +
73237 + atom = jnode_get_atom(check);
73238 +
73239 + if (atom == NULL) {
73240 + compat = 0;
73241 + } else {
73242 + compat = (node->atom == atom && JF_ISSET(check, JNODE_DIRTY));
73243 +
73244 + if (compat && jnode_is_znode(check)) {
73245 + compat &= znode_is_connected(JZNODE(check));
73246 + }
73247 +
73248 + if (compat && alloc_check) {
73249 + compat &= (alloc_value == jnode_is_flushprepped(check));
73250 + }
73251 +
73252 + spin_unlock_atom(atom);
73253 + }
73254 +
73255 + spin_unlock_jnode(check);
73256 +
73257 + return compat;
73258 +}
73259 +
73260 +/* Decrement the atom's reference count and if it falls to zero, free it. */
73261 +void atom_dec_and_unlock(txn_atom * atom)
73262 +{
73263 + txn_mgr *mgr = &get_super_private(reiser4_get_current_sb())->tmgr;
73264 +
73265 + assert("umka-186", atom != NULL);
73266 + assert_spin_locked(&(atom->alock));
73267 + assert("zam-1039", atomic_read(&atom->refcount) > 0);
73268 +
73269 + if (atomic_dec_and_test(&atom->refcount)) {
73270 + /* take txnmgr lock and atom lock in proper order. */
73271 + if (!spin_trylock_txnmgr(mgr)) {
73272 + /* This atom should exist after we re-acquire its
73273 + * spinlock, so we increment its reference counter. */
73274 + atomic_inc(&atom->refcount);
73275 + spin_unlock_atom(atom);
73276 + spin_lock_txnmgr(mgr);
73277 + spin_lock_atom(atom);
73278 +
73279 + if (!atomic_dec_and_test(&atom->refcount)) {
73280 + spin_unlock_atom(atom);
73281 + spin_unlock_txnmgr(mgr);
73282 + return;
73283 + }
73284 + }
73285 + assert_spin_locked(&(mgr->tmgr_lock));
73286 + atom_free(atom);
73287 + spin_unlock_txnmgr(mgr);
73288 + } else
73289 + spin_unlock_atom(atom);
73290 +}
73291 +
73292 +/* Create new atom and connect it to given transaction handle. This adds the
73293 + atom to the transaction manager's list and sets its reference count to 1, an
73294 + artificial reference which is kept until it commits. We play strange games
73295 + to avoid allocation under jnode & txnh spinlocks.*/
73296 +
73297 +static int atom_begin_and_assign_to_txnh(txn_atom ** atom_alloc, txn_handle * txnh)
73298 +{
73299 + txn_atom *atom;
73300 + txn_mgr *mgr;
73301 +
73302 + if (REISER4_DEBUG && rofs_tree(current_tree)) {
73303 + warning("nikita-3366", "Creating atom on rofs");
73304 + dump_stack();
73305 + }
73306 +
73307 + if (*atom_alloc == NULL) {
73308 + (*atom_alloc) = kmem_cache_alloc(_atom_slab, get_gfp_mask());
73309 +
73310 + if (*atom_alloc == NULL)
73311 + return RETERR(-ENOMEM);
73312 + }
73313 +
73314 + /* and, also, txnmgr spin lock should be taken before jnode and txnh
73315 + locks. */
73316 + mgr = &get_super_private(reiser4_get_current_sb())->tmgr;
73317 + spin_lock_txnmgr(mgr);
73318 + spin_lock_txnh(txnh);
73319 +
73320 + /* Check whether new atom still needed */
73321 + if (txnh->atom != NULL) {
73322 + /* NOTE-NIKITA probably it is rather better to free
73323 + * atom_alloc here than thread it up to try_capture(). */
73324 +
73325 + spin_unlock_txnh(txnh);
73326 + spin_unlock_txnmgr(mgr);
73327 +
73328 + return -E_REPEAT;
73329 + }
73330 +
73331 + atom = *atom_alloc;
73332 + *atom_alloc = NULL;
73333 +
73334 + atom_init(atom);
73335 +
73336 + assert("jmacd-17", atom_isclean(atom));
73337 +
73338 + /*
73339 + * do not use spin_lock_atom because we have broken lock ordering here
73340 + * which is ok, as long as @atom is new and inaccessible for others.
73341 + */
73342 + spin_lock(&(atom->alock));
73343 +
73344 + /* add atom to the end of transaction manager's list of atoms */
73345 + list_add_tail(&atom->atom_link, &mgr->atoms_list);
73346 + atom->atom_id = mgr->id_count++;
73347 + mgr->atom_count += 1;
73348 +
73349 + /* Release txnmgr lock */
73350 + spin_unlock_txnmgr(mgr);
73351 +
73352 + /* One reference until it commits. */
73353 + atomic_inc(&atom->refcount);
73354 + atom->stage = ASTAGE_CAPTURE_FUSE;
73355 + atom->super = reiser4_get_current_sb();
73356 + capture_assign_txnh_nolock(atom, txnh);
73357 +
73358 + spin_unlock(&(atom->alock));
73359 + spin_unlock_txnh(txnh);
73360 +
73361 + return -E_REPEAT;
73362 +}
73363 +
73364 +/* Return true if an atom is currently "open". */
73365 +static int atom_isopen(const txn_atom * atom)
73366 +{
73367 + assert("umka-185", atom != NULL);
73368 +
73369 + return atom->stage > 0 && atom->stage < ASTAGE_PRE_COMMIT;
73370 +}
73371 +
73372 +/* Return the number of pointers to this atom that must be updated during fusion. This
73373 + approximates the amount of work to be done. Fusion chooses the atom with fewer
73374 + pointers to fuse into the atom with more pointers. */
73375 +static int atom_pointer_count(const txn_atom * atom)
73376 +{
73377 + assert("umka-187", atom != NULL);
73378 +
73379 + /* This is a measure of the amount of work needed to fuse this atom
73380 + * into another. */
73381 + return atom->txnh_count + atom->capture_count;
73382 +}
73383 +
73384 +/* Called holding the atom lock, this removes the atom from the transaction manager list
73385 + and frees it. */
73386 +static void atom_free(txn_atom * atom)
73387 +{
73388 + txn_mgr *mgr = &get_super_private(reiser4_get_current_sb())->tmgr;
73389 +
73390 + assert("umka-188", atom != NULL);
73391 + assert_spin_locked(&(atom->alock));
73392 +
73393 + /* Remove from the txn_mgr's atom list */
73394 + assert_spin_locked(&(mgr->tmgr_lock));
73395 + mgr->atom_count -= 1;
73396 + list_del_init(&atom->atom_link);
73397 +
73398 + /* Clean the atom */
73399 + assert("jmacd-16",
73400 + (atom->stage == ASTAGE_INVALID || atom->stage == ASTAGE_DONE));
73401 + atom->stage = ASTAGE_FREE;
73402 +
73403 + blocknr_set_destroy(&atom->delete_set);
73404 + blocknr_set_destroy(&atom->wandered_map);
73405 +
73406 + assert("jmacd-16", atom_isclean(atom));
73407 +
73408 + spin_unlock_atom(atom);
73409 +
73410 + kmem_cache_free(_atom_slab, atom);
73411 +}
73412 +
73413 +static int atom_is_dotard(const txn_atom * atom)
73414 +{
73415 + return time_after(jiffies, atom->start_time +
73416 + get_current_super_private()->tmgr.atom_max_age);
73417 +}
73418 +
73419 +static int atom_can_be_committed(txn_atom * atom)
73420 +{
73421 + assert_spin_locked(&(atom->alock));
73422 + assert("zam-885", atom->txnh_count > atom->nr_waiters);
73423 + return atom->txnh_count == atom->nr_waiters + 1;
73424 +}
73425 +
73426 +/* Return true if an atom should commit now. This is determined by aging, atom
73427 + size or atom flags. */
73428 +static int atom_should_commit(const txn_atom * atom)
73429 +{
73430 + assert("umka-189", atom != NULL);
73431 + return
73432 + (atom->flags & ATOM_FORCE_COMMIT) ||
73433 + ((unsigned)atom_pointer_count(atom) >
73434 + get_current_super_private()->tmgr.atom_max_size)
73435 + || atom_is_dotard(atom);
73436 +}
73437 +
73438 +/* return 1 if current atom exists and requires commit. */
73439 +int current_atom_should_commit(void)
73440 +{
73441 + txn_atom *atom;
73442 + int result = 0;
73443 +
73444 + atom = get_current_atom_locked_nocheck();
73445 + if (atom) {
73446 + result = atom_should_commit(atom);
73447 + spin_unlock_atom(atom);
73448 + }
73449 + return result;
73450 +}
73451 +
73452 +static int atom_should_commit_asap(const txn_atom * atom)
73453 +{
73454 + unsigned int captured;
73455 + unsigned int pinnedpages;
73456 +
73457 + assert("nikita-3309", atom != NULL);
73458 +
73459 + captured = (unsigned)atom->capture_count;
73460 + pinnedpages = (captured >> PAGE_CACHE_SHIFT) * sizeof(znode);
73461 +
73462 + return (pinnedpages > (totalram_pages >> 3)) || (atom->flushed > 100);
73463 +}
73464 +
73465 +static jnode *find_first_dirty_in_list(struct list_head *head, int flags)
73466 +{
73467 + jnode *first_dirty;
73468 +
73469 + list_for_each_entry(first_dirty, head, capture_link) {
73470 + if (!(flags & JNODE_FLUSH_COMMIT)) {
73471 + /*
73472 +			 * skip jnodes which "heard banshee" or have active
73473 + * I/O
73474 + */
73475 + if (JF_ISSET(first_dirty, JNODE_HEARD_BANSHEE) ||
73476 + JF_ISSET(first_dirty, JNODE_WRITEBACK))
73477 + continue;
73478 + }
73479 + return first_dirty;
73480 + }
73481 + return NULL;
73482 +}
73483 +
73484 +/* Get the first dirty node from the atom's dirty_nodes[n] lists; return NULL
73485 +   if the atom has no dirty nodes on its lists */
73486 +jnode *find_first_dirty_jnode(txn_atom * atom, int flags)
73487 +{
73488 + jnode *first_dirty;
73489 + tree_level level;
73490 +
73491 + assert_spin_locked(&(atom->alock));
73492 +
73493 + /* The flush starts from LEAF_LEVEL (=1). */
73494 + for (level = 1; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1) {
73495 + if (list_empty_careful(ATOM_DIRTY_LIST(atom, level)))
73496 + continue;
73497 +
73498 + first_dirty =
73499 + find_first_dirty_in_list(ATOM_DIRTY_LIST(atom, level),
73500 + flags);
73501 + if (first_dirty)
73502 + return first_dirty;
73503 + }
73504 +
73505 + /* znode-above-root is on the list #0. */
73506 + return find_first_dirty_in_list(ATOM_DIRTY_LIST(atom, 0), flags);
73507 +}
73508 +
73509 +static void dispatch_wb_list(txn_atom * atom, flush_queue_t * fq)
73510 +{
73511 + jnode *cur;
73512 +
73513 + assert("zam-905", atom_is_protected(atom));
73514 +
73515 + cur = list_entry(ATOM_WB_LIST(atom)->next, jnode, capture_link);
73516 + while (ATOM_WB_LIST(atom) != &cur->capture_link) {
73517 + jnode *next = list_entry(cur->capture_link.next, jnode, capture_link);
73518 +
73519 + spin_lock_jnode(cur);
73520 + if (!JF_ISSET(cur, JNODE_WRITEBACK)) {
73521 + if (JF_ISSET(cur, JNODE_DIRTY)) {
73522 + queue_jnode(fq, cur);
73523 + } else {
73524 + /* move jnode to atom's clean list */
73525 + list_move_tail(&cur->capture_link,
73526 + ATOM_CLEAN_LIST(atom));
73527 + }
73528 + }
73529 + spin_unlock_jnode(cur);
73530 +
73531 + cur = next;
73532 + }
73533 +}
73534 +
73535 +/* Scan current atom->writeback_nodes list, re-submit dirty and !writeback
73536 + * jnodes to disk. */
73537 +static int submit_wb_list(void)
73538 +{
73539 + int ret;
73540 + flush_queue_t *fq;
73541 +
73542 + fq = get_fq_for_current_atom();
73543 + if (IS_ERR(fq))
73544 + return PTR_ERR(fq);
73545 +
73546 + dispatch_wb_list(fq->atom, fq);
73547 + spin_unlock_atom(fq->atom);
73548 +
73549 + ret = write_fq(fq, NULL, 1);
73550 + fq_put(fq);
73551 +
73552 + return ret;
73553 +}
73554 +
73555 +/* Wait for completion of all writes; re-submit the atom's writeback list if needed. */
73556 +static int current_atom_complete_writes(void)
73557 +{
73558 + int ret;
73559 +
73560 +	/* Each jnode on that list was modified and dirtied while it already
73561 +	 * had an i/o request running. After i/o completion we have to
73562 +	 * resubmit them to disk. */
73563 + ret = submit_wb_list();
73564 + if (ret < 0)
73565 + return ret;
73566 +
73567 +	/* Wait for all i/o to complete */
73568 + ret = current_atom_finish_all_fq();
73569 + if (ret)
73570 + return ret;
73571 +
73572 +	/* Scan the wb list again; all i/o should be completed by now, and we
73573 +	 * re-submit dirty nodes to disk */
73574 + ret = submit_wb_list();
73575 + if (ret < 0)
73576 + return ret;
73577 +
73578 +	/* Wait for all nodes we just submitted */
73579 + return current_atom_finish_all_fq();
73580 +}
73581 +
73582 +#define TOOMANYFLUSHES (1 << 13)
73583 +
73584 +/* Called with the atom locked and no open "active" transaction handles except
73585 + ours, this function calls flush_current_atom() until all dirty nodes are
73586 + processed. Then it initiates commit processing.
73587 +
73588 + Called by the single remaining open "active" txnh, which is closing. Other
73589 +   open txnhs belong to processes which wait for atom commit in the commit_txnh()
73590 + routine. They are counted as "waiters" in atom->nr_waiters. Therefore as
73591 + long as we hold the atom lock none of the jnodes can be captured and/or
73592 + locked.
73593 +
73594 + Return value is an error code if commit fails.
73595 +*/
73596 +static int commit_current_atom(long *nr_submitted, txn_atom ** atom)
73597 +{
73598 + reiser4_super_info_data *sbinfo = get_current_super_private();
73599 + long ret = 0;
73600 + /* how many times jnode_flush() was called as a part of attempt to
73601 + * commit this atom. */
73602 + int flushiters;
73603 +
73604 + assert("zam-888", atom != NULL && *atom != NULL);
73605 + assert_spin_locked(&((*atom)->alock));
73606 + assert("zam-887", get_current_context()->trans->atom == *atom);
73607 + assert("jmacd-151", atom_isopen(*atom));
73608 +
73609 + /* lock ordering: delete_sema and commit_sema are unordered */
73610 + assert("nikita-3184",
73611 + get_current_super_private()->delete_sema_owner != current);
73612 +
73613 + for (flushiters = 0;; ++flushiters) {
73614 + ret =
73615 + flush_current_atom(JNODE_FLUSH_WRITE_BLOCKS |
73616 + JNODE_FLUSH_COMMIT,
73617 + LONG_MAX /* nr_to_write */ ,
73618 + nr_submitted, atom, NULL);
73619 + if (ret != -E_REPEAT)
73620 + break;
73621 +
73622 +		/* if the atom's dirty list contains one znode which is
73623 +		   HEARD_BANSHEE and is locked, we have to allow the lock owner
73624 +		   to continue and uncapture that znode */
73625 + preempt_point();
73626 +
73627 + *atom = get_current_atom_locked();
73628 + if (flushiters > TOOMANYFLUSHES && IS_POW(flushiters)) {
73629 + warning("nikita-3176",
73630 + "Flushing like mad: %i", flushiters);
73631 + info_atom("atom", *atom);
73632 + DEBUGON(flushiters > (1 << 20));
73633 + }
73634 + }
73635 +
73636 + if (ret)
73637 + return ret;
73638 +
73639 + assert_spin_locked(&((*atom)->alock));
73640 +
73641 + if (!atom_can_be_committed(*atom)) {
73642 + spin_unlock_atom(*atom);
73643 + return RETERR(-E_REPEAT);
73644 + }
73645 +
73646 + if ((*atom)->capture_count == 0)
73647 + goto done;
73648 +
73649 +	/* Up to this point we have been flushing, and after flush is called we
73650 +	   return -E_REPEAT. Now we can commit. We cannot return -E_REPEAT
73651 +	   at this point; the commit should be successful. */
73652 + atom_set_stage(*atom, ASTAGE_PRE_COMMIT);
73653 + ON_DEBUG(((*atom)->committer = current));
73654 + spin_unlock_atom(*atom);
73655 +
73656 + ret = current_atom_complete_writes();
73657 + if (ret)
73658 + return ret;
73659 +
73660 + assert("zam-906", list_empty(ATOM_WB_LIST(*atom)));
73661 +
73662 + /* isolate critical code path which should be executed by only one
73663 + * thread using tmgr semaphore */
73664 + down(&sbinfo->tmgr.commit_semaphore);
73665 +
73666 + ret = reiser4_write_logs(nr_submitted);
73667 + if (ret < 0)
73668 + reiser4_panic("zam-597", "write log failed (%ld)\n", ret);
73669 +
73670 +	/* The atom->ovrwr_nodes list is processed under the commit semaphore
73671 +	   because of bitmap nodes, which are captured in a special way in
73672 +	   bitmap_pre_commit_hook(); that way does not include
73673 +	   capture_fuse_wait() as the capturing of other nodes does -- the commit
73674 +	   semaphore is used for transaction isolation instead. */
73675 + invalidate_list(ATOM_OVRWR_LIST(*atom));
73676 + up(&sbinfo->tmgr.commit_semaphore);
73677 +
73678 + invalidate_list(ATOM_CLEAN_LIST(*atom));
73679 + invalidate_list(ATOM_WB_LIST(*atom));
73680 + assert("zam-927", list_empty(&(*atom)->inodes));
73681 +
73682 + spin_lock_atom(*atom);
73683 + done:
73684 + atom_set_stage(*atom, ASTAGE_DONE);
73685 + ON_DEBUG((*atom)->committer = NULL);
73686 +
73687 + /* Atom's state changes, so wake up everybody waiting for this
73688 + event. */
73689 + wakeup_atom_waiting_list(*atom);
73690 +
73691 +	/* Decrement the "until commit" reference; at least one txnh (the
73692 +	   caller) is still open. */
73693 + atomic_dec(&(*atom)->refcount);
73694 +
73695 + assert("jmacd-1070", atomic_read(&(*atom)->refcount) > 0);
73696 + assert("jmacd-1062", (*atom)->capture_count == 0);
73697 + BUG_ON((*atom)->capture_count != 0);
73698 + assert_spin_locked(&((*atom)->alock));
73699 +
73700 + return ret;
73701 +}
73702 +
73703 +/* TXN_TXNH */
73704 +
73705 +/**
73706 + * force_commit_atom - commit current atom and wait commit completion
73707 + * @txnh:
73708 + *
73709 + * Commits the current atom and waits for commit completion; the current atom
73710 + * and @txnh have to be spinlocked before the call. This function unlocks them on exit.
73711 + */
73712 +int force_commit_atom(txn_handle *txnh)
73713 +{
73714 + txn_atom *atom;
73715 +
73716 + assert("zam-837", txnh != NULL);
73717 + assert_spin_locked(&(txnh->hlock));
73718 + assert("nikita-2966", lock_stack_isclean(get_current_lock_stack()));
73719 +
73720 + atom = txnh->atom;
73721 +
73722 + assert("zam-834", atom != NULL);
73723 + assert_spin_locked(&(atom->alock));
73724 +
73725 + /*
73726 + * Set flags for atom and txnh: forcing atom commit and waiting for
73727 + * commit completion
73728 + */
73729 + txnh->flags |= TXNH_WAIT_COMMIT;
73730 + atom->flags |= ATOM_FORCE_COMMIT;
73731 +
73732 + spin_unlock_txnh(txnh);
73733 + spin_unlock_atom(atom);
73734 +
73735 + /* commit is here */
73736 + txn_restart_current();
73737 + return 0;
73738 +}
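+/* Editor's sketch (not part of the original patch): how a caller is expected
+ * to drive force_commit_atom(). This mirrors sync_atom() later in this file;
+ * only the name example_force_commit is hypothetical, and the atom is assumed
+ * to still be in a stage below ASTAGE_PRE_COMMIT, as sync_atom() checks. */
+#if 0	/* illustration only */
+static int example_force_commit(txn_atom *atom)
+{
+	txn_handle *txnh = get_current_context()->trans;
+
+	spin_lock_atom(atom);
+	spin_lock_txnh(txnh);
+	/* attach the handle to the atom we want committed */
+	capture_assign_txnh_nolock(atom, txnh);
+	/* force_commit_atom() drops both spinlocks itself */
+	return force_commit_atom(txnh);
+}
+#endif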
73739 +
73740 +/* Called to force commit of any outstanding atoms. @commit_all_atoms controls
73741 + * whether we also commit new atoms which are created after this function is
73742 + * called. */
73743 +int txnmgr_force_commit_all(struct super_block *super, int commit_all_atoms)
73744 +{
73745 + int ret;
73746 + txn_atom *atom;
73747 + txn_mgr *mgr;
73748 + txn_handle *txnh;
73749 + unsigned long start_time = jiffies;
73750 + reiser4_context *ctx = get_current_context();
73751 +
73752 + assert("nikita-2965", lock_stack_isclean(get_current_lock_stack()));
73753 + assert("nikita-3058", commit_check_locks());
73754 +
73755 + txn_restart_current();
73756 +
73757 + mgr = &get_super_private(super)->tmgr;
73758 +
73759 + txnh = ctx->trans;
73760 +
73761 + again:
73762 +
73763 + spin_lock_txnmgr(mgr);
73764 +
73765 + list_for_each_entry(atom, &mgr->atoms_list, atom_link) {
73766 + spin_lock_atom(atom);
73767 +
73768 +		/* Commit any atom which can be committed. If @commit_all_atoms
73769 +		 * is not set we commit only atoms which were created before
73770 +		 * this call was started. */
73771 + if (commit_all_atoms
73772 + || time_before_eq(atom->start_time, start_time)) {
73773 + if (atom->stage <= ASTAGE_POST_COMMIT) {
73774 + spin_unlock_txnmgr(mgr);
73775 +
73776 + if (atom->stage < ASTAGE_PRE_COMMIT) {
73777 + spin_lock_txnh(txnh);
73778 + /* Add force-context txnh */
73779 + capture_assign_txnh_nolock(atom, txnh);
73780 + ret = force_commit_atom(txnh);
73781 + if (ret)
73782 + return ret;
73783 + } else
73784 +				/* wait for atom commit */
73785 + atom_wait_event(atom);
73786 +
73787 + goto again;
73788 + }
73789 + }
73790 +
73791 + spin_unlock_atom(atom);
73792 + }
73793 +
73794 +#if REISER4_DEBUG
73795 + if (commit_all_atoms) {
73796 + reiser4_super_info_data *sbinfo = get_super_private(super);
73797 + spin_lock_reiser4_super(sbinfo);
73798 + assert("zam-813",
73799 + sbinfo->blocks_fake_allocated_unformatted == 0);
73800 + assert("zam-812", sbinfo->blocks_fake_allocated == 0);
73801 + spin_unlock_reiser4_super(sbinfo);
73802 + }
73803 +#endif
73804 +
73805 + spin_unlock_txnmgr(mgr);
73806 +
73807 + return 0;
73808 +}
73809 +
73810 +/* check whether commit_some_atoms() can commit @atom. Locking is up to the
73811 + * caller */
73812 +static int atom_is_committable(txn_atom * atom)
73813 +{
73814 + return
73815 + atom->stage < ASTAGE_PRE_COMMIT &&
73816 + atom->txnh_count == atom->nr_waiters && atom_should_commit(atom);
73817 +}
73818 +
73819 +/* called periodically from ktxnmgrd to commit old atoms. Releases ktxnmgrd spin
73820 + * lock at exit */
73821 +int commit_some_atoms(txn_mgr * mgr)
73822 +{
73823 + int ret = 0;
73824 + txn_atom *atom;
73825 + txn_handle *txnh;
73826 + reiser4_context *ctx;
73827 + struct list_head *pos, *tmp;
73828 +
73829 + ctx = get_current_context();
73830 + assert("nikita-2444", ctx != NULL);
73831 +
73832 + txnh = ctx->trans;
73833 + spin_lock_txnmgr(mgr);
73834 +
73835 + /*
73836 +	 * this is to avoid a gcc complaint that atom might be used
73837 +	 * uninitialized
73838 + */
73839 + atom = NULL;
73840 +
73841 + /* look for atom to commit */
73842 + list_for_each_safe(pos, tmp, &mgr->atoms_list) {
73843 + atom = list_entry(pos, txn_atom, atom_link);
73844 + /*
73845 + * first test without taking atom spin lock, whether it is
73846 + * eligible for committing at all
73847 + */
73848 + if (atom_is_committable(atom)) {
73849 + /* now, take spin lock and re-check */
73850 + spin_lock_atom(atom);
73851 + if (atom_is_committable(atom))
73852 + break;
73853 + spin_unlock_atom(atom);
73854 + }
73855 + }
73856 +
73857 + ret = (&mgr->atoms_list == pos);
73858 + spin_unlock_txnmgr(mgr);
73859 +
73860 + if (ret) {
73861 + /* nothing found */
73862 + spin_unlock(&mgr->daemon->guard);
73863 + return 0;
73864 + }
73865 +
73866 + spin_lock_txnh(txnh);
73867 +
73868 + BUG_ON(atom == NULL);
73869 + /* Set the atom to force committing */
73870 + atom->flags |= ATOM_FORCE_COMMIT;
73871 +
73872 + /* Add force-context txnh */
73873 + capture_assign_txnh_nolock(atom, txnh);
73874 +
73875 + spin_unlock_txnh(txnh);
73876 + spin_unlock_atom(atom);
73877 +
73878 +	/* we are about to release the daemon spin lock; notify the daemon it
73879 + has to rescan atoms */
73880 + mgr->daemon->rescan = 1;
73881 + spin_unlock(&mgr->daemon->guard);
73882 + txn_restart_current();
73883 + return 0;
73884 +}
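+/* Editor's sketch (not part of the original patch): the calling convention of
+ * commit_some_atoms() as described above -- the ktxnmgrd daemon takes its
+ * guard spinlock before the call, and the function releases that lock on
+ * every return path. example_ktxnmgrd_pass is a hypothetical name. */
+#if 0	/* illustration only */
+static int example_ktxnmgrd_pass(txn_mgr *mgr)
+{
+	spin_lock(&mgr->daemon->guard);
+	/* drops mgr->daemon->guard before returning */
+	return commit_some_atoms(mgr);
+}
+#endif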
73885 +
73886 +static int txn_try_to_fuse_small_atom(txn_mgr * tmgr, txn_atom * atom)
73887 +{
73888 + int atom_stage;
73889 + txn_atom *atom_2;
73890 + int repeat;
73891 +
73892 + assert("zam-1051", atom->stage < ASTAGE_PRE_COMMIT);
73893 +
73894 + atom_stage = atom->stage;
73895 + repeat = 0;
73896 +
73897 + if (!spin_trylock_txnmgr(tmgr)) {
73898 + atomic_inc(&atom->refcount);
73899 + spin_unlock_atom(atom);
73900 + spin_lock_txnmgr(tmgr);
73901 + spin_lock_atom(atom);
73902 + repeat = 1;
73903 + if (atom->stage != atom_stage) {
73904 + spin_unlock_txnmgr(tmgr);
73905 + atom_dec_and_unlock(atom);
73906 + return -E_REPEAT;
73907 + }
73908 + atomic_dec(&atom->refcount);
73909 + }
73910 +
73911 + list_for_each_entry(atom_2, &tmgr->atoms_list, atom_link) {
73912 + if (atom == atom_2)
73913 + continue;
73914 + /*
73915 + * if trylock does not succeed we just do not fuse with that
73916 + * atom.
73917 + */
73918 + if (spin_trylock_atom(atom_2)) {
73919 + if (atom_2->stage < ASTAGE_PRE_COMMIT) {
73920 + spin_unlock_txnmgr(tmgr);
73921 + capture_fuse_into(atom_2, atom);
73922 +				/* all locks are lost; we can only repeat here */
73923 + return -E_REPEAT;
73924 + }
73925 + spin_unlock_atom(atom_2);
73926 + }
73927 + }
73928 + atom->flags |= ATOM_CANCEL_FUSION;
73929 + spin_unlock_txnmgr(tmgr);
73930 + if (repeat) {
73931 + spin_unlock_atom(atom);
73932 + return -E_REPEAT;
73933 + }
73934 + return 0;
73935 +}
73936 +
73937 +/* Calls jnode_flush() for the current atom if it exists; if not, just takes
73938 +   another atom and calls jnode_flush() for it. If the current transaction
73939 +   handle already has an assigned atom (the current atom), we have to close
73940 +   the current transaction prior to switching to another atom, or do something
73941 +   with the current atom. This code tries to flush the current atom.
73942 +
73943 +   flush_some_atom() is called as part of the memory-reclaim process. It is
73944 +   invoked from balance_dirty_pages(), pdflush, and entd.
73945 +
73946 +   If we can flush no nodes, the atom is committed, because this frees memory.
73947 +
73948 +   If the atom is too large or too old, it is committed as well.
73949 +*/
73950 +int
73951 +flush_some_atom(jnode * start, long *nr_submitted, const struct writeback_control *wbc,
73952 + int flags)
73953 +{
73954 + reiser4_context *ctx = get_current_context();
73955 + txn_mgr *tmgr = &get_super_private(ctx->super)->tmgr;
73956 + txn_handle *txnh = ctx->trans;
73957 + txn_atom *atom;
73958 + int ret;
73959 +
73960 + BUG_ON(wbc->nr_to_write == 0);
73961 + BUG_ON(*nr_submitted != 0);
73962 + assert("zam-1042", txnh != NULL);
73963 + repeat:
73964 + if (txnh->atom == NULL) {
73965 + /* current atom is not available, take first from txnmgr */
73966 + spin_lock_txnmgr(tmgr);
73967 +
73968 + /* traverse the list of all atoms */
73969 + list_for_each_entry(atom, &tmgr->atoms_list, atom_link) {
73970 + /* lock atom before checking its state */
73971 + spin_lock_atom(atom);
73972 +
73973 + /*
73974 + * we need an atom which is not being committed and
73975 +			 * which has no flushers (jnode_flush() adds one flusher
73976 +			 * at the beginning and subtracts one at the end).
73977 + */
73978 + if (atom->stage < ASTAGE_PRE_COMMIT &&
73979 + atom->nr_flushers == 0) {
73980 + spin_lock_txnh(txnh);
73981 + capture_assign_txnh_nolock(atom, txnh);
73982 + spin_unlock_txnh(txnh);
73983 +
73984 + goto found;
73985 + }
73986 +
73987 + spin_unlock_atom(atom);
73988 + }
73989 +
73990 + /*
73991 +		 * Write throttling: the case when no atom can be
73992 +		 * flushed or committed.
73993 + */
73994 + if (!current_is_pdflush() && !wbc->nonblocking) {
73995 + list_for_each_entry(atom, &tmgr->atoms_list, atom_link) {
73996 + spin_lock_atom(atom);
73997 + /* Repeat the check from the above. */
73998 + if (atom->stage < ASTAGE_PRE_COMMIT
73999 + && atom->nr_flushers == 0) {
74000 + spin_lock_txnh(txnh);
74001 + capture_assign_txnh_nolock(atom, txnh);
74002 + spin_unlock_txnh(txnh);
74003 +
74004 + goto found;
74005 + }
74006 + if (atom->stage <= ASTAGE_POST_COMMIT) {
74007 + spin_unlock_txnmgr(tmgr);
74008 + /*
74009 + * we just wait until atom's flusher
74010 +					 * makes progress in flushing or
74011 + * committing the atom
74012 + */
74013 + atom_wait_event(atom);
74014 + goto repeat;
74015 + }
74016 + spin_unlock_atom(atom);
74017 + }
74018 + }
74019 + spin_unlock_txnmgr(tmgr);
74020 + return 0;
74021 + found:
74022 + spin_unlock_txnmgr(tmgr);
74023 + } else
74024 + atom = get_current_atom_locked();
74025 +
74026 + BUG_ON(atom->super != ctx->super);
74027 + assert("vs-35", atom->super == ctx->super);
74028 + if (start) {
74029 + spin_lock_jnode(start);
74030 + ret = (atom == start->atom) ? 1 : 0;
74031 + spin_unlock_jnode(start);
74032 + if (ret == 0)
74033 + start = NULL;
74034 + }
74035 + ret = flush_current_atom(flags, wbc->nr_to_write, nr_submitted, &atom, start);
74036 + if (ret == 0) {
74037 +		/* flush_current_atom returns 0 only if it submitted nothing
74038 +		   for write */
74039 + BUG_ON(*nr_submitted != 0);
74040 + if (*nr_submitted == 0 || atom_should_commit_asap(atom)) {
74041 + if (atom->capture_count < tmgr->atom_min_size &&
74042 + !(atom->flags & ATOM_CANCEL_FUSION)) {
74043 + ret = txn_try_to_fuse_small_atom(tmgr, atom);
74044 + if (ret == -E_REPEAT) {
74045 + preempt_point();
74046 + goto repeat;
74047 + }
74048 + }
74049 + /* if early flushing could not make more nodes clean,
74050 + * or atom is too old/large,
74051 + * we force current atom to commit */
74052 + /* wait for commit completion but only if this
74053 +			 * wouldn't stall pdflush and the ent thread. */
74054 + if (!wbc->nonblocking && !ctx->entd)
74055 + txnh->flags |= TXNH_WAIT_COMMIT;
74056 + atom->flags |= ATOM_FORCE_COMMIT;
74057 + }
74058 + spin_unlock_atom(atom);
74059 + } else if (ret == -E_REPEAT) {
74060 + if (*nr_submitted == 0) {
74061 +			/* let others who hamper flushing (hold long-term locks,
74062 +			   for instance) free the way for flush */
74063 + preempt_point();
74064 + goto repeat;
74065 + }
74066 + ret = 0;
74067 + }
74068 +/*
74069 + if (*nr_submitted > wbc->nr_to_write)
74070 + warning("", "asked for %ld, written %ld\n", wbc->nr_to_write, *nr_submitted);
74071 +*/
74072 + txn_restart(ctx);
74073 +
74074 + return ret;
74075 +}
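+/* Editor's sketch (not part of the original patch): a minimal invocation of
+ * flush_some_atom() in the style of the memory-reclaim callers named above.
+ * The batch size of 32 is an arbitrary value for the sketch; a reiser4
+ * context must already be active on this thread, and start == NULL asks the
+ * function to pick an atom itself. example_flush_pass is a hypothetical name. */
+#if 0	/* illustration only */
+static long example_flush_pass(void)
+{
+	long submitted = 0;
+	struct writeback_control wbc = {
+		.nr_to_write = 32,	/* must be non-zero (see BUG_ON above) */
+		.nonblocking = 0,	/* allow write throttling to wait */
+	};
+
+	flush_some_atom(NULL, &submitted, &wbc, JNODE_FLUSH_WRITE_BLOCKS);
+	return submitted;
+}
+#endif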
74076 +
74077 +/* Remove processed nodes from an atom's capture list (thereby removing them from the transaction). */
74078 +void invalidate_list(struct list_head *head)
74079 +{
74080 + while (!list_empty(head)) {
74081 + jnode *node;
74082 +
74083 + node = list_entry(head->next, jnode, capture_link);
74084 + spin_lock_jnode(node);
74085 + uncapture_block(node);
74086 + jput(node);
74087 + }
74088 +}
74089 +
74090 +static void init_wlinks(txn_wait_links * wlinks)
74091 +{
74092 + wlinks->_lock_stack = get_current_lock_stack();
74093 + INIT_LIST_HEAD(&wlinks->_fwaitfor_link);
74094 + INIT_LIST_HEAD(&wlinks->_fwaiting_link);
74095 + wlinks->waitfor_cb = NULL;
74096 + wlinks->waiting_cb = NULL;
74097 +}
74098 +
74099 +/* Add the current thread to the atom's waitfor list and wait for somebody to wake us up */
74100 +void atom_wait_event(txn_atom * atom)
74101 +{
74102 + txn_wait_links _wlinks;
74103 +
74104 + assert_spin_locked(&(atom->alock));
74105 + assert("nikita-3156",
74106 + lock_stack_isclean(get_current_lock_stack()) ||
74107 + atom->nr_running_queues > 0);
74108 +
74109 + init_wlinks(&_wlinks);
74110 + list_add_tail(&_wlinks._fwaitfor_link, &atom->fwaitfor_list);
74111 + atomic_inc(&atom->refcount);
74112 + spin_unlock_atom(atom);
74113 +
74114 + prepare_to_sleep(_wlinks._lock_stack);
74115 + go_to_sleep(_wlinks._lock_stack);
74116 +
74117 + spin_lock_atom(atom);
74118 + list_del(&_wlinks._fwaitfor_link);
74119 + atom_dec_and_unlock(atom);
74120 +}
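+/* Editor's sketch (not part of the original patch): the re-check loop callers
+ * build around atom_wait_event(), as uncapture_page() below does. Note that
+ * atom_wait_event() returns with the atom spinlock released, and wakeups may
+ * be spurious, so the condition must be re-tested under the lock. The
+ * predicate "done" and the name example_wait_until are hypothetical. */
+#if 0	/* illustration only */
+static void example_wait_until(txn_atom *atom, int (*done)(txn_atom *))
+{
+	spin_lock_atom(atom);
+	while (!done(atom)) {
+		/* drops atom->alock, sleeps, returns with the lock released */
+		atom_wait_event(atom);
+		spin_lock_atom(atom);
+	}
+	spin_unlock_atom(atom);
+}
+#endif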
74121 +
74122 +void atom_set_stage(txn_atom * atom, txn_stage stage)
74123 +{
74124 + assert("nikita-3535", atom != NULL);
74125 + assert_spin_locked(&(atom->alock));
74126 + assert("nikita-3536", ASTAGE_FREE <= stage && stage <= ASTAGE_INVALID);
74127 + /* Excelsior! */
74128 + assert("nikita-3537", stage >= atom->stage);
74129 + if (atom->stage != stage) {
74130 + atom->stage = stage;
74131 + atom_send_event(atom);
74132 + }
74133 +}
74134 +
74135 +/* wake all threads which wait for an event */
74136 +void atom_send_event(txn_atom * atom)
74137 +{
74138 + assert_spin_locked(&(atom->alock));
74139 + wakeup_atom_waitfor_list(atom);
74140 +}
74141 +
74142 +/* Informs the txn manager that the owner of this txn_handle should wait for atom
74143 +   commit completion (for example, because it is doing fsync(2)) */
74144 +static int should_wait_commit(txn_handle * h)
74145 +{
74146 + return h->flags & TXNH_WAIT_COMMIT;
74147 +}
74148 +
74149 +typedef struct commit_data {
74150 + txn_atom *atom;
74151 + txn_handle *txnh;
74152 + long nr_written;
74153 +	/* as an optimization we start committing the atom by first trying to
74154 +	 * flush it a few times without switching into ASTAGE_CAPTURE_WAIT. This
74155 +	 * reduces stalls due to other threads waiting for the atom in the
74156 +	 * ASTAGE_CAPTURE_WAIT stage. ->preflush is the counter of these
74157 +	 * preliminary flushes. */
74158 + int preflush;
74159 +	/* have we waited on the atom? */
74160 + int wait;
74161 + int failed;
74162 + int wake_ktxnmgrd_up;
74163 +} commit_data;
74164 +
74165 +/*
74166 + * Called from commit_txnh() repeatedly, until either error happens, or atom
74167 + * commits successfully.
74168 + */
74169 +static int try_commit_txnh(commit_data * cd)
74170 +{
74171 + int result;
74172 +
74173 + assert("nikita-2968", lock_stack_isclean(get_current_lock_stack()));
74174 +
74175 + /* Get the atom and txnh locked. */
74176 + cd->atom = txnh_get_atom(cd->txnh);
74177 + assert("jmacd-309", cd->atom != NULL);
74178 + spin_unlock_txnh(cd->txnh);
74179 +
74180 + if (cd->wait) {
74181 + cd->atom->nr_waiters--;
74182 + cd->wait = 0;
74183 + }
74184 +
74185 + if (cd->atom->stage == ASTAGE_DONE)
74186 + return 0;
74187 +
74188 + if (cd->failed)
74189 + return 0;
74190 +
74191 + if (atom_should_commit(cd->atom)) {
74192 + /* if atom is _very_ large schedule it for commit as soon as
74193 + * possible. */
74194 + if (atom_should_commit_asap(cd->atom)) {
74195 + /*
74196 + * When atom is in PRE_COMMIT or later stage following
74197 + * invariant (encoded in atom_can_be_committed())
74198 + * holds: there is exactly one non-waiter transaction
74199 + * handle opened on this atom. When thread wants to
74200 + * wait until atom commits (for example sync()) it
74201 + * waits on atom event after increasing
74202 + * atom->nr_waiters (see blow in this function). It
74203 + * cannot be guaranteed that atom is already committed
74204 + * after receiving event, so loop has to be
74205 + * re-started. But if atom switched into PRE_COMMIT
74206 + * stage and became too large, we cannot change its
74207 + * state back to CAPTURE_WAIT (atom stage can only
74208 + * increase monotonically), hence this check.
74209 + */
74210 + if (cd->atom->stage < ASTAGE_CAPTURE_WAIT)
74211 + atom_set_stage(cd->atom, ASTAGE_CAPTURE_WAIT);
74212 + cd->atom->flags |= ATOM_FORCE_COMMIT;
74213 + }
74214 + if (cd->txnh->flags & TXNH_DONT_COMMIT) {
74215 + /*
74216 + * this thread (transaction handle that is) doesn't
74217 + * want to commit atom. Notify waiters that handle is
74218 + * closed. This can happen, for example, when we are
74219 + * under VFS directory lock and don't want to commit
74220 + * atom right now to avoid stalling other threads
74221 + * working in the same directory.
74222 + */
74223 +
74224 + /* Wake the ktxnmgrd up if the ktxnmgrd is needed to
74225 + * commit this atom: no atom waiters and only one
74226 + * (our) open transaction handle. */
74227 + cd->wake_ktxnmgrd_up =
74228 + cd->atom->txnh_count == 1 &&
74229 + cd->atom->nr_waiters == 0;
74230 + atom_send_event(cd->atom);
74231 + result = 0;
74232 + } else if (!atom_can_be_committed(cd->atom)) {
74233 + if (should_wait_commit(cd->txnh)) {
74234 + /* sync(): wait for commit */
74235 + cd->atom->nr_waiters++;
74236 + cd->wait = 1;
74237 + atom_wait_event(cd->atom);
74238 + result = RETERR(-E_REPEAT);
74239 + } else {
74240 + result = 0;
74241 + }
74242 + } else if (cd->preflush > 0 && !is_current_ktxnmgrd()) {
74243 + /*
74244 + * optimization: flush atom without switching it into
74245 + * ASTAGE_CAPTURE_WAIT.
74246 + *
74247 + * But don't do this for ktxnmgrd, because ktxnmgrd
74248 + * should never block on atom fusion.
74249 + */
74250 + result = flush_current_atom(JNODE_FLUSH_WRITE_BLOCKS,
74251 + LONG_MAX, &cd->nr_written,
74252 + &cd->atom, NULL);
74253 + if (result == 0) {
74254 + spin_unlock_atom(cd->atom);
74255 + cd->preflush = 0;
74256 + result = RETERR(-E_REPEAT);
74257 +			} else	/* Atom wasn't flushed
74258 + * completely. Rinse. Repeat. */
74259 + --cd->preflush;
74260 + } else {
74261 + /* We change atom state to ASTAGE_CAPTURE_WAIT to
74262 +			   prevent atom fusion and count ourselves as an active
74263 + flusher */
74264 + atom_set_stage(cd->atom, ASTAGE_CAPTURE_WAIT);
74265 + cd->atom->flags |= ATOM_FORCE_COMMIT;
74266 +
74267 + result =
74268 + commit_current_atom(&cd->nr_written, &cd->atom);
74269 + if (result != 0 && result != -E_REPEAT)
74270 + cd->failed = 1;
74271 + }
74272 + } else
74273 + result = 0;
74274 +
74275 +#if REISER4_DEBUG
74276 + if (result == 0)
74277 + assert_spin_locked(&(cd->atom->alock));
74278 +#endif
74279 +
74280 + /* perfectly valid assertion, except that when atom/txnh is not locked
74281 + * fusion can take place, and cd->atom points nowhere. */
74282 + /*
74283 + assert("jmacd-1028", ergo(result != 0, spin_atom_is_not_locked(cd->atom)));
74284 + */
74285 + return result;
74286 +}
74287 +
74288 +/* Called to commit a transaction handle. This decrements the atom's number of
74289 +   open handles and, if it is the last handle to commit and the atom should
74290 +   commit, initiates atom commit. Returns 0. */
74291 +static int commit_txnh(txn_handle * txnh)
74292 +{
74293 + commit_data cd;
74294 + assert("umka-192", txnh != NULL);
74295 +
74296 + memset(&cd, 0, sizeof cd);
74297 + cd.txnh = txnh;
74298 + cd.preflush = 10;
74299 +
74300 + /* calls try_commit_txnh() until either atom commits, or error
74301 + * happens */
74302 + while (try_commit_txnh(&cd) != 0)
74303 + preempt_point();
74304 +
74305 + spin_lock_txnh(txnh);
74306 +
74307 + cd.atom->txnh_count -= 1;
74308 + txnh->atom = NULL;
74309 + /* remove transaction handle from atom's list of transaction handles */
74310 + list_del_init(&txnh->txnh_link);
74311 +
74312 + spin_unlock_txnh(txnh);
74313 + atom_dec_and_unlock(cd.atom);
74314 +	/* if the current thread doesn't want to do a commit (TXNH_DONT_COMMIT is
74315 +	 * set, probably because it takes time), that work is done
74316 +	 * asynchronously by the ktxnmgrd daemon. */
74317 + if (cd.wake_ktxnmgrd_up)
74318 + ktxnmgrd_kick(&get_current_super_private()->tmgr);
74319 +
74320 + return 0;
74321 +}
74322 +
74323 +/* TRY_CAPTURE */
74324 +
74325 +/* This routine attempts a single block-capture request. It may return -E_REPEAT if some
74326 + condition indicates that the request should be retried, and it may block if the
74327 + txn_capture mode does not include the TXN_CAPTURE_NONBLOCKING request flag.
74328 +
74329 + This routine encodes the basic logic of block capturing described by:
74330 +
74331 + http://namesys.com/v4/v4.html
74332 +
74333 + Our goal here is to ensure that any two blocks that contain dependent modifications
74334 + should commit at the same time. This function enforces this discipline by initiating
74335 + fusion whenever a transaction handle belonging to one atom requests to read or write a
74336 + block belonging to another atom (TXN_CAPTURE_WRITE or TXN_CAPTURE_READ_ATOMIC).
74337 +
74338 + In addition, this routine handles the initial assignment of atoms to blocks and
74339 + transaction handles. These are possible outcomes of this function:
74340 +
74341 + 1. The block and handle are already part of the same atom: return immediate success
74342 +
74343 + 2. The block is assigned but the handle is not: call capture_assign_txnh to assign
74344 + the handle to the block's atom.
74345 +
74346 + 3. The handle is assigned but the block is not: call capture_assign_block to assign
74347 + the block to the handle's atom.
74348 +
74349 + 4. Both handle and block are assigned, but to different atoms: call capture_init_fusion
74350 + to fuse atoms.
74351 +
74352 + 5. Neither block nor handle are assigned: create a new atom and assign them both.
74353 +
74354 + 6. A read request for a non-captured block: return immediate success.
74355 +
74356 + This function acquires and releases the handle's spinlock. This function is called
74357 + under the jnode lock and if the return value is 0, it returns with the jnode lock still
74358 + held. If the return is -E_REPEAT or some other error condition, the jnode lock is
74359 +   released. The external interface (try_capture) manages re-acquiring the jnode lock
74360 + in the failure case.
74361 +*/
74362 +static int try_capture_block(
74363 + txn_handle * txnh, jnode * node, txn_capture mode,
74364 + txn_atom ** atom_alloc)
74365 +{
74366 + txn_atom *block_atom;
74367 + txn_atom *txnh_atom;
74368 +
74369 + /* Should not call capture for READ_NONCOM requests, handled in try_capture. */
74370 + assert("jmacd-567", CAPTURE_TYPE(mode) != TXN_CAPTURE_READ_NONCOM);
74371 +
74372 + /* FIXME-ZAM-HANS: FIXME_LATER_JMACD Should assert that atom->tree ==
74373 + * node->tree somewhere. */
74374 + assert("umka-194", txnh != NULL);
74375 + assert("umka-195", node != NULL);
74376 +
74377 + /* The jnode is already locked! Being called from try_capture(). */
74378 + assert_spin_locked(&(node->guard));
74379 + block_atom = node->atom;
74380 +
74381 + /* Get txnh spinlock, this allows us to compare txn_atom pointers but it doesn't
74382 + let us touch the atoms themselves. */
74383 + spin_lock_txnh(txnh);
74384 + txnh_atom = txnh->atom;
74385 +	/* The process of capturing continues into one of four branches depending on
74386 +	   which of the atoms (block atom (node->atom), current atom (txnh->atom))
74387 +	   exist. */
74388 + if (txnh_atom == NULL) {
74389 + if (block_atom == NULL) {
74390 + spin_unlock_txnh(txnh);
74391 + spin_unlock_jnode(node);
74392 + /* assign empty atom to the txnh and repeat */
74393 + return atom_begin_and_assign_to_txnh(atom_alloc, txnh);
74394 + } else {
74395 + atomic_inc(&block_atom->refcount);
74396 + /* node spin-lock isn't needed anymore */
74397 + spin_unlock_jnode(node);
74398 + if (!spin_trylock_atom(block_atom)) {
74399 + spin_unlock_txnh(txnh);
74400 + spin_lock_atom(block_atom);
74401 + spin_lock_txnh(txnh);
74402 + }
74403 + /* re-check state after getting txnh and the node
74404 + * atom spin-locked */
74405 + if (node->atom != block_atom || txnh->atom != NULL) {
74406 + spin_unlock_txnh(txnh);
74407 + atom_dec_and_unlock(block_atom);
74408 + return RETERR(-E_REPEAT);
74409 + }
74410 + atomic_dec(&block_atom->refcount);
74411 + if (block_atom->stage > ASTAGE_CAPTURE_WAIT ||
74412 + (block_atom->stage == ASTAGE_CAPTURE_WAIT &&
74413 + block_atom->txnh_count != 0))
74414 + return capture_fuse_wait(txnh, block_atom, NULL, mode);
74415 + capture_assign_txnh_nolock(block_atom, txnh);
74416 + spin_unlock_txnh(txnh);
74417 + spin_unlock_atom(block_atom);
74418 + return RETERR(-E_REPEAT);
74419 + }
74420 + } else {
74421 +		/* It is time to perform a deadlock prevention check over the
74422 +		   node we want to capture. It is possible this node was locked
74423 +		   for read without capturing it. The optimization which allows
74424 +		   this helps us keep atoms independent as long as
74425 +		   possible, but it may cause lock/fuse deadlock problems.
74426 +
74427 +		   A number of similar deadlock situations with locked but not
74428 +		   captured nodes were found. In each situation there are two
74429 +		   or more threads: one of them does flushing while another one
74430 +		   does routine balancing or tree lookup. The flushing thread
74431 +		   (F) sleeps in a long-term locking request for node (N); another
74432 +		   thread (A) sleeps trying to capture some node already
74433 +		   belonging to the atom of F, while F's atom is in a state which
74434 +		   prevents immediate fusion.
74435 +
74436 +		   Deadlocks of this kind cannot happen if node N was properly
74437 +		   captured by thread A. Thread F fuses atoms before locking,
74438 +		   therefore the current atom of thread F and the current atom of
74439 +		   thread A become the same atom and thread A may proceed. This
74440 +		   does not work if node N was not captured, because the fusion
74441 +		   of atoms does not happen then.
74442 +
74443 +		   The following scheme solves the deadlock: if
74444 +		   longterm_lock_znode locks and does not capture a znode, that
74445 +		   znode is marked as MISSED_IN_CAPTURE. A node marked this way
74446 +		   is processed by the code below, which restores the missed
74447 +		   capture and fuses the current atoms of all the node's lock
74448 +		   owners by calling the fuse_not_fused_lock_owners() function. */
74449 + if (JF_ISSET(node, JNODE_MISSED_IN_CAPTURE)) {
74450 + JF_CLR(node, JNODE_MISSED_IN_CAPTURE);
74451 + if (jnode_is_znode(node) && znode_is_locked(JZNODE(node))) {
74452 + spin_unlock_txnh(txnh);
74453 + spin_unlock_jnode(node);
74454 + fuse_not_fused_lock_owners(txnh, JZNODE(node));
74455 + return RETERR(-E_REPEAT);
74456 + }
74457 + }
74458 + if (block_atom == NULL) {
74459 + atomic_inc(&txnh_atom->refcount);
74460 + spin_unlock_txnh(txnh);
74461 + if (!spin_trylock_atom(txnh_atom)) {
74462 + spin_unlock_jnode(node);
74463 + spin_lock_atom(txnh_atom);
74464 + spin_lock_jnode(node);
74465 + }
74466 + if (txnh->atom != txnh_atom || node->atom != NULL
74467 + || JF_ISSET(node, JNODE_IS_DYING)) {
74468 + spin_unlock_jnode(node);
74469 + atom_dec_and_unlock(txnh_atom);
74470 + return RETERR(-E_REPEAT);
74471 + }
74472 + atomic_dec(&txnh_atom->refcount);
74473 + capture_assign_block_nolock(txnh_atom, node);
74474 + spin_unlock_atom(txnh_atom);
74475 + } else {
74476 + if (txnh_atom != block_atom) {
74477 + if (mode & TXN_CAPTURE_DONT_FUSE) {
74478 + spin_unlock_txnh(txnh);
74479 + spin_unlock_jnode(node);
74480 + /* we are in a "no-fusion" mode and @node is
74481 +				 * already part of a transaction. */
74482 + return RETERR(-E_NO_NEIGHBOR);
74483 + }
74484 + return capture_init_fusion(node, txnh, mode);
74485 + }
74486 + spin_unlock_txnh(txnh);
74487 + }
74488 + }
74489 + return 0;
74490 +}
74491 +
74492 +static txn_capture
74493 +build_capture_mode(jnode * node, znode_lock_mode lock_mode, txn_capture flags)
74494 +{
74495 + txn_capture cap_mode;
74496 +
74497 + assert_spin_locked(&(node->guard));
74498 +
74499 + /* FIXME_JMACD No way to set TXN_CAPTURE_READ_MODIFY yet. */
74500 +
74501 + if (lock_mode == ZNODE_WRITE_LOCK) {
74502 + cap_mode = TXN_CAPTURE_WRITE;
74503 + } else if (node->atom != NULL) {
74504 + cap_mode = TXN_CAPTURE_WRITE;
74505 + } else if (0 && /* txnh->mode == TXN_READ_FUSING && */
74506 + jnode_get_level(node) == LEAF_LEVEL) {
74507 + /* NOTE-NIKITA TXN_READ_FUSING is not currently used */
74508 + /* We only need a READ_FUSING capture at the leaf level. This
74509 + is because the internal levels of the tree (twigs included)
74510 +		   are redundant from the point of view of the user that asked for a
74511 + read-fusing transcrash. The user only wants to read-fuse
74512 + atoms due to reading uncommitted data that another user has
74513 + written. It is the file system that reads/writes the
74514 + internal tree levels, the user only reads/writes leaves. */
74515 + cap_mode = TXN_CAPTURE_READ_ATOMIC;
74516 + } else {
74517 + /* In this case (read lock at a non-leaf) there's no reason to
74518 + * capture. */
74519 + /* cap_mode = TXN_CAPTURE_READ_NONCOM; */
74520 + return 0;
74521 + }
74522 +
74523 + cap_mode |= (flags & (TXN_CAPTURE_NONBLOCKING | TXN_CAPTURE_DONT_FUSE));
74524 + assert("nikita-3186", cap_mode != 0);
74525 + return cap_mode;
74526 +}
74527 +
74528 +/* This is an external interface to try_capture_block(), it calls
74529 + try_capture_block() repeatedly as long as -E_REPEAT is returned.
74530 +
74531 +   @node: node to capture,
74532 +   @lock_mode: read or write lock is used in capture mode calculation,
74533 +   @flags: see txn_capture flags enumeration.
74535 +
74536 + @return: 0 - node was successfully captured, -E_REPEAT - capture request
74537 + cannot be processed immediately as it was requested in flags,
74538 + < 0 - other errors.
74539 +*/
74540 +int try_capture(jnode *node, znode_lock_mode lock_mode,
74541 + txn_capture flags)
74542 +{
74543 + txn_atom *atom_alloc = NULL;
74544 + txn_capture cap_mode;
74545 + txn_handle *txnh = get_current_context()->trans;
74546 + int ret;
74547 +
74548 + assert_spin_locked(&(node->guard));
74549 +
74550 + repeat:
74551 + if (JF_ISSET(node, JNODE_IS_DYING))
74552 + return RETERR(-EINVAL);
74553 + if (node->atom != NULL && txnh->atom == node->atom)
74554 + return 0;
74555 + cap_mode = build_capture_mode(node, lock_mode, flags);
74556 + if (cap_mode == 0 ||
74557 + (!(cap_mode & TXN_CAPTURE_WTYPES) && node->atom == NULL)) {
74558 + /* Mark this node as "MISSED". It helps in further deadlock
74559 + * analysis */
74560 + if (jnode_is_znode(node))
74561 + JF_SET(node, JNODE_MISSED_IN_CAPTURE);
74562 + return 0;
74563 + }
74564 + /* Repeat try_capture as long as -E_REPEAT is returned. */
74565 + ret = try_capture_block(txnh, node, cap_mode, &atom_alloc);
74566 + /* Regardless of non_blocking:
74567 +
74568 + If ret == 0 then jnode is still locked.
74569 + If ret != 0 then jnode is unlocked.
74570 + */
74571 +#if REISER4_DEBUG
74572 + if (ret == 0)
74573 + assert_spin_locked(&(node->guard));
74574 + else
74575 + assert_spin_not_locked(&(node->guard));
74576 +#endif
74577 + assert_spin_not_locked(&(txnh->guard));
74578 +
74579 + if (ret == -E_REPEAT) {
74580 + /* E_REPEAT implies all locks were released, therefore we need
74581 + to take the jnode's lock again. */
74582 + spin_lock_jnode(node);
74583 +
74584 + /* Although this may appear to be a busy loop, it is not.
74585 + There are several conditions that cause E_REPEAT to be
74586 + returned by the call to try_capture_block, all cases
74587 + indicating some kind of state change that means you should
74588 + retry the request and will get a different result. In some
74589 + cases this could be avoided with some extra code, but
74590 + generally it is done because the necessary locks were
74591 + released as a result of the operation and repeating is the
74592 + simplest thing to do (less bug potential). The cases are:
74593 + atom fusion returns E_REPEAT after it completes (jnode and
74594 + txnh were unlocked); race conditions in assign_block,
74595 + assign_txnh, and init_fusion return E_REPEAT (trylock
74596 + failure); after going to sleep in capture_fuse_wait
74597 + (request was blocked but may now succeed). I'm not quite
74598 + sure how capture_copy works yet, but it may also return
74599 + E_REPEAT. When the request is legitimately blocked, the
74600 + requestor goes to sleep in fuse_wait, so this is not a busy
74601 + loop. */
74602 + /* NOTE-NIKITA: still don't understand:
74603 +
74604 + try_capture_block->capture_assign_txnh->spin_trylock_atom->E_REPEAT
74605 +
74606 + looks like busy loop?
74607 + */
74608 + goto repeat;
74609 + }
74610 +
74611 + /* free extra atom object that was possibly allocated by
74612 + try_capture_block().
74613 +
74614 + Do this before acquiring jnode spin lock to
74615 + minimize time spent under lock. --nikita */
74616 + if (atom_alloc != NULL) {
74617 + kmem_cache_free(_atom_slab, atom_alloc);
74618 + }
74619 +
74620 + if (ret != 0) {
74621 + if (ret == -E_BLOCK) {
74622 + assert("nikita-3360",
74623 + cap_mode & TXN_CAPTURE_NONBLOCKING);
74624 + ret = -E_REPEAT;
74625 + }
74626 +
74627 + /* Failure means jnode is not locked. FIXME_LATER_JMACD May
74628 + want to fix the above code to avoid releasing the lock and
74629 +		   re-acquiring it, but there are cases where failure occurs
74630 + when the lock is not held, and those cases would need to be
74631 + modified to re-take the lock. */
74632 + spin_lock_jnode(node);
74633 + }
74634 +
74635 + /* Jnode is still locked. */
74636 + assert_spin_locked(&(node->guard));
74637 + return ret;
74638 +}
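+/* Editor's sketch (not part of the original patch): the locking contract of
+ * try_capture() from a caller's point of view, mirroring
+ * try_capture_page_to_invalidate() below -- the jnode spinlock is held on
+ * entry and still held on return, whatever the result. example_capture is a
+ * hypothetical name. */
+#if 0	/* illustration only */
+static int example_capture(jnode *node)
+{
+	int ret;
+
+	spin_lock_jnode(node);
+	ret = try_capture(node, ZNODE_WRITE_LOCK, 0);
+	/* the jnode lock survived the call even on error */
+	spin_unlock_jnode(node);
+	return ret;
+}
+#endif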
74639 +
74640 +static void release_two_atoms(txn_atom *one, txn_atom *two)
74641 +{
74642 + spin_unlock_atom(one);
74643 + atom_dec_and_unlock(two);
74644 + spin_lock_atom(one);
74645 + atom_dec_and_unlock(one);
74646 +}
74647 +
74648 +/* This function sets up a call to try_capture_block and repeats as long as -E_REPEAT is
74649 + returned by that routine. The txn_capture request mode is computed here depending on
74650 + the transaction handle's type and the lock request. This is called from the depths of
74651 + the lock manager with the jnode lock held and it always returns with the jnode lock
74652 + held.
74653 +*/
74654 +
74655 +/* fuse all 'active' atoms of lock owners of given node. */
74656 +static void fuse_not_fused_lock_owners(txn_handle * txnh, znode * node)
74657 +{
74658 + lock_handle *lh;
74659 + int repeat;
74660 + txn_atom *atomh, *atomf;
74661 + reiser4_context *me = get_current_context();
74662 + reiser4_context *ctx = NULL;
74663 +
74664 + assert_spin_not_locked(&(ZJNODE(node)->guard));
74665 + assert_spin_not_locked(&(txnh->hlock));
74666 +
74667 + repeat:
74668 + repeat = 0;
74669 + atomh = txnh_get_atom(txnh);
74670 + spin_unlock_txnh(txnh);
74671 + assert("zam-692", atomh != NULL);
74672 +
74673 + spin_lock_zlock(&node->lock);
74674 + /* inspect list of lock owners */
74675 + list_for_each_entry(lh, &node->lock.owners, owners_link) {
74676 + ctx = get_context_by_lock_stack(lh->owner);
74677 + if (ctx == me)
74678 + continue;
74679 +		/* below we use two assumptions to avoid additional spin-locks
74680 +		   for checking the condition:
74681 +
74682 +		   1) if the lock stack holds a lock, the transaction should be
74683 +		   opened, i.e. ctx->trans != NULL;
74684 +
74685 +		   2) reading of the well-aligned ctx->trans->atom is atomic; if it
74686 +		   equals the address of the spin-locked atomh, we take it that
74687 +		   the atoms are the same and nothing has to be captured. */
74688 + if (atomh != ctx->trans->atom) {
74689 + reiser4_wake_up(lh->owner);
74690 + repeat = 1;
74691 + break;
74692 + }
74693 + }
74694 + if (repeat) {
74695 + if (!spin_trylock_txnh(ctx->trans)) {
74696 + spin_unlock_zlock(&node->lock);
74697 + spin_unlock_atom(atomh);
74698 + goto repeat;
74699 + }
74700 + atomf = ctx->trans->atom;
74701 + if (atomf == NULL) {
74702 + capture_assign_txnh_nolock(atomh, ctx->trans);
74703 +			/* release the zlock _after_ assigning the atom to the
74704 + * transaction handle, otherwise the lock owner thread
74705 + * may unlock all znodes, exit kernel context and here
74706 + * we would access an invalid transaction handle. */
74707 + spin_unlock_zlock(&node->lock);
74708 + spin_unlock_atom(atomh);
74709 + spin_unlock_txnh(ctx->trans);
74710 + goto repeat;
74711 + }
74712 + assert("zam-1059", atomf != atomh);
74713 + spin_unlock_zlock(&node->lock);
74714 + atomic_inc(&atomh->refcount);
74715 + atomic_inc(&atomf->refcount);
74716 + spin_unlock_txnh(ctx->trans);
74717 + if (atomf > atomh) {
74718 + spin_lock_atom(atomf);
74719 + } else {
74720 + spin_unlock_atom(atomh);
74721 + spin_lock_atom(atomf);
74722 + spin_lock_atom(atomh);
74723 + }
74724 + if (atomh == atomf || !atom_isopen(atomh) || !atom_isopen(atomf)) {
74725 + release_two_atoms(atomf, atomh);
74726 + goto repeat;
74727 + }
74728 + atomic_dec(&atomh->refcount);
74729 + atomic_dec(&atomf->refcount);
74730 + capture_fuse_into(atomf, atomh);
74731 + goto repeat;
74732 + }
74733 + spin_unlock_zlock(&node->lock);
74734 + spin_unlock_atom(atomh);
74735 +}
74736 +
74737 +/* This is the interface to capture unformatted nodes via their struct page
74738 + reference. Currently it is only used in reiser4_invalidatepage */
74739 +int try_capture_page_to_invalidate(struct page *pg)
74740 +{
74741 + int ret;
74742 + jnode *node;
74743 +
74744 + assert("umka-292", pg != NULL);
74745 + assert("nikita-2597", PageLocked(pg));
74746 +
74747 + if (IS_ERR(node = jnode_of_page(pg))) {
74748 + return PTR_ERR(node);
74749 + }
74750 +
74751 + spin_lock_jnode(node);
74752 + unlock_page(pg);
74753 +
74754 + ret = try_capture(node, ZNODE_WRITE_LOCK, 0);
74755 + spin_unlock_jnode(node);
74756 + jput(node);
74757 + lock_page(pg);
74758 + return ret;
74759 +}
74760 +
74761 +/* This informs the transaction manager when a node is deleted. Add the block to the
74762 + atom's delete set and uncapture the block.
74763 +
74764 +VS-FIXME-HANS: this E_REPEAT paradigm clutters the code and creates a need for
74765 +explanations. find all the functions that use it, and unless there is some very
74766 +good reason to use it (I have not noticed one so far and I doubt it exists, but maybe somewhere somehow....),
74767 +move the loop to inside the function.
74768 +
74769 +VS-FIXME-HANS: can this code be at all streamlined? In particular, can you lock and unlock the jnode fewer times?
74770 + */
74771 +void uncapture_page(struct page *pg)
74772 +{
74773 + jnode *node;
74774 + txn_atom *atom;
74775 +
74776 + assert("umka-199", pg != NULL);
74777 + assert("nikita-3155", PageLocked(pg));
74778 +
74779 + clear_page_dirty_for_io(pg);
74780 +
74781 + reiser4_wait_page_writeback(pg);
74782 +
74783 + node = jprivate(pg);
74784 + BUG_ON(node == NULL);
74785 +
74786 + spin_lock_jnode(node);
74787 +
74788 + atom = jnode_get_atom(node);
74789 + if (atom == NULL) {
74790 + assert("jmacd-7111", !JF_ISSET(node, JNODE_DIRTY));
74791 + spin_unlock_jnode(node);
74792 + return;
74793 + }
74794 +
74795 + /* We can remove jnode from transaction even if it is on flush queue
74796 + * prepped list, we only need to be sure that flush queue is not being
74797 + * written by write_fq(). write_fq() does not use atom spin lock for
74798 + * protection of the prepped nodes list, instead write_fq() increments
74799 + * atom's nr_running_queues counters for the time when prepped list is
74800 + * not protected by spin lock. Here we check this counter if we want
74801 + * to remove jnode from flush queue and, if the counter is not zero,
74802 +	 * wait for all write_fq() calls for this atom to complete. This is not
74803 +	 * a significant overhead. */
74804 + while (JF_ISSET(node, JNODE_FLUSH_QUEUED) && atom->nr_running_queues) {
74805 + spin_unlock_jnode(node);
74806 + /*
74807 + * at this moment we want to wait for "atom event", viz. wait
74808 + * until @node can be removed from flush queue. But
74809 + * atom_wait_event() cannot be called with page locked, because
74810 + * it deadlocks with jnode_extent_write(). Unlock page, after
74811 + * making sure (through page_cache_get()) that it cannot be
74812 + * released from memory.
74813 + */
74814 + page_cache_get(pg);
74815 + unlock_page(pg);
74816 + atom_wait_event(atom);
74817 + lock_page(pg);
74818 + /*
74819 +		 * the page may have been detached by ->writepage()->releasepage().
74820 + */
74821 + reiser4_wait_page_writeback(pg);
74822 + spin_lock_jnode(node);
74823 + page_cache_release(pg);
74824 + atom = jnode_get_atom(node);
74825 +/* VS-FIXME-HANS: improve the commenting in this function */
74826 + if (atom == NULL) {
74827 + spin_unlock_jnode(node);
74828 + return;
74829 + }
74830 + }
74831 + uncapture_block(node);
74832 + spin_unlock_atom(atom);
74833 + jput(node);
74834 +}
74835 +
74836 +/* this is used in extent's kill hook to uncapture and unhash jnodes attached to
74837 + * inode's tree of jnodes */
74838 +void uncapture_jnode(jnode * node)
74839 +{
74840 + txn_atom *atom;
74841 +
74842 + assert_spin_locked(&(node->guard));
74843 + assert("", node->pg == 0);
74844 +
74845 + atom = jnode_get_atom(node);
74846 + if (atom == NULL) {
74847 + assert("jmacd-7111", !JF_ISSET(node, JNODE_DIRTY));
74848 + spin_unlock_jnode(node);
74849 + return;
74850 + }
74851 +
74852 + uncapture_block(node);
74853 + spin_unlock_atom(atom);
74854 + jput(node);
74855 +}
74856 +
74857 +/* No-locking version of assign_txnh. Sets the transaction handle's atom pointer,
74858 + increases atom refcount and txnh_count, adds to txnh_list. */
74859 +static void capture_assign_txnh_nolock(txn_atom *atom, txn_handle *txnh)
74860 +{
74861 + assert("umka-200", atom != NULL);
74862 + assert("umka-201", txnh != NULL);
74863 +
74864 + assert_spin_locked(&(txnh->hlock));
74865 + assert_spin_locked(&(atom->alock));
74866 + assert("jmacd-824", txnh->atom == NULL);
74867 + assert("nikita-3540", atom_isopen(atom));
74868 + BUG_ON(txnh->atom != NULL);
74869 +
74870 + atomic_inc(&atom->refcount);
74871 + txnh->atom = atom;
74872 + set_gfp_mask();
74873 + list_add_tail(&txnh->txnh_link, &atom->txnh_list);
74874 + atom->txnh_count += 1;
74875 +}
74876 +
74877 +/* No-locking version of assign_block. Sets the block's atom pointer, references the
74878 + block, adds it to the clean or dirty capture_jnode list, increments capture_count. */
74879 +static void capture_assign_block_nolock(txn_atom *atom, jnode *node)
74880 +{
74881 + assert("umka-202", atom != NULL);
74882 + assert("umka-203", node != NULL);
74883 + assert_spin_locked(&(node->guard));
74884 + assert_spin_locked(&(atom->alock));
74885 + assert("jmacd-323", node->atom == NULL);
74886 + BUG_ON(!list_empty_careful(&node->capture_link));
74887 + assert("nikita-3470", !JF_ISSET(node, JNODE_DIRTY));
74888 +
74889 + /* Pointer from jnode to atom is not counted in atom->refcount. */
74890 + node->atom = atom;
74891 +
74892 + list_add_tail(&node->capture_link, ATOM_CLEAN_LIST(atom));
74893 + atom->capture_count += 1;
74894 + /* reference to jnode is acquired by atom. */
74895 + jref(node);
74896 +
74897 + ON_DEBUG(count_jnode(atom, node, NOT_CAPTURED, CLEAN_LIST, 1));
74898 +
74899 + LOCK_CNT_INC(t_refs);
74900 +}
74901 +
74902 +/* common code for dirtying both unformatted jnodes and formatted znodes. */
74903 +static void do_jnode_make_dirty(jnode * node, txn_atom * atom)
74904 +{
74905 + assert_spin_locked(&(node->guard));
74906 + assert_spin_locked(&(atom->alock));
74907 + assert("jmacd-3981", !JF_ISSET(node, JNODE_DIRTY));
74908 +
74909 + JF_SET(node, JNODE_DIRTY);
74910 +
74911 + get_current_context()->nr_marked_dirty++;
74912 +
74913 +	/* We move one block from grabbed to flush-reserved only if the node
74914 +	   was not CREATED and jnode_flush did not sort it into either the
74915 +	   relocate set or the overwrite one. If the node is in the overwrite or
74916 +	   relocate set we assume that the atom's flush reserved counter was
74917 +	   already adjusted. */
74918 + if (!JF_ISSET(node, JNODE_CREATED) && !JF_ISSET(node, JNODE_RELOC)
74919 + && !JF_ISSET(node, JNODE_OVRWR) && jnode_is_leaf(node)
74920 + && !jnode_is_cluster_page(node)) {
74921 + assert("vs-1093", !blocknr_is_fake(&node->blocknr));
74922 + assert("vs-1506", *jnode_get_block(node) != 0);
74923 + grabbed2flush_reserved_nolock(atom, (__u64) 1);
74924 + JF_SET(node, JNODE_FLUSH_RESERVED);
74925 + }
74926 +
74927 + if (!JF_ISSET(node, JNODE_FLUSH_QUEUED)) {
74928 +		/* Sometimes a node is set dirty before being captured -- the
74929 +		   case for new jnodes. In that case the jnode will be added to
74930 +		   the appropriate list in capture_assign_block_nolock. Another
74931 +		   reason not to re-link the jnode is that it is on a flush
74932 +		   queue (see flush.c for details). */
74934 +
74935 + int level = jnode_get_level(node);
74936 +
74937 + assert("nikita-3152", !JF_ISSET(node, JNODE_OVRWR));
74938 + assert("zam-654", atom->stage < ASTAGE_PRE_COMMIT);
74939 + assert("nikita-2607", 0 <= level);
74940 + assert("nikita-2606", level <= REAL_MAX_ZTREE_HEIGHT);
74941 +
74942 + /* move node to atom's dirty list */
74943 + list_move_tail(&node->capture_link, ATOM_DIRTY_LIST(atom, level));
74944 + ON_DEBUG(count_jnode
74945 + (atom, node, NODE_LIST(node), DIRTY_LIST, 1));
74946 + }
74947 +}
74948 +
74949 +/* Set the dirty status for this (spin locked) jnode. */
74950 +void jnode_make_dirty_locked(jnode * node)
74951 +{
74952 + assert("umka-204", node != NULL);
74953 + assert_spin_locked(&(node->guard));
74954 +
74955 + if (REISER4_DEBUG && rofs_jnode(node)) {
74956 + warning("nikita-3365", "Dirtying jnode on rofs");
74957 + dump_stack();
74958 + }
74959 +
74960 + /* Fast check for already dirty node */
74961 + if (!JF_ISSET(node, JNODE_DIRTY)) {
74962 + txn_atom *atom;
74963 +
74964 + atom = jnode_get_atom(node);
74965 + assert("vs-1094", atom);
74966 + /* Check jnode dirty status again because node spin lock might
74967 + * be released inside jnode_get_atom(). */
74968 + if (likely(!JF_ISSET(node, JNODE_DIRTY)))
74969 + do_jnode_make_dirty(node, atom);
74970 + spin_unlock_atom(atom);
74971 + }
74972 +}
74973 +
74974 +/* Set the dirty status for this znode. */
74975 +void znode_make_dirty(znode * z)
74976 +{
74977 + jnode *node;
74978 + struct page *page;
74979 +
74980 + assert("umka-204", z != NULL);
74981 + assert("nikita-3290", znode_above_root(z) || znode_is_loaded(z));
74982 + assert("nikita-3560", znode_is_write_locked(z));
74983 +
74984 + node = ZJNODE(z);
74985 + /* znode is longterm locked, we can check dirty bit without spinlock */
74986 + if (JF_ISSET(node, JNODE_DIRTY)) {
74987 + /* znode is dirty already. All we have to do is to change znode version */
74988 + z->version = znode_build_version(jnode_get_tree(node));
74989 + return;
74990 + }
74991 +
74992 + spin_lock_jnode(node);
74993 + jnode_make_dirty_locked(node);
74994 + page = jnode_page(node);
74995 + if (page != NULL) {
74996 + /* this is useful assertion (allows one to check that no
74997 + * modifications are lost due to update of in-flight page),
74998 + * but it requires locking on page to check PG_writeback
74999 + * bit. */
75000 + /* assert("nikita-3292",
75001 + !PageWriteback(page) || ZF_ISSET(z, JNODE_WRITEBACK)); */
75002 + page_cache_get(page);
75003 +
75004 + /* jnode lock is not needed for the rest of
75005 + * znode_set_dirty(). */
75006 + spin_unlock_jnode(node);
75007 + /* reiser4 file write code calls set_page_dirty for
75008 + * unformatted nodes, for formatted nodes we do it here. */
75009 + set_page_dirty_internal(page);
75010 + page_cache_release(page);
75011 + /* bump version counter in znode */
75012 + z->version = znode_build_version(jnode_get_tree(node));
75013 + } else {
75014 + assert("zam-596", znode_above_root(JZNODE(node)));
75015 + spin_unlock_jnode(node);
75016 + }
75017 +
75018 + assert("nikita-1900", znode_is_write_locked(z));
75019 + assert("jmacd-9777", node->atom != NULL);
75020 +}
75021 +
75022 +int sync_atom(txn_atom * atom)
75023 +{
75024 + int result;
75025 + txn_handle *txnh;
75026 +
75027 + txnh = get_current_context()->trans;
75028 +
75029 + result = 0;
75030 + if (atom != NULL) {
75031 + if (atom->stage < ASTAGE_PRE_COMMIT) {
75032 + spin_lock_txnh(txnh);
75033 + capture_assign_txnh_nolock(atom, txnh);
75034 + result = force_commit_atom(txnh);
75035 + } else if (atom->stage < ASTAGE_POST_COMMIT) {
75036 +			/* wait for atom commit */
75037 + atom_wait_event(atom);
75038 + /* try once more */
75039 + result = RETERR(-E_REPEAT);
75040 + } else
75041 + spin_unlock_atom(atom);
75042 + }
75043 + return result;
75044 +}
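+/* Editor's sketch (not part of the original patch): a typical retry loop
+ * around sync_atom(). jnode_get_atom() returns the atom spinlocked (or NULL),
+ * which is exactly what sync_atom() consumes, and -E_REPEAT asks the caller
+ * to look the atom up again. example_sync_jnode is a hypothetical name. */
+#if 0	/* illustration only */
+static int example_sync_jnode(jnode *node)
+{
+	int result;
+
+	do {
+		txn_atom *atom;
+
+		spin_lock_jnode(node);
+		atom = jnode_get_atom(node);	/* returns the atom locked */
+		spin_unlock_jnode(node);
+		result = sync_atom(atom);	/* releases the atom lock */
+	} while (result == -E_REPEAT);
+	return result;
+}
+#endif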
75045 +
75046 +#if REISER4_DEBUG
75047 +
75048 +/* move a jnode from one list to another;
75049 +   call this after atom->capture_count is updated */
75050 +void
75051 +count_jnode(txn_atom * atom, jnode * node, atom_list old_list,
75052 + atom_list new_list, int check_lists)
75053 +{
75054 + struct list_head *pos;
75055 +
75056 + assert("zam-1018", atom_is_protected(atom));
75057 + assert_spin_locked(&(node->guard));
75058 + assert("", NODE_LIST(node) == old_list);
75059 +
75060 + switch (NODE_LIST(node)) {
75061 + case NOT_CAPTURED:
75062 + break;
75063 + case DIRTY_LIST:
75064 + assert("", atom->dirty > 0);
75065 + atom->dirty--;
75066 + break;
75067 + case CLEAN_LIST:
75068 + assert("", atom->clean > 0);
75069 + atom->clean--;
75070 + break;
75071 + case FQ_LIST:
75072 + assert("", atom->fq > 0);
75073 + atom->fq--;
75074 + break;
75075 + case WB_LIST:
75076 + assert("", atom->wb > 0);
75077 + atom->wb--;
75078 + break;
75079 + case OVRWR_LIST:
75080 + assert("", atom->ovrwr > 0);
75081 + atom->ovrwr--;
75082 + break;
75083 + default:
75084 + impossible("", "");
75085 + }
75086 +
75087 + switch (new_list) {
75088 + case NOT_CAPTURED:
75089 + break;
75090 + case DIRTY_LIST:
75091 + atom->dirty++;
75092 + break;
75093 + case CLEAN_LIST:
75094 + atom->clean++;
75095 + break;
75096 + case FQ_LIST:
75097 + atom->fq++;
75098 + break;
75099 + case WB_LIST:
75100 + atom->wb++;
75101 + break;
75102 + case OVRWR_LIST:
75103 + atom->ovrwr++;
75104 + break;
75105 + default:
75106 + impossible("", "");
75107 + }
75108 + ASSIGN_NODE_LIST(node, new_list);
75109 + if (0 && check_lists) {
75110 + int count;
75111 + tree_level level;
75112 +
75113 + count = 0;
75114 +
75115 + /* flush queue list */
75116 + /*check_fq(atom); */
75117 +
75118 + /* dirty list */
75119 + count = 0;
75120 + for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1) {
75121 + list_for_each(pos, ATOM_DIRTY_LIST(atom, level))
75122 + count++;
75123 + }
75124 + if (count != atom->dirty)
75125 + warning("", "dirty counter %d, real %d\n", atom->dirty,
75126 + count);
75127 +
75128 + /* clean list */
75129 + count = 0;
75130 + list_for_each(pos, ATOM_CLEAN_LIST(atom))
75131 + count++;
75132 + if (count != atom->clean)
75133 + warning("", "clean counter %d, real %d\n", atom->clean,
75134 + count);
75135 +
75136 + /* wb list */
75137 + count = 0;
75138 + list_for_each(pos, ATOM_WB_LIST(atom))
75139 + count++;
75140 + if (count != atom->wb)
75141 + warning("", "wb counter %d, real %d\n", atom->wb,
75142 + count);
75143 +
75144 + /* overwrite list */
75145 + count = 0;
75146 + list_for_each(pos, ATOM_OVRWR_LIST(atom))
75147 + count++;
75148 +
75149 + if (count != atom->ovrwr)
75150 + warning("", "ovrwr counter %d, real %d\n", atom->ovrwr,
75151 + count);
75152 + }
75153 + assert("vs-1624", atom->num_queued == atom->fq);
75154 + if (atom->capture_count !=
75155 + atom->dirty + atom->clean + atom->ovrwr + atom->wb + atom->fq) {
75156 + printk
75157 + ("count %d, dirty %d clean %d ovrwr %d wb %d fq %d\n",
75158 + atom->capture_count, atom->dirty, atom->clean, atom->ovrwr,
75159 + atom->wb, atom->fq);
75160 + assert("vs-1622",
75161 + atom->capture_count ==
75162 + atom->dirty + atom->clean + atom->ovrwr + atom->wb +
75163 + atom->fq);
75164 + }
75165 +}
75166 +
75167 +#endif
75168 +
75169 +/* Make the node OVRWR and put it on the atom->ovrwr_nodes list; the atom lock and
75170 + * jnode lock should be taken before calling this function. */
75171 +void jnode_make_wander_nolock(jnode * node)
75172 +{
75173 + txn_atom *atom;
75174 +
75175 + assert("nikita-2431", node != NULL);
75176 + assert("nikita-2432", !JF_ISSET(node, JNODE_RELOC));
75177 + assert("nikita-3153", JF_ISSET(node, JNODE_DIRTY));
75178 + assert("zam-897", !JF_ISSET(node, JNODE_FLUSH_QUEUED));
75179 + assert("nikita-3367", !blocknr_is_fake(jnode_get_block(node)));
75180 +
75181 + atom = node->atom;
75182 +
75183 + assert("zam-895", atom != NULL);
75184 + assert("zam-894", atom_is_protected(atom));
75185 +
75186 + JF_SET(node, JNODE_OVRWR);
75187 + /* move node to atom's overwrite list */
75188 + list_move_tail(&node->capture_link, ATOM_OVRWR_LIST(atom));
75189 + ON_DEBUG(count_jnode(atom, node, DIRTY_LIST, OVRWR_LIST, 1));
75190 +}
75191 +
75192 +/* Same as jnode_make_wander_nolock, but all necessary locks are taken inside
75193 + * this function. */
75194 +void jnode_make_wander(jnode * node)
75195 +{
75196 + txn_atom *atom;
75197 +
75198 + spin_lock_jnode(node);
75199 + atom = jnode_get_atom(node);
75200 + assert("zam-913", atom != NULL);
75201 + assert("zam-914", !JF_ISSET(node, JNODE_RELOC));
75202 +
75203 + jnode_make_wander_nolock(node);
75204 + spin_unlock_atom(atom);
75205 + spin_unlock_jnode(node);
75206 +}
75207 +
75208 +/* this just sets RELOC bit */
75209 +static void jnode_make_reloc_nolock(flush_queue_t * fq, jnode * node)
75210 +{
75211 + assert_spin_locked(&(node->guard));
75212 + assert("zam-916", JF_ISSET(node, JNODE_DIRTY));
75213 + assert("zam-917", !JF_ISSET(node, JNODE_RELOC));
75214 + assert("zam-918", !JF_ISSET(node, JNODE_OVRWR));
75215 + assert("zam-920", !JF_ISSET(node, JNODE_FLUSH_QUEUED));
75216 + assert("nikita-3367", !blocknr_is_fake(jnode_get_block(node)));
75217 + jnode_set_reloc(node);
75218 +}
75219 +
75220 +/* Make znode RELOC and put it on flush queue */
75221 +void znode_make_reloc(znode * z, flush_queue_t * fq)
75222 +{
75223 + jnode *node;
75224 + txn_atom *atom;
75225 +
75226 + node = ZJNODE(z);
75227 + spin_lock_jnode(node);
75228 +
75229 + atom = jnode_get_atom(node);
75230 + assert("zam-919", atom != NULL);
75231 +
75232 + jnode_make_reloc_nolock(fq, node);
75233 + queue_jnode(fq, node);
75234 +
75235 + spin_unlock_atom(atom);
75236 + spin_unlock_jnode(node);
75237 +
75238 +}
75239 +
75240 +/* Make unformatted node RELOC and put it on flush queue */
75241 +void unformatted_make_reloc(jnode *node, flush_queue_t *fq)
75242 +{
75243 + assert("vs-1479", jnode_is_unformatted(node));
75244 +
75245 + jnode_make_reloc_nolock(fq, node);
75246 + queue_jnode(fq, node);
75247 +}
75248 +
75249 +int capture_super_block(struct super_block *s)
75250 +{
75251 + int result;
75252 + znode *uber;
75253 + lock_handle lh;
75254 +
75255 + init_lh(&lh);
75256 + result = get_uber_znode(get_tree(s),
75257 + ZNODE_WRITE_LOCK, ZNODE_LOCK_LOPRI, &lh);
75258 + if (result)
75259 + return result;
75260 +
75261 + uber = lh.node;
75262 + /* Grabbing one block for superblock */
75263 + result = reiser4_grab_space_force((__u64) 1, BA_RESERVED);
75264 +	if (result != 0) {
75265 +		/* drop the uber-znode lock on failure as well */
75266 +		done_lh(&lh);
75267 +		return result;
75268 +	}
75266 +
75267 + znode_make_dirty(uber);
75268 +
75269 + done_lh(&lh);
75270 + return 0;
75271 +}
75272 +
75273 +/* Wakeup every handle on the atom's WAITFOR list */
75274 +static void wakeup_atom_waitfor_list(txn_atom * atom)
75275 +{
75276 + txn_wait_links *wlinks;
75277 +
75278 + assert("umka-210", atom != NULL);
75279 +
75280 + /* atom is locked */
75281 + list_for_each_entry(wlinks, &atom->fwaitfor_list, _fwaitfor_link) {
75282 + if (wlinks->waitfor_cb == NULL ||
75283 + wlinks->waitfor_cb(atom, wlinks))
75284 + /* Wake up. */
75285 + reiser4_wake_up(wlinks->_lock_stack);
75286 + }
75287 +}
75288 +
75289 +/* Wakeup every handle on the atom's WAITING list */
75290 +static void wakeup_atom_waiting_list(txn_atom * atom)
75291 +{
75292 + txn_wait_links *wlinks;
75293 +
75294 + assert("umka-211", atom != NULL);
75295 +
75296 + /* atom is locked */
75297 + list_for_each_entry(wlinks, &atom->fwaiting_list, _fwaiting_link) {
75298 + if (wlinks->waiting_cb == NULL ||
75299 + wlinks->waiting_cb(atom, wlinks))
75300 + /* Wake up. */
75301 + reiser4_wake_up(wlinks->_lock_stack);
75302 + }
75303 +}
75304 +
75305 +/* helper function used by capture_fuse_wait() to avoid "spurious wake-ups" */
75306 +static int wait_for_fusion(txn_atom * atom, txn_wait_links * wlinks)
75307 +{
75308 + assert("nikita-3330", atom != NULL);
75309 + assert_spin_locked(&(atom->alock));
75310 +
75311 + /* atom->txnh_count == 1 is for waking waiters up if we are releasing
75312 +	 * the last transaction handle. */
75313 + return atom->stage != ASTAGE_CAPTURE_WAIT || atom->txnh_count == 1;
75314 +}
75315 +
75316 +/* The general purpose of this function is to wait on the first of two possible events.
75317 + The situation is that a handle (and its atom atomh) is blocked trying to capture a
75318 + block (i.e., node) but the node's atom (atomf) is in the CAPTURE_WAIT state. The
75319 + handle's atom (atomh) is not in the CAPTURE_WAIT state. However, atomh could fuse with
75320 + another atom or, due to age, enter the CAPTURE_WAIT state itself, at which point it
75321 + needs to unblock the handle to avoid deadlock. When the txnh is unblocked it will
75322 + proceed and fuse the two atoms in the CAPTURE_WAIT state.
75323 +
75324 + In other words, if either atomh or atomf change state, the handle will be awakened,
75325 + thus there are two lists per atom: WAITING and WAITFOR.
75326 +
75327 + This is also called by capture_assign_txnh with (atomh == NULL) to wait for atomf to
75328 +   close when the handle is not assigned to an atom of its own.
75329 +
75330 + Lock ordering in this method: all four locks are held: JNODE_LOCK, TXNH_LOCK,
75331 + BOTH_ATOM_LOCKS. Result: all four locks are released.
75332 +*/
75333 +static int capture_fuse_wait(txn_handle * txnh, txn_atom * atomf,
75334 + txn_atom * atomh, txn_capture mode)
75335 +{
75336 + int ret;
75337 + txn_wait_links wlinks;
75338 +
75339 + assert("umka-213", txnh != NULL);
75340 + assert("umka-214", atomf != NULL);
75341 +
75342 + if ((mode & TXN_CAPTURE_NONBLOCKING) != 0) {
75343 + spin_unlock_txnh(txnh);
75344 + spin_unlock_atom(atomf);
75345 +
75346 + if (atomh) {
75347 + spin_unlock_atom(atomh);
75348 + }
75349 +
75350 + return RETERR(-E_BLOCK);
75351 + }
75352 +
75353 + /* Initialize the waiting list links. */
75354 + init_wlinks(&wlinks);
75355 +
75356 + /* Add txnh to atomf's waitfor list, unlock atomf. */
75357 + list_add_tail(&wlinks._fwaitfor_link, &atomf->fwaitfor_list);
75358 + wlinks.waitfor_cb = wait_for_fusion;
75359 + atomic_inc(&atomf->refcount);
75360 + spin_unlock_atom(atomf);
75361 +
75362 + if (atomh) {
75363 + /* Add txnh to atomh's waiting list, unlock atomh. */
75364 + list_add_tail(&wlinks._fwaiting_link, &atomh->fwaiting_list);
75365 + atomic_inc(&atomh->refcount);
75366 + spin_unlock_atom(atomh);
75367 + }
75368 +
75369 + /* Go to sleep. */
75370 + spin_unlock_txnh(txnh);
75371 +
75372 + ret = prepare_to_sleep(wlinks._lock_stack);
75373 + if (ret == 0) {
75374 + go_to_sleep(wlinks._lock_stack);
75375 + ret = RETERR(-E_REPEAT);
75376 + }
75377 +
75378 + /* Remove from the waitfor list. */
75379 + spin_lock_atom(atomf);
75380 +
75381 + list_del(&wlinks._fwaitfor_link);
75382 + atom_dec_and_unlock(atomf);
75383 +
75384 + if (atomh) {
75385 + /* Remove from the waiting list. */
75386 + spin_lock_atom(atomh);
75387 + list_del(&wlinks._fwaiting_link);
75388 + atom_dec_and_unlock(atomh);
75389 + }
75390 + return ret;
75391 +}
75392 +
75393 +static void lock_two_atoms(txn_atom * one, txn_atom * two)
75394 +{
75395 + assert("zam-1067", one != two);
75396 +
75397 + /* lock the atom with lesser address first */
75398 + if (one < two) {
75399 + spin_lock_atom(one);
75400 + spin_lock_atom(two);
75401 + } else {
75402 + spin_lock_atom(two);
75403 + spin_lock_atom(one);
75404 + }
75405 +}
75406 +
75407 +
75408 +/* Perform the necessary work to prepare for fusing two atoms, which involves
75409 + * acquiring two atom locks in the proper order. If the node's atom is
75410 + * blocking fusion (i.e., it is in the CAPTURE_WAIT stage) and the handle's
75411 + * atom is not, then the handle's request is put to sleep. If the node's atom
75412 + * is committing, then the node can be copy-on-captured. Otherwise, pick the
75413 + * atom with fewer pointers to be fused into the atom with more pointers and
75414 + * call capture_fuse_into.
75415 + */
75416 +static int capture_init_fusion(jnode *node, txn_handle *txnh, txn_capture mode)
75417 +{
75418 + txn_atom * txnh_atom = txnh->atom;
75419 + txn_atom * block_atom = node->atom;
75420 +
75421 + atomic_inc(&txnh_atom->refcount);
75422 + atomic_inc(&block_atom->refcount);
75423 +
75424 + spin_unlock_txnh(txnh);
75425 + spin_unlock_jnode(node);
75426 +
75427 + lock_two_atoms(txnh_atom, block_atom);
75428 +
75429 + if (txnh->atom != txnh_atom || node->atom != block_atom ) {
75430 + release_two_atoms(txnh_atom, block_atom);
75431 + return RETERR(-E_REPEAT);
75432 + }
75433 +
75434 + atomic_dec(&txnh_atom->refcount);
75435 + atomic_dec(&block_atom->refcount);
75436 +
75437 + assert ("zam-1066", atom_isopen(txnh_atom));
75438 +
75439 + if (txnh_atom->stage >= block_atom->stage ||
75440 + (block_atom->stage == ASTAGE_CAPTURE_WAIT && block_atom->txnh_count == 0)) {
75441 + capture_fuse_into(txnh_atom, block_atom);
75442 + return RETERR(-E_REPEAT);
75443 + }
75444 + spin_lock_txnh(txnh);
75445 + return capture_fuse_wait(txnh, block_atom, txnh_atom, mode);
75446 +}
75447 +
75448 +/* This function splices together two jnode lists (small and large) and sets all jnodes in
75449 +   the small list to point to the large atom. Returns the number of jnodes moved. */
75450 +static int
75451 +capture_fuse_jnode_lists(txn_atom *large, struct list_head *large_head,
75452 + struct list_head *small_head)
75453 +{
75454 + int count = 0;
75455 + jnode *node;
75456 +
75457 + assert("umka-218", large != NULL);
75458 + assert("umka-219", large_head != NULL);
75459 + assert("umka-220", small_head != NULL);
75460 + /* small atom should be locked also. */
75461 + assert_spin_locked(&(large->alock));
75462 +
75463 + /* For every jnode on small's capture list... */
75464 + list_for_each_entry(node, small_head, capture_link) {
75465 + count += 1;
75466 +
75467 + /* With the jnode lock held, update atom pointer. */
75468 + spin_lock_jnode(node);
75469 + node->atom = large;
75470 + spin_unlock_jnode(node);
75471 + }
75472 +
75473 + /* Splice the lists. */
75474 + list_splice_init(small_head, large_head->prev);
75475 +
75476 + return count;
75477 +}
75478 +
75479 +/* This function splices together two txnh lists (small and large) and sets all txn handles in
75480 +   the small list to point to the large atom. Returns the number of handles moved. */
75481 +static int
75482 +capture_fuse_txnh_lists(txn_atom *large, struct list_head *large_head,
75483 + struct list_head *small_head)
75484 +{
75485 + int count = 0;
75486 + txn_handle *txnh;
75487 +
75488 + assert("umka-221", large != NULL);
75489 + assert("umka-222", large_head != NULL);
75490 + assert("umka-223", small_head != NULL);
75491 +
75492 + /* Adjust every txnh to the new atom. */
75493 + list_for_each_entry(txnh, small_head, txnh_link) {
75494 + count += 1;
75495 +
75496 + /* With the txnh lock held, update atom pointer. */
75497 + spin_lock_txnh(txnh);
75498 + txnh->atom = large;
75499 + spin_unlock_txnh(txnh);
75500 + }
75501 +
75502 + /* Splice the txn_handle list. */
75503 + list_splice_init(small_head, large_head->prev);
75504 +
75505 + return count;
75506 +}
75507 +
75508 +/* This function fuses two atoms. The captured nodes and handles belonging to SMALL are
75509 + added to LARGE and their ->atom pointers are all updated. The associated counts are
75510 + updated as well, and any waiting handles belonging to either are awakened. Finally the
75511 + smaller atom's refcount is decremented.
75512 +*/
75513 +static void capture_fuse_into(txn_atom * small, txn_atom * large)
75514 +{
75515 + int level;
75516 + unsigned zcount = 0;
75517 + unsigned tcount = 0;
75518 +
75519 + assert("umka-224", small != NULL);
75520 +	assert("umka-225", large != NULL);
75521 +
75522 + assert_spin_locked(&(large->alock));
75523 + assert_spin_locked(&(small->alock));
75524 +
75525 + assert("jmacd-201", atom_isopen(small));
75526 + assert("jmacd-202", atom_isopen(large));
75527 +
75528 + /* Splice and update the per-level dirty jnode lists */
75529 + for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1) {
75530 + zcount +=
75531 + capture_fuse_jnode_lists(large,
75532 + ATOM_DIRTY_LIST(large, level),
75533 + ATOM_DIRTY_LIST(small, level));
75534 + }
75535 +
75536 + /* Splice and update the [clean,dirty] jnode and txnh lists */
75537 + zcount +=
75538 + capture_fuse_jnode_lists(large, ATOM_CLEAN_LIST(large),
75539 + ATOM_CLEAN_LIST(small));
75540 + zcount +=
75541 + capture_fuse_jnode_lists(large, ATOM_OVRWR_LIST(large),
75542 + ATOM_OVRWR_LIST(small));
75543 + zcount +=
75544 + capture_fuse_jnode_lists(large, ATOM_WB_LIST(large),
75545 + ATOM_WB_LIST(small));
75546 + zcount +=
75547 + capture_fuse_jnode_lists(large, &large->inodes, &small->inodes);
75548 + tcount +=
75549 + capture_fuse_txnh_lists(large, &large->txnh_list,
75550 + &small->txnh_list);
75551 +
75552 + /* Check our accounting. */
75553 + assert("jmacd-1063",
75554 + zcount + small->num_queued == small->capture_count);
75555 + assert("jmacd-1065", tcount == small->txnh_count);
75556 +
75557 +	/* sum the numbers of waiting threads */
75558 + large->nr_waiters += small->nr_waiters;
75559 + small->nr_waiters = 0;
75560 +
75561 + /* splice flush queues */
75562 + fuse_fq(large, small);
75563 +
75564 +	/* update the jnode counters for each of the atom's lists */
75565 + ON_DEBUG(large->dirty += small->dirty;
75566 + small->dirty = 0;
75567 + large->clean += small->clean;
75568 + small->clean = 0;
75569 + large->ovrwr += small->ovrwr;
75570 + small->ovrwr = 0;
75571 + large->wb += small->wb;
75572 + small->wb = 0;
75573 + large->fq += small->fq;
75574 + small->fq = 0;);
75575 +
75576 + /* count flushers in result atom */
75577 + large->nr_flushers += small->nr_flushers;
75578 + small->nr_flushers = 0;
75579 +
75580 + /* update counts of flushed nodes */
75581 + large->flushed += small->flushed;
75582 + small->flushed = 0;
75583 +
75584 + /* Transfer list counts to large. */
75585 + large->txnh_count += small->txnh_count;
75586 + large->capture_count += small->capture_count;
75587 +
75588 + /* Add all txnh references to large. */
75589 + atomic_add(small->txnh_count, &large->refcount);
75590 + atomic_sub(small->txnh_count, &small->refcount);
75591 +
75592 + /* Reset small counts */
75593 + small->txnh_count = 0;
75594 + small->capture_count = 0;
75595 +
75596 + /* Assign the oldest start_time, merge flags. */
75597 + large->start_time = min(large->start_time, small->start_time);
75598 + large->flags |= small->flags;
75599 +
75600 + /* Merge blocknr sets. */
75601 + blocknr_set_merge(&small->delete_set, &large->delete_set);
75602 + blocknr_set_merge(&small->wandered_map, &large->wandered_map);
75603 +
75604 + /* Merge allocated/deleted file counts */
75605 + large->nr_objects_deleted += small->nr_objects_deleted;
75606 + large->nr_objects_created += small->nr_objects_created;
75607 +
75608 + small->nr_objects_deleted = 0;
75609 + small->nr_objects_created = 0;
75610 +
75611 + /* Merge allocated blocks counts */
75612 + large->nr_blocks_allocated += small->nr_blocks_allocated;
75613 +
75614 + large->nr_running_queues += small->nr_running_queues;
75615 + small->nr_running_queues = 0;
75616 +
75617 + /* Merge blocks reserved for overwrite set. */
75618 + large->flush_reserved += small->flush_reserved;
75619 + small->flush_reserved = 0;
75620 +
75621 + if (large->stage < small->stage) {
75622 + /* Large only needs to notify if it has changed state. */
75623 + atom_set_stage(large, small->stage);
75624 + wakeup_atom_waiting_list(large);
75625 + }
75626 +
75627 + atom_set_stage(small, ASTAGE_INVALID);
75628 +
75629 + /* Notify any waiters--small needs to unload its wait lists. Waiters
75630 + actually remove themselves from the list before returning from the
75631 + fuse_wait function. */
75632 + wakeup_atom_waiting_list(small);
75633 +
75634 + /* Unlock atoms */
75635 + spin_unlock_atom(large);
75636 + atom_dec_and_unlock(small);
75637 +}
75638 +
75639 +/* TXNMGR STUFF */
75640 +
75641 +/* Release a block from the atom, reversing the effects of being captured;
75642 +   the atom's reference to the jnode is not dropped here because spin-locks
75643 +   are held. Currently this is only called when the atom commits.
75644 +
75645 +   NOTE: for the same locking reasons this function does not release the
75646 +   (journal) reference to the jnode either; call jput() at some point after
75647 +   calling uncapture_block(). */
75648 +void uncapture_block(jnode * node)
75649 +{
75650 + txn_atom *atom;
75651 +
75652 + assert("umka-226", node != NULL);
75653 + atom = node->atom;
75654 + assert("umka-228", atom != NULL);
75655 +
75656 + assert("jmacd-1021", node->atom == atom);
75657 + assert_spin_locked(&(node->guard));
75658 + assert("jmacd-1023", atom_is_protected(atom));
75659 +
75660 + JF_CLR(node, JNODE_DIRTY);
75661 + JF_CLR(node, JNODE_RELOC);
75662 + JF_CLR(node, JNODE_OVRWR);
75663 + JF_CLR(node, JNODE_CREATED);
75664 + JF_CLR(node, JNODE_WRITEBACK);
75665 + JF_CLR(node, JNODE_REPACK);
75666 +
75667 + list_del_init(&node->capture_link);
75668 + if (JF_ISSET(node, JNODE_FLUSH_QUEUED)) {
75669 + assert("zam-925", atom_isopen(atom));
75670 + assert("vs-1623", NODE_LIST(node) == FQ_LIST);
75671 + ON_DEBUG(atom->num_queued--);
75672 + JF_CLR(node, JNODE_FLUSH_QUEUED);
75673 + }
75674 + atom->capture_count -= 1;
75675 + ON_DEBUG(count_jnode(atom, node, NODE_LIST(node), NOT_CAPTURED, 1));
75676 + node->atom = NULL;
75677 +
75678 + spin_unlock_jnode(node);
75679 + LOCK_CNT_DEC(t_refs);
75680 +}
75681 +
75682 +/* Unconditional insert of jnode into atom's overwrite list. Currently used in
75683 +   bitmap-based allocator code for adding modified bitmap blocks to the
75684 +   transaction. @atom and @node are spin-locked. */
75685 +void insert_into_atom_ovrwr_list(txn_atom * atom, jnode * node)
75686 +{
75687 + assert("zam-538", atom_is_protected(atom));
75688 + assert_spin_locked(&(node->guard));
75689 + assert("zam-899", JF_ISSET(node, JNODE_OVRWR));
75690 + assert("zam-543", node->atom == NULL);
75691 + assert("vs-1433", !jnode_is_unformatted(node) && !jnode_is_znode(node));
75692 +
75693 + list_add(&node->capture_link, ATOM_OVRWR_LIST(atom));
75694 + jref(node);
75695 + node->atom = atom;
75696 + atom->capture_count++;
75697 + ON_DEBUG(count_jnode(atom, node, NODE_LIST(node), OVRWR_LIST, 1));
75698 +}
75699 +
75700 +
75701 +#if REISER4_DEBUG
75702 +
75703 +void info_atom(const char *prefix, const txn_atom * atom)
75704 +{
75705 + if (atom == NULL) {
75706 + printk("%s: no atom\n", prefix);
75707 + return;
75708 + }
75709 +
75710 + printk("%s: refcount: %i id: %i flags: %x txnh_count: %i"
75711 + " capture_count: %i stage: %x start: %lu, flushed: %i\n", prefix,
75712 + atomic_read(&atom->refcount), atom->atom_id, atom->flags,
75713 + atom->txnh_count, atom->capture_count, atom->stage,
75714 + atom->start_time, atom->flushed);
75715 +}
75716 +
75717 +#endif
75718 +
75719 +static int count_deleted_blocks_actor(txn_atom * atom,
75720 + const reiser4_block_nr * a,
75721 + const reiser4_block_nr * b, void *data)
75722 +{
75723 + reiser4_block_nr *counter = data;
75724 +
75725 + assert("zam-995", data != NULL);
75726 + assert("zam-996", a != NULL);
75727 + if (b == NULL)
75728 + *counter += 1;
75729 + else
75730 + *counter += *b;
75731 + return 0;
75732 +}
75733 +
75734 +reiser4_block_nr txnmgr_count_deleted_blocks(void)
75735 +{
75736 + reiser4_block_nr result;
75737 + txn_mgr *tmgr = &get_super_private(reiser4_get_current_sb())->tmgr;
75738 + txn_atom *atom;
75739 +
75740 + result = 0;
75741 +
75742 + spin_lock_txnmgr(tmgr);
75743 + list_for_each_entry(atom, &tmgr->atoms_list, atom_link) {
75744 + spin_lock_atom(atom);
75745 + if (atom_isopen(atom))
75746 + blocknr_set_iterator(
75747 + atom, &atom->delete_set,
75748 + count_deleted_blocks_actor, &result, 0);
75749 + spin_unlock_atom(atom);
75750 + }
75751 + spin_unlock_txnmgr(tmgr);
75752 +
75753 + return result;
75754 +}
75755 +
75756 +/*
75757 + * Local variables:
75758 + * c-indentation-style: "K&R"
75759 + * mode-name: "LC"
75760 + * c-basic-offset: 8
75761 + * tab-width: 8
75762 + * fill-column: 79
75763 + * End:
75764 + */
75765 Index: linux-2.6.16/fs/reiser4/txnmgr.h
75766 ===================================================================
75767 --- /dev/null
75768 +++ linux-2.6.16/fs/reiser4/txnmgr.h
75769 @@ -0,0 +1,704 @@
75770 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
75771 + * reiser4/README */
75772 +
75773 +/* data-types and function declarations for transaction manager. See txnmgr.c
75774 + * for details. */
75775 +
75776 +#ifndef __REISER4_TXNMGR_H__
75777 +#define __REISER4_TXNMGR_H__
75778 +
75779 +#include "forward.h"
75780 +#include "dformat.h"
75781 +
75782 +#include <linux/fs.h>
75783 +#include <linux/mm.h>
75784 +#include <linux/types.h>
75785 +#include <linux/spinlock.h>
75786 +#include <asm/atomic.h>
75787 +#include <asm/semaphore.h>
75788 +
75789 +/* TYPE DECLARATIONS */
75790 +
75791 +/* This enumeration describes the possible types of a capture request (try_capture).
75792 + A capture request dynamically assigns a block to the calling thread's transaction
75793 + handle. */
75794 +typedef enum {
75795 + /* A READ_ATOMIC request indicates that a block will be read and that the caller's
75796 + atom should fuse in order to ensure that the block commits atomically with the
75797 + caller. */
75798 + TXN_CAPTURE_READ_ATOMIC = (1 << 0),
75799 +
75800 + /* A READ_NONCOM request indicates that a block will be read and that the caller is
75801 + willing to read a non-committed block without causing atoms to fuse. */
75802 + TXN_CAPTURE_READ_NONCOM = (1 << 1),
75803 +
75804 + /* A READ_MODIFY request indicates that a block will be read but that the caller
75805 + wishes for the block to be captured as it will be written. This capture request
75806 + mode is not currently used, but eventually it will be useful for preventing
75807 + deadlock in read-modify-write cycles. */
75808 + TXN_CAPTURE_READ_MODIFY = (1 << 2),
75809 +
75810 + /* A WRITE capture request indicates that a block will be modified and that atoms
75811 + should fuse to make the commit atomic. */
75812 + TXN_CAPTURE_WRITE = (1 << 3),
75813 +
75814 + /* CAPTURE_TYPES is a mask of the four above capture types, used to separate the
75815 + exclusive type designation from extra bits that may be supplied -- see
75816 + below. */
75817 + TXN_CAPTURE_TYPES = (TXN_CAPTURE_READ_ATOMIC |
75818 + TXN_CAPTURE_READ_NONCOM | TXN_CAPTURE_READ_MODIFY |
75819 + TXN_CAPTURE_WRITE),
75820 +
75821 + /* A subset of CAPTURE_TYPES, CAPTURE_WTYPES is a mask of request types that
75822 + indicate modification will occur. */
75823 + TXN_CAPTURE_WTYPES = (TXN_CAPTURE_READ_MODIFY | TXN_CAPTURE_WRITE),
75824 +
75825 + /* An option to try_capture, NONBLOCKING indicates that the caller would
75826 + prefer not to sleep waiting for an aging atom to commit. */
75827 + TXN_CAPTURE_NONBLOCKING = (1 << 4),
75828 +
75829 +	/* An option to try_capture to prevent atom fusion; only simple capturing is allowed */
75830 + TXN_CAPTURE_DONT_FUSE = (1 << 5)
75831 +
75832 + /* This macro selects only the exclusive capture request types, stripping out any
75833 + options that were supplied (i.e., NONBLOCKING). */
75834 +#define CAPTURE_TYPE(x) ((x) & TXN_CAPTURE_TYPES)
75835 +} txn_capture;
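
For illustration, a capture request is built by OR-ing one exclusive type with option bits, and CAPTURE_TYPE() strips the options back off. A minimal caller-side sketch (hypothetical function, using only the values declared above):

    /* hypothetical sketch, not part of the patch */
    static void capture_mode_example(void)
    {
            /* request a write capture but refuse to sleep on an aging atom */
            txn_capture mode = TXN_CAPTURE_WRITE | TXN_CAPTURE_NONBLOCKING;

            if (CAPTURE_TYPE(mode) == TXN_CAPTURE_WRITE) {
                    /* the exclusive type is WRITE; NONBLOCKING is only an
                     * option bit and is masked out by CAPTURE_TYPE() */
            }
    }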
75836 +
75837 +/* There are two kinds of transaction handle: WRITE_FUSING and READ_FUSING, the only
75838 + difference is in the handling of read requests. A WRITE_FUSING transaction handle
75839 +   defaults read capture requests to TXN_CAPTURE_READ_NONCOM whereas a READ_FUSING
75840 + transaction handle defaults to TXN_CAPTURE_READ_ATOMIC. */
75841 +typedef enum {
75842 + TXN_WRITE_FUSING = (1 << 0),
75843 + TXN_READ_FUSING = (1 << 1) | TXN_WRITE_FUSING, /* READ implies WRITE */
75844 +} txn_mode;
75845 +
75846 +/* Every atom has a stage, which is one of these exclusive values: */
75847 +typedef enum {
75848 + /* Initially an atom is free. */
75849 + ASTAGE_FREE = 0,
75850 +
75851 + /* An atom begins by entering the CAPTURE_FUSE stage, where it proceeds to capture
75852 + blocks and fuse with other atoms. */
75853 + ASTAGE_CAPTURE_FUSE = 1,
75854 +
75855 +	/* We need an ASTAGE_CAPTURE_SLOW stage, in which an atom fuses with one node for every X nodes it flushes to disk, where X > 1. */
75856 +
75857 + /* When an atom reaches a certain age it must do all it can to commit. An atom in
75858 + the CAPTURE_WAIT stage refuses new transaction handles and prevents fusion from
75859 + atoms in the CAPTURE_FUSE stage. */
75860 + ASTAGE_CAPTURE_WAIT = 2,
75861 +
75862 + /* Waiting for I/O before commit. Copy-on-capture (see
75863 + http://namesys.com/v4/v4.html). */
75864 + ASTAGE_PRE_COMMIT = 3,
75865 +
75866 + /* Post-commit overwrite I/O. Steal-on-capture. */
75867 + ASTAGE_POST_COMMIT = 4,
75868 +
75869 +	/* Atom which waits for the last reference to it to be removed
75870 +	 * before it is deleted from memory */
75871 + ASTAGE_DONE = 5,
75872 +
75873 + /* invalid atom. */
75874 + ASTAGE_INVALID = 6,
75875 +
75876 +} txn_stage;
75877 +
75878 +/* Certain flags may be set in the txn_atom->flags field. */
75879 +typedef enum {
75880 + /* Indicates that the atom should commit as soon as possible. */
75881 + ATOM_FORCE_COMMIT = (1 << 0),
75882 + /* to avoid endless loop, mark the atom (which was considered as too
75883 + * small) after failed attempt to fuse it. */
75884 + ATOM_CANCEL_FUSION = (1 << 1)
75885 +} txn_flags;
75886 +
75887 +/* Flags for controlling commit_txnh */
75888 +typedef enum {
75889 + /* Wait commit atom completion in commit_txnh */
75890 + TXNH_WAIT_COMMIT = 0x2,
75891 + /* Don't commit atom when this handle is closed */
75892 + TXNH_DONT_COMMIT = 0x4
75893 +} txn_handle_flags_t;
75894 +
75895 +/* TYPE DEFINITIONS */
75896 +
75897 +/* A note on lock ordering: the handle & jnode spinlock protects reading of their ->atom
75898 + fields, so typically an operation on the atom through either of these objects must (1)
75899 + lock the object, (2) read the atom pointer, (3) lock the atom.
75900 +
75901 + During atom fusion, the process holds locks on both atoms at once. Then, it iterates
75902 + through the list of handles and pages held by the smaller of the two atoms. For each
75903 + handle and page referencing the smaller atom, the fusing process must: (1) lock the
75904 + object, and (2) update the atom pointer.
75905 +
75906 + You can see that there is a conflict of lock ordering here, so the more-complex
75907 + procedure should have priority, i.e., the fusing process has priority so that it is
75908 + guaranteed to make progress and to avoid restarts.
75909 +
75910 +   This decision, however, means additional complexity for acquiring the atom lock in the
75911 +   first place.
75912 +
75913 +   The original procedure followed in the code was:
75914 +
75915 + TXN_OBJECT *obj = ...;
75916 + TXN_ATOM *atom;
75917 +
75918 + spin_lock (& obj->_lock);
75919 +
75920 + atom = obj->_atom;
75921 +
75922 + if (! spin_trylock_atom (atom))
75923 + {
75924 + spin_unlock (& obj->_lock);
75925 + RESTART OPERATION, THERE WAS A RACE;
75926 + }
75927 +
75928 + ELSE YOU HAVE BOTH ATOM AND OBJ LOCKED
75929 +
75930 +   It has, however, been found that this wastes a lot of CPU in a manner
75931 +   that is hard to profile. So proper refcounting was added to atoms, and
75932 +   the new standard locking sequence is as follows:
75933 +
75934 + TXN_OBJECT *obj = ...;
75935 + TXN_ATOM *atom;
75936 +
75937 + spin_lock (& obj->_lock);
75938 +
75939 + atom = obj->_atom;
75940 +
75941 + if (! spin_trylock_atom (atom))
75942 + {
75943 + atomic_inc (& atom->refcount);
75944 + spin_unlock (& obj->_lock);
75945 + spin_lock (&atom->_lock);
75946 + atomic_dec (& atom->refcount);
75947 + // HERE atom is locked
75948 + spin_unlock (&atom->_lock);
75949 + RESTART OPERATION, THERE WAS A RACE;
75950 + }
75951 +
75952 + ELSE YOU HAVE BOTH ATOM AND OBJ LOCKED
75953 +
75954 + (core of this is implemented in trylock_throttle() function)
75955 +
75956 + See the jnode_get_atom() function for a common case.
75957 +
75958 +   As an additional (and important) optimization that avoids restarts, it
75959 +   is possible to re-check the required pre-conditions at the HERE point in
75960 +   the code above and proceed without restarting if they are still satisfied.
75961 +*/
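
A minimal sketch of the refcounted sequence above as a real function. This is hypothetical: the spin_*_atom() wrappers are the ones defined later in this header, while spin_lock_jnode()/spin_unlock_jnode() are assumed to exist elsewhere with the usual semantics.

    static txn_atom *lock_jnode_atom_sketch(jnode *node)
    {
            txn_atom *atom;

            while (1) {
                    spin_lock_jnode(node);
                    atom = node->atom;
                    if (atom == NULL) {
                            spin_unlock_jnode(node);
                            return NULL;
                    }
                    if (spin_trylock_atom(atom))
                            return atom;    /* jnode and atom both locked */

                    /* pin the atom so it cannot be freed, drop the jnode
                     * lock, take the atom lock in the proper order, then
                     * restart the whole operation */
                    atomic_inc(&atom->refcount);
                    spin_unlock_jnode(node);
                    spin_lock_atom(atom);
                    /* HERE pre-conditions could be re-checked to avoid
                     * the restart (see the comment above) */
                    atom_dec_and_unlock(atom);
            }
    }

See jnode_get_atom() for the production version of this pattern.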
75962 +
75963 +/* A block number set consists of only the list head. */
75964 +struct blocknr_set {
75965 + struct list_head entries;
75966 +};
75967 +
75968 +/* An atomic transaction: this is the underlying system representation
75969 + of a transaction, not the one seen by clients.
75970 +
75971 + Invariants involving this data-type:
75972 +
75973 + [sb-fake-allocated]
75974 +*/
75975 +struct txn_atom {
75976 + /* The spinlock protecting the atom, held during fusion and various other state
75977 + changes. */
75978 + spinlock_t alock;
75979 +
75980 +	/* The atom's reference counter. Incrementing (in case of duplicating
75981 +	   an existing reference, or when we are sure that some other
75982 +	   reference exists) may be done without taking the spinlock;
75983 +	   decrementing the counter requires the spinlock to be held.
75984 +
75985 + Each transaction handle counts in ->refcount. All jnodes count as
75986 + one reference acquired in atom_begin_andlock(), released in
75987 + commit_current_atom().
75988 + */
75989 + atomic_t refcount;
75990 +
75991 + /* The atom_id identifies the atom in persistent records such as the log. */
75992 + __u32 atom_id;
75993 +
75994 + /* Flags holding any of the txn_flags enumerated values (e.g.,
75995 + ATOM_FORCE_COMMIT). */
75996 + __u32 flags;
75997 +
75998 + /* Number of open handles. */
75999 + __u32 txnh_count;
76000 +
76001 +	/* The number of jnodes captured by this atom: the total length of its
76002 +	   capture lists plus the nodes currently on flush queues. */
76003 + __u32 capture_count;
76004 +
76005 +#if REISER4_DEBUG
76006 + int clean;
76007 + int dirty;
76008 + int ovrwr;
76009 + int wb;
76010 + int fq;
76011 +#endif
76012 +
76013 + __u32 flushed;
76014 +
76015 + /* Current transaction stage. */
76016 + txn_stage stage;
76017 +
76018 + /* Start time. */
76019 + unsigned long start_time;
76020 +
76021 + /* The atom's delete set. It collects block numbers of the nodes
76022 + which were deleted during the transaction. */
76023 + blocknr_set delete_set;
76024 +
76025 + /* The atom's wandered_block mapping. */
76026 + blocknr_set wandered_map;
76027 +
76028 +	/* The transaction's list of dirty captured nodes--per level. Indexed
76029 +	   by (level); dirty_nodes[0] is for the znode-above-root. */
76030 + struct list_head dirty_nodes[REAL_MAX_ZTREE_HEIGHT + 1];
76031 +
76032 + /* The transaction's list of clean captured nodes. */
76033 + struct list_head clean_nodes;
76034 +
76035 + /* The atom's overwrite set */
76036 + struct list_head ovrwr_nodes;
76037 +
76038 + /* nodes which are being written to disk */
76039 + struct list_head writeback_nodes;
76040 +
76041 + /* list of inodes */
76042 + struct list_head inodes;
76043 +
76044 + /* List of handles associated with this atom. */
76045 + struct list_head txnh_list;
76046 +
76047 + /* Transaction list link: list of atoms in the transaction manager. */
76048 + struct list_head atom_link;
76049 +
76050 + /* List of handles waiting FOR this atom: see 'capture_fuse_wait' comment. */
76051 + struct list_head fwaitfor_list;
76052 +
76053 + /* List of this atom's handles that are waiting: see 'capture_fuse_wait' comment. */
76054 + struct list_head fwaiting_list;
76055 +
76056 +	/* Numbers of objects which were deleted/created in this transaction,
76057 +	   and thereby the numbers of object IDs released/allocated. */
76058 + int nr_objects_deleted;
76059 + int nr_objects_created;
76060 + /* number of blocks allocated during the transaction */
76061 + __u64 nr_blocks_allocated;
76062 + /* All atom's flush queue objects are on this list */
76063 + struct list_head flush_queues;
76064 +#if REISER4_DEBUG
76065 + /* number of flush queues for this atom. */
76066 + int nr_flush_queues;
76067 + /* Number of jnodes which were removed from atom's lists and put
76068 + on flush_queue */
76069 + int num_queued;
76070 +#endif
76071 + /* number of threads who wait for this atom to complete commit */
76072 + int nr_waiters;
76073 + /* number of threads which do jnode_flush() over this atom */
76074 + int nr_flushers;
76075 +	/* number of flush queues which are IN_USE and whose fq->prepped
76076 +	   jnodes are being submitted to disk by the write_fq() routine. */
76077 + int nr_running_queues;
76078 +	/* A counter of grabbed unformatted nodes; see the description of the
76079 +	 * reiser4 space reservation scheme in block_alloc.c */
76080 + reiser4_block_nr flush_reserved;
76081 +#if REISER4_DEBUG
76082 + void *committer;
76083 +#endif
76084 + struct super_block *super;
76085 +};
76086 +
76087 +#define ATOM_DIRTY_LIST(atom, level) (&(atom)->dirty_nodes[level])
76088 +#define ATOM_CLEAN_LIST(atom) (&(atom)->clean_nodes)
76089 +#define ATOM_OVRWR_LIST(atom) (&(atom)->ovrwr_nodes)
76090 +#define ATOM_WB_LIST(atom) (&(atom)->writeback_nodes)
76091 +#define ATOM_FQ_LIST(fq) (&(fq)->prepped)
76092 +
76093 +#define NODE_LIST(node) (node)->list
76094 +#define ASSIGN_NODE_LIST(node, list) ON_DEBUG(NODE_LIST(node) = list)
76095 +ON_DEBUG(void
76096 + count_jnode(txn_atom *, jnode *, atom_list old_list,
76097 + atom_list new_list, int check_lists));
76098 +
76099 +typedef struct protected_jnodes {
76100 +	struct list_head inatom;	/* link to the atom's list of these structures */
76101 + struct list_head nodes; /* head of list of protected nodes */
76102 +} protected_jnodes;
76103 +
76104 +/* A transaction handle: the client obtains and commits this handle which is assigned by
76105 + the system to a txn_atom. */
76106 +struct txn_handle {
76107 + /* Spinlock protecting ->atom pointer */
76108 + spinlock_t hlock;
76109 +
76110 + /* Flags for controlling commit_txnh() behavior */
76111 + /* from txn_handle_flags_t */
76112 + txn_handle_flags_t flags;
76113 +
76114 + /* Whether it is READ_FUSING or WRITE_FUSING. */
76115 + txn_mode mode;
76116 +
76117 + /* If assigned, the atom it is part of. */
76118 + txn_atom *atom;
76119 +
76120 + /* Transaction list link. Head is in txn_atom. */
76121 + struct list_head txnh_link;
76122 +};
76123 +
76124 +/* The transaction manager: one is contained in the reiser4_super_info_data */
76125 +struct txn_mgr {
76126 + /* A spinlock protecting the atom list, id_count, flush_control */
76127 + spinlock_t tmgr_lock;
76128 +
76129 + /* List of atoms. */
76130 + struct list_head atoms_list;
76131 +
76132 + /* Number of atoms. */
76133 + int atom_count;
76134 +
76135 + /* A counter used to assign atom->atom_id values. */
76136 + __u32 id_count;
76137 +
76138 + /* a semaphore object for commit serialization */
76139 + struct semaphore commit_semaphore;
76140 +
76141 +	/* a list of all txnmgrs served by a particular daemon. */
76142 + struct list_head linkage;
76143 +
76144 + /* description of daemon for this txnmgr */
76145 + ktxnmgrd_context *daemon;
76146 +
76147 + /* parameters. Adjustable through mount options. */
76148 + unsigned int atom_max_size;
76149 + unsigned int atom_max_age;
76150 + unsigned int atom_min_size;
76151 + /* max number of concurrent flushers for one atom, 0 - unlimited. */
76152 + unsigned int atom_max_flushers;
76153 + struct dentry *debugfs_atom_count;
76154 + struct dentry *debugfs_id_count;
76155 +};
76156 +
76157 +/* FUNCTION DECLARATIONS */
76158 +
76159 +/* These are the externally (within Reiser4) visible transaction functions, therefore they
76160 + are prefixed with "txn_". For comments, see txnmgr.c. */
76161 +
76162 +extern int init_txnmgr_static(void);
76163 +extern void done_txnmgr_static(void);
76164 +
76165 +extern void init_txnmgr(txn_mgr *);
76166 +extern void done_txnmgr(txn_mgr *);
76167 +
76168 +extern int txn_reserve(int reserved);
76169 +
76170 +extern void txn_begin(reiser4_context * context);
76171 +extern int txn_end(reiser4_context * context);
76172 +
76173 +extern void txn_restart(reiser4_context * context);
76174 +extern void txn_restart_current(void);
76175 +
76176 +extern int txnmgr_force_commit_all(struct super_block *, int);
76177 +extern int current_atom_should_commit(void);
76178 +
76179 +extern jnode *find_first_dirty_jnode(txn_atom *, int);
76180 +
76181 +extern int commit_some_atoms(txn_mgr *);
76182 +extern int force_commit_atom(txn_handle *);
76183 +extern int flush_current_atom(int, long, long *, txn_atom **, jnode *);
76184 +
76185 +extern int flush_some_atom(jnode *, long *, const struct writeback_control *, int);
76186 +
76187 +extern void atom_set_stage(txn_atom * atom, txn_stage stage);
76188 +
76189 +extern int same_slum_check(jnode * base, jnode * check, int alloc_check,
76190 + int alloc_value);
76191 +extern void atom_dec_and_unlock(txn_atom * atom);
76192 +
76193 +extern int try_capture(jnode * node, znode_lock_mode mode, txn_capture flags);
76194 +extern int try_capture_page_to_invalidate(struct page *pg);
76195 +
76196 +extern void uncapture_page(struct page *pg);
76197 +extern void uncapture_block(jnode *);
76198 +extern void uncapture_jnode(jnode *);
76199 +
76200 +extern int capture_inode(struct inode *);
76201 +extern int uncapture_inode(struct inode *);
76202 +
76203 +extern txn_atom *get_current_atom_locked_nocheck(void);
76204 +
76205 +#if REISER4_DEBUG
76206 +
76207 +/**
76208 + * atom_is_protected - make sure that nobody but us can do anything with atom
76209 + * @atom: atom to be checked
76210 + *
76211 + * This is used to assert that the atom has either entered the commit
76212 + * stages or is spin-locked.
76213 + */
76214 +static inline int atom_is_protected(txn_atom *atom)
76215 +{
76216 + if (atom->stage >= ASTAGE_PRE_COMMIT)
76217 + return 1;
76218 + assert_spin_locked(&(atom->alock));
76219 + return 1;
76220 +}
76221 +
76222 +#endif
76223 +
76224 +/* Get the current atom and spin-lock it. The current atom must be present, so this never returns NULL. */
76225 +static inline txn_atom *get_current_atom_locked(void)
76226 +{
76227 + txn_atom *atom;
76228 +
76229 + atom = get_current_atom_locked_nocheck();
76230 + assert("zam-761", atom != NULL);
76231 +
76232 + return atom;
76233 +}
76234 +
76235 +extern txn_atom *jnode_get_atom(jnode *);
76236 +
76237 +extern void atom_wait_event(txn_atom *);
76238 +extern void atom_send_event(txn_atom *);
76239 +
76240 +extern void insert_into_atom_ovrwr_list(txn_atom * atom, jnode * node);
76241 +extern int capture_super_block(struct super_block *s);
76242 +int capture_bulk(jnode **, int count);
76243 +
76244 +/* See the comment on the function blocknrset.c:blocknr_set_add for the
76245 + calling convention of these three routines. */
76246 +extern void blocknr_set_init(blocknr_set * bset);
76247 +extern void blocknr_set_destroy(blocknr_set * bset);
76248 +extern void blocknr_set_merge(blocknr_set * from, blocknr_set * into);
76249 +extern int blocknr_set_add_extent(txn_atom * atom,
76250 + blocknr_set * bset,
76251 + blocknr_set_entry ** new_bsep,
76252 + const reiser4_block_nr * start,
76253 + const reiser4_block_nr * len);
76254 +extern int blocknr_set_add_pair(txn_atom * atom, blocknr_set * bset,
76255 + blocknr_set_entry ** new_bsep,
76256 + const reiser4_block_nr * a,
76257 + const reiser4_block_nr * b);
76258 +
76259 +typedef int (*blocknr_set_actor_f) (txn_atom *, const reiser4_block_nr *,
76260 + const reiser4_block_nr *, void *);
76261 +
76262 +extern int blocknr_set_iterator(txn_atom * atom, blocknr_set * bset,
76263 + blocknr_set_actor_f actor, void *data,
76264 + int delete);
76265 +
76266 +/* flush code takes care about how to fuse flush queues */
76267 +extern void flush_init_atom(txn_atom * atom);
76268 +extern void flush_fuse_queues(txn_atom * large, txn_atom * small);
76269 +
76270 +static inline void spin_lock_atom(txn_atom *atom)
76271 +{
76272 + /* check that spinlocks of lower priorities are not held */
76273 + assert("", (LOCK_CNT_NIL(spin_locked_txnh) &&
76274 + LOCK_CNT_NIL(spin_locked_jnode) &&
76275 + LOCK_CNT_NIL(spin_locked_zlock) &&
76276 + LOCK_CNT_NIL(rw_locked_dk) &&
76277 + LOCK_CNT_NIL(rw_locked_tree)));
76278 +
76279 + spin_lock(&(atom->alock));
76280 +
76281 + LOCK_CNT_INC(spin_locked_atom);
76282 + LOCK_CNT_INC(spin_locked);
76283 +}
76284 +
76285 +static inline int spin_trylock_atom(txn_atom *atom)
76286 +{
76287 + if (spin_trylock(&(atom->alock))) {
76288 + LOCK_CNT_INC(spin_locked_atom);
76289 + LOCK_CNT_INC(spin_locked);
76290 + return 1;
76291 + }
76292 + return 0;
76293 +}
76294 +
76295 +static inline void spin_unlock_atom(txn_atom *atom)
76296 +{
76297 + assert_spin_locked(&(atom->alock));
76298 + assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_atom));
76299 + assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
76300 +
76301 + LOCK_CNT_DEC(spin_locked_atom);
76302 + LOCK_CNT_DEC(spin_locked);
76303 +
76304 + spin_unlock(&(atom->alock));
76305 +}
76306 +
76307 +static inline void spin_lock_txnh(txn_handle *txnh)
76308 +{
76309 + /* check that spinlocks of lower priorities are not held */
76310 + assert("", (LOCK_CNT_NIL(rw_locked_dk) &&
76311 + LOCK_CNT_NIL(spin_locked_zlock) &&
76312 + LOCK_CNT_NIL(rw_locked_tree)));
76313 +
76314 + spin_lock(&(txnh->hlock));
76315 +
76316 + LOCK_CNT_INC(spin_locked_txnh);
76317 + LOCK_CNT_INC(spin_locked);
76318 +}
76319 +
76320 +static inline int spin_trylock_txnh(txn_handle *txnh)
76321 +{
76322 + if (spin_trylock(&(txnh->hlock))) {
76323 + LOCK_CNT_INC(spin_locked_txnh);
76324 + LOCK_CNT_INC(spin_locked);
76325 + return 1;
76326 + }
76327 + return 0;
76328 +}
76329 +
76330 +static inline void spin_unlock_txnh(txn_handle *txnh)
76331 +{
76332 + assert_spin_locked(&(txnh->hlock));
76333 + assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_txnh));
76334 + assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
76335 +
76336 + LOCK_CNT_DEC(spin_locked_txnh);
76337 + LOCK_CNT_DEC(spin_locked);
76338 +
76339 + spin_unlock(&(txnh->hlock));
76340 +}
76341 +
76342 +#define spin_ordering_pred_txnmgr(tmgr) \
76343 + ( LOCK_CNT_NIL(spin_locked_atom) && \
76344 + LOCK_CNT_NIL(spin_locked_txnh) && \
76345 + LOCK_CNT_NIL(spin_locked_jnode) && \
76346 + LOCK_CNT_NIL(rw_locked_zlock) && \
76347 + LOCK_CNT_NIL(rw_locked_dk) && \
76348 + LOCK_CNT_NIL(rw_locked_tree) )
76349 +
76350 +static inline void spin_lock_txnmgr(txn_mgr *mgr)
76351 +{
76352 + /* check that spinlocks of lower priorities are not held */
76353 + assert("", (LOCK_CNT_NIL(spin_locked_atom) &&
76354 + LOCK_CNT_NIL(spin_locked_txnh) &&
76355 + LOCK_CNT_NIL(spin_locked_jnode) &&
76356 + LOCK_CNT_NIL(spin_locked_zlock) &&
76357 + LOCK_CNT_NIL(rw_locked_dk) &&
76358 + LOCK_CNT_NIL(rw_locked_tree)));
76359 +
76360 + spin_lock(&(mgr->tmgr_lock));
76361 +
76362 + LOCK_CNT_INC(spin_locked_txnmgr);
76363 + LOCK_CNT_INC(spin_locked);
76364 +}
76365 +
76366 +static inline int spin_trylock_txnmgr(txn_mgr *mgr)
76367 +{
76368 + if (spin_trylock(&(mgr->tmgr_lock))) {
76369 + LOCK_CNT_INC(spin_locked_txnmgr);
76370 + LOCK_CNT_INC(spin_locked);
76371 + return 1;
76372 + }
76373 + return 0;
76374 +}
76375 +
76376 +static inline void spin_unlock_txnmgr(txn_mgr *mgr)
76377 +{
76378 + assert_spin_locked(&(mgr->tmgr_lock));
76379 + assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_txnmgr));
76380 + assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
76381 +
76382 + LOCK_CNT_DEC(spin_locked_txnmgr);
76383 + LOCK_CNT_DEC(spin_locked);
76384 +
76385 + spin_unlock(&(mgr->tmgr_lock));
76386 +}
76387 +
76388 +typedef enum {
76389 + FQ_IN_USE = 0x1
76390 +} flush_queue_state_t;
76391 +
76392 +typedef struct flush_queue flush_queue_t;
76393 +
76394 +/* This is an accumulator for jnodes prepared for writing to disk. A flush queue
76395 + is filled by the jnode_flush() routine, and written to disk under memory
76396 + pressure or at atom commit time. */
76397 +/* LOCKING: fq state and fq->atom are protected by the guard spinlock; the
76398 +   fq->nr_queued field and fq->prepped list can be modified if the atom is
76399 +   spin-locked and the fq object is in the "in-use" state. For read-only
76400 +   traversal of the fq->prepped list and reading of the fq->nr_queued field
76401 +   it is enough to keep the fq "in-use" or only have the atom spin-locked. */
76402 +struct flush_queue {
76403 + /* linkage element is the first in this structure to make debugging
76404 + easier. See field in atom struct for description of list. */
76405 + struct list_head alink;
76406 + /* A spinlock to protect changes of fq state and fq->atom pointer */
76407 + spinlock_t guard;
76408 + /* flush_queue state: [in_use | ready] */
76409 + flush_queue_state_t state;
76410 + /* A list which contains queued nodes, queued nodes are removed from any
76411 + * atom's list and put on this ->prepped one. */
76412 + struct list_head prepped;
76413 + /* number of submitted i/o requests */
76414 + atomic_t nr_submitted;
76415 + /* number of i/o errors */
76416 + atomic_t nr_errors;
76417 + /* An atom this flush queue is attached to */
76418 + txn_atom *atom;
76419 + /* A semaphore for waiting on i/o completion */
76420 + struct semaphore io_sem;
76421 +#if REISER4_DEBUG
76422 +	/* The thread which took this fq into exclusive use; NULL if the fq
76423 +	 * is free. Used for debugging. */
76424 + struct task_struct *owner;
76425 +#endif
76426 +};
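
As a hedged sketch of the write-out cycle over a flush queue, using the helpers declared below. The exact locking contract of fq_by_atom()/fq_put() and the meaning of write_fq()'s last argument are assumptions here, not guarantees:

    /* sketch only: drain one atom's prepped nodes to disk */
    static int write_atom_fq_sketch(txn_atom *atom, long *nr_submitted)
    {
            flush_queue_t *fq;
            int ret;

            /* assumed: takes a spin-locked atom and returns an fq in the
             * "in-use" state */
            ret = fq_by_atom(atom, &fq);
            if (ret)
                    return ret;

            /* submit everything on fq->prepped; the last argument is
             * assumed to request waiting for I/O completion */
            ret = write_fq(fq, nr_submitted, 1);

            fq_put(fq);
            return ret;
    }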
76427 +
76428 +extern int fq_by_atom(txn_atom *, flush_queue_t **);
76429 +extern void fq_put_nolock(flush_queue_t *);
76430 +extern void fq_put(flush_queue_t *);
76431 +extern void fuse_fq(txn_atom * to, txn_atom * from);
76432 +extern void queue_jnode(flush_queue_t *, jnode *);
76433 +extern void mark_jnode_queued(flush_queue_t *, jnode *);
76434 +
76435 +extern int write_fq(flush_queue_t *, long *, int);
76436 +extern int current_atom_finish_all_fq(void);
76437 +extern void init_atom_fq_parts(txn_atom *);
76438 +
76439 +extern reiser4_block_nr txnmgr_count_deleted_blocks(void);
76440 +
76441 +extern void znode_make_dirty(znode * node);
76442 +extern void jnode_make_dirty_locked(jnode * node);
76443 +
76444 +extern int sync_atom(txn_atom * atom);
76445 +
76446 +#if REISER4_DEBUG
76447 +extern int atom_fq_parts_are_clean(txn_atom *);
76448 +#endif
76449 +
76450 +extern void add_fq_to_bio(flush_queue_t *, struct bio *);
76451 +extern flush_queue_t *get_fq_for_current_atom(void);
76452 +
76453 +void protected_jnodes_init(protected_jnodes * list);
76454 +void protected_jnodes_done(protected_jnodes * list);
76455 +void invalidate_list(struct list_head * head);
76456 +
76457 +#if REISER4_DEBUG
76458 +void info_atom(const char *prefix, const txn_atom * atom);
76459 +#else
76460 +#define info_atom(p,a) noop
76461 +#endif
76462 +
76463 +#endif /* __REISER4_TXNMGR_H__ */
76464 +
76465 +/* Make Linus happy.
76466 + Local variables:
76467 + c-indentation-style: "K&R"
76468 + mode-name: "LC"
76469 + c-basic-offset: 8
76470 + tab-width: 8
76471 + fill-column: 120
76472 + End:
76473 +*/
76474 Index: linux-2.6.16/fs/reiser4/type_safe_hash.h
76475 ===================================================================
76476 --- /dev/null
76477 +++ linux-2.6.16/fs/reiser4/type_safe_hash.h
76478 @@ -0,0 +1,320 @@
76479 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
76480 + * reiser4/README */
76481 +
76482 +/* A hash table class that uses hash chains (singly-linked) and is
76483 + parametrized to provide type safety. */
76484 +
76485 +#ifndef __REISER4_TYPE_SAFE_HASH_H__
76486 +#define __REISER4_TYPE_SAFE_HASH_H__
76487 +
76488 +#include "debug.h"
76489 +
76490 +#include <asm/errno.h>
76491 +/* Step 1: Use TYPE_SAFE_HASH_DECLARE() to define the TABLE and LINK objects
76492 +   based on the object type. You need to declare the item type before this
76493 +   macro is used, and define it afterwards. */
76494 +#define TYPE_SAFE_HASH_DECLARE(PREFIX,ITEM_TYPE) \
76495 + \
76496 +typedef struct PREFIX##_hash_table_ PREFIX##_hash_table; \
76497 +typedef struct PREFIX##_hash_link_ PREFIX##_hash_link; \
76498 + \
76499 +struct PREFIX##_hash_table_ \
76500 +{ \
76501 + ITEM_TYPE **_table; \
76502 + __u32 _buckets; \
76503 +}; \
76504 + \
76505 +struct PREFIX##_hash_link_ \
76506 +{ \
76507 + ITEM_TYPE *_next; \
76508 +}
76509 +
76510 +/* Step 2: Define the object type of the hash: give it field of type
76511 + PREFIX_hash_link. */
76512 +
76513 +/* Step 3: Use TYPE_SAFE_HASH_DEFINE to define the hash table interface using
76514 +   the type and field name from step 2. The arguments are:
76515 +
76516 +   ITEM_TYPE The item type being hashed
76517 +   KEY_TYPE  The type of key being hashed
76518 +   KEY_NAME  The name of the key field within the item
76519 +   LINK_NAME The name of the link field within the item, which you must declare with type PREFIX_hash_link
76520 +   HASH_FUNC The name of the hash function (or macro; takes the table and a const pointer to a key)
76521 +   EQ_FUNC   The name of the equality function (or macro; takes const pointers to two keys)
76522 +
76523 + It implements these functions:
76524 +
76525 + prefix_hash_init Initialize the table given its size.
76526 + prefix_hash_insert Insert an item
76527 + prefix_hash_insert_index Insert an item w/ precomputed hash_index
76528 + prefix_hash_find Find an item by key
76529 + prefix_hash_find_index Find an item w/ precomputed hash_index
76530 + prefix_hash_remove Remove an item, returns 1 if found, 0 if not found
76531 + prefix_hash_remove_index Remove an item w/ precomputed hash_index
76532 +
76533 + If you'd like something to be done differently, feel free to ask me
76534 + for modifications. Additional features that could be added but
76535 + have not been:
76536 +
76537 + prefix_hash_remove_key Find and remove an item by key
76538 + prefix_hash_remove_key_index Find and remove an item by key w/ precomputed hash_index
76539 +
76540 +   The hash function receives the table as well as the key, so it can
76541 +   derive the number of buckets from the table itself rather than having
76542 +   to know it by other means.
76543 +
76544 + This hash table uses a single-linked hash chain. This means
76545 + insertion is fast but deletion requires searching the chain.
76546 +
76547 + There is also the doubly-linked hash chain approach, under which
76548 + deletion requires no search but the code is longer and it takes two
76549 + pointers per item.
76550 +
76551 + The circularly-linked approach has the shortest code but requires
76552 + two pointers per bucket, doubling the size of the bucket array (in
76553 + addition to two pointers per item).
76554 +*/
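
For example, a hypothetical item type keyed by a __u32 would go through the three steps like this (illustrative names only; "thing", thing_hash and thing_eq are not part of reiser4):

    typedef struct thing thing;

    TYPE_SAFE_HASH_DECLARE(thing, thing);          /* step 1 */

    struct thing {                                 /* step 2 */
            __u32 key;
            thing_hash_link link;
    };

    #define thing_hash(table, k)    (*(k) % (table)->_buckets)
    #define thing_eq(k1, k2)        (*(k1) == *(k2))

    TYPE_SAFE_HASH_DEFINE(thing, thing, __u32,     /* step 3 */
                          key, link, thing_hash, thing_eq);

After this, thing_hash_init(), thing_hash_insert(), thing_hash_find() and the rest are available as ordinary inline functions.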
76555 +#define TYPE_SAFE_HASH_DEFINE(PREFIX,ITEM_TYPE,KEY_TYPE,KEY_NAME,LINK_NAME,HASH_FUNC,EQ_FUNC) \
76556 + \
76557 +static __inline__ void \
76558 +PREFIX##_check_hash (PREFIX##_hash_table *table UNUSED_ARG, \
76559 + __u32 hash UNUSED_ARG) \
76560 +{ \
76561 + assert("nikita-2780", hash < table->_buckets); \
76562 +} \
76563 + \
76564 +static __inline__ int \
76565 +PREFIX##_hash_init (PREFIX##_hash_table *hash, \
76566 + __u32 buckets) \
76567 +{ \
76568 + hash->_table = (ITEM_TYPE**) KMALLOC (sizeof (ITEM_TYPE*) * buckets); \
76569 + hash->_buckets = buckets; \
76570 + if (hash->_table == NULL) \
76571 + { \
76572 + return RETERR(-ENOMEM); \
76573 + } \
76574 + memset (hash->_table, 0, sizeof (ITEM_TYPE*) * buckets); \
76575 + ON_DEBUG(printk(#PREFIX "_hash_table: %i buckets\n", buckets)); \
76576 + return 0; \
76577 +} \
76578 + \
76579 +static __inline__ void \
76580 +PREFIX##_hash_done (PREFIX##_hash_table *hash) \
76581 +{ \
76582 + if (REISER4_DEBUG && hash->_table != NULL) { \
76583 + __u32 i; \
76584 + for (i = 0 ; i < hash->_buckets ; ++ i) \
76585 + assert("nikita-2905", hash->_table[i] == NULL); \
76586 + } \
76587 + if (hash->_table != NULL) \
76588 + KFREE (hash->_table, sizeof (ITEM_TYPE*) * hash->_buckets); \
76589 + hash->_table = NULL; \
76590 +} \
76591 + \
76592 +static __inline__ void \
76593 +PREFIX##_hash_prefetch_next (ITEM_TYPE *item) \
76594 +{ \
76595 + prefetch(item->LINK_NAME._next); \
76596 +} \
76597 + \
76598 +static __inline__ void \
76599 +PREFIX##_hash_prefetch_bucket (PREFIX##_hash_table *hash, \
76600 + __u32 index) \
76601 +{ \
76602 + prefetch(hash->_table[index]); \
76603 +} \
76604 + \
76605 +static __inline__ ITEM_TYPE* \
76606 +PREFIX##_hash_find_index (PREFIX##_hash_table *hash, \
76607 + __u32 hash_index, \
76608 + KEY_TYPE const *find_key) \
76609 +{ \
76610 + ITEM_TYPE *item; \
76611 + \
76612 + PREFIX##_check_hash(hash, hash_index); \
76613 + \
76614 + for (item = hash->_table[hash_index]; \
76615 + item != NULL; \
76616 + item = item->LINK_NAME._next) \
76617 + { \
76618 + prefetch(item->LINK_NAME._next); \
76619 + prefetch(item->LINK_NAME._next + offsetof(ITEM_TYPE, KEY_NAME)); \
76620 + if (EQ_FUNC (& item->KEY_NAME, find_key)) \
76621 + { \
76622 + return item; \
76623 + } \
76624 + } \
76625 + \
76626 + return NULL; \
76627 +} \
76628 + \
76629 +static __inline__ ITEM_TYPE* \
76630 +PREFIX##_hash_find_index_lru (PREFIX##_hash_table *hash, \
76631 + __u32 hash_index, \
76632 + KEY_TYPE const *find_key) \
76633 +{ \
76634 + ITEM_TYPE ** item = &hash->_table[hash_index]; \
76635 + \
76636 + PREFIX##_check_hash(hash, hash_index); \
76637 + \
76638 + while (*item != NULL) { \
76639 + prefetch(&(*item)->LINK_NAME._next); \
76640 + if (EQ_FUNC (&(*item)->KEY_NAME, find_key)) { \
76641 + ITEM_TYPE *found; \
76642 + \
76643 + found = *item; \
76644 + *item = found->LINK_NAME._next; \
76645 + found->LINK_NAME._next = hash->_table[hash_index]; \
76646 + hash->_table[hash_index] = found; \
76647 + return found; \
76648 + } \
76649 + item = &(*item)->LINK_NAME._next; \
76650 + } \
76651 + return NULL; \
76652 +} \
76653 + \
76654 +static __inline__ int \
76655 +PREFIX##_hash_remove_index (PREFIX##_hash_table *hash, \
76656 + __u32 hash_index, \
76657 + ITEM_TYPE *del_item) \
76658 +{ \
76659 + ITEM_TYPE ** hash_item_p = &hash->_table[hash_index]; \
76660 + \
76661 + PREFIX##_check_hash(hash, hash_index); \
76662 + \
76663 + while (*hash_item_p != NULL) { \
76664 + prefetch(&(*hash_item_p)->LINK_NAME._next); \
76665 + if (*hash_item_p == del_item) { \
76666 + *hash_item_p = (*hash_item_p)->LINK_NAME._next; \
76667 + return 1; \
76668 + } \
76669 + hash_item_p = &(*hash_item_p)->LINK_NAME._next; \
76670 + } \
76671 + return 0; \
76672 +} \
76673 + \
76674 +static __inline__ void \
76675 +PREFIX##_hash_insert_index (PREFIX##_hash_table *hash, \
76676 + __u32 hash_index, \
76677 + ITEM_TYPE *ins_item) \
76678 +{ \
76679 + PREFIX##_check_hash(hash, hash_index); \
76680 + \
76681 + ins_item->LINK_NAME._next = hash->_table[hash_index]; \
76682 + hash->_table[hash_index] = ins_item; \
76683 +} \
76684 + \
76685 +static __inline__ void \
76686 +PREFIX##_hash_insert_index_rcu (PREFIX##_hash_table *hash, \
76687 + __u32 hash_index, \
76688 + ITEM_TYPE *ins_item) \
76689 +{ \
76690 + PREFIX##_check_hash(hash, hash_index); \
76691 + \
76692 + ins_item->LINK_NAME._next = hash->_table[hash_index]; \
76693 + smp_wmb(); \
76694 + hash->_table[hash_index] = ins_item; \
76695 +} \
76696 + \
76697 +static __inline__ ITEM_TYPE* \
76698 +PREFIX##_hash_find (PREFIX##_hash_table *hash, \
76699 + KEY_TYPE const *find_key) \
76700 +{ \
76701 + return PREFIX##_hash_find_index (hash, HASH_FUNC(hash, find_key), find_key); \
76702 +} \
76703 + \
76704 +static __inline__ ITEM_TYPE* \
76705 +PREFIX##_hash_find_lru (PREFIX##_hash_table *hash, \
76706 + KEY_TYPE const *find_key) \
76707 +{ \
76708 + return PREFIX##_hash_find_index_lru (hash, HASH_FUNC(hash, find_key), find_key); \
76709 +} \
76710 + \
76711 +static __inline__ int \
76712 +PREFIX##_hash_remove (PREFIX##_hash_table *hash, \
76713 + ITEM_TYPE *del_item) \
76714 +{ \
76715 + return PREFIX##_hash_remove_index (hash, \
76716 + HASH_FUNC(hash, &del_item->KEY_NAME), del_item); \
76717 +} \
76718 + \
76719 +static __inline__ int \
76720 +PREFIX##_hash_remove_rcu (PREFIX##_hash_table *hash, \
76721 + ITEM_TYPE *del_item) \
76722 +{ \
76723 + return PREFIX##_hash_remove (hash, del_item); \
76724 +} \
76725 + \
76726 +static __inline__ void \
76727 +PREFIX##_hash_insert (PREFIX##_hash_table *hash, \
76728 + ITEM_TYPE *ins_item) \
76729 +{ \
76730 + return PREFIX##_hash_insert_index (hash, \
76731 + HASH_FUNC(hash, &ins_item->KEY_NAME), ins_item); \
76732 +} \
76733 + \
76734 +static __inline__ void \
76735 +PREFIX##_hash_insert_rcu (PREFIX##_hash_table *hash, \
76736 + ITEM_TYPE *ins_item) \
76737 +{ \
76738 + return PREFIX##_hash_insert_index_rcu (hash, HASH_FUNC(hash, &ins_item->KEY_NAME), \
76739 + ins_item); \
76740 +} \
76741 + \
76742 +static __inline__ ITEM_TYPE * \
76743 +PREFIX##_hash_first (PREFIX##_hash_table *hash, __u32 ind) \
76744 +{ \
76745 + ITEM_TYPE *first; \
76746 + \
76747 + for (first = NULL; ind < hash->_buckets; ++ ind) { \
76748 + first = hash->_table[ind]; \
76749 + if (first != NULL) \
76750 + break; \
76751 + } \
76752 + return first; \
76753 +} \
76754 + \
76755 +static __inline__ ITEM_TYPE * \
76756 +PREFIX##_hash_next (PREFIX##_hash_table *hash, \
76757 + ITEM_TYPE *item) \
76758 +{ \
76759 + ITEM_TYPE *next; \
76760 + \
76761 + if (item == NULL) \
76762 + return NULL; \
76763 + next = item->LINK_NAME._next; \
76764 + if (next == NULL) \
76765 + next = PREFIX##_hash_first (hash, HASH_FUNC(hash, &item->KEY_NAME) + 1); \
76766 + return next; \
76767 +} \
76768 + \
76769 +typedef struct {} PREFIX##_hash_dummy
76770 +
76771 +#define for_all_ht_buckets(table, head) \
76772 +for ((head) = &(table) -> _table[ 0 ] ; \
76773 + (head) != &(table) -> _table[ (table) -> _buckets ] ; ++ (head))
76774 +
76775 +#define for_all_in_bucket(bucket, item, next, field) \
76776 +for ((item) = *(bucket), (next) = (item) ? (item) -> field._next : NULL ; \
76777 + (item) != NULL ; \
76778 + (item) = (next), (next) = (item) ? (item) -> field._next : NULL )
76779 +
76780 +#define for_all_in_htable(table, prefix, item, next) \
76781 +for ((item) = prefix ## _hash_first ((table), 0), \
76782 + (next) = prefix ## _hash_next ((table), (item)) ; \
76783 + (item) != NULL ; \
76784 + (item) = (next), \
76785 + (next) = prefix ## _hash_next ((table), (item)))
76786 +
76787 +/* __REISER4_TYPE_SAFE_HASH_H__ */
76788 +#endif
76789 +
76790 +/* Make Linus happy.
76791 + Local variables:
76792 + c-indentation-style: "K&R"
76793 + mode-name: "LC"
76794 + c-basic-offset: 8
76795 + tab-width: 8
76796 + fill-column: 120
76797 + End:
76798 +*/
76799 Index: linux-2.6.16/fs/reiser4/vfs_ops.c
76800 ===================================================================
76801 --- /dev/null
76802 +++ linux-2.6.16/fs/reiser4/vfs_ops.c
76803 @@ -0,0 +1,267 @@
76804 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
76805 + * reiser4/README */
76806 +
76807 +/* Interface to VFS. Reiser4 {super|export|dentry}_operations are defined
76808 + here. */
76809 +
76810 +#include "forward.h"
76811 +#include "debug.h"
76812 +#include "dformat.h"
76813 +#include "coord.h"
76814 +#include "plugin/item/item.h"
76815 +#include "plugin/file/file.h"
76816 +#include "plugin/security/perm.h"
76817 +#include "plugin/disk_format/disk_format.h"
76818 +#include "plugin/plugin.h"
76819 +#include "plugin/plugin_set.h"
76820 +#include "plugin/object.h"
76821 +#include "txnmgr.h"
76822 +#include "jnode.h"
76823 +#include "znode.h"
76824 +#include "block_alloc.h"
76825 +#include "tree.h"
76826 +#include "vfs_ops.h"
76827 +#include "inode.h"
76828 +#include "page_cache.h"
76829 +#include "ktxnmgrd.h"
76830 +#include "super.h"
76831 +#include "reiser4.h"
76832 +#include "entd.h"
76833 +#include "status_flags.h"
76834 +#include "flush.h"
76835 +#include "dscale.h"
76836 +
76837 +#include <linux/profile.h>
76838 +#include <linux/types.h>
76839 +#include <linux/mount.h>
76840 +#include <linux/vfs.h>
76841 +#include <linux/mm.h>
76842 +#include <linux/buffer_head.h>
76843 +#include <linux/dcache.h>
76844 +#include <linux/list.h>
76845 +#include <linux/pagemap.h>
76846 +#include <linux/slab.h>
76847 +#include <linux/seq_file.h>
76848 +#include <linux/init.h>
76849 +#include <linux/module.h>
76850 +#include <linux/writeback.h>
76851 +#include <linux/blkdev.h>
76852 +#include <linux/quotaops.h>
76853 +#include <linux/security.h>
76854 +#include <linux/reboot.h>
76855 +#include <linux/rcupdate.h>
76856 +
76857 +
76858 +/* update inode stat-data by calling plugin */
76859 +int reiser4_update_sd(struct inode *object)
76860 +{
76861 + file_plugin *fplug;
76862 +
76863 + assert("nikita-2338", object != NULL);
76864 + /* check for read-only file system. */
76865 + if (IS_RDONLY(object))
76866 + return 0;
76867 +
76868 + fplug = inode_file_plugin(object);
76869 + assert("nikita-2339", fplug != NULL);
76870 + return fplug->write_sd_by_inode(object);
76871 +}
76872 +
76873 +/* helper function: increase inode nlink count and call plugin method to save
76874 + updated stat-data.
76875 +
76876 + Used by link/create and during creation of dot and dotdot in mkdir
76877 +*/
76878 +int reiser4_add_nlink(struct inode *object /* object to which link is added */ ,
76879 +		      struct inode *parent /* parent where new entry
76880 +					    * will be */ ,
76881 +		      int write_sd_p /* true if stat-data has to be
76882 +				      * updated */ )
76883 +{
76884 + file_plugin *fplug;
76885 + int result;
76886 +
76887 + assert("nikita-1351", object != NULL);
76888 +
76889 + fplug = inode_file_plugin(object);
76890 + assert("nikita-1445", fplug != NULL);
76891 +
76892 + /* ask plugin whether it can add yet another link to this
76893 + object */
76894 + if (!fplug->can_add_link(object))
76895 + return RETERR(-EMLINK);
76896 +
76897 + assert("nikita-2211", fplug->add_link != NULL);
76898 + /* call plugin to do actual addition of link */
76899 + result = fplug->add_link(object, parent);
76900 +
76901 + /* optionally update stat data */
76902 + if (result == 0 && write_sd_p)
76903 + result = fplug->write_sd_by_inode(object);
76904 + return result;
76905 +}
76906 +
76907 +/* helper function: decrease inode nlink count and call plugin method to save
76908 + updated stat-data.
76909 +
76910 + Used by unlink/create
76911 +*/
76912 +int reiser4_del_nlink(struct inode *object /* object from which link is
76913 + * removed */ ,
76914 + struct inode *parent /* parent where entry was */ ,
76915 +		      int write_sd_p /* true if stat-data has to be
76916 +				     * updated */ )
76917 +{
76918 + file_plugin *fplug;
76919 + int result;
76920 +
76921 + assert("nikita-1349", object != NULL);
76922 +
76923 + fplug = inode_file_plugin(object);
76924 + assert("nikita-1350", fplug != NULL);
76925 + assert("nikita-1446", object->i_nlink > 0);
76926 + assert("nikita-2210", fplug->rem_link != NULL);
76927 +
76928 + /* call plugin to do actual deletion of link */
76929 + result = fplug->rem_link(object, parent);
76930 +
76931 + /* optionally update stat data */
76932 + if (result == 0 && write_sd_p)
76933 + result = fplug->write_sd_by_inode(object);
76934 + return result;
76935 +}
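+
+/* A minimal usage sketch (hypothetical, compiled out): how a caller such as a
+   link() implementation might combine the two helpers above. example_link()
+   and the entry-insertion step are illustrative assumptions, not part of
+   reiser4. */
+#if 0
+static int example_link(struct inode *object, struct inode *parent)
+{
+	int result;
+
+	/* bump ->i_nlink and write stat-data at once (write_sd_p == 1) */
+	result = reiser4_add_nlink(object, parent, 1);
+	if (result != 0)
+		return result;
+	result = insert_directory_entry(parent, object);	/* hypothetical */
+	if (result != 0)
+		/* undo the link count change on failure */
+		reiser4_del_nlink(object, parent, 1);
+	return result;
+}
+#endif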
76936 +
76937 +
76938 +
76939 +
76940 +/* Release reiser4 dentry. This is d_op->d_release() method. */
76941 +static void reiser4_d_release(struct dentry *dentry /* dentry released */ )
76942 +{
76943 + reiser4_free_dentry_fsdata(dentry);
76944 +}
76945 +
76946 +/*
76947 + * Called by reiser4_sync_inodes(), during speculative write-back (through
76948 + * pdflush, or balance_dirty_pages()).
76949 + */
76950 +void writeout(struct super_block *sb, struct writeback_control *wbc)
76951 +{
76952 + long written = 0;
76953 + int repeats = 0;
76954 + int result;
76955 + struct address_space *mapping;
76956 +
76957 + /*
76958 + * Performs early flushing, trying to free some memory. If there is
76959 + * nothing to flush, commits some atoms.
76960 + */
76961 +
76962 + /* Commit all atoms if reiser4_writepages() is called from sys_sync() or
76963 + sys_fsync(). */
76964 + if (wbc->sync_mode != WB_SYNC_NONE) {
76965 + txnmgr_force_commit_all(sb, 0);
76966 + return;
76967 + }
76968 +
76969 + BUG_ON(get_super_fake(sb) == NULL);
76970 + mapping = get_super_fake(sb)->i_mapping;
76971 + do {
76972 + long nr_submitted = 0;
76973 + jnode *node = NULL;
76974 +
76975 + /* do not submit more requests to avoid overloading the write queue */
76976 + if (wbc->nonblocking &&
76977 + bdi_write_congested(mapping->backing_dev_info)) {
76978 + blk_run_address_space(mapping);
76979 + wbc->encountered_congestion = 1;
76980 + break;
76981 + }
76982 + repeats++;
76983 + BUG_ON(wbc->nr_to_write <= 0);
76984 +
76985 + if (get_current_context()->entd) {
76986 + entd_context *ent = get_entd_context(sb);
76987 +
76988 + if (ent->cur_request->node)
76989 + /*
76990 + * this is ent thread and it managed to capture
76991 + * requested page itself - start flush from
76992 + * that page
76993 + */
76994 + node = jref(ent->cur_request->node);
76995 + }
76996 +
76997 + result = flush_some_atom(node, &nr_submitted, wbc,
76998 + JNODE_FLUSH_WRITE_BLOCKS);
76999 + if (result != 0)
77000 + warning("nikita-31001", "Flush failed: %i", result);
77001 + if (node)
77002 + jput(node);
77003 + if (!nr_submitted)
77004 + break;
77005 +
77006 + wbc->nr_to_write -= nr_submitted;
77007 + written += nr_submitted;
77008 + } while (wbc->nr_to_write > 0);
77009 +}
77010 +
77011 +
77012 +void reiser4_throttle_write(struct inode *inode)
77013 +{
77014 + txn_restart_current();
77015 + balance_dirty_pages_ratelimited(inode->i_mapping);
77016 +}
77017 +
77018 +const char *REISER4_SUPER_MAGIC_STRING = "ReIsEr4";
77019 +const int REISER4_MAGIC_OFFSET = 16 * 4096; /* offset to magic string from the
77020 + * beginning of device */
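+
+/* A sketch (compiled out) of how the two constants above are meant to be used
+   together: check whether a buffer read from byte offset REISER4_MAGIC_OFFSET
+   of a device holds the reiser4 magic. example_is_reiser4() is illustrative;
+   the actual probing is done by the disk format plugin. */
+#if 0
+static int example_is_reiser4(const char *buf /* bytes read at offset
+						* REISER4_MAGIC_OFFSET */)
+{
+	return memcmp(buf, REISER4_SUPER_MAGIC_STRING,
+		      strlen(REISER4_SUPER_MAGIC_STRING)) == 0;
+}
+#endif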
77021 +
77022 +
77023 +
77024 +/*
77025 + * Reiser4 initialization/shutdown.
77026 + *
77027 + * Code below performs global reiser4 initialization that is done either as
77028 + * part of kernel initialization (when reiser4 is statically built-in), or
77029 + * during reiser4 module load (when compiled as module).
77030 + */
77031 +
77032 +
77033 +void reiser4_handle_error(void)
77034 +{
77035 + struct super_block *sb = reiser4_get_current_sb();
77036 +
77037 + if (!sb)
77038 + return;
77039 + reiser4_status_write(REISER4_STATUS_DAMAGED, 0,
77040 + "Filesystem error occured");
77041 + switch (get_super_private(sb)->onerror) {
77042 + case 0:
77043 + reiser4_panic("foobar-42", "Filesystem error occurred\n");
77044 + case 1:
77045 + default:
77046 + if (sb->s_flags & MS_RDONLY)
77047 + return;
77048 + sb->s_flags |= MS_RDONLY;
77049 + break;
77050 + }
77051 +}
77052 +
77053 +struct dentry_operations reiser4_dentry_operations = {
77054 + .d_revalidate = NULL,
77055 + .d_hash = NULL,
77056 + .d_compare = NULL,
77057 + .d_delete = NULL,
77058 + .d_release = reiser4_d_release,
77059 + .d_iput = NULL,
77060 +};
77061 +
77062 +/* Make Linus happy.
77063 + Local variables:
77064 + c-indentation-style: "K&R"
77065 + mode-name: "LC"
77066 + c-basic-offset: 8
77067 + tab-width: 8
77068 + fill-column: 120
77069 + End:
77070 +*/
77071 Index: linux-2.6.16/fs/reiser4/vfs_ops.h
77072 ===================================================================
77073 --- /dev/null
77074 +++ linux-2.6.16/fs/reiser4/vfs_ops.h
77075 @@ -0,0 +1,58 @@
77076 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
77077 + * reiser4/README */
77078 +
77079 +/* vfs_ops.c's exported symbols */
77080 +
77081 +#if !defined( __FS_REISER4_VFS_OPS_H__ )
77082 +#define __FS_REISER4_VFS_OPS_H__
77083 +
77084 +#include "forward.h"
77085 +#include "coord.h"
77086 +#include "seal.h"
77087 +#include "plugin/file/file.h"
77088 +#include "super.h"
77089 +#include "readahead.h"
77090 +
77091 +#include <linux/types.h> /* for loff_t */
77092 +#include <linux/fs.h> /* for struct address_space */
77093 +#include <linux/dcache.h> /* for struct dentry */
77094 +#include <linux/mm.h>
77095 +#include <linux/backing-dev.h>
77096 +
77097 +/* address space operations */
77098 +int reiser4_writepage(struct page *, struct writeback_control *);
77099 +int reiser4_set_page_dirty(struct page *);
77100 +int reiser4_readpages(struct file *, struct address_space *,
77101 + struct list_head *pages, unsigned nr_pages);
77102 +int reiser4_invalidatepage(struct page *, unsigned long offset);
77103 +int reiser4_releasepage(struct page *, gfp_t);
77104 +
77105 +extern int reiser4_update_sd(struct inode *);
77106 +extern int reiser4_add_nlink(struct inode *, struct inode *, int);
77107 +extern int reiser4_del_nlink(struct inode *, struct inode *, int);
77108 +
77109 +
77110 +extern int reiser4_start_up_io(struct page *page);
77111 +extern void reiser4_throttle_write(struct inode *);
77112 +extern int jnode_is_releasable(jnode *);
77113 +
77114 +#define CAPTURE_APAGE_BURST (1024l)
77115 +void writeout(struct super_block *, struct writeback_control *);
77116 +
77117 +
77118 +extern void reiser4_handle_error(void);
77119 +
77120 +
77121 +/* __FS_REISER4_VFS_OPS_H__ */
77122 +#endif
77123 +
77124 +/* Make Linus happy.
77125 + Local variables:
77126 + c-indentation-style: "K&R"
77127 + mode-name: "LC"
77128 + c-basic-offset: 8
77129 + tab-width: 8
77130 + fill-column: 120
77131 + scroll-step: 1
77132 + End:
77133 +*/
77134 Index: linux-2.6.16/fs/reiser4/wander.c
77135 ===================================================================
77136 --- /dev/null
77137 +++ linux-2.6.16/fs/reiser4/wander.c
77138 @@ -0,0 +1,1799 @@
77139 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
77140 + * reiser4/README */
77141 +
77142 +/* Reiser4 Wandering Log */
77143 +
77144 +/* You should read http://www.namesys.com/txn-doc.html
77145 +
77146 + That describes how filesystem operations are performed as atomic
77147 + transactions, and how we try to arrange it so that we can write most of the
77148 + data only once while performing the operation atomically.
77149 +
77150 + For the purposes of this code, it is enough to understand that it is told
77151 + whether a given block should be written once or twice (if twice, then once
77152 + to the wandered location and once to the real location).
77153 +
77154 + This code guarantees that those blocks that are defined to be part of an
77155 + atom either all take effect or none of them take effect.
77156 +
77157 + Relocate set nodes are submitted to write by the jnode_flush() routine, and
77158 + the overwrite set is submitted by reiser4_write_logs(). This is because with
77159 + the overwrite set we seek to optimize writes, and with the relocate set we
77160 + seek to cause disk order to correlate with the parent first pre-order.
77161 +
77162 + reiser4_write_logs() allocates and writes wandered blocks and maintains
77163 + additional on-disk structures of the atom as wander records (each wander
77164 + record occupies one block) for storing the "wandered map" (a table which
77165 + contains a relation between wandered and real block numbers) and other
77166 + information which might be needed at transaction recovery time.
77167 +
77168 + The wander records are unidirectionally linked into a circle: each wander
77169 + record contains a block number of the next wander record, the last wander
77170 + record points to the first one.
77171 +
77172 + One wander record (named "tx head" in this file) has a format which is
77173 + different from the other wander records. The "tx head" has a reference to the
77174 + "tx head" block of the previously committed atom. Also, "tx head" contains
77175 + fs information (the free blocks counter, and the oid allocator state) which
77176 + is logged in a special way.
77177 +
77178 + There are two journal control blocks, named journal header and journal
77179 + footer which have fixed on-disk locations. The journal header has a
77180 + reference to the "tx head" block of the last committed atom. The journal
77181 + footer points to the "tx head" of the last flushed atom. The atom is
77182 + "played" when all blocks from its overwrite set are written to disk the
77183 + second time (i.e. written to their real locations).
77184 +
77185 + NOTE: People who know reiserfs internals and its journal structure might be
77186 + confused by the terms journal footer and journal header. The table below
77187 + maps terms of similar semantics in reiserfs (reiser3) and reiser4:
77188 +
77189 + REISER3 TERM | REISER4 TERM | DESCRIPTION
77190 + --------------------+-----------------------+----------------------------
77191 + commit record | journal header | atomic write of this record
77192 + | | ends transaction commit
77193 + --------------------+-----------------------+----------------------------
77194 + journal header | journal footer | atomic write of this record
77195 + | | ends post-commit writes.
77196 + | | After successful
77197 + | | writing of this record,
77198 + | | journal blocks (reiser3) or
77199 + | | wandered blocks/records
77200 + | | (reiser4) are free for re-use.
77201 + --------------------+-----------------------+----------------------------
77202 +
77203 + The atom commit process is the following:
77204 +
77205 + 1. The overwrite set is taken from atom's clean list, and its size is
77206 + counted.
77207 +
77208 + 2. The number of necessary wander records (including tx head) is calculated,
77209 + and the wander record blocks are allocated.
77210 +
77211 + 3. Allocate wandered blocks and populate wander records with the wandered map.
77212 +
77213 + 4. Submit write requests for wander records and wandered blocks.
77214 +
77215 + 5. Wait until submitted write requests complete.
77216 +
77217 + 6. Update journal header: change the pointer to the block number of the just
77218 + written tx head, submit an i/o for the modified journal header block and
77219 + wait for i/o completion.
77220 +
77221 + NOTE: The special logging for bitmap blocks and some reiser4 super block
77222 + fields makes processes of atom commit, flush and recovering a bit more
77223 + complex (see comments in the source code for details).
77224 +
77225 + The atom playing process is the following:
77226 +
77227 + 1. Write atom's overwrite set in-place.
77228 +
77229 + 2. Wait on i/o.
77230 +
77231 + 3. Update journal footer: change the pointer to the block number of the tx
77232 + head block of the atom we are currently flushing, submit an i/o, wait on
77233 + i/o completion.
77234 +
77235 + 4. Free disk space which was used for wandered blocks and wander records.
77236 +
77237 + After the freeing of wandered blocks and wander records the journal footer
77238 + points to an on-disk structure which might be overwritten soon. Neither the
77239 + log writer nor the journal recovery procedure uses that pointer for
77240 + accessing the data. When the journal recovery procedure looks for the oldest
77241 + transaction it compares the journal footer pointer value with the "prev_tx"
77242 + pointer value in each tx head; when the values are equal, the oldest not yet
77243 + flushed transaction has been found (a sketch follows this comment).
77244 +
77245 + NOTE on disk space leakage: the information about which and how many blocks
77246 + are allocated for wandered blocks and wander records is not written to the
77247 + disk, because of the special logging of bitmaps and some super block
77248 + counters. After a system crash reiser4 does not remember those allocations,
77249 + thus there is no disk space leakage of this kind.
77250 +*/
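+
+/* A sketch (compiled out) of the "oldest transaction" search described in the
+   comment above. tx_head_prev_tx() is a hypothetical helper standing in for
+   jload() of a tx head block plus extraction of its prev_tx field; the real
+   code is in replay_oldest_transaction() below. */
+#if 0
+	reiser4_block_nr cur = last_committed_tx;	/* from journal header */
+
+	while (tx_head_prev_tx(cur) != last_flushed_tx /* from journal footer */)
+		cur = tx_head_prev_tx(cur);
+	/* cur is now the tx head of the oldest not yet played transaction */
+#endif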
77251 +
77252 +/* Special logging of reiser4 super block fields. */
77253 +
77254 +/* There are some reiser4 super block fields (the free block count and the OID
77255 + allocator state (number of files and next free OID)) which are logged
77256 + separately from the super block to avoid unnecessary atom fusion.
77257 +
77258 + So, the reiser4 super block need not be captured by a transaction which
77259 + allocates/deallocates disk blocks or creates/deletes file objects. Moreover,
77260 + the reiser4 on-disk super block is not touched when such a transaction is
77261 + committed and flushed. Those "counters logged specially" are logged in "tx
77262 + head" blocks and in the journal footer block.
77263 +
77264 + A step-by-step description of special logging:
77265 +
77266 + 0. The per-atom information about deleted or created files and allocated or
77267 + freed blocks is collected during the transaction. The atom's
77268 + ->nr_objects_created and ->nr_objects_deleted are for object
77269 + deletion/creation tracking, the numbers of allocated and freed blocks are
77270 + calculated using atom's delete set and atom's capture list -- all new and
77271 + relocated nodes should be on atom's clean list and should have JNODE_RELOC
77272 + bit set.
77273 +
77274 + 1. The "logged specially" reiser4 super block fields have their "committed"
77275 + versions in the reiser4 in-memory super block. They get modified only at
77276 + atom commit time. The atom's commit thread has exclusive access to those
77277 + "committed" fields because the log writer implementation supports only one
77278 + atom commit at a time (there is a per-fs "commit" semaphore). At
77279 + that time the "committed" counters are modified using per-atom information
77280 + collected during the transaction. These counters are stored on disk as
77281 + part of the tx head block when the atom is committed.
77282 +
77283 + 2. When the atom is flushed the value of the free block counter and the OID
77284 + allocator state get written to the journal footer block. A special journal
77285 + procedure (journal_recover_sb_data()) takes those values from the journal
77286 + footer and updates the reiser4 in-memory super block.
77287 +
77288 + NOTE: That means free block count and OID allocator state are logged
77289 + separately from the reiser4 super block regardless of the fact that the
77290 + reiser4 super block has fields to store both the free block counter and the
77291 + OID allocator.
77292 +
77293 + Writing the whole super block at commit time requires knowing the true values
77294 + of all its fields without the changes made by not yet committed transactions.
77295 + This would be possible by keeping a "committed" version of the super block,
77296 + just as the reiser4 bitmap blocks have "committed" and "working" versions.
77297 + However, another scheme was implemented which stores the specially logged
77298 + values in the unused free space inside the transaction head block. In my
77299 + opinion it has the advantage of not writing the whole super block when only
77300 + part of it was modified. */
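+
+/* A condensed sketch (compiled out) of step 1 above as it appears in
+   reiser4_write_logs() further down: the "committed" counters kept in the
+   in-memory super block are updated from the atom and snapshotted into the
+   commit handle, from which format_tx_head() writes them into the tx head. */
+#if 0
+	sbinfo->nr_files_committed += (unsigned)atom->nr_objects_created;
+	sbinfo->nr_files_committed -= (unsigned)atom->nr_objects_deleted;
+	ch.free_blocks = sbinfo->blocks_free_committed;
+	ch.nr_files = sbinfo->nr_files_committed;
+	ch.next_oid = oid_next(super);
+#endif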
77301 +
77302 +#include "debug.h"
77303 +#include "dformat.h"
77304 +#include "txnmgr.h"
77305 +#include "jnode.h"
77306 +#include "znode.h"
77307 +#include "block_alloc.h"
77308 +#include "page_cache.h"
77309 +#include "wander.h"
77310 +#include "reiser4.h"
77311 +#include "super.h"
77312 +#include "vfs_ops.h"
77313 +#include "writeout.h"
77314 +#include "inode.h"
77315 +#include "entd.h"
77316 +
77317 +#include <linux/types.h>
77318 +#include <linux/fs.h> /* for struct super_block */
77319 +#include <linux/mm.h> /* for struct page */
77320 +#include <linux/pagemap.h>
77321 +#include <linux/bio.h> /* for struct bio */
77322 +#include <linux/blkdev.h>
77323 +
77324 +static int write_jnodes_to_disk_extent(
77325 + jnode *, int, const reiser4_block_nr *, flush_queue_t *, int);
77326 +
77327 +/* The commit_handle is a container for objects needed at atom commit time */
77328 +struct commit_handle {
77329 + /* A pointer to atom's list of OVRWR nodes */
77330 + struct list_head *overwrite_set;
77331 + /* atom's overwrite set size */
77332 + int overwrite_set_size;
77333 + /* jnodes for wander record blocks */
77334 + struct list_head tx_list;
77335 + /* number of wander records */
77336 + __u32 tx_size;
77337 + /* 'committed' sb counters are saved here until atom is completely
77338 + flushed */
77339 + __u64 free_blocks;
77340 + __u64 nr_files;
77341 + __u64 next_oid;
77342 + /* A pointer to the atom which is being committed */
77343 + txn_atom *atom;
77344 + /* A pointer to current super block */
77345 + struct super_block *super;
77346 + /* The counter of modified bitmaps */
77347 + reiser4_block_nr nr_bitmap;
77348 +};
77349 +
77350 +static void init_commit_handle(struct commit_handle *ch, txn_atom *atom)
77351 +{
77352 + memset(ch, 0, sizeof(struct commit_handle));
77353 + INIT_LIST_HEAD(&ch->tx_list);
77354 +
77355 + ch->atom = atom;
77356 + ch->super = reiser4_get_current_sb();
77357 +}
77358 +
77359 +static void done_commit_handle(struct commit_handle *ch)
77360 +{
77361 + assert("zam-690", list_empty(&ch->tx_list));
77362 +}
77363 +
77364 +static inline int reiser4_use_write_barrier(struct super_block * s)
77365 +{
77366 + return !reiser4_is_set(s, REISER4_NO_WRITE_BARRIER);
77367 +}
77368 +
77369 +static void disable_write_barrier(struct super_block * s)
77370 +{
77371 + notice("zam-1055", "%s does not support write barriers,"
77372 + " using synchronous write instead.", s->s_id);
77373 + set_bit((int)REISER4_NO_WRITE_BARRIER, &get_super_private(s)->fs_flags);
77374 +}
77375 +
77376 +
77377 +/* fill journal header block data */
77378 +static void format_journal_header(struct commit_handle *ch)
77379 +{
77380 + struct reiser4_super_info_data *sbinfo;
77381 + struct journal_header *header;
77382 + jnode *txhead;
77383 +
77384 + sbinfo = get_super_private(ch->super);
77385 + assert("zam-479", sbinfo != NULL);
77386 + assert("zam-480", sbinfo->journal_header != NULL);
77387 +
77388 + txhead = list_entry(ch->tx_list.next, jnode, capture_link);
77389 +
77390 + jload(sbinfo->journal_header);
77391 +
77392 + header = (struct journal_header *)jdata(sbinfo->journal_header);
77393 + assert("zam-484", header != NULL);
77394 +
77395 + put_unaligned(cpu_to_le64(*jnode_get_block(txhead)),
77396 + &header->last_committed_tx);
77397 +
77398 + jrelse(sbinfo->journal_header);
77399 +}
77400 +
77401 +/* fill journal footer block data */
77402 +static void format_journal_footer(struct commit_handle *ch)
77403 +{
77404 + struct reiser4_super_info_data *sbinfo;
77405 + struct journal_footer *footer;
77406 + jnode *tx_head;
77407 +
77408 + sbinfo = get_super_private(ch->super);
77409 +
77410 + tx_head = list_entry(ch->tx_list.next, jnode, capture_link);
77411 +
77412 + assert("zam-493", sbinfo != NULL);
77413 + assert("zam-494", sbinfo->journal_header != NULL);
77414 +
77415 + check_me("zam-691", jload(sbinfo->journal_footer) == 0);
77416 +
77417 + footer = (struct journal_footer *)jdata(sbinfo->journal_footer);
77418 + assert("zam-495", footer != NULL);
77419 +
77420 + put_unaligned(cpu_to_le64(*jnode_get_block(tx_head)),
77421 + &footer->last_flushed_tx);
77422 + put_unaligned(cpu_to_le64(ch->free_blocks), &footer->free_blocks);
77423 +
77424 + put_unaligned(cpu_to_le64(ch->nr_files), &footer->nr_files);
77425 + put_unaligned(cpu_to_le64(ch->next_oid), &footer->next_oid);
77426 +
77427 + jrelse(sbinfo->journal_footer);
77428 +}
77429 +
77430 +/* wander record capacity depends on current block size */
77431 +static int wander_record_capacity(const struct super_block *super)
77432 +{
77433 + return (super->s_blocksize -
77434 + sizeof(struct wander_record_header)) /
77435 + sizeof(struct wander_entry);
77436 +}
77437 +
77438 +/* Fill the first wander record (tx head) in accordance with the supplied data */
77439 +static void format_tx_head(struct commit_handle *ch)
77440 +{
77441 + jnode *tx_head;
77442 + jnode *next;
77443 + struct tx_header *header;
77444 +
77445 + tx_head = list_entry(ch->tx_list.next, jnode, capture_link);
77446 + assert("zam-692", &ch->tx_list != &tx_head->capture_link);
77447 +
77448 + next = list_entry(tx_head->capture_link.next, jnode, capture_link);
77449 + if (&ch->tx_list == &next->capture_link)
77450 + next = tx_head;
77451 +
77452 + header = (struct tx_header *)jdata(tx_head);
77453 +
77454 + assert("zam-460", header != NULL);
77455 + assert("zam-462", ch->super->s_blocksize >= sizeof(struct tx_header));
77456 +
77457 + memset(jdata(tx_head), 0, (size_t) ch->super->s_blocksize);
77458 + memcpy(jdata(tx_head), TX_HEADER_MAGIC, TX_HEADER_MAGIC_SIZE);
77459 +
77460 + put_unaligned(cpu_to_le32(ch->tx_size), &header->total);
77461 + put_unaligned(cpu_to_le64(get_super_private(ch->super)->last_committed_tx),
77462 + &header->prev_tx);
77463 + put_unaligned(cpu_to_le64(*jnode_get_block(next)), &header->next_block);
77464 + put_unaligned(cpu_to_le64(ch->free_blocks), &header->free_blocks);
77465 + put_unaligned(cpu_to_le64(ch->nr_files), &header->nr_files);
77466 + put_unaligned(cpu_to_le64(ch->next_oid), &header->next_oid);
77467 +}
77468 +
77469 +/* prepare ordinary wander record block (fill all service fields) */
77470 +static void
77471 +format_wander_record(struct commit_handle *ch, jnode *node, __u32 serial)
77472 +{
77473 + struct wander_record_header *LRH;
77474 + jnode *next;
77475 +
77476 + assert("zam-464", node != NULL);
77477 +
77478 + LRH = (struct wander_record_header *)jdata(node);
77479 + next = list_entry(node->capture_link.next, jnode, capture_link);
77480 +
77481 + if (&ch->tx_list == &next->capture_link)
77482 + next = list_entry(ch->tx_list.next, jnode, capture_link);
77483 +
77484 + assert("zam-465", LRH != NULL);
77485 + assert("zam-463",
77486 + ch->super->s_blocksize > sizeof(struct wander_record_header));
77487 +
77488 + memset(jdata(node), 0, (size_t) ch->super->s_blocksize);
77489 + memcpy(jdata(node), WANDER_RECORD_MAGIC, WANDER_RECORD_MAGIC_SIZE);
77490 +
77491 + put_unaligned(cpu_to_le32(ch->tx_size), &LRH->total);
77492 + put_unaligned(cpu_to_le32(serial), &LRH->serial);
77493 + put_unaligned(cpu_to_le64(*jnode_get_block(next)), &LRH->next_block);
77494 +}
77495 +
77496 +/* add one wandered map entry to formatted wander record */
77497 +static void
77498 +store_entry(jnode * node, int index, const reiser4_block_nr * a,
77499 + const reiser4_block_nr * b)
77500 +{
77501 + char *data;
77502 + struct wander_entry *pairs;
77503 +
77504 + data = jdata(node);
77505 + assert("zam-451", data != NULL);
77506 +
77507 + pairs =
77508 + (struct wander_entry *)(data + sizeof(struct wander_record_header));
77509 +
77510 + put_unaligned(cpu_to_le64(*a), &pairs[index].original);
77511 + put_unaligned(cpu_to_le64(*b), &pairs[index].wandered);
77512 +}
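+
+/* The resulting on-disk layout of an ordinary wander record is a header
+   followed by a packed array of (original, wandered) pairs, both stored as
+   little-endian 64-bit block numbers (column widths below are schematic):
+
+	+------------------------+---------------------+---------------------+--
+	| wander_record_header   |       entry 0       |       entry 1       |
+	| (magic, total, serial, | original | wandered | original | wandered |
+	|  next_block)           |          |          |          |          |
+	+------------------------+---------------------+---------------------+--
+*/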
77513 +
77514 +/* currently, wander records contain only the wandered map, whose size depends
77515 + on the overwrite set size */
77516 +static void get_tx_size(struct commit_handle *ch)
77517 +{
77518 + assert("zam-440", ch->overwrite_set_size != 0);
77519 + assert("zam-695", ch->tx_size == 0);
77520 +
77521 + /* count all ordinary wander records
77522 + (<overwrite_set_size> - 1) / <wander_record_capacity> + 1 and add one
77523 + for tx head block */
77524 + ch->tx_size =
77525 + (ch->overwrite_set_size - 1) / wander_record_capacity(ch->super) +
77526 + 2;
77527 +}
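+
+/* A worked example: for an overwrite set of 1000 blocks and a wander record
+   capacity of 254 entries (see the example above), tx_size is
+   (1000 - 1) / 254 + 2 == 5: four ordinary wander records plus the tx head. */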
77528 +
77529 +/* A special structure used in store_wmap_actor() for saving its state
77530 + between calls */
77531 +struct store_wmap_params {
77532 + jnode *cur; /* jnode of current wander record to fill */
77533 + int idx; /* free element index in wander record */
77534 + int capacity; /* capacity */
77535 +
77536 +#if REISER4_DEBUG
77537 + struct list_head *tx_list;
77538 +#endif
77539 +};
77540 +
77541 +/* an actor for use in blocknr_set_iterator routine which populates the list
77542 + of pre-formatted wander records with wandered map info */
77543 +static int
77544 +store_wmap_actor(txn_atom * atom UNUSED_ARG, const reiser4_block_nr * a,
77545 + const reiser4_block_nr * b, void *data)
77546 +{
77547 + struct store_wmap_params *params = data;
77548 +
77549 + if (params->idx >= params->capacity) {
77550 + /* a new wander record should be taken from the tx_list */
77551 + params->cur = list_entry(params->cur->capture_link.next, jnode, capture_link);
77552 + assert("zam-454",
77553 + params->tx_list != &params->cur->capture_link);
77554 +
77555 + params->idx = 0;
77556 + }
77557 +
77558 + store_entry(params->cur, params->idx, a, b);
77559 + params->idx++;
77560 +
77561 + return 0;
77562 +}
77563 +
77564 +/* This function is called after the Relocate set gets written to disk, the
77565 + Overwrite set is written to wandered locations, and all wander records are
77566 + written as well. The updated journal header block contains a pointer (block
77567 + number) to the first wander record of the just written transaction */
77568 +static int update_journal_header(struct commit_handle *ch, int use_barrier)
77569 +{
77570 + struct reiser4_super_info_data *sbinfo = get_super_private(ch->super);
77571 + jnode *jh = sbinfo->journal_header;
77572 + jnode *head = list_entry(ch->tx_list.next, jnode, capture_link);
77573 + int ret;
77574 +
77575 + format_journal_header(ch);
77576 +
77577 + ret = write_jnodes_to_disk_extent(jh, 1, jnode_get_block(jh), NULL,
77578 + use_barrier ? WRITEOUT_BARRIER : 0);
77579 + if (ret)
77580 + return ret;
77581 +
77582 + // blk_run_address_space(sbinfo->fake->i_mapping);
77583 + /*blk_run_queues(); */
77584 +
77585 + ret = jwait_io(jh, WRITE);
77586 +
77587 + if (ret)
77588 + return ret;
77589 +
77590 + sbinfo->last_committed_tx = *jnode_get_block(head);
77591 +
77592 + return 0;
77593 +}
77594 +
77595 +/* This function is called after write-back is finished. We update journal
77596 + footer block and free blocks which were occupied by wandered blocks and
77597 + transaction wander records */
77598 +static int update_journal_footer(struct commit_handle *ch, int use_barrier)
77599 +{
77600 + reiser4_super_info_data *sbinfo = get_super_private(ch->super);
77601 +
77602 + jnode *jf = sbinfo->journal_footer;
77603 +
77604 + int ret;
77605 +
77606 + format_journal_footer(ch);
77607 +
77608 + ret = write_jnodes_to_disk_extent(jf, 1, jnode_get_block(jf), NULL,
77609 + use_barrier ? WRITEOUT_BARRIER : 0);
77610 + if (ret)
77611 + return ret;
77612 +
77613 + // blk_run_address_space(sbinfo->fake->i_mapping);
77614 + /*blk_run_queue(); */
77615 +
77616 + ret = jwait_io(jf, WRITE);
77617 + if (ret)
77618 + return ret;
77619 +
77620 + return 0;
77621 +}
77622 +
77623 +/* free block numbers of wander records of a transaction already written in place */
77624 +static void dealloc_tx_list(struct commit_handle *ch)
77625 +{
77626 + while (!list_empty(&ch->tx_list)) {
77627 + jnode *cur = list_entry(ch->tx_list.next, jnode, capture_link);
77628 + list_del(&cur->capture_link);
77629 + ON_DEBUG(INIT_LIST_HEAD(&cur->capture_link));
77630 + reiser4_dealloc_block(jnode_get_block(cur), BLOCK_NOT_COUNTED,
77631 + BA_FORMATTED);
77632 +
77633 + unpin_jnode_data(cur);
77634 + drop_io_head(cur);
77635 + }
77636 +}
77637 +
77638 +/* An actor for use in the blocknr_set_iterator() routine which frees wandered
77639 + blocks recorded in atom's wandered map. */
77640 +static int
77641 +dealloc_wmap_actor(txn_atom * atom UNUSED_ARG,
77642 + const reiser4_block_nr * a UNUSED_ARG,
77643 + const reiser4_block_nr * b, void *data UNUSED_ARG)
77644 +{
77645 +
77646 + assert("zam-499", b != NULL);
77647 + assert("zam-500", *b != 0);
77648 + assert("zam-501", !blocknr_is_fake(b));
77649 +
77650 + reiser4_dealloc_block(b, BLOCK_NOT_COUNTED, BA_FORMATTED);
77651 + return 0;
77652 +}
77653 +
77654 +/* free wandered block locations of already written in place transaction */
77655 +static void dealloc_wmap(struct commit_handle *ch)
77656 +{
77657 + assert("zam-696", ch->atom != NULL);
77658 +
77659 + blocknr_set_iterator(ch->atom, &ch->atom->wandered_map,
77660 + dealloc_wmap_actor, NULL, 1);
77661 +}
77662 +
77663 +/* helper function for alloc_wandered_blocks(), which refills the set of block
77664 + numbers needed for wandered blocks */
77665 +static int
77666 +get_more_wandered_blocks(int count, reiser4_block_nr * start, int *len)
77667 +{
77668 + reiser4_blocknr_hint hint;
77669 + int ret;
77670 +
77671 + reiser4_block_nr wide_len = count;
77672 +
77673 + /* FIXME-ZAM: A special policy needed for allocation of wandered blocks
77674 + ZAM-FIXME-HANS: yes, what happened to our discussion of using a fixed
77675 + reserved allocation area so as to get the best qualities of fixed
77676 + journals? */
77677 + blocknr_hint_init(&hint);
77678 + hint.block_stage = BLOCK_GRABBED;
77679 +
77680 + ret = reiser4_alloc_blocks(&hint, start, &wide_len,
77681 + BA_FORMATTED | BA_USE_DEFAULT_SEARCH_START);
77682 + *len = (int)wide_len;
77683 +
77684 + return ret;
77685 +}
77686 +
77687 +/*
77688 + * roll back changes made before issuing BIO in the case of IO error.
77689 + */
77690 +static void undo_bio(struct bio *bio)
77691 +{
77692 + int i;
77693 +
77694 + for (i = 0; i < bio->bi_vcnt; ++i) {
77695 + struct page *pg;
77696 + jnode *node;
77697 +
77698 + pg = bio->bi_io_vec[i].bv_page;
77699 + ClearPageWriteback(pg);
77700 + node = jprivate(pg);
77701 + spin_lock_jnode(node);
77702 + JF_CLR(node, JNODE_WRITEBACK);
77703 + JF_SET(node, JNODE_DIRTY);
77704 + spin_unlock_jnode(node);
77705 + }
77706 + bio_put(bio);
77707 +}
77708 +
77709 +/* put overwrite set back to atom's clean list */
77710 +static void put_overwrite_set(struct commit_handle *ch)
77711 +{
77712 + jnode *cur;
77713 +
77714 + list_for_each_entry(cur, ch->overwrite_set, capture_link)
77715 + jrelse_tail(cur);
77716 +}
77717 +
77718 +/* Count overwrite set size and grab disk space for wandered blocks allocation.
77719 + Since we have a separate list for the atom's overwrite set we just scan the
77720 + list, counting bitmap and other non-leaf nodes whose wandered block
77721 + allocation we have to grab space for. */
77722 +static int get_overwrite_set(struct commit_handle *ch)
77723 +{
77724 + int ret;
77725 + jnode *cur;
77726 + __u64 nr_not_leaves = 0;
77727 +#if REISER4_DEBUG
77728 + __u64 nr_formatted_leaves = 0;
77729 + __u64 nr_unformatted_leaves = 0;
77730 +#endif
77731 +
77732 + assert("zam-697", ch->overwrite_set_size == 0);
77733 +
77734 + ch->overwrite_set = ATOM_OVRWR_LIST(ch->atom);
77735 + cur = list_entry(ch->overwrite_set->next, jnode, capture_link);
77736 +
77737 + while (ch->overwrite_set != &cur->capture_link) {
77738 + jnode *next = list_entry(cur->capture_link.next, jnode, capture_link);
77739 +
77740 + /* Count bitmap blocks to get correct statistics on how many
77741 + * blocks were cleared by the transaction commit. */
77742 + if (jnode_get_type(cur) == JNODE_BITMAP)
77743 + ch->nr_bitmap++;
77744 +
77745 + assert("zam-939", JF_ISSET(cur, JNODE_OVRWR)
77746 + || jnode_get_type(cur) == JNODE_BITMAP);
77747 +
77748 + if (jnode_is_znode(cur) && znode_above_root(JZNODE(cur))) {
77749 + /* we replace fake znode by another (real)
77750 + znode which is suggested by disk_layout
77751 + plugin */
77752 +
77753 + /* FIXME: it looks like fake znode should be
77754 + replaced by jnode supplied by
77755 + disk_layout. */
77756 +
77757 + struct super_block *s = reiser4_get_current_sb();
77758 + reiser4_super_info_data *sbinfo =
77759 + get_current_super_private();
77760 +
77761 + if (sbinfo->df_plug->log_super) {
77762 + jnode *sj = sbinfo->df_plug->log_super(s);
77763 +
77764 + assert("zam-593", sj != NULL);
77765 +
77766 + if (IS_ERR(sj))
77767 + return PTR_ERR(sj);
77768 +
77769 + spin_lock_jnode(sj);
77770 + JF_SET(sj, JNODE_OVRWR);
77771 + insert_into_atom_ovrwr_list(ch->atom, sj);
77772 + spin_unlock_jnode(sj);
77773 +
77774 + /* jload it as the rest of overwrite set */
77775 + jload_gfp(sj, get_gfp_mask(), 0);
77776 +
77777 + ch->overwrite_set_size++;
77778 + }
77779 + spin_lock_jnode(cur);
77780 + uncapture_block(cur);
77781 + jput(cur);
77782 +
77783 + } else {
77784 + int ret;
77785 + ch->overwrite_set_size++;
77786 + ret = jload_gfp(cur, get_gfp_mask(), 0);
77787 + if (ret)
77788 + reiser4_panic("zam-783",
77789 + "cannot load e-flushed jnode back (ret = %d)\n",
77790 + ret);
77791 + }
77792 +
77793 + /* Count non-leaves here because we have to grab disk space
77794 + * for wandered blocks. They were not counted as "flush
77795 + * reserved". Counting should be done _after_ nodes are pinned
77796 + * into memory by jload(). */
77797 + if (!jnode_is_leaf(cur))
77798 + nr_not_leaves++;
77799 + else {
77800 +#if REISER4_DEBUG
77801 + /* at this point @cur either has JNODE_FLUSH_RESERVED
77802 + * or is eflushed. Locking is not strong enough to
77803 + * write an assertion checking for this. */
77804 + if (jnode_is_znode(cur))
77805 + nr_formatted_leaves++;
77806 + else
77807 + nr_unformatted_leaves++;
77808 +#endif
77809 + JF_CLR(cur, JNODE_FLUSH_RESERVED);
77810 + }
77811 +
77812 + cur = next;
77813 + }
77814 +
77815 + /* Grab space for writing (wandered blocks) of non-leaves found in
77816 + * the overwrite set. */
77817 + ret = reiser4_grab_space_force(nr_not_leaves, BA_RESERVED);
77818 + if (ret)
77819 + return ret;
77820 +
77821 + /* Disk space for allocation of wandered blocks of leaf nodes already
77822 + * reserved as "flush reserved", move it to grabbed space counter. */
77823 + spin_lock_atom(ch->atom);
77824 + assert("zam-940",
77825 + nr_formatted_leaves + nr_unformatted_leaves <=
77826 + ch->atom->flush_reserved);
77827 + flush_reserved2grabbed(ch->atom, ch->atom->flush_reserved);
77828 + spin_unlock_atom(ch->atom);
77829 +
77830 + return ch->overwrite_set_size;
77831 +}
77832 +
77833 +/**
77834 + * write_jnodes_to_disk_extent - submit write request
77836 + * @first: first jnode of the list
77837 + * @nr: number of jnodes on the list
77838 + * @block_p:
77839 + * @fq:
77840 + * @flags: used to decide whether page is to get PG_reclaim flag
77841 + *
77842 + * Submits a write request for @nr jnodes beginning from the @first, other
77843 + * jnodes are after the @first on the double-linked "capture" list. All jnodes
77844 + * will be written to the disk region of @nr blocks starting with @block_p block
77845 + * number. If @fq is not NULL it means that waiting for i/o completion will be
77846 + * done more efficiently by using flush_queue_t objects.
77847 + * This function is the one which writes a list of jnodes in batch mode. It does
77848 + * all the low-level work such as bio construction and page state manipulation.
77849 + *
77850 + * ZAM-FIXME-HANS: brief me on why this function exists, and why bios are
77851 + * aggregated in this function instead of being left to the layers below
77852 + *
77853 + * FIXME: ZAM->HANS: What layer are you talking about? Can you point me to that?
77854 + * Why that layer needed? Why BIOs cannot be constructed here?
77855 + */
77856 +static int write_jnodes_to_disk_extent(
77857 + jnode *first, int nr, const reiser4_block_nr *block_p,
77858 + flush_queue_t *fq, int flags)
77859 +{
77860 + struct super_block *super = reiser4_get_current_sb();
77861 + int write_op = ( flags & WRITEOUT_BARRIER ) ? WRITE_BARRIER : WRITE;
77862 + int max_blocks;
77863 + jnode *cur = first;
77864 + reiser4_block_nr block;
77865 +
77866 + assert("zam-571", first != NULL);
77867 + assert("zam-572", block_p != NULL);
77868 + assert("zam-570", nr > 0);
77869 +
77870 + block = *block_p;
77871 + max_blocks = min(bio_get_nr_vecs(super->s_bdev), BIO_MAX_PAGES);
77872 +
77873 + while (nr > 0) {
77874 + struct bio *bio;
77875 + int nr_blocks = min(nr, max_blocks);
77876 + int i;
77877 + int nr_used;
77878 +
77879 + bio = bio_alloc(GFP_NOIO, nr_blocks);
77880 + if (!bio)
77881 + return RETERR(-ENOMEM);
77882 +
77883 + bio->bi_bdev = super->s_bdev;
77884 + bio->bi_sector = block * (super->s_blocksize >> 9);
77885 + for (nr_used = 0, i = 0; i < nr_blocks; i++) {
77886 + struct page *pg;
77887 +
77888 + pg = jnode_page(cur);
77889 + assert("zam-573", pg != NULL);
77890 +
77891 + page_cache_get(pg);
77892 +
77893 + lock_and_wait_page_writeback(pg);
77894 +
77895 + if (!bio_add_page(bio, pg, super->s_blocksize, 0)) {
77896 + /*
77897 + * underlying device is satiated. Stop adding
77898 + * pages to the bio.
77899 + */
77900 + unlock_page(pg);
77901 + page_cache_release(pg);
77902 + break;
77903 + }
77904 +
77905 + spin_lock_jnode(cur);
77906 + assert("nikita-3166",
77907 + pg->mapping == jnode_get_mapping(cur));
77908 + assert("zam-912", !JF_ISSET(cur, JNODE_WRITEBACK));
77909 +#if REISER4_DEBUG
77910 + spin_lock(&cur->load);
77911 + assert("nikita-3165", !jnode_is_releasable(cur));
77912 + spin_unlock(&cur->load);
77913 +#endif
77914 + JF_SET(cur, JNODE_WRITEBACK);
77915 + JF_CLR(cur, JNODE_DIRTY);
77916 + ON_DEBUG(cur->written++);
77917 + spin_unlock_jnode(cur);
77918 +
77919 + ClearPageError(pg);
77920 + set_page_writeback(pg);
77921 +
77922 + if (get_current_context()->entd) {
77923 + /* this is ent thread */
77924 + entd_context *ent = get_entd_context(super);
77925 + struct wbq *rq, *next;
77926 +
77927 + spin_lock(&ent->guard);
77928 +
77929 + if (pg == ent->cur_request->page) {
77930 + /*
77931 + * entd is called for this page. This
77932 + * request is not in the todo list
77933 + */
77934 + ent->cur_request->written = 1;
77935 + } else {
77936 + /*
77937 + * if we have written a page for which writepage
77938 + * was called, move the request to another list.
77939 + */
77940 + list_for_each_entry_safe(rq, next, &ent->todo_list, link) {
77941 + assert("", rq->magic == WBQ_MAGIC);
77942 + if (pg == rq->page) {
77943 + /*
77944 + * remove request from
77945 + * entd's queue, but do
77946 + * not wake up a thread
77947 + * which put this
77948 + * request
77949 + */
77950 + list_del_init(&rq->link);
77951 + ent->nr_todo_reqs --;
77952 + list_add_tail(&rq->link, &ent->done_list);
77953 + ent->nr_done_reqs ++;
77954 + rq->written = 1;
77955 + break;
77956 + }
77957 + }
77958 + }
77959 + spin_unlock(&ent->guard);
77960 + }
77961 +
77962 + clear_page_dirty_for_io(pg);
77963 +
77964 + unlock_page(pg);
77965 +
77966 + cur = list_entry(cur->capture_link.next, jnode, capture_link);
77967 + nr_used++;
77968 + }
77969 + if (nr_used > 0) {
77970 + assert("nikita-3453",
77971 + bio->bi_size == super->s_blocksize * nr_used);
77972 + assert("nikita-3454", bio->bi_vcnt == nr_used);
77973 +
77974 + /* Check if we are allowed to write at all */
77975 + if (super->s_flags & MS_RDONLY)
77976 + undo_bio(bio);
77977 + else {
77978 + int not_supported;
77979 +
77980 + add_fq_to_bio(fq, bio);
77981 + bio_get(bio);
77982 + reiser4_submit_bio(write_op, bio);
77983 + not_supported = bio_flagged(bio, BIO_EOPNOTSUPP);
77984 + bio_put(bio);
77985 + if (not_supported)
77986 + return -EOPNOTSUPP;
77987 + }
77988 +
77989 + block += nr_used - 1;
77990 + update_blocknr_hint_default(super, &block);
77991 + block += 1;
77992 + } else {
77993 + bio_put(bio);
77994 + }
77995 + nr -= nr_used;
77996 + }
77997 +
77998 + return 0;
77999 +}
78000 +
78001 +/* This is a procedure which detects contiguous sequences of disk block
78002 + numbers in the given list of j-nodes and submits write requests on a
78003 + per-sequence basis */
78004 +int
78005 +write_jnode_list(struct list_head *head, flush_queue_t *fq,
78006 + long *nr_submitted, int flags)
78007 +{
78008 + int ret;
78009 + jnode *beg = list_entry(head->next, jnode, capture_link);
78010 +
78011 + while (head != &beg->capture_link) {
78012 + int nr = 1;
78013 + jnode *cur = list_entry(beg->capture_link.next, jnode, capture_link);
78014 +
78015 + while (head != &cur->capture_link) {
78016 + if (*jnode_get_block(cur) != *jnode_get_block(beg) + nr)
78017 + break;
78018 + ++nr;
78019 + cur = list_entry(cur->capture_link.next, jnode, capture_link);
78020 + }
78021 +
78022 + ret = write_jnodes_to_disk_extent(
78023 + beg, nr, jnode_get_block(beg), fq, flags);
78024 + if (ret)
78025 + return ret;
78026 +
78027 + if (nr_submitted)
78028 + *nr_submitted += nr;
78029 +
78030 + beg = cur;
78031 + }
78032 +
78033 + return 0;
78034 +}
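+
+/* For example, if the list carries jnodes with block numbers 100, 101, 102,
+   200 and 201 (illustrative values), the loop above submits two extents: one
+   of three blocks starting at 100 and one of two blocks starting at 200. */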
78035 +
78036 +/* add given wandered mapping to atom's wandered map */
78037 +static int
78038 +add_region_to_wmap(jnode * cur, int len, const reiser4_block_nr * block_p)
78039 +{
78040 + int ret;
78041 + blocknr_set_entry *new_bsep = NULL;
78042 + reiser4_block_nr block;
78043 +
78044 + txn_atom *atom;
78045 +
78046 + assert("zam-568", block_p != NULL);
78047 + block = *block_p;
78048 + assert("zam-569", len > 0);
78049 +
78050 + while ((len--) > 0) {
78051 + do {
78052 + atom = get_current_atom_locked();
78053 + assert("zam-536",
78054 + !blocknr_is_fake(jnode_get_block(cur)));
78055 + ret =
78056 + blocknr_set_add_pair(atom, &atom->wandered_map,
78057 + &new_bsep,
78058 + jnode_get_block(cur), &block);
78059 + } while (ret == -E_REPEAT);
78060 +
78061 + if (ret) {
78062 + /* deallocate blocks which were not added to wandered
78063 + map */
78064 + reiser4_block_nr wide_len = len;
78065 +
78066 + reiser4_dealloc_blocks(&block, &wide_len,
78067 + BLOCK_NOT_COUNTED,
78068 + BA_FORMATTED
78069 + /* formatted, without defer */ );
78070 +
78071 + return ret;
78072 + }
78073 +
78074 + spin_unlock_atom(atom);
78075 +
78076 + cur = list_entry(cur->capture_link.next, jnode, capture_link);
78077 + ++block;
78078 + }
78079 +
78080 + return 0;
78081 +}
78082 +
78083 +/* Allocate wandered blocks for the current atom's OVERWRITE SET and immediately
78084 + submit IO for the allocated blocks. We assume that the current atom is in a
78085 + stage when any atom fusion is impossible, so leaving the atom unlocked is safe. */
78086 +static int alloc_wandered_blocks(struct commit_handle *ch, flush_queue_t *fq)
78087 +{
78088 + reiser4_block_nr block;
78089 +
78090 + int rest;
78091 + int len;
78092 + int ret;
78093 +
78094 + jnode *cur;
78095 +
78096 + assert("zam-534", ch->overwrite_set_size > 0);
78097 +
78098 + rest = ch->overwrite_set_size;
78099 +
78100 + cur = list_entry(ch->overwrite_set->next, jnode, capture_link);
78101 + while (ch->overwrite_set != &cur->capture_link) {
78102 + assert("zam-567", JF_ISSET(cur, JNODE_OVRWR));
78103 +
78104 + ret = get_more_wandered_blocks(rest, &block, &len);
78105 + if (ret)
78106 + return ret;
78107 +
78108 + rest -= len;
78109 +
78110 + ret = add_region_to_wmap(cur, len, &block);
78111 + if (ret)
78112 + return ret;
78113 +
78114 + ret = write_jnodes_to_disk_extent(cur, len, &block, fq, 0);
78115 + if (ret)
78116 + return ret;
78117 +
78118 + while ((len--) > 0) {
78119 + assert("zam-604",
78120 + ch->overwrite_set != &cur->capture_link);
78121 + cur = list_entry(cur->capture_link.next, jnode, capture_link);
78122 + }
78123 + }
78124 +
78125 + return 0;
78126 +}
78127 +
78128 +/* allocate the given number of nodes over the journal area and link them into
78129 + the tx_list of the commit handle */
78130 +static int alloc_tx(struct commit_handle *ch, flush_queue_t * fq)
78131 +{
78132 + reiser4_blocknr_hint hint;
78133 + reiser4_block_nr allocated = 0;
78134 + reiser4_block_nr first, len;
78135 + jnode *cur;
78136 + jnode *txhead;
78137 + int ret;
78138 + reiser4_context *ctx;
78139 + reiser4_super_info_data *sbinfo;
78140 +
78141 + assert("zam-698", ch->tx_size > 0);
78142 + assert("zam-699", list_empty_careful(&ch->tx_list));
78143 +
78144 + ctx = get_current_context();
78145 + sbinfo = get_super_private(ctx->super);
78146 +
78147 + while (allocated < (unsigned)ch->tx_size) {
78148 + len = (ch->tx_size - allocated);
78149 +
78150 + blocknr_hint_init(&hint);
78151 +
78152 + hint.block_stage = BLOCK_GRABBED;
78153 +
78154 + /* FIXME: there should be some block allocation policy for
78155 + nodes which contain wander records */
78156 +
78157 + /* We assume that disk space for wander record blocks can be
78158 + * taken from the reserved area. */
78159 + ret = reiser4_alloc_blocks(&hint, &first, &len,
78160 + BA_FORMATTED | BA_RESERVED |
78161 + BA_USE_DEFAULT_SEARCH_START);
78162 + blocknr_hint_done(&hint);
78163 +
78164 + if (ret)
78165 + return ret;
78166 +
78167 + allocated += len;
78168 +
78169 + /* create jnodes for all wander records */
78170 + while (len--) {
78171 + cur = alloc_io_head(&first);
78172 +
78173 + if (cur == NULL) {
78174 + ret = RETERR(-ENOMEM);
78175 + goto free_not_assigned;
78176 + }
78177 +
78178 + ret = jinit_new(cur, get_gfp_mask());
78179 +
78180 + if (ret != 0) {
78181 + jfree(cur);
78182 + goto free_not_assigned;
78183 + }
78184 +
78185 + pin_jnode_data(cur);
78186 +
78187 + list_add_tail(&cur->capture_link, &ch->tx_list);
78188 +
78189 + first++;
78190 + }
78191 + }
78192 +
78193 + { /* format an on-disk linked list of wander records */
78194 + int serial = 1;
78195 +
78196 + txhead = list_entry(ch->tx_list.next, jnode, capture_link);
78197 + format_tx_head(ch);
78198 +
78199 + cur = list_entry(txhead->capture_link.next, jnode, capture_link);
78200 + while (&ch->tx_list != &cur->capture_link) {
78201 + format_wander_record(ch, cur, serial++);
78202 + cur = list_entry(cur->capture_link.next, jnode, capture_link);
78203 + }
78204 + }
78205 +
78206 + { /* Fill wander records with Wandered Set */
78207 + struct store_wmap_params params;
78208 + txn_atom *atom;
78209 +
78210 + params.cur = list_entry(txhead->capture_link.next, jnode, capture_link);
78211 +
78212 + params.idx = 0;
78213 + params.capacity =
78214 + wander_record_capacity(reiser4_get_current_sb());
78215 +
78216 + atom = get_current_atom_locked();
78217 + blocknr_set_iterator(atom, &atom->wandered_map,
78218 + &store_wmap_actor, &params, 0);
78219 + spin_unlock_atom(atom);
78220 + }
78221 +
78222 + { /* relse all jnodes from tx_list */
78223 + cur = list_entry(ch->tx_list.next, jnode, capture_link);
78224 + while (&ch->tx_list != &cur->capture_link) {
78225 + jrelse(cur);
78226 + cur = list_entry(cur->capture_link.next, jnode, capture_link);
78227 + }
78228 + }
78229 +
78230 + ret = write_jnode_list(&ch->tx_list, fq, NULL, 0);
78231 +
78232 + return ret;
78233 +
78234 + free_not_assigned:
78235 + /* We deallocate blocks not yet assigned to jnodes on tx_list. The
78236 + caller takes care of invalidating the tx list */
78237 + reiser4_dealloc_blocks(&first, &len, BLOCK_NOT_COUNTED, BA_FORMATTED);
78238 +
78239 + return ret;
78240 +}
78241 +
78242 +static int commit_tx(struct commit_handle *ch)
78243 +{
78244 + flush_queue_t *fq;
78245 + int barrier;
78246 + int ret;
78247 +
78248 + /* Grab more space for wandered records. */
78249 + ret = reiser4_grab_space_force((__u64) (ch->tx_size), BA_RESERVED);
78250 + if (ret)
78251 + return ret;
78252 +
78253 + fq = get_fq_for_current_atom();
78254 + if (IS_ERR(fq))
78255 + return PTR_ERR(fq);
78256 +
78257 + spin_unlock_atom(fq->atom);
78258 + do {
78259 + ret = alloc_wandered_blocks(ch, fq);
78260 + if (ret)
78261 + break;
78262 + ret = alloc_tx(ch, fq);
78263 + if (ret)
78264 + break;
78265 + } while (0);
78266 +
78267 + fq_put(fq);
78268 + if (ret)
78269 + return ret;
78270 + repeat_wo_barrier:
78271 + barrier = reiser4_use_write_barrier(ch->super);
78272 + if (!barrier) {
78273 + ret = current_atom_finish_all_fq();
78274 + if (ret)
78275 + return ret;
78276 + }
78277 + ret = update_journal_header(ch, barrier);
78278 + if (barrier) {
78279 + if (ret) {
78280 + if (ret == -EOPNOTSUPP) {
78281 + disable_write_barrier(ch->super);
78282 + goto repeat_wo_barrier;
78283 + }
78284 + return ret;
78285 + }
78286 + ret = current_atom_finish_all_fq();
78287 + }
78288 + return ret;
78289 +}
78290 +
78291 +
78292 +static int write_tx_back(struct commit_handle * ch)
78293 +{
78294 + flush_queue_t *fq;
78295 + int ret;
78296 + int barrier;
78297 +
78298 + post_commit_hook();
78299 + fq = get_fq_for_current_atom();
78300 + if (IS_ERR(fq))
78301 + return PTR_ERR(fq);
78302 + spin_unlock_atom(fq->atom);
78303 + ret = write_jnode_list(
78304 + ch->overwrite_set, fq, NULL, WRITEOUT_FOR_PAGE_RECLAIM);
78305 + fq_put(fq);
78306 + if (ret)
78307 + return ret;
78308 + repeat_wo_barrier:
78309 + barrier = reiser4_use_write_barrier(ch->super);
78310 + if (!barrier) {
78311 + ret = current_atom_finish_all_fq();
78312 + if (ret)
78313 + return ret;
78314 + }
78315 + ret = update_journal_footer(ch, barrier);
78316 + if (barrier) {
78317 + if (ret) {
78318 + if (ret == -EOPNOTSUPP) {
78319 + disable_write_barrier(ch->super);
78320 + goto repeat_wo_barrier;
78321 + }
78322 + return ret;
78323 + }
78324 + ret = current_atom_finish_all_fq();
78325 + }
78326 + if (ret)
78327 + return ret;
78328 + post_write_back_hook();
78329 + return 0;
78330 +}
78331 +
78332 +/* We assume that at this moment all captured blocks are marked as RELOC or
78333 + WANDER (belong to the Relocate or Overwrite set), and all nodes from the
78334 + Relocate set have been submitted for write.
78335 +*/
78336 +
78337 +int reiser4_write_logs(long *nr_submitted)
78338 +{
78339 + txn_atom *atom;
78340 + struct super_block *super = reiser4_get_current_sb();
78341 + reiser4_super_info_data *sbinfo = get_super_private(super);
78342 + struct commit_handle ch;
78343 + int ret;
78344 +
78345 + writeout_mode_enable();
78346 +
78347 + /* block allocator may add j-nodes to the clean_list */
78348 + ret = pre_commit_hook();
78349 + if (ret)
78350 + return ret;
78351 +
78352 + /* No locks are required if we take an atom whose stage is >=
78353 + * ASTAGE_PRE_COMMIT */
78354 + atom = get_current_context()->trans->atom;
78355 + assert("zam-965", atom != NULL);
78356 +
78357 + /* relocate set is on the atom->clean_nodes list after
78358 + * current_atom_complete_writes() finishes. It can be safely
78359 + * uncaptured after commit_semaphore is taken, because any atom that
78360 + * captures these nodes is guaranteed to commit after current one.
78361 + *
78362 + * This can only be done after pre_commit_hook(), because it is where
78363 + * early flushed jnodes with CREATED bit are transferred to the
78364 + * overwrite list. */
78365 + invalidate_list(ATOM_CLEAN_LIST(atom));
78366 + spin_lock_atom(atom);
78367 + /* There might be waiters for the relocate nodes which we have
78368 + * released, wake them up. */
78369 + atom_send_event(atom);
78370 + spin_unlock_atom(atom);
78371 +
78372 + if (REISER4_DEBUG) {
78373 + int level;
78374 +
78375 + for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; ++level)
78376 + assert("nikita-3352",
78377 + list_empty_careful(ATOM_DIRTY_LIST(atom, level)));
78378 + }
78379 +
78380 + sbinfo->nr_files_committed += (unsigned)atom->nr_objects_created;
78381 + sbinfo->nr_files_committed -= (unsigned)atom->nr_objects_deleted;
78382 +
78383 + init_commit_handle(&ch, atom);
78384 +
78385 + ch.free_blocks = sbinfo->blocks_free_committed;
78386 + ch.nr_files = sbinfo->nr_files_committed;
78387 + /* ZAM-FIXME-HANS: email me what the contention level is for the super
78388 + * lock. */
78389 + ch.next_oid = oid_next(super);
78390 +
78391 + /* count overwrite set and place it in a separate list */
78392 + ret = get_overwrite_set(&ch);
78393 +
78394 + if (ret <= 0) {
78395 + /* It is possible that the overwrite set is empty here; that
78396 + means all captured nodes are clean */
78397 + goto up_and_ret;
78398 + }
78399 +
78400 + /* Inform the caller how many dirty pages will be
78401 + * submitted to disk. */
78402 + *nr_submitted += ch.overwrite_set_size - ch.nr_bitmap;
78403 +
78404 + /* count all records needed for storing of the wandered set */
78405 + get_tx_size(&ch);
78406 +
78407 + ret = commit_tx(&ch);
78408 + if (ret)
78409 + goto up_and_ret;
78410 +
78411 + spin_lock_atom(atom);
78412 + atom_set_stage(atom, ASTAGE_POST_COMMIT);
78413 + spin_unlock_atom(atom);
78414 +
78415 + ret = write_tx_back(&ch);
78416 + post_write_back_hook();
78417 +
78418 + up_and_ret:
78419 + if (ret) {
78420 + /* there could be fq attached to current atom; the only way to
78421 + remove them is: */
78422 + current_atom_finish_all_fq();
78423 + }
78424 +
78425 + /* free blocks of flushed transaction */
78426 + dealloc_tx_list(&ch);
78427 + dealloc_wmap(&ch);
78428 +
78429 + put_overwrite_set(&ch);
78430 +
78431 + done_commit_handle(&ch);
78432 +
78433 + writeout_mode_disable();
78434 +
78435 + return ret;
78436 +}
78437 +
78438 +/* consistency checks for journal data/control blocks: header, footer, log
78439 + records, transaction head blocks. All functions return zero on success. */
78440 +
78441 +static int check_journal_header(const jnode * node UNUSED_ARG)
78442 +{
78443 + /* FIXME: journal header has no magic field yet. */
78444 + return 0;
78445 +}
78446 +
78447 +/* wait for write completion for all jnodes from given list */
78448 +static int wait_on_jnode_list(struct list_head *head)
78449 +{
78450 + jnode *scan;
78451 + int ret = 0;
78452 +
78453 + list_for_each_entry(scan, head, capture_link) {
78454 + struct page *pg = jnode_page(scan);
78455 +
78456 + if (pg) {
78457 + if (PageWriteback(pg))
78458 + wait_on_page_writeback(pg);
78459 +
78460 + if (PageError(pg))
78461 + ret++;
78462 + }
78463 + }
78464 +
78465 + return ret;
78466 +}
78467 +
78468 +static int check_journal_footer(const jnode * node UNUSED_ARG)
78469 +{
78470 + /* FIXME: journal footer has no magic field yet. */
78471 + return 0;
78472 +}
78473 +
78474 +static int check_tx_head(const jnode * node)
78475 +{
78476 + struct tx_header *header = (struct tx_header *)jdata(node);
78477 +
78478 + if (memcmp(&header->magic, TX_HEADER_MAGIC, TX_HEADER_MAGIC_SIZE) != 0) {
78479 + warning("zam-627", "tx head at block %s corrupted\n",
78480 + sprint_address(jnode_get_block(node)));
78481 + return RETERR(-EIO);
78482 + }
78483 +
78484 + return 0;
78485 +}
78486 +
78487 +static int check_wander_record(const jnode * node)
78488 +{
78489 + struct wander_record_header *RH =
78490 + (struct wander_record_header *)jdata(node);
78491 +
78492 + if (memcmp(&RH->magic, WANDER_RECORD_MAGIC, WANDER_RECORD_MAGIC_SIZE) !=
78493 + 0) {
78494 + warning("zam-628", "wander record at block %s corrupted\n",
78495 + sprint_address(jnode_get_block(node)));
78496 + return RETERR(-EIO);
78497 + }
78498 +
78499 + return 0;
78500 +}
78501 +
78502 +/* fill the commit_handle structure with everything needed for update_journal_footer */
78503 +static int restore_commit_handle(struct commit_handle *ch, jnode *tx_head)
78504 +{
78505 + struct tx_header *TXH;
78506 + int ret;
78507 +
78508 + ret = jload(tx_head);
78509 + if (ret)
78510 + return ret;
78511 +
78512 + TXH = (struct tx_header *)jdata(tx_head);
78513 +
78514 + ch->free_blocks = le64_to_cpu(get_unaligned(&TXH->free_blocks));
78515 + ch->nr_files = le64_to_cpu(get_unaligned(&TXH->nr_files));
78516 + ch->next_oid = le64_to_cpu(get_unaligned(&TXH->next_oid));
78517 +
78518 + jrelse(tx_head);
78519 +
78520 + list_add(&tx_head->capture_link, &ch->tx_list);
78521 +
78522 + return 0;
78523 +}
78524 +
78525 +/* replay one transaction: restore and write overwrite set in place */
78526 +static int replay_transaction(const struct super_block *s,
78527 + jnode * tx_head,
78528 + const reiser4_block_nr * log_rec_block_p,
78529 + const reiser4_block_nr * end_block,
78530 + unsigned int nr_wander_records)
78531 +{
78532 + reiser4_block_nr log_rec_block = *log_rec_block_p;
78533 + struct commit_handle ch;
78534 + LIST_HEAD(overwrite_set);
78535 + jnode *log;
78536 + int ret;
78537 +
78538 + init_commit_handle(&ch, NULL);
78539 + ch.overwrite_set = &overwrite_set;
78540 +
78541 + restore_commit_handle(&ch, tx_head);
78542 +
78543 + while (log_rec_block != *end_block) {
78544 + struct wander_record_header *header;
78545 + struct wander_entry *entry;
78546 +
78547 + int i;
78548 +
78549 + if (nr_wander_records == 0) {
78550 + warning("zam-631",
78551 + "number of wander records in the linked list"
78552 + " greater than number stored in tx head.\n");
78553 + ret = RETERR(-EIO);
78554 + goto free_ow_set;
78555 + }
78556 +
78557 + log = alloc_io_head(&log_rec_block);
78558 + if (log == NULL)
78559 + return RETERR(-ENOMEM);
78560 +
78561 + ret = jload(log);
78562 + if (ret < 0) {
78563 + drop_io_head(log);
78564 + return ret;
78565 + }
78566 +
78567 + ret = check_wander_record(log);
78568 + if (ret) {
78569 + jrelse(log);
78570 + drop_io_head(log);
78571 + return ret;
78572 + }
78573 +
78574 + header = (struct wander_record_header *)jdata(log);
78575 + log_rec_block = le64_to_cpu(get_unaligned(&header->next_block));
78576 +
78577 + entry = (struct wander_entry *)(header + 1);
78578 +
78579 + /* restore overwrite set from wander record content */
78580 + for (i = 0; i < wander_record_capacity(s); i++) {
78581 + reiser4_block_nr block;
78582 + jnode *node;
78583 +
78584 + block = le64_to_cpu(get_unaligned(&entry->wandered));
78585 + if (block == 0)
78586 + break;
78587 +
78588 + node = alloc_io_head(&block);
78589 + if (node == NULL) {
78590 + ret = RETERR(-ENOMEM);
78591 + /*
78592 + * FIXME-VS:???
78593 + */
78594 + jrelse(log);
78595 + drop_io_head(log);
78596 + goto free_ow_set;
78597 + }
78598 +
78599 + ret = jload(node);
78600 +
78601 + if (ret < 0) {
78602 + drop_io_head(node);
78603 + /*
78604 + * FIXME-VS:???
78605 + */
78606 + jrelse(log);
78607 + drop_io_head(log);
78608 + goto free_ow_set;
78609 + }
78610 +
78611 + block = le64_to_cpu(get_unaligned(&entry->original));
78612 +
78613 + assert("zam-603", block != 0);
78614 +
78615 + jnode_set_block(node, &block);
78616 +
78617 + list_add_tail(&node->capture_link, ch.overwrite_set);
78618 +
78619 + ++entry;
78620 + }
78621 +
78622 + jrelse(log);
78623 + drop_io_head(log);
78624 +
78625 + --nr_wander_records;
78626 + }
78627 +
78628 + if (nr_wander_records != 0) {
78629 +		warning("zam-632", "number of wander records in the linked list"
78630 +			" is less than the number stored in the tx head.\n");
78631 + ret = RETERR(-EIO);
78632 + goto free_ow_set;
78633 + }
78634 +
78635 +	{ /* write the recovered overwrite set in place */
78636 + write_jnode_list(ch.overwrite_set, NULL, NULL, 0);
78637 + ret = wait_on_jnode_list(ch.overwrite_set);
78638 +
78639 + if (ret) {
78640 + ret = RETERR(-EIO);
78641 + goto free_ow_set;
78642 + }
78643 + }
78644 +
78645 + ret = update_journal_footer(&ch, 0);
78646 +
78647 + free_ow_set:
78648 +
78649 + while (!list_empty(ch.overwrite_set)) {
78650 + jnode *cur = list_entry(ch.overwrite_set->next, jnode, capture_link);
78651 + list_del_init(&cur->capture_link);
78652 + jrelse(cur);
78653 + drop_io_head(cur);
78654 + }
78655 +
78656 + list_del_init(&tx_head->capture_link);
78657 +
78658 + done_commit_handle(&ch);
78659 +
78660 + return ret;
78661 +}
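+
+/* To make the wandered-log scheme concrete: every wander entry maps a
+   wandered location to an original location, and replay copies each block
+   from its wandered location back in place. A minimal userspace sketch of
+   that copy-back step (hypothetical code operating on a raw device image;
+   4096 stands in for the fs block size):
+
+	#include <unistd.h>
+	#include <stdint.h>
+
+	struct entry { uint64_t original; uint64_t wandered; };
+
+	static int copy_back(int fd, const struct entry *e, int nr)
+	{
+		char buf[4096];
+		int i;
+
+		for (i = 0; i < nr; i++) {
+			if (pread(fd, buf, 4096, e[i].wandered * 4096) != 4096)
+				return -1;
+			if (pwrite(fd, buf, 4096, e[i].original * 4096) != 4096)
+				return -1;
+		}
+		return 0;
+	}
+*/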
78662 +
78663 +/* Find the oldest committed but not yet replayed transaction and replay it.
78664 + * Such a transaction was committed and the journal header block was updated,
78665 + * but writing the atom's overwrite set in place and updating the journal
78666 + * footer block were not completed. This function completes the process by
78667 + * recovering the atom's overwrite set from its wandered locations, writing
78668 + * it in place, and updating the journal footer. */
78669 +static int replay_oldest_transaction(struct super_block *s)
78670 +{
78671 + reiser4_super_info_data *sbinfo = get_super_private(s);
78672 + jnode *jf = sbinfo->journal_footer;
78673 + unsigned int total;
78674 + struct journal_footer *F;
78675 + struct tx_header *T;
78676 +
78677 + reiser4_block_nr prev_tx;
78678 + reiser4_block_nr last_flushed_tx;
78679 + reiser4_block_nr log_rec_block = 0;
78680 +
78681 + jnode *tx_head;
78682 +
78683 + int ret;
78684 +
78685 + if ((ret = jload(jf)) < 0)
78686 + return ret;
78687 +
78688 + F = (struct journal_footer *)jdata(jf);
78689 +
78690 + last_flushed_tx = le64_to_cpu(get_unaligned(&F->last_flushed_tx));
78691 +
78692 + jrelse(jf);
78693 +
78694 + if (sbinfo->last_committed_tx == last_flushed_tx) {
78695 + /* all transactions are replayed */
78696 + return 0;
78697 + }
78698 +
78699 + prev_tx = sbinfo->last_committed_tx;
78700 +
78701 +	/* search for the oldest transaction that has not been flushed */
78702 + while (1) {
78703 + tx_head = alloc_io_head(&prev_tx);
78704 + if (!tx_head)
78705 + return RETERR(-ENOMEM);
78706 +
78707 + ret = jload(tx_head);
78708 + if (ret < 0) {
78709 + drop_io_head(tx_head);
78710 + return ret;
78711 + }
78712 +
78713 + ret = check_tx_head(tx_head);
78714 + if (ret) {
78715 + jrelse(tx_head);
78716 + drop_io_head(tx_head);
78717 + return ret;
78718 + }
78719 +
78720 + T = (struct tx_header *)jdata(tx_head);
78721 +
78722 + prev_tx = le64_to_cpu(get_unaligned(&T->prev_tx));
78723 +
78724 + if (prev_tx == last_flushed_tx)
78725 + break;
78726 +
78727 + jrelse(tx_head);
78728 + drop_io_head(tx_head);
78729 + }
78730 +
78731 + total = le32_to_cpu(get_unaligned(&T->total));
78732 + log_rec_block = le64_to_cpu(get_unaligned(&T->next_block));
78733 +
78734 + pin_jnode_data(tx_head);
78735 + jrelse(tx_head);
78736 +
78737 +	ret = replay_transaction(s, tx_head, &log_rec_block,
78738 +				 jnode_get_block(tx_head), total - 1);
78740 +
78741 + unpin_jnode_data(tx_head);
78742 + drop_io_head(tx_head);
78743 +
78744 + if (ret)
78745 + return ret;
78746 + return -E_REPEAT;
78747 +}
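+
+/* The transaction heads on disk form a singly linked list through their
+   prev_tx fields, newest first: the journal header points at the last
+   committed transaction, and the scan above walks prev_tx links until it
+   finds the transaction whose predecessor is the last flushed one. A sketch
+   of the same scan over a plain array standing in for disk blocks
+   (hypothetical code; uint64_t from <stdint.h>):
+
+	static uint64_t oldest_unflushed(const uint64_t *prev_tx,
+					 uint64_t last_committed,
+					 uint64_t last_flushed)
+	{
+		uint64_t tx = last_committed;
+
+		while (prev_tx[tx] != last_flushed)
+			tx = prev_tx[tx];
+		return tx;
+	}
+
+   After each successful replay the journal footer advances, and the
+   -E_REPEAT return value makes the caller repeat the scan until header and
+   footer agree. */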
78748 +
78749 +/* The current reiser4 journal implementation is optimized not to capture the
78750 +   super block when only certain super block fields are modified. Currently
78751 +   that set is (<free block count>, <OID allocator>). These fields are logged
78752 +   in a special way: they are stored in each transaction head block at atom
78753 +   commit time, and that information is written to the journal footer block
78754 +   at atom flush time. To carry the info from the journal footer block to
78755 +   the in-memory super block there is a special function,
78756 +   reiser4_journal_recover_sb_data(), which should be called after the disk
78757 +   format plugin re-reads the super block once journal replay is done.
78758 +*/
78759 +
78760 +/* get the information from the journal footer into the in-memory super block */
78761 +int reiser4_journal_recover_sb_data(struct super_block *s)
78762 +{
78763 + reiser4_super_info_data *sbinfo = get_super_private(s);
78764 + struct journal_footer *jf;
78765 + int ret;
78766 +
78767 + assert("zam-673", sbinfo->journal_footer != NULL);
78768 +
78769 + ret = jload(sbinfo->journal_footer);
78770 + if (ret != 0)
78771 + return ret;
78772 +
78773 + ret = check_journal_footer(sbinfo->journal_footer);
78774 + if (ret != 0)
78775 + goto out;
78776 +
78777 + jf = (struct journal_footer *)jdata(sbinfo->journal_footer);
78778 +
78779 + /* was there at least one flushed transaction? */
78780 + if (jf->last_flushed_tx) {
78781 +
78782 + /* restore free block counter logged in this transaction */
78783 + reiser4_set_free_blocks(s, le64_to_cpu(get_unaligned(&jf->free_blocks)));
78784 +
78785 + /* restore oid allocator state */
78786 + oid_init_allocator(s,
78787 + le64_to_cpu(get_unaligned(&jf->nr_files)),
78788 + le64_to_cpu(get_unaligned(&jf->next_oid)));
78789 + }
78790 + out:
78791 + jrelse(sbinfo->journal_footer);
78792 + return ret;
78793 +}
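+
+/* The on-disk fields read above (jf->free_blocks and friends) are
+   little-endian and possibly unaligned, hence the
+   le64_to_cpu(get_unaligned(...)) idiom (get_unaligned() comes from
+   <asm/unaligned.h>). A minimal sketch of the same idiom for a hypothetical
+   on-disk structure:
+
+	struct d_foo {
+		d64 counter;
+	} __attribute__((packed));
+
+	static inline __u64 foo_counter(const struct d_foo *f)
+	{
+		return le64_to_cpu(get_unaligned(&f->counter));
+	}
+
+   Writing back is the mirror image: put_unaligned(cpu_to_le64(v),
+   &f->counter). */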
78794 +
78795 +/* reiser4 journal replay procedure */
78796 +int reiser4_journal_replay(struct super_block *s)
78797 +{
78798 + reiser4_super_info_data *sbinfo = get_super_private(s);
78799 + jnode *jh, *jf;
78800 + struct journal_header *header;
78801 + int nr_tx_replayed = 0;
78802 + int ret;
78803 +
78804 + assert("zam-582", sbinfo != NULL);
78805 +
78806 + jh = sbinfo->journal_header;
78807 + jf = sbinfo->journal_footer;
78808 +
78809 + if (!jh || !jf) {
78810 +		/* it is possible that the disk layout does not support journal
78811 +		   structures; we just warn about this */
78812 + warning("zam-583",
78813 + "journal control blocks were not loaded by disk layout plugin. "
78814 + "journal replaying is not possible.\n");
78815 + return 0;
78816 + }
78817 +
78818 +	/* Take the free block count from the journal footer block. The free
78819 +	   block counter value corresponds to the last flushed transaction state */
78820 + ret = jload(jf);
78821 + if (ret < 0)
78822 + return ret;
78823 +
78824 + ret = check_journal_footer(jf);
78825 + if (ret) {
78826 + jrelse(jf);
78827 + return ret;
78828 + }
78829 +
78830 + jrelse(jf);
78831 +
78832 + /* store last committed transaction info in reiser4 in-memory super
78833 + block */
78834 + ret = jload(jh);
78835 + if (ret < 0)
78836 + return ret;
78837 +
78838 + ret = check_journal_header(jh);
78839 + if (ret) {
78840 + jrelse(jh);
78841 + return ret;
78842 + }
78843 +
78844 + header = (struct journal_header *)jdata(jh);
78845 + sbinfo->last_committed_tx = le64_to_cpu(get_unaligned(&header->last_committed_tx));
78846 +
78847 + jrelse(jh);
78848 +
78849 + /* replay committed transactions */
78850 + while ((ret = replay_oldest_transaction(s)) == -E_REPEAT)
78851 + nr_tx_replayed++;
78852 +
78853 + return ret;
78854 +}
78855 +
78856 +/* load journal control block (either journal header or journal footer block) */
78857 +static int
78858 +load_journal_control_block(jnode ** node, const reiser4_block_nr * block)
78859 +{
78860 + int ret;
78861 +
78862 + *node = alloc_io_head(block);
78863 + if (!(*node))
78864 + return RETERR(-ENOMEM);
78865 +
78866 + ret = jload(*node);
78867 +
78868 + if (ret) {
78869 + drop_io_head(*node);
78870 + *node = NULL;
78871 + return ret;
78872 + }
78873 +
78874 + pin_jnode_data(*node);
78875 + jrelse(*node);
78876 +
78877 + return 0;
78878 +}
78879 +
78880 +/* unload journal header or footer and free jnode */
78881 +static void unload_journal_control_block(jnode ** node)
78882 +{
78883 + if (*node) {
78884 + unpin_jnode_data(*node);
78885 + drop_io_head(*node);
78886 + *node = NULL;
78887 + }
78888 +}
78889 +
78890 +/* release journal control blocks */
78891 +void done_journal_info(struct super_block *s)
78892 +{
78893 + reiser4_super_info_data *sbinfo = get_super_private(s);
78894 +
78895 + assert("zam-476", sbinfo != NULL);
78896 +
78897 + unload_journal_control_block(&sbinfo->journal_header);
78898 + unload_journal_control_block(&sbinfo->journal_footer);
78899 + rcu_barrier();
78900 +}
78901 +
78902 +/* load journal control blocks */
78903 +int init_journal_info(struct super_block *s)
78904 +{
78905 + reiser4_super_info_data *sbinfo = get_super_private(s);
78906 + journal_location *loc;
78907 + int ret;
78908 +
78909 + loc = &sbinfo->jloc;
78910 +
78911 + assert("zam-651", loc != NULL);
78912 + assert("zam-652", loc->header != 0);
78913 + assert("zam-653", loc->footer != 0);
78914 +
78915 + ret = load_journal_control_block(&sbinfo->journal_header, &loc->header);
78916 +
78917 + if (ret)
78918 + return ret;
78919 +
78920 + ret = load_journal_control_block(&sbinfo->journal_footer, &loc->footer);
78921 +
78922 + if (ret) {
78923 + unload_journal_control_block(&sbinfo->journal_header);
78924 + }
78925 +
78926 + return ret;
78927 +}
78928 +
78929 +/* Make Linus happy.
78930 + Local variables:
78931 + c-indentation-style: "K&R"
78932 + mode-name: "LC"
78933 + c-basic-offset: 8
78934 + tab-width: 8
78935 + fill-column: 80
78936 + End:
78937 +*/
78938 Index: linux-2.6.16/fs/reiser4/wander.h
78939 ===================================================================
78940 --- /dev/null
78941 +++ linux-2.6.16/fs/reiser4/wander.h
78942 @@ -0,0 +1,135 @@
78943 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
78944 +
78945 +#if !defined (__FS_REISER4_WANDER_H__)
78946 +#define __FS_REISER4_WANDER_H__
78947 +
78948 +#include "dformat.h"
78949 +
78950 +#include <linux/fs.h> /* for struct super_block */
78951 +
78952 +/* REISER4 JOURNAL ON-DISK DATA STRUCTURES */
78953 +
78954 +#define TX_HEADER_MAGIC "TxMagic4"
78955 +#define WANDER_RECORD_MAGIC "LogMagc4"
78956 +
78957 +#define TX_HEADER_MAGIC_SIZE (8)
78958 +#define WANDER_RECORD_MAGIC_SIZE (8)
78959 +
78960 +/* journal header block format */
78961 +struct journal_header {
78962 + /* last written transaction head location */
78963 + d64 last_committed_tx;
78964 +};
78965 +
78966 +typedef struct journal_location {
78967 + reiser4_block_nr footer;
78968 + reiser4_block_nr header;
78969 +} journal_location;
78970 +
78971 +/* The wander.c head comment describes the usage and semantics of all these structures */
78972 +/* journal footer block format */
78973 +struct journal_footer {
78974 + /* last flushed transaction location. */
78975 +	/* This block number is no longer valid after the transaction it points
78976 +	   to gets flushed; it is used only at journal replay time to detect
78977 +	   the end of the on-disk list of committed transactions which were
78978 +	   not completely flushed */
78979 + d64 last_flushed_tx;
78980 +
78981 +	/* the free block counter is written to the journal footer at
78982 +	   transaction flush time, not to the super block, because the free
78983 +	   blocks counter is logged differently from super block fields (the
78984 +	   root pointer, for example). */
78985 + d64 free_blocks;
78986 +
78987 + /* number of used OIDs and maximal used OID are logged separately from
78988 + super block */
78989 + d64 nr_files;
78990 + d64 next_oid;
78991 +};
78992 +
78993 +/* Each wander record (except the first one) has a unified format: a wander
78994 + record header followed by an array of log entries */
78995 +struct wander_record_header {
78996 + /* when there is no predefined location for wander records, this magic
78997 + string should help reiser4fsck. */
78998 + char magic[WANDER_RECORD_MAGIC_SIZE];
78999 +
79000 + /* transaction id */
79001 + d64 id;
79002 +
79003 + /* total number of wander records in current transaction */
79004 + d32 total;
79005 +
79006 + /* this block number in transaction */
79007 + d32 serial;
79008 +
79009 +	/* block number of the next wander record in this transaction */
79010 + d64 next_block;
79011 +};
79012 +
79013 +/* The first wander record (transaction head) of a written transaction has a
79014 + special format */
79015 +struct tx_header {
79016 + /* magic string makes first block in transaction different from other
79017 + logged blocks, it should help fsck. */
79018 + char magic[TX_HEADER_MAGIC_SIZE];
79019 +
79020 + /* transaction id */
79021 + d64 id;
79022 +
79023 + /* total number of records (including this first tx head) in the
79024 + transaction */
79025 + d32 total;
79026 +
79027 +	/* align next field to an 8-byte boundary; this field is always zero */
79028 + d32 padding;
79029 +
79030 + /* block number of previous transaction head */
79031 + d64 prev_tx;
79032 +
79033 + /* next wander record location */
79034 + d64 next_block;
79035 +
79036 + /* committed versions of free blocks counter */
79037 + d64 free_blocks;
79038 +
79039 + /* number of used OIDs (nr_files) and maximal used OID are logged
79040 + separately from super block */
79041 + d64 nr_files;
79042 + d64 next_oid;
79043 +};
79044 +
79045 +/* A transaction gets written to disk as a set of wander records (each wander
79046 +   record is one fs block in size) */
79047 +
79048 +/* As noted above, the rest of a wander record is filled with these log
79049 +   entries; unused space is filled with zeroes */
79050 +struct wander_entry {
79051 + d64 original; /* block original location */
79052 + d64 wandered; /* block wandered location */
79053 +};
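+
+/* How many wander entries fit into one record follows directly from the
+   block size: the header occupies the first bytes of the block and entries
+   fill the rest. A sketch of the arithmetic (this only illustrates the idea
+   behind the wander_record_capacity() helper used during replay):
+
+	static int sketch_capacity(size_t block_size)
+	{
+		return (block_size - sizeof(struct wander_record_header)) /
+			sizeof(struct wander_entry);
+	}
+
+   With 4096-byte blocks, a 32-byte header and 16-byte entries (the natural
+   layouts of the structures above) this gives (4096 - 32) / 16 = 254
+   entries per wander record. */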
79054 +
79055 +/* REISER4 JOURNAL WRITER FUNCTIONS */
79056 +
79057 +extern int reiser4_write_logs(long *);
79058 +extern int reiser4_journal_replay(struct super_block *);
79059 +extern int reiser4_journal_recover_sb_data(struct super_block *);
79060 +
79061 +extern int init_journal_info(struct super_block *);
79062 +extern void done_journal_info(struct super_block *);
79063 +
79064 +extern int write_jnode_list(struct list_head *, flush_queue_t *, long *, int);
79065 +
79066 +#endif /* __FS_REISER4_WANDER_H__ */
79067 +
79068 +/* Make Linus happy.
79069 + Local variables:
79070 + c-indentation-style: "K&R"
79071 + mode-name: "LC"
79072 + c-basic-offset: 8
79073 + tab-width: 8
79074 + fill-column: 80
79075 + scroll-step: 1
79076 + End:
79077 +*/
79078 Index: linux-2.6.16/fs/reiser4/writeout.h
79079 ===================================================================
79080 --- /dev/null
79081 +++ linux-2.6.16/fs/reiser4/writeout.h
79082 @@ -0,0 +1,21 @@
79083 +/* Copyright 2002, 2003, 2004 by Hans Reiser, licensing governed by reiser4/README */
79084 +
79085 +#if !defined (__FS_REISER4_WRITEOUT_H__)
+#define __FS_REISER4_WRITEOUT_H__
79086 +
79087 +#define WRITEOUT_SINGLE_STREAM (0x1)
79088 +#define WRITEOUT_FOR_PAGE_RECLAIM (0x2)
79089 +#define WRITEOUT_BARRIER (0x4)
79090 +
79091 +extern int get_writeout_flags(void);
79092 +
79093 +#endif /* __FS_REISER4_WRITEOUT_H__ */
79094 +
79095 +/* Make Linus happy.
79096 + Local variables:
79097 + c-indentation-style: "K&R"
79098 + mode-name: "LC"
79099 + c-basic-offset: 8
79100 + tab-width: 8
79101 + fill-column: 80
79102 + End:
79103 +*/
79104 Index: linux-2.6.16/fs/reiser4/znode.c
79105 ===================================================================
79106 --- /dev/null
79107 +++ linux-2.6.16/fs/reiser4/znode.c
79108 @@ -0,0 +1,1028 @@
79109 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
79110 + * reiser4/README */
79111 +/* Znode manipulation functions. */
79112 +/* Znode is the in-memory header for a tree node. It is stored
79113 + separately from the node itself so that it does not get written to
79114 + disk. In this respect znode is like buffer head or page head. We
79115 + also use znodes for additional reiser4 specific purposes:
79116 +
79117 + . they are organized into tree structure which is a part of whole
79118 + reiser4 tree.
79119 +   . they are used to implement node-grained locking
79120 + . they are used to keep additional state associated with a
79121 + node
79122 + . they contain links to lists used by the transaction manager
79123 +
79124 + Znode is attached to some variable "block number" which is instance of
79125 + fs/reiser4/tree.h:reiser4_block_nr type. Znode can exist without
79126 + appropriate node being actually loaded in memory. Existence of znode itself
79127 + is regulated by reference count (->x_count) in it. Each time thread
79128 + acquires reference to znode through call to zget(), ->x_count is
79129 + incremented and decremented on call to zput(). Data (content of node) are
79130 +   brought into memory through a call to zload(), which also increments ->d_count
79131 + reference counter. zload can block waiting on IO. Call to zrelse()
79132 + decreases this counter. Also, ->c_count keeps track of number of child
79133 + znodes and prevents parent znode from being recycled until all of its
79134 + children are. ->c_count is decremented whenever child goes out of existence
79135 + (being actually recycled in zdestroy()) which can be some time after last
79136 + reference to this child dies if we support some form of LRU cache for
79137 + znodes.
79138 +
79139 +*/
79140 +/* EVERY ZNODE'S STORY
79141 +
79142 + 1. His infancy.
79143 +
79144 + Once upon a time, the znode was born deep inside of zget() by call to
79145 + zalloc(). At the return from zget() znode had:
79146 +
79147 + . reference counter (x_count) of 1
79148 + . assigned block number, marked as used in bitmap
79149 + . pointer to parent znode. Root znode parent pointer points
79150 + to its father: "fake" znode. This, in turn, has NULL parent pointer.
79151 + . hash table linkage
79152 + . no data loaded from disk
79153 + . no node plugin
79154 + . no sibling linkage
79155 +
79156 + 2. His childhood
79157 +
79158 + Each node is either brought into memory as a result of tree traversal, or
79159 + created afresh, creation of the root being a special case of the latter. In
79160 + either case it's inserted into sibling list. This will typically require
79161 + some ancillary tree traversing, but ultimately both sibling pointers will
79162 + exist and JNODE_LEFT_CONNECTED and JNODE_RIGHT_CONNECTED will be true in
79163 + zjnode.state.
79164 +
79165 + 3. His youth.
79166 +
79167 + If znode is bound to already existing node in a tree, its content is read
79168 + from the disk by call to zload(). At that moment, JNODE_LOADED bit is set
79169 + in zjnode.state and zdata() function starts to return non null for this
79170 + znode. zload() further calls zparse() that determines which node layout
79171 + this node is rendered in, and sets ->nplug on success.
79172 +
79173 + If znode is for new node just created, memory for it is allocated and
79174 + zinit_new() function is called to initialise data, according to selected
79175 + node layout.
79176 +
79177 + 4. His maturity.
79178 +
79179 + After this point, znode lingers in memory for some time. Threads can
79180 + acquire references to znode either by blocknr through call to zget(), or by
79181 + following a pointer to unallocated znode from internal item. Each time
79182 + reference to znode is obtained, x_count is increased. Thread can read/write
79183 + lock znode. Znode data can be loaded through calls to zload(), d_count will
79184 + be increased appropriately. If all references to znode are released
79185 + (x_count drops to 0), znode is not recycled immediately. Rather, it is
79186 + still cached in the hash table in the hope that it will be accessed
79187 + shortly.
79188 +
79189 + There are two ways in which znode existence can be terminated:
79190 +
79191 + . sudden death: node bound to this znode is removed from the tree
79192 + . overpopulation: znode is purged out of memory due to memory pressure
79193 +
79194 + 5. His death.
79195 +
79196 +   Death is a complex process.
79197 +
79198 + When we irrevocably commit ourselves to decision to remove node from the
79199 + tree, JNODE_HEARD_BANSHEE bit is set in zjnode.state of corresponding
79200 + znode. This is done either in ->kill_hook() of internal item or in
79201 + kill_root() function when tree root is removed.
79202 +
79203 + At this moment znode still has:
79204 +
79205 +   . locks held on it, necessarily write ones
79206 + . references to it
79207 + . disk block assigned to it
79208 + . data loaded from the disk
79209 + . pending requests for lock
79210 +
79211 + But once JNODE_HEARD_BANSHEE bit set, last call to unlock_znode() does node
79212 + deletion. Node deletion includes two phases. First all ways to get
79213 + references to that znode (sibling and parent links and hash lookup using
79214 +   block number stored in the parent node) should be removed -- done via
79215 +   sibling_list_remove(); also we assume that nobody uses the down link from
79216 +   the parent node (due to its nonexistence or proper parent locking), and
79217 +   nobody uses parent pointers from children (they are absent). Second we
79218 + invalidate all pending lock requests which still are on znode's lock
79219 +   request queue; this is done by invalidate_lock(). Another znode status
79220 +   bit, JNODE_IS_DYING, is used to invalidate pending lock requests. Once it
79221 +   is set, all requesters are forced to return -EINVAL from
79222 + longterm_lock_znode(). Future locking attempts are not possible because all
79223 + ways to get references to that znode are removed already. Last, node is
79224 + uncaptured from transaction.
79225 +
79226 + When last reference to the dying znode is just about to be released,
79227 + block number for this lock is released and znode is removed from the
79228 + hash table.
79229 +
79230 + Now znode can be recycled.
79231 +
79232 + [it's possible to free bitmap block and remove znode from the hash
79233 + table when last lock is released. This will result in having
79234 + referenced but completely orphaned znode]
79235 +
79236 + 6. Limbo
79237 +
79238 +   As has been mentioned above, znodes with reference counter 0 are
79239 +   still cached in the hash table. Once memory pressure increases they are
79240 + purged out of there [this requires something like LRU list for
79241 + efficient implementation. LRU list would also greatly simplify
79242 + implementation of coord cache that would in this case morph to just
79243 + scanning some initial segment of LRU list]. Data loaded into
79244 + unreferenced znode are flushed back to the durable storage if
79245 + necessary and memory is freed. Znodes themselves can be recycled at
79246 + this point too.
79247 +
79248 +*/
79249 +
79250 +#include "debug.h"
79251 +#include "dformat.h"
79252 +#include "key.h"
79253 +#include "coord.h"
79254 +#include "plugin/plugin_header.h"
79255 +#include "plugin/node/node.h"
79256 +#include "plugin/plugin.h"
79257 +#include "txnmgr.h"
79258 +#include "jnode.h"
79259 +#include "znode.h"
79260 +#include "block_alloc.h"
79261 +#include "tree.h"
79262 +#include "tree_walk.h"
79263 +#include "super.h"
79264 +#include "reiser4.h"
79265 +
79266 +#include <linux/pagemap.h>
79267 +#include <linux/spinlock.h>
79268 +#include <linux/slab.h>
79269 +#include <linux/err.h>
79270 +
79271 +static z_hash_table *get_htable(reiser4_tree *,
79272 + const reiser4_block_nr * const blocknr);
79273 +static z_hash_table *znode_get_htable(const znode *);
79274 +static void zdrop(znode *);
79275 +
79276 +/* hash table support */
79277 +
79278 +/* compare two block numbers for equality. Used by hash-table macros */
79279 +static inline int
79280 +blknreq(const reiser4_block_nr * b1, const reiser4_block_nr * b2)
79281 +{
79282 + assert("nikita-534", b1 != NULL);
79283 + assert("nikita-535", b2 != NULL);
79284 +
79285 + return *b1 == *b2;
79286 +}
79287 +
79288 +/* Hash znode by block number. Used by hash-table macros */
79289 +/* Audited by: umka (2002.06.11) */
79290 +static inline __u32
79291 +blknrhashfn(z_hash_table * table, const reiser4_block_nr * b)
79292 +{
79293 + assert("nikita-536", b != NULL);
79294 +
79295 + return *b & (REISER4_ZNODE_HASH_TABLE_SIZE - 1);
79296 +}
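+
+/* Assuming REISER4_ZNODE_HASH_TABLE_SIZE is a power of two (the mask form
+   above requires it), the hash is simply the block number modulo the table
+   size. For example, with a hypothetical table size of 1024:
+
+	12345 & (1024 - 1) == 57
+	12345 % 1024       == 57
+
+   so blocks 12345 and 13369 (= 12345 + 1024) land in the same bucket. */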
79297 +
79298 +/* The hash table definition */
79299 +#define KMALLOC(size) kmalloc((size), GFP_KERNEL)
79300 +#define KFREE(ptr, size) kfree(ptr)
79301 +TYPE_SAFE_HASH_DEFINE(z, znode, reiser4_block_nr, zjnode.key.z, zjnode.link.z,
79302 + blknrhashfn, blknreq);
79303 +#undef KFREE
79304 +#undef KMALLOC
79305 +
79306 +/* slab for znodes */
79307 +static kmem_cache_t *znode_cache;
79308 +
79309 +int znode_shift_order;
79310 +
79311 +/**
79312 + * init_znodes - create znode cache
79313 + *
79314 + * Initializes slab cache of znodes. It is part of reiser4 module initialization.
79315 + */
79316 +int init_znodes(void)
79317 +{
79318 + znode_cache = kmem_cache_create("znode", sizeof(znode), 0,
79319 + SLAB_HWCACHE_ALIGN |
79320 + SLAB_RECLAIM_ACCOUNT, NULL, NULL);
79321 + if (znode_cache == NULL)
79322 + return RETERR(-ENOMEM);
79323 +
79324 + for (znode_shift_order = 0; (1 << znode_shift_order) < sizeof(znode);
79325 + ++znode_shift_order);
79326 + --znode_shift_order;
79327 + return 0;
79328 +}
79329 +
79330 +/**
79331 + * done_znodes - delete znode cache
79332 + *
79333 + * This is called on reiser4 module unloading or system shutdown.
79334 + */
79335 +void done_znodes(void)
79336 +{
79337 + destroy_reiser4_cache(&znode_cache);
79338 +}
79339 +
79340 +/* call this to initialise tree of znodes */
79341 +int znodes_tree_init(reiser4_tree * tree /* tree to initialise znodes for */ )
79342 +{
79343 + int result;
79344 + assert("umka-050", tree != NULL);
79345 +
79346 + rwlock_init(&tree->dk_lock);
79347 +
79348 + result = z_hash_init(&tree->zhash_table, REISER4_ZNODE_HASH_TABLE_SIZE);
79349 + if (result != 0)
79350 + return result;
79351 + result = z_hash_init(&tree->zfake_table, REISER4_ZNODE_HASH_TABLE_SIZE);
79352 + return result;
79353 +}
79354 +
79355 +/* free this znode */
79356 +void zfree(znode * node /* znode to free */ )
79357 +{
79358 + assert("nikita-465", node != NULL);
79359 + assert("nikita-2120", znode_page(node) == NULL);
79360 + assert("nikita-2301", list_empty_careful(&node->lock.owners));
79361 + assert("nikita-2302", list_empty_careful(&node->lock.requestors));
79362 + assert("nikita-2663", (list_empty_careful(&ZJNODE(node)->capture_link) &&
79363 + NODE_LIST(ZJNODE(node)) == NOT_CAPTURED));
79364 + assert("nikita-3220", list_empty(&ZJNODE(node)->jnodes));
79365 + assert("nikita-3293", !znode_is_right_connected(node));
79366 + assert("nikita-3294", !znode_is_left_connected(node));
79367 + assert("nikita-3295", node->left == NULL);
79368 + assert("nikita-3296", node->right == NULL);
79369 +
79370 + /* not yet phash_jnode_destroy(ZJNODE(node)); */
79371 +
79372 + kmem_cache_free(znode_cache, node);
79373 +}
79374 +
79375 +/* call this to free tree of znodes */
79376 +void znodes_tree_done(reiser4_tree * tree /* tree to finish with znodes of */ )
79377 +{
79378 + znode *node;
79379 + znode *next;
79380 + z_hash_table *ztable;
79381 +
79382 + /* scan znode hash-tables and kill all znodes, then free hash tables
79383 + * themselves. */
79384 +
79385 + assert("nikita-795", tree != NULL);
79386 +
79387 + ztable = &tree->zhash_table;
79388 +
79389 + if (ztable->_table != NULL) {
79390 + for_all_in_htable(ztable, z, node, next) {
79391 + node->c_count = 0;
79392 + node->in_parent.node = NULL;
79393 + assert("nikita-2179", atomic_read(&ZJNODE(node)->x_count) == 0);
79394 + zdrop(node);
79395 + }
79396 +
79397 + z_hash_done(&tree->zhash_table);
79398 + }
79399 +
79400 + ztable = &tree->zfake_table;
79401 +
79402 + if (ztable->_table != NULL) {
79403 + for_all_in_htable(ztable, z, node, next) {
79404 + node->c_count = 0;
79405 + node->in_parent.node = NULL;
79406 + assert("nikita-2179", atomic_read(&ZJNODE(node)->x_count) == 0);
79407 + zdrop(node);
79408 + }
79409 +
79410 + z_hash_done(&tree->zfake_table);
79411 + }
79412 +}
79413 +
79414 +/* ZNODE STRUCTURES */
79415 +
79416 +/* allocate fresh znode */
79417 +znode *zalloc(gfp_t gfp_flag /* allocation flag */ )
79418 +{
79419 + znode *node;
79420 +
79421 + node = kmem_cache_alloc(znode_cache, gfp_flag);
79422 + return node;
79423 +}
79424 +
79425 +/* Initialize fields of znode
79426 + @node: znode to initialize;
79427 + @parent: parent znode;
79428 + @tree: tree we are in. */
79429 +void zinit(znode * node, const znode * parent, reiser4_tree * tree)
79430 +{
79431 + assert("nikita-466", node != NULL);
79432 + assert("umka-268", current_tree != NULL);
79433 +
79434 + memset(node, 0, sizeof *node);
79435 +
79436 + assert("umka-051", tree != NULL);
79437 +
79438 + jnode_init(&node->zjnode, tree, JNODE_FORMATTED_BLOCK);
79439 + reiser4_init_lock(&node->lock);
79440 + init_parent_coord(&node->in_parent, parent);
79441 +}
79442 +
79443 +/*
79444 + * remove znode from indices. This is called by jput() when the last
79445 + * reference to the znode is released.
79446 + */
79447 +void znode_remove(znode * node /* znode to remove */ , reiser4_tree * tree)
79448 +{
79449 + assert("nikita-2108", node != NULL);
79450 + assert("nikita-470", node->c_count == 0);
79451 + assert_rw_write_locked(&(tree->tree_lock));
79452 +
79453 + /* remove reference to this znode from cbk cache */
79454 + cbk_cache_invalidate(node, tree);
79455 +
79456 + /* update c_count of parent */
79457 + if (znode_parent(node) != NULL) {
79458 + assert("nikita-472", znode_parent(node)->c_count > 0);
79459 + /* father, onto your hands I forward my spirit... */
79460 + znode_parent(node)->c_count--;
79461 + node->in_parent.node = NULL;
79462 + } else {
79463 + /* orphaned znode?! Root? */
79464 + }
79465 +
79466 + /* remove znode from hash-table */
79467 + z_hash_remove_rcu(znode_get_htable(node), node);
79468 +}
79469 +
79470 +/* zdrop() -- Remove znode from the tree.
79471 +
79472 + This is called when znode is removed from the memory. */
79473 +static void zdrop(znode * node /* znode to finish with */ )
79474 +{
79475 + jdrop(ZJNODE(node));
79476 +}
79477 +
79478 +/*
79479 + * put znode into right place in the hash table. This is called by relocate
79480 + * code.
79481 + */
79482 +int znode_rehash(znode * node /* node to rehash */ ,
79483 + const reiser4_block_nr * new_block_nr /* new block number */ )
79484 +{
79485 + z_hash_table *oldtable;
79486 + z_hash_table *newtable;
79487 + reiser4_tree *tree;
79488 +
79489 + assert("nikita-2018", node != NULL);
79490 +
79491 + tree = znode_get_tree(node);
79492 + oldtable = znode_get_htable(node);
79493 + newtable = get_htable(tree, new_block_nr);
79494 +
79495 + write_lock_tree(tree);
79496 + /* remove znode from hash-table */
79497 + z_hash_remove_rcu(oldtable, node);
79498 +
79499 + /* assertion no longer valid due to RCU */
79500 + /* assert("nikita-2019", z_hash_find(newtable, new_block_nr) == NULL); */
79501 +
79502 + /* update blocknr */
79503 + znode_set_block(node, new_block_nr);
79504 + node->zjnode.key.z = *new_block_nr;
79505 +
79506 + /* insert it into hash */
79507 + z_hash_insert_rcu(newtable, node);
79508 + write_unlock_tree(tree);
79509 + return 0;
79510 +}
79511 +
79512 +/* ZNODE LOOKUP, GET, PUT */
79513 +
79514 +/* zlook() - get znode with given block_nr in a hash table or return NULL
79515 +
79516 + If result is non-NULL then the znode's x_count is incremented. Internal version
79517 +   accepts a pre-computed hash index. The hash table is accessed under
79518 +   rcu_read_lock().
79519 +*/
79520 +znode *zlook(reiser4_tree * tree, const reiser4_block_nr * const blocknr)
79521 +{
79522 + znode *result;
79523 + __u32 hash;
79524 + z_hash_table *htable;
79525 +
79526 + assert("jmacd-506", tree != NULL);
79527 + assert("jmacd-507", blocknr != NULL);
79528 +
79529 + htable = get_htable(tree, blocknr);
79530 + hash = blknrhashfn(htable, blocknr);
79531 +
79532 + rcu_read_lock();
79533 + result = z_hash_find_index(htable, hash, blocknr);
79534 +
79535 + if (result != NULL) {
79536 + add_x_ref(ZJNODE(result));
79537 + result = znode_rip_check(tree, result);
79538 + }
79539 + rcu_read_unlock();
79540 +
79541 + return result;
79542 +}
79543 +
79544 +/* return hash table where znode with block @blocknr is (or should be)
79545 + * stored */
79546 +static z_hash_table *get_htable(reiser4_tree * tree,
79547 + const reiser4_block_nr * const blocknr)
79548 +{
79549 + z_hash_table *table;
79550 + if (is_disk_addr_unallocated(blocknr))
79551 + table = &tree->zfake_table;
79552 + else
79553 + table = &tree->zhash_table;
79554 + return table;
79555 +}
79556 +
79557 +/* return hash table where znode @node is (or should be) stored */
79558 +static z_hash_table *znode_get_htable(const znode * node)
79559 +{
79560 + return get_htable(znode_get_tree(node), znode_get_block(node));
79561 +}
79562 +
79563 +/* zget() - get znode from hash table, allocating it if necessary.
79564 +
79565 +   First the hash table is searched, as in zlook(), locating an x-referenced
79566 +   znode if one exists. If the znode is not found, a new one is allocated and
79567 +   inserted. The result is returned with its x_count reference increased.
79568 +
79569 + LOCKS TAKEN: TREE_LOCK, ZNODE_LOCK
79570 + LOCK ORDERING: NONE
79571 +*/
79572 +znode *zget(reiser4_tree * tree,
79573 + const reiser4_block_nr * const blocknr,
79574 + znode * parent, tree_level level, gfp_t gfp_flag)
79575 +{
79576 + znode *result;
79577 + __u32 hashi;
79578 +
79579 + z_hash_table *zth;
79580 +
79581 + assert("jmacd-512", tree != NULL);
79582 + assert("jmacd-513", blocknr != NULL);
79583 + assert("jmacd-514", level < REISER4_MAX_ZTREE_HEIGHT);
79584 +
79585 + zth = get_htable(tree, blocknr);
79586 + hashi = blknrhashfn(zth, blocknr);
79587 +
79588 + /* NOTE-NIKITA address-as-unallocated-blocknr still is not
79589 + implemented. */
79590 +
79591 + z_hash_prefetch_bucket(zth, hashi);
79592 +
79593 + rcu_read_lock();
79594 + /* Find a matching BLOCKNR in the hash table. If the znode is found,
79595 +	   we obtain a reference (x_count) but the znode remains unlocked.
79596 + Have to worry about race conditions later. */
79597 + result = z_hash_find_index(zth, hashi, blocknr);
79598 + /* According to the current design, the hash table lock protects new
79599 + znode references. */
79600 + if (result != NULL) {
79601 + add_x_ref(ZJNODE(result));
79602 + /* NOTE-NIKITA it should be so, but special case during
79603 + creation of new root makes such assertion highly
79604 + complicated. */
79605 + assert("nikita-2131", 1 || znode_parent(result) == parent ||
79606 + (ZF_ISSET(result, JNODE_ORPHAN)
79607 + && (znode_parent(result) == NULL)));
79608 + result = znode_rip_check(tree, result);
79609 + }
79610 +
79611 + rcu_read_unlock();
79612 +
79613 + if (!result) {
79614 + znode *shadow;
79615 +
79616 + result = zalloc(gfp_flag);
79617 + if (!result) {
79618 + return ERR_PTR(RETERR(-ENOMEM));
79619 + }
79620 +
79621 + zinit(result, parent, tree);
79622 + ZJNODE(result)->blocknr = *blocknr;
79623 + ZJNODE(result)->key.z = *blocknr;
79624 + result->level = level;
79625 +
79626 + write_lock_tree(tree);
79627 +
79628 + shadow = z_hash_find_index(zth, hashi, blocknr);
79629 + if (unlikely(shadow != NULL && !ZF_ISSET(shadow, JNODE_RIP))) {
79630 + jnode_list_remove(ZJNODE(result));
79631 + zfree(result);
79632 + result = shadow;
79633 + } else {
79634 + result->version = znode_build_version(tree);
79635 + z_hash_insert_index_rcu(zth, hashi, result);
79636 +
79637 + if (parent != NULL)
79638 + ++parent->c_count;
79639 + }
79640 +
79641 + add_x_ref(ZJNODE(result));
79642 +
79643 + write_unlock_tree(tree);
79644 + }
79645 +#if REISER4_DEBUG
79646 + if (!blocknr_is_fake(blocknr) && *blocknr != 0)
79647 + reiser4_check_block(blocknr, 1);
79648 +#endif
79649 + /* Check for invalid tree level, return -EIO */
79650 + if (unlikely(znode_get_level(result) != level)) {
79651 + warning("jmacd-504",
79652 + "Wrong level for cached block %llu: %i expecting %i",
79653 + (unsigned long long)(*blocknr), znode_get_level(result),
79654 + level);
79655 + zput(result);
79656 + return ERR_PTR(RETERR(-EIO));
79657 + }
79658 +
79659 + assert("nikita-1227", znode_invariant(result));
79660 +
79661 + return result;
79662 +}
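+
+/* A sketch of the caller pattern implied by the reference counting rules
+   described at the top of this file (hypothetical caller; error handling
+   abbreviated; tree, blocknr, parent and level are placeholders):
+
+	znode *node;
+	int ret;
+
+	node = zget(tree, &blocknr, parent, level, GFP_KERNEL);
+	if (IS_ERR(node))
+		return PTR_ERR(node);      <- x_count reference held from here
+	ret = zload(node);                 <- d_count: node data in memory
+	if (ret == 0) {
+		...work with zdata(node)...
+		zrelse(node);              <- drop d_count
+	}
+	zput(node);                        <- drop x_count
+*/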
79663 +
79664 +/* ZNODE PLUGINS/DATA */
79665 +
79666 +/* "guess" plugin for node loaded from the disk. Plugin id of node plugin is
79667 + stored at the fixed offset from the beginning of the node. */
79668 +static node_plugin *znode_guess_plugin(const znode * node /* znode to guess
79669 + * plugin of */ )
79670 +{
79671 + reiser4_tree *tree;
79672 +
79673 + assert("nikita-1053", node != NULL);
79674 + assert("nikita-1055", zdata(node) != NULL);
79675 +
79676 + tree = znode_get_tree(node);
79677 + assert("umka-053", tree != NULL);
79678 +
79679 + if (reiser4_is_set(tree->super, REISER4_ONE_NODE_PLUGIN)) {
79680 + return tree->nplug;
79681 + } else {
79682 + return node_plugin_by_disk_id
79683 + (tree, &((common_node_header *) zdata(node))->plugin_id);
79684 +#ifdef GUESS_EXISTS
79685 + reiser4_plugin *plugin;
79686 +
79687 + /* NOTE-NIKITA add locking here when dynamic plugins will be
79688 + * implemented */
79689 + for_all_plugins(REISER4_NODE_PLUGIN_TYPE, plugin) {
79690 + if ((plugin->u.node.guess != NULL)
79691 + && plugin->u.node.guess(node))
79692 + return plugin;
79693 + }
79694 + warning("nikita-1057", "Cannot guess node plugin");
79695 + print_znode("node", node);
79696 + return NULL;
79697 +#endif
79698 + }
79699 +}
79700 +
79701 +/* parse node header and install ->node_plugin */
79702 +int zparse(znode * node /* znode to parse */ )
79703 +{
79704 + int result;
79705 +
79706 + assert("nikita-1233", node != NULL);
79707 + assert("nikita-2370", zdata(node) != NULL);
79708 +
79709 + if (node->nplug == NULL) {
79710 + node_plugin *nplug;
79711 +
79712 + nplug = znode_guess_plugin(node);
79713 + if (likely(nplug != NULL)) {
79714 + result = nplug->parse(node);
79715 + if (likely(result == 0))
79716 + node->nplug = nplug;
79717 + } else {
79718 + result = RETERR(-EIO);
79719 + }
79720 + } else
79721 + result = 0;
79722 + return result;
79723 +}
79724 +
79725 +/* zload with readahead */
79726 +int zload_ra(znode * node /* znode to load */ , ra_info_t * info)
79727 +{
79728 + int result;
79729 +
79730 + assert("nikita-484", node != NULL);
79731 + assert("nikita-1377", znode_invariant(node));
79732 + assert("jmacd-7771", !znode_above_root(node));
79733 + assert("nikita-2125", atomic_read(&ZJNODE(node)->x_count) > 0);
79734 + assert("nikita-3016", schedulable());
79735 +
79736 + if (info)
79737 + formatted_readahead(node, info);
79738 +
79739 + result = jload(ZJNODE(node));
79740 + assert("nikita-1378", znode_invariant(node));
79741 + return result;
79742 +}
79743 +
79744 +/* load content of node into memory */
79745 +int zload(znode * node)
79746 +{
79747 + return zload_ra(node, NULL);
79748 +}
79749 +
79750 +/* call node plugin to initialise newly allocated node. */
79751 +int zinit_new(znode * node /* znode to initialise */ , gfp_t gfp_flags)
79752 +{
79753 + return jinit_new(ZJNODE(node), gfp_flags);
79754 +}
79755 +
79756 +/* drop reference to node data. When last reference is dropped, data are
79757 + unloaded. */
79758 +void zrelse(znode * node /* znode to release references to */ )
79759 +{
79760 + assert("nikita-1381", znode_invariant(node));
79761 +
79762 + jrelse(ZJNODE(node));
79763 +}
79764 +
79765 +/* returns free space in node */
79766 +unsigned znode_free_space(znode * node /* znode to query */ )
79767 +{
79768 + assert("nikita-852", node != NULL);
79769 + return node_plugin_by_node(node)->free_space(node);
79770 +}
79771 +
79772 +/* right delimiting key of znode */
79773 +reiser4_key *znode_get_rd_key(znode * node /* znode to query */ )
79774 +{
79775 + assert("nikita-958", node != NULL);
79776 + assert_rw_locked(&(znode_get_tree(node)->dk_lock));
79777 + assert("nikita-3067", LOCK_CNT_GTZ(rw_locked_dk));
79778 + assert("nikita-30671", node->rd_key_version != 0);
79779 + return &node->rd_key;
79780 +}
79781 +
79782 +/* left delimiting key of znode */
79783 +reiser4_key *znode_get_ld_key(znode * node /* znode to query */ )
79784 +{
79785 + assert("nikita-974", node != NULL);
79786 + assert_rw_locked(&(znode_get_tree(node)->dk_lock));
79787 + assert("nikita-3068", LOCK_CNT_GTZ(rw_locked_dk));
79788 + assert("nikita-30681", node->ld_key_version != 0);
79789 + return &node->ld_key;
79790 +}
79791 +
79792 +ON_DEBUG(atomic_t delim_key_version = ATOMIC_INIT(0);
79793 + )
79794 +
79795 +/* update right-delimiting key of @node */
79796 +reiser4_key *znode_set_rd_key(znode * node, const reiser4_key * key)
79797 +{
79798 + assert("nikita-2937", node != NULL);
79799 + assert("nikita-2939", key != NULL);
79800 + assert_rw_write_locked(&(znode_get_tree(node)->dk_lock));
79801 + assert("nikita-3069", LOCK_CNT_GTZ(write_locked_dk));
79802 + assert("nikita-2944",
79803 + znode_is_any_locked(node) ||
79804 + znode_get_level(node) != LEAF_LEVEL ||
79805 + keyge(key, &node->rd_key) ||
79806 + keyeq(&node->rd_key, min_key()) ||
79807 + ZF_ISSET(node, JNODE_HEARD_BANSHEE));
79808 +
79809 + node->rd_key = *key;
79810 + ON_DEBUG(node->rd_key_version = atomic_inc_return(&delim_key_version));
79811 + return &node->rd_key;
79812 +}
79813 +
79814 +/* update left-delimiting key of @node */
79815 +reiser4_key *znode_set_ld_key(znode * node, const reiser4_key * key)
79816 +{
79817 + assert("nikita-2940", node != NULL);
79818 + assert("nikita-2941", key != NULL);
79819 + assert_rw_write_locked(&(znode_get_tree(node)->dk_lock));
79820 + assert("nikita-3070", LOCK_CNT_GTZ(write_locked_dk));
79821 + assert("nikita-2943",
79822 + znode_is_any_locked(node) || keyeq(&node->ld_key, min_key()));
79823 +
79824 + node->ld_key = *key;
79825 + ON_DEBUG(node->ld_key_version = atomic_inc_return(&delim_key_version));
79826 + return &node->ld_key;
79827 +}
79828 +
79829 +/* true if @key is inside key range for @node */
79830 +int znode_contains_key(znode * node /* znode to look in */ ,
79831 + const reiser4_key * key /* key to look for */ )
79832 +{
79833 + assert("nikita-1237", node != NULL);
79834 + assert("nikita-1238", key != NULL);
79835 +
79836 + /* left_delimiting_key <= key <= right_delimiting_key */
79837 + return keyle(znode_get_ld_key(node), key)
79838 + && keyle(key, znode_get_rd_key(node));
79839 +}
79840 +
79841 +/* same as znode_contains_key(), but takes the dk lock */
79842 +int znode_contains_key_lock(znode * node /* znode to look in */ ,
79843 + const reiser4_key * key /* key to look for */ )
79844 +{
79845 + int result;
79846 +
79847 + assert("umka-056", node != NULL);
79848 + assert("umka-057", key != NULL);
79849 +
79850 + read_lock_dk(znode_get_tree(node));
79851 + result = znode_contains_key(node, key);
79852 + read_unlock_dk(znode_get_tree(node));
79853 + return result;
79854 +}
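+
+/* A znode thus covers the closed key interval [ld_key, rd_key]. With plain
+   integers standing in for keys (hypothetical; real keys compare with
+   keyle()), the check reduces to:
+
+	static int contains(int ld, int rd, int key)
+	{
+		return ld <= key && key <= rd;
+	}
+
+   so a node with ld = 10 and rd = 20 contains 10, 15 and 20, but not 21. */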
79855 +
79856 +/* get parent pointer, assuming tree is not locked */
79857 +znode *znode_parent_nolock(const znode * node /* child znode */ )
79858 +{
79859 + assert("nikita-1444", node != NULL);
79860 + return node->in_parent.node;
79861 +}
79862 +
79863 +/* get parent pointer of znode */
79864 +znode *znode_parent(const znode * node /* child znode */ )
79865 +{
79866 + assert("nikita-1226", node != NULL);
79867 + assert("nikita-1406", LOCK_CNT_GTZ(rw_locked_tree));
79868 + return znode_parent_nolock(node);
79869 +}
79870 +
79871 +/* detect uber znode used to protect in-superblock tree root pointer */
79872 +int znode_above_root(const znode * node /* znode to query */ )
79873 +{
79874 + assert("umka-059", node != NULL);
79875 +
79876 + return disk_addr_eq(&ZJNODE(node)->blocknr, &UBER_TREE_ADDR);
79877 +}
79878 +
79879 +/* check that @node is root---that its block number is recorded in the tree as
79880 +   that of the root node */
79881 +#if REISER4_DEBUG
79882 +static int znode_is_true_root(const znode * node /* znode to query */ )
79883 +{
79884 + assert("umka-060", node != NULL);
79885 + assert("umka-061", current_tree != NULL);
79886 +
79887 + return disk_addr_eq(znode_get_block(node),
79888 + &znode_get_tree(node)->root_block);
79889 +}
79890 +#endif
79891 +
79892 +/* check that @node is root */
79893 +int znode_is_root(const znode * node /* znode to query */ )
79894 +{
79895 + assert("nikita-1206", node != NULL);
79896 +
79897 + return znode_get_level(node) == znode_get_tree(node)->height;
79898 +}
79899 +
79900 +/* Returns true if @node was just created by zget() and wasn't ever loaded
79901 + into memory. */
79902 +/* NIKITA-HANS: yes */
79903 +int znode_just_created(const znode * node)
79904 +{
79905 + assert("nikita-2188", node != NULL);
79906 + return (znode_page(node) == NULL);
79907 +}
79908 +
79909 +/* obtain updated ->znode_epoch. See seal.c for description. */
79910 +__u64 znode_build_version(reiser4_tree * tree)
79911 +{
79912 + __u64 result;
79913 +
79914 + spin_lock(&tree->epoch_lock);
79915 + result = ++tree->znode_epoch;
79916 + spin_unlock(&tree->epoch_lock);
79917 + return result;
79918 +}
79919 +
79920 +void init_load_count(load_count * dh)
79921 +{
79922 + assert("nikita-2105", dh != NULL);
79923 + memset(dh, 0, sizeof *dh);
79924 +}
79925 +
79926 +void done_load_count(load_count * dh)
79927 +{
79928 + assert("nikita-2106", dh != NULL);
79929 + if (dh->node != NULL) {
79930 + for (; dh->d_ref > 0; --dh->d_ref)
79931 + zrelse(dh->node);
79932 + dh->node = NULL;
79933 + }
79934 +}
79935 +
79936 +static int incr_load_count(load_count * dh)
79937 +{
79938 + int result;
79939 +
79940 + assert("nikita-2110", dh != NULL);
79941 + assert("nikita-2111", dh->node != NULL);
79942 +
79943 + result = zload(dh->node);
79944 + if (result == 0)
79945 + ++dh->d_ref;
79946 + return result;
79947 +}
79948 +
79949 +int incr_load_count_znode(load_count * dh, znode * node)
79950 +{
79951 + assert("nikita-2107", dh != NULL);
79952 + assert("nikita-2158", node != NULL);
79953 + assert("nikita-2109",
79954 + ergo(dh->node != NULL, (dh->node == node) || (dh->d_ref == 0)));
79955 +
79956 + dh->node = node;
79957 + return incr_load_count(dh);
79958 +}
79959 +
79960 +int incr_load_count_jnode(load_count * dh, jnode * node)
79961 +{
79962 + if (jnode_is_znode(node)) {
79963 + return incr_load_count_znode(dh, JZNODE(node));
79964 + }
79965 + return 0;
79966 +}
79967 +
79968 +void copy_load_count(load_count * new, load_count * old)
79969 +{
79970 + int ret = 0;
79971 + done_load_count(new);
79972 + new->node = old->node;
79973 + new->d_ref = 0;
79974 +
79975 + while ((new->d_ref < old->d_ref) && (ret = incr_load_count(new)) == 0) {
79976 + }
79977 +
79978 + assert("jmacd-87589", ret == 0);
79979 +}
79980 +
79981 +void move_load_count(load_count * new, load_count * old)
79982 +{
79983 + done_load_count(new);
79984 + new->node = old->node;
79985 + new->d_ref = old->d_ref;
79986 + old->node = NULL;
79987 + old->d_ref = 0;
79988 +}
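+
+/* A sketch of how a load_count is typically used as a scoped handle around
+   zload()/zrelse() pairs (hypothetical caller; error handling abbreviated):
+
+	load_count lh;
+	int ret;
+
+	init_load_count(&lh);
+	ret = incr_load_count_znode(&lh, node);   <- zload + remember node
+	if (ret == 0) {
+		...work with zdata(node)...
+	}
+	done_load_count(&lh);                     <- zrelse once per load
+*/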
79989 +
79990 +/* convert parent pointer into coord */
79991 +void parent_coord_to_coord(const parent_coord_t * pcoord, coord_t * coord)
79992 +{
79993 + assert("nikita-3204", pcoord != NULL);
79994 + assert("nikita-3205", coord != NULL);
79995 +
79996 + coord_init_first_unit_nocheck(coord, pcoord->node);
79997 + coord_set_item_pos(coord, pcoord->item_pos);
79998 + coord->between = AT_UNIT;
79999 +}
80000 +
80001 +/* pack coord into parent_coord_t */
80002 +void coord_to_parent_coord(const coord_t * coord, parent_coord_t * pcoord)
80003 +{
80004 + assert("nikita-3206", pcoord != NULL);
80005 + assert("nikita-3207", coord != NULL);
80006 +
80007 + pcoord->node = coord->node;
80008 + pcoord->item_pos = coord->item_pos;
80009 +}
80010 +
80011 +/* Initialize a parent hint pointer. (parent hint pointer is a field in znode,
80012 + look for comments there) */
80013 +void init_parent_coord(parent_coord_t * pcoord, const znode * node)
80014 +{
80015 + pcoord->node = (znode *) node;
80016 + pcoord->item_pos = (unsigned short)~0;
80017 +}
80018 +
80019 +#if REISER4_DEBUG
80020 +
80021 +/* debugging aid: znode invariant */
80022 +static int znode_invariant_f(const znode * node /* znode to check */ ,
80023 + char const **msg /* where to store error
80024 + * message, if any */ )
80025 +{
80026 +#define _ergo(ant, con) \
80027 + ((*msg) = "{" #ant "} ergo {" #con "}", ergo((ant), (con)))
80028 +
80029 +#define _equi(e1, e2) \
80030 + ((*msg) = "{" #e1 "} <=> {" #e2 "}", equi((e1), (e2)))
80031 +
80032 +#define _check(exp) ((*msg) = #exp, (exp))
80033 +
80034 + return jnode_invariant_f(ZJNODE(node), msg) &&
80035 + /* [znode-fake] invariant */
80036 + /* fake znode doesn't have a parent, and */
80037 + _ergo(znode_get_level(node) == 0, znode_parent(node) == NULL) &&
80038 + /* there is another way to express this very check, and */
80039 + _ergo(znode_above_root(node), znode_parent(node) == NULL) &&
80040 + /* it has special block number, and */
80041 + _ergo(znode_get_level(node) == 0,
80042 + disk_addr_eq(znode_get_block(node), &UBER_TREE_ADDR)) &&
80043 + /* it is the only znode with such block number, and */
80044 + _ergo(!znode_above_root(node) && znode_is_loaded(node),
80045 + !disk_addr_eq(znode_get_block(node), &UBER_TREE_ADDR)) &&
80046 + /* it is parent of the tree root node */
80047 + _ergo(znode_is_true_root(node),
80048 + znode_above_root(znode_parent(node))) &&
80049 + /* [znode-level] invariant */
80050 + /* level of parent znode is one larger than that of child,
80051 + except for the fake znode, and */
80052 + _ergo(znode_parent(node) && !znode_above_root(znode_parent(node)),
80053 + znode_get_level(znode_parent(node)) ==
80054 + znode_get_level(node) + 1) &&
80055 + /* left neighbor is at the same level, and */
80056 + _ergo(znode_is_left_connected(node) && node->left != NULL,
80057 + znode_get_level(node) == znode_get_level(node->left)) &&
80058 + /* right neighbor is at the same level */
80059 + _ergo(znode_is_right_connected(node) && node->right != NULL,
80060 + znode_get_level(node) == znode_get_level(node->right)) &&
80061 + /* [znode-connected] invariant */
80062 + _ergo(node->left != NULL, znode_is_left_connected(node)) &&
80063 + _ergo(node->right != NULL, znode_is_right_connected(node)) &&
80064 + _ergo(!znode_is_root(node) && node->left != NULL,
80065 + znode_is_right_connected(node->left) &&
80066 + node->left->right == node) &&
80067 + _ergo(!znode_is_root(node) && node->right != NULL,
80068 + znode_is_left_connected(node->right) &&
80069 + node->right->left == node) &&
80070 + /* [znode-c_count] invariant */
80071 + /* for any znode, c_count of its parent is greater than 0 */
80072 + _ergo(znode_parent(node) != NULL &&
80073 + !znode_above_root(znode_parent(node)),
80074 + znode_parent(node)->c_count > 0) &&
80075 + /* leaves don't have children */
80076 + _ergo(znode_get_level(node) == LEAF_LEVEL,
80077 + node->c_count == 0) &&
80078 + _check(node->zjnode.jnodes.prev != NULL) &&
80079 + _check(node->zjnode.jnodes.next != NULL) &&
80080 + /* orphan doesn't have a parent */
80081 + _ergo(ZF_ISSET(node, JNODE_ORPHAN), znode_parent(node) == 0) &&
80082 + /* [znode-modify] invariant */
80083 + /* if znode is not write-locked, its checksum remains
80084 + * invariant */
80085 + /* unfortunately, zlock is unordered w.r.t. jnode_lock, so we
80086 + * cannot check this. */
80087 + /* [znode-refs] invariant */
80088 + /* only referenced znode can be long-term locked */
80089 + _ergo(znode_is_locked(node),
80090 + atomic_read(&ZJNODE(node)->x_count) != 0);
80091 +}
80092 +
80093 +/* debugging aid: check znode invariant and panic if it doesn't hold */
80094 +int znode_invariant(znode * node /* znode to check */ )
80095 +{
80096 + char const *failed_msg;
80097 + int result;
80098 +
80099 + assert("umka-063", node != NULL);
80100 + assert("umka-064", current_tree != NULL);
80101 +
80102 + spin_lock_znode(node);
80103 + read_lock_tree(znode_get_tree(node));
80104 + result = znode_invariant_f(node, &failed_msg);
80105 + if (!result) {
80106 + /* print_znode("corrupted node", node); */
80107 + warning("jmacd-555", "Condition %s failed", failed_msg);
80108 + }
80109 + read_unlock_tree(znode_get_tree(node));
80110 + spin_unlock_znode(node);
80111 + return result;
80112 +}
80113 +
80114 +/* return non-0 iff data are loaded into znode */
80115 +int znode_is_loaded(const znode * node /* znode to query */ )
80116 +{
80117 + assert("nikita-497", node != NULL);
80118 + return jnode_is_loaded(ZJNODE(node));
80119 +}
80120 +
80121 +unsigned long znode_times_locked(const znode * z)
80122 +{
80123 + return z->times_locked;
80124 +}
80125 +
80126 +#endif /* REISER4_DEBUG */
80127 +
80128 +/* Make Linus happy.
80129 + Local variables:
80130 + c-indentation-style: "K&R"
80131 + mode-name: "LC"
80132 + c-basic-offset: 8
80133 + tab-width: 8
80134 + fill-column: 120
80135 + End:
80136 +*/
80137 Index: linux-2.6.16/fs/reiser4/znode.h
80138 ===================================================================
80139 --- /dev/null
80140 +++ linux-2.6.16/fs/reiser4/znode.h
80141 @@ -0,0 +1,434 @@
80142 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
80143 + * reiser4/README */
80144 +
80145 +/* Declaration of znode (Zam's node). See znode.c for more details. */
80146 +
80147 +#ifndef __ZNODE_H__
80148 +#define __ZNODE_H__
80149 +
80150 +#include "forward.h"
80151 +#include "debug.h"
80152 +#include "dformat.h"
80153 +#include "key.h"
80154 +#include "coord.h"
80155 +#include "plugin/node/node.h"
80156 +#include "jnode.h"
80157 +#include "lock.h"
80158 +#include "readahead.h"
80159 +
80160 +#include <linux/types.h>
80161 +#include <linux/spinlock.h>
80162 +#include <linux/pagemap.h> /* for PAGE_CACHE_SIZE */
80163 +#include <asm/atomic.h>
80164 +#include <asm/semaphore.h>
80165 +
80166 +/* znode tracks its position within parent (internal item in a parent node,
80167 + * that contains znode's block number). */
80168 +typedef struct parent_coord {
80169 + znode *node;
80170 + pos_in_node_t item_pos;
80171 +} parent_coord_t;
80172 +
80173 +/* &znode - node in a reiser4 tree.
80174 +
80175 + NOTE-NIKITA fields in this struct have to be rearranged (later) to reduce
80176 + cacheline pressure.
80177 +
80178 + Locking:
80179 +
80180 + Long term: data in a disk node attached to this znode are protected
80181 + by long term, deadlock aware lock ->lock;
80182 +
80183 + Spin lock: the following fields are protected by the spin lock:
80184 +
80185 + ->lock
80186 +
80187 + Following fields are protected by the global tree lock:
80188 +
80189 + ->left
80190 + ->right
80191 + ->in_parent
80192 + ->c_count
80193 +
80194 + Following fields are protected by the global delimiting key lock (dk_lock):
80195 +
80196 + ->ld_key (to update ->ld_key long-term lock on the node is also required)
80197 + ->rd_key
80198 +
80199 + Following fields are protected by the long term lock:
80200 +
80201 + ->nr_items
80202 +
80203 + ->node_plugin is never changed once set. This means that after code made
80204 + itself sure that field is valid it can be accessed without any additional
80205 + locking.
80206 +
80207 + ->level is immutable.
80208 +
80209 + Invariants involving this data-type:
80210 +
80211 + [znode-fake]
80212 + [znode-level]
80213 + [znode-connected]
80214 + [znode-c_count]
80215 + [znode-refs]
80216 + [jnode-refs]
80217 + [jnode-queued]
80218 + [znode-modify]
80219 +
80220 + For this to be made into a clustering or NUMA filesystem, we would want to eliminate all of the global locks.
80221 + Suggestions for how to do that are desired.*/
80222 +struct znode {
80223 + /* Embedded jnode. */
80224 + jnode zjnode;
80225 +
80226 + /* contains three subfields, node, pos_in_node, and pos_in_unit.
80227 +
80228 + pos_in_node and pos_in_unit are only hints that are cached to
80229 + speed up lookups during balancing. They are not required to be up to
80230 + date. Synched in find_child_ptr().
80231 +
80232 + This value allows us to avoid expensive binary searches.
80233 +
80234 + in_parent->node points to the parent of this node, and is NOT a
80235 + hint.
80236 + */
80237 + parent_coord_t in_parent;
80238 +
80239 + /*
80240 + * sibling list pointers
80241 + */
80242 +
80243 + /* left-neighbor */
80244 + znode *left;
80245 + /* right-neighbor */
80246 + znode *right;
80247 +
80248 + /* long term lock on node content. This lock supports deadlock
80249 + detection. See lock.c
80250 + */
80251 + zlock lock;
80252 +
80253 + /* You cannot remove from memory a node that has children in
80254 +	   memory. This is because we rely on the fact that the parent of a
80255 +	   given node can always be reached without blocking for io. When reading a
80256 + node into memory you must increase the c_count of its parent, when
80257 + removing it from memory you must decrease the c_count. This makes
80258 + the code simpler, and the cases where it is suboptimal are truly
80259 + obscure.
80260 + */
80261 + int c_count;
80262 +
80263 + /* plugin of node attached to this znode. NULL if znode is not
80264 + loaded. */
80265 + node_plugin *nplug;
80266 +
80267 + /* version of znode data. This is increased on each modification. This
80268 + * is necessary to implement seals (see seal.[ch]) efficiently. */
80269 + __u64 version;
80270 +
80271 + /* left delimiting key. Necessary to efficiently perform
80272 + balancing with node-level locking. Kept in memory only. */
80273 + reiser4_key ld_key;
80274 + /* right delimiting key. */
80275 + reiser4_key rd_key;
80276 +
80277 + /* znode's tree level */
80278 + __u16 level;
80279 + /* number of items in this node. This field is modified by node
80280 + * plugin. */
80281 + __u16 nr_items;
80282 +
80283 +#if REISER4_DEBUG
80284 + void *creator;
80285 + reiser4_key first_key;
80286 + unsigned long times_locked;
80287 + int left_version; /* when node->left was updated */
80288 + int right_version; /* when node->right was updated */
80289 + int ld_key_version; /* when node->ld_key was updated */
80290 + int rd_key_version; /* when node->rd_key was updated */
80291 +#endif
80292 +
80293 +} __attribute__ ((aligned(16)));
80294 +
80295 +ON_DEBUG(extern atomic_t delim_key_version;)
80297 +
80298 +/* In general I think these macros should not be exposed. */
80299 +#define znode_is_locked(node) (lock_is_locked(&node->lock))
80300 +#define znode_is_rlocked(node) (lock_is_rlocked(&node->lock))
80301 +#define znode_is_wlocked(node) (lock_is_wlocked(&node->lock))
80302 +#define znode_is_wlocked_once(node) (lock_is_wlocked_once(&node->lock))
80303 +#define znode_can_be_rlocked(node) (lock_can_be_rlocked(&node->lock))
80304 +#define is_lock_compatible(node, mode) (lock_mode_compatible(&node->lock, mode))
80305 +/* Macros for accessing the znode state. */
80306 +#define ZF_CLR(p,f) JF_CLR (ZJNODE(p), (f))
80307 +#define ZF_ISSET(p,f) JF_ISSET(ZJNODE(p), (f))
80308 +#define ZF_SET(p,f) JF_SET (ZJNODE(p), (f))
80309 +extern znode *zget(reiser4_tree * tree, const reiser4_block_nr * const block,
80310 + znode * parent, tree_level level, gfp_t gfp_flag);
80311 +extern znode *zlook(reiser4_tree * tree, const reiser4_block_nr * const block);
80312 +extern int zload(znode * node);
80313 +extern int zload_ra(znode * node, ra_info_t * info);
80314 +extern int zinit_new(znode * node, gfp_t gfp_flags);
80315 +extern void zrelse(znode * node);
80316 +extern void znode_change_parent(znode * new_parent, reiser4_block_nr * block);
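
The declarations above make up the core znode life cycle: zget() returns a
referenced znode for a block, zload()/zrelse() pin and unpin its data, and
zput() (defined further down in this header) drops the reference. A minimal
sketch of a caller follows; it is not part of the patch, and the assumption
that zget() reports failure through ERR_PTR() is the editor's.

	static int example_use_znode(reiser4_tree *tree,
				     const reiser4_block_nr *block,
				     znode *parent, tree_level level)
	{
		znode *node;
		int ret;

		node = zget(tree, block, parent, level, GFP_KERNEL);
		if (IS_ERR(node))
			return PTR_ERR(node);

		ret = zload(node);	/* pin node data in memory */
		if (ret == 0) {
			/* ... inspect the node via zdata(node) ... */
			zrelse(node);	/* every successful zload() needs a zrelse() */
		}
		zput(node);		/* drop the reference taken by zget() */
		return ret;
	}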
80317 +
80318 +/* size of data in znode */
80319 +static inline unsigned
80320 +znode_size(const znode * node UNUSED_ARG /* znode to query */ )
80321 +{
80322 + assert("nikita-1416", node != NULL);
80323 + return PAGE_CACHE_SIZE;
80324 +}
80325 +
80326 +extern void parent_coord_to_coord(const parent_coord_t * pcoord,
80327 + coord_t * coord);
80328 +extern void coord_to_parent_coord(const coord_t * coord,
80329 + parent_coord_t * pcoord);
80330 +extern void init_parent_coord(parent_coord_t * pcoord, const znode * node);
80331 +
80332 +extern unsigned znode_free_space(znode * node);
80333 +
80334 +extern reiser4_key *znode_get_rd_key(znode * node);
80335 +extern reiser4_key *znode_get_ld_key(znode * node);
80336 +
80337 +extern reiser4_key *znode_set_rd_key(znode * node, const reiser4_key * key);
80338 +extern reiser4_key *znode_set_ld_key(znode * node, const reiser4_key * key);
80339 +
80340 +/* `connected' state checks */
80341 +static inline int znode_is_right_connected(const znode * node)
80342 +{
80343 + return ZF_ISSET(node, JNODE_RIGHT_CONNECTED);
80344 +}
80345 +
80346 +static inline int znode_is_left_connected(const znode * node)
80347 +{
80348 + return ZF_ISSET(node, JNODE_LEFT_CONNECTED);
80349 +}
80350 +
80351 +static inline int znode_is_connected(const znode * node)
80352 +{
80353 + return znode_is_right_connected(node) && znode_is_left_connected(node);
80354 +}
80355 +
80356 +extern int znode_shift_order;
80357 +extern int znode_rehash(znode * node, const reiser4_block_nr * new_block_nr);
80358 +extern void znode_remove(znode *, reiser4_tree *);
80359 +extern znode *znode_parent(const znode * node);
80360 +extern znode *znode_parent_nolock(const znode * node);
80361 +extern int znode_above_root(const znode * node);
80362 +extern int init_znodes(void);
80363 +extern void done_znodes(void);
80364 +extern int znodes_tree_init(reiser4_tree * ztree);
80365 +extern void znodes_tree_done(reiser4_tree * ztree);
80366 +extern int znode_contains_key(znode * node, const reiser4_key * key);
80367 +extern int znode_contains_key_lock(znode * node, const reiser4_key * key);
80368 +extern unsigned znode_save_free_space(znode * node);
80369 +extern unsigned znode_recover_free_space(znode * node);
80370 +extern znode *zalloc(gfp_t gfp_flag);
80371 +extern void zinit(znode *, const znode * parent, reiser4_tree *);
80372 +extern int zparse(znode * node);
80373 +
80374 +
80375 +extern int znode_just_created(const znode * node);
80376 +
80377 +extern void zfree(znode * node);
80378 +
80379 +#if REISER4_DEBUG
80380 +extern void print_znode(const char *prefix, const znode * node);
80381 +#else
80382 +#define print_znode( p, n ) noop
80383 +#endif
80384 +
80385 +/* Make it look like various znode functions exist instead of treating znodes as
80386 + jnodes in znode-specific code. */
80387 +#define znode_page(x) jnode_page ( ZJNODE(x) )
80388 +#define zdata(x) jdata ( ZJNODE(x) )
80389 +#define znode_get_block(x) jnode_get_block ( ZJNODE(x) )
80390 +#define znode_created(x) jnode_created ( ZJNODE(x) )
80391 +#define znode_set_created(x) jnode_set_created ( ZJNODE(x) )
80392 +#define znode_convertible(x) jnode_convertible (ZJNODE(x))
80393 +#define znode_set_convertible(x) jnode_set_convertible (ZJNODE(x))
80394 +
80395 +#define znode_is_dirty(x) jnode_is_dirty ( ZJNODE(x) )
80396 +#define znode_check_dirty(x) jnode_check_dirty ( ZJNODE(x) )
80397 +#define znode_make_clean(x) jnode_make_clean ( ZJNODE(x) )
80398 +#define znode_set_block(x, b) jnode_set_block ( ZJNODE(x), (b) )
80399 +
80400 +#define spin_lock_znode(x) spin_lock_jnode ( ZJNODE(x) )
80401 +#define spin_unlock_znode(x) spin_unlock_jnode ( ZJNODE(x) )
80402 +#define spin_trylock_znode(x) spin_trylock_jnode ( ZJNODE(x) )
80403 +#define spin_znode_is_locked(x) spin_jnode_is_locked ( ZJNODE(x) )
80404 +#define spin_znode_is_not_locked(x) spin_jnode_is_not_locked ( ZJNODE(x) )
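
As a usage note (editor's sketch, not from the patch): state documented above
as protected by the per-node spin lock is meant to be accessed inside a
spin_lock_znode()/spin_unlock_znode() pair:

	spin_lock_znode(node);
	/* ... examine or update spin-lock-protected znode state ... */
	spin_unlock_znode(node);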
80405 +
80406 +#if REISER4_DEBUG
80407 +extern int znode_x_count_is_protected(const znode * node);
80408 +extern int znode_invariant(znode * node);
80409 +#endif
80410 +
80411 +/* acquire reference to @node */
80412 +static inline znode *zref(znode * node)
80413 +{
80414 + /* change of x_count from 0 to 1 is protected by tree spin-lock */
80415 + return JZNODE(jref(ZJNODE(node)));
80416 +}
80417 +
80418 +/* release reference to @node */
80419 +static inline void zput(znode * node)
80420 +{
80421 + assert("nikita-3564", znode_invariant(node));
80422 + jput(ZJNODE(node));
80423 +}
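
A brief sketch of the intended pairing (editor's illustration; the caching
helpers are hypothetical): code that stores a znode pointer beyond the
lifetime of its current reference takes its own reference with zref() and
balances it with zput().

	static void example_cache_node(znode **slot, znode *node)
	{
		*slot = zref(node);	/* extra reference for the stored pointer */
	}

	static void example_uncache_node(znode **slot)
	{
		zput(*slot);		/* balance the zref() above */
		*slot = NULL;
	}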
80424 +
80425 +/* get the level field for a znode */
80426 +static inline tree_level znode_get_level(const znode * node)
80427 +{
80428 + return node->level;
80429 +}
80430 +
80431 +/* get the level field for a jnode */
80432 +static inline tree_level jnode_get_level(const jnode * node)
80433 +{
80434 + if (jnode_is_znode(node))
80435 + return znode_get_level(JZNODE(node));
80436 + else
80437 + /* unformatted nodes are all at the LEAF_LEVEL and for
80438 + "semi-formatted" nodes like bitmaps, level doesn't matter. */
80439 + return LEAF_LEVEL;
80440 +}
80441 +
80442 +/* true if jnode is on leaf level */
80443 +static inline int jnode_is_leaf(const jnode * node)
80444 +{
80445 + if (jnode_is_znode(node))
80446 + return (znode_get_level(JZNODE(node)) == LEAF_LEVEL);
80447 + if (jnode_get_type(node) == JNODE_UNFORMATTED_BLOCK)
80448 + return 1;
80449 + return 0;
80450 +}
80451 +
80452 +/* return znode's tree */
80453 +static inline reiser4_tree *znode_get_tree(const znode * node)
80454 +{
80455 + assert("nikita-2692", node != NULL);
80456 + return jnode_get_tree(ZJNODE(node));
80457 +}
80458 +
80459 +/* resolve race with zput */
80460 +static inline znode *znode_rip_check(reiser4_tree * tree, znode * node)
80461 +{
80462 + jnode *j;
80463 +
80464 + j = jnode_rip_sync(tree, ZJNODE(node));
80465 + if (likely(j != NULL))
80466 + node = JZNODE(j);
80467 + else
80468 + node = NULL;
80469 + return node;
80470 +}
80471 +
80472 +#if defined(REISER4_DEBUG)
80473 +int znode_is_loaded(const znode * node /* znode to query */ );
80474 +#endif
80475 +
80476 +extern __u64 znode_build_version(reiser4_tree * tree);
80477 +
80478 +/* Data-handles. A data handle object manages pairing calls to zload() and zrelse(). We
80479 + must load the data for a node in many places. We could do this by simply calling
80480 +   zload() everywhere, but the difficulty arises when we must release the loaded data
80481 +   by calling zrelse. In a function with many possible error/return paths, it requires
80482 +   extra work to figure out which exit paths must call zrelse and which do not. The data
80483 + handle automatically calls zrelse for every zload that it is responsible for. In that
80484 + sense, it acts much like a lock_handle.
80485 +*/
80486 +typedef struct load_count {
80487 + znode *node;
80488 + int d_ref;
80489 +} load_count;
80490 +
80491 +extern void init_load_count(load_count * lc);	/* Initialize a load_count: set the current node to NULL. */
80492 +extern void done_load_count(load_count * dh); /* Finalize a load_count: call zrelse() if necessary */
80493 +extern int incr_load_count_znode(load_count * dh, znode * node); /* Set the argument znode to the current node, call zload(). */
80494 +extern int incr_load_count_jnode(load_count * dh, jnode * node); /* If the argument jnode is formatted, do the same as
80495 + * incr_load_count_znode, otherwise do nothing (unformatted nodes
80496 + * don't require zload/zrelse treatment). */
80497 +extern void move_load_count(load_count * new, load_count * old); /* Move the contents of a load_count. Old handle is released. */
80498 +extern void copy_load_count(load_count * new, load_count * old); /* Copy the contents of a load_count. Old handle remains held. */
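
To make the pairing concrete, here is a minimal editor's sketch (not part of
the patch; the surrounding function is hypothetical) of the pattern described
above:

	static int example_with_load_count(znode *node)
	{
		load_count lc;
		int ret;

		init_load_count(&lc);
		ret = incr_load_count_znode(&lc, node);	/* calls zload(node) */
		if (ret == 0) {
			/* node data is loaded; any number of exit paths is
			 * now safe as long as done_load_count() runs */
		}
		done_load_count(&lc);	/* calls zrelse() iff zload() succeeded */
		return ret;
	}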
80499 +
80500 +/* Variable initializers for load_count. */
80501 +#define INIT_LOAD_COUNT ( load_count ){ .node = NULL, .d_ref = 0 }
80502 +#define INIT_LOAD_COUNT_NODE( n ) ( load_count ){ .node = ( n ), .d_ref = 0 }
80503 +/* A convenience macro for use in assertions or debug-only code, where loaded
80504 + data is only required to perform the debugging check. This macro
80505 + encapsulates an expression inside a pair of calls to zload()/zrelse(). */
80506 +#define WITH_DATA( node, exp ) \
80507 +({ \
80508 + long __with_dh_result; \
80509 + znode *__with_dh_node; \
80510 + \
80511 + __with_dh_node = ( node ); \
80512 + __with_dh_result = zload( __with_dh_node ); \
80513 + if( __with_dh_result == 0 ) { \
80514 + __with_dh_result = ( long )( exp ); \
80515 + zrelse( __with_dh_node ); \
80516 + } \
80517 + __with_dh_result; \
80518 +})
80519 +
80520 +/* Same as above, but accepts a return value in case zload fails. */
80521 +#define WITH_DATA_RET( node, ret, exp ) \
80522 +({ \
80523 + int __with_dh_result; \
80524 + znode *__with_dh_node; \
80525 + \
80526 + __with_dh_node = ( node ); \
80527 + __with_dh_result = zload( __with_dh_node ); \
80528 + if( __with_dh_result == 0 ) { \
80529 + __with_dh_result = ( int )( exp ); \
80530 + zrelse( __with_dh_node ); \
80531 + } else \
80532 + __with_dh_result = ( ret ); \
80533 + __with_dh_result; \
80534 +})
80535 +
80536 +#define WITH_COORD(coord, exp) \
80537 +({ \
80538 + coord_t *__coord; \
80539 + \
80540 + __coord = (coord); \
80541 + coord_clear_iplug(__coord); \
80542 + WITH_DATA(__coord->node, exp); \
80543 +})
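
For illustration (editor's sketch; node_is_consistent() is a hypothetical
debug-only predicate, not a function from this patch), WITH_DATA is intended
for assertions that need the node's data loaded only for the duration of the
check:

	assert("example-1", WITH_DATA(node, node_is_consistent(node)));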
80544 +
80545 +#if REISER4_DEBUG
80546 +#define STORE_COUNTERS \
80547 + lock_counters_info __entry_counters = *lock_counters()
80548 +#define CHECK_COUNTERS \
80549 +ON_DEBUG_CONTEXT( \
80550 +({ \
80551 + __entry_counters.x_refs = lock_counters() -> x_refs; \
80552 + __entry_counters.t_refs = lock_counters() -> t_refs; \
80553 + __entry_counters.d_refs = lock_counters() -> d_refs; \
80554 + assert("nikita-2159", \
80555 + !memcmp(&__entry_counters, lock_counters(), \
80556 + sizeof __entry_counters)); \
80557 +}) )
80558 +
80559 +#else
80560 +#define STORE_COUNTERS
80561 +#define CHECK_COUNTERS noop
80562 +#endif
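
An editor's sketch of the intended bracketing (the function body is
hypothetical): STORE_COUNTERS snapshots the per-context lock counters on
entry, and CHECK_COUNTERS asserts on exit that every counter except x_refs,
t_refs and d_refs (which are copied over before the comparison) is back to
its entry value:

	static void example_entry_point(znode *node)
	{
		STORE_COUNTERS;

		/* ... take and release locks on @node ... */

		CHECK_COUNTERS;
	}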
80563 +
80564 +/* __ZNODE_H__ */
80565 +#endif
80566 +
80567 +/* Make Linus happy.
80568 + Local variables:
80569 + c-indentation-style: "K&R"
80570 + mode-name: "LC"
80571 + c-basic-offset: 8
80572 + tab-width: 8
80573 + fill-column: 120
80574 + End:
80575 +*/
80576 Index: linux-2.6.16/include/linux/fs.h
80577 ===================================================================
80578 --- linux-2.6.16.orig/include/linux/fs.h
80579 +++ linux-2.6.16/include/linux/fs.h
80580 @@ -1085,6 +1085,8 @@ struct super_operations {
80581 void (*clear_inode) (struct inode *);
80582 void (*umount_begin) (struct super_block *);
80583
80584 + void (*sync_inodes) (struct super_block *sb,
80585 + struct writeback_control *wbc);
80586 int (*show_options)(struct seq_file *, struct vfsmount *);
80587
80588 ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t);
80589 @@ -1449,6 +1451,7 @@ extern int invalidate_inode_pages2(struc
80590 extern int invalidate_inode_pages2_range(struct address_space *mapping,
80591 pgoff_t start, pgoff_t end);
80592 extern int write_inode_now(struct inode *, int);
80593 +extern void generic_sync_sb_inodes(struct super_block *, struct writeback_control *);
80594 extern int filemap_fdatawrite(struct address_space *);
80595 extern int filemap_flush(struct address_space *);
80596 extern int filemap_fdatawait(struct address_space *);
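
The two additions above introduce a per-superblock inode writeback hook and
export the generic helper that implements the default behaviour. As an
editor's sketch (not taken from this patch), a filesystem could wire the hook
up as follows:

	static void example_sync_inodes(struct super_block *sb,
					struct writeback_control *wbc)
	{
		/* filesystem-specific flushing could happen here before
		 * delegating to the now-exported generic implementation */
		generic_sync_sb_inodes(sb, wbc);
	}

	static struct super_operations example_sops = {
		/* ... other operations ... */
		.sync_inodes = example_sync_inodes,
	};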
80597 Index: linux-2.6.16/lib/radix-tree.c
80598 ===================================================================
80599 --- linux-2.6.16.orig/lib/radix-tree.c
80600 +++ linux-2.6.16/lib/radix-tree.c
80601 @@ -139,6 +139,7 @@ static inline void tag_set(struct radix_
80602 {
80603 __set_bit(offset, node->tags[tag]);
80604 }
80605 +EXPORT_SYMBOL(radix_tree_preload);
80606
80607 static inline void tag_clear(struct radix_tree_node *node, int tag, int offset)
80608 {
80609 Index: linux-2.6.16/mm/filemap.c
80610 ===================================================================
80611 --- linux-2.6.16.orig/mm/filemap.c
80612 +++ linux-2.6.16/mm/filemap.c
80613 @@ -119,6 +119,7 @@ void __remove_from_page_cache(struct pag
80614 mapping->nrpages--;
80615 pagecache_acct(-1);
80616 }
80617 +EXPORT_SYMBOL(__remove_from_page_cache);
80618
80619 void remove_from_page_cache(struct page *page)
80620 {
80621 @@ -130,6 +131,7 @@ void remove_from_page_cache(struct page
80622 __remove_from_page_cache(page);
80623 write_unlock_irq(&mapping->tree_lock);
80624 }
80625 +EXPORT_SYMBOL(remove_from_page_cache);
80626
80627 static int sync_page(void *word)
80628 {
80629 @@ -272,6 +274,7 @@ static int wait_on_page_writeback_range(
80630
80631 return ret;
80632 }
80633 +EXPORT_SYMBOL(add_to_page_cache_lru);
80634
80635 /*
80636 * Write and wait upon all the pages in the passed range. This is a "data
80637 @@ -632,7 +635,6 @@ repeat:
80638 page_cache_release(cached_page);
80639 return page;
80640 }
80641 -
80642 EXPORT_SYMBOL(find_or_create_page);
80643
80644 /**
80645 @@ -665,6 +667,7 @@ unsigned find_get_pages(struct address_s
80646 read_unlock_irq(&mapping->tree_lock);
80647 return ret;
80648 }
80649 +EXPORT_SYMBOL(find_get_pages);
80650
80651 /*
80652 * Like find_get_pages, except we only return pages which are tagged with
80653 @@ -686,6 +689,7 @@ unsigned find_get_pages_tag(struct addre
80654 read_unlock_irq(&mapping->tree_lock);
80655 return ret;
80656 }
80657 +EXPORT_SYMBOL(find_get_pages_tag);
80658
80659 /*
80660 * Same as grab_cache_page, but do not wait if the page is unavailable.
80661 Index: linux-2.6.16/mm/page-writeback.c
80662 ===================================================================
80663 --- linux-2.6.16.orig/mm/page-writeback.c
80664 +++ linux-2.6.16/mm/page-writeback.c
80665 @@ -187,7 +187,7 @@ get_dirty_limits(struct writeback_state
80666 * If we're over `background_thresh' then pdflush is woken to perform some
80667 * writeout.
80668 */
80669 -static void balance_dirty_pages(struct address_space *mapping)
80670 +void balance_dirty_pages(struct address_space *mapping)
80671 {
80672 struct writeback_state wbs;
80673 long nr_reclaimable;
80674 @@ -253,6 +253,7 @@ static void balance_dirty_pages(struct a
80675 (!laptop_mode && (nr_reclaimable > background_thresh)))
80676 pdflush_operation(background_writeout, 0);
80677 }
80678 +EXPORT_SYMBOL(balance_dirty_pages);
80679
80680 /**
80681 * balance_dirty_pages_ratelimited - balance dirty memory state
80682 Index: linux-2.6.16/mm/readahead.c
80683 ===================================================================
80684 --- linux-2.6.16.orig/mm/readahead.c
80685 +++ linux-2.6.16/mm/readahead.c
80686 @@ -541,6 +541,7 @@ page_cache_readahead(struct address_spac
80687 out:
80688 return ra->prev_page + 1;
80689 }
80690 +EXPORT_SYMBOL_GPL(page_cache_readahead);
80691
80692 /*
80693 * handle_ra_miss() is called when it is known that a page which should have
80694 @@ -558,6 +559,7 @@ void handle_ra_miss(struct address_space
80695 ra->flags &= ~RA_FLAG_INCACHE;
80696 ra->cache_hit = 0;
80697 }
80698 +EXPORT_SYMBOL_GPL(handle_ra_miss);
80699
80700 /*
80701 * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a