From: ms
Date: Sun, 25 Feb 2007 19:46:08 +0000 (+0000)
Subject: Samba update from Maniac...
X-Git-Tag: v2.3-beta1~822
X-Git-Url: http://git.ipfire.org/?p=ipfire-2.x.git;a=commitdiff_plain;h=44254afd445354c03bf345c8d56d84dd0e4d8c9f

Samba update from Maniac... Reiser4! Yay!

git-svn-id: http://svn.ipfire.org/svn/ipfire/trunk@433 ea5c0bd1-69bd-2848-81d8-4f18e57aeed8
---
diff --git a/config/etc/fstab b/config/etc/fstab
index 5ec3489feb..42ee0c9a81 100644
--- a/config/etc/fstab
+++ b/config/etc/fstab
@@ -1,14 +1,12 @@ -# Begin /etc/fstab - + +# # file system mount-point type options dump fsck # order DEVICE1 /boot ext2 defaults 1 2 DEVICE2 swap swap pri=1 0 0 -DEVICE3 / reiserfs defaults 1 1 -DEVICE4 /var reiserfs defaults 1 1 +DEVICE3 / reiser4 defaults 1 1 +DEVICE4 /var reiser4 defaults 1 1 proc /proc proc defaults 0 0 sysfs /sys sysfs defaults 0 0 devpts /dev/pts devpts gid=4,mode=620 0 0 shm /dev/shm tmpfs defaults 0 0 -# End /etc/fstab
diff --git a/config/kernel/kernel.config.i586 b/config/kernel/kernel.config.i586
index c91e82d224..e4b042c9cb 100644
--- a/config/kernel/kernel.config.i586
+++ b/config/kernel/kernel.config.i586
@@ -1,7 +1,7 @@ # # Automatically generated make config: don't edit # Linux kernel version: 2.6.16 -# Thu Feb 15 06:11:38 2007 +# Sat Feb 24 18:21:11 2007 # CONFIG_X86_32=y CONFIG_SEMAPHORE_SLEEPERS=y
@@ -1367,7 +1367,6 @@ CONFIG_MISDN_AVM_FRITZ=y CONFIG_MISDN_NETJET=y CONFIG_MISDN_HFCPCI=y # CONFIG_MISDN_HFCMULTI is not set -# CONFIG_HFCMULTI_PCIMEM is not set CONFIG_MISDN_HFCUSB=y CONFIG_MISDN_HFCMINI=y CONFIG_MISDN_XHFC=y
@@ -1846,6 +1845,8 @@ CONFIG_EXT3_FS_SECURITY=y CONFIG_JBD=y # CONFIG_JBD_DEBUG is not set CONFIG_FS_MBCACHE=y +CONFIG_REISER4_FS=y +# CONFIG_REISER4_DEBUG is not set CONFIG_REISERFS_FS=y # CONFIG_REISERFS_CHECK is not set CONFIG_REISERFS_PROC_INFO=y
diff --git a/config/kernel/kernel.config.i586.smp b/config/kernel/kernel.config.i586.smp
index 34109f1e57..b63064a3d4 100644
--- a/config/kernel/kernel.config.i586.smp
+++ b/config/kernel/kernel.config.i586.smp
@@ -1,7 +1,7 @@ # # Automatically generated make config: don't edit # Linux kernel version: 2.6.16 -# Thu Feb 15 06:11:51 2007 +# Sat Feb 24 18:21:44 2007 # CONFIG_X86_32=y CONFIG_SEMAPHORE_SLEEPERS=y
@@ -1367,7 +1367,6 @@ CONFIG_MISDN_AVM_FRITZ=y CONFIG_MISDN_NETJET=y CONFIG_MISDN_HFCPCI=y # CONFIG_MISDN_HFCMULTI is not set -# CONFIG_HFCMULTI_PCIMEM is not set CONFIG_MISDN_HFCUSB=y CONFIG_MISDN_HFCMINI=y CONFIG_MISDN_XHFC=y
@@ -1841,6 +1840,8 @@ CONFIG_EXT3_FS_SECURITY=y CONFIG_JBD=y # CONFIG_JBD_DEBUG is not set CONFIG_FS_MBCACHE=y +CONFIG_REISER4_FS=y +# CONFIG_REISER4_DEBUG is not set CONFIG_REISERFS_FS=y # CONFIG_REISERFS_CHECK is not set CONFIG_REISERFS_PROC_INFO=y
diff --git a/config/rootfiles/common/coreutils b/config/rootfiles/common/coreutils
index 268bd053b0..f59589c263 100644
--- a/config/rootfiles/common/coreutils
+++ b/config/rootfiles/common/coreutils
@@ -56,7 +56,7 @@ usr/bin/paste #usr/bin/pinky #usr/bin/pr #usr/bin/printenv -#usr/bin/printf +usr/bin/printf #usr/bin/ptx #usr/bin/readlink usr/bin/seq
diff --git a/config/rootfiles/common/grub b/config/rootfiles/common/grub
index 1be85a65c3..0a3ec5c2fe 100644
--- a/config/rootfiles/common/grub
+++ b/config/rootfiles/common/grub
@@ -19,6 +19,7 @@ usr/lib/grub #usr/lib/grub/i386-pc/jfs_stage1_5 #usr/lib/grub/i386-pc/minix_stage1_5 #usr/lib/grub/i386-pc/reiserfs_stage1_5 +#usr/lib/grub/i386-pc/reiser4_stage1_5 #usr/lib/grub/i386-pc/stage1 #usr/lib/grub/i386-pc/stage2 #usr/lib/grub/i386-pc/stage2_eltorito
diff --git
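The fstab hunk above switches / and /var from reiserfs to reiser4; the DEVICEn placeholders are rewritten with real device nodes by the installer. For reference, the six fstab columns map one-to-one onto struct mntent from <mntent.h>. A minimal, self-contained sketch (illustrative only, not part of this commit) that walks the table and prints each entry:

#include <stdio.h>
#include <mntent.h>

int main(void)
{
        struct mntent *m;
        FILE *f = setmntent("/etc/fstab", "r");

        if (f == NULL)
                return 1;
        while ((m = getmntent(f)) != NULL) {
                /* mnt_freq is the "dump" column, mnt_passno the fsck
                 * order; after this commit mnt_type is "reiser4" for
                 * the / and /var entries. */
                printf("%-10s %-10s %-8s %-14s %d %d\n",
                       m->mnt_fsname, m->mnt_dir, m->mnt_type,
                       m->mnt_opts, m->mnt_freq, m->mnt_passno);
        }
        endmntent(f);
        return 0;
}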
a/doc/packages-list.txt b/doc/packages-list.txt index c6056a6c25..a0a2a9a460 100644 --- a/doc/packages-list.txt +++ b/doc/packages-list.txt @@ -123,6 +123,8 @@ * lame-3.96.1 * lcms-1.15 * less-394 +* libaal-1.0.5 +* libaal-1.0.5-minimal * libart_lgpl-2.3.17 * libcap-1.10 * libmad-0.15.1b @@ -200,6 +202,7 @@ * psmisc-22.2 * razor-agents-2.81 * readline-5.1 +* reiser4progs-1.0.5 * reiserfsprogs-3.6.19 * rp-pppoe-3.8 * rrdtool-1.2.15 diff --git a/html/cgi-bin/samba.cgi b/html/cgi-bin/samba.cgi index e71466a914..ab2d0ad393 100644 --- a/html/cgi-bin/samba.cgi +++ b/html/cgi-bin/samba.cgi @@ -5,7 +5,6 @@ # This code is distributed under the terms of the GPL # # (c) The IPFire Team -# use strict; # enable only the following on debugging purpose @@ -17,15 +16,12 @@ require "${General::swroot}/lang.pl"; require "${General::swroot}/header.pl"; my %sambasettings = (); +my %cgisettings = (); my %checked = (); my %netsettings = (); my %ovpnsettings = (); my $message = ""; my $errormessage = ""; -my $shareconfigentry = ""; -my @sharesconfig = (); -my @shareconfigline = (); -my $shareoption = ''; my $defaultoption= "[Share]\npath = /shares/share1\ncomment = Share - Public Access\nbrowseable = yes\nwriteable = yes\ncreate mask = 0777\ndirectory mask = 0777\nguest ok = yes\npublic = yes\nforce user = samba"; my $userentry = ""; my @user = (); @@ -54,6 +50,9 @@ my %servicenames = my @Zeilen= (); my @Shares= (); my $shareentry = ""; +my $shareconfigentry = ""; +my @shareconfigline = (); +my $shareoption = ''; my @shares = (); my @shareline = (); my $sharefile = "/var/ipfire/samba/shares"; @@ -86,7 +85,7 @@ $sambasettings{'GUESTACCOUNT'} = 'samba'; $sambasettings{'MAPTOGUEST'} = 'Never'; $sambasettings{'BINDINTERFACESONLY'} = 'True'; ### Values that have to be initialized -$sambasettings{'ACTION'} = ''; +$cgisettings{'ACTION'} = ''; &General::readhash("${General::swroot}/samba/settings", \%sambasettings); &Header::getcgihash(\%sambasettings); @@ -97,15 +96,36 @@ $sambasettings{'ACTION'} = ''; ############################################################################################################################ ############################################# Samba Rootskript aufrufe für SU-Actions ###################################### -if ($sambasettings{'ACTION'} eq 'smbuserdisable'){system('/usr/local/bin/sambactrl smbuserdisable $sambasettings{"NAME"}');} -if ($sambasettings{'ACTION'} eq 'smbuserenable'){system('/usr/local/bin/sambactrl smbuserenable $sambasettings{"NAME"}');} -if ($sambasettings{'ACTION'} eq 'smbuserdelete'){system('/usr/local/bin/sambactrl smbuserdelete $sambasettings{"NAME"}');} -if ($sambasettings{'ACTION'} eq 'smbuseradd'){system('/usr/local/bin/sambactrl smbuseradd $username $password');} -if ($sambasettings{'ACTION'} eq 'smbchangepw'){system('/usr/local/bin/sambactrl smbchangepw $username $password');} -if ($sambasettings{'ACTION'} eq 'smbrestart'){system('/usr/local/bin/sambactrl smbrestart');} -if ($sambasettings{'ACTION'} eq 'smbstart'){system('/usr/local/bin/sambactrl smbstart');} -if ($sambasettings{'ACTION'} eq 'smbstop'){system('/usr/local/bin/sambactrl smbstop');} -# smbsharechange is directly called by the if clause +if ($sambasettings{'ACTION'} eq 'smbuserdisable'){system("/usr/local/bin/sambactrl smbuserdisable $sambasettings{'NAME'}");} +if ($sambasettings{'ACTION'} eq 'smbuserenable'){system("/usr/local/bin/sambactrl smbuserenable $sambasettings{'NAME'}");} +if ($sambasettings{'ACTION'} eq 'smbuserdelete'){system("/usr/local/bin/sambactrl smbuserdelete 
$sambasettings{'NAME'}");} +if ($sambasettings{'ACTION'} eq 'smbuseradd'){system("/usr/local/bin/sambactrl smbuseradd $sambasettings{'USERNAME'} $sambasettings{'PASSWORD'}");} +if ($sambasettings{'ACTION'} eq 'smbchangepw'){system("/usr/local/bin/sambactrl smbchangepw $sambasettings{'USERNAME'} $sambasettings{'PASSWORD'}");} +if ($sambasettings{'ACTION'} eq 'smbrestart'){system("/usr/local/bin/sambactrl smbrestart");} +if ($sambasettings{'ACTION'} eq 'smbstart'){system("/usr/local/bin/sambactrl smbstart");} +if ($sambasettings{'ACTION'} eq 'smbstop'){system("/usr/local/bin/sambactrl smbstop");} +if ($sambasettings{'ACTION'} eq 'smbstop'){system("/usr/local/bin/sambactrl smbstop");} +if ($sambasettings{'ACTION'} eq 'globalreset'){system("/usr/local/bin/sambactrl smbglobalreset");} + +# smbsafeconf is directly called by the if clause + +if ($sambasettings{'ACTION'} eq 'sharesreset') +{ +system('/usr/local/bin/sambactrl smbsharesreset'); + @Zeilen = (); + @Shares = (); + $shareentry = ""; + @shares = (); + @shareline = (); + $EOF = qx(cat $sharefile | wc -l); + + @shares = `grep -n '^\\[' $sharefile`; + foreach $shareentry (@shares) + { + @shareline = split( /\:/, $shareentry ); + push(@Zeilen,$shareline[0]);push(@Shares,$shareline[1]); + } +} ############################################################################################################################ ############################################## Samba Share neu anlegen ##################################################### @@ -122,10 +142,15 @@ $emptyline END ; close FILE; -system('/usr/local/bin/sambactrl smbsharechange'); +system("/usr/local/bin/sambactrl smbsafeconf"); @Zeilen = (); @Shares = (); + $shareentry = ""; + @shares = (); + @shareline = (); + $EOF = qx(cat $sharefile | wc -l); + @shares = `grep -n '^\\[' $sharefile`; foreach $shareentry (@shares) { @@ -181,12 +206,17 @@ $sharetext END ; close FILE; -system('/usr/local/bin/sambactrl smbsharechange'); +system("/usr/local/bin/sambactrl smbsafeconf"); -@Zeilen = (); -@Shares = (); -@shares = `grep -n '^\\[' $sharefile`; -foreach $shareentry (@shares) + @Zeilen = (); + @Shares = (); + $shareentry = ""; + @shares = (); + @shareline = (); + $EOF = qx(cat $sharefile | wc -l); + + @shares = `grep -n '^\\[' $sharefile`; + foreach $shareentry (@shares) { @shareline = split( /\:/, $shareentry ); push(@Zeilen,$shareline[0]);push(@Shares,$shareline[1]); @@ -201,7 +231,6 @@ my $sharebody = ''; my $sharehead = ''; my $sharename = "$sambasettings{'NAME'}"; my $sharetext = ''; -chomp $sharename; $sharename=~s/\s//g; for(my $i = 0; $i <= $#Shares; $i++) @@ -240,7 +269,7 @@ $sambasettings{'SHAREOPTION'} END ; close FILE; -system('/usr/local/bin/sambactrl smbsharechange'); +system("/usr/local/bin/sambactrl smbsafeconf"); @Zeilen = (); @Shares = (); @@ -258,11 +287,11 @@ system('/usr/local/bin/sambactrl smbsharechange'); if ($sambasettings{'ACTION'} eq $Lang::tr{'save'}) { $sambasettings{'INTERFACES'} = ''; -if ($checked{'GREEN'}){ $sambasettings{'INTERFACES'} = "$sambasettings{'INTERFACES'} $netsettings{'GREEN_DEV'}";} -if ($checked{'BLUE'}){ $sambasettings{'INTERFACES'} = "$sambasettings{'INTERFACES'} $netsettings{'BLUE_DEV'}";} -if ($checked{'ORANGE'}){ $sambasettings{'INTERFACES'} = "$sambasettings{'INTERFACES'} $netsettings{'ORANGE_DEV'}";} -if ($checked{'VPN'}){ $sambasettings{'INTERFACES'} = "$sambasettings{'INTERFACES'} $ovpnsettings{'DDEVICE'}";} -if ($sambasettings{'OTHERINTERFACES'} ne ''){ $sambasettings{'INTERFACES'} = "$sambasettings{'INTERFACES'} 
$sambasettings{'OTHERINTERFACES'}";} +if ($sambasettings{'GREEN'} eq 'on'){ $sambasettings{'INTERFACES'} .= " $netsettings{'GREEN_DEV'}";} +if ($sambasettings{'BLUE'} eq 'on'){ $sambasettings{'INTERFACES'} .= " $netsettings{'BLUE_DEV'}";} +if ($sambasettings{'ORANGE'} eq 'on'){ $sambasettings{'INTERFACES'} .= " $netsettings{'ORANGE_DEV'}";} +if ($sambasettings{'VPN'} eq 'on'){ $sambasettings{'INTERFACES'} .= " $ovpnsettings{'DDEVICE'}";} +if ($sambasettings{'OTHERINTERFACES'} ne ''){ $sambasettings{'INTERFACES'} .= " $sambasettings{'OTHERINTERFACES'}";} ############################################################################################################################ ############################################# Schreiben der Samba globals ################################################## @@ -314,6 +343,7 @@ print FILE <Workgroup: NetBIOS-Name: Server-String: - Interfaces: OpenVpn - $ovpnsettings{'DDEVICE'} - $Lang::tr{'green'} - $netsettings{'GREEN_DEV'} + Interfaces: + on / + off | + OpenVpn - $ovpnsettings{'DDEVICE'} + on / + off | + $Lang::tr{'green'} - $netsettings{'GREEN_DEV'} END ; if (&Header::blue_used()){ print < $Lang::tr{'wireless'} - $netsettings{'BLUE_DEV'} + on / + off | + $Lang::tr{'wireless'} - $netsettings{'BLUE_DEV'} END ; } if (&Header::orange_used()){ print < $Lang::tr{'dmz'} - $netsettings{'ORANGE_DEV'} + on / + off | + $Lang::tr{'dmz'} - $netsettings{'ORANGE_DEV'} END ; } @@ -423,12 +472,31 @@ END WINS-Optionen WINS-Server: WINS-Support:on / - off - + off + +
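The save handler above now appends a device name to INTERFACES with .= whenever the corresponding checkbox value is 'on'. A C analogue of that accumulation, as a hedged sketch (device names and the enabled flags are invented for illustration):

#include <stdio.h>
#include <string.h>

/* Append one device to a space-separated interface list, bounded by
 * the destination size, mirroring the Perl .= appends above. */
static void append_if(char *list, size_t len, int enabled, const char *dev)
{
        if (enabled) {
                strncat(list, " ", len - strlen(list) - 1);
                strncat(list, dev, len - strlen(list) - 1);
        }
}

int main(void)
{
        char interfaces[256] = "";

        append_if(interfaces, sizeof(interfaces), 1, "eth0");  /* GREEN */
        append_if(interfaces, sizeof(interfaces), 0, "eth1");  /* BLUE  */
        append_if(interfaces, sizeof(interfaces), 1, "tun0");  /* VPN   */
        printf("interfaces =%s\n", interfaces);
        return 0;
}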
+ + + +
+
+ +
+ +
+END +; +if ($sambasettings{'ACTION'} eq 'globalcaption') +{ + print <
+ Legende: + Einstellungen speichern + Auf default zurueck setzen - END ; +} &Header::closebox(); ############################################################################################################################ @@ -449,7 +517,6 @@ END system('/usr/local/bin/sambactrl readsmbpasswd'); open(FILE, "; close(FILE); system('/usr/local/bin/sambactrl locksmbpasswd'); @@ -460,7 +527,7 @@ END $userline[0] END ; - if ($userline[2] =~ m/N/){ + if ($userline[4] =~ /N/){ print < END @@ -471,23 +538,23 @@ END END ; } - if ($userline[2] =~ m/D/){ + if ($userline[4] =~ /D/){ print < + inaktiv
- - + +
END ; }else{ print < + aktiv
- - + +
END ; @@ -500,7 +567,7 @@ END
- +
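The user list above is parsed from Samba's smbpasswd file, and the fix moves the flag test from field 2 (the LM hash) to field 4, the account-flags field of smbpasswd(5), where 'D' marks a disabled account and 'N' a no-password account. A small illustrative parser (the sample line, shortened hashes, and user name are invented):

#include <stdio.h>
#include <string.h>

int main(void)
{
        /* Example smbpasswd(5) line: name:uid:LM hash:NT hash:flags:LCT */
        char line[] = "maniac:1001:A3XX:5FXX:[UD         ]:LCT-45E12345:";
        char *field[8];
        int i = 0;

        for (char *p = strtok(line, ":"); p != NULL && i < 8;
             p = strtok(NULL, ":"))
                field[i++] = p;

        /* Field 4 (0-based) is the flags field the CGI change reads. */
        if (i > 4 && strchr(field[4], 'D'))
                printf("%s is disabled\n", field[0]);
        else
                printf("%s is active\n", field[0]);
        return 0;
}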
@@ -528,6 +595,7 @@ if ($sambasettings{'ACTION'} eq 'usercaption') Benutzer neu anlegen Benutzer aktivieren Benutzer deaktivieren + Einstellungen speichern Passwort wechseln Benutzer loeschen @@ -546,15 +614,21 @@ if ($sambasettings{'ACTION'} eq 'userchangepw') Passwort wechseln Benutzername Passwort - + + + END ; } if ($sambasettings{'ACTION'} eq 'useradd') { - my $username = "User"; - my $password = 'samba'; + my $username = "user"; + my $password = "samba"; + chomp $username; + $username=~s/\s//g; + chomp $password; + $password=~s/\s//g; print <
@@ -562,7 +636,8 @@ if ($sambasettings{'ACTION'} eq 'useradd') Benutzer neu anlegen Benutzername Passwort -
+ + END ; @@ -609,6 +684,9 @@ END
+
+ +
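The share actions above end by invoking the setuid helper's smbsafeconf action, which rebuilds smb.conf by concatenating the "global" and "shares" fragments (the helper itself shells out to /bin/cat, as the sambactrl.c hunks further down show). A self-contained sketch of that assembly step; the paths are the ones visible in this diff:

#include <stdio.h>

static int append_file(FILE *out, const char *path)
{
        char buf[4096];
        size_t n;
        FILE *in = fopen(path, "r");

        if (in == NULL)
                return -1;
        while ((n = fread(buf, 1, sizeof(buf), in)) > 0)
                fwrite(buf, 1, n, out);
        fclose(in);
        return 0;
}

int main(void)
{
        FILE *out = fopen("/var/ipfire/samba/smb.conf", "w");

        if (out == NULL)
                return 1;
        if (append_file(out, "/var/ipfire/samba/global") != 0 ||
            append_file(out, "/var/ipfire/samba/shares") != 0)
                return 1;
        fclose(out);
        return 0;
}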
@@ -623,6 +701,8 @@ if ($sambasettings{'ACTION'} eq 'sharecaption') Legende: Share neu anlegen Share bearbeiten + Einstellungen speichern + Shares zurueck setzen Share loeschen END @@ -643,7 +723,8 @@ if ($sambasettings{'ACTION'} eq 'shareadd' || $sambasettings{'ACTION'} eq 'optio
- +
+
END ; @@ -682,7 +763,7 @@ if ($sambasettings{'ACTION'} eq 'sharechange' || $sambasettings{'ACTION'} eq 'op
- +
END diff --git a/lfs/grub b/lfs/grub index 743d01ce48..d31e24587d 100644 --- a/lfs/grub +++ b/lfs/grub @@ -74,7 +74,13 @@ $(subst %,%_MD5,$(objects)) : $(TARGET) : $(patsubst %,$(DIR_DL)/%,$(objects)) @$(PREBUILD) @rm -rf $(DIR_APP) && cd $(DIR_SRC) && tar zxf $(DIR_DL)/$(DL_FILE) - cd $(DIR_APP) && patch -Np1 < $(DIR_SRC)/src/patches/grub-0.97-disk_geometry-1.patch + + # Reiser4 + cd $(DIR_APP) && patch -Np1 < $(DIR_SRC)/src/patches/grub-0.97-reiser4-20050808.diff + + # This fails, but doesn't matter + -cd $(DIR_APP) && patch -Np1 < $(DIR_SRC)/src/patches/grub-0.97-disk_geometry-1.patch + ### WHICH ONE OF THESE PATCHES DO WE REALLY NEED? CAN WE DELETE ANY? #cd $(DIR_APP) && patch -Np1 < $(DIR_SRC)/src/patches/grub-0.97/grub-0.97-path-patch #cd $(DIR_APP) && patch -Np0 < $(DIR_SRC)/src/patches/grub-0.97/use_ferror.diff @@ -90,12 +96,10 @@ $(TARGET) : $(patsubst %,$(DIR_DL)/%,$(objects)) #cd $(DIR_APP) && patch -Np1 < $(DIR_SRC)/src/patches/grub-0.97/grub-path-cfg.diff cd $(DIR_APP) && perl -pi -e 's,/usr/lib/grub/i386-pc,/usr/share/grub/i386-pc,' docs/grub.texi + cd $(DIR_APP) && sed -i 's/AM_INIT_AUTOMAKE/&\nAM_PROG_AS/' configure.ac cd $(DIR_APP) && autoreconf --install --force - cd $(DIR_APP) && ./configure --prefix=/usr - # Temporary commented. - # CFLAGS="-0s -fno-stack-protector" \ - # STAGE2_CFLAGS="-0s -fno-stack-protector" \ + cd $(DIR_APP) && CFLAGS="-Os -s -fno-strict-aliasing" ./configure --prefix=/usr cd $(DIR_APP) && make $(MAKETUNING) cd $(DIR_APP) && make install @@ -108,7 +112,7 @@ $(TARGET) : $(patsubst %,$(DIR_DL)/%,$(objects)) done install -m 0755 $(DIR_SRC)/src/install+setup/install/grubbatch /boot/grub - /usr/sbin/grub-set-default 0 + /usr/sbin/grub-set-default 1 @rm -rf $(DIR_APP) @$(POSTBUILD) diff --git a/lfs/libaal b/lfs/libaal new file mode 100644 index 0000000000..d638dda7e0 --- /dev/null +++ b/lfs/libaal @@ -0,0 +1,89 @@ +############################################################################### +# This file is part of the IPCop Firewall. # +# # +# IPCop is free software; you can redistribute it and/or modify # +# it under the terms of the GNU General Public License as published by # +# the Free Software Foundation; either version 2 of the License, or # +# (at your option) any later version. # +# # +# IPCop is distributed in the hope that it will be useful, # +# but WITHOUT ANY WARRANTY; without even the implied warranty of # +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # +# GNU General Public License for more details. 
# +# # +# You should have received a copy of the GNU General Public License # +# along with IPCop; if not, write to the Free Software # +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # +# # +# Makefiles are based on LFSMake, which is # +# Copyright (C) 2002 Rod Roard # +# # +############################################################################### + +############################################################################### +# Definitions +############################################################################### + +include Config + +VER = 1.0.5 + +THISAPP = libaal-$(VER) +DL_FILE = $(THISAPP).tar.gz +DL_FROM = $(URL_IPFIRE) +DIR_APP = $(DIR_SRC)/$(THISAPP) +ifeq "$(LFS_PASS)" "install" + TARGET = $(DIR_INFO)/$(THISAPP)-install +else + TARGET = $(DIR_INFO)/$(THISAPP) +endif + +############################################################################### +# Top-level Rules +############################################################################### + +objects = $(DL_FILE) + +$(DL_FILE) = $(DL_FROM)/$(DL_FILE) + +$(DL_FILE)_MD5 = 6c55201acd2a2c0a1f46addf248da6a2 + +install : $(TARGET) + +check : $(patsubst %,$(DIR_CHK)/%,$(objects)) + +download :$(patsubst %,$(DIR_DL)/%,$(objects)) + +md5 : $(subst %,%_MD5,$(objects)) + +############################################################################### +# Downloading, checking, md5sum +############################################################################### + +$(patsubst %,$(DIR_CHK)/%,$(objects)) : + @$(CHECK) + +$(patsubst %,$(DIR_DL)/%,$(objects)) : + @$(LOAD) + +$(subst %,%_MD5,$(objects)) : + @$(MD5) + +############################################################################### +# Installation Details +############################################################################### + +$(TARGET) : $(patsubst %,$(DIR_DL)/%,$(objects)) + @$(PREBUILD) + @rm -rf $(DIR_APP) && cd $(DIR_SRC) && tar zxf $(DIR_DL)/$(DL_FILE) +ifeq "$(LFS_PASS)" "install" + cd $(DIR_APP) && ./configure --prefix=/opt/$(MACHINE)-uClibc + cd $(DIR_APP) && make $(MAKETUNING) + cd $(DIR_APP) && make install +else + cd $(DIR_APP) && ./configure --prefix=/usr + cd $(DIR_APP) && make $(MAKETUNING) + cd $(DIR_APP) && make install +endif + @rm -rf $(DIR_APP) + @$(POSTBUILD) diff --git a/lfs/linux b/lfs/linux index 86994180f3..5dc9c6ffd1 100644 --- a/lfs/linux +++ b/lfs/linux @@ -141,6 +141,9 @@ $(TARGET) : $(patsubst %,$(DIR_DL)/%,$(objects)) cd $(DIR_SRC) && tar xfz $(DIR_DL)/squashfs3.0.tar.gz cd $(DIR_APP) && patch -Np1 < $(DIR_SRC)/squashfs3.0/linux-2.6.15/squashfs3.0-patch + # Reiser4 + cd $(DIR_APP) && patch -Np1 < $(DIR_SRC)/src/patches/reiser4-for-2.6.16-5.patch + # Patch-o-matic cd $(DIR_SRC) && rm -rf iptables-* cd $(DIR_SRC) && tar xfj $(DIR_DL)/iptables-1.3.5.tar.bz2 diff --git a/lfs/reiser4progs b/lfs/reiser4progs new file mode 100644 index 0000000000..2c2353afeb --- /dev/null +++ b/lfs/reiser4progs @@ -0,0 +1,93 @@ +############################################################################### +# This file is part of the IPCop Firewall. # +# # +# IPCop is free software; you can redistribute it and/or modify # +# it under the terms of the GNU General Public License as published by # +# the Free Software Foundation; either version 2 of the License, or # +# (at your option) any later version. # +# # +# IPCop is distributed in the hope that it will be useful, # +# but WITHOUT ANY WARRANTY; without even the implied warranty of # +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the # +# GNU General Public License for more details. # +# # +# You should have received a copy of the GNU General Public License # +# along with IPCop; if not, write to the Free Software # +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # +# # +# Makefiles are based on LFSMake, which is # +# Copyright (C) 2002 Rod Roard # +# # +############################################################################### + +############################################################################### +# Definitions +############################################################################### + +include Config + +VER = 1.0.5 + +THISAPP = reiser4progs-$(VER) +DL_FILE = $(THISAPP).tar.gz +DL_FROM = $(URL_IPFIRE) +DIR_APP = $(DIR_SRC)/$(THISAPP) +ifeq "$(LFS_PASS)" "install" + TARGET = $(DIR_INFO)/$(THISAPP)-install +else + TARGET = $(DIR_INFO)/$(THISAPP) +endif + +############################################################################### +# Top-level Rules +############################################################################### + +objects = $(DL_FILE) + +$(DL_FILE) = $(DL_FROM)/$(DL_FILE) + +$(DL_FILE)_MD5 = b0756831e16b2395d5f443526d640792 + +install : $(TARGET) + +check : $(patsubst %,$(DIR_CHK)/%,$(objects)) + +download :$(patsubst %,$(DIR_DL)/%,$(objects)) + +md5 : $(subst %,%_MD5,$(objects)) + +############################################################################### +# Downloading, checking, md5sum +############################################################################### + +$(patsubst %,$(DIR_CHK)/%,$(objects)) : + @$(CHECK) + +$(patsubst %,$(DIR_DL)/%,$(objects)) : + @$(LOAD) + +$(subst %,%_MD5,$(objects)) : + @$(MD5) + +############################################################################### +# Installation Details +############################################################################### + +$(TARGET) : $(patsubst %,$(DIR_DL)/%,$(objects)) + @$(PREBUILD) + @rm -rf $(DIR_APP) && cd $(DIR_SRC) && tar zxf $(DIR_DL)/$(DL_FILE) +ifeq "$(LFS_PASS)" "install" + cd $(DIR_APP) && ./configure --prefix=/opt/$(MACHINE)-uClibc \ + --with-libaal=/opt/$(MACHINE)-uClibc \ + --without-readline --disable-shared \ + --sbindir=/install/initrd/sbin \ + --libdir=/install/initrd/lib + cd $(DIR_APP) && make $(MAKETUNING) + cd $(DIR_APP) && make install +else + cd $(DIR_APP) && ./configure --prefix=/usr --sbindir=/sbin + cd $(DIR_APP) && make $(MAKETUNING) + cd $(DIR_APP) && make install +endif + @rm -rf $(DIR_APP) + @$(POSTBUILD) diff --git a/make.sh b/make.sh index 632f6ecf97..19152c5cea 100644 --- a/make.sh +++ b/make.sh @@ -284,7 +284,6 @@ buildbase() { lfsmake2 file lfsmake2 findutils lfsmake2 flex - lfsmake2 grub lfsmake2 gawk lfsmake2 gettext lfsmake2 grep @@ -294,6 +293,7 @@ buildbase() { lfsmake2 iproute2 lfsmake2 kbd lfsmake2 less + lfsmake2 libaal lfsmake2 make lfsmake2 man lfsmake2 mktemp @@ -301,6 +301,7 @@ buildbase() { lfsmake2 net-tools lfsmake2 patch lfsmake2 psmisc + lfsmake2 reiser4progs lfsmake2 shadow lfsmake2 sysklogd lfsmake2 sysvinit @@ -309,6 +310,7 @@ buildbase() { lfsmake2 udev lfsmake2 util-linux lfsmake2 vim + lfsmake2 grub } buildipfire() { @@ -381,6 +383,7 @@ buildipfire() { ipfiremake capi4k-utils ipfiremake cdrtools ipfiremake dnsmasq + ipfiremake libaal ipfiremake dosfstools ipfiremake reiserfsprogs ipfiremake squashfstools @@ -521,7 +524,6 @@ buildipfire() { ipfiremake tftp-hpa ipfiremake iptraf ipfiremake nagios - ipfiremake yasuc } buildinstaller() { @@ -547,7 +549,8 @@ buildinstaller() { installmake sysvinit 
installmake misc-progs installmake e2fsprogs - installmake reiserfsprogs + installmake libaal + installmake reiser4progs installmake sysfsutils installmake util-linux installmake pciutils diff --git a/src/install+setup/install/main.c b/src/install+setup/install/main.c index 926976585c..7105972553 100644 --- a/src/install+setup/install/main.c +++ b/src/install+setup/install/main.c @@ -557,9 +557,9 @@ int main(int argc, char *argv[]) } if (raid_disk) - snprintf(commandstring, STRING_SIZE, "/bin/mkreiserfs -f %sp3", hdparams.devnode); + snprintf(commandstring, STRING_SIZE, "/sbin/mkfs.reiser4 -f %sp3", hdparams.devnode); else - snprintf(commandstring, STRING_SIZE, "/bin/mkreiserfs -f %s3", hdparams.devnode); + snprintf(commandstring, STRING_SIZE, "/sbin/mkfs.reiser4 -f %s3", hdparams.devnode); if (runcommandwithstatus(commandstring, ctr[TR_MAKING_ROOT_FILESYSTEM])) { @@ -568,9 +568,9 @@ int main(int argc, char *argv[]) } if (raid_disk) - snprintf(commandstring, STRING_SIZE, "/bin/mkreiserfs -f %sp4", hdparams.devnode); + snprintf(commandstring, STRING_SIZE, "/sbin/mkfs.reiser4 -f %sp4", hdparams.devnode); else - snprintf(commandstring, STRING_SIZE, "/bin/mkreiserfs -f %s4", hdparams.devnode); + snprintf(commandstring, STRING_SIZE, "/sbin/mkfs.reiser4 -f %s4", hdparams.devnode); if (runcommandwithstatus(commandstring, ctr[TR_MAKING_LOG_FILESYSTEM])) { diff --git a/src/misc-progs/sambactrl.c b/src/misc-progs/sambactrl.c index 4f3f248bd7..568af05e36 100644 --- a/src/misc-progs/sambactrl.c +++ b/src/misc-progs/sambactrl.c @@ -1,11 +1,17 @@ #include #include #include +#include +#include +#include #include "setuid.h" -int main(int argc, char**argv) +#define BUFFER_SIZE 1024 + +char command[BUFFER_SIZE]; + +int main(int argc, char *argv[]) { - char commandstring[256]; if (!(initsetuid())) exit(1); @@ -17,77 +23,88 @@ int main(int argc, char**argv) return 1; } - if (argc==2 && strcmp(argv[1], "smbuserdisable")==0) + if (strcmp(argv[1], "smbuserdisable")==0) { - snprintf(commandstring,STRING_SIZE-1,"/usr/bin/smbpasswd -d %s",argv[2]); - safe_system(commandstring); + snprintf(command, BUFFER_SIZE-1, "/usr/bin/smbpasswd -d %s", argv[2]); + safe_system(command); + printf(command); return 0; } - if (argc==2 && strcmp(argv[1], "smbuserenable")==0) + if (strcmp(argv[1], "smbuserenable")==0) { - snprintf(commandstring,STRING_SIZE-1,"/usr/bin/smbpasswd -e %s",argv[2]); - safe_system(commandstring); + snprintf(command, BUFFER_SIZE-1, "/usr/bin/smbpasswd -e %s", argv[2]); + safe_system(command); + printf(command); return 0; } - if (argc==2 && strcmp(argv[1], "smbuserdelete")==0) + if (strcmp(argv[1], "smbuserdelete")==0) { - snprintf(commandstring,STRING_SIZE-1,"/usr/bin/smbpasswd -x %s",argv[2]); - safe_system(commandstring); + snprintf(command, BUFFER_SIZE-1, "/usr/bin/smbpasswd -x %s", argv[2]); + safe_system(command); + printf(command); + snprintf(command, BUFFER_SIZE-1, "/usr/sbin/userdel %s", argv[2]); + safe_system(command); + printf(command); return 0; } - if (argc==2 && strcmp(argv[1], "smbsafeconf")==0) + if (strcmp(argv[1], "smbsafeconf")==0) { safe_system("/bin/cat /var/ipfire/samba/global /var/ipfire/samba/shares > /var/ipfire/samba/smb.conf"); return 0; } - if (argc==2 && strcmp(argv[1], "smbglobalreset")==0) + if (strcmp(argv[1], "smbglobalreset")==0) { - safe_system("/bin/cat /var/ipfire/samba/global.default /var/ipfire/samba/shares > /var/ipfire/samba/smb.conf"); + safe_system("/bin/cat /var/ipfire/samba/default.global /var/ipfire/samba/shares > /var/ipfire/samba/smb.conf"); + 
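The installer hunk above swaps mkreiserfs for mkfs.reiser4 while keeping the RAID naming quirk: on a RAID device the partition node carries a "p" infix (the %sp3 format), otherwise the number is appended directly (%s3). A hedged sketch of that command construction; the device name is an example, and the len - 1 bound mirrors the snprintf style used in this codebase even though snprintf's size already covers the terminator:

#include <stdio.h>

#define STRING_SIZE 256

static void mkfs_command(char *cmd, size_t len, const char *devnode,
                         int raid_disk, int partition)
{
        if (raid_disk)
                snprintf(cmd, len - 1, "/sbin/mkfs.reiser4 -f %sp%d",
                         devnode, partition);
        else
                snprintf(cmd, len - 1, "/sbin/mkfs.reiser4 -f %s%d",
                         devnode, partition);
}

int main(void)
{
        char cmd[STRING_SIZE];

        mkfs_command(cmd, sizeof(cmd), "/dev/hda", 0, 3);
        /* Print with an explicit format string: printf(cmd) would treat
         * any '%' in cmd as a conversion specifier, which is why
         * printf("%s", ...) is the safe form (the sambactrl.c hunks
         * nearby call printf(command) directly). */
        printf("%s\n", cmd);
        return 0;
}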
safe_system("/bin/cat /var/ipfire/samba/default.settings > /var/ipfire/samba/settings"); return 0; } - if (argc==2 && strcmp(argv[1], "smbsharesreset")==0) + if (strcmp(argv[1], "smbsharesreset")==0) { - safe_system("/bin/cat /var/ipfire/samba/global /var/ipfire/samba/shares.default > /var/ipfire/samba/smb.conf"); + safe_system("/bin/cat /var/ipfire/samba/global /var/ipfire/samba/default.shares > /var/ipfire/samba/smb.conf"); + safe_system("/bin/cat /var/ipfire/samba/default.shares > /var/ipfire/samba/shares"); return 0; } - if (argc==2 && strcmp(argv[1], "smbrestart")==0) + if (strcmp(argv[1], "smbrestart")==0) { return 0; } - if (argc==2 && strcmp(argv[1], "smbstop")==0) + if (strcmp(argv[1], "smbstop")==0) { return 0; } - if (argc==2 && strcmp(argv[1], "smbstart")==0) + if (strcmp(argv[1], "smbstart")==0) { return 0; } - if (argc==2 && strcmp(argv[1], "smbuseradd")==0) + if (strcmp(argv[1], "smbuseradd")==0) { - snprintf(commandstring,STRING_SIZE-1,"/usr/sbin/useradd -c 'Samba User' -d /opt/samba -g 2110 -p %s -s /bin/false %s",argv[3],argv[2]); - safe_system(commandstring); - snprintf(commandstring,STRING_SIZE-1,"/bin/printf '%s\n%s\n' | /usr/local/bin/smbpasswd -as %s",argv[3],argv[3],argv[2]); - safe_system(commandstring); + snprintf(command, BUFFER_SIZE-1, "/usr/sbin/useradd -c 'Samba User' -d /opt/samba -g 2110 -p %s -s /bin/false %s", argv[3], argv[2]); + safe_system(command); + printf(command); + snprintf(command, BUFFER_SIZE-1, "/usr/bin/printf '%s\n%s\n' | /usr/bin/smbpasswd -as %s", argv[3], argv[3], argv[2]); + safe_system(command); + printf(command); return 0; } - if (argc==2 && strcmp(argv[1], "smbchangepw")==0) + if (strcmp(argv[1], "smbchangepw")==0) { - snprintf(commandstring,STRING_SIZE-1,"/bin/printf '%s\n%s\n' | /usr/local/bin/smbpasswd -as %s",argv[3],argv[3],argv[2]); - safe_system(commandstring); + snprintf(command, BUFFER_SIZE-1, "/usr/bin/printf '%s\n%s\n' | /usr/bin/smbpasswd -as %s", argv[3], argv[3], argv[2]); + safe_system(command); + printf(command); return 0; } - if (argc==2 && strcmp(argv[1], "readsmbpasswd")==0) + if (strcmp(argv[1], "readsmbpasswd")==0) { safe_system("/bin/chown root:nobody /var/ipfire/samba/private"); safe_system("/bin/chown root:nobody /var/ipfire/samba/private/smbpasswd"); @@ -96,7 +113,7 @@ int main(int argc, char**argv) return 0; } - if (argc==2 && strcmp(argv[1], "locksmbpasswd")==0) + if (strcmp(argv[1], "locksmbpasswd")==0) { safe_system("/bin/chown root:root /var/ipfire/samba/private"); safe_system("/bin/chown root:root /var/ipfire/samba/private/smbpasswd"); @@ -104,4 +121,12 @@ int main(int argc, char**argv) safe_system("/bin/chmod 600 /var/ipfire/samba/private"); return 0; } + + if (strcmp(argv[1], "smbechotest")==0) + { + sprintf(command, BUFFER_SIZE-1, "/usr/bin/printf %s %s", argv[2], argv[3]); + printf(command); + safe_system(command); + return 0; + } } diff --git a/src/patches/grub-0.97-reiser4-20050808.diff b/src/patches/grub-0.97-reiser4-20050808.diff new file mode 100644 index 0000000000..dee4ed3d51 --- /dev/null +++ b/src/patches/grub-0.97-reiser4-20050808.diff @@ -0,0 +1,604 @@ +diff -upNr --exclude=Makefile.in --exclude='*.info' --exclude='*.m4' --exclude='*cache' --exclude=configure --exclude=.deps grub-0.97/config.h.in grub-0.97-1/config.h.in +--- grub-0.97/config.h.in 2005-05-08 06:48:19.000000000 +0400 ++++ grub-0.97-1/config.h.in 2005-08-05 22:48:24.000000000 +0400 +@@ -27,9 +27,16 @@ + /* Define to 1 if you have the header file. 
*/ + #undef HAVE_INTTYPES_H + ++/* Define to 1 if you have the `aal-minimal' library (-laal-minimal). */ ++#undef HAVE_LIBAAL_MINIMAL ++ + /* Define if you have a curses library */ + #undef HAVE_LIBCURSES + ++/* Define to 1 if you have the `reiser4-minimal' library (-lreiser4-minimal). ++ */ ++#undef HAVE_LIBREISER4_MINIMAL ++ + /* Define to 1 if you have the header file. */ + #undef HAVE_MEMORY_H + +diff -upNr --exclude=Makefile.in --exclude='*.info' --exclude='*.m4' --exclude='*cache' --exclude=configure --exclude=.deps grub-0.97/configure.ac grub-0.97-1/configure.ac +--- grub-0.97/configure.ac 2005-05-08 06:36:03.000000000 +0400 ++++ grub-0.97-1/configure.ac 2005-08-05 22:48:24.000000000 +0400 +@@ -263,6 +263,77 @@ if test x"$enable_reiserfs" != xno; then + FSYS_CFLAGS="$FSYS_CFLAGS -DFSYS_REISERFS=1" + fi + ++dnl Checking for reiser4 ++REISER4_LIBS="" ++REISER4_CFLAGS="" ++ ++OLD_LIBS=$LIBS ++OLD_CFLAGS=$CFLAGS ++LIBS="" ++CFLAGS="" ++ ++AC_ARG_ENABLE(reiser4, ++ [ --disable-reiser4 disable Reiser4 support in Stage 2]) ++ ++if test x"$enable_reiser4" != xno; then ++ AC_CHECK_LIB(aal-minimal, aal_mem_init, , ++ AC_MSG_WARN( ++Reiser4 support is disabled due to inability find libaal-minimal with ++memory manager support turned on.) ++ enable_reiser4=no ++ ) ++fi ++ ++if test x"$enable_reiser4" != xno; then ++ AC_CHECK_HEADER(aal/libaal.h, , ++ AC_MSG_WARN( ++Libaal header files are not found. Reiser4 support is disabled ++ ) ++ enable_reiser4=no) ++fi ++ ++if test x"$enable_reiser4" != xno; then ++ AC_CHECK_LIB(reiser4-minimal, reiser4_fs_open, , ++ AC_MSG_WARN( ++Reiser4 support is disabled due to inability find valid libreiser4-minimal.) ++ enable_reiser4=no, ++ -laal-minimal ++ ) ++fi ++ ++if test x"$enable_reiser4" != xno; then ++ AC_CHECK_HEADER(reiser4/libreiser4.h, , ++ AC_MSG_WARN( ++Reiser4 header files are not found. Reiser4 support is disabled. ++ ) ++ enable_reiser4=no) ++fi ++ ++if test x"$enable_reiser4" != xno; then ++ REISER4_CFLAGS="$REISER4_CFLAGS -DFSYS_REISER4=1" ++ REISER4_LIBS=$LIBS ++fi ++ ++if test x"$enable_reiser4" != xno; then ++ AC_CHECK_LIB(reiser4-minimal, __sym40_plug_init, ++ REISER4_CFLAGS="$REISER4_CFLAGS -DENABLE_SYMLINKS=1", ++AC_MSG_WARN(Reiser4 symlinks support is disabled.), ++ -laal-minimal ++ ) ++fi ++ ++LIBS=$OLD_LIBS ++CFLAGS=$OLD_CFLAGS ++ ++if test x"$enable_reiser4" != xno; then ++ enable_reiser4_support=yes ++ FSYS_CFLAGS="$FSYS_CFLAGS $REISER4_CFLAGS" ++fi ++ ++AC_SUBST(REISER4_LIBS) ++AC_SUBST(REISER4_CFLAGS) ++AM_CONDITIONAL(ENABLE_REISER4_SUPPORT, test x"$enable_reiser4" != xno) ++ + AC_ARG_ENABLE(vstafs, + [ --disable-vstafs disable VSTa FS support in Stage 2]) + +diff -upNr --exclude=Makefile.in --exclude='*.info' --exclude='*.m4' --exclude='*cache' --exclude=configure --exclude=.deps grub-0.97/docs/grub.texi grub-0.97-1/docs/grub.texi +--- grub-0.97/docs/grub.texi 2005-05-08 06:59:59.000000000 +0400 ++++ grub-0.97-1/docs/grub.texi 2005-08-05 22:48:24.000000000 +0400 +@@ -283,7 +283,7 @@ devices, partitions, and files in a dire + Support multiple filesystem types transparently, plus a useful explicit + blocklist notation. The currently supported filesystem types are + @dfn{BSD FFS}, @dfn{DOS FAT16 and FAT32}, @dfn{Minix fs}, @dfn{Linux +-ext2fs}, @dfn{ReiserFS}, @dfn{JFS}, @dfn{XFS}, and @dfn{VSTa ++ext2fs}, @dfn{ReiserFS}, @dfn{ReiserFS}, @dfn{JFS}, @dfn{XFS}, and @dfn{VSTa + fs}. @xref{Filesystem}, for more information. + + @item Support automatic decompression +@@ -1776,6 +1776,7 @@ itself. 
Usually, this is put in a filesy + @itemx jfs_stage1_5 + @itemx minix_stage1_5 + @itemx reiserfs_stage1_5 ++@itemx reiser4_stage1_5 + @itemx vstafs_stage1_5 + @itemx xfs_stage1_5 + +diff -upNr --exclude=Makefile.in --exclude='*.info' --exclude='*.m4' --exclude='*cache' --exclude=configure --exclude=.deps grub-0.97/grub/Makefile.am grub-0.97-1/grub/Makefile.am +--- grub-0.97/grub/Makefile.am 2005-02-02 23:38:19.000000000 +0300 ++++ grub-0.97-1/grub/Makefile.am 2005-08-05 22:48:24.000000000 +0400 +@@ -16,4 +16,4 @@ AM_CPPFLAGS = -DGRUB_UTIL=1 -DFSYS_EXT2F + AM_CFLAGS = $(GRUB_CFLAGS) + + grub_SOURCES = main.c asmstub.c +-grub_LDADD = ../stage2/libgrub.a ../lib/libcommon.a $(GRUB_LIBS) ++grub_LDADD = ../stage2/libgrub.a ../lib/libcommon.a $(GRUB_LIBS) $(REISER4_LIBS) +diff -upNr --exclude=Makefile.in --exclude='*.info' --exclude='*.m4' --exclude='*cache' --exclude=configure --exclude=.deps grub-0.97/INSTALL grub-0.97-1/INSTALL +--- grub-0.97/INSTALL 2005-05-08 06:43:15.000000000 +0400 ++++ grub-0.97-1/INSTALL 2005-08-05 22:48:24.000000000 +0400 +@@ -207,6 +207,9 @@ operates. + `--disable-reiserfs' + Omit the ReiserFS support in Stage 2. + ++`--disable-reiser4' ++ Omit the Reiser4 support in Stage 2. ++ + `--disable-vstafs' + Omit the VSTa filesystem support in Stage 2. + +diff -upNr --exclude=Makefile.in --exclude='*.info' --exclude='*.m4' --exclude='*cache' --exclude=configure --exclude=.deps grub-0.97/stage2/builtins.c grub-0.97-1/stage2/builtins.c +--- grub-0.97/stage2/builtins.c 2005-02-16 00:58:23.000000000 +0300 ++++ grub-0.97-1/stage2/builtins.c 2005-08-05 22:48:24.000000000 +0400 +@@ -3880,6 +3880,7 @@ setup_func (char *arg, int flags) + {"jfs", "/jfs_stage1_5"}, + {"minix", "/minix_stage1_5"}, + {"reiserfs", "/reiserfs_stage1_5"}, ++ {"reiser4", "/reiser4_stage1_5"}, + {"vstafs", "/vstafs_stage1_5"}, + {"xfs", "/xfs_stage1_5"} + }; +diff -upNr --exclude=Makefile.in --exclude='*.info' --exclude='*.m4' --exclude='*cache' --exclude=configure --exclude=.deps grub-0.97/stage2/disk_io.c grub-0.97-1/stage2/disk_io.c +--- grub-0.97/stage2/disk_io.c 2004-05-23 20:35:24.000000000 +0400 ++++ grub-0.97-1/stage2/disk_io.c 2005-08-05 22:48:24.000000000 +0400 +@@ -63,6 +63,9 @@ struct fsys_entry fsys_table[NUM_FSYS + + # ifdef FSYS_REISERFS + {"reiserfs", reiserfs_mount, reiserfs_read, reiserfs_dir, 0, reiserfs_embed}, + # endif ++# ifdef FSYS_REISER4 ++ {"reiser4", reiser4_mount, reiser4_read, reiser4_dir, 0, reiser4_embed}, ++# endif + # ifdef FSYS_VSTAFS + {"vstafs", vstafs_mount, vstafs_read, vstafs_dir, 0, 0}, + # endif +diff -upNr --exclude=Makefile.in --exclude='*.info' --exclude='*.m4' --exclude='*cache' --exclude=configure --exclude=.deps grub-0.97/stage2/filesys.h grub-0.97-1/stage2/filesys.h +--- grub-0.97/stage2/filesys.h 2004-05-14 23:36:43.000000000 +0400 ++++ grub-0.97-1/stage2/filesys.h 2005-08-05 22:48:24.000000000 +0400 +@@ -77,6 +77,16 @@ int reiserfs_embed (int *start_sector, i + #define FSYS_REISERFS_NUM 0 + #endif + ++#ifdef FSYS_REISER4 ++#define FSYS_REISER4_NUM 1 ++int reiser4_mount (void); ++int reiser4_read (char *buf, int len); ++int reiser4_dir (char *dirname); ++int reiser4_embed (int *start_sector, int needed_sectors); ++#else ++#define FSYS_REISER4_NUM 0 ++#endif ++ + #ifdef FSYS_VSTAFS + #define FSYS_VSTAFS_NUM 1 + int vstafs_mount (void); +@@ -127,8 +137,8 @@ int iso9660_dir (char *dirname); + #ifndef NUM_FSYS + #define NUM_FSYS \ + (FSYS_FFS_NUM + FSYS_FAT_NUM + FSYS_EXT2FS_NUM + FSYS_MINIX_NUM \ +- + FSYS_REISERFS_NUM + FSYS_VSTAFS_NUM + FSYS_JFS_NUM + 
FSYS_XFS_NUM \ +- + FSYS_TFTP_NUM + FSYS_ISO9660_NUM + FSYS_UFS2_NUM) ++ + FSYS_REISERFS_NUM + FSYS_REISER4_NUM + FSYS_VSTAFS_NUM + FSYS_JFS_NUM \ ++ + FSYS_XFS_NUM + FSYS_TFTP_NUM + FSYS_ISO9660_NUM + FSYS_UFS2_NUM) + #endif + + /* defines for the block filesystem info area */ +diff -upNr --exclude=Makefile.in --exclude='*.info' --exclude='*.m4' --exclude='*cache' --exclude=configure --exclude=.deps grub-0.97/stage2/fsys_reiser4.c grub-0.97-1/stage2/fsys_reiser4.c +--- grub-0.97/stage2/fsys_reiser4.c 1970-01-01 03:00:00.000000000 +0300 ++++ grub-0.97-1/stage2/fsys_reiser4.c 2005-08-05 22:48:24.000000000 +0400 +@@ -0,0 +1,260 @@ ++/* ++ * fsys_reiser4.c -- reiser4 filesystem support. ++ * Copyright (C) 2000, 2001 Free Software Foundation, Inc. ++ * ++ * GRUB -- GRand Unified Bootloader ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. ++*/ ++ ++#ifdef FSYS_REISER4 ++#include "shared.h" ++#include "filesys.h" ++ ++#define ENABLE_MINIMAL ++#include ++ ++static reiser4_fs_t *fs = NULL; ++static aal_device_t *dev = NULL; ++static reiser4_object_t *object = NULL; ++ ++/* Read callback of grub specific device. It uses devread() for reading passed ++ @count of device blocks starting from @blk to passed @buff. */ ++static errno_t grub_dev_read(aal_device_t *device, ++ void *buff, blk_t blk, ++ count_t count) ++{ ++ unsigned int size; ++ unsigned int factor; ++ unsigned int sector; ++ ++ /* Calculating actual sector and size in bytes to be read from ++ device. */ ++ factor = device->blksize / SECTOR_SIZE; ++ sector = (unsigned int)blk << aal_log2(factor); ++ size = (unsigned int)count * (SECTOR_SIZE * factor); ++ ++ /* Reading from the current device */ ++ if (!devread(sector, 0, size, buff)) ++ return -EIO; ++ ++ return 0; ++} ++ ++/* Length callback of grub device */ ++static count_t grub_dev_len(aal_device_t *device) { ++ unsigned int factor; ++ ++ /* Getting partition length in device blocks */ ++ factor = device->blksize / SECTOR_SIZE; ++ return (part_length >> aal_log2(factor)); ++} ++ ++/* ++ Initializing grub device abstraction instance. It will use devread and friends ++ for providing needed functionality. ++*/ ++struct aal_device_ops grub_dev_ops = { ++ .read = grub_dev_read, ++ .len = grub_dev_len ++}; ++ ++/* Initializes reiser4 */ ++static int reiser4_init(void) { ++ extern aal_hash_table_t *plugins; ++ ++ plugins = NULL; ++ ++ /* Initializing memory manager */ ++ aal_mem_init((void *)FSYS_BUF, FSYS_BUFLEN); ++ ++ /* Initializing device abstraction on current device GRUB uses. */ ++ if (!(dev = aal_device_open(&grub_dev_ops, NULL, ++ SECTOR_SIZE, 0))) ++ { ++ return 0; ++ } ++ ++ /* Initializing libreiser4 (plugins, etc) */ ++ return !libreiser4_init(); ++} ++ ++#define MEMORY_WATERMARK 8192 ++ ++/* Memory pressure detect function. 
*/ ++static int mpressure_detect(reiser4_tree_t *tree) { ++ return (aal_mem_free() <= MEMORY_WATERMARK); ++} ++ ++/* Reiser4 mount() routine */ ++int reiser4_mount(void) { ++ ++ /* Initialize all reiser4 related stuff first */ ++ if (!reiser4_init()) ++ return 0; ++ ++ /* Open filesystem on @dev. */ ++ if (!(fs = reiser4_fs_open(dev))) ++ return 0; ++ ++ fs->tree->mpc_func = mpressure_detect; ++ ++ object = NULL; ++ return 1; ++} ++ ++/* Reiser4 read() handler */ ++int reiser4_read(char *buf, int len) { ++ int64_t read; ++ ++ if (object == NULL) ++ return 0; ++ ++ /* Seet at current position denoted by @filepos */ ++ if (objplug(object)->o.object_ops->seek) { ++ plug_call(objplug(object)->o.object_ops, ++ seek, object->ent, filepos); ++ } ++ ++ /* Reading current file data starting from @filepos */ ++ disk_read_func = disk_read_hook; ++ read = objplug(object)->o.object_ops->read ? ++ plug_call(objplug(object)->o.object_ops, read, ++ object->ent, buf, len) : -EINVAL; ++ disk_read_func = NULL; ++ ++ if (read < 0) { ++ errnum = ERR_FSYS_CORRUPT; ++ return 0; ++ } ++ ++ filepos += read; ++ return read; ++} ++ ++/* Reiser4 file open() routine */ ++int reiser4_dir(char *dirname) { ++ char *ch; ++ ++ if (fs == NULL) ++ return 0; ++ ++ if (object != NULL) { ++ plug_call(objplug(object)->o.object_ops, ++ close, object->ent); ++ aal_free(object); ++ object = NULL; ++ } ++ ++ /* Cutting out string after first space character */ ++ if ((ch = aal_strchr(dirname, ' '))) ++ *ch = '\0'; ++ ++ /* This function is also called for getting directory list for ++ maintaining the bash-like completion. */ ++#ifndef STAGE1_5 ++ if (print_possibilities) { ++ char entry[256]; ++ entry_hint_t entry_hint; ++ ++ /* Getting last part of name (jsut after last '/') */ ++ if (*(dirname + aal_strlen(dirname) - 1) != '/') { ++ ++ if (!(ch = aal_strrchr(dirname, '/'))) { ++ errnum = ERR_BAD_FILETYPE; ++ return 0; ++ } ++ ++ aal_strncpy(entry, ch + 1, sizeof(entry)); ++ *(ch + 1) = '\0'; ++ } else { ++ aal_memset(entry, 0, sizeof(entry)); ++ } ++ ++ /* Open obejct by @dirname */ ++ if (!(object = reiser4_semantic_open(fs->tree, dirname, ++ NULL, 1))) ++ { ++ errnum = ERR_FILE_NOT_FOUND; ++ return 0; ++ } ++ ++ /* Checking if it is a directory object */ ++ if (object->ent->opset.plug[OPSET_OBJ]->id.group != DIR_OBJECT) ++ { ++ /* If not, cutting out last '/' character */ ++ if ((ch = aal_strrchr(dirname, '/'))) ++ *ch = '\0'; ++ ++ /* Close current object */ ++ plug_call(objplug(object)->o.object_ops, ++ close, object->ent); ++ aal_free(object); ++ return 0; ++ } ++ ++ /* Reading the opened directory to build the completion list. */ ++ if (objplug(object)->o.object_ops->readdir) { ++ while (plug_call(objplug(object)->o.object_ops, readdir, ++ object->ent, &entry_hint) > 0) ++ { ++ if (substring(entry, entry_hint.name) <= 0) { ++ if (print_possibilities > 0) ++ print_possibilities = ++ -print_possibilities; ++ ++ print_a_completion(entry_hint.name); ++ } ++ } ++ } ++ } else { ++#endif ++ /* This is the case when resier4_dir() is called for open the ++ file @dirname, not for building completion list. */ ++ if (!(object = reiser4_semantic_open(fs->tree, dirname, ++ NULL, 1))) ++ { ++ errnum = ERR_FILE_NOT_FOUND; ++ return 0; ++ } ++ ++ if (object->ent->opset.plug[OPSET_OBJ]->id.group != REG_OBJECT) ++ { ++ errnum = ERR_BAD_FILETYPE; ++ return 0; ++ } ++ ++ /* Initializing GRUB global variables @filepos and @filemax. 
*/ ++ filepos = 0; ++ filemax = reiser4_object_size(object); ++ ++ return 1; ++#ifndef STAGE1_5 ++ } ++ ++ return 1; ++#endif ++ ++ errnum = ERR_FILE_NOT_FOUND; ++ return 0; ++} ++ ++/* Returns how many sectors may be used for embeding reiser4_stage1_5 in teh ++ case of installing GRUB to partition instead of MBR. */ ++int reiser4_embed (int *start_sector, int needed_sectors) { ++ *start_sector = 1; ++ return needed_sectors <= ((REISER4_MASTER_OFFSET >> SECTOR_BITS) - 1); ++} ++#endif /* FSYS_REISER4 */ +diff -upNr --exclude=Makefile.in --exclude='*.info' --exclude='*.m4' --exclude='*cache' --exclude=configure --exclude=.deps grub-0.97/stage2/Makefile.am grub-0.97-1/stage2/Makefile.am +--- grub-0.97/stage2/Makefile.am 2005-02-02 23:37:35.000000000 +0300 ++++ grub-0.97-1/stage2/Makefile.am 2005-08-05 22:48:24.000000000 +0400 +@@ -13,17 +13,25 @@ EXTRA_DIST = setjmp.S apm.S $(noinst_SCR + # For . + INCLUDES = -I$(top_srcdir)/stage1 + ++if ENABLE_REISER4_SUPPORT ++REISER4_STAGE1_5 = reiser4_stage1_5 ++REISER4_STAGE1_5_EXEC = reiser4_stage1_5.exec ++else ++REISER4_STAGE1_5 = ++REISER4_STAGE1_5_EXEC = ++endif ++ + # The library for /sbin/grub. + noinst_LIBRARIES = libgrub.a + libgrub_a_SOURCES = boot.c builtins.c char_io.c cmdline.c common.c \ + disk_io.c fsys_ext2fs.c fsys_fat.c fsys_ffs.c fsys_iso9660.c \ +- fsys_jfs.c fsys_minix.c fsys_reiserfs.c fsys_ufs2.c \ ++ fsys_jfs.c fsys_minix.c fsys_reiserfs.c fsys_reiser4.c fsys_ufs2.c \ + fsys_vstafs.c fsys_xfs.c gunzip.c md5.c serial.c stage2.c \ + terminfo.c tparm.c + libgrub_a_CFLAGS = $(GRUB_CFLAGS) -I$(top_srcdir)/lib \ + -DGRUB_UTIL=1 -DFSYS_EXT2FS=1 -DFSYS_FAT=1 -DFSYS_FFS=1 \ + -DFSYS_ISO9660=1 -DFSYS_JFS=1 -DFSYS_MINIX=1 -DFSYS_REISERFS=1 \ +- -DFSYS_UFS2=1 -DFSYS_VSTAFS=1 -DFSYS_XFS=1 \ ++ $(REISER4_CFLAGS) -DFSYS_UFS2=1 -DFSYS_VSTAFS=1 -DFSYS_XFS=1 \ + -DUSE_MD5_PASSWORDS=1 -DSUPPORT_SERIAL=1 -DSUPPORT_HERCULES=1 + + # Stage 2 and Stage 1.5's. 
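The new fsys_reiser4.c above follows GRUB's stage2 filesystem-driver contract: the mount/read/dir callbacks registered in fsys_table communicate through the globals filepos and filemax, where *_dir() opens a file and sets them and *_read() copies data and advances filepos. A self-contained analogue with an in-memory "file" (everything here is illustrative, not GRUB code):

#include <stdio.h>
#include <string.h>

/* Stand-ins for GRUB's stage2 globals. */
static unsigned long filepos, filemax;
static const char *filedata;

/* Analogue of reiser4_dir(): "open" a file, set filepos/filemax. */
static int demo_dir(const char *data)
{
        filedata = data;
        filepos = 0;
        filemax = strlen(data);
        return 1;
}

/* Analogue of reiser4_read(): copy from filepos, advance it. */
static int demo_read(char *buf, int len)
{
        if (filepos >= filemax)
                return 0;
        if ((unsigned long)len > filemax - filepos)
                len = filemax - filepos;
        memcpy(buf, filedata + filepos, len);
        filepos += len;
        return len;
}

int main(void)
{
        char buf[8];
        int n;

        demo_dir("kernel image bytes");
        while ((n = demo_read(buf, sizeof(buf))) > 0)
                fwrite(buf, 1, n, stdout);
        putchar('\n');
        return 0;
}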
+@@ -34,24 +42,26 @@ EXTRA_PROGRAMS = nbloader.exec pxeloader + if DISKLESS_SUPPORT + pkglib_DATA = stage2 stage2_eltorito e2fs_stage1_5 fat_stage1_5 \ + ffs_stage1_5 iso9660_stage1_5 jfs_stage1_5 minix_stage1_5 \ +- reiserfs_stage1_5 ufs2_stage1_5 vstafs_stage1_5 xfs_stage1_5 \ +- nbgrub pxegrub ++ reiserfs_stage1_5 $(REISER4_STAGE1_5) ufs2_stage1_5 \ ++ vstafs_stage1_5 xfs_stage1_5 nbgrub pxegrub + noinst_DATA = pre_stage2 start start_eltorito nbloader pxeloader diskless + noinst_PROGRAMS = pre_stage2.exec start.exec start_eltorito.exec \ + e2fs_stage1_5.exec fat_stage1_5.exec ffs_stage1_5.exec \ + iso9660_stage1_5.exec jfs_stage1_5.exec minix_stage1_5.exec \ +- reiserfs_stage1_5.exec ufs2_stage1_5.exec vstafs_stage1_5.exec \ +- xfs_stage1_5.exec nbloader.exec pxeloader.exec diskless.exec ++ reiserfs_stage1_5.exec $(REISER4_STAGE1_5_EXEC) ufs2_stage1_5.exec \ ++ vstafs_stage1_5.exec xfs_stage1_5.exec nbloader.exec pxeloader.exec \ ++ diskless.exec + else + pkglib_DATA = stage2 stage2_eltorito e2fs_stage1_5 fat_stage1_5 \ + ffs_stage1_5 iso9660_stage1_5 jfs_stage1_5 minix_stage1_5 \ +- reiserfs_stage1_5 ufs2_stage1_5 vstafs_stage1_5 xfs_stage1_5 ++ reiserfs_stage1_5 $(REISER4_STAGE1_5) ufs2_stage1_5 \ ++ vstafs_stage1_5 xfs_stage1_5 + noinst_DATA = pre_stage2 start start_eltorito + noinst_PROGRAMS = pre_stage2.exec start.exec start_eltorito.exec \ + e2fs_stage1_5.exec fat_stage1_5.exec ffs_stage1_5.exec \ + iso9660_stage1_5.exec jfs_stage1_5.exec minix_stage1_5.exec \ +- reiserfs_stage1_5.exec ufs2_stage1_5.exec vstafs_stage1_5.exec \ +- xfs_stage1_5.exec ++ reiserfs_stage1_5.exec $(REISER4_STAGE1_5_EXEC) ufs2_stage1_5.exec \ ++ vstafs_stage1_5.exec xfs_stage1_5.exec + endif + MOSTLYCLEANFILES = $(noinst_PROGRAMS) + +@@ -79,7 +89,7 @@ else + HERCULES_FLAGS = + endif + +-STAGE2_COMPILE = $(STAGE2_CFLAGS) -fno-builtin -nostdinc \ ++STAGE2_COMPILE = $(STAGE2_CFLAGS) -fno-builtin \ + $(NETBOOT_FLAGS) $(SERIAL_FLAGS) $(HERCULES_FLAGS) + + STAGE1_5_LINK = -nostdlib -Wl,-N -Wl,-Ttext -Wl,2000 +@@ -89,14 +99,17 @@ STAGE1_5_COMPILE = $(STAGE2_COMPILE) -DN + pre_stage2_exec_SOURCES = asm.S bios.c boot.c builtins.c char_io.c \ + cmdline.c common.c console.c disk_io.c fsys_ext2fs.c \ + fsys_fat.c fsys_ffs.c fsys_iso9660.c fsys_jfs.c fsys_minix.c \ +- fsys_reiserfs.c fsys_ufs2.c fsys_vstafs.c fsys_xfs.c gunzip.c \ +- hercules.c md5.c serial.c smp-imps.c stage2.c terminfo.c tparm.c ++ fsys_reiserfs.c fsys_reiser4.c fsys_ufs2.c fsys_vstafs.c \ ++ fsys_xfs.c gunzip.c hercules.c md5.c serial.c smp-imps.c \ ++ stage2.c terminfo.c tparm.c + pre_stage2_exec_CFLAGS = $(STAGE2_COMPILE) $(FSYS_CFLAGS) + pre_stage2_exec_CCASFLAGS = $(STAGE2_COMPILE) $(FSYS_CFLAGS) + pre_stage2_exec_LDFLAGS = $(PRE_STAGE2_LINK) + + if NETBOOT_SUPPORT +-pre_stage2_exec_LDADD = ../netboot/libdrivers.a ++pre_stage2_exec_LDADD = ../netboot/libdrivers.a $(REISER4_LIBS) ++else ++pre_stage2_exec_LDADD = $(REISER4_LIBS) + endif + + if DISKLESS_SUPPORT +@@ -190,6 +203,16 @@ reiserfs_stage1_5_exec_CCASFLAGS = $(STA + -DNO_BLOCK_FILES=1 + reiserfs_stage1_5_exec_LDFLAGS = $(STAGE1_5_LINK) + ++# For reiser4_stage1_5 target. 
++reiser4_stage1_5_exec_SOURCES = start.S asm.S common.c char_io.c \ ++ disk_io.c stage1_5.c fsys_reiser4.c bios.c ++reiser4_stage1_5_exec_CFLAGS = $(STAGE1_5_COMPILE) $(REISER4_CFLAGS) \ ++ -DNO_BLOCK_FILES=1 ++reiser4_stage1_5_exec_CCASFLAGS = $(STAGE1_5_COMPILE) $(REISER4_CFLAGS) \ ++ -DNO_BLOCK_FILES=1 ++reiser4_stage1_5_exec_LDFLAGS = $(STAGE1_5_LINK) ++reiser4_stage1_5_exec_LDADD = $(REISER4_LIBS) ++ + # For vstafs_stage1_5 target. + vstafs_stage1_5_exec_SOURCES = start.S asm.S common.c char_io.c \ + disk_io.c stage1_5.c fsys_vstafs.c bios.c +diff -upNr --exclude=Makefile.in --exclude='*.info' --exclude='*.m4' --exclude='*cache' --exclude=configure --exclude=.deps grub-0.97/stage2/shared.h grub-0.97-1/stage2/shared.h +--- grub-0.97/stage2/shared.h 2004-06-19 20:40:09.000000000 +0400 ++++ grub-0.97-1/stage2/shared.h 2005-08-05 22:48:24.000000000 +0400 +@@ -207,11 +207,12 @@ extern char *grub_scratch_mem; + #define STAGE2_ID_FAT_STAGE1_5 3 + #define STAGE2_ID_MINIX_STAGE1_5 4 + #define STAGE2_ID_REISERFS_STAGE1_5 5 +-#define STAGE2_ID_VSTAFS_STAGE1_5 6 +-#define STAGE2_ID_JFS_STAGE1_5 7 +-#define STAGE2_ID_XFS_STAGE1_5 8 +-#define STAGE2_ID_ISO9660_STAGE1_5 9 +-#define STAGE2_ID_UFS2_STAGE1_5 10 ++#define STAGE2_ID_REISER4_STAGE1_5 6 ++#define STAGE2_ID_VSTAFS_STAGE1_5 7 ++#define STAGE2_ID_JFS_STAGE1_5 8 ++#define STAGE2_ID_XFS_STAGE1_5 9 ++#define STAGE2_ID_ISO9660_STAGE1_5 10 ++#define STAGE2_ID_UFS2_STAGE1_5 11 + + #ifndef STAGE1_5 + # define STAGE2_ID STAGE2_ID_STAGE2 +@@ -226,6 +227,8 @@ extern char *grub_scratch_mem; + # define STAGE2_ID STAGE2_ID_MINIX_STAGE1_5 + # elif defined(FSYS_REISERFS) + # define STAGE2_ID STAGE2_ID_REISERFS_STAGE1_5 ++# elif defined(FSYS_REISER4) ++# define STAGE2_ID STAGE2_ID_REISER4_STAGE1_5 + # elif defined(FSYS_VSTAFS) + # define STAGE2_ID STAGE2_ID_VSTAFS_STAGE1_5 + # elif defined(FSYS_JFS) diff --git a/src/patches/reiser4-for-2.6.16-5.patch b/src/patches/reiser4-for-2.6.16-5.patch new file mode 100644 index 0000000000..0a62c8631c --- /dev/null +++ b/src/patches/reiser4-for-2.6.16-5.patch @@ -0,0 +1,80701 @@ +Index: linux-2.6.16/Documentation/Changes +=================================================================== +--- linux-2.6.16.orig/Documentation/Changes ++++ linux-2.6.16/Documentation/Changes +@@ -54,6 +54,7 @@ o module-init-tools 0.9.10 + o e2fsprogs 1.29 # tune2fs + o jfsutils 1.1.3 # fsck.jfs -V + o reiserfsprogs 3.6.3 # reiserfsck -V 2>&1|grep reiserfsprogs ++o reiser4progs 1.0.0 # fsck.reiser4 -V + o xfsprogs 2.6.0 # xfs_db -V + o pcmciautils 004 + o pcmcia-cs 3.1.21 # cardmgr -V +@@ -163,6 +164,13 @@ The reiserfsprogs package should be used + versions of mkreiserfs, resize_reiserfs, debugreiserfs and + reiserfsck. These utils work on both i386 and alpha platforms. + ++Reiser4progs ++------------ ++ ++The reiser4progs package contains utilities for the reiser4 file system. ++Detailed instructions are provided in the README file located at: ++. 
++ + Xfsprogs + -------- + +@@ -344,6 +352,10 @@ Reiserfsprogs + ------------- + o + ++Reiser4progs ++------------ ++o ++ + Xfsprogs + -------- + o +Index: linux-2.6.16/Documentation/filesystems/reiser4.txt +=================================================================== +--- /dev/null ++++ linux-2.6.16/Documentation/filesystems/reiser4.txt +@@ -0,0 +1,75 @@ ++Reiser4 filesystem ++================== ++Reiser4 is a file system based on dancing tree algorithms, and is ++described at http://www.namesys.com ++ ++ ++References ++========== ++web page http://namesys.com/v4/v4.html ++source code ftp://ftp.namesys.com/pub/reiser4-for-2.6/ ++userland tools ftp://ftp.namesys.com/pub/reiser4progs/ ++install page http://www.namesys.com/install_v4.html ++ ++Compile options ++=============== ++Enable reiser4 debug mode ++ This checks everything imaginable while reiser4 ++ runs ++ ++Mount options ++============= ++tmgr.atom_max_size=N ++ Atoms containing more than N blocks will be forced to commit. ++ N is decimal. ++ Default is nr_free_pagecache_pages() / 2 at mount time. ++ ++tmgr.atom_max_age=N ++ Atoms older than N seconds will be forced to commit. N is decimal. ++ Default is 600. ++ ++tmgr.atom_max_flushers=N ++ Limit of concurrent flushers for one atom. 0 means no limit. ++ Default is 0. ++ ++tree.cbk_cache.nr_slots=N ++ Number of slots in the cbk cache. ++ ++flush.relocate_threshold=N ++ If flush finds more than N adjacent dirty leaf-level blocks it ++ will force them to be relocated. ++ Default is 64. ++ ++flush.relocate_distance=N ++ If flush finds can find a block allocation closer than at most ++ N from the preceder it will relocate to that position. ++ Default is 64. ++ ++flush.scan_maxnodes=N ++ The maximum number of nodes to scan left on a level during ++ flush. ++ Default is 10000. ++ ++optimal_io_size=N ++ Preferred IO size. This value is used to set st_blksize of ++ struct stat. ++ Default is 65536. ++ ++bsdgroups ++ Turn on BSD-style gid assignment. ++ ++32bittimes ++ By default file in reiser4 have 64 bit timestamps. Files ++ created when filesystem is mounted with 32bittimes mount ++ option will get 32 bit timestamps. ++ ++mtflush ++ Turn off concurrent flushing. ++ ++nopseudo ++ Disable pseudo files support. See ++ http://namesys.com/v4/pseudo.html for more about pseudo files. ++ ++dont_load_bitmap ++ Don't load all bitmap blocks at mount time, it is useful for ++ machines with tiny RAM and large disks. 
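The mount options documented in reiser4.txt above are passed as the data string of mount(2). A minimal sketch; the device node and mount point are examples, the call needs root privileges, and it only succeeds on a kernel built with the CONFIG_REISER4_FS support this commit enables:

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
        /* Option names straight from the table above. */
        const char *opts =
                "tmgr.atom_max_age=600,flush.relocate_threshold=64";

        if (mount("/dev/hda3", "/mnt/root", "reiser4", 0, opts) != 0) {
                perror("mount");
                return 1;
        }
        return 0;
}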
+Index: linux-2.6.16/fs/Kconfig +=================================================================== +--- linux-2.6.16.orig/fs/Kconfig ++++ linux-2.6.16/fs/Kconfig +@@ -177,6 +177,8 @@ config FS_MBCACHE + default y if EXT2_FS=y || EXT3_FS=y + default m if EXT2_FS=m || EXT3_FS=m + ++source "fs/reiser4/Kconfig" ++ + config REISERFS_FS + tristate "Reiserfs support" + help +Index: linux-2.6.16/fs/Makefile +=================================================================== +--- linux-2.6.16.orig/fs/Makefile ++++ linux-2.6.16/fs/Makefile +@@ -51,6 +51,7 @@ obj-$(CONFIG_PROFILING) += dcookies.o + + # Do not add any filesystems before this line + obj-$(CONFIG_REISERFS_FS) += reiserfs/ ++obj-$(CONFIG_REISER4_FS) += reiser4/ + obj-$(CONFIG_EXT3_FS) += ext3/ # Before ext2 so root fs can be ext3 + obj-$(CONFIG_JBD) += jbd/ + obj-$(CONFIG_EXT2_FS) += ext2/ +Index: linux-2.6.16/fs/fs-writeback.c +=================================================================== +--- linux-2.6.16.orig/fs/fs-writeback.c ++++ linux-2.6.16/fs/fs-writeback.c +@@ -286,8 +286,6 @@ __writeback_single_inode(struct inode *i + * WB_SYNC_HOLD is a hack for sys_sync(): reattach the inode to sb->s_dirty so + * that it can be located for waiting on in __writeback_single_inode(). + * +- * Called under inode_lock. +- * + * If `bdi' is non-zero then we're being asked to writeback a specific queue. + * This function assumes that the blockdev superblock's inodes are backed by + * a variety of queues, so all inodes are searched. For other superblocks, +@@ -303,11 +301,13 @@ __writeback_single_inode(struct inode *i + * on the writer throttling path, and we get decent balancing between many + * throttled threads: we don't want them all piling up on __wait_on_inode. + */ +-static void +-sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc) ++void ++generic_sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc) + { + const unsigned long start = jiffies; /* livelock avoidance */ + ++ spin_lock(&inode_lock); ++ + if (!wbc->for_kupdate || list_empty(&sb->s_io)) + list_splice_init(&sb->s_dirty, &sb->s_io); + +@@ -387,8 +387,19 @@ sync_sb_inodes(struct super_block *sb, s + if (wbc->nr_to_write <= 0) + break; + } ++ spin_unlock(&inode_lock); + return; /* Leave any unwritten inodes on s_io */ + } ++EXPORT_SYMBOL(generic_sync_sb_inodes); ++ ++static void ++sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc) ++{ ++ if (sb->s_op->sync_inodes) ++ sb->s_op->sync_inodes(sb, wbc); ++ else ++ generic_sync_sb_inodes(sb, wbc); ++} + + /* + * Start writeback of dirty pagecache data against all unlocked inodes. +@@ -429,11 +440,8 @@ restart: + * be unmounted by the time it is released. 
+ */ + if (down_read_trylock(&sb->s_umount)) { +- if (sb->s_root) { +- spin_lock(&inode_lock); ++ if (sb->s_root) + sync_sb_inodes(sb, wbc); +- spin_unlock(&inode_lock); +- } + up_read(&sb->s_umount); + } + spin_lock(&sb_lock); +@@ -469,9 +477,7 @@ void sync_inodes_sb(struct super_block * + (inodes_stat.nr_inodes - inodes_stat.nr_unused) + + nr_dirty + nr_unstable; + wbc.nr_to_write += wbc.nr_to_write / 2; /* Bit more for luck */ +- spin_lock(&inode_lock); + sync_sb_inodes(sb, &wbc); +- spin_unlock(&inode_lock); + } + + /* +Index: linux-2.6.16/fs/reiser4/Kconfig +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/Kconfig +@@ -0,0 +1,31 @@ ++config REISER4_FS ++ tristate "Reiser4 (EXPERIMENTAL)" ++ depends on EXPERIMENTAL ++ select ZLIB_INFLATE ++ select ZLIB_DEFLATE ++ help ++ Reiser4 is a filesystem that performs all filesystem operations ++ as atomic transactions, which means that it either performs a ++ write, or it does not, and in the event of a crash it does not ++ partially perform it or corrupt it. ++ ++ It stores files in dancing trees, which are like balanced trees but ++ faster. It packs small files together so that they share blocks ++ without wasting space. This means you can use it to store really ++ small files. It also means that it saves you disk space. It avoids ++ hassling you with anachronisms like having a maximum number of ++ inodes, and wasting space if you use less than that number. ++ ++ Reiser4 is a distinct filesystem type from reiserfs (V3). ++ It's therefore not possible to use reiserfs file systems ++ with reiser4. ++ ++ To learn more about reiser4, go to http://www.namesys.com ++ ++config REISER4_DEBUG ++ bool "Enable reiser4 debug mode" ++ depends on REISER4_FS ++ help ++ Don't use this unless you are debugging reiser4. ++ ++ If unsure, say N. 
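The fs-writeback.c change above turns sync_sb_inodes() into a small dispatcher:
when a superblock provides s_op->sync_inodes() that hook is called, otherwise the
newly exported generic_sync_sb_inodes() runs -- and the latter now takes
inode_lock itself instead of relying on the caller. A sketch of how a filesystem
plugs into this, assuming the patched linux/fs.h declares the sync_inodes member
as the dispatch code implies (the examplefs_* names are invented, not part of the
patch):

    #include <linux/fs.h>
    #include <linux/writeback.h>

    /* called by sync_sb_inodes() in place of the generic inode walk */
    static void examplefs_sync_inodes(struct super_block *sb,
                                      struct writeback_control *wbc)
    {
            /* filesystem-specific work goes first, e.g. reiser4 captures
             * dirty pages into atoms before letting writeback proceed */
            generic_sync_sb_inodes(sb, wbc);  /* exported by this patch */
    }

    static struct super_operations examplefs_sops = {
            /* ... other super_operations ... */
            .sync_inodes = examplefs_sync_inodes,
    };

This hook is what lets reiser4 route writeback through its atom-based
transaction manager rather than the generic per-inode path.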
+Index: linux-2.6.16/fs/reiser4/Makefile +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/Makefile +@@ -0,0 +1,100 @@ ++# ++# reiser4/Makefile ++# ++ ++obj-$(CONFIG_REISER4_FS) += reiser4.o ++ ++reiser4-y := \ ++ debug.o \ ++ jnode.o \ ++ znode.o \ ++ key.o \ ++ pool.o \ ++ tree_mod.o \ ++ estimate.o \ ++ carry.o \ ++ carry_ops.o \ ++ lock.o \ ++ tree.o \ ++ context.o \ ++ tap.o \ ++ coord.o \ ++ block_alloc.o \ ++ txnmgr.o \ ++ kassign.o \ ++ flush.o \ ++ wander.o \ ++ eottl.o \ ++ search.o \ ++ page_cache.o \ ++ seal.o \ ++ dscale.o \ ++ flush_queue.o \ ++ ktxnmgrd.o \ ++ blocknrset.o \ ++ super.o \ ++ super_ops.o \ ++ fsdata.o \ ++ export_ops.o \ ++ oid.o \ ++ tree_walk.o \ ++ inode.o \ ++ vfs_ops.o \ ++ as_ops.o \ ++ entd.o\ ++ readahead.o \ ++ status_flags.o \ ++ init_super.o \ ++ safe_link.o \ ++ \ ++ plugin/plugin.o \ ++ plugin/plugin_set.o \ ++ plugin/node/node.o \ ++ plugin/object.o \ ++ plugin/cluster.o \ ++ plugin/inode_ops.o \ ++ plugin/inode_ops_rename.o \ ++ plugin/file_ops.o \ ++ plugin/file_ops_readdir.o \ ++ plugin/file_plugin_common.o \ ++ plugin/file/file.o \ ++ plugin/file/tail_conversion.o \ ++ plugin/file/symlink.o \ ++ plugin/file/cryptcompress.o \ ++ plugin/dir_plugin_common.o \ ++ plugin/dir/hashed_dir.o \ ++ plugin/dir/seekable_dir.o \ ++ plugin/node/node40.o \ ++ \ ++ plugin/crypto/cipher.o \ ++ plugin/crypto/digest.o \ ++ \ ++ plugin/compress/minilzo.o \ ++ plugin/compress/compress.o \ ++ plugin/compress/compress_mode.o \ ++ \ ++ plugin/item/static_stat.o \ ++ plugin/item/sde.o \ ++ plugin/item/cde.o \ ++ plugin/item/blackbox.o \ ++ plugin/item/internal.o \ ++ plugin/item/tail.o \ ++ plugin/item/ctail.o \ ++ plugin/item/extent.o \ ++ plugin/item/extent_item_ops.o \ ++ plugin/item/extent_file_ops.o \ ++ plugin/item/extent_flush_ops.o \ ++ \ ++ plugin/hash.o \ ++ plugin/fibration.o \ ++ plugin/tail_policy.o \ ++ plugin/item/item.o \ ++ \ ++ plugin/security/perm.o \ ++ plugin/space/bitmap.o \ ++ \ ++ plugin/disk_format/disk_format40.o \ ++ plugin/disk_format/disk_format.o \ ++ \ ++ plugin/regular.o ++ +Index: linux-2.6.16/fs/reiser4/README +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/README +@@ -0,0 +1,125 @@ ++[LICENSING] ++ ++Reiser4 is hereby licensed under the GNU General ++Public License version 2. ++ ++Source code files that contain the phrase "licensing governed by ++reiser4/README" are "governed files" throughout this file. Governed ++files are licensed under the GPL. The portions of them owned by Hans ++Reiser, or authorized to be licensed by him, have been in the past, ++and likely will be in the future, licensed to other parties under ++other licenses. If you add your code to governed files, and don't ++want it to be owned by Hans Reiser, put your copyright label on that ++code so the poor blight and his customers can keep things straight. ++All portions of governed files not labeled otherwise are owned by Hans ++Reiser, and by adding your code to it, widely distributing it to ++others or sending us a patch, and leaving the sentence in stating that ++licensing is governed by the statement in this file, you accept this. ++It will be a kindness if you identify whether Hans Reiser is allowed ++to license code labeled as owned by you on your behalf other than ++under the GPL, because he wants to know if it is okay to do so and put ++a check in the mail to you (for non-trivial improvements) when he ++makes his next sale. 
He makes no guarantees as to the amount if any, ++though he feels motivated to motivate contributors, and you can surely ++discuss this with him before or after contributing. You have the ++right to decline to allow him to license your code contribution other ++than under the GPL. ++ ++Further licensing options are available for commercial and/or other ++interests directly from Hans Reiser: reiser@namesys.com. If you interpret ++the GPL as not allowing those additional licensing options, you read ++it wrongly, and Richard Stallman agrees with me, when carefully read ++you can see that those restrictions on additional terms do not apply ++to the owner of the copyright, and my interpretation of this shall ++govern for this license. ++ ++[END LICENSING] ++ ++Reiser4 is a file system based on dancing tree algorithms, and is ++described at http://www.namesys.com ++ ++mkfs.reiser4 and other utilities are on our webpage or wherever your ++Linux provider put them. You really want to be running the latest ++version off the website if you use fsck. ++ ++Yes, if you update your reiser4 kernel module you do have to ++recompile your kernel, most of the time. The errors you get will be ++quite cryptic if your forget to do so. ++ ++Hideous Commercial Pitch: Spread your development costs across other OS ++vendors. Select from the best in the world, not the best in your ++building, by buying from third party OS component suppliers. Leverage ++the software component development power of the internet. Be the most ++aggressive in taking advantage of the commercial possibilities of ++decentralized internet development, and add value through your branded ++integration that you sell as an operating system. Let your competitors ++be the ones to compete against the entire internet by themselves. Be ++hip, get with the new economic trend, before your competitors do. Send ++email to reiser@namesys.com ++ ++Hans Reiser was the primary architect of Reiser4, but a whole team ++chipped their ideas in. He invested everything he had into Namesys ++for 5.5 dark years of no money before Reiser3 finally started to work well ++enough to bring in money. He owns the copyright. ++ ++DARPA was the primary sponsor of Reiser4. DARPA does not endorse ++Reiser4, it merely sponsors it. DARPA is, in solely Hans's personal ++opinion, unique in its willingness to invest into things more ++theoretical than the VC community can readily understand, and more ++longterm than allows them to be sure that they will be the ones to ++extract the economic benefits from. DARPA also integrated us into a ++security community that transformed our security worldview. ++ ++Vladimir Saveliev is our lead programmer, with us from the beginning, ++and he worked long hours writing the cleanest code. This is why he is ++now the lead programmer after years of commitment to our work. He ++always made the effort to be the best he could be, and to make his ++code the best that it could be. What resulted was quite remarkable. I ++don't think that money can ever motivate someone to work the way he ++did, he is one of the most selfless men I know. ++ ++Alexander Lyamin was our sysadmin, and helped to educate us in ++security issues. Moscow State University and IMT were very generous ++in the internet access they provided us, and in lots of other little ++ways that a generous institution can be. ++ ++Alexander Zarochentcev (sometimes known as zam, or sasha), wrote the ++locking code, the block allocator, and finished the flushing code. 
++His code is always crystal clean and well structured. ++ ++Nikita Danilov wrote the core of the balancing code, the core of the ++plugins code, and the directory code. He worked a steady pace of long ++hours that produced a whole lot of well abstracted code. He is our ++senior computer scientist. ++ ++Vladimir Demidov wrote the parser. Writing an in kernel parser is ++something very few persons have the skills for, and it is thanks to ++him that we can say that the parser is really not so big compared to ++various bits of our other code, and making a parser work in the kernel ++was not so complicated as everyone would imagine mainly because it was ++him doing it... ++ ++Joshua McDonald wrote the transaction manager, and the flush code. ++The flush code unexpectedly turned out be extremely hairy for reasons ++you can read about on our web page, and he did a great job on an ++extremely difficult task. ++ ++Nina Reiser handled our accounting, government relations, and much ++more. ++ ++Ramon Reiser developed our website. ++ ++Beverly Palmer drew our graphics. ++ ++Vitaly Fertman developed librepair, userspace plugins repair code, fsck ++and worked with Umka on developing libreiser4 and userspace plugins. ++ ++Yury Umanets (aka Umka) developed libreiser4, userspace plugins and ++userspace tools (reiser4progs). ++ ++Oleg Drokin (aka Green) is the release manager who fixes everything. ++It is so nice to have someone like that on the team. He (plus Chris ++and Jeff) make it possible for the entire rest of the Namesys team to ++focus on Reiser4, and he fixed a whole lot of Reiser4 bugs also. It ++is just amazing to watch his talent for spotting bugs in action. ++ +Index: linux-2.6.16/fs/reiser4/as_ops.c +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/as_ops.c +@@ -0,0 +1,392 @@ ++/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++/* Interface to VFS. Reiser4 address_space_operations are defined here. */ ++ ++#include "forward.h" ++#include "debug.h" ++#include "dformat.h" ++#include "coord.h" ++#include "plugin/item/item.h" ++#include "plugin/file/file.h" ++#include "plugin/security/perm.h" ++#include "plugin/disk_format/disk_format.h" ++#include "plugin/plugin.h" ++#include "plugin/plugin_set.h" ++#include "plugin/object.h" ++#include "txnmgr.h" ++#include "jnode.h" ++#include "znode.h" ++#include "block_alloc.h" ++#include "tree.h" ++#include "vfs_ops.h" ++#include "inode.h" ++#include "page_cache.h" ++#include "ktxnmgrd.h" ++#include "super.h" ++#include "reiser4.h" ++#include "entd.h" ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* address space operations */ ++ ++/** ++ * reiser4_set_page_dirty - set dirty bit, tag in page tree, dirty accounting ++ * @page: page to be dirtied ++ * ++ * Operation of struct address_space_operations. This implementation is used by ++ * unix and crc file plugins. ++ * ++ * This is called when reiser4 page gets dirtied outside of reiser4, for ++ * example, when dirty bit is moved from pte to physical page. ++ * ++ * Tags page in the mapping's page tree with special tag so that it is possible ++ * to do all the reiser4 specific work wrt dirty pages (jnode creation, ++ * capturing by an atom) later because it can not be done in the contexts where ++ * set_page_dirty is called. 
++ */
++int reiser4_set_page_dirty(struct page *page)
++{
++        /* this page can be unformatted only */
++        assert("vs-1734", (page->mapping &&
++                           page->mapping->host &&
++                           get_super_fake(page->mapping->host->i_sb) !=
++                           page->mapping->host
++                           && get_cc_fake(page->mapping->host->i_sb) !=
++                           page->mapping->host
++                           && get_bitmap_fake(page->mapping->host->i_sb) !=
++                           page->mapping->host));
++
++        if (!TestSetPageDirty(page)) {
++                struct address_space *mapping = page->mapping;
++
++                if (mapping) {
++                        write_lock_irq(&mapping->tree_lock);
++
++                        /* check for race with truncate */
++                        if (page->mapping) {
++                                assert("vs-1652", page->mapping == mapping);
++                                if (mapping_cap_account_dirty(mapping))
++                                        inc_page_state(nr_dirty);
++                                radix_tree_tag_set(&mapping->page_tree,
++                                                   page->index,
++                                                   PAGECACHE_TAG_REISER4_MOVED);
++                        }
++                        write_unlock_irq(&mapping->tree_lock);
++                        __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
++                }
++        }
++        return 0;
++}
++
++static int filler(void *vp, struct page *page)
++{
++        return page->mapping->a_ops->readpage(vp, page);
++}
++
++/**
++ * reiser4_readpages - submit read for a set of pages
++ * @file: file to read
++ * @mapping: address space
++ * @pages: list of pages to submit read for
++ * @nr_pages: number of pages on the list
++ *
++ * Operation of struct address_space_operations. This implementation is used by
++ * unix and crc file plugins.
++ *
++ * Calls read_cache_pages or readpages hook if it is set.
++ */
++int
++reiser4_readpages(struct file *file, struct address_space *mapping,
++                  struct list_head *pages, unsigned nr_pages)
++{
++        reiser4_context *ctx;
++        reiser4_file_fsdata *fsdata;
++
++        ctx = init_context(mapping->host->i_sb);
++        if (IS_ERR(ctx))
++                return PTR_ERR(ctx);
++
++        fsdata = reiser4_get_file_fsdata(file);
++        if (IS_ERR(fsdata)) {
++                reiser4_exit_context(ctx);
++                return PTR_ERR(fsdata);
++        }
++
++        if (fsdata->ra2.readpages)
++                fsdata->ra2.readpages(mapping, pages, fsdata->ra2.data);
++        else {
++                /*
++                 * filler (reiser4 readpage method) may involve tree search
++                 * which is not allowed when lock stack is not clean. If lock
++                 * stack is not clean - do nothing.
++                 */
++                if (lock_stack_isclean(get_current_lock_stack()))
++                        read_cache_pages(mapping, pages, filler, file);
++                else {
++                        while (!list_empty(pages)) {
++                                struct page *victim;
++
++                                victim = list_entry(pages->prev, struct page, lru);
++                                list_del(&victim->lru);
++                                page_cache_release(victim);
++                        }
++                }
++        }
++        reiser4_exit_context(ctx);
++        return 0;
++}
++
++/* ->invalidatepage method for reiser4 */
++
++/*
++ * this is called for each truncated page from
++ * truncate_inode_pages()->truncate_{complete,partial}_page().
++ *
++ * At the moment of call, page is under lock, and outstanding io (if any) has
++ * completed.
++ */
++
++/**
++ * reiser4_invalidatepage
++ * @page: page to invalidate
++ * @offset: starting offset for partial invalidation
++ *
++ */
++int reiser4_invalidatepage(struct page *page, unsigned long offset)
++{
++        int ret = 0;
++        reiser4_context *ctx;
++        struct inode *inode;
++        jnode *node;
++
++        /*
++         * This is called to truncate file's page.
++         *
++         * Originally, reiser4 implemented truncate in a standard way
++         * (vmtruncate() calls ->invalidatepage() on all truncated pages
++         * first, then file system ->truncate() call-back is invoked).
++         *
++         * This led to the problem when ->invalidatepage() was called on a
++         * page with jnode that was captured into atom in ASTAGE_PRE_COMMIT
++         * process. That is, truncate was bypassing transactions. To avoid
++         * this, try_capture_page_to_invalidate() call was added here.
++         *
++         * After many troubles with vmtruncate() based truncate (including
++         * races with flush, tail conversion, etc.) it was re-written in the
++         * top-to-bottom style: items are killed in cut_tree_object() and
++         * pages belonging to extent are invalidated in kill_hook_extent(). So
++         * probably now additional call to capture is not needed here.
++         */
++
++        assert("nikita-3137", PageLocked(page));
++        assert("nikita-3138", !PageWriteback(page));
++        inode = page->mapping->host;
++
++        /*
++         * ->invalidatepage() should only be called for the unformatted
++         * jnodes. Destruction of all other types of jnodes is performed
++         * separately. But, during some corner cases (like handling errors
++         * during mount) it is simpler to let ->invalidatepage to be called on
++         * them. Check for this, and do nothing.
++         */
++        if (get_super_fake(inode->i_sb) == inode)
++                return 0;
++        if (get_cc_fake(inode->i_sb) == inode)
++                return 0;
++        if (get_bitmap_fake(inode->i_sb) == inode)
++                return 0;
++        assert("vs-1426", PagePrivate(page));
++        assert("vs-1427",
++               page->mapping == jnode_get_mapping(jnode_by_page(page)));
++        assert("", jprivate(page) != NULL);
++        assert("", ergo(inode_file_plugin(inode) !=
++                        file_plugin_by_id(CRC_FILE_PLUGIN_ID), offset == 0));
++
++        ctx = init_context(inode->i_sb);
++        if (IS_ERR(ctx))
++                return PTR_ERR(ctx);
++
++        node = jprivate(page);
++        spin_lock_jnode(node);
++        if (!(node->state & ((1 << JNODE_DIRTY) | (1 << JNODE_FLUSH_QUEUED) |
++                             (1 << JNODE_WRITEBACK) | (1 << JNODE_OVRWR)))) {
++                /* there is no need to capture */
++                jref(node);
++                JF_SET(node, JNODE_HEARD_BANSHEE);
++                page_clear_jnode(page, node);
++                uncapture_jnode(node);
++                unhash_unformatted_jnode(node);
++                jput(node);
++                reiser4_exit_context(ctx);
++                return 0;
++        }
++        spin_unlock_jnode(node);
++
++        /* capture page being truncated. */
++        ret = try_capture_page_to_invalidate(page);
++        if (ret != 0)
++                warning("nikita-3141", "Cannot capture: %i", ret);
++
++        if (offset == 0) {
++                /* remove jnode from transaction and detach it from page. */
++                jref(node);
++                JF_SET(node, JNODE_HEARD_BANSHEE);
++                /* page cannot be detached from jnode concurrently, because it
++                 * is locked */
++                uncapture_page(page);
++
++                /* this detaches page from jnode, so that jdelete will not try
++                 * to lock page which is already locked */
++                spin_lock_jnode(node);
++                page_clear_jnode(page, node);
++                spin_unlock_jnode(node);
++                unhash_unformatted_jnode(node);
++
++                jput(node);
++        }
++
++        reiser4_exit_context(ctx);
++        return 0;
++}
++
++/* help function called from reiser4_releasepage(). It returns true if jnode
++ * can be detached from its page and page released. */
++int jnode_is_releasable(jnode * node /* node to check */ )
++{
++        assert("nikita-2781", node != NULL);
++        assert_spin_locked(&(node->guard));
++        assert_spin_locked(&(node->load));
++
++        /* if some thread is currently using the jnode page, the latter cannot
++         * be detached */
++        if (atomic_read(&node->d_count) != 0) {
++                return 0;
++        }
++
++        assert("vs-1214", !jnode_is_loaded(node));
++
++        /*
++         * can only release page if real block number is assigned to it. Simple
++         * check for ->atom wouldn't do, because it is possible for node to be
++         * clean, not in atom yet, and still having fake block number. For
++         * example, node just created in jinit_new().
++ */ ++ if (blocknr_is_fake(jnode_get_block(node))) ++ return 0; ++ ++ /* ++ * pages prepared for write can not be released anyway, so avoid ++ * detaching jnode from the page ++ */ ++ if (JF_ISSET(node, JNODE_WRITE_PREPARED)) ++ return 0; ++ ++ /* ++ * dirty jnode cannot be released. It can however be submitted to disk ++ * as part of early flushing, but only after getting flush-prepped. ++ */ ++ if (JF_ISSET(node, JNODE_DIRTY)) ++ return 0; ++ ++ /* overwrite set is only written by log writer. */ ++ if (JF_ISSET(node, JNODE_OVRWR)) ++ return 0; ++ ++ /* jnode is already under writeback */ ++ if (JF_ISSET(node, JNODE_WRITEBACK)) ++ return 0; ++ ++ /* don't flush bitmaps or journal records */ ++ if (!jnode_is_znode(node) && !jnode_is_unformatted(node)) ++ return 0; ++ ++ return 1; ++} ++ ++/* ++ * ->releasepage method for reiser4 ++ * ++ * This is called by VM scanner when it comes across clean page. What we have ++ * to do here is to check whether page can really be released (freed that is) ++ * and if so, detach jnode from it and remove page from the page cache. ++ * ++ * Check for releasability is done by releasable() function. ++ */ ++int reiser4_releasepage(struct page *page, gfp_t gfp UNUSED_ARG) ++{ ++ jnode *node; ++ ++ assert("nikita-2257", PagePrivate(page)); ++ assert("nikita-2259", PageLocked(page)); ++ assert("nikita-2892", !PageWriteback(page)); ++ assert("nikita-3019", schedulable()); ++ ++ /* NOTE-NIKITA: this can be called in the context of reiser4 call. It ++ is not clear what to do in this case. A lot of deadlocks seems be ++ possible. */ ++ ++ node = jnode_by_page(page); ++ assert("nikita-2258", node != NULL); ++ assert("reiser4-4", page->mapping != NULL); ++ assert("reiser4-5", page->mapping->host != NULL); ++ ++ if (PageDirty(page)) ++ return 0; ++ ++ if (page_count(page) > 3) ++ return 0; ++ ++ /* releasable() needs jnode lock, because it looks at the jnode fields ++ * and we need jload_lock here to avoid races with jload(). */ ++ spin_lock_jnode(node); ++ spin_lock(&(node->load)); ++ if (jnode_is_releasable(node)) { ++ struct address_space *mapping; ++ ++ mapping = page->mapping; ++ jref(node); ++ /* there is no need to synchronize against ++ * jnode_extent_write() here, because pages seen by ++ * jnode_extent_write() are !releasable(). */ ++ page_clear_jnode(page, node); ++ spin_unlock(&(node->load)); ++ spin_unlock_jnode(node); ++ ++ /* we are under memory pressure so release jnode also. */ ++ jput(node); ++ ++ return 1; ++ } else { ++ spin_unlock(&(node->load)); ++ spin_unlock_jnode(node); ++ assert("nikita-3020", schedulable()); ++ return 0; ++ } ++} ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +Index: linux-2.6.16/fs/reiser4/block_alloc.c +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/block_alloc.c +@@ -0,0 +1,1139 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++#include "debug.h" ++#include "dformat.h" ++#include "plugin/plugin.h" ++#include "txnmgr.h" ++#include "znode.h" ++#include "block_alloc.h" ++#include "tree.h" ++#include "super.h" ++ ++#include /* for __u?? */ ++#include /* for struct super_block */ ++#include ++ ++/* THE REISER4 DISK SPACE RESERVATION SCHEME. 
*/ ++ ++/* We need to be able to reserve enough disk space to ensure that an atomic ++ operation will have enough disk space to flush (see flush.c and ++ http://namesys.com/v4/v4.html) and commit it once it is started. ++ ++ In our design a call for reserving disk space may fail but not an actual ++ block allocation. ++ ++ All free blocks, already allocated blocks, and all kinds of reserved blocks ++ are counted in different per-fs block counters. ++ ++ A reiser4 super block's set of block counters currently is: ++ ++ free -- free blocks, ++ used -- already allocated blocks, ++ ++ grabbed -- initially reserved for performing an fs operation, those blocks ++ are taken from free blocks, then grabbed disk space leaks from grabbed ++ blocks counter to other counters like "fake allocated", "flush ++ reserved", "used", the rest of not used grabbed space is returned to ++ free space at the end of fs operation; ++ ++ fake allocated -- counts all nodes without real disk block numbers assigned, ++ we have separate accounting for formatted and unformatted ++ nodes (for easier debugging); ++ ++ flush reserved -- disk space needed for flushing and committing an atom. ++ Each dirty already allocated block could be written as a ++ part of atom's overwrite set or as a part of atom's ++ relocate set. In both case one additional block is needed, ++ it is used as a wandered block if we do overwrite or as a ++ new location for a relocated block. ++ ++ In addition, blocks in some states are counted on per-thread and per-atom ++ basis. A reiser4 context has a counter of blocks grabbed by this transaction ++ and the sb's grabbed blocks counter is a sum of grabbed blocks counter values ++ of each reiser4 context. Each reiser4 atom has a counter of "flush reserved" ++ blocks, which are reserved for flush processing and atom commit. */ ++ ++/* AN EXAMPLE: suppose we insert new item to the reiser4 tree. We estimate ++ number of blocks to grab for most expensive case of balancing when the leaf ++ node we insert new item to gets split and new leaf node is allocated. ++ ++ So, we need to grab blocks for ++ ++ 1) one block for possible dirtying the node we insert an item to. That block ++ would be used for node relocation at flush time or for allocating of a ++ wandered one, it depends what will be a result (what set, relocate or ++ overwrite the node gets assigned to) of the node processing by the flush ++ algorithm. ++ ++ 2) one block for either allocating a new node, or dirtying of right or left ++ clean neighbor, only one case may happen. ++ ++ VS-FIXME-HANS: why can only one case happen? I would expect to see dirtying of left neighbor, right neighbor, current ++ node, and creation of new node. have I forgotten something? email me. ++ ++ These grabbed blocks are counted in both reiser4 context "grabbed blocks" ++ counter and in the fs-wide one (both ctx->grabbed_blocks and ++ sbinfo->blocks_grabbed get incremented by 2), sb's free blocks counter is ++ decremented by 2. ++ ++ Suppose both two blocks were spent for dirtying of an already allocated clean ++ node (one block went from "grabbed" to "flush reserved") and for new block ++ allocating (one block went from "grabbed" to "fake allocated formatted"). ++ ++ Inserting of a child pointer to the parent node caused parent node to be ++ split, the balancing code takes care about this grabbing necessary space ++ immediately by calling reiser4_grab with BA_RESERVED flag set which means ++ "can use the 5% reserved disk space". 
++ ++ At this moment insertion completes and grabbed blocks (if they were not used) ++ should be returned to the free space counter. ++ ++ However the atom life-cycle is not completed. The atom had one "flush ++ reserved" block added by our insertion and the new fake allocated node is ++ counted as a "fake allocated formatted" one. The atom has to be fully ++ processed by flush before commit. Suppose that the flush moved the first, ++ already allocated node to the atom's overwrite list, the new fake allocated ++ node, obviously, went into the atom relocate set. The reiser4 flush ++ allocates the new node using one unit from "fake allocated formatted" ++ counter, the log writer uses one from "flush reserved" for wandered block ++ allocation. ++ ++ And, it is not the end. When the wandered block is deallocated after the ++ atom gets fully played (see wander.c for term description), the disk space ++ occupied for it is returned to free blocks. */ ++ ++/* BLOCK NUMBERS */ ++ ++/* Any reiser4 node has a block number assigned to it. We use these numbers for ++ indexing in hash tables, so if a block has not yet been assigned a location ++ on disk we need to give it a temporary fake block number. ++ ++ Current implementation of reiser4 uses 64-bit integers for block numbers. We ++ use highest bit in 64-bit block number to distinguish fake and real block ++ numbers. So, only 63 bits may be used to addressing of real device ++ blocks. That "fake" block numbers space is divided into subspaces of fake ++ block numbers for data blocks and for shadow (working) bitmap blocks. ++ ++ Fake block numbers for data blocks are generated by a cyclic counter, which ++ gets incremented after each real block allocation. We assume that it is ++ impossible to overload this counter during one transaction life. */ ++ ++/* Initialize a blocknr hint. */ ++void blocknr_hint_init(reiser4_blocknr_hint * hint) ++{ ++ memset(hint, 0, sizeof(reiser4_blocknr_hint)); ++} ++ ++/* Release any resources of a blocknr hint. */ ++void blocknr_hint_done(reiser4_blocknr_hint * hint UNUSED_ARG) ++{ ++ /* No resources should be freed in current blocknr_hint implementation. */ ++} ++ ++/* see above for explanation of fake block number. */ ++/* Audited by: green(2002.06.11) */ ++int blocknr_is_fake(const reiser4_block_nr * da) ++{ ++ /* The reason for not simply returning result of '&' operation is that ++ while return value is (possibly 32bit) int, the reiser4_block_nr is ++ at least 64 bits long, and high bit (which is the only possible ++ non zero bit after the masking) would be stripped off */ ++ return (*da & REISER4_FAKE_BLOCKNR_BIT_MASK) ? 1 : 0; ++} ++ ++/* Static functions for / block counters ++ arithmetic. Mostly, they are isolated to not to code same assertions in ++ several places. */ ++static void sub_from_ctx_grabbed(reiser4_context * ctx, __u64 count) ++{ ++ BUG_ON(ctx->grabbed_blocks < count); ++ assert("zam-527", ctx->grabbed_blocks >= count); ++ ctx->grabbed_blocks -= count; ++} ++ ++static void add_to_ctx_grabbed(reiser4_context * ctx, __u64 count) ++{ ++ ctx->grabbed_blocks += count; ++} ++ ++static void sub_from_sb_grabbed(reiser4_super_info_data * sbinfo, __u64 count) ++{ ++ assert("zam-525", sbinfo->blocks_grabbed >= count); ++ sbinfo->blocks_grabbed -= count; ++} ++ ++/* Decrease the counter of block reserved for flush in super block. 
*/ ++static void ++sub_from_sb_flush_reserved(reiser4_super_info_data * sbinfo, __u64 count) ++{ ++ assert("vpf-291", sbinfo->blocks_flush_reserved >= count); ++ sbinfo->blocks_flush_reserved -= count; ++} ++ ++static void ++sub_from_sb_fake_allocated(reiser4_super_info_data * sbinfo, __u64 count, ++ reiser4_ba_flags_t flags) ++{ ++ if (flags & BA_FORMATTED) { ++ assert("zam-806", sbinfo->blocks_fake_allocated >= count); ++ sbinfo->blocks_fake_allocated -= count; ++ } else { ++ assert("zam-528", ++ sbinfo->blocks_fake_allocated_unformatted >= count); ++ sbinfo->blocks_fake_allocated_unformatted -= count; ++ } ++} ++ ++static void sub_from_sb_used(reiser4_super_info_data * sbinfo, __u64 count) ++{ ++ assert("zam-530", ++ sbinfo->blocks_used >= count + sbinfo->min_blocks_used); ++ sbinfo->blocks_used -= count; ++} ++ ++static void ++sub_from_cluster_reserved(reiser4_super_info_data * sbinfo, __u64 count) ++{ ++ assert("edward-501", sbinfo->blocks_clustered >= count); ++ sbinfo->blocks_clustered -= count; ++} ++ ++/* Increase the counter of block reserved for flush in atom. */ ++static void add_to_atom_flush_reserved_nolock(txn_atom * atom, __u32 count) ++{ ++ assert("zam-772", atom != NULL); ++ assert_spin_locked(&(atom->alock)); ++ atom->flush_reserved += count; ++} ++ ++/* Decrease the counter of block reserved for flush in atom. */ ++static void sub_from_atom_flush_reserved_nolock(txn_atom * atom, __u32 count) ++{ ++ assert("zam-774", atom != NULL); ++ assert_spin_locked(&(atom->alock)); ++ assert("nikita-2790", atom->flush_reserved >= count); ++ atom->flush_reserved -= count; ++} ++ ++/* super block has 6 counters: free, used, grabbed, fake allocated ++ (formatted and unformatted) and flush reserved. Their sum must be ++ number of blocks on a device. This function checks this */ ++int check_block_counters(const struct super_block *super) ++{ ++ __u64 sum; ++ ++ sum = reiser4_grabbed_blocks(super) + reiser4_free_blocks(super) + ++ reiser4_data_blocks(super) + reiser4_fake_allocated(super) + ++ reiser4_fake_allocated_unformatted(super) + flush_reserved(super) + ++ reiser4_clustered_blocks(super); ++ if (reiser4_block_count(super) != sum) { ++ printk("super block counters: " ++ "used %llu, free %llu, " ++ "grabbed %llu, fake allocated (formatetd %llu, unformatted %llu), " ++ "reserved %llu, clustered %llu, sum %llu, must be (block count) %llu\n", ++ (unsigned long long)reiser4_data_blocks(super), ++ (unsigned long long)reiser4_free_blocks(super), ++ (unsigned long long)reiser4_grabbed_blocks(super), ++ (unsigned long long)reiser4_fake_allocated(super), ++ (unsigned long long) ++ reiser4_fake_allocated_unformatted(super), ++ (unsigned long long)flush_reserved(super), ++ (unsigned long long)reiser4_clustered_blocks(super), ++ (unsigned long long)sum, ++ (unsigned long long)reiser4_block_count(super)); ++ return 0; ++ } ++ return 1; ++} ++ ++/* Adjust "working" free blocks counter for number of blocks we are going to ++ allocate. Record number of grabbed blocks in fs-wide and per-thread ++ counters. This function should be called before bitmap scanning or ++ allocating fake block numbers ++ ++ @super -- pointer to reiser4 super block; ++ @count -- number of blocks we reserve; ++ ++ @return -- 0 if success, -ENOSPC, if all ++ free blocks are preserved or already allocated. 
++*/ ++ ++static int ++reiser4_grab(reiser4_context * ctx, __u64 count, reiser4_ba_flags_t flags) ++{ ++ __u64 free_blocks; ++ int ret = 0, use_reserved = flags & BA_RESERVED; ++ reiser4_super_info_data *sbinfo; ++ ++ assert("vs-1276", ctx == get_current_context()); ++ ++ /* Do not grab anything on ro-mounted fs. */ ++ if (rofs_super(ctx->super)) { ++ ctx->grab_enabled = 0; ++ return 0; ++ } ++ ++ sbinfo = get_super_private(ctx->super); ++ ++ spin_lock_reiser4_super(sbinfo); ++ ++ free_blocks = sbinfo->blocks_free; ++ ++ if ((use_reserved && free_blocks < count) || ++ (!use_reserved && free_blocks < count + sbinfo->blocks_reserved)) { ++ ret = RETERR(-ENOSPC); ++ goto unlock_and_ret; ++ } ++ ++ add_to_ctx_grabbed(ctx, count); ++ ++ sbinfo->blocks_grabbed += count; ++ sbinfo->blocks_free -= count; ++ ++#if REISER4_DEBUG ++ if (ctx->grabbed_initially == 0) ++ ctx->grabbed_initially = count; ++#endif ++ ++ assert("nikita-2986", check_block_counters(ctx->super)); ++ ++ /* disable grab space in current context */ ++ ctx->grab_enabled = 0; ++ ++ unlock_and_ret: ++ spin_unlock_reiser4_super(sbinfo); ++ ++ return ret; ++} ++ ++int reiser4_grab_space(__u64 count, reiser4_ba_flags_t flags) ++{ ++ int ret; ++ reiser4_context *ctx; ++ ++ assert("nikita-2964", ergo(flags & BA_CAN_COMMIT, ++ lock_stack_isclean(get_current_lock_stack ++ ()))); ++ ctx = get_current_context(); ++ if (!(flags & BA_FORCE) && !is_grab_enabled(ctx)) { ++ return 0; ++ } ++ ++ ret = reiser4_grab(ctx, count, flags); ++ if (ret == -ENOSPC) { ++ ++ /* Trying to commit the all transactions if BA_CAN_COMMIT flag present */ ++ if (flags & BA_CAN_COMMIT) { ++ txnmgr_force_commit_all(ctx->super, 0); ++ ctx->grab_enabled = 1; ++ ret = reiser4_grab(ctx, count, flags); ++ } ++ } ++ /* ++ * allocation from reserved pool cannot fail. This is severe error. ++ */ ++ assert("nikita-3005", ergo(flags & BA_RESERVED, ret == 0)); ++ return ret; ++} ++ ++/* ++ * SPACE RESERVED FOR UNLINK/TRUNCATE ++ * ++ * Unlink and truncate require space in transaction (to update stat data, at ++ * least). But we don't want rm(1) to fail with "No space on device" error. ++ * ++ * Solution is to reserve 5% of disk space for truncates and ++ * unlinks. Specifically, normal space grabbing requests don't grab space from ++ * reserved area. Only requests with BA_RESERVED bit in flags are allowed to ++ * drain it. Per super block delete_sema semaphore is used to allow only one ++ * thread at a time to grab from reserved area. ++ * ++ * Grabbing from reserved area should always be performed with BA_CAN_COMMIT ++ * flag. ++ * ++ */ ++ ++int reiser4_grab_reserved(struct super_block *super, ++ __u64 count, reiser4_ba_flags_t flags) ++{ ++ reiser4_super_info_data *sbinfo = get_super_private(super); ++ ++ assert("nikita-3175", flags & BA_CAN_COMMIT); ++ ++ /* Check the delete semaphore already taken by us, we assume that ++ * reading of machine word is atomic. 
*/ ++ if (sbinfo->delete_sema_owner == current) { ++ if (reiser4_grab_space ++ (count, (flags | BA_RESERVED) & ~BA_CAN_COMMIT)) { ++ warning("zam-1003", ++ "nested call of grab_reserved fails count=(%llu)", ++ (unsigned long long)count); ++ reiser4_release_reserved(super); ++ return RETERR(-ENOSPC); ++ } ++ return 0; ++ } ++ ++ if (reiser4_grab_space(count, flags)) { ++ down(&sbinfo->delete_sema); ++ assert("nikita-2929", sbinfo->delete_sema_owner == NULL); ++ sbinfo->delete_sema_owner = current; ++ ++ if (reiser4_grab_space(count, flags | BA_RESERVED)) { ++ warning("zam-833", ++ "reserved space is not enough (%llu)", ++ (unsigned long long)count); ++ reiser4_release_reserved(super); ++ return RETERR(-ENOSPC); ++ } ++ } ++ return 0; ++} ++ ++void reiser4_release_reserved(struct super_block *super) ++{ ++ reiser4_super_info_data *info; ++ ++ info = get_super_private(super); ++ if (info->delete_sema_owner == current) { ++ info->delete_sema_owner = NULL; ++ up(&info->delete_sema); ++ } ++} ++ ++static reiser4_super_info_data *grabbed2fake_allocated_head(int count) ++{ ++ reiser4_context *ctx; ++ reiser4_super_info_data *sbinfo; ++ ++ ctx = get_current_context(); ++ sub_from_ctx_grabbed(ctx, count); ++ ++ sbinfo = get_super_private(ctx->super); ++ spin_lock_reiser4_super(sbinfo); ++ ++ sub_from_sb_grabbed(sbinfo, count); ++ /* return sbinfo locked */ ++ return sbinfo; ++} ++ ++/* is called after @count fake block numbers are allocated and pointer to ++ those blocks are inserted into tree. */ ++static void grabbed2fake_allocated_formatted(void) ++{ ++ reiser4_super_info_data *sbinfo; ++ ++ sbinfo = grabbed2fake_allocated_head(1); ++ sbinfo->blocks_fake_allocated++; ++ ++ assert("vs-922", check_block_counters(reiser4_get_current_sb())); ++ ++ spin_unlock_reiser4_super(sbinfo); ++} ++ ++/** ++ * grabbed2fake_allocated_unformatted ++ * @count: ++ * ++ */ ++static void grabbed2fake_allocated_unformatted(int count) ++{ ++ reiser4_super_info_data *sbinfo; ++ ++ sbinfo = grabbed2fake_allocated_head(count); ++ sbinfo->blocks_fake_allocated_unformatted += count; ++ ++ assert("vs-9221", check_block_counters(reiser4_get_current_sb())); ++ ++ spin_unlock_reiser4_super(sbinfo); ++} ++ ++void grabbed2cluster_reserved(int count) ++{ ++ reiser4_context *ctx; ++ reiser4_super_info_data *sbinfo; ++ ++ ctx = get_current_context(); ++ sub_from_ctx_grabbed(ctx, count); ++ ++ sbinfo = get_super_private(ctx->super); ++ spin_lock_reiser4_super(sbinfo); ++ ++ sub_from_sb_grabbed(sbinfo, count); ++ sbinfo->blocks_clustered += count; ++ ++ assert("edward-504", check_block_counters(ctx->super)); ++ ++ spin_unlock_reiser4_super(sbinfo); ++} ++ ++void cluster_reserved2grabbed(int count) ++{ ++ reiser4_context *ctx; ++ reiser4_super_info_data *sbinfo; ++ ++ ctx = get_current_context(); ++ ++ sbinfo = get_super_private(ctx->super); ++ spin_lock_reiser4_super(sbinfo); ++ ++ sub_from_cluster_reserved(sbinfo, count); ++ sbinfo->blocks_grabbed += count; ++ ++ assert("edward-505", check_block_counters(ctx->super)); ++ ++ spin_unlock_reiser4_super(sbinfo); ++ add_to_ctx_grabbed(ctx, count); ++} ++ ++void cluster_reserved2free(int count) ++{ ++ reiser4_context *ctx; ++ reiser4_super_info_data *sbinfo; ++ ++ assert("edward-503", get_current_context()->grabbed_blocks == 0); ++ ++ ctx = get_current_context(); ++ sbinfo = get_super_private(ctx->super); ++ spin_lock_reiser4_super(sbinfo); ++ ++ sub_from_cluster_reserved(sbinfo, count); ++ sbinfo->blocks_free += count; ++ ++ assert("edward-502", check_block_counters(ctx->super)); ++ ++ 
spin_unlock_reiser4_super(sbinfo); ++} ++ ++static DEFINE_SPINLOCK(fake_lock); ++static reiser4_block_nr fake_gen = 0; ++ ++/** ++ * assign_fake_blocknr ++ * @blocknr: ++ * @count: ++ * ++ * Obtain a fake block number for new node which will be used to refer to ++ * this newly allocated node until real allocation is done. ++ */ ++static void assign_fake_blocknr(reiser4_block_nr *blocknr, int count) ++{ ++ spin_lock(&fake_lock); ++ *blocknr = fake_gen; ++ fake_gen += count; ++ spin_unlock(&fake_lock); ++ ++ BUG_ON(*blocknr & REISER4_BLOCKNR_STATUS_BIT_MASK); ++ /**blocknr &= ~REISER4_BLOCKNR_STATUS_BIT_MASK;*/ ++ *blocknr |= REISER4_UNALLOCATED_STATUS_VALUE; ++ assert("zam-394", zlook(current_tree, blocknr) == NULL); ++} ++ ++int assign_fake_blocknr_formatted(reiser4_block_nr * blocknr) ++{ ++ assign_fake_blocknr(blocknr, 1); ++ grabbed2fake_allocated_formatted(); ++ return 0; ++} ++ ++/** ++ * fake_blocknrs_unformatted ++ * @count: number of fake numbers to get ++ * ++ * Allocates @count fake block numbers which will be assigned to jnodes ++ */ ++reiser4_block_nr fake_blocknr_unformatted(int count) ++{ ++ reiser4_block_nr blocknr; ++ ++ assign_fake_blocknr(&blocknr, count); ++ grabbed2fake_allocated_unformatted(count); ++ ++ return blocknr; ++} ++ ++/* adjust sb block counters, if real (on-disk) block allocation immediately ++ follows grabbing of free disk space. */ ++void grabbed2used(reiser4_context *ctx, reiser4_super_info_data *sbinfo, ++ __u64 count) ++{ ++ sub_from_ctx_grabbed(ctx, count); ++ ++ spin_lock_reiser4_super(sbinfo); ++ ++ sub_from_sb_grabbed(sbinfo, count); ++ sbinfo->blocks_used += count; ++ ++ assert("nikita-2679", check_block_counters(ctx->super)); ++ ++ spin_unlock_reiser4_super(sbinfo); ++} ++ ++/* adjust sb block counters when @count unallocated blocks get mapped to disk */ ++void fake_allocated2used(reiser4_super_info_data *sbinfo, __u64 count, ++ reiser4_ba_flags_t flags) ++{ ++ spin_lock_reiser4_super(sbinfo); ++ ++ sub_from_sb_fake_allocated(sbinfo, count, flags); ++ sbinfo->blocks_used += count; ++ ++ assert("nikita-2680", check_block_counters(reiser4_get_current_sb())); ++ ++ spin_unlock_reiser4_super(sbinfo); ++} ++ ++void flush_reserved2used(txn_atom * atom, __u64 count) ++{ ++ reiser4_super_info_data *sbinfo; ++ ++ assert("zam-787", atom != NULL); ++ assert_spin_locked(&(atom->alock)); ++ ++ sub_from_atom_flush_reserved_nolock(atom, (__u32) count); ++ ++ sbinfo = get_current_super_private(); ++ spin_lock_reiser4_super(sbinfo); ++ ++ sub_from_sb_flush_reserved(sbinfo, count); ++ sbinfo->blocks_used += count; ++ ++ assert("zam-789", check_block_counters(reiser4_get_current_sb())); ++ ++ spin_unlock_reiser4_super(sbinfo); ++} ++ ++/* update the per fs blocknr hint default value. */ ++void ++update_blocknr_hint_default(const struct super_block *s, ++ const reiser4_block_nr * block) ++{ ++ reiser4_super_info_data *sbinfo = get_super_private(s); ++ ++ assert("nikita-3342", !blocknr_is_fake(block)); ++ ++ spin_lock_reiser4_super(sbinfo); ++ if (*block < sbinfo->block_count) { ++ sbinfo->blocknr_hint_default = *block; ++ } else { ++ warning("zam-676", ++ "block number %llu is too large to be used in a blocknr hint\n", ++ (unsigned long long)*block); ++ dump_stack(); ++ DEBUGON(1); ++ } ++ spin_unlock_reiser4_super(sbinfo); ++} ++ ++/* get current value of the default blocknr hint. 
*/ ++void get_blocknr_hint_default(reiser4_block_nr * result) ++{ ++ reiser4_super_info_data *sbinfo = get_current_super_private(); ++ ++ spin_lock_reiser4_super(sbinfo); ++ *result = sbinfo->blocknr_hint_default; ++ assert("zam-677", *result < sbinfo->block_count); ++ spin_unlock_reiser4_super(sbinfo); ++} ++ ++/* Allocate "real" disk blocks by calling a proper space allocation plugin ++ * method. Blocks are allocated in one contiguous disk region. The plugin ++ * independent part accounts blocks by subtracting allocated amount from grabbed ++ * or fake block counter and add the same amount to the counter of allocated ++ * blocks. ++ * ++ * @hint -- a reiser4 blocknr hint object which contains further block ++ * allocation hints and parameters (search start, a stage of block ++ * which will be mapped to disk, etc.), ++ * @blk -- an out parameter for the beginning of the allocated region, ++ * @len -- in/out parameter, it should contain the maximum number of allocated ++ * blocks, after block allocation completes, it contains the length of ++ * allocated disk region. ++ * @flags -- see reiser4_ba_flags_t description. ++ * ++ * @return -- 0 if success, error code otherwise. ++ */ ++int ++reiser4_alloc_blocks(reiser4_blocknr_hint * hint, reiser4_block_nr * blk, ++ reiser4_block_nr * len, reiser4_ba_flags_t flags) ++{ ++ __u64 needed = *len; ++ reiser4_context *ctx; ++ reiser4_super_info_data *sbinfo; ++ int ret; ++ ++ assert("zam-986", hint != NULL); ++ ++ ctx = get_current_context(); ++ sbinfo = get_super_private(ctx->super); ++ ++ /* For write-optimized data we use default search start value, which is ++ * close to last write location. */ ++ if (flags & BA_USE_DEFAULT_SEARCH_START) { ++ get_blocknr_hint_default(&hint->blk); ++ } ++ ++ /* VITALY: allocator should grab this for internal/tx-lists/similar only. */ ++/* VS-FIXME-HANS: why is this comment above addressed to vitaly (from vitaly)? 
*/ ++ if (hint->block_stage == BLOCK_NOT_COUNTED) { ++ ret = reiser4_grab_space_force(*len, flags); ++ if (ret != 0) ++ return ret; ++ } ++ ++ ret = ++ sa_alloc_blocks(get_space_allocator(ctx->super), hint, (int)needed, ++ blk, len); ++ ++ if (!ret) { ++ assert("zam-680", *blk < reiser4_block_count(ctx->super)); ++ assert("zam-681", ++ *blk + *len <= reiser4_block_count(ctx->super)); ++ ++ if (flags & BA_PERMANENT) { ++ /* we assume that current atom exists at this moment */ ++ txn_atom *atom = get_current_atom_locked(); ++ atom->nr_blocks_allocated += *len; ++ spin_unlock_atom(atom); ++ } ++ ++ switch (hint->block_stage) { ++ case BLOCK_NOT_COUNTED: ++ case BLOCK_GRABBED: ++ grabbed2used(ctx, sbinfo, *len); ++ break; ++ case BLOCK_UNALLOCATED: ++ fake_allocated2used(sbinfo, *len, flags); ++ break; ++ case BLOCK_FLUSH_RESERVED: ++ { ++ txn_atom *atom = get_current_atom_locked(); ++ flush_reserved2used(atom, *len); ++ spin_unlock_atom(atom); ++ } ++ break; ++ default: ++ impossible("zam-531", "wrong block stage"); ++ } ++ } else { ++ assert("zam-821", ++ ergo(hint->max_dist == 0 ++ && !hint->backward, ret != -ENOSPC)); ++ if (hint->block_stage == BLOCK_NOT_COUNTED) ++ grabbed2free(ctx, sbinfo, needed); ++ } ++ ++ return ret; ++} ++ ++/* used -> fake_allocated -> grabbed -> free */ ++ ++/* adjust sb block counters when @count unallocated blocks get unmapped from ++ disk */ ++static void ++used2fake_allocated(reiser4_super_info_data * sbinfo, __u64 count, ++ int formatted) ++{ ++ spin_lock_reiser4_super(sbinfo); ++ ++ if (formatted) ++ sbinfo->blocks_fake_allocated += count; ++ else ++ sbinfo->blocks_fake_allocated_unformatted += count; ++ ++ sub_from_sb_used(sbinfo, count); ++ ++ assert("nikita-2681", check_block_counters(reiser4_get_current_sb())); ++ ++ spin_unlock_reiser4_super(sbinfo); ++} ++ ++static void ++used2flush_reserved(reiser4_super_info_data * sbinfo, txn_atom * atom, ++ __u64 count, reiser4_ba_flags_t flags UNUSED_ARG) ++{ ++ assert("nikita-2791", atom != NULL); ++ assert_spin_locked(&(atom->alock)); ++ ++ add_to_atom_flush_reserved_nolock(atom, (__u32) count); ++ ++ spin_lock_reiser4_super(sbinfo); ++ ++ sbinfo->blocks_flush_reserved += count; ++ /*add_to_sb_flush_reserved(sbinfo, count); */ ++ sub_from_sb_used(sbinfo, count); ++ ++ assert("nikita-2681", check_block_counters(reiser4_get_current_sb())); ++ ++ spin_unlock_reiser4_super(sbinfo); ++} ++ ++/* disk space, virtually used by fake block numbers is counted as "grabbed" again. 
*/ ++static void ++fake_allocated2grabbed(reiser4_context * ctx, reiser4_super_info_data * sbinfo, ++ __u64 count, reiser4_ba_flags_t flags) ++{ ++ add_to_ctx_grabbed(ctx, count); ++ ++ spin_lock_reiser4_super(sbinfo); ++ ++ assert("nikita-2682", check_block_counters(ctx->super)); ++ ++ sbinfo->blocks_grabbed += count; ++ sub_from_sb_fake_allocated(sbinfo, count, flags & BA_FORMATTED); ++ ++ assert("nikita-2683", check_block_counters(ctx->super)); ++ ++ spin_unlock_reiser4_super(sbinfo); ++} ++ ++void fake_allocated2free(__u64 count, reiser4_ba_flags_t flags) ++{ ++ reiser4_context *ctx; ++ reiser4_super_info_data *sbinfo; ++ ++ ctx = get_current_context(); ++ sbinfo = get_super_private(ctx->super); ++ ++ fake_allocated2grabbed(ctx, sbinfo, count, flags); ++ grabbed2free(ctx, sbinfo, count); ++} ++ ++void grabbed2free_mark(__u64 mark) ++{ ++ reiser4_context *ctx; ++ reiser4_super_info_data *sbinfo; ++ ++ ctx = get_current_context(); ++ sbinfo = get_super_private(ctx->super); ++ ++ assert("nikita-3007", (__s64) mark >= 0); ++ assert("nikita-3006", ctx->grabbed_blocks >= mark); ++ grabbed2free(ctx, sbinfo, ctx->grabbed_blocks - mark); ++} ++ ++/** ++ * grabbed2free - adjust grabbed and free block counters ++ * @ctx: context to update grabbed block counter of ++ * @sbinfo: super block to update grabbed and free block counters of ++ * @count: number of blocks to adjust counters by ++ * ++ * Decreases context's and per filesystem's counters of grabbed ++ * blocks. Increases per filesystem's counter of free blocks. ++ */ ++void grabbed2free(reiser4_context *ctx, reiser4_super_info_data *sbinfo, ++ __u64 count) ++{ ++ sub_from_ctx_grabbed(ctx, count); ++ ++ spin_lock_reiser4_super(sbinfo); ++ ++ sub_from_sb_grabbed(sbinfo, count); ++ sbinfo->blocks_free += count; ++ assert("nikita-2684", check_block_counters(ctx->super)); ++ ++ spin_unlock_reiser4_super(sbinfo); ++} ++ ++void grabbed2flush_reserved_nolock(txn_atom * atom, __u64 count) ++{ ++ reiser4_context *ctx; ++ reiser4_super_info_data *sbinfo; ++ ++ assert("vs-1095", atom); ++ ++ ctx = get_current_context(); ++ sbinfo = get_super_private(ctx->super); ++ ++ sub_from_ctx_grabbed(ctx, count); ++ ++ add_to_atom_flush_reserved_nolock(atom, count); ++ ++ spin_lock_reiser4_super(sbinfo); ++ ++ sbinfo->blocks_flush_reserved += count; ++ sub_from_sb_grabbed(sbinfo, count); ++ ++ assert("vpf-292", check_block_counters(ctx->super)); ++ ++ spin_unlock_reiser4_super(sbinfo); ++} ++ ++void grabbed2flush_reserved(__u64 count) ++{ ++ txn_atom *atom = get_current_atom_locked(); ++ ++ grabbed2flush_reserved_nolock(atom, count); ++ ++ spin_unlock_atom(atom); ++} ++ ++void flush_reserved2grabbed(txn_atom * atom, __u64 count) ++{ ++ reiser4_context *ctx; ++ reiser4_super_info_data *sbinfo; ++ ++ assert("nikita-2788", atom != NULL); ++ assert_spin_locked(&(atom->alock)); ++ ++ ctx = get_current_context(); ++ sbinfo = get_super_private(ctx->super); ++ ++ add_to_ctx_grabbed(ctx, count); ++ ++ sub_from_atom_flush_reserved_nolock(atom, (__u32) count); ++ ++ spin_lock_reiser4_super(sbinfo); ++ ++ sbinfo->blocks_grabbed += count; ++ sub_from_sb_flush_reserved(sbinfo, count); ++ ++ assert("vpf-292", check_block_counters(ctx->super)); ++ ++ spin_unlock_reiser4_super(sbinfo); ++} ++ ++/** ++ * all_grabbed2free - releases all blocks grabbed in context ++ * ++ * Decreases context's and super block's grabbed block counters by number of ++ * blocks grabbed by current context and increases super block's free block ++ * counter correspondingly. 
++ */ ++void all_grabbed2free(void) ++{ ++ reiser4_context *ctx = get_current_context(); ++ ++ grabbed2free(ctx, get_super_private(ctx->super), ctx->grabbed_blocks); ++} ++ ++/* adjust sb block counters if real (on-disk) blocks do not become unallocated ++ after freeing, @count blocks become "grabbed". */ ++static void ++used2grabbed(reiser4_context * ctx, reiser4_super_info_data * sbinfo, ++ __u64 count) ++{ ++ add_to_ctx_grabbed(ctx, count); ++ ++ spin_lock_reiser4_super(sbinfo); ++ ++ sbinfo->blocks_grabbed += count; ++ sub_from_sb_used(sbinfo, count); ++ ++ assert("nikita-2685", check_block_counters(ctx->super)); ++ ++ spin_unlock_reiser4_super(sbinfo); ++} ++ ++/* this used to be done through used2grabbed and grabbed2free*/ ++static void used2free(reiser4_super_info_data * sbinfo, __u64 count) ++{ ++ spin_lock_reiser4_super(sbinfo); ++ ++ sbinfo->blocks_free += count; ++ sub_from_sb_used(sbinfo, count); ++ ++ assert("nikita-2685", check_block_counters(reiser4_get_current_sb())); ++ ++ spin_unlock_reiser4_super(sbinfo); ++} ++ ++#if REISER4_DEBUG ++ ++/* check "allocated" state of given block range */ ++static void ++reiser4_check_blocks(const reiser4_block_nr * start, ++ const reiser4_block_nr * len, int desired) ++{ ++ sa_check_blocks(start, len, desired); ++} ++ ++/* check "allocated" state of given block */ ++void reiser4_check_block(const reiser4_block_nr * block, int desired) ++{ ++ const reiser4_block_nr one = 1; ++ ++ reiser4_check_blocks(block, &one, desired); ++} ++ ++#endif ++ ++/* Blocks deallocation function may do an actual deallocation through space ++ plugin allocation or store deleted block numbers in atom's delete_set data ++ structure depend on @defer parameter. */ ++ ++/* if BA_DEFER bit is not turned on, @target_stage means the stage of blocks which ++ will be deleted from WORKING bitmap. 
They might be just unmapped from disk, or ++ freed but disk space is still grabbed by current thread, or these blocks must ++ not be counted in any reiser4 sb block counters, see block_stage_t comment */ ++ ++/* BA_FORMATTED bit is only used when BA_DEFER in not present: it is used to ++ distinguish blocks allocated for unformatted and formatted nodes */ ++ ++int ++reiser4_dealloc_blocks(const reiser4_block_nr * start, ++ const reiser4_block_nr * len, ++ block_stage_t target_stage, reiser4_ba_flags_t flags) ++{ ++ txn_atom *atom = NULL; ++ int ret; ++ reiser4_context *ctx; ++ reiser4_super_info_data *sbinfo; ++ ++ ctx = get_current_context(); ++ sbinfo = get_super_private(ctx->super); ++ ++ if (REISER4_DEBUG) { ++ assert("zam-431", *len != 0); ++ assert("zam-432", *start != 0); ++ assert("zam-558", !blocknr_is_fake(start)); ++ ++ spin_lock_reiser4_super(sbinfo); ++ assert("zam-562", *start < sbinfo->block_count); ++ spin_unlock_reiser4_super(sbinfo); ++ } ++ ++ if (flags & BA_DEFER) { ++ blocknr_set_entry *bsep = NULL; ++ ++ /* storing deleted block numbers in a blocknr set ++ datastructure for further actual deletion */ ++ do { ++ atom = get_current_atom_locked(); ++ assert("zam-430", atom != NULL); ++ ++ ret = ++ blocknr_set_add_extent(atom, &atom->delete_set, ++ &bsep, start, len); ++ ++ if (ret == -ENOMEM) ++ return ret; ++ ++ /* This loop might spin at most two times */ ++ } while (ret == -E_REPEAT); ++ ++ assert("zam-477", ret == 0); ++ assert("zam-433", atom != NULL); ++ ++ spin_unlock_atom(atom); ++ ++ } else { ++ assert("zam-425", get_current_super_private() != NULL); ++ sa_dealloc_blocks(get_space_allocator(ctx->super), *start, ++ *len); ++ ++ if (flags & BA_PERMANENT) { ++ /* These blocks were counted as allocated, we have to revert it ++ * back if allocation is discarded. 
*/ ++ txn_atom *atom = get_current_atom_locked(); ++ atom->nr_blocks_allocated -= *len; ++ spin_unlock_atom(atom); ++ } ++ ++ switch (target_stage) { ++ case BLOCK_NOT_COUNTED: ++ assert("vs-960", flags & BA_FORMATTED); ++ /* VITALY: This is what was grabbed for internal/tx-lists/similar only */ ++ used2free(sbinfo, *len); ++ break; ++ ++ case BLOCK_GRABBED: ++ used2grabbed(ctx, sbinfo, *len); ++ break; ++ ++ case BLOCK_UNALLOCATED: ++ used2fake_allocated(sbinfo, *len, flags & BA_FORMATTED); ++ break; ++ ++ case BLOCK_FLUSH_RESERVED:{ ++ txn_atom *atom; ++ ++ atom = get_current_atom_locked(); ++ used2flush_reserved(sbinfo, atom, *len, ++ flags & BA_FORMATTED); ++ spin_unlock_atom(atom); ++ break; ++ } ++ default: ++ impossible("zam-532", "wrong block stage"); ++ } ++ } ++ ++ return 0; ++} ++ ++/* wrappers for block allocator plugin methods */ ++int pre_commit_hook(void) ++{ ++ assert("zam-502", get_current_super_private() != NULL); ++ sa_pre_commit_hook(); ++ return 0; ++} ++ ++/* an actor which applies delete set to block allocator data */ ++static int ++apply_dset(txn_atom * atom UNUSED_ARG, const reiser4_block_nr * a, ++ const reiser4_block_nr * b, void *data UNUSED_ARG) ++{ ++ reiser4_context *ctx; ++ reiser4_super_info_data *sbinfo; ++ ++ __u64 len = 1; ++ ++ ctx = get_current_context(); ++ sbinfo = get_super_private(ctx->super); ++ ++ assert("zam-877", atom->stage >= ASTAGE_PRE_COMMIT); ++ assert("zam-552", sbinfo != NULL); ++ ++ if (b != NULL) ++ len = *b; ++ ++ if (REISER4_DEBUG) { ++ spin_lock_reiser4_super(sbinfo); ++ ++ assert("zam-554", *a < reiser4_block_count(ctx->super)); ++ assert("zam-555", *a + len <= reiser4_block_count(ctx->super)); ++ ++ spin_unlock_reiser4_super(sbinfo); ++ } ++ ++ sa_dealloc_blocks(&sbinfo->space_allocator, *a, len); ++ /* adjust sb block counters */ ++ used2free(sbinfo, len); ++ return 0; ++} ++ ++void post_commit_hook(void) ++{ ++ txn_atom *atom; ++ ++ atom = get_current_atom_locked(); ++ assert("zam-452", atom->stage == ASTAGE_POST_COMMIT); ++ spin_unlock_atom(atom); ++ ++ /* do the block deallocation which was deferred ++ until commit is done */ ++ blocknr_set_iterator(atom, &atom->delete_set, apply_dset, NULL, 1); ++ ++ assert("zam-504", get_current_super_private() != NULL); ++ sa_post_commit_hook(); ++} ++ ++void post_write_back_hook(void) ++{ ++ assert("zam-504", get_current_super_private() != NULL); ++ ++ sa_post_commit_hook(); ++} ++ ++/* ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ scroll-step: 1 ++ End: ++*/ +Index: linux-2.6.16/fs/reiser4/block_alloc.h +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/block_alloc.h +@@ -0,0 +1,175 @@ ++/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++#if !defined (__FS_REISER4_BLOCK_ALLOC_H__) ++#define __FS_REISER4_BLOCK_ALLOC_H__ ++ ++#include "dformat.h" ++#include "forward.h" ++ ++#include /* for __u?? 
++ ++/* Mask which, when applied to a given block number, shows whether that block ++ number is a fake one */ ++#define REISER4_FAKE_BLOCKNR_BIT_MASK 0x8000000000000000ULL ++/* Mask which isolates the type of object this fake block number was assigned to */ ++#define REISER4_BLOCKNR_STATUS_BIT_MASK 0xC000000000000000ULL ++ ++/* The result of applying REISER4_BLOCKNR_STATUS_BIT_MASK should be compared ++ against these two values to determine whether the object is unallocated or a ++ bitmap shadow object (a WORKING BITMAP block, see plugin/space/bitmap.c) */ ++#define REISER4_UNALLOCATED_STATUS_VALUE 0xC000000000000000ULL ++#define REISER4_BITMAP_BLOCKS_STATUS_VALUE 0x8000000000000000ULL ++ ++/* specification of how a block allocation was counted in sb block counters */ ++typedef enum { ++ BLOCK_NOT_COUNTED = 0, /* reiser4 has no info about this block yet */ ++ BLOCK_GRABBED = 1, /* free space grabbed for further allocation ++ of this block */ ++ BLOCK_FLUSH_RESERVED = 2, /* block is reserved for flush needs. */ ++ BLOCK_UNALLOCATED = 3, /* block is used for an existing in-memory object ++ ( unallocated formatted or unformatted ++ node) */ ++ BLOCK_ALLOCATED = 4 /* block is mapped to disk, real on-disk block ++ number assigned */ ++} block_stage_t; ++ ++/* a hint for the block allocator */ ++struct reiser4_blocknr_hint { ++ /* FIXME: I think we want to add a longterm lock on the bitmap block here. This ++ is to prevent jnode_flush() calls from interleaving allocations on the same ++ bitmap, once a hint is established. */ ++ ++ /* search start hint */ ++ reiser4_block_nr blk; ++ /* if not zero, it is a region size we search for free blocks in */ ++ reiser4_block_nr max_dist; ++ /* level for allocation; it may be useful to have branch-level and higher ++ write-optimized. */ ++ tree_level level; ++ /* the block allocator assumes that blocks which will be mapped to disk ++ are in this specified block_stage */ ++ block_stage_t block_stage; ++ /* If direction = 1, allocate blocks in backward direction, from the end ++ * of the disk to the beginning of the disk. */ ++ unsigned int backward:1; ++ ++}; ++ ++/* These flags control block allocation/deallocation behavior */ ++enum reiser4_ba_flags { ++ /* do allocations from the reserved (5%) area */ ++ BA_RESERVED = (1 << 0), ++ ++ /* the block allocator can do a commit, trying to recover free space */ ++ BA_CAN_COMMIT = (1 << 1), ++ ++ /* the operation will be applied to a formatted block */ ++ BA_FORMATTED = (1 << 2), ++ ++ /* defer actual block freeing until transaction commit */ ++ BA_DEFER = (1 << 3), ++ ++ /* allocate blocks for permanent fs objects (formatted or unformatted), not ++ wandered or log blocks */ ++ BA_PERMANENT = (1 << 4), ++ ++ /* grab space even if it was disabled */ ++ BA_FORCE = (1 << 5), ++ ++ /* use default start value for free blocks search.
*/ ++ BA_USE_DEFAULT_SEARCH_START = (1 << 6) ++}; ++ ++typedef enum reiser4_ba_flags reiser4_ba_flags_t; ++ ++extern void blocknr_hint_init(reiser4_blocknr_hint * hint); ++extern void blocknr_hint_done(reiser4_blocknr_hint * hint); ++extern void update_blocknr_hint_default(const struct super_block *, ++ const reiser4_block_nr *); ++extern void get_blocknr_hint_default(reiser4_block_nr *); ++ ++extern reiser4_block_nr reiser4_fs_reserved_space(struct super_block *super); ++ ++int assign_fake_blocknr_formatted(reiser4_block_nr *); ++reiser4_block_nr fake_blocknr_unformatted(int); ++ ++/* free -> grabbed -> fake_allocated -> used */ ++ ++int reiser4_grab_space(__u64 count, reiser4_ba_flags_t flags); ++void all_grabbed2free(void); ++void grabbed2free(reiser4_context *, reiser4_super_info_data *, __u64 count); ++void fake_allocated2free(__u64 count, reiser4_ba_flags_t flags); ++void grabbed2flush_reserved_nolock(txn_atom * atom, __u64 count); ++void grabbed2flush_reserved(__u64 count); ++int reiser4_alloc_blocks(reiser4_blocknr_hint * hint, ++ reiser4_block_nr * start, ++ reiser4_block_nr * len, reiser4_ba_flags_t flags); ++int reiser4_dealloc_blocks(const reiser4_block_nr *, ++ const reiser4_block_nr *, ++ block_stage_t, reiser4_ba_flags_t flags); ++ ++static inline int reiser4_alloc_block(reiser4_blocknr_hint * hint, ++ reiser4_block_nr * start, ++ reiser4_ba_flags_t flags) ++{ ++ reiser4_block_nr one = 1; ++ return reiser4_alloc_blocks(hint, start, &one, flags); ++} ++ ++static inline int reiser4_dealloc_block(const reiser4_block_nr * block, ++ block_stage_t stage, ++ reiser4_ba_flags_t flags) ++{ ++ const reiser4_block_nr one = 1; ++ return reiser4_dealloc_blocks(block, &one, stage, flags); ++} ++ ++#define reiser4_grab_space_force(count, flags) \ ++ reiser4_grab_space(count, flags | BA_FORCE) ++ ++extern void grabbed2free_mark(__u64 mark); ++extern int reiser4_grab_reserved(struct super_block *, ++ __u64, reiser4_ba_flags_t); ++extern void reiser4_release_reserved(struct super_block *super); ++ ++/* grabbed -> fake_allocated */ ++ ++/* fake_allocated -> used */ ++ ++/* used -> fake_allocated -> grabbed -> free */ ++ ++extern void flush_reserved2grabbed(txn_atom * atom, __u64 count); ++ ++extern int blocknr_is_fake(const reiser4_block_nr * da); ++ ++extern void grabbed2cluster_reserved(int count); ++extern void cluster_reserved2grabbed(int count); ++extern void cluster_reserved2free(int count); ++ ++extern int check_block_counters(const struct super_block *); ++ ++#if REISER4_DEBUG ++ ++extern void reiser4_check_block(const reiser4_block_nr *, int); ++ ++#else ++ ++# define reiser4_check_block(beg, val) noop ++ ++#endif ++ ++extern int pre_commit_hook(void); ++extern void post_commit_hook(void); ++extern void post_write_back_hook(void); ++ ++#endif /* __FS_REISER4_BLOCK_ALLOC_H__ */ ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +Index: linux-2.6.16/fs/reiser4/blocknrset.c +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/blocknrset.c +@@ -0,0 +1,368 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++/* This file contains code for various block number sets used by the atom to ++ track the deleted set and wandered block mappings. 
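*/

To make the E_REPEAT protocol implemented in this file concrete, here is a hedged sketch of the caller-side loop, mirroring the one already used by reiser4_dealloc_blocks() in block_alloc.c above; the enclosing function and the start/len variables are assumed:

	blocknr_set_entry *bsep = NULL;
	txn_atom *atom;
	int ret;

	do {
		/* blocknr_set_add_extent() may drop the atom lock,
		 * allocate a fresh entry into bsep, and return
		 * -E_REPEAT, so the atom is re-acquired on each pass. */
		atom = get_current_atom_locked();
		ret = blocknr_set_add_extent(atom, &atom->delete_set,
					     &bsep, start, len);
		if (ret == -ENOMEM)
			return ret;
	} while (ret == -E_REPEAT);
	spin_unlock_atom(atom);
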
++ ++#include "debug.h" ++#include "dformat.h" ++#include "txnmgr.h" ++#include "context.h" ++ ++#include <linux/slab.h> ++ ++/* The proposed data structure for storing unordered block number sets is a ++ list of elements, each of which contains an array of block numbers and/or an ++ array of block number pairs. Such an element, called blocknr_set_entry, is used ++ to store single block numbers from the beginning and extents from the end of ++ the data field (char data[...]). The ->nr_singles and ->nr_pairs fields ++ count the numbers of blocks and extents. ++ ++ +------------------- blocknr_set_entry->data ------------------+ ++ |block1|block2| ... ... |pair3|pair2|pair1| ++ +------------------------------------------------------------+ ++ ++ When the current blocknr_set_entry is full, a new one is allocated. */ ++ ++/* Usage examples: blocknr sets are used in reiser4 for storing an atom's delete ++ * set (single blocks and block extents); in that case a blocknr pair represents an ++ * extent. An atom's wandered map is also stored as a blocknr set; blocknr pairs ++ * there represent a (real block) -> (wandered block) mapping. */ ++ ++typedef struct blocknr_pair blocknr_pair; ++ ++/* The total size of a blocknr_set_entry. */ ++#define BLOCKNR_SET_ENTRY_SIZE 128 ++ ++/* The number of blocks that can fit in the blocknr data area. */ ++#define BLOCKNR_SET_ENTRIES_NUMBER \ ++ ((BLOCKNR_SET_ENTRY_SIZE - \ ++ 2 * sizeof (unsigned) - \ ++ sizeof(struct list_head)) / \ ++ sizeof(reiser4_block_nr)) ++ ++/* An entry of the blocknr_set */ ++struct blocknr_set_entry { ++ unsigned nr_singles; ++ unsigned nr_pairs; ++ struct list_head link; ++ reiser4_block_nr entries[BLOCKNR_SET_ENTRIES_NUMBER]; ++}; ++ ++/* A pair of blocks as recorded in the blocknr_set_entry data. */ ++struct blocknr_pair { ++ reiser4_block_nr a; ++ reiser4_block_nr b; ++}; ++ ++/* Return the number of blocknr slots available in a blocknr_set_entry. */ ++/* Audited by: green(2002.06.11) */ ++static unsigned bse_avail(blocknr_set_entry * bse) ++{ ++ unsigned used = bse->nr_singles + 2 * bse->nr_pairs; ++ ++ assert("jmacd-5088", BLOCKNR_SET_ENTRIES_NUMBER >= used); ++ cassert(sizeof(blocknr_set_entry) == BLOCKNR_SET_ENTRY_SIZE); ++ ++ return BLOCKNR_SET_ENTRIES_NUMBER - used; ++} ++ ++/* Initialize a blocknr_set_entry. */ ++static void bse_init(blocknr_set_entry *bse) ++{ ++ bse->nr_singles = 0; ++ bse->nr_pairs = 0; ++ INIT_LIST_HEAD(&bse->link); ++} ++ ++/* Allocate and initialize a blocknr_set_entry. */ ++/* Audited by: green(2002.06.11) */ ++static blocknr_set_entry *bse_alloc(void) ++{ ++ blocknr_set_entry *e; ++ ++ if ((e = (blocknr_set_entry *) kmalloc(sizeof(blocknr_set_entry), ++ get_gfp_mask())) == NULL) ++ return NULL; ++ ++ bse_init(e); ++ ++ return e; ++} ++ ++/* Free a blocknr_set_entry.
*/ ++/* Audited by: green(2002.06.11) */ ++static void bse_free(blocknr_set_entry * bse) ++{ ++ kfree(bse); ++} ++ ++/* Add a block number to a blocknr_set_entry */ ++/* Audited by: green(2002.06.11) */ ++static void ++bse_put_single(blocknr_set_entry * bse, const reiser4_block_nr * block) ++{ ++ assert("jmacd-5099", bse_avail(bse) >= 1); ++ ++ bse->entries[bse->nr_singles++] = *block; ++} ++ ++/* Get a pair of block numbers */ ++/* Audited by: green(2002.06.11) */ ++static inline blocknr_pair *bse_get_pair(blocknr_set_entry * bse, unsigned pno) ++{ ++ assert("green-1", BLOCKNR_SET_ENTRIES_NUMBER >= 2 * (pno + 1)); ++ ++ return (blocknr_pair *) (bse->entries + BLOCKNR_SET_ENTRIES_NUMBER - ++ 2 * (pno + 1)); ++} ++ ++/* Add a pair of block numbers to a blocknr_set_entry */ ++/* Audited by: green(2002.06.11) */ ++static void ++bse_put_pair(blocknr_set_entry * bse, const reiser4_block_nr * a, ++ const reiser4_block_nr * b) ++{ ++ blocknr_pair *pair; ++ ++ assert("jmacd-5100", bse_avail(bse) >= 2 && a != NULL && b != NULL); ++ ++ pair = bse_get_pair(bse, bse->nr_pairs++); ++ ++ pair->a = *a; ++ pair->b = *b; ++} ++ ++/* Add either a block or pair of blocks to the block number set. The first ++ blocknr (@a) must be non-NULL. If @b is NULL a single blocknr is added, if ++ @b is non-NULL a pair is added. The block number set belongs to atom, and ++ the call is made with the atom lock held. There may not be enough space in ++ the current blocknr_set_entry. If new_bsep points to a non-NULL ++ blocknr_set_entry then it will be added to the blocknr_set and new_bsep ++ will be set to NULL. If new_bsep contains NULL then the atom lock will be ++ released and a new bse will be allocated in new_bsep. E_REPEAT will be ++ returned with the atom unlocked for the operation to be tried again. If ++ the operation succeeds, 0 is returned. If new_bsep is non-NULL and not ++ used during the call, it will be freed automatically. */ ++static int blocknr_set_add(txn_atom *atom, blocknr_set *bset, ++ blocknr_set_entry **new_bsep, const reiser4_block_nr *a, ++ const reiser4_block_nr *b) ++{ ++ blocknr_set_entry *bse; ++ unsigned entries_needed; ++ ++ assert("jmacd-5101", a != NULL); ++ ++ entries_needed = (b == NULL) ? 1 : 2; ++ if (list_empty(&bset->entries) || ++ bse_avail(list_entry(bset->entries.next, blocknr_set_entry, link)) < entries_needed) { ++ /* See if a bse was previously allocated. */ ++ if (*new_bsep == NULL) { ++ spin_unlock_atom(atom); ++ *new_bsep = bse_alloc(); ++ return (*new_bsep != NULL) ? -E_REPEAT : ++ RETERR(-ENOMEM); ++ } ++ ++ /* Put it on the head of the list. */ ++ list_add(&((*new_bsep)->link), &bset->entries); ++ ++ *new_bsep = NULL; ++ } ++ ++ /* Add the single or pair. */ ++ bse = list_entry(bset->entries.next, blocknr_set_entry, link); ++ if (b == NULL) { ++ bse_put_single(bse, a); ++ } else { ++ bse_put_pair(bse, a, b); ++ } ++ ++ /* If new_bsep is non-NULL then there was an allocation race, free this copy. */ ++ if (*new_bsep != NULL) { ++ bse_free(*new_bsep); ++ *new_bsep = NULL; ++ } ++ ++ return 0; ++} ++ ++/* Add an extent to the block set. If the length is 1, it is treated as a ++ single block (e.g., reiser4_set_add_block). */ ++/* Audited by: green(2002.06.11) */ ++/* Auditor note: Entire call chain cannot hold any spinlocks, because ++ kmalloc might schedule. The only exception is atom spinlock, which is ++ properly freed. 
*/ ++int ++blocknr_set_add_extent(txn_atom * atom, ++ blocknr_set * bset, ++ blocknr_set_entry ** new_bsep, ++ const reiser4_block_nr * start, ++ const reiser4_block_nr * len) ++{ ++ assert("jmacd-5102", start != NULL && len != NULL && *len > 0); ++ return blocknr_set_add(atom, bset, new_bsep, start, ++ *len == 1 ? NULL : len); ++} ++ ++/* Add a block pair to the block set. It adds exactly a pair, which is checked ++ * by an assertion that both arguments are not null.*/ ++/* Audited by: green(2002.06.11) */ ++/* Auditor note: Entire call chain cannot hold any spinlocks, because ++ kmalloc might schedule. The only exception is atom spinlock, which is ++ properly freed. */ ++int ++blocknr_set_add_pair(txn_atom * atom, ++ blocknr_set * bset, ++ blocknr_set_entry ** new_bsep, const reiser4_block_nr * a, ++ const reiser4_block_nr * b) ++{ ++ assert("jmacd-5103", a != NULL && b != NULL); ++ return blocknr_set_add(atom, bset, new_bsep, a, b); ++} ++ ++/* Initialize a blocknr_set. */ ++void blocknr_set_init(blocknr_set *bset) ++{ ++ INIT_LIST_HEAD(&bset->entries); ++} ++ ++/* Release the entries of a blocknr_set. */ ++void blocknr_set_destroy(blocknr_set *bset) ++{ ++ blocknr_set_entry *bse; ++ ++ while (!list_empty_careful(&bset->entries)) { ++ bse = list_entry(bset->entries.next, blocknr_set_entry, link); ++ list_del_init(&bse->link); ++ bse_free(bse); ++ } ++} ++ ++/* Merge blocknr_set entries out of @from into @into. */ ++/* Audited by: green(2002.06.11) */ ++/* Auditor comments: This merge does not know if merged sets contain ++ blocks pairs (As for wandered sets) or extents, so it cannot really merge ++ overlapping ranges if there is some. So I believe it may lead to ++ some blocks being presented several times in one blocknr_set. To help ++ debugging such problems it might help to check for duplicate entries on ++ actual processing of this set. Testing this kind of stuff right here is ++ also complicated by the fact that these sets are not sorted and going ++ through whole set on each element addition is going to be CPU-heavy task */ ++void blocknr_set_merge(blocknr_set * from, blocknr_set * into) ++{ ++ blocknr_set_entry *bse_into = NULL; ++ ++ /* If @from is empty, no work to perform. */ ++ if (list_empty_careful(&from->entries)) { ++ return; ++ } ++ ++ /* If @into is not empty, try merging partial-entries. */ ++ if (!list_empty_careful(&into->entries)) { ++ ++ /* Neither set is empty, pop the front to members and try to combine them. */ ++ blocknr_set_entry *bse_from; ++ unsigned into_avail; ++ ++ bse_into = list_entry(into->entries.next, blocknr_set_entry, link); ++ list_del_init(&bse_into->link); ++ bse_from = list_entry(from->entries.next, blocknr_set_entry, link); ++ list_del_init(&bse_from->link); ++ ++ /* Combine singles. */ ++ for (into_avail = bse_avail(bse_into); ++ into_avail != 0 && bse_from->nr_singles != 0; ++ into_avail -= 1) { ++ bse_put_single(bse_into, ++ &bse_from->entries[--bse_from-> ++ nr_singles]); ++ } ++ ++ /* Combine pairs. */ ++ for (; into_avail > 1 && bse_from->nr_pairs != 0; ++ into_avail -= 2) { ++ blocknr_pair *pair = ++ bse_get_pair(bse_from, --bse_from->nr_pairs); ++ bse_put_pair(bse_into, &pair->a, &pair->b); ++ } ++ ++ /* If bse_from is empty, delete it now. */ ++ if (bse_avail(bse_from) == BLOCKNR_SET_ENTRIES_NUMBER) { ++ bse_free(bse_from); ++ } else { ++ /* Otherwise, bse_into is full or nearly full (e.g., ++ it could have one slot avail and bse_from has one ++ pair left). Push it back onto the list. 
bse_from ++ becomes bse_into, which will be the new partial. */ ++ list_add(&bse_into->link, &into->entries); ++ bse_into = bse_from; ++ } ++ } ++ ++ /* Splice lists together. */ ++ list_splice_init(&from->entries, into->entries.prev); ++ ++ /* Add the partial entry back to the head of the list. */ ++ if (bse_into != NULL) { ++ list_add(&bse_into->link, &into->entries); ++ } ++} ++ ++/* Iterate over all blocknr set elements. */ ++int blocknr_set_iterator(txn_atom *atom, blocknr_set *bset, ++ blocknr_set_actor_f actor, void *data, int delete) ++{ ++ ++ blocknr_set_entry *entry; ++ ++ assert("zam-429", atom != NULL); ++ assert("zam-430", atom_is_protected(atom)); ++ assert("zam-431", bset != 0); ++ assert("zam-432", actor != NULL); ++ ++ entry = list_entry(bset->entries.next, blocknr_set_entry, link); ++ while (&bset->entries != &entry->link) { ++ blocknr_set_entry *tmp = list_entry(entry->link.next, blocknr_set_entry, link); ++ unsigned int i; ++ int ret; ++ ++ for (i = 0; i < entry->nr_singles; i++) { ++ ret = actor(atom, &entry->entries[i], NULL, data); ++ ++ /* We can't break a loop if delete flag is set. */ ++ if (ret != 0 && !delete) ++ return ret; ++ } ++ ++ for (i = 0; i < entry->nr_pairs; i++) { ++ struct blocknr_pair *ab; ++ ++ ab = bse_get_pair(entry, i); ++ ++ ret = actor(atom, &ab->a, &ab->b, data); ++ ++ if (ret != 0 && !delete) ++ return ret; ++ } ++ ++ if (delete) { ++ list_del(&entry->link); ++ bse_free(entry); ++ } ++ ++ entry = tmp; ++ } ++ ++ return 0; ++} ++ ++/* ++ * Local variables: ++ * c-indentation-style: "K&R" ++ * mode-name: "LC" ++ * c-basic-offset: 8 ++ * tab-width: 8 ++ * fill-column: 79 ++ * scroll-step: 1 ++ * End: ++ */ +Index: linux-2.6.16/fs/reiser4/carry.c +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/carry.c +@@ -0,0 +1,1381 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++/* Functions to "carry" tree modification(s) upward. */ ++/* Tree is modified one level at a time. As we modify a level we accumulate a ++ set of changes that need to be propagated to the next level. We manage ++ node locking such that any searches that collide with carrying are ++ restarted, from the root if necessary. ++ ++ Insertion of a new item may result in items being moved among nodes and ++ this requires the delimiting key to be updated at the least common parent ++ of the nodes modified to preserve search tree invariants. Also, insertion ++ may require allocation of a new node. A pointer to the new node has to be ++ inserted into some node on the parent level, etc. ++ ++ Tree carrying is meant to be analogous to arithmetic carrying. ++ ++ A carry operation is always associated with some node (&carry_node). ++ ++ Carry process starts with some initial set of operations to be performed ++ and an initial set of already locked nodes. Operations are performed one ++ by one. Performing each single operation has following possible effects: ++ ++ - content of carry node associated with operation is modified ++ - new carry nodes are locked and involved into carry process on this level ++ - new carry operations are posted to the next level ++ ++ After all carry operations on this level are done, process is repeated for ++ the accumulated sequence on carry operations for the next level. This ++ starts by trying to lock (in left to right order) all carry nodes ++ associated with carry operations on the parent level. 
After this, we decide ++ whether more nodes are required on the left of already locked set. If so, ++ all locks taken on the parent level are released, new carry nodes are ++ added, and locking process repeats. ++ ++ It may happen that balancing process fails owing to unrecoverable error on ++ some of upper levels of a tree (possible causes are io error, failure to ++ allocate new node, etc.). In this case we should unmount the filesystem, ++ rebooting if it is the root, and possibly advise the use of fsck. ++ ++ USAGE: ++ ++ int some_tree_operation( znode *node, ... ) ++ { ++ // Allocate on a stack pool of carry objects: operations and nodes. ++ // Most carry processes will only take objects from here, without ++ // dynamic allocation. ++ ++I feel uneasy about this pool. It adds to code complexity, I understand why it exists, but.... -Hans ++ ++ carry_pool pool; ++ carry_level lowest_level; ++ carry_op *op; ++ ++ init_carry_pool( &pool ); ++ init_carry_level( &lowest_level, &pool ); ++ ++ // operation may be one of: ++ // COP_INSERT --- insert new item into node ++ // COP_CUT --- remove part of or whole node ++ // COP_PASTE --- increase size of item ++ // COP_DELETE --- delete pointer from parent node ++ // COP_UPDATE --- update delimiting key in least ++ // common ancestor of two ++ ++ op = post_carry( &lowest_level, operation, node, 0 ); ++ if( IS_ERR( op ) || ( op == NULL ) ) { ++ handle error ++ } else { ++ // fill in remaining fields in @op, according to carry.h:carry_op ++ result = carry( &lowest_level, NULL ); ++ } ++ done_carry_pool( &pool ); ++ } ++ ++ When you are implementing node plugin method that participates in carry ++ (shifting, insertion, deletion, etc.), do the following: ++ ++ int foo_node_method( znode *node, ..., carry_level *todo ) ++ { ++ carry_op *op; ++ ++ .... ++ ++ // note, that last argument to post_carry() is non-null ++ // here, because @op is to be applied to the parent of @node, rather ++ // than to the @node itself as in the previous case. ++ ++ op = node_post_carry( todo, operation, node, 1 ); ++ // fill in remaining fields in @op, according to carry.h:carry_op ++ ++ .... ++ ++ } ++ ++ BATCHING: ++ ++ One of the main advantages of level-by-level balancing implemented here is ++ ability to batch updates on a parent level and to peform them more ++ efficiently as a result. ++ ++ Description To Be Done (TBD). ++ ++ DIFFICULTIES AND SUBTLE POINTS: ++ ++ 1. complex plumbing is required, because: ++ ++ a. effective allocation through pools is needed ++ ++ b. target of operation is not exactly known when operation is ++ posted. This is worked around through bitfields in &carry_node and ++ logic in lock_carry_node() ++ ++ c. of interaction with locking code: node should be added into sibling ++ list when pointer to it is inserted into its parent, which is some time ++ after node was created. Between these moments, node is somewhat in ++ suspended state and is only registered in the carry lists ++ ++ 2. whole balancing logic is implemented here, in particular, insertion ++ logic is coded in make_space(). ++ ++ 3. special cases like insertion (add_tree_root()) or deletion ++ (kill_tree_root()) of tree root and morphing of paste into insert ++ (insert_paste()) have to be handled. ++ ++ 4. there is non-trivial interdependency between allocation of new nodes ++ and almost everything else. This is mainly due to the (1.c) above. I shall ++ write about this later. 
++ ++#include "forward.h" ++#include "debug.h" ++#include "key.h" ++#include "coord.h" ++#include "plugin/item/item.h" ++#include "plugin/item/extent.h" ++#include "plugin/node/node.h" ++#include "jnode.h" ++#include "znode.h" ++#include "tree_mod.h" ++#include "tree_walk.h" ++#include "block_alloc.h" ++#include "pool.h" ++#include "tree.h" ++#include "carry.h" ++#include "carry_ops.h" ++#include "super.h" ++#include "reiser4.h" ++ ++#include <linux/types.h> ++ ++/* level locking/unlocking */ ++static int lock_carry_level(carry_level * level); ++static void unlock_carry_level(carry_level * level, int failure); ++static void done_carry_level(carry_level * level); ++static void unlock_carry_node(carry_level * level, carry_node * node, int fail); ++ ++int lock_carry_node(carry_level * level, carry_node * node); ++int lock_carry_node_tail(carry_node * node); ++ ++/* carry processing proper */ ++static int carry_on_level(carry_level * doing, carry_level * todo); ++ ++static carry_op *add_op(carry_level * level, pool_ordering order, ++ carry_op * reference); ++ ++/* handlers for carry operations. */ ++ ++static void fatal_carry_error(carry_level * doing, int ecode); ++static int add_new_root(carry_level * level, carry_node * node, znode * fake); ++ ++ ++static void print_level(const char *prefix, carry_level * level); ++ ++#if REISER4_DEBUG ++typedef enum { ++ CARRY_TODO, ++ CARRY_DOING ++} carry_queue_state; ++static int carry_level_invariant(carry_level * level, carry_queue_state state); ++#endif ++ ++/* main entry point for tree balancing. ++ ++ Tree carry performs operations from @doing and while doing so accumulates ++ information about operations to be performed on the next level ("carried" ++ to the parent level). Carried operations are performed, causing possibly ++ more operations to be carried upward etc. carry() takes care of ++ locking and pinning znodes while operating on them. ++ ++ For usage, see comment at the top of fs/reiser4/carry.c ++ ++*/ ++int carry(carry_level * doing /* set of carry operations to be performed */ , ++ carry_level * done /* set of nodes, already performed at the ++ * previous level. NULL in most cases */ ) ++{ ++ int result = 0; ++ /* queue of new requests */ ++ carry_level *todo; ++ ON_DEBUG(STORE_COUNTERS); ++ ++ assert("nikita-888", doing != NULL); ++ BUG_ON(done != NULL); ++ ++ todo = doing + 1; ++ init_carry_level(todo, doing->pool); ++ ++ /* queue of requests performed on the previous level */ ++ done = todo + 1; ++ init_carry_level(done, doing->pool); ++ ++ /* iterate until there is nothing more to do */ ++ while (result == 0 && doing->ops_num > 0) { ++ carry_level *tmp; ++ ++ /* at this point @done is locked. */ ++ /* repeat lock/do/unlock while ++ ++ (1) lock_carry_level() fails due to deadlock avoidance, or ++ ++ (2) carry_on_level() decides that more nodes have to ++ be involved. ++ ++ (3) some unexpected error occurred while balancing on the ++ upper levels. In this case all changes are rolled back.
++ ++ */ ++ while (1) { ++ result = lock_carry_level(doing); ++ if (result == 0) { ++ /* perform operations from @doing and ++ accumulate new requests in @todo */ ++ result = carry_on_level(doing, todo); ++ if (result == 0) ++ break; ++ else if (result != -E_REPEAT || ++ !doing->restartable) { ++ warning("nikita-1043", ++ "Fatal error during carry: %i", ++ result); ++ print_level("done", done); ++ print_level("doing", doing); ++ print_level("todo", todo); ++ /* do some rough stuff like aborting ++ all pending transcrashes and thus ++ pushing the tree back to a consistent ++ state. Alternatively, just panic. ++ */ ++ fatal_carry_error(doing, result); ++ return result; ++ } ++ } else if (result != -E_REPEAT) { ++ fatal_carry_error(doing, result); ++ return result; ++ } ++ unlock_carry_level(doing, 1); ++ } ++ /* at this point @done can be safely unlocked */ ++ done_carry_level(done); ++ ++ /* cyclically shift queues */ ++ tmp = done; ++ done = doing; ++ doing = todo; ++ todo = tmp; ++ init_carry_level(todo, doing->pool); ++ ++ /* give other threads a chance to run */ ++ preempt_point(); ++ } ++ done_carry_level(done); ++ ++ /* all counters but x_refs should remain the same. x_refs can change ++ owing to the transaction manager */ ++ ON_DEBUG(CHECK_COUNTERS); ++ return result; ++} ++ ++/* perform carry operations on a given level. ++ ++ Optimizations proposed by pooh: ++ ++ (1) don't lock all nodes from the queue at the same time. Lock nodes lazily as ++ required; ++ ++ (2) unlock a node if there are no more operations to be performed upon it and ++ the node didn't add any operation to @todo. This can be implemented by ++ attaching to each node two counters: a counter of operations working on this ++ node and a counter of operations carried upward from this node. ++ ++*/ ++static int carry_on_level(carry_level * doing /* queue of carry operations to ++ * do on this level */ , ++ carry_level * todo /* queue where new carry ++ * operations to be performed on ++ * the parent level are ++ * accumulated during @doing ++ * processing. */ ) ++{ ++ int result; ++ int (*f) (carry_op *, carry_level *, carry_level *); ++ carry_op *op; ++ carry_op *tmp_op; ++ ++ assert("nikita-1034", doing != NULL); ++ assert("nikita-1035", todo != NULL); ++ ++ /* @doing->nodes are locked. */ ++ ++ /* This function can be split into two phases: analysis and modification. ++ ++ Analysis calculates precisely what items should be moved between ++ nodes. This information is gathered in some structures attached to ++ each carry_node in a @doing queue. Analysis also determines whether ++ new nodes are to be allocated etc. ++ ++ After analysis is completed, actual modification is performed. Here ++ we can take advantage of "batch modification": if there are several ++ operations acting on the same node, modifications can be performed ++ more efficiently when batched together. ++ ++ Above is an optimization left for the future. ++ */ ++ /* Important, but delayed optimization: it's possible to batch ++ operations together and perform them more efficiently as a ++ result. For example, deletion of several neighboring items from a ++ node can be converted to a single ->cut() operation. ++ ++ Before processing, the queue should be scanned and "mergeable" ++ operations merged.
++ */ ++ result = 0; ++ for_all_ops(doing, op, tmp_op) { ++ carry_opcode opcode; ++ ++ assert("nikita-1041", op != NULL); ++ opcode = op->op; ++ assert("nikita-1042", op->op < COP_LAST_OP); ++ f = op_dispatch_table[op->op].handler; ++ result = f(op, doing, todo); ++ /* locking can fail with -E_REPEAT. Any different error is fatal ++ and will be handled by fatal_carry_error() sledgehammer. ++ */ ++ if (result != 0) ++ break; ++ } ++ if (result == 0) { ++ carry_plugin_info info; ++ carry_node *scan; ++ carry_node *tmp_scan; ++ ++ info.doing = doing; ++ info.todo = todo; ++ ++ assert("nikita-3002", ++ carry_level_invariant(doing, CARRY_DOING)); ++ for_all_nodes(doing, scan, tmp_scan) { ++ znode *node; ++ ++ node = carry_real(scan); ++ assert("nikita-2547", node != NULL); ++ if (node_is_empty(node)) { ++ result = ++ node_plugin_by_node(node)-> ++ prepare_removal(node, &info); ++ if (result != 0) ++ break; ++ } ++ } ++ } ++ return result; ++} ++ ++/* post carry operation ++ ++ This is main function used by external carry clients: node layout plugins ++ and tree operations to create new carry operation to be performed on some ++ level. ++ ++ New operation will be included in the @level queue. To actually perform it, ++ call carry( level, ... ). This function takes write lock on @node. Carry ++ manages all its locks by itself, don't worry about this. ++ ++ This function adds operation and node at the end of the queue. It is up to ++ caller to guarantee proper ordering of node queue. ++ ++*/ ++carry_op *post_carry(carry_level * level /* queue where new operation is to ++ * be posted at */ , ++ carry_opcode op /* opcode of operation */ , ++ znode * node /* node on which this operation ++ * will operate */ , ++ int apply_to_parent_p /* whether operation will operate ++ * directly on @node or on it ++ * parent. 
*/ ) ++{ ++ carry_op *result; ++ carry_node *child; ++ ++ assert("nikita-1046", level != NULL); ++ assert("nikita-1788", znode_is_write_locked(node)); ++ ++ result = add_op(level, POOLO_LAST, NULL); ++ if (IS_ERR(result)) ++ return result; ++ child = add_carry(level, POOLO_LAST, NULL); ++ if (IS_ERR(child)) { ++ reiser4_pool_free(&level->pool->op_pool, &result->header); ++ return (carry_op *) child; ++ } ++ result->node = child; ++ result->op = op; ++ child->parent = apply_to_parent_p; ++ if (ZF_ISSET(node, JNODE_ORPHAN)) ++ child->left_before = 1; ++ child->node = node; ++ return result; ++} ++ ++/* initialize carry queue */ ++void init_carry_level(carry_level * level /* level to initialize */ , ++ carry_pool * pool /* pool @level will allocate objects ++ * from */ ) ++{ ++ assert("nikita-1045", level != NULL); ++ assert("nikita-967", pool != NULL); ++ ++ memset(level, 0, sizeof *level); ++ level->pool = pool; ++ ++ INIT_LIST_HEAD(&level->nodes); ++ INIT_LIST_HEAD(&level->ops); ++} ++ ++/* allocate carry pool and initialize pools within queue */ ++carry_pool *init_carry_pool(int size) ++{ ++ carry_pool *pool; ++ ++ assert("", size >= sizeof(carry_pool) + 3 * sizeof(carry_level)); ++ pool = kmalloc(size, get_gfp_mask()); ++ if (pool == NULL) ++ return ERR_PTR(RETERR(-ENOMEM)); ++ ++ reiser4_init_pool(&pool->op_pool, sizeof(carry_op), CARRIES_POOL_SIZE, ++ (char *)pool->op); ++ reiser4_init_pool(&pool->node_pool, sizeof(carry_node), ++ NODES_LOCKED_POOL_SIZE, (char *)pool->node); ++ return pool; ++} ++ ++/* finish with queue pools */ ++void done_carry_pool(carry_pool * pool /* pool to destroy */ ) ++{ ++ reiser4_done_pool(&pool->op_pool); ++ reiser4_done_pool(&pool->node_pool); ++ kfree(pool); ++} ++ ++/* add new carry node to the @level. ++ ++ Returns pointer to the new carry node allocated from pool. It's up to ++ callers to maintain proper order in the @level. Assumption is that if carry ++ nodes on one level are already sorted and modifications are peroformed from ++ left to right, carry nodes added on the parent level will be ordered ++ automatically. To control ordering use @order and @reference parameters. 
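
   As a hedged illustration of the @order/@reference pair: this is how
   add_new_znode(), later in this file, queues a freshly allocated
   right brother immediately after its reference node:

	fresh = add_carry_skip(doing, ref ? POOLO_AFTER : POOLO_LAST, ref);
	if (IS_ERR(fresh))
		return fresh;
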
++ ++*/ ++carry_node *add_carry_skip(carry_level * level /* &carry_level to add node ++ * to */ , ++ pool_ordering order /* where to insert: at the ++ * beginning of @level, ++ * before @reference, after ++ * @reference, at the end ++ * of @level */ , ++ carry_node * reference /* reference node for ++ * insertion */ ) ++{ ++ ON_DEBUG(carry_node * orig_ref = reference); ++ ++ if (order == POOLO_BEFORE) { ++ reference = find_left_carry(reference, level); ++ if (reference == NULL) ++ reference = list_entry(level->nodes.next, carry_node, ++ header.level_linkage); ++ else ++ reference = list_entry(reference->header.level_linkage.next, ++ carry_node, header.level_linkage); ++ } else if (order == POOLO_AFTER) { ++ reference = find_right_carry(reference, level); ++ if (reference == NULL) ++ reference = list_entry(level->nodes.prev, carry_node, ++ header.level_linkage); ++ else ++ reference = list_entry(reference->header.level_linkage.prev, ++ carry_node, header.level_linkage); ++ } ++ assert("nikita-2209", ++ ergo(orig_ref != NULL, ++ carry_real(reference) == carry_real(orig_ref))); ++ return add_carry(level, order, reference); ++} ++ ++carry_node *add_carry(carry_level * level /* &carry_level to add node ++ * to */ , ++ pool_ordering order /* where to insert: at the ++ * beginning of @level, before ++ * @reference, after @reference, ++ * at the end of @level */ , ++ carry_node * reference /* reference node for ++ * insertion */ ) ++{ ++ carry_node *result; ++ ++ result = ++ (carry_node *) add_obj(&level->pool->node_pool, &level->nodes, ++ order, &reference->header); ++ if (!IS_ERR(result) && (result != NULL)) ++ ++level->nodes_num; ++ return result; ++} ++ ++/* add new carry operation to the @level. ++ ++ Returns pointer to the new carry operations allocated from pool. It's up to ++ callers to maintain proper order in the @level. To control ordering use ++ @order and @reference parameters. ++ ++*/ ++static carry_op *add_op(carry_level * level /* &carry_level to add node to */ , ++ pool_ordering order /* where to insert: at the beginning of ++ * @level, before @reference, after ++ * @reference, at the end of @level */ , ++ carry_op * ++ reference /* reference node for insertion */ ) ++{ ++ carry_op *result; ++ ++ result = ++ (carry_op *) add_obj(&level->pool->op_pool, &level->ops, order, ++ &reference->header); ++ if (!IS_ERR(result) && (result != NULL)) ++ ++level->ops_num; ++ return result; ++} ++ ++/* Return node on the right of which @node was created. ++ ++ Each node is created on the right of some existing node (or it is new root, ++ which is special case not handled here). ++ ++ @node is new node created on some level, but not yet inserted into its ++ parent, it has corresponding bit (JNODE_ORPHAN) set in zstate. 
++ ++*/ ++static carry_node *find_begetting_brother(carry_node * node /* node to start search ++ * from */ , ++ carry_level * kin UNUSED_ARG /* level to ++ * scan */ ) ++{ ++ carry_node *scan; ++ ++ assert("nikita-1614", node != NULL); ++ assert("nikita-1615", kin != NULL); ++ assert("nikita-1616", LOCK_CNT_GTZ(rw_locked_tree)); ++ assert("nikita-1619", ergo(carry_real(node) != NULL, ++ ZF_ISSET(carry_real(node), JNODE_ORPHAN))); ++ ++ for (scan = node;; ++ scan = list_entry(scan->header.level_linkage.prev, carry_node, ++ header.level_linkage)) { ++ assert("nikita-1617", &kin->nodes != &scan->header.level_linkage); ++ if ((scan->node != node->node) && ++ !ZF_ISSET(scan->node, JNODE_ORPHAN)) { ++ assert("nikita-1618", carry_real(scan) != NULL); ++ break; ++ } ++ } ++ return scan; ++} ++ ++static cmp_t ++carry_node_cmp(carry_level * level, carry_node * n1, carry_node * n2) ++{ ++ assert("nikita-2199", n1 != NULL); ++ assert("nikita-2200", n2 != NULL); ++ ++ if (n1 == n2) ++ return EQUAL_TO; ++ while (1) { ++ n1 = carry_node_next(n1); ++ if (carry_node_end(level, n1)) ++ return GREATER_THAN; ++ if (n1 == n2) ++ return LESS_THAN; ++ } ++ impossible("nikita-2201", "End of level reached"); ++} ++ ++carry_node *find_carry_node(carry_level * level, const znode * node) ++{ ++ carry_node *scan; ++ carry_node *tmp_scan; ++ ++ assert("nikita-2202", level != NULL); ++ assert("nikita-2203", node != NULL); ++ ++ for_all_nodes(level, scan, tmp_scan) { ++ if (carry_real(scan) == node) ++ return scan; ++ } ++ return NULL; ++} ++ ++znode *carry_real(const carry_node * node) ++{ ++ assert("nikita-3061", node != NULL); ++ ++ return node->lock_handle.node; ++} ++ ++carry_node *insert_carry_node(carry_level * doing, carry_level * todo, ++ const znode * node) ++{ ++ carry_node *base; ++ carry_node *scan; ++ carry_node *tmp_scan; ++ carry_node *proj; ++ ++ base = find_carry_node(doing, node); ++ assert("nikita-2204", base != NULL); ++ ++ for_all_nodes(todo, scan, tmp_scan) { ++ proj = find_carry_node(doing, scan->node); ++ assert("nikita-2205", proj != NULL); ++ if (carry_node_cmp(doing, proj, base) != LESS_THAN) ++ break; ++ } ++ return scan; ++} ++ ++static carry_node *add_carry_atplace(carry_level * doing, carry_level * todo, ++ znode * node) ++{ ++ carry_node *reference; ++ ++ assert("nikita-2994", doing != NULL); ++ assert("nikita-2995", todo != NULL); ++ assert("nikita-2996", node != NULL); ++ ++ reference = insert_carry_node(doing, todo, node); ++ assert("nikita-2997", reference != NULL); ++ ++ return add_carry(todo, POOLO_BEFORE, reference); ++} ++ ++/* like post_carry(), but designed to be called from node plugin methods. ++ This function is different from post_carry() in that it finds proper place ++ to insert node in the queue. */ ++carry_op *node_post_carry(carry_plugin_info * info /* carry parameters ++ * passed down to node ++ * plugin */ , ++ carry_opcode op /* opcode of operation */ , ++ znode * node /* node on which this ++ * operation will operate */ , ++ int apply_to_parent_p /* whether operation will ++ * operate directly on @node ++ * or on it parent. 
*/ ) ++{ ++ carry_op *result; ++ carry_node *child; ++ ++ assert("nikita-2207", info != NULL); ++ assert("nikita-2208", info->todo != NULL); ++ ++ if (info->doing == NULL) ++ return post_carry(info->todo, op, node, apply_to_parent_p); ++ ++ result = add_op(info->todo, POOLO_LAST, NULL); ++ if (IS_ERR(result)) ++ return result; ++ child = add_carry_atplace(info->doing, info->todo, node); ++ if (IS_ERR(child)) { ++ reiser4_pool_free(&info->todo->pool->op_pool, &result->header); ++ return (carry_op *) child; ++ } ++ result->node = child; ++ result->op = op; ++ child->parent = apply_to_parent_p; ++ if (ZF_ISSET(node, JNODE_ORPHAN)) ++ child->left_before = 1; ++ child->node = node; ++ return result; ++} ++ ++/* lock all carry nodes in @level */ ++static int lock_carry_level(carry_level * level /* level to lock */ ) ++{ ++ int result; ++ carry_node *node; ++ carry_node *tmp_node; ++ ++ assert("nikita-881", level != NULL); ++ assert("nikita-2229", carry_level_invariant(level, CARRY_TODO)); ++ ++ /* lock nodes from left to right */ ++ result = 0; ++ for_all_nodes(level, node, tmp_node) { ++ result = lock_carry_node(level, node); ++ if (result != 0) ++ break; ++ } ++ return result; ++} ++ ++/* Synchronize delimiting keys between @node and its left neighbor. ++ ++ To reduce contention on dk key and simplify carry code, we synchronize ++ delimiting keys only when carry ultimately leaves tree level (carrying ++ changes upward) and unlocks nodes at this level. ++ ++ This function first finds left neighbor of @node and then updates left ++ neighbor's right delimiting key to conincide with least key in @node. ++ ++*/ ++ ++ON_DEBUG(extern atomic_t delim_key_version; ++ ) ++ ++static void sync_dkeys(znode * spot /* node to update */ ) ++{ ++ reiser4_key pivot; ++ reiser4_tree *tree; ++ ++ assert("nikita-1610", spot != NULL); ++ assert("nikita-1612", LOCK_CNT_NIL(rw_locked_dk)); ++ ++ tree = znode_get_tree(spot); ++ read_lock_tree(tree); ++ write_lock_dk(tree); ++ ++ assert("nikita-2192", znode_is_loaded(spot)); ++ ++ /* sync left delimiting key of @spot with key in its leftmost item */ ++ if (node_is_empty(spot)) ++ pivot = *znode_get_rd_key(spot); ++ else ++ leftmost_key_in_node(spot, &pivot); ++ ++ znode_set_ld_key(spot, &pivot); ++ ++ /* there can be sequence of empty nodes pending removal on the left of ++ @spot. Scan them and update their left and right delimiting keys to ++ match left delimiting key of @spot. Also, update right delimiting ++ key of first non-empty left neighbor. ++ */ ++ while (1) { ++ if (!ZF_ISSET(spot, JNODE_LEFT_CONNECTED)) ++ break; ++ ++ spot = spot->left; ++ if (spot == NULL) ++ break; ++ ++ znode_set_rd_key(spot, &pivot); ++ /* don't sink into the domain of another balancing */ ++ if (!znode_is_write_locked(spot)) ++ break; ++ if (ZF_ISSET(spot, JNODE_HEARD_BANSHEE)) ++ znode_set_ld_key(spot, &pivot); ++ else ++ break; ++ } ++ ++ write_unlock_dk(tree); ++ read_unlock_tree(tree); ++} ++ ++/* unlock all carry nodes in @level */ ++static void unlock_carry_level(carry_level * level /* level to unlock */ , ++ int failure /* true if unlocking owing to ++ * failure */ ) ++{ ++ carry_node *node; ++ carry_node *tmp_node; ++ ++ assert("nikita-889", level != NULL); ++ ++ if (!failure) { ++ znode *spot; ++ ++ spot = NULL; ++ /* update delimiting keys */ ++ for_all_nodes(level, node, tmp_node) { ++ if (carry_real(node) != spot) { ++ spot = carry_real(node); ++ sync_dkeys(spot); ++ } ++ } ++ } ++ ++ /* nodes can be unlocked in arbitrary order. 
In preemptible ++ environment it's better to unlock in reverse order of locking, ++ though. ++ */ ++ for_all_nodes_back(level, node, tmp_node) { ++ /* all allocated nodes should be already linked to their ++ parents at this moment. */ ++ assert("nikita-1631", ergo(!failure, !ZF_ISSET(carry_real(node), ++ JNODE_ORPHAN))); ++ ON_DEBUG(check_dkeys(carry_real(node))); ++ unlock_carry_node(level, node, failure); ++ } ++ level->new_root = NULL; ++} ++ ++/* finish with @level ++ ++ Unlock nodes and release all allocated resources */ ++static void done_carry_level(carry_level * level /* level to finish */ ) ++{ ++ carry_node *node; ++ carry_node *tmp_node; ++ carry_op *op; ++ carry_op *tmp_op; ++ ++ assert("nikita-1076", level != NULL); ++ ++ unlock_carry_level(level, 0); ++ for_all_nodes(level, node, tmp_node) { ++ assert("nikita-2113", list_empty_careful(&node->lock_handle.locks_link)); ++ assert("nikita-2114", list_empty_careful(&node->lock_handle.owners_link)); ++ reiser4_pool_free(&level->pool->node_pool, &node->header); ++ } ++ for_all_ops(level, op, tmp_op) ++ reiser4_pool_free(&level->pool->op_pool, &op->header); ++} ++ ++/* helper function to complete locking of carry node ++ ++ Finish locking of carry node. There are several ways in which new carry ++ node can be added into carry level and locked. Normal is through ++ lock_carry_node(), but also from find_{left|right}_neighbor(). This ++ function factors out common final part of all locking scenarios. It ++ supposes that @node -> lock_handle is lock handle for lock just taken and ++ fills ->real_node from this lock handle. ++ ++*/ ++int lock_carry_node_tail(carry_node * node /* node to complete locking of */ ) ++{ ++ assert("nikita-1052", node != NULL); ++ assert("nikita-1187", carry_real(node) != NULL); ++ assert("nikita-1188", !node->unlock); ++ ++ node->unlock = 1; ++ /* Load node content into memory and install node plugin by ++ looking at the node header. ++ ++ Most of the time this call is cheap because the node is ++ already in memory. ++ ++ Corresponding zrelse() is in unlock_carry_node() ++ */ ++ return zload(carry_real(node)); ++} ++ ++/* lock carry node ++ ++ "Resolve" node to real znode, lock it and mark as locked. ++ This requires recursive locking of znodes. ++ ++ When operation is posted to the parent level, node it will be applied to is ++ not yet known. For example, when shifting data between two nodes, ++ delimiting has to be updated in parent or parents of nodes involved. But ++ their parents is not yet locked and, moreover said nodes can be reparented ++ by concurrent balancing. ++ ++ To work around this, carry operation is applied to special "carry node" ++ rather than to the znode itself. Carry node consists of some "base" or ++ "reference" znode and flags indicating how to get to the target of carry ++ operation (->real_node field of carry_node) from base. ++ ++*/ ++int lock_carry_node(carry_level * level /* level @node is in */ , ++ carry_node * node /* node to lock */ ) ++{ ++ int result; ++ znode *reference_point; ++ lock_handle lh; ++ lock_handle tmp_lh; ++ reiser4_tree *tree; ++ ++ assert("nikita-887", level != NULL); ++ assert("nikita-882", node != NULL); ++ ++ result = 0; ++ reference_point = node->node; ++ init_lh(&lh); ++ init_lh(&tmp_lh); ++ if (node->left_before) { ++ /* handling of new nodes, allocated on the previous level: ++ ++ some carry ops were propably posted from the new node, but ++ this node neither has parent pointer set, nor is ++ connected. 
This will be done in ->create_hook() for ++ internal item. ++ ++ Nonetheless, the parent of the new node has to be locked. To do ++ this, first go to the "left" in the carry order. This ++ depends on the decision to always allocate a new node on the ++ right of an existing one. ++ ++ The loop handles the case when multiple nodes, all orphans, were ++ inserted. ++ ++ Strictly speaking, taking the tree lock is not necessary here, ++ because all nodes scanned by the loop in ++ find_begetting_brother() are write-locked by this thread, ++ and thus, their sibling linkage cannot change. ++ ++ */ ++ tree = znode_get_tree(reference_point); ++ read_lock_tree(tree); ++ reference_point = find_begetting_brother(node, level)->node; ++ read_unlock_tree(tree); ++ assert("nikita-1186", reference_point != NULL); ++ } ++ if (node->parent && (result == 0)) { ++ result = ++ reiser4_get_parent(&tmp_lh, reference_point, ++ ZNODE_WRITE_LOCK); ++ if (result != 0) { ++ ; /* nothing */ ++ } else if (znode_get_level(tmp_lh.node) == 0) { ++ assert("nikita-1347", znode_above_root(tmp_lh.node)); ++ result = add_new_root(level, node, tmp_lh.node); ++ if (result == 0) { ++ reference_point = level->new_root; ++ move_lh(&lh, &node->lock_handle); ++ } ++ } else if ((level->new_root != NULL) ++ && (level->new_root != ++ znode_parent_nolock(reference_point))) { ++ /* the parent of the node exists, but this level already ++ created a different new root, so */ ++ warning("nikita-1109", ++ /* it should be "radicis", but tradition is ++ tradition. do banshees read latin? */ ++ "hodie natus est radici frater"); ++ result = -EIO; ++ } else { ++ move_lh(&lh, &tmp_lh); ++ reference_point = lh.node; ++ } ++ } ++ if (node->left && (result == 0)) { ++ assert("nikita-1183", node->parent); ++ assert("nikita-883", reference_point != NULL); ++ result = ++ reiser4_get_left_neighbor(&tmp_lh, reference_point, ++ ZNODE_WRITE_LOCK, ++ GN_CAN_USE_UPPER_LEVELS); ++ if (result == 0) { ++ done_lh(&lh); ++ move_lh(&lh, &tmp_lh); ++ reference_point = lh.node; ++ } ++ } ++ if (!node->parent && !node->left && !node->left_before) { ++ result = ++ longterm_lock_znode(&lh, reference_point, ZNODE_WRITE_LOCK, ++ ZNODE_LOCK_HIPRI); ++ } ++ if (result == 0) { ++ move_lh(&node->lock_handle, &lh); ++ result = lock_carry_node_tail(node); ++ } ++ done_lh(&tmp_lh); ++ done_lh(&lh); ++ return result; ++} ++ ++/* release a lock on &carry_node. ++ ++ Release, if necessary, the lock on @node. This operation is the pair of ++ lock_carry_node() and is idempotent: you can call it more than once on the ++ same node. ++ ++*/ ++static void ++unlock_carry_node(carry_level * level, ++ carry_node * node /* node to be released */ , ++ int failure /* 0 if node is unlocked due ++ * to some error */ ) ++{ ++ znode *real_node; ++ ++ assert("nikita-884", node != NULL); ++ ++ real_node = carry_real(node); ++ /* pair to zload() in lock_carry_node_tail() */ ++ zrelse(real_node); ++ if (node->unlock && (real_node != NULL)) { ++ assert("nikita-899", real_node == node->lock_handle.node); ++ longterm_unlock_znode(&node->lock_handle); ++ } ++ if (failure) { ++ if (node->deallocate && (real_node != NULL)) { ++ /* free node in bitmap ++ ++ Prepare node for removal. Last zput() will finish ++ with it.
++ */ ++ ZF_SET(real_node, JNODE_HEARD_BANSHEE); ++ } ++ if (node->free) { ++ assert("nikita-2177", ++ list_empty_careful(&node->lock_handle.locks_link)); ++ assert("nikita-2112", ++ list_empty_careful(&node->lock_handle.owners_link)); ++ reiser4_pool_free(&level->pool->node_pool, ++ &node->header); ++ } ++ } ++} ++ ++/* fatal_carry_error() - all-catching error handling function ++ ++ It is possible that carry faces an unrecoverable error, like inability to ++ insert a pointer at the internal level. Our simple solution is to just panic ++ in this situation. More sophisticated things, like an attempt to remount the ++ file system read-only, can be implemented without much difficulty. ++ ++ It is believed that: ++ ++ 1. instead of panicking, all current transactions can be aborted, rolling the ++ system back to a consistent state. ++ ++Umm, if you simply panic without doing anything more at all, then all current ++transactions are aborted and the system is rolled back to a consistent state, ++by virtue of the design of the transactional mechanism. Well, wait, let's be ++precise. If an internal node is corrupted on disk due to hardware failure, ++then there may be no consistent state that can be rolled back to, so instead ++we should say that it will rollback the transactions, which barring other ++factors means rolling back to a consistent state. ++ ++# Nikita: there is a subtle difference between panic and aborting ++# transactions: the machine doesn't reboot. Processes aren't killed. Processes ++# not using reiser4 (not that we care about such processes), or using other ++# reiser4 mounts (about them we do care), will simply continue to run. With ++# some luck, even an application using the aborted file system can survive: it ++# will get some error, like EBADF, from each file descriptor on the failed ++# file system, but applications that do care about tolerance will cope with ++# this (squid will). ++ ++It would be a nice feature though to support rollback without rebooting ++followed by remount, but this can wait for later versions. ++ ++ 2. once isolated transactions are implemented, it will be possible to ++ roll back the offending transaction. ++ ++2. is additional code complexity of inconsistent value (it implies that a broken tree should be kept in operation), so we must think about ++it more before deciding if it should be done. -Hans ++ ++*/ ++static void fatal_carry_error(carry_level * doing UNUSED_ARG /* carry level ++ * where ++ * unrecoverable ++ * error ++ * occurred */ , ++ int ecode /* error code */ ) ++{ ++ assert("nikita-1230", doing != NULL); ++ assert("nikita-1231", ecode < 0); ++ ++ reiser4_panic("nikita-1232", "Carry failed: %i", ecode); ++} ++ ++/* add new root to the tree ++ ++ This function itself only manages changes in carry structures and delegates ++ all hard work (allocation of a znode for the new root, changes of parent and ++ sibling pointers) to add_tree_root(). ++ ++ Locking: the old tree root is locked by carry at this point. The fake znode ++ is also locked. ++ ++*/ ++static int add_new_root(carry_level * level /* carry level in context of which ++ * operation is performed */ , ++ carry_node * node /* carry node for existing root */ , ++ znode * fake /* "fake" znode already locked by ++ * us */ ) ++{ ++ int result; ++ ++ assert("nikita-1104", level != NULL); ++ assert("nikita-1105", node != NULL); ++ ++ assert("nikita-1403", znode_is_write_locked(node->node)); ++ assert("nikita-1404", znode_is_write_locked(fake)); ++ ++ /* trying to create a new root.
*/ ++ /* @node is root and it's already locked by us. This ++ means that nobody else can be trying to add/remove ++ tree root right now. ++ */ ++ if (level->new_root == NULL) ++ level->new_root = add_tree_root(node->node, fake); ++ if (!IS_ERR(level->new_root)) { ++ assert("nikita-1210", znode_is_root(level->new_root)); ++ node->deallocate = 1; ++ result = ++ longterm_lock_znode(&node->lock_handle, level->new_root, ++ ZNODE_WRITE_LOCK, ZNODE_LOCK_LOPRI); ++ if (result == 0) ++ zput(level->new_root); ++ } else { ++ result = PTR_ERR(level->new_root); ++ level->new_root = NULL; ++ } ++ return result; ++} ++ ++/* allocate a new znode and add the operation that inserts the ++ pointer to it into the parent node into the todo level ++ ++ Allocate a new znode, add it into the carry queue and post into the @todo ++ queue a request to add a pointer to the new node into its parent. ++ ++ This is a carry-related routine that calls new_node() to allocate the new ++ node. ++*/ ++carry_node *add_new_znode(znode * brother /* existing left neighbor of new ++ * node */ , ++ carry_node * ref /* carry node after which new ++ * carry node is to be inserted ++ * into queue. This affects ++ * locking. */ , ++ carry_level * doing /* carry queue where new node is ++ * to be added */ , ++ carry_level * todo /* carry queue where COP_INSERT ++ * operation to add pointer to ++ * new node will be added */ ) ++{ ++ carry_node *fresh; ++ znode *new_znode; ++ carry_op *add_pointer; ++ carry_plugin_info info; ++ ++ assert("nikita-1048", brother != NULL); ++ assert("nikita-1049", todo != NULL); ++ ++ /* There are a lot of possible variations here: to which parent the ++ new node will be attached and where. For simplicity, always ++ do the following: ++ ++ (1) new node and @brother will have the same parent. ++ ++ (2) new node is added on the right of @brother ++ ++ */ ++ ++ fresh = add_carry_skip(doing, ref ? POOLO_AFTER : POOLO_LAST, ref); ++ if (IS_ERR(fresh)) ++ return fresh; ++ ++ fresh->deallocate = 1; ++ fresh->free = 1; ++ ++ new_znode = new_node(brother, znode_get_level(brother)); ++ if (IS_ERR(new_znode)) ++ /* @fresh will be deallocated automatically by error ++ handling code in the caller. */ ++ return (carry_node *) new_znode; ++ ++ /* new_znode returned znode with x_count 1. Caller has to decrease ++ it. make_space() does. */ ++ ++ ZF_SET(new_znode, JNODE_ORPHAN); ++ fresh->node = new_znode; ++ ++ while (ZF_ISSET(carry_real(ref), JNODE_ORPHAN)) { ++ ref = carry_node_prev(ref); ++ assert("nikita-1606", !carry_node_end(doing, ref)); ++ } ++ ++ info.todo = todo; ++ info.doing = doing; ++ add_pointer = node_post_carry(&info, COP_INSERT, carry_real(ref), 1); ++ if (IS_ERR(add_pointer)) { ++ /* no need to deallocate @new_znode here: it will be ++ deallocated during carry error handling. */ ++ return (carry_node *) add_pointer; ++ } ++ ++ add_pointer->u.insert.type = COPT_CHILD; ++ add_pointer->u.insert.child = fresh; ++ add_pointer->u.insert.brother = brother; ++ /* initially the new node spans an empty key range */ ++ write_lock_dk(znode_get_tree(brother)); ++ znode_set_ld_key(new_znode, ++ znode_set_rd_key(new_znode, ++ znode_get_rd_key(brother))); ++ write_unlock_dk(znode_get_tree(brother)); ++ return fresh; ++} ++ ++/* DEBUGGING FUNCTIONS. ++ ++ Probably we should also leave them enabled even when ++ debugging is turned off, to print dumps at errors.
++*/ ++#if REISER4_DEBUG ++static int carry_level_invariant(carry_level * level, carry_queue_state state) ++{ ++ carry_node *node; ++ carry_node *tmp_node; ++ ++ if (level == NULL) ++ return 0; ++ ++ if (level->track_type != 0 && ++ level->track_type != CARRY_TRACK_NODE && ++ level->track_type != CARRY_TRACK_CHANGE) ++ return 0; ++ ++ /* check that nodes are in ascending order */ ++ for_all_nodes(level, node, tmp_node) { ++ znode *left; ++ znode *right; ++ ++ reiser4_key lkey; ++ reiser4_key rkey; ++ ++ if (node != carry_node_front(level)) { ++ if (state == CARRY_TODO) { ++ right = node->node; ++ left = carry_node_prev(node)->node; ++ } else { ++ right = carry_real(node); ++ left = carry_real(carry_node_prev(node)); ++ } ++ if (right == NULL || left == NULL) ++ continue; ++ if (node_is_empty(right) || node_is_empty(left)) ++ continue; ++ if (!keyle(leftmost_key_in_node(left, &lkey), ++ leftmost_key_in_node(right, &rkey))) { ++ warning("", "wrong key order"); ++ return 0; ++ } ++ } ++ } ++ return 1; ++} ++#endif ++ ++/* get symbolic name for boolean */ ++static const char *tf(int boolean /* truth value */ ) ++{ ++ return boolean ? "t" : "f"; ++} ++ ++/* symbolic name for carry operation */ ++static const char *carry_op_name(carry_opcode op /* carry opcode */ ) ++{ ++ switch (op) { ++ case COP_INSERT: ++ return "COP_INSERT"; ++ case COP_DELETE: ++ return "COP_DELETE"; ++ case COP_CUT: ++ return "COP_CUT"; ++ case COP_PASTE: ++ return "COP_PASTE"; ++ case COP_UPDATE: ++ return "COP_UPDATE"; ++ case COP_EXTENT: ++ return "COP_EXTENT"; ++ case COP_INSERT_FLOW: ++ return "COP_INSERT_FLOW"; ++ default:{ ++ /* not mt safe, but who cares? */ ++ static char buf[20]; ++ ++ sprintf(buf, "unknown op: %x", op); ++ return buf; ++ } ++ } ++} ++ ++/* dump information about carry node */ ++static void print_carry(const char *prefix /* prefix to print */ , ++ carry_node * node /* node to print */ ) ++{ ++ if (node == NULL) { ++ printk("%s: null\n", prefix); ++ return; ++ } ++ printk ++ ("%s: %p parent: %s, left: %s, unlock: %s, free: %s, dealloc: %s\n", ++ prefix, node, tf(node->parent), tf(node->left), tf(node->unlock), ++ tf(node->free), tf(node->deallocate)); ++} ++ ++/* dump information about carry operation */ ++static void print_op(const char *prefix /* prefix to print */ , ++ carry_op * op /* operation to print */ ) ++{ ++ if (op == NULL) { ++ printk("%s: null\n", prefix); ++ return; ++ } ++ printk("%s: %p carry_opcode: %s\n", prefix, op, carry_op_name(op->op)); ++ print_carry("\tnode", op->node); ++ switch (op->op) { ++ case COP_INSERT: ++ case COP_PASTE: ++ print_coord("\tcoord", ++ op->u.insert.d ? op->u.insert.d->coord : NULL, 0); ++ print_key("\tkey", op->u.insert.d ? 
op->u.insert.d->key : NULL); ++ print_carry("\tchild", op->u.insert.child); ++ break; ++ case COP_DELETE: ++ print_carry("\tchild", op->u.delete.child); ++ break; ++ case COP_CUT: ++ if (op->u.cut_or_kill.is_cut) { ++ print_coord("\tfrom", ++ op->u.cut_or_kill.u.cut->params.from, 0); ++ print_coord("\tto", op->u.cut_or_kill.u.cut->params.to, ++ 0); ++ } else { ++ print_coord("\tfrom", ++ op->u.cut_or_kill.u.kill->params.from, 0); ++ print_coord("\tto", op->u.cut_or_kill.u.kill->params.to, ++ 0); ++ } ++ break; ++ case COP_UPDATE: ++ print_carry("\tleft", op->u.update.left); ++ break; ++ default: ++ /* do nothing */ ++ break; ++ } ++} ++ ++/* dump information about all nodes and operations in a @level */ ++static void print_level(const char *prefix /* prefix to print */ , ++ carry_level * level /* level to print */ ) ++{ ++ carry_node *node; ++ carry_node *tmp_node; ++ carry_op *op; ++ carry_op *tmp_op; ++ ++ if (level == NULL) { ++ printk("%s: null\n", prefix); ++ return; ++ } ++ printk("%s: %p, restartable: %s\n", ++ prefix, level, tf(level->restartable)); ++ ++ for_all_nodes(level, node, tmp_node) ++ print_carry("\tcarry node", node); ++ for_all_ops(level, op, tmp_op) ++ print_op("\tcarry op", op); ++} ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ scroll-step: 1 ++ End: ++*/ +Index: linux-2.6.16/fs/reiser4/carry.h +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/carry.h +@@ -0,0 +1,442 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++/* Functions and data types to "carry" tree modification(s) upward. ++ See fs/reiser4/carry.c for details. */ ++ ++#if !defined( __FS_REISER4_CARRY_H__ ) ++#define __FS_REISER4_CARRY_H__ ++ ++#include "forward.h" ++#include "debug.h" ++#include "pool.h" ++#include "znode.h" ++ ++#include <linux/types.h> ++ ++/* &carry_node - "location" of carry node. ++ ++ "location" of node that is involved or going to be involved into ++ carry process. Node where operation will be carried to on the ++ parent level cannot be recorded explicitly. Operation will be carried ++ usually to the parent of some node (where changes are performed at ++ the current level) or, to the left neighbor of its parent. But while ++ modifications are performed at the current level, parent may ++ change. So, we have to allow some indirection (or, positively, ++ flexibility) in locating carry nodes. ++ ++*/ ++typedef struct carry_node { ++ /* pool linkage */ ++ reiser4_pool_header header; ++ ++ /* base node from which real_node is calculated. See ++ fs/reiser4/carry.c:lock_carry_node().
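++ ++ Editorial gloss (restating the ->parent/->left bits below): with ++ ->parent set, the "real" node is the parent of ->node; with ->left ++ also set, it is the left neighbor of that parent. Roughly: ++ ++ real = node; ++ if (parent) real = parent(real); ++ if (left) real = left_neighbor(real); ++ ++ (cf. carry_real() in fs/reiser4/carry.c).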
*/ ++ znode *node; ++ ++ /* how to get ->real_node */ ++ /* to get ->real_node obtain parent of ->node */ ++ __u32 parent:1; ++ /* to get ->real_node obtain left neighbor of parent of ++ ->node */ ++ __u32 left:1; ++ __u32 left_before:1; ++ ++ /* locking */ ++ ++ /* this node was locked by carry process and should be ++ unlocked when carry leaves a level */ ++ __u32 unlock:1; ++ ++ /* disk block for this node was allocated by carry process and ++ should be deallocated when carry leaves a level */ ++ __u32 deallocate:1; ++ /* this carry node was allocated by carry process and should be ++ freed when carry leaves a level */ ++ __u32 free:1; ++ ++ /* type of lock we want to take on this node */ ++ lock_handle lock_handle; ++} carry_node; ++ ++/* &carry_opcode - elementary operations that can be carried upward ++ ++ Operations that carry() can handle. This list is supposed to be ++ expanded. ++ ++ Each carry operation (cop) is handled by appropriate function defined ++ in fs/reiser4/carry.c. For example COP_INSERT is handled by ++ fs/reiser4/carry.c:carry_insert() etc. These functions in turn ++ call plugins of nodes affected by operation to modify nodes' content ++ and to gather operations to be performed on the next level. ++ ++*/ ++typedef enum { ++ /* insert new item into node. */ ++ COP_INSERT, ++ /* delete pointer from parent node */ ++ COP_DELETE, ++ /* remove part of or whole node. */ ++ COP_CUT, ++ /* increase size of item. */ ++ COP_PASTE, ++ /* insert extent (that is sequence of unformatted nodes). */ ++ COP_EXTENT, ++ /* update delimiting key in least common ancestor of two ++ nodes. This is performed when items are moved between two ++ nodes. ++ */ ++ COP_UPDATE, ++ /* insert flow */ ++ COP_INSERT_FLOW, ++ COP_LAST_OP, ++} carry_opcode; ++ ++#define CARRY_FLOW_NEW_NODES_LIMIT 20 ++ ++/* mode (or subtype) of COP_{INSERT|PASTE} operation. Specifies how target ++ item is determined. */ ++typedef enum { ++ /* target item is one containing pointer to the ->child node */ ++ COPT_CHILD, ++ /* target item is given explicitly by @coord */ ++ COPT_ITEM_DATA, ++ /* target item is given by key */ ++ COPT_KEY, ++ /* see insert_paste_common() for more comments on this. */ ++ COPT_PASTE_RESTARTED, ++} cop_insert_pos_type; ++ ++/* flags to cut and delete */ ++typedef enum { ++ /* don't kill node even if it became completely empty as a result of ++ * cut. This is needed for eottl handling. See carry_extent() for ++ * details. */ ++ DELETE_RETAIN_EMPTY = (1 << 0) ++} cop_delete_flag; ++ ++/* ++ * carry() implements "lock handle tracking" feature. ++ * ++ * Callers supply carry with node where to perform initial operation and lock ++ * handle on this node. Trying to optimize node utilization, carry may actually ++ * move insertion point to different node. Callers expect that lock handle ++ * will be transferred to the new node also. ++ * ++ */ ++typedef enum { ++ /* transfer lock handle along with insertion point */ ++ CARRY_TRACK_CHANGE = 1, ++ /* acquire new lock handle to the node where insertion point is. This ++ * is used when carry() client doesn't initially possess lock handle ++ * on the insertion point node, for example, by extent insertion ++ * code. See carry_extent().
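++ * ++ * A minimal caller-side sketch (editorial; the fields are those of ++ * carry_level below, the lock handle is the caller's own): ++ * ++ * doing->track_type = CARRY_TRACK_CHANGE; ++ * doing->tracked = &callers_lh; ++ * ++ * make_space_tail() in carry_ops.c then relocks @tracked whenever the ++ * insertion point ends up in a different node.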
*/ ++ CARRY_TRACK_NODE = 2 ++} carry_track_type; ++ ++/* data supplied to COP_{INSERT|PASTE} by callers */ ++typedef struct carry_insert_data { ++ /* position where new item is to be inserted */ ++ coord_t *coord; ++ /* new item description */ ++ reiser4_item_data *data; ++ /* key of new item */ ++ const reiser4_key *key; ++} carry_insert_data; ++ ++/* cut and kill are similar, so carry_cut_data and carry_kill_data share the below structure of parameters */ ++struct cut_kill_params { ++ /* coord where cut starts (inclusive) */ ++ coord_t *from; ++ /* coord where cut stops (inclusive, this item/unit will also be ++ * cut) */ ++ coord_t *to; ++ /* starting key. This is necessary when item and unit pos don't ++ * uniquely identify what portion of tree to remove. For example, this ++ * indicates what portion of extent unit will be affected. */ ++ const reiser4_key *from_key; ++ /* exclusive stop key */ ++ const reiser4_key *to_key; ++ /* if this is not NULL, smallest actually removed key is stored ++ * here. */ ++ reiser4_key *smallest_removed; ++ /* kill_node_content() is called for file truncate */ ++ int truncate; ++}; ++ ++struct carry_cut_data { ++ struct cut_kill_params params; ++}; ++ ++struct carry_kill_data { ++ struct cut_kill_params params; ++ /* parameter to be passed to the ->kill_hook() method of item ++ * plugin */ ++ /*void *iplug_params; *//* FIXME: unused currently */ ++ /* if not NULL---inode whose items are being removed. This is needed ++ * for ->kill_hook() of extent item to update VM structures when ++ * removing pages. */ ++ struct inode *inode; ++ /* sibling list maintenance is complicated by existence of eottl. When ++ * eottl whose left and right neighbors are formatted leaves is ++ * removed, one has to connect said leaves in the sibling list. This ++ * cannot be done when extent removal is just started as locking rules ++ * require sibling list update to happen atomically with removal of ++ * extent item. Therefore: 1. pointers to left and right neighbors ++ * have to be passed down to the ->kill_hook() of extent item, and ++ * 2. said neighbors have to be locked. */ ++ lock_handle *left; ++ lock_handle *right; ++ /* flags modifying behavior of kill. Currently, it may have DELETE_RETAIN_EMPTY set. */ ++ unsigned flags; ++ char *buf; ++}; ++ ++/* &carry_tree_op - operation to "carry" upward. ++ ++ Description of an operation we want to "carry" to the upper level of ++ a tree: e.g., when we insert something and there is not enough space ++ we allocate a new node and "carry" the operation of inserting a ++ pointer to the new node to the upper level, on removal of empty node, ++ we carry up operation of removing appropriate entry from parent. ++ ++ There are two types of carry ops: when adding or deleting a node, the ++ node at the parent level where the appropriate modification has to be ++ performed is known in advance. When shifting items between nodes ++ (split, merge), delimiting key should be changed in the least common ++ parent of the nodes involved, which is not known in advance. ++ ++ For the operations of the first type we store in &carry_op pointer to ++ the &carry_node at the parent level. For the operation of the second ++ type we store &carry_node for parents of the left and right nodes ++ modified and keep track of them upward until they coincide.
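++ ++ For example (editorial restatement of the union in carry_op below): ++ removal of an emptied node posts COP_DELETE with u.delete.child naming ++ the removed child, so the parent to modify is known in advance; ++ shifting items between two nodes posts COP_UPDATE with u.update.left ++ naming the left node, and the delimiting key is corrected where the ++ parent chains of the two nodes meet.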
++ ++*/ ++typedef struct carry_op { ++ /* pool linkage */ ++ reiser4_pool_header header; ++ carry_opcode op; ++ /* node on which operation is to be performed: ++ ++ for insert, paste: node where new item is to be inserted ++ ++ for delete: node where pointer is to be deleted ++ ++ for cut: node to cut from ++ ++ for update: node where delimiting key is to be modified ++ ++ for modify: parent of modified node ++ ++ */ ++ carry_node *node; ++ union { ++ struct { ++ /* (sub-)type of insertion/paste. Taken from ++ cop_insert_pos_type. */ ++ __u8 type; ++ /* various operation flags. Taken from ++ cop_insert_flag. */ ++ __u8 flags; ++ carry_insert_data *d; ++ carry_node *child; ++ znode *brother; ++ } insert, paste, extent; ++ ++ struct { ++ int is_cut; ++ union { ++ carry_kill_data *kill; ++ carry_cut_data *cut; ++ } u; ++ } cut_or_kill; ++ ++ struct { ++ carry_node *left; ++ } update; ++ struct { ++ /* changed child */ ++ carry_node *child; ++ /* bitmask of changes. See &cop_modify_flag */ ++ __u32 flag; ++ } modify; ++ struct { ++ /* flags to deletion operation. Are taken from ++ cop_delete_flag */ ++ __u32 flags; ++ /* child to delete from parent. If this is ++ NULL, delete op->node. */ ++ carry_node *child; ++ } delete; ++ struct { ++ /* various operation flags. Taken from ++ cop_insert_flag. */ ++ __u32 flags; ++ flow_t *flow; ++ coord_t *insert_point; ++ reiser4_item_data *data; ++ /* flow insertion is limited by number of new blocks ++ added in that operation which do not get any data ++ but part of flow. This limit is set by macro ++ CARRY_FLOW_NEW_NODES_LIMIT. This field stores number ++ of nodes added already during one carry_flow */ ++ int new_nodes; ++ } insert_flow; ++ } u; ++} carry_op; ++ ++/* &carry_op_pool - preallocated pool of carry operations, and nodes */ ++typedef struct carry_pool { ++ carry_op op[CARRIES_POOL_SIZE]; ++ reiser4_pool op_pool; ++ carry_node node[NODES_LOCKED_POOL_SIZE]; ++ reiser4_pool node_pool; ++} carry_pool; ++ ++/* &carry_tree_level - carry process on given level ++ ++ Description of balancing process on the given level. ++ ++ No need for locking here, as carry_tree_level is essentially per ++ thread thing (for now). ++ ++*/ ++struct carry_level { ++ /* this level may be restarted */ ++ __u32 restartable:1; ++ /* list of carry nodes on this level, ordered by key order */ ++ struct list_head nodes; ++ struct list_head ops; ++ /* pool where new objects are allocated from */ ++ carry_pool *pool; ++ int ops_num; ++ int nodes_num; ++ /* new root created on this level, if any */ ++ znode *new_root; ++ /* This is set by caller (insert_by_key(), resize_item(), etc.) when ++ they want ->tracked to automagically wander to the node where ++ insertion point moved after insert or paste. ++ */ ++ carry_track_type track_type; ++ /* lock handle supplied by user that we are tracking. See ++ above. 
*/ ++ lock_handle *tracked; ++}; ++ ++/* information carry passes to plugin methods that may add new operations to ++ the @todo queue */ ++struct carry_plugin_info { ++ carry_level *doing; ++ carry_level *todo; ++}; ++ ++int carry(carry_level * doing, carry_level * done); ++ ++carry_node *add_carry(carry_level * level, pool_ordering order, ++ carry_node * reference); ++carry_node *add_carry_skip(carry_level * level, pool_ordering order, ++ carry_node * reference); ++ ++extern carry_node *insert_carry_node(carry_level * doing, ++ carry_level * todo, const znode * node); ++ ++extern carry_pool *init_carry_pool(int); ++extern void done_carry_pool(carry_pool * pool); ++ ++extern void init_carry_level(carry_level * level, carry_pool * pool); ++ ++extern carry_op *post_carry(carry_level * level, carry_opcode op, znode * node, ++ int apply_to_parent); ++extern carry_op *node_post_carry(carry_plugin_info * info, carry_opcode op, ++ znode * node, int apply_to_parent_p); ++ ++carry_node *add_new_znode(znode * brother, carry_node * reference, ++ carry_level * doing, carry_level * todo); ++ ++carry_node *find_carry_node(carry_level * level, const znode * node); ++ ++extern znode *carry_real(const carry_node * node); ++ ++/* helper macros to iterate over carry queues */ ++ ++#define carry_node_next( node ) \ ++ list_entry((node)->header.level_linkage.next, carry_node, \ ++ header.level_linkage) ++ ++#define carry_node_prev( node ) \ ++ list_entry((node)->header.level_linkage.prev, carry_node, \ ++ header.level_linkage) ++ ++#define carry_node_front( level ) \ ++ list_entry((level)->nodes.next, carry_node, header.level_linkage) ++ ++#define carry_node_back( level ) \ ++ list_entry((level)->nodes.prev, carry_node, header.level_linkage) ++ ++#define carry_node_end( level, node ) \ ++ (&(level)->nodes == &(node)->header.level_linkage) ++ ++/* macro to iterate over all operations in a @level */ ++#define for_all_ops( level /* carry level (of type carry_level *) */, \ ++ op /* pointer to carry operation, modified by loop (of \ ++ * type carry_op *) */, \ ++ tmp /* pointer to carry operation (of type carry_op *), \ ++ * used to make iterator stable in the face of \ ++ * deletions from the level */ ) \ ++for (op = list_entry(level->ops.next, carry_op, header.level_linkage), \ ++ tmp = list_entry(op->header.level_linkage.next, carry_op, header.level_linkage); \ ++ &op->header.level_linkage != &level->ops; \ ++ op = tmp, \ ++ tmp = list_entry(op->header.level_linkage.next, carry_op, header.level_linkage)) ++ ++#if 0 ++for( op = ( carry_op * ) pool_level_list_front( &level -> ops ), \ ++ tmp = ( carry_op * ) pool_level_list_next( &op -> header ) ; \ ++ ! 
pool_level_list_end( &level -> ops, &op -> header ) ; \ ++ op = tmp, tmp = ( carry_op * ) pool_level_list_next( &op -> header ) ) ++#endif ++ ++/* macro to iterate over all nodes in a @level */ ++#define for_all_nodes( level /* carry level (of type carry_level *) */, \ ++ node /* pointer to carry node, modified by loop (of \ ++ * type carry_node *) */, \ ++ tmp /* pointer to carry node (of type carry_node *), \ ++ * used to make iterator stable in the face of \ ++ * deletions from the level */ ) \ ++for (node = list_entry(level->nodes.next, carry_node, header.level_linkage), \ ++ tmp = list_entry(node->header.level_linkage.next, carry_node, header.level_linkage); \ ++ &node->header.level_linkage != &level->nodes; \ ++ node = tmp, \ ++ tmp = list_entry(node->header.level_linkage.next, carry_node, header.level_linkage)) ++ ++#if 0 ++for( node = carry_node_front( level ), \ ++ tmp = carry_node_next( node ) ; ! carry_node_end( level, node ) ; \ ++ node = tmp, tmp = carry_node_next( node ) ) ++#endif ++ ++/* macro to iterate over all nodes in a @level in reverse order ++ ++ This is used, because nodes are unlocked in reversed order of locking */ ++#define for_all_nodes_back( level /* carry level (of type carry_level *) */, \ ++ node /* pointer to carry node, modified by loop \ ++ * (of type carry_node *) */, \ ++ tmp /* pointer to carry node (of type carry_node \ ++ * *), used to make iterator stable in the \ ++ * face of deletions from the level */ ) \ ++for( node = carry_node_back( level ), \ ++ tmp = carry_node_prev( node ) ; ! carry_node_end( level, node ) ; \ ++ node = tmp, tmp = carry_node_prev( node ) ) ++ ++/* __FS_REISER4_CARRY_H__ */ ++#endif ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ scroll-step: 1 ++ End: ++*/ +Index: linux-2.6.16/fs/reiser4/carry_ops.c +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/carry_ops.c +@@ -0,0 +1,2103 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++/* implementation of carry operations */ ++ ++#include "forward.h" ++#include "debug.h" ++#include "key.h" ++#include "coord.h" ++#include "plugin/item/item.h" ++#include "plugin/node/node.h" ++#include "jnode.h" ++#include "znode.h" ++#include "block_alloc.h" ++#include "tree_walk.h" ++#include "pool.h" ++#include "tree_mod.h" ++#include "carry.h" ++#include "carry_ops.h" ++#include "tree.h" ++#include "super.h" ++#include "reiser4.h" ++ ++#include <linux/types.h> ++#include <linux/err.h> ++ ++static int carry_shift_data(sideof side, coord_t * insert_coord, znode * node, ++ carry_level * doing, carry_level * todo, ++ unsigned int including_insert_coord_p); ++ ++extern int lock_carry_node(carry_level * level, carry_node * node); ++extern int lock_carry_node_tail(carry_node * node); ++ ++/* find left neighbor of a carry node ++ ++ Look for left neighbor of @node and add it to the @doing queue. See ++ comments in the body. ++ ++*/ ++static carry_node *find_left_neighbor(carry_op * op /* operation to find left ++ * neighbor of */ , ++ carry_level * doing /* level to scan */ ) ++{ ++ int result; ++ carry_node *node; ++ carry_node *left; ++ int flags; ++ reiser4_tree *tree; ++ ++ node = op->node; ++ ++ tree = current_tree; ++ read_lock_tree(tree); ++ /* first, check whether left neighbor is already in a @doing queue */ ++ if (carry_real(node)->left != NULL) { ++ /* NOTE: there is locking subtlety here.
Look into ++ * find_right_neighbor() for more info */ ++ if (find_carry_node(doing, carry_real(node)->left) != NULL) { ++ read_unlock_tree(tree); ++ left = node; ++ do { ++ left = list_entry(left->header.level_linkage.prev, ++ carry_node, header.level_linkage); ++ assert("nikita-3408", !carry_node_end(doing, ++ left)); ++ } while (carry_real(left) == carry_real(node)); ++ return left; ++ } ++ } ++ read_unlock_tree(tree); ++ ++ left = add_carry_skip(doing, POOLO_BEFORE, node); ++ if (IS_ERR(left)) ++ return left; ++ ++ left->node = node->node; ++ left->free = 1; ++ ++ flags = GN_TRY_LOCK; ++ if (!(op->u.insert.flags & COPI_LOAD_LEFT)) ++ flags |= GN_NO_ALLOC; ++ ++ /* then, feeling lucky, peek left neighbor in the cache. */ ++ result = reiser4_get_left_neighbor(&left->lock_handle, carry_real(node), ++ ZNODE_WRITE_LOCK, flags); ++ if (result == 0) { ++ /* ok, node found and locked. */ ++ result = lock_carry_node_tail(left); ++ if (result != 0) ++ left = ERR_PTR(result); ++ } else if (result == -E_NO_NEIGHBOR || result == -ENOENT) { ++ /* node is leftmost node in a tree, or neighbor wasn't in ++ cache, or there is an extent on the left. */ ++ reiser4_pool_free(&doing->pool->node_pool, &left->header); ++ left = NULL; ++ } else if (doing->restartable) { ++ /* if left neighbor is locked, and level is restartable, add ++ new node to @doing and restart. */ ++ assert("nikita-913", node->parent != 0); ++ assert("nikita-914", node->node != NULL); ++ left->left = 1; ++ left->free = 0; ++ left = ERR_PTR(-E_REPEAT); ++ } else { ++ /* left neighbor is locked, level cannot be restarted. Just ++ ignore left neighbor. */ ++ reiser4_pool_free(&doing->pool->node_pool, &left->header); ++ left = NULL; ++ } ++ return left; ++} ++ ++/* find right neighbor of a carry node ++ ++ Look for right neighbor of @node and add it to the @doing queue. See ++ comments in the body. ++ ++*/ ++static carry_node *find_right_neighbor(carry_op * op /* operation to find right ++ * neighbor of */ , ++ carry_level * doing /* level to scan */ ) ++{ ++ int result; ++ carry_node *node; ++ carry_node *right; ++ lock_handle lh; ++ int flags; ++ reiser4_tree *tree; ++ ++ init_lh(&lh); ++ ++ node = op->node; ++ ++ tree = current_tree; ++ read_lock_tree(tree); ++ /* first, check whether right neighbor is already in a @doing queue */ ++ if (carry_real(node)->right != NULL) { ++ /* ++ * Tree lock is taken here anyway, because, even if _outcome_ ++ * of (find_carry_node() != NULL) doesn't depend on ++ * concurrent updates to ->right, find_carry_node() cannot ++ * work with second argument NULL. Hence, the following comment ++ * is of historic importance only. ++ * ++ * Subtle: ++ * ++ * Q: why don't we need tree lock here, looking for the right ++ * neighbor? ++ * ++ * A: even if value of node->real_node->right were changed ++ * during find_carry_node() execution, outcome of execution ++ * wouldn't change, because (in short) other thread cannot add ++ * elements to the @doing, and if node->real_node->right ++ * already was in @doing, value of node->real_node->right ++ * couldn't change, because node cannot be inserted between ++ * locked neighbors. ++ */ ++ if (find_carry_node(doing, carry_real(node)->right) != NULL) { ++ read_unlock_tree(tree); ++ /* ++ * What we are doing here is the following (this is ++ * also applicable to find_left_neighbor()).
++ * ++ * tree_walk.c code requires that insertion of a ++ * pointer to a child, modification of parent pointer ++ * in the child, and insertion of the child into ++ * sibling list are atomic (see ++ * plugin/item/internal.c:create_hook_internal()). ++ * ++ * carry allocates new node long before pointer to it ++ * is inserted into parent and, actually, long before ++ * parent is even known. Such allocated-but-orphaned ++ * nodes are only trackable through carry level lists. ++ * ++ * The situation handled here is the following: @node ++ * has valid ->right pointer, but there is ++ * allocated-but-orphaned node in the carry queue that ++ * is logically between @node and @node->right. Here ++ * we are searching for it. Critical point is that ++ * this is only possible if @node->right is also in ++ * the carry queue (this is checked above), because ++ * this is the only way new orphaned node could be ++ * inserted between them (before inserting new node, ++ * make_space() first tries to shift to the right, so, ++ * right neighbor will be locked and queued). ++ * ++ */ ++ right = node; ++ do { ++ right = list_entry(right->header.level_linkage.next, ++ carry_node, header.level_linkage); ++ assert("nikita-3408", !carry_node_end(doing, ++ right)); ++ } while (carry_real(right) == carry_real(node)); ++ return right; ++ } ++ } ++ read_unlock_tree(tree); ++ ++ flags = GN_CAN_USE_UPPER_LEVELS; ++ if (!(op->u.insert.flags & COPI_LOAD_RIGHT)) ++ flags = GN_NO_ALLOC; ++ ++ /* then, try to lock right neighbor */ ++ init_lh(&lh); ++ result = reiser4_get_right_neighbor(&lh, carry_real(node), ++ ZNODE_WRITE_LOCK, flags); ++ if (result == 0) { ++ /* ok, node found and locked. */ ++ right = add_carry_skip(doing, POOLO_AFTER, node); ++ if (!IS_ERR(right)) { ++ right->node = lh.node; ++ move_lh(&right->lock_handle, &lh); ++ right->free = 1; ++ result = lock_carry_node_tail(right); ++ if (result != 0) ++ right = ERR_PTR(result); ++ } ++ } else if ((result == -E_NO_NEIGHBOR) || (result == -ENOENT)) { ++ /* node is rightmost node in a tree, or neighbor wasn't in ++ cache, or there is an extent on the right. */ ++ right = NULL; ++ } else ++ right = ERR_PTR(result); ++ done_lh(&lh); ++ return right; ++} ++ ++/* how much free space in a @node is needed for @op ++ ++ How much space in @node is required for completion of @op, where @op is ++ insert or paste operation. ++*/ ++static unsigned int space_needed_for_op(znode * node /* znode data are ++ * inserted or ++ * pasted in */ , ++ carry_op * op /* carry ++ operation */ ) ++{ ++ assert("nikita-919", op != NULL); ++ ++ switch (op->op) { ++ default: ++ impossible("nikita-1701", "Wrong opcode"); ++ case COP_INSERT: ++ return space_needed(node, NULL, op->u.insert.d->data, 1); ++ case COP_PASTE: ++ return space_needed(node, op->u.insert.d->coord, ++ op->u.insert.d->data, 0); ++ } ++} ++ ++/* how much space in @node is required to insert or paste @data at ++ @coord.
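++ ++ Editorial sketch of the computation in the body below: ++ ++ size = iplug->b.estimate ? iplug->b.estimate(insertion ? NULL : coord, data) ++ : data->length; ++ if (insertion) ++ size += nplug->item_overhead(node, NULL); ++ ++ i.e., the item plugin estimates the body, and an insertion additionally ++ pays the per-item node overhead (the item header).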
*/ ++unsigned int space_needed(const znode * node /* node data are inserted or ++ * pasted in */ , ++ const coord_t * coord /* coord where data are ++ * inserted or pasted ++ * at */ , ++ const reiser4_item_data * data /* data to insert or ++ * paste */ , ++ int insertion /* non-0 is inserting, 0---paste */ ) ++{ ++ int result; ++ item_plugin *iplug; ++ ++ assert("nikita-917", node != NULL); ++ assert("nikita-918", node_plugin_by_node(node) != NULL); ++ assert("vs-230", !insertion || (coord == NULL)); ++ ++ result = 0; ++ iplug = data->iplug; ++ if (iplug->b.estimate != NULL) { ++ /* ask item plugin how much space is needed to insert this ++ item */ ++ result += iplug->b.estimate(insertion ? NULL : coord, data); ++ } else { ++ /* reasonable default */ ++ result += data->length; ++ } ++ if (insertion) { ++ node_plugin *nplug; ++ ++ nplug = node->nplug; ++ /* and add node overhead */ ++ if (nplug->item_overhead != NULL) { ++ result += nplug->item_overhead(node, NULL); ++ } ++ } ++ return result; ++} ++ ++/* find &coord in parent where pointer to new child is to be stored. */ ++static int find_new_child_coord(carry_op * op /* COP_INSERT carry operation to ++ * insert pointer to new ++ * child */ ) ++{ ++ int result; ++ znode *node; ++ znode *child; ++ ++ assert("nikita-941", op != NULL); ++ assert("nikita-942", op->op == COP_INSERT); ++ ++ node = carry_real(op->node); ++ assert("nikita-943", node != NULL); ++ assert("nikita-944", node_plugin_by_node(node) != NULL); ++ ++ child = carry_real(op->u.insert.child); ++ result = ++ find_new_child_ptr(node, child, op->u.insert.brother, ++ op->u.insert.d->coord); ++ ++ build_child_ptr_data(child, op->u.insert.d->data); ++ return result; ++} ++ ++/* additional amount of free space in @node required to complete @op */ ++static int free_space_shortage(znode * node /* node to check */ , ++ carry_op * op /* operation being performed */ ) ++{ ++ assert("nikita-1061", node != NULL); ++ assert("nikita-1062", op != NULL); ++ ++ switch (op->op) { ++ default: ++ impossible("nikita-1702", "Wrong opcode"); ++ case COP_INSERT: ++ case COP_PASTE: ++ return space_needed_for_op(node, op) - znode_free_space(node); ++ case COP_EXTENT: ++ /* when inserting extent shift data around until insertion ++ point is utmost in the node. */ ++ if (coord_wrt(op->u.insert.d->coord) == COORD_INSIDE) ++ return +1; ++ else ++ return -1; ++ } ++} ++ ++/* helper function: update node pointer in operation after insertion ++ point was probably shifted into @target. */ ++static znode *sync_op(carry_op * op, carry_node * target) ++{ ++ znode *insertion_node; ++ ++ /* reget node from coord: shift might move insertion coord to ++ the neighbor */ ++ insertion_node = op->u.insert.d->coord->node; ++ /* if insertion point was actually moved into new node, ++ update carry node pointer in operation. */ ++ if (insertion_node != carry_real(op->node)) { ++ op->node = target; ++ assert("nikita-2540", carry_real(target) == insertion_node); ++ } ++ assert("nikita-2541", ++ carry_real(op->node) == op->u.insert.d->coord->node); ++ return insertion_node; ++} ++ ++/* ++ * complete make_space() call: update tracked lock handle if necessary. 
See ++ * comments for fs/reiser4/carry.h:carry_track_type ++ */ ++static int ++make_space_tail(carry_op * op, carry_level * doing, znode * orig_node) ++{ ++ int result; ++ carry_track_type tracking; ++ znode *node; ++ ++ tracking = doing->track_type; ++ node = op->u.insert.d->coord->node; ++ ++ if (tracking == CARRY_TRACK_NODE || ++ (tracking == CARRY_TRACK_CHANGE && node != orig_node)) { ++ /* inserting or pasting into node different from ++ original. Update lock handle supplied by caller. */ ++ assert("nikita-1417", doing->tracked != NULL); ++ done_lh(doing->tracked); ++ init_lh(doing->tracked); ++ result = longterm_lock_znode(doing->tracked, node, ++ ZNODE_WRITE_LOCK, ++ ZNODE_LOCK_HIPRI); ++ } else ++ result = 0; ++ return result; ++} ++ ++/* This is insertion policy function. It shifts data to the left and right ++ neighbors of insertion coord and allocates new nodes until there is enough ++ free space to complete @op. ++ ++ See comments in the body. ++ ++ Assumes that the node format favors insertions at the right end of the node ++ as node40 does. ++ ++ See carry_flow() on detail about flow insertion ++*/ ++static int make_space(carry_op * op /* carry operation, insert or paste */ , ++ carry_level * doing /* current carry queue */ , ++ carry_level * todo /* carry queue on the parent level */ ) ++{ ++ znode *node; ++ int result; ++ int not_enough_space; ++ int blk_alloc; ++ znode *orig_node; ++ __u32 flags; ++ ++ coord_t *coord; ++ ++ assert("nikita-890", op != NULL); ++ assert("nikita-891", todo != NULL); ++ assert("nikita-892", ++ op->op == COP_INSERT || ++ op->op == COP_PASTE || op->op == COP_EXTENT); ++ assert("nikita-1607", ++ carry_real(op->node) == op->u.insert.d->coord->node); ++ ++ flags = op->u.insert.flags; ++ ++ /* NOTE check that new node can only be allocated after checking left ++ * and right neighbors. This is necessary for proper work of ++ * find_{left,right}_neighbor(). */ ++ assert("nikita-3410", ergo(flags & COPI_DONT_ALLOCATE, ++ flags & COPI_DONT_SHIFT_LEFT)); ++ assert("nikita-3411", ergo(flags & COPI_DONT_ALLOCATE, ++ flags & COPI_DONT_SHIFT_RIGHT)); ++ ++ coord = op->u.insert.d->coord; ++ orig_node = node = coord->node; ++ ++ assert("nikita-908", node != NULL); ++ assert("nikita-909", node_plugin_by_node(node) != NULL); ++ ++ result = 0; ++ /* If there is not enough space in a node, try to shift something to ++ the left neighbor. This is a bit tricky, as locking to the left is ++ low priority. This is handled by restart logic in carry(). ++ */ ++ not_enough_space = free_space_shortage(node, op); ++ if (not_enough_space <= 0) ++ /* it is possible that carry was called when there actually ++ was enough space in the node. For example, when inserting ++ leftmost item so that delimiting keys have to be updated. ++ */ ++ return make_space_tail(op, doing, orig_node); ++ if (!(flags & COPI_DONT_SHIFT_LEFT)) { ++ carry_node *left; ++ /* make note in statistics of an attempt to move ++ something into the left neighbor */ ++ left = find_left_neighbor(op, doing); ++ if (unlikely(IS_ERR(left))) { ++ if (PTR_ERR(left) == -E_REPEAT) ++ return -E_REPEAT; ++ else { ++ /* some error other than restart request ++ occurred. This shouldn't happen. Issue a ++ warning and continue as if left neighbor ++ weren't existing. 
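++ ++ (Editorial note: the -E_REPEAT case above is the "restart logic ++ in carry()" mentioned earlier: because locking leftward is low ++ priority, carry() retries the whole level with the left neighbor ++ already queued by find_left_neighbor().)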
++ */ ++ warning("nikita-924", ++ "Error accessing left neighbor: %li", ++ PTR_ERR(left)); ++ } ++ } else if (left != NULL) { ++ ++ /* shift everything possible on the left of and ++ including insertion coord into the left neighbor */ ++ result = carry_shift_data(LEFT_SIDE, coord, ++ carry_real(left), doing, todo, ++ flags & COPI_GO_LEFT); ++ ++ /* reget node from coord: shift_left() might move ++ insertion coord to the left neighbor */ ++ node = sync_op(op, left); ++ ++ not_enough_space = free_space_shortage(node, op); ++ /* There is not enough free space in @node, but ++ may be, there is enough free space in ++ @left. Various balancing decisions are valid here. ++ The same for the shifiting to the right. ++ */ ++ } ++ } ++ /* If there still is not enough space, shift to the right */ ++ if (not_enough_space > 0 && !(flags & COPI_DONT_SHIFT_RIGHT)) { ++ carry_node *right; ++ ++ right = find_right_neighbor(op, doing); ++ if (IS_ERR(right)) { ++ warning("nikita-1065", ++ "Error accessing right neighbor: %li", ++ PTR_ERR(right)); ++ } else if (right != NULL) { ++ /* node containing insertion point, and its right ++ neighbor node are write locked by now. ++ ++ shift everything possible on the right of but ++ excluding insertion coord into the right neighbor ++ */ ++ result = carry_shift_data(RIGHT_SIDE, coord, ++ carry_real(right), ++ doing, todo, ++ flags & COPI_GO_RIGHT); ++ /* reget node from coord: shift_right() might move ++ insertion coord to the right neighbor */ ++ node = sync_op(op, right); ++ not_enough_space = free_space_shortage(node, op); ++ } ++ } ++ /* If there is still not enough space, allocate new node(s). ++ ++ We try to allocate new blocks if COPI_DONT_ALLOCATE is not set in ++ the carry operation flags (currently this is needed during flush ++ only). ++ */ ++ for (blk_alloc = 0; ++ not_enough_space > 0 && result == 0 && blk_alloc < 2 && ++ !(flags & COPI_DONT_ALLOCATE); ++blk_alloc) { ++ carry_node *fresh; /* new node we are allocating */ ++ coord_t coord_shadow; /* remembered insertion point before ++ * shifting data into new node */ ++ carry_node *node_shadow; /* remembered insertion node before ++ * shifting */ ++ unsigned int gointo; /* whether insertion point should move ++ * into newly allocated node */ ++ ++ /* allocate new node on the right of @node. Znode and disk ++ fake block number for new node are allocated. ++ ++ add_new_znode() posts carry operation COP_INSERT with ++ COPT_CHILD option to the parent level to add ++ pointer to newly created node to its parent. ++ ++ Subtle point: if several new nodes are required to complete ++ insertion operation at this level, they will be inserted ++ into their parents in the order of creation, which means ++ that @node will be valid "cookie" at the time of insertion. ++ ++ */ ++ fresh = add_new_znode(node, op->node, doing, todo); ++ if (IS_ERR(fresh)) ++ return PTR_ERR(fresh); ++ ++ /* Try to shift into new node. */ ++ result = lock_carry_node(doing, fresh); ++ zput(carry_real(fresh)); ++ if (result != 0) { ++ warning("nikita-947", ++ "Cannot lock new node: %i", result); ++ return result; ++ } ++ ++ /* both nodes are write locked by now. ++ ++ shift everything possible on the right of and ++ including insertion coord into the right neighbor. ++ */ ++ coord_dup(&coord_shadow, op->u.insert.d->coord); ++ node_shadow = op->node; ++ /* move insertion point into newly created node if: ++ ++ . insertion point is rightmost in the source node, or ++ . this is not the first node we are allocating in a row. 
++ */ ++ gointo = ++ (blk_alloc > 0) || ++ coord_is_after_rightmost(op->u.insert.d->coord); ++ ++ result = carry_shift_data(RIGHT_SIDE, coord, carry_real(fresh), ++ doing, todo, gointo); ++ /* if insertion point was actually moved into new node, ++ update carry node pointer in operation. */ ++ node = sync_op(op, fresh); ++ not_enough_space = free_space_shortage(node, op); ++ if ((not_enough_space > 0) && (node != coord_shadow.node)) { ++ /* there is not enough free space in the new node. ++ Shift insertion point back to @node_shadow so that ++ the next new node would be inserted between ++ @node_shadow and @fresh. ++ */ ++ coord_normalize(&coord_shadow); ++ coord_dup(coord, &coord_shadow); ++ node = coord->node; ++ op->node = node_shadow; ++ if (1 || (flags & COPI_STEP_BACK)) { ++ /* still not enough space?! Maybe there is ++ enough space in the source node (i.e., node ++ data are moved from) now. ++ */ ++ not_enough_space = ++ free_space_shortage(node, op); ++ } ++ } ++ } ++ if (not_enough_space > 0) { ++ if (!(flags & COPI_DONT_ALLOCATE)) ++ warning("nikita-948", "Cannot insert new item"); ++ result = -E_NODE_FULL; ++ } ++ assert("nikita-1622", ergo(result == 0, ++ carry_real(op->node) == coord->node)); ++ assert("nikita-2616", coord == op->u.insert.d->coord); ++ if (result == 0) ++ result = make_space_tail(op, doing, orig_node); ++ return result; ++} ++ ++/* insert_paste_common() - common part of insert and paste operations ++ ++ This function performs common part of COP_INSERT and COP_PASTE. ++ ++ There are three ways in which insertion/paste can be requested: ++ ++ . by directly supplying reiser4_item_data. In this case, op -> ++ u.insert.type is set to COPT_ITEM_DATA. ++ ++ . by supplying pointer to the child which is to be inserted into parent. ++ In this case op -> u.insert.type == COPT_CHILD. ++ ++ . by supplying key of new item/unit. This is currently only used during ++ extent insertion. ++ ++ This is required, because when new node is allocated we don't know at what ++ position pointer to it is to be stored in the parent. Actually, we don't ++ even know what its parent will be, because parent can be re-balanced ++ concurrently and new node re-parented, and because parent can be full and ++ pointer to the new node will go into some other node. ++ ++ insert_paste_common() resolves pointer to child node into position in the ++ parent by calling find_new_child_coord(), which fills ++ reiser4_item_data. After this, insertion/paste proceeds uniformly. ++ ++ Another complication is with finding free space during pasting. It may ++ happen that while shifting items to the neighbors and newly allocated ++ nodes, insertion coord can no longer be in the item we wanted to paste ++ into. At this point, paste becomes (morphs) into insert. Moreover, free ++ space analysis has to be repeated, because amount of space required for ++ insertion is different from that of paste (item header overhead, etc). ++ ++ This function "unifies" different insertion modes (by resolving child ++ pointer or key into insertion coord), and then calls make_space() to free ++ enough space in the node by shifting data to the left and right and by ++ allocating new nodes if necessary. Carry operation knows amount of space ++ required for its completion. After enough free space is obtained, caller of ++ this function (carry_{insert,paste,etc.}) performs actual insertion/paste ++ by calling item plugin method.
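++ ++ Schematically (editorial summary of the code below): ++ ++ insert_paste_common() ++ COPT_KEY: node plugin ->lookup() maps key -> coord ++ COPT_CHILD: find_new_child_coord() maps child -> coord ++ make_space() ++ shift to the left, shift to the right, allocate new nodes ++ caller (carry_insert()/carry_paste()) ++ ->create_item() or ->b.paste() via the plugins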
++ ++*/ ++static int insert_paste_common(carry_op * op /* carry operation being ++ * performed */ , ++ carry_level * doing /* current carry level */ , ++ carry_level * todo /* next carry level */ , ++ carry_insert_data * cdata /* pointer to ++ * cdata */ , ++ coord_t * coord /* insertion/paste coord */ , ++ reiser4_item_data * data /* data to be ++ * inserted/pasted */ ) ++{ ++ assert("nikita-981", op != NULL); ++ assert("nikita-980", todo != NULL); ++ assert("nikita-979", (op->op == COP_INSERT) || (op->op == COP_PASTE) ++ || (op->op == COP_EXTENT)); ++ ++ if (op->u.insert.type == COPT_PASTE_RESTARTED) { ++ /* nothing to do. Fall through to make_space(). */ ++ ; ++ } else if (op->u.insert.type == COPT_KEY) { ++ node_search_result intra_node; ++ znode *node; ++ /* Problem with doing batching at the lowest level, is that ++ operations here are given by coords where modification is ++ to be performed, and one modification can invalidate coords ++ of all following operations. ++ ++ So, we are implementing yet another type for operation that ++ will use (the only) "locator" stable across shifting of ++ data between nodes, etc.: key (COPT_KEY). ++ ++ This clause resolves key to the coord in the node. ++ ++ But node can change also. Probably some pieces have to be ++ added to the lock_carry_node(), to lock node by its key. ++ ++ */ ++ /* NOTE-NIKITA Lookup bias is fixed to FIND_EXACT. Complain ++ if you need something else. */ ++ op->u.insert.d->coord = coord; ++ node = carry_real(op->node); ++ intra_node = node_plugin_by_node(node)->lookup ++ (node, op->u.insert.d->key, FIND_EXACT, ++ op->u.insert.d->coord); ++ if ((intra_node != NS_FOUND) && (intra_node != NS_NOT_FOUND)) { ++ warning("nikita-1715", "Intra node lookup failure: %i", ++ intra_node); ++ return intra_node; ++ } ++ } else if (op->u.insert.type == COPT_CHILD) { ++ /* if we are asked to insert pointer to the child into ++ internal node, first convert pointer to the child into ++ coord within parent node. ++ */ ++ znode *child; ++ int result; ++ ++ op->u.insert.d = cdata; ++ op->u.insert.d->coord = coord; ++ op->u.insert.d->data = data; ++ op->u.insert.d->coord->node = carry_real(op->node); ++ result = find_new_child_coord(op); ++ child = carry_real(op->u.insert.child); ++ if (result != NS_NOT_FOUND) { ++ warning("nikita-993", ++ "Cannot find a place for child pointer: %i", ++ result); ++ return result; ++ } ++ /* This only happens when we did multiple insertions at ++ the previous level, trying to insert single item and ++ it so happened, that insertion of pointers to all new ++ nodes before this one already caused parent node to ++ split (may be several times). ++ ++ I am going to come up with better solution. ++ ++ You are not expected to understand this. ++ -- v6root/usr/sys/ken/slp.c ++ ++ Basically, what happens here is the following: carry came ++ to the parent level and is about to insert internal item ++ pointing to the child node that it just inserted in the ++ level below. Position where internal item is to be inserted ++ was found by find_new_child_coord() above, but node of the ++ current carry operation (that is, parent node of child ++ inserted on the previous level), was determined earlier in ++ the lock_carry_level/lock_carry_node. It could so happen ++ that other carry operations already performed on the parent ++ level already split parent node, so that insertion point ++ moved into another node. Handle this by creating new carry ++ node for insertion point if necessary. 
++ */ ++ if (carry_real(op->node) != op->u.insert.d->coord->node) { ++ pool_ordering direction; ++ znode *z1; ++ znode *z2; ++ reiser4_key k1; ++ reiser4_key k2; ++ ++ /* ++ * determine in what direction insertion point ++ * moved. Do this by comparing delimiting keys. ++ */ ++ z1 = op->u.insert.d->coord->node; ++ z2 = carry_real(op->node); ++ if (keyle(leftmost_key_in_node(z1, &k1), ++ leftmost_key_in_node(z2, &k2))) ++ /* insertion point moved to the left */ ++ direction = POOLO_BEFORE; ++ else ++ /* insertion point moved to the right */ ++ direction = POOLO_AFTER; ++ ++ op->node = add_carry_skip(doing, direction, op->node); ++ if (IS_ERR(op->node)) ++ return PTR_ERR(op->node); ++ op->node->node = op->u.insert.d->coord->node; ++ op->node->free = 1; ++ result = lock_carry_node(doing, op->node); ++ if (result != 0) ++ return result; ++ } ++ ++ /* ++ * set up key of an item being inserted: we are inserting ++ * internal item and its key is (by the very definition of a ++ * search tree) the leftmost key in the child node. ++ */ ++ write_lock_dk(znode_get_tree(child)); ++ op->u.insert.d->key = leftmost_key_in_node(child, ++ znode_get_ld_key(child)); ++ write_unlock_dk(znode_get_tree(child)); ++ op->u.insert.d->data->arg = op->u.insert.brother; ++ } else { ++ assert("vs-243", op->u.insert.d->coord != NULL); ++ op->u.insert.d->coord->node = carry_real(op->node); ++ } ++ ++ /* find free space. */ ++ return make_space(op, doing, todo); ++} ++ ++/* handle carry COP_INSERT operation. ++ ++ Insert new item into node. New item can be given in one of two ways: ++ ++ - by passing &tree_coord and &reiser4_item_data as part of @op. This is ++ only applicable at the leaf/twig level. ++ ++ - by passing a pointer to the child node which is to be inserted by this ++ operation. ++ ++*/ ++static int carry_insert(carry_op * op /* operation to perform */ , ++ carry_level * doing /* queue of operations @op ++ * is part of */ , ++ carry_level * todo /* queue where new operations ++ * are accumulated */ ) ++{ ++ znode *node; ++ carry_insert_data cdata; ++ coord_t coord; ++ reiser4_item_data data; ++ carry_plugin_info info; ++ int result; ++ ++ assert("nikita-1036", op != NULL); ++ assert("nikita-1037", todo != NULL); ++ assert("nikita-1038", op->op == COP_INSERT); ++ ++ coord_init_zero(&coord); ++ ++ /* perform common functionality of insert and paste. */ ++ result = insert_paste_common(op, doing, todo, &cdata, &coord, &data); ++ if (result != 0) ++ return result; ++ ++ node = op->u.insert.d->coord->node; ++ assert("nikita-1039", node != NULL); ++ assert("nikita-1040", node_plugin_by_node(node) != NULL); ++ ++ assert("nikita-949", ++ space_needed_for_op(node, op) <= znode_free_space(node)); ++ ++ /* ask node layout to create new item. */ ++ info.doing = doing; ++ info.todo = todo; ++ result = node_plugin_by_node(node)->create_item ++ (op->u.insert.d->coord, op->u.insert.d->key, op->u.insert.d->data, ++ &info); ++ doing->restartable = 0; ++ znode_make_dirty(node); ++ ++ return result; ++} ++ ++/* ++ * Flow insertion code. COP_INSERT_FLOW is a special tree operation that is ++ * supplied with a "flow" (that is, a stream of data) and inserts it into the ++ * tree by slicing into multiple items.
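++ * ++ * The main loop of carry_insert_flow() below is, schematically ++ * (editorial summary): ++ * ++ * while (f->length) { ++ * make_space_for_flow_insertion(op, doing, todo); ++ * len = what_can_fit_into_node(op); ++ * can_paste(...) ? ->b.paste(...) : ->create_item(...); ++ * move_flow_forward(f, len); ++ * }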
++ */ ++ ++#define flow_insert_point(op) ( ( op ) -> u.insert_flow.insert_point ) ++#define flow_insert_flow(op) ( ( op ) -> u.insert_flow.flow ) ++#define flow_insert_data(op) ( ( op ) -> u.insert_flow.data ) ++ ++static size_t item_data_overhead(carry_op * op) ++{ ++ if (flow_insert_data(op)->iplug->b.estimate == NULL) ++ return 0; ++ return (flow_insert_data(op)->iplug->b. ++ estimate(NULL /* estimate insertion */ , flow_insert_data(op)) - ++ flow_insert_data(op)->length); ++} ++ ++/* FIXME-VS: this is called several times during one make_flow_for_insertion ++ and it will always return the same result. Some optimization could be made ++ by calculating this value once at the beginning and passing it around. That ++ would reduce some flexibility in future changes ++*/ ++static int can_paste(coord_t *, const reiser4_key *, const reiser4_item_data *); ++static size_t flow_insertion_overhead(carry_op * op) ++{ ++ znode *node; ++ size_t insertion_overhead; ++ ++ node = flow_insert_point(op)->node; ++ insertion_overhead = 0; ++ if (node->nplug->item_overhead && ++ !can_paste(flow_insert_point(op), &flow_insert_flow(op)->key, ++ flow_insert_data(op))) ++ insertion_overhead = ++ node->nplug->item_overhead(node, NULL) + ++ item_data_overhead(op); ++ return insertion_overhead; ++} ++ ++/* how many bytes of the flow fit into the node */ ++static int what_can_fit_into_node(carry_op * op) ++{ ++ size_t free, overhead; ++ ++ overhead = flow_insertion_overhead(op); ++ free = znode_free_space(flow_insert_point(op)->node); ++ if (free <= overhead) ++ return 0; ++ free -= overhead; ++ /* FIXME: flow->length is loff_t only to not get overflowed in case of expanding truncate */ ++ if (free < op->u.insert_flow.flow->length) ++ return free; ++ return (int)op->u.insert_flow.flow->length; ++} ++ ++/* in make_space_for_flow_insertion we need to check either whether whole flow ++ fits into a node or whether minimal fraction of flow fits into a node */ ++static int enough_space_for_whole_flow(carry_op * op) ++{ ++ return (unsigned)what_can_fit_into_node(op) == ++ op->u.insert_flow.flow->length; ++} ++ ++#define MIN_FLOW_FRACTION 1 ++static int enough_space_for_min_flow_fraction(carry_op * op) ++{ ++ assert("vs-902", coord_is_after_rightmost(flow_insert_point(op))); ++ ++ return what_can_fit_into_node(op) >= MIN_FLOW_FRACTION; ++} ++ ++/* this returns 0 if the left neighbor was obtained successfully, everything ++ up to and including the insertion point was shifted into it, and the left ++ neighbor still has some free space to put a minimal fraction of the flow ++ into */ ++static int ++make_space_by_shift_left(carry_op * op, carry_level * doing, carry_level * todo) ++{ ++ carry_node *left; ++ znode *orig; ++ ++ left = find_left_neighbor(op, doing); ++ if (unlikely(IS_ERR(left))) { ++ warning("vs-899", ++ "make_space_by_shift_left: " ++ "error accessing left neighbor: %li", PTR_ERR(left)); ++ return 1; ++ } ++ if (left == NULL) ++ /* left neighbor either does not exist or is unformatted ++ node */ ++ return 1; ++ ++ orig = flow_insert_point(op)->node; ++ /* try to shift content of node @orig from its head up to and including ++ the insertion point into the left neighbor */ ++ carry_shift_data(LEFT_SIDE, flow_insert_point(op), carry_real(left), doing, todo, 1 /* including insert ++ * point */ ); ++ if (carry_real(left) != flow_insert_point(op)->node) { ++ /* insertion point did not move */ ++ return 1; ++ } ++ ++ /* insertion point is set after last item in the node */ ++ assert("vs-900", coord_is_after_rightmost(flow_insert_point(op)));
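++ ++ /* Editorial note: at this point the insertion coord lives in the left ++ neighbor, past its last unit; what remains is checking that this ++ node can absorb at least MIN_FLOW_FRACTION bytes of the flow. */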
++ ++ if (!enough_space_for_min_flow_fraction(op)) { ++ /* insertion point node does not have enough free space to put ++ even minimal portion of flow into it, therefore, move ++ insertion point back to orig node (before first item) */ ++ coord_init_before_first_item(flow_insert_point(op), orig); ++ return 1; ++ } ++ ++ /* part of flow is to be written to the end of node */ ++ op->node = left; ++ return 0; ++} ++ ++/* this returns 0 if right neighbor was obtained successfully and everything to ++ the right of insertion point was shifted to it and node got enough free ++ space to put minimal fraction of flow into it */ ++static int ++make_space_by_shift_right(carry_op * op, carry_level * doing, ++ carry_level * todo) ++{ ++ carry_node *right; ++ ++ right = find_right_neighbor(op, doing); ++ if (unlikely(IS_ERR(right))) { ++ warning("nikita-1065", "shift_right_excluding_insert_point: " ++ "error accessing right neighbor: %li", PTR_ERR(right)); ++ return 1; ++ } ++ if (right) { ++ /* shift everything possible on the right of but excluding ++ insertion coord into the right neighbor */ ++ carry_shift_data(RIGHT_SIDE, flow_insert_point(op), carry_real(right), doing, todo, 0 /* not ++ * including ++ * insert ++ * point */ ); ++ } else { ++ /* right neighbor either does not exist or is unformatted ++ node */ ++ ; ++ } ++ if (coord_is_after_rightmost(flow_insert_point(op))) { ++ if (enough_space_for_min_flow_fraction(op)) { ++ /* part of flow is to be written to the end of node */ ++ return 0; ++ } ++ } ++ ++ /* new node is to be added if insert point node did not get enough ++ space for whole flow */ ++ return 1; ++} ++ ++/* this returns 0 when insert coord is set at the node end and fraction of flow ++ fits into that node */ ++static int ++make_space_by_new_nodes(carry_op * op, carry_level * doing, carry_level * todo) ++{ ++ int result; ++ znode *node; ++ carry_node *new; ++ ++ node = flow_insert_point(op)->node; ++ ++ if (op->u.insert_flow.new_nodes == CARRY_FLOW_NEW_NODES_LIMIT) ++ return RETERR(-E_NODE_FULL); ++ /* add new node after insert point node */ ++ new = add_new_znode(node, op->node, doing, todo); ++ if (unlikely(IS_ERR(new))) { ++ return PTR_ERR(new); ++ } ++ result = lock_carry_node(doing, new); ++ zput(carry_real(new)); ++ if (unlikely(result)) { ++ return result; ++ } ++ op->u.insert_flow.new_nodes++; ++ if (!coord_is_after_rightmost(flow_insert_point(op))) { ++ carry_shift_data(RIGHT_SIDE, flow_insert_point(op), carry_real(new), doing, todo, 0 /* not ++ * including ++ * insert ++ * point */ ); ++ ++ assert("vs-901", ++ coord_is_after_rightmost(flow_insert_point(op))); ++ ++ if (enough_space_for_min_flow_fraction(op)) { ++ return 0; ++ } ++ if (op->u.insert_flow.new_nodes == CARRY_FLOW_NEW_NODES_LIMIT) ++ return RETERR(-E_NODE_FULL); ++ ++ /* add one more new node */ ++ new = add_new_znode(node, op->node, doing, todo); ++ if (unlikely(IS_ERR(new))) { ++ return PTR_ERR(new); ++ } ++ result = lock_carry_node(doing, new); ++ zput(carry_real(new)); ++ if (unlikely(result)) { ++ return result; ++ } ++ op->u.insert_flow.new_nodes++; ++ } ++ ++ /* move insertion point to new node */ ++ coord_init_before_first_item(flow_insert_point(op), carry_real(new)); ++ op->node = new; ++ return 0; ++} ++ ++static int ++make_space_for_flow_insertion(carry_op * op, carry_level * doing, ++ carry_level * todo) ++{ ++ __u32 flags = op->u.insert_flow.flags; ++ ++ if (enough_space_for_whole_flow(op)) { ++ /* whole flow fits into insert point node */ ++ return 0; ++ } ++ ++ if (!(flags & 
COPI_DONT_SHIFT_LEFT) ++ && (make_space_by_shift_left(op, doing, todo) == 0)) { ++ /* insert point is shifted to left neighbor of original insert ++ point node and is set after last unit in that node. It has ++ enough space to fit at least minimal fraction of flow. */ ++ return 0; ++ } ++ ++ if (enough_space_for_whole_flow(op)) { ++ /* whole flow fits into insert point node */ ++ return 0; ++ } ++ ++ if (!(flags & COPI_DONT_SHIFT_RIGHT) ++ && (make_space_by_shift_right(op, doing, todo) == 0)) { ++ /* insert point is still set to the same node, but there is ++ nothing to the right of insert point. */ ++ return 0; ++ } ++ ++ if (enough_space_for_whole_flow(op)) { ++ /* whole flow fits into insert point node */ ++ return 0; ++ } ++ ++ return make_space_by_new_nodes(op, doing, todo); ++} ++ ++/* implements COP_INSERT_FLOW operation */ ++static int ++carry_insert_flow(carry_op * op, carry_level * doing, carry_level * todo) ++{ ++ int result; ++ flow_t *f; ++ coord_t *insert_point; ++ node_plugin *nplug; ++ carry_plugin_info info; ++ znode *orig_node; ++ lock_handle *orig_lh; ++ ++ f = op->u.insert_flow.flow; ++ result = 0; ++ ++ /* carry system needs this to work */ ++ info.doing = doing; ++ info.todo = todo; ++ ++ orig_node = flow_insert_point(op)->node; ++ orig_lh = doing->tracked; ++ ++ while (f->length) { ++ result = make_space_for_flow_insertion(op, doing, todo); ++ if (result) ++ break; ++ ++ insert_point = flow_insert_point(op); ++ nplug = node_plugin_by_node(insert_point->node); ++ ++ /* compose item data for insertion/pasting */ ++ flow_insert_data(op)->data = f->data; ++ flow_insert_data(op)->length = what_can_fit_into_node(op); ++ ++ if (can_paste(insert_point, &f->key, flow_insert_data(op))) { ++ /* insert point is set to item of file we are writing to and we have to append to it */ ++ assert("vs-903", insert_point->between == AFTER_UNIT); ++ nplug->change_item_size(insert_point, ++ flow_insert_data(op)->length); ++ flow_insert_data(op)->iplug->b.paste(insert_point, ++ flow_insert_data ++ (op), &info); ++ } else { ++ /* new item must be inserted */ ++ pos_in_node_t new_pos; ++ flow_insert_data(op)->length += item_data_overhead(op); ++ ++ /* FIXME-VS: this is because node40_create_item changes ++ insert_point for obscure reasons */ ++ switch (insert_point->between) { ++ case AFTER_ITEM: ++ new_pos = insert_point->item_pos + 1; ++ break; ++ case EMPTY_NODE: ++ new_pos = 0; ++ break; ++ case BEFORE_ITEM: ++ assert("vs-905", insert_point->item_pos == 0); ++ new_pos = 0; ++ break; ++ default: ++ impossible("vs-906", ++ "carry_insert_flow: invalid coord"); ++ new_pos = 0; ++ break; ++ } ++ ++ nplug->create_item(insert_point, &f->key, ++ flow_insert_data(op), &info); ++ coord_set_item_pos(insert_point, new_pos); ++ } ++ coord_init_after_item_end(insert_point); ++ doing->restartable = 0; ++ znode_make_dirty(insert_point->node); ++ ++ move_flow_forward(f, (unsigned)flow_insert_data(op)->length); ++ } ++ ++ if (orig_node != flow_insert_point(op)->node) { ++ /* move lock to new insert point */ ++ done_lh(orig_lh); ++ init_lh(orig_lh); ++ result = ++ longterm_lock_znode(orig_lh, flow_insert_point(op)->node, ++ ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI); ++ } ++ ++ return result; ++} ++ ++/* implements COP_DELETE operation ++ ++ Remove pointer to @op -> u.delete.child from its parent. ++ ++ This function also handles killing of the tree root if the last pointer ++ from it was removed. This is complicated by our handling of "twig" level: ++ root on twig level is never killed.
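++ ++ Concretely (editorial restatement of the first branch of the body): if ++ the root sits at twig level (znode_get_level(parent) <= ++ REISER4_MIN_TREE_HEIGHT) and holds a single item, the pointer is kept, ++ the child inherits the full [min_key(), max_key()] delimiting range and ++ its JNODE_HEARD_BANSHEE flag is cleared.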
++ ++*/ ++static int carry_delete(carry_op * op /* operation to be performed */ , ++ carry_level * doing UNUSED_ARG /* current carry ++ * level */ , ++ carry_level * todo /* next carry level */ ) ++{ ++ int result; ++ coord_t coord; ++ coord_t coord2; ++ znode *parent; ++ znode *child; ++ carry_plugin_info info; ++ reiser4_tree *tree; ++ ++ /* ++ * This operation is called to delete internal item pointing to the ++ * child node that was removed by carry from the tree on the previous ++ * tree level. ++ */ ++ ++ assert("nikita-893", op != NULL); ++ assert("nikita-894", todo != NULL); ++ assert("nikita-895", op->op == COP_DELETE); ++ ++ coord_init_zero(&coord); ++ coord_init_zero(&coord2); ++ ++ parent = carry_real(op->node); ++ child = op->u.delete.child ? ++ carry_real(op->u.delete.child) : op->node->node; ++ tree = znode_get_tree(child); ++ read_lock_tree(tree); ++ ++ /* ++ * @parent was determined when carry entered parent level ++ * (lock_carry_level/lock_carry_node). Since then, actual parent of ++ * @child node could change due to other carry operations performed on ++ * the parent level. Check for this. ++ */ ++ ++ if (znode_parent(child) != parent) { ++ /* NOTE-NIKITA add stat counter for this. */ ++ parent = znode_parent(child); ++ assert("nikita-2581", find_carry_node(doing, parent)); ++ } ++ read_unlock_tree(tree); ++ ++ assert("nikita-1213", znode_get_level(parent) > LEAF_LEVEL); ++ ++ /* Twig level horrors: tree should be of height at least 2. So, last ++ pointer from the root at twig level is preserved even if child is ++ empty. This is ugly, but that is how it was architected. ++ */ ++ ++ if (znode_is_root(parent) && ++ znode_get_level(parent) <= REISER4_MIN_TREE_HEIGHT && ++ node_num_items(parent) == 1) { ++ /* Delimiting key manipulations. */ ++ write_lock_dk(tree); ++ znode_set_ld_key(child, znode_set_ld_key(parent, min_key())); ++ znode_set_rd_key(child, znode_set_rd_key(parent, max_key())); ++ ZF_SET(child, JNODE_DKSET); ++ write_unlock_dk(tree); ++ ++ /* @child escaped imminent death! */ ++ ZF_CLR(child, JNODE_HEARD_BANSHEE); ++ return 0; ++ } ++ ++ /* convert child pointer to the coord_t */ ++ result = find_child_ptr(parent, child, &coord); ++ if (result != NS_FOUND) { ++ warning("nikita-994", "Cannot find child pointer: %i", result); ++ print_coord_content("coord", &coord); ++ return result; ++ } ++ ++ coord_dup(&coord2, &coord); ++ info.doing = doing; ++ info.todo = todo; ++ { ++ /* ++ * Actually kill internal item: prepare structure with ++ * arguments for ->cut_and_kill() method... ++ */ ++ ++ struct carry_kill_data kdata; ++ kdata.params.from = &coord; ++ kdata.params.to = &coord2; ++ kdata.params.from_key = NULL; ++ kdata.params.to_key = NULL; ++ kdata.params.smallest_removed = NULL; ++ kdata.params.truncate = 1; ++ kdata.flags = op->u.delete.flags; ++ kdata.inode = NULL; ++ kdata.left = NULL; ++ kdata.right = NULL; ++ kdata.buf = NULL; ++ /* ... and call it. */ ++ result = node_plugin_by_node(parent)->cut_and_kill(&kdata, ++ &info); ++ } ++ doing->restartable = 0; ++ ++ /* check whether root should be killed violently */ ++ if (znode_is_root(parent) && ++ /* don't kill roots at and lower than twig level */ ++ znode_get_level(parent) > REISER4_MIN_TREE_HEIGHT && ++ node_num_items(parent) == 1) { ++ result = kill_tree_root(coord.node); ++ } ++ ++ return result < 0 ? : 0; ++} ++ ++/* implements COP_CUT operation ++ ++ Cuts part or whole content of node.
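++ ++ Editorial sketch: this is a thin dispatch on u.cut_or_kill.is_cut, ++ ++ result = is_cut ? nplug->cut(u.cut, &info) ++ : nplug->cut_and_kill(u.kill, &info); ++ ++ where plain cut only removes data, while cut_and_kill also runs the ++ items' ->kill_hook() methods (see carry_kill_data in carry.h).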
++ ++*/ ++static int carry_cut(carry_op * op /* operation to be performed */ , ++ carry_level * doing /* current carry level */ , ++ carry_level * todo /* next carry level */ ) ++{ ++ int result; ++ carry_plugin_info info; ++ node_plugin *nplug; ++ ++ assert("nikita-896", op != NULL); ++ assert("nikita-897", todo != NULL); ++ assert("nikita-898", op->op == COP_CUT); ++ ++ info.doing = doing; ++ info.todo = todo; ++ ++ nplug = node_plugin_by_node(carry_real(op->node)); ++ if (op->u.cut_or_kill.is_cut) ++ result = nplug->cut(op->u.cut_or_kill.u.cut, &info); ++ else ++ result = nplug->cut_and_kill(op->u.cut_or_kill.u.kill, &info); ++ ++ doing->restartable = 0; ++ return result < 0 ? : 0; ++} ++ ++/* helper function for carry_paste(): returns true if @op can be continued as ++ paste */ ++static int ++can_paste(coord_t * icoord, const reiser4_key * key, ++ const reiser4_item_data * data) ++{ ++ coord_t circa; ++ item_plugin *new_iplug; ++ item_plugin *old_iplug; ++ int result = 0; /* to keep gcc shut */ ++ ++ assert("", icoord->between != AT_UNIT); ++ ++ /* obviously, one cannot paste when node is empty---there is nothing ++ to paste into. */ ++ if (node_is_empty(icoord->node)) ++ return 0; ++ /* if insertion point is at the middle of the item, then paste */ ++ if (!coord_is_between_items(icoord)) ++ return 1; ++ coord_dup(&circa, icoord); ++ circa.between = AT_UNIT; ++ ++ old_iplug = item_plugin_by_coord(&circa); ++ new_iplug = data->iplug; ++ ++ /* check whether we can paste to the item @icoord is "at" when we ++ ignore ->between field */ ++ if (old_iplug == new_iplug && item_can_contain_key(&circa, key, data)) { ++ result = 1; ++ } else if (icoord->between == BEFORE_UNIT ++ || icoord->between == BEFORE_ITEM) { ++ /* otherwise, try to glue to the item at the left, if any */ ++ coord_dup(&circa, icoord); ++ if (coord_set_to_left(&circa)) { ++ result = 0; ++ coord_init_before_item(icoord); ++ } else { ++ old_iplug = item_plugin_by_coord(&circa); ++ result = (old_iplug == new_iplug) ++ && item_can_contain_key(icoord, key, data); ++ if (result) { ++ coord_dup(icoord, &circa); ++ icoord->between = AFTER_UNIT; ++ } ++ } ++ } else if (icoord->between == AFTER_UNIT ++ || icoord->between == AFTER_ITEM) { ++ coord_dup(&circa, icoord); ++ /* otherwise, try to glue to the item at the right, if any */ ++ if (coord_set_to_right(&circa)) { ++ result = 0; ++ coord_init_after_item(icoord); ++ } else { ++ int (*cck) (const coord_t *, const reiser4_key *, ++ const reiser4_item_data *); ++ ++ old_iplug = item_plugin_by_coord(&circa); ++ ++ cck = old_iplug->b.can_contain_key; ++ if (cck == NULL) ++ /* item doesn't define ->can_contain_key ++ method? So it is not expandable. */ ++ result = 0; ++ else { ++ result = (old_iplug == new_iplug) ++ && cck(&circa /*icoord */ , key, data); ++ if (result) { ++ coord_dup(icoord, &circa); ++ icoord->between = BEFORE_UNIT; ++ } ++ } ++ } ++ } else ++ impossible("nikita-2513", "Nothing works"); ++ if (result) { ++ if (icoord->between == BEFORE_ITEM) { ++ assert("vs-912", icoord->unit_pos == 0); ++ icoord->between = BEFORE_UNIT; ++ } else if (icoord->between == AFTER_ITEM) { ++ coord_init_after_item_end(icoord); ++ } ++ } ++ return result; ++} ++ ++/* implements COP_PASTE operation ++ ++ Paste data into existing item. This is complicated by the fact that after ++ we shifted something to the left or right neighbors trying to free some ++ space, item we were supposed to paste into can be in different node than ++ insertion coord. If so, we are no longer doing paste, but insert. 
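++ In that case op->op is rewritten to COP_INSERT and the operation is ++ redispatched through op_dispatch_table.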
See ++ comments in insert_paste_common(). ++ ++*/ ++static int carry_paste(carry_op * op /* operation to be performed */ , ++ carry_level * doing UNUSED_ARG /* current carry ++ * level */ , ++ carry_level * todo /* next carry level */ ) ++{ ++ znode *node; ++ carry_insert_data cdata; ++ coord_t dcoord; ++ reiser4_item_data data; ++ int result; ++ int real_size; ++ item_plugin *iplug; ++ carry_plugin_info info; ++ coord_t *coord; ++ ++ assert("nikita-982", op != NULL); ++ assert("nikita-983", todo != NULL); ++ assert("nikita-984", op->op == COP_PASTE); ++ ++ coord_init_zero(&dcoord); ++ ++ result = insert_paste_common(op, doing, todo, &cdata, &dcoord, &data); ++ if (result != 0) ++ return result; ++ ++ coord = op->u.insert.d->coord; ++ ++ /* handle case when op -> u.insert.coord doesn't point to the item ++ of required type. Restart as insert. */ ++ if (!can_paste(coord, op->u.insert.d->key, op->u.insert.d->data)) { ++ op->op = COP_INSERT; ++ op->u.insert.type = COPT_PASTE_RESTARTED; ++ result = op_dispatch_table[COP_INSERT].handler(op, doing, todo); ++ ++ return result; ++ } ++ ++ node = coord->node; ++ iplug = item_plugin_by_coord(coord); ++ assert("nikita-992", iplug != NULL); ++ ++ assert("nikita-985", node != NULL); ++ assert("nikita-986", node_plugin_by_node(node) != NULL); ++ ++ assert("nikita-987", ++ space_needed_for_op(node, op) <= znode_free_space(node)); ++ ++ assert("nikita-1286", coord_is_existing_item(coord)); ++ ++ /* ++ * if item is expanded as a result of this operation, we should first ++ * change item size, then call ->b.paste item method. If item is ++ * shrunk, it should be done the other way around: first call ->b.paste ++ * method, then reduce item size. ++ */ ++ ++ real_size = space_needed_for_op(node, op); ++ if (real_size > 0) ++ node->nplug->change_item_size(coord, real_size); ++ ++ doing->restartable = 0; ++ info.doing = doing; ++ info.todo = todo; ++ ++ result = iplug->b.paste(coord, op->u.insert.d->data, &info); ++ ++ if (real_size < 0) ++ node->nplug->change_item_size(coord, real_size); ++ ++ /* if we pasted at the beginning of the item, update item's key. */ ++ if (coord->unit_pos == 0 && coord->between != AFTER_UNIT) ++ node->nplug->update_item_key(coord, op->u.insert.d->key, &info); ++ ++ znode_make_dirty(node); ++ return result; ++} ++ ++/* handle carry COP_EXTENT operation. */ ++static int carry_extent(carry_op * op /* operation to perform */ , ++ carry_level * doing /* queue of operations @op ++ * is part of */ , ++ carry_level * todo /* queue where new operations ++ * are accumulated */ ) ++{ ++ znode *node; ++ carry_insert_data cdata; ++ coord_t coord; ++ reiser4_item_data data; ++ carry_op *delete_dummy; ++ carry_op *insert_extent; ++ int result; ++ carry_plugin_info info; ++ ++ assert("nikita-1751", op != NULL); ++ assert("nikita-1752", todo != NULL); ++ assert("nikita-1753", op->op == COP_EXTENT); ++ ++ /* extent insertion overview: ++ ++ extents live on the TWIG LEVEL, which is level one above the leaf ++ one. This complicates extent insertion logic somewhat: it may ++ happen (and is going to happen all the time) that in logical key ++ ordering extent has to be placed between items I1 and I2, located ++ at the leaf level, but I1 and I2 are in the same formatted leaf ++ node N1. To insert extent one has to ++ ++ (1) reach node N1 and shift data between N1, its neighbors and ++ possibly newly allocated nodes until I1 and I2 fall into different ++ nodes.
Since I1 and I2 are still neighboring items in logical key ++ order, they will necessarily be utmost items in their respective ++ nodes. ++ ++ (2) After this new extent item is inserted into node on the twig ++ level. ++ ++ Fortunately this process can reuse almost all code from standard ++ insertion procedure (viz. make_space() and insert_paste_common()), ++ due to the following observation: make_space() only shifts data up ++ to and excluding or including insertion point. It never ++ "over-moves" through insertion point. Thus, one can use ++ make_space() to perform step (1). All required for this is just to ++ instruct free_space_shortage() to keep make_space() shifting data ++ until insertion point is at the node border. ++ ++ */ ++ ++ /* perform common functionality of insert and paste. */ ++ result = insert_paste_common(op, doing, todo, &cdata, &coord, &data); ++ if (result != 0) ++ return result; ++ ++ node = op->u.extent.d->coord->node; ++ assert("nikita-1754", node != NULL); ++ assert("nikita-1755", node_plugin_by_node(node) != NULL); ++ assert("nikita-1700", coord_wrt(op->u.extent.d->coord) != COORD_INSIDE); ++ ++ /* NOTE-NIKITA add some checks here. Not assertions, -EIO. Check that ++ extent fits between items. */ ++ ++ info.doing = doing; ++ info.todo = todo; ++ ++ /* there is another complication due to placement of extents on the ++ twig level: extents are "rigid" in the sense that key-range ++ occupied by extent cannot grow indefinitely to the right as it is ++ for the formatted leaf nodes. Because of this when search finds two ++ adjacent extents on the twig level, it has to "drill" to the leaf ++ level, creating new node. Here we are removing this node. ++ */ ++ if (node_is_empty(node)) { ++ delete_dummy = node_post_carry(&info, COP_DELETE, node, 1); ++ if (IS_ERR(delete_dummy)) ++ return PTR_ERR(delete_dummy); ++ delete_dummy->u.delete.child = NULL; ++ delete_dummy->u.delete.flags = DELETE_RETAIN_EMPTY; ++ ZF_SET(node, JNODE_HEARD_BANSHEE); ++ } ++ ++ /* proceed with inserting extent item into parent. We are definitely ++ inserting rather than pasting if we get that far. */ ++ insert_extent = node_post_carry(&info, COP_INSERT, node, 1); ++ if (IS_ERR(insert_extent)) ++ /* @delete_dummy will be automatically destroyed on the level ++ exiting */ ++ return PTR_ERR(insert_extent); ++ /* NOTE-NIKITA insertion by key is simplest option here. Another ++ possibility is to insert on the left or right of already existing ++ item. ++ */ ++ insert_extent->u.insert.type = COPT_KEY; ++ insert_extent->u.insert.d = op->u.extent.d; ++ assert("nikita-1719", op->u.extent.d->key != NULL); ++ insert_extent->u.insert.d->data->arg = op->u.extent.d->coord; ++ insert_extent->u.insert.flags = ++ znode_get_tree(node)->carry.new_extent_flags; ++ ++ /* ++ * if carry was asked to track lock handle we should actually track ++ * lock handle on the twig node rather than on the leaf where ++ * operation was started from. Transfer tracked lock handle. ++ */ ++ if (doing->track_type) { ++ assert("nikita-3242", doing->tracked != NULL); ++ assert("nikita-3244", todo->tracked == NULL); ++ todo->tracked = doing->tracked; ++ todo->track_type = CARRY_TRACK_NODE; ++ doing->tracked = NULL; ++ doing->track_type = 0; ++ } ++ ++ return 0; ++} ++ ++/* update key in @parent between pointers to @left and @right. ++ ++ Find coords of @left and @right and update delimiting key between them. ++ This is a helper function called by carry_update(). Finds position of ++ internal item involved. Updates item key.
Updates delimiting keys of child ++ nodes involved. ++*/ ++static int update_delimiting_key(znode * parent /* node key is updated ++ * in */ , ++ znode * left /* child of @parent */ , ++ znode * right /* child of @parent */ , ++ carry_level * doing /* current carry ++ * level */ , ++ carry_level * todo /* parent carry ++ * level */ , ++ const char **error_msg /* place to ++ * store error ++ * message */ ) ++{ ++ coord_t left_pos; ++ coord_t right_pos; ++ int result; ++ reiser4_key ldkey; ++ carry_plugin_info info; ++ ++ assert("nikita-1177", right != NULL); ++ /* find position of right child in a parent */ ++ result = find_child_ptr(parent, right, &right_pos); ++ if (result != NS_FOUND) { ++ *error_msg = "Cannot find position of right child"; ++ return result; ++ } ++ ++ if ((left != NULL) && !coord_is_leftmost_unit(&right_pos)) { ++ /* find position of the left child in a parent */ ++ result = find_child_ptr(parent, left, &left_pos); ++ if (result != NS_FOUND) { ++ *error_msg = "Cannot find position of left child"; ++ return result; ++ } ++ assert("nikita-1355", left_pos.node != NULL); ++ } else ++ left_pos.node = NULL; ++ ++ /* check that they are separated by exactly one key and are basically ++ sane */ ++ if (REISER4_DEBUG) { ++ if ((left_pos.node != NULL) ++ && !coord_is_existing_unit(&left_pos)) { ++ *error_msg = "Left child is bastard"; ++ return RETERR(-EIO); ++ } ++ if (!coord_is_existing_unit(&right_pos)) { ++ *error_msg = "Right child is bastard"; ++ return RETERR(-EIO); ++ } ++ if (left_pos.node != NULL && ++ !coord_are_neighbors(&left_pos, &right_pos)) { ++ *error_msg = "Children are not direct siblings"; ++ return RETERR(-EIO); ++ } ++ } ++ *error_msg = NULL; ++ ++ info.doing = doing; ++ info.todo = todo; ++ ++ /* ++ * If child node is not empty, new key of internal item is a key of ++ * leftmost item in the child node. If the child is empty, take its ++ * right delimiting key as a new key of the internal item. Precise key ++ * in the latter case is not important per se, because the child (and ++ * the internal item) are going to be killed shortly anyway, but we ++ * have to preserve correct order of keys in the parent node. ++ */ ++ ++ if (!ZF_ISSET(right, JNODE_HEARD_BANSHEE)) ++ leftmost_key_in_node(right, &ldkey); ++ else { ++ read_lock_dk(znode_get_tree(parent)); ++ ldkey = *znode_get_rd_key(right); ++ read_unlock_dk(znode_get_tree(parent)); ++ } ++ node_plugin_by_node(parent)->update_item_key(&right_pos, &ldkey, &info); ++ doing->restartable = 0; ++ znode_make_dirty(parent); ++ return 0; ++} ++ ++/* implements COP_UPDATE operation ++ ++ Update delimiting keys. ++ ++*/ ++static int carry_update(carry_op * op /* operation to be performed */ , ++ carry_level * doing /* current carry level */ , ++ carry_level * todo /* next carry level */ ) ++{ ++ int result; ++ carry_node *missing UNUSED_ARG; ++ znode *left; ++ znode *right; ++ carry_node *lchild; ++ carry_node *rchild; ++ const char *error_msg; ++ reiser4_tree *tree; ++ ++ /* ++ * This operation is called to update key of internal item. This is ++ * necessary when carry shifted or cut data on the child ++ * level. Arguments of this operation are: ++ * ++ * @right --- child node. Operation should update key of internal ++ * item pointing to @right. ++ * ++ * @left --- left neighbor of @right. This parameter is optional.
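++ * When @left is NULL only the key of the internal item pointing to ++ * @right is updated.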
++ */ ++ ++ assert("nikita-902", op != NULL); ++ assert("nikita-903", todo != NULL); ++ assert("nikita-904", op->op == COP_UPDATE); ++ ++ lchild = op->u.update.left; ++ rchild = op->node; ++ ++ if (lchild != NULL) { ++ assert("nikita-1001", lchild->parent); ++ assert("nikita-1003", !lchild->left); ++ left = carry_real(lchild); ++ } else ++ left = NULL; ++ ++ tree = znode_get_tree(rchild->node); ++ read_lock_tree(tree); ++ right = znode_parent(rchild->node); ++ read_unlock_tree(tree); ++ ++ if (right != NULL) { ++ result = update_delimiting_key(right, ++ lchild ? lchild->node : NULL, ++ rchild->node, ++ doing, todo, &error_msg); ++ } else { ++ error_msg = "Cannot find node to update key in"; ++ result = RETERR(-EIO); ++ } ++ /* operation will be reposted to the next level by the ++ ->update_item_key() method of node plugin, if necessary. */ ++ ++ if (result != 0) { ++ warning("nikita-999", "Error updating delimiting key: %s (%i)", ++ error_msg ? : "", result); ++ } ++ return result; ++} ++ ++/* move items from @node during carry */ ++static int carry_shift_data(sideof side /* in what direction to move data */ , ++ coord_t * insert_coord /* coord where new item ++ * is to be inserted */ , ++ znode * node /* node which data are moved from */ , ++ carry_level * doing /* active carry queue */ , ++ carry_level * todo /* carry queue where new ++ * operations are to be put ++ * in */ , ++ unsigned int including_insert_coord_p /* true if ++ * @insertion_coord ++ * can be moved */ ) ++{ ++ int result; ++ znode *source; ++ carry_plugin_info info; ++ node_plugin *nplug; ++ ++ source = insert_coord->node; ++ ++ info.doing = doing; ++ info.todo = todo; ++ ++ nplug = node_plugin_by_node(node); ++ result = nplug->shift(insert_coord, node, ++ (side == LEFT_SIDE) ? SHIFT_LEFT : SHIFT_RIGHT, 0, ++ (int)including_insert_coord_p, &info); ++ /* the only error ->shift() method of node plugin can return is ++ -ENOMEM due to carry node/operation allocation. */ ++ assert("nikita-915", result >= 0 || result == -ENOMEM); ++ if (result > 0) { ++ /* ++ * if some number of bytes was actually shifted, mark nodes ++ * dirty, and carry level as non-restartable. ++ */ ++ doing->restartable = 0; ++ znode_make_dirty(source); ++ znode_make_dirty(node); ++ } ++ ++ assert("nikita-2077", coord_check(insert_coord)); ++ return 0; ++} ++ ++typedef carry_node *(*carry_iterator) (carry_node * node); ++static carry_node *find_dir_carry(carry_node * node, carry_level * level, ++ carry_iterator iterator); ++ ++static carry_node *pool_level_list_prev(carry_node *node) ++{ ++ return list_entry(node->header.level_linkage.prev, carry_node, header.level_linkage); ++} ++ ++/* look for the left neighbor of given carry node in a carry queue. ++ ++ This is used by find_left_neighbor(), but I am not sure that this ++ really gives any advantage. More statistics required. ++ ++*/ ++carry_node *find_left_carry(carry_node * node /* node to find left neighbor ++ * of */ , ++ carry_level * level /* level to scan */ ) ++{ ++ return find_dir_carry(node, level, ++ (carry_iterator) pool_level_list_prev); ++} ++ ++static carry_node *pool_level_list_next(carry_node *node) ++{ ++ return list_entry(node->header.level_linkage.next, carry_node, header.level_linkage); ++} ++ ++/* look for the right neighbor of given carry node in a ++ carry queue. ++ ++ This is used by find_right_neighbor(), but I am not sure that this ++ really gives any advantage. More statistics required. 
++ ++*/ ++carry_node *find_right_carry(carry_node * node /* node to find right neighbor ++ * of */ , ++ carry_level * level /* level to scan */ ) ++{ ++ return find_dir_carry(node, level, ++ (carry_iterator) pool_level_list_next); ++} ++ ++/* look for the left or right neighbor of given carry node in a carry ++ queue. ++ ++ Helper function used by find_{left|right}_carry(). ++*/ ++static carry_node *find_dir_carry(carry_node * node /* node to start scanning ++ * from */ , ++ carry_level * level /* level to scan */ , ++ carry_iterator iterator /* operation to ++ * move to the next ++ * node */ ) ++{ ++ carry_node *neighbor; ++ ++ assert("nikita-1059", node != NULL); ++ assert("nikita-1060", level != NULL); ++ ++ /* scan list of carry nodes on this list dir-ward, skipping all ++ carry nodes referencing the same znode. */ ++ neighbor = node; ++ while (1) { ++ neighbor = iterator(neighbor); ++ if (carry_node_end(level, neighbor)) ++ /* list head is reached */ ++ return NULL; ++ if (carry_real(neighbor) != carry_real(node)) ++ return neighbor; ++ } ++} ++ ++/* ++ * Memory reservation estimation. ++ * ++ * Carry process proceeds through tree levels upwards. Carry assumes that it ++ * takes tree in consistent state (e.g., that search tree invariants hold), ++ * and leaves tree consistent after it finishes. This means that when some ++ * error occurs carry cannot simply return if there are pending carry ++ * operations. Generic solution for this problem is carry-undo either as ++ * transaction manager feature (requiring checkpoints and isolation), or ++ * through some carry specific mechanism. ++ * ++ * Our current approach is to panic if carry hits an error while tree is ++ * inconsistent. Unfortunately -ENOMEM can easily be triggered. To work around ++ * this "memory reservation" mechanism was added. ++ * ++ * Memory reservation is implemented by perthread-pages.diff patch from ++ * core-patches. Its API is defined in ++ * ++ * int perthread_pages_reserve(int nrpages, gfp_t gfp); ++ * void perthread_pages_release(int nrpages); ++ * int perthread_pages_count(void); ++ * ++ * carry estimates its worst case memory requirements at the entry, reserves ++ * enough memory, and releases unused pages before returning. ++ * ++ * Code below estimates worst case memory requirements for a given carry ++ * queue. This is done by summing worst case memory requirements for each ++ * operation in the queue. ++ * ++ */ ++ ++/* ++ * Memory requirements of many operations depend on the tree ++ * height. For example, item insertion requires new node to be inserted at ++ * each tree level in the worst case. What tree height should be used for ++ * estimation? Current tree height is wrong, because tree height can change ++ * between the time when estimation was done and the time when operation is ++ * actually performed. Maximal possible tree height (REISER4_MAX_ZTREE_HEIGHT) ++ * is also not desirable, because it would lead to huge over-estimation ++ * all the time. Plausible solution is "capped tree height": if current tree ++ * height is less than some TREE_HEIGHT_CAP constant, capped tree height is ++ * TREE_HEIGHT_CAP, otherwise it's current tree height. Idea behind this is ++ * that if tree height is TREE_HEIGHT_CAP or larger, it's extremely unlikely ++ * to be increased even more during short interval of time. ++ */ ++#define TREE_HEIGHT_CAP (5) ++ ++/* return capped tree height for the @tree. See comment above.
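++ E.g. with TREE_HEIGHT_CAP == 5, a tree of height 3 is estimated as if it ++ were of height 5, while a tree of height 7 is estimated at its real height.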
*/ ++static int cap_tree_height(reiser4_tree * tree) ++{ ++ return max_t(int, tree->height, TREE_HEIGHT_CAP); ++} ++ ++/* return capped tree height for the current tree. */ ++static int capped_height(void) ++{ ++ return cap_tree_height(current_tree); ++} ++ ++/* return number of pages required to store given number of bytes */ ++static int bytes_to_pages(int bytes) ++{ ++ return (bytes + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; ++} ++ ++/* how many pages are required to allocate znodes during item insertion. */ ++static int carry_estimate_znodes(void) ++{ ++ /* ++ * Note that we have some problem here: there is no way to ++ * reserve pages specifically for the given slab. This means that ++ * these pages can be hijacked for some other end. ++ */ ++ ++ /* in the worst case we need 3 new znodes on each tree level */ ++ return bytes_to_pages(capped_height() * sizeof(znode) * 3); ++} ++ ++/* ++ * how many pages are required to load bitmaps. One bitmap per level. ++ */ ++static int carry_estimate_bitmaps(void) ++{ ++ if (reiser4_is_set(reiser4_get_current_sb(), REISER4_DONT_LOAD_BITMAP)) { ++ int bytes; ++ ++ bytes = capped_height() * (0 + /* bnode should be added, but it is private to ++ * bitmap.c, skip for now. */ ++ 2 * sizeof(jnode)); /* working and commit jnodes */ ++ return bytes_to_pages(bytes) + 2; /* and their contents */ ++ } else ++ /* bitmaps were pre-loaded during mount */ ++ return 0; ++} ++ ++/* worst case item insertion memory requirements */ ++static int carry_estimate_insert(carry_op * op, carry_level * level) ++{ ++ return carry_estimate_bitmaps() + carry_estimate_znodes() + 1 + /* new atom */ ++ capped_height() + /* new block on each level */ ++ 1 + /* and possibly extra new block at the leaf level */ ++ 3; /* loading of leaves into memory */ ++} ++ ++/* worst case item deletion memory requirements */ ++static int carry_estimate_delete(carry_op * op, carry_level * level) ++{ ++ return carry_estimate_bitmaps() + carry_estimate_znodes() + 1 + /* new atom */ ++ 3; /* loading of leaves into memory */ ++} ++ ++/* worst case tree cut memory requirements */ ++static int carry_estimate_cut(carry_op * op, carry_level * level) ++{ ++ return carry_estimate_bitmaps() + carry_estimate_znodes() + 1 + /* new atom */ ++ 3; /* loading of leaves into memory */ ++} ++ ++/* worst case memory requirements of pasting into item */ ++static int carry_estimate_paste(carry_op * op, carry_level * level) ++{ ++ return carry_estimate_bitmaps() + carry_estimate_znodes() + 1 + /* new atom */ ++ capped_height() + /* new block on each level */ ++ 1 + /* and possibly extra new block at the leaf level */ ++ 3; /* loading of leaves into memory */ ++} ++ ++/* worst case memory requirements of extent insertion */ ++static int carry_estimate_extent(carry_op * op, carry_level * level) ++{ ++ return carry_estimate_insert(op, level) + /* insert extent */ ++ carry_estimate_delete(op, level); /* kill leaf */ ++} ++ ++/* worst case memory requirements of key update */ ++static int carry_estimate_update(carry_op * op, carry_level * level) ++{ ++ return 0; ++} ++ ++/* worst case memory requirements of flow insertion */ ++static int carry_estimate_insert_flow(carry_op * op, carry_level * level) ++{ ++ int newnodes; ++ ++ newnodes = min(bytes_to_pages(op->u.insert_flow.flow->length), ++ CARRY_FLOW_NEW_NODES_LIMIT); ++ /* ++ * roughly estimate insert_flow as a sequence of insertions. ++ */ ++ return newnodes * carry_estimate_insert(op, level); ++} ++ ++/* This is dispatch table for carry operations.
It can be trivially ++ abstracted into useful plugin: tunable balancing policy is a good ++ thing. */ ++carry_op_handler op_dispatch_table[COP_LAST_OP] = { ++ [COP_INSERT] = { ++ .handler = carry_insert, ++ .estimate = carry_estimate_insert} ++ , ++ [COP_DELETE] = { ++ .handler = carry_delete, ++ .estimate = carry_estimate_delete} ++ , ++ [COP_CUT] = { ++ .handler = carry_cut, ++ .estimate = carry_estimate_cut} ++ , ++ [COP_PASTE] = { ++ .handler = carry_paste, ++ .estimate = carry_estimate_paste} ++ , ++ [COP_EXTENT] = { ++ .handler = carry_extent, ++ .estimate = carry_estimate_extent} ++ , ++ [COP_UPDATE] = { ++ .handler = carry_update, ++ .estimate = carry_estimate_update} ++ , ++ [COP_INSERT_FLOW] = { ++ .handler = carry_insert_flow, ++ .estimate = carry_estimate_insert_flow} ++}; ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ scroll-step: 1 ++ End: ++*/ +Index: linux-2.6.16/fs/reiser4/carry_ops.h +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/carry_ops.h +@@ -0,0 +1,42 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++/* implementation of carry operations. See carry_ops.c for details. */ ++ ++#if !defined( __CARRY_OPS_H__ ) ++#define __CARRY_OPS_H__ ++ ++#include "forward.h" ++#include "znode.h" ++#include "carry.h" ++ ++/* carry operation handlers */ ++typedef struct carry_op_handler { ++ /* perform operation */ ++ int (*handler) (carry_op * op, carry_level * doing, carry_level * todo); ++ /* estimate memory requirements for @op */ ++ int (*estimate) (carry_op * op, carry_level * level); ++} carry_op_handler; ++ ++/* This is dispatch table for carry operations. It can be trivially ++ abstracted into useful plugin: tunable balancing policy is a good ++ thing. */ ++extern carry_op_handler op_dispatch_table[COP_LAST_OP]; ++ ++unsigned int space_needed(const znode * node, const coord_t * coord, ++ const reiser4_item_data * data, int inserting); ++extern carry_node *find_left_carry(carry_node * node, carry_level * level); ++extern carry_node *find_right_carry(carry_node * node, carry_level * level); ++ ++/* __CARRY_OPS_H__ */ ++#endif ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ scroll-step: 1 ++ End: ++*/ +Index: linux-2.6.16/fs/reiser4/context.c +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/context.c +@@ -0,0 +1,278 @@ ++/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++/* Manipulation of reiser4_context */ ++ ++/* ++ * global context used during system call. Variable of this type is allocated ++ * on the stack at the beginning of the reiser4 part of the system call and ++ * pointer to it is stored in the current->fs_context. This allows us to avoid ++ * passing pointer to current transaction and current lockstack (both in ++ * one-to-one mapping with threads) all over the call chain. ++ * ++ * It's kind of like those global variables the prof used to tell you not to ++ * use in CS1, except thread specific.;-) Nikita, this was a good idea. ++ * ++ * In some situations it is desirable to have ability to enter reiser4_context ++ * more than once for the same thread (nested contexts). 
For example, there ++ * are some functions that can be called either directly from VFS/VM or from ++ * already active reiser4 context (->writepage, for example). ++ * ++ * In such situations "child" context acts like a dummy: all activity is ++ * actually performed in the top level context, and get_current_context() ++ * always returns top level context. Of course, init_context()/done_context() ++ * have to be properly nested anyway. ++ * ++ * Note that there is an important difference between the way reiser4 uses ++ * ->fs_context and the way other file systems use it. Other file systems ++ * (ext3 and reiserfs) use ->fs_context only for the duration of _transaction_ ++ * (this is why ->fs_context was initially called ->journal_info). This means, ++ * that when ext3 or reiserfs finds that ->fs_context is not NULL on the entry ++ * to the file system, they assume that some transaction is already underway, ++ * and usually bail out, because starting nested transaction would most likely ++ * lead to the deadlock. This gives false positives with reiser4, because we ++ * set ->fs_context before starting transaction. ++ */ ++ ++#include "debug.h" ++#include "super.h" ++#include "context.h" ++ ++#include <linux/writeback.h> /* balance_dirty_pages() */ ++#include <linux/hardirq.h> /* in_interrupt(), in_irq() */ ++ ++ ++static void _init_context(reiser4_context * context, struct super_block *super) ++{ ++ memset(context, 0, sizeof(*context)); ++ ++ context->super = super; ++ context->magic = context_magic; ++ context->outer = current->journal_info; ++ current->journal_info = (void *)context; ++ context->nr_children = 0; ++ context->gfp_mask = GFP_KERNEL; ++ ++ init_lock_stack(&context->stack); ++ ++ txn_begin(context); ++ ++ /* initialize head of tap list */ ++ INIT_LIST_HEAD(&context->taps); ++#if REISER4_DEBUG ++ context->task = current; ++#endif ++ grab_space_enable(); ++} ++ ++/* initialize context and bind it to the current thread ++ ++ This function should be called at the beginning of reiser4 part of ++ syscall.
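++ ++ Each successful call has to be balanced by reiser4_exit_context(); nested ++ calls for the same super block only increment ->nr_children.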
++*/ ++reiser4_context *init_context(struct super_block *super /* super block we are going to ++ * work with */ ) ++{ ++ reiser4_context *context; ++ ++ assert("nikita-2662", !in_interrupt() && !in_irq()); ++ assert("nikita-3357", super != NULL); ++ assert("nikita-3358", super->s_op == NULL || is_reiser4_super(super)); ++ ++ context = get_current_context_check(); ++ if (context && context->super == super) { ++ context = (reiser4_context *) current->journal_info; ++ context->nr_children++; ++ return context; ++ } ++ ++ context = kmalloc(sizeof(*context), GFP_KERNEL); ++ if (context == NULL) ++ return ERR_PTR(RETERR(-ENOMEM)); ++ ++ _init_context(context, super); ++ return context; ++} ++ ++/* this is used in scan_mgr which is called with spinlock held and in ++ reiser4_fill_super magic */ ++void init_stack_context(reiser4_context *context, struct super_block *super) ++{ ++ assert("nikita-2662", !in_interrupt() && !in_irq()); ++ assert("nikita-3357", super != NULL); ++ assert("nikita-3358", super->s_op == NULL || is_reiser4_super(super)); ++ assert("vs-12", !is_in_reiser4_context()); ++ ++ _init_context(context, super); ++ context->on_stack = 1; ++ return; ++} ++ ++/* cast lock stack embedded into reiser4 context up to its container */ ++reiser4_context *get_context_by_lock_stack(lock_stack * owner) ++{ ++ return container_of(owner, reiser4_context, stack); ++} ++ ++/* true if there is already _any_ reiser4 context for the current thread */ ++int is_in_reiser4_context(void) ++{ ++ reiser4_context *ctx; ++ ++ ctx = current->journal_info; ++ return ctx != NULL && ((unsigned long)ctx->magic) == context_magic; ++} ++ ++/* ++ * call balance dirty pages for the current context. ++ * ++ * File system is expected to call balance_dirty_pages_ratelimited() whenever ++ * it dirties a page. reiser4 does this for unformatted nodes (that is, during ++ * write---this covers vast majority of all dirty traffic), but we cannot do ++ * this immediately when formatted node is dirtied, because long term lock is ++ * usually held at that time. To work around this, dirtying of formatted node ++ * simply increases ->nr_marked_dirty counter in the current reiser4 ++ * context. When we are about to leave this context, ++ * balance_dirty_pages_ratelimited() is called, if necessary. ++ * ++ * This introduces another problem: sometimes we do not want to run ++ * balance_dirty_pages_ratelimited() when leaving a context, for example ++ * because some important lock (like ->i_mutex on the parent directory) is ++ * held. To achieve this, ->nobalance flag can be set in the current context. ++ */ ++static void balance_dirty_pages_at(reiser4_context *context) ++{ ++ reiser4_super_info_data *sbinfo = get_super_private(context->super); ++ ++ /* ++ * call balance_dirty_pages_ratelimited() to process formatted nodes ++ * dirtied during this system call. Do that only if we are not in mount ++ * and there were nodes dirtied in this context and we are not in ++ * writepage (to avoid deadlock) and not in pdflush ++ */ ++ if (sbinfo != NULL && sbinfo->fake != NULL && ++ context->nr_marked_dirty != 0 && ++ !(current->flags & PF_MEMALLOC) && ++ !current_is_pdflush()) ++ balance_dirty_pages_ratelimited(sbinfo->fake->i_mapping); ++} ++ ++/* release resources associated with context. ++ ++ This function should be called at the end of "session" with reiser4, ++ typically just before leaving reiser4 driver back to VFS. 
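++ It is called by reiser4_exit_context() once the transaction handle has ++ been closed.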
++ ++ This is a good place to put some debugging consistency checks, like that ++ thread released all locks and closed transcrash etc. ++ ++*/ ++static void done_context(reiser4_context * context /* context being released */ ) ++{ ++ assert("nikita-860", context != NULL); ++ assert("nikita-859", context->magic == context_magic); ++ assert("vs-646", (reiser4_context *) current->journal_info == context); ++ assert("zam-686", !in_interrupt() && !in_irq()); ++ ++ /* only do anything when leaving top-level reiser4 context. All nested ++ * contexts are just dummies. */ ++ if (context->nr_children == 0) { ++ assert("jmacd-673", context->trans == NULL); ++ assert("jmacd-1002", lock_stack_isclean(&context->stack)); ++ assert("nikita-1936", no_counters_are_held()); ++ assert("nikita-2626", list_empty_careful(taps_list())); ++ assert("zam-1004", ergo(get_super_private(context->super), ++ get_super_private(context->super)->delete_sema_owner != ++ current)); ++ ++ /* release all grabbed but as yet unused blocks */ ++ if (context->grabbed_blocks != 0) ++ all_grabbed2free(); ++ ++ /* ++ * synchronize against longterm_unlock_znode(): ++ * wake_up_requestor() wakes up requestors without holding ++ * zlock (otherwise they will immediately bump into that lock ++ * after wake up on another CPU). To work around (rare) ++ * situation where requestor has been woken up asynchronously ++ * and managed to run until completion (and destroy its ++ * context and lock stack) before wake_up_requestor() called ++ * wake_up() on it, wake_up_requestor() synchronizes on the lock ++ * stack spin lock. It has actually been observed that spin ++ * lock _was_ locked at this point, because ++ * wake_up_requestor() took interrupt. ++ */ ++ spin_lock_stack(&context->stack); ++ spin_unlock_stack(&context->stack); ++ ++ assert("zam-684", context->nr_children == 0); ++ /* restore original ->fs_context value */ ++ current->journal_info = context->outer; ++ if (context->on_stack == 0) ++ kfree(context); ++ } else { ++ context->nr_children--; ++#if REISER4_DEBUG ++ assert("zam-685", context->nr_children >= 0); ++#endif ++ } ++} ++ ++/* ++ * exit reiser4 context. Call balance_dirty_pages_at() if necessary. Close ++ * transaction. Call done_context() to do context related book-keeping. ++ */ ++void reiser4_exit_context(reiser4_context * context) ++{ ++ assert("nikita-3021", schedulable()); ++ ++ if (context->nr_children == 0) { ++ if (!context->nobalance) { ++ txn_restart(context); ++ balance_dirty_pages_at(context); ++ } ++ ++ /* if filesystem is mounted with -o sync or -o dirsync - commit ++ transaction. FIXME: TXNH_DONT_COMMIT is used to avoid ++ committing on exit_context when inode semaphore is held and ++ to have ktxnmgrd do the commit instead to get better ++ concurrent filesystem accesses. But, when one mounts with -o ++ sync, he cares more about reliability than about ++ performance. So, for now we have this simple mount -o sync ++ support.
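++ The atom is forced to commit synchronously by setting ATOM_FORCE_COMMIT ++ and clearing TXNH_DONT_COMMIT below.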
*/ ++ if (context->super->s_flags & (MS_SYNCHRONOUS | MS_DIRSYNC)) { ++ txn_atom *atom; ++ ++ atom = get_current_atom_locked_nocheck(); ++ if (atom) { ++ atom->flags |= ATOM_FORCE_COMMIT; ++ context->trans->flags &= ~TXNH_DONT_COMMIT; ++ spin_unlock_atom(atom); ++ } ++ } ++ txn_end(context); ++ } ++ done_context(context); ++} ++ ++void set_gfp_mask(void) ++{ ++ reiser4_context *ctx; ++ ++ ctx = get_current_context(); ++ if (ctx->entd == 0 && ++ list_empty(&ctx->stack.locks) && ++ ctx->trans->atom == NULL) ++ ctx->gfp_mask = GFP_KERNEL; ++ else ++ ctx->gfp_mask = GFP_NOFS; ++} ++ ++/* ++ * Local variables: ++ * c-indentation-style: "K&R" ++ * mode-name: "LC" ++ * c-basic-offset: 8 ++ * tab-width: 8 ++ * fill-column: 120 ++ * scroll-step: 1 ++ * End: ++ */ +Index: linux-2.6.16/fs/reiser4/context.h +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/context.h +@@ -0,0 +1,228 @@ ++/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++/* Reiser4 context. See context.c for details. */ ++ ++#if !defined( __REISER4_CONTEXT_H__ ) ++#define __REISER4_CONTEXT_H__ ++ ++#include "forward.h" ++#include "debug.h" ++#include "dformat.h" ++#include "tap.h" ++#include "lock.h" ++ ++#include <linux/types.h> /* for __u?? */ ++#include <linux/fs.h> /* for struct super_block */ ++#include <linux/spinlock.h> ++#include <linux/sched.h> /* for struct task_struct */ ++ ++ ++/* reiser4 per-thread context */ ++struct reiser4_context { ++ /* magic constant. For identification of reiser4 contexts. */ ++ __u32 magic; ++ ++ /* current lock stack. See lock.[ch]. This is where list of all ++ locks taken by current thread is kept. This is also used in ++ deadlock detection. */ ++ lock_stack stack; ++ ++ /* current transcrash. */ ++ txn_handle *trans; ++ /* transaction handle embedded into reiser4_context. ->trans points ++ * here by default. */ ++ txn_handle trans_in_ctx; ++ ++ /* super block we are working with. To get the current tree ++ use &get_super_private (reiser4_get_current_sb ())->tree. */ ++ struct super_block *super; ++ ++ /* parent fs activation */ ++ struct fs_activation *outer; ++ ++ /* per-thread grabbed (for further allocation) blocks counter */ ++ reiser4_block_nr grabbed_blocks; ++ ++ /* list of taps currently monitored. See tap.c */ ++ struct list_head taps; ++ ++ /* grabbing space is enabled */ ++ unsigned int grab_enabled:1; ++ /* should be set when we write dirty nodes to disk in jnode_flush or ++ * reiser4_write_logs() */ ++ unsigned int writeout_mode:1; ++ /* true, if current thread is an ent thread */ ++ unsigned int entd:1; ++ /* true, if balance_dirty_pages() should not be run when leaving this ++ * context. This is used to avoid lengthy balance_dirty_pages() ++ * operation when holding some important resource, like directory ++ * ->i_mutex */ ++ unsigned int nobalance:1; ++ ++ /* this bit is used on done_context to decide whether context is ++ kmalloc-ed and has to be kfree-ed */ ++ unsigned int on_stack:1; ++ ++ /* count non-trivial jnode_set_dirty() calls */ ++ unsigned long nr_marked_dirty; ++ ++ /* reiser4_sync_inodes calls (via generic_sync_sb_inodes) ++ * reiser4_writepages for each of dirty inodes. Reiser4_writepages ++ * captures pages.
When number of pages captured in one ++ * reiser4_sync_inodes reaches some threshold - some atoms get ++ * flushed */ ++ int nr_captured; ++ int nr_children; /* number of child contexts */ ++#if REISER4_DEBUG ++ /* debugging information about reiser4 locks held by the current ++ * thread */ ++ lock_counters_info locks; ++ struct task_struct *task; /* so we can easily find owner of the stack */ ++ ++ /* ++ * disk space grabbing debugging support ++ */ ++ /* how many disk blocks were grabbed by the first call to ++ * reiser4_grab_space() in this context */ ++ reiser4_block_nr grabbed_initially; ++ ++ /* list of all threads doing flush currently */ ++ struct list_head flushers_link; ++ /* information about last error encountered by reiser4 */ ++ err_site err; ++#endif ++ void *vp; ++ gfp_t gfp_mask; ++}; ++ ++extern reiser4_context *get_context_by_lock_stack(lock_stack *); ++ ++/* Debugging helpers. */ ++#if REISER4_DEBUG ++extern void print_contexts(void); ++#endif ++ ++#define current_tree (&(get_super_private(reiser4_get_current_sb())->tree)) ++#define current_blocksize reiser4_get_current_sb()->s_blocksize ++#define current_blocksize_bits reiser4_get_current_sb()->s_blocksize_bits ++ ++extern reiser4_context *init_context(struct super_block *); ++extern void init_stack_context(reiser4_context *, struct super_block *); ++extern void reiser4_exit_context(reiser4_context *); ++ ++/* magic constant we store in reiser4_context allocated on the stack. Used to ++ catch accesses to stale or uninitialized contexts. */ ++#define context_magic ((__u32) 0x4b1b5d0b) ++ ++extern int is_in_reiser4_context(void); ++ ++/* ++ * return reiser4_context for the thread @tsk ++ */ ++static inline reiser4_context *get_context(const struct task_struct *tsk) ++{ ++ assert("vs-1682", ++ ((reiser4_context *) tsk->journal_info)->magic == context_magic); ++ return (reiser4_context *) tsk->journal_info; ++} ++ ++/* ++ * return reiser4 context of the current thread, or NULL if there is none. ++ */ ++static inline reiser4_context *get_current_context_check(void) ++{ ++ if (is_in_reiser4_context()) ++ return get_context(current); ++ else ++ return NULL; ++} ++ ++static inline reiser4_context *get_current_context(void); /* __attribute__((const)); */ ++ ++/* return context associated with current thread */ ++static inline reiser4_context *get_current_context(void) ++{ ++ return get_context(current); ++} ++ ++static inline gfp_t get_gfp_mask(void) ++{ ++ reiser4_context *ctx; ++ ++ ctx = get_current_context_check(); ++ return (ctx == NULL) ? GFP_KERNEL : ctx->gfp_mask; ++} ++ ++void set_gfp_mask(void); ++ ++/* ++ * true if current thread is in the write-out mode. Thread enters write-out ++ * mode during jnode_flush and reiser4_write_logs().
++ */ ++static inline int is_writeout_mode(void) ++{ ++ return get_current_context()->writeout_mode; ++} ++ ++/* ++ * enter write-out mode ++ */ ++static inline void writeout_mode_enable(void) ++{ ++ assert("zam-941", !get_current_context()->writeout_mode); ++ get_current_context()->writeout_mode = 1; ++} ++ ++/* ++ * leave write-out mode ++ */ ++static inline void writeout_mode_disable(void) ++{ ++ assert("zam-942", get_current_context()->writeout_mode); ++ get_current_context()->writeout_mode = 0; ++} ++ ++static inline void grab_space_enable(void) ++{ ++ get_current_context()->grab_enabled = 1; ++} ++ ++static inline void grab_space_disable(void) ++{ ++ get_current_context()->grab_enabled = 0; ++} ++ ++static inline void grab_space_set_enabled(int enabled) ++{ ++ get_current_context()->grab_enabled = enabled; ++} ++ ++static inline int is_grab_enabled(reiser4_context * ctx) ++{ ++ return ctx->grab_enabled; ++} ++ ++/* mark transaction handle in @ctx as TXNH_DONT_COMMIT, so that no commit or ++ * flush would be performed when it is closed. This is necessary when handle ++ * has to be closed under some coarse semaphore, like i_mutex of ++ * directory. Commit will be performed by ktxnmgrd. */ ++static inline void context_set_commit_async(reiser4_context * context) ++{ ++ context->nobalance = 1; ++ context->trans->flags |= TXNH_DONT_COMMIT; ++} ++ ++/* __REISER4_CONTEXT_H__ */ ++#endif ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ scroll-step: 1 ++ End: ++*/ +Index: linux-2.6.16/fs/reiser4/coord.c +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/coord.c +@@ -0,0 +1,937 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++#include "forward.h" ++#include "debug.h" ++#include "dformat.h" ++#include "tree.h" ++#include "plugin/item/item.h" ++#include "znode.h" ++#include "coord.h" ++ ++/* Internal constructor. */ ++static inline void ++coord_init_values(coord_t * coord, const znode * node, pos_in_node_t item_pos, ++ pos_in_node_t unit_pos, between_enum between) ++{ ++ coord->node = (znode *) node; ++ coord_set_item_pos(coord, item_pos); ++ coord->unit_pos = unit_pos; ++ coord->between = between; ++ ON_DEBUG(coord->plug_v = 0); ++ ON_DEBUG(coord->body_v = 0); ++ ++ /*ON_TRACE (TRACE_COORDS, "init coord %p node %p: %u %u %s\n", coord, node, item_pos, unit_pos, coord_tween_tostring (between)); */ ++} ++ ++/* after shifting of node content, coord previously set properly may become ++ invalid, try to "normalize" it. */ ++void coord_normalize(coord_t * coord) ++{ ++ znode *node; ++ ++ node = coord->node; ++ assert("vs-683", node); ++ ++ coord_clear_iplug(coord); ++ ++ if (node_is_empty(node)) { ++ coord_init_first_unit(coord, node); ++ } else if ((coord->between == AFTER_ITEM) ++ || (coord->between == AFTER_UNIT)) { ++ return; ++ } else if (coord->item_pos == coord_num_items(coord) ++ && coord->between == BEFORE_ITEM) { ++ coord_dec_item_pos(coord); ++ coord->between = AFTER_ITEM; ++ } else if (coord->unit_pos == coord_num_units(coord) ++ && coord->between == BEFORE_UNIT) { ++ coord->unit_pos--; ++ coord->between = AFTER_UNIT; ++ } else if (coord->item_pos == coord_num_items(coord) ++ && coord->unit_pos == 0 && coord->between == BEFORE_UNIT) { ++ coord_dec_item_pos(coord); ++ coord->unit_pos = 0; ++ coord->between = AFTER_ITEM; ++ } ++} ++ ++/* Copy a coordinate. 
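++ The source coordinate must pass coord_check(); use coord_dup_nocheck() ++ when old_coord->node is not loaded.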
*/ ++void coord_dup(coord_t * coord, const coord_t * old_coord) ++{ ++ assert("jmacd-9800", coord_check(old_coord)); ++ coord_dup_nocheck(coord, old_coord); ++} ++ ++/* Copy a coordinate without check. Useful when old_coord->node is not ++ loaded. As in cbk_tree_lookup -> connect_znode -> connect_one_side */ ++void coord_dup_nocheck(coord_t * coord, const coord_t * old_coord) ++{ ++ coord->node = old_coord->node; ++ coord_set_item_pos(coord, old_coord->item_pos); ++ coord->unit_pos = old_coord->unit_pos; ++ coord->between = old_coord->between; ++ coord->iplugid = old_coord->iplugid; ++ ON_DEBUG(coord->plug_v = old_coord->plug_v); ++ ON_DEBUG(coord->body_v = old_coord->body_v); ++} ++ ++/* Initialize an invalid coordinate. */ ++void coord_init_invalid(coord_t * coord, const znode * node) ++{ ++ coord_init_values(coord, node, 0, 0, INVALID_COORD); ++} ++ ++void coord_init_first_unit_nocheck(coord_t * coord, const znode * node) ++{ ++ coord_init_values(coord, node, 0, 0, AT_UNIT); ++} ++ ++/* Initialize a coordinate to point at the first unit of the first item. If the node is ++ empty, it is positioned at the EMPTY_NODE. */ ++void coord_init_first_unit(coord_t * coord, const znode * node) ++{ ++ int is_empty = node_is_empty(node); ++ ++ coord_init_values(coord, node, 0, 0, (is_empty ? EMPTY_NODE : AT_UNIT)); ++ ++ assert("jmacd-9801", coord_check(coord)); ++} ++ ++/* Initialize a coordinate to point at the last unit of the last item. If the node is ++ empty, it is positioned at the EMPTY_NODE. */ ++void coord_init_last_unit(coord_t * coord, const znode * node) ++{ ++ int is_empty = node_is_empty(node); ++ ++ coord_init_values(coord, node, ++ (is_empty ? 0 : node_num_items(node) - 1), 0, ++ (is_empty ? EMPTY_NODE : AT_UNIT)); ++ if (!is_empty) ++ coord->unit_pos = coord_last_unit_pos(coord); ++ assert("jmacd-9802", coord_check(coord)); ++} ++ ++/* Initialize a coordinate to before the first item. If the node is empty, it is ++ positioned at the EMPTY_NODE. */ ++void coord_init_before_first_item(coord_t * coord, const znode * node) ++{ ++ int is_empty = node_is_empty(node); ++ ++ coord_init_values(coord, node, 0, 0, ++ (is_empty ? EMPTY_NODE : BEFORE_UNIT)); ++ ++ assert("jmacd-9803", coord_check(coord)); ++} ++ ++/* Initialize a coordinate to after the last item. If the node is empty, it is positioned ++ at the EMPTY_NODE. */ ++void coord_init_after_last_item(coord_t * coord, const znode * node) ++{ ++ int is_empty = node_is_empty(node); ++ ++ coord_init_values(coord, node, ++ (is_empty ? 0 : node_num_items(node) - 1), 0, ++ (is_empty ? EMPTY_NODE : AFTER_ITEM)); ++ ++ assert("jmacd-9804", coord_check(coord)); ++} ++ ++/* Initialize a coordinate to after last unit in the item. Coord must be set ++ already to existing item */ ++void coord_init_after_item_end(coord_t * coord) ++{ ++ coord->between = AFTER_UNIT; ++ coord->unit_pos = coord_last_unit_pos(coord); ++} ++ ++/* Initialize a coordinate to before the item. Coord must be set already to existing item */ ++void coord_init_before_item(coord_t * coord) ++{ ++ coord->unit_pos = 0; ++ coord->between = BEFORE_ITEM; ++} ++ ++/* Initialize a coordinate to after the item. Coord must be set already to existing item */ ++void coord_init_after_item(coord_t * coord) ++{ ++ coord->unit_pos = 0; ++ coord->between = AFTER_ITEM; ++} ++ ++/* Initialize a coordinate by 0s. 
Used in places where init_coord was used and ++ it was not clear how to initialize it properly */ ++void coord_init_zero(coord_t * coord) ++{ ++ memset(coord, 0, sizeof(*coord)); ++} ++ ++/* Return the number of units at the present item. Asserts coord_is_existing_item(). */ ++unsigned coord_num_units(const coord_t * coord) ++{ ++ assert("jmacd-9806", coord_is_existing_item(coord)); ++ ++ return item_plugin_by_coord(coord)->b.nr_units(coord); ++} ++ ++/* Returns true if the coord was initialized by coord_init_invalid (). */ ++/* Audited by: green(2002.06.15) */ ++int coord_is_invalid(const coord_t * coord) ++{ ++ return coord->between == INVALID_COORD; ++} ++ ++/* Returns true if the coordinate is positioned at an existing item, not before or after ++ an item. It may be placed at, before, or after any unit within the item, whether ++ existing or not. */ ++int coord_is_existing_item(const coord_t * coord) ++{ ++ switch (coord->between) { ++ case EMPTY_NODE: ++ case BEFORE_ITEM: ++ case AFTER_ITEM: ++ case INVALID_COORD: ++ return 0; ++ ++ case BEFORE_UNIT: ++ case AT_UNIT: ++ case AFTER_UNIT: ++ return coord->item_pos < coord_num_items(coord); ++ } ++ ++ impossible("jmacd-9900", "unreachable coord: %p", coord); ++ return 0; ++} ++ ++/* Returns true if the coordinate is positioned at an existing unit, not before or after a ++ unit. */ ++/* Audited by: green(2002.06.15) */ ++int coord_is_existing_unit(const coord_t * coord) ++{ ++ switch (coord->between) { ++ case EMPTY_NODE: ++ case BEFORE_UNIT: ++ case AFTER_UNIT: ++ case BEFORE_ITEM: ++ case AFTER_ITEM: ++ case INVALID_COORD: ++ return 0; ++ ++ case AT_UNIT: ++ return (coord->item_pos < coord_num_items(coord) ++ && coord->unit_pos < coord_num_units(coord)); ++ } ++ ++ impossible("jmacd-9902", "unreachable"); ++ return 0; ++} ++ ++/* Returns true if the coordinate is positioned at the first unit of the first item. Not ++ true for empty nodes nor coordinates positioned before the first item. */ ++/* Audited by: green(2002.06.15) */ ++int coord_is_leftmost_unit(const coord_t * coord) ++{ ++ return (coord->between == AT_UNIT && coord->item_pos == 0 ++ && coord->unit_pos == 0); ++} ++ ++#if REISER4_DEBUG ++/* For assertions only, checks for a valid coordinate. */ ++int coord_check(const coord_t * coord) ++{ ++ if (coord->node == NULL) { ++ return 0; ++ } ++ if (znode_above_root(coord->node)) ++ return 1; ++ ++ switch (coord->between) { ++ default: ++ case INVALID_COORD: ++ return 0; ++ case EMPTY_NODE: ++ if (!node_is_empty(coord->node)) { ++ return 0; ++ } ++ return coord->item_pos == 0 && coord->unit_pos == 0; ++ ++ case BEFORE_UNIT: ++ case AFTER_UNIT: ++ if (node_is_empty(coord->node) && (coord->item_pos == 0) ++ && (coord->unit_pos == 0)) ++ return 1; ++ case AT_UNIT: ++ break; ++ case AFTER_ITEM: ++ case BEFORE_ITEM: ++ /* before/after item should not set unit_pos. */ ++ if (coord->unit_pos != 0) { ++ return 0; ++ } ++ break; ++ } ++ ++ if (coord->item_pos >= node_num_items(coord->node)) { ++ return 0; ++ } ++ ++ /* FIXME-VS: we are going to check unit_pos. This makes no sense when ++ between is set either AFTER_ITEM or BEFORE_ITEM */ ++ if (coord->between == AFTER_ITEM || coord->between == BEFORE_ITEM) ++ return 1; ++ ++ if (coord_is_iplug_set(coord) && ++ coord->unit_pos > ++ item_plugin_by_coord(coord)->b.nr_units(coord) - 1) { ++ return 0; ++ } ++ return 1; ++} ++#endif ++ ++/* Adjust coordinate boundaries based on the number of items prior to coord_next/prev. ++ Returns 1 if the new position does not exist.
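++ Helper for the coord_next_*() and coord_prev_*() advancing functions ++ below.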
*/ ++static int coord_adjust_items(coord_t * coord, unsigned items, int is_next) ++{ ++ /* If the node is invalid, leave it. */ ++ if (coord->between == INVALID_COORD) { ++ return 1; ++ } ++ ++ /* If the node is empty, set it appropriately. */ ++ if (items == 0) { ++ coord->between = EMPTY_NODE; ++ coord_set_item_pos(coord, 0); ++ coord->unit_pos = 0; ++ return 1; ++ } ++ ++ /* If it was empty and it no longer is, set to BEFORE/AFTER_ITEM. */ ++ if (coord->between == EMPTY_NODE) { ++ coord->between = (is_next ? BEFORE_ITEM : AFTER_ITEM); ++ coord_set_item_pos(coord, 0); ++ coord->unit_pos = 0; ++ return 0; ++ } ++ ++ /* If the item_pos is out-of-range, set it appropriately. */ ++ if (coord->item_pos >= items) { ++ coord->between = AFTER_ITEM; ++ coord_set_item_pos(coord, items - 1); ++ coord->unit_pos = 0; ++ /* If is_next, return 1 (can't go any further). */ ++ return is_next; ++ } ++ ++ return 0; ++} ++ ++/* Advances the coordinate by one unit to the right. If empty, no change. If ++ coord_is_rightmost_unit, advances to AFTER THE LAST ITEM. Returns 0 if new position is an ++ existing unit. */ ++int coord_next_unit(coord_t * coord) ++{ ++ unsigned items = coord_num_items(coord); ++ ++ if (coord_adjust_items(coord, items, 1) == 1) { ++ return 1; ++ } ++ ++ switch (coord->between) { ++ case BEFORE_UNIT: ++ /* Now it is positioned at the same unit. */ ++ coord->between = AT_UNIT; ++ return 0; ++ ++ case AFTER_UNIT: ++ case AT_UNIT: ++ /* If it was at or after a unit and there are more units in this item, ++ advance to the next one. */ ++ if (coord->unit_pos < coord_last_unit_pos(coord)) { ++ coord->unit_pos += 1; ++ coord->between = AT_UNIT; ++ return 0; ++ } ++ ++ /* Otherwise, it is crossing an item boundary and treated as if it was ++ after the current item. */ ++ coord->between = AFTER_ITEM; ++ coord->unit_pos = 0; ++ /* FALLTHROUGH */ ++ ++ case AFTER_ITEM: ++ /* Check for end-of-node. */ ++ if (coord->item_pos == items - 1) { ++ return 1; ++ } ++ ++ coord_inc_item_pos(coord); ++ coord->unit_pos = 0; ++ coord->between = AT_UNIT; ++ return 0; ++ ++ case BEFORE_ITEM: ++ /* The adjust_items checks ensure that we are valid here. */ ++ coord->unit_pos = 0; ++ coord->between = AT_UNIT; ++ return 0; ++ ++ case INVALID_COORD: ++ case EMPTY_NODE: ++ /* Handled in coord_adjust_items(). */ ++ break; ++ } ++ ++ impossible("jmacd-9902", "unreachable"); ++ return 0; ++} ++ ++/* Advances the coordinate by one item to the right. If empty, no change. If ++ coord_is_rightmost_unit, advances to AFTER THE LAST ITEM. Returns 0 if new position is ++ an existing item. */ ++int coord_next_item(coord_t * coord) ++{ ++ unsigned items = coord_num_items(coord); ++ ++ if (coord_adjust_items(coord, items, 1) == 1) { ++ return 1; ++ } ++ ++ switch (coord->between) { ++ case AFTER_UNIT: ++ case AT_UNIT: ++ case BEFORE_UNIT: ++ case AFTER_ITEM: ++ /* Check for end-of-node. */ ++ if (coord->item_pos == items - 1) { ++ coord->between = AFTER_ITEM; ++ coord->unit_pos = 0; ++ coord_clear_iplug(coord); ++ return 1; ++ } ++ ++ /* Anywhere in an item, go to the next one. */ ++ coord->between = AT_UNIT; ++ coord_inc_item_pos(coord); ++ coord->unit_pos = 0; ++ return 0; ++ ++ case BEFORE_ITEM: ++ /* The out-of-range check ensures that we are valid here. */ ++ coord->unit_pos = 0; ++ coord->between = AT_UNIT; ++ return 0; ++ case INVALID_COORD: ++ case EMPTY_NODE: ++ /* Handled in coord_adjust_items().
*/ ++ break; ++ } ++ ++ impossible("jmacd-9903", "unreachable"); ++ return 0; ++} ++ ++/* Advances the coordinate by one unit to the left. If empty, no change. If ++ coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM. Returns 0 if new position ++ is an existing unit. */ ++int coord_prev_unit(coord_t * coord) ++{ ++ unsigned items = coord_num_items(coord); ++ ++ if (coord_adjust_items(coord, items, 0) == 1) { ++ return 1; ++ } ++ ++ switch (coord->between) { ++ case AT_UNIT: ++ case BEFORE_UNIT: ++ if (coord->unit_pos > 0) { ++ coord->unit_pos -= 1; ++ coord->between = AT_UNIT; ++ return 0; ++ } ++ ++ if (coord->item_pos == 0) { ++ coord->between = BEFORE_ITEM; ++ return 1; ++ } ++ ++ coord_dec_item_pos(coord); ++ coord->unit_pos = coord_last_unit_pos(coord); ++ coord->between = AT_UNIT; ++ return 0; ++ ++ case AFTER_UNIT: ++ /* What if unit_pos is out-of-range? */ ++ assert("jmacd-5442", ++ coord->unit_pos <= coord_last_unit_pos(coord)); ++ coord->between = AT_UNIT; ++ return 0; ++ ++ case BEFORE_ITEM: ++ if (coord->item_pos == 0) { ++ return 1; ++ } ++ ++ coord_dec_item_pos(coord); ++ /* FALLTHROUGH */ ++ ++ case AFTER_ITEM: ++ coord->between = AT_UNIT; ++ coord->unit_pos = coord_last_unit_pos(coord); ++ return 0; ++ ++ case INVALID_COORD: ++ case EMPTY_NODE: ++ break; ++ } ++ ++ impossible("jmacd-9904", "unreachable"); ++ return 0; ++} ++ ++/* Advances the coordinate by one item to the left. If empty, no change. If ++ coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM. Returns 0 if new position ++ is an existing item. */ ++int coord_prev_item(coord_t * coord) ++{ ++ unsigned items = coord_num_items(coord); ++ ++ if (coord_adjust_items(coord, items, 0) == 1) { ++ return 1; ++ } ++ ++ switch (coord->between) { ++ case AT_UNIT: ++ case AFTER_UNIT: ++ case BEFORE_UNIT: ++ case BEFORE_ITEM: ++ ++ if (coord->item_pos == 0) { ++ coord->between = BEFORE_ITEM; ++ coord->unit_pos = 0; ++ return 1; ++ } ++ ++ coord_dec_item_pos(coord); ++ coord->unit_pos = 0; ++ coord->between = AT_UNIT; ++ return 0; ++ ++ case AFTER_ITEM: ++ coord->between = AT_UNIT; ++ coord->unit_pos = 0; ++ return 0; ++ ++ case INVALID_COORD: ++ case EMPTY_NODE: ++ break; ++ } ++ ++ impossible("jmacd-9905", "unreachable"); ++ return 0; ++} ++ ++/* Calls either coord_init_first_unit or coord_init_last_unit depending on sideof argument. */ ++void coord_init_sideof_unit(coord_t * coord, const znode * node, sideof dir) ++{ ++ assert("jmacd-9821", dir == LEFT_SIDE || dir == RIGHT_SIDE); ++ if (dir == LEFT_SIDE) { ++ coord_init_first_unit(coord, node); ++ } else { ++ coord_init_last_unit(coord, node); ++ } ++} ++ ++/* Calls either coord_is_before_leftmost or coord_is_after_rightmost depending on sideof ++ argument. */ ++/* Audited by: green(2002.06.15) */ ++int coord_is_after_sideof_unit(coord_t * coord, sideof dir) ++{ ++ assert("jmacd-9822", dir == LEFT_SIDE || dir == RIGHT_SIDE); ++ if (dir == LEFT_SIDE) { ++ return coord_is_before_leftmost(coord); ++ } else { ++ return coord_is_after_rightmost(coord); ++ } ++} ++ ++/* Calls either coord_next_unit or coord_prev_unit depending on sideof argument. 
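++ Returns the callee's result: 0 if the new position is an existing unit.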
*/
++/* Audited by: green(2002.06.15) */
++int coord_sideof_unit(coord_t * coord, sideof dir)
++{
++	assert("jmacd-9823", dir == LEFT_SIDE || dir == RIGHT_SIDE);
++	if (dir == LEFT_SIDE) {
++		return coord_prev_unit(coord);
++	} else {
++		return coord_next_unit(coord);
++	}
++}
++
++#if REISER4_DEBUG
++#define DEBUG_COORD_FIELDS (sizeof(c1->plug_v) + sizeof(c1->body_v))
++#else
++#define DEBUG_COORD_FIELDS (0)
++#endif
++
++int coords_equal(const coord_t * c1, const coord_t * c2)
++{
++	assert("nikita-2840", c1 != NULL);
++	assert("nikita-2841", c2 != NULL);
++
++	return
++	    c1->node == c2->node &&
++	    c1->item_pos == c2->item_pos &&
++	    c1->unit_pos == c2->unit_pos && c1->between == c2->between;
++}
++
++/* If coord_is_after_rightmost return COORD_ON_THE_RIGHT, if coord_is_before_leftmost
++   return COORD_ON_THE_LEFT, otherwise return COORD_INSIDE. */
++/* Audited by: green(2002.06.15) */
++coord_wrt_node coord_wrt(const coord_t * coord)
++{
++	if (coord_is_before_leftmost(coord)) {
++		return COORD_ON_THE_LEFT;
++	}
++
++	if (coord_is_after_rightmost(coord)) {
++		return COORD_ON_THE_RIGHT;
++	}
++
++	return COORD_INSIDE;
++}
++
++/* Returns true if the coordinate is positioned after the last item or after the last unit
++   of the last item or it is an empty node. */
++/* Audited by: green(2002.06.15) */
++int coord_is_after_rightmost(const coord_t * coord)
++{
++	assert("jmacd-7313", coord_check(coord));
++
++	switch (coord->between) {
++	case INVALID_COORD:
++	case AT_UNIT:
++	case BEFORE_UNIT:
++	case BEFORE_ITEM:
++		return 0;
++
++	case EMPTY_NODE:
++		return 1;
++
++	case AFTER_ITEM:
++		return (coord->item_pos == node_num_items(coord->node) - 1);
++
++	case AFTER_UNIT:
++		return ((coord->item_pos == node_num_items(coord->node) - 1) &&
++			coord->unit_pos == coord_last_unit_pos(coord));
++	}
++
++	impossible("jmacd-9908", "unreachable");
++	return 0;
++}
++
++/* Returns true if the coordinate is positioned before the first item or it is an empty
++   node. */
++int coord_is_before_leftmost(const coord_t * coord)
++{
++	/* FIXME-VS: coord_check requires node to be loaded whereas it is not
++	   necessary to check if coord is set before leftmost
++	   assert ("jmacd-7313", coord_check (coord)); */
++	switch (coord->between) {
++	case INVALID_COORD:
++	case AT_UNIT:
++	case AFTER_ITEM:
++	case AFTER_UNIT:
++		return 0;
++
++	case EMPTY_NODE:
++		return 1;
++
++	case BEFORE_ITEM:
++	case BEFORE_UNIT:
++		return (coord->item_pos == 0) && (coord->unit_pos == 0);
++	}
++
++	impossible("jmacd-9908", "unreachable");
++	return 0;
++}
++
++/* Returns true if the coordinate is positioned after an item, before an item, after the
++   last unit of an item, before the first unit of an item, or at an empty node. */
++/* Audited by: green(2002.06.15) */
++int coord_is_between_items(const coord_t * coord)
++{
++	assert("jmacd-7313", coord_check(coord));
++
++	switch (coord->between) {
++	case INVALID_COORD:
++	case AT_UNIT:
++		return 0;
++
++	case AFTER_ITEM:
++	case BEFORE_ITEM:
++	case EMPTY_NODE:
++		return 1;
++
++	case BEFORE_UNIT:
++		return coord->unit_pos == 0;
++
++	case AFTER_UNIT:
++		return coord->unit_pos == coord_last_unit_pos(coord);
++	}
++
++	impossible("jmacd-9908", "unreachable");
++	return 0;
++}
++
++/* Returns true if the coordinates are positioned at adjacent units, regardless of
++   before-after or item boundaries.
*/ ++int coord_are_neighbors(coord_t * c1, coord_t * c2) ++{ ++ coord_t *left; ++ coord_t *right; ++ ++ assert("nikita-1241", c1 != NULL); ++ assert("nikita-1242", c2 != NULL); ++ assert("nikita-1243", c1->node == c2->node); ++ assert("nikita-1244", coord_is_existing_unit(c1)); ++ assert("nikita-1245", coord_is_existing_unit(c2)); ++ ++ left = right = NULL; ++ switch (coord_compare(c1, c2)) { ++ case COORD_CMP_ON_LEFT: ++ left = c1; ++ right = c2; ++ break; ++ case COORD_CMP_ON_RIGHT: ++ left = c2; ++ right = c1; ++ break; ++ case COORD_CMP_SAME: ++ return 0; ++ default: ++ wrong_return_value("nikita-1246", "compare_coords()"); ++ } ++ assert("vs-731", left && right); ++ if (left->item_pos == right->item_pos) { ++ return left->unit_pos + 1 == right->unit_pos; ++ } else if (left->item_pos + 1 == right->item_pos) { ++ return (left->unit_pos == coord_last_unit_pos(left)) ++ && (right->unit_pos == 0); ++ } else { ++ return 0; ++ } ++} ++ ++/* Assuming two coordinates are positioned in the same node, return COORD_CMP_ON_RIGHT, ++ COORD_CMP_ON_LEFT, or COORD_CMP_SAME depending on c1's position relative to c2. */ ++/* Audited by: green(2002.06.15) */ ++coord_cmp coord_compare(coord_t * c1, coord_t * c2) ++{ ++ assert("vs-209", c1->node == c2->node); ++ assert("vs-194", coord_is_existing_unit(c1) ++ && coord_is_existing_unit(c2)); ++ ++ if (c1->item_pos > c2->item_pos) ++ return COORD_CMP_ON_RIGHT; ++ if (c1->item_pos < c2->item_pos) ++ return COORD_CMP_ON_LEFT; ++ if (c1->unit_pos > c2->unit_pos) ++ return COORD_CMP_ON_RIGHT; ++ if (c1->unit_pos < c2->unit_pos) ++ return COORD_CMP_ON_LEFT; ++ return COORD_CMP_SAME; ++} ++ ++/* If the coordinate is between items, shifts it to the right. Returns 0 on success and ++ non-zero if there is no position to the right. */ ++int coord_set_to_right(coord_t * coord) ++{ ++ unsigned items = coord_num_items(coord); ++ ++ if (coord_adjust_items(coord, items, 1) == 1) { ++ return 1; ++ } ++ ++ switch (coord->between) { ++ case AT_UNIT: ++ return 0; ++ ++ case BEFORE_ITEM: ++ case BEFORE_UNIT: ++ coord->between = AT_UNIT; ++ return 0; ++ ++ case AFTER_UNIT: ++ if (coord->unit_pos < coord_last_unit_pos(coord)) { ++ coord->unit_pos += 1; ++ coord->between = AT_UNIT; ++ return 0; ++ } else { ++ ++ coord->unit_pos = 0; ++ ++ if (coord->item_pos == items - 1) { ++ coord->between = AFTER_ITEM; ++ return 1; ++ } ++ ++ coord_inc_item_pos(coord); ++ coord->between = AT_UNIT; ++ return 0; ++ } ++ ++ case AFTER_ITEM: ++ if (coord->item_pos == items - 1) { ++ return 1; ++ } ++ ++ coord_inc_item_pos(coord); ++ coord->unit_pos = 0; ++ coord->between = AT_UNIT; ++ return 0; ++ ++ case EMPTY_NODE: ++ return 1; ++ ++ case INVALID_COORD: ++ break; ++ } ++ ++ impossible("jmacd-9920", "unreachable"); ++ return 0; ++} ++ ++/* If the coordinate is between items, shifts it to the left. Returns 0 on success and ++ non-zero if there is no position to the left. 
*/ ++int coord_set_to_left(coord_t * coord) ++{ ++ unsigned items = coord_num_items(coord); ++ ++ if (coord_adjust_items(coord, items, 0) == 1) { ++ return 1; ++ } ++ ++ switch (coord->between) { ++ case AT_UNIT: ++ return 0; ++ ++ case AFTER_UNIT: ++ coord->between = AT_UNIT; ++ return 0; ++ ++ case AFTER_ITEM: ++ coord->between = AT_UNIT; ++ coord->unit_pos = coord_last_unit_pos(coord); ++ return 0; ++ ++ case BEFORE_UNIT: ++ if (coord->unit_pos > 0) { ++ coord->unit_pos -= 1; ++ coord->between = AT_UNIT; ++ return 0; ++ } else { ++ ++ if (coord->item_pos == 0) { ++ coord->between = BEFORE_ITEM; ++ return 1; ++ } ++ ++ coord->unit_pos = coord_last_unit_pos(coord); ++ coord_dec_item_pos(coord); ++ coord->between = AT_UNIT; ++ return 0; ++ } ++ ++ case BEFORE_ITEM: ++ if (coord->item_pos == 0) { ++ return 1; ++ } ++ ++ coord_dec_item_pos(coord); ++ coord->unit_pos = coord_last_unit_pos(coord); ++ coord->between = AT_UNIT; ++ return 0; ++ ++ case EMPTY_NODE: ++ return 1; ++ ++ case INVALID_COORD: ++ break; ++ } ++ ++ impossible("jmacd-9920", "unreachable"); ++ return 0; ++} ++ ++static const char *coord_tween_tostring(between_enum n) ++{ ++ switch (n) { ++ case BEFORE_UNIT: ++ return "before unit"; ++ case BEFORE_ITEM: ++ return "before item"; ++ case AT_UNIT: ++ return "at unit"; ++ case AFTER_UNIT: ++ return "after unit"; ++ case AFTER_ITEM: ++ return "after item"; ++ case EMPTY_NODE: ++ return "empty node"; ++ case INVALID_COORD: ++ return "invalid"; ++ default: ++ { ++ static char buf[30]; ++ ++ sprintf(buf, "unknown: %i", n); ++ return buf; ++ } ++ } ++} ++ ++void print_coord(const char *mes, const coord_t * coord, int node) ++{ ++ if (coord == NULL) { ++ printk("%s: null\n", mes); ++ return; ++ } ++ printk("%s: item_pos = %d, unit_pos %d, tween=%s, iplug=%d\n", ++ mes, coord->item_pos, coord->unit_pos, ++ coord_tween_tostring(coord->between), coord->iplugid); ++} ++ ++int ++item_utmost_child_real_block(const coord_t * coord, sideof side, ++ reiser4_block_nr * blk) ++{ ++ return item_plugin_by_coord(coord)->f.utmost_child_real_block(coord, ++ side, ++ blk); ++} ++ ++int item_utmost_child(const coord_t * coord, sideof side, jnode ** child) ++{ ++ return item_plugin_by_coord(coord)->f.utmost_child(coord, side, child); ++} ++ ++/* @count bytes of flow @f got written, update correspondingly f->length, ++ f->data and f->key */ ++void move_flow_forward(flow_t * f, unsigned count) ++{ ++ if (f->data) ++ f->data += count; ++ f->length -= count; ++ set_key_offset(&f->key, get_key_offset(&f->key) + count); ++} ++ ++/* ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ scroll-step: 1 ++ End: ++*/ +Index: linux-2.6.16/fs/reiser4/coord.h +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/coord.h +@@ -0,0 +1,389 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++/* Coords */ ++ ++#if !defined( __REISER4_COORD_H__ ) ++#define __REISER4_COORD_H__ ++ ++#include "forward.h" ++#include "debug.h" ++#include "dformat.h" ++#include "key.h" ++ ++/* insertions happen between coords in the tree, so we need some means ++ of specifying the sense of betweenness. */ ++typedef enum { ++ BEFORE_UNIT, /* Note: we/init_coord depends on this value being zero. */ ++ AT_UNIT, ++ AFTER_UNIT, ++ BEFORE_ITEM, ++ AFTER_ITEM, ++ INVALID_COORD, ++ EMPTY_NODE, ++} between_enum; ++ ++/* location of coord w.r.t. 
its node */ ++typedef enum { ++ COORD_ON_THE_LEFT = -1, ++ COORD_ON_THE_RIGHT = +1, ++ COORD_INSIDE = 0 ++} coord_wrt_node; ++ ++typedef enum { ++ COORD_CMP_SAME = 0, COORD_CMP_ON_LEFT = -1, COORD_CMP_ON_RIGHT = +1 ++} coord_cmp; ++ ++struct coord { ++ /* node in a tree */ ++ /* 0 */ znode *node; ++ ++ /* position of item within node */ ++ /* 4 */ pos_in_node_t item_pos; ++ /* position of unit within item */ ++ /* 6 */ pos_in_node_t unit_pos; ++ /* optimization: plugin of item is stored in coord_t. Until this was ++ implemented, item_plugin_by_coord() was major CPU consumer. ->iplugid ++ is invalidated (set to 0xff) on each modification of ->item_pos, ++ and all such modifications are funneled through coord_*_item_pos() ++ functions below. ++ */ ++ /* 8 */ char iplugid; ++ /* position of coord w.r.t. to neighboring items and/or units. ++ Values are taken from &between_enum above. ++ */ ++ /* 9 */ char between; ++ /* padding. It will be added by the compiler anyway to conform to the ++ * C language alignment requirements. We keep it here to be on the ++ * safe side and to have a clear picture of the memory layout of this ++ * structure. */ ++ /* 10 */ __u16 pad; ++ /* 12 */ int offset; ++#if REISER4_DEBUG ++ unsigned long plug_v; ++ unsigned long body_v; ++#endif ++}; ++ ++#define INVALID_PLUGID ((char)((1 << 8) - 1)) ++#define INVALID_OFFSET -1 ++ ++static inline void coord_clear_iplug(coord_t * coord) ++{ ++ assert("nikita-2835", coord != NULL); ++ coord->iplugid = INVALID_PLUGID; ++ coord->offset = INVALID_OFFSET; ++} ++ ++static inline int coord_is_iplug_set(const coord_t * coord) ++{ ++ assert("nikita-2836", coord != NULL); ++ return coord->iplugid != INVALID_PLUGID; ++} ++ ++static inline void coord_set_item_pos(coord_t * coord, pos_in_node_t pos) ++{ ++ assert("nikita-2478", coord != NULL); ++ coord->item_pos = pos; ++ coord_clear_iplug(coord); ++} ++ ++static inline void coord_dec_item_pos(coord_t * coord) ++{ ++ assert("nikita-2480", coord != NULL); ++ --coord->item_pos; ++ coord_clear_iplug(coord); ++} ++ ++static inline void coord_inc_item_pos(coord_t * coord) ++{ ++ assert("nikita-2481", coord != NULL); ++ ++coord->item_pos; ++ coord_clear_iplug(coord); ++} ++ ++static inline void coord_add_item_pos(coord_t * coord, int delta) ++{ ++ assert("nikita-2482", coord != NULL); ++ coord->item_pos += delta; ++ coord_clear_iplug(coord); ++} ++ ++static inline void coord_invalid_item_pos(coord_t * coord) ++{ ++ assert("nikita-2832", coord != NULL); ++ coord->item_pos = (unsigned short)~0; ++ coord_clear_iplug(coord); ++} ++ ++/* Reverse a direction. */ ++static inline sideof sideof_reverse(sideof side) ++{ ++ return side == LEFT_SIDE ? RIGHT_SIDE : LEFT_SIDE; ++} ++ ++/* NOTE: There is a somewhat odd mixture of the following opposed terms: ++ ++ "first" and "last" ++ "next" and "prev" ++ "before" and "after" ++ "leftmost" and "rightmost" ++ ++ But I think the chosen names are decent the way they are. ++*/ ++ ++/* COORD INITIALIZERS */ ++ ++/* Initialize an invalid coordinate. */ ++extern void coord_init_invalid(coord_t * coord, const znode * node); ++ ++extern void coord_init_first_unit_nocheck(coord_t * coord, const znode * node); ++ ++/* Initialize a coordinate to point at the first unit of the first item. If the node is ++ empty, it is positioned at the EMPTY_NODE. */ ++extern void coord_init_first_unit(coord_t * coord, const znode * node); ++ ++/* Initialize a coordinate to point at the last unit of the last item. If the node is ++ empty, it is positioned at the EMPTY_NODE. 
*/
++extern void coord_init_last_unit(coord_t * coord, const znode * node);
++
++/* Initialize a coordinate to before the first item. If the node is empty, it is
++   positioned at the EMPTY_NODE. */
++extern void coord_init_before_first_item(coord_t * coord, const znode * node);
++
++/* Initialize a coordinate to after the last item. If the node is empty, it is positioned
++   at the EMPTY_NODE. */
++extern void coord_init_after_last_item(coord_t * coord, const znode * node);
++
++/* Initialize a coordinate to after last unit in the item. Coord must be set
++   already to existing item */
++void coord_init_after_item_end(coord_t * coord);
++
++/* Initialize a coordinate to before the item. Coord must be set already to existing item */
++void coord_init_before_item(coord_t *);
++/* Initialize a coordinate to after the item. Coord must be set already to existing item */
++void coord_init_after_item(coord_t *);
++
++/* Calls either coord_init_first_unit or coord_init_last_unit depending on sideof argument. */
++extern void coord_init_sideof_unit(coord_t * coord, const znode * node,
++				   sideof dir);
++
++/* Initialize a coordinate by 0s. Used in places where init_coord was used and
++   it was not clear how actually
++   FIXME-VS: added by vs (2002, june, 8) */
++extern void coord_init_zero(coord_t * coord);
++
++/* COORD METHODS */
++
++/* after shifting of node content, coord previously set properly may become
++   invalid, try to "normalize" it. */
++void coord_normalize(coord_t * coord);
++
++/* Copy a coordinate. */
++extern void coord_dup(coord_t * coord, const coord_t * old_coord);
++
++/* Copy a coordinate without check. */
++void coord_dup_nocheck(coord_t * coord, const coord_t * old_coord);
++
++unsigned coord_num_units(const coord_t * coord);
++
++/* Return the last valid unit number at the present item (i.e.,
++   coord_num_units() - 1). */
++static inline unsigned coord_last_unit_pos(const coord_t * coord)
++{
++	return coord_num_units(coord) - 1;
++}
++
++#if REISER4_DEBUG
++/* For assertions only, checks for a valid coordinate. */
++extern int coord_check(const coord_t * coord);
++
++extern unsigned long znode_times_locked(const znode * z);
++
++static inline void coord_update_v(coord_t * coord)
++{
++	coord->plug_v = coord->body_v = znode_times_locked(coord->node);
++}
++#endif
++
++extern int coords_equal(const coord_t * c1, const coord_t * c2);
++
++extern void print_coord(const char *mes, const coord_t * coord, int print_node);
++
++/* If coord_is_after_rightmost return COORD_ON_THE_RIGHT, if coord_is_before_leftmost
++   return COORD_ON_THE_LEFT, otherwise return COORD_INSIDE. */
++extern coord_wrt_node coord_wrt(const coord_t * coord);
++
++/* Returns true if the coordinates are positioned at adjacent units, regardless of
++   before-after or item boundaries. */
++extern int coord_are_neighbors(coord_t * c1, coord_t * c2);
++
++/* Assuming two coordinates are positioned in the same node, return COORD_CMP_ON_RIGHT,
++   COORD_CMP_ON_LEFT, or COORD_CMP_SAME depending on c1's position relative to c2. */
++extern coord_cmp coord_compare(coord_t * c1, coord_t * c2);
++
++/* COORD PREDICATES */
++
++/* Returns true if the coord was initialized by coord_init_invalid (). */
++extern int coord_is_invalid(const coord_t * coord);
++
++/* Returns true if the coordinate is positioned at an existing item, not before or after
++   an item. It may be placed at, before, or after any unit within the item, whether
++   existing or not. If this is true you can call methods of the item plugin.
*/
++extern int coord_is_existing_item(const coord_t * coord);
++
++/* Returns true if the coordinate is positioned after an item, before an item, after the
++   last unit of an item, before the first unit of an item, or at an empty node. */
++extern int coord_is_between_items(const coord_t * coord);
++
++/* Returns true if the coordinate is positioned at an existing unit, not before or after a
++   unit. */
++extern int coord_is_existing_unit(const coord_t * coord);
++
++/* Returns true if the coordinate is positioned at an empty node. */
++extern int coord_is_empty(const coord_t * coord);
++
++/* Returns true if the coordinate is positioned at the first unit of the first item. Not
++   true for empty nodes nor coordinates positioned before the first item. */
++extern int coord_is_leftmost_unit(const coord_t * coord);
++
++/* Returns true if the coordinate is positioned after the last item or after the last unit
++   of the last item or it is an empty node. */
++extern int coord_is_after_rightmost(const coord_t * coord);
++
++/* Returns true if the coordinate is positioned before the first item or it is an empty
++   node. */
++extern int coord_is_before_leftmost(const coord_t * coord);
++
++/* Calls either coord_is_before_leftmost or coord_is_after_rightmost depending on sideof
++   argument. */
++extern int coord_is_after_sideof_unit(coord_t * coord, sideof dir);
++
++/* COORD MODIFIERS */
++
++/* Advances the coordinate by one unit to the right. If empty, no change. If
++   coord_is_rightmost_unit, advances to AFTER THE LAST ITEM. Returns 0 if new position is
++   an existing unit. */
++extern int coord_next_unit(coord_t * coord);
++
++/* Advances the coordinate by one item to the right. If empty, no change. If
++   coord_is_rightmost_unit, advances to AFTER THE LAST ITEM. Returns 0 if new position is
++   an existing item. */
++extern int coord_next_item(coord_t * coord);
++
++/* Advances the coordinate by one unit to the left. If empty, no change. If
++   coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM. Returns 0 if new position
++   is an existing unit. */
++extern int coord_prev_unit(coord_t * coord);
++
++/* Advances the coordinate by one item to the left. If empty, no change. If
++   coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM. Returns 0 if new position
++   is an existing item. */
++extern int coord_prev_item(coord_t * coord);
++
++/* If the coordinate is between items, shifts it to the right. Returns 0 on success and
++   non-zero if there is no position to the right. */
++extern int coord_set_to_right(coord_t * coord);
++
++/* If the coordinate is between items, shifts it to the left. Returns 0 on success and
++   non-zero if there is no position to the left. */
++extern int coord_set_to_left(coord_t * coord);
++
++/* If the coordinate is at an existing unit, set to after that unit. Returns 0 on success
++   and non-zero if the unit did not exist. */
++extern int coord_set_after_unit(coord_t * coord);
++
++/* Calls either coord_next_unit or coord_prev_unit depending on sideof argument.
*/
++extern int coord_sideof_unit(coord_t * coord, sideof dir);
++
++/* iterate over all units in @node */
++#define for_all_units( coord, node )					\
++	for( coord_init_before_first_item( ( coord ), ( node ) ) ;	\
++	     coord_next_unit( coord ) == 0 ; )
++
++/* iterate over all items in @node */
++#define for_all_items( coord, node )					\
++	for( coord_init_before_first_item( ( coord ), ( node ) ) ;	\
++	     coord_next_item( coord ) == 0 ; )
++
++/* COORD/ITEM METHODS */
++
++extern int item_utmost_child_real_block(const coord_t * coord, sideof side,
++					reiser4_block_nr * blk);
++extern int item_utmost_child(const coord_t * coord, sideof side,
++			     jnode ** child);
++
++/* a flow is a sequence of bytes being written to or read from the tree. The
++   tree will slice the flow into items while storing it into nodes, but all of
++   that is hidden from anything outside the tree. */
++
++struct flow {
++	reiser4_key key;	/* key of start of flow's sequence of bytes */
++	loff_t length;		/* length of flow's sequence of bytes */
++	char *data;		/* start of flow's sequence of bytes */
++	int user;		/* if 1 data is user space, 0 - kernel space */
++	rw_op op;		/* NIKITA-FIXME-HANS: comment is where? */
++};
++
++void move_flow_forward(flow_t * f, unsigned count);
++
++/* &reiser4_item_data - description of data to be inserted or pasted
++
++   Q: articulate the reasons for the difference between this and flow.
++
++   A: Besides flows, we insert other things into the tree: stat data,
++   directory entries, etc. To insert them into the tree one has to provide
++   this structure. If one is going to insert a flow, insert_flow can be used,
++   where this structure does not have to be created
++*/
++struct reiser4_item_data {
++	/* actual data to be inserted. If NULL, ->create_item() will not
++	   do xmemcpy itself, leaving this up to the caller. This can
++	   save some amount of unnecessary memory copying, for example,
++	   during insertion of stat data.
++
++	 */
++	char *data;
++	/* 1 if 'char * data' contains pointer to user space and 0 if it is
++	   kernel space */
++	int user;
++	/* amount of data we are going to insert or paste */
++	int length;
++	/* "Arg" is opaque data that is passed down to the
++	   ->create_item() method of node layout, which in turn
++	   hands it to the ->create_hook() of item being created. This
++	   arg is currently used by:
++
++	   . ->create_hook() of internal item
++	   (fs/reiser4/plugin/item/internal.c:internal_create_hook()),
++	   . ->paste() method of directory item.
++	   . ->create_hook() of extent item
++
++	   For internal item, this is left "brother" of new node being
++	   inserted and it is used to add new node into sibling list
++	   after parent to it was just inserted into parent.
++
++	   While ->arg does look like a somewhat unnecessary complication,
++	   it actually saves a lot of headache in many places, because
++	   all data necessary to insert or paste new data into the tree are
++	   collected in one place, and this eliminates a lot of extra
++	   argument passing and storing everywhere.
++
++	 */
++	void *arg;
++	/* plugin of item we are inserting */
++	item_plugin *iplug;
++};
++
++/* __REISER4_COORD_H__ */
++#endif
++
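The flow abstraction declared above is easiest to see in motion. A minimal userspace sketch of the move_flow_forward() idea (hypothetical stand-ins for reiser4_key and the kernel types; only the offset arithmetic mirrors the patch) shows how a partially written flow is advanced:

#include <stdio.h>

/* Hypothetical userspace stand-in: only the offset part of reiser4_key. */
typedef unsigned long long my_key;

struct my_flow {
	my_key key_offset;	/* offset encoded in f->key */
	long long length;	/* bytes still to be written */
	char *data;		/* current position in the buffer */
};

/* Same shape as move_flow_forward(): after @count bytes were written, the
   flow shrinks and both the data pointer and the key offset advance. */
static void my_move_flow_forward(struct my_flow *f, unsigned count)
{
	if (f->data)
		f->data += count;
	f->length -= count;
	f->key_offset += count;
}

int main(void)
{
	char buf[100];
	struct my_flow f = { 4096, sizeof buf, buf };

	my_move_flow_forward(&f, 60);	/* 60 bytes got written */
	printf("left=%lld offset=%llu advanced=%td\n",
	       f.length, f.key_offset, f.data - buf);
	return 0;	/* left=40 offset=4156 advanced=60 */
}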
++/* Make Linus happy.
++   Local variables:
++   c-indentation-style: "K&R"
++   mode-name: "LC"
++   c-basic-offset: 8
++   tab-width: 8
++   fill-column: 120
++   scroll-step: 1
++   End:
++*/
+Index: linux-2.6.16/fs/reiser4/debug.c
+===================================================================
+--- /dev/null
++++ linux-2.6.16/fs/reiser4/debug.c
+@@ -0,0 +1,300 @@
++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
++ * reiser4/README */
++
++/* Debugging facilities. */
++
++/*
++ * This file contains generic debugging functions used by reiser4. Roughly
++ * following:
++ *
++ *     panicking: reiser4_do_panic(), reiser4_print_prefix().
++ *
++ *     locking: schedulable(), lock_counters(), print_lock_counters(),
++ *     no_counters_are_held(), commit_check_locks()
++ *
++ *     error code monitoring (see comment before RETERR macro): return_err(),
++ *     report_err().
++ *
++ *     stack back-tracing: fill_backtrace()
++ *
++ *     miscellaneous: preempt_point(), call_on_each_assert(), debugtrap().
++ *
++ */
++
++#include "reiser4.h"
++#include "context.h"
++#include "super.h"
++#include "txnmgr.h"
++#include "znode.h"
++
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++
++#if REISER4_DEBUG
++static void report_err(void);
++#else
++#define report_err() noop
++#endif
++
++/*
++ * global buffer where message given to reiser4_panic is formatted.
++ */
++static char panic_buf[REISER4_PANIC_MSG_BUFFER_SIZE];
++
++/*
++ * lock protecting consistency of panic_buf under concurrent panics
++ */
++static DEFINE_SPINLOCK(panic_guard);
++
++/* Your best friend. Call it on each occasion. This is called by
++   fs/reiser4/debug.h:reiser4_panic(). */
++void reiser4_do_panic(const char *format /* format string */ , ... /* rest */ )
++{
++	static int in_panic = 0;
++	va_list args;
++
++	/*
++	 * check for recursive panic.
++	 */
++	if (in_panic == 0) {
++		in_panic = 1;
++
++		spin_lock(&panic_guard);
++		va_start(args, format);
++		vsnprintf(panic_buf, sizeof(panic_buf), format, args);
++		va_end(args);
++		printk(KERN_EMERG "reiser4 panicked cowardly: %s", panic_buf);
++		spin_unlock(&panic_guard);
++
++		/*
++		 * if kernel debugger is configured---drop in. Early dropping
++		 * into kgdb is not always convenient, because panic message
++		 * is not yet printed most of the times. But:
++		 *
++		 *     (1) message can be extracted from printk_buf[]
++		 *     (declared static inside of printk()), and
++		 *
++		 *     (2) sometimes serial/kgdb combo dies while printing
++		 *     long panic message, so it's more prudent to break into
++		 *     debugger earlier.
++		 *
++		 */
++		DEBUGON(1);
++	}
++	/* to make gcc happy about noreturn attribute */
++	panic("%s", panic_buf);
++}
++
++void
++reiser4_print_prefix(const char *level, int reperr, const char *mid,
++		     const char *function, const char *file, int lineno)
++{
++	const char *comm;
++	int pid;
++
++	if (unlikely(in_interrupt() || in_irq())) {
++		comm = "interrupt";
++		pid = 0;
++	} else {
++		comm = current->comm;
++		pid = current->pid;
++	}
++	printk("%sreiser4[%.16s(%i)]: %s (%s:%i)[%s]:\n",
++	       level, comm, pid, function, file, lineno, mid);
++	if (reperr)
++		report_err();
++}
++
++/* Preemption point: this should be called periodically during long running
++   operations (carry, allocate, and squeeze are best examples) */
++int preempt_point(void)
++{
++	assert("nikita-3008", schedulable());
++	cond_resched();
++	return signal_pending(current);
++}
++
++#if REISER4_DEBUG
++/* Debugging aid: return struct where information about locks taken by current
++   thread is accumulated.
This can be used to formulate lock ordering ++ constraints and various assertions. ++ ++*/ ++lock_counters_info *lock_counters(void) ++{ ++ reiser4_context *ctx = get_current_context(); ++ assert("jmacd-1123", ctx != NULL); ++ return &ctx->locks; ++} ++ ++/* ++ * print human readable information about locks held by the reiser4 context. ++ */ ++static void print_lock_counters(const char *prefix, ++ const lock_counters_info * info) ++{ ++ printk("%s: jnode: %i, tree: %i (r:%i,w:%i), dk: %i (r:%i,w:%i)\n" ++ "jload: %i, " ++ "txnh: %i, atom: %i, stack: %i, txnmgr: %i, " ++ "ktxnmgrd: %i, fq: %i\n" ++ "inode: %i, " ++ "cbk_cache: %i (r:%i,w%i), " ++ "eflush: %i, " ++ "zlock: %i,\n" ++ "spin: %i, long: %i inode_sem: (r:%i,w:%i)\n" ++ "d: %i, x: %i, t: %i\n", prefix, ++ info->spin_locked_jnode, ++ info->rw_locked_tree, info->read_locked_tree, ++ info->write_locked_tree, ++ info->rw_locked_dk, info->read_locked_dk, info->write_locked_dk, ++ info->spin_locked_jload, ++ info->spin_locked_txnh, ++ info->spin_locked_atom, info->spin_locked_stack, ++ info->spin_locked_txnmgr, info->spin_locked_ktxnmgrd, ++ info->spin_locked_fq, ++ info->spin_locked_inode, ++ info->rw_locked_cbk_cache, ++ info->read_locked_cbk_cache, ++ info->write_locked_cbk_cache, ++ info->spin_locked_super_eflush, ++ info->spin_locked_zlock, ++ info->spin_locked, ++ info->long_term_locked_znode, ++ info->inode_sem_r, info->inode_sem_w, ++ info->d_refs, info->x_refs, info->t_refs); ++} ++ ++/* check that no spinlocks are held */ ++int schedulable(void) ++{ ++ if (get_current_context_check() != NULL) { ++ if (!LOCK_CNT_NIL(spin_locked)) { ++ print_lock_counters("in atomic", lock_counters()); ++ return 0; ++ } ++ } ++ might_sleep(); ++ return 1; ++} ++/* ++ * return true, iff no locks are held. ++ */ ++int no_counters_are_held(void) ++{ ++ lock_counters_info *counters; ++ ++ counters = lock_counters(); ++ return ++ (counters->spin_locked_zlock == 0) && ++ (counters->spin_locked_jnode == 0) && ++ (counters->rw_locked_tree == 0) && ++ (counters->read_locked_tree == 0) && ++ (counters->write_locked_tree == 0) && ++ (counters->rw_locked_dk == 0) && ++ (counters->read_locked_dk == 0) && ++ (counters->write_locked_dk == 0) && ++ (counters->spin_locked_txnh == 0) && ++ (counters->spin_locked_atom == 0) && ++ (counters->spin_locked_stack == 0) && ++ (counters->spin_locked_txnmgr == 0) && ++ (counters->spin_locked_inode == 0) && ++ (counters->spin_locked == 0) && ++ (counters->long_term_locked_znode == 0) && ++ (counters->inode_sem_r == 0) && ++ (counters->inode_sem_w == 0) && (counters->d_refs == 0); ++} ++ ++/* ++ * return true, iff transaction commit can be done under locks held by the ++ * current thread. ++ */ ++int commit_check_locks(void) ++{ ++ lock_counters_info *counters; ++ int inode_sem_r; ++ int inode_sem_w; ++ int result; ++ ++ /* ++ * inode's read/write semaphore is the only reiser4 lock that can be ++ * held during commit. ++ */ ++ ++ counters = lock_counters(); ++ inode_sem_r = counters->inode_sem_r; ++ inode_sem_w = counters->inode_sem_w; ++ ++ counters->inode_sem_r = counters->inode_sem_w = 0; ++ result = no_counters_are_held(); ++ counters->inode_sem_r = inode_sem_r; ++ counters->inode_sem_w = inode_sem_w; ++ return result; ++} ++ ++/* ++ * fill "error site" in the current reiser4 context. See comment before RETERR ++ * macro for more details. 
++ */
++void return_err(int code, const char *file, int line)
++{
++	if (code < 0 && is_in_reiser4_context()) {
++		reiser4_context *ctx = get_current_context();
++
++		if (ctx != NULL) {
++			ctx->err.code = code;
++			ctx->err.file = file;
++			ctx->err.line = line;
++		}
++	}
++}
++
++/*
++ * report error information recorded by return_err().
++ */
++static void report_err(void)
++{
++	reiser4_context *ctx = get_current_context_check();
++
++	if (ctx != NULL) {
++		if (ctx->err.code != 0) {
++			printk("code: %i at %s:%i\n",
++			       ctx->err.code, ctx->err.file, ctx->err.line);
++		}
++	}
++}
++
++#endif /* REISER4_DEBUG */
++
++#if KERNEL_DEBUGGER
++
++/*
++ * this function just drops into the kernel debugger. It is a convenient place
++ * to put a breakpoint in.
++ */
++void debugtrap(void)
++{
++	/* do nothing. Put break point here. */
++#if defined(CONFIG_KGDB) && !defined(CONFIG_REISER4_FS_MODULE)
++	extern void breakpoint(void);
++	breakpoint();
++#endif
++}
++#endif
++
++/* Make Linus happy.
++   Local variables:
++   c-indentation-style: "K&R"
++   mode-name: "LC"
++   c-basic-offset: 8
++   tab-width: 8
++   fill-column: 120
++   End:
++*/
+Index: linux-2.6.16/fs/reiser4/debug.h
+===================================================================
+--- /dev/null
++++ linux-2.6.16/fs/reiser4/debug.h
+@@ -0,0 +1,350 @@
++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
++
++/* Declarations of debug macros. */
++
++#if !defined( __FS_REISER4_DEBUG_H__ )
++#define __FS_REISER4_DEBUG_H__
++
++#include "forward.h"
++#include "reiser4.h"
++
++/* generic function to produce formatted output, decorating it with
++   whatever standard prefixes/postfixes we want. "Fun" is a function
++   that will be actually called, can be printk, panic etc.
++   This is for use by other debugging macros, not by users. */
++#define DCALL(lev, fun, reperr, label, format, ...)			\
++({									\
++	fun(lev "reiser4[%.16s(%i)]: %s (%s:%i)[%s]:\n" format "\n" ,	\
++	    current->comm, current->pid, __FUNCTION__,			\
++	    __FILE__, __LINE__, label, ## __VA_ARGS__);			\
++})
++
++/*
++ * cause kernel to crash
++ */
++#define reiser4_panic(mid, format, ...)				\
++	DCALL("", reiser4_do_panic, 1, mid, format , ## __VA_ARGS__)
++
++/* print message with indication of current process, file, line and
++   function */
++#define reiser4_log(label, format, ...)				\
++	DCALL(KERN_DEBUG, printk, 0, label, format , ## __VA_ARGS__)
++
++/* Assertion checked during compilation.
++   If "cond" is false (0) we get duplicate case label in switch.
++   Use this to check something like famous
++       cassert (sizeof(struct reiserfs_journal_commit) == 4096) ;
++   in 3.x journal.c. If cassertion fails you get compiler error,
++   so no "maintainer-id".
++*/
++#define cassert(cond) ({ switch(-1) { case (cond): case 0: break; } })
++
++#define noop   do {;} while(0)
++
++#if REISER4_DEBUG
++/* version of info that only actually prints anything when _d_ebugging
++   is on */
++#define dinfo(format, ...) printk(format , ## __VA_ARGS__)
++/* macro to catch logical errors. Put it into `default' clause of
++   switch() statement. */
++#define impossible(label, format, ...) 				\
++	reiser4_panic(label, "impossible: " format , ## __VA_ARGS__)
++/* assert assures that @cond is true. If it is not, reiser4_panic() is
++   called. Use this for checking logical consistency and _never_ call
++   this to check correctness of external data: disk blocks and user input.
*/ ++#define assert(label, cond) \ ++({ \ ++ /* call_on_each_assert(); */ \ ++ if (cond) { \ ++ /* put negated check to avoid using !(cond) that would lose \ ++ * warnings for things like assert(a = b); */ \ ++ ; \ ++ } else { \ ++ DEBUGON(1); \ ++ reiser4_panic(label, "assertion failed: %s", #cond); \ ++ } \ ++}) ++ ++/* like assertion, but @expr is evaluated even if REISER4_DEBUG is off. */ ++#define check_me( label, expr ) assert( label, ( expr ) ) ++ ++#define ON_DEBUG( exp ) exp ++ ++extern int schedulable(void); ++extern void call_on_each_assert(void); ++ ++#else ++ ++#define dinfo( format, args... ) noop ++#define impossible( label, format, args... ) noop ++#define assert( label, cond ) noop ++#define check_me( label, expr ) ( ( void ) ( expr ) ) ++#define ON_DEBUG( exp ) ++#define schedulable() might_sleep() ++ ++/* REISER4_DEBUG */ ++#endif ++ ++#if REISER4_DEBUG ++/* per-thread information about lock acquired by this thread. Used by lock ++ * ordering checking in spin_macros.h */ ++typedef struct lock_counters_info { ++ int rw_locked_tree; ++ int read_locked_tree; ++ int write_locked_tree; ++ ++ int rw_locked_dk; ++ int read_locked_dk; ++ int write_locked_dk; ++ ++ int rw_locked_cbk_cache; ++ int read_locked_cbk_cache; ++ int write_locked_cbk_cache; ++ ++ int spin_locked_zlock; ++ int spin_locked_jnode; ++ int spin_locked_jload; ++ int spin_locked_txnh; ++ int spin_locked_atom; ++ int spin_locked_stack; ++ int spin_locked_txnmgr; ++ int spin_locked_ktxnmgrd; ++ int spin_locked_fq; ++ int spin_locked_inode; ++ int spin_locked_super_eflush; ++ int spin_locked; ++ int long_term_locked_znode; ++ ++ int inode_sem_r; ++ int inode_sem_w; ++ ++ int d_refs; ++ int x_refs; ++ int t_refs; ++} lock_counters_info; ++ ++extern lock_counters_info *lock_counters(void); ++#define IN_CONTEXT(a, b) (is_in_reiser4_context() ? (a) : (b)) ++ ++/* increment lock-counter @counter, if present */ ++#define LOCK_CNT_INC(counter) IN_CONTEXT(++(lock_counters()->counter), 0) ++ ++/* decrement lock-counter @counter, if present */ ++#define LOCK_CNT_DEC(counter) IN_CONTEXT(--(lock_counters()->counter), 0) ++ ++/* check that lock-counter is zero. This is for use in assertions */ ++#define LOCK_CNT_NIL(counter) IN_CONTEXT(lock_counters()->counter == 0, 1) ++ ++/* check that lock-counter is greater than zero. This is for use in ++ * assertions */ ++#define LOCK_CNT_GTZ(counter) IN_CONTEXT(lock_counters()->counter > 0, 1) ++#define LOCK_CNT_LT(counter,n) IN_CONTEXT(lock_counters()->counter < n, 1) ++ ++#else /* REISER4_DEBUG */ ++ ++/* no-op versions on the above */ ++ ++typedef struct lock_counters_info { ++} lock_counters_info; ++ ++#define lock_counters() ((lock_counters_info *)NULL) ++#define LOCK_CNT_INC(counter) noop ++#define LOCK_CNT_DEC(counter) noop ++#define LOCK_CNT_NIL(counter) (1) ++#define LOCK_CNT_GTZ(counter) (1) ++#define LOCK_CNT_LT(counter,n) (1) ++ ++#endif /* REISER4_DEBUG */ ++ ++#define assert_spin_not_locked(lock) BUG_ON(0) ++#define assert_rw_write_locked(lock) BUG_ON(0) ++#define assert_rw_read_locked(lock) BUG_ON(0) ++#define assert_rw_locked(lock) BUG_ON(0) ++#define assert_rw_not_write_locked(lock) BUG_ON(0) ++#define assert_rw_not_read_locked(lock) BUG_ON(0) ++#define assert_rw_not_locked(lock) BUG_ON(0) ++ ++/* flags controlling debugging behavior. Are set through debug_flags=N mount ++ option. */ ++typedef enum { ++ /* print a lot of information during panic. When this is on all jnodes ++ * are listed. This can be *very* large output. Usually you don't want ++ * this. 
Especially over serial line. */ ++ REISER4_VERBOSE_PANIC = 0x00000001, ++ /* print a lot of information during umount */ ++ REISER4_VERBOSE_UMOUNT = 0x00000002, ++ /* print gathered statistics on umount */ ++ REISER4_STATS_ON_UMOUNT = 0x00000004, ++ /* check node consistency */ ++ REISER4_CHECK_NODE = 0x00000008 ++} reiser4_debug_flags; ++ ++extern int is_in_reiser4_context(void); ++ ++/* ++ * evaluate expression @e only if with reiser4 context ++ */ ++#define ON_CONTEXT(e) do { \ ++ if(is_in_reiser4_context()) { \ ++ e; \ ++ } } while(0) ++ ++/* ++ * evaluate expression @e only when within reiser4_context and debugging is ++ * on. ++ */ ++#define ON_DEBUG_CONTEXT( e ) ON_DEBUG( ON_CONTEXT( e ) ) ++ ++/* ++ * complain about unexpected function result and crash. Used in "default" ++ * branches of switch statements and alike to assert that invalid results are ++ * not silently ignored. ++ */ ++#define wrong_return_value( label, function ) \ ++ impossible( label, "wrong return value from " function ) ++ ++/* Issue different types of reiser4 messages to the console */ ++#define warning( label, format, ... ) \ ++ DCALL( KERN_WARNING, \ ++ printk, 1, label, "WARNING: " format , ## __VA_ARGS__ ) ++#define notice( label, format, ... ) \ ++ DCALL( KERN_NOTICE, \ ++ printk, 1, label, "NOTICE: " format , ## __VA_ARGS__ ) ++ ++/* mark not yet implemented functionality */ ++#define not_yet( label, format, ... ) \ ++ reiser4_panic( label, "NOT YET IMPLEMENTED: " format , ## __VA_ARGS__ ) ++ ++extern void reiser4_do_panic(const char *format, ...) ++ __attribute__ ((noreturn, format(printf, 1, 2))); ++ ++extern void reiser4_print_prefix(const char *level, int reperr, const char *mid, ++ const char *function, ++ const char *file, int lineno); ++ ++extern int preempt_point(void); ++extern void reiser4_print_stats(void); ++ ++ ++#if REISER4_DEBUG ++extern int no_counters_are_held(void); ++extern int commit_check_locks(void); ++#else ++#define no_counters_are_held() (1) ++#define commit_check_locks() (1) ++#endif ++ ++/* true if @i is power-of-two. Useful for rate-limited warnings, etc. */ ++#define IS_POW(i) \ ++({ \ ++ typeof(i) __i; \ ++ \ ++ __i = (i); \ ++ !(__i & (__i - 1)); \ ++}) ++ ++#define KERNEL_DEBUGGER (1) ++ ++#if KERNEL_DEBUGGER ++ ++extern void debugtrap(void); ++ ++/* ++ * Check condition @cond and drop into kernel debugger (kgdb) if it's true. If ++ * kgdb is not compiled in, do nothing. ++ */ ++#define DEBUGON(cond) \ ++({ \ ++ if (unlikely(cond)) \ ++ debugtrap(); \ ++}) ++#else ++#define DEBUGON(cond) noop ++#endif ++ ++/* ++ * Error code tracing facility. (Idea is borrowed from XFS code.) ++ * ++ * Suppose some strange and/or unexpected code is returned from some function ++ * (for example, write(2) returns -EEXIST). It is possible to place a ++ * breakpoint in the reiser4_write(), but it is too late here. How to find out ++ * in what particular place -EEXIST was generated first? ++ * ++ * In reiser4 all places where actual error codes are produced (that is, ++ * statements of the form ++ * ++ * return -EFOO; // (1), or ++ * ++ * result = -EFOO; // (2) ++ * ++ * are replaced with ++ * ++ * return RETERR(-EFOO); // (1a), and ++ * ++ * result = RETERR(-EFOO); // (2a) respectively ++ * ++ * RETERR() macro fills a backtrace in reiser4_context. This back-trace is ++ * printed in error and warning messages. 
Moreover, it's possible to put a
++ * conditional breakpoint in return_err (low-level function called by RETERR()
++ * to do the actual work) to break into debugger immediately when particular
++ * error happens.
++ *
++ */
++
++#if REISER4_DEBUG
++
++/*
++ * data-type to store information about where error happened ("error site").
++ */
++typedef struct err_site {
++	int code;		/* error code */
++	const char *file;	/* source file, filled by __FILE__ */
++	int line;		/* source file line, filled by __LINE__ */
++} err_site;
++
++extern void return_err(int code, const char *file, int line);
++
++/*
++ * fill &get_current_context()->err with error information.
++ */
++#define RETERR(code)				\
++({						\
++	typeof(code) __code;			\
++						\
++	__code = (code);			\
++	return_err(__code, __FILE__, __LINE__);	\
++	__code;					\
++})
++
++#else
++
++/*
++ * no-op versions of the above
++ */
++
++typedef struct err_site {
++} err_site;
++#define RETERR(code) code
++#endif
++
++#if REISER4_LARGE_KEY
++/*
++ * conditionally compile arguments only if REISER4_LARGE_KEY is on.
++ */
++#define ON_LARGE_KEY(...) __VA_ARGS__
++#else
++#define ON_LARGE_KEY(...)
++#endif
++
++/* __FS_REISER4_DEBUG_H__ */
++#endif
++
++/* Make Linus happy.
++   Local variables:
++   c-indentation-style: "K&R"
++   mode-name: "LC"
++   c-basic-offset: 8
++   tab-width: 8
++   fill-column: 120
++   End:
++*/
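The RETERR idea described above is easy to reproduce outside the kernel. A minimal userspace sketch (hypothetical names; the kernel version stores the site in the per-thread reiser4 context rather than a global) records where an error code was first generated:

#include <stdio.h>

/* Hypothetical stand-in for the per-context error site. */
static struct { int code; const char *file; int line; } err_site;

static int return_err(int code, const char *file, int line)
{
	if (code < 0) {		/* record only real errors */
		err_site.code = code;
		err_site.file = file;
		err_site.line = line;
	}
	return code;
}

/* Same shape as the kernel macro: evaluate, record the site, pass through. */
#define RETERR(code) return_err((code), __FILE__, __LINE__)

static int open_thing(int ok)
{
	return ok ? 0 : RETERR(-17);	/* -17 stands in for -EEXIST */
}

int main(void)
{
	open_thing(0);
	printf("code %d first generated at %s:%d\n",
	       err_site.code, err_site.file, err_site.line);
	return 0;
}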
+Index: linux-2.6.16/fs/reiser4/dformat.h
+===================================================================
+--- /dev/null
++++ linux-2.6.16/fs/reiser4/dformat.h
+@@ -0,0 +1,71 @@
++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
++
++/* Formats of on-disk data and conversion functions. */
++
++/* put all item formats in the files describing the particular items,
++   our model is, everything you need to do to add an item to reiser4,
++   (excepting the changes to the plugin that uses the item which go
++   into the file defining that plugin), you put into one file. */
++/* Data on disk are stored in little-endian format.
++   To declare fields of on-disk structures, use d8, d16, d32 and d64.
++   d??tocpu() and cputod??() to convert. */
++
++#if !defined( __FS_REISER4_DFORMAT_H__ )
++#define __FS_REISER4_DFORMAT_H__
++
++#include
++#include
++#include
++
++
++typedef __u8 d8;
++typedef __le16 d16;
++typedef __le32 d32;
++typedef __le64 d64;
++
++#define PACKED __attribute__((packed))
++
++/* data-type for block number */
++typedef __u64 reiser4_block_nr;
++
++/* data-type for block number on disk, disk format */
++typedef __le64 reiser4_dblock_nr;
++
++/**
++ * disk_addr_eq - compare disk addresses
++ * @b1: pointer to block number to compare
++ * @b2: pointer to block number to compare
++ *
++ * Returns true if disk addresses are the same
++ */
++static inline int disk_addr_eq(const reiser4_block_nr *b1,
++			       const reiser4_block_nr * b2)
++{
++	assert("nikita-1033", b1 != NULL);
++	assert("nikita-1266", b2 != NULL);
++
++	return !memcmp(b1, b2, sizeof *b1);
++}
++
++/* structure of master reiser4 super block */
++typedef struct reiser4_master_sb {
++	char magic[16];		/* "ReIsEr4" */
++	__le16 disk_plugin_id;	/* id of disk layout plugin */
++	__le16 blocksize;
++	char uuid[16];		/* unique id */
++	char label[16];		/* filesystem label */
++	__le64 diskmap;		/* location of the diskmap. 0 if not present */
++} reiser4_master_sb;
++
++/* __FS_REISER4_DFORMAT_H__ */
++#endif
++
++/*
++ * Local variables:
++ * c-indentation-style: "K&R"
++ * mode-name: "LC"
++ * c-basic-offset: 8
++ * tab-width: 8
++ * fill-column: 79
++ * End:
++ */
+Index: linux-2.6.16/fs/reiser4/dscale.c
+===================================================================
+--- /dev/null
++++ linux-2.6.16/fs/reiser4/dscale.c
+@@ -0,0 +1,174 @@
++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
++ * reiser4/README */
++
++/* Scalable on-disk integers */
++
++/*
++ * Various on-disk structures contain integer-like structures. Stat-data
++ * contain [yes, "data" is plural, check the dictionary] file size, link
++ * count; extent unit contains extent width etc. To accommodate the general
++ * case, enough space is reserved to keep the largest possible value. 64 bits
++ * in all cases above. But in the overwhelming majority of cases numbers
++ * actually stored in these fields will be comparatively small and reserving
++ * 8 bytes is a waste of precious disk bandwidth.
++ *
++ * Scalable integers are one way to solve this problem. dscale_write()
++ * function stores __u64 value in the given area consuming from 1 to 9 bytes,
++ * depending on the magnitude of the value supplied. dscale_read() reads value
++ * previously stored by dscale_write().
++ *
++ * dscale_write() produces a format not completely unlike UTF: two highest
++ * bits of the first byte are used to store "tag". One of 4 possible tag
++ * values is chosen depending on the number being encoded:
++ *
++ *            0 ... 0x3f               => 0            [table 1]
++ *         0x40 ... 0x3fff             => 1
++ *       0x4000 ... 0x3fffffff         => 2
++ *   0x40000000 ... 0xffffffffffffffff => 3
++ *
++ * (see dscale_range() function)
++ *
++ * Values in the range 0x40000000 ... 0xffffffffffffffff require 8 full bytes
++ * to be stored, so in this case there is no place in the first byte to store
++ * tag. For such values tag is stored in an extra 9th byte.
++ *
++ * As _highest_ bits are used for the test (which is natural) scaled integers
++ * are stored in BIG-ENDIAN format in contrast with the rest of reiser4 which
++ * uses LITTLE-ENDIAN.
++ *
++ */
++
++#include "debug.h"
++#include "dscale.h"
++
++/* return tag of scaled integer stored at @address */
++static int gettag(const unsigned char *address)
++{
++	/* tag is stored in two highest bits */
++	return (*address) >> 6;
++}
++
++/* clear tag from value. Clear tag embedded into @value. */
++static void cleartag(__u64 * value, int tag)
++{
++	/*
++	 * W-w-what ?!
++	 *
++	 * Actually, this is rather simple: @value passed here was read by
++	 * dscale_read(), converted from BIG-ENDIAN, and padded to __u64 by
++	 * zeroes. Tag is still stored in the highest (arithmetically)
++	 * non-zero bits of @value, but relative position of tag within __u64
++	 * depends on @tag.
++	 *
++	 * For example, if @tag is 0, it's stored in the two highest bits of
++	 * the lowest byte, and its offset (counting from lowest bit) is
++	 * 8 - 2 == 6 bits.
++	 *
++	 * If tag is 1, it's stored in two highest bits of 2nd lowest byte,
++	 * and its offset is (2 * 8) - 2 == 14 bits.
++	 *
++	 * See table 1 above for details.
++	 *
++	 * All these cases are captured by the formula:
++	 */
++	*value &= ~(3 << (((1 << tag) << 3) - 2));
++	/*
++	 * That is, clear two (3 == 0t11) bits at the offset
++	 *
++	 *     8 * (2 ^ tag) - 2,
++	 *
++	 * that is, two highest bits of (2 ^ tag)-th byte of @value.
++	 */
++}
++
++/* return tag for @value. See table 1 above for details.
*/
++static int dscale_range(__u64 value)
++{
++	if (value > 0x3fffffff)
++		return 3;
++	if (value > 0x3fff)
++		return 2;
++	if (value > 0x3f)
++		return 1;
++	return 0;
++}
++
++/* restore value stored at @address by dscale_write() and return number of
++ * bytes consumed */
++int dscale_read(unsigned char *address, __u64 * value)
++{
++	int tag;
++
++	/* read tag */
++	tag = gettag(address);
++	switch (tag) {
++	case 3:
++		/* In this case tag is stored in an extra byte, skip this byte
++		 * and decode value stored in the next 8 bytes.*/
++		*value = __be64_to_cpu(get_unaligned((__be64 *)(address + 1)));
++		/* worst case: 8 bytes for value itself plus one byte for
++		 * tag. */
++		return 9;
++	case 0:
++		*value = get_unaligned(address);
++		break;
++	case 1:
++		*value = __be16_to_cpu(get_unaligned((__be16 *)address));
++		break;
++	case 2:
++		*value = __be32_to_cpu(get_unaligned((__be32 *)address));
++		break;
++	default:
++		return RETERR(-EIO);
++	}
++	/* clear tag embedded into @value */
++	cleartag(value, tag);
++	/* number of bytes consumed is (2 ^ tag)---see table 1. */
++	return 1 << tag;
++}
++
++/* store @value at @address and return number of bytes consumed */
++int dscale_write(unsigned char *address, __u64 value)
++{
++	int tag;
++	int shift;
++	__be64 v;
++	unsigned char *valarr;
++
++	tag = dscale_range(value);
++	v = __cpu_to_be64(value);
++	valarr = (unsigned char *)&v;
++	shift = (tag == 3) ? 1 : 0;
++	memcpy(address + shift, valarr + sizeof v - (1 << tag), 1 << tag);
++	*address |= (tag << 6);
++	return shift + (1 << tag);
++}
++
++/* number of bytes required to store @value */
++int dscale_bytes(__u64 value)
++{
++	int bytes;
++
++	bytes = 1 << dscale_range(value);
++	if (bytes == 8)
++		++bytes;
++	return bytes;
++}
++
++/* returns true if @value and @other require the same number of bytes to be
++ * stored. Used to detect when a data structure (like stat-data) has to be
++ * expanded or contracted. */
++int dscale_fit(__u64 value, __u64 other)
++{
++	return dscale_range(value) == dscale_range(other);
++}
++
++/* Make Linus happy.
++   Local variables:
++   c-indentation-style: "K&R"
++   mode-name: "LC"
++   c-basic-offset: 8
++   tab-width: 8
++   fill-column: 120
++   scroll-step: 1
++   End:
++*/
+Index: linux-2.6.16/fs/reiser4/dscale.h
+===================================================================
+--- /dev/null
++++ linux-2.6.16/fs/reiser4/dscale.h
+@@ -0,0 +1,27 @@
++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
++ * reiser4/README */
++
++/* Scalable on-disk integers. See dscale.c for details. */
++
++#if !defined( __FS_REISER4_DSCALE_H__ )
++#define __FS_REISER4_DSCALE_H__
++
++#include "dformat.h"
++
++extern int dscale_read(unsigned char *address, __u64 * value);
++extern int dscale_write(unsigned char *address, __u64 value);
++extern int dscale_bytes(__u64 value);
++extern int dscale_fit(__u64 value, __u64 other);
++
++/* __FS_REISER4_DSCALE_H__ */
++#endif
++
++/* Make Linus happy.
++   Local variables:
++   c-indentation-style: "K&R"
++   mode-name: "LC"
++   c-basic-offset: 8
++   tab-width: 8
++   fill-column: 120
++   End:
++*/
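To make the tag scheme from table 1 concrete, here is a small userspace round-trip of the same encoding (a sketch with hypothetical helper names; it uses plain shifts instead of the kernel's unaligned-access and byte-order helpers):

#include <stdio.h>
#include <string.h>

typedef unsigned long long u64;

static int range(u64 v)	/* same thresholds as dscale_range() */
{
	return v > 0x3fffffff ? 3 : v > 0x3fff ? 2 : v > 0x3f ? 1 : 0;
}

/* Encode @v big-endian with the tag in the two top bits (tag 3: extra byte). */
static int put(unsigned char *a, u64 v)
{
	int tag = range(v), n = 1 << tag, shift = (tag == 3) ? 1 : 0, i;

	for (i = 0; i < n; i++)
		a[shift + i] = (unsigned char)(v >> (8 * (n - 1 - i)));
	a[0] |= (unsigned char)(tag << 6);
	return shift + n;
}

static int get(const unsigned char *a, u64 *v)
{
	int tag = a[0] >> 6, n = 1 << tag, shift = (tag == 3) ? 1 : 0, i;

	*v = 0;
	for (i = 0; i < n; i++)
		*v = (*v << 8) | a[shift + i];
	if (tag != 3)
		*v &= ~(3ULL << (8 * n - 2));	/* strip the embedded tag */
	return shift + n;
}

int main(void)
{
	u64 in[] = { 0x3f, 0x40, 0x3fff, 0x4000, 0x40000000ULL }, out;
	unsigned char buf[9];
	size_t i;

	for (i = 0; i < sizeof in / sizeof *in; i++) {
		int len = put(memset(buf, 0, sizeof buf), in[i]);

		get(buf, &out);
		printf("%#llx -> %d byte(s), back %#llx\n", in[i], len, out);
	}
	return 0;	/* lengths printed: 1, 2, 2, 4, 9 */
}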
+Index: linux-2.6.16/fs/reiser4/entd.c
+===================================================================
+--- /dev/null
++++ linux-2.6.16/fs/reiser4/entd.c
+@@ -0,0 +1,356 @@
++/* Copyright 2003, 2004 by Hans Reiser, licensing governed by
++ * reiser4/README */
++
++/* Ent daemon. */
++
++#include "debug.h"
++#include "txnmgr.h"
++#include "tree.h"
++#include "entd.h"
++#include "super.h"
++#include "context.h"
++#include "reiser4.h"
++#include "vfs_ops.h"
++#include "page_cache.h"
++#include "inode.h"
++
++#include			/* struct task_struct */
++#include
++#include
++#include
++#include			/* INITIAL_JIFFIES */
++#include			/* bdi_write_congested */
++#include
++#include
++
++#define LLONG_MAX ((long long)(~0ULL>>1))
++
++#define DEF_PRIORITY 12
++#define MAX_ENTD_ITERS 10
++
++static void entd_flush(struct super_block *, struct wbq *);
++static int entd(void *arg);
++
++/*
++ * set ->comm field of ent thread to make its state visible to the user level
++ */
++#define entd_set_comm(state)					\
++	snprintf(current->comm, sizeof(current->comm),		\
++		 "ent:%s%s", super->s_id, (state))
++
++/**
++ * init_entd - initialize entd context and start kernel daemon
++ * @super: super block to start ent thread for
++ *
++ * Creates entd contexts, starts kernel thread and waits until it
++ * initializes.
++ */
++int init_entd(struct super_block *super)
++{
++	entd_context *ctx;
++
++	assert("nikita-3104", super != NULL);
++
++	ctx = get_entd_context(super);
++
++	memset(ctx, 0, sizeof *ctx);
++	spin_lock_init(&ctx->guard);
++	init_waitqueue_head(&ctx->wait);
++#if REISER4_DEBUG
++	INIT_LIST_HEAD(&ctx->flushers_list);
++#endif
++	/* lists of writepage requests */
++	INIT_LIST_HEAD(&ctx->todo_list);
++	INIT_LIST_HEAD(&ctx->done_list);
++	/* start entd */
++	ctx->tsk = kthread_run(entd, super, "ent:%s", super->s_id);
++	if (IS_ERR(ctx->tsk))
++		return PTR_ERR(ctx->tsk);
++	return 0;
++}
++
++static void __put_wbq(entd_context *ent, struct wbq *rq)
++{
++	up(&rq->sem);
++}
++
++/* ent should be locked */
++static struct wbq *__get_wbq(entd_context * ent)
++{
++	struct wbq *wbq;
++
++	if (list_empty_careful(&ent->todo_list))
++		return NULL;
++
++	ent->nr_todo_reqs --;
++	wbq = list_entry(ent->todo_list.next, struct wbq, link);
++	list_del_init(&wbq->link);
++	return wbq;
++}
++
++static void wakeup_all_wbq(entd_context * ent)
++{
++	struct wbq *rq;
++
++	spin_lock(&ent->guard);
++	while ((rq = __get_wbq(ent)) != NULL)
++		__put_wbq(ent, rq);
++	spin_unlock(&ent->guard);
++}
++
++/* ent thread function */
++static int entd(void *arg)
++{
++	struct super_block *super;
++	entd_context *ent;
++	int done = 0;
++
++	super = arg;
++	/* do_fork() just copies task_struct into the new
++	   thread. ->fs_context shouldn't be copied of course. This shouldn't
++	   be a problem for the rest of the code though.
++	 */
++	current->journal_info = NULL;
++
++	ent = get_entd_context(super);
++
++	while (!done) {
++		try_to_freeze();
++
++		spin_lock(&ent->guard);
++		while (ent->nr_todo_reqs != 0) {
++			struct wbq *rq, *next;
++
++			assert("", list_empty_careful(&ent->done_list));
++
++			/* take request from the queue head */
++			rq = __get_wbq(ent);
++			assert("", rq != NULL);
++			ent->cur_request = rq;
++			spin_unlock(&ent->guard);
++
++			entd_set_comm("!");
++			entd_flush(super, rq);
++
++			iput(rq->mapping->host);
++			up(&(rq->sem));
++
++			/*
++			 * wakeup all requestors and iput their inodes
++			 */
++			spin_lock(&ent->guard);
++			list_for_each_entry_safe(rq, next, &ent->done_list, link) {
++				list_del_init(&(rq->link));
++				ent->nr_done_reqs --;
++				spin_unlock(&ent->guard);
++
++				assert("", rq->written == 1);
++				iput(rq->mapping->host);
++				up(&(rq->sem));
++				spin_lock(&ent->guard);
++			}
++		}
++		spin_unlock(&ent->guard);
++
++		entd_set_comm(".");
++
++		{
++			DEFINE_WAIT(__wait);
++
++			do {
++				prepare_to_wait(&ent->wait, &__wait, TASK_INTERRUPTIBLE);
++				if (kthread_should_stop()) {
++					done = 1;
++					break;
++				}
++				if (ent->nr_todo_reqs != 0)
++					break;
++				schedule();
++			} while (0);
++			finish_wait(&ent->wait, &__wait);
++		}
++	}
++	spin_lock(&ent->guard);
++	BUG_ON(ent->nr_todo_reqs != 0);
++	spin_unlock(&ent->guard);
++	wakeup_all_wbq(ent);
++	return 0;
++}
++
++/**
++ * done_entd - stop entd kernel thread
++ * @super: super block to stop ent thread for
++ *
++ * It is called on umount. Sends stop signal to entd and waits until it
++ * handles it.
++ */
++void done_entd(struct super_block *super)
++{
++	entd_context *ent;
++
++	assert("nikita-3103", super != NULL);
++
++	ent = get_entd_context(super);
++	assert("zam-1055", ent->tsk != NULL);
++	kthread_stop(ent->tsk);
++}
++
++/* called at the beginning of jnode_flush to register flusher thread with ent
++ * daemon */
++void enter_flush(struct super_block *super)
++{
++	entd_context *ent;
++
++	assert("zam-1029", super != NULL);
++	ent = get_entd_context(super);
++
++	assert("zam-1030", ent != NULL);
++
++	spin_lock(&ent->guard);
++	ent->flushers++;
++#if REISER4_DEBUG
++	list_add(&get_current_context()->flushers_link, &ent->flushers_list);
++#endif
++	spin_unlock(&ent->guard);
++}
++
++/* called at the end of jnode_flush */
++void leave_flush(struct super_block *super)
++{
++	entd_context *ent;
++	int wake_up_ent;
++
++	assert("zam-1027", super != NULL);
++	ent = get_entd_context(super);
++
++	assert("zam-1028", ent != NULL);
++
++	spin_lock(&ent->guard);
++	ent->flushers--;
++	wake_up_ent = (ent->flushers == 0 && ent->nr_todo_reqs != 0);
++#if REISER4_DEBUG
++	list_del_init(&get_current_context()->flushers_link);
++#endif
++	spin_unlock(&ent->guard);
++	if (wake_up_ent)
++		wake_up(&ent->wait);
++}
++
++#define ENTD_CAPTURE_APAGE_BURST SWAP_CLUSTER_MAX
++
++static void entd_flush(struct super_block *super, struct wbq *rq)
++{
++	reiser4_context ctx;
++	int tmp;
++
++	init_stack_context(&ctx, super);
++	ctx.entd = 1;
++	ctx.gfp_mask = GFP_NOFS;
++
++	rq->wbc->start = rq->page->index << PAGE_CACHE_SHIFT;
++	rq->wbc->end = (rq->page->index + ENTD_CAPTURE_APAGE_BURST) << PAGE_CACHE_SHIFT;
++	tmp = rq->wbc->nr_to_write;
++	rq->mapping->a_ops->writepages(rq->mapping, rq->wbc);
++
++	if (rq->wbc->nr_to_write > 0) {
++		rq->wbc->start = 0;
++		rq->wbc->end = LLONG_MAX;
++		generic_sync_sb_inodes(super, rq->wbc);
++	}
++	rq->wbc->nr_to_write = ENTD_CAPTURE_APAGE_BURST;
++	writeout(super, rq->wbc);
++
++	context_set_commit_async(&ctx);
++	reiser4_exit_context(&ctx);
++}
++
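The hand-off that drives entd_flush() is a classic pattern: a caller enqueues a write-back request, wakes the daemon, and sleeps on the request's semaphore until the daemon has processed it. A minimal pthreads rendition of that hand-off (illustrative only; the kernel uses its own wait queues and semaphores, and a real queue would be per-superblock):

#include <pthread.h>
#include <semaphore.h>
#include <stdio.h>

/* One queued request: the caller sleeps on .done until the worker is through
   (mirrors struct wbq and its semaphore). */
struct req {
	int page;
	sem_t done;
	struct req *next;
};

static pthread_mutex_t guard = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t more = PTHREAD_COND_INITIALIZER;
static struct req *todo;

static void *worker(void *arg)
{
	(void)arg;
	for (;;) {
		struct req *rq;

		pthread_mutex_lock(&guard);
		while (todo == NULL)
			pthread_cond_wait(&more, &guard);
		rq = todo;
		todo = rq->next;
		pthread_mutex_unlock(&guard);

		printf("worker: writing page %d\n", rq->page);
		sem_post(&rq->done);	/* like up(&rq->sem) in entd() */
	}
	return NULL;
}

int main(void)	/* build with -lpthread */
{
	pthread_t tid;
	struct req rq;

	rq.page = 42;
	rq.next = NULL;
	sem_init(&rq.done, 0, 0);
	pthread_create(&tid, NULL, worker, NULL);

	pthread_mutex_lock(&guard);	/* queue request, wake the worker */
	rq.next = todo;
	todo = &rq;
	pthread_cond_signal(&more);
	pthread_mutex_unlock(&guard);

	sem_wait(&rq.done);	/* like down(&rq.sem) in write_page_by_ent() */
	printf("caller: page %d written\n", rq.page);
	return 0;
}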
++/**
++ * write_page_by_ent - ask entd thread to flush this page as part of slum
++ * @page: page to be written
++ * @wbc: writeback control passed to reiser4_writepage
++ *
++ * Creates a request, puts it on entd list of requests, wakes up entd if
++ * necessary, waits until entd completes with the request.
++ */
++int write_page_by_ent(struct page *page, struct writeback_control *wbc)
++{
++	struct super_block *sb;
++	struct inode *inode;
++	entd_context *ent;
++	struct wbq rq;
++
++	assert("", PageLocked(page));
++	assert("", page->mapping != NULL);
++
++	sb = page->mapping->host->i_sb;
++	ent = get_entd_context(sb);
++	assert("", ent && ent->done == 0);
++
++	/*
++	 * we are going to unlock page and ask ent thread to write the
++	 * page. Re-dirty page before unlocking so that if ent thread fails to
++	 * write it - it will remain dirty
++	 */
++	set_page_dirty_internal(page);
++
++	/*
++	 * pin inode in memory, unlock page, entd_flush will iput. We can not
++	 * iput here because we can not allow delete_inode to be called here
++	 */
++	inode = igrab(page->mapping->host);
++	unlock_page(page);
++	if (inode == NULL)
++		/* inode is getting freed */
++		return 0;
++
++	/* init wbq */
++	INIT_LIST_HEAD(&rq.link);
++	rq.magic = WBQ_MAGIC;
++	rq.wbc = wbc;
++	rq.page = page;
++	rq.mapping = inode->i_mapping;
++	rq.node = NULL;
++	rq.written = 0;
++	sema_init(&rq.sem, 0);
++
++	/* add request to entd's list of writepage requests */
++	spin_lock(&ent->guard);
++	ent->nr_todo_reqs++;
++	list_add_tail(&rq.link, &ent->todo_list);
++	if (ent->nr_todo_reqs == 1)
++		wake_up(&ent->wait);
++
++	spin_unlock(&ent->guard);
++
++	/* wait until entd finishes */
++	down(&rq.sem);
++
++	/*
++	 * spin until the entd thread that did up(&rq.sem) no longer needs rq
++	 */
++	spin_lock(&ent->guard);
++	spin_unlock(&ent->guard);
++
++	if (rq.written)
++		/* Eventually ENTD has written the page to disk. */
++		return 0;
++	return 0;
++}
++
++int wbq_available(void)
++{
++	struct super_block *sb = reiser4_get_current_sb();
++	entd_context *ent = get_entd_context(sb);
++	return ent->nr_todo_reqs;
++}
++
++/*
++ * Local variables:
++ * c-indentation-style: "K&R"
++ * mode-name: "LC"
++ * c-basic-offset: 8
++ * tab-width: 8
++ * fill-column: 79
++ * End:
++ */
+Index: linux-2.6.16/fs/reiser4/entd.h
+===================================================================
+--- /dev/null
++++ linux-2.6.16/fs/reiser4/entd.h
+@@ -0,0 +1,90 @@
++/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */
++
++/* Ent daemon. */
++
++#ifndef __ENTD_H__
++#define __ENTD_H__
++
++#include "context.h"
++
++#include
++#include
++#include
++#include
++#include			/* for struct task_struct */
++
++#define WBQ_MAGIC 0x7876dc76
++
++/* write-back request. */
++struct wbq {
++	int magic;
++	struct list_head link;	/* list head of this list is in entd context */
++	struct writeback_control *wbc;
++	struct page *page;
++	struct address_space *mapping;
++	struct semaphore sem;
++	jnode *node;		/* set if ent thread captured requested page */
++	int written;		/* set if ent thread wrote requested page */
++};
++
++/* ent-thread context. This is used to synchronize starting/stopping ent
++ * threads. */
++typedef struct entd_context {
++	/* wait queue that ent thread waits on for more work. It's
++	 * signaled by write_page_by_ent(). */
++	wait_queue_head_t wait;
++	/* spinlock protecting other fields */
++	spinlock_t guard;
++	/* ent thread */
++	struct task_struct *tsk;
++	/* set to indicate that ent thread should leave.
++	 */
++	int done;
++	/* counter of active flushers */
++	int flushers;
++	/*
++	 * when reiser4_writepage asks entd to write a page - it adds struct
++	 * wbq to this list
++	 */
++	struct list_head todo_list;
++	/* number of elements on the above list */
++	int nr_todo_reqs;
++
++	struct wbq *cur_request;
++	/*
++	 * when entd writes a page it moves the write-back request from
++	 * todo_list to done_list. This list is used at the end of the entd
++	 * iteration to wake up requestors and iput inodes.
++	 */
++	struct list_head done_list;
++	/* number of elements on the above list */
++	int nr_done_reqs;
++
++#if REISER4_DEBUG
++	/* list of all active flushers */
++	struct list_head flushers_list;
++#endif
++} entd_context;
++
++extern int init_entd(struct super_block *);
++extern void done_entd(struct super_block *);
++
++extern void enter_flush(struct super_block *);
++extern void leave_flush(struct super_block *);
++
++extern int write_page_by_ent(struct page *, struct writeback_control *);
++extern int wbq_available(void);
++extern void ent_writes_page(struct super_block *, struct page *);
++
++extern jnode *get_jnode_by_wbq(struct super_block *, struct wbq *);
++/* __ENTD_H__ */
++#endif
++
++/* Make Linus happy.
++   Local variables:
++   c-indentation-style: "K&R"
++   mode-name: "LC"
++   c-basic-offset: 8
++   tab-width: 8
++   fill-column: 120
++   End:
++*/
+Index: linux-2.6.16/fs/reiser4/eottl.c
+===================================================================
+--- /dev/null
++++ linux-2.6.16/fs/reiser4/eottl.c
+@@ -0,0 +1,510 @@
++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
++
++#include "forward.h"
++#include "debug.h"
++#include "key.h"
++#include "coord.h"
++#include "plugin/item/item.h"
++#include "plugin/node/node.h"
++#include "znode.h"
++#include "block_alloc.h"
++#include "tree_walk.h"
++#include "tree_mod.h"
++#include "carry.h"
++#include "tree.h"
++#include "super.h"
++
++#include <linux/types.h>	/* for __u?? */
++
++/*
++ * Extents on the twig level (EOTTL) handling.
++ *
++ * EOTTL poses some problems for tree traversal that are best explained by
++ * example.
++ *
++ * Suppose we have block B1 on the twig level with the following items:
++ *
++ * 0. internal item I0 with key (0:0:0:0) (locality, key-type, object-id,
++ * offset)
++ * 1. extent item E1 with key (1:4:100:0), having 10 blocks of 4k each
++ * 2. internal item I2 with key (10:0:0:0)
++ *
++ * We are trying to insert an item with key (5:0:0:0). Lookup finds node B1,
++ * and then intra-node lookup is done. This lookup finishes on E1, because the
++ * key we are looking for is larger than the key of E1 and smaller than the
++ * key of I2.
++ *
++ * Here the search is stuck.
++ *
++ * After some thought it is clear what is wrong here: extents on the twig
++ * level break a basic property of the *search* tree (on the pretext that
++ * they restore the property of a balanced tree).
++ *
++ * Said property is the following: if in an internal node of the search tree
++ * we have [ ... Key1 Pointer Key2 ... ], then all data that are or will be
++ * keyed in the tree with a Key such that Key1 <= Key < Key2 are accessible
++ * through the Pointer.
++ *
++ * This is not true when Pointer is an Extent-Pointer, simply because an
++ * extent cannot expand indefinitely to the right to include any item with
++ *
++ *   Key1 <= Key <= Key2.
++ *
++ * For example, our E1 extent is only responsible for the data with keys
++ *
++ *   (1:4:100:0) <= key <= (1:4:100:0xffffffffffffffff), and
++ *
++ * so, key range
++ *
++ *   ( (1:4:100:0xffffffffffffffff), (10:0:0:0) )
++ *
++ * is orphaned: there is no way to get there from the tree root.
++ *
++ * In other words, extent pointers are different from normal child pointers
++ * as far as the search tree is concerned, and this creates such problems.
++ *
++ * A possible solution for this problem is to insert our item into the node
++ * pointed to by I2. There are some problems, though:
++ *
++ * (1) I2 can be in a different node.
++ * (2) E1 can be immediately followed by another extent E2.
++ *
++ * (1) is solved by calling reiser4_get_right_neighbor() and accounting
++ * for locks/coords as necessary.
++ *
++ * (2) is more complex. The solution here is to insert a new empty leaf node
++ * and insert an internal item between E1 and E2 pointing to said leaf node.
++ * This is further complicated by the possibility that E2 is in a different
++ * node, etc.
++ *
++ * Problems:
++ *
++ * (1) if there was an internal item I2 immediately on the right of an extent
++ * E1 and we decided to insert a new item S1 into the node N2 pointed to by
++ * I2, then the key of S1 will be less than the smallest key in N2. Normally,
++ * the search checks that the key we are looking for is in the range of keys
++ * covered by the node in which it is being looked up. To work around this
++ * situation, while preserving a useful consistency check, a new flag
++ * CBK_TRUST_DK was added to the cbk flags bitmask. This flag is
++ * automatically set on entrance to coord_by_key() and is only cleared when
++ * we are about to enter the situation described above.
++ *
++ * (2) If extent E1 is immediately followed by another extent E2 and we are
++ * searching for a key that is between E1 and E2, we only have to insert a
++ * new empty leaf node when coord_by_key was called for insertion, rather
++ * than just for lookup. To distinguish these cases, a new flag
++ * CBK_FOR_INSERT was added to the cbk flags bitmask. This flag is
++ * automatically set by the coord_by_key calls performed by insert_by_key()
++ * and friends.
++ *
++ * (3) Insertion of a new empty leaf node (possibly) requires balancing. In
++ * any case it requires modification of node content, which is only possible
++ * under write lock. It may well happen that we only have a read lock on the
++ * node where the new internal pointer is to be inserted (common case: lookup
++ * of a non-existent stat-data that falls between two extents). If only a
++ * read lock is held, tree traversal is restarted with lock_level modified so
++ * that the next time we hit this problem, a write lock will be held. Once we
++ * have the write lock, balancing is performed.
++ */
++
++/**
++ * is_next_item_internal - check whether next item is internal
++ * @coord: coordinate of extent item in twig node
++ * @key: search key
++ * @lh: twig node lock handle
++ *
++ * Looks at the unit next to @coord. If it is an internal one, 1 is returned
++ * and @coord is set to that unit. If that unit is in the right neighbor, @lh
++ * is moved to that node and @coord is set to its first unit. If the next item
++ * is not internal or does not exist then 0 is returned and @coord and @lh are
++ * left unchanged. 2 is returned if a search restart has to be done.
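++ *
++ * As an illustrative sketch (mirroring the actual dispatch in
++ * handle_eottl() below), a caller is expected to act on the return value
++ * roughly like this:
++ *
++ *	result = is_next_item_internal(coord, key, lh);
++ *	if (result < 0)
++ *		propagate error;
++ *	else if (result == 0)
++ *		insert an empty leaf between the two extents;
++ *	else if (result == 1)
++ *		descend through the internal item now at @coord;
++ *	else
++ *		restart the search (result == 2, LOOKUP_REST);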
++ */
++static int
++is_next_item_internal(coord_t *coord, const reiser4_key *key,
++		      lock_handle *lh)
++{
++	coord_t next;
++	lock_handle rn;
++	int result;
++
++	coord_dup(&next, coord);
++	if (coord_next_unit(&next) == 0) {
++		/* next unit is in this node */
++		if (item_is_internal(&next)) {
++			coord_dup(coord, &next);
++			return 1;
++		}
++		assert("vs-3", item_is_extent(&next));
++		return 0;
++	}
++
++	/*
++	 * next unit either does not exist or is in the right neighbor. If it
++	 * is in the right neighbor we have to check the right delimiting key
++	 * because a concurrent thread could get there first and insert an
++	 * item with a key smaller than @key
++	 */
++	read_lock_dk(current_tree);
++	result = keycmp(key, znode_get_rd_key(coord->node));
++	read_unlock_dk(current_tree);
++	assert("vs-6", result != EQUAL_TO);
++	if (result == GREATER_THAN)
++		return 2;
++
++	/* lock right neighbor */
++	init_lh(&rn);
++	result = reiser4_get_right_neighbor(&rn, coord->node,
++					    znode_is_wlocked(coord->node) ?
++					    ZNODE_WRITE_LOCK : ZNODE_READ_LOCK,
++					    GN_CAN_USE_UPPER_LEVELS);
++	if (result == -E_NO_NEIGHBOR) {
++		/* we are on the rightmost edge of the tree */
++		done_lh(&rn);
++		return 0;
++	}
++
++	if (result) {
++		assert("vs-4", result < 0);
++		done_lh(&rn);
++		return result;
++	}
++
++	/*
++	 * check whether a concurrent thread managed to insert an item with a
++	 * key smaller than @key
++	 */
++	read_lock_dk(current_tree);
++	result = keycmp(key, znode_get_ld_key(rn.node));
++	read_unlock_dk(current_tree);
++	assert("vs-6", result != EQUAL_TO);
++	if (result == GREATER_THAN) {
++		done_lh(&rn);
++		return 2;
++	}
++
++	result = zload(rn.node);
++	if (result) {
++		assert("vs-5", result < 0);
++		done_lh(&rn);
++		return result;
++	}
++
++	coord_init_first_unit(&next, rn.node);
++	if (item_is_internal(&next)) {
++		/*
++		 * next unit is in the right neighbor and it is a unit of an
++		 * internal item. Unlock coord->node. Move @lh to the right
++		 * neighbor. @coord is set to the first unit of the right
++		 * neighbor.
++		 */
++		coord_dup(coord, &next);
++		zrelse(rn.node);
++		done_lh(lh);
++		move_lh(lh, &rn);
++		return 1;
++	}
++
++	/*
++	 * next unit is a unit of an extent item. Return without changing @lh
++	 * and @coord.
++	 */
++	assert("vs-6", item_is_extent(&next));
++	zrelse(rn.node);
++	done_lh(&rn);
++	return 0;
++}
++
++/**
++ * rd_key - calculate key of an item next to the given one
++ * @coord: position in a node
++ * @key: storage for result key
++ *
++ * @coord is set between items or after the last item in a node. Calculate the
++ * key of the item to the right of @coord.
++ */
++static reiser4_key *rd_key(const coord_t *coord, reiser4_key *key)
++{
++	coord_t dup;
++
++	assert("nikita-2281", coord_is_between_items(coord));
++	coord_dup(&dup, coord);
++
++	if (coord_set_to_right(&dup) == 0)
++		/* next item is in this node. Return its key. */
++		unit_key_by_coord(&dup, key);
++	else {
++		/*
++		 * next item either does not exist or is in the right
++		 * neighbor. Return the znode's right delimiting key.
++		 */
++		read_lock_dk(current_tree);
++		*key = *znode_get_rd_key(coord->node);
++		read_unlock_dk(current_tree);
++	}
++	return key;
++}
++
++/**
++ * add_empty_leaf - insert empty leaf between two extents
++ * @insert_coord: position in twig node between two extents
++ * @lh: twig node lock handle
++ * @key: left delimiting key of new node
++ * @rdkey: right delimiting key of new node
++ *
++ * Inserts an empty leaf node between two extent items. It is necessary when
++ * we have to insert an item on the leaf level between two extents (items on
++ * the twig level).
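++ *
++ * Illustratively, the twig node goes from
++ *
++ *	[ ... | E1 | E2 | ... ]
++ * to
++ *	[ ... | E1 | I(new empty leaf) | E2 | ... ]
++ *
++ * where the internal item I carries @key and the new leaf gets the
++ * delimiting keys [@key, @rdkey).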
++ */ ++static int ++add_empty_leaf(coord_t *insert_coord, lock_handle *lh, ++ const reiser4_key *key, const reiser4_key *rdkey) ++{ ++ int result; ++ carry_pool *pool; ++ carry_level *todo; ++ reiser4_item_data *item; ++ carry_insert_data *cdata; ++ carry_op *op; ++ znode *node; ++ reiser4_tree *tree; ++ ++ assert("vs-49827", znode_contains_key_lock(insert_coord->node, key)); ++ tree = znode_get_tree(insert_coord->node); ++ node = new_node(insert_coord->node, LEAF_LEVEL); ++ if (IS_ERR(node)) ++ return PTR_ERR(node); ++ ++ /* setup delimiting keys for node being inserted */ ++ write_lock_dk(tree); ++ znode_set_ld_key(node, key); ++ znode_set_rd_key(node, rdkey); ++ ON_DEBUG(node->creator = current); ++ ON_DEBUG(node->first_key = *key); ++ write_unlock_dk(tree); ++ ++ ZF_SET(node, JNODE_ORPHAN); ++ ++ /* ++ * allocate carry_pool, 3 carry_level-s, reiser4_item_data and ++ * carry_insert_data ++ */ ++ pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*todo) + ++ sizeof(*item) + sizeof(*cdata)); ++ if (IS_ERR(pool)) ++ return PTR_ERR(pool); ++ todo = (carry_level *) (pool + 1); ++ init_carry_level(todo, pool); ++ ++ item = (reiser4_item_data *) (todo + 3); ++ cdata = (carry_insert_data *) (item + 1); ++ ++ op = post_carry(todo, COP_INSERT, insert_coord->node, 0); ++ if (!IS_ERR(op)) { ++ cdata->coord = insert_coord; ++ cdata->key = key; ++ cdata->data = item; ++ op->u.insert.d = cdata; ++ op->u.insert.type = COPT_ITEM_DATA; ++ build_child_ptr_data(node, item); ++ item->arg = NULL; ++ /* have @insert_coord to be set at inserted item after ++ insertion is done */ ++ todo->track_type = CARRY_TRACK_CHANGE; ++ todo->tracked = lh; ++ ++ result = carry(todo, NULL); ++ if (result == 0) { ++ /* ++ * pin node in memory. This is necessary for ++ * znode_make_dirty() below. ++ */ ++ result = zload(node); ++ if (result == 0) { ++ lock_handle local_lh; ++ ++ /* ++ * if we inserted new child into tree we have ++ * to mark it dirty so that flush will be able ++ * to process it. ++ */ ++ init_lh(&local_lh); ++ result = longterm_lock_znode(&local_lh, node, ++ ZNODE_WRITE_LOCK, ++ ZNODE_LOCK_LOPRI); ++ if (result == 0) { ++ znode_make_dirty(node); ++ ++ /* ++ * when internal item pointing to @node ++ * was inserted into twig node ++ * create_hook_internal did not connect ++ * it properly because its right ++ * neighbor was not known. Do it ++ * here ++ */ ++ write_lock_tree(tree); ++ assert("nikita-3312", ++ znode_is_right_connected(node)); ++ assert("nikita-2984", ++ node->right == NULL); ++ ZF_CLR(node, JNODE_RIGHT_CONNECTED); ++ write_unlock_tree(tree); ++ result = ++ connect_znode(insert_coord, node); ++ if (result == 0) ++ ON_DEBUG(check_dkeys(node)); ++ ++ done_lh(lh); ++ move_lh(lh, &local_lh); ++ assert("vs-1676", node_is_empty(node)); ++ coord_init_first_unit(insert_coord, ++ node); ++ } else { ++ warning("nikita-3136", ++ "Cannot lock child"); ++ } ++ done_lh(&local_lh); ++ zrelse(node); ++ } ++ } ++ } else ++ result = PTR_ERR(op); ++ zput(node); ++ done_carry_pool(pool); ++ return result; ++} ++ ++/** ++ * handle_eottl - handle extent-on-the-twig-level cases in tree traversal ++ * @h: search handle ++ * @outcome: flag saying whether search has to restart or is done ++ * ++ * Handles search on twig level. If this function completes search itself then ++ * it returns 1. If search has to go one level down then 0 is returned. If ++ * error happens then LOOKUP_DONE is returned via @outcome and error code is saved ++ * in @h->result. 
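++ *
++ * A sketch of the expected call pattern (the caller is the tree traversal
++ * code, which is not part of this hunk; names here are illustrative):
++ *
++ *	if (handle_eottl(h, &outcome)) {
++ *		handled here: outcome is LOOKUP_DONE or LOOKUP_REST;
++ *		return outcome;
++ *	}
++ *	otherwise descend one level as usual;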
++ */ ++int handle_eottl(cbk_handle *h, int *outcome) ++{ ++ int result; ++ reiser4_key key; ++ coord_t *coord; ++ ++ coord = h->coord; ++ ++ if (h->level != TWIG_LEVEL || ++ (coord_is_existing_item(coord) && item_is_internal(coord))) { ++ /* Continue to traverse tree downward. */ ++ return 0; ++ } ++ ++ /* ++ * make sure that @h->coord is set to twig node and that it is either ++ * set to extent item or after extent item ++ */ ++ assert("vs-356", h->level == TWIG_LEVEL); ++ assert("vs-357", ( { ++ coord_t lcoord; ++ coord_dup(&lcoord, coord); ++ check_me("vs-733", coord_set_to_left(&lcoord) == 0); ++ item_is_extent(&lcoord); ++ } ++ )); ++ ++ if (*outcome == NS_FOUND) { ++ /* we have found desired key on twig level in extent item */ ++ h->result = CBK_COORD_FOUND; ++ *outcome = LOOKUP_DONE; ++ return 1; ++ } ++ ++ if (!(h->flags & CBK_FOR_INSERT)) { ++ /* tree traversal is not for insertion. Just return ++ CBK_COORD_NOTFOUND. */ ++ h->result = CBK_COORD_NOTFOUND; ++ *outcome = LOOKUP_DONE; ++ return 1; ++ } ++ ++ /* take a look at the item to the right of h -> coord */ ++ result = is_next_item_internal(coord, h->key, h->active_lh); ++ if (unlikely(result < 0)) { ++ h->error = "get_right_neighbor failed"; ++ h->result = result; ++ *outcome = LOOKUP_DONE; ++ return 1; ++ } ++ if (result == 0) { ++ /* ++ * item to the right is also an extent one. Allocate a new node ++ * and insert pointer to it after item h -> coord. ++ * ++ * This is a result of extents being located at the twig ++ * level. For explanation, see comment just above ++ * is_next_item_internal(). ++ */ ++ znode *loaded; ++ ++ if (cbk_lock_mode(h->level, h) != ZNODE_WRITE_LOCK) { ++ /* ++ * we got node read locked, restart coord_by_key to ++ * have write lock on twig level ++ */ ++ h->lock_level = TWIG_LEVEL; ++ h->lock_mode = ZNODE_WRITE_LOCK; ++ *outcome = LOOKUP_REST; ++ return 1; ++ } ++ ++ loaded = coord->node; ++ result = ++ add_empty_leaf(coord, h->active_lh, h->key, ++ rd_key(coord, &key)); ++ if (result) { ++ h->error = "could not add empty leaf"; ++ h->result = result; ++ *outcome = LOOKUP_DONE; ++ return 1; ++ } ++ /* added empty leaf is locked (h->active_lh), its parent node ++ is unlocked, h->coord is set as EMPTY */ ++ assert("vs-13", coord->between == EMPTY_NODE); ++ assert("vs-14", znode_is_write_locked(coord->node)); ++ assert("vs-15", ++ WITH_DATA(coord->node, node_is_empty(coord->node))); ++ assert("vs-16", jnode_is_leaf(ZJNODE(coord->node))); ++ assert("vs-17", coord->node == h->active_lh->node); ++ *outcome = LOOKUP_DONE; ++ h->result = CBK_COORD_NOTFOUND; ++ return 1; ++ } else if (result == 1) { ++ /* ++ * this is special case mentioned in the comment on ++ * tree.h:cbk_flags. We have found internal item immediately on ++ * the right of extent, and we are going to insert new item ++ * there. Key of item we are going to insert is smaller than ++ * leftmost key in the node pointed to by said internal item ++ * (otherwise search wouldn't come to the extent in the first ++ * place). ++ * ++ * This is a result of extents being located at the twig ++ * level. For explanation, see comment just above ++ * is_next_item_internal(). 
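++		 *
++		 * (Clearing CBK_TRUST_DK below disables, for this descent
++		 * only, the delimiting-key sanity check that would otherwise
++		 * reject a search key smaller than the leftmost key of the
++		 * child node; see the EOTTL comment at the top of this file.)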
++		 */
++		h->flags &= ~CBK_TRUST_DK;
++	} else {
++		assert("vs-8", result == 2);
++		*outcome = LOOKUP_REST;
++		return 1;
++	}
++	assert("vs-362", WITH_DATA(coord->node, item_is_internal(coord)));
++	return 0;
++}
++
++/*
++ * Local variables:
++ * c-indentation-style: "K&R"
++ * mode-name: "LC"
++ * c-basic-offset: 8
++ * tab-width: 8
++ * fill-column: 120
++ * scroll-step: 1
++ * End:
++ */
+Index: linux-2.6.16/fs/reiser4/estimate.c
+===================================================================
+--- /dev/null
++++ linux-2.6.16/fs/reiser4/estimate.c
+@@ -0,0 +1,111 @@
++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
++
++#include "debug.h"
++#include "dformat.h"
++#include "tree.h"
++#include "carry.h"
++#include "inode.h"
++#include "plugin/cluster.h"
++#include "plugin/item/ctail.h"
++
++/* this returns how many nodes might get dirty and added if @children nodes
++   are dirtied
++
++   The number of internal nodes which will get dirty or get allocated we
++   estimate as roughly 10% of the children (see ten_percent below: the code
++   uses 103/1024) + 1 balancing. 1 balancing is 2 neighbours, 2 new blocks
++   and the current block on the leaf level; 2 neighbour nodes + the current
++   (or 1 neighbour, 1 new and the current) on the twig level; 2 neighbour
++   nodes on upper levels; and 1 for a new root. So 5 for the leaf level, 3
++   for the twig level, 2 on upper levels + 1 for the root.
++
++   Do not count the current node of the lowest level here - that is overhead
++   only.
++
++   children is almost always 1 here. The exception is flow insertion.
++*/
++static reiser4_block_nr
++max_balance_overhead(reiser4_block_nr children, tree_level tree_height)
++{
++	reiser4_block_nr ten_percent;
++
++	ten_percent = ((103 * children) >> 10);
++
++	/* If we have too many balancings at a time, the tree height can grow
++	   by more than 1. Assume that if tree_height is 5, it can grow by 1
++	   only. */
++	return ((tree_height < 5 ? 5 : tree_height) * 2 + (4 + ten_percent));
++}
++
++/* this returns the maximal possible number of nodes which can be modified,
++   plus the number of new nodes which may be required, to perform an
++   insertion of one item into the tree */
++/* it is only called when the tree height changes, or gets initialized */
++reiser4_block_nr calc_estimate_one_insert(tree_level height)
++{
++	return 1 + max_balance_overhead(1, height);
++}
++
++reiser4_block_nr estimate_one_insert_item(reiser4_tree * tree)
++{
++	return tree->estimate_one_insert;
++}
++
++/* this returns the maximal possible number of nodes which can be modified,
++   plus the number of new nodes which may be required, to perform an
++   insertion of one unit into an item in the tree */
++reiser4_block_nr estimate_one_insert_into_item(reiser4_tree * tree)
++{
++	/* estimate insert into item just like item insertion */
++	return tree->estimate_one_insert;
++}
++
++reiser4_block_nr estimate_one_item_removal(reiser4_tree * tree)
++{
++	/* on item removal reiser4 does not try to pack nodes more compactly,
++	   so only one node may be dirtied on the leaf level */
++	return tree->estimate_one_insert;
++}
++
++/* on the leaf level insert_flow may add CARRY_FLOW_NEW_NODES_LIMIT new nodes
++   and dirty 3 existing nodes (the insert point and
both its neighbors).
++   Max_balance_overhead should estimate the number of blocks which may change
++   or get added on internal levels */
++reiser4_block_nr estimate_insert_flow(tree_level height)
++{
++	return 3 + CARRY_FLOW_NEW_NODES_LIMIT + max_balance_overhead(3 +
++								     CARRY_FLOW_NEW_NODES_LIMIT,
++								     height);
++}
++
++/* returns the max number of nodes that can be occupied by a disk cluster */
++static reiser4_block_nr estimate_cluster(struct inode * inode, int unprepped)
++{
++	int per_cluster;
++	per_cluster = (unprepped ? 1 : cluster_nrpages(inode));
++	return 3 + per_cluster +
++		max_balance_overhead(3 + per_cluster,
++				     REISER4_MAX_ZTREE_HEIGHT);
++}
++
++/* how many nodes might get dirty and added
++   during insertion of a disk cluster */
++reiser4_block_nr estimate_insert_cluster(struct inode * inode)
++{
++	return estimate_cluster(inode, 1);	/* 24 */
++}
++
++/* how many nodes might get dirty and added
++   during update of a (prepped or unprepped) disk cluster */
++reiser4_block_nr estimate_update_cluster(struct inode * inode)
++{
++	return estimate_cluster(inode, 0);	/* 44, for 64K-cluster */
++}
++
++/* how many nodes occupied by a disk cluster might get dirty */
++reiser4_block_nr estimate_dirty_cluster(struct inode * inode)
++{
++	return 2 + cluster_nrpages(inode);
++}
++
++/* Make Linus happy.
++   Local variables:
++   c-indentation-style: "K&R"
++   mode-name: "LC"
++   c-basic-offset: 8
++   tab-width: 8
++   fill-column: 120
++   scroll-step: 1
++   End:
++*/
+Index: linux-2.6.16/fs/reiser4/export_ops.c
+===================================================================
+--- /dev/null
++++ linux-2.6.16/fs/reiser4/export_ops.c
+@@ -0,0 +1,296 @@
++/* Copyright 2005 by Hans Reiser, licensing governed by
++ * reiser4/README */
++
++#include "inode.h"
++#include "plugin/plugin.h"
++
++
++/*
++ * Supported file-handle types
++ */
++typedef enum {
++	FH_WITH_PARENT = 0x10,	/* file handle with parent */
++	FH_WITHOUT_PARENT = 0x11	/* file handle without parent */
++} reiser4_fhtype;
++
++#define NFSERROR (255)
++
++/* initialize place-holder for object */
++static void object_on_wire_init(reiser4_object_on_wire *o)
++{
++	o->plugin = NULL;
++}
++
++/* finish with @o */
++static void object_on_wire_done(reiser4_object_on_wire *o)
++{
++	if (o->plugin != NULL)
++		o->plugin->wire.done(o);
++}
++
++/*
++ * read serialized object identity from @addr and store information about the
++ * object in @obj. This is the dual of encode_inode().
++ */
++static char *decode_inode(struct super_block *s, char *addr,
++			  reiser4_object_on_wire * obj)
++{
++	file_plugin *fplug;
++
++	/* identifier of object plugin is stored in the first two bytes,
++	 * followed by... */
++	fplug = file_plugin_by_disk_id(get_tree(s), (d16 *) addr);
++	if (fplug != NULL) {
++		addr += sizeof(d16);
++		obj->plugin = fplug;
++		assert("nikita-3520", fplug->wire.read != NULL);
++		/* plugin specific encoding of object identity. */
++		addr = fplug->wire.read(addr, obj);
++	} else
++		addr = ERR_PTR(RETERR(-EINVAL));
++	return addr;
++}
++
++/**
++ * reiser4_decode_fh - decode_fh of export operations
++ * @super: super block
++ * @fh: nfsd file handle
++ * @len: length of file handle
++ * @fhtype: type of file handle
++ * @acceptable: acceptability testing function
++ * @context: argument for @acceptable
++ *
++ * Returns a dentry referring to the same file as @fh.
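++ *
++ * For orientation (a layout inferred from decode_inode() above, not a
++ * normative format description), the raw handle looks like:
++ *
++ *	+---------------+---------------------+--------------------------+
++ *	| d16 plugin id | plugin wire data ...| id + data of parent ...  |
++ *	+---------------+---------------------+--------------------------+
++ *	                                       present only for
++ *	                                       FH_WITH_PARENT handles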
++ */
++static struct dentry *reiser4_decode_fh(struct super_block *super, __u32 *fh,
++					int len, int fhtype,
++					int (*acceptable) (void *context,
++							   struct dentry *de),
++					void *context)
++{
++	reiser4_context *ctx;
++	reiser4_object_on_wire object;
++	reiser4_object_on_wire parent;
++	char *addr;
++	int with_parent;
++
++	ctx = init_context(super);
++	if (IS_ERR(ctx))
++		return (struct dentry *)ctx;
++
++	assert("vs-1482",
++	       fhtype == FH_WITH_PARENT || fhtype == FH_WITHOUT_PARENT);
++
++	with_parent = (fhtype == FH_WITH_PARENT);
++
++	addr = (char *)fh;
++
++	object_on_wire_init(&object);
++	object_on_wire_init(&parent);
++
++	addr = decode_inode(super, addr, &object);
++	if (!IS_ERR(addr)) {
++		if (with_parent)
++			addr = decode_inode(super, addr, &parent);
++		if (!IS_ERR(addr)) {
++			struct dentry *d;
++			typeof(super->s_export_op->find_exported_dentry) fn;
++
++			fn = super->s_export_op->find_exported_dentry;
++			assert("nikita-3521", fn != NULL);
++			d = fn(super, &object, with_parent ? &parent : NULL,
++			       acceptable, context);
++			if (d != NULL && !IS_ERR(d))
++				/* FIXME check for -ENOMEM */
++				reiser4_get_dentry_fsdata(d)->stateless = 1;
++			addr = (char *)d;
++		}
++	}
++
++	object_on_wire_done(&object);
++	object_on_wire_done(&parent);
++
++	reiser4_exit_context(ctx);
++	return (void *)addr;
++}
++
++/*
++ * Object serialization support.
++ *
++ * To support knfsd the file system provides export_operations that are used
++ * to construct and interpret NFS file handles. As a generalization of this,
++ * reiser4 object plugins have serialization support: it provides methods to
++ * create an on-wire representation of the identity of a reiser4 object, and
++ * to re-create/locate an object given its on-wire identity.
++ *
++ */
++
++/*
++ * return the number of bytes that the on-wire representation of @inode's
++ * identity consumes.
++ */
++static int encode_inode_size(struct inode *inode)
++{
++	assert("nikita-3514", inode != NULL);
++	assert("nikita-3515", inode_file_plugin(inode) != NULL);
++	assert("nikita-3516", inode_file_plugin(inode)->wire.size != NULL);
++
++	return inode_file_plugin(inode)->wire.size(inode) + sizeof(d16);
++}
++
++/*
++ * store the on-wire representation of @inode's identity at the area
++ * beginning at @start.
++ */
++static char *encode_inode(struct inode *inode, char *start)
++{
++	assert("nikita-3517", inode != NULL);
++	assert("nikita-3518", inode_file_plugin(inode) != NULL);
++	assert("nikita-3519", inode_file_plugin(inode)->wire.write != NULL);
++
++	/*
++	 * first, store the two-byte identifier of the object plugin, then
++	 */
++	save_plugin_id(file_plugin_to_plugin(inode_file_plugin(inode)),
++		       (d16 *) start);
++	start += sizeof(d16);
++	/*
++	 * call the plugin to serialize the object's identity
++	 */
++	return inode_file_plugin(inode)->wire.write(inode, start);
++}
++
++/* this returns the number of 32-bit words encoded into @lenp. 255 is
++ * returned if the file handle cannot be stored */
++/**
++ * reiser4_encode_fh - encode_fh of export operations
++ * @dentry:
++ * @fh:
++ * @lenp:
++ * @need_parent:
++ *
++ */
++static int
++reiser4_encode_fh(struct dentry *dentry, __u32 *fh, int *lenp,
++		  int need_parent)
++{
++	struct inode *inode;
++	struct inode *parent;
++	char *addr;
++	int need;
++	int delta;
++	int result;
++	reiser4_context *ctx;
++
++	/*
++	 * knfsd asks us to serialize the object in @dentry and, optionally,
++	 * its parent (if need_parent != 0).
++	 *
++	 * encode_inode() and encode_inode_size() are used to build the
++	 * representation of the object and its parent.
++	 * All hard work is done by the object plugins.
++	 */
++	inode = dentry->d_inode;
++	parent = dentry->d_parent->d_inode;
++
++	addr = (char *)fh;
++
++	need = encode_inode_size(inode);
++	if (need < 0)
++		return NFSERROR;
++	if (need_parent) {
++		delta = encode_inode_size(parent);
++		if (delta < 0)
++			return NFSERROR;
++		need += delta;
++	}
++
++	ctx = init_context(dentry->d_inode->i_sb);
++	if (IS_ERR(ctx))
++		return PTR_ERR(ctx);
++
++	if (need <= sizeof(__u32) * (*lenp)) {
++		addr = encode_inode(inode, addr);
++		if (need_parent)
++			addr = encode_inode(parent, addr);
++
++		/* store into lenp the number of 32-bit words required for
++		 * the file handle. */
++		*lenp = (need + sizeof(__u32) - 1) >> 2;
++		result = need_parent ? FH_WITH_PARENT : FH_WITHOUT_PARENT;
++	} else
++		/* not enough space in the file handle */
++		result = NFSERROR;
++	reiser4_exit_context(ctx);
++	return result;
++}
++
++/**
++ * reiser4_get_dentry_parent - get_parent of export operations
++ * @child:
++ *
++ */
++static struct dentry *reiser4_get_dentry_parent(struct dentry *child)
++{
++	struct inode *dir;
++	dir_plugin *dplug;
++
++	assert("nikita-3527", child != NULL);
++	/* see comment in reiser4_get_dentry() about the following assertion */
++	assert("nikita-3528", is_in_reiser4_context());
++
++	dir = child->d_inode;
++	assert("nikita-3529", dir != NULL);
++	dplug = inode_dir_plugin(dir);
++	assert("nikita-3531", ergo(dplug != NULL, dplug->get_parent != NULL));
++	if (dplug != NULL)
++		return dplug->get_parent(dir);
++	else
++		return ERR_PTR(RETERR(-ENOTDIR));
++}
++
++/**
++ * reiser4_get_dentry - get_dentry of export operations
++ * @super:
++ * @data:
++ *
++ *
++ */
++static struct dentry *reiser4_get_dentry(struct super_block *super, void *data)
++{
++	reiser4_object_on_wire *o;
++
++	assert("nikita-3522", super != NULL);
++	assert("nikita-3523", data != NULL);
++	/*
++	 * this is only supposed to be called by
++	 *
++	 *     reiser4_decode_fh->find_exported_dentry
++	 *
++	 * so, a reiser4_context should be here already.
++	 */
++	assert("nikita-3526", is_in_reiser4_context());
++
++	o = (reiser4_object_on_wire *)data;
++	assert("nikita-3524", o->plugin != NULL);
++	assert("nikita-3525", o->plugin->wire.get != NULL);
++
++	return o->plugin->wire.get(super, o);
++}
++
++struct export_operations reiser4_export_operations = {
++	.encode_fh = reiser4_encode_fh,
++	.decode_fh = reiser4_decode_fh,
++	.get_parent = reiser4_get_dentry_parent,
++	.get_dentry = reiser4_get_dentry
++};
++
++/*
++ * Local variables:
++ * c-indentation-style: "K&R"
++ * mode-name: "LC"
++ * c-basic-offset: 8
++ * tab-width: 8
++ * fill-column: 79
++ * End:
++ */
+Index: linux-2.6.16/fs/reiser4/flush.c
+===================================================================
+--- /dev/null
++++ linux-2.6.16/fs/reiser4/flush.c
+@@ -0,0 +1,3626 @@
++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
++
++/* The design document for this file is at http://www.namesys.com/v4/v4.html. */
++
++#include "forward.h"
++#include "debug.h"
++#include "dformat.h"
++#include "key.h"
++#include "coord.h"
++#include "plugin/item/item.h"
++#include "plugin/plugin.h"
++#include "plugin/object.h"
++#include "txnmgr.h"
++#include "jnode.h"
++#include "znode.h"
++#include "block_alloc.h"
++#include "tree_walk.h"
++#include "carry.h"
++#include "tree.h"
++#include "vfs_ops.h"
++#include "inode.h"
++#include "page_cache.h"
++#include "wander.h"
++#include "super.h"
++#include "entd.h"
++#include "reiser4.h"
++#include "flush.h"
++#include "writeout.h"
++
++#include <asm/atomic.h>
++#include <linux/fs.h>		/* for struct super_block */
++#include <linux/mm.h>		/* for struct page */
++#include <linux/bio.h>		/* for struct bio */
++#include <linux/pagemap.h>
++#include <linux/blkdev.h>
++
++/* IMPLEMENTATION NOTES */
++
++/* PARENT-FIRST: Some terminology: A parent-first traversal is a way of assigning a total
++   order to the nodes of the tree in which the parent is placed before its children, which
++   are ordered (recursively) in left-to-right order. When we speak of a "parent-first preceder", it
++   describes the node that "came before in forward parent-first order". When we speak of a
++   "parent-first follower", it describes the node that "comes next in parent-first
++   order" (alternatively the node that "came before in reverse parent-first order").
++
++   The following pseudo-code prints the nodes of a tree in forward parent-first order:
++
++   void parent_first (node)
++   {
++	print_node (node);
++	if (node->level > leaf) {
++	   for (i = 0; i < num_children; i += 1) {
++	      parent_first (node->child[i]);
++	   }
++	}
++   }
++*/
++
++/* JUST WHAT ARE WE TRYING TO OPTIMIZE, HERE? The idea is to optimize block allocation so
++   that a left-to-right scan of the tree's data (i.e., the leaves in left-to-right order)
++   can be accomplished with sequential reads, which results in reading nodes in their
++   parent-first order. This is a read-optimization aspect of the flush algorithm, and
++   there is also a write-optimization aspect, which is that we wish to make large
++   sequential writes to the disk by allocating or reallocating blocks so that they can be
++   written in sequence. Sometimes the read-optimization and write-optimization goals
++   conflict with each other, as we discuss in more detail below.
++*/
++
++/* STATE BITS: The flush code revolves around the state of the jnodes it covers. Here are
++   the relevant jnode->state bits and their relevance to flush:
++
++   JNODE_DIRTY: If a node is dirty, it must be flushed. But in order to be written it
++   must be allocated first. In order to be considered allocated, the jnode must have
++   exactly one of { JNODE_OVRWR, JNODE_RELOC } set. These two bits are exclusive, and
++   all dirtied jnodes eventually have one of these bits set during each transaction.
++
++   JNODE_CREATED: The node was freshly created in its transaction and has no previous
++   block address, so it is unconditionally assigned to be relocated, although this is
++   mainly for code-convenience. It is not being 'relocated' from anything, but in
++   almost every regard it is treated as part of the relocate set. The JNODE_CREATED bit
++   remains set even after JNODE_RELOC is set, so the actual relocate set can be
++   distinguished from the created-and-allocated set easily: relocate-set members
++   (belonging to the preserve-set) have (JNODE_RELOC) set and created-set members which
++   have no previous location to preserve have (JNODE_RELOC | JNODE_CREATED) set.
++
++   JNODE_OVRWR: The node belongs to the atom's overwrite set.
The flush algorithm made the
++   decision to maintain the pre-existing location for this node and it will be written
++   to the wandered-log.
++
++   JNODE_RELOC: The flush algorithm made the decision to relocate this block (if it was
++   not created, see note above). A block with JNODE_RELOC set is eligible for
++   early-flushing and may be submitted during flush_empty_queues. When the JNODE_RELOC
++   bit is set on a znode, the parent node's internal item is modified and the znode is
++   rehashed.
++
++   JNODE_SQUEEZABLE: Before shifting everything left, the flush algorithm scans the node
++   and calls the plugin->f.squeeze() method for its items. By this technique we update
++   the disk clusters of cryptcompress objects. Also, if the leftmost point found by the
++   flush scan has this flag set (races with write(), a rare case), the flush algorithm
++   decides to pass it to squalloc() in spite of its flushprepped status: for squeezing,
++   not for repeated allocation.
++
++   JNODE_FLUSH_QUEUED: This bit is set when a call to flush enters the jnode into its
++   flush queue. This means the jnode is not on any clean or dirty list, instead it is
++   moved to one of the flush queue's (see flush_queue.h) private lists. This
++   prevents multiple concurrent flushes from attempting to start flushing from the
++   same node.
++
++   (DEAD STATE BIT) JNODE_FLUSH_BUSY: This bit was set during the bottom-up
++   squeeze-and-allocate on a node while its children were actively being squeezed and
++   allocated. This flag was created to avoid submitting a write request for a node
++   while its children are still being allocated and squeezed. Then the flush queue was
++   re-implemented to allow an unlimited number of nodes to be queued. This flag support
++   was commented out in the source code because we decided that there was no reason to
++   submit queued nodes before jnode_flush() finishes. However, current code calls
++   fq_write() during a slum traversal and may submit "busy nodes" to disk. Probably we
++   can re-enable the JNODE_FLUSH_BUSY bit support in future.
++
++   With these state bits, we describe a test used frequently in the code below,
++   jnode_is_flushprepped() (and the spin-lock-taking jnode_check_flushprepped()). The
++   test for "flushprepped" returns true if any of the following are true:
++
++   - The node is not dirty
++   - The node has JNODE_RELOC set
++   - The node has JNODE_OVRWR set
++
++   If either the node is not dirty or it has already been processed by flush (and assigned
++   JNODE_OVRWR or JNODE_RELOC), then it is prepped. If jnode_is_flushprepped() returns
++   false then flush has work to do on that node.
++*/
++
++/* FLUSH_PREP_ONCE_PER_TRANSACTION: Within a single transaction a node is never
++   flushprepped twice (unless an explicit call to flush_unprep is made as described in
++   detail below). For example a node is dirtied, allocated, and then early-flushed to
++   disk and set clean. Before the transaction commits, the page is dirtied again and, due
++   to memory pressure, the node is flushed again. The flush algorithm will not relocate
++   the node to a new disk location, it will simply write it to the same, previously
++   relocated position again.
++*/
++
++/* THE BOTTOM-UP VS. TOP-DOWN ISSUE: This code implements a bottom-up algorithm where we
++   start at a leaf node and allocate in parent-first order by iterating to the right. At
++   each step of the iteration, we check for the right neighbor. Before advancing to the
++   right neighbor, we check if the current position and the right neighbor share the same
++   parent.
If they do not share the same parent, the parent is allocated before the right
++   neighbor.
++
++   This process goes recursively up the tree and squeezes nodes level by level as long as
++   the right neighbor and the current position have different parents, then it allocates
++   the right-neighbors-with-different-parents on the way back down. This process is
++   described in more detail in flush_squalloc_changed_ancestor and the recursive function
++   squalloc_one_changed_ancestor. But the purpose here is not so much to discuss the
++   specifics of the bottom-up approach as it is to contrast the bottom-up and top-down
++   approaches.
++
++   The top-down algorithm was implemented earlier (April-May 2002). In the top-down
++   approach, we find a starting point by scanning left along each level past dirty nodes,
++   then going up and repeating the process until the left node and the parent node are
++   clean. We then perform a parent-first traversal from the starting point, which makes
++   allocating in parent-first order trivial. After one subtree has been allocated in this
++   manner, we move to the right, try moving upward, then repeat the parent-first
++   traversal.
++
++   Both approaches have problems that need to be addressed. Both are approximately the
++   same amount of code, but the bottom-up approach has advantages in the order it acquires
++   locks which, at the very least, make it the better approach. At first glance each one
++   makes the other one look simpler, so it is important to remember a few of the problems
++   with each one.
++
++   Main problem with the top-down approach: When you encounter a clean child during the
++   parent-first traversal, what do you do? You would like to avoid searching through a
++   large tree of nodes just to find a few dirty leaves at the bottom, and there is not an
++   obvious solution. One of the advantages of the top-down approach is that during the
++   parent-first traversal you check every child of a parent to see if it is dirty. In
++   this way, the top-down approach easily handles the main problem of the bottom-up
++   approach: unallocated children.
++
++   The unallocated children problem is that before writing a node to disk we must make
++   sure that all of its children are allocated. Otherwise, writing the node means
++   extra I/O because the node will have to be written again when the child is finally
++   allocated.
++
++   WE HAVE NOT YET ELIMINATED THE UNALLOCATED CHILDREN PROBLEM. Except for bugs, this
++   should not cause any file system corruption, it only degrades I/O performance because a
++   node may be written when it is sure to be written at least one more time in the same
++   transaction when the remaining children are allocated. What follows is a description
++   of how we will solve the problem.
++*/
++
++/* HANDLING UNALLOCATED CHILDREN: During flush we may allocate a parent node then,
++   proceeding in parent first order, allocate some of its left-children, then encounter a
++   clean child in the middle of the parent. We do not allocate the clean child, but there
++   may remain unallocated (dirty) children to the right of the clean child. If we were to
++   stop flushing at this moment and write everything to disk, the parent might still
++   contain unallocated children.
++
++   We could try to allocate all the descendants of every node that we allocate, but this
++   is not necessary. Doing so could result in allocating the entire tree: if the root
++   node is allocated then every unallocated node would have to be allocated before
++   flushing.
Actually, we do not have to write a node just because we allocate it. It is
++   possible to allocate but not write a node during flush, when it still has unallocated
++   children. However, this approach is probably not optimal for the following reason.
++
++   The flush algorithm is designed to allocate nodes in parent-first order in an attempt
++   to optimize reads that occur in the same order. Thus we are read-optimizing for a
++   left-to-right scan through all the leaves in the system, and we are hoping to
++   write-optimize at the same time because those nodes will be written together in batch.
++   What happens, however, if we assign a block number to a node in its read-optimized
++   order but then avoid writing it because it has unallocated children? In that
++   situation, we lose out on the write-optimization aspect because a node will have to be
++   written again to its location on the device, later, which likely means seeking back
++   to that location.
++
++   So there are tradeoffs. We can choose either:
++
++   A. Allocate all unallocated children to preserve both write-optimization and
++   read-optimization, but this is not always desirable because it may mean having to
++   allocate and flush very many nodes at once.
++
++   B. Defer writing nodes with unallocated children, keep their read-optimized locations,
++   but sacrifice write-optimization because those nodes will be written again.
++
++   C. Defer writing nodes with unallocated children, but do not keep their read-optimized
++   locations. Instead, choose to write-optimize them later, when they are written. To
++   facilitate this, we "undo" the read-optimized allocation that was given to the node so
++   that later it can be write-optimized, thus "unpreparing" the flush decision. This is a
++   case where we disturb the FLUSH_PREP_ONCE_PER_TRANSACTION rule described above. By a
++   call to flush_unprep() we will: if the node was wandered, unset the JNODE_OVRWR bit;
++   if the node was relocated, unset the JNODE_RELOC bit, non-deferred-deallocate its block
++   location, and set the JNODE_CREATED bit, effectively setting the node back to an
++   unallocated state.
++
++   We will take the following approach in v4.0: for twig nodes we will always finish
++   allocating unallocated children (A). For nodes with (level > TWIG) we will defer
++   writing and choose write-optimization (C).
++
++   To summarize, there are several parts to a solution that avoids the problem with
++   unallocated children:
++
++   FIXME-ZAM: Still, no approach has been implemented to eliminate the "UNALLOCATED
++   CHILDREN" problem, because an experiment showed that we have 1-2 nodes with
++   unallocated children for thousands of written nodes. The experiment was simple,
++   like copying/deleting Linux kernel sources. However, the problem can arise in more
++   complex tests. I think we have jnode_io_hook to insert a check for unallocated
++   children and see what kind of problem we have.
++
++   1. When flush reaches a stopping point (e.g., a clean node), it should continue calling
++   squeeze-and-allocate on any remaining unallocated children. FIXME: Difficulty to
++   implement: should be simple -- amounts to adding a while loop to jnode_flush, see
++   comments in that function.
++
++   2. When flush reaches flush_empty_queue(), some of the (level > TWIG) nodes may still
++   have unallocated children. If the twig level has unallocated children it is an
++   assertion failure.
If a higher-level node has unallocated children, then it should be
++   explicitly de-allocated by a call to flush_unprep(). FIXME: Difficulty to implement:
++   should be simple.
++
++   3. (CPU-Optimization) Checking whether a node has unallocated children may consume more
++   CPU cycles than we would like, and it is possible (but medium complexity) to optimize
++   this somewhat in the case where large sub-trees are flushed. The following observation
++   helps: if both the left- and right-neighbor of a node are processed by the flush
++   algorithm then the node itself is guaranteed to have all of its children allocated.
++   However, the cost of this check may not be so expensive after all: it is not needed for
++   leaves and flush can guarantee this property for twigs. That leaves only (level >
++   TWIG) nodes that have to be checked, so this optimization only helps if at least three
++   (level > TWIG) nodes are flushed in one pass, and the savings will be very small unless
++   there are many more (level > TWIG) nodes. But if there are many (level > TWIG) nodes
++   then the number of blocks being written will be very large, so the savings may be
++   insignificant. That said, the idea is to maintain both the left and right edges of
++   nodes that are processed in flush. When flush_empty_queue() is called, a relatively
++   simple test will tell whether the (level > TWIG) node is on the edge. If it is on the
++   edge, the slow check is necessary, but if it is in the interior then it can be assumed
++   to have all of its children allocated. FIXME: medium complexity to implement, but
++   simple to verify given that we must have a slow check anyway.
++
++   4. (Optional) This part is optional, not for v4.0--flush should work independently of
++   whether this option is used or not. Called RAPID_SCAN, the idea is to amend the
++   left-scan operation to take unallocated children into account. Normally, the left-scan
++   operation goes left as long as adjacent nodes are dirty up until some large maximum
++   value (FLUSH_SCAN_MAXNODES) at which point it stops and begins flushing. But scan-left
++   may stop at a position where there are unallocated children to the left with the same
++   parent. When RAPID_SCAN is enabled, the ordinary scan-left operation stops after
++   FLUSH_RELOCATE_THRESHOLD, which is much smaller than FLUSH_SCAN_MAXNODES, then proceeds
++   with a rapid scan. The rapid scan skips all the interior children of a node--if the
++   leftmost child of a twig is dirty, check its left neighbor (the rightmost child of the
++   twig to the left). If the left neighbor of the leftmost child is also dirty, then
++   continue the scan at the left twig and repeat. This option will cause flush to
++   allocate more twigs in a single pass, but it also has the potential to write many more
++   nodes than would otherwise be written without the RAPID_SCAN option. RAPID_SCAN
++   was partially implemented, code removed August 12, 2002 by JMACD.
++*/
++
++/* FLUSH CALLED ON NON-LEAF LEVEL. Most of our design considerations assume that the
++   starting point for flush is a leaf node, but actually the flush code cares very little
++   about whether or not this is true. It is possible that all the leaf nodes are flushed
++   and dirty parent nodes still remain, in which case jnode_flush() is called on a
++   non-leaf argument. Flush doesn't care--it treats the argument node as if it were a
++   leaf, even when it is not.
This is a simple approach, and there may be a more optimal
++   policy but until a problem with this approach is discovered, simplest is probably best.
++
++   NOTE: In this case, the ordering produced by flush is parent-first only if you ignore
++   the leaves. This is done as a matter of simplicity and there is only one (shaky)
++   justification. When an atom commits, it flushes all leaf level nodes first, followed
++   by twigs, and so on. With flushing done in this order, if flush is eventually called
++   on a non-leaf node it means that (somehow) we reached a point where all leaves are
++   clean and only internal nodes need to be flushed. If that is the case, then it means
++   there were no leaves that were the parent-first preceder/follower of the parent. This
++   is expected to be a rare case, which is why we do nothing special about it. However,
++   memory pressure may pass an internal node to flush when there are still dirty leaf
++   nodes that need to be flushed, which could prove our original assumptions
++   "inoperative". If this needs to be fixed, then scan_left/right should have
++   special checks for the non-leaf levels. For example, instead of passing from a node to
++   the left neighbor, it should pass from the node to the left neighbor's rightmost
++   descendant (if dirty).
++
++*/
++
++/* UNIMPLEMENTED AS YET: REPACKING AND RESIZING. We walk the tree in 4MB-16MB chunks, dirtying everything and putting
++   it into a transaction. We tell the allocator to allocate the blocks as far as possible towards one end of the
++   logical device--the left (starting) end of the device if we are walking from left to right, the right end of the
++   device if we are walking from right to left. We then make passes in alternating directions, and as we do this the
++   device becomes sorted such that tree order and block number order fully correlate.
++
++   Resizing is done by shifting everything either all the way to the left or all the way
++   to the right, and then reporting the last block.
++*/
++
++/* RELOCATE DECISIONS: The code makes a decision to relocate in several places. This
++   describes the policy from the highest level:
++
++   The FLUSH_RELOCATE_THRESHOLD parameter: If we count this many consecutive nodes on the
++   leaf level during flush-scan (right, left), then we unconditionally decide to relocate
++   leaf nodes.
++
++   Otherwise, there are two contexts in which we make a decision to relocate:
++
++   1. The REVERSE PARENT-FIRST context: Implemented in reverse_relocate_test().
++   During the initial stages of flush, after scan-right completes, we want to ask the
++   question: should we relocate this leaf node and thus dirty the parent node. Then if
++   the node is a leftmost child its parent is its own parent-first preceder, thus we repeat
++   the question at the next level up, and so on. In these cases we are moving in the
++   reverse-parent first direction.
++
++   There is another case which is considered the reverse direction, which comes at the end
++   of a twig in reverse_relocate_end_of_twig(). As we finish processing a twig we may
++   reach a point where there is a clean twig to the right with a dirty leftmost child. In
++   this case, we may wish to relocate the child by testing if it should be relocated
++   relative to its parent.
++
++   2. The FORWARD PARENT-FIRST context: Testing for forward relocation is done in
++   allocate_znode.
What distinguishes the forward parent-first case from the
++   reverse-parent first case is that the preceder has already been allocated in the
++   forward case, whereas in the reverse case we don't know what the preceder is until we
++   finish "going in reverse". That simplifies the forward case considerably, and there we
++   actually use the block allocator to determine whether, e.g., a block closer to the
++   preceder is available.
++*/
++
++/* SQUEEZE_LEFT_EDGE: Unimplemented idea for future consideration. The idea is, once we
++   finish scan-left and find a starting point, if the parent's left neighbor is dirty then
++   squeeze the parent's left neighbor and the parent. This may change the
++   flush-starting-node's parent. Repeat until the child's parent is stable. If the child
++   is a leftmost child, repeat this left-edge squeezing operation at the next level up.
++   Note that we cannot allocate extents during this or they will be out of parent-first
++   order. There are also some difficult coordinate maintenance issues. We can't do a tree
++   search to find coordinates again (because we hold locks), we have to determine them
++   from the two nodes being squeezed. Looks difficult, but has potential to increase
++   space utilization. */
++
++/* Flush-scan helper functions. */
++static void scan_init(flush_scan * scan);
++static void scan_done(flush_scan * scan);
++
++/* Flush-scan algorithm. */
++static int scan_left(flush_scan * scan, flush_scan * right, jnode * node,
++		     unsigned limit);
++static int scan_right(flush_scan * scan, jnode * node, unsigned limit);
++static int scan_common(flush_scan * scan, flush_scan * other);
++static int scan_formatted(flush_scan * scan);
++static int scan_unformatted(flush_scan * scan, flush_scan * other);
++static int scan_by_coord(flush_scan * scan);
++
++/* Initial flush-point ancestor allocation. */
++static int alloc_pos_and_ancestors(flush_pos_t * pos);
++static int alloc_one_ancestor(const coord_t * coord, flush_pos_t * pos);
++static int set_preceder(const coord_t * coord_in, flush_pos_t * pos);
++
++/* Main flush algorithm. Note on abbreviation: "squeeze and allocate" == "squalloc". */
++static int squalloc(flush_pos_t * pos);
++
++/* Flush squeeze implementation. */
++static int squeeze_right_non_twig(znode * left, znode * right);
++static int shift_one_internal_unit(znode * left, znode * right);
++
++/* Flush reverse parent-first relocation routines.
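++   ("Reverse" here means the relocate decision is made while moving towards
++   a preceder that is not yet known, as described under RELOCATE DECISIONS
++   above.)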
*/ ++static int reverse_relocate_if_close_enough(const reiser4_block_nr * pblk, ++ const reiser4_block_nr * nblk); ++static int reverse_relocate_test(jnode * node, const coord_t * parent_coord, ++ flush_pos_t * pos); ++static int reverse_relocate_check_dirty_parent(jnode * node, ++ const coord_t * parent_coord, ++ flush_pos_t * pos); ++ ++/* Flush allocate write-queueing functions: */ ++static int allocate_znode(znode * node, const coord_t * parent_coord, ++ flush_pos_t * pos); ++static int allocate_znode_update(znode * node, const coord_t * parent_coord, ++ flush_pos_t * pos); ++static int lock_parent_and_allocate_znode(znode *, flush_pos_t *); ++ ++/* Flush helper functions: */ ++static int jnode_lock_parent_coord(jnode * node, ++ coord_t * coord, ++ lock_handle * parent_lh, ++ load_count * parent_zh, ++ znode_lock_mode mode, int try); ++static int neighbor_in_slum(znode * node, lock_handle * right_lock, sideof side, ++ znode_lock_mode mode, int check_dirty); ++static int znode_same_parents(znode * a, znode * b); ++ ++static int znode_check_flushprepped(znode * node) ++{ ++ return jnode_check_flushprepped(ZJNODE(node)); ++} ++ ++/* Flush position functions */ ++static void pos_init(flush_pos_t * pos); ++static int pos_valid(flush_pos_t * pos); ++static void pos_done(flush_pos_t * pos); ++static int pos_stop(flush_pos_t * pos); ++ ++/* check that @org is first jnode extent unit, if extent is unallocated, ++ * because all jnodes of unallocated extent are dirty and of the same atom. */ ++#define checkchild(scan) \ ++assert("nikita-3435", \ ++ ergo(scan->direction == LEFT_SIDE && \ ++ (scan->parent_coord.node->level == TWIG_LEVEL) && \ ++ jnode_is_unformatted(scan->node) && \ ++ extent_is_unallocated(&scan->parent_coord), \ ++ extent_unit_index(&scan->parent_coord) == index_jnode(scan->node))) ++ ++/* This flush_cnt variable is used to track the number of concurrent flush operations, ++ useful for debugging. It is initialized in txnmgr.c out of laziness (because flush has ++ no static initializer function...) */ ++ON_DEBUG(atomic_t flush_cnt; ++ ) ++ ++/* check fs backing device for write congestion */ ++static int check_write_congestion(void) ++{ ++ struct super_block *sb; ++ struct backing_dev_info *bdi; ++ ++ sb = reiser4_get_current_sb(); ++ bdi = get_super_fake(sb)->i_mapping->backing_dev_info; ++ return bdi_write_congested(bdi); ++} ++ ++/* conditionally write flush queue */ ++static int write_prepped_nodes(flush_pos_t * pos) ++{ ++ int ret; ++ ++ assert("zam-831", pos); ++ assert("zam-832", pos->fq); ++ ++ if (!(pos->flags & JNODE_FLUSH_WRITE_BLOCKS)) ++ return 0; ++ ++ if (check_write_congestion()) ++ return 0; ++ ++ ret = write_fq(pos->fq, pos->nr_written, ++ WRITEOUT_SINGLE_STREAM | WRITEOUT_FOR_PAGE_RECLAIM); ++ return ret; ++} ++ ++/* Proper release all flush pos. resources then move flush position to new ++ locked node */ ++static void move_flush_pos(flush_pos_t * pos, lock_handle * new_lock, ++ load_count * new_load, const coord_t * new_coord) ++{ ++ assert("zam-857", new_lock->node == new_load->node); ++ ++ if (new_coord) { ++ assert("zam-858", new_coord->node == new_lock->node); ++ coord_dup(&pos->coord, new_coord); ++ } else { ++ coord_init_first_unit(&pos->coord, new_lock->node); ++ } ++ ++ if (pos->child) { ++ jput(pos->child); ++ pos->child = NULL; ++ } ++ ++ move_load_count(&pos->load, new_load); ++ done_lh(&pos->lock); ++ move_lh(&pos->lock, new_lock); ++} ++ ++/* delete empty node which link from the parent still exists. 
*/ ++static int delete_empty_node(znode * node) ++{ ++ reiser4_key smallest_removed; ++ ++ assert("zam-1019", node != NULL); ++ assert("zam-1020", node_is_empty(node)); ++ assert("zam-1023", znode_is_wlocked(node)); ++ ++ return delete_node(node, &smallest_removed, NULL, 1); ++} ++ ++/* Prepare flush position for alloc_pos_and_ancestors() and squalloc() */ ++static int prepare_flush_pos(flush_pos_t * pos, jnode * org) ++{ ++ int ret; ++ load_count load; ++ lock_handle lock; ++ ++ init_lh(&lock); ++ init_load_count(&load); ++ ++ if (jnode_is_znode(org)) { ++ ret = longterm_lock_znode(&lock, JZNODE(org), ++ ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI); ++ if (ret) ++ return ret; ++ ++ ret = incr_load_count_znode(&load, JZNODE(org)); ++ if (ret) ++ return ret; ++ ++ pos->state = ++ (jnode_get_level(org) == ++ LEAF_LEVEL) ? POS_ON_LEAF : POS_ON_INTERNAL; ++ move_flush_pos(pos, &lock, &load, NULL); ++ } else { ++ coord_t parent_coord; ++ ret = jnode_lock_parent_coord(org, &parent_coord, &lock, ++ &load, ZNODE_WRITE_LOCK, 0); ++ if (ret) ++ goto done; ++ if (!item_is_extent(&parent_coord)) { ++ /* file was converted to tail, org became HB, we found internal ++ item */ ++ ret = -EAGAIN; ++ goto done; ++ } ++ ++ pos->state = POS_ON_EPOINT; ++ move_flush_pos(pos, &lock, &load, &parent_coord); ++ pos->child = jref(org); ++ if (extent_is_unallocated(&parent_coord) ++ && extent_unit_index(&parent_coord) != index_jnode(org)) { ++ /* @org is not first child of its parent unit. This may happen ++ because longerm lock of its parent node was released between ++ scan_left and scan_right. For now work around this having flush to repeat */ ++ ret = -EAGAIN; ++ } ++ } ++ ++ done: ++ done_load_count(&load); ++ done_lh(&lock); ++ return ret; ++} ++ ++/* TODO LIST (no particular order): */ ++/* I have labelled most of the legitimate FIXME comments in this file with letters to ++ indicate which issue they relate to. There are a few miscellaneous FIXMEs with ++ specific names mentioned instead that need to be inspected/resolved. */ ++/* B. There is an issue described in reverse_relocate_test having to do with an ++ imprecise is_preceder? check having to do with partially-dirty extents. The code that ++ sets preceder hints and computes the preceder is basically untested. Careful testing ++ needs to be done that preceder calculations are done correctly, since if it doesn't ++ affect correctness we will not catch this stuff during regular testing. */ ++/* C. EINVAL, E_DEADLOCK, E_NO_NEIGHBOR, ENOENT handling. It is unclear which of these are ++ considered expected but unlikely conditions. Flush currently returns 0 (i.e., success ++ but no progress, i.e., restart) whenever it receives any of these in jnode_flush(). ++ Many of the calls that may produce one of these return values (i.e., ++ longterm_lock_znode, reiser4_get_parent, reiser4_get_neighbor, ...) check some of these ++ values themselves and, for instance, stop flushing instead of resulting in a restart. ++ If any of these results are true error conditions then flush will go into a busy-loop, ++ as we noticed during testing when a corrupt tree caused find_child_ptr to return ++ ENOENT. It needs careful thought and testing of corner conditions. ++*/ ++/* D. Atomicity of flush_prep against deletion and flush concurrency. Suppose a created ++ block is assigned a block number then early-flushed to disk. It is dirtied again and ++ flush is called again. 
Concurrently, that block is deleted, and the de-allocation of
++ its block number does not need to be deferred, since it is not part of the preserve set
++ (i.e., it didn't exist before the transaction). I think there may be a race condition
++ where flush writes the dirty, created block after the non-deferred deallocated block
++ number is re-allocated, making it possible to write deleted data on top of non-deleted
++ data. It's just a theory, but it needs to be thought out. */
++/* F. bio_alloc() failure is not handled gracefully. */
++/* G. Unallocated children. */
++/* H. Add a WANDERED_LIST to the atom to clarify the placement of wandered blocks. */
++/* I. Rename flush-scan to scan-point, (flush-pos to flush-point?) */
++
++/* JNODE_FLUSH: MAIN ENTRY POINT */
++/* This is the main entry point for flushing a jnode and its dirty neighborhood (dirty
++ neighborhood is named "slum"). Jnode_flush() is called if reiser4 has to write dirty
++ blocks to disk; this happens when the Linux VM decides to reduce the number of dirty
++ pages, or as part of transaction commit.
++
++ Our objective here is to prep and flush the slum the jnode belongs to. We want to
++ squish the slum together, and allocate the nodes in it as we squish because allocation
++ of children affects squishing of parents.
++
++ The "argument" @node tells flush where to start. From there, flush finds the left edge
++ of the slum, and calls squalloc (in which nodes are squeezed and allocated). To find a
++ "better place" to start squalloc, we first perform a flush_scan.
++
++ Flush-scanning may be performed in both left and right directions, but for different
++ purposes. When scanning to the left, we are searching for a node that precedes a
++ sequence of parent-first-ordered nodes which we will then flush in parent-first order.
++ During flush-scanning, we also take the opportunity to count the number of consecutive
++ leaf nodes. If this number is past some threshold (FLUSH_RELOCATE_THRESHOLD), then we
++ make a decision to reallocate leaf nodes (thus favoring write-optimization).
++
++ Since the flush argument node can be anywhere in a sequence of dirty leaves, there may
++ also be dirty nodes to the right of the argument. If the scan-left operation does not
++ count at least FLUSH_RELOCATE_THRESHOLD nodes then we follow it with a right-scan
++ operation to see whether there are, in fact, enough nodes to meet the relocate
++ threshold. Each right- and left-scan operation uses a single flush_scan object.
++
++ After left-scan and possibly right-scan, we prepare a flush_position object with the
++ starting flush point or parent coordinate, which was determined using scan-left.
++
++ Next we call the main flush routine, squalloc, which iterates along the
++ leaf level, squeezing and allocating nodes (and placing them into the flush queue).
++
++ After squalloc returns we take extra steps to ensure that all the children
++ of the final twig node are allocated--this involves repeating squalloc
++ until we finish at a twig with no unallocated children.
++
++ Finally, we call flush_empty_queue to submit write-requests to disk. If we encounter
++ any above-twig nodes during flush_empty_queue that still have unallocated children, we
++ flush_unprep them.
++
++ Flush treats several "failure" cases as non-failures, essentially causing them to start
++ over. E_DEADLOCK is one example. FIXME:(C) EINVAL, E_NO_NEIGHBOR, ENOENT: these should
++ probably be handled properly rather than restarting, but there are a bunch of cases to
++ audit.
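
The relocate decision described above reduces to simple arithmetic on the two scan counts. A minimal standalone sketch, with invented names and constants (not taken from the reiser4 sources):

/* Relocate leaves when the left scan, plus an optional right scan that
 * only runs to make up the difference, reaches the threshold. */
#include <stdio.h>

#define FLUSH_RELOCATE_THRESHOLD 64

static int should_relocate_leaves(int left_count, int right_count)
{
	return left_count + right_count >= FLUSH_RELOCATE_THRESHOLD;
}

int main(void)
{
	int left = 40, right = 0;

	if (left < FLUSH_RELOCATE_THRESHOLD)
		right = 30;	/* right scan runs only for the shortfall */
	printf("relocate leaves: %s\n",
	       should_relocate_leaves(left, right) ? "yes" : "no");
	return 0;
}
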
++*/ ++ ++static int ++jnode_flush(jnode * node, long nr_to_write, long *nr_written, ++ flush_queue_t * fq, int flags) ++{ ++ long ret = 0; ++ flush_scan *right_scan; ++ flush_scan *left_scan; ++ flush_pos_t *flush_pos; ++ int todo; ++ struct super_block *sb; ++ reiser4_super_info_data *sbinfo; ++ jnode *leftmost_in_slum = NULL; ++ ++ assert("jmacd-76619", lock_stack_isclean(get_current_lock_stack())); ++ assert("nikita-3022", schedulable()); ++ ++ /* lock ordering: delete_sema and flush_sema are unordered */ ++ assert("nikita-3185", ++ get_current_super_private()->delete_sema_owner != current); ++ ++ /* allocate right_scan, left_scan and flush_pos */ ++ right_scan = ++ kmalloc(2 * sizeof(*right_scan) + sizeof(*flush_pos), get_gfp_mask()); ++ if (right_scan == NULL) ++ return RETERR(-ENOMEM); ++ left_scan = right_scan + 1; ++ flush_pos = (flush_pos_t *) (left_scan + 1); ++ ++ sb = reiser4_get_current_sb(); ++ sbinfo = get_super_private(sb); ++ if (!reiser4_is_set(sb, REISER4_MTFLUSH)) { ++ down(&sbinfo->flush_sema); ++ } ++ ++ /* Flush-concurrency debug code */ ++#if REISER4_DEBUG ++ atomic_inc(&flush_cnt); ++#endif ++ ++ enter_flush(sb); ++ ++ /* Initialize a flush position. */ ++ pos_init(flush_pos); ++ ++ flush_pos->nr_written = nr_written; ++ flush_pos->fq = fq; ++ flush_pos->flags = flags; ++ flush_pos->nr_to_write = nr_to_write; ++ ++ scan_init(right_scan); ++ scan_init(left_scan); ++ ++ /* First scan left and remember the leftmost scan position. If the leftmost ++ position is unformatted we remember its parent_coord. We scan until counting ++ FLUSH_SCAN_MAXNODES. ++ ++ If starting @node is unformatted, at the beginning of left scan its ++ parent (twig level node, containing extent item) will be long term ++ locked and lock handle will be stored in the ++ @right_scan->parent_lock. This lock is used to start the rightward ++ scan without redoing the tree traversal (necessary to find parent) ++ and, hence, is kept during leftward scan. As a result, we have to ++ use try-lock when taking long term locks during the leftward scan. ++ */ ++ ret = scan_left(left_scan, right_scan, ++ node, sbinfo->flush.scan_maxnodes); ++ if (ret != 0) ++ goto failed; ++ ++ leftmost_in_slum = jref(left_scan->node); ++ scan_done(left_scan); ++ ++ /* Then possibly go right to decide if we will use a policy of relocating leaves. ++ This is only done if we did not scan past (and count) enough nodes during the ++ leftward scan. If we do scan right, we only care to go far enough to establish ++ that at least FLUSH_RELOCATE_THRESHOLD number of nodes are being flushed. The ++ scan limit is the difference between left_scan.count and the threshold. */ ++ ++ todo = sbinfo->flush.relocate_threshold - left_scan->count; ++ /* scan right is inherently deadlock prone, because we are ++ * (potentially) holding a lock on the twig node at this moment. ++ * FIXME: this is incorrect comment: lock is not held */ ++ if (todo > 0) { ++ ret = scan_right(right_scan, node, (unsigned)todo); ++ if (ret != 0) ++ goto failed; ++ } ++ ++ /* Only the right-scan count is needed, release any rightward locks right away. */ ++ scan_done(right_scan); ++ ++ /* ... and the answer is: we should relocate leaf nodes if at least ++ FLUSH_RELOCATE_THRESHOLD nodes were found. */ ++ flush_pos->leaf_relocate = JF_ISSET(node, JNODE_REPACK) || ++ (left_scan->count + right_scan->count >= ++ sbinfo->flush.relocate_threshold); ++ ++ /* Funny business here. 
We set the 'point' in the flush_position at prior to ++ starting squalloc regardless of whether the first point is ++ formatted or unformatted. Without this there would be an invariant, in the ++ rest of the code, that if the flush_position is unformatted then ++ flush_position->point is NULL and flush_position->parent_{lock,coord} is set, ++ and if the flush_position is formatted then flush_position->point is non-NULL ++ and no parent info is set. ++ ++ This seems lazy, but it makes the initial calls to reverse_relocate_test ++ (which ask "is it the pos->point the leftmost child of its parent") much easier ++ because we know the first child already. Nothing is broken by this, but the ++ reasoning is subtle. Holding an extra reference on a jnode during flush can ++ cause us to see nodes with HEARD_BANSHEE during squalloc, because nodes are not ++ removed from sibling lists until they have zero reference count. Flush would ++ never observe a HEARD_BANSHEE node on the left-edge of flush, nodes are only ++ deleted to the right. So if nothing is broken, why fix it? ++ ++ NOTE-NIKITA actually, flush can meet HEARD_BANSHEE node at any ++ point and in any moment, because of the concurrent file system ++ activity (for example, truncate). */ ++ ++ /* Check jnode state after flush_scan completed. Having a lock on this ++ node or its parent (in case of unformatted) helps us in case of ++ concurrent flushing. */ ++ if (jnode_check_flushprepped(leftmost_in_slum) ++ && !jnode_convertible(leftmost_in_slum)) { ++ ret = 0; ++ goto failed; ++ } ++ ++ /* Now setup flush_pos using scan_left's endpoint. */ ++ ret = prepare_flush_pos(flush_pos, leftmost_in_slum); ++ if (ret) ++ goto failed; ++ ++ if (znode_get_level(flush_pos->coord.node) == LEAF_LEVEL ++ && node_is_empty(flush_pos->coord.node)) { ++ znode *empty = flush_pos->coord.node; ++ ++ assert("zam-1022", !ZF_ISSET(empty, JNODE_HEARD_BANSHEE)); ++ ret = delete_empty_node(empty); ++ goto failed; ++ } ++ ++ if (jnode_check_flushprepped(leftmost_in_slum) ++ && !jnode_convertible(leftmost_in_slum)) { ++ ret = 0; ++ goto failed; ++ } ++ ++ /* Set pos->preceder and (re)allocate pos and its ancestors if it is needed */ ++ ret = alloc_pos_and_ancestors(flush_pos); ++ if (ret) ++ goto failed; ++ ++ /* Do the main rightward-bottom-up squeeze and allocate loop. */ ++ ret = squalloc(flush_pos); ++ pos_stop(flush_pos); ++ if (ret) ++ goto failed; ++ ++ /* FIXME_NFQUCMPD: Here, handle the twig-special case for unallocated children. ++ First, the pos_stop() and pos_valid() routines should be modified ++ so that pos_stop() sets a flush_position->stop flag to 1 without ++ releasing the current position immediately--instead release it in ++ pos_done(). This is a better implementation than the current one anyway. ++ ++ It is not clear that all fields of the flush_position should not be released, ++ but at the very least the parent_lock, parent_coord, and parent_load should ++ remain held because they are hold the last twig when pos_stop() is ++ called. ++ ++ When we reach this point in the code, if the parent_coord is set to after the ++ last item then we know that flush reached the end of a twig (and according to ++ the new flush queueing design, we will return now). If parent_coord is not ++ past the last item, we should check if the current twig has any unallocated ++ children to the right (we are not concerned with unallocated children to the ++ left--in that case the twig itself should not have been allocated). 
If the
++ twig has unallocated children to the right, set the parent_coord to that
++ position and then repeat the call to squalloc.
++
++ Testing for unallocated children may be defined in two ways: if any internal
++ item has a fake block number, it is unallocated; if any extent item is
++ unallocated then all of its children are unallocated. But there is a more
++ aggressive approach: if there are any dirty children of the twig to the right
++ of the current position, we may wish to relocate those nodes now. Checking for
++ potential relocation is more expensive as it requires knowing whether there are
++ any dirty children that are not unallocated. The extent_needs_allocation
++ should be used after setting the correct preceder.
++
++ When we reach the end of a twig at this point in the code, if the flush can
++ continue (when the queue is ready) it will need some information on the future
++ starting point. That should be stored away in the flush_handle using a seal, I
++ believe. Holding a jref() on the future starting point may break other code
++ that deletes that node.
++ */
++
++ /* FIXME_NFQUCMPD: Also, we don't want to do any flushing when flush is called
++ above the twig level. If the VM calls flush above the twig level, do nothing
++ and return (but figure out why this happens). The txnmgr should be modified to
++ only flush its leaf-level dirty list. This will do all the necessary squeeze
++ and allocate steps but leave unallocated branches and possibly unallocated
++ twigs (when the twig's leftmost child is not dirty). After flushing the leaf
++ level, the remaining unallocated nodes should be given write-optimized
++ locations. (Possibly, the remaining unallocated twigs should be allocated just
++ before their leftmost child.)
++ */
++
++ /* Any failure reaches this point. */
++ failed:
++
++ switch (ret) {
++ case -E_REPEAT:
++ case -EINVAL:
++ case -E_DEADLOCK:
++ case -E_NO_NEIGHBOR:
++ case -ENOENT:
++ /* FIXME(C): Except for E_DEADLOCK, these should probably be handled properly
++ in each case. They already are handled in many cases. */
++ /* Something bad happened, but difficult to avoid... Try again! */
++ ret = 0;
++ }
++
++ if (leftmost_in_slum)
++ jput(leftmost_in_slum);
++
++ pos_done(flush_pos);
++ scan_done(left_scan);
++ scan_done(right_scan);
++ kfree(right_scan);
++
++ ON_DEBUG(atomic_dec(&flush_cnt));
++
++ leave_flush(sb);
++
++ if (!reiser4_is_set(sb, REISER4_MTFLUSH))
++ up(&sbinfo->flush_sema);
++
++ return ret;
++}
++
++/* The reiser4 flush subsystem can be put into "rapid flush mode", which means
++ * that the flusher should submit all prepped nodes immediately, without keeping
++ * them in flush queues for a long time. The reason for rapid flush mode is to
++ * free memory as fast as possible. */
++
++#if REISER4_USE_RAPID_FLUSH
++
++/**
++ * submit all prepped nodes if rapid flush mode is set,
++ * turn rapid flush mode off.
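
The failure switch above collapses "expected race" outcomes to success-with-no-progress so the caller simply retries. A standalone sketch of that convention, with placeholder error codes invented for illustration:

#include <stdio.h>

enum { E_OK = 0, E_REPEAT, E_DEADLOCK, E_IO };

/* Transient conditions become "retry"; real failures propagate. */
static int filter_flush_error(int err)
{
	switch (err) {
	case E_REPEAT:
	case E_DEADLOCK:
		return E_OK;
	default:
		return err;
	}
}

int main(void)
{
	printf("%d %d\n", filter_flush_error(E_DEADLOCK),
	       filter_flush_error(E_IO));	/* prints: 0 3 */
	return 0;
}
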
++ */ ++ ++static int rapid_flush(flush_pos_t * pos) ++{ ++ if (!wbq_available()) ++ return 0; ++ ++ return write_prepped_nodes(pos); ++} ++ ++#else ++ ++#define rapid_flush(pos) (0) ++ ++#endif /* REISER4_USE_RAPID_FLUSH */ ++ ++static jnode *find_flush_start_jnode(jnode *start, txn_atom *atom, ++ flush_queue_t *fq, int *nr_queued, ++ int flags) ++{ ++ jnode * node; ++ ++ if (start != NULL) { ++ spin_lock_jnode(start); ++ if (!jnode_is_flushprepped(start)) { ++ assert("zam-1056", start->atom == atom); ++ node = start; ++ goto enter; ++ } ++ spin_unlock_jnode(start); ++ } ++ /* ++ * In this loop we process all already prepped (RELOC or OVRWR) and dirtied again ++ * nodes. The atom spin lock is not released until all dirty nodes processed or ++ * not prepped node found in the atom dirty lists. ++ */ ++ while ((node = find_first_dirty_jnode(atom, flags))) { ++ spin_lock_jnode(node); ++ enter: ++ assert("zam-881", JF_ISSET(node, JNODE_DIRTY)); ++ assert("zam-898", !JF_ISSET(node, JNODE_OVRWR)); ++ ++ if (JF_ISSET(node, JNODE_WRITEBACK)) { ++ /* move node to the end of atom's writeback list */ ++ list_move_tail(&node->capture_link, ATOM_WB_LIST(atom)); ++ ++ /* ++ * jnode is not necessarily on dirty list: if it was dirtied when ++ * it was on flush queue - it does not get moved to dirty list ++ */ ++ ON_DEBUG(count_jnode(atom, node, NODE_LIST(node), ++ WB_LIST, 1)); ++ ++ } else if (jnode_is_znode(node) ++ && znode_above_root(JZNODE(node))) { ++ /* ++ * A special case for znode-above-root. The above-root (fake) ++ * znode is captured and dirtied when the tree height changes or ++ * when the root node is relocated. This causes atoms to fuse so ++ * that changes at the root are serialized. However, this node is ++ * never flushed. This special case used to be in lock.c to ++ * prevent the above-root node from ever being captured, but now ++ * that it is captured we simply prevent it from flushing. The ++ * log-writer code relies on this to properly log superblock ++ * modifications of the tree height. ++ */ ++ jnode_make_wander_nolock(node); ++ } else if (JF_ISSET(node, JNODE_RELOC)) { ++ queue_jnode(fq, node); ++ ++(*nr_queued); ++ } else ++ break; ++ ++ spin_unlock_jnode(node); ++ } ++ return node; ++} ++ ++ ++/* Flush some nodes of current atom, usually slum, return -E_REPEAT if there are more nodes ++ * to flush, return 0 if atom's dirty lists empty and keep current atom locked, return ++ * other errors as they are. */ ++int ++flush_current_atom(int flags, long nr_to_write, long *nr_submitted, ++ txn_atom ** atom, jnode *start) ++{ ++ reiser4_super_info_data *sinfo = get_current_super_private(); ++ flush_queue_t *fq = NULL; ++ jnode *node; ++ int nr_queued; ++ int ret; ++ ++ assert("zam-889", atom != NULL && *atom != NULL); ++ assert_spin_locked(&((*atom)->alock)); ++ assert("zam-892", get_current_context()->trans->atom == *atom); ++ ++ nr_to_write = LONG_MAX; ++ while (1) { ++ ret = fq_by_atom(*atom, &fq); ++ if (ret != -E_REPEAT) ++ break; ++ *atom = get_current_atom_locked(); ++ } ++ if (ret) ++ return ret; ++ ++ assert_spin_locked(&((*atom)->alock)); ++ ++ /* parallel flushers limit */ ++ if (sinfo->tmgr.atom_max_flushers != 0) { ++ while ((*atom)->nr_flushers >= sinfo->tmgr.atom_max_flushers) { ++ /* An atom_send_event() call is inside fq_put_nolock() which is ++ called when flush is finished and nr_flushers is ++ decremented. 
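
The throttling just described is a classic bounded-concurrency wait. A minimal standalone sketch using POSIX threads in place of the atom event machinery (all names here are invented, not from the reiser4 sources):

#include <pthread.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t event = PTHREAD_COND_INITIALIZER;
static int nr_flushers;
static const int max_flushers = 4;

static void enter_flusher(void)	/* cf. the nr_flushers++ path above */
{
	pthread_mutex_lock(&lock);
	while (nr_flushers >= max_flushers)
		pthread_cond_wait(&event, &lock);	/* atom_wait_event() analogue */
	nr_flushers++;
	pthread_mutex_unlock(&lock);
}

static void leave_flusher(void)	/* cf. fq_put_nolock()/atom_send_event() */
{
	pthread_mutex_lock(&lock);
	nr_flushers--;
	pthread_cond_broadcast(&event);
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	enter_flusher();
	leave_flusher();
	return 0;
}
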
*/ ++ atom_wait_event(*atom); ++ *atom = get_current_atom_locked(); ++ } ++ } ++ ++ /* count ourself as a flusher */ ++ (*atom)->nr_flushers++; ++ ++ writeout_mode_enable(); ++ ++ nr_queued = 0; ++ node = find_flush_start_jnode(start, *atom, fq, &nr_queued, flags); ++ ++ if (node == NULL) { ++ if (nr_queued == 0) { ++ (*atom)->nr_flushers--; ++ fq_put_nolock(fq); ++ atom_send_event(*atom); ++ /* current atom remains locked */ ++ writeout_mode_disable(); ++ return 0; ++ } ++ spin_unlock_atom(*atom); ++ } else { ++ jref(node); ++ BUG_ON((*atom)->super != node->tree->super); ++ spin_unlock_atom(*atom); ++ spin_unlock_jnode(node); ++ BUG_ON(nr_to_write == 0); ++ ret = jnode_flush(node, nr_to_write, nr_submitted, fq, flags); ++ jput(node); ++ } ++ ++ ret = ++ write_fq(fq, nr_submitted, ++ WRITEOUT_SINGLE_STREAM | WRITEOUT_FOR_PAGE_RECLAIM); ++ ++ *atom = get_current_atom_locked(); ++ (*atom)->nr_flushers--; ++ fq_put_nolock(fq); ++ atom_send_event(*atom); ++ spin_unlock_atom(*atom); ++ ++ writeout_mode_disable(); ++ ++ if (ret == 0) ++ ret = -E_REPEAT; ++ ++ return ret; ++} ++ ++/* REVERSE PARENT-FIRST RELOCATION POLICIES */ ++ ++/* This implements the is-it-close-enough-to-its-preceder? test for relocation in the ++ reverse parent-first relocate context. Here all we know is the preceder and the block ++ number. Since we are going in reverse, the preceder may still be relocated as well, so ++ we can't ask the block allocator "is there a closer block available to relocate?" here. ++ In the _forward_ parent-first relocate context (not here) we actually call the block ++ allocator to try and find a closer location. */ ++static int ++reverse_relocate_if_close_enough(const reiser4_block_nr * pblk, ++ const reiser4_block_nr * nblk) ++{ ++ reiser4_block_nr dist; ++ ++ assert("jmacd-7710", *pblk != 0 && *nblk != 0); ++ assert("jmacd-7711", !blocknr_is_fake(pblk)); ++ assert("jmacd-7712", !blocknr_is_fake(nblk)); ++ ++ /* Distance is the absolute value. */ ++ dist = (*pblk > *nblk) ? (*pblk - *nblk) : (*nblk - *pblk); ++ ++ /* If the block is less than FLUSH_RELOCATE_DISTANCE blocks away from its preceder ++ block, do not relocate. */ ++ if (dist <= get_current_super_private()->flush.relocate_distance) { ++ return 0; ++ } ++ ++ return 1; ++} ++ ++/* This function is a predicate that tests for relocation. Always called in the ++ reverse-parent-first context, when we are asking whether the current node should be ++ relocated in order to expand the flush by dirtying the parent level (and thus ++ proceeding to flush that level). When traversing in the forward parent-first direction ++ (not here), relocation decisions are handled in two places: allocate_znode() and ++ extent_needs_allocation(). */ ++static int ++reverse_relocate_test(jnode * node, const coord_t * parent_coord, ++ flush_pos_t * pos) ++{ ++ reiser4_block_nr pblk = 0; ++ reiser4_block_nr nblk = 0; ++ ++ assert("jmacd-8989", !jnode_is_root(node)); ++ ++ /* ++ * This function is called only from the ++ * reverse_relocate_check_dirty_parent() and only if the parent ++ * node is clean. This implies that the parent has the real (i.e., not ++ * fake) block number, and, so does the child, because otherwise the ++ * parent would be dirty. ++ */ ++ ++ /* New nodes are treated as if they are being relocated. */ ++ if (JF_ISSET (node, JNODE_CREATED) || ++ (pos->leaf_relocate && jnode_get_level(node) == LEAF_LEVEL)) { ++ return 1; ++ } ++ ++ /* Find the preceder. 
FIXME(B): When the child is an unformatted, previously ++ existing node, the coord may be leftmost even though the child is not the ++ parent-first preceder of the parent. If the first dirty node appears somewhere ++ in the middle of the first extent unit, this preceder calculation is wrong. ++ Needs more logic in here. */ ++ if (coord_is_leftmost_unit(parent_coord)) { ++ pblk = *znode_get_block(parent_coord->node); ++ } else { ++ pblk = pos->preceder.blk; ++ } ++ check_preceder(pblk); ++ ++ /* If (pblk == 0) then the preceder isn't allocated or isn't known: relocate. */ ++ if (pblk == 0) { ++ return 1; ++ } ++ ++ nblk = *jnode_get_block(node); ++ ++ if (blocknr_is_fake(&nblk)) ++ /* child is unallocated, mark parent dirty */ ++ return 1; ++ ++ return reverse_relocate_if_close_enough(&pblk, &nblk); ++} ++ ++/* This function calls reverse_relocate_test to make a reverse-parent-first ++ relocation decision and then, if yes, it marks the parent dirty. */ ++static int ++reverse_relocate_check_dirty_parent(jnode * node, const coord_t * parent_coord, ++ flush_pos_t * pos) ++{ ++ int ret; ++ ++ if (!JF_ISSET(ZJNODE(parent_coord->node), JNODE_DIRTY)) { ++ ++ ret = reverse_relocate_test(node, parent_coord, pos); ++ if (ret < 0) { ++ return ret; ++ } ++ ++ /* FIXME-ZAM ++ if parent is already relocated - we do not want to grab space, right? */ ++ if (ret == 1) { ++ int grabbed; ++ ++ grabbed = get_current_context()->grabbed_blocks; ++ if (reiser4_grab_space_force((__u64) 1, BA_RESERVED) != ++ 0) ++ reiser4_panic("umka-1250", ++ "No space left during flush."); ++ ++ assert("jmacd-18923", ++ znode_is_write_locked(parent_coord->node)); ++ znode_make_dirty(parent_coord->node); ++ grabbed2free_mark(grabbed); ++ } ++ } ++ ++ return 0; ++} ++ ++/* INITIAL ALLOCATE ANCESTORS STEP (REVERSE PARENT-FIRST ALLOCATION BEFORE FORWARD ++ PARENT-FIRST LOOP BEGINS) */ ++ ++/* Get the leftmost child for given coord. */ ++static int get_leftmost_child_of_unit(const coord_t * coord, jnode ** child) ++{ ++ int ret; ++ ++ ret = item_utmost_child(coord, LEFT_SIDE, child); ++ ++ if (ret) ++ return ret; ++ ++ if (IS_ERR(*child)) ++ return PTR_ERR(*child); ++ ++ return 0; ++} ++ ++/* This step occurs after the left- and right-scans are completed, before starting the ++ forward parent-first traversal. Here we attempt to allocate ancestors of the starting ++ flush point, which means continuing in the reverse parent-first direction to the ++ parent, grandparent, and so on (as long as the child is a leftmost child). This ++ routine calls a recursive process, alloc_one_ancestor, which does the real work, ++ except there is special-case handling here for the first ancestor, which may be a twig. ++ At each level (here and alloc_one_ancestor), we check for relocation and then, if ++ the child is a leftmost child, repeat at the next level. On the way back down (the ++ recursion), we allocate the ancestors in parent-first order. */ ++static int alloc_pos_and_ancestors(flush_pos_t * pos) ++{ ++ int ret = 0; ++ lock_handle plock; ++ load_count pload; ++ coord_t pcoord; ++ ++ if (znode_check_flushprepped(pos->lock.node)) ++ return 0; ++ ++ coord_init_invalid(&pcoord, NULL); ++ init_lh(&plock); ++ init_load_count(&pload); ++ ++ if (pos->state == POS_ON_EPOINT) { ++ /* a special case for pos on twig level, where we already have ++ a lock on parent node. */ ++ /* The parent may not be dirty, in which case we should decide ++ whether to relocate the child now. If decision is made to ++ relocate the child, the parent is marked dirty. 
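
Distilled to its essentials, the reverse relocate decision combines three rules: new nodes (and leaves under the leaf-relocate policy) always relocate, an unknown preceder forces relocation, and otherwise only distance matters. A standalone sketch with invented names and plain integers standing in for block numbers:

#include <stdio.h>

static unsigned long relocate_distance = 64;	/* stand-in for the tunable */

static int reverse_relocate(unsigned long preceder_blk, unsigned long node_blk,
			    int node_is_new, int leaf_relocate)
{
	unsigned long dist;

	if (node_is_new || leaf_relocate)
		return 1;		/* new nodes are treated as relocated */
	if (preceder_blk == 0)
		return 1;		/* preceder unknown: relocate */
	dist = preceder_blk > node_blk ? preceder_blk - node_blk
				       : node_blk - preceder_blk;
	return dist > relocate_distance;	/* too far from its preceder */
}

int main(void)
{
	printf("%d %d\n", reverse_relocate(1000, 1030, 0, 0),	/* 0: close enough */
	       reverse_relocate(1000, 2000, 0, 0));		/* 1: relocate */
	return 0;
}
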
*/ ++ ret = ++ reverse_relocate_check_dirty_parent(pos->child, &pos->coord, ++ pos); ++ if (ret) ++ goto exit; ++ ++ /* FIXME_NFQUCMPD: We only need to allocate the twig (if child ++ is leftmost) and the leaf/child, so recursion is not needed. ++ Levels above the twig will be allocated for ++ write-optimization before the transaction commits. */ ++ ++ /* Do the recursive step, allocating zero or more of our ++ * ancestors. */ ++ ret = alloc_one_ancestor(&pos->coord, pos); ++ ++ } else { ++ if (!znode_is_root(pos->lock.node)) { ++ /* all formatted nodes except tree root */ ++ ret = ++ reiser4_get_parent(&plock, pos->lock.node, ++ ZNODE_WRITE_LOCK); ++ if (ret) ++ goto exit; ++ ++ ret = incr_load_count_znode(&pload, plock.node); ++ if (ret) ++ goto exit; ++ ++ ret = ++ find_child_ptr(plock.node, pos->lock.node, &pcoord); ++ if (ret) ++ goto exit; ++ ++ ret = ++ reverse_relocate_check_dirty_parent(ZJNODE ++ (pos->lock. ++ node), &pcoord, ++ pos); ++ if (ret) ++ goto exit; ++ ++ ret = alloc_one_ancestor(&pcoord, pos); ++ if (ret) ++ goto exit; ++ } ++ ++ ret = allocate_znode(pos->lock.node, &pcoord, pos); ++ } ++ exit: ++ done_load_count(&pload); ++ done_lh(&plock); ++ return ret; ++} ++ ++/* This is the recursive step described in alloc_pos_and_ancestors, above. Ignoring the ++ call to set_preceder, which is the next function described, this checks if the ++ child is a leftmost child and returns if it is not. If the child is a leftmost child ++ it checks for relocation, possibly dirtying the parent. Then it performs the recursive ++ step. */ ++static int alloc_one_ancestor(const coord_t * coord, flush_pos_t * pos) ++{ ++ int ret = 0; ++ lock_handle alock; ++ load_count aload; ++ coord_t acoord; ++ ++ /* As we ascend at the left-edge of the region to flush, take this opportunity at ++ the twig level to find our parent-first preceder unless we have already set ++ it. */ ++ if (pos->preceder.blk == 0) { ++ ret = set_preceder(coord, pos); ++ if (ret != 0) ++ return ret; ++ } ++ ++ /* If the ancestor is clean or already allocated, or if the child is not a ++ leftmost child, stop going up, even leaving coord->node not flushprepped. */ ++ if (znode_check_flushprepped(coord->node) ++ || !coord_is_leftmost_unit(coord)) ++ return 0; ++ ++ init_lh(&alock); ++ init_load_count(&aload); ++ coord_init_invalid(&acoord, NULL); ++ ++ /* Only ascend to the next level if it is a leftmost child, but write-lock the ++ parent in case we will relocate the child. */ ++ if (!znode_is_root(coord->node)) { ++ ++ ret = ++ jnode_lock_parent_coord(ZJNODE(coord->node), &acoord, ++ &alock, &aload, ZNODE_WRITE_LOCK, ++ 0); ++ if (ret != 0) { ++ /* FIXME(C): check EINVAL, E_DEADLOCK */ ++ goto exit; ++ } ++ ++ ret = ++ reverse_relocate_check_dirty_parent(ZJNODE(coord->node), ++ &acoord, pos); ++ if (ret != 0) { ++ goto exit; ++ } ++ ++ /* Recursive call. */ ++ if (!znode_check_flushprepped(acoord.node)) { ++ ret = alloc_one_ancestor(&acoord, pos); ++ if (ret) ++ goto exit; ++ } ++ } ++ ++ /* Note: we call allocate with the parent write-locked (except at the root) in ++ case we relocate the child, in which case it will modify the parent during this ++ call. */ ++ ret = allocate_znode(coord->node, &acoord, pos); ++ ++ exit: ++ done_load_count(&aload); ++ done_lh(&alock); ++ return ret; ++} ++ ++/* During the reverse parent-first alloc_pos_and_ancestors process described above there is ++ a call to this function at the twig level. 
During alloc_pos_and_ancestors we may ask: ++ should this node be relocated (in reverse parent-first context)? We repeat this ++ process as long as the child is the leftmost child, eventually reaching an ancestor of ++ the flush point that is not a leftmost child. The preceder of that ancestors, which is ++ not a leftmost child, is actually on the leaf level. The preceder of that block is the ++ left-neighbor of the flush point. The preceder of that block is the rightmost child of ++ the twig on the left. So, when alloc_pos_and_ancestors passes upward through the twig ++ level, it stops momentarily to remember the block of the rightmost child of the twig on ++ the left and sets it to the flush_position's preceder_hint. ++ ++ There is one other place where we may set the flush_position's preceder hint, which is ++ during scan-left. ++*/ ++static int set_preceder(const coord_t * coord_in, flush_pos_t * pos) ++{ ++ int ret; ++ coord_t coord; ++ lock_handle left_lock; ++ load_count left_load; ++ ++ coord_dup(&coord, coord_in); ++ ++ init_lh(&left_lock); ++ init_load_count(&left_load); ++ ++ /* FIXME(B): Same FIXME as in "Find the preceder" in reverse_relocate_test. ++ coord_is_leftmost_unit is not the right test if the unformatted child is in the ++ middle of the first extent unit. */ ++ if (!coord_is_leftmost_unit(&coord)) { ++ coord_prev_unit(&coord); ++ } else { ++ ret = ++ reiser4_get_left_neighbor(&left_lock, coord.node, ++ ZNODE_READ_LOCK, GN_SAME_ATOM); ++ if (ret) { ++ /* If we fail for any reason it doesn't matter because the ++ preceder is only a hint. We are low-priority at this point, so ++ this must be the case. */ ++ if (ret == -E_REPEAT || ret == -E_NO_NEIGHBOR || ++ ret == -ENOENT || ret == -EINVAL ++ || ret == -E_DEADLOCK) { ++ ret = 0; ++ } ++ goto exit; ++ } ++ ++ ret = incr_load_count_znode(&left_load, left_lock.node); ++ if (ret) ++ goto exit; ++ ++ coord_init_last_unit(&coord, left_lock.node); ++ } ++ ++ ret = ++ item_utmost_child_real_block(&coord, RIGHT_SIDE, ++ &pos->preceder.blk); ++ exit: ++ check_preceder(pos->preceder.blk); ++ done_load_count(&left_load); ++ done_lh(&left_lock); ++ return ret; ++} ++ ++/* MAIN SQUEEZE AND ALLOCATE LOOP (THREE BIG FUNCTIONS) */ ++ ++/* This procedure implements the outer loop of the flush algorithm. To put this in ++ context, here is the general list of steps taken by the flush routine as a whole: ++ ++ 1. Scan-left ++ 2. Scan-right (maybe) ++ 3. Allocate initial flush position and its ancestors ++ 4. ++ 5. ++ 6. ++ ++ This procedure implements the loop in steps 4 through 6 in the above listing. ++ ++ Step 4: if the current flush position is an extent item (position on the twig level), ++ it allocates the extent (allocate_extent_item_in_place) then shifts to the next ++ coordinate. If the next coordinate's leftmost child needs flushprep, we will continue. ++ If the next coordinate is an internal item, we descend back to the leaf level, ++ otherwise we repeat a step #4 (labeled ALLOC_EXTENTS below). If the "next coordinate" ++ brings us past the end of the twig level, then we call ++ reverse_relocate_end_of_twig to possibly dirty the next (right) twig, prior to ++ step #5 which moves to the right. ++ ++ Step 5: calls squalloc_changed_ancestors, which initiates a recursive call up the ++ tree to allocate any ancestors of the next-right flush position that are not also ++ ancestors of the current position. Those ancestors (in top-down order) are the next in ++ parent-first order. 
We squeeze adjacent nodes on the way up until the right node and ++ current node share the same parent, then allocate on the way back down. Finally, this ++ step sets the flush position to the next-right node. Then repeat steps 4 and 5. ++*/ ++ ++/* SQUEEZE CODE */ ++ ++/* squalloc_right_twig helper function, cut a range of extent items from ++ cut node to->node from the beginning up to coord @to. */ ++static int squalloc_right_twig_cut(coord_t * to, reiser4_key * to_key, ++ znode * left) ++{ ++ coord_t from; ++ reiser4_key from_key; ++ ++ coord_init_first_unit(&from, to->node); ++ item_key_by_coord(&from, &from_key); ++ ++ return cut_node_content(&from, to, &from_key, to_key, NULL); ++} ++ ++/* Copy as much of the leading extents from @right to @left, allocating ++ unallocated extents as they are copied. Returns SQUEEZE_TARGET_FULL or ++ SQUEEZE_SOURCE_EMPTY when no more can be shifted. If the next item is an ++ internal item it calls shift_one_internal_unit and may then return ++ SUBTREE_MOVED. */ ++static int squeeze_right_twig(znode * left, znode * right, flush_pos_t * pos) ++{ ++ int ret = SUBTREE_MOVED; ++ coord_t coord; /* used to iterate over items */ ++ reiser4_key stop_key; ++ ++ assert("jmacd-2008", !node_is_empty(right)); ++ coord_init_first_unit(&coord, right); ++ ++ /* FIXME: can be optimized to cut once */ ++ while (!node_is_empty(coord.node) && item_is_extent(&coord)) { ++ ON_DEBUG(void *vp); ++ ++ assert("vs-1468", coord_is_leftmost_unit(&coord)); ++ ON_DEBUG(vp = shift_check_prepare(left, coord.node)); ++ ++ /* stop_key is used to find what was copied and what to cut */ ++ stop_key = *min_key(); ++ ret = squalloc_extent(left, &coord, pos, &stop_key); ++ if (ret != SQUEEZE_CONTINUE) { ++ ON_DEBUG(kfree(vp)); ++ break; ++ } ++ assert("vs-1465", !keyeq(&stop_key, min_key())); ++ ++ /* Helper function to do the cutting. */ ++ set_key_offset(&stop_key, get_key_offset(&stop_key) - 1); ++ check_me("vs-1466", ++ squalloc_right_twig_cut(&coord, &stop_key, left) == 0); ++ ++ ON_DEBUG(shift_check(vp, left, coord.node)); ++ } ++ ++ if (node_is_empty(coord.node)) ++ ret = SQUEEZE_SOURCE_EMPTY; ++ ++ if (ret == SQUEEZE_TARGET_FULL) { ++ goto out; ++ } ++ ++ if (node_is_empty(right)) { ++ /* The whole right node was copied into @left. */ ++ assert("vs-464", ret == SQUEEZE_SOURCE_EMPTY); ++ goto out; ++ } ++ ++ coord_init_first_unit(&coord, right); ++ ++ if (!item_is_internal(&coord)) { ++ /* we do not want to squeeze anything else to left neighbor because "slum" ++ is over */ ++ ret = SQUEEZE_TARGET_FULL; ++ goto out; ++ } ++ assert("jmacd-433", item_is_internal(&coord)); ++ ++ /* Shift an internal unit. The child must be allocated before shifting any more ++ extents, so we stop here. */ ++ ret = shift_one_internal_unit(left, right); ++ ++ out: ++ assert("jmacd-8612", ret < 0 || ret == SQUEEZE_TARGET_FULL ++ || ret == SUBTREE_MOVED || ret == SQUEEZE_SOURCE_EMPTY); ++ ++ if (ret == SQUEEZE_TARGET_FULL) { ++ /* We submit prepped nodes here and expect that this @left twig ++ * will not be modified again during this jnode_flush() call. */ ++ int ret1; ++ ++ /* NOTE: seems like io is done under long term locks. 
*/ ++ ret1 = write_prepped_nodes(pos); ++ if (ret1 < 0) ++ return ret1; ++ } ++ ++ return ret; ++} ++ ++#if REISER4_DEBUG ++static void item_convert_invariant(flush_pos_t * pos) ++{ ++ assert("edward-1225", coord_is_existing_item(&pos->coord)); ++ if (chaining_data_present(pos)) { ++ item_plugin *iplug = item_convert_plug(pos); ++ ++ assert("edward-1000", ++ iplug == item_plugin_by_coord(&pos->coord)); ++ assert("edward-1001", iplug->f.convert != NULL); ++ } else ++ assert("edward-1226", pos->child == NULL); ++} ++#else ++ ++#define item_convert_invariant(pos) noop ++ ++#endif ++ ++/* Scan node items starting from the first one and apply for each ++ item its flush ->convert() method (if any). This method may ++ resize/kill the item so the tree will be changed. ++*/ ++static int convert_node(flush_pos_t * pos, znode * node) ++{ ++ int ret = 0; ++ item_plugin *iplug; ++ ++ assert("edward-304", pos != NULL); ++ assert("edward-305", pos->child == NULL); ++ assert("edward-475", znode_convertible(node)); ++ assert("edward-669", znode_is_wlocked(node)); ++ assert("edward-1210", !node_is_empty(node)); ++ ++ if (znode_get_level(node) != LEAF_LEVEL) ++ /* unsupported */ ++ goto exit; ++ ++ coord_init_first_unit(&pos->coord, node); ++ ++ while (1) { ++ ret = 0; ++ coord_set_to_left(&pos->coord); ++ item_convert_invariant(pos); ++ ++ iplug = item_plugin_by_coord(&pos->coord); ++ assert("edward-844", iplug != NULL); ++ ++ if (iplug->f.convert) { ++ ret = iplug->f.convert(pos); ++ if (ret) ++ goto exit; ++ } ++ assert("edward-307", pos->child == NULL); ++ ++ if (coord_next_item(&pos->coord)) { ++ /* node is over */ ++ ++ if (!chaining_data_present(pos)) ++ /* finished this node */ ++ break; ++ if (should_chain_next_node(pos)) { ++ /* go to next node */ ++ move_chaining_data(pos, 0 /* to next node */ ); ++ break; ++ } ++ /* repeat this node */ ++ move_chaining_data(pos, 1 /* this node */ ); ++ continue; ++ } ++ /* Node is not over. ++ Check if there is attached convert data. ++ If so roll one item position back and repeat ++ on this node ++ */ ++ if (chaining_data_present(pos)) { ++ ++ if (iplug != item_plugin_by_coord(&pos->coord)) ++ set_item_convert_count(pos, 0); ++ ++ ret = coord_prev_item(&pos->coord); ++ assert("edward-1003", !ret); ++ ++ move_chaining_data(pos, 1 /* this node */ ); ++ } ++ } ++ JF_CLR(ZJNODE(node), JNODE_CONVERTIBLE); ++ znode_make_dirty(node); ++ exit: ++ assert("edward-1004", !ret); ++ return ret; ++} ++ ++/* Squeeze and allocate the right neighbor. This is called after @left and ++ its current children have been squeezed and allocated already. This ++ procedure's job is to squeeze and items from @right to @left. ++ ++ If at the leaf level, use the shift_everything_left memcpy-optimized ++ version of shifting (squeeze_right_leaf). ++ ++ If at the twig level, extents are allocated as they are shifted from @right ++ to @left (squalloc_right_twig). ++ ++ At any other level, shift one internal item and return to the caller ++ (squalloc_parent_first) so that the shifted-subtree can be processed in ++ parent-first order. ++ ++ When unit of internal item is moved, squeezing stops and SUBTREE_MOVED is ++ returned. When all content of @right is squeezed, SQUEEZE_SOURCE_EMPTY is ++ returned. If nothing can be moved into @left anymore, SQUEEZE_TARGET_FULL ++ is returned. 
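
A toy model of the twig-level squeeze and its three-way result convention: leading extent items migrate from @right to @left until an internal item, an empty source, or a full target stops the pass. Types and sizes are invented for this sketch; it is not reiser4 code:

#include <stdio.h>

enum item_kind { EXTENT, INTERNAL };
enum { SUBTREE_MOVED, SQUEEZE_SOURCE_EMPTY, SQUEEZE_TARGET_FULL };

static int squeeze_twig(enum item_kind *left, int *nleft, int cap,
			enum item_kind *right, int *nright)
{
	while (*nright > 0 && right[0] == EXTENT) {
		int i;

		if (*nleft == cap)
			return SQUEEZE_TARGET_FULL;
		left[(*nleft)++] = right[0];	/* "allocate and copy" */
		for (i = 1; i < *nright; i++)	/* cut the copied item */
			right[i - 1] = right[i];
		(*nright)--;
	}
	if (*nright == 0)
		return SQUEEZE_SOURCE_EMPTY;
	return SUBTREE_MOVED;	/* stopped at an internal item; the real code
				 * shifts one internal unit before returning */
}

int main(void)
{
	enum item_kind left[8] = { EXTENT };
	enum item_kind right[4] = { EXTENT, EXTENT, INTERNAL, EXTENT };
	int nleft = 1, nright = 4;

	printf("result=%d nleft=%d nright=%d\n",
	       squeeze_twig(left, &nleft, 8, right, &nright), nleft, nright);
	return 0;
}
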
++*/
++
++static int squeeze_right_neighbor(flush_pos_t * pos, znode * left,
++ znode * right)
++{
++ int ret;
++
++ /* FIXME it is possible to see empty hasn't-heard-banshee node in a
++ * tree owing to error (for example, ENOSPC) in write */
++ /* assert("jmacd-9321", !node_is_empty(left)); */
++ assert("jmacd-9322", !node_is_empty(right));
++ assert("jmacd-9323", znode_get_level(left) == znode_get_level(right));
++
++ switch (znode_get_level(left)) {
++ case TWIG_LEVEL:
++ /* Shift with extent allocating until either an internal item
++ is encountered, everything is shifted, or no free space is
++ left in @left */
++ ret = squeeze_right_twig(left, right, pos);
++ break;
++
++ default:
++ /* All other levels can use shift_everything until we implement per-item
++ flush plugins. */
++ ret = squeeze_right_non_twig(left, right);
++ break;
++ }
++
++ assert("jmacd-2011", (ret < 0 ||
++ ret == SQUEEZE_SOURCE_EMPTY
++ || ret == SQUEEZE_TARGET_FULL
++ || ret == SUBTREE_MOVED));
++ return ret;
++}
++
++static int squeeze_right_twig_and_advance_coord(flush_pos_t * pos,
++ znode * right)
++{
++ int ret;
++
++ ret = squeeze_right_twig(pos->lock.node, right, pos);
++ if (ret < 0)
++ return ret;
++ if (ret > 0) {
++ coord_init_after_last_item(&pos->coord, pos->lock.node);
++ return ret;
++ }
++
++ coord_init_last_unit(&pos->coord, pos->lock.node);
++ return 0;
++}
++
++/* forward declaration */
++static int squalloc_upper_levels(flush_pos_t *, znode *, znode *);
++
++/* do a fast check for "same parents" condition before calling
++ * squalloc_upper_levels() */
++static inline int check_parents_and_squalloc_upper_levels(flush_pos_t * pos,
++ znode * left,
++ znode * right)
++{
++ if (znode_same_parents(left, right))
++ return 0;
++
++ return squalloc_upper_levels(pos, left, right);
++}
++
++/* Check whether the parent of the given @right node needs to be processed
++ ((re)allocated) prior to processing of the child. If @left and @right do not
++ share the same parent, then the parent of @right comes after @left but before
++ @right in parent-first order, and we have to (re)allocate it before @right
++ gets (re)allocated. */
++static int squalloc_upper_levels(flush_pos_t * pos, znode * left, znode * right)
++{
++ int ret;
++
++ lock_handle left_parent_lock;
++ lock_handle right_parent_lock;
++
++ load_count left_parent_load;
++ load_count right_parent_load;
++
++ init_lh(&left_parent_lock);
++ init_lh(&right_parent_lock);
++
++ init_load_count(&left_parent_load);
++ init_load_count(&right_parent_load);
++
++ ret = reiser4_get_parent(&left_parent_lock, left, ZNODE_WRITE_LOCK);
++ if (ret)
++ goto out;
++
++ ret = reiser4_get_parent(&right_parent_lock, right, ZNODE_WRITE_LOCK);
++ if (ret)
++ goto out;
++
++ /* Check for same parents */
++ if (left_parent_lock.node == right_parent_lock.node)
++ goto out;
++
++ if (znode_check_flushprepped(right_parent_lock.node)) {
++ /* Keep parent-first order. In that order, the right parent node stands
++ before the @right node. If it is already allocated, we set the
++ preceder (next block search start point) to its block number; the
++ @right node should be allocated after it.
++
++ However, the preceder is set only if the right parent is on the twig
++ level. The explanation is the following: new branch nodes are
++ allocated over already allocated children while the tree grows, so it
++ is difficult to keep the tree ordered, and we assume that only leaves
++ and twigs are correctly allocated. So, only twigs are used as a
++ preceder for allocating the rest of the slum. */
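
The upward recursion above enforces one invariant: every non-shared ancestor of @right is handled before @right itself. A toy standalone model of that ordering, with parent pointers on an invented tree (not reiser4 code):

#include <stdio.h>

struct tnode {
	struct tnode *parent;
	const char *name;
	int allocated;
};

/* Ascend while @left and @right have different parents, then allocate
 * the non-shared ancestors of @right top-down, finishing with @right:
 * parent-first order. */
static void alloc_parent_first(struct tnode *left, struct tnode *right)
{
	if (right->parent != NULL && right->parent != left->parent)
		alloc_parent_first(left->parent, right->parent);
	if (!right->allocated) {
		right->allocated = 1;
		printf("allocate %s\n", right->name);
	}
}

int main(void)
{
	struct tnode root = { NULL, "root", 1 };
	struct tnode p1 = { &root, "parent-of-left", 1 };
	struct tnode p2 = { &root, "parent-of-right", 0 };
	struct tnode left = { &p1, "left", 1 };
	struct tnode right = { &p2, "right", 0 };

	alloc_parent_first(&left, &right);	/* parent-of-right, then right */
	return 0;
}
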
++ if (znode_get_level(right_parent_lock.node) == TWIG_LEVEL) {
++ pos->preceder.blk =
++ *znode_get_block(right_parent_lock.node);
++ check_preceder(pos->preceder.blk);
++ }
++ goto out;
++ }
++
++ ret = incr_load_count_znode(&left_parent_load, left_parent_lock.node);
++ if (ret)
++ goto out;
++
++ ret = incr_load_count_znode(&right_parent_load, right_parent_lock.node);
++ if (ret)
++ goto out;
++
++ ret =
++ squeeze_right_neighbor(pos, left_parent_lock.node,
++ right_parent_lock.node);
++ /* We stop if error. We stop if some items/units were shifted (ret == 0)
++ * and thus @right changed its parent; it means we do not have to process
++ * the right_parent node prior to processing @right. Positive return
++ * values say that shifting of items did not happen, because of the
++ * "empty source" or "target full" conditions. */
++ if (ret <= 0)
++ goto out;
++
++ /* parent(@left) and parent(@right) may have different parents also. We
++ * do a recursive call for checking that. */
++ ret =
++ check_parents_and_squalloc_upper_levels(pos, left_parent_lock.node,
++ right_parent_lock.node);
++ if (ret)
++ goto out;
++
++ /* allocate znode when going down */
++ ret = lock_parent_and_allocate_znode(right_parent_lock.node, pos);
++
++ out:
++ done_load_count(&left_parent_load);
++ done_load_count(&right_parent_load);
++
++ done_lh(&left_parent_lock);
++ done_lh(&right_parent_lock);
++
++ return ret;
++}
++
++/* Check the leftmost child's "flushprepped" status; also returns true if the
++ * child node was not found in the cache. */
++static int leftmost_child_of_unit_check_flushprepped(const coord_t * coord)
++{
++ int ret;
++ int prepped;
++
++ jnode *child;
++
++ ret = get_leftmost_child_of_unit(coord, &child);
++
++ if (ret)
++ return ret;
++
++ if (child) {
++ prepped = jnode_check_flushprepped(child);
++ jput(child);
++ } else {
++ /* We treat a nonexistent child as a node to which slum
++ processing should not continue. A node that is not cached is
++ clean, so it is flushprepped. */
++ prepped = 1;
++ }
++
++ return prepped;
++}
++
++/* (re)allocate znode with automated getting parent node */
++static int lock_parent_and_allocate_znode(znode * node, flush_pos_t * pos)
++{
++ int ret;
++ lock_handle parent_lock;
++ load_count parent_load;
++ coord_t pcoord;
++
++ assert("zam-851", znode_is_write_locked(node));
++
++ init_lh(&parent_lock);
++ init_load_count(&parent_load);
++
++ ret = reiser4_get_parent(&parent_lock, node, ZNODE_WRITE_LOCK);
++ if (ret)
++ goto out;
++
++ ret = incr_load_count_znode(&parent_load, parent_lock.node);
++ if (ret)
++ goto out;
++
++ ret = find_child_ptr(parent_lock.node, node, &pcoord);
++ if (ret)
++ goto out;
++
++ ret = allocate_znode(node, &pcoord, pos);
++
++ out:
++ done_load_count(&parent_load);
++ done_lh(&parent_lock);
++ return ret;
++}
++
++/* Process nodes on leaf level until unformatted node or rightmost node in the
++ * slum reached. */
++static int handle_pos_on_formatted(flush_pos_t * pos)
++{
++ int ret;
++ lock_handle right_lock;
++ load_count right_load;
++
++ init_lh(&right_lock);
++ init_load_count(&right_load);
++
++ if (should_convert_node(pos, pos->lock.node)) {
++ ret = convert_node(pos, pos->lock.node);
++ if (ret)
++ return ret;
++ }
++
++ while (1) {
++ ret =
++ neighbor_in_slum(pos->lock.node, &right_lock, RIGHT_SIDE,
++ ZNODE_WRITE_LOCK,
++ !should_convert_next_node(pos,
++ right_lock.
++ node));
++ if (ret)
++ break;
++
++ /* we don't prep(allocate) nodes for flushing twice. This can be suboptimal, or it
++ * can be optimal.
For now we choose to live with the risk that it will ++ * be suboptimal because it would be quite complex to code it to be ++ * smarter. */ ++ if (znode_check_flushprepped(right_lock.node) ++ && !znode_convertible(right_lock.node)) { ++ assert("edward-1005", ++ !should_convert_next_node(pos, right_lock.node)); ++ pos_stop(pos); ++ break; ++ } ++ ++ ret = incr_load_count_znode(&right_load, right_lock.node); ++ if (ret) ++ break; ++ ++ if (should_convert_node(pos, right_lock.node)) { ++ ret = convert_node(pos, right_lock.node); ++ if (ret) ++ break; ++ if (node_is_empty(right_lock.node)) { ++ /* node became empty after converting, repeat */ ++ done_load_count(&right_load); ++ done_lh(&right_lock); ++ continue; ++ } ++ } ++ ++ /* squeeze _before_ going upward. */ ++ ret = ++ squeeze_right_neighbor(pos, pos->lock.node, ++ right_lock.node); ++ if (ret < 0) ++ break; ++ ++ if (znode_check_flushprepped(right_lock.node)) { ++ if (should_convert_next_node(pos, right_lock.node)) { ++ /* in spite of flushprepped status of the node, ++ its right slum neighbor should be converted */ ++ assert("edward-953", convert_data(pos)); ++ assert("edward-954", item_convert_data(pos)); ++ ++ if (node_is_empty(right_lock.node)) { ++ done_load_count(&right_load); ++ done_lh(&right_lock); ++ } else ++ move_flush_pos(pos, &right_lock, ++ &right_load, NULL); ++ continue; ++ } ++ pos_stop(pos); ++ break; ++ } ++ ++ if (node_is_empty(right_lock.node)) { ++ /* repeat if right node was squeezed completely */ ++ done_load_count(&right_load); ++ done_lh(&right_lock); ++ continue; ++ } ++ ++ /* parent(right_lock.node) has to be processed before ++ * (right_lock.node) due to "parent-first" allocation order. */ ++ ret = ++ check_parents_and_squalloc_upper_levels(pos, pos->lock.node, ++ right_lock.node); ++ if (ret) ++ break; ++ /* (re)allocate _after_ going upward */ ++ ret = lock_parent_and_allocate_znode(right_lock.node, pos); ++ if (ret) ++ break; ++ ++ if (should_terminate_squalloc(pos)) { ++ set_item_convert_count(pos, 0); ++ break; ++ } ++ ++ /* advance the flush position to the right neighbor */ ++ move_flush_pos(pos, &right_lock, &right_load, NULL); ++ ++ ret = rapid_flush(pos); ++ if (ret) ++ break; ++ } ++ ++ assert("edward-1006", !convert_data(pos) || !item_convert_data(pos)); ++ ++ done_load_count(&right_load); ++ done_lh(&right_lock); ++ ++ /* This function indicates via pos whether to stop or go to twig or continue on current ++ * level. */ ++ return ret; ++ ++} ++ ++/* Process nodes on leaf level until unformatted node or rightmost node in the ++ * slum reached. */ ++static int handle_pos_on_leaf(flush_pos_t * pos) ++{ ++ int ret; ++ ++ assert("zam-845", pos->state == POS_ON_LEAF); ++ ++ ret = handle_pos_on_formatted(pos); ++ ++ if (ret == -E_NO_NEIGHBOR) { ++ /* cannot get right neighbor, go process extents. */ ++ pos->state = POS_TO_TWIG; ++ return 0; ++ } ++ ++ return ret; ++} ++ ++/* Process slum on level > 1 */ ++static int handle_pos_on_internal(flush_pos_t * pos) ++{ ++ assert("zam-850", pos->state == POS_ON_INTERNAL); ++ return handle_pos_on_formatted(pos); ++} ++ ++/* check whether squalloc should stop before processing given extent */ ++static int squalloc_extent_should_stop(flush_pos_t * pos) ++{ ++ assert("zam-869", item_is_extent(&pos->coord)); ++ ++ /* pos->child is a jnode handle_pos_on_extent() should start with in ++ * stead of the first child of the first extent unit. 
*/
++ if (pos->child) {
++ int prepped;
++
++ assert("vs-1383", jnode_is_unformatted(pos->child));
++ prepped = jnode_check_flushprepped(pos->child);
++ pos->pos_in_unit =
++ jnode_get_index(pos->child) -
++ extent_unit_index(&pos->coord);
++ assert("vs-1470",
++ pos->pos_in_unit < extent_unit_width(&pos->coord));
++ assert("nikita-3434",
++ ergo(extent_is_unallocated(&pos->coord),
++ pos->pos_in_unit == 0));
++ jput(pos->child);
++ pos->child = NULL;
++
++ return prepped;
++ }
++
++ pos->pos_in_unit = 0;
++ if (extent_is_unallocated(&pos->coord))
++ return 0;
++
++ return leftmost_child_of_unit_check_flushprepped(&pos->coord);
++}
++
++/* Handle the case when the regular reiser4 tree (znodes connected to their
++ * neighbors by sibling pointers) is interrupted on the leaf level by one or
++ * more unformatted nodes. By holding a lock on the twig level and using extent
++ * code routines to process unformatted nodes, we swim around an irregular part
++ * of the reiser4 tree. */
++static int handle_pos_on_twig(flush_pos_t * pos)
++{
++ int ret;
++
++ assert("zam-844", pos->state == POS_ON_EPOINT);
++ assert("zam-843", item_is_extent(&pos->coord));
++
++ /* We decide whether to continue slum processing with the current extent
++ unit: if the leftmost child of the current extent unit is flushprepped
++ (i.e. clean or already processed by flush) we stop squalloc(). There
++ is a fast check for unallocated extents, which we assume contain all
++ not flushprepped nodes. */
++ /* FIXME: Here we implement a simple check, we are only looking at the
++ leftmost child. */
++ ret = squalloc_extent_should_stop(pos);
++ if (ret != 0) {
++ pos_stop(pos);
++ return ret;
++ }
++
++ while (pos_valid(pos) && coord_is_existing_unit(&pos->coord)
++ && item_is_extent(&pos->coord)) {
++ ret = alloc_extent(pos);
++ if (ret) {
++ break;
++ }
++ coord_next_unit(&pos->coord);
++ }
++
++ if (coord_is_after_rightmost(&pos->coord)) {
++ pos->state = POS_END_OF_TWIG;
++ return 0;
++ }
++ if (item_is_internal(&pos->coord)) {
++ pos->state = POS_TO_LEAF;
++ return 0;
++ }
++
++ assert("zam-860", item_is_extent(&pos->coord));
++
++ /* "slum" is over */
++ pos->state = POS_INVALID;
++ return 0;
++}
++
++/* When we are about to return the flush position from the twig to the leaf
++ * level, we can either process the right twig node or move the position to the
++ * leaf. This processes the right twig if possible, and jumps to the leaf level
++ * if not. */
++static int handle_pos_end_of_twig(flush_pos_t * pos)
++{
++ int ret;
++ lock_handle right_lock;
++ load_count right_load;
++ coord_t at_right;
++ jnode *child = NULL;
++
++ assert("zam-848", pos->state == POS_END_OF_TWIG);
++ assert("zam-849", coord_is_after_rightmost(&pos->coord));
++
++ init_lh(&right_lock);
++ init_load_count(&right_load);
++
++ /* We get a lock on the right twig node even if it is not dirty, because
++ * the slum continues or discontinues on the leaf level, not on the next
++ * twig. This lock on the right twig is needed for getting its leftmost
++ * child. */
++ ret =
++ reiser4_get_right_neighbor(&right_lock, pos->lock.node,
++ ZNODE_WRITE_LOCK, GN_SAME_ATOM);
++ if (ret)
++ goto out;
++
++ ret = incr_load_count_znode(&right_load, right_lock.node);
++ if (ret)
++ goto out;
++
++ /* right twig could be not dirty */
++ if (JF_ISSET(ZJNODE(right_lock.node), JNODE_DIRTY)) {
++ /* If the right twig node is dirty we always attempt to squeeze its
++ * content to the left... */
++ became_dirty:
++ ret =
++ squeeze_right_twig_and_advance_coord(pos, right_lock.node);
++ if (ret <= 0) {
++ /* pos->coord is on internal item, go to leaf level, or
++ * we have an error which will be caught in squalloc() */
++ pos->state = POS_TO_LEAF;
++ goto out;
++ }
++
++ /* If the right twig was squeezed completely we have to re-lock
++ * the right twig. Now it is done through the top-level squalloc
++ * routine. */
++ if (node_is_empty(right_lock.node))
++ goto out;
++
++ /* ... and prep it if it is not yet prepped */
++ if (!znode_check_flushprepped(right_lock.node)) {
++ /* As usual, process parent before ... */
++ ret =
++ check_parents_and_squalloc_upper_levels(pos,
++ pos->lock.
++ node,
++ right_lock.
++ node);
++ if (ret)
++ goto out;
++
++ /* ... processing the child */
++ ret =
++ lock_parent_and_allocate_znode(right_lock.node,
++ pos);
++ if (ret)
++ goto out;
++ }
++ } else {
++ coord_init_first_unit(&at_right, right_lock.node);
++
++ /* check first child of next twig, should we continue there ? */
++ ret = get_leftmost_child_of_unit(&at_right, &child);
++ if (ret || child == NULL || jnode_check_flushprepped(child)) {
++ pos_stop(pos);
++ goto out;
++ }
++
++ /* check clean twig for possible relocation */
++ if (!znode_check_flushprepped(right_lock.node)) {
++ ret =
++ reverse_relocate_check_dirty_parent(child,
++ &at_right, pos);
++ if (ret)
++ goto out;
++ if (JF_ISSET(ZJNODE(right_lock.node), JNODE_DIRTY))
++ goto became_dirty;
++ }
++ }
++
++ assert("zam-875", znode_check_flushprepped(right_lock.node));
++
++ /* Update the preceder to the block number of the just-processed right
++ * twig node. The code above could miss the preceder update because
++ * allocate_znode() might not be called for this node. */
++ pos->preceder.blk = *znode_get_block(right_lock.node);
++ check_preceder(pos->preceder.blk);
++
++ coord_init_first_unit(&at_right, right_lock.node);
++ assert("zam-868", coord_is_existing_unit(&at_right));
++
++ pos->state = item_is_extent(&at_right) ? POS_ON_EPOINT : POS_TO_LEAF;
++ move_flush_pos(pos, &right_lock, &right_load, &at_right);
++
++ out:
++ done_load_count(&right_load);
++ done_lh(&right_lock);
++
++ if (child)
++ jput(child);
++
++ return ret;
++}
++
++/* Move pos->lock to the leaf node pointed to by pos->coord, and check whether
++ * we should continue there.
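
The position handlers that follow are driven as a small state machine: each state has a handler, and the driver dispatches until the position goes invalid (see the flush_pos_handlers table and squalloc() below). A standalone skeleton of that dispatch pattern, with invented states (not reiser4 code):

#include <stdio.h>

enum pos_state { POS_ON_LEAF, POS_ON_TWIG, POS_INVALID, POS_NR_STATES };

struct pos { enum pos_state state; int steps; };

static int on_leaf(struct pos *p) { p->state = POS_ON_TWIG; return 0; }
static int on_twig(struct pos *p)
{
	if (++p->steps > 3)
		p->state = POS_INVALID;	/* slum exhausted: stop */
	else
		p->state = POS_ON_LEAF;
	return 0;
}

static int (*handlers[POS_NR_STATES])(struct pos *) = {
	[POS_ON_LEAF] = on_leaf,
	[POS_ON_TWIG] = on_twig,
};

int main(void)
{
	struct pos p = { POS_ON_LEAF, 0 };

	while (p.state != POS_INVALID)	/* cf. pos_valid() */
		if (handlers[p.state](&p) < 0)
			break;
	printf("done after %d twig steps\n", p.steps);
	return 0;
}
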
*/ ++static int handle_pos_to_leaf(flush_pos_t * pos) ++{ ++ int ret; ++ lock_handle child_lock; ++ load_count child_load; ++ jnode *child; ++ ++ assert("zam-846", pos->state == POS_TO_LEAF); ++ assert("zam-847", item_is_internal(&pos->coord)); ++ ++ init_lh(&child_lock); ++ init_load_count(&child_load); ++ ++ ret = get_leftmost_child_of_unit(&pos->coord, &child); ++ if (ret) ++ return ret; ++ if (child == NULL) { ++ pos_stop(pos); ++ return 0; ++ } ++ ++ if (jnode_check_flushprepped(child)) { ++ pos->state = POS_INVALID; ++ goto out; ++ } ++ ++ ret = ++ longterm_lock_znode(&child_lock, JZNODE(child), ZNODE_WRITE_LOCK, ++ ZNODE_LOCK_LOPRI); ++ if (ret) ++ goto out; ++ ++ ret = incr_load_count_znode(&child_load, JZNODE(child)); ++ if (ret) ++ goto out; ++ ++ ret = allocate_znode(JZNODE(child), &pos->coord, pos); ++ if (ret) ++ goto out; ++ ++ /* move flush position to leaf level */ ++ pos->state = POS_ON_LEAF; ++ move_flush_pos(pos, &child_lock, &child_load, NULL); ++ ++ if (node_is_empty(JZNODE(child))) { ++ ret = delete_empty_node(JZNODE(child)); ++ pos->state = POS_INVALID; ++ } ++ out: ++ done_load_count(&child_load); ++ done_lh(&child_lock); ++ jput(child); ++ ++ return ret; ++} ++ ++/* move pos from leaf to twig, and move lock from leaf to twig. */ ++/* Move pos->lock to upper (twig) level */ ++static int handle_pos_to_twig(flush_pos_t * pos) ++{ ++ int ret; ++ ++ lock_handle parent_lock; ++ load_count parent_load; ++ coord_t pcoord; ++ ++ assert("zam-852", pos->state == POS_TO_TWIG); ++ ++ init_lh(&parent_lock); ++ init_load_count(&parent_load); ++ ++ ret = ++ reiser4_get_parent(&parent_lock, pos->lock.node, ZNODE_WRITE_LOCK); ++ if (ret) ++ goto out; ++ ++ ret = incr_load_count_znode(&parent_load, parent_lock.node); ++ if (ret) ++ goto out; ++ ++ ret = find_child_ptr(parent_lock.node, pos->lock.node, &pcoord); ++ if (ret) ++ goto out; ++ ++ assert("zam-870", item_is_internal(&pcoord)); ++ coord_next_item(&pcoord); ++ ++ if (coord_is_after_rightmost(&pcoord)) ++ pos->state = POS_END_OF_TWIG; ++ else if (item_is_extent(&pcoord)) ++ pos->state = POS_ON_EPOINT; ++ else { ++ /* Here we understand that getting -E_NO_NEIGHBOR in ++ * handle_pos_on_leaf() was because of just a reaching edge of ++ * slum */ ++ pos_stop(pos); ++ goto out; ++ } ++ ++ move_flush_pos(pos, &parent_lock, &parent_load, &pcoord); ++ ++ out: ++ done_load_count(&parent_load); ++ done_lh(&parent_lock); ++ ++ return ret; ++} ++ ++typedef int (*pos_state_handle_t) (flush_pos_t *); ++static pos_state_handle_t flush_pos_handlers[] = { ++ /* process formatted nodes on leaf level, keep lock on a leaf node */ ++ [POS_ON_LEAF] = handle_pos_on_leaf, ++ /* process unformatted nodes, keep lock on twig node, pos->coord points to extent currently ++ * being processed */ ++ [POS_ON_EPOINT] = handle_pos_on_twig, ++ /* move a lock from leaf node to its parent for further processing of unformatted nodes */ ++ [POS_TO_TWIG] = handle_pos_to_twig, ++ /* move a lock from twig to leaf level when a processing of unformatted nodes finishes, ++ * pos->coord points to the leaf node we jump to */ ++ [POS_TO_LEAF] = handle_pos_to_leaf, ++ /* after processing last extent in the twig node, attempting to shift items from the twigs ++ * right neighbor and process them while shifting */ ++ [POS_END_OF_TWIG] = handle_pos_end_of_twig, ++ /* process formatted nodes on internal level, keep lock on an internal node */ ++ [POS_ON_INTERNAL] = handle_pos_on_internal ++}; ++ ++/* Advance flush position horizontally, prepare for flushing ((re)allocate, squeeze, 
++ * encrypt) nodes and their ancestors in "parent-first" order */ ++static int squalloc(flush_pos_t * pos) ++{ ++ int ret = 0; ++ ++ /* maybe needs to be made a case statement with handle_pos_on_leaf as first case, for ++ * greater CPU efficiency? Measure and see.... -Hans */ ++ while (pos_valid(pos)) { ++ ret = flush_pos_handlers[pos->state] (pos); ++ if (ret < 0) ++ break; ++ ++ ret = rapid_flush(pos); ++ if (ret) ++ break; ++ } ++ ++ /* any positive value or -E_NO_NEIGHBOR are legal return codes for handle_pos* ++ routines, -E_NO_NEIGHBOR means that slum edge was reached */ ++ if (ret > 0 || ret == -E_NO_NEIGHBOR) ++ ret = 0; ++ ++ return ret; ++} ++ ++static void update_ldkey(znode * node) ++{ ++ reiser4_key ldkey; ++ ++ assert_rw_write_locked(&(znode_get_tree(node)->dk_lock)); ++ if (node_is_empty(node)) ++ return; ++ ++ znode_set_ld_key(node, leftmost_key_in_node(node, &ldkey)); ++} ++ ++/* this is to be called after calling of shift node's method to shift data from @right to ++ @left. It sets left delimiting keys of @left and @right to keys of first items of @left ++ and @right correspondingly and sets right delimiting key of @left to first key of @right */ ++static void update_znode_dkeys(znode * left, znode * right) ++{ ++ assert_rw_write_locked(&(znode_get_tree(right)->dk_lock)); ++ assert("vs-1629", (znode_is_write_locked(left) && ++ znode_is_write_locked(right))); ++ ++ /* we need to update left delimiting of left if it was empty before shift */ ++ update_ldkey(left); ++ update_ldkey(right); ++ if (node_is_empty(right)) ++ znode_set_rd_key(left, znode_get_rd_key(right)); ++ else ++ znode_set_rd_key(left, znode_get_ld_key(right)); ++} ++ ++/* try to shift everything from @right to @left. If everything was shifted - ++ @right is removed from the tree. Result is the number of bytes shifted. */ ++static int ++shift_everything_left(znode * right, znode * left, carry_level * todo) ++{ ++ coord_t from; ++ node_plugin *nplug; ++ carry_plugin_info info; ++ ++ coord_init_after_last_item(&from, right); ++ ++ nplug = node_plugin_by_node(right); ++ info.doing = NULL; ++ info.todo = todo; ++ return nplug->shift(&from, left, SHIFT_LEFT, ++ 1 /* delete @right if it becomes empty */ , ++ 1 ++ /* move coord @from to node @left if everything will be shifted */ ++ , ++ &info); ++} ++ ++/* Shift as much as possible from @right to @left using the memcpy-optimized ++ shift_everything_left. @left and @right are formatted neighboring nodes on ++ leaf level. */ ++static int squeeze_right_non_twig(znode * left, znode * right) ++{ ++ int ret; ++ carry_pool *pool; ++ carry_level *todo; ++ ++ assert("nikita-2246", znode_get_level(left) == znode_get_level(right)); ++ ++ if (!JF_ISSET(ZJNODE(left), JNODE_DIRTY) || ++ !JF_ISSET(ZJNODE(right), JNODE_DIRTY)) ++ return SQUEEZE_TARGET_FULL; ++ ++ pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*todo)); ++ if (IS_ERR(pool)) ++ return PTR_ERR(pool); ++ todo = (carry_level *) (pool + 1); ++ init_carry_level(todo, pool); ++ ++ ret = shift_everything_left(right, left, todo); ++ if (ret > 0) { ++ /* something was shifted */ ++ reiser4_tree *tree; ++ __u64 grabbed; ++ ++ znode_make_dirty(left); ++ znode_make_dirty(right); ++ ++ /* update delimiting keys of nodes which participated in ++ shift. FIXME: it would be better to have this in shift ++ node's operation. But it can not be done there. 
Nobody ++ remembers why, though */ ++ tree = znode_get_tree(left); ++ write_lock_dk(tree); ++ update_znode_dkeys(left, right); ++ write_unlock_dk(tree); ++ ++ /* Carry is called to update delimiting key and, maybe, to remove empty ++ node. */ ++ grabbed = get_current_context()->grabbed_blocks; ++ ret = reiser4_grab_space_force(tree->height, BA_RESERVED); ++ assert("nikita-3003", ret == 0); /* reserved space is exhausted. Ask Hans. */ ++ ret = carry(todo, NULL /* previous level */ ); ++ grabbed2free_mark(grabbed); ++ } else { ++ /* Shifting impossible, we return appropriate result code */ ++ ret = ++ node_is_empty(right) ? SQUEEZE_SOURCE_EMPTY : ++ SQUEEZE_TARGET_FULL; ++ } ++ ++ done_carry_pool(pool); ++ ++ return ret; ++} ++ ++#if REISER4_DEBUG ++static int sibling_link_is_ok(const znode *left, const znode *right) ++{ ++ int result; ++ ++ read_lock_tree(znode_get_tree(left)); ++ result = (left->right == right && left == right->left); ++ read_unlock_tree(znode_get_tree(left)); ++ return result; ++} ++#endif ++ ++/* Shift first unit of first item if it is an internal one. Return ++ SQUEEZE_TARGET_FULL if it fails to shift an item, otherwise return ++ SUBTREE_MOVED. */ ++static int shift_one_internal_unit(znode * left, znode * right) ++{ ++ int ret; ++ carry_pool *pool; ++ carry_level *todo; ++ coord_t *coord; ++ carry_plugin_info *info; ++ int size, moved; ++ ++ assert("nikita-2247", znode_get_level(left) == znode_get_level(right)); ++ assert("nikita-2435", znode_is_write_locked(left)); ++ assert("nikita-2436", znode_is_write_locked(right)); ++ assert("nikita-2434", sibling_link_is_ok(left, right)); ++ ++ pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*todo) + ++ sizeof(*coord) + sizeof(*info) ++#if REISER4_DEBUG ++ + sizeof(*coord) + 2 * sizeof(reiser4_key) ++#endif ++ ); ++ if (IS_ERR(pool)) ++ return PTR_ERR(pool); ++ todo = (carry_level *) (pool + 1); ++ init_carry_level(todo, pool); ++ ++ coord = (coord_t *) (todo + 3); ++ coord_init_first_unit(coord, right); ++ info = (carry_plugin_info *) (coord + 1); ++ ++#if REISER4_DEBUG ++ if (!node_is_empty(left)) { ++ coord_t *last; ++ reiser4_key *right_key; ++ reiser4_key *left_key; ++ ++ last = (coord_t *) (info + 1); ++ right_key = (reiser4_key *) (last + 1); ++ left_key = right_key + 1; ++ coord_init_last_unit(last, left); ++ ++ assert("nikita-2463", ++ keyle(item_key_by_coord(last, left_key), ++ item_key_by_coord(coord, right_key))); ++ } ++#endif ++ ++ assert("jmacd-2007", item_is_internal(coord)); ++ ++ size = item_length_by_coord(coord); ++ info->todo = todo; ++ info->doing = NULL; ++ ++ ret = node_plugin_by_node(left)->shift(coord, left, SHIFT_LEFT, ++ 1 ++ /* delete @right if it becomes empty */ ++ , ++ 0 ++ /* do not move coord @coord to node @left */ ++ , ++ info); ++ ++ /* If shift returns positive, then we shifted the item. */ ++ assert("vs-423", ret <= 0 || size == ret); ++ moved = (ret > 0); ++ ++ if (moved) { ++ /* something was moved */ ++ reiser4_tree *tree; ++ int grabbed; ++ ++ znode_make_dirty(left); ++ znode_make_dirty(right); ++ tree = znode_get_tree(left); ++ write_lock_dk(tree); ++ update_znode_dkeys(left, right); ++ write_unlock_dk(tree); ++ ++ /* reserve space for delimiting keys after shifting */ ++ grabbed = get_current_context()->grabbed_blocks; ++ ret = reiser4_grab_space_force(tree->height, BA_RESERVED); ++ assert("nikita-3003", ret == 0); /* reserved space is exhausted. Ask Hans. 
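++ (BA_RESERVED lets the grab draw upon the whole disk rather than only 95% of it -- see the comment in allocate_znode_update() below -- so this grab is expected to succeed.)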
*/ ++ ++ ret = carry(todo, NULL /* previous level */ ); ++ grabbed2free_mark(grabbed); ++ } ++ ++ done_carry_pool(pool); ++ ++ if (ret != 0) { ++ /* Shift or carry operation failed. */ ++ assert("jmacd-7325", ret < 0); ++ return ret; ++ } ++ ++ return moved ? SUBTREE_MOVED : SQUEEZE_TARGET_FULL; ++} ++ ++/* Make the final relocate/wander decision during forward parent-first squalloc for a ++ znode. For unformatted nodes this is done in plugin/item/extent.c:extent_needs_allocation(). */ ++static int ++allocate_znode_loaded(znode * node, ++ const coord_t * parent_coord, flush_pos_t * pos) ++{ ++ int ret; ++ reiser4_super_info_data *sbinfo = get_current_super_private(); ++ /* FIXME(D): We have the node write-locked and should have checked for ! ++ allocated() somewhere before reaching this point, but there can be a race, so ++ this assertion is bogus. */ ++ assert("jmacd-7987", !jnode_check_flushprepped(ZJNODE(node))); ++ assert("jmacd-7988", znode_is_write_locked(node)); ++ assert("jmacd-7989", coord_is_invalid(parent_coord) ++ || znode_is_write_locked(parent_coord->node)); ++ ++ if (ZF_ISSET(node, JNODE_REPACK) || ZF_ISSET(node, JNODE_CREATED) || ++ znode_is_root(node) || ++ /* We have enough nodes to relocate no matter what. */ ++ (pos->leaf_relocate != 0 && znode_get_level(node) == LEAF_LEVEL)) { ++ /* No need to decide with new nodes, they are treated the same as ++ relocate. If the root node is dirty, relocate. */ ++ if (pos->preceder.blk == 0) { ++ /* preceder is unknown and we have decided to relocate node -- ++ using of default value for search start is better than search ++ from block #0. */ ++ get_blocknr_hint_default(&pos->preceder.blk); ++ check_preceder(pos->preceder.blk); ++ } ++ ++ goto best_reloc; ++ ++ } else if (pos->preceder.blk == 0) { ++ /* If we don't know the preceder, leave it where it is. */ ++ jnode_make_wander(ZJNODE(node)); ++ } else { ++ /* Make a decision based on block distance. */ ++ reiser4_block_nr dist; ++ reiser4_block_nr nblk = *znode_get_block(node); ++ ++ assert("jmacd-6172", !blocknr_is_fake(&nblk)); ++ assert("jmacd-6173", !blocknr_is_fake(&pos->preceder.blk)); ++ assert("jmacd-6174", pos->preceder.blk != 0); ++ ++ if (pos->preceder.blk == nblk - 1) { ++ /* Ideal. */ ++ jnode_make_wander(ZJNODE(node)); ++ } else { ++ ++ dist = ++ (nblk < ++ pos->preceder.blk) ? (pos->preceder.blk - ++ nblk) : (nblk - ++ pos->preceder.blk); ++ ++ /* See if we can find a closer block (forward direction only). */ ++ pos->preceder.max_dist = ++ min((reiser4_block_nr) sbinfo->flush. ++ relocate_distance, dist); ++ pos->preceder.level = znode_get_level(node); ++ ++ ret = allocate_znode_update(node, parent_coord, pos); ++ ++ pos->preceder.max_dist = 0; ++ ++ if (ret && (ret != -ENOSPC)) ++ return ret; ++ ++ if (ret == 0) { ++ /* Got a better allocation. */ ++ znode_make_reloc(node, pos->fq); ++ } else if (dist < sbinfo->flush.relocate_distance) { ++ /* The present allocation is good enough. */ ++ jnode_make_wander(ZJNODE(node)); ++ } else { ++ /* Otherwise, try to relocate to the best position. */ ++ best_reloc: ++ ret = ++ allocate_znode_update(node, parent_coord, ++ pos); ++ if (ret != 0) ++ return ret; ++ ++ /* set JNODE_RELOC bit _after_ node gets allocated */ ++ znode_make_reloc(node, pos->fq); ++ } ++ } ++ } ++ ++ /* This is the new preceder. 
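++ Whatever block @node ends up with -- freshly relocated or left in place -- becomes the search-start hint for the next allocation, which is what biases neighbors in parent-first order toward consecutive disk locations.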
*/ ++ pos->preceder.blk = *znode_get_block(node); ++ check_preceder(pos->preceder.blk); ++ pos->alloc_cnt += 1; ++ ++ assert("jmacd-4277", !blocknr_is_fake(&pos->preceder.blk)); ++ ++ return 0; ++} ++ ++static int ++allocate_znode(znode * node, const coord_t * parent_coord, flush_pos_t * pos) ++{ ++ /* ++ * perform znode allocation with znode pinned in memory to avoid races ++ * with asynchronous emergency flush (which plays with ++ * JNODE_FLUSH_RESERVED bit). ++ */ ++ return WITH_DATA(node, allocate_znode_loaded(node, parent_coord, pos)); ++} ++ ++/* A subroutine of allocate_znode, this is called first to see if there is a close ++ position to relocate to. It may return -ENOSPC if there is no close position, in ++ which case it does not relocate. This takes care of updating the parent node ++ with the relocated block address. */ ++static int ++allocate_znode_update(znode * node, const coord_t * parent_coord, ++ flush_pos_t * pos) ++{ ++ int ret; ++ reiser4_block_nr blk; ++ lock_handle uber_lock; ++ int flush_reserved_used = 0; ++ int grabbed; ++ reiser4_context *ctx; ++ reiser4_super_info_data *sbinfo; ++ ++ init_lh(&uber_lock); ++ ++ ctx = get_current_context(); ++ sbinfo = get_super_private(ctx->super); ++ ++ grabbed = ctx->grabbed_blocks; ++ ++ /* discard e-flush allocation */ ++ ret = zload(node); ++ if (ret) ++ return ret; ++ ++ if (ZF_ISSET(node, JNODE_CREATED)) { ++ assert("zam-816", blocknr_is_fake(znode_get_block(node))); ++ pos->preceder.block_stage = BLOCK_UNALLOCATED; ++ } else { ++ pos->preceder.block_stage = BLOCK_GRABBED; ++ ++ /* The disk space for relocating the @node is already reserved in the "flush reserved" ++ * counter if @node is a leaf; otherwise we grab space using BA_RESERVED (meaning grab ++ * space from the whole disk, not from only 95% of it). */ ++ if (znode_get_level(node) == LEAF_LEVEL) { ++ /* ++ * earlier (during do_jnode_make_dirty()) we decided ++ * that @node can possibly go into overwrite set and ++ * reserved block for its wandering location. ++ */ ++ txn_atom *atom = get_current_atom_locked(); ++ assert("nikita-3449", ++ ZF_ISSET(node, JNODE_FLUSH_RESERVED)); ++ flush_reserved2grabbed(atom, (__u64) 1); ++ spin_unlock_atom(atom); ++ /* ++ * we are trying to move node into relocate ++ * set. Allocation of relocated position "uses" ++ * reserved block. ++ */ ++ ZF_CLR(node, JNODE_FLUSH_RESERVED); ++ flush_reserved_used = 1; ++ } else { ++ ret = reiser4_grab_space_force((__u64) 1, BA_RESERVED); ++ if (ret != 0) ++ goto exit; ++ } ++ } ++ ++ /* We may not use the reserved 5% of disk space here, so flush will not pack tightly. */ ++ ret = reiser4_alloc_block(&pos->preceder, &blk, ++ BA_FORMATTED | BA_PERMANENT); ++ if (ret) ++ goto exit; ++ ++ if (!ZF_ISSET(node, JNODE_CREATED) && ++ (ret = ++ reiser4_dealloc_block(znode_get_block(node), 0, ++ BA_DEFER | BA_FORMATTED))) ++ goto exit; ++ ++ if (likely(!znode_is_root(node))) { ++ item_plugin *iplug; ++ ++ iplug = item_plugin_by_coord(parent_coord); ++ assert("nikita-2954", iplug->f.update != NULL); ++ iplug->f.update(parent_coord, &blk); ++ ++ znode_make_dirty(parent_coord->node); ++ ++ } else { ++ reiser4_tree *tree = znode_get_tree(node); ++ znode *uber; ++ ++ /* We take a longterm lock on the fake node in order to change ++ the root block number. This may cause atom fusion. */ ++ ret = get_uber_znode(tree, ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI, ++ &uber_lock); ++ /* The fake node cannot be deleted, and we must have priority ++ here, and may not be confused with ENOSPC.
*/ ++ assert("jmacd-74412", ++ ret != -EINVAL && ret != -E_DEADLOCK && ret != -ENOSPC); ++ ++ if (ret) ++ goto exit; ++ ++ uber = uber_lock.node; ++ ++ write_lock_tree(tree); ++ tree->root_block = blk; ++ write_unlock_tree(tree); ++ ++ znode_make_dirty(uber); ++ } ++ ++ ret = znode_rehash(node, &blk); ++ exit: ++ if (ret) { ++ /* Get flush reserved block back if something fails, because ++ * callers assume that on error block wasn't relocated and its ++ * flush reserved block wasn't used. */ ++ if (flush_reserved_used) { ++ /* ++ * ok, we failed to move node into relocate ++ * set. Restore status quo. ++ */ ++ grabbed2flush_reserved((__u64) 1); ++ ZF_SET(node, JNODE_FLUSH_RESERVED); ++ } ++ } ++ zrelse(node); ++ done_lh(&uber_lock); ++ grabbed2free_mark(grabbed); ++ return ret; ++} ++ ++/* JNODE INTERFACE */ ++ ++/* Lock a node (if formatted) and then get its parent locked, set the child's ++ coordinate in the parent. If the child is the root node, the above_root ++ znode is returned but the coord is not set. This function may cause atom ++ fusion, but it is only used for read locks (at this point) and therefore ++ fusion only occurs when the parent is already dirty. */ ++/* Hans adds this note: remember to ask how expensive this operation is vs. storing parent ++ pointer in jnodes. */ ++static int ++jnode_lock_parent_coord(jnode * node, ++ coord_t * coord, ++ lock_handle * parent_lh, ++ load_count * parent_zh, ++ znode_lock_mode parent_mode, int try) ++{ ++ int ret; ++ ++ assert("edward-53", jnode_is_unformatted(node) || jnode_is_znode(node)); ++ assert("edward-54", jnode_is_unformatted(node) ++ || znode_is_any_locked(JZNODE(node))); ++ ++ if (!jnode_is_znode(node)) { ++ reiser4_key key; ++ tree_level stop_level = TWIG_LEVEL; ++ lookup_bias bias = FIND_EXACT; ++ ++ assert("edward-168", !(jnode_get_type(node) == JNODE_BITMAP)); ++ ++ /* The case when node is not znode, but can have parent coord ++ (unformatted node, node which represents cluster page, ++ etc..). Generate a key for the appropriate entry, search ++ in the tree using coord_by_key, which handles locking for ++ us. */ ++ ++ /* ++ * nothing is locked at this moment, so, nothing prevents ++ * concurrent truncate from removing jnode from inode. To ++ * prevent this spin-lock jnode. jnode can be truncated just ++ * after call to the jnode_build_key(), but this is ok, ++ * because coord_by_key() will just fail to find appropriate ++ * extent. 
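++ * The jnode spin lock is held only long enough to test ++ * JNODE_HEARD_BANSHEE and build the key; the tree lookup below ++ * revalidates everything under its own locks.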
++ */ ++ spin_lock_jnode(node); ++ if (!JF_ISSET(node, JNODE_HEARD_BANSHEE)) { ++ jnode_build_key(node, &key); ++ ret = 0; ++ } else ++ ret = RETERR(-ENOENT); ++ spin_unlock_jnode(node); ++ ++ if (ret != 0) ++ return ret; ++ ++ if (jnode_is_cluster_page(node)) ++ stop_level = LEAF_LEVEL; ++ ++ assert("jmacd-1812", coord != NULL); ++ ++ ret = coord_by_key(jnode_get_tree(node), &key, coord, parent_lh, ++ parent_mode, bias, stop_level, stop_level, ++ CBK_UNIQUE, NULL /*ra_info */ ); ++ switch (ret) { ++ case CBK_COORD_NOTFOUND: ++ assert("edward-1038", ++ ergo(jnode_is_cluster_page(node), ++ JF_ISSET(node, JNODE_HEARD_BANSHEE))); ++ if (!JF_ISSET(node, JNODE_HEARD_BANSHEE)) ++ warning("nikita-3177", "Parent not found"); ++ return ret; ++ case CBK_COORD_FOUND: ++ if (coord->between != AT_UNIT) { ++ /* FIXME: comment needed */ ++ done_lh(parent_lh); ++ if (!JF_ISSET(node, JNODE_HEARD_BANSHEE)) { ++ warning("nikita-3178", ++ "Found but not happy: %i", ++ coord->between); ++ } ++ return RETERR(-ENOENT); ++ } ++ ret = incr_load_count_znode(parent_zh, parent_lh->node); ++ if (ret != 0) ++ return ret; ++ /* if (jnode_is_cluster_page(node)) { ++ races with write() are possible ++ check_child_cluster (parent_lh->node); ++ } ++ */ ++ break; ++ default: ++ return ret; ++ } ++ ++ } else { ++ int flags; ++ znode *z; ++ ++ z = JZNODE(node); ++ /* Formatted node case: */ ++ assert("jmacd-2061", !znode_is_root(z)); ++ ++ flags = GN_ALLOW_NOT_CONNECTED; ++ if (try) ++ flags |= GN_TRY_LOCK; ++ ++ ret = ++ reiser4_get_parent_flags(parent_lh, z, parent_mode, flags); ++ if (ret != 0) ++ /* -E_REPEAT is ok here, it is handled by the caller. */ ++ return ret; ++ ++ /* Make the child's position "hint" up-to-date. (Unless above ++ root, which caller must check.) */ ++ if (coord != NULL) { ++ ++ ret = incr_load_count_znode(parent_zh, parent_lh->node); ++ if (ret != 0) { ++ warning("jmacd-976812386", ++ "incr_load_count_znode failed: %d", ++ ret); ++ return ret; ++ } ++ ++ ret = find_child_ptr(parent_lh->node, z, coord); ++ if (ret != 0) { ++ warning("jmacd-976812", ++ "find_child_ptr failed: %d", ret); ++ return ret; ++ } ++ } ++ } ++ ++ return 0; ++} ++ ++/* Get the (locked) next neighbor of a znode which is dirty and a member of the same atom. ++ If there is no next neighbor or the neighbor is not in memory or if there is a ++ neighbor but it is not dirty or not in the same atom, -E_NO_NEIGHBOR is returned. ++ In some cases the slum may include nodes which are not dirty, if so @check_dirty should be 0 */ ++static int neighbor_in_slum(znode * node, /* starting point */ ++ lock_handle * lock, /* lock on starting point */ ++ sideof side, /* left or right direction we seek the next node in */ ++ znode_lock_mode mode, /* kind of lock we want */ ++ int check_dirty) ++{ /* true if the neighbor should be dirty */ ++ int ret; ++ ++ assert("jmacd-6334", znode_is_connected(node)); ++ ++ ret = ++ reiser4_get_neighbor(lock, node, mode, ++ GN_SAME_ATOM | (side == ++ LEFT_SIDE ? GN_GO_LEFT : 0)); ++ ++ if (ret) { ++ /* May return -ENOENT or -E_NO_NEIGHBOR. */ ++ /* FIXME(C): check EINVAL, E_DEADLOCK */ ++ if (ret == -ENOENT) { ++ ret = RETERR(-E_NO_NEIGHBOR); ++ } ++ ++ return ret; ++ } ++ if (!check_dirty) ++ return 0; ++ /* Check dirty bit of locked znode, no races here */ ++ if (JF_ISSET(ZJNODE(lock->node), JNODE_DIRTY)) ++ return 0; ++ ++ done_lh(lock); ++ return RETERR(-E_NO_NEIGHBOR); ++} ++ ++/* Return true if two znodes have the same parent. 
This is called with both nodes ++ write-locked (for squeezing) so no tree lock is needed. */ ++static int znode_same_parents(znode * a, znode * b) ++{ ++ int result; ++ ++ assert("jmacd-7011", znode_is_write_locked(a)); ++ assert("jmacd-7012", znode_is_write_locked(b)); ++ ++ /* We lock the whole tree for this check.... I really don't like whole tree ++ * locks... -Hans */ ++ read_lock_tree(znode_get_tree(a)); ++ result = (znode_parent(a) == znode_parent(b)); ++ read_unlock_tree(znode_get_tree(a)); ++ return result; ++} ++ ++/* FLUSH SCAN */ ++ ++/* Initialize the flush_scan data structure. */ ++static void scan_init(flush_scan * scan) ++{ ++ memset(scan, 0, sizeof(*scan)); ++ init_lh(&scan->node_lock); ++ init_lh(&scan->parent_lock); ++ init_load_count(&scan->parent_load); ++ init_load_count(&scan->node_load); ++ coord_init_invalid(&scan->parent_coord, NULL); ++} ++ ++/* Release any resources held by the flush scan, e.g., release locks, free memory, etc. */ ++static void scan_done(flush_scan * scan) ++{ ++ done_load_count(&scan->node_load); ++ if (scan->node != NULL) { ++ jput(scan->node); ++ scan->node = NULL; ++ } ++ done_load_count(&scan->parent_load); ++ done_lh(&scan->parent_lock); ++ done_lh(&scan->node_lock); ++} ++ ++/* Returns true if flush scanning is finished. */ ++int scan_finished(flush_scan * scan) ++{ ++ return scan->stop || (scan->direction == RIGHT_SIDE && ++ scan->count >= scan->max_count); ++} ++ ++/* Return true if the scan should continue to the @tonode. True if the node meets the ++ same_slum_check condition. If not, deref the "left" node and stop the scan. */ ++int scan_goto(flush_scan * scan, jnode * tonode) ++{ ++ int go = same_slum_check(scan->node, tonode, 1, 0); ++ ++ if (!go) { ++ scan->stop = 1; ++ jput(tonode); ++ } ++ ++ return go; ++} ++ ++/* Set the current scan->node, refcount it, increment count by the @add_count (number to ++ count, e.g., skipped unallocated nodes), deref previous current, and copy the current ++ parent coordinate. */ ++int ++scan_set_current(flush_scan * scan, jnode * node, unsigned add_count, ++ const coord_t * parent) ++{ ++ /* Release the old references, take the new reference. */ ++ done_load_count(&scan->node_load); ++ ++ if (scan->node != NULL) { ++ jput(scan->node); ++ } ++ scan->node = node; ++ scan->count += add_count; ++ ++ /* This next stmt is somewhat inefficient. The scan_extent_coord code could ++ delay this update step until it finishes and update the parent_coord only once. ++ It did that before, but there was a bug and this was the easiest way to make it ++ correct. */ ++ if (parent != NULL) { ++ coord_dup(&scan->parent_coord, parent); ++ } ++ ++ /* Failure may happen at the incr_load_count call, but the caller can assume the reference ++ is safely taken. */ ++ return incr_load_count_jnode(&scan->node_load, node); ++} ++ ++/* Return true if scanning in the leftward direction. */ ++int scanning_left(flush_scan * scan) ++{ ++ return scan->direction == LEFT_SIDE; ++} ++ ++/* Performs leftward scanning starting from either kind of node. Counts the starting ++ node. The right-scan object is passed in for the left-scan in order to copy the parent ++ of an unformatted starting position. This way we avoid searching for the unformatted ++ node's parent when scanning in each direction. If we search for the parent once it is ++ set in both scan objects. The limit parameter tells flush-scan when to stop. 
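++ ++ As a rough, illustrative sketch only (the actual combination of the two scans is done by the caller of scan_left()/scan_right()): ++ ++ scan_left(&left, &right, start, FLUSH_SCAN_MAXNODES); ++ if (left.count < FLUSH_RELOCATE_THRESHOLD) ++ scan_right(&right, start, FLUSH_RELOCATE_THRESHOLD - left.count);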
++ ++ Rapid scanning is used only during scan_left, where we are interested in finding the ++ 'leftpoint' where we begin flushing. We are interested in stopping at the left child ++ of a twig that does not have a dirty left neighbor. THIS IS A SPECIAL CASE. The ++ problem is finding a way to flush only those nodes without unallocated children, and it ++ is difficult to solve in the bottom-up flushing algorithm we are currently using. The ++ problem can be solved by scanning left at every level as we go upward, but this would ++ basically bring us back to using a top-down allocation strategy, which we already tried ++ (see BK history from May 2002), which has a different set of problems. The top-down ++ strategy makes avoiding unallocated children easier, but makes it difficult to ++ properly flush dirty children with clean parents that would otherwise stop the ++ top-down flush, only later to dirty the parent once the children are flushed. So we ++ solve the problem in the bottom-up algorithm with a special case for twigs and leaves ++ only. ++ ++ The first step in solving the problem is this rapid leftward scan. After we determine ++ that there are at least enough nodes counted to qualify for FLUSH_RELOCATE_THRESHOLD we ++ are no longer interested in the exact count; we are only interested in finding the ++ best place to start the flush. We could choose one of two possibilities: ++ ++ 1. Stop at the leftmost child (of a twig) that does not have a dirty left neighbor. ++ This requires checking one leaf per rapid-scan twig. ++ ++ 2. Stop at the leftmost child (of a twig) where there are no dirty children of the twig ++ to the left. This requires checking possibly all of the in-memory children of each ++ twig during the rapid scan. ++ ++ For now we implement the first policy. ++*/ ++static int ++scan_left(flush_scan * scan, flush_scan * right, jnode * node, unsigned limit) ++{ ++ int ret = 0; ++ ++ scan->max_count = limit; ++ scan->direction = LEFT_SIDE; ++ ++ ret = scan_set_current(scan, jref(node), 1, NULL); ++ if (ret != 0) { ++ return ret; ++ } ++ ++ ret = scan_common(scan, right); ++ if (ret != 0) { ++ return ret; ++ } ++ ++ /* Before rapid scanning, we need a lock on scan->node so that we can get its ++ parent, but only if it is formatted. */ ++ if (jnode_is_znode(scan->node)) { ++ ret = longterm_lock_znode(&scan->node_lock, JZNODE(scan->node), ++ ZNODE_WRITE_LOCK, ZNODE_LOCK_LOPRI); ++ } ++ ++ /* Rapid_scan would go here (with limit set to FLUSH_RELOCATE_THRESHOLD). */ ++ return ret; ++} ++ ++/* Performs rightward scanning... Does not count the starting node. The limit parameter ++ is described in scan_left. If the starting node is unformatted then the ++ parent_coord was already set during scan_left. ++ ++ scan_right is only called if the scan_left operation does not count at least ++ FLUSH_RELOCATE_THRESHOLD nodes for flushing. In that case, the limit parameter is set to ++ the difference between FLUSH_RELOCATE_THRESHOLD and scan-left's count, meaning ++ scan-right counts as high as FLUSH_RELOCATE_THRESHOLD and then stops. */ ++static int scan_right(flush_scan * scan, jnode * node, unsigned limit) ++{ ++ int ret; ++ ++ scan->max_count = limit; ++ scan->direction = RIGHT_SIDE; ++ ++ ret = scan_set_current(scan, jref(node), 0, NULL); ++ if (ret != 0) { ++ return ret; ++ } ++ ++ return scan_common(scan, NULL); ++} ++ ++/* Common code to perform left or right scanning.
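++ Both scan_left() and scan_right() funnel through here; @other is non-NULL only for the left scan, so that a parent coord found once during the left scan can be duplicated into the right scan object instead of being searched for twice.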
*/ ++static int scan_common(flush_scan * scan, flush_scan * other) ++{ ++ int ret; ++ ++ assert("nikita-2376", scan->node != NULL); ++ assert("edward-54", jnode_is_unformatted(scan->node) ++ || jnode_is_znode(scan->node)); ++ ++ /* Special case for starting at an unformatted node. Optimization: we only want ++ to search for the parent (which requires a tree traversal) once. Obviously, we ++ shouldn't have to call it once for the left scan and once for the right scan. ++ For this reason, if we search for the parent during scan-left we then duplicate ++ the coord/lock/load into the scan-right object. */ ++ if (jnode_is_unformatted(scan->node)) { ++ ret = scan_unformatted(scan, other); ++ if (ret != 0) ++ return ret; ++ } ++ /* This loop expects to start at a formatted position and performs chaining of ++ formatted regions */ ++ while (!scan_finished(scan)) { ++ ++ ret = scan_formatted(scan); ++ if (ret != 0) { ++ return ret; ++ } ++ } ++ ++ return 0; ++} ++ ++static int scan_unformatted(flush_scan * scan, flush_scan * other) ++{ ++ int ret = 0; ++ int try = 0; ++ ++ if (!coord_is_invalid(&scan->parent_coord)) ++ goto scan; ++ ++ /* set the parent coord from the current scan position */ ++ if (!jnode_is_unformatted(scan->node)) { ++ /* formatted position */ ++ ++ lock_handle lock; ++ assert("edward-301", jnode_is_znode(scan->node)); ++ init_lh(&lock); ++ ++ /* ++ * when flush starts from an unformatted node, the first thing it ++ * does is a tree traversal to find the formatted parent of the ++ * starting node. This parent is then kept locked across scans to the ++ * left and to the right. This means that during the scan to the ++ * left we cannot take a left-ward lock, because this is ++ * deadlock prone. So, if we are scanning to the left and ++ * there is already a lock held by this thread, ++ * jnode_lock_parent_coord() should use try-lock. ++ */ ++ try = scanning_left(scan) ++ && !lock_stack_isclean(get_current_lock_stack()); ++ /* Need the node locked to get the parent lock. We have to ++ take a write lock since there is at least one call path ++ where this znode is already write-locked by us. */ ++ ret = ++ longterm_lock_znode(&lock, JZNODE(scan->node), ++ ZNODE_WRITE_LOCK, ++ scanning_left(scan) ? ZNODE_LOCK_LOPRI : ++ ZNODE_LOCK_HIPRI); ++ if (ret != 0) ++ /* EINVAL or E_DEADLOCK here mean... try again! At this point we've ++ scanned too far and can't back out, just start over. */ ++ return ret; ++ ++ ret = jnode_lock_parent_coord(scan->node, ++ &scan->parent_coord, ++ &scan->parent_lock, ++ &scan->parent_load, ++ ZNODE_WRITE_LOCK, try); ++ ++ /* FIXME(C): check EINVAL, E_DEADLOCK */ ++ done_lh(&lock); ++ if (ret == -E_REPEAT) { ++ scan->stop = 1; ++ return 0; ++ } ++ if (ret) ++ return ret; ++ ++ } else { ++ /* unformatted position */ ++ ++ ret = ++ jnode_lock_parent_coord(scan->node, &scan->parent_coord, ++ &scan->parent_lock, ++ &scan->parent_load, ++ ZNODE_WRITE_LOCK, try); ++ ++ if (IS_CBKERR(ret)) ++ return ret; ++ ++ if (ret == CBK_COORD_NOTFOUND) ++ /* FIXME(C): check EINVAL, E_DEADLOCK */ ++ return ret; ++ ++ /* parent was found */ ++ assert("jmacd-8661", other != NULL); ++ /* Duplicate the reference into the other flush_scan. */ ++ coord_dup(&other->parent_coord, &scan->parent_coord); ++ copy_lh(&other->parent_lock, &scan->parent_lock); ++ copy_load_count(&other->parent_load, &scan->parent_load); ++ } ++ scan: ++ return scan_by_coord(scan); ++} ++ ++/* Performs left- or rightward scanning starting from a formatted node.
Follow left ++ pointers under tree lock as long as: ++ ++ - node->left/right is non-NULL ++ - node->left/right is connected, dirty ++ - node->left/right belongs to the same atom ++ - scan has not reached maximum count ++*/ ++static int scan_formatted(flush_scan * scan) ++{ ++ int ret; ++ znode *neighbor = NULL; ++ ++ assert("jmacd-1401", !scan_finished(scan)); ++ ++ do { ++ znode *node = JZNODE(scan->node); ++ ++ /* Node should be connected, but if not stop the scan. */ ++ if (!znode_is_connected(node)) { ++ scan->stop = 1; ++ break; ++ } ++ ++ /* Lock the tree, check for and reference the next sibling. */ ++ read_lock_tree(znode_get_tree(node)); ++ ++ /* It may be that a node is inserted or removed between a node and its ++ left sibling while the tree lock is released, but the flush-scan count ++ does not need to be precise. Thus, we release the tree lock as soon as ++ we get the neighboring node. */ ++ neighbor = scanning_left(scan) ? node->left : node->right; ++ if (neighbor != NULL) { ++ zref(neighbor); ++ } ++ ++ read_unlock_tree(znode_get_tree(node)); ++ ++ /* If neighbor is NULL at the leaf level, need to check for an unformatted ++ sibling using the parent--break in any case. */ ++ if (neighbor == NULL) { ++ break; ++ } ++ ++ /* Check the condition for going left, break if it is not met. This also ++ releases (jputs) the neighbor if false. */ ++ if (!scan_goto(scan, ZJNODE(neighbor))) { ++ break; ++ } ++ ++ /* Advance the flush_scan state to the left, repeat. */ ++ ret = scan_set_current(scan, ZJNODE(neighbor), 1, NULL); ++ if (ret != 0) { ++ return ret; ++ } ++ ++ } while (!scan_finished(scan)); ++ ++ /* If neighbor is NULL then we reached the end of a formatted region, or else the ++ sibling is not in memory; now check for an extent to the left (as long as we ++ are at LEAF_LEVEL). */ ++ if (neighbor != NULL || jnode_get_level(scan->node) != LEAF_LEVEL ++ || scan_finished(scan)) { ++ scan->stop = 1; ++ return 0; ++ } ++ /* Otherwise, call scan_by_coord for the right(left)most item of the ++ left(right) neighbor on the parent level, then possibly continue. */ ++ ++ coord_init_invalid(&scan->parent_coord, NULL); ++ return scan_unformatted(scan, NULL); ++} ++ ++/* NOTE-EDWARD: ++ This scans adjacent items of the same type and calls the scan flush plugin for each one. ++ Performs left(right)ward scanning starting from a (possibly) unformatted node. If we start ++ from an unformatted node, then we continue only if the next neighbor is also unformatted. ++ When called from scan_formatted, we skip the first iteration (to make sure that the ++ right(left)most item of the left(right) neighbor on the parent level is of the same ++ type and set the appropriate coord). */ ++static int scan_by_coord(flush_scan * scan) ++{ ++ int ret = 0; ++ int scan_this_coord; ++ lock_handle next_lock; ++ load_count next_load; ++ coord_t next_coord; ++ jnode *child; ++ item_plugin *iplug; ++ ++ init_lh(&next_lock); ++ init_load_count(&next_load); ++ scan_this_coord = (jnode_is_unformatted(scan->node) ? 1 : 0); ++ ++ /* set initial item id */ ++ iplug = item_plugin_by_coord(&scan->parent_coord); ++ ++ for (; !scan_finished(scan); scan_this_coord = 1) { ++ if (scan_this_coord) { ++ /* Here we expect the unit to be scannable. It might not be so due ++ * to a race with extent->tail conversion. */ ++ if (iplug->f.scan == NULL) { ++ scan->stop = 1; ++ ret = -E_REPEAT; ++ /* skip the check at the end.
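++ (the unit became unscannable under us, so the checkchild() consistency check at the exit label would be bogus)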
*/ ++ goto race; ++ } ++ ++ ret = iplug->f.scan(scan); ++ if (ret != 0) ++ goto exit; ++ ++ if (scan_finished(scan)) { ++ checkchild(scan); ++ break; ++ } ++ } else { ++ /* the same race against truncate as above is possible ++ * here, it seems */ ++ ++ /* NOTE-JMACD: In this case, apply the same end-of-node logic but don't scan ++ the first coordinate. */ ++ assert("jmacd-1231", ++ item_is_internal(&scan->parent_coord)); ++ } ++ ++ if (iplug->f.utmost_child == NULL ++ || znode_get_level(scan->parent_coord.node) != TWIG_LEVEL) { ++ /* stop this coord and continue on the parent level */ ++ ret = ++ scan_set_current(scan, ++ ZJNODE(zref ++ (scan->parent_coord.node)), ++ 1, NULL); ++ if (ret != 0) ++ goto exit; ++ break; ++ } ++ ++ /* Either way, the invariant is that scan->parent_coord is set to the ++ parent of scan->node. Now get the next unit. */ ++ coord_dup(&next_coord, &scan->parent_coord); ++ coord_sideof_unit(&next_coord, scan->direction); ++ ++ /* If off the end of the twig, try the next twig. */ ++ if (coord_is_after_sideof_unit(&next_coord, scan->direction)) { ++ /* We take the write lock because we may start flushing from this ++ * coordinate. */ ++ ret = ++ neighbor_in_slum(next_coord.node, &next_lock, ++ scan->direction, ZNODE_WRITE_LOCK, ++ 1 /* check dirty */ ); ++ if (ret == -E_NO_NEIGHBOR) { ++ scan->stop = 1; ++ ret = 0; ++ break; ++ } ++ ++ if (ret != 0) { ++ goto exit; ++ } ++ ++ ret = incr_load_count_znode(&next_load, next_lock.node); ++ if (ret != 0) { ++ goto exit; ++ } ++ ++ coord_init_sideof_unit(&next_coord, next_lock.node, ++ sideof_reverse(scan->direction)); ++ } ++ ++ iplug = item_plugin_by_coord(&next_coord); ++ ++ /* Get the next child. */ ++ ret = ++ iplug->f.utmost_child(&next_coord, ++ sideof_reverse(scan->direction), ++ &child); ++ if (ret != 0) ++ goto exit; ++ /* If the next child is not in memory, or item_utmost_child ++ failed (most probably due to a race with unlink), stop ++ here. */ ++ if (child == NULL || IS_ERR(child)) { ++ scan->stop = 1; ++ checkchild(scan); ++ break; ++ } ++ ++ assert("nikita-2374", jnode_is_unformatted(child) ++ || jnode_is_znode(child)); ++ ++ /* See if it is dirty, part of the same atom. */ ++ if (!scan_goto(scan, child)) { ++ checkchild(scan); ++ break; ++ } ++ ++ /* If so, make this child current. */ ++ ret = scan_set_current(scan, child, 1, &next_coord); ++ if (ret != 0) ++ goto exit; ++ ++ /* Now continue. If formatted we release the parent lock and return, then ++ proceed. */ ++ if (jnode_is_znode(child)) ++ break; ++ ++ /* Otherwise, repeat the above loop with next_coord. */ ++ if (next_load.node != NULL) { ++ done_lh(&scan->parent_lock); ++ move_lh(&scan->parent_lock, &next_lock); ++ move_load_count(&scan->parent_load, &next_load); ++ } ++ } ++ ++ assert("jmacd-6233", scan_finished(scan) || jnode_is_znode(scan->node)); ++ exit: ++ checkchild(scan); ++ race: /* skip the above check */ ++ if (jnode_is_znode(scan->node)) { ++ done_lh(&scan->parent_lock); ++ done_load_count(&scan->parent_load); ++ } ++ ++ done_load_count(&next_load); ++ done_lh(&next_lock); ++ return ret; ++} ++ ++/* FLUSH POS HELPERS */ ++ ++/* Initialize the fields of a flush_position.
*/ ++static void pos_init(flush_pos_t * pos) ++{ ++ memset(pos, 0, sizeof *pos); ++ ++ pos->state = POS_INVALID; ++ coord_init_invalid(&pos->coord, NULL); ++ init_lh(&pos->lock); ++ init_load_count(&pos->load); ++ ++ blocknr_hint_init(&pos->preceder); ++} ++ ++/* The flush loop inside squalloc periodically checks pos_valid to ++ determine when "enough flushing" has been performed. This will return true until one ++ of the following conditions is met: ++ ++ 1. the number of flush-queued nodes has reached the kernel-supplied "int *nr_to_flush" ++ parameter, meaning we have flushed as many blocks as the kernel requested. When ++ flushing to commit, this parameter is NULL. ++ ++ 2. pos_stop() is called because squalloc discovers that the "next" node in the ++ flush order is either non-existent, not dirty, or not in the same atom. ++*/ ++ ++static int pos_valid(flush_pos_t * pos) ++{ ++ return pos->state != POS_INVALID; ++} ++ ++/* Release any resources of a flush_position. Called when jnode_flush finishes. */ ++static void pos_done(flush_pos_t * pos) ++{ ++ pos_stop(pos); ++ blocknr_hint_done(&pos->preceder); ++ if (convert_data(pos)) ++ free_convert_data(pos); ++} ++ ++/* Reset the point and parent. Called during flush subroutines to terminate the ++ squalloc loop. */ ++static int pos_stop(flush_pos_t * pos) ++{ ++ pos->state = POS_INVALID; ++ done_lh(&pos->lock); ++ done_load_count(&pos->load); ++ coord_init_invalid(&pos->coord, NULL); ++ ++ if (pos->child) { ++ jput(pos->child); ++ pos->child = NULL; ++ } ++ ++ return 0; ++} ++ ++/* Return the flush_position's block allocator hint. */ ++reiser4_blocknr_hint *pos_hint(flush_pos_t * pos) ++{ ++ return &pos->preceder; ++} ++ ++flush_queue_t *pos_fq(flush_pos_t * pos) ++{ ++ return pos->fq; ++} ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 90 ++ LocalWords: preceder ++ End: ++*/ +Index: linux-2.6.16/fs/reiser4/flush.h +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/flush.h +@@ -0,0 +1,274 @@ ++/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++/* DECLARATIONS: */ ++ ++#if !defined(__REISER4_FLUSH_H__) ++#define __REISER4_FLUSH_H__ ++ ++#include "plugin/cluster.h" ++ ++/* The flush_scan data structure maintains the state of an in-progress flush-scan on a ++ single level of the tree. A flush-scan is used for counting the number of adjacent ++ nodes to flush, which is used to determine whether we should relocate, and it is also ++ used to find a starting point for flush. A flush-scan object can scan in both right ++ and left directions via the scan_left() and scan_right() interfaces. The ++ right- and left-variations are similar but perform different functions. When scanning ++ left we (optionally perform rapid scanning and then) longterm-lock the endpoint node. ++ When scanning right we are simply counting the number of adjacent, dirty nodes. */ ++struct flush_scan { ++ ++ /* The current number of nodes scanned on this level. */ ++ unsigned count; ++ ++ /* There may be a maximum number of nodes for a scan on any single level. When ++ going leftward, max_count is determined by FLUSH_SCAN_MAXNODES (see reiser4.h) */ ++ unsigned max_count; ++ ++ /* Direction: Set to one of the sideof enumeration: { LEFT_SIDE, RIGHT_SIDE }.
*/ ++ sideof direction; ++ ++ /* Initially @stop is set to false, then set to true once some condition stops the ++ search (e.g., we found a clean node before reaching max_count or we found a ++ node belonging to another atom). */ ++ int stop; ++ ++ /* The current scan position. If @node is non-NULL then its reference count has ++ been incremented to reflect this reference. */ ++ jnode *node; ++ ++ /* A handle for zload/zrelse of current scan position node. */ ++ load_count node_load; ++ ++ /* During left-scan, if the final position (a.k.a. endpoint node) is formatted the ++ node is locked using this lock handle. The endpoint needs to be locked for ++ transfer to the flush_position object after scanning finishes. */ ++ lock_handle node_lock; ++ ++ /* When the position is unformatted, its parent, coordinate, and parent ++ zload/zrelse handle. */ ++ lock_handle parent_lock; ++ coord_t parent_coord; ++ load_count parent_load; ++ ++ /* The block allocator preceder hint. Sometimes flush_scan determines what the ++ preceder is and if so it sets it here, after which it is copied into the ++ flush_position. Otherwise, the preceder is computed later. */ ++ reiser4_block_nr preceder_blk; ++}; ++ ++typedef struct convert_item_info { ++ dc_item_stat d_cur; /* disk cluster state of the current item */ ++ dc_item_stat d_next; /* disk cluster state of the next slum item */ ++ struct inode *inode; ++ flow_t flow; ++} convert_item_info_t; ++ ++typedef struct convert_info { ++ int count; /* for squalloc terminating */ ++ reiser4_cluster_t clust; /* transform cluster */ ++ item_plugin *iplug; /* current item plugin */ ++ convert_item_info_t *itm; /* current item info */ ++} convert_info_t; ++ ++typedef enum flush_position_state { ++ POS_INVALID, /* Invalid or stopped pos, do not continue slum ++ * processing */ ++ POS_ON_LEAF, /* pos points to already prepped, locked formatted node at ++ * leaf level */ ++ POS_ON_EPOINT, /* pos keeps a lock on twig level, "coord" field is used ++ * to traverse unformatted nodes */ ++ POS_TO_LEAF, /* pos is being moved to leaf level */ ++ POS_TO_TWIG, /* pos is being moved to twig level */ ++ POS_END_OF_TWIG, /* special case of POS_ON_EPOINT, when coord is after ++ * rightmost unit of the current twig */ ++ POS_ON_INTERNAL /* same as POS_ON_LEAF, but points to internal node */ ++} flushpos_state_t; ++ ++/* An encapsulation of the current flush point and all the parameters that are passed ++ through the entire squeeze-and-allocate stage of the flush routine. A single ++ flush_position object is constructed after left- and right-scanning finishes. */ ++struct flush_position { ++ flushpos_state_t state; ++ ++ coord_t coord; /* coord to traverse unformatted nodes */ ++ lock_handle lock; /* current lock we hold */ ++ load_count load; /* load status for current locked formatted node */ ++ ++ jnode *child; /* for passing a reference to unformatted child ++ * across pos state changes */ ++ ++ reiser4_blocknr_hint preceder; /* The flush 'hint' state. */ ++ int leaf_relocate; /* True if enough leaf-level nodes were ++ * found to suggest a relocate policy. */ ++ int alloc_cnt; /* The number of nodes allocated during squeeze and allocate. */ ++ int prep_or_free_cnt; /* The number of nodes prepared for write (allocate) or squeezed and freed.
*/ ++ flush_queue_t *fq; ++ long *nr_written; /* number of nodes submitted to disk */ ++ int flags; /* a copy of jnode_flush flags argument */ ++ ++ znode *prev_twig; /* previous parent pointer value, used to catch ++ * processing of new twig node */ ++ convert_info_t *sq; /* convert info */ ++ ++ unsigned long pos_in_unit; /* for extents only. Position ++ within an extent unit of first ++ jnode of slum */ ++ long nr_to_write; /* number of unformatted nodes to handle on flush */ ++}; ++ ++static inline int item_convert_count(flush_pos_t * pos) ++{ ++ return pos->sq->count; ++} ++static inline void inc_item_convert_count(flush_pos_t * pos) ++{ ++ pos->sq->count++; ++} ++static inline void set_item_convert_count(flush_pos_t * pos, int count) ++{ ++ pos->sq->count = count; ++} ++static inline item_plugin *item_convert_plug(flush_pos_t * pos) ++{ ++ return pos->sq->iplug; ++} ++ ++static inline convert_info_t *convert_data(flush_pos_t * pos) ++{ ++ return pos->sq; ++} ++ ++static inline convert_item_info_t *item_convert_data(flush_pos_t * pos) ++{ ++ assert("edward-955", convert_data(pos)); ++ return pos->sq->itm; ++} ++ ++static inline tfm_cluster_t *tfm_cluster_sq(flush_pos_t * pos) ++{ ++ return &pos->sq->clust.tc; ++} ++ ++static inline tfm_stream_t *tfm_stream_sq(flush_pos_t * pos, tfm_stream_id id) ++{ ++ assert("edward-854", pos->sq != NULL); ++ return tfm_stream(tfm_cluster_sq(pos), id); ++} ++ ++static inline int chaining_data_present(flush_pos_t * pos) ++{ ++ return convert_data(pos) && item_convert_data(pos); ++} ++ ++/* Returns true if next node contains next item of the disk cluster ++ so item convert data should be moved to the right slum neighbor. ++*/ ++static inline int should_chain_next_node(flush_pos_t * pos) ++{ ++ int result = 0; ++ ++ assert("edward-1007", chaining_data_present(pos)); ++ ++ switch (item_convert_data(pos)->d_next) { ++ case DC_CHAINED_ITEM: ++ result = 1; ++ break; ++ case DC_AFTER_CLUSTER: ++ break; ++ default: ++ impossible("edward-1009", "bad state of next slum item"); ++ } ++ return result; ++} ++ ++/* update item state in a disk cluster to assign conversion mode */ ++static inline void ++move_chaining_data(flush_pos_t * pos, int this_node /* where is next item */ ) ++{ ++ ++ assert("edward-1010", chaining_data_present(pos)); ++ ++ if (this_node == 0) { ++ /* next item is on the right neighbor */ ++ assert("edward-1011", ++ item_convert_data(pos)->d_cur == DC_FIRST_ITEM || ++ item_convert_data(pos)->d_cur == DC_CHAINED_ITEM); ++ assert("edward-1012", ++ item_convert_data(pos)->d_next == DC_CHAINED_ITEM); ++ ++ item_convert_data(pos)->d_cur = DC_CHAINED_ITEM; ++ item_convert_data(pos)->d_next = DC_INVALID_STATE; ++ } else { ++ /* next item is on the same node */ ++ assert("edward-1013", ++ item_convert_data(pos)->d_cur == DC_FIRST_ITEM || ++ item_convert_data(pos)->d_cur == DC_CHAINED_ITEM); ++ assert("edward-1227", ++ item_convert_data(pos)->d_next == DC_AFTER_CLUSTER || ++ item_convert_data(pos)->d_next == DC_INVALID_STATE); ++ ++ item_convert_data(pos)->d_cur = DC_AFTER_CLUSTER; ++ item_convert_data(pos)->d_next = DC_INVALID_STATE; ++ } ++} ++ ++static inline int should_convert_node(flush_pos_t * pos, znode * node) ++{ ++ return znode_convertible(node); ++} ++ ++/* true if there is attached convert item info */ ++static inline int should_convert_next_node(flush_pos_t * pos, znode * node) ++{ ++ return convert_data(pos) && item_convert_data(pos); ++} ++ ++#define SQUALLOC_THRESHOLD 256 ++ ++static inline int should_terminate_squalloc(flush_pos_t * 
pos) ++{ ++ return convert_data(pos) && ++ !item_convert_data(pos) && ++ item_convert_count(pos) >= SQUALLOC_THRESHOLD; ++} ++ ++void free_convert_data(flush_pos_t * pos); ++/* used in extent.c */ ++int scan_set_current(flush_scan * scan, jnode * node, unsigned add_size, ++ const coord_t * parent); ++int scan_finished(flush_scan * scan); ++int scanning_left(flush_scan * scan); ++int scan_goto(flush_scan * scan, jnode * tonode); ++txn_atom *atom_locked_by_fq(flush_queue_t * fq); ++int alloc_extent(flush_pos_t *flush_pos); ++squeeze_result squalloc_extent(znode *left, const coord_t *, flush_pos_t *, ++ reiser4_key *stop_key); ++extern int init_fqs(void); ++extern void done_fqs(void); ++ ++#if REISER4_DEBUG ++ ++extern void check_fq(const txn_atom *atom); ++extern atomic_t flush_cnt; ++ ++#define check_preceder(blk) \ ++assert("nikita-2588", blk < reiser4_block_count(reiser4_get_current_sb())); ++extern void check_pos(flush_pos_t * pos); ++#else ++#define check_preceder(b) noop ++#define check_pos(pos) noop ++#endif ++ ++/* __REISER4_FLUSH_H__ */ ++#endif ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 90 ++ LocalWords: preceder ++ End: ++*/ +Index: linux-2.6.16/fs/reiser4/flush_queue.c +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/flush_queue.c +@@ -0,0 +1,681 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++#include "debug.h" ++#include "super.h" ++#include "txnmgr.h" ++#include "jnode.h" ++#include "znode.h" ++#include "page_cache.h" ++#include "wander.h" ++#include "vfs_ops.h" ++#include "writeout.h" ++#include "flush.h" ++ ++#include ++#include ++#include ++#include ++#include ++ ++/* A flush queue object is an accumulator for keeping jnodes prepared ++ by the jnode_flush() function for writing to disk. Those "queued" jnodes are ++ kept on the flush queue until memory pressure or atom commit asks ++ flush queues to write some or all of their jnodes. */ ++ ++/* ++ LOCKING: ++ ++ The fq->guard spin lock protects the fq->atom pointer and nothing else. The ++ fq->prepped list is protected by the atom spin lock, using the following ++ locking: ++ ++ there are two ways to protect the fq->prepped list for read-only list traversal: ++ ++ 1. spin-lock the atom. ++ 2. fq is IN_USE, atom->nr_running_queues is increased. ++ ++ and one for list modification: ++ ++ 1. the atom is spin-locked and one condition is true: fq is IN_USE or ++ atom->nr_running_queues == 0. ++ ++ The deadlock-safe order for flush queues and atoms is: first lock atom, then ++ lock flush queue, then lock jnode. ++*/ ++ ++#define fq_in_use(fq) ((fq)->state & FQ_IN_USE) ++#define fq_ready(fq) (!fq_in_use(fq)) ++ ++#define mark_fq_in_use(fq) do { (fq)->state |= FQ_IN_USE; } while (0) ++#define mark_fq_ready(fq) do { (fq)->state &= ~FQ_IN_USE; } while (0) ++ ++/* get lock on atom from locked flush queue object */ ++static txn_atom *atom_locked_by_fq_nolock(flush_queue_t * fq) ++{ ++ /* This code is similar to jnode_get_atom(), look at it for the ++ * explanation.
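++ * In short: the lock order is atom before fq->guard, but we already ++ * hold fq->guard, so only a trylock of the atom is safe. If that fails ++ * we pin the atom with a reference, drop fq->guard, take both locks in ++ * the correct order, and re-check that fq->atom has not changed ++ * underneath us.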
*/ ++ txn_atom *atom; ++ ++ assert_spin_locked(&(fq->guard)); ++ ++ while (1) { ++ atom = fq->atom; ++ if (atom == NULL) ++ break; ++ ++ if (spin_trylock_atom(atom)) ++ break; ++ ++ atomic_inc(&atom->refcount); ++ spin_unlock(&(fq->guard)); ++ spin_lock_atom(atom); ++ spin_lock(&(fq->guard)); ++ ++ if (fq->atom == atom) { ++ atomic_dec(&atom->refcount); ++ break; ++ } ++ ++ spin_unlock(&(fq->guard)); ++ atom_dec_and_unlock(atom); ++ spin_lock(&(fq->guard)); ++ } ++ ++ return atom; ++} ++ ++txn_atom *atom_locked_by_fq(flush_queue_t * fq) ++{ ++ txn_atom *atom; ++ ++ spin_lock(&(fq->guard)); ++ atom = atom_locked_by_fq_nolock(fq); ++ spin_unlock(&(fq->guard)); ++ return atom; ++} ++ ++static void init_fq(flush_queue_t * fq) ++{ ++ memset(fq, 0, sizeof *fq); ++ ++ atomic_set(&fq->nr_submitted, 0); ++ ++ INIT_LIST_HEAD(ATOM_FQ_LIST(fq)); ++ ++ sema_init(&fq->io_sem, 0); ++ spin_lock_init(&fq->guard); ++} ++ ++/* slab for flush queues */ ++static kmem_cache_t *fq_slab; ++ ++ ++/** ++ * init_fqs - create flush queue cache ++ * ++ * Initializes slab cache of flush queues. It is part of reiser4 module ++ * initialization. ++ */ ++int init_fqs(void) ++{ ++ fq_slab = kmem_cache_create("fq", ++ sizeof(flush_queue_t), ++ 0, SLAB_HWCACHE_ALIGN, NULL, NULL); ++ if (fq_slab == NULL) ++ return RETERR(-ENOMEM); ++ return 0; ++} ++ ++/** ++ * done_fqs - delete flush queue cache ++ * ++ * This is called on reiser4 module unloading or system shutdown. ++ */ ++void done_fqs(void) ++{ ++ destroy_reiser4_cache(&fq_slab); ++} ++ ++/* create new flush queue object */ ++static flush_queue_t *create_fq(gfp_t gfp) ++{ ++ flush_queue_t *fq; ++ ++ fq = kmem_cache_alloc(fq_slab, gfp); ++ if (fq) ++ init_fq(fq); ++ ++ return fq; ++} ++ ++/* adjust atom's and flush queue's counters of queued nodes */ ++static void count_enqueued_node(flush_queue_t * fq) ++{ ++ ON_DEBUG(fq->atom->num_queued++); ++} ++ ++static void count_dequeued_node(flush_queue_t * fq) ++{ ++ assert("zam-993", fq->atom->num_queued > 0); ++ ON_DEBUG(fq->atom->num_queued--); ++} ++ ++/* attach flush queue object to the atom */ ++static void attach_fq(txn_atom *atom, flush_queue_t *fq) ++{ ++ assert_spin_locked(&(atom->alock)); ++ list_add(&fq->alink, &atom->flush_queues); ++ fq->atom = atom; ++ ON_DEBUG(atom->nr_flush_queues++); ++} ++ ++static void detach_fq(flush_queue_t * fq) ++{ ++ assert_spin_locked(&(fq->atom->alock)); ++ ++ spin_lock(&(fq->guard)); ++ list_del_init(&fq->alink); ++ assert("vs-1456", fq->atom->nr_flush_queues > 0); ++ ON_DEBUG(fq->atom->nr_flush_queues--); ++ fq->atom = NULL; ++ spin_unlock(&(fq->guard)); ++} ++ ++/* destroy flush queue object */ ++static void done_fq(flush_queue_t * fq) ++{ ++ assert("zam-763", list_empty_careful(ATOM_FQ_LIST(fq))); ++ assert("zam-766", atomic_read(&fq->nr_submitted) == 0); ++ ++ kmem_cache_free(fq_slab, fq); ++} ++ ++/* */ ++void mark_jnode_queued(flush_queue_t * fq, jnode * node) ++{ ++ JF_SET(node, JNODE_FLUSH_QUEUED); ++ count_enqueued_node(fq); ++} ++ ++/* Putting jnode into the flush queue. Both atom and jnode should be ++ spin-locked. 
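++ The node must already be dirty and marked RELOC; queueing moves it from ++ its atom's list onto the fq's prepped list and sets JNODE_FLUSH_QUEUED so ++ that it cannot be queued twice.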
*/ ++void queue_jnode(flush_queue_t * fq, jnode * node) ++{ ++ assert_spin_locked(&(node->guard)); ++ assert("zam-713", node->atom != NULL); ++ assert_spin_locked(&(node->atom->alock)); ++ assert("zam-716", fq->atom != NULL); ++ assert("zam-717", fq->atom == node->atom); ++ assert("zam-907", fq_in_use(fq)); ++ ++ assert("zam-714", JF_ISSET(node, JNODE_DIRTY)); ++ assert("zam-826", JF_ISSET(node, JNODE_RELOC)); ++ assert("vs-1481", !JF_ISSET(node, JNODE_FLUSH_QUEUED)); ++ assert("vs-1481", NODE_LIST(node) != FQ_LIST); ++ ++ mark_jnode_queued(fq, node); ++ list_move_tail(&node->capture_link, ATOM_FQ_LIST(fq)); ++ ++ ON_DEBUG(count_jnode(node->atom, node, NODE_LIST(node), ++ FQ_LIST, 1)); ++} ++ ++/* repeatable process for waiting for io completion on a flush queue object */ ++static int wait_io(flush_queue_t * fq, int *nr_io_errors) ++{ ++ assert("zam-738", fq->atom != NULL); ++ assert_spin_locked(&(fq->atom->alock)); ++ assert("zam-736", fq_in_use(fq)); ++ assert("zam-911", list_empty_careful(ATOM_FQ_LIST(fq))); ++ ++ if (atomic_read(&fq->nr_submitted) != 0) { ++ struct super_block *super; ++ ++ spin_unlock_atom(fq->atom); ++ ++ assert("nikita-3013", schedulable()); ++ ++ super = reiser4_get_current_sb(); ++ ++ /* FIXME: this is instead of blk_run_queues() */ ++ blk_run_address_space(get_super_fake(super)->i_mapping); ++ ++ if (!(super->s_flags & MS_RDONLY)) ++ down(&fq->io_sem); ++ ++ /* Ask the caller to re-acquire the locks and call this ++ function again. Note: this technique is commonly used in ++ the txnmgr code. */ ++ return -E_REPEAT; ++ } ++ ++ *nr_io_errors += atomic_read(&fq->nr_errors); ++ return 0; ++} ++ ++/* wait on I/O completion, re-submit dirty nodes to write */ ++static int finish_fq(flush_queue_t * fq, int *nr_io_errors) ++{ ++ int ret; ++ txn_atom *atom = fq->atom; ++ ++ assert("zam-801", atom != NULL); ++ assert_spin_locked(&(atom->alock)); ++ assert("zam-762", fq_in_use(fq)); ++ ++ ret = wait_io(fq, nr_io_errors); ++ if (ret) ++ return ret; ++ ++ detach_fq(fq); ++ done_fq(fq); ++ ++ atom_send_event(atom); ++ ++ return 0; ++} ++ ++/* wait for all i/o for a given atom to be completed; actually does one iteration ++ on that and returns -E_REPEAT if more iterations are needed */ ++static int finish_all_fq(txn_atom * atom, int *nr_io_errors) ++{ ++ flush_queue_t *fq; ++ ++ assert_spin_locked(&(atom->alock)); ++ ++ if (list_empty_careful(&atom->flush_queues)) ++ return 0; ++ ++ list_for_each_entry(fq, &atom->flush_queues, alink) { ++ if (fq_ready(fq)) { ++ int ret; ++ ++ mark_fq_in_use(fq); ++ assert("vs-1247", fq->owner == NULL); ++ ON_DEBUG(fq->owner = current); ++ ret = finish_fq(fq, nr_io_errors); ++ ++ if (*nr_io_errors) ++ reiser4_handle_error(); ++ ++ if (ret) { ++ fq_put(fq); ++ return ret; ++ } ++ ++ spin_unlock_atom(atom); ++ ++ return -E_REPEAT; ++ } ++ } ++ ++ /* All flush queues are in use; atom remains locked */ ++ return -EBUSY; ++} ++ ++/* wait for all i/o for the current atom */ ++int current_atom_finish_all_fq(void) ++{ ++ txn_atom *atom; ++ int nr_io_errors = 0; ++ int ret = 0; ++ ++ do { ++ while (1) { ++ atom = get_current_atom_locked(); ++ ret = finish_all_fq(atom, &nr_io_errors); ++ if (ret != -EBUSY) ++ break; ++ atom_wait_event(atom); ++ } ++ } while (ret == -E_REPEAT); ++ ++ /* we do not need the atom locked after this function finishes; SUCCESS ++ and -EBUSY are the two return codes for which the atom remains locked ++ after finish_all_fq */ ++ if (!ret) ++ spin_unlock_atom(atom); ++ ++ assert_spin_not_locked(&(atom->alock)); ++ ++ if (ret) ++ return ret; ++ ++ if (nr_io_errors)
++ return RETERR(-EIO); ++ ++ return 0; ++} ++ ++/* change node->atom field for all jnode from given list */ ++static void ++scan_fq_and_update_atom_ref(struct list_head *list, txn_atom *atom) ++{ ++ jnode *cur; ++ ++ list_for_each_entry(cur, list, capture_link) { ++ spin_lock_jnode(cur); ++ cur->atom = atom; ++ spin_unlock_jnode(cur); ++ } ++} ++ ++/* support for atom fusion operation */ ++void fuse_fq(txn_atom *to, txn_atom *from) ++{ ++ flush_queue_t *fq; ++ ++ assert_spin_locked(&(to->alock)); ++ assert_spin_locked(&(from->alock)); ++ ++ list_for_each_entry(fq, &from->flush_queues, alink) { ++ scan_fq_and_update_atom_ref(ATOM_FQ_LIST(fq), to); ++ spin_lock(&(fq->guard)); ++ fq->atom = to; ++ spin_unlock(&(fq->guard)); ++ } ++ ++ list_splice_init(&from->flush_queues, to->flush_queues.prev); ++ ++#if REISER4_DEBUG ++ to->num_queued += from->num_queued; ++ to->nr_flush_queues += from->nr_flush_queues; ++ from->nr_flush_queues = 0; ++#endif ++} ++ ++#if REISER4_DEBUG ++int atom_fq_parts_are_clean(txn_atom * atom) ++{ ++ assert("zam-915", atom != NULL); ++ return list_empty_careful(&atom->flush_queues); ++} ++#endif ++/* Bio i/o completion routine for reiser4 write operations. */ ++static int ++end_io_handler(struct bio *bio, unsigned int bytes_done UNUSED_ARG, ++ int err) ++{ ++ int i; ++ int nr_errors = 0; ++ flush_queue_t *fq; ++ ++ assert("zam-958", bio->bi_rw & WRITE); ++ ++ /* i/o op. is not fully completed */ ++ if (bio->bi_size != 0) ++ return 1; ++ ++ if (err == -EOPNOTSUPP) ++ set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); ++ ++ /* we expect that bio->private is set to NULL or fq object which is used ++ * for synchronization and error counting. */ ++ fq = bio->bi_private; ++ /* Check all elements of io_vec for correct write completion. */ ++ for (i = 0; i < bio->bi_vcnt; i += 1) { ++ struct page *pg = bio->bi_io_vec[i].bv_page; ++ ++ if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) { ++ SetPageError(pg); ++ nr_errors++; ++ } ++ ++ { ++ /* jnode WRITEBACK ("write is in progress bit") is ++ * atomically cleared here. */ ++ jnode *node; ++ ++ assert("zam-736", pg != NULL); ++ assert("zam-736", PagePrivate(pg)); ++ node = jprivate(pg); ++ ++ JF_CLR(node, JNODE_WRITEBACK); ++ } ++ ++ end_page_writeback(pg); ++ page_cache_release(pg); ++ } ++ ++ if (fq) { ++ /* count i/o error in fq object */ ++ atomic_add(nr_errors, &fq->nr_errors); ++ ++ /* If all write requests registered in this "fq" are done we up ++ * the semaphore. */ ++ if (atomic_sub_and_test(bio->bi_vcnt, &fq->nr_submitted)) ++ up(&fq->io_sem); ++ } ++ ++ bio_put(bio); ++ return 0; ++} ++ ++/* Count I/O requests which will be submitted by @bio in given flush queues ++ @fq */ ++void add_fq_to_bio(flush_queue_t * fq, struct bio *bio) ++{ ++ bio->bi_private = fq; ++ bio->bi_end_io = end_io_handler; ++ ++ if (fq) ++ atomic_add(bio->bi_vcnt, &fq->nr_submitted); ++} ++ ++/* Move all queued nodes out from @fq->prepped list. 
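++ Nodes that were redirtied while queued go back onto the atom's dirty list ++ for their level; the rest go onto the atom's clean list.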
*/ ++static void release_prepped_list(flush_queue_t * fq) ++{ ++ txn_atom *atom; ++ ++ assert("zam-904", fq_in_use(fq)); ++ atom = atom_locked_by_fq(fq); ++ ++ while (!list_empty(ATOM_FQ_LIST(fq))) { ++ jnode *cur; ++ ++ cur = list_entry(ATOM_FQ_LIST(fq)->next, jnode, capture_link); ++ list_del_init(&cur->capture_link); ++ ++ count_dequeued_node(fq); ++ spin_lock_jnode(cur); ++ assert("nikita-3154", !JF_ISSET(cur, JNODE_OVRWR)); ++ assert("nikita-3154", JF_ISSET(cur, JNODE_RELOC)); ++ assert("nikita-3154", JF_ISSET(cur, JNODE_FLUSH_QUEUED)); ++ JF_CLR(cur, JNODE_FLUSH_QUEUED); ++ ++ if (JF_ISSET(cur, JNODE_DIRTY)) { ++ list_add_tail(&cur->capture_link, ++ ATOM_DIRTY_LIST(atom, jnode_get_level(cur))); ++ ON_DEBUG(count_jnode(atom, cur, FQ_LIST, ++ DIRTY_LIST, 1)); ++ } else { ++ list_add_tail(&cur->capture_link, ATOM_CLEAN_LIST(atom)); ++ ON_DEBUG(count_jnode(atom, cur, FQ_LIST, ++ CLEAN_LIST, 1)); ++ } ++ ++ spin_unlock_jnode(cur); ++ } ++ ++ if (--atom->nr_running_queues == 0) ++ atom_send_event(atom); ++ ++ spin_unlock_atom(atom); ++} ++ ++/* Submit write requests for nodes on the already filled flush queue @fq. ++ ++ @fq: flush queue object which contains jnodes we can (and will) write. ++ @return: number of submitted blocks (>=0) if success, otherwise -- an error ++ code (<0). */ ++int write_fq(flush_queue_t * fq, long *nr_submitted, int flags) ++{ ++ int ret; ++ txn_atom *atom; ++ ++ while (1) { ++ atom = atom_locked_by_fq(fq); ++ assert("zam-924", atom); ++ /* do not write fq in parallel. */ ++ if (atom->nr_running_queues == 0 ++ || !(flags & WRITEOUT_SINGLE_STREAM)) ++ break; ++ atom_wait_event(atom); ++ } ++ ++ atom->nr_running_queues++; ++ spin_unlock_atom(atom); ++ ++ ret = write_jnode_list(ATOM_FQ_LIST(fq), fq, nr_submitted, flags); ++ release_prepped_list(fq); ++ ++ return ret; ++} ++ ++/* Getting flush queue object for exclusive use by one thread. May require ++ several iterations which is indicated by -E_REPEAT return code. ++ ++ This function does not contain code for obtaining an atom lock because an ++ atom lock is obtained by different ways in different parts of reiser4, ++ usually it is current atom, but we need a possibility for getting fq for the ++ atom of given jnode. */ ++static int fq_by_atom_gfp(txn_atom *atom, flush_queue_t **new_fq, gfp_t gfp) ++{ ++ flush_queue_t *fq; ++ ++ assert_spin_locked(&(atom->alock)); ++ ++ fq = list_entry(atom->flush_queues.next, flush_queue_t, alink); ++ while (&atom->flush_queues != &fq->alink) { ++ spin_lock(&(fq->guard)); ++ ++ if (fq_ready(fq)) { ++ mark_fq_in_use(fq); ++ assert("vs-1246", fq->owner == NULL); ++ ON_DEBUG(fq->owner = current); ++ spin_unlock(&(fq->guard)); ++ ++ if (*new_fq) ++ done_fq(*new_fq); ++ ++ *new_fq = fq; ++ ++ return 0; ++ } ++ ++ spin_unlock(&(fq->guard)); ++ ++ fq = list_entry(fq->alink.next, flush_queue_t, alink); ++ } ++ ++ /* Use previously allocated fq object */ ++ if (*new_fq) { ++ mark_fq_in_use(*new_fq); ++ assert("vs-1248", (*new_fq)->owner == 0); ++ ON_DEBUG((*new_fq)->owner = current); ++ attach_fq(atom, *new_fq); ++ ++ return 0; ++ } ++ ++ spin_unlock_atom(atom); ++ ++ *new_fq = create_fq(gfp); ++ ++ if (*new_fq == NULL) ++ return RETERR(-ENOMEM); ++ ++ return RETERR(-E_REPEAT); ++} ++ ++int fq_by_atom(txn_atom * atom, flush_queue_t ** new_fq) ++{ ++ return fq_by_atom_gfp(atom, new_fq, get_gfp_mask()); ++} ++ ++/* A wrapper around fq_by_atom for getting a flush queue object for current ++ * atom, if success fq->atom remains locked. 
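[Aside: fq_by_atom_gfp() above shows a pattern used throughout this transaction manager: memory must not be allocated while the atom spinlock is held, so the function drops the lock, allocates, and returns -E_REPEAT, and the caller re-takes the lock and retries with the preallocated object in hand. A compact userspace sketch of that protocol, with a pthread mutex standing in for the atom spinlock; the E_REPEAT value and helper names are made up for illustration:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

#define E_REPEAT 1024                    /* illustrative stand-in */

struct fq { int in_use; };

static pthread_mutex_t atom_lock = PTHREAD_MUTEX_INITIALIZER;
static struct fq *queue;                 /* stand-in for atom->flush_queues */

/* Called with atom_lock held.  Grab an existing free queue, install the
 * preallocated one, or drop the lock and ask the caller to allocate. */
static int get_fq(struct fq **prealloc, struct fq **out)
{
	if (queue && !queue->in_use) {
		queue->in_use = 1;
		free(*prealloc);         /* preallocated copy not needed */
		*prealloc = NULL;
		*out = queue;
		return 0;                /* success: lock still held */
	}
	if (*prealloc) {
		(*prealloc)->in_use = 1;
		queue = *out = *prealloc;
		*prealloc = NULL;
		return 0;                /* success: lock still held */
	}
	pthread_mutex_unlock(&atom_lock);    /* never allocate under the lock */
	*prealloc = calloc(1, sizeof(struct fq));
	return *prealloc ? -E_REPEAT : -1;   /* retry, or hard failure */
}

int main(void)
{
	struct fq *prealloc = NULL, *fq = NULL;
	int ret;

	do {
		pthread_mutex_lock(&atom_lock);
		ret = get_fq(&prealloc, &fq);
	} while (ret == -E_REPEAT);

	if (ret == 0) {
		printf("got flush queue %p\n", (void *)fq);
		pthread_mutex_unlock(&atom_lock);
		free(fq);
	}
	return ret ? 1 : 0;
}

The same drop-lock-and-return-E_REPEAT convention drives the do/while retry loop in get_fq_for_current_atom() just below.]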
*/ ++flush_queue_t *get_fq_for_current_atom(void) ++{ ++ flush_queue_t *fq = NULL; ++ txn_atom *atom; ++ int ret; ++ ++ do { ++ atom = get_current_atom_locked(); ++ ret = fq_by_atom(atom, &fq); ++ } while (ret == -E_REPEAT); ++ ++ if (ret) ++ return ERR_PTR(ret); ++ return fq; ++} ++ ++/* Releasing flush queue object after exclusive use */ ++void fq_put_nolock(flush_queue_t *fq) ++{ ++ assert("zam-747", fq->atom != NULL); ++ assert("zam-902", list_empty_careful(ATOM_FQ_LIST(fq))); ++ mark_fq_ready(fq); ++ assert("vs-1245", fq->owner == current); ++ ON_DEBUG(fq->owner = NULL); ++} ++ ++void fq_put(flush_queue_t * fq) ++{ ++ txn_atom *atom; ++ ++ spin_lock(&(fq->guard)); ++ atom = atom_locked_by_fq_nolock(fq); ++ ++ assert("zam-746", atom != NULL); ++ ++ fq_put_nolock(fq); ++ atom_send_event(atom); ++ ++ spin_unlock(&(fq->guard)); ++ spin_unlock_atom(atom); ++} ++ ++/* A part of atom object initialization related to the embedded flush queue ++ list head */ ++ ++void init_atom_fq_parts(txn_atom *atom) ++{ ++ INIT_LIST_HEAD(&atom->flush_queues); ++} ++ ++#if REISER4_DEBUG ++ ++void check_fq(const txn_atom *atom) ++{ ++ /* check number of nodes on all atom's flush queues */ ++ flush_queue_t *fq; ++ int count; ++ struct list_head *pos; ++ ++ count = 0; ++ list_for_each_entry(fq, &atom->flush_queues, alink) { ++ spin_lock(&(fq->guard)); ++ /* calculate number of jnodes on fq' list of prepped jnodes */ ++ list_for_each(pos, ATOM_FQ_LIST(fq)) ++ count++; ++ spin_unlock(&(fq->guard)); ++ } ++ if (count != atom->fq) ++ warning("", "fq counter %d, real %d\n", atom->fq, count); ++ ++} ++ ++#endif ++ ++/* ++ * Local variables: ++ * c-indentation-style: "K&R" ++ * mode-name: "LC" ++ * c-basic-offset: 8 ++ * tab-width: 8 ++ * fill-column: 79 ++ * scroll-step: 1 ++ * End: ++ */ +Index: linux-2.6.16/fs/reiser4/forward.h +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/forward.h +@@ -0,0 +1,258 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++/* Forward declarations. Thank you Kernighan. 
*/ ++ ++#if !defined( __REISER4_FORWARD_H__ ) ++#define __REISER4_FORWARD_H__ ++ ++#include ++#include ++ ++typedef struct zlock zlock; ++typedef struct lock_stack lock_stack; ++typedef struct lock_handle lock_handle; ++typedef struct znode znode; ++typedef struct flow flow_t; ++typedef struct coord coord_t; ++typedef struct tree_access_pointer tap_t; ++typedef struct item_coord item_coord; ++typedef struct shift_params shift_params; ++typedef struct reiser4_object_create_data reiser4_object_create_data; ++typedef union reiser4_plugin reiser4_plugin; ++typedef __u16 reiser4_plugin_id; ++typedef struct item_plugin item_plugin; ++typedef struct jnode_plugin jnode_plugin; ++typedef struct reiser4_item_data reiser4_item_data; ++typedef union reiser4_key reiser4_key; ++typedef struct reiser4_tree reiser4_tree; ++typedef struct carry_cut_data carry_cut_data; ++typedef struct carry_kill_data carry_kill_data; ++typedef struct carry_tree_op carry_tree_op; ++typedef struct carry_tree_node carry_tree_node; ++typedef struct carry_plugin_info carry_plugin_info; ++typedef struct reiser4_journal reiser4_journal; ++typedef struct txn_atom txn_atom; ++typedef struct txn_handle txn_handle; ++typedef struct txn_mgr txn_mgr; ++typedef struct reiser4_dir_entry_desc reiser4_dir_entry_desc; ++typedef struct reiser4_context reiser4_context; ++typedef struct carry_level carry_level; ++typedef struct blocknr_set blocknr_set; ++typedef struct blocknr_set_entry blocknr_set_entry; ++/* super_block->s_fs_info points to this */ ++typedef struct reiser4_super_info_data reiser4_super_info_data; ++/* next two objects are fields of reiser4_super_info_data */ ++typedef struct reiser4_oid_allocator reiser4_oid_allocator; ++typedef struct reiser4_space_allocator reiser4_space_allocator; ++ ++typedef struct flush_scan flush_scan; ++typedef struct flush_position flush_pos_t; ++ ++typedef unsigned short pos_in_node_t; ++#define MAX_POS_IN_NODE 65535 ++ ++typedef struct jnode jnode; ++typedef struct reiser4_blocknr_hint reiser4_blocknr_hint; ++ ++typedef struct uf_coord uf_coord_t; ++typedef struct hint hint_t; ++ ++typedef struct ktxnmgrd_context ktxnmgrd_context; ++ ++typedef struct reiser4_xattr_plugin reiser4_xattr_plugin; ++ ++struct inode; ++struct page; ++struct file; ++struct dentry; ++struct super_block; ++ ++/* return values of coord_by_key(). cbk == coord_by_key */ ++typedef enum { ++ CBK_COORD_FOUND = 0, ++ CBK_COORD_NOTFOUND = -ENOENT, ++} lookup_result; ++ ++/* results of lookup with directory file */ ++typedef enum { ++ FILE_NAME_FOUND = 0, ++ FILE_NAME_NOTFOUND = -ENOENT, ++ FILE_IO_ERROR = -EIO, /* FIXME: it seems silly to have special OOM, IO_ERROR return codes for each search. */ ++ FILE_OOM = -ENOMEM /* FIXME: it seems silly to have special OOM, IO_ERROR return codes for each search. */ ++} file_lookup_result; ++ ++/* behaviors of lookup. If coord we are looking for is actually in a tree, ++ both coincide. */ ++typedef enum { ++ /* search exactly for the coord with key given */ ++ FIND_EXACT, ++ /* search for coord with the maximal key not greater than one ++ given */ ++ FIND_MAX_NOT_MORE_THAN /*LEFT_SLANT_BIAS */ ++} lookup_bias; ++ ++typedef enum { ++ /* number of leaf level of the tree ++ The fake root has (tree_level=0). */ ++ LEAF_LEVEL = 1, ++ ++ /* number of level one above leaf level of the tree. ++ ++ It is supposed that internal tree used by reiser4 to store file ++ system data and meta data will have height 2 initially (when ++ created by mkfs). 
++ */ ++ TWIG_LEVEL = 2, ++} tree_level; ++ ++/* The "real" maximum ztree height is the 0-origin size of any per-level ++ array, since the zero'th level is not used. */ ++#define REAL_MAX_ZTREE_HEIGHT (REISER4_MAX_ZTREE_HEIGHT-LEAF_LEVEL) ++ ++/* enumeration of possible mutual position of item and coord. This enum is ++ return type of ->is_in_item() item plugin method which see. */ ++typedef enum { ++ /* coord is on the left of an item */ ++ IP_ON_THE_LEFT, ++ /* coord is inside item */ ++ IP_INSIDE, ++ /* coord is inside item, but to the right of the rightmost unit of ++ this item */ ++ IP_RIGHT_EDGE, ++ /* coord is on the right of an item */ ++ IP_ON_THE_RIGHT ++} interposition; ++ ++/* type of lock to acquire on znode before returning it to caller */ ++typedef enum { ++ ZNODE_NO_LOCK = 0, ++ ZNODE_READ_LOCK = 1, ++ ZNODE_WRITE_LOCK = 2, ++} znode_lock_mode; ++ ++/* type of lock request */ ++typedef enum { ++ ZNODE_LOCK_LOPRI = 0, ++ ZNODE_LOCK_HIPRI = (1 << 0), ++ ++ /* By setting the ZNODE_LOCK_NONBLOCK flag in a lock request the call to longterm_lock_znode will not sleep ++ waiting for the lock to become available. If the lock is unavailable, reiser4_znode_lock will immediately ++ return the value -E_REPEAT. */ ++ ZNODE_LOCK_NONBLOCK = (1 << 1), ++ /* An option for longterm_lock_znode which prevents atom fusion */ ++ ZNODE_LOCK_DONT_FUSE = (1 << 2) ++} znode_lock_request; ++ ++typedef enum { READ_OP = 0, WRITE_OP = 1 } rw_op; ++ ++/* used to specify direction of shift. These must be -1 and 1 */ ++typedef enum { ++ SHIFT_LEFT = 1, ++ SHIFT_RIGHT = -1 ++} shift_direction; ++ ++typedef enum { ++ LEFT_SIDE, ++ RIGHT_SIDE ++} sideof; ++ ++#define round_up( value, order ) \ ++ ( ( typeof( value ) )( ( ( long ) ( value ) + ( order ) - 1U ) & \ ++ ~( ( order ) - 1 ) ) ) ++ ++/* values returned by squalloc_right_neighbor and its auxiliary functions */ ++typedef enum { ++ /* unit of internal item is moved */ ++ SUBTREE_MOVED = 0, ++ /* nothing else can be squeezed into left neighbor */ ++ SQUEEZE_TARGET_FULL = 1, ++ /* all content of node is squeezed into its left neighbor */ ++ SQUEEZE_SOURCE_EMPTY = 2, ++ /* one more item is copied (this is only returned by ++ allocate_and_copy_extent to squalloc_twig)) */ ++ SQUEEZE_CONTINUE = 3 ++} squeeze_result; ++ ++/* Do not change items ids. If you do - there will be format change */ ++typedef enum { ++ STATIC_STAT_DATA_ID = 0x0, ++ SIMPLE_DIR_ENTRY_ID = 0x1, ++ COMPOUND_DIR_ID = 0x2, ++ NODE_POINTER_ID = 0x3, ++ EXTENT_POINTER_ID = 0x5, ++ FORMATTING_ID = 0x6, ++ CTAIL_ID = 0x7, ++ BLACK_BOX_ID = 0x8, ++ LAST_ITEM_ID = 0x9 ++} item_id; ++ ++/* Flags passed to jnode_flush() to allow it to distinguish default settings based on ++ whether commit() was called or VM memory pressure was applied. */ ++typedef enum { ++ /* submit flush queue to disk at jnode_flush completion */ ++ JNODE_FLUSH_WRITE_BLOCKS = 1, ++ ++ /* flush is called for commit */ ++ JNODE_FLUSH_COMMIT = 2, ++ /* not implemented */ ++ JNODE_FLUSH_MEMORY_FORMATTED = 4, ++ ++ /* not implemented */ ++ JNODE_FLUSH_MEMORY_UNFORMATTED = 8, ++} jnode_flush_flags; ++ ++/* Flags to insert/paste carry operations. Currently they only used in ++ flushing code, but in future, they can be used to optimize for repetitive ++ accesses. 
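[Aside: the round_up() macro defined earlier in this header is the classic mask trick, valid only when the alignment is a power of two. A quick standalone check (gcc, since the macro relies on the typeof extension):

#include <stdio.h>

/* Same shape as forward.h's round_up(): add (order - 1), then clear the
 * low bits.  Correct only for power-of-two 'order'. */
#define round_up(value, order) \
	((typeof(value))(((long)(value) + (order) - 1U) & ~((order) - 1)))

int main(void)
{
	printf("%ld\n", round_up(5L, 8));   /* prints 8  */
	printf("%ld\n", round_up(16L, 8));  /* prints 16 */
	printf("%ld\n", round_up(17L, 8));  /* prints 24 */
	return 0;
}
]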
*/ ++typedef enum { ++ /* carry is not allowed to shift data to the left when trying to find ++ free space */ ++ COPI_DONT_SHIFT_LEFT = (1 << 0), ++ /* carry is not allowed to shift data to the right when trying to find ++ free space */ ++ COPI_DONT_SHIFT_RIGHT = (1 << 1), ++ /* carry is not allowed to allocate new node(s) when trying to find ++ free space */ ++ COPI_DONT_ALLOCATE = (1 << 2), ++ /* try to load left neighbor if its not in a cache */ ++ COPI_LOAD_LEFT = (1 << 3), ++ /* try to load right neighbor if its not in a cache */ ++ COPI_LOAD_RIGHT = (1 << 4), ++ /* shift insertion point to the left neighbor */ ++ COPI_GO_LEFT = (1 << 5), ++ /* shift insertion point to the right neighbor */ ++ COPI_GO_RIGHT = (1 << 6), ++ /* try to step back into original node if insertion into new node ++ fails after shifting data there. */ ++ COPI_STEP_BACK = (1 << 7) ++} cop_insert_flag; ++ ++typedef enum { ++ SAFE_UNLINK, /* safe-link for unlink */ ++ SAFE_TRUNCATE /* safe-link for truncate */ ++} reiser4_safe_link_t; ++ ++/* this is to show on which list of atom jnode is */ ++typedef enum { ++ NOT_CAPTURED, ++ DIRTY_LIST, ++ CLEAN_LIST, ++ FQ_LIST, ++ WB_LIST, ++ OVRWR_LIST ++} atom_list; ++ ++ ++ ++/* __REISER4_FORWARD_H__ */ ++#endif ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +Index: linux-2.6.16/fs/reiser4/fsdata.c +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/fsdata.c +@@ -0,0 +1,803 @@ ++/* Copyright 2001, 2002, 2003, 2004, 2005 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++#include "fsdata.h" ++#include "inode.h" ++ ++ ++/* cache or dir_cursors */ ++static kmem_cache_t *d_cursor_cache; ++static struct shrinker *d_cursor_shrinker; ++ ++/* list of unused cursors */ ++static LIST_HEAD(cursor_cache); ++ ++/* number of cursors in list of ununsed cursors */ ++static unsigned long d_cursor_unused = 0; ++ ++/* spinlock protecting manipulations with dir_cursor's hash table and lists */ ++DEFINE_SPINLOCK(d_lock); ++ ++static reiser4_file_fsdata *create_fsdata(struct file *file); ++static int file_is_stateless(struct file *file); ++static void free_fsdata(reiser4_file_fsdata *fsdata); ++static void kill_cursor(dir_cursor *); ++ ++/** ++ * d_cursor_shrink - shrink callback for cache of dir_cursor-s ++ * @nr: number of objects to free ++ * @mask: GFP mask ++ * ++ * Shrinks d_cursor_cache. Scan LRU list of unused cursors, freeing requested ++ * number. Return number of still freeable cursors. ++ */ ++static int d_cursor_shrink(int nr, gfp_t mask) ++{ ++ if (nr != 0) { ++ dir_cursor *scan; ++ int killed; ++ ++ killed = 0; ++ spin_lock(&d_lock); ++ while (!list_empty(&cursor_cache)) { ++ scan = list_entry(cursor_cache.next, dir_cursor, alist); ++ assert("nikita-3567", scan->ref == 0); ++ kill_cursor(scan); ++ ++killed; ++ --nr; ++ if (nr == 0) ++ break; ++ } ++ spin_unlock(&d_lock); ++ } ++ return d_cursor_unused; ++} ++ ++/** ++ * init_d_cursor - create d_cursor cache ++ * ++ * Initializes slab cache of d_cursors. It is part of reiser4 module ++ * initialization. ++ */ ++int init_d_cursor(void) ++{ ++ d_cursor_cache = kmem_cache_create("d_cursor", sizeof(dir_cursor), 0, ++ SLAB_HWCACHE_ALIGN, NULL, NULL); ++ if (d_cursor_cache == NULL) ++ return RETERR(-ENOMEM); ++ ++ /* ++ * actually, d_cursors are "priceless", because there is no way to ++ * recover information stored in them. 
On the other hand, we don't ++ * want to consume all kernel memory by them. As a compromise, just ++ * assign higher "seeks" value to d_cursor cache, so that it will be ++ * shrunk only if system is really tight on memory. ++ */ ++ d_cursor_shrinker = set_shrinker(DEFAULT_SEEKS << 3, ++ d_cursor_shrink); ++ if (d_cursor_shrinker == NULL) { ++ destroy_reiser4_cache(&d_cursor_cache); ++ d_cursor_cache = NULL; ++ return RETERR(-ENOMEM); ++ } ++ return 0; ++} ++ ++/** ++ * done_d_cursor - delete d_cursor cache and d_cursor shrinker ++ * ++ * This is called on reiser4 module unloading or system shutdown. ++ */ ++void done_d_cursor(void) ++{ ++ BUG_ON(d_cursor_shrinker == NULL); ++ remove_shrinker(d_cursor_shrinker); ++ d_cursor_shrinker = NULL; ++ ++ destroy_reiser4_cache(&d_cursor_cache); ++} ++ ++#define D_CURSOR_TABLE_SIZE (256) ++ ++static inline unsigned long ++d_cursor_hash(d_cursor_hash_table *table, const d_cursor_key *key) ++{ ++ assert("nikita-3555", IS_POW(D_CURSOR_TABLE_SIZE)); ++ return (key->oid + key->cid) & (D_CURSOR_TABLE_SIZE - 1); ++} ++ ++static inline int d_cursor_eq(const d_cursor_key *k1, const d_cursor_key *k2) ++{ ++ return k1->cid == k2->cid && k1->oid == k2->oid; ++} ++ ++/* ++ * define functions to manipulate reiser4 super block's hash table of ++ * dir_cursors ++ */ ++#define KMALLOC(size) kmalloc((size), get_gfp_mask()) ++#define KFREE(ptr, size) kfree(ptr) ++TYPE_SAFE_HASH_DEFINE(d_cursor, ++ dir_cursor, ++ d_cursor_key, key, hash, d_cursor_hash, d_cursor_eq); ++#undef KFREE ++#undef KMALLOC ++ ++/** ++ * init_super_d_info - initialize per-super-block d_cursor resources ++ * @super: super block to initialize ++ * ++ * Initializes per-super-block d_cursor's hash table and radix tree. It is part ++ * of mount. ++ */ ++int init_super_d_info(struct super_block *super) ++{ ++ d_cursor_info *p; ++ ++ p = &get_super_private(super)->d_info; ++ ++ INIT_RADIX_TREE(&p->tree, get_gfp_mask()); ++ return d_cursor_hash_init(&p->table, D_CURSOR_TABLE_SIZE); ++} ++ ++/** ++ * done_super_d_info - release per-super-block d_cursor resources ++ * @super: super block being umounted ++ * ++ * It is called on umount. Kills all directory cursors attached to suoer block. ++ */ ++void done_super_d_info(struct super_block *super) ++{ ++ d_cursor_info *d_info; ++ dir_cursor *cursor, *next; ++ ++ d_info = &get_super_private(super)->d_info; ++ for_all_in_htable(&d_info->table, d_cursor, cursor, next) ++ kill_cursor(cursor); ++ ++ BUG_ON(d_info->tree.rnode != NULL); ++ d_cursor_hash_done(&d_info->table); ++} ++ ++/** ++ * kill_cursor - free dir_cursor and reiser4_file_fsdata attached to it ++ * @cursor: cursor to free ++ * ++ * Removes reiser4_file_fsdata attached to @cursor from readdir list of ++ * reiser4_inode, frees that reiser4_file_fsdata. Removes @cursor from from ++ * indices, hash table, list of unused cursors and frees it. ++ */ ++static void kill_cursor(dir_cursor *cursor) ++{ ++ unsigned long index; ++ ++ assert("nikita-3566", cursor->ref == 0); ++ assert("nikita-3572", cursor->fsdata != NULL); ++ ++ index = (unsigned long)cursor->key.oid; ++ list_del_init(&cursor->fsdata->dir.linkage); ++ free_fsdata(cursor->fsdata); ++ cursor->fsdata = NULL; ++ ++ if (list_empty_careful(&cursor->list)) ++ /* this is last cursor for a file. Kill radix-tree entry */ ++ radix_tree_delete(&cursor->info->tree, index); ++ else { ++ void **slot; ++ ++ /* ++ * there are other cursors for the same oid. 
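[Aside: the branch that follows handles a subtle case — the radix tree stores only one cursor per object id, with all cursors for that oid chained on a circular list, so deleting the very cursor the tree points at requires re-targeting the slot first. A standalone reduction of that removal, with a plain pointer standing in for the radix-tree slot:

#include <stdio.h>

struct cursor {
	int id;
	struct cursor *next, *prev;
};

/* Remove @victim from the ring; if the external slot (*slotp) pointed at
 * it, re-target the slot to the next ring member, as kill_cursor() does. */
static void ring_remove(struct cursor **slotp, struct cursor *victim)
{
	if (victim->next == victim) {       /* last cursor for this oid */
		*slotp = NULL;              /* delete the "radix-tree" entry */
		return;
	}
	if (*slotp == victim)
		*slotp = victim->next;
	victim->prev->next = victim->next;
	victim->next->prev = victim->prev;
}

int main(void)
{
	struct cursor a = { 1 }, b = { 2 };
	struct cursor *slot = &a;

	a.next = &b; a.prev = &b;
	b.next = &a; b.prev = &a;

	ring_remove(&slot, &a);             /* slot must move over to b */
	printf("slot now points at cursor %d\n", slot->id);
	return 0;
}
]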
++ */ ++ ++ /* ++ * if radix tree point to the cursor being removed, re-target ++ * radix tree slot to the next cursor in the (non-empty as was ++ * checked above) element of the circular list of all cursors ++ * for this oid. ++ */ ++ slot = radix_tree_lookup_slot(&cursor->info->tree, index); ++ assert("nikita-3571", *slot != NULL); ++ if (*slot == cursor) ++ *slot = list_entry(cursor->list.next, dir_cursor, list); ++ /* remove cursor from circular list */ ++ list_del_init(&cursor->list); ++ } ++ /* remove cursor from the list of unused cursors */ ++ list_del_init(&cursor->alist); ++ /* remove cursor from the hash table */ ++ d_cursor_hash_remove(&cursor->info->table, cursor); ++ /* and free it */ ++ kmem_cache_free(d_cursor_cache, cursor); ++ --d_cursor_unused; ++} ++ ++/* possible actions that can be performed on all cursors for the given file */ ++enum cursor_action { ++ /* ++ * load all detached state: this is called when stat-data is loaded ++ * from the disk to recover information about all pending readdirs ++ */ ++ CURSOR_LOAD, ++ /* ++ * detach all state from inode, leaving it in the cache. This is called ++ * when inode is removed form the memory by memory pressure ++ */ ++ CURSOR_DISPOSE, ++ /* ++ * detach cursors from the inode, and free them. This is called when ++ * inode is destroyed ++ */ ++ CURSOR_KILL ++}; ++ ++/* ++ * return d_cursor data for the file system @inode is in. ++ */ ++static inline d_cursor_info *d_info(struct inode *inode) ++{ ++ return &get_super_private(inode->i_sb)->d_info; ++} ++ ++/* ++ * lookup d_cursor in the per-super-block radix tree. ++ */ ++static inline dir_cursor *lookup(d_cursor_info * info, unsigned long index) ++{ ++ return (dir_cursor *) radix_tree_lookup(&info->tree, index); ++} ++ ++/* ++ * attach @cursor to the radix tree. There may be multiple cursors for the ++ * same oid, they are chained into circular list. ++ */ ++static void bind_cursor(dir_cursor * cursor, unsigned long index) ++{ ++ dir_cursor *head; ++ ++ head = lookup(cursor->info, index); ++ if (head == NULL) { ++ /* this is the first cursor for this index */ ++ INIT_LIST_HEAD(&cursor->list); ++ radix_tree_insert(&cursor->info->tree, index, cursor); ++ } else { ++ /* some cursor already exists. Chain ours */ ++ list_add(&cursor->list, &head->list); ++ } ++} ++ ++/* ++ * detach fsdata (if detachable) from file descriptor, and put cursor on the ++ * "unused" list. Called when file descriptor is not longer in active use. ++ */ ++static void clean_fsdata(struct file *file) ++{ ++ dir_cursor *cursor; ++ reiser4_file_fsdata *fsdata; ++ ++ assert("nikita-3570", file_is_stateless(file)); ++ ++ fsdata = (reiser4_file_fsdata *) file->private_data; ++ if (fsdata != NULL) { ++ cursor = fsdata->cursor; ++ if (cursor != NULL) { ++ spin_lock(&d_lock); ++ --cursor->ref; ++ if (cursor->ref == 0) { ++ list_add_tail(&cursor->alist, &cursor_cache); ++ ++d_cursor_unused; ++ } ++ spin_unlock(&d_lock); ++ file->private_data = NULL; ++ } ++ } ++} ++ ++/* ++ * global counter used to generate "client ids". These ids are encoded into ++ * high bits of fpos. ++ */ ++static __u32 cid_counter = 0; ++#define CID_SHIFT (20) ++#define CID_MASK (0xfffffull) ++ ++static void free_file_fsdata_nolock(struct file *); ++ ++/** ++ * insert_cursor - allocate file_fsdata, insert cursor to tree and hash table ++ * @cursor: ++ * @file: ++ * @inode: ++ * ++ * Allocates reiser4_file_fsdata, attaches it to @cursor, inserts cursor to ++ * reiser4 super block's hash table and radix tree. 
++ add detachable readdir ++ * state to the @f ++ */ ++static int insert_cursor(dir_cursor *cursor, struct file *file, ++ struct inode *inode) ++{ ++ int result; ++ reiser4_file_fsdata *fsdata; ++ ++ memset(cursor, 0, sizeof *cursor); ++ ++ /* this is either first call to readdir, or rewind. Anyway, create new ++ * cursor. */ ++ fsdata = create_fsdata(NULL); ++ if (fsdata != NULL) { ++ result = radix_tree_preload(get_gfp_mask()); ++ if (result == 0) { ++ d_cursor_info *info; ++ oid_t oid; ++ ++ info = d_info(inode); ++ oid = get_inode_oid(inode); ++ /* cid occupies higher 12 bits of f->f_pos. Don't ++ * allow it to become negative: this confuses ++ * nfsd_readdir() */ ++ cursor->key.cid = (++cid_counter) & 0x7ff; ++ cursor->key.oid = oid; ++ cursor->fsdata = fsdata; ++ cursor->info = info; ++ cursor->ref = 1; ++ ++ spin_lock_inode(inode); ++ /* install cursor as @f's private_data, discarding old ++ * one if necessary */ ++#if REISER4_DEBUG ++ if (file->private_data) ++ warning("", "file has fsdata already"); ++#endif ++ clean_fsdata(file); ++ free_file_fsdata_nolock(file); ++ file->private_data = fsdata; ++ fsdata->cursor = cursor; ++ spin_unlock_inode(inode); ++ spin_lock(&d_lock); ++ /* insert cursor into hash table */ ++ d_cursor_hash_insert(&info->table, cursor); ++ /* and chain it into radix-tree */ ++ bind_cursor(cursor, (unsigned long)oid); ++ spin_unlock(&d_lock); ++ radix_tree_preload_end(); ++ file->f_pos = ((__u64) cursor->key.cid) << CID_SHIFT; ++ } ++ } else ++ result = RETERR(-ENOMEM); ++ return result; ++} ++ ++/** ++ * process_cursors - do action on each cursor attached to inode ++ * @inode: ++ * @act: action to do ++ * ++ * Finds all cursors of @inode in reiser4's super block radix tree of cursors ++ * and performs action specified by @act on each of cursors. 
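[Aside: the loop in the function body below walks a circular list whose current element may be killed during the visit, so it captures the next pointer before acting and stops once the element just handled was the last one. The same shape in a few lines of standalone C:

#include <stdio.h>

struct node { int id; struct node *next; };

/* Visit every node of a circular list exactly once, even though a visit
 * may remove the current node: grab 'next' first, then act. */
static void visit_all(struct node *start)
{
	struct node *scan = start;

	do {
		struct node *next = scan->next; /* capture before it can go */

		printf("cursor %d\n", scan->id);
		if (scan == next)
			break;                  /* it was the only node left */
		scan = next;
	} while (scan != start);
}

int main(void)
{
	struct node a = {1}, b = {2}, c = {3};

	a.next = &b; b.next = &c; c.next = &a;
	visit_all(&a);
	return 0;
}
]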
++ */ ++static void process_cursors(struct inode *inode, enum cursor_action act) ++{ ++ oid_t oid; ++ dir_cursor *start; ++ struct list_head *head; ++ reiser4_context *ctx; ++ d_cursor_info *info; ++ ++ /* this can be called by ++ * ++ * kswapd->...->prune_icache->..reiser4_destroy_inode ++ * ++ * without reiser4_context ++ */ ++ ctx = init_context(inode->i_sb); ++ if (IS_ERR(ctx)) { ++ warning("vs-23", "failed to init context"); ++ return; ++ } ++ ++ assert("nikita-3558", inode != NULL); ++ ++ info = d_info(inode); ++ oid = get_inode_oid(inode); ++ spin_lock_inode(inode); ++ head = get_readdir_list(inode); ++ spin_lock(&d_lock); ++ /* find any cursor for this oid: reference to it is hanging of radix ++ * tree */ ++ start = lookup(info, (unsigned long)oid); ++ if (start != NULL) { ++ dir_cursor *scan; ++ reiser4_file_fsdata *fsdata; ++ ++ /* process circular list of cursors for this oid */ ++ scan = start; ++ do { ++ dir_cursor *next; ++ ++ next = list_entry(scan->list.next, dir_cursor, list); ++ fsdata = scan->fsdata; ++ assert("nikita-3557", fsdata != NULL); ++ if (scan->key.oid == oid) { ++ switch (act) { ++ case CURSOR_DISPOSE: ++ list_del_init(&fsdata->dir.linkage); ++ break; ++ case CURSOR_LOAD: ++ list_add(&fsdata->dir.linkage, head); ++ break; ++ case CURSOR_KILL: ++ kill_cursor(scan); ++ break; ++ } ++ } ++ if (scan == next) ++ /* last cursor was just killed */ ++ break; ++ scan = next; ++ } while (scan != start); ++ } ++ spin_unlock(&d_lock); ++ /* check that we killed 'em all */ ++ assert("nikita-3568", ++ ergo(act == CURSOR_KILL, ++ list_empty_careful(get_readdir_list(inode)))); ++ assert("nikita-3569", ++ ergo(act == CURSOR_KILL, lookup(info, oid) == NULL)); ++ spin_unlock_inode(inode); ++ reiser4_exit_context(ctx); ++} ++ ++/** ++ * dispose_cursors - removes cursors from inode's list ++ * @inode: inode to dispose cursors of ++ * ++ * For each of cursors corresponding to @inode - removes reiser4_file_fsdata ++ * attached to cursor from inode's readdir list. This is called when inode is ++ * removed from the memory by memory pressure. ++ */ ++void dispose_cursors(struct inode *inode) ++{ ++ process_cursors(inode, CURSOR_DISPOSE); ++} ++ ++/** ++ * load_cursors - attach cursors to inode ++ * @inode: inode to load cursors to ++ * ++ * For each of cursors corresponding to @inode - attaches reiser4_file_fsdata ++ * attached to cursor to inode's readdir list. This is done when inode is ++ * loaded into memory. ++ */ ++void load_cursors(struct inode *inode) ++{ ++ process_cursors(inode, CURSOR_LOAD); ++} ++ ++/** ++ * kill_cursors - kill all inode cursors ++ * @inode: inode to kill cursors of ++ * ++ * Frees all cursors for this inode. This is called when inode is destroyed. ++ */ ++void kill_cursors(struct inode *inode) ++{ ++ process_cursors(inode, CURSOR_KILL); ++} ++ ++/** ++ * file_is_stateless - ++ * @file: ++ * ++ * true, if file descriptor @f is created by NFS server by "demand" to serve ++ * one file system operation. This means that there may be "detached state" ++ * for underlying inode. ++ */ ++static int file_is_stateless(struct file *file) ++{ ++ return reiser4_get_dentry_fsdata(file->f_dentry)->stateless; ++} ++ ++/** ++ * get_dir_fpos - ++ * @dir: ++ * ++ * Calculates ->fpos from user-supplied cookie. Normally it is dir->f_pos, but ++ * in the case of stateless directory operation (readdir-over-nfs), client id ++ * was encoded in the high bits of cookie and should me masked off. 
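[Aside: concretely, insert_cursor() stores the client id in the bits of ->f_pos above CID_SHIFT, and get_dir_fpos() recovers the true directory offset by masking it off with CID_MASK. The arithmetic in isolation, with the two constants copied from the patch and sample values invented for illustration:

#include <stdio.h>
#include <stdint.h>

#define CID_SHIFT 20
#define CID_MASK  0xfffffull

int main(void)
{
	uint64_t cid  = 0x2a;                     /* client id */
	uint64_t pos  = 12345;                    /* true directory offset */
	uint64_t fpos = (cid << CID_SHIFT) | pos; /* the cookie nfsd sees */

	printf("cid=%llu pos=%llu\n",
	       (unsigned long long)(fpos >> CID_SHIFT), /* recover client id */
	       (unsigned long long)(fpos & CID_MASK));  /* mask it off */
	return 0;
}
]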
++ */ ++loff_t get_dir_fpos(struct file *dir) ++{ ++ if (file_is_stateless(dir)) ++ return dir->f_pos & CID_MASK; ++ else ++ return dir->f_pos; ++} ++ ++/** ++ * try_to_attach_fsdata - ??? ++ * @file: ++ * @inode: ++ * ++ * Finds or creates cursor for readdir-over-nfs. ++ */ ++int try_to_attach_fsdata(struct file *file, struct inode *inode) ++{ ++ loff_t pos; ++ int result; ++ dir_cursor *cursor; ++ ++ /* ++ * we are serialized by inode->i_mutex ++ */ ++ if (!file_is_stateless(file)) ++ return 0; ++ ++ pos = file->f_pos; ++ result = 0; ++ if (pos == 0) { ++ /* ++ * first call to readdir (or rewind to the beginning of ++ * directory) ++ */ ++ cursor = kmem_cache_alloc(d_cursor_cache, get_gfp_mask()); ++ if (cursor != NULL) ++ result = insert_cursor(cursor, file, inode); ++ else ++ result = RETERR(-ENOMEM); ++ } else { ++ /* try to find existing cursor */ ++ d_cursor_key key; ++ ++ key.cid = pos >> CID_SHIFT; ++ key.oid = get_inode_oid(inode); ++ spin_lock(&d_lock); ++ cursor = d_cursor_hash_find(&d_info(inode)->table, &key); ++ if (cursor != NULL) { ++ /* cursor was found */ ++ if (cursor->ref == 0) { ++ /* move it from unused list */ ++ list_del_init(&cursor->alist); ++ --d_cursor_unused; ++ } ++ ++cursor->ref; ++ } ++ spin_unlock(&d_lock); ++ if (cursor != NULL) { ++ spin_lock_inode(inode); ++ assert("nikita-3556", cursor->fsdata->back == NULL); ++ clean_fsdata(file); ++ free_file_fsdata_nolock(file); ++ file->private_data = cursor->fsdata; ++ spin_unlock_inode(inode); ++ } ++ } ++ return result; ++} ++ ++/** ++ * detach_fsdata - ??? ++ * @file: ++ * ++ * detach fsdata, if necessary ++ */ ++void detach_fsdata(struct file *file) ++{ ++ struct inode *inode; ++ ++ if (!file_is_stateless(file)) ++ return; ++ ++ inode = file->f_dentry->d_inode; ++ spin_lock_inode(inode); ++ clean_fsdata(file); ++ spin_unlock_inode(inode); ++} ++ ++/* slab for reiser4_dentry_fsdata */ ++static kmem_cache_t *dentry_fsdata_cache; ++ ++/** ++ * init_dentry_fsdata - create cache of dentry_fsdata ++ * ++ * Initializes slab cache of structures attached to denty->d_fsdata. It is ++ * part of reiser4 module initialization. ++ */ ++int init_dentry_fsdata(void) ++{ ++ dentry_fsdata_cache = kmem_cache_create("dentry_fsdata", ++ sizeof(reiser4_dentry_fsdata), ++ 0, ++ SLAB_HWCACHE_ALIGN | ++ SLAB_RECLAIM_ACCOUNT, NULL, ++ NULL); ++ if (dentry_fsdata_cache == NULL) ++ return RETERR(-ENOMEM); ++ return 0; ++} ++ ++/** ++ * done_dentry_fsdata - delete cache of dentry_fsdata ++ * ++ * This is called on reiser4 module unloading or system shutdown. ++ */ ++void done_dentry_fsdata(void) ++{ ++ destroy_reiser4_cache(&dentry_fsdata_cache); ++} ++ ++/** ++ * reiser4_get_dentry_fsdata - get fs-specific dentry data ++ * @dentry: queried dentry ++ * ++ * Allocates if necessary and returns per-dentry data that we attach to each ++ * dentry. 
++ */ ++reiser4_dentry_fsdata *reiser4_get_dentry_fsdata(struct dentry *dentry) ++{ ++ assert("nikita-1365", dentry != NULL); ++ ++ if (dentry->d_fsdata == NULL) { ++ dentry->d_fsdata = kmem_cache_alloc(dentry_fsdata_cache, ++ get_gfp_mask()); ++ if (dentry->d_fsdata == NULL) ++ return ERR_PTR(RETERR(-ENOMEM)); ++ memset(dentry->d_fsdata, 0, sizeof(reiser4_dentry_fsdata)); ++ } ++ return dentry->d_fsdata; ++} ++ ++/** ++ * reiser4_free_dentry_fsdata - detach and free dentry_fsdata ++ * @dentry: dentry to free fsdata of ++ * ++ * Detaches and frees fs-specific dentry data ++ */ ++void reiser4_free_dentry_fsdata(struct dentry *dentry) ++{ ++ if (dentry->d_fsdata != NULL) { ++ kmem_cache_free(dentry_fsdata_cache, dentry->d_fsdata); ++ dentry->d_fsdata = NULL; ++ } ++} ++ ++ ++/* slab for reiser4_file_fsdata */ ++static kmem_cache_t *file_fsdata_cache; ++ ++/** ++ * init_file_fsdata - create cache of reiser4_file_fsdata ++ * ++ * Initializes slab cache of structures attached to file->private_data. It is ++ * part of reiser4 module initialization. ++ */ ++int init_file_fsdata(void) ++{ ++ file_fsdata_cache = kmem_cache_create("file_fsdata", ++ sizeof(reiser4_file_fsdata), ++ 0, ++ SLAB_HWCACHE_ALIGN | ++ SLAB_RECLAIM_ACCOUNT, NULL, NULL); ++ if (file_fsdata_cache == NULL) ++ return RETERR(-ENOMEM); ++ return 0; ++} ++ ++/** ++ * done_file_fsdata - delete cache of reiser4_file_fsdata ++ * ++ * This is called on reiser4 module unloading or system shutdown. ++ */ ++void done_file_fsdata(void) ++{ ++ destroy_reiser4_cache(&file_fsdata_cache); ++} ++ ++/** ++ * create_fsdata - allocate and initialize reiser4_file_fsdata ++ * @file: what to create file_fsdata for, may be NULL ++ * ++ * Allocates and initializes reiser4_file_fsdata structure. ++ */ ++static reiser4_file_fsdata *create_fsdata(struct file *file) ++{ ++ reiser4_file_fsdata *fsdata; ++ ++ fsdata = kmem_cache_alloc(file_fsdata_cache, get_gfp_mask()); ++ if (fsdata != NULL) { ++ memset(fsdata, 0, sizeof *fsdata); ++ fsdata->ra1.max_window_size = VM_MAX_READAHEAD * 1024; ++ fsdata->back = file; ++ INIT_LIST_HEAD(&fsdata->dir.linkage); ++ } ++ return fsdata; ++} ++ ++/** ++ * free_fsdata - free reiser4_file_fsdata ++ * @fsdata: object to free ++ * ++ * Dual to create_fsdata(). Free reiser4_file_fsdata. ++ */ ++static void free_fsdata(reiser4_file_fsdata *fsdata) ++{ ++ BUG_ON(fsdata == NULL); ++ kmem_cache_free(file_fsdata_cache, fsdata); ++} ++ ++/** ++ * reiser4_get_file_fsdata - get fs-specific file data ++ * @file: queried file ++ * ++ * Returns fs-specific data of @file. If it is NULL, allocates it and attaches ++ * to @file. 
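[Aside: reiser4_get_file_fsdata(), defined a little further on, resolves the race between two concurrent openers in the usual way — allocate optimistically without the lock, re-check under the inode spinlock, and let the loser free its copy. A userspace sketch of that shape, with a mutex standing in for the inode spinlock; like the kernel version, the first NULL check is deliberately done without the lock:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct fsdata { int dummy; };

static pthread_mutex_t inode_lock = PTHREAD_MUTEX_INITIALIZER;
static struct fsdata *private_data;    /* models file->private_data */

static struct fsdata *get_fsdata(void)
{
	if (private_data == NULL) {
		struct fsdata *fresh = calloc(1, sizeof(*fresh));

		if (fresh == NULL)
			return NULL;
		pthread_mutex_lock(&inode_lock);
		if (private_data == NULL) { /* still unset: we won the race */
			private_data = fresh;
			fresh = NULL;
		}
		pthread_mutex_unlock(&inode_lock);
		free(fresh);                /* no-op for the winner */
	}
	return private_data;
}

int main(void)
{
	printf("fsdata at %p\n", (void *)get_fsdata());
	free(private_data);
	return 0;
}
]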
++ */ ++reiser4_file_fsdata *reiser4_get_file_fsdata(struct file *file) ++{ ++ assert("nikita-1603", file != NULL); ++ ++ if (file->private_data == NULL) { ++ reiser4_file_fsdata *fsdata; ++ struct inode *inode; ++ ++ fsdata = create_fsdata(file); ++ if (fsdata == NULL) ++ return ERR_PTR(RETERR(-ENOMEM)); ++ ++ inode = file->f_dentry->d_inode; ++ spin_lock_inode(inode); ++ if (file->private_data == NULL) { ++ file->private_data = fsdata; ++ fsdata = NULL; ++ } ++ spin_unlock_inode(inode); ++ if (fsdata != NULL) ++ /* other thread initialized ->fsdata */ ++ kmem_cache_free(file_fsdata_cache, fsdata); ++ } ++ assert("nikita-2665", file->private_data != NULL); ++ return file->private_data; ++} ++ ++/** ++ * free_file_fsdata_nolock - detach and free reiser4_file_fsdata ++ * @file: ++ * ++ * Detaches reiser4_file_fsdata from @file, removes reiser4_file_fsdata from ++ * readdir list, frees if it is not linked to d_cursor object. ++ */ ++static void free_file_fsdata_nolock(struct file *file) ++{ ++ reiser4_file_fsdata *fsdata; ++ ++ assert("", spin_inode_is_locked(file->f_dentry->d_inode)); ++ fsdata = file->private_data; ++ if (fsdata != NULL) { ++ list_del_init(&fsdata->dir.linkage); ++ if (fsdata->cursor == NULL) ++ free_fsdata(fsdata); ++ } ++ file->private_data = NULL; ++} ++ ++/** ++ * reiser4_free_file_fsdata - detach from struct file and free reiser4_file_fsdata ++ * @file: ++ * ++ * Spinlocks inode and calls free_file_fsdata_nolock to do the work. ++ */ ++void reiser4_free_file_fsdata(struct file *file) ++{ ++ spin_lock_inode(file->f_dentry->d_inode); ++ free_file_fsdata_nolock(file); ++ spin_unlock_inode(file->f_dentry->d_inode); ++} ++ ++/* ++ * Local variables: ++ * c-indentation-style: "K&R" ++ * mode-name: "LC" ++ * c-basic-offset: 8 ++ * tab-width: 8 ++ * fill-column: 79 ++ * End: ++ */ +Index: linux-2.6.16/fs/reiser4/fsdata.h +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/fsdata.h +@@ -0,0 +1,218 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++#if !defined( __REISER4_FSDATA_H__ ) ++#define __REISER4_FSDATA_H__ ++ ++#include "debug.h" ++#include "kassign.h" ++#include "seal.h" ++#include "type_safe_hash.h" ++#include "plugin/file/file.h" ++#include "readahead.h" ++ ++/* ++ * comment about reiser4_dentry_fsdata ++ * ++ * ++ */ ++ ++/* ++ * locking: fields of per file descriptor readdir_pos and ->f_pos are ++ * protected by ->i_mutex on inode. Under this lock following invariant ++ * holds: ++ * ++ * file descriptor is "looking" at the entry_no-th directory entry from ++ * the beginning of directory. This entry has key dir_entry_key and is ++ * pos-th entry with duplicate-key sequence. ++ * ++ */ ++ ++/* logical position within directory */ ++typedef struct { ++ /* key of directory entry (actually, part of a key sufficient to ++ identify directory entry) */ ++ de_id dir_entry_key; ++ /* ordinal number of directory entry among all entries with the same ++ key. (Starting from 0.) 
*/ ++ unsigned pos; ++} dir_pos; ++ ++typedef struct { ++ /* f_pos corresponding to this readdir position */ ++ __u64 fpos; ++ /* logical position within directory */ ++ dir_pos position; ++ /* logical number of directory entry within ++ directory */ ++ __u64 entry_no; ++} readdir_pos; ++ ++/* ++ * this is used to speed up lookups for directory entry: on initial call to ++ * ->lookup() seal and coord of directory entry (if found, that is) are stored ++ * in struct dentry and reused later to avoid tree traversals. ++ */ ++typedef struct de_location { ++ /* seal covering directory entry */ ++ seal_t entry_seal; ++ /* coord of directory entry */ ++ coord_t entry_coord; ++ /* ordinal number of directory entry among all entries with the same ++ key. (Starting from 0.) */ ++ int pos; ++} de_location; ++ ++/** ++ * reiser4_dentry_fsdata - reiser4-specific data attached to dentries ++ * ++ * This is allocated dynamically and released in d_op->d_release() ++ * ++ * Currently it only contains cached location (hint) of directory entry, but ++ * it is expected that other information will be accumulated here. ++ */ ++typedef struct reiser4_dentry_fsdata { ++ /* ++ * here will go fields filled by ->lookup() to speedup next ++ * create/unlink, like blocknr of znode with stat-data, or key of ++ * stat-data. ++ */ ++ de_location dec; ++ int stateless; /* created through reiser4_decode_fh, needs special ++ * treatment in readdir. */ ++} reiser4_dentry_fsdata; ++ ++extern int init_dentry_fsdata(void); ++extern void done_dentry_fsdata(void); ++extern reiser4_dentry_fsdata *reiser4_get_dentry_fsdata(struct dentry *); ++extern void reiser4_free_dentry_fsdata(struct dentry *dentry); ++ ++ ++/** ++ * reiser4_file_fsdata - reiser4-specific data attached to file->private_data ++ * ++ * This is allocated dynamically and released in inode->i_fop->release ++ */ ++typedef struct reiser4_file_fsdata { ++ /* ++ * pointer back to the struct file which this reiser4_file_fsdata is ++ * part of ++ */ ++ struct file *back; ++ /* detached cursor for stateless readdir. */ ++ struct dir_cursor *cursor; ++ /* ++ * We need both directory and regular file parts here, because there ++ * are file system objects that are files and directories. ++ */ ++ struct { ++ /* ++ * position in directory. It is updated each time directory is ++ * modified ++ */ ++ readdir_pos readdir; ++ /* head of this list is reiser4_inode->lists.readdir_list */ ++ struct list_head linkage; ++ } dir; ++ /* hints to speed up operations with regular files: read and write. */ ++ struct { ++ hint_t hint; ++ } reg; ++ /* */ ++ struct { ++ /* this is called by reiser4_readpages if set */ ++ void (*readpages) (struct address_space *, ++ struct list_head * pages, void *data); ++ /* reiser4_readpaextended coord. It is set by read_extent before ++ calling page_cache_readahead */ ++ void *data; ++ } ra2; ++ struct reiser4_file_ra_state ra1; ++ ++} reiser4_file_fsdata; ++ ++extern int init_file_fsdata(void); ++extern void done_file_fsdata(void); ++extern reiser4_file_fsdata *reiser4_get_file_fsdata(struct file *); ++extern void reiser4_free_file_fsdata(struct file *); ++ ++ ++/* ++ * d_cursor is reiser4_file_fsdata not attached to struct file. d_cursors are ++ * used to address problem reiser4 has with readdir accesses via NFS. See ++ * plugin/file_ops_readdir.c for more details. 
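[Aside: the d_cursor_key declared just below is hashed in fsdata.c by folding oid and cid together and masking with the table size, which is why D_CURSOR_TABLE_SIZE must stay a power of two. The same computation as a standalone snippet:

#include <stdio.h>
#include <stdint.h>

#define D_CURSOR_TABLE_SIZE 256        /* must remain a power of two */

struct d_cursor_key { uint16_t cid; uint64_t oid; };

/* Same shape as fsdata.c's d_cursor_hash(): fold both key halves, then
 * mask instead of dividing — valid only for power-of-two table sizes. */
static unsigned long d_cursor_hash(const struct d_cursor_key *key)
{
	return (key->oid + key->cid) & (D_CURSOR_TABLE_SIZE - 1);
}

int main(void)
{
	struct d_cursor_key k = { .cid = 42, .oid = 1000 };

	printf("bucket %lu of %d\n", d_cursor_hash(&k), D_CURSOR_TABLE_SIZE);
	return 0;
}
]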
++ */ ++typedef struct { ++ __u16 cid; ++ __u64 oid; ++} d_cursor_key; ++ ++/* ++ * define structures d_cursor_hash_table d_cursor_hash_link which are used to ++ * maintain hash table of dir_cursor-s in reiser4's super block ++ */ ++typedef struct dir_cursor dir_cursor; ++TYPE_SAFE_HASH_DECLARE(d_cursor, dir_cursor); ++ ++typedef struct d_cursor_info d_cursor_info; ++ ++struct dir_cursor { ++ int ref; ++ reiser4_file_fsdata *fsdata; ++ ++ /* link to reiser4 super block hash table of cursors */ ++ d_cursor_hash_link hash; ++ ++ /* ++ * this is to link cursors to reiser4 super block's radix tree of ++ * cursors if there are more than one cursor of the same objectid ++ */ ++ struct list_head list; ++ d_cursor_key key; ++ d_cursor_info *info; ++ /* list of unused cursors */ ++ struct list_head alist; ++}; ++ ++extern int init_d_cursor(void); ++extern void done_d_cursor(void); ++ ++extern int init_super_d_info(struct super_block *); ++extern void done_super_d_info(struct super_block *); ++ ++extern loff_t get_dir_fpos(struct file *); ++extern int try_to_attach_fsdata(struct file *, struct inode *); ++extern void detach_fsdata(struct file *); ++ ++ ++/* these are needed for "stateless" readdir. See plugin/file_ops_readdir.c for ++ more details */ ++void dispose_cursors(struct inode *inode); ++void load_cursors(struct inode *inode); ++void kill_cursors(struct inode *inode); ++void adjust_dir_file(struct inode *dir, const struct dentry *de, int offset, int adj); ++ ++/* ++ * this structure is embedded to reise4_super_info_data. It maintains d_cursors ++ * (detached readdir state). See plugin/file_ops_readdir.c for more details. ++ */ ++struct d_cursor_info { ++ d_cursor_hash_table table; ++ struct radix_tree_root tree; ++}; ++ ++/* spinlock protecting readdir cursors */ ++extern spinlock_t d_lock; ++ ++/* __REISER4_FSDATA_H__ */ ++#endif ++ ++/* ++ * Local variables: ++ * c-indentation-style: "K&R" ++ * mode-name: "LC" ++ * c-basic-offset: 8 ++ * tab-width: 8 ++ * fill-column: 120 ++ * End: ++ */ +Index: linux-2.6.16/fs/reiser4/init_super.c +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/init_super.c +@@ -0,0 +1,739 @@ ++/* Copyright by Hans Reiser, 2003 */ ++ ++#include "super.h" ++#include "inode.h" ++#include "plugin/plugin_set.h" ++ ++#include ++ ++ ++/** ++ * init_fs_info - allocate reiser4 specific super block ++ * @super: super block of filesystem ++ * ++ * Allocates and initialize reiser4_super_info_data, attaches it to ++ * super->s_fs_info, initializes structures maintaining d_cursor-s. ++ */ ++int init_fs_info(struct super_block *super) ++{ ++ reiser4_super_info_data *sbinfo; ++ ++ sbinfo = kmalloc(sizeof(reiser4_super_info_data), get_gfp_mask()); ++ if (!sbinfo) ++ return RETERR(-ENOMEM); ++ ++ super->s_fs_info = sbinfo; ++ super->s_op = NULL; ++ memset(sbinfo, 0, sizeof(*sbinfo)); ++ ++ ON_DEBUG(INIT_LIST_HEAD(&sbinfo->all_jnodes)); ++ ON_DEBUG(spin_lock_init(&sbinfo->all_guard)); ++ ++ sema_init(&sbinfo->delete_sema, 1); ++ sema_init(&sbinfo->flush_sema, 1); ++ spin_lock_init(&(sbinfo->guard)); ++ ++ /* initialize per-super-block d_cursor resources */ ++ init_super_d_info(super); ++ ++ return 0; ++} ++ ++/** ++ * done_fs_info - free reiser4 specific super block ++ * @super: super block of filesystem ++ * ++ * Performs some sanity checks, releases structures maintaining d_cursor-s, ++ * frees reiser4_super_info_data. 
++ */ ++void done_fs_info(struct super_block *super) ++{ ++ assert("zam-990", super->s_fs_info != NULL); ++ ++ /* release per-super-block d_cursor resources */ ++ done_super_d_info(super); ++ ++ /* make sure that there are not jnodes already */ ++ assert("", list_empty(&get_super_private(super)->all_jnodes)); ++ assert("", get_current_context()->trans->atom == NULL); ++ check_block_counters(super); ++ kfree(super->s_fs_info); ++ super->s_fs_info = NULL; ++} ++ ++/* type of option parseable by parse_option() */ ++typedef enum { ++ /* value of option is arbitrary string */ ++ OPT_STRING, ++ ++ /* ++ * option specifies bit in a bitmask. When option is set - bit in ++ * sbinfo->fs_flags is set. Examples are bsdgroups, 32bittimes, mtflush, ++ * dont_load_bitmap, atomic_write. ++ */ ++ OPT_BIT, ++ ++ /* ++ * value of option should conform to sprintf() format. Examples are ++ * tmgr.atom_max_size=N, tmgr.atom_max_age=N ++ */ ++ OPT_FORMAT, ++ ++ /* ++ * option can take one of predefined values. Example is onerror=panic or ++ * onerror=remount-ro ++ */ ++ OPT_ONEOF, ++} opt_type_t; ++ ++typedef struct opt_bitmask_bit { ++ const char *bit_name; ++ int bit_nr; ++} opt_bitmask_bit; ++ ++/* description of option parseable by parse_option() */ ++typedef struct opt_desc { ++ /* option name. ++ ++ parsed portion of string has a form "name=value". ++ */ ++ const char *name; ++ /* type of option */ ++ opt_type_t type; ++ union { ++ /* where to store value of string option (type == OPT_STRING) */ ++ char **string; ++ /* description of bits for bit option (type == OPT_BIT) */ ++ struct { ++ int nr; ++ void *addr; ++ } bit; ++ /* description of format and targets for format option (type ++ == OPT_FORMAT) */ ++ struct { ++ const char *format; ++ int nr_args; ++ void *arg1; ++ void *arg2; ++ void *arg3; ++ void *arg4; ++ } f; ++ struct { ++ int *result; ++ const char *list[10]; ++ } oneof; ++ struct { ++ void *addr; ++ int nr_bits; ++ opt_bitmask_bit *bits; ++ } bitmask; ++ } u; ++} opt_desc_t; ++ ++/** ++ * parse_option - parse one option ++ * @opt_strin: starting point of parsing ++ * @opt: option description ++ * ++ * foo=bar, ++ * ^ ^ ^ ++ * | | +-- replaced to '\0' ++ * | +-- val_start ++ * +-- opt_string ++ * Figures out option type and handles option correspondingly. ++ */ ++static int parse_option(char *opt_string, opt_desc_t *opt) ++{ ++ char *val_start; ++ int result; ++ const char *err_msg; ++ ++ /* NOTE-NIKITA think about using lib/cmdline.c functions here. 
*/ ++ ++ val_start = strchr(opt_string, '='); ++ if (val_start != NULL) { ++ *val_start = '\0'; ++ ++val_start; ++ } ++ ++ err_msg = NULL; ++ result = 0; ++ switch (opt->type) { ++ case OPT_STRING: ++ if (val_start == NULL) { ++ err_msg = "String arg missing"; ++ result = RETERR(-EINVAL); ++ } else ++ *opt->u.string = val_start; ++ break; ++ case OPT_BIT: ++ if (val_start != NULL) ++ err_msg = "Value ignored"; ++ else ++ set_bit(opt->u.bit.nr, opt->u.bit.addr); ++ break; ++ case OPT_FORMAT: ++ if (val_start == NULL) { ++ err_msg = "Formatted arg missing"; ++ result = RETERR(-EINVAL); ++ break; ++ } ++ if (sscanf(val_start, opt->u.f.format, ++ opt->u.f.arg1, opt->u.f.arg2, opt->u.f.arg3, ++ opt->u.f.arg4) != opt->u.f.nr_args) { ++ err_msg = "Wrong conversion"; ++ result = RETERR(-EINVAL); ++ } ++ break; ++ case OPT_ONEOF: ++ { ++ int i = 0; ++ ++ if (val_start == NULL) { ++ err_msg = "Value is missing"; ++ result = RETERR(-EINVAL); ++ break; ++ } ++ err_msg = "Wrong option value"; ++ result = RETERR(-EINVAL); ++ while (opt->u.oneof.list[i]) { ++ if (!strcmp(opt->u.oneof.list[i], val_start)) { ++ result = 0; ++ err_msg = NULL; ++ *opt->u.oneof.result = i; ++ break; ++ } ++ i++; ++ } ++ break; ++ } ++ default: ++ wrong_return_value("nikita-2100", "opt -> type"); ++ break; ++ } ++ if (err_msg != NULL) { ++ warning("nikita-2496", "%s when parsing option \"%s%s%s\"", ++ err_msg, opt->name, val_start ? "=" : "", ++ val_start ? : ""); ++ } ++ return result; ++} ++ ++/** ++ * parse_options - parse reiser4 mount options ++ * @opt_string: starting point ++ * @opts: array of option description ++ * @nr_opts: number of elements in @opts ++ * ++ * Parses comma separated list of reiser4 mount options. ++ */ ++static int parse_options(char *opt_string, opt_desc_t *opts, int nr_opts) ++{ ++ int result; ++ ++ result = 0; ++ while ((result == 0) && opt_string && *opt_string) { ++ int j; ++ char *next; ++ ++ next = strchr(opt_string, ','); ++ if (next != NULL) { ++ *next = '\0'; ++ ++next; ++ } ++ for (j = 0; j < nr_opts; ++j) { ++ if (!strncmp(opt_string, opts[j].name, ++ strlen(opts[j].name))) { ++ result = parse_option(opt_string, &opts[j]); ++ break; ++ } ++ } ++ if (j == nr_opts) { ++ warning("nikita-2307", "Unrecognized option: \"%s\"", ++ opt_string); ++ /* traditionally, -EINVAL is returned on wrong mount ++ option */ ++ result = RETERR(-EINVAL); ++ } ++ opt_string = next; ++ } ++ return result; ++} ++ ++#define NUM_OPT( label, fmt, addr ) \ ++ { \ ++ .name = ( label ), \ ++ .type = OPT_FORMAT, \ ++ .u = { \ ++ .f = { \ ++ .format = ( fmt ), \ ++ .nr_args = 1, \ ++ .arg1 = ( addr ), \ ++ .arg2 = NULL, \ ++ .arg3 = NULL, \ ++ .arg4 = NULL \ ++ } \ ++ } \ ++ } ++ ++#define SB_FIELD_OPT( field, fmt ) NUM_OPT( #field, fmt, &sbinfo -> field ) ++ ++#define BIT_OPT(label, bitnr) \ ++ { \ ++ .name = label, \ ++ .type = OPT_BIT, \ ++ .u = { \ ++ .bit = { \ ++ .nr = bitnr, \ ++ .addr = &sbinfo->fs_flags \ ++ } \ ++ } \ ++ } ++ ++#define MAX_NR_OPTIONS (30) ++ ++/** ++ * init_super_data - initialize reiser4 private super block ++ * @super: super block to initialize ++ * @opt_string: list of reiser4 mount options ++ * ++ * Sets various reiser4 parameters to default values. Parses mount options and ++ * overwrites default settings. 
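[Aside: init_super_data() drives the table-driven parser defined above — parse_options() splits the comma-separated mount string in place, parse_option() splits each token at '=', and the option descriptor decides how the value is interpreted. A userspace reduction with just two option kinds, a numeric OPT_FORMAT-style option and a bare OPT_BIT-style flag; option names and the struct are illustrative:

#include <stdio.h>
#include <string.h>

struct opt { const char *name; int *target; };

static int parse_options(char *s, struct opt *opts, int n)
{
	char *tok;

	for (tok = strtok(s, ","); tok; tok = strtok(NULL, ",")) {
		char *val = strchr(tok, '=');
		int i;

		if (val)
			*val++ = '\0';       /* split "name=value" in place */
		for (i = 0; i < n; i++) {
			if (strcmp(tok, opts[i].name))
				continue;
			if (val)
				sscanf(val, "%d", opts[i].target);
			else
				*opts[i].target = 1; /* bare flag: OPT_BIT */
			break;
		}
		if (i == n) {
			fprintf(stderr, "unrecognized option \"%s\"\n", tok);
			return -1;           /* -EINVAL in the kernel code */
		}
	}
	return 0;
}

int main(void)
{
	int atom_max_age = 600, dont_load_bitmap = 0;
	struct opt opts[] = {
		{ "tmgr.atom_max_age", &atom_max_age },
		{ "dont_load_bitmap",  &dont_load_bitmap },
	};
	char line[] = "tmgr.atom_max_age=300,dont_load_bitmap";

	if (parse_options(line, opts, 2) == 0)
		printf("age=%d bitmap=%d\n", atom_max_age, dont_load_bitmap);
	return 0;
}

The kernel version additionally supports sscanf-style multi-argument formats and OPT_ONEOF enumerated values, but the dispatch structure is the same.]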
++ */ ++int init_super_data(struct super_block *super, char *opt_string) ++{ ++ int result; ++ opt_desc_t *opts, *p; ++ reiser4_super_info_data *sbinfo = get_super_private(super); ++ ++ /* initialize super, export, dentry operations */ ++ sbinfo->ops.super = reiser4_super_operations; ++ sbinfo->ops.export = reiser4_export_operations; ++ sbinfo->ops.dentry = reiser4_dentry_operations; ++ super->s_op = &sbinfo->ops.super; ++ super->s_export_op = &sbinfo->ops.export; ++ ++ /* initialize transaction manager parameters to default values */ ++ sbinfo->tmgr.atom_max_size = totalram_pages / 4; ++ sbinfo->tmgr.atom_max_age = REISER4_ATOM_MAX_AGE / HZ; ++ sbinfo->tmgr.atom_min_size = 256; ++ sbinfo->tmgr.atom_max_flushers = ATOM_MAX_FLUSHERS; ++ ++ /* initialize cbk cache parameter */ ++ sbinfo->tree.cbk_cache.nr_slots = CBK_CACHE_SLOTS; ++ ++ /* initialize flush parameters */ ++ sbinfo->flush.relocate_threshold = FLUSH_RELOCATE_THRESHOLD; ++ sbinfo->flush.relocate_distance = FLUSH_RELOCATE_DISTANCE; ++ sbinfo->flush.written_threshold = FLUSH_WRITTEN_THRESHOLD; ++ sbinfo->flush.scan_maxnodes = FLUSH_SCAN_MAXNODES; ++ ++ sbinfo->optimal_io_size = REISER4_OPTIMAL_IO_SIZE; ++ ++ /* preliminary tree initializations */ ++ sbinfo->tree.super = super; ++ sbinfo->tree.carry.new_node_flags = REISER4_NEW_NODE_FLAGS; ++ sbinfo->tree.carry.new_extent_flags = REISER4_NEW_EXTENT_FLAGS; ++ sbinfo->tree.carry.paste_flags = REISER4_PASTE_FLAGS; ++ sbinfo->tree.carry.insert_flags = REISER4_INSERT_FLAGS; ++ rwlock_init(&(sbinfo->tree.tree_lock)); ++ spin_lock_init(&(sbinfo->tree.epoch_lock)); ++ ++ /* initialize default readahead params */ ++ sbinfo->ra_params.max = num_physpages / 4; ++ sbinfo->ra_params.flags = 0; ++ ++ /* allocate memory for structure describing reiser4 mount options */ ++ opts = kmalloc(sizeof(opt_desc_t) * MAX_NR_OPTIONS, get_gfp_mask()); ++ if (opts == NULL) ++ return RETERR(-ENOMEM); ++ ++ /* initialize structure describing reiser4 mount options */ ++ p = opts; ++ ++#if REISER4_DEBUG ++# define OPT_ARRAY_CHECK if ((p) > (opts) + MAX_NR_OPTIONS) { \ ++ warning ("zam-1046", "opt array is overloaded"); break; \ ++ } ++#else ++# define OPT_ARRAY_CHECK noop ++#endif ++ ++#define PUSH_OPT(...) \ ++do { \ ++ opt_desc_t o = __VA_ARGS__; \ ++ OPT_ARRAY_CHECK; \ ++ *p ++ = o; \ ++} while (0) ++ ++#define PUSH_SB_FIELD_OPT(field, format) PUSH_OPT(SB_FIELD_OPT(field, format)) ++#define PUSH_BIT_OPT(name, bit) PUSH_OPT(BIT_OPT(name, bit)) ++ ++ /* ++ * tmgr.atom_max_size=N ++ * Atoms containing more than N blocks will be forced to commit. N is ++ * decimal. ++ */ ++ PUSH_SB_FIELD_OPT(tmgr.atom_max_size, "%u"); ++ /* ++ * tmgr.atom_max_age=N ++ * Atoms older than N seconds will be forced to commit. N is decimal. ++ */ ++ PUSH_SB_FIELD_OPT(tmgr.atom_max_age, "%u"); ++ /* ++ * tmgr.atom_min_size=N ++ * In committing an atom to free dirty pages, force the atom less than ++ * N in size to fuse with another one. ++ */ ++ PUSH_SB_FIELD_OPT(tmgr.atom_min_size, "%u"); ++ /* ++ * tmgr.atom_max_flushers=N ++ * limit of concurrent flushers for one atom. 0 means no limit. ++ */ ++ PUSH_SB_FIELD_OPT(tmgr.atom_max_flushers, "%u"); ++ /* ++ * tree.cbk_cache_slots=N ++ * Number of slots in the cbk cache. ++ */ ++ PUSH_SB_FIELD_OPT(tree.cbk_cache.nr_slots, "%u"); ++ /* ++ * If flush finds more than FLUSH_RELOCATE_THRESHOLD adjacent dirty ++ * leaf-level blocks it will force them to be relocated. 
++ */ ++ PUSH_SB_FIELD_OPT(flush.relocate_threshold, "%u"); ++ /* ++ * If flush finds can find a block allocation closer than at most ++ * FLUSH_RELOCATE_DISTANCE from the preceder it will relocate to that ++ * position. ++ */ ++ PUSH_SB_FIELD_OPT(flush.relocate_distance, "%u"); ++ /* ++ * If we have written this much or more blocks before encountering busy ++ * jnode in flush list - abort flushing hoping that next time we get ++ * called this jnode will be clean already, and we will save some ++ * seeks. ++ */ ++ PUSH_SB_FIELD_OPT(flush.written_threshold, "%u"); ++ /* The maximum number of nodes to scan left on a level during flush. */ ++ PUSH_SB_FIELD_OPT(flush.scan_maxnodes, "%u"); ++ /* preferred IO size */ ++ PUSH_SB_FIELD_OPT(optimal_io_size, "%u"); ++ /* carry flags used for insertion of new nodes */ ++ PUSH_SB_FIELD_OPT(tree.carry.new_node_flags, "%u"); ++ /* carry flags used for insertion of new extents */ ++ PUSH_SB_FIELD_OPT(tree.carry.new_extent_flags, "%u"); ++ /* carry flags used for paste operations */ ++ PUSH_SB_FIELD_OPT(tree.carry.paste_flags, "%u"); ++ /* carry flags used for insert operations */ ++ PUSH_SB_FIELD_OPT(tree.carry.insert_flags, "%u"); ++ ++#ifdef CONFIG_REISER4_BADBLOCKS ++ /* ++ * Alternative master superblock location in case if it's original ++ * location is not writeable/accessable. This is offset in BYTES. ++ */ ++ PUSH_SB_FIELD_OPT(altsuper, "%lu"); ++#endif ++ ++ /* turn on BSD-style gid assignment */ ++ PUSH_BIT_OPT("bsdgroups", REISER4_BSD_GID); ++ /* turn on 32 bit times */ ++ PUSH_BIT_OPT("32bittimes", REISER4_32_BIT_TIMES); ++ /* turn off concurrent flushing */ ++ PUSH_BIT_OPT("mtflush", REISER4_MTFLUSH); ++ /* ++ * Don't load all bitmap blocks at mount time, it is useful for ++ * machines with tiny RAM and large disks. ++ */ ++ PUSH_BIT_OPT("dont_load_bitmap", REISER4_DONT_LOAD_BITMAP); ++ /* disable transaction commits during write() */ ++ PUSH_BIT_OPT("atomic_write", REISER4_ATOMIC_WRITE); ++ /* disable use of write barriers in the reiser4 log writer. 
*/ ++ PUSH_BIT_OPT("no_write_barrier", REISER4_NO_WRITE_BARRIER); ++ ++ PUSH_OPT( ++ { ++ /* ++ * tree traversal readahead parameters: ++ * -o readahead:MAXNUM:FLAGS ++ * MAXNUM - max number fo nodes to request readahead for: -1UL ++ * will set it to max_sane_readahead() ++ * FLAGS - combination of bits: RA_ADJCENT_ONLY, RA_ALL_LEVELS, ++ * CONTINUE_ON_PRESENT ++ */ ++ .name = "readahead", ++ .type = OPT_FORMAT, ++ .u = { ++ .f = { ++ .format = "%u:%u", ++ .nr_args = 2, ++ .arg1 = &sbinfo->ra_params.max, ++ .arg2 = &sbinfo->ra_params.flags, ++ .arg3 = NULL, ++ .arg4 = NULL ++ } ++ } ++ } ++ ); ++ ++ /* What to do in case of fs error */ ++ PUSH_OPT( ++ { ++ .name = "onerror", ++ .type = OPT_ONEOF, ++ .u = { ++ .oneof = { ++ .result = &sbinfo->onerror, ++ .list = { ++ "panic", "remount-ro", NULL ++ }, ++ } ++ } ++ } ++ ); ++ ++ /* modify default settings to values set by mount options */ ++ result = parse_options(opt_string, opts, p - opts); ++ kfree(opts); ++ if (result != 0) ++ return result; ++ ++ /* correct settings to sanity values */ ++ sbinfo->tmgr.atom_max_age *= HZ; ++ if (sbinfo->tmgr.atom_max_age <= 0) ++ /* overflow */ ++ sbinfo->tmgr.atom_max_age = REISER4_ATOM_MAX_AGE; ++ ++ /* round optimal io size up to 512 bytes */ ++ sbinfo->optimal_io_size >>= VFS_BLKSIZE_BITS; ++ sbinfo->optimal_io_size <<= VFS_BLKSIZE_BITS; ++ if (sbinfo->optimal_io_size == 0) { ++ warning("nikita-2497", "optimal_io_size is too small"); ++ return RETERR(-EINVAL); ++ } ++ ++ /* disable single-threaded flush as it leads to deadlock */ ++ sbinfo->fs_flags |= (1 << REISER4_MTFLUSH); ++ return result; ++} ++ ++/** ++ * init_read_super - read reiser4 master super block ++ * @super: super block to fill ++ * @silent: if 0 - print warnings ++ * ++ * Reads reiser4 master super block either from predefined location or from ++ * location specified by altsuper mount option, initializes disk format plugin. ++ */ ++int init_read_super(struct super_block *super, int silent) ++{ ++ struct buffer_head *super_bh; ++ struct reiser4_master_sb *master_sb; ++ reiser4_super_info_data *sbinfo = get_super_private(super); ++ unsigned long blocksize; ++ ++ read_super_block: ++#ifdef CONFIG_REISER4_BADBLOCKS ++ if (sbinfo->altsuper) ++ /* ++ * read reiser4 master super block at position specified by ++ * mount option ++ */ ++ super_bh = sb_bread(super, ++ (sector_t)(sbinfo->altsuper / super->s_blocksize)); ++ else ++#endif ++ /* read reiser4 master super block at 16-th 4096 block */ ++ super_bh = sb_bread(super, ++ (sector_t)(REISER4_MAGIC_OFFSET / super->s_blocksize)); ++ if (!super_bh) ++ return RETERR(-EIO); ++ ++ master_sb = (struct reiser4_master_sb *)super_bh->b_data; ++ /* check reiser4 magic string */ ++ if (!strncmp(master_sb->magic, REISER4_SUPER_MAGIC_STRING, ++ sizeof(REISER4_SUPER_MAGIC_STRING))) { ++ /* reiser4 master super block contains filesystem blocksize */ ++ blocksize = le16_to_cpu(get_unaligned(&master_sb->blocksize)); ++ ++ if (blocksize != PAGE_CACHE_SIZE) { ++ /* ++ * currenly reiser4's blocksize must be equal to ++ * pagesize ++ */ ++ if (!silent) ++ warning("nikita-2609", ++ "%s: wrong block size %ld\n", super->s_id, ++ blocksize); ++ brelse(super_bh); ++ return RETERR(-EINVAL); ++ } ++ if (blocksize != super->s_blocksize) { ++ /* ++ * filesystem uses different blocksize. 
Reread the master ++ * super block with the correct blocksize ++ */ ++ brelse(super_bh); ++ if (!sb_set_blocksize(super, (int)blocksize)) ++ return RETERR(-EINVAL); ++ goto read_super_block; ++ } ++ ++ sbinfo->df_plug = ++ disk_format_plugin_by_id( ++ le16_to_cpu(get_unaligned(&master_sb->disk_plugin_id))); ++ if (sbinfo->df_plug == NULL) { ++ if (!silent) ++ warning("nikita-26091", ++ "%s: unknown disk format plugin %d\n", ++ super->s_id, ++ le16_to_cpu(get_unaligned(&master_sb->disk_plugin_id))); ++ brelse(super_bh); ++ return RETERR(-EINVAL); ++ } ++ sbinfo->diskmap_block = le64_to_cpu(get_unaligned(&master_sb->diskmap)); ++ brelse(super_bh); ++ return 0; ++ } ++ ++ /* there is no reiser4 on the device */ ++ if (!silent) ++ warning("nikita-2608", ++ "%s: wrong master super block magic", super->s_id); ++ brelse(super_bh); ++ return RETERR(-EINVAL); ++} ++ ++static struct { ++ reiser4_plugin_type type; ++ reiser4_plugin_id id; ++} default_plugins[PSET_LAST] = { ++ [PSET_FILE] = { ++ .type = REISER4_FILE_PLUGIN_TYPE, ++ .id = UNIX_FILE_PLUGIN_ID ++ }, ++ [PSET_DIR] = { ++ .type = REISER4_DIR_PLUGIN_TYPE, ++ .id = HASHED_DIR_PLUGIN_ID ++ }, ++ [PSET_HASH] = { ++ .type = REISER4_HASH_PLUGIN_TYPE, ++ .id = R5_HASH_ID ++ }, ++ [PSET_FIBRATION] = { ++ .type = REISER4_FIBRATION_PLUGIN_TYPE, ++ .id = FIBRATION_DOT_O ++ }, ++ [PSET_PERM] = { ++ .type = REISER4_PERM_PLUGIN_TYPE, ++ .id = NULL_PERM_ID ++ }, ++ [PSET_FORMATTING] = { ++ .type = REISER4_FORMATTING_PLUGIN_TYPE, ++ .id = SMALL_FILE_FORMATTING_ID ++ }, ++ [PSET_SD] = { ++ .type = REISER4_ITEM_PLUGIN_TYPE, ++ .id = STATIC_STAT_DATA_ID ++ }, ++ [PSET_DIR_ITEM] = { ++ .type = REISER4_ITEM_PLUGIN_TYPE, ++ .id = COMPOUND_DIR_ID ++ }, ++ [PSET_CIPHER] = { ++ .type = REISER4_CIPHER_PLUGIN_TYPE, ++ .id = NONE_CIPHER_ID ++ }, ++ [PSET_DIGEST] = { ++ .type = REISER4_DIGEST_PLUGIN_TYPE, ++ .id = SHA256_32_DIGEST_ID ++ }, ++ [PSET_COMPRESSION] = { ++ .type = REISER4_COMPRESSION_PLUGIN_TYPE, ++ .id = LZO1_COMPRESSION_ID ++ }, ++ [PSET_COMPRESSION_MODE] = { ++ .type = REISER4_COMPRESSION_MODE_PLUGIN_TYPE, ++ .id = COL_16_COMPRESSION_MODE_ID ++ }, ++ [PSET_CLUSTER] = { ++ .type = REISER4_CLUSTER_PLUGIN_TYPE, ++ .id = CLUSTER_64K_ID ++ }, ++ [PSET_REGULAR_ENTRY] = { ++ .type = REISER4_REGULAR_PLUGIN_TYPE, ++ .id = UF_REGULAR_ID ++ } ++}; ++ ++/* access to default plugin table */ ++static reiser4_plugin *get_default_plugin(pset_member memb) ++{ ++ return plugin_by_id(default_plugins[memb].type, ++ default_plugins[memb].id); ++} ++ ++/** ++ * init_root_inode - obtain inode of root directory ++ * @super: super block of filesystem ++ * ++ * Obtains the inode of the root directory (reading it from disk), and initializes the plugin ++ * set if it was not initialized.
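The default_plugins table above uses C99 designated array initializers keyed by pset_member, so a default lookup is a constant-time index rather than a search. A reduced sketch of the same pattern; the enum, struct, and values here are illustrative stand-ins, not the patch's types:

#include <stdio.h>

enum memb { MEMB_FILE, MEMB_DIR, MEMB_HASH, MEMB_LAST };

struct plug_ref { const char *type; int id; };

/* designated initializers keep the table readable and order-independent */
static const struct plug_ref defaults[MEMB_LAST] = {
	[MEMB_FILE] = { .type = "file", .id = 1 },
	[MEMB_DIR]  = { .type = "dir",  .id = 2 },
	[MEMB_HASH] = { .type = "hash", .id = 5 },
};

int main(void)
{
	enum memb m = MEMB_HASH;
	printf("%s/%d\n", defaults[m].type, defaults[m].id);	/* hash/5 */
	return 0;
}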
++ */ ++int init_root_inode(struct super_block *super) ++{ ++ reiser4_super_info_data *sbinfo = get_super_private(super); ++ struct inode *inode; ++ int result = 0; ++ ++ inode = reiser4_iget(super, sbinfo->df_plug->root_dir_key(super), 0); ++ if (IS_ERR(inode)) ++ return RETERR(PTR_ERR(inode)); ++ ++ super->s_root = d_alloc_root(inode); ++ if (!super->s_root) { ++ iput(inode); ++ return RETERR(-ENOMEM); ++ } ++ ++ super->s_root->d_op = &sbinfo->ops.dentry; ++ ++ if (!is_inode_loaded(inode)) { ++ pset_member memb; ++ ++ for (memb = 0; memb < PSET_LAST; ++memb) { ++ reiser4_plugin *plug; ++ ++ plug = get_default_plugin(memb); ++ result = grab_plugin_from(inode, memb, plug); ++ if (result != 0) ++ break; ++ } ++ ++ if (result == 0) { ++ if (REISER4_DEBUG) { ++ plugin_set *pset; ++ ++ pset = reiser4_inode_data(inode)->pset; ++ for (memb = 0; memb < PSET_LAST; ++memb) ++ assert("nikita-3500", ++ pset_get(pset, memb) != NULL); ++ } ++ } else ++ warning("nikita-3448", "Cannot set plugins of root: %i", ++ result); ++ reiser4_iget_complete(inode); ++ } ++ super->s_maxbytes = MAX_LFS_FILESIZE; ++ return result; ++} ++ ++/* ++ * Local variables: ++ * c-indentation-style: "K&R" ++ * mode-name: "LC" ++ * c-basic-offset: 8 ++ * tab-width: 8 ++ * fill-column: 79 ++ * End: ++ */ +Index: linux-2.6.16/fs/reiser4/inode.c +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/inode.c +@@ -0,0 +1,727 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++/* Inode specific operations. */ ++ ++#include "forward.h" ++#include "debug.h" ++#include "key.h" ++#include "kassign.h" ++#include "coord.h" ++#include "seal.h" ++#include "dscale.h" ++#include "plugin/item/item.h" ++#include "plugin/security/perm.h" ++#include "plugin/plugin.h" ++#include "plugin/object.h" ++#include "znode.h" ++#include "vfs_ops.h" ++#include "inode.h" ++#include "super.h" ++#include "reiser4.h" ++ ++#include /* for struct super_block, address_space */ ++ ++/* return reiser4 internal tree which inode belongs to */ ++/* Audited by: green(2002.06.17) */ ++reiser4_tree *tree_by_inode(const struct inode *inode /* inode queried */ ) ++{ ++ assert("nikita-256", inode != NULL); ++ assert("nikita-257", inode->i_sb != NULL); ++ return get_tree(inode->i_sb); ++} ++ ++/* return reiser4-specific inode flags */ ++static inline unsigned long *inode_flags(const struct inode *const inode) ++{ ++ assert("nikita-2842", inode != NULL); ++ return &reiser4_inode_data(inode)->flags; ++} ++ ++/* set reiser4-specific flag @f in @inode */ ++void inode_set_flag(struct inode *inode, reiser4_file_plugin_flags f) ++{ ++ assert("nikita-2248", inode != NULL); ++ set_bit((int)f, inode_flags(inode)); ++} ++ ++/* clear reiser4-specific flag @f in @inode */ ++void inode_clr_flag(struct inode *inode, reiser4_file_plugin_flags f) ++{ ++ assert("nikita-2250", inode != NULL); ++ clear_bit((int)f, inode_flags(inode)); ++} ++ ++/* true if reiser4-specific flag @f is set in @inode */ ++int inode_get_flag(const struct inode *inode, reiser4_file_plugin_flags f) ++{ ++ assert("nikita-2251", inode != NULL); ++ return test_bit((int)f, inode_flags(inode)); ++} ++ ++/* convert oid to inode number */ ++ino_t oid_to_ino(oid_t oid) ++{ ++ return (ino_t) oid; ++} ++ ++/* convert oid to user visible inode number */ ++ino_t oid_to_uino(oid_t oid) ++{ ++ /* reiser4 object is uniquely identified by oid which is 64 bit ++ quantity. 
Kernel in-memory inode is indexed (in the hash table) by ++ 32 bit i_ino field, but this is not a problem, because there is a ++ way to further distinguish inodes with identical inode numbers ++ (find_actor supplied to iget()). ++ ++ But user space expects unique 32 bit inode number. Obviously this ++ is impossible. Work-around is to somehow hash oid into user visible ++ inode number. ++ */ ++ oid_t max_ino = (ino_t) ~ 0; ++ ++ if (REISER4_INO_IS_OID || (oid <= max_ino)) ++ return oid; ++ else ++ /* this is remotely similar to algorithm used to find next pid ++ to use for process: after wrap-around start from some ++ offset rather than from 0. Idea is that there are some long ++ living objects with which we don't want to collide. ++ */ ++ return REISER4_UINO_SHIFT + ((oid - max_ino) & (max_ino >> 1)); ++} ++ ++/* check that "inode" is on reiser4 file-system */ ++int is_reiser4_inode(const struct inode *inode /* inode queried */ ) ++{ ++ return inode != NULL && is_reiser4_super(inode->i_sb); ++} ++ ++/* Maximal length of a name that can be stored in directory @inode. ++ ++ This is used in check during file creation and lookup. */ ++int reiser4_max_filename_len(const struct inode *inode /* inode queried */ ) ++{ ++ assert("nikita-287", is_reiser4_inode(inode)); ++ assert("nikita-1710", inode_dir_item_plugin(inode)); ++ if (inode_dir_item_plugin(inode)->s.dir.max_name_len) ++ return inode_dir_item_plugin(inode)->s.dir.max_name_len(inode); ++ else ++ return 255; ++} ++ ++#if REISER4_USE_COLLISION_LIMIT ++/* Maximal number of hash collisions for this directory. */ ++int max_hash_collisions(const struct inode *dir /* inode queried */ ) ++{ ++ assert("nikita-1711", dir != NULL); ++ return reiser4_inode_data(dir)->plugin.max_collisions; ++} ++#endif /* REISER4_USE_COLLISION_LIMIT */ ++ ++/* Install file, inode, and address_space operation on @inode, depending on ++ its mode. 
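To make the oid_to_uino() fold above concrete: on a 32-bit ino_t, oids beyond (ino_t)~0 are remapped into the upper half of the user-visible inode-number space, starting from an offset so that they do not collide with long-lived low oids, analogous to PID wrap-around. A standalone sketch; UINO_SHIFT and fold_oid() are my stand-ins, the patch defines the real REISER4_UINO_SHIFT elsewhere:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define UINO_SHIFT 0x80000000u	/* illustrative stand-in for REISER4_UINO_SHIFT */

static uint32_t fold_oid(uint64_t oid)
{
	uint64_t max_ino = 0xffffffffu;	/* (ino_t)~0 with a 32-bit ino_t */

	if (oid <= max_ino)
		return (uint32_t)oid;
	/* after wrap-around, start from an offset to dodge long-lived oids */
	return UINO_SHIFT + (uint32_t)((oid - max_ino) & (max_ino >> 1));
}

int main(void)
{
	printf("%" PRIx32 "\n", fold_oid(42));			/* 2a: fits, unchanged */
	printf("%" PRIx32 "\n", fold_oid(0x100000005ull));	/* 80000006: folded */
	return 0;
}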
*/ ++int setup_inode_ops(struct inode *inode /* inode to intialize */ , ++ reiser4_object_create_data * data /* parameters to create ++ * object */ ) ++{ ++ reiser4_super_info_data *sinfo; ++ file_plugin *fplug; ++ dir_plugin *dplug; ++ ++ fplug = inode_file_plugin(inode); ++ dplug = inode_dir_plugin(inode); ++ ++ sinfo = get_super_private(inode->i_sb); ++ ++ switch (inode->i_mode & S_IFMT) { ++ case S_IFSOCK: ++ case S_IFBLK: ++ case S_IFCHR: ++ case S_IFIFO: ++ { ++ dev_t rdev; /* to keep gcc happy */ ++ ++ assert("vs-46", fplug != NULL); ++ /* ugly hack with rdev */ ++ if (data == NULL) { ++ rdev = inode->i_rdev; ++ inode->i_rdev = 0; ++ } else ++ rdev = data->rdev; ++ inode->i_blocks = 0; ++ assert("vs-42", fplug->h.id == SPECIAL_FILE_PLUGIN_ID); ++ inode->i_op = &file_plugins[fplug->h.id].inode_ops; ++ /* initialize inode->i_fop and inode->i_rdev for block and char ++ devices */ ++ init_special_inode(inode, inode->i_mode, rdev); ++ /* all address space operations are null */ ++ inode->i_mapping->a_ops = ++ &file_plugins[fplug->h.id].as_ops; ++ break; ++ } ++ case S_IFLNK: ++ assert("vs-46", fplug != NULL); ++ assert("vs-42", fplug->h.id == SYMLINK_FILE_PLUGIN_ID); ++ inode->i_op = &file_plugins[fplug->h.id].inode_ops; ++ inode->i_fop = NULL; ++ /* all address space operations are null */ ++ inode->i_mapping->a_ops = &file_plugins[fplug->h.id].as_ops; ++ break; ++ case S_IFDIR: ++ assert("vs-46", dplug != NULL); ++ assert("vs-43", (dplug->h.id == HASHED_DIR_PLUGIN_ID || ++ dplug->h.id == SEEKABLE_HASHED_DIR_PLUGIN_ID)); ++ inode->i_op = &dir_plugins[dplug->h.id].inode_ops; ++ inode->i_fop = &dir_plugins[dplug->h.id].file_ops; ++ inode->i_mapping->a_ops = &dir_plugins[dplug->h.id].as_ops; ++ break; ++ case S_IFREG: ++ assert("vs-46", fplug != NULL); ++ assert("vs-43", (fplug->h.id == UNIX_FILE_PLUGIN_ID || ++ fplug->h.id == CRC_FILE_PLUGIN_ID)); ++ inode->i_op = &file_plugins[fplug->h.id].inode_ops; ++ inode->i_fop = &file_plugins[fplug->h.id].file_ops; ++ inode->i_mapping->a_ops = &file_plugins[fplug->h.id].as_ops; ++ break; ++ default: ++ warning("nikita-291", "wrong file mode: %o for %llu", ++ inode->i_mode, ++ (unsigned long long)get_inode_oid(inode)); ++ reiser4_make_bad_inode(inode); ++ return RETERR(-EINVAL); ++ } ++ return 0; ++} ++ ++/* initialize inode from disk data. Called with inode locked. ++ Return inode locked. 
*/ ++static int init_inode(struct inode *inode /* inode to intialise */ , ++ coord_t * coord /* coord of stat data */ ) ++{ ++ int result; ++ item_plugin *iplug; ++ void *body; ++ int length; ++ reiser4_inode *state; ++ ++ assert("nikita-292", coord != NULL); ++ assert("nikita-293", inode != NULL); ++ ++ coord_clear_iplug(coord); ++ result = zload(coord->node); ++ if (result) ++ return result; ++ iplug = item_plugin_by_coord(coord); ++ body = item_body_by_coord(coord); ++ length = item_length_by_coord(coord); ++ ++ assert("nikita-295", iplug != NULL); ++ assert("nikita-296", body != NULL); ++ assert("nikita-297", length > 0); ++ ++ /* inode is under I_LOCK now */ ++ ++ state = reiser4_inode_data(inode); ++ /* call stat-data plugin method to load sd content into inode */ ++ result = iplug->s.sd.init_inode(inode, body, length); ++ plugin_set_sd(&state->pset, iplug); ++ if (result == 0) { ++ result = setup_inode_ops(inode, NULL); ++ if (result == 0 && ++ inode->i_sb->s_root && inode->i_sb->s_root->d_inode) { ++ struct inode *root; ++ pset_member ind; ++ ++ /* take missing plugins from file-system defaults */ ++ root = inode->i_sb->s_root->d_inode; ++ /* file and directory plugins are already initialized. */ ++ for (ind = PSET_DIR + 1; ind < PSET_LAST; ++ind) { ++ result = grab_plugin(inode, root, ind); ++ if (result != 0) ++ break; ++ } ++ if (result != 0) { ++ warning("nikita-3447", ++ "Cannot set up plugins for %lli", ++ (unsigned long long) ++ get_inode_oid(inode)); ++ } ++ } ++ } ++ zrelse(coord->node); ++ return result; ++} ++ ++/* read `inode' from the disk. This is what was previously in ++ reiserfs_read_inode2(). ++ ++ Must be called with inode locked. Return inode still locked. ++*/ ++static int read_inode(struct inode *inode /* inode to read from disk */ , ++ const reiser4_key * key /* key of stat data */ , ++ int silent) ++{ ++ int result; ++ lock_handle lh; ++ reiser4_inode *info; ++ coord_t coord; ++ ++ assert("nikita-298", inode != NULL); ++ assert("nikita-1945", !is_inode_loaded(inode)); ++ ++ info = reiser4_inode_data(inode); ++ assert("nikita-300", info->locality_id != 0); ++ ++ coord_init_zero(&coord); ++ init_lh(&lh); ++ /* locate stat-data in a tree and return znode locked */ ++ result = lookup_sd(inode, ZNODE_READ_LOCK, &coord, &lh, key, silent); ++ assert("nikita-301", !is_inode_loaded(inode)); ++ if (result == 0) { ++ /* use stat-data plugin to load sd into inode. */ ++ result = init_inode(inode, &coord); ++ if (result == 0) { ++ /* initialize stat-data seal */ ++ spin_lock_inode(inode); ++ seal_init(&info->sd_seal, &coord, key); ++ info->sd_coord = coord; ++ spin_unlock_inode(inode); ++ ++ /* call file plugin's method to initialize plugin ++ * specific part of inode */ ++ if (inode_file_plugin(inode)->init_inode_data) ++ inode_file_plugin(inode)->init_inode_data(inode, ++ NULL, ++ 0); ++ /* load detached directory cursors for stateless ++ * directory readers (NFS). */ ++ load_cursors(inode); ++ ++ /* Check the opened inode for consistency. */ ++ result = ++ get_super_private(inode->i_sb)->df_plug-> ++ check_open(inode); ++ } ++ } ++ /* lookup_sd() doesn't release coord because we want znode ++ stay read-locked while stat-data fields are accessed in ++ init_inode() */ ++ done_lh(&lh); ++ ++ if (result != 0) ++ reiser4_make_bad_inode(inode); ++ return result; ++} ++ ++/* initialise new reiser4 inode being inserted into hash table. 
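One pattern worth flagging in init_inode() above is the load/use/release bracket: node data is only valid between zload() and zrelse(), and every exit path after a successful load must still reach the release. A generic userspace sketch of that discipline, with toy stand-in types and functions rather than reiser4's:

#include <stdio.h>

struct node { int pinned; int payload; };

static int  node_load(struct node *n)    { n->pinned = 1; return 0; }
static void node_release(struct node *n) { n->pinned = 0; }

static int use_node_data(struct node *n)
{
	return n->pinned ? n->payload : -1;	/* valid only while pinned */
}

/* the zload()/zrelse() shape: release on success and failure alike */
static int with_loaded_node(struct node *n)
{
	int err = node_load(n);
	if (err)
		return err;		/* load failed: nothing to release */
	err = use_node_data(n);
	node_release(n);		/* single unpin point for all paths */
	return err;
}

int main(void)
{
	struct node n = { 0, 7 };
	printf("%d\n", with_loaded_node(&n));	/* 7 */
	return 0;
}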
++static int init_locked_inode(struct inode *inode /* new inode */ , ++ void *opaque /* key of stat data passed to the ++ * iget5_locked as cookie */ ) ++{ ++ reiser4_key *key; ++ ++ assert("nikita-1995", inode != NULL); ++ assert("nikita-1996", opaque != NULL); ++ key = opaque; ++ set_inode_oid(inode, get_key_objectid(key)); ++ reiser4_inode_data(inode)->locality_id = get_key_locality(key); ++ return 0; ++} ++ ++/* reiser4_inode_find_actor() - "find actor" supplied by reiser4 to iget5_locked(). ++ ++ This function is called by iget5_locked() to distinguish reiser4 inodes ++ having the same inode numbers. Such inodes can only exist due to some error ++ condition. One of them should be bad. Inodes with identical inode numbers ++ (objectids) are distinguished by their packing locality. ++ ++*/ ++static int reiser4_inode_find_actor(struct inode *inode /* inode from hash table to ++ * check */ , ++ void *opaque /* "cookie" passed to ++ * iget5_locked(). This is stat data ++ * key */ ) ++{ ++ reiser4_key *key; ++ ++ key = opaque; ++ return ++ /* oid is unique, so first term is enough, actually. */ ++ get_inode_oid(inode) == get_key_objectid(key) && ++ /* ++ * also, locality should be checked, but locality is stored in ++ * the reiser4-specific part of the inode, and the actor can be ++ * called against an arbitrary inode that happened to be in this ++ * hash chain. Hence we first have to check that this is a ++ * reiser4 inode at least. is_reiser4_inode() is probably too ++ * early to call, as the inode may have ->i_op not yet ++ * initialised. ++ */ ++ is_reiser4_super(inode->i_sb) && ++ /* ++ * usually the objectid is unique, but pseudo files use a counter to ++ * generate objectids. All pseudo files are placed into a special ++ * (otherwise unused) locality. ++ */ ++ reiser4_inode_data(inode)->locality_id == get_key_locality(key); ++} ++ ++/* hook for kmem_cache_create */ ++void loading_init_once(reiser4_inode * info) ++{ ++ sema_init(&info->loading, 1); ++} ++ ++/* for reiser4_alloc_inode */ ++void loading_alloc(reiser4_inode * info) ++{ ++#if REISER4_DEBUG ++ assert("vs-1717", down_trylock(&info->loading) == 0); ++ up(&info->loading); ++#endif ++} ++ ++/* for reiser4_destroy */ ++void loading_destroy(reiser4_inode * info) ++{ ++#if REISER4_DEBUG ++ assert("vs-1717", down_trylock(&info->loading) == 0); ++ up(&info->loading); ++#endif ++} ++ ++static void loading_down(reiser4_inode * info) ++{ ++ down(&info->loading); ++} ++ ++static void loading_up(reiser4_inode * info) ++{ ++ up(&info->loading); ++} ++ ++/** ++ * reiser4_iget - obtain inode via iget5_locked, read from disk if necessary ++ * @super: super block of filesystem ++ * @key: key of inode's stat-data ++ * @silent: ++ * ++ * This is our helper function a la iget(). It is called by ++ * reiser4_lookup() and reiser4_read_super(). Returns the inode locked, or the error ++ * encountered. ++ */ ++struct inode *reiser4_iget(struct super_block *super, const reiser4_key *key, ++ int silent) ++{ ++ struct inode *inode; ++ int result; ++ reiser4_inode *info; ++ ++ assert("nikita-302", super != NULL); ++ assert("nikita-303", key != NULL); ++ ++ result = 0; ++ ++ /* call iget().
Our ->read_inode() is dummy, so this will either ++ find inode in cache or return uninitialised inode */ ++ inode = iget5_locked(super, ++ (unsigned long)get_key_objectid(key), ++ reiser4_inode_find_actor, ++ init_locked_inode, (reiser4_key *) key); ++ if (inode == NULL) ++ return ERR_PTR(RETERR(-ENOMEM)); ++ if (is_bad_inode(inode)) { ++ warning("nikita-304", "Bad inode found"); ++ print_key("key", key); ++ iput(inode); ++ return ERR_PTR(RETERR(-EIO)); ++ } ++ ++ info = reiser4_inode_data(inode); ++ ++ /* Reiser4 inode state bit REISER4_LOADED is used to distinguish fully ++ loaded and initialized inode from just allocated inode. If ++ REISER4_LOADED bit is not set, reiser4_iget() completes loading under ++ info->loading. The place in reiser4 which uses not initialized inode ++ is the reiser4 repacker, see repacker-related functions in ++ plugin/item/extent.c */ ++ if (!is_inode_loaded(inode)) { ++ loading_down(info); ++ if (!is_inode_loaded(inode)) { ++ /* locking: iget5_locked returns locked inode */ ++ assert("nikita-1941", !is_inode_loaded(inode)); ++ assert("nikita-1949", ++ reiser4_inode_find_actor(inode, ++ (reiser4_key *) key)); ++ /* now, inode has objectid as ->i_ino and locality in ++ reiser4-specific part. This is enough for ++ read_inode() to read stat data from the disk */ ++ result = read_inode(inode, key, silent); ++ } else ++ loading_up(info); ++ } ++ ++ if (inode->i_state & I_NEW) ++ unlock_new_inode(inode); ++ ++ if (is_bad_inode(inode)) { ++ assert("vs-1717", result != 0); ++ loading_up(info); ++ iput(inode); ++ inode = ERR_PTR(result); ++ } else if (REISER4_DEBUG) { ++ reiser4_key found_key; ++ ++ assert("vs-1717", result == 0); ++ build_sd_key(inode, &found_key); ++ if (!keyeq(&found_key, key)) { ++ warning("nikita-305", "Wrong key in sd"); ++ print_key("sought for", key); ++ print_key("found", &found_key); ++ } ++ if (inode->i_nlink == 0) { ++ warning("nikita-3559", "Unlinked inode found: %llu\n", ++ (unsigned long long)get_inode_oid(inode)); ++ } ++ } ++ return inode; ++} ++ ++/* reiser4_iget() may return not fully initialized inode, this function should ++ * be called after one completes reiser4 inode initializing. 
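The loading-semaphore dance in reiser4_iget() above is double-checked initialization: test the LOADED flag locklessly, take info->loading, and test again before doing the expensive disk read. A userspace sketch of the same shape using POSIX primitives; the names are stand-ins, the patch uses a kernel semaphore and the REISER4_LOADED bit:

#include <pthread.h>
#include <stdio.h>

struct lazy {
	int loaded;			/* analogue of REISER4_LOADED */
	int value;
	pthread_mutex_t lock;		/* analogue of info->loading */
};

static void expensive_read(struct lazy *l) { l->value = 42; }

static int get_value(struct lazy *l)
{
	if (!l->loaded) {			/* cheap lockless check */
		pthread_mutex_lock(&l->lock);
		if (!l->loaded) {		/* re-check under the lock */
			expensive_read(l);
			l->loaded = 1;
		}
		pthread_mutex_unlock(&l->lock);
	}
	return l->value;
}

int main(void)
{
	struct lazy l = { 0, 0, PTHREAD_MUTEX_INITIALIZER };
	printf("%d\n", get_value(&l));	/* 42 */
	return 0;
}

One wrinkle the sketch does not capture: in the patch the semaphore is released asymmetrically. On the slow path it stays held past read_inode() until reiser4_iget_complete() below marks the inode LOADED and calls loading_up().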
*/ ++void reiser4_iget_complete(struct inode *inode) ++{ ++ assert("zam-988", is_reiser4_inode(inode)); ++ ++ if (!is_inode_loaded(inode)) { ++ inode_set_flag(inode, REISER4_LOADED); ++ loading_up(reiser4_inode_data(inode)); ++ } ++} ++ ++void reiser4_make_bad_inode(struct inode *inode) ++{ ++ assert("nikita-1934", inode != NULL); ++ ++ /* clear LOADED bit */ ++ inode_clr_flag(inode, REISER4_LOADED); ++ make_bad_inode(inode); ++ return; ++} ++ ++file_plugin *inode_file_plugin(const struct inode * inode) ++{ ++ assert("nikita-1997", inode != NULL); ++ return reiser4_inode_data(inode)->pset->file; ++} ++ ++dir_plugin *inode_dir_plugin(const struct inode * inode) ++{ ++ assert("nikita-1998", inode != NULL); ++ return reiser4_inode_data(inode)->pset->dir; ++} ++ ++#if 0 ++perm_plugin *inode_perm_plugin(const struct inode * inode) ++{ ++ assert("nikita-1999", inode != NULL); ++ return reiser4_inode_data(inode)->pset->perm; ++} ++#endif /* 0 */ ++ ++formatting_plugin *inode_formatting_plugin(const struct inode * inode) ++{ ++ assert("nikita-2000", inode != NULL); ++ return reiser4_inode_data(inode)->pset->formatting; ++} ++ ++hash_plugin *inode_hash_plugin(const struct inode * inode) ++{ ++ assert("nikita-2001", inode != NULL); ++ return reiser4_inode_data(inode)->pset->hash; ++} ++ ++fibration_plugin *inode_fibration_plugin(const struct inode * inode) ++{ ++ assert("nikita-2001", inode != NULL); ++ return reiser4_inode_data(inode)->pset->fibration; ++} ++ ++cipher_plugin *inode_cipher_plugin(const struct inode * inode) ++{ ++ assert("edward-36", inode != NULL); ++ return reiser4_inode_data(inode)->pset->cipher; ++} ++ ++compression_plugin *inode_compression_plugin(const struct inode * inode) ++{ ++ assert("edward-37", inode != NULL); ++ return reiser4_inode_data(inode)->pset->compression; ++} ++ ++compression_mode_plugin *inode_compression_mode_plugin(const struct inode * ++ inode) ++{ ++ assert("edward-1330", inode != NULL); ++ return reiser4_inode_data(inode)->pset->compression_mode; ++} ++ ++cluster_plugin *inode_cluster_plugin(const struct inode * inode) ++{ ++ assert("edward-1328", inode != NULL); ++ return reiser4_inode_data(inode)->pset->cluster; ++} ++ ++regular_plugin *inode_regular_plugin(const struct inode * inode) ++{ ++ assert("edward-1329", inode != NULL); ++ return reiser4_inode_data(inode)->pset->regular_entry; ++} ++ ++digest_plugin *inode_digest_plugin(const struct inode * inode) ++{ ++ assert("edward-86", inode != NULL); ++ return reiser4_inode_data(inode)->pset->digest; ++} ++ ++item_plugin *inode_sd_plugin(const struct inode * inode) ++{ ++ assert("vs-534", inode != NULL); ++ return reiser4_inode_data(inode)->pset->sd; ++} ++ ++item_plugin *inode_dir_item_plugin(const struct inode * inode) ++{ ++ assert("vs-534", inode != NULL); ++ return reiser4_inode_data(inode)->pset->dir_item; ++} ++ ++void inode_set_extension(struct inode *inode, sd_ext_bits ext) ++{ ++ reiser4_inode *state; ++ ++ assert("nikita-2716", inode != NULL); ++ assert("nikita-2717", ext < LAST_SD_EXTENSION); ++ assert("nikita-3491", spin_inode_is_locked(inode)); ++ ++ state = reiser4_inode_data(inode); ++ state->extmask |= 1 << ext; ++ /* force re-calculation of stat-data length on next call to ++ update_sd(). 
*/ ++ inode_clr_flag(inode, REISER4_SDLEN_KNOWN); ++} ++ ++void ++inode_set_plugin(struct inode *inode, reiser4_plugin * plug, pset_member memb) ++{ ++ assert("nikita-2718", inode != NULL); ++ assert("nikita-2719", plug != NULL); ++ ++ reiser4_inode_data(inode)->plugin_mask |= (1 << memb); ++} ++ ++void inode_check_scale_nolock(struct inode *inode, __u64 old, __u64 new) ++{ ++ assert("edward-1287", inode != NULL); ++ if (!dscale_fit(old, new)) ++ inode_clr_flag(inode, REISER4_SDLEN_KNOWN); ++ return; ++} ++ ++void inode_check_scale(struct inode *inode, __u64 old, __u64 new) ++{ ++ assert("nikita-2875", inode != NULL); ++ spin_lock_inode(inode); ++ inode_check_scale_nolock(inode, old, new); ++ spin_unlock_inode(inode); ++} ++ ++/* ++ * initialize ->ordering field of inode. This field defines how file stat-data ++ * and body is ordered within a tree with respect to other objects within the ++ * same parent directory. ++ */ ++void ++init_inode_ordering(struct inode *inode, ++ reiser4_object_create_data * crd, int create) ++{ ++ reiser4_key key; ++ ++ if (create) { ++ struct inode *parent; ++ ++ parent = crd->parent; ++ assert("nikita-3224", inode_dir_plugin(parent) != NULL); ++ inode_dir_plugin(parent)->build_entry_key(parent, ++ &crd->dentry->d_name, ++ &key); ++ } else { ++ coord_t *coord; ++ ++ coord = &reiser4_inode_data(inode)->sd_coord; ++ coord_clear_iplug(coord); ++ /* safe to use ->sd_coord, because node is under long term ++ * lock */ ++ WITH_DATA(coord->node, item_key_by_coord(coord, &key)); ++ } ++ ++ set_inode_ordering(inode, get_key_ordering(&key)); ++} ++ ++znode *inode_get_vroot(struct inode *inode) ++{ ++ reiser4_block_nr blk; ++ znode *result; ++ ++ spin_lock_inode(inode); ++ blk = reiser4_inode_data(inode)->vroot; ++ spin_unlock_inode(inode); ++ if (!disk_addr_eq(&UBER_TREE_ADDR, &blk)) ++ result = zlook(tree_by_inode(inode), &blk); ++ else ++ result = NULL; ++ return result; ++} ++ ++void inode_set_vroot(struct inode *inode, znode *vroot) ++{ ++ spin_lock_inode(inode); ++ reiser4_inode_data(inode)->vroot = *znode_get_block(vroot); ++ spin_unlock_inode(inode); ++} ++ ++#if REISER4_DEBUG ++ ++void inode_invariant(const struct inode *inode) ++{ ++ assert("nikita-3077", spin_inode_is_locked(inode)); ++} ++ ++int inode_has_no_jnodes(reiser4_inode * r4_inode) ++{ ++ return jnode_tree_by_reiser4_inode(r4_inode)->rnode == NULL && ++ r4_inode->nr_jnodes == 0; ++} ++ ++#endif ++ ++/* true if directory is empty (only contains dot and dotdot) */ ++/* FIXME: shouldn't it be dir plugin method? */ ++int is_dir_empty(const struct inode *dir) ++{ ++ assert("nikita-1976", dir != NULL); ++ ++ /* rely on our method to maintain directory i_size being equal to the ++ number of entries. */ ++ return dir->i_size <= 2 ? 0 : RETERR(-ENOTEMPTY); ++} ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +Index: linux-2.6.16/fs/reiser4/inode.h +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/inode.h +@@ -0,0 +1,430 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++/* Inode functions. 
*/ ++ ++#if !defined( __REISER4_INODE_H__ ) ++#define __REISER4_INODE_H__ ++ ++#include "forward.h" ++#include "debug.h" ++#include "key.h" ++#include "seal.h" ++#include "plugin/plugin.h" ++#include "plugin/file/cryptcompress.h" ++#include "plugin/file/file.h" ++#include "plugin/dir/dir.h" ++#include "plugin/plugin_set.h" ++#include "plugin/security/perm.h" ++#include "vfs_ops.h" ++#include "jnode.h" ++#include "fsdata.h" ++ ++#include /* for __u?? , ino_t */ ++#include /* for struct super_block, struct ++ * rw_semaphore, etc */ ++#include ++#include ++ ++/* reiser4-specific inode flags. They are "transient" and are not ++ supposed to be stored on disk. Used to trace "state" of ++ inode ++*/ ++typedef enum { ++ /* this is light-weight inode, inheriting some state from its ++ parent */ ++ REISER4_LIGHT_WEIGHT = 0, ++ /* stat data wasn't yet created */ ++ REISER4_NO_SD = 1, ++ /* internal immutable flag. Currently is only used ++ to avoid race condition during file creation. ++ See comment in create_object(). */ ++ REISER4_IMMUTABLE = 2, ++ /* inode was read from storage */ ++ REISER4_LOADED = 3, ++ /* this bit is set for symlinks. inode->u.generic_ip points to target ++ name of symlink. */ ++ REISER4_GENERIC_PTR_USED = 4, ++ /* set if size of stat-data item for this inode is known. If this is ++ * set we can avoid recalculating size of stat-data on each update. */ ++ REISER4_SDLEN_KNOWN = 5, ++ /* reiser4_inode->crypt points to the crypto stat */ ++ REISER4_CRYPTO_STAT_LOADED = 6, ++ /* cryptcompress_inode_data points to the secret key */ ++ REISER4_SECRET_KEY_INSTALLED = 7, ++ /* File (possibly) has pages corresponding to the tail items, that ++ * were created by ->readpage. It is set by mmap_unix_file() and ++ * sendfile_unix_file(). This bit is inspected by write_unix_file and ++ * kill-hook of tail items. It is never cleared once set. This bit is ++ * modified and inspected under i_mutex. */ ++ REISER4_HAS_MMAP = 8, ++ ++ REISER4_PART_MIXED = 9, ++ REISER4_PART_IN_CONV = 10 ++} reiser4_file_plugin_flags; ++ ++/* state associated with each inode. ++ reiser4 inode. ++ ++ NOTE-NIKITA In 2.5 kernels it is not necessary that all file-system inodes ++ be of the same size. File-system allocates inodes by itself through ++ s_op->allocate_inode() method. So, it is possible to adjust size of inode ++ at the time of its creation. ++ ++ Invariants involving parts of this data-type: ++ ++ [inode->eflushed] ++ ++*/ ++ ++typedef struct reiser4_inode reiser4_inode; ++/* return pointer to reiser4-specific part of inode */ ++static inline reiser4_inode *reiser4_inode_data(const struct inode *inode ++ /* inode queried */ ); ++ ++#if BITS_PER_LONG == 64 ++ ++#define REISER4_INO_IS_OID (1) ++typedef struct {; ++} oid_hi_t; ++ ++/* BITS_PER_LONG == 64 */ ++#else ++ ++#define REISER4_INO_IS_OID (0) ++typedef __u32 oid_hi_t; ++ ++/* BITS_PER_LONG == 64 */ ++#endif ++ ++struct reiser4_inode { ++ /* spin lock protecting fields of this structure. 
*/ ++ spinlock_t guard; ++ /* object plugins */ ++ plugin_set *pset; ++ /* plugins set for inheritance */ ++ plugin_set *hset; ++ /* high 32 bits of object id */ ++ oid_hi_t oid_hi; ++ /* seal for stat-data */ ++ seal_t sd_seal; ++ /* locality id for this file */ ++ oid_t locality_id; ++#if REISER4_LARGE_KEY ++ __u64 ordering; ++#endif ++ /* coord of stat-data in sealed node */ ++ coord_t sd_coord; ++ /* bit-mask of stat-data extensions used by this file */ ++ __u64 extmask; ++ /* bitmask of non-default plugins for this inode */ ++ __u16 plugin_mask; ++ union { ++ struct list_head readdir_list; ++ struct list_head not_used; ++ } lists; ++ /* per-inode flags. Filled by values of reiser4_file_plugin_flags */ ++ unsigned long flags; ++ union { ++ /* fields specific to unix_file plugin */ ++ unix_file_info_t unix_file_info; ++ /* fields specific to cryptcompress plugin */ ++ cryptcompress_info_t cryptcompress_info; ++ } file_plugin_data; ++ ++ /* tree of jnodes. Phantom jnodes (ones not attached to any atom) are ++ tagged in that tree by EFLUSH_TAG_ANONYMOUS */ ++ struct radix_tree_root jnodes_tree; ++#if REISER4_DEBUG ++ /* number of unformatted node jnodes of this file in jnode hash table */ ++ unsigned long nr_jnodes; ++#endif ++ ++ /* block number of virtual root for this object. See comment above ++ * fs/reiser4/search.c:handle_vroot() */ ++ reiser4_block_nr vroot; ++ struct semaphore loading; ++}; ++ ++void loading_init_once(reiser4_inode *); ++void loading_alloc(reiser4_inode *); ++void loading_destroy(reiser4_inode *); ++ ++typedef struct reiser4_inode_object { ++ /* private part */ ++ reiser4_inode p; ++ /* generic fields not specific to reiser4, but used by VFS */ ++ struct inode vfs_inode; ++} reiser4_inode_object; ++ ++/* return pointer to the reiser4 specific portion of @inode */ ++static inline reiser4_inode *reiser4_inode_data(const struct inode *inode ++ /* inode queried */ ) ++{ ++ assert("nikita-254", inode != NULL); ++ return &container_of(inode, reiser4_inode_object, vfs_inode)->p; ++} ++ ++static inline struct inode *inode_by_reiser4_inode(const reiser4_inode * ++ r4_inode /* inode queried */ ++ ) ++{ ++ return &container_of(r4_inode, reiser4_inode_object, p)->vfs_inode; ++} ++ ++/* ++ * reiser4 inodes are identified by a 64-bit object-id (oid_t), but in struct ++ * inode the ->i_ino field is of type ino_t (long), which can be either 32 or 64 ++ * bits. ++ * ++ * If ->i_ino is 32 bits, we store the remaining 32 bits in the reiser4-specific part ++ * of the inode; otherwise the whole oid is stored in i_ino. ++ * ++ * The wrappers below ([sg]et_inode_oid()) are used to hide this difference.
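Before the oid wrappers below, a note on reiser4_inode_object above: it is the standard "embed the VFS inode in the filesystem's own structure" idiom, where container_of() recovers the outer object from a pointer to the embedded member by subtracting the member offset. A freestanding sketch with toy struct names; the macro expansion shown is the usual offsetof() form:

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct vfs_inode { unsigned long i_ino; };

struct r4_inode_object {
	int private_state;		/* fs-specific part first, as in the patch */
	struct vfs_inode vfs_inode;	/* generic VFS part */
};

int main(void)
{
	struct r4_inode_object obj = { .private_state = 7, .vfs_inode = { 33 } };
	struct vfs_inode *inner = &obj.vfs_inode;

	/* recover the outer object from the embedded member */
	struct r4_inode_object *outer =
		container_of(inner, struct r4_inode_object, vfs_inode);
	printf("%d %lu\n", outer->private_state, inner->i_ino);	/* 7 33 */
	return 0;
}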
++ */ ++ ++#define OID_HI_SHIFT (sizeof(ino_t) * 8) ++ ++#if REISER4_INO_IS_OID ++ ++static inline oid_t get_inode_oid(const struct inode *inode) ++{ ++ return inode->i_ino; ++} ++ ++static inline void set_inode_oid(struct inode *inode, oid_t oid) ++{ ++ inode->i_ino = oid; ++} ++ ++/* REISER4_INO_IS_OID */ ++#else ++ ++static inline oid_t get_inode_oid(const struct inode *inode) ++{ ++ return ++ ((__u64) reiser4_inode_data(inode)->oid_hi << OID_HI_SHIFT) | ++ inode->i_ino; ++} ++ ++static inline void set_inode_oid(struct inode *inode, oid_t oid) ++{ ++ assert("nikita-2519", inode != NULL); ++ inode->i_ino = (ino_t) (oid); ++ reiser4_inode_data(inode)->oid_hi = (oid) >> OID_HI_SHIFT; ++ assert("nikita-2521", get_inode_oid(inode) == (oid)); ++} ++ ++/* REISER4_INO_IS_OID */ ++#endif ++ ++static inline oid_t get_inode_locality(const struct inode *inode) ++{ ++ return reiser4_inode_data(inode)->locality_id; ++} ++ ++#if REISER4_LARGE_KEY ++static inline __u64 get_inode_ordering(const struct inode *inode) ++{ ++ return reiser4_inode_data(inode)->ordering; ++} ++ ++static inline void set_inode_ordering(const struct inode *inode, __u64 ordering) ++{ ++ reiser4_inode_data(inode)->ordering = ordering; ++} ++ ++#else ++ ++#define get_inode_ordering(inode) (0) ++#define set_inode_ordering(inode, val) noop ++ ++#endif ++ ++/* return inode in which @uf_info is embedded */ ++static inline struct inode *unix_file_info_to_inode(const unix_file_info_t * ++ uf_info) ++{ ++ return &container_of(uf_info, reiser4_inode_object, ++ p.file_plugin_data.unix_file_info)->vfs_inode; ++} ++ ++ ++extern ino_t oid_to_ino(oid_t oid) __attribute__ ((const)); ++extern ino_t oid_to_uino(oid_t oid) __attribute__ ((const)); ++ ++extern reiser4_tree *tree_by_inode(const struct inode *inode); ++ ++#if REISER4_DEBUG ++extern void inode_invariant(const struct inode *inode); ++extern int inode_has_no_jnodes(reiser4_inode *); ++#else ++#define inode_invariant(inode) noop ++#endif ++ ++static inline int spin_inode_is_locked(const struct inode *inode) ++{ ++ assert_spin_locked(&reiser4_inode_data(inode)->guard); ++ return 1; ++} ++ ++/** ++ * spin_lock_inode - lock reiser4_inode' embedded spinlock ++ * @inode: inode to lock ++ * ++ * In debug mode it checks that lower priority locks are not held and ++ * increments reiser4_context's lock counters on which lock ordering checking ++ * is based. ++ */ ++static inline void spin_lock_inode(struct inode *inode) ++{ ++ assert("", LOCK_CNT_NIL(spin_locked)); ++ /* check lock ordering */ ++ assert_spin_not_locked(&d_lock); ++ ++ spin_lock(&reiser4_inode_data(inode)->guard); ++ ++ LOCK_CNT_INC(spin_locked_inode); ++ LOCK_CNT_INC(spin_locked); ++ ++ inode_invariant(inode); ++} ++ ++/** ++ * spin_unlock_inode - unlock reiser4_inode' embedded spinlock ++ * @inode: inode to unlock ++ * ++ * In debug mode it checks that spinlock is held and decrements ++ * reiser4_context's lock counters on which lock ordering checking is based. 
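The 32-bit branch above splits a 64-bit oid across i_ino (low half) and oid_hi (high half), with OID_HI_SHIFT = 8 * sizeof(ino_t). A round-trip check of that split, using plain C stand-ins for the inode fields and assuming a 32-bit ino_t:

#include <stdint.h>
#include <stdio.h>

#define OID_HI_SHIFT 32		/* 8 * sizeof(ino_t) with a 32-bit ino_t */

struct split { uint32_t i_ino, oid_hi; };

static void set_oid(struct split *s, uint64_t oid)
{
	s->i_ino  = (uint32_t)oid;			/* low 32 bits */
	s->oid_hi = (uint32_t)(oid >> OID_HI_SHIFT);	/* high 32 bits */
}

static uint64_t get_oid(const struct split *s)
{
	return ((uint64_t)s->oid_hi << OID_HI_SHIFT) | s->i_ino;
}

int main(void)
{
	struct split s;
	set_oid(&s, 0x123456789abcdef0ull);
	printf("%d\n", get_oid(&s) == 0x123456789abcdef0ull);	/* 1 */
	return 0;
}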
++ */ ++static inline void spin_unlock_inode(struct inode *inode) ++{ ++ assert_spin_locked(&reiser4_inode_data(inode)->guard); ++ assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_inode)); ++ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked)); ++ ++ inode_invariant(inode); ++ ++ LOCK_CNT_DEC(spin_locked_inode); ++ LOCK_CNT_DEC(spin_locked); ++ ++ spin_unlock(&reiser4_inode_data(inode)->guard); ++} ++ ++ ++extern znode *inode_get_vroot(struct inode *inode); ++extern void inode_set_vroot(struct inode *inode, znode * vroot); ++ ++extern int reiser4_max_filename_len(const struct inode *inode); ++extern int max_hash_collisions(const struct inode *dir); ++extern void reiser4_unlock_inode(struct inode *inode); ++extern int is_reiser4_inode(const struct inode *inode); ++extern int setup_inode_ops(struct inode *inode, reiser4_object_create_data *); ++extern struct inode *reiser4_iget(struct super_block *super, ++ const reiser4_key * key, int silent); ++extern void reiser4_iget_complete(struct inode *inode); ++extern void inode_set_flag(struct inode *inode, reiser4_file_plugin_flags f); ++extern void inode_clr_flag(struct inode *inode, reiser4_file_plugin_flags f); ++extern int inode_get_flag(const struct inode *inode, ++ reiser4_file_plugin_flags f); ++ ++/* has inode been initialized? */ ++static inline int ++is_inode_loaded(const struct inode *inode /* inode queried */ ) ++{ ++ assert("nikita-1120", inode != NULL); ++ return inode_get_flag(inode, REISER4_LOADED); ++} ++ ++extern file_plugin *inode_file_plugin(const struct inode *inode); ++extern dir_plugin *inode_dir_plugin(const struct inode *inode); ++extern formatting_plugin *inode_formatting_plugin(const struct inode *inode); ++extern hash_plugin *inode_hash_plugin(const struct inode *inode); ++extern fibration_plugin *inode_fibration_plugin(const struct inode *inode); ++extern cipher_plugin *inode_cipher_plugin(const struct inode *inode); ++extern digest_plugin *inode_digest_plugin(const struct inode *inode); ++extern compression_plugin *inode_compression_plugin(const struct inode *inode); ++extern compression_mode_plugin *inode_compression_mode_plugin(const struct inode ++ *inode); ++extern cluster_plugin *inode_cluster_plugin(const struct inode *inode); ++extern regular_plugin *inode_regular_plugin(const struct inode *inode); ++extern item_plugin *inode_sd_plugin(const struct inode *inode); ++extern item_plugin *inode_dir_item_plugin(const struct inode *inode); ++ ++extern void inode_set_plugin(struct inode *inode, ++ reiser4_plugin * plug, pset_member memb); ++extern void reiser4_make_bad_inode(struct inode *inode); ++ ++extern void inode_set_extension(struct inode *inode, sd_ext_bits ext); ++extern void inode_check_scale(struct inode *inode, __u64 old, __u64 new); ++extern void inode_check_scale_nolock(struct inode * inode, __u64 old, __u64 new); ++ ++/* ++ * update field @field in inode @i to contain value @value. ++ */ ++#define INODE_SET_FIELD(i, field, value) \ ++({ \ ++ struct inode *__i; \ ++ typeof(value) __v; \ ++ \ ++ __i = (i); \ ++ __v = (value); \ ++ inode_check_scale(__i, __i->field, __v); \ ++ __i->field = __v; \ ++}) ++ ++#define INODE_INC_FIELD(i, field) \ ++({ \ ++ struct inode *__i; \ ++ \ ++ __i = (i); \ ++ inode_check_scale(__i, __i->field, __i->field + 1); \ ++ ++ __i->field; \ ++}) ++ ++#define INODE_DEC_FIELD(i, field) \ ++({ \ ++ struct inode *__i; \ ++ \ ++ __i = (i); \ ++ inode_check_scale(__i, __i->field, __i->field - 1); \ ++ -- __i->field; \ ++}) ++ ++/* See comment before readdir_common() for description. 
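The INODE_*_FIELD macros above are GCC statement expressions: they evaluate the inode and the value exactly once into locals, run the scale check, and then perform the assignment or increment. A toy with the same shape; TOY_SET_FIELD, check_scale(), and the width threshold are mine, not reiser4's (compile with gcc, since statement expressions are a GNU extension):

#include <stdio.h>

struct toy_inode { unsigned long long i_size; };

static void check_scale(unsigned long long o, unsigned long long n)
{
	/* stand-in for inode_check_scale(): notice when the encoded
	 * width of the field would change */
	if ((o < 0x100) != (n < 0x100))
		printf("stat-data length must be recomputed\n");
}

/* same shape as INODE_SET_FIELD: single evaluation, check, assign */
#define TOY_SET_FIELD(i, field, value)			\
({							\
	struct toy_inode *__i = (i);			\
	unsigned long long __v = (value);		\
	check_scale(__i->field, __v);			\
	__i->field = __v;				\
})

int main(void)
{
	struct toy_inode ino = { 0x80 };
	TOY_SET_FIELD(&ino, i_size, 0x200);	/* crosses the width boundary */
	printf("%llx\n", ino.i_size);		/* 200 */
	return 0;
}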
*/ ++static inline struct list_head *get_readdir_list(const struct inode *inode) ++{ ++ return &reiser4_inode_data(inode)->lists.readdir_list; ++} ++ ++extern void init_inode_ordering(struct inode *inode, ++ reiser4_object_create_data * crd, int create); ++ ++static inline struct radix_tree_root *jnode_tree_by_inode(struct inode *inode) ++{ ++ return &reiser4_inode_data(inode)->jnodes_tree; ++} ++ ++static inline struct radix_tree_root *jnode_tree_by_reiser4_inode(reiser4_inode ++ * r4_inode) ++{ ++ return &r4_inode->jnodes_tree; ++} ++ ++#if REISER4_DEBUG ++extern void print_inode(const char *prefix, const struct inode *i); ++#endif ++ ++int is_dir_empty(const struct inode *); ++ ++/* __REISER4_INODE_H__ */ ++#endif ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +Index: linux-2.6.16/fs/reiser4/ioctl.h +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/ioctl.h +@@ -0,0 +1,41 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++#if !defined( __REISER4_IOCTL_H__ ) ++#define __REISER4_IOCTL_H__ ++ ++#include ++ ++/* ++ * ioctl(2) command used to "unpack" reiser4 file, that is, convert it into ++ * extents and fix in this state. This is used by applications that rely on ++ * ++ * . files being block aligned, and ++ * ++ * . files never migrating on disk ++ * ++ * for example, boot loaders (LILO) need this. ++ * ++ * This ioctl should be used as ++ * ++ * result = ioctl(fd, REISER4_IOC_UNPACK); ++ * ++ * File behind fd descriptor will be converted to the extents (if necessary), ++ * and its stat-data will be updated so that it will never be converted back ++ * into tails again. ++ */ ++#define REISER4_IOC_UNPACK _IOW(0xCD,1,long) ++ ++/* __REISER4_IOCTL_H__ */ ++#endif ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ scroll-step: 1 ++ End: ++*/ +Index: linux-2.6.16/fs/reiser4/jnode.c +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/jnode.c +@@ -0,0 +1,1921 @@ ++/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++/* Jnode manipulation functions. */ ++/* Jnode is entity used to track blocks with data and meta-data in reiser4. ++ ++ In particular, jnodes are used to track transactional information ++ associated with each block. Each znode contains jnode as ->zjnode field. ++ ++ Jnode stands for either Josh or Journal node. ++*/ ++ ++/* ++ * Taxonomy. ++ * ++ * Jnode represents block containing data or meta-data. There are jnodes ++ * for: ++ * ++ * unformatted blocks (jnodes proper). There are plans, however to ++ * have a handle per extent unit rather than per each unformatted ++ * block, because there are so many of them. ++ * ++ * For bitmaps. Each bitmap is actually represented by two jnodes--one ++ * for working and another for "commit" data, together forming bnode. ++ * ++ * For io-heads. These are used by log writer. ++ * ++ * For formatted nodes (znode). See comment at the top of znode.c for ++ * details specific to the formatted nodes (znodes). ++ * ++ * Node data. ++ * ++ * Jnode provides access to the data of node it represents. Data are ++ * stored in a page. Page is kept in a page cache. 
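Backing up briefly to ioctl.h above: user space invokes REISER4_IOC_UNPACK exactly as the header's comment says. A minimal caller; the path is illustrative, and error handling is kept to the bare minimum:

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

#define REISER4_IOC_UNPACK _IOW(0xCD, 1, long)	/* as defined in ioctl.h above */

int main(void)
{
	int fd = open("/boot/vmlinuz", O_RDONLY);	/* illustrative path */
	if (fd < 0)
		return 1;
	/* convert the file to extents and pin it there, for LILO-style
	 * loaders that need block-aligned, non-migrating files */
	if (ioctl(fd, REISER4_IOC_UNPACK) < 0)
		perror("REISER4_IOC_UNPACK");
	close(fd);
	return 0;
}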
This means, that jnodes ++ * are highly interconnected with page cache and VM internals. ++ * ++ * jnode has a pointer to page (->pg) containing its data. Pointer to data ++ * themselves is cached in ->data field to avoid frequent calls to ++ * page_address(). ++ * ++ * jnode and page are attached to each other by jnode_attach_page(). This ++ * function places pointer to jnode in set_page_private(), sets PG_private ++ * flag and increments page counter. ++ * ++ * Opposite operation is performed by page_clear_jnode(). ++ * ++ * jnode->pg is protected by jnode spin lock, and page->private is ++ * protected by page lock. See comment at the top of page_cache.c for ++ * more. ++ * ++ * page can be detached from jnode for two reasons: ++ * ++ * . jnode is removed from a tree (file is truncated, of formatted ++ * node is removed by balancing). ++ * ++ * . during memory pressure, VM calls ->releasepage() method ++ * (reiser4_releasepage()) to evict page from memory. ++ * ++ * (there, of course, is also umount, but this is special case we are not ++ * concerned with here). ++ * ++ * To protect jnode page from eviction, one calls jload() function that ++ * "pins" page in memory (loading it if necessary), increments ++ * jnode->d_count, and kmap()s page. Page is unpinned through call to ++ * jrelse(). ++ * ++ * Jnode life cycle. ++ * ++ * jnode is created, placed in hash table, and, optionally, in per-inode ++ * radix tree. Page can be attached to jnode, pinned, released, etc. ++ * ++ * When jnode is captured into atom its reference counter is ++ * increased. While being part of an atom, jnode can be "early ++ * flushed". This means that as part of flush procedure, jnode is placed ++ * into "relocate set", and its page is submitted to the disk. After io ++ * completes, page can be detached, then loaded again, re-dirtied, etc. ++ * ++ * Thread acquired reference to jnode by calling jref() and releases it by ++ * jput(). When last reference is removed, jnode is still retained in ++ * memory (cached) if it has page attached, _unless_ it is scheduled for ++ * destruction (has JNODE_HEARD_BANSHEE bit set). ++ * ++ * Tree read-write lock was used as "existential" lock for jnodes. That is, ++ * jnode->x_count could be changed from 0 to 1 only under tree write lock, ++ * that is, tree lock protected unreferenced jnodes stored in the hash ++ * table, from recycling. ++ * ++ * This resulted in high contention on tree lock, because jref()/jput() is ++ * frequent operation. To ameliorate this problem, RCU is used: when jput() ++ * is just about to release last reference on jnode it sets JNODE_RIP bit ++ * on it, and then proceed with jnode destruction (removing jnode from hash ++ * table, cbk_cache, detaching page, etc.). All places that change jnode ++ * reference counter from 0 to 1 (jlookup(), zlook(), zget(), and ++ * cbk_cache_scan_slots()) check for JNODE_RIP bit (this is done by ++ * jnode_rip_check() function), and pretend that nothing was found in hash ++ * table if bit is set. ++ * ++ * jput defers actual return of jnode into slab cache to some later time ++ * (by call_rcu()), this guarantees that other threads can safely continue ++ * working with JNODE_RIP-ped jnode. 
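A compressed illustration of the JNODE_RIP protocol described above: readers traverse the hash under rcu_read_lock() and must treat a RIP-marked node as absent, while the thread dropping the last reference marks the node before unhashing it and defers the actual free to an RCU grace period. This C11 sketch shows only the shape of the check; reiser4's jnode_rip_check() and jput() carry considerably more state:

#include <stdatomic.h>
#include <stdio.h>

struct node { atomic_int rip; atomic_int refs; };

/* reader: take a reference, then pretend the node was never found if
 * the releasing thread already set the RIP ("rest in peace") bit */
static struct node *rip_check(struct node *n)
{
	atomic_fetch_add(&n->refs, 1);
	if (atomic_load(&n->rip)) {
		atomic_fetch_sub(&n->refs, 1);	/* dying node: back off */
		return NULL;
	}
	return n;
}

/* writer, on dropping the last reference: mark first, unhash, and only
 * then free, after a grace period, so concurrent readers stay safe */
static void kill_node(struct node *n)
{
	atomic_store(&n->rip, 1);
	/* unhash, detach page, ...; the patch defers the free via call_rcu() */
}

int main(void)
{
	struct node n = { 0, 0 };
	kill_node(&n);
	printf("%s\n", rip_check(&n) ? "found" : "not found");	/* not found */
	return 0;
}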
++ * ++ */ ++ ++#include "reiser4.h" ++#include "debug.h" ++#include "dformat.h" ++#include "jnode.h" ++#include "plugin/plugin_header.h" ++#include "plugin/plugin.h" ++#include "txnmgr.h" ++/*#include "jnode.h"*/ ++#include "znode.h" ++#include "tree.h" ++#include "tree_walk.h" ++#include "super.h" ++#include "inode.h" ++#include "page_cache.h" ++ ++#include /* UML needs this for PAGE_OFFSET */ ++#include ++#include ++#include ++#include /* for vmalloc(), vfree() */ ++#include ++#include /* for struct address_space */ ++#include /* for inode_lock */ ++ ++static kmem_cache_t *_jnode_slab = NULL; ++ ++static void jnode_set_type(jnode * node, jnode_type type); ++static int jdelete(jnode * node); ++static int jnode_try_drop(jnode * node); ++ ++#if REISER4_DEBUG ++static int jnode_invariant(const jnode * node, int tlocked, int jlocked); ++#endif ++ ++/* true if valid page is attached to jnode */ ++static inline int jnode_is_parsed(jnode * node) ++{ ++ return JF_ISSET(node, JNODE_PARSED); ++} ++ ++/* hash table support */ ++ ++/* compare two jnode keys for equality. Used by hash-table macros */ ++static inline int jnode_key_eq(const jnode_key_t * k1, const jnode_key_t * k2) ++{ ++ assert("nikita-2350", k1 != NULL); ++ assert("nikita-2351", k2 != NULL); ++ ++ return (k1->index == k2->index && k1->objectid == k2->objectid); ++} ++ ++/* Hash jnode by its key (inode plus offset). Used by hash-table macros */ ++static inline __u32 ++jnode_key_hashfn(j_hash_table * table, const jnode_key_t * key) ++{ ++ assert("nikita-2352", key != NULL); ++ assert("nikita-3346", IS_POW(table->_buckets)); ++ ++ /* yes, this is a remarkably simple (if not stupid) hash function. */ ++ return (key->objectid + key->index) & (table->_buckets - 1); ++} ++ ++/* The hash table definition */ ++#define KMALLOC(size) vmalloc(size) ++#define KFREE(ptr, size) vfree(ptr) ++TYPE_SAFE_HASH_DEFINE(j, jnode, jnode_key_t, key.j, link.j, jnode_key_hashfn, ++ jnode_key_eq); ++#undef KFREE ++#undef KMALLOC ++ ++/* call this to initialise jnode hash table */ ++int jnodes_tree_init(reiser4_tree * tree /* tree to initialise jnodes for */ ) ++{ ++ assert("nikita-2359", tree != NULL); ++ return j_hash_init(&tree->jhash_table, 16384); ++} ++ ++/* call this to destroy jnode hash table. This is called during umount. */ ++int jnodes_tree_done(reiser4_tree * tree /* tree to destroy jnodes for */ ) ++{ ++ j_hash_table *jtable; ++ jnode *node; ++ jnode *next; ++ ++ assert("nikita-2360", tree != NULL); ++ ++ /* ++ * Scan hash table and free all jnodes. ++ */ ++ jtable = &tree->jhash_table; ++ if (jtable->_table) { ++ for_all_in_htable(jtable, j, node, next) { ++ assert("nikita-2361", !atomic_read(&node->x_count)); ++ jdrop(node); ++ } ++ ++ j_hash_done(&tree->jhash_table); ++ } ++ return 0; ++} ++ ++/** ++ * init_jnodes - create jnode cache ++ * ++ * Initializes the jnode slab cache. It is part of reiser4 module initialization. ++ */ ++int init_jnodes(void) ++{ ++ assert("umka-168", _jnode_slab == NULL); ++ ++ _jnode_slab = kmem_cache_create("jnode", sizeof(jnode), 0, ++ SLAB_HWCACHE_ALIGN | ++ SLAB_RECLAIM_ACCOUNT, NULL, NULL); ++ if (_jnode_slab == NULL) ++ return RETERR(-ENOMEM); ++ ++ return 0; ++} ++ ++/** ++ * done_jnodes - delete jnode cache ++ * ++ * This is called on reiser4 module unloading or system shutdown. ++ */ ++void done_jnodes(void) ++{ ++ destroy_reiser4_cache(&_jnode_slab); ++} ++ ++/* Initialize a jnode.
*/ ++void jnode_init(jnode * node, reiser4_tree * tree, jnode_type type) ++{ ++ assert("umka-175", node != NULL); ++ ++ memset(node, 0, sizeof(jnode)); ++ ON_DEBUG(node->magic = JMAGIC); ++ jnode_set_type(node, type); ++ atomic_set(&node->d_count, 0); ++ atomic_set(&node->x_count, 0); ++ spin_lock_init(&node->guard); ++ spin_lock_init(&node->load); ++ node->atom = NULL; ++ node->tree = tree; ++ INIT_LIST_HEAD(&node->capture_link); ++ ++ ASSIGN_NODE_LIST(node, NOT_CAPTURED); ++ ++ INIT_RCU_HEAD(&node->rcu); ++ ++#if REISER4_DEBUG ++ { ++ reiser4_super_info_data *sbinfo; ++ ++ sbinfo = get_super_private(tree->super); ++ spin_lock_irq(&sbinfo->all_guard); ++ list_add(&node->jnodes, &sbinfo->all_jnodes); ++ spin_unlock_irq(&sbinfo->all_guard); ++ } ++#endif ++} ++ ++#if REISER4_DEBUG ++/* ++ * Remove jnode from ->all_jnodes list. ++ */ ++static void jnode_done(jnode * node, reiser4_tree * tree) ++{ ++ reiser4_super_info_data *sbinfo; ++ ++ sbinfo = get_super_private(tree->super); ++ ++ spin_lock_irq(&sbinfo->all_guard); ++ assert("nikita-2422", !list_empty(&node->jnodes)); ++ list_del_init(&node->jnodes); ++ spin_unlock_irq(&sbinfo->all_guard); ++} ++#endif ++ ++/* return already existing jnode of page */ ++jnode *jnode_by_page(struct page *pg) ++{ ++ assert("nikita-2066", pg != NULL); ++ assert("nikita-2400", PageLocked(pg)); ++ assert("nikita-2068", PagePrivate(pg)); ++ assert("nikita-2067", jprivate(pg) != NULL); ++ return jprivate(pg); ++} ++ ++/* exported functions to allocate/free jnode objects outside this file */ ++jnode *jalloc(void) ++{ ++ jnode *jal = kmem_cache_alloc(_jnode_slab, get_gfp_mask()); ++ return jal; ++} ++ ++/* return jnode back to the slab allocator */ ++inline void jfree(jnode * node) ++{ ++ assert("zam-449", node != NULL); ++ ++ assert("nikita-2663", (list_empty_careful(&node->capture_link) && ++ NODE_LIST(node) == NOT_CAPTURED)); ++ assert("nikita-3222", list_empty(&node->jnodes)); ++ assert("nikita-3221", jnode_page(node) == NULL); ++ ++ /* not yet phash_jnode_destroy(node); */ ++ ++ kmem_cache_free(_jnode_slab, node); ++} ++ ++/* ++ * This function is supplied as RCU callback. It actually frees jnode when ++ * last reference to it is gone. ++ */ ++static void jnode_free_actor(struct rcu_head *head) ++{ ++ jnode *node; ++ jnode_type jtype; ++ ++ node = container_of(head, jnode, rcu); ++ jtype = jnode_get_type(node); ++ ++ ON_DEBUG(jnode_done(node, jnode_get_tree(node))); ++ ++ switch (jtype) { ++ case JNODE_IO_HEAD: ++ case JNODE_BITMAP: ++ case JNODE_UNFORMATTED_BLOCK: ++ jfree(node); ++ break; ++ case JNODE_FORMATTED_BLOCK: ++ zfree(JZNODE(node)); ++ break; ++ case JNODE_INODE: ++ default: ++ wrong_return_value("nikita-3197", "Wrong jnode type"); ++ } ++} ++ ++/* ++ * Free a jnode. Post a callback to be executed later through RCU when all ++ * references to @node are released. 
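jnode_free_actor() above dispatches on the concrete jnode type to pick the right deallocator (jfree() for plain jnodes, zfree() for znodes). A small standalone sketch of that tagged-dispatch free, with invented types in place of reiser4's:

#include <stdio.h>
#include <stdlib.h>

enum ntype { NT_PLAIN, NT_WRAPPED };

struct node { enum ntype type; };
struct wrapped { struct node n; char *extra; };

/* free through the right deallocator for the concrete type, as
 * jnode_free_actor() does for unformatted jnodes vs. znodes */
static void node_free(struct node *n)
{
	switch (n->type) {
	case NT_PLAIN:
		free(n);
		break;
	case NT_WRAPPED: {
		struct wrapped *w = (struct wrapped *)n;
		free(w->extra);
		free(w);
		break;
	}
	default:
		fprintf(stderr, "wrong node type\n");	/* cf. wrong_return_value() */
	}
}

int main(void)
{
	struct wrapped *w = malloc(sizeof(*w));
	w->n.type = NT_WRAPPED;
	w->extra = malloc(8);
	node_free(&w->n);
	return 0;
}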
++ */ ++static inline void jnode_free(jnode * node, jnode_type jtype) ++{ ++ if (jtype != JNODE_INODE) { ++ /*assert("nikita-3219", list_empty(&node->rcu.list)); */ ++ call_rcu(&node->rcu, jnode_free_actor); ++ } else ++ jnode_list_remove(node); ++} ++ ++/* allocate new unformatted jnode */ ++static jnode *jnew_unformatted(void) ++{ ++ jnode *jal; ++ ++ jal = jalloc(); ++ if (jal == NULL) ++ return NULL; ++ ++ jnode_init(jal, current_tree, JNODE_UNFORMATTED_BLOCK); ++ jal->key.j.mapping = NULL; ++ jal->key.j.index = (unsigned long)-1; ++ jal->key.j.objectid = 0; ++ return jal; ++} ++ ++/* look for jnode with given mapping and offset within hash table */ ++jnode *jlookup(reiser4_tree * tree, oid_t objectid, unsigned long index) ++{ ++ jnode_key_t jkey; ++ jnode *node; ++ ++ assert("nikita-2353", tree != NULL); ++ ++ jkey.objectid = objectid; ++ jkey.index = index; ++ ++ /* ++ * hash table is _not_ protected by any lock during lookups. All we ++ * have to do is to disable preemption to keep RCU happy. ++ */ ++ ++ rcu_read_lock(); ++ node = j_hash_find(&tree->jhash_table, &jkey); ++ if (node != NULL) { ++ /* protect @node from recycling */ ++ jref(node); ++ assert("nikita-2955", jnode_invariant(node, 0, 0)); ++ node = jnode_rip_check(tree, node); ++ } ++ rcu_read_unlock(); ++ return node; ++} ++ ++/* per inode radix tree of jnodes is protected by tree's read write spin lock */ ++static jnode *jfind_nolock(struct address_space *mapping, unsigned long index) ++{ ++ assert("vs-1694", mapping->host != NULL); ++ ++ return radix_tree_lookup(jnode_tree_by_inode(mapping->host), index); ++} ++ ++jnode *jfind(struct address_space * mapping, unsigned long index) ++{ ++ reiser4_tree *tree; ++ jnode *node; ++ ++ assert("vs-1694", mapping->host != NULL); ++ tree = tree_by_inode(mapping->host); ++ ++ read_lock_tree(tree); ++ node = jfind_nolock(mapping, index); ++ if (node != NULL) ++ jref(node); ++ read_unlock_tree(tree); ++ return node; ++} ++ ++static void inode_attach_jnode(jnode * node) ++{ ++ struct inode *inode; ++ reiser4_inode *info; ++ struct radix_tree_root *rtree; ++ ++ assert_rw_write_locked(&(jnode_get_tree(node)->tree_lock)); ++ assert("zam-1043", node->key.j.mapping != NULL); ++ inode = node->key.j.mapping->host; ++ info = reiser4_inode_data(inode); ++ rtree = jnode_tree_by_reiser4_inode(info); ++ if (rtree->rnode == NULL) { ++ /* prevent inode from being pruned when it has jnodes attached ++ to it */ ++ write_lock_irq(&inode->i_data.tree_lock); ++ inode->i_data.nrpages++; ++ write_unlock_irq(&inode->i_data.tree_lock); ++ } ++ assert("zam-1049", equi(rtree->rnode != NULL, info->nr_jnodes != 0)); ++ check_me("zam-1045", ++ !radix_tree_insert(rtree, node->key.j.index, node)); ++ ON_DEBUG(info->nr_jnodes++); ++} ++ ++static void inode_detach_jnode(jnode * node) ++{ ++ struct inode *inode; ++ reiser4_inode *info; ++ struct radix_tree_root *rtree; ++ ++ assert_rw_write_locked(&(jnode_get_tree(node)->tree_lock)); ++ assert("zam-1044", node->key.j.mapping != NULL); ++ inode = node->key.j.mapping->host; ++ info = reiser4_inode_data(inode); ++ rtree = jnode_tree_by_reiser4_inode(info); ++ ++ assert("zam-1051", info->nr_jnodes != 0); ++ assert("zam-1052", rtree->rnode != NULL); ++ ON_DEBUG(info->nr_jnodes--); ++ ++ /* delete jnode from inode's radix tree of jnodes */ ++ check_me("zam-1046", radix_tree_delete(rtree, node->key.j.index)); ++ if (rtree->rnode == NULL) { ++ /* inode can be pruned now */ ++ write_lock_irq(&inode->i_data.tree_lock); ++ inode->i_data.nrpages--; ++ 
write_unlock_irq(&inode->i_data.tree_lock); ++ } ++} ++ ++/* put jnode into hash table (where they can be found by flush who does not know ++ mapping) and to inode's tree of jnodes (where they can be found (hopefully ++ faster) in places where mapping is known). Currently it is used by ++ fs/reiser4/plugin/item/extent_file_ops.c:index_extent_jnode when new jnode is ++ created */ ++static void ++hash_unformatted_jnode(jnode * node, struct address_space *mapping, ++ unsigned long index) ++{ ++ j_hash_table *jtable; ++ ++ assert("vs-1446", jnode_is_unformatted(node)); ++ assert("vs-1442", node->key.j.mapping == 0); ++ assert("vs-1443", node->key.j.objectid == 0); ++ assert("vs-1444", node->key.j.index == (unsigned long)-1); ++ assert_rw_write_locked(&(jnode_get_tree(node)->tree_lock)); ++ ++ node->key.j.mapping = mapping; ++ node->key.j.objectid = get_inode_oid(mapping->host); ++ node->key.j.index = index; ++ ++ jtable = &jnode_get_tree(node)->jhash_table; ++ ++ /* race with some other thread inserting jnode into the hash table is ++ * impossible, because we keep the page lock. */ ++ /* ++ * following assertion no longer holds because of RCU: it is possible ++ * jnode is in the hash table, but with JNODE_RIP bit set. ++ */ ++ /* assert("nikita-3211", j_hash_find(jtable, &node->key.j) == NULL); */ ++ j_hash_insert_rcu(jtable, node); ++ inode_attach_jnode(node); ++} ++ ++static void unhash_unformatted_node_nolock(jnode * node) ++{ ++ assert("vs-1683", node->key.j.mapping != NULL); ++ assert("vs-1684", ++ node->key.j.objectid == ++ get_inode_oid(node->key.j.mapping->host)); ++ ++ /* remove jnode from hash-table */ ++ j_hash_remove_rcu(&node->tree->jhash_table, node); ++ inode_detach_jnode(node); ++ node->key.j.mapping = NULL; ++ node->key.j.index = (unsigned long)-1; ++ node->key.j.objectid = 0; ++ ++} ++ ++/* remove jnode from hash table and from inode's tree of jnodes. This is used in ++ reiser4_invalidatepage and in kill_hook_extent -> truncate_inode_jnodes -> ++ uncapture_jnode */ ++void unhash_unformatted_jnode(jnode * node) ++{ ++ assert("vs-1445", jnode_is_unformatted(node)); ++ ++ write_lock_tree(node->tree); ++ unhash_unformatted_node_nolock(node); ++ write_unlock_tree(node->tree); ++} ++ ++/* ++ * search hash table for a jnode with given oid and index. If not found, ++ * allocate new jnode, insert it, and also insert into radix tree for the ++ * given inode/mapping. ++ */ ++jnode *find_get_jnode(reiser4_tree * tree, struct address_space *mapping, ++ oid_t oid, unsigned long index) ++{ ++ jnode *result; ++ jnode *shadow; ++ int preload; ++ ++ result = jnew_unformatted(); ++ ++ if (unlikely(result == NULL)) ++ return ERR_PTR(RETERR(-ENOMEM)); ++ ++ preload = radix_tree_preload(get_gfp_mask()); ++ if (preload != 0) ++ return ERR_PTR(preload); ++ ++ write_lock_tree(tree); ++ shadow = jfind_nolock(mapping, index); ++ if (likely(shadow == NULL)) { ++ /* add new jnode to hash table and inode's radix tree of jnodes */ ++ jref(result); ++ hash_unformatted_jnode(result, mapping, index); ++ } else { ++ /* jnode is found in inode's radix tree of jnodes */ ++ jref(shadow); ++ jnode_free(result, JNODE_UNFORMATTED_BLOCK); ++ assert("vs-1498", shadow->key.j.mapping == mapping); ++ result = shadow; ++ } ++ write_unlock_tree(tree); ++ ++ assert("nikita-2955", ++ ergo(result != NULL, jnode_invariant(result, 0, 0))); ++ radix_tree_preload_end(); ++ return result; ++} ++ ++/* jget() (a la zget() but for unformatted nodes). Returns (and possibly ++ creates) jnode corresponding to page @pg. 
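find_get_jnode() above follows a useful concurrency discipline: everything that can fail or sleep (the candidate allocation, the radix tree preload) happens before the tree lock is taken, so the insert under the lock cannot fail, and a losing racer simply discards its candidate. A userspace sketch of that allocate-before-lock shape; the slot array and names are toy stand-ins:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER;
static int *slots[64];			/* toy stand-in for the radix tree */

/* mirrors find_get_jnode(): allocate the candidate *before* taking the
 * lock, then either publish it or discard it if a shadow already won */
static int *find_get(unsigned idx)
{
	int *candidate = calloc(1, sizeof(*candidate));
	int *result;

	if (candidate == NULL)
		return NULL;		/* fail early; lock never taken */

	pthread_mutex_lock(&tree_lock);
	if (slots[idx % 64] == NULL) {
		slots[idx % 64] = candidate;	/* we inserted first */
		result = candidate;
	} else {
		result = slots[idx % 64];	/* somebody beat us to it */
		free(candidate);
	}
	pthread_mutex_unlock(&tree_lock);
	return result;
}

int main(void)
{
	printf("%d\n", find_get(5) == find_get(5));	/* 1: second call reuses */
	return 0;
}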
jnode is attached to page and ++ inserted into jnode hash-table. */ ++static jnode *do_jget(reiser4_tree * tree, struct page *pg) ++{ ++ /* ++ * There are two ways to create jnode: starting with pre-existing page ++ * and without page. ++ * ++ * When page already exists, jnode is created ++ * (jnode_of_page()->do_jget()) under page lock. This is done in ++ * ->writepage(), or when capturing anonymous page dirtied through ++ * mmap. ++ * ++ * Jnode without page is created by index_extent_jnode(). ++ * ++ */ ++ ++ jnode *result; ++ oid_t oid = get_inode_oid(pg->mapping->host); ++ ++ assert("umka-176", pg != NULL); ++ assert("nikita-2394", PageLocked(pg)); ++ ++ result = jprivate(pg); ++ if (likely(result != NULL)) ++ return jref(result); ++ ++ tree = tree_by_page(pg); ++ ++ /* check hash-table first */ ++ result = jfind(pg->mapping, pg->index); ++ if (unlikely(result != NULL)) { ++ spin_lock_jnode(result); ++ jnode_attach_page(result, pg); ++ spin_unlock_jnode(result); ++ result->key.j.mapping = pg->mapping; ++ return result; ++ } ++ ++ result = find_get_jnode(tree, pg->mapping, oid, pg->index); ++ if (unlikely(IS_ERR(result))) ++ return result; ++ /* attach jnode to page */ ++ spin_lock_jnode(result); ++ jnode_attach_page(result, pg); ++ spin_unlock_jnode(result); ++ return result; ++} ++ ++/* ++ * return jnode for @pg, creating it if necessary. ++ */ ++jnode *jnode_of_page(struct page * pg) ++{ ++ jnode *result; ++ ++ assert("umka-176", pg != NULL); ++ assert("nikita-2394", PageLocked(pg)); ++ ++ result = do_jget(tree_by_page(pg), pg); ++ ++ if (REISER4_DEBUG && !IS_ERR(result)) { ++ assert("nikita-3210", result == jprivate(pg)); ++ assert("nikita-2046", jnode_page(jprivate(pg)) == pg); ++ if (jnode_is_unformatted(jprivate(pg))) { ++ assert("nikita-2364", ++ jprivate(pg)->key.j.index == pg->index); ++ assert("nikita-2367", ++ jprivate(pg)->key.j.mapping == pg->mapping); ++ assert("nikita-2365", ++ jprivate(pg)->key.j.objectid == ++ get_inode_oid(pg->mapping->host)); ++ assert("vs-1200", ++ jprivate(pg)->key.j.objectid == ++ pg->mapping->host->i_ino); ++ assert("nikita-2356", ++ jnode_is_unformatted(jnode_by_page(pg))); ++ } ++ assert("nikita-2956", jnode_invariant(jprivate(pg), 0, 0)); ++ } ++ return result; ++} ++ ++/* attach page to jnode: set ->pg pointer in jnode, and ->private one in the ++ * page.*/ ++void jnode_attach_page(jnode * node, struct page *pg) ++{ ++ assert("nikita-2060", node != NULL); ++ assert("nikita-2061", pg != NULL); ++ ++ assert("nikita-2050", jprivate(pg) == 0ul); ++ assert("nikita-2393", !PagePrivate(pg)); ++ assert("vs-1741", node->pg == NULL); ++ ++ assert("nikita-2396", PageLocked(pg)); ++ assert_spin_locked(&(node->guard)); ++ ++ page_cache_get(pg); ++ set_page_private(pg, (unsigned long)node); ++ node->pg = pg; ++ SetPagePrivate(pg); ++} ++ ++/* Dual to jnode_attach_page: break a binding between page and jnode */ ++void page_clear_jnode(struct page *page, jnode * node) ++{ ++ assert("nikita-2424", page != NULL); ++ assert("nikita-2425", PageLocked(page)); ++ assert("nikita-2426", node != NULL); ++ assert_spin_locked(&(node->guard)); ++ assert("nikita-2428", PagePrivate(page)); ++ ++ assert("nikita-3551", !PageWriteback(page)); ++ ++ JF_CLR(node, JNODE_PARSED); ++ set_page_private(page, 0ul); ++ ClearPagePrivate(page); ++ node->pg = NULL; ++ page_cache_release(page); ++} ++ ++/* it is only used in one place to handle error */ ++void ++page_detach_jnode(struct page *page, struct address_space *mapping, ++ unsigned long index) ++{ ++ assert("nikita-2395", page 
!= NULL); ++ ++ lock_page(page); ++ if ((page->mapping == mapping) && (page->index == index) ++ && PagePrivate(page)) { ++ jnode *node; ++ ++ node = jprivate(page); ++ spin_lock_jnode(node); ++ page_clear_jnode(page, node); ++ spin_unlock_jnode(node); ++ } ++ unlock_page(page); ++} ++ ++/* return @node page locked. ++ ++ Locking ordering requires that one first takes page lock and afterwards ++ spin lock on node attached to this page. Sometimes it is necessary to go in ++ the opposite direction. This is done through standard trylock-and-release ++ loop. ++*/ ++static struct page *jnode_lock_page(jnode * node) ++{ ++ struct page *page; ++ ++ assert("nikita-2052", node != NULL); ++ assert("nikita-2401", LOCK_CNT_NIL(spin_locked_jnode)); ++ ++ while (1) { ++ ++ spin_lock_jnode(node); ++ page = jnode_page(node); ++ if (page == NULL) { ++ break; ++ } ++ ++ /* no need to page_cache_get( page ) here, because page cannot ++ be evicted from memory without detaching it from jnode and ++ this requires spin lock on jnode that we already hold. ++ */ ++ if (!TestSetPageLocked(page)) { ++ /* We won a lock on jnode page, proceed. */ ++ break; ++ } ++ ++ /* Page is locked by someone else. */ ++ page_cache_get(page); ++ spin_unlock_jnode(node); ++ wait_on_page_locked(page); ++ /* it is possible that page was detached from jnode and ++ returned to the free pool, or re-assigned while we were ++ waiting on locked bit. This will be rechecked on the next ++ loop iteration. ++ */ ++ page_cache_release(page); ++ ++ /* try again */ ++ } ++ return page; ++} ++ ++/* ++ * is JNODE_PARSED bit is not set, call ->parse() method of jnode, to verify ++ * validness of jnode content. ++ */ ++static inline int jparse(jnode * node) ++{ ++ int result; ++ ++ assert("nikita-2466", node != NULL); ++ ++ spin_lock_jnode(node); ++ if (likely(!jnode_is_parsed(node))) { ++ result = jnode_ops(node)->parse(node); ++ if (likely(result == 0)) ++ JF_SET(node, JNODE_PARSED); ++ } else ++ result = 0; ++ spin_unlock_jnode(node); ++ return result; ++} ++ ++/* Lock a page attached to jnode, create and attach page to jnode if it had no ++ * one. */ ++struct page *jnode_get_page_locked(jnode * node, gfp_t gfp_flags) ++{ ++ struct page *page; ++ ++ spin_lock_jnode(node); ++ page = jnode_page(node); ++ ++ if (page == NULL) { ++ spin_unlock_jnode(node); ++ page = find_or_create_page(jnode_get_mapping(node), ++ jnode_get_index(node), gfp_flags); ++ if (page == NULL) ++ return ERR_PTR(RETERR(-ENOMEM)); ++ } else { ++ if (!TestSetPageLocked(page)) { ++ spin_unlock_jnode(node); ++ return page; ++ } ++ page_cache_get(page); ++ spin_unlock_jnode(node); ++ lock_page(page); ++ assert("nikita-3134", page->mapping == jnode_get_mapping(node)); ++ } ++ ++ spin_lock_jnode(node); ++ if (!jnode_page(node)) ++ jnode_attach_page(node, page); ++ spin_unlock_jnode(node); ++ ++ page_cache_release(page); ++ assert("zam-894", jnode_page(node) == page); ++ return page; ++} ++ ++/* Start read operation for jnode's page if page is not up-to-date. */ ++static int jnode_start_read(jnode * node, struct page *page) ++{ ++ assert("zam-893", PageLocked(page)); ++ ++ if (PageUptodate(page)) { ++ unlock_page(page); ++ return 0; ++ } ++ return page_io(page, node, READ, get_gfp_mask()); ++} ++ ++#if REISER4_DEBUG ++static void check_jload(jnode * node, struct page *page) ++{ ++ if (jnode_is_znode(node)) { ++ node40_header *nh; ++ znode *z; ++ ++ z = JZNODE(node); ++ if (znode_is_any_locked(z)) { ++ nh = (node40_header *) kmap(page); ++ /* this only works for node40-only file systems. 
For ++ * debugging. */ ++ assert("nikita-3253", ++ z->nr_items == le16_to_cpu(get_unaligned(&nh->nr_items))); ++ kunmap(page); ++ } ++ assert("nikita-3565", znode_invariant(z)); ++ } ++} ++#else ++#define check_jload(node, page) noop ++#endif ++ ++/* prefetch jnode to speed up next call to jload. Call this when you are going ++ * to call jload() shortly. This will bring appropriate portion of jnode into ++ * CPU cache. */ ++void jload_prefetch(jnode * node) ++{ ++ prefetchw(&node->x_count); ++} ++ ++/* load jnode's data into memory */ ++int jload_gfp(jnode * node /* node to load */ , ++ gfp_t gfp_flags /* allocation flags */ , ++ int do_kmap /* true if page should be kmapped */ ) ++{ ++ struct page *page; ++ int result = 0; ++ int parsed; ++ ++ assert("nikita-3010", schedulable()); ++ ++ prefetchw(&node->pg); ++ ++ /* taking d-reference implies taking x-reference. */ ++ jref(node); ++ ++ /* ++ * acquiring d-reference to @jnode and check for JNODE_PARSED bit ++ * should be atomic, otherwise there is a race against ++ * reiser4_releasepage(). ++ */ ++ spin_lock(&(node->load)); ++ add_d_ref(node); ++ parsed = jnode_is_parsed(node); ++ spin_unlock(&(node->load)); ++ ++ if (unlikely(!parsed)) { ++ page = jnode_get_page_locked(node, gfp_flags); ++ if (unlikely(IS_ERR(page))) { ++ result = PTR_ERR(page); ++ goto failed; ++ } ++ ++ result = jnode_start_read(node, page); ++ if (unlikely(result != 0)) ++ goto failed; ++ ++ wait_on_page_locked(page); ++ if (unlikely(!PageUptodate(page))) { ++ result = RETERR(-EIO); ++ goto failed; ++ } ++ ++ if (do_kmap) ++ node->data = kmap(page); ++ ++ result = jparse(node); ++ if (unlikely(result != 0)) { ++ if (do_kmap) ++ kunmap(page); ++ goto failed; ++ } ++ check_jload(node, page); ++ } else { ++ page = jnode_page(node); ++ check_jload(node, page); ++ if (do_kmap) ++ node->data = kmap(page); ++ } ++ ++ if (!is_writeout_mode()) ++ /* We do not mark pages active if jload is called as a part of ++ * jnode_flush() or reiser4_write_logs(). Both jnode_flush() ++ * and write_logs() add no value to cached data, there is no ++ * sense to mark pages as active when they go to disk, it just ++ * confuses vm scanning routines because clean page could be ++ * moved out from inactive list as a result of this ++ * mark_page_accessed() call. */ ++ mark_page_accessed(page); ++ ++ return 0; ++ ++ failed: ++ jrelse_tail(node); ++ return result; ++ ++} ++ ++/* start asynchronous reading for given jnode's page. */ ++int jstartio(jnode * node) ++{ ++ struct page *page; ++ ++ page = jnode_get_page_locked(node, get_gfp_mask()); ++ if (IS_ERR(page)) ++ return PTR_ERR(page); ++ ++ return jnode_start_read(node, page); ++} ++ ++/* Initialize a node by calling appropriate plugin instead of reading ++ * node from disk as in jload(). 
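++ *
++ * (Editorial sketch, not part of the patch: a hypothetical caller that
++ * creates a node from scratch would pair it with jrelse(), e.g.
++ *
++ *	if (jinit_new(node, get_gfp_mask()) == 0) {
++ *		... fill the node through jdata(node) ...
++ *		jrelse(node);
++ *	}
++ *
++ * while nodes that already exist on disk go through jload()/jrelse().)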
*/ ++int jinit_new(jnode * node, gfp_t gfp_flags) ++{ ++ struct page *page; ++ int result; ++ ++ jref(node); ++ add_d_ref(node); ++ ++ page = jnode_get_page_locked(node, gfp_flags); ++ if (IS_ERR(page)) { ++ result = PTR_ERR(page); ++ goto failed; ++ } ++ ++ SetPageUptodate(page); ++ unlock_page(page); ++ ++ node->data = kmap(page); ++ ++ if (!jnode_is_parsed(node)) { ++ jnode_plugin *jplug = jnode_ops(node); ++ spin_lock_jnode(node); ++ result = jplug->init(node); ++ spin_unlock_jnode(node); ++ if (result) { ++ kunmap(page); ++ goto failed; ++ } ++ JF_SET(node, JNODE_PARSED); ++ } ++ ++ return 0; ++ ++ failed: ++ jrelse(node); ++ return result; ++} ++ ++/* release a reference to jnode acquired by jload(), decrement ->d_count */ ++void jrelse_tail(jnode * node /* jnode to release references to */ ) ++{ ++ assert("nikita-489", atomic_read(&node->d_count) > 0); ++ atomic_dec(&node->d_count); ++ /* release reference acquired in jload_gfp() or jinit_new() */ ++ jput(node); ++ if (jnode_is_unformatted(node) || jnode_is_znode(node)) ++ LOCK_CNT_DEC(d_refs); ++} ++ ++/* drop reference to node data. When last reference is dropped, data are ++ unloaded. */ ++void jrelse(jnode * node /* jnode to release references to */ ) ++{ ++ struct page *page; ++ ++ assert("nikita-487", node != NULL); ++ assert_spin_not_locked(&(node->guard)); ++ ++ page = jnode_page(node); ++ if (likely(page != NULL)) { ++ /* ++ * it is safe not to lock jnode here, because at this point ++ * @node->d_count is greater than zero (if jrelse() is used ++ * correctly, that is). JNODE_PARSED may be not set yet, if, ++ * for example, we got here as a result of error handling path ++ * in jload(). Anyway, page cannot be detached by ++ * reiser4_releasepage(). truncate will invalidate page ++ * regardless, but this should not be a problem. ++ */ ++ kunmap(page); ++ } ++ jrelse_tail(node); ++} ++ ++/* called from jput() to wait for io completion */ ++static void jnode_finish_io(jnode * node) ++{ ++ struct page *page; ++ ++ assert("nikita-2922", node != NULL); ++ ++ spin_lock_jnode(node); ++ page = jnode_page(node); ++ if (page != NULL) { ++ page_cache_get(page); ++ spin_unlock_jnode(node); ++ wait_on_page_writeback(page); ++ page_cache_release(page); ++ } else ++ spin_unlock_jnode(node); ++} ++ ++/* ++ * This is called by jput() when last reference to jnode is released. This is ++ * separate function, because we want fast path of jput() to be inline and, ++ * therefore, small. ++ */ ++void jput_final(jnode * node) ++{ ++ int r_i_p; ++ ++ /* A fast check for keeping node in cache. We always keep node in cache ++ * if its page is present and node was not marked for deletion */ ++ if (jnode_page(node) != NULL && !JF_ISSET(node, JNODE_HEARD_BANSHEE)) { ++ rcu_read_unlock(); ++ return; ++ } ++ assert("edward-1432", node->page_count == 0); ++ ++ r_i_p = !JF_TEST_AND_SET(node, JNODE_RIP); ++ /* ++ * if r_i_p is true, we were first to set JNODE_RIP on this node. In ++ * this case it is safe to access node after unlock. ++ */ ++ rcu_read_unlock(); ++ if (r_i_p) { ++ jnode_finish_io(node); ++ if (JF_ISSET(node, JNODE_HEARD_BANSHEE)) ++ /* node is removed from the tree. 
*/
++			jdelete(node);
++		else
++			jnode_try_drop(node);
++	}
++	/* if !r_i_p some other thread is already killing it */
++}
++
++int jwait_io(jnode * node, int rw)
++{
++	struct page *page;
++	int result;
++
++	assert("zam-447", node != NULL);
++	assert("zam-448", jnode_page(node) != NULL);
++
++	page = jnode_page(node);
++
++	result = 0;
++	if (rw == READ) {
++		wait_on_page_locked(page);
++	} else {
++		assert("nikita-2227", rw == WRITE);
++		wait_on_page_writeback(page);
++	}
++	if (PageError(page))
++		result = RETERR(-EIO);
++
++	return result;
++}
++
++/*
++ * jnode types and plugins.
++ *
++ * jnode by itself is a "base type". There are several different jnode
++ * flavors, called "jnode types" (see jnode_type for a list). Sometimes code
++ * has to do different things based on jnode type. In the standard reiser4 way
++ * this is done by having a jnode plugin (see fs/reiser4/plugin.h:jnode_plugin).
++ *
++ * Functions below deal with jnode types and define methods of jnode plugin.
++ *
++ */
++
++/* set jnode type. This is done during jnode initialization. */
++static void jnode_set_type(jnode * node, jnode_type type)
++{
++	static unsigned long type_to_mask[] = {
++		[JNODE_UNFORMATTED_BLOCK] = 1,
++		[JNODE_FORMATTED_BLOCK] = 0,
++		[JNODE_BITMAP] = 2,
++		[JNODE_IO_HEAD] = 6,
++		[JNODE_INODE] = 4
++	};
++
++	assert("zam-647", type < LAST_JNODE_TYPE);
++	assert("nikita-2815", !jnode_is_loaded(node));
++	assert("nikita-3386", node->state == 0);
++
++	node->state |= (type_to_mask[type] << JNODE_TYPE_1);
++}
++
++/* ->init() method of jnode plugin for jnodes that don't require plugin
++ * specific initialization. */
++static int init_noinit(jnode * node UNUSED_ARG)
++{
++	return 0;
++}
++
++/* ->parse() method of jnode plugin for jnodes that don't require plugin
++ * specific parsing.
*/ ++static int parse_noparse(jnode * node UNUSED_ARG) ++{ ++ return 0; ++} ++ ++/* ->mapping() method for unformatted jnode */ ++struct address_space *mapping_jnode(const jnode * node) ++{ ++ struct address_space *map; ++ ++ assert("nikita-2713", node != NULL); ++ ++ /* mapping is stored in jnode */ ++ ++ map = node->key.j.mapping; ++ assert("nikita-2714", map != NULL); ++ assert("nikita-2897", is_reiser4_inode(map->host)); ++ assert("nikita-2715", get_inode_oid(map->host) == node->key.j.objectid); ++ return map; ++} ++ ++/* ->index() method for unformatted jnodes */ ++unsigned long index_jnode(const jnode * node) ++{ ++ /* index is stored in jnode */ ++ return node->key.j.index; ++} ++ ++/* ->remove() method for unformatted jnodes */ ++static inline void remove_jnode(jnode * node, reiser4_tree * tree) ++{ ++ /* remove jnode from hash table and radix tree */ ++ if (node->key.j.mapping) ++ unhash_unformatted_node_nolock(node); ++} ++ ++/* ->mapping() method for znodes */ ++static struct address_space *mapping_znode(const jnode * node) ++{ ++ /* all znodes belong to fake inode */ ++ return get_super_fake(jnode_get_tree(node)->super)->i_mapping; ++} ++ ++/* ->index() method for znodes */ ++static unsigned long index_znode(const jnode * node) ++{ ++ unsigned long addr; ++ assert("nikita-3317", (1 << znode_shift_order) < sizeof(znode)); ++ ++ /* index of znode is just its address (shifted) */ ++ addr = (unsigned long)node; ++ return (addr - PAGE_OFFSET) >> znode_shift_order; ++} ++ ++/* ->mapping() method for bitmap jnode */ ++static struct address_space *mapping_bitmap(const jnode * node) ++{ ++ /* all bitmap blocks belong to special bitmap inode */ ++ return get_super_private(jnode_get_tree(node)->super)->bitmap-> ++ i_mapping; ++} ++ ++/* ->index() method for jnodes that are indexed by address */ ++static unsigned long index_is_address(const jnode * node) ++{ ++ unsigned long ind; ++ ++ ind = (unsigned long)node; ++ return ind - PAGE_OFFSET; ++} ++ ++/* resolve race with jput */ ++jnode *jnode_rip_sync(reiser4_tree *tree, jnode *node) ++{ ++ /* ++ * This is used as part of RCU-based jnode handling. ++ * ++ * jlookup(), zlook(), zget(), and cbk_cache_scan_slots() have to work ++ * with unreferenced jnodes (ones with ->x_count == 0). Hash table is ++ * not protected during this, so concurrent thread may execute ++ * zget-set-HEARD_BANSHEE-zput, or somehow else cause jnode to be ++ * freed in jput_final(). To avoid such races, jput_final() sets ++ * JNODE_RIP on jnode (under tree lock). All places that work with ++ * unreferenced jnodes call this function. It checks for JNODE_RIP bit ++ * (first without taking tree lock), and if this bit is set, released ++ * reference acquired by the current thread and returns NULL. ++ * ++ * As a result, if jnode is being concurrently freed, NULL is returned ++ * and caller should pretend that jnode wasn't found in the first ++ * place. ++ * ++ * Otherwise it's safe to release "rcu-read-lock" and continue with ++ * jnode. 
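++	 *
++	 * (Editorial note: the canonical lookup pattern this supports is
++	 * the one jlookup() uses, i.e.
++	 *
++	 *	rcu_read_lock();
++	 *	node = j_hash_find(&tree->jhash_table, &jkey);
++	 *	if (node != NULL) {
++	 *		jref(node);
++	 *		node = jnode_rip_check(tree, node);
++	 *	}
++	 *	rcu_read_unlock();
++	 *
++	 * after which node is either NULL or safely referenced.)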
++ */ ++ if (unlikely(JF_ISSET(node, JNODE_RIP))) { ++ read_lock_tree(tree); ++ if (JF_ISSET(node, JNODE_RIP)) { ++ dec_x_ref(node); ++ node = NULL; ++ } ++ read_unlock_tree(tree); ++ } ++ return node; ++} ++ ++reiser4_key *jnode_build_key(const jnode * node, reiser4_key * key) ++{ ++ struct inode *inode; ++ item_plugin *iplug; ++ loff_t off; ++ ++ assert("nikita-3092", node != NULL); ++ assert("nikita-3093", key != NULL); ++ assert("nikita-3094", jnode_is_unformatted(node)); ++ ++ off = ((loff_t) index_jnode(node)) << PAGE_CACHE_SHIFT; ++ inode = mapping_jnode(node)->host; ++ ++ if (node->parent_item_id != 0) ++ iplug = item_plugin_by_id(node->parent_item_id); ++ else ++ iplug = NULL; ++ ++ if (iplug != NULL && iplug->f.key_by_offset) ++ iplug->f.key_by_offset(inode, off, key); ++ else { ++ file_plugin *fplug; ++ ++ fplug = inode_file_plugin(inode); ++ assert("zam-1007", fplug != NULL); ++ assert("zam-1008", fplug->key_by_inode != NULL); ++ ++ fplug->key_by_inode(inode, off, key); ++ } ++ ++ return key; ++} ++ ++/* ->parse() method for formatted nodes */ ++static int parse_znode(jnode * node) ++{ ++ return zparse(JZNODE(node)); ++} ++ ++/* ->delete() method for formatted nodes */ ++static void delete_znode(jnode * node, reiser4_tree * tree) ++{ ++ znode *z; ++ ++ assert_rw_write_locked(&(tree->tree_lock)); ++ assert("vs-898", JF_ISSET(node, JNODE_HEARD_BANSHEE)); ++ ++ z = JZNODE(node); ++ assert("vs-899", z->c_count == 0); ++ ++ /* delete znode from sibling list. */ ++ sibling_list_remove(z); ++ ++ znode_remove(z, tree); ++} ++ ++/* ->remove() method for formatted nodes */ ++static int remove_znode(jnode * node, reiser4_tree * tree) ++{ ++ znode *z; ++ ++ assert_rw_write_locked(&(tree->tree_lock)); ++ z = JZNODE(node); ++ ++ if (z->c_count == 0) { ++ /* detach znode from sibling list. */ ++ sibling_list_drop(z); ++ /* this is called with tree spin-lock held, so call ++ znode_remove() directly (rather than znode_lock_remove()). */ ++ znode_remove(z, tree); ++ return 0; ++ } ++ return RETERR(-EBUSY); ++} ++ ++/* ->init() method for formatted nodes */ ++static int init_znode(jnode * node) ++{ ++ znode *z; ++ ++ z = JZNODE(node); ++ /* call node plugin to do actual initialization */ ++ return z->nplug->init(z); ++} ++ ++/* ->clone() method for formatted nodes */ ++static jnode *clone_formatted(jnode * node) ++{ ++ znode *clone; ++ ++ assert("vs-1430", jnode_is_znode(node)); ++ clone = zalloc(get_gfp_mask()); ++ if (clone == NULL) ++ return ERR_PTR(RETERR(-ENOMEM)); ++ zinit(clone, NULL, current_tree); ++ jnode_set_block(ZJNODE(clone), jnode_get_block(node)); ++ /* ZJNODE(clone)->key.z is not initialized */ ++ clone->level = JZNODE(node)->level; ++ ++ return ZJNODE(clone); ++} ++ ++/* jplug->clone for unformatted nodes */ ++static jnode *clone_unformatted(jnode * node) ++{ ++ jnode *clone; ++ ++ assert("vs-1431", jnode_is_unformatted(node)); ++ clone = jalloc(); ++ if (clone == NULL) ++ return ERR_PTR(RETERR(-ENOMEM)); ++ ++ jnode_init(clone, current_tree, JNODE_UNFORMATTED_BLOCK); ++ jnode_set_block(clone, jnode_get_block(node)); ++ ++ return clone; ++ ++} ++ ++/* ++ * Setup jnode plugin methods for various jnode types. 
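++ *
++ * (Editorial note: dispatch goes through jnode_ops(); a call such as
++ *
++ *	jnode_ops(node)->mapping(node)
++ *
++ * resolves to mapping_jnode(), mapping_znode() or mapping_bitmap(),
++ * depending on jnode_get_type(node).)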
++ */ ++jnode_plugin jnode_plugins[LAST_JNODE_TYPE] = { ++ [JNODE_UNFORMATTED_BLOCK] = { ++ .h = { ++ .type_id = REISER4_JNODE_PLUGIN_TYPE, ++ .id = JNODE_UNFORMATTED_BLOCK, ++ .pops = NULL, ++ .label = "unformatted", ++ .desc = "unformatted node", ++ .linkage = {NULL, NULL} ++ }, ++ .init = init_noinit, ++ .parse = parse_noparse, ++ .mapping = mapping_jnode, ++ .index = index_jnode, ++ .clone = clone_unformatted ++ }, ++ [JNODE_FORMATTED_BLOCK] = { ++ .h = { ++ .type_id = REISER4_JNODE_PLUGIN_TYPE, ++ .id = JNODE_FORMATTED_BLOCK, ++ .pops = NULL, ++ .label = "formatted", ++ .desc = "formatted tree node", ++ .linkage = {NULL, NULL} ++ }, ++ .init = init_znode, ++ .parse = parse_znode, ++ .mapping = mapping_znode, ++ .index = index_znode, ++ .clone = clone_formatted ++ }, ++ [JNODE_BITMAP] = { ++ .h = { ++ .type_id = REISER4_JNODE_PLUGIN_TYPE, ++ .id = JNODE_BITMAP, ++ .pops = NULL, ++ .label = "bitmap", ++ .desc = "bitmap node", ++ .linkage = {NULL, NULL} ++ }, ++ .init = init_noinit, ++ .parse = parse_noparse, ++ .mapping = mapping_bitmap, ++ .index = index_is_address, ++ .clone = NULL ++ }, ++ [JNODE_IO_HEAD] = { ++ .h = { ++ .type_id = REISER4_JNODE_PLUGIN_TYPE, ++ .id = JNODE_IO_HEAD, ++ .pops = NULL, ++ .label = "io head", ++ .desc = "io head", ++ .linkage = {NULL, NULL} ++ }, ++ .init = init_noinit, ++ .parse = parse_noparse, ++ .mapping = mapping_bitmap, ++ .index = index_is_address, ++ .clone = NULL ++ }, ++ [JNODE_INODE] = { ++ .h = { ++ .type_id = REISER4_JNODE_PLUGIN_TYPE, ++ .id = JNODE_INODE, ++ .pops = NULL, ++ .label = "inode", ++ .desc = "inode's builtin jnode", ++ .linkage = {NULL, NULL} ++ }, ++ .init = NULL, ++ .parse = NULL, ++ .mapping = NULL, ++ .index = NULL, ++ .clone = NULL ++ } ++}; ++ ++/* ++ * jnode destruction. ++ * ++ * Thread may use a jnode after it acquired a reference to it. References are ++ * counted in ->x_count field. Reference protects jnode from being ++ * recycled. This is different from protecting jnode data (that are stored in ++ * jnode page) from being evicted from memory. Data are protected by jload() ++ * and released by jrelse(). ++ * ++ * If thread already possesses a reference to the jnode it can acquire another ++ * one through jref(). Initial reference is obtained (usually) by locating ++ * jnode in some indexing structure that depends on jnode type: formatted ++ * nodes are kept in global hash table, where they are indexed by block ++ * number, and also in the cbk cache. Unformatted jnodes are also kept in hash ++ * table, which is indexed by oid and offset within file, and in per-inode ++ * radix tree. ++ * ++ * Reference to jnode is released by jput(). If last reference is released, ++ * jput_final() is called. This function determines whether jnode has to be ++ * deleted (this happens when corresponding node is removed from the file ++ * system, jnode is marked with JNODE_HEARD_BANSHEE bit in this case), or it ++ * should be just "removed" (deleted from memory). ++ * ++ * Jnode destruction is signally delicate dance because of locking and RCU. ++ */ ++ ++/* ++ * Returns true if jnode cannot be removed right now. This check is called ++ * under tree lock. If it returns true, jnode is irrevocably committed to be ++ * deleted/removed. ++ */ ++static inline int jnode_is_busy(const jnode * node, jnode_type jtype) ++{ ++ /* if other thread managed to acquire a reference to this jnode, don't ++ * free it. 
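++	 * Callers therefore re-check ->x_count under the tree write lock
++	 * and, when this check fails, clear JNODE_RIP and keep the jnode
++	 * cached.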
*/ ++ if (atomic_read(&node->x_count) > 0) ++ return 1; ++ /* also, don't free znode that has children in memory */ ++ if (jtype == JNODE_FORMATTED_BLOCK && JZNODE(node)->c_count > 0) ++ return 1; ++ return 0; ++} ++ ++/* ++ * this is called as part of removing jnode. Based on jnode type, call ++ * corresponding function that removes jnode from indices and returns it back ++ * to the appropriate slab (through RCU). ++ */ ++static inline void ++jnode_remove(jnode * node, jnode_type jtype, reiser4_tree * tree) ++{ ++ switch (jtype) { ++ case JNODE_UNFORMATTED_BLOCK: ++ remove_jnode(node, tree); ++ break; ++ case JNODE_IO_HEAD: ++ case JNODE_BITMAP: ++ break; ++ case JNODE_INODE: ++ break; ++ case JNODE_FORMATTED_BLOCK: ++ remove_znode(node, tree); ++ break; ++ default: ++ wrong_return_value("nikita-3196", "Wrong jnode type"); ++ } ++} ++ ++/* ++ * this is called as part of deleting jnode. Based on jnode type, call ++ * corresponding function that removes jnode from indices and returns it back ++ * to the appropriate slab (through RCU). ++ * ++ * This differs from jnode_remove() only for formatted nodes---for them ++ * sibling list handling is different for removal and deletion. ++ */ ++static inline void ++jnode_delete(jnode * node, jnode_type jtype, reiser4_tree * tree UNUSED_ARG) ++{ ++ switch (jtype) { ++ case JNODE_UNFORMATTED_BLOCK: ++ remove_jnode(node, tree); ++ break; ++ case JNODE_IO_HEAD: ++ case JNODE_BITMAP: ++ break; ++ case JNODE_FORMATTED_BLOCK: ++ delete_znode(node, tree); ++ break; ++ case JNODE_INODE: ++ default: ++ wrong_return_value("nikita-3195", "Wrong jnode type"); ++ } ++} ++ ++#if REISER4_DEBUG ++/* ++ * remove jnode from the debugging list of all jnodes hanging off super-block. ++ */ ++void jnode_list_remove(jnode * node) ++{ ++ reiser4_super_info_data *sbinfo; ++ ++ sbinfo = get_super_private(jnode_get_tree(node)->super); ++ ++ spin_lock_irq(&sbinfo->all_guard); ++ assert("nikita-2422", !list_empty(&node->jnodes)); ++ list_del_init(&node->jnodes); ++ spin_unlock_irq(&sbinfo->all_guard); ++} ++#endif ++ ++/* ++ * this is called by jput_final() to remove jnode when last reference to it is ++ * released. ++ */ ++static int jnode_try_drop(jnode * node) ++{ ++ int result; ++ reiser4_tree *tree; ++ jnode_type jtype; ++ ++ assert("nikita-2491", node != NULL); ++ assert("nikita-2583", JF_ISSET(node, JNODE_RIP)); ++ ++ tree = jnode_get_tree(node); ++ jtype = jnode_get_type(node); ++ ++ spin_lock_jnode(node); ++ write_lock_tree(tree); ++ /* ++ * if jnode has a page---leave it alone. Memory pressure will ++ * eventually kill page and jnode. ++ */ ++ if (jnode_page(node) != NULL) { ++ write_unlock_tree(tree); ++ spin_unlock_jnode(node); ++ JF_CLR(node, JNODE_RIP); ++ return RETERR(-EBUSY); ++ } ++ ++ /* re-check ->x_count under tree lock. */ ++ result = jnode_is_busy(node, jtype); ++ if (result == 0) { ++ assert("nikita-2582", !JF_ISSET(node, JNODE_HEARD_BANSHEE)); ++ assert("jmacd-511/b", atomic_read(&node->d_count) == 0); ++ ++ spin_unlock_jnode(node); ++ /* no page and no references---despatch him. */ ++ jnode_remove(node, jtype, tree); ++ write_unlock_tree(tree); ++ jnode_free(node, jtype); ++ } else { ++ /* busy check failed: reference was acquired by concurrent ++ * thread. 
*/ ++ write_unlock_tree(tree); ++ spin_unlock_jnode(node); ++ JF_CLR(node, JNODE_RIP); ++ } ++ return result; ++} ++ ++/* jdelete() -- Delete jnode from the tree and file system */ ++static int jdelete(jnode * node /* jnode to finish with */ ) ++{ ++ struct page *page; ++ int result; ++ reiser4_tree *tree; ++ jnode_type jtype; ++ ++ assert("nikita-467", node != NULL); ++ assert("nikita-2531", JF_ISSET(node, JNODE_RIP)); ++ ++ jtype = jnode_get_type(node); ++ ++ page = jnode_lock_page(node); ++ assert_spin_locked(&(node->guard)); ++ ++ tree = jnode_get_tree(node); ++ ++ write_lock_tree(tree); ++ /* re-check ->x_count under tree lock. */ ++ result = jnode_is_busy(node, jtype); ++ if (likely(!result)) { ++ assert("nikita-2123", JF_ISSET(node, JNODE_HEARD_BANSHEE)); ++ assert("jmacd-511", atomic_read(&node->d_count) == 0); ++ ++ /* detach page */ ++ if (page != NULL) { ++ /* ++ * FIXME this is racy against jnode_extent_write(). ++ */ ++ page_clear_jnode(page, node); ++ } ++ spin_unlock_jnode(node); ++ /* goodbye */ ++ jnode_delete(node, jtype, tree); ++ write_unlock_tree(tree); ++ jnode_free(node, jtype); ++ /* @node is no longer valid pointer */ ++ if (page != NULL) ++ drop_page(page); ++ } else { ++ /* busy check failed: reference was acquired by concurrent ++ * thread. */ ++ JF_CLR(node, JNODE_RIP); ++ write_unlock_tree(tree); ++ spin_unlock_jnode(node); ++ if (page != NULL) ++ unlock_page(page); ++ } ++ return result; ++} ++ ++/* drop jnode on the floor. ++ ++ Return value: ++ ++ -EBUSY: failed to drop jnode, because there are still references to it ++ ++ 0: successfully dropped jnode ++ ++*/ ++static int jdrop_in_tree(jnode * node, reiser4_tree * tree) ++{ ++ struct page *page; ++ jnode_type jtype; ++ int result; ++ ++ assert("zam-602", node != NULL); ++ assert_rw_not_read_locked(&(tree->tree_lock)); ++ assert_rw_not_write_locked(&(tree->tree_lock)); ++ assert("nikita-2403", !JF_ISSET(node, JNODE_HEARD_BANSHEE)); ++ ++ jtype = jnode_get_type(node); ++ ++ page = jnode_lock_page(node); ++ assert_spin_locked(&(node->guard)); ++ ++ write_lock_tree(tree); ++ ++ /* re-check ->x_count under tree lock. */ ++ result = jnode_is_busy(node, jtype); ++ if (!result) { ++ assert("nikita-2488", page == jnode_page(node)); ++ assert("nikita-2533", atomic_read(&node->d_count) == 0); ++ if (page != NULL) { ++ assert("nikita-2126", !PageDirty(page)); ++ assert("nikita-2127", PageUptodate(page)); ++ assert("nikita-2181", PageLocked(page)); ++ page_clear_jnode(page, node); ++ } ++ spin_unlock_jnode(node); ++ jnode_remove(node, jtype, tree); ++ write_unlock_tree(tree); ++ jnode_free(node, jtype); ++ if (page != NULL) { ++ drop_page(page); ++ } ++ } else { ++ /* busy check failed: reference was acquired by concurrent ++ * thread. */ ++ JF_CLR(node, JNODE_RIP); ++ write_unlock_tree(tree); ++ spin_unlock_jnode(node); ++ if (page != NULL) ++ unlock_page(page); ++ } ++ return result; ++} ++ ++/* This function frees jnode "if possible". In particular, [dcx]_count has to ++ be 0 (where applicable). */ ++void jdrop(jnode * node) ++{ ++ jdrop_in_tree(node, jnode_get_tree(node)); ++} ++ ++/* IO head jnode implementation; The io heads are simple j-nodes with limited ++ functionality (these j-nodes are not in any hash table) just for reading ++ from and writing to disk. 
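++
++   (Editorial sketch, not part of the patch: a hypothetical user of an io
++   head would bracket its block I/O as
++
++	jnode *io = alloc_io_head(&blocknr);
++	if (io != NULL) {
++		... submit I/O against io and wait for it ...
++		drop_io_head(io);
++	}
++
++   since drop_io_head() undoes the jref() taken by alloc_io_head().)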
*/ ++ ++jnode *alloc_io_head(const reiser4_block_nr * block) ++{ ++ jnode *jal = jalloc(); ++ ++ if (jal != NULL) { ++ jnode_init(jal, current_tree, JNODE_IO_HEAD); ++ jnode_set_block(jal, block); ++ } ++ ++ jref(jal); ++ ++ return jal; ++} ++ ++void drop_io_head(jnode * node) ++{ ++ assert("zam-648", jnode_get_type(node) == JNODE_IO_HEAD); ++ ++ jput(node); ++ jdrop(node); ++} ++ ++/* protect keep jnode data from reiser4_releasepage() */ ++void pin_jnode_data(jnode * node) ++{ ++ assert("zam-671", jnode_page(node) != NULL); ++ page_cache_get(jnode_page(node)); ++} ++ ++/* make jnode data free-able again */ ++void unpin_jnode_data(jnode * node) ++{ ++ assert("zam-672", jnode_page(node) != NULL); ++ page_cache_release(jnode_page(node)); ++} ++ ++struct address_space *jnode_get_mapping(const jnode * node) ++{ ++ assert("nikita-3162", node != NULL); ++ return jnode_ops(node)->mapping(node); ++} ++ ++#if REISER4_DEBUG ++/* debugging aid: jnode invariant */ ++int jnode_invariant_f(const jnode * node, char const **msg) ++{ ++#define _ergo(ant, con) \ ++ ((*msg) = "{" #ant "} ergo {" #con "}", ergo((ant), (con))) ++#define _check(exp) ((*msg) = #exp, (exp)) ++ ++ return _check(node != NULL) && ++ /* [jnode-queued] */ ++ /* only relocated node can be queued, except that when znode ++ * is being deleted, its JNODE_RELOC bit is cleared */ ++ _ergo(JF_ISSET(node, JNODE_FLUSH_QUEUED), ++ JF_ISSET(node, JNODE_RELOC) || ++ JF_ISSET(node, JNODE_HEARD_BANSHEE)) && ++ _check(node->jnodes.prev != NULL) && ++ _check(node->jnodes.next != NULL) && ++ /* [jnode-dirty] invariant */ ++ /* dirty inode is part of atom */ ++ _ergo(JF_ISSET(node, JNODE_DIRTY), node->atom != NULL) && ++ /* [jnode-oid] invariant */ ++ /* for unformatted node ->objectid and ->mapping fields are ++ * consistent */ ++ _ergo(jnode_is_unformatted(node) && node->key.j.mapping != NULL, ++ node->key.j.objectid == ++ get_inode_oid(node->key.j.mapping->host)) && ++ /* [jnode-atom-valid] invariant */ ++ /* node atom has valid state */ ++ _ergo(node->atom != NULL, node->atom->stage != ASTAGE_INVALID) && ++ /* [jnode-page-binding] invariant */ ++ /* if node points to page, it points back to node */ ++ _ergo(node->pg != NULL, jprivate(node->pg) == node) && ++ /* [jnode-refs] invariant */ ++ /* only referenced jnode can be loaded */ ++ _check(atomic_read(&node->x_count) >= atomic_read(&node->d_count)); ++ ++} ++ ++static const char *jnode_type_name(jnode_type type) ++{ ++ switch (type) { ++ case JNODE_UNFORMATTED_BLOCK: ++ return "unformatted"; ++ case JNODE_FORMATTED_BLOCK: ++ return "formatted"; ++ case JNODE_BITMAP: ++ return "bitmap"; ++ case JNODE_IO_HEAD: ++ return "io head"; ++ case JNODE_INODE: ++ return "inode"; ++ case LAST_JNODE_TYPE: ++ return "last"; ++ default:{ ++ static char unknown[30]; ++ ++ sprintf(unknown, "unknown %i", type); ++ return unknown; ++ } ++ } ++} ++ ++#define jnode_state_name( node, flag ) \ ++ ( JF_ISSET( ( node ), ( flag ) ) ? 
((#flag "|")+6) : "" ) ++ ++/* debugging aid: output human readable information about @node */ ++static void info_jnode(const char *prefix /* prefix to print */ , ++ const jnode * node /* node to print */ ) ++{ ++ assert("umka-068", prefix != NULL); ++ ++ if (node == NULL) { ++ printk("%s: null\n", prefix); ++ return; ++ } ++ ++ printk ++ ("%s: %p: state: %lx: [%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s], level: %i," ++ " block: %s, d_count: %d, x_count: %d, " ++ "pg: %p, atom: %p, lock: %i:%i, type: %s, ", prefix, node, ++ node->state, ++ jnode_state_name(node, JNODE_PARSED), ++ jnode_state_name(node, JNODE_HEARD_BANSHEE), ++ jnode_state_name(node, JNODE_LEFT_CONNECTED), ++ jnode_state_name(node, JNODE_RIGHT_CONNECTED), ++ jnode_state_name(node, JNODE_ORPHAN), ++ jnode_state_name(node, JNODE_CREATED), ++ jnode_state_name(node, JNODE_RELOC), ++ jnode_state_name(node, JNODE_OVRWR), ++ jnode_state_name(node, JNODE_DIRTY), ++ jnode_state_name(node, JNODE_IS_DYING), ++ jnode_state_name(node, JNODE_RIP), ++ jnode_state_name(node, JNODE_MISSED_IN_CAPTURE), ++ jnode_state_name(node, JNODE_WRITEBACK), ++ jnode_state_name(node, JNODE_NEW), ++ jnode_state_name(node, JNODE_DKSET), ++ jnode_state_name(node, JNODE_REPACK), ++ jnode_state_name(node, JNODE_CLUSTER_PAGE), ++ jnode_get_level(node), sprint_address(jnode_get_block(node)), ++ atomic_read(&node->d_count), atomic_read(&node->x_count), ++ jnode_page(node), node->atom, 0, 0, ++ jnode_type_name(jnode_get_type(node))); ++ if (jnode_is_unformatted(node)) { ++ printk("inode: %llu, index: %lu, ", ++ node->key.j.objectid, node->key.j.index); ++ } ++} ++ ++/* debugging aid: check znode invariant and panic if it doesn't hold */ ++static int jnode_invariant(const jnode * node, int tlocked, int jlocked) ++{ ++ char const *failed_msg; ++ int result; ++ reiser4_tree *tree; ++ ++ tree = jnode_get_tree(node); ++ ++ assert("umka-063312", node != NULL); ++ assert("umka-064321", tree != NULL); ++ ++ if (!jlocked && !tlocked) ++ spin_lock_jnode((jnode *) node); ++ if (!tlocked) ++ read_lock_tree(jnode_get_tree(node)); ++ result = jnode_invariant_f(node, &failed_msg); ++ if (!result) { ++ info_jnode("corrupted node", node); ++ warning("jmacd-555", "Condition %s failed", failed_msg); ++ } ++ if (!tlocked) ++ read_unlock_tree(jnode_get_tree(node)); ++ if (!jlocked && !tlocked) ++ spin_unlock_jnode((jnode *) node); ++ return result; ++} ++ ++#endif /* REISER4_DEBUG */ ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 80 ++ End: ++*/ +Index: linux-2.6.16/fs/reiser4/jnode.h +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/jnode.h +@@ -0,0 +1,711 @@ ++/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++/* Declaration of jnode. See jnode.c for details. 
*/
++
++#ifndef __JNODE_H__
++#define __JNODE_H__
++
++#include "forward.h"
++#include "type_safe_hash.h"
++#include "txnmgr.h"
++#include "key.h"
++#include "debug.h"
++#include "dformat.h"
++#include "context.h"
++
++#include "plugin/plugin.h"
++
++#include <linux/fs.h>
++#include <linux/mm.h>
++#include <linux/spinlock.h>
++#include <asm/atomic.h>
++#include <asm/bitops.h>
++#include <linux/list.h>
++#include <linux/rcupdate.h>
++
++/* declare hash table of jnodes (jnodes proper, that is, unformatted
++   nodes) */
++TYPE_SAFE_HASH_DECLARE(j, jnode);
++
++/* declare hash table of znodes */
++TYPE_SAFE_HASH_DECLARE(z, znode);
++
++typedef struct {
++	__u64 objectid;
++	unsigned long index;
++	struct address_space *mapping;
++} jnode_key_t;
++
++/*
++   Jnode is the "base class" of other nodes in reiser4. It also happens to
++   be exactly the node we use for unformatted tree nodes.
++
++   Jnode provides the following basic functionality:
++
++   . reference counting and indexing.
++
++   . integration with page cache. Jnode has ->pg reference to which page can
++   be attached.
++
++   . interface to transaction manager. It is jnode that is kept in transaction
++   manager lists, attached to atoms, etc. (NOTE-NIKITA one may argue that this
++   means, there should be special type of jnode for inode.)
++
++   Locking:
++
++   Spin lock: the following fields are protected by the per-jnode spin lock:
++
++    ->state
++    ->atom
++    ->capture_link
++
++   Following fields are protected by the global tree lock:
++
++    ->link
++    ->key.z (content of ->key.z is only changed in znode_rehash())
++    ->key.j
++
++   Atomic counters
++
++    ->x_count
++    ->d_count
++
++    ->pg, and ->data are protected by spin lock for unused jnode and are
++    immutable for used jnode (one for which fs/reiser4/vfs_ops.c:releasable()
++    is false).
++
++    ->tree is immutable after creation
++
++   Unclear
++
++    ->blocknr: should be under jnode spin-lock, but current interface is based
++    on passing of block address.
++
++   If you ever need to spin lock two nodes at once, do this in "natural"
++   memory order: lock znode with lower address first. (See lock_two_nodes().)
++
++   Invariants involving this data-type:
++
++      [jnode-dirty]
++      [jnode-refs]
++      [jnode-oid]
++      [jnode-queued]
++      [jnode-atom-valid]
++      [jnode-page-binding]
++*/
++
++struct jnode {
++#if REISER4_DEBUG
++#define JMAGIC 0x52654973	/* "ReIs" */
++	int magic;
++#endif
++	/* FIRST CACHE LINE (16 bytes): data used by jload */
++
++	/* jnode's state: bitwise flags from the reiser4_jnode_state enum. */
++	/* 0 */ unsigned long state;
++
++	/* lock, protecting jnode's fields. */
++	/* 4 */ spinlock_t load;
++
++	/* counter of references to jnode itself. Increased on jref().
++	   Decreased on jput().
++	 */
++	/* 8 */ atomic_t x_count;
++
++	/* counter of references to jnode's data. Pin data page(s) in
++	   memory while this is greater than 0. Increased on jload().
++	   Decreased on jrelse().
++	 */
++	/* 12 */ atomic_t d_count;
++
++	/* SECOND CACHE LINE: data used by hash table lookups */
++
++	/* 16 */ union {
++		/* znodes are hashed by block number */
++		reiser4_block_nr z;
++		/* unformatted nodes are hashed by mapping plus offset */
++		jnode_key_t j;
++	} key;
++
++	/* THIRD CACHE LINE */
++
++	/* 32 */ union {
++		/* pointers to maintain hash-table */
++		z_hash_link z;
++		j_hash_link j;
++	} link;
++
++	/* pointer to jnode page. */
++	/* 36 */ struct page *pg;
++	/* pointer to node itself.
This is page_address(node->pg) when page is ++ attached to the jnode ++ */ ++ /* 40 */ void *data; ++ ++ /* 44 */ reiser4_tree *tree; ++ ++ /* FOURTH CACHE LINE: atom related fields */ ++ ++ /* 48 */ spinlock_t guard; ++ ++ /* atom the block is in, if any */ ++ /* 52 */ txn_atom *atom; ++ ++ /* capture list */ ++ /* 56 */ struct list_head capture_link; ++ ++ /* FIFTH CACHE LINE */ ++ ++ /* 64 */ struct rcu_head rcu; ++ /* crosses cache line */ ++ ++ /* SIXTH CACHE LINE */ ++ ++ /* the real blocknr (where io is going to/from) */ ++ /* 80 */ reiser4_block_nr blocknr; ++ /* Parent item type, unformatted and CRC need it for offset => key conversion. */ ++ /* NOTE: this parent_item_id looks like jnode type. */ ++ /* 88 */ reiser4_plugin_id parent_item_id; ++ /* 92 */ ++#if REISER4_DEBUG ++ /* number of pages referenced by the jnode (meaningful while capturing of ++ page clusters) */ ++ int page_count; ++ /* list of all jnodes for debugging purposes. */ ++ struct list_head jnodes; ++ /* how many times this jnode was written in one transaction */ ++ int written; ++ /* this indicates which atom's list the jnode is on */ ++ atom_list list; ++#endif ++} __attribute__ ((aligned(16))); ++ ++/* ++ * jnode types. Enumeration of existing jnode types. ++ */ ++typedef enum { ++ JNODE_UNFORMATTED_BLOCK, /* unformatted block */ ++ JNODE_FORMATTED_BLOCK, /* formatted block, znode */ ++ JNODE_BITMAP, /* bitmap */ ++ JNODE_IO_HEAD, /* jnode representing a block in the ++ * wandering log */ ++ JNODE_INODE, /* jnode embedded into inode */ ++ LAST_JNODE_TYPE ++} jnode_type; ++ ++/* jnode states */ ++typedef enum { ++ /* jnode's page is loaded and data checked */ ++ JNODE_PARSED = 0, ++ /* node was deleted, not all locks on it were released. This ++ node is empty and is going to be removed from the tree ++ shortly. */ ++ JNODE_HEARD_BANSHEE = 1, ++ /* left sibling pointer is valid */ ++ JNODE_LEFT_CONNECTED = 2, ++ /* right sibling pointer is valid */ ++ JNODE_RIGHT_CONNECTED = 3, ++ ++ /* znode was just created and doesn't yet have a pointer from ++ its parent */ ++ JNODE_ORPHAN = 4, ++ ++ /* this node was created by its transaction and has not been assigned ++ a block address. */ ++ JNODE_CREATED = 5, ++ ++ /* this node is currently relocated */ ++ JNODE_RELOC = 6, ++ /* this node is currently wandered */ ++ JNODE_OVRWR = 7, ++ ++ /* this znode has been modified */ ++ JNODE_DIRTY = 8, ++ ++ /* znode lock is being invalidated */ ++ JNODE_IS_DYING = 9, ++ ++ /* THIS PLACE IS INTENTIONALLY LEFT BLANK */ ++ ++ /* jnode is queued for flushing. */ ++ JNODE_FLUSH_QUEUED = 12, ++ ++ /* In the following bits jnode type is encoded. */ ++ JNODE_TYPE_1 = 13, ++ JNODE_TYPE_2 = 14, ++ JNODE_TYPE_3 = 15, ++ ++ /* jnode is being destroyed */ ++ JNODE_RIP = 16, ++ ++ /* znode was not captured during locking (it might so be because ++ ->level != LEAF_LEVEL and lock_mode == READ_LOCK) */ ++ JNODE_MISSED_IN_CAPTURE = 17, ++ ++ /* write is in progress */ ++ JNODE_WRITEBACK = 18, ++ ++ /* FIXME: now it is used by crypto-compress plugin only */ ++ JNODE_NEW = 19, ++ ++ /* delimiting keys are already set for this znode. 
*/ ++ JNODE_DKSET = 20, ++ ++ /* when this bit is set page and jnode can not be disconnected */ ++ JNODE_WRITE_PREPARED = 21, ++ ++ JNODE_CLUSTER_PAGE = 22, ++ /* Jnode is marked for repacking, that means the reiser4 flush and the ++ * block allocator should process this node special way */ ++ JNODE_REPACK = 23, ++ /* node should be converted by flush in squalloc phase */ ++ JNODE_CONVERTIBLE = 24, ++ /* ++ * When jnode is dirtied for the first time in given transaction, ++ * do_jnode_make_dirty() checks whether this jnode can possible became ++ * member of overwrite set. If so, this bit is set, and one block is ++ * reserved in the ->flush_reserved space of atom. ++ * ++ * This block is "used" (and JNODE_FLUSH_RESERVED bit is cleared) when ++ * ++ * (1) flush decides that we want this block to go into relocate ++ * set after all. ++ * ++ * (2) wandering log is allocated (by log writer) ++ * ++ * (3) extent is allocated ++ * ++ */ ++ JNODE_FLUSH_RESERVED = 29 ++} reiser4_jnode_state; ++ ++/* Macros for accessing the jnode state. */ ++ ++static inline void JF_CLR(jnode * j, int f) ++{ ++ assert("unknown-1", j->magic == JMAGIC); ++ clear_bit(f, &j->state); ++} ++static inline int JF_ISSET(const jnode * j, int f) ++{ ++ assert("unknown-2", j->magic == JMAGIC); ++ return test_bit(f, &((jnode *) j)->state); ++} ++static inline void JF_SET(jnode * j, int f) ++{ ++ assert("unknown-3", j->magic == JMAGIC); ++ set_bit(f, &j->state); ++} ++ ++static inline int JF_TEST_AND_SET(jnode * j, int f) ++{ ++ assert("unknown-4", j->magic == JMAGIC); ++ return test_and_set_bit(f, &j->state); ++} ++ ++static inline void spin_lock_jnode(jnode *node) ++{ ++ /* check that spinlocks of lower priorities are not held */ ++ assert("", (LOCK_CNT_NIL(rw_locked_tree) && ++ LOCK_CNT_NIL(spin_locked_txnh) && ++ LOCK_CNT_NIL(spin_locked_zlock) && ++ LOCK_CNT_NIL(rw_locked_dk) && ++ LOCK_CNT_LT(spin_locked_jnode, 2))); ++ ++ spin_lock(&(node->guard)); ++ ++ LOCK_CNT_INC(spin_locked_jnode); ++ LOCK_CNT_INC(spin_locked); ++} ++ ++static inline void spin_unlock_jnode(jnode *node) ++{ ++ assert_spin_locked(&(node->guard)); ++ assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_jnode)); ++ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked)); ++ ++ LOCK_CNT_DEC(spin_locked_jnode); ++ LOCK_CNT_DEC(spin_locked); ++ ++ spin_unlock(&(node->guard)); ++} ++ ++static inline int jnode_is_in_deleteset(const jnode * node) ++{ ++ return JF_ISSET(node, JNODE_RELOC); ++} ++ ++extern int init_jnodes(void); ++extern void done_jnodes(void); ++ ++/* Jnode routines */ ++extern jnode *jalloc(void); ++extern void jfree(jnode * node) NONNULL; ++extern jnode *jclone(jnode *); ++extern jnode *jlookup(reiser4_tree * tree, ++ oid_t objectid, unsigned long ind) NONNULL; ++extern jnode *jfind(struct address_space *, unsigned long index) NONNULL; ++extern jnode *jnode_by_page(struct page *pg) NONNULL; ++extern jnode *jnode_of_page(struct page *pg) NONNULL; ++void jnode_attach_page(jnode * node, struct page *pg); ++jnode *find_get_jnode(reiser4_tree * tree, ++ struct address_space *mapping, oid_t oid, ++ unsigned long index); ++ ++void unhash_unformatted_jnode(jnode *); ++struct page *jnode_get_page_locked(jnode *, gfp_t gfp_flags); ++extern jnode *page_next_jnode(jnode * node) NONNULL; ++extern void jnode_init(jnode * node, reiser4_tree * tree, jnode_type) NONNULL; ++extern void jnode_make_dirty(jnode * node) NONNULL; ++extern void jnode_make_clean(jnode * node) NONNULL; ++extern void jnode_make_wander_nolock(jnode * node) NONNULL; ++extern void 
jnode_make_wander(jnode *) NONNULL; ++extern void znode_make_reloc(znode *, flush_queue_t *) NONNULL; ++extern void unformatted_make_reloc(jnode *, flush_queue_t *) NONNULL; ++extern struct address_space *jnode_get_mapping(const jnode * node) NONNULL; ++ ++/** ++ * jnode_get_block ++ * @node: jnode to query ++ * ++ */ ++static inline const reiser4_block_nr *jnode_get_block(const jnode *node) ++{ ++ assert("nikita-528", node != NULL); ++ ++ return &node->blocknr; ++} ++ ++/** ++ * jnode_set_block ++ * @node: jnode to update ++ * @blocknr: new block nr ++ */ ++static inline void jnode_set_block(jnode *node, const reiser4_block_nr *blocknr) ++{ ++ assert("nikita-2020", node != NULL); ++ assert("umka-055", blocknr != NULL); ++ node->blocknr = *blocknr; ++} ++ ++ ++/* block number for IO. Usually this is the same as jnode_get_block(), unless ++ * jnode was emergency flushed---then block number chosen by eflush is ++ * used. */ ++static inline const reiser4_block_nr *jnode_get_io_block(jnode * node) ++{ ++ assert("nikita-2768", node != NULL); ++ assert_spin_locked(&(node->guard)); ++ ++ return jnode_get_block(node); ++} ++ ++/* Jnode flush interface. */ ++extern reiser4_blocknr_hint *pos_hint(flush_pos_t * pos); ++extern flush_queue_t *pos_fq(flush_pos_t * pos); ++ ++/* FIXME-VS: these are used in plugin/item/extent.c */ ++ ++/* does extent_get_block have to be called */ ++#define jnode_mapped(node) JF_ISSET (node, JNODE_MAPPED) ++#define jnode_set_mapped(node) JF_SET (node, JNODE_MAPPED) ++ ++/* the node should be converted during flush squalloc phase */ ++#define jnode_convertible(node) JF_ISSET (node, JNODE_CONVERTIBLE) ++#define jnode_set_convertible(node) JF_SET (node, JNODE_CONVERTIBLE) ++ ++/* Macros to convert from jnode to znode, znode to jnode. These are macros ++ because C doesn't allow overloading of const prototypes. 
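++
++   (Editorial note: for a znode *z, ZJNODE(z) yields the embedded jnode,
++   &z->zjnode, while JZNODE(ZJNODE(z)) gets z back, asserting along the
++   way that the jnode really is of znode type.)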
*/ ++#define ZJNODE(x) (& (x) -> zjnode) ++#define JZNODE(x) \ ++({ \ ++ typeof (x) __tmp_x; \ ++ \ ++ __tmp_x = (x); \ ++ assert ("jmacd-1300", jnode_is_znode (__tmp_x)); \ ++ (znode*) __tmp_x; \ ++}) ++ ++extern int jnodes_tree_init(reiser4_tree * tree); ++extern int jnodes_tree_done(reiser4_tree * tree); ++ ++#if REISER4_DEBUG ++ ++extern int znode_is_any_locked(const znode * node); ++extern void jnode_list_remove(jnode * node); ++ ++#else ++ ++#define jnode_list_remove(node) noop ++ ++#endif ++ ++int znode_is_root(const znode * node) NONNULL; ++ ++/* bump reference counter on @node */ ++static inline void add_x_ref(jnode * node /* node to increase x_count of */ ) ++{ ++ assert("nikita-1911", node != NULL); ++ ++ atomic_inc(&node->x_count); ++ LOCK_CNT_INC(x_refs); ++} ++ ++static inline void dec_x_ref(jnode * node) ++{ ++ assert("nikita-3215", node != NULL); ++ assert("nikita-3216", atomic_read(&node->x_count) > 0); ++ ++ atomic_dec(&node->x_count); ++ assert("nikita-3217", LOCK_CNT_GTZ(x_refs)); ++ LOCK_CNT_DEC(x_refs); ++} ++ ++/* jref() - increase counter of references to jnode/znode (x_count) */ ++static inline jnode *jref(jnode * node) ++{ ++ assert("jmacd-508", (node != NULL) && !IS_ERR(node)); ++ add_x_ref(node); ++ return node; ++} ++ ++/* get the page of jnode */ ++static inline struct page *jnode_page(const jnode * node) ++{ ++ return node->pg; ++} ++ ++/* return pointer to jnode data */ ++static inline char *jdata(const jnode * node) ++{ ++ assert("nikita-1415", node != NULL); ++ assert("nikita-3198", jnode_page(node) != NULL); ++ return node->data; ++} ++ ++static inline int jnode_is_loaded(const jnode * node) ++{ ++ assert("zam-506", node != NULL); ++ return atomic_read(&node->d_count) > 0; ++} ++ ++extern void page_detach_jnode(struct page *page, ++ struct address_space *mapping, ++ unsigned long index) NONNULL; ++extern void page_clear_jnode(struct page *page, jnode * node) NONNULL; ++ ++static inline void jnode_set_reloc(jnode * node) ++{ ++ assert("nikita-2431", node != NULL); ++ assert("nikita-2432", !JF_ISSET(node, JNODE_OVRWR)); ++ JF_SET(node, JNODE_RELOC); ++} ++ ++/* jload/jwrite/junload give a bread/bwrite/brelse functionality for jnodes */ ++ ++extern int jload_gfp(jnode *, gfp_t, int do_kmap) NONNULL; ++ ++static inline int jload(jnode *node) ++{ ++ return jload_gfp(node, get_gfp_mask(), 1); ++} ++ ++extern int jinit_new(jnode *, gfp_t) NONNULL; ++extern int jstartio(jnode *) NONNULL; ++ ++extern void jdrop(jnode *) NONNULL; ++extern int jwait_io(jnode *, int rw) NONNULL; ++ ++void jload_prefetch(jnode *); ++ ++extern jnode *alloc_io_head(const reiser4_block_nr * block) NONNULL; ++extern void drop_io_head(jnode * node) NONNULL; ++ ++static inline reiser4_tree *jnode_get_tree(const jnode * node) ++{ ++ assert("nikita-2691", node != NULL); ++ return node->tree; ++} ++ ++extern void pin_jnode_data(jnode *); ++extern void unpin_jnode_data(jnode *); ++ ++static inline jnode_type jnode_get_type(const jnode * node) ++{ ++ static const unsigned long state_mask = ++ (1 << JNODE_TYPE_1) | (1 << JNODE_TYPE_2) | (1 << JNODE_TYPE_3); ++ ++ static jnode_type mask_to_type[] = { ++ /* JNODE_TYPE_3 : JNODE_TYPE_2 : JNODE_TYPE_1 */ ++ ++ /* 000 */ ++ [0] = JNODE_FORMATTED_BLOCK, ++ /* 001 */ ++ [1] = JNODE_UNFORMATTED_BLOCK, ++ /* 010 */ ++ [2] = JNODE_BITMAP, ++ /* 011 */ ++ [3] = LAST_JNODE_TYPE, /*invalid */ ++ /* 100 */ ++ [4] = JNODE_INODE, ++ /* 101 */ ++ [5] = LAST_JNODE_TYPE, ++ /* 110 */ ++ [6] = JNODE_IO_HEAD, ++ /* 111 */ ++ [7] = LAST_JNODE_TYPE, /* invalid */ ++ }; 
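++	/* extract the three type bits and decode them through the
++	 * table above */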
++	return mask_to_type[(node->state & state_mask) >> JNODE_TYPE_1];
++}
++
++/* returns true if node is a znode */
++static inline int jnode_is_znode(const jnode * node)
++{
++	return jnode_get_type(node) == JNODE_FORMATTED_BLOCK;
++}
++
++static inline int jnode_is_flushprepped(jnode * node)
++{
++	assert("jmacd-78212", node != NULL);
++	assert_spin_locked(&(node->guard));
++	return !JF_ISSET(node, JNODE_DIRTY) || JF_ISSET(node, JNODE_RELOC) ||
++		JF_ISSET(node, JNODE_OVRWR);
++}
++
++/* Return true if @node has already been processed by the squeeze and allocate
++   process. This implies the block address has been finalized for the
++   duration of this atom (or it is clean and will remain in place). If this
++   returns true you may use the block number as a hint. */
++static inline int jnode_check_flushprepped(jnode * node)
++{
++	int result;
++
++	/* It must be clean or relocated or wandered. New allocations are set to relocate. */
++	spin_lock_jnode(node);
++	result = jnode_is_flushprepped(node);
++	spin_unlock_jnode(node);
++	return result;
++}
++
++/* returns true if node is unformatted */
++static inline int jnode_is_unformatted(const jnode * node)
++{
++	assert("jmacd-0123", node != NULL);
++	return jnode_get_type(node) == JNODE_UNFORMATTED_BLOCK;
++}
++
++/* returns true if node represents a cluster cache page */
++static inline int jnode_is_cluster_page(const jnode * node)
++{
++	assert("edward-50", node != NULL);
++	return (JF_ISSET(node, JNODE_CLUSTER_PAGE));
++}
++
++/* returns true if node is the inode's built-in jnode */
++static inline int jnode_is_inode(const jnode * node)
++{
++	assert("vs-1240", node != NULL);
++	return jnode_get_type(node) == JNODE_INODE;
++}
++
++static inline jnode_plugin *jnode_ops_of(const jnode_type type)
++{
++	assert("nikita-2367", type < LAST_JNODE_TYPE);
++	return jnode_plugin_by_id((reiser4_plugin_id) type);
++}
++
++static inline jnode_plugin *jnode_ops(const jnode * node)
++{
++	assert("nikita-2366", node != NULL);
++
++	return jnode_ops_of(jnode_get_type(node));
++}
++
++/* Get the index of a block. */
++static inline unsigned long jnode_get_index(jnode * node)
++{
++	return jnode_ops(node)->index(node);
++}
++
++/* return true if "node" is the root */
++static inline int jnode_is_root(const jnode * node)
++{
++	return jnode_is_znode(node) && znode_is_root(JZNODE(node));
++}
++
++extern struct address_space *mapping_jnode(const jnode * node);
++extern unsigned long index_jnode(const jnode * node);
++
++static inline void jput(jnode * node);
++extern void jput_final(jnode * node);
++
++/* bump data counter on @node */
++static inline void add_d_ref(jnode * node /* node to increase d_count of */ )
++{
++	assert("nikita-1962", node != NULL);
++
++	atomic_inc(&node->d_count);
++	if (jnode_is_unformatted(node) || jnode_is_znode(node))
++		LOCK_CNT_INC(d_refs);
++}
++
++/* jput() - decrement x_count reference counter on jnode/znode.
++
++   Count may drop to 0, jnode stays in cache until memory pressure causes the
++   eviction of its page. The c_count variable also ensures that children are
++   pressured out of memory before the parent. The jnode remains hashed as
++   long as the VM allows its page to stay in memory.
++*/
++static inline void jput(jnode * node)
++{
++	assert("jmacd-509", node != NULL);
++	assert("jmacd-510", atomic_read(&node->x_count) > 0);
++	assert("zam-926", schedulable());
++	LOCK_CNT_DEC(x_refs);
++
++	rcu_read_lock();
++	/*
++	 * we don't need any kind of lock here--jput_final() uses RCU.
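++	 * rcu_read_unlock() is then jput_final()'s responsibility on
++	 * every path (see jput_final() in jnode.c).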
++ */ ++ if (unlikely(atomic_dec_and_test(&node->x_count))) { ++ jput_final(node); ++ } else ++ rcu_read_unlock(); ++ assert("nikita-3473", schedulable()); ++} ++ ++extern void jrelse(jnode * node); ++extern void jrelse_tail(jnode * node); ++ ++extern jnode *jnode_rip_sync(reiser4_tree * t, jnode * node); ++ ++/* resolve race with jput */ ++static inline jnode *jnode_rip_check(reiser4_tree * tree, jnode * node) ++{ ++ if (unlikely(JF_ISSET(node, JNODE_RIP))) ++ node = jnode_rip_sync(tree, node); ++ return node; ++} ++ ++extern reiser4_key *jnode_build_key(const jnode *node, reiser4_key * key); ++ ++#if REISER4_DEBUG ++extern int jnode_invariant_f(const jnode *node, char const **msg); ++#endif ++ ++extern jnode_plugin jnode_plugins[LAST_JNODE_TYPE]; ++ ++/* __JNODE_H__ */ ++#endif ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +Index: linux-2.6.16/fs/reiser4/kassign.c +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/kassign.c +@@ -0,0 +1,659 @@ ++/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++/* Key assignment policy implementation */ ++ ++/* ++ * In reiser4 every piece of file system data and meta-data has a key. Keys ++ * are used to store information in and retrieve it from reiser4 internal ++ * tree. In addition to this, keys define _ordering_ of all file system ++ * information: things having close keys are placed into the same or ++ * neighboring (in the tree order) nodes of the tree. As our block allocator ++ * tries to respect tree order (see flush.c), keys also define order in which ++ * things are laid out on the disk, and hence, affect performance directly. ++ * ++ * Obviously, assignment of keys to data and meta-data should be consistent ++ * across whole file system. Algorithm that calculates a key for a given piece ++ * of data or meta-data is referred to as "key assignment". ++ * ++ * Key assignment is too expensive to be implemented as a plugin (that is, ++ * with an ability to support different key assignment schemas in the same ++ * compiled kernel image). As a compromise, all key-assignment functions and ++ * data-structures are collected in this single file, so that modifications to ++ * key assignment algorithm can be localized. Additional changes may be ++ * required in key.[ch]. ++ * ++ * Current default reiser4 key assignment algorithm is dubbed "Plan A". As one ++ * may guess, there is "Plan B" too. ++ * ++ */ ++ ++/* ++ * Additional complication with key assignment implementation is a requirement ++ * to support different key length. ++ */ ++ ++/* ++ * KEY ASSIGNMENT: PLAN A, LONG KEYS. ++ * ++ * DIRECTORY ITEMS ++ * ++ * | 60 | 4 | 7 |1| 56 | 64 | 64 | ++ * +--------------+---+---+-+-------------+------------------+-----------------+ ++ * | dirid | 0 | F |H| prefix-1 | prefix-2 | prefix-3/hash | ++ * +--------------+---+---+-+-------------+------------------+-----------------+ ++ * | | | | | ++ * | 8 bytes | 8 bytes | 8 bytes | 8 bytes | ++ * ++ * dirid objectid of directory this item is for ++ * ++ * F fibration, see fs/reiser4/plugin/fibration.[ch] ++ * ++ * H 1 if last 8 bytes of the key contain hash, ++ * 0 if last 8 bytes of the key contain prefix-3 ++ * ++ * prefix-1 first 7 characters of file name. ++ * Padded by zeroes if name is not long enough. ++ * ++ * prefix-2 next 8 characters of the file name. 
++ *
++ * prefix-3	next 8 characters of the file name.
++ *
++ * hash	hash of the rest of file name (i.e., portion of file
++ *		name not included into prefix-1 and prefix-2).
++ *
++ * File names shorter than 23 (== 7 + 8 + 8) characters are completely encoded
++ * in the key. Such file names are called "short". They are distinguished by H
++ * bit set to 0 in the key.
++ *
++ * Other file names are "long". For long name, H bit is 1, and first 15 (== 7
++ * + 8) characters are encoded in prefix-1 and prefix-2 portions of the
++ * key. Last 8 bytes of the key are occupied by hash of the remaining
++ * characters of the name.
++ *
++ * This key assignment reaches the following important goals:
++ *
++ *     (1) directory entries are sorted in approximately lexicographical
++ *     order.
++ *
++ *     (2) collisions (when multiple directory items have the same key), while
++ *     principally unavoidable in a tree with fixed length keys, are rare.
++ *
++ * STAT DATA
++ *
++ *	|      60      | 4 |        64       | 4 |      60      |        64       |
++ *	+--------------+---+-----------------+---+--------------+-----------------+
++ *	|  locality id | 1 |     ordering    | 0 |   objectid   |        0        |
++ *	+--------------+---+-----------------+---+--------------+-----------------+
++ *	|                  |                 |                  |                 |
++ *	|      8 bytes     |     8 bytes     |      8 bytes     |     8 bytes     |
++ *
++ * locality id	object id of a directory where first name was created for
++ *		the object
++ *
++ * ordering	copy of second 8-byte portion of the key of directory
++ *		entry for the first name of this object. Ordering has a form
++ *			{
++ *				fibration :7;
++ *				h         :1;
++ *				prefix1   :56;
++ *			}
++ *		see description of key for directory entry above.
++ *
++ * objectid	object id for this object
++ *
++ * This key assignment policy is designed to keep stat-data in the same order
++ * as corresponding directory items, thus speeding up readdir/stat types of
++ * workload.
++ *
++ * FILE BODY
++ *
++ *	|      60      | 4 |        64       | 4 |      60      |        64       |
++ *	+--------------+---+-----------------+---+--------------+-----------------+
++ *	|  locality id | 4 |     ordering    | 0 |   objectid   |      offset     |
++ *	+--------------+---+-----------------+---+--------------+-----------------+
++ *	|                  |                 |                  |                 |
++ *	|      8 bytes     |     8 bytes     |      8 bytes     |     8 bytes     |
++ *
++ * locality id	object id of a directory where first name was created for
++ *		the object
++ *
++ * ordering	the same as in the key of stat-data for this object
++ *
++ * objectid	object id for this object
++ *
++ * offset	logical offset from the beginning of this file.
++ *		Measured in bytes.
++ *
++ *
++ * KEY ASSIGNMENT: PLAN A, SHORT KEYS.
++ *
++ * DIRECTORY ITEMS
++ *
++ *	|      60      | 4 | 7 |1|      56     |        64       |
++ *	+--------------+---+---+-+-------------+-----------------+
++ *	|    dirid     | 0 | F |H|   prefix-1  |  prefix-2/hash  |
++ *	+--------------+---+---+-+-------------+-----------------+
++ *	|                  |                   |                 |
++ *	|      8 bytes     |       8 bytes     |     8 bytes     |
++ *
++ * dirid	objectid of directory this item is for
++ *
++ * F		fibration, see fs/reiser4/plugin/fibration.[ch]
++ *
++ * H		1 if last 8 bytes of the key contain hash,
++ *		0 if last 8 bytes of the key contain prefix-2
++ *
++ * prefix-1	first 7 characters of file name.
++ *		Padded by zeroes if name is not long enough.
++ *
++ * prefix-2	next 8 characters of the file name.
++ *
++ * hash	hash of the rest of file name (i.e., portion of file
++ *		name not included into prefix-1).
++ *
++ * File names shorter than 15 (== 7 + 8) characters are completely encoded in
++ * the key. Such file names are called "short". They are distinguished by H
++ * bit set to 0 in the key.
++ *
++ * Other file names are "long". For long name, H bit is 1, and first 7
++ * characters are encoded in prefix-1 portion of the key. Last 8 bytes of the
++ * key are occupied by hash of the remaining characters of the name.
++ *
++ * STAT DATA
++ *
++ *	|      60      | 4 | 4 |      60      |        64       |
++ *	+--------------+---+---+--------------+-----------------+
++ *	|  locality id | 1 | 0 |   objectid   |        0        |
++ *	+--------------+---+---+--------------+-----------------+
++ *	|                  |                  |                 |
++ *	|      8 bytes     |      8 bytes     |     8 bytes     |
++ *
++ * locality id	object id of a directory where first name was created for
++ *		the object
++ *
++ * objectid	object id for this object
++ *
++ * FILE BODY
++ *
++ *	|      60      | 4 | 4 |      60      |        64       |
++ *	+--------------+---+---+--------------+-----------------+
++ *	|  locality id | 4 | 0 |   objectid   |      offset     |
++ *	+--------------+---+---+--------------+-----------------+
++ *	|                  |                  |                 |
++ *	|      8 bytes     |      8 bytes     |     8 bytes     |
++ *
++ * locality id	object id of a directory where first name was created for
++ *		the object
++ *
++ * objectid	object id for this object
++ *
++ * offset	logical offset from the beginning of this file.
++ *		Measured in bytes.
++ *
++ *
++ */
++
++#include "debug.h"
++#include "key.h"
++#include "kassign.h"
++#include "vfs_ops.h"
++#include "inode.h"
++#include "super.h"
++#include "dscale.h"
++
++#include <linux/types.h>	/* for __u?? */
++#include <linux/fs.h>		/* for struct super_block, etc */
++
++/* bitmask for H bit (see comment at the beginning of this file) */
++static const __u64 longname_mark = 0x0100000000000000ull;
++/* bitmask for F and H portions of the key. */
++static const __u64 fibration_mask = 0xff00000000000000ull;
++
++/* return true if name is not completely encoded in @key */
++int is_longname_key(const reiser4_key * key)
++{
++	__u64 highpart;
++
++	assert("nikita-2863", key != NULL);
++	if (get_key_type(key) != KEY_FILE_NAME_MINOR)
++		print_key("oops", key);
++	assert("nikita-2864", get_key_type(key) == KEY_FILE_NAME_MINOR);
++
++	if (REISER4_LARGE_KEY)
++		highpart = get_key_ordering(key);
++	else
++		highpart = get_key_objectid(key);
++
++	return (highpart & longname_mark) ? 1 : 0;
++}
++
++/* return true if @name is too long to be completely encoded in the key */
++int is_longname(const char *name UNUSED_ARG, int len)
++{
++	if (REISER4_LARGE_KEY)
++		return len > 23;
++	else
++		return len > 15;
++}
++
++/* encode an ASCII string into __u64.
++
++   Put characters of @name into result (@str) one after another starting
++   from @start_idx-th highest (arithmetically) byte. This produces
++   endian-safe encoding. memcpy(2) will not do.
++
++*/
++static __u64 pack_string(const char *name /* string to encode */ ,
++			 int start_idx	/* highest byte in result from
++					 * which to start encoding */ )
++{
++	unsigned i;
++	__u64 str;
++
++	str = 0;
++	for (i = 0; (i < sizeof str - start_idx) && name[i]; ++i) {
++		str <<= 8;
++		str |= (unsigned char)name[i];
++	}
++	str <<= (sizeof str - i - start_idx) << 3;
++	return str;
++}
++
++/* opposite to pack_string().
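++ * For illustration (values follow from the code above): pack_string("ab", 0)
++ * yields 0x6162000000000000ull, and unpack_string() of that value restores
++ * "ab".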
++ * unpack_string() takes a value produced by pack_string(), restores the
++ * string encoded in it and stores the result in @buf */
++char *unpack_string(__u64 value, char *buf)
++{
++	do {
++		*buf = value >> (64 - 8);
++		if (*buf)
++			++buf;
++		value <<= 8;
++	} while (value != 0);
++	*buf = 0;
++	return buf;
++}
++
++/* obtain name encoded in @key and store it in @buf */
++char *extract_name_from_key(const reiser4_key * key, char *buf)
++{
++	char *c;
++
++	assert("nikita-2868", !is_longname_key(key));
++
++	c = buf;
++	if (REISER4_LARGE_KEY) {
++		c = unpack_string(get_key_ordering(key) & ~fibration_mask, c);
++		c = unpack_string(get_key_fulloid(key), c);
++	} else
++		c = unpack_string(get_key_fulloid(key) & ~fibration_mask, c);
++	unpack_string(get_key_offset(key), c);
++	return buf;
++}
++
++/**
++ * complete_entry_key - calculate entry key by name
++ * @dir: directory where entry is (or will be) in
++ * @name: name to calculate key of
++ * @len: length of name
++ * @result: place to store result in
++ *
++ * Sets fields of entry key @result which depend on file name.
++ * When REISER4_LARGE_KEY is defined three fields of @result are set: ordering,
++ * objectid and offset. Otherwise, objectid and offset are set.
++ */
++void complete_entry_key(const struct inode *dir, const char *name,
++			int len, reiser4_key *result)
++{
++#if REISER4_LARGE_KEY
++	__u64 ordering;
++	__u64 objectid;
++	__u64 offset;
++
++	assert("nikita-1139", dir != NULL);
++	assert("nikita-1142", result != NULL);
++	assert("nikita-2867", strlen(name) == len);
++
++	/*
++	 * key allocation algorithm for directory entries in case of large
++	 * keys:
++	 *
++	 * If name is not longer than 7 + 8 + 8 = 23 characters, put the first
++	 * 7 characters into the ordering field of the key, the next 8
++	 * characters (if any) into the objectid field of the key and the next
++	 * 8 ones (if any) into the offset field of the key.
++	 *
++	 * If the file name is longer than 23 characters, put the first 7
++	 * characters into the key's ordering, the next 8 into objectid and a
++	 * hash of the remaining characters into the offset field.
++	 *
++	 * To distinguish the above cases, in the latter case set the unused
++	 * high bit in the ordering field.
++	 */
++
++	/* [0-6] characters to ordering */
++	ordering = pack_string(name, 1);
++	if (len > 7) {
++		/* [7-14] characters to objectid */
++		objectid = pack_string(name + 7, 0);
++		if (len > 15) {
++			if (len <= 23) {
++				/* [15-23] characters to offset */
++				offset = pack_string(name + 15, 0);
++			} else {
++				/* note in a key the fact that offset contains hash. */
++				ordering |= longname_mark;
++
++				/* offset is the hash of the file name's tail.
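++				 * For a 30-character name, for example, the
++				 * hash covers characters 15..29 of @name.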
++				 */
++				offset = inode_hash_plugin(dir)->hash(name + 15,
++								      len - 15);
++			}
++		} else {
++			offset = 0ull;
++		}
++	} else {
++		objectid = 0ull;
++		offset = 0ull;
++	}
++
++	assert("nikita-3480", inode_fibration_plugin(dir) != NULL);
++	ordering |= inode_fibration_plugin(dir)->fibre(dir, name, len);
++
++	set_key_ordering(result, ordering);
++	set_key_fulloid(result, objectid);
++	set_key_offset(result, offset);
++	return;
++
++#else
++	__u64 objectid;
++	__u64 offset;
++
++	assert("nikita-1139", dir != NULL);
++	assert("nikita-1142", result != NULL);
++	assert("nikita-2867", strlen(name) == len);
++
++	/*
++	 * key allocation algorithm for directory entries in case of not large
++	 * keys:
++	 *
++	 * If name is not longer than 7 + 8 = 15 characters, put the first 7
++	 * characters into the objectid field of the key and the next 8
++	 * characters (if any) into the offset field of the key.
++	 *
++	 * If the file name is longer than 15 characters, put the first 7
++	 * characters into the key's objectid, and a hash of the remaining
++	 * characters into the offset field.
++	 *
++	 * To distinguish the above cases, in the latter case set the unused
++	 * high bit in the objectid field.
++	 */
++
++	/* [0-6] characters to objectid */
++	objectid = pack_string(name, 1);
++	if (len > 7) {
++		if (len <= 15) {
++			/* [7-14] characters to offset */
++			offset = pack_string(name + 7, 0);
++		} else {
++			/* note in a key the fact that offset contains hash. */
++			objectid |= longname_mark;
++
++			/* offset is the hash of the file name. */
++			offset = inode_hash_plugin(dir)->hash(name + 7,
++							      len - 7);
++		}
++	} else
++		offset = 0ull;
++
++	assert("nikita-3480", inode_fibration_plugin(dir) != NULL);
++	objectid |= inode_fibration_plugin(dir)->fibre(dir, name, len);
++
++	set_key_fulloid(result, objectid);
++	set_key_offset(result, offset);
++	return;
++#endif				/* ! REISER4_LARGE_KEY */
++}
++
++/* true, if @key is the key of "." */
++int is_dot_key(const reiser4_key * key /* key to check */ )
++{
++	assert("nikita-1717", key != NULL);
++	assert("nikita-1718", get_key_type(key) == KEY_FILE_NAME_MINOR);
++	return
++	    (get_key_ordering(key) == 0ull) &&
++	    (get_key_objectid(key) == 0ull) && (get_key_offset(key) == 0ull);
++}
++
++/* build key for stat-data.
++
++   return key of stat-data of this object. This should become an sd plugin
++   method in the future. For now, let it be here.
++
++*/
++reiser4_key *build_sd_key(const struct inode * target /* inode of an object */ ,
++			  reiser4_key * result /* resulting key of @target
++						  stat-data */ )
++{
++	assert("nikita-261", result != NULL);
++
++	reiser4_key_init(result);
++	set_key_locality(result, reiser4_inode_data(target)->locality_id);
++	set_key_ordering(result, get_inode_ordering(target));
++	set_key_objectid(result, get_inode_oid(target));
++	set_key_type(result, KEY_SD_MINOR);
++	set_key_offset(result, (__u64) 0);
++	return result;
++}
++
++/* encode part of key into &obj_key_id
++
++   This encodes into @id part of @key sufficient to restore @key later,
++   given that the latter is a key of an object (key of stat-data).
++
++   See &obj_key_id
++*/
++int build_obj_key_id(const reiser4_key * key /* key to encode */ ,
++		     obj_key_id * id /* id where key is encoded in */ )
++{
++	assert("nikita-1151", key != NULL);
++	assert("nikita-1152", id != NULL);
++
++	memcpy(id, key, sizeof *id);
++	return 0;
++}
++
++/* encode reference to @obj in @id.
++
++   This is like build_obj_key_id() above, but takes inode as parameter.
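++   The resulting id is compact enough to be stored in a directory entry;
++   extract_key_from_id() can later rebuild the stat-data key from it.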
*/ ++int build_inode_key_id(const struct inode *obj /* object to build key of */ , ++ obj_key_id * id /* result */ ) ++{ ++ reiser4_key sdkey; ++ ++ assert("nikita-1166", obj != NULL); ++ assert("nikita-1167", id != NULL); ++ ++ build_sd_key(obj, &sdkey); ++ build_obj_key_id(&sdkey, id); ++ return 0; ++} ++ ++/* decode @id back into @key ++ ++ Restore key of object stat-data from @id. This is dual to ++ build_obj_key_id() above. ++*/ ++int extract_key_from_id(const obj_key_id * id /* object key id to extract key ++ * from */ , ++ reiser4_key * key /* result */ ) ++{ ++ assert("nikita-1153", id != NULL); ++ assert("nikita-1154", key != NULL); ++ ++ reiser4_key_init(key); ++ memcpy(key, id, sizeof *id); ++ return 0; ++} ++ ++/* extract objectid of directory from key of directory entry within said ++ directory. ++ */ ++oid_t extract_dir_id_from_key(const reiser4_key * de_key /* key of ++ * directory ++ * entry */ ) ++{ ++ assert("nikita-1314", de_key != NULL); ++ return get_key_locality(de_key); ++} ++ ++/* encode into @id key of directory entry. ++ ++ Encode into @id information sufficient to later distinguish directory ++ entries within the same directory. This is not whole key, because all ++ directory entries within directory item share locality which is equal ++ to objectid of their directory. ++ ++*/ ++int build_de_id(const struct inode *dir /* inode of directory */ , ++ const struct qstr *name /* name to be given to @obj by ++ * directory entry being ++ * constructed */ , ++ de_id * id /* short key of directory entry */ ) ++{ ++ reiser4_key key; ++ ++ assert("nikita-1290", dir != NULL); ++ assert("nikita-1292", id != NULL); ++ ++ /* NOTE-NIKITA this is suboptimal. */ ++ inode_dir_plugin(dir)->build_entry_key(dir, name, &key); ++ return build_de_id_by_key(&key, id); ++} ++ ++/* encode into @id key of directory entry. ++ ++ Encode into @id information sufficient to later distinguish directory ++ entries within the same directory. This is not whole key, because all ++ directory entries within directory item share locality which is equal ++ to objectid of their directory. ++ ++*/ ++int build_de_id_by_key(const reiser4_key * entry_key /* full key of directory ++ * entry */ , ++ de_id * id /* short key of directory entry */ ) ++{ ++ memcpy(id, ((__u64 *) entry_key) + 1, sizeof *id); ++ return 0; ++} ++ ++/* restore from @id key of directory entry. ++ ++ Function dual to build_de_id(): given @id and locality, build full ++ key of directory entry within directory item. 
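++   Only elements [1..] of the key come from @id; element 0 is rebuilt here
++   from @locality and KEY_FILE_NAME_MINOR.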
++ ++*/ ++int extract_key_from_de_id(const oid_t locality /* locality of directory ++ * entry */ , ++ const de_id * id /* directory entry id */ , ++ reiser4_key * key /* result */ ) ++{ ++ /* no need to initialise key here: all fields are overwritten */ ++ memcpy(((__u64 *) key) + 1, id, sizeof *id); ++ set_key_locality(key, locality); ++ set_key_type(key, KEY_FILE_NAME_MINOR); ++ return 0; ++} ++ ++/* compare two &de_id's */ ++cmp_t de_id_cmp(const de_id * id1 /* first &de_id to compare */ , ++ const de_id * id2 /* second &de_id to compare */ ) ++{ ++ /* NOTE-NIKITA ugly implementation */ ++ reiser4_key k1; ++ reiser4_key k2; ++ ++ extract_key_from_de_id((oid_t) 0, id1, &k1); ++ extract_key_from_de_id((oid_t) 0, id2, &k2); ++ return keycmp(&k1, &k2); ++} ++ ++/* compare &de_id with key */ ++cmp_t de_id_key_cmp(const de_id * id /* directory entry id to compare */ , ++ const reiser4_key * key /* key to compare */ ) ++{ ++ cmp_t result; ++ reiser4_key *k1; ++ ++ k1 = (reiser4_key *) (((unsigned long)id) - sizeof key->el[0]); ++ result = KEY_DIFF_EL(k1, key, 1); ++ if (result == EQUAL_TO) { ++ result = KEY_DIFF_EL(k1, key, 2); ++ if (REISER4_LARGE_KEY && result == EQUAL_TO) { ++ result = KEY_DIFF_EL(k1, key, 3); ++ } ++ } ++ return result; ++} ++ ++/* ++ * return number of bytes necessary to encode @inode identity. ++ */ ++int inode_onwire_size(const struct inode *inode) ++{ ++ int result; ++ ++ result = dscale_bytes(get_inode_oid(inode)); ++ result += dscale_bytes(get_inode_locality(inode)); ++ ++ /* ++ * ordering is large (it usually has highest bits set), so it makes ++ * little sense to dscale it. ++ */ ++ if (REISER4_LARGE_KEY) ++ result += sizeof(get_inode_ordering(inode)); ++ return result; ++} ++ ++/* ++ * encode @inode identity at @start ++ */ ++char *build_inode_onwire(const struct inode *inode, char *start) ++{ ++ start += dscale_write(start, get_inode_locality(inode)); ++ start += dscale_write(start, get_inode_oid(inode)); ++ ++ if (REISER4_LARGE_KEY) { ++ put_unaligned(cpu_to_le64(get_inode_ordering(inode)), (__le64 *)start); ++ start += sizeof(get_inode_ordering(inode)); ++ } ++ return start; ++} ++ ++/* ++ * extract key that was previously encoded by build_inode_onwire() at @addr ++ */ ++char *extract_obj_key_id_from_onwire(char *addr, obj_key_id * key_id) ++{ ++ __u64 val; ++ ++ addr += dscale_read(addr, &val); ++ val = (val << KEY_LOCALITY_SHIFT) | KEY_SD_MINOR; ++ put_unaligned(cpu_to_le64(val), (__le64 *)key_id->locality); ++ addr += dscale_read(addr, &val); ++ put_unaligned(cpu_to_le64(val), (__le64 *)key_id->objectid); ++#if REISER4_LARGE_KEY ++ memcpy(&key_id->ordering, addr, sizeof key_id->ordering); ++ addr += sizeof key_id->ordering; ++#endif ++ return addr; ++} ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +Index: linux-2.6.16/fs/reiser4/kassign.h +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/kassign.h +@@ -0,0 +1,110 @@ ++/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++/* Key assignment policy interface. See kassign.c for details. */ ++ ++#if !defined( __KASSIGN_H__ ) ++#define __KASSIGN_H__ ++ ++#include "forward.h" ++#include "key.h" ++#include "dformat.h" ++ ++#include /* for __u?? 
*/
++#include <linux/fs.h>		/* for struct super_block, etc */
++#include <linux/dcache.h>	/* for struct qstr */
++
++/* key assignment functions */
++
++/* Information from which key of file stat-data can be uniquely
++   restored. This depends on key assignment policy for
++   stat-data. Currently it's enough to store object id and locality id
++   (60+60==120) bits, because minor packing locality and offset of
++   stat-data key are always known constants: KEY_SD_MINOR and 0
++   respectively. For simplicity 4 bits are wasted in each id, and just
++   two 64 bit integers are stored.
++
++   This field has to be byte-aligned, because we don't want to waste
++   space in directory entries. There is the other side of the coin, of
++   course: we waste CPU and bus bandwidth instead, by copying data back
++   and forth.
++
++   Next optimization: &obj_key_id is mainly used to address stat data from
++   directory entries. Under the assumption that the majority of files have
++   only one name (one hard link) from *the* parent directory it seems
++   reasonable to only store the objectid of the stat data and take its
++   locality from the key of the directory item.
++
++   This requires some flag to be added to the &obj_key_id to distinguish
++   between these two cases. The remaining bits in the flag byte are then
++   asking to be used to store the file type.
++
++   This optimization requires changes in directory item handling code.
++
++*/
++typedef struct obj_key_id {
++	d8 locality[sizeof(__u64)];
++	ON_LARGE_KEY(d8 ordering[sizeof(__u64)];
++	    )
++	d8 objectid[sizeof(__u64)];
++}
++obj_key_id;
++
++/* Information sufficient to uniquely identify directory entry within
++   compressed directory item.
++
++   For alignment issues see &obj_key_id above.
++*/
++typedef struct de_id {
++	ON_LARGE_KEY(d8 ordering[sizeof(__u64)];)
++	d8 objectid[sizeof(__u64)];
++	d8 offset[sizeof(__u64)];
++}
++de_id;
++
++extern int inode_onwire_size(const struct inode *obj);
++extern char *build_inode_onwire(const struct inode *obj, char *area);
++extern char *extract_obj_key_id_from_onwire(char *area, obj_key_id * key_id);
++
++extern int build_inode_key_id(const struct inode *obj, obj_key_id * id);
++extern int extract_key_from_id(const obj_key_id * id, reiser4_key * key);
++extern int build_obj_key_id(const reiser4_key * key, obj_key_id * id);
++extern oid_t extract_dir_id_from_key(const reiser4_key * de_key);
++extern int build_de_id(const struct inode *dir, const struct qstr *name,
++		       de_id * id);
++extern int build_de_id_by_key(const reiser4_key * entry_key, de_id * id);
++extern int extract_key_from_de_id(const oid_t locality, const de_id * id,
++				  reiser4_key * key);
++extern cmp_t de_id_cmp(const de_id * id1, const de_id * id2);
++extern cmp_t de_id_key_cmp(const de_id * id, const reiser4_key * key);
++
++extern int build_readdir_key_common(struct file *dir, reiser4_key * result);
++extern void build_entry_key_common(const struct inode *dir,
++				   const struct qstr *name,
++				   reiser4_key * result);
++extern void build_entry_key_stable_entry(const struct inode *dir,
++					 const struct qstr *name,
++					 reiser4_key * result);
++extern int is_dot_key(const reiser4_key * key);
++extern reiser4_key *build_sd_key(const struct inode *target,
++				 reiser4_key * result);
++
++extern int is_longname_key(const reiser4_key * key);
++extern int is_longname(const char *name, int len);
++extern char *extract_name_from_key(const reiser4_key * key, char *buf);
++extern char *unpack_string(__u64 value, char *buf);
++extern void complete_entry_key(const struct inode *dir, const char *name,
++			       int len, reiser4_key *result);
++
++/*
__KASSIGN_H__ */ ++#endif ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +Index: linux-2.6.16/fs/reiser4/key.c +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/key.c +@@ -0,0 +1,137 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++/* Key manipulations. */ ++ ++#include "debug.h" ++#include "key.h" ++#include "super.h" ++#include "reiser4.h" ++ ++#include /* for __u?? */ ++ ++/* Minimal possible key: all components are zero. It is presumed that this is ++ independent of key scheme. */ ++static const reiser4_key MINIMAL_KEY = { ++ .el = { ++ 0ull, ++ ON_LARGE_KEY(0ull,) ++ 0ull, ++ 0ull ++ } ++}; ++ ++/* Maximal possible key: all components are ~0. It is presumed that this is ++ independent of key scheme. */ ++static const reiser4_key MAXIMAL_KEY = { ++ .el = { ++ __constant_cpu_to_le64(~0ull), ++ ON_LARGE_KEY(__constant_cpu_to_le64(~0ull),) ++ __constant_cpu_to_le64(~0ull), ++ __constant_cpu_to_le64(~0ull) ++ } ++}; ++ ++/* Initialize key. */ ++void reiser4_key_init(reiser4_key * key /* key to init */ ) ++{ ++ assert("nikita-1169", key != NULL); ++ memset(key, 0, sizeof *key); ++} ++ ++/* minimal possible key in the tree. Return pointer to the static storage. */ ++const reiser4_key *min_key(void) ++{ ++ return &MINIMAL_KEY; ++} ++ ++/* maximum possible key in the tree. Return pointer to the static storage. */ ++const reiser4_key *max_key(void) ++{ ++ return &MAXIMAL_KEY; ++} ++ ++#if REISER4_DEBUG ++/* debugging aid: print symbolic name of key type */ ++static const char *type_name(unsigned int key_type /* key type */ ) ++{ ++ switch (key_type) { ++ case KEY_FILE_NAME_MINOR: ++ return "file name"; ++ case KEY_SD_MINOR: ++ return "stat data"; ++ case KEY_ATTR_NAME_MINOR: ++ return "attr name"; ++ case KEY_ATTR_BODY_MINOR: ++ return "attr body"; ++ case KEY_BODY_MINOR: ++ return "file body"; ++ default: ++ return "unknown"; ++ } ++} ++ ++/* debugging aid: print human readable information about key */ ++void print_key(const char *prefix /* prefix to print */ , ++ const reiser4_key * key /* key to print */ ) ++{ ++ /* turn bold on */ ++ /* printf ("\033[1m"); */ ++ if (key == NULL) ++ printk("%s: null key\n", prefix); ++ else { ++ if (REISER4_LARGE_KEY) ++ printk("%s: (%Lx:%x:%Lx:%Lx:%Lx:%Lx)", prefix, ++ get_key_locality(key), ++ get_key_type(key), ++ get_key_ordering(key), ++ get_key_band(key), ++ get_key_objectid(key), get_key_offset(key)); ++ else ++ printk("%s: (%Lx:%x:%Lx:%Lx:%Lx)", prefix, ++ get_key_locality(key), ++ get_key_type(key), ++ get_key_band(key), ++ get_key_objectid(key), get_key_offset(key)); ++ /* ++ * if this is a key of directory entry, try to decode part of ++ * a name stored in the key, and output it. ++ */ ++ if (get_key_type(key) == KEY_FILE_NAME_MINOR) { ++ char buf[DE_NAME_BUF_LEN]; ++ char *c; ++ ++ c = buf; ++ c = unpack_string(get_key_ordering(key), c); ++ unpack_string(get_key_fulloid(key), c); ++ printk("[%s", buf); ++ if (is_longname_key(key)) ++ /* ++ * only part of the name is stored in the key. ++ */ ++ printk("...]\n"); ++ else { ++ /* ++ * whole name is stored in the key. ++ */ ++ unpack_string(get_key_offset(key), buf); ++ printk("%s]\n", buf); ++ } ++ } else { ++ printk("[%s]\n", type_name(get_key_type(key))); ++ } ++ } ++ /* turn bold off */ ++ /* printf ("\033[m\017"); */ ++} ++ ++#endif ++ ++/* Make Linus happy. 
++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +Index: linux-2.6.16/fs/reiser4/key.h +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/key.h +@@ -0,0 +1,384 @@ ++/* Copyright 2000, 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++/* Declarations of key-related data-structures and operations on keys. */ ++ ++#if !defined( __REISER4_KEY_H__ ) ++#define __REISER4_KEY_H__ ++ ++#include "dformat.h" ++#include "forward.h" ++#include "debug.h" ++ ++#include /* for __u?? */ ++ ++/* Operations on keys in reiser4 tree */ ++ ++/* No access to any of these fields shall be done except via a ++ wrapping macro/function, and that wrapping macro/function shall ++ convert to little endian order. Compare keys will consider cpu byte order. */ ++ ++/* A storage layer implementation difference between a regular unix file body and its attributes is in the typedef below ++ which causes all of the attributes of a file to be near in key to all of the other attributes for all of the files ++ within that directory, and not near to the file itself. It is interesting to consider whether this is the wrong ++ approach, and whether there should be no difference at all. For current usage patterns this choice is probably the ++ right one. */ ++ ++/* possible values for minor packing locality (4 bits required) */ ++typedef enum { ++ /* file name */ ++ KEY_FILE_NAME_MINOR = 0, ++ /* stat-data */ ++ KEY_SD_MINOR = 1, ++ /* file attribute name */ ++ KEY_ATTR_NAME_MINOR = 2, ++ /* file attribute value */ ++ KEY_ATTR_BODY_MINOR = 3, ++ /* file body (tail or extent) */ ++ KEY_BODY_MINOR = 4, ++} key_minor_locality; ++ ++/* everything stored in the tree has a unique key, which means that the tree is (logically) fully ordered by key. ++ Physical order is determined by dynamic heuristics that attempt to reflect key order when allocating available space, ++ and by the repacker. It is stylistically better to put aggregation information into the key. Thus, if you want to ++ segregate extents from tails, it is better to give them distinct minor packing localities rather than changing ++ block_alloc.c to check the node type when deciding where to allocate the node. ++ ++ The need to randomly displace new directories and large files disturbs this symmetry unfortunately. However, it ++ should be noted that this is a need that is not clearly established given the existence of a repacker. Also, in our ++ current implementation tails have a different minor packing locality from extents, and no files have both extents and ++ tails, so maybe symmetry can be had without performance cost after all. Symmetry is what we ship for now.... ++*/ ++ ++/* Arbitrary major packing localities can be assigned to objects using ++ the reiser4(filenameA/..packing<=some_number) system call. ++ ++ In reiser4, the creat() syscall creates a directory ++ ++ whose default flow (that which is referred to if the directory is ++ read as a file) is the traditional unix file body. ++ ++ whose directory plugin is the 'filedir' ++ ++ whose major packing locality is that of the parent of the object created. ++ ++ The static_stat item is a particular commonly used directory ++ compression (the one for normal unix files). ++ ++ The filedir plugin checks to see if the static_stat item exists. ++ There is a unique key for static_stat. 
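++   (Its key is built by build_sd_key(): minor packing locality KEY_SD_MINOR
++   and offset 0.)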
If yes, then it uses the ++ static_stat item for all of the values that it contains. The ++ static_stat item contains a flag for each stat it contains which ++ indicates whether one should look outside the static_stat item for its ++ contents. ++*/ ++ ++/* offset of fields in reiser4_key. Value of each element of this enum ++ is index within key (thought as array of __u64's) where this field ++ is. */ ++typedef enum { ++ /* major "locale", aka dirid. Sits in 1st element */ ++ KEY_LOCALITY_INDEX = 0, ++ /* minor "locale", aka item type. Sits in 1st element */ ++ KEY_TYPE_INDEX = 0, ++ ON_LARGE_KEY(KEY_ORDERING_INDEX,) ++ /* "object band". Sits in 2nd element */ ++ KEY_BAND_INDEX, ++ /* objectid. Sits in 2nd element */ ++ KEY_OBJECTID_INDEX = KEY_BAND_INDEX, ++ /* full objectid. Sits in 2nd element */ ++ KEY_FULLOID_INDEX = KEY_BAND_INDEX, ++ /* Offset. Sits in 3rd element */ ++ KEY_OFFSET_INDEX, ++ /* Name hash. Sits in 3rd element */ ++ KEY_HASH_INDEX = KEY_OFFSET_INDEX, ++ KEY_CACHELINE_END = KEY_OFFSET_INDEX, ++ KEY_LAST_INDEX ++} reiser4_key_field_index; ++ ++/* key in reiser4 internal "balanced" tree. It is just array of three ++ 64bit integers in disk byte order (little-endian by default). This ++ array is actually indexed by reiser4_key_field. Each __u64 within ++ this array is called "element". Logical key component encoded within ++ elements are called "fields". ++ ++ We declare this as union with second component dummy to suppress ++ inconvenient array<->pointer casts implied in C. */ ++union reiser4_key { ++ __le64 el[KEY_LAST_INDEX]; ++ int pad; ++}; ++ ++/* bitmasks showing where within reiser4_key particular key is stored. */ ++/* major locality occupies higher 60 bits of the first element */ ++#define KEY_LOCALITY_MASK 0xfffffffffffffff0ull ++ ++/* minor locality occupies lower 4 bits of the first element */ ++#define KEY_TYPE_MASK 0xfull ++ ++/* controversial band occupies higher 4 bits of the 2nd element */ ++#define KEY_BAND_MASK 0xf000000000000000ull ++ ++/* objectid occupies lower 60 bits of the 2nd element */ ++#define KEY_OBJECTID_MASK 0x0fffffffffffffffull ++ ++/* full 64bit objectid*/ ++#define KEY_FULLOID_MASK 0xffffffffffffffffull ++ ++/* offset is just 3rd L.M.Nt itself */ ++#define KEY_OFFSET_MASK 0xffffffffffffffffull ++ ++/* ordering is whole second element */ ++#define KEY_ORDERING_MASK 0xffffffffffffffffull ++ ++/* how many bits key element should be shifted to left to get particular field */ ++typedef enum { ++ KEY_LOCALITY_SHIFT = 4, ++ KEY_TYPE_SHIFT = 0, ++ KEY_BAND_SHIFT = 60, ++ KEY_OBJECTID_SHIFT = 0, ++ KEY_FULLOID_SHIFT = 0, ++ KEY_OFFSET_SHIFT = 0, ++ KEY_ORDERING_SHIFT = 0, ++} reiser4_key_field_shift; ++ ++static inline __u64 ++get_key_el(const reiser4_key * key, reiser4_key_field_index off) ++{ ++ assert("nikita-753", key != NULL); ++ assert("nikita-754", off < KEY_LAST_INDEX); ++ return le64_to_cpu(get_unaligned(&key->el[off])); ++} ++ ++static inline void ++set_key_el(reiser4_key * key, reiser4_key_field_index off, __u64 value) ++{ ++ assert("nikita-755", key != NULL); ++ assert("nikita-756", off < KEY_LAST_INDEX); ++ put_unaligned(cpu_to_le64(value), &key->el[off]); ++} ++ ++/* macro to define getter and setter functions for field F with type T */ ++#define DEFINE_KEY_FIELD( L, U, T ) \ ++static inline T get_key_ ## L ( const reiser4_key *key ) \ ++{ \ ++ assert( "nikita-750", key != NULL ); \ ++ return ( T ) ( get_key_el( key, KEY_ ## U ## _INDEX ) & \ ++ KEY_ ## U ## _MASK ) >> KEY_ ## U ## _SHIFT; \ ++} \ ++ \ ++static inline void set_key_ 
## L ( reiser4_key *key, T loc ) \ ++{ \ ++ __u64 el; \ ++ \ ++ assert( "nikita-752", key != NULL ); \ ++ \ ++ el = get_key_el( key, KEY_ ## U ## _INDEX ); \ ++ /* clear field bits in the key */ \ ++ el &= ~KEY_ ## U ## _MASK; \ ++ /* actually it should be \ ++ \ ++ el |= ( loc << KEY_ ## U ## _SHIFT ) & KEY_ ## U ## _MASK; \ ++ \ ++ but we trust user to never pass values that wouldn't fit \ ++ into field. Clearing extra bits is one operation, but this \ ++ function is time-critical. \ ++ But check this in assertion. */ \ ++ assert( "nikita-759", ( ( loc << KEY_ ## U ## _SHIFT ) & \ ++ ~KEY_ ## U ## _MASK ) == 0 ); \ ++ el |= ( loc << KEY_ ## U ## _SHIFT ); \ ++ set_key_el( key, KEY_ ## U ## _INDEX, el ); \ ++} ++ ++typedef __u64 oid_t; ++ ++/* define get_key_locality(), set_key_locality() */ ++DEFINE_KEY_FIELD(locality, LOCALITY, oid_t); ++/* define get_key_type(), set_key_type() */ ++DEFINE_KEY_FIELD(type, TYPE, key_minor_locality); ++/* define get_key_band(), set_key_band() */ ++DEFINE_KEY_FIELD(band, BAND, __u64); ++/* define get_key_objectid(), set_key_objectid() */ ++DEFINE_KEY_FIELD(objectid, OBJECTID, oid_t); ++/* define get_key_fulloid(), set_key_fulloid() */ ++DEFINE_KEY_FIELD(fulloid, FULLOID, oid_t); ++/* define get_key_offset(), set_key_offset() */ ++DEFINE_KEY_FIELD(offset, OFFSET, __u64); ++#if (REISER4_LARGE_KEY) ++/* define get_key_ordering(), set_key_ordering() */ ++DEFINE_KEY_FIELD(ordering, ORDERING, __u64); ++#else ++static inline __u64 get_key_ordering(const reiser4_key * key) ++{ ++ return 0; ++} ++ ++static inline void set_key_ordering(reiser4_key * key, __u64 val) ++{ ++} ++#endif ++ ++/* key comparison result */ ++typedef enum { LESS_THAN = -1, /* if first key is less than second */ ++ EQUAL_TO = 0, /* if keys are equal */ ++ GREATER_THAN = +1 /* if first key is greater than second */ ++} cmp_t; ++ ++void reiser4_key_init(reiser4_key * key); ++ ++/* minimal possible key in the tree. Return pointer to the static storage. */ ++extern const reiser4_key *min_key(void); ++extern const reiser4_key *max_key(void); ++ ++/* helper macro for keycmp() */ ++#define KEY_DIFF(k1, k2, field) \ ++({ \ ++ typeof (get_key_ ## field (k1)) f1; \ ++ typeof (get_key_ ## field (k2)) f2; \ ++ \ ++ f1 = get_key_ ## field (k1); \ ++ f2 = get_key_ ## field (k2); \ ++ \ ++ (f1 < f2) ? LESS_THAN : ((f1 == f2) ? EQUAL_TO : GREATER_THAN); \ ++}) ++ ++/* helper macro for keycmp() */ ++#define KEY_DIFF_EL(k1, k2, off) \ ++({ \ ++ __u64 e1; \ ++ __u64 e2; \ ++ \ ++ e1 = get_key_el(k1, off); \ ++ e2 = get_key_el(k2, off); \ ++ \ ++ (e1 < e2) ? LESS_THAN : ((e1 == e2) ? EQUAL_TO : GREATER_THAN); \ ++}) ++ ++/* compare `k1' and `k2'. This function is a heart of "key allocation ++ policy". All you need to implement new policy is to add yet another ++ clause here. */ ++static inline cmp_t keycmp(const reiser4_key * k1 /* first key to compare */ , ++ const reiser4_key * k2 /* second key to compare */ ) ++{ ++ cmp_t result; ++ ++ /* ++ * This function is the heart of reiser4 tree-routines. Key comparison ++ * is among most heavily used operations in the file system. ++ */ ++ ++ assert("nikita-439", k1 != NULL); ++ assert("nikita-440", k2 != NULL); ++ ++ /* there is no actual branch here: condition is compile time constant ++ * and constant folding and propagation ensures that only one branch ++ * is actually compiled in. */ ++ ++ if (REISER4_PLANA_KEY_ALLOCATION) { ++ /* if physical order of fields in a key is identical ++ with logical order, we can implement key comparison ++ as three 64bit comparisons. 
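++		   This works because each element is read via le64_to_cpu()
++		   and more significant fields occupy higher bits within an
++		   element, so unsigned comparison of whole elements compares
++		   the fields in their logical order.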
*/ ++ /* logical order of fields in plan-a: ++ locality->type->objectid->offset. */ ++ /* compare locality and type at once */ ++ result = KEY_DIFF_EL(k1, k2, 0); ++ if (result == EQUAL_TO) { ++ /* compare objectid (and band if it's there) */ ++ result = KEY_DIFF_EL(k1, k2, 1); ++ /* compare offset */ ++ if (result == EQUAL_TO) { ++ result = KEY_DIFF_EL(k1, k2, 2); ++ if (REISER4_LARGE_KEY && result == EQUAL_TO) { ++ result = KEY_DIFF_EL(k1, k2, 3); ++ } ++ } ++ } ++ } else if (REISER4_3_5_KEY_ALLOCATION) { ++ result = KEY_DIFF(k1, k2, locality); ++ if (result == EQUAL_TO) { ++ result = KEY_DIFF(k1, k2, objectid); ++ if (result == EQUAL_TO) { ++ result = KEY_DIFF(k1, k2, type); ++ if (result == EQUAL_TO) ++ result = KEY_DIFF(k1, k2, offset); ++ } ++ } ++ } else ++ impossible("nikita-441", "Unknown key allocation scheme!"); ++ return result; ++} ++ ++/* true if @k1 equals @k2 */ ++static inline int keyeq(const reiser4_key * k1 /* first key to compare */ , ++ const reiser4_key * k2 /* second key to compare */ ) ++{ ++ assert("nikita-1879", k1 != NULL); ++ assert("nikita-1880", k2 != NULL); ++ return !memcmp(k1, k2, sizeof *k1); ++} ++ ++/* true if @k1 is less than @k2 */ ++static inline int keylt(const reiser4_key * k1 /* first key to compare */ , ++ const reiser4_key * k2 /* second key to compare */ ) ++{ ++ assert("nikita-1952", k1 != NULL); ++ assert("nikita-1953", k2 != NULL); ++ return keycmp(k1, k2) == LESS_THAN; ++} ++ ++/* true if @k1 is less than or equal to @k2 */ ++static inline int keyle(const reiser4_key * k1 /* first key to compare */ , ++ const reiser4_key * k2 /* second key to compare */ ) ++{ ++ assert("nikita-1954", k1 != NULL); ++ assert("nikita-1955", k2 != NULL); ++ return keycmp(k1, k2) != GREATER_THAN; ++} ++ ++/* true if @k1 is greater than @k2 */ ++static inline int keygt(const reiser4_key * k1 /* first key to compare */ , ++ const reiser4_key * k2 /* second key to compare */ ) ++{ ++ assert("nikita-1959", k1 != NULL); ++ assert("nikita-1960", k2 != NULL); ++ return keycmp(k1, k2) == GREATER_THAN; ++} ++ ++/* true if @k1 is greater than or equal to @k2 */ ++static inline int keyge(const reiser4_key * k1 /* first key to compare */ , ++ const reiser4_key * k2 /* second key to compare */ ) ++{ ++ assert("nikita-1956", k1 != NULL); ++ assert("nikita-1957", k2 != NULL); /* October 4: sputnik launched ++ * November 3: Laika */ ++ return keycmp(k1, k2) != LESS_THAN; ++} ++ ++static inline void prefetchkey(reiser4_key * key) ++{ ++ prefetch(key); ++ prefetch(&key->el[KEY_CACHELINE_END]); ++} ++ ++/* (%Lx:%x:%Lx:%Lx:%Lx:%Lx) = ++ 1 + 16 + 1 + 1 + 1 + 1 + 1 + 16 + 1 + 16 + 1 + 16 + 1 */ ++/* size of a buffer suitable to hold human readable key representation */ ++#define KEY_BUF_LEN (80) ++ ++#if REISER4_DEBUG ++extern void print_key(const char *prefix, const reiser4_key * key); ++#else ++#define print_key(p,k) noop ++#endif ++ ++/* __FS_REISERFS_KEY_H__ */ ++#endif ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +Index: linux-2.6.16/fs/reiser4/ktxnmgrd.c +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/ktxnmgrd.c +@@ -0,0 +1,214 @@ ++/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++/* Transaction manager daemon. */ ++ ++/* ++ * ktxnmgrd is a kernel daemon responsible for committing transactions. It is ++ * needed/important for the following reasons: ++ * ++ * 1. 
in reiser4 an atom is not committed immediately when the last transaction
++ * handle closes, unless the atom is either too old or too large (see
++ * atom_should_commit()). This is done to avoid committing too frequently,
++ * because:
++ *
++ * 2. sometimes we don't want to commit an atom when closing the last
++ * transaction handle even if it is old and fat enough. For example, because
++ * we are at this point under a directory semaphore, and committing would
++ * stall all accesses to this directory.
++ *
++ * ktxnmgrd bides its time sleeping on a condition variable. When it awakes,
++ * either due to a (tunable) timeout or because it was explicitly woken up by
++ * a call to ktxnmgrd_kick(), it scans the list of all atoms and commits the
++ * eligible ones.
++ *
++ */
++
++#include "debug.h"
++#include "txnmgr.h"
++#include "tree.h"
++#include "ktxnmgrd.h"
++#include "super.h"
++#include "reiser4.h"
++
++#include <linux/sched.h>	/* for struct task_struct */
++#include
++#include
++#include
++#include
++#include
++
++static int scan_mgr(struct super_block *);
++
++/*
++ * change current->comm so that ps, top, and friends will see changed
++ * state. This serves no useful purpose whatsoever, but also costs nothing.
++ * Maybe it will make a lonely system administrator feel less alone at 3 A.M.
++ */
++#define set_comm( state )						\
++	snprintf( current -> comm, sizeof( current -> comm ),		\
++		  "%s:%s:%s", __FUNCTION__, (super)->s_id, ( state ) )
++
++/**
++ * ktxnmgrd - kernel txnmgr daemon
++ * @arg: pointer to super block
++ *
++ * The background transaction manager daemon, started as a kernel thread
++ * during reiser4 initialization.
++ */
++static int ktxnmgrd(void *arg)
++{
++	struct super_block *super;
++	ktxnmgrd_context *ctx;
++	txn_mgr *mgr;
++	int done = 0;
++
++	super = arg;
++	mgr = &get_super_private(super)->tmgr;
++
++	/*
++	 * do_fork() just copies task_struct into the new thread. ->fs_context
++	 * shouldn't be copied of course. This shouldn't be a problem for the
++	 * rest of the code though.
++	 */
++	current->journal_info = NULL;
++	ctx = mgr->daemon;
++	while (1) {
++		try_to_freeze();
++		set_comm("wait");
++		{
++			DEFINE_WAIT(__wait);
++
++			prepare_to_wait(&ctx->wait, &__wait, TASK_INTERRUPTIBLE);
++			if (kthread_should_stop()) {
++				done = 1;
++			} else
++				schedule_timeout(ctx->timeout);
++			finish_wait(&ctx->wait, &__wait);
++		}
++		if (done)
++			break;
++		set_comm("run");
++		spin_lock(&ctx->guard);
++		/*
++		 * wait timed out or ktxnmgrd was woken up by explicit request
++		 * to commit something. Scan list of atoms in txnmgr and look
++		 * for too old atoms.
++		 */
++		do {
++			ctx->rescan = 0;
++			/* ->guard must be released while scan_mgr() runs; the
++			 * comment below relies on exactly this. */
++			spin_unlock(&ctx->guard);
++			scan_mgr(super);
++			spin_lock(&ctx->guard);
++			if (ctx->rescan) {
++				/*
++				 * the list could be modified while ctx
++				 * spinlock was released, we have to repeat
++				 * scanning from the beginning
++				 */
++				break;
++			}
++		} while (ctx->rescan);
++		spin_unlock(&ctx->guard);
++	}
++	return 0;
++}
++
++#undef set_comm
++
++/**
++ * init_ktxnmgrd - initialize ktxnmgrd context and start kernel daemon
++ * @super: pointer to super block
++ *
++ * Allocates and initializes ktxnmgrd_context, attaches it to transaction
++ * manager. Starts kernel txnmgr daemon. This is called on mount.
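++ * If the daemon thread cannot be started, the context is freed,
++ * ->daemon is reset to NULL and a negative errno is returned.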
++ */
++int init_ktxnmgrd(struct super_block *super)
++{
++	txn_mgr *mgr;
++	ktxnmgrd_context *ctx;
++
++	mgr = &get_super_private(super)->tmgr;
++
++	assert("zam-1014", mgr->daemon == NULL);
++
++	ctx = kmalloc(sizeof(ktxnmgrd_context), get_gfp_mask());
++	if (ctx == NULL)
++		return RETERR(-ENOMEM);
++
++	assert("nikita-2442", ctx != NULL);
++
++	memset(ctx, 0, sizeof *ctx);
++	init_waitqueue_head(&ctx->wait);
++
++	/*kcond_init(&ctx->startup);*/
++	spin_lock_init(&ctx->guard);
++	ctx->timeout = REISER4_TXNMGR_TIMEOUT;
++	ctx->rescan = 1;
++	mgr->daemon = ctx;
++
++	ctx->tsk = kthread_run(ktxnmgrd, super, "ktxnmgrd");
++	if (IS_ERR(ctx->tsk)) {
++		int ret = PTR_ERR(ctx->tsk);
++		mgr->daemon = NULL;
++		kfree(ctx);
++		return RETERR(ret);
++	}
++	return 0;
++}
++
++void ktxnmgrd_kick(txn_mgr *mgr)
++{
++	assert("nikita-3234", mgr != NULL);
++	assert("nikita-3235", mgr->daemon != NULL);
++	wake_up(&mgr->daemon->wait);
++}
++
++int is_current_ktxnmgrd(void)
++{
++	return (get_current_super_private()->tmgr.daemon->tsk == current);
++}
++
++/**
++ * scan_mgr - commit atoms which are to be committed
++ * @super: super block to commit atoms of
++ *
++ * Commits old atoms.
++ */
++static int scan_mgr(struct super_block *super)
++{
++	int ret;
++	reiser4_context ctx;
++
++	init_stack_context(&ctx, super);
++
++	ret = commit_some_atoms(&get_super_private(super)->tmgr);
++
++	reiser4_exit_context(&ctx);
++	return ret;
++}
++
++/**
++ * done_ktxnmgrd - stop kernel thread and free ktxnmgrd context
++ * @super: super block whose ktxnmgrd is being stopped
++ *
++ * This is called on umount. Stops ktxnmgrd and frees the ktxnmgrd context.
++ */
++void done_ktxnmgrd(struct super_block *super)
++{
++	txn_mgr *mgr;
++
++	mgr = &get_super_private(super)->tmgr;
++	assert("zam-1012", mgr->daemon != NULL);
++
++	kthread_stop(mgr->daemon->tsk);
++	kfree(mgr->daemon);
++	mgr->daemon = NULL;
++}
++
++/*
++ * Local variables:
++ * c-indentation-style: "K&R"
++ * mode-name: "LC"
++ * c-basic-offset: 8
++ * tab-width: 8
++ * fill-column: 120
++ * End:
++ */
+Index: linux-2.6.16/fs/reiser4/ktxnmgrd.h
+===================================================================
+--- /dev/null
++++ linux-2.6.16/fs/reiser4/ktxnmgrd.h
+@@ -0,0 +1,52 @@
++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
++ * reiser4/README */
++
++/* Transaction manager daemon. See ktxnmgrd.c for comments. */
++
++#ifndef __KTXNMGRD_H__
++#define __KTXNMGRD_H__
++
++#include "txnmgr.h"
++
++#include
++#include
++#include
++#include
++#include
++#include <linux/sched.h>	/* for struct task_struct */
++
++/* in this structure all data necessary to start up, shut down and
++ * communicate with ktxnmgrd are kept. */
++struct ktxnmgrd_context {
++	/* wait queue head on which ktxnmgrd sleeps */
++	wait_queue_head_t wait;
++	/* spin lock protecting all fields of this structure */
++	spinlock_t guard;
++	/* timeout of sleeping on ->wait */
++	signed long timeout;
++	/* kernel thread running ktxnmgrd */
++	struct task_struct *tsk;
++	/* list of all file systems served by this ktxnmgrd */
++	struct list_head queue;
++	/* should ktxnmgrd repeat scanning of atoms? */
++	unsigned int rescan:1;
++};
++
++extern int init_ktxnmgrd(struct super_block *);
++extern void done_ktxnmgrd(struct super_block *);
++
++extern void ktxnmgrd_kick(txn_mgr * mgr);
++extern int is_current_ktxnmgrd(void);
++
++/* __KTXNMGRD_H__ */
++#endif
++
++/* Make Linus happy.
++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +Index: linux-2.6.16/fs/reiser4/lock.c +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/lock.c +@@ -0,0 +1,1261 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++/* Traditional deadlock avoidance is achieved by acquiring all locks in a single ++ order. V4 balances the tree from the bottom up, and searches the tree from ++ the top down, and that is really the way we want it, so tradition won't work ++ for us. ++ ++ Instead we have two lock orderings, a high priority lock ordering, and a low ++ priority lock ordering. Each node in the tree has a lock in its znode. ++ ++ Suppose we have a set of processes which lock (R/W) tree nodes. Each process ++ has a set (maybe empty) of already locked nodes ("process locked set"). Each ++ process may have a pending lock request to a node locked by another process. ++ Note: we lock and unlock, but do not transfer locks: it is possible ++ transferring locks instead would save some bus locking.... ++ ++ Deadlock occurs when we have a loop constructed from process locked sets and ++ lock request vectors. ++ ++ NOTE: The reiser4 "tree" is a tree on disk, but its cached representation in ++ memory is extended with "znodes" with which we connect nodes with their left ++ and right neighbors using sibling pointers stored in the znodes. When we ++ perform balancing operations we often go from left to right and from right to ++ left. ++ ++ +-P1-+ +-P3-+ ++ |+--+| V1 |+--+| ++ ||N1|| -------> ||N3|| ++ |+--+| |+--+| ++ +----+ +----+ ++ ^ | ++ |V2 |V3 ++ | v ++ +---------P2---------+ ++ |+--+ +--+| ++ ||N2| -------- |N4|| ++ |+--+ +--+| ++ +--------------------+ ++ ++ We solve this by ensuring that only low priority processes lock in top to ++ bottom order and from right to left, and high priority processes lock from ++ bottom to top and left to right. ++ ++ ZAM-FIXME-HANS: order not just node locks in this way, order atom locks, and ++ kill those damn busy loops. ++ ANSWER(ZAM): atom locks (which are introduced by ASTAGE_CAPTURE_WAIT atom ++ stage) cannot be ordered that way. There are no rules what nodes can belong ++ to the atom and what nodes cannot. We cannot define what is right or left ++ direction, what is top or bottom. We can take immediate parent or side ++ neighbor of one node, but nobody guarantees that, say, left neighbor node is ++ not a far right neighbor for other nodes from the same atom. It breaks ++ deadlock avoidance rules and hi-low priority locking cannot be applied for ++ atom locks. ++ ++ How does it help to avoid deadlocks ? ++ ++ Suppose we have a deadlock with n processes. Processes from one priority ++ class never deadlock because they take locks in one consistent ++ order. ++ ++ So, any possible deadlock loop must have low priority as well as high ++ priority processes. There are no other lock priority levels except low and ++ high. We know that any deadlock loop contains at least one node locked by a ++ low priority process and requested by a high priority process. If this ++ situation is caught and resolved it is sufficient to avoid deadlocks. ++ ++ V4 DEADLOCK PREVENTION ALGORITHM IMPLEMENTATION. 
++ ++ The deadlock prevention algorithm is based on comparing ++ priorities of node owners (processes which keep znode locked) and ++ requesters (processes which want to acquire a lock on znode). We ++ implement a scheme where low-priority owners yield locks to ++ high-priority requesters. We created a signal passing system that ++ is used to ask low-priority processes to yield one or more locked ++ znodes. ++ ++ The condition when a znode needs to change its owners is described by the ++ following formula: ++ ++ ############################################# ++ # # ++ # (number of high-priority requesters) > 0 # ++ # AND # ++ # (numbers of high-priority owners) == 0 # ++ # # ++ ############################################# ++ ++ Note that a low-priority process delays node releasing if another ++ high-priority process owns this node. So, slightly more strictly speaking, ++ to have a deadlock capable cycle you must have a loop in which a high ++ priority process is waiting on a low priority process to yield a node, which ++ is slightly different from saying a high priority process is waiting on a ++ node owned by a low priority process. ++ ++ It is enough to avoid deadlocks if we prevent any low-priority process from ++ falling asleep if its locked set contains a node which satisfies the ++ deadlock condition. ++ ++ That condition is implicitly or explicitly checked in all places where new ++ high-priority requests may be added or removed from node request queue or ++ high-priority process takes or releases a lock on node. The main ++ goal of these checks is to never lose the moment when node becomes "has ++ wrong owners" and send "must-yield-this-lock" signals to its low-pri owners ++ at that time. ++ ++ The information about received signals is stored in the per-process ++ structure (lock stack) and analyzed before a low-priority process goes to ++ sleep but after a "fast" attempt to lock a node fails. Any signal wakes ++ sleeping process up and forces him to re-check lock status and received ++ signal info. If "must-yield-this-lock" signals were received the locking ++ primitive (longterm_lock_znode()) fails with -E_DEADLOCK error code. ++ ++ V4 LOCKING DRAWBACKS ++ ++ If we have already balanced on one level, and we are propagating our changes ++ upward to a higher level, it could be very messy to surrender all locks on ++ the lower level because we put so much computational work into it, and ++ reverting them to their state before they were locked might be very complex. ++ We also don't want to acquire all locks before performing balancing because ++ that would either be almost as much work as the balancing, or it would be ++ too conservative and lock too much. We want balancing to be done only at ++ high priority. Yet, we might want to go to the left one node and use some ++ of its empty space... So we make one attempt at getting the node to the left ++ using try_lock, and if it fails we do without it, because we didn't really ++ need it, it was only a nice to have. ++ ++ LOCK STRUCTURES DESCRIPTION ++ ++ The following data structures are used in the reiser4 locking ++ implementation: ++ ++ All fields related to long-term locking are stored in znode->lock. ++ ++ The lock stack is a per thread object. It owns all znodes locked by the ++ thread. One znode may be locked by several threads in case of read lock or ++ one znode may be write locked by one thread several times. The special link ++ objects (lock handles) support n<->m relation between znodes and lock ++ owners. 
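++
++   (In the picture below, for example, lock stacks LS1 and LS2 both lock
++   znode Z2, each through its own lock handle: LH2 and LH3 respectively.)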
++ ++ ++ ++ +---------+ +---------+ ++ | LS1 | | LS2 | ++ +---------+ +---------+ ++ ^ ^ ++ |---------------+ +----------+ ++ v v v v ++ +---------+ +---------+ +---------+ +---------+ ++ | LH1 | | LH2 | | LH3 | | LH4 | ++ +---------+ +---------+ +---------+ +---------+ ++ ^ ^ ^ ^ ++ | +------------+ | ++ v v v ++ +---------+ +---------+ +---------+ ++ | Z1 | | Z2 | | Z3 | ++ +---------+ +---------+ +---------+ ++ ++ Thread 1 locked znodes Z1 and Z2, thread 2 locked znodes Z2 and Z3. The ++ picture above shows that lock stack LS1 has a list of 2 lock handles LH1 and ++ LH2, lock stack LS2 has a list with lock handles LH3 and LH4 on it. Znode ++ Z1 is locked by only one thread, znode has only one lock handle LH1 on its ++ list, similar situation is for Z3 which is locked by the thread 2 only. Z2 ++ is locked (for read) twice by different threads and two lock handles are on ++ its list. Each lock handle represents a single relation of a locking of a ++ znode by a thread. Locking of a znode is an establishing of a locking ++ relation between the lock stack and the znode by adding of a new lock handle ++ to a list of lock handles, the lock stack. The lock stack links all lock ++ handles for all znodes locked by the lock stack. The znode list groups all ++ lock handles for all locks stacks which locked the znode. ++ ++ Yet another relation may exist between znode and lock owners. If lock ++ procedure cannot immediately take lock on an object it adds the lock owner ++ on special `requestors' list belongs to znode. That list represents a ++ queue of pending lock requests. Because one lock owner may request only ++ only one lock object at a time, it is a 1->n relation between lock objects ++ and a lock owner implemented as it is described above. Full information ++ (priority, pointers to lock and link objects) about each lock request is ++ stored in lock owner structure in `request' field. ++ ++ SHORT_TERM LOCKING ++ ++ This is a list of primitive operations over lock stacks / lock handles / ++ znodes and locking descriptions for them. ++ ++ 1. locking / unlocking which is done by two list insertion/deletion, one ++ to/from znode's list of lock handles, another one is to/from lock stack's ++ list of lock handles. The first insertion is protected by ++ znode->lock.guard spinlock. The list owned by the lock stack can be ++ modified only by thread who owns the lock stack and nobody else can ++ modify/read it. There is nothing to be protected by a spinlock or ++ something else. ++ ++ 2. adding/removing a lock request to/from znode requesters list. The rule is ++ that znode->lock.guard spinlock should be taken for this. ++ ++ 3. we can traverse list of lock handles and use references to lock stacks who ++ locked given znode if znode->lock.guard spinlock is taken. ++ ++ 4. If a lock stack is associated with a znode as a lock requestor or lock ++ owner its existence is guaranteed by znode->lock.guard spinlock. Some its ++ (lock stack's) fields should be protected from being accessed in parallel ++ by two or more threads. Please look at lock_stack structure definition ++ for the info how those fields are protected. */ ++ ++/* Znode lock and capturing intertwining. */ ++/* In current implementation we capture formatted nodes before locking ++ them. Take a look on longterm lock znode, try_capture() request precedes ++ locking requests. The longterm_lock_znode function unconditionally captures ++ znode before even checking of locking conditions. ++ ++ Another variant is to capture znode after locking it. 
It was not tested, but ++ at least one deadlock condition is supposed to be there. One thread has ++ locked a znode (Node-1) and calls try_capture() for it. Try_capture() sleeps ++ because znode's atom has CAPTURE_WAIT state. Second thread is a flushing ++ thread, its current atom is the atom Node-1 belongs to. Second thread wants ++ to lock Node-1 and sleeps because Node-1 is locked by the first thread. The ++ described situation is a deadlock. */ ++ ++#include "debug.h" ++#include "txnmgr.h" ++#include "znode.h" ++#include "jnode.h" ++#include "tree.h" ++#include "plugin/node/node.h" ++#include "super.h" ++ ++#include ++ ++#if REISER4_DEBUG ++static int request_is_deadlock_safe(znode *, znode_lock_mode, ++ znode_lock_request); ++#endif ++ ++/* Returns a lock owner associated with current thread */ ++lock_stack *get_current_lock_stack(void) ++{ ++ return &get_current_context()->stack; ++} ++ ++/* Wakes up all low priority owners informing them about possible deadlock */ ++static void wake_up_all_lopri_owners(znode * node) ++{ ++ lock_handle *handle; ++ ++ assert_spin_locked(&(node->lock.guard)); ++ list_for_each_entry(handle, &node->lock.owners, owners_link) { ++ assert("nikita-1832", handle->node == node); ++ /* count this signal in owner->nr_signaled */ ++ if (!handle->signaled) { ++ handle->signaled = 1; ++ atomic_inc(&handle->owner->nr_signaled); ++ /* Wake up a single process */ ++ reiser4_wake_up(handle->owner); ++ } ++ } ++} ++ ++/* Adds a lock to a lock owner, which means creating a link to the lock and ++ putting the link into the two lists all links are on (the doubly linked list ++ that forms the lock_stack, and the doubly linked list of links attached ++ to a lock. ++*/ ++static inline void ++link_object(lock_handle * handle, lock_stack * owner, znode * node) ++{ ++ assert("jmacd-810", handle->owner == NULL); ++ assert_spin_locked(&(node->lock.guard)); ++ ++ handle->owner = owner; ++ handle->node = node; ++ ++ assert("reiser4-4", ++ ergo(list_empty_careful(&owner->locks), owner->nr_locks == 0)); ++ ++ /* add lock handle to the end of lock_stack's list of locks */ ++ list_add_tail(&handle->locks_link, &owner->locks); ++ ON_DEBUG(owner->nr_locks++); ++ set_gfp_mask(); ++ ++ /* add lock handle to the head of znode's list of owners */ ++ list_add(&handle->owners_link, &node->lock.owners); ++ handle->signaled = 0; ++} ++ ++/* Breaks a relation between a lock and its owner */ ++static inline void unlink_object(lock_handle * handle) ++{ ++ assert("zam-354", handle->owner != NULL); ++ assert("nikita-1608", handle->node != NULL); ++ assert_spin_locked(&(handle->node->lock.guard)); ++ assert("nikita-1829", handle->owner == get_current_lock_stack()); ++ assert("reiser4-5", handle->owner->nr_locks > 0); ++ ++ /* remove lock handle from lock_stack's list of locks */ ++ list_del(&handle->locks_link); ++ ON_DEBUG(handle->owner->nr_locks--); ++ set_gfp_mask(); ++ assert("reiser4-6", ++ ergo(list_empty_careful(&handle->owner->locks), ++ handle->owner->nr_locks == 0)); ++ /* remove lock handle from znode's list of owners */ ++ list_del(&handle->owners_link); ++ /* indicates that lock handle is free now */ ++ handle->node = NULL; ++#if REISER4_DEBUG ++ INIT_LIST_HEAD(&handle->locks_link); ++ INIT_LIST_HEAD(&handle->owners_link); ++ handle->owner = NULL; ++#endif ++} ++ ++/* Actually locks an object knowing that we are able to do this */ ++static void lock_object(lock_stack * owner) ++{ ++ lock_request *request; ++ znode *node; ++ ++ request = &owner->request; ++ node = request->node; ++ 
assert_spin_locked(&(node->lock.guard)); ++ if (request->mode == ZNODE_READ_LOCK) { ++ node->lock.nr_readers++; ++ } else { ++ /* check that we don't switched from read to write lock */ ++ assert("nikita-1840", node->lock.nr_readers <= 0); ++ /* We allow recursive locking; a node can be locked several ++ times for write by same process */ ++ node->lock.nr_readers--; ++ } ++ ++ link_object(request->handle, owner, node); ++ ++ if (owner->curpri) { ++ node->lock.nr_hipri_owners++; ++ } ++} ++ ++/* Check for recursive write locking */ ++static int recursive(lock_stack * owner) ++{ ++ int ret; ++ znode *node; ++ lock_handle *lh; ++ ++ node = owner->request.node; ++ ++ /* Owners list is not empty for a locked node */ ++ assert("zam-314", !list_empty_careful(&node->lock.owners)); ++ assert("nikita-1841", owner == get_current_lock_stack()); ++ assert_spin_locked(&(node->lock.guard)); ++ ++ ++ lh = list_entry(node->lock.owners.next, lock_handle, owners_link); ++ ret = (lh->owner == owner); ++ ++ /* Recursive read locking should be done usual way */ ++ assert("zam-315", !ret || owner->request.mode == ZNODE_WRITE_LOCK); ++ /* mixing of read/write locks is not allowed */ ++ assert("zam-341", !ret || znode_is_wlocked(node)); ++ ++ return ret; ++} ++ ++#if REISER4_DEBUG ++/* Returns true if the lock is held by the calling thread. */ ++int znode_is_any_locked(const znode * node) ++{ ++ lock_handle *handle; ++ lock_stack *stack; ++ int ret; ++ ++ if (!znode_is_locked(node)) { ++ return 0; ++ } ++ ++ stack = get_current_lock_stack(); ++ ++ spin_lock_stack(stack); ++ ++ ret = 0; ++ ++ list_for_each_entry(handle, &stack->locks, locks_link) { ++ if (handle->node == node) { ++ ret = 1; ++ break; ++ } ++ } ++ ++ spin_unlock_stack(stack); ++ ++ return ret; ++} ++ ++#endif ++ ++/* Returns true if a write lock is held by the calling thread. */ ++int znode_is_write_locked(const znode * node) ++{ ++ lock_stack *stack; ++ lock_handle *handle; ++ ++ assert("jmacd-8765", node != NULL); ++ ++ if (!znode_is_wlocked(node)) { ++ return 0; ++ } ++ ++ stack = get_current_lock_stack(); ++ ++ /* ++ * When znode is write locked, all owner handles point to the same lock ++ * stack. Get pointer to lock stack from the first lock handle from ++ * znode's owner list ++ */ ++ handle = list_entry(node->lock.owners.next, lock_handle, owners_link); ++ ++ return (handle->owner == stack); ++} ++ ++/* This "deadlock" condition is the essential part of reiser4 locking ++ implementation. This condition is checked explicitly by calling ++ check_deadlock_condition() or implicitly in all places where znode lock ++ state (set of owners and request queue) is changed. Locking code is ++ designed to use this condition to trigger procedure of passing object from ++ low priority owner(s) to high priority one(s). ++ ++ The procedure results in passing an event (setting lock_handle->signaled ++ flag) and counting this event in nr_signaled field of owner's lock stack ++ object and wakeup owner's process. 
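++
++   For illustration (a hypothetical scenario): a low-priority thread L
++   holds a read lock on node N when a high-priority thread H requests a
++   write lock on N.  H increments N->lock.nr_hipri_requests; since
++   N->lock.nr_hipri_owners is zero, the condition becomes true, L's lock
++   handle is marked ->signaled, L's nr_signaled counter is incremented and
++   L is woken up.  L then gets -E_DEADLOCK from prepare_to_sleep(), keeps
++   releasing locks while check_deadlock() returns true, and retries,
++   letting H proceed.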
++*/ ++static inline int check_deadlock_condition(znode * node) ++{ ++ assert_spin_locked(&(node->lock.guard)); ++ return node->lock.nr_hipri_requests > 0 ++ && node->lock.nr_hipri_owners == 0; ++} ++ ++static int check_livelock_condition(znode * node, znode_lock_mode mode) ++{ ++ zlock * lock = &node->lock; ++ ++ return mode == ZNODE_READ_LOCK && ++ lock -> nr_readers >= 0 && lock->nr_hipri_write_requests > 0; ++} ++ ++/* checks lock/request compatibility */ ++static int can_lock_object(lock_stack * owner) ++{ ++ znode *node = owner->request.node; ++ ++ assert_spin_locked(&(node->lock.guard)); ++ ++ /* See if the node is disconnected. */ ++ if (unlikely(ZF_ISSET(node, JNODE_IS_DYING))) ++ return RETERR(-EINVAL); ++ ++ /* Do not ever try to take a lock if we are going in low priority ++ direction and a node have a high priority request without high ++ priority owners. */ ++ if (unlikely(!owner->curpri && check_deadlock_condition(node))) ++ return RETERR(-E_REPEAT); ++ if (unlikely(owner->curpri && check_livelock_condition(node, owner->request.mode))) ++ return RETERR(-E_REPEAT); ++ if (unlikely(!is_lock_compatible(node, owner->request.mode))) ++ return RETERR(-E_REPEAT); ++ return 0; ++} ++ ++/* Setting of a high priority to the process. It clears "signaled" flags ++ because znode locked by high-priority process can't satisfy our "deadlock ++ condition". */ ++static void set_high_priority(lock_stack * owner) ++{ ++ assert("nikita-1846", owner == get_current_lock_stack()); ++ /* Do nothing if current priority is already high */ ++ if (!owner->curpri) { ++ /* We don't need locking for owner->locks list, because, this ++ * function is only called with the lock stack of the current ++ * thread, and no other thread can play with owner->locks list ++ * and/or change ->node pointers of lock handles in this list. ++ * ++ * (Interrupts also are not involved.) ++ */ ++ lock_handle *item = list_entry(owner->locks.next, lock_handle, locks_link); ++ while (&owner->locks != &item->locks_link) { ++ znode *node = item->node; ++ ++ spin_lock_zlock(&node->lock); ++ ++ node->lock.nr_hipri_owners++; ++ ++ /* we can safely set signaled to zero, because ++ previous statement (nr_hipri_owners ++) guarantees ++ that signaled will be never set again. */ ++ item->signaled = 0; ++ spin_unlock_zlock(&node->lock); ++ ++ item = list_entry(item->locks_link.next, lock_handle, locks_link); ++ } ++ owner->curpri = 1; ++ atomic_set(&owner->nr_signaled, 0); ++ } ++} ++ ++/* Sets a low priority to the process. */ ++static void set_low_priority(lock_stack * owner) ++{ ++ assert("nikita-3075", owner == get_current_lock_stack()); ++ /* Do nothing if current priority is already low */ ++ if (owner->curpri) { ++ /* scan all locks (lock handles) held by @owner, which is ++ actually current thread, and check whether we are reaching ++ deadlock possibility anywhere. ++ */ ++ lock_handle *handle = list_entry(owner->locks.next, lock_handle, locks_link); ++ while (&owner->locks != &handle->locks_link) { ++ znode *node = handle->node; ++ spin_lock_zlock(&node->lock); ++ /* this thread just was hipri owner of @node, so ++ nr_hipri_owners has to be greater than zero. */ ++ assert("nikita-1835", node->lock.nr_hipri_owners > 0); ++ node->lock.nr_hipri_owners--; ++ /* If we have deadlock condition, adjust a nr_signaled ++ field. It is enough to set "signaled" flag only for ++ current process, other low-pri owners will be ++ signaled and waken up after current process unlocks ++ this object and any high-priority requestor takes ++ control. 
*/ ++ if (check_deadlock_condition(node) ++ && !handle->signaled) { ++ handle->signaled = 1; ++ atomic_inc(&owner->nr_signaled); ++ } ++ spin_unlock_zlock(&node->lock); ++ handle = list_entry(handle->locks_link.next, lock_handle, locks_link); ++ } ++ owner->curpri = 0; ++ } ++} ++ ++static void remove_lock_request(lock_stack * requestor) ++{ ++ zlock * lock = &requestor->request.node->lock; ++ ++ if (requestor->curpri) { ++ assert("nikita-1838", lock->nr_hipri_requests > 0); ++ lock->nr_hipri_requests--; ++ if (requestor->request.mode == ZNODE_WRITE_LOCK) ++ lock->nr_hipri_write_requests --; ++ } ++ list_del(&requestor->requestors_link); ++} ++ ++ ++static void invalidate_all_lock_requests(znode * node) ++{ ++ lock_stack *requestor, *tmp; ++ ++ assert_spin_locked(&(node->lock.guard)); ++ ++ list_for_each_entry_safe(requestor, tmp, &node->lock.requestors, requestors_link) { ++ remove_lock_request(requestor); ++ requestor->request.ret_code = -EINVAL; ++ reiser4_wake_up(requestor); ++ requestor->request.mode = ZNODE_NO_LOCK; ++ } ++} ++ ++static void dispatch_lock_requests(znode * node) ++{ ++ lock_stack *requestor, *tmp; ++ ++ assert_spin_locked(&(node->lock.guard)); ++ ++ list_for_each_entry_safe(requestor, tmp, &node->lock.requestors, requestors_link) { ++ if (znode_is_write_locked(node)) ++ break; ++ if (!can_lock_object(requestor)) { ++ lock_object(requestor); ++ remove_lock_request(requestor); ++ requestor->request.ret_code = 0; ++ reiser4_wake_up(requestor); ++ requestor->request.mode = ZNODE_NO_LOCK; ++ } ++ } ++} ++ ++/* release long-term lock, acquired by longterm_lock_znode() */ ++void longterm_unlock_znode(lock_handle * handle) ++{ ++ znode *node = handle->node; ++ lock_stack *oldowner = handle->owner; ++ int hipri; ++ int readers; ++ int rdelta; ++ int youdie; ++ ++ /* ++ * this is time-critical and highly optimized code. Modify carefully. ++ */ ++ ++ assert("jmacd-1021", handle != NULL); ++ assert("jmacd-1022", handle->owner != NULL); ++ assert("nikita-1392", LOCK_CNT_GTZ(long_term_locked_znode)); ++ ++ assert("zam-130", oldowner == get_current_lock_stack()); ++ ++ LOCK_CNT_DEC(long_term_locked_znode); ++ ++ /* ++ * to minimize amount of operations performed under lock, pre-compute ++ * all variables used within critical section. This makes code ++ * obscure. ++ */ ++ ++ /* was this lock of hi or lo priority */ ++ hipri = oldowner->curpri ? -1 : 0; ++ /* number of readers */ ++ readers = node->lock.nr_readers; ++ /* +1 if write lock, -1 if read lock */ ++ rdelta = (readers > 0) ? -1 : +1; ++ /* true if node is to die and write lock is released */ ++ youdie = ZF_ISSET(node, JNODE_HEARD_BANSHEE) && (readers < 0); ++ ++ spin_lock_zlock(&node->lock); ++ ++ assert("zam-101", znode_is_locked(node)); ++ ++ /* Adjust a number of high priority owners of this lock */ ++ node->lock.nr_hipri_owners += hipri; ++ assert("nikita-1836", node->lock.nr_hipri_owners >= 0); ++ ++ /* Handle znode deallocation on last write-lock release. */ ++ if (znode_is_wlocked_once(node)) { ++ if (youdie) { ++ forget_znode(handle); ++ assert("nikita-2191", znode_invariant(node)); ++ zput(node); ++ return; ++ } ++ } ++ ++ if (handle->signaled) ++ atomic_dec(&oldowner->nr_signaled); ++ ++ /* Unlocking means owner<->object link deletion */ ++ unlink_object(handle); ++ ++ /* This is enough to be sure whether an object is completely ++ unlocked. */ ++ node->lock.nr_readers += rdelta; ++ ++ /* If the node is locked it must have an owners list. Likewise, if ++ the node is unlocked it must have an empty owners list. 
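++	   The equi() assertion below encodes exactly this if-and-only-if
++	   relation.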
*/ ++ assert("zam-319", equi(znode_is_locked(node), ++ !list_empty_careful(&node->lock.owners))); ++ ++#if REISER4_DEBUG ++ if (!znode_is_locked(node)) ++ ++node->times_locked; ++#endif ++ ++ /* If there are pending lock requests we wake up a requestor */ ++ if (!znode_is_wlocked(node)) ++ dispatch_lock_requests(node); ++ if (check_deadlock_condition(node)) ++ wake_up_all_lopri_owners(node); ++ spin_unlock_zlock(&node->lock); ++ ++ /* minus one reference from handle->node */ ++ assert("nikita-2190", znode_invariant(node)); ++ ON_DEBUG(check_lock_data()); ++ ON_DEBUG(check_lock_node_data(node)); ++ zput(node); ++} ++ ++/* final portion of longterm-lock */ ++static int ++lock_tail(lock_stack * owner, int ok, znode_lock_mode mode) ++{ ++ znode *node = owner->request.node; ++ ++ assert_spin_locked(&(node->lock.guard)); ++ ++ /* If we broke with (ok == 0) it means we can_lock, now do it. */ ++ if (ok == 0) { ++ lock_object(owner); ++ owner->request.mode = 0; ++ /* count a reference from lockhandle->node ++ ++ znode was already referenced at the entry to this function, ++ hence taking spin-lock here is not necessary (see comment ++ in the zref()). ++ */ ++ zref(node); ++ ++ LOCK_CNT_INC(long_term_locked_znode); ++ } ++ spin_unlock_zlock(&node->lock); ++ ON_DEBUG(check_lock_data()); ++ ON_DEBUG(check_lock_node_data(node)); ++ return ok; ++} ++ ++/* ++ * version of longterm_znode_lock() optimized for the most common case: read ++ * lock without any special flags. This is the kind of lock that any tree ++ * traversal takes on the root node of the tree, which is very frequent. ++ */ ++static int longterm_lock_tryfast(lock_stack * owner) ++{ ++ int result; ++ znode *node; ++ zlock *lock; ++ ++ node = owner->request.node; ++ lock = &node->lock; ++ ++ assert("nikita-3340", schedulable()); ++ assert("nikita-3341", request_is_deadlock_safe(node, ++ ZNODE_READ_LOCK, ++ ZNODE_LOCK_LOPRI)); ++ spin_lock_zlock(lock); ++ result = can_lock_object(owner); ++ spin_unlock_zlock(lock); ++ ++ if (likely(result != -EINVAL)) { ++ spin_lock_znode(node); ++ result = try_capture(ZJNODE(node), ZNODE_READ_LOCK, 0); ++ spin_unlock_znode(node); ++ spin_lock_zlock(lock); ++ if (unlikely(result != 0)) { ++ owner->request.mode = 0; ++ } else { ++ result = can_lock_object(owner); ++ if (unlikely(result == -E_REPEAT)) { ++ /* fall back to longterm_lock_znode() */ ++ spin_unlock_zlock(lock); ++ return 1; ++ } ++ } ++ return lock_tail(owner, result, ZNODE_READ_LOCK); ++ } else ++ return 1; ++} ++ ++/* locks given lock object */ ++int longterm_lock_znode( ++ /* local link object (allocated by lock owner thread, usually on its own ++ * stack) */ ++ lock_handle * handle, ++ /* znode we want to lock. */ ++ znode * node, ++ /* {ZNODE_READ_LOCK, ZNODE_WRITE_LOCK}; */ ++ znode_lock_mode mode, ++ /* {0, -EINVAL, -E_DEADLOCK}, see return codes description. */ ++ znode_lock_request request) { ++ int ret; ++ int hipri = (request & ZNODE_LOCK_HIPRI) != 0; ++ int non_blocking = 0; ++ int has_atom; ++ txn_capture cap_flags; ++ zlock *lock; ++ txn_handle *txnh; ++ tree_level level; ++ ++ /* Get current process context */ ++ lock_stack *owner = get_current_lock_stack(); ++ ++ /* Check that the lock handle is initialized and isn't already being ++ * used. 
*/ ++ assert("jmacd-808", handle->owner == NULL); ++ assert("nikita-3026", schedulable()); ++ assert("nikita-3219", request_is_deadlock_safe(node, mode, request)); ++ assert("zam-1056", atomic_read(&ZJNODE(node)->x_count) > 0); ++ /* long term locks are not allowed in the VM contexts (->writepage(), ++ * prune_{d,i}cache()). ++ * ++ * FIXME this doesn't work due to unused-dentry-with-unlinked-inode ++ * bug caused by d_splice_alias() only working for directories. ++ */ ++ assert("nikita-3547", 1 || ((current->flags & PF_MEMALLOC) == 0)); ++ assert ("zam-1055", mode != ZNODE_NO_LOCK); ++ ++ cap_flags = 0; ++ if (request & ZNODE_LOCK_NONBLOCK) { ++ cap_flags |= TXN_CAPTURE_NONBLOCKING; ++ non_blocking = 1; ++ } ++ ++ if (request & ZNODE_LOCK_DONT_FUSE) ++ cap_flags |= TXN_CAPTURE_DONT_FUSE; ++ ++ /* If we are changing our process priority we must adjust a number ++ of high priority owners for each znode that we already lock */ ++ if (hipri) { ++ set_high_priority(owner); ++ } else { ++ set_low_priority(owner); ++ } ++ ++ level = znode_get_level(node); ++ ++ /* Fill request structure with our values. */ ++ owner->request.mode = mode; ++ owner->request.handle = handle; ++ owner->request.node = node; ++ ++ txnh = get_current_context()->trans; ++ lock = &node->lock; ++ ++ if (mode == ZNODE_READ_LOCK && request == 0) { ++ ret = longterm_lock_tryfast(owner); ++ if (ret <= 0) ++ return ret; ++ } ++ ++ has_atom = (txnh->atom != NULL); ++ ++ /* Synchronize on node's zlock guard lock. */ ++ spin_lock_zlock(lock); ++ ++ if (znode_is_locked(node) && ++ mode == ZNODE_WRITE_LOCK && recursive(owner)) ++ return lock_tail(owner, 0, mode); ++ ++ for (;;) { ++ /* Check the lock's availability: if it is unavaiable we get ++ E_REPEAT, 0 indicates "can_lock", otherwise the node is ++ invalid. */ ++ ret = can_lock_object(owner); ++ ++ if (unlikely(ret == -EINVAL)) { ++ /* @node is dying. Leave it alone. */ ++ break; ++ } ++ ++ if (unlikely(ret == -E_REPEAT && non_blocking)) { ++ /* either locking of @node by the current thread will ++ * lead to the deadlock, or lock modes are ++ * incompatible. */ ++ break; ++ } ++ ++ assert("nikita-1844", (ret == 0) ++ || ((ret == -E_REPEAT) && !non_blocking)); ++ /* If we can get the lock... Try to capture first before ++ taking the lock. */ ++ ++ /* first handle commonest case where node and txnh are already ++ * in the same atom. */ ++ /* safe to do without taking locks, because: ++ * ++ * 1. read of aligned word is atomic with respect to writes to ++ * this word ++ * ++ * 2. false negatives are handled in try_capture(). ++ * ++ * 3. false positives are impossible. ++ * ++ * PROOF: left as an exercise to the curious reader. ++ * ++ * Just kidding. Here is one: ++ * ++ * At the time T0 txnh->atom is stored in txnh_atom. ++ * ++ * At the time T1 node->atom is stored in node_atom. ++ * ++ * At the time T2 we observe that ++ * ++ * txnh_atom != NULL && node_atom == txnh_atom. ++ * ++ * Imagine that at this moment we acquire node and txnh spin ++ * lock in this order. Suppose that under spin lock we have ++ * ++ * node->atom != txnh->atom, (S1) ++ * ++ * at the time T3. ++ * ++ * txnh->atom != NULL still, because txnh is open by the ++ * current thread. ++ * ++ * Suppose node->atom == NULL, that is, node was un-captured ++ * between T1, and T3. But un-capturing of formatted node is ++ * always preceded by the call to invalidate_lock(), which ++ * marks znode as JNODE_IS_DYING under zlock spin ++ * lock. Contradiction, because can_lock_object() above checks ++ * for JNODE_IS_DYING. 
Hence, node->atom != NULL at T3. ++ * ++ * Suppose that node->atom != node_atom, that is, atom, node ++ * belongs to was fused into another atom: node_atom was fused ++ * into node->atom. Atom of txnh was equal to node_atom at T2, ++ * which means that under spin lock, txnh->atom == node->atom, ++ * because txnh->atom can only follow fusion ++ * chain. Contradicts S1. ++ * ++ * The same for hypothesis txnh->atom != txnh_atom. Hence, ++ * node->atom == node_atom == txnh_atom == txnh->atom. Again ++ * contradicts S1. Hence S1 is false. QED. ++ * ++ */ ++ ++ if (likely(has_atom && ZJNODE(node)->atom == txnh->atom)) { ++ ; ++ } else { ++ /* ++ * unlock zlock spin lock here. It is possible for ++ * longterm_unlock_znode() to sneak in here, but there ++ * is no harm: invalidate_lock() will mark znode as ++ * JNODE_IS_DYING and this will be noted by ++ * can_lock_object() below. ++ */ ++ spin_unlock_zlock(lock); ++ spin_lock_znode(node); ++ ret = try_capture(ZJNODE(node), mode, cap_flags); ++ spin_unlock_znode(node); ++ spin_lock_zlock(lock); ++ if (unlikely(ret != 0)) { ++ /* In the failure case, the txnmgr releases ++ the znode's lock (or in some cases, it was ++ released a while ago). There's no need to ++ reacquire it so we should return here, ++ avoid releasing the lock. */ ++ owner->request.mode = 0; ++ break; ++ } ++ ++ /* Check the lock's availability again -- this is ++ because under some circumstances the capture code ++ has to release and reacquire the znode spinlock. */ ++ ret = can_lock_object(owner); ++ } ++ ++ /* This time, a return of (ret == 0) means we can lock, so we ++ should break out of the loop. */ ++ if (likely(ret != -E_REPEAT || non_blocking)) { ++ break; ++ } ++ ++ /* Lock is unavailable, we have to wait. */ ++ ++ /* By having semaphore initialization here we cannot lose ++ wakeup signal even if it comes after `nr_signaled' field ++ check. */ ++ ret = prepare_to_sleep(owner); ++ if (unlikely(ret != 0)) { ++ break; ++ } ++ ++ assert_spin_locked(&(node->lock.guard)); ++ if (hipri) { ++ /* If we are going in high priority direction then ++ increase high priority requests counter for the ++ node */ ++ lock->nr_hipri_requests++; ++ if (mode == ZNODE_WRITE_LOCK) ++ lock->nr_hipri_write_requests ++; ++ /* If there are no high priority owners for a node, ++ then immediately wake up low priority owners, so ++ they can detect possible deadlock */ ++ if (lock->nr_hipri_owners == 0) ++ wake_up_all_lopri_owners(node); ++ } ++ list_add_tail(&owner->requestors_link, &lock->requestors); ++ ++ /* Ok, here we have prepared a lock request, so unlock ++ a znode ... */ ++ spin_unlock_zlock(lock); ++ /* ... and sleep */ ++ go_to_sleep(owner); ++ if (owner->request.mode == ZNODE_NO_LOCK) ++ goto request_is_done; ++ spin_lock_zlock(lock); ++ if (owner->request.mode == ZNODE_NO_LOCK) { ++ spin_unlock_zlock(lock); ++ request_is_done: ++ if (owner->request.ret_code == 0) { ++ LOCK_CNT_INC(long_term_locked_znode); ++ zref(node); ++ } ++ return owner->request.ret_code; ++ } ++ remove_lock_request(owner); ++ } ++ ++ return lock_tail(owner, ret, mode); ++} ++ ++/* lock object invalidation means changing of lock object state to `INVALID' ++ and waiting for all other processes to cancel theirs lock requests. */ ++void invalidate_lock(lock_handle * handle /* path to lock ++ * owner and lock ++ * object is being ++ * invalidated. 
*/ ) ++{ ++ znode *node = handle->node; ++ lock_stack *owner = handle->owner; ++ ++ assert("zam-325", owner == get_current_lock_stack()); ++ assert("zam-103", znode_is_write_locked(node)); ++ assert("nikita-1393", !ZF_ISSET(node, JNODE_LEFT_CONNECTED)); ++ assert("nikita-1793", !ZF_ISSET(node, JNODE_RIGHT_CONNECTED)); ++ assert("nikita-1394", ZF_ISSET(node, JNODE_HEARD_BANSHEE)); ++ assert("nikita-3097", znode_is_wlocked_once(node)); ++ assert_spin_locked(&(node->lock.guard)); ++ ++ if (handle->signaled) ++ atomic_dec(&owner->nr_signaled); ++ ++ ZF_SET(node, JNODE_IS_DYING); ++ unlink_object(handle); ++ node->lock.nr_readers = 0; ++ ++ invalidate_all_lock_requests(node); ++ spin_unlock_zlock(&node->lock); ++} ++ ++/* Initializes lock_stack. */ ++void init_lock_stack(lock_stack * owner /* pointer to ++ * allocated ++ * structure. */ ) ++{ ++ INIT_LIST_HEAD(&owner->locks); ++ INIT_LIST_HEAD(&owner->requestors_link); ++ spin_lock_init(&owner->sguard); ++ owner->curpri = 1; ++ sema_init(&owner->sema, 0); ++} ++ ++/* Initializes lock object. */ ++void reiser4_init_lock(zlock * lock /* pointer on allocated ++ * uninitialized lock object ++ * structure. */ ) ++{ ++ memset(lock, 0, sizeof(zlock)); ++ spin_lock_init(&lock->guard); ++ INIT_LIST_HEAD(&lock->requestors); ++ INIT_LIST_HEAD(&lock->owners); ++} ++ ++/* Transfer a lock handle (presumably so that variables can be moved between stack and ++ heap locations). */ ++static void ++move_lh_internal(lock_handle * new, lock_handle * old, int unlink_old) ++{ ++ znode *node = old->node; ++ lock_stack *owner = old->owner; ++ int signaled; ++ ++ /* locks_list, modified by link_object() is not protected by ++ anything. This is valid because only current thread ever modifies ++ locks_list of its lock_stack. ++ */ ++ assert("nikita-1827", owner == get_current_lock_stack()); ++ assert("nikita-1831", new->owner == NULL); ++ ++ spin_lock_zlock(&node->lock); ++ ++ signaled = old->signaled; ++ if (unlink_old) { ++ unlink_object(old); ++ } else { ++ if (node->lock.nr_readers > 0) { ++ node->lock.nr_readers += 1; ++ } else { ++ node->lock.nr_readers -= 1; ++ } ++ if (signaled) { ++ atomic_inc(&owner->nr_signaled); ++ } ++ if (owner->curpri) { ++ node->lock.nr_hipri_owners += 1; ++ } ++ LOCK_CNT_INC(long_term_locked_znode); ++ ++ zref(node); ++ } ++ link_object(new, owner, node); ++ new->signaled = signaled; ++ ++ spin_unlock_zlock(&node->lock); ++} ++ ++void move_lh(lock_handle * new, lock_handle * old) ++{ ++ move_lh_internal(new, old, /*unlink_old */ 1); ++} ++ ++void copy_lh(lock_handle * new, lock_handle * old) ++{ ++ move_lh_internal(new, old, /*unlink_old */ 0); ++} ++ ++/* after getting -E_DEADLOCK we unlock znodes until this function returns false */ ++int check_deadlock(void) ++{ ++ lock_stack *owner = get_current_lock_stack(); ++ return atomic_read(&owner->nr_signaled) != 0; ++} ++ ++/* Before going to sleep we re-check "release lock" requests which might come from threads with hi-pri lock ++ priorities. */ ++int prepare_to_sleep(lock_stack * owner) ++{ ++ assert("nikita-1847", owner == get_current_lock_stack()); ++ /* NOTE(Zam): We cannot reset the lock semaphore here because it may ++ clear wake-up signal. The initial design was to re-check all ++ conditions under which we continue locking, release locks or sleep ++ until conditions are changed. However, even lock.c does not follow ++ that design. So, wake-up signal which is stored in semaphore state ++ could we loosen by semaphore reset. 
The less complex scheme without ++ resetting the semaphore is enough to not to loose wake-ups. ++ ++ if (0) { ++ ++ NOTE-NIKITA: I commented call to sema_init() out hoping ++ that it is the reason or thread sleeping in ++ down(&owner->sema) without any other thread running. ++ ++ Anyway, it is just an optimization: is semaphore is not ++ reinitialised at this point, in the worst case ++ longterm_lock_znode() would have to iterate its loop once ++ more. ++ spin_lock_stack(owner); ++ sema_init(&owner->sema, 0); ++ spin_unlock_stack(owner); ++ } ++ */ ++ ++ /* We return -E_DEADLOCK if one or more "give me the lock" messages are ++ * counted in nr_signaled */ ++ if (unlikely(atomic_read(&owner->nr_signaled) != 0)) { ++ assert("zam-959", !owner->curpri); ++ return RETERR(-E_DEADLOCK); ++ } ++ return 0; ++} ++ ++/* Wakes up a single thread */ ++void __reiser4_wake_up(lock_stack * owner) ++{ ++ up(&owner->sema); ++} ++ ++/* Puts a thread to sleep */ ++void go_to_sleep(lock_stack * owner) ++{ ++ /* Well, we might sleep here, so holding of any spinlocks is no-no */ ++ assert("nikita-3027", schedulable()); ++ /* return down_interruptible(&owner->sema); */ ++ down(&owner->sema); ++} ++ ++int lock_stack_isclean(lock_stack * owner) ++{ ++ if (list_empty_careful(&owner->locks)) { ++ assert("zam-353", atomic_read(&owner->nr_signaled) == 0); ++ return 1; ++ } ++ ++ return 0; ++} ++ ++#if REISER4_DEBUG ++ ++/* ++ * debugging functions ++ */ ++ ++static void list_check(struct list_head *head) ++{ ++ struct list_head *pos; ++ ++ list_for_each(pos, head) ++ assert("", (pos->prev != NULL && pos->next != NULL && ++ pos->prev->next == pos && pos->next->prev == pos)); ++} ++ ++/* check consistency of locking data-structures hanging of the @stack */ ++static void check_lock_stack(lock_stack * stack) ++{ ++ spin_lock_stack(stack); ++ /* check that stack->locks is not corrupted */ ++ list_check(&stack->locks); ++ spin_unlock_stack(stack); ++} ++ ++/* check consistency of locking data structures */ ++void check_lock_data(void) ++{ ++ check_lock_stack(&get_current_context()->stack); ++} ++ ++/* check consistency of locking data structures for @node */ ++void check_lock_node_data(znode * node) ++{ ++ spin_lock_zlock(&node->lock); ++ list_check(&node->lock.owners); ++ list_check(&node->lock.requestors); ++ spin_unlock_zlock(&node->lock); ++} ++ ++/* check that given lock request is dead lock safe. This check is, of course, ++ * not exhaustive. */ ++static int ++request_is_deadlock_safe(znode * node, znode_lock_mode mode, ++ znode_lock_request request) ++{ ++ lock_stack *owner; ++ ++ owner = get_current_lock_stack(); ++ /* ++ * check that hipri lock request is not issued when there are locked ++ * nodes at the higher levels. ++ */ ++ if (request & ZNODE_LOCK_HIPRI && !(request & ZNODE_LOCK_NONBLOCK) && ++ znode_get_level(node) != 0) { ++ lock_handle *item; ++ ++ list_for_each_entry(item, &owner->locks, locks_link) { ++ znode *other; ++ ++ other = item->node; ++ ++ if (znode_get_level(other) == 0) ++ continue; ++ if (znode_get_level(other) > znode_get_level(node)) ++ return 0; ++ } ++ } ++ return 1; ++} ++ ++#endif ++ ++/* return pointer to static storage with name of lock_mode. For ++ debugging */ ++const char *lock_mode_name(znode_lock_mode lock /* lock mode to get name of */ ) ++{ ++ if (lock == ZNODE_READ_LOCK) ++ return "read"; ++ else if (lock == ZNODE_WRITE_LOCK) ++ return "write"; ++ else { ++ static char buf[30]; ++ ++ sprintf(buf, "unknown: %i", lock); ++ return buf; ++ } ++} ++ ++/* Make Linus happy. 
++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 79 ++ End: ++*/ +Index: linux-2.6.16/fs/reiser4/lock.h +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/lock.h +@@ -0,0 +1,272 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++/* Long term locking data structures. See lock.c for details. */ ++ ++#ifndef __LOCK_H__ ++#define __LOCK_H__ ++ ++#include "forward.h" ++#include "debug.h" ++#include "dformat.h" ++#include "key.h" ++#include "coord.h" ++#include "plugin/node/node.h" ++#include "txnmgr.h" ++#include "readahead.h" ++ ++#include ++#include ++#include /* for PAGE_CACHE_SIZE */ ++#include ++#include ++ ++/* Per-znode lock object */ ++struct zlock { ++ spinlock_t guard; ++ /* The number of readers if positive; the number of recursively taken ++ write locks if negative. Protected by zlock spin lock. */ ++ int nr_readers; ++ /* A number of processes (lock_stacks) that have this object ++ locked with high priority */ ++ unsigned nr_hipri_owners; ++ /* A number of attempts to lock znode in high priority direction */ ++ unsigned nr_hipri_requests; ++ /* A linked list of lock_handle objects that contains pointers ++ for all lock_stacks which have this lock object locked */ ++ unsigned nr_hipri_write_requests; ++ struct list_head owners; ++ /* A linked list of lock_stacks that wait for this lock */ ++ struct list_head requestors; ++}; ++ ++static inline void spin_lock_zlock(zlock *lock) ++{ ++ /* check that zlock is not locked */ ++ assert("", LOCK_CNT_NIL(spin_locked_zlock)); ++ /* check that spinlocks of lower priorities are not held */ ++ assert("", LOCK_CNT_NIL(spin_locked_stack)); ++ ++ spin_lock(&lock->guard); ++ ++ LOCK_CNT_INC(spin_locked_zlock); ++ LOCK_CNT_INC(spin_locked); ++} ++ ++static inline void spin_unlock_zlock(zlock *lock) ++{ ++ assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_zlock)); ++ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked)); ++ ++ LOCK_CNT_DEC(spin_locked_zlock); ++ LOCK_CNT_DEC(spin_locked); ++ ++ spin_unlock(&lock->guard); ++} ++ ++#define lock_is_locked(lock) ((lock)->nr_readers != 0) ++#define lock_is_rlocked(lock) ((lock)->nr_readers > 0) ++#define lock_is_wlocked(lock) ((lock)->nr_readers < 0) ++#define lock_is_wlocked_once(lock) ((lock)->nr_readers == -1) ++#define lock_can_be_rlocked(lock) ((lock)->nr_readers >=0) ++#define lock_mode_compatible(lock, mode) \ ++ (((mode) == ZNODE_WRITE_LOCK && !lock_is_locked(lock)) || \ ++ ((mode) == ZNODE_READ_LOCK && lock_can_be_rlocked(lock))) ++ ++/* Since we have R/W znode locks we need additional bidirectional `link' ++ objects to implement n<->m relationship between lock owners and lock ++ objects. We call them `lock handles'. ++ ++ Locking: see lock.c/"SHORT-TERM LOCKING" ++*/ ++struct lock_handle { ++ /* This flag indicates that a signal to yield a lock was passed to ++ lock owner and counted in owner->nr_signalled ++ ++ Locking: this is accessed under spin lock on ->node. 
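++
++	   It is set by wake_up_all_lopri_owners() and set_low_priority(),
++	   cleared again by set_high_priority(); link_object() starts every
++	   handle out unsignaled.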
++	 */
++	int signaled;
++	/* A link to owner of a lock */
++	lock_stack *owner;
++	/* A link to znode locked */
++	znode *node;
++	/* A list of all locks for a process */
++	struct list_head locks_link;
++	/* A list of all owners for a znode */
++	struct list_head owners_link;
++};
++
++typedef struct lock_request {
++	/* A pointer to uninitialized link object */
++	lock_handle *handle;
++	/* A pointer to the object we want to lock */
++	znode *node;
++	/* Lock mode (ZNODE_READ_LOCK or ZNODE_WRITE_LOCK) */
++	znode_lock_mode mode;
++	/* how dispatch_lock_requests() returns lock request result code */
++	int ret_code;
++} lock_request;
++
++/* A lock stack structure for accumulating locks owned by a process */
++struct lock_stack {
++	/* A guard lock protecting a lock stack */
++	spinlock_t sguard;
++	/* number of znodes which were requested by high priority processes */
++	atomic_t nr_signaled;
++	/* Current priority of a process
++
++	   This is only accessed by the current thread and thus requires no
++	   locking.
++	 */
++	int curpri;
++	/* A list of all locks owned by this process. Elements can be added to
++	 * this list only by the current thread. ->node pointers in this list
++	 * can be only changed by the current thread. */
++	struct list_head locks;
++	/* When lock_stack waits for the lock, it puts itself on double-linked
++	   requestors list of that lock */
++	struct list_head requestors_link;
++	/* Current lock request info.
++
++	   This is only accessed by the current thread and thus requires no
++	   locking.
++	 */
++	lock_request request;
++	/* The lock_stack's synchronization object, used when the process
++	   sleeps because a lock it wishes to add to this lock_stack is not
++	   immediately available.  It is used instead of a wait_queue_t
++	   object due to locking problems (lost wake up): a "lost wakeup"
++	   occurs when a process is woken up before it actually goes to
++	   sleep (through sleep_on()).  Using a semaphore object is the
++	   simplest way to avoid that problem.
++
++	   A semaphore is used in the following way: only the process that
++	   is the owner of the lock_stack initializes it (to zero) and calls
++	   down(sema) on it.  Usually this causes the process to sleep on
++	   the semaphore.  Other processes may wake it up by calling
++	   up(sema).  The advantage of a semaphore is that up() and down()
++	   calls are not required to preserve order; unlike a wait_queue it
++	   works even when the process is woken up before getting to sleep.
++
++	   NOTE-NIKITA: Transaction manager is going to have condition
++	   variables (&kcondvar_t) anyway, so this probably will be replaced
++	   with one in the future.
++
++	   After further discussion, Nikita has shown me that Zam's
++	   implementation is exactly a condition variable.  The znode's
++	   {zguard,requestors_list} represents the condition variable and
++	   the lock_stack's {sguard,semaphore} guards entry and exit from
++	   the condition variable's wait queue.  But the existing code can't
++	   just be replaced with a more general abstraction, and I think
++	   it's fine the way it is.
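++
++	   A compressed sketch of the handshake (illustrative only; all the
++	   functions named here are defined in lock.c):
++
++		/* waiter, in longterm_lock_znode() */
++		ret = prepare_to_sleep(owner);
++		list_add_tail(&owner->requestors_link, &lock->requestors);
++		spin_unlock_zlock(lock);
++		go_to_sleep(owner);	/* down(&owner->sema) */
++
++		/* waker */
++		reiser4_wake_up(owner);	/* up(&owner->sema), taken under
++					   owner->sguard */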
*/ ++ struct semaphore sema; ++#if REISER4_DEBUG ++ int nr_locks; /* number of lock handles in the above list */ ++#endif ++}; ++ ++ ++/* ++ User-visible znode locking functions ++*/ ++ ++extern int longterm_lock_znode(lock_handle * handle, ++ znode * node, ++ znode_lock_mode mode, ++ znode_lock_request request); ++ ++extern void longterm_unlock_znode(lock_handle * handle); ++ ++extern int check_deadlock(void); ++ ++extern lock_stack *get_current_lock_stack(void); ++ ++extern void init_lock_stack(lock_stack * owner); ++extern void reiser4_init_lock(zlock * lock); ++ ++static inline void init_lh(lock_handle *lh) ++{ ++#if REISER4_DEBUG ++ memset(lh, 0, sizeof *lh); ++ INIT_LIST_HEAD(&lh->locks_link); ++ INIT_LIST_HEAD(&lh->owners_link); ++#else ++ lh->node = NULL; ++#endif ++} ++ ++static inline void done_lh(lock_handle *lh) ++{ ++ assert("zam-342", lh != NULL); ++ if (lh->node != NULL) ++ longterm_unlock_znode(lh); ++} ++ ++extern void move_lh(lock_handle * new, lock_handle * old); ++extern void copy_lh(lock_handle * new, lock_handle * old); ++ ++extern int prepare_to_sleep(lock_stack * owner); ++extern void go_to_sleep(lock_stack * owner); ++extern void __reiser4_wake_up(lock_stack * owner); ++ ++extern int lock_stack_isclean(lock_stack * owner); ++ ++/* zlock object state check macros: only used in assertions. Both forms imply that the ++ lock is held by the current thread. */ ++extern int znode_is_write_locked(const znode *); ++extern void invalidate_lock(lock_handle *); ++ ++/* lock ordering is: first take zlock spin lock, then lock stack spin lock */ ++#define spin_ordering_pred_stack(stack) \ ++ (LOCK_CNT_NIL(spin_locked_stack) && \ ++ LOCK_CNT_NIL(spin_locked_txnmgr) && \ ++ LOCK_CNT_NIL(spin_locked_inode) && \ ++ LOCK_CNT_NIL(rw_locked_cbk_cache) && \ ++ LOCK_CNT_NIL(spin_locked_super_eflush) ) ++ ++static inline void spin_lock_stack(lock_stack *stack) ++{ ++ assert("", spin_ordering_pred_stack(stack)); ++ spin_lock(&(stack->sguard)); ++ LOCK_CNT_INC(spin_locked_stack); ++ LOCK_CNT_INC(spin_locked); ++} ++ ++static inline void spin_unlock_stack(lock_stack *stack) ++{ ++ assert_spin_locked(&(stack->sguard)); ++ assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_stack)); ++ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked)); ++ LOCK_CNT_DEC(spin_locked_stack); ++ LOCK_CNT_DEC(spin_locked); ++ spin_unlock(&(stack->sguard)); ++} ++ ++ ++static inline void reiser4_wake_up(lock_stack * owner) ++{ ++ spin_lock_stack(owner); ++ __reiser4_wake_up(owner); ++ spin_unlock_stack(owner); ++} ++ ++const char *lock_mode_name(znode_lock_mode lock); ++ ++#if REISER4_DEBUG ++extern void check_lock_data(void); ++extern void check_lock_node_data(znode * node); ++#else ++#define check_lock_data() noop ++#define check_lock_node_data() noop ++#endif ++ ++/* __LOCK_H__ */ ++#endif ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +Index: linux-2.6.16/fs/reiser4/oid.c +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/oid.c +@@ -0,0 +1,141 @@ ++/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++#include "debug.h" ++#include "super.h" ++#include "txnmgr.h" ++ ++/* we used to have oid allocation plugin. It was removed because it ++ was recognized as providing unneeded level of abstraction. 
If one ++ ever will find it useful - look at yet_unneeded_abstractions/oid ++*/ ++ ++/* ++ * initialize in-memory data for oid allocator at @super. @nr_files and @next ++ * are provided by disk format plugin that reads them from the disk during ++ * mount. ++ */ ++int oid_init_allocator(struct super_block *super, oid_t nr_files, oid_t next) ++{ ++ reiser4_super_info_data *sbinfo; ++ ++ sbinfo = get_super_private(super); ++ ++ sbinfo->next_to_use = next; ++ sbinfo->oids_in_use = nr_files; ++ return 0; ++} ++ ++/* ++ * allocate oid and return it. ABSOLUTE_MAX_OID is returned when allocator ++ * runs out of oids. ++ */ ++oid_t oid_allocate(struct super_block * super) ++{ ++ reiser4_super_info_data *sbinfo; ++ oid_t oid; ++ ++ sbinfo = get_super_private(super); ++ ++ spin_lock_reiser4_super(sbinfo); ++ if (sbinfo->next_to_use != ABSOLUTE_MAX_OID) { ++ oid = sbinfo->next_to_use++; ++ sbinfo->oids_in_use++; ++ } else ++ oid = ABSOLUTE_MAX_OID; ++ spin_unlock_reiser4_super(sbinfo); ++ return oid; ++} ++ ++/* ++ * Tell oid allocator that @oid is now free. ++ */ ++int oid_release(struct super_block *super, oid_t oid UNUSED_ARG) ++{ ++ reiser4_super_info_data *sbinfo; ++ ++ sbinfo = get_super_private(super); ++ ++ spin_lock_reiser4_super(sbinfo); ++ sbinfo->oids_in_use--; ++ spin_unlock_reiser4_super(sbinfo); ++ return 0; ++} ++ ++/* ++ * return next @oid that would be allocated (i.e., returned by oid_allocate()) ++ * without actually allocating it. This is used by disk format plugin to save ++ * oid allocator state on the disk. ++ */ ++oid_t oid_next(const struct super_block * super) ++{ ++ reiser4_super_info_data *sbinfo; ++ oid_t oid; ++ ++ sbinfo = get_super_private(super); ++ ++ spin_lock_reiser4_super(sbinfo); ++ oid = sbinfo->next_to_use; ++ spin_unlock_reiser4_super(sbinfo); ++ return oid; ++} ++ ++/* ++ * returns number of currently used oids. This is used by statfs(2) to report ++ * number of "inodes" and by disk format plugin to save oid allocator state on ++ * the disk. ++ */ ++long oids_used(const struct super_block *super) ++{ ++ reiser4_super_info_data *sbinfo; ++ oid_t used; ++ ++ sbinfo = get_super_private(super); ++ ++ spin_lock_reiser4_super(sbinfo); ++ used = sbinfo->oids_in_use; ++ spin_unlock_reiser4_super(sbinfo); ++ if (used < (__u64) ((long)~0) >> 1) ++ return (long)used; ++ else ++ return (long)-1; ++} ++ ++/* ++ * Count oid as allocated in atom. This is done after call to oid_allocate() ++ * at the point when we are irrevocably committed to creation of the new file ++ * (i.e., when oid allocation cannot be any longer rolled back due to some ++ * error). ++ */ ++void oid_count_allocated(void) ++{ ++ txn_atom *atom; ++ ++ atom = get_current_atom_locked(); ++ atom->nr_objects_created++; ++ spin_unlock_atom(atom); ++} ++ ++/* ++ * Count oid as free in atom. This is done after call to oid_release() at the ++ * point when we are irrevocably committed to the deletion of the file (i.e., ++ * when oid release cannot be any longer rolled back due to some error). 
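++ *
++ * An illustrative sketch of how these helpers combine on the create path
++ * (error handling and the surrounding transaction machinery omitted):
++ *
++ *	oid = oid_allocate(super);
++ *	if (oid == ABSOLUTE_MAX_OID)
++ *		... fail: the allocator ran out of oids ...
++ *	... create the object; once creation can no longer fail ...
++ *	oid_count_allocated();
++ *
++ * Deletion symmetrically pairs oid_release() with oid_count_released().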
++ */ ++void oid_count_released(void) ++{ ++ txn_atom *atom; ++ ++ atom = get_current_atom_locked(); ++ atom->nr_objects_deleted++; ++ spin_unlock_atom(atom); ++} ++ ++/* ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ scroll-step: 1 ++ End: ++*/ +Index: linux-2.6.16/fs/reiser4/page_cache.c +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/page_cache.c +@@ -0,0 +1,712 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++/* Memory pressure hooks. Fake inodes handling. */ ++/* We store all file system meta data (and data, of course) in the page cache. ++ ++ What does this mean? In stead of using bread/brelse we create special ++ "fake" inode (one per super block) and store content of formatted nodes ++ into pages bound to this inode in the page cache. In newer kernels bread() ++ already uses inode attached to block device (bd_inode). Advantage of having ++ our own fake inode is that we can install appropriate methods in its ++ address_space operations. Such methods are called by VM on memory pressure ++ (or during background page flushing) and we can use them to react ++ appropriately. ++ ++ In initial version we only support one block per page. Support for multiple ++ blocks per page is complicated by relocation. ++ ++ To each page, used by reiser4, jnode is attached. jnode is analogous to ++ buffer head. Difference is that jnode is bound to the page permanently: ++ jnode cannot be removed from memory until its backing page is. ++ ++ jnode contain pointer to page (->pg field) and page contain pointer to ++ jnode in ->private field. Pointer from jnode to page is protected to by ++ jnode's spinlock and pointer from page to jnode is protected by page lock ++ (PG_locked bit). Lock ordering is: first take page lock, then jnode spin ++ lock. To go into reverse direction use jnode_lock_page() function that uses ++ standard try-lock-and-release device. ++ ++ Properties: ++ ++ 1. when jnode-to-page mapping is established (by jnode_attach_page()), page ++ reference counter is increased. ++ ++ 2. when jnode-to-page mapping is destroyed (by jnode_detach_page() and ++ page_detach_jnode()), page reference counter is decreased. ++ ++ 3. on jload() reference counter on jnode page is increased, page is ++ kmapped and `referenced'. ++ ++ 4. on jrelse() inverse operations are performed. ++ ++ 5. kmapping/kunmapping of unformatted pages is done by read/write methods. ++ ++ DEADLOCKS RELATED TO MEMORY PRESSURE. [OUTDATED. Only interesting ++ historically.] ++ ++ [In the following discussion, `lock' invariably means long term lock on ++ znode.] (What about page locks?) ++ ++ There is some special class of deadlock possibilities related to memory ++ pressure. Locks acquired by other reiser4 threads are accounted for in ++ deadlock prevention mechanism (lock.c), but when ->vm_writeback() is ++ invoked additional hidden arc is added to the locking graph: thread that ++ tries to allocate memory waits for ->vm_writeback() to finish. If this ++ thread keeps lock and ->vm_writeback() tries to acquire this lock, deadlock ++ prevention is useless. ++ ++ Another related problem is possibility for ->vm_writeback() to run out of ++ memory itself. This is not a problem for ext2 and friends, because their ++ ->vm_writeback() don't allocate much memory, but reiser4 flush is ++ definitely able to allocate huge amounts of memory. 
++ ++ It seems that there is no reliable way to cope with the problems above. In ++ stead it was decided that ->vm_writeback() (as invoked in the kswapd ++ context) wouldn't perform any flushing itself, but rather should just wake ++ up some auxiliary thread dedicated for this purpose (or, the same thread ++ that does periodic commit of old atoms (ktxnmgrd.c)). ++ ++ Details: ++ ++ 1. Page is called `reclaimable' against particular reiser4 mount F if this ++ page can be ultimately released by try_to_free_pages() under presumptions ++ that: ++ ++ a. ->vm_writeback() for F is no-op, and ++ ++ b. none of the threads accessing F are making any progress, and ++ ++ c. other reiser4 mounts obey the same memory reservation protocol as F ++ (described below). ++ ++ For example, clean un-pinned page, or page occupied by ext2 data are ++ reclaimable against any reiser4 mount. ++ ++ When there is more than one reiser4 mount in a system, condition (c) makes ++ reclaim-ability not easily verifiable beyond trivial cases mentioned above. ++ ++ THIS COMMENT IS VALID FOR "MANY BLOCKS ON PAGE" CASE ++ ++ Fake inode is used to bound formatted nodes and each node is indexed within ++ fake inode by its block number. If block size of smaller than page size, it ++ may so happen that block mapped to the page with formatted node is occupied ++ by unformatted node or is unallocated. This lead to some complications, ++ because flushing whole page can lead to an incorrect overwrite of ++ unformatted node that is moreover, can be cached in some other place as ++ part of the file body. To avoid this, buffers for unformatted nodes are ++ never marked dirty. Also pages in the fake are never marked dirty. This ++ rules out usage of ->writepage() as memory pressure hook. In stead ++ ->releasepage() is used. ++ ++ Josh is concerned that page->buffer is going to die. This should not pose ++ significant problem though, because we need to add some data structures to ++ the page anyway (jnode) and all necessary book keeping can be put there. ++ ++*/ ++ ++/* Life cycle of pages/nodes. ++ ++ jnode contains reference to page and page contains reference back to ++ jnode. This reference is counted in page ->count. Thus, page bound to jnode ++ cannot be released back into free pool. ++ ++ 1. Formatted nodes. ++ ++ 1. formatted node is represented by znode. When new znode is created its ++ ->pg pointer is NULL initially. ++ ++ 2. when node content is loaded into znode (by call to zload()) for the ++ first time following happens (in call to ->read_node() or ++ ->allocate_node()): ++ ++ 1. new page is added to the page cache. ++ ++ 2. this page is attached to znode and its ->count is increased. ++ ++ 3. page is kmapped. ++ ++ 3. if more calls to zload() follow (without corresponding zrelses), page ++ counter is left intact and in its stead ->d_count is increased in znode. ++ ++ 4. each call to zrelse decreases ->d_count. When ->d_count drops to zero ++ ->release_node() is called and page is kunmapped as result. ++ ++ 5. at some moment node can be captured by a transaction. Its ->x_count ++ is then increased by transaction manager. ++ ++ 6. if node is removed from the tree (empty node with JNODE_HEARD_BANSHEE ++ bit set) following will happen (also see comment at the top of znode.c): ++ ++ 1. when last lock is released, node will be uncaptured from ++ transaction. This released reference that transaction manager acquired ++ at the step 5. ++ ++ 2. 
when last reference is released, zput() detects that node is ++ actually deleted and calls ->delete_node() ++ operation. page_cache_delete_node() implementation detaches jnode from ++ page and releases page. ++ ++ 7. otherwise (node wasn't removed from the tree), last reference to ++ znode will be released after transaction manager committed transaction ++ node was in. This implies squallocing of this node (see ++ flush.c). Nothing special happens at this point. Znode is still in the ++ hash table and page is still attached to it. ++ ++ 8. znode is actually removed from the memory because of the memory ++ pressure, or during umount (znodes_tree_done()). Anyway, znode is ++ removed by the call to zdrop(). At this moment, page is detached from ++ znode and removed from the inode address space. ++ ++*/ ++ ++#include "debug.h" ++#include "dformat.h" ++#include "key.h" ++#include "txnmgr.h" ++#include "jnode.h" ++#include "znode.h" ++#include "block_alloc.h" ++#include "tree.h" ++#include "vfs_ops.h" ++#include "inode.h" ++#include "super.h" ++#include "entd.h" ++#include "page_cache.h" ++#include "ktxnmgrd.h" ++ ++#include ++#include ++#include /* for struct page */ ++#include /* for struct page */ ++#include ++#include ++#include ++#include ++ ++static struct bio *page_bio(struct page *, jnode *, int rw, gfp_t gfp); ++ ++static struct address_space_operations formatted_fake_as_ops; ++ ++static const oid_t fake_ino = 0x1; ++static const oid_t bitmap_ino = 0x2; ++static const oid_t cc_ino = 0x3; ++ ++static void ++init_fake_inode(struct super_block *super, struct inode *fake, ++ struct inode **pfake) ++{ ++ assert("nikita-2168", fake->i_state & I_NEW); ++ fake->i_mapping->a_ops = &formatted_fake_as_ops; ++ *pfake = fake; ++ /* NOTE-NIKITA something else? */ ++ unlock_new_inode(fake); ++} ++ ++/** ++ * init_formatted_fake - iget inodes for formatted nodes and bitmaps ++ * @super: super block to init fake inode for ++ * ++ * Initializes fake inode to which formatted nodes are bound in the page cache ++ * and inode for bitmaps. ++ */ ++int init_formatted_fake(struct super_block *super) ++{ ++ struct inode *fake; ++ struct inode *bitmap; ++ struct inode *cc; ++ reiser4_super_info_data *sinfo; ++ ++ assert("nikita-1703", super != NULL); ++ ++ sinfo = get_super_private_nocheck(super); ++ fake = iget_locked(super, oid_to_ino(fake_ino)); ++ ++ if (fake != NULL) { ++ init_fake_inode(super, fake, &sinfo->fake); ++ ++ bitmap = iget_locked(super, oid_to_ino(bitmap_ino)); ++ if (bitmap != NULL) { ++ init_fake_inode(super, bitmap, &sinfo->bitmap); ++ ++ cc = iget_locked(super, oid_to_ino(cc_ino)); ++ if (cc != NULL) { ++ init_fake_inode(super, cc, &sinfo->cc); ++ return 0; ++ } else { ++ iput(sinfo->fake); ++ iput(sinfo->bitmap); ++ sinfo->fake = NULL; ++ sinfo->bitmap = NULL; ++ } ++ } else { ++ iput(sinfo->fake); ++ sinfo->fake = NULL; ++ } ++ } ++ return RETERR(-ENOMEM); ++} ++ ++/** ++ * done_formatted_fake - release inode used by formatted nodes and bitmaps ++ * @super: super block to init fake inode for ++ * ++ * Releases inodes which were used as address spaces of bitmap and formatted ++ * nodes. 
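++ *
++ * Pairs with init_formatted_fake().  An illustrative mount/umount sketch
++ * (the calling context is hypothetical):
++ *
++ *	ret = init_formatted_fake(super);
++ *	if (ret)
++ *		return ret;
++ *	... filesystem is active ...
++ *	done_formatted_fake(super);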
++ */ ++void done_formatted_fake(struct super_block *super) ++{ ++ reiser4_super_info_data *sinfo; ++ ++ sinfo = get_super_private_nocheck(super); ++ ++ if (sinfo->fake != NULL) { ++ assert("vs-1426", sinfo->fake->i_data.nrpages == 0); ++ iput(sinfo->fake); ++ sinfo->fake = NULL; ++ } ++ ++ if (sinfo->bitmap != NULL) { ++ iput(sinfo->bitmap); ++ sinfo->bitmap = NULL; ++ } ++ ++ if (sinfo->cc != NULL) { ++ iput(sinfo->cc); ++ sinfo->cc = NULL; ++ } ++ return; ++} ++ ++void reiser4_wait_page_writeback(struct page *page) ++{ ++ assert("zam-783", PageLocked(page)); ++ ++ do { ++ unlock_page(page); ++ wait_on_page_writeback(page); ++ lock_page(page); ++ } while (PageWriteback(page)); ++} ++ ++/* return tree @page is in */ ++reiser4_tree *tree_by_page(const struct page *page /* page to query */ ) ++{ ++ assert("nikita-2461", page != NULL); ++ return &get_super_private(page->mapping->host->i_sb)->tree; ++} ++ ++/* completion handler for single page bio-based read. ++ ++ mpage_end_io_read() would also do. But it's static. ++ ++*/ ++static int ++end_bio_single_page_read(struct bio *bio, unsigned int bytes_done UNUSED_ARG, ++ int err UNUSED_ARG) ++{ ++ struct page *page; ++ ++ if (bio->bi_size != 0) { ++ warning("nikita-3332", "Truncated single page read: %i", ++ bio->bi_size); ++ return 1; ++ } ++ ++ page = bio->bi_io_vec[0].bv_page; ++ ++ if (test_bit(BIO_UPTODATE, &bio->bi_flags)) { ++ SetPageUptodate(page); ++ } else { ++ ClearPageUptodate(page); ++ SetPageError(page); ++ } ++ unlock_page(page); ++ bio_put(bio); ++ return 0; ++} ++ ++/* completion handler for single page bio-based write. ++ ++ mpage_end_io_write() would also do. But it's static. ++ ++*/ ++static int ++end_bio_single_page_write(struct bio *bio, unsigned int bytes_done UNUSED_ARG, ++ int err UNUSED_ARG) ++{ ++ struct page *page; ++ ++ if (bio->bi_size != 0) { ++ warning("nikita-3333", "Truncated single page write: %i", ++ bio->bi_size); ++ return 1; ++ } ++ ++ page = bio->bi_io_vec[0].bv_page; ++ ++ if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) ++ SetPageError(page); ++ end_page_writeback(page); ++ bio_put(bio); ++ return 0; ++} ++ ++/* ->readpage() method for formatted nodes */ ++static int formatted_readpage(struct file *f UNUSED_ARG, ++ struct page *page /* page to read */ ) ++{ ++ assert("nikita-2412", PagePrivate(page) && jprivate(page)); ++ return page_io(page, jprivate(page), READ, get_gfp_mask()); ++} ++ ++/** ++ * page_io - submit single-page bio request ++ * @page: page to perform io for ++ * @node: jnode of page ++ * @rw: read or write ++ * @gfp: gfp mask for bio allocation ++ * ++ * Submits single page read or write. 
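++ *
++ * Illustrative caller pattern (a sketch): the page must be locked on
++ * entry; for WRITE this function sets PG_writeback and unlocks the page
++ * itself before submission, so a caller does roughly
++ *
++ *	lock_page(page);
++ *	ret = page_io(page, node, WRITE, get_gfp_mask());
++ *	...
++ *	wait_on_page_writeback(page);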
++ */ ++int page_io(struct page *page, jnode *node, int rw, gfp_t gfp) ++{ ++ struct bio *bio; ++ int result; ++ ++ assert("nikita-2094", page != NULL); ++ assert("nikita-2226", PageLocked(page)); ++ assert("nikita-2634", node != NULL); ++ assert("nikita-2893", rw == READ || rw == WRITE); ++ ++ if (rw) { ++ if (unlikely(page->mapping->host->i_sb->s_flags & MS_RDONLY)) { ++ unlock_page(page); ++ return 0; ++ } ++ } ++ ++ bio = page_bio(page, node, rw, gfp); ++ if (!IS_ERR(bio)) { ++ if (rw == WRITE) { ++ SetPageWriteback(page); ++ unlock_page(page); ++ } ++ reiser4_submit_bio(rw, bio); ++ result = 0; ++ } else { ++ unlock_page(page); ++ result = PTR_ERR(bio); ++ } ++ ++ return result; ++} ++ ++/* helper function to construct bio for page */ ++static struct bio *page_bio(struct page *page, jnode * node, int rw, gfp_t gfp) ++{ ++ struct bio *bio; ++ assert("nikita-2092", page != NULL); ++ assert("nikita-2633", node != NULL); ++ ++ /* Simple implementation in the assumption that blocksize == pagesize. ++ ++ We only have to submit one block, but submit_bh() will allocate bio ++ anyway, so lets use all the bells-and-whistles of bio code. ++ */ ++ ++ bio = bio_alloc(gfp, 1); ++ if (bio != NULL) { ++ int blksz; ++ struct super_block *super; ++ reiser4_block_nr blocknr; ++ ++ super = page->mapping->host->i_sb; ++ assert("nikita-2029", super != NULL); ++ blksz = super->s_blocksize; ++ assert("nikita-2028", blksz == (int)PAGE_CACHE_SIZE); ++ ++ spin_lock_jnode(node); ++ blocknr = *jnode_get_io_block(node); ++ spin_unlock_jnode(node); ++ ++ assert("nikita-2275", blocknr != (reiser4_block_nr) 0); ++ assert("nikita-2276", !blocknr_is_fake(&blocknr)); ++ ++ bio->bi_bdev = super->s_bdev; ++ /* fill bio->bi_sector before calling bio_add_page(), because ++ * q->merge_bvec_fn may want to inspect it (see ++ * drivers/md/linear.c:linear_mergeable_bvec() for example. */ ++ bio->bi_sector = blocknr * (blksz >> 9); ++ ++ if (!bio_add_page(bio, page, blksz, 0)) { ++ warning("nikita-3452", ++ "Single page bio cannot be constructed"); ++ return ERR_PTR(RETERR(-EINVAL)); ++ } ++ ++ /* bio -> bi_idx is filled by bio_init() */ ++ bio->bi_end_io = (rw == READ) ? ++ end_bio_single_page_read : end_bio_single_page_write; ++ ++ return bio; ++ } else ++ return ERR_PTR(RETERR(-ENOMEM)); ++} ++ ++/* this function is internally called by jnode_make_dirty() */ ++int set_page_dirty_internal(struct page *page) ++{ ++ struct address_space *mapping; ++ ++ mapping = page->mapping; ++ BUG_ON(mapping == NULL); ++ ++ if (!TestSetPageDirty(page)) { ++ if (mapping_cap_account_dirty(mapping)) ++ inc_page_state(nr_dirty); ++ ++ __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); ++ } ++ ++ /* znode must be dirty ? */ ++ if (mapping->host == get_super_fake(mapping->host->i_sb)) ++ assert("", JF_ISSET(jprivate(page), JNODE_DIRTY)); ++ return 0; ++} ++ ++#if REISER4_DEBUG ++ ++/** ++ * can_hit_entd ++ * ++ * This is used on ++ */ ++static int can_hit_entd(reiser4_context *ctx, struct super_block *s) ++{ ++ if (ctx == NULL || ((unsigned long)ctx->magic) != context_magic) ++ return 1; ++ if (ctx->super != s) ++ return 1; ++ if (get_super_private(s)->entd.tsk == current) ++ return 0; ++ if (!lock_stack_isclean(&ctx->stack)) ++ return 0; ++ if (ctx->trans->atom != NULL) ++ return 0; ++ return 1; ++} ++ ++#endif ++ ++/** ++ * reiser4_writepage - writepage of struct address_space_operations ++ * @page: page to write ++ * @wbc: ++ * ++ * ++ */ ++/* Common memory pressure notification. 
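++   It does not flush anything itself: the actual work is delegated to the
++   ent daemon through write_page_by_ent(); see can_hit_entd() above for
++   the contexts this may legitimately be entered from.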
*/ ++int reiser4_writepage(struct page *page, ++ struct writeback_control *wbc) ++{ ++ struct super_block *s; ++ reiser4_context *ctx; ++ ++ assert("vs-828", PageLocked(page)); ++ ++ s = page->mapping->host->i_sb; ++ ctx = get_current_context_check(); ++ ++ assert("", can_hit_entd(ctx, s)); ++ ++ return write_page_by_ent(page, wbc); ++} ++ ++/* ->set_page_dirty() method of formatted address_space */ ++static int formatted_set_page_dirty(struct page *page) ++{ ++ assert("nikita-2173", page != NULL); ++ BUG(); ++ return __set_page_dirty_nobuffers(page); ++} ++ ++/* writepages method of address space operations in reiser4 is used to involve ++ into transactions pages which are dirtied via mmap. Only regular files can ++ have such pages. Fake inode is used to access formatted nodes via page ++ cache. As formatted nodes can never be mmaped, fake inode's writepages has ++ nothing to do */ ++static int ++writepages_fake(struct address_space *mapping, struct writeback_control *wbc) ++{ ++ return 0; ++} ++ ++/* address space operations for the fake inode */ ++static struct address_space_operations formatted_fake_as_ops = { ++ /* Perform a writeback of a single page as a memory-freeing ++ * operation. */ ++ .writepage = reiser4_writepage, ++ /* this is called to read formatted node */ ++ .readpage = formatted_readpage, ++ /* ->sync_page() method of fake inode address space operations. Called ++ from wait_on_page() and lock_page(). ++ ++ This is most annoyingly misnomered method. Actually it is called ++ from wait_on_page_bit() and lock_page() and its purpose is to ++ actually start io by jabbing device drivers. ++ */ ++ .sync_page = block_sync_page, ++ /* Write back some dirty pages from this mapping. Called from sync. ++ called during sync (pdflush) */ ++ .writepages = writepages_fake, ++ /* Set a page dirty */ ++ .set_page_dirty = formatted_set_page_dirty, ++ /* used for read-ahead. Not applicable */ ++ .readpages = NULL, ++ .prepare_write = NULL, ++ .commit_write = NULL, ++ .bmap = NULL, ++ /* called just before page is being detached from inode mapping and ++ removed from memory. Called on truncate, cut/squeeze, and ++ umount. */ ++ .invalidatepage = reiser4_invalidatepage, ++ /* this is called by shrink_cache() so that file system can try to ++ release objects (jnodes, buffers, journal heads) attached to page ++ and, may be made page itself free-able. ++ */ ++ .releasepage = reiser4_releasepage, ++ .direct_IO = NULL ++}; ++ ++/* called just before page is released (no longer used by reiser4). Callers: ++ jdelete() and extent2tail(). */ ++void drop_page(struct page *page) ++{ ++ assert("nikita-2181", PageLocked(page)); ++ clear_page_dirty_for_io(page); ++ ClearPageUptodate(page); ++#if defined(PG_skipped) ++ ClearPageSkipped(page); ++#endif ++ if (page->mapping != NULL) { ++ remove_from_page_cache(page); ++ unlock_page(page); ++ page_cache_release(page); ++ } else ++ unlock_page(page); ++} ++ ++/* this is called by truncate_jnodes_range which in its turn is always called ++ after truncate_mapping_pages_range. Therefore, here jnode can not have ++ page. 
++/* This is called by truncate_jnodes_range(), which in turn is always
++   called after truncate_mapping_pages_range(). Therefore, here the jnode
++   cannot have a page. New pages cannot be created, because
++   truncate_jnodes_range() runs under exclusive access to the file,
++   whereas new page creation requires non-exclusive access. */
++static void invalidate_unformatted(jnode * node)
++{
++	struct page *page;
++
++	spin_lock_jnode(node);
++	page = node->pg;
++	if (page) {
++		loff_t from, to;
++
++		page_cache_get(page);
++		spin_unlock_jnode(node);
++		/* FIXME: use truncate_complete_page instead */
++		from = (loff_t) page->index << PAGE_CACHE_SHIFT;
++		to = from + PAGE_CACHE_SIZE - 1;
++		truncate_inode_pages_range(page->mapping, from, to);
++		page_cache_release(page);
++	} else {
++		JF_SET(node, JNODE_HEARD_BANSHEE);
++		uncapture_jnode(node);
++		unhash_unformatted_jnode(node);
++	}
++}
++
++#define JNODE_GANG_SIZE (16)
++
++/* find all eflushed jnodes in the specified range and invalidate them */
++static int
++truncate_jnodes_range(struct inode *inode, pgoff_t from, pgoff_t count)
++{
++	reiser4_inode *info;
++	int truncated_jnodes;
++	reiser4_tree *tree;
++	unsigned long index;
++	unsigned long end;
++
++	truncated_jnodes = 0;
++
++	info = reiser4_inode_data(inode);
++	tree = tree_by_inode(inode);
++
++	index = from;
++	end = from + count;
++
++	while (1) {
++		jnode *gang[JNODE_GANG_SIZE];
++		int taken;
++		int i;
++		jnode *node;
++
++		assert("nikita-3466", index <= end);
++
++		read_lock_tree(tree);
++		taken =
++		    radix_tree_gang_lookup(jnode_tree_by_reiser4_inode(info),
++					   (void **)gang, index,
++					   JNODE_GANG_SIZE);
++		for (i = 0; i < taken; ++i) {
++			node = gang[i];
++			if (index_jnode(node) < end)
++				jref(node);
++			else
++				gang[i] = NULL;
++		}
++		read_unlock_tree(tree);
++
++		for (i = 0; i < taken; ++i) {
++			node = gang[i];
++			if (node != NULL) {
++				index = max(index, index_jnode(node));
++				invalidate_unformatted(node);
++				truncated_jnodes++;
++				jput(node);
++			} else
++				break;
++		}
++		if (i != taken || taken == 0)
++			break;
++	}
++	return truncated_jnodes;
++}
++
++void
++reiser4_invalidate_pages(struct address_space *mapping, pgoff_t from,
++			 unsigned long count, int even_cows)
++{
++	loff_t from_bytes, count_bytes;
++
++	if (count == 0)
++		return;
++	from_bytes = ((loff_t) from) << PAGE_CACHE_SHIFT;
++	count_bytes = ((loff_t) count) << PAGE_CACHE_SHIFT;
++
++	unmap_mapping_range(mapping, from_bytes, count_bytes, even_cows);
++	truncate_inode_pages_range(mapping, from_bytes,
++				   from_bytes + count_bytes - 1);
++	truncate_jnodes_range(mapping->host, from, count);
++}
++
++/*
++ * Local variables:
++ * c-indentation-style: "K&R"
++ * mode-name: "LC"
++ * c-basic-offset: 8
++ * tab-width: 8
++ * fill-column: 120
++ * scroll-step: 1
++ * End:
++ */
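truncate_jnodes_range() above is an instance of the standard radix-tree gang-lookup loop: fetch up to a fixed number of items per pass under the tree lock, take references, process the items with the lock dropped, and resume from the highest index seen. A self-contained sketch of the bare pattern follows; the item type and the omitted locking are simplifications, not reiser4 code.

#include <linux/radix-tree.h>

#define GANG_SIZE 16

struct tracked {
	unsigned long index;	/* key the item is inserted under */
};

/* Sketch: visit every item with index in [start, end). */
static void for_each_tracked(struct radix_tree_root *root,
			     unsigned long start, unsigned long end,
			     void (*fn)(struct tracked *))
{
	struct tracked *gang[GANG_SIZE];
	unsigned long index = start;
	unsigned int taken, i;

	do {
		taken = radix_tree_gang_lookup(root, (void **)gang,
					       index, GANG_SIZE);
		for (i = 0; i < taken; i++) {
			if (gang[i]->index >= end)
				return;			/* past the range */
			index = gang[i]->index + 1;	/* resume point */
			fn(gang[i]);
		}
	} while (taken == GANG_SIZE);	/* short batch: tree exhausted */
}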
+Index: linux-2.6.16/fs/reiser4/page_cache.h
+===================================================================
+--- /dev/null
++++ linux-2.6.16/fs/reiser4/page_cache.h
+@@ -0,0 +1,62 @@
++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
++ * reiser4/README */
++/* Memory pressure hooks. Fake inodes handling. See page_cache.c. */
++
++#if !defined( __REISER4_PAGE_CACHE_H__ )
++#define __REISER4_PAGE_CACHE_H__
++
++#include "forward.h"
++#include "debug.h"
++
++#include <linux/fs.h>		/* for struct super_block, address_space */
++#include <linux/mm.h>		/* for struct page */
++#include <linux/pagemap.h>	/* for lock_page() */
++
++extern int init_formatted_fake(struct super_block *);
++extern void done_formatted_fake(struct super_block *);
++
++extern reiser4_tree *tree_by_page(const struct page *);
++
++extern int set_page_dirty_internal(struct page *);
++
++#define reiser4_submit_bio(rw, bio) submit_bio((rw), (bio))
++
++extern void reiser4_wait_page_writeback(struct page *);
++static inline void lock_and_wait_page_writeback(struct page *page)
++{
++	lock_page(page);
++	if (unlikely(PageWriteback(page)))
++		reiser4_wait_page_writeback(page);
++}
++
++#define jprivate(page) ((jnode *)page_private(page))
++
++extern int page_io(struct page *, jnode *, int rw, gfp_t);
++extern void drop_page(struct page *);
++extern void reiser4_invalidate_pages(struct address_space *, pgoff_t from,
++				     unsigned long count, int even_cows);
++extern void capture_reiser4_inodes(struct super_block *,
++				   struct writeback_control *);
++
++#define PAGECACHE_TAG_REISER4_MOVED PAGECACHE_TAG_DIRTY
++
++#if REISER4_DEBUG
++extern void print_page(const char *prefix, struct page *page);
++#else
++#define print_page(prf, p) noop
++#endif
++
++/* __REISER4_PAGE_CACHE_H__ */
++#endif
++
++/* Make Linus happy.
++   Local variables:
++   c-indentation-style: "K&R"
++   mode-name: "LC"
++   c-basic-offset: 8
++   tab-width: 8
++   fill-column: 120
++   scroll-step: 1
++   End:
++*/
+Index: linux-2.6.16/fs/reiser4/plugin/Makefile
+===================================================================
+--- /dev/null
++++ linux-2.6.16/fs/reiser4/plugin/Makefile
+@@ -0,0 +1,26 @@
++obj-$(CONFIG_REISER4_FS) += plugins.o
++
++plugins-objs :=			\
++	plugin.o		\
++	plugin_set.o		\
++	object.o		\
++	inode_ops.o		\
++	inode_ops_rename.o	\
++	file_ops.o		\
++	file_ops_readdir.o	\
++	file_plugin_common.o	\
++	dir_plugin_common.o	\
++	digest.o		\
++	hash.o			\
++	fibration.o		\
++	tail_policy.o		\
++	regular.o
++
++obj-$(CONFIG_REISER4_FS) += item/
++obj-$(CONFIG_REISER4_FS) += file/
++obj-$(CONFIG_REISER4_FS) += dir/
++obj-$(CONFIG_REISER4_FS) += node/
++obj-$(CONFIG_REISER4_FS) += compress/
++obj-$(CONFIG_REISER4_FS) += space/
++obj-$(CONFIG_REISER4_FS) += disk_format/
++obj-$(CONFIG_REISER4_FS) += security/
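Before moving on to the plugin code: page_cache.h above exports page_io() together with the lock_and_wait_page_writeback() helper. A hedged sketch of how a caller might combine them to read one formatted-node page, assuming (as the code above suggests) that the read-completion handler unlocks the page; read_formatted_page() is a hypothetical wrapper, not part of this patch.

#include <linux/fs.h>
#include <linux/pagemap.h>

static int read_formatted_page(struct page *page, jnode *node)
{
	int ret;

	lock_page(page);
	if (PageUptodate(page)) {	/* someone else read it already */
		unlock_page(page);
		return 0;
	}
	ret = page_io(page, node, READ, GFP_KERNEL);	/* unlocks on error */
	if (ret == 0)
		wait_on_page_locked(page);	/* end_io unlocks the page */
	return ret;
}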
*/ ++ ++#include "plugin_header.h" ++#include "plugin.h" ++#include "../inode.h" ++ ++static int change_cluster(struct inode *inode, reiser4_plugin * plugin) ++{ ++ int result = 0; ++ ++ assert("edward-1324", inode != NULL); ++ assert("edward-1325", plugin != NULL); ++ assert("edward-1326", is_reiser4_inode(inode)); ++ assert("edward-1327", plugin->h.type_id == REISER4_CLUSTER_PLUGIN_TYPE); ++ ++ if (inode_file_plugin(inode)->h.id == DIRECTORY_FILE_PLUGIN_ID) ++ result = plugin_set_cluster(&reiser4_inode_data(inode)->pset, ++ &plugin->clust); ++ else ++ result = RETERR(-EINVAL); ++ return result; ++} ++ ++static reiser4_plugin_ops cluster_plugin_ops = { ++ .init = NULL, ++ .load = NULL, ++ .save_len = NULL, ++ .save = NULL, ++ .change = &change_cluster ++}; ++ ++#define SUPPORT_CLUSTER(SHIFT, ID, LABEL, DESC) \ ++ [CLUSTER_ ## ID ## _ID] = { \ ++ .h = { \ ++ .type_id = REISER4_CLUSTER_PLUGIN_TYPE, \ ++ .id = CLUSTER_ ## ID ## _ID, \ ++ .pops = &cluster_plugin_ops, \ ++ .label = LABEL, \ ++ .desc = DESC, \ ++ .linkage = {NULL, NULL} \ ++ }, \ ++ .shift = SHIFT \ ++ } ++ ++cluster_plugin cluster_plugins[LAST_CLUSTER_ID] = { ++ SUPPORT_CLUSTER(16, 64K, "64K", "Large"), ++ SUPPORT_CLUSTER(15, 32K, "32K", "Big"), ++ SUPPORT_CLUSTER(14, 16K, "16K", "Average"), ++ SUPPORT_CLUSTER(13, 8K, "8K", "Small"), ++ SUPPORT_CLUSTER(12, 4K, "4K", "Minimal") ++}; ++ ++/* ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ scroll-step: 1 ++ End: ++*/ +Index: linux-2.6.16/fs/reiser4/plugin/cluster.h +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/plugin/cluster.h +@@ -0,0 +1,316 @@ ++/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++/* This file contains page/cluster index translators and offset modulators ++ See http://www.namesys.com/cryptcompress_design.html for details */ ++ ++#if !defined( __FS_REISER4_CLUSTER_H__ ) ++#define __FS_REISER4_CLUSTER_H__ ++ ++#include "../inode.h" ++ ++static inline int inode_cluster_shift(struct inode *inode) ++{ ++ assert("edward-92", inode != NULL); ++ assert("edward-93", reiser4_inode_data(inode) != NULL); ++ ++ return inode_cluster_plugin(inode)->shift; ++} ++ ++static inline unsigned cluster_nrpages_shift(struct inode *inode) ++{ ++ return inode_cluster_shift(inode) - PAGE_CACHE_SHIFT; ++} ++ ++/* cluster size in page units */ ++static inline unsigned cluster_nrpages(struct inode *inode) ++{ ++ return 1U << cluster_nrpages_shift(inode); ++} ++ ++static inline size_t inode_cluster_size(struct inode *inode) ++{ ++ assert("edward-96", inode != NULL); ++ ++ return 1U << inode_cluster_shift(inode); ++} ++ ++static inline cloff_t pg_to_clust(pgoff_t idx, struct inode *inode) ++{ ++ return idx >> cluster_nrpages_shift(inode); ++} ++ ++static inline pgoff_t clust_to_pg(cloff_t idx, struct inode *inode) ++{ ++ return idx << cluster_nrpages_shift(inode); ++} ++ ++static inline pgoff_t pg_to_clust_to_pg(pgoff_t idx, struct inode *inode) ++{ ++ return clust_to_pg(pg_to_clust(idx, inode), inode); ++} ++ ++static inline pgoff_t off_to_pg(loff_t off) ++{ ++ return (off >> PAGE_CACHE_SHIFT); ++} ++ ++static inline loff_t pg_to_off(pgoff_t idx) ++{ ++ return ((loff_t) (idx) << PAGE_CACHE_SHIFT); ++} ++ ++static inline cloff_t off_to_clust(loff_t off, struct inode *inode) ++{ ++ return off >> inode_cluster_shift(inode); ++} ++ ++static inline loff_t clust_to_off(cloff_t idx, struct inode *inode) ++{ ++ return 
(loff_t) idx << inode_cluster_shift(inode); ++} ++ ++static inline unsigned long count_to_nr(loff_t count, unsigned shift) ++{ ++ return (count + (1UL << shift) - 1) >> shift; ++} ++ ++/* number of pages occupied by @count bytes */ ++static inline pgoff_t count_to_nrpages(loff_t count) ++{ ++ return count_to_nr(count, PAGE_CACHE_SHIFT); ++} ++ ++/* number of clusters occupied by @count bytes */ ++static inline cloff_t count_to_nrclust(loff_t count, struct inode *inode) ++{ ++ return count_to_nr(count, inode_cluster_shift(inode)); ++} ++ ++/* number of clusters occupied by @count pages */ ++static inline cloff_t pgcount_to_nrclust(pgoff_t count, struct inode *inode) ++{ ++ return count_to_nr(count, cluster_nrpages_shift(inode)); ++} ++ ++static inline loff_t off_to_clust_to_off(loff_t off, struct inode *inode) ++{ ++ return clust_to_off(off_to_clust(off, inode), inode); ++} ++ ++static inline pgoff_t off_to_clust_to_pg(loff_t off, struct inode *inode) ++{ ++ return clust_to_pg(off_to_clust(off, inode), inode); ++} ++ ++static inline unsigned off_to_pgoff(loff_t off) ++{ ++ return off & (PAGE_CACHE_SIZE - 1); ++} ++ ++static inline unsigned off_to_cloff(loff_t off, struct inode *inode) ++{ ++ return off & ((loff_t) (inode_cluster_size(inode)) - 1); ++} ++ ++static inline unsigned ++pg_to_off_to_cloff(unsigned long idx, struct inode *inode) ++{ ++ return off_to_cloff(pg_to_off(idx), inode); ++} ++ ++/* if @size != 0, returns index of the page ++ which contains the last byte of the file */ ++static inline pgoff_t size_to_pg(loff_t size) ++{ ++ return (size ? off_to_pg(size - 1) : 0); ++} ++ ++/* minimal index of the page which doesn't contain ++ file data */ ++static inline pgoff_t size_to_next_pg(loff_t size) ++{ ++ return (size ? off_to_pg(size - 1) + 1 : 0); ++} ++ ++/* how many bytes of file of size @cnt can be contained ++ in page of index @idx */ ++static inline unsigned cnt_to_pgcnt(loff_t cnt, pgoff_t idx) ++{ ++ if (idx > off_to_pg(cnt)) ++ return 0; ++ if (idx < off_to_pg(cnt)) ++ return PAGE_CACHE_SIZE; ++ return off_to_pgoff(cnt); ++} ++ ++/* how many bytes of file of size @cnt can be contained ++ in logical cluster of index @idx */ ++static inline unsigned cnt_to_clcnt(loff_t cnt, cloff_t idx, ++ struct inode *inode) ++{ ++ if (idx > off_to_clust(cnt, inode)) ++ return 0; ++ if (idx < off_to_clust(cnt, inode)) ++ return inode_cluster_size(inode); ++ return off_to_cloff(cnt, inode); ++} ++ ++static inline unsigned ++fsize_to_count(reiser4_cluster_t * clust, struct inode *inode) ++{ ++ assert("edward-288", clust != NULL); ++ assert("edward-289", inode != NULL); ++ ++ return cnt_to_clcnt(inode->i_size, clust->index, inode); ++} ++ ++static inline int ++cluster_is_complete(reiser4_cluster_t * clust, struct inode * inode) ++{ ++ return clust->tc.lsize == inode_cluster_size(inode); ++} ++ ++static inline void reiser4_slide_init(reiser4_slide_t * win) ++{ ++ assert("edward-1084", win != NULL); ++ memset(win, 0, sizeof *win); ++} ++ ++static inline void ++tfm_cluster_init_act(tfm_cluster_t * tc, tfm_action act) ++{ ++ assert("edward-1356", tc != NULL); ++ tc->act = act; ++} ++ ++static inline void ++cluster_init_act (reiser4_cluster_t * clust, tfm_action act, reiser4_slide_t * window){ ++ assert("edward-84", clust != NULL); ++ memset(clust, 0, sizeof *clust); ++ tfm_cluster_init_act(&clust->tc, act); ++ clust->dstat = INVAL_DISK_CLUSTER; ++ clust->win = window; ++} ++ ++static inline void ++cluster_init_read(reiser4_cluster_t * clust, reiser4_slide_t * window) ++{ ++ cluster_init_act 
(clust, TFM_READ_ACT, window); ++} ++ ++static inline void ++cluster_init_write(reiser4_cluster_t * clust, reiser4_slide_t * window) ++{ ++ cluster_init_act (clust, TFM_WRITE_ACT, window); ++} ++ ++static inline int dclust_get_extension(hint_t * hint) ++{ ++ return hint->ext_coord.extension.ctail.shift; ++} ++ ++static inline void dclust_set_extension(hint_t * hint) ++{ ++ assert("edward-1270", ++ item_id_by_coord(&hint->ext_coord.coord) == CTAIL_ID); ++ hint->ext_coord.extension.ctail.shift = ++ cluster_shift_by_coord(&hint->ext_coord.coord); ++} ++ ++static inline int hint_is_unprepped_dclust(hint_t * hint) ++{ ++ return dclust_get_extension(hint) == (int)UCTAIL_SHIFT; ++} ++ ++static inline void coord_set_between_clusters(coord_t * coord) ++{ ++#if REISER4_DEBUG ++ int result; ++ result = zload(coord->node); ++ assert("edward-1296", !result); ++#endif ++ if (!coord_is_between_items(coord)) { ++ coord->between = AFTER_ITEM; ++ coord->unit_pos = 0; ++ } ++#if REISER4_DEBUG ++ zrelse(coord->node); ++#endif ++} ++ ++int inflate_cluster(reiser4_cluster_t *, struct inode *); ++int find_cluster(reiser4_cluster_t *, struct inode *, int read, int write); ++void forget_cluster_pages(struct page **page, int nrpages); ++int flush_cluster_pages(reiser4_cluster_t *, jnode *, struct inode *); ++int deflate_cluster(reiser4_cluster_t *, struct inode *); ++void truncate_page_cluster(struct inode *inode, cloff_t start); ++void invalidate_hint_cluster(reiser4_cluster_t * clust); ++void put_hint_cluster(reiser4_cluster_t * clust, struct inode *inode, ++ znode_lock_mode mode); ++int get_disk_cluster_locked(reiser4_cluster_t * clust, struct inode *inode, ++ znode_lock_mode lock_mode); ++void reset_cluster_params(reiser4_cluster_t * clust); ++int set_cluster_by_page(reiser4_cluster_t * clust, struct page * page, ++ int count); ++int prepare_page_cluster(struct inode *inode, reiser4_cluster_t * clust, ++ int capture); ++void release_cluster_pages(reiser4_cluster_t *); ++void put_cluster_handle(reiser4_cluster_t * clust); ++int grab_tfm_stream(struct inode *inode, tfm_cluster_t * tc, tfm_stream_id id); ++int tfm_cluster_is_uptodate(tfm_cluster_t * tc); ++void tfm_cluster_set_uptodate(tfm_cluster_t * tc); ++void tfm_cluster_clr_uptodate(tfm_cluster_t * tc); ++ ++/* move cluster handle to the target position ++ specified by the page of index @pgidx ++*/ ++static inline void ++move_cluster_forward(reiser4_cluster_t * clust, struct inode *inode, ++ pgoff_t pgidx, int *progress) ++{ ++ assert("edward-1297", clust != NULL); ++ assert("edward-1298", inode != NULL); ++ ++ reset_cluster_params(clust); ++ if (*progress && ++ /* Hole in the indices. Hint became invalid and can not be ++ used by find_cluster_item() even if seal/node versions ++ will coincide */ ++ pg_to_clust(pgidx, inode) != clust->index + 1) { ++ unset_hint(clust->hint); ++ invalidate_hint_cluster(clust); ++ } ++ *progress = 1; ++ clust->index = pg_to_clust(pgidx, inode); ++} ++ ++static inline int ++alloc_clust_pages(reiser4_cluster_t * clust, struct inode *inode) ++{ ++ assert("edward-791", clust != NULL); ++ assert("edward-792", inode != NULL); ++ clust->pages = ++ kmalloc(sizeof(*clust->pages) << inode_cluster_shift(inode), ++ GFP_KERNEL); ++ if (!clust->pages) ++ return -ENOMEM; ++ return 0; ++} ++ ++static inline void free_clust_pages(reiser4_cluster_t * clust) ++{ ++ kfree(clust->pages); ++} ++ ++#endif /* __FS_REISER4_CLUSTER_H__ */ ++ ++/* Make Linus happy. 
++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ scroll-step: 1 ++ End: ++*/ +Index: linux-2.6.16/fs/reiser4/plugin/compress/Makefile +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/plugin/compress/Makefile +@@ -0,0 +1,6 @@ ++obj-$(CONFIG_REISER4_FS) += compress_plugins.o ++ ++compress_plugins-objs := \ ++ compress.o \ ++ minilzo.o \ ++ compress_mode.o +Index: linux-2.6.16/fs/reiser4/plugin/compress/compress.c +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/plugin/compress/compress.c +@@ -0,0 +1,370 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++/* reiser4 compression transform plugins */ ++ ++#include "../../debug.h" ++#include "../../inode.h" ++#include "../plugin.h" ++#include "minilzo.h" ++ ++#include ++#include ++#include ++#include ++ ++static int change_compression(struct inode *inode, reiser4_plugin * plugin) ++{ ++ assert("edward-1316", inode != NULL); ++ assert("edward-1317", plugin != NULL); ++ assert("edward-1318", is_reiser4_inode(inode)); ++ assert("edward-1319", ++ plugin->h.type_id == REISER4_COMPRESSION_PLUGIN_TYPE); ++ /* cannot change compression plugin of already existing object */ ++ return RETERR(-EINVAL); ++} ++ ++static reiser4_plugin_ops compression_plugin_ops = { ++ .init = NULL, ++ .load = NULL, ++ .save_len = NULL, ++ .save = NULL, ++ .change = &change_compression ++}; ++ ++/******************************************************************************/ ++/* gzip1 compression */ ++/******************************************************************************/ ++ ++#define GZIP1_DEF_LEVEL Z_BEST_SPEED ++#define GZIP1_DEF_WINBITS 15 ++#define GZIP1_DEF_MEMLEVEL MAX_MEM_LEVEL ++ ++static int gzip1_init(void) ++{ ++ int ret = -EINVAL; ++#if REISER4_ZLIB ++ ret = 0; ++#endif ++ if (ret == -EINVAL) ++ warning("edward-1337", "Zlib not compiled into kernel"); ++ return ret; ++} ++ ++static int gzip1_overrun(unsigned src_len UNUSED_ARG) ++{ ++ return 0; ++} ++ ++static coa_t gzip1_alloc(tfm_action act) ++{ ++ coa_t coa = NULL; ++#if REISER4_ZLIB ++ int ret = 0; ++ switch (act) { ++ case TFM_WRITE_ACT: /* compress */ ++ coa = vmalloc(zlib_deflate_workspacesize()); ++ if (!coa) { ++ ret = -ENOMEM; ++ break; ++ } ++ memset(coa, 0, zlib_deflate_workspacesize()); ++ break; ++ case TFM_READ_ACT: /* decompress */ ++ coa = vmalloc(zlib_inflate_workspacesize()); ++ if (!coa) { ++ ret = -ENOMEM; ++ break; ++ } ++ memset(coa, 0, zlib_inflate_workspacesize()); ++ break; ++ default: ++ impossible("edward-767", ++ "trying to alloc workspace for unknown tfm action"); ++ } ++ if (ret) { ++ warning("edward-768", ++ "alloc workspace for gzip1 (tfm action = %d) failed\n", ++ act); ++ return ERR_PTR(ret); ++ } ++#endif ++ return coa; ++} ++ ++static void gzip1_free(coa_t coa, tfm_action act) ++{ ++ assert("edward-769", coa != NULL); ++ ++ switch (act) { ++ case TFM_WRITE_ACT: /* compress */ ++ vfree(coa); ++ break; ++ case TFM_READ_ACT: /* decompress */ ++ vfree(coa); ++ break; ++ default: ++ impossible("edward-770", "unknown tfm action"); ++ } ++ return; ++} ++ ++static int gzip1_min_size_deflate(void) ++{ ++ return 64; ++} ++ ++static void ++gzip1_compress(coa_t coa, __u8 * src_first, unsigned src_len, ++ __u8 * dst_first, unsigned *dst_len) ++{ ++#if REISER4_ZLIB ++ int ret = 0; ++ struct z_stream_s stream; ++ ++ memset(&stream, 0, 
sizeof(stream)); ++ ++ assert("edward-842", coa != NULL); ++ assert("edward-875", src_len != 0); ++ ++ stream.workspace = coa; ++ ret = zlib_deflateInit2(&stream, GZIP1_DEF_LEVEL, Z_DEFLATED, ++ -GZIP1_DEF_WINBITS, GZIP1_DEF_MEMLEVEL, ++ Z_DEFAULT_STRATEGY); ++ if (ret != Z_OK) { ++ warning("edward-771", "zlib_deflateInit2 returned %d\n", ret); ++ goto rollback; ++ } ++ ret = zlib_deflateReset(&stream); ++ if (ret != Z_OK) { ++ warning("edward-772", "zlib_deflateReset returned %d\n", ret); ++ goto rollback; ++ } ++ stream.next_in = src_first; ++ stream.avail_in = src_len; ++ stream.next_out = dst_first; ++ stream.avail_out = *dst_len; ++ ++ ret = zlib_deflate(&stream, Z_FINISH); ++ if (ret != Z_STREAM_END) { ++ if (ret != Z_OK) ++ warning("edward-773", ++ "zlib_deflate returned %d\n", ret); ++ goto rollback; ++ } ++ *dst_len = stream.total_out; ++ return; ++ rollback: ++ *dst_len = src_len; ++#endif ++ return; ++} ++ ++static void ++gzip1_decompress(coa_t coa, __u8 * src_first, unsigned src_len, ++ __u8 * dst_first, unsigned *dst_len) ++{ ++#if REISER4_ZLIB ++ int ret = 0; ++ struct z_stream_s stream; ++ ++ memset(&stream, 0, sizeof(stream)); ++ ++ assert("edward-843", coa != NULL); ++ assert("edward-876", src_len != 0); ++ ++ stream.workspace = coa; ++ ret = zlib_inflateInit2(&stream, -GZIP1_DEF_WINBITS); ++ if (ret != Z_OK) { ++ warning("edward-774", "zlib_inflateInit2 returned %d\n", ret); ++ return; ++ } ++ ret = zlib_inflateReset(&stream); ++ if (ret != Z_OK) { ++ warning("edward-775", "zlib_inflateReset returned %d\n", ret); ++ return; ++ } ++ ++ stream.next_in = src_first; ++ stream.avail_in = src_len; ++ stream.next_out = dst_first; ++ stream.avail_out = *dst_len; ++ ++ ret = zlib_inflate(&stream, Z_SYNC_FLUSH); ++ /* ++ * Work around a bug in zlib, which sometimes wants to taste an extra ++ * byte when being used in the (undocumented) raw deflate mode. ++ * (From USAGI). 
++ */ ++ if (ret == Z_OK && !stream.avail_in && stream.avail_out) { ++ u8 zerostuff = 0; ++ stream.next_in = &zerostuff; ++ stream.avail_in = 1; ++ ret = zlib_inflate(&stream, Z_FINISH); ++ } ++ if (ret != Z_STREAM_END) { ++ warning("edward-776", "zlib_inflate returned %d\n", ret); ++ return; ++ } ++ *dst_len = stream.total_out; ++#endif ++ return; ++} ++ ++/******************************************************************************/ ++/* lzo1 compression */ ++/******************************************************************************/ ++ ++static int lzo1_init(void) ++{ ++ int ret; ++ ret = lzo_init(); ++ if (ret != LZO_E_OK) ++ warning("edward-848", "lzo_init() failed with ret = %d\n", ret); ++ return ret; ++} ++ ++static int lzo1_overrun(unsigned in_len) ++{ ++ return in_len / 64 + 16 + 3; ++} ++ ++#define LZO_HEAP_SIZE(size) \ ++ sizeof(lzo_align_t) * (((size) + (sizeof(lzo_align_t) - 1)) / sizeof(lzo_align_t)) ++ ++static coa_t lzo1_alloc(tfm_action act) ++{ ++ int ret = 0; ++ coa_t coa = NULL; ++ ++ switch (act) { ++ case TFM_WRITE_ACT: /* compress */ ++ coa = vmalloc(LZO_HEAP_SIZE(LZO1X_1_MEM_COMPRESS)); ++ if (!coa) { ++ ret = -ENOMEM; ++ break; ++ } ++ memset(coa, 0, LZO_HEAP_SIZE(LZO1X_1_MEM_COMPRESS)); ++ case TFM_READ_ACT: /* decompress */ ++ break; ++ default: ++ impossible("edward-877", ++ "trying to alloc workspace for unknown tfm action"); ++ } ++ if (ret) { ++ warning("edward-878", ++ "alloc workspace for lzo1 (tfm action = %d) failed\n", ++ act); ++ return ERR_PTR(ret); ++ } ++ return coa; ++} ++ ++static void lzo1_free(coa_t coa, tfm_action act) ++{ ++ assert("edward-879", coa != NULL); ++ ++ switch (act) { ++ case TFM_WRITE_ACT: /* compress */ ++ vfree(coa); ++ break; ++ case TFM_READ_ACT: /* decompress */ ++ impossible("edward-1304", ++ "trying to free non-allocated workspace"); ++ default: ++ impossible("edward-880", "unknown tfm action"); ++ } ++ return; ++} ++ ++static int lzo1_min_size_deflate(void) ++{ ++ return 256; ++} ++ ++static void ++lzo1_compress(coa_t coa, __u8 * src_first, unsigned src_len, ++ __u8 * dst_first, unsigned *dst_len) ++{ ++ int result; ++ ++ assert("edward-846", coa != NULL); ++ assert("edward-847", src_len != 0); ++ ++ result = lzo1x_1_compress(src_first, src_len, dst_first, dst_len, coa); ++ if (result != LZO_E_OK) { ++ warning("edward-849", "lzo1x_1_compress failed\n"); ++ goto out; ++ } ++ if (*dst_len >= src_len) { ++ //warning("edward-850", "lzo1x_1_compress: incompressible data\n"); ++ goto out; ++ } ++ return; ++ out: ++ *dst_len = src_len; ++ return; ++} ++ ++static void ++lzo1_decompress(coa_t coa, __u8 * src_first, unsigned src_len, ++ __u8 * dst_first, unsigned *dst_len) ++{ ++ int result; ++ ++ assert("edward-851", coa == NULL); ++ assert("edward-852", src_len != 0); ++ ++ result = lzo1x_decompress(src_first, src_len, dst_first, dst_len, NULL); ++ if (result != LZO_E_OK) ++ warning("edward-853", "lzo1x_1_decompress failed\n"); ++ return; ++} ++ ++compression_plugin compression_plugins[LAST_COMPRESSION_ID] = { ++ [LZO1_COMPRESSION_ID] = { ++ .h = { ++ .type_id = REISER4_COMPRESSION_PLUGIN_TYPE, ++ .id = LZO1_COMPRESSION_ID, ++ .pops = &compression_plugin_ops, ++ .label = "lzo1", ++ .desc = "lzo1 compression transform", ++ .linkage = {NULL, NULL} ++ }, ++ .init = lzo1_init, ++ .overrun = lzo1_overrun, ++ .alloc = lzo1_alloc, ++ .free = lzo1_free, ++ .min_size_deflate = lzo1_min_size_deflate, ++ .checksum = reiser4_adler32, ++ .compress = lzo1_compress, ++ .decompress = lzo1_decompress ++ }, ++ [GZIP1_COMPRESSION_ID] = { ++ 
.h = { ++ .type_id = REISER4_COMPRESSION_PLUGIN_TYPE, ++ .id = GZIP1_COMPRESSION_ID, ++ .pops = &compression_plugin_ops, ++ .label = "gzip1", ++ .desc = "gzip1 compression transform", ++ .linkage = {NULL, NULL} ++ }, ++ .init = gzip1_init, ++ .overrun = gzip1_overrun, ++ .alloc = gzip1_alloc, ++ .free = gzip1_free, ++ .min_size_deflate = gzip1_min_size_deflate, ++ .checksum = NULL, ++ .compress = gzip1_compress, ++ .decompress = gzip1_decompress ++ } ++}; ++ ++/* ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ scroll-step: 1 ++ End: ++*/ +Index: linux-2.6.16/fs/reiser4/plugin/compress/compress.h +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/plugin/compress/compress.h +@@ -0,0 +1,38 @@ ++#if !defined( __FS_REISER4_COMPRESS_H__ ) ++#define __FS_REISER4_COMPRESS_H__ ++ ++#include ++#include ++ ++typedef enum { ++ TFM_READ_ACT, ++ TFM_WRITE_ACT, ++ TFM_LAST_ACT ++} tfm_action; ++ ++/* builtin compression plugins */ ++ ++typedef enum { ++ LZO1_COMPRESSION_ID, ++ GZIP1_COMPRESSION_ID, ++ LAST_COMPRESSION_ID, ++} reiser4_compression_id; ++ ++typedef unsigned long cloff_t; ++typedef void *coa_t; ++typedef coa_t coa_set[LAST_COMPRESSION_ID][TFM_LAST_ACT]; ++ ++__u32 reiser4_adler32(char *data, __u32 len); ++ ++#endif /* __FS_REISER4_COMPRESS_H__ */ ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ scroll-step: 1 ++ End: ++*/ +Index: linux-2.6.16/fs/reiser4/plugin/compress/compress_mode.c +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/plugin/compress/compress_mode.c +@@ -0,0 +1,163 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++/* This file contains Reiser4 compression mode plugins. ++ ++ Compression mode plugin is a set of handlers called by compressor ++ at flush time and represent some heuristics including the ones ++ which are to avoid compression of incompressible data, see ++ http://www.namesys.com/cryptcompress_design.html for more details. ++*/ ++#include "../../inode.h" ++#include "../plugin.h" ++ ++static int should_deflate_test(struct inode * inode, cloff_t index) ++{ ++ return !test_bit(0, &index); ++} ++ ++static int should_deflate_none(struct inode * inode, cloff_t index) ++{ ++ return 0; ++} ++ ++static int should_deflate_common(struct inode * inode, cloff_t index) ++{ ++ return compression_is_on(cryptcompress_inode_data(inode)); ++} ++ ++static int turn_off_compression(struct inode *inode, cloff_t index) ++{ ++ toggle_compression(cryptcompress_inode_data(inode), 0); ++ return 0; ++} ++ ++static int turn_on_compression(struct inode *inode, cloff_t index) ++{ ++ toggle_compression(cryptcompress_inode_data(inode), 1); ++ return 0; ++} ++ ++static int turn_off_compression_on_zero(struct inode *inode, cloff_t index) ++{ ++ assert("edward-1308", inode != NULL); ++ if (index == 0) ++ toggle_compression(cryptcompress_inode_data(inode), 0); ++ return 0; ++} ++ ++/* Check on lattice (COL) of some sparseness factor, ++ the family of adaptive compression modes which define ++ the following behavior: ++ ++ Compression is on: try to compress everything and turn ++ it off, whenever cluster is incompressible. ++ ++ Compression is off: try to compress clusters of indexes ++ k * FACTOR (k = 0, 1, 2, ...) 
and turn it on, if some of ++ them is compressible. */ ++ ++/* check if @index belongs to one-dimensional lattice ++ of sparce factor @factor */ ++static int check_on_lattice(cloff_t index, int factor) ++{ ++ return (factor ? index % factor == 0: index == 0); ++} ++ ++#define DEFINE_CHECK_ON_LATTICE(FACTOR) \ ++ static int check_on_lattice_ ## FACTOR (struct inode * inode, \ ++ cloff_t index) \ ++{ \ ++ return should_deflate_common(inode, index) || \ ++ check_on_lattice(index, FACTOR); \ ++} ++ ++#define SUPPORT_COL_COMPRESSION_MODE(FACTOR, LABEL) \ ++[COL_ ## FACTOR ## _COMPRESSION_MODE_ID] = { \ ++ .h = { \ ++ .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE, \ ++ .id = COL_ ## FACTOR ## _COMPRESSION_MODE_ID, \ ++ .pops = NULL, \ ++ .label = LABEL, \ ++ .desc = LABEL, \ ++ .linkage = {NULL, NULL} \ ++ }, \ ++ .should_deflate = check_on_lattice_ ## FACTOR, \ ++ .accept_hook = turn_on_compression, \ ++ .discard_hook = turn_off_compression \ ++} ++ ++DEFINE_CHECK_ON_LATTICE(8) ++DEFINE_CHECK_ON_LATTICE(16) ++DEFINE_CHECK_ON_LATTICE(32) ++ ++/* compression mode_plugins */ ++compression_mode_plugin compression_mode_plugins[LAST_COMPRESSION_MODE_ID] = { ++ [NONE_COMPRESSION_MODE_ID] = { ++ .h = { ++ .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE, ++ .id = NONE_COMPRESSION_MODE_ID, ++ .pops = NULL, ++ .label = "none", ++ .desc = "Don't compress", ++ .linkage = {NULL, NULL} ++ }, ++ .should_deflate = should_deflate_none, ++ .accept_hook = NULL, ++ .discard_hook = NULL ++ }, ++ /* Check-on-lattice adaptive compression modes */ ++ SUPPORT_COL_COMPRESSION_MODE(8, "col8"), ++ SUPPORT_COL_COMPRESSION_MODE(16, "col16"), ++ SUPPORT_COL_COMPRESSION_MODE(32, "col32"), ++ /* Turn off compression if logical cluster of index == 0 ++ is incompressible, then don't check anymore */ ++ [COZ_COMPRESSION_MODE_ID] = { ++ .h = { ++ .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE, ++ .id = COZ_COMPRESSION_MODE_ID, ++ .pops = NULL, ++ .label = "coz", ++ .desc = "Check on zero", ++ .linkage = {NULL, NULL} ++ }, ++ .should_deflate = should_deflate_common, ++ .accept_hook = NULL, ++ .discard_hook = turn_off_compression_on_zero ++ }, ++ [FORCE_COMPRESSION_MODE_ID] = { ++ .h = { ++ .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE, ++ .id = FORCE_COMPRESSION_MODE_ID, ++ .pops = NULL, ++ .label = "force", ++ .desc = "Compress everything", ++ .linkage = {NULL, NULL} ++ }, ++ .should_deflate = NULL, ++ .accept_hook = NULL, ++ .discard_hook = NULL ++ }, ++ [TEST_COMPRESSION_MODE_ID] = { ++ .h = { ++ .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE, ++ .id = TEST_COMPRESSION_MODE_ID, ++ .pops = NULL, ++ .label = "test", /* This mode is for benchmarks only */ ++ .desc = "Don't compress odd clusters", ++ .linkage = {NULL, NULL} ++ }, ++ .should_deflate = should_deflate_test, ++ .accept_hook = NULL, ++ .discard_hook = NULL ++ } ++}; ++ ++/* ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ scroll-step: 1 ++ End: ++*/ +Index: linux-2.6.16/fs/reiser4/plugin/compress/lzoconf.h +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/plugin/compress/lzoconf.h +@@ -0,0 +1,420 @@ ++/* lzoconf.h -- configuration for the LZO real-time data compression library ++ adopted for reiser4 compression transform plugin. ++ ++ This file is part of the LZO real-time data compression library ++ and not included in any proprietary licenses of reiser4. 
++ ++ Copyright (C) 2002 Markus Franz Xaver Johannes Oberhumer ++ Copyright (C) 2001 Markus Franz Xaver Johannes Oberhumer ++ Copyright (C) 2000 Markus Franz Xaver Johannes Oberhumer ++ Copyright (C) 1999 Markus Franz Xaver Johannes Oberhumer ++ Copyright (C) 1998 Markus Franz Xaver Johannes Oberhumer ++ Copyright (C) 1997 Markus Franz Xaver Johannes Oberhumer ++ Copyright (C) 1996 Markus Franz Xaver Johannes Oberhumer ++ All Rights Reserved. ++ ++ The LZO library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU General Public License as ++ published by the Free Software Foundation; either version 2 of ++ the License, or (at your option) any later version. ++ ++ The LZO library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ GNU General Public License for more details. ++ ++ You should have received a copy of the GNU General Public License ++ along with the LZO library; see the file COPYING. ++ If not, write to the Free Software Foundation, Inc., ++ 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ++ ++ Markus F.X.J. Oberhumer ++ ++ http://www.oberhumer.com/opensource/lzo/ ++ */ ++ ++#include /* for UINT_MAX, ULONG_MAX - edward */ ++ ++#ifndef __LZOCONF_H ++#define __LZOCONF_H ++ ++#define LZO_VERSION 0x1080 ++#define LZO_VERSION_STRING "1.08" ++#define LZO_VERSION_DATE "Jul 12 2002" ++ ++/* internal Autoconf configuration file - only used when building LZO */ ++#if defined(LZO_HAVE_CONFIG_H) ++# include ++#endif ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++/*********************************************************************** ++// LZO requires a conforming ++************************************************************************/ ++ ++#define CHAR_BIT 8 ++#define USHRT_MAX 0xffff ++ ++/* workaround a cpp bug under hpux 10.20 */ ++#define LZO_0xffffffffL 4294967295ul ++ ++/*********************************************************************** ++// architecture defines ++************************************************************************/ ++ ++#if !defined(__LZO_WIN) && !defined(__LZO_DOS) && !defined(__LZO_OS2) ++# if defined(__WINDOWS__) || defined(_WINDOWS) || defined(_Windows) ++# define __LZO_WIN ++# elif defined(__WIN32__) || defined(_WIN32) || defined(WIN32) ++# define __LZO_WIN ++# elif defined(__NT__) || defined(__NT_DLL__) || defined(__WINDOWS_386__) ++# define __LZO_WIN ++# elif defined(__DOS__) || defined(__MSDOS__) || defined(MSDOS) ++# define __LZO_DOS ++# elif defined(__OS2__) || defined(__OS2V2__) || defined(OS2) ++# define __LZO_OS2 ++# elif defined(__palmos__) ++# define __LZO_PALMOS ++# elif defined(__TOS__) || defined(__atarist__) ++# define __LZO_TOS ++# endif ++#endif ++ ++#if (UINT_MAX < LZO_0xffffffffL) ++# if defined(__LZO_WIN) ++# define __LZO_WIN16 ++# elif defined(__LZO_DOS) ++# define __LZO_DOS16 ++# elif defined(__LZO_PALMOS) ++# define __LZO_PALMOS16 ++# elif defined(__LZO_TOS) ++# define __LZO_TOS16 ++# elif defined(__C166__) ++# else ++ /* porting hint: for pure 16-bit architectures try compiling ++ * everything with -D__LZO_STRICT_16BIT */ ++# error "16-bit target not supported - contact me for porting hints" ++# endif ++#endif ++ ++#if !defined(__LZO_i386) ++# if defined(__LZO_DOS) || defined(__LZO_WIN16) ++# define __LZO_i386 ++# elif defined(__i386__) || defined(__386__) || defined(_M_IX86) ++# define __LZO_i386 ++# endif ++#endif ++ ++#if 
defined(__LZO_STRICT_16BIT) ++# if (UINT_MAX < LZO_0xffffffffL) ++# include ++# endif ++#endif ++ ++/* memory checkers */ ++#if !defined(__LZO_CHECKER) ++# if defined(__BOUNDS_CHECKING_ON) ++# define __LZO_CHECKER ++# elif defined(__CHECKER__) ++# define __LZO_CHECKER ++# elif defined(__INSURE__) ++# define __LZO_CHECKER ++# elif defined(__PURIFY__) ++# define __LZO_CHECKER ++# endif ++#endif ++ ++/*********************************************************************** ++// integral and pointer types ++************************************************************************/ ++ ++/* Integral types with 32 bits or more */ ++#if !defined(LZO_UINT32_MAX) ++# if (UINT_MAX >= LZO_0xffffffffL) ++ typedef unsigned int lzo_uint32; ++ typedef int lzo_int32; ++# define LZO_UINT32_MAX UINT_MAX ++# define LZO_INT32_MAX INT_MAX ++# define LZO_INT32_MIN INT_MIN ++# elif (ULONG_MAX >= LZO_0xffffffffL) ++ typedef unsigned long lzo_uint32; ++ typedef long lzo_int32; ++# define LZO_UINT32_MAX ULONG_MAX ++# define LZO_INT32_MAX LONG_MAX ++# define LZO_INT32_MIN LONG_MIN ++# else ++# error "lzo_uint32" ++# endif ++#endif ++ ++/* lzo_uint is used like size_t */ ++#if !defined(LZO_UINT_MAX) ++# if (UINT_MAX >= LZO_0xffffffffL) ++ typedef unsigned int lzo_uint; ++ typedef int lzo_int; ++# define LZO_UINT_MAX UINT_MAX ++# define LZO_INT_MAX INT_MAX ++# define LZO_INT_MIN INT_MIN ++# elif (ULONG_MAX >= LZO_0xffffffffL) ++ typedef unsigned long lzo_uint; ++ typedef long lzo_int; ++# define LZO_UINT_MAX ULONG_MAX ++# define LZO_INT_MAX LONG_MAX ++# define LZO_INT_MIN LONG_MIN ++# else ++# error "lzo_uint" ++# endif ++#endif ++ ++ typedef int lzo_bool; ++ ++/*********************************************************************** ++// memory models ++************************************************************************/ ++ ++/* Memory model for the public code segment. */ ++#if !defined(__LZO_CMODEL) ++# if defined(__LZO_DOS16) || defined(__LZO_WIN16) ++# define __LZO_CMODEL __far ++# elif defined(__LZO_i386) && defined(__WATCOMC__) ++# define __LZO_CMODEL __near ++# else ++# define __LZO_CMODEL ++# endif ++#endif ++ ++/* Memory model for the public data segment. */ ++#if !defined(__LZO_DMODEL) ++# if defined(__LZO_DOS16) || defined(__LZO_WIN16) ++# define __LZO_DMODEL __far ++# elif defined(__LZO_i386) && defined(__WATCOMC__) ++# define __LZO_DMODEL __near ++# else ++# define __LZO_DMODEL ++# endif ++#endif ++ ++/* Memory model that allows to access memory at offsets of lzo_uint. 
*/ ++#if !defined(__LZO_MMODEL) ++# if (LZO_UINT_MAX <= UINT_MAX) ++# define __LZO_MMODEL ++# elif defined(__LZO_DOS16) || defined(__LZO_WIN16) ++# define __LZO_MMODEL __huge ++# define LZO_999_UNSUPPORTED ++# elif defined(__LZO_PALMOS16) || defined(__LZO_TOS16) ++# define __LZO_MMODEL ++# else ++# error "__LZO_MMODEL" ++# endif ++#endif ++ ++/* no typedef here because of const-pointer issues */ ++#define lzo_byte unsigned char __LZO_MMODEL ++#define lzo_bytep unsigned char __LZO_MMODEL * ++#define lzo_charp char __LZO_MMODEL * ++#define lzo_voidp void __LZO_MMODEL * ++#define lzo_shortp short __LZO_MMODEL * ++#define lzo_ushortp unsigned short __LZO_MMODEL * ++#define lzo_uint32p lzo_uint32 __LZO_MMODEL * ++#define lzo_int32p lzo_int32 __LZO_MMODEL * ++#define lzo_uintp lzo_uint __LZO_MMODEL * ++#define lzo_intp lzo_int __LZO_MMODEL * ++#define lzo_voidpp lzo_voidp __LZO_MMODEL * ++#define lzo_bytepp lzo_bytep __LZO_MMODEL * ++ ++#ifndef lzo_sizeof_dict_t ++# define lzo_sizeof_dict_t sizeof(lzo_bytep) ++#endif ++ ++/*********************************************************************** ++// calling conventions and function types ++************************************************************************/ ++ ++/* linkage */ ++#if !defined(__LZO_EXTERN_C) ++# ifdef __cplusplus ++# define __LZO_EXTERN_C extern "C" ++# else ++# define __LZO_EXTERN_C extern ++# endif ++#endif ++ ++/* calling convention */ ++#if !defined(__LZO_CDECL) ++# if defined(__LZO_DOS16) || defined(__LZO_WIN16) ++# define __LZO_CDECL __LZO_CMODEL __cdecl ++# elif defined(__LZO_i386) && defined(_MSC_VER) ++# define __LZO_CDECL __LZO_CMODEL __cdecl ++# elif defined(__LZO_i386) && defined(__WATCOMC__) ++# define __LZO_CDECL __LZO_CMODEL __cdecl ++# else ++# define __LZO_CDECL __LZO_CMODEL ++# endif ++#endif ++#if !defined(__LZO_ENTRY) ++# define __LZO_ENTRY __LZO_CDECL ++#endif ++ ++/* C++ exception specification for extern "C" function types */ ++#if !defined(__cplusplus) ++# undef LZO_NOTHROW ++# define LZO_NOTHROW ++#elif !defined(LZO_NOTHROW) ++# define LZO_NOTHROW ++#endif ++ ++ typedef int ++ (__LZO_ENTRY * lzo_compress_t) (const lzo_byte * src, lzo_uint src_len, ++ lzo_byte * dst, lzo_uintp dst_len, ++ lzo_voidp wrkmem); ++ ++ typedef int ++ (__LZO_ENTRY * lzo_decompress_t) (const lzo_byte * src, ++ lzo_uint src_len, lzo_byte * dst, ++ lzo_uintp dst_len, lzo_voidp wrkmem); ++ ++ typedef int ++ (__LZO_ENTRY * lzo_optimize_t) (lzo_byte * src, lzo_uint src_len, ++ lzo_byte * dst, lzo_uintp dst_len, ++ lzo_voidp wrkmem); ++ ++ typedef int ++ (__LZO_ENTRY * lzo_compress_dict_t) (const lzo_byte * src, ++ lzo_uint src_len, lzo_byte * dst, ++ lzo_uintp dst_len, ++ lzo_voidp wrkmem, ++ const lzo_byte * dict, ++ lzo_uint dict_len); ++ ++ typedef int ++ (__LZO_ENTRY * lzo_decompress_dict_t) (const lzo_byte * src, ++ lzo_uint src_len, ++ lzo_byte * dst, ++ lzo_uintp dst_len, ++ lzo_voidp wrkmem, ++ const lzo_byte * dict, ++ lzo_uint dict_len); ++ ++/* assembler versions always use __cdecl */ ++ typedef int ++ (__LZO_CDECL * lzo_compress_asm_t) (const lzo_byte * src, ++ lzo_uint src_len, lzo_byte * dst, ++ lzo_uintp dst_len, ++ lzo_voidp wrkmem); ++ ++ typedef int ++ (__LZO_CDECL * lzo_decompress_asm_t) (const lzo_byte * src, ++ lzo_uint src_len, lzo_byte * dst, ++ lzo_uintp dst_len, ++ lzo_voidp wrkmem); ++ ++/* a progress indicator callback function */ ++ typedef void (__LZO_ENTRY * lzo_progress_callback_t) (lzo_uint, ++ lzo_uint); ++ ++/*********************************************************************** ++// export 
information ++************************************************************************/ ++ ++/* DLL export information */ ++#if !defined(__LZO_EXPORT1) ++# define __LZO_EXPORT1 ++#endif ++#if !defined(__LZO_EXPORT2) ++# define __LZO_EXPORT2 ++#endif ++ ++/* exported calling convention for C functions */ ++#if !defined(LZO_PUBLIC) ++# define LZO_PUBLIC(_rettype) \ ++ __LZO_EXPORT1 _rettype __LZO_EXPORT2 __LZO_ENTRY ++#endif ++#if !defined(LZO_EXTERN) ++# define LZO_EXTERN(_rettype) __LZO_EXTERN_C LZO_PUBLIC(_rettype) ++#endif ++#if !defined(LZO_PRIVATE) ++# define LZO_PRIVATE(_rettype) static _rettype __LZO_ENTRY ++#endif ++ ++/* exported __cdecl calling convention for assembler functions */ ++#if !defined(LZO_PUBLIC_CDECL) ++# define LZO_PUBLIC_CDECL(_rettype) \ ++ __LZO_EXPORT1 _rettype __LZO_EXPORT2 __LZO_CDECL ++#endif ++#if !defined(LZO_EXTERN_CDECL) ++# define LZO_EXTERN_CDECL(_rettype) __LZO_EXTERN_C LZO_PUBLIC_CDECL(_rettype) ++#endif ++ ++/* exported global variables (LZO currently uses no static variables and ++ * is fully thread safe) */ ++#if !defined(LZO_PUBLIC_VAR) ++# define LZO_PUBLIC_VAR(_type) \ ++ __LZO_EXPORT1 _type __LZO_EXPORT2 __LZO_DMODEL ++#endif ++#if !defined(LZO_EXTERN_VAR) ++# define LZO_EXTERN_VAR(_type) extern LZO_PUBLIC_VAR(_type) ++#endif ++ ++/*********************************************************************** ++// error codes and prototypes ++************************************************************************/ ++ ++/* Error codes for the compression/decompression functions. Negative ++ * values are errors, positive values will be used for special but ++ * normal events. ++ */ ++#define LZO_E_OK 0 ++#define LZO_E_ERROR (-1) ++#define LZO_E_OUT_OF_MEMORY (-2) /* not used right now */ ++#define LZO_E_NOT_COMPRESSIBLE (-3) /* not used right now */ ++#define LZO_E_INPUT_OVERRUN (-4) ++#define LZO_E_OUTPUT_OVERRUN (-5) ++#define LZO_E_LOOKBEHIND_OVERRUN (-6) ++#define LZO_E_EOF_NOT_FOUND (-7) ++#define LZO_E_INPUT_NOT_CONSUMED (-8) ++ ++/* lzo_init() should be the first function you call. ++ * Check the return code ! ++ * ++ * lzo_init() is a macro to allow checking that the library and the ++ * compiler's view of various types are consistent. ++ */ ++#define lzo_init() __lzo_init2(LZO_VERSION,(int)sizeof(short),(int)sizeof(int),\ ++ (int)sizeof(long),(int)sizeof(lzo_uint32),(int)sizeof(lzo_uint),\ ++ (int)lzo_sizeof_dict_t,(int)sizeof(char *),(int)sizeof(lzo_voidp),\ ++ (int)sizeof(lzo_compress_t)) ++ LZO_EXTERN(int) __lzo_init2(unsigned, int, int, int, int, int, int, ++ int, int, int); ++ ++/* checksum functions */ ++ LZO_EXTERN(lzo_uint32) ++ lzo_crc32(lzo_uint32 _c, const lzo_byte * _buf, lzo_uint _len); ++ ++/* misc. 
*/ ++ typedef union { ++ lzo_bytep p; ++ lzo_uint u; ++ } __lzo_pu_u; ++ typedef union { ++ lzo_bytep p; ++ lzo_uint32 u32; ++ } __lzo_pu32_u; ++ typedef union { ++ void *vp; ++ lzo_bytep bp; ++ lzo_uint32 u32; ++ long l; ++ } lzo_align_t; ++ ++#define LZO_PTR_ALIGN_UP(_ptr,_size) \ ++ ((_ptr) + (lzo_uint) __lzo_align_gap((const lzo_voidp)(_ptr),(lzo_uint)(_size))) ++ ++/* deprecated - only for backward compatibility */ ++#define LZO_ALIGN(_ptr,_size) LZO_PTR_ALIGN_UP(_ptr,_size) ++ ++#ifdef __cplusplus ++} /* extern "C" */ ++#endif ++#endif /* already included */ +Index: linux-2.6.16/fs/reiser4/plugin/compress/minilzo.c +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/plugin/compress/minilzo.c +@@ -0,0 +1,2155 @@ ++/* minilzo.c -- mini subset of the LZO real-time data compression library ++ adopted for reiser4 compression transform plugin. ++ ++ This file is part of the LZO real-time data compression library ++ and not included in any proprietary licenses of reiser4. ++ ++ Copyright (C) 2002 Markus Franz Xaver Johannes Oberhumer ++ Copyright (C) 2001 Markus Franz Xaver Johannes Oberhumer ++ Copyright (C) 2000 Markus Franz Xaver Johannes Oberhumer ++ Copyright (C) 1999 Markus Franz Xaver Johannes Oberhumer ++ Copyright (C) 1998 Markus Franz Xaver Johannes Oberhumer ++ Copyright (C) 1997 Markus Franz Xaver Johannes Oberhumer ++ Copyright (C) 1996 Markus Franz Xaver Johannes Oberhumer ++ All Rights Reserved. ++ ++ The LZO library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU General Public License as ++ published by the Free Software Foundation; either version 2 of ++ the License, or (at your option) any later version. ++ ++ The LZO library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ GNU General Public License for more details. ++ ++ You should have received a copy of the GNU General Public License ++ along with the LZO library; see the file COPYING. ++ If not, write to the Free Software Foundation, Inc., ++ 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ++ ++ Markus F.X.J. 
Oberhumer ++ ++ http://www.oberhumer.com/opensource/lzo/ ++ */ ++ ++/* ++ * NOTE: ++ * the full LZO package can be found at ++ * http://www.oberhumer.com/opensource/lzo/ ++ */ ++ ++#include "../../debug.h" /* for reiser4 assert macro -edward */ ++ ++#define __LZO_IN_MINILZO ++#define LZO_BUILD ++ ++#ifdef MINILZO_HAVE_CONFIG_H ++# include ++#endif ++ ++#undef LZO_HAVE_CONFIG_H ++#include "minilzo.h" ++ ++#if !defined(MINILZO_VERSION) || (MINILZO_VERSION != 0x1080) ++# error "version mismatch in miniLZO source files" ++#endif ++ ++#ifdef MINILZO_HAVE_CONFIG_H ++# define LZO_HAVE_CONFIG_H ++#endif ++ ++ ++#ifndef __LZO_CONF_H ++#define __LZO_CONF_H ++ ++#if !defined(__LZO_IN_MINILZO) ++# ifndef __LZOCONF_H ++# include ++# endif ++#endif ++ ++#if defined(__BOUNDS_CHECKING_ON) ++# include ++#else ++# define BOUNDS_CHECKING_OFF_DURING(stmt) stmt ++# define BOUNDS_CHECKING_OFF_IN_EXPR(expr) (expr) ++#endif ++ ++# define HAVE_MEMCMP ++# define HAVE_MEMCPY ++# define HAVE_MEMMOVE ++# define HAVE_MEMSET ++ ++#if defined(__LZO_DOS16) || defined(__LZO_WIN16) ++# define HAVE_MALLOC_H ++# define HAVE_HALLOC ++#endif ++ ++#undef NDEBUG ++#if !defined(LZO_DEBUG) ++# define NDEBUG ++#endif ++#if defined(LZO_DEBUG) || !defined(NDEBUG) ++# if !defined(NO_STDIO_H) ++# include ++# endif ++#endif ++# if 0 /* edward */ ++#include ++#endif /* edward */ ++ ++#if !defined(LZO_COMPILE_TIME_ASSERT) ++# define LZO_COMPILE_TIME_ASSERT(expr) \ ++ { typedef int __lzo_compile_time_assert_fail[1 - 2 * !(expr)]; } ++#endif ++ ++#if !defined(LZO_UNUSED) ++# if 1 ++# define LZO_UNUSED(var) ((void)&var) ++# elif 0 ++# define LZO_UNUSED(var) { typedef int __lzo_unused[sizeof(var) ? 2 : 1]; } ++# else ++# define LZO_UNUSED(parm) (parm = parm) ++# endif ++#endif ++ ++#if !defined(__inline__) && !defined(__GNUC__) ++# if defined(__cplusplus) ++# define __inline__ inline ++# else ++# define __inline__ ++# endif ++#endif ++ ++#if defined(NO_MEMCMP) ++# undef HAVE_MEMCMP ++#endif ++ ++#if !defined(HAVE_MEMSET) ++# undef memset ++# define memset lzo_memset ++#endif ++ ++# define LZO_BYTE(x) ((unsigned char) ((x) & 0xff)) ++ ++#define LZO_MAX(a,b) ((a) >= (b) ? (a) : (b)) ++#define LZO_MIN(a,b) ((a) <= (b) ? (a) : (b)) ++#define LZO_MAX3(a,b,c) ((a) >= (b) ? LZO_MAX(a,c) : LZO_MAX(b,c)) ++#define LZO_MIN3(a,b,c) ((a) <= (b) ? 
LZO_MIN(a,c) : LZO_MIN(b,c)) ++ ++#define lzo_sizeof(type) ((lzo_uint) (sizeof(type))) ++ ++#define LZO_HIGH(array) ((lzo_uint) (sizeof(array)/sizeof(*(array)))) ++ ++#define LZO_SIZE(bits) (1u << (bits)) ++#define LZO_MASK(bits) (LZO_SIZE(bits) - 1) ++ ++#define LZO_LSIZE(bits) (1ul << (bits)) ++#define LZO_LMASK(bits) (LZO_LSIZE(bits) - 1) ++ ++#define LZO_USIZE(bits) ((lzo_uint) 1 << (bits)) ++#define LZO_UMASK(bits) (LZO_USIZE(bits) - 1) ++ ++#define LZO_STYPE_MAX(b) (((1l << (8*(b)-2)) - 1l) + (1l << (8*(b)-2))) ++#define LZO_UTYPE_MAX(b) (((1ul << (8*(b)-1)) - 1ul) + (1ul << (8*(b)-1))) ++ ++#if !defined(SIZEOF_UNSIGNED) ++# if (UINT_MAX == 0xffff) ++# define SIZEOF_UNSIGNED 2 ++# elif (UINT_MAX == LZO_0xffffffffL) ++# define SIZEOF_UNSIGNED 4 ++# elif (UINT_MAX >= LZO_0xffffffffL) ++# define SIZEOF_UNSIGNED 8 ++# else ++# error "SIZEOF_UNSIGNED" ++# endif ++#endif ++ ++#if !defined(SIZEOF_UNSIGNED_LONG) ++# if (ULONG_MAX == LZO_0xffffffffL) ++# define SIZEOF_UNSIGNED_LONG 4 ++# elif (ULONG_MAX >= LZO_0xffffffffL) ++# define SIZEOF_UNSIGNED_LONG 8 ++# else ++# error "SIZEOF_UNSIGNED_LONG" ++# endif ++#endif ++ ++#if !defined(SIZEOF_SIZE_T) ++# define SIZEOF_SIZE_T SIZEOF_UNSIGNED ++#endif ++#if !defined(SIZE_T_MAX) ++# define SIZE_T_MAX LZO_UTYPE_MAX(SIZEOF_SIZE_T) ++#endif ++ ++#if 1 && defined(__LZO_i386) && (UINT_MAX == LZO_0xffffffffL) ++# if !defined(LZO_UNALIGNED_OK_2) && (USHRT_MAX == 0xffff) ++# define LZO_UNALIGNED_OK_2 ++# endif ++# if !defined(LZO_UNALIGNED_OK_4) && (LZO_UINT32_MAX == LZO_0xffffffffL) ++# define LZO_UNALIGNED_OK_4 ++# endif ++#endif ++ ++#if defined(LZO_UNALIGNED_OK_2) || defined(LZO_UNALIGNED_OK_4) ++# if !defined(LZO_UNALIGNED_OK) ++# define LZO_UNALIGNED_OK ++# endif ++#endif ++ ++#if defined(__LZO_NO_UNALIGNED) ++# undef LZO_UNALIGNED_OK ++# undef LZO_UNALIGNED_OK_2 ++# undef LZO_UNALIGNED_OK_4 ++#endif ++ ++#if defined(LZO_UNALIGNED_OK_2) && (USHRT_MAX != 0xffff) ++# error "LZO_UNALIGNED_OK_2 must not be defined on this system" ++#endif ++#if defined(LZO_UNALIGNED_OK_4) && (LZO_UINT32_MAX != LZO_0xffffffffL) ++# error "LZO_UNALIGNED_OK_4 must not be defined on this system" ++#endif ++ ++#if defined(__LZO_NO_ALIGNED) ++# undef LZO_ALIGNED_OK_4 ++#endif ++ ++#if defined(LZO_ALIGNED_OK_4) && (LZO_UINT32_MAX != LZO_0xffffffffL) ++# error "LZO_ALIGNED_OK_4 must not be defined on this system" ++#endif ++ ++#define LZO_LITTLE_ENDIAN 1234 ++#define LZO_BIG_ENDIAN 4321 ++#define LZO_PDP_ENDIAN 3412 ++ ++#if !defined(LZO_BYTE_ORDER) ++# if defined(MFX_BYTE_ORDER) ++# define LZO_BYTE_ORDER MFX_BYTE_ORDER ++# elif defined(__LZO_i386) ++# define LZO_BYTE_ORDER LZO_LITTLE_ENDIAN ++# elif defined(BYTE_ORDER) ++# define LZO_BYTE_ORDER BYTE_ORDER ++# elif defined(__BYTE_ORDER) ++# define LZO_BYTE_ORDER __BYTE_ORDER ++# endif ++#endif ++ ++#if defined(LZO_BYTE_ORDER) ++# if (LZO_BYTE_ORDER != LZO_LITTLE_ENDIAN) && \ ++ (LZO_BYTE_ORDER != LZO_BIG_ENDIAN) ++# error "invalid LZO_BYTE_ORDER" ++# endif ++#endif ++ ++#if defined(LZO_UNALIGNED_OK) && !defined(LZO_BYTE_ORDER) ++# error "LZO_BYTE_ORDER is not defined" ++#endif ++ ++#define LZO_OPTIMIZE_GNUC_i386_IS_BUGGY ++ ++#if defined(NDEBUG) && !defined(LZO_DEBUG) && !defined(__LZO_CHECKER) ++# if defined(__GNUC__) && defined(__i386__) ++# if !defined(LZO_OPTIMIZE_GNUC_i386_IS_BUGGY) ++# define LZO_OPTIMIZE_GNUC_i386 ++# endif ++# endif ++#endif ++ ++__LZO_EXTERN_C const lzo_uint32 _lzo_crc32_table[256]; ++ ++#define _LZO_STRINGIZE(x) #x ++#define _LZO_MEXPAND(x) _LZO_STRINGIZE(x) ++ ++#define _LZO_CONCAT2(a,b) a ## b 
++#define _LZO_CONCAT3(a,b,c) a ## b ## c ++#define _LZO_CONCAT4(a,b,c,d) a ## b ## c ## d ++#define _LZO_CONCAT5(a,b,c,d,e) a ## b ## c ## d ## e ++ ++#define _LZO_ECONCAT2(a,b) _LZO_CONCAT2(a,b) ++#define _LZO_ECONCAT3(a,b,c) _LZO_CONCAT3(a,b,c) ++#define _LZO_ECONCAT4(a,b,c,d) _LZO_CONCAT4(a,b,c,d) ++#define _LZO_ECONCAT5(a,b,c,d,e) _LZO_CONCAT5(a,b,c,d,e) ++ ++#ifndef __LZO_PTR_H ++#define __LZO_PTR_H ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++#if defined(__LZO_DOS16) || defined(__LZO_WIN16) ++# include ++# if 1 && defined(__WATCOMC__) ++# include ++ __LZO_EXTERN_C unsigned char _HShift; ++# define __LZO_HShift _HShift ++# elif 1 && defined(_MSC_VER) ++ __LZO_EXTERN_C unsigned short __near _AHSHIFT; ++# define __LZO_HShift ((unsigned) &_AHSHIFT) ++# elif defined(__LZO_WIN16) ++# define __LZO_HShift 3 ++# else ++# define __LZO_HShift 12 ++# endif ++# if !defined(_FP_SEG) && defined(FP_SEG) ++# define _FP_SEG FP_SEG ++# endif ++# if !defined(_FP_OFF) && defined(FP_OFF) ++# define _FP_OFF FP_OFF ++# endif ++#endif ++ ++#if !defined(lzo_ptrdiff_t) ++# if (UINT_MAX >= LZO_0xffffffffL) ++ typedef ptrdiff_t lzo_ptrdiff_t; ++# else ++ typedef long lzo_ptrdiff_t; ++# endif ++#endif ++ ++#if !defined(__LZO_HAVE_PTR_T) ++# if defined(lzo_ptr_t) ++# define __LZO_HAVE_PTR_T ++# endif ++#endif ++#if !defined(__LZO_HAVE_PTR_T) ++# if defined(SIZEOF_CHAR_P) && defined(SIZEOF_UNSIGNED_LONG) ++# if (SIZEOF_CHAR_P == SIZEOF_UNSIGNED_LONG) ++ typedef unsigned long lzo_ptr_t; ++ typedef long lzo_sptr_t; ++# define __LZO_HAVE_PTR_T ++# endif ++# endif ++#endif ++#if !defined(__LZO_HAVE_PTR_T) ++# if defined(SIZEOF_CHAR_P) && defined(SIZEOF_UNSIGNED) ++# if (SIZEOF_CHAR_P == SIZEOF_UNSIGNED) ++ typedef unsigned int lzo_ptr_t; ++ typedef int lzo_sptr_t; ++# define __LZO_HAVE_PTR_T ++# endif ++# endif ++#endif ++#if !defined(__LZO_HAVE_PTR_T) ++# if defined(SIZEOF_CHAR_P) && defined(SIZEOF_UNSIGNED_SHORT) ++# if (SIZEOF_CHAR_P == SIZEOF_UNSIGNED_SHORT) ++ typedef unsigned short lzo_ptr_t; ++ typedef short lzo_sptr_t; ++# define __LZO_HAVE_PTR_T ++# endif ++# endif ++#endif ++#if !defined(__LZO_HAVE_PTR_T) ++# if defined(LZO_HAVE_CONFIG_H) || defined(SIZEOF_CHAR_P) ++# error "no suitable type for lzo_ptr_t" ++# else ++ typedef unsigned long lzo_ptr_t; ++ typedef long lzo_sptr_t; ++# define __LZO_HAVE_PTR_T ++# endif ++#endif ++ ++#if defined(__LZO_DOS16) || defined(__LZO_WIN16) ++#define PTR(a) ((lzo_bytep) (a)) ++#define PTR_ALIGNED_4(a) ((_FP_OFF(a) & 3) == 0) ++#define PTR_ALIGNED2_4(a,b) (((_FP_OFF(a) | _FP_OFF(b)) & 3) == 0) ++#else ++#define PTR(a) ((lzo_ptr_t) (a)) ++#define PTR_LINEAR(a) PTR(a) ++#define PTR_ALIGNED_4(a) ((PTR_LINEAR(a) & 3) == 0) ++#define PTR_ALIGNED_8(a) ((PTR_LINEAR(a) & 7) == 0) ++#define PTR_ALIGNED2_4(a,b) (((PTR_LINEAR(a) | PTR_LINEAR(b)) & 3) == 0) ++#define PTR_ALIGNED2_8(a,b) (((PTR_LINEAR(a) | PTR_LINEAR(b)) & 7) == 0) ++#endif ++ ++#define PTR_LT(a,b) (PTR(a) < PTR(b)) ++#define PTR_GE(a,b) (PTR(a) >= PTR(b)) ++#define PTR_DIFF(a,b) ((lzo_ptrdiff_t) (PTR(a) - PTR(b))) ++#define pd(a,b) ((lzo_uint) ((a)-(b))) ++ ++ typedef union { ++ char a_char; ++ unsigned char a_uchar; ++ short a_short; ++ unsigned short a_ushort; ++ int a_int; ++ unsigned int a_uint; ++ long a_long; ++ unsigned long a_ulong; ++ lzo_int a_lzo_int; ++ lzo_uint a_lzo_uint; ++ lzo_int32 a_lzo_int32; ++ lzo_uint32 a_lzo_uint32; ++ ptrdiff_t a_ptrdiff_t; ++ lzo_ptrdiff_t a_lzo_ptrdiff_t; ++ lzo_ptr_t a_lzo_ptr_t; ++ lzo_voidp a_lzo_voidp; ++ void *a_void_p; ++ lzo_bytep a_lzo_bytep; ++ lzo_bytepp 
a_lzo_bytepp; ++ lzo_uintp a_lzo_uintp; ++ lzo_uint *a_lzo_uint_p; ++ lzo_uint32p a_lzo_uint32p; ++ lzo_uint32 *a_lzo_uint32_p; ++ unsigned char *a_uchar_p; ++ char *a_char_p; ++ } lzo_full_align_t; ++ ++#ifdef __cplusplus ++} ++#endif ++#endif ++#define LZO_DETERMINISTIC ++#define LZO_DICT_USE_PTR ++#if defined(__LZO_DOS16) || defined(__LZO_WIN16) || defined(__LZO_STRICT_16BIT) ++# undef LZO_DICT_USE_PTR ++#endif ++#if defined(LZO_DICT_USE_PTR) ++# define lzo_dict_t const lzo_bytep ++# define lzo_dict_p lzo_dict_t __LZO_MMODEL * ++#else ++# define lzo_dict_t lzo_uint ++# define lzo_dict_p lzo_dict_t __LZO_MMODEL * ++#endif ++#if !defined(lzo_moff_t) ++#define lzo_moff_t lzo_uint ++#endif ++#endif ++static lzo_ptr_t __lzo_ptr_linear(const lzo_voidp ptr) ++{ ++ lzo_ptr_t p; ++ ++#if defined(__LZO_DOS16) || defined(__LZO_WIN16) ++ p = (((lzo_ptr_t) (_FP_SEG(ptr))) << (16 - __LZO_HShift)) + ++ (_FP_OFF(ptr)); ++#else ++ p = PTR_LINEAR(ptr); ++#endif ++ ++ return p; ++} ++ ++static unsigned __lzo_align_gap(const lzo_voidp ptr, lzo_uint size) ++{ ++ lzo_ptr_t p, s, n; ++ ++ assert("lzo-01", size > 0); ++ ++ p = __lzo_ptr_linear(ptr); ++ s = (lzo_ptr_t) (size - 1); ++ n = (((p + s) / size) * size) - p; ++ ++ assert("lzo-02", (long)n >= 0); ++ assert("lzo-03", n <= s); ++ ++ return (unsigned)n; ++} ++ ++#ifndef __LZO_UTIL_H ++#define __LZO_UTIL_H ++ ++#ifndef __LZO_CONF_H ++#endif ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++#if 1 && defined(HAVE_MEMCPY) ++#if !defined(__LZO_DOS16) && !defined(__LZO_WIN16) ++ ++#define MEMCPY8_DS(dest,src,len) \ ++ memcpy(dest,src,len); \ ++ dest += len; \ ++ src += len ++ ++#endif ++#endif ++ ++#if !defined(MEMCPY8_DS) ++ ++#define MEMCPY8_DS(dest,src,len) \ ++ { register lzo_uint __l = (len) / 8; \ ++ do { \ ++ *dest++ = *src++; \ ++ *dest++ = *src++; \ ++ *dest++ = *src++; \ ++ *dest++ = *src++; \ ++ *dest++ = *src++; \ ++ *dest++ = *src++; \ ++ *dest++ = *src++; \ ++ *dest++ = *src++; \ ++ } while (--__l > 0); } ++ ++#endif ++ ++#define MEMCPY_DS(dest,src,len) \ ++ do *dest++ = *src++; \ ++ while (--len > 0) ++ ++#define MEMMOVE_DS(dest,src,len) \ ++ do *dest++ = *src++; \ ++ while (--len > 0) ++ ++ ++#if (LZO_UINT_MAX <= SIZE_T_MAX) && defined(HAVE_MEMSET) ++ ++#define BZERO8_PTR(s,l,n) memset((s),0,(lzo_uint)(l)*(n)) ++ ++#else ++ ++#define BZERO8_PTR(s,l,n) \ ++ lzo_memset((lzo_voidp)(s),0,(lzo_uint)(l)*(n)) ++ ++#endif ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif ++ ++/* If you use the LZO library in a product, you *must* keep this ++ * copyright string in the executable of your product. 
++ */ ++ ++static const lzo_byte __lzo_copyright[] = ++#if !defined(__LZO_IN_MINLZO) ++ LZO_VERSION_STRING; ++#else ++ "\n\n\n" ++ "LZO real-time data compression library.\n" ++ "Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002 Markus Franz Xaver Johannes Oberhumer\n" ++ "\n" ++ "http://www.oberhumer.com/opensource/lzo/\n" ++ "\n" ++ "LZO version: v" LZO_VERSION_STRING ", " LZO_VERSION_DATE "\n" ++ "LZO build date: " __DATE__ " " __TIME__ "\n\n" ++ "LZO special compilation options:\n" ++#ifdef __cplusplus ++ " __cplusplus\n" ++#endif ++#if defined(__PIC__) ++ " __PIC__\n" ++#elif defined(__pic__) ++ " __pic__\n" ++#endif ++#if (UINT_MAX < LZO_0xffffffffL) ++ " 16BIT\n" ++#endif ++#if defined(__LZO_STRICT_16BIT) ++ " __LZO_STRICT_16BIT\n" ++#endif ++#if (UINT_MAX > LZO_0xffffffffL) ++ " UINT_MAX=" _LZO_MEXPAND(UINT_MAX) "\n" ++#endif ++#if (ULONG_MAX > LZO_0xffffffffL) ++ " ULONG_MAX=" _LZO_MEXPAND(ULONG_MAX) "\n" ++#endif ++#if defined(LZO_BYTE_ORDER) ++ " LZO_BYTE_ORDER=" _LZO_MEXPAND(LZO_BYTE_ORDER) "\n" ++#endif ++#if defined(LZO_UNALIGNED_OK_2) ++ " LZO_UNALIGNED_OK_2\n" ++#endif ++#if defined(LZO_UNALIGNED_OK_4) ++ " LZO_UNALIGNED_OK_4\n" ++#endif ++#if defined(LZO_ALIGNED_OK_4) ++ " LZO_ALIGNED_OK_4\n" ++#endif ++#if defined(LZO_DICT_USE_PTR) ++ " LZO_DICT_USE_PTR\n" ++#endif ++#if defined(__LZO_QUERY_COMPRESS) ++ " __LZO_QUERY_COMPRESS\n" ++#endif ++#if defined(__LZO_QUERY_DECOMPRESS) ++ " __LZO_QUERY_DECOMPRESS\n" ++#endif ++#if defined(__LZO_IN_MINILZO) ++ " __LZO_IN_MINILZO\n" ++#endif ++ "\n\n" "$Id: LZO " LZO_VERSION_STRING " built " __DATE__ " " __TIME__ ++#if defined(__GNUC__) && defined(__VERSION__) ++ " by gcc " __VERSION__ ++#elif defined(__BORLANDC__) ++ " by Borland C " _LZO_MEXPAND(__BORLANDC__) ++#elif defined(_MSC_VER) ++ " by Microsoft C " _LZO_MEXPAND(_MSC_VER) ++#elif defined(__PUREC__) ++ " by Pure C " _LZO_MEXPAND(__PUREC__) ++#elif defined(__SC__) ++ " by Symantec C " _LZO_MEXPAND(__SC__) ++#elif defined(__TURBOC__) ++ " by Turbo C " _LZO_MEXPAND(__TURBOC__) ++#elif defined(__WATCOMC__) ++ " by Watcom C " _LZO_MEXPAND(__WATCOMC__) ++#endif ++ " $\n" ++ "$Copyright: LZO (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002 Markus Franz Xaver Johannes Oberhumer $\n"; ++#endif ++ ++#define LZO_BASE 65521u ++#define LZO_NMAX 5552 ++ ++#define LZO_DO1(buf,i) {s1 += buf[i]; s2 += s1;} ++#define LZO_DO2(buf,i) LZO_DO1(buf,i); LZO_DO1(buf,i+1); ++#define LZO_DO4(buf,i) LZO_DO2(buf,i); LZO_DO2(buf,i+2); ++#define LZO_DO8(buf,i) LZO_DO4(buf,i); LZO_DO4(buf,i+4); ++#define LZO_DO16(buf,i) LZO_DO8(buf,i); LZO_DO8(buf,i+8); ++ ++# define IS_SIGNED(type) (((type) (-1)) < ((type) 0)) ++# define IS_UNSIGNED(type) (((type) (-1)) > ((type) 0)) ++ ++#define IS_POWER_OF_2(x) (((x) & ((x) - 1)) == 0) ++ ++static lzo_bool schedule_insns_bug(void); ++static lzo_bool strength_reduce_bug(int *); ++ ++# define __lzo_assert(x) ((x) ? 
1 : 0) ++ ++#undef COMPILE_TIME_ASSERT ++ ++# define COMPILE_TIME_ASSERT(expr) LZO_COMPILE_TIME_ASSERT(expr) ++ ++static lzo_bool basic_integral_check(void) ++{ ++ lzo_bool r = 1; ++ ++ COMPILE_TIME_ASSERT(CHAR_BIT == 8); ++ COMPILE_TIME_ASSERT(sizeof(char) == 1); ++ COMPILE_TIME_ASSERT(sizeof(short) >= 2); ++ COMPILE_TIME_ASSERT(sizeof(long) >= 4); ++ COMPILE_TIME_ASSERT(sizeof(int) >= sizeof(short)); ++ COMPILE_TIME_ASSERT(sizeof(long) >= sizeof(int)); ++ ++ COMPILE_TIME_ASSERT(sizeof(lzo_uint) == sizeof(lzo_int)); ++ COMPILE_TIME_ASSERT(sizeof(lzo_uint32) == sizeof(lzo_int32)); ++ ++ COMPILE_TIME_ASSERT(sizeof(lzo_uint32) >= 4); ++ COMPILE_TIME_ASSERT(sizeof(lzo_uint32) >= sizeof(unsigned)); ++#if defined(__LZO_STRICT_16BIT) ++ COMPILE_TIME_ASSERT(sizeof(lzo_uint) == 2); ++#else ++ COMPILE_TIME_ASSERT(sizeof(lzo_uint) >= 4); ++ COMPILE_TIME_ASSERT(sizeof(lzo_uint) >= sizeof(unsigned)); ++#endif ++ ++#if (USHRT_MAX == 65535u) ++ COMPILE_TIME_ASSERT(sizeof(short) == 2); ++#elif (USHRT_MAX == LZO_0xffffffffL) ++ COMPILE_TIME_ASSERT(sizeof(short) == 4); ++#elif (USHRT_MAX >= LZO_0xffffffffL) ++ COMPILE_TIME_ASSERT(sizeof(short) > 4); ++#endif ++#if 0 /* to make gcc happy -edward */ ++#if (UINT_MAX == 65535u) ++ COMPILE_TIME_ASSERT(sizeof(int) == 2); ++#elif (UINT_MAX == LZO_0xffffffffL) ++ COMPILE_TIME_ASSERT(sizeof(int) == 4); ++#elif (UINT_MAX >= LZO_0xffffffffL) ++ COMPILE_TIME_ASSERT(sizeof(int) > 4); ++#endif ++#if (ULONG_MAX == 65535ul) ++ COMPILE_TIME_ASSERT(sizeof(long) == 2); ++#elif (ULONG_MAX == LZO_0xffffffffL) ++ COMPILE_TIME_ASSERT(sizeof(long) == 4); ++#elif (ULONG_MAX >= LZO_0xffffffffL) ++ COMPILE_TIME_ASSERT(sizeof(long) > 4); ++#endif ++#if defined(SIZEOF_UNSIGNED) ++ COMPILE_TIME_ASSERT(SIZEOF_UNSIGNED == sizeof(unsigned)); ++#endif ++#if defined(SIZEOF_UNSIGNED_LONG) ++ COMPILE_TIME_ASSERT(SIZEOF_UNSIGNED_LONG == sizeof(unsigned long)); ++#endif ++#if defined(SIZEOF_UNSIGNED_SHORT) ++ COMPILE_TIME_ASSERT(SIZEOF_UNSIGNED_SHORT == sizeof(unsigned short)); ++#endif ++#if !defined(__LZO_IN_MINILZO) ++#if defined(SIZEOF_SIZE_T) ++ COMPILE_TIME_ASSERT(SIZEOF_SIZE_T == sizeof(size_t)); ++#endif ++#endif ++#endif /* -edward */ ++ ++ COMPILE_TIME_ASSERT(IS_UNSIGNED(unsigned char)); ++ COMPILE_TIME_ASSERT(IS_UNSIGNED(unsigned short)); ++ COMPILE_TIME_ASSERT(IS_UNSIGNED(unsigned)); ++ COMPILE_TIME_ASSERT(IS_UNSIGNED(unsigned long)); ++ COMPILE_TIME_ASSERT(IS_SIGNED(short)); ++ COMPILE_TIME_ASSERT(IS_SIGNED(int)); ++ COMPILE_TIME_ASSERT(IS_SIGNED(long)); ++ ++ COMPILE_TIME_ASSERT(IS_UNSIGNED(lzo_uint32)); ++ COMPILE_TIME_ASSERT(IS_UNSIGNED(lzo_uint)); ++ COMPILE_TIME_ASSERT(IS_SIGNED(lzo_int32)); ++ COMPILE_TIME_ASSERT(IS_SIGNED(lzo_int)); ++ ++ COMPILE_TIME_ASSERT(INT_MAX == LZO_STYPE_MAX(sizeof(int))); ++ COMPILE_TIME_ASSERT(UINT_MAX == LZO_UTYPE_MAX(sizeof(unsigned))); ++ COMPILE_TIME_ASSERT(LONG_MAX == LZO_STYPE_MAX(sizeof(long))); ++ COMPILE_TIME_ASSERT(ULONG_MAX == LZO_UTYPE_MAX(sizeof(unsigned long))); ++ // COMPILE_TIME_ASSERT(SHRT_MAX == LZO_STYPE_MAX(sizeof(short))); /* edward */ ++ COMPILE_TIME_ASSERT(USHRT_MAX == LZO_UTYPE_MAX(sizeof(unsigned short))); ++ COMPILE_TIME_ASSERT(LZO_UINT32_MAX == ++ LZO_UTYPE_MAX(sizeof(lzo_uint32))); ++ COMPILE_TIME_ASSERT(LZO_UINT_MAX == LZO_UTYPE_MAX(sizeof(lzo_uint))); ++#if !defined(__LZO_IN_MINILZO) ++ COMPILE_TIME_ASSERT(SIZE_T_MAX == LZO_UTYPE_MAX(sizeof(size_t))); ++#endif ++ ++ r &= __lzo_assert(LZO_BYTE(257) == 1); ++ ++ return r; ++} ++ ++static lzo_bool basic_ptr_check(void) ++{ ++ lzo_bool r = 1; ++ ++ 
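++	/* The compile-time checks that follow pin down the size relations
++	   LZO's pointer arithmetic relies on: a data pointer must fit in
++	   lzo_ptr_t, and lzo_uint must be no wider than a pointer. */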
COMPILE_TIME_ASSERT(sizeof(char *) >= sizeof(int)); ++ COMPILE_TIME_ASSERT(sizeof(lzo_byte *) >= sizeof(char *)); ++ ++ COMPILE_TIME_ASSERT(sizeof(lzo_voidp) == sizeof(lzo_byte *)); ++ COMPILE_TIME_ASSERT(sizeof(lzo_voidp) == sizeof(lzo_voidpp)); ++ COMPILE_TIME_ASSERT(sizeof(lzo_voidp) == sizeof(lzo_bytepp)); ++ COMPILE_TIME_ASSERT(sizeof(lzo_voidp) >= sizeof(lzo_uint)); ++ ++ COMPILE_TIME_ASSERT(sizeof(lzo_ptr_t) == sizeof(lzo_voidp)); ++ COMPILE_TIME_ASSERT(sizeof(lzo_ptr_t) == sizeof(lzo_sptr_t)); ++ COMPILE_TIME_ASSERT(sizeof(lzo_ptr_t) >= sizeof(lzo_uint)); ++ ++ COMPILE_TIME_ASSERT(sizeof(lzo_ptrdiff_t) >= 4); ++ COMPILE_TIME_ASSERT(sizeof(lzo_ptrdiff_t) >= sizeof(ptrdiff_t)); ++ ++ COMPILE_TIME_ASSERT(sizeof(ptrdiff_t) >= sizeof(size_t)); ++ COMPILE_TIME_ASSERT(sizeof(lzo_ptrdiff_t) >= sizeof(lzo_uint)); ++ ++#if defined(SIZEOF_CHAR_P) ++ COMPILE_TIME_ASSERT(SIZEOF_CHAR_P == sizeof(char *)); ++#endif ++#if defined(SIZEOF_PTRDIFF_T) ++ COMPILE_TIME_ASSERT(SIZEOF_PTRDIFF_T == sizeof(ptrdiff_t)); ++#endif ++ ++ COMPILE_TIME_ASSERT(IS_SIGNED(ptrdiff_t)); ++ COMPILE_TIME_ASSERT(IS_UNSIGNED(size_t)); ++ COMPILE_TIME_ASSERT(IS_SIGNED(lzo_ptrdiff_t)); ++ COMPILE_TIME_ASSERT(IS_SIGNED(lzo_sptr_t)); ++ COMPILE_TIME_ASSERT(IS_UNSIGNED(lzo_ptr_t)); ++ COMPILE_TIME_ASSERT(IS_UNSIGNED(lzo_moff_t)); ++ ++ return r; ++} ++ ++static lzo_bool ptr_check(void) ++{ ++ lzo_bool r = 1; ++ int i; ++ char _wrkmem[10 * sizeof(lzo_byte *) + sizeof(lzo_full_align_t)]; ++ lzo_bytep wrkmem; ++ lzo_bytepp dict; ++ unsigned char x[4 * sizeof(lzo_full_align_t)]; ++ long d; ++ lzo_full_align_t a; ++ lzo_full_align_t u; ++ ++ for (i = 0; i < (int)sizeof(x); i++) ++ x[i] = LZO_BYTE(i); ++ ++ wrkmem = ++ LZO_PTR_ALIGN_UP((lzo_byte *) _wrkmem, sizeof(lzo_full_align_t)); ++ ++ u.a_lzo_bytep = wrkmem; ++ dict = u.a_lzo_bytepp; ++ ++ d = (long)((const lzo_bytep)dict - (const lzo_bytep)_wrkmem); ++ r &= __lzo_assert(d >= 0); ++ r &= __lzo_assert(d < (long)sizeof(lzo_full_align_t)); ++ ++ memset(&a, 0, sizeof(a)); ++ r &= __lzo_assert(a.a_lzo_voidp == NULL); ++ ++ memset(&a, 0xff, sizeof(a)); ++ r &= __lzo_assert(a.a_ushort == USHRT_MAX); ++ r &= __lzo_assert(a.a_uint == UINT_MAX); ++ r &= __lzo_assert(a.a_ulong == ULONG_MAX); ++ r &= __lzo_assert(a.a_lzo_uint == LZO_UINT_MAX); ++ r &= __lzo_assert(a.a_lzo_uint32 == LZO_UINT32_MAX); ++ ++ if (r == 1) { ++ for (i = 0; i < 8; i++) ++ r &= __lzo_assert((const lzo_voidp)(&dict[i]) == ++ (const ++ lzo_voidp)(&wrkmem[i * ++ sizeof(lzo_byte ++ *)])); ++ } ++ ++ memset(&a, 0, sizeof(a)); ++ r &= __lzo_assert(a.a_char_p == NULL); ++ r &= __lzo_assert(a.a_lzo_bytep == NULL); ++ r &= __lzo_assert(NULL == (void *)0); ++ if (r == 1) { ++ for (i = 0; i < 10; i++) ++ dict[i] = wrkmem; ++ BZERO8_PTR(dict + 1, sizeof(dict[0]), 8); ++ r &= __lzo_assert(dict[0] == wrkmem); ++ for (i = 1; i < 9; i++) ++ r &= __lzo_assert(dict[i] == NULL); ++ r &= __lzo_assert(dict[9] == wrkmem); ++ } ++ ++ if (r == 1) { ++ unsigned k = 1; ++ const unsigned n = (unsigned)sizeof(lzo_uint32); ++ lzo_byte *p0; ++ lzo_byte *p1; ++ ++ k += __lzo_align_gap(&x[k], n); ++ p0 = (lzo_bytep) & x[k]; ++#if defined(PTR_LINEAR) ++ r &= __lzo_assert((PTR_LINEAR(p0) & (n - 1)) == 0); ++#else ++ r &= __lzo_assert(n == 4); ++ r &= __lzo_assert(PTR_ALIGNED_4(p0)); ++#endif ++ ++ r &= __lzo_assert(k >= 1); ++ p1 = (lzo_bytep) & x[1]; ++ r &= __lzo_assert(PTR_GE(p0, p1)); ++ ++ r &= __lzo_assert(k < 1 + n); ++ p1 = (lzo_bytep) & x[1 + n]; ++ r &= __lzo_assert(PTR_LT(p0, p1)); ++ ++ if (r == 1) { ++ lzo_uint32 v0, v1; ++ ++ 
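++			/* Read two 32-bit words from the aligned offset
++			   computed above, going through the
++			   lzo_full_align_t union; x[] was filled with
++			   non-zero byte values at these indices, so both
++			   loads are expected to come back non-zero. */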
u.a_uchar_p = &x[k]; ++ v0 = *u.a_lzo_uint32_p; ++ u.a_uchar_p = &x[k + n]; ++ v1 = *u.a_lzo_uint32_p; ++ ++ r &= __lzo_assert(v0 > 0); ++ r &= __lzo_assert(v1 > 0); ++ } ++ } ++ ++ return r; ++} ++ ++static int _lzo_config_check(void) ++{ ++ lzo_bool r = 1; ++ int i; ++ union { ++ lzo_uint32 a; ++ unsigned short b; ++ lzo_uint32 aa[4]; ++ unsigned char x[4 * sizeof(lzo_full_align_t)]; ++ } ++ u; ++ ++ COMPILE_TIME_ASSERT((int)((unsigned char)((signed char)-1)) == 255); ++ COMPILE_TIME_ASSERT((((unsigned char)128) << (int)(8 * sizeof(int) - 8)) ++ < 0); ++ ++ r &= basic_integral_check(); ++ r &= basic_ptr_check(); ++ if (r != 1) ++ return LZO_E_ERROR; ++ ++ u.a = 0; ++ u.b = 0; ++ for (i = 0; i < (int)sizeof(u.x); i++) ++ u.x[i] = LZO_BYTE(i); ++ ++#if defined(LZO_BYTE_ORDER) ++ if (r == 1) { ++# if (LZO_BYTE_ORDER == LZO_LITTLE_ENDIAN) ++ lzo_uint32 a = (lzo_uint32) (u.a & LZO_0xffffffffL); ++ unsigned short b = (unsigned short)(u.b & 0xffff); ++ r &= __lzo_assert(a == 0x03020100L); ++ r &= __lzo_assert(b == 0x0100); ++# elif (LZO_BYTE_ORDER == LZO_BIG_ENDIAN) ++ lzo_uint32 a = u.a >> (8 * sizeof(u.a) - 32); ++ unsigned short b = u.b >> (8 * sizeof(u.b) - 16); ++ r &= __lzo_assert(a == 0x00010203L); ++ r &= __lzo_assert(b == 0x0001); ++# else ++# error "invalid LZO_BYTE_ORDER" ++# endif ++ } ++#endif ++ ++#if defined(LZO_UNALIGNED_OK_2) ++ COMPILE_TIME_ASSERT(sizeof(short) == 2); ++ if (r == 1) { ++ unsigned short b[4]; ++ ++ for (i = 0; i < 4; i++) ++ b[i] = *(const unsigned short *)&u.x[i]; ++ ++# if (LZO_BYTE_ORDER == LZO_LITTLE_ENDIAN) ++ r &= __lzo_assert(b[0] == 0x0100); ++ r &= __lzo_assert(b[1] == 0x0201); ++ r &= __lzo_assert(b[2] == 0x0302); ++ r &= __lzo_assert(b[3] == 0x0403); ++# elif (LZO_BYTE_ORDER == LZO_BIG_ENDIAN) ++ r &= __lzo_assert(b[0] == 0x0001); ++ r &= __lzo_assert(b[1] == 0x0102); ++ r &= __lzo_assert(b[2] == 0x0203); ++ r &= __lzo_assert(b[3] == 0x0304); ++# endif ++ } ++#endif ++ ++#if defined(LZO_UNALIGNED_OK_4) ++ COMPILE_TIME_ASSERT(sizeof(lzo_uint32) == 4); ++ if (r == 1) { ++ lzo_uint32 a[4]; ++ ++ for (i = 0; i < 4; i++) ++ a[i] = *(const lzo_uint32 *)&u.x[i]; ++ ++# if (LZO_BYTE_ORDER == LZO_LITTLE_ENDIAN) ++ r &= __lzo_assert(a[0] == 0x03020100L); ++ r &= __lzo_assert(a[1] == 0x04030201L); ++ r &= __lzo_assert(a[2] == 0x05040302L); ++ r &= __lzo_assert(a[3] == 0x06050403L); ++# elif (LZO_BYTE_ORDER == LZO_BIG_ENDIAN) ++ r &= __lzo_assert(a[0] == 0x00010203L); ++ r &= __lzo_assert(a[1] == 0x01020304L); ++ r &= __lzo_assert(a[2] == 0x02030405L); ++ r &= __lzo_assert(a[3] == 0x03040506L); ++# endif ++ } ++#endif ++ ++#if defined(LZO_ALIGNED_OK_4) ++ COMPILE_TIME_ASSERT(sizeof(lzo_uint32) == 4); ++#endif ++ ++ COMPILE_TIME_ASSERT(lzo_sizeof_dict_t == sizeof(lzo_dict_t)); ++ ++ if (r == 1) { ++ r &= __lzo_assert(!schedule_insns_bug()); ++ } ++ ++ if (r == 1) { ++ static int x[3]; ++ static unsigned xn = 3; ++ register unsigned j; ++ ++ for (j = 0; j < xn; j++) ++ x[j] = (int)j - 3; ++ r &= __lzo_assert(!strength_reduce_bug(x)); ++ } ++ ++ if (r == 1) { ++ r &= ptr_check(); ++ } ++ ++ return r == 1 ? LZO_E_OK : LZO_E_ERROR; ++} ++ ++static lzo_bool schedule_insns_bug(void) ++{ ++#if defined(__LZO_CHECKER) ++ return 0; ++#else ++ const int clone[] = { 1, 2, 0 }; ++ const int *q; ++ q = clone; ++ return (*q) ? 
0 : 1; ++#endif ++} ++ ++static lzo_bool strength_reduce_bug(int *x) ++{ ++ return x[0] != -3 || x[1] != -2 || x[2] != -1; ++} ++ ++#undef COMPILE_TIME_ASSERT ++ ++LZO_PUBLIC(int) ++ __lzo_init2(unsigned v, int s1, int s2, int s3, int s4, int s5, ++ int s6, int s7, int s8, int s9) ++{ ++ int r; ++ ++ if (v == 0) ++ return LZO_E_ERROR; ++ ++ r = (s1 == -1 || s1 == (int)sizeof(short)) && ++ (s2 == -1 || s2 == (int)sizeof(int)) && ++ (s3 == -1 || s3 == (int)sizeof(long)) && ++ (s4 == -1 || s4 == (int)sizeof(lzo_uint32)) && ++ (s5 == -1 || s5 == (int)sizeof(lzo_uint)) && ++ (s6 == -1 || s6 == (int)lzo_sizeof_dict_t) && ++ (s7 == -1 || s7 == (int)sizeof(char *)) && ++ (s8 == -1 || s8 == (int)sizeof(lzo_voidp)) && ++ (s9 == -1 || s9 == (int)sizeof(lzo_compress_t)); ++ if (!r) ++ return LZO_E_ERROR; ++ ++ r = _lzo_config_check(); ++ if (r != LZO_E_OK) ++ return r; ++ ++ return r; ++} ++ ++#if !defined(__LZO_IN_MINILZO) ++ ++LZO_EXTERN(int) ++ __lzo_init(unsigned v, int s1, int s2, int s3, int s4, int s5, int s6, int s7); ++ ++LZO_PUBLIC(int) ++__lzo_init(unsigned v, int s1, int s2, int s3, int s4, int s5, int s6, int s7) ++{ ++ if (v == 0 || v > 0x1010) ++ return LZO_E_ERROR; ++ return __lzo_init2(v, s1, s2, s3, s4, s5, -1, -1, s6, s7); ++} ++ ++#endif ++ ++#define do_compress _lzo1x_1_do_compress ++ ++#define LZO_NEED_DICT_H ++#define D_BITS 14 ++#define D_INDEX1(d,p) d = DM((0x21*DX3(p,5,5,6)) >> 5) ++#define D_INDEX2(d,p) d = (d & (D_MASK & 0x7ff)) ^ (D_HIGH | 0x1f) ++ ++#ifndef __LZO_CONFIG1X_H ++#define __LZO_CONFIG1X_H ++ ++#if !defined(LZO1X) && !defined(LZO1Y) && !defined(LZO1Z) ++# define LZO1X ++#endif ++ ++#if !defined(__LZO_IN_MINILZO) ++#include ++#endif ++ ++#define LZO_EOF_CODE ++#undef LZO_DETERMINISTIC ++ ++#define M1_MAX_OFFSET 0x0400 ++#ifndef M2_MAX_OFFSET ++#define M2_MAX_OFFSET 0x0800 ++#endif ++#define M3_MAX_OFFSET 0x4000 ++#define M4_MAX_OFFSET 0xbfff ++ ++#define MX_MAX_OFFSET (M1_MAX_OFFSET + M2_MAX_OFFSET) ++ ++#define M1_MIN_LEN 2 ++#define M1_MAX_LEN 2 ++#define M2_MIN_LEN 3 ++#ifndef M2_MAX_LEN ++#define M2_MAX_LEN 8 ++#endif ++#define M3_MIN_LEN 3 ++#define M3_MAX_LEN 33 ++#define M4_MIN_LEN 3 ++#define M4_MAX_LEN 9 ++ ++#define M1_MARKER 0 ++#define M2_MARKER 64 ++#define M3_MARKER 32 ++#define M4_MARKER 16 ++ ++#ifndef MIN_LOOKAHEAD ++#define MIN_LOOKAHEAD (M2_MAX_LEN + 1) ++#endif ++ ++#if defined(LZO_NEED_DICT_H) ++ ++#ifndef LZO_HASH ++#define LZO_HASH LZO_HASH_LZO_INCREMENTAL_B ++#endif ++#define DL_MIN_LEN M2_MIN_LEN ++ ++#ifndef __LZO_DICT_H ++#define __LZO_DICT_H ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++#if !defined(D_BITS) && defined(DBITS) ++# define D_BITS DBITS ++#endif ++#if !defined(D_BITS) ++# error "D_BITS is not defined" ++#endif ++#if (D_BITS < 16) ++# define D_SIZE LZO_SIZE(D_BITS) ++# define D_MASK LZO_MASK(D_BITS) ++#else ++# define D_SIZE LZO_USIZE(D_BITS) ++# define D_MASK LZO_UMASK(D_BITS) ++#endif ++#define D_HIGH ((D_MASK >> 1) + 1) ++ ++#if !defined(DD_BITS) ++# define DD_BITS 0 ++#endif ++#define DD_SIZE LZO_SIZE(DD_BITS) ++#define DD_MASK LZO_MASK(DD_BITS) ++ ++#if !defined(DL_BITS) ++# define DL_BITS (D_BITS - DD_BITS) ++#endif ++#if (DL_BITS < 16) ++# define DL_SIZE LZO_SIZE(DL_BITS) ++# define DL_MASK LZO_MASK(DL_BITS) ++#else ++# define DL_SIZE LZO_USIZE(DL_BITS) ++# define DL_MASK LZO_UMASK(DL_BITS) ++#endif ++ ++#if (D_BITS != DL_BITS + DD_BITS) ++# error "D_BITS does not match" ++#endif ++#if (D_BITS < 8 || D_BITS > 18) ++# error "invalid D_BITS" ++#endif ++#if (DL_BITS < 8 || DL_BITS > 20) ++# error "invalid 
DL_BITS" ++#endif ++#if (DD_BITS < 0 || DD_BITS > 6) ++# error "invalid DD_BITS" ++#endif ++ ++#if !defined(DL_MIN_LEN) ++# define DL_MIN_LEN 3 ++#endif ++#if !defined(DL_SHIFT) ++# define DL_SHIFT ((DL_BITS + (DL_MIN_LEN - 1)) / DL_MIN_LEN) ++#endif ++ ++#define LZO_HASH_GZIP 1 ++#define LZO_HASH_GZIP_INCREMENTAL 2 ++#define LZO_HASH_LZO_INCREMENTAL_A 3 ++#define LZO_HASH_LZO_INCREMENTAL_B 4 ++ ++#if !defined(LZO_HASH) ++# error "choose a hashing strategy" ++#endif ++ ++#if (DL_MIN_LEN == 3) ++# define _DV2_A(p,shift1,shift2) \ ++ (((( (lzo_uint32)((p)[0]) << shift1) ^ (p)[1]) << shift2) ^ (p)[2]) ++# define _DV2_B(p,shift1,shift2) \ ++ (((( (lzo_uint32)((p)[2]) << shift1) ^ (p)[1]) << shift2) ^ (p)[0]) ++# define _DV3_B(p,shift1,shift2,shift3) \ ++ ((_DV2_B((p)+1,shift1,shift2) << (shift3)) ^ (p)[0]) ++#elif (DL_MIN_LEN == 2) ++# define _DV2_A(p,shift1,shift2) \ ++ (( (lzo_uint32)(p[0]) << shift1) ^ p[1]) ++# define _DV2_B(p,shift1,shift2) \ ++ (( (lzo_uint32)(p[1]) << shift1) ^ p[2]) ++#else ++# error "invalid DL_MIN_LEN" ++#endif ++#define _DV_A(p,shift) _DV2_A(p,shift,shift) ++#define _DV_B(p,shift) _DV2_B(p,shift,shift) ++#define DA2(p,s1,s2) \ ++ (((((lzo_uint32)((p)[2]) << (s2)) + (p)[1]) << (s1)) + (p)[0]) ++#define DS2(p,s1,s2) \ ++ (((((lzo_uint32)((p)[2]) << (s2)) - (p)[1]) << (s1)) - (p)[0]) ++#define DX2(p,s1,s2) \ ++ (((((lzo_uint32)((p)[2]) << (s2)) ^ (p)[1]) << (s1)) ^ (p)[0]) ++#define DA3(p,s1,s2,s3) ((DA2((p)+1,s2,s3) << (s1)) + (p)[0]) ++#define DS3(p,s1,s2,s3) ((DS2((p)+1,s2,s3) << (s1)) - (p)[0]) ++#define DX3(p,s1,s2,s3) ((DX2((p)+1,s2,s3) << (s1)) ^ (p)[0]) ++#define DMS(v,s) ((lzo_uint) (((v) & (D_MASK >> (s))) << (s))) ++#define DM(v) DMS(v,0) ++ ++#if (LZO_HASH == LZO_HASH_GZIP) ++# define _DINDEX(dv,p) (_DV_A((p),DL_SHIFT)) ++ ++#elif (LZO_HASH == LZO_HASH_GZIP_INCREMENTAL) ++# define __LZO_HASH_INCREMENTAL ++# define DVAL_FIRST(dv,p) dv = _DV_A((p),DL_SHIFT) ++# define DVAL_NEXT(dv,p) dv = (((dv) << DL_SHIFT) ^ p[2]) ++# define _DINDEX(dv,p) (dv) ++# define DVAL_LOOKAHEAD DL_MIN_LEN ++ ++#elif (LZO_HASH == LZO_HASH_LZO_INCREMENTAL_A) ++# define __LZO_HASH_INCREMENTAL ++# define DVAL_FIRST(dv,p) dv = _DV_A((p),5) ++# define DVAL_NEXT(dv,p) \ ++ dv ^= (lzo_uint32)(p[-1]) << (2*5); dv = (((dv) << 5) ^ p[2]) ++# define _DINDEX(dv,p) ((0x9f5f * (dv)) >> 5) ++# define DVAL_LOOKAHEAD DL_MIN_LEN ++ ++#elif (LZO_HASH == LZO_HASH_LZO_INCREMENTAL_B) ++# define __LZO_HASH_INCREMENTAL ++# define DVAL_FIRST(dv,p) dv = _DV_B((p),5) ++# define DVAL_NEXT(dv,p) \ ++ dv ^= p[-1]; dv = (((dv) >> 5) ^ ((lzo_uint32)(p[2]) << (2*5))) ++# define _DINDEX(dv,p) ((0x9f5f * (dv)) >> 5) ++# define DVAL_LOOKAHEAD DL_MIN_LEN ++ ++#else ++# error "choose a hashing strategy" ++#endif ++ ++#ifndef DINDEX ++#define DINDEX(dv,p) ((lzo_uint)((_DINDEX(dv,p)) & DL_MASK) << DD_BITS) ++#endif ++#if !defined(DINDEX1) && defined(D_INDEX1) ++#define DINDEX1 D_INDEX1 ++#endif ++#if !defined(DINDEX2) && defined(D_INDEX2) ++#define DINDEX2 D_INDEX2 ++#endif ++ ++#if !defined(__LZO_HASH_INCREMENTAL) ++# define DVAL_FIRST(dv,p) ((void) 0) ++# define DVAL_NEXT(dv,p) ((void) 0) ++# define DVAL_LOOKAHEAD 0 ++#endif ++ ++#if !defined(DVAL_ASSERT) ++#if defined(__LZO_HASH_INCREMENTAL) && !defined(NDEBUG) ++ static void DVAL_ASSERT(lzo_uint32 dv, const lzo_byte * p) { ++ lzo_uint32 df; ++ DVAL_FIRST(df, (p)); ++ assert(DINDEX(dv, p) == DINDEX(df, p)); ++ } ++#else ++# define DVAL_ASSERT(dv,p) ((void) 0) ++#endif ++#endif ++ ++#if defined(LZO_DICT_USE_PTR) ++# define DENTRY(p,in) (p) ++# define 
GINDEX(m_pos,m_off,dict,dindex,in) m_pos = dict[dindex] ++#else ++# define DENTRY(p,in) ((lzo_uint) ((p)-(in))) ++# define GINDEX(m_pos,m_off,dict,dindex,in) m_off = dict[dindex] ++#endif ++ ++#if (DD_BITS == 0) ++ ++# define UPDATE_D(dict,drun,dv,p,in) dict[ DINDEX(dv,p) ] = DENTRY(p,in) ++# define UPDATE_I(dict,drun,index,p,in) dict[index] = DENTRY(p,in) ++# define UPDATE_P(ptr,drun,p,in) (ptr)[0] = DENTRY(p,in) ++ ++#else ++ ++# define UPDATE_D(dict,drun,dv,p,in) \ ++ dict[ DINDEX(dv,p) + drun++ ] = DENTRY(p,in); drun &= DD_MASK ++# define UPDATE_I(dict,drun,index,p,in) \ ++ dict[ (index) + drun++ ] = DENTRY(p,in); drun &= DD_MASK ++# define UPDATE_P(ptr,drun,p,in) \ ++ (ptr) [ drun++ ] = DENTRY(p,in); drun &= DD_MASK ++ ++#endif ++ ++#if defined(LZO_DICT_USE_PTR) ++ ++#define LZO_CHECK_MPOS_DET(m_pos,m_off,in,ip,max_offset) \ ++ (m_pos == NULL || (m_off = (lzo_moff_t) (ip - m_pos)) > max_offset) ++ ++#define LZO_CHECK_MPOS_NON_DET(m_pos,m_off,in,ip,max_offset) \ ++ (BOUNDS_CHECKING_OFF_IN_EXPR( \ ++ (PTR_LT(m_pos,in) || \ ++ (m_off = (lzo_moff_t) PTR_DIFF(ip,m_pos)) <= 0 || \ ++ m_off > max_offset) )) ++ ++#else ++ ++#define LZO_CHECK_MPOS_DET(m_pos,m_off,in,ip,max_offset) \ ++ (m_off == 0 || \ ++ ((m_off = (lzo_moff_t) ((ip)-(in)) - m_off) > max_offset) || \ ++ (m_pos = (ip) - (m_off), 0) ) ++ ++#define LZO_CHECK_MPOS_NON_DET(m_pos,m_off,in,ip,max_offset) \ ++ ((lzo_moff_t) ((ip)-(in)) <= m_off || \ ++ ((m_off = (lzo_moff_t) ((ip)-(in)) - m_off) > max_offset) || \ ++ (m_pos = (ip) - (m_off), 0) ) ++ ++#endif ++ ++#if defined(LZO_DETERMINISTIC) ++# define LZO_CHECK_MPOS LZO_CHECK_MPOS_DET ++#else ++# define LZO_CHECK_MPOS LZO_CHECK_MPOS_NON_DET ++#endif ++ ++#ifdef __cplusplus ++} ++#endif ++#endif ++#endif ++#endif ++#define DO_COMPRESS lzo1x_1_compress ++static ++lzo_uint do_compress(const lzo_byte * in, lzo_uint in_len, ++ lzo_byte * out, lzo_uintp out_len, lzo_voidp wrkmem) ++{ ++ register const lzo_byte *ip; ++ lzo_byte *op; ++ const lzo_byte *const in_end = in + in_len; ++ const lzo_byte *const ip_end = in + in_len - M2_MAX_LEN - 5; ++ const lzo_byte *ii; ++ lzo_dict_p const dict = (lzo_dict_p) wrkmem; ++ ++ op = out; ++ ip = in; ++ ii = ip; ++ ++ ip += 4; ++ for (;;) { ++ register const lzo_byte *m_pos; ++ ++ lzo_moff_t m_off; ++ lzo_uint m_len; ++ lzo_uint dindex; ++ ++ DINDEX1(dindex, ip); ++ GINDEX(m_pos, m_off, dict, dindex, in); ++ if (LZO_CHECK_MPOS_NON_DET(m_pos, m_off, in, ip, M4_MAX_OFFSET)) ++ goto literal; ++#if 1 ++ if (m_off <= M2_MAX_OFFSET || m_pos[3] == ip[3]) ++ goto try_match; ++ DINDEX2(dindex, ip); ++#endif ++ GINDEX(m_pos, m_off, dict, dindex, in); ++ if (LZO_CHECK_MPOS_NON_DET(m_pos, m_off, in, ip, M4_MAX_OFFSET)) ++ goto literal; ++ if (m_off <= M2_MAX_OFFSET || m_pos[3] == ip[3]) ++ goto try_match; ++ goto literal; ++ ++ try_match: ++#if 1 && defined(LZO_UNALIGNED_OK_2) ++ if (*(const lzo_ushortp)m_pos != *(const lzo_ushortp)ip) { ++#else ++ if (m_pos[0] != ip[0] || m_pos[1] != ip[1]) { ++#endif ++ ; ++ } else { ++ if (m_pos[2] == ip[2]) { ++ goto match; ++ } else { ++ ; ++ } ++ } ++ ++ literal: ++ UPDATE_I(dict, 0, dindex, ip, in); ++ ++ip; ++ if (ip >= ip_end) ++ break; ++ continue; ++ ++ match: ++ UPDATE_I(dict, 0, dindex, ip, in); ++ if (pd(ip, ii) > 0) { ++ register lzo_uint t = pd(ip, ii); ++ ++ if (t <= 3) { ++ assert("lzo-04", op - 2 > out); ++ op[-2] |= LZO_BYTE(t); ++ } else if (t <= 18) ++ *op++ = LZO_BYTE(t - 3); ++ else { ++ register lzo_uint tt = t - 18; ++ ++ *op++ = 0; ++ while (tt > 255) { ++ tt -= 255; ++ *op++ = 0; ++ } ++ 
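++				/* Literal runs longer than 18 bytes are
++				   length-encoded as a leading zero, one
++				   extra zero byte per additional 255 bytes,
++				   and a final non-zero remainder byte. */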
assert("lzo-05", tt > 0); ++ *op++ = LZO_BYTE(tt); ++ } ++ do ++ *op++ = *ii++; ++ while (--t > 0); ++ } ++ ++ assert("lzo-06", ii == ip); ++ ip += 3; ++ if (m_pos[3] != *ip++ || m_pos[4] != *ip++ || m_pos[5] != *ip++ ++ || m_pos[6] != *ip++ || m_pos[7] != *ip++ ++ || m_pos[8] != *ip++ ++#ifdef LZO1Y ++ || m_pos[9] != *ip++ || m_pos[10] != *ip++ ++ || m_pos[11] != *ip++ || m_pos[12] != *ip++ ++ || m_pos[13] != *ip++ || m_pos[14] != *ip++ ++#endif ++ ) { ++ --ip; ++ m_len = ip - ii; ++ assert("lzo-07", m_len >= 3); ++ assert("lzo-08", m_len <= M2_MAX_LEN); ++ ++ if (m_off <= M2_MAX_OFFSET) { ++ m_off -= 1; ++#if defined(LZO1X) ++ *op++ = ++ LZO_BYTE(((m_len - ++ 1) << 5) | ((m_off & 7) << 2)); ++ *op++ = LZO_BYTE(m_off >> 3); ++#elif defined(LZO1Y) ++ *op++ = ++ LZO_BYTE(((m_len + ++ 1) << 4) | ((m_off & 3) << 2)); ++ *op++ = LZO_BYTE(m_off >> 2); ++#endif ++ } else if (m_off <= M3_MAX_OFFSET) { ++ m_off -= 1; ++ *op++ = LZO_BYTE(M3_MARKER | (m_len - 2)); ++ goto m3_m4_offset; ++ } else ++#if defined(LZO1X) ++ { ++ m_off -= 0x4000; ++ assert("lzo-09", m_off > 0); ++ assert("lzo-10", m_off <= 0x7fff); ++ *op++ = LZO_BYTE(M4_MARKER | ++ ((m_off & 0x4000) >> 11) | ++ (m_len - 2)); ++ goto m3_m4_offset; ++ } ++#elif defined(LZO1Y) ++ goto m4_match; ++#endif ++ } else { ++ { ++ const lzo_byte *end = in_end; ++ const lzo_byte *m = m_pos + M2_MAX_LEN + 1; ++ while (ip < end && *m == *ip) ++ m++, ip++; ++ m_len = (ip - ii); ++ } ++ assert("lzo-11", m_len > M2_MAX_LEN); ++ ++ if (m_off <= M3_MAX_OFFSET) { ++ m_off -= 1; ++ if (m_len <= 33) ++ *op++ = ++ LZO_BYTE(M3_MARKER | (m_len - 2)); ++ else { ++ m_len -= 33; ++ *op++ = M3_MARKER | 0; ++ goto m3_m4_len; ++ } ++ } else { ++#if defined(LZO1Y) ++ m4_match: ++#endif ++ m_off -= 0x4000; ++ assert("lzo-12", m_off > 0); ++ assert("lzo-13", m_off <= 0x7fff); ++ if (m_len <= M4_MAX_LEN) ++ *op++ = LZO_BYTE(M4_MARKER | ++ ((m_off & 0x4000) >> ++ 11) | (m_len - 2)); ++ else { ++ m_len -= M4_MAX_LEN; ++ *op++ = ++ LZO_BYTE(M4_MARKER | ++ ((m_off & 0x4000) >> 11)); ++ m3_m4_len: ++ while (m_len > 255) { ++ m_len -= 255; ++ *op++ = 0; ++ } ++ assert("lzo-14", m_len > 0); ++ *op++ = LZO_BYTE(m_len); ++ } ++ } ++ ++ m3_m4_offset: ++ *op++ = LZO_BYTE((m_off & 63) << 2); ++ *op++ = LZO_BYTE(m_off >> 6); ++ } ++ ++ ii = ip; ++ if (ip >= ip_end) ++ break; ++ } ++ ++ *out_len = op - out; ++ return pd(in_end, ii); ++} ++ ++LZO_PUBLIC(int) ++ DO_COMPRESS(const lzo_byte * in, lzo_uint in_len, ++ lzo_byte * out, lzo_uintp out_len, lzo_voidp wrkmem) ++{ ++ lzo_byte *op = out; ++ lzo_uint t; ++ ++#if defined(__LZO_QUERY_COMPRESS) ++ if (__LZO_IS_COMPRESS_QUERY(in, in_len, out, out_len, wrkmem)) ++ return __LZO_QUERY_COMPRESS(in, in_len, out, out_len, wrkmem, ++ D_SIZE, lzo_sizeof(lzo_dict_t)); ++#endif ++ ++ if (in_len <= M2_MAX_LEN + 5) ++ t = in_len; ++ else { ++ t = do_compress(in, in_len, op, out_len, wrkmem); ++ op += *out_len; ++ } ++ ++ if (t > 0) { ++ const lzo_byte *ii = in + in_len - t; ++ ++ if (op == out && t <= 238) ++ *op++ = LZO_BYTE(17 + t); ++ else if (t <= 3) ++ op[-2] |= LZO_BYTE(t); ++ else if (t <= 18) ++ *op++ = LZO_BYTE(t - 3); ++ else { ++ lzo_uint tt = t - 18; ++ ++ *op++ = 0; ++ while (tt > 255) { ++ tt -= 255; ++ *op++ = 0; ++ } ++ assert("lzo-15", tt > 0); ++ *op++ = LZO_BYTE(tt); ++ } ++ do ++ *op++ = *ii++; ++ while (--t > 0); ++ } ++ ++ *op++ = M4_MARKER | 1; ++ *op++ = 0; ++ *op++ = 0; ++ ++ *out_len = op - out; ++ return LZO_E_OK; ++} ++ ++#undef do_compress ++#undef DO_COMPRESS ++#undef LZO_HASH ++ ++#undef LZO_TEST_DECOMPRESS_OVERRUN 
++#undef LZO_TEST_DECOMPRESS_OVERRUN_INPUT ++#undef LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT ++#undef LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND ++#undef DO_DECOMPRESS ++#define DO_DECOMPRESS lzo1x_decompress ++ ++#if defined(LZO_TEST_DECOMPRESS_OVERRUN) ++# if !defined(LZO_TEST_DECOMPRESS_OVERRUN_INPUT) ++# define LZO_TEST_DECOMPRESS_OVERRUN_INPUT 2 ++# endif ++# if !defined(LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT) ++# define LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT 2 ++# endif ++# if !defined(LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND) ++# define LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND ++# endif ++#endif ++ ++#undef TEST_IP ++#undef TEST_OP ++#undef TEST_LOOKBEHIND ++#undef NEED_IP ++#undef NEED_OP ++#undef HAVE_TEST_IP ++#undef HAVE_TEST_OP ++#undef HAVE_NEED_IP ++#undef HAVE_NEED_OP ++#undef HAVE_ANY_IP ++#undef HAVE_ANY_OP ++ ++#if defined(LZO_TEST_DECOMPRESS_OVERRUN_INPUT) ++# if (LZO_TEST_DECOMPRESS_OVERRUN_INPUT >= 1) ++# define TEST_IP (ip < ip_end) ++# endif ++# if (LZO_TEST_DECOMPRESS_OVERRUN_INPUT >= 2) ++# define NEED_IP(x) \ ++ if ((lzo_uint)(ip_end - ip) < (lzo_uint)(x)) goto input_overrun ++# endif ++#endif ++ ++#if defined(LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT) ++# if (LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT >= 1) ++# define TEST_OP (op <= op_end) ++# endif ++# if (LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT >= 2) ++# undef TEST_OP ++# define NEED_OP(x) \ ++ if ((lzo_uint)(op_end - op) < (lzo_uint)(x)) goto output_overrun ++# endif ++#endif ++ ++#if defined(LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND) ++# define TEST_LOOKBEHIND(m_pos,out) if (m_pos < out) goto lookbehind_overrun ++#else ++# define TEST_LOOKBEHIND(m_pos,op) ((void) 0) ++#endif ++ ++#if !defined(LZO_EOF_CODE) && !defined(TEST_IP) ++# define TEST_IP (ip < ip_end) ++#endif ++ ++#if defined(TEST_IP) ++# define HAVE_TEST_IP ++#else ++# define TEST_IP 1 ++#endif ++#if defined(TEST_OP) ++# define HAVE_TEST_OP ++#else ++# define TEST_OP 1 ++#endif ++ ++#if defined(NEED_IP) ++# define HAVE_NEED_IP ++#else ++# define NEED_IP(x) ((void) 0) ++#endif ++#if defined(NEED_OP) ++# define HAVE_NEED_OP ++#else ++# define NEED_OP(x) ((void) 0) ++#endif ++ ++#if defined(HAVE_TEST_IP) || defined(HAVE_NEED_IP) ++# define HAVE_ANY_IP ++#endif ++#if defined(HAVE_TEST_OP) || defined(HAVE_NEED_OP) ++# define HAVE_ANY_OP ++#endif ++ ++#undef __COPY4 ++#define __COPY4(dst,src) * (lzo_uint32p)(dst) = * (const lzo_uint32p)(src) ++ ++#undef COPY4 ++#if defined(LZO_UNALIGNED_OK_4) ++# define COPY4(dst,src) __COPY4(dst,src) ++#elif defined(LZO_ALIGNED_OK_4) ++# define COPY4(dst,src) __COPY4((lzo_ptr_t)(dst),(lzo_ptr_t)(src)) ++#endif ++ ++#if defined(DO_DECOMPRESS) ++LZO_PUBLIC(int) ++ DO_DECOMPRESS(const lzo_byte * in, lzo_uint in_len, ++ lzo_byte * out, lzo_uintp out_len, lzo_voidp wrkmem) ++#endif ++{ ++ register lzo_byte *op; ++ register const lzo_byte *ip; ++ register lzo_uint t; ++#if defined(COPY_DICT) ++ lzo_uint m_off; ++ const lzo_byte *dict_end; ++#else ++ register const lzo_byte *m_pos; ++#endif ++ ++ const lzo_byte *const ip_end = in + in_len; ++#if defined(HAVE_ANY_OP) ++ lzo_byte *const op_end = out + *out_len; ++#endif ++#if defined(LZO1Z) ++ lzo_uint last_m_off = 0; ++#endif ++ ++ LZO_UNUSED(wrkmem); ++ ++#if defined(__LZO_QUERY_DECOMPRESS) ++ if (__LZO_IS_DECOMPRESS_QUERY(in, in_len, out, out_len, wrkmem)) ++ return __LZO_QUERY_DECOMPRESS(in, in_len, out, out_len, wrkmem, ++ 0, 0); ++#endif ++ ++#if defined(COPY_DICT) ++ if (dict) { ++ if (dict_len > M4_MAX_OFFSET) { ++ dict += dict_len - M4_MAX_OFFSET; ++ dict_len = M4_MAX_OFFSET; ++ } ++ dict_end = dict + dict_len; ++ 
} else { ++ dict_len = 0; ++ dict_end = NULL; ++ } ++#endif ++ ++ *out_len = 0; ++ ++ op = out; ++ ip = in; ++ ++ if (*ip > 17) { ++ t = *ip++ - 17; ++ if (t < 4) ++ goto match_next; ++ assert("lzo-16", t > 0); ++ NEED_OP(t); ++ NEED_IP(t + 1); ++ do ++ *op++ = *ip++; ++ while (--t > 0); ++ goto first_literal_run; ++ } ++ ++ while (TEST_IP && TEST_OP) { ++ t = *ip++; ++ if (t >= 16) ++ goto match; ++ if (t == 0) { ++ NEED_IP(1); ++ while (*ip == 0) { ++ t += 255; ++ ip++; ++ NEED_IP(1); ++ } ++ t += 15 + *ip++; ++ } ++ assert("lzo-17", t > 0); ++ NEED_OP(t + 3); ++ NEED_IP(t + 4); ++#if defined(LZO_UNALIGNED_OK_4) || defined(LZO_ALIGNED_OK_4) ++#if !defined(LZO_UNALIGNED_OK_4) ++ if (PTR_ALIGNED2_4(op, ip)) { ++#endif ++ COPY4(op, ip); ++ op += 4; ++ ip += 4; ++ if (--t > 0) { ++ if (t >= 4) { ++ do { ++ COPY4(op, ip); ++ op += 4; ++ ip += 4; ++ t -= 4; ++ } while (t >= 4); ++ if (t > 0) ++ do ++ *op++ = *ip++; ++ while (--t > 0); ++ } else ++ do ++ *op++ = *ip++; ++ while (--t > 0); ++ } ++#if !defined(LZO_UNALIGNED_OK_4) ++ } else ++#endif ++#endif ++#if !defined(LZO_UNALIGNED_OK_4) ++ { ++ *op++ = *ip++; ++ *op++ = *ip++; ++ *op++ = *ip++; ++ do ++ *op++ = *ip++; ++ while (--t > 0); ++ } ++#endif ++ ++ first_literal_run: ++ ++ t = *ip++; ++ if (t >= 16) ++ goto match; ++#if defined(COPY_DICT) ++#if defined(LZO1Z) ++ m_off = (1 + M2_MAX_OFFSET) + (t << 6) + (*ip++ >> 2); ++ last_m_off = m_off; ++#else ++ m_off = (1 + M2_MAX_OFFSET) + (t >> 2) + (*ip++ << 2); ++#endif ++ NEED_OP(3); ++ t = 3; ++ COPY_DICT(t, m_off) ++#else ++#if defined(LZO1Z) ++ t = (1 + M2_MAX_OFFSET) + (t << 6) + (*ip++ >> 2); ++ m_pos = op - t; ++ last_m_off = t; ++#else ++ m_pos = op - (1 + M2_MAX_OFFSET); ++ m_pos -= t >> 2; ++ m_pos -= *ip++ << 2; ++#endif ++ TEST_LOOKBEHIND(m_pos, out); ++ NEED_OP(3); ++ *op++ = *m_pos++; ++ *op++ = *m_pos++; ++ *op++ = *m_pos; ++#endif ++ goto match_done; ++ ++ while (TEST_IP && TEST_OP) { ++ match: ++ if (t >= 64) { ++#if defined(COPY_DICT) ++#if defined(LZO1X) ++ m_off = 1 + ((t >> 2) & 7) + (*ip++ << 3); ++ t = (t >> 5) - 1; ++#elif defined(LZO1Y) ++ m_off = 1 + ((t >> 2) & 3) + (*ip++ << 2); ++ t = (t >> 4) - 3; ++#elif defined(LZO1Z) ++ m_off = t & 0x1f; ++ if (m_off >= 0x1c) ++ m_off = last_m_off; ++ else { ++ m_off = 1 + (m_off << 6) + (*ip++ >> 2); ++ last_m_off = m_off; ++ } ++ t = (t >> 5) - 1; ++#endif ++#else ++#if defined(LZO1X) ++ m_pos = op - 1; ++ m_pos -= (t >> 2) & 7; ++ m_pos -= *ip++ << 3; ++ t = (t >> 5) - 1; ++#elif defined(LZO1Y) ++ m_pos = op - 1; ++ m_pos -= (t >> 2) & 3; ++ m_pos -= *ip++ << 2; ++ t = (t >> 4) - 3; ++#elif defined(LZO1Z) ++ { ++ lzo_uint off = t & 0x1f; ++ m_pos = op; ++ if (off >= 0x1c) { ++ assert(last_m_off > 0); ++ m_pos -= last_m_off; ++ } else { ++ off = ++ 1 + (off << 6) + ++ (*ip++ >> 2); ++ m_pos -= off; ++ last_m_off = off; ++ } ++ } ++ t = (t >> 5) - 1; ++#endif ++ TEST_LOOKBEHIND(m_pos, out); ++ assert("lzo-18", t > 0); ++ NEED_OP(t + 3 - 1); ++ goto copy_match; ++#endif ++ } else if (t >= 32) { ++ t &= 31; ++ if (t == 0) { ++ NEED_IP(1); ++ while (*ip == 0) { ++ t += 255; ++ ip++; ++ NEED_IP(1); ++ } ++ t += 31 + *ip++; ++ } ++#if defined(COPY_DICT) ++#if defined(LZO1Z) ++ m_off = 1 + (ip[0] << 6) + (ip[1] >> 2); ++ last_m_off = m_off; ++#else ++ m_off = 1 + (ip[0] >> 2) + (ip[1] << 6); ++#endif ++#else ++#if defined(LZO1Z) ++ { ++ lzo_uint off = ++ 1 + (ip[0] << 6) + (ip[1] >> 2); ++ m_pos = op - off; ++ last_m_off = off; ++ } ++#elif defined(LZO_UNALIGNED_OK_2) && (LZO_BYTE_ORDER == LZO_LITTLE_ENDIAN) ++ m_pos = op - 1; ++ 
m_pos -= (*(const lzo_ushortp)ip) >> 2; ++#else ++ m_pos = op - 1; ++ m_pos -= (ip[0] >> 2) + (ip[1] << 6); ++#endif ++#endif ++ ip += 2; ++ } else if (t >= 16) { ++#if defined(COPY_DICT) ++ m_off = (t & 8) << 11; ++#else ++ m_pos = op; ++ m_pos -= (t & 8) << 11; ++#endif ++ t &= 7; ++ if (t == 0) { ++ NEED_IP(1); ++ while (*ip == 0) { ++ t += 255; ++ ip++; ++ NEED_IP(1); ++ } ++ t += 7 + *ip++; ++ } ++#if defined(COPY_DICT) ++#if defined(LZO1Z) ++ m_off += (ip[0] << 6) + (ip[1] >> 2); ++#else ++ m_off += (ip[0] >> 2) + (ip[1] << 6); ++#endif ++ ip += 2; ++ if (m_off == 0) ++ goto eof_found; ++ m_off += 0x4000; ++#if defined(LZO1Z) ++ last_m_off = m_off; ++#endif ++#else ++#if defined(LZO1Z) ++ m_pos -= (ip[0] << 6) + (ip[1] >> 2); ++#elif defined(LZO_UNALIGNED_OK_2) && (LZO_BYTE_ORDER == LZO_LITTLE_ENDIAN) ++ m_pos -= (*(const lzo_ushortp)ip) >> 2; ++#else ++ m_pos -= (ip[0] >> 2) + (ip[1] << 6); ++#endif ++ ip += 2; ++ if (m_pos == op) ++ goto eof_found; ++ m_pos -= 0x4000; ++#if defined(LZO1Z) ++ last_m_off = op - m_pos; ++#endif ++#endif ++ } else { ++#if defined(COPY_DICT) ++#if defined(LZO1Z) ++ m_off = 1 + (t << 6) + (*ip++ >> 2); ++ last_m_off = m_off; ++#else ++ m_off = 1 + (t >> 2) + (*ip++ << 2); ++#endif ++ NEED_OP(2); ++ t = 2; ++ COPY_DICT(t, m_off) ++#else ++#if defined(LZO1Z) ++ t = 1 + (t << 6) + (*ip++ >> 2); ++ m_pos = op - t; ++ last_m_off = t; ++#else ++ m_pos = op - 1; ++ m_pos -= t >> 2; ++ m_pos -= *ip++ << 2; ++#endif ++ TEST_LOOKBEHIND(m_pos, out); ++ NEED_OP(2); ++ *op++ = *m_pos++; ++ *op++ = *m_pos; ++#endif ++ goto match_done; ++ } ++ ++#if defined(COPY_DICT) ++ ++ NEED_OP(t + 3 - 1); ++ t += 3 - 1; ++ COPY_DICT(t, m_off) ++#else ++ ++ TEST_LOOKBEHIND(m_pos, out); ++ assert("lzo-19", t > 0); ++ NEED_OP(t + 3 - 1); ++#if defined(LZO_UNALIGNED_OK_4) || defined(LZO_ALIGNED_OK_4) ++#if !defined(LZO_UNALIGNED_OK_4) ++ if (t >= 2 * 4 - (3 - 1) && PTR_ALIGNED2_4(op, m_pos)) { ++ assert((op - m_pos) >= 4); ++#else ++ if (t >= 2 * 4 - (3 - 1) && (op - m_pos) >= 4) { ++#endif ++ COPY4(op, m_pos); ++ op += 4; ++ m_pos += 4; ++ t -= 4 - (3 - 1); ++ do { ++ COPY4(op, m_pos); ++ op += 4; ++ m_pos += 4; ++ t -= 4; ++ } while (t >= 4); ++ if (t > 0) ++ do ++ *op++ = *m_pos++; ++ while (--t > 0); ++ } else ++#endif ++ { ++ copy_match: ++ *op++ = *m_pos++; ++ *op++ = *m_pos++; ++ do ++ *op++ = *m_pos++; ++ while (--t > 0); ++ } ++ ++#endif ++ ++ match_done: ++#if defined(LZO1Z) ++ t = ip[-1] & 3; ++#else ++ t = ip[-2] & 3; ++#endif ++ if (t == 0) ++ break; ++ ++ match_next: ++ assert("lzo-20", t > 0); ++ NEED_OP(t); ++ NEED_IP(t + 1); ++ do ++ *op++ = *ip++; ++ while (--t > 0); ++ t = *ip++; ++ } ++ } ++ ++#if defined(HAVE_TEST_IP) || defined(HAVE_TEST_OP) ++ *out_len = op - out; ++ return LZO_E_EOF_NOT_FOUND; ++#endif ++ ++ eof_found: ++ assert("lzo-21", t == 1); ++ *out_len = op - out; ++ return (ip == ip_end ? LZO_E_OK : ++ (ip < ip_end ? 
LZO_E_INPUT_NOT_CONSUMED : LZO_E_INPUT_OVERRUN)); ++ ++#if defined(HAVE_NEED_IP) ++ input_overrun: ++ *out_len = op - out; ++ return LZO_E_INPUT_OVERRUN; ++#endif ++ ++#if defined(HAVE_NEED_OP) ++ output_overrun: ++ *out_len = op - out; ++ return LZO_E_OUTPUT_OVERRUN; ++#endif ++ ++#if defined(LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND) ++ lookbehind_overrun: ++ *out_len = op - out; ++ return LZO_E_LOOKBEHIND_OVERRUN; ++#endif ++} ++ ++#define LZO_TEST_DECOMPRESS_OVERRUN ++#undef DO_DECOMPRESS ++#define DO_DECOMPRESS lzo1x_decompress_safe ++ ++#if defined(LZO_TEST_DECOMPRESS_OVERRUN) ++# if !defined(LZO_TEST_DECOMPRESS_OVERRUN_INPUT) ++# define LZO_TEST_DECOMPRESS_OVERRUN_INPUT 2 ++# endif ++# if !defined(LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT) ++# define LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT 2 ++# endif ++# if !defined(LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND) ++# define LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND ++# endif ++#endif ++ ++#undef TEST_IP ++#undef TEST_OP ++#undef TEST_LOOKBEHIND ++#undef NEED_IP ++#undef NEED_OP ++#undef HAVE_TEST_IP ++#undef HAVE_TEST_OP ++#undef HAVE_NEED_IP ++#undef HAVE_NEED_OP ++#undef HAVE_ANY_IP ++#undef HAVE_ANY_OP ++ ++#if defined(LZO_TEST_DECOMPRESS_OVERRUN_INPUT) ++# if (LZO_TEST_DECOMPRESS_OVERRUN_INPUT >= 1) ++# define TEST_IP (ip < ip_end) ++# endif ++# if (LZO_TEST_DECOMPRESS_OVERRUN_INPUT >= 2) ++# define NEED_IP(x) \ ++ if ((lzo_uint)(ip_end - ip) < (lzo_uint)(x)) goto input_overrun ++# endif ++#endif ++ ++#if defined(LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT) ++# if (LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT >= 1) ++# define TEST_OP (op <= op_end) ++# endif ++# if (LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT >= 2) ++# undef TEST_OP ++# define NEED_OP(x) \ ++ if ((lzo_uint)(op_end - op) < (lzo_uint)(x)) goto output_overrun ++# endif ++#endif ++ ++#if defined(LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND) ++# define TEST_LOOKBEHIND(m_pos,out) if (m_pos < out) goto lookbehind_overrun ++#else ++# define TEST_LOOKBEHIND(m_pos,op) ((void) 0) ++#endif ++ ++#if !defined(LZO_EOF_CODE) && !defined(TEST_IP) ++# define TEST_IP (ip < ip_end) ++#endif ++ ++#if defined(TEST_IP) ++# define HAVE_TEST_IP ++#else ++# define TEST_IP 1 ++#endif ++#if defined(TEST_OP) ++# define HAVE_TEST_OP ++#else ++# define TEST_OP 1 ++#endif ++ ++#if defined(NEED_IP) ++# define HAVE_NEED_IP ++#else ++# define NEED_IP(x) ((void) 0) ++#endif ++#if defined(NEED_OP) ++# define HAVE_NEED_OP ++#else ++# define NEED_OP(x) ((void) 0) ++#endif ++ ++#if defined(HAVE_TEST_IP) || defined(HAVE_NEED_IP) ++# define HAVE_ANY_IP ++#endif ++#if defined(HAVE_TEST_OP) || defined(HAVE_NEED_OP) ++# define HAVE_ANY_OP ++#endif ++ ++#undef __COPY4 ++#define __COPY4(dst,src) * (lzo_uint32p)(dst) = * (const lzo_uint32p)(src) ++ ++#undef COPY4 ++#if defined(LZO_UNALIGNED_OK_4) ++# define COPY4(dst,src) __COPY4(dst,src) ++#elif defined(LZO_ALIGNED_OK_4) ++# define COPY4(dst,src) __COPY4((lzo_ptr_t)(dst),(lzo_ptr_t)(src)) ++#endif ++ ++/***** End of minilzo.c *****/ +Index: linux-2.6.16/fs/reiser4/plugin/compress/minilzo.h +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/plugin/compress/minilzo.h +@@ -0,0 +1,94 @@ ++/* minilzo.h -- mini subset of the LZO real-time data compression library ++ adopted for reiser4 compression transform plugin. ++ ++ This file is part of the LZO real-time data compression library ++ and not included in any proprietary licenses of reiser4. 
++
++ Copyright (C) 2002 Markus Franz Xaver Johannes Oberhumer
++ Copyright (C) 2001 Markus Franz Xaver Johannes Oberhumer
++ Copyright (C) 2000 Markus Franz Xaver Johannes Oberhumer
++ Copyright (C) 1999 Markus Franz Xaver Johannes Oberhumer
++ Copyright (C) 1998 Markus Franz Xaver Johannes Oberhumer
++ Copyright (C) 1997 Markus Franz Xaver Johannes Oberhumer
++ Copyright (C) 1996 Markus Franz Xaver Johannes Oberhumer
++ All Rights Reserved.
++
++ The LZO library is free software; you can redistribute it and/or
++ modify it under the terms of the GNU General Public License as
++ published by the Free Software Foundation; either version 2 of
++ the License, or (at your option) any later version.
++
++ The LZO library is distributed in the hope that it will be useful,
++ but WITHOUT ANY WARRANTY; without even the implied warranty of
++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ GNU General Public License for more details.
++
++ You should have received a copy of the GNU General Public License
++ along with the LZO library; see the file COPYING.
++ If not, write to the Free Software Foundation, Inc.,
++ 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
++
++ Markus F.X.J. Oberhumer
++ <markus@oberhumer.com>
++ http://www.oberhumer.com/opensource/lzo/
++ */
++
++/*
++ * NOTE:
++ * the full LZO package can be found at
++ * http://www.oberhumer.com/opensource/lzo/
++ */
++
++#ifndef __MINILZO_H
++#define __MINILZO_H
++
++#define MINILZO_VERSION 0x1080
++
++#ifdef __LZOCONF_H
++# error "you cannot use both LZO and miniLZO"
++#endif
++
++#undef LZO_HAVE_CONFIG_H
++#include "lzoconf.h"
++
++#if !defined(LZO_VERSION) || (LZO_VERSION != MINILZO_VERSION)
++# error "version mismatch in header files"
++#endif
++
++#ifdef __cplusplus
++extern "C" {
++#endif
++
++/***********************************************************************
++//
++************************************************************************/
++
++/* Memory required for the wrkmem parameter.
++ * When the required size is 0, you can also pass a NULL pointer.
++ */ ++ ++#define LZO1X_MEM_COMPRESS LZO1X_1_MEM_COMPRESS ++#define LZO1X_1_MEM_COMPRESS ((lzo_uint32) (16384L * lzo_sizeof_dict_t)) ++#define LZO1X_MEM_DECOMPRESS (0) ++ ++/* compression */ ++ LZO_EXTERN(int) ++ lzo1x_1_compress(const lzo_byte * src, lzo_uint src_len, ++ lzo_byte * dst, lzo_uintp dst_len, lzo_voidp wrkmem); ++ ++/* decompression */ ++ LZO_EXTERN(int) ++ lzo1x_decompress(const lzo_byte * src, lzo_uint src_len, ++ lzo_byte * dst, lzo_uintp dst_len, ++ lzo_voidp wrkmem /* NOT USED */ ); ++ ++/* safe decompression with overrun testing */ ++ LZO_EXTERN(int) ++ lzo1x_decompress_safe(const lzo_byte * src, lzo_uint src_len, ++ lzo_byte * dst, lzo_uintp dst_len, ++ lzo_voidp wrkmem /* NOT USED */ ); ++ ++#ifdef __cplusplus ++} /* extern "C" */ ++#endif ++#endif /* already included */ +Index: linux-2.6.16/fs/reiser4/plugin/crypto/cipher.c +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/plugin/crypto/cipher.c +@@ -0,0 +1,116 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, ++ licensing governed by reiser4/README */ ++/* Reiser4 cipher transform plugins */ ++ ++#include "../../debug.h" ++#include "../plugin.h" ++#include "../file/cryptcompress.h" ++#include ++#include ++ ++#define MIN_CIPHER_BLOCKSIZE 8 ++#define MAX_CIPHER_BLOCKSIZE 128 ++ ++/* ++ Default align() method of the cipher plugin (look for description of this ++ method in plugin/plugin.h) ++ ++ 1) creates the aligning armored format of the input flow before encryption. ++ "armored" means that padding is filled by private data (for example, ++ pseudo-random sequence of bytes is not private data). ++ 2) returns length of appended padding ++ ++ [ flow | aligning_padding ] ++ ^ ++ | ++ @pad ++*/ ++static int align_stream_common(__u8 * pad, ++ int flow_size /* size of non-aligned flow */, ++ int blocksize /* cipher block size */) ++{ ++ int pad_size; ++ ++ assert("edward-01", pad != NULL); ++ assert("edward-02", flow_size != 0); ++ assert("edward-03", blocksize != 0 ++ || blocksize <= MAX_CIPHER_BLOCKSIZE); ++ ++ pad_size = blocksize - (flow_size % blocksize); ++ get_random_bytes(pad, pad_size); ++ return pad_size; ++} ++ ++/* This is used for all the cipher algorithms which do not inflate ++ block-aligned data */ ++static loff_t scale_common(struct inode *inode, size_t blocksize, ++ loff_t src_off /* offset to scale */ ) ++{ ++ return src_off; ++} ++ ++static void free_aes (struct crypto_tfm * tfm) ++{ ++#if REISER4_AES ++ crypto_free_tfm(tfm); ++#endif ++ return; ++} ++ ++static struct crypto_tfm * alloc_aes (void) ++{ ++#if REISER4_AES ++ return crypto_alloc_tfm ("aes", 0); ++#else ++ warning("edward-1417", "aes unsupported"); ++ return ERR_PTR(-EINVAL); ++#endif /* REISER4_AES */ ++} ++ ++cipher_plugin cipher_plugins[LAST_CIPHER_ID] = { ++ [NONE_CIPHER_ID] = { ++ .h = { ++ .type_id = REISER4_CIPHER_PLUGIN_TYPE, ++ .id = NONE_CIPHER_ID, ++ .pops = NULL, ++ .label = "none", ++ .desc = "no cipher transform", ++ .linkage = {NULL, NULL} ++ }, ++ .alloc = NULL, ++ .free = NULL, ++ .scale = NULL, ++ .align_stream = NULL, ++ .setkey = NULL, ++ .encrypt = NULL, ++ .decrypt = NULL ++ }, ++ [AES_CIPHER_ID] = { ++ .h = { ++ .type_id = REISER4_CIPHER_PLUGIN_TYPE, ++ .id = AES_CIPHER_ID, ++ .pops = NULL, ++ .label = "aes", ++ .desc = "aes cipher transform", ++ .linkage = {NULL, NULL} ++ }, ++ .alloc = alloc_aes, ++ .free = free_aes, ++ .scale = scale_common, ++ .align_stream = align_stream_common, ++ .setkey = NULL, ++ .encrypt = NULL, ++ .decrypt = NULL ++ } ++}; ++ 
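++/* A minimal usage sketch (illustrative only; cp and tfm are made-up
++ * locals, not names from this plugin): a caller picking an entry from
++ * the table above would go through its alloc/free hooks, e.g.
++ *
++ *	cipher_plugin *cp = &cipher_plugins[AES_CIPHER_ID];
++ *	struct crypto_tfm *tfm = cp->alloc();
++ *
++ *	if (!IS_ERR(tfm)) {
++ *		... use tfm through the linux crypto-api ...
++ *		cp->free(tfm);
++ *	}
++ *
++ * The IS_ERR() test mirrors alloc_aes() above, which returns
++ * ERR_PTR(-EINVAL) when the kernel is built without REISER4_AES; the
++ * none entry leaves all of these hooks NULL and must not be called
++ * through. */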
++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ scroll-step: 1 ++ End: ++*/ +Index: linux-2.6.16/fs/reiser4/plugin/crypto/cipher.h +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/plugin/crypto/cipher.h +@@ -0,0 +1,67 @@ ++/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++/* This file contains definitions for the objects operated ++ by reiser4 key manager, which is something like keyring ++ wrapped by appropriate reiser4 plugin */ ++ ++#if !defined( __FS_REISER4_CRYPT_H__ ) ++#define __FS_REISER4_CRYPT_H__ ++ ++#include ++ ++ ++/* Transform actions involved in ciphering process and ++ supported by reiser4 via appropriate transform plugins */ ++typedef enum { ++ CIPHER_TFM, /* cipher transform */ ++ DIGEST_TFM, /* digest transform */ ++ LAST_TFM ++} reiser4_tfm; ++ ++/* This represents a transform action in reiser4 */ ++typedef struct reiser4_tfma { ++ reiser4_plugin * plug; /* transform plugin */ ++ struct crypto_tfm * tfm; /* low-level info, operated by ++ linux crypto-api (see linux/crypto) */ ++} reiser4_tfma_t; ++ ++/* key info imported from user space */ ++typedef struct crypto_data { ++ int keysize; /* uninstantiated key size */ ++ __u8 * key; /* uninstantiated key */ ++ int keyid_size; /* size of passphrase */ ++ __u8 * keyid; /* passphrase */ ++} crypto_data_t; ++ ++/* This object contains all needed infrastructure to implement ++ cipher transform. This is operated (allocating, inheriting, ++ validating, binding to host inode, etc..) by reiser4 key manager. ++ ++ This info can be allocated in two cases: ++ 1. importing a key from user space. ++ 2. reading inode from disk */ ++typedef struct crypto_stat { ++ reiser4_tfma_t tfma[LAST_TFM]; ++// cipher_key_plugin * kplug; /* key manager */ ++ __u8 * keyid; /* key fingerprint, created by digest plugin, ++ using uninstantiated key and passphrase. ++ supposed to be stored in disk stat-data */ ++ int inst; /* this indicates if the cipher key is ++ instantiated (case 1 above) */ ++ int keysize; /* uninstantiated key size (bytes), supposed ++ to be stored in disk stat-data */ ++ int keyload_count; /* number of the objects which has this ++ crypto-stat attached */ ++} crypto_stat_t; ++ ++#endif /* __FS_REISER4_CRYPT_H__ */ ++ ++/* ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ scroll-step: 1 ++ End: ++*/ +Index: linux-2.6.16/fs/reiser4/plugin/crypto/digest.c +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/plugin/crypto/digest.c +@@ -0,0 +1,58 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++/* reiser4 digest transform plugin (is used by cryptcompress object plugin) */ ++/* EDWARD-FIXME-HANS: and it does what? a digest is a what? 
*/ ++#include "../../debug.h" ++#include "../plugin_header.h" ++#include "../plugin.h" ++#include "../file/cryptcompress.h" ++ ++#include ++ ++extern digest_plugin digest_plugins[LAST_DIGEST_ID]; ++ ++static struct crypto_tfm * alloc_sha256 (void) ++{ ++#if REISER4_SHA256 ++ return crypto_alloc_tfm ("sha256", 0); ++#else ++ warning("edward-1418", "sha256 unsupported"); ++ return ERR_PTR(-EINVAL); ++#endif ++} ++ ++static void free_sha256 (struct crypto_tfm * tfm) ++{ ++#if REISER4_SHA256 ++ crypto_free_tfm(tfm); ++#endif ++ return; ++} ++ ++/* digest plugins */ ++digest_plugin digest_plugins[LAST_DIGEST_ID] = { ++ [SHA256_32_DIGEST_ID] = { ++ .h = { ++ .type_id = REISER4_DIGEST_PLUGIN_TYPE, ++ .id = SHA256_32_DIGEST_ID, ++ .pops = NULL, ++ .label = "sha256_32", ++ .desc = "sha256_32 digest transform", ++ .linkage = {NULL, NULL} ++ }, ++ .fipsize = sizeof(__u32), ++ .alloc = alloc_sha256, ++ .free = free_sha256 ++ } ++}; ++ ++/* ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ scroll-step: 1 ++ End: ++*/ +Index: linux-2.6.16/fs/reiser4/plugin/dir/Makefile +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/plugin/dir/Makefile +@@ -0,0 +1,5 @@ ++obj-$(CONFIG_REISER4_FS) += dir_plugins.o ++ ++dir_plugins-objs := \ ++ hashed_dir.o \ ++ seekable_dir.o +Index: linux-2.6.16/fs/reiser4/plugin/dir/dir.h +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/plugin/dir/dir.h +@@ -0,0 +1,36 @@ ++/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++/* this file contains declarations of methods implementing directory plugins */ ++ ++#if !defined( __REISER4_DIR_H__ ) ++#define __REISER4_DIR_H__ ++ ++/*#include "../../key.h" ++ ++#include */ ++ ++/* declarations of functions implementing HASHED_DIR_PLUGIN_ID dir plugin */ ++ ++/* "hashed" directory methods of dir plugin */ ++void build_entry_key_hashed(const struct inode *, const struct qstr *, ++ reiser4_key *); ++ ++/* declarations of functions implementing SEEKABLE_HASHED_DIR_PLUGIN_ID dir plugin */ ++ ++/* "seekable" directory methods of dir plugin */ ++void build_entry_key_seekable(const struct inode *, const struct qstr *, ++ reiser4_key *); ++ ++/* __REISER4_DIR_H__ */ ++#endif ++ ++/* ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +Index: linux-2.6.16/fs/reiser4/plugin/dir/hashed_dir.c +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/plugin/dir/hashed_dir.c +@@ -0,0 +1,81 @@ ++/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++/* Directory plugin using hashes (see fs/reiser4/plugin/hash.c) to map file ++ names to the files. */ ++ ++/* ++ * Hashed directory logically consists of persistent directory ++ * entries. Directory entry is a pair of a file name and a key of stat-data of ++ * a file that has this name in the given directory. ++ * ++ * Directory entries are stored in the tree in the form of directory ++ * items. Directory item should implement dir_entry_ops portion of item plugin ++ * interface (see plugin/item/item.h). Hashed directory interacts with ++ * directory item plugin exclusively through dir_entry_ops operations. 
++ * ++ * Currently there are two implementations of directory items: "simple ++ * directory item" (plugin/item/sde.[ch]), and "compound directory item" ++ * (plugin/item/cde.[ch]) with the latter being the default. ++ * ++ * There is, however some delicate way through which directory code interferes ++ * with item plugin: key assignment policy. A key for a directory item is ++ * chosen by directory code, and as described in kassign.c, this key contains ++ * a portion of file name. Directory item uses this knowledge to avoid storing ++ * this portion of file name twice: in the key and in the directory item body. ++ * ++ */ ++ ++#include "../../inode.h" ++ ++void complete_entry_key(const struct inode *, const char *name, ++ int len, reiser4_key * result); ++ ++/* this is implementation of build_entry_key method of dir ++ plugin for HASHED_DIR_PLUGIN_ID ++ */ ++void build_entry_key_hashed(const struct inode *dir, /* directory where entry is ++ * (or will be) in.*/ ++ const struct qstr *qname, /* name of file referenced ++ * by this entry */ ++ reiser4_key * result /* resulting key of directory ++ * entry */ ) ++{ ++ const char *name; ++ int len; ++ ++ assert("nikita-1139", dir != NULL); ++ assert("nikita-1140", qname != NULL); ++ assert("nikita-1141", qname->name != NULL); ++ assert("nikita-1142", result != NULL); ++ ++ name = qname->name; ++ len = qname->len; ++ ++ assert("nikita-2867", strlen(name) == len); ++ ++ reiser4_key_init(result); ++ /* locality of directory entry's key is objectid of parent ++ directory */ ++ set_key_locality(result, get_inode_oid(dir)); ++ /* minor packing locality is constant */ ++ set_key_type(result, KEY_FILE_NAME_MINOR); ++ /* dot is special case---we always want it to be first entry in ++ a directory. Actually, we just want to have smallest ++ directory entry. ++ */ ++ if (len == 1 && name[0] == '.') ++ return; ++ ++ /* initialize part of entry key which depends on file name */ ++ complete_entry_key(dir, name, len, result); ++} ++ ++/* Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +Index: linux-2.6.16/fs/reiser4/plugin/dir/seekable_dir.c +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/plugin/dir/seekable_dir.c +@@ -0,0 +1,46 @@ ++/* Copyright 2005 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++#include "../../inode.h" ++ ++/* this is implementation of build_entry_key method of dir ++ plugin for SEEKABLE_HASHED_DIR_PLUGIN_ID ++ This is for directories where we want repeatable and restartable readdir() ++ even in case 32bit user level struct dirent (readdir(3)). ++*/ ++void ++build_entry_key_seekable(const struct inode *dir, const struct qstr *name, ++ reiser4_key * result) ++{ ++ oid_t objectid; ++ ++ assert("nikita-2283", dir != NULL); ++ assert("nikita-2284", name != NULL); ++ assert("nikita-2285", name->name != NULL); ++ assert("nikita-2286", result != NULL); ++ ++ reiser4_key_init(result); ++ /* locality of directory entry's key is objectid of parent ++ directory */ ++ set_key_locality(result, get_inode_oid(dir)); ++ /* minor packing locality is constant */ ++ set_key_type(result, KEY_FILE_NAME_MINOR); ++ /* dot is special case---we always want it to be first entry in ++ a directory. Actually, we just want to have smallest ++ directory entry. ++ */ ++ if ((name->len == 1) && (name->name[0] == '.')) ++ return; ++ ++ /* objectid of key is 31 lowest bits of hash. 
*/ ++ objectid = ++ inode_hash_plugin(dir)->hash(name->name, ++ (int)name->len) & 0x7fffffff; ++ ++ assert("nikita-2303", !(objectid & ~KEY_OBJECTID_MASK)); ++ set_key_objectid(result, objectid); ++ ++ /* offset is always 0. */ ++ set_key_offset(result, (__u64) 0); ++ return; ++} +Index: linux-2.6.16/fs/reiser4/plugin/dir_plugin_common.c +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/plugin/dir_plugin_common.c +@@ -0,0 +1,864 @@ ++/* Copyright 2005 by Hans Reiser, licensing governed by ++ reiser4/README */ ++ ++/* this file contains typical implementations for most of methods of ++ directory plugin ++*/ ++ ++#include "../inode.h" ++ ++int find_entry(struct inode *dir, struct dentry *name, ++ lock_handle *, znode_lock_mode, reiser4_dir_entry_desc *); ++int lookup_name(struct inode *parent, struct dentry *dentry, reiser4_key * key); ++void check_light_weight(struct inode *inode, struct inode *parent); ++ ++/* this is common implementation of get_parent method of dir plugin ++ this is used by NFS kernel server to "climb" up directory tree to ++ check permissions ++ */ ++struct dentry *get_parent_common(struct inode *child) ++{ ++ struct super_block *s; ++ struct inode *parent; ++ struct dentry dotdot; ++ struct dentry *dentry; ++ reiser4_key key; ++ int result; ++ ++ /* ++ * lookup dotdot entry. ++ */ ++ ++ s = child->i_sb; ++ memset(&dotdot, 0, sizeof(dotdot)); ++ dotdot.d_name.name = ".."; ++ dotdot.d_name.len = 2; ++ dotdot.d_op = &get_super_private(s)->ops.dentry; ++ ++ result = lookup_name(child, &dotdot, &key); ++ if (result != 0) ++ return ERR_PTR(result); ++ ++ parent = reiser4_iget(s, &key, 1); ++ if (!IS_ERR(parent)) { ++ /* ++ * FIXME-NIKITA dubious: attributes are inherited from @child ++ * to @parent. But: ++ * ++ * (*) this is the only this we can do ++ * ++ * (*) attributes of light-weight object are inherited ++ * from a parent through which object was looked up first, ++ * so it is ambiguous anyway. ++ * ++ */ ++ check_light_weight(parent, child); ++ reiser4_iget_complete(parent); ++ dentry = d_alloc_anon(parent); ++ if (dentry == NULL) { ++ iput(parent); ++ dentry = ERR_PTR(RETERR(-ENOMEM)); ++ } else ++ dentry->d_op = &get_super_private(s)->ops.dentry; ++ } else if (PTR_ERR(parent) == -ENOENT) ++ dentry = ERR_PTR(RETERR(-ESTALE)); ++ else ++ dentry = (void *)parent; ++ return dentry; ++} ++ ++/* this is common implementation of is_name_acceptable method of dir ++ plugin ++ */ ++int is_name_acceptable_common(const struct inode *inode, /* directory to check */ ++ const char *name UNUSED_ARG, /* name to check */ ++ int len /* @name's length */ ) ++{ ++ assert("nikita-733", inode != NULL); ++ assert("nikita-734", name != NULL); ++ assert("nikita-735", len > 0); ++ ++ return len <= reiser4_max_filename_len(inode); ++} ++ ++/* there is no common implementation of build_entry_key method of dir ++ plugin. 
See plugin/dir/hashed_dir.c:build_entry_key_hashed() or ++ plugin/dir/seekable.c:build_entry_key_seekable() for example ++*/ ++ ++/* this is common implementation of build_readdir_key method of dir ++ plugin ++ see readdir_common for more details ++*/ ++int build_readdir_key_common(struct file *dir /* directory being read */ , ++ reiser4_key * result /* where to store key */ ) ++{ ++ reiser4_file_fsdata *fdata; ++ struct inode *inode; ++ ++ assert("nikita-1361", dir != NULL); ++ assert("nikita-1362", result != NULL); ++ assert("nikita-1363", dir->f_dentry != NULL); ++ inode = dir->f_dentry->d_inode; ++ assert("nikita-1373", inode != NULL); ++ ++ fdata = reiser4_get_file_fsdata(dir); ++ if (IS_ERR(fdata)) ++ return PTR_ERR(fdata); ++ assert("nikita-1364", fdata != NULL); ++ return extract_key_from_de_id(get_inode_oid(inode), ++ &fdata->dir.readdir.position. ++ dir_entry_key, result); ++ ++} ++ ++void adjust_dir_file(struct inode *, const struct dentry *, int offset, ++ int adj); ++ ++/* this is common implementation of add_entry method of dir plugin ++*/ ++int add_entry_common(struct inode *object, /* directory to add new name ++ * in */ ++ struct dentry *where, /* new name */ ++ reiser4_object_create_data * data UNUSED_ARG, /* parameters ++ * of new ++ * object */ ++ reiser4_dir_entry_desc * entry /* parameters of new ++ * directory entry */ ) ++{ ++ int result; ++ coord_t *coord; ++ lock_handle lh; ++ reiser4_dentry_fsdata *fsdata; ++ reiser4_block_nr reserve; ++ ++ assert("nikita-1114", object != NULL); ++ assert("nikita-1250", where != NULL); ++ ++ fsdata = reiser4_get_dentry_fsdata(where); ++ if (unlikely(IS_ERR(fsdata))) ++ return PTR_ERR(fsdata); ++ ++ reserve = inode_dir_plugin(object)->estimate.add_entry(object); ++ if (reiser4_grab_space(reserve, BA_CAN_COMMIT)) ++ return RETERR(-ENOSPC); ++ ++ init_lh(&lh); ++ coord = &fsdata->dec.entry_coord; ++ coord_clear_iplug(coord); ++ ++ /* check for this entry in a directory. This is plugin method. */ ++ result = find_entry(object, where, &lh, ZNODE_WRITE_LOCK, entry); ++ if (likely(result == -ENOENT)) { ++ /* add new entry. Just pass control to the directory ++ item plugin. */ ++ assert("nikita-1709", inode_dir_item_plugin(object)); ++ assert("nikita-2230", coord->node == lh.node); ++ seal_done(&fsdata->dec.entry_seal); ++ result = ++ inode_dir_item_plugin(object)->s.dir.add_entry(object, ++ coord, &lh, ++ where, ++ entry); ++ if (result == 0) { ++ adjust_dir_file(object, where, fsdata->dec.pos + 1, +1); ++ INODE_INC_FIELD(object, i_size); ++ } ++ } else if (result == 0) { ++ assert("nikita-2232", coord->node == lh.node); ++ result = RETERR(-EEXIST); ++ } ++ done_lh(&lh); ++ ++ return result; ++} ++ ++/** ++ * rem_entry - remove entry from directory item ++ * @dir: ++ * @dentry: ++ * @entry: ++ * @coord: ++ * @lh: ++ * ++ * Checks that coordinate @coord is set properly and calls item plugin ++ * method to cut entry. 
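++ * In debug builds it additionally verifies that the entry at @coord
++ * really refers to @dentry's inode before cutting it.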
++ */ ++static int ++rem_entry(struct inode *dir, struct dentry *dentry, ++ reiser4_dir_entry_desc * entry, coord_t * coord, lock_handle * lh) ++{ ++ item_plugin *iplug; ++ struct inode *child; ++ ++ iplug = inode_dir_item_plugin(dir); ++ child = dentry->d_inode; ++ assert("nikita-3399", child != NULL); ++ ++ /* check that we are really destroying an entry for @child */ ++ if (REISER4_DEBUG) { ++ int result; ++ reiser4_key key; ++ ++ result = iplug->s.dir.extract_key(coord, &key); ++ if (result != 0) ++ return result; ++ if (get_key_objectid(&key) != get_inode_oid(child)) { ++ warning("nikita-3397", ++ "rem_entry: %#llx != %#llx\n", ++ get_key_objectid(&key), ++ (unsigned long long)get_inode_oid(child)); ++ return RETERR(-EIO); ++ } ++ } ++ return iplug->s.dir.rem_entry(dir, &dentry->d_name, coord, lh, entry); ++} ++ ++/** ++ * rem_entry_common - remove entry from a directory ++ * @dir: directory to remove entry from ++ * @where: name that is being removed ++ * @entry: description of entry being removed ++ * ++ * This is common implementation of rem_entry method of dir plugin. ++ */ ++int rem_entry_common(struct inode *dir, ++ struct dentry *dentry, ++ reiser4_dir_entry_desc *entry) ++{ ++ int result; ++ coord_t *coord; ++ lock_handle lh; ++ reiser4_dentry_fsdata *fsdata; ++ __u64 tograb; ++ ++ assert("nikita-1124", dir != NULL); ++ assert("nikita-1125", dentry != NULL); ++ ++ tograb = inode_dir_plugin(dir)->estimate.rem_entry(dir); ++ result = reiser4_grab_space(tograb, BA_CAN_COMMIT | BA_RESERVED); ++ if (result != 0) ++ return RETERR(-ENOSPC); ++ ++ init_lh(&lh); ++ ++ /* check for this entry in a directory. This is plugin method. */ ++ result = find_entry(dir, dentry, &lh, ZNODE_WRITE_LOCK, entry); ++ fsdata = reiser4_get_dentry_fsdata(dentry); ++ if (IS_ERR(fsdata)) { ++ done_lh(&lh); ++ return PTR_ERR(fsdata); ++ } ++ ++ coord = &fsdata->dec.entry_coord; ++ ++ assert("nikita-3404", ++ get_inode_oid(dentry->d_inode) != get_inode_oid(dir) || ++ dir->i_size <= 1); ++ ++ coord_clear_iplug(coord); ++ if (result == 0) { ++ /* remove entry. Just pass control to the directory item ++ plugin. */ ++ assert("vs-542", inode_dir_item_plugin(dir)); ++ seal_done(&fsdata->dec.entry_seal); ++ adjust_dir_file(dir, dentry, fsdata->dec.pos, -1); ++ result = ++ WITH_COORD(coord, ++ rem_entry(dir, dentry, entry, coord, &lh)); ++ if (result == 0) { ++ if (dir->i_size >= 1) ++ INODE_DEC_FIELD(dir, i_size); ++ else { ++ warning("nikita-2509", "Dir %llu is runt", ++ (unsigned long long) ++ get_inode_oid(dir)); ++ result = RETERR(-EIO); ++ } ++ ++ assert("nikita-3405", dentry->d_inode->i_nlink != 1 || ++ dentry->d_inode->i_size != 2 || ++ inode_dir_plugin(dentry->d_inode) == NULL); ++ } ++ } ++ done_lh(&lh); ++ ++ return result; ++} ++ ++static reiser4_block_nr estimate_init(struct inode *parent, ++ struct inode *object); ++static int create_dot_dotdot(struct inode *object, struct inode *parent); ++ ++/* this is common implementation of init method of dir plugin ++ create "." and ".." 
entries ++*/ ++int init_common(struct inode *object, /* new directory */ ++ struct inode *parent, /* parent directory */ ++ reiser4_object_create_data * data UNUSED_ARG /* info passed ++ * to us, this ++ * is filled by ++ * reiser4() ++ * syscall in ++ * particular */ ) ++{ ++ reiser4_block_nr reserve; ++ ++ assert("nikita-680", object != NULL); ++ assert("nikita-681", S_ISDIR(object->i_mode)); ++ assert("nikita-682", parent != NULL); ++ assert("nikita-684", data != NULL); ++ assert("nikita-686", data->id == DIRECTORY_FILE_PLUGIN_ID); ++ assert("nikita-687", object->i_mode & S_IFDIR); ++ ++ reserve = estimate_init(parent, object); ++ if (reiser4_grab_space(reserve, BA_CAN_COMMIT)) ++ return RETERR(-ENOSPC); ++ ++ return create_dot_dotdot(object, parent); ++} ++ ++/* this is common implementation of done method of dir plugin ++ remove "." entry ++*/ ++int done_common(struct inode *object /* object being deleted */ ) ++{ ++ int result; ++ reiser4_block_nr reserve; ++ struct dentry goodby_dots; ++ reiser4_dir_entry_desc entry; ++ ++ assert("nikita-1449", object != NULL); ++ ++ if (inode_get_flag(object, REISER4_NO_SD)) ++ return 0; ++ ++ /* of course, this can be rewritten to sweep everything in one ++ cut_tree(). */ ++ memset(&entry, 0, sizeof entry); ++ ++ /* FIXME: this done method is called from delete_directory_common which ++ * reserved space already */ ++ reserve = inode_dir_plugin(object)->estimate.rem_entry(object); ++ if (reiser4_grab_space(reserve, BA_CAN_COMMIT | BA_RESERVED)) ++ return RETERR(-ENOSPC); ++ ++ memset(&goodby_dots, 0, sizeof goodby_dots); ++ entry.obj = goodby_dots.d_inode = object; ++ goodby_dots.d_name.name = "."; ++ goodby_dots.d_name.len = 1; ++ result = rem_entry_common(object, &goodby_dots, &entry); ++ reiser4_free_dentry_fsdata(&goodby_dots); ++ if (unlikely(result != 0 && result != -ENOMEM && result != -ENOENT)) ++ /* only worth a warning ++ ++ "values of B will give rise to dom!\n" ++ -- v6src/s2/mv.c:89 ++ */ ++ warning("nikita-2252", "Cannot remove dot of %lli: %i", ++ (unsigned long long)get_inode_oid(object), result); ++ return 0; ++} ++ ++/* this is common implementation of attach method of dir plugin ++*/ ++int ++attach_common(struct inode *child UNUSED_ARG, struct inode *parent UNUSED_ARG) ++{ ++ assert("nikita-2647", child != NULL); ++ assert("nikita-2648", parent != NULL); ++ ++ return 0; ++} ++ ++/* this is common implementation of detach method of dir plugin ++ remove "..", decrease nlink on parent ++*/ ++int detach_common(struct inode *object, struct inode *parent) ++{ ++ int result; ++ struct dentry goodby_dots; ++ reiser4_dir_entry_desc entry; ++ ++ assert("nikita-2885", object != NULL); ++ assert("nikita-2886", !inode_get_flag(object, REISER4_NO_SD)); ++ ++ memset(&entry, 0, sizeof entry); ++ ++ /* NOTE-NIKITA this only works if @parent is -the- parent of ++ @object, viz. object whose key is stored in dotdot ++ entry. Wouldn't work with hard-links on directories. */ ++ memset(&goodby_dots, 0, sizeof goodby_dots); ++ entry.obj = goodby_dots.d_inode = parent; ++ goodby_dots.d_name.name = ".."; ++ goodby_dots.d_name.len = 2; ++ result = rem_entry_common(object, &goodby_dots, &entry); ++ reiser4_free_dentry_fsdata(&goodby_dots); ++ if (result == 0) { ++ /* the dot should be the only entry remaining at this time... */ ++ assert("nikita-3400", object->i_size == 1 && ++ (object->i_nlink >= 0 && object->i_nlink <= 2)); ++#if 0 ++ /* and, together with the only name directory can have, they ++ * provides for the last 2 remaining references. 
If we get ++ * here as part of error handling during mkdir, @object ++ * possibly has no name yet, so its nlink == 1. If we get here ++ * from rename (targeting empty directory), it has no name ++ * already, so its nlink == 1. */ ++ assert("nikita-3401", ++ object->i_nlink == 2 || object->i_nlink == 1); ++#endif ++ ++ /* decrement nlink of directory removed ".." pointed ++ to */ ++ reiser4_del_nlink(parent, NULL, 0); ++ } ++ return result; ++} ++ ++/* this is common implementation of estimate.add_entry method of ++ dir plugin ++ estimation of adding entry which supposes that entry is inserting a ++ unit into item ++*/ ++reiser4_block_nr estimate_add_entry_common(const struct inode * inode) ++{ ++ return estimate_one_insert_into_item(tree_by_inode(inode)); ++} ++ ++/* this is common implementation of estimate.rem_entry method of dir ++ plugin ++*/ ++reiser4_block_nr estimate_rem_entry_common(const struct inode * inode) ++{ ++ return estimate_one_item_removal(tree_by_inode(inode)); ++} ++ ++/* this is common implementation of estimate.unlink method of dir ++ plugin ++*/ ++reiser4_block_nr ++dir_estimate_unlink_common(const struct inode * parent, ++ const struct inode * object) ++{ ++ reiser4_block_nr res; ++ ++ /* hashed_rem_entry(object) */ ++ res = inode_dir_plugin(object)->estimate.rem_entry(object); ++ /* del_nlink(parent) */ ++ res += 2 * inode_file_plugin(parent)->estimate.update(parent); ++ ++ return res; ++} ++ ++/* ++ * helper for inode_ops ->lookup() and dir plugin's ->get_parent() ++ * methods: if @inode is a light-weight file, setup its credentials ++ * that are not stored in the stat-data in this case ++ */ ++void check_light_weight(struct inode *inode, struct inode *parent) ++{ ++ if (inode_get_flag(inode, REISER4_LIGHT_WEIGHT)) { ++ inode->i_uid = parent->i_uid; ++ inode->i_gid = parent->i_gid; ++ /* clear light-weight flag. If inode would be read by any ++ other name, [ug]id wouldn't change. */ ++ inode_clr_flag(inode, REISER4_LIGHT_WEIGHT); ++ } ++} ++ ++/* looks for name specified in @dentry in directory @parent and if name is ++ found - key of object found entry points to is stored in @entry->key */ ++int lookup_name(struct inode *parent, /* inode of directory to lookup for ++ * name in */ ++ struct dentry *dentry, /* name to look for */ ++ reiser4_key * key /* place to store key */ ) ++{ ++ int result; ++ coord_t *coord; ++ lock_handle lh; ++ const char *name; ++ int len; ++ reiser4_dir_entry_desc entry; ++ reiser4_dentry_fsdata *fsdata; ++ ++ assert("nikita-1247", parent != NULL); ++ assert("nikita-1248", dentry != NULL); ++ assert("nikita-1123", dentry->d_name.name != NULL); ++ assert("vs-1486", ++ dentry->d_op == &get_super_private(parent->i_sb)->ops.dentry); ++ ++ name = dentry->d_name.name; ++ len = dentry->d_name.len; ++ ++ if (!inode_dir_plugin(parent)->is_name_acceptable(parent, name, len)) ++ /* some arbitrary error code to return */ ++ return RETERR(-ENAMETOOLONG); ++ ++ fsdata = reiser4_get_dentry_fsdata(dentry); ++ if (IS_ERR(fsdata)) ++ return PTR_ERR(fsdata); ++ ++ coord = &fsdata->dec.entry_coord; ++ coord_clear_iplug(coord); ++ init_lh(&lh); ++ ++ /* find entry in a directory. This is plugin method. */ ++ result = find_entry(parent, dentry, &lh, ZNODE_READ_LOCK, &entry); ++ if (result == 0) { ++ /* entry was found, extract object key from it. */ ++ result = ++ WITH_COORD(coord, ++ item_plugin_by_coord(coord)->s.dir. 
++			       extract_key(coord, key));
++	}
++	done_lh(&lh);
++	return result;
++
++}
++
++/* helper for init_common(): estimate number of blocks to reserve */
++static reiser4_block_nr
++estimate_init(struct inode *parent, struct inode *object)
++{
++	reiser4_block_nr res = 0;
++
++	assert("vpf-321", parent != NULL);
++	assert("vpf-322", object != NULL);
++
++	/* hashed_add_entry(object) */
++	res += inode_dir_plugin(object)->estimate.add_entry(object);
++	/* reiser4_add_nlink(object) */
++	res += inode_file_plugin(object)->estimate.update(object);
++	/* hashed_add_entry(object) */
++	res += inode_dir_plugin(object)->estimate.add_entry(object);
++	/* reiser4_add_nlink(parent) */
++	res += inode_file_plugin(parent)->estimate.update(parent);
++
++	/* return the accumulated reservation rather than 0, so that
++	   the caller actually grabs the space estimated above */
++	return res;
++}
++
++/* helper function for init_common(). Create "." and ".." */
++static int create_dot_dotdot(struct inode *object /* object to create dot and
++						   * dotdot for */ ,
++			     struct inode *parent /* parent of @object */ )
++{
++	int result;
++	struct dentry dots_entry;
++	reiser4_dir_entry_desc entry;
++
++	assert("nikita-688", object != NULL);
++	assert("nikita-689", S_ISDIR(object->i_mode));
++	assert("nikita-691", parent != NULL);
++
++	/* We store dot and dotdot as normal directory entries. This is
++	   not necessary, because almost all information stored in them
++	   is already in the stat-data of the directory; the only thing
++	   missing is the objectid of the grand-parent directory, which
++	   could easily be added there as an extension.
++
++	   But it is done the way it is done, because not storing dot
++	   and dotdot will lead to the following complications:
++
++	   . special case handling in ->lookup().
++	   . addition of another extension to the sd.
++	   . dependency on key allocation policy for stat data.
++
++	 */
++
++	memset(&entry, 0, sizeof entry);
++	memset(&dots_entry, 0, sizeof dots_entry);
++	entry.obj = dots_entry.d_inode = object;
++	dots_entry.d_name.name = ".";
++	dots_entry.d_name.len = 1;
++	result = add_entry_common(object, &dots_entry, NULL, &entry);
++	reiser4_free_dentry_fsdata(&dots_entry);
++
++	if (result == 0) {
++		result = reiser4_add_nlink(object, object, 0);
++		if (result == 0) {
++			entry.obj = dots_entry.d_inode = parent;
++			dots_entry.d_name.name = "..";
++			dots_entry.d_name.len = 2;
++			result = add_entry_common(object,
++						  &dots_entry, NULL, &entry);
++			reiser4_free_dentry_fsdata(&dots_entry);
++			/* if creation of ".." failed, iput() will delete
++			   object with ".". */
++			if (result == 0) {
++				result = reiser4_add_nlink(parent, object, 0);
++				if (result != 0)
++					/*
++					 * if we failed to bump i_nlink, try
++					 * to remove ".."
++					 */
++					detach_common(object, parent);
++			}
++		}
++	}
++
++	if (result != 0) {
++		/*
++		 * in the case of error, at least update stat-data so that
++		 * ->i_nlink updates are not left lingering.
++		 */
++		reiser4_update_sd(object);
++		reiser4_update_sd(parent);
++	}
++
++	return result;
++}
++
++/*
++ * return 0 iff @coord contains a directory entry for the file with the name
++ * @name.
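++ *
++ * A positive return value means the name stored at @coord differs from
++ * @name (note the !!strcmp below); negative values are I/O errors.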
++ */ ++static int ++check_item(const struct inode *dir, const coord_t * coord, const char *name) ++{ ++ item_plugin *iplug; ++ char buf[DE_NAME_BUF_LEN]; ++ ++ iplug = item_plugin_by_coord(coord); ++ if (iplug == NULL) { ++ warning("nikita-1135", "Cannot get item plugin"); ++ print_coord("coord", coord, 1); ++ return RETERR(-EIO); ++ } else if (item_id_by_coord(coord) != ++ item_id_by_plugin(inode_dir_item_plugin(dir))) { ++ /* item id of current item does not match to id of items a ++ directory is built of */ ++ warning("nikita-1136", "Wrong item plugin"); ++ print_coord("coord", coord, 1); ++ return RETERR(-EIO); ++ } ++ assert("nikita-1137", iplug->s.dir.extract_name); ++ ++ /* Compare name stored in this entry with name we are looking for. ++ ++ NOTE-NIKITA Here should go code for support of something like ++ unicode, code tables, etc. ++ */ ++ return !!strcmp(name, iplug->s.dir.extract_name(coord, buf)); ++} ++ ++static int ++check_entry(const struct inode *dir, coord_t * coord, const struct qstr *name) ++{ ++ return WITH_COORD(coord, check_item(dir, coord, name->name)); ++} ++ ++/* ++ * argument package used by entry_actor to scan entries with identical keys. ++ */ ++typedef struct entry_actor_args { ++ /* name we are looking for */ ++ const char *name; ++ /* key of directory entry. entry_actor() scans through sequence of ++ * items/units having the same key */ ++ reiser4_key *key; ++ /* how many entries with duplicate key was scanned so far. */ ++ int non_uniq; ++#if REISER4_USE_COLLISION_LIMIT ++ /* scan limit */ ++ int max_non_uniq; ++#endif ++ /* return parameter: set to true, if ->name wasn't found */ ++ int not_found; ++ /* what type of lock to take when moving to the next node during ++ * scan */ ++ znode_lock_mode mode; ++ ++ /* last coord that was visited during scan */ ++ coord_t last_coord; ++ /* last node locked during scan */ ++ lock_handle last_lh; ++ /* inode of directory */ ++ const struct inode *inode; ++} entry_actor_args; ++ ++/* Function called by find_entry() to look for given name in the directory. */ ++static int entry_actor(reiser4_tree * tree UNUSED_ARG /* tree being scanned */ , ++ coord_t * coord /* current coord */ , ++ lock_handle * lh /* current lock handle */ , ++ void *entry_actor_arg /* argument to scan */ ) ++{ ++ reiser4_key unit_key; ++ entry_actor_args *args; ++ ++ assert("nikita-1131", tree != NULL); ++ assert("nikita-1132", coord != NULL); ++ assert("nikita-1133", entry_actor_arg != NULL); ++ ++ args = entry_actor_arg; ++ ++args->non_uniq; ++#if REISER4_USE_COLLISION_LIMIT ++ if (args->non_uniq > args->max_non_uniq) { ++ args->not_found = 1; ++ /* hash collision overflow. */ ++ return RETERR(-EBUSY); ++ } ++#endif ++ ++ /* ++ * did we just reach the end of the sequence of items/units with ++ * identical keys? ++ */ ++ if (!keyeq(args->key, unit_key_by_coord(coord, &unit_key))) { ++ assert("nikita-1791", ++ keylt(args->key, unit_key_by_coord(coord, &unit_key))); ++ args->not_found = 1; ++ args->last_coord.between = AFTER_UNIT; ++ return 0; ++ } ++ ++ coord_dup(&args->last_coord, coord); ++ /* ++ * did scan just moved to the next node? 
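++	 * If it did, drop the lock on the previous node and take a
++	 * long-term lock on the new one in the mode the caller requested.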
++ */ ++ if (args->last_lh.node != lh->node) { ++ int lock_result; ++ ++ /* ++ * if so, lock new node with the mode requested by the caller ++ */ ++ done_lh(&args->last_lh); ++ assert("nikita-1896", znode_is_any_locked(lh->node)); ++ lock_result = longterm_lock_znode(&args->last_lh, lh->node, ++ args->mode, ZNODE_LOCK_HIPRI); ++ if (lock_result != 0) ++ return lock_result; ++ } ++ return check_item(args->inode, coord, args->name); ++} ++ ++/* Look for given @name within directory @dir. ++ ++ This is called during lookup, creation and removal of directory ++ entries and on rename_common ++ ++ First calculate key that directory entry for @name would have. Search ++ for this key in the tree. If such key is found, scan all items with ++ the same key, checking name in each directory entry along the way. ++*/ ++int find_entry(struct inode *dir, /* directory to scan */ ++ struct dentry *de, /* name to search for */ ++ lock_handle * lh, /* resulting lock handle */ ++ znode_lock_mode mode, /* required lock mode */ ++ reiser4_dir_entry_desc * entry /* parameters of found directory ++ * entry */ ) ++{ ++ const struct qstr *name; ++ seal_t *seal; ++ coord_t *coord; ++ int result; ++ __u32 flags; ++ de_location *dec; ++ reiser4_dentry_fsdata *fsdata; ++ ++ assert("nikita-1130", lh != NULL); ++ assert("nikita-1128", dir != NULL); ++ ++ name = &de->d_name; ++ assert("nikita-1129", name != NULL); ++ ++ /* dentry private data don't require lock, because dentry ++ manipulations are protected by i_mutex on parent. ++ ++ This is not so for inodes, because there is no -the- parent in ++ inode case. ++ */ ++ fsdata = reiser4_get_dentry_fsdata(de); ++ if (IS_ERR(fsdata)) ++ return PTR_ERR(fsdata); ++ dec = &fsdata->dec; ++ ++ coord = &dec->entry_coord; ++ coord_clear_iplug(coord); ++ seal = &dec->entry_seal; ++ /* compose key of directory entry for @name */ ++ inode_dir_plugin(dir)->build_entry_key(dir, name, &entry->key); ++ ++ if (seal_is_set(seal)) { ++ /* check seal */ ++ result = seal_validate(seal, coord, &entry->key, ++ lh, mode, ZNODE_LOCK_LOPRI); ++ if (result == 0) { ++ /* key was found. Check that it is really item we are ++ looking for. */ ++ result = check_entry(dir, coord, name); ++ if (result == 0) ++ return 0; ++ } ++ } ++ flags = (mode == ZNODE_WRITE_LOCK) ? CBK_FOR_INSERT : 0; ++ /* ++ * find place in the tree where directory item should be located. ++ */ ++ result = object_lookup(dir, &entry->key, coord, lh, mode, ++ FIND_EXACT, LEAF_LEVEL, LEAF_LEVEL, flags, ++ NULL /*ra_info */ ); ++ if (result == CBK_COORD_FOUND) { ++ entry_actor_args arg; ++ ++ /* fast path: no hash collisions */ ++ result = check_entry(dir, coord, name); ++ if (result == 0) { ++ seal_init(seal, coord, &entry->key); ++ dec->pos = 0; ++ } else if (result > 0) { ++ /* Iterate through all units with the same keys. */ ++ arg.name = name->name; ++ arg.key = &entry->key; ++ arg.not_found = 0; ++ arg.non_uniq = 0; ++#if REISER4_USE_COLLISION_LIMIT ++ arg.max_non_uniq = max_hash_collisions(dir); ++ assert("nikita-2851", arg.max_non_uniq > 1); ++#endif ++ arg.mode = mode; ++ arg.inode = dir; ++ coord_init_zero(&arg.last_coord); ++ init_lh(&arg.last_lh); ++ ++ result = iterate_tree(tree_by_inode(dir), coord, lh, ++ entry_actor, &arg, mode, 1); ++ /* if end of the tree or extent was reached during ++ scanning. 
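++	   In that case step back to the last entry actually visited and
++	   report -ENOENT from there.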
*/ ++ if (arg.not_found || (result == -E_NO_NEIGHBOR)) { ++ /* step back */ ++ done_lh(lh); ++ ++ result = zload(arg.last_coord.node); ++ if (result == 0) { ++ coord_clear_iplug(&arg.last_coord); ++ coord_dup(coord, &arg.last_coord); ++ move_lh(lh, &arg.last_lh); ++ result = RETERR(-ENOENT); ++ zrelse(arg.last_coord.node); ++ --arg.non_uniq; ++ } ++ } ++ ++ done_lh(&arg.last_lh); ++ if (result == 0) ++ seal_init(seal, coord, &entry->key); ++ ++ if (result == 0 || result == -ENOENT) { ++ assert("nikita-2580", arg.non_uniq > 0); ++ dec->pos = arg.non_uniq - 1; ++ } ++ } ++ } else ++ dec->pos = -1; ++ return result; ++} ++ ++/* Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +Index: linux-2.6.16/fs/reiser4/plugin/disk_format/Makefile +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/plugin/disk_format/Makefile +@@ -0,0 +1,5 @@ ++obj-$(CONFIG_REISER4_FS) += df_plugins.o ++ ++df_plugins-objs := \ ++ disk_format40.o \ ++ disk_format.o +Index: linux-2.6.16/fs/reiser4/plugin/disk_format/disk_format.c +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/plugin/disk_format/disk_format.c +@@ -0,0 +1,37 @@ ++/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++#include "../../debug.h" ++#include "../plugin_header.h" ++#include "disk_format40.h" ++#include "disk_format.h" ++#include "../plugin.h" ++ ++/* initialization of disk layout plugins */ ++disk_format_plugin format_plugins[LAST_FORMAT_ID] = { ++ [FORMAT40_ID] = { ++ .h = { ++ .type_id = REISER4_FORMAT_PLUGIN_TYPE, ++ .id = FORMAT40_ID, ++ .pops = NULL, ++ .label = "reiser40", ++ .desc = "standard disk layout for reiser40", ++ .linkage = {NULL, NULL} ++ }, ++ .init_format = init_format_format40, ++ .root_dir_key = root_dir_key_format40, ++ .release = release_format40, ++ .log_super = log_super_format40, ++ .check_open = check_open_format40 ++ } ++}; ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ scroll-step: 1 ++ End: ++*/ +Index: linux-2.6.16/fs/reiser4/plugin/disk_format/disk_format.h +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/plugin/disk_format/disk_format.h +@@ -0,0 +1,27 @@ ++/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++/* identifiers for disk layouts, they are also used as indexes in array of disk ++ plugins */ ++ ++#if !defined( __REISER4_DISK_FORMAT_H__ ) ++#define __REISER4_DISK_FORMAT_H__ ++ ++typedef enum { ++ /* standard reiser4 disk layout plugin id */ ++ FORMAT40_ID, ++ LAST_FORMAT_ID ++} disk_format_id; ++ ++/* __REISER4_DISK_FORMAT_H__ */ ++#endif ++ ++/* Make Linus happy. 
++   Local variables:
++   c-indentation-style: "K&R"
++   mode-name: "LC"
++   c-basic-offset: 8
++   tab-width: 8
++   fill-column: 120
++   scroll-step: 1
++   End:
++*/
+Index: linux-2.6.16/fs/reiser4/plugin/disk_format/disk_format40.c
+===================================================================
+--- /dev/null
++++ linux-2.6.16/fs/reiser4/plugin/disk_format/disk_format40.c
+@@ -0,0 +1,556 @@
++/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
++
++#include "../../debug.h"
++#include "../../dformat.h"
++#include "../../key.h"
++#include "../node/node.h"
++#include "../space/space_allocator.h"
++#include "disk_format40.h"
++#include "../plugin.h"
++#include "../../txnmgr.h"
++#include "../../jnode.h"
++#include "../../tree.h"
++#include "../../super.h"
++#include "../../wander.h"
++#include "../../inode.h"
++#include "../../ktxnmgrd.h"
++#include "../../status_flags.h"
++
++#include <linux/types.h>	/* for __u?? */
++#include <linux/fs.h>		/* for struct super_block */
++#include <linux/buffer_head.h>
++
++/* reiser 4.0 default disk layout */
++
++/* Amount of free blocks needed to perform release_format40 when fs gets
++   mounted RW: 1 for SB, 1 for non-leaves in overwrite set, 2 for tx header
++   & tx record. */
++#define RELEASE_RESERVED 4
++
++/* functions to access fields of format40_disk_super_block */
++static __u64 get_format40_block_count(const format40_disk_super_block * sb)
++{
++	return le64_to_cpu(get_unaligned(&sb->block_count));
++}
++
++static __u64 get_format40_free_blocks(const format40_disk_super_block * sb)
++{
++	return le64_to_cpu(get_unaligned(&sb->free_blocks));
++}
++
++static __u64 get_format40_root_block(const format40_disk_super_block * sb)
++{
++	return le64_to_cpu(get_unaligned(&sb->root_block));
++}
++
++static __u16 get_format40_tree_height(const format40_disk_super_block * sb)
++{
++	return le16_to_cpu(get_unaligned(&sb->tree_height));
++}
++
++static __u64 get_format40_file_count(const format40_disk_super_block * sb)
++{
++	return le64_to_cpu(get_unaligned(&sb->file_count));
++}
++
++static __u64 get_format40_oid(const format40_disk_super_block * sb)
++{
++	return le64_to_cpu(get_unaligned(&sb->oid));
++}
++
++static __u32 get_format40_mkfs_id(const format40_disk_super_block * sb)
++{
++	return le32_to_cpu(get_unaligned(&sb->mkfs_id));
++}
++
++static __u64 get_format40_flags(const format40_disk_super_block * sb)
++{
++	return le64_to_cpu(get_unaligned(&sb->flags));
++}
++
++static format40_super_info *get_sb_info(struct super_block *super)
++{
++	return &get_super_private(super)->u.format40;
++}
++
++static int consult_diskmap(struct super_block *s)
++{
++	format40_super_info *info;
++	journal_location *jloc;
++
++	info = get_sb_info(s);
++	jloc = &get_super_private(s)->jloc;
++	/* Default format-specific locations, if there is nothing in
++	 * diskmap */
++	jloc->footer = FORMAT40_JOURNAL_FOOTER_BLOCKNR;
++	jloc->header = FORMAT40_JOURNAL_HEADER_BLOCKNR;
++	info->loc.super = FORMAT40_OFFSET / s->s_blocksize;
++#ifdef CONFIG_REISER4_BADBLOCKS
++	reiser4_get_diskmap_value(FORMAT40_PLUGIN_DISKMAP_ID, FORMAT40_JF,
++				  &jloc->footer);
++	reiser4_get_diskmap_value(FORMAT40_PLUGIN_DISKMAP_ID, FORMAT40_JH,
++				  &jloc->header);
++	reiser4_get_diskmap_value(FORMAT40_PLUGIN_DISKMAP_ID, FORMAT40_SUPER,
++				  &info->loc.super);
++#endif
++	return 0;
++}
++
++/* find any valid super block of disk_format40 (even if the first
++   super block is destroyed); this will change block numbers of the actual
++   journal header/footer (jf/jh) if needed */
++static struct buffer_head *find_a_disk_format40_super_block(struct
super_block ++ *s) ++{ ++ struct buffer_head *super_bh; ++ format40_disk_super_block *disk_sb; ++ format40_super_info *info; ++ ++ assert("umka-487", s != NULL); ++ ++ info = get_sb_info(s); ++ ++ super_bh = sb_bread(s, info->loc.super); ++ if (super_bh == NULL) ++ return ERR_PTR(RETERR(-EIO)); ++ ++ disk_sb = (format40_disk_super_block *) super_bh->b_data; ++ if (strncmp(disk_sb->magic, FORMAT40_MAGIC, sizeof(FORMAT40_MAGIC))) { ++ brelse(super_bh); ++ return ERR_PTR(RETERR(-EINVAL)); ++ } ++ ++ reiser4_set_block_count(s, le64_to_cpu(get_unaligned(&disk_sb->block_count))); ++ reiser4_set_data_blocks(s, le64_to_cpu(get_unaligned(&disk_sb->block_count)) - ++ le64_to_cpu(get_unaligned(&disk_sb->free_blocks))); ++ reiser4_set_free_blocks(s, le64_to_cpu(get_unaligned(&disk_sb->free_blocks))); ++ ++ return super_bh; ++} ++ ++/* find the most recent version of super block. This is called after journal is ++ replayed */ ++static struct buffer_head *read_super_block(struct super_block *s UNUSED_ARG) ++{ ++ /* Here the most recent superblock copy has to be read. However, as ++ journal replay isn't complete, we are using ++ find_a_disk_format40_super_block() function. */ ++ return find_a_disk_format40_super_block(s); ++} ++ ++static int get_super_jnode(struct super_block *s) ++{ ++ reiser4_super_info_data *sbinfo = get_super_private(s); ++ jnode *sb_jnode; ++ int ret; ++ ++ sb_jnode = alloc_io_head(&get_sb_info(s)->loc.super); ++ ++ ret = jload(sb_jnode); ++ ++ if (ret) { ++ drop_io_head(sb_jnode); ++ return ret; ++ } ++ ++ pin_jnode_data(sb_jnode); ++ jrelse(sb_jnode); ++ ++ sbinfo->u.format40.sb_jnode = sb_jnode; ++ ++ return 0; ++} ++ ++static void done_super_jnode(struct super_block *s) ++{ ++ jnode *sb_jnode = get_super_private(s)->u.format40.sb_jnode; ++ ++ if (sb_jnode) { ++ unpin_jnode_data(sb_jnode); ++ drop_io_head(sb_jnode); ++ } ++} ++ ++typedef enum format40_init_stage { ++ NONE_DONE = 0, ++ CONSULT_DISKMAP, ++ FIND_A_SUPER, ++ INIT_JOURNAL_INFO, ++ INIT_STATUS, ++ JOURNAL_REPLAY, ++ READ_SUPER, ++ KEY_CHECK, ++ INIT_OID, ++ INIT_TREE, ++ JOURNAL_RECOVER, ++ INIT_SA, ++ INIT_JNODE, ++ ALL_DONE ++} format40_init_stage; ++ ++static format40_disk_super_block *copy_sb(const struct buffer_head *super_bh) ++{ ++ format40_disk_super_block *sb_copy; ++ ++ sb_copy = kmalloc(sizeof(format40_disk_super_block), get_gfp_mask()); ++ if (sb_copy == NULL) ++ return ERR_PTR(RETERR(-ENOMEM)); ++ memcpy(sb_copy, ((format40_disk_super_block *) super_bh->b_data), ++ sizeof(format40_disk_super_block)); ++ return sb_copy; ++} ++ ++static int check_key_format(const format40_disk_super_block *sb_copy) ++{ ++ if (!equi(REISER4_LARGE_KEY, ++ get_format40_flags(sb_copy) & (1 << FORMAT40_LARGE_KEYS))) { ++ warning("nikita-3228", "Key format mismatch. " ++ "Only %s keys are supported.", ++ REISER4_LARGE_KEY ? 
"large" : "small"); ++ return RETERR(-EINVAL); ++ } ++ return 0; ++} ++ ++/** ++ * try_init_format40 ++ * @super: ++ * @stage: ++ * ++ */ ++static int try_init_format40(struct super_block *super, ++ format40_init_stage *stage) ++{ ++ int result; ++ struct buffer_head *super_bh; ++ reiser4_super_info_data *sbinfo; ++ format40_disk_super_block *sb_copy; ++ tree_level height; ++ reiser4_block_nr root_block; ++ node_plugin *nplug; ++ ++ assert("vs-475", super != NULL); ++ assert("vs-474", get_super_private(super)); ++ ++ *stage = NONE_DONE; ++ ++ result = consult_diskmap(super); ++ if (result) ++ return result; ++ *stage = CONSULT_DISKMAP; ++ ++ super_bh = find_a_disk_format40_super_block(super); ++ if (IS_ERR(super_bh)) ++ return PTR_ERR(super_bh); ++ brelse(super_bh); ++ *stage = FIND_A_SUPER; ++ ++ /* map jnodes for journal control blocks (header, footer) to disk */ ++ result = init_journal_info(super); ++ if (result) ++ return result; ++ *stage = INIT_JOURNAL_INFO; ++ ++ /* ok, we are sure that filesystem format is a format40 format */ ++ /* Now check it's state */ ++ result = reiser4_status_init(FORMAT40_STATUS_BLOCKNR); ++ if (result != 0 && result != -EINVAL) ++ /* -EINVAL means there is no magic, so probably just old ++ * fs. */ ++ return result; ++ *stage = INIT_STATUS; ++ ++ result = reiser4_status_query(NULL, NULL); ++ if (result == REISER4_STATUS_MOUNT_WARN) ++ printk("Warning, mounting filesystem with errors\n"); ++ if (result == REISER4_STATUS_MOUNT_RO) { ++ printk ++ ("Warning, mounting filesystem with fatal errors, forcing read-only mount\n"); ++ /* FIXME: here we should actually enforce read-only mount, ++ * only it is unsupported yet. */ ++ } ++ ++ result = reiser4_journal_replay(super); ++ if (result) ++ return result; ++ *stage = JOURNAL_REPLAY; ++ ++ super_bh = read_super_block(super); ++ if (IS_ERR(super_bh)) ++ return PTR_ERR(super_bh); ++ *stage = READ_SUPER; ++ ++ /* allocate and make a copy of format40_disk_super_block */ ++ sb_copy = copy_sb(super_bh); ++ brelse(super_bh); ++ if (IS_ERR(sb_copy)) ++ return PTR_ERR(sb_copy); ++ ++ /* make sure that key format of kernel and filesyste match */ ++ result = check_key_format(sb_copy); ++ if (result) { ++ kfree(sb_copy); ++ return result; ++ } ++ *stage = KEY_CHECK; ++ ++ result = oid_init_allocator(super, get_format40_file_count(sb_copy), ++ get_format40_oid(sb_copy)); ++ if (result) { ++ kfree(sb_copy); ++ return result; ++ } ++ *stage = INIT_OID; ++ ++ /* get things necessary to init reiser4_tree */ ++ root_block = get_format40_root_block(sb_copy); ++ height = get_format40_tree_height(sb_copy); ++ nplug = node_plugin_by_id(NODE40_ID); ++ ++ ++ /* initialize reiser4_super_info_data */ ++ sbinfo = get_super_private(super); ++ assert("", sbinfo->tree.super == super); ++ /* init reiser4_tree for the filesystem */ ++ result = init_tree(&sbinfo->tree, &root_block, height, nplug); ++ if (result) { ++ kfree(sb_copy); ++ return result; ++ } ++ *stage = INIT_TREE; ++ ++ /* ++ * initialize reiser4_super_info_data with data from format40 super ++ * block ++ */ ++ sbinfo->default_uid = 0; ++ sbinfo->default_gid = 0; ++ sbinfo->mkfs_id = get_format40_mkfs_id(sb_copy); ++ /* number of blocks in filesystem and reserved space */ ++ reiser4_set_block_count(super, get_format40_block_count(sb_copy)); ++ sbinfo->blocks_free = get_format40_free_blocks(sb_copy); ++ kfree(sb_copy); ++ ++ sbinfo->fsuid = 0; ++ sbinfo->fs_flags |= (1 << REISER4_ADG); /* hard links for directories ++ * are not supported */ ++ sbinfo->fs_flags |= (1 << 
REISER4_ONE_NODE_PLUGIN); /* all nodes in ++ * layout 40 are ++ * of one ++ * plugin */ ++ /* sbinfo->tmgr is initialized already */ ++ ++ /* recover sb data which were logged separately from sb block */ ++ ++ /* NOTE-NIKITA: reiser4_journal_recover_sb_data() calls ++ * oid_init_allocator() and reiser4_set_free_blocks() with new ++ * data. What's the reason to call them above? */ ++ result = reiser4_journal_recover_sb_data(super); ++ if (result != 0) ++ return result; ++ *stage = JOURNAL_RECOVER; ++ ++ /* ++ * Set number of used blocks. The number of used blocks is not stored ++ * neither in on-disk super block nor in the journal footer blocks. At ++ * this moment actual values of total blocks and free block counters ++ * are set in the reiser4 super block (in-memory structure) and we can ++ * calculate number of used blocks from them. ++ */ ++ reiser4_set_data_blocks(super, ++ reiser4_block_count(super) - ++ reiser4_free_blocks(super)); ++ ++#if REISER4_DEBUG ++ sbinfo->min_blocks_used = 16 /* reserved area */ + ++ 2 /* super blocks */ + ++ 2 /* journal footer and header */ ; ++#endif ++ ++ /* init disk space allocator */ ++ result = sa_init_allocator(get_space_allocator(super), super, NULL); ++ if (result) ++ return result; ++ *stage = INIT_SA; ++ ++ result = get_super_jnode(super); ++ if (result == 0) ++ *stage = ALL_DONE; ++ return result; ++} ++ ++/* plugin->u.format.get_ready */ ++int init_format_format40(struct super_block *s, void *data UNUSED_ARG) ++{ ++ int result; ++ format40_init_stage stage; ++ ++ result = try_init_format40(s, &stage); ++ switch (stage) { ++ case ALL_DONE: ++ assert("nikita-3458", result == 0); ++ break; ++ case INIT_JNODE: ++ done_super_jnode(s); ++ case INIT_SA: ++ sa_destroy_allocator(get_space_allocator(s), s); ++ case JOURNAL_RECOVER: ++ case INIT_TREE: ++ done_tree(&get_super_private(s)->tree); ++ case INIT_OID: ++ case KEY_CHECK: ++ case READ_SUPER: ++ case JOURNAL_REPLAY: ++ case INIT_STATUS: ++ reiser4_status_finish(); ++ case INIT_JOURNAL_INFO: ++ done_journal_info(s); ++ case FIND_A_SUPER: ++ case CONSULT_DISKMAP: ++ case NONE_DONE: ++ break; ++ default: ++ impossible("nikita-3457", "init stage: %i", stage); ++ } ++ ++ if (!rofs_super(s) && reiser4_free_blocks(s) < RELEASE_RESERVED) ++ return RETERR(-ENOSPC); ++ ++ return result; ++} ++ ++static void pack_format40_super(const struct super_block *s, char *data) ++{ ++ format40_disk_super_block *super_data = ++ (format40_disk_super_block *) data; ++ reiser4_super_info_data *sbinfo = get_super_private(s); ++ ++ assert("zam-591", data != NULL); ++ ++ put_unaligned(cpu_to_le64(reiser4_free_committed_blocks(s)), ++ &super_data->free_blocks); ++ put_unaligned(cpu_to_le64(sbinfo->tree.root_block), &super_data->root_block); ++ ++ put_unaligned(cpu_to_le64(oid_next(s)), &super_data->oid); ++ put_unaligned(cpu_to_le64(oids_used(s)), &super_data->file_count); ++ ++ put_unaligned(cpu_to_le16(sbinfo->tree.height), &super_data->tree_height); ++} ++ ++/* plugin->u.format.log_super ++ return a jnode which should be added to transaction when the super block ++ gets logged */ ++jnode *log_super_format40(struct super_block *s) ++{ ++ jnode *sb_jnode; ++ ++ sb_jnode = get_super_private(s)->u.format40.sb_jnode; ++ ++ jload(sb_jnode); ++ ++ pack_format40_super(s, jdata(sb_jnode)); ++ ++ jrelse(sb_jnode); ++ ++ return sb_jnode; ++} ++ ++/* plugin->u.format.release */ ++int release_format40(struct super_block *s) ++{ ++ int ret; ++ reiser4_super_info_data *sbinfo; ++ ++ sbinfo = get_super_private(s); ++ assert("zam-579", 
sbinfo != NULL);
++
++	if (!rofs_super(s)) {
++		ret = capture_super_block(s);
++		if (ret != 0)
++			warning("vs-898", "capture_super_block failed: %d",
++				ret);
++
++		ret = txnmgr_force_commit_all(s, 1);
++		if (ret != 0)
++			warning("jmacd-74438", "txn_force failed: %d", ret);
++
++		all_grabbed2free();
++	}
++
++	sa_destroy_allocator(&sbinfo->space_allocator, s);
++	done_journal_info(s);
++	done_super_jnode(s);
++
++	rcu_barrier();
++	done_tree(&sbinfo->tree);
++	/* call finish_rcu(), because some znode were "released" in
++	 * done_tree(). */
++	rcu_barrier();
++
++	return 0;
++}
++
++#define FORMAT40_ROOT_LOCALITY 41
++#define FORMAT40_ROOT_OBJECTID 42
++
++/* plugin->u.format.root_dir_key */
++const reiser4_key *root_dir_key_format40(const struct super_block *super
++					 UNUSED_ARG)
++{
++	static const reiser4_key FORMAT40_ROOT_DIR_KEY = {
++		.el = {
++			__constant_cpu_to_le64((FORMAT40_ROOT_LOCALITY << 4) | KEY_SD_MINOR),
++#if REISER4_LARGE_KEY
++			ON_LARGE_KEY(0ull,)
++#endif
++			__constant_cpu_to_le64(FORMAT40_ROOT_OBJECTID),
++			0ull
++		}
++	};
++
++	return &FORMAT40_ROOT_DIR_KEY;
++}
++
++/* plugin->u.format.check_open.
++   Check the opened object for validity. For now it checks the oid &
++   locality only; this can be improved later, and its work may depend on the
++   mount options. */
++int check_open_format40(const struct inode *object)
++{
++	oid_t max, oid;
++
++	max = oid_next(object->i_sb) - 1;
++
++	/* Check the oid. */
++	oid = get_inode_oid(object);
++	if (oid > max) {
++		warning("vpf-1360", "The object with the oid %llu "
++			"greater than the max used oid %llu found.",
++			(unsigned long long)oid, (unsigned long long)max);
++
++		return RETERR(-EIO);
++	}
++
++	/* Check the locality. */
++	oid = reiser4_inode_data(object)->locality_id;
++	if (oid > max) {
++		warning("vpf-1360", "The object with the locality %llu "
++			"greater than the max used oid %llu found.",
++			(unsigned long long)oid, (unsigned long long)max);
++
++		return RETERR(-EIO);
++	}
++
++	return 0;
++}
++
++/* Make Linus happy.
++   Local variables:
++   c-indentation-style: "K&R"
++   mode-name: "LC"
++   c-basic-offset: 8
++   tab-width: 8
++   fill-column: 120
++   scroll-step: 1
++   End:
++*/
+Index: linux-2.6.16/fs/reiser4/plugin/disk_format/disk_format40.h
+===================================================================
+--- /dev/null
++++ linux-2.6.16/fs/reiser4/plugin/disk_format/disk_format40.h
+@@ -0,0 +1,99 @@
++/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
++
++/* this file contains:
++   - definition of ondisk super block of standard disk layout for
++     reiser 4.0 (layout 40)
++   - definition of layout 40 specific portion of in-core super block
++   - declarations of functions implementing methods of layout plugin
++     for layout 40
++   - declarations of functions used to get/set fields in layout 40 super block
++*/
++
++#ifndef __DISK_FORMAT40_H__
++#define __DISK_FORMAT40_H__
++
++/* magic for default reiser4 layout */
++#define FORMAT40_MAGIC "ReIsEr40FoRmAt"
++#define FORMAT40_OFFSET (REISER4_MASTER_OFFSET + PAGE_CACHE_SIZE)
++
++#include "../../dformat.h"
++
++#include <linux/fs.h>	/* for struct super_block */
++
++typedef enum {
++	FORMAT40_LARGE_KEYS
++} format40_flags;
++
++/* ondisk super block for format 40.
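++   All fields are stored little-endian on disk (d16/d32/d64 from
++   dformat.h) and are read through the get_format40_* accessors above.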
It is 512 bytes long */ ++typedef struct format40_disk_super_block { ++ /* 0 */ d64 block_count; ++ /* number of block in a filesystem */ ++ /* 8 */ d64 free_blocks; ++ /* number of free blocks */ ++ /* 16 */ d64 root_block; ++ /* filesystem tree root block */ ++ /* 24 */ d64 oid; ++ /* smallest free objectid */ ++ /* 32 */ d64 file_count; ++ /* number of files in a filesystem */ ++ /* 40 */ d64 flushes; ++ /* number of times super block was ++ flushed. Needed if format 40 ++ will have few super blocks */ ++ /* 48 */ d32 mkfs_id; ++ /* unique identifier of fs */ ++ /* 52 */ char magic[16]; ++ /* magic string ReIsEr40FoRmAt */ ++ /* 68 */ d16 tree_height; ++ /* height of filesystem tree */ ++ /* 70 */ d16 formatting_policy; ++ /* 72 */ d64 flags; ++ /* 72 */ char not_used[432]; ++} format40_disk_super_block; ++ ++/* format 40 specific part of reiser4_super_info_data */ ++typedef struct format40_super_info { ++/* format40_disk_super_block actual_sb; */ ++ jnode *sb_jnode; ++ struct { ++ reiser4_block_nr super; ++ } loc; ++} format40_super_info; ++ ++/* Defines for journal header and footer respectively. */ ++#define FORMAT40_JOURNAL_HEADER_BLOCKNR \ ++ ((REISER4_MASTER_OFFSET / PAGE_CACHE_SIZE) + 3) ++ ++#define FORMAT40_JOURNAL_FOOTER_BLOCKNR \ ++ ((REISER4_MASTER_OFFSET / PAGE_CACHE_SIZE) + 4) ++ ++#define FORMAT40_STATUS_BLOCKNR \ ++ ((REISER4_MASTER_OFFSET / PAGE_CACHE_SIZE) + 5) ++ ++/* Diskmap declarations */ ++#define FORMAT40_PLUGIN_DISKMAP_ID ((REISER4_FORMAT_PLUGIN_TYPE<<16) | (FORMAT40_ID)) ++#define FORMAT40_SUPER 1 ++#define FORMAT40_JH 2 ++#define FORMAT40_JF 3 ++ ++/* declarations of functions implementing methods of layout plugin for ++ format 40. The functions theirself are in disk_format40.c */ ++int init_format_format40(struct super_block *, void *data); ++const reiser4_key *root_dir_key_format40(const struct super_block *); ++int release_format40(struct super_block *s); ++jnode *log_super_format40(struct super_block *s); ++int check_open_format40(const struct inode *object); ++ ++/* __DISK_FORMAT40_H__ */ ++#endif ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ scroll-step: 1 ++ End: ++*/ +Index: linux-2.6.16/fs/reiser4/plugin/fibration.c +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/plugin/fibration.c +@@ -0,0 +1,174 @@ ++/* Copyright 2004 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++/* Directory fibrations */ ++ ++/* ++ * Suppose we have a directory tree with sources of some project. During ++ * compilation .o files are created within this tree. This makes access ++ * to the original source files less efficient, because source files are ++ * now "diluted" by object files: default directory plugin uses prefix ++ * of a file name as a part of the key for directory entry (and this ++ * part is also inherited by the key of file body). This means that ++ * foo.o will be located close to foo.c and foo.h in the tree. ++ * ++ * To avoid this effect directory plugin fill highest 7 (unused ++ * originally) bits of the second component of the directory entry key ++ * by bit-pattern depending on the file name (see ++ * fs/reiser4/kassign.c:build_entry_key_common()). These bits are called ++ * "fibre". Fibre of the file name key is inherited by key of stat data ++ * and keys of file body (in the case of REISER4_LARGE_KEY). 
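++ *
++ * For example, with the "dot-o" fibration defined below, "foo.o" is
++ * assigned fibre 1 while "foo.c" and "foo.h" keep fibre 0, so object
++ * files collect after all other names in the directory ordering.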
++ * ++ * Fibre for a given file is chosen by per-directory fibration ++ * plugin. Names within given fibre are ordered lexicographically. ++ */ ++ ++#include "../debug.h" ++#include "plugin_header.h" ++#include "plugin.h" ++#include "../super.h" ++#include "../inode.h" ++ ++#include ++ ++static const int fibre_shift = 57; ++ ++#define FIBRE_NO(n) (((__u64)(n)) << fibre_shift) ++ ++/* ++ * Trivial fibration: all files of directory are just ordered ++ * lexicographically. ++ */ ++static __u64 fibre_trivial(const struct inode *dir, const char *name, int len) ++{ ++ return FIBRE_NO(0); ++} ++ ++/* ++ * dot-o fibration: place .o files after all others. ++ */ ++static __u64 fibre_dot_o(const struct inode *dir, const char *name, int len) ++{ ++ /* special treatment for .*\.o */ ++ if (len > 2 && name[len - 1] == 'o' && name[len - 2] == '.') ++ return FIBRE_NO(1); ++ else ++ return FIBRE_NO(0); ++} ++ ++/* ++ * ext.1 fibration: subdivide directory into 128 fibrations one for each ++ * 7bit extension character (file "foo.h" goes into fibre "h"), plus ++ * default fibre for the rest. ++ */ ++static __u64 fibre_ext_1(const struct inode *dir, const char *name, int len) ++{ ++ if (len > 2 && name[len - 2] == '.') ++ return FIBRE_NO(name[len - 1]); ++ else ++ return FIBRE_NO(0); ++} ++ ++/* ++ * ext.3 fibration: try to separate files with different 3-character ++ * extensions from each other. ++ */ ++static __u64 fibre_ext_3(const struct inode *dir, const char *name, int len) ++{ ++ if (len > 4 && name[len - 4] == '.') ++ return FIBRE_NO(name[len - 3] + name[len - 2] + name[len - 1]); ++ else ++ return FIBRE_NO(0); ++} ++ ++static int change_fibration(struct inode *inode, reiser4_plugin * plugin) ++{ ++ int result; ++ ++ assert("nikita-3503", inode != NULL); ++ assert("nikita-3504", plugin != NULL); ++ ++ assert("nikita-3505", is_reiser4_inode(inode)); ++ assert("nikita-3506", inode_dir_plugin(inode) != NULL); ++ assert("nikita-3507", ++ plugin->h.type_id == REISER4_FIBRATION_PLUGIN_TYPE); ++ ++ result = 0; ++ if (inode_fibration_plugin(inode) == NULL || ++ inode_fibration_plugin(inode)->h.id != plugin->h.id) { ++ if (is_dir_empty(inode) == 0) ++ result = ++ plugin_set_fibration(&reiser4_inode_data(inode)-> ++ pset, &plugin->fibration); ++ else ++ result = RETERR(-ENOTEMPTY); ++ ++ } ++ return result; ++} ++ ++static reiser4_plugin_ops fibration_plugin_ops = { ++ .init = NULL, ++ .load = NULL, ++ .save_len = NULL, ++ .save = NULL, ++ .change = change_fibration ++}; ++ ++/* fibration plugins */ ++fibration_plugin fibration_plugins[LAST_FIBRATION_ID] = { ++ [FIBRATION_LEXICOGRAPHIC] = { ++ .h = { ++ .type_id = REISER4_FIBRATION_PLUGIN_TYPE, ++ .id = FIBRATION_LEXICOGRAPHIC, ++ .pops = &fibration_plugin_ops, ++ .label = "lexicographic", ++ .desc = "no fibration", ++ .linkage = {NULL, NULL} ++ }, ++ .fibre = fibre_trivial ++ }, ++ [FIBRATION_DOT_O] = { ++ .h = { ++ .type_id = REISER4_FIBRATION_PLUGIN_TYPE, ++ .id = FIBRATION_DOT_O, ++ .pops = &fibration_plugin_ops, ++ .label = "dot-o", ++ .desc = "fibrate .o files separately", ++ .linkage = {NULL, NULL} ++ }, ++ .fibre = fibre_dot_o ++ }, ++ [FIBRATION_EXT_1] = { ++ .h = { ++ .type_id = REISER4_FIBRATION_PLUGIN_TYPE, ++ .id = FIBRATION_EXT_1, ++ .pops = &fibration_plugin_ops, ++ .label = "ext-1", ++ .desc = "fibrate file by single character extension", ++ .linkage = {NULL, NULL} ++ }, ++ .fibre = fibre_ext_1 ++ }, ++ [FIBRATION_EXT_3] = { ++ .h = { ++ .type_id = REISER4_FIBRATION_PLUGIN_TYPE, ++ .id = FIBRATION_EXT_3, ++ .pops = &fibration_plugin_ops, ++ 
.label = "ext-3", ++ .desc = "fibrate file by three character extension", ++ .linkage = {NULL, NULL} ++ }, ++ .fibre = fibre_ext_3 ++ } ++}; ++ ++/* ++ * Local variables: ++ * c-indentation-style: "K&R" ++ * mode-name: "LC" ++ * c-basic-offset: 8 ++ * tab-width: 8 ++ * fill-column: 79 ++ * End: ++ */ +Index: linux-2.6.16/fs/reiser4/plugin/fibration.h +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/plugin/fibration.h +@@ -0,0 +1,37 @@ ++/* Copyright 2004 by Hans Reiser, licensing governed by reiser4/README */ ++ ++/* Fibration plugin used by hashed directory plugin to segment content ++ * of directory. See fs/reiser4/plugin/fibration.c for more on this. */ ++ ++#if !defined( __FS_REISER4_PLUGIN_FIBRATION_H__ ) ++#define __FS_REISER4_PLUGIN_FIBRATION_H__ ++ ++#include "plugin_header.h" ++ ++typedef struct fibration_plugin { ++ /* generic fields */ ++ plugin_header h; ++ ++ __u64(*fibre) (const struct inode * dir, const char *name, int len); ++} fibration_plugin; ++ ++typedef enum { ++ FIBRATION_LEXICOGRAPHIC, ++ FIBRATION_DOT_O, ++ FIBRATION_EXT_1, ++ FIBRATION_EXT_3, ++ LAST_FIBRATION_ID ++} reiser4_fibration_id; ++ ++/* __FS_REISER4_PLUGIN_FIBRATION_H__ */ ++#endif ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +Index: linux-2.6.16/fs/reiser4/plugin/file/Makefile +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/plugin/file/Makefile +@@ -0,0 +1,7 @@ ++obj-$(CONFIG_REISER4_FS) += file_plugins.o ++ ++file_plugins-objs := \ ++ file.o \ ++ tail_conversion.o \ ++ symlink.o \ ++ cryptcompress.o +Index: linux-2.6.16/fs/reiser4/plugin/file/cryptcompress.c +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/plugin/file/cryptcompress.c +@@ -0,0 +1,3817 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by ++ reiser4/README */ ++ ++/* This file contains implementations of inode/file/address_space/file plugin ++ * operations specific for cryptcompress file plugin which manages files with ++ * compressed and encrypted bodies. "Cryptcompress file" is built of items of ++ * CTAIL_ID (see http://www.namesys.com/cryptcompress_design.html for details). 
++ */
++
++#include "../../page_cache.h"
++#include "../../inode.h"
++#include "../cluster.h"
++#include "../object.h"
++#include "../../tree_walk.h"
++#include "cryptcompress.h"
++
++#include <asm/scatterlist.h>
++#include <linux/pagevec.h>
++#include <asm/uaccess.h>
++#include <linux/swap.h>
++#include <linux/writeback.h>
++#include <linux/random.h>
++
++/* get cryptcompress specific portion of inode */
++cryptcompress_info_t *cryptcompress_inode_data(const struct inode *inode)
++{
++	return &reiser4_inode_data(inode)->file_plugin_data.cryptcompress_info;
++}
++
++/* plugin->u.file.init_inode_data */
++void
++init_inode_data_cryptcompress(struct inode *inode,
++			      reiser4_object_create_data * crd, int create)
++{
++	cryptcompress_info_t *data;
++
++	data = cryptcompress_inode_data(inode);
++	assert("edward-685", data != NULL);
++
++	memset(data, 0, sizeof(*data));
++
++	init_rwsem(&data->lock);
++	toggle_compression(data, 1);
++	init_inode_ordering(inode, crd, create);
++}
++
++#if REISER4_DEBUG
++int crc_inode_ok(struct inode *inode)
++{
++	if (cluster_shift_ok(inode_cluster_shift(inode)))
++		return 1;
++	assert("edward-686", 0);
++	return 0;
++}
++#endif
++
++static int check_cryptcompress(struct inode *inode)
++{
++	int result = 0;
++	assert("edward-1307", inode_compression_plugin(inode) != NULL);
++
++	if (inode_cluster_size(inode) < PAGE_CACHE_SIZE) {
++		warning("edward-1331",
++			"%s clusters are unsupported",
++			inode_cluster_plugin(inode)->h.label);
++		return RETERR(-EINVAL);
++	}
++
++	/* FIXME-EDWARD: init? or check? */
++	if (inode_compression_plugin(inode)->init)
++		result = inode_compression_plugin(inode)->init();
++	return result;
++}
++
++/* The following is a part of the reiser4 cipher key manager
++   which is called when opening/creating a cryptcompress file */
++
++/* get/set cipher key info */
++crypto_stat_t * inode_crypto_stat (struct inode * inode)
++{
++	assert("edward-90", inode != NULL);
++	assert("edward-91", reiser4_inode_data(inode) != NULL);
++	return cryptcompress_inode_data(inode)->crypt;
++}
++
++static void set_inode_crypto_stat (struct inode * inode, crypto_stat_t * stat)
++{
++	cryptcompress_inode_data(inode)->crypt = stat;
++}
++
++/* allocate a cipher key info */
++crypto_stat_t * alloc_crypto_stat (struct inode * inode)
++{
++	crypto_stat_t * info;
++	int fipsize;
++
++	assert("edward-1421", 0);
++	info = kmalloc(sizeof(*info), GFP_KERNEL);
++	if (!info)
++		return ERR_PTR(-ENOMEM);
++	memset(info, 0, sizeof (*info));
++	fipsize = inode_digest_plugin(inode)->fipsize;
++	info->keyid = kmalloc(fipsize, GFP_KERNEL);
++	if (!info->keyid) {
++		kfree(info);
++		return ERR_PTR(-ENOMEM);
++	}
++	return info;
++}
++
++#if 0
++/* allocate/free low-level info for cipher and digest
++   transforms */
++static int
++alloc_crypto_tfms(plugin_set * pset, crypto_stat_t * info)
++{
++	struct crypto_tfm * ret = NULL;
++	cipher_plugin * cplug = pset->cipher;
++	digest_plugin * dplug = pset->digest;
++
++	assert("edward-1363", info != NULL);
++	assert("edward-414", cplug != NULL);
++	assert("edward-415", dplug != NULL);
++
++	if (cplug->alloc) {
++		ret = cplug->alloc();
++		if (ret == NULL) {
++			warning("edward-1364",
++				"Cannot allocate info for %s\n",
++				cplug->h.desc);
++			return RETERR(-EINVAL);
++		}
++	}
++	info_set_tfm(info, CIPHER_TFM, ret);
++	if (dplug->alloc) {
++		ret = dplug->alloc();
++		if (ret == NULL) {
++			warning("edward-1365",
++				"Cannot allocate info for %s\n",
++				dplug->h.desc);
++			goto err;
++		}
++	}
++	info_set_tfm(info, DIGEST_TFM, ret);
++	return 0;
++ err:
++	if (cplug->free) {
++		cplug->free(info->tfma[CIPHER_TFM].tfm);
++		info_set_tfm(info, CIPHER_TFM, NULL);
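++		/* the digest tfm was never attached on this path, so
++		   releasing the cipher tfm is the only cleanup needed */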
++ } ++ return RETERR(-EINVAL); ++} ++#endif ++ ++static void ++free_crypto_tfms(crypto_stat_t * info) ++{ ++ assert("edward-1366", info != NULL); ++ if (!info_cipher_tfm(info)) ++ return; ++ info_cipher_plugin(info)->free(info_cipher_tfm(info)); ++ info_set_tfm(info, CIPHER_TFM, NULL); ++ info_digest_plugin(info)->free(info_digest_tfm(info)); ++ info_set_tfm(info, DIGEST_TFM, NULL); ++ return; ++} ++ ++#if 0 ++/* create a key fingerprint for disk stat-data */ ++static int create_keyid (crypto_stat_t * info, crypto_data_t * data) ++{ ++ int ret = -ENOMEM; ++ size_t blk, pad; ++ __u8 * dmem; ++ __u8 * cmem; ++ struct crypto_tfm * dtfm; ++ struct crypto_tfm * ctfm; ++ struct scatterlist sg; ++ ++ assert("edward-1422", 0); ++ assert("edward-1367", info != NULL); ++ assert("edward-1368", info->keyid != NULL); ++ ++ dtfm = info_digest_tfm(info); ++ ctfm = info_cipher_tfm(info); ++ ++ dmem = kmalloc((size_t)crypto_tfm_alg_digestsize(dtfm), ++ GFP_KERNEL); ++ if (!dmem) ++ goto exit1; ++ ++ blk = crypto_tfm_alg_blocksize(ctfm); ++ ++ pad = data->keyid_size % blk; ++ pad = (pad ? blk - pad : 0); ++ ++ cmem = kmalloc((size_t)data->keyid_size + pad, GFP_KERNEL); ++ if (!cmem) ++ goto exit2; ++ memcpy(cmem, data->keyid, data->keyid_size); ++ memset(cmem + data->keyid_size, 0, pad); ++ ++ sg.page = virt_to_page(cmem); ++ sg.offset = offset_in_page(cmem); ++ sg.length = data->keyid_size + pad; ++ ++ ret = crypto_cipher_encrypt(ctfm, &sg, &sg, data->keyid_size + pad); ++ if (ret) { ++ warning("edward-1369", ++ "encryption failed flags=%x\n", ctfm->crt_flags); ++ goto exit3; ++ } ++ crypto_digest_init (dtfm); ++ crypto_digest_update (dtfm, &sg, 1); ++ crypto_digest_final (dtfm, dmem); ++ memcpy(info->keyid, dmem, info_digest_plugin(info)->fipsize); ++ exit3: ++ kfree(cmem); ++ exit2: ++ kfree(dmem); ++ exit1: ++ return ret; ++} ++#endif ++ ++static void destroy_keyid(crypto_stat_t * info) ++{ ++ assert("edward-1370", info != NULL); ++ assert("edward-1371", info->keyid != NULL); ++ kfree(info->keyid); ++ return; ++} ++ ++static void free_crypto_stat (crypto_stat_t * info) ++{ ++ assert("edward-1372", info != NULL); ++ ++ free_crypto_tfms(info); ++ destroy_keyid(info); ++ kfree(info); ++} ++ ++#if 0 ++static void instantiate_crypto_stat(crypto_stat_t * info) ++{ ++ assert("edward-1373", info != NULL); ++ assert("edward-1374", info->inst == 0); ++ info->inst = 1; ++} ++#endif ++ ++static void uninstantiate_crypto_stat(crypto_stat_t * info) ++{ ++ assert("edward-1375", info != NULL); ++ info->inst = 0; ++} ++ ++static int crypto_stat_instantiated(crypto_stat_t * info) ++{ ++ return info->inst; ++} ++ ++static int inode_has_cipher_key(struct inode * inode) ++{ ++ assert("edward-1376", inode != NULL); ++ return inode_crypto_stat(inode) && ++ crypto_stat_instantiated(inode_crypto_stat(inode)); ++} ++ ++static void inode_free_crypto_stat (struct inode * inode) ++{ ++ uninstantiate_crypto_stat(inode_crypto_stat(inode)); ++ free_crypto_stat(inode_crypto_stat(inode)); ++} ++ ++static int need_cipher(struct inode * inode) ++{ ++ return inode_cipher_plugin(inode) != ++ cipher_plugin_by_id(NONE_CIPHER_ID); ++} ++ ++/* Create a crypto-stat and attach result to the @object. 
++ If success is returned, then low-level cipher info contains ++ an instantiated key */ ++#if 0 ++crypto_stat_t * ++create_crypto_stat(struct inode * object, ++ crypto_data_t * data /* this contains a (uninstantiated) ++ cipher key imported from user ++ space */) ++{ ++ int ret; ++ crypto_stat_t * info; ++ ++ assert("edward-1377", data != NULL); ++ assert("edward-1378", need_cipher(object)); ++ ++ if (inode_file_plugin(object) != ++ file_plugin_by_id(DIRECTORY_FILE_PLUGIN_ID)) ++ return ERR_PTR(-EINVAL); ++ ++ info = alloc_crypto_stat(object); ++ if (IS_ERR(info)) ++ return info; ++ ret = alloc_crypto_tfms(reiser4_inode_data(object)->pset, info); ++ if (ret) ++ goto err; ++ /* Someone can change plugins of the host (for example if ++ the host is a directory), so we keep the original ones ++ in the crypto-stat. */ ++ info_set_cipher_plugin(info, inode_cipher_plugin(object)); ++ info_set_digest_plugin(info, inode_digest_plugin(object)); ++ /* instantiating a key */ ++ ret = crypto_cipher_setkey(info_cipher_tfm(info), ++ data->key, ++ data->keysize); ++ if (ret) { ++ warning("edward-1379", ++ "setkey failed flags=%x\n", ++ info_cipher_tfm(info)->crt_flags); ++ goto err; ++ } ++ info->keysize = data->keysize; ++ ret = create_keyid(info, data); ++ if (ret) ++ goto err; ++ instantiate_crypto_stat(info); ++ return info; ++ err: ++ free_crypto_stat(info); ++ return ERR_PTR(ret); ++} ++#endif ++ ++/* increment/decrement a load counter when ++ attaching/detaching the crypto-stat to any object */ ++static void load_crypto_stat(crypto_stat_t * info) ++{ ++ assert("edward-1380", info != NULL); ++ inc_keyload_count(info); ++} ++ ++static void unload_crypto_stat(struct inode * inode) ++{ ++ crypto_stat_t * info = inode_crypto_stat(inode); ++ assert("edward-1381", info->keyload_count > 0); ++ ++ dec_keyload_count(inode_crypto_stat(inode)); ++ if (info->keyload_count == 0) ++ /* final release */ ++ inode_free_crypto_stat(inode); ++} ++ ++/* attach/detach an existing crypto-stat */ ++void attach_crypto_stat(struct inode * inode, crypto_stat_t * info) ++{ ++ assert("edward-1382", inode != NULL); ++ assert("edward-1383", info != NULL); ++ assert("edward-1384", inode_crypto_stat(inode) == NULL); ++ ++ set_inode_crypto_stat(inode, info); ++ load_crypto_stat(info); ++} ++ ++/* returns true, if crypto stat can be attached to the @host */ ++#if REISER4_DEBUG ++static int host_allows_crypto_stat(struct inode * host) ++{ ++ int ret; ++ file_plugin * fplug = inode_file_plugin(host); ++ ++ switch (fplug->h.id) { ++ case CRC_FILE_PLUGIN_ID: ++ ret = 1; ++ break; ++ default: ++ ret = 0; ++ } ++ return ret; ++} ++#endif /* REISER4_DEBUG */ ++ ++void detach_crypto_stat(struct inode * inode) ++{ ++ assert("edward-1385", inode != NULL); ++ assert("edward-1386", host_allows_crypto_stat(inode)); ++ ++ if (inode_crypto_stat(inode)) ++ unload_crypto_stat(inode); ++ set_inode_crypto_stat(inode, NULL); ++} ++ ++#if 0 ++ ++/* compare fingerprints of @child and @parent */ ++static int keyid_eq(crypto_stat_t * child, crypto_stat_t * parent) ++{ ++ return !memcmp(child->keyid, parent->keyid, info_digest_plugin(parent)->fipsize); ++} ++ ++/* check if a crypto-stat (which is bound to @parent) can be inherited */ ++int can_inherit_crypto_crc(struct inode *child, struct inode *parent) ++{ ++ if (!need_cipher(child)) ++ return 0; ++ /* the child is created */ ++ if (!inode_crypto_stat(child)) ++ return 1; ++ /* the child is looked up */ ++ if (!inode_crypto_stat(parent)) ++ return 0; ++ return (inode_cipher_plugin(child) == 
inode_cipher_plugin(parent) && ++ inode_digest_plugin(child) == inode_digest_plugin(parent) && ++ inode_crypto_stat(child)->keysize == inode_crypto_stat(parent)->keysize && ++ keyid_eq(inode_crypto_stat(child), inode_crypto_stat(parent))); ++} ++#endif ++ ++/* helper functions for ->create() method of the cryptcompress plugin */ ++static int inode_set_crypto(struct inode * object) ++{ ++ reiser4_inode * info; ++ if (!inode_crypto_stat(object)) { ++ if (need_cipher(object)) ++ return RETERR(-EINVAL); ++ /* the file is not to be encrypted */ ++ return 0; ++ } ++ info = reiser4_inode_data(object); ++ info->extmask |= (1 << CRYPTO_STAT); ++ info->plugin_mask |= (1 << PSET_CIPHER) | (1 << PSET_DIGEST); ++ return 0; ++} ++ ++static int ++inode_set_compression(struct inode * object) ++{ ++ int result = 0; ++ compression_plugin * cplug; ++ reiser4_inode * info = reiser4_inode_data(object); ++ ++ cplug = inode_compression_plugin(object); ++ ++ if (cplug->init != NULL) { ++ result = cplug->init(); ++ if (result) ++ return result; ++ } ++ info->plugin_mask |= (1 << PSET_COMPRESSION); ++ ++ return 0; ++} ++ ++static void ++inode_set_compression_mode(struct inode * object) ++{ ++ reiser4_inode * info = reiser4_inode_data(object); ++ ++ info->plugin_mask |= (1 << PSET_COMPRESSION_MODE); ++ return; ++} ++ ++static int inode_set_cluster(struct inode *object) ++{ ++ reiser4_inode *info; ++ cluster_plugin *cplug; ++ ++ assert("edward-696", object != NULL); ++ ++ info = reiser4_inode_data(object); ++ cplug = inode_cluster_plugin(object); ++ ++ if (cplug->shift < PAGE_CACHE_SHIFT) { ++ warning("edward-1320", ++ "Can not support %p clusters (less then page size)", ++ cplug->h.label); ++ return RETERR(-EINVAL); ++ } ++ info->plugin_mask |= (1 << PSET_CLUSTER); ++ return 0; ++} ++ ++/* ->destroy_inode() method of the cryptcompress plugin */ ++void destroy_inode_cryptcompress(struct inode * inode) ++{ ++ assert("edward-23", cryptcompress_inode_data(inode)->pgcount == 0); ++ detach_crypto_stat(inode); ++ return; ++} ++ ++/* ->create() method of the cryptcompress plugin ++ ++. install plugins ++. attach crypto info if specified ++. attach compression info if specified ++. 
attach cluster info ++*/ ++int ++create_cryptcompress(struct inode *object, struct inode *parent, ++ reiser4_object_create_data * data) ++{ ++ int result; ++ reiser4_inode *info; ++ ++ assert("edward-23", object != NULL); ++ assert("edward-24", parent != NULL); ++ assert("edward-30", data != NULL); ++ assert("edward-26", inode_get_flag(object, REISER4_NO_SD)); ++ assert("edward-27", data->id == CRC_FILE_PLUGIN_ID); ++ ++ info = reiser4_inode_data(object); ++ ++ assert("edward-29", info != NULL); ++ ++ /* set file bit */ ++ info->plugin_mask |= (1 << PSET_FILE); ++ ++ /* set crypto */ ++ result = inode_set_crypto(object); ++ if (result) ++ goto error; ++ /* set compression */ ++ result = inode_set_compression(object); ++ if (result) ++ goto error; ++ inode_set_compression_mode(object); ++ ++ /* set cluster info */ ++ result = inode_set_cluster(object); ++ if (result) ++ goto error; ++ /* set plugin mask */ ++ info->extmask |= (1 << PLUGIN_STAT); ++ ++ /* save everything in disk stat-data */ ++ result = write_sd_by_inode_common(object); ++ if (!result) ++ return 0; ++ error: ++ detach_crypto_stat(object); ++ return result; ++} ++ ++/* ->open() method of the cryptcompress plugin */ ++int open_cryptcompress(struct inode * inode, struct file * file) ++{ ++ struct inode * parent; ++ ++ assert("edward-1394", inode != NULL); ++ assert("edward-1395", file != NULL); ++ assert("edward-1396", file != NULL); ++ assert("edward-1397", file->f_dentry->d_inode == inode); ++ assert("edward-1398", file->f_dentry->d_parent != NULL); ++ assert("edward-1399", file->f_dentry->d_parent->d_inode != NULL); ++ assert("edward-698", ++ inode_file_plugin(inode) == ++ file_plugin_by_id(CRC_FILE_PLUGIN_ID)); ++ ++ if (!need_cipher(inode)) ++ /* the file is not to be ciphered */ ++ return 0; ++ parent = file->f_dentry->d_parent->d_inode; ++ if (!inode_has_cipher_key(inode)) ++ return RETERR(-EINVAL); ++ return 0; ++} ++ ++/* returns a blocksize, the attribute of a cipher algorithm */ ++static unsigned int ++cipher_blocksize(struct inode * inode) ++{ ++ assert("edward-758", need_cipher(inode)); ++ assert("edward-1400", inode_crypto_stat(inode) != NULL); ++ return crypto_tfm_alg_blocksize ++ (info_cipher_tfm(inode_crypto_stat(inode))); ++} ++ ++/* returns offset translated by scale factor of the crypto-algorithm */ ++static loff_t inode_scaled_offset (struct inode * inode, ++ const loff_t src_off /* input offset */) ++{ ++ assert("edward-97", inode != NULL); ++ ++ if (!need_cipher(inode) || ++ src_off == get_key_offset(min_key()) || ++ src_off == get_key_offset(max_key())) ++ return src_off; ++ ++ return inode_cipher_plugin(inode)->scale(inode, ++ cipher_blocksize(inode), ++ src_off); ++} ++ ++/* returns disk cluster size */ ++size_t inode_scaled_cluster_size(struct inode * inode) ++{ ++ assert("edward-110", inode != NULL); ++ ++ return inode_scaled_offset(inode, inode_cluster_size(inode)); ++} ++ ++static int new_cluster(reiser4_cluster_t * clust, struct inode *inode) ++{ ++ return (clust_to_off(clust->index, inode) >= inode->i_size); ++} ++ ++/* set number of cluster pages */ ++static void set_cluster_nrpages(reiser4_cluster_t * clust, struct inode *inode) ++{ ++ reiser4_slide_t *win; ++ ++ assert("edward-180", clust != NULL); ++ assert("edward-1040", inode != NULL); ++ ++ win = clust->win; ++ if (!win) { ++ /* NOTE-EDWARD: i_size should be protected */ ++ clust->nr_pages = ++ count_to_nrpages(fsize_to_count(clust, inode)); ++ return; ++ } ++ assert("edward-1176", clust->op != PCL_UNKNOWN); ++ assert("edward-1064", 
win->off + win->count + win->delta != 0); ++ ++ if (win->stat == HOLE_WINDOW && ++ win->off == 0 && win->count == inode_cluster_size(inode)) { ++ /* special case: we start write hole from fake cluster */ ++ clust->nr_pages = 0; ++ return; ++ } ++ clust->nr_pages = ++ count_to_nrpages(max_count(win->off + win->count + win->delta, ++ fsize_to_count(clust, inode))); ++ return; ++} ++ ++/* ->key_by_inode() method of the cryptcompress plugin */ ++/* see plugin/plugin.h for details */ ++int ++key_by_inode_cryptcompress(struct inode *inode, loff_t off, reiser4_key * key) ++{ ++ loff_t clust_off; ++ ++ assert("edward-64", inode != 0); ++ // assert("edward-112", ergo(off != get_key_offset(max_key()), !off_to_cloff(off, inode))); ++ /* don't come here with other offsets */ ++ ++ clust_off = ++ (off == ++ get_key_offset(max_key())? get_key_offset(max_key()) : ++ off_to_clust_to_off(off, inode)); ++ ++ key_by_inode_and_offset_common(inode, 0, key); ++ set_key_offset(key, ++ (__u64) (!inode_crypto_stat(inode) ? clust_off : ++ inode_scaled_offset(inode, clust_off))); ++ return 0; ++} ++ ++/* plugin->flow_by_inode */ ++int ++flow_by_inode_cryptcompress(struct inode *inode /* file to build flow for */ , ++ const char __user *buf /* user level buffer */ , ++ int user /* 1 if @buf is of user space, 0 - if it is ++ kernel space */ , ++ loff_t size /* buffer size */ , ++ loff_t off /* offset to start io from */ , ++ rw_op op /* READ or WRITE */ , ++ flow_t * f /* resulting flow */ ) ++{ ++ assert("edward-436", f != NULL); ++ assert("edward-149", inode != NULL); ++ assert("edward-150", inode_file_plugin(inode) != NULL); ++ assert("edward-151", ++ inode_file_plugin(inode)->key_by_inode == ++ key_by_inode_cryptcompress); ++ ++ f->length = size; ++ memcpy(&f->data, &buf, sizeof(buf)); ++ f->user = user; ++ f->op = op; ++ ++ if (op == WRITE_OP && user == 1) ++ return 0; ++ return key_by_inode_cryptcompress(inode, off, &f->key); ++} ++ ++static int ++crc_hint_validate(hint_t * hint, const reiser4_key * key, ++ znode_lock_mode lock_mode) ++{ ++ coord_t *coord; ++ ++ assert("edward-704", hint != NULL); ++ assert("edward-1089", !hint->ext_coord.valid); ++ assert("edward-706", hint->lh.owner == NULL); ++ ++ coord = &hint->ext_coord.coord; ++ ++ if (!hint || !hint_is_set(hint) || hint->mode != lock_mode) ++ /* hint either not set or set by different operation */ ++ return RETERR(-E_REPEAT); ++ ++ if (get_key_offset(key) != hint->offset) ++ /* hint is set for different key */ ++ return RETERR(-E_REPEAT); ++ ++ assert("edward-707", schedulable()); ++ ++ return seal_validate(&hint->seal, &hint->ext_coord.coord, ++ key, &hint->lh, lock_mode, ZNODE_LOCK_LOPRI); ++} ++ ++/* reserve disk space when writing a logical cluster */ ++static int reserve4cluster(struct inode *inode, reiser4_cluster_t *clust) ++{ ++ int result = 0; ++ ++ assert("edward-965", schedulable()); ++ assert("edward-439", inode != NULL); ++ assert("edward-440", clust != NULL); ++ assert("edward-441", clust->pages != NULL); ++ assert("edward-1261", get_current_context()->grabbed_blocks == 0); ++ ++ if (clust->nr_pages == 0) { ++ assert("edward-1152", clust->win != NULL); ++ assert("edward-1153", clust->win->stat == HOLE_WINDOW); ++ /* don't reserve space for fake disk clusteer */ ++ return 0; ++ } ++ assert("edward-442", jprivate(clust->pages[0]) != NULL); ++ ++ result = reiser4_grab_space_force(estimate_insert_cluster(inode) + ++ estimate_update_cluster(inode), ++ BA_CAN_COMMIT); ++ if (result) ++ return result; ++ clust->reserved = 1; ++ 
grabbed2cluster_reserved(estimate_insert_cluster(inode) + ++ estimate_update_cluster(inode)); ++#if REISER4_DEBUG ++ clust->reserved_prepped = estimate_update_cluster(inode); ++ clust->reserved_unprepped = estimate_insert_cluster(inode); ++#endif ++ /* there can be space grabbed by txnmgr_force_commit_all */ ++ all_grabbed2free(); ++ return 0; ++} ++ ++/* free reserved disk space if writing a logical cluster fails */ ++static void ++free_reserved4cluster(struct inode *inode, reiser4_cluster_t * clust, int count) ++{ ++ assert("edward-967", clust->reserved == 1); ++ ++ cluster_reserved2free(count); ++ clust->reserved = 0; ++} ++ ++/* The core search procedure of the cryptcompress plugin. ++ If returned value is not cbk_errored, then current znode is locked */ ++static int find_cluster_item(hint_t * hint, ++ const reiser4_key * key, /* key of the item we are ++ looking for */ ++ znode_lock_mode lock_mode /* which lock */ , ++ ra_info_t * ra_info, lookup_bias bias, __u32 flags) ++{ ++ int result; ++ reiser4_key ikey; ++ coord_t *coord = &hint->ext_coord.coord; ++ coord_t orig = *coord; ++ ++ assert("edward-152", hint != NULL); ++ ++ if (hint->ext_coord.valid == 0) { ++ result = crc_hint_validate(hint, key, lock_mode); ++ if (result == -E_REPEAT) ++ goto traverse_tree; ++ else if (result) { ++ assert("edward-1216", 0); ++ return result; ++ } ++ hint->ext_coord.valid = 1; ++ } ++ assert("edward-709", znode_is_any_locked(coord->node)); ++ ++ /* In-place lookup is going here, it means we just need to ++ check if next item of the @coord match to the @keyhint) */ ++ ++ if (equal_to_rdk(coord->node, key)) { ++ result = goto_right_neighbor(coord, &hint->lh); ++ if (result == -E_NO_NEIGHBOR) { ++ assert("edward-1217", 0); ++ return RETERR(-EIO); ++ } ++ if (result) ++ return result; ++ assert("edward-1218", equal_to_ldk(coord->node, key)); ++ } else { ++ coord->item_pos++; ++ coord->unit_pos = 0; ++ coord->between = AT_UNIT; ++ } ++ result = zload(coord->node); ++ if (result) ++ return result; ++ assert("edward-1219", !node_is_empty(coord->node)); ++ ++ if (!coord_is_existing_item(coord)) { ++ zrelse(coord->node); ++ goto not_found; ++ } ++ item_key_by_coord(coord, &ikey); ++ zrelse(coord->node); ++ if (!keyeq(key, &ikey)) ++ goto not_found; ++ return CBK_COORD_FOUND; ++ ++ not_found: ++ assert("edward-1220", coord->item_pos > 0); ++ //coord->item_pos--; ++ /* roll back */ ++ *coord = orig; ++ ON_DEBUG(coord_update_v(coord)); ++ return CBK_COORD_NOTFOUND; ++ ++ traverse_tree: ++ assert("edward-713", hint->lh.owner == NULL); ++ assert("edward-714", schedulable()); ++ ++ unset_hint(hint); ++ coord_init_zero(coord); ++ result = coord_by_key(current_tree, key, coord, &hint->lh, ++ lock_mode, bias, LEAF_LEVEL, LEAF_LEVEL, ++ CBK_UNIQUE | flags, ra_info); ++ if (cbk_errored(result)) ++ return result; ++ hint->ext_coord.valid = 1; ++ return result; ++} ++ ++/* This function is called by deflate[inflate] manager when ++ creating a transformed/plain stream to check if we should ++ create/cut some overhead. If this returns true, then @oh ++ contains the size of this overhead. 
++ */ ++static int ++need_cut_or_align(struct inode * inode, reiser4_cluster_t * clust, ++ rw_op rw, int * oh) ++{ ++ tfm_cluster_t * tc = &clust->tc; ++ switch (rw) { ++ case WRITE_OP: /* estimate align */ ++ *oh = tc->len % cipher_blocksize(inode); ++ if (*oh != 0) ++ return 1; ++ break; ++ case READ_OP: /* estimate cut */ ++ *oh = *(tfm_output_data(clust) + tc->len - 1); ++ break; ++ default: ++ impossible("edward-1401", "bad option"); ++ } ++ return (tc->len != tc->lsize); ++} ++ ++/* create/cut an overhead of transformed/plain stream */ ++static void ++align_or_cut_overhead(struct inode * inode, reiser4_cluster_t * clust, rw_op rw) ++{ ++ int oh; ++ cipher_plugin * cplug = inode_cipher_plugin(inode); ++ ++ assert("edward-1402", need_cipher(inode)); ++ ++ if (!need_cut_or_align(inode, clust, rw, &oh)) ++ return; ++ switch (rw) { ++ case WRITE_OP: /* do align */ ++ clust->tc.len += ++ cplug->align_stream(tfm_input_data(clust) + ++ clust->tc.len, clust->tc.len, ++ cipher_blocksize(inode)); ++ *(tfm_input_data(clust) + clust->tc.len - 1) = ++ cipher_blocksize(inode) - oh; ++ break; ++ case READ_OP: /* do cut */ ++ assert("edward-1403", oh <= cipher_blocksize(inode)); ++ clust->tc.len -= oh; ++ break; ++ default: ++ impossible("edward-1404", "bad option"); ++ } ++ return; ++} ++ ++/* the following two functions are to evaluate results ++ of compression transform */ ++static unsigned ++max_cipher_overhead(struct inode * inode) ++{ ++ if (!need_cipher(inode) || !inode_cipher_plugin(inode)->align_stream) ++ return 0; ++ return cipher_blocksize(inode); ++} ++ ++static int deflate_overhead(struct inode *inode) ++{ ++ return (inode_compression_plugin(inode)-> ++ checksum ? DC_CHECKSUM_SIZE : 0); ++} ++ ++static unsigned deflate_overrun(struct inode * inode, int ilen) ++{ ++ return coa_overrun(inode_compression_plugin(inode), ilen); ++} ++ ++/* Estimating compressibility of a logical cluster by various ++ policies represented by compression mode plugin. ++ If this returns false, then compressor won't be called for ++ the cluster of index @index. ++*/ ++static int should_compress(tfm_cluster_t * tc, cloff_t index, ++ struct inode *inode) ++{ ++ compression_plugin *cplug = inode_compression_plugin(inode); ++ compression_mode_plugin *mplug = inode_compression_mode_plugin(inode); ++ ++ assert("edward-1321", tc->len != 0); ++ assert("edward-1322", cplug != NULL); ++ assert("edward-1323", mplug != NULL); ++ ++ return /* estimate by size */ ++ (cplug->min_size_deflate ? ++ tc->len >= cplug->min_size_deflate() : ++ 1) && ++ /* estimate by compression mode plugin */ ++ (mplug->should_deflate ? ++ mplug->should_deflate(inode, index) : ++ 1); ++} ++ ++/* Evaluating results of compression transform. ++ Returns true, if we need to accept this results */ ++static int ++save_compressed(int size_before, int size_after, struct inode * inode) ++{ ++ return (size_after + deflate_overhead(inode) + ++ max_cipher_overhead(inode) < size_before); ++} ++ ++/* Guess result of the evaluation above */ ++static int ++need_inflate(reiser4_cluster_t * clust, struct inode *inode, ++ int encrypted /* is cluster encrypted */ ) ++{ ++ tfm_cluster_t *tc = &clust->tc; ++ ++ assert("edward-142", tc != 0); ++ assert("edward-143", inode != NULL); ++ ++ return tc->len < ++ (encrypted ? ++ inode_scaled_offset(inode, tc->lsize) : ++ tc->lsize); ++} ++ ++/* If results of compression were accepted, then we add ++ a checksum to catch possible disk cluster corruption. 
++ The following is a format of the data stored in disk clusters: ++ ++ data This is (transformed) logical cluster. ++ cipher_overhead This is created by ->align() method ++ of cipher plugin. May be absent. ++ checksum (4) This is created by ->checksum method ++ of compression plugin to check ++ integrity. May be absent. ++ ++ Crypto overhead format: ++ ++ data ++ control_byte (1) contains aligned overhead size: ++ 1 <= overhead <= cipher_blksize ++*/ ++/* Append a checksum at the end of a transformed stream */ ++static void dc_set_checksum(compression_plugin * cplug, tfm_cluster_t * tc) ++{ ++ __u32 checksum; ++ ++ assert("edward-1309", tc != NULL); ++ assert("edward-1310", tc->len > 0); ++ assert("edward-1311", cplug->checksum != NULL); ++ ++ checksum = cplug->checksum(tfm_stream_data(tc, OUTPUT_STREAM), tc->len); ++ put_unaligned(cpu_to_le32(checksum), ++ (d32 *)(tfm_stream_data(tc, OUTPUT_STREAM) + tc->len)); ++ tc->len += (int)DC_CHECKSUM_SIZE; ++} ++ ++/* Check a disk cluster checksum. ++ Returns 0 if checksum is correct, otherwise returns 1 */ ++static int dc_check_checksum(compression_plugin * cplug, tfm_cluster_t * tc) ++{ ++ assert("edward-1312", tc != NULL); ++ assert("edward-1313", tc->len > (int)DC_CHECKSUM_SIZE); ++ assert("edward-1314", cplug->checksum != NULL); ++ ++ if (cplug->checksum(tfm_stream_data(tc, INPUT_STREAM), ++ tc->len - (int)DC_CHECKSUM_SIZE) != ++ le32_to_cpu(get_unaligned((d32 *) ++ (tfm_stream_data(tc, INPUT_STREAM) ++ + tc->len - (int)DC_CHECKSUM_SIZE)))) { ++ warning("edward-156", ++ "Bad disk cluster checksum %d, (should be %d) Fsck?\n", ++ (int)le32_to_cpu ++ (get_unaligned((d32 *) ++ (tfm_stream_data(tc, INPUT_STREAM) + ++ tc->len - (int)DC_CHECKSUM_SIZE))), ++ (int)cplug->checksum ++ (tfm_stream_data(tc, INPUT_STREAM), ++ tc->len - (int)DC_CHECKSUM_SIZE)); ++ return 1; ++ } ++ tc->len -= (int)DC_CHECKSUM_SIZE; ++ return 0; ++} ++ ++/* get input/output stream for some transform action */ ++int grab_tfm_stream(struct inode * inode, tfm_cluster_t * tc, ++ tfm_stream_id id) ++{ ++ size_t size = inode_scaled_cluster_size(inode); ++ ++ assert("edward-901", tc != NULL); ++ assert("edward-1027", inode_compression_plugin(inode) != NULL); ++ ++ if (tc->act == TFM_WRITE_ACT) ++ size += deflate_overrun(inode, inode_cluster_size(inode)); ++ ++ if (!tfm_stream(tc, id) && id == INPUT_STREAM) ++ alternate_streams(tc); ++ if (!tfm_stream(tc, id)) ++ return alloc_tfm_stream(tc, size, id); ++ ++ assert("edward-902", tfm_stream_is_set(tc, id)); ++ ++ if (tfm_stream_size(tc, id) < size) ++ return realloc_tfm_stream(tc, size, id); ++ return 0; ++} ++ ++/* Common deflate manager */ ++int deflate_cluster(reiser4_cluster_t * clust, struct inode * inode) ++{ ++ int result = 0; ++ int compressed = 0; ++ int encrypted = 0; ++ tfm_cluster_t * tc = &clust->tc; ++ compression_plugin * coplug; ++ ++ assert("edward-401", inode != NULL); ++ assert("edward-903", tfm_stream_is_set(tc, INPUT_STREAM)); ++ assert("edward-1348", tc->act == TFM_WRITE_ACT); ++ assert("edward-498", !tfm_cluster_is_uptodate(tc)); ++ ++ coplug = inode_compression_plugin(inode); ++ if (should_compress(tc, clust->index, inode)) { ++ /* try to compress, discard bad results */ ++ __u32 dst_len; ++ compression_mode_plugin * mplug = ++ inode_compression_mode_plugin(inode); ++ assert("edward-602", coplug != NULL); ++ assert("edward-1423", coplug->compress != NULL); ++ ++ result = grab_coa(tc, coplug); ++ if (result) { ++ warning("edward-1424", ++ "alloc_coa failed with ret=%d, skipped compression", ++ result); ++ goto 
cipher; ++ } ++ result = grab_tfm_stream(inode, tc, OUTPUT_STREAM); ++ if (result) { ++ warning("edward-1425", ++ "alloc stream failed with ret=%d, skipped compression", ++ result); ++ goto cipher; ++ } ++ dst_len = tfm_stream_size(tc, OUTPUT_STREAM); ++ coplug->compress(get_coa(tc, coplug->h.id, tc->act), ++ tfm_input_data(clust), tc->len, ++ tfm_output_data(clust), &dst_len); ++ /* make sure we didn't overwrite extra bytes */ ++ assert("edward-603", ++ dst_len <= tfm_stream_size(tc, OUTPUT_STREAM)); ++ ++ /* evaluate results of compression transform */ ++ if (save_compressed(tc->len, dst_len, inode)) { ++ /* good result, accept */ ++ tc->len = dst_len; ++ if (mplug->accept_hook != NULL) { ++ result = mplug->accept_hook(inode, clust->index); ++ if (result) ++ warning("edward-1426", ++ "accept_hook failed with ret=%d", ++ result); ++ } ++ compressed = 1; ++ } ++ else { ++ /* bad result, discard */ ++#if REISER4_DEBUG ++ if (cluster_is_complete(clust, inode)) ++ warning("edward-1338", ++ "incompressible cluster %lu (inode %llu)", ++ clust->index, ++ (unsigned long long)get_inode_oid(inode)); ++#endif ++ if (mplug->discard_hook != NULL && ++ cluster_is_complete(clust, inode)) { ++ result = mplug->discard_hook(inode, ++ clust->index); ++ if (result) ++ warning("edward-1427", ++ "discard_hook failed with ret=%d", ++ result); ++ } ++ } ++ } ++ cipher: ++ if (need_cipher(inode)) { ++ cipher_plugin * ciplug; ++ struct crypto_tfm * tfm; ++ struct scatterlist src; ++ struct scatterlist dst; ++ ++ ciplug = inode_cipher_plugin(inode); ++ tfm = info_cipher_tfm(inode_crypto_stat(inode)); ++ if (compressed) ++ alternate_streams(tc); ++ result = grab_tfm_stream(inode, tc, OUTPUT_STREAM); ++ if (result) ++ return result; ++ ++ align_or_cut_overhead(inode, clust, WRITE_OP); ++ src.page = virt_to_page(tfm_input_data(clust)); ++ src.offset = offset_in_page(tfm_input_data(clust)); ++ src.length = tc->len; ++ ++ dst.page = virt_to_page(tfm_output_data(clust)); ++ dst.offset = offset_in_page(tfm_output_data(clust)); ++ dst.length = tc->len; ++ ++ result = crypto_cipher_encrypt(tfm, &dst, &src, tc->len); ++ if (result) { ++ warning("edward-1405", ++ "encryption failed flags=%x\n", tfm->crt_flags); ++ return result; ++ } ++ encrypted = 1; ++ } ++ if (compressed && coplug->checksum != NULL) ++ dc_set_checksum(coplug, tc); ++ if (!compressed && !encrypted) ++ alternate_streams(tc); ++ return result; ++} ++ ++/* Common inflate manager. 
*/ ++int inflate_cluster(reiser4_cluster_t * clust, struct inode * inode) ++{ ++ int result = 0; ++ int transformed = 0; ++ tfm_cluster_t * tc = &clust->tc; ++ compression_plugin * coplug; ++ ++ assert("edward-905", inode != NULL); ++ assert("edward-1178", clust->dstat == PREP_DISK_CLUSTER); ++ assert("edward-906", tfm_stream_is_set(&clust->tc, INPUT_STREAM)); ++ assert("edward-1349", tc->act == TFM_READ_ACT); ++ assert("edward-907", !tfm_cluster_is_uptodate(tc)); ++ ++ /* Handle a checksum (if any) */ ++ coplug = inode_compression_plugin(inode); ++ if (need_inflate(clust, inode, need_cipher(inode)) && ++ coplug->checksum != NULL) { ++ result = dc_check_checksum(coplug, tc); ++ if (result) ++ return RETERR(-EIO); ++ } ++ if (need_cipher(inode)) { ++ cipher_plugin * ciplug; ++ struct crypto_tfm * tfm; ++ struct scatterlist src; ++ struct scatterlist dst; ++ ++ ciplug = inode_cipher_plugin(inode); ++ tfm = info_cipher_tfm(inode_crypto_stat(inode)); ++ result = grab_tfm_stream(inode, tc, OUTPUT_STREAM); ++ if (result) ++ return result; ++ assert("edward-909", tfm_cluster_is_set(tc)); ++ ++ src.page = virt_to_page(tfm_input_data(clust)); ++ src.offset = offset_in_page(tfm_input_data(clust)); ++ src.length = tc->len; ++ ++ dst.page = virt_to_page(tfm_output_data(clust)); ++ dst.offset = offset_in_page(tfm_output_data(clust)); ++ dst.length = tc->len; ++ ++ result = crypto_cipher_decrypt(tfm, &dst, &src, tc->len); ++ if (result) ++ return result; ++ align_or_cut_overhead(inode, clust, READ_OP); ++ transformed = 1; ++ } ++ if (need_inflate(clust, inode, 0)) { ++ unsigned dst_len = inode_cluster_size(inode); ++ if(transformed) ++ alternate_streams(tc); ++ ++ result = grab_tfm_stream(inode, tc, OUTPUT_STREAM); ++ if (result) ++ return result; ++ assert("edward-1305", coplug->decompress != NULL); ++ assert("edward-910", tfm_cluster_is_set(tc)); ++ ++ coplug->decompress(get_coa(tc, coplug->h.id, tc->act), ++ tfm_input_data(clust), tc->len, ++ tfm_output_data(clust), &dst_len); ++ /* check length */ ++ tc->len = dst_len; ++ assert("edward-157", dst_len == tc->lsize); ++ transformed = 1; ++ } ++ if (!transformed) ++ alternate_streams(tc); ++ return result; ++} ++ ++/* This is implementation of readpage method of struct ++ address_space_operations for cryptcompress plugin. 
*/ ++int readpage_cryptcompress(struct file *file, struct page *page) ++{ ++ reiser4_context *ctx; ++ reiser4_cluster_t clust; ++ item_plugin *iplug; ++ int result; ++ ++ assert("edward-88", PageLocked(page)); ++ assert("vs-976", !PageUptodate(page)); ++ assert("edward-89", page->mapping && page->mapping->host); ++ ++ ctx = init_context(page->mapping->host->i_sb); ++ if (IS_ERR(ctx)) ++ return PTR_ERR(ctx); ++ result = check_cryptcompress(page->mapping->host); ++ if (result) { ++ unlock_page(page); ++ reiser4_exit_context(ctx); ++ return result; ++ } ++ assert("edward-113", ++ ergo(file != NULL, ++ page->mapping == file->f_dentry->d_inode->i_mapping)); ++ ++ if (PageUptodate(page)) { ++ warning("edward-1338", "page is already uptodate\n"); ++ reiser4_exit_context(ctx); ++ return 0; ++ } ++ cluster_init_read(&clust, NULL); ++ clust.file = file; ++ iplug = item_plugin_by_id(CTAIL_ID); ++ if (!iplug->s.file.readpage) { ++ unlock_page(page); ++ put_cluster_handle(&clust); ++ reiser4_exit_context(ctx); ++ return -EINVAL; ++ } ++ result = iplug->s.file.readpage(&clust, page); ++ if (result) ++ unlock_page(page); ++ assert("edward-64", ++ ergo(result == 0, (PageLocked(page) || PageUptodate(page)))); ++ put_cluster_handle(&clust); ++ reiser4_exit_context(ctx); ++ return result; ++} ++ ++/* how much pages will be captured */ ++static int cluster_nrpages_to_capture(reiser4_cluster_t * clust) ++{ ++ switch (clust->op) { ++ case PCL_APPEND: ++ return clust->nr_pages; ++ case PCL_TRUNCATE: ++ assert("edward-1179", clust->win != NULL); ++ return count_to_nrpages(clust->win->off + clust->win->count); ++ default: ++ impossible("edward-1180", "bad page cluster option"); ++ return 0; ++ } ++} ++ ++static void set_cluster_pages_dirty(reiser4_cluster_t * clust) ++{ ++ int i; ++ struct page *pg; ++ int nrpages = cluster_nrpages_to_capture(clust); ++ ++ for (i = 0; i < nrpages; i++) { ++ ++ pg = clust->pages[i]; ++ assert("edward-968", pg != NULL); ++ lock_page(pg); ++ assert("edward-1065", PageUptodate(pg)); ++ set_page_dirty_internal(pg); ++ unlock_page(pg); ++ mark_page_accessed(pg); ++ } ++} ++ ++static void clear_cluster_pages_dirty(reiser4_cluster_t * clust) ++{ ++ int i; ++ assert("edward-1275", clust != NULL); ++ ++ for (i = 0; i < clust->nr_pages; i++) { ++ assert("edward-1276", clust->pages[i] != NULL); ++ ++ lock_page(clust->pages[i]); ++ if (PageDirty(clust->pages[i])) { ++ assert("edward-1277", PageUptodate(clust->pages[i])); ++ clear_page_dirty_for_io(clust->pages[i]); ++ } ++#if REISER4_DEBUG ++ else ++ /* Race between flush and write: ++ some pages became clean when write() (or another ++ process which modifies data) capture the cluster. 
*/ ++ warning("edward-985", "Page of index %lu (inode %llu)" ++ " is not dirty\n", clust->pages[i]->index, ++ (unsigned long long)get_inode_oid(clust-> ++ pages[i]-> ++ mapping-> ++ host)); ++#endif ++ unlock_page(clust->pages[i]); ++ } ++} ++ ++/* update i_size by window */ ++static void inode_set_new_size(reiser4_cluster_t * clust, struct inode *inode) ++{ ++ loff_t size; ++ reiser4_slide_t *win; ++ ++ assert("edward-1181", clust != NULL); ++ assert("edward-1182", inode != NULL); ++ ++ win = clust->win; ++ assert("edward-1183", win != NULL); ++ ++ size = clust_to_off(clust->index, inode) + win->off; ++ ++ switch (clust->op) { ++ case PCL_APPEND: ++ if (size + win->count <= inode->i_size) ++ /* overwrite only */ ++ return; ++ size += win->count; ++ break; ++ case PCL_TRUNCATE: ++ break; ++ default: ++ impossible("edward-1184", "bad page cluster option"); ++ break; ++ } ++ inode_check_scale_nolock(inode, inode->i_size, size); ++ inode->i_size = size; ++ return; ++} ++ ++/* Check in page cluster modifications. ++ . Make jnode dirty, if it wasn't; ++ . Reserve space for a disk cluster update by flush algorithm, if needed; ++ . Clean up old references (if any). ++ . Put pages (grabbed in this thread) which will be truncated ++*/ ++static void ++make_cluster_jnode_dirty_locked(reiser4_cluster_t * clust, jnode * node, ++ loff_t * old_isize, struct inode *inode) ++{ ++ int i; ++ int old_nrpages; ++ int new_nrpages = cluster_nrpages_to_capture(clust); ++ ++ assert("edward-973", new_nrpages > 0); ++ assert("edward-221", node != NULL); ++ assert("edward-971", clust->reserved == 1); ++ assert_spin_locked(&(node->guard)); ++ assert("edward-972", node->page_count < cluster_nrpages(inode)); ++ assert("edward-1263", ++ clust->reserved_prepped == estimate_update_cluster(inode)); ++ assert("edward-1264", clust->reserved_unprepped == 0); ++ ++ if (JF_ISSET(node, JNODE_DIRTY)) { ++ /* someone has modified this cluster, but ++ the modifications are not committed yet */ ++ old_nrpages = ++ count_to_nrpages(cnt_to_clcnt(*old_isize, ++ clust->index, inode)); ++ /* free space which is already reserved */ ++ free_reserved4cluster(inode, clust, ++ estimate_update_cluster(inode)); ++ /* put old references */ ++ for (i = 0; i < old_nrpages; i++) { ++ assert("edward-975", clust->pages[i]); ++ assert("edward-1185", PageUptodate(clust->pages[i])); ++ ++ page_cache_release(clust->pages[i]); ++#if REISER4_DEBUG ++ cryptcompress_inode_data(inode)->pgcount --; ++#endif ++ } ++ } else { ++ /* no captured pages */ ++ assert("edward-1043", node->page_count == 0); ++ jnode_make_dirty_locked(node); ++ clust->reserved = 0; ++ } ++ /* put pages that will be truncated (if any) */ ++ for (i = new_nrpages; i < clust->nr_pages; i++) { ++ assert("edward-1433", clust->pages[i]); ++ assert("edward-1434", PageUptodate(clust->pages[i])); ++ page_cache_release(clust->pages[i]); ++#if REISER4_DEBUG ++ cryptcompress_inode_data(inode)->pgcount --; ++#endif ++ } ++#if REISER4_DEBUG ++ clust->reserved_prepped -= estimate_update_cluster(inode); ++ node->page_count = new_nrpages - 1; ++#endif ++ return; ++} ++ ++/* This function spawns a transaction and ++ is called by any thread as a final step in page cluster modification. 
++*/ ++static int try_capture_cluster(reiser4_cluster_t * clust, struct inode *inode) ++{ ++ int result = 0; ++ loff_t old_size; ++ jnode *node; ++ ++ assert("edward-1029", clust != NULL); ++ assert("edward-1030", clust->reserved == 1); ++ assert("edward-1031", clust->nr_pages != 0); ++ assert("edward-1032", clust->pages != NULL); ++ assert("edward-1033", clust->pages[0] != NULL); ++ ++ node = jprivate(clust->pages[0]); ++ ++ assert("edward-1035", node != NULL); ++ ++ spin_lock_jnode(node); ++ old_size = inode->i_size; ++ if (clust->win) ++ inode_set_new_size(clust, inode); ++ ++ result = try_capture(node, ZNODE_WRITE_LOCK, 0); ++ if (result) ++ goto exit; ++ make_cluster_jnode_dirty_locked(clust, node, &old_size, inode); ++ exit: ++ assert("edward-1034", !result); ++ spin_unlock_jnode(node); ++ jput(node); ++ return result; ++} ++ ++/* Collect unlocked cluster pages for any modifications and attach a jnode. ++ We allocate only one jnode per cluster, this jnode is binded to the first ++ page of this cluster, so we have an extra-reference that will exist with ++ this jnode, other references will be cleaned up in flush time. ++*/ ++static int ++grab_cluster_pages_jnode(struct inode *inode, reiser4_cluster_t * clust) ++{ ++ int i; ++ int result = 0; ++ jnode *node = NULL; ++ ++ assert("edward-182", clust != NULL); ++ assert("edward-183", clust->pages != NULL); ++ assert("edward-184", clust->nr_pages <= cluster_nrpages(inode)); ++ ++ if (clust->nr_pages == 0) ++ return 0; ++ ++ for (i = 0; i < clust->nr_pages; i++) { ++ ++ assert("edward-1044", clust->pages[i] == NULL); ++ ++ clust->pages[i] = ++ grab_cache_page(inode->i_mapping, ++ clust_to_pg(clust->index, inode) + i); ++ if (!clust->pages[i]) { ++ result = RETERR(-ENOMEM); ++ break; ++ } ++ if (i == 0) { ++ node = jnode_of_page(clust->pages[i]); ++ if (IS_ERR(node)) { ++ result = PTR_ERR(node); ++ unlock_page(clust->pages[i]); ++ break; ++ } ++ JF_SET(node, JNODE_CLUSTER_PAGE); ++ unlock_page(clust->pages[i]); ++ assert("edward-919", node); ++ continue; ++ } ++ unlock_page(clust->pages[i]); ++ } ++ if (result) { ++ while (i) ++ page_cache_release(clust->pages[--i]); ++ if (node && !IS_ERR(node)) ++ jput(node); ++ return result; ++ } ++ assert("edward-920", jprivate(clust->pages[0])); ++#if REISER4_DEBUG ++ cryptcompress_inode_data(inode)->pgcount += clust->nr_pages; ++#endif ++ return 0; ++} ++ ++/* Collect unlocked cluster pages only for read (not to modify) */ ++static int grab_cluster_pages(struct inode *inode, reiser4_cluster_t * clust) ++{ ++ int i; ++ int result = 0; ++ ++ assert("edward-1428", inode != NULL); ++ assert("edward-1429", inode->i_mapping != NULL); ++ assert("edward-787", clust != NULL); ++ assert("edward-788", clust->pages != NULL); ++ assert("edward-789", clust->nr_pages != 0); ++ assert("edward-790", clust->nr_pages <= cluster_nrpages(inode)); ++ ++ for (i = 0; i < clust->nr_pages; i++) { ++ clust->pages[i] = ++ grab_cache_page(inode->i_mapping, ++ clust_to_pg(clust->index, inode) + i); ++ if (!clust->pages[i]) { ++ result = RETERR(-ENOMEM); ++ break; ++ } ++ unlock_page(clust->pages[i]); ++ } ++ if (result) ++ while (i) ++ page_cache_release(clust->pages[--i]); ++ return result; ++} ++ ++/* @node might be attached by reiser4_writepage(), not by ++ cryptcompress plugin code, but emergency flush should ++ understand that pages of cryptcompress files are not ++ flushable. 
++*/ ++int jnode_of_cluster(const jnode * node, struct page * page) ++{ ++ assert("edward-1339", node != NULL); ++ assert("edward-1340", page != NULL); ++ assert("edward-1341", page->mapping != NULL); ++ assert("edward-1342", page->mapping->host != NULL); ++ assert("edward-1343", ++ ergo(jnode_is_unformatted(node), ++ get_inode_oid(page->mapping->host) == ++ node->key.j.objectid)); ++ if (inode_file_plugin(page->mapping->host) == ++ file_plugin_by_id(CRC_FILE_PLUGIN_ID)) { ++#if REISER4_DEBUG ++ if (!jnode_is_cluster_page(node)) ++ warning("edward-1345", ++ "inode %llu: cluster page of index %lu became private", ++ (unsigned long long)get_inode_oid(page->mapping->host), ++ page->index); ++#endif ++ return 1; ++ } ++ return 0; ++} ++ ++/* put cluster pages */ ++void release_cluster_pages(reiser4_cluster_t * clust) ++{ ++ int i; ++ ++ assert("edward-447", clust != NULL); ++ for (i = 0; i < clust->nr_pages; i++) { ++ ++ assert("edward-449", clust->pages[i] != NULL); ++ ++ page_cache_release(clust->pages[i]); ++ } ++} ++ ++/* this is called when something is failed */ ++static void release_cluster_pages_and_jnode(reiser4_cluster_t * clust) ++{ ++ jnode *node; ++ ++ assert("edward-445", clust != NULL); ++ assert("edward-922", clust->pages != NULL); ++ assert("edward-446", clust->pages[0] != NULL); ++ ++ node = jprivate(clust->pages[0]); ++ ++ assert("edward-447", node != NULL); ++ ++ release_cluster_pages(clust); ++ jput(node); ++} ++ ++#if REISER4_DEBUG ++static int window_ok(reiser4_slide_t * win, struct inode *inode) ++{ ++ assert("edward-1115", win != NULL); ++ assert("edward-1116", ergo(win->delta, win->stat == HOLE_WINDOW)); ++ ++ return (win->off != inode_cluster_size(inode)) && ++ (win->off + win->count + win->delta <= inode_cluster_size(inode)); ++} ++ ++static int cluster_ok(reiser4_cluster_t * clust, struct inode *inode) ++{ ++ assert("edward-279", clust != NULL); ++ ++ if (!clust->pages) ++ return 0; ++ return (clust->win ? window_ok(clust->win, inode) : 1); ++} ++#endif ++ ++/* guess next window stat */ ++static inline window_stat next_window_stat(reiser4_slide_t * win) ++{ ++ assert("edward-1130", win != NULL); ++ return ((win->stat == HOLE_WINDOW && win->delta == 0) ? 
++ HOLE_WINDOW : DATA_WINDOW); ++} ++ ++/* guess next cluster index and window params */ ++static void ++update_cluster(struct inode *inode, reiser4_cluster_t * clust, loff_t file_off, ++ loff_t to_file) ++{ ++ reiser4_slide_t *win; ++ ++ assert("edward-185", clust != NULL); ++ assert("edward-438", clust->pages != NULL); ++ assert("edward-281", cluster_ok(clust, inode)); ++ ++ win = clust->win; ++ if (!win) ++ return; ++ ++ switch (win->stat) { ++ case DATA_WINDOW: ++ /* increment window position */ ++ clust->index++; ++ win->stat = DATA_WINDOW; ++ win->off = 0; ++ win->count = min_count(inode_cluster_size(inode), to_file); ++ break; ++ case HOLE_WINDOW: ++ switch (next_window_stat(win)) { ++ case HOLE_WINDOW: ++ /* set window to fit the offset we start write from */ ++ clust->index = off_to_clust(file_off, inode); ++ win->stat = HOLE_WINDOW; ++ win->off = 0; ++ win->count = off_to_cloff(file_off, inode); ++ win->delta = ++ min_count(inode_cluster_size(inode) - win->count, ++ to_file); ++ break; ++ case DATA_WINDOW: ++ /* do not move the window, just change its state, ++ off+count+delta=inv */ ++ win->stat = DATA_WINDOW; ++ win->off = win->off + win->count; ++ win->count = win->delta; ++ win->delta = 0; ++ break; ++ default: ++ impossible("edward-282", "wrong next window state"); ++ } ++ break; ++ default: ++ impossible("edward-283", "wrong current window state"); ++ } ++ assert("edward-1068", cluster_ok(clust, inode)); ++} ++ ++static int update_sd_cryptcompress(struct inode *inode) ++{ ++ int result = 0; ++ ++ assert("edward-978", schedulable()); ++ assert("edward-1265", get_current_context()->grabbed_blocks == 0); ++ ++ result = reiser4_grab_space_force( /* one for stat data update */ ++ estimate_update_common(inode), ++ BA_CAN_COMMIT); ++ assert("edward-979", !result); ++ if (result) ++ return result; ++ inode->i_ctime = inode->i_mtime = CURRENT_TIME; ++ result = reiser4_update_sd(inode); ++ ++ all_grabbed2free(); ++ return result; ++} ++ ++ ++/* NOTE-Edward: this is too similar to reiser4/txnmgr.c:uncapture_jnode() */ ++static void uncapture_cluster_jnode(jnode * node) ++{ ++ txn_atom *atom; ++ ++ assert_spin_locked(&(node->guard)); ++ ++ /*jnode_make_clean(node); */ ++ atom = jnode_get_atom(node); ++ if (atom == NULL) { ++ assert("jmacd-7111", !JF_ISSET(node, JNODE_DIRTY)); ++ spin_unlock_jnode(node); ++ return; ++ } ++ ++ uncapture_block(node); ++ spin_unlock_atom(atom); ++ jput(node); ++} ++ ++void forget_cluster_pages(struct page **pages, int nr) ++{ ++ int i; ++ for (i = 0; i < nr; i++) { ++ ++ assert("edward-1045", pages[i] != NULL); ++ page_cache_release(pages[i]); ++ } ++} ++ ++/* Check out last modifications we are about to commit, ++ and prepare input stream for transform operations. ++*/ ++int ++flush_cluster_pages(reiser4_cluster_t * clust, jnode * node, ++ struct inode *inode) ++{ ++ int result = 0; ++ int i; ++ int nr_pages = 0; ++ tfm_cluster_t *tc = &clust->tc; ++ ++ assert("edward-980", node != NULL); ++ assert("edward-236", inode != NULL); ++ assert("edward-237", clust != NULL); ++ assert("edward-240", !clust->win); ++ assert("edward-241", schedulable()); ++ assert("edward-718", crc_inode_ok(inode)); ++ ++ result = grab_tfm_stream(inode, tc, INPUT_STREAM); ++ if (result) { ++ warning("edward-1430", ++ "alloc stream failed with ret=%d", result); ++ return result; ++ } ++ spin_lock_jnode(node); ++ assert("edward-1435", JF_ISSET(node, JNODE_DIRTY)); ++ ++ /* Check out a size of logical cluster and ++ set a number of cluster pages to commit. 
*/ ++ tc->len = tc->lsize = fsize_to_count(clust, inode); ++ clust->nr_pages = count_to_nrpages(tc->len); ++ ++ assert("edward-983", clust->nr_pages == node->page_count + 1); ++#if REISER4_DEBUG ++ node->page_count = 0; ++#endif ++ cluster_reserved2grabbed(estimate_update_cluster(inode)); ++ uncapture_cluster_jnode(node); ++ ++ assert("edward-1224", schedulable()); ++ /* Check out cluster pages to commit */ ++ nr_pages = ++ find_get_pages(inode->i_mapping, clust_to_pg(clust->index, inode), ++ clust->nr_pages, clust->pages); ++ ++ assert("edward-1280", nr_pages == clust->nr_pages); ++ /* Construct input stream from the checked out pages */ ++ for (i = 0; i < clust->nr_pages; i++) { ++ char *data; ++ ++ assert("edward-242", clust->pages[i] != NULL); ++ assert("edward-1436", clust->pages[i]->index == ++ clust_to_pg(clust->index, inode) + i); ++ assert("edward-1437", PageUptodate(clust->pages[i])); ++ /* flush the page into the input stream */ ++ lock_page(clust->pages[i]); ++ data = kmap(clust->pages[i]); ++ ++ assert("edward-986", cnt_to_pgcnt(tc->len, i) != 0); ++ ++ memcpy(tfm_stream_data(tc, INPUT_STREAM) + pg_to_off(i), ++ data, cnt_to_pgcnt(tc->len, i)); ++ kunmap(clust->pages[i]); ++ unlock_page(clust->pages[i]); ++ } ++ clear_cluster_pages_dirty(clust); ++ release_cluster_pages(clust); ++#if REISER4_DEBUG ++ cryptcompress_inode_data(inode)->pgcount -= clust->nr_pages; ++#endif ++ /* put pages that were found here */ ++ release_cluster_pages(clust); ++ return result; ++} ++ ++/* set hint for the cluster of the index @index */ ++static void set_hint_cluster(struct inode *inode, hint_t * hint, ++ cloff_t index, znode_lock_mode mode) ++{ ++ reiser4_key key; ++ assert("edward-722", crc_inode_ok(inode)); ++ assert("edward-723", ++ inode_file_plugin(inode) == ++ file_plugin_by_id(CRC_FILE_PLUGIN_ID)); ++ ++ inode_file_plugin(inode)->key_by_inode(inode, ++ clust_to_off(index, inode), ++ &key); ++ ++ seal_init(&hint->seal, &hint->ext_coord.coord, &key); ++ hint->offset = get_key_offset(&key); ++ hint->mode = mode; ++} ++ ++void invalidate_hint_cluster(reiser4_cluster_t * clust) ++{ ++ assert("edward-1291", clust != NULL); ++ assert("edward-1292", clust->hint != NULL); ++ ++ done_lh(&clust->hint->lh); ++ clust->hint->ext_coord.valid = 0; ++} ++ ++void put_hint_cluster(reiser4_cluster_t * clust, struct inode *inode, ++ znode_lock_mode mode) ++{ ++ assert("edward-1286", clust != NULL); ++ assert("edward-1287", clust->hint != NULL); ++ ++ set_hint_cluster(inode, clust->hint, clust->index + 1, mode); ++ invalidate_hint_cluster(clust); ++} ++ ++static int ++balance_dirty_page_cluster(reiser4_cluster_t * clust, struct inode *inode, ++ loff_t off, loff_t to_file) ++{ ++ int result; ++ ++ assert("edward-724", inode != NULL); ++ assert("edward-725", crc_inode_ok(inode)); ++ assert("edward-1272", get_current_context()->grabbed_blocks == 0); ++ ++ /* set next window params */ ++ update_cluster(inode, clust, off, to_file); ++ ++ result = update_sd_cryptcompress(inode); ++ assert("edward-988", !result); ++ if (result) ++ return result; ++ assert("edward-726", clust->hint->lh.owner == NULL); ++ ++ reiser4_throttle_write(inode); ++ all_grabbed2free(); ++ return 0; ++} ++ ++/* set zeroes to the cluster, update it, and maybe, try to capture its pages */ ++static int ++write_hole(struct inode *inode, reiser4_cluster_t * clust, loff_t file_off, ++ loff_t to_file) ++{ ++ char *data; ++ int result = 0; ++ unsigned cl_off, cl_count = 0; ++ unsigned to_pg, pg_off; ++ reiser4_slide_t *win; ++ ++ assert("edward-190", 
clust != NULL); ++ assert("edward-1069", clust->win != NULL); ++ assert("edward-191", inode != NULL); ++ assert("edward-727", crc_inode_ok(inode)); ++ assert("edward-1171", clust->dstat != INVAL_DISK_CLUSTER); ++ assert("edward-1154", ++ ergo(clust->dstat != FAKE_DISK_CLUSTER, clust->reserved == 1)); ++ ++ win = clust->win; ++ ++ assert("edward-1070", win != NULL); ++ assert("edward-201", win->stat == HOLE_WINDOW); ++ assert("edward-192", cluster_ok(clust, inode)); ++ ++ if (win->off == 0 && win->count == inode_cluster_size(inode)) { ++ /* the hole will be represented by fake disk cluster */ ++ update_cluster(inode, clust, file_off, to_file); ++ return 0; ++ } ++ cl_count = win->count; /* number of zeroes to write */ ++ cl_off = win->off; ++ pg_off = off_to_pgoff(win->off); ++ ++ while (cl_count) { ++ struct page *page; ++ page = clust->pages[off_to_pg(cl_off)]; ++ ++ assert("edward-284", page != NULL); ++ ++ to_pg = min_count(PAGE_CACHE_SIZE - pg_off, cl_count); ++ lock_page(page); ++ data = kmap_atomic(page, KM_USER0); ++ memset(data + pg_off, 0, to_pg); ++ flush_dcache_page(page); ++ kunmap_atomic(data, KM_USER0); ++ SetPageUptodate(page); ++ unlock_page(page); ++ ++ cl_off += to_pg; ++ cl_count -= to_pg; ++ pg_off = 0; ++ } ++ if (!win->delta) { ++ /* only zeroes, try to capture */ ++ ++ set_cluster_pages_dirty(clust); ++ result = try_capture_cluster(clust, inode); ++ if (result) ++ return result; ++ put_hint_cluster(clust, inode, ZNODE_WRITE_LOCK); ++ result = ++ balance_dirty_page_cluster(clust, inode, file_off, to_file); ++ } else ++ update_cluster(inode, clust, file_off, to_file); ++ return result; ++} ++ ++/* ++ The main disk search procedure for cryptcompress plugins, which ++ . scans all items of disk cluster ++ . maybe reads each one (if @read != 0) ++ . maybe makes its znode dirty (if @write != 0) ++ ++ NOTE-EDWARD: Callers should handle the case when disk cluster ++ is incomplete (-EIO) ++*/ ++int ++find_cluster(reiser4_cluster_t * clust, ++ struct inode *inode, int read, int write) ++{ ++ flow_t f; ++ hint_t *hint; ++ int result = 0; ++ unsigned long cl_idx; ++ ra_info_t ra_info; ++ file_plugin *fplug; ++ item_plugin *iplug; ++ tfm_cluster_t *tc; ++ int was_grabbed; ++ ++ assert("edward-138", clust != NULL); ++ assert("edward-728", clust->hint != NULL); ++ assert("edward-225", read || write); ++ assert("edward-226", schedulable()); ++ assert("edward-137", inode != NULL); ++ assert("edward-729", crc_inode_ok(inode)); ++ ++ hint = clust->hint; ++ cl_idx = clust->index; ++ fplug = inode_file_plugin(inode); ++ was_grabbed = get_current_context()->grabbed_blocks; ++ tc = &clust->tc; ++ ++ assert("edward-462", !tfm_cluster_is_uptodate(tc)); ++ assert("edward-461", ergo(read, tfm_stream_is_set(tc, INPUT_STREAM))); ++ ++ /* set key of the first disk cluster item */ ++ fplug->flow_by_inode(inode, ++ (read ? (char __user *)tfm_stream_data(tc, INPUT_STREAM) : NULL), ++ 0 /* kernel space */ , ++ inode_scaled_cluster_size(inode), ++ clust_to_off(cl_idx, inode), READ_OP, &f); ++ if (write) { ++ /* reserve for flush to make dirty all the leaf nodes ++ which contain disk cluster */ ++ result = ++ reiser4_grab_space_force(estimate_dirty_cluster(inode), ++ BA_CAN_COMMIT); ++ assert("edward-990", !result); ++ if (result) ++ goto out; ++ } ++ ++ ra_info.key_to_stop = f.key; ++ set_key_offset(&ra_info.key_to_stop, get_key_offset(max_key())); ++ ++ while (f.length) { ++ result = find_cluster_item(hint, ++ &f.key, ++ (write ? ZNODE_WRITE_LOCK : ++ ZNODE_READ_LOCK), NULL, FIND_EXACT, ++ (write ? 
CBK_FOR_INSERT : 0)); ++ switch (result) { ++ case CBK_COORD_NOTFOUND: ++ result = 0; ++ if (inode_scaled_offset ++ (inode, ++ clust_to_off(cl_idx, ++ inode)) == get_key_offset(&f.key)) { ++ /* first item not found, this is treated ++ as disk cluster is absent */ ++ clust->dstat = FAKE_DISK_CLUSTER; ++ goto out; ++ } ++ /* we are outside the cluster, stop search here */ ++ assert("edward-146", ++ f.length != inode_scaled_cluster_size(inode)); ++ goto ok; ++ case CBK_COORD_FOUND: ++ assert("edward-148", ++ hint->ext_coord.coord.between == AT_UNIT); ++ assert("edward-460", ++ hint->ext_coord.coord.unit_pos == 0); ++ ++ coord_clear_iplug(&hint->ext_coord.coord); ++ result = zload_ra(hint->ext_coord.coord.node, &ra_info); ++ if (unlikely(result)) ++ goto out; ++ iplug = item_plugin_by_coord(&hint->ext_coord.coord); ++ assert("edward-147", ++ item_id_by_coord(&hint->ext_coord.coord) == ++ CTAIL_ID); ++ ++ result = iplug->s.file.read(NULL, &f, hint); ++ if (result) { ++ zrelse(hint->ext_coord.coord.node); ++ goto out; ++ } ++ if (write) { ++ znode_make_dirty(hint->ext_coord.coord.node); ++ znode_set_convertible(hint->ext_coord.coord. ++ node); ++ } ++ zrelse(hint->ext_coord.coord.node); ++ break; ++ default: ++ goto out; ++ } ++ } ++ ok: ++ /* at least one item was found */ ++ /* NOTE-EDWARD: Callers should handle the case ++ when disk cluster is incomplete (-EIO) */ ++ tc->len = inode_scaled_cluster_size(inode) - f.length; ++ tc->lsize = fsize_to_count(clust, inode); ++ assert("edward-1196", tc->len > 0); ++ assert("edward-1406", tc->lsize > 0); ++ ++ if (hint_is_unprepped_dclust(clust->hint)) ++ clust->dstat = UNPR_DISK_CLUSTER; ++ else ++ clust->dstat = PREP_DISK_CLUSTER; ++ out: ++ assert("edward-1339", ++ get_current_context()->grabbed_blocks >= was_grabbed); ++ grabbed2free(get_current_context(), ++ get_current_super_private(), ++ get_current_context()->grabbed_blocks - was_grabbed); ++ return result; ++} ++ ++int ++get_disk_cluster_locked(reiser4_cluster_t * clust, struct inode *inode, ++ znode_lock_mode lock_mode) ++{ ++ reiser4_key key; ++ ra_info_t ra_info; ++ ++ assert("edward-730", schedulable()); ++ assert("edward-731", clust != NULL); ++ assert("edward-732", inode != NULL); ++ ++ if (clust->hint->ext_coord.valid) { ++ assert("edward-1293", clust->dstat != INVAL_DISK_CLUSTER); ++ assert("edward-1294", ++ znode_is_write_locked(clust->hint->lh.node)); ++ /* already have a valid locked position */ ++ return (clust->dstat == ++ FAKE_DISK_CLUSTER ? CBK_COORD_NOTFOUND : ++ CBK_COORD_FOUND); ++ } ++ key_by_inode_cryptcompress(inode, clust_to_off(clust->index, inode), ++ &key); ++ ra_info.key_to_stop = key; ++ set_key_offset(&ra_info.key_to_stop, get_key_offset(max_key())); ++ ++ return find_cluster_item(clust->hint, &key, lock_mode, NULL, FIND_EXACT, ++ CBK_FOR_INSERT); ++} ++ ++/* Read needed cluster pages before modifying. ++ If success, @clust->hint contains locked position in the tree. ++ Also: ++ . find and set disk cluster state ++ . make disk cluster dirty if its state is not FAKE_DISK_CLUSTER. 
++*/ ++static int ++read_some_cluster_pages(struct inode *inode, reiser4_cluster_t * clust) ++{ ++ int i; ++ int result = 0; ++ item_plugin *iplug; ++ reiser4_slide_t *win = clust->win; ++ ++ iplug = item_plugin_by_id(CTAIL_ID); ++ ++ assert("edward-733", get_current_context()->grabbed_blocks == 0); ++ assert("edward-924", !tfm_cluster_is_uptodate(&clust->tc)); ++ ++#if REISER4_DEBUG ++ if (clust->nr_pages == 0) { ++ /* start write hole from fake disk cluster */ ++ assert("edward-1117", win != NULL); ++ assert("edward-1118", win->stat == HOLE_WINDOW); ++ assert("edward-1119", new_cluster(clust, inode)); ++ } ++#endif ++ if (new_cluster(clust, inode)) { ++ /* ++ new page cluster is about to be written, nothing to read, ++ */ ++ assert("edward-734", schedulable()); ++ assert("edward-735", clust->hint->lh.owner == NULL); ++ ++ if (clust->nr_pages) { ++ int off; ++ char *data; ++ struct page * pg; ++ assert("edward-1419", clust->pages != NULL); ++ pg = clust->pages[clust->nr_pages - 1]; ++ assert("edward-1420", pg != NULL); ++ off = off_to_pgoff(win->off+win->count+win->delta); ++ if (off) { ++ lock_page(pg); ++ data = kmap_atomic(pg, KM_USER0); ++ memset(data + off, 0, PAGE_CACHE_SIZE - off); ++ flush_dcache_page(pg); ++ kunmap_atomic(data, KM_USER0); ++ unlock_page(pg); ++ } ++ } ++ clust->dstat = FAKE_DISK_CLUSTER; ++ return 0; ++ } ++ /* ++ Here we should search for disk cluster to figure out its real state. ++ Also there is one more important reason to do disk search: we need ++ to make disk cluster _dirty_ if it exists ++ */ ++ ++ /* if windows is specified, read the only pages ++ that will be modified partially */ ++ ++ for (i = 0; i < clust->nr_pages; i++) { ++ struct page *pg = clust->pages[i]; ++ ++ lock_page(pg); ++ if (PageUptodate(pg)) { ++ unlock_page(pg); ++ continue; ++ } ++ unlock_page(pg); ++ ++ if (win && ++ i >= count_to_nrpages(win->off) && ++ i < off_to_pg(win->off + win->count + win->delta)) ++ /* page will be completely overwritten */ ++ continue; ++ ++ if (win && (i == clust->nr_pages - 1) && ++ /* the last page is ++ partially modified, ++ not uptodate .. */ ++ (count_to_nrpages(inode->i_size) <= pg->index)) { ++ /* .. 
and appended, ++ so set zeroes to the rest */ ++ char *data; ++ int offset; ++ lock_page(pg); ++ data = kmap_atomic(pg, KM_USER0); ++ ++ assert("edward-1260", ++ count_to_nrpages(win->off + win->count + ++ win->delta) - 1 == i); ++ ++ offset = ++ off_to_pgoff(win->off + win->count + win->delta); ++ memset(data + offset, 0, PAGE_CACHE_SIZE - offset); ++ flush_dcache_page(pg); ++ kunmap_atomic(data, KM_USER0); ++ unlock_page(pg); ++ /* still not uptodate */ ++ break; ++ } ++ if (!tfm_cluster_is_uptodate(&clust->tc)) { ++ result = ctail_read_disk_cluster(clust, inode, 1); ++ assert("edward-992", !result); ++ if (result) ++ goto out; ++ assert("edward-925", ++ tfm_cluster_is_uptodate(&clust->tc)); ++ } ++ lock_page(pg); ++ result = do_readpage_ctail(inode, clust, pg); ++ unlock_page(pg); ++ assert("edward-993", !result); ++ if (result) { ++ impossible("edward-219", ++ "do_readpage_ctail returned crap"); ++ goto out; ++ } ++ } ++ if (!tfm_cluster_is_uptodate(&clust->tc)) { ++ /* disk cluster unclaimed, but we need to make its znodes dirty ++ to make flush update convert its content */ ++ result = ++ find_cluster(clust, inode, 0 /* do not read */ , ++ 1 /* write */ ); ++ assert("edward-994", !result); ++ } ++ out: ++ tfm_cluster_clr_uptodate(&clust->tc); ++ return result; ++} ++ ++static int ++should_create_unprepped_cluster(reiser4_cluster_t * clust, struct inode *inode) ++{ ++ assert("edward-737", clust != NULL); ++ ++ switch (clust->dstat) { ++ case PREP_DISK_CLUSTER: ++ case UNPR_DISK_CLUSTER: ++ return 0; ++ case FAKE_DISK_CLUSTER: ++ if (clust->win && ++ clust->win->stat == HOLE_WINDOW && clust->nr_pages == 0) { ++ assert("edward-1172", new_cluster(clust, inode)); ++ return 0; ++ } ++ return 1; ++ default: ++ impossible("edward-1173", "bad disk cluster state"); ++ return 0; ++ } ++} ++ ++static int ++crc_make_unprepped_cluster(reiser4_cluster_t * clust, struct inode *inode) ++{ ++ int result; ++ ++ assert("edward-1123", schedulable()); ++ assert("edward-737", clust != NULL); ++ assert("edward-738", inode != NULL); ++ assert("edward-739", crc_inode_ok(inode)); ++ assert("edward-1053", clust->hint != NULL); ++ assert("edward-1266", get_current_context()->grabbed_blocks == 0); ++ ++ if (clust->reserved) { ++ cluster_reserved2grabbed(estimate_insert_cluster(inode)); ++#if REISER4_DEBUG ++ assert("edward-1267", ++ clust->reserved_unprepped == ++ estimate_insert_cluster(inode)); ++ clust->reserved_unprepped -= estimate_insert_cluster(inode); ++#endif ++ } ++ if (!should_create_unprepped_cluster(clust, inode)) { ++ all_grabbed2free(); ++ return 0; ++ } else { ++ assert("edward-1268", clust->reserved == 1); ++ } ++ result = ctail_insert_unprepped_cluster(clust, inode); ++ all_grabbed2free(); ++ if (result) ++ return result; ++ ++ assert("edward-743", crc_inode_ok(inode)); ++ assert("edward-1269", get_current_context()->grabbed_blocks == 0); ++ assert("edward-744", znode_is_write_locked(clust->hint->lh.node)); ++ ++ clust->dstat = UNPR_DISK_CLUSTER; ++ return 0; ++} ++ ++#if REISER4_DEBUG ++static int jnode_truncate_ok(struct inode *inode, cloff_t index) ++{ ++ jnode *node; ++ node = ++ jlookup(current_tree, get_inode_oid(inode), ++ clust_to_pg(index, inode)); ++ if (likely(!node)) ++ return 1; ++ /* someone got this jnode */ ++ warning("edward-1315", "jnode %p is untruncated\n", node); ++ jput(node); ++ return (atomic_read(&node->x_count)); ++} ++#endif ++ ++/* Collect unlocked cluster pages and jnode (the last is in the ++ case when the page cluster will be modified and captured) */ ++int 
++prepare_page_cluster(struct inode *inode, reiser4_cluster_t * clust, ++ int capture) ++{ ++ assert("edward-177", inode != NULL); ++ assert("edward-741", crc_inode_ok(inode)); ++ assert("edward-740", clust->pages != NULL); ++ ++ set_cluster_nrpages(clust, inode); ++ reset_cluster_pgset(clust, cluster_nrpages(inode)); ++ return (capture ? ++ grab_cluster_pages_jnode(inode, clust) : ++ grab_cluster_pages(inode, clust)); ++} ++ ++/* Truncate all pages of the cluster of index @index. ++ This is called by ->kill_hook() method of item plugin */ ++void truncate_page_cluster(struct inode *inode, cloff_t index) ++{ ++ int i; ++ int found = 0; ++ int nr_pages; ++ jnode *node; ++ struct page *pages[MAX_CLUSTER_NRPAGES]; ++ ++ node = ++ jlookup(current_tree, get_inode_oid(inode), ++ clust_to_pg(index, inode)); ++ /* jnode is absent, just drop pages which can not ++ acquire jnode because of exclusive access */ ++ if (!node) { ++ truncate_inode_pages_range(inode->i_mapping, ++ clust_to_off(index, inode), ++ clust_to_off(index, ++ inode) + ++ inode_cluster_size(inode) - 1); ++ return; ++ } ++ /* jnode is present and may be dirty */ ++ nr_pages = count_to_nrpages(cnt_to_clcnt(inode->i_size, index, inode)); ++ ++ found = find_get_pages(inode->i_mapping, clust_to_pg(index, inode), ++ nr_pages, pages); ++ spin_lock_jnode(node); ++ if (JF_ISSET(node, JNODE_DIRTY)) { ++ /* someone has done modifications which are not ++ yet committed, so we need to release some resources */ ++ ++ /* free disk space grabbed for disk cluster converting */ ++ cluster_reserved2grabbed(estimate_update_cluster(inode)); ++ grabbed2free(get_current_context(), ++ get_current_super_private(), ++ estimate_update_cluster(inode)); ++ ++ assert("edward-1198", found == nr_pages); ++ assert("edward-1199", node->page_count + 1 == nr_pages); ++#if REISER4_DEBUG ++ node->page_count = 0; ++#endif ++ /* This will clear dirty bit */ ++ uncapture_cluster_jnode(node); ++ ++ /* put pages grabbed for last uncommitted modifications */ ++ for (i = 0; i < nr_pages; i++) { ++ assert("edward-1200", PageUptodate(pages[i])); ++ page_cache_release(pages[i]); ++#if REISER4_DEBUG ++ cryptcompress_inode_data(inode)->pgcount --; ++#endif ++ } ++ } else ++ spin_unlock_jnode(node); ++ /* FIXME-EDWARD: Use truncate_complete_page in the loop above instead */ ++ ++ jput(node); ++ /* put pages found here */ ++ forget_cluster_pages(pages, found); ++ truncate_inode_pages_range(inode->i_mapping, ++ clust_to_off(index, inode), ++ clust_to_off(index, ++ inode) + ++ inode_cluster_size(inode) - 1); ++ assert("edward-1201", jnode_truncate_ok(inode, index)); ++ return; ++} ++ ++/* Prepare cluster handle before(after) modifications ++ which are supposed to be committed. ++ ++ . grab cluster pages; ++ . reserve disk space; ++ . maybe read pages from disk and set the disk cluster dirty; ++ . maybe write hole; ++ . maybe create 'unprepped' disk cluster if the last one is fake ++ (i.e. 
is not represenred by any items) ++*/ ++ ++static int ++prepare_cluster(struct inode *inode, ++ loff_t file_off /* write position in the file */ , ++ loff_t to_file, /* bytes of users data to write to the file */ ++ reiser4_cluster_t * clust, page_cluster_op op) ++{ ++ int result = 0; ++ reiser4_slide_t *win = clust->win; ++ ++ assert("edward-1273", get_current_context()->grabbed_blocks == 0); ++ reset_cluster_params(clust); ++#if REISER4_DEBUG ++ clust->ctx = get_current_context(); ++#endif ++ assert("edward-1190", op != PCL_UNKNOWN); ++ ++ clust->op = op; ++ ++ result = prepare_page_cluster(inode, clust, 1); ++ if (result) ++ return result; ++ result = reserve4cluster(inode, clust); ++ if (result) ++ goto err1; ++ result = read_some_cluster_pages(inode, clust); ++ if (result) { ++ free_reserved4cluster(inode, ++ clust, ++ estimate_update_cluster(inode) + ++ estimate_insert_cluster(inode)); ++ goto err1; ++ } ++ assert("edward-1124", clust->dstat != INVAL_DISK_CLUSTER); ++ ++ result = crc_make_unprepped_cluster(clust, inode); ++ if (result) ++ goto err2; ++ if (win && win->stat == HOLE_WINDOW) { ++ result = write_hole(inode, clust, file_off, to_file); ++ if (result) ++ goto err2; ++ } ++ return 0; ++ err2: ++ free_reserved4cluster(inode, clust, ++ estimate_update_cluster(inode)); ++ err1: ++ release_cluster_pages_and_jnode(clust); ++ assert("edward-1125", result == -ENOSPC); ++ return result; ++} ++ ++/* set window by two offsets */ ++static void ++set_window(reiser4_cluster_t * clust, reiser4_slide_t * win, ++ struct inode *inode, loff_t o1, loff_t o2) ++{ ++ assert("edward-295", clust != NULL); ++ assert("edward-296", inode != NULL); ++ assert("edward-1071", win != NULL); ++ assert("edward-297", o1 <= o2); ++ ++ clust->index = off_to_clust(o1, inode); ++ ++ win->off = off_to_cloff(o1, inode); ++ win->count = min_count(inode_cluster_size(inode) - win->off, o2 - o1); ++ win->delta = 0; ++ ++ clust->win = win; ++} ++ ++static int ++set_cluster_by_window(struct inode *inode, reiser4_cluster_t * clust, ++ reiser4_slide_t * win, flow_t * f, loff_t file_off) ++{ ++ int result; ++ ++ assert("edward-197", clust != NULL); ++ assert("edward-1072", win != NULL); ++ assert("edward-198", inode != NULL); ++ ++ result = alloc_cluster_pgset(clust, cluster_nrpages(inode)); ++ if (result) ++ return result; ++ ++ if (file_off > inode->i_size) { ++ /* Uhmm, hole in cryptcompress file... */ ++ loff_t hole_size; ++ hole_size = file_off - inode->i_size; ++ ++ set_window(clust, win, inode, inode->i_size, file_off); ++ win->stat = HOLE_WINDOW; ++ if (win->off + hole_size < inode_cluster_size(inode)) ++ /* there is also user's data to append to the hole */ ++ win->delta = ++ min_count(inode_cluster_size(inode) - ++ (win->off + win->count), f->length); ++ return 0; ++ } ++ set_window(clust, win, inode, file_off, file_off + f->length); ++ win->stat = DATA_WINDOW; ++ return 0; ++} ++ ++int set_cluster_by_page(reiser4_cluster_t * clust, struct page * page, ++ int count) ++{ ++ int result = 0; ++ int (*setting_actor)(reiser4_cluster_t * clust, int count); ++ ++ assert("edward-1358", clust != NULL); ++ assert("edward-1359", page != NULL); ++ assert("edward-1360", page->mapping != NULL); ++ assert("edward-1361", page->mapping->host != NULL); ++ ++ setting_actor = (clust->pages ? 
reset_cluster_pgset : alloc_cluster_pgset); ++ result = setting_actor(clust, count); ++ clust->index = pg_to_clust(page->index, page->mapping->host); ++ return result; ++} ++ ++/* reset all the params that not get updated */ ++void reset_cluster_params(reiser4_cluster_t * clust) ++{ ++ assert("edward-197", clust != NULL); ++ ++ clust->dstat = INVAL_DISK_CLUSTER; ++ clust->tc.uptodate = 0; ++ clust->tc.len = 0; ++} ++ ++/* Core write procedure of cryptcompress plugin, which slices user's ++ flow into logical clusters, maps the last ones to the appropriate ++ page clusters, and tries to capture them. ++ If @buf != NULL, returns number of successfully written bytes, ++ otherwise returns error ++*/ ++static loff_t ++write_cryptcompress_flow(struct file *file, struct inode *inode, ++ const char __user *buf, size_t count, loff_t pos) ++{ ++ int i; ++ flow_t f; ++ hint_t *hint; ++ int result = 0; ++ size_t to_write = 0; ++ loff_t file_off; ++ reiser4_slide_t win; ++ reiser4_cluster_t clust; ++ ++ assert("edward-161", schedulable()); ++ assert("edward-748", crc_inode_ok(inode)); ++ assert("edward-159", current_blocksize == PAGE_CACHE_SIZE); ++ assert("edward-1274", get_current_context()->grabbed_blocks == 0); ++ ++ result = check_cryptcompress(inode); ++ if (result) ++ return result; ++ hint = kmalloc(sizeof(*hint), GFP_KERNEL); ++ if (hint == NULL) ++ return RETERR(-ENOMEM); ++ ++ result = load_file_hint(file, hint); ++ if (result) { ++ kfree(hint); ++ return result; ++ } ++ ++ result = ++ flow_by_inode_cryptcompress(inode, buf, 1 /* user space */ , ++ count, pos, WRITE_OP, &f); ++ if (result) ++ goto out; ++ to_write = f.length; ++ ++ /* current write position in file */ ++ file_off = pos; ++ reiser4_slide_init(&win); ++ cluster_init_read(&clust, &win); ++ clust.hint = hint; ++ ++ result = set_cluster_by_window(inode, &clust, &win, &f, file_off); ++ if (result) ++ goto out; ++ ++ if (next_window_stat(&win) == HOLE_WINDOW) { ++ result = ++ prepare_cluster(inode, file_off, f.length, &clust, ++ PCL_APPEND); ++ if (result) ++ goto out; ++ } ++ do { ++ char *src; ++ unsigned page_off, page_count; ++ ++ assert("edward-750", schedulable()); ++ ++ result = ++ prepare_cluster(inode, file_off, f.length, &clust, ++ PCL_APPEND); ++ if (result) ++ goto out; ++ ++ assert("edward-751", crc_inode_ok(inode)); ++ assert("edward-204", win.stat == DATA_WINDOW); ++ assert("edward-1288", clust.hint->ext_coord.valid); ++ assert("edward-752", ++ znode_is_write_locked(hint->ext_coord.coord.node)); ++ ++ put_hint_cluster(&clust, inode, ZNODE_WRITE_LOCK); ++ ++ /* set write position in page */ ++ page_off = off_to_pgoff(win.off); ++ ++ /* copy user's data to cluster pages */ ++ for (i = off_to_pg(win.off), src = f.data; ++ i < count_to_nrpages(win.off + win.count); ++ i++, src += page_count) { ++ page_count = ++ cnt_to_pgcnt(win.off + win.count, i) - page_off; ++ ++ assert("edward-1039", ++ page_off + page_count <= PAGE_CACHE_SIZE); ++ assert("edward-287", clust.pages[i] != NULL); ++ ++ lock_page(clust.pages[i]); ++ result = ++ __copy_from_user((char *)kmap(clust.pages[i]) + ++ page_off, (char __user *)src, page_count); ++ kunmap(clust.pages[i]); ++ if (unlikely(result)) { ++ unlock_page(clust.pages[i]); ++ result = -EFAULT; ++ goto err2; ++ } ++ SetPageUptodate(clust.pages[i]); ++ unlock_page(clust.pages[i]); ++ page_off = 0; ++ } ++ assert("edward-753", crc_inode_ok(inode)); ++ ++ set_cluster_pages_dirty(&clust); ++ ++ result = try_capture_cluster(&clust, inode); ++ if (result) ++ goto err2; ++ ++ assert("edward-998", 
f.user == 1); ++ ++ move_flow_forward(&f, win.count); ++ ++ /* disk cluster may be already clean at this point */ ++ ++ /* . update cluster ++ . set hint for new offset ++ . unlock znode ++ . update inode ++ . balance dirty pages ++ */ ++ result = balance_dirty_page_cluster(&clust, inode, 0, f.length); ++ if (result) ++ goto err1; ++ assert("edward-755", hint->lh.owner == NULL); ++ reset_cluster_params(&clust); ++ continue; ++ err2: ++ release_cluster_pages_and_jnode(&clust); ++ err1: ++ if (clust.reserved) ++ free_reserved4cluster(inode, ++ &clust, ++ estimate_update_cluster(inode)); ++ break; ++ } while (f.length); ++ out: ++ done_lh(&hint->lh); ++ if (result == -EEXIST) ++ warning("edward-1407", "write returns EEXIST!\n"); ++ ++ put_cluster_handle(&clust); ++ save_file_hint(file, hint); ++ kfree(hint); ++ if (buf) { ++ /* if nothing were written - there must be an error */ ++ assert("edward-195", ergo((to_write == f.length), result < 0)); ++ return (to_write - f.length) ? (to_write - f.length) : result; ++ } ++ return result; ++} ++ ++static ssize_t write_crc_file(struct file *file, /* file to write to */ ++ struct inode *inode, /* inode */ ++ const char __user *buf, /* address of user-space buffer */ ++ size_t count, /* number of bytes to write */ ++ loff_t * off /* position to write which */ ) ++{ ++ ++ int result; ++ loff_t pos; ++ ssize_t written; ++ cryptcompress_info_t *info = cryptcompress_inode_data(inode); ++ ++ assert("edward-196", crc_inode_ok(inode)); ++ ++ result = generic_write_checks(file, off, &count, 0); ++ if (unlikely(result != 0)) ++ return result; ++ ++ if (unlikely(count == 0)) ++ return 0; ++ ++ down_write(&info->lock); ++ LOCK_CNT_INC(inode_sem_w); ++ ++ pos = *off; ++ written = ++ write_cryptcompress_flow(file, inode, buf, count, pos); ++ ++ up_write(&info->lock); ++ LOCK_CNT_DEC(inode_sem_w); ++ ++ if (written < 0) { ++ if (written == -EEXIST) ++ printk("write_crc_file returns EEXIST!\n"); ++ return written; ++ } ++ /* update position in a file */ ++ *off = pos + written; ++ /* return number of written bytes */ ++ return written; ++} ++ ++/** ++ * write_cryptcompress - write of struct file_operations ++ * @file: file to write to ++ * @buf: address of user-space buffer ++ * @read_amount: number of bytes to write ++ * @off: position in file to write to ++ * ++ * This is implementation of vfs's write method of struct file_operations for ++ * cryptcompress plugin. 
++ */ ++ssize_t write_cryptcompress(struct file *file, const char __user *buf, ++ size_t count, loff_t *off) ++{ ++ ssize_t result; ++ struct inode *inode; ++ reiser4_context *ctx; ++ ++ inode = file->f_dentry->d_inode; ++ ++ ctx = init_context(inode->i_sb); ++ if (IS_ERR(ctx)) ++ return PTR_ERR(ctx); ++ ++ mutex_lock(&inode->i_mutex); ++ ++ result = write_crc_file(file, inode, buf, count, off); ++ ++ mutex_unlock(&inode->i_mutex); ++ ++ context_set_commit_async(ctx); ++ reiser4_exit_context(ctx); ++ return result; ++} ++ ++static void ++readpages_crc(struct address_space *mapping, struct list_head *pages, ++ void *data) ++{ ++ file_plugin *fplug; ++ item_plugin *iplug; ++ ++ assert("edward-1112", mapping != NULL); ++ assert("edward-1113", mapping->host != NULL); ++ ++ fplug = inode_file_plugin(mapping->host); ++ assert("edward-1114", fplug == file_plugin_by_id(CRC_FILE_PLUGIN_ID)); ++ iplug = item_plugin_by_id(CTAIL_ID); ++ ++ iplug->s.file.readpages(data, mapping, pages); ++ ++ return; ++} ++ ++static reiser4_block_nr cryptcompress_estimate_read(struct inode *inode) ++{ ++ /* reserve one block to update stat data item */ ++ assert("edward-1193", ++ inode_file_plugin(inode)->estimate.update == ++ estimate_update_common); ++ return estimate_update_common(inode); ++} ++ ++/** ++ * read_cryptcompress - read of struct file_operations ++ * @file: file to read from ++ * @buf: address of user-space buffer ++ * @read_amount: number of bytes to read ++ * @off: position in file to read from ++ * ++ * This is implementation of vfs's read method of struct file_operations for ++ * cryptcompress plugin. ++ */ ++ssize_t read_cryptcompress(struct file * file, char __user *buf, size_t size, ++ loff_t * off) ++{ ++ ssize_t result; ++ struct inode *inode; ++ reiser4_context *ctx; ++ reiser4_file_fsdata *fsdata; ++ cryptcompress_info_t *info; ++ reiser4_block_nr needed; ++ ++ inode = file->f_dentry->d_inode; ++ assert("edward-1194", !inode_get_flag(inode, REISER4_NO_SD)); ++ ++ ctx = init_context(inode->i_sb); ++ if (IS_ERR(ctx)) ++ return PTR_ERR(ctx); ++ ++ info = cryptcompress_inode_data(inode); ++ needed = cryptcompress_estimate_read(inode); ++ ++ /* FIXME-EDWARD: ++ Grab space for sd_update so find_cluster will be happy */ ++ result = reiser4_grab_space(needed, BA_CAN_COMMIT); ++ if (result != 0) { ++ reiser4_exit_context(ctx); ++ return result; ++ } ++ fsdata = reiser4_get_file_fsdata(file); ++ fsdata->ra2.data = file; ++ fsdata->ra2.readpages = readpages_crc; ++ ++ down_read(&info->lock); ++ LOCK_CNT_INC(inode_sem_r); ++ ++ result = generic_file_read(file, buf, size, off); ++ ++ up_read(&info->lock); ++ LOCK_CNT_DEC(inode_sem_r); ++ ++ context_set_commit_async(ctx); ++ reiser4_exit_context(ctx); ++ ++ return result; ++} ++ ++/* If @index > 0, find real disk cluster of the index (@index - 1), ++ If @index == 0 find the real disk cluster of the object of maximal index. ++ Keep incremented index of the result in @found. ++ It succes was returned: ++ (@index == 0 && @found == 0) means that the object doesn't have real disk ++ clusters. ++ (@index != 0 && @found == 0) means that disk cluster of (@index -1) doesn't ++ exist. 
++*/ ++static int ++find_real_disk_cluster(struct inode *inode, cloff_t * found, cloff_t index) ++{ ++ int result; ++ reiser4_key key; ++ loff_t offset; ++ hint_t *hint; ++ lock_handle *lh; ++ lookup_bias bias; ++ coord_t *coord; ++ item_plugin *iplug; ++ ++ assert("edward-1131", inode != NULL); ++ assert("edward-95", crc_inode_ok(inode)); ++ ++ hint = kmalloc(sizeof(*hint), GFP_KERNEL); ++ if (hint == NULL) ++ return RETERR(-ENOMEM); ++ hint_init_zero(hint); ++ lh = &hint->lh; ++ ++ bias = (index ? FIND_EXACT : FIND_MAX_NOT_MORE_THAN); ++ offset = ++ (index ? clust_to_off(index, inode) - ++ 1 : get_key_offset(max_key())); ++ ++ key_by_inode_cryptcompress(inode, offset, &key); ++ ++ /* find the last item of this object */ ++ result = ++ find_cluster_item(hint, &key, ZNODE_READ_LOCK, NULL /* ra_info */, ++ bias, 0); ++ if (cbk_errored(result)) { ++ done_lh(lh); ++ kfree(hint); ++ return result; ++ } ++ if (result == CBK_COORD_NOTFOUND) { ++ /* no real disk clusters */ ++ done_lh(lh); ++ kfree(hint); ++ *found = 0; ++ return 0; ++ } ++ /* disk cluster is found */ ++ coord = &hint->ext_coord.coord; ++ coord_clear_iplug(coord); ++ result = zload(coord->node); ++ if (unlikely(result)) { ++ done_lh(lh); ++ kfree(hint); ++ return result; ++ } ++ iplug = item_plugin_by_coord(coord); ++ assert("edward-277", iplug == item_plugin_by_id(CTAIL_ID)); ++ assert("edward-1202", ctail_ok(coord)); ++ ++ item_key_by_coord(coord, &key); ++ *found = off_to_clust(get_key_offset(&key), inode) + 1; ++ ++ assert("edward-1132", ergo(index, index == *found)); ++ ++ zrelse(coord->node); ++ done_lh(lh); ++ kfree(hint); ++ return 0; ++} ++ ++static int find_fake_appended(struct inode *inode, cloff_t * index) ++{ ++ return find_real_disk_cluster(inode, index, ++ 0 /* find last real one */ ); ++} ++ ++/* Set left coord when unit is not found after node_lookup() ++ This takes into account that there can be holes in a sequence ++ of disk clusters */ ++ ++static void adjust_left_coord(coord_t * left_coord) ++{ ++ switch (left_coord->between) { ++ case AFTER_UNIT: ++ left_coord->between = AFTER_ITEM; ++ case AFTER_ITEM: ++ case BEFORE_UNIT: ++ break; ++ default: ++ impossible("edward-1204", "bad left coord to cut"); ++ } ++ return; ++} ++ ++#define CRC_CUT_TREE_MIN_ITERATIONS 64 ++int ++cut_tree_worker_cryptcompress(tap_t * tap, const reiser4_key * from_key, ++ const reiser4_key * to_key, ++ reiser4_key * smallest_removed, ++ struct inode *object, int truncate, int *progress) ++{ ++ lock_handle next_node_lock; ++ coord_t left_coord; ++ int result; ++ ++ assert("edward-1158", tap->coord->node != NULL); ++ assert("edward-1159", znode_is_write_locked(tap->coord->node)); ++ assert("edward-1160", znode_get_level(tap->coord->node) == LEAF_LEVEL); ++ ++ *progress = 0; ++ init_lh(&next_node_lock); ++ ++ while (1) { ++ znode *node; /* node from which items are cut */ ++ node_plugin *nplug; /* node plugin for @node */ ++ ++ node = tap->coord->node; ++ ++ /* Move next_node_lock to the next node on the left. */ ++ result = ++ reiser4_get_left_neighbor(&next_node_lock, node, ++ ZNODE_WRITE_LOCK, ++ GN_CAN_USE_UPPER_LEVELS); ++ if (result != 0 && result != -E_NO_NEIGHBOR) ++ break; ++ /* FIXME-EDWARD: Check can we delete the node as a whole. 
*/ ++ result = tap_load(tap); ++ if (result) ++ return result; ++ ++ /* Prepare the second (right) point for cut_node() */ ++ if (*progress) ++ coord_init_last_unit(tap->coord, node); ++ ++ else if (item_plugin_by_coord(tap->coord)->b.lookup == NULL) ++ /* set rightmost unit for the items without lookup method */ ++ tap->coord->unit_pos = coord_last_unit_pos(tap->coord); ++ ++ nplug = node->nplug; ++ ++ assert("edward-1161", nplug); ++ assert("edward-1162", nplug->lookup); ++ ++ /* left_coord is leftmost unit cut from @node */ ++ result = nplug->lookup(node, from_key, FIND_EXACT, &left_coord); ++ ++ if (IS_CBKERR(result)) ++ break; ++ ++ if (result == CBK_COORD_NOTFOUND) ++ adjust_left_coord(&left_coord); ++ ++ /* adjust coordinates so that they are set to existing units */ ++ if (coord_set_to_right(&left_coord) ++ || coord_set_to_left(tap->coord)) { ++ result = 0; ++ break; ++ } ++ ++ if (coord_compare(&left_coord, tap->coord) == ++ COORD_CMP_ON_RIGHT) { ++ /* keys from @from_key to @to_key are not in the tree */ ++ result = 0; ++ break; ++ } ++ ++ /* cut data from one node */ ++ *smallest_removed = *min_key(); ++ result = kill_node_content(&left_coord, ++ tap->coord, ++ from_key, ++ to_key, ++ smallest_removed, ++ next_node_lock.node, ++ object, truncate); ++#if REISER4_DEBUG ++ /*node_check(node, ~0U); */ ++#endif ++ tap_relse(tap); ++ ++ if (result) ++ break; ++ ++ ++(*progress); ++ ++ /* Check whether all items with keys >= from_key were removed ++ * from the tree. */ ++ if (keyle(smallest_removed, from_key)) ++ /* result = 0; */ ++ break; ++ ++ if (next_node_lock.node == NULL) ++ break; ++ ++ result = tap_move(tap, &next_node_lock); ++ done_lh(&next_node_lock); ++ if (result) ++ break; ++ ++ /* Break long cut_tree operation (deletion of a large file) if ++ * atom requires commit. */ ++ if (*progress > CRC_CUT_TREE_MIN_ITERATIONS ++ && current_atom_should_commit()) { ++ result = -E_REPEAT; ++ break; ++ } ++ } ++ done_lh(&next_node_lock); ++ return result; ++} ++ ++/* Append or expand hole in two steps (exclusive access should be aquired!) 
++ 1) write zeroes to the current real cluster, ++ 2) expand hole via fake clusters (just increase i_size) */ ++static int ++cryptcompress_append_hole(struct inode *inode /*contains old i_size */ , ++ loff_t new_size) ++{ ++ int result = 0; ++ hint_t *hint; ++ lock_handle *lh; ++ loff_t hole_size; ++ int nr_zeroes; ++ reiser4_slide_t win; ++ reiser4_cluster_t clust; ++ ++ assert("edward-1133", inode->i_size < new_size); ++ assert("edward-1134", schedulable()); ++ assert("edward-1135", crc_inode_ok(inode)); ++ assert("edward-1136", current_blocksize == PAGE_CACHE_SIZE); ++ assert("edward-1333", off_to_cloff(inode->i_size, inode) != 0); ++ ++ hint = kmalloc(sizeof(*hint), GFP_KERNEL); ++ if (hint == NULL) ++ return RETERR(-ENOMEM); ++ hint_init_zero(hint); ++ lh = &hint->lh; ++ ++ reiser4_slide_init(&win); ++ cluster_init_read(&clust, &win); ++ clust.hint = hint; ++ ++ result = alloc_cluster_pgset(&clust, cluster_nrpages(inode)); ++ if (result) ++ goto out; ++ if (off_to_cloff(inode->i_size, inode) == 0) ++ goto fake_append; ++ hole_size = new_size - inode->i_size; ++ nr_zeroes = ++ inode_cluster_size(inode) - off_to_cloff(inode->i_size, inode); ++ if (hole_size < nr_zeroes) ++ nr_zeroes = hole_size; ++ set_window(&clust, &win, inode, inode->i_size, ++ inode->i_size + nr_zeroes); ++ win.stat = HOLE_WINDOW; ++ ++ assert("edward-1137", ++ clust.index == off_to_clust(inode->i_size, inode)); ++ ++ result = prepare_cluster(inode, 0, 0, &clust, PCL_APPEND); ++ ++ assert("edward-1271", !result || result == -ENOSPC); ++ if (result) ++ goto out; ++ assert("edward-1139", ++ clust.dstat == PREP_DISK_CLUSTER || ++ clust.dstat == UNPR_DISK_CLUSTER); ++ ++ assert("edward-1431", hole_size >= nr_zeroes); ++ if (hole_size == nr_zeroes) ++ /* nothing to append anymore */ ++ goto out; ++ fake_append: ++ INODE_SET_FIELD(inode, i_size, new_size); ++ out: ++ done_lh(lh); ++ kfree(hint); ++ put_cluster_handle(&clust); ++ return result; ++} ++ ++#if REISER4_DEBUG ++static int ++pages_truncate_ok(struct inode *inode, loff_t old_size, pgoff_t start) ++{ ++ struct pagevec pvec; ++ int i; ++ int count; ++ int rest; ++ ++ rest = count_to_nrpages(old_size) - start; ++ ++ pagevec_init(&pvec, 0); ++ count = min_count(pagevec_space(&pvec), rest); ++ ++ while (rest) { ++ count = min_count(pagevec_space(&pvec), rest); ++ pvec.nr = find_get_pages(inode->i_mapping, start, ++ count, pvec.pages); ++ for (i = 0; i < pagevec_count(&pvec); i++) { ++ if (PageUptodate(pvec.pages[i])) { ++ warning("edward-1205", ++ "truncated page of index %lu is uptodate", ++ pvec.pages[i]->index); ++ return 0; ++ } ++ } ++ start += count; ++ rest -= count; ++ pagevec_release(&pvec); ++ } ++ return 1; ++} ++ ++static int body_truncate_ok(struct inode *inode, cloff_t aidx) ++{ ++ int result; ++ cloff_t raidx; ++ ++ result = find_fake_appended(inode, &raidx); ++ return !result && (aidx == raidx); ++} ++#endif ++ ++static int ++update_cryptcompress_size(struct inode *inode, reiser4_key * key, int update_sd) ++{ ++ return (get_key_offset(key) & ((loff_t) (inode_cluster_size(inode)) - 1) ++ ? 0 : update_file_size(inode, key, update_sd)); ++} ++ ++/* prune cryptcompress file in two steps (exclusive access should be acquired!) 
++ 1) cut all disk clusters but the last one partially truncated, ++ 2) set zeroes and capture last partially truncated page cluster if the last ++ one exists, otherwise truncate via prune fake cluster (just decrease i_size) ++*/ ++static int ++prune_cryptcompress(struct inode *inode, loff_t new_size, int update_sd, ++ cloff_t aidx) ++{ ++ int result = 0; ++ unsigned nr_zeroes; ++ loff_t to_prune; ++ loff_t old_size; ++ cloff_t ridx; ++ ++ hint_t *hint; ++ lock_handle *lh; ++ reiser4_slide_t win; ++ reiser4_cluster_t clust; ++ ++ assert("edward-1140", inode->i_size >= new_size); ++ assert("edward-1141", schedulable()); ++ assert("edward-1142", crc_inode_ok(inode)); ++ assert("edward-1143", current_blocksize == PAGE_CACHE_SIZE); ++ ++ hint = kmalloc(sizeof(*hint), GFP_KERNEL); ++ if (hint == NULL) ++ return RETERR(-ENOMEM); ++ hint_init_zero(hint); ++ lh = &hint->lh; ++ ++ reiser4_slide_init(&win); ++ cluster_init_read(&clust, &win); ++ clust.hint = hint; ++ ++ /* rightmost completely truncated cluster */ ++ ridx = count_to_nrclust(new_size, inode); ++ ++ assert("edward-1174", ridx <= aidx); ++ old_size = inode->i_size; ++ if (ridx != aidx) { ++ result = cut_file_items(inode, ++ clust_to_off(ridx, inode), ++ update_sd, ++ clust_to_off(aidx, inode), ++ update_cryptcompress_size); ++ if (result) ++ goto out; ++ } ++ if (!off_to_cloff(new_size, inode)) { ++ /* no partially truncated clusters */ ++ assert("edward-1145", inode->i_size == new_size); ++ goto finish; ++ } ++ assert("edward-1146", new_size < inode->i_size); ++ ++ to_prune = inode->i_size - new_size; ++ ++ /* partial truncate of leftmost cluster, ++ first check if it is fake */ ++ result = find_real_disk_cluster(inode, &aidx, ridx); ++ if (result) ++ goto out; ++ if (!aidx) ++ /* yup, this is fake one */ ++ goto finish; ++ ++ assert("edward-1148", aidx == ridx); ++ ++ /* do partial truncate of the leftmost page cluster, ++ then try to capture this one */ ++ result = alloc_cluster_pgset(&clust, cluster_nrpages(inode)); ++ if (result) ++ goto out; ++ nr_zeroes = (off_to_pgoff(new_size) ? ++ PAGE_CACHE_SIZE - off_to_pgoff(new_size) : 0); ++ set_window(&clust, &win, inode, new_size, new_size + nr_zeroes); ++ win.stat = HOLE_WINDOW; ++ ++ assert("edward-1149", clust.index == ridx - 1); ++ ++ result = prepare_cluster(inode, 0, 0, &clust, PCL_TRUNCATE); ++ if (result) ++ goto out; ++ assert("edward-1151", ++ clust.dstat == PREP_DISK_CLUSTER || ++ clust.dstat == UNPR_DISK_CLUSTER); ++ ++ assert("edward-1191", inode->i_size == new_size); ++ assert("edward-1206", body_truncate_ok(inode, ridx)); ++ finish: ++ /* drop all the pages that don't have jnodes (i.e. 
pages ++ which can not be truncated by cut_file_items() because ++ of holes represented by fake disk clusters) including ++ the pages of partially truncated cluster which was ++ released by prepare_cluster() */ ++ truncate_inode_pages(inode->i_mapping, new_size); ++ INODE_SET_FIELD(inode, i_size, new_size); ++ out: ++ assert("edward-1334", !result || result == -ENOSPC); ++ assert("edward-1209", ++ pages_truncate_ok(inode, old_size, count_to_nrpages(new_size))); ++ done_lh(lh); ++ kfree(hint); ++ put_cluster_handle(&clust); ++ return result; ++} ++ ++/* Prepare cryptcompress file for truncate: ++ prune or append rightmost fake logical clusters (if any) ++*/ ++static int ++start_truncate_fake(struct inode *inode, cloff_t aidx, loff_t new_size, ++ int update_sd) ++{ ++ int result = 0; ++ int bytes; ++ ++ if (new_size > inode->i_size) { ++ /* append */ ++ if (inode->i_size < clust_to_off(aidx, inode)) ++ /* no fake bytes */ ++ return 0; ++ bytes = new_size - inode->i_size; ++ INODE_SET_FIELD(inode, i_size, inode->i_size + bytes); ++ } else { ++ /* prune */ ++ if (inode->i_size <= clust_to_off(aidx, inode)) ++ /* no fake bytes */ ++ return 0; ++ bytes = ++ inode->i_size - max_count(new_size, ++ clust_to_off(aidx, inode)); ++ if (!bytes) ++ return 0; ++ INODE_SET_FIELD(inode, i_size, inode->i_size - bytes); ++ /* In the case of fake prune we need to drop page cluster. ++ There are only 2 cases for partially truncated page: ++ 1. If is is dirty, therefore it is anonymous ++ (was dirtied via mmap), and will be captured ++ later via ->capture(). ++ 2. If is clean, therefore it is filled by zeroes. ++ In both cases we don't need to make it dirty and ++ capture here. ++ */ ++ truncate_inode_pages(inode->i_mapping, inode->i_size); ++ } ++ if (update_sd) ++ result = update_sd_cryptcompress(inode); ++ return result; ++} ++ ++/* This is called in setattr_cryptcompress when it is used to truncate, ++ and in delete_cryptcompress */ ++static int cryptcompress_truncate(struct inode *inode, /* old size */ ++ loff_t new_size, /* new size */ ++ int update_sd) ++{ ++ int result; ++ cloff_t aidx; ++ ++ result = find_fake_appended(inode, &aidx); ++ if (result) ++ return result; ++ assert("edward-1208", ++ ergo(aidx > 0, inode->i_size > clust_to_off(aidx - 1, inode))); ++ ++ result = start_truncate_fake(inode, aidx, new_size, update_sd); ++ if (result) ++ return result; ++ if (inode->i_size == new_size) ++ /* nothing to truncate anymore */ ++ return 0; ++ return (inode->i_size < new_size ? ++ cryptcompress_append_hole(inode, new_size) : ++ prune_cryptcompress(inode, new_size, update_sd, aidx)); ++} ++ ++static void clear_moved_tag_cluster(struct address_space * mapping, ++ reiser4_cluster_t * clust) ++{ ++ int i; ++ void * ret; ++ read_lock_irq(&mapping->tree_lock); ++ for (i = 0; i < clust->nr_pages; i++) { ++ assert("edward-1438", clust->pages[i] != NULL); ++ ret = radix_tree_tag_clear(&mapping->page_tree, ++ clust->pages[i]->index, ++ PAGECACHE_TAG_REISER4_MOVED); ++ assert("edward-1439", ret == clust->pages[i]); ++ } ++ read_unlock_irq(&mapping->tree_lock); ++} ++ ++/* Capture an anonymous pager cluster. 
(Page cluser is ++ anonymous if it contains at least one anonymous page */ ++static int ++capture_page_cluster(reiser4_cluster_t * clust, struct inode *inode) ++{ ++ int result; ++ ++ assert("edward-1073", clust != NULL); ++ assert("edward-1074", inode != NULL); ++ assert("edward-1075", clust->dstat == INVAL_DISK_CLUSTER); ++ ++ result = prepare_cluster(inode, 0, 0, clust, PCL_APPEND); ++ if (result) ++ return result; ++ set_cluster_pages_dirty(clust); ++ clear_moved_tag_cluster(inode->i_mapping, clust); ++ ++ result = try_capture_cluster(clust, inode); ++ put_hint_cluster(clust, inode, ZNODE_WRITE_LOCK); ++ if (unlikely(result)) { ++ /* set cleared tag back, so it will be ++ possible to capture it again later */ ++ read_lock_irq(&inode->i_mapping->tree_lock); ++ radix_tree_tag_set(&inode->i_mapping->page_tree, ++ clust_to_pg(clust->index, inode), ++ PAGECACHE_TAG_REISER4_MOVED); ++ read_unlock_irq(&inode->i_mapping->tree_lock); ++ ++ release_cluster_pages_and_jnode(clust); ++ } ++ return result; ++} ++ ++#define MAX_CLUSTERS_TO_CAPTURE(inode) (1024 >> cluster_nrpages_shift(inode)) ++ ++/* read lock should be acquired */ ++static int ++capture_anonymous_clusters(struct address_space *mapping, pgoff_t * index, ++ int to_capture) ++{ ++ int result = 0; ++ int found; ++ int progress = 0; ++ struct page *page = NULL; ++ hint_t *hint; ++ lock_handle *lh; ++ reiser4_cluster_t clust; ++ ++ assert("edward-1127", mapping != NULL); ++ assert("edward-1128", mapping->host != NULL); ++ assert("edward-1440", mapping->host->i_mapping == mapping); ++ ++ hint = kmalloc(sizeof(*hint), GFP_KERNEL); ++ if (hint == NULL) ++ return RETERR(-ENOMEM); ++ hint_init_zero(hint); ++ lh = &hint->lh; ++ ++ cluster_init_read(&clust, NULL); ++ clust.hint = hint; ++ ++ result = alloc_cluster_pgset(&clust, cluster_nrpages(mapping->host)); ++ if (result) ++ goto out; ++ ++ while (to_capture > 0) { ++ found = ++ find_get_pages_tag(mapping, index, ++ PAGECACHE_TAG_REISER4_MOVED, 1, &page); ++ if (!found) { ++ *index = (pgoff_t) - 1; ++ break; ++ } ++ assert("edward-1109", page != NULL); ++ ++ move_cluster_forward(&clust, mapping->host, page->index, ++ &progress); ++ result = capture_page_cluster(&clust, mapping->host); ++ page_cache_release(page); ++ if (result) ++ break; ++ to_capture--; ++ } ++ if (result) { ++ warning("edward-1077", ++ "Cannot capture anon pages: result=%i (captured=%d)\n", ++ result, ++ ((__u32) MAX_CLUSTERS_TO_CAPTURE(mapping->host)) - ++ to_capture); ++ } else { ++ /* something had to be found */ ++ assert("edward-1078", ++ to_capture <= MAX_CLUSTERS_TO_CAPTURE(mapping->host)); ++ if (to_capture <= 0) ++ /* there may be left more pages */ ++ __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); ++ } ++ out: ++ done_lh(lh); ++ kfree(hint); ++ put_cluster_handle(&clust); ++ return result; ++} ++ ++/* Check mapping for existence of not captured dirty pages. 
++ This returns !0 if either page tree contains pages tagged ++ PAGECACHE_TAG_REISER4_MOVED */ ++static int crc_inode_has_anon_pages(struct inode *inode) ++{ ++ return mapping_tagged(inode->i_mapping, PAGECACHE_TAG_REISER4_MOVED); ++} ++ ++/* this is implementation of vfs's writepages method of struct ++ address_space_operations */ ++int ++writepages_cryptcompress(struct address_space *mapping, ++ struct writeback_control *wbc) ++{ ++ int result; ++ int to_capture; ++ pgoff_t nrpages; ++ pgoff_t index = 0; ++ cryptcompress_info_t *info; ++ struct inode *inode; ++ ++ inode = mapping->host; ++ if (!crc_inode_has_anon_pages(inode)) { ++ result = 0; ++ goto end; ++ } ++ ++ info = cryptcompress_inode_data(inode); ++ nrpages = count_to_nrpages(i_size_read(inode)); ++ ++ if (wbc->sync_mode != WB_SYNC_ALL) ++ to_capture = ++ min_count(wbc->nr_to_write, MAX_CLUSTERS_TO_CAPTURE(inode)); ++ else ++ to_capture = MAX_CLUSTERS_TO_CAPTURE(inode); ++ do { ++ reiser4_context *ctx; ++ ++ if (is_in_reiser4_context()) { ++ /* FIXME-EDWARD: REMOVEME */ ++ all_grabbed2free(); ++ ++ /* It can be in the context of write system call from ++ balance_dirty_pages() */ ++ if (down_read_trylock(&info->lock) == 0) { ++ result = RETERR(-EBUSY); ++ break; ++ } ++ } else ++ down_read(&info->lock); ++ ++ ctx = init_context(inode->i_sb); ++ if (IS_ERR(ctx)) { ++ result = PTR_ERR(ctx); ++ break; ++ } ++ ctx->nobalance = 1; ++ ++ assert("edward-1079", ++ lock_stack_isclean(get_current_lock_stack())); ++ ++ LOCK_CNT_INC(inode_sem_r); ++ ++ result = ++ capture_anonymous_clusters(inode->i_mapping, &index, ++ to_capture); ++ ++ up_read(&info->lock); ++ ++ LOCK_CNT_DEC(inode_sem_r); ++ ++ if (result != 0 || wbc->sync_mode != WB_SYNC_ALL) { ++ reiser4_exit_context(ctx); ++ break; ++ } ++ result = txnmgr_force_commit_all(inode->i_sb, 0); ++ reiser4_exit_context(ctx); ++ } while (result == 0 && index < nrpages); ++ ++ end: ++ if (is_in_reiser4_context()) { ++ if (get_current_context()->nr_captured >= CAPTURE_APAGE_BURST) { ++ /* there are already pages to flush, flush them out, do ++ not delay until end of reiser4_sync_inodes */ ++ writeout(inode->i_sb, wbc); ++ get_current_context()->nr_captured = 0; ++ } ++ } ++ return result; ++} ++ ++/* plugin->u.file.mmap */ ++int mmap_cryptcompress(struct file *file, struct vm_area_struct *vma) ++{ ++ //return -ENOSYS; ++ return generic_file_mmap(file, vma); ++} ++ ++/* plugin->u.file.release */ ++/* plugin->u.file.get_block */ ++ ++/* this is implementation of delete method of file plugin for ++ cryptcompress objects */ ++int delete_cryptcompress(struct inode *inode) ++{ ++ int result; ++ ++ assert("edward-429", inode->i_nlink == 0); ++ ++ if (inode->i_size) { ++ result = cryptcompress_truncate(inode, 0, 0); ++ if (result) { ++ warning("edward-430", ++ "cannot truncate cryptcompress file %lli: %i", ++ (unsigned long long)get_inode_oid(inode), ++ result); ++ return result; ++ } ++ } ++ /* and remove stat data */ ++ return delete_object_common(inode); ++} ++ ++/* plugin->u.file.setattr method ++ see plugin.h for description */ ++int setattr_cryptcompress(struct dentry *dentry, /* Object to change attributes */ ++ struct iattr *attr /* change description */ ) ++{ ++ int result; ++ struct inode *inode; ++ ++ inode = dentry->d_inode; ++ result = check_cryptcompress(inode); ++ if (result) ++ return result; ++ if (attr->ia_valid & ATTR_SIZE) { ++ /* EDWARD-FIXME-HANS: VS-FIXME-HANS: ++ Q: this case occurs when? truncate? ++ A: yes ++ ++ Q: If so, why isn't this code in truncate itself instead of here? 
++ ++ A: because vfs calls fs's truncate after it has called truncate_inode_pages to get rid of pages ++ corresponding to part of file being truncated. In reiser4 it may cause existence of unallocated ++ extents which do not have jnodes. Flush code does not expect that. Solution of this problem is ++ straightforward. As vfs's truncate is implemented using setattr operation (common implementaion of ++ which calls truncate_inode_pages and fs's truncate in case when size of file changes) - it seems ++ reasonable to have reiser4_setattr which will take care of removing pages, jnodes and extents ++ simultaneously in case of truncate. ++ Q: do you think implementing truncate using setattr is ugly, ++ and vfs needs improving, or is there some sense in which this is a good design? ++ ++ A: VS-FIXME-HANS: ++ */ ++ ++ /* truncate does reservation itself and requires exclusive access obtained */ ++ if (inode->i_size != attr->ia_size) { ++ reiser4_context *ctx; ++ loff_t old_size; ++ cryptcompress_info_t *info = ++ cryptcompress_inode_data(inode); ++ ++ ctx = init_context(dentry->d_inode->i_sb); ++ if (IS_ERR(ctx)) ++ return PTR_ERR(ctx); ++ ++ down_write(&info->lock); ++ LOCK_CNT_INC(inode_sem_w); ++ ++ inode_check_scale(inode, inode->i_size, attr->ia_size); ++ ++ old_size = inode->i_size; ++ ++ result = ++ cryptcompress_truncate(inode, attr->ia_size, ++ 1 /* update stat data */ ); ++ if (result) { ++ warning("edward-1192", ++ "truncate_cryptcompress failed: oid %lli, " ++ "old size %lld, new size %lld, retval %d", ++ (unsigned long long) ++ get_inode_oid(inode), old_size, ++ attr->ia_size, result); ++ } ++ up_write(&info->lock); ++ LOCK_CNT_DEC(inode_sem_w); ++ context_set_commit_async(ctx); ++ reiser4_exit_context(ctx); ++ } else ++ result = 0; ++ } else ++ result = setattr_common(dentry, attr); ++ return result; ++} ++ ++/* sendfile_cryptcompress - sendfile of struct file_operations */ ++ssize_t ++sendfile_cryptcompress(struct file *file, loff_t *ppos, size_t count, ++ read_actor_t actor, void *target) ++{ ++ reiser4_context *ctx; ++ ssize_t result; ++ struct inode *inode; ++ cryptcompress_info_t *info; ++ ++ inode = file->f_dentry->d_inode; ++ ctx = init_context(inode->i_sb); ++ if (IS_ERR(ctx)) ++ return PTR_ERR(ctx); ++ /* ++ * generic_file_sndfile may want to call update_atime. 
Grab space for ++ * stat data update ++ */ ++ result = reiser4_grab_space(estimate_update_common(inode), ++ BA_CAN_COMMIT); ++ if (result) ++ goto exit; ++ info = cryptcompress_inode_data(inode); ++ down_read(&info->lock); ++ result = generic_file_sendfile(file, ppos, count, actor, target); ++ up_read(&info->lock); ++ exit: ++ reiser4_exit_context(ctx); ++ return result; ++} ++ ++/* ++ * release_cryptcompress - release of struct file_operations ++ * @inode: inode of released file ++ * @file: file to release ++ */ ++int release_cryptcompress(struct inode *inode, struct file *file) ++{ ++ reiser4_context *ctx = init_context(inode->i_sb); ++ ++ if (IS_ERR(ctx)) ++ return PTR_ERR(ctx); ++ reiser4_free_file_fsdata(file); ++ reiser4_exit_context(ctx); ++ return 0; ++} ++ ++static int ++save_len_cryptcompress_plugin(struct inode *inode, reiser4_plugin * plugin) ++{ ++ assert("edward-457", inode != NULL); ++ assert("edward-458", plugin != NULL); ++ assert("edward-459", plugin->h.id == CRC_FILE_PLUGIN_ID); ++ return 0; ++} ++ ++static int ++load_cryptcompress_plugin(struct inode *inode, reiser4_plugin * plugin, ++ char **area, int *len) ++{ ++ assert("edward-455", inode != NULL); ++ assert("edward-456", (reiser4_inode_data(inode)->pset != NULL)); ++ ++ plugin_set_file(&reiser4_inode_data(inode)->pset, ++ file_plugin_by_id(CRC_FILE_PLUGIN_ID)); ++ return 0; ++} ++ ++static int change_cryptcompress(struct inode *inode, reiser4_plugin * plugin) ++{ ++ /* cannot change object plugin of already existing object */ ++ return RETERR(-EINVAL); ++} ++ ++struct reiser4_plugin_ops cryptcompress_plugin_ops = { ++ .load = load_cryptcompress_plugin, ++ .save_len = save_len_cryptcompress_plugin, ++ .save = NULL, ++ .alignment = 8, ++ .change = change_cryptcompress ++}; ++ ++/* ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 80 ++ scroll-step: 1 ++ End: ++*/ +Index: linux-2.6.16/fs/reiser4/plugin/file/cryptcompress.h +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/plugin/file/cryptcompress.h +@@ -0,0 +1,551 @@ ++/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++/* See http://www.namesys.com/cryptcompress_design.html */ ++ ++#if !defined( __FS_REISER4_CRYPTCOMPRESS_H__ ) ++#define __FS_REISER4_CRYPTCOMPRESS_H__ ++ ++#include "../compress/compress.h" ++#include "../crypto/cipher.h" ++ ++#include ++#include ++ ++#define MIN_CLUSTER_SIZE PAGE_CACHE_SIZE ++#define MIN_CLUSTER_SHIFT PAGE_CACHE_SHIFT ++#define MAX_CLUSTER_SHIFT 16 ++#define MAX_CLUSTER_NRPAGES (1U << MAX_CLUSTER_SHIFT >> PAGE_CACHE_SHIFT) ++#define DC_CHECKSUM_SIZE 4 ++ ++static inline loff_t min_count(loff_t a, loff_t b) ++{ ++ return (a < b ? a : b); ++} ++ ++static inline loff_t max_count(loff_t a, loff_t b) ++{ ++ return (a > b ? 
a : b); ++} ++ ++#if REISER4_DEBUG ++static inline int cluster_shift_ok(int shift) ++{ ++ return (shift >= MIN_CLUSTER_SHIFT) && (shift <= MAX_CLUSTER_SHIFT); ++} ++#endif ++ ++typedef struct tfm_stream { ++ __u8 *data; ++ size_t size; ++} tfm_stream_t; ++ ++typedef enum { ++ INPUT_STREAM, ++ OUTPUT_STREAM, ++ LAST_STREAM ++} tfm_stream_id; ++ ++typedef tfm_stream_t *tfm_unit[LAST_STREAM]; ++ ++static inline __u8 *ts_data(tfm_stream_t * stm) ++{ ++ assert("edward-928", stm != NULL); ++ return stm->data; ++} ++ ++static inline size_t ts_size(tfm_stream_t * stm) ++{ ++ assert("edward-929", stm != NULL); ++ return stm->size; ++} ++ ++static inline void set_ts_size(tfm_stream_t * stm, size_t size) ++{ ++ assert("edward-930", stm != NULL); ++ ++ stm->size = size; ++} ++ ++static inline int alloc_ts(tfm_stream_t ** stm) ++{ ++ assert("edward-931", stm); ++ assert("edward-932", *stm == NULL); ++ ++ *stm = kmalloc(sizeof **stm, GFP_KERNEL); ++ if (*stm == NULL) ++ return -ENOMEM; ++ memset(*stm, 0, sizeof **stm); ++ return 0; ++} ++ ++static inline void free_ts(tfm_stream_t * stm) ++{ ++ assert("edward-933", !ts_data(stm)); ++ assert("edward-934", !ts_size(stm)); ++ ++ kfree(stm); ++} ++ ++static inline int alloc_ts_data(tfm_stream_t * stm, size_t size) ++{ ++ assert("edward-935", !ts_data(stm)); ++ assert("edward-936", !ts_size(stm)); ++ assert("edward-937", size != 0); ++ ++ stm->data = vmalloc(size); ++ if (!stm->data) ++ return -ENOMEM; ++ set_ts_size(stm, size); ++ return 0; ++} ++ ++static inline void free_ts_data(tfm_stream_t * stm) ++{ ++ assert("edward-938", equi(ts_data(stm), ts_size(stm))); ++ ++ if (ts_data(stm)) ++ vfree(ts_data(stm)); ++ memset(stm, 0, sizeof *stm); ++} ++ ++/* Write modes for item conversion in flush convert phase */ ++typedef enum { ++ CRC_APPEND_ITEM = 1, ++ CRC_OVERWRITE_ITEM = 2, ++ CRC_CUT_ITEM = 3 ++} crc_write_mode_t; ++ ++typedef enum { ++ PCL_UNKNOWN = 0, /* invalid option */ ++ PCL_APPEND = 1, /* append and/or overwrite */ ++ PCL_TRUNCATE = 2 /* truncate */ ++} page_cluster_op; ++ ++/* Reiser4 file write/read transforms page cluster into disk cluster (and back) ++ using crypto/compression transforms implemented by reiser4 transform plugins. ++ Before each transform we allocate a pair of streams (tfm_unit) and assemble ++ page cluster into the input one. After transform we split output stream into ++ a set of items (disk cluster). ++*/ ++typedef struct tfm_cluster { ++ coa_set coa; ++ tfm_unit tun; ++ tfm_action act; ++ int uptodate; ++ int lsize; /* size of the logical cluster */ ++ int len; /* length of the transform stream */ ++} tfm_cluster_t; ++ ++static inline coa_t get_coa(tfm_cluster_t * tc, reiser4_compression_id id, tfm_action act) ++{ ++ return tc->coa[id][act]; ++} ++ ++static inline void ++set_coa(tfm_cluster_t * tc, reiser4_compression_id id, tfm_action act, coa_t coa) ++{ ++ tc->coa[id][act] = coa; ++} ++ ++static inline int ++alloc_coa(tfm_cluster_t * tc, compression_plugin * cplug) ++{ ++ coa_t coa; ++ ++ coa = cplug->alloc(tc->act); ++ if (IS_ERR(coa)) ++ return PTR_ERR(coa); ++ set_coa(tc, cplug->h.id, tc->act, coa); ++ return 0; ++} ++ ++static inline int ++grab_coa(tfm_cluster_t * tc, compression_plugin * cplug) ++{ ++ return (cplug->alloc && !get_coa(tc, cplug->h.id, tc->act) ? 
++ alloc_coa(tc, cplug) : 0); ++} ++ ++static inline void free_coa_set(tfm_cluster_t * tc) ++{ ++ tfm_action j; ++ reiser4_compression_id i; ++ compression_plugin *cplug; ++ ++ assert("edward-810", tc != NULL); ++ ++ for (j = 0; j < LAST_TFM; j++) ++ for (i = 0; i < LAST_COMPRESSION_ID; i++) { ++ if (!get_coa(tc, i, j)) ++ continue; ++ cplug = compression_plugin_by_id(i); ++ assert("edward-812", cplug->free != NULL); ++ cplug->free(get_coa(tc, i, j), j); ++ set_coa(tc, i, j, 0); ++ } ++ return; ++} ++ ++static inline tfm_stream_t *tfm_stream(tfm_cluster_t * tc, tfm_stream_id id) ++{ ++ return tc->tun[id]; ++} ++ ++static inline void ++set_tfm_stream(tfm_cluster_t * tc, tfm_stream_id id, tfm_stream_t * ts) ++{ ++ tc->tun[id] = ts; ++} ++ ++static inline __u8 *tfm_stream_data(tfm_cluster_t * tc, tfm_stream_id id) ++{ ++ return ts_data(tfm_stream(tc, id)); ++} ++ ++static inline void ++set_tfm_stream_data(tfm_cluster_t * tc, tfm_stream_id id, __u8 * data) ++{ ++ tfm_stream(tc, id)->data = data; ++} ++ ++static inline size_t tfm_stream_size(tfm_cluster_t * tc, tfm_stream_id id) ++{ ++ return ts_size(tfm_stream(tc, id)); ++} ++ ++static inline void ++set_tfm_stream_size(tfm_cluster_t * tc, tfm_stream_id id, size_t size) ++{ ++ tfm_stream(tc, id)->size = size; ++} ++ ++static inline int ++alloc_tfm_stream(tfm_cluster_t * tc, size_t size, tfm_stream_id id) ++{ ++ assert("edward-939", tc != NULL); ++ assert("edward-940", !tfm_stream(tc, id)); ++ ++ tc->tun[id] = kmalloc(sizeof(tfm_stream_t), GFP_KERNEL); ++ if (!tc->tun[id]) ++ return -ENOMEM; ++ memset(tfm_stream(tc, id), 0, sizeof(tfm_stream_t)); ++ return alloc_ts_data(tfm_stream(tc, id), size); ++} ++ ++static inline int ++realloc_tfm_stream(tfm_cluster_t * tc, size_t size, tfm_stream_id id) ++{ ++ assert("edward-941", tfm_stream_size(tc, id) < size); ++ free_ts_data(tfm_stream(tc, id)); ++ return alloc_ts_data(tfm_stream(tc, id), size); ++} ++ ++static inline void free_tfm_stream(tfm_cluster_t * tc, tfm_stream_id id) ++{ ++ free_ts_data(tfm_stream(tc, id)); ++ free_ts(tfm_stream(tc, id)); ++ set_tfm_stream(tc, id, 0); ++} ++ ++static inline unsigned coa_overrun(compression_plugin * cplug, int ilen) ++{ ++ return (cplug->overrun != NULL ? 
cplug->overrun(ilen) : 0); ++} ++ ++static inline void free_tfm_unit(tfm_cluster_t * tc) ++{ ++ tfm_stream_id id; ++ for (id = 0; id < LAST_STREAM; id++) { ++ if (!tfm_stream(tc, id)) ++ continue; ++ free_tfm_stream(tc, id); ++ } ++} ++ ++static inline void put_tfm_cluster(tfm_cluster_t * tc) ++{ ++ assert("edward-942", tc != NULL); ++ free_coa_set(tc); ++ free_tfm_unit(tc); ++} ++ ++static inline int tfm_cluster_is_uptodate(tfm_cluster_t * tc) ++{ ++ assert("edward-943", tc != NULL); ++ assert("edward-944", tc->uptodate == 0 || tc->uptodate == 1); ++ return (tc->uptodate == 1); ++} ++ ++static inline void tfm_cluster_set_uptodate(tfm_cluster_t * tc) ++{ ++ assert("edward-945", tc != NULL); ++ assert("edward-946", tc->uptodate == 0 || tc->uptodate == 1); ++ tc->uptodate = 1; ++ return; ++} ++ ++static inline void tfm_cluster_clr_uptodate(tfm_cluster_t * tc) ++{ ++ assert("edward-947", tc != NULL); ++ assert("edward-948", tc->uptodate == 0 || tc->uptodate == 1); ++ tc->uptodate = 0; ++ return; ++} ++ ++static inline int tfm_stream_is_set(tfm_cluster_t * tc, tfm_stream_id id) ++{ ++ return (tfm_stream(tc, id) && ++ tfm_stream_data(tc, id) && tfm_stream_size(tc, id)); ++} ++ ++static inline int tfm_cluster_is_set(tfm_cluster_t * tc) ++{ ++ int i; ++ for (i = 0; i < LAST_STREAM; i++) ++ if (!tfm_stream_is_set(tc, i)) ++ return 0; ++ return 1; ++} ++ ++static inline void alternate_streams(tfm_cluster_t * tc) ++{ ++ tfm_stream_t *tmp = tfm_stream(tc, INPUT_STREAM); ++ ++ set_tfm_stream(tc, INPUT_STREAM, tfm_stream(tc, OUTPUT_STREAM)); ++ set_tfm_stream(tc, OUTPUT_STREAM, tmp); ++} ++ ++/* a kind of data that we can write to the window */ ++typedef enum { ++ DATA_WINDOW, /* the data we copy form user space */ ++ HOLE_WINDOW /* zeroes if we write hole */ ++} window_stat; ++ ++/* Sliding window of cluster size which should be set to the approprite position ++ (defined by cluster index) in a file before page cluster modification by ++ file_write. Then we translate file size, offset to write from, number of ++ bytes to write, etc.. to the following configuration needed to estimate ++ number of pages to read before write, etc... 
++*/ ++typedef struct reiser4_slide { ++ unsigned off; /* offset we start to write/truncate from */ ++ unsigned count; /* number of bytes (zeroes) to write/truncate */ ++ unsigned delta; /* number of bytes to append to the hole */ ++ window_stat stat; /* a kind of data to write to the window */ ++} reiser4_slide_t; ++ ++/* The following is a set of possible disk cluster states */ ++typedef enum { ++ INVAL_DISK_CLUSTER, /* unknown state */ ++ PREP_DISK_CLUSTER, /* disk cluster got converted by flush ++ at least 1 time */ ++ UNPR_DISK_CLUSTER, /* disk cluster just created and should be ++ converted by flush */ ++ FAKE_DISK_CLUSTER /* disk cluster doesn't exist neither in memory ++ nor on disk */ ++} disk_cluster_stat; ++ ++/* ++ While implementing all transforms (from page to disk cluster, and back) ++ reiser4 cluster manager fills the following structure incapsulating pointers ++ to all the clusters for the same index including the sliding window above ++*/ ++typedef struct reiser4_cluster { ++ tfm_cluster_t tc; /* transform cluster */ ++ int nr_pages; /* number of pages */ ++ struct page **pages; /* page cluster */ ++ page_cluster_op op; /* page cluster operation */ ++ struct file *file; ++ hint_t *hint; /* disk cluster item for traversal */ ++ disk_cluster_stat dstat; /* state of the current disk cluster */ ++ cloff_t index; /* offset in the units of cluster size */ ++ reiser4_slide_t *win; /* sliding window of cluster size */ ++ int reserved; /* this indicates that space for disk ++ cluster modification is reserved */ ++#if REISER4_DEBUG ++ reiser4_context *ctx; ++ int reserved_prepped; ++ int reserved_unprepped; ++#endif ++ ++} reiser4_cluster_t; ++ ++static inline __u8 * tfm_input_data (reiser4_cluster_t * clust) ++{ ++ return tfm_stream_data(&clust->tc, INPUT_STREAM); ++} ++ ++static inline __u8 * tfm_output_data (reiser4_cluster_t * clust) ++{ ++ return tfm_stream_data(&clust->tc, OUTPUT_STREAM); ++} ++ ++static inline int reset_cluster_pgset(reiser4_cluster_t * clust, int nrpages) ++{ ++ assert("edward-1057", clust->pages != NULL); ++ memset(clust->pages, 0, sizeof(*clust->pages) * nrpages); ++ return 0; ++} ++ ++static inline int alloc_cluster_pgset(reiser4_cluster_t * clust, int nrpages) ++{ ++ assert("edward-949", clust != NULL); ++ assert("edward-1362", clust->pages == NULL); ++ assert("edward-950", nrpages != 0 && nrpages <= MAX_CLUSTER_NRPAGES); ++ ++ clust->pages = ++ kmalloc(sizeof(*clust->pages) * nrpages, GFP_KERNEL); ++ if (!clust->pages) ++ return RETERR(-ENOMEM); ++ reset_cluster_pgset(clust, nrpages); ++ return 0; ++} ++ ++static inline void free_cluster_pgset(reiser4_cluster_t * clust) ++{ ++ assert("edward-951", clust->pages != NULL); ++ kfree(clust->pages); ++ clust->pages = NULL; ++} ++ ++static inline void put_cluster_handle(reiser4_cluster_t * clust) ++{ ++ assert("edward-435", clust != NULL); ++ ++ put_tfm_cluster(&clust->tc); ++ if (clust->pages) ++ free_cluster_pgset(clust); ++ memset(clust, 0, sizeof *clust); ++} ++ ++static inline void inc_keyload_count(crypto_stat_t * data) ++{ ++ assert("edward-1410", data != NULL); ++ data->keyload_count++; ++} ++ ++static inline void dec_keyload_count(crypto_stat_t * data) ++{ ++ assert("edward-1411", data != NULL); ++ assert("edward-1412", data->keyload_count > 0); ++ data->keyload_count--; ++} ++ ++/* cryptcompress specific part of reiser4_inode */ ++typedef struct cryptcompress_info { ++ struct rw_semaphore lock; ++ crypto_stat_t *crypt; ++ int compress_toggle; /* current status of compressibility ++ is set by 
compression mode plugin */ ++#if REISER4_DEBUG ++ int pgcount; /* number of captured pages */ ++#endif ++} cryptcompress_info_t; ++ ++ ++static inline void toggle_compression (cryptcompress_info_t * info, int val) ++{ ++ info->compress_toggle = val; ++} ++ ++static inline int compression_is_on (cryptcompress_info_t * info) ++{ ++ return info->compress_toggle; ++} ++ ++cryptcompress_info_t *cryptcompress_inode_data(const struct inode *); ++int equal_to_rdk(znode *, const reiser4_key *); ++int goto_right_neighbor(coord_t *, lock_handle *); ++int load_file_hint(struct file *, hint_t *); ++void save_file_hint(struct file *, const hint_t *); ++void hint_init_zero(hint_t *); ++int crc_inode_ok(struct inode *inode); ++int jnode_of_cluster(const jnode * node, struct page * page); ++extern int ctail_read_disk_cluster (reiser4_cluster_t *, struct inode *, int); ++extern int do_readpage_ctail(struct inode *, reiser4_cluster_t *, ++ struct page * page); ++extern int ctail_insert_unprepped_cluster(reiser4_cluster_t * clust, ++ struct inode * inode); ++int bind_cryptcompress(struct inode *child, struct inode *parent); ++void destroy_inode_cryptcompress(struct inode * inode); ++crypto_stat_t * inode_crypto_stat (struct inode * inode); ++void inherit_crypto_stat_common(struct inode * parent, struct inode * object, ++ int (*can_inherit)(struct inode * child, ++ struct inode * parent)); ++void attach_crypto_stat(struct inode * inode, crypto_stat_t * info); ++void detach_crypto_stat(struct inode * inode); ++void change_crypto_stat(struct inode * inode, crypto_stat_t * new); ++crypto_stat_t * alloc_crypto_stat (struct inode * inode); ++ ++ ++static inline reiser4_tfma_t * ++info_get_tfma (crypto_stat_t * info, reiser4_tfm id) ++{ ++ return &info->tfma[id]; ++} ++ ++static inline struct crypto_tfm * ++info_get_tfm (crypto_stat_t * info, reiser4_tfm id) ++{ ++ return info_get_tfma(info, id)->tfm; ++} ++ ++static inline void ++info_set_tfm (crypto_stat_t * info, reiser4_tfm id, struct crypto_tfm * tfm) ++{ ++ info_get_tfma(info, id)->tfm = tfm; ++} ++ ++static inline struct crypto_tfm * ++info_cipher_tfm (crypto_stat_t * info) ++{ ++ return info_get_tfm(info, CIPHER_TFM); ++} ++ ++static inline struct crypto_tfm * ++info_digest_tfm (crypto_stat_t * info) ++{ ++ return info_get_tfm(info, DIGEST_TFM); ++} ++ ++static inline cipher_plugin * ++info_cipher_plugin (crypto_stat_t * info) ++{ ++ return &info_get_tfma(info, CIPHER_TFM)->plug->cipher; ++} ++ ++static inline digest_plugin * ++info_digest_plugin (crypto_stat_t * info) ++{ ++ return &info_get_tfma(info, DIGEST_TFM)->plug->digest; ++} ++ ++static inline void ++info_set_plugin(crypto_stat_t * info, reiser4_tfm id, reiser4_plugin * plugin) ++{ ++ info_get_tfma(info, id)->plug = plugin; ++} ++ ++static inline void ++info_set_cipher_plugin(crypto_stat_t * info, cipher_plugin * cplug) ++{ ++ info_set_plugin(info, CIPHER_TFM, cipher_plugin_to_plugin(cplug)); ++} ++ ++static inline void ++info_set_digest_plugin(crypto_stat_t * info, digest_plugin * plug) ++{ ++ info_set_plugin(info, DIGEST_TFM, digest_plugin_to_plugin(plug)); ++} ++ ++#endif /* __FS_REISER4_CRYPTCOMPRESS_H__ */ ++ ++/* Make Linus happy. 
++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ scroll-step: 1 ++ End: ++*/ +Index: linux-2.6.16/fs/reiser4/plugin/file/file.c +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/plugin/file/file.c +@@ -0,0 +1,2712 @@ ++/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++/* ++ * this file contains implementations of inode/file/address_space/file plugin ++ * operations specific for "unix file plugin" (plugin id is ++ * UNIX_FILE_PLUGIN_ID). "Unix file" is either built of tail items only ++ * (FORMATTING_ID) or of extent items only (EXTENT_POINTER_ID) or empty (have ++ * no items but stat data) ++ */ ++ ++#include "../../inode.h" ++#include "../../super.h" ++#include "../../tree_walk.h" ++#include "../../carry.h" ++#include "../../page_cache.h" ++#include "../../ioctl.h" ++#include "../object.h" ++#include "../../safe_link.h" ++ ++#include ++#include ++#include ++ ++ ++static int unpack(struct file *file, struct inode *inode, int forever); ++ ++/* get unix file plugin specific portion of inode */ ++unix_file_info_t *unix_file_inode_data(const struct inode *inode) ++{ ++ return &reiser4_inode_data(inode)->file_plugin_data.unix_file_info; ++} ++ ++/** ++ * equal_to_rdk - compare key and znode's right delimiting key ++ * @node: node whose right delimiting key to compare with @key ++ * @key: key to compare with @node's right delimiting key ++ * ++ * Returns true if @key is equal to right delimiting key of @node. ++ */ ++int equal_to_rdk(znode *node, const reiser4_key *key) ++{ ++ int result; ++ ++ read_lock_dk(znode_get_tree(node)); ++ result = keyeq(key, znode_get_rd_key(node)); ++ read_unlock_dk(znode_get_tree(node)); ++ return result; ++} ++ ++#if REISER4_DEBUG ++ ++/** ++ * equal_to_ldk - compare key and znode's left delimiting key ++ * @node: node whose left delimiting key to compare with @key ++ * @key: key to compare with @node's left delimiting key ++ * ++ * Returns true if @key is equal to left delimiting key of @node. ++ */ ++int equal_to_ldk(znode *node, const reiser4_key *key) ++{ ++ int result; ++ ++ read_lock_dk(znode_get_tree(node)); ++ result = keyeq(key, znode_get_ld_key(node)); ++ read_unlock_dk(znode_get_tree(node)); ++ return result; ++} ++ ++/** ++ * check_coord - check whether coord corresponds to key ++ * @coord: coord to check ++ * @key: key @coord has to correspond to ++ * ++ * Returns true if @coord is set as if it was set as result of lookup with @key ++ * in coord->node. ++ */ ++static int check_coord(const coord_t *coord, const reiser4_key *key) ++{ ++ coord_t twin; ++ ++ node_plugin_by_node(coord->node)->lookup(coord->node, key, ++ FIND_MAX_NOT_MORE_THAN, &twin); ++ return coords_equal(coord, &twin); ++} ++ ++#endif /* REISER4_DEBUG */ ++ ++/** ++ * init_uf_coord - initialize extended coord ++ * @uf_coord: ++ * @lh: ++ * ++ * ++ */ ++void init_uf_coord(uf_coord_t *uf_coord, lock_handle *lh) ++{ ++ coord_init_zero(&uf_coord->coord); ++ coord_clear_iplug(&uf_coord->coord); ++ uf_coord->lh = lh; ++ init_lh(lh); ++ memset(&uf_coord->extension, 0, sizeof(uf_coord->extension)); ++ uf_coord->valid = 0; ++} ++ ++void validate_extended_coord(uf_coord_t *uf_coord, loff_t offset) ++{ ++ assert("vs-1333", uf_coord->valid == 0); ++ ++ if (coord_is_between_items(&uf_coord->coord)) ++ return; ++ ++ assert("vs-1348", ++ item_plugin_by_coord(&uf_coord->coord)->s.file. 
++ init_coord_extension); ++ ++ item_body_by_coord(&uf_coord->coord); ++ item_plugin_by_coord(&uf_coord->coord)->s.file. ++ init_coord_extension(uf_coord, offset); ++} ++ ++/** ++ * goto_right_neighbor - lock right neighbor, drop current node lock ++ * @coord: ++ * @lh: ++ * ++ * Obtain lock on right neighbor and drop lock on current node. ++ */ ++int goto_right_neighbor(coord_t *coord, lock_handle *lh) ++{ ++ int result; ++ lock_handle lh_right; ++ ++ assert("vs-1100", znode_is_locked(coord->node)); ++ ++ init_lh(&lh_right); ++ result = reiser4_get_right_neighbor(&lh_right, coord->node, ++ znode_is_wlocked(coord->node) ? ++ ZNODE_WRITE_LOCK : ZNODE_READ_LOCK, ++ GN_CAN_USE_UPPER_LEVELS); ++ if (result) { ++ done_lh(&lh_right); ++ return result; ++ } ++ ++ /* ++ * we hold two longterm locks on neighboring nodes. Unlock left of ++ * them ++ */ ++ done_lh(lh); ++ ++ coord_init_first_unit_nocheck(coord, lh_right.node); ++ move_lh(lh, &lh_right); ++ ++ return 0; ++ ++} ++ ++/** ++ * set_file_state ++ * @uf_info: ++ * @cbk_result: ++ * @level: ++ * ++ * This is to be used by find_file_item and in find_file_state to ++ * determine real state of file ++ */ ++static void set_file_state(unix_file_info_t *uf_info, int cbk_result, ++ tree_level level) ++{ ++ if (cbk_errored(cbk_result)) ++ /* error happened in find_file_item */ ++ return; ++ ++ assert("vs-1164", level == LEAF_LEVEL || level == TWIG_LEVEL); ++ ++ if (uf_info->container == UF_CONTAINER_UNKNOWN) { ++ /* ++ * container is unknown, therefore conversion can not be in ++ * progress ++ */ ++ assert("", !inode_get_flag(unix_file_info_to_inode(uf_info), ++ REISER4_PART_IN_CONV)); ++ if (cbk_result == CBK_COORD_NOTFOUND) ++ uf_info->container = UF_CONTAINER_EMPTY; ++ else if (level == LEAF_LEVEL) ++ uf_info->container = UF_CONTAINER_TAILS; ++ else ++ uf_info->container = UF_CONTAINER_EXTENTS; ++ } else { ++ /* ++ * file state is known, check whether it is set correctly if ++ * file is not being tail converted ++ */ ++ if (!inode_get_flag(unix_file_info_to_inode(uf_info), ++ REISER4_PART_IN_CONV)) { ++ assert("vs-1162", ++ ergo(level == LEAF_LEVEL && ++ cbk_result == CBK_COORD_FOUND, ++ uf_info->container == UF_CONTAINER_TAILS)); ++ assert("vs-1165", ++ ergo(level == TWIG_LEVEL && ++ cbk_result == CBK_COORD_FOUND, ++ uf_info->container == UF_CONTAINER_EXTENTS)); ++ } ++ } ++} ++ ++int find_file_item_nohint(coord_t *coord, lock_handle *lh, ++ const reiser4_key *key, znode_lock_mode lock_mode, ++ struct inode *inode) ++{ ++ return object_lookup(inode, key, coord, lh, lock_mode, ++ FIND_MAX_NOT_MORE_THAN, ++ TWIG_LEVEL, LEAF_LEVEL, ++ (lock_mode == ZNODE_READ_LOCK) ? CBK_UNIQUE : ++ (CBK_UNIQUE | CBK_FOR_INSERT), ++ NULL /* ra_info */ ); ++} ++ ++/** ++ * find_file_item - look for file item in the tree ++ * @hint: provides coordinate, lock handle, seal ++ * @key: key for search ++ * @mode: mode of lock to put on returned node ++ * @ra_info: ++ * @inode: ++ * ++ * This finds position in the tree corresponding to @key. It first tries to use ++ * @hint's seal if it is set. 
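++ *
++ * Editor's outline of the strategy implemented below (an illustrative
++ * sketch, not part of the original patch; all names are the ones
++ * declared in this file):
++ *
++ *	if (hint_validate(hint, key, 1, lock_mode) == 0)
++ *		reuse the sealed coord, possibly stepping right
++ *		via goto_right_neighbor();
++ *	else
++ *		find_file_item_nohint(coord, lh, key, lock_mode, inode);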
++ */
++int find_file_item(hint_t *hint, const reiser4_key *key,
++		   znode_lock_mode lock_mode,
++		   struct inode *inode)
++{
++	int result;
++	coord_t *coord;
++	lock_handle *lh;
++
++	assert("nikita-3030", schedulable());
++	assert("vs-1707", hint != NULL);
++	assert("vs-47", inode != NULL);
++
++	coord = &hint->ext_coord.coord;
++	lh = hint->ext_coord.lh;
++	init_lh(lh);
++
++	result = hint_validate(hint, key, 1 /* check key */, lock_mode);
++	if (!result) {
++		if (coord->between == AFTER_UNIT &&
++		    equal_to_rdk(coord->node, key)) {
++			result = goto_right_neighbor(coord, lh);
++			if (result == -E_NO_NEIGHBOR)
++				return RETERR(-EIO);
++			if (result)
++				return result;
++			assert("vs-1152", equal_to_ldk(coord->node, key));
++			/*
++			 * we moved to a different node. Invalidate coord
++			 * extension, zload is necessary to init it again
++			 */
++			hint->ext_coord.valid = 0;
++		}
++
++		set_file_state(unix_file_inode_data(inode), CBK_COORD_FOUND,
++			       znode_get_level(coord->node));
++
++		return CBK_COORD_FOUND;
++	}
++
++	coord_init_zero(coord);
++	result = find_file_item_nohint(coord, lh, key, lock_mode, inode);
++	set_file_state(unix_file_inode_data(inode), result,
++		       znode_get_level(coord->node));
++
++	/* FIXME: we might already have coord extension initialized */
++	hint->ext_coord.valid = 0;
++	return result;
++}
++
++/* plugin->u.file.write_flow = NULL
++   plugin->u.file.read_flow = NULL */
++
++void hint_init_zero(hint_t * hint)
++{
++	memset(hint, 0, sizeof(*hint));
++	init_lh(&hint->lh);
++	hint->ext_coord.lh = &hint->lh;
++}
++
++static int find_file_state(struct inode *inode, unix_file_info_t *uf_info)
++{
++	int result;
++	reiser4_key key;
++	coord_t coord;
++	lock_handle lh;
++
++	assert("vs-1628", ea_obtained(uf_info));
++
++	if (uf_info->container == UF_CONTAINER_UNKNOWN) {
++		key_by_inode_and_offset_common(inode, 0, &key);
++		init_lh(&lh);
++		result = find_file_item_nohint(&coord, &lh, &key,
++					       ZNODE_READ_LOCK, inode);
++		set_file_state(uf_info, result, znode_get_level(coord.node));
++		done_lh(&lh);
++		if (!cbk_errored(result))
++			result = 0;
++	} else
++		result = 0;
++	assert("vs-1074",
++	       ergo(result == 0, uf_info->container != UF_CONTAINER_UNKNOWN));
++	txn_restart_current();
++	return result;
++}
++
++/* estimate and reserve space needed to truncate page which gets partially truncated: one block for page itself, stat
++   data update (estimate_one_insert_into_item) and one item insertion (estimate_one_insert_into_item) which may happen
++   if page corresponds to hole extent and unallocated one will have to be created */
++static int reserve_partial_page(reiser4_tree * tree)
++{
++	grab_space_enable();
++	return reiser4_grab_reserved(reiser4_get_current_sb(),
++				     1 +
++				     2 * estimate_one_insert_into_item(tree),
++				     BA_CAN_COMMIT);
++}
++
++/* estimate and reserve space needed to cut one item and update one stat data */
++static int reserve_cut_iteration(reiser4_tree * tree)
++{
++	__u64 estimate = estimate_one_item_removal(tree)
++	    + estimate_one_insert_into_item(tree);
++
++	assert("nikita-3172", lock_stack_isclean(get_current_lock_stack()));
++
++	grab_space_enable();
++	/* We need to double our estimate now that we can delete more than one
++	   node. */
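++	/* (Editor's note, illustrative only: with the declarations above
++	   the grab below reserves 2 * (estimate_one_item_removal(tree) +
++	   estimate_one_insert_into_item(tree)) blocks - one item removal
++	   plus one stat-data update, doubled.) */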
++	return reiser4_grab_reserved(reiser4_get_current_sb(), estimate * 2,
++				     BA_CAN_COMMIT);
++}
++
++int update_file_size(struct inode *inode, reiser4_key * key, int update_sd)
++{
++	int result = 0;
++
++	INODE_SET_FIELD(inode, i_size, get_key_offset(key));
++	if (update_sd) {
++		inode->i_ctime = inode->i_mtime = CURRENT_TIME;
++		result = reiser4_update_sd(inode);
++	}
++	return result;
++}
++
++/* cut file items one by one starting from the last one until new file size (inode->i_size) is reached. Reserve space
++   and update file stat data on every single cut from the tree */
++int
++cut_file_items(struct inode *inode, loff_t new_size, int update_sd,
++	       loff_t cur_size, int (*update_actor) (struct inode *,
++						     reiser4_key *, int))
++{
++	reiser4_key from_key, to_key;
++	reiser4_key smallest_removed;
++	file_plugin *fplug = inode_file_plugin(inode);
++	int result;
++	int progress = 0;
++
++	assert("vs-1248",
++	       fplug == file_plugin_by_id(UNIX_FILE_PLUGIN_ID) ||
++	       fplug == file_plugin_by_id(CRC_FILE_PLUGIN_ID));
++
++	fplug->key_by_inode(inode, new_size, &from_key);
++	to_key = from_key;
++	set_key_offset(&to_key, cur_size - 1 /*get_key_offset(max_key()) */ );
++	/* this loop normally runs just once */
++	while (1) {
++		result = reserve_cut_iteration(tree_by_inode(inode));
++		if (result)
++			break;
++
++		result = cut_tree_object(current_tree, &from_key, &to_key,
++					 &smallest_removed, inode, 1,
++					 &progress);
++		if (result == -E_REPEAT) {
++			/* -E_REPEAT is a signal to interrupt a long file truncation process */
++			if (progress) {
++				result =
++				    update_actor(inode, &smallest_removed,
++						 update_sd);
++				if (result)
++					break;
++			}
++
++			/* the below does up(sbinfo->delete_sema). Do not get fooled */
++			reiser4_release_reserved(inode->i_sb);
++
++			/* cut_tree_object() was interrupted probably because
++			 * current atom requires commit, we have to release
++			 * transaction handle to allow atom commit. */
++			txn_restart_current();
++			continue;
++		}
++		if (result
++		    && !(result == CBK_COORD_NOTFOUND && new_size == 0
++			 && inode->i_size == 0))
++			break;
++
++		set_key_offset(&smallest_removed, new_size);
++		/* Final sd update after the file gets its correct size */
++		result = update_actor(inode, &smallest_removed, update_sd);
++		break;
++	}
++
++	/* the below does up(sbinfo->delete_sema). Do not get fooled */
++	reiser4_release_reserved(inode->i_sb);
++
++	return result;
++}
++
++int find_or_create_extent(struct page *page);
++
++static int filler(void *vp, struct page *page)
++{
++	return readpage_unix_file_nolock(vp, page);
++}
++
++/* part of truncate_file_body: it is called when truncate is used to make file
++   shorter */
++static int shorten_file(struct inode *inode, loff_t new_size)
++{
++	int result;
++	struct page *page;
++	int padd_from;
++	unsigned long index;
++	char *kaddr;
++	unix_file_info_t *uf_info;
++
++	/*
++	 * all items of ordinary reiser4 file are grouped together. That is why
++	 * we can use cut_tree. Plan B files (for instance) can not be
++	 * truncated that simply
++	 */
++	result = cut_file_items(inode, new_size, 1 /*update_sd */ ,
++				get_key_offset(max_key()), update_file_size);
++	if (result)
++		return result;
++
++	uf_info = unix_file_inode_data(inode);
++	assert("vs-1105", new_size == inode->i_size);
++	if (new_size == 0) {
++		uf_info->container = UF_CONTAINER_EMPTY;
++		return 0;
++	}
++
++	result = find_file_state(inode, uf_info);
++	if (result)
++		return result;
++	if (uf_info->container == UF_CONTAINER_TAILS)
++		/*
++		 * No need to worry about zeroing last page after new file
++		 * end
++		 */
++		return 0;
++
++	padd_from = inode->i_size & (PAGE_CACHE_SIZE - 1);
++	if (!padd_from)
++		/* file is truncated to page boundary */
++		return 0;
++
++	result = reserve_partial_page(tree_by_inode(inode));
++	if (result) {
++		reiser4_release_reserved(inode->i_sb);
++		return result;
++	}
++
++	/* last page is partially truncated - zero its content */
++	index = (inode->i_size >> PAGE_CACHE_SHIFT);
++	page = read_cache_page(inode->i_mapping, index, filler, NULL);
++	if (IS_ERR(page)) {
++		/*
++		 * the below does up(sbinfo->delete_sema). Do not get
++		 * confused
++		 */
++		reiser4_release_reserved(inode->i_sb);
++		if (likely(PTR_ERR(page) == -EINVAL)) {
++			/* looks like file is built of tail items */
++			return 0;
++		}
++		return PTR_ERR(page);
++	}
++	wait_on_page_locked(page);
++	if (!PageUptodate(page)) {
++		page_cache_release(page);
++		/*
++		 * the below does up(sbinfo->delete_sema). Do not get
++		 * confused
++		 */
++		reiser4_release_reserved(inode->i_sb);
++		return RETERR(-EIO);
++	}
++
++	/*
++	 * if the page corresponds to a hole extent unit - an unallocated one
++	 * will be created here. This is not necessary
++	 */
++	result = find_or_create_extent(page);
++
++	/*
++	 * FIXME: cut_file_items has already updated inode. Probably it would
++	 * be better to update it here when file is really truncated
++	 */
++	if (result) {
++		page_cache_release(page);
++		/*
++		 * the below does up(sbinfo->delete_sema). Do not get
++		 * confused
++		 */
++		reiser4_release_reserved(inode->i_sb);
++		return result;
++	}
++
++	lock_page(page);
++	assert("vs-1066", PageLocked(page));
++	kaddr = kmap_atomic(page, KM_USER0);
++	memset(kaddr + padd_from, 0, PAGE_CACHE_SIZE - padd_from);
++	flush_dcache_page(page);
++	kunmap_atomic(kaddr, KM_USER0);
++	unlock_page(page);
++	page_cache_release(page);
++	/* the below does up(sbinfo->delete_sema). Do not get confused */
++	reiser4_release_reserved(inode->i_sb);
++	return 0;
++}
++
++/**
++ * should_have_notail
++ * @uf_info:
++ * @new_size:
++ *
++ * Calls formatting plugin to see whether file of size @new_size has to be
++ * stored in unformatted nodes or in tail items. 0 is returned in the latter
++ * case.
++ */
++static int should_have_notail(const unix_file_info_t *uf_info, loff_t new_size)
++{
++	if (!uf_info->tplug)
++		return 1;
++	return !uf_info->tplug->have_tail(unix_file_info_to_inode(uf_info),
++					  new_size);
++
++}
++
++/**
++ * truncate_file_body - change length of file
++ * @inode: inode of file
++ * @new_size: new file length
++ *
++ * Adjusts items file @inode is built of to match @new_size. It may either cut
++ * items or add them to represent a hole at the end of file. The caller has to
++ * obtain exclusive access to the file.
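++ *
++ * Editor's sketch of the two paths taken below (illustrative only):
++ *
++ *	if (inode->i_size < new_size)
++ *		expanding: convert tails to extents if the formatting
++ *		plugin demands it, then write a hole up to new_size;
++ *	else
++ *		shorten_file(inode, new_size);	cut items and zero the
++ *						partially truncated page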
++ */
++static int truncate_file_body(struct inode *inode, loff_t new_size)
++{
++	int result;
++
++	if (inode->i_size < new_size) {
++		/* expanding truncate */
++		struct dentry dentry;
++		struct file file;
++		unix_file_info_t *uf_info;
++
++		dentry.d_inode = inode;
++		file.f_dentry = &dentry;
++		file.private_data = NULL;
++		file.f_pos = new_size;
++		uf_info = unix_file_inode_data(inode);
++		result = find_file_state(inode, uf_info);
++		if (result)
++			return result;
++
++		if (should_have_notail(uf_info, new_size)) {
++			/*
++			 * file of size @new_size has to be built of
++			 * extents. If it is built of tails - convert to
++			 * extents
++			 */
++			if (uf_info->container == UF_CONTAINER_TAILS) {
++				/*
++				 * if the file is being converted by another
++				 * process - wait until it completes
++				 */
++				while (1) {
++					if (inode_get_flag(inode, REISER4_PART_IN_CONV)) {
++						drop_exclusive_access(uf_info);
++						schedule();
++						get_exclusive_access(uf_info);
++						continue;
++					}
++					break;
++				}
++
++				if (uf_info->container == UF_CONTAINER_TAILS) {
++					result = tail2extent(uf_info);
++					if (result)
++						return result;
++				}
++			}
++			result = write_extent(&file, NULL, 0, &new_size);
++			if (result)
++				return result;
++			uf_info->container = UF_CONTAINER_EXTENTS;
++		} else {
++			if (uf_info->container == UF_CONTAINER_EXTENTS) {
++				result = write_extent(&file, NULL, 0, &new_size);
++				if (result)
++					return result;
++			} else {
++				result = write_tail(&file, NULL, 0, &new_size);
++				if (result)
++					return result;
++				uf_info->container = UF_CONTAINER_TAILS;
++			}
++		}
++		BUG_ON(result > 0);
++		INODE_SET_FIELD(inode, i_size, new_size);
++		file_update_time(&file);
++		result = reiser4_update_sd(inode);
++		BUG_ON(result != 0);
++		reiser4_free_file_fsdata(&file);
++	} else
++		result = shorten_file(inode, new_size);
++	return result;
++}
++
++/* plugin->u.write_sd_by_inode = write_sd_by_inode_common */
++
++/**
++ * load_file_hint - copy hint from struct file to local variable
++ * @file: file to get hint from
++ * @hint: structure to fill
++ *
++ * Reiser4 specific portion of struct file may contain information (hint)
++ * stored on exiting from previous read or write. That information includes
++ * seal of znode and coord within that znode where previous read or write
++ * stopped. This function copies that information to @hint if it was stored or
++ * initializes @hint by 0s otherwise.
++ */
++int load_file_hint(struct file *file, hint_t *hint)
++{
++	reiser4_file_fsdata *fsdata;
++
++	if (file) {
++		fsdata = reiser4_get_file_fsdata(file);
++		if (IS_ERR(fsdata))
++			return PTR_ERR(fsdata);
++
++		spin_lock_inode(file->f_dentry->d_inode);
++		if (seal_is_set(&fsdata->reg.hint.seal)) {
++			*hint = fsdata->reg.hint;
++			init_lh(&hint->lh);
++			hint->ext_coord.lh = &hint->lh;
++			spin_unlock_inode(file->f_dentry->d_inode);
++			/*
++			 * force re-validation of the coord on the first
++			 * iteration of the read/write loop.
++			 */
++			hint->ext_coord.valid = 0;
++			assert("nikita-19892", coords_equal(&hint->seal.coord1,
++							    &hint->ext_coord.coord));
++			return 0;
++		}
++		memset(&fsdata->reg.hint, 0, sizeof(hint_t));
++		spin_unlock_inode(file->f_dentry->d_inode);
++	}
++	hint_init_zero(hint);
++	return 0;
++}
++
++/**
++ * save_file_hint - copy hint to reiser4 private struct file's part
++ * @file: file to save hint in
++ * @hint: hint to save
++ *
++ * This copies @hint to reiser4 private part of struct file. It can help
++ * speed up future accesses to the file.
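++ *
++ * Editor's illustration of the hint round-trip across two system calls,
++ * using only helpers defined in this file (a sketch, not authoritative):
++ *
++ *	set_hint(hint, &key, mode);	     seal coord after an access
++ *	save_file_hint(file, hint);	     stash it in file's fsdata
++ *	...				     next read/write syscall
++ *	load_file_hint(file, hint);
++ *	hint_validate(hint, &key, 1, mode);  returns -E_REPEAT if stale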
++ */ ++void save_file_hint(struct file *file, const hint_t *hint) ++{ ++ reiser4_file_fsdata *fsdata; ++ ++ assert("edward-1337", hint != NULL); ++ ++ if (!file || !seal_is_set(&hint->seal)) ++ return; ++ fsdata = reiser4_get_file_fsdata(file); ++ assert("vs-965", !IS_ERR(fsdata)); ++ assert("nikita-19891", ++ coords_equal(&hint->seal.coord1, &hint->ext_coord.coord)); ++ assert("vs-30", hint->lh.owner == NULL); ++ spin_lock_inode(file->f_dentry->d_inode); ++ fsdata->reg.hint = *hint; ++ spin_unlock_inode(file->f_dentry->d_inode); ++ return; ++} ++ ++void unset_hint(hint_t * hint) ++{ ++ assert("vs-1315", hint); ++ hint->ext_coord.valid = 0; ++ seal_done(&hint->seal); ++ done_lh(&hint->lh); ++} ++ ++/* coord must be set properly. So, that set_hint has nothing to do */ ++void set_hint(hint_t * hint, const reiser4_key * key, znode_lock_mode mode) ++{ ++ ON_DEBUG(coord_t * coord = &hint->ext_coord.coord); ++ assert("vs-1207", WITH_DATA(coord->node, check_coord(coord, key))); ++ ++ seal_init(&hint->seal, &hint->ext_coord.coord, key); ++ hint->offset = get_key_offset(key); ++ hint->mode = mode; ++ done_lh(&hint->lh); ++} ++ ++int hint_is_set(const hint_t * hint) ++{ ++ return seal_is_set(&hint->seal); ++} ++ ++#if REISER4_DEBUG ++static int all_but_offset_key_eq(const reiser4_key * k1, const reiser4_key * k2) ++{ ++ return (get_key_locality(k1) == get_key_locality(k2) && ++ get_key_type(k1) == get_key_type(k2) && ++ get_key_band(k1) == get_key_band(k2) && ++ get_key_ordering(k1) == get_key_ordering(k2) && ++ get_key_objectid(k1) == get_key_objectid(k2)); ++} ++#endif ++ ++int ++hint_validate(hint_t * hint, const reiser4_key * key, int check_key, ++ znode_lock_mode lock_mode) ++{ ++ if (!hint || !hint_is_set(hint) || hint->mode != lock_mode) ++ /* hint either not set or set by different operation */ ++ return RETERR(-E_REPEAT); ++ ++ assert("vs-1277", all_but_offset_key_eq(key, &hint->seal.key)); ++ ++ if (check_key && get_key_offset(key) != hint->offset) ++ /* hint is set for different key */ ++ return RETERR(-E_REPEAT); ++ ++ assert("vs-31", hint->ext_coord.lh == &hint->lh); ++ return seal_validate(&hint->seal, &hint->ext_coord.coord, key, ++ hint->ext_coord.lh, lock_mode, ZNODE_LOCK_LOPRI); ++} ++ ++int xversion; ++ ++/** ++ * find_or_create_extent - ++ * @page: ++ * ++ * ++ */ ++/* look for place at twig level for extent corresponding to page, call extent's writepage method to create ++ unallocated extent if it does not exist yet, initialize jnode, capture page */ ++int find_or_create_extent(struct page *page) ++{ ++ int result; ++ struct inode *inode; ++ int plugged_hole; ++ ++ jnode *node; ++ ++ assert("vs-1065", page->mapping && page->mapping->host); ++ inode = page->mapping->host; ++ ++ lock_page(page); ++ node = jnode_of_page(page); ++ unlock_page(page); ++ if (IS_ERR(node)) ++ return PTR_ERR(node); ++ ++ if (node->blocknr == 0) { ++ plugged_hole = 0; ++ result = update_extent(inode, node, ++ (loff_t)page->index << PAGE_CACHE_SHIFT, ++ &plugged_hole); ++ if (result) { ++ jput(node); ++ warning("", "update_extent failed: %d", result); ++ return result; ++ } ++ if (plugged_hole) ++ reiser4_update_sd(inode); ++ } else { ++ spin_lock_jnode(node); ++ result = try_capture(node, ZNODE_WRITE_LOCK, 0); ++ BUG_ON(result != 0); ++ jnode_make_dirty_locked(node); ++ spin_unlock_jnode(node); ++ } ++ ++ BUG_ON(node->atom == NULL); ++ jput(node); ++ ++ if (get_current_context()->entd) { ++ entd_context *ent = get_entd_context(node->tree->super); ++ ++ if (ent->cur_request->page == page) ++ 
ent->cur_request->node = node; ++ } ++ return 0; ++} ++ ++/** ++ * has_anonymous_pages - check whether inode has pages dirtied via mmap ++ * @inode: inode to check ++ * ++ * Returns true if inode's mapping has dirty pages which do not belong to any ++ * atom. Those are either tagged PAGECACHE_TAG_REISER4_MOVED in mapping's page ++ * tree or were eflushed and can be found via jnodes tagged ++ * EFLUSH_TAG_ANONYMOUS in radix tree of jnodes. ++ */ ++static int has_anonymous_pages(struct inode *inode) ++{ ++ int result; ++ ++ read_lock_irq(&inode->i_mapping->tree_lock); ++ result = radix_tree_tagged(&inode->i_mapping->page_tree, PAGECACHE_TAG_REISER4_MOVED); ++ read_unlock_irq(&inode->i_mapping->tree_lock); ++ return result; ++} ++ ++/** ++ * capture_page_and_create_extent - ++ * @page: page to be captured ++ * ++ * Grabs space for extent creation and stat data update and calls function to ++ * do actual work. ++ */ ++static int capture_page_and_create_extent(struct page *page) ++{ ++ int result; ++ struct inode *inode; ++ ++ assert("vs-1084", page->mapping && page->mapping->host); ++ inode = page->mapping->host; ++ assert("vs-1139", ++ unix_file_inode_data(inode)->container == UF_CONTAINER_EXTENTS); ++ /* page belongs to file */ ++ assert("vs-1393", ++ inode->i_size > ((loff_t) page->index << PAGE_CACHE_SHIFT)); ++ ++ /* page capture may require extent creation (if it does not exist yet) ++ and stat data's update (number of blocks changes on extent ++ creation) */ ++ grab_space_enable(); ++ result = ++ reiser4_grab_space(2 * ++ estimate_one_insert_into_item(tree_by_inode ++ (inode)), ++ BA_CAN_COMMIT); ++ if (likely(!result)) ++ result = find_or_create_extent(page); ++ ++ if (result != 0) ++ SetPageError(page); ++ return result; ++} ++ ++/* this is implementation of method commit_write of struct ++ address_space_operations for unix file plugin */ ++int ++commit_write_unix_file(struct file *file, struct page *page, ++ unsigned from, unsigned to) ++{ ++ reiser4_context *ctx; ++ struct inode *inode; ++ int result; ++ ++ assert("umka-3101", file != NULL); ++ assert("umka-3102", page != NULL); ++ assert("umka-3093", PageLocked(page)); ++ ++ SetPageUptodate(page); ++ ++ inode = page->mapping->host; ++ ctx = init_context(page->mapping->host->i_sb); ++ if (IS_ERR(ctx)) ++ return PTR_ERR(ctx); ++ page_cache_get(page); ++ unlock_page(page); ++ result = capture_page_and_create_extent(page); ++ lock_page(page); ++ page_cache_release(page); ++ ++ /* don't commit transaction under inode semaphore */ ++ context_set_commit_async(ctx); ++ reiser4_exit_context(ctx); ++ return result; ++} ++ ++/* ++ * Support for "anonymous" pages and jnodes. ++ * ++ * When file is write-accessed through mmap pages can be dirtied from the user ++ * level. In this case kernel is not notified until one of following happens: ++ * ++ * (1) msync() ++ * ++ * (2) truncate() (either explicit or through unlink) ++ * ++ * (3) VM scanner starts reclaiming mapped pages, dirtying them before ++ * starting write-back. ++ * ++ * As a result of (3) ->writepage may be called on a dirty page without ++ * jnode. Such page is called "anonymous" in reiser4. Certain work-loads ++ * (iozone) generate huge number of anonymous pages. Emergency flush handles ++ * this situation by creating jnode for anonymous page, starting IO on the ++ * page, and marking jnode with JNODE_KEEPME bit so that it's not thrown out of ++ * memory. Such jnode is also called anonymous. 
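++ *
++ * Editor's sketch of the lifecycle described above (illustrative only):
++ *
++ *	mmap() + store		-> page dirty, no jnode yet, tagged
++ *				   PAGECACHE_TAG_REISER4_MOVED
++ *	writepages_unix_file()	-> capture_anonymous_pages()
++ *				   -> capture_page_and_create_extent()
++ *	atom commit		-> page written back with its atom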
++ * ++ * reiser4_sync_sb() method tries to insert anonymous pages and jnodes into ++ * tree. This is done by capture_anonymous_*() functions below. ++ */ ++ ++/** ++ * capture_anonymous_page - involve page into transaction ++ * @pg: page to deal with ++ * ++ * Takes care that @page has corresponding metadata in the tree, creates jnode ++ * for @page and captures it. On success 1 is returned. ++ */ ++static int capture_anonymous_page(struct page *page) ++{ ++ int result; ++ ++ if (PageWriteback(page)) ++ /* FIXME: do nothing? */ ++ return 0; ++ ++ result = capture_page_and_create_extent(page); ++ if (result == 0) { ++ result = 1; ++ } else ++ warning("nikita-3329", ++ "Cannot capture anon page: %i", result); ++ ++ return result; ++} ++ ++/** ++ * capture_anonymous_pages - find and capture pages dirtied via mmap ++ * @mapping: address space where to look for pages ++ * @index: start index ++ * @to_capture: maximum number of pages to capture ++ * ++ * Looks for pages tagged REISER4_MOVED starting from the *@index-th page, ++ * captures (involves into atom) them, returns number of captured pages, ++ * updates @index to next page after the last captured one. ++ */ ++static int ++capture_anonymous_pages(struct address_space *mapping, pgoff_t *index, ++ unsigned int to_capture) ++{ ++ int result; ++ struct pagevec pvec; ++ unsigned int i, count; ++ int nr; ++ ++ pagevec_init(&pvec, 0); ++ count = min(pagevec_space(&pvec), to_capture); ++ nr = 0; ++ ++ /* find pages tagged MOVED */ ++ write_lock_irq(&mapping->tree_lock); ++ pvec.nr = radix_tree_gang_lookup_tag(&mapping->page_tree, ++ (void **)pvec.pages, *index, count, ++ PAGECACHE_TAG_REISER4_MOVED); ++ if (pagevec_count(&pvec) == 0) { ++ /* ++ * there are no pages tagged MOVED in mapping->page_tree ++ * starting from *index ++ */ ++ write_unlock_irq(&mapping->tree_lock); ++ *index = (pgoff_t)-1; ++ return 0; ++ } ++ ++ /* clear MOVED tag for all found pages */ ++ for (i = 0; i < pagevec_count(&pvec); i++) { ++ void *p; ++ ++ page_cache_get(pvec.pages[i]); ++ p = radix_tree_tag_clear(&mapping->page_tree, pvec.pages[i]->index, ++ PAGECACHE_TAG_REISER4_MOVED); ++ assert("vs-49", p == pvec.pages[i]); ++ } ++ write_unlock_irq(&mapping->tree_lock); ++ ++ ++ *index = pvec.pages[i - 1]->index + 1; ++ ++ for (i = 0; i < pagevec_count(&pvec); i++) { ++ /* ++ * tag PAGECACHE_TAG_REISER4_MOVED will be cleared by ++ * set_page_dirty_internal which is called when jnode is ++ * captured ++ */ ++ result = capture_anonymous_page(pvec.pages[i]); ++ if (result == 1) ++ nr++; ++ else { ++ if (result < 0) { ++ warning("vs-1454", ++ "failed to capture page: " ++ "result=%d, captured=%d)\n", ++ result, i); ++ ++ /* ++ * set MOVED tag to all pages which left not ++ * captured ++ */ ++ write_lock_irq(&mapping->tree_lock); ++ for (; i < pagevec_count(&pvec); i ++) { ++ radix_tree_tag_set(&mapping->page_tree, ++ pvec.pages[i]->index, ++ PAGECACHE_TAG_REISER4_MOVED); ++ } ++ write_unlock_irq(&mapping->tree_lock); ++ ++ pagevec_release(&pvec); ++ return result; ++ } else { ++ /* ++ * result == 0. capture_anonymous_page returns ++ * 0 for Writeback-ed page. 
Set MOVED tag on ++ * that page ++ */ ++ write_lock_irq(&mapping->tree_lock); ++ radix_tree_tag_set(&mapping->page_tree, ++ pvec.pages[i]->index, ++ PAGECACHE_TAG_REISER4_MOVED); ++ write_unlock_irq(&mapping->tree_lock); ++ if (i == 0) ++ *index = pvec.pages[0]->index; ++ else ++ *index = pvec.pages[i - 1]->index + 1; ++ } ++ } ++ } ++ pagevec_release(&pvec); ++ return nr; ++} ++ ++/** ++ * capture_anonymous_jnodes - find and capture anonymous jnodes ++ * @mapping: address space where to look for jnodes ++ * @from: start index ++ * @to: end index ++ * @to_capture: maximum number of jnodes to capture ++ * ++ * Looks for jnodes tagged EFLUSH_TAG_ANONYMOUS in inode's tree of jnodes in ++ * the range of indexes @from-@to and captures them, returns number of captured ++ * jnodes, updates @from to next jnode after the last captured one. ++ */ ++static int ++capture_anonymous_jnodes(struct address_space *mapping, ++ pgoff_t *from, pgoff_t to, int to_capture) ++{ ++ *from = to; ++ return 0; ++} ++ ++/* ++ * Commit atom of the jnode of a page. ++ */ ++static int sync_page(struct page *page) ++{ ++ int result; ++ do { ++ jnode *node; ++ txn_atom *atom; ++ ++ lock_page(page); ++ node = jprivate(page); ++ if (node != NULL) { ++ spin_lock_jnode(node); ++ atom = jnode_get_atom(node); ++ spin_unlock_jnode(node); ++ } else ++ atom = NULL; ++ unlock_page(page); ++ result = sync_atom(atom); ++ } while (result == -E_REPEAT); ++ /* ++ * ZAM-FIXME-HANS: document the logic of this loop, is it just to ++ * handle the case where more pages get added to the atom while we are ++ * syncing it? ++ */ ++ assert("nikita-3485", ergo(result == 0, ++ get_current_context()->trans->atom == NULL)); ++ return result; ++} ++ ++/* ++ * Commit atoms of pages on @pages list. ++ * call sync_page for each page from mapping's page tree ++ */ ++static int sync_page_list(struct inode *inode) ++{ ++ int result; ++ struct address_space *mapping; ++ unsigned long from; /* start index for radix_tree_gang_lookup */ ++ unsigned int found; /* return value for radix_tree_gang_lookup */ ++ ++ mapping = inode->i_mapping; ++ from = 0; ++ result = 0; ++ read_lock_irq(&mapping->tree_lock); ++ while (result == 0) { ++ struct page *page; ++ ++ found = ++ radix_tree_gang_lookup(&mapping->page_tree, (void **)&page, ++ from, 1); ++ assert("", found < 2); ++ if (found == 0) ++ break; ++ ++ /* page may not leave radix tree because it is protected from truncating by inode->i_mutex locked by ++ sys_fsync */ ++ page_cache_get(page); ++ read_unlock_irq(&mapping->tree_lock); ++ ++ from = page->index + 1; ++ ++ result = sync_page(page); ++ ++ page_cache_release(page); ++ read_lock_irq(&mapping->tree_lock); ++ } ++ ++ read_unlock_irq(&mapping->tree_lock); ++ return result; ++} ++ ++static int commit_file_atoms(struct inode *inode) ++{ ++ int result; ++ unix_file_info_t *uf_info; ++ ++ uf_info = unix_file_inode_data(inode); ++ ++ get_exclusive_access(uf_info); ++ /* ++ * find what items file is made from ++ */ ++ result = find_file_state(inode, uf_info); ++ drop_exclusive_access(uf_info); ++ if (result != 0) ++ return result; ++ ++ /* ++ * file state cannot change because we are under ->i_mutex ++ */ ++ switch (uf_info->container) { ++ case UF_CONTAINER_EXTENTS: ++ /* find_file_state might open join an atom */ ++ txn_restart_current(); ++ result = ++ /* ++ * when we are called by ++ * filemap_fdatawrite-> ++ * do_writepages()-> ++ * reiser4_writepages() ++ * ++ * inode->i_mapping->dirty_pages are spices into ++ * ->io_pages, leaving ->dirty_pages dirty. 
++			 *
++			 * When we are called from
++			 * reiser4_fsync()->sync_unix_file(), we have to
++			 * commit atoms of all pages on the ->dirty_list.
++			 *
++			 * So for simplicity we just commit ->io_pages and
++			 * ->dirty_pages.
++			 */
++		    sync_page_list(inode);
++		break;
++	case UF_CONTAINER_TAILS:
++		/*
++		 * NOTE-NIKITA probably we can be smarter for tails. For now
++		 * just commit all existing atoms.
++		 */
++		result = txnmgr_force_commit_all(inode->i_sb, 0);
++		break;
++	case UF_CONTAINER_EMPTY:
++		result = 0;
++		break;
++	case UF_CONTAINER_UNKNOWN:
++	default:
++		result = -EIO;
++		break;
++	}
++
++	/*
++	 * commit current transaction: there can be captured nodes from
++	 * find_file_state() and finish_conversion().
++	 */
++	txn_restart_current();
++	return result;
++}
++
++/**
++ * writepages_unix_file - writepages of struct address_space_operations
++ * @mapping:
++ * @wbc:
++ *
++ * This captures anonymous pages and anonymous jnodes. Anonymous pages are
++ * pages which are dirtied via mmapping. Anonymous jnodes are ones which were
++ * created by reiser4_writepage.
++ */
++int writepages_unix_file(struct address_space *mapping,
++			 struct writeback_control *wbc)
++{
++	int result;
++	unix_file_info_t *uf_info;
++	pgoff_t pindex, jindex, nr_pages;
++	long to_capture;
++	struct inode *inode;
++
++	inode = mapping->host;
++	if (!has_anonymous_pages(inode)) {
++		result = 0;
++		goto end;
++	}
++	jindex = pindex = wbc->start >> PAGE_CACHE_SHIFT;
++	result = 0;
++	nr_pages =
++	    (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
++	uf_info = unix_file_inode_data(inode);
++
++	do {
++		reiser4_context *ctx;
++
++		if (wbc->sync_mode != WB_SYNC_ALL)
++			to_capture = min(wbc->nr_to_write, CAPTURE_APAGE_BURST);
++		else
++			to_capture = CAPTURE_APAGE_BURST;
++
++		ctx = init_context(inode->i_sb);
++		if (IS_ERR(ctx)) {
++			result = PTR_ERR(ctx);
++			break;
++		}
++		/* avoid recursive calls to ->sync_inodes */
++		ctx->nobalance = 1;
++		assert("zam-760", lock_stack_isclean(get_current_lock_stack()));
++		assert("", LOCK_CNT_NIL(inode_sem_w));
++		assert("", LOCK_CNT_NIL(inode_sem_r));
++
++		txn_restart_current();
++
++		/* we have to get nonexclusive access to the file */
++		if (get_current_context()->entd) {
++			/*
++			 * use nonblocking version of nonexclusive_access to
++			 * avoid deadlock which might look like the following:
++			 * process P1 holds NEA on file F1 and called entd to
++			 * reclaim some memory. Entd works for P1 and is going
++			 * to capture pages of file F2. To do that entd has to
++			 * get NEA to F2. F2 is held by process P2 which also
++			 * called entd. But entd is serving P1 at the moment
++			 * and P2 has to wait. Process P3 is trying to get EA
++			 * to file F2. Existence of a pending EA request to
++			 * file F2 makes it impossible for entd to get NEA to
++			 * file F2. Neither of these processes can continue.
++			 * Using the nonblocking version of getting NEA is
++			 * supposed to avoid this deadlock.
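++			 *
++			 * (Editor's condensed picture of the scenario
++			 * above, for illustration only:
++			 *
++			 *	P1:   holds NEA(F1), waits for entd
++			 *	entd: serving P1, needs NEA(F2)
++			 *	P2:   holds NEA(F2), waits for entd
++			 *	P3:   pending EA(F2) blocks new NEA(F2)
++			 * )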
++ */ ++ if (try_to_get_nonexclusive_access(uf_info) == 0) { ++ result = RETERR(-EBUSY); ++ reiser4_exit_context(ctx); ++ break; ++ } ++ } else ++ get_nonexclusive_access(uf_info); ++ ++ while (to_capture > 0) { ++ pgoff_t start; ++ ++ assert("vs-1727", jindex <= pindex); ++ if (pindex == jindex) { ++ start = pindex; ++ result = ++ capture_anonymous_pages(inode->i_mapping, ++ &pindex, ++ to_capture); ++ if (result <= 0) ++ break; ++ to_capture -= result; ++ wbc->nr_to_write -= result; ++ if (start + result == pindex) { ++ jindex = pindex; ++ continue; ++ } ++ if (to_capture <= 0) ++ break; ++ } ++ /* deal with anonymous jnodes between jindex and pindex */ ++ result = ++ capture_anonymous_jnodes(inode->i_mapping, &jindex, ++ pindex, to_capture); ++ if (result < 0) ++ break; ++ to_capture -= result; ++ get_current_context()->nr_captured += result; ++ ++ if (jindex == (pgoff_t) - 1) { ++ assert("vs-1728", pindex == (pgoff_t) - 1); ++ break; ++ } ++ } ++ if (to_capture <= 0) ++ /* there may be left more pages */ ++ __mark_inode_dirty(inode, I_DIRTY_PAGES); ++ ++ drop_nonexclusive_access(uf_info); ++ if (result < 0) { ++ /* error happened */ ++ reiser4_exit_context(ctx); ++ return result; ++ } ++ if (wbc->sync_mode != WB_SYNC_ALL) { ++ reiser4_exit_context(ctx); ++ return 0; ++ } ++ result = commit_file_atoms(inode); ++ reiser4_exit_context(ctx); ++ if (pindex >= nr_pages && jindex == pindex) ++ break; ++ } while (1); ++ ++ end: ++ if (is_in_reiser4_context()) { ++ if (get_current_context()->nr_captured >= CAPTURE_APAGE_BURST) { ++ /* ++ * there are already pages to flush, flush them out, do ++ * not delay until end of reiser4_sync_inodes ++ */ ++ writeout(inode->i_sb, wbc); ++ get_current_context()->nr_captured = 0; ++ } ++ } ++ return result; ++} ++ ++/* ++ * ->sync() method for unix file. ++ * ++ * We are trying to be smart here. Instead of committing all atoms (original ++ * solution), we scan dirty pages of this file and commit all atoms they are ++ * part of. ++ * ++ * Situation is complicated by anonymous pages: i.e., extent-less pages ++ * dirtied through mmap. Fortunately sys_fsync() first calls ++ * filemap_fdatawrite() that will ultimately call reiser4_writepages(), insert ++ * all missing extents and capture anonymous pages. ++ */ ++int sync_unix_file(struct file *file, struct dentry *dentry, int datasync) ++{ ++ reiser4_context *ctx; ++ txn_atom *atom; ++ reiser4_block_nr reserve; ++ ++ ctx = init_context(dentry->d_inode->i_sb); ++ if (IS_ERR(ctx)) ++ return PTR_ERR(ctx); ++ ++ reserve = estimate_update_common(dentry->d_inode); ++ if (reiser4_grab_space(reserve, BA_CAN_COMMIT)) { ++ reiser4_exit_context(ctx); ++ return RETERR(-ENOSPC); ++ } ++ write_sd_by_inode_common(dentry->d_inode); ++ ++ atom = get_current_atom_locked(); ++ spin_lock_txnh(ctx->trans); ++ force_commit_atom(ctx->trans); ++ reiser4_exit_context(ctx); ++ return 0; ++} ++ ++/** ++ * readpage_unix_file_nolock - readpage of struct address_space_operations ++ * @file: ++ * @page: ++ * ++ * Compose a key and search for item containing information about @page ++ * data. If item is found - its readpage method is called. 
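++ *
++ * Editor's illustration of the lookup performed below (names as defined
++ * in this file; a sketch, not part of the original patch):
++ *
++ *	key_by_inode_and_offset_common(inode,
++ *		(loff_t)page->index << PAGE_CACHE_SHIFT, &key);
++ *	result = find_file_item(hint, &key, ZNODE_READ_LOCK, inode);
++ *	then item_plugin_by_coord(coord)->s.file.readpage(coord, page)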
++ */ ++int readpage_unix_file_nolock(struct file *file, struct page *page) ++{ ++ reiser4_context *ctx; ++ int result; ++ struct inode *inode; ++ reiser4_key key; ++ item_plugin *iplug; ++ hint_t *hint; ++ lock_handle *lh; ++ coord_t *coord; ++ ++ assert("vs-1062", PageLocked(page)); ++ assert("vs-976", !PageUptodate(page)); ++ assert("vs-1061", page->mapping && page->mapping->host); ++ ++ if ((page->mapping->host->i_size <= ++ ((loff_t) page->index << PAGE_CACHE_SHIFT))) { ++ /* page is out of file already */ ++ unlock_page(page); ++ return -EINVAL; ++ } ++ ++ inode = page->mapping->host; ++ ctx = init_context(inode->i_sb); ++ if (IS_ERR(ctx)) { ++ unlock_page(page); ++ return PTR_ERR(ctx); ++ } ++ ++ hint = kmalloc(sizeof(*hint), get_gfp_mask()); ++ if (hint == NULL) { ++ unlock_page(page); ++ reiser4_exit_context(ctx); ++ return RETERR(-ENOMEM); ++ } ++ ++ result = load_file_hint(file, hint); ++ if (result) { ++ kfree(hint); ++ unlock_page(page); ++ reiser4_exit_context(ctx); ++ return result; ++ } ++ lh = &hint->lh; ++ ++ /* get key of first byte of the page */ ++ key_by_inode_and_offset_common(inode, ++ (loff_t) page->index << PAGE_CACHE_SHIFT, ++ &key); ++ ++ /* look for file metadata corresponding to first byte of page */ ++ page_cache_get(page); ++ unlock_page(page); ++ result = find_file_item(hint, &key, ZNODE_READ_LOCK, inode); ++ lock_page(page); ++ page_cache_release(page); ++ ++ if (page->mapping == NULL) { ++ /* ++ * readpage allows truncate to run concurrently. Page was ++ * truncated while it was not locked ++ */ ++ done_lh(lh); ++ kfree(hint); ++ unlock_page(page); ++ txn_restart(ctx); ++ reiser4_exit_context(ctx); ++ return -EINVAL; ++ } ++ ++ if (result != CBK_COORD_FOUND || hint->ext_coord.coord.between != AT_UNIT) { ++ if (result == CBK_COORD_FOUND && ++ hint->ext_coord.coord.between != AT_UNIT) ++ /* file is truncated */ ++ result = -EINVAL; ++ done_lh(lh); ++ kfree(hint); ++ unlock_page(page); ++ txn_restart(ctx); ++ reiser4_exit_context(ctx); ++ return result; ++ } ++ ++ /* ++ * item corresponding to page is found. It can not be removed because ++ * znode lock is held ++ */ ++ if (PageUptodate(page)) { ++ done_lh(lh); ++ kfree(hint); ++ unlock_page(page); ++ txn_restart(ctx); ++ reiser4_exit_context(ctx); ++ return 0; ++ } ++ ++ coord = &hint->ext_coord.coord; ++ result = zload(coord->node); ++ if (result) { ++ done_lh(lh); ++ kfree(hint); ++ unlock_page(page); ++ txn_restart(ctx); ++ reiser4_exit_context(ctx); ++ return result; ++ } ++ ++ validate_extended_coord(&hint->ext_coord, ++ (loff_t) page->index << PAGE_CACHE_SHIFT); ++ ++ if (!coord_is_existing_unit(coord)) { ++ /* this indicates corruption */ ++ warning("vs-280", ++ "Looking for page %lu of file %llu (size %lli). " ++ "No file items found (%d). 
File is corrupted?\n", ++ page->index, (unsigned long long)get_inode_oid(inode), ++ inode->i_size, result); ++ zrelse(coord->node); ++ done_lh(lh); ++ kfree(hint); ++ unlock_page(page); ++ txn_restart(ctx); ++ reiser4_exit_context(ctx); ++ return RETERR(-EIO); ++ } ++ ++ /* ++ * get plugin of found item or use plugin if extent if there are no ++ * one ++ */ ++ iplug = item_plugin_by_coord(coord); ++ if (iplug->s.file.readpage) ++ result = iplug->s.file.readpage(coord, page); ++ else ++ result = RETERR(-EINVAL); ++ ++ if (!result) { ++ set_key_offset(&key, ++ (loff_t) (page->index + 1) << PAGE_CACHE_SHIFT); ++ /* FIXME should call set_hint() */ ++ unset_hint(hint); ++ } else { ++ unlock_page(page); ++ unset_hint(hint); ++ } ++ assert("vs-979", ++ ergo(result == 0, (PageLocked(page) || PageUptodate(page)))); ++ assert("vs-9791", ergo(result != 0, !PageLocked(page))); ++ ++ zrelse(coord->node); ++ done_lh(lh); ++ ++ save_file_hint(file, hint); ++ kfree(hint); ++ ++ /* ++ * FIXME: explain why it is needed. HINT: page allocation in write can ++ * not be done when atom is not NULL because reiser4_writepage can not ++ * kick entd and have to eflush ++ */ ++ txn_restart(ctx); ++ reiser4_exit_context(ctx); ++ return result; ++} ++ ++/** ++ * readpage_unix_file - readpage of struct address_space_operations ++ * @file: file @page belongs to ++ * @page: page to read ++ * ++ * Get non exclusive access to a file to avoid races with truncate. If page is ++ * out of file - return error. Call readpage_unix_file_nolock to do the rest. ++ */ ++int readpage_unix_file(struct file *file, struct page *page) ++{ ++ return readpage_unix_file_nolock(file, page); ++} ++ ++static reiser4_block_nr unix_file_estimate_read(struct inode *inode, ++ loff_t count UNUSED_ARG) ++{ ++ /* We should reserve one block, because of updating of the stat data ++ item */ ++ assert("vs-1249", ++ inode_file_plugin(inode)->estimate.update == ++ estimate_update_common); ++ return estimate_update_common(inode); ++} ++ ++/* this is called with nonexclusive access obtained, file's container can not change */ ++static size_t read_file(hint_t * hint, struct file *file, /* file to read from to */ ++ char __user *buf, /* address of user-space buffer */ ++ size_t count, /* number of bytes to read */ ++ loff_t * off) ++{ ++ int result; ++ struct inode *inode; ++ flow_t flow; ++ int (*read_f) (struct file *, flow_t *, hint_t *); ++ coord_t *coord; ++ znode *loaded; ++ ++ inode = file->f_dentry->d_inode; ++ ++ /* build flow */ ++ assert("vs-1250", ++ inode_file_plugin(inode)->flow_by_inode == ++ flow_by_inode_unix_file); ++ result = ++ flow_by_inode_unix_file(inode, buf, 1 /* user space */ , count, ++ *off, READ_OP, &flow); ++ if (unlikely(result)) ++ return result; ++ ++ /* get seal and coord sealed with it from reiser4 private data ++ of struct file. The coord will tell us where our last read ++ of this file finished, and the seal will help to determine ++ if that location is still valid. 
++ */ ++ coord = &hint->ext_coord.coord; ++ while (flow.length && result == 0) { ++ result = ++ find_file_item(hint, &flow.key, ZNODE_READ_LOCK, inode); ++ if (cbk_errored(result)) ++ /* error happened */ ++ break; ++ ++ if (coord->between != AT_UNIT) { ++ /* there were no items corresponding to given offset */ ++ done_lh(hint->ext_coord.lh); ++ break; ++ } ++ ++ loaded = coord->node; ++ result = zload(loaded); ++ if (unlikely(result)) { ++ done_lh(hint->ext_coord.lh); ++ break; ++ } ++ ++ if (hint->ext_coord.valid == 0) ++ validate_extended_coord(&hint->ext_coord, ++ get_key_offset(&flow.key)); ++ ++ assert("vs-4", hint->ext_coord.valid == 1); ++ assert("vs-33", hint->ext_coord.lh == &hint->lh); ++ /* call item's read method */ ++ read_f = item_plugin_by_coord(coord)->s.file.read; ++ result = read_f(file, &flow, hint); ++ zrelse(loaded); ++ done_lh(hint->ext_coord.lh); ++ } ++ ++ return (count - flow.length) ? (count - flow.length) : result; ++} ++ ++/** ++ * read_unix_file - read of struct file_operations ++ * @file: file to read from ++ * @buf: address of user-space buffer ++ * @read_amount: number of bytes to read ++ * @off: position in file to read from ++ * ++ * This is implementation of vfs's read method of struct file_operations for ++ * unix file plugin. ++ */ ++ssize_t read_unix_file(struct file *file, char __user *buf, size_t read_amount, ++ loff_t *off) ++{ ++ reiser4_context *ctx; ++ int result; ++ struct inode *inode; ++ hint_t *hint; ++ unix_file_info_t *uf_info; ++ size_t count, read, left; ++ reiser4_block_nr needed; ++ loff_t size; ++ ++ if (unlikely(read_amount == 0)) ++ return 0; ++ ++ assert("umka-072", file != NULL); ++ assert("umka-074", off != NULL); ++ inode = file->f_dentry->d_inode; ++ assert("vs-972", !inode_get_flag(inode, REISER4_NO_SD)); ++ ++ ctx = init_context(inode->i_sb); ++ if (IS_ERR(ctx)) ++ return PTR_ERR(ctx); ++ ++ hint = kmalloc(sizeof(*hint), get_gfp_mask()); ++ if (hint == NULL) { ++ context_set_commit_async(ctx); ++ reiser4_exit_context(ctx); ++ return RETERR(-ENOMEM); ++ } ++ ++ result = load_file_hint(file, hint); ++ if (result) { ++ kfree(hint); ++ context_set_commit_async(ctx); ++ reiser4_exit_context(ctx); ++ return result; ++ } ++ ++ left = read_amount; ++ count = 0; ++ uf_info = unix_file_inode_data(inode); ++ while (left > 0) { ++ txn_restart_current(); ++ ++ get_nonexclusive_access(uf_info); ++ ++ size = i_size_read(inode); ++ if (*off >= size) { ++ /* position to read from is past the end of file */ ++ drop_nonexclusive_access(uf_info); ++ break; ++ } ++ if (*off + left > size) ++ left = size - *off; ++ ++ /* faultin user page */ ++ if(fault_in_pages_writeable(buf, left > PAGE_CACHE_SIZE ? PAGE_CACHE_SIZE : left)) { ++ drop_nonexclusive_access(uf_info); ++ result = RETERR(-EFAULT); ++ break; ++ } ++ ++ read = read_file(hint, file, buf, ++ left > PAGE_CACHE_SIZE ? PAGE_CACHE_SIZE : left, ++ off); ++ ++ drop_nonexclusive_access(uf_info); ++ ++ if (read < 0) { ++ result = read; ++ break; ++ } ++ left -= read; ++ buf += read; ++ ++ /* update position in a file */ ++ *off += read; ++ /* total number of read bytes */ ++ count += read; ++ } ++ save_file_hint(file, hint); ++ done_lh(&hint->lh); ++ kfree(hint); ++ ++ if (count) { ++ /* ++ * something was read. 
Grab space for stat data update and
++		 * update atime
++		 */
++		needed = unix_file_estimate_read(inode, read_amount);
++		result = reiser4_grab_space_force(needed, BA_CAN_COMMIT);
++		if (result == 0)
++			file_accessed(file);
++		else
++			warning("", "failed to grab space for atime update");
++	}
++
++	context_set_commit_async(ctx);
++	reiser4_exit_context(ctx);
++
++	/* return number of read bytes or error code if nothing is read */
++	return count ? count : result;
++}
++
++/* This function takes care of @file's pages. First of all it checks whether
++   the filesystem is readonly and, if so, gets out. Otherwise, it throws out
++   all pages of the file if it was mapped for read, is going to be mapped for
++   write, and consists of tails. This is done in order not to manage two
++   copies of the data (one in the page cache and a second one in the tails
++   themselves) when mapping files consisting of tails.
++
++   Here tail2extent conversion is also performed if it is allowed and the
++   file is going to be written to or mapped for write. This function may be
++   called from write_unix_file() or mmap_unix_file(). */
++static int check_pages_unix_file(struct file *file, struct inode *inode)
++{
++	reiser4_invalidate_pages(inode->i_mapping, 0,
++				 (inode->i_size + PAGE_CACHE_SIZE -
++				  1) >> PAGE_CACHE_SHIFT, 0);
++	return unpack(file, inode, 0 /* not forever */ );
++}
++
++/**
++ * mmap_unix_file - mmap of struct file_operations
++ * @file: file to mmap
++ * @vma:
++ *
++ * This is implementation of vfs's mmap method of struct file_operations for
++ * unix file plugin. It converts the file to extents if necessary. Sets
++ * reiser4_inode's flag - REISER4_HAS_MMAP.
++ */
++int mmap_unix_file(struct file *file, struct vm_area_struct *vma)
++{
++	reiser4_context *ctx;
++	int result;
++	struct inode *inode;
++	unix_file_info_t *uf_info;
++	reiser4_block_nr needed;
++
++	inode = file->f_dentry->d_inode;
++	ctx = init_context(inode->i_sb);
++	if (IS_ERR(ctx))
++		return PTR_ERR(ctx);
++
++	uf_info = unix_file_inode_data(inode);
++
++	down(&uf_info->write);
++	get_exclusive_access(uf_info);
++
++	if (!IS_RDONLY(inode) && (vma->vm_flags & (VM_MAYWRITE | VM_SHARED))) {
++		/*
++		 * we need file built of extent items. If it is still built of
++		 * tail items we have to convert it. Find what items the file
++		 * is built of
++		 */
++		result = find_file_state(inode, uf_info);
++		if (result != 0) {
++			drop_exclusive_access(uf_info);
++			up(&uf_info->write);
++			reiser4_exit_context(ctx);
++			return result;
++		}
++
++		assert("vs-1648", (uf_info->container == UF_CONTAINER_TAILS ||
++				   uf_info->container == UF_CONTAINER_EXTENTS ||
++				   uf_info->container == UF_CONTAINER_EMPTY));
++		if (uf_info->container == UF_CONTAINER_TAILS) {
++			/*
++			 * invalidate all pages and convert file from tails to
++			 * extents
++			 */
++			result = check_pages_unix_file(file, inode);
++			if (result) {
++				drop_exclusive_access(uf_info);
++				up(&uf_info->write);
++				reiser4_exit_context(ctx);
++				return result;
++			}
++		}
++	}
++
++	/*
++	 * generic_file_mmap will do update_atime. Grab space for stat data
++	 * update.
++	 */
++	needed = inode_file_plugin(inode)->estimate.update(inode);
++	result = reiser4_grab_space_force(needed, BA_CAN_COMMIT);
++	if (result) {
++		drop_exclusive_access(uf_info);
++		up(&uf_info->write);
++		reiser4_exit_context(ctx);
++		return result;
++	}
++
++	result = generic_file_mmap(file, vma);
++	if (result == 0) {
++		/* mark file as having mapping. */
++		inode_set_flag(inode, REISER4_HAS_MMAP);
++	}
++
++	drop_exclusive_access(uf_info);
++	up(&uf_info->write);
++	reiser4_exit_context(ctx);
++	return result;
++}
++
++/**
++ * find_first_item
++ * @inode:
++ *
++ * Finds file item which is responsible for first byte in the file.
++ */
++static int find_first_item(struct inode *inode)
++{
++	coord_t coord;
++	lock_handle lh;
++	reiser4_key key;
++	int result;
++
++	coord_init_zero(&coord);
++	init_lh(&lh);
++	inode_file_plugin(inode)->key_by_inode(inode, 0, &key);
++	result = find_file_item_nohint(&coord, &lh, &key, ZNODE_READ_LOCK,
++				       inode);
++	if (result == CBK_COORD_FOUND) {
++		if (coord.between == AT_UNIT) {
++			result = zload(coord.node);
++			if (result == 0) {
++				result = item_id_by_coord(&coord);
++				zrelse(coord.node);
++				if (result != EXTENT_POINTER_ID &&
++				    result != FORMATTING_ID)
++					result = RETERR(-EIO);
++			}
++		} else
++			result = RETERR(-EIO);
++	}
++	done_lh(&lh);
++	return result;
++}
++
++/**
++ * open_unix_file
++ * @inode:
++ * @file:
++ *
++ * If the filesystem is not readonly, complete an uncompleted tail conversion
++ * if there was one
++ */
++int open_unix_file(struct inode *inode, struct file *file)
++{
++	int result;
++	reiser4_context *ctx;
++	unix_file_info_t *uf_info;
++
++	if (IS_RDONLY(inode))
++		return 0;
++
++	if (!inode_get_flag(inode, REISER4_PART_MIXED))
++		return 0;
++
++	ctx = init_context(inode->i_sb);
++	if (IS_ERR(ctx))
++		return PTR_ERR(ctx);
++
++	uf_info = unix_file_inode_data(inode);
++	get_exclusive_access(uf_info);
++
++	/*
++	 * it may happen that another process is doing tail conversion. Wait
++	 * until it completes
++	 */
++	while (1) {
++		if (inode_get_flag(inode, REISER4_PART_IN_CONV)) {
++			drop_exclusive_access(uf_info);
++			schedule();
++			get_exclusive_access(uf_info);
++			continue;
++		}
++		break;
++	}
++
++	if (!inode_get_flag(inode, REISER4_PART_MIXED)) {
++		/*
++		 * other process completed the conversion
++		 */
++		drop_exclusive_access(uf_info);
++		reiser4_exit_context(ctx);
++		return 0;
++	}
++
++	/*
++	 * the file was left in a semi-converted state after an unclean
++	 * shutdown, or another thread is doing the conversion and dropped
++	 * exclusive access while balancing dirty pages. Complete the
++	 * conversion
++	 */
++	result = find_first_item(inode);
++	if (result == EXTENT_POINTER_ID)
++		/*
++		 * first item is extent, therefore there was incomplete
++		 * tail2extent conversion. Complete it
++		 */
++		result = tail2extent(unix_file_inode_data(inode));
++	else if (result == FORMATTING_ID)
++		/*
++		 * first item is formatting item, therefore there was
++		 * incomplete extent2tail conversion. Complete it
++		 */
++		result = extent2tail(unix_file_inode_data(inode));
++	else
++		result = -EIO;
++
++	assert("vs-1712",
++	       ergo(result == 0, (!inode_get_flag(inode, REISER4_PART_MIXED) &&
++				  !inode_get_flag(inode, REISER4_PART_IN_CONV))));
++	drop_exclusive_access(uf_info);
++	reiser4_exit_context(ctx);
++	return result;
++}
++
++#define NEITHER_OBTAINED 0
++#define EA_OBTAINED 1
++#define NEA_OBTAINED 2
++
++static void drop_access(unix_file_info_t *uf_info)
++{
++	if (uf_info->exclusive_use)
++		drop_exclusive_access(uf_info);
++	else
++		drop_nonexclusive_access(uf_info);
++}
++
++#define debug_wuf(format, ...)
printk("%s: %d: %s: " format "\n", \ ++ __FILE__, __LINE__, __FUNCTION__, ## __VA_ARGS__) ++ ++void balance_dirty_pages(struct address_space *mapping); ++ ++/** ++ * write_unix_file - write of struct file_operations ++ * @file: file to write to ++ * @buf: address of user-space buffer ++ * @write_amount: number of bytes to write ++ * @off: position in file to write to ++ * ++ * This is implementation of vfs's write method of struct file_operations for ++ * unix file plugin. ++ */ ++ssize_t write_unix_file(struct file *file, const char __user *buf, ++ size_t count, loff_t *pos) ++{ ++ int result; ++ reiser4_context *ctx; ++ struct inode *inode; ++ unix_file_info_t *uf_info; ++ ssize_t written; ++ int try_free_space; ++ int to_write = PAGE_CACHE_SIZE * WRITE_GRANULARITY; ++ size_t left; ++ ssize_t (*write_op)(struct file *, const char __user *, size_t, ++ loff_t *pos); ++ int ea; ++ loff_t new_size; ++ ++ inode = file->f_dentry->d_inode; ++ ctx = init_context(inode->i_sb); ++ if (IS_ERR(ctx)) ++ return PTR_ERR(ctx); ++ ++ mutex_lock(&inode->i_mutex); ++ ++ assert("vs-947", !inode_get_flag(inode, REISER4_NO_SD)); ++ assert("vs-9471", (!inode_get_flag(inode, REISER4_PART_MIXED))); ++ ++ /* check amount of bytes to write and writing position */ ++ result = generic_write_checks(file, pos, &count, 0); ++ if (result) { ++ mutex_unlock(&inode->i_mutex); ++ context_set_commit_async(ctx); ++ reiser4_exit_context(ctx); ++ return result; ++ } ++ ++ result = remove_suid(file->f_dentry); ++ if (result) { ++ mutex_unlock(&inode->i_mutex); ++ context_set_commit_async(ctx); ++ reiser4_exit_context(ctx); ++ return result; ++ } ++ ++ uf_info = unix_file_inode_data(inode); ++ ++ current->backing_dev_info = inode->i_mapping->backing_dev_info; ++ written = 0; ++ try_free_space = 0; ++ left = count; ++ ea = NEITHER_OBTAINED; ++ ++ new_size = i_size_read(inode); ++ if (*pos + count > new_size) ++ new_size = *pos + count; ++ ++ while (left) { ++ if (left < to_write) ++ to_write = left; ++ ++ if (uf_info->container == UF_CONTAINER_EMPTY) { ++ get_exclusive_access(uf_info); ++ ea = EA_OBTAINED; ++ if (uf_info->container != UF_CONTAINER_EMPTY) { ++ /* file is made not empty by another process */ ++ drop_exclusive_access(uf_info); ++ ea = NEITHER_OBTAINED; ++ continue; ++ } ++ } else if (uf_info->container == UF_CONTAINER_UNKNOWN) { ++ /* ++ * get exclusive access directly just to not have to ++ * re-obtain it if file will appear empty ++ */ ++ get_exclusive_access(uf_info); ++ ea = EA_OBTAINED; ++ result = find_file_state(inode, uf_info); ++ if (result) { ++ drop_exclusive_access(uf_info); ++ ea = NEITHER_OBTAINED; ++ break; ++ } ++ } else { ++ get_nonexclusive_access(uf_info); ++ ea = NEA_OBTAINED; ++ } ++ ++ /* either EA or NEA is obtained. 
Choose item write method */
++		if (uf_info->container == UF_CONTAINER_EXTENTS) {
++			/* file is built of extent items */
++			write_op = write_extent;
++		} else if (uf_info->container == UF_CONTAINER_EMPTY) {
++			/* file is empty */
++			if (should_have_notail(uf_info, new_size))
++				write_op = write_extent;
++			else
++				write_op = write_tail;
++		} else {
++			/* file is built of tail items */
++			if (should_have_notail(uf_info, new_size)) {
++				if (ea == NEA_OBTAINED) {
++					drop_nonexclusive_access(uf_info);
++					get_exclusive_access(uf_info);
++					ea = EA_OBTAINED;
++				}
++				if (uf_info->container == UF_CONTAINER_TAILS) {
++					/*
++					 * if the file is being converted by
++					 * another process - wait until it
++					 * completes
++					 */
++					while (1) {
++						if (inode_get_flag(inode, REISER4_PART_IN_CONV)) {
++							drop_exclusive_access(uf_info);
++							schedule();
++							get_exclusive_access(uf_info);
++							continue;
++						}
++						break;
++					}
++					if (uf_info->container == UF_CONTAINER_TAILS) {
++						result = tail2extent(uf_info);
++						if (result)
++							break;
++					}
++				}
++				drop_exclusive_access(uf_info);
++				ea = NEITHER_OBTAINED;
++				continue;
++			}
++			write_op = write_tail;
++		}
++
++		written = write_op(file, buf, to_write, pos);
++		if (written == -ENOSPC && try_free_space) {
++			drop_access(uf_info);
++			txnmgr_force_commit_all(inode->i_sb, 0);
++			try_free_space = 0;
++			continue;
++		}
++		if (written < 0) {
++			drop_access(uf_info);
++			result = written;
++			break;
++		}
++		/* something is written. */
++		if (uf_info->container == UF_CONTAINER_EMPTY) {
++			assert("", ea == EA_OBTAINED);
++			uf_info->container = (write_op == write_extent) ?
++			    UF_CONTAINER_EXTENTS : UF_CONTAINER_TAILS;
++		} else {
++			assert("", ergo(uf_info->container == UF_CONTAINER_EXTENTS,
++					write_op == write_extent));
++			assert("", ergo(uf_info->container == UF_CONTAINER_TAILS,
++					write_op == write_tail));
++		}
++		if (*pos + written > inode->i_size)
++			INODE_SET_FIELD(inode, i_size, *pos + written);
++		file_update_time(file);
++		result = reiser4_update_sd(inode);
++		if (result) {
++			mutex_unlock(&inode->i_mutex);
++			current->backing_dev_info = NULL;
++			drop_access(uf_info);
++			context_set_commit_async(ctx);
++			reiser4_exit_context(ctx);
++			return result;
++		}
++		drop_access(uf_info);
++		ea = NEITHER_OBTAINED;
++		txn_restart(ctx);
++		current->journal_info = NULL;
++		/*
++		 * tell VM how many pages were dirtied. Maybe number of pages
++		 * which were dirty already should not be counted
++		 */
++		balance_dirty_pages(inode->i_mapping);
++		current->journal_info = ctx;
++
++		left -= written;
++		buf += written;
++		*pos += written;
++	}
++
++	mutex_unlock(&inode->i_mutex);
++
++	if (result == 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
++		txn_restart_current();
++		grab_space_enable();
++		result = sync_unix_file(file, file->f_dentry,
++					0 /* data and stat data */ );
++		if (result)
++			warning("reiser4-7", "failed to sync file %llu",
++				(unsigned long long)get_inode_oid(inode));
++	}
++
++	current->backing_dev_info = NULL;
++
++	reiser4_exit_context(ctx);
++
++	/*
++	 * return number of written bytes or error code if nothing is
++	 * written. Note that it does not work correctly in case
++	 * sync_unix_file returns an error
++	 */
++	return (count - left) ? (count - left) : result;
++}
++
++/**
++ * release_unix_file - release of struct file_operations
++ * @inode: inode of released file
++ * @file: file to release
++ *
++ * Implementation of release method of struct file_operations for unix file
++ * plugin. If the last reference to the inode is released - convert all extent
++ * items into tail items if necessary. Frees reiser4 specific file data.
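++ *
++ * (Editor's sketch of the conversion condition applied below,
++ * illustrative only:
++ *
++ *	if (d_count == 1 && container == UF_CONTAINER_EXTENTS &&
++ *	    !should_have_notail(uf_info, inode->i_size) && !rofs_inode(inode))
++ *		extent2tail(uf_info);
++ * )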
++/**
++ * release_unix_file - release of struct file_operations
++ * @inode: inode of released file
++ * @file: file to release
++ *
++ * Implementation of release method of struct file_operations for unix file
++ * plugin. If the last reference to the inode is released, convert all extent
++ * items into tail items if necessary. Frees reiser4 specific file data.
++ */
++int release_unix_file(struct inode *inode, struct file *file)
++{
++	reiser4_context *ctx;
++	unix_file_info_t *uf_info;
++	int result;
++	int in_reiser4;
++
++	in_reiser4 = is_in_reiser4_context();
++
++	ctx = init_context(inode->i_sb);
++	if (IS_ERR(ctx))
++		return PTR_ERR(ctx);
++
++	result = 0;
++	if (in_reiser4 == 0) {
++		uf_info = unix_file_inode_data(inode);
++
++		down(&uf_info->write);
++		get_exclusive_access(uf_info);
++		if (atomic_read(&file->f_dentry->d_count) == 1 &&
++		    uf_info->container == UF_CONTAINER_EXTENTS &&
++		    !should_have_notail(uf_info, inode->i_size) &&
++		    !rofs_inode(inode)) {
++			result = extent2tail(uf_info);
++			if (result != 0) {
++				warning("nikita-3233",
++					"Failed (%d) to convert in %s (%llu)",
++					result, __FUNCTION__,
++					(unsigned long long)
++					get_inode_oid(inode));
++			}
++		}
++		drop_exclusive_access(uf_info);
++		up(&uf_info->write);
++	} else {
++		/*
++		   we are within reiser4 context already. How is the latter
++		   possible? Simple:
++
++		   (gdb) bt
++		   #0  get_exclusive_access ()
++		   #2  0xc01e56d3 in release_unix_file ()
++		   #3  0xc01c3643 in reiser4_release ()
++		   #4  0xc014cae0 in __fput ()
++		   #5  0xc013ffc3 in remove_vm_struct ()
++		   #6  0xc0141786 in exit_mmap ()
++		   #7  0xc0118480 in mmput ()
++		   #8  0xc0133205 in oom_kill ()
++		   #9  0xc01332d1 in out_of_memory ()
++		   #10 0xc013bc1d in try_to_free_pages ()
++		   #11 0xc013427b in __alloc_pages ()
++		   #12 0xc013f058 in do_anonymous_page ()
++		   #13 0xc013f19d in do_no_page ()
++		   #14 0xc013f60e in handle_mm_fault ()
++		   #15 0xc01131e5 in do_page_fault ()
++		   #16 0xc0104935 in error_code ()
++		   #17 0xc025c0c6 in __copy_to_user_ll ()
++		   #18 0xc01d496f in read_tail ()
++		   #19 0xc01e4def in read_unix_file ()
++		   #20 0xc01c3504 in reiser4_read ()
++		   #21 0xc014bd4f in vfs_read ()
++		   #22 0xc014bf66 in sys_read ()
++		 */
++		warning("vs-44", "out of memory?");
++	}
++
++	reiser4_free_file_fsdata(file);
++
++	reiser4_exit_context(ctx);
++	return result;
++}
++
++static void set_file_notail(struct inode *inode)
++{
++	reiser4_inode *state;
++	formatting_plugin *tplug;
++
++	state = reiser4_inode_data(inode);
++	tplug = formatting_plugin_by_id(NEVER_TAILS_FORMATTING_ID);
++	plugin_set_formatting(&state->pset, tplug);
++	inode_set_plugin(inode,
++			 formatting_plugin_to_plugin(tplug), PSET_FORMATTING);
++}
++
++/* if file is built of tails - convert it to extents */
++static int unpack(struct file *filp, struct inode *inode, int forever)
++{
++	int result = 0;
++	unix_file_info_t *uf_info;
++
++	uf_info = unix_file_inode_data(inode);
++	assert("vs-1628", ea_obtained(uf_info));
++
++	result = find_file_state(inode, uf_info);
++	if (result)
++		return result;
++	assert("vs-1074", uf_info->container != UF_CONTAINER_UNKNOWN);
++
++	if (uf_info->container == UF_CONTAINER_TAILS) {
++		/*
++		 * if file is being converted by another process - wait until
++		 * it completes
++		 */
++		while (1) {
++			if (inode_get_flag(inode, REISER4_PART_IN_CONV)) {
++				drop_exclusive_access(uf_info);
++				schedule();
++				get_exclusive_access(uf_info);
++				continue;
++			}
++			break;
++		}
++		if (uf_info->container == UF_CONTAINER_TAILS) {
++			result = tail2extent(uf_info);
++			if (result)
++				return result;
++		}
++	}
++	if (forever) {
++		/* save new formatting plugin in stat data */
++		__u64 tograb;
++
++		set_file_notail(inode);
++
++		grab_space_enable();
++		tograb = inode_file_plugin(inode)->estimate.update(inode);
++		result = reiser4_grab_space(tograb, BA_CAN_COMMIT);
++		result = reiser4_update_sd(inode);
++	}
++
++	return result;
++}
++
++/* implementation of vfs' ioctl method of struct file_operations for unix file
++   plugin
++*/
++int
++ioctl_unix_file(struct inode *inode, struct file *filp,
++		unsigned int cmd, unsigned long arg UNUSED_ARG)
++{
++	reiser4_context *ctx;
++	int result;
++
++	ctx = init_context(inode->i_sb);
++	if (IS_ERR(ctx))
++		return PTR_ERR(ctx);
++
++	switch (cmd) {
++	case REISER4_IOC_UNPACK:
++		get_exclusive_access(unix_file_inode_data(inode));
++		result = unpack(filp, inode, 1 /* forever */ );
++		drop_exclusive_access(unix_file_inode_data(inode));
++		break;
++
++	default:
++		result = RETERR(-ENOSYS);
++		break;
++	}
++	reiser4_exit_context(ctx);
++	return result;
++}
++
++/* implementation of vfs' bmap method of struct address_space_operations for
++   unix file plugin
++*/
++sector_t bmap_unix_file(struct address_space * mapping, sector_t lblock)
++{
++	reiser4_context *ctx;
++	sector_t result;
++	reiser4_key key;
++	coord_t coord;
++	lock_handle lh;
++	struct inode *inode;
++	item_plugin *iplug;
++	sector_t block;
++
++	inode = mapping->host;
++
++	ctx = init_context(inode->i_sb);
++	if (IS_ERR(ctx))
++		return PTR_ERR(ctx);
++	key_by_inode_and_offset_common(inode,
++				       (loff_t) lblock * current_blocksize,
++				       &key);
++
++	init_lh(&lh);
++	result =
++	    find_file_item_nohint(&coord, &lh, &key, ZNODE_READ_LOCK, inode);
++	if (cbk_errored(result)) {
++		done_lh(&lh);
++		reiser4_exit_context(ctx);
++		return result;
++	}
++
++	result = zload(coord.node);
++	if (result) {
++		done_lh(&lh);
++		reiser4_exit_context(ctx);
++		return result;
++	}
++
++	iplug = item_plugin_by_coord(&coord);
++	if (iplug->s.file.get_block) {
++		result = iplug->s.file.get_block(&coord, lblock, &block);
++		if (result == 0)
++			result = block;
++	} else
++		result = RETERR(-EINVAL);
++
++	zrelse(coord.node);
++	done_lh(&lh);
++	reiser4_exit_context(ctx);
++	return result;
++}
++
++/**
++ * flow_by_inode_unix_file - initialize structure flow
++ * @inode: inode of file for which read or write is about to happen
++ * @buf: buffer to perform read to or write from
++ * @user: flag showing whether @buf is user space or kernel space
++ * @size: size of buffer @buf
++ * @off: start offset for read or write
++ * @op: READ or WRITE
++ * @flow:
++ *
++ * Initializes fields of @flow: key, size of data, i/o mode (read or write).
++ */ ++int flow_by_inode_unix_file(struct inode *inode, ++ const char __user *buf, int user, ++ loff_t size, loff_t off, ++ rw_op op, flow_t *flow) ++{ ++ assert("nikita-1100", inode != NULL); ++ ++ flow->length = size; ++ memcpy(&flow->data, &buf, sizeof(buf)); ++ flow->user = user; ++ flow->op = op; ++ assert("nikita-1931", inode_file_plugin(inode) != NULL); ++ assert("nikita-1932", ++ inode_file_plugin(inode)->key_by_inode == ++ key_by_inode_and_offset_common); ++ /* calculate key of write position and insert it into flow->key */ ++ return key_by_inode_and_offset_common(inode, off, &flow->key); ++} ++ ++/* plugin->u.file.set_plug_in_sd = NULL ++ plugin->u.file.set_plug_in_inode = NULL ++ plugin->u.file.create_blank_sd = NULL */ ++/* plugin->u.file.delete */ ++/* ++ plugin->u.file.add_link = add_link_common ++ plugin->u.file.rem_link = NULL */ ++ ++/* plugin->u.file.owns_item ++ this is common_file_owns_item with assertion */ ++/* Audited by: green(2002.06.15) */ ++int ++owns_item_unix_file(const struct inode *inode /* object to check against */ , ++ const coord_t * coord /* coord to check */ ) ++{ ++ int result; ++ ++ result = owns_item_common(inode, coord); ++ if (!result) ++ return 0; ++ if (item_type_by_coord(coord) != UNIX_FILE_METADATA_ITEM_TYPE) ++ return 0; ++ assert("vs-547", ++ item_id_by_coord(coord) == EXTENT_POINTER_ID || ++ item_id_by_coord(coord) == FORMATTING_ID); ++ return 1; ++} ++ ++static int setattr_truncate(struct inode *inode, struct iattr *attr) ++{ ++ int result; ++ int s_result; ++ loff_t old_size; ++ reiser4_tree *tree; ++ ++ inode_check_scale(inode, inode->i_size, attr->ia_size); ++ ++ old_size = inode->i_size; ++ tree = tree_by_inode(inode); ++ ++ result = safe_link_grab(tree, BA_CAN_COMMIT); ++ if (result == 0) ++ result = safe_link_add(inode, SAFE_TRUNCATE); ++ if (result == 0) ++ result = truncate_file_body(inode, attr->ia_size); ++ if (result) ++ warning("vs-1588", "truncate_file failed: oid %lli, " ++ "old size %lld, new size %lld, retval %d", ++ (unsigned long long)get_inode_oid(inode), ++ old_size, attr->ia_size, result); ++ ++ s_result = safe_link_grab(tree, BA_CAN_COMMIT); ++ if (s_result == 0) ++ s_result = ++ safe_link_del(tree, get_inode_oid(inode), SAFE_TRUNCATE); ++ if (s_result != 0) { ++ warning("nikita-3417", "Cannot kill safelink %lli: %i", ++ (unsigned long long)get_inode_oid(inode), s_result); ++ } ++ safe_link_release(tree); ++ return result; ++} ++ ++/* plugin->u.file.setattr method */ ++/* This calls inode_setattr and if truncate is in effect it also takes ++ exclusive inode access to avoid races */ ++int setattr_unix_file(struct dentry *dentry, /* Object to change attributes */ ++ struct iattr *attr /* change description */ ) ++{ ++ int result; ++ ++ if (attr->ia_valid & ATTR_SIZE) { ++ reiser4_context *ctx; ++ unix_file_info_t *uf_info; ++ ++ /* truncate does reservation itself and requires exclusive ++ access obtained */ ++ ctx = init_context(dentry->d_inode->i_sb); ++ if (IS_ERR(ctx)) ++ return PTR_ERR(ctx); ++ ++ uf_info = unix_file_inode_data(dentry->d_inode); ++ down(&uf_info->write); ++ get_exclusive_access(uf_info); ++ result = setattr_truncate(dentry->d_inode, attr); ++ drop_exclusive_access(uf_info); ++ up(&uf_info->write); ++ context_set_commit_async(ctx); ++ reiser4_exit_context(ctx); ++ } else ++ result = setattr_common(dentry, attr); ++ ++ return result; ++} ++ ++/* plugin->u.file.init_inode_data */ ++void ++init_inode_data_unix_file(struct inode *inode, ++ reiser4_object_create_data * crd, int create) ++{ ++ 
unix_file_info_t *data;
++
++	data = unix_file_inode_data(inode);
++	data->container = create ? UF_CONTAINER_EMPTY : UF_CONTAINER_UNKNOWN;
++	init_rwsem(&data->latch);
++	sema_init(&data->write, 1);
++	data->tplug = inode_formatting_plugin(inode);
++	data->exclusive_use = 0;
++
++#if REISER4_DEBUG
++	data->ea_owner = NULL;
++	atomic_set(&data->nr_neas, 0);
++#endif
++	init_inode_ordering(inode, crd, create);
++}
++
++/**
++ * delete_object_unix_file - delete_object of file_plugin
++ * @inode: inode to be deleted
++ *
++ * Truncates file to length 0, removes stat data and safe link.
++ */
++int delete_object_unix_file(struct inode *inode)
++{
++	unix_file_info_t *uf_info;
++	int result;
++
++	if (inode_get_flag(inode, REISER4_NO_SD))
++		return 0;
++
++	/* truncate file body first */
++	uf_info = unix_file_inode_data(inode);
++	get_exclusive_access(uf_info);
++	result = truncate_file_body(inode, 0 /* size */ );
++	drop_exclusive_access(uf_info);
++
++	if (result)
++		warning("", "failed to truncate file (%llu) on removal: %d",
++			get_inode_oid(inode), result);
++
++	/* remove stat data and safe link */
++	return delete_object_common(inode);
++}
++
++/**
++ * sendfile_unix_file - sendfile of struct file_operations
++ * @file: file to be sent
++ * @ppos: position to start from
++ * @count: number of bytes to send
++ * @actor: function to copy data
++ * @target: where to copy read data
++ *
++ * Reads @count bytes from @file and calls @actor for every page read. This is
++ * needed for loopback device support.
++ */
++ssize_t
++sendfile_unix_file(struct file *file, loff_t *ppos, size_t count,
++		   read_actor_t actor, void *target)
++{
++	reiser4_context *ctx;
++	ssize_t result;
++	struct inode *inode;
++	unix_file_info_t *uf_info;
++
++	inode = file->f_dentry->d_inode;
++	ctx = init_context(inode->i_sb);
++	if (IS_ERR(ctx))
++		return PTR_ERR(ctx);
++
++	/*
++	 * generic_file_sendfile may want to call update_atime.
Grab space for ++ * stat data update ++ */ ++ result = reiser4_grab_space(estimate_update_common(inode), ++ BA_CAN_COMMIT); ++ if (result) ++ goto error; ++ mutex_lock(&inode->i_mutex); ++ inode_set_flag(inode, REISER4_HAS_MMAP); ++ mutex_unlock(&inode->i_mutex); ++ ++ uf_info = unix_file_inode_data(inode); ++ get_nonexclusive_access(uf_info); ++ result = generic_file_sendfile(file, ppos, count, actor, target); ++ drop_nonexclusive_access(uf_info); ++ error: ++ reiser4_exit_context(ctx); ++ return result; ++} ++ ++int ++prepare_write_unix_file(struct file *file, struct page *page, ++ unsigned from, unsigned to) ++{ ++ reiser4_context *ctx; ++ unix_file_info_t *uf_info; ++ int ret; ++ ++ ctx = init_context(file->f_dentry->d_inode->i_sb); ++ if (IS_ERR(ctx)) ++ return PTR_ERR(ctx); ++ ++ uf_info = unix_file_inode_data(file->f_dentry->d_inode); ++ get_exclusive_access(uf_info); ++ ret = find_file_state(file->f_dentry->d_inode, uf_info); ++ if (ret == 0) { ++ if (uf_info->container == UF_CONTAINER_TAILS) ++ ret = -EINVAL; ++ else ++ ret = do_prepare_write(file, page, from, to); ++ } ++ drop_exclusive_access(uf_info); ++ ++ /* don't commit transaction under inode semaphore */ ++ context_set_commit_async(ctx); ++ reiser4_exit_context(ctx); ++ return ret; ++} ++ ++/* ++ * Local variables: ++ * c-indentation-style: "K&R" ++ * mode-name: "LC" ++ * c-basic-offset: 8 ++ * tab-width: 8 ++ * fill-column: 79 ++ * scroll-step: 1 ++ * End: ++ */ +Index: linux-2.6.16/fs/reiser4/plugin/file/file.h +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/plugin/file/file.h +@@ -0,0 +1,257 @@ ++/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++/* this file contains declarations of methods implementing file plugins ++ (UNIX_FILE_PLUGIN_ID, SYMLINK_FILE_PLUGIN_ID and CRC_FILE_PLUGIN_ID) */ ++ ++#if !defined( __REISER4_FILE_H__ ) ++#define __REISER4_FILE_H__ ++ ++/* declarations of functions implementing UNIX_FILE_PLUGIN_ID file plugin */ ++ ++/* inode operations */ ++int setattr_unix_file(struct dentry *, struct iattr *); ++ ++/* file operations */ ++ssize_t read_unix_file(struct file *, char __user *buf, size_t read_amount, ++ loff_t *off); ++ssize_t write_unix_file(struct file *, const char __user *buf, size_t write_amount, ++ loff_t * off); ++int ioctl_unix_file(struct inode *, struct file *, unsigned int cmd, ++ unsigned long arg); ++int mmap_unix_file(struct file *, struct vm_area_struct *); ++int open_unix_file(struct inode *, struct file *); ++int release_unix_file(struct inode *, struct file *); ++int sync_unix_file(struct file *, struct dentry *, int datasync); ++ssize_t sendfile_unix_file(struct file *, loff_t *ppos, size_t count, ++ read_actor_t, void *target); ++ ++/* address space operations */ ++int readpage_unix_file(struct file *, struct page *); ++int readpage_unix_file_nolock(struct file *, struct page *); ++int writepages_unix_file(struct address_space *, struct writeback_control *); ++int prepare_write_unix_file(struct file *, struct page *, unsigned from, ++ unsigned to); ++int commit_write_unix_file(struct file *, struct page *, unsigned from, ++ unsigned to); ++sector_t bmap_unix_file(struct address_space *, sector_t lblock); ++ ++/* file plugin operations */ ++int flow_by_inode_unix_file(struct inode *, const char __user *buf, ++ int user, loff_t, loff_t, rw_op, flow_t *); ++int owns_item_unix_file(const struct inode *, const coord_t *); ++void init_inode_data_unix_file(struct 
inode *, reiser4_object_create_data *,
++			       int create);
++int delete_object_unix_file(struct inode *);
++
++/*
++ * all the write into unix file is performed by item write method. Write method
++ * of unix file plugin only decides which item plugin (extent or tail) and in
++ * which mode (one from the enum below) to call
++ */
++typedef enum {
++	FIRST_ITEM = 1,
++	APPEND_ITEM = 2,
++	OVERWRITE_ITEM = 3
++} write_mode_t;
++
++/* unix file may be in one of the following states */
++typedef enum {
++	UF_CONTAINER_UNKNOWN = 0,
++	UF_CONTAINER_TAILS = 1,
++	UF_CONTAINER_EXTENTS = 2,
++	UF_CONTAINER_EMPTY = 3
++} file_container_t;
++
++struct formatting_plugin;
++struct inode;
++
++/* unix file plugin specific part of reiser4 inode */
++typedef struct unix_file_info {
++	/*
++	 * this read-write lock protects file containerization change. Accesses
++	 * which do not change file containerization (see file_container_t)
++	 * (read, readpage, writepage, write (until tail conversion is
++	 * involved)) take read-lock. Accesses which modify file
++	 * containerization (truncate, conversion from tail to extent and back)
++	 * take write-lock.
++	 */
++	struct rw_semaphore latch;
++	/*
++	 * this semaphore is used to serialize writes instead of inode->i_mutex,
++	 * because write_unix_file uses get_user_pages which is to be used
++	 * under mm->mmap_sem and because it is required to take mm->mmap_sem
++	 * before inode->i_mutex, so inode->i_mutex would have to be unlocked
++	 * before calling to get_user_pages which is unacceptable
++	 */
++	struct semaphore write;
++	/* this enum specifies which items are used to build the file */
++	file_container_t container;
++	/*
++	 * plugin which controls when file is to be converted to extents and
++	 * back to tail
++	 */
++	struct formatting_plugin *tplug;
++	/* if this is set, file is in exclusive use */
++	int exclusive_use;
++#if REISER4_DEBUG
++	/* pointer to task struct of thread owning exclusive access to file */
++	void *ea_owner;
++	atomic_t nr_neas;
++	void *last_reader;
++#endif
++} unix_file_info_t;
++
++struct unix_file_info *unix_file_inode_data(const struct inode *inode);
++void get_exclusive_access(unix_file_info_t *);
++void drop_exclusive_access(unix_file_info_t *);
++void get_nonexclusive_access(unix_file_info_t *);
++void drop_nonexclusive_access(unix_file_info_t *);
++int try_to_get_nonexclusive_access(unix_file_info_t *);
++int find_file_item(hint_t *, const reiser4_key *, znode_lock_mode,
++		   struct inode *);
++int find_file_item_nohint(coord_t *, lock_handle *,
++			  const reiser4_key *, znode_lock_mode,
++			  struct inode *);
++
++void validate_extended_coord(uf_coord_t *, loff_t offset);
++int load_file_hint(struct file *, hint_t *);
++void save_file_hint(struct file *, const hint_t *);
++
++
++#include "../item/extent.h"
++#include "../item/tail.h"
++#include "../item/ctail.h"
++
++struct uf_coord {
++	coord_t coord;
++	lock_handle *lh;
++	int valid;
++	union {
++		extent_coord_extension_t extent;
++		tail_coord_extension_t tail;
++		ctail_coord_extension_t ctail;
++	} extension;
++};
++
++#include "../../forward.h"
++#include "../../seal.h"
++#include "../../lock.h"
++
++/*
++ * This structure is used to speed up file operations (reads and writes). A
++ * hint is a suggestion about where a key resolved to last time. A seal
++ * indicates whether a node has been modified since a hint was last recorded.
++ * You check the seal, and if the seal is still valid, you can use the hint
++ * without traversing the tree again.
++ */
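A minimal sketch of the seal-then-hint pattern described above, using the hint_validate()/set_hint()/unset_hint() declarations that follow; it is not from the patch, the zero-on-success convention for hint_validate() is assumed, and lookup_by_key() is a hypothetical stand-in for the full tree traversal:

/* hypothetical stand-in for the real top-of-tree lookup */
extern int lookup_by_key(const reiser4_key *key, coord_t *coord);

static int find_with_hint(hint_t *hint, const reiser4_key *key, coord_t *coord)
{
        /* seal still valid: reuse the remembered position, no tree walk */
        if (hint_validate(hint, key, 1 /* check key */, ZNODE_READ_LOCK) == 0) {
                *coord = hint->ext_coord.coord;
                return 0;
        }
        /* seal broken: fall back to a full lookup, then re-arm the
           hint so the next call can try the fast path again */
        unset_hint(hint);
        if (lookup_by_key(key, coord) != 0)
                return -ENOENT;
        set_hint(hint, key, ZNODE_READ_LOCK);
        return 0;
}
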
++struct hint {
++	seal_t seal;		/* a seal over last file item accessed */
++	uf_coord_t ext_coord;
++	loff_t offset;
++	znode_lock_mode mode;
++	lock_handle lh;
++};
++
++void set_hint(hint_t *, const reiser4_key *, znode_lock_mode);
++int hint_is_set(const hint_t *);
++void unset_hint(hint_t *);
++int hint_validate(hint_t *, const reiser4_key *, int check_key,
++		  znode_lock_mode);
++void hint_init_zero(hint_t *);
++
++int update_file_size(struct inode *, reiser4_key *, int update_sd);
++int cut_file_items(struct inode *, loff_t new_size, int update_sd,
++		   loff_t cur_size, int (*update_actor) (struct inode *,
++							 reiser4_key *, int));
++
++
++#if REISER4_DEBUG
++
++/* returns 1 if exclusive access is obtained, 0 otherwise */
++static inline int ea_obtained(unix_file_info_t * uf_info)
++{
++	int ret;
++
++	ret = down_read_trylock(&uf_info->latch);
++	if (ret)
++		up_read(&uf_info->latch);
++	return !ret;
++}
++
++#endif
++
++/* declarations of functions implementing SYMLINK_FILE_PLUGIN_ID file plugin */
++int create_symlink(struct inode *symlink, struct inode *dir,
++		   reiser4_object_create_data *);
++void destroy_inode_symlink(struct inode *);
++
++/* declarations of functions implementing CRC_FILE_PLUGIN_ID file plugin */
++
++/* inode operations */
++int setattr_cryptcompress(struct dentry *, struct iattr *);
++
++/* file operations */
++ssize_t read_cryptcompress(struct file *, char __user *buf, size_t read_amount,
++			   loff_t * off);
++ssize_t write_cryptcompress(struct file *, const char __user *buf, size_t write_amount,
++			    loff_t * off);
++int mmap_cryptcompress(struct file *, struct vm_area_struct *);
++ssize_t sendfile_cryptcompress(struct file *file, loff_t *ppos, size_t count,
++			       read_actor_t actor, void *target);
++int release_cryptcompress(struct inode *, struct file *);
++
++/* address space operations */
++extern int readpage_cryptcompress(struct file *, struct page *);
++extern int writepages_cryptcompress(struct address_space *,
++				    struct writeback_control *);
++
++
++/* file plugin operations */
++int flow_by_inode_cryptcompress(struct inode *, const char __user *buf,
++				int user, loff_t, loff_t, rw_op, flow_t *);
++int key_by_inode_cryptcompress(struct inode *, loff_t off, reiser4_key *);
++int create_cryptcompress(struct inode *, struct inode *,
++			 reiser4_object_create_data *);
++int delete_cryptcompress(struct inode *);
++void init_inode_data_cryptcompress(struct inode *, reiser4_object_create_data *,
++				   int create);
++int cut_tree_worker_cryptcompress(tap_t *, const reiser4_key * from_key,
++				  const reiser4_key * to_key,
++				  reiser4_key * smallest_removed,
++				  struct inode *object, int truncate,
++				  int *progress);
++void destroy_inode_cryptcompress(struct inode *);
++
++extern reiser4_plugin_ops cryptcompress_plugin_ops;
++
++#define WRITE_GRANULARITY 32
++
++
++int tail2extent(unix_file_info_t *);
++int extent2tail(unix_file_info_t *);
++
++int goto_right_neighbor(coord_t *, lock_handle *);
++int find_or_create_extent(struct page *);
++int equal_to_ldk(znode *, const reiser4_key *);
++
++
++extern inline int cbk_errored(int cbk_result)
++{
++	return (cbk_result != CBK_COORD_NOTFOUND
++		&& cbk_result != CBK_COORD_FOUND);
++}
++
++/* __REISER4_FILE_H__ */
++#endif
++
++/*
++ * Local variables:
++ * c-indentation-style: "K&R"
++ * mode-name: "LC"
++ * c-basic-offset: 8
++ * tab-width: 8
++ * fill-column: 79
++ * scroll-step: 1
++ * End:
++*/
+Index: linux-2.6.16/fs/reiser4/plugin/file/invert.c
+===================================================================
+--- /dev/null
++++ linux-2.6.16/fs/reiser4/plugin/file/invert.c
+@@ -0,0 +1,493 @@
++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
++
++/* Suppose you want to conveniently read and write a large variety of small files
++   within a single emacs buffer, without having a separate buffer for each 8 byte
++   or so file.  Inverts are the way to do that.  An invert provides you with the
++   contents of a set of subfiles plus its own contents.  It is a file which
++   inherits other files when you read it, and allows you to write to it and
++   through it to the files that it inherits from.  In order for it to know which
++   subfiles each part of your write should go into, there must be delimiters
++   indicating that.  It tries to make that easy for you by providing those
++   delimiters in what you read from it.
++
++   When you read it, an invert performs an inverted assignment.  Instead of
++   taking an assignment command and writing a bunch of files, it takes a bunch
++   of files and composes an assignment command for you to read from it that, if
++   executed, would create those files.  But which files?  Well, that must be
++   specified in the body of the invert using a special syntax, and that
++   specification is called the invert of the assignment.
++
++   When written to, an invert performs the assignment command that is written
++   to it, and modifies its own body to contain the invert of that
++   assignment.
++
++   In other words, writing to an invert file what you have read from it
++   is the identity operation.
++
++   Malformed assignments cause write errors.  Partial writes are not
++   supported in v4.0, but will be.
++
++   Example:
++
++   If an invert contains:
++
++   /filenameA/<>+"(some text stored in the invert)+/filenameB/<>
++
++======================
++Each element in this definition should be an invert, and all files
++should be called recursively, too.  This is bad.  If one of the
++included files is not a regular or invert file, then we can't read
++the main file.
++
++I think it is possible to make it easier:
++
++the internal structure of an invert file should be like a symlink file.  But
++the read and write methods should be explicitly indicated in the i/o operation..
++
++By default we read and write (if possible) as a symlink, and if we
++specify ..invert at read time, we can specify it at write time too.
++
++example:
++/my_invert_file/..invert<- ( (/filenameA<-"(The contents of filenameA))+"(some text stored in the invert)+(/filenameB<-"(The contents of filenameB) ) )
++will create /my_invert_file as an invert, and will create /filenameA and /filenameB with the specified bodies.
++
++a read of /my_invert_file/..invert will be
++/filenameA<-"(The contents of filenameA)+"(some text stored in the invert)+/filenameB<-"(The contents of filenameB)
++
++but a read of /my_invert_file/ will be
++The contents of filenameAsome text stored in the invertThe contents of filenameB
++
++we can also create this file as
++/my_invert_file/<-/filenameA+"(some text stored in the invert)+/filenameB
++which will create /my_invert_file, and use the existing files /filenameA and /filenameB.
++
++and when we read it, it will behave as the invert file described previously.
++
++Is this correct?
++
++ vv
++DEMIDOV-FIXME-HANS:
++
++Maybe you are right, but then you must disable writes to /my_invert_file/ and only allow writes to /my_invert_file/..invert
++
++Do you agree?  Discuss it on reiserfs-list....
++
++-Hans
++=======================
++
++   Then a read will return:
++
++   /filenameA<-"(The contents of filenameA)+"(some text stored in the invert)+/filenameB<-"(The contents of filenameB)
++
++   and a write of the line above to the invert will set the contents of
++   the invert and filenameA and filenameB to their original values.
++
++   Note that the contents of an invert have no influence on the effect
++   of a write unless the write is a partial write (and a write of a
++   shorter file without using truncate first is a partial write).
++
++   truncate() has no effect on filenameA and filenameB, it merely
++   resets the value of the invert.
++
++   Writes to subfiles via the invert are implemented by preceding them
++   with truncates.
++
++   Parse failures cause write failures.
++
++   Questions to ponder: should the invert be acted on prior to file
++   close when writing to an open filedescriptor?
++
++   Example:
++
++   If an invert contains:
++
++   "(This text and a pair of quotes are all that is here.)
++
++Then a read will return:
++
++   "(This text and a pair of quotes are all that is here.)
++
++*/
++
++/* OPEN method places a struct file in memory associated with the invert body
++   and returns something like a file descriptor to the user for future access
++   to the invert file.
++   During opening we parse the body of the invert and get a list of the 'entries'
++   (that describe all its subfiles) and place a pointer to the first struct in
++   the reiserfs-specific part of the invert inode (arbitrary decision).
++
++   Each subfile is described by the struct inv_entry that has a pointer @sd to
++   an in-core based stat-data and a pointer to struct file @f (if we find that the
++   subfile uses more than one unformatted node (arbitrary decision), we load
++   struct file in memory, otherwise we load the base stat-data (and maybe 1-2 bytes
++   of some other information we need)
++
++   Since READ and WRITE methods for inverts were formulated in assignment
++   language, they don't contain arguments 'size' and 'offset' that make sense
++   only in ordinary read/write methods.
++
++   READ method is a combination of two methods:
++   1) ordinary read method (with offset=0, length = @f->...->i_size) for entries
++   with @f != 0, this method uses a pointer to struct file as an argument
++   2) read method for inode-less files with @sd != 0, this method uses the
++   in-core based stat-data instead of struct file as an argument.
++   in the first case we don't use the pagecache, we just copy the data that we
++   got after cbk() into userspace.
++
++   WRITE method for invert files is more complex.
++   Besides the WRITE interface declared in assignment language above, we need
++   the ability to edit the unwrapped body of an invert file with a
++   text editor; this means we need a GENERIC WRITE METHOD for the invert file:
++
++   my_invert_file/..invert <- "string"
++
++   this method parses "string" and looks for correct subfile signatures; the
++   parsing process also splits this "string" into a set of flows in accordance
++   with the set of subfiles specified by this signature.
++   The found list of signatures #S is compared with the opened one #I of the
++   invert file.  If it doesn't have this one (#I==0; this will be so, for
++   instance, if we have just created this invert file) the write method assigns
++   the found signature (#I=#S;) to the invert file.  Then, if #I==#S, the
++   generic write method splits itself into write methods for ordinary or
++   light-weight files, or calls itself recursively for invert files with the
++   corresponding flows.
++   I am not sure, but the list of signatures looks like what Mr. Demidov means
++   by 'delimiters'.
++
++   The cases where #S<#I (#I<#S) (in the sense of set theory) are also available
++   and cause deletion (creation) of subfiles (arbitrary decision - it may look
++   too complex, but this interface will be the most complete).  The order of
++   entries of list #S (#I) and the inherited order on #I (#S) must coincide.
++   Any other parsing result gives a malformed signature that aborts the READ
++   method and releases all resources.
++
++   Format of subfile (entry) signature:
++
++   "START_MAGIC"<>(TYPE="...",LOOKUP_ARG="...")SUBFILE_BODY"END_MAGIC"
++
++   Legend:
++
++   START_MAGIC - the keyword that indicates the start of a subfile signature;
++
++   <> indicates the start of 'subfile metadata', that is the pair
++   (TYPE="...",LOOKUP_ARG="...") in parentheses separated by a comma.
++
++   TYPE - the string "type" indicates the start of one of the three words:
++   - ORDINARY_FILE,
++   - LIGHT_WEIGHT_FILE,
++   - INVERT_FILE;
++
++   LOOKUP_ARG - lookup argument depends on the previous type:
++   */
++
++	/************************************************************/
++	/*       TYPE          *          LOOKUP ARGUMENT           */
++	/************************************************************/
++	/*  LIGHT_WEIGHT_FILE  *          stat-data key             */
++	/************************************************************/
++	/*   ORDINARY_FILE     *            filename                */
++	/************************************************************/
++	/*    INVERT_FILE      *            filename                */
++	/************************************************************/
++
++	/* where:
++	   *stat-data key - the string contains the stat data key of this subfile;
++	   it will be passed to the fast-access lookup method for light-weight files;
++	   *filename - pathname of this subfile; it will be passed to the VFS lookup
++	   methods for ordinary and invert files;
++
++	   SUBFILE_BODY - data of this subfile (it will go to the flow)
++	   END_MAGIC - the keyword indicates the end of the subfile signature.
++
++	   The other symbols inside the signature are interpreted as 'unformatted
++	   content', which is available with VFS's read_link() (arbitrary decision).
++
++	   NOTE: the parse method for the body of an invert file uses the mentioned
++	   signatures _without_ subfile bodies.
++
++	   Now the only unclear thing is a WRITE to a regular light-weight subfile A,
++	   which we can describe only in assignment language:
++
++	   A <- "some_string"
++
++	   I guess we don't want to change the stat-data and body items of file A
++	   if this file exists and size(A) != size("some_string"), because this
++	   operation is expensive; so we only do a partial write if
++	   size(A) > size("some_string"), and do a truncate of "some_string" and
++	   then A <- "truncated string" if size(A) < size("some_string").
++	   This decision is also arbitrary.
++	 */
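To make the signature layout above concrete, here is a hedged sketch of stepping over one subfile record in a flow, using the subfile_header accessors defined in the infrastructure section below; the flat layout (header, then lookup argument, then body) is an assumption, since the real parser was never written:

/* Walk one record: a subfile_header, then sh_arg_len bytes of lookup
   argument, then sh_body_len bytes of body.  Returns the start of the
   next record, or NULL on a malformed signature. */
static const char *next_subfile(const char *p, unsigned *type,
                                const char **arg, unsigned *arg_len,
                                const char **body, unsigned *body_len)
{
        subfile_header *sh = (subfile_header *) p;

        if (sh_get_magic(sh) != SUBFILE_HEADER_MAGIC)
                return NULL;    /* malformed signature aborts the parse */
        *type = sh_get_type(sh);
        *arg_len = sh_get_arg_len(sh);
        *body_len = sh_get_body_len(sh);
        *arg = p + sizeof(subfile_header);
        *body = *arg + *arg_len;
        return *body + *body_len;       /* start of the next record */
}
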
++
++/* here is infrastructure for formatted flows */
++
++#define SUBFILE_HEADER_MAGIC 0x19196605
++#define FLOW_HEADER_MAGIC 0x01194304
++
++#include "../plugin.h"
++#include "../../debug.h"
++#include "../../forward.h"
++#include "../object.h"
++#include "../item/item.h"
++#include "../item/static_stat.h"
++#include "../../dformat.h"
++#include "../znode.h"
++#include "../inode.h"
++
++#include
++#include <linux/fs.h>		/* for struct file */
++#include <linux/list.h>		/* for struct list_head */
++
++typedef enum {
++	LIGHT_WEIGHT_FILE,
++	ORDINARY_FILE,
++	INVERT_FILE
++} inv_entry_type;
++
++typedef struct flow_header {
++	d32 fl_magic;
++	d16 fl_nr;		/* number of subfiles in the flow */
++} flow_header;
++
++typedef struct subfile_header {
++	d32 sh_magic;		/* subfile magic */
++	d16 sh_type;		/* type of subfile: light-weight, ordinary, invert */
++	d16 sh_arg_len;		/* length of lookup argument (filename, key) */
++	d32 sh_body_len;	/* length of subfile body */
++} subfile_header;
++
++/* functions to get/set fields of flow header */
++
++static void fl_set_magic(flow_header * fh, __u32 value)
++{
++	cputod32(value, &fh->fl_magic);
++}
++
++static __u32 fl_get_magic(flow_header * fh)
++{
++	return d32tocpu(&fh->fl_magic);
++}
++static void fl_set_number(flow_header * fh, __u16 value)
++{
++	cputod16(value, &fh->fl_nr);
++}
++static unsigned fl_get_number(flow_header * fh)
++{
++	return d16tocpu(&fh->fl_nr);
++}
++
++/* functions to get/set fields of subfile header */
++
++static void sh_set_magic(subfile_header * sh, __u32 value)
++{
++	cputod32(value, &sh->sh_magic);
++}
++
++static __u32 sh_get_magic(subfile_header * sh)
++{
++	return d32tocpu(&sh->sh_magic);
++}
++static void sh_set_type(subfile_header * sh, __u16 value)
++{
++	cputod16(value, &sh->sh_type);
++}
++static unsigned sh_get_type(subfile_header * sh)
++{
++	return d16tocpu(&sh->sh_type);
++}
++static void sh_set_arg_len(subfile_header * sh, __u16 value)
++{
++	cputod16(value, &sh->sh_arg_len);
++}
++static unsigned sh_get_arg_len(subfile_header * sh)
++{
++	return d16tocpu(&sh->sh_arg_len);
++}
++static void sh_set_body_len(subfile_header * sh, __u32 value)
++{
++	cputod32(value, &sh->sh_body_len);
++}
++
++static __u32 sh_get_body_len(subfile_header * sh)
++{
++	return d32tocpu(&sh->sh_body_len);
++}
++
++/* in-core minimal stat-data, light-weight analog of inode */
++
++struct incore_sd_base {
++	umode_t isd_mode;
++	nlink_t isd_nlink;
++	loff_t isd_size;
++	char *isd_data;		/* 'subflow' to write */
++};
++
++/* open of an invert creates a list of invert entries;
++   every entry is represented by structure inv_entry */
++
++struct inv_entry {
++	struct list_head ie_list;
++	struct file *ie_file;	/* this is NULL if the file doesn't
++				   have unformatted nodes */
++	struct incore_sd_base *ie_sd;	/* inode-less analog of struct file */
++};
++
++/* allocate and init invert entry */
++
++static struct inv_entry *allocate_inv_entry(void)
++{
++	struct inv_entry *inv_entry;
++
++	inv_entry = reiser4_kmalloc(sizeof(struct inv_entry), GFP_KERNEL);
++	if (!inv_entry)
++		return ERR_PTR(RETERR(-ENOMEM));
++	inv_entry->ie_file = NULL;
++	inv_entry->ie_sd = NULL;
++	INIT_LIST_HEAD(&inv_entry->ie_list);
++	return inv_entry;
++}
++
++static int put_inv_entry(struct inv_entry *ientry)
++{
++	int result = 0;
++
++	assert("edward-96", ientry != NULL);
++
++	list_del(&ientry->ie_list);
++	if (ientry->ie_file != NULL)
++		result = filp_close(ientry->ie_file, NULL);
++	if (ientry->ie_sd != NULL)
++		kfree(ientry->ie_sd);
++	kfree(ientry);
++	return result;
++}
++
++static int allocate_incore_sd_base(struct inv_entry *inv_entry)
++{
++	struct incore_sd_base *isd_base;
++
++	assert("edward-98", inv_entry != NULL);
++	assert("edward-99", inv_entry->ie_file == NULL);
++	assert("edward-100", inv_entry->ie_sd == NULL);
++
++	isd_base = reiser4_kmalloc(sizeof(struct incore_sd_base), GFP_KERNEL);
++	if (!isd_base)
++		return RETERR(-ENOMEM);
++	inv_entry->ie_sd = isd_base;
++	return 0;
++}
++
++/* this can be installed as ->init_inv_entry () method of
++   item_plugins[ STATIC_STAT_DATA_IT ] (fs/reiser4/plugin/item/item.c).
++   Copies data from on-disk stat-data format into light-weight analog of inode.
++   Doesn't handle stat-data extensions. */
++
++static void sd_base_load(struct inv_entry *inv_entry, char *sd)
++{
++	reiser4_stat_data_base *sd_base;
++
++	assert("edward-101", inv_entry != NULL);
++	assert("edward-101", inv_entry->ie_sd != NULL);
++	assert("edward-102", sd != NULL);
++
++	sd_base = (reiser4_stat_data_base *) sd;
++	inv_entry->ie_sd->isd_mode = d16tocpu(&sd_base->mode);
++	inv_entry->ie_sd->isd_nlink = d32tocpu(&sd_base->nlink);
++	inv_entry->ie_sd->isd_size = d64tocpu(&sd_base->size);
++	inv_entry->ie_sd->isd_data = NULL;
++}
++
++/* initialise incore stat-data */
++
++static void init_incore_sd_base(struct inv_entry *inv_entry, coord_t * coord)
++{
++	reiser4_plugin *plugin = item_plugin_by_coord(coord);
++	void *body = item_body_by_coord(coord);
++
++	assert("edward-103", inv_entry != NULL);
++	assert("edward-104", plugin != NULL);
++	assert("edward-105", body != NULL);
++
++	sd_base_load(inv_entry, body);
++}
++
++/* takes a key or filename and allocates a new invert_entry,
++   inits and adds it into the list;
++   we use lookup_sd_by_key() for light-weight files and VFS lookup by filename */
++
++int get_inv_entry(struct inode *invert_inode,	/* inode of invert's body */
++		  inv_entry_type type,	/* LIGHT-WEIGHT or ORDINARY */
++		  const reiser4_key * key,	/* key of invert entry stat-data */
++		  char *filename,	/* filename of the file to be opened */
++		  int flags, int mode)
++{
++	int result;
++	struct inv_entry *ientry;
++
++	assert("edward-107", invert_inode != NULL);
++
++	ientry = allocate_inv_entry();
++	if (IS_ERR(ientry))
++		return (PTR_ERR(ientry));
++
++	if (type == LIGHT_WEIGHT_FILE) {
++		coord_t coord;
++		lock_handle lh;
++
++		assert("edward-108", key != NULL);
++
++		init_coord(&coord);
++		init_lh(&lh);
++		result =
++		    lookup_sd_by_key(tree_by_inode(invert_inode),
++				     ZNODE_READ_LOCK, &coord, &lh, key);
++		if (result == 0)
++			init_incore_sd_base(ientry, &coord);
++
++		done_lh(&lh);
++		done_coord(&coord);
++		return (result);
++	} else {
++		struct file *file = filp_open(filename, flags, mode);
++		/* FIXME_EDWARD here we need to check if we
++		   didn't follow any mount point */
++
++		assert("edward-108", filename != NULL);
++
++		if (IS_ERR(file))
++			return (PTR_ERR(file));
++		ientry->ie_file = file;
++		return 0;
++	}
++}
++
++/* takes the inode of an invert, reads the body of this invert, parses it,
++   opens all invert entries and returns a pointer to the first inv_entry */
++
++struct inv_entry *open_invert(struct file *invert_file)
++{
++	return NULL;		/* not yet implemented */
++}
++
++ssize_t subfile_read(struct inv_entry *invert_entry, flow_t * f)
++{
++	return 0;		/* not yet implemented */
++}
++
++ssize_t subfile_write(struct inv_entry *invert_entry, flow_t * f)
++{
++	return 0;		/* not yet implemented */
++}
++
++ssize_t invert_read(struct file *file, flow_t * f)
++{
++	return 0;		/* not yet implemented */
++}
++
++ssize_t invert_write(struct file *file, flow_t * f)
++{
++	return 0;		/* not yet implemented */
++}
++
++/* Make Linus happy.
++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ scroll-step: 1 ++ End: ++*/ +Index: linux-2.6.16/fs/reiser4/plugin/file/symfile.c +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/plugin/file/symfile.c +@@ -0,0 +1,87 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++/* Symfiles are a generalization of Unix symlinks. ++ ++ A symfile when read behaves as though you took its contents and ++ substituted them into the reiser4 naming system as the right hand side ++ of an assignment, and then read that which you had assigned to it. ++ ++ A key issue for symfiles is how to implement writes through to ++ subfiles. In general, one must have some method of determining what ++ of that which is written to the symfile is written to what subfile. ++ This can be done by use of custom plugin methods written by users, or ++ by using a few general methods we provide for those willing to endure ++ the insertion of delimiters into what is read. ++ ++ Writing to symfiles without delimiters to denote what is written to ++ what subfile is not supported by any plugins we provide in this ++ release. Our most sophisticated support for writes is that embodied ++ by the invert plugin (see invert.c). ++ ++ A read only version of the /etc/passwd file might be ++ constructed as a symfile whose contents are as follows: ++ ++ /etc/passwd/userlines/* ++ ++ or ++ ++ /etc/passwd/userlines/demidov+/etc/passwd/userlines/edward+/etc/passwd/userlines/reiser+/etc/passwd/userlines/root ++ ++ or ++ ++ /etc/passwd/userlines/(demidov+edward+reiser+root) ++ ++ A symfile with contents ++ ++ /filenameA+"(some text stored in the uninvertable symfile)+/filenameB ++ ++ will return when read ++ ++ The contents of filenameAsome text stored in the uninvertable symfileThe contents of filenameB ++ ++ and write of what has been read will not be possible to implement as ++ an identity operation because there are no delimiters denoting the ++ boundaries of what is to be written to what subfile. ++ ++ Note that one could make this a read/write symfile if one specified ++ delimiters, and the write method understood those delimiters delimited ++ what was written to subfiles. ++ ++ So, specifying the symfile in a manner that allows writes: ++ ++ /etc/passwd/userlines/demidov+"( ++ )+/etc/passwd/userlines/edward+"( ++ )+/etc/passwd/userlines/reiser+"( ++ )+/etc/passwd/userlines/root+"( ++ ) ++ ++ or ++ ++ /etc/passwd/userlines/(demidov+"( ++ )+edward+"( ++ )+reiser+"( ++ )+root+"( ++ )) ++ ++ and the file demidov might be specified as: ++ ++ /etc/passwd/userlines/demidov/username+"(:)+/etc/passwd/userlines/demidov/password+"(:)+/etc/passwd/userlines/demidov/userid+"(:)+/etc/passwd/userlines/demidov/groupid+"(:)+/etc/passwd/userlines/demidov/gecos+"(:)+/etc/passwd/userlines/demidov/home+"(:)+/etc/passwd/userlines/demidov/shell ++ ++ or ++ ++ /etc/passwd/userlines/demidov/(username+"(:)+password+"(:)+userid+"(:)+groupid+"(:)+gecos+"(:)+home+"(:)+shell) ++ ++ Notice that if the file demidov has a carriage return in it, the ++ parsing fails, but then if you put carriage returns in the wrong place ++ in a normal /etc/passwd file it breaks things also. ++ ++ Note that it is forbidden to have no text between two interpolations ++ if one wants to be able to define what parts of a write go to what ++ subfiles referenced in an interpolation. 
++ ++ If one wants to be able to add new lines by writing to the file, one ++ must either write a custom plugin for /etc/passwd that knows how to ++ name an added line, or one must use an invert, or one must use a more ++ sophisticated symfile syntax that we are not planning to write for ++ version 4.0. ++*/ +Index: linux-2.6.16/fs/reiser4/plugin/file/symlink.c +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/plugin/file/symlink.c +@@ -0,0 +1,92 @@ ++/* Copyright 2002, 2003, 2005 by Hans Reiser, licensing governed by reiser4/README */ ++ ++#include "../../inode.h" ++ ++#include ++#include ++ ++/* file plugin methods specific for symlink files ++ (SYMLINK_FILE_PLUGIN_ID) */ ++ ++/* this is implementation of create_object method of file plugin for ++ SYMLINK_FILE_PLUGIN_ID ++ */ ++ ++/** ++ * create_symlink - create_object of file plugin for SYMLINK_FILE_PLUGIN_ID ++ * @symlink: inode of symlink object ++ * @dir: inode of parent directory ++ * @info: parameters of new object ++ * ++ * Inserts stat data with symlink extension where into the tree. ++ */ ++int create_symlink(struct inode *symlink, ++ struct inode *dir UNUSED_ARG, ++ reiser4_object_create_data *data /* info passed to us, ++ * this is filled by ++ * reiser4() syscall ++ * in particular */ ) ++{ ++ int result; ++ ++ assert("nikita-680", symlink != NULL); ++ assert("nikita-681", S_ISLNK(symlink->i_mode)); ++ assert("nikita-685", inode_get_flag(symlink, REISER4_NO_SD)); ++ assert("nikita-682", dir != NULL); ++ assert("nikita-684", data != NULL); ++ assert("nikita-686", data->id == SYMLINK_FILE_PLUGIN_ID); ++ ++ /* ++ * stat data of symlink has symlink extension in which we store ++ * symlink content, that is, path symlink is pointing to. 
++ */ ++ reiser4_inode_data(symlink)->extmask |= (1 << SYMLINK_STAT); ++ ++ assert("vs-838", symlink->u.generic_ip == NULL); ++ symlink->u.generic_ip = (void *)data->name; ++ ++ assert("vs-843", symlink->i_size == 0); ++ INODE_SET_FIELD(symlink, i_size, strlen(data->name)); ++ ++ /* insert stat data appended with data->name */ ++ result = inode_file_plugin(symlink)->write_sd_by_inode(symlink); ++ if (result) { ++ /* FIXME-VS: Make sure that symlink->u.generic_ip is not attached ++ to kmalloced data */ ++ INODE_SET_FIELD(symlink, i_size, 0); ++ } else { ++ assert("vs-849", symlink->u.generic_ip ++ && inode_get_flag(symlink, REISER4_GENERIC_PTR_USED)); ++ assert("vs-850", ++ !memcmp((char *)symlink->u.generic_ip, data->name, ++ (size_t) symlink->i_size + 1)); ++ } ++ return result; ++} ++ ++/* this is implementation of destroy_inode method of file plugin for ++ SYMLINK_FILE_PLUGIN_ID ++ */ ++void destroy_inode_symlink(struct inode *inode) ++{ ++ assert("edward-799", ++ inode_file_plugin(inode) == ++ file_plugin_by_id(SYMLINK_FILE_PLUGIN_ID)); ++ assert("edward-800", !is_bad_inode(inode) && is_inode_loaded(inode)); ++ assert("edward-801", inode_get_flag(inode, REISER4_GENERIC_PTR_USED)); ++ assert("vs-839", S_ISLNK(inode->i_mode)); ++ ++ kfree(inode->u.generic_ip); ++ inode->u.generic_ip = NULL; ++ inode_clr_flag(inode, REISER4_GENERIC_PTR_USED); ++} ++ ++/* Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ scroll-step: 1 ++ End: ++*/ +Index: linux-2.6.16/fs/reiser4/plugin/file/tail_conversion.c +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/plugin/file/tail_conversion.c +@@ -0,0 +1,728 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++#include "../../inode.h" ++#include "../../super.h" ++#include "../../page_cache.h" ++#include "../../carry.h" ++#include "../../safe_link.h" ++#include "../../vfs_ops.h" ++ ++#include ++ ++/* this file contains: ++ tail2extent and extent2tail */ ++ ++/* exclusive access to a file is acquired when file state changes: tail2extent, empty2tail, extent2tail, etc */ ++void get_exclusive_access(unix_file_info_t * uf_info) ++{ ++ assert("nikita-3028", schedulable()); ++ assert("nikita-3047", LOCK_CNT_NIL(inode_sem_w)); ++ assert("nikita-3048", LOCK_CNT_NIL(inode_sem_r)); ++ /* ++ * "deadlock avoidance": sometimes we commit a transaction under ++ * rw-semaphore on a file. Such commit can deadlock with another ++ * thread that captured some block (hence preventing atom from being ++ * committed) and waits on rw-semaphore. 
++ */
++	txn_restart_current();
++	LOCK_CNT_INC(inode_sem_w);
++	down_write(&uf_info->latch);
++	uf_info->exclusive_use = 1;
++	assert("vs-1713", uf_info->ea_owner == NULL);
++	assert("vs-1713", atomic_read(&uf_info->nr_neas) == 0);
++	ON_DEBUG(uf_info->ea_owner = current);
++}
++
++void drop_exclusive_access(unix_file_info_t * uf_info)
++{
++	assert("vs-1714", uf_info->ea_owner == current);
++	assert("vs-1715", atomic_read(&uf_info->nr_neas) == 0);
++	ON_DEBUG(uf_info->ea_owner = NULL);
++	uf_info->exclusive_use = 0;
++	up_write(&uf_info->latch);
++	assert("nikita-3049", LOCK_CNT_NIL(inode_sem_r));
++	assert("nikita-3049", LOCK_CNT_GTZ(inode_sem_w));
++	LOCK_CNT_DEC(inode_sem_w);
++	txn_restart_current();
++}
++
++/**
++ * nea_grabbed - do something when file semaphore is down_read-ed
++ * @uf_info:
++ *
++ * This is called when nonexclusive access is obtained on a file. Everything
++ * it does is for debugging purposes.
++ */
++static void nea_grabbed(unix_file_info_t *uf_info)
++{
++#if REISER4_DEBUG
++	LOCK_CNT_INC(inode_sem_r);
++	assert("vs-1716", uf_info->ea_owner == NULL);
++	atomic_inc(&uf_info->nr_neas);
++	uf_info->last_reader = current;
++#endif
++}
++
++/**
++ * get_nonexclusive_access - get nonexclusive access to a file
++ * @uf_info: unix file specific part of inode to obtain access to
++ *
++ * Nonexclusive access is obtained on a file before read, write, readpage.
++ */
++void get_nonexclusive_access(unix_file_info_t *uf_info)
++{
++	assert("nikita-3029", schedulable());
++	assert("nikita-3361", get_current_context()->trans->atom == NULL);
++
++	down_read(&uf_info->latch);
++	nea_grabbed(uf_info);
++}
++
++/**
++ * try_to_get_nonexclusive_access - try to get nonexclusive access to a file
++ * @uf_info: unix file specific part of inode to obtain access to
++ *
++ * Non-blocking version of nonexclusive access obtaining.
++ */
++int try_to_get_nonexclusive_access(unix_file_info_t *uf_info)
++{
++	int result;
++
++	result = down_read_trylock(&uf_info->latch);
++	if (result)
++		nea_grabbed(uf_info);
++	return result;
++}
++
++void drop_nonexclusive_access(unix_file_info_t * uf_info)
++{
++	assert("vs-1718", uf_info->ea_owner == NULL);
++	assert("vs-1719", atomic_read(&uf_info->nr_neas) > 0);
++	ON_DEBUG(atomic_dec(&uf_info->nr_neas));
++
++	up_read(&uf_info->latch);
++
++	LOCK_CNT_DEC(inode_sem_r);
++	txn_restart_current();
++}
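The access helpers above always pair up the same way. A small illustrative sketch, not from the patch, with placeholder comments standing in for real work:

/* layout-preserving operations (read, write without conversion,
   readpage) take the shared side of the latch */
static void state_preserving_op(unix_file_info_t *uf_info)
{
        get_nonexclusive_access(uf_info);
        /* ... item layout cannot change here ... */
        drop_nonexclusive_access(uf_info);
}

/* layout-changing operations (tail2extent, extent2tail, truncate)
   take the exclusive side; both drop paths restart the current
   transaction so no atom is held captive across the semaphore, per
   the deadlock note above */
static void state_changing_op(unix_file_info_t *uf_info)
{
        get_exclusive_access(uf_info);
        /* ... container may change here ... */
        drop_exclusive_access(uf_info);
}
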
++
++/* part of tail2extent. Cut all items covering @count bytes starting from
++   @offset */
++/* Audited by: green(2002.06.15) */
++static int cut_formatting_items(struct inode *inode, loff_t offset, int count)
++{
++	reiser4_key from, to;
++
++	/* AUDIT: How about putting an assertion here, that would check that
++	   the whole provided range is covered by tail items only? */
++	/* key of first byte in the range to be cut */
++	inode_file_plugin(inode)->key_by_inode(inode, offset, &from);
++
++	/* key of last byte in that range */
++	to = from;
++	set_key_offset(&to, (__u64) (offset + count - 1));
++
++	/* cut everything between those keys */
++	return cut_tree(tree_by_inode(inode), &from, &to, inode, 0);
++}
++
++static void release_all_pages(struct page **pages, unsigned nr_pages)
++{
++	unsigned i;
++
++	for (i = 0; i < nr_pages; i++) {
++		if (pages[i] == NULL) {
++			unsigned j;
++			for (j = i + 1; j < nr_pages; j++)
++				assert("vs-1620", pages[j] == NULL);
++			break;
++		}
++		page_cache_release(pages[i]);
++		pages[i] = NULL;
++	}
++}
++
++/* part of tail2extent. Replace tail items with an extent item. Content of
++   the tail items (@count bytes) being cut has already been copied into pages.
++   extent_writepage method is called to create extents corresponding to
++   those pages */
++static int replace(struct inode *inode, struct page **pages, unsigned nr_pages, int count)
++{
++	int result;
++	unsigned i;
++	STORE_COUNTERS;
++
++	if (nr_pages == 0)
++		return 0;
++
++	assert("vs-596", pages[0]);
++
++	/* cut copied items */
++	result =
++	    cut_formatting_items(inode,
++				 (loff_t) pages[0]->index << PAGE_CACHE_SHIFT,
++				 count);
++	if (result)
++		return result;
++
++	CHECK_COUNTERS;
++
++	/* put into tree replacement for just removed items: extent item, namely */
++	for (i = 0; i < nr_pages; i++) {
++		result = add_to_page_cache_lru(pages[i], inode->i_mapping,
++					       pages[i]->index,
++					       mapping_gfp_mask(inode->
++								i_mapping));
++		if (result)
++			break;
++		unlock_page(pages[i]);
++		result = find_or_create_extent(pages[i]);
++		if (result)
++			break;
++		SetPageUptodate(pages[i]);
++	}
++	return result;
++}
++
++#define TAIL2EXTENT_PAGE_NUM 3	/* number of pages to fill before cutting tail
++				 * items */
++
++static int reserve_tail2extent_iteration(struct inode *inode)
++{
++	reiser4_block_nr unformatted_nodes;
++	reiser4_tree *tree;
++
++	tree = tree_by_inode(inode);
++
++	/* number of unformatted nodes which will be created */
++	unformatted_nodes = TAIL2EXTENT_PAGE_NUM;
++
++	/*
++	 * space required for one iteration of extent->tail conversion:
++	 *
++	 * 1. kill N tail items
++	 *
++	 * 2. insert TAIL2EXTENT_PAGE_NUM unformatted nodes
++	 *
++	 * 3. insert TAIL2EXTENT_PAGE_NUM (worst-case single-block
++	 * extents) extent units.
++	 *
++	 * 4. drilling to the leaf level by coord_by_key()
++	 *
++	 * 5. possible update of stat-data
++	 *
++	 */
++	grab_space_enable();
++	return reiser4_grab_space
++	    (2 * tree->height +
++	     TAIL2EXTENT_PAGE_NUM +
++	     TAIL2EXTENT_PAGE_NUM * estimate_one_insert_into_item(tree) +
++	     1 + estimate_one_insert_item(tree) +
++	     inode_file_plugin(inode)->estimate.update(inode), BA_CAN_COMMIT);
++}
++
++/* clear the stat-data flag indicating that the file is being converted */
++static int complete_conversion(struct inode *inode)
++{
++	int result;
++
++	grab_space_enable();
++	result =
++	    reiser4_grab_space(inode_file_plugin(inode)->estimate.update(inode),
++			       BA_CAN_COMMIT);
++	if (result == 0) {
++		inode_clr_flag(inode, REISER4_PART_MIXED);
++		result = reiser4_update_sd(inode);
++	}
++	if (result)
++		warning("vs-1696", "Failed to clear converting bit of %llu: %i",
++			(unsigned long long)get_inode_oid(inode), result);
++	return 0;
++}
++
++/**
++ * find_start
++ * @inode:
++ * @id:
++ * @offset:
++ *
++ * this is used by tail2extent and extent2tail to detect where a previous
++ * incomplete conversion stopped
++ */
++static int find_start(struct inode *inode, reiser4_plugin_id id, __u64 *offset)
++{
++	int result;
++	lock_handle lh;
++	coord_t coord;
++	unix_file_info_t *ufo;
++	int found;
++	reiser4_key key;
++
++	ufo = unix_file_inode_data(inode);
++	init_lh(&lh);
++	result = 0;
++	found = 0;
++	inode_file_plugin(inode)->key_by_inode(inode, *offset, &key);
++	do {
++		init_lh(&lh);
++		result = find_file_item_nohint(&coord, &lh, &key,
++					       ZNODE_READ_LOCK, inode);
++
++		if (result == CBK_COORD_FOUND) {
++			if (coord.between == AT_UNIT) {
++				/*coord_clear_iplug(&coord); */
++				result = zload(coord.node);
++				if (result == 0) {
++					if (item_id_by_coord(&coord) == id)
++						found = 1;
++					else
++						item_plugin_by_coord(&coord)->s.
++ file.append_key(&coord, ++ &key); ++ zrelse(coord.node); ++ } ++ } else ++ result = RETERR(-ENOENT); ++ } ++ done_lh(&lh); ++ } while (result == 0 && !found); ++ *offset = get_key_offset(&key); ++ return result; ++} ++ ++/** ++ * tail2extent ++ * @uf_info: ++ * ++ * ++ */ ++int tail2extent(unix_file_info_t *uf_info) ++{ ++ int result; ++ reiser4_key key; /* key of next byte to be moved to page */ ++ char *p_data; /* data of page */ ++ unsigned page_off = 0, /* offset within the page where to copy data */ ++ count; /* number of bytes of item which can be ++ * copied to page */ ++ struct page *pages[TAIL2EXTENT_PAGE_NUM]; ++ struct page *page; ++ int done; /* set to 1 when all file is read */ ++ char *item; ++ int i; ++ struct inode *inode; ++ int first_iteration; ++ int bytes; ++ __u64 offset; ++ ++ assert("nikita-3362", ea_obtained(uf_info)); ++ inode = unix_file_info_to_inode(uf_info); ++ assert("nikita-3412", !IS_RDONLY(inode)); ++ assert("vs-1649", uf_info->container != UF_CONTAINER_EXTENTS); ++ assert("", !inode_get_flag(inode, REISER4_PART_IN_CONV)); ++ ++ offset = 0; ++ first_iteration = 1; ++ result = 0; ++ if (inode_get_flag(inode, REISER4_PART_MIXED)) { ++ /* ++ * file is marked on disk as there was a conversion which did ++ * not complete due to either crash or some error. Find which ++ * offset tail conversion stopped at ++ */ ++ result = find_start(inode, FORMATTING_ID, &offset); ++ if (result == -ENOENT) { ++ /* no tail items found, everything is converted */ ++ uf_info->container = UF_CONTAINER_EXTENTS; ++ complete_conversion(inode); ++ return 0; ++ } else if (result != 0) ++ /* some other error */ ++ return result; ++ first_iteration = 0; ++ } ++ ++ inode_set_flag(inode, REISER4_PART_IN_CONV); ++ ++ /* get key of first byte of a file */ ++ inode_file_plugin(inode)->key_by_inode(inode, offset, &key); ++ ++ done = 0; ++ while (done == 0) { ++ memset(pages, 0, sizeof(pages)); ++ result = reserve_tail2extent_iteration(inode); ++ if (result != 0) ++ goto out; ++ if (first_iteration) { ++ inode_set_flag(inode, REISER4_PART_MIXED); ++ reiser4_update_sd(inode); ++ first_iteration = 0; ++ } ++ bytes = 0; ++ for (i = 0; i < sizeof_array(pages) && done == 0; i++) { ++ assert("vs-598", ++ (get_key_offset(&key) & ~PAGE_CACHE_MASK) == 0); ++ page = alloc_page(get_gfp_mask()); ++ if (!page) { ++ result = RETERR(-ENOMEM); ++ goto error; ++ } ++ ++ page->index = ++ (unsigned long)(get_key_offset(&key) >> ++ PAGE_CACHE_SHIFT); ++ /* ++ * usually when one is going to longterm lock znode (as ++ * find_file_item does, for instance) he must not hold ++ * locked pages. However, there is an exception for ++ * case tail2extent. Pages appearing here are not ++ * reachable to everyone else, they are clean, they do ++ * not have jnodes attached so keeping them locked do ++ * not risk deadlock appearance ++ */ ++ assert("vs-983", !PagePrivate(page)); ++ reiser4_invalidate_pages(inode->i_mapping, page->index, ++ 1, 0); ++ ++ for (page_off = 0; page_off < PAGE_CACHE_SIZE;) { ++ coord_t coord; ++ lock_handle lh; ++ ++ /* get next item */ ++ /* FIXME: we might want to readahead here */ ++ init_lh(&lh); ++ result = ++ find_file_item_nohint(&coord, &lh, &key, ++ ZNODE_READ_LOCK, ++ inode); ++ if (result != CBK_COORD_FOUND) { ++ /* ++ * error happened of not items of file ++ * were found ++ */ ++ done_lh(&lh); ++ page_cache_release(page); ++ goto error; ++ } ++ ++ if (coord.between == AFTER_UNIT) { ++ /* ++ * end of file is reached. 
Padd page ++ * with zeros ++ */ ++ done_lh(&lh); ++ done = 1; ++ p_data = kmap_atomic(page, KM_USER0); ++ memset(p_data + page_off, 0, ++ PAGE_CACHE_SIZE - page_off); ++ kunmap_atomic(p_data, KM_USER0); ++ break; ++ } ++ ++ result = zload(coord.node); ++ if (result) { ++ page_cache_release(page); ++ done_lh(&lh); ++ goto error; ++ } ++ assert("vs-856", coord.between == AT_UNIT); ++ item = ((char *)item_body_by_coord(&coord)) + ++ coord.unit_pos; ++ ++ /* how many bytes to copy */ ++ count = ++ item_length_by_coord(&coord) - ++ coord.unit_pos; ++ /* limit length of copy to end of page */ ++ if (count > PAGE_CACHE_SIZE - page_off) ++ count = PAGE_CACHE_SIZE - page_off; ++ ++ /* ++ * copy item (as much as will fit starting from ++ * the beginning of the item) into the page ++ */ ++ p_data = kmap_atomic(page, KM_USER0); ++ memcpy(p_data + page_off, item, count); ++ kunmap_atomic(p_data, KM_USER0); ++ ++ page_off += count; ++ bytes += count; ++ set_key_offset(&key, ++ get_key_offset(&key) + count); ++ ++ zrelse(coord.node); ++ done_lh(&lh); ++ } /* end of loop which fills one page by content of ++ * formatting items */ ++ ++ if (page_off) { ++ /* something was copied into page */ ++ pages[i] = page; ++ } else { ++ page_cache_release(page); ++ assert("vs-1648", done == 1); ++ break; ++ } ++ } /* end of loop through pages of one conversion iteration */ ++ ++ if (i > 0) { ++ result = replace(inode, pages, i, bytes); ++ release_all_pages(pages, sizeof_array(pages)); ++ if (result) ++ goto error; ++ /* ++ * we have to drop exclusive access to avoid deadlock ++ * which may happen because called by ++ * reiser4_writepages capture_unix_file requires to get ++ * non-exclusive access to a file. It is safe to drop ++ * EA in the middle of tail2extent conversion because ++ * write_unix_file/unix_setattr(truncate)/release_unix_file(extent2tail) ++ * are serialized by uf_info->write semaphore and ++ * because read_unix_file works (should at least) on ++ * partially converted files ++ */ ++ drop_exclusive_access(uf_info); ++ /* throttle the conversion */ ++ reiser4_throttle_write(inode); ++ get_exclusive_access(uf_info); ++ ++ /* ++ * nobody is allowed to complete conversion but a ++ * process which started it ++ */ ++ assert("", inode_get_flag(inode, REISER4_PART_MIXED)); ++ } ++ } ++ ++ inode_clr_flag(inode, REISER4_PART_IN_CONV); ++ ++ if (result == 0) { ++ /* file is converted to extent items */ ++ assert("vs-1697", inode_get_flag(inode, REISER4_PART_MIXED)); ++ ++ uf_info->container = UF_CONTAINER_EXTENTS; ++ complete_conversion(inode); ++ } else { ++ /* ++ * conversion is not complete. Inode was already marked as ++ * REISER4_PART_CONV and stat-data were updated at the first ++ * iteration of the loop above. ++ */ ++ error: ++ release_all_pages(pages, sizeof_array(pages)); ++ warning("nikita-2282", "Partial conversion of %llu: %i", ++ (unsigned long long)get_inode_oid(inode), result); ++ } ++ ++ out: ++ return result; ++} ++ ++static int reserve_extent2tail_iteration(struct inode *inode) ++{ ++ reiser4_tree *tree; ++ ++ tree = tree_by_inode(inode); ++ /* ++ * reserve blocks for (in this order): ++ * ++ * 1. removal of extent item ++ * ++ * 2. insertion of tail by insert_flow() ++ * ++ * 3. drilling to the leaf level by coord_by_key() ++ * ++ * 4. 
possible update of stat-data
++ */
++	grab_space_enable();
++	return reiser4_grab_space
++	    (estimate_one_item_removal(tree) +
++	     estimate_insert_flow(tree->height) +
++	     1 + estimate_one_insert_item(tree) +
++	     inode_file_plugin(inode)->estimate.update(inode), BA_CAN_COMMIT);
++}
++
++static int filler(void *vp, struct page *page)
++{
++	return readpage_unix_file_nolock(vp, page);
++}
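For orientation, the two reservation helpers above are plain arithmetic over the four listed steps: one item removal, one flow insertion, one block for drilling to the leaf level plus one item insertion, and a stat-data update. A minimal user-space sketch of that sum follows; the estimate_* helpers here are stand-ins with illustrative values, since the real estimates come from the node plugin and depend on tree geometry.

    #include <stdio.h>

    /* Stand-ins for the kernel's estimate helpers; the values are
     * illustrative only, but grow with tree height like the real ones. */
    static unsigned estimate_one_item_removal(unsigned height) { return height; }
    static unsigned estimate_insert_flow(unsigned height)      { return height; }
    static unsigned estimate_one_insert_item(unsigned height)  { return height + 1; }
    static unsigned estimate_sd_update(void)                   { return 1; }

    /* Mirrors the sum in reserve_extent2tail_iteration(): removal of one
     * extent item, insertion of a flow of tail items, one block for the
     * leaf-level drill, one item insertion, and a stat-data update. */
    static unsigned extent2tail_reserve(unsigned height)
    {
            return estimate_one_item_removal(height) +
                   estimate_insert_flow(height) +
                   1 + estimate_one_insert_item(height) +
                   estimate_sd_update();
    }

    int main(void)
    {
            unsigned h;

            for (h = 2; h <= 5; h++)
                    printf("tree height %u -> reserve %u blocks\n",
                           h, extent2tail_reserve(h));
            return 0;
    }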
++/* for every page of file: read page, cut part of extent pointing to this
++   page, put data of page into tree as tail items */
++int extent2tail(unix_file_info_t *uf_info)
++{
++	int result;
++	struct inode *inode;
++	struct page *page;
++	unsigned long num_pages, i;
++	unsigned long start_page;
++	reiser4_key from;
++	reiser4_key to;
++	unsigned count;
++	__u64 offset;
++
++	assert("nikita-3362", ea_obtained(uf_info));
++	inode = unix_file_info_to_inode(uf_info);
++	assert("nikita-3412", !IS_RDONLY(inode));
++	assert("vs-1649", uf_info->container != UF_CONTAINER_TAILS);
++	assert("", !inode_get_flag(inode, REISER4_PART_IN_CONV));
++
++	offset = 0;
++	if (inode_get_flag(inode, REISER4_PART_MIXED)) {
++		/*
++		 * file is marked on disk as there was a conversion which did
++		 * not complete due to either crash or some error. Find which
++		 * offset tail conversion stopped at
++		 */
++		result = find_start(inode, EXTENT_POINTER_ID, &offset);
++		if (result == -ENOENT) {
++			/* no extent found, everything is converted */
++			uf_info->container = UF_CONTAINER_TAILS;
++			complete_conversion(inode);
++			return 0;
++		} else if (result != 0)
++			/* some other error */
++			return result;
++	}
++
++	inode_set_flag(inode, REISER4_PART_IN_CONV);
++
++	/* number of pages in the file */
++	num_pages =
++	    (inode->i_size - offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
++	start_page = offset >> PAGE_CACHE_SHIFT;
++
++	inode_file_plugin(inode)->key_by_inode(inode, offset, &from);
++	to = from;
++
++	result = 0;
++	for (i = 0; i < num_pages; i++) {
++		__u64 start_byte;
++
++		result = reserve_extent2tail_iteration(inode);
++		if (result != 0)
++			break;
++		if (i == 0 && offset == 0) {
++			inode_set_flag(inode, REISER4_PART_MIXED);
++			reiser4_update_sd(inode);
++		}
++
++		page = read_cache_page(inode->i_mapping,
++				       (unsigned)(i + start_page), filler, NULL);
++		if (IS_ERR(page)) {
++			result = PTR_ERR(page);
++			break;
++		}
++
++		wait_on_page_locked(page);
++
++		if (!PageUptodate(page)) {
++			page_cache_release(page);
++			result = RETERR(-EIO);
++			break;
++		}
++
++		/* cut part of file we have read */
++		start_byte = (__u64) (i << PAGE_CACHE_SHIFT);
++		set_key_offset(&from, start_byte);
++		set_key_offset(&to, start_byte + PAGE_CACHE_SIZE - 1);
++		/*
++		 * cut_tree_object() returns -E_REPEAT to allow atom
++		 * commits during over-long truncates. But
++		 * extent->tail conversion should be performed in one
++		 * transaction.
++		 */
++		result = cut_tree(tree_by_inode(inode), &from, &to, inode, 0);
++
++		if (result) {
++			page_cache_release(page);
++			break;
++		}
++
++		/* put page data into tree via tail_write */
++		count = PAGE_CACHE_SIZE;
++		if ((i == (num_pages - 1)) &&
++		    (inode->i_size & ~PAGE_CACHE_MASK))
++			/* last page can be incomplete */
++			count = (inode->i_size & ~PAGE_CACHE_MASK);
++		while (count) {
++			struct dentry dentry;
++			struct file file;
++			loff_t pos;
++
++			dentry.d_inode = inode;
++			file.f_dentry = &dentry;
++			file.private_data = NULL;
++			file.f_pos = start_byte;
++			pos = start_byte;
++			result = write_tail(&file, (char __user *)kmap(page),
++					    count, &pos);
++			reiser4_free_file_fsdata(&file);
++			if (result <= 0) {
++				warning("", "write_tail failed");
++				page_cache_release(page);
++				inode_clr_flag(inode, REISER4_PART_IN_CONV);
++				return result;
++			}
++			count -= result;
++		}
++
++		/* release page */
++		lock_page(page);
++		/* page is already detached from jnode and mapping. */
++		assert("vs-1086", page->mapping == NULL);
++		assert("nikita-2690",
++		       (!PagePrivate(page) && jprivate(page) == 0));
++		/* waiting for writeback completion with page lock held is
++		 * perfectly valid. */
++		wait_on_page_writeback(page);
++		drop_page(page);
++		/* release reference taken by read_cache_page() above */
++		page_cache_release(page);
++
++		drop_exclusive_access(uf_info);
++		/* throttle the conversion */
++		reiser4_throttle_write(inode);
++		get_exclusive_access(uf_info);
++		/*
++		 * nobody is allowed to complete conversion but a process which
++		 * started it
++		 */
++		assert("", inode_get_flag(inode, REISER4_PART_MIXED));
++	}
++
++	inode_clr_flag(inode, REISER4_PART_IN_CONV);
++
++	if (i == num_pages) {
++		/* file is converted to formatted items */
++		assert("vs-1698", inode_get_flag(inode, REISER4_PART_MIXED));
++		assert("vs-1260",
++		       inode_has_no_jnodes(reiser4_inode_data(inode)));
++
++		uf_info->container = UF_CONTAINER_TAILS;
++		complete_conversion(inode);
++		return 0;
++	}
++	/*
++	 * conversion is not complete. Inode was already marked as
++	 * REISER4_PART_MIXED and stat-data were updated at the first
++	 * iteration of the loop above.
++	 */
++	warning("nikita-2282",
++		"Partial conversion of %llu: %lu of %lu: %i",
++		(unsigned long long)get_inode_oid(inode), i,
++		num_pages, result);
++
++	return result;
++}
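The loop above writes back PAGE_CACHE_SIZE bytes per page, except for a possibly partial last page, where it keeps only i_size modulo the page size so that padding is never written out as tail items. A self-contained sketch of that arithmetic, with the page size fixed at 4096 for illustration:

    #include <stdio.h>

    #define PAGE_CACHE_SHIFT 12
    #define PAGE_CACHE_SIZE  (1UL << PAGE_CACHE_SHIFT)
    #define PAGE_CACHE_MASK  (~(PAGE_CACHE_SIZE - 1))

    /* How many bytes of page i actually belong to a file of size i_size;
     * the same test extent2tail() uses for its "last page can be
     * incomplete" case. */
    static unsigned long bytes_in_page(unsigned long long i_size,
                                       unsigned long i, unsigned long num_pages)
    {
            if (i == num_pages - 1 && (i_size & ~PAGE_CACHE_MASK))
                    return i_size & ~PAGE_CACHE_MASK;  /* partial last page */
            return PAGE_CACHE_SIZE;
    }

    int main(void)
    {
            unsigned long long i_size = 10000;  /* example file size */
            unsigned long num_pages =
                (i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
            unsigned long i;

            /* prints 4096, 4096, 1808 for a 10000-byte file */
            for (i = 0; i < num_pages; i++)
                    printf("page %lu: %lu bytes\n",
                           i, bytes_in_page(i_size, i, num_pages));
            return 0;
    }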
++
++/*
++ * Local variables:
++ * c-indentation-style: "K&R"
++ * mode-name: "LC"
++ * c-basic-offset: 8
++ * tab-width: 8
++ * fill-column: 79
++ * scroll-step: 1
++ * End:
++ */
+Index: linux-2.6.16/fs/reiser4/plugin/file_ops.c
+===================================================================
+--- /dev/null
++++ linux-2.6.16/fs/reiser4/plugin/file_ops.c
+@@ -0,0 +1,167 @@
++/* Copyright 2005 by Hans Reiser, licensing governed by
++   reiser4/README */
++
++/* this file contains typical implementations for some of methods of
++   struct file_operations and of struct address_space_operations
++*/
++
++#include "../inode.h"
++#include "object.h"
++
++/* file operations */
++
++/* implementation of vfs's llseek method of struct file_operations for
++   typical directory can be found in file_ops_readdir.c
++*/
++loff_t llseek_common_dir(struct file *, loff_t, int origin);
++
++/* implementation of vfs's readdir method of struct file_operations for
++   typical directory can be found in file_ops_readdir.c
++*/
++int readdir_common(struct file *, void *dirent, filldir_t);
++
++/**
++ * release_dir_common - release method of struct file_operations
++ * @inode: inode of released file
++ * @file: file to release
++ *
++ * Implementation of release method of struct file_operations for typical
++ * directory. All it does is free reiser4-specific file data.
++*/
++int release_dir_common(struct inode *inode, struct file *file)
++{
++	reiser4_context *ctx;
++
++	ctx = init_context(inode->i_sb);
++	if (IS_ERR(ctx))
++		return PTR_ERR(ctx);
++	reiser4_free_file_fsdata(file);
++	reiser4_exit_context(ctx);
++	return 0;
++}
++
++/* this is common implementation of vfs's fsync method of struct
++   file_operations
++*/
++int sync_common(struct file *file, struct dentry *dentry, int datasync)
++{
++	reiser4_context *ctx;
++	int result;
++
++	ctx = init_context(dentry->d_inode->i_sb);
++	if (IS_ERR(ctx))
++		return PTR_ERR(ctx);
++	result = txnmgr_force_commit_all(dentry->d_inode->i_sb, 0);
++
++	context_set_commit_async(ctx);
++	reiser4_exit_context(ctx);
++	return result;
++}
++
++/* this is common implementation of vfs's sendfile method of struct
++   file_operations
++
++   Reads @count bytes from @file and calls @actor for every page read. This is
++   needed for loopback device support.
++*/ ++#if 0 ++ssize_t ++sendfile_common(struct file *file, loff_t *ppos, size_t count, ++ read_actor_t actor, void *target) ++{ ++ reiser4_context *ctx; ++ ssize_t result; ++ ++ ctx = init_context(file->f_dentry->d_inode->i_sb); ++ if (IS_ERR(ctx)) ++ return PTR_ERR(ctx); ++ result = generic_file_sendfile(file, ppos, count, actor, target); ++ reiser4_exit_context(ctx); ++ return result; ++} ++#endif /* 0 */ ++ ++/* address space operations */ ++ ++/* this is common implementation of vfs's prepare_write method of struct ++ address_space_operations ++*/ ++int ++prepare_write_common(struct file *file, struct page *page, unsigned from, ++ unsigned to) ++{ ++ reiser4_context *ctx; ++ int result; ++ ++ ctx = init_context(page->mapping->host->i_sb); ++ result = do_prepare_write(file, page, from, to); ++ ++ /* don't commit transaction under inode semaphore */ ++ context_set_commit_async(ctx); ++ reiser4_exit_context(ctx); ++ ++ return result; ++} ++ ++/* this is helper for prepare_write_common and prepare_write_unix_file ++ */ ++int ++do_prepare_write(struct file *file, struct page *page, unsigned from, ++ unsigned to) ++{ ++ int result; ++ file_plugin *fplug; ++ struct inode *inode; ++ ++ assert("umka-3099", file != NULL); ++ assert("umka-3100", page != NULL); ++ assert("umka-3095", PageLocked(page)); ++ ++ if (to - from == PAGE_CACHE_SIZE || PageUptodate(page)) ++ return 0; ++ ++ inode = page->mapping->host; ++ fplug = inode_file_plugin(inode); ++ ++ if (page->mapping->a_ops->readpage == NULL) ++ return RETERR(-EINVAL); ++ ++ result = page->mapping->a_ops->readpage(file, page); ++ if (result != 0) { ++ SetPageError(page); ++ ClearPageUptodate(page); ++ /* All reiser4 readpage() implementations should return the ++ * page locked in case of error. */ ++ assert("nikita-3472", PageLocked(page)); ++ } else { ++ /* ++ * ->readpage() either: ++ * ++ * 1. starts IO against @page. @page is locked for IO in ++ * this case. ++ * ++ * 2. doesn't start IO. @page is unlocked. ++ * ++ * In either case, page should be locked. ++ */ ++ lock_page(page); ++ /* ++ * IO (if any) is completed at this point. Check for IO ++ * errors. ++ */ ++ if (!PageUptodate(page)) ++ result = RETERR(-EIO); ++ } ++ assert("umka-3098", PageLocked(page)); ++ return result; ++} ++ ++/* ++ * Local variables: ++ * c-indentation-style: "K&R" ++ * mode-name: "LC" ++ * c-basic-offset: 8 ++ * tab-width: 8 ++ * fill-column: 79 ++ * End: ++ */ +Index: linux-2.6.16/fs/reiser4/plugin/file_ops_readdir.c +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/plugin/file_ops_readdir.c +@@ -0,0 +1,654 @@ ++/* Copyright 2005 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++#include "../inode.h" ++ ++/* return true, iff @coord points to the valid directory item that is part of ++ * @inode directory. */ ++static int is_valid_dir_coord(struct inode *inode, coord_t * coord) ++{ ++ return ++ item_type_by_coord(coord) == DIR_ENTRY_ITEM_TYPE && ++ inode_file_plugin(inode)->owns_item(inode, coord); ++} ++ ++/* compare two logical positions within the same directory */ ++static cmp_t dir_pos_cmp(const dir_pos * p1, const dir_pos * p2) ++{ ++ cmp_t result; ++ ++ assert("nikita-2534", p1 != NULL); ++ assert("nikita-2535", p2 != NULL); ++ ++ result = de_id_cmp(&p1->dir_entry_key, &p2->dir_entry_key); ++ if (result == EQUAL_TO) { ++ int diff; ++ ++ diff = p1->pos - p2->pos; ++ result = ++ (diff < 0) ? LESS_THAN : (diff ? 
GREATER_THAN : EQUAL_TO); ++ } ++ return result; ++} ++ ++ ++/* see comment before readdir_common() for overview of why "adjustment" is ++ * necessary. */ ++static void ++adjust_dir_pos(struct file *dir, ++ readdir_pos * readdir_spot, const dir_pos * mod_point, int adj) ++{ ++ dir_pos *pos; ++ ++ /* ++ * new directory entry was added (adj == +1) or removed (adj == -1) at ++ * the @mod_point. Directory file descriptor @dir is doing readdir and ++ * is currently positioned at @readdir_spot. Latter has to be updated ++ * to maintain stable readdir. ++ */ ++ /* directory is positioned to the beginning. */ ++ if (readdir_spot->entry_no == 0) ++ return; ++ ++ pos = &readdir_spot->position; ++ switch (dir_pos_cmp(mod_point, pos)) { ++ case LESS_THAN: ++ /* @mod_pos is _before_ @readdir_spot, that is, entry was ++ * added/removed on the left (in key order) of current ++ * position. */ ++ /* logical number of directory entry readdir is "looking" at ++ * changes */ ++ readdir_spot->entry_no += adj; ++ assert("nikita-2577", ++ ergo(dir != NULL, get_dir_fpos(dir) + adj >= 0)); ++ if (de_id_cmp(&pos->dir_entry_key, ++ &mod_point->dir_entry_key) == EQUAL_TO) { ++ assert("nikita-2575", mod_point->pos < pos->pos); ++ /* ++ * if entry added/removed has the same key as current ++ * for readdir, update counter of duplicate keys in ++ * @readdir_spot. ++ */ ++ pos->pos += adj; ++ } ++ break; ++ case GREATER_THAN: ++ /* directory is modified after @pos: nothing to do. */ ++ break; ++ case EQUAL_TO: ++ /* cannot insert an entry readdir is looking at, because it ++ already exists. */ ++ assert("nikita-2576", adj < 0); ++ /* directory entry to which @pos points to is being ++ removed. ++ ++ NOTE-NIKITA: Right thing to do is to update @pos to point ++ to the next entry. This is complex (we are under spin-lock ++ for one thing). Just rewind it to the beginning. Next ++ readdir will have to scan the beginning of ++ directory. Proper solution is to use semaphore in ++ spin lock's stead and use rewind_right() here. ++ ++ NOTE-NIKITA: now, semaphore is used, so... ++ */ ++ memset(readdir_spot, 0, sizeof *readdir_spot); ++ } ++} ++ ++/* scan all file-descriptors for this directory and adjust their ++ positions respectively. Should be used by implementations of ++ add_entry and rem_entry of dir plugin */ ++void ++adjust_dir_file(struct inode *dir, const struct dentry *de, int offset, int adj) ++{ ++ reiser4_file_fsdata *scan; ++ dir_pos mod_point; ++ ++ assert("nikita-2536", dir != NULL); ++ assert("nikita-2538", de != NULL); ++ assert("nikita-2539", adj != 0); ++ ++ build_de_id(dir, &de->d_name, &mod_point.dir_entry_key); ++ mod_point.pos = offset; ++ ++ spin_lock_inode(dir); ++ ++ /* ++ * new entry was added/removed in directory @dir. Scan all file ++ * descriptors for @dir that are currently involved into @readdir and ++ * update them. ++ */ ++ ++ list_for_each_entry(scan, get_readdir_list(dir), dir.linkage) ++ adjust_dir_pos(scan->back, &scan->dir.readdir, &mod_point, adj); ++ ++ spin_unlock_inode(dir); ++} ++ ++/* ++ * traverse tree to start/continue readdir from the readdir position @pos. 
++ */ ++static int dir_go_to(struct file *dir, readdir_pos * pos, tap_t * tap) ++{ ++ reiser4_key key; ++ int result; ++ struct inode *inode; ++ ++ assert("nikita-2554", pos != NULL); ++ ++ inode = dir->f_dentry->d_inode; ++ result = inode_dir_plugin(inode)->build_readdir_key(dir, &key); ++ if (result != 0) ++ return result; ++ result = object_lookup(inode, ++ &key, ++ tap->coord, ++ tap->lh, ++ tap->mode, ++ FIND_EXACT, ++ LEAF_LEVEL, LEAF_LEVEL, 0, &tap->ra_info); ++ if (result == CBK_COORD_FOUND) ++ result = rewind_right(tap, (int)pos->position.pos); ++ else { ++ tap->coord->node = NULL; ++ done_lh(tap->lh); ++ result = RETERR(-EIO); ++ } ++ return result; ++} ++ ++/* ++ * handling of non-unique keys: calculate at what ordinal position within ++ * sequence of directory items with identical keys @pos is. ++ */ ++static int set_pos(struct inode *inode, readdir_pos * pos, tap_t * tap) ++{ ++ int result; ++ coord_t coord; ++ lock_handle lh; ++ tap_t scan; ++ de_id *did; ++ reiser4_key de_key; ++ ++ coord_init_zero(&coord); ++ init_lh(&lh); ++ tap_init(&scan, &coord, &lh, ZNODE_READ_LOCK); ++ tap_copy(&scan, tap); ++ tap_load(&scan); ++ pos->position.pos = 0; ++ ++ did = &pos->position.dir_entry_key; ++ ++ if (is_valid_dir_coord(inode, scan.coord)) { ++ ++ build_de_id_by_key(unit_key_by_coord(scan.coord, &de_key), did); ++ ++ while (1) { ++ ++ result = go_prev_unit(&scan); ++ if (result != 0) ++ break; ++ ++ if (!is_valid_dir_coord(inode, scan.coord)) { ++ result = -EINVAL; ++ break; ++ } ++ ++ /* get key of directory entry */ ++ unit_key_by_coord(scan.coord, &de_key); ++ if (de_id_key_cmp(did, &de_key) != EQUAL_TO) { ++ /* duplicate-sequence is over */ ++ break; ++ } ++ pos->position.pos++; ++ } ++ } else ++ result = RETERR(-ENOENT); ++ tap_relse(&scan); ++ tap_done(&scan); ++ return result; ++} ++ ++/* ++ * "rewind" directory to @offset, i.e., set @pos and @tap correspondingly. ++ */ ++static int dir_rewind(struct file *dir, readdir_pos * pos, tap_t * tap) ++{ ++ __u64 destination; ++ __s64 shift; ++ int result; ++ struct inode *inode; ++ loff_t dirpos; ++ ++ assert("nikita-2553", dir != NULL); ++ assert("nikita-2548", pos != NULL); ++ assert("nikita-2551", tap->coord != NULL); ++ assert("nikita-2552", tap->lh != NULL); ++ ++ dirpos = get_dir_fpos(dir); ++ shift = dirpos - pos->fpos; ++ /* this is logical directory entry within @dir which we are rewinding ++ * to */ ++ destination = pos->entry_no + shift; ++ ++ inode = dir->f_dentry->d_inode; ++ if (dirpos < 0) ++ return RETERR(-EINVAL); ++ else if (destination == 0ll || dirpos == 0) { ++ /* rewind to the beginning of directory */ ++ memset(pos, 0, sizeof *pos); ++ return dir_go_to(dir, pos, tap); ++ } else if (destination >= inode->i_size) ++ return RETERR(-ENOENT); ++ ++ if (shift < 0) { ++ /* I am afraid of negative numbers */ ++ shift = -shift; ++ /* rewinding to the left */ ++ if (shift <= (int)pos->position.pos) { ++ /* destination is within sequence of entries with ++ duplicate keys. */ ++ result = dir_go_to(dir, pos, tap); ++ } else { ++ shift -= pos->position.pos; ++ while (1) { ++ /* repetitions: deadlock is possible when ++ going to the left. 
*/ ++ result = dir_go_to(dir, pos, tap); ++ if (result == 0) { ++ result = rewind_left(tap, shift); ++ if (result == -E_DEADLOCK) { ++ tap_done(tap); ++ continue; ++ } ++ } ++ break; ++ } ++ } ++ } else { ++ /* rewinding to the right */ ++ result = dir_go_to(dir, pos, tap); ++ if (result == 0) ++ result = rewind_right(tap, shift); ++ } ++ if (result == 0) { ++ result = set_pos(inode, pos, tap); ++ if (result == 0) { ++ /* update pos->position.pos */ ++ pos->entry_no = destination; ++ pos->fpos = dirpos; ++ } ++ } ++ return result; ++} ++ ++/* ++ * Function that is called by common_readdir() on each directory entry while ++ * doing readdir. ->filldir callback may block, so we had to release long term ++ * lock while calling it. To avoid repeating tree traversal, seal is used. If ++ * seal is broken, we return -E_REPEAT. Node is unlocked in this case. ++ * ++ * Whether node is unlocked in case of any other error is undefined. It is ++ * guaranteed to be still locked if success (0) is returned. ++ * ++ * When ->filldir() wants no more, feed_entry() returns 1, and node is ++ * unlocked. ++ */ ++static int ++feed_entry(struct file *f, ++ readdir_pos * pos, tap_t * tap, filldir_t filldir, void *dirent) ++{ ++ item_plugin *iplug; ++ char *name; ++ reiser4_key sd_key; ++ int result; ++ char buf[DE_NAME_BUF_LEN]; ++ char name_buf[32]; ++ char *local_name; ++ unsigned file_type; ++ seal_t seal; ++ coord_t *coord; ++ reiser4_key entry_key; ++ ++ coord = tap->coord; ++ iplug = item_plugin_by_coord(coord); ++ ++ /* pointer to name within the node */ ++ name = iplug->s.dir.extract_name(coord, buf); ++ assert("nikita-1371", name != NULL); ++ ++ /* key of object the entry points to */ ++ if (iplug->s.dir.extract_key(coord, &sd_key) != 0) ++ return RETERR(-EIO); ++ ++ /* we must release longterm znode lock before calling filldir to avoid ++ deadlock which may happen if filldir causes page fault. So, copy ++ name to intermediate buffer */ ++ if (strlen(name) + 1 > sizeof(name_buf)) { ++ local_name = kmalloc(strlen(name) + 1, get_gfp_mask()); ++ if (local_name == NULL) ++ return RETERR(-ENOMEM); ++ } else ++ local_name = name_buf; ++ ++ strcpy(local_name, name); ++ file_type = iplug->s.dir.extract_file_type(coord); ++ ++ unit_key_by_coord(coord, &entry_key); ++ seal_init(&seal, coord, &entry_key); ++ ++ longterm_unlock_znode(tap->lh); ++ ++ /* ++ * send information about directory entry to the ->filldir() filler ++ * supplied to us by caller (VFS). ++ * ++ * ->filldir is entitled to do weird things. For example, ->filldir ++ * supplied by knfsd re-enters file system. Make sure no locks are ++ * held. ++ */ ++ assert("nikita-3436", lock_stack_isclean(get_current_lock_stack())); ++ ++ result = filldir(dirent, name, (int)strlen(name), ++ /* offset of this entry */ ++ f->f_pos, ++ /* inode number of object bounden by this entry */ ++ oid_to_uino(get_key_objectid(&sd_key)), file_type); ++ if (local_name != name_buf) ++ kfree(local_name); ++ if (result < 0) ++ /* ->filldir() is satisfied. 
(no space in buffer, IOW) */ ++ result = 1; ++ else ++ result = seal_validate(&seal, coord, &entry_key, ++ tap->lh, tap->mode, ZNODE_LOCK_HIPRI); ++ return result; ++} ++ ++static void move_entry(readdir_pos * pos, coord_t * coord) ++{ ++ reiser4_key de_key; ++ de_id *did; ++ ++ /* update @pos */ ++ ++pos->entry_no; ++ did = &pos->position.dir_entry_key; ++ ++ /* get key of directory entry */ ++ unit_key_by_coord(coord, &de_key); ++ ++ if (de_id_key_cmp(did, &de_key) == EQUAL_TO) ++ /* we are within sequence of directory entries ++ with duplicate keys. */ ++ ++pos->position.pos; ++ else { ++ pos->position.pos = 0; ++ build_de_id_by_key(&de_key, did); ++ } ++ ++pos->fpos; ++} ++ ++/* ++ * STATELESS READDIR ++ * ++ * readdir support in reiser4 relies on ability to update readdir_pos embedded ++ * into reiser4_file_fsdata on each directory modification (name insertion and ++ * removal), see readdir_common() function below. This obviously doesn't work ++ * when reiser4 is accessed over NFS, because NFS doesn't keep any state ++ * across client READDIR requests for the same directory. ++ * ++ * To address this we maintain a "pool" of detached reiser4_file_fsdata ++ * (d_cursor). Whenever NFS readdir request comes, we detect this, and try to ++ * find detached reiser4_file_fsdata corresponding to previous readdir ++ * request. In other words, additional state is maintained on the ++ * server. (This is somewhat contrary to the design goals of NFS protocol.) ++ * ++ * To efficiently detect when our ->readdir() method is called by NFS server, ++ * dentry is marked as "stateless" in reiser4_decode_fh() (this is checked by ++ * file_is_stateless() function). ++ * ++ * To find out d_cursor in the pool, we encode client id (cid) in the highest ++ * bits of NFS readdir cookie: when first readdir request comes to the given ++ * directory from the given client, cookie is set to 0. This situation is ++ * detected, global cid_counter is incremented, and stored in highest bits of ++ * all direntry offsets returned to the client, including last one. As the ++ * only valid readdir cookie is one obtained as direntry->offset, we are ++ * guaranteed that next readdir request (continuing current one) will have ++ * current cid in the highest bits of starting readdir cookie. All d_cursors ++ * are hashed into per-super-block hash table by (oid, cid) key. ++ * ++ * In addition d_cursors are placed into per-super-block radix tree where they ++ * are keyed by oid alone. This is necessary to efficiently remove them during ++ * rmdir. ++ * ++ * At last, currently unused d_cursors are linked into special list. This list ++ * is used d_cursor_shrink to reclaim d_cursors on memory pressure. ++ * ++ */ ++ ++ ++/* ++ * prepare for readdir. ++ */ ++static int dir_readdir_init(struct file *f, tap_t * tap, readdir_pos ** pos) ++{ ++ struct inode *inode; ++ reiser4_file_fsdata *fsdata; ++ int result; ++ ++ assert("nikita-1359", f != NULL); ++ inode = f->f_dentry->d_inode; ++ assert("nikita-1360", inode != NULL); ++ ++ if (!S_ISDIR(inode->i_mode)) ++ return RETERR(-ENOTDIR); ++ ++ /* try to find detached readdir state */ ++ result = try_to_attach_fsdata(f, inode); ++ if (result != 0) ++ return result; ++ ++ fsdata = reiser4_get_file_fsdata(f); ++ assert("nikita-2571", fsdata != NULL); ++ if (IS_ERR(fsdata)) ++ return PTR_ERR(fsdata); ++ ++ /* add file descriptor to the readdir list hanging of directory ++ * inode. This list is used to scan "readdirs-in-progress" while ++ * inserting or removing names in the directory. 
*/
++	spin_lock_inode(inode);
++	if (list_empty_careful(&fsdata->dir.linkage))
++		list_add(&fsdata->dir.linkage, get_readdir_list(inode));
++	*pos = &fsdata->dir.readdir;
++	spin_unlock_inode(inode);
++
++	/* move @tap to the current position */
++	return dir_rewind(f, *pos, tap);
++}
++
++/* this is implementation of vfs's llseek method of struct file_operations for
++   typical directory
++   See comment before readdir_common() for explanation.
++*/
++loff_t llseek_common_dir(struct file *file, loff_t off, int origin)
++{
++	reiser4_context *ctx;
++	loff_t result;
++	struct inode *inode;
++
++	inode = file->f_dentry->d_inode;
++
++	ctx = init_context(inode->i_sb);
++	if (IS_ERR(ctx))
++		return PTR_ERR(ctx);
++
++	mutex_lock(&inode->i_mutex);
++
++	/* update ->f_pos */
++	result = default_llseek(file, off, origin);
++	if (result >= 0) {
++		int ff;
++		coord_t coord;
++		lock_handle lh;
++		tap_t tap;
++		readdir_pos *pos;
++
++		coord_init_zero(&coord);
++		init_lh(&lh);
++		tap_init(&tap, &coord, &lh, ZNODE_READ_LOCK);
++
++		ff = dir_readdir_init(file, &tap, &pos);
++		detach_fsdata(file);
++		if (ff != 0)
++			result = (loff_t) ff;
++		tap_done(&tap);
++	}
++	detach_fsdata(file);
++	mutex_unlock(&inode->i_mutex);
++
++	reiser4_exit_context(ctx);
++	return result;
++}
++
++/* this is common implementation of vfs's readdir method of struct
++   file_operations
++
++   readdir problems:
++
++   The readdir(2)/getdents(2) interface is based on the implicit assumption
++   that readdir can be restarted from any particular point by supplying the
++   file system with an off_t-full of data. That is, the file system fills the
++   ->d_off field in struct dirent and the user later passes ->d_off to
++   seekdir(3), which is actually implemented by glibc as lseek(2) on the
++   directory.
++
++   Reiser4 cannot restart readdir from 64 bits of data, because the two last
++   components of the key of a directory entry are unknown, which leaves 128
++   bits to recover: the locality and type fields in the key of a directory
++   entry are always known, but to start readdir() from a given point the
++   objectid and offset fields have to be filled in.
++
++   The traditional UNIX API for scanning through a directory
++   (readdir/seekdir/telldir/opendir/closedir/rewinddir/getdents) is based on
++   the assumption that a directory is structured very much like a regular
++   file; in particular, it is implied that each name within a given directory
++   (directory entry) can be uniquely identified by a scalar offset and that
++   such an offset is stable across the life-time of the name it identifies.
++
++   This is manifestly not so for reiser4. In reiser4 the only stable unique
++   identifier for a directory entry is its key, which doesn't fit into the
++   seekdir/telldir API.
++
++   solution:
++
++   Within each file descriptor participating in readdir-ing of a directory,
++   plugin/dir/dir.h:readdir_pos is maintained. This structure keeps track of
++   the "current" directory entry that the file descriptor looks at. It
++   contains a key of the directory entry (plus some additional info to deal
++   with non-unique keys that we won't dwell on here) and the logical position
++   of this directory entry counted from the beginning of the directory, that
++   is, the ordinal number of this entry in the readdir order.
++
++   Obviously this logical position is not stable in the face of directory
++   modifications. To work around this, on each addition or removal of a
++   directory entry all file descriptors for the directory inode are scanned
++   and their readdir_pos are updated accordingly (adjust_dir_pos()).
++*/
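The STATELESS READDIR scheme described earlier works around this for NFS by packing a client id (cid) into the high bits of the readdir cookie. A rough user-space sketch of such an encoding follows; the 48/16-bit split and the helper names are assumptions made for illustration, not the actual d_cursor layout.

    #include <stdio.h>
    #include <stdint.h>

    /* Illustrative split: high bits carry the client id, low bits the
     * position within the directory. */
    #define CID_SHIFT 48
    #define POS_MASK  ((1ULL << CID_SHIFT) - 1)

    static uint64_t encode_cookie(uint64_t cid, uint64_t pos)
    {
            return (cid << CID_SHIFT) | (pos & POS_MASK);
    }

    static uint64_t cookie_cid(uint64_t cookie) { return cookie >> CID_SHIFT; }
    static uint64_t cookie_pos(uint64_t cookie) { return cookie & POS_MASK; }

    int main(void)
    {
            /* first request from a client came with cookie 0, so a fresh
             * cid (here 7) was allocated and stored in every returned
             * direntry offset, including the last one */
            uint64_t cookie = encode_cookie(7, 12345);

            /* the continuation request hands the last cookie back; the cid
             * locates the detached d_cursor kept on the server side */
            printf("cid=%llu pos=%llu\n",
                   (unsigned long long)cookie_cid(cookie),
                   (unsigned long long)cookie_pos(cookie));
            return 0;
    }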
++int readdir_common(struct file *f /* directory file being read */ ,
++		   void *dirent /* opaque data passed to us by VFS */ ,
++		   filldir_t filld /* filler function passed to us by VFS */ )
++{
++	reiser4_context *ctx;
++	int result;
++	struct inode *inode;
++	coord_t coord;
++	lock_handle lh;
++	tap_t tap;
++	readdir_pos *pos;
++
++	assert("nikita-1359", f != NULL);
++	inode = f->f_dentry->d_inode;
++	assert("nikita-1360", inode != NULL);
++
++	if (!S_ISDIR(inode->i_mode))
++		return RETERR(-ENOTDIR);
++
++	ctx = init_context(inode->i_sb);
++	if (IS_ERR(ctx))
++		return PTR_ERR(ctx);
++
++	coord_init_zero(&coord);
++	init_lh(&lh);
++	tap_init(&tap, &coord, &lh, ZNODE_READ_LOCK);
++
++	reiser4_readdir_readahead_init(inode, &tap);
++
++      repeat:
++	result = dir_readdir_init(f, &tap, &pos);
++	if (result == 0) {
++		result = tap_load(&tap);
++		/* scan entries one by one, feeding them to @filld */
++		while (result == 0) {
++			coord_t *coord;
++
++			coord = tap.coord;
++			assert("nikita-2572", coord_is_existing_unit(coord));
++			assert("nikita-3227", is_valid_dir_coord(inode, coord));
++
++			result = feed_entry(f, pos, &tap, filld, dirent);
++			if (result > 0) {
++				break;
++			} else if (result == 0) {
++				++f->f_pos;
++				result = go_next_unit(&tap);
++				if (result == -E_NO_NEIGHBOR ||
++				    result == -ENOENT) {
++					result = 0;
++					break;
++				} else if (result == 0) {
++					if (is_valid_dir_coord(inode, coord))
++						move_entry(pos, coord);
++					else
++						break;
++				}
++			} else if (result == -E_REPEAT) {
++				/* feed_entry() had to restart. */
++				++f->f_pos;
++				tap_relse(&tap);
++				goto repeat;
++			} else
++				warning("vs-1617",
++					"readdir_common: unexpected error %d",
++					result);
++		}
++		tap_relse(&tap);
++
++		if (result >= 0)
++			f->f_version = inode->i_version;
++	} else if (result == -E_NO_NEIGHBOR || result == -ENOENT)
++		result = 0;
++	tap_done(&tap);
++	detach_fsdata(f);
++
++	/* try to update directory's atime */
++	if (reiser4_grab_space(inode_file_plugin(inode)->estimate.update(inode),
++			       BA_CAN_COMMIT) != 0)
++		warning("", "failed to update atime on readdir: %llu",
++			get_inode_oid(inode));
++	else
++		file_accessed(f);
++
++	context_set_commit_async(ctx);
++	reiser4_exit_context(ctx);
++
++	return (result <= 0) ? result : 0;
++}
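feed_entry(), used by the loop above, follows a general pattern: capture a seal for the current coord, drop the long-term znode lock, call the potentially blocking ->filldir() callback, then revalidate the seal and restart on -E_REPEAT. A toy version of that pattern, with the seal reduced to a version counter (all names here are invented for the sketch):

    #include <stdio.h>

    /* A "seal" in miniature: remember a version when dropping the lock
     * and check it after the callback returns. feed_entry() does the real
     * thing with seal_init()/seal_validate() against a znode. */
    struct seal { unsigned version; };

    static unsigned tree_version;  /* bumped by every tree modification */

    static void seal_capture(struct seal *s) { s->version = tree_version; }
    static int seal_check(const struct seal *s)
    {
            return s->version == tree_version ? 0 : -1;  /* -1 ~ -E_REPEAT */
    }

    static int feed_one(int (*filldir)(int entry), int entry)
    {
            struct seal s;

            seal_capture(&s);      /* remember position before unlocking */
            /* ...long-term lock dropped here; filldir may block/re-enter */
            if (filldir(entry) < 0)
                    return 1;      /* consumer is full; stop scanning */
            return seal_check(&s); /* 0: continue; -1: restart the lookup */
    }

    static int fill_entry(int entry)
    {
            if (entry == 1)
                    tree_version++;  /* simulate a concurrent rename */
            return entry < 3 ? 0 : -1;
    }

    int main(void)
    {
            int e;

            /* prints 0, -1 (restart), 0, 1 (done) */
            for (e = 0; e < 4; e++)
                    printf("entry %d -> %d\n", e, feed_one(fill_entry, e));
            return 0;
    }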
++
++/*
++ * Local variables:
++ * c-indentation-style: "K&R"
++ * mode-name: "LC"
++ * c-basic-offset: 8
++ * tab-width: 8
++ * fill-column: 79
++ * End:
++ */
+Index: linux-2.6.16/fs/reiser4/plugin/file_plugin_common.c
+===================================================================
+--- /dev/null
++++ linux-2.6.16/fs/reiser4/plugin/file_plugin_common.c
+@@ -0,0 +1,929 @@
++/* Copyright 2005 by Hans Reiser, licensing governed by
++   reiser4/README */
++
++/* this file contains typical implementations for most of methods of
++   file plugin
++*/
++
++#include "../inode.h"
++#include "object.h"
++#include "../safe_link.h"
++
++#include <linux/quotaops.h>
++
++static int insert_new_sd(struct inode *inode);
++static int update_sd(struct inode *inode);
++
++/* this is common implementation of write_sd_by_inode method of file plugin:
++   either insert stat data or update it
++ */
++int write_sd_by_inode_common(struct inode *inode /* object to save */ )
++{
++	int result;
++
++	assert("nikita-730", inode != NULL);
++
++	if (inode_get_flag(inode, REISER4_NO_SD))
++		/* object doesn't have stat-data yet */
++		result = insert_new_sd(inode);
++	else
++		result = update_sd(inode);
++	if (result != 0 && result != -ENAMETOOLONG && result != -ENOMEM)
++		/* Don't issue warnings about "name is too long" */
++		warning("nikita-2221", "Failed to save sd for %llu: %i",
++			(unsigned long long)get_inode_oid(inode), result);
++	return result;
++}
++
++/* this is common implementation of key_by_inode method of file plugin
++ */
++int
++key_by_inode_and_offset_common(struct inode *inode, loff_t off,
++			       reiser4_key * key)
++{
++	reiser4_key_init(key);
++	set_key_locality(key, reiser4_inode_data(inode)->locality_id);
++	set_key_ordering(key, get_inode_ordering(inode));
++	set_key_objectid(key, get_inode_oid(inode));	/*FIXME: inode->i_ino */
++	set_key_type(key, KEY_BODY_MINOR);
++	set_key_offset(key, (__u64) off);
++	return 0;
++}
++
++/* this is common implementation of set_plug_in_inode method of file plugin
++ */
++int set_plug_in_inode_common(struct inode *object /* inode to set plugin on */ ,
++			     struct inode *parent /* parent object */ ,
++			     reiser4_object_create_data * data /* creational
++								* data */ )
++{
++	__u64 mask;
++
++	object->i_mode = data->mode;
++	/* this should be plugin decision */
++	object->i_uid = current->fsuid;
++	object->i_mtime = object->i_atime = object->i_ctime = CURRENT_TIME;
++
++	/* support for BSD style group-id assignment.
See mount's manual page ++ description of bsdgroups ext2 mount options for more details */ ++ if (reiser4_is_set(object->i_sb, REISER4_BSD_GID)) ++ object->i_gid = parent->i_gid; ++ else if (parent->i_mode & S_ISGID) { ++ /* parent directory has sguid bit */ ++ object->i_gid = parent->i_gid; ++ if (S_ISDIR(object->i_mode)) ++ /* sguid is inherited by sub-directories */ ++ object->i_mode |= S_ISGID; ++ } else ++ object->i_gid = current->fsgid; ++ ++ /* this object doesn't have stat-data yet */ ++ inode_set_flag(object, REISER4_NO_SD); ++#if 0 ++ /* this is now called after all inode plugins are initialized: ++ do_create_vfs_child after adjust_to_parent */ ++ /* setup inode and file-operations for this inode */ ++ setup_inode_ops(object, data); ++#endif ++ object->i_nlink = 0; ++ seal_init(&reiser4_inode_data(object)->sd_seal, NULL, NULL); ++ mask = (1 << UNIX_STAT) | (1 << LIGHT_WEIGHT_STAT); ++ if (!reiser4_is_set(object->i_sb, REISER4_32_BIT_TIMES)) ++ mask |= (1 << LARGE_TIMES_STAT); ++ ++ reiser4_inode_data(object)->extmask = mask; ++ return 0; ++} ++ ++/* this is common implementation of adjust_to_parent method of file plugin for ++ regular files ++ */ ++int adjust_to_parent_common(struct inode *object /* new object */ , ++ struct inode *parent /* parent directory */ , ++ struct inode *root /* root directory */ ) ++{ ++ assert("nikita-2165", object != NULL); ++ if (parent == NULL) ++ parent = root; ++ assert("nikita-2069", parent != NULL); ++ ++ /* ++ * inherit missing plugins from parent ++ */ ++ ++ grab_plugin(object, parent, PSET_FILE); ++ grab_plugin(object, parent, PSET_SD); ++ grab_plugin(object, parent, PSET_FORMATTING); ++ grab_plugin(object, parent, PSET_PERM); ++ return 0; ++} ++ ++/* this is common implementation of adjust_to_parent method of file plugin for ++ typical directories ++ */ ++int adjust_to_parent_common_dir(struct inode *object /* new object */ , ++ struct inode *parent /* parent directory */ , ++ struct inode *root /* root directory */ ) ++{ ++ int result = 0; ++ pset_member memb; ++ ++ assert("nikita-2166", object != NULL); ++ if (parent == NULL) ++ parent = root; ++ assert("nikita-2167", parent != NULL); ++ ++ /* ++ * inherit missing plugins from parent ++ */ ++ for (memb = 0; memb < PSET_LAST; ++memb) { ++ result = grab_plugin(object, parent, memb); ++ if (result != 0) ++ break; ++ } ++ return result; ++} ++ ++int adjust_to_parent_cryptcompress(struct inode *object /* new object */ , ++ struct inode *parent /* parent directory */, ++ struct inode *root /* root directory */) ++{ ++ int result; ++ result = adjust_to_parent_common(object, parent, root); ++ if (result) ++ return result; ++ assert("edward-1416", parent != NULL); ++ ++ grab_plugin(object, parent, PSET_CLUSTER); ++ grab_plugin(object, parent, PSET_CIPHER); ++ grab_plugin(object, parent, PSET_DIGEST); ++ grab_plugin(object, parent, PSET_COMPRESSION); ++ grab_plugin(object, parent, PSET_COMPRESSION_MODE); ++ ++ return 0; ++} ++ ++/* this is common implementation of create_object method of file plugin ++ */ ++int ++create_object_common(struct inode *object, struct inode *parent UNUSED_ARG, ++ reiser4_object_create_data * data UNUSED_ARG) ++{ ++ reiser4_block_nr reserve; ++ assert("nikita-744", object != NULL); ++ assert("nikita-745", parent != NULL); ++ assert("nikita-747", data != NULL); ++ assert("nikita-748", inode_get_flag(object, REISER4_NO_SD)); ++ ++ reserve = estimate_create_common(object); ++ if (reiser4_grab_space(reserve, BA_CAN_COMMIT)) ++ return RETERR(-ENOSPC); ++ return 
write_sd_by_inode_common(object); ++} ++ ++static int common_object_delete_no_reserve(struct inode *inode); ++ ++/** ++ * delete_object_common - delete_object of file_plugin ++ * @inode: inode to be deleted ++ * ++ * This is common implementation of delete_object method of file_plugin. It ++ * applies to object its deletion consists of removing two items - stat data ++ * and safe-link. ++ */ ++int delete_object_common(struct inode *inode) ++{ ++ int result; ++ ++ assert("nikita-1477", inode != NULL); ++ /* FIXME: if file body deletion failed (i/o error, for instance), ++ inode->i_size can be != 0 here */ ++ assert("nikita-3420", inode->i_size == 0 || S_ISLNK(inode->i_mode)); ++ assert("nikita-3421", inode->i_nlink == 0); ++ ++ ++ if (!inode_get_flag(inode, REISER4_NO_SD)) { ++ reiser4_block_nr reserve; ++ ++ /* grab space which is needed to remove 2 items from the tree: ++ stat data and safe-link */ ++ reserve = 2 * estimate_one_item_removal(tree_by_inode(inode)); ++ if (reiser4_grab_space_force(reserve, ++ BA_RESERVED | BA_CAN_COMMIT)) ++ return RETERR(-ENOSPC); ++ result = common_object_delete_no_reserve(inode); ++ } else ++ result = 0; ++ return result; ++} ++ ++/** ++ * delete_directory_common - delete_object of file_plugin ++ * @inode: inode to be deleted ++ * ++ * This is common implementation of delete_object method of file_plugin for ++ * typical directory. It calls done method of dir_plugin to remove "." and ++ * removes stat data and safe-link. ++ */ ++int delete_directory_common(struct inode *inode) ++{ ++ int result; ++ dir_plugin *dplug; ++ ++ assert("", (get_current_context() && ++ get_current_context()->trans->atom == NULL)); ++ ++ dplug = inode_dir_plugin(inode); ++ assert("vs-1101", dplug && dplug->done); ++ ++ /* kill cursors which might be attached to inode */ ++ kill_cursors(inode); ++ ++ /* grab space enough for removing two items */ ++ if (reiser4_grab_space ++ (2 * estimate_one_item_removal(tree_by_inode(inode)), ++ BA_RESERVED | BA_CAN_COMMIT)) ++ return RETERR(-ENOSPC); ++ ++ result = dplug->done(inode); ++ if (!result) ++ result = common_object_delete_no_reserve(inode); ++ return result; ++} ++ ++/* this is common implementation of add_link method of file plugin ++ */ ++int add_link_common(struct inode *object, struct inode *parent UNUSED_ARG) ++{ ++ /* ++ * increment ->i_nlink and update ->i_ctime ++ */ ++ ++ INODE_INC_FIELD(object, i_nlink); ++ object->i_ctime = CURRENT_TIME; ++ return 0; ++} ++ ++/* this is common implementation of rem_link method of file plugin ++ */ ++int rem_link_common(struct inode *object, struct inode *parent UNUSED_ARG) ++{ ++ assert("nikita-2021", object != NULL); ++ assert("nikita-2163", object->i_nlink > 0); ++ ++ /* ++ * decrement ->i_nlink and update ->i_ctime ++ */ ++ ++ INODE_DEC_FIELD(object, i_nlink); ++ object->i_ctime = CURRENT_TIME; ++ return 0; ++} ++ ++/* this is common implementation of rem_link method of file plugin for typical ++ directory ++*/ ++int rem_link_common_dir(struct inode *object, struct inode *parent UNUSED_ARG) ++{ ++ assert("nikita-20211", object != NULL); ++ assert("nikita-21631", object->i_nlink > 0); ++ ++ /* ++ * decrement ->i_nlink and update ->i_ctime ++ */ ++ INODE_DEC_FIELD(object, i_nlink); ++ if (object->i_nlink == 1) ++ INODE_DEC_FIELD(object, i_nlink); ++ object->i_ctime = CURRENT_TIME; ++ return 0; ++} ++ ++/* this is common implementation of owns_item method of file plugin ++ compare objectids of keys in inode and coord */ ++int owns_item_common(const struct inode *inode, /* object to check ++ 
* against */ ++ const coord_t * coord /* coord to check */ ) ++{ ++ reiser4_key item_key; ++ reiser4_key file_key; ++ ++ assert("nikita-760", inode != NULL); ++ assert("nikita-761", coord != NULL); ++ ++ return coord_is_existing_item(coord) && ++ (get_key_objectid(build_sd_key(inode, &file_key)) == ++ get_key_objectid(item_key_by_coord(coord, &item_key))); ++} ++ ++/* this is common implementation of owns_item method of file plugin ++ for typical directory ++*/ ++int owns_item_common_dir(const struct inode *inode, /* object to check against */ ++ const coord_t * coord /* coord of item to check */ ) ++{ ++ reiser4_key item_key; ++ ++ assert("nikita-1335", inode != NULL); ++ assert("nikita-1334", coord != NULL); ++ ++ if (item_type_by_coord(coord) == DIR_ENTRY_ITEM_TYPE) ++ return get_key_locality(item_key_by_coord(coord, &item_key)) == ++ get_inode_oid(inode); ++ else ++ return owns_item_common(inode, coord); ++} ++ ++/* this is common implementation of can_add_link method of file plugin ++ checks whether yet another hard links to this object can be added ++*/ ++int can_add_link_common(const struct inode *object /* object to check */ ) ++{ ++ assert("nikita-732", object != NULL); ++ ++ /* inode->i_nlink is unsigned int, so just check for integer ++ overflow */ ++ return object->i_nlink + 1 != 0; ++} ++ ++/* this is common implementation of can_rem_link method of file plugin for ++ typical directory ++*/ ++int can_rem_link_common_dir(const struct inode *inode) ++{ ++ /* is_dir_empty() returns 0 is dir is empty */ ++ return !is_dir_empty(inode); ++} ++ ++/* this is common implementation of detach method of file plugin for typical ++ directory ++*/ ++int detach_common_dir(struct inode *child, struct inode *parent) ++{ ++ dir_plugin *dplug; ++ ++ dplug = inode_dir_plugin(child); ++ assert("nikita-2883", dplug != NULL); ++ assert("nikita-2884", dplug->detach != NULL); ++ return dplug->detach(child, parent); ++} ++ ++/* this is common implementation of bind method of file plugin for typical ++ directory ++*/ ++int bind_common_dir(struct inode *child, struct inode *parent) ++{ ++ dir_plugin *dplug; ++ ++ dplug = inode_dir_plugin(child); ++ assert("nikita-2646", dplug != NULL); ++ return dplug->attach(child, parent); ++} ++ ++static int process_truncate(struct inode *, __u64 size); ++ ++/* this is common implementation of safelink method of file plugin ++ */ ++int safelink_common(struct inode *object, reiser4_safe_link_t link, __u64 value) ++{ ++ int result; ++ ++ assert("vs-1705", get_current_context()->trans->atom == NULL); ++ if (link == SAFE_UNLINK) ++ /* nothing to do. iput() in the caller (process_safelink) will ++ * finish with file */ ++ result = 0; ++ else if (link == SAFE_TRUNCATE) ++ result = process_truncate(object, value); ++ else { ++ warning("nikita-3438", "Unrecognized safe-link type: %i", link); ++ result = RETERR(-EIO); ++ } ++ return result; ++} ++ ++/* this is common implementation of estimate.create method of file plugin ++ can be used when object creation involves insertion of one item (usually stat ++ data) into tree ++*/ ++reiser4_block_nr estimate_create_common(const struct inode * object) ++{ ++ return estimate_one_insert_item(tree_by_inode(object)); ++} ++ ++/* this is common implementation of estimate.create method of file plugin for ++ typical directory ++ can be used when directory creation involves insertion of two items (usually ++ stat data and item containing "." 
and "..") into tree ++*/ ++reiser4_block_nr estimate_create_common_dir(const struct inode * object) ++{ ++ return 2 * estimate_one_insert_item(tree_by_inode(object)); ++} ++ ++/* this is common implementation of estimate.update method of file plugin ++ can be used when stat data update does not do more than inserting a unit ++ into a stat data item which is probably true for most cases ++*/ ++reiser4_block_nr estimate_update_common(const struct inode * inode) ++{ ++ return estimate_one_insert_into_item(tree_by_inode(inode)); ++} ++ ++/* this is common implementation of estimate.unlink method of file plugin ++ */ ++reiser4_block_nr ++estimate_unlink_common(const struct inode * object UNUSED_ARG, ++ const struct inode * parent UNUSED_ARG) ++{ ++ return 0; ++} ++ ++/* this is common implementation of estimate.unlink method of file plugin for ++ typical directory ++*/ ++reiser4_block_nr ++estimate_unlink_common_dir(const struct inode * object, ++ const struct inode * parent) ++{ ++ dir_plugin *dplug; ++ ++ dplug = inode_dir_plugin(object); ++ assert("nikita-2888", dplug != NULL); ++ assert("nikita-2887", dplug->estimate.unlink != NULL); ++ return dplug->estimate.unlink(object, parent); ++} ++ ++char *wire_write_common(struct inode *inode, char *start) ++{ ++ return build_inode_onwire(inode, start); ++} ++ ++char *wire_read_common(char *addr, reiser4_object_on_wire * obj) ++{ ++ return extract_obj_key_id_from_onwire(addr, &obj->u.std.key_id); ++} ++ ++struct dentry *wire_get_common(struct super_block *sb, ++ reiser4_object_on_wire * obj) ++{ ++ struct inode *inode; ++ struct dentry *dentry; ++ reiser4_key key; ++ ++ extract_key_from_id(&obj->u.std.key_id, &key); ++ inode = reiser4_iget(sb, &key, 1); ++ if (!IS_ERR(inode)) { ++ reiser4_iget_complete(inode); ++ dentry = d_alloc_anon(inode); ++ if (dentry == NULL) { ++ iput(inode); ++ dentry = ERR_PTR(-ENOMEM); ++ } else ++ dentry->d_op = &get_super_private(sb)->ops.dentry; ++ } else if (PTR_ERR(inode) == -ENOENT) ++ /* ++ * inode wasn't found at the key encoded in the file ++ * handle. Hence, file handle is stale. ++ */ ++ dentry = ERR_PTR(RETERR(-ESTALE)); ++ else ++ dentry = (void *)inode; ++ return dentry; ++} ++ ++int wire_size_common(struct inode *inode) ++{ ++ return inode_onwire_size(inode); ++} ++ ++void wire_done_common(reiser4_object_on_wire * obj) ++{ ++ /* nothing to do */ ++} ++ ++/* helper function to print errors */ ++static void key_warning(const reiser4_key * key /* key to print */ , ++ const struct inode *inode, ++ int code /* error code to print */ ) ++{ ++ assert("nikita-716", key != NULL); ++ ++ if (code != -ENOMEM) { ++ warning("nikita-717", "Error for inode %llu (%i)", ++ (unsigned long long)get_key_objectid(key), code); ++ print_key("for key", key); ++ } ++} ++ ++/* NIKITA-FIXME-HANS: perhaps this function belongs in another file? 
*/ ++#if REISER4_DEBUG ++static void ++check_inode_seal(const struct inode *inode, ++ const coord_t * coord, const reiser4_key * key) ++{ ++ reiser4_key unit_key; ++ ++ unit_key_by_coord(coord, &unit_key); ++ assert("nikita-2752", ++ WITH_DATA_RET(coord->node, 1, keyeq(key, &unit_key))); ++ assert("nikita-2753", get_inode_oid(inode) == get_key_objectid(key)); ++} ++ ++static void check_sd_coord(coord_t * coord, const reiser4_key * key) ++{ ++ reiser4_key ukey; ++ ++ coord_clear_iplug(coord); ++ if (zload(coord->node)) ++ return; ++ ++ if (!coord_is_existing_unit(coord) || ++ !item_plugin_by_coord(coord) || ++ !keyeq(unit_key_by_coord(coord, &ukey), key) || ++ (znode_get_level(coord->node) != LEAF_LEVEL) || ++ !item_is_statdata(coord)) { ++ warning("nikita-1901", "Conspicuous seal"); ++ print_key("key", key); ++ print_coord("coord", coord, 1); ++ impossible("nikita-2877", "no way"); ++ } ++ zrelse(coord->node); ++} ++ ++#else ++#define check_inode_seal(inode, coord, key) noop ++#define check_sd_coord(coord, key) noop ++#endif ++ ++/* insert new stat-data into tree. Called with inode state ++ locked. Return inode state locked. */ ++static int insert_new_sd(struct inode *inode /* inode to create sd for */ ) ++{ ++ int result; ++ reiser4_key key; ++ coord_t coord; ++ reiser4_item_data data; ++ char *area; ++ reiser4_inode *ref; ++ lock_handle lh; ++ oid_t oid; ++ ++ assert("nikita-723", inode != NULL); ++ assert("nikita-3406", inode_get_flag(inode, REISER4_NO_SD)); ++ ++ ref = reiser4_inode_data(inode); ++ spin_lock_inode(inode); ++ ++ if (ref->plugin_mask != 0) ++ /* inode has non-standard plugins */ ++ inode_set_extension(inode, PLUGIN_STAT); ++ /* ++ * prepare specification of new item to be inserted ++ */ ++ ++ data.iplug = inode_sd_plugin(inode); ++ data.length = data.iplug->s.sd.save_len(inode); ++ spin_unlock_inode(inode); ++ ++ data.data = NULL; ++ data.user = 0; ++/* could be optimized for case where there is only one node format in ++ * use in the filesystem, probably there are lots of such ++ * places we could optimize for only one node layout.... -Hans */ ++ if (data.length > tree_by_inode(inode)->nplug->max_item_size()) { ++ /* This is silly check, but we don't know actual node where ++ insertion will go into. */ ++ return RETERR(-ENAMETOOLONG); ++ } ++ oid = oid_allocate(inode->i_sb); ++/* NIKITA-FIXME-HANS: what is your opinion on whether this error check should be encapsulated into oid_allocate? */ ++ if (oid == ABSOLUTE_MAX_OID) ++ return RETERR(-EOVERFLOW); ++ ++ set_inode_oid(inode, oid); ++ ++ coord_init_zero(&coord); ++ init_lh(&lh); ++ ++ result = insert_by_key(tree_by_inode(inode), ++ build_sd_key(inode, &key), &data, &coord, &lh, ++ /* stat data lives on a leaf level */ ++ LEAF_LEVEL, CBK_UNIQUE); ++ ++ /* we don't want to re-check that somebody didn't insert ++ stat-data while we were doing io, because if it did, ++ insert_by_key() returned error. */ ++ /* but what _is_ possible is that plugin for inode's stat-data, ++ list of non-standard plugins or their state would change ++ during io, so that stat-data wouldn't fit into sd. To avoid ++ this race we keep inode_state lock. This lock has to be ++ taken each time you access inode in a way that would cause ++ changes in sd size: changing plugins etc. ++ */ ++ ++ if (result == IBK_INSERT_OK) { ++ coord_clear_iplug(&coord); ++ result = zload(coord.node); ++ if (result == 0) { ++ /* have we really inserted stat data? */ ++ assert("nikita-725", item_is_statdata(&coord)); ++ ++ /* inode was just created. 
It is inserted into hash ++ table, but no directory entry was yet inserted into ++ parent. So, inode is inaccessible through ++ ->lookup(). All places that directly grab inode ++ from hash-table (like old knfsd), should check ++ IMMUTABLE flag that is set by common_create_child. ++ */ ++ assert("nikita-3240", data.iplug != NULL); ++ assert("nikita-3241", data.iplug->s.sd.save != NULL); ++ area = item_body_by_coord(&coord); ++ result = data.iplug->s.sd.save(inode, &area); ++ znode_make_dirty(coord.node); ++ if (result == 0) { ++ /* object has stat-data now */ ++ inode_clr_flag(inode, REISER4_NO_SD); ++ inode_set_flag(inode, REISER4_SDLEN_KNOWN); ++ /* initialise stat-data seal */ ++ seal_init(&ref->sd_seal, &coord, &key); ++ ref->sd_coord = coord; ++ check_inode_seal(inode, &coord, &key); ++ } else if (result != -ENOMEM) ++ /* ++ * convert any other error code to -EIO to ++ * avoid confusing user level with unexpected ++ * errors. ++ */ ++ result = RETERR(-EIO); ++ zrelse(coord.node); ++ } ++ } ++ done_lh(&lh); ++ ++ if (result != 0) ++ key_warning(&key, inode, result); ++ else ++ oid_count_allocated(); ++ ++ return result; ++} ++ ++/* find sd of inode in a tree, deal with errors */ ++int lookup_sd(struct inode *inode /* inode to look sd for */ , ++ znode_lock_mode lock_mode /* lock mode */ , ++ coord_t * coord /* resulting coord */ , ++ lock_handle * lh /* resulting lock handle */ , ++ const reiser4_key * key /* resulting key */ , ++ int silent) ++{ ++ int result; ++ __u32 flags; ++ ++ assert("nikita-1692", inode != NULL); ++ assert("nikita-1693", coord != NULL); ++ assert("nikita-1694", key != NULL); ++ ++ /* look for the object's stat data in a tree. ++ This returns in "node" pointer to a locked znode and in "pos" ++ position of an item found in node. Both are only valid if ++ coord_found is returned. */ ++ flags = (lock_mode == ZNODE_WRITE_LOCK) ? CBK_FOR_INSERT : 0; ++ flags |= CBK_UNIQUE; ++ /* ++ * traverse tree to find stat data. We cannot use vroot here, because ++ * it only covers _body_ of the file, and stat data don't belong ++ * there. 
++ */ ++ result = coord_by_key(tree_by_inode(inode), ++ key, ++ coord, ++ lh, ++ lock_mode, ++ FIND_EXACT, LEAF_LEVEL, LEAF_LEVEL, flags, NULL); ++ if (REISER4_DEBUG && result == 0) ++ check_sd_coord(coord, key); ++ ++ if (result != 0 && !silent) ++ key_warning(key, inode, result); ++ return result; ++} ++ ++static int ++locate_inode_sd(struct inode *inode, ++ reiser4_key * key, coord_t * coord, lock_handle * lh) ++{ ++ reiser4_inode *state; ++ seal_t seal; ++ int result; ++ ++ assert("nikita-3483", inode != NULL); ++ ++ state = reiser4_inode_data(inode); ++ spin_lock_inode(inode); ++ *coord = state->sd_coord; ++ coord_clear_iplug(coord); ++ seal = state->sd_seal; ++ spin_unlock_inode(inode); ++ ++ build_sd_key(inode, key); ++ if (seal_is_set(&seal)) { ++ /* first, try to use seal */ ++ result = seal_validate(&seal, ++ coord, ++ key, ++ lh, ZNODE_WRITE_LOCK, ZNODE_LOCK_LOPRI); ++ if (result == 0) ++ check_sd_coord(coord, key); ++ } else ++ result = -E_REPEAT; ++ ++ if (result != 0) { ++ coord_init_zero(coord); ++ result = lookup_sd(inode, ZNODE_WRITE_LOCK, coord, lh, key, 0); ++ } ++ return result; ++} ++ ++/* update stat-data at @coord */ ++static int ++update_sd_at(struct inode *inode, coord_t * coord, reiser4_key * key, ++ lock_handle * lh) ++{ ++ int result; ++ reiser4_item_data data; ++ char *area; ++ reiser4_inode *state; ++ znode *loaded; ++ ++ state = reiser4_inode_data(inode); ++ ++ coord_clear_iplug(coord); ++ result = zload(coord->node); ++ if (result != 0) ++ return result; ++ loaded = coord->node; ++ ++ spin_lock_inode(inode); ++ assert("nikita-728", inode_sd_plugin(inode) != NULL); ++ data.iplug = inode_sd_plugin(inode); ++ ++ /* if inode has non-standard plugins, add appropriate stat data ++ * extension */ ++ if (state->plugin_mask != 0) ++ inode_set_extension(inode, PLUGIN_STAT); ++ ++ /* data.length is how much space to add to (or remove ++ from if negative) sd */ ++ if (!inode_get_flag(inode, REISER4_SDLEN_KNOWN)) { ++ /* recalculate stat-data length */ ++ data.length = ++ data.iplug->s.sd.save_len(inode) - ++ item_length_by_coord(coord); ++ inode_set_flag(inode, REISER4_SDLEN_KNOWN); ++ } else ++ data.length = 0; ++ spin_unlock_inode(inode); ++ ++ /* if on-disk stat data is of different length than required ++ for this inode, resize it */ ++ if (data.length != 0) { ++ data.data = NULL; ++ data.user = 0; ++ ++ /* insertion code requires that insertion point (coord) was ++ * between units. */ ++ coord->between = AFTER_UNIT; ++ result = resize_item(coord, ++ &data, key, lh, COPI_DONT_SHIFT_LEFT); ++ if (result != 0) { ++ key_warning(key, inode, result); ++ zrelse(loaded); ++ return result; ++ } ++ if (loaded != coord->node) { ++ /* resize_item moved coord to another node. Zload it */ ++ zrelse(loaded); ++ coord_clear_iplug(coord); ++ result = zload(coord->node); ++ if (result != 0) ++ return result; ++ loaded = coord->node; ++ } ++ } ++ ++ area = item_body_by_coord(coord); ++ spin_lock_inode(inode); ++ result = data.iplug->s.sd.save(inode, &area); ++ znode_make_dirty(coord->node); ++ ++ /* re-initialise stat-data seal */ ++ ++ /* ++ * coord.between was possibly skewed from AT_UNIT when stat-data size ++ * was changed and new extensions were pasted into item. ++ */ ++ coord->between = AT_UNIT; ++ seal_init(&state->sd_seal, coord, key); ++ state->sd_coord = *coord; ++ spin_unlock_inode(inode); ++ check_inode_seal(inode, coord, key); ++ zrelse(loaded); ++ return result; ++} ++ ++/* Update existing stat-data in a tree. Called with inode state locked. 
Return ++ inode state locked. */ ++static int update_sd(struct inode *inode /* inode to update sd for */ ) ++{ ++ int result; ++ reiser4_key key; ++ coord_t coord; ++ lock_handle lh; ++ ++ assert("nikita-726", inode != NULL); ++ ++ /* no stat-data, nothing to update?! */ ++ assert("nikita-3482", !inode_get_flag(inode, REISER4_NO_SD)); ++ ++ init_lh(&lh); ++ ++ result = locate_inode_sd(inode, &key, &coord, &lh); ++ if (result == 0) ++ result = update_sd_at(inode, &coord, &key, &lh); ++ done_lh(&lh); ++ ++ return result; ++} ++ ++/* helper for delete_object_common and delete_directory_common. Remove object ++ stat data. Space for that must be reserved by caller before ++*/ ++static int ++common_object_delete_no_reserve(struct inode *inode /* object to remove */ ) ++{ ++ int result; ++ ++ assert("nikita-1477", inode != NULL); ++ ++ if (!inode_get_flag(inode, REISER4_NO_SD)) { ++ reiser4_key sd_key; ++ ++ DQUOT_FREE_INODE(inode); ++ DQUOT_DROP(inode); ++ ++ build_sd_key(inode, &sd_key); ++ result = ++ cut_tree(tree_by_inode(inode), &sd_key, &sd_key, NULL, 0); ++ if (result == 0) { ++ inode_set_flag(inode, REISER4_NO_SD); ++ result = oid_release(inode->i_sb, get_inode_oid(inode)); ++ if (result == 0) { ++ oid_count_released(); ++ ++ result = safe_link_del(tree_by_inode(inode), ++ get_inode_oid(inode), ++ SAFE_UNLINK); ++ } ++ } ++ } else ++ result = 0; ++ return result; ++} ++ ++/* helper for safelink_common */ ++static int process_truncate(struct inode *inode, __u64 size) ++{ ++ int result; ++ struct iattr attr; ++ file_plugin *fplug; ++ reiser4_context *ctx; ++ struct dentry dentry; ++ ++ assert("vs-21", is_in_reiser4_context()); ++ ctx = init_context(inode->i_sb); ++ assert("vs-22", !IS_ERR(ctx)); ++ ++ attr.ia_size = size; ++ attr.ia_valid = ATTR_SIZE | ATTR_CTIME; ++ fplug = inode_file_plugin(inode); ++ ++ mutex_lock(&inode->i_mutex); ++ assert("vs-1704", get_current_context()->trans->atom == NULL); ++ dentry.d_inode = inode; ++ result = inode->i_op->setattr(&dentry, &attr); ++ mutex_unlock(&inode->i_mutex); ++ ++ context_set_commit_async(ctx); ++ reiser4_exit_context(ctx); ++ ++ return result; ++} ++ ++/* Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +Index: linux-2.6.16/fs/reiser4/plugin/hash.c +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/plugin/hash.c +@@ -0,0 +1,350 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++/* Hash functions */ ++ ++#include "../debug.h" ++#include "plugin_header.h" ++#include "plugin.h" ++#include "../super.h" ++#include "../inode.h" ++ ++#include ++ ++/* old rupasov (yura) hash */ ++static __u64 hash_rupasov(const unsigned char *name /* name to hash */ , ++ int len /* @name's length */ ) ++{ ++ int i; ++ int j; ++ int pow; ++ __u64 a; ++ __u64 c; ++ ++ assert("nikita-672", name != NULL); ++ assert("nikita-673", len >= 0); ++ ++ for (pow = 1, i = 1; i < len; ++i) ++ pow = pow * 10; ++ ++ if (len == 1) ++ a = name[0] - 48; ++ else ++ a = (name[0] - 48) * pow; ++ ++ for (i = 1; i < len; ++i) { ++ c = name[i] - 48; ++ for (pow = 1, j = i; j < len - 1; ++j) ++ pow = pow * 10; ++ a = a + c * pow; ++ } ++ for (; i < 40; ++i) { ++ c = '0' - 48; ++ for (pow = 1, j = i; j < len - 1; ++j) ++ pow = pow * 10; ++ a = a + c * pow; ++ } ++ ++ for (; i < 256; ++i) { ++ c = i; ++ for (pow = 1, j = i; j < len - 1; ++j) ++ pow = pow * 10; ++ a = a + c * pow; ++ } ++ ++ a = 
a << 7; ++ return a; ++} ++ ++/* r5 hash */ ++static __u64 hash_r5(const unsigned char *name /* name to hash */ , ++ int len UNUSED_ARG /* @name's length */ ) ++{ ++ __u64 a = 0; ++ ++ assert("nikita-674", name != NULL); ++ assert("nikita-675", len >= 0); ++ ++ while (*name) { ++ a += *name << 4; ++ a += *name >> 4; ++ a *= 11; ++ name++; ++ } ++ return a; ++} ++ ++/* Keyed 32-bit hash function using TEA in a Davis-Meyer function ++ H0 = Key ++ Hi = E Mi(Hi-1) + Hi-1 ++ ++ (see Applied Cryptography, 2nd edition, p448). ++ ++ Jeremy Fitzhardinge 1998 ++ ++ Jeremy has agreed to the contents of reiserfs/README. -Hans ++ ++ This code was blindly upgraded to __u64 by s/__u32/__u64/g. ++*/ ++static __u64 hash_tea(const unsigned char *name /* name to hash */ , ++ int len /* @name's length */ ) ++{ ++ __u64 k[] = { 0x9464a485u, 0x542e1a94u, 0x3e846bffu, 0xb75bcfc3u }; ++ ++ __u64 h0 = k[0], h1 = k[1]; ++ __u64 a, b, c, d; ++ __u64 pad; ++ int i; ++ ++ assert("nikita-676", name != NULL); ++ assert("nikita-677", len >= 0); ++ ++#define DELTA 0x9E3779B9u ++#define FULLROUNDS 10 /* 32 is overkill, 16 is strong crypto */ ++#define PARTROUNDS 6 /* 6 gets complete mixing */ ++ ++/* a, b, c, d - data; h0, h1 - accumulated hash */ ++#define TEACORE(rounds) \ ++ do { \ ++ __u64 sum = 0; \ ++ int n = rounds; \ ++ __u64 b0, b1; \ ++ \ ++ b0 = h0; \ ++ b1 = h1; \ ++ \ ++ do \ ++ { \ ++ sum += DELTA; \ ++ b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b); \ ++ b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d); \ ++ } while(--n); \ ++ \ ++ h0 += b0; \ ++ h1 += b1; \ ++ } while(0) ++ ++ pad = (__u64) len | ((__u64) len << 8); ++ pad |= pad << 16; ++ ++ while (len >= 16) { ++ a = (__u64) name[0] | (__u64) name[1] << 8 | (__u64) name[2] << ++ 16 | (__u64) name[3] << 24; ++ b = (__u64) name[4] | (__u64) name[5] << 8 | (__u64) name[6] << ++ 16 | (__u64) name[7] << 24; ++ c = (__u64) name[8] | (__u64) name[9] << 8 | (__u64) name[10] << ++ 16 | (__u64) name[11] << 24; ++ d = (__u64) name[12] | (__u64) name[13] << 8 | (__u64) name[14] ++ << 16 | (__u64) name[15] << 24; ++ ++ TEACORE(PARTROUNDS); ++ ++ len -= 16; ++ name += 16; ++ } ++ ++ if (len >= 12) { ++ //assert(len < 16); ++ if (len >= 16) ++ *(int *)0 = 0; ++ ++ a = (__u64) name[0] | (__u64) name[1] << 8 | (__u64) name[2] << ++ 16 | (__u64) name[3] << 24; ++ b = (__u64) name[4] | (__u64) name[5] << 8 | (__u64) name[6] << ++ 16 | (__u64) name[7] << 24; ++ c = (__u64) name[8] | (__u64) name[9] << 8 | (__u64) name[10] << ++ 16 | (__u64) name[11] << 24; ++ ++ d = pad; ++ for (i = 12; i < len; i++) { ++ d <<= 8; ++ d |= name[i]; ++ } ++ } else if (len >= 8) { ++ //assert(len < 12); ++ if (len >= 12) ++ *(int *)0 = 0; ++ a = (__u64) name[0] | (__u64) name[1] << 8 | (__u64) name[2] << ++ 16 | (__u64) name[3] << 24; ++ b = (__u64) name[4] | (__u64) name[5] << 8 | (__u64) name[6] << ++ 16 | (__u64) name[7] << 24; ++ ++ c = d = pad; ++ for (i = 8; i < len; i++) { ++ c <<= 8; ++ c |= name[i]; ++ } ++ } else if (len >= 4) { ++ //assert(len < 8); ++ if (len >= 8) ++ *(int *)0 = 0; ++ a = (__u64) name[0] | (__u64) name[1] << 8 | (__u64) name[2] << ++ 16 | (__u64) name[3] << 24; ++ ++ b = c = d = pad; ++ for (i = 4; i < len; i++) { ++ b <<= 8; ++ b |= name[i]; ++ } ++ } else { ++ //assert(len < 4); ++ if (len >= 4) ++ *(int *)0 = 0; ++ a = b = c = d = pad; ++ for (i = 0; i < len; i++) { ++ a <<= 8; ++ a |= name[i]; ++ } ++ } ++ ++ TEACORE(FULLROUNDS); ++ ++/* return 0;*/ ++ return h0 ^ h1; ++ ++} ++ ++/* classical 64 bit Fowler/Noll/Vo-1 (FNV-1) hash. 
++ ++ See http://www.isthe.com/chongo/tech/comp/fnv/ for details. ++ ++ Excerpts: ++ ++ FNV hashes are designed to be fast while maintaining a low collision ++ rate. ++ ++ [This version also seems to preserve lexicographical order locally.] ++ ++ FNV hash algorithms and source code have been released into the public ++ domain. ++ ++*/ ++static __u64 hash_fnv1(const unsigned char *name /* name to hash */ , ++ int len UNUSED_ARG /* @name's length */ ) ++{ ++ unsigned long long a = 0xcbf29ce484222325ull; ++ const unsigned long long fnv_64_prime = 0x100000001b3ull; ++ ++ assert("nikita-678", name != NULL); ++ assert("nikita-679", len >= 0); ++ ++ /* FNV-1 hash each octet in the buffer */ ++ for (; *name; ++name) { ++ /* multiply by the 32 bit FNV magic prime mod 2^64 */ ++ a *= fnv_64_prime; ++ /* xor the bottom with the current octet */ ++ a ^= (unsigned long long)(*name); ++ } ++ /* return our new hash value */ ++ return a; ++} ++ ++/* degenerate hash function used to simplify testing of non-unique key ++ handling */ ++static __u64 hash_deg(const unsigned char *name UNUSED_ARG /* name to hash */ , ++ int len UNUSED_ARG /* @name's length */ ) ++{ ++ return 0xc0c0c0c010101010ull; ++} ++ ++static int change_hash(struct inode *inode, reiser4_plugin * plugin) ++{ ++ int result; ++ ++ assert("nikita-3503", inode != NULL); ++ assert("nikita-3504", plugin != NULL); ++ ++ assert("nikita-3505", is_reiser4_inode(inode)); ++ assert("nikita-3506", inode_dir_plugin(inode) != NULL); ++ assert("nikita-3507", plugin->h.type_id == REISER4_HASH_PLUGIN_TYPE); ++ ++ result = 0; ++ if (inode_hash_plugin(inode) == NULL || ++ inode_hash_plugin(inode)->h.id != plugin->h.id) { ++ if (is_dir_empty(inode) == 0) ++ result = ++ plugin_set_hash(&reiser4_inode_data(inode)->pset, ++ &plugin->hash); ++ else ++ result = RETERR(-ENOTEMPTY); ++ ++ } ++ return result; ++} ++ ++static reiser4_plugin_ops hash_plugin_ops = { ++ .init = NULL, ++ .load = NULL, ++ .save_len = NULL, ++ .save = NULL, ++ .change = change_hash ++}; ++ ++/* hash plugins */ ++hash_plugin hash_plugins[LAST_HASH_ID] = { ++ [RUPASOV_HASH_ID] = { ++ .h = { ++ .type_id = REISER4_HASH_PLUGIN_TYPE, ++ .id = RUPASOV_HASH_ID, ++ .pops = &hash_plugin_ops, ++ .label = "rupasov", ++ .desc = "Original Yura's hash", ++ .linkage = {NULL, NULL} ++ }, ++ .hash = hash_rupasov ++ }, ++ [R5_HASH_ID] = { ++ .h = { ++ .type_id = REISER4_HASH_PLUGIN_TYPE, ++ .id = R5_HASH_ID, ++ .pops = &hash_plugin_ops, ++ .label = "r5", ++ .desc = "r5 hash", ++ .linkage = {NULL, NULL} ++ }, ++ .hash = hash_r5 ++ }, ++ [TEA_HASH_ID] = { ++ .h = { ++ .type_id = REISER4_HASH_PLUGIN_TYPE, ++ .id = TEA_HASH_ID, ++ .pops = &hash_plugin_ops, ++ .label = "tea", ++ .desc = "tea hash", ++ .linkage = {NULL, NULL} ++ }, ++ .hash = hash_tea ++ }, ++ [FNV1_HASH_ID] = { ++ .h = { ++ .type_id = REISER4_HASH_PLUGIN_TYPE, ++ .id = FNV1_HASH_ID, ++ .pops = &hash_plugin_ops, ++ .label = "fnv1", ++ .desc = "fnv1 hash", ++ .linkage = {NULL, NULL} ++ }, ++ .hash = hash_fnv1 ++ }, ++ [DEGENERATE_HASH_ID] = { ++ .h = { ++ .type_id = REISER4_HASH_PLUGIN_TYPE, ++ .id = DEGENERATE_HASH_ID, ++ .pops = &hash_plugin_ops, ++ .label = "degenerate hash", ++ .desc = "Degenerate hash: only for testing", ++ .linkage = {NULL, NULL} ++ }, ++ .hash = hash_deg ++ } ++}; ++ ++/* Make Linus happy. 
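The r5 and fnv1 functions above are self-contained and easy to exercise in user space. A hypothetical test harness (not part of the patch; uint64_t replaces __u64, and note that 0x100000001b3 is in fact the 64 bit FNV prime, despite the "32 bit" wording in the comment):

#include <stdint.h>
#include <stdio.h>

static uint64_t r5(const unsigned char *name)
{
	uint64_t a = 0;
	while (*name) {            /* same loop body as hash_r5() above */
		a += *name << 4;
		a += *name >> 4;
		a *= 11;
		name++;
	}
	return a;
}

static uint64_t fnv1(const unsigned char *name)
{
	uint64_t a = 0xcbf29ce484222325ull;       /* FNV-1 64 bit offset basis */
	const uint64_t prime = 0x100000001b3ull;  /* 64 bit FNV prime */

	for (; *name; ++name) {
		a *= prime;        /* FNV-1 multiplies first, then xors */
		a ^= (uint64_t)*name;
	}
	return a;
}

int main(void)
{
	const unsigned char n[] = "lost+found";
	printf("r5:   %016llx\n", (unsigned long long)r5(n));
	printf("fnv1: %016llx\n", (unsigned long long)fnv1(n));
	return 0;
}

Both, like the kernel versions, key off the NUL terminator rather than the len argument.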
++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +Index: linux-2.6.16/fs/reiser4/plugin/inode_ops.c +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/plugin/inode_ops.c +@@ -0,0 +1,886 @@ ++/* ++ * Copyright 2005 by Hans Reiser, licensing governed by reiser4/README ++ */ ++ ++/* ++ * this file contains typical implementations for most of methods of struct ++ * inode_operations ++ */ ++ ++#include "../inode.h" ++#include "../safe_link.h" ++ ++#include ++#include ++ ++ ++static int create_vfs_object(struct inode *parent, struct dentry *dentry, ++ reiser4_object_create_data *data); ++ ++/** ++ * create_common - create of inode operations ++ * @parent: inode of parent directory ++ * @dentry: dentry of new object to create ++ * @mode: the permissions to use ++ * @nameidata: ++ * ++ * This is common implementation of vfs's create method of struct ++ * inode_operations. ++ * Creates regular file using file plugin from parent directory plugin set. ++ */ ++int create_common(struct inode *parent, struct dentry *dentry, ++ int mode, struct nameidata *nameidata) ++{ ++ reiser4_object_create_data data; ++ ++ memset(&data, 0, sizeof data); ++ data.mode = S_IFREG | mode; ++ data.id = inode_regular_plugin(parent)->id; ++ return create_vfs_object(parent, dentry, &data); ++} ++ ++int lookup_name(struct inode *dir, struct dentry *, reiser4_key *); ++void check_light_weight(struct inode *inode, struct inode *parent); ++ ++/** ++ * lookup_common - lookup of inode operations ++ * @parent: inode of directory to lookup into ++ * @dentry: name to look for ++ * @nameidata: ++ * ++ * This is common implementation of vfs's lookup method of struct ++ * inode_operations. ++ */ ++struct dentry *lookup_common(struct inode *parent, struct dentry *dentry, ++ struct nameidata *nameidata) ++{ ++ reiser4_context *ctx; ++ int result; ++ struct dentry *new; ++ struct inode *inode; ++ reiser4_dir_entry_desc entry; ++ ++ ctx = init_context(parent->i_sb); ++ if (IS_ERR(ctx)) ++ return (struct dentry *)ctx; ++ ++ /* set up operations on dentry. */ ++ dentry->d_op = &get_super_private(parent->i_sb)->ops.dentry; ++ ++ result = lookup_name(parent, dentry, &entry.key); ++ if (result) { ++ context_set_commit_async(ctx); ++ reiser4_exit_context(ctx); ++ if (result == -ENOENT) { ++ /* object not found */ ++ if (!IS_DEADDIR(parent)) ++ d_add(dentry, NULL); ++ return NULL; ++ } ++ return ERR_PTR(result); ++ } ++ ++ inode = reiser4_iget(parent->i_sb, &entry.key, 0); ++ if (IS_ERR(inode)) { ++ context_set_commit_async(ctx); ++ reiser4_exit_context(ctx); ++ return ERR_PTR(PTR_ERR(inode)); ++ } ++ ++ /* success */ ++ check_light_weight(inode, parent); ++ new = d_splice_alias(inode, dentry); ++ reiser4_iget_complete(inode); ++ ++ /* prevent balance_dirty_pages() from being called: we don't want to ++ * do this under directory i_mutex. */ ++ context_set_commit_async(ctx); ++ reiser4_exit_context(ctx); ++ return new; ++} ++ ++static reiser4_block_nr common_estimate_link(struct inode *parent, ++ struct inode *object); ++int reiser4_update_dir(struct inode *); ++ ++/** ++ * link_common - link of inode operations ++ * @existing: dentry of object which is to get new name ++ * @parent: directory where new name is to be created ++ * @newname: new name ++ * ++ * This is common implementation of vfs's link method of struct ++ * inode_operations. 
++ */ ++int link_common(struct dentry *existing, struct inode *parent, ++ struct dentry *newname) ++{ ++ reiser4_context *ctx; ++ int result; ++ struct inode *object; ++ dir_plugin *parent_dplug; ++ reiser4_dir_entry_desc entry; ++ reiser4_object_create_data data; ++ reiser4_block_nr reserve; ++ ++ ctx = init_context(parent->i_sb); ++ if (IS_ERR(ctx)) ++ return PTR_ERR(ctx); ++ ++ assert("nikita-1431", existing != NULL); ++ assert("nikita-1432", parent != NULL); ++ assert("nikita-1433", newname != NULL); ++ ++ object = existing->d_inode; ++ assert("nikita-1434", object != NULL); ++ ++ /* check for race with create_object() */ ++ if (inode_get_flag(object, REISER4_IMMUTABLE)) { ++ context_set_commit_async(ctx); ++ reiser4_exit_context(ctx); ++ return RETERR(-E_REPEAT); ++ } ++ ++ parent_dplug = inode_dir_plugin(parent); ++ ++ memset(&entry, 0, sizeof entry); ++ entry.obj = object; ++ ++ data.mode = object->i_mode; ++ data.id = inode_file_plugin(object)->h.id; ++ ++ reserve = common_estimate_link(parent, existing->d_inode); ++ if ((__s64) reserve < 0) { ++ context_set_commit_async(ctx); ++ reiser4_exit_context(ctx); ++ return reserve; ++ } ++ ++ if (reiser4_grab_space(reserve, BA_CAN_COMMIT)) { ++ context_set_commit_async(ctx); ++ reiser4_exit_context(ctx); ++ return RETERR(-ENOSPC); ++ } ++ ++ /* ++ * Subtle race handling: sys_link() doesn't take i_mutex on @parent. It ++ * means that link(2) can race against unlink(2) or rename(2), and ++ * inode is dead (->i_nlink == 0) when reiser4_link() is entered. ++ * ++ * For such inode we have to undo special processing done in ++ * reiser4_unlink() viz. creation of safe-link. ++ */ ++ if (unlikely(object->i_nlink == 0)) { ++ result = safe_link_del(tree_by_inode(object), ++ get_inode_oid(object), SAFE_UNLINK); ++ if (result != 0) { ++ context_set_commit_async(ctx); ++ reiser4_exit_context(ctx); ++ return result; ++ } ++ } ++ ++ /* increment nlink of @existing and update its stat data */ ++ result = reiser4_add_nlink(object, parent, 1); ++ if (result == 0) { ++ /* add entry to the parent */ ++ result = ++ parent_dplug->add_entry(parent, newname, &data, &entry); ++ if (result != 0) { ++ /* failed to add entry to the parent, decrement nlink ++ of @existing */ ++ reiser4_del_nlink(object, parent, 1); ++ /* ++ * now, if that failed, we have a file with too big ++ * nlink---space leak, much better than directory ++ * entry pointing to nowhere ++ */ ++ } ++ } ++ if (result == 0) { ++ atomic_inc(&object->i_count); ++ /* ++ * Upon successful completion, link() shall mark for update ++ * the st_ctime field of the file. Also, the st_ctime and ++ * st_mtime fields of the directory that contains the new ++ * entry shall be marked for update. --SUS ++ */ ++ result = reiser4_update_dir(parent); ++ } ++ if (result == 0) ++ d_instantiate(newname, existing->d_inode); ++ ++ context_set_commit_async(ctx); ++ reiser4_exit_context(ctx); ++ return result; ++} ++ ++static int unlink_check_and_grab(struct inode *parent, struct dentry *victim); ++ ++/** ++ * unlink_common - unlink of inode operations ++ * @parent: inode of directory to remove name from ++ * @victim: name to be removed ++ * ++ * This is common implementation of vfs's unlink method of struct ++ * inode_operations. 
++ */ ++int unlink_common(struct inode *parent, struct dentry *victim) ++{ ++ reiser4_context *ctx; ++ int result; ++ struct inode *object; ++ file_plugin *fplug; ++ ++ ctx = init_context(parent->i_sb); ++ if (IS_ERR(ctx)) ++ return PTR_ERR(ctx); ++ ++ object = victim->d_inode; ++ fplug = inode_file_plugin(object); ++ assert("nikita-2882", fplug->detach != NULL); ++ ++ result = unlink_check_and_grab(parent, victim); ++ if (result != 0) { ++ context_set_commit_async(ctx); ++ reiser4_exit_context(ctx); ++ return result; ++ } ++ ++ result = fplug->detach(object, parent); ++ if (result == 0) { ++ dir_plugin *parent_dplug; ++ reiser4_dir_entry_desc entry; ++ ++ parent_dplug = inode_dir_plugin(parent); ++ memset(&entry, 0, sizeof entry); ++ ++ /* first, delete directory entry */ ++ result = parent_dplug->rem_entry(parent, victim, &entry); ++ if (result == 0) { ++ /* ++ * if name was removed successfully, we _have_ to ++ * return 0 from this function, because upper level ++ * caller (vfs_{rmdir,unlink}) expect this. ++ * ++ * now that directory entry is removed, update ++ * stat-data ++ */ ++ reiser4_del_nlink(object, parent, 1); ++ /* ++ * Upon successful completion, unlink() shall mark for ++ * update the st_ctime and st_mtime fields of the ++ * parent directory. Also, if the file's link count is ++ * not 0, the st_ctime field of the file shall be ++ * marked for update. --SUS ++ */ ++ reiser4_update_dir(parent); ++ /* add safe-link for this file */ ++ if (object->i_nlink == 0) ++ safe_link_add(object, SAFE_UNLINK); ++ } ++ } ++ ++ if (unlikely(result != 0)) { ++ if (result != -ENOMEM) ++ warning("nikita-3398", "Cannot unlink %llu (%i)", ++ (unsigned long long)get_inode_oid(object), ++ result); ++ /* if operation failed commit pending inode modifications to ++ * the stat-data */ ++ reiser4_update_sd(object); ++ reiser4_update_sd(parent); ++ } ++ ++ reiser4_release_reserved(object->i_sb); ++ ++ /* @object's i_ctime was updated by ->rem_link() method(). */ ++ ++ /* @victim can be already removed from the disk by this time. Inode is ++ then marked so that iput() wouldn't try to remove stat data. But ++ inode itself is still there. ++ */ ++ ++ /* ++ * we cannot release directory semaphore here, because name has ++ * already been deleted, but dentry (@victim) still exists. Prevent ++ * balance_dirty_pages() from being called on exiting this context: we ++ * don't want to do this under directory i_mutex. ++ */ ++ context_set_commit_async(ctx); ++ reiser4_exit_context(ctx); ++ return result; ++} ++ ++/** ++ * symlink_common - symlink of inode operations ++ * @parent: inode of parent directory ++ * @dentry: dentry of object to be created ++ * @linkname: string symlink is to contain ++ * ++ * This is common implementation of vfs's symlink method of struct ++ * inode_operations. ++ * Creates object using file plugin SYMLINK_FILE_PLUGIN_ID. ++ */ ++int symlink_common(struct inode *parent, struct dentry *dentry, ++ const char *linkname) ++{ ++ reiser4_object_create_data data; ++ ++ memset(&data, 0, sizeof data); ++ data.name = linkname; ++ data.id = SYMLINK_FILE_PLUGIN_ID; ++ data.mode = S_IFLNK | S_IRWXUGO; ++ return create_vfs_object(parent, dentry, &data); ++} ++ ++/** ++ * mkdir_common - mkdir of inode operations ++ * @parent: inode of parent directory ++ * @dentry: dentry of object to be created ++ * @mode: the permissions to use ++ * ++ * This is common implementation of vfs's mkdir method of struct ++ * inode_operations. ++ * Creates object using file plugin DIRECTORY_FILE_PLUGIN_ID. 
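symlink_common() above and mkdir_common()/mknod_common() below all follow the shape of create_common(): fill a reiser4_object_create_data parameter block, then funnel into the shared create_vfs_object() worker, so the creation, quota, and reservation logic lives in exactly one place. The shape in miniature (a hypothetical user-space reduction, not the reiser4 API):

#include <stdio.h>
#include <string.h>

struct create_data { unsigned mode; int id; const char *name; };

static int create_object(const struct create_data *d)  /* the single worker */
{
	printf("create: id=%d mode=%o name=%s\n",
	       d->id, d->mode, d->name ? d->name : "(none)");
	return 0;
}

static int my_mkdir(unsigned mode)
{
	struct create_data d;

	memset(&d, 0, sizeof d);  /* same memset-then-fill idiom as the patch */
	d.mode = 040000 | mode;   /* S_IFDIR */
	d.id = 2;                 /* stands in for DIRECTORY_FILE_PLUGIN_ID */
	return create_object(&d);
}

int main(void) { return my_mkdir(0755); }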
++ */ ++int mkdir_common(struct inode *parent, struct dentry *dentry, int mode) ++{ ++ reiser4_object_create_data data; ++ ++ memset(&data, 0, sizeof data); ++ data.mode = S_IFDIR | mode; ++ data.id = DIRECTORY_FILE_PLUGIN_ID; ++ return create_vfs_object(parent, dentry, &data); ++} ++ ++/** ++ * mknod_common - mknod of inode operations ++ * @parent: inode of parent directory ++ * @dentry: dentry of object to be created ++ * @mode: the permissions to use and file type ++ * @rdev: minor and major of new device file ++ * ++ * This is common implementation of vfs's mknod method of struct ++ * inode_operations. ++ * Creates object using file plugin SPECIAL_FILE_PLUGIN_ID. ++ */ ++int mknod_common(struct inode *parent, struct dentry *dentry, ++ int mode, dev_t rdev) ++{ ++ reiser4_object_create_data data; ++ ++ memset(&data, 0, sizeof data); ++ data.mode = mode; ++ data.rdev = rdev; ++ data.id = SPECIAL_FILE_PLUGIN_ID; ++ return create_vfs_object(parent, dentry, &data); ++} ++ ++/* ++ * implementation of vfs's rename method of struct inode_operations for typical ++ * directory is in inode_ops_rename.c ++ */ ++ ++/** ++ * follow_link_common - follow_link of inode operations ++ * @dentry: dentry of symlink ++ * @data: ++ * ++ * This is common implementation of vfs's followlink method of struct ++ * inode_operations. ++ * Assumes that inode's generic_ip points to the content of symbolic link. ++ */ ++void *follow_link_common(struct dentry *dentry, struct nameidata *nd) ++{ ++ assert("vs-851", S_ISLNK(dentry->d_inode->i_mode)); ++ ++ if (!dentry->d_inode->u.generic_ip ++ || !inode_get_flag(dentry->d_inode, REISER4_GENERIC_PTR_USED)) ++ return ERR_PTR(RETERR(-EINVAL)); ++ nd_set_link(nd, dentry->d_inode->u.generic_ip); ++ return NULL; ++} ++ ++/** ++ * permission_common - permission of inode operations ++ * @inode: inode to check permissions for ++ * @mask: mode bits to check permissions for ++ * @nameidata: ++ * ++ * Uses generic function to check for rwx permissions. ++ */ ++int permission_common(struct inode *inode, int mask, ++ struct nameidata *nameidata) ++{ ++ return generic_permission(inode, mask, NULL); ++} ++ ++static int setattr_reserve(reiser4_tree *); ++ ++/* this is common implementation of vfs's setattr method of struct ++ inode_operations ++*/ ++int setattr_common(struct dentry *dentry, struct iattr *attr) ++{ ++ reiser4_context *ctx; ++ struct inode *inode; ++ int result; ++ ++ inode = dentry->d_inode; ++ result = inode_change_ok(inode, attr); ++ if (result) ++ return result; ++ ++ ctx = init_context(inode->i_sb); ++ if (IS_ERR(ctx)) ++ return PTR_ERR(ctx); ++ ++ assert("nikita-3119", !(attr->ia_valid & ATTR_SIZE)); ++ ++ /* ++ * grab disk space and call standard inode_setattr(). ++ */ ++ result = setattr_reserve(tree_by_inode(inode)); ++ if (!result) { ++ if ((attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ++ || (attr->ia_valid & ATTR_GID ++ && attr->ia_gid != inode->i_gid)) { ++ result = DQUOT_TRANSFER(inode, attr) ? 
-EDQUOT : 0; ++ if (result) { ++ context_set_commit_async(ctx); ++ reiser4_exit_context(ctx); ++ return result; ++ } ++ } ++ result = inode_setattr(inode, attr); ++ if (!result) ++ reiser4_update_sd(inode); ++ } ++ ++ context_set_commit_async(ctx); ++ reiser4_exit_context(ctx); ++ return result; ++} ++ ++/* this is the common implementation of vfs's getattr method of struct ++ inode_operations ++*/ ++int ++getattr_common(struct vfsmount *mnt UNUSED_ARG, struct dentry *dentry, ++ struct kstat *stat) ++{ ++ struct inode *obj; ++ ++ assert("nikita-2298", dentry != NULL); ++ assert("nikita-2299", stat != NULL); ++ assert("nikita-2300", dentry->d_inode != NULL); ++ ++ obj = dentry->d_inode; ++ ++ stat->dev = obj->i_sb->s_dev; ++ stat->ino = oid_to_uino(get_inode_oid(obj)); ++ stat->mode = obj->i_mode; ++ /* don't confuse userland with a huge nlink. This is not entirely ++ * correct, because nlink_t is not necessarily 16 bit signed. */ ++ stat->nlink = min(obj->i_nlink, (typeof(obj->i_nlink)) 0x7fff); ++ stat->uid = obj->i_uid; ++ stat->gid = obj->i_gid; ++ stat->rdev = obj->i_rdev; ++ stat->atime = obj->i_atime; ++ stat->mtime = obj->i_mtime; ++ stat->ctime = obj->i_ctime; ++ stat->size = obj->i_size; ++ stat->blocks = ++ (inode_get_bytes(obj) + VFS_BLKSIZE - 1) >> VFS_BLKSIZE_BITS; ++ /* "preferred" blocksize for efficient file system I/O */ ++ stat->blksize = get_super_private(obj->i_sb)->optimal_io_size; ++ ++ return 0; ++} ++ ++/* Estimate the maximum number of nodes which might be allocated or changed on ++ typical new object creation. Typical creation consists of calling the create ++ method of the file plugin, adding a directory entry to the parent, and updating the parent ++ directory's stat data. ++*/ ++static reiser4_block_nr estimate_create_vfs_object(struct inode *parent, /* parent object */ ++ struct inode *object ++ /* object */ ) ++{ ++ assert("vpf-309", parent != NULL); ++ assert("vpf-307", object != NULL); ++ ++ return ++ /* object creation estimation */ ++ inode_file_plugin(object)->estimate.create(object) + ++ /* stat data of parent directory estimation */ ++ inode_file_plugin(parent)->estimate.update(parent) + ++ /* adding entry estimation */ ++ inode_dir_plugin(parent)->estimate.add_entry(parent) + ++ /* to undo in the case of failure */ ++ inode_dir_plugin(parent)->estimate.rem_entry(parent); ++} ++ ++/* Create a child in a directory. ++ ++ . get object's plugin ++ . get fresh inode ++ . initialize inode ++ . add object's stat-data ++ . initialize object's directory ++ . add entry to the parent ++ . 
instantiate dentry ++ ++*/ ++static int do_create_vfs_child(reiser4_object_create_data * data, /* parameters of new ++ object */ ++ struct inode **retobj) ++{ ++ int result; ++ ++ struct dentry *dentry; /* parent object */ ++ struct inode *parent; /* new name */ ++ ++ dir_plugin *par_dir; /* directory plugin on the parent */ ++ dir_plugin *obj_dir; /* directory plugin on the new object */ ++ file_plugin *obj_plug; /* object plugin on the new object */ ++ struct inode *object; /* new object */ ++ reiser4_block_nr reserve; ++ ++ reiser4_dir_entry_desc entry; /* new directory entry */ ++ ++ assert("nikita-1420", data != NULL); ++ parent = data->parent; ++ dentry = data->dentry; ++ ++ assert("nikita-1418", parent != NULL); ++ assert("nikita-1419", dentry != NULL); ++ ++ /* check, that name is acceptable for parent */ ++ par_dir = inode_dir_plugin(parent); ++ if (par_dir->is_name_acceptable && ++ !par_dir->is_name_acceptable(parent, ++ dentry->d_name.name, ++ (int)dentry->d_name.len)) ++ return RETERR(-ENAMETOOLONG); ++ ++ result = 0; ++ obj_plug = file_plugin_by_id((int)data->id); ++ if (obj_plug == NULL) { ++ warning("nikita-430", "Cannot find plugin %i", data->id); ++ return RETERR(-ENOENT); ++ } ++ object = new_inode(parent->i_sb); ++ if (object == NULL) ++ return RETERR(-ENOMEM); ++ /* we'll update i_nlink below */ ++ object->i_nlink = 0; ++ /* new_inode() initializes i_ino to "arbitrary" value. Reset it to 0, ++ * to simplify error handling: if some error occurs before i_ino is ++ * initialized with oid, i_ino should already be set to some ++ * distinguished value. */ ++ object->i_ino = 0; ++ ++ /* So that on error iput will be called. */ ++ *retobj = object; ++ ++ if (DQUOT_ALLOC_INODE(object)) { ++ DQUOT_DROP(object); ++ object->i_flags |= S_NOQUOTA; ++ return RETERR(-EDQUOT); ++ } ++ ++ memset(&entry, 0, sizeof entry); ++ entry.obj = object; ++ ++ plugin_set_file(&reiser4_inode_data(object)->pset, obj_plug); ++ result = obj_plug->set_plug_in_inode(object, parent, data); ++ if (result) { ++ warning("nikita-431", "Cannot install plugin %i on %llx", ++ data->id, (unsigned long long)get_inode_oid(object)); ++ DQUOT_FREE_INODE(object); ++ object->i_flags |= S_NOQUOTA; ++ return result; ++ } ++ ++ /* reget plugin after installation */ ++ obj_plug = inode_file_plugin(object); ++ ++ if (obj_plug->create_object == NULL) { ++ DQUOT_FREE_INODE(object); ++ object->i_flags |= S_NOQUOTA; ++ return RETERR(-EPERM); ++ } ++ ++ /* if any of hash, tail, sd or permission plugins for newly created ++ object are not set yet set them here inheriting them from parent ++ directory ++ */ ++ assert("nikita-2070", obj_plug->adjust_to_parent != NULL); ++ result = obj_plug->adjust_to_parent(object, ++ parent, ++ object->i_sb->s_root->d_inode); ++ if (result != 0) { ++ warning("nikita-432", "Cannot inherit from %llx to %llx", ++ (unsigned long long)get_inode_oid(parent), ++ (unsigned long long)get_inode_oid(object)); ++ DQUOT_FREE_INODE(object); ++ object->i_flags |= S_NOQUOTA; ++ return result; ++ } ++ ++ /* setup inode and file-operations for this inode */ ++ setup_inode_ops(object, data); ++ ++ /* call file plugin's method to initialize plugin specific part of ++ * inode */ ++ if (obj_plug->init_inode_data) ++ obj_plug->init_inode_data(object, data, 1 /*create */ ); ++ ++ /* obtain directory plugin (if any) for new object. 
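The adjust_to_parent step above fills in any plugins the new object did not get explicitly (hash, tail, sd, permission) by inheriting them from the parent directory. The inheritance rule in isolation, using a hypothetical three-member plugin set (the member values are illustrative ids, 0 meaning "unset"):

#include <stdio.h>

struct pset { int hash, formatting, perm; };  /* 0 == not set yet */

/* copy every still-unset member from the parent, as adjust_to_parent() does */
static void inherit(struct pset *child, const struct pset *parent)
{
	if (child->hash == 0)
		child->hash = parent->hash;
	if (child->formatting == 0)
		child->formatting = parent->formatting;
	if (child->perm == 0)
		child->perm = parent->perm;
}

int main(void)
{
	struct pset parent = { 2, 3, 1 };  /* e.g. r5 hash, smart tail, unix perm */
	struct pset child = { 0, 5, 0 };   /* only formatting was set explicitly */

	inherit(&child, &parent);
	printf("hash=%d formatting=%d perm=%d\n",
	       child.hash, child.formatting, child.perm);  /* 2 5 1 */
	return 0;
}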
*/ ++ obj_dir = inode_dir_plugin(object); ++ if (obj_dir != NULL && obj_dir->init == NULL) { ++ DQUOT_FREE_INODE(object); ++ object->i_flags |= S_NOQUOTA; ++ return RETERR(-EPERM); ++ } ++ ++ reiser4_inode_data(object)->locality_id = get_inode_oid(parent); ++ ++ reserve = estimate_create_vfs_object(parent, object); ++ if (reiser4_grab_space(reserve, BA_CAN_COMMIT)) { ++ DQUOT_FREE_INODE(object); ++ object->i_flags |= S_NOQUOTA; ++ return RETERR(-ENOSPC); ++ } ++ ++ /* mark inode `immutable'. We disable changes to the file being ++ created until valid directory entry for it is inserted. Otherwise, ++ if file were expanded and insertion of directory entry fails, we ++ have to remove file, but we only alloted enough space in ++ transaction to remove _empty_ file. 3.x code used to remove stat ++ data in different transaction thus possibly leaking disk space on ++ crash. This all only matters if it's possible to access file ++ without name, for example, by inode number ++ */ ++ inode_set_flag(object, REISER4_IMMUTABLE); ++ ++ /* create empty object, this includes allocation of new objectid. For ++ directories this implies creation of dot and dotdot */ ++ assert("nikita-2265", inode_get_flag(object, REISER4_NO_SD)); ++ ++ /* mark inode as `loaded'. From this point onward ++ reiser4_delete_inode() will try to remove its stat-data. */ ++ inode_set_flag(object, REISER4_LOADED); ++ ++ result = obj_plug->create_object(object, parent, data); ++ if (result != 0) { ++ inode_clr_flag(object, REISER4_IMMUTABLE); ++ if (result != -ENAMETOOLONG && result != -ENOMEM) ++ warning("nikita-2219", ++ "Failed to create sd for %llu", ++ (unsigned long long)get_inode_oid(object)); ++ DQUOT_FREE_INODE(object); ++ object->i_flags |= S_NOQUOTA; ++ return result; ++ } ++ ++ if (obj_dir != NULL) ++ result = obj_dir->init(object, parent, data); ++ if (result == 0) { ++ assert("nikita-434", !inode_get_flag(object, REISER4_NO_SD)); ++ /* insert inode into VFS hash table */ ++ insert_inode_hash(object); ++ /* create entry */ ++ result = par_dir->add_entry(parent, dentry, data, &entry); ++ if (result == 0) { ++ result = reiser4_add_nlink(object, parent, 0); ++ /* If O_CREAT is set and the file did not previously ++ exist, upon successful completion, open() shall ++ mark for update the st_atime, st_ctime, and ++ st_mtime fields of the file and the st_ctime and ++ st_mtime fields of the parent directory. --SUS ++ */ ++ /* @object times are already updated by ++ reiser4_add_nlink() */ ++ if (result == 0) ++ reiser4_update_dir(parent); ++ if (result != 0) ++ /* cleanup failure to add nlink */ ++ par_dir->rem_entry(parent, dentry, &entry); ++ } ++ if (result != 0) ++ /* cleanup failure to add entry */ ++ obj_plug->detach(object, parent); ++ } else if (result != -ENOMEM) ++ warning("nikita-2219", "Failed to initialize dir for %llu: %i", ++ (unsigned long long)get_inode_oid(object), result); ++ ++ /* ++ * update stat-data, committing all pending modifications to the inode ++ * fields. ++ */ ++ reiser4_update_sd(object); ++ if (result != 0) { ++ DQUOT_FREE_INODE(object); ++ object->i_flags |= S_NOQUOTA; ++ /* if everything was ok (result == 0), parent stat-data is ++ * already updated above (update_parent_dir()) */ ++ reiser4_update_sd(parent); ++ /* failure to create entry, remove object */ ++ obj_plug->delete_object(object); ++ } ++ ++ /* file has name now, clear immutable flag */ ++ inode_clr_flag(object, REISER4_IMMUTABLE); ++ ++ /* on error, iput() will call ->delete_inode(). 
We should keep track ++ of the existence of stat-data for this inode and avoid attempts to ++ remove it in reiser4_delete_inode(). This is accomplished through the ++ REISER4_NO_SD bit in inode.u.reiser4_i.plugin.flags ++ */ ++ return result; ++} ++ ++/* this is a helper for the common implementations of reiser4_mkdir, reiser4_create, ++ reiser4_mknod and reiser4_symlink ++*/ ++static int ++create_vfs_object(struct inode *parent, ++ struct dentry *dentry, reiser4_object_create_data * data) ++{ ++ reiser4_context *ctx; ++ int result; ++ struct inode *child; ++ ++ ctx = init_context(parent->i_sb); ++ if (IS_ERR(ctx)) ++ return PTR_ERR(ctx); ++ context_set_commit_async(ctx); ++ ++ data->parent = parent; ++ data->dentry = dentry; ++ child = NULL; ++ result = do_create_vfs_child(data, &child); ++ if (unlikely(result != 0)) { ++ if (child != NULL) { ++ reiser4_make_bad_inode(child); ++ iput(child); ++ } ++ } else ++ d_instantiate(dentry, child); ++ ++ reiser4_exit_context(ctx); ++ return result; ++} ++ ++/* helper for link_common. Estimate the disk space necessary to add a link ++ from @parent to @object ++*/ ++static reiser4_block_nr common_estimate_link(struct inode *parent, /* parent directory */ ++ struct inode *object ++ /* object to which the new link is being created */ ++ ) ++{ ++ reiser4_block_nr res = 0; ++ file_plugin *fplug; ++ dir_plugin *dplug; ++ ++ assert("vpf-317", object != NULL); ++ assert("vpf-318", parent != NULL); ++ ++ fplug = inode_file_plugin(object); ++ dplug = inode_dir_plugin(parent); ++ /* VS-FIXME-HANS: why do we do fplug->estimate.update(object) twice instead of multiplying by 2? */ ++ /* reiser4_add_nlink(object) */ ++ res += fplug->estimate.update(object); ++ /* add_entry(parent) */ ++ res += dplug->estimate.add_entry(parent); ++ /* reiser4_del_nlink(object) */ ++ res += fplug->estimate.update(object); ++ /* update_dir(parent) */ ++ res += inode_file_plugin(parent)->estimate.update(parent); ++ /* safe-link */ ++ res += estimate_one_item_removal(tree_by_inode(object)); ++ ++ return res; ++} ++ ++/* Estimate the disk space necessary to remove a link between @parent and ++ @object. ++*/ ++static reiser4_block_nr estimate_unlink(struct inode *parent, /* parent directory */ ++ struct inode *object ++ /* object whose link is being removed */ ++ ) ++{ ++ reiser4_block_nr res = 0; ++ file_plugin *fplug; ++ dir_plugin *dplug; ++ ++ assert("vpf-317", object != NULL); ++ assert("vpf-318", parent != NULL); ++ ++ fplug = inode_file_plugin(object); ++ dplug = inode_dir_plugin(parent); ++ ++ /* rem_entry(parent) */ ++ res += dplug->estimate.rem_entry(parent); ++ /* reiser4_del_nlink(object) */ ++ res += fplug->estimate.update(object); ++ /* update_dir(parent) */ ++ res += inode_file_plugin(parent)->estimate.update(parent); ++ /* fplug->unlink */ ++ res += fplug->estimate.unlink(object, parent); ++ /* safe-link */ ++ res += estimate_one_insert_item(tree_by_inode(object)); ++ ++ return res; ++} ++ ++/* helper for unlink_common. Estimate and grab space for unlink. 
*/ ++static int unlink_check_and_grab(struct inode *parent, struct dentry *victim) ++{ ++ file_plugin *fplug; ++ struct inode *child; ++ int result; ++ ++ result = 0; ++ child = victim->d_inode; ++ fplug = inode_file_plugin(child); ++ ++ /* check for race with create_object() */ ++ if (inode_get_flag(child, REISER4_IMMUTABLE)) ++ return RETERR(-E_REPEAT); ++ /* object being deleted should have stat data */ ++ assert("vs-949", !inode_get_flag(child, REISER4_NO_SD)); ++ ++ /* ask object plugin */ ++ if (fplug->can_rem_link != NULL && !fplug->can_rem_link(child)) ++ return RETERR(-ENOTEMPTY); ++ ++ result = (int)estimate_unlink(parent, child); ++ if (result < 0) ++ return result; ++ ++ return reiser4_grab_reserved(child->i_sb, result, BA_CAN_COMMIT); ++} ++ ++/* helper for setattr_common */ ++static int setattr_reserve(reiser4_tree * tree) ++{ ++ assert("vs-1096", is_grab_enabled(get_current_context())); ++ return reiser4_grab_space(estimate_one_insert_into_item(tree), ++ BA_CAN_COMMIT); ++} ++ ++/* helper function. Standards require that for many file-system operations ++ on success ctime and mtime of parent directory is to be updated. */ ++int reiser4_update_dir(struct inode *dir) ++{ ++ assert("nikita-2525", dir != NULL); ++ ++ dir->i_ctime = dir->i_mtime = CURRENT_TIME; ++ return reiser4_update_sd(dir); ++} +Index: linux-2.6.16/fs/reiser4/plugin/inode_ops_rename.c +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/plugin/inode_ops_rename.c +@@ -0,0 +1,904 @@ ++/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++#include "../inode.h" ++#include "../safe_link.h" ++ ++static const char *possible_leak = "Possible disk space leak."; ++ ++/* re-bind existing name at @from_coord in @from_dir to point to @to_inode. ++ ++ Helper function called from hashed_rename() */ ++static int replace_name(struct inode *to_inode, /* inode where @from_coord is ++ * to be re-targeted at */ ++ struct inode *from_dir, /* directory where @from_coord ++ * lives */ ++ struct inode *from_inode, /* inode @from_coord ++ * originally point to */ ++ coord_t * from_coord, /* where directory entry is in ++ * the tree */ ++ lock_handle * from_lh /* lock handle on @from_coord */ ) ++{ ++ item_plugin *from_item; ++ int result; ++ znode *node; ++ ++ coord_clear_iplug(from_coord); ++ node = from_coord->node; ++ result = zload(node); ++ if (result != 0) ++ return result; ++ from_item = item_plugin_by_coord(from_coord); ++ if (item_type_by_coord(from_coord) == DIR_ENTRY_ITEM_TYPE) { ++ reiser4_key to_key; ++ ++ build_sd_key(to_inode, &to_key); ++ ++ /* everything is found and prepared to change directory entry ++ at @from_coord to point to @to_inode. ++ ++ @to_inode is just about to get new name, so bump its link ++ counter. ++ ++ */ ++ result = reiser4_add_nlink(to_inode, from_dir, 0); ++ if (result != 0) { ++ /* Don't issue warning: this may be plain -EMLINK */ ++ zrelse(node); ++ return result; ++ } ++ ++ result = ++ from_item->s.dir.update_key(from_coord, &to_key, from_lh); ++ if (result != 0) { ++ reiser4_del_nlink(to_inode, from_dir, 0); ++ zrelse(node); ++ return result; ++ } ++ ++ /* @from_inode just lost its name, he-he. ++ ++ If @from_inode was directory, it contained dotdot pointing ++ to @from_dir. @from_dir i_nlink will be decreased when ++ iput() will be called on @from_inode. 
++ ++ If the file-system is not ADG (hard-links are ++ supported on directories), iput(from_inode) will not remove ++ @from_inode, and thus the above is incorrect, but hard-links on ++ directories are problematic in many other respects. ++ */ ++ result = reiser4_del_nlink(from_inode, from_dir, 0); ++ if (result != 0) { ++ warning("nikita-2330", ++ "Cannot remove link from source: %i. %s", ++ result, possible_leak); ++ } ++ /* Has to return success, because the entry is already ++ * modified. */ ++ result = 0; ++ ++ /* NOTE-NIKITA consider calling a plugin method instead of ++ accessing inode fields directly. */ ++ from_dir->i_mtime = CURRENT_TIME; ++ } else { ++ warning("nikita-2326", "Unexpected item type"); ++ result = RETERR(-EIO); ++ } ++ zrelse(node); ++ return result; ++} ++ ++/* add a new entry pointing to @inode into @dir at @coord, locked by @lh ++ ++ Helper function used by hashed_rename(). */ ++static int add_name(struct inode *inode, /* inode where @coord is to be ++ * re-targeted at */ ++ struct inode *dir, /* directory where @coord lives */ ++ struct dentry *name, /* new name */ ++ coord_t * coord, /* where directory entry is in the tree */ ++ lock_handle * lh, /* lock handle on @coord */ ++ int is_dir /* true, if @inode is a directory */ ) ++{ ++ int result; ++ reiser4_dir_entry_desc entry; ++ ++ assert("nikita-2333", lh->node == coord->node); ++ assert("nikita-2334", is_dir == S_ISDIR(inode->i_mode)); ++ ++ memset(&entry, 0, sizeof entry); ++ entry.obj = inode; ++ /* build key of directory entry description */ ++ inode_dir_plugin(dir)->build_entry_key(dir, &name->d_name, &entry.key); ++ ++ /* ext2 does this in a different order: first it inserts the new entry, ++ then increases directory nlink. We don't want to do this, ++ because reiser4_add_nlink() calls the ->add_link() plugin ++ method that can fail for whatever reason, leaving us with ++ cleanup problems. ++ */ ++ /* @inode is getting a new name */ ++ reiser4_add_nlink(inode, dir, 0); ++ /* create @new_name in @new_dir pointing to ++ @old_inode */ ++ result = WITH_COORD(coord, ++ inode_dir_item_plugin(dir)->s.dir.add_entry(dir, ++ coord, ++ lh, ++ name, ++ &entry)); ++ if (result != 0) { ++ int result2; ++ result2 = reiser4_del_nlink(inode, dir, 0); ++ if (result2 != 0) { ++ warning("nikita-2327", ++ "Cannot drop link on %lli %i. %s", ++ (unsigned long long)get_inode_oid(inode), ++ result2, possible_leak); ++ } ++ } else ++ INODE_INC_FIELD(dir, i_size); ++ return result; ++} ++ ++static reiser4_block_nr estimate_rename(struct inode *old_dir, /* directory where @old is located */ ++ struct dentry *old_name, /* old name */ ++ struct inode *new_dir, /* directory where @new is located */ ++ struct dentry *new_name /* new name */ ) ++{ ++ reiser4_block_nr res1, res2; ++ dir_plugin *p_parent_old, *p_parent_new; ++ file_plugin *p_child_old, *p_child_new; ++ ++ assert("vpf-311", old_dir != NULL); ++ assert("vpf-312", new_dir != NULL); ++ assert("vpf-313", old_name != NULL); ++ assert("vpf-314", new_name != NULL); ++ ++ p_parent_old = inode_dir_plugin(old_dir); ++ p_parent_new = inode_dir_plugin(new_dir); ++ p_child_old = inode_file_plugin(old_name->d_inode); ++ if (new_name->d_inode) ++ p_child_new = inode_file_plugin(new_name->d_inode); ++ else ++ p_child_new = NULL; ++ ++ /* find_entry - can insert one leaf. 
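estimate_rename() below prices the two mutually exclusive outcomes separately (replace_name() when the target name exists, add_name() when it does not), keeps the larger of the two, and then adds the costs both paths share. The arithmetic in miniature, with made-up block counts standing in for the plugins' estimate methods:

#include <stdio.h>

int main(void)
{
	/* hypothetical worst-case block counts per sub-operation */
	unsigned long res1 = 1 + 2 * 2 + 1;     /* replace_name: nlink x2, key update */
	unsigned long res2 = 1 + 2 * 2 + 2 + 3; /* add_name: nlink updates, add_entry */
	unsigned long shared = 2 + 2 + 2 + 1;   /* sd writes, rem_entry, del_nlink */

	/* only one of the two paths runs, so reserve for whichever is worse */
	unsigned long reserve = (res1 < res2 ? res2 : res1) + shared;

	printf("grab %lu blocks before starting the rename\n", reserve);
	return 0;
}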
*/ ++ res1 = res2 = 1; ++ ++ /* replace_name */ ++ { ++ /* reiser4_add_nlink(p_child_old) and reiser4_del_nlink(p_child_old) */ ++ res1 += 2 * p_child_old->estimate.update(old_name->d_inode); ++ /* update key */ ++ res1 += 1; ++ /* reiser4_del_nlink(p_child_new) */ ++ if (p_child_new) ++ res1 += p_child_new->estimate.update(new_name->d_inode); ++ } ++ ++ /* else add_name */ ++ { ++ /* reiser4_add_nlink(p_parent_new) and reiser4_del_nlink(p_parent_new) */ ++ res2 += ++ 2 * inode_file_plugin(new_dir)->estimate.update(new_dir); ++ /* reiser4_add_nlink(p_parent_old) */ ++ res2 += p_child_old->estimate.update(old_name->d_inode); ++ /* add_entry(p_parent_new) */ ++ res2 += p_parent_new->estimate.add_entry(new_dir); ++ /* reiser4_del_nlink(p_parent_old) */ ++ res2 += p_child_old->estimate.update(old_name->d_inode); ++ } ++ ++ res1 = res1 < res2 ? res2 : res1; ++ ++ /* reiser4_write_sd(p_parent_new) */ ++ res1 += inode_file_plugin(new_dir)->estimate.update(new_dir); ++ ++ /* reiser4_write_sd(p_child_new) */ ++ if (p_child_new) ++ res1 += p_child_new->estimate.update(new_name->d_inode); ++ ++ /* hashed_rem_entry(p_parent_old) */ ++ res1 += p_parent_old->estimate.rem_entry(old_dir); ++ ++ /* reiser4_del_nlink(p_child_old) */ ++ res1 += p_child_old->estimate.update(old_name->d_inode); ++ ++ /* replace_name */ ++ { ++ /* reiser4_add_nlink(p_parent_dir_new) */ ++ res1 += inode_file_plugin(new_dir)->estimate.update(new_dir); ++ /* update_key */ ++ res1 += 1; ++ /* reiser4_del_nlink(p_parent_new) */ ++ res1 += inode_file_plugin(new_dir)->estimate.update(new_dir); ++ /* reiser4_del_nlink(p_parent_old) */ ++ res1 += inode_file_plugin(old_dir)->estimate.update(old_dir); ++ } ++ ++ /* reiser4_write_sd(p_parent_old) */ ++ res1 += inode_file_plugin(old_dir)->estimate.update(old_dir); ++ ++ /* reiser4_write_sd(p_child_old) */ ++ res1 += p_child_old->estimate.update(old_name->d_inode); ++ ++ return res1; ++} ++ ++static int hashed_rename_estimate_and_grab(struct inode *old_dir, /* directory where @old is located */ ++ struct dentry *old_name, /* old name */ ++ struct inode *new_dir, /* directory where @new is located */ ++ struct dentry *new_name ++ /* new name */ ) ++{ ++ reiser4_block_nr reserve; ++ ++ reserve = estimate_rename(old_dir, old_name, new_dir, new_name); ++ ++ if (reiser4_grab_space(reserve, BA_CAN_COMMIT)) ++ return RETERR(-ENOSPC); ++ ++ return 0; ++} ++ ++/* check whether @old_inode and @new_inode can be moved within file system ++ * tree. This singles out attempts to rename pseudo-files, for example. */ ++static int can_rename(struct inode *old_dir, struct inode *old_inode, ++ struct inode *new_dir, struct inode *new_inode) ++{ ++ file_plugin *fplug; ++ dir_plugin *dplug; ++ ++ assert("nikita-3370", old_inode != NULL); ++ ++ dplug = inode_dir_plugin(new_dir); ++ fplug = inode_file_plugin(old_inode); ++ ++ if (dplug == NULL) ++ return RETERR(-ENOTDIR); ++ else if (new_dir->i_op->create == NULL) ++ return RETERR(-EPERM); ++ else if (!fplug->can_add_link(old_inode)) ++ return RETERR(-EMLINK); ++ else if (new_inode != NULL) { ++ fplug = inode_file_plugin(new_inode); ++ if (fplug->can_rem_link != NULL && ++ !fplug->can_rem_link(new_inode)) ++ return RETERR(-EBUSY); ++ } ++ return 0; ++} ++ ++int find_entry(struct inode *, struct dentry *, lock_handle *, ++ znode_lock_mode, reiser4_dir_entry_desc *); ++int reiser4_update_dir(struct inode *); ++ ++/* this is common implementation of vfs's rename method of struct ++ inode_operations ++ See comments in the body. 
++ ++ It is arguable that this function can be made generic so, that it ++ will be applicable to any kind of directory plugin that deals with ++ directories composed out of directory entries. The only obstacle ++ here is that we don't have any data-type to represent directory ++ entry. This should be re-considered when more than one different ++ directory plugin will be implemented. ++*/ ++int rename_common(struct inode *old_dir /* directory where @old is located */ , ++ struct dentry *old_name /* old name */ , ++ struct inode *new_dir /* directory where @new is located */ , ++ struct dentry *new_name /* new name */ ) ++{ ++ /* From `The Open Group Base Specifications Issue 6' ++ ++ If either the old or new argument names a symbolic link, rename() ++ shall operate on the symbolic link itself, and shall not resolve ++ the last component of the argument. If the old argument and the new ++ argument resolve to the same existing file, rename() shall return ++ successfully and perform no other action. ++ ++ [this is done by VFS: vfs_rename()] ++ ++ If the old argument points to the pathname of a file that is not a ++ directory, the new argument shall not point to the pathname of a ++ directory. ++ ++ [checked by VFS: vfs_rename->may_delete()] ++ ++ If the link named by the new argument exists, it shall ++ be removed and old renamed to new. In this case, a link named new ++ shall remain visible to other processes throughout the renaming ++ operation and refer either to the file referred to by new or old ++ before the operation began. ++ ++ [we should assure this] ++ ++ Write access permission is required for ++ both the directory containing old and the directory containing new. ++ ++ [checked by VFS: vfs_rename->may_delete(), may_create()] ++ ++ If the old argument points to the pathname of a directory, the new ++ argument shall not point to the pathname of a file that is not a ++ directory. ++ ++ [checked by VFS: vfs_rename->may_delete()] ++ ++ If the directory named by the new argument exists, it ++ shall be removed and old renamed to new. In this case, a link named ++ new shall exist throughout the renaming operation and shall refer ++ either to the directory referred to by new or old before the ++ operation began. ++ ++ [we should assure this] ++ ++ If new names an existing directory, it shall be ++ required to be an empty directory. ++ ++ [we should check this] ++ ++ If the old argument points to a pathname of a symbolic link, the ++ symbolic link shall be renamed. If the new argument points to a ++ pathname of a symbolic link, the symbolic link shall be removed. ++ ++ The new pathname shall not contain a path prefix that names ++ old. Write access permission is required for the directory ++ containing old and the directory containing new. If the old ++ argument points to the pathname of a directory, write access ++ permission may be required for the directory named by old, and, if ++ it exists, the directory named by new. ++ ++ [checked by VFS: vfs_rename(), vfs_rename_dir()] ++ ++ If the link named by the new argument exists and the file's link ++ count becomes 0 when it is removed and no process has the file ++ open, the space occupied by the file shall be freed and the file ++ shall no longer be accessible. If one or more processes have the ++ file open when the last link is removed, the link shall be removed ++ before rename() returns, but the removal of the file contents shall ++ be postponed until all references to the file are closed. 
++ ++ [iput() handles this, but we can do this manually, a la ++ reiser4_unlink()] ++ ++ Upon successful completion, rename() shall mark for update the ++ st_ctime and st_mtime fields of the parent directory of each file. ++ ++ [N/A] ++ ++ */ ++ reiser4_context *ctx; ++ int result; ++ int is_dir; /* is @old_name directory */ ++ ++ struct inode *old_inode; ++ struct inode *new_inode; ++ coord_t *new_coord; ++ ++ reiser4_dentry_fsdata *new_fsdata; ++ dir_plugin *dplug; ++ file_plugin *fplug; ++ ++ reiser4_dir_entry_desc *old_entry, *new_entry, *dotdot_entry; ++ lock_handle *new_lh, *dotdot_lh; ++ struct dentry *dotdot_name; ++ reiser4_dentry_fsdata *dataonstack; ++ ++ ctx = init_context(old_dir->i_sb); ++ if (IS_ERR(ctx)) ++ return PTR_ERR(ctx); ++ ++ old_entry = kmalloc(3 * sizeof(*old_entry) + 2 * sizeof(*new_lh) + ++ sizeof(*dotdot_name) + sizeof(*dataonstack), ++ GFP_KERNEL); ++ if (old_entry == NULL) { ++ context_set_commit_async(ctx); ++ reiser4_exit_context(ctx); ++ return RETERR(-ENOMEM); ++ } ++ memset(old_entry, 0, 3 * sizeof(*old_entry) + 2 * sizeof(*new_lh) + ++ sizeof(*dotdot_name) + sizeof(*dataonstack)); ++ ++ new_entry = old_entry + 1; ++ dotdot_entry = old_entry + 2; ++ new_lh = (lock_handle *)(old_entry + 3); ++ dotdot_lh = new_lh + 1; ++ dotdot_name = (struct dentry *)(new_lh + 2); ++ dataonstack = (reiser4_dentry_fsdata *)(dotdot_name + 1); ++ ++ assert("nikita-2318", old_dir != NULL); ++ assert("nikita-2319", new_dir != NULL); ++ assert("nikita-2320", old_name != NULL); ++ assert("nikita-2321", new_name != NULL); ++ ++ old_inode = old_name->d_inode; ++ new_inode = new_name->d_inode; ++ ++ dplug = inode_dir_plugin(old_dir); ++ fplug = NULL; ++ ++ new_fsdata = reiser4_get_dentry_fsdata(new_name); ++ if (IS_ERR(new_fsdata)) { ++ kfree(old_entry); ++ context_set_commit_async(ctx); ++ reiser4_exit_context(ctx); ++ return PTR_ERR(new_fsdata); ++ } ++ ++ new_coord = &new_fsdata->dec.entry_coord; ++ coord_clear_iplug(new_coord); ++ ++ is_dir = S_ISDIR(old_inode->i_mode); ++ ++ assert("nikita-3461", old_inode->i_nlink >= 1 + !!is_dir); ++ ++ /* if target is existing directory and it's not empty---return error. ++ ++ This check is done specifically, because is_dir_empty() requires ++ tree traversal and have to be done before locks are taken. ++ */ ++ if (is_dir && new_inode != NULL && is_dir_empty(new_inode) != 0) { ++ kfree(old_entry); ++ context_set_commit_async(ctx); ++ reiser4_exit_context(ctx); ++ return RETERR(-ENOTEMPTY); ++ } ++ ++ result = can_rename(old_dir, old_inode, new_dir, new_inode); ++ if (result != 0) { ++ kfree(old_entry); ++ context_set_commit_async(ctx); ++ reiser4_exit_context(ctx); ++ return result; ++ } ++ ++ result = hashed_rename_estimate_and_grab(old_dir, old_name, ++ new_dir, new_name); ++ if (result != 0) { ++ kfree(old_entry); ++ context_set_commit_async(ctx); ++ reiser4_exit_context(ctx); ++ return result; ++ } ++ ++ init_lh(new_lh); ++ ++ /* find entry for @new_name */ ++ result = find_entry(new_dir, ++ new_name, new_lh, ZNODE_WRITE_LOCK, new_entry); ++ ++ if (IS_CBKERR(result)) { ++ done_lh(new_lh); ++ kfree(old_entry); ++ context_set_commit_async(ctx); ++ reiser4_exit_context(ctx); ++ return result; ++ } ++ ++ seal_done(&new_fsdata->dec.entry_seal); ++ ++ /* add or replace name for @old_inode as @new_name */ ++ if (new_inode != NULL) { ++ /* target (@new_name) exists. */ ++ /* Not clear what to do with objects that are ++ both directories and files at the same time. 
*/ ++ if (result == CBK_COORD_FOUND) { ++ result = replace_name(old_inode, ++ new_dir, ++ new_inode, new_coord, new_lh); ++ if (result == 0) ++ fplug = inode_file_plugin(new_inode); ++ } else if (result == CBK_COORD_NOTFOUND) { ++ /* VFS told us that @new_name is bound to an existing ++ inode, but we failed to find the directory entry. */ ++ warning("nikita-2324", "Target not found"); ++ result = RETERR(-ENOENT); ++ } ++ } else { ++ /* target (@new_name) doesn't exist. */ ++ if (result == CBK_COORD_NOTFOUND) ++ result = add_name(old_inode, ++ new_dir, ++ new_name, new_coord, new_lh, is_dir); ++ else if (result == CBK_COORD_FOUND) { ++ /* VFS told us that @new_name is a "negative" dentry, ++ but we found a directory entry. */ ++ warning("nikita-2331", "Target found unexpectedly"); ++ result = RETERR(-EIO); ++ } ++ } ++ ++ assert("nikita-3462", ergo(result == 0, ++ old_inode->i_nlink >= 2 + !!is_dir)); ++ ++ /* We are done with all modifications to @new_dir, release the lock on ++ the node. */ ++ done_lh(new_lh); ++ ++ if (fplug != NULL) { ++ /* detach @new_inode from the name-space */ ++ result = fplug->detach(new_inode, new_dir); ++ if (result != 0) ++ warning("nikita-2330", "Cannot detach %lli: %i. %s", ++ (unsigned long long)get_inode_oid(new_inode), ++ result, possible_leak); ++ } ++ ++ if (new_inode != NULL) ++ reiser4_update_sd(new_inode); ++ ++ if (result == 0) { ++ old_entry->obj = old_inode; ++ ++ dplug->build_entry_key(old_dir, ++ &old_name->d_name, &old_entry->key); ++ ++ /* At this stage a new name has been introduced for ++ @old_inode. The @old_inode, @new_dir, and @new_inode i_nlink ++ counters were updated. ++ ++ We want to remove @old_name now. If @old_inode wasn't a ++ directory this is simple. ++ */ ++ result = dplug->rem_entry(old_dir, old_name, old_entry); ++ if (result != 0 && result != -ENOMEM) { ++ warning("nikita-2335", ++ "Cannot remove old name: %i", result); ++ } else { ++ result = reiser4_del_nlink(old_inode, old_dir, 0); ++ if (result != 0 && result != -ENOMEM) { ++ warning("nikita-2337", ++ "Cannot drop link on old: %i", result); ++ } ++ } ++ ++ if (result == 0 && is_dir) { ++ /* @old_inode is a directory. We also have to update ++ the dotdot entry. */ ++ coord_t *dotdot_coord; ++ ++ memset(dataonstack, 0, sizeof *dataonstack); ++ memset(dotdot_entry, 0, sizeof *dotdot_entry); ++ dotdot_entry->obj = old_dir; ++ memset(dotdot_name, 0, sizeof *dotdot_name); ++ dotdot_name->d_name.name = ".."; ++ dotdot_name->d_name.len = 2; ++ /* ++ * allocate ->d_fsdata on the stack to avoid using ++ * reiser4_get_dentry_fsdata(). Locking is not needed, ++ * because the dentry is private to the current thread. 
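Updating ".." reuses the generic find_entry()/replace_name() machinery by handing it a throwaway dentry that never enters the dcache; because it is visible to this thread only, no locking or fsdata allocation is required. The idiom in a self-contained form (hypothetical stand-in types, not the kernel's struct dentry):

#include <stdio.h>
#include <string.h>

struct qstr { const char *name; unsigned len; };
struct fake_dentry { struct qstr d_name; void *d_fsdata; };

/* stand-in for find_entry(): needs only the name and per-dentry scratch data */
static void lookup(const struct fake_dentry *d)
{
	printf("looking up '%.*s'\n", (int)d->d_name.len, d->d_name.name);
}

int main(void)
{
	char scratch[64];                  /* plays the role of dataonstack */
	struct fake_dentry dotdot;

	memset(&dotdot, 0, sizeof dotdot); /* note: sizeof of the object itself */
	dotdot.d_name.name = "..";
	dotdot.d_name.len = 2;
	dotdot.d_fsdata = scratch;         /* private to this thread: no locking */
	lookup(&dotdot);
	return 0;
}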
++ */ ++ dotdot_name->d_fsdata = dataonstack; ++ init_lh(dotdot_lh); ++ ++ dotdot_coord = &dataonstack->dec.entry_coord; ++ coord_clear_iplug(dotdot_coord); ++ ++ result = find_entry(old_inode, dotdot_name, dotdot_lh, ++ ZNODE_WRITE_LOCK, dotdot_entry); ++ if (result == 0) { ++ /* replace_name() decreases i_nlink on ++ * @old_dir */ ++ result = replace_name(new_dir, ++ old_inode, ++ old_dir, ++ dotdot_coord, dotdot_lh); ++ } else ++ result = RETERR(-EIO); ++ done_lh(dotdot_lh); ++ } ++ } ++ reiser4_update_dir(new_dir); ++ reiser4_update_dir(old_dir); ++ reiser4_update_sd(old_inode); ++ if (result == 0) { ++ file_plugin *fplug; ++ ++ if (new_inode != NULL) { ++ /* add safe-link for target file (in case we removed ++ * last reference to the poor fellow */ ++ fplug = inode_file_plugin(new_inode); ++ if (new_inode->i_nlink == 0) ++ result = safe_link_add(new_inode, SAFE_UNLINK); ++ } ++ } ++ kfree(old_entry); ++ context_set_commit_async(ctx); ++ reiser4_exit_context(ctx); ++ return result; ++} ++ ++#if 0 ++int rename_common(struct inode *old_dir /* directory where @old is located */ , ++ struct dentry *old_name /* old name */ , ++ struct inode *new_dir /* directory where @new is located */ , ++ struct dentry *new_name /* new name */ ) ++{ ++ /* From `The Open Group Base Specifications Issue 6' ++ ++ If either the old or new argument names a symbolic link, rename() ++ shall operate on the symbolic link itself, and shall not resolve ++ the last component of the argument. If the old argument and the new ++ argument resolve to the same existing file, rename() shall return ++ successfully and perform no other action. ++ ++ [this is done by VFS: vfs_rename()] ++ ++ If the old argument points to the pathname of a file that is not a ++ directory, the new argument shall not point to the pathname of a ++ directory. ++ ++ [checked by VFS: vfs_rename->may_delete()] ++ ++ If the link named by the new argument exists, it shall ++ be removed and old renamed to new. In this case, a link named new ++ shall remain visible to other processes throughout the renaming ++ operation and refer either to the file referred to by new or old ++ before the operation began. ++ ++ [we should assure this] ++ ++ Write access permission is required for ++ both the directory containing old and the directory containing new. ++ ++ [checked by VFS: vfs_rename->may_delete(), may_create()] ++ ++ If the old argument points to the pathname of a directory, the new ++ argument shall not point to the pathname of a file that is not a ++ directory. ++ ++ [checked by VFS: vfs_rename->may_delete()] ++ ++ If the directory named by the new argument exists, it ++ shall be removed and old renamed to new. In this case, a link named ++ new shall exist throughout the renaming operation and shall refer ++ either to the directory referred to by new or old before the ++ operation began. ++ ++ [we should assure this] ++ ++ If new names an existing directory, it shall be ++ required to be an empty directory. ++ ++ [we should check this] ++ ++ If the old argument points to a pathname of a symbolic link, the ++ symbolic link shall be renamed. If the new argument points to a ++ pathname of a symbolic link, the symbolic link shall be removed. ++ ++ The new pathname shall not contain a path prefix that names ++ old. Write access permission is required for the directory ++ containing old and the directory containing new. 
If the old ++ argument points to the pathname of a directory, write access ++ permission may be required for the directory named by old, and, if ++ it exists, the directory named by new. ++ ++ [checked by VFS: vfs_rename(), vfs_rename_dir()] ++ ++ If the link named by the new argument exists and the file's link ++ count becomes 0 when it is removed and no process has the file ++ open, the space occupied by the file shall be freed and the file ++ shall no longer be accessible. If one or more processes have the ++ file open when the last link is removed, the link shall be removed ++ before rename() returns, but the removal of the file contents shall ++ be postponed until all references to the file are closed. ++ ++ [iput() handles this, but we can do this manually, a la ++ reiser4_unlink()] ++ ++ Upon successful completion, rename() shall mark for update the ++ st_ctime and st_mtime fields of the parent directory of each file. ++ ++ [N/A] ++ ++ */ ++ reiser4_context *ctx; ++ int result; ++ int is_dir; /* is @old_name directory */ ++ struct inode *old_inode; ++ struct inode *new_inode; ++ reiser4_dir_entry_desc old_entry; ++ reiser4_dir_entry_desc new_entry; ++ coord_t *new_coord; ++ reiser4_dentry_fsdata *new_fsdata; ++ lock_handle new_lh; ++ dir_plugin *dplug; ++ file_plugin *fplug; ++ ++ ctx = init_context(old_dir->i_sb); ++ if (IS_ERR(ctx)) ++ return PTR_ERR(ctx); ++ ++ assert("nikita-2318", old_dir != NULL); ++ assert("nikita-2319", new_dir != NULL); ++ assert("nikita-2320", old_name != NULL); ++ assert("nikita-2321", new_name != NULL); ++ ++ old_inode = old_name->d_inode; ++ new_inode = new_name->d_inode; ++ ++ dplug = inode_dir_plugin(old_dir); ++ fplug = NULL; ++ ++ new_fsdata = reiser4_get_dentry_fsdata(new_name); ++ if (IS_ERR(new_fsdata)) { ++ result = PTR_ERR(new_fsdata); ++ goto exit; ++ } ++ ++ new_coord = &new_fsdata->dec.entry_coord; ++ coord_clear_iplug(new_coord); ++ ++ is_dir = S_ISDIR(old_inode->i_mode); ++ ++ assert("nikita-3461", old_inode->i_nlink >= 1 + !!is_dir); ++ ++ /* if target is existing directory and it's not empty---return error. ++ ++ This check is done specifically, because is_dir_empty() requires ++ tree traversal and have to be done before locks are taken. ++ */ ++ if (is_dir && new_inode != NULL && is_dir_empty(new_inode) != 0) ++ return RETERR(-ENOTEMPTY); ++ ++ result = can_rename(old_dir, old_inode, new_dir, new_inode); ++ if (result != 0) ++ goto exit; ++ ++ result = hashed_rename_estimate_and_grab(old_dir, old_name, ++ new_dir, new_name); ++ if (result != 0) ++ goto exit; ++ ++ init_lh(&new_lh); ++ ++ /* find entry for @new_name */ ++ result = find_entry(new_dir, ++ new_name, &new_lh, ZNODE_WRITE_LOCK, &new_entry); ++ ++ if (IS_CBKERR(result)) { ++ done_lh(&new_lh); ++ goto exit; ++ } ++ ++ seal_done(&new_fsdata->dec.entry_seal); ++ ++ /* add or replace name for @old_inode as @new_name */ ++ if (new_inode != NULL) { ++ /* target (@new_name) exists. */ ++ /* Not clear what to do with objects that are ++ both directories and files at the same time. */ ++ if (result == CBK_COORD_FOUND) { ++ result = replace_name(old_inode, ++ new_dir, ++ new_inode, new_coord, &new_lh); ++ if (result == 0) ++ fplug = inode_file_plugin(new_inode); ++ } else if (result == CBK_COORD_NOTFOUND) { ++ /* VFS told us that @new_name is bound to existing ++ inode, but we failed to find directory entry. */ ++ warning("nikita-2324", "Target not found"); ++ result = RETERR(-ENOENT); ++ } ++ } else { ++ /* target (@new_name) doesn't exists. 
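++		   (VFS handed us a negative dentry for @new_name, so
++		   CBK_COORD_NOTFOUND is the expected outcome here)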
*/ ++ if (result == CBK_COORD_NOTFOUND) ++ result = add_name(old_inode, ++ new_dir, ++ new_name, new_coord, &new_lh, is_dir); ++ else if (result == CBK_COORD_FOUND) { ++ /* VFS told us that @new_name is "negative" dentry, ++ but we found directory entry. */ ++ warning("nikita-2331", "Target found unexpectedly"); ++ result = RETERR(-EIO); ++ } ++ } ++ ++ assert("nikita-3462", ergo(result == 0, ++ old_inode->i_nlink >= 2 + !!is_dir)); ++ ++ /* We are done with all modifications to the @new_dir, release lock on ++ node. */ ++ done_lh(&new_lh); ++ ++ if (fplug != NULL) { ++ /* detach @new_inode from name-space */ ++ result = fplug->detach(new_inode, new_dir); ++ if (result != 0) ++ warning("nikita-2330", "Cannot detach %lli: %i. %s", ++ (unsigned long long)get_inode_oid(new_inode), ++ result, possible_leak); ++ } ++ ++ if (new_inode != NULL) ++ reiser4_update_sd(new_inode); ++ ++ if (result == 0) { ++ memset(&old_entry, 0, sizeof old_entry); ++ old_entry.obj = old_inode; ++ ++ dplug->build_entry_key(old_dir, ++ &old_name->d_name, &old_entry.key); ++ ++ /* At this stage new name was introduced for ++ @old_inode. @old_inode, @new_dir, and @new_inode i_nlink ++ counters were updated. ++ ++ We want to remove @old_name now. If @old_inode wasn't ++ directory this is simple. ++ */ ++ result = dplug->rem_entry(old_dir, old_name, &old_entry); ++ /*result = rem_entry_hashed(old_dir, old_name, &old_entry); */ ++ if (result != 0 && result != -ENOMEM) { ++ warning("nikita-2335", ++ "Cannot remove old name: %i", result); ++ } else { ++ result = reiser4_del_nlink(old_inode, old_dir, 0); ++ if (result != 0 && result != -ENOMEM) { ++ warning("nikita-2337", ++ "Cannot drop link on old: %i", result); ++ } ++ } ++ ++ if (result == 0 && is_dir) { ++ /* @old_inode is directory. We also have to update ++ dotdot entry. */ ++ coord_t *dotdot_coord; ++ lock_handle dotdot_lh; ++ struct dentry dotdot_name; ++ reiser4_dir_entry_desc dotdot_entry; ++ reiser4_dentry_fsdata dataonstack; ++ reiser4_dentry_fsdata *fsdata; ++ ++ memset(&dataonstack, 0, sizeof dataonstack); ++ memset(&dotdot_entry, 0, sizeof dotdot_entry); ++ dotdot_entry.obj = old_dir; ++ memset(&dotdot_name, 0, sizeof dotdot_name); ++ dotdot_name.d_name.name = ".."; ++ dotdot_name.d_name.len = 2; ++ /* ++ * allocate ->d_fsdata on the stack to avoid using ++ * reiser4_get_dentry_fsdata(). Locking is not needed, ++ * because dentry is private to the current thread. 
++ */ ++ dotdot_name.d_fsdata = &dataonstack; ++ init_lh(&dotdot_lh); ++ ++ fsdata = &dataonstack; ++ dotdot_coord = &fsdata->dec.entry_coord; ++ coord_clear_iplug(dotdot_coord); ++ ++ result = find_entry(old_inode, &dotdot_name, &dotdot_lh, ++ ZNODE_WRITE_LOCK, &dotdot_entry); ++ if (result == 0) { ++ /* replace_name() decreases i_nlink on ++ * @old_dir */ ++ result = replace_name(new_dir, ++ old_inode, ++ old_dir, ++ dotdot_coord, &dotdot_lh); ++ } else ++ result = RETERR(-EIO); ++ done_lh(&dotdot_lh); ++ } ++ } ++ reiser4_update_dir(new_dir); ++ reiser4_update_dir(old_dir); ++ reiser4_update_sd(old_inode); ++ if (result == 0) { ++ file_plugin *fplug; ++ ++ if (new_inode != NULL) { ++ /* add safe-link for target file (in case we removed ++ * last reference to the poor fellow */ ++ fplug = inode_file_plugin(new_inode); ++ if (new_inode->i_nlink == 0) ++ result = safe_link_add(new_inode, SAFE_UNLINK); ++ } ++ } ++ exit: ++ context_set_commit_async(ctx); ++ reiser4_exit_context(ctx); ++ return result; ++} ++#endif +Index: linux-2.6.16/fs/reiser4/plugin/item/Makefile +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/plugin/item/Makefile +@@ -0,0 +1,18 @@ ++obj-$(CONFIG_REISER4_FS) += item_plugins.o ++ ++item_plugins-objs := \ ++ item.o \ ++ static_stat.o \ ++ sde.o \ ++ cde.o \ ++ blackbox.o \ ++ internal.o \ ++ tail.o \ ++ ctail.o \ ++ extent.o \ ++ extent_item_ops.o \ ++ extent_file_ops.o \ ++ extent_flush_ops.o ++ ++ ++ +Index: linux-2.6.16/fs/reiser4/plugin/item/acl.h +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/plugin/item/acl.h +@@ -0,0 +1,66 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++/* Directory entry. */ ++ ++#if !defined( __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ ) ++#define __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ ++ ++#include "../../forward.h" ++#include "../../dformat.h" ++#include "../../kassign.h" ++#include "../../key.h" ++ ++#include ++#include /* for struct dentry */ ++ ++typedef struct directory_entry_format { ++ /* key of object stat-data. It's not necessary to store whole ++ key here, because it's always key of stat-data, so minor ++ packing locality and offset can be omitted here. But this ++ relies on particular key allocation scheme for stat-data, so, ++ for extensibility sake, whole key can be stored here. ++ ++ We store key as array of bytes, because we don't want 8-byte ++ alignment of dir entries. ++ */ ++ obj_key_id id; ++ /* file name. Null terminated string. 
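++
++	   Note that a name body is only actually stored for "long"
++	   names: as estimate_cde() and paste_entry() in cde.c show,
++	   short names are recovered from the entry key itself (cf.
++	   extract_dent_name() and DE_NAME_BUF_LEN below), so this
++	   array may be empty.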
*/ ++ d8 name[0]; ++} directory_entry_format; ++ ++void print_de(const char *prefix, coord_t * coord); ++int extract_key_de(const coord_t * coord, reiser4_key * key); ++int update_key_de(const coord_t * coord, const reiser4_key * key, ++ lock_handle * lh); ++char *extract_name_de(const coord_t * coord, char *buf); ++unsigned extract_file_type_de(const coord_t * coord); ++int add_entry_de(struct inode *dir, coord_t * coord, ++ lock_handle * lh, const struct dentry *name, ++ reiser4_dir_entry_desc * entry); ++int rem_entry_de(struct inode *dir, const struct qstr *name, coord_t * coord, ++ lock_handle * lh, reiser4_dir_entry_desc * entry); ++int max_name_len_de(const struct inode *dir); ++ ++int de_rem_and_shrink(struct inode *dir, coord_t * coord, int length); ++ ++char *extract_dent_name(const coord_t * coord, ++ directory_entry_format * dent, char *buf); ++ ++#if REISER4_LARGE_KEY ++#define DE_NAME_BUF_LEN (24) ++#else ++#define DE_NAME_BUF_LEN (16) ++#endif ++ ++/* __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ */ ++#endif ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +Index: linux-2.6.16/fs/reiser4/plugin/item/blackbox.c +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/plugin/item/blackbox.c +@@ -0,0 +1,142 @@ ++/* Copyright 2003 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++/* Black box item implementation */ ++ ++#include "../../forward.h" ++#include "../../debug.h" ++#include "../../dformat.h" ++#include "../../kassign.h" ++#include "../../coord.h" ++#include "../../tree.h" ++#include "../../lock.h" ++ ++#include "blackbox.h" ++#include "item.h" ++#include "../plugin.h" ++ ++int ++store_black_box(reiser4_tree * tree, ++ const reiser4_key * key, void *data, int length) ++{ ++ int result; ++ reiser4_item_data idata; ++ coord_t coord; ++ lock_handle lh; ++ ++ memset(&idata, 0, sizeof idata); ++ ++ idata.data = data; ++ idata.user = 0; ++ idata.length = length; ++ idata.iplug = item_plugin_by_id(BLACK_BOX_ID); ++ ++ init_lh(&lh); ++ result = insert_by_key(tree, key, ++ &idata, &coord, &lh, LEAF_LEVEL, CBK_UNIQUE); ++ ++ assert("nikita-3413", ++ ergo(result == 0, ++ WITH_COORD(&coord, ++ item_length_by_coord(&coord) == length))); ++ ++ done_lh(&lh); ++ return result; ++} ++ ++int ++load_black_box(reiser4_tree * tree, ++ reiser4_key * key, void *data, int length, int exact) ++{ ++ int result; ++ coord_t coord; ++ lock_handle lh; ++ ++ init_lh(&lh); ++ result = coord_by_key(tree, key, ++ &coord, &lh, ZNODE_READ_LOCK, ++ exact ? FIND_EXACT : FIND_MAX_NOT_MORE_THAN, ++ LEAF_LEVEL, LEAF_LEVEL, CBK_UNIQUE, NULL); ++ ++ if (result == 0) { ++ int ilen; ++ ++ result = zload(coord.node); ++ if (result == 0) { ++ ilen = item_length_by_coord(&coord); ++ if (ilen <= length) { ++ memcpy(data, item_body_by_coord(&coord), ilen); ++ unit_key_by_coord(&coord, key); ++ } else if (exact) { ++ /* ++ * item is larger than buffer provided by the ++ * user. Only issue a warning if @exact is ++ * set. If @exact is false, we are iterating ++ * over all safe-links and here we are reaching ++ * the end of the iteration. 
++ */ ++ warning("nikita-3415", ++ "Wrong black box length: %i > %i", ++ ilen, length); ++ result = RETERR(-EIO); ++ } ++ zrelse(coord.node); ++ } ++ } ++ ++ done_lh(&lh); ++ return result; ++ ++} ++ ++int ++update_black_box(reiser4_tree * tree, ++ const reiser4_key * key, void *data, int length) ++{ ++ int result; ++ coord_t coord; ++ lock_handle lh; ++ ++ init_lh(&lh); ++ result = coord_by_key(tree, key, ++ &coord, &lh, ZNODE_READ_LOCK, ++ FIND_EXACT, ++ LEAF_LEVEL, LEAF_LEVEL, CBK_UNIQUE, NULL); ++ if (result == 0) { ++ int ilen; ++ ++ result = zload(coord.node); ++ if (result == 0) { ++ ilen = item_length_by_coord(&coord); ++ if (length <= ilen) { ++ memcpy(item_body_by_coord(&coord), data, ++ length); ++ } else { ++ warning("nikita-3437", ++ "Wrong black box length: %i < %i", ++ ilen, length); ++ result = RETERR(-EIO); ++ } ++ zrelse(coord.node); ++ } ++ } ++ ++ done_lh(&lh); ++ return result; ++ ++} ++ ++int kill_black_box(reiser4_tree * tree, const reiser4_key * key) ++{ ++ return cut_tree(tree, key, key, NULL, 1); ++} ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +Index: linux-2.6.16/fs/reiser4/plugin/item/blackbox.h +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/plugin/item/blackbox.h +@@ -0,0 +1,33 @@ ++/* Copyright 2003 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++/* "Black box" entry to fixed-width contain user supplied data */ ++ ++#if !defined( __FS_REISER4_BLACK_BOX_H__ ) ++#define __FS_REISER4_BLACK_BOX_H__ ++ ++#include "../../forward.h" ++#include "../../dformat.h" ++#include "../../kassign.h" ++#include "../../key.h" ++ ++extern int store_black_box(reiser4_tree * tree, ++ const reiser4_key * key, void *data, int length); ++extern int load_black_box(reiser4_tree * tree, ++ reiser4_key * key, void *data, int length, int exact); ++extern int kill_black_box(reiser4_tree * tree, const reiser4_key * key); ++extern int update_black_box(reiser4_tree * tree, ++ const reiser4_key * key, void *data, int length); ++ ++/* __FS_REISER4_BLACK_BOX_H__ */ ++#endif ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +Index: linux-2.6.16/fs/reiser4/plugin/item/cde.c +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/plugin/item/cde.c +@@ -0,0 +1,1007 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++/* Directory entry implementation */ ++ ++/* DESCRIPTION: ++ ++ This is "compound" directory item plugin implementation. This directory ++ item type is compound (as opposed to the "simple directory item" in ++ fs/reiser4/plugin/item/sde.[ch]), because it consists of several directory ++ entries. ++ ++ The reason behind this decision is disk space efficiency: all directory ++ entries inside the same directory have identical fragment in their ++ keys. This, of course, depends on key assignment policy. In our default key ++ assignment policy, all directory entries have the same locality which is ++ equal to the object id of their directory. ++ ++ Composing directory item out of several directory entries for the same ++ directory allows us to store said key fragment only once. 
That is, this is
++   some ad hoc form of key compression (stem compression) that is implemented
++   here, because general key compression is not supposed to be implemented in
++   v4.0.
++
++   Another decision that was made regarding all directory item plugins is
++   that they will store entry keys unaligned. This is for the sake of disk
++   space efficiency again.
++
++   It should be noted that storing keys unaligned increases CPU consumption,
++   at least on some architectures.
++
++   Internal on-disk structure of the compound directory item is the following:
++
++        HEADER          cde_item_format.        Here number of entries is stored.
++        ENTRY_HEADER_0  cde_unit_header.        Here part of entry key and
++        ENTRY_HEADER_1                          offset of entry body are stored.
++        ENTRY_HEADER_2                          (basically two last parts of key)
++        ...
++        ENTRY_HEADER_N
++        ENTRY_BODY_0    directory_entry_format. Here part of stat-data key and
++        ENTRY_BODY_1                            NUL-terminated name are stored.
++        ENTRY_BODY_2                            (part of stat-data key in the
++                                                 sense that since all SDs have
++                                                 zero offset, this offset is not
++                                                 stored on disk).
++        ...
++        ENTRY_BODY_N
++
++   When it comes to the balancing, each directory entry in compound directory
++   item is a unit, that is, something that can be cut from one item and pasted
++   into another item of the same type. Handling of unit cut and paste is a major
++   reason for the complexity of the code below.
++
++*/
++
++#include "../../forward.h"
++#include "../../debug.h"
++#include "../../dformat.h"
++#include "../../kassign.h"
++#include "../../key.h"
++#include "../../coord.h"
++#include "sde.h"
++#include "cde.h"
++#include "item.h"
++#include "../node/node.h"
++#include "../plugin.h"
++#include "../../znode.h"
++#include "../../carry.h"
++#include "../../tree.h"
++#include "../../inode.h"
++
++#include <linux/fs.h>		/* for struct inode */
++#include <linux/dcache.h>	/* for struct dentry */
++#include <linux/quotaops.h>
++
++#if 0
++#define CHECKME(coord)						\
++({								\
++	const char *message;					\
++	coord_t dup;						\
++								\
++	coord_dup_nocheck(&dup, (coord));			\
++	dup.unit_pos = 0;					\
++	assert("nikita-2871", cde_check(&dup, &message) == 0);	\
++})
++#else
++#define CHECKME(coord) noop
++#endif
++
++/* return body of compound directory item at @coord */
++static inline cde_item_format *formatted_at(const coord_t * coord)
++{
++	assert("nikita-1282", coord != NULL);
++	return item_body_by_coord(coord);
++}
++
++/* return entry header at @coord */
++static inline cde_unit_header *header_at(const coord_t *
++					 coord /* coord of item */ ,
++					 int idx /* index of unit */ )
++{
++	assert("nikita-1283", coord != NULL);
++	return &formatted_at(coord)->entry[idx];
++}
++
++/* return number of units in compound directory item at @coord */
++static int units(const coord_t * coord /* coord of item */ )
++{
++	return le16_to_cpu(get_unaligned(&formatted_at(coord)->num_of_entries));
++}
++
++/* return offset of the body of @idx-th entry in @coord */
++static unsigned int offset_of(const coord_t * coord /* coord of item */ ,
++			      int idx /* index of unit */ )
++{
++	if (idx < units(coord))
++		return le16_to_cpu(get_unaligned(&header_at(coord, idx)->offset));
++	else if (idx == units(coord))
++		return item_length_by_coord(coord);
++	else
++		impossible("nikita-1308", "Wrong idx");
++	return 0;
++}
++
++/* set offset of the body of @idx-th entry in @coord */
++static void set_offset(const coord_t * coord /* coord of item */ ,
++		       int idx /* index of unit */ ,
++		       unsigned int offset /* new offset */ )
++{
++	put_unaligned(cpu_to_le16((__u16) offset), &header_at(coord, idx)->offset);
++}
++
++static void adj_offset(const coord_t
* coord /* coord of item */ , ++ int idx /* index of unit */ , ++ int delta /* offset change */ ) ++{ ++ d16 *doffset; ++ __u16 offset; ++ ++ doffset = &header_at(coord, idx)->offset; ++ offset = le16_to_cpu(get_unaligned(doffset)); ++ offset += delta; ++ put_unaligned(cpu_to_le16((__u16) offset), doffset); ++} ++ ++/* return pointer to @offset-th byte from the beginning of @coord */ ++static char *address(const coord_t * coord /* coord of item */ , ++ int offset) ++{ ++ return ((char *)item_body_by_coord(coord)) + offset; ++} ++ ++/* return pointer to the body of @idx-th entry in @coord */ ++static directory_entry_format *entry_at(const coord_t * coord /* coord of ++ * item */ , ++ int idx /* index of unit */ ) ++{ ++ return (directory_entry_format *) address(coord, ++ (int)offset_of(coord, idx)); ++} ++ ++/* return number of unit referenced by @coord */ ++static int idx_of(const coord_t * coord /* coord of item */ ) ++{ ++ assert("nikita-1285", coord != NULL); ++ return coord->unit_pos; ++} ++ ++/* find position where entry with @entry_key would be inserted into @coord */ ++static int find(const coord_t * coord /* coord of item */ , ++ const reiser4_key * entry_key /* key to look for */ , ++ cmp_t * last /* result of last comparison */ ) ++{ ++ int entries; ++ ++ int left; ++ int right; ++ ++ cde_unit_header *header; ++ ++ assert("nikita-1295", coord != NULL); ++ assert("nikita-1296", entry_key != NULL); ++ assert("nikita-1297", last != NULL); ++ ++ entries = units(coord); ++ left = 0; ++ right = entries - 1; ++ while (right - left >= REISER4_SEQ_SEARCH_BREAK) { ++ int median; ++ ++ median = (left + right) >> 1; ++ ++ header = header_at(coord, median); ++ *last = de_id_key_cmp(&header->hash, entry_key); ++ switch (*last) { ++ case LESS_THAN: ++ left = median; ++ break; ++ case GREATER_THAN: ++ right = median; ++ break; ++ case EQUAL_TO:{ ++ do { ++ median--; ++ header--; ++ } while (median >= 0 && ++ de_id_key_cmp(&header->hash, ++ entry_key) == EQUAL_TO); ++ return median + 1; ++ } ++ } ++ } ++ header = header_at(coord, left); ++ for (; left < entries; ++left, ++header) { ++ prefetch(header + 1); ++ *last = de_id_key_cmp(&header->hash, entry_key); ++ if (*last != LESS_THAN) ++ break; ++ } ++ if (left < entries) ++ return left; ++ else ++ return RETERR(-ENOENT); ++ ++} ++ ++/* expand @coord as to accommodate for insertion of @no new entries starting ++ from @pos, with total bodies size @size. 
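++
++   For example, with @no == 1 the bodies of entries [0 .. @pos] are
++   shifted by sizeof(cde_unit_header) to make room for the new header,
++   while entries at [@pos + 1 ..) additionally make room for the new
++   body and shift by sizeof(cde_unit_header) + data_size; the two
++   adj_offset() loops below update the stored offsets accordingly.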
*/ ++static int expand_item(const coord_t * coord /* coord of item */ , ++ int pos /* unit position */ , int no /* number of new ++ * units*/ , ++ int size /* total size of new units' data */ , ++ unsigned int data_size /* free space already reserved ++ * in the item for insertion */ ) ++{ ++ int entries; ++ cde_unit_header *header; ++ char *dent; ++ int i; ++ ++ assert("nikita-1310", coord != NULL); ++ assert("nikita-1311", pos >= 0); ++ assert("nikita-1312", no > 0); ++ assert("nikita-1313", data_size >= no * sizeof(directory_entry_format)); ++ assert("nikita-1343", ++ item_length_by_coord(coord) >= ++ (int)(size + data_size + no * sizeof *header)); ++ ++ entries = units(coord); ++ ++ if (pos == entries) ++ dent = address(coord, size); ++ else ++ dent = (char *)entry_at(coord, pos); ++ /* place where new header will be in */ ++ header = header_at(coord, pos); ++ /* free space for new entry headers */ ++ memmove(header + no, header, ++ (unsigned)(address(coord, size) - (char *)header)); ++ /* if adding to the end initialise first new header */ ++ if (pos == entries) { ++ set_offset(coord, pos, (unsigned)size); ++ } ++ ++ /* adjust entry pointer and size */ ++ dent = dent + no * sizeof *header; ++ size += no * sizeof *header; ++ /* free space for new entries */ ++ memmove(dent + data_size, dent, ++ (unsigned)(address(coord, size) - dent)); ++ ++ /* increase counter */ ++ entries += no; ++ put_unaligned(cpu_to_le16((__u16) entries), &formatted_at(coord)->num_of_entries); ++ ++ /* [ 0 ... pos ] entries were shifted by no * ( sizeof *header ) ++ bytes. */ ++ for (i = 0; i <= pos; ++i) ++ adj_offset(coord, i, no * sizeof *header); ++ /* [ pos + no ... +\infty ) entries were shifted by ( no * ++ sizeof *header + data_size ) bytes */ ++ for (i = pos + no; i < entries; ++i) ++ adj_offset(coord, i, no * sizeof *header + data_size); ++ return 0; ++} ++ ++/* insert new @entry into item */ ++static int expand(const coord_t * coord /* coord of item */ , ++ cde_entry * entry /* entry to insert */ , ++ int len /* length of @entry data */ , ++ int *pos /* position to insert */ , ++ reiser4_dir_entry_desc * dir_entry /* parameters for new ++ * entry */ ) ++{ ++ cmp_t cmp_res; ++ int datasize; ++ ++ *pos = find(coord, &dir_entry->key, &cmp_res); ++ if (*pos < 0) ++ *pos = units(coord); ++ ++ datasize = sizeof(directory_entry_format); ++ if (is_longname(entry->name->name, entry->name->len)) ++ datasize += entry->name->len + 1; ++ ++ expand_item(coord, *pos, 1, item_length_by_coord(coord) - len, ++ datasize); ++ return 0; ++} ++ ++/* paste body of @entry into item */ ++static int paste_entry(const coord_t * coord /* coord of item */ , ++ cde_entry * entry /* new entry */ , ++ int pos /* position to insert */ , ++ reiser4_dir_entry_desc * dir_entry /* parameters for ++ * new entry */ ) ++{ ++ cde_unit_header *header; ++ directory_entry_format *dent; ++ const char *name; ++ int len; ++ ++ header = header_at(coord, pos); ++ dent = entry_at(coord, pos); ++ ++ build_de_id_by_key(&dir_entry->key, &header->hash); ++ build_inode_key_id(entry->obj, &dent->id); ++ /* AUDIT unsafe strcpy() operation! 
It should be replaced with ++ much less CPU hungry ++ memcpy( ( char * ) dent -> name, entry -> name -> name , entry -> name -> len ); ++ ++ Also a more major thing is that there should be a way to figure out ++ amount of space in dent -> name and be able to check that we are ++ not going to overwrite more than we supposed to */ ++ name = entry->name->name; ++ len = entry->name->len; ++ if (is_longname(name, len)) { ++ strcpy((unsigned char *)dent->name, name); ++ put_unaligned(0, &dent->name[len]); ++ } ++ return 0; ++} ++ ++/* estimate how much space is necessary in item to insert/paste set of entries ++ described in @data. */ ++int estimate_cde(const coord_t * coord /* coord of item */ , ++ const reiser4_item_data * data /* parameters for new item */ ) ++{ ++ cde_entry_data *e; ++ int result; ++ int i; ++ ++ e = (cde_entry_data *) data->data; ++ ++ assert("nikita-1288", e != NULL); ++ assert("nikita-1289", e->num_of_entries >= 0); ++ ++ if (coord == NULL) ++ /* insert */ ++ result = sizeof(cde_item_format); ++ else ++ /* paste */ ++ result = 0; ++ ++ result += e->num_of_entries * ++ (sizeof(cde_unit_header) + sizeof(directory_entry_format)); ++ for (i = 0; i < e->num_of_entries; ++i) { ++ const char *name; ++ int len; ++ ++ name = e->entry[i].name->name; ++ len = e->entry[i].name->len; ++ assert("nikita-2054", strlen(name) == len); ++ if (is_longname(name, len)) ++ result += len + 1; ++ } ++ ((reiser4_item_data *) data)->length = result; ++ return result; ++} ++ ++/* ->nr_units() method for this item plugin. */ ++pos_in_node_t nr_units_cde(const coord_t * coord /* coord of item */ ) ++{ ++ return units(coord); ++} ++ ++/* ->unit_key() method for this item plugin. */ ++reiser4_key *unit_key_cde(const coord_t * coord /* coord of item */ , ++ reiser4_key * key /* resulting key */ ) ++{ ++ assert("nikita-1452", coord != NULL); ++ assert("nikita-1345", idx_of(coord) < units(coord)); ++ assert("nikita-1346", key != NULL); ++ ++ item_key_by_coord(coord, key); ++ extract_key_from_de_id(extract_dir_id_from_key(key), ++ &header_at(coord, idx_of(coord))->hash, key); ++ return key; ++} ++ ++/* mergeable_cde(): implementation of ->mergeable() item method. ++ ++ Two directory items are mergeable iff they are from the same ++ directory. That simple. ++ ++*/ ++int mergeable_cde(const coord_t * p1 /* coord of first item */ , ++ const coord_t * p2 /* coord of second item */ ) ++{ ++ reiser4_key k1; ++ reiser4_key k2; ++ ++ assert("nikita-1339", p1 != NULL); ++ assert("nikita-1340", p2 != NULL); ++ ++ return ++ (item_plugin_by_coord(p1) == item_plugin_by_coord(p2)) && ++ (extract_dir_id_from_key(item_key_by_coord(p1, &k1)) == ++ extract_dir_id_from_key(item_key_by_coord(p2, &k2))); ++ ++} ++ ++/* ->max_key_inside() method for this item plugin. */ ++reiser4_key *max_key_inside_cde(const coord_t * coord /* coord of item */ , ++ reiser4_key * result /* resulting key */ ) ++{ ++ assert("nikita-1342", coord != NULL); ++ ++ item_key_by_coord(coord, result); ++ set_key_ordering(result, get_key_ordering(max_key())); ++ set_key_fulloid(result, get_key_fulloid(max_key())); ++ set_key_offset(result, get_key_offset(max_key())); ++ return result; ++} ++ ++/* @data contains data which are to be put into tree */ ++int can_contain_key_cde(const coord_t * coord /* coord of item */ , ++ const reiser4_key * key /* key to check */ , ++ const reiser4_item_data * data /* parameters of new ++ * item/unit being ++ * created */ ) ++{ ++ reiser4_key item_key; ++ ++ /* FIXME-VS: do not rely on anything but iplug field of @data. 
Only ++ data->iplug is initialized */ ++ assert("vs-457", data && data->iplug); ++/* assert( "vs-553", data -> user == 0 );*/ ++ item_key_by_coord(coord, &item_key); ++ ++ return (item_plugin_by_coord(coord) == data->iplug) && ++ (extract_dir_id_from_key(&item_key) == ++ extract_dir_id_from_key(key)); ++} ++ ++#if REISER4_DEBUG ++/* cde_check ->check() method for compressed directory items ++ ++ used for debugging, every item should have here the most complete ++ possible check of the consistency of the item that the inventor can ++ construct ++*/ ++int check_cde(const coord_t * coord /* coord of item to check */ , ++ const char **error /* where to store error message */ ) ++{ ++ int i; ++ int result; ++ char *item_start; ++ char *item_end; ++ reiser4_key key; ++ ++ coord_t c; ++ ++ assert("nikita-1357", coord != NULL); ++ assert("nikita-1358", error != NULL); ++ ++ if (!ergo(coord->item_pos != 0, ++ is_dot_key(item_key_by_coord(coord, &key)))) { ++ *error = "CDE doesn't start with dot"; ++ return -1; ++ } ++ item_start = item_body_by_coord(coord); ++ item_end = item_start + item_length_by_coord(coord); ++ ++ coord_dup(&c, coord); ++ result = 0; ++ for (i = 0; i < units(coord); ++i) { ++ directory_entry_format *entry; ++ ++ if ((char *)(header_at(coord, i) + 1) > ++ item_end - units(coord) * sizeof *entry) { ++ *error = "CDE header is out of bounds"; ++ result = -1; ++ break; ++ } ++ entry = entry_at(coord, i); ++ if ((char *)entry < item_start + sizeof(cde_item_format)) { ++ *error = "CDE header is too low"; ++ result = -1; ++ break; ++ } ++ if ((char *)(entry + 1) > item_end) { ++ *error = "CDE header is too high"; ++ result = -1; ++ break; ++ } ++ } ++ ++ return result; ++} ++#endif ++ ++/* ->init() method for this item plugin. */ ++int init_cde(coord_t * coord /* coord of item */ , ++ coord_t * from UNUSED_ARG, reiser4_item_data * data /* structure used for insertion */ ++ UNUSED_ARG) ++{ ++ put_unaligned(cpu_to_le16(0), &formatted_at(coord)->num_of_entries); ++ return 0; ++} ++ ++/* ->lookup() method for this item plugin. */ ++lookup_result lookup_cde(const reiser4_key * key /* key to search for */ , ++ lookup_bias bias /* search bias */ , ++ coord_t * coord /* coord of item to lookup in */ ) ++{ ++ cmp_t last_comp; ++ int pos; ++ ++ reiser4_key utmost_key; ++ ++ assert("nikita-1293", coord != NULL); ++ assert("nikita-1294", key != NULL); ++ ++ CHECKME(coord); ++ ++ if (keygt(item_key_by_coord(coord, &utmost_key), key)) { ++ coord->unit_pos = 0; ++ coord->between = BEFORE_UNIT; ++ return CBK_COORD_NOTFOUND; ++ } ++ pos = find(coord, key, &last_comp); ++ if (pos >= 0) { ++ coord->unit_pos = (int)pos; ++ switch (last_comp) { ++ case EQUAL_TO: ++ coord->between = AT_UNIT; ++ return CBK_COORD_FOUND; ++ case GREATER_THAN: ++ coord->between = BEFORE_UNIT; ++ return RETERR(-ENOENT); ++ case LESS_THAN: ++ default: ++ impossible("nikita-1298", "Broken find"); ++ return RETERR(-EIO); ++ } ++ } else { ++ coord->unit_pos = units(coord) - 1; ++ coord->between = AFTER_UNIT; ++ return (bias == ++ FIND_MAX_NOT_MORE_THAN) ? CBK_COORD_FOUND : ++ CBK_COORD_NOTFOUND; ++ } ++} ++ ++/* ->paste() method for this item plugin. 
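++
++   The entries described by the cde_entry_data in @data are inserted
++   one at a time: expand() opens a gap at the position chosen via
++   find(), and paste_entry() then fills in the unit header and the
++   entry body.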
*/ ++int paste_cde(coord_t * coord /* coord of item */ , ++ reiser4_item_data * data /* parameters of new unit being ++ * inserted */ , ++ carry_plugin_info * info UNUSED_ARG /* todo carry queue */ ) ++{ ++ cde_entry_data *e; ++ int result; ++ int i; ++ ++ CHECKME(coord); ++ e = (cde_entry_data *) data->data; ++ ++ result = 0; ++ for (i = 0; i < e->num_of_entries; ++i) { ++ int pos; ++ int phantom_size; ++ ++ phantom_size = data->length; ++ if (units(coord) == 0) ++ phantom_size -= sizeof(cde_item_format); ++ ++ result = ++ expand(coord, e->entry + i, phantom_size, &pos, data->arg); ++ if (result != 0) ++ break; ++ result = paste_entry(coord, e->entry + i, pos, data->arg); ++ if (result != 0) ++ break; ++ } ++ CHECKME(coord); ++ return result; ++} ++ ++/* amount of space occupied by all entries starting from @idx both headers and ++ bodies. */ ++static unsigned int part_size(const coord_t * coord /* coord of item */ , ++ int idx /* index of unit */ ) ++{ ++ assert("nikita-1299", coord != NULL); ++ assert("nikita-1300", idx < (int)units(coord)); ++ ++ return sizeof(cde_item_format) + ++ (idx + 1) * sizeof(cde_unit_header) + offset_of(coord, ++ idx + 1) - ++ offset_of(coord, 0); ++} ++ ++/* how many but not more than @want units of @source can be merged with ++ item in @target node. If pend == append - we try to append last item ++ of @target by first units of @source. If pend == prepend - we try to ++ "prepend" first item in @target by last units of @source. @target ++ node has @free_space bytes of free space. Total size of those units ++ are returned via @size */ ++int can_shift_cde(unsigned free_space /* free space in item */ , ++ coord_t * coord /* coord of source item */ , ++ znode * target /* target node */ , ++ shift_direction pend /* shift direction */ , ++ unsigned *size /* resulting number of shifted bytes */ , ++ unsigned want /* maximal number of bytes to shift */ ) ++{ ++ int shift; ++ ++ CHECKME(coord); ++ if (want == 0) { ++ *size = 0; ++ return 0; ++ } ++ ++ /* pend == SHIFT_LEFT <==> shifting to the left */ ++ if (pend == SHIFT_LEFT) { ++ for (shift = min((int)want - 1, units(coord)); shift >= 0; ++ --shift) { ++ *size = part_size(coord, shift); ++ if (target != NULL) ++ *size -= sizeof(cde_item_format); ++ if (*size <= free_space) ++ break; ++ } ++ shift = shift + 1; ++ } else { ++ int total_size; ++ ++ assert("nikita-1301", pend == SHIFT_RIGHT); ++ ++ total_size = item_length_by_coord(coord); ++ for (shift = units(coord) - want - 1; shift < units(coord) - 1; ++ ++shift) { ++ *size = total_size - part_size(coord, shift); ++ if (target == NULL) ++ *size += sizeof(cde_item_format); ++ if (*size <= free_space) ++ break; ++ } ++ shift = units(coord) - shift - 1; ++ } ++ if (shift == 0) ++ *size = 0; ++ CHECKME(coord); ++ return shift; ++} ++ ++/* ->copy_units() method for this item plugin. 
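++
++   Used during node balancing: the target item is grown with
++   expand_item(), the entry bodies and unit headers are memmove()d
++   over from @source, and adj_offset() fixes up the copied offsets.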
*/ ++void copy_units_cde(coord_t * target /* coord of target item */ , ++ coord_t * source /* coord of source item */ , ++ unsigned from /* starting unit */ , ++ unsigned count /* how many units to copy */ , ++ shift_direction where_is_free_space /* shift direction */ , ++ unsigned free_space /* free space in item */ ) ++{ ++ char *header_from; ++ char *header_to; ++ ++ char *entry_from; ++ char *entry_to; ++ ++ int pos_in_target; ++ int data_size; ++ int data_delta; ++ int i; ++ ++ assert("nikita-1303", target != NULL); ++ assert("nikita-1304", source != NULL); ++ assert("nikita-1305", (int)from < units(source)); ++ assert("nikita-1307", (int)(from + count) <= units(source)); ++ ++ if (where_is_free_space == SHIFT_LEFT) { ++ assert("nikita-1453", from == 0); ++ pos_in_target = units(target); ++ } else { ++ assert("nikita-1309", (int)(from + count) == units(source)); ++ pos_in_target = 0; ++ memmove(item_body_by_coord(target), ++ (char *)item_body_by_coord(target) + free_space, ++ item_length_by_coord(target) - free_space); ++ } ++ ++ CHECKME(target); ++ CHECKME(source); ++ ++ /* expand @target */ ++ data_size = ++ offset_of(source, (int)(from + count)) - offset_of(source, ++ (int)from); ++ ++ if (units(target) == 0) ++ free_space -= sizeof(cde_item_format); ++ ++ expand_item(target, pos_in_target, (int)count, ++ (int)(item_length_by_coord(target) - free_space), ++ (unsigned)data_size); ++ ++ /* copy first @count units of @source into @target */ ++ data_delta = ++ offset_of(target, pos_in_target) - offset_of(source, (int)from); ++ ++ /* copy entries */ ++ entry_from = (char *)entry_at(source, (int)from); ++ entry_to = (char *)entry_at(source, (int)(from + count)); ++ memmove(entry_at(target, pos_in_target), entry_from, ++ (unsigned)(entry_to - entry_from)); ++ ++ /* copy headers */ ++ header_from = (char *)header_at(source, (int)from); ++ header_to = (char *)header_at(source, (int)(from + count)); ++ memmove(header_at(target, pos_in_target), header_from, ++ (unsigned)(header_to - header_from)); ++ ++ /* update offsets */ ++ for (i = pos_in_target; i < (int)(pos_in_target + count); ++i) ++ adj_offset(target, i, data_delta); ++ CHECKME(target); ++ CHECKME(source); ++} ++ ++/* ->cut_units() method for this item plugin. 
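++
++   Removes units [@from .. @to], compacting the header array and the
++   entry bodies separately, and returns the number of bytes freed.
++   kill_units_cde() below is a thin wrapper around this function.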
*/ ++int cut_units_cde(coord_t * coord /* coord of item */ , ++ pos_in_node_t from /* start unit pos */ , ++ pos_in_node_t to /* stop unit pos */ , ++ struct carry_cut_data *cdata UNUSED_ARG, ++ reiser4_key * smallest_removed, reiser4_key * new_first) ++{ ++ char *header_from; ++ char *header_to; ++ ++ char *entry_from; ++ char *entry_to; ++ ++ int size; ++ int entry_delta; ++ int header_delta; ++ int i; ++ ++ unsigned count; ++ ++ CHECKME(coord); ++ ++ count = to - from + 1; ++ ++ assert("nikita-1454", coord != NULL); ++ assert("nikita-1455", (int)(from + count) <= units(coord)); ++ ++ if (smallest_removed) ++ unit_key_by_coord(coord, smallest_removed); ++ ++ if (new_first) { ++ coord_t next; ++ ++ /* not everything is cut from item head */ ++ assert("vs-1527", from == 0); ++ assert("vs-1528", to < units(coord) - 1); ++ ++ coord_dup(&next, coord); ++ next.unit_pos++; ++ unit_key_by_coord(&next, new_first); ++ } ++ ++ size = item_length_by_coord(coord); ++ if (count == (unsigned)units(coord)) { ++ return size; ++ } ++ ++ header_from = (char *)header_at(coord, (int)from); ++ header_to = (char *)header_at(coord, (int)(from + count)); ++ ++ entry_from = (char *)entry_at(coord, (int)from); ++ entry_to = (char *)entry_at(coord, (int)(from + count)); ++ ++ /* move headers */ ++ memmove(header_from, header_to, ++ (unsigned)(address(coord, size) - header_to)); ++ ++ header_delta = header_to - header_from; ++ ++ entry_from -= header_delta; ++ entry_to -= header_delta; ++ size -= header_delta; ++ ++ /* copy entries */ ++ memmove(entry_from, entry_to, ++ (unsigned)(address(coord, size) - entry_to)); ++ ++ entry_delta = entry_to - entry_from; ++ size -= entry_delta; ++ ++ /* update offsets */ ++ ++ for (i = 0; i < (int)from; ++i) ++ adj_offset(coord, i, -header_delta); ++ ++ for (i = from; i < units(coord) - (int)count; ++i) ++ adj_offset(coord, i, -header_delta - entry_delta); ++ ++ put_unaligned(cpu_to_le16((__u16) units(coord) - count), ++ &formatted_at(coord)->num_of_entries); ++ ++ if (from == 0) { ++ /* entries from head was removed - move remaining to right */ ++ memmove((char *)item_body_by_coord(coord) + ++ header_delta + entry_delta, item_body_by_coord(coord), ++ (unsigned)size); ++ if (REISER4_DEBUG) ++ memset(item_body_by_coord(coord), 0, ++ (unsigned)header_delta + entry_delta); ++ } else { ++ /* freed space is already at the end of item */ ++ if (REISER4_DEBUG) ++ memset((char *)item_body_by_coord(coord) + size, 0, ++ (unsigned)header_delta + entry_delta); ++ } ++ ++ return header_delta + entry_delta; ++} ++ ++int kill_units_cde(coord_t * coord /* coord of item */ , ++ pos_in_node_t from /* start unit pos */ , ++ pos_in_node_t to /* stop unit pos */ , ++ struct carry_kill_data *kdata UNUSED_ARG, ++ reiser4_key * smallest_removed, reiser4_key * new_first) ++{ ++ return cut_units_cde(coord, from, to, NULL, smallest_removed, new_first); ++} ++ ++/* ->s.dir.extract_key() method for this item plugin. 
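++
++   Returns the key of the stat-data of the object referenced by the
++   directory entry at @coord, reconstructed from the obj_key_id stored
++   in the entry body.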
*/ ++int extract_key_cde(const coord_t * coord /* coord of item */ , ++ reiser4_key * key /* resulting key */ ) ++{ ++ directory_entry_format *dent; ++ ++ assert("nikita-1155", coord != NULL); ++ assert("nikita-1156", key != NULL); ++ ++ dent = entry_at(coord, idx_of(coord)); ++ return extract_key_from_id(&dent->id, key); ++} ++ ++int ++update_key_cde(const coord_t * coord, const reiser4_key * key, ++ lock_handle * lh UNUSED_ARG) ++{ ++ directory_entry_format *dent; ++ obj_key_id obj_id; ++ int result; ++ ++ assert("nikita-2344", coord != NULL); ++ assert("nikita-2345", key != NULL); ++ ++ dent = entry_at(coord, idx_of(coord)); ++ result = build_obj_key_id(key, &obj_id); ++ if (result == 0) { ++ dent->id = obj_id; ++ znode_make_dirty(coord->node); ++ } ++ return 0; ++} ++ ++/* ->s.dir.extract_name() method for this item plugin. */ ++char *extract_name_cde(const coord_t * coord /* coord of item */ , char *buf) ++{ ++ directory_entry_format *dent; ++ ++ assert("nikita-1157", coord != NULL); ++ ++ dent = entry_at(coord, idx_of(coord)); ++ return extract_dent_name(coord, dent, buf); ++} ++ ++static int cde_bytes(int pasting, const reiser4_item_data * data) ++{ ++ int result; ++ ++ result = data->length; ++ if (!pasting) ++ result -= sizeof(cde_item_format); ++ return result; ++} ++ ++/* ->s.dir.add_entry() method for this item plugin */ ++int add_entry_cde(struct inode *dir /* directory object */ , ++ coord_t * coord /* coord of item */ , ++ lock_handle * lh /* lock handle for insertion */ , ++ const struct dentry *name /* name to insert */ , ++ reiser4_dir_entry_desc * dir_entry /* parameters of new ++ * directory entry */ ) ++{ ++ reiser4_item_data data; ++ cde_entry entry; ++ cde_entry_data edata; ++ int result; ++ ++ assert("nikita-1656", coord->node == lh->node); ++ assert("nikita-1657", znode_is_write_locked(coord->node)); ++ ++ edata.num_of_entries = 1; ++ edata.entry = &entry; ++ ++ entry.dir = dir; ++ entry.obj = dir_entry->obj; ++ entry.name = &name->d_name; ++ ++ data.data = (char *)&edata; ++ data.user = 0; /* &edata is not user space */ ++ data.iplug = item_plugin_by_id(COMPOUND_DIR_ID); ++ data.arg = dir_entry; ++ assert("nikita-1302", data.iplug != NULL); ++ ++ result = is_dot_key(&dir_entry->key); ++ data.length = estimate_cde(result ? coord : NULL, &data); ++ ++ /* NOTE-NIKITA quota plugin? 
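++	   For now the directory is charged directly via
++	   DQUOT_ALLOC_SPACE_NODIRTY() below; rem_entry_cde() releases
++	   the space again with DQUOT_FREE_SPACE_NODIRTY().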
*/ ++ if (DQUOT_ALLOC_SPACE_NODIRTY(dir, cde_bytes(result, &data))) ++ return RETERR(-EDQUOT); ++ ++ if (result) ++ result = insert_by_coord(coord, &data, &dir_entry->key, lh, 0); ++ else ++ result = resize_item(coord, &data, &dir_entry->key, lh, 0); ++ return result; ++} ++ ++/* ->s.dir.rem_entry() */ ++int rem_entry_cde(struct inode *dir /* directory of item */ , ++ const struct qstr *name, coord_t * coord /* coord of item */ , ++ lock_handle * lh UNUSED_ARG /* lock handle for ++ * removal */ , ++ reiser4_dir_entry_desc * entry UNUSED_ARG /* parameters of ++ * directory entry ++ * being removed */ ) ++{ ++ coord_t shadow; ++ int result; ++ int length; ++ ON_DEBUG(char buf[DE_NAME_BUF_LEN]); ++ ++ assert("nikita-2870", strlen(name->name) == name->len); ++ assert("nikita-2869", ++ !strcmp(name->name, extract_name_cde(coord, buf))); ++ ++ length = sizeof(directory_entry_format) + sizeof(cde_unit_header); ++ if (is_longname(name->name, name->len)) ++ length += name->len + 1; ++ ++ if (inode_get_bytes(dir) < length) { ++ warning("nikita-2628", "Dir is broke: %llu: %llu", ++ (unsigned long long)get_inode_oid(dir), ++ inode_get_bytes(dir)); ++ ++ return RETERR(-EIO); ++ } ++ ++ /* cut_node() is supposed to take pointers to _different_ ++ coords, because it will modify them without respect to ++ possible aliasing. To work around this, create temporary copy ++ of @coord. ++ */ ++ coord_dup(&shadow, coord); ++ result = ++ kill_node_content(coord, &shadow, NULL, NULL, NULL, NULL, NULL, 0); ++ if (result == 0) { ++ /* NOTE-NIKITA quota plugin? */ ++ DQUOT_FREE_SPACE_NODIRTY(dir, length); ++ } ++ return result; ++} ++ ++/* ->s.dir.max_name_len() method for this item plugin */ ++int max_name_len_cde(const struct inode *dir /* directory */ ) ++{ ++ return ++ tree_by_inode(dir)->nplug->max_item_size() - ++ sizeof(directory_entry_format) - sizeof(cde_item_format) - ++ sizeof(cde_unit_header) - 2; ++} ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +Index: linux-2.6.16/fs/reiser4/plugin/item/cde.h +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/plugin/item/cde.h +@@ -0,0 +1,87 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++/* Compound directory item. See cde.c for description. 
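++
++   Declares both the generic item operations (plugin->item.b.*) and the
++   directory-entry operations (plugin->u.item.s.dir.*) implemented there,
++   together with the on-disk cde_item_format layout.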
*/ ++ ++#if !defined( __FS_REISER4_PLUGIN_COMPRESSED_DE_H__ ) ++#define __FS_REISER4_PLUGIN_COMPRESSED_DE_H__ ++ ++#include "../../forward.h" ++#include "../../kassign.h" ++#include "../../dformat.h" ++ ++#include /* for struct inode */ ++#include /* for struct dentry, etc */ ++ ++typedef struct cde_unit_header { ++ de_id hash; ++ d16 offset; ++} cde_unit_header; ++ ++typedef struct cde_item_format { ++ d16 num_of_entries; ++ cde_unit_header entry[0]; ++} cde_item_format; ++ ++typedef struct cde_entry { ++ const struct inode *dir; ++ const struct inode *obj; ++ const struct qstr *name; ++} cde_entry; ++ ++typedef struct cde_entry_data { ++ int num_of_entries; ++ cde_entry *entry; ++} cde_entry_data; ++ ++/* plugin->item.b.* */ ++reiser4_key *max_key_inside_cde(const coord_t * coord, reiser4_key * result); ++int can_contain_key_cde(const coord_t * coord, const reiser4_key * key, ++ const reiser4_item_data *); ++int mergeable_cde(const coord_t * p1, const coord_t * p2); ++pos_in_node_t nr_units_cde(const coord_t * coord); ++reiser4_key *unit_key_cde(const coord_t * coord, reiser4_key * key); ++int estimate_cde(const coord_t * coord, const reiser4_item_data * data); ++void print_cde(const char *prefix, coord_t * coord); ++int init_cde(coord_t * coord, coord_t * from, reiser4_item_data * data); ++lookup_result lookup_cde(const reiser4_key * key, lookup_bias bias, ++ coord_t * coord); ++int paste_cde(coord_t * coord, reiser4_item_data * data, ++ carry_plugin_info * info UNUSED_ARG); ++int can_shift_cde(unsigned free_space, coord_t * coord, znode * target, ++ shift_direction pend, unsigned *size, unsigned want); ++void copy_units_cde(coord_t * target, coord_t * source, unsigned from, ++ unsigned count, shift_direction where_is_free_space, ++ unsigned free_space); ++int cut_units_cde(coord_t * coord, pos_in_node_t from, pos_in_node_t to, ++ struct carry_cut_data *, reiser4_key * smallest_removed, ++ reiser4_key * new_first); ++int kill_units_cde(coord_t * coord, pos_in_node_t from, pos_in_node_t to, ++ struct carry_kill_data *, reiser4_key * smallest_removed, ++ reiser4_key * new_first); ++void print_cde(const char *prefix, coord_t * coord); ++int check_cde(const coord_t * coord, const char **error); ++ ++/* plugin->u.item.s.dir.* */ ++int extract_key_cde(const coord_t * coord, reiser4_key * key); ++int update_key_cde(const coord_t * coord, const reiser4_key * key, ++ lock_handle * lh); ++char *extract_name_cde(const coord_t * coord, char *buf); ++int add_entry_cde(struct inode *dir, coord_t * coord, ++ lock_handle * lh, const struct dentry *name, ++ reiser4_dir_entry_desc * entry); ++int rem_entry_cde(struct inode *dir, const struct qstr *name, coord_t * coord, ++ lock_handle * lh, reiser4_dir_entry_desc * entry); ++int max_name_len_cde(const struct inode *dir); ++ ++/* __FS_REISER4_PLUGIN_COMPRESSED_DE_H__ */ ++#endif ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +Index: linux-2.6.16/fs/reiser4/plugin/item/ctail.c +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/plugin/item/ctail.c +@@ -0,0 +1,1588 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++/* ctails (aka "clustered tails") are items for cryptcompress objects */ ++ ++/* DESCRIPTION: ++ ++Each cryptcompress object is stored on disk as a set of clusters sliced ++into ctails. 
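++
++A ctail never spans disk cluster boundaries: ctails of different
++clusters are not mergeable (see mergeable_ctail() below), and the first
++item of a disk cluster is recognized by a key offset that is a multiple
++of the disk cluster size (see is_disk_cluster_key()). For example, a
++cluster shift of 14 gives a 16K disk cluster, so only key offsets that
++are multiples of 16384 can start one.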
++ ++Internal on-disk structure: ++ ++ HEADER (1) Here stored disk cluster shift ++ BODY ++*/ ++ ++#include "../../forward.h" ++#include "../../debug.h" ++#include "../../dformat.h" ++#include "../../kassign.h" ++#include "../../key.h" ++#include "../../coord.h" ++#include "item.h" ++#include "../node/node.h" ++#include "../plugin.h" ++#include "../object.h" ++#include "../../znode.h" ++#include "../../carry.h" ++#include "../../tree.h" ++#include "../../inode.h" ++#include "../../super.h" ++#include "../../context.h" ++#include "../../page_cache.h" ++#include "../cluster.h" ++#include "../../flush.h" ++#include "../../tree_walk.h" ++ ++#include ++#include ++#include ++ ++/* return body of ctail item at @coord */ ++static ctail_item_format *ctail_formatted_at(const coord_t * coord) ++{ ++ assert("edward-60", coord != NULL); ++ return item_body_by_coord(coord); ++} ++ ++int cluster_shift_by_coord(const coord_t * coord) ++{ ++ return get_unaligned(&ctail_formatted_at(coord)->cluster_shift); ++} ++ ++static loff_t off_by_coord(const coord_t * coord) ++{ ++ reiser4_key key; ++ return get_key_offset(item_key_by_coord(coord, &key)); ++} ++ ++static int coord_is_unprepped_ctail(const coord_t * coord) ++{ ++ assert("edward-1233", coord != NULL); ++ assert("edward-1234", item_id_by_coord(coord) == CTAIL_ID); ++ assert("edward-1235", ++ ergo((int)cluster_shift_by_coord(coord) == (int)UCTAIL_SHIFT, ++ nr_units_ctail(coord) == (pos_in_node_t) UCTAIL_NR_UNITS)); ++ ++ return (int)cluster_shift_by_coord(coord) == (int)UCTAIL_SHIFT; ++} ++ ++static cloff_t clust_by_coord(const coord_t * coord, struct inode *inode) ++{ ++ int shift; ++ ++ if (inode != NULL) { ++ shift = inode_cluster_shift(inode); ++ assert("edward-1236", ++ ergo(!coord_is_unprepped_ctail(coord), ++ shift == cluster_shift_by_coord(coord))); ++ } else { ++ assert("edward-1237", !coord_is_unprepped_ctail(coord)); ++ shift = cluster_shift_by_coord(coord); ++ } ++ return off_by_coord(coord) >> shift; ++} ++ ++static int disk_cluster_size(const coord_t * coord) ++{ ++ assert("edward-1156", ++ item_plugin_by_coord(coord) == item_plugin_by_id(CTAIL_ID)); ++ /* calculation of disk cluster size ++ is meaninless if ctail is unprepped */ ++ assert("edward-1238", !coord_is_unprepped_ctail(coord)); ++ ++ return 1 << cluster_shift_by_coord(coord); ++} ++ ++/* true if the key is of first disk cluster item */ ++static int is_disk_cluster_key(const reiser4_key * key, const coord_t * coord) ++{ ++ assert("edward-1239", item_id_by_coord(coord) == CTAIL_ID); ++ ++ return coord_is_unprepped_ctail(coord) || ++ ((get_key_offset(key) & ++ ((loff_t) disk_cluster_size(coord) - 1)) == 0); ++} ++ ++static char *first_unit(coord_t * coord) ++{ ++ /* FIXME: warning: pointer of type `void *' used in arithmetic */ ++ return (char *)item_body_by_coord(coord) + sizeof(ctail_item_format); ++} ++ ++/* plugin->u.item.b.max_key_inside : ++ tail_max_key_inside */ ++ ++/* plugin->u.item.b.can_contain_key */ ++int ++can_contain_key_ctail(const coord_t * coord, const reiser4_key * key, ++ const reiser4_item_data * data) ++{ ++ reiser4_key item_key; ++ ++ if (item_plugin_by_coord(coord) != data->iplug) ++ return 0; ++ ++ item_key_by_coord(coord, &item_key); ++ if (get_key_locality(key) != get_key_locality(&item_key) || ++ get_key_objectid(key) != get_key_objectid(&item_key)) ++ return 0; ++ if (get_key_offset(&item_key) + nr_units_ctail(coord) != ++ get_key_offset(key)) ++ return 0; ++ if (is_disk_cluster_key(key, coord)) ++ return 0; ++ return 1; ++} ++ ++/* 
plugin->u.item.b.mergeable ++ c-tails of different clusters are not mergeable */ ++int mergeable_ctail(const coord_t * p1, const coord_t * p2) ++{ ++ reiser4_key key1, key2; ++ ++ assert("edward-62", item_id_by_coord(p1) == CTAIL_ID); ++ assert("edward-61", ++ item_type_by_coord(p1) == UNIX_FILE_METADATA_ITEM_TYPE); ++ ++ if (item_id_by_coord(p2) != CTAIL_ID) { ++ /* second item is of another type */ ++ return 0; ++ } ++ ++ item_key_by_coord(p1, &key1); ++ item_key_by_coord(p2, &key2); ++ if (get_key_locality(&key1) != get_key_locality(&key2) || ++ get_key_objectid(&key1) != get_key_objectid(&key2) || ++ get_key_type(&key1) != get_key_type(&key2)) { ++ /* items of different objects */ ++ return 0; ++ } ++ if (get_key_offset(&key1) + nr_units_ctail(p1) != get_key_offset(&key2)) ++ /* not adjacent items */ ++ return 0; ++ if (is_disk_cluster_key(&key2, p2)) ++ return 0; ++ return 1; ++} ++ ++/* plugin->u.item.b.nr_units */ ++pos_in_node_t nr_units_ctail(const coord_t * coord) ++{ ++ return (item_length_by_coord(coord) - ++ sizeof(ctail_formatted_at(coord)->cluster_shift)); ++} ++ ++/* plugin->u.item.b.estimate: ++ estimate how much space is needed to insert/paste @data->length bytes ++ into ctail at @coord */ ++int estimate_ctail(const coord_t * coord /* coord of item */ , ++ const reiser4_item_data * ++ data /* parameters for new item */ ) ++{ ++ if (coord == NULL) ++ /* insert */ ++ return (sizeof(ctail_item_format) + data->length); ++ else ++ /* paste */ ++ return data->length; ++} ++ ++/* ->init() method for this item plugin. */ ++int init_ctail(coord_t * to /* coord of item */ , ++ coord_t * from /* old_item */ , ++ reiser4_item_data * data /* structure used for insertion */ ) ++{ ++ int cluster_shift; /* cpu value to convert */ ++ ++ if (data) { ++ assert("edward-463", data->length > sizeof(ctail_item_format)); ++ cluster_shift = *((int *)(data->arg)); ++ data->length -= sizeof(ctail_item_format); ++ } else { ++ assert("edward-464", from != NULL); ++ assert("edward-855", ctail_ok(from)); ++ cluster_shift = (int)(cluster_shift_by_coord(from)); ++ } ++ put_unaligned((d8)cluster_shift, &ctail_formatted_at(to)->cluster_shift); ++ assert("edward-856", ctail_ok(to)); ++ return 0; ++} ++ ++/* plugin->u.item.b.lookup: ++ NULL: We are looking for item keys only */ ++ ++#if REISER4_DEBUG ++int ctail_ok(const coord_t * coord) ++{ ++ return coord_is_unprepped_ctail(coord) || ++ cluster_shift_ok(cluster_shift_by_coord(coord)); ++} ++ ++/* plugin->u.item.b.check */ ++int check_ctail(const coord_t * coord, const char **error) ++{ ++ if (!ctail_ok(coord)) { ++ if (error) ++ *error = "bad cluster shift in ctail"; ++ return 1; ++ } ++ return 0; ++} ++#endif ++ ++/* plugin->u.item.b.paste */ ++int ++paste_ctail(coord_t * coord, reiser4_item_data * data, ++ carry_plugin_info * info UNUSED_ARG) ++{ ++ unsigned old_nr_units; ++ ++ assert("edward-268", data->data != NULL); ++ /* copy only from kernel space */ ++ assert("edward-66", data->user == 0); ++ ++ old_nr_units = ++ item_length_by_coord(coord) - sizeof(ctail_item_format) - ++ data->length; ++ ++ /* ctail items never get pasted in the middle */ ++ ++ if (coord->unit_pos == 0 && coord->between == AT_UNIT) { ++ ++ /* paste at the beginning when create new item */ ++ assert("edward-450", ++ item_length_by_coord(coord) == ++ data->length + sizeof(ctail_item_format)); ++ assert("edward-451", old_nr_units == 0); ++ } else if (coord->unit_pos == old_nr_units - 1 ++ && coord->between == AFTER_UNIT) { ++ ++ /* paste at the end */ ++ coord->unit_pos++; ++ } else 
++ impossible("edward-453", "bad paste position"); ++ ++ memcpy(first_unit(coord) + coord->unit_pos, data->data, data->length); ++ ++ assert("edward-857", ctail_ok(coord)); ++ ++ return 0; ++} ++ ++/* plugin->u.item.b.fast_paste */ ++ ++/* plugin->u.item.b.can_shift ++ number of units is returned via return value, number of bytes via @size. For ++ ctail items they coincide */ ++int ++can_shift_ctail(unsigned free_space, coord_t * source, ++ znode * target, shift_direction direction UNUSED_ARG, ++ unsigned *size /* number of bytes */ , unsigned want) ++{ ++ /* make sure that that we do not want to shift more than we have */ ++ assert("edward-68", want > 0 && want <= nr_units_ctail(source)); ++ ++ *size = min(want, free_space); ++ ++ if (!target) { ++ /* new item will be created */ ++ if (*size <= sizeof(ctail_item_format)) { ++ *size = 0; ++ return 0; ++ } ++ return *size - sizeof(ctail_item_format); ++ } ++ return *size; ++} ++ ++/* plugin->u.item.b.copy_units ++ cooperates with ->can_shift() */ ++void ++copy_units_ctail(coord_t * target, coord_t * source, ++ unsigned from, unsigned count /* units */ , ++ shift_direction where_is_free_space, ++ unsigned free_space /* bytes */ ) ++{ ++ /* make sure that item @target is expanded already */ ++ assert("edward-69", (unsigned)item_length_by_coord(target) >= count); ++ assert("edward-70", free_space == count || free_space == count + 1); ++ ++ assert("edward-858", ctail_ok(source)); ++ ++ if (where_is_free_space == SHIFT_LEFT) { ++ /* append item @target with @count first bytes of @source: ++ this restriction came from ordinary tails */ ++ assert("edward-71", from == 0); ++ assert("edward-860", ctail_ok(target)); ++ ++ memcpy(first_unit(target) + nr_units_ctail(target) - count, ++ first_unit(source), count); ++ } else { ++ /* target item is moved to right already */ ++ reiser4_key key; ++ ++ assert("edward-72", nr_units_ctail(source) == from + count); ++ ++ if (free_space == count) { ++ init_ctail(target, source, NULL); ++ } else { ++ /* new item has been created */ ++ assert("edward-862", ctail_ok(target)); ++ } ++ memcpy(first_unit(target), first_unit(source) + from, count); ++ ++ assert("edward-863", ctail_ok(target)); ++ ++ /* new units are inserted before first unit in an item, ++ therefore, we have to update item key */ ++ item_key_by_coord(source, &key); ++ set_key_offset(&key, get_key_offset(&key) + from); ++ ++ node_plugin_by_node(target->node)->update_item_key(target, &key, ++ NULL /*info */); ++ } ++} ++ ++/* plugin->u.item.b.create_hook */ ++int create_hook_ctail(const coord_t * coord, void *arg) ++{ ++ assert("edward-864", znode_is_loaded(coord->node)); ++ ++ znode_set_convertible(coord->node); ++ return 0; ++} ++ ++/* plugin->u.item.b.kill_hook */ ++int ++kill_hook_ctail(const coord_t * coord, pos_in_node_t from, pos_in_node_t count, ++ carry_kill_data * kdata) ++{ ++ struct inode *inode; ++ ++ assert("edward-1157", item_id_by_coord(coord) == CTAIL_ID); ++ assert("edward-291", znode_is_write_locked(coord->node)); ++ ++ inode = kdata->inode; ++ if (inode) { ++ reiser4_key key; ++ item_key_by_coord(coord, &key); ++ ++ if (from == 0 && is_disk_cluster_key(&key, coord)) { ++ cloff_t start = ++ off_to_clust(get_key_offset(&key), inode); ++ truncate_page_cluster(inode, start); ++ } ++ } ++ return 0; ++} ++ ++/* for shift_hook_ctail(), ++ return true if the first disk cluster item has dirty child ++*/ ++static int ctail_convertible(const coord_t * coord) ++{ ++ int result; ++ reiser4_key key; ++ jnode *child = NULL; ++ ++ 
assert("edward-477", coord != NULL); ++ assert("edward-478", item_id_by_coord(coord) == CTAIL_ID); ++ ++ if (coord_is_unprepped_ctail(coord)) ++ /* unprepped ctail should be converted */ ++ return 1; ++ ++ item_key_by_coord(coord, &key); ++ child = jlookup(current_tree, ++ get_key_objectid(&key), ++ off_to_pg(off_by_coord(coord))); ++ if (!child) ++ return 0; ++ result = JF_ISSET(child, JNODE_DIRTY); ++ jput(child); ++ return result; ++} ++ ++/* FIXME-EDWARD */ ++/* plugin->u.item.b.shift_hook */ ++int shift_hook_ctail(const coord_t * item /* coord of item */ , ++ unsigned from UNUSED_ARG /* start unit */ , ++ unsigned count UNUSED_ARG /* stop unit */ , ++ znode * old_node /* old parent */ ) ++{ ++ assert("edward-479", item != NULL); ++ assert("edward-480", item->node != old_node); ++ ++ if (!znode_convertible(old_node) || znode_convertible(item->node)) ++ return 0; ++ if (ctail_convertible(item)) ++ znode_set_convertible(item->node); ++ return 0; ++} ++ ++static int ++cut_or_kill_ctail_units(coord_t * coord, pos_in_node_t from, pos_in_node_t to, ++ int cut, void *p, reiser4_key * smallest_removed, ++ reiser4_key * new_first) ++{ ++ pos_in_node_t count; /* number of units to cut */ ++ char *item; ++ ++ count = to - from + 1; ++ item = item_body_by_coord(coord); ++ ++ assert("edward-74", ergo(from != 0, to == coord_last_unit_pos(coord))); ++ ++ if (smallest_removed) { ++ /* store smallest key removed */ ++ item_key_by_coord(coord, smallest_removed); ++ set_key_offset(smallest_removed, ++ get_key_offset(smallest_removed) + from); ++ } ++ ++ if (new_first) { ++ assert("vs-1531", from == 0); ++ ++ item_key_by_coord(coord, new_first); ++ set_key_offset(new_first, ++ get_key_offset(new_first) + from + count); ++ } ++ ++ if (!cut) ++ kill_hook_ctail(coord, from, 0, (struct carry_kill_data *)p); ++ ++ if (from == 0) { ++ if (count != nr_units_ctail(coord)) { ++ /* part of item is removed, so move free space at the beginning ++ of the item and update item key */ ++ reiser4_key key; ++ memcpy(item + to + 1, item, sizeof(ctail_item_format)); ++ item_key_by_coord(coord, &key); ++ set_key_offset(&key, get_key_offset(&key) + count); ++ node_plugin_by_node(coord->node)->update_item_key(coord, ++ &key, ++ NULL); ++ } else { ++ /* cut_units should not be called to cut evrything */ ++ assert("vs-1532", ergo(cut, 0)); ++ /* whole item is cut, so more then amount of space occupied ++ by units got freed */ ++ count += sizeof(ctail_item_format); ++ } ++ if (REISER4_DEBUG) ++ memset(item, 0, count); ++ } else if (REISER4_DEBUG) ++ memset(item + sizeof(ctail_item_format) + from, 0, count); ++ return count; ++} ++ ++/* plugin->u.item.b.cut_units */ ++int ++cut_units_ctail(coord_t * item, pos_in_node_t from, pos_in_node_t to, ++ carry_cut_data * cdata, reiser4_key * smallest_removed, ++ reiser4_key * new_first) ++{ ++ return cut_or_kill_ctail_units(item, from, to, 1, NULL, ++ smallest_removed, new_first); ++} ++ ++/* plugin->u.item.b.kill_units */ ++int ++kill_units_ctail(coord_t * item, pos_in_node_t from, pos_in_node_t to, ++ struct carry_kill_data *kdata, reiser4_key * smallest_removed, ++ reiser4_key * new_first) ++{ ++ return cut_or_kill_ctail_units(item, from, to, 0, kdata, ++ smallest_removed, new_first); ++} ++ ++/* plugin->u.item.s.file.read */ ++int read_ctail(struct file *file UNUSED_ARG, flow_t * f, hint_t * hint) ++{ ++ uf_coord_t *uf_coord; ++ coord_t *coord; ++ ++ uf_coord = &hint->ext_coord; ++ coord = &uf_coord->coord; ++ assert("edward-127", f->user == 0); ++ assert("edward-129", coord && 
coord->node); ++ assert("edward-130", coord_is_existing_unit(coord)); ++ assert("edward-132", znode_is_loaded(coord->node)); ++ ++ /* start read only from the beginning of ctail */ ++ assert("edward-133", coord->unit_pos == 0); ++ /* read only whole ctails */ ++ assert("edward-135", nr_units_ctail(coord) <= f->length); ++ ++ assert("edward-136", schedulable()); ++ assert("edward-886", ctail_ok(coord)); ++ ++ if (f->data) ++ memcpy(f->data, (char *)first_unit(coord), ++ (size_t) nr_units_ctail(coord)); ++ ++ dclust_set_extension(hint); ++ mark_page_accessed(znode_page(coord->node)); ++ move_flow_forward(f, nr_units_ctail(coord)); ++ ++ return 0; ++} ++ ++/* Reads a disk cluster consists of ctail items, ++ attaches a transform stream with plain text */ ++int ctail_read_disk_cluster(reiser4_cluster_t * clust, struct inode *inode, ++ int write) ++{ ++ int result; ++ assert("edward-671", clust->hint != NULL); ++ assert("edward-140", clust->dstat == INVAL_DISK_CLUSTER); ++ assert("edward-672", crc_inode_ok(inode)); ++ ++ /* set input stream */ ++ result = grab_tfm_stream(inode, &clust->tc, INPUT_STREAM); ++ if (result) ++ return result; ++ ++ result = find_cluster(clust, inode, 1 /* read */ , write); ++ assert("edward-1340", !result); ++ if (result) ++ return result; ++ if (!write) ++ /* write still need the lock to insert unprepped ++ items, etc... */ ++ put_hint_cluster(clust, inode, ZNODE_READ_LOCK); ++ ++ assert("edward-673", ++ ergo(write, znode_is_write_locked(clust->hint->lh.node))); ++ ++ if (clust->dstat == FAKE_DISK_CLUSTER || ++ clust->dstat == UNPR_DISK_CLUSTER) { ++ tfm_cluster_set_uptodate(&clust->tc); ++ return 0; ++ } ++ result = grab_coa(&clust->tc, inode_compression_plugin(inode)); ++ if (result) ++ return result; ++ result = inflate_cluster(clust, inode); ++ if (result) ++ return result; ++ tfm_cluster_set_uptodate(&clust->tc); ++ return 0; ++} ++ ++/* read one locked page */ ++int do_readpage_ctail(struct inode * inode, reiser4_cluster_t * clust, ++ struct page *page) ++{ ++ int ret; ++ unsigned cloff; ++ char *data; ++ size_t pgcnt; ++ tfm_cluster_t *tc = &clust->tc; ++ ++ assert("edward-212", PageLocked(page)); ++ ++ if (PageUptodate(page)) ++ goto exit; ++ ++ if (!tfm_cluster_is_uptodate(&clust->tc)) { ++ clust->index = pg_to_clust(page->index, inode); ++ unlock_page(page); ++ ret = ctail_read_disk_cluster(clust, inode, 0 /* read */ ); ++ lock_page(page); ++ if (ret) ++ return ret; ++ } ++ if (PageUptodate(page)) ++ /* races with another read/write */ ++ goto exit; ++ ++ /* bytes in the page */ ++ pgcnt = cnt_to_pgcnt(i_size_read(inode), page->index); ++ ++ if (pgcnt == 0) { ++ assert("edward-1290", 0); ++ return RETERR(-EINVAL); ++ } ++ assert("edward-119", tfm_cluster_is_uptodate(tc)); ++ ++ switch (clust->dstat) { ++ case UNPR_DISK_CLUSTER: ++ assert("edward-1285", 0); ++#if REISER4_DEBUG ++ warning("edward-1168", ++ "page %lu is not uptodate and disk cluster %lu (inode %llu) is unprepped\n", ++ page->index, clust->index, ++ (unsigned long long)get_inode_oid(inode)); ++#endif ++ case FAKE_DISK_CLUSTER: ++ /* fill the page by zeroes */ ++ data = kmap_atomic(page, KM_USER0); ++ ++ memset(data, 0, PAGE_CACHE_SIZE); ++ flush_dcache_page(page); ++ kunmap_atomic(data, KM_USER0); ++ SetPageUptodate(page); ++ break; ++ case PREP_DISK_CLUSTER: ++ /* fill the page by transformed data */ ++ assert("edward-1058", !PageUptodate(page)); ++ assert("edward-120", tc->len <= inode_cluster_size(inode)); ++ ++ /* start page offset in the cluster */ ++ cloff = 
pg_to_off_to_cloff(page->index, inode); ++ ++ data = kmap(page); ++ memcpy(data, tfm_stream_data(tc, OUTPUT_STREAM) + cloff, pgcnt); ++ memset(data + pgcnt, 0, (size_t) PAGE_CACHE_SIZE - pgcnt); ++ flush_dcache_page(page); ++ kunmap(page); ++ SetPageUptodate(page); ++ break; ++ default: ++ impossible("edward-1169", "bad disk cluster state"); ++ } ++ exit: ++ return 0; ++} ++ ++/* plugin->u.item.s.file.readpage */ ++int readpage_ctail(void *vp, struct page *page) ++{ ++ int result; ++ hint_t *hint; ++ reiser4_cluster_t *clust = vp; ++ ++ assert("edward-114", clust != NULL); ++ assert("edward-115", PageLocked(page)); ++ assert("edward-116", !PageUptodate(page)); ++ assert("edward-117", !jprivate(page) && !PagePrivate(page)); ++ assert("edward-118", page->mapping && page->mapping->host); ++ assert("edward-867", !tfm_cluster_is_uptodate(&clust->tc)); ++ ++ hint = kmalloc(sizeof(*hint), GFP_KERNEL); ++ if (hint == NULL) ++ return RETERR(-ENOMEM); ++ clust->hint = hint; ++ result = load_file_hint(clust->file, hint); ++ if (result) { ++ kfree(hint); ++ return result; ++ } ++ assert("vs-25", hint->ext_coord.lh == &hint->lh); ++ result = do_readpage_ctail(page->mapping->host, clust, page); ++ ++ assert("edward-213", PageLocked(page)); ++ assert("edward-1163", ergo(!result, PageUptodate(page))); ++ assert("edward-868", ++ ergo(!result, tfm_cluster_is_uptodate(&clust->tc))); ++ ++ unlock_page(page); ++ done_lh(&hint->lh); ++ hint->ext_coord.valid = 0; ++ save_file_hint(clust->file, hint); ++ kfree(hint); ++ tfm_cluster_clr_uptodate(&clust->tc); ++ ++ return result; ++} ++ ++/* This unconditionally reads a disk cluster. ++ Helper function for ->readpages() */ ++static int ++ctail_read_page_cluster(reiser4_cluster_t * clust, struct inode *inode) ++{ ++ int i; ++ int result; ++ assert("edward-779", clust != NULL); ++ assert("edward-1059", clust->win == NULL); ++ assert("edward-780", inode != NULL); ++ ++ result = prepare_page_cluster(inode, clust, 0 /* do not capture */ ); ++ if (result) ++ return result; ++ result = ctail_read_disk_cluster(clust, inode, 0 /* read */ ); ++ if (result) ++ goto out; ++ /* at this point stream with valid plain text is attached */ ++ assert("edward-781", tfm_cluster_is_uptodate(&clust->tc)); ++ ++ for (i = 0; i < clust->nr_pages; i++) { ++ struct page *page = clust->pages[i]; ++ lock_page(page); ++ result = do_readpage_ctail(inode, clust, page); ++ unlock_page(page); ++ if (result) ++ break; ++ } ++ tfm_cluster_clr_uptodate(&clust->tc); ++ out: ++ release_cluster_pages(clust); ++ return result; ++} ++ ++#define list_to_page(head) (list_entry((head)->prev, struct page, lru)) ++#define list_to_next_page(head) (list_entry((head)->prev->prev, struct page, lru)) ++ ++#if REISER4_DEBUG ++#define check_order(pages) \ ++assert("edward-214", ergo(!list_empty(pages) && pages->next != pages->prev, \ ++ list_to_page(pages)->index < list_to_next_page(pages)->index)) ++#endif ++ ++/* plugin->u.item.s.file.readpages ++ Populate an address space with some page clusters, ++ and start reads against them. ++ FIXME-EDWARD: this function should return errors? 
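   As a rough stand-alone sketch of the index arithmetic this workaround
   relies on (a user-space model assuming the usual convention that a
   logical cluster spans 2^cluster_shift pages; pg_to_clust_model() and
   cluster_nrpages_model() only mirror the kernel helpers, they are not
   the real ones):

	#include <assert.h>

	// cluster index that a given page index falls into
	static unsigned long pg_to_clust_model(unsigned long pg, int shift)
	{
		return pg >> shift;
	}

	// number of pages per logical cluster
	static unsigned long cluster_nrpages_model(int shift)
	{
		return 1UL << shift;
	}

	int main(void)
	{
		// with cluster_shift == 2, pages 0..3 share cluster 0,
		// so a readahead hit on page 3 populates pages 0..3
		assert(pg_to_clust_model(3, 2) == 0);
		assert(pg_to_clust_model(4, 2) == 1);
		assert(cluster_nrpages_model(2) == 4);
		return 0;
	}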
++*/ ++void ++readpages_ctail(void *vp, struct address_space *mapping, ++ struct list_head *pages) ++{ ++ int ret = 0; ++ hint_t *hint; ++ reiser4_cluster_t clust; ++ struct page *page; ++ struct pagevec lru_pvec; ++ struct inode *inode = mapping->host; ++ int progress = 0; ++ ++ assert("edward-214", ergo(!list_empty(pages) && ++ pages->next != pages->prev, ++ list_to_page(pages)->index < ++ list_to_next_page(pages)->index)); ++ pagevec_init(&lru_pvec, 0); ++ cluster_init_read(&clust, NULL); ++ clust.file = vp; ++ hint = kmalloc(sizeof(*hint), GFP_KERNEL); ++ if (hint == NULL) { ++ warning("vs-28", "failed to allocate hint"); ++ goto exit1; ++ } ++ clust.hint = hint; ++ ret = load_file_hint(clust.file, hint); ++ if (ret) ++ goto exit2; ++ ret = alloc_cluster_pgset(&clust, cluster_nrpages(inode)); ++ if (ret) ++ goto exit3; ++ assert("vs-26", hint->ext_coord.lh == &hint->lh); ++ ++ /* address_space-level file readahead doesn't know about ++ reiser4 concept of clustering, so we work around this ++ fact: with each page of the list @pages address space ++ will be populated with the whole page cluster. ++ */ ++ while (!list_empty(pages)) { ++ page = list_to_page(pages); ++ list_del(&page->lru); ++ if (add_to_page_cache(page, mapping, page->index, GFP_KERNEL)) { ++ page_cache_release(page); ++ continue; ++ } ++ if (PageUptodate(page)) { ++ if (!pagevec_add(&lru_pvec, page)) ++ __pagevec_lru_add(&lru_pvec); ++ unlock_page(page); ++ continue; ++ } ++ unlock_page(page); ++ ++ move_cluster_forward(&clust, inode, page->index, &progress); ++ ret = ctail_read_page_cluster(&clust, inode); ++ if (ret) ++ break; ++ assert("edward-869", !tfm_cluster_is_uptodate(&clust.tc)); ++ lock_page(page); ++ ++ ret = do_readpage_ctail(inode, &clust, page); ++ if (!pagevec_add(&lru_pvec, page)) ++ __pagevec_lru_add(&lru_pvec); ++ if (ret) { ++ warning("edward-215", "do_readpage_ctail failed"); ++ unlock_page(page); ++ break; ++ } ++ assert("edward-1061", PageUptodate(page)); ++ ++ unlock_page(page); ++ } ++ assert("edward-870", !tfm_cluster_is_uptodate(&clust.tc)); ++ exit3: ++ done_lh(&hint->lh); ++ save_file_hint(clust.file, hint); ++ hint->ext_coord.valid = 0; ++ exit2: ++ kfree(hint); ++ exit1: ++ while (!list_empty(pages)) { ++ struct page *victim; ++ victim = list_to_page(pages); ++ list_del(&victim->lru); ++ page_cache_release(victim); ++ } ++ put_cluster_handle(&clust); ++ pagevec_lru_add(&lru_pvec); ++ return; ++} ++ ++/* ++ plugin->u.item.s.file.append_key ++ key of the first item of the next disk cluster ++*/ ++reiser4_key *append_key_ctail(const coord_t * coord, reiser4_key * key) ++{ ++ assert("edward-1241", item_id_by_coord(coord) == CTAIL_ID); ++ assert("edward-1242", cluster_shift_ok(cluster_shift_by_coord(coord))); ++ ++ item_key_by_coord(coord, key); ++ set_key_offset(key, ++ ((__u64) (clust_by_coord(coord, NULL)) + ++ 1) << cluster_shift_by_coord(coord)); ++ return key; ++} ++ ++static int ++insert_unprepped_ctail(reiser4_cluster_t * clust, struct inode *inode) ++{ ++ int result; ++ char buf[UCTAIL_NR_UNITS]; ++ reiser4_item_data data; ++ reiser4_key key; ++ int shift = (int)UCTAIL_SHIFT; ++ ++ memset(buf, 0, (size_t) UCTAIL_NR_UNITS); ++ result = key_by_inode_cryptcompress(inode, ++ clust_to_off(clust->index, inode), ++ &key); ++ if (result) ++ return result; ++ data.user = 0; ++ data.iplug = item_plugin_by_id(CTAIL_ID); ++ data.arg = &shift; ++ data.length = sizeof(ctail_item_format) + (size_t) UCTAIL_NR_UNITS; ++ data.data = buf; ++ ++ result = insert_by_coord(&clust->hint->ext_coord.coord, ++ 
&data, &key, clust->hint->ext_coord.lh, 0); ++ return result; ++} ++ ++static int ++insert_crc_flow(coord_t * coord, lock_handle * lh, flow_t * f, ++ struct inode *inode) ++{ ++ int result; ++ carry_pool *pool; ++ carry_level *lowest_level; ++ reiser4_item_data *data; ++ carry_op *op; ++ int cluster_shift = inode_cluster_shift(inode); ++ ++ pool = ++ init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) + ++ sizeof(*data)); ++ if (IS_ERR(pool)) ++ return PTR_ERR(pool); ++ lowest_level = (carry_level *) (pool + 1); ++ init_carry_level(lowest_level, pool); ++ data = (reiser4_item_data *) (lowest_level + 3); ++ ++ assert("edward-466", coord->between == AFTER_ITEM ++ || coord->between == AFTER_UNIT || coord->between == BEFORE_ITEM ++ || coord->between == EMPTY_NODE ++ || coord->between == BEFORE_UNIT); ++ ++ if (coord->between == AFTER_UNIT) { ++ coord->unit_pos = 0; ++ coord->between = AFTER_ITEM; ++ } ++ op = post_carry(lowest_level, COP_INSERT_FLOW, coord->node, ++ 0 /* operate directly on coord -> node */ ); ++ if (IS_ERR(op) || (op == NULL)) { ++ done_carry_pool(pool); ++ return RETERR(op ? PTR_ERR(op) : -EIO); ++ } ++ data->user = 0; ++ data->iplug = item_plugin_by_id(CTAIL_ID); ++ data->arg = &cluster_shift; ++ ++ data->length = 0; ++ data->data = NULL; ++ ++ op->u.insert_flow.flags = COPI_DONT_SHIFT_LEFT | COPI_DONT_SHIFT_RIGHT; ++ op->u.insert_flow.insert_point = coord; ++ op->u.insert_flow.flow = f; ++ op->u.insert_flow.data = data; ++ op->u.insert_flow.new_nodes = 0; ++ ++ lowest_level->track_type = CARRY_TRACK_CHANGE; ++ lowest_level->tracked = lh; ++ ++ result = carry(lowest_level, NULL); ++ done_carry_pool(pool); ++ ++ return result; ++} ++ ++/* Implementation of CRC_APPEND_ITEM mode of ctail conversion */ ++static int ++insert_crc_flow_in_place(coord_t * coord, lock_handle * lh, flow_t * f, ++ struct inode *inode) ++{ ++ int ret; ++ coord_t pos; ++ lock_handle lock; ++ ++ assert("edward-674", f->length <= inode_scaled_cluster_size(inode)); ++ assert("edward-484", coord->between == AT_UNIT ++ || coord->between == AFTER_ITEM); ++ assert("edward-485", item_id_by_coord(coord) == CTAIL_ID); ++ ++ coord_dup(&pos, coord); ++ pos.unit_pos = 0; ++ pos.between = AFTER_ITEM; ++ ++ init_lh(&lock); ++ copy_lh(&lock, lh); ++ ++ ret = insert_crc_flow(&pos, &lock, f, inode); ++ done_lh(&lock); ++ assert("edward-1347", znode_is_write_locked(lh->node)); ++ assert("edward-1228", !ret); ++ return ret; ++} ++ ++/* Implementation of CRC_OVERWRITE_ITEM mode of ctail conversion */ ++static int overwrite_ctail(coord_t * coord, flow_t * f) ++{ ++ unsigned count; ++ ++ assert("edward-269", f->user == 0); ++ assert("edward-270", f->data != NULL); ++ assert("edward-271", f->length > 0); ++ assert("edward-272", coord_is_existing_unit(coord)); ++ assert("edward-273", coord->unit_pos == 0); ++ assert("edward-274", znode_is_write_locked(coord->node)); ++ assert("edward-275", schedulable()); ++ assert("edward-467", item_id_by_coord(coord) == CTAIL_ID); ++ assert("edward-1243", ctail_ok(coord)); ++ ++ count = nr_units_ctail(coord); ++ ++ if (count > f->length) ++ count = f->length; ++ memcpy(first_unit(coord), f->data, count); ++ move_flow_forward(f, count); ++ coord->unit_pos += count; ++ return 0; ++} ++ ++/* Implementation of CRC_CUT_ITEM mode of ctail conversion: ++ cut ctail (part or whole) starting from next unit position */ ++static int cut_ctail(coord_t * coord) ++{ ++ coord_t stop; ++ ++ assert("edward-435", coord->between == AT_UNIT && ++ coord->item_pos < coord_num_items(coord) && ++ 
coord->unit_pos <= coord_num_units(coord)); ++ ++ if (coord->unit_pos == coord_num_units(coord)) ++ /* nothing to cut */ ++ return 0; ++ coord_dup(&stop, coord); ++ stop.unit_pos = coord_last_unit_pos(coord); ++ ++ return cut_node_content(coord, &stop, NULL, NULL, NULL); ++} ++ ++int ++ctail_insert_unprepped_cluster(reiser4_cluster_t * clust, struct inode *inode) ++{ ++ int result; ++ assert("edward-1244", inode != NULL); ++ assert("edward-1245", clust->hint != NULL); ++ assert("edward-1246", clust->dstat == FAKE_DISK_CLUSTER); ++ assert("edward-1247", clust->reserved == 1); ++ assert("edward-1248", get_current_context()->grabbed_blocks == ++ estimate_insert_cluster(inode)); ++ ++ result = get_disk_cluster_locked(clust, inode, ZNODE_WRITE_LOCK); ++ if (cbk_errored(result)) ++ return result; ++ assert("edward-1249", result == CBK_COORD_NOTFOUND); ++ assert("edward-1250", znode_is_write_locked(clust->hint->lh.node)); ++ ++ assert("edward-1295", ++ clust->hint->ext_coord.lh->node == ++ clust->hint->ext_coord.coord.node); ++ ++ coord_set_between_clusters(&clust->hint->ext_coord.coord); ++ ++ result = insert_unprepped_ctail(clust, inode); ++ all_grabbed2free(); ++ ++ assert("edward-1251", !result); ++ assert("edward-1252", crc_inode_ok(inode)); ++ assert("edward-1253", znode_is_write_locked(clust->hint->lh.node)); ++ assert("edward-1254", ++ reiser4_clustered_blocks(reiser4_get_current_sb())); ++ assert("edward-1255", ++ znode_convertible(clust->hint->ext_coord.coord.node)); ++ ++ return result; ++} ++ ++static int do_convert_ctail(flush_pos_t * pos, crc_write_mode_t mode) ++{ ++ int result = 0; ++ convert_item_info_t *info; ++ ++ assert("edward-468", pos != NULL); ++ assert("edward-469", pos->sq != NULL); ++ assert("edward-845", item_convert_data(pos) != NULL); ++ ++ info = item_convert_data(pos); ++ assert("edward-679", info->flow.data != NULL); ++ ++ switch (mode) { ++ case CRC_APPEND_ITEM: ++ assert("edward-1229", info->flow.length != 0); ++ assert("edward-1256", ++ cluster_shift_ok(cluster_shift_by_coord(&pos->coord))); ++ result = ++ insert_crc_flow_in_place(&pos->coord, &pos->lock, ++ &info->flow, info->inode); ++ break; ++ case CRC_OVERWRITE_ITEM: ++ assert("edward-1230", info->flow.length != 0); ++ overwrite_ctail(&pos->coord, &info->flow); ++ if (info->flow.length != 0) ++ break; ++ case CRC_CUT_ITEM: ++ assert("edward-1231", info->flow.length == 0); ++ result = cut_ctail(&pos->coord); ++ break; ++ default: ++ result = RETERR(-EIO); ++ impossible("edward-244", "bad convert mode"); ++ } ++ return result; ++} ++ ++/* plugin->u.item.f.scan */ ++int scan_ctail(flush_scan * scan) ++{ ++ int result = 0; ++ struct page *page; ++ struct inode *inode; ++ jnode *node = scan->node; ++ ++ assert("edward-227", scan->node != NULL); ++ assert("edward-228", jnode_is_cluster_page(scan->node)); ++ assert("edward-639", znode_is_write_locked(scan->parent_lock.node)); ++ ++ page = jnode_page(node); ++ inode = page->mapping->host; ++ ++ if (!scanning_left(scan)) ++ return result; ++ if (!ZF_ISSET(scan->parent_lock.node, JNODE_DIRTY)) ++ znode_make_dirty(scan->parent_lock.node); ++ ++ if (!znode_convertible(scan->parent_lock.node)) { ++ if (JF_ISSET(scan->node, JNODE_DIRTY)) ++ znode_set_convertible(scan->parent_lock.node); ++ else { ++ warning("edward-681", ++ "cluster page is already processed"); ++ return -EAGAIN; ++ } ++ } ++ return result; ++} ++ ++/* If true, this function attaches children */ ++static int should_attach_convert_idata(flush_pos_t * pos) ++{ ++ int result; ++ assert("edward-431", pos != 
NULL); ++ assert("edward-432", pos->child == NULL); ++ assert("edward-619", znode_is_write_locked(pos->coord.node)); ++ assert("edward-470", ++ item_plugin_by_coord(&pos->coord) == ++ item_plugin_by_id(CTAIL_ID)); ++ ++ /* check for leftmost child */ ++ utmost_child_ctail(&pos->coord, LEFT_SIDE, &pos->child); ++ ++ if (!pos->child) ++ return 0; ++ spin_lock_jnode(pos->child); ++ result = (JF_ISSET(pos->child, JNODE_DIRTY) && ++ pos->child->atom == ZJNODE(pos->coord.node)->atom); ++ spin_unlock_jnode(pos->child); ++ if (!result && pos->child) { ++ /* existing child isn't to attach, clear up this one */ ++ jput(pos->child); ++ pos->child = NULL; ++ } ++ return result; ++} ++ ++/* plugin->init_convert_data() */ ++static int ++init_convert_data_ctail(convert_item_info_t * idata, struct inode *inode) ++{ ++ assert("edward-813", idata != NULL); ++ assert("edward-814", inode != NULL); ++ ++ idata->inode = inode; ++ idata->d_cur = DC_FIRST_ITEM; ++ idata->d_next = DC_INVALID_STATE; ++ ++ return 0; ++} ++ ++static int alloc_item_convert_data(convert_info_t * sq) ++{ ++ assert("edward-816", sq != NULL); ++ assert("edward-817", sq->itm == NULL); ++ ++ sq->itm = kmalloc(sizeof(*sq->itm), GFP_KERNEL); ++ if (sq->itm == NULL) ++ return RETERR(-ENOMEM); ++ return 0; ++} ++ ++static void free_item_convert_data(convert_info_t * sq) ++{ ++ assert("edward-818", sq != NULL); ++ assert("edward-819", sq->itm != NULL); ++ assert("edward-820", sq->iplug != NULL); ++ ++ kfree(sq->itm); ++ sq->itm = NULL; ++ return; ++} ++ ++static int alloc_convert_data(flush_pos_t * pos) ++{ ++ assert("edward-821", pos != NULL); ++ assert("edward-822", pos->sq == NULL); ++ ++ pos->sq = kmalloc(sizeof(*pos->sq), GFP_KERNEL); ++ if (!pos->sq) ++ return RETERR(-ENOMEM); ++ memset(pos->sq, 0, sizeof(*pos->sq)); ++ cluster_init_write(&pos->sq->clust, 0); ++ return 0; ++} ++ ++void free_convert_data(flush_pos_t * pos) ++{ ++ convert_info_t *sq; ++ ++ assert("edward-823", pos != NULL); ++ assert("edward-824", pos->sq != NULL); ++ ++ sq = pos->sq; ++ if (sq->itm) ++ free_item_convert_data(sq); ++ put_cluster_handle(&sq->clust); ++ kfree(pos->sq); ++ pos->sq = NULL; ++ return; ++} ++ ++static int init_item_convert_data(flush_pos_t * pos, struct inode *inode) ++{ ++ convert_info_t *sq; ++ ++ assert("edward-825", pos != NULL); ++ assert("edward-826", pos->sq != NULL); ++ assert("edward-827", item_convert_data(pos) != NULL); ++ assert("edward-828", inode != NULL); ++ ++ sq = pos->sq; ++ ++ memset(sq->itm, 0, sizeof(*sq->itm)); ++ ++ /* iplug->init_convert_data() */ ++ return init_convert_data_ctail(sq->itm, inode); ++} ++ ++/* create and attach disk cluster info used by 'convert' phase of the flush ++ squalloc() */ ++static int attach_convert_idata(flush_pos_t * pos, struct inode *inode) ++{ ++ int ret = 0; ++ convert_item_info_t *info; ++ reiser4_cluster_t *clust; ++ file_plugin *fplug = inode_file_plugin(inode); ++ compression_plugin *cplug = inode_compression_plugin(inode); ++ ++ assert("edward-248", pos != NULL); ++ assert("edward-249", pos->child != NULL); ++ assert("edward-251", inode != NULL); ++ assert("edward-682", crc_inode_ok(inode)); ++ assert("edward-252", fplug == file_plugin_by_id(CRC_FILE_PLUGIN_ID)); ++ assert("edward-473", ++ item_plugin_by_coord(&pos->coord) == ++ item_plugin_by_id(CTAIL_ID)); ++ ++ if (!pos->sq) { ++ ret = alloc_convert_data(pos); ++ if (ret) ++ return ret; ++ } ++ clust = &pos->sq->clust; ++ ret = grab_coa(&clust->tc, cplug); ++ if (ret) ++ goto err; ++ ret = set_cluster_by_page(clust, ++ 
jnode_page(pos->child),
++				  MAX_CLUSTER_NRPAGES);
++	if (ret)
++		goto err;
++
++	assert("edward-829", pos->sq != NULL);
++	assert("edward-250", item_convert_data(pos) == NULL);
++
++	pos->sq->iplug = item_plugin_by_id(CTAIL_ID);
++
++	ret = alloc_item_convert_data(pos->sq);
++	if (ret)
++		goto err;
++	ret = init_item_convert_data(pos, inode);
++	if (ret)
++		goto err;
++	info = item_convert_data(pos);
++
++	ret = flush_cluster_pages(clust, pos->child, inode);
++	if (ret)
++		goto err;
++
++	deflate_cluster(clust, inode);
++	inc_item_convert_count(pos);
++
++	/* make a flow by the transformed stream */
++	fplug->flow_by_inode(info->inode,
++			     (const char __user *)tfm_stream_data(&clust->tc, OUTPUT_STREAM),
++			     0 /* kernel space */ ,
++			     clust->tc.len,
++			     clust_to_off(clust->index, inode),
++			     WRITE_OP, &info->flow);
++	jput(pos->child);
++
++	assert("edward-683", crc_inode_ok(inode));
++	return 0;
++      err:
++	jput(pos->child);
++	free_convert_data(pos);
++	return ret;
++}
++
++/* clear up disk cluster info */
++static void detach_convert_idata(convert_info_t * sq)
++{
++	convert_item_info_t *info;
++
++	assert("edward-253", sq != NULL);
++	assert("edward-840", sq->itm != NULL);
++
++	info = sq->itm;
++	assert("edward-255", info->inode != NULL);
++	assert("edward-1212", info->flow.length == 0);
++
++	free_item_convert_data(sq);
++	return;
++}
++
++/* plugin->u.item.f.utmost_child */
++
++/* This function sets *child to the leftmost child of the first disk
++   cluster item, if that child exists, and to NULL otherwise.
++   NOTE-EDWARD: Do not call this for RIGHT_SIDE */
++
++int utmost_child_ctail(const coord_t * coord, sideof side, jnode ** child)
++{
++	reiser4_key key;
++
++	item_key_by_coord(coord, &key);
++
++	assert("edward-257", coord != NULL);
++	assert("edward-258", child != NULL);
++	assert("edward-259", side == LEFT_SIDE);
++	assert("edward-260",
++	       item_plugin_by_coord(coord) == item_plugin_by_id(CTAIL_ID));
++
++	if (!is_disk_cluster_key(&key, coord))
++		*child = NULL;
++	else
++		*child = jlookup(current_tree,
++				 get_key_objectid(item_key_by_coord
++						  (coord, &key)),
++				 off_to_pg(get_key_offset(&key)));
++	return 0;
++}
++
++/* Returns true if @p2 is the item that follows @p1 in the _same_ disk
++   cluster. A disk cluster is a set of items. If ->clustered() != NULL,
++   the whole disk cluster should be read/modified along with each item
++*/
++static int clustered_ctail(const coord_t * p1, const coord_t * p2)
++{
++	return mergeable_ctail(p1, p2);
++}
++
++/* Go rightward and check for the next disk cluster item; set
++   d_next to DC_CHAINED_ITEM if such an item exists.
++   If the current position is the last item, go to the right neighbor.
++   Skip empty nodes. Note that right neighbors may not be in
++   the slum because of races; if so, make them dirty and
++   convertible.
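   A compact user-space model of the chaining test applied to the right
   neighbor (simplified on purpose: keys are reduced to (objectid, byte
   offset) and a disk cluster is assumed to span 2^shift bytes; locality
   and key type, which mergeable_ctail() also compares, are left out):

	#include <stdint.h>

	struct mkey { uint64_t objectid; uint64_t offset; };

	// item b chains to item a within one disk cluster iff both belong
	// to the same object, are byte-adjacent, and b does not itself
	// start a new disk cluster (its offset is not cluster-aligned)
	static int chained(struct mkey a, unsigned a_nr_units,
			   struct mkey b, int shift)
	{
		if (a.objectid != b.objectid)
			return 0;
		if (a.offset + a_nr_units != b.offset)
			return 0;
		if ((b.offset & ((1ULL << shift) - 1)) == 0)
			return 0;
		return 1;
	}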
++*/ ++static int next_item_dc_stat(flush_pos_t * pos) ++{ ++ int ret = 0; ++ int stop = 0; ++ znode *cur; ++ coord_t coord; ++ lock_handle lh; ++ lock_handle right_lock; ++ ++ assert("edward-1232", !node_is_empty(pos->coord.node)); ++ assert("edward-1014", ++ pos->coord.item_pos < coord_num_items(&pos->coord)); ++ assert("edward-1015", chaining_data_present(pos)); ++ assert("edward-1017", ++ item_convert_data(pos)->d_next == DC_INVALID_STATE); ++ ++ item_convert_data(pos)->d_next = DC_AFTER_CLUSTER; ++ ++ if (item_convert_data(pos)->d_cur == DC_AFTER_CLUSTER) ++ return ret; ++ if (pos->coord.item_pos < coord_num_items(&pos->coord) - 1) ++ return ret; ++ ++ /* check next slum item */ ++ init_lh(&right_lock); ++ cur = pos->coord.node; ++ ++ while (!stop) { ++ init_lh(&lh); ++ ret = reiser4_get_right_neighbor(&lh, ++ cur, ++ ZNODE_WRITE_LOCK, ++ GN_CAN_USE_UPPER_LEVELS); ++ if (ret) ++ break; ++ ret = zload(lh.node); ++ if (ret) { ++ done_lh(&lh); ++ break; ++ } ++ coord_init_before_first_item(&coord, lh.node); ++ ++ if (node_is_empty(lh.node)) { ++ znode_make_dirty(lh.node); ++ znode_set_convertible(lh.node); ++ stop = 0; ++ } else if (clustered_ctail(&pos->coord, &coord)) { ++ ++ item_convert_data(pos)->d_next = DC_CHAINED_ITEM; ++ ++ if (!ZF_ISSET(lh.node, JNODE_DIRTY)) { ++ /* ++ warning("edward-1024", ++ "next slum item mergeable, " ++ "but znode %p isn't dirty\n", ++ lh.node); ++ */ ++ znode_make_dirty(lh.node); ++ } ++ if (!znode_convertible(lh.node)) { ++ /* ++ warning("edward-1272", ++ "next slum item mergeable, " ++ "but znode %p isn't convertible\n", ++ lh.node); ++ */ ++ znode_set_convertible(lh.node); ++ } ++ stop = 1; ++ } else ++ stop = 1; ++ zrelse(lh.node); ++ done_lh(&right_lock); ++ copy_lh(&right_lock, &lh); ++ done_lh(&lh); ++ cur = right_lock.node; ++ } ++ done_lh(&right_lock); ++ ++ if (ret == -E_NO_NEIGHBOR) ++ ret = 0; ++ return ret; ++} ++ ++static int ++assign_convert_mode(convert_item_info_t * idata, crc_write_mode_t * mode) ++{ ++ int result = 0; ++ ++ assert("edward-1025", idata != NULL); ++ ++ if (idata->flow.length) { ++ /* append or overwrite */ ++ switch (idata->d_cur) { ++ case DC_FIRST_ITEM: ++ case DC_CHAINED_ITEM: ++ *mode = CRC_OVERWRITE_ITEM; ++ break; ++ case DC_AFTER_CLUSTER: ++ *mode = CRC_APPEND_ITEM; ++ break; ++ default: ++ impossible("edward-1018", "wrong current item state"); ++ } ++ } else { ++ /* cut or invalidate */ ++ switch (idata->d_cur) { ++ case DC_FIRST_ITEM: ++ case DC_CHAINED_ITEM: ++ *mode = CRC_CUT_ITEM; ++ break; ++ case DC_AFTER_CLUSTER: ++ result = 1; ++ break; ++ default: ++ impossible("edward-1019", "wrong current item state"); ++ } ++ } ++ return result; ++} ++ ++/* plugin->u.item.f.convert */ ++/* write ctail in guessed mode */ ++int convert_ctail(flush_pos_t * pos) ++{ ++ int result; ++ int nr_items; ++ crc_write_mode_t mode = CRC_OVERWRITE_ITEM; ++ ++ assert("edward-1020", pos != NULL); ++ assert("edward-1213", coord_num_items(&pos->coord) != 0); ++ assert("edward-1257", item_id_by_coord(&pos->coord) == CTAIL_ID); ++ assert("edward-1258", ctail_ok(&pos->coord)); ++ assert("edward-261", pos->coord.node != NULL); ++ ++ nr_items = coord_num_items(&pos->coord); ++ if (!chaining_data_present(pos)) { ++ if (should_attach_convert_idata(pos)) { ++ /* attach convert item info */ ++ struct inode *inode; ++ ++ assert("edward-264", pos->child != NULL); ++ assert("edward-265", jnode_page(pos->child) != NULL); ++ assert("edward-266", ++ jnode_page(pos->child)->mapping != NULL); ++ ++ inode = jnode_page(pos->child)->mapping->host; ++ ++ 
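++			/* A sketch of the mode selection performed by
++			 * assign_convert_mode() above, reduced to a pure
++			 * function (illustrative only, not a kernel helper):
++			 *
++			 *	static crc_write_mode_t
++			 *	pick_mode(int bytes_left, dc_item_stat cur)
++			 *	{
++			 *		if (bytes_left)
++			 *			return cur == DC_AFTER_CLUSTER ?
++			 *				CRC_APPEND_ITEM :
++			 *				CRC_OVERWRITE_ITEM;
++			 *		return CRC_CUT_ITEM;
++			 *		// for DC_AFTER_CLUSTER with no bytes
++			 *		// left the caller detaches the
++			 *		// convert info instead
++			 *	}
++			 */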
assert("edward-267", inode != NULL); ++ ++ /* attach item convert info by child and put the last one */ ++ result = attach_convert_idata(pos, inode); ++ pos->child = NULL; ++ if (result == -E_REPEAT) { ++ /* jnode became clean, or there is no dirty ++ pages (nothing to update in disk cluster) */ ++ warning("edward-1021", ++ "convert_ctail: nothing to attach"); ++ return 0; ++ } ++ if (result != 0) ++ return result; ++ } else ++ /* unconvertible */ ++ return 0; ++ } else { ++ /* use old convert info */ ++ ++ convert_item_info_t *idata; ++ ++ idata = item_convert_data(pos); ++ ++ result = assign_convert_mode(idata, &mode); ++ if (result) { ++ /* disk cluster is over, ++ nothing to update anymore */ ++ detach_convert_idata(pos->sq); ++ return 0; ++ } ++ } ++ ++ assert("edward-433", chaining_data_present(pos)); ++ assert("edward-1022", ++ pos->coord.item_pos < coord_num_items(&pos->coord)); ++ ++ result = next_item_dc_stat(pos); ++ if (result) { ++ detach_convert_idata(pos->sq); ++ return result; ++ } ++ result = do_convert_ctail(pos, mode); ++ if (result) { ++ detach_convert_idata(pos->sq); ++ return result; ++ } ++ switch (mode) { ++ case CRC_CUT_ITEM: ++ assert("edward-1214", item_convert_data(pos)->flow.length == 0); ++ assert("edward-1215", ++ coord_num_items(&pos->coord) == nr_items || ++ coord_num_items(&pos->coord) == nr_items - 1); ++ if (item_convert_data(pos)->d_next == DC_CHAINED_ITEM) ++ break; ++ if (coord_num_items(&pos->coord) != nr_items) { ++ /* the item was killed, no more chained items */ ++ detach_convert_idata(pos->sq); ++ if (!node_is_empty(pos->coord.node)) ++ /* make sure the next item will be scanned */ ++ coord_init_before_item(&pos->coord); ++ break; ++ } ++ case CRC_APPEND_ITEM: ++ assert("edward-434", item_convert_data(pos)->flow.length == 0); ++ detach_convert_idata(pos->sq); ++ break; ++ case CRC_OVERWRITE_ITEM: ++ if (coord_is_unprepped_ctail(&pos->coord)) { ++ /* convert unpprepped ctail to prepped one */ ++ int shift; ++ shift = ++ inode_cluster_shift(item_convert_data(pos)->inode); ++ assert("edward-1259", cluster_shift_ok(shift)); ++ put_unaligned((d8)shift, ++ &ctail_formatted_at(&pos->coord)-> ++ cluster_shift); ++ } ++ break; ++ } ++ return result; ++} ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +Index: linux-2.6.16/fs/reiser4/plugin/item/ctail.h +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/plugin/item/ctail.h +@@ -0,0 +1,89 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++#if !defined( __FS_REISER4_CTAIL_H__ ) ++#define __FS_REISER4_CTAIL_H__ ++ ++/* cryptcompress object item. See ctail.c for description. */ ++ ++#define UCTAIL_NR_UNITS 1 ++#define UCTAIL_SHIFT 0xff ++ ++typedef struct ctail_item_format { ++ /* cluster shift */ ++ d8 cluster_shift; ++ /* ctail body */ ++ d8 body[0]; ++} __attribute__ ((packed)) ctail_item_format; ++ ++/* The following is a set of various item states in a disk cluster. 
++ Disk cluster is a set of items whose keys belong to the interval ++ [dc_key , dc_key + disk_cluster_size - 1] */ ++typedef enum { ++ DC_INVALID_STATE = 0, ++ DC_FIRST_ITEM = 1, ++ DC_CHAINED_ITEM = 2, ++ DC_AFTER_CLUSTER = 3 ++} dc_item_stat; ++ ++typedef struct { ++ int shift; /* we keep here a cpu value of cluster_shift field ++ of ctail_item_format (see above) */ ++} ctail_coord_extension_t; ++ ++struct cut_list; ++ ++/* plugin->item.b.* */ ++int can_contain_key_ctail(const coord_t *, const reiser4_key *, ++ const reiser4_item_data *); ++int mergeable_ctail(const coord_t * p1, const coord_t * p2); ++pos_in_node_t nr_units_ctail(const coord_t * coord); ++int estimate_ctail(const coord_t * coord, const reiser4_item_data * data); ++void print_ctail(const char *prefix, coord_t * coord); ++lookup_result lookup_ctail(const reiser4_key *, lookup_bias, coord_t *); ++ ++int paste_ctail(coord_t * coord, reiser4_item_data * data, ++ carry_plugin_info * info UNUSED_ARG); ++int init_ctail(coord_t *, coord_t *, reiser4_item_data *); ++int can_shift_ctail(unsigned free_space, coord_t * coord, ++ znode * target, shift_direction pend, unsigned *size, ++ unsigned want); ++void copy_units_ctail(coord_t * target, coord_t * source, unsigned from, ++ unsigned count, shift_direction where_is_free_space, ++ unsigned free_space); ++int cut_units_ctail(coord_t * coord, pos_in_node_t from, pos_in_node_t to, ++ carry_cut_data *, reiser4_key * smallest_removed, ++ reiser4_key * new_first); ++int kill_units_ctail(coord_t * coord, pos_in_node_t from, pos_in_node_t to, ++ carry_kill_data *, reiser4_key * smallest_removed, ++ reiser4_key * new_first); ++int ctail_ok(const coord_t * coord); ++int check_ctail(const coord_t * coord, const char **error); ++ ++/* plugin->u.item.s.* */ ++int read_ctail(struct file *, flow_t *, hint_t *); ++int readpage_ctail(void *, struct page *); ++void readpages_ctail(void *, struct address_space *, struct list_head *); ++reiser4_key *append_key_ctail(const coord_t *, reiser4_key *); ++int create_hook_ctail(const coord_t * coord, void *arg); ++int kill_hook_ctail(const coord_t *, pos_in_node_t, pos_in_node_t, ++ carry_kill_data *); ++int shift_hook_ctail(const coord_t *, unsigned, unsigned, znode *); ++ ++/* plugin->u.item.f */ ++int utmost_child_ctail(const coord_t *, sideof, jnode **); ++int scan_ctail(flush_scan *); ++int convert_ctail(flush_pos_t *); ++size_t inode_scaled_cluster_size(struct inode *); ++int cluster_shift_by_coord(const coord_t * coord); ++ ++#endif /* __FS_REISER4_CTAIL_H__ */ ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +Index: linux-2.6.16/fs/reiser4/plugin/item/extent.c +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/plugin/item/extent.c +@@ -0,0 +1,197 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++#include "item.h" ++#include "../../key.h" ++#include "../../super.h" ++#include "../../carry.h" ++#include "../../inode.h" ++#include "../../page_cache.h" ++#include "../../flush.h" ++#include "../object.h" ++ ++/* prepare structure reiser4_item_data. 
It is used to put one extent unit into tree */ ++/* Audited by: green(2002.06.13) */ ++reiser4_item_data *init_new_extent(reiser4_item_data * data, void *ext_unit, ++ int nr_extents) ++{ ++ data->data = ext_unit; ++ /* data->data is kernel space */ ++ data->user = 0; ++ data->length = sizeof(reiser4_extent) * nr_extents; ++ data->arg = NULL; ++ data->iplug = item_plugin_by_id(EXTENT_POINTER_ID); ++ return data; ++} ++ ++/* how many bytes are addressed by @nr first extents of the extent item */ ++reiser4_block_nr extent_size(const coord_t * coord, pos_in_node_t nr) ++{ ++ pos_in_node_t i; ++ reiser4_block_nr blocks; ++ reiser4_extent *ext; ++ ++ ext = item_body_by_coord(coord); ++ assert("vs-263", nr <= nr_units_extent(coord)); ++ ++ blocks = 0; ++ for (i = 0; i < nr; i++, ext++) { ++ blocks += extent_get_width(ext); ++ } ++ ++ return blocks * current_blocksize; ++} ++ ++extent_state state_of_extent(reiser4_extent * ext) ++{ ++ switch ((int)extent_get_start(ext)) { ++ case 0: ++ return HOLE_EXTENT; ++ case 1: ++ return UNALLOCATED_EXTENT; ++ default: ++ break; ++ } ++ return ALLOCATED_EXTENT; ++} ++ ++int extent_is_unallocated(const coord_t * item) ++{ ++ assert("jmacd-5133", item_is_extent(item)); ++ ++ return state_of_extent(extent_by_coord(item)) == UNALLOCATED_EXTENT; ++} ++ ++/* set extent's start and width */ ++void ++set_extent(reiser4_extent * ext, reiser4_block_nr start, reiser4_block_nr width) ++{ ++ extent_set_start(ext, start); ++ extent_set_width(ext, width); ++} ++ ++ ++/** ++ * replace_extent - replace extent and paste 1 or 2 after it ++ * @un_extent: coordinate of extent to be overwritten ++ * @lh: need better comment ++ * @key: need better comment ++ * @exts_to_add: data prepared for insertion into tree ++ * @replace: need better comment ++ * @flags: need better comment ++ * @return_insert_position: need better comment ++ * ++ * Overwrites one extent, pastes 1 or 2 more ones after overwritten one. If ++ * @return_inserted_position is 1 - @un_extent and @lh are returned set to ++ * first of newly inserted units, if it is 0 - @un_extent and @lh are returned ++ * set to extent which was overwritten. ++ */ ++int replace_extent(struct replace_handle *h, int return_inserted_position) ++{ ++ int result; ++ znode *orig_znode; ++ /*ON_DEBUG(reiser4_extent orig_ext);*/ /* this is for debugging */ ++ ++ assert("vs-990", coord_is_existing_unit(h->coord)); ++ assert("vs-1375", znode_is_write_locked(h->coord->node)); ++ assert("vs-1426", extent_get_width(&h->overwrite) != 0); ++ assert("vs-1427", extent_get_width(&h->new_extents[0]) != 0); ++ assert("vs-1427", ergo(h->nr_new_extents == 2, ++ extent_get_width(&h->new_extents[1]) != 0)); ++ ++ /* compose structure for paste */ ++ init_new_extent(&h->item, &h->new_extents[0], h->nr_new_extents); ++ ++ coord_dup(&h->coord_after, h->coord); ++ init_lh(&h->lh_after); ++ copy_lh(&h->lh_after, h->lh); ++ tap_init(&h->watch, &h->coord_after, &h->lh_after, ZNODE_WRITE_LOCK); ++ tap_monitor(&h->watch); ++ ++ ON_DEBUG(h->orig_ext = *extent_by_coord(h->coord)); ++ orig_znode = h->coord->node; ++ ++#if REISER4_DEBUG ++ /* make sure that key is set properly */ ++ unit_key_by_coord(h->coord, &h->tmp); ++ set_key_offset(&h->tmp, ++ get_key_offset(&h->tmp) + ++ extent_get_width(&h->overwrite) * current_blocksize); ++ assert("vs-1080", keyeq(&h->tmp, &h->paste_key)); ++#endif ++ ++ /* set insert point after unit to be replaced */ ++ h->coord->between = AFTER_UNIT; ++ ++ result = insert_into_item(h->coord, return_inserted_position ? 
h->lh : NULL, ++ &h->paste_key, &h->item, h->flags); ++ if (!result) { ++ /* now we have to replace the unit after which new units were ++ inserted. Its position is tracked by @watch */ ++ reiser4_extent *ext; ++ znode *node; ++ ++ node = h->coord_after.node; ++ if (node != orig_znode) { ++ coord_clear_iplug(&h->coord_after); ++ result = zload(node); ++ } ++ ++ if (likely(!result)) { ++ ext = extent_by_coord(&h->coord_after); ++ ++ assert("vs-987", znode_is_loaded(node)); ++ assert("vs-988", !memcmp(ext, &h->orig_ext, sizeof(*ext))); ++ ++ /* overwrite extent unit */ ++ memcpy(ext, &h->overwrite, sizeof(reiser4_extent)); ++ znode_make_dirty(node); ++ ++ if (node != orig_znode) ++ zrelse(node); ++ ++ if (return_inserted_position == 0) { ++ /* coord and lh are to be set to overwritten ++ extent */ ++ assert("vs-1662", ++ WITH_DATA(node, !memcmp(&h->overwrite, ++ extent_by_coord( ++ &h->coord_after), ++ sizeof(reiser4_extent)))); ++ ++ *h->coord = h->coord_after; ++ done_lh(h->lh); ++ copy_lh(h->lh, &h->lh_after); ++ } else { ++ /* h->coord and h->lh are to be set to first of ++ inserted units */ ++ assert("vs-1663", ++ WITH_DATA(h->coord->node, ++ !memcmp(&h->new_extents[0], ++ extent_by_coord(h->coord), ++ sizeof(reiser4_extent)))); ++ assert("vs-1664", h->lh->node == h->coord->node); ++ } ++ } ++ } ++ tap_done(&h->watch); ++ ++ return result; ++} ++ ++lock_handle *znode_lh(znode *node) ++{ ++ assert("vs-1371", znode_is_write_locked(node)); ++ assert("vs-1372", znode_is_wlocked_once(node)); ++ return list_entry(node->lock.owners.next, lock_handle, owners_link); ++} ++ ++/* ++ * Local variables: ++ * c-indentation-style: "K&R" ++ * mode-name: "LC" ++ * c-basic-offset: 8 ++ * tab-width: 8 ++ * fill-column: 79 ++ * scroll-step: 1 ++ * End: ++ */ +Index: linux-2.6.16/fs/reiser4/plugin/item/extent.h +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/plugin/item/extent.h +@@ -0,0 +1,228 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++#ifndef __REISER4_EXTENT_H__ ++#define __REISER4_EXTENT_H__ ++ ++/* on disk extent */ ++typedef struct { ++ reiser4_dblock_nr start; ++ reiser4_dblock_nr width; ++} reiser4_extent; ++ ++typedef struct extent_stat { ++ int unallocated_units; ++ int unallocated_blocks; ++ int allocated_units; ++ int allocated_blocks; ++ int hole_units; ++ int hole_blocks; ++} extent_stat; ++ ++/* extents in an extent item can be either holes, or unallocated or allocated ++ extents */ ++typedef enum { ++ HOLE_EXTENT, ++ UNALLOCATED_EXTENT, ++ ALLOCATED_EXTENT ++} extent_state; ++ ++#define HOLE_EXTENT_START 0 ++#define UNALLOCATED_EXTENT_START 1 ++#define UNALLOCATED_EXTENT_START2 2 ++ ++typedef struct { ++ reiser4_block_nr pos_in_unit; ++ reiser4_block_nr width; /* width of current unit */ ++ pos_in_node_t nr_units; /* number of units */ ++ int ext_offset; /* offset from the beginning of zdata() */ ++ unsigned long expected_page; ++#if REISER4_DEBUG ++ reiser4_extent extent; ++#endif ++} extent_coord_extension_t; ++ ++/* macros to set/get fields of on-disk extent */ ++static inline reiser4_block_nr extent_get_start(const reiser4_extent * ext) ++{ ++ return le64_to_cpu(ext->start); ++} ++ ++static inline reiser4_block_nr extent_get_width(const reiser4_extent * ext) ++{ ++ return le64_to_cpu(ext->width); ++} ++ ++extern __u64 reiser4_current_block_count(void); ++ ++static inline void ++extent_set_start(reiser4_extent * ext, reiser4_block_nr start) ++{ ++ 
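++	/* The on-disk fields are unaligned little-endian 64-bit values
++	 * whatever the host byte order; le64_to_cpu() above and
++	 * cpu_to_le64() below implement this convention. A self-contained
++	 * user-space model of the same encoding (not the kernel helpers):
++	 *
++	 *	static void put_le64(unsigned char out[8],
++	 *			     unsigned long long v)
++	 *	{
++	 *		for (int i = 0; i < 8; i++)
++	 *			out[i] = (unsigned char)(v >> (8 * i));
++	 *	}
++	 *
++	 *	static unsigned long long get_le64(const unsigned char in[8])
++	 *	{
++	 *		unsigned long long v = 0;
++	 *		for (int i = 0; i < 8; i++)
++	 *			v |= (unsigned long long)in[i] << (8 * i);
++	 *		return v;
++	 *	}
++	 *
++	 * get_le64() inverts put_le64() on any host, which is exactly the
++	 * property the disk format needs.
++	 */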
cassert(sizeof(ext->start) == 8); ++ assert("nikita-2510", ++ ergo(start > 1, start < reiser4_current_block_count())); ++ put_unaligned(cpu_to_le64(start), &ext->start); ++} ++ ++static inline void ++extent_set_width(reiser4_extent * ext, reiser4_block_nr width) ++{ ++ cassert(sizeof(ext->width) == 8); ++ assert("", width > 0); ++ put_unaligned(cpu_to_le64(width), &ext->width); ++ assert("nikita-2511", ++ ergo(extent_get_start(ext) > 1, ++ extent_get_start(ext) + width <= ++ reiser4_current_block_count())); ++} ++ ++#define extent_item(coord) \ ++({ \ ++ assert("nikita-3143", item_is_extent(coord)); \ ++ ((reiser4_extent *)item_body_by_coord (coord)); \ ++}) ++ ++#define extent_by_coord(coord) \ ++({ \ ++ assert("nikita-3144", item_is_extent(coord)); \ ++ (extent_item (coord) + (coord)->unit_pos); \ ++}) ++ ++#define width_by_coord(coord) \ ++({ \ ++ assert("nikita-3145", item_is_extent(coord)); \ ++ extent_get_width (extent_by_coord(coord)); \ ++}) ++ ++struct carry_cut_data; ++struct carry_kill_data; ++ ++/* plugin->u.item.b.* */ ++reiser4_key *max_key_inside_extent(const coord_t *, reiser4_key *); ++int can_contain_key_extent(const coord_t * coord, const reiser4_key * key, ++ const reiser4_item_data *); ++int mergeable_extent(const coord_t * p1, const coord_t * p2); ++pos_in_node_t nr_units_extent(const coord_t *); ++lookup_result lookup_extent(const reiser4_key *, lookup_bias, coord_t *); ++void init_coord_extent(coord_t *); ++int init_extent(coord_t *, reiser4_item_data *); ++int paste_extent(coord_t *, reiser4_item_data *, carry_plugin_info *); ++int can_shift_extent(unsigned free_space, ++ coord_t * source, znode * target, shift_direction, ++ unsigned *size, unsigned want); ++void copy_units_extent(coord_t * target, coord_t * source, unsigned from, ++ unsigned count, shift_direction where_is_free_space, ++ unsigned free_space); ++int kill_hook_extent(const coord_t *, pos_in_node_t from, pos_in_node_t count, ++ struct carry_kill_data *); ++int create_hook_extent(const coord_t * coord, void *arg); ++int cut_units_extent(coord_t * coord, pos_in_node_t from, pos_in_node_t to, ++ struct carry_cut_data *, reiser4_key * smallest_removed, ++ reiser4_key * new_first); ++int kill_units_extent(coord_t * coord, pos_in_node_t from, pos_in_node_t to, ++ struct carry_kill_data *, reiser4_key * smallest_removed, ++ reiser4_key * new_first); ++reiser4_key *unit_key_extent(const coord_t *, reiser4_key *); ++reiser4_key *max_unit_key_extent(const coord_t *, reiser4_key *); ++void print_extent(const char *, coord_t *); ++int utmost_child_extent(const coord_t * coord, sideof side, jnode ** child); ++int utmost_child_real_block_extent(const coord_t * coord, sideof side, ++ reiser4_block_nr * block); ++void item_stat_extent(const coord_t * coord, void *vp); ++int check_extent(const coord_t * coord, const char **error); ++ ++/* plugin->u.item.s.file.* */ ++ssize_t write_extent(struct file *, const char __user *, size_t, loff_t *); ++int read_extent(struct file *, flow_t *, hint_t *); ++int readpage_extent(void *, struct page *); ++void readpages_extent(void *, struct address_space *, struct list_head *pages); ++reiser4_key *append_key_extent(const coord_t *, reiser4_key *); ++void init_coord_extension_extent(uf_coord_t *, loff_t offset); ++int get_block_address_extent(const coord_t *, sector_t block, ++ sector_t * result); ++ ++/* these are used in flush.c ++ FIXME-VS: should they be somewhere in item_plugin? 
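   As a toy model of what flush-time allocation does to a single unit
   (hypothetical names; the real work happens in the functions declared
   just below): an unallocated unit carries only a width under the
   reserved start value 1, and allocation binds it to a real disk
   location while the width stays the same:

	struct unit_model { unsigned long long start, width; };

	// bind an unallocated unit to real blocks beginning at real_start
	static void allocate_unit(struct unit_model *u,
				  unsigned long long real_start)
	{
		if (u->start == 1)	// UNALLOCATED_EXTENT_START
			u->start = real_start;
	}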
*/ ++int allocate_extent_item_in_place(coord_t *, lock_handle *, flush_pos_t * pos); ++int allocate_and_copy_extent(znode * left, coord_t * right, flush_pos_t * pos, ++ reiser4_key * stop_key); ++ ++int extent_is_unallocated(const coord_t * item); /* True if this extent is unallocated (i.e., not a hole, not allocated). */ ++__u64 extent_unit_index(const coord_t * item); /* Block offset of this unit. */ ++__u64 extent_unit_width(const coord_t * item); /* Number of blocks in this unit. */ ++ ++/* plugin->u.item.f. */ ++int scan_extent(flush_scan * scan); ++extern int key_by_offset_extent(struct inode *, loff_t, reiser4_key *); ++ ++reiser4_item_data *init_new_extent(reiser4_item_data * data, void *ext_unit, ++ int nr_extents); ++reiser4_block_nr extent_size(const coord_t * coord, pos_in_node_t nr); ++extent_state state_of_extent(reiser4_extent * ext); ++void set_extent(reiser4_extent *, reiser4_block_nr start, ++ reiser4_block_nr width); ++int update_extent(struct inode *, jnode *, loff_t pos, int *plugged_hole); ++ ++#include "../../coord.h" ++#include "../../lock.h" ++#include "../../tap.h" ++ ++struct replace_handle { ++ /* these are to be set before calling replace_extent */ ++ coord_t *coord; ++ lock_handle *lh; ++ reiser4_key key; ++ reiser4_key *pkey; ++ reiser4_extent overwrite; ++ reiser4_extent new_extents[2]; ++ int nr_new_extents; ++ unsigned flags; ++ ++ /* these are used by replace_extent */ ++ reiser4_item_data item; ++ coord_t coord_after; ++ lock_handle lh_after; ++ tap_t watch; ++ reiser4_key paste_key; ++#if REISER4_DEBUG ++ reiser4_extent orig_ext; ++ reiser4_key tmp; ++#endif ++}; ++ ++/* this structure is kmalloced before calling make_extent to avoid excessive ++ stack consumption on plug_hole->replace_extent */ ++struct make_extent_handle { ++ uf_coord_t *uf_coord; ++ reiser4_block_nr blocknr; ++ int created; ++ struct inode *inode; ++ union { ++ struct { ++ } append; ++ struct replace_handle replace; ++ } u; ++}; ++ ++int replace_extent(struct replace_handle *, int return_inserted_position); ++lock_handle *znode_lh(znode *); ++ ++/* the reiser4 repacker support */ ++struct repacker_cursor; ++extern int process_extent_backward_for_repacking(tap_t *, ++ struct repacker_cursor *); ++extern int mark_extent_for_repacking(tap_t *, int); ++ ++#define coord_by_uf_coord(uf_coord) (&((uf_coord)->coord)) ++#define ext_coord_by_uf_coord(uf_coord) (&((uf_coord)->extension.extent)) ++ ++/* __REISER4_EXTENT_H__ */ ++#endif ++/* ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +Index: linux-2.6.16/fs/reiser4/plugin/item/extent_file_ops.c +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/plugin/item/extent_file_ops.c +@@ -0,0 +1,1712 @@ ++/* COPYRIGHT 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++#include "item.h" ++#include "../../inode.h" ++#include "../../page_cache.h" ++#include "../object.h" ++ ++#include ++#include ++#include "../../../../mm/filemap.h" ++ ++ ++static inline reiser4_extent *ext_by_offset(const znode *node, int offset) ++{ ++ reiser4_extent *ext; ++ ++ ext = (reiser4_extent *) (zdata(node) + offset); ++ return ext; ++} ++ ++/** ++ * check_uf_coord - verify coord extension ++ * @uf_coord: ++ * @key: ++ * ++ * Makes sure that all fields of @uf_coord are set properly. If @key is ++ * specified - check whether @uf_coord is set correspondingly. 
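   A reduced model of the invariant verified here (hypothetical field
   names; the real check below additionally compares the raw extent bytes
   and, optionally, the lookup key): the cached coord extension must
   agree with what recomputing from the item itself would give.

	struct ext_cache_model {
		unsigned nr_units;		// cached unit count
		unsigned long long width;	// width of the current unit
		unsigned long long pos_in_unit;
	};

	static int cache_ok(const struct ext_cache_model *c,
			    unsigned real_nr_units,
			    unsigned long long real_width)
	{
		return c->nr_units == real_nr_units &&
		       c->width == real_width &&
		       c->pos_in_unit < c->width;
	}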
++ */ ++static void check_uf_coord(const uf_coord_t *uf_coord, const reiser4_key *key) ++{ ++#if REISER4_DEBUG ++ const coord_t *coord; ++ const extent_coord_extension_t *ext_coord; ++ reiser4_extent *ext; ++ ++ coord = &uf_coord->coord; ++ ext_coord = &uf_coord->extension.extent; ++ ext = ext_by_offset(coord->node, uf_coord->extension.extent.ext_offset); ++ ++ assert("", ++ WITH_DATA(coord->node, ++ (uf_coord->valid == 1 && ++ coord_is_iplug_set(coord) && ++ item_is_extent(coord) && ++ ext_coord->nr_units == nr_units_extent(coord) && ++ ext == extent_by_coord(coord) && ++ ext_coord->width == extent_get_width(ext) && ++ coord->unit_pos < ext_coord->nr_units && ++ ext_coord->pos_in_unit < ext_coord->width && ++ memcmp(ext, &ext_coord->extent, ++ sizeof(reiser4_extent)) == 0))); ++ if (key) { ++ reiser4_key coord_key; ++ ++ unit_key_by_coord(&uf_coord->coord, &coord_key); ++ set_key_offset(&coord_key, ++ get_key_offset(&coord_key) + ++ (uf_coord->extension.extent. ++ pos_in_unit << PAGE_CACHE_SHIFT)); ++ assert("", keyeq(key, &coord_key)); ++ } ++#endif ++} ++ ++static inline reiser4_extent *ext_by_ext_coord(const uf_coord_t *uf_coord) ++{ ++ check_uf_coord(uf_coord, NULL); ++ ++ return ext_by_offset(uf_coord->coord.node, ++ uf_coord->extension.extent.ext_offset); ++} ++ ++#if REISER4_DEBUG ++ ++/** ++ * offset_is_in_unit ++ * ++ * ++ * ++ */ ++/* return 1 if offset @off is inside of extent unit pointed to by @coord. Set ++ pos_in_unit inside of unit correspondingly */ ++static int offset_is_in_unit(const coord_t *coord, loff_t off) ++{ ++ reiser4_key unit_key; ++ __u64 unit_off; ++ reiser4_extent *ext; ++ ++ ext = extent_by_coord(coord); ++ ++ unit_key_extent(coord, &unit_key); ++ unit_off = get_key_offset(&unit_key); ++ if (off < unit_off) ++ return 0; ++ if (off >= (unit_off + (current_blocksize * extent_get_width(ext)))) ++ return 0; ++ return 1; ++} ++ ++static int ++coord_matches_key_extent(const coord_t * coord, const reiser4_key * key) ++{ ++ reiser4_key item_key; ++ ++ assert("vs-771", coord_is_existing_unit(coord)); ++ assert("vs-1258", keylt(key, append_key_extent(coord, &item_key))); ++ assert("vs-1259", keyge(key, item_key_by_coord(coord, &item_key))); ++ ++ return offset_is_in_unit(coord, get_key_offset(key)); ++} ++ ++#endif ++ ++/** ++ * can_append - ++ * @key: ++ * @coord: ++ * ++ * Returns 1 if @key is equal to an append key of item @coord is set to ++ */ ++static int can_append(const reiser4_key *key, const coord_t *coord) ++{ ++ reiser4_key append_key; ++ ++ return keyeq(key, append_key_extent(coord, &append_key)); ++} ++ ++/** ++ * append_hole ++ * @coord: ++ * @lh: ++ * @key: ++ * ++ */ ++static int append_hole(coord_t *coord, lock_handle *lh, ++ const reiser4_key *key) ++{ ++ reiser4_key append_key; ++ reiser4_block_nr hole_width; ++ reiser4_extent *ext, new_ext; ++ reiser4_item_data idata; ++ ++ /* last item of file may have to be appended with hole */ ++ assert("vs-708", znode_get_level(coord->node) == TWIG_LEVEL); ++ assert("vs-714", item_id_by_coord(coord) == EXTENT_POINTER_ID); ++ ++ /* key of first byte which is not addressed by this extent */ ++ append_key_extent(coord, &append_key); ++ ++ assert("", keyle(&append_key, key)); ++ ++ /* ++ * extent item has to be appended with hole. 
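   (The width is the byte gap between the current end of the item and
   @key, rounded up to whole blocks; a stand-alone model of that ceiling
   division, assuming 4 KiB blocks, i.e. current_blocksize_bits == 12:

	// e.g. a gap of 1 byte yields 1 block, a gap of 8192 bytes 2 blocks
	static unsigned long long bytes_to_blocks_ceil(unsigned long long n,
						       int blkbits)
	{
		return (n + (1ULL << blkbits) - 1) >> blkbits;
	}
   )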
Calculate length of that ++ * hole ++ */ ++ hole_width = ((get_key_offset(key) - get_key_offset(&append_key) + ++ current_blocksize - 1) >> current_blocksize_bits); ++ assert("vs-954", hole_width > 0); ++ ++ /* set coord after last unit */ ++ coord_init_after_item_end(coord); ++ ++ /* get last extent in the item */ ++ ext = extent_by_coord(coord); ++ if (state_of_extent(ext) == HOLE_EXTENT) { ++ /* ++ * last extent of a file is hole extent. Widen that extent by ++ * @hole_width blocks. Note that we do not worry about ++ * overflowing - extent width is 64 bits ++ */ ++ set_extent(ext, HOLE_EXTENT_START, ++ extent_get_width(ext) + hole_width); ++ znode_make_dirty(coord->node); ++ return 0; ++ } ++ ++ /* append last item of the file with hole extent unit */ ++ assert("vs-713", (state_of_extent(ext) == ALLOCATED_EXTENT || ++ state_of_extent(ext) == UNALLOCATED_EXTENT)); ++ ++ set_extent(&new_ext, HOLE_EXTENT_START, hole_width); ++ init_new_extent(&idata, &new_ext, 1); ++ return insert_into_item(coord, lh, &append_key, &idata, 0); ++} ++ ++/** ++ * check_jnodes ++ * @twig: longterm locked twig node ++ * @key: ++ * ++ */ ++static void check_jnodes(znode *twig, const reiser4_key *key, int count) ++{ ++#if REISER4_DEBUG ++ coord_t c; ++ reiser4_key node_key, jnode_key; ++ ++ jnode_key = *key; ++ ++ assert("", twig != NULL); ++ assert("", znode_get_level(twig) == TWIG_LEVEL); ++ assert("", znode_is_write_locked(twig)); ++ ++ zload(twig); ++ /* get the smallest key in twig node */ ++ coord_init_first_unit(&c, twig); ++ unit_key_by_coord(&c, &node_key); ++ assert("", keyle(&node_key, &jnode_key)); ++ ++ coord_init_last_unit(&c, twig); ++ unit_key_by_coord(&c, &node_key); ++ if (item_plugin_by_coord(&c)->s.file.append_key) ++ item_plugin_by_coord(&c)->s.file.append_key(&c, &node_key); ++ set_key_offset(&jnode_key, ++ get_key_offset(&jnode_key) + (loff_t)count * PAGE_CACHE_SIZE - 1); ++ assert("", keylt(&jnode_key, &node_key)); ++ zrelse(twig); ++#endif ++} ++ ++/** ++ * append_last_extent - append last file item ++ * @uf_coord: coord to start insertion from ++ * @jnodes: array of jnodes ++ * @count: number of jnodes in the array ++ * ++ * There is already at least one extent item of file @inode in the tree. Append ++ * the last of them with unallocated extent unit of width @count. Assign ++ * fake block numbers to jnodes corresponding to the inserted extent. 
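   A minimal model of the merge-or-append decision made on the last unit
   (illustrative types, not the kernel ones):

	enum estate_model { HOLE_M, UNALLOC_M, ALLOC_M };
	struct last_unit { enum estate_model state; unsigned long width; };

	// widen an unallocated tail unit in place; otherwise the caller
	// must insert a fresh unallocated unit of width count
	static int try_widen(struct last_unit *u, unsigned long count)
	{
		if (u->state == UNALLOC_M) {
			u->width += count;
			return 1;	// merged
		}
		return 0;		// a new unit is needed
	}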
++ */ ++static int append_last_extent(uf_coord_t *uf_coord, const reiser4_key *key, ++ jnode **jnodes, int count) ++{ ++ int result; ++ reiser4_extent new_ext; ++ reiser4_item_data idata; ++ coord_t *coord; ++ extent_coord_extension_t *ext_coord; ++ reiser4_extent *ext; ++ reiser4_block_nr block; ++ jnode *node; ++ int i; ++ ++ coord = &uf_coord->coord; ++ ext_coord = &uf_coord->extension.extent; ++ ext = ext_by_ext_coord(uf_coord); ++ ++ /* check correctness of position in the item */ ++ assert("vs-228", coord->unit_pos == coord_last_unit_pos(coord)); ++ assert("vs-1311", coord->between == AFTER_UNIT); ++ assert("vs-1302", ext_coord->pos_in_unit == ext_coord->width - 1); ++ ++ if (!can_append(key, coord)) { ++ /* hole extent has to be inserted */ ++ result = append_hole(coord, uf_coord->lh, key); ++ uf_coord->valid = 0; ++ return result; ++ } ++ ++ if (count == 0) ++ return 0; ++ ++ assert("", get_key_offset(key) == (loff_t)index_jnode(jnodes[0]) * PAGE_CACHE_SIZE); ++ ++ result = DQUOT_ALLOC_BLOCK_NODIRTY(mapping_jnode(jnodes[0])->host, ++ count); ++ BUG_ON(result != 0); ++ ++ switch (state_of_extent(ext)) { ++ case UNALLOCATED_EXTENT: ++ /* ++ * last extent unit of the file is unallocated one. Increase ++ * its width by @count ++ */ ++ set_extent(ext, UNALLOCATED_EXTENT_START, ++ extent_get_width(ext) + count); ++ znode_make_dirty(coord->node); ++ ++ /* update coord extension */ ++ ext_coord->width += count; ++ ON_DEBUG(extent_set_width ++ (&uf_coord->extension.extent.extent, ++ ext_coord->width)); ++ break; ++ ++ case HOLE_EXTENT: ++ case ALLOCATED_EXTENT: ++ /* ++ * last extent unit of the file is either hole or allocated ++ * one. Append one unallocated extent of width @count ++ */ ++ set_extent(&new_ext, UNALLOCATED_EXTENT_START, count); ++ init_new_extent(&idata, &new_ext, 1); ++ result = insert_into_item(coord, uf_coord->lh, key, &idata, 0); ++ uf_coord->valid = 0; ++ if (result) ++ return result; ++ break; ++ ++ default: ++ return RETERR(-EIO); ++ } ++ ++ /* ++ * make sure that we hold long term locked twig node containing all ++ * jnodes we are about to capture ++ */ ++ check_jnodes(uf_coord->lh->node, key, count); ++ ++ /* ++ * assign fake block numbers to all jnodes. 
FIXME: make sure whether ++ * twig node containing inserted extent item is locked ++ */ ++ block = fake_blocknr_unformatted(count); ++ for (i = 0; i < count; i ++, block ++) { ++ node = jnodes[i]; ++ spin_lock_jnode(node); ++ JF_SET(node, JNODE_CREATED); ++ jnode_set_block(node, &block); ++ result = try_capture(node, ZNODE_WRITE_LOCK, 0); ++ BUG_ON(result != 0); ++ jnode_make_dirty_locked(node); ++ spin_unlock_jnode(node); ++ } ++ return count; ++} ++ ++/** ++ * insert_first_hole - insert hole extent into tree ++ * @coord: ++ * @lh: ++ * @key: ++ * ++ * ++ */ ++static int insert_first_hole(coord_t *coord, lock_handle *lh, ++ const reiser4_key *key) ++{ ++ reiser4_extent new_ext; ++ reiser4_item_data idata; ++ reiser4_key item_key; ++ reiser4_block_nr hole_width; ++ ++ /* @coord must be set for inserting of new item */ ++ assert("vs-711", coord_is_between_items(coord)); ++ ++ item_key = *key; ++ set_key_offset(&item_key, 0ull); ++ ++ hole_width = ((get_key_offset(key) + current_blocksize - 1) >> ++ current_blocksize_bits); ++ assert("vs-710", hole_width > 0); ++ ++ /* compose body of hole extent and insert item into tree */ ++ set_extent(&new_ext, HOLE_EXTENT_START, hole_width); ++ init_new_extent(&idata, &new_ext, 1); ++ return insert_extent_by_coord(coord, &idata, &item_key, lh); ++} ++ ++ ++/** ++ * insert_first_extent - insert first file item ++ * @inode: inode of file ++ * @uf_coord: coord to start insertion from ++ * @jnodes: array of jnodes ++ * @count: number of jnodes in the array ++ * ++ * There are no items of file @inode in the tree yet. Insert unallocated extent ++ * of width @count into the tree, or a hole extent if the write does not start ++ * at the beginning of the file. Assign fake block numbers to jnodes ++ * corresponding to the inserted unallocated extent. Returns number of jnodes ++ * or error code.
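++ * ++ * (Editor's illustration, assuming a 4096-byte blocksize: a first write at ++ * offset 10000 into an empty file goes through insert_first_hole(), which ++ * inserts a hole extent of (10000 + 4095) >> 12 = 3 blocks starting at file ++ * offset 0; the coordinate is then invalidated and the write path searches ++ * the tree again.)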
++ */ ++static int insert_first_extent(uf_coord_t *uf_coord, const reiser4_key *key, ++ jnode **jnodes, int count, ++ struct inode *inode) ++{ ++ int result; ++ int i; ++ reiser4_extent new_ext; ++ reiser4_item_data idata; ++ reiser4_block_nr block; ++ unix_file_info_t *uf_info; ++ jnode *node; ++ ++ /* first extent insertion starts at leaf level */ ++ assert("vs-719", znode_get_level(uf_coord->coord.node) == LEAF_LEVEL); ++ assert("vs-711", coord_is_between_items(&uf_coord->coord)); ++ ++ if (get_key_offset(key) != 0) { ++ result = insert_first_hole(&uf_coord->coord, uf_coord->lh, key); ++ uf_coord->valid = 0; ++ uf_info = unix_file_inode_data(inode); ++ ++ /* ++ * first item insertion is only possible when writing to empty ++ * file or performing tail conversion ++ */ ++ assert("", (uf_info->container == UF_CONTAINER_EMPTY || ++ (inode_get_flag(inode, REISER4_PART_MIXED) && ++ inode_get_flag(inode, REISER4_PART_IN_CONV)))); ++ ++ /* if file was empty - update its state */ ++ if (result == 0 && uf_info->container == UF_CONTAINER_EMPTY) ++ uf_info->container = UF_CONTAINER_EXTENTS; ++ return result; ++ } ++ ++ if (count == 0) ++ return 0; ++ ++ result = DQUOT_ALLOC_BLOCK_NODIRTY(mapping_jnode(jnodes[0])->host, count); ++ BUG_ON(result != 0); ++ ++ /* ++ * prepare for tree modification: compose body of item and item data ++ * structure needed for insertion ++ */ ++ set_extent(&new_ext, UNALLOCATED_EXTENT_START, count); ++ init_new_extent(&idata, &new_ext, 1); ++ ++ /* insert extent item into the tree */ ++ result = insert_extent_by_coord(&uf_coord->coord, &idata, key, ++ uf_coord->lh); ++ if (result) ++ return result; ++ ++ /* ++ * make sure that we hold long term locked twig node containing all ++ * jnodes we are about to capture ++ */ ++ check_jnodes(uf_coord->lh->node, key, count); ++ /* ++ * assign fake block numbers to all jnodes, capture and mark them dirty ++ */ ++ block = fake_blocknr_unformatted(count); ++ for (i = 0; i < count; i ++, block ++) { ++ node = jnodes[i]; ++ spin_lock_jnode(node); ++ JF_SET(node, JNODE_CREATED); ++ jnode_set_block(node, &block); ++ result = try_capture(node, ZNODE_WRITE_LOCK, 0); ++ BUG_ON(result != 0); ++ jnode_make_dirty_locked(node); ++ spin_unlock_jnode(node); ++ } ++ ++ /* ++ * invalidate coordinate, research must be performed to continue ++ * because write will continue on twig level ++ */ ++ uf_coord->valid = 0; ++ return count; ++} ++ ++/** ++ * plug_hole - replace hole extent with unallocated and holes ++ * @uf_coord: ++ * @key: ++ * @node: ++ * @h: structure containing coordinate, lock handle, key, etc ++ * ++ * Creates an unallocated extent of width 1 within a hole. In worst case two ++ * additional extents can be created. 
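++ * ++ * (Editor's note, mapping the cases handled below: a hole of width 1 is ++ * converted to an unallocated extent in place; a block at the left or right ++ * edge of a wider hole is either merged into an adjacent unallocated unit ++ * or split off as its own unit; a block in the middle splits the hole into ++ * three units: hole, unallocated extent of width 1, hole. The @how output ++ * records which of these cases was taken.)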
++ */ ++static int plug_hole(uf_coord_t *uf_coord, const reiser4_key *key, int *how) ++{ ++ struct replace_handle rh; ++ reiser4_extent *ext; ++ reiser4_block_nr width, pos_in_unit; ++ coord_t *coord; ++ extent_coord_extension_t *ext_coord; ++ int return_inserted_position; ++ ++ check_uf_coord(uf_coord, key); ++ ++ rh.coord = coord_by_uf_coord(uf_coord); ++ rh.lh = uf_coord->lh; ++ rh.flags = 0; ++ ++ coord = coord_by_uf_coord(uf_coord); ++ ext_coord = ext_coord_by_uf_coord(uf_coord); ++ ext = ext_by_ext_coord(uf_coord); ++ ++ width = ext_coord->width; ++ pos_in_unit = ext_coord->pos_in_unit; ++ ++ *how = 0; ++ if (width == 1) { ++ set_extent(ext, UNALLOCATED_EXTENT_START, 1); ++ znode_make_dirty(coord->node); ++ /* update uf_coord */ ++ ON_DEBUG(ext_coord->extent = *ext); ++ *how = 1; ++ return 0; ++ } else if (pos_in_unit == 0) { ++ /* we deal with first element of extent */ ++ if (coord->unit_pos) { ++ /* there is an extent to the left */ ++ if (state_of_extent(ext - 1) == UNALLOCATED_EXTENT) { ++ /* ++ * left neighboring unit is an unallocated ++ * extent. Increase its width and decrease ++ * width of hole ++ */ ++ extent_set_width(ext - 1, ++ extent_get_width(ext - 1) + 1); ++ extent_set_width(ext, width - 1); ++ znode_make_dirty(coord->node); ++ ++ /* update coord extension */ ++ coord->unit_pos--; ++ ext_coord->width = extent_get_width(ext - 1); ++ ext_coord->pos_in_unit = ext_coord->width - 1; ++ ext_coord->ext_offset -= sizeof(reiser4_extent); ++ ON_DEBUG(ext_coord->extent = ++ *extent_by_coord(coord)); ++ *how = 2; ++ return 0; ++ } ++ } ++ /* extent for replace */ ++ set_extent(&rh.overwrite, UNALLOCATED_EXTENT_START, 1); ++ /* extent to be inserted */ ++ set_extent(&rh.new_extents[0], HOLE_EXTENT_START, width - 1); ++ rh.nr_new_extents = 1; ++ ++ /* have replace_extent to return with @coord and @uf_coord->lh ++ set to unit which was replaced */ ++ return_inserted_position = 0; ++ *how = 3; ++ } else if (pos_in_unit == width - 1) { ++ /* we deal with last element of extent */ ++ if (coord->unit_pos < nr_units_extent(coord) - 1) { ++ /* there is an extent unit to the right */ ++ if (state_of_extent(ext + 1) == UNALLOCATED_EXTENT) { ++ /* ++ * right neighboring unit is an unallocated ++ * extent. 
Increase its width and decrease ++ * width of hole ++ */ ++ extent_set_width(ext + 1, ++ extent_get_width(ext + 1) + 1); ++ extent_set_width(ext, width - 1); ++ znode_make_dirty(coord->node); ++ ++ /* update coord extension */ ++ coord->unit_pos++; ++ ext_coord->width = extent_get_width(ext + 1); ++ ext_coord->pos_in_unit = 0; ++ ext_coord->ext_offset += sizeof(reiser4_extent); ++ ON_DEBUG(ext_coord->extent = ++ *extent_by_coord(coord)); ++ *how = 4; ++ return 0; ++ } ++ } ++ /* extent for replace */ ++ set_extent(&rh.overwrite, HOLE_EXTENT_START, width - 1); ++ /* extent to be inserted */ ++ set_extent(&rh.new_extents[0], UNALLOCATED_EXTENT_START, 1); ++ rh.nr_new_extents = 1; ++ ++ /* have replace_extent to return with @coord and @uf_coord->lh ++ set to unit which was inserted */ ++ return_inserted_position = 1; ++ *how = 5; ++ } else { ++ /* extent for replace */ ++ set_extent(&rh.overwrite, HOLE_EXTENT_START, pos_in_unit); ++ /* extents to be inserted */ ++ set_extent(&rh.new_extents[0], UNALLOCATED_EXTENT_START, 1); ++ set_extent(&rh.new_extents[1], HOLE_EXTENT_START, ++ width - pos_in_unit - 1); ++ rh.nr_new_extents = 2; ++ ++ /* have replace_extent to return with @coord and @uf_coord->lh ++ set to first of units which were inserted */ ++ return_inserted_position = 1; ++ *how = 6; ++ } ++ unit_key_by_coord(coord, &rh.paste_key); ++ set_key_offset(&rh.paste_key, get_key_offset(&rh.paste_key) + ++ extent_get_width(&rh.overwrite) * current_blocksize); ++ ++ uf_coord->valid = 0; ++ return replace_extent(&rh, return_inserted_position); ++} ++ ++/** ++ * overwrite_one_block - ++ * @uf_coord: ++ * @key: ++ * @node: ++ * ++ * If @node corresponds to hole extent - create unallocated extent for it and ++ * assign fake block number. If @node corresponds to allocated extent - assign ++ * block number of jnode ++ */ ++static int overwrite_one_block(uf_coord_t *uf_coord, const reiser4_key *key, ++ jnode *node, int *hole_plugged) ++{ ++ int result; ++ extent_coord_extension_t *ext_coord; ++ reiser4_extent *ext; ++ reiser4_block_nr block; ++ int how; ++ ++ assert("vs-1312", uf_coord->coord.between == AT_UNIT); ++ ++ result = 0; ++ ext_coord = ext_coord_by_uf_coord(uf_coord); ++ ext = ext_by_ext_coord(uf_coord); ++ assert("", state_of_extent(ext) != UNALLOCATED_EXTENT); ++ ++ switch (state_of_extent(ext)) { ++ case ALLOCATED_EXTENT: ++ block = extent_get_start(ext) + ext_coord->pos_in_unit; ++ break; ++ ++ case HOLE_EXTENT: ++ result = DQUOT_ALLOC_BLOCK_NODIRTY(mapping_jnode(node)->host, 1); ++ BUG_ON(result != 0); ++ result = plug_hole(uf_coord, key, &how); ++ if (result) ++ return result; ++ block = fake_blocknr_unformatted(1); ++ if (hole_plugged) ++ *hole_plugged = 1; ++ JF_SET(node, JNODE_CREATED); ++ break; ++ ++ default: ++ return RETERR(-EIO); ++ } ++ ++ jnode_set_block(node, &block); ++ return 0; ++} ++ ++/** ++ * move_coord - move coordinate forward ++ * @uf_coord: ++ * ++ * Move coordinate one data block pointer forward. Return 1 if coord is set to ++ * the last one already or is invalid. ++ */ ++static int move_coord(uf_coord_t *uf_coord) ++{ ++ extent_coord_extension_t *ext_coord; ++ ++ if (uf_coord->valid == 0) ++ return 1; ++ ext_coord = &uf_coord->extension.extent; ++ ext_coord->pos_in_unit ++; ++ if (ext_coord->pos_in_unit < ext_coord->width) ++ /* coordinate moved within the unit */ ++ return 0; ++ ++ /* end of unit is reached. 
Try to move to next unit */ ++ ext_coord->pos_in_unit = 0; ++ uf_coord->coord.unit_pos ++; ++ if (uf_coord->coord.unit_pos < ext_coord->nr_units) { ++ /* coordinate moved to next unit */ ++ ext_coord->ext_offset += sizeof(reiser4_extent); ++ ext_coord->width = ++ extent_get_width(ext_by_offset ++ (uf_coord->coord.node, ++ ext_coord->ext_offset)); ++ ON_DEBUG(ext_coord->extent = ++ *ext_by_offset(uf_coord->coord.node, ++ ext_coord->ext_offset)); ++ return 0; ++ } ++ /* end of item is reached */ ++ uf_coord->valid = 0; ++ return 1; ++} ++ ++/** ++ * overwrite_extent - ++ * @inode: ++ * ++ * Returns number of handled jnodes. ++ */ ++static int overwrite_extent(uf_coord_t *uf_coord, const reiser4_key *key, ++ jnode **jnodes, int count, int *plugged_hole) ++{ ++ int result; ++ reiser4_key k; ++ int i; ++ jnode *node; ++ ++ k = *key; ++ for (i = 0; i < count; i ++) { ++ node = jnodes[i]; ++ if (*jnode_get_block(node) == 0) { ++ result = overwrite_one_block(uf_coord, &k, node, plugged_hole); ++ if (result) ++ return result; ++ } ++ /* ++ * make sure that we hold long term locked twig node containing ++ * all jnodes we are about to capture ++ */ ++ check_jnodes(uf_coord->lh->node, &k, 1); ++ /* ++ * assign fake block numbers to all jnodes, capture and mark ++ * them dirty ++ */ ++ spin_lock_jnode(node); ++ result = try_capture(node, ZNODE_WRITE_LOCK, 0); ++ BUG_ON(result != 0); ++ jnode_make_dirty_locked(node); ++ spin_unlock_jnode(node); ++ ++ if (uf_coord->valid == 0) ++ return i + 1; ++ ++ check_uf_coord(uf_coord, &k); ++ ++ if (move_coord(uf_coord)) { ++ /* ++ * failed to move to the next node pointer. Either end ++ * of file or end of twig node is reached. In the later ++ * case we might go to the right neighbor. ++ */ ++ uf_coord->valid = 0; ++ return i + 1; ++ } ++ set_key_offset(&k, get_key_offset(&k) + PAGE_CACHE_SIZE); ++ } ++ ++ return count; ++} ++ ++void init_uf_coord(uf_coord_t *uf_coord, lock_handle *lh); ++ ++/** ++ * update_extent ++ * @file: ++ * @jnodes: ++ * @count: ++ * @off: ++ * ++ */ ++int update_extent(struct inode *inode, jnode *node, loff_t pos, ++ int *plugged_hole) ++{ ++ int result; ++ znode *loaded; ++ uf_coord_t uf_coord; ++ coord_t *coord; ++ lock_handle lh; ++ reiser4_key key; ++ ++ assert("", lock_counters()->d_refs == 0); ++ ++ key_by_inode_and_offset_common(inode, pos, &key); ++ ++ init_uf_coord(&uf_coord, &lh); ++ coord = &uf_coord.coord; ++ result = find_file_item_nohint(coord, &lh, &key, ++ ZNODE_WRITE_LOCK, inode); ++ if (IS_CBKERR(result)) { ++ assert("", lock_counters()->d_refs == 0); ++ return result; ++ } ++ ++ result = zload(coord->node); ++ BUG_ON(result != 0); ++ loaded = coord->node; ++ ++ if (coord->between == AFTER_UNIT) { ++ /* ++ * append existing extent item with unallocated extent of width ++ * nr_jnodes ++ */ ++ init_coord_extension_extent(&uf_coord, ++ get_key_offset(&key)); ++ result = append_last_extent(&uf_coord, &key, ++ &node, 1); ++ } else if (coord->between == AT_UNIT) { ++ /* ++ * overwrite ++ * not optimal yet. Will be optimized if new write will show ++ * performance win. ++ */ ++ init_coord_extension_extent(&uf_coord, ++ get_key_offset(&key)); ++ result = overwrite_extent(&uf_coord, &key, ++ &node, 1, plugged_hole); ++ } else { ++ /* ++ * there are no items of this file in the tree yet. 
Create ++ * first item of the file inserting one unallocated extent of ++ * width nr_jnodes ++ */ ++ result = insert_first_extent(&uf_coord, &key, &node, 1, inode); ++ } ++ assert("", result == 1 || result < 0); ++ zrelse(loaded); ++ done_lh(&lh); ++ assert("", lock_counters()->d_refs == 0); ++ return (result == 1) ? 0 : result; ++} ++ ++/** ++ * update_extents ++ * @file: ++ * @jnodes: ++ * @count: ++ * @off: ++ * ++ */ ++static int update_extents(struct file *file, jnode **jnodes, int count, loff_t pos) ++{ ++ struct inode *inode; ++ struct hint hint; ++ reiser4_key key; ++ int result; ++ znode *loaded; ++ ++ result = load_file_hint(file, &hint); ++ BUG_ON(result != 0); ++ ++ inode = file->f_dentry->d_inode; ++ if (count != 0) ++ /* ++ * count == 0 is special case: expanding truncate ++ */ ++ pos = (loff_t)index_jnode(jnodes[0]) << PAGE_CACHE_SHIFT; ++ key_by_inode_and_offset_common(inode, pos, &key); ++ ++ assert("", lock_counters()->d_refs == 0); ++ ++ do { ++ result = find_file_item(&hint, &key, ZNODE_WRITE_LOCK, inode); ++ if (IS_CBKERR(result)) { ++ assert("", lock_counters()->d_refs == 0); ++ return result; ++ } ++ ++ result = zload(hint.ext_coord.coord.node); ++ BUG_ON(result != 0); ++ loaded = hint.ext_coord.coord.node; ++ ++ if (hint.ext_coord.coord.between == AFTER_UNIT) { ++ /* ++ * append existing extent item with unallocated extent ++ * of width nr_jnodes ++ */ ++ if (hint.ext_coord.valid == 0) ++ /* NOTE: get statistics on this */ ++ init_coord_extension_extent(&hint.ext_coord, ++ get_key_offset(&key)); ++ result = append_last_extent(&hint.ext_coord, &key, ++ jnodes, count); ++ } else if (hint.ext_coord.coord.between == AT_UNIT) { ++ /* ++ * overwrite ++ * not optimal yet. Will be optimized if new write will ++ * show performance win. ++ */ ++ if (hint.ext_coord.valid == 0) ++ /* NOTE: get statistics on this */ ++ init_coord_extension_extent(&hint.ext_coord, ++ get_key_offset(&key)); ++ result = overwrite_extent(&hint.ext_coord, &key, ++ jnodes, count, NULL); ++ } else { ++ /* ++ * there are no items of this file in the tree ++ * yet. Create first item of the file inserting one ++ * unallocated extent of * width nr_jnodes ++ */ ++ result = insert_first_extent(&hint.ext_coord, &key, ++ jnodes, count, inode); ++ } ++ zrelse(loaded); ++ if (result < 0) { ++ done_lh(hint.ext_coord.lh); ++ break; ++ } ++ ++ jnodes += result; ++ count -= result; ++ set_key_offset(&key, get_key_offset(&key) + result * PAGE_CACHE_SIZE); ++ ++ /* seal and unlock znode */ ++ if (hint.ext_coord.valid) ++ set_hint(&hint, &key, ZNODE_WRITE_LOCK); ++ else ++ unset_hint(&hint); ++ ++ } while (count > 0); ++ ++ save_file_hint(file, &hint); ++ assert("", lock_counters()->d_refs == 0); ++ return result; ++} ++ ++/** ++ * write_extent_reserve_space - reserve space for extent write operation ++ * @inode: ++ * ++ * Estimates and reserves space which may be required for writing ++ * WRITE_GRANULARITY pages of file. ++ */ ++static int write_extent_reserve_space(struct inode *inode) ++{ ++ __u64 count; ++ reiser4_tree *tree; ++ ++ /* ++ * to write WRITE_GRANULARITY pages to a file by extents we have to ++ * reserve disk space for: ++ ++ * 1. find_file_item may have to insert empty node to the tree (empty ++ * leaf node between two extent items). This requires 1 block and ++ * number of blocks which are necessary to perform insertion of an ++ * internal item into twig level. ++ ++ * 2. 
for each of written pages there might be needed 1 block and ++ * number of blocks which might be necessary to perform insertion of or ++ * paste to an extent item. ++ ++ * 3. stat data update ++ */ ++ tree = tree_by_inode(inode); ++ count = estimate_one_insert_item(tree) + ++ WRITE_GRANULARITY * (1 + estimate_one_insert_into_item(tree)) + ++ estimate_one_insert_item(tree); ++ grab_space_enable(); ++ return reiser4_grab_space(count, 0 /* flags */); ++} ++ ++/** ++ * write_extent - write method of extent item plugin ++ * @file: file to write to ++ * @buf: address of user-space buffer ++ * @write_amount: number of bytes to write ++ * @off: position in file to write to ++ * ++ */ ++ssize_t write_extent(struct file *file, const char __user *buf, size_t count, ++ loff_t *pos) ++{ ++ int have_to_update_extent; ++ int nr_pages; ++ struct page *page; ++ jnode *jnodes[WRITE_GRANULARITY + 1]; ++ struct inode *inode; ++ unsigned long index; ++ unsigned long end; ++ int i; ++ int to_page, page_off; ++ size_t left, written; ++ int result; ++ ++ inode = file->f_dentry->d_inode; ++ if (write_extent_reserve_space(inode)) ++ return RETERR(-ENOSPC); ++ ++ if (count == 0) { ++ /* truncate case */ ++ update_extents(file, jnodes, 0, *pos); ++ return 0; ++ } ++ ++ BUG_ON(get_current_context()->trans->atom != NULL); ++ ++ index = *pos >> PAGE_CACHE_SHIFT; ++ /* calculate number of pages which are to be written */ ++ end = ((*pos + count - 1) >> PAGE_CACHE_SHIFT); ++ nr_pages = end - index + 1; ++ assert("", nr_pages <= WRITE_GRANULARITY + 1); ++ ++ /* get pages and jnodes */ ++ for (i = 0; i < nr_pages; i ++) { ++ page = find_or_create_page(inode->i_mapping, index + i, get_gfp_mask()); ++ if (page == NULL) { ++ while(i --) { ++ unlock_page(jnode_page(jnodes[i])); ++ page_cache_release(jnode_page(jnodes[i])); ++ } ++ return RETERR(-ENOMEM); ++ } ++ ++ jnodes[i] = jnode_of_page(page); ++ if (IS_ERR(jnodes[i])) { ++ unlock_page(page); ++ page_cache_release(page); ++ while (i --) { ++ jput(jnodes[i]); ++ page_cache_release(jnode_page(jnodes[i])); ++ } ++ return RETERR(-ENOMEM); ++ } ++ /* prevent jnode and page from disconnecting */ ++ JF_SET(jnodes[i], JNODE_WRITE_PREPARED); ++ unlock_page(page); ++ } ++ ++ BUG_ON(get_current_context()->trans->atom != NULL); ++ ++ have_to_update_extent = 0; ++ ++ left = count; ++ page_off = (*pos & (PAGE_CACHE_SIZE - 1)); ++ for (i = 0; i < nr_pages; i ++) { ++ to_page = PAGE_CACHE_SIZE - page_off; ++ if (to_page > left) ++ to_page = left; ++ page = jnode_page(jnodes[i]); ++ if (((loff_t)page->index << PAGE_CACHE_SHIFT) < inode->i_size && ++ !PageUptodate(page) && to_page != PAGE_CACHE_SIZE) { ++ /* ++ * the above is not optimal for partial write to last ++ * page of file when file size is not at boundary of ++ * page ++ */ ++ lock_page(page); ++ if (!PageUptodate(page)) { ++ result = readpage_unix_file(NULL, page); ++ BUG_ON(result != 0); ++ /* wait for read completion */ ++ lock_page(page); ++ BUG_ON(!PageUptodate(page)); ++ unlock_page(page); ++ } else ++ result = 0; ++ } ++ ++ BUG_ON(get_current_context()->trans->atom != NULL); ++ fault_in_pages_readable(buf, to_page); ++ BUG_ON(get_current_context()->trans->atom != NULL); ++ ++ lock_page(page); ++ if (!PageUptodate(page) && to_page != PAGE_CACHE_SIZE) { ++ void *kaddr; ++ ++ kaddr = kmap_atomic(page, KM_USER0); ++ memset(kaddr, 0, page_off); ++ memset(kaddr + page_off + to_page, 0, ++ PAGE_CACHE_SIZE - (page_off + to_page)); ++ flush_dcache_page(page); ++ kunmap_atomic(kaddr, KM_USER0); ++ } ++ ++ written = 
filemap_copy_from_user(page, page_off, buf, to_page); ++ if (written != to_page) { ++ unlock_page(page); ++ page_cache_release(page); ++ nr_pages = i; ++ jput(jnodes[i]); ++ result = RETERR(-EFAULT); ++ break; ++ } ++ flush_dcache_page(page); ++ set_page_dirty_internal(page); ++ unlock_page(page); ++ mark_page_accessed(page); ++ SetPageUptodate(page); ++ page_cache_release(page); ++ ++ if (jnodes[i]->blocknr == 0) ++ have_to_update_extent ++; ++ ++ page_off = 0; ++ buf += to_page; ++ left -= to_page; ++ BUG_ON(get_current_context()->trans->atom != NULL); ++ } ++ ++ if (have_to_update_extent) { ++ update_extents(file, jnodes, nr_pages, *pos); ++ } else { ++ for (i = 0; i < nr_pages; i ++) { ++ spin_lock_jnode(jnodes[i]); ++ result = try_capture(jnodes[i], ZNODE_WRITE_LOCK, 0); ++ BUG_ON(result != 0); ++ jnode_make_dirty_locked(jnodes[i]); ++ spin_unlock_jnode(jnodes[i]); ++ } ++ } ++ ++ for (i = 0; i < nr_pages; i ++) { ++ JF_CLR(jnodes[i], JNODE_WRITE_PREPARED); ++ jput(jnodes[i]); ++ } ++ ++ /* the only error handled so far is EFAULT on copy_from_user */ ++ return (count - left) ? (count - left) : -EFAULT; ++} ++ ++static inline void zero_page(struct page *page) ++{ ++ char *kaddr = kmap_atomic(page, KM_USER0); ++ ++ memset(kaddr, 0, PAGE_CACHE_SIZE); ++ flush_dcache_page(page); ++ kunmap_atomic(kaddr, KM_USER0); ++ SetPageUptodate(page); ++ unlock_page(page); ++} ++ ++static int ++do_readpage_extent(reiser4_extent * ext, reiser4_block_nr pos, ++ struct page *page) ++{ ++ jnode *j; ++ struct address_space *mapping; ++ unsigned long index; ++ oid_t oid; ++ reiser4_block_nr block; ++ ++ mapping = page->mapping; ++ oid = get_inode_oid(mapping->host); ++ index = page->index; ++ ++ switch (state_of_extent(ext)) { ++ case HOLE_EXTENT: ++ /* ++ * it is possible to have hole page with jnode, if page was ++ * eflushed previously. 
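++ * (Editor's note: "eflushed" refers to reiser4's emergency flush, which ++ * may write a page out early and leave its jnode behind. Hence the code ++ * below looks the jnode up first; if there is none, or its I/O block is ++ * still 0, the page is simply zero-filled by zero_page().)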
++ */ ++ j = jfind(mapping, index); ++ if (j == NULL) { ++ zero_page(page); ++ return 0; ++ } ++ spin_lock_jnode(j); ++ if (!jnode_page(j)) { ++ jnode_attach_page(j, page); ++ } else { ++ BUG_ON(jnode_page(j) != page); ++ assert("vs-1504", jnode_page(j) == page); ++ } ++ block = *jnode_get_io_block(j); ++ spin_unlock_jnode(j); ++ if (block == 0) { ++ zero_page(page); ++ jput(j); ++ return 0; ++ } ++ break; ++ ++ case ALLOCATED_EXTENT: ++ j = jnode_of_page(page); ++ if (IS_ERR(j)) ++ return PTR_ERR(j); ++ if (*jnode_get_block(j) == 0) { ++ reiser4_block_nr blocknr; ++ ++ blocknr = extent_get_start(ext) + pos; ++ jnode_set_block(j, &blocknr); ++ } else ++ assert("vs-1403", ++ j->blocknr == extent_get_start(ext) + pos); ++ break; ++ ++ case UNALLOCATED_EXTENT: ++ j = jfind(mapping, index); ++ assert("nikita-2688", j); ++ assert("vs-1426", jnode_page(j) == NULL); ++ ++ spin_lock_jnode(j); ++ jnode_attach_page(j, page); ++ spin_unlock_jnode(j); ++ break; ++ ++ default: ++ warning("vs-957", "wrong extent\n"); ++ return RETERR(-EIO); ++ } ++ ++ BUG_ON(j == 0); ++ page_io(page, j, READ, get_gfp_mask()); ++ jput(j); ++ return 0; ++} ++ ++static int ++move_coord_pages(coord_t * coord, extent_coord_extension_t * ext_coord, ++ unsigned count) ++{ ++ reiser4_extent *ext; ++ ++ ext_coord->expected_page += count; ++ ++ ext = ext_by_offset(coord->node, ext_coord->ext_offset); ++ ++ do { ++ if (ext_coord->pos_in_unit + count < ext_coord->width) { ++ ext_coord->pos_in_unit += count; ++ break; ++ } ++ ++ if (coord->unit_pos == ext_coord->nr_units - 1) { ++ coord->between = AFTER_UNIT; ++ return 1; ++ } ++ ++ /* shift to next unit */ ++ count -= (ext_coord->width - ext_coord->pos_in_unit); ++ coord->unit_pos++; ++ ext_coord->pos_in_unit = 0; ++ ext_coord->ext_offset += sizeof(reiser4_extent); ++ ext++; ++ ON_DEBUG(ext_coord->extent = *ext); ++ ext_coord->width = extent_get_width(ext); ++ } while (1); ++ ++ return 0; ++} ++ ++static int readahead_readpage_extent(void *vp, struct page *page) ++{ ++ int result; ++ uf_coord_t *uf_coord; ++ coord_t *coord; ++ extent_coord_extension_t *ext_coord; ++ ++ uf_coord = vp; ++ coord = &uf_coord->coord; ++ ++ if (coord->between != AT_UNIT) { ++ unlock_page(page); ++ return RETERR(-EINVAL); ++ } ++ ++ ext_coord = &uf_coord->extension.extent; ++ if (ext_coord->expected_page != page->index) { ++ /* read_cache_pages skipped few pages. 
Try to adjust coord to page */ ++ assert("vs-1269", page->index > ext_coord->expected_page); ++ if (move_coord_pages ++ (coord, ext_coord, ++ page->index - ext_coord->expected_page)) { ++ /* extent pointing to this page is not here */ ++ unlock_page(page); ++ return RETERR(-EINVAL); ++ } ++ ++ assert("vs-1274", offset_is_in_unit(coord, ++ (loff_t) page-> ++ index << PAGE_CACHE_SHIFT)); ++ ext_coord->expected_page = page->index; ++ } ++ ++ assert("vs-1281", page->index == ext_coord->expected_page); ++ result = ++ do_readpage_extent(ext_by_ext_coord(uf_coord), ++ ext_coord->pos_in_unit, page); ++ if (!result) ++ move_coord_pages(coord, ext_coord, 1); ++ return result; ++} ++ ++static int move_coord_forward(uf_coord_t *ext_coord) ++{ ++ coord_t *coord; ++ extent_coord_extension_t *extension; ++ ++ check_uf_coord(ext_coord, NULL); ++ ++ extension = &ext_coord->extension.extent; ++ extension->pos_in_unit++; ++ if (extension->pos_in_unit < extension->width) ++ /* stay within the same extent unit */ ++ return 0; ++ ++ coord = &ext_coord->coord; ++ ++ /* try to move to the next extent unit */ ++ coord->unit_pos++; ++ if (coord->unit_pos < extension->nr_units) { ++ /* went to the next extent unit */ ++ reiser4_extent *ext; ++ ++ extension->pos_in_unit = 0; ++ extension->ext_offset += sizeof(reiser4_extent); ++ ext = ext_by_offset(coord->node, extension->ext_offset); ++ ON_DEBUG(extension->extent = *ext); ++ extension->width = extent_get_width(ext); ++ return 0; ++ } ++ ++ /* there is no units in the item anymore */ ++ return 1; ++} ++ ++/* this is called by read_cache_pages for each of readahead pages */ ++static int extent_readpage_filler(void *data, struct page *page) ++{ ++ hint_t *hint; ++ loff_t offset; ++ reiser4_key key; ++ uf_coord_t *ext_coord; ++ int result; ++ ++ offset = (loff_t) page->index << PAGE_CACHE_SHIFT; ++ key_by_inode_and_offset_common(page->mapping->host, offset, &key); ++ ++ hint = (hint_t *) data; ++ ext_coord = &hint->ext_coord; ++ ++ BUG_ON(PageUptodate(page)); ++ unlock_page(page); ++ ++ if (hint_validate(hint, &key, 1 /* check key */ , ZNODE_READ_LOCK) != 0) { ++ result = coord_by_key(current_tree, &key, &ext_coord->coord, ++ ext_coord->lh, ZNODE_READ_LOCK, ++ FIND_EXACT, TWIG_LEVEL, ++ TWIG_LEVEL, CBK_UNIQUE, NULL); ++ if (result != CBK_COORD_FOUND) { ++ unset_hint(hint); ++ return result; ++ } ++ ext_coord->valid = 0; ++ } ++ ++ if (zload(ext_coord->coord.node)) { ++ unset_hint(hint); ++ return RETERR(-EIO); ++ } ++ if (!item_is_extent(&ext_coord->coord)) { ++ /* tail conversion is running in parallel */ ++ zrelse(ext_coord->coord.node); ++ unset_hint(hint); ++ return RETERR(-EIO); ++ } ++ ++ if (ext_coord->valid == 0) ++ init_coord_extension_extent(ext_coord, offset); ++ ++ check_uf_coord(ext_coord, &key); ++ ++ lock_page(page); ++ if (!PageUptodate(page)) { ++ result = do_readpage_extent(ext_by_ext_coord(ext_coord), ++ ext_coord->extension.extent. 
++ pos_in_unit, page); ++ if (result) ++ unlock_page(page); ++ } else { ++ unlock_page(page); ++ result = 0; ++ } ++ if (!result && move_coord_forward(ext_coord) == 0) { ++ set_key_offset(&key, offset + PAGE_CACHE_SIZE); ++ set_hint(hint, &key, ZNODE_READ_LOCK); ++ } else ++ unset_hint(hint); ++ zrelse(ext_coord->coord.node); ++ return result; ++} ++ ++/* this is called by reiser4_readpages */ ++static void ++extent_readpages_hook(struct address_space *mapping, struct list_head *pages, ++ void *data) ++{ ++ /* FIXME: try whether having reiser4_read_cache_pages improves anything */ ++ read_cache_pages(mapping, pages, extent_readpage_filler, data); ++} ++ ++static int ++call_page_cache_readahead(struct address_space *mapping, struct file *file, ++ hint_t * hint, ++ unsigned long page_nr, ++ unsigned long ra_pages, struct file_ra_state *ra) ++{ ++ reiser4_file_fsdata *fsdata; ++ int result; ++ ++ fsdata = reiser4_get_file_fsdata(file); ++ if (IS_ERR(fsdata)) ++ return page_nr; ++ fsdata->ra2.data = hint; ++ fsdata->ra2.readpages = extent_readpages_hook; ++ ++ result = page_cache_readahead(mapping, ra, file, page_nr, ra_pages); ++ fsdata->ra2.readpages = NULL; ++ return result; ++} ++ ++/* this is called when readahead did not */ ++static int call_readpage(struct file *file, struct page *page) ++{ ++ int result; ++ ++ result = readpage_unix_file_nolock(file, page); ++ if (result) ++ return result; ++ ++ lock_page(page); ++ if (!PageUptodate(page)) { ++ unlock_page(page); ++ page_detach_jnode(page, page->mapping, page->index); ++ warning("jmacd-97178", "page is not up to date"); ++ return RETERR(-EIO); ++ } ++ unlock_page(page); ++ return 0; ++} ++ ++static int filler(void *vp, struct page *page) ++{ ++ return readpage_unix_file_nolock(vp, page); ++} ++ ++/* Implements plugin->u.item.s.file.read operation for extent items. */ ++int read_extent(struct file *file, flow_t *flow, hint_t *hint) ++{ ++ int result; ++ struct page *page; ++ unsigned long cur_page, next_page; ++ unsigned long page_off, count; ++ struct address_space *mapping; ++ loff_t file_off; ++ uf_coord_t *uf_coord; ++ coord_t *coord; ++ extent_coord_extension_t *ext_coord; ++ unsigned long nr_pages, prev_page; ++ struct file_ra_state ra; ++ char *kaddr; ++ ++ assert("vs-1353", current_blocksize == PAGE_CACHE_SIZE); ++ assert("vs-572", flow->user == 1); ++ assert("vs-1351", flow->length > 0); ++ ++ uf_coord = &hint->ext_coord; ++ ++ check_uf_coord(uf_coord, NULL); ++ assert("vs-33", uf_coord->lh == &hint->lh); ++ ++ coord = &uf_coord->coord; ++ assert("vs-1119", znode_is_rlocked(coord->node)); ++ assert("vs-1120", znode_is_loaded(coord->node)); ++ assert("vs-1256", coord_matches_key_extent(coord, &flow->key)); ++ ++ mapping = file->f_dentry->d_inode->i_mapping; ++ ext_coord = &uf_coord->extension.extent; ++ ++ /* offset in a file to start read from */ ++ file_off = get_key_offset(&flow->key); ++ /* offset within the page to start read from */ ++ page_off = (unsigned long)(file_off & (PAGE_CACHE_SIZE - 1)); ++ /* bytes which can be read from the page which contains file_off */ ++ count = PAGE_CACHE_SIZE - page_off; ++ ++ /* index of page containing offset read is to start from */ ++ cur_page = (unsigned long)(file_off >> PAGE_CACHE_SHIFT); ++ next_page = cur_page; ++ /* number of pages flow spans over */ ++ nr_pages = ++ ((file_off + flow->length + PAGE_CACHE_SIZE - ++ 1) >> PAGE_CACHE_SHIFT) - cur_page; ++ ++ /* we start having twig node read locked. However, we do not want to ++ keep that lock all the time readahead works. 
So, set a seal and ++ release twig node. */ ++ set_hint(hint, &flow->key, ZNODE_READ_LOCK); ++ /* &hint->lh is done-ed */ ++ ++ ra = file->f_ra; ++ prev_page = ra.prev_page; ++ do { ++ txn_restart_current(); ++ if (next_page == cur_page) ++ next_page = ++ call_page_cache_readahead(mapping, file, hint, ++ cur_page, nr_pages, &ra); ++ ++ page = find_get_page(mapping, cur_page); ++ if (unlikely(page == NULL)) { ++ handle_ra_miss(mapping, &ra, cur_page); ++ page = read_cache_page(mapping, cur_page, filler, file); ++ if (IS_ERR(page)) ++ return PTR_ERR(page); ++ lock_page(page); ++ if (!PageUptodate(page)) { ++ unlock_page(page); ++ page_detach_jnode(page, mapping, cur_page); ++ page_cache_release(page); ++ warning("jmacd-97178", ++ "extent_read: page is not up to date"); ++ return RETERR(-EIO); ++ } ++ unlock_page(page); ++ } else { ++ if (!PageUptodate(page)) { ++ lock_page(page); ++ ++ assert("", page->mapping == mapping); ++ if (PageUptodate(page)) ++ unlock_page(page); ++ else { ++ result = call_readpage(file, page); ++ if (result) { ++ page_cache_release(page); ++ return RETERR(result); ++ } ++ } ++ } ++ if (prev_page != cur_page) ++ mark_page_accessed(page); ++ prev_page = cur_page; ++ } ++ ++ /* If users can be writing to this page using arbitrary virtual ++ addresses, take care about potential aliasing before reading ++ the page on the kernel side. ++ */ ++ if (mapping_writably_mapped(mapping)) ++ flush_dcache_page(page); ++ ++ assert("nikita-3034", schedulable()); ++ ++ /* number of bytes which are to be read from the page */ ++ if (count > flow->length) ++ count = flow->length; ++ ++ result = fault_in_pages_writeable(flow->data, count); ++ if (result) { ++ page_cache_release(page); ++ return RETERR(-EFAULT); ++ } ++ ++ kaddr = kmap_atomic(page, KM_USER0); ++ result = __copy_to_user_inatomic(flow->data, ++ kaddr + page_off, count); ++ kunmap_atomic(kaddr, KM_USER0); ++ if (result != 0) { ++ kaddr = kmap(page); ++ result = __copy_to_user(flow->data, kaddr + page_off, count); ++ kunmap(page); ++ if (unlikely(result)) ++ return RETERR(-EFAULT); ++ } ++ ++ page_cache_release(page); ++ ++ /* increase key (flow->key), update user area pointer (flow->data) */ ++ move_flow_forward(flow, count); ++ ++ page_off = 0; ++ cur_page ++; ++ count = PAGE_CACHE_SIZE; ++ nr_pages--; ++ } while (flow->length); ++ ++ file->f_ra = ra; ++ return 0; ++} ++ ++/* ++ plugin->u.item.s.file.readpages ++*/ ++void ++readpages_extent(void *vp, struct address_space *mapping, ++ struct list_head *pages) ++{ ++ assert("vs-1739", 0); ++ if (vp) ++ read_cache_pages(mapping, pages, readahead_readpage_extent, vp); ++} ++ ++/* ++ plugin->s.file.readpage ++ reiser4_read->unix_file_read->page_cache_readahead->reiser4_readpage->unix_file_readpage->extent_readpage ++ or ++ filemap_nopage->reiser4_readpage->readpage_unix_file->->readpage_extent ++ ++ At the beginning: coord->node is read locked, zloaded, page is ++ locked, coord is set to existing unit inside of extent item (it is not necessary that coord matches to page->index) ++*/ ++int readpage_extent(void *vp, struct page *page) ++{ ++ uf_coord_t *uf_coord = vp; ++ ON_DEBUG(coord_t * coord = &uf_coord->coord); ++ ON_DEBUG(reiser4_key key); ++ ++ assert("vs-1040", PageLocked(page)); ++ assert("vs-1050", !PageUptodate(page)); ++ assert("vs-1039", page->mapping && page->mapping->host); ++ ++ assert("vs-1044", znode_is_loaded(coord->node)); ++ assert("vs-758", item_is_extent(coord)); ++ assert("vs-1046", coord_is_existing_unit(coord)); ++ assert("vs-1045",
znode_is_rlocked(coord->node)); ++ assert("vs-1047", ++ page->mapping->host->i_ino == ++ get_key_objectid(item_key_by_coord(coord, &key))); ++ check_uf_coord(uf_coord, NULL); ++ ++ return do_readpage_extent(ext_by_ext_coord(uf_coord), ++ uf_coord->extension.extent.pos_in_unit, page); ++} ++ ++/** ++ * get_block_address_extent ++ * @coord: ++ * @block: ++ * @result: ++ * ++ * ++ */ ++int get_block_address_extent(const coord_t *coord, sector_t block, ++ sector_t *result) ++{ ++ reiser4_extent *ext; ++ ++ if (!coord_is_existing_unit(coord)) ++ return RETERR(-EINVAL); ++ ++ ext = extent_by_coord(coord); ++ ++ if (state_of_extent(ext) != ALLOCATED_EXTENT) ++ /* FIXME: bad things may happen if it is unallocated extent */ ++ *result = 0; ++ else { ++ reiser4_key key; ++ ++ unit_key_by_coord(coord, &key); ++ assert("vs-1645", ++ block >= get_key_offset(&key) >> current_blocksize_bits); ++ assert("vs-1646", ++ block < ++ (get_key_offset(&key) >> current_blocksize_bits) + ++ extent_get_width(ext)); ++ *result = ++ extent_get_start(ext) + (block - ++ (get_key_offset(&key) >> ++ current_blocksize_bits)); ++ } ++ return 0; ++} ++ ++/* ++ plugin->u.item.s.file.append_key ++ key of first byte which is next after the last byte addressed by this extent ++*/ ++reiser4_key *append_key_extent(const coord_t * coord, reiser4_key * key) ++{ ++ item_key_by_coord(coord, key); ++ set_key_offset(key, ++ get_key_offset(key) + extent_size(coord, ++ nr_units_extent ++ (coord))); ++ ++ assert("vs-610", get_key_offset(key) ++ && (get_key_offset(key) & (current_blocksize - 1)) == 0); ++ return key; ++} ++ ++/* plugin->u.item.s.file.init_coord_extension */ ++void init_coord_extension_extent(uf_coord_t * uf_coord, loff_t lookuped) ++{ ++ coord_t *coord; ++ extent_coord_extension_t *ext_coord; ++ reiser4_key key; ++ loff_t offset; ++ ++ assert("vs-1295", uf_coord->valid == 0); ++ ++ coord = &uf_coord->coord; ++ assert("vs-1288", coord_is_iplug_set(coord)); ++ assert("vs-1327", znode_is_loaded(coord->node)); ++ ++ if (coord->between != AFTER_UNIT && coord->between != AT_UNIT) ++ return; ++ ++ ext_coord = &uf_coord->extension.extent; ++ ext_coord->nr_units = nr_units_extent(coord); ++ ext_coord->ext_offset = ++ (char *)extent_by_coord(coord) - zdata(coord->node); ++ ext_coord->width = extent_get_width(extent_by_coord(coord)); ++ ON_DEBUG(ext_coord->extent = *extent_by_coord(coord)); ++ uf_coord->valid = 1; ++ ++ /* pos_in_unit is the only uninitialized field in extended coord */ ++ if (coord->between == AFTER_UNIT) { ++ assert("vs-1330", ++ coord->unit_pos == nr_units_extent(coord) - 1); ++ ++ ext_coord->pos_in_unit = ext_coord->width - 1; ++ } else { ++ /* AT_UNIT */ ++ unit_key_by_coord(coord, &key); ++ offset = get_key_offset(&key); ++ ++ assert("vs-1328", offset <= lookuped); ++ assert("vs-1329", ++ lookuped < ++ offset + ext_coord->width * current_blocksize); ++ ext_coord->pos_in_unit = ++ ((lookuped - offset) >> current_blocksize_bits); ++ } ++} ++ ++/* ++ * Local variables: ++ * c-indentation-style: "K&R" ++ * mode-name: "LC" ++ * c-basic-offset: 8 ++ * tab-width: 8 ++ * fill-column: 79 ++ * scroll-step: 1 ++ * End: ++ */ +Index: linux-2.6.16/fs/reiser4/plugin/item/extent_flush_ops.c +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/plugin/item/extent_flush_ops.c +@@ -0,0 +1,1018 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++#include "item.h" ++#include "../../tree.h" ++#include "../../jnode.h" ++#include
"../../super.h" ++#include "../../flush.h" ++#include "../../carry.h" ++#include "../object.h" ++ ++#include ++ ++static reiser4_block_nr extent_unit_start(const coord_t * item); ++ ++/* Return either first or last extent (depending on @side) of the item ++ @coord is set to. Set @pos_in_unit either to first or to last block ++ of extent. */ ++static reiser4_extent *extent_utmost_ext(const coord_t * coord, sideof side, ++ reiser4_block_nr * pos_in_unit) ++{ ++ reiser4_extent *ext; ++ ++ if (side == LEFT_SIDE) { ++ /* get first extent of item */ ++ ext = extent_item(coord); ++ *pos_in_unit = 0; ++ } else { ++ /* get last extent of item and last position within it */ ++ assert("vs-363", side == RIGHT_SIDE); ++ ext = extent_item(coord) + coord_last_unit_pos(coord); ++ *pos_in_unit = extent_get_width(ext) - 1; ++ } ++ ++ return ext; ++} ++ ++/* item_plugin->f.utmost_child */ ++/* Return the child. Coord is set to extent item. Find jnode corresponding ++ either to first or to last unformatted node pointed by the item */ ++int utmost_child_extent(const coord_t * coord, sideof side, jnode ** childp) ++{ ++ reiser4_extent *ext; ++ reiser4_block_nr pos_in_unit; ++ ++ ext = extent_utmost_ext(coord, side, &pos_in_unit); ++ ++ switch (state_of_extent(ext)) { ++ case HOLE_EXTENT: ++ *childp = NULL; ++ return 0; ++ case ALLOCATED_EXTENT: ++ case UNALLOCATED_EXTENT: ++ break; ++ default: ++ /* this should never happen */ ++ assert("vs-1417", 0); ++ } ++ ++ { ++ reiser4_key key; ++ reiser4_tree *tree; ++ unsigned long index; ++ ++ if (side == LEFT_SIDE) { ++ /* get key of first byte addressed by the extent */ ++ item_key_by_coord(coord, &key); ++ } else { ++ /* get key of byte which next after last byte addressed by the extent */ ++ append_key_extent(coord, &key); ++ } ++ ++ assert("vs-544", ++ (get_key_offset(&key) >> PAGE_CACHE_SHIFT) < ~0ul); ++ /* index of first or last (depending on @side) page addressed ++ by the extent */ ++ index = ++ (unsigned long)(get_key_offset(&key) >> PAGE_CACHE_SHIFT); ++ if (side == RIGHT_SIDE) ++ index--; ++ ++ tree = coord->node->zjnode.tree; ++ *childp = jlookup(tree, get_key_objectid(&key), index); ++ } ++ ++ return 0; ++} ++ ++/* item_plugin->f.utmost_child_real_block */ ++/* Return the child's block, if allocated. */ ++int ++utmost_child_real_block_extent(const coord_t * coord, sideof side, ++ reiser4_block_nr * block) ++{ ++ reiser4_extent *ext; ++ ++ ext = extent_by_coord(coord); ++ ++ switch (state_of_extent(ext)) { ++ case ALLOCATED_EXTENT: ++ *block = extent_get_start(ext); ++ if (side == RIGHT_SIDE) ++ *block += extent_get_width(ext) - 1; ++ break; ++ case HOLE_EXTENT: ++ case UNALLOCATED_EXTENT: ++ *block = 0; ++ break; ++ default: ++ /* this should never happen */ ++ assert("vs-1418", 0); ++ } ++ ++ return 0; ++} ++ ++/* item_plugin->f.scan */ ++/* Performs leftward scanning starting from an unformatted node and its parent coordinate. ++ This scan continues, advancing the parent coordinate, until either it encounters a ++ formatted child or it finishes scanning this node. ++ ++ If unallocated, the entire extent must be dirty and in the same atom. (Actually, I'm ++ not sure this is last property (same atom) is enforced, but it should be the case since ++ one atom must write the parent and the others must read the parent, thus fusing?). In ++ any case, the code below asserts this case for unallocated extents. Unallocated ++ extents are thus optimized because we can skip to the endpoint when scanning. 
++ ++ It returns control to scan_extent, handles these terminating conditions, e.g., by ++ loading the next twig. ++*/ ++int scan_extent(flush_scan * scan) ++{ ++ coord_t coord; ++ jnode *neighbor; ++ unsigned long scan_index, unit_index, unit_width, scan_max, scan_dist; ++ reiser4_block_nr unit_start; ++ __u64 oid; ++ reiser4_key key; ++ int ret = 0, allocated, incr; ++ reiser4_tree *tree; ++ ++ if (!JF_ISSET(scan->node, JNODE_DIRTY)) { ++ scan->stop = 1; ++ return 0; /* Race with truncate, this node is already ++ * truncated. */ ++ } ++ ++ coord_dup(&coord, &scan->parent_coord); ++ ++ assert("jmacd-1404", !scan_finished(scan)); ++ assert("jmacd-1405", jnode_get_level(scan->node) == LEAF_LEVEL); ++ assert("jmacd-1406", jnode_is_unformatted(scan->node)); ++ ++ /* The scan_index variable corresponds to the current page index of the ++ unformatted block scan position. */ ++ scan_index = index_jnode(scan->node); ++ ++ assert("jmacd-7889", item_is_extent(&coord)); ++ ++ repeat: ++ /* objectid of file */ ++ oid = get_key_objectid(item_key_by_coord(&coord, &key)); ++ ++ allocated = !extent_is_unallocated(&coord); ++ /* Get the values of this extent unit: */ ++ unit_index = extent_unit_index(&coord); ++ unit_width = extent_unit_width(&coord); ++ unit_start = extent_unit_start(&coord); ++ ++ assert("jmacd-7187", unit_width > 0); ++ assert("jmacd-7188", scan_index >= unit_index); ++ assert("jmacd-7189", scan_index <= unit_index + unit_width - 1); ++ ++ /* Depending on the scan direction, we set different maximum values for scan_index ++ (scan_max) and the number of nodes that would be passed if the scan goes the ++ entire way (scan_dist). Incr is an integer reflecting the incremental ++ direction of scan_index. */ ++ if (scanning_left(scan)) { ++ scan_max = unit_index; ++ scan_dist = scan_index - unit_index; ++ incr = -1; ++ } else { ++ scan_max = unit_index + unit_width - 1; ++ scan_dist = scan_max - unit_index; ++ incr = +1; ++ } ++ ++ tree = coord.node->zjnode.tree; ++ ++ /* If the extent is allocated we have to check each of its blocks. If the extent ++ is unallocated we can skip to the scan_max. */ ++ if (allocated) { ++ do { ++ neighbor = jlookup(tree, oid, scan_index); ++ if (neighbor == NULL) ++ goto stop_same_parent; ++ ++ if (scan->node != neighbor ++ && !scan_goto(scan, neighbor)) { ++ /* @neighbor was jput() by scan_goto(). */ ++ goto stop_same_parent; ++ } ++ ++ ret = scan_set_current(scan, neighbor, 1, &coord); ++ if (ret != 0) { ++ goto exit; ++ } ++ ++ /* reference to @neighbor is stored in @scan, no need ++ to jput(). */ ++ scan_index += incr; ++ ++ } while (incr + scan_max != scan_index); ++ ++ } else { ++ /* Optimized case for unallocated extents, skip to the end. */ ++ neighbor = jlookup(tree, oid, scan_max /*index */ ); ++ if (neighbor == NULL) { ++ /* Race with truncate */ ++ scan->stop = 1; ++ ret = 0; ++ goto exit; ++ } ++ ++ assert("zam-1043", blocknr_is_fake(jnode_get_block(neighbor))); ++ ++ ret = scan_set_current(scan, neighbor, scan_dist, &coord); ++ if (ret != 0) { ++ goto exit; ++ } ++ } ++ ++ if (coord_sideof_unit(&coord, scan->direction) == 0 ++ && item_is_extent(&coord)) { ++ /* Continue as long as there are more extent units. */ ++ ++ scan_index = ++ extent_unit_index(&coord) + ++ (scanning_left(scan) ? extent_unit_width(&coord) - 1 : 0); ++ goto repeat; ++ } ++ ++ if (0) { ++ stop_same_parent: ++ ++ /* If we are scanning left and we stop in the middle of an allocated ++ extent, we know the preceder immediately.. 
*/ ++ /* middle of extent is (scan_index - unit_index) != 0. */ ++ if (scanning_left(scan) && (scan_index - unit_index) != 0) { ++ /* FIXME(B): Someone should step-through and verify that this preceder ++ calculation is indeed correct. */ ++ /* @unit_start is starting block (number) of extent ++ unit. Flush stopped at the @scan_index block from ++ the beginning of the file, which is (scan_index - ++ unit_index) block within extent. ++ */ ++ if (unit_start) { ++ /* skip preceder update when we are at hole */ ++ scan->preceder_blk = ++ unit_start + scan_index - unit_index; ++ check_preceder(scan->preceder_blk); ++ } ++ } ++ ++ /* In this case, we leave coord set to the parent of scan->node. */ ++ scan->stop = 1; ++ ++ } else { ++ /* In this case, we are still scanning, coord is set to the next item which is ++ either off-the-end of the node or not an extent. */ ++ assert("jmacd-8912", scan->stop == 0); ++ assert("jmacd-7812", ++ (coord_is_after_sideof_unit(&coord, scan->direction) ++ || !item_is_extent(&coord))); ++ } ++ ++ ret = 0; ++ exit: ++ return ret; ++} ++ ++/* ask block allocator for some blocks */ ++static void extent_allocate_blocks(reiser4_blocknr_hint *preceder, ++ reiser4_block_nr wanted_count, ++ reiser4_block_nr *first_allocated, ++ reiser4_block_nr *allocated, ++ block_stage_t block_stage) ++{ ++ *allocated = wanted_count; ++ preceder->max_dist = 0; /* scan whole disk, if needed */ ++ ++ /* that number of blocks (wanted_count) is either in UNALLOCATED or in GRABBED */ ++ preceder->block_stage = block_stage; ++ ++ /* FIXME: we do not handle errors here now */ ++ check_me("vs-420", ++ reiser4_alloc_blocks(preceder, first_allocated, allocated, ++ BA_PERMANENT) == 0); ++ /* update flush_pos's preceder to last allocated block number */ ++ preceder->blk = *first_allocated + *allocated - 1; ++} ++ ++/* when on flush time unallocated extent is to be replaced with allocated one it may happen that one unallocated extent ++ will have to be replaced with set of allocated extents. In this case insert_into_item will be called which may have ++ to add new nodes into tree. Space for that is taken from inviolable reserve (5%). */ ++static reiser4_block_nr reserve_replace(void) ++{ ++ reiser4_block_nr grabbed, needed; ++ ++ grabbed = get_current_context()->grabbed_blocks; ++ needed = estimate_one_insert_into_item(current_tree); ++ check_me("vpf-340", !reiser4_grab_space_force(needed, BA_RESERVED)); ++ return grabbed; ++} ++ ++static void free_replace_reserved(reiser4_block_nr grabbed) ++{ ++ reiser4_context *ctx; ++ ++ ctx = get_current_context(); ++ grabbed2free(ctx, get_super_private(ctx->super), ++ ctx->grabbed_blocks - grabbed); ++} ++ ++/* Block offset of first block addressed by unit */ ++__u64 extent_unit_index(const coord_t * item) ++{ ++ reiser4_key key; ++ ++ assert("vs-648", coord_is_existing_unit(item)); ++ unit_key_by_coord(item, &key); ++ return get_key_offset(&key) >> current_blocksize_bits; ++} ++ ++/* AUDIT shouldn't return value be of reiser4_block_nr type? ++ Josh's answer: who knows? Is a "number of blocks" the same type as "block offset"? 
*/ ++__u64 extent_unit_width(const coord_t * item) ++{ ++ assert("vs-649", coord_is_existing_unit(item)); ++ return width_by_coord(item); ++} ++ ++/* Starting block location of this unit */ ++static reiser4_block_nr extent_unit_start(const coord_t * item) ++{ ++ return extent_get_start(extent_by_coord(item)); ++} ++ ++/** ++ * split_allocated_extent - ++ * @coord: ++ * @pos_in_unit: ++ * ++ * replace allocated extent with two allocated extents ++ */ ++static int split_allocated_extent(coord_t *coord, reiser4_block_nr pos_in_unit) ++{ ++ int result; ++ struct replace_handle *h; ++ reiser4_extent *ext; ++ reiser4_block_nr grabbed; ++ ++ ext = extent_by_coord(coord); ++ assert("vs-1410", state_of_extent(ext) == ALLOCATED_EXTENT); ++ assert("vs-1411", extent_get_width(ext) > pos_in_unit); ++ ++ h = kmalloc(sizeof(*h), get_gfp_mask()); ++ if (h == NULL) ++ return RETERR(-ENOMEM); ++ h->coord = coord; ++ h->lh = znode_lh(coord->node); ++ h->pkey = &h->key; ++ unit_key_by_coord(coord, h->pkey); ++ set_key_offset(h->pkey, ++ (get_key_offset(h->pkey) + ++ pos_in_unit * current_blocksize)); ++ set_extent(&h->overwrite, extent_get_start(ext), pos_in_unit); ++ set_extent(&h->new_extents[0], extent_get_start(ext) + pos_in_unit, ++ extent_get_width(ext) - pos_in_unit); ++ h->nr_new_extents = 1; ++ h->flags = COPI_DONT_SHIFT_LEFT; ++ h->paste_key = h->key; ++ ++ /* reserve space for extent unit paste, @grabbed is reserved before */ ++ grabbed = reserve_replace(); ++ result = replace_extent(h, 0 /* leave @coord set to overwritten ++ extent */); ++ /* restore reserved */ ++ free_replace_reserved(grabbed); ++ kfree(h); ++ return result; ++} ++ ++/* replace extent @ext by extent @replace. Try to merge @replace with previous extent of the item (if there is ++ one). Return 1 if it succeeded, 0 - otherwise */ ++static int try_to_merge_with_left(coord_t *coord, reiser4_extent *ext, ++ reiser4_extent *replace) ++{ ++ assert("vs-1415", extent_by_coord(coord) == ext); ++ ++ if (coord->unit_pos == 0 ++ || state_of_extent(ext - 1) != ALLOCATED_EXTENT) ++ /* @ext either does not exist or is not allocated extent */ ++ return 0; ++ if (extent_get_start(ext - 1) + extent_get_width(ext - 1) != ++ extent_get_start(replace)) ++ return 0; ++ ++ /* we can glue, widen previous unit */ ++ extent_set_width(ext - 1, ++ extent_get_width(ext - 1) + extent_get_width(replace)); ++ ++ if (extent_get_width(ext) != extent_get_width(replace)) { ++ /* make current extent narrower */ ++ if (state_of_extent(ext) == ALLOCATED_EXTENT) ++ extent_set_start(ext, ++ extent_get_start(ext) + ++ extent_get_width(replace)); ++ extent_set_width(ext, ++ extent_get_width(ext) - ++ extent_get_width(replace)); ++ } else { ++ /* current extent completely glued with its left neighbor, remove it */ ++ coord_t from, to; ++ ++ coord_dup(&from, coord); ++ from.unit_pos = nr_units_extent(coord) - 1; ++ coord_dup(&to, &from); ++ ++ /* currently cut from extent can cut either from the beginning or from the end. 
Move place which got ++ freed after unit removal to end of item */ ++ memmove(ext, ext + 1, ++ (from.unit_pos - ++ coord->unit_pos) * sizeof(reiser4_extent)); ++ /* wipe part of item which is going to be cut, so that node_check will not be confused */ ++ cut_node_content(&from, &to, NULL, NULL, NULL); ++ } ++ znode_make_dirty(coord->node); ++ /* move coord back */ ++ coord->unit_pos--; ++ return 1; ++} ++ ++/** ++ * conv_extent - replace extent with 2 ones ++ * @coord: coordinate of extent to be replaced ++ * @replace: extent to overwrite the one @coord is set to ++ * ++ * Overwrites extent @coord is set to and paste one extent unit after ++ * overwritten one if @replace is shorter than initial extent ++ */ ++static int conv_extent(coord_t *coord, reiser4_extent *replace) ++{ ++ int result; ++ struct replace_handle *h; ++ reiser4_extent *ext; ++ reiser4_block_nr start, width, new_width; ++ reiser4_block_nr grabbed; ++ extent_state state; ++ ++ ext = extent_by_coord(coord); ++ state = state_of_extent(ext); ++ start = extent_get_start(ext); ++ width = extent_get_width(ext); ++ new_width = extent_get_width(replace); ++ ++ assert("vs-1458", (state == UNALLOCATED_EXTENT || ++ state == ALLOCATED_EXTENT)); ++ assert("vs-1459", width >= new_width); ++ ++ if (try_to_merge_with_left(coord, ext, replace)) { ++ /* merged @replace with left neighbor. Current unit is either ++ removed or narrowed */ ++ return 0; ++ } ++ ++ if (width == new_width) { ++ /* replace current extent with @replace */ ++ *ext = *replace; ++ znode_make_dirty(coord->node); ++ return 0; ++ } ++ ++ h = kmalloc(sizeof(*h), get_gfp_mask()); ++ if (h == NULL) ++ return RETERR(-ENOMEM); ++ h->coord = coord; ++ h->lh = znode_lh(coord->node); ++ h->pkey = &h->key; ++ unit_key_by_coord(coord, h->pkey); ++ set_key_offset(h->pkey, ++ (get_key_offset(h->pkey) + new_width * current_blocksize)); ++ h->overwrite = *replace; ++ ++ /* replace @ext with @replace and padding extent */ ++ set_extent(&h->new_extents[0], ++ (state == ALLOCATED_EXTENT) ? (start + new_width) : UNALLOCATED_EXTENT_START, ++ width - new_width); ++ h->nr_new_extents = 1; ++ h->flags = COPI_DONT_SHIFT_LEFT; ++ h->paste_key = h->key; ++ ++ /* reserve space for extent unit paste, @grabbed is reserved before */ ++ grabbed = reserve_replace(); ++ result = replace_extent(h, 0 /* leave @coord set to overwritten ++ extent */); ++ ++ /* restore reserved */ ++ free_replace_reserved(grabbed); ++ kfree(h); ++ return result; ++} ++ ++/** ++ * assign_real_blocknrs ++ * @flush_pos: ++ * @oid: objectid of file jnodes to assign block number to belongs to ++ * @index: first jnode on the range ++ * @count: number of jnodes to assign block numbers to ++ * @first: start of allocated block range ++ * ++ * Assigns block numbers to each of @count jnodes. Index of first jnode is ++ * @index. Jnodes get lookuped with jlookup. 
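++ * ++ * (Editor's note on the loop below: each jnode is looked up, its ++ * JNODE_FLUSH_RESERVED bit is cleared, it is given the next block number ++ * from the range starting at @first, and it is moved to the flush queue's ++ * relocate set via unformatted_make_reloc(), all while the atom stays ++ * locked.)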
++ */ ++static void assign_real_blocknrs(flush_pos_t *flush_pos, oid_t oid, ++ unsigned long index, reiser4_block_nr count, ++ reiser4_block_nr first) ++{ ++ unsigned long i; ++ reiser4_tree *tree; ++ txn_atom *atom; ++ int nr; ++ ++ atom = atom_locked_by_fq(flush_pos->fq); ++ assert("vs-1468", atom); ++ BUG_ON(atom == NULL); ++ ++ nr = 0; ++ tree = current_tree; ++ for (i = 0; i < count; ++i, ++index) { ++ jnode *node; ++ ++ node = jlookup(tree, oid, index); ++ assert("", node != NULL); ++ BUG_ON(node == NULL); ++ ++ spin_lock_jnode(node); ++ assert("", !jnode_is_flushprepped(node)); ++ assert("vs-1475", node->atom == atom); ++ assert("vs-1476", atomic_read(&node->x_count) > 0); ++ ++ JF_CLR(node, JNODE_FLUSH_RESERVED); ++ jnode_set_block(node, &first); ++ unformatted_make_reloc(node, flush_pos->fq); ++ ON_DEBUG(count_jnode(node->atom, node, NODE_LIST(node), ++ FQ_LIST, 0)); ++ spin_unlock_jnode(node); ++ first++; ++ ++ atomic_dec(&node->x_count); ++ nr ++; ++ } ++ ++ spin_unlock_atom(atom); ++ return; ++} ++ ++/** ++ * make_node_ovrwr - assign node to overwrite set ++ * @jnodes: overwrite set list head ++ * @node: jnode to belong to overwrite set ++ * ++ * Sets OVRWR jnode state bit and puts @node to the end of list head @jnodes ++ * which is an accumulator for nodes before they get to overwrite set list of ++ * atom. ++ */ ++static void make_node_ovrwr(struct list_head *jnodes, jnode *node) ++{ ++ spin_lock_jnode(node); ++ ++ assert("zam-917", !JF_ISSET(node, JNODE_RELOC)); ++ assert("zam-918", !JF_ISSET(node, JNODE_OVRWR)); ++ ++ JF_SET(node, JNODE_OVRWR); ++ list_move_tail(&node->capture_link, jnodes); ++ ON_DEBUG(count_jnode(node->atom, node, DIRTY_LIST, OVRWR_LIST, 0)); ++ ++ spin_unlock_jnode(node); ++} ++ ++/** ++ * mark_jnodes_overwrite - put bunch of jnodes to overwrite set ++ * @flush_pos: flush position ++ * @oid: objectid of file jnodes belong to ++ * @index: starting index ++ * @width: extent width ++ * ++ * Puts nodes of one extent (file objectid @oid, extent width @width) to atom's ++ * overwrite set. Starting from the one with index @index. If end of slum is ++ * detected (node is not found or flushprepped) - stop iterating and set flush ++ * position's state to POS_INVALID. 
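++ * ++ * (Editor's note: nodes are first collected on a local list by ++ * make_node_ovrwr() and then spliced onto the atom's overwrite list in one ++ * go, under the atom spinlock.)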
++ */ ++static void mark_jnodes_overwrite(flush_pos_t *flush_pos, oid_t oid, ++ unsigned long index, reiser4_block_nr width) ++{ ++ unsigned long i; ++ reiser4_tree *tree; ++ jnode *node; ++ txn_atom *atom; ++ LIST_HEAD(jnodes); ++ ++ tree = current_tree; ++ ++ atom = atom_locked_by_fq(pos_fq(flush_pos)); ++ assert("vs-1478", atom); ++ ++ for (i = flush_pos->pos_in_unit; i < width; i++, index++) { ++ node = jlookup(tree, oid, index); ++ if (!node) { ++ flush_pos->state = POS_INVALID; ++ break; ++ } ++ if (jnode_check_flushprepped(node)) { ++ flush_pos->state = POS_INVALID; ++ atomic_dec(&node->x_count); ++ break; ++ } ++ if (node->atom != atom) { ++ flush_pos->state = POS_INVALID; ++ atomic_dec(&node->x_count); ++ break; ++ } ++ make_node_ovrwr(&jnodes, node); ++ atomic_dec(&node->x_count); ++ } ++ ++ list_splice_init(&jnodes, ATOM_OVRWR_LIST(atom)->prev); ++ spin_unlock_atom(atom); ++} ++ ++/** ++ * allocated_extent_slum_size ++ * @flush_pos: ++ * @oid: ++ * @index: ++ * @count: ++ * ++ * ++ */ ++static int allocated_extent_slum_size(flush_pos_t *flush_pos, oid_t oid, ++ unsigned long index, unsigned long count) ++{ ++ unsigned long i; ++ reiser4_tree *tree; ++ txn_atom *atom; ++ int nr; ++ ++ atom = atom_locked_by_fq(pos_fq(flush_pos)); ++ assert("vs-1468", atom); ++ ++ nr = 0; ++ tree = current_tree; ++ for (i = 0; i < count; ++i, ++index) { ++ jnode *node; ++ ++ node = jlookup(tree, oid, index); ++ if (!node) ++ break; ++ ++ if (jnode_check_flushprepped(node)) { ++ atomic_dec(&node->x_count); ++ break; ++ } ++ ++ if (node->atom != atom) { ++ /* ++ * this is possible on overwrite: extent_write may ++ * capture several unformatted nodes without capturing ++ * any formatted nodes. ++ */ ++ atomic_dec(&node->x_count); ++ break; ++ } ++ ++ assert("vs-1476", atomic_read(&node->x_count) > 1); ++ atomic_dec(&node->x_count); ++ nr ++; ++ } ++ ++ spin_unlock_atom(atom); ++ return nr; ++} ++ ++/** ++ * alloc_extent ++ * @flush_pos: ++ * ++ * ++ * this is called by handle_pos_on_twig to proceed extent unit flush_pos->coord ++ * is set to. It is to prepare for flushing sequence of not flushprepped nodes ++ * (slum). It supposes that slum starts at flush_pos->pos_in_unit position ++ * within the extent. 
++
++/**
++ * alloc_extent
++ * @flush_pos: flush position
++ *
++ * This is called by handle_pos_on_twig to process the extent unit that
++ * flush_pos->coord is set to. It prepares a sequence of not-flushprepped
++ * nodes (a "slum") for flushing, assuming that the slum starts at position
++ * flush_pos->pos_in_unit within the extent. The slum goes to the relocate
++ * set if flush_pos->leaf_relocate is set to 1, and to the overwrite set
++ * otherwise.
++ */
++int alloc_extent(flush_pos_t *flush_pos)
++{
++	coord_t *coord;
++	reiser4_extent *ext;
++	reiser4_extent replace_ext;
++	oid_t oid;
++	reiser4_block_nr protected;
++	reiser4_block_nr start;
++	__u64 index;
++	__u64 width;
++	extent_state state;
++	int result;
++	reiser4_block_nr first_allocated;
++	__u64 allocated;
++	reiser4_key key;
++	block_stage_t block_stage;
++
++	assert("vs-1468", flush_pos->state == POS_ON_EPOINT);
++	assert("vs-1469", coord_is_existing_unit(&flush_pos->coord)
++	       && item_is_extent(&flush_pos->coord));
++
++	coord = &flush_pos->coord;
++
++	ext = extent_by_coord(coord);
++	state = state_of_extent(ext);
++	if (state == HOLE_EXTENT) {
++		flush_pos->state = POS_INVALID;
++		return 0;
++	}
++
++	item_key_by_coord(coord, &key);
++	oid = get_key_objectid(&key);
++	index = extent_unit_index(coord) + flush_pos->pos_in_unit;
++	start = extent_get_start(ext);
++	width = extent_get_width(ext);
++
++	assert("vs-1457", width > flush_pos->pos_in_unit);
++
++	if (flush_pos->leaf_relocate || state == UNALLOCATED_EXTENT) {
++		/* relocate */
++		if (flush_pos->pos_in_unit) {
++			/* split extent unit into two */
++			result = split_allocated_extent(coord,
++							flush_pos->pos_in_unit);
++			flush_pos->pos_in_unit = 0;
++			return result;
++		}
++
++		/* limit number of nodes to allocate */
++		if (flush_pos->nr_to_write < width)
++			width = flush_pos->nr_to_write;
++
++		if (state == ALLOCATED_EXTENT) {
++			/*
++			 * all protected nodes are not flushprepped, therefore
++			 * they are counted as flush_reserved
++			 */
++			block_stage = BLOCK_FLUSH_RESERVED;
++			protected = allocated_extent_slum_size(flush_pos, oid,
++							       index, width);
++			if (protected == 0) {
++				flush_pos->state = POS_INVALID;
++				flush_pos->pos_in_unit = 0;
++				return 0;
++			}
++		} else {
++			block_stage = BLOCK_UNALLOCATED;
++			protected = width;
++		}
++
++		/*
++		 * look at previous unit if possible. If it is allocated, make
++		 * preceder more precise
++		 */
++		if (coord->unit_pos &&
++		    (state_of_extent(ext - 1) == ALLOCATED_EXTENT))
++			pos_hint(flush_pos)->blk = extent_get_start(ext - 1) +
++				extent_get_width(ext - 1);
++
++		/* allocate new block numbers for protected nodes */
++		extent_allocate_blocks(pos_hint(flush_pos), protected,
++				       &first_allocated, &allocated,
++				       block_stage);
++
++		if (state == ALLOCATED_EXTENT)
++			/*
++			 * on relocating - free nodes which are going to be
++			 * relocated
++			 */
++			reiser4_dealloc_blocks(&start, &allocated,
++					       BLOCK_ALLOCATED, BA_DEFER);
++
++		/* assign new block numbers to protected nodes */
++		assign_real_blocknrs(flush_pos, oid, index, allocated,
++				     first_allocated);
++
++		/* prepare extent which will replace current one */
++		set_extent(&replace_ext, first_allocated, allocated);
++
++		/* adjust extent item */
++		result = conv_extent(coord, &replace_ext);
++		if (result != 0 && result != -ENOMEM) {
++			warning("vs-1461",
++				"Failed to allocate extent.
Should not happen\n"); ++ return result; ++ } ++ ++ /* ++ * break flush: we prepared for flushing as many blocks as we ++ * were asked for ++ */ ++ if (flush_pos->nr_to_write == allocated) ++ flush_pos->state = POS_INVALID; ++ } else { ++ /* overwrite */ ++ mark_jnodes_overwrite(flush_pos, oid, index, width); ++ } ++ flush_pos->pos_in_unit = 0; ++ return 0; ++} ++ ++/* if @key is glueable to the item @coord is set to */ ++static int must_insert(const coord_t *coord, const reiser4_key *key) ++{ ++ reiser4_key last; ++ ++ if (item_id_by_coord(coord) == EXTENT_POINTER_ID ++ && keyeq(append_key_extent(coord, &last), key)) ++ return 0; ++ return 1; ++} ++ ++/* copy extent @copy to the end of @node. It may have to either insert new item after the last one, or append last item, ++ or modify last unit of last item to have greater width */ ++static int put_unit_to_end(znode *node, const reiser4_key *key, ++ reiser4_extent *copy_ext) ++{ ++ int result; ++ coord_t coord; ++ cop_insert_flag flags; ++ reiser4_extent *last_ext; ++ reiser4_item_data data; ++ ++ /* set coord after last unit in an item */ ++ coord_init_last_unit(&coord, node); ++ coord.between = AFTER_UNIT; ++ ++ flags = ++ COPI_DONT_SHIFT_LEFT | COPI_DONT_SHIFT_RIGHT | COPI_DONT_ALLOCATE; ++ if (must_insert(&coord, key)) { ++ result = ++ insert_by_coord(&coord, init_new_extent(&data, copy_ext, 1), ++ key, NULL /*lh */ , flags); ++ ++ } else { ++ /* try to glue with last unit */ ++ last_ext = extent_by_coord(&coord); ++ if (state_of_extent(last_ext) && ++ extent_get_start(last_ext) + extent_get_width(last_ext) == ++ extent_get_start(copy_ext)) { ++ /* widen last unit of node */ ++ extent_set_width(last_ext, ++ extent_get_width(last_ext) + ++ extent_get_width(copy_ext)); ++ znode_make_dirty(node); ++ return 0; ++ } ++ ++ /* FIXME: put an assertion here that we can not merge last unit in @node and new unit */ ++ result = ++ insert_into_item(&coord, NULL /*lh */ , key, ++ init_new_extent(&data, copy_ext, 1), ++ flags); ++ } ++ ++ assert("vs-438", result == 0 || result == -E_NODE_FULL); ++ return result; ++} ++ ++/* @coord is set to extent unit */ ++squeeze_result squalloc_extent(znode *left, const coord_t *coord, ++ flush_pos_t *flush_pos, ++ reiser4_key *stop_key) ++{ ++ reiser4_extent *ext; ++ __u64 index; ++ __u64 width; ++ reiser4_block_nr start; ++ extent_state state; ++ oid_t oid; ++ reiser4_block_nr first_allocated; ++ __u64 allocated; ++ __u64 protected; ++ reiser4_extent copy_extent; ++ reiser4_key key; ++ int result; ++ block_stage_t block_stage; ++ ++ assert("vs-1457", flush_pos->pos_in_unit == 0); ++ assert("vs-1467", coord_is_leftmost_unit(coord)); ++ assert("vs-1467", item_is_extent(coord)); ++ ++ ext = extent_by_coord(coord); ++ index = extent_unit_index(coord); ++ start = extent_get_start(ext); ++ width = extent_get_width(ext); ++ state = state_of_extent(ext); ++ unit_key_by_coord(coord, &key); ++ oid = get_key_objectid(&key); ++ ++ if ((flush_pos->leaf_relocate && state == ALLOCATED_EXTENT) || ++ (state == UNALLOCATED_EXTENT)) { ++ /* relocate */ ++ if (state == ALLOCATED_EXTENT) { ++ /* all protected nodes are not flushprepped, therefore ++ * they are counted as flush_reserved */ ++ block_stage = BLOCK_FLUSH_RESERVED; ++ protected = allocated_extent_slum_size(flush_pos, oid, ++ index, width); ++ if (protected == 0) { ++ flush_pos->state = POS_INVALID; ++ flush_pos->pos_in_unit = 0; ++ return 0; ++ } ++ } else { ++ block_stage = BLOCK_UNALLOCATED; ++ protected = width; ++ } ++ ++ /* ++ * look at previous unit if possible. 
If it is allocated, make
++		 * preceder more precise
++		 */
++		if (coord->unit_pos &&
++		    (state_of_extent(ext - 1) == ALLOCATED_EXTENT))
++			pos_hint(flush_pos)->blk = extent_get_start(ext - 1) +
++				extent_get_width(ext - 1);
++
++		/* allocate new block numbers for protected nodes */
++		extent_allocate_blocks(pos_hint(flush_pos), protected,
++				       &first_allocated, &allocated,
++				       block_stage);
++
++		/* prepare extent which will be copied to left */
++		set_extent(&copy_extent, first_allocated, allocated);
++
++		result = put_unit_to_end(left, &key, &copy_extent);
++		if (result == -E_NODE_FULL) {
++			int target_block_stage;
++
++			/* free blocks which were just allocated */
++			target_block_stage = (state == ALLOCATED_EXTENT) ?
++				BLOCK_FLUSH_RESERVED : BLOCK_UNALLOCATED;
++			reiser4_dealloc_blocks(&first_allocated, &allocated,
++					       target_block_stage,
++					       BA_PERMANENT);
++
++			/* rewind the preceder. */
++			flush_pos->preceder.blk = first_allocated;
++			check_preceder(flush_pos->preceder.blk);
++
++			return SQUEEZE_TARGET_FULL;
++		}
++
++		if (state == ALLOCATED_EXTENT) {
++			/* free nodes which were relocated */
++			reiser4_dealloc_blocks(&start, &allocated,
++					       BLOCK_ALLOCATED, BA_DEFER);
++		}
++
++		/* assign new block numbers to protected nodes */
++		assign_real_blocknrs(flush_pos, oid, index, allocated,
++				     first_allocated);
++
++		set_key_offset(&key,
++			       get_key_offset(&key) +
++			       (allocated << current_blocksize_bits));
++	} else {
++		/*
++		 * overwrite: try to copy unit as it is to left neighbor and
++		 * make all first not flushprepped nodes overwrite nodes
++		 */
++		set_extent(&copy_extent, start, width);
++		result = put_unit_to_end(left, &key, &copy_extent);
++		if (result == -E_NODE_FULL)
++			return SQUEEZE_TARGET_FULL;
++
++		if (state != HOLE_EXTENT)
++			mark_jnodes_overwrite(flush_pos, oid, index, width);
++		set_key_offset(&key,
++			       get_key_offset(&key) +
++			       (width << current_blocksize_bits));
++	}
++	*stop_key = key;
++	return SQUEEZE_CONTINUE;
++}
++
++int key_by_offset_extent(struct inode *inode, loff_t off, reiser4_key * key)
++{
++	return key_by_inode_and_offset_common(inode, off, key);
++}
++
++/*
++ * Local variables:
++ * c-indentation-style: "K&R"
++ * mode-name: "LC"
++ * c-basic-offset: 8
++ * tab-width: 8
++ * fill-column: 79
++ * scroll-step: 1
++ * End:
++ */
+Index: linux-2.6.16/fs/reiser4/plugin/item/extent_item_ops.c
+===================================================================
+--- /dev/null
++++ linux-2.6.16/fs/reiser4/plugin/item/extent_item_ops.c
+@@ -0,0 +1,882 @@
++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
++
++#include "item.h"
++#include "../../inode.h"
++#include "../../tree_walk.h"	/* check_sibling_list() */
++#include "../../page_cache.h"
++#include "../../carry.h"
++
++#include <linux/quotaops.h>	/* DQUOT_FREE_BLOCK_NODIRTY() */
++
++/* item_plugin->b.max_key_inside */
++reiser4_key *max_key_inside_extent(const coord_t * coord, reiser4_key * key)
++{
++	item_key_by_coord(coord, key);
++	set_key_offset(key, get_key_offset(max_key()));
++	return key;
++}
++
++/* item_plugin->b.can_contain_key
++   this checks whether @key of @data matches the position set by @coord */
++int
++can_contain_key_extent(const coord_t * coord, const reiser4_key * key,
++		       const reiser4_item_data * data)
++{
++	reiser4_key item_key;
++
++	if (item_plugin_by_coord(coord) != data->iplug)
++		return 0;
++
++	item_key_by_coord(coord, &item_key);
++	if (get_key_locality(key) != get_key_locality(&item_key) ||
++	    get_key_objectid(key) != get_key_objectid(&item_key) ||
++	    get_key_ordering(key) != get_key_ordering(&item_key))
++		return 0;
++
++	return 1;
++}
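++
++/*
++ * Illustrative sketch (editor's addition, not part of the original
++ * patch): an extent item is a flat array of reiser4_extent units, each a
++ * (start, width) pair of block numbers. The hypothetical helper below
++ * walks that array with the accessors used throughout this file:
++ *
++ *	static void dump_extent_item(const coord_t *coord)
++ *	{
++ *		reiser4_extent *ext = extent_item(coord);
++ *		pos_in_node_t i;
++ *
++ *		for (i = 0; i < nr_units_extent(coord); i++, ext++)
++ *			printk("unit %u: start %llu, width %llu, state %d\n",
++ *			       i,
++ *			       (unsigned long long)extent_get_start(ext),
++ *			       (unsigned long long)extent_get_width(ext),
++ *			       state_of_extent(ext));
++ *	}
++ */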
++
++/* item_plugin->b.mergeable
++   first item is of extent type */
++/* Audited by: green(2002.06.13) */
++int mergeable_extent(const coord_t * p1, const coord_t * p2)
++{
++	reiser4_key key1, key2;
++
++	assert("vs-299", item_id_by_coord(p1) == EXTENT_POINTER_ID);
++	/* FIXME-VS: Which is it? Assert or return 0 */
++	if (item_id_by_coord(p2) != EXTENT_POINTER_ID) {
++		return 0;
++	}
++
++	item_key_by_coord(p1, &key1);
++	item_key_by_coord(p2, &key2);
++	if (get_key_locality(&key1) != get_key_locality(&key2) ||
++	    get_key_objectid(&key1) != get_key_objectid(&key2) ||
++	    get_key_ordering(&key1) != get_key_ordering(&key2) ||
++	    get_key_type(&key1) != get_key_type(&key2))
++		return 0;
++	if (get_key_offset(&key1) + extent_size(p1, nr_units_extent(p1)) !=
++	    get_key_offset(&key2))
++		return 0;
++	return 1;
++}
++
++/* item_plugin->b.nr_units */
++pos_in_node_t nr_units_extent(const coord_t * coord)
++{
++	/* length of extent item has to be multiple of extent size */
++	assert("vs-1424",
++	       (item_length_by_coord(coord) % sizeof(reiser4_extent)) == 0);
++	return item_length_by_coord(coord) / sizeof(reiser4_extent);
++}
++
++/* item_plugin->b.lookup */
++lookup_result
++lookup_extent(const reiser4_key * key, lookup_bias bias UNUSED_ARG,
++	      coord_t * coord)
++{				/* znode and item_pos are
++				   set to an extent item to
++				   look through */
++	reiser4_key item_key;
++	reiser4_block_nr lookuped, offset;
++	unsigned i, nr_units;
++	reiser4_extent *ext;
++	unsigned blocksize;
++	unsigned char blocksize_bits;
++
++	item_key_by_coord(coord, &item_key);
++	offset = get_key_offset(&item_key);
++
++	/* key we are looking for must be greater than key of item @coord */
++	assert("vs-414", keygt(key, &item_key));
++
++	assert("umka-99945",
++	       !keygt(key, max_key_inside_extent(coord, &item_key)));
++
++	ext = extent_item(coord);
++	assert("vs-1350", (char *)ext == (zdata(coord->node) + coord->offset));
++
++	blocksize = current_blocksize;
++	blocksize_bits = current_blocksize_bits;
++
++	/* offset we are looking for */
++	lookuped = get_key_offset(key);
++
++	nr_units = nr_units_extent(coord);
++	/* go through all extents until the one which addresses the given
++	   offset */
++	for (i = 0; i < nr_units; i++, ext++) {
++		offset += (extent_get_width(ext) << blocksize_bits);
++		if (offset > lookuped) {
++			/* desired byte is somewhere in this extent */
++			coord->unit_pos = i;
++			coord->between = AT_UNIT;
++			return CBK_COORD_FOUND;
++		}
++	}
++
++	/* set coord after last unit */
++	coord->unit_pos = nr_units - 1;
++	coord->between = AFTER_UNIT;
++	return CBK_COORD_FOUND;
++}
++
++/* item_plugin->b.paste
++   the item @coord is set to has been appended with @data->length of free
++   space. data->data contains the data to be pasted into the item at
++   position @coord->in_item.unit_pos; it must fit into that free space.
++   @coord must be set between units.
++*/ ++int ++paste_extent(coord_t * coord, reiser4_item_data * data, ++ carry_plugin_info * info UNUSED_ARG) ++{ ++ unsigned old_nr_units; ++ reiser4_extent *ext; ++ int item_length; ++ ++ ext = extent_item(coord); ++ item_length = item_length_by_coord(coord); ++ old_nr_units = (item_length - data->length) / sizeof(reiser4_extent); ++ ++ /* this is also used to copy extent into newly created item, so ++ old_nr_units could be 0 */ ++ assert("vs-260", item_length >= data->length); ++ ++ /* make sure that coord is set properly */ ++ assert("vs-35", ++ ((!coord_is_existing_unit(coord)) ++ || (!old_nr_units && !coord->unit_pos))); ++ ++ /* first unit to be moved */ ++ switch (coord->between) { ++ case AFTER_UNIT: ++ coord->unit_pos++; ++ case BEFORE_UNIT: ++ coord->between = AT_UNIT; ++ break; ++ case AT_UNIT: ++ assert("vs-331", !old_nr_units && !coord->unit_pos); ++ break; ++ default: ++ impossible("vs-330", "coord is set improperly"); ++ } ++ ++ /* prepare space for new units */ ++ memmove(ext + coord->unit_pos + data->length / sizeof(reiser4_extent), ++ ext + coord->unit_pos, ++ (old_nr_units - coord->unit_pos) * sizeof(reiser4_extent)); ++ ++ /* copy new data from kernel space */ ++ assert("vs-556", data->user == 0); ++ memcpy(ext + coord->unit_pos, data->data, (unsigned)data->length); ++ ++ /* after paste @coord is set to first of pasted units */ ++ assert("vs-332", coord_is_existing_unit(coord)); ++ assert("vs-333", ++ !memcmp(data->data, extent_by_coord(coord), ++ (unsigned)data->length)); ++ return 0; ++} ++ ++/* item_plugin->b.can_shift */ ++int ++can_shift_extent(unsigned free_space, coord_t * source, ++ znode * target UNUSED_ARG, shift_direction pend UNUSED_ARG, ++ unsigned *size, unsigned want) ++{ ++ *size = item_length_by_coord(source); ++ if (*size > free_space) ++ /* never split a unit of extent item */ ++ *size = free_space - free_space % sizeof(reiser4_extent); ++ ++ /* we can shift *size bytes, calculate how many do we want to shift */ ++ if (*size > want * sizeof(reiser4_extent)) ++ *size = want * sizeof(reiser4_extent); ++ ++ if (*size % sizeof(reiser4_extent) != 0) ++ impossible("vs-119", "Wrong extent size: %i %zd", *size, ++ sizeof(reiser4_extent)); ++ return *size / sizeof(reiser4_extent); ++ ++} ++ ++/* item_plugin->b.copy_units */ ++void ++copy_units_extent(coord_t * target, coord_t * source, ++ unsigned from, unsigned count, ++ shift_direction where_is_free_space, unsigned free_space) ++{ ++ char *from_ext, *to_ext; ++ ++ assert("vs-217", free_space == count * sizeof(reiser4_extent)); ++ ++ from_ext = item_body_by_coord(source); ++ to_ext = item_body_by_coord(target); ++ ++ if (where_is_free_space == SHIFT_LEFT) { ++ assert("vs-215", from == 0); ++ ++ /* At this moment, item length was already updated in the item ++ header by shifting code, hence nr_units_extent() will ++ return "new" number of units---one we obtain after copying ++ units. 
++ */ ++ to_ext += ++ (nr_units_extent(target) - count) * sizeof(reiser4_extent); ++ } else { ++ reiser4_key key; ++ coord_t coord; ++ ++ assert("vs-216", ++ from + count == coord_last_unit_pos(source) + 1); ++ ++ from_ext += item_length_by_coord(source) - free_space; ++ ++ /* new units are inserted before first unit in an item, ++ therefore, we have to update item key */ ++ coord = *source; ++ coord.unit_pos = from; ++ unit_key_extent(&coord, &key); ++ ++ node_plugin_by_node(target->node)->update_item_key(target, &key, ++ NULL /*info */); ++ } ++ ++ memcpy(to_ext, from_ext, free_space); ++} ++ ++/* item_plugin->b.create_hook ++ @arg is znode of leaf node for which we need to update right delimiting key */ ++int create_hook_extent(const coord_t * coord, void *arg) ++{ ++ coord_t *child_coord; ++ znode *node; ++ reiser4_key key; ++ reiser4_tree *tree; ++ ++ if (!arg) ++ return 0; ++ ++ child_coord = arg; ++ tree = znode_get_tree(coord->node); ++ ++ assert("nikita-3246", znode_get_level(child_coord->node) == LEAF_LEVEL); ++ ++ write_lock_tree(tree); ++ write_lock_dk(tree); ++ /* find a node on the left level for which right delimiting key has to ++ be updated */ ++ if (coord_wrt(child_coord) == COORD_ON_THE_LEFT) { ++ assert("vs-411", znode_is_left_connected(child_coord->node)); ++ node = child_coord->node->left; ++ } else { ++ assert("vs-412", coord_wrt(child_coord) == COORD_ON_THE_RIGHT); ++ node = child_coord->node; ++ assert("nikita-3314", node != NULL); ++ } ++ ++ if (node != NULL) { ++ znode_set_rd_key(node, item_key_by_coord(coord, &key)); ++ ++ assert("nikita-3282", check_sibling_list(node)); ++ /* break sibling links */ ++ if (ZF_ISSET(node, JNODE_RIGHT_CONNECTED) && node->right) { ++ ON_DEBUG(node->right->left_version = ++ atomic_inc_return(&delim_key_version); ++ node->right_version = ++ atomic_inc_return(&delim_key_version);); ++ ++ node->right->left = NULL; ++ node->right = NULL; ++ } ++ } ++ write_unlock_dk(tree); ++ write_unlock_tree(tree); ++ return 0; ++} ++ ++#define ITEM_TAIL_KILLED 0 ++#define ITEM_HEAD_KILLED 1 ++#define ITEM_KILLED 2 ++ ++/* item_plugin->b.kill_hook ++ this is called when @count units starting from @from-th one are going to be removed ++ */ ++int ++kill_hook_extent(const coord_t * coord, pos_in_node_t from, pos_in_node_t count, ++ struct carry_kill_data *kdata) ++{ ++ reiser4_extent *ext; ++ reiser4_block_nr start, length; ++ const reiser4_key *pfrom_key, *pto_key; ++ struct inode *inode; ++ reiser4_tree *tree; ++ pgoff_t from_off, to_off, offset, skip; ++ int retval; ++ ++ /* these are located in memory kmalloc-ed by kill_node_content */ ++ reiser4_key *min_item_key, *max_item_key, *from_key, *to_key, *key; ++ coord_t *dup, *next; ++ ++ assert("zam-811", znode_is_write_locked(coord->node)); ++ assert("nikita-3315", kdata != NULL); ++ assert("vs-34", kdata->buf != NULL); ++ ++ /* map structures to kdata->buf */ ++ min_item_key = (reiser4_key *) (kdata->buf); ++ max_item_key = min_item_key + 1; ++ from_key = max_item_key + 1; ++ to_key = from_key + 1; ++ key = to_key + 1; ++ dup = (coord_t *) (key + 1); ++ next = dup + 1; ++ ++ item_key_by_coord(coord, min_item_key); ++ max_item_key_by_coord(coord, max_item_key); ++ ++ if (kdata->params.from_key) { ++ pfrom_key = kdata->params.from_key; ++ pto_key = kdata->params.to_key; ++ } else { ++ assert("vs-1549", from == coord->unit_pos); ++ unit_key_by_coord(coord, from_key); ++ pfrom_key = from_key; ++ ++ coord_dup(dup, coord); ++ dup->unit_pos = from + count - 1; ++ max_unit_key_by_coord(dup, to_key); ++ 
pto_key = to_key; ++ } ++ ++ if (!keylt(pto_key, max_item_key)) { ++ if (!keygt(pfrom_key, min_item_key)) { ++ znode *left, *right; ++ ++ /* item is to be removed completely */ ++ assert("nikita-3316", kdata->left != NULL ++ && kdata->right != NULL); ++ ++ left = kdata->left->node; ++ right = kdata->right->node; ++ ++ tree = current_tree; ++ /* we have to do two things: ++ * ++ * 1. link left and right formatted neighbors of ++ * extent being removed, and ++ * ++ * 2. update their delimiting keys. ++ * ++ * atomicity of these operations is protected by ++ * taking dk-lock and tree-lock. ++ */ ++ /* if neighbors of item being removed are znodes - ++ * link them */ ++ write_lock_tree(tree); ++ write_lock_dk(tree); ++ link_left_and_right(left, right); ++ if (left) { ++ /* update right delimiting key of left ++ * neighbor of extent item */ ++ /*coord_t next; ++ reiser4_key key; */ ++ ++ coord_dup(next, coord); ++ ++ if (coord_next_item(next)) ++ *key = *znode_get_rd_key(coord->node); ++ else ++ item_key_by_coord(next, key); ++ znode_set_rd_key(left, key); ++ } ++ write_unlock_dk(tree); ++ write_unlock_tree(tree); ++ ++ from_off = ++ get_key_offset(min_item_key) >> PAGE_CACHE_SHIFT; ++ to_off = ++ (get_key_offset(max_item_key) + ++ 1) >> PAGE_CACHE_SHIFT; ++ retval = ITEM_KILLED; ++ } else { ++ /* tail of item is to be removed */ ++ from_off = ++ (get_key_offset(pfrom_key) + PAGE_CACHE_SIZE - ++ 1) >> PAGE_CACHE_SHIFT; ++ to_off = ++ (get_key_offset(max_item_key) + ++ 1) >> PAGE_CACHE_SHIFT; ++ retval = ITEM_TAIL_KILLED; ++ } ++ } else { ++ /* head of item is to be removed */ ++ assert("vs-1571", keyeq(pfrom_key, min_item_key)); ++ assert("vs-1572", ++ (get_key_offset(pfrom_key) & (PAGE_CACHE_SIZE - 1)) == ++ 0); ++ assert("vs-1573", ++ ((get_key_offset(pto_key) + 1) & (PAGE_CACHE_SIZE - ++ 1)) == 0); ++ ++ if (kdata->left->node) { ++ /* update right delimiting key of left neighbor of extent item */ ++ /*reiser4_key key; */ ++ ++ *key = *pto_key; ++ set_key_offset(key, get_key_offset(pto_key) + 1); ++ ++ write_lock_dk(current_tree); ++ znode_set_rd_key(kdata->left->node, key); ++ write_unlock_dk(current_tree); ++ } ++ ++ from_off = get_key_offset(pfrom_key) >> PAGE_CACHE_SHIFT; ++ to_off = (get_key_offset(pto_key) + 1) >> PAGE_CACHE_SHIFT; ++ retval = ITEM_HEAD_KILLED; ++ } ++ ++ inode = kdata->inode; ++ assert("vs-1545", inode != NULL); ++ if (inode != NULL) ++ /* take care of pages and jnodes corresponding to part of item being killed */ ++ reiser4_invalidate_pages(inode->i_mapping, from_off, ++ to_off - from_off, ++ kdata->params.truncate); ++ ++ ext = extent_item(coord) + from; ++ offset = ++ (get_key_offset(min_item_key) + ++ extent_size(coord, from)) >> PAGE_CACHE_SHIFT; ++ ++ assert("vs-1551", from_off >= offset); ++ assert("vs-1552", from_off - offset <= extent_get_width(ext)); ++ skip = from_off - offset; ++ offset = from_off; ++ ++ while (offset < to_off) { ++ length = extent_get_width(ext) - skip; ++ if (state_of_extent(ext) == HOLE_EXTENT) { ++ skip = 0; ++ offset += length; ++ ext++; ++ continue; ++ } ++ ++ if (offset + length > to_off) { ++ length = to_off - offset; ++ } ++ ++ DQUOT_FREE_BLOCK_NODIRTY(inode, length); ++ ++ if (state_of_extent(ext) == UNALLOCATED_EXTENT) { ++ /* some jnodes corresponding to this unallocated extent */ ++ fake_allocated2free(length, 0 /* unformatted */ ); ++ ++ skip = 0; ++ offset += length; ++ ext++; ++ continue; ++ } ++ ++ assert("vs-1218", state_of_extent(ext) == ALLOCATED_EXTENT); ++ ++ if (length != 0) { ++ start = extent_get_start(ext) + skip; 
++ ++ /* BA_DEFER bit parameter is turned on because blocks which get freed are not safe to be freed ++ immediately */ ++ reiser4_dealloc_blocks(&start, &length, ++ 0 /* not used */ , ++ BA_DEFER ++ /* unformatted with defer */ ); ++ } ++ skip = 0; ++ offset += length; ++ ext++; ++ } ++ return retval; ++} ++ ++/* item_plugin->b.kill_units */ ++int ++kill_units_extent(coord_t * coord, pos_in_node_t from, pos_in_node_t to, ++ struct carry_kill_data *kdata, reiser4_key * smallest_removed, ++ reiser4_key * new_first) ++{ ++ reiser4_extent *ext; ++ reiser4_key item_key; ++ pos_in_node_t count; ++ reiser4_key from_key, to_key; ++ const reiser4_key *pfrom_key, *pto_key; ++ loff_t off; ++ int result; ++ ++ assert("vs-1541", ++ ((kdata->params.from_key == NULL && kdata->params.to_key == NULL) ++ || (kdata->params.from_key != NULL ++ && kdata->params.to_key != NULL))); ++ ++ if (kdata->params.from_key) { ++ pfrom_key = kdata->params.from_key; ++ pto_key = kdata->params.to_key; ++ } else { ++ coord_t dup; ++ ++ /* calculate key range of kill */ ++ assert("vs-1549", from == coord->unit_pos); ++ unit_key_by_coord(coord, &from_key); ++ pfrom_key = &from_key; ++ ++ coord_dup(&dup, coord); ++ dup.unit_pos = to; ++ max_unit_key_by_coord(&dup, &to_key); ++ pto_key = &to_key; ++ } ++ ++ item_key_by_coord(coord, &item_key); ++ ++#if REISER4_DEBUG ++ { ++ reiser4_key max_item_key; ++ ++ max_item_key_by_coord(coord, &max_item_key); ++ ++ if (new_first) { ++ /* head of item is to be cut */ ++ assert("vs-1542", keyeq(pfrom_key, &item_key)); ++ assert("vs-1538", keylt(pto_key, &max_item_key)); ++ } else { ++ /* tail of item is to be cut */ ++ assert("vs-1540", keygt(pfrom_key, &item_key)); ++ assert("vs-1543", !keylt(pto_key, &max_item_key)); ++ } ++ } ++#endif ++ ++ if (smallest_removed) ++ *smallest_removed = *pfrom_key; ++ ++ if (new_first) { ++ /* item head is cut. Item key will change. This new key is calculated here */ ++ assert("vs-1556", ++ (get_key_offset(pto_key) & (PAGE_CACHE_SIZE - 1)) == ++ (PAGE_CACHE_SIZE - 1)); ++ *new_first = *pto_key; ++ set_key_offset(new_first, get_key_offset(new_first) + 1); ++ } ++ ++ count = to - from + 1; ++ result = kill_hook_extent(coord, from, count, kdata); ++ if (result == ITEM_TAIL_KILLED) { ++ assert("vs-1553", ++ get_key_offset(pfrom_key) >= ++ get_key_offset(&item_key) + extent_size(coord, from)); ++ off = ++ get_key_offset(pfrom_key) - (get_key_offset(&item_key) + ++ extent_size(coord, from)); ++ if (off) { ++ /* unit @from is to be cut partially. 
Its width decreases */ ++ ext = extent_item(coord) + from; ++ extent_set_width(ext, ++ (off + PAGE_CACHE_SIZE - ++ 1) >> PAGE_CACHE_SHIFT); ++ count--; ++ } ++ } else { ++ __u64 max_to_offset; ++ __u64 rest; ++ ++ assert("vs-1575", result == ITEM_HEAD_KILLED); ++ assert("", from == 0); ++ assert("", ++ ((get_key_offset(pto_key) + 1) & (PAGE_CACHE_SIZE - ++ 1)) == 0); ++ assert("", ++ get_key_offset(pto_key) + 1 > ++ get_key_offset(&item_key) + extent_size(coord, to)); ++ max_to_offset = ++ get_key_offset(&item_key) + extent_size(coord, to + 1) - 1; ++ assert("", get_key_offset(pto_key) <= max_to_offset); ++ ++ rest = ++ (max_to_offset - ++ get_key_offset(pto_key)) >> PAGE_CACHE_SHIFT; ++ if (rest) { ++ /* unit @to is to be cut partially */ ++ ext = extent_item(coord) + to; ++ ++ assert("", extent_get_width(ext) > rest); ++ ++ if (state_of_extent(ext) == ALLOCATED_EXTENT) ++ extent_set_start(ext, ++ extent_get_start(ext) + ++ (extent_get_width(ext) - ++ rest)); ++ ++ extent_set_width(ext, rest); ++ count--; ++ } ++ } ++ return count * sizeof(reiser4_extent); ++} ++ ++/* item_plugin->b.cut_units ++ this is too similar to kill_units_extent */ ++int ++cut_units_extent(coord_t * coord, pos_in_node_t from, pos_in_node_t to, ++ struct carry_cut_data *cdata, reiser4_key * smallest_removed, ++ reiser4_key * new_first) ++{ ++ reiser4_extent *ext; ++ reiser4_key item_key; ++ pos_in_node_t count; ++ reiser4_key from_key, to_key; ++ const reiser4_key *pfrom_key, *pto_key; ++ loff_t off; ++ ++ assert("vs-1541", ++ ((cdata->params.from_key == NULL && cdata->params.to_key == NULL) ++ || (cdata->params.from_key != NULL ++ && cdata->params.to_key != NULL))); ++ ++ if (cdata->params.from_key) { ++ pfrom_key = cdata->params.from_key; ++ pto_key = cdata->params.to_key; ++ } else { ++ coord_t dup; ++ ++ /* calculate key range of kill */ ++ coord_dup(&dup, coord); ++ dup.unit_pos = from; ++ unit_key_by_coord(&dup, &from_key); ++ ++ dup.unit_pos = to; ++ max_unit_key_by_coord(&dup, &to_key); ++ ++ pfrom_key = &from_key; ++ pto_key = &to_key; ++ } ++ ++ assert("vs-1555", ++ (get_key_offset(pfrom_key) & (PAGE_CACHE_SIZE - 1)) == 0); ++ assert("vs-1556", ++ (get_key_offset(pto_key) & (PAGE_CACHE_SIZE - 1)) == ++ (PAGE_CACHE_SIZE - 1)); ++ ++ item_key_by_coord(coord, &item_key); ++ ++#if REISER4_DEBUG ++ { ++ reiser4_key max_item_key; ++ ++ assert("vs-1584", ++ get_key_locality(pfrom_key) == ++ get_key_locality(&item_key)); ++ assert("vs-1585", ++ get_key_type(pfrom_key) == get_key_type(&item_key)); ++ assert("vs-1586", ++ get_key_objectid(pfrom_key) == ++ get_key_objectid(&item_key)); ++ assert("vs-1587", ++ get_key_ordering(pfrom_key) == ++ get_key_ordering(&item_key)); ++ ++ max_item_key_by_coord(coord, &max_item_key); ++ ++ if (new_first != NULL) { ++ /* head of item is to be cut */ ++ assert("vs-1542", keyeq(pfrom_key, &item_key)); ++ assert("vs-1538", keylt(pto_key, &max_item_key)); ++ } else { ++ /* tail of item is to be cut */ ++ assert("vs-1540", keygt(pfrom_key, &item_key)); ++ assert("vs-1543", keyeq(pto_key, &max_item_key)); ++ } ++ } ++#endif ++ ++ if (smallest_removed) ++ *smallest_removed = *pfrom_key; ++ ++ if (new_first) { ++ /* item head is cut. Item key will change. 
This new key is calculated here */ ++ *new_first = *pto_key; ++ set_key_offset(new_first, get_key_offset(new_first) + 1); ++ } ++ ++ count = to - from + 1; ++ ++ assert("vs-1553", ++ get_key_offset(pfrom_key) >= ++ get_key_offset(&item_key) + extent_size(coord, from)); ++ off = ++ get_key_offset(pfrom_key) - (get_key_offset(&item_key) + ++ extent_size(coord, from)); ++ if (off) { ++ /* tail of unit @from is to be cut partially. Its width decreases */ ++ assert("vs-1582", new_first == NULL); ++ ext = extent_item(coord) + from; ++ extent_set_width(ext, off >> PAGE_CACHE_SHIFT); ++ count--; ++ } ++ ++ assert("vs-1554", ++ get_key_offset(pto_key) <= ++ get_key_offset(&item_key) + extent_size(coord, to + 1) - 1); ++ off = ++ (get_key_offset(&item_key) + extent_size(coord, to + 1) - 1) - ++ get_key_offset(pto_key); ++ if (off) { ++ /* @to_key is smaller than max key of unit @to. Unit @to will not be removed. It gets start increased ++ and width decreased. */ ++ assert("vs-1583", (off & (PAGE_CACHE_SIZE - 1)) == 0); ++ ext = extent_item(coord) + to; ++ if (state_of_extent(ext) == ALLOCATED_EXTENT) ++ extent_set_start(ext, ++ extent_get_start(ext) + ++ (extent_get_width(ext) - ++ (off >> PAGE_CACHE_SHIFT))); ++ ++ extent_set_width(ext, (off >> PAGE_CACHE_SHIFT)); ++ count--; ++ } ++ return count * sizeof(reiser4_extent); ++} ++ ++/* item_plugin->b.unit_key */ ++reiser4_key *unit_key_extent(const coord_t * coord, reiser4_key * key) ++{ ++ assert("vs-300", coord_is_existing_unit(coord)); ++ ++ item_key_by_coord(coord, key); ++ set_key_offset(key, ++ (get_key_offset(key) + ++ extent_size(coord, coord->unit_pos))); ++ ++ return key; ++} ++ ++/* item_plugin->b.max_unit_key */ ++reiser4_key *max_unit_key_extent(const coord_t * coord, reiser4_key * key) ++{ ++ assert("vs-300", coord_is_existing_unit(coord)); ++ ++ item_key_by_coord(coord, key); ++ set_key_offset(key, ++ (get_key_offset(key) + ++ extent_size(coord, coord->unit_pos + 1) - 1)); ++ return key; ++} ++ ++/* item_plugin->b.estimate ++ item_plugin->b.item_data_by_flow */ ++ ++#if REISER4_DEBUG ++ ++/* item_plugin->b.check ++ used for debugging, every item should have here the most complete ++ possible check of the consistency of the item that the inventor can ++ construct ++*/ ++int check_extent(const coord_t * coord /* coord of item to check */ , ++ const char **error /* where to store error message */ ) ++{ ++ reiser4_extent *ext, *first; ++ unsigned i, j; ++ reiser4_block_nr start, width, blk_cnt; ++ unsigned num_units; ++ reiser4_tree *tree; ++ oid_t oid; ++ reiser4_key key; ++ coord_t scan; ++ ++ assert("vs-933", REISER4_DEBUG); ++ ++ if (znode_get_level(coord->node) != TWIG_LEVEL) { ++ *error = "Extent on the wrong level"; ++ return -1; ++ } ++ if (item_length_by_coord(coord) % sizeof(reiser4_extent) != 0) { ++ *error = "Wrong item size"; ++ return -1; ++ } ++ ext = first = extent_item(coord); ++ blk_cnt = reiser4_block_count(reiser4_get_current_sb()); ++ num_units = coord_num_units(coord); ++ tree = znode_get_tree(coord->node); ++ item_key_by_coord(coord, &key); ++ oid = get_key_objectid(&key); ++ coord_dup(&scan, coord); ++ ++ for (i = 0; i < num_units; ++i, ++ext) { ++ __u64 index; ++ ++ scan.unit_pos = i; ++ index = extent_unit_index(&scan); ++ ++#if 0 ++ /* check that all jnodes are present for the unallocated ++ * extent */ ++ if (state_of_extent(ext) == UNALLOCATED_EXTENT) { ++ for (j = 0; j < extent_get_width(ext); j++) { ++ jnode *node; ++ ++ node = jlookup(tree, oid, index + j); ++ if (node == NULL) { ++ print_coord("scan", 
&scan, 0);
++					*error = "Jnode missing";
++					return -1;
++				}
++				jput(node);
++			}
++		}
++#endif
++
++		start = extent_get_start(ext);
++		if (start < 2)
++			continue;
++		/* extent is an allocated one */
++		width = extent_get_width(ext);
++		if (start >= blk_cnt) {
++			*error = "Start too large";
++			return -1;
++		}
++		if (start + width > blk_cnt) {
++			*error = "End too large";
++			return -1;
++		}
++		/* make sure that this extent does not overlap with other
++		   allocated extents */
++		for (j = 0; j < i; j++) {
++			if (state_of_extent(first + j) != ALLOCATED_EXTENT)
++				continue;
++			if (!
++			    ((extent_get_start(ext) >=
++			      extent_get_start(first + j) +
++			      extent_get_width(first + j))
++			     || (extent_get_start(ext) +
++				 extent_get_width(ext) <=
++				 extent_get_start(first + j)))) {
++				*error = "Extent overlaps with others";
++				return -1;
++			}
++		}
++
++	}
++
++	return 0;
++}
++
++#endif /* REISER4_DEBUG */
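++
++/*
++ * Editor's note (illustrative, not part of the original patch): the
++ * overlap test in check_extent() above is the standard
++ * interval-disjointness predicate. Two allocated units occupying
++ * [s1, s1 + w1) and [s2, s2 + w2) are disjoint iff
++ *
++ *	s1 >= s2 + w2  ||  s1 + w1 <= s2
++ *
++ * and check_extent() reports an error whenever the negation of this
++ * condition holds for any pair of allocated units in the item.
++ */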
++
++/*
++  Local variables:
++  c-indentation-style: "K&R"
++  mode-name: "LC"
++  c-basic-offset: 8
++  tab-width: 8
++  fill-column: 120
++  scroll-step: 1
++  End:
++*/
+Index: linux-2.6.16/fs/reiser4/plugin/item/internal.c
+===================================================================
+--- /dev/null
++++ linux-2.6.16/fs/reiser4/plugin/item/internal.c
+@@ -0,0 +1,392 @@
++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
++
++/* Implementation of internal-item plugin methods. */
++
++#include "../../forward.h"
++#include "../../debug.h"
++#include "../../dformat.h"
++#include "../../key.h"
++#include "../../coord.h"
++#include "internal.h"
++#include "item.h"
++#include "../node/node.h"
++#include "../plugin.h"
++#include "../../jnode.h"
++#include "../../znode.h"
++#include "../../tree_walk.h"
++#include "../../tree_mod.h"
++#include "../../tree.h"
++#include "../../super.h"
++#include "../../block_alloc.h"
++
++/* see internal.h for explanation */
++
++/* plugin->u.item.b.mergeable */
++int mergeable_internal(const coord_t * p1 UNUSED_ARG /* first item */ ,
++		       const coord_t * p2 UNUSED_ARG /* second item */ )
++{
++	/* internal items are not mergeable */
++	return 0;
++}
++
++/* ->lookup() method for internal items */
++lookup_result lookup_internal(const reiser4_key * key /* key to look up */ ,
++			      lookup_bias bias UNUSED_ARG /* lookup bias */ ,
++			      coord_t * coord /* coord of item */ )
++{
++	reiser4_key ukey;
++
++	switch (keycmp(unit_key_by_coord(coord, &ukey), key)) {
++	default:
++		impossible("", "keycmp()?!");
++	case LESS_THAN:
++		/* FIXME-VS: AFTER_ITEM used to be here. But with the new
++		   coord layout the item plugin cannot be obtained from a
++		   coord set this way */
++		assert("vs-681", coord->unit_pos == 0);
++		coord->between = AFTER_UNIT;
++	case EQUAL_TO:
++		return CBK_COORD_FOUND;
++	case GREATER_THAN:
++		return CBK_COORD_NOTFOUND;
++	}
++}
++
++/* return body of internal item at @coord */
++static internal_item_layout *internal_at(const coord_t * coord /* coord of
++								 * item */ )
++{
++	assert("nikita-607", coord != NULL);
++	assert("nikita-1650",
++	       item_plugin_by_coord(coord) ==
++	       item_plugin_by_id(NODE_POINTER_ID));
++	return (internal_item_layout *) item_body_by_coord(coord);
++}
++
++void update_internal(const coord_t * coord, const reiser4_block_nr * blocknr)
++{
++	internal_item_layout *item = internal_at(coord);
++	assert("nikita-2959", reiser4_blocknr_is_sane(blocknr));
++
++	put_unaligned(cpu_to_le64(*blocknr), &item->pointer);
++}
++
++/* return child block number stored in the internal item at @coord */
++static reiser4_block_nr pointer_at(const coord_t * coord /* coord of item */ )
++{
++	assert("nikita-608", coord != NULL);
++	return le64_to_cpu(get_unaligned(&internal_at(coord)->pointer));
++}
++
++/* get znode pointed to by internal @item */
++static znode *znode_at(const coord_t * item /* coord of item */ ,
++		       znode * parent /* parent node */ )
++{
++	return child_znode(item, parent, 1, 0);
++}
++
++/* store pointer from internal item into "block". Implementation of
++   ->down_link() method */
++void down_link_internal(const coord_t * coord /* coord of item */ ,
++			const reiser4_key * key UNUSED_ARG /* key to get
++							    * pointer for */ ,
++			reiser4_block_nr * block /* resulting block number */ )
++{
++	ON_DEBUG(reiser4_key item_key);
++
++	assert("nikita-609", coord != NULL);
++	assert("nikita-611", block != NULL);
++	assert("nikita-612", (key == NULL) ||
++	       /* twig horrors */
++	       (znode_get_level(coord->node) == TWIG_LEVEL)
++	       || keyle(item_key_by_coord(coord, &item_key), key));
++
++	*block = pointer_at(coord);
++	assert("nikita-2960", reiser4_blocknr_is_sane(block));
++}
++
++/* Get the child's block number, or 0 if the block is unallocated. */
++int
++utmost_child_real_block_internal(const coord_t * coord, sideof side UNUSED_ARG,
++				 reiser4_block_nr * block)
++{
++	assert("jmacd-2059", coord != NULL);
++
++	*block = pointer_at(coord);
++	assert("nikita-2961", reiser4_blocknr_is_sane(block));
++
++	if (blocknr_is_fake(block)) {
++		*block = 0;
++	}
++
++	return 0;
++}
++
++/* Return the child.
*/ ++int ++utmost_child_internal(const coord_t * coord, sideof side UNUSED_ARG, ++ jnode ** childp) ++{ ++ reiser4_block_nr block = pointer_at(coord); ++ znode *child; ++ ++ assert("jmacd-2059", childp != NULL); ++ assert("nikita-2962", reiser4_blocknr_is_sane(&block)); ++ ++ child = zlook(znode_get_tree(coord->node), &block); ++ ++ if (IS_ERR(child)) { ++ return PTR_ERR(child); ++ } ++ ++ *childp = ZJNODE(child); ++ ++ return 0; ++} ++ ++static void check_link(znode * left, znode * right) ++{ ++ znode *scan; ++ ++ for (scan = left; scan != right; scan = scan->right) { ++ if (ZF_ISSET(scan, JNODE_RIP)) ++ break; ++ if (znode_is_right_connected(scan) && scan->right != NULL) { ++ if (ZF_ISSET(scan->right, JNODE_RIP)) ++ break; ++ assert("nikita-3285", ++ znode_is_left_connected(scan->right)); ++ assert("nikita-3265", ++ ergo(scan != left, ++ ZF_ISSET(scan, JNODE_HEARD_BANSHEE))); ++ assert("nikita-3284", scan->right->left == scan); ++ } else ++ break; ++ } ++} ++ ++int check__internal(const coord_t * coord, const char **error) ++{ ++ reiser4_block_nr blk; ++ znode *child; ++ coord_t cpy; ++ ++ blk = pointer_at(coord); ++ if (!reiser4_blocknr_is_sane(&blk)) { ++ *error = "Invalid pointer"; ++ return -1; ++ } ++ coord_dup(&cpy, coord); ++ child = znode_at(&cpy, cpy.node); ++ if (child != NULL) { ++ znode *left_child; ++ znode *right_child; ++ ++ left_child = right_child = NULL; ++ ++ assert("nikita-3256", znode_invariant(child)); ++ if (coord_prev_item(&cpy) == 0 && item_is_internal(&cpy)) { ++ left_child = znode_at(&cpy, cpy.node); ++ if (left_child != NULL) { ++ read_lock_tree(znode_get_tree(child)); ++ check_link(left_child, child); ++ read_unlock_tree(znode_get_tree(child)); ++ zput(left_child); ++ } ++ } ++ coord_dup(&cpy, coord); ++ if (coord_next_item(&cpy) == 0 && item_is_internal(&cpy)) { ++ right_child = znode_at(&cpy, cpy.node); ++ if (right_child != NULL) { ++ read_lock_tree(znode_get_tree(child)); ++ check_link(child, right_child); ++ read_unlock_tree(znode_get_tree(child)); ++ zput(right_child); ++ } ++ } ++ zput(child); ++ } ++ return 0; ++} ++ ++/* return true only if this item really points to "block" */ ++/* Audited by: green(2002.06.14) */ ++int has_pointer_to_internal(const coord_t * coord /* coord of item */ , ++ const reiser4_block_nr * block /* block number to ++ * check */ ) ++{ ++ assert("nikita-613", coord != NULL); ++ assert("nikita-614", block != NULL); ++ ++ return pointer_at(coord) == *block; ++} ++ ++/* hook called by ->create_item() method of node plugin after new internal ++ item was just created. ++ ++ This is point where pointer to new node is inserted into tree. Initialize ++ parent pointer in child znode, insert child into sibling list and slum. ++ ++*/ ++int create_hook_internal(const coord_t * item /* coord of item */ , ++ void *arg /* child's left neighbor, if any */ ) ++{ ++ znode *child; ++ __u64 child_ptr; ++ ++ assert("nikita-1252", item != NULL); ++ assert("nikita-1253", item->node != NULL); ++ assert("nikita-1181", znode_get_level(item->node) > LEAF_LEVEL); ++ assert("nikita-1450", item->unit_pos == 0); ++ ++ /* ++ * preparing to item insertion build_child_ptr_data sets pointer to ++ * data to be inserted to jnode's blocknr which is in cpu byte ++ * order. Node's create_item simply copied those data. As result we ++ * have child pointer in cpu's byte order. Convert content of internal ++ * item to little endian byte order. 
++	 */
++	child_ptr = get_unaligned((__u64 *)item_body_by_coord(item));
++	update_internal(item, &child_ptr);
++
++	child = znode_at(item, item->node);
++	if (child != NULL && !IS_ERR(child)) {
++		znode *left;
++		int result = 0;
++		reiser4_tree *tree;
++
++		left = arg;
++		tree = znode_get_tree(item->node);
++		write_lock_tree(tree);
++		write_lock_dk(tree);
++		assert("nikita-1400", (child->in_parent.node == NULL)
++		       || (znode_above_root(child->in_parent.node)));
++		++item->node->c_count;
++		coord_to_parent_coord(item, &child->in_parent);
++		sibling_list_insert_nolock(child, left);
++
++		assert("nikita-3297", ZF_ISSET(child, JNODE_ORPHAN));
++		ZF_CLR(child, JNODE_ORPHAN);
++
++		if ((left != NULL) && !keyeq(znode_get_rd_key(left),
++					     znode_get_rd_key(child))) {
++			znode_set_rd_key(child, znode_get_rd_key(left));
++		}
++		write_unlock_dk(tree);
++		write_unlock_tree(tree);
++		zput(child);
++		return result;
++	} else {
++		if (child == NULL)
++			child = ERR_PTR(-EIO);
++		return PTR_ERR(child);
++	}
++}
++
++/* hook called by ->cut_and_kill() method of node plugin just before internal
++   item is removed.
++
++   This is the point where an empty node is removed from the tree. Clear the
++   parent pointer in the child, and mark the node for pending deletion.
++
++   The node will actually be deleted later, in several steps:
++
++   . when the last lock on this node is released, the node is removed from
++   the sibling list and its lock is invalidated
++
++   . when the last reference to this node is dropped, the bitmap is updated
++   and the node is actually removed from memory.
++*/
++int kill_hook_internal(const coord_t * item /* coord of item */ ,
++		       pos_in_node_t from UNUSED_ARG /* start unit */ ,
++		       pos_in_node_t count UNUSED_ARG /* stop unit */ ,
++		       struct carry_kill_data *p UNUSED_ARG)
++{
++	znode *child;
++
++	assert("nikita-1222", item != NULL);
++	assert("nikita-1224", from == 0);
++	assert("nikita-1225", count == 1);
++
++	child = znode_at(item, item->node);
++	if (IS_ERR(child))
++		return PTR_ERR(child);
++	else if (node_is_empty(child)) {
++		reiser4_tree *tree;
++
++		assert("nikita-1397", znode_is_write_locked(child));
++		assert("nikita-1398", child->c_count == 0);
++		assert("nikita-2546", ZF_ISSET(child, JNODE_HEARD_BANSHEE));
++
++		tree = znode_get_tree(item->node);
++		write_lock_tree(tree);
++		init_parent_coord(&child->in_parent, NULL);
++		--item->node->c_count;
++		write_unlock_tree(tree);
++		zput(child);
++		return 0;
++	} else {
++		warning("nikita-1223",
++			"Cowardly refuse to remove link to non-empty node");
++		zput(child);
++		return RETERR(-EIO);
++	}
++}
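++
++/*
++ * Editor's summary (illustrative, not part of the original patch): the
++ * create and kill hooks above keep the parent/child bookkeeping
++ * symmetric. For a parent node P carrying the internal item and a child
++ * znode C:
++ *
++ *	create_hook_internal:	P->c_count++;
++ *				coord_to_parent_coord(item, &C->in_parent);
++ *	kill_hook_internal:	init_parent_coord(&C->in_parent, NULL);
++ *				P->c_count--;
++ *
++ * so at any instant P->c_count equals the number of children whose
++ * in_parent coordinate still points into P.
++ */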
++
++/* hook called by ->shift() node plugin method when an internal item was just
++   moved from one node to another.
++
++   Update parent pointer in child and c_counts in old and new parent
++*/
++int shift_hook_internal(const coord_t * item /* coord of item */ ,
++			unsigned from UNUSED_ARG /* start unit */ ,
++			unsigned count UNUSED_ARG /* stop unit */ ,
++			znode * old_node /* old parent */ )
++{
++	znode *child;
++	znode *new_node;
++	reiser4_tree *tree;
++
++	assert("nikita-1276", item != NULL);
++	assert("nikita-1277", from == 0);
++	assert("nikita-1278", count == 1);
++	assert("nikita-1451", item->unit_pos == 0);
++
++	new_node = item->node;
++	assert("nikita-2132", new_node != old_node);
++	tree = znode_get_tree(item->node);
++	child = child_znode(item, old_node, 1, 0);
++	if (child == NULL)
++		return 0;
++	if (!IS_ERR(child)) {
++		write_lock_tree(tree);
++		++new_node->c_count;
++		assert("nikita-1395", znode_parent(child) == old_node);
++		assert("nikita-1396", old_node->c_count > 0);
++		coord_to_parent_coord(item, &child->in_parent);
++		assert("nikita-1781", znode_parent(child) == new_node);
++		assert("nikita-1782",
++		       check_tree_pointer(item, child) == NS_FOUND);
++		--old_node->c_count;
++		write_unlock_tree(tree);
++		zput(child);
++		return 0;
++	} else
++		return PTR_ERR(child);
++}
++
++/* plugin->u.item.b.max_key_inside - not defined */
++
++/* plugin->u.item.b.nr_units - item.c:single_unit */
++
++/* Make Linus happy.
++   Local variables:
++   c-indentation-style: "K&R"
++   mode-name: "LC"
++   c-basic-offset: 8
++   tab-width: 8
++   fill-column: 120
++   End:
++*/
+Index: linux-2.6.16/fs/reiser4/plugin/item/internal.h
+===================================================================
+--- /dev/null
++++ linux-2.6.16/fs/reiser4/plugin/item/internal.h
+@@ -0,0 +1,57 @@
++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
++/* Internal item contains down-link to the child of the internal/twig
++   node in a tree. It is internal items that are actually used during
++   tree traversal. */
++
++#if !defined( __FS_REISER4_PLUGIN_ITEM_INTERNAL_H__ )
++#define __FS_REISER4_PLUGIN_ITEM_INTERNAL_H__
++
++#include "../../forward.h"
++#include "../../dformat.h"
++
++/* on-disk layout of internal item */
++typedef struct internal_item_layout {
++	/*  0 */ reiser4_dblock_nr pointer;
++	/*  4 */
++} internal_item_layout;
++
++struct cut_list;
++
++int mergeable_internal(const coord_t * p1, const coord_t * p2);
++lookup_result lookup_internal(const reiser4_key * key, lookup_bias bias,
++			      coord_t * coord);
++/* store pointer from internal item into "block".
Implementation of ++ ->down_link() method */ ++extern void down_link_internal(const coord_t * coord, const reiser4_key * key, ++ reiser4_block_nr * block); ++extern int has_pointer_to_internal(const coord_t * coord, ++ const reiser4_block_nr * block); ++extern int create_hook_internal(const coord_t * item, void *arg); ++extern int kill_hook_internal(const coord_t * item, pos_in_node_t from, ++ pos_in_node_t count, struct carry_kill_data *); ++extern int shift_hook_internal(const coord_t * item, unsigned from, ++ unsigned count, znode * old_node); ++extern void print_internal(const char *prefix, coord_t * coord); ++ ++extern int utmost_child_internal(const coord_t * coord, sideof side, ++ jnode ** child); ++int utmost_child_real_block_internal(const coord_t * coord, sideof side, ++ reiser4_block_nr * block); ++ ++extern void update_internal(const coord_t * coord, ++ const reiser4_block_nr * blocknr); ++/* FIXME: reiserfs has check_internal */ ++extern int check__internal(const coord_t * coord, const char **error); ++ ++/* __FS_REISER4_PLUGIN_ITEM_INTERNAL_H__ */ ++#endif ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +Index: linux-2.6.16/fs/reiser4/plugin/item/item.c +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/plugin/item/item.c +@@ -0,0 +1,727 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++/* definition of item plugins. */ ++ ++#include "../../forward.h" ++#include "../../debug.h" ++#include "../../key.h" ++#include "../../coord.h" ++#include "../plugin_header.h" ++#include "sde.h" ++#include "internal.h" ++#include "item.h" ++#include "static_stat.h" ++#include "../plugin.h" ++#include "../../znode.h" ++#include "../../tree.h" ++#include "../../context.h" ++#include "ctail.h" ++ ++/* return pointer to item body */ ++void item_body_by_coord_hard(coord_t * coord /* coord to query */ ) ++{ ++ assert("nikita-324", coord != NULL); ++ assert("nikita-325", coord->node != NULL); ++ assert("nikita-326", znode_is_loaded(coord->node)); ++ assert("nikita-3200", coord->offset == INVALID_OFFSET); ++ ++ coord->offset = ++ node_plugin_by_node(coord->node)->item_by_coord(coord) - ++ zdata(coord->node); ++ ON_DEBUG(coord->body_v = coord->node->times_locked); ++} ++ ++void *item_body_by_coord_easy(const coord_t * coord /* coord to query */ ) ++{ ++ return zdata(coord->node) + coord->offset; ++} ++ ++#if REISER4_DEBUG ++ ++int item_body_is_valid(const coord_t * coord) ++{ ++ return ++ coord->offset == ++ node_plugin_by_node(coord->node)->item_by_coord(coord) - ++ zdata(coord->node); ++} ++ ++#endif ++ ++/* return length of item at @coord */ ++pos_in_node_t item_length_by_coord(const coord_t * coord /* coord to query */ ) ++{ ++ int len; ++ ++ assert("nikita-327", coord != NULL); ++ assert("nikita-328", coord->node != NULL); ++ assert("nikita-329", znode_is_loaded(coord->node)); ++ ++ len = node_plugin_by_node(coord->node)->length_by_coord(coord); ++ return len; ++} ++ ++void obtain_item_plugin(const coord_t * coord) ++{ ++ assert("nikita-330", coord != NULL); ++ assert("nikita-331", coord->node != NULL); ++ assert("nikita-332", znode_is_loaded(coord->node)); ++ ++ coord_set_iplug((coord_t *) coord, ++ node_plugin_by_node(coord->node)-> ++ plugin_by_coord(coord)); ++ assert("nikita-2479", ++ coord_iplug(coord) == ++ node_plugin_by_node(coord->node)->plugin_by_coord(coord)); ++} 
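++
++/*
++ * Illustrative sketch (editor's addition, not part of the original
++ * patch): item_body_by_coord_hard() and item_body_by_coord_easy() above
++ * form a lazy cache. A hypothetical combined accessor would read:
++ *
++ *	static void *item_body(coord_t *coord)
++ *	{
++ *		if (coord->offset == INVALID_OFFSET)
++ *			item_body_by_coord_hard(coord);
++ *		return item_body_by_coord_easy(coord);
++ *	}
++ *
++ * The hard path computes and caches the offset of the item body inside
++ * the node; after that, every lookup on the same coord is a single
++ * addition to zdata(coord->node).
++ */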
++ ++/* return type of item at @coord */ ++item_type_id item_type_by_coord(const coord_t * coord /* coord to query */ ) ++{ ++ assert("nikita-333", coord != NULL); ++ assert("nikita-334", coord->node != NULL); ++ assert("nikita-335", znode_is_loaded(coord->node)); ++ assert("nikita-336", item_plugin_by_coord(coord) != NULL); ++ ++ return item_plugin_by_coord(coord)->b.item_type; ++} ++ ++/* return id of item */ ++/* Audited by: green(2002.06.15) */ ++item_id item_id_by_coord(const coord_t * coord /* coord to query */ ) ++{ ++ assert("vs-539", coord != NULL); ++ assert("vs-538", coord->node != NULL); ++ assert("vs-537", znode_is_loaded(coord->node)); ++ assert("vs-536", item_plugin_by_coord(coord) != NULL); ++ assert("vs-540", ++ item_id_by_plugin(item_plugin_by_coord(coord)) < LAST_ITEM_ID); ++ ++ return item_id_by_plugin(item_plugin_by_coord(coord)); ++} ++ ++/* return key of item at @coord */ ++/* Audited by: green(2002.06.15) */ ++reiser4_key *item_key_by_coord(const coord_t * coord /* coord to query */ , ++ reiser4_key * key /* result */ ) ++{ ++ assert("nikita-338", coord != NULL); ++ assert("nikita-339", coord->node != NULL); ++ assert("nikita-340", znode_is_loaded(coord->node)); ++ ++ return node_plugin_by_node(coord->node)->key_at(coord, key); ++} ++ ++/* this returns max key in the item */ ++reiser4_key *max_item_key_by_coord(const coord_t * coord /* coord to query */ , ++ reiser4_key * key /* result */ ) ++{ ++ coord_t last; ++ ++ assert("nikita-338", coord != NULL); ++ assert("nikita-339", coord->node != NULL); ++ assert("nikita-340", znode_is_loaded(coord->node)); ++ ++ /* make coord pointing to last item's unit */ ++ coord_dup(&last, coord); ++ last.unit_pos = coord_num_units(&last) - 1; ++ assert("vs-1560", coord_is_existing_unit(&last)); ++ ++ max_unit_key_by_coord(&last, key); ++ return key; ++} ++ ++/* return key of unit at @coord */ ++reiser4_key *unit_key_by_coord(const coord_t * coord /* coord to query */ , ++ reiser4_key * key /* result */ ) ++{ ++ assert("nikita-772", coord != NULL); ++ assert("nikita-774", coord->node != NULL); ++ assert("nikita-775", znode_is_loaded(coord->node)); ++ ++ if (item_plugin_by_coord(coord)->b.unit_key != NULL) ++ return item_plugin_by_coord(coord)->b.unit_key(coord, key); ++ else ++ return item_key_by_coord(coord, key); ++} ++ ++/* return the biggest key contained the unit @coord */ ++reiser4_key *max_unit_key_by_coord(const coord_t * coord /* coord to query */ , ++ reiser4_key * key /* result */ ) ++{ ++ assert("nikita-772", coord != NULL); ++ assert("nikita-774", coord->node != NULL); ++ assert("nikita-775", znode_is_loaded(coord->node)); ++ ++ if (item_plugin_by_coord(coord)->b.max_unit_key != NULL) ++ return item_plugin_by_coord(coord)->b.max_unit_key(coord, key); ++ else ++ return unit_key_by_coord(coord, key); ++} ++ ++/* ->max_key_inside() method for items consisting of exactly one key (like ++ stat-data) */ ++static reiser4_key *max_key_inside_single_key(const coord_t * ++ coord /* coord of item */ , ++ reiser4_key * ++ result /* resulting key */ ) ++{ ++ assert("nikita-604", coord != NULL); ++ ++ /* coord -> key is starting key of this item and it has to be already ++ filled in */ ++ return unit_key_by_coord(coord, result); ++} ++ ++/* ->nr_units() method for items consisting of exactly one unit always */ ++static pos_in_node_t ++nr_units_single_unit(const coord_t * coord UNUSED_ARG /* coord of item */ ) ++{ ++ return 1; ++} ++ ++static int ++paste_no_paste(coord_t * coord UNUSED_ARG, ++ reiser4_item_data * data UNUSED_ARG, ++ 
carry_plugin_info * info UNUSED_ARG) ++{ ++ return 0; ++} ++ ++/* default ->fast_paste() method */ ++static int ++agree_to_fast_op(const coord_t * coord UNUSED_ARG /* coord of item */ ) ++{ ++ return 1; ++} ++ ++int item_can_contain_key(const coord_t * item /* coord of item */ , ++ const reiser4_key * key /* key to check */ , ++ const reiser4_item_data * data /* parameters of item ++ * being created */ ) ++{ ++ item_plugin *iplug; ++ reiser4_key min_key_in_item; ++ reiser4_key max_key_in_item; ++ ++ assert("nikita-1658", item != NULL); ++ assert("nikita-1659", key != NULL); ++ ++ iplug = item_plugin_by_coord(item); ++ if (iplug->b.can_contain_key != NULL) ++ return iplug->b.can_contain_key(item, key, data); ++ else { ++ assert("nikita-1681", iplug->b.max_key_inside != NULL); ++ item_key_by_coord(item, &min_key_in_item); ++ iplug->b.max_key_inside(item, &max_key_in_item); ++ ++ /* can contain key if ++ min_key_in_item <= key && ++ key <= max_key_in_item ++ */ ++ return keyle(&min_key_in_item, key) ++ && keyle(key, &max_key_in_item); ++ } ++} ++ ++/* mergeable method for non mergeable items */ ++static int ++not_mergeable(const coord_t * i1 UNUSED_ARG, const coord_t * i2 UNUSED_ARG) ++{ ++ return 0; ++} ++ ++/* return 0 if @item1 and @item2 are not mergeable, !0 - otherwise */ ++int are_items_mergeable(const coord_t * i1 /* coord of first item */ , ++ const coord_t * i2 /* coord of second item */ ) ++{ ++ item_plugin *iplug; ++ reiser4_key k1; ++ reiser4_key k2; ++ ++ assert("nikita-1336", i1 != NULL); ++ assert("nikita-1337", i2 != NULL); ++ ++ iplug = item_plugin_by_coord(i1); ++ assert("nikita-1338", iplug != NULL); ++ ++ /* NOTE-NIKITA are_items_mergeable() is also called by assertions in ++ shifting code when nodes are in "suspended" state. */ ++ assert("nikita-1663", ++ keyle(item_key_by_coord(i1, &k1), item_key_by_coord(i2, &k2))); ++ ++ if (iplug->b.mergeable != NULL) { ++ return iplug->b.mergeable(i1, i2); ++ } else if (iplug->b.max_key_inside != NULL) { ++ iplug->b.max_key_inside(i1, &k1); ++ item_key_by_coord(i2, &k2); ++ ++ /* mergeable if ->max_key_inside() >= key of i2; */ ++ return keyge(iplug->b.max_key_inside(i1, &k1), ++ item_key_by_coord(i2, &k2)); ++ } else { ++ item_key_by_coord(i1, &k1); ++ item_key_by_coord(i2, &k2); ++ ++ return ++ (get_key_locality(&k1) == get_key_locality(&k2)) && ++ (get_key_objectid(&k1) == get_key_objectid(&k2)) ++ && (iplug == item_plugin_by_coord(i2)); ++ } ++} ++ ++int item_is_extent(const coord_t * item) ++{ ++ assert("vs-482", coord_is_existing_item(item)); ++ return item_id_by_coord(item) == EXTENT_POINTER_ID; ++} ++ ++int item_is_tail(const coord_t * item) ++{ ++ assert("vs-482", coord_is_existing_item(item)); ++ return item_id_by_coord(item) == FORMATTING_ID; ++} ++ ++int item_is_statdata(const coord_t * item) ++{ ++ assert("vs-516", coord_is_existing_item(item)); ++ return item_type_by_coord(item) == STAT_DATA_ITEM_TYPE; ++} ++ ++int item_is_ctail(const coord_t * item) ++{ ++ assert("edward-xx", coord_is_existing_item(item)); ++ return item_id_by_coord(item) == CTAIL_ID; ++} ++ ++static int change_item(struct inode *inode, reiser4_plugin * plugin) ++{ ++ /* cannot change constituent item (sd, or dir_item) */ ++ return RETERR(-EINVAL); ++} ++ ++static reiser4_plugin_ops item_plugin_ops = { ++ .init = NULL, ++ .load = NULL, ++ .save_len = NULL, ++ .save = NULL, ++ .change = change_item ++}; ++ ++item_plugin item_plugins[LAST_ITEM_ID] = { ++ [STATIC_STAT_DATA_ID] = { ++ .h = { ++ .type_id = REISER4_ITEM_PLUGIN_TYPE, ++ .id = 
STATIC_STAT_DATA_ID, ++ .pops = &item_plugin_ops, ++ .label = "sd", ++ .desc = "stat-data", ++ .linkage = {NULL, NULL} ++ }, ++ .b = { ++ .item_type = STAT_DATA_ITEM_TYPE, ++ .max_key_inside = max_key_inside_single_key, ++ .can_contain_key = NULL, ++ .mergeable = not_mergeable, ++ .nr_units = nr_units_single_unit, ++ .lookup = NULL, ++ .init = NULL, ++ .paste = paste_no_paste, ++ .fast_paste = NULL, ++ .can_shift = NULL, ++ .copy_units = NULL, ++ .create_hook = NULL, ++ .kill_hook = NULL, ++ .shift_hook = NULL, ++ .cut_units = NULL, ++ .kill_units = NULL, ++ .unit_key = NULL, ++ .max_unit_key = NULL, ++ .estimate = NULL, ++ .item_data_by_flow = NULL, ++#if REISER4_DEBUG ++ .check = NULL ++#endif ++ }, ++ .f = { ++ .utmost_child = NULL, ++ .utmost_child_real_block = NULL, ++ .update = NULL, ++ .scan = NULL, ++ .convert = NULL ++ }, ++ .s = { ++ .sd = { ++ .init_inode = init_inode_static_sd, ++ .save_len = save_len_static_sd, ++ .save = save_static_sd ++ } ++ } ++ }, ++ [SIMPLE_DIR_ENTRY_ID] = { ++ .h = { ++ .type_id = REISER4_ITEM_PLUGIN_TYPE, ++ .id = SIMPLE_DIR_ENTRY_ID, ++ .pops = &item_plugin_ops, ++ .label = "de", ++ .desc = "directory entry", ++ .linkage = {NULL, NULL} ++ }, ++ .b = { ++ .item_type = DIR_ENTRY_ITEM_TYPE, ++ .max_key_inside = max_key_inside_single_key, ++ .can_contain_key = NULL, ++ .mergeable = NULL, ++ .nr_units = nr_units_single_unit, ++ .lookup = NULL, ++ .init = NULL, ++ .paste = NULL, ++ .fast_paste = NULL, ++ .can_shift = NULL, ++ .copy_units = NULL, ++ .create_hook = NULL, ++ .kill_hook = NULL, ++ .shift_hook = NULL, ++ .cut_units = NULL, ++ .kill_units = NULL, ++ .unit_key = NULL, ++ .max_unit_key = NULL, ++ .estimate = NULL, ++ .item_data_by_flow = NULL, ++#if REISER4_DEBUG ++ .check = NULL ++#endif ++ }, ++ .f = { ++ .utmost_child = NULL, ++ .utmost_child_real_block = NULL, ++ .update = NULL, ++ .scan = NULL, ++ .convert = NULL ++ }, ++ .s = { ++ .dir = { ++ .extract_key = extract_key_de, ++ .update_key = update_key_de, ++ .extract_name = extract_name_de, ++ .extract_file_type = extract_file_type_de, ++ .add_entry = add_entry_de, ++ .rem_entry = rem_entry_de, ++ .max_name_len = max_name_len_de ++ } ++ } ++ }, ++ [COMPOUND_DIR_ID] = { ++ .h = { ++ .type_id = REISER4_ITEM_PLUGIN_TYPE, ++ .id = COMPOUND_DIR_ID, ++ .pops = &item_plugin_ops, ++ .label = "cde", ++ .desc = "compressed directory entry", ++ .linkage = {NULL, NULL} ++ }, ++ .b = { ++ .item_type = DIR_ENTRY_ITEM_TYPE, ++ .max_key_inside = max_key_inside_cde, ++ .can_contain_key = can_contain_key_cde, ++ .mergeable = mergeable_cde, ++ .nr_units = nr_units_cde, ++ .lookup = lookup_cde, ++ .init = init_cde, ++ .paste = paste_cde, ++ .fast_paste = agree_to_fast_op, ++ .can_shift = can_shift_cde, ++ .copy_units = copy_units_cde, ++ .create_hook = NULL, ++ .kill_hook = NULL, ++ .shift_hook = NULL, ++ .cut_units = cut_units_cde, ++ .kill_units = kill_units_cde, ++ .unit_key = unit_key_cde, ++ .max_unit_key = unit_key_cde, ++ .estimate = estimate_cde, ++ .item_data_by_flow = NULL, ++#if REISER4_DEBUG ++ .check = check_cde ++#endif ++ }, ++ .f = { ++ .utmost_child = NULL, ++ .utmost_child_real_block = NULL, ++ .update = NULL, ++ .scan = NULL, ++ .convert = NULL ++ }, ++ .s = { ++ .dir = { ++ .extract_key = extract_key_cde, ++ .update_key = update_key_cde, ++ .extract_name = extract_name_cde, ++ .extract_file_type = extract_file_type_de, ++ .add_entry = add_entry_cde, ++ .rem_entry = rem_entry_cde, ++ .max_name_len = max_name_len_cde ++ } ++ } ++ }, ++ [NODE_POINTER_ID] = { ++ .h = { ++ .type_id = 
REISER4_ITEM_PLUGIN_TYPE, ++ .id = NODE_POINTER_ID, ++ .pops = NULL, ++ .label = "internal", ++ .desc = "internal item", ++ .linkage = {NULL, NULL} ++ }, ++ .b = { ++ .item_type = INTERNAL_ITEM_TYPE, ++ .max_key_inside = NULL, ++ .can_contain_key = NULL, ++ .mergeable = mergeable_internal, ++ .nr_units = nr_units_single_unit, ++ .lookup = lookup_internal, ++ .init = NULL, ++ .paste = NULL, ++ .fast_paste = NULL, ++ .can_shift = NULL, ++ .copy_units = NULL, ++ .create_hook = create_hook_internal, ++ .kill_hook = kill_hook_internal, ++ .shift_hook = shift_hook_internal, ++ .cut_units = NULL, ++ .kill_units = NULL, ++ .unit_key = NULL, ++ .max_unit_key = NULL, ++ .estimate = NULL, ++ .item_data_by_flow = NULL, ++#if REISER4_DEBUG ++ .check = check__internal ++#endif ++ }, ++ .f = { ++ .utmost_child = utmost_child_internal, ++ .utmost_child_real_block = ++ utmost_child_real_block_internal, ++ .update = update_internal, ++ .scan = NULL, ++ .convert = NULL ++ }, ++ .s = { ++ .internal = { ++ .down_link = down_link_internal, ++ .has_pointer_to = has_pointer_to_internal ++ } ++ } ++ }, ++ [EXTENT_POINTER_ID] = { ++ .h = { ++ .type_id = REISER4_ITEM_PLUGIN_TYPE, ++ .id = EXTENT_POINTER_ID, ++ .pops = NULL, ++ .label = "extent", ++ .desc = "extent item", ++ .linkage = {NULL, NULL} ++ }, ++ .b = { ++ .item_type = UNIX_FILE_METADATA_ITEM_TYPE, ++ .max_key_inside = max_key_inside_extent, ++ .can_contain_key = can_contain_key_extent, ++ .mergeable = mergeable_extent, ++ .nr_units = nr_units_extent, ++ .lookup = lookup_extent, ++ .init = NULL, ++ .paste = paste_extent, ++ .fast_paste = agree_to_fast_op, ++ .can_shift = can_shift_extent, ++ .create_hook = create_hook_extent, ++ .copy_units = copy_units_extent, ++ .kill_hook = kill_hook_extent, ++ .shift_hook = NULL, ++ .cut_units = cut_units_extent, ++ .kill_units = kill_units_extent, ++ .unit_key = unit_key_extent, ++ .max_unit_key = max_unit_key_extent, ++ .estimate = NULL, ++ .item_data_by_flow = NULL, ++#if REISER4_DEBUG ++ .check = check_extent ++#endif ++ }, ++ .f = { ++ .utmost_child = utmost_child_extent, ++ .utmost_child_real_block = ++ utmost_child_real_block_extent, ++ .update = NULL, ++ .scan = scan_extent, ++ .convert = NULL, ++ .key_by_offset = key_by_offset_extent ++ }, ++ .s = { ++ .file = { ++ .write = write_extent, ++ .read = read_extent, ++ .readpage = readpage_extent, ++ .get_block = get_block_address_extent, ++ .readpages = readpages_extent, ++ .append_key = append_key_extent, ++ .init_coord_extension = ++ init_coord_extension_extent ++ } ++ } ++ }, ++ [FORMATTING_ID] = { ++ .h = { ++ .type_id = REISER4_ITEM_PLUGIN_TYPE, ++ .id = FORMATTING_ID, ++ .pops = NULL, ++ .label = "body", ++ .desc = "body (or tail?) 
item", ++ .linkage = {NULL, NULL} ++ }, ++ .b = { ++ .item_type = UNIX_FILE_METADATA_ITEM_TYPE, ++ .max_key_inside = max_key_inside_tail, ++ .can_contain_key = can_contain_key_tail, ++ .mergeable = mergeable_tail, ++ .nr_units = nr_units_tail, ++ .lookup = lookup_tail, ++ .init = NULL, ++ .paste = paste_tail, ++ .fast_paste = agree_to_fast_op, ++ .can_shift = can_shift_tail, ++ .create_hook = NULL, ++ .copy_units = copy_units_tail, ++ .kill_hook = kill_hook_tail, ++ .shift_hook = NULL, ++ .cut_units = cut_units_tail, ++ .kill_units = kill_units_tail, ++ .unit_key = unit_key_tail, ++ .max_unit_key = unit_key_tail, ++ .estimate = NULL, ++ .item_data_by_flow = NULL, ++#if REISER4_DEBUG ++ .check = NULL ++#endif ++ }, ++ .f = { ++ .utmost_child = NULL, ++ .utmost_child_real_block = NULL, ++ .update = NULL, ++ .scan = NULL, ++ .convert = NULL ++ }, ++ .s = { ++ .file = { ++ .write = write_tail, ++ .read = read_tail, ++ .readpage = readpage_tail, ++ .get_block = NULL, ++ .readpages = NULL, ++ .append_key = append_key_tail, ++ .init_coord_extension = ++ init_coord_extension_tail ++ } ++ } ++ }, ++ [CTAIL_ID] = { ++ .h = { ++ .type_id = REISER4_ITEM_PLUGIN_TYPE, ++ .id = CTAIL_ID, ++ .pops = NULL, ++ .label = "ctail", ++ .desc = "cryptcompress tail item", ++ .linkage = {NULL, NULL} ++ }, ++ .b = { ++ .item_type = UNIX_FILE_METADATA_ITEM_TYPE, ++ .max_key_inside = max_key_inside_tail, ++ .can_contain_key = can_contain_key_ctail, ++ .mergeable = mergeable_ctail, ++ .nr_units = nr_units_ctail, ++ .lookup = NULL, ++ .init = init_ctail, ++ .paste = paste_ctail, ++ .fast_paste = agree_to_fast_op, ++ .can_shift = can_shift_ctail, ++ .create_hook = create_hook_ctail, ++ .copy_units = copy_units_ctail, ++ .kill_hook = kill_hook_ctail, ++ .shift_hook = shift_hook_ctail, ++ .cut_units = cut_units_ctail, ++ .kill_units = kill_units_ctail, ++ .unit_key = unit_key_tail, ++ .max_unit_key = unit_key_tail, ++ .estimate = estimate_ctail, ++ .item_data_by_flow = NULL, ++#if REISER4_DEBUG ++ .check = check_ctail ++#endif ++ }, ++ .f = { ++ .utmost_child = utmost_child_ctail, ++ /* FIXME-EDWARD: write this */ ++ .utmost_child_real_block = NULL, ++ .update = NULL, ++ .scan = scan_ctail, ++ .convert = convert_ctail ++ }, ++ .s = { ++ .file = { ++ .write = NULL, ++ .read = read_ctail, ++ .readpage = readpage_ctail, ++ .get_block = get_block_address_tail, ++ .readpages = readpages_ctail, ++ .append_key = append_key_ctail, ++ .init_coord_extension = ++ init_coord_extension_tail ++ } ++ } ++ }, ++ [BLACK_BOX_ID] = { ++ .h = { ++ .type_id = REISER4_ITEM_PLUGIN_TYPE, ++ .id = BLACK_BOX_ID, ++ .pops = NULL, ++ .label = "blackbox", ++ .desc = "black box item", ++ .linkage = {NULL, NULL} ++ }, ++ .b = { ++ .item_type = OTHER_ITEM_TYPE, ++ .max_key_inside = NULL, ++ .can_contain_key = NULL, ++ .mergeable = not_mergeable, ++ .nr_units = nr_units_single_unit, ++ /* to need for ->lookup method */ ++ .lookup = NULL, ++ .init = NULL, ++ .paste = NULL, ++ .fast_paste = NULL, ++ .can_shift = NULL, ++ .copy_units = NULL, ++ .create_hook = NULL, ++ .kill_hook = NULL, ++ .shift_hook = NULL, ++ .cut_units = NULL, ++ .kill_units = NULL, ++ .unit_key = NULL, ++ .max_unit_key = NULL, ++ .estimate = NULL, ++ .item_data_by_flow = NULL, ++#if REISER4_DEBUG ++ .check = NULL ++#endif ++ } ++ } ++}; ++ ++/* Make Linus happy. 
++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +Index: linux-2.6.16/fs/reiser4/plugin/item/item.h +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/plugin/item/item.h +@@ -0,0 +1,399 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++/* first read balance.c comments before reading this */ ++ ++/* An item_plugin implements all of the operations required for ++ balancing that are item specific. */ ++ ++/* an item plugin also implements other operations that are specific to that ++ item. These go into the item specific operations portion of the item ++ handler, and all of the item specific portions of the item handler are put ++ into a union. */ ++ ++#if !defined( __REISER4_ITEM_H__ ) ++#define __REISER4_ITEM_H__ ++ ++#include "../../forward.h" ++#include "../plugin_header.h" ++#include "../../dformat.h" ++#include "../../seal.h" ++#include "../../plugin/file/file.h" ++ ++#include <linux/fs.h> /* for struct file, struct inode */ ++#include <linux/mm.h> /* for struct page */ ++#include <linux/dcache.h> /* for struct dentry */ ++ ++typedef enum { ++ STAT_DATA_ITEM_TYPE, ++ DIR_ENTRY_ITEM_TYPE, ++ INTERNAL_ITEM_TYPE, ++ UNIX_FILE_METADATA_ITEM_TYPE, ++ OTHER_ITEM_TYPE ++} item_type_id; ++ ++/* this is the part of each item plugin that all items are expected to ++ support or at least explicitly fail to support by setting the ++ pointer to null. */ ++typedef struct { ++ item_type_id item_type; ++ ++ /* operations called by balancing ++ ++ It is interesting to consider that some of these item ++ operations could be given sources or targets that are not ++ really items in nodes. This could be ok/useful. ++ ++ */ ++ /* maximal key that can _possibly_ be occupied by this item ++ ++ When inserting, once the node ->lookup() method (called by ++ coord_by_key()) reaches an item after binary search, ++ the ->max_key_inside() item plugin method is used to determine ++ whether the new item should be pasted into the existing item ++ (new_key<=max_key_inside()) or a new item has to be created ++ (new_key>max_key_inside()). ++ ++ For items that occupy exactly one key (like stat-data) ++ this method should return this key. For items that can ++ grow indefinitely (extent, directory item) this should ++ return max_key(). ++ ++ For example, for an extent with the key ++ ++ (LOCALITY,4,OBJID,STARTING-OFFSET), and length BLK blocks, ++ ++ ->max_key_inside is (LOCALITY,4,OBJID,0xffffffffffffffff). ++ */ ++ reiser4_key *(*max_key_inside) (const coord_t *, reiser4_key *); ++ ++ /* true if item @coord can merge data at @key. */ ++ int (*can_contain_key) (const coord_t *, const reiser4_key *, ++ const reiser4_item_data *); ++ /* mergeable() - check items for mergeability ++ ++ Optional method. Returns true if two items can be merged. ++ ++ */ ++ int (*mergeable) (const coord_t *, const coord_t *); ++ ++ /* number of atomic things in an item */ ++ pos_in_node_t(*nr_units) (const coord_t *); ++ ++ /* search within item for a unit within the item, and return a ++ pointer to it. This can be used to calculate how many ++ bytes to shrink an item if you use pointer arithmetic and ++ compare to the start of the item body, provided the item's ++ data are continuous in the node; if the item's data are not ++ continuous in the node, all sorts of other things are maybe ++ going to break as well. 
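++ ++ As a purely illustrative sketch (assumed here, not the method of any real item plugin in this patch): an item whose units are fixed-size records sorted by key offset could implement this as ++ ++ unit = (get_key_offset(key) - get_key_offset(&min_key)) / unit_size; ++ coord->unit_pos = unit; ++ return (unit < nr_units) ? CBK_COORD_FOUND : CBK_COORD_NOTFOUND; ++ ++ where min_key, unit_size and nr_units stand for assumed helpers; items with variable-size units have to search their unit headers instead. 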
*/ ++ lookup_result(*lookup) (const reiser4_key *, lookup_bias, coord_t *); ++ /* method called by node_plugin->create_item() to initialise new ++ item */ ++ int (*init) (coord_t * target, coord_t * from, ++ reiser4_item_data * data); ++ /* method called (e.g., by resize_item()) to place new data into ++ item when it grows */ ++ int (*paste) (coord_t *, reiser4_item_data *, carry_plugin_info *); ++ /* return true if paste into @coord is allowed to skip ++ carry. That is, if such a paste would not require any changes ++ at the parent level ++ */ ++ int (*fast_paste) (const coord_t *); ++ /* how many, but not more than @want, units of @source can be ++ shifted into @target node. If pend == append - we try to ++ append last item of @target by first units of @source. If ++ pend == prepend - we try to "prepend" first item in @target ++ by last units of @source. @target node has @free_space ++ bytes of free space. Total size of those units is returned ++ via @size. ++ ++ @target is not NULL if shifting to a mergeable item, and ++ NULL if a new item will be created during shifting. ++ */ ++ int (*can_shift) (unsigned free_space, coord_t *, ++ znode *, shift_direction, unsigned *size, ++ unsigned want); ++ ++ /* starting from the @from-th unit of item @source, append or ++ prepend @count units to @target. @target has already been ++ expanded by @free_space bytes. That must be exactly what is ++ needed for those items in @target. If @where_is_free_space ++ == SHIFT_LEFT - free space is at the end of @target item, ++ otherwise - it is in the beginning of it. */ ++ void (*copy_units) (coord_t *, coord_t *, ++ unsigned from, unsigned count, ++ shift_direction where_is_free_space, ++ unsigned free_space); ++ ++ int (*create_hook) (const coord_t *, void *); ++ /* do whatever is necessary to do when @count units starting ++ from @from-th one are removed from the tree */ ++ /* FIXME-VS: this used to be here for, in particular, ++ extents and items of internal type to free blocks they point ++ to at the same time as removing items from the ++ tree. Problems start, however, when dealloc_block fails due ++ to some reason. Item gets removed, but blocks it pointed to ++ are not freed. It is not clear how to fix this for items of ++ internal type because a need to remove internal item may ++ appear in the middle of balancing, and there is no way to ++ undo changes made. OTOH, if space allocator involves ++ balancing to perform dealloc_block - this will probably ++ break balancing due to deadlock issues ++ */ ++ int (*kill_hook) (const coord_t *, pos_in_node_t from, ++ pos_in_node_t count, struct carry_kill_data *); ++ int (*shift_hook) (const coord_t *, unsigned from, unsigned count, ++ znode * _node); ++ ++ /* unit @*from contains @from_key. unit @*to contains @to_key. Cut all keys between @from_key and @to_key ++ including boundaries. When units are cut from item beginning - move space which gets freed to head of ++ item. When units are cut from item end - move freed space to item end. When units are cut from the middle of ++ item - move freed space to item head. Return amount of space which got freed. Save smallest removed key in ++ @smallest_removed if it is not 0. 
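++ (A worked example of the spec above: cutting units 1..2 out of a 5-unit item keeps units 0, 3 and 4; the space freed in the middle is moved towards the item head, and the key of former unit 1 is what gets saved through @smallest_removed.) 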
Save new first item key in @new_first_key if it is not 0. ++ */ ++ int (*cut_units) (coord_t *, pos_in_node_t from, pos_in_node_t to, ++ struct carry_cut_data *, ++ reiser4_key * smallest_removed, ++ reiser4_key * new_first_key); ++ ++ /* like cut_units, except that these units are removed from the ++ tree, not only from a node */ ++ int (*kill_units) (coord_t *, pos_in_node_t from, pos_in_node_t to, ++ struct carry_kill_data *, ++ reiser4_key * smallest_removed, ++ reiser4_key * new_first); ++ ++ /* if @key_of_coord == 1, the key of coord is returned; otherwise ++ the key of the unit is returned. If @coord is not set to a certain ++ unit - ERR_PTR(-ENOENT) is returned */ ++ reiser4_key *(*unit_key) (const coord_t *, reiser4_key *); ++ reiser4_key *(*max_unit_key) (const coord_t *, reiser4_key *); ++ /* estimate how much space is needed to paste @data into item at ++ @coord. If @coord==0 - estimate insertion, otherwise - estimate ++ pasting ++ */ ++ int (*estimate) (const coord_t *, const reiser4_item_data *); ++ ++ /* converts flow @f to item data. @coord == 0 on insert */ ++ int (*item_data_by_flow) (const coord_t *, const flow_t *, ++ reiser4_item_data *); ++ ++ /*void (*show) (struct seq_file *, coord_t *); */ ++ ++#if REISER4_DEBUG ++ /* used for debugging, every item should have here the most ++ complete possible check of the consistency of the item that ++ the inventor can construct */ ++ int (*check) (const coord_t *, const char **error); ++#endif ++ ++} balance_ops; ++ ++typedef struct { ++ /* return the right or left child of @coord, only if it is in memory */ ++ int (*utmost_child) (const coord_t *, sideof side, jnode ** child); ++ ++ /* return whether the right or left child of @coord has a non-fake ++ block number. */ ++ int (*utmost_child_real_block) (const coord_t *, sideof side, ++ reiser4_block_nr *); ++ /* relocate child at @coord to the @block */ ++ void (*update) (const coord_t *, const reiser4_block_nr *); ++ /* count unformatted nodes per item for the leaf relocation policy, etc. */ ++ int (*scan) (flush_scan * scan); ++ /* convert item by flush */ ++ int (*convert) (flush_pos_t * pos); ++ /* backward mapping from jnode offset to a key. */ ++ int (*key_by_offset) (struct inode *, loff_t, reiser4_key *); ++} flush_ops; ++ ++/* operations specific to the directory item */ ++typedef struct { ++ /* extract stat-data key from directory entry at @coord and place it ++ into @key. */ ++ int (*extract_key) (const coord_t *, reiser4_key * key); ++ /* update object key in item. 
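++ (One assumed use case, not spelled out in this header: re-pointing an entry at a different stat-data key, e.g. the ".." entry of a directory that is moved to a new parent.) 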
*/ ++ int (*update_key) (const coord_t *, const reiser4_key *, lock_handle *); ++ /* extract name from directory entry at @coord and return it */ ++ char *(*extract_name) (const coord_t *, char *buf); ++ /* extract file type (DT_* stuff) from directory entry at @coord and ++ return it */ ++ unsigned (*extract_file_type) (const coord_t *); ++ int (*add_entry) (struct inode * dir, ++ coord_t *, lock_handle *, ++ const struct dentry * name, ++ reiser4_dir_entry_desc * entry); ++ int (*rem_entry) (struct inode * dir, const struct qstr * name, ++ coord_t *, lock_handle *, ++ reiser4_dir_entry_desc * entry); ++ int (*max_name_len) (const struct inode * dir); ++} dir_entry_ops; ++ ++/* operations specific to items regular (unix) file metadata are built of */ ++typedef struct { ++ int (*write) (struct file *, const char __user *, size_t, loff_t *pos); ++ int (*read) (struct file *, flow_t *, hint_t *); ++ int (*readpage) (void *, struct page *); ++ int (*get_block) (const coord_t *, sector_t, sector_t *); ++ void (*readpages) (void *, struct address_space *, ++ struct list_head * pages); ++ /* ++ * key of first byte which is not addressed by the item @coord is set ++ * to. ++ * For example, for extent item with the key ++ * ++ * (LOCALITY,4,OBJID,STARTING-OFFSET), and length BLK blocks, ++ * ++ * ->append_key is ++ * ++ * (LOCALITY,4,OBJID,STARTING-OFFSET + BLK * block_size) ++ */ ++ reiser4_key *(*append_key) (const coord_t *, reiser4_key *); ++ ++ void (*init_coord_extension) (uf_coord_t *, loff_t); ++} file_ops; ++ ++/* operations specific to items of stat data type */ ++typedef struct { ++ int (*init_inode) (struct inode * inode, char *sd, int len); ++ int (*save_len) (struct inode * inode); ++ int (*save) (struct inode * inode, char **area); ++} sd_ops; ++ ++/* operations specific to internal item */ ++typedef struct { ++ /* all tree traversal want to know from internal item is where ++ to go next. */ ++ void (*down_link) (const coord_t * coord, ++ const reiser4_key * key, reiser4_block_nr * block); ++ /* check that given internal item contains given pointer. 
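++ (Presumably useful for consistency assertions and for re-finding a child's position in its parent; an inference, the callers are not shown in this patch.) 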
*/ ++ int (*has_pointer_to) (const coord_t * coord, ++ const reiser4_block_nr * block); ++} internal_item_ops; ++ ++struct item_plugin { ++ /* generic fields */ ++ plugin_header h; ++ ++ /* methods common for all item types */ ++ balance_ops b; ++ /* methods used during flush */ ++ flush_ops f; ++ ++ /* methods specific to particular type of item */ ++ union { ++ dir_entry_ops dir; ++ file_ops file; ++ sd_ops sd; ++ internal_item_ops internal; ++ } s; ++ ++}; ++ ++static inline item_id item_id_by_plugin(item_plugin * plugin) ++{ ++ return plugin->h.id; ++} ++ ++static inline char get_iplugid(item_plugin * iplug) ++{ ++ assert("nikita-2838", iplug != NULL); ++ assert("nikita-2839", iplug->h.id < 0xff); ++ return (char)item_id_by_plugin(iplug); ++} ++ ++extern unsigned long znode_times_locked(const znode * z); ++ ++static inline void coord_set_iplug(coord_t * coord, item_plugin * iplug) ++{ ++ assert("nikita-2837", coord != NULL); ++ assert("nikita-2838", iplug != NULL); ++ coord->iplugid = get_iplugid(iplug); ++ ON_DEBUG(coord->plug_v = znode_times_locked(coord->node)); ++} ++ ++static inline item_plugin *coord_iplug(const coord_t * coord) ++{ ++ assert("nikita-2833", coord != NULL); ++ assert("nikita-2834", coord->iplugid != INVALID_PLUGID); ++ assert("nikita-3549", coord->plug_v == znode_times_locked(coord->node)); ++ return (item_plugin *) plugin_by_id(REISER4_ITEM_PLUGIN_TYPE, ++ coord->iplugid); ++} ++ ++extern int item_can_contain_key(const coord_t * item, const reiser4_key * key, ++ const reiser4_item_data *); ++extern int are_items_mergeable(const coord_t * i1, const coord_t * i2); ++extern int item_is_extent(const coord_t *); ++extern int item_is_tail(const coord_t *); ++extern int item_is_statdata(const coord_t * item); ++extern int item_is_ctail(const coord_t *); ++ ++extern pos_in_node_t item_length_by_coord(const coord_t * coord); ++extern item_type_id item_type_by_coord(const coord_t * coord); ++extern item_id item_id_by_coord(const coord_t * coord /* coord to query */ ); ++extern reiser4_key *item_key_by_coord(const coord_t * coord, reiser4_key * key); ++extern reiser4_key *max_item_key_by_coord(const coord_t *, reiser4_key *); ++extern reiser4_key *unit_key_by_coord(const coord_t * coord, reiser4_key * key); ++extern reiser4_key *max_unit_key_by_coord(const coord_t * coord, ++ reiser4_key * key); ++ ++extern void obtain_item_plugin(const coord_t * coord); ++ ++#if defined(REISER4_DEBUG) ++extern int znode_is_loaded(const znode * node); ++#endif ++ ++/* return plugin of item at @coord */ ++static inline item_plugin *item_plugin_by_coord(const coord_t * ++ coord /* coord to query */ ) ++{ ++ assert("nikita-330", coord != NULL); ++ assert("nikita-331", coord->node != NULL); ++ assert("nikita-332", znode_is_loaded(coord->node)); ++ ++ if (unlikely(!coord_is_iplug_set(coord))) ++ obtain_item_plugin(coord); ++ return coord_iplug(coord); ++} ++ ++/* this returns true if item is of internal type */ ++static inline int item_is_internal(const coord_t * item) ++{ ++ assert("vs-483", coord_is_existing_item(item)); ++ return item_type_by_coord(item) == INTERNAL_ITEM_TYPE; ++} ++ ++extern void item_body_by_coord_hard(coord_t * coord); ++extern void *item_body_by_coord_easy(const coord_t * coord); ++#if REISER4_DEBUG ++extern int item_body_is_valid(const coord_t * coord); ++#endif ++ ++/* return pointer to item body */ ++static inline void *item_body_by_coord(const coord_t * ++ coord /* coord to query */ ) ++{ ++ assert("nikita-324", coord != NULL); ++ assert("nikita-325", coord->node != 
NULL); ++ assert("nikita-326", znode_is_loaded(coord->node)); ++ ++ if (coord->offset == INVALID_OFFSET) ++ item_body_by_coord_hard((coord_t *) coord); ++ assert("nikita-3201", item_body_is_valid(coord)); ++ assert("nikita-3550", coord->body_v == znode_times_locked(coord->node)); ++ return item_body_by_coord_easy(coord); ++} ++ ++/* __REISER4_ITEM_H__ */ ++#endif ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ scroll-step: 1 ++ End: ++*/ +Index: linux-2.6.16/fs/reiser4/plugin/item/sde.c +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/plugin/item/sde.c +@@ -0,0 +1,190 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++/* Directory entry implementation */ ++#include "../../forward.h" ++#include "../../debug.h" ++#include "../../dformat.h" ++#include "../../kassign.h" ++#include "../../coord.h" ++#include "sde.h" ++#include "item.h" ++#include "../plugin.h" ++#include "../../znode.h" ++#include "../../carry.h" ++#include "../../tree.h" ++#include "../../inode.h" ++ ++#include <linux/fs.h> /* for struct inode */ ++#include <linux/dcache.h> /* for struct dentry */ ++#include <linux/quotaops.h> ++ ++/* ->extract_key() method of simple directory item plugin. */ ++int extract_key_de(const coord_t * coord /* coord of item */ , ++ reiser4_key * key /* resulting key */ ) ++{ ++ directory_entry_format *dent; ++ ++ assert("nikita-1458", coord != NULL); ++ assert("nikita-1459", key != NULL); ++ ++ dent = (directory_entry_format *) item_body_by_coord(coord); ++ assert("nikita-1158", item_length_by_coord(coord) >= (int)sizeof *dent); ++ return extract_key_from_id(&dent->id, key); ++} ++ ++int ++update_key_de(const coord_t * coord, const reiser4_key * key, ++ lock_handle * lh UNUSED_ARG) ++{ ++ directory_entry_format *dent; ++ obj_key_id obj_id; ++ int result; ++ ++ assert("nikita-2342", coord != NULL); ++ assert("nikita-2343", key != NULL); ++ ++ dent = (directory_entry_format *) item_body_by_coord(coord); ++ result = build_obj_key_id(key, &obj_id); ++ if (result == 0) { ++ dent->id = obj_id; ++ znode_make_dirty(coord->node); ++ } ++ return 0; ++} ++ ++char *extract_dent_name(const coord_t * coord, directory_entry_format * dent, ++ char *buf) ++{ ++ reiser4_key key; ++ ++ unit_key_by_coord(coord, &key); ++ if (get_key_type(&key) != KEY_FILE_NAME_MINOR) ++ reiser4_print_address("oops", znode_get_block(coord->node)); ++ if (!is_longname_key(&key)) { ++ if (is_dot_key(&key)) ++ return (char *)"."; ++ else ++ return extract_name_from_key(&key, buf); ++ } else ++ return (char *)dent->name; ++} ++ ++/* ->extract_name() method of simple directory item plugin. */ ++char *extract_name_de(const coord_t * coord /* coord of item */ , char *buf) ++{ ++ directory_entry_format *dent; ++ ++ assert("nikita-1460", coord != NULL); ++ ++ dent = (directory_entry_format *) item_body_by_coord(coord); ++ return extract_dent_name(coord, dent, buf); ++} ++ ++/* ->extract_file_type() method of simple directory item plugin. */ ++unsigned extract_file_type_de(const coord_t * coord UNUSED_ARG /* coord of ++ * item */ ) ++{ ++ assert("nikita-1764", coord != NULL); ++ /* we don't store file type in the directory entry yet. 
++ ++ But see comments at kassign.h:obj_key_id ++ */ ++ return DT_UNKNOWN; ++} ++ ++int add_entry_de(struct inode *dir /* directory of item */ , ++ coord_t * coord /* coord of item */ , ++ lock_handle * lh /* insertion lock handle */ , ++ const struct dentry *de /* name to add */ , ++ reiser4_dir_entry_desc * entry /* parameters of new directory ++ * entry */ ) ++{ ++ reiser4_item_data data; ++ directory_entry_format *dent; ++ int result; ++ const char *name; ++ int len; ++ int longname; ++ ++ name = de->d_name.name; ++ len = de->d_name.len; ++ assert("nikita-1163", strlen(name) == len); ++ ++ longname = is_longname(name, len); ++ ++ data.length = sizeof *dent; ++ if (longname) ++ data.length += len + 1; ++ data.data = NULL; ++ data.user = 0; ++ data.iplug = item_plugin_by_id(SIMPLE_DIR_ENTRY_ID); ++ ++ /* NOTE-NIKITA quota plugin */ ++ if (DQUOT_ALLOC_SPACE_NODIRTY(dir, data.length)) ++ return -EDQUOT; ++ ++ result = insert_by_coord(coord, &data, &entry->key, lh, 0 /*flags */ ); ++ if (result != 0) ++ return result; ++ ++ dent = (directory_entry_format *) item_body_by_coord(coord); ++ build_inode_key_id(entry->obj, &dent->id); ++ if (longname) { ++ memcpy(dent->name, name, len); ++ put_unaligned(0, &dent->name[len]); ++ } ++ return 0; ++} ++ ++int rem_entry_de(struct inode *dir /* directory of item */ , ++ const struct qstr *name UNUSED_ARG, ++ coord_t * coord /* coord of item */ , ++ lock_handle * lh UNUSED_ARG /* lock handle for ++ * removal */ , ++ reiser4_dir_entry_desc * entry UNUSED_ARG /* parameters of ++ * directory entry ++ * being removed */ ) ++{ ++ coord_t shadow; ++ int result; ++ int length; ++ ++ length = item_length_by_coord(coord); ++ if (inode_get_bytes(dir) < length) { ++ warning("nikita-2627", "Dir is broken: %llu: %llu", ++ (unsigned long long)get_inode_oid(dir), ++ inode_get_bytes(dir)); ++ ++ return RETERR(-EIO); ++ } ++ ++ /* cut_node() is supposed to take pointers to _different_ ++ coords, because it will modify them without respect to ++ possible aliasing. To work around this, create temporary copy ++ of @coord. ++ */ ++ coord_dup(&shadow, coord); ++ result = ++ kill_node_content(coord, &shadow, NULL, NULL, NULL, NULL, NULL, 0); ++ if (result == 0) { ++ /* NOTE-NIKITA quota plugin */ ++ DQUOT_FREE_SPACE_NODIRTY(dir, length); ++ } ++ return result; ++} ++ ++int max_name_len_de(const struct inode *dir) ++{ ++ return tree_by_inode(dir)->nplug->max_item_size() - ++ sizeof(directory_entry_format) - 2; ++} ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +Index: linux-2.6.16/fs/reiser4/plugin/item/sde.h +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/plugin/item/sde.h +@@ -0,0 +1,66 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++/* Directory entry. */ ++ ++#if !defined( __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ ) ++#define __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ ++ ++#include "../../forward.h" ++#include "../../dformat.h" ++#include "../../kassign.h" ++#include "../../key.h" ++ ++#include <linux/fs.h> ++#include <linux/dcache.h> /* for struct dentry */ ++ ++typedef struct directory_entry_format { ++ /* key of object stat-data. It's not necessary to store whole ++ key here, because it's always key of stat-data, so minor ++ packing locality and offset can be omitted here. 
But this ++ relies on particular key allocation scheme for stat-data, so, ++ for extensibility sake, whole key can be stored here. ++ ++ We store key as array of bytes, because we don't want 8-byte ++ alignment of dir entries. ++ */ ++ obj_key_id id; ++ /* file name. Null terminated string. */ ++ d8 name[0]; ++} directory_entry_format; ++ ++void print_de(const char *prefix, coord_t * coord); ++int extract_key_de(const coord_t * coord, reiser4_key * key); ++int update_key_de(const coord_t * coord, const reiser4_key * key, ++ lock_handle * lh); ++char *extract_name_de(const coord_t * coord, char *buf); ++unsigned extract_file_type_de(const coord_t * coord); ++int add_entry_de(struct inode *dir, coord_t * coord, ++ lock_handle * lh, const struct dentry *name, ++ reiser4_dir_entry_desc * entry); ++int rem_entry_de(struct inode *dir, const struct qstr *name, coord_t * coord, ++ lock_handle * lh, reiser4_dir_entry_desc * entry); ++int max_name_len_de(const struct inode *dir); ++ ++int de_rem_and_shrink(struct inode *dir, coord_t * coord, int length); ++ ++char *extract_dent_name(const coord_t * coord, ++ directory_entry_format * dent, char *buf); ++ ++#if REISER4_LARGE_KEY ++#define DE_NAME_BUF_LEN (24) ++#else ++#define DE_NAME_BUF_LEN (16) ++#endif ++ ++/* __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ */ ++#endif ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +Index: linux-2.6.16/fs/reiser4/plugin/item/static_stat.c +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/plugin/item/static_stat.c +@@ -0,0 +1,1040 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++/* stat data manipulation. */ ++ ++#include "../../forward.h" ++#include "../../super.h" ++#include "../../vfs_ops.h" ++#include "../../inode.h" ++#include "../../debug.h" ++#include "../../dformat.h" ++#include "../object.h" ++#include "../plugin.h" ++#include "../plugin_header.h" ++#include "static_stat.h" ++#include "item.h" ++ ++#include ++#include ++ ++/* see static_stat.h for explanation */ ++ ++/* helper function used while we are dumping/loading inode/plugin state ++ to/from the stat-data. */ ++ ++static void move_on(int *length /* space remaining in stat-data */ , ++ char **area /* current coord in stat data */ , ++ int size_of /* how many bytes to move forward */ ) ++{ ++ assert("nikita-615", length != NULL); ++ assert("nikita-616", area != NULL); ++ ++ *length -= size_of; ++ *area += size_of; ++ ++ assert("nikita-617", *length >= 0); ++} ++ ++/* helper function used while loading inode/plugin state from stat-data. ++ Complain if there is less space in stat-data than was expected. ++ Can only happen on disk corruption. */ ++static int not_enough_space(struct inode *inode /* object being processed */ , ++ const char *where /* error message */ ) ++{ ++ assert("nikita-618", inode != NULL); ++ ++ warning("nikita-619", "Not enough space in %llu while loading %s", ++ (unsigned long long)get_inode_oid(inode), where); ++ ++ return RETERR(-EINVAL); ++} ++ ++/* helper function used while loading inode/plugin state from ++ stat-data. Call it if invalid plugin id was found. 
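++ One scenario (assumed) where this can be reached: media written by a kernel that supports additional plugins being read by a kernel that lacks them; any unrecognized id ends up here. 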
*/ ++static int unknown_plugin(reiser4_plugin_id id /* invalid id */ , ++ struct inode *inode /* object being processed */ ) ++{ ++ warning("nikita-620", "Unknown plugin %i in %llu", ++ id, (unsigned long long)get_inode_oid(inode)); ++ ++ return RETERR(-EINVAL); ++} ++ ++/* this is installed as ->init_inode() method of ++ item_plugins[ STATIC_STAT_DATA_IT ] (fs/reiser4/plugin/item/item.c). ++ Copies data from on-disk stat-data format into inode. ++ Handles stat-data extensions. */ ++/* was sd_load */ ++int init_inode_static_sd(struct inode *inode /* object being processed */ , ++ char *sd /* stat-data body */ , ++ int len /* length of stat-data */ ) ++{ ++ int result; ++ int bit; ++ int chunk; ++ __u16 mask; ++ __u64 bigmask; ++ reiser4_stat_data_base *sd_base; ++ reiser4_inode *state; ++ ++ assert("nikita-625", inode != NULL); ++ assert("nikita-626", sd != NULL); ++ ++ result = 0; ++ sd_base = (reiser4_stat_data_base *) sd; ++ state = reiser4_inode_data(inode); ++ mask = le16_to_cpu(get_unaligned(&sd_base->extmask)); ++ bigmask = mask; ++ inode_set_flag(inode, REISER4_SDLEN_KNOWN); ++ ++ move_on(&len, &sd, sizeof *sd_base); ++ for (bit = 0, chunk = 0; ++ mask != 0 || bit <= LAST_IMPORTANT_SD_EXTENSION; ++ ++bit, mask >>= 1) { ++ if (((bit + 1) % 16) != 0) { ++ /* handle extension */ ++ sd_ext_plugin *sdplug; ++ ++ if (bit >= LAST_SD_EXTENSION) { ++ warning("vpf-1904", ++ "No such extension %i in inode %llu", ++ bit, ++ (unsigned long long) ++ get_inode_oid(inode)); ++ ++ result = RETERR(-EINVAL); ++ break; ++ } ++ ++ sdplug = sd_ext_plugin_by_id(bit); ++ if (sdplug == NULL) { ++ warning("nikita-627", ++ "No such extension %i in inode %llu", ++ bit, ++ (unsigned long long) ++ get_inode_oid(inode)); ++ ++ result = RETERR(-EINVAL); ++ break; ++ } ++ if (mask & 1) { ++ assert("nikita-628", sdplug->present); ++ /* alignment is not supported in node layout ++ plugin yet. ++ result = align( inode, &len, &sd, ++ sdplug -> alignment ); ++ if( result != 0 ) ++ return result; */ ++ result = sdplug->present(inode, &sd, &len); ++ } else if (sdplug->absent != NULL) ++ result = sdplug->absent(inode); ++ if (result) ++ break; ++ /* else, we are looking at the last bit in 16-bit ++ portion of bitmask */ ++ } else if (mask & 1) { ++ /* next portion of bitmask */ ++ if (len < (int)sizeof(d16)) { ++ warning("nikita-629", ++ "No space for bitmap in inode %llu", ++ (unsigned long long) ++ get_inode_oid(inode)); ++ ++ result = RETERR(-EINVAL); ++ break; ++ } ++ mask = le16_to_cpu(get_unaligned((d16 *)sd)); ++ bigmask <<= 16; ++ bigmask |= mask; ++ move_on(&len, &sd, sizeof(d16)); ++ ++chunk; ++ if (chunk == 3) { ++ if (!(mask & 0x8000)) { ++ /* clear last bit */ ++ mask &= ~0x8000; ++ continue; ++ } ++ /* too much */ ++ warning("nikita-630", ++ "Too many extensions in %llu", ++ (unsigned long long) ++ get_inode_oid(inode)); ++ ++ result = RETERR(-EINVAL); ++ break; ++ } ++ } else ++ /* bitmask exhausted */ ++ break; ++ } ++ state->extmask = bigmask; ++ /* common initialisations */ ++ inode->i_blksize = get_super_private(inode->i_sb)->optimal_io_size; ++ if (len - (bit / 16 * sizeof(d16)) > 0) { ++ /* alignment in save_len_static_sd() is taken into account ++ -edward */ ++ warning("nikita-631", "unused space in inode %llu", ++ (unsigned long long)get_inode_oid(inode)); ++ } ++ ++ return result; ++} ++ ++/* estimates size of stat-data required to store inode. ++ Installed as ->save_len() method of ++ item_plugins[ STATIC_STAT_DATA_IT ] (fs/reiser4/plugin/item/item.c). 
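++ ++ A worked example under assumed extensions: for an inode whose extmask has only LIGHT_WEIGHT_STAT (bit 0) and UNIX_STAT (bit 1) set, the loop below yields ++ ++ sizeof(reiser4_stat_data_base) ++ + sizeof(reiser4_light_weight_stat) ++ + sizeof(reiser4_unix_stat) ++ ++ and no extra d16 words are added, since fewer than 16 mask bits are in use. 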
*/ ++/* was sd_len */ ++int save_len_static_sd(struct inode *inode /* object being processed */ ) ++{ ++ unsigned int result; ++ __u64 mask; ++ int bit; ++ ++ assert("nikita-632", inode != NULL); ++ ++ result = sizeof(reiser4_stat_data_base); ++ mask = reiser4_inode_data(inode)->extmask; ++ for (bit = 0; mask != 0; ++bit, mask >>= 1) { ++ if (mask & 1) { ++ sd_ext_plugin *sdplug; ++ ++ sdplug = sd_ext_plugin_by_id(bit); ++ assert("nikita-633", sdplug != NULL); ++ /* no aligment support ++ result += ++ round_up( result, sdplug -> alignment ) - result; */ ++ result += sdplug->save_len(inode); ++ } ++ } ++ result += bit / 16 * sizeof(d16); ++ return result; ++} ++ ++/* saves inode into stat-data. ++ Installed as ->save() method of ++ item_plugins[ STATIC_STAT_DATA_IT ] (fs/reiser4/plugin/item/item.c). */ ++/* was sd_save */ ++int save_static_sd(struct inode *inode /* object being processed */ , ++ char **area /* where to save stat-data */ ) ++{ ++ int result; ++ __u64 emask; ++ int bit; ++ unsigned int len; ++ reiser4_stat_data_base *sd_base; ++ ++ assert("nikita-634", inode != NULL); ++ assert("nikita-635", area != NULL); ++ ++ result = 0; ++ emask = reiser4_inode_data(inode)->extmask; ++ sd_base = (reiser4_stat_data_base *) * area; ++ put_unaligned(cpu_to_le16((__u16)(emask & 0xffff)), &sd_base->extmask); ++ /*cputod16((unsigned)(emask & 0xffff), &sd_base->extmask);*/ ++ ++ *area += sizeof *sd_base; ++ len = 0xffffffffu; ++ for (bit = 0; emask != 0; ++bit, emask >>= 1) { ++ if (emask & 1) { ++ if ((bit + 1) % 16 != 0) { ++ sd_ext_plugin *sdplug; ++ sdplug = sd_ext_plugin_by_id(bit); ++ assert("nikita-636", sdplug != NULL); ++ /* no alignment support yet ++ align( inode, &len, area, ++ sdplug -> alignment ); */ ++ result = sdplug->save(inode, area); ++ if (result) ++ break; ++ } else { ++ put_unaligned(cpu_to_le16((__u16)(emask & 0xffff)), ++ (d16 *)(*area)); ++ /*cputod16((unsigned)(emask & 0xffff), ++ (d16 *) * area);*/ ++ *area += sizeof(d16); ++ } ++ } ++ } ++ return result; ++} ++ ++/* stat-data extension handling functions. */ ++ ++static int present_lw_sd(struct inode *inode /* object being processed */ , ++ char **area /* position in stat-data */ , ++ int *len /* remaining length */ ) ++{ ++ if (*len >= (int)sizeof(reiser4_light_weight_stat)) { ++ reiser4_light_weight_stat *sd_lw; ++ ++ sd_lw = (reiser4_light_weight_stat *) * area; ++ ++ inode->i_mode = le16_to_cpu(get_unaligned(&sd_lw->mode)); ++ inode->i_nlink = le32_to_cpu(get_unaligned(&sd_lw->nlink)); ++ inode->i_size = le64_to_cpu(get_unaligned(&sd_lw->size)); ++ if ((inode->i_mode & S_IFMT) == (S_IFREG | S_IFIFO)) { ++ inode->i_mode &= ~S_IFIFO; ++ warning("", "partially converted file is encountered"); ++ inode_set_flag(inode, REISER4_PART_MIXED); ++ } ++ move_on(len, area, sizeof *sd_lw); ++ return 0; ++ } else ++ return not_enough_space(inode, "lw sd"); ++} ++ ++static int save_len_lw_sd(struct inode *inode UNUSED_ARG /* object being ++ * processed */ ) ++{ ++ return sizeof(reiser4_light_weight_stat); ++} ++ ++static int save_lw_sd(struct inode *inode /* object being processed */ , ++ char **area /* position in stat-data */ ) ++{ ++ reiser4_light_weight_stat *sd; ++ mode_t delta; ++ ++ assert("nikita-2705", inode != NULL); ++ assert("nikita-2706", area != NULL); ++ assert("nikita-2707", *area != NULL); ++ ++ sd = (reiser4_light_weight_stat *) * area; ++ ++ delta = (inode_get_flag(inode, REISER4_PART_MIXED) ? 
S_IFIFO : 0); ++ put_unaligned(cpu_to_le16(inode->i_mode | delta), &sd->mode); ++ put_unaligned(cpu_to_le32(inode->i_nlink), &sd->nlink); ++ put_unaligned(cpu_to_le64((__u64) inode->i_size), &sd->size); ++ *area += sizeof *sd; ++ return 0; ++} ++ ++static int present_unix_sd(struct inode *inode /* object being processed */ , ++ char **area /* position in stat-data */ , ++ int *len /* remaining length */ ) ++{ ++ assert("nikita-637", inode != NULL); ++ assert("nikita-638", area != NULL); ++ assert("nikita-639", *area != NULL); ++ assert("nikita-640", len != NULL); ++ assert("nikita-641", *len > 0); ++ ++ if (*len >= (int)sizeof(reiser4_unix_stat)) { ++ reiser4_unix_stat *sd; ++ ++ sd = (reiser4_unix_stat *) * area; ++ ++ inode->i_uid = le32_to_cpu(get_unaligned(&sd->uid)); ++ inode->i_gid = le32_to_cpu(get_unaligned(&sd->gid)); ++ inode->i_atime.tv_sec = le32_to_cpu(get_unaligned(&sd->atime)); ++ inode->i_mtime.tv_sec = le32_to_cpu(get_unaligned(&sd->mtime)); ++ inode->i_ctime.tv_sec = le32_to_cpu(get_unaligned(&sd->ctime)); ++ if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode)) ++ inode->i_rdev = le64_to_cpu(get_unaligned(&sd->u.rdev)); ++ else ++ inode_set_bytes(inode, (loff_t) le64_to_cpu(get_unaligned(&sd->u.bytes))); ++ move_on(len, area, sizeof *sd); ++ return 0; ++ } else ++ return not_enough_space(inode, "unix sd"); ++} ++ ++static int absent_unix_sd(struct inode *inode /* object being processed */ ) ++{ ++ inode->i_uid = get_super_private(inode->i_sb)->default_uid; ++ inode->i_gid = get_super_private(inode->i_sb)->default_gid; ++ inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; ++ inode_set_bytes(inode, inode->i_size); ++ /* mark inode as lightweight, so that caller (reiser4_lookup) will ++ complete initialisation by copying [ug]id from a parent. 
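++ In other words, a stat-data without the UNIX_STAT extension stores no owner of its own; ownership is taken from wherever the file is looked up from. 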
*/ ++ inode_set_flag(inode, REISER4_LIGHT_WEIGHT); ++ return 0; ++} ++ ++/* Audited by: green(2002.06.14) */ ++static int save_len_unix_sd(struct inode *inode UNUSED_ARG /* object being ++ * processed */ ) ++{ ++ return sizeof(reiser4_unix_stat); ++} ++ ++static int save_unix_sd(struct inode *inode /* object being processed */ , ++ char **area /* position in stat-data */ ) ++{ ++ reiser4_unix_stat *sd; ++ ++ assert("nikita-642", inode != NULL); ++ assert("nikita-643", area != NULL); ++ assert("nikita-644", *area != NULL); ++ ++ sd = (reiser4_unix_stat *) * area; ++ put_unaligned(cpu_to_le32(inode->i_uid), &sd->uid); ++ put_unaligned(cpu_to_le32(inode->i_gid), &sd->gid); ++ put_unaligned(cpu_to_le32((__u32) inode->i_atime.tv_sec), &sd->atime); ++ put_unaligned(cpu_to_le32((__u32) inode->i_ctime.tv_sec), &sd->ctime); ++ put_unaligned(cpu_to_le32((__u32) inode->i_mtime.tv_sec), &sd->mtime); ++ if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode)) ++ put_unaligned(cpu_to_le64(inode->i_rdev), &sd->u.rdev); ++ else ++ put_unaligned(cpu_to_le64((__u64) inode_get_bytes(inode)), &sd->u.bytes); ++ *area += sizeof *sd; ++ return 0; ++} ++ ++static int ++present_large_times_sd(struct inode *inode /* object being processed */ , ++ char **area /* position in stat-data */ , ++ int *len /* remaining length */ ) ++{ ++ if (*len >= (int)sizeof(reiser4_large_times_stat)) { ++ reiser4_large_times_stat *sd_lt; ++ ++ sd_lt = (reiser4_large_times_stat *) * area; ++ ++ inode->i_atime.tv_nsec = le32_to_cpu(get_unaligned(&sd_lt->atime)); ++ inode->i_mtime.tv_nsec = le32_to_cpu(get_unaligned(&sd_lt->mtime)); ++ inode->i_ctime.tv_nsec = le32_to_cpu(get_unaligned(&sd_lt->ctime)); ++ ++ move_on(len, area, sizeof *sd_lt); ++ return 0; ++ } else ++ return not_enough_space(inode, "large times sd"); ++} ++ ++static int ++save_len_large_times_sd(struct inode *inode UNUSED_ARG ++ /* object being processed */ ) ++{ ++ return sizeof(reiser4_large_times_stat); ++} ++ ++static int ++save_large_times_sd(struct inode *inode /* object being processed */ , ++ char **area /* position in stat-data */ ) ++{ ++ reiser4_large_times_stat *sd; ++ ++ assert("nikita-2817", inode != NULL); ++ assert("nikita-2818", area != NULL); ++ assert("nikita-2819", *area != NULL); ++ ++ sd = (reiser4_large_times_stat *) * area; ++ ++ put_unaligned(cpu_to_le32((__u32) inode->i_atime.tv_nsec), &sd->atime); ++ put_unaligned(cpu_to_le32((__u32) inode->i_ctime.tv_nsec), &sd->ctime); ++ put_unaligned(cpu_to_le32((__u32) inode->i_mtime.tv_nsec), &sd->mtime); ++ ++ *area += sizeof *sd; ++ return 0; ++} ++ ++/* symlink stat data extension */ ++ ++/* allocate memory for symlink target and attach it to inode->u.generic_ip */ ++static int ++symlink_target_to_inode(struct inode *inode, const char *target, int len) ++{ ++ assert("vs-845", inode->u.generic_ip == NULL); ++ assert("vs-846", !inode_get_flag(inode, REISER4_GENERIC_PTR_USED)); ++ ++ /* FIXME-VS: this is prone to deadlock. Not more than other similar ++ places, though */ ++ inode->u.generic_ip = kmalloc((size_t) len + 1, get_gfp_mask()); ++ if (!inode->u.generic_ip) ++ return RETERR(-ENOMEM); ++ ++ memcpy((char *)(inode->u.generic_ip), target, (size_t) len); ++ ((char *)(inode->u.generic_ip))[len] = 0; ++ inode_set_flag(inode, REISER4_GENERIC_PTR_USED); ++ return 0; ++} ++ ++/* this is called on read_inode. 
There is nothing to do actually, but some ++ sanity checks */ ++static int present_symlink_sd(struct inode *inode, char **area, int *len) ++{ ++ int result; ++ int length; ++ reiser4_symlink_stat *sd; ++ ++ length = (int)inode->i_size; ++ /* ++ * *len is number of bytes in stat data item from *area to the end of ++ * item. It must be not less than size of symlink + 1 for ending 0 ++ */ ++ if (length > *len) ++ return not_enough_space(inode, "symlink"); ++ ++ if (*(*area + length) != 0) { ++ warning("vs-840", "Symlink is not zero terminated"); ++ return RETERR(-EIO); ++ } ++ ++ sd = (reiser4_symlink_stat *) * area; ++ result = symlink_target_to_inode(inode, sd->body, length); ++ ++ move_on(len, area, length + 1); ++ return result; ++} ++ ++static int save_len_symlink_sd(struct inode *inode) ++{ ++ return inode->i_size + 1; ++} ++ ++/* this is called on create and update stat data. Do nothing on update but ++ update @area */ ++static int save_symlink_sd(struct inode *inode, char **area) ++{ ++ int result; ++ int length; ++ reiser4_symlink_stat *sd; ++ ++ length = (int)inode->i_size; ++ /* inode->i_size must be set already */ ++ assert("vs-841", length); ++ ++ result = 0; ++ sd = (reiser4_symlink_stat *) * area; ++ if (!inode_get_flag(inode, REISER4_GENERIC_PTR_USED)) { ++ const char *target; ++ ++ target = (const char *)(inode->u.generic_ip); ++ inode->u.generic_ip = NULL; ++ ++ result = symlink_target_to_inode(inode, target, length); ++ ++ /* copy symlink to stat data */ ++ memcpy(sd->body, target, (size_t) length); ++ (*area)[length] = 0; ++ } else { ++ /* there is nothing to do in update but move area */ ++ assert("vs-844", ++ !memcmp(inode->u.generic_ip, sd->body, ++ (size_t) length + 1)); ++ } ++ ++ *area += (length + 1); ++ return result; ++} ++ ++static int present_flags_sd(struct inode *inode /* object being processed */ , ++ char **area /* position in stat-data */ , ++ int *len /* remaining length */ ) ++{ ++ assert("nikita-645", inode != NULL); ++ assert("nikita-646", area != NULL); ++ assert("nikita-647", *area != NULL); ++ assert("nikita-648", len != NULL); ++ assert("nikita-649", *len > 0); ++ ++ if (*len >= (int)sizeof(reiser4_flags_stat)) { ++ reiser4_flags_stat *sd; ++ ++ sd = (reiser4_flags_stat *) * area; ++ inode->i_flags = le32_to_cpu(get_unaligned(&sd->flags)); ++ move_on(len, area, sizeof *sd); ++ return 0; ++ } else ++ return not_enough_space(inode, "generation and attrs"); ++} ++ ++/* Audited by: green(2002.06.14) */ ++static int save_len_flags_sd(struct inode *inode UNUSED_ARG /* object being ++ * processed */ ) ++{ ++ return sizeof(reiser4_flags_stat); ++} ++ ++static int save_flags_sd(struct inode *inode /* object being processed */ , ++ char **area /* position in stat-data */ ) ++{ ++ reiser4_flags_stat *sd; ++ ++ assert("nikita-650", inode != NULL); ++ assert("nikita-651", area != NULL); ++ assert("nikita-652", *area != NULL); ++ ++ sd = (reiser4_flags_stat *) * area; ++ put_unaligned(cpu_to_le32(inode->i_flags), &sd->flags); ++ *area += sizeof *sd; ++ return 0; ++} ++ ++static int absent_plugin_sd(struct inode *inode); ++static int present_plugin_sd(struct inode *inode /* object being processed */ , ++ char **area /* position in stat-data */ , ++ int *len /* remaining length */ ) ++{ ++ reiser4_plugin_stat *sd; ++ reiser4_plugin *plugin; ++ int i; ++ __u16 mask; ++ int result; ++ int num_of_plugins; ++ ++ assert("nikita-653", inode != NULL); ++ assert("nikita-654", area != NULL); ++ assert("nikita-655", *area != NULL); ++ assert("nikita-656", len != NULL); ++ 
assert("nikita-657", *len > 0); ++ ++ if (*len < (int)sizeof(reiser4_plugin_stat)) ++ return not_enough_space(inode, "plugin"); ++ ++ sd = (reiser4_plugin_stat *) * area; ++ ++ mask = 0; ++ num_of_plugins = le16_to_cpu(get_unaligned(&sd->plugins_no)); ++ move_on(len, area, sizeof *sd); ++ result = 0; ++ for (i = 0; i < num_of_plugins; ++i) { ++ reiser4_plugin_slot *slot; ++ reiser4_plugin_type type; ++ pset_member memb; ++ ++ slot = (reiser4_plugin_slot *) * area; ++ if (*len < (int)sizeof *slot) ++ return not_enough_space(inode, "additional plugin"); ++ ++ memb = le16_to_cpu(get_unaligned(&slot->pset_memb)); ++ type = pset_member_to_type_unsafe(memb); ++ if (type == REISER4_PLUGIN_TYPES) { ++ warning("nikita-3502", ++ "wrong pset member (%i) for %llu", memb, ++ (unsigned long long)get_inode_oid(inode)); ++ return RETERR(-EINVAL); ++ } ++ plugin = plugin_by_disk_id(tree_by_inode(inode), ++ type, &slot->id); ++ if (plugin == NULL) ++ return unknown_plugin(le16_to_cpu(get_unaligned(&slot->id)), inode); ++ ++ /* plugin is loaded into inode, mark this into inode's ++ bitmask of loaded non-standard plugins */ ++ if (!(mask & (1 << memb))) { ++ mask |= (1 << memb); ++ } else { ++ warning("nikita-658", "duplicate plugin for %llu", ++ (unsigned long long)get_inode_oid(inode)); ++ return RETERR(-EINVAL); ++ } ++ move_on(len, area, sizeof *slot); ++ /* load plugin data, if any */ ++ if (plugin->h.pops != NULL && plugin->h.pops->load) { ++ result = plugin->h.pops->load(inode, plugin, area, len); ++ if (result != 0) ++ return result; ++ } else ++ result = grab_plugin_from(inode, memb, plugin); ++ } ++ /* if object plugin wasn't loaded from stat-data, guess it by ++ mode bits */ ++ plugin = file_plugin_to_plugin(inode_file_plugin(inode)); ++ if (plugin == NULL) ++ result = absent_plugin_sd(inode); ++ ++ reiser4_inode_data(inode)->plugin_mask = mask; ++ return result; ++} ++ ++/* Determine object plugin for @inode based on i_mode. ++ ++ Many objects in reiser4 file system are controlled by standard object ++ plugins that emulate traditional unix objects: unix file, directory, symlink, fifo, and so on. ++ ++ For such files we don't explicitly store plugin id in object stat ++ data. Rather required plugin is guessed from mode bits, where file "type" ++ is encoded (see stat(2)). ++*/ ++static int ++guess_plugin_by_mode(struct inode *inode /* object to guess plugins for */ ) ++{ ++ int fplug_id; ++ int dplug_id; ++ reiser4_inode *info; ++ ++ assert("nikita-736", inode != NULL); ++ ++ dplug_id = fplug_id = -1; ++ ++ switch (inode->i_mode & S_IFMT) { ++ case S_IFSOCK: ++ case S_IFBLK: ++ case S_IFCHR: ++ case S_IFIFO: ++ fplug_id = SPECIAL_FILE_PLUGIN_ID; ++ break; ++ case S_IFLNK: ++ fplug_id = SYMLINK_FILE_PLUGIN_ID; ++ break; ++ case S_IFDIR: ++ fplug_id = DIRECTORY_FILE_PLUGIN_ID; ++ dplug_id = HASHED_DIR_PLUGIN_ID; ++ break; ++ default: ++ warning("nikita-737", "wrong file mode: %o", inode->i_mode); ++ return RETERR(-EIO); ++ case S_IFREG: ++ fplug_id = UNIX_FILE_PLUGIN_ID; ++ break; ++ } ++ info = reiser4_inode_data(inode); ++ plugin_set_file(&info->pset, ++ (fplug_id >= 0) ? file_plugin_by_id(fplug_id) : NULL); ++ plugin_set_dir(&info->pset, ++ (dplug_id >= 0) ? 
dir_plugin_by_id(dplug_id) : NULL); ++ return 0; ++} ++ ++/* Audited by: green(2002.06.14) */ ++static int absent_plugin_sd(struct inode *inode /* object being processed */ ) ++{ ++ int result; ++ ++ assert("nikita-659", inode != NULL); ++ ++ result = guess_plugin_by_mode(inode); ++ /* if mode was wrong, guess_plugin_by_mode() returns "regular file", ++ but setup_inode_ops() will call make_bad_inode(). ++ Another, more logical but bit more complex solution is to add ++ "bad-file plugin". */ ++ /* FIXME-VS: activate was called here */ ++ return result; ++} ++ ++/* helper function for plugin_sd_save_len(): calculate how much space ++ required to save state of given plugin */ ++/* Audited by: green(2002.06.14) */ ++static int len_for(reiser4_plugin * plugin /* plugin to save */ , ++ struct inode *inode /* object being processed */ , ++ pset_member memb, int len) ++{ ++ reiser4_inode *info; ++ assert("nikita-661", inode != NULL); ++ ++ info = reiser4_inode_data(inode); ++ if (plugin != NULL && (info->plugin_mask & (1 << memb))) { ++ len += sizeof(reiser4_plugin_slot); ++ if (plugin->h.pops && plugin->h.pops->save_len != NULL) { ++ /* non-standard plugin, call method */ ++ /* commented as it is incompatible with alignment ++ * policy in save_plug() -edward */ ++ /* len = round_up(len, plugin->h.pops->alignment); */ ++ len += plugin->h.pops->save_len(inode, plugin); ++ } ++ } ++ return len; ++} ++ ++/* calculate how much space is required to save state of all plugins, ++ associated with inode */ ++static int save_len_plugin_sd(struct inode *inode /* object being processed */ ) ++{ ++ int len; ++ reiser4_inode *state; ++ pset_member memb; ++ ++ assert("nikita-663", inode != NULL); ++ ++ state = reiser4_inode_data(inode); ++ /* common case: no non-standard plugins */ ++ if (state->plugin_mask == 0) ++ return 0; ++ len = sizeof(reiser4_plugin_stat); ++ for (memb = 0; memb < PSET_LAST; ++memb) ++ len = len_for(pset_get(state->pset, memb), inode, memb, len); ++ assert("nikita-664", len > (int)sizeof(reiser4_plugin_stat)); ++ return len; ++} ++ ++/* helper function for plugin_sd_save(): save plugin, associated with ++ inode. */ ++static int save_plug(reiser4_plugin * plugin /* plugin to save */ , ++ struct inode *inode /* object being processed */ , ++ pset_member memb /* what element of pset is saved */ , ++ char **area /* position in stat-data */ , ++ int *count /* incremented if plugin were actually ++ * saved. 
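++ * (the caller later stores this count into ++ * sd->plugins_no) 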
*/ ) ++{ ++ reiser4_plugin_slot *slot; ++ int fake_len; ++ int result; ++ ++ assert("nikita-665", inode != NULL); ++ assert("nikita-666", area != NULL); ++ assert("nikita-667", *area != NULL); ++ ++ if (plugin == NULL) ++ return 0; ++ if (!(reiser4_inode_data(inode)->plugin_mask & (1 << memb))) ++ return 0; ++ slot = (reiser4_plugin_slot *) * area; ++ put_unaligned(cpu_to_le16(memb), &slot->pset_memb); ++ put_unaligned(cpu_to_le16(plugin->h.id), &slot->id); ++ fake_len = (int)0xffff; ++ move_on(&fake_len, area, sizeof *slot); ++ ++*count; ++ result = 0; ++ if (plugin->h.pops != NULL) { ++ if (plugin->h.pops->save != NULL) ++ result = plugin->h.pops->save(inode, plugin, area); ++ } ++ return result; ++} ++ ++/* save state of all non-standard plugins associated with inode */ ++static int save_plugin_sd(struct inode *inode /* object being processed */ , ++ char **area /* position in stat-data */ ) ++{ ++ int result = 0; ++ int num_of_plugins; ++ reiser4_plugin_stat *sd; ++ reiser4_inode *state; ++ int fake_len; ++ pset_member memb; ++ ++ assert("nikita-669", inode != NULL); ++ assert("nikita-670", area != NULL); ++ assert("nikita-671", *area != NULL); ++ ++ state = reiser4_inode_data(inode); ++ if (state->plugin_mask == 0) ++ return 0; ++ sd = (reiser4_plugin_stat *) * area; ++ fake_len = (int)0xffff; ++ move_on(&fake_len, area, sizeof *sd); ++ ++ num_of_plugins = 0; ++ for (memb = 0; memb < PSET_LAST; ++memb) { ++ result = save_plug(pset_get(state->pset, memb), ++ inode, memb, area, &num_of_plugins); ++ if (result != 0) ++ break; ++ } ++ ++ put_unaligned(cpu_to_le16((__u16)num_of_plugins), &sd->plugins_no); ++ return result; ++} ++ ++/* helper function for crypto_sd_present(), crypto_sd_save. ++ Allocates memory for crypto stat, keyid and attaches it to the inode */ ++static int extract_crypto_stat (struct inode * inode, ++ reiser4_crypto_stat * sd) ++{ ++ crypto_stat_t * info; ++ assert("edward-11", !inode_crypto_stat(inode)); ++ assert("edward-1413", ++ !inode_get_flag(inode, REISER4_CRYPTO_STAT_LOADED)); ++ /* create and attach a crypto-stat without secret key loaded */ ++ info = alloc_crypto_stat(inode); ++ if (IS_ERR(info)) ++ return PTR_ERR(info); ++ info->keysize = le16_to_cpu(get_unaligned(&sd->keysize)); ++ memcpy(info->keyid, sd->keyid, inode_digest_plugin(inode)->fipsize); ++ attach_crypto_stat(inode, info); ++ inode_set_flag(inode, REISER4_CRYPTO_STAT_LOADED); ++ return 0; ++} ++ ++/* crypto stat-data extension */ ++ ++static int present_crypto_sd(struct inode *inode, char **area, int *len) ++{ ++ int result; ++ reiser4_crypto_stat *sd; ++ digest_plugin *dplug = inode_digest_plugin(inode); ++ ++ assert("edward-06", dplug != NULL); ++ assert("edward-684", dplug->fipsize); ++ assert("edward-07", area != NULL); ++ assert("edward-08", *area != NULL); ++ assert("edward-09", len != NULL); ++ assert("edward-10", *len > 0); ++ ++ if (*len < (int)sizeof(reiser4_crypto_stat)) { ++ return not_enough_space(inode, "crypto-sd"); ++ } ++ /* *len is number of bytes in stat data item from *area to the end of ++ item. 
It must be not less than size of this extension */ ++ assert("edward-75", sizeof(*sd) + dplug->fipsize <= *len); ++ ++ sd = (reiser4_crypto_stat *) * area; ++ result = extract_crypto_stat(inode, sd); ++ move_on(len, area, sizeof(*sd) + dplug->fipsize); ++ ++ return result; ++} ++ ++static int save_len_crypto_sd(struct inode *inode) ++{ ++ return sizeof(reiser4_crypto_stat) + ++ inode_digest_plugin(inode)->fipsize; ++} ++ ++static int save_crypto_sd(struct inode *inode, char **area) ++{ ++ int result = 0; ++ reiser4_crypto_stat *sd; ++ crypto_stat_t * info = inode_crypto_stat(inode); ++ digest_plugin *dplug = inode_digest_plugin(inode); ++ ++ assert("edward-12", dplug != NULL); ++ assert("edward-13", area != NULL); ++ assert("edward-14", *area != NULL); ++ assert("edward-15", info != NULL); ++ assert("edward-1414", info->keyid != NULL); ++ assert("edward-1415", info->keysize != 0); ++ assert("edward-76", reiser4_inode_data(inode) != NULL); ++ ++ if (!inode_get_flag(inode, REISER4_CRYPTO_STAT_LOADED)) { ++ /* file is just created */ ++ sd = (reiser4_crypto_stat *) *area; ++ /* copy everything but private key to the disk stat-data */ ++ put_unaligned(cpu_to_le16(info->keysize), &sd->keysize); ++ memcpy(sd->keyid, info->keyid, (size_t) dplug->fipsize); ++ inode_set_flag(inode, REISER4_CRYPTO_STAT_LOADED); ++ } ++ *area += (sizeof(*sd) + dplug->fipsize); ++ return result; ++} ++ ++static int eio(struct inode *inode, char **area, int *len) ++{ ++ return RETERR(-EIO); ++} ++ ++sd_ext_plugin sd_ext_plugins[LAST_SD_EXTENSION] = { ++ [LIGHT_WEIGHT_STAT] = { ++ .h = { ++ .type_id = REISER4_SD_EXT_PLUGIN_TYPE, ++ .id = LIGHT_WEIGHT_STAT, ++ .pops = NULL, ++ .label = "light-weight sd", ++ .desc = "sd for light-weight files", ++ .linkage = {NULL,NULL} ++ }, ++ .present = present_lw_sd, ++ .absent = NULL, ++ .save_len = save_len_lw_sd, ++ .save = save_lw_sd, ++ .alignment = 8 ++ }, ++ [UNIX_STAT] = { ++ .h = { ++ .type_id = REISER4_SD_EXT_PLUGIN_TYPE, ++ .id = UNIX_STAT, ++ .pops = NULL, ++ .label = "unix-sd", ++ .desc = "unix stat-data fields", ++ .linkage = {NULL,NULL} ++ }, ++ .present = present_unix_sd, ++ .absent = absent_unix_sd, ++ .save_len = save_len_unix_sd, ++ .save = save_unix_sd, ++ .alignment = 8 ++ }, ++ [LARGE_TIMES_STAT] = { ++ .h = { ++ .type_id = REISER4_SD_EXT_PLUGIN_TYPE, ++ .id = LARGE_TIMES_STAT, ++ .pops = NULL, ++ .label = "64time-sd", ++ .desc = "nanosecond resolution for times", ++ .linkage = {NULL,NULL} ++ }, ++ .present = present_large_times_sd, ++ .absent = NULL, ++ .save_len = save_len_large_times_sd, ++ .save = save_large_times_sd, ++ .alignment = 8 ++ }, ++ [SYMLINK_STAT] = { ++ /* stat data of symlink has this extension */ ++ .h = { ++ .type_id = REISER4_SD_EXT_PLUGIN_TYPE, ++ .id = SYMLINK_STAT, ++ .pops = NULL, ++ .label = "symlink-sd", ++ .desc = ++ "stat data is appended with symlink name", ++ .linkage = {NULL,NULL} ++ }, ++ .present = present_symlink_sd, ++ .absent = NULL, ++ .save_len = save_len_symlink_sd, ++ .save = save_symlink_sd, ++ .alignment = 8 ++ }, ++ [PLUGIN_STAT] = { ++ .h = { ++ .type_id = REISER4_SD_EXT_PLUGIN_TYPE, ++ .id = PLUGIN_STAT, ++ .pops = NULL, ++ .label = "plugin-sd", ++ .desc = "plugin stat-data fields", ++ .linkage = {NULL,NULL} ++ }, ++ .present = present_plugin_sd, ++ .absent = absent_plugin_sd, ++ .save_len = save_len_plugin_sd, ++ .save = save_plugin_sd, ++ .alignment = 8 ++ }, ++ [FLAGS_STAT] = { ++ .h = { ++ .type_id = REISER4_SD_EXT_PLUGIN_TYPE, ++ .id = FLAGS_STAT, ++ .pops = NULL, ++ .label = "flags-sd", ++ .desc = "inode bit 
flags", ++ .linkage = {NULL, NULL} ++ }, ++ .present = present_flags_sd, ++ .absent = NULL, ++ .save_len = save_len_flags_sd, ++ .save = save_flags_sd, ++ .alignment = 8 ++ }, ++ [CAPABILITIES_STAT] = { ++ .h = { ++ .type_id = REISER4_SD_EXT_PLUGIN_TYPE, ++ .id = CAPABILITIES_STAT, ++ .pops = NULL, ++ .label = "capabilities-sd", ++ .desc = "capabilities", ++ .linkage = {NULL, NULL} ++ }, ++ .present = eio, ++ .absent = NULL, ++ .save_len = save_len_flags_sd, ++ .save = save_flags_sd, ++ .alignment = 8 ++ }, ++ [CRYPTO_STAT] = { ++ .h = { ++ .type_id = REISER4_SD_EXT_PLUGIN_TYPE, ++ .id = CRYPTO_STAT, ++ .pops = NULL, ++ .label = "crypto-sd", ++ .desc = "secret key size and id", ++ .linkage = {NULL, NULL} ++ }, ++ .present = present_crypto_sd, ++ .absent = NULL, ++ .save_len = save_len_crypto_sd, ++ .save = save_crypto_sd, ++ .alignment = 8 ++ } ++}; ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +Index: linux-2.6.16/fs/reiser4/plugin/item/static_stat.h +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/plugin/item/static_stat.h +@@ -0,0 +1,219 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++/* This describes the static_stat item, used to hold all information needed by the stat() syscall. ++ ++In the case where each file has not less than the fields needed by the ++stat() syscall, it is more compact to store those fields in this ++struct. ++ ++If this item does not exist, then all stats are dynamically resolved. ++At the moment, we either resolve all stats dynamically or all of them ++statically. If you think this is not fully optimal, and the rest of ++reiser4 is working, then fix it...:-) ++ ++*/ ++ ++#if !defined( __FS_REISER4_PLUGIN_ITEM_STATIC_STAT_H__ ) ++#define __FS_REISER4_PLUGIN_ITEM_STATIC_STAT_H__ ++ ++#include "../../forward.h" ++#include "../../dformat.h" ++ ++#include /* for struct inode */ ++ ++/* Stat data layout: goals and implementation. ++ ++ We want to be able to have lightweight files which have complete flexibility in what semantic metadata is attached to ++ them, including not having semantic metadata attached to them. ++ ++ There is one problem with doing that, which is that if in fact you have exactly the same metadata for most files you ++ want to store, then it takes more space to store that metadata in a dynamically sized structure than in a statically ++ sized structure because the statically sized structure knows without recording it what the names and lengths of the ++ attributes are. ++ ++ This leads to a natural compromise, which is to special case those files which have simply the standard unix file ++ attributes, and only employ the full dynamic stat data mechanism for those files that differ from the standard unix ++ file in their use of file attributes. ++ ++ Yet this compromise deserves to be compromised a little. ++ ++ We accommodate the case where you have no more than the standard unix file attributes by using an "extension ++ bitmask": each bit in it indicates presence or absence of or particular stat data extension (see sd_ext_bits enum). ++ ++ If the first bit of the extension bitmask bit is 0, we have light-weight file whose attributes are either inherited ++ from parent directory (as uid, gid) or initialised to some sane values. 
++ ++ To capitalize on existing code infrastructure, extensions are ++ implemented as plugins of type REISER4_SD_EXT_PLUGIN_TYPE. ++ Each stat-data extension plugin implements four methods: ++ ++ ->present() called by sd_load() when this extension is found in stat-data ++ ->absent() called by sd_load() when this extension is not found in stat-data ++ ->save_len() called by sd_len() to calculate total length of stat-data ++ ->save() called by sd_save() to store extension data into stat-data ++ ++ Implementation is in fs/reiser4/plugin/item/static_stat.c ++*/ ++ ++/* stat-data extension. Please order this by presumed frequency of use */ ++typedef enum { ++ /* support for light-weight files */ ++ LIGHT_WEIGHT_STAT, ++ /* data required to implement unix stat(2) call. Layout is in ++ reiser4_unix_stat. If this is not present, file is light-weight */ ++ UNIX_STAT, ++ /* this contains an additional set of 32bit [anc]time fields to implement ++ nanosecond resolution. Layout is in reiser4_large_times_stat. Usage ++ of this extension is governed by the 32bittimes mount option. */ ++ LARGE_TIMES_STAT, ++ /* stat data has link name included */ ++ SYMLINK_STAT, ++ /* if this is present, file is controlled by non-standard ++ plugin (that is, plugin that cannot be deduced from file ++ mode bits), for example, aggregation, interpolation etc. */ ++ PLUGIN_STAT, ++ /* this extension contains persistent inode flags. These flags are ++ single bits: immutable, append only, etc. Layout is in ++ reiser4_flags_stat. */ ++ FLAGS_STAT, ++ /* this extension contains capability sets associated with this ++ file. Layout is in reiser4_capabilities_stat */ ++ CAPABILITIES_STAT, ++ /* this extension contains size and public id of the secret key. ++ Layout is in reiser4_crypto_stat */ ++ CRYPTO_STAT, ++ LAST_SD_EXTENSION, ++ /* ++ * init_inode_static_sd() iterates over the extension mask until all ++ * non-zero bits are processed. This means that neither ->present(), ++ * nor ->absent() methods will be called for stat-data extensions that ++ * go after the last present extension. But for some basic extensions ++ * we want either the ->absent() or the ->present() method to be called, ++ * because these extensions set up something in the inode even when they ++ * are not present. This is what LAST_IMPORTANT_SD_EXTENSION is for: for all ++ * extensions before and including LAST_IMPORTANT_SD_EXTENSION either ++ * the ->present() or the ->absent() method will be called, independently of ++ * what other extensions are present. ++ */ ++ LAST_IMPORTANT_SD_EXTENSION = PLUGIN_STAT, ++} sd_ext_bits; ++ ++/* minimal stat-data. This allows us to support light-weight files.
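
A schematic user-space rendering of the dispatch rule spelled out in the comment above; the member names, LAST_IMPORTANT and load_sd() are stand-ins invented for the illustration, not the kernel code. Bits are scanned until the mask is exhausted, but every member up to the "important" cutoff still gets its absent() call so defaults are initialised.

#include <stdio.h>

enum memb { LW, UNIX_ST, PLUGIN_ST, FLAGS_ST, LAST };  /* toy member ids */
#define LAST_IMPORTANT PLUGIN_ST

static void present(enum memb m) { printf("present(%d)\n", m); }
static void absent(enum memb m)  { printf("absent(%d)\n", m);  }

static void load_sd(unsigned mask)
{
	for (enum memb m = 0; m < LAST; m++) {
		if (mask & (1u << m))
			present(m);
		else if (m <= LAST_IMPORTANT)
			absent(m);          /* defaults for basic extensions */
		/* members past the last set bit and past LAST_IMPORTANT
		 * are skipped entirely, as described above */
		if (!(mask >> (m + 1)) && m >= LAST_IMPORTANT)
			break;
	}
}

int main(void) { load_sd(1u << UNIX_ST); return 0; }
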
*/ ++typedef struct reiser4_stat_data_base { ++ /* 0 */ __le16 extmask; ++ /* 2 */ ++} PACKED reiser4_stat_data_base; ++ ++typedef struct reiser4_light_weight_stat { ++ /* 0 */ __le16 mode; ++ /* 2 */ __le32 nlink; ++ /* 8 */ __le64 size; ++ /* size in bytes */ ++ /* 16 */ ++} PACKED reiser4_light_weight_stat; ++ ++typedef struct reiser4_unix_stat { ++ /* owner id */ ++ /* 0 */ __le32 uid; ++ /* group id */ ++ /* 4 */ __le32 gid; ++ /* access time */ ++ /* 8 */ __le32 atime; ++ /* modification time */ ++ /* 12 */ __le32 mtime; ++ /* change time */ ++ /* 16 */ __le32 ctime; ++ union { ++ /* minor:major for device files */ ++ /* 20 */ __le64 rdev; ++ /* bytes used by file */ ++ /* 20 */ __le64 bytes; ++ } u; ++ /* 28 */ ++} PACKED reiser4_unix_stat; ++ ++/* symlink stored as part of inode */ ++typedef struct reiser4_symlink_stat { ++ char body[0]; ++} PACKED reiser4_symlink_stat; ++ ++typedef struct reiser4_plugin_slot { ++ /* 0 */ __le16 pset_memb; ++ /* 2 */ __le16 id; ++ /* 4 *//* here plugin stores its persistent state */ ++} PACKED reiser4_plugin_slot; ++ ++/* stat-data extension for files with non-standard plugin. */ ++typedef struct reiser4_plugin_stat { ++ /* number of additional plugins, associated with this object */ ++ /* 0 */ __le16 plugins_no; ++ /* 2 */ reiser4_plugin_slot slot[0]; ++ /* 2 */ ++} PACKED reiser4_plugin_stat; ++ ++/* stat-data extension for inode flags. Currently it is just fixed-width 32 ++ * bit mask. If need arise, this can be replaced with variable width ++ * bitmask. */ ++typedef struct reiser4_flags_stat { ++ /* 0 */ __le32 flags; ++ /* 4 */ ++} PACKED reiser4_flags_stat; ++ ++typedef struct reiser4_capabilities_stat { ++ /* 0 */ __le32 effective; ++ /* 8 */ __le32 permitted; ++ /* 16 */ ++} PACKED reiser4_capabilities_stat; ++ ++typedef struct reiser4_cluster_stat { ++/* this defines cluster size (an attribute of cryptcompress objects) as PAGE_SIZE << cluster shift */ ++ /* 0 */ d8 cluster_shift; ++ /* 1 */ ++} PACKED reiser4_cluster_stat; ++ ++typedef struct reiser4_crypto_stat { ++ /* secret key size, bits */ ++ /* 0 */ d16 keysize; ++ /* secret key id */ ++ /* 2 */ d8 keyid[0]; ++ /* 2 */ ++} PACKED reiser4_crypto_stat; ++ ++typedef struct reiser4_large_times_stat { ++ /* access time */ ++ /* 0 */ d32 atime; ++ /* modification time */ ++ /* 8 */ d32 mtime; ++ /* change time */ ++ /* 16 */ d32 ctime; ++ /* 24 */ ++} PACKED reiser4_large_times_stat; ++ ++/* this structure is filled by sd_item_stat */ ++typedef struct sd_stat { ++ int dirs; ++ int files; ++ int others; ++} sd_stat; ++ ++/* plugin->item.common.* */ ++extern void print_sd(const char *prefix, coord_t * coord); ++extern void item_stat_static_sd(const coord_t * coord, void *vp); ++ ++/* plugin->item.s.sd.* */ ++extern int init_inode_static_sd(struct inode *inode, char *sd, int len); ++extern int save_len_static_sd(struct inode *inode); ++extern int save_static_sd(struct inode *inode, char **area); ++ ++/* __FS_REISER4_PLUGIN_ITEM_STATIC_STAT_H__ */ ++#endif ++ ++/* Make Linus happy. 
++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +Index: linux-2.6.16/fs/reiser4/plugin/item/tail.c +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/plugin/item/tail.c +@@ -0,0 +1,805 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++#include "item.h" ++#include "../../inode.h" ++#include "../../page_cache.h" ++#include "../../carry.h" ++#include "../../vfs_ops.h" ++ ++#include ++#include ++#include ++#include ++ ++/* plugin->u.item.b.max_key_inside */ ++reiser4_key *max_key_inside_tail(const coord_t *coord, reiser4_key *key) ++{ ++ item_key_by_coord(coord, key); ++ set_key_offset(key, get_key_offset(max_key())); ++ return key; ++} ++ ++/* plugin->u.item.b.can_contain_key */ ++int can_contain_key_tail(const coord_t *coord, const reiser4_key *key, ++ const reiser4_item_data *data) ++{ ++ reiser4_key item_key; ++ ++ if (item_plugin_by_coord(coord) != data->iplug) ++ return 0; ++ ++ item_key_by_coord(coord, &item_key); ++ if (get_key_locality(key) != get_key_locality(&item_key) || ++ get_key_objectid(key) != get_key_objectid(&item_key)) ++ return 0; ++ ++ return 1; ++} ++ ++/* plugin->u.item.b.mergeable ++ first item is of tail type */ ++/* Audited by: green(2002.06.14) */ ++int mergeable_tail(const coord_t *p1, const coord_t *p2) ++{ ++ reiser4_key key1, key2; ++ ++ assert("vs-535", ++ item_type_by_coord(p1) == UNIX_FILE_METADATA_ITEM_TYPE); ++ assert("vs-365", item_id_by_coord(p1) == FORMATTING_ID); ++ ++ if (item_id_by_coord(p2) != FORMATTING_ID) { ++ /* second item is of another type */ ++ return 0; ++ } ++ ++ item_key_by_coord(p1, &key1); ++ item_key_by_coord(p2, &key2); ++ if (get_key_locality(&key1) != get_key_locality(&key2) || ++ get_key_objectid(&key1) != get_key_objectid(&key2) ++ || get_key_type(&key1) != get_key_type(&key2)) { ++ /* items of different objects */ ++ return 0; ++ } ++ if (get_key_offset(&key1) + nr_units_tail(p1) != get_key_offset(&key2)) { ++ /* not adjacent items */ ++ return 0; ++ } ++ return 1; ++} ++ ++/* plugin->u.item.b.print ++ plugin->u.item.b.check */ ++ ++/* plugin->u.item.b.nr_units */ ++pos_in_node_t nr_units_tail(const coord_t * coord) ++{ ++ return item_length_by_coord(coord); ++} ++ ++/* plugin->u.item.b.lookup */ ++lookup_result ++lookup_tail(const reiser4_key * key, lookup_bias bias, coord_t * coord) ++{ ++ reiser4_key item_key; ++ __u64 lookuped, offset; ++ unsigned nr_units; ++ ++ item_key_by_coord(coord, &item_key); ++ offset = get_key_offset(item_key_by_coord(coord, &item_key)); ++ nr_units = nr_units_tail(coord); ++ ++ /* key we are looking for must be greater than key of item @coord */ ++ assert("vs-416", keygt(key, &item_key)); ++ ++ /* offset we are looking for */ ++ lookuped = get_key_offset(key); ++ ++ if (lookuped >= offset && lookuped < offset + nr_units) { ++ /* byte we are looking for is in this item */ ++ coord->unit_pos = lookuped - offset; ++ coord->between = AT_UNIT; ++ return CBK_COORD_FOUND; ++ } ++ ++ /* set coord after last unit */ ++ coord->unit_pos = nr_units - 1; ++ coord->between = AFTER_UNIT; ++ return bias == ++ FIND_MAX_NOT_MORE_THAN ? 
CBK_COORD_FOUND : CBK_COORD_NOTFOUND; ++} ++ ++/* plugin->u.item.b.paste */ ++int ++paste_tail(coord_t *coord, reiser4_item_data *data, ++ carry_plugin_info *info UNUSED_ARG) ++{ ++ unsigned old_item_length; ++ char *item; ++ ++ /* length the item had before resizing has been performed */ ++ old_item_length = item_length_by_coord(coord) - data->length; ++ ++ /* tail items never get pasted in the middle */ ++ assert("vs-363", ++ (coord->unit_pos == 0 && coord->between == BEFORE_UNIT) || ++ (coord->unit_pos == old_item_length - 1 && ++ coord->between == AFTER_UNIT) || ++ (coord->unit_pos == 0 && old_item_length == 0 ++ && coord->between == AT_UNIT)); ++ ++ item = item_body_by_coord(coord); ++ if (coord->unit_pos == 0) ++ /* make space for pasted data when pasting at the beginning of ++ the item */ ++ memmove(item + data->length, item, old_item_length); ++ ++ if (coord->between == AFTER_UNIT) ++ coord->unit_pos++; ++ ++ if (data->data) { ++ assert("vs-554", data->user == 0 || data->user == 1); ++ if (data->user) { ++ assert("nikita-3035", schedulable()); ++ /* copy from user space */ ++ if (__copy_from_user(item + coord->unit_pos, ++ (const char __user *)data->data, ++ (unsigned)data->length)) ++ return RETERR(-EFAULT); ++ } else ++ /* copy from kernel space */ ++ memcpy(item + coord->unit_pos, data->data, ++ (unsigned)data->length); ++ } else { ++ memset(item + coord->unit_pos, 0, (unsigned)data->length); ++ } ++ return 0; ++} ++ ++/* plugin->u.item.b.fast_paste */ ++ ++/* plugin->u.item.b.can_shift ++ number of units is returned via return value, number of bytes via @size. For ++ tail items they coincide */ ++int ++can_shift_tail(unsigned free_space, coord_t * source UNUSED_ARG, ++ znode * target UNUSED_ARG, shift_direction direction UNUSED_ARG, ++ unsigned *size, unsigned want) ++{ ++ /* make sure that that we do not want to shift more than we have */ ++ assert("vs-364", want > 0 ++ && want <= (unsigned)item_length_by_coord(source)); ++ ++ *size = min(want, free_space); ++ return *size; ++} ++ ++/* plugin->u.item.b.copy_units */ ++void ++copy_units_tail(coord_t * target, coord_t * source, ++ unsigned from, unsigned count, ++ shift_direction where_is_free_space, ++ unsigned free_space UNUSED_ARG) ++{ ++ /* make sure that item @target is expanded already */ ++ assert("vs-366", (unsigned)item_length_by_coord(target) >= count); ++ assert("vs-370", free_space >= count); ++ ++ if (where_is_free_space == SHIFT_LEFT) { ++ /* append item @target with @count first bytes of @source */ ++ assert("vs-365", from == 0); ++ ++ memcpy((char *)item_body_by_coord(target) + ++ item_length_by_coord(target) - count, ++ (char *)item_body_by_coord(source), count); ++ } else { ++ /* target item is moved to right already */ ++ reiser4_key key; ++ ++ assert("vs-367", ++ (unsigned)item_length_by_coord(source) == from + count); ++ ++ memcpy((char *)item_body_by_coord(target), ++ (char *)item_body_by_coord(source) + from, count); ++ ++ /* new units are inserted before first unit in an item, ++ therefore, we have to update item key */ ++ item_key_by_coord(source, &key); ++ set_key_offset(&key, get_key_offset(&key) + from); ++ ++ node_plugin_by_node(target->node)->update_item_key(target, &key, ++ NULL /*info */); ++ } ++} ++ ++/* plugin->u.item.b.create_hook */ ++ ++/* item_plugin->b.kill_hook ++ this is called when @count units starting from @from-th one are going to be removed ++ */ ++int ++kill_hook_tail(const coord_t * coord, pos_in_node_t from, ++ pos_in_node_t count, struct carry_kill_data *kdata) ++{ ++ 
reiser4_key key; ++ loff_t start, end; ++ ++ assert("vs-1577", kdata); ++ assert("vs-1579", kdata->inode); ++ ++ item_key_by_coord(coord, &key); ++ start = get_key_offset(&key) + from; ++ end = start + count; ++ fake_kill_hook_tail(kdata->inode, start, end, kdata->params.truncate); ++ return 0; ++} ++ ++/* plugin->u.item.b.shift_hook */ ++ ++/* helper for kill_units_tail and cut_units_tail */ ++static int ++do_cut_or_kill(coord_t * coord, pos_in_node_t from, pos_in_node_t to, ++ reiser4_key * smallest_removed, reiser4_key * new_first) ++{ ++ pos_in_node_t count; ++ ++ /* this method is only called to remove part of an item */ ++ assert("vs-374", (to - from + 1) < item_length_by_coord(coord)); ++ /* tail items are never cut from the middle of an item */ ++ assert("vs-396", ergo(from != 0, to == coord_last_unit_pos(coord))); ++ assert("vs-1558", ergo(from == 0, to < coord_last_unit_pos(coord))); ++ ++ count = to - from + 1; ++ ++ if (smallest_removed) { ++ /* store smallest key removed */ ++ item_key_by_coord(coord, smallest_removed); ++ set_key_offset(smallest_removed, ++ get_key_offset(smallest_removed) + from); ++ } ++ if (new_first) { ++ /* head of item is cut */ ++ assert("vs-1529", from == 0); ++ ++ item_key_by_coord(coord, new_first); ++ set_key_offset(new_first, ++ get_key_offset(new_first) + from + count); ++ } ++ ++ if (REISER4_DEBUG) ++ memset((char *)item_body_by_coord(coord) + from, 0, count); ++ return count; ++} ++ ++/* plugin->u.item.b.cut_units */ ++int ++cut_units_tail(coord_t * coord, pos_in_node_t from, pos_in_node_t to, ++ struct carry_cut_data *cdata UNUSED_ARG, ++ reiser4_key * smallest_removed, reiser4_key * new_first) ++{ ++ return do_cut_or_kill(coord, from, to, smallest_removed, new_first); ++} ++ ++/* plugin->u.item.b.kill_units */ ++int ++kill_units_tail(coord_t * coord, pos_in_node_t from, pos_in_node_t to, ++ struct carry_kill_data *kdata, reiser4_key * smallest_removed, ++ reiser4_key * new_first) ++{ ++ kill_hook_tail(coord, from, to - from + 1, kdata); ++ return do_cut_or_kill(coord, from, to, smallest_removed, new_first); ++} ++ ++/* plugin->u.item.b.unit_key */ ++reiser4_key *unit_key_tail(const coord_t * coord, reiser4_key * key) ++{ ++ assert("vs-375", coord_is_existing_unit(coord)); ++ ++ item_key_by_coord(coord, key); ++ set_key_offset(key, (get_key_offset(key) + coord->unit_pos)); ++ ++ return key; ++} ++ ++/* plugin->u.item.b.estimate ++ plugin->u.item.b.item_data_by_flow */ ++ ++/* tail readpage function. It is called from readpage_tail(). */ ++static int do_readpage_tail(uf_coord_t *uf_coord, struct page *page) ++{ ++ tap_t tap; ++ int result; ++ coord_t coord; ++ lock_handle lh; ++ int count, mapped; ++ struct inode *inode; ++ char *pagedata; ++ ++ /* save the passed coord so that the tap does not move it. */ ++ init_lh(&lh); ++ copy_lh(&lh, uf_coord->lh); ++ inode = page->mapping->host; ++ coord_dup(&coord, &uf_coord->coord); ++ ++ tap_init(&tap, &coord, &lh, ZNODE_READ_LOCK); ++ ++ if ((result = tap_load(&tap))) ++ goto out_tap_done; ++ ++ /* lookup until page is filled up.
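
The smallest_removed/new_first bookkeeping in do_cut_or_kill() is plain offset arithmetic. A self-contained sketch under toy types (demo_key and cut_range() are invented for the illustration, not reiser4 types):

#include <assert.h>
#include <stdio.h>

/* toy key: just a byte offset within one file */
typedef struct { unsigned long long off; } demo_key;

/* cut units [from..to] out of an item whose first byte has offset
 * @item_off; mirrors the offset bookkeeping, not the tree plumbing */
static unsigned cut_range(unsigned long long item_off, unsigned from,
			  unsigned to, unsigned item_len,
			  demo_key *smallest_removed, demo_key *new_first)
{
	unsigned count = to - from + 1;

	assert(count < item_len);                 /* only part of the item */
	assert(from == 0 || to == item_len - 1);  /* never from the middle */

	if (smallest_removed)
		smallest_removed->off = item_off + from;
	if (new_first && from == 0)               /* head cut: item key moves */
		new_first->off = item_off + count;
	return count;
}

int main(void)
{
	demo_key sr, nf;
	unsigned n = cut_range(4096, 0, 99, 512, &sr, &nf);

	printf("removed %u bytes, smallest removed %llu, new first %llu\n",
	       n, sr.off, nf.off);
	return 0;
}
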
*/ ++ for (mapped = 0; mapped < PAGE_CACHE_SIZE; ) { ++ /* number of bytes to be copied to page */ ++ count = item_length_by_coord(&coord) - coord.unit_pos; ++ if (count > PAGE_CACHE_SIZE - mapped) ++ count = PAGE_CACHE_SIZE - mapped; ++ ++ /* map @page and get the address of its data */ ++ pagedata = kmap_atomic(page, KM_USER0); ++ ++ /* copy tail item to page */ ++ memcpy(pagedata + mapped, ++ ((char *)item_body_by_coord(&coord) + coord.unit_pos), ++ count); ++ mapped += count; ++ ++ flush_dcache_page(page); ++ ++ /* unmap the page */ ++ kunmap_atomic(pagedata, KM_USER0); ++ ++ /* Getting next tail item. */ ++ if (mapped < PAGE_CACHE_SIZE) { ++ /* ++ * unlock the page to avoid keeping it locked ++ * during the tree lookup, which takes long term locks ++ */ ++ unlock_page(page); ++ ++ /* getting right neighbour. */ ++ result = go_dir_el(&tap, RIGHT_SIDE, 0); ++ ++ /* lock page back */ ++ lock_page(page); ++ if (PageUptodate(page)) { ++ /* ++ * another thread read the page, we have ++ * nothing to do ++ */ ++ result = 0; ++ goto out_unlock_page; ++ } ++ ++ if (result) { ++ if (result == -E_NO_NEIGHBOR) { ++ /* ++ * right neighbor is not a formatted ++ * node ++ */ ++ result = 0; ++ goto done; ++ } else { ++ goto out_tap_relse; ++ } ++ } else { ++ if (!inode_file_plugin(inode)-> ++ owns_item(inode, &coord)) { ++ /* item of another file is found */ ++ result = 0; ++ goto done; ++ } ++ } ++ } ++ } ++ ++ done: ++ if (mapped != PAGE_CACHE_SIZE) { ++ pagedata = kmap_atomic(page, KM_USER0); ++ memset(pagedata + mapped, 0, PAGE_CACHE_SIZE - mapped); ++ flush_dcache_page(page); ++ kunmap_atomic(pagedata, KM_USER0); ++ } ++ SetPageUptodate(page); ++ out_unlock_page: ++ unlock_page(page); ++ out_tap_relse: ++ tap_relse(&tap); ++ out_tap_done: ++ tap_done(&tap); ++ return result; ++} ++ ++/* ++ plugin->s.file.readpage ++ reiser4_read->unix_file_read->page_cache_readahead->reiser4_readpage->unix_file_readpage->readpage_tail ++ or ++ filemap_nopage->reiser4_readpage->readpage_unix_file->readpage_tail ++ ++ At the beginning: coord->node is read locked, zloaded, page is locked, coord is set to an existing unit inside the tail ++ item. */ ++int readpage_tail(void *vp, struct page *page) ++{ ++ uf_coord_t *uf_coord = vp; ++ ON_DEBUG(coord_t * coord = &uf_coord->coord); ++ ON_DEBUG(reiser4_key key); ++ ++ assert("umka-2515", PageLocked(page)); ++ assert("umka-2516", !PageUptodate(page)); ++ assert("umka-2517", !jprivate(page) && !PagePrivate(page)); ++ assert("umka-2518", page->mapping && page->mapping->host); ++ ++ assert("umka-2519", znode_is_loaded(coord->node)); ++ assert("umka-2520", item_is_tail(coord)); ++ assert("umka-2521", coord_is_existing_unit(coord)); ++ assert("umka-2522", znode_is_rlocked(coord->node)); ++ assert("umka-2523", ++ page->mapping->host->i_ino == ++ get_key_objectid(item_key_by_coord(coord, &key))); ++ ++ return do_readpage_tail(uf_coord, page); ++} ++ ++/** ++ * overwrite_tail ++ * @flow: ++ * @coord: ++ * ++ * Overwrites a tail item or part of it with user data. Returns number of bytes ++ * written or error code.
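
do_readpage_tail() above fills one page from however many consecutive tail items it takes, and zero-fills any shortfall before marking the page up to date. The same control flow in plain user-space C, with a fixed chunk list standing in for tree neighbours (all names here are invented for the demo):

#include <stdio.h>
#include <string.h>

#define PAGE_SZ 16   /* tiny "page" so the demo output is readable */

int main(void)
{
	const char *items[] = { "hello ", "tail ", "world" };  /* neighbours */
	char page[PAGE_SZ];
	size_t mapped = 0;

	for (size_t i = 0; i < 3 && mapped < PAGE_SZ; i++) {
		size_t count = strlen(items[i]);

		if (count > PAGE_SZ - mapped)
			count = PAGE_SZ - mapped;   /* clamp to page end */
		memcpy(page + mapped, items[i], count);
		mapped += count;
	}
	/* short read: pad the rest of the page with zeroes */
	if (mapped < PAGE_SZ)
		memset(page + mapped, 0, PAGE_SZ - mapped);

	printf("%.*s\n", PAGE_SZ, page);
	return 0;
}
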
++ */ ++static int overwrite_tail(flow_t *flow, coord_t *coord) ++{ ++ unsigned count; ++ ++ assert("vs-570", flow->user == 1); ++ assert("vs-946", flow->data); ++ assert("vs-947", coord_is_existing_unit(coord)); ++ assert("vs-948", znode_is_write_locked(coord->node)); ++ assert("nikita-3036", schedulable()); ++ ++ count = item_length_by_coord(coord) - coord->unit_pos; ++ if (count > flow->length) ++ count = flow->length; ++ ++ if (__copy_from_user((char *)item_body_by_coord(coord) + coord->unit_pos, ++ (const char __user *)flow->data, count)) ++ return RETERR(-EFAULT); ++ ++ znode_make_dirty(coord->node); ++ return count; ++} ++ ++/** ++ * insert_first_tail ++ * @inode: ++ * @flow: ++ * @coord: ++ * @lh: ++ * ++ * Returns number of bytes written or error code. ++ */ ++static ssize_t insert_first_tail(struct inode *inode, flow_t *flow, ++ coord_t *coord, lock_handle *lh) ++{ ++ int result; ++ loff_t to_write; ++ unix_file_info_t *uf_info; ++ ++ if (get_key_offset(&flow->key) != 0) { ++ /* ++ * file is empty and we have to write not to the beginning of ++ * file. Create a hole at the beginning of file. On success ++ * insert_flow returns 0 as number of written bytes which is ++ * what we have to return on padding a file with holes ++ */ ++ flow->data = NULL; ++ flow->length = get_key_offset(&flow->key); ++ set_key_offset(&flow->key, 0); ++ /* ++ * holes in files built of tails are stored just like if there ++ * were real data which are all zeros. Therefore we have to ++ * allocate quota here as well ++ */ ++ if (DQUOT_ALLOC_SPACE_NODIRTY(inode, flow->length)) ++ return RETERR(-EDQUOT); ++ result = insert_flow(coord, lh, flow); ++ if (flow->length) ++ DQUOT_FREE_SPACE_NODIRTY(inode, flow->length); ++ ++ uf_info = unix_file_inode_data(inode); ++ ++ /* ++ * first item insertion is only possible when writing to empty ++ * file or performing tail conversion ++ */ ++ assert("", (uf_info->container == UF_CONTAINER_EMPTY || ++ (inode_get_flag(inode, REISER4_PART_MIXED) && ++ inode_get_flag(inode, REISER4_PART_IN_CONV)))); ++ ++ /* if file was empty - update its state */ ++ if (result == 0 && uf_info->container == UF_CONTAINER_EMPTY) ++ uf_info->container = UF_CONTAINER_TAILS; ++ return result; ++ } ++ ++ /* check quota before appending data */ ++ if (DQUOT_ALLOC_SPACE_NODIRTY(inode, flow->length)) ++ return RETERR(-EDQUOT); ++ ++ to_write = flow->length; ++ result = insert_flow(coord, lh, flow); ++ if (flow->length) ++ DQUOT_FREE_SPACE_NODIRTY(inode, flow->length); ++ return (to_write - flow->length) ? (to_write - flow->length) : result; ++} ++ ++/** ++ * append_tail ++ * @inode: ++ * @flow: ++ * @coord: ++ * @lh: ++ * ++ * Returns number of bytes written or error code. ++ */ ++static ssize_t append_tail(struct inode *inode, ++ flow_t *flow, coord_t *coord, lock_handle *lh) ++{ ++ int result; ++ reiser4_key append_key; ++ loff_t to_write; ++ ++ if (!keyeq(&flow->key, append_key_tail(coord, &append_key))) { ++ flow->data = NULL; ++ flow->length = get_key_offset(&flow->key) - get_key_offset(&append_key); ++ set_key_offset(&flow->key, get_key_offset(&append_key)); ++ /* ++ * holes in files built of tails are stored just like if there ++ * were real data which are all zeros. 
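
insert_first_tail() above pads a sparse first write by inserting a flow of zeros from offset 0 up to the write position before the data itself. Schematically, with a toy flow type and no quota or tree machinery (struct flow, insert_flow() and first_write() are invented for the sketch):

#include <stdio.h>
#include <stddef.h>

/* toy flow: data == NULL means "a hole", i.e. implicit zeros */
struct flow { const char *data; size_t length; size_t key_off; };

static void insert_flow(const struct flow *f)
{
	printf("insert %zu bytes at offset %zu (%s)\n",
	       f->length, f->key_off, f->data ? "data" : "hole of zeros");
}

/* first write into an empty file at an arbitrary offset */
static void first_write(size_t pos, const char *buf, size_t len)
{
	if (pos != 0) {
		/* pad [0..pos) with a hole flow before the data flow */
		struct flow hole = { NULL, pos, 0 };
		insert_flow(&hole);
	}
	struct flow data = { buf, len, pos };
	insert_flow(&data);
}

int main(void) { first_write(4096, "abc", 3); return 0; }
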
Therefore we have to ++ * allocate quota here as well ++ */ ++ if (DQUOT_ALLOC_SPACE_NODIRTY(inode, flow->length)) ++ return RETERR(-EDQUOT); ++ result = insert_flow(coord, lh, flow); ++ if (flow->length) ++ DQUOT_FREE_SPACE_NODIRTY(inode, flow->length); ++ return result; ++ } ++ ++ /* check quota before appending data */ ++ if (DQUOT_ALLOC_SPACE_NODIRTY(inode, flow->length)) ++ return RETERR(-EDQUOT); ++ ++ to_write = flow->length; ++ result = insert_flow(coord, lh, flow); ++ if (flow->length) ++ DQUOT_FREE_SPACE_NODIRTY(inode, flow->length); ++ return (to_write - flow->length) ? (to_write - flow->length) : result; ++} ++ ++/** ++ * write_tail_reserve_space - reserve space for tail write operation ++ * @inode: ++ * ++ * Estimates and reserves space which may be required for writing one flow to a ++ * file ++ */ ++static int write_extent_reserve_space(struct inode *inode) ++{ ++ __u64 count; ++ reiser4_tree *tree; ++ ++ /* ++ * to write one flow to a file by tails we have to reserve disk space for: ++ ++ * 1. find_file_item may have to insert empty node to the tree (empty ++ * leaf node between two extent items). This requires 1 block and ++ * number of blocks which are necessary to perform insertion of an ++ * internal item into twig level. ++ * ++ * 2. flow insertion ++ * ++ * 3. stat data update ++ */ ++ tree = tree_by_inode(inode); ++ count = estimate_one_insert_item(tree) + ++ estimate_insert_flow(tree->height) + ++ estimate_one_insert_item(tree); ++ grab_space_enable(); ++ return reiser4_grab_space(count, 0 /* flags */); ++} ++ ++#define PAGE_PER_FLOW 4 ++ ++static loff_t faultin_user_pages(const char __user *buf, size_t count) ++{ ++ loff_t faulted; ++ int to_fault; ++ ++ if (count > PAGE_PER_FLOW * PAGE_CACHE_SIZE) ++ count = PAGE_PER_FLOW * PAGE_CACHE_SIZE; ++ faulted = 0; ++ while (count > 0) { ++ to_fault = PAGE_CACHE_SIZE; ++ if (count < to_fault) ++ to_fault = count; ++ fault_in_pages_readable(buf + faulted, to_fault); ++ count -= to_fault; ++ faulted += to_fault; ++ } ++ return faulted; ++} ++ ++/** ++ * write_extent - write method of tail item plugin ++ * @file: file to write to ++ * @buf: address of user-space buffer ++ * @count: number of bytes to write ++ * @pos: position in file to write to ++ * ++ * Returns number of written bytes or error code. ++ */ ++ssize_t write_tail(struct file *file, const char __user *buf, size_t count, ++ loff_t *pos) ++{ ++ struct inode *inode; ++ struct hint hint; ++ int result; ++ flow_t flow; ++ coord_t *coord; ++ lock_handle *lh; ++ znode *loaded; ++ ++ inode = file->f_dentry->d_inode; ++ ++ if (write_extent_reserve_space(inode)) ++ return RETERR(-ENOSPC); ++ ++ result = load_file_hint(file, &hint); ++ BUG_ON(result != 0); ++ ++ flow.length = faultin_user_pages(buf, count); ++ flow.user = 1; ++ memcpy(&flow.data, &buf, sizeof(buf)); ++ flow.op = WRITE_OP; ++ key_by_inode_and_offset_common(inode, *pos, &flow.key); ++ ++ result = find_file_item(&hint, &flow.key, ZNODE_WRITE_LOCK, inode); ++ if (IS_CBKERR(result)) ++ return result; ++ ++ coord = &hint.ext_coord.coord; ++ lh = hint.ext_coord.lh; ++ ++ result = zload(coord->node); ++ BUG_ON(result != 0); ++ loaded = coord->node; ++ ++ if (coord->between == AFTER_UNIT) { ++ /* append with data or hole */ ++ result = append_tail(inode, &flow, coord, lh); ++ } else if (coord->between == AT_UNIT) { ++ /* overwrite */ ++ result = overwrite_tail(&flow, coord); ++ } else { ++ /* no items of this file yet. 
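
faultin_user_pages() above touches the user buffer one page at a time, capped at PAGE_PER_FLOW pages, so that the later copy cannot take a major page fault while long-term tree locks are held. A user-space analogue of the chunking loop (PAGE_SZ, PAGES_CAP and touch() stand in for the kernel constants and fault_in_pages_readable()):

#include <stdio.h>

#define PAGE_SZ   4096
#define PAGES_CAP 4            /* mirrors PAGE_PER_FLOW */

/* stand-in for fault_in_pages_readable(): one read per page */
static void touch(const char *buf, size_t len)
{
	volatile char sink;

	for (size_t i = 0; i < len; i += PAGE_SZ)
		sink = buf[i];   /* a read is enough to fault the page in */
	(void)sink;
}

static size_t faultin(const char *buf, size_t count)
{
	size_t faulted = 0;

	if (count > PAGES_CAP * PAGE_SZ)
		count = PAGES_CAP * PAGE_SZ;    /* bound work per flow */
	while (count > 0) {
		size_t chunk = count < PAGE_SZ ? count : PAGE_SZ;

		touch(buf + faulted, chunk);
		count -= chunk;
		faulted += chunk;
	}
	return faulted;
}

int main(void)
{
	static char big[10 * PAGE_SZ];
	printf("faulted in %zu bytes\n", faultin(big, sizeof(big)));
	return 0;
}
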
insert data or hole */ ++ result = insert_first_tail(inode, &flow, coord, lh); ++ } ++ zrelse(loaded); ++ if (result < 0) { ++ done_lh(lh); ++ return result; ++ } ++ ++ /* seal and unlock znode */ ++ hint.ext_coord.valid = 0; ++ if (hint.ext_coord.valid) ++ set_hint(&hint, &flow.key, ZNODE_WRITE_LOCK); ++ else ++ unset_hint(&hint); ++ ++ save_file_hint(file, &hint); ++ return result; ++} ++ ++#if REISER4_DEBUG ++ ++static int ++coord_matches_key_tail(const coord_t * coord, const reiser4_key * key) ++{ ++ reiser4_key item_key; ++ ++ assert("vs-1356", coord_is_existing_unit(coord)); ++ assert("vs-1354", keylt(key, append_key_tail(coord, &item_key))); ++ assert("vs-1355", keyge(key, item_key_by_coord(coord, &item_key))); ++ return get_key_offset(key) == ++ get_key_offset(&item_key) + coord->unit_pos; ++ ++} ++ ++#endif ++ ++/* plugin->u.item.s.file.read */ ++int read_tail(struct file *file UNUSED_ARG, flow_t *f, hint_t *hint) ++{ ++ unsigned count; ++ int item_length; ++ coord_t *coord; ++ uf_coord_t *uf_coord; ++ ++ uf_coord = &hint->ext_coord; ++ coord = &uf_coord->coord; ++ ++ assert("vs-571", f->user == 1); ++ assert("vs-571", f->data); ++ assert("vs-967", coord && coord->node); ++ assert("vs-1117", znode_is_rlocked(coord->node)); ++ assert("vs-1118", znode_is_loaded(coord->node)); ++ ++ assert("nikita-3037", schedulable()); ++ assert("vs-1357", coord_matches_key_tail(coord, &f->key)); ++ ++ /* calculate number of bytes to read off the item */ ++ item_length = item_length_by_coord(coord); ++ count = item_length_by_coord(coord) - coord->unit_pos; ++ if (count > f->length) ++ count = f->length; ++ ++ /* user page has to be brought in so that major page fault does not ++ * occur here when longtem lock is held */ ++ if (__copy_to_user((char __user *)f->data, ++ ((char *)item_body_by_coord(coord) + coord->unit_pos), ++ count)) ++ return RETERR(-EFAULT); ++ ++ /* probably mark_page_accessed() should only be called if ++ * coord->unit_pos is zero. 
*/ ++ mark_page_accessed(znode_page(coord->node)); ++ move_flow_forward(f, count); ++ ++ coord->unit_pos += count; ++ if (item_length == coord->unit_pos) { ++ coord->unit_pos--; ++ coord->between = AFTER_UNIT; ++ } ++ ++ return 0; ++} ++ ++/* ++ plugin->u.item.s.file.append_key ++ key of the first byte past the last byte addressed by this item ++*/ ++reiser4_key *append_key_tail(const coord_t * coord, reiser4_key * key) ++{ ++ item_key_by_coord(coord, key); ++ set_key_offset(key, get_key_offset(key) + item_length_by_coord(coord)); ++ return key; ++} ++ ++/* plugin->u.item.s.file.init_coord_extension */ ++void init_coord_extension_tail(uf_coord_t * uf_coord, loff_t lookuped) ++{ ++ uf_coord->valid = 1; ++} ++ ++/* ++ plugin->u.item.s.file.get_block ++*/ ++int ++get_block_address_tail(const coord_t * coord, sector_t lblock, sector_t * block) ++{ ++ assert("nikita-3252", znode_get_level(coord->node) == LEAF_LEVEL); ++ ++ *block = *znode_get_block(coord->node); ++ return 0; ++} ++ ++/* ++ * Local variables: ++ * c-indentation-style: "K&R" ++ * mode-name: "LC" ++ * c-basic-offset: 8 ++ * tab-width: 8 ++ * fill-column: 79 ++ * scroll-step: 1 ++ * End: ++ */ +Index: linux-2.6.16/fs/reiser4/plugin/item/tail.h +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/plugin/item/tail.h +@@ -0,0 +1,58 @@ ++/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++#if !defined( __REISER4_TAIL_H__ ) ++#define __REISER4_TAIL_H__ ++ ++typedef struct { ++ int not_used; ++} tail_coord_extension_t; ++ ++struct cut_list; ++ ++/* plugin->u.item.b.* */ ++reiser4_key *max_key_inside_tail(const coord_t *, reiser4_key *); ++int can_contain_key_tail(const coord_t * coord, const reiser4_key * key, ++ const reiser4_item_data *); ++int mergeable_tail(const coord_t * p1, const coord_t * p2); ++pos_in_node_t nr_units_tail(const coord_t *); ++lookup_result lookup_tail(const reiser4_key *, lookup_bias, coord_t *); ++int paste_tail(coord_t *, reiser4_item_data *, carry_plugin_info *); ++int can_shift_tail(unsigned free_space, coord_t * source, ++ znode * target, shift_direction, unsigned *size, ++ unsigned want); ++void copy_units_tail(coord_t * target, coord_t * source, unsigned from, ++ unsigned count, shift_direction, unsigned free_space); ++int kill_hook_tail(const coord_t *, pos_in_node_t from, pos_in_node_t count, ++ struct carry_kill_data *); ++int cut_units_tail(coord_t *, pos_in_node_t from, pos_in_node_t to, ++ struct carry_cut_data *, reiser4_key * smallest_removed, ++ reiser4_key * new_first); ++int kill_units_tail(coord_t *, pos_in_node_t from, pos_in_node_t to, ++ struct carry_kill_data *, reiser4_key * smallest_removed, ++ reiser4_key * new_first); ++reiser4_key *unit_key_tail(const coord_t *, reiser4_key *); ++ ++/* plugin->u.item.s.* */ ++ssize_t write_tail(struct file *file, const char __user *buf, size_t count, ++ loff_t *pos); ++int read_tail(struct file *, flow_t *, hint_t *); ++int readpage_tail(void *vp, struct page *page); ++reiser4_key *append_key_tail(const coord_t *, reiser4_key *); ++void init_coord_extension_tail(uf_coord_t *, loff_t offset); ++int get_block_address_tail(const coord_t *, sector_t, sector_t *); ++int item_balance_dirty_pages(struct address_space *, const flow_t *, ++ hint_t *, int back_to_dirty, int set_hint); ++ ++/* __REISER4_TAIL_H__ */ ++#endif ++ ++/* Make Linus happy.
++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ scroll-step: 1 ++ End: ++*/ +Index: linux-2.6.16/fs/reiser4/plugin/node/Makefile +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/plugin/node/Makefile +@@ -0,0 +1,5 @@ ++obj-$(CONFIG_REISER4_FS) += node_plugins.o ++ ++node_plugins-objs := \ ++ node.o \ ++ node40.o +Index: linux-2.6.16/fs/reiser4/plugin/node/node.c +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/plugin/node/node.c +@@ -0,0 +1,131 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++/* Node plugin interface. ++ ++ Description: The tree provides the abstraction of flows, which it ++ internally fragments into items which it stores in nodes. ++ ++ A key_atom is a piece of data bound to a single key. ++ ++ For reasonable space efficiency to be achieved it is often ++ necessary to store key_atoms in the nodes in the form of items, where ++ an item is a sequence of key_atoms of the same or similar type. It is ++ more space-efficient, because the item can implement (very) ++ efficient compression of key_atom's bodies using internal knowledge ++ about their semantics, and it can often avoid having a key for each ++ key_atom. Each type of item has specific operations implemented by its ++ item handler (see balance.c). ++ ++ Rationale: the rest of the code (specifically balancing routines) ++ accesses leaf level nodes through this interface. This way we can ++ implement various block layouts and even combine various layouts ++ within the same tree. Balancing/allocating algorithms should not ++ care about peculiarities of splitting/merging specific item types, ++ but rather should leave that to the item's item handler. ++ ++ Items, including those that provide the abstraction of flows, have ++ the property that if you move them in part or in whole to another ++ node, the balancing code invokes their is_left_mergeable() ++ item_operation to determine if they are mergeable with their new ++ neighbor in the node you have moved them to. For some items the ++ is_left_mergeable() function always returns null. ++ ++ When moving the bodies of items from one node to another: ++ ++ if a partial item is shifted to another node the balancing code invokes ++ an item handler method to handle the item splitting. ++ ++ if the balancing code needs to merge with an item in the node it ++ is shifting to, it will invoke an item handler method to handle ++ the item merging. ++ ++ if it needs to move whole item bodies unchanged, the balancing code uses xmemcpy() ++ adjusting the item headers after the move is done using the node handler. ++*/ ++ ++#include "../../forward.h" ++#include "../../debug.h" ++#include "../../key.h" ++#include "../../coord.h" ++#include "../plugin_header.h" ++#include "../item/item.h" ++#include "node.h" ++#include "../plugin.h" ++#include "../../znode.h" ++#include "../../tree.h" ++#include "../../super.h" ++#include "../../reiser4.h" ++ ++/** ++ * leftmost_key_in_node - get the smallest key in node ++ * @node: ++ * @key: store result here ++ * ++ * Stores the leftmost key of @node in @key. 
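
The is_left_mergeable() decision described in the comment above reduces, for byte-addressed items like tails, to "same object and contiguous offsets"; that is exactly the test mergeable_tail() made earlier with real keys. In isolation (toy item summary, invented names):

#include <stdbool.h>
#include <stdio.h>

/* toy item summary: which object it belongs to and which bytes it covers */
struct item { unsigned obj; unsigned long long off, len; };

/* two neighbouring items merge iff same object and byte-contiguous */
static bool left_mergeable(const struct item *l, const struct item *r)
{
	return l->obj == r->obj && l->off + l->len == r->off;
}

int main(void)
{
	struct item a = { 42, 0, 100 }, b = { 42, 100, 50 }, c = { 42, 200, 8 };

	printf("a+b: %s\n", left_mergeable(&a, &b) ? "merge" : "keep apart");
	printf("b+c: %s\n", left_mergeable(&b, &c) ? "merge" : "keep apart");
	return 0;
}
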
++ */ ++reiser4_key *leftmost_key_in_node(const znode *node, reiser4_key *key) ++{ ++ assert("nikita-1634", node != NULL); ++ assert("nikita-1635", key != NULL); ++ ++ if (!node_is_empty(node)) { ++ coord_t first_item; ++ ++ coord_init_first_unit(&first_item, (znode *) node); ++ item_key_by_coord(&first_item, key); ++ } else ++ *key = *max_key(); ++ return key; ++} ++ ++node_plugin node_plugins[LAST_NODE_ID] = { ++ [NODE40_ID] = { ++ .h = { ++ .type_id = REISER4_NODE_PLUGIN_TYPE, ++ .id = NODE40_ID, ++ .pops = NULL, ++ .label = "unified", ++ .desc = "unified node layout", ++ .linkage = {NULL, NULL} ++ }, ++ .item_overhead = item_overhead_node40, ++ .free_space = free_space_node40, ++ .lookup = lookup_node40, ++ .num_of_items = num_of_items_node40, ++ .item_by_coord = item_by_coord_node40, ++ .length_by_coord = length_by_coord_node40, ++ .plugin_by_coord = plugin_by_coord_node40, ++ .key_at = key_at_node40, ++ .estimate = estimate_node40, ++ .check = check_node40, ++ .parse = parse_node40, ++ .init = init_node40, ++#ifdef GUESS_EXISTS ++ .guess = guess_node40, ++#endif ++ .change_item_size = change_item_size_node40, ++ .create_item = create_item_node40, ++ .update_item_key = update_item_key_node40, ++ .cut_and_kill = kill_node40, ++ .cut = cut_node40, ++ .shift = shift_node40, ++ .shrink_item = shrink_item_node40, ++ .fast_insert = fast_insert_node40, ++ .fast_paste = fast_paste_node40, ++ .fast_cut = fast_cut_node40, ++ .max_item_size = max_item_size_node40, ++ .prepare_removal = prepare_removal_node40, ++ .set_item_plugin = set_item_plugin_node40 ++ } ++}; ++ ++/* ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ scroll-step: 1 ++ End: ++*/ +Index: linux-2.6.16/fs/reiser4/plugin/node/node.h +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/plugin/node/node.h +@@ -0,0 +1,272 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++/* We need a definition of the default node layout here. */ ++ ++/* Generally speaking, it is best to have free space in the middle of the ++ node so that two sets of things can grow towards it, and to have the ++ item bodies on the left so that the last one of them grows into free ++ space. We optimize for the case where we append new items to the end ++ of the node, or grow the last item, because it hurts nothing to so ++ optimize and it is a common special case to do massive insertions in ++ increasing key order (and one of cases more likely to have a real user ++ notice the delay time for). ++ ++ formatted leaf default layout: (leaf1) ++ ++ |node header:item bodies:free space:key + pluginid + item offset| ++ ++ We grow towards the middle, optimizing layout for the case where we ++ append new items to the end of the node. The node header is fixed ++ length. Keys, and item offsets plus pluginids for the items ++ corresponding to them are in increasing key order, and are fixed ++ length. Item offsets are relative to start of node (16 bits creating ++ a node size limit of 64k, 12 bits might be a better choice....). Item ++ bodies are in decreasing key order. Item bodies have a variable size. ++ There is a one to one to one mapping of keys to item offsets to item ++ bodies. Item offsets consist of pointers to the zeroth byte of the ++ item body. 
Item length equals the start of the next item minus the ++ start of this item, except the zeroth item whose length equals the end ++ of the node minus the start of that item (plus a byte). In other ++ words, the item length is not recorded anywhere, and it does not need ++ to be since it is computable. ++ ++ Leaf variable length items and keys layout : (lvar) ++ ++ |node header:key offset + item offset + pluginid triplets:free space:key bodies:item bodies| ++ ++ We grow towards the middle, optimizing layout for the case where we ++ append new items to the end of the node. The node header is fixed ++ length. Keys and item offsets for the items corresponding to them are ++ in increasing key order, and keys are variable length. Item offsets ++ are relative to start of node (16 bits). Item bodies are in ++ decreasing key order. Item bodies have a variable size. There is a ++ one to one to one mapping of keys to item offsets to item bodies. ++ Item offsets consist of pointers to the zeroth byte of the item body. ++ Item length equals the start of the next item's key minus the start of ++ this item, except the zeroth item whose length equals the end of the ++ node minus the start of that item (plus a byte). ++ ++ leaf compressed keys layout: (lcomp) ++ ++ |node header:key offset + key inherit + item offset pairs:free space:key bodies:item bodies| ++ ++ We grow towards the middle, optimizing layout for the case where we ++ append new items to the end of the node. The node header is fixed ++ length. Keys and item offsets for the items corresponding to them are ++ in increasing key order, and keys are variable length. The "key ++ inherit" field indicates how much of the key prefix is identical to ++ the previous key (stem compression as described in "Managing ++ Gigabytes" is used). key_inherit is a one byte integer. The ++ intra-node searches performed through this layout are linear searches, ++ and this is theorized to not hurt performance much due to the high ++ cost of processor stalls on modern CPUs, and the small number of keys ++ in a single node. Item offsets are relative to start of node (16 ++ bits). Item bodies are in decreasing key order. Item bodies have a ++ variable size. There is a one to one to one mapping of keys to item ++ offsets to item bodies. Item offsets consist of pointers to the ++ zeroth byte of the item body. Item length equals the start of the ++ next item minus the start of this item, except the zeroth item whose ++ length equals the end of the node minus the start of that item (plus a ++ byte). In other words, item length and key length is not recorded ++ anywhere, and it does not need to be since it is computable. ++ ++ internal node default layout: (idef1) ++ ++ just like ldef1 except that item bodies are either blocknrs of ++ children or extents, and moving them may require updating parent ++ pointers in the nodes that they point to. ++*/ ++ ++/* There is an inherent 3-way tradeoff between optimizing and ++ exchanging disks between different architectures and code ++ complexity. This is optimal and simple and inexchangeable. ++ Someone else can do the code for exchanging disks and make it ++ complex. It would not be that hard. Using other than the PAGE_SIZE ++ might be suboptimal. 
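
The "length is computable" property of these layouts is what length_by_coord_node40() implements later in node40.c. A user-space sketch of the rule, with a plain offsets array standing in for the item-header row (values and names are invented for the demo):

#include <stdio.h>

/* item offsets, relative to node start, in increasing key order */
static const unsigned short offset[] = { 40, 140, 172 };
static const unsigned nr_items = 3;
static const unsigned free_space_start = 300;  /* end of last item body */

/* no length field anywhere: derive it from the neighbouring offset */
static unsigned item_length(unsigned pos)
{
	if (pos == nr_items - 1)
		return free_space_start - offset[pos];
	return offset[pos + 1] - offset[pos];
}

int main(void)
{
	for (unsigned i = 0; i < nr_items; i++)
		printf("item %u: offset %u, length %u\n",
		       i, offset[i], item_length(i));
	return 0;
}
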
++*/ ++ ++#if !defined( __REISER4_NODE_H__ ) ++#define __REISER4_NODE_H__ ++ ++#define LEAF40_NODE_SIZE PAGE_CACHE_SIZE ++ ++#include "../../dformat.h" ++#include "../plugin_header.h" ++ ++#include ++ ++typedef enum { ++ NS_FOUND = 0, ++ NS_NOT_FOUND = -ENOENT ++} node_search_result; ++ ++/* Maximal possible space overhead for creation of new item in a node */ ++#define REISER4_NODE_MAX_OVERHEAD ( sizeof( reiser4_key ) + 32 ) ++ ++typedef enum { ++ REISER4_NODE_DKEYS = (1 << 0), ++ REISER4_NODE_TREE_STABLE = (1 << 1) ++} reiser4_node_check_flag; ++ ++/* cut and cut_and_kill have too long list of parameters. This structure is just to safe some space on stack */ ++struct cut_list { ++ coord_t *from; ++ coord_t *to; ++ const reiser4_key *from_key; ++ const reiser4_key *to_key; ++ reiser4_key *smallest_removed; ++ carry_plugin_info *info; ++ __u32 flags; ++ struct inode *inode; /* this is to pass list of eflushed jnodes down to extent_kill_hook */ ++ lock_handle *left; ++ lock_handle *right; ++}; ++ ++struct carry_cut_data; ++struct carry_kill_data; ++ ++/* The responsibility of the node plugin is to store and give access ++ to the sequence of items within the node. */ ++typedef struct node_plugin { ++ /* generic plugin fields */ ++ plugin_header h; ++ ++ /* calculates the amount of space that will be required to store an ++ item which is in addition to the space consumed by the item body. ++ (the space consumed by the item body can be gotten by calling ++ item->estimate) */ ++ size_t(*item_overhead) (const znode * node, flow_t * f); ++ ++ /* returns free space by looking into node (i.e., without using ++ znode->free_space). */ ++ size_t(*free_space) (znode * node); ++ /* search within the node for the one item which might ++ contain the key, invoking item->search_within to search within ++ that item to see if it is in there */ ++ node_search_result(*lookup) (znode * node, const reiser4_key * key, ++ lookup_bias bias, coord_t * coord); ++ /* number of items in node */ ++ int (*num_of_items) (const znode * node); ++ ++ /* store information about item in @coord in @data */ ++ /* break into several node ops, don't add any more uses of this before doing so */ ++ /*int ( *item_at )( const coord_t *coord, reiser4_item_data *data ); */ ++ char *(*item_by_coord) (const coord_t * coord); ++ int (*length_by_coord) (const coord_t * coord); ++ item_plugin *(*plugin_by_coord) (const coord_t * coord); ++ ++ /* store item key in @key */ ++ reiser4_key *(*key_at) (const coord_t * coord, reiser4_key * key); ++ /* conservatively estimate whether unit of what size can fit ++ into node. This estimation should be performed without ++ actually looking into the node's content (free space is saved in ++ znode). */ ++ size_t(*estimate) (znode * node); ++ ++ /* performs every consistency check the node plugin author could ++ imagine. Optional. */ ++ int (*check) (const znode * node, __u32 flags, const char **error); ++ ++ /* Called when node is read into memory and node plugin is ++ already detected. This should read some data into znode (like free ++ space counter) and, optionally, check data consistency. ++ */ ++ int (*parse) (znode * node); ++ /* This method is called on a new node to initialise plugin specific ++ data (header, etc.) */ ++ int (*init) (znode * node); ++ /* Check whether @node content conforms to this plugin format. ++ Probably only useful after support for old V3.x formats is added. ++ Uncomment after 4.0 only. 
++ */ ++ /* int ( *guess )( const znode *node ); */ ++#if REISER4_DEBUG ++ void (*print) (const char *prefix, const znode * node, __u32 flags); ++#endif ++ /* change size of @item by @by bytes. @item->node has enough free ++ space. When @by > 0 - free space is appended to end of item. When ++ @by < 0 - item is truncated - it is assumed that last @by bytes if ++ the item are freed already */ ++ void (*change_item_size) (coord_t * item, int by); ++ ++ /* create new item @length bytes long in coord @target */ ++ int (*create_item) (coord_t * target, const reiser4_key * key, ++ reiser4_item_data * data, carry_plugin_info * info); ++ ++ /* update key of item. */ ++ void (*update_item_key) (coord_t * target, const reiser4_key * key, ++ carry_plugin_info * info); ++ ++ int (*cut_and_kill) (struct carry_kill_data *, carry_plugin_info *); ++ int (*cut) (struct carry_cut_data *, carry_plugin_info *); ++ ++ /* ++ * shrink item pointed to by @coord by @delta bytes. ++ */ ++ int (*shrink_item) (coord_t * coord, int delta); ++ ++ /* copy as much as possible but not more than up to @stop from ++ @stop->node to @target. If (pend == append) then data from beginning of ++ @stop->node are copied to the end of @target. If (pend == prepend) then ++ data from the end of @stop->node are copied to the beginning of ++ @target. Copied data are removed from @stop->node. Information ++ about what to do on upper level is stored in @todo */ ++ int (*shift) (coord_t * stop, znode * target, shift_direction pend, ++ int delete_node, int including_insert_coord, ++ carry_plugin_info * info); ++ /* return true if this node allows skip carry() in some situations ++ (see fs/reiser4/tree.c:insert_by_coord()). Reiser3.x format ++ emulation doesn't. ++ ++ This will speedup insertions that doesn't require updates to the ++ parent, by bypassing initialisation of carry() structures. It's ++ believed that majority of insertions will fit there. ++ ++ */ ++ int (*fast_insert) (const coord_t * coord); ++ int (*fast_paste) (const coord_t * coord); ++ int (*fast_cut) (const coord_t * coord); ++ /* this limits max size of item which can be inserted into a node and ++ number of bytes item in a node may be appended with */ ++ int (*max_item_size) (void); ++ int (*prepare_removal) (znode * empty, carry_plugin_info * info); ++ /* change plugin id of items which are in a node already. Currently it is Used in tail conversion for regular ++ * files */ ++ int (*set_item_plugin) (coord_t * coord, item_id); ++} node_plugin; ++ ++typedef enum { ++ /* standard unified node layout used for both leaf and internal ++ nodes */ ++ NODE40_ID, ++ LAST_NODE_ID ++} reiser4_node_id; ++ ++extern reiser4_key *leftmost_key_in_node(const znode * node, reiser4_key * key); ++#if REISER4_DEBUG ++extern void print_node_content(const char *prefix, const znode * node, ++ __u32 flags); ++#endif ++ ++extern void indent_znode(const znode * node); ++ ++typedef struct common_node_header { ++ /* ++ * identifier of node plugin. Must be located at the very beginning of ++ * a node. 
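
Because the plugin id is the first field of every node (common_node_header here), a reader can dispatch on the node format before parsing anything else in the block. Schematically, under the assumption NODE40_ID == 0 from the enum above (le16_at() is an illustration, not a kernel helper):

#include <stdint.h>
#include <stdio.h>

/* read a little-endian u16 from the first two bytes of a block */
static uint16_t le16_at(const unsigned char *block)
{
	return (uint16_t)(block[0] | (block[1] << 8));
}

int main(void)
{
	unsigned char block[4096] = { 0x00, 0x00 };  /* NODE40_ID == 0 */

	switch (le16_at(block)) {
	case 0:
		puts("node40 layout: parse with node40 plugin");
		break;
	default:
		puts("unknown node plugin id: refuse to parse");
	}
	return 0;
}
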
++ */ ++ __le16 plugin_id; ++} common_node_header; ++ ++/* __REISER4_NODE_H__ */ ++#endif ++/* ++ * Local variables: ++ * c-indentation-style: "K&R" ++ * mode-name: "LC" ++ * c-basic-offset: 8 ++ * tab-width: 8 ++ * fill-column: 79 ++ * scroll-step: 1 ++ * End: ++ */ +Index: linux-2.6.16/fs/reiser4/plugin/node/node40.c +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/plugin/node/node40.c +@@ -0,0 +1,2924 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++#include "../../debug.h" ++#include "../../key.h" ++#include "../../coord.h" ++#include "../plugin_header.h" ++#include "../item/item.h" ++#include "node.h" ++#include "node40.h" ++#include "../plugin.h" ++#include "../../jnode.h" ++#include "../../znode.h" ++#include "../../pool.h" ++#include "../../carry.h" ++#include "../../tap.h" ++#include "../../tree.h" ++#include "../../super.h" ++#include "../../reiser4.h" ++ ++#include ++#include ++#include ++ ++/* leaf 40 format: ++ ++ [node header | item 0, item 1, .., item N-1 | free space | item_head N-1, .. item_head 1, item head 0 ] ++ plugin_id (16) key ++ free_space (16) pluginid (16) ++ free_space_start (16) offset (16) ++ level (8) ++ num_items (16) ++ magic (32) ++ flush_time (32) ++*/ ++/* NIKITA-FIXME-HANS: I told you guys not less than 10 times to not call it r4fs. Change to "ReIs". */ ++/* magic number that is stored in ->magic field of node header */ ++static const __u32 REISER4_NODE_MAGIC = 0x52344653; /* (*(__u32 *)"R4FS"); */ ++ ++static int prepare_for_update(znode * left, znode * right, ++ carry_plugin_info * info); ++ ++/* header of node of reiser40 format is at the beginning of node */ ++static inline node40_header *node40_node_header(const znode * node /* node to ++ * query */ ) ++{ ++ assert("nikita-567", node != NULL); ++ assert("nikita-568", znode_page(node) != NULL); ++ assert("nikita-569", zdata(node) != NULL); ++ return (node40_header *) zdata(node); ++} ++ ++/* functions to get/set fields of node40_header */ ++#define nh40_get_magic(nh) le32_to_cpu(get_unaligned(&(nh)->magic)) ++#define nh40_get_free_space(nh) le16_to_cpu(get_unaligned(&(nh)->free_space)) ++#define nh40_get_free_space_start(nh) le16_to_cpu(get_unaligned(&(nh)->free_space_start)) ++#define nh40_get_level(nh) get_unaligned(&(nh)->level) ++#define nh40_get_num_items(nh) le16_to_cpu(get_unaligned(&(nh)->nr_items)) ++#define nh40_get_flush_id(nh) le64_to_cpu(get_unaligned(&(nh)->flush_id)) ++ ++#define nh40_set_magic(nh, value) put_unaligned(cpu_to_le32(value), &(nh)->magic) ++#define nh40_set_free_space(nh, value) put_unaligned(cpu_to_le16(value), &(nh)->free_space) ++#define nh40_set_free_space_start(nh, value) put_unaligned(cpu_to_le16(value), &(nh)->free_space_start) ++#define nh40_set_level(nh, value) put_unaligned(value, &(nh)->level) ++#define nh40_set_num_items(nh, value) put_unaligned(cpu_to_le16(value), &(nh)->nr_items) ++#define nh40_set_mkfs_id(nh, value) put_unaligned(cpu_to_le32(value), &(nh)->mkfs_id) ++ ++ ++/* plugin field of node header should be read/set by ++ plugin_by_disk_id/save_disk_plugin */ ++ ++/* array of item headers is at the end of node */ ++static inline item_header40 *node40_ih_at(const znode * node, unsigned pos) ++{ ++ return (item_header40 *) (zdata(node) + znode_size(node)) - pos - 1; ++} ++ ++/* ( page_address( node -> pg ) + PAGE_CACHE_SIZE ) - pos - 1 ++ */ ++static inline item_header40 *node40_ih_at_coord(const coord_t * coord) ++{ ++ return (item_header40 
*) (zdata(coord->node) + ++ znode_size(coord->node)) - (coord->item_pos) - ++ 1; ++} ++ ++/* functions to get/set fields of item_header40 */ ++#define ih40_get_offset(ih) le16_to_cpu(get_unaligned(&(ih)->offset)) ++ ++#define ih40_set_offset(ih, value) put_unaligned(cpu_to_le16(value), &(ih)->offset) ++ ++/* plugin field of item header should be read/set by ++ plugin_by_disk_id/save_disk_plugin */ ++ ++/* plugin methods */ ++ ++/* plugin->u.node.item_overhead ++ look for description of this method in plugin/node/node.h */ ++size_t ++item_overhead_node40(const znode * node UNUSED_ARG, flow_t * f UNUSED_ARG) ++{ ++ return sizeof(item_header40); ++} ++ ++/* plugin->u.node.free_space ++ look for description of this method in plugin/node/node.h */ ++size_t free_space_node40(znode * node) ++{ ++ assert("nikita-577", node != NULL); ++ assert("nikita-578", znode_is_loaded(node)); ++ assert("nikita-579", zdata(node) != NULL); ++ ++ return nh40_get_free_space(node40_node_header(node)); ++} ++ ++/* private inline version of node40_num_of_items() for use in this file. This ++ is necessary, because address of node40_num_of_items() is taken and it is ++ never inlined as a result. */ ++static inline short node40_num_of_items_internal(const znode * node) ++{ ++ return nh40_get_num_items(node40_node_header(node)); ++} ++ ++#if REISER4_DEBUG ++static inline void check_num_items(const znode * node) ++{ ++ assert("nikita-2749", ++ node40_num_of_items_internal(node) == node->nr_items); ++ assert("nikita-2746", znode_is_write_locked(node)); ++} ++#else ++#define check_num_items(node) noop ++#endif ++ ++/* plugin->u.node.num_of_items ++ look for description of this method in plugin/node/node.h */ ++int num_of_items_node40(const znode * node) ++{ ++ return node40_num_of_items_internal(node); ++} ++ ++static void ++node40_set_num_items(znode * node, node40_header * nh, unsigned value) ++{ ++ assert("nikita-2751", node != NULL); ++ assert("nikita-2750", nh == node40_node_header(node)); ++ ++ check_num_items(node); ++ nh40_set_num_items(nh, value); ++ node->nr_items = value; ++ check_num_items(node); ++} ++ ++/* plugin->u.node.item_by_coord ++ look for description of this method in plugin/node/node.h */ ++char *item_by_coord_node40(const coord_t * coord) ++{ ++ item_header40 *ih; ++ char *p; ++ ++ /* @coord is set to existing item */ ++ assert("nikita-596", coord != NULL); ++ assert("vs-255", coord_is_existing_item(coord)); ++ ++ ih = node40_ih_at_coord(coord); ++ p = zdata(coord->node) + ih40_get_offset(ih); ++ return p; ++} ++ ++/* plugin->u.node.length_by_coord ++ look for description of this method in plugin/node/node.h */ ++int length_by_coord_node40(const coord_t * coord) ++{ ++ item_header40 *ih; ++ int result; ++ ++ /* @coord is set to existing item */ ++ assert("vs-256", coord != NULL); ++ assert("vs-257", coord_is_existing_item(coord)); ++ ++ ih = node40_ih_at_coord(coord); ++ if ((int)coord->item_pos == ++ node40_num_of_items_internal(coord->node) - 1) ++ result = ++ nh40_get_free_space_start(node40_node_header(coord->node)) - ++ ih40_get_offset(ih); ++ else ++ result = ih40_get_offset(ih - 1) - ih40_get_offset(ih); ++ ++ return result; ++} ++ ++static pos_in_node_t ++node40_item_length(const znode * node, pos_in_node_t item_pos) ++{ ++ item_header40 *ih; ++ pos_in_node_t result; ++ ++ /* @coord is set to existing item */ ++ assert("vs-256", node != NULL); ++ assert("vs-257", node40_num_of_items_internal(node) > item_pos); ++ ++ ih = node40_ih_at(node, item_pos); ++ if (item_pos == 
node40_num_of_items_internal(node) - 1) ++ result = ++ nh40_get_free_space_start(node40_node_header(node)) - ++ ih40_get_offset(ih); ++ else ++ result = ih40_get_offset(ih - 1) - ih40_get_offset(ih); ++ ++ return result; ++} ++ ++/* plugin->u.node.plugin_by_coord ++ look for description of this method in plugin/node/node.h */ ++item_plugin *plugin_by_coord_node40(const coord_t * coord) ++{ ++ item_header40 *ih; ++ item_plugin *result; ++ ++ /* @coord is set to existing item */ ++ assert("vs-258", coord != NULL); ++ assert("vs-259", coord_is_existing_item(coord)); ++ ++ ih = node40_ih_at_coord(coord); ++ /* pass NULL in stead of current tree. This is time critical call. */ ++ result = item_plugin_by_disk_id(NULL, &ih->plugin_id); ++ return result; ++} ++ ++/* plugin->u.node.key_at ++ look for description of this method in plugin/node/node.h */ ++reiser4_key *key_at_node40(const coord_t * coord, reiser4_key * key) ++{ ++ item_header40 *ih; ++ ++ assert("nikita-1765", coord_is_existing_item(coord)); ++ ++ /* @coord is set to existing item */ ++ ih = node40_ih_at_coord(coord); ++ memcpy(key, &ih->key, sizeof(reiser4_key)); ++ return key; ++} ++ ++/* VS-FIXME-HANS: please review whether the below are properly disabled when debugging is disabled */ ++ ++#define NODE_INCSTAT(n, counter) \ ++ reiser4_stat_inc_at_level(znode_get_level(n), node.lookup.counter) ++ ++#define NODE_ADDSTAT(n, counter, val) \ ++ reiser4_stat_add_at_level(znode_get_level(n), node.lookup.counter, val) ++ ++/* plugin->u.node.lookup ++ look for description of this method in plugin/node/node.h */ ++node_search_result lookup_node40(znode * node /* node to query */ , ++ const reiser4_key * key /* key to look for */ , ++ lookup_bias bias /* search bias */ , ++ coord_t * coord /* resulting coord */ ) ++{ ++ int left; ++ int right; ++ int found; ++ int items; ++ ++ item_header40 *lefth; ++ item_header40 *righth; ++ ++ item_plugin *iplug; ++ item_header40 *bstop; ++ item_header40 *ih; ++ cmp_t order; ++ ++ assert("nikita-583", node != NULL); ++ assert("nikita-584", key != NULL); ++ assert("nikita-585", coord != NULL); ++ assert("nikita-2693", znode_is_any_locked(node)); ++ cassert(REISER4_SEQ_SEARCH_BREAK > 2); ++ ++ items = node_num_items(node); ++ ++ if (unlikely(items == 0)) { ++ coord_init_first_unit(coord, node); ++ return NS_NOT_FOUND; ++ } ++ ++ /* binary search for item that can contain given key */ ++ left = 0; ++ right = items - 1; ++ coord->node = node; ++ coord_clear_iplug(coord); ++ found = 0; ++ ++ lefth = node40_ih_at(node, left); ++ righth = node40_ih_at(node, right); ++ ++ /* It is known that for small arrays sequential search is on average ++ more efficient than binary. This is because sequential search is ++ coded as tight loop that can be better optimized by compilers and ++ for small array size gain from this optimization makes sequential ++ search the winner. Another, maybe more important, reason for this, ++ is that sequential array is more CPU cache friendly, whereas binary ++ search effectively destroys CPU caching. ++ ++ Critical here is the notion of "smallness". Reasonable value of ++ REISER4_SEQ_SEARCH_BREAK can be found by playing with code in ++ fs/reiser4/ulevel/ulevel.c:test_search(). ++ ++ Don't try to further optimize sequential search by scanning from ++ right to left in attempt to use more efficient loop termination ++ condition (comparison with 0). This doesn't work. 
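++
++   A concrete sketch of the switchover (the numbers here are
++   hypothetical; the real threshold is the REISER4_SEQ_SEARCH_BREAK
++   constant defined elsewhere in this patch): with a break value of 3
++   and 40 items in the node, the binary loop below keeps halving
++   [left, right] until the window holds fewer than 3 headers, and only
++   that remnant is scanned sequentially. Because item headers are
++   stored right-to-left at the node's tail, the sequential pass
++   advances the header pointer (++ih) while the item index decreases
++   (--left), so memory is still touched left to right.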
++ ++ */ ++ ++ while (right - left >= REISER4_SEQ_SEARCH_BREAK) { ++ int median; ++ item_header40 *medianh; ++ ++ median = (left + right) / 2; ++ medianh = node40_ih_at(node, median); ++ ++ assert("nikita-1084", median >= 0); ++ assert("nikita-1085", median < items); ++ switch (keycmp(key, &medianh->key)) { ++ case LESS_THAN: ++ right = median; ++ righth = medianh; ++ break; ++ default: ++ wrong_return_value("nikita-586", "keycmp"); ++ case GREATER_THAN: ++ left = median; ++ lefth = medianh; ++ break; ++ case EQUAL_TO: ++ do { ++ --median; ++ /* headers are ordered from right to left */ ++ ++medianh; ++ } while (median >= 0 && keyeq(key, &medianh->key)); ++ right = left = median + 1; ++ ih = lefth = righth = medianh - 1; ++ found = 1; ++ break; ++ } ++ } ++ /* sequential scan. Item headers, and, therefore, keys are stored at ++ the rightmost part of a node from right to left. We are trying to ++ access memory from left to right, and hence, scan in _descending_ ++ order of item numbers. ++ */ ++ if (!found) { ++ for (left = right, ih = righth; left >= 0; ++ih, --left) { ++ cmp_t comparison; ++ ++ prefetchkey(&(ih + 1)->key); ++ comparison = keycmp(&ih->key, key); ++ if (comparison == GREATER_THAN) ++ continue; ++ if (comparison == EQUAL_TO) { ++ found = 1; ++ do { ++ --left; ++ ++ih; ++ } while (left >= 0 && keyeq(&ih->key, key)); ++ ++left; ++ --ih; ++ } else { ++ assert("nikita-1256", comparison == LESS_THAN); ++ } ++ break; ++ } ++ if (unlikely(left < 0)) ++ left = 0; ++ } ++ ++ assert("nikita-3212", right >= left); ++ assert("nikita-3214", ++ equi(found, keyeq(&node40_ih_at(node, left)->key, key))); ++ ++ coord_set_item_pos(coord, left); ++ coord->unit_pos = 0; ++ coord->between = AT_UNIT; ++ ++ /* key < leftmost key in a mode or node is corrupted and keys ++ are not sorted */ ++ bstop = node40_ih_at(node, (unsigned)left); ++ order = keycmp(&bstop->key, key); ++ if (unlikely(order == GREATER_THAN)) { ++ if (unlikely(left != 0)) { ++ /* screw up */ ++ warning("nikita-587", "Key less than %i key in a node", ++ left); ++ print_key("key", key); ++ print_key("min", &bstop->key); ++ print_coord_content("coord", coord); ++ return RETERR(-EIO); ++ } else { ++ coord->between = BEFORE_UNIT; ++ return NS_NOT_FOUND; ++ } ++ } ++ /* left <= key, ok */ ++ iplug = item_plugin_by_disk_id(znode_get_tree(node), &bstop->plugin_id); ++ ++ if (unlikely(iplug == NULL)) { ++ warning("nikita-588", "Unknown plugin %i", ++ le16_to_cpu(get_unaligned(&bstop->plugin_id))); ++ print_key("key", key); ++ print_coord_content("coord", coord); ++ return RETERR(-EIO); ++ } ++ ++ coord_set_iplug(coord, iplug); ++ ++ /* if exact key from item header was found by binary search, no ++ further checks are necessary. */ ++ if (found) { ++ assert("nikita-1259", order == EQUAL_TO); ++ return NS_FOUND; ++ } ++ if (iplug->b.max_key_inside != NULL) { ++ reiser4_key max_item_key; ++ ++ /* key > max_item_key --- outside of an item */ ++ if (keygt(key, iplug->b.max_key_inside(coord, &max_item_key))) { ++ coord->unit_pos = 0; ++ coord->between = AFTER_ITEM; ++ /* FIXME-VS: key we are looking for does not fit into ++ found item. Return NS_NOT_FOUND then. Without that ++ the following case does not work: there is extent of ++ file 10000, 10001. File 10000, 10002 has been just ++ created. When writing to position 0 in that file - ++ traverse_tree will stop here on twig level. 
When we ++ want it to go down to leaf level ++ */ ++ return NS_NOT_FOUND; ++ } ++ } ++ ++ if (iplug->b.lookup != NULL) { ++ return iplug->b.lookup(key, bias, coord); ++ } else { ++ assert("nikita-1260", order == LESS_THAN); ++ coord->between = AFTER_UNIT; ++ return (bias == FIND_EXACT) ? NS_NOT_FOUND : NS_FOUND; ++ } ++} ++ ++#undef NODE_ADDSTAT ++#undef NODE_INCSTAT ++ ++/* plugin->u.node.estimate ++ look for description of this method in plugin/node/node.h */ ++size_t estimate_node40(znode * node) ++{ ++ size_t result; ++ ++ assert("nikita-597", node != NULL); ++ ++ result = free_space_node40(node) - sizeof(item_header40); ++ ++ return (result > 0) ? result : 0; ++} ++ ++/* plugin->u.node.check ++ look for description of this method in plugin/node/node.h */ ++int check_node40(const znode * node /* node to check */ , ++ __u32 flags /* check flags */ , ++ const char **error /* where to store error message */ ) ++{ ++ int nr_items; ++ int i; ++ reiser4_key prev; ++ unsigned old_offset; ++ tree_level level; ++ coord_t coord; ++ int result; ++ ++ assert("nikita-580", node != NULL); ++ assert("nikita-581", error != NULL); ++ assert("nikita-2948", znode_is_loaded(node)); ++ ++ if (ZF_ISSET(node, JNODE_HEARD_BANSHEE)) ++ return 0; ++ ++ assert("nikita-582", zdata(node) != NULL); ++ ++ nr_items = node40_num_of_items_internal(node); ++ if (nr_items < 0) { ++ *error = "Negative number of items"; ++ return -1; ++ } ++ ++ if (flags & REISER4_NODE_DKEYS) ++ prev = *znode_get_ld_key((znode *) node); ++ else ++ prev = *min_key(); ++ ++ old_offset = 0; ++ coord_init_zero(&coord); ++ coord.node = (znode *) node; ++ coord.unit_pos = 0; ++ coord.between = AT_UNIT; ++ level = znode_get_level(node); ++ for (i = 0; i < nr_items; i++) { ++ item_header40 *ih; ++ reiser4_key unit_key; ++ unsigned j; ++ ++ ih = node40_ih_at(node, (unsigned)i); ++ coord_set_item_pos(&coord, i); ++ if ((ih40_get_offset(ih) >= ++ znode_size(node) - nr_items * sizeof(item_header40)) || ++ (ih40_get_offset(ih) < sizeof(node40_header))) { ++ *error = "Offset is out of bounds"; ++ return -1; ++ } ++ if (ih40_get_offset(ih) <= old_offset) { ++ *error = "Offsets are in wrong order"; ++ return -1; ++ } ++ if ((i == 0) && (ih40_get_offset(ih) != sizeof(node40_header))) { ++ *error = "Wrong offset of first item"; ++ return -1; ++ } ++ old_offset = ih40_get_offset(ih); ++ ++ if (keygt(&prev, &ih->key)) { ++ *error = "Keys are in wrong order"; ++ return -1; ++ } ++ if (!keyeq(&ih->key, unit_key_by_coord(&coord, &unit_key))) { ++ *error = "Wrong key of first unit"; ++ return -1; ++ } ++ prev = ih->key; ++ for (j = 0; j < coord_num_units(&coord); ++j) { ++ coord.unit_pos = j; ++ unit_key_by_coord(&coord, &unit_key); ++ if (keygt(&prev, &unit_key)) { ++ *error = "Unit keys are in wrong order"; ++ return -1; ++ } ++ prev = unit_key; ++ } ++ coord.unit_pos = 0; ++ if (level != TWIG_LEVEL && item_is_extent(&coord)) { ++ *error = "extent on the wrong level"; ++ return -1; ++ } ++ if (level == LEAF_LEVEL && item_is_internal(&coord)) { ++ *error = "internal item on the wrong level"; ++ return -1; ++ } ++ if (level != LEAF_LEVEL && ++ !item_is_internal(&coord) && !item_is_extent(&coord)) { ++ *error = "wrong item on the internal level"; ++ return -1; ++ } ++ if (level > TWIG_LEVEL && !item_is_internal(&coord)) { ++ *error = "non-internal item on the internal level"; ++ return -1; ++ } ++#if REISER4_DEBUG ++ if (item_plugin_by_coord(&coord)->b.check ++ && item_plugin_by_coord(&coord)->b.check(&coord, error)) ++ return -1; ++#endif ++ if (i) { ++ coord_t 
prev_coord; ++ /* two neighboring items can not be mergeable */ ++ coord_dup(&prev_coord, &coord); ++ coord_prev_item(&prev_coord); ++ if (are_items_mergeable(&prev_coord, &coord)) { ++ *error = "mergeable items in one node"; ++ return -1; ++ } ++ ++ } ++ } ++ ++ if ((flags & REISER4_NODE_DKEYS) && !node_is_empty(node)) { ++ coord_t coord; ++ item_plugin *iplug; ++ ++ coord_init_last_unit(&coord, node); ++ iplug = item_plugin_by_coord(&coord); ++ if ((item_is_extent(&coord) || item_is_tail(&coord)) && ++ iplug->s.file.append_key != NULL) { ++ reiser4_key mkey; ++ ++ iplug->s.file.append_key(&coord, &mkey); ++ set_key_offset(&mkey, get_key_offset(&mkey) - 1); ++ read_lock_dk(current_tree); ++ result = keygt(&mkey, znode_get_rd_key((znode *) node)); ++ read_unlock_dk(current_tree); ++ if (result) { ++ *error = "key of rightmost item is too large"; ++ return -1; ++ } ++ } ++ } ++ if (flags & REISER4_NODE_DKEYS) { ++ read_lock_tree(current_tree); ++ read_lock_dk(current_tree); ++ ++ flags |= REISER4_NODE_TREE_STABLE; ++ ++ if (keygt(&prev, znode_get_rd_key((znode *) node))) { ++ if (flags & REISER4_NODE_TREE_STABLE) { ++ *error = "Last key is greater than rdkey"; ++ read_unlock_dk(current_tree); ++ read_unlock_tree(current_tree); ++ return -1; ++ } ++ } ++ if (keygt ++ (znode_get_ld_key((znode *) node), ++ znode_get_rd_key((znode *) node))) { ++ *error = "ldkey is greater than rdkey"; ++ read_unlock_dk(current_tree); ++ read_unlock_tree(current_tree); ++ return -1; ++ } ++ if (ZF_ISSET(node, JNODE_LEFT_CONNECTED) && ++ (node->left != NULL) && ++ !ZF_ISSET(node->left, JNODE_HEARD_BANSHEE) && ++ ergo(flags & REISER4_NODE_TREE_STABLE, ++ !keyeq(znode_get_rd_key(node->left), ++ znode_get_ld_key((znode *) node))) ++ && ergo(!(flags & REISER4_NODE_TREE_STABLE), ++ keygt(znode_get_rd_key(node->left), ++ znode_get_ld_key((znode *) node)))) { ++ *error = "left rdkey or ldkey is wrong"; ++ read_unlock_dk(current_tree); ++ read_unlock_tree(current_tree); ++ return -1; ++ } ++ if (ZF_ISSET(node, JNODE_RIGHT_CONNECTED) && ++ (node->right != NULL) && ++ !ZF_ISSET(node->right, JNODE_HEARD_BANSHEE) && ++ ergo(flags & REISER4_NODE_TREE_STABLE, ++ !keyeq(znode_get_rd_key((znode *) node), ++ znode_get_ld_key(node->right))) ++ && ergo(!(flags & REISER4_NODE_TREE_STABLE), ++ keygt(znode_get_rd_key((znode *) node), ++ znode_get_ld_key(node->right)))) { ++ *error = "rdkey or right ldkey is wrong"; ++ read_unlock_dk(current_tree); ++ read_unlock_tree(current_tree); ++ return -1; ++ } ++ ++ read_unlock_dk(current_tree); ++ read_unlock_tree(current_tree); ++ } ++ ++ return 0; ++} ++ ++/* plugin->u.node.parse ++ look for description of this method in plugin/node/node.h */ ++int parse_node40(znode * node /* node to parse */ ) ++{ ++ node40_header *header; ++ int result; ++ d8 level; ++ ++ header = node40_node_header((znode *) node); ++ result = -EIO; ++ level = nh40_get_level(header); ++ if (unlikely(((__u8) znode_get_level(node)) != level)) ++ warning("nikita-494", "Wrong level found in node: %i != %i", ++ znode_get_level(node), level); ++ else if (unlikely(nh40_get_magic(header) != REISER4_NODE_MAGIC)) ++ warning("nikita-495", ++ "Wrong magic in tree node: want %x, got %x", ++ REISER4_NODE_MAGIC, nh40_get_magic(header)); ++ else { ++ node->nr_items = node40_num_of_items_internal(node); ++ result = 0; ++ } ++ if (unlikely(result != 0)) ++ /* print_znode("node", node) */ ; ++ return RETERR(result); ++} ++ ++/* plugin->u.node.init ++ look for description of this method in plugin/node/node.h */ ++int init_node40(znode * 
node /* node to initialise */ ) ++{ ++ node40_header *header; ++ ++ assert("nikita-570", node != NULL); ++ assert("nikita-572", zdata(node) != NULL); ++ ++ header = node40_node_header(node); ++ memset(header, 0, sizeof(node40_header)); ++ nh40_set_free_space(header, znode_size(node) - sizeof(node40_header)); ++ nh40_set_free_space_start(header, sizeof(node40_header)); ++ /* sane hypothesis: 0 in CPU format is 0 in disk format */ ++ /* items: 0 */ ++ save_plugin_id(node_plugin_to_plugin(node->nplug), ++ &header->common_header.plugin_id); ++ nh40_set_level(header, znode_get_level(node)); ++ nh40_set_magic(header, REISER4_NODE_MAGIC); ++ node->nr_items = 0; ++ nh40_set_mkfs_id(header, reiser4_mkfs_id(reiser4_get_current_sb())); ++ ++ /* flags: 0 */ ++ return 0; ++} ++ ++#ifdef GUESS_EXISTS ++int guess_node40(const znode * node /* node to guess plugin of */ ) ++{ ++ node40_header *nethack; ++ ++ assert("nikita-1058", node != NULL); ++ nethack = node40_node_header(node); ++ return ++ (nh40_get_magic(nethack) == REISER4_NODE_MAGIC) && ++ (plugin_by_disk_id(znode_get_tree(node), ++ REISER4_NODE_PLUGIN_TYPE, ++ &nethack->common_header.plugin_id)->h.id == ++ NODE40_ID); ++} ++#endif ++ ++/* plugin->u.node.chage_item_size ++ look for description of this method in plugin/node/node.h */ ++void change_item_size_node40(coord_t * coord, int by) ++{ ++ node40_header *nh; ++ item_header40 *ih; ++ char *item_data; ++ int item_length; ++ unsigned i; ++ ++ /* make sure that @item is coord of existing item */ ++ assert("vs-210", coord_is_existing_item(coord)); ++ ++ nh = node40_node_header(coord->node); ++ ++ item_data = item_by_coord_node40(coord); ++ item_length = length_by_coord_node40(coord); ++ ++ /* move item bodies */ ++ ih = node40_ih_at_coord(coord); ++ memmove(item_data + item_length + by, item_data + item_length, ++ nh40_get_free_space_start(node40_node_header(coord->node)) - ++ (ih40_get_offset(ih) + item_length)); ++ ++ /* update offsets of moved items */ ++ for (i = coord->item_pos + 1; i < nh40_get_num_items(nh); i++) { ++ ih = node40_ih_at(coord->node, i); ++ ih40_set_offset(ih, ih40_get_offset(ih) + by); ++ } ++ ++ /* update node header */ ++ nh40_set_free_space(nh, nh40_get_free_space(nh) - by); ++ nh40_set_free_space_start(nh, nh40_get_free_space_start(nh) + by); ++} ++ ++static int should_notify_parent(const znode * node) ++{ ++ /* FIXME_JMACD This looks equivalent to znode_is_root(), right? 
-josh */ ++ return !disk_addr_eq(znode_get_block(node), ++ &znode_get_tree(node)->root_block); ++} ++ ++/* plugin->u.node.create_item ++ look for description of this method in plugin/node/node.h */ ++int ++create_item_node40(coord_t *target, const reiser4_key *key, ++ reiser4_item_data *data, carry_plugin_info *info) ++{ ++ node40_header *nh; ++ item_header40 *ih; ++ unsigned offset; ++ unsigned i; ++ ++ nh = node40_node_header(target->node); ++ ++ assert("vs-212", coord_is_between_items(target)); ++ /* node must have enough free space */ ++ assert("vs-254", ++ free_space_node40(target->node) >= ++ data->length + sizeof(item_header40)); ++ assert("vs-1410", data->length >= 0); ++ ++ if (coord_set_to_right(target)) ++ /* there are not items to the right of @target, so, new item ++ will be inserted after last one */ ++ coord_set_item_pos(target, nh40_get_num_items(nh)); ++ ++ if (target->item_pos < nh40_get_num_items(nh)) { ++ /* there are items to be moved to prepare space for new ++ item */ ++ ih = node40_ih_at_coord(target); ++ /* new item will start at this offset */ ++ offset = ih40_get_offset(ih); ++ ++ memmove(zdata(target->node) + offset + data->length, ++ zdata(target->node) + offset, ++ nh40_get_free_space_start(nh) - offset); ++ /* update headers of moved items */ ++ for (i = target->item_pos; i < nh40_get_num_items(nh); i++) { ++ ih = node40_ih_at(target->node, i); ++ ih40_set_offset(ih, ih40_get_offset(ih) + data->length); ++ } ++ ++ /* @ih is set to item header of the last item, move item headers */ ++ memmove(ih - 1, ih, ++ sizeof(item_header40) * (nh40_get_num_items(nh) - ++ target->item_pos)); ++ } else { ++ /* new item will start at this offset */ ++ offset = nh40_get_free_space_start(nh); ++ } ++ ++ /* make item header for the new item */ ++ ih = node40_ih_at_coord(target); ++ memcpy(&ih->key, key, sizeof(reiser4_key)); ++ ih40_set_offset(ih, offset); ++ save_plugin_id(item_plugin_to_plugin(data->iplug), &ih->plugin_id); ++ ++ /* update node header */ ++ nh40_set_free_space(nh, ++ nh40_get_free_space(nh) - data->length - ++ sizeof(item_header40)); ++ nh40_set_free_space_start(nh, ++ nh40_get_free_space_start(nh) + data->length); ++ node40_set_num_items(target->node, nh, nh40_get_num_items(nh) + 1); ++ ++ /* FIXME: check how does create_item work when between is set to BEFORE_UNIT */ ++ target->unit_pos = 0; ++ target->between = AT_UNIT; ++ coord_clear_iplug(target); ++ ++ /* initialize item */ ++ if (data->iplug->b.init != NULL) { ++ data->iplug->b.init(target, NULL, data); ++ } ++ /* copy item body */ ++ if (data->iplug->b.paste != NULL) { ++ data->iplug->b.paste(target, data, info); ++ } else if (data->data != NULL) { ++ if (data->user) { ++ /* AUDIT: Are we really should not check that pointer ++ from userspace was valid and data bytes were ++ available? How will we return -EFAULT of some kind ++ without this check? 
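++
++   (A sketch of the checked alternative, using the standard kernel
++   primitive rather than anything defined by this patch: the caller
++   could do
++
++	if (copy_from_user(zdata(target->node) + offset,
++			   (const char __user *)data->data,
++			   (unsigned)data->length))
++		return RETERR(-EFAULT);
++
++   copy_from_user() performs the access_ok() validation that the
++   __copy_from_user() call below skips, and returns the number of
++   bytes it could not copy, which is what an -EFAULT path would key
++   off.)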
*/ ++ assert("nikita-3038", schedulable()); ++ /* copy data from user space */ ++ __copy_from_user(zdata(target->node) + offset, ++ (const char __user *)data->data, ++ (unsigned)data->length); ++ } else ++ /* copy from kernel space */ ++ memcpy(zdata(target->node) + offset, data->data, ++ (unsigned)data->length); ++ } ++ ++ if (target->item_pos == 0) { ++ /* left delimiting key has to be updated */ ++ prepare_for_update(NULL, target->node, info); ++ } ++ ++ if (item_plugin_by_coord(target)->b.create_hook != NULL) { ++ item_plugin_by_coord(target)->b.create_hook(target, data->arg); ++ } ++ ++ return 0; ++} ++ ++/* plugin->u.node.update_item_key ++ look for description of this method in plugin/node/node.h */ ++void ++update_item_key_node40(coord_t * target, const reiser4_key * key, ++ carry_plugin_info * info) ++{ ++ item_header40 *ih; ++ ++ ih = node40_ih_at_coord(target); ++ memcpy(&ih->key, key, sizeof(reiser4_key)); ++ ++ if (target->item_pos == 0) { ++ prepare_for_update(NULL, target->node, info); ++ } ++} ++ ++/* this bits encode cut mode */ ++#define CMODE_TAIL 1 ++#define CMODE_WHOLE 2 ++#define CMODE_HEAD 4 ++ ++struct cut40_info { ++ int mode; ++ pos_in_node_t tail_removed; /* position of item which gets tail removed */ ++ pos_in_node_t first_removed; /* position of first the leftmost item among items removed completely */ ++ pos_in_node_t removed_count; /* number of items removed completely */ ++ pos_in_node_t head_removed; /* position of item which gets head removed */ ++ ++ pos_in_node_t freed_space_start; ++ pos_in_node_t freed_space_end; ++ pos_in_node_t first_moved; ++ pos_in_node_t head_removed_location; ++}; ++ ++static void init_cinfo(struct cut40_info *cinfo) ++{ ++ cinfo->mode = 0; ++ cinfo->tail_removed = MAX_POS_IN_NODE; ++ cinfo->first_removed = MAX_POS_IN_NODE; ++ cinfo->removed_count = MAX_POS_IN_NODE; ++ cinfo->head_removed = MAX_POS_IN_NODE; ++ cinfo->freed_space_start = MAX_POS_IN_NODE; ++ cinfo->freed_space_end = MAX_POS_IN_NODE; ++ cinfo->first_moved = MAX_POS_IN_NODE; ++ cinfo->head_removed_location = MAX_POS_IN_NODE; ++} ++ ++/* complete cut_node40/kill_node40 content by removing the gap created by */ ++static void compact(znode * node, struct cut40_info *cinfo) ++{ ++ node40_header *nh; ++ item_header40 *ih; ++ pos_in_node_t freed; ++ pos_in_node_t pos, nr_items; ++ ++ assert("vs-1526", (cinfo->freed_space_start != MAX_POS_IN_NODE && ++ cinfo->freed_space_end != MAX_POS_IN_NODE && ++ cinfo->first_moved != MAX_POS_IN_NODE)); ++ assert("vs-1523", cinfo->freed_space_end >= cinfo->freed_space_start); ++ ++ nh = node40_node_header(node); ++ nr_items = nh40_get_num_items(nh); ++ ++ /* remove gap made up by removal */ ++ memmove(zdata(node) + cinfo->freed_space_start, ++ zdata(node) + cinfo->freed_space_end, ++ nh40_get_free_space_start(nh) - cinfo->freed_space_end); ++ ++ /* update item headers of moved items - change their locations */ ++ pos = cinfo->first_moved; ++ ih = node40_ih_at(node, pos); ++ if (cinfo->head_removed_location != MAX_POS_IN_NODE) { ++ assert("vs-1580", pos == cinfo->head_removed); ++ ih40_set_offset(ih, cinfo->head_removed_location); ++ pos++; ++ ih--; ++ } ++ ++ freed = cinfo->freed_space_end - cinfo->freed_space_start; ++ for (; pos < nr_items; pos++, ih--) { ++ assert("vs-1581", ih == node40_ih_at(node, pos)); ++ ih40_set_offset(ih, ih40_get_offset(ih) - freed); ++ } ++ ++ /* free space start moved to right */ ++ nh40_set_free_space_start(nh, nh40_get_free_space_start(nh) - freed); ++ ++ if (cinfo->removed_count != MAX_POS_IN_NODE) { ++ 
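++		/* A sketch of the header move below (item positions are
++		 * hypothetical): with nr_items == 5, first_removed == 1 and
++		 * removed_count == 2, the headers of surviving items 3 and 4
++		 * slide right over the two dead slots. Headers grow from the
++		 * node's end leftward, so in memory order:
++		 *
++		 *   before: [ih4][ih3][ih2][ih1][ih0]
++		 *   after:  [ .. ][ .. ][ih4][ih3][ih0]
++		 *
++		 * and old items 3, 4 become items 1, 2. */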
/* number of items changed. Remove item headers of those items */ ++ ih = node40_ih_at(node, nr_items - 1); ++ memmove(ih + cinfo->removed_count, ih, ++ sizeof(item_header40) * (nr_items - ++ cinfo->removed_count - ++ cinfo->first_removed)); ++ freed += sizeof(item_header40) * cinfo->removed_count; ++ node40_set_num_items(node, nh, nr_items - cinfo->removed_count); ++ } ++ ++ /* total amount of free space increased */ ++ nh40_set_free_space(nh, nh40_get_free_space(nh) + freed); ++} ++ ++int shrink_item_node40(coord_t * coord, int delta) ++{ ++ node40_header *nh; ++ item_header40 *ih; ++ pos_in_node_t pos; ++ pos_in_node_t nr_items; ++ char *end; ++ znode *node; ++ int off; ++ ++ assert("nikita-3487", coord != NULL); ++ assert("nikita-3488", delta >= 0); ++ ++ node = coord->node; ++ nh = node40_node_header(node); ++ nr_items = nh40_get_num_items(nh); ++ ++ ih = node40_ih_at_coord(coord); ++ assert("nikita-3489", delta <= length_by_coord_node40(coord)); ++ off = ih40_get_offset(ih) + length_by_coord_node40(coord); ++ end = zdata(node) + off; ++ ++ /* remove gap made up by removal */ ++ memmove(end - delta, end, nh40_get_free_space_start(nh) - off); ++ ++ /* update item headers of moved items - change their locations */ ++ pos = coord->item_pos + 1; ++ ih = node40_ih_at(node, pos); ++ for (; pos < nr_items; pos++, ih--) { ++ assert("nikita-3490", ih == node40_ih_at(node, pos)); ++ ih40_set_offset(ih, ih40_get_offset(ih) - delta); ++ } ++ ++ /* free space start moved to left */ ++ nh40_set_free_space_start(nh, nh40_get_free_space_start(nh) - delta); ++ /* total amount of free space increased */ ++ nh40_set_free_space(nh, nh40_get_free_space(nh) + delta); ++ /* ++ * This method does _not_ changes number of items. Hence, it cannot ++ * make node empty. Also it doesn't remove items at all, which means ++ * that no keys have to be updated either. ++ */ ++ return 0; ++} ++ ++/* this is used by cut_node40 and kill_node40. It analyses input parameters and calculates cut mode. There are 2 types ++ of cut. First is when a unit is removed from the middle of an item. In this case this function returns 1. All the ++ rest fits into second case: 0 or 1 of items getting tail cut, 0 or more items removed completely and 0 or 1 item ++ getting head cut. 
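++   For example (item positions hypothetical): cutting from the middle
++   of item 2 through the middle of item 5 yields mode CMODE_TAIL |
++   CMODE_WHOLE | CMODE_HEAD with tail_removed == 2, first_removed == 3,
++   removed_count == 2 (items 3 and 4) and head_removed == 5.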
Function returns 0 in this case */ ++static int ++parse_cut(struct cut40_info *cinfo, const struct cut_kill_params *params) ++{ ++ reiser4_key left_key, right_key; ++ reiser4_key min_from_key, max_to_key; ++ const reiser4_key *from_key, *to_key; ++ ++ init_cinfo(cinfo); ++ ++ /* calculate minimal key stored in first item of items to be cut (params->from) */ ++ item_key_by_coord(params->from, &min_from_key); ++ /* and max key stored in last item of items to be cut (params->to) */ ++ max_item_key_by_coord(params->to, &max_to_key); ++ ++ /* if cut key range is not defined in input parameters - define it using cut coord range */ ++ if (params->from_key == NULL) { ++ assert("vs-1513", params->to_key == NULL); ++ unit_key_by_coord(params->from, &left_key); ++ from_key = &left_key; ++ max_unit_key_by_coord(params->to, &right_key); ++ to_key = &right_key; ++ } else { ++ from_key = params->from_key; ++ to_key = params->to_key; ++ } ++ ++ if (params->from->item_pos == params->to->item_pos) { ++ if (keylt(&min_from_key, from_key) ++ && keylt(to_key, &max_to_key)) ++ return 1; ++ ++ if (keygt(from_key, &min_from_key)) { ++ /* tail of item is to be cut cut */ ++ cinfo->tail_removed = params->from->item_pos; ++ cinfo->mode |= CMODE_TAIL; ++ } else if (keylt(to_key, &max_to_key)) { ++ /* head of item is to be cut */ ++ cinfo->head_removed = params->from->item_pos; ++ cinfo->mode |= CMODE_HEAD; ++ } else { ++ /* item is removed completely */ ++ cinfo->first_removed = params->from->item_pos; ++ cinfo->removed_count = 1; ++ cinfo->mode |= CMODE_WHOLE; ++ } ++ } else { ++ cinfo->first_removed = params->from->item_pos + 1; ++ cinfo->removed_count = ++ params->to->item_pos - params->from->item_pos - 1; ++ ++ if (keygt(from_key, &min_from_key)) { ++ /* first item is not cut completely */ ++ cinfo->tail_removed = params->from->item_pos; ++ cinfo->mode |= CMODE_TAIL; ++ } else { ++ cinfo->first_removed--; ++ cinfo->removed_count++; ++ } ++ if (keylt(to_key, &max_to_key)) { ++ /* last item is not cut completely */ ++ cinfo->head_removed = params->to->item_pos; ++ cinfo->mode |= CMODE_HEAD; ++ } else { ++ cinfo->removed_count++; ++ } ++ if (cinfo->removed_count) ++ cinfo->mode |= CMODE_WHOLE; ++ } ++ ++ return 0; ++} ++ ++static void ++call_kill_hooks(znode * node, pos_in_node_t from, pos_in_node_t count, ++ carry_kill_data * kdata) ++{ ++ coord_t coord; ++ item_plugin *iplug; ++ pos_in_node_t pos; ++ ++ coord.node = node; ++ coord.unit_pos = 0; ++ coord.between = AT_UNIT; ++ for (pos = 0; pos < count; pos++) { ++ coord_set_item_pos(&coord, from + pos); ++ coord.unit_pos = 0; ++ coord.between = AT_UNIT; ++ iplug = item_plugin_by_coord(&coord); ++ if (iplug->b.kill_hook) { ++ iplug->b.kill_hook(&coord, 0, coord_num_units(&coord), ++ kdata); ++ } ++ } ++} ++ ++/* this is used to kill item partially */ ++static pos_in_node_t ++kill_units(coord_t * coord, pos_in_node_t from, pos_in_node_t to, void *data, ++ reiser4_key * smallest_removed, reiser4_key * new_first_key) ++{ ++ struct carry_kill_data *kdata; ++ item_plugin *iplug; ++ ++ kdata = data; ++ iplug = item_plugin_by_coord(coord); ++ ++ assert("vs-1524", iplug->b.kill_units); ++ return iplug->b.kill_units(coord, from, to, kdata, smallest_removed, ++ new_first_key); ++} ++ ++/* call item plugin to cut tail of file */ ++static pos_in_node_t ++kill_tail(coord_t * coord, void *data, reiser4_key * smallest_removed) ++{ ++ struct carry_kill_data *kdata; ++ pos_in_node_t to; ++ ++ kdata = data; ++ to = coord_last_unit_pos(coord); ++ return kill_units(coord, 
coord->unit_pos, to, kdata, smallest_removed, ++ NULL); ++} ++ ++/* call item plugin to cut head of item */ ++static pos_in_node_t ++kill_head(coord_t * coord, void *data, reiser4_key * smallest_removed, ++ reiser4_key * new_first_key) ++{ ++ return kill_units(coord, 0, coord->unit_pos, data, smallest_removed, ++ new_first_key); ++} ++ ++/* this is used to cut item partially */ ++static pos_in_node_t ++cut_units(coord_t * coord, pos_in_node_t from, pos_in_node_t to, void *data, ++ reiser4_key * smallest_removed, reiser4_key * new_first_key) ++{ ++ carry_cut_data *cdata; ++ item_plugin *iplug; ++ ++ cdata = data; ++ iplug = item_plugin_by_coord(coord); ++ assert("vs-302", iplug->b.cut_units); ++ return iplug->b.cut_units(coord, from, to, cdata, smallest_removed, ++ new_first_key); ++} ++ ++/* call item plugin to cut tail of file */ ++static pos_in_node_t ++cut_tail(coord_t * coord, void *data, reiser4_key * smallest_removed) ++{ ++ carry_cut_data *cdata; ++ pos_in_node_t to; ++ ++ cdata = data; ++ to = coord_last_unit_pos(cdata->params.from); ++ return cut_units(coord, coord->unit_pos, to, data, smallest_removed, NULL); ++} ++ ++/* call item plugin to cut head of item */ ++static pos_in_node_t ++cut_head(coord_t * coord, void *data, reiser4_key * smallest_removed, ++ reiser4_key * new_first_key) ++{ ++ return cut_units(coord, 0, coord->unit_pos, data, smallest_removed, ++ new_first_key); ++} ++ ++/* this returns 1 of key of first item changed, 0 - if it did not */ ++static int ++prepare_for_compact(struct cut40_info *cinfo, ++ const struct cut_kill_params *params, int is_cut, ++ void *data, carry_plugin_info * info) ++{ ++ znode *node; ++ item_header40 *ih; ++ pos_in_node_t freed; ++ pos_in_node_t item_pos; ++ coord_t coord; ++ reiser4_key new_first_key; ++ pos_in_node_t(*kill_units_f) (coord_t *, pos_in_node_t, pos_in_node_t, ++ void *, reiser4_key *, reiser4_key *); ++ pos_in_node_t(*kill_tail_f) (coord_t *, void *, reiser4_key *); ++ pos_in_node_t(*kill_head_f) (coord_t *, void *, reiser4_key *, ++ reiser4_key *); ++ int retval; ++ ++ retval = 0; ++ ++ node = params->from->node; ++ ++ assert("vs-184", node == params->to->node); ++ assert("vs-312", !node_is_empty(node)); ++ assert("vs-297", ++ coord_compare(params->from, params->to) != COORD_CMP_ON_RIGHT); ++ ++ if (is_cut) { ++ kill_units_f = cut_units; ++ kill_tail_f = cut_tail; ++ kill_head_f = cut_head; ++ } else { ++ kill_units_f = kill_units; ++ kill_tail_f = kill_tail; ++ kill_head_f = kill_head; ++ } ++ ++ if (parse_cut(cinfo, params) == 1) { ++ /* cut from the middle of item */ ++ freed = ++ kill_units_f(params->from, params->from->unit_pos, ++ params->to->unit_pos, data, ++ params->smallest_removed, NULL); ++ ++ item_pos = params->from->item_pos; ++ ih = node40_ih_at(node, item_pos); ++ cinfo->freed_space_start = ++ ih40_get_offset(ih) + node40_item_length(node, ++ item_pos) - freed; ++ cinfo->freed_space_end = cinfo->freed_space_start + freed; ++ cinfo->first_moved = item_pos + 1; ++ } else { ++ assert("vs-1521", (cinfo->tail_removed != MAX_POS_IN_NODE || ++ cinfo->first_removed != MAX_POS_IN_NODE || ++ cinfo->head_removed != MAX_POS_IN_NODE)); ++ ++ switch (cinfo->mode) { ++ case CMODE_TAIL: ++ /* one item gets cut partially from its end */ ++ assert("vs-1562", ++ cinfo->tail_removed == params->from->item_pos); ++ ++ freed = ++ kill_tail_f(params->from, data, ++ params->smallest_removed); ++ ++ item_pos = cinfo->tail_removed; ++ ih = node40_ih_at(node, item_pos); ++ cinfo->freed_space_start = ++ ih40_get_offset(ih) + 
node40_item_length(node, ++ item_pos) - ++ freed; ++ cinfo->freed_space_end = ++ cinfo->freed_space_start + freed; ++ cinfo->first_moved = cinfo->tail_removed + 1; ++ break; ++ ++ case CMODE_WHOLE: ++ /* one or more items get removed completely */ ++ assert("vs-1563", ++ cinfo->first_removed == params->from->item_pos); ++ assert("vs-1564", cinfo->removed_count > 0 ++ && cinfo->removed_count != MAX_POS_IN_NODE); ++ ++ /* call kill hook for all items removed completely */ ++ if (is_cut == 0) ++ call_kill_hooks(node, cinfo->first_removed, ++ cinfo->removed_count, data); ++ ++ item_pos = cinfo->first_removed; ++ ih = node40_ih_at(node, item_pos); ++ ++ if (params->smallest_removed) ++ memcpy(params->smallest_removed, &ih->key, ++ sizeof(reiser4_key)); ++ ++ cinfo->freed_space_start = ih40_get_offset(ih); ++ ++ item_pos += (cinfo->removed_count - 1); ++ ih -= (cinfo->removed_count - 1); ++ cinfo->freed_space_end = ++ ih40_get_offset(ih) + node40_item_length(node, ++ item_pos); ++ cinfo->first_moved = item_pos + 1; ++ if (cinfo->first_removed == 0) ++ /* key of first item of the node changes */ ++ retval = 1; ++ break; ++ ++ case CMODE_HEAD: ++ /* one item gets cut partially from its head */ ++ assert("vs-1565", ++ cinfo->head_removed == params->from->item_pos); ++ ++ freed = ++ kill_head_f(params->to, data, ++ params->smallest_removed, ++ &new_first_key); ++ ++ item_pos = cinfo->head_removed; ++ ih = node40_ih_at(node, item_pos); ++ cinfo->freed_space_start = ih40_get_offset(ih); ++ cinfo->freed_space_end = ih40_get_offset(ih) + freed; ++ cinfo->first_moved = cinfo->head_removed + 1; ++ ++ /* item head is removed, therefore, item key changed */ ++ coord.node = node; ++ coord_set_item_pos(&coord, item_pos); ++ coord.unit_pos = 0; ++ coord.between = AT_UNIT; ++ update_item_key_node40(&coord, &new_first_key, NULL); ++ if (item_pos == 0) ++ /* key of first item of the node changes */ ++ retval = 1; ++ break; ++ ++ case CMODE_TAIL | CMODE_WHOLE: ++ /* one item gets cut from its end and one or more items get removed completely */ ++ assert("vs-1566", ++ cinfo->tail_removed == params->from->item_pos); ++ assert("vs-1567", ++ cinfo->first_removed == cinfo->tail_removed + 1); ++ assert("vs-1564", cinfo->removed_count > 0 ++ && cinfo->removed_count != MAX_POS_IN_NODE); ++ ++ freed = ++ kill_tail_f(params->from, data, ++ params->smallest_removed); ++ ++ item_pos = cinfo->tail_removed; ++ ih = node40_ih_at(node, item_pos); ++ cinfo->freed_space_start = ++ ih40_get_offset(ih) + node40_item_length(node, ++ item_pos) - ++ freed; ++ ++ /* call kill hook for all items removed completely */ ++ if (is_cut == 0) ++ call_kill_hooks(node, cinfo->first_removed, ++ cinfo->removed_count, data); ++ ++ item_pos += cinfo->removed_count; ++ ih -= cinfo->removed_count; ++ cinfo->freed_space_end = ++ ih40_get_offset(ih) + node40_item_length(node, ++ item_pos); ++ cinfo->first_moved = item_pos + 1; ++ break; ++ ++ case CMODE_WHOLE | CMODE_HEAD: ++ /* one or more items get removed completely and one item gets cut partially from its head */ ++ assert("vs-1568", ++ cinfo->first_removed == params->from->item_pos); ++ assert("vs-1564", cinfo->removed_count > 0 ++ && cinfo->removed_count != MAX_POS_IN_NODE); ++ assert("vs-1569", ++ cinfo->head_removed == ++ cinfo->first_removed + cinfo->removed_count); ++ ++ /* call kill hook for all items removed completely */ ++ if (is_cut == 0) ++ call_kill_hooks(node, cinfo->first_removed, ++ cinfo->removed_count, data); ++ ++ item_pos = cinfo->first_removed; ++ ih = node40_ih_at(node, 
item_pos); ++ ++ if (params->smallest_removed) ++ memcpy(params->smallest_removed, &ih->key, ++ sizeof(reiser4_key)); ++ ++ freed = ++ kill_head_f(params->to, data, NULL, &new_first_key); ++ ++ cinfo->freed_space_start = ih40_get_offset(ih); ++ ++ ih = node40_ih_at(node, cinfo->head_removed); ++ /* this is the most complex case. Item which got head removed and items which are to be moved ++ intact change their location differently. */ ++ cinfo->freed_space_end = ih40_get_offset(ih) + freed; ++ cinfo->first_moved = cinfo->head_removed; ++ cinfo->head_removed_location = cinfo->freed_space_start; ++ ++ /* item head is removed, therefore, item key changed */ ++ coord.node = node; ++ coord_set_item_pos(&coord, cinfo->head_removed); ++ coord.unit_pos = 0; ++ coord.between = AT_UNIT; ++ update_item_key_node40(&coord, &new_first_key, NULL); ++ ++ assert("vs-1579", cinfo->first_removed == 0); ++ /* key of first item of the node changes */ ++ retval = 1; ++ break; ++ ++ case CMODE_TAIL | CMODE_HEAD: ++ /* one item get cut from its end and its neighbor gets cut from its tail */ ++ impossible("vs-1576", "this can not happen currently"); ++ break; ++ ++ case CMODE_TAIL | CMODE_WHOLE | CMODE_HEAD: ++ impossible("vs-1577", "this can not happen currently"); ++ break; ++ default: ++ impossible("vs-1578", "unexpected cut mode"); ++ break; ++ } ++ } ++ return retval; ++} ++ ++/* plugin->u.node.kill ++ return value is number of items removed completely */ ++int kill_node40(struct carry_kill_data *kdata, carry_plugin_info * info) ++{ ++ znode *node; ++ struct cut40_info cinfo; ++ int first_key_changed; ++ ++ node = kdata->params.from->node; ++ ++ first_key_changed = ++ prepare_for_compact(&cinfo, &kdata->params, 0 /* not cut */ , kdata, ++ info); ++ compact(node, &cinfo); ++ ++ if (info) { ++ /* it is not called by node40_shift, so we have to take care ++ of changes on upper levels */ ++ if (node_is_empty(node) ++ && !(kdata->flags & DELETE_RETAIN_EMPTY)) ++ /* all contents of node is deleted */ ++ prepare_removal_node40(node, info); ++ else if (first_key_changed) { ++ prepare_for_update(NULL, node, info); ++ } ++ } ++ ++ coord_clear_iplug(kdata->params.from); ++ coord_clear_iplug(kdata->params.to); ++ ++ znode_make_dirty(node); ++ return cinfo.removed_count == MAX_POS_IN_NODE ? 0 : cinfo.removed_count; ++} ++ ++/* plugin->u.node.cut ++ return value is number of items removed completely */ ++int cut_node40(struct carry_cut_data *cdata, carry_plugin_info * info) ++{ ++ znode *node; ++ struct cut40_info cinfo; ++ int first_key_changed; ++ ++ node = cdata->params.from->node; ++ ++ first_key_changed = ++ prepare_for_compact(&cinfo, &cdata->params, 1 /* not cut */ , cdata, ++ info); ++ compact(node, &cinfo); ++ ++ if (info) { ++ /* it is not called by node40_shift, so we have to take care ++ of changes on upper levels */ ++ if (node_is_empty(node)) ++ /* all contents of node is deleted */ ++ prepare_removal_node40(node, info); ++ else if (first_key_changed) { ++ prepare_for_update(NULL, node, info); ++ } ++ } ++ ++ coord_clear_iplug(cdata->params.from); ++ coord_clear_iplug(cdata->params.to); ++ ++ znode_make_dirty(node); ++ return cinfo.removed_count == MAX_POS_IN_NODE ? 
0 : cinfo.removed_count; ++} ++ ++/* this structure is used by shift method of node40 plugin */ ++struct shift_params { ++ shift_direction pend; /* when @pend == append - we are shifting to ++ left, when @pend == prepend - to right */ ++ coord_t wish_stop; /* when shifting to left this is last unit we ++ want shifted, when shifting to right - this ++ is set to unit we want to start shifting ++ from */ ++ znode *target; ++ int everything; /* it is set to 1 if everything we have to shift is ++ shifted, 0 - otherwise */ ++ ++ /* FIXME-VS: get rid of read_stop */ ++ ++ /* these are set by estimate_shift */ ++ coord_t real_stop; /* this will be set to last unit which will be ++ really shifted */ ++ ++ /* coordinate in source node before operation of unit which becomes ++ first after shift to left of last after shift to right */ ++ union { ++ coord_t future_first; ++ coord_t future_last; ++ } u; ++ ++ unsigned merging_units; /* number of units of first item which have to ++ be merged with last item of target node */ ++ unsigned merging_bytes; /* number of bytes in those units */ ++ ++ unsigned entire; /* items shifted in their entirety */ ++ unsigned entire_bytes; /* number of bytes in those items */ ++ ++ unsigned part_units; /* number of units of partially copied item */ ++ unsigned part_bytes; /* number of bytes in those units */ ++ ++ unsigned shift_bytes; /* total number of bytes in items shifted (item ++ headers not included) */ ++ ++}; ++ ++static int item_creation_overhead(coord_t *item) ++{ ++ return node_plugin_by_coord(item)->item_overhead(item->node, NULL); ++} ++ ++/* how many units are there in @source starting from source->unit_pos ++ but not further than @stop_coord */ ++static int ++wanted_units(coord_t *source, coord_t *stop_coord, shift_direction pend) ++{ ++ if (pend == SHIFT_LEFT) { ++ assert("vs-181", source->unit_pos == 0); ++ } else { ++ assert("vs-182", ++ source->unit_pos == coord_last_unit_pos(source)); ++ } ++ ++ if (source->item_pos != stop_coord->item_pos) { ++ /* @source and @stop_coord are different items */ ++ return coord_last_unit_pos(source) + 1; ++ } ++ ++ if (pend == SHIFT_LEFT) { ++ return stop_coord->unit_pos + 1; ++ } else { ++ return source->unit_pos - stop_coord->unit_pos + 1; ++ } ++} ++ ++/* this calculates what can be copied from @shift->wish_stop.node to ++ @shift->target */ ++static void ++estimate_shift(struct shift_params *shift, const reiser4_context * ctx) ++{ ++ unsigned target_free_space, size; ++ pos_in_node_t stop_item; /* item which estimating should not consider */ ++ unsigned want; /* number of units of item we want shifted */ ++ coord_t source; /* item being estimated */ ++ item_plugin *iplug; ++ ++ /* shifting to left/right starts from first/last units of ++ @shift->wish_stop.node */ ++ if (shift->pend == SHIFT_LEFT) { ++ coord_init_first_unit(&source, shift->wish_stop.node); ++ } else { ++ coord_init_last_unit(&source, shift->wish_stop.node); ++ } ++ shift->real_stop = source; ++ ++ /* free space in target node and number of items in source */ ++ target_free_space = znode_free_space(shift->target); ++ ++ shift->everything = 0; ++ if (!node_is_empty(shift->target)) { ++ /* target node is not empty, check for boundary items ++ mergeability */ ++ coord_t to; ++ ++ /* item we try to merge @source with */ ++ if (shift->pend == SHIFT_LEFT) { ++ coord_init_last_unit(&to, shift->target); ++ } else { ++ coord_init_first_unit(&to, shift->target); ++ } ++ ++ if ((shift->pend == SHIFT_LEFT) ? 
are_items_mergeable(&to, ++ &source) : ++ are_items_mergeable(&source, &to)) { ++ /* how many units of @source do we want to merge to ++ item @to */ ++ want = ++ wanted_units(&source, &shift->wish_stop, ++ shift->pend); ++ ++ /* how many units of @source we can merge to item ++ @to */ ++ iplug = item_plugin_by_coord(&source); ++ if (iplug->b.can_shift != NULL) ++ shift->merging_units = ++ iplug->b.can_shift(target_free_space, ++ &source, shift->target, ++ shift->pend, &size, ++ want); ++ else { ++ shift->merging_units = 0; ++ size = 0; ++ } ++ shift->merging_bytes = size; ++ shift->shift_bytes += size; ++ /* update stop coord to be set to last unit of @source ++ we can merge to @target */ ++ if (shift->merging_units) ++ /* at least one unit can be shifted */ ++ shift->real_stop.unit_pos = ++ (shift->merging_units - source.unit_pos - ++ 1) * shift->pend; ++ else { ++ /* nothing can be shifted */ ++ if (shift->pend == SHIFT_LEFT) ++ coord_init_before_first_item(&shift-> ++ real_stop, ++ source. ++ node); ++ else ++ coord_init_after_last_item(&shift-> ++ real_stop, ++ source.node); ++ } ++ assert("nikita-2081", shift->real_stop.unit_pos + 1); ++ ++ if (shift->merging_units != want) { ++ /* we could not copy as many as we want, so, ++ there is no reason for estimating any ++ longer */ ++ return; ++ } ++ ++ target_free_space -= size; ++ coord_add_item_pos(&source, shift->pend); ++ } ++ } ++ ++ /* number of item nothing of which we want to shift */ ++ stop_item = shift->wish_stop.item_pos + shift->pend; ++ ++ /* calculate how many items can be copied into given free ++ space as whole */ ++ for (; source.item_pos != stop_item; ++ coord_add_item_pos(&source, shift->pend)) { ++ if (shift->pend == SHIFT_RIGHT) ++ source.unit_pos = coord_last_unit_pos(&source); ++ ++ /* how many units of @source do we want to copy */ ++ want = wanted_units(&source, &shift->wish_stop, shift->pend); ++ ++ if (want == coord_last_unit_pos(&source) + 1) { ++ /* we want this item to be copied entirely */ ++ size = ++ item_length_by_coord(&source) + ++ item_creation_overhead(&source); ++ if (size <= target_free_space) { ++ /* item fits into target node as whole */ ++ target_free_space -= size; ++ shift->shift_bytes += ++ size - item_creation_overhead(&source); ++ shift->entire_bytes += ++ size - item_creation_overhead(&source); ++ shift->entire++; ++ ++ /* update shift->real_stop coord to be set to ++ last unit of @source we can merge to ++ @target */ ++ shift->real_stop = source; ++ if (shift->pend == SHIFT_LEFT) ++ shift->real_stop.unit_pos = ++ coord_last_unit_pos(&shift-> ++ real_stop); ++ else ++ shift->real_stop.unit_pos = 0; ++ continue; ++ } ++ } ++ ++ /* we reach here only for an item which does not fit into ++ target node in its entirety. This item may be either ++ partially shifted, or not shifted at all. We will have to ++ create new item in target node, so decrease amout of free ++ space by an item creation overhead. 
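++   (Concretely, with hypothetical sizes: a 60-byte item facing 50 free
++   bytes in the target cannot move in its entirety; after reserving
++   sizeof(item_header40) for the new header, can_shift() is asked how
++   many of the wanted units fit in what remains.)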
We can reach here also ++ if stop coord is in this item */ ++ if (target_free_space >= ++ (unsigned)item_creation_overhead(&source)) { ++ target_free_space -= item_creation_overhead(&source); ++ iplug = item_plugin_by_coord(&source); ++ if (iplug->b.can_shift) { ++ shift->part_units = iplug->b.can_shift(target_free_space, ++ &source, ++ NULL, /* target */ ++ shift->pend, ++ &size, ++ want); ++ } else { ++ target_free_space = 0; ++ shift->part_units = 0; ++ size = 0; ++ } ++ } else { ++ target_free_space = 0; ++ shift->part_units = 0; ++ size = 0; ++ } ++ shift->part_bytes = size; ++ shift->shift_bytes += size; ++ ++ /* set @shift->real_stop to last unit of @source we can merge ++ to @shift->target */ ++ if (shift->part_units) { ++ shift->real_stop = source; ++ shift->real_stop.unit_pos = ++ (shift->part_units - source.unit_pos - ++ 1) * shift->pend; ++ assert("nikita-2082", shift->real_stop.unit_pos + 1); ++ } ++ ++ if (want != shift->part_units) ++ /* not everything wanted were shifted */ ++ return; ++ break; ++ } ++ ++ shift->everything = 1; ++} ++ ++static void ++copy_units(coord_t * target, coord_t * source, unsigned from, unsigned count, ++ shift_direction dir, unsigned free_space) ++{ ++ item_plugin *iplug; ++ ++ assert("nikita-1463", target != NULL); ++ assert("nikita-1464", source != NULL); ++ assert("nikita-1465", from + count <= coord_num_units(source)); ++ ++ iplug = item_plugin_by_coord(source); ++ assert("nikita-1468", iplug == item_plugin_by_coord(target)); ++ iplug->b.copy_units(target, source, from, count, dir, free_space); ++ ++ if (dir == SHIFT_RIGHT) { ++ /* FIXME-VS: this looks not necessary. update_item_key was ++ called already by copy_units method */ ++ reiser4_key split_key; ++ ++ assert("nikita-1469", target->unit_pos == 0); ++ ++ unit_key_by_coord(target, &split_key); ++ node_plugin_by_coord(target)->update_item_key(target, ++ &split_key, NULL); ++ } ++} ++ ++/* copy part of @shift->real_stop.node starting either from its beginning or ++ from its end and ending at @shift->real_stop to either the end or the ++ beginning of @shift->target */ ++static void copy(struct shift_params *shift) ++{ ++ node40_header *nh; ++ coord_t from; ++ coord_t to; ++ item_header40 *from_ih, *to_ih; ++ int free_space_start; ++ int new_items; ++ unsigned old_items; ++ int old_offset; ++ unsigned i; ++ ++ nh = node40_node_header(shift->target); ++ free_space_start = nh40_get_free_space_start(nh); ++ old_items = nh40_get_num_items(nh); ++ new_items = shift->entire + (shift->part_units ? 1 : 0); ++ assert("vs-185", ++ shift->shift_bytes == ++ shift->merging_bytes + shift->entire_bytes + shift->part_bytes); ++ ++ from = shift->wish_stop; ++ ++ coord_init_first_unit(&to, shift->target); ++ ++ /* NOTE:NIKITA->VS not sure what I am doing: shift->target is empty, ++ hence to.between is set to EMPTY_NODE above. Looks like we want it ++ to be AT_UNIT. ++ ++ Oh, wonders of ->betweeness... 
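++
++      A sketch of the copy performed below, for the SHIFT_LEFT branch
++      (sizes hypothetical); the SHIFT_RIGHT branch mirrors it at the
++      target's beginning:
++
++	target: [nh | old items | free ]    source: [nh | i0 i1 i2 .. ]
++
++      1) merging_units of source item 0 are appended to the target's
++	 last item;
++      2) 'entire' whole items are copied after them, their header
++	 offsets rebased from the source node onto the target's
++	 free_space_start;
++      3) part_units of the next source item become a new, partially
++	 copied item at the target's end.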
++ ++ */ ++ to.between = AT_UNIT; ++ ++ if (shift->pend == SHIFT_LEFT) { ++ /* copying to left */ ++ ++ coord_set_item_pos(&from, 0); ++ from_ih = node40_ih_at(from.node, 0); ++ ++ coord_set_item_pos(&to, ++ node40_num_of_items_internal(to.node) - 1); ++ if (shift->merging_units) { ++ /* expand last item, so that plugin methods will see ++ correct data */ ++ free_space_start += shift->merging_bytes; ++ nh40_set_free_space_start(nh, ++ (unsigned)free_space_start); ++ nh40_set_free_space(nh, ++ nh40_get_free_space(nh) - ++ shift->merging_bytes); ++ ++ /* appending last item of @target */ ++ copy_units(&to, &from, 0, /* starting from 0-th unit */ ++ shift->merging_units, SHIFT_LEFT, ++ shift->merging_bytes); ++ coord_inc_item_pos(&from); ++ from_ih--; ++ coord_inc_item_pos(&to); ++ } ++ ++ to_ih = node40_ih_at(shift->target, old_items); ++ if (shift->entire) { ++ /* copy @entire items entirely */ ++ ++ /* copy item headers */ ++ memcpy(to_ih - shift->entire + 1, ++ from_ih - shift->entire + 1, ++ shift->entire * sizeof(item_header40)); ++ /* update item header offset */ ++ old_offset = ih40_get_offset(from_ih); ++ /* AUDIT: Looks like if we calculate old_offset + free_space_start here instead of just old_offset, we can perform one "add" operation less per each iteration */ ++ for (i = 0; i < shift->entire; i++, to_ih--, from_ih--) ++ ih40_set_offset(to_ih, ++ ih40_get_offset(from_ih) - ++ old_offset + free_space_start); ++ ++ /* copy item bodies */ ++ memcpy(zdata(shift->target) + free_space_start, zdata(from.node) + old_offset, /*ih40_get_offset (from_ih), */ ++ shift->entire_bytes); ++ ++ coord_add_item_pos(&from, (int)shift->entire); ++ coord_add_item_pos(&to, (int)shift->entire); ++ } ++ ++ nh40_set_free_space_start(nh, ++ free_space_start + ++ shift->shift_bytes - ++ shift->merging_bytes); ++ nh40_set_free_space(nh, ++ nh40_get_free_space(nh) - ++ (shift->shift_bytes - shift->merging_bytes + ++ sizeof(item_header40) * new_items)); ++ ++ /* update node header */ ++ node40_set_num_items(shift->target, nh, old_items + new_items); ++ assert("vs-170", ++ nh40_get_free_space(nh) < znode_size(shift->target)); ++ ++ if (shift->part_units) { ++ /* copy heading part (@part units) of @source item as ++ a new item into @target->node */ ++ ++ /* copy item header of partially copied item */ ++ coord_set_item_pos(&to, ++ node40_num_of_items_internal(to.node) ++ - 1); ++ memcpy(to_ih, from_ih, sizeof(item_header40)); ++ ih40_set_offset(to_ih, ++ nh40_get_free_space_start(nh) - ++ shift->part_bytes); ++ if (item_plugin_by_coord(&to)->b.init) ++ item_plugin_by_coord(&to)->b.init(&to, &from, ++ NULL); ++ copy_units(&to, &from, 0, shift->part_units, SHIFT_LEFT, ++ shift->part_bytes); ++ } ++ ++ } else { ++ /* copying to right */ ++ ++ coord_set_item_pos(&from, ++ node40_num_of_items_internal(from.node) - 1); ++ from_ih = node40_ih_at_coord(&from); ++ ++ coord_set_item_pos(&to, 0); ++ ++ /* prepare space for new items */ ++ memmove(zdata(to.node) + sizeof(node40_header) + ++ shift->shift_bytes, ++ zdata(to.node) + sizeof(node40_header), ++ free_space_start - sizeof(node40_header)); ++ /* update item headers of moved items */ ++ to_ih = node40_ih_at(to.node, 0); ++ /* first item gets @merging_bytes longer. 
free space appears ++ at its beginning */ ++ if (!node_is_empty(to.node)) ++ ih40_set_offset(to_ih, ++ ih40_get_offset(to_ih) + ++ shift->shift_bytes - ++ shift->merging_bytes); ++ ++ for (i = 1; i < old_items; i++) ++ ih40_set_offset(to_ih - i, ++ ih40_get_offset(to_ih - i) + ++ shift->shift_bytes); ++ ++ /* move item headers to make space for new items */ ++ memmove(to_ih - old_items + 1 - new_items, ++ to_ih - old_items + 1, ++ sizeof(item_header40) * old_items); ++ to_ih -= (new_items - 1); ++ ++ nh40_set_free_space_start(nh, ++ free_space_start + ++ shift->shift_bytes); ++ nh40_set_free_space(nh, ++ nh40_get_free_space(nh) - ++ (shift->shift_bytes + ++ sizeof(item_header40) * new_items)); ++ ++ /* update node header */ ++ node40_set_num_items(shift->target, nh, old_items + new_items); ++ assert("vs-170", ++ nh40_get_free_space(nh) < znode_size(shift->target)); ++ ++ if (shift->merging_units) { ++ coord_add_item_pos(&to, new_items); ++ to.unit_pos = 0; ++ to.between = AT_UNIT; ++ /* prepend first item of @to */ ++ copy_units(&to, &from, ++ coord_last_unit_pos(&from) - ++ shift->merging_units + 1, ++ shift->merging_units, SHIFT_RIGHT, ++ shift->merging_bytes); ++ coord_dec_item_pos(&from); ++ from_ih++; ++ } ++ ++ if (shift->entire) { ++ /* copy @entire items entirely */ ++ ++ /* copy item headers */ ++ memcpy(to_ih, from_ih, ++ shift->entire * sizeof(item_header40)); ++ ++ /* update item header offset */ ++ old_offset = ++ ih40_get_offset(from_ih + shift->entire - 1); ++ /* AUDIT: old_offset + sizeof (node40_header) + shift->part_bytes calculation can be taken off the loop. */ ++ for (i = 0; i < shift->entire; i++, to_ih++, from_ih++) ++ ih40_set_offset(to_ih, ++ ih40_get_offset(from_ih) - ++ old_offset + ++ sizeof(node40_header) + ++ shift->part_bytes); ++ /* copy item bodies */ ++ coord_add_item_pos(&from, -(int)(shift->entire - 1)); ++ memcpy(zdata(to.node) + sizeof(node40_header) + ++ shift->part_bytes, item_by_coord_node40(&from), ++ shift->entire_bytes); ++ coord_dec_item_pos(&from); ++ } ++ ++ if (shift->part_units) { ++ coord_set_item_pos(&to, 0); ++ to.unit_pos = 0; ++ to.between = AT_UNIT; ++ /* copy heading part (@part units) of @source item as ++ a new item into @target->node */ ++ ++ /* copy item header of partially copied item */ ++ memcpy(to_ih, from_ih, sizeof(item_header40)); ++ ih40_set_offset(to_ih, sizeof(node40_header)); ++ if (item_plugin_by_coord(&to)->b.init) ++ item_plugin_by_coord(&to)->b.init(&to, &from, ++ NULL); ++ copy_units(&to, &from, ++ coord_last_unit_pos(&from) - ++ shift->part_units + 1, shift->part_units, ++ SHIFT_RIGHT, shift->part_bytes); ++ } ++ } ++} ++ ++/* remove everything either before or after @fact_stop. 
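++   (What follows reuses cut_node40(): after a successful shift, the
++   units that were copied out - from the first unit of the source node
++   up to real_stop for a left shift, or from real_stop to the last
++   unit for a right shift - are cut away with NULL key bounds.
++   @fact_stop appears to be an older name for shift->real_stop.)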
Number of items ++ removed completely is returned */ ++static int delete_copied(struct shift_params *shift) ++{ ++ coord_t from; ++ coord_t to; ++ struct carry_cut_data cdata; ++ ++ if (shift->pend == SHIFT_LEFT) { ++ /* we were shifting to left, remove everything from the ++ beginning of @shift->wish_stop->node upto ++ @shift->wish_stop */ ++ coord_init_first_unit(&from, shift->real_stop.node); ++ to = shift->real_stop; ++ ++ /* store old coordinate of unit which will be first after ++ shift to left */ ++ shift->u.future_first = to; ++ coord_next_unit(&shift->u.future_first); ++ } else { ++ /* we were shifting to right, remove everything from ++ @shift->stop_coord upto to end of ++ @shift->stop_coord->node */ ++ from = shift->real_stop; ++ coord_init_last_unit(&to, from.node); ++ ++ /* store old coordinate of unit which will be last after ++ shift to right */ ++ shift->u.future_last = from; ++ coord_prev_unit(&shift->u.future_last); ++ } ++ ++ cdata.params.from = &from; ++ cdata.params.to = &to; ++ cdata.params.from_key = NULL; ++ cdata.params.to_key = NULL; ++ cdata.params.smallest_removed = NULL; ++ return cut_node40(&cdata, NULL); ++} ++ ++/* something was moved between @left and @right. Add carry operation to @info ++ list to have carry to update delimiting key between them */ ++static int ++prepare_for_update(znode * left, znode * right, carry_plugin_info * info) ++{ ++ carry_op *op; ++ carry_node *cn; ++ ++ if (info == NULL) ++ /* nowhere to send operation to. */ ++ return 0; ++ ++ if (!should_notify_parent(right)) ++ return 0; ++ ++ op = node_post_carry(info, COP_UPDATE, right, 1); ++ if (IS_ERR(op) || op == NULL) ++ return op ? PTR_ERR(op) : -EIO; ++ ++ if (left != NULL) { ++ carry_node *reference; ++ ++ if (info->doing) ++ reference = insert_carry_node(info->doing, ++ info->todo, left); ++ else ++ reference = op->node; ++ assert("nikita-2992", reference != NULL); ++ cn = add_carry(info->todo, POOLO_BEFORE, reference); ++ if (IS_ERR(cn)) ++ return PTR_ERR(cn); ++ cn->parent = 1; ++ cn->node = left; ++ if (ZF_ISSET(left, JNODE_ORPHAN)) ++ cn->left_before = 1; ++ op->u.update.left = cn; ++ } else ++ op->u.update.left = NULL; ++ return 0; ++} ++ ++/* plugin->u.node.prepare_removal ++ to delete a pointer to @empty from the tree add corresponding carry ++ operation (delete) to @info list */ ++int prepare_removal_node40(znode * empty, carry_plugin_info * info) ++{ ++ carry_op *op; ++ reiser4_tree *tree; ++ ++ if (!should_notify_parent(empty)) ++ return 0; ++ /* already on a road to Styx */ ++ if (ZF_ISSET(empty, JNODE_HEARD_BANSHEE)) ++ return 0; ++ op = node_post_carry(info, COP_DELETE, empty, 1); ++ if (IS_ERR(op) || op == NULL) ++ return RETERR(op ? 
PTR_ERR(op) : -EIO); ++ ++ op->u.delete.child = NULL; ++ op->u.delete.flags = 0; ++ ++ /* fare thee well */ ++ tree = znode_get_tree(empty); ++ read_lock_tree(tree); ++ write_lock_dk(tree); ++ znode_set_ld_key(empty, znode_get_rd_key(empty)); ++ if (znode_is_left_connected(empty) && empty->left) ++ znode_set_rd_key(empty->left, znode_get_rd_key(empty)); ++ write_unlock_dk(tree); ++ read_unlock_tree(tree); ++ ++ ZF_SET(empty, JNODE_HEARD_BANSHEE); ++ return 0; ++} ++ ++/* something was shifted from @insert_coord->node to @shift->target, update ++ @insert_coord correspondingly */ ++static void ++adjust_coord(coord_t * insert_coord, struct shift_params *shift, int removed, ++ int including_insert_coord) ++{ ++ /* item plugin was invalidated by shifting */ ++ coord_clear_iplug(insert_coord); ++ ++ if (node_is_empty(shift->wish_stop.node)) { ++ assert("vs-242", shift->everything); ++ if (including_insert_coord) { ++ if (shift->pend == SHIFT_RIGHT) { ++ /* set @insert_coord before first unit of ++ @shift->target node */ ++ coord_init_before_first_item(insert_coord, ++ shift->target); ++ } else { ++ /* set @insert_coord after the last item in target node */ ++ coord_init_after_last_item(insert_coord, ++ shift->target); ++ } ++ } else { ++ /* set @insert_coord inside of empty node. There is ++ only one possible coord within an empty ++ node. init_first_unit will set that coord */ ++ coord_init_first_unit(insert_coord, ++ shift->wish_stop.node); ++ } ++ return; ++ } ++ ++ if (shift->pend == SHIFT_RIGHT) { ++ /* there was shifting to right */ ++ if (shift->everything) { ++ /* everything wanted was shifted */ ++ if (including_insert_coord) { ++ /* @insert_coord is set before first unit of ++ @to node */ ++ coord_init_before_first_item(insert_coord, ++ shift->target); ++ insert_coord->between = BEFORE_UNIT; ++ } else { ++ /* @insert_coord is set after last unit of ++ @insert->node */ ++ coord_init_last_unit(insert_coord, ++ shift->wish_stop.node); ++ insert_coord->between = AFTER_UNIT; ++ } ++ } ++ return; ++ } ++ ++ /* there was shifting to left */ ++ if (shift->everything) { ++ /* everything wanted was shifted */ ++ if (including_insert_coord) { ++ /* @insert_coord is set after last unit in @to node */ ++ coord_init_after_last_item(insert_coord, shift->target); ++ } else { ++ /* @insert_coord is set before first unit in the same ++ node */ ++ coord_init_before_first_item(insert_coord, ++ shift->wish_stop.node); ++ } ++ return; ++ } ++ ++ /* FIXME-VS: the code below is complicated because with between == ++ AFTER_ITEM unit_pos is set to 0 */ ++ ++ if (!removed) { ++ /* no items were shifted entirely */ ++ assert("vs-195", shift->merging_units == 0 ++ || shift->part_units == 0); ++ ++ if (shift->real_stop.item_pos == insert_coord->item_pos) { ++ if (shift->merging_units) { ++ if (insert_coord->between == AFTER_UNIT) { ++ assert("nikita-1441", ++ insert_coord->unit_pos >= ++ shift->merging_units); ++ insert_coord->unit_pos -= ++ shift->merging_units; ++ } else if (insert_coord->between == BEFORE_UNIT) { ++ assert("nikita-2090", ++ insert_coord->unit_pos > ++ shift->merging_units); ++ insert_coord->unit_pos -= ++ shift->merging_units; ++ } ++ ++ assert("nikita-2083", ++ insert_coord->unit_pos + 1); ++ } else { ++ if (insert_coord->between == AFTER_UNIT) { ++ assert("nikita-1442", ++ insert_coord->unit_pos >= ++ shift->part_units); ++ insert_coord->unit_pos -= ++ shift->part_units; ++ } else if (insert_coord->between == BEFORE_UNIT) { ++ assert("nikita-2089", ++ insert_coord->unit_pos > ++ shift->part_units); ++
insert_coord->unit_pos -= ++ shift->part_units; ++ } ++ ++ assert("nikita-2084", ++ insert_coord->unit_pos + 1); ++ } ++ } ++ return; ++ } ++ ++ /* we shifted to left and there was not enough space for everything */ ++ switch (insert_coord->between) { ++ case AFTER_UNIT: ++ case BEFORE_UNIT: ++ if (shift->real_stop.item_pos == insert_coord->item_pos) ++ insert_coord->unit_pos -= shift->part_units; ++ /* fall through: item_pos must be adjusted as well */ ++ case AFTER_ITEM: ++ coord_add_item_pos(insert_coord, -removed); ++ break; ++ default: ++ impossible("nikita-2087", "not ready"); ++ } ++ assert("nikita-2085", insert_coord->unit_pos + 1); ++} ++ ++static int call_shift_hooks(struct shift_params *shift) ++{ ++ unsigned i, shifted; ++ coord_t coord; ++ item_plugin *iplug; ++ ++ assert("vs-275", !node_is_empty(shift->target)); ++ ++ /* number of items shift touches */ ++ shifted = ++ shift->entire + (shift->merging_units ? 1 : 0) + ++ (shift->part_units ? 1 : 0); ++ ++ if (shift->pend == SHIFT_LEFT) { ++ /* moved items are at the end */ ++ coord_init_last_unit(&coord, shift->target); ++ coord.unit_pos = 0; ++ ++ assert("vs-279", shift->pend == 1); ++ for (i = 0; i < shifted; i++) { ++ unsigned from, count; ++ ++ iplug = item_plugin_by_coord(&coord); ++ if (i == 0 && shift->part_units) { ++ assert("vs-277", ++ coord_num_units(&coord) == ++ shift->part_units); ++ count = shift->part_units; ++ from = 0; ++ } else if (i == shifted - 1 && shift->merging_units) { ++ count = shift->merging_units; ++ from = coord_num_units(&coord) - count; ++ } else { ++ count = coord_num_units(&coord); ++ from = 0; ++ } ++ ++ if (iplug->b.shift_hook) { ++ iplug->b.shift_hook(&coord, from, count, ++ shift->wish_stop.node); ++ } ++ coord_add_item_pos(&coord, -shift->pend); ++ } ++ } else { ++ /* moved items are at the beginning */ ++ coord_init_first_unit(&coord, shift->target); ++ ++ assert("vs-278", shift->pend == -1); ++ for (i = 0; i < shifted; i++) { ++ unsigned from, count; ++ ++ iplug = item_plugin_by_coord(&coord); ++ if (i == 0 && shift->part_units) { ++ assert("vs-277", ++ coord_num_units(&coord) == ++ shift->part_units); ++ count = coord_num_units(&coord); ++ from = 0; ++ } else if (i == shifted - 1 && shift->merging_units) { ++ count = shift->merging_units; ++ from = 0; ++ } else { ++ count = coord_num_units(&coord); ++ from = 0; ++ } ++ ++ if (iplug->b.shift_hook) { ++ iplug->b.shift_hook(&coord, from, count, ++ shift->wish_stop.node); ++ } ++ coord_add_item_pos(&coord, -shift->pend); ++ } ++ } ++ ++ return 0; ++} ++ ++/* shift to left is completed. Return 1 if unit @old was moved to left neighbor */ ++static int ++unit_moved_left(const struct shift_params *shift, const coord_t * old) ++{ ++ assert("vs-944", shift->real_stop.node == old->node); ++ ++ if (shift->real_stop.item_pos < old->item_pos) ++ return 0; ++ if (shift->real_stop.item_pos == old->item_pos) { ++ if (shift->real_stop.unit_pos < old->unit_pos) ++ return 0; ++ } ++ return 1; ++} ++ ++/* shift to right is completed. Return 1 if unit @old was moved to right ++ neighbor */ ++static int ++unit_moved_right(const struct shift_params *shift, const coord_t * old) ++{ ++ assert("vs-944", shift->real_stop.node == old->node); ++ ++ if (shift->real_stop.item_pos > old->item_pos) ++ return 0; ++ if (shift->real_stop.item_pos == old->item_pos) { ++ if (shift->real_stop.unit_pos > old->unit_pos) ++ return 0; ++ } ++ return 1; ++} ++ ++/* coord @old was set in node from which shift was performed. What was shifted ++ is stored in @shift.
Update @old correspondingly to performed shift */ ++static coord_t *adjust_coord2(const struct shift_params *shift, ++ const coord_t * old, coord_t * new) ++{ ++ coord_clear_iplug(new); ++ new->between = old->between; ++ ++ coord_clear_iplug(new); ++ if (old->node == shift->target) { ++ if (shift->pend == SHIFT_LEFT) { ++ /* coord which is set inside of left neighbor does not ++ change during shift to left */ ++ coord_dup(new, old); ++ return new; ++ } ++ new->node = old->node; ++ coord_set_item_pos(new, ++ old->item_pos + shift->entire + ++ (shift->part_units ? 1 : 0)); ++ new->unit_pos = old->unit_pos; ++ if (old->item_pos == 0 && shift->merging_units) ++ new->unit_pos += shift->merging_units; ++ return new; ++ } ++ ++ assert("vs-977", old->node == shift->wish_stop.node); ++ if (shift->pend == SHIFT_LEFT) { ++ if (unit_moved_left(shift, old)) { ++ /* unit @old moved to left neighbor. Calculate its ++ coordinate there */ ++ new->node = shift->target; ++ coord_set_item_pos(new, ++ node_num_items(shift->target) - ++ shift->entire - ++ (shift->part_units ? 1 : 0) + ++ old->item_pos); ++ ++ new->unit_pos = old->unit_pos; ++ if (shift->merging_units) { ++ coord_dec_item_pos(new); ++ if (old->item_pos == 0) { ++ /* unit_pos only changes if item got ++ merged */ ++ new->unit_pos = ++ coord_num_units(new) - ++ (shift->merging_units - ++ old->unit_pos); ++ } ++ } ++ } else { ++ /* unit @old did not move to left neighbor. ++ ++ Use _nocheck, because @old is outside of its node. ++ */ ++ coord_dup_nocheck(new, old); ++ coord_add_item_pos(new, ++ -shift->u.future_first.item_pos); ++ if (new->item_pos == 0) ++ new->unit_pos -= shift->u.future_first.unit_pos; ++ } ++ } else { ++ if (unit_moved_right(shift, old)) { ++ /* unit @old moved to right neighbor */ ++ new->node = shift->target; ++ coord_set_item_pos(new, ++ old->item_pos - ++ shift->real_stop.item_pos); ++ if (new->item_pos == 0) { ++ /* unit @old might change unit pos */ ++ coord_set_item_pos(new, ++ old->unit_pos - ++ shift->real_stop.unit_pos); ++ } ++ } else { ++ /* unit @old did not move to right neighbor, therefore ++ it did not change */ ++ coord_dup(new, old); ++ } ++ } ++ coord_set_iplug(new, item_plugin_by_coord(new)); ++ return new; ++} ++ ++/* this is called when shift is completed (something of source node is copied ++ to target and deleted in source) to update all taps set in current ++ context */ ++static void update_taps(const struct shift_params *shift) ++{ ++ tap_t *tap; ++ coord_t new; ++ ++ for_all_taps(tap) { ++ /* update only taps set to nodes participating in shift */ ++ if (tap->coord->node == shift->wish_stop.node ++ || tap->coord->node == shift->target) ++ tap_to_coord(tap, ++ adjust_coord2(shift, tap->coord, &new)); ++ } ++} ++ ++#if REISER4_DEBUG ++ ++struct shift_check { ++ reiser4_key key; ++ __u16 plugin_id; ++ union { ++ __u64 bytes; ++ __u64 entries; ++ void *unused; ++ } u; ++}; ++ ++void *shift_check_prepare(const znode * left, const znode * right) ++{ ++ pos_in_node_t i, nr_items; ++ int mergeable; ++ struct shift_check *data; ++ item_header40 *ih; ++ ++ if (node_is_empty(left) || node_is_empty(right)) ++ mergeable = 0; ++ else { ++ coord_t l, r; ++ ++ coord_init_last_unit(&l, left); ++ coord_init_first_unit(&r, right); ++ mergeable = are_items_mergeable(&l, &r); ++ } ++ nr_items = ++ node40_num_of_items_internal(left) + ++ node40_num_of_items_internal(right) - (mergeable ? 
1 : 0); ++ data = ++ kmalloc(sizeof(struct shift_check) * nr_items, get_gfp_mask()); ++ if (data != NULL) { ++ coord_t coord; ++ pos_in_node_t item_pos; ++ ++ coord_init_first_unit(&coord, left); ++ i = 0; ++ ++ for (item_pos = 0; ++ item_pos < node40_num_of_items_internal(left); ++ item_pos++) { ++ ++ coord_set_item_pos(&coord, item_pos); ++ ih = node40_ih_at_coord(&coord); ++ ++ data[i].key = ih->key; ++ data[i].plugin_id = le16_to_cpu(get_unaligned(&ih->plugin_id)); ++ switch (data[i].plugin_id) { ++ case CTAIL_ID: ++ case FORMATTING_ID: ++ data[i].u.bytes = coord_num_units(&coord); ++ break; ++ case EXTENT_POINTER_ID: ++ data[i].u.bytes = ++ extent_size(&coord, ++ coord_num_units(&coord)); ++ break; ++ case COMPOUND_DIR_ID: ++ data[i].u.entries = coord_num_units(&coord); ++ break; ++ default: ++ data[i].u.unused = NULL; ++ break; ++ } ++ i++; ++ } ++ ++ coord_init_first_unit(&coord, right); ++ ++ if (mergeable) { ++ assert("vs-1609", i != 0); ++ ++ ih = node40_ih_at_coord(&coord); ++ ++ assert("vs-1589", ++ data[i - 1].plugin_id == ++ le16_to_cpu(get_unaligned(&ih->plugin_id))); ++ switch (data[i - 1].plugin_id) { ++ case CTAIL_ID: ++ case FORMATTING_ID: ++ data[i - 1].u.bytes += coord_num_units(&coord); ++ break; ++ case EXTENT_POINTER_ID: ++ data[i - 1].u.bytes += ++ extent_size(&coord, ++ coord_num_units(&coord)); ++ break; ++ case COMPOUND_DIR_ID: ++ data[i - 1].u.entries += ++ coord_num_units(&coord); ++ break; ++ default: ++ impossible("vs-1605", "wrong mergeable item"); ++ break; ++ } ++ item_pos = 1; ++ } else ++ item_pos = 0; ++ for (; item_pos < node40_num_of_items_internal(right); ++ item_pos++) { ++ ++ assert("vs-1604", i < nr_items); ++ coord_set_item_pos(&coord, item_pos); ++ ih = node40_ih_at_coord(&coord); ++ ++ data[i].key = ih->key; ++ data[i].plugin_id = le16_to_cpu(get_unaligned(&ih->plugin_id)); ++ switch (data[i].plugin_id) { ++ case CTAIL_ID: ++ case FORMATTING_ID: ++ data[i].u.bytes = coord_num_units(&coord); ++ break; ++ case EXTENT_POINTER_ID: ++ data[i].u.bytes = ++ extent_size(&coord, ++ coord_num_units(&coord)); ++ break; ++ case COMPOUND_DIR_ID: ++ data[i].u.entries = coord_num_units(&coord); ++ break; ++ default: ++ data[i].u.unused = NULL; ++ break; ++ } ++ i++; ++ } ++ assert("vs-1606", i == nr_items); ++ } ++ return data; ++} ++ ++void shift_check(void *vp, const znode * left, const znode * right) ++{ ++ pos_in_node_t i, nr_items; ++ coord_t coord; ++ __u64 last_bytes; ++ int mergeable; ++ item_header40 *ih; ++ pos_in_node_t item_pos; ++ struct shift_check *data; ++ ++ data = (struct shift_check *)vp; ++ ++ if (data == NULL) ++ return; ++ ++ if (node_is_empty(left) || node_is_empty(right)) ++ mergeable = 0; ++ else { ++ coord_t l, r; ++ ++ coord_init_last_unit(&l, left); ++ coord_init_first_unit(&r, right); ++ mergeable = are_items_mergeable(&l, &r); ++ } ++ ++ nr_items = ++ node40_num_of_items_internal(left) + ++ node40_num_of_items_internal(right) - (mergeable ? 
1 : 0); ++ ++ i = 0; ++ last_bytes = 0; ++ ++ coord_init_first_unit(&coord, left); ++ ++ for (item_pos = 0; item_pos < node40_num_of_items_internal(left); ++ item_pos++) { ++ ++ coord_set_item_pos(&coord, item_pos); ++ ih = node40_ih_at_coord(&coord); ++ ++ assert("vs-1611", i == item_pos); ++ assert("vs-1590", keyeq(&ih->key, &data[i].key)); ++ assert("vs-1591", ++ le16_to_cpu(get_unaligned(&ih->plugin_id)) == data[i].plugin_id); ++ if ((i < (node40_num_of_items_internal(left) - 1)) ++ || !mergeable) { ++ switch (data[i].plugin_id) { ++ case CTAIL_ID: ++ case FORMATTING_ID: ++ assert("vs-1592", ++ data[i].u.bytes == ++ coord_num_units(&coord)); ++ break; ++ case EXTENT_POINTER_ID: ++ assert("vs-1593", ++ data[i].u.bytes == extent_size(&coord, ++ coord_num_units ++ (&coord))); ++ break; ++ case COMPOUND_DIR_ID: ++ assert("vs-1594", ++ data[i].u.entries == ++ coord_num_units(&coord)); ++ break; ++ default: ++ break; ++ } ++ } ++ if (item_pos == (node40_num_of_items_internal(left) - 1) ++ && mergeable) { ++ switch (data[i].plugin_id) { ++ case CTAIL_ID: ++ case FORMATTING_ID: ++ last_bytes = coord_num_units(&coord); ++ break; ++ case EXTENT_POINTER_ID: ++ last_bytes = ++ extent_size(&coord, ++ coord_num_units(&coord)); ++ break; ++ case COMPOUND_DIR_ID: ++ last_bytes = coord_num_units(&coord); ++ break; ++ default: ++ impossible("vs-1595", "wrong mergeable item"); ++ break; ++ } ++ } ++ i++; ++ } ++ ++ coord_init_first_unit(&coord, right); ++ if (mergeable) { ++ ih = node40_ih_at_coord(&coord); ++ ++ assert("vs-1589", ++ data[i - 1].plugin_id == le16_to_cpu(get_unaligned(&ih->plugin_id))); ++ assert("vs-1608", last_bytes != 0); ++ switch (data[i - 1].plugin_id) { ++ case CTAIL_ID: ++ case FORMATTING_ID: ++ assert("vs-1596", ++ data[i - 1].u.bytes == ++ last_bytes + coord_num_units(&coord)); ++ break; ++ ++ case EXTENT_POINTER_ID: ++ assert("vs-1597", ++ data[i - 1].u.bytes == ++ last_bytes + extent_size(&coord, ++ coord_num_units ++ (&coord))); ++ break; ++ ++ case COMPOUND_DIR_ID: ++ assert("vs-1598", ++ data[i - 1].u.bytes == ++ last_bytes + coord_num_units(&coord)); ++ break; ++ default: ++ impossible("vs-1599", "wrong mergeable item"); ++ break; ++ } ++ item_pos = 1; ++ } else ++ item_pos = 0; ++ ++ for (; item_pos < node40_num_of_items_internal(right); item_pos++) { ++ ++ coord_set_item_pos(&coord, item_pos); ++ ih = node40_ih_at_coord(&coord); ++ ++ assert("vs-1612", keyeq(&ih->key, &data[i].key)); ++ assert("vs-1613", ++ le16_to_cpu(get_unaligned(&ih->plugin_id)) == data[i].plugin_id); ++ switch (data[i].plugin_id) { ++ case CTAIL_ID: ++ case FORMATTING_ID: ++ assert("vs-1600", ++ data[i].u.bytes == coord_num_units(&coord)); ++ break; ++ case EXTENT_POINTER_ID: ++ assert("vs-1601", ++ data[i].u.bytes == extent_size(&coord, ++ coord_num_units ++ (&coord))); ++ break; ++ case COMPOUND_DIR_ID: ++ assert("vs-1602", ++ data[i].u.entries == coord_num_units(&coord)); ++ break; ++ default: ++ break; ++ } ++ i++; ++ } ++ ++ assert("vs-1603", i == nr_items); ++ kfree(data); ++} ++ ++#endif ++ ++/* plugin->u.node.shift ++ look for description of this method in plugin/node/node.h */ ++int shift_node40(coord_t * from, znode * to, shift_direction pend, int delete_child, /* if @from->node becomes empty - it will be ++ deleted from the tree if this is set to 1 */ ++ int including_stop_coord, carry_plugin_info * info) ++{ ++ struct shift_params shift; ++ int result; ++ znode *left, *right; ++ znode *source; ++ int target_empty; ++ ++ assert("nikita-2161", coord_check(from)); ++ ++ memset(&shift, 0, 
sizeof(shift)); ++ shift.pend = pend; ++ shift.wish_stop = *from; ++ shift.target = to; ++ ++ assert("nikita-1473", znode_is_write_locked(from->node)); ++ assert("nikita-1474", znode_is_write_locked(to)); ++ ++ source = from->node; ++ ++ /* set @shift.wish_stop to rightmost/leftmost unit among units we want ++ shifted */ ++ if (pend == SHIFT_LEFT) { ++ result = coord_set_to_left(&shift.wish_stop); ++ left = to; ++ right = from->node; ++ } else { ++ result = coord_set_to_right(&shift.wish_stop); ++ left = from->node; ++ right = to; ++ } ++ ++ if (result) { ++ /* move insertion coord even if there is nothing to move */ ++ if (including_stop_coord) { ++ /* move insertion coord (@from) */ ++ if (pend == SHIFT_LEFT) { ++ /* after last item in target node */ ++ coord_init_after_last_item(from, to); ++ } else { ++ /* before first item in target node */ ++ coord_init_before_first_item(from, to); ++ } ++ } ++ ++ if (delete_child && node_is_empty(shift.wish_stop.node)) ++ result = ++ prepare_removal_node40(shift.wish_stop.node, info); ++ else ++ result = 0; ++ /* there is nothing to shift */ ++ assert("nikita-2078", coord_check(from)); ++ return result; ++ } ++ ++ target_empty = node_is_empty(to); ++ ++ /* when first node plugin with item body compression is implemented, ++ this must be changed to call node specific plugin */ ++ ++ /* shift->stop_coord is updated to last unit which really will be ++ shifted */ ++ estimate_shift(&shift, get_current_context()); ++ if (!shift.shift_bytes) { ++ /* we could not shift anything */ ++ assert("nikita-2079", coord_check(from)); ++ return 0; ++ } ++ ++ copy(&shift); ++ ++ /* result value of this is important. It is used by adjust_coord below */ ++ result = delete_copied(&shift); ++ ++ assert("vs-1610", result >= 0); ++ assert("vs-1471", ++ ((reiser4_context *) current->journal_info)->magic == ++ context_magic); ++ ++ /* item which has been moved from one node to another might want to do ++ something on that event. This can be done by item's shift_hook ++ method, which will be now called for every moved items */ ++ call_shift_hooks(&shift); ++ ++ assert("vs-1472", ++ ((reiser4_context *) current->journal_info)->magic == ++ context_magic); ++ ++ update_taps(&shift); ++ ++ assert("vs-1473", ++ ((reiser4_context *) current->journal_info)->magic == ++ context_magic); ++ ++ /* adjust @from pointer in accordance with @including_stop_coord flag ++ and amount of data which was really shifted */ ++ adjust_coord(from, &shift, result, including_stop_coord); ++ ++ if (target_empty) ++ /* ++ * items were shifted into empty node. Update delimiting key. ++ */ ++ result = prepare_for_update(NULL, left, info); ++ ++ /* add update operation to @info, which is the list of operations to ++ be performed on a higher level */ ++ result = prepare_for_update(left, right, info); ++ if (!result && node_is_empty(source) && delete_child) { ++ /* all contents of @from->node is moved to @to and @from->node ++ has to be removed from the tree, so, on higher level we ++ will be removing the pointer to node @from->node */ ++ result = prepare_removal_node40(source, info); ++ } ++ assert("nikita-2080", coord_check(from)); ++ return result ? 
result : (int)shift.shift_bytes; ++} ++ ++/* plugin->u.node.fast_insert() ++ look for description of this method in plugin/node/node.h */ ++int fast_insert_node40(const coord_t * coord UNUSED_ARG /* node to query */ ) ++{ ++ return 1; ++} ++ ++/* plugin->u.node.fast_paste() ++ look for description of this method in plugin/node/node.h */ ++int fast_paste_node40(const coord_t * coord UNUSED_ARG /* node to query */ ) ++{ ++ return 1; ++} ++ ++/* plugin->u.node.fast_cut() ++ look for description of this method in plugin/node/node.h */ ++int fast_cut_node40(const coord_t * coord UNUSED_ARG /* node to query */ ) ++{ ++ return 1; ++} ++ ++/* plugin->u.node.modify - not defined */ ++ ++/* plugin->u.node.max_item_size */ ++int max_item_size_node40(void) ++{ ++ return reiser4_get_current_sb()->s_blocksize - sizeof(node40_header) - ++ sizeof(item_header40); ++} ++ ++/* plugin->u.node.set_item_plugin */ ++int set_item_plugin_node40(coord_t *coord, item_id id) ++{ ++ item_header40 *ih; ++ ++ ih = node40_ih_at_coord(coord); ++ put_unaligned(cpu_to_le16(id), &ih->plugin_id); ++ coord->iplugid = id; ++ return 0; ++} ++ ++/* ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ scroll-step: 1 ++ End: ++*/ +Index: linux-2.6.16/fs/reiser4/plugin/node/node40.h +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/plugin/node/node40.h +@@ -0,0 +1,125 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++#if !defined( __REISER4_NODE40_H__ ) ++#define __REISER4_NODE40_H__ ++ ++#include "../../forward.h" ++#include "../../dformat.h" ++#include "node.h" ++ ++#include <linux/types.h> ++ ++/* format of node header for 40 node layouts. Keep bloat out of this struct. */ ++typedef struct node40_header { ++ /* identifier of node plugin. Must be located at the very beginning ++ of a node. */ ++ common_node_header common_header; /* this is 16 bits */ ++ /* number of items. Should be first element in the node header, ++ because we haven't yet finally decided whether it shouldn't go into ++ common_header. ++ */ ++/* NIKITA-FIXME-HANS: Create a macro such that if there is only one ++ * node format at compile time, and it is this one, accesses do not function dereference when ++ * accessing these fields (and otherwise they do). Probably 80% of users will only have one node format at a time throughout the life of reiser4. */ ++ d16 nr_items; ++ /* free space in node measured in bytes */ ++ d16 free_space; ++ /* offset to start of free space in node */ ++ d16 free_space_start; ++ /* for reiser4_fsck. When information about what is a free ++ block is corrupted, and we try to recover everything even ++ if marked as freed, then old versions of data may ++ duplicate newer versions, and this field allows us to ++ restore the newer version. Also useful for when users ++ who don't have the new trashcan installed on their linux distro ++ delete the wrong files and send us desperate emails ++ offering $25 for them back. */ ++ ++ /* magic field we need to tell formatted nodes NIKITA-FIXME-HANS: improve this comment */ ++ d32 magic; ++ /* flushstamp is made of mk_id and write_counter. mk_id is an ++ id generated randomly at mkreiserfs time. So we can just ++ skip all nodes with different mk_id. write_counter is d64 ++ incrementing counter of writes on disk. It is used for ++ choosing the newest data at fsck time. NIKITA-FIXME-HANS: why was field name changed but not comment?
*/ ++ ++ d32 mkfs_id; ++ d64 flush_id; ++ /* node flags to be used by fsck (reiser4ck or reiser4fsck?) ++ and repacker NIKITA-FIXME-HANS: say more or reference elsewhere that says more */ ++ d16 flags; ++ ++ /* 1 is leaf level, 2 is twig level, root is the numerically ++ largest level */ ++ d8 level; ++ ++ d8 pad; ++} PACKED node40_header; ++ ++/* item headers are not standard across all node layouts, pass ++ pos_in_node to functions instead */ ++typedef struct item_header40 { ++ /* key of item */ ++ /* 0 */ reiser4_key key; ++ /* offset from start of a node measured in 8-byte chunks */ ++ /* 24 */ d16 offset; ++ /* 26 */ d16 flags; ++ /* 28 */ d16 plugin_id; ++} PACKED item_header40; ++ ++size_t item_overhead_node40(const znode * node, flow_t * aflow); ++size_t free_space_node40(znode * node); ++node_search_result lookup_node40(znode * node, const reiser4_key * key, ++ lookup_bias bias, coord_t * coord); ++int num_of_items_node40(const znode * node); ++char *item_by_coord_node40(const coord_t * coord); ++int length_by_coord_node40(const coord_t * coord); ++item_plugin *plugin_by_coord_node40(const coord_t * coord); ++reiser4_key *key_at_node40(const coord_t * coord, reiser4_key * key); ++size_t estimate_node40(znode * node); ++int check_node40(const znode * node, __u32 flags, const char **error); ++int parse_node40(znode * node); ++int init_node40(znode * node); ++#ifdef GUESS_EXISTS ++int guess_node40(const znode * node); ++#endif ++void change_item_size_node40(coord_t * coord, int by); ++int create_item_node40(coord_t * target, const reiser4_key * key, ++ reiser4_item_data * data, carry_plugin_info * info); ++void update_item_key_node40(coord_t * target, const reiser4_key * key, ++ carry_plugin_info * info); ++int kill_node40(struct carry_kill_data *, carry_plugin_info *); ++int cut_node40(struct carry_cut_data *, carry_plugin_info *); ++int shift_node40(coord_t * from, znode * to, shift_direction pend, ++ /* if @from->node becomes ++ empty - it will be deleted from ++ the tree if this is set to 1 ++ */ ++ int delete_child, int including_stop_coord, ++ carry_plugin_info * info); ++ ++int fast_insert_node40(const coord_t * coord); ++int fast_paste_node40(const coord_t * coord); ++int fast_cut_node40(const coord_t * coord); ++int max_item_size_node40(void); ++int prepare_removal_node40(znode * empty, carry_plugin_info * info); ++int set_item_plugin_node40(coord_t * coord, item_id id); ++int shrink_item_node40(coord_t * coord, int delta); ++ ++#if REISER4_DEBUG ++void *shift_check_prepare(const znode *left, const znode *right); ++void shift_check(void *vp, const znode *left, const znode *right); ++#endif ++ ++/* __REISER4_NODE40_H__ */ ++#endif ++/* ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ scroll-step: 1 ++ End: ++*/ +Index: linux-2.6.16/fs/reiser4/plugin/object.c +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/plugin/object.c +@@ -0,0 +1,501 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++/* ++ * Examples of object plugins: file, directory, symlink, special file. ++ * ++ * Plugins associated with inode: ++ * ++ * Plugin of inode is plugin referenced by plugin-id field of on-disk ++ * stat-data. How we store this plugin in in-core inode is not ++ * important. Currently pointers are used, another variant is to store offsets ++ * and do array lookup on each access. 
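To make the pointer-versus-offset trade-off above concrete, here is a minimal sketch of the "array lookup on each access" variant (the helper name is hypothetical; the patch itself caches plugin pointers in the in-core inode):

        /* resolve a file-plugin id, as stored in stat-data, to its entry in
           the file_plugins[] table defined in plugin/object.c; real callers
           must validate the id first, cf. is_plugin_id_valid() in plugin.c */
        static inline file_plugin *fplug_by_id_sketch(reiser4_plugin_id id)
        {
                return (id < LAST_FILE_PLUGIN_ID) ? &file_plugins[id] : NULL;
        }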
++ * ++ * Now, each inode has one selected plugin: object plugin that ++ * determines what type of file this object is: directory, regular, etc. ++ * ++ * This main plugin can use other plugins that are thus subordinated to ++ * it. Directory instance of object plugin uses hash; regular file ++ * instance uses tail policy plugin. ++ * ++ * Object plugin is either taken from id in stat-data or guessed from ++ * i_mode bits. Once it is established we ask it to install its ++ * subordinate plugins, by looking again in stat-data or inheriting them ++ * from parent. ++ * ++ * How new inode is initialized during ->read_inode(): ++ * 1 read stat-data and initialize inode fields: i_size, i_mode, ++ * i_generation, capabilities etc. ++ * 2 read plugin id from stat data or try to guess plugin id ++ * from inode->i_mode bits if plugin id is missing. ++ * 3 Call ->init_inode() method of stat-data plugin to initialise inode fields. ++ * ++ * NIKITA-FIXME-HANS: can you say a little about 1 being done before 3? What ++ * if stat data does contain i_size, etc., due to it being an unusual plugin? ++ * ++ * 4 Call ->activate() method of object's plugin. Plugin is either read ++ * from stat-data or guessed from mode bits. ++ * 5 Call ->inherit() method of object plugin to inherit as yet uninitialized ++ * plugins from parent. ++ * ++ * Easy induction proves that on the last step all plugins of inode will be ++ * initialized. ++ * ++ * When creating new object: ++ * 1 obtain object plugin id (see next period) ++ * NIKITA-FIXME-HANS: period? ++ * 2 ->install() this plugin ++ * 3 ->inherit() the rest from the parent ++ * ++ * We need some examples of creating an object with default and non-default ++ * plugin ids. Nikita, please create them. ++ */ ++ ++#include "../inode.h" ++ ++static int _bugop(void) ++{ ++ BUG_ON(1); ++ return 0; ++} ++ ++#define bugop ((void *)_bugop) ++ ++static int _dummyop(void) ++{ ++ return 0; ++} ++ ++#define dummyop ((void *)_dummyop) ++ ++static int change_file(struct inode *inode, reiser4_plugin * plugin) ++{ ++ /* cannot change object plugin of already existing object */ ++ return RETERR(-EINVAL); ++} ++ ++static reiser4_plugin_ops file_plugin_ops = { ++ .change = change_file ++}; ++ ++/* ++ * Definitions of object plugins.
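As a hedged sketch of the three creation steps listed above, mapped onto methods of the table that follows (reading "install" as set_plug_in_inode and "inherit" as adjust_to_parent is an interpretation, and the create-data argument is elided):

        /* creation path sketch: obtain the plugin by id, install it on the
           new inode, then let it adjust to (inherit from) the parent */
        static int create_path_sketch(struct inode *object, struct inode *parent,
                                      reiser4_plugin_id id)
        {
                file_plugin *fplug = &file_plugins[id];                  /* step 1 */
                int result;

                result = fplug->set_plug_in_inode(object, parent, NULL); /* step 2 */
                if (result == 0)                                         /* step 3 */
                        result = fplug->adjust_to_parent(object, parent,
                                                parent->i_sb->s_root->d_inode);
                return result;
        }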
++ */ ++ ++file_plugin file_plugins[LAST_FILE_PLUGIN_ID] = { ++ [UNIX_FILE_PLUGIN_ID] = { ++ .h = { ++ .type_id = REISER4_FILE_PLUGIN_TYPE, ++ .id = UNIX_FILE_PLUGIN_ID, ++ .pops = &file_plugin_ops, ++ .label = "reg", ++ .desc = "regular file", ++ .linkage = {NULL, NULL}, ++ }, ++ .inode_ops = { ++ .permission = permission_common, ++ .setattr = setattr_unix_file, ++ .getattr = getattr_common ++ }, ++ .file_ops = { ++ .llseek = generic_file_llseek, ++ .read = read_unix_file, ++ .write = write_unix_file, ++ .ioctl = ioctl_unix_file, ++ .mmap = mmap_unix_file, ++ .open = open_unix_file, ++ .release = release_unix_file, ++ .fsync = sync_unix_file, ++ .sendfile = sendfile_unix_file ++ }, ++ .as_ops = { ++ .writepage = reiser4_writepage, ++ .readpage = readpage_unix_file, ++ .sync_page = block_sync_page, ++ .writepages = writepages_unix_file, ++ .set_page_dirty = reiser4_set_page_dirty, ++ .readpages = reiser4_readpages, ++ .prepare_write = prepare_write_unix_file, ++ .commit_write = commit_write_unix_file, ++ .bmap = bmap_unix_file, ++ .invalidatepage = reiser4_invalidatepage, ++ .releasepage = reiser4_releasepage ++ }, ++ .write_sd_by_inode = write_sd_by_inode_common, ++ .flow_by_inode = flow_by_inode_unix_file, ++ .key_by_inode = key_by_inode_and_offset_common, ++ .set_plug_in_inode = set_plug_in_inode_common, ++ .adjust_to_parent = adjust_to_parent_common, ++ .create_object = create_object_common, /* this is not inode_operations's create */ ++ .delete_object = delete_object_unix_file, ++ .add_link = add_link_common, ++ .rem_link = rem_link_common, ++ .owns_item = owns_item_unix_file, ++ .can_add_link = can_add_link_common, ++ .detach = dummyop, ++ .bind = dummyop, ++ .safelink = safelink_common, ++ .estimate = { ++ .create = estimate_create_common, ++ .update = estimate_update_common, ++ .unlink = estimate_unlink_common ++ }, ++ .init_inode_data = init_inode_data_unix_file, ++ .cut_tree_worker = cut_tree_worker_common, ++ .wire = { ++ .write = wire_write_common, ++ .read = wire_read_common, ++ .get = wire_get_common, ++ .size = wire_size_common, ++ .done = wire_done_common ++ } ++ }, ++ [DIRECTORY_FILE_PLUGIN_ID] = { ++ .h = { ++ .type_id = REISER4_FILE_PLUGIN_TYPE, ++ .id = DIRECTORY_FILE_PLUGIN_ID, ++ .pops = &file_plugin_ops, ++ .label = "dir", ++ .desc = "directory", ++ .linkage = {NULL, NULL} ++ }, ++ .inode_ops = {NULL,}, ++ .file_ops = {NULL,}, ++ .as_ops = {NULL,}, ++ ++ .write_sd_by_inode = write_sd_by_inode_common, ++ .flow_by_inode = bugop, ++ .key_by_inode = bugop, ++ .set_plug_in_inode = set_plug_in_inode_common, ++ .adjust_to_parent = adjust_to_parent_common_dir, ++ .create_object = create_object_common, ++ .delete_object = delete_directory_common, ++ .add_link = add_link_common, ++ .rem_link = rem_link_common_dir, ++ .owns_item = owns_item_common_dir, ++ .can_add_link = can_add_link_common, ++ .can_rem_link = can_rem_link_common_dir, ++ .detach = detach_common_dir, ++ .bind = bind_common_dir, ++ .safelink = safelink_common, ++ .estimate = { ++ .create = estimate_create_common_dir, ++ .update = estimate_update_common, ++ .unlink = estimate_unlink_common_dir ++ }, ++ .wire = { ++ .write = wire_write_common, ++ .read = wire_read_common, ++ .get = wire_get_common, ++ .size = wire_size_common, ++ .done = wire_done_common ++ }, ++ .init_inode_data = init_inode_ordering, ++ .cut_tree_worker = cut_tree_worker_common, ++ }, ++ [SYMLINK_FILE_PLUGIN_ID] = { ++ .h = { ++ .type_id = REISER4_FILE_PLUGIN_TYPE, ++ .id = SYMLINK_FILE_PLUGIN_ID, ++ .pops = &file_plugin_ops, ++ .label = 
"symlink", ++ .desc = "symbolic link", ++ .linkage = {NULL,NULL} ++ }, ++ .inode_ops = { ++ .readlink = generic_readlink, ++ .follow_link = follow_link_common, ++ .permission = permission_common, ++ .setattr = setattr_common, ++ .getattr = getattr_common ++ }, ++ /* inode->i_fop of symlink is initialized by NULL in setup_inode_ops */ ++ .file_ops = {NULL,}, ++ .as_ops = {NULL,}, ++ ++ .write_sd_by_inode = write_sd_by_inode_common, ++ .set_plug_in_inode = set_plug_in_inode_common, ++ .adjust_to_parent = adjust_to_parent_common, ++ .create_object = create_symlink, ++ .delete_object = delete_object_common, ++ .add_link = add_link_common, ++ .rem_link = rem_link_common, ++ .can_add_link = can_add_link_common, ++ .detach = dummyop, ++ .bind = dummyop, ++ .safelink = safelink_common, ++ .estimate = { ++ .create = estimate_create_common, ++ .update = estimate_update_common, ++ .unlink = estimate_unlink_common ++ }, ++ .init_inode_data = init_inode_ordering, ++ .cut_tree_worker = cut_tree_worker_common, ++ .destroy_inode = destroy_inode_symlink, ++ .wire = { ++ .write = wire_write_common, ++ .read = wire_read_common, ++ .get = wire_get_common, ++ .size = wire_size_common, ++ .done = wire_done_common ++ } ++ }, ++ [SPECIAL_FILE_PLUGIN_ID] = { ++ .h = { ++ .type_id = REISER4_FILE_PLUGIN_TYPE, ++ .id = SPECIAL_FILE_PLUGIN_ID, ++ .pops = &file_plugin_ops, ++ .label = "special", ++ .desc = ++ "special: fifo, device or socket", ++ .linkage = {NULL, NULL} ++ }, ++ .inode_ops = { ++ .permission = permission_common, ++ .setattr = setattr_common, ++ .getattr = getattr_common ++ }, ++ /* file_ops of special files (sockets, block, char, fifo) are ++ initialized by init_special_inode. */ ++ .file_ops = {NULL,}, ++ .as_ops = {NULL,}, ++ ++ .write_sd_by_inode = write_sd_by_inode_common, ++ .set_plug_in_inode = set_plug_in_inode_common, ++ .adjust_to_parent = adjust_to_parent_common, ++ .create_object = create_object_common, ++ .delete_object = delete_object_common, ++ .add_link = add_link_common, ++ .rem_link = rem_link_common, ++ .owns_item = owns_item_common, ++ .can_add_link = can_add_link_common, ++ .detach = dummyop, ++ .bind = dummyop, ++ .safelink = safelink_common, ++ .estimate = { ++ .create = estimate_create_common, ++ .update = estimate_update_common, ++ .unlink = estimate_unlink_common ++ }, ++ .init_inode_data = init_inode_ordering, ++ .cut_tree_worker = cut_tree_worker_common, ++ .wire = { ++ .write = wire_write_common, ++ .read = wire_read_common, ++ .get = wire_get_common, ++ .size = wire_size_common, ++ .done = wire_done_common ++ } ++ }, ++ [CRC_FILE_PLUGIN_ID] = { ++ .h = { ++ .type_id = REISER4_FILE_PLUGIN_TYPE, ++ .id = CRC_FILE_PLUGIN_ID, ++ .pops = &cryptcompress_plugin_ops, ++ .label = "cryptcompress", ++ .desc = "cryptcompress file", ++ .linkage = {NULL, NULL} ++ }, ++ .inode_ops = { ++ .permission = permission_common, ++ .setattr = setattr_cryptcompress, ++ .getattr = getattr_common ++ }, ++ .file_ops = { ++ .llseek = generic_file_llseek, ++ .read = read_cryptcompress, ++ .write = write_cryptcompress, ++ .mmap = mmap_cryptcompress, ++ .release = release_cryptcompress, ++ .fsync = sync_common, ++ .sendfile = sendfile_cryptcompress ++ }, ++ .as_ops = { ++ .writepage = reiser4_writepage, ++ .readpage = readpage_cryptcompress, ++ .sync_page = block_sync_page, ++ .writepages = writepages_cryptcompress, ++ .set_page_dirty = reiser4_set_page_dirty, ++ .readpages = reiser4_readpages, ++ .prepare_write = prepare_write_common, ++ .invalidatepage = reiser4_invalidatepage, ++ .releasepage = 
reiser4_releasepage ++ }, ++ .write_sd_by_inode = write_sd_by_inode_common, ++ .flow_by_inode = flow_by_inode_cryptcompress, ++ .key_by_inode = key_by_inode_cryptcompress, ++ .set_plug_in_inode = set_plug_in_inode_common, ++ .adjust_to_parent = adjust_to_parent_cryptcompress, ++ .create_object = create_cryptcompress, ++ .open_object = open_cryptcompress, ++ .delete_object = delete_cryptcompress, ++ .add_link = add_link_common, ++ .rem_link = rem_link_common, ++ .owns_item = owns_item_common, ++ .can_add_link = can_add_link_common, ++ .detach = dummyop, ++ .bind = dummyop, ++ .safelink = safelink_common, ++ .estimate = { ++ .create = estimate_create_common, ++ .update = estimate_update_common, ++ .unlink = estimate_unlink_common ++ }, ++ .init_inode_data = init_inode_data_cryptcompress, ++ .cut_tree_worker = cut_tree_worker_cryptcompress, ++ .destroy_inode = destroy_inode_cryptcompress, ++ .wire = { ++ .write = wire_write_common, ++ .read = wire_read_common, ++ .get = wire_get_common, ++ .size = wire_size_common, ++ .done = wire_done_common ++ } ++ } ++}; ++ ++static int change_dir(struct inode *inode, reiser4_plugin * plugin) ++{ ++ /* cannot change dir plugin of already existing object */ ++ return RETERR(-EINVAL); ++} ++ ++static reiser4_plugin_ops dir_plugin_ops = { ++ .change = change_dir ++}; ++ ++/* ++ * definition of directory plugins ++ */ ++ ++dir_plugin dir_plugins[LAST_DIR_ID] = { ++ /* standard hashed directory plugin */ ++ [HASHED_DIR_PLUGIN_ID] = { ++ .h = { ++ .type_id = REISER4_DIR_PLUGIN_TYPE, ++ .id = HASHED_DIR_PLUGIN_ID, ++ .pops = &dir_plugin_ops, ++ .label = "dir", ++ .desc = "hashed directory", ++ .linkage = {NULL, NULL} ++ }, ++ .inode_ops = { ++ .create = create_common, ++ .lookup = lookup_common, ++ .link = link_common, ++ .unlink = unlink_common, ++ .symlink = symlink_common, ++ .mkdir = mkdir_common, ++ .rmdir = unlink_common, ++ .mknod = mknod_common, ++ .rename = rename_common, ++ .permission = permission_common, ++ .setattr = setattr_common, ++ .getattr = getattr_common ++ }, ++ .file_ops = { ++ .llseek = llseek_common_dir, ++ .read = generic_read_dir, ++ .readdir = readdir_common, ++ .release = release_dir_common, ++ .fsync = sync_common ++ }, ++ .as_ops = { ++ .writepage = bugop, ++ .sync_page = bugop, ++ .writepages = dummyop, ++ .set_page_dirty = bugop, ++ .readpages = bugop, ++ .prepare_write = bugop, ++ .commit_write = bugop, ++ .bmap = bugop, ++ .invalidatepage = bugop, ++ .releasepage = bugop ++ }, ++ .get_parent = get_parent_common, ++ .is_name_acceptable = is_name_acceptable_common, ++ .build_entry_key = build_entry_key_hashed, ++ .build_readdir_key = build_readdir_key_common, ++ .add_entry = add_entry_common, ++ .rem_entry = rem_entry_common, ++ .init = init_common, ++ .done = done_common, ++ .attach = attach_common, ++ .detach = detach_common, ++ .estimate = { ++ .add_entry = estimate_add_entry_common, ++ .rem_entry = estimate_rem_entry_common, ++ .unlink = dir_estimate_unlink_common ++ } ++ }, ++ /* hashed directory for which seekdir/telldir are guaranteed to ++ * work. Brain-damage. 
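The guarantee rests on entry keys carrying a hash that fits in 31 bits, so the cookie returned by telldir() stays a small non-negative value that userspace can store and replay through seekdir(). A hedged sketch of that cap (the helper is hypothetical; the real key construction is build_entry_key_seekable):

        /* truncate a full-width hash to a non-negative 31-bit readdir cookie */
        static __u32 seekable_cookie_sketch(__u64 hash)
        {
                return (__u32)(hash & 0x7fffffffULL);
        }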
*/ ++ [SEEKABLE_HASHED_DIR_PLUGIN_ID] = { ++ .h = { ++ .type_id = REISER4_DIR_PLUGIN_TYPE, ++ .id = SEEKABLE_HASHED_DIR_PLUGIN_ID, ++ .pops = &dir_plugin_ops, ++ .label = "dir32", ++ .desc = "directory hashed with 31 bit hash", ++ .linkage = {NULL, NULL} ++ }, ++ .inode_ops = { ++ .create = create_common, ++ .lookup = lookup_common, ++ .link = link_common, ++ .unlink = unlink_common, ++ .symlink = symlink_common, ++ .mkdir = mkdir_common, ++ .rmdir = unlink_common, ++ .mknod = mknod_common, ++ .rename = rename_common, ++ .permission = permission_common, ++ .setattr = setattr_common, ++ .getattr = getattr_common ++ }, ++ .file_ops = { ++ .llseek = llseek_common_dir, ++ .read = generic_read_dir, ++ .readdir = readdir_common, ++ .release = release_dir_common, ++ .fsync = sync_common ++ }, ++ .as_ops = { ++ .writepage = bugop, ++ .sync_page = bugop, ++ .writepages = dummyop, ++ .set_page_dirty = bugop, ++ .readpages = bugop, ++ .prepare_write = bugop, ++ .commit_write = bugop, ++ .bmap = bugop, ++ .invalidatepage = bugop, ++ .releasepage = bugop ++ }, ++ .get_parent = get_parent_common, ++ .is_name_acceptable = is_name_acceptable_common, ++ .build_entry_key = build_entry_key_seekable, ++ .build_readdir_key = build_readdir_key_common, ++ .add_entry = add_entry_common, ++ .rem_entry = rem_entry_common, ++ .init = init_common, ++ .done = done_common, ++ .attach = attach_common, ++ .detach = detach_common, ++ .estimate = { ++ .add_entry = estimate_add_entry_common, ++ .rem_entry = estimate_rem_entry_common, ++ .unlink = dir_estimate_unlink_common ++ } ++ } ++}; ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +Index: linux-2.6.16/fs/reiser4/plugin/object.h +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/plugin/object.h +@@ -0,0 +1,121 @@ ++/* Copyright 2002, 2003 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++/* Declaration of object plugin functions. 
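The declarations below double as a toolkit: a custom file plugin typically reuses most of the *_common implementations and overrides only what differs, roughly as in this hedged fragment (the plugin and its override are hypothetical; the .h header initialization is omitted):

        /* a file plugin built almost entirely from the common helpers,
           overriding only the deletion path */
        file_plugin example_file_plugin = {
                .write_sd_by_inode = write_sd_by_inode_common,
                .set_plug_in_inode = set_plug_in_inode_common,
                .adjust_to_parent  = adjust_to_parent_common,
                .create_object     = create_object_common,
                .delete_object     = example_delete_object,  /* hypothetical override */
                .add_link          = add_link_common,
                .rem_link          = rem_link_common,
                .safelink          = safelink_common,
        };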
*/ ++ ++#if !defined( __FS_REISER4_PLUGIN_OBJECT_H__ ) ++#define __FS_REISER4_PLUGIN_OBJECT_H__ ++ ++#include "../type_safe_hash.h" ++ ++/* common implementations of inode operations */ ++int create_common(struct inode *parent, struct dentry *dentry, ++ int mode, struct nameidata *); ++struct dentry *lookup_common(struct inode *parent, struct dentry *dentry, ++ struct nameidata *nameidata); ++int link_common(struct dentry *existing, struct inode *parent, ++ struct dentry *newname); ++int unlink_common(struct inode *parent, struct dentry *victim); ++int mkdir_common(struct inode *parent, struct dentry *dentry, int mode); ++int symlink_common(struct inode *parent, struct dentry *dentry, ++ const char *linkname); ++int mknod_common(struct inode *parent, struct dentry *dentry, ++ int mode, dev_t rdev); ++int rename_common(struct inode *old_dir, struct dentry *old_name, ++ struct inode *new_dir, struct dentry *new_name); ++void *follow_link_common(struct dentry *, struct nameidata *data); ++int permission_common(struct inode *, int mask, /* mode bits to check permissions for */ ++ struct nameidata *nameidata); ++int setattr_common(struct dentry *, struct iattr *); ++int getattr_common(struct vfsmount *mnt, struct dentry *, struct kstat *); ++ ++/* common implementations of file operations */ ++loff_t llseek_common_dir(struct file *, loff_t off, int origin); ++int readdir_common(struct file *, void *dirent, filldir_t); ++int release_dir_common(struct inode *, struct file *); ++int sync_common(struct file *, struct dentry *, int datasync); ++ ++/* common implementations of address space operations */ ++int prepare_write_common(struct file *, struct page *, unsigned from, ++ unsigned to); ++ ++/* file plugin operations: common implementations */ ++int write_sd_by_inode_common(struct inode *); ++int key_by_inode_and_offset_common(struct inode *, loff_t, reiser4_key *); ++int set_plug_in_inode_common(struct inode *object, struct inode *parent, ++ reiser4_object_create_data *); ++int adjust_to_parent_common(struct inode *object, struct inode *parent, ++ struct inode *root); ++int adjust_to_parent_common_dir(struct inode *object, struct inode *parent, ++ struct inode *root); ++int adjust_to_parent_cryptcompress(struct inode *object, struct inode *parent, ++ struct inode *root); ++int create_object_common(struct inode *object, struct inode *parent, ++ reiser4_object_create_data *); ++int delete_object_common(struct inode *); ++int delete_directory_common(struct inode *); ++int add_link_common(struct inode *object, struct inode *parent); ++int rem_link_common(struct inode *object, struct inode *parent); ++int rem_link_common_dir(struct inode *object, struct inode *parent); ++int owns_item_common(const struct inode *, const coord_t *); ++int owns_item_common_dir(const struct inode *, const coord_t *); ++int can_add_link_common(const struct inode *); ++int can_rem_link_common_dir(const struct inode *); ++int detach_common_dir(struct inode *child, struct inode *parent); ++int open_cryptcompress(struct inode * inode, struct file * file); ++int bind_common_dir(struct inode *child, struct inode *parent); ++int safelink_common(struct inode *, reiser4_safe_link_t, __u64 value); ++reiser4_block_nr estimate_create_common(const struct inode *); ++reiser4_block_nr estimate_create_common_dir(const struct inode *); ++reiser4_block_nr estimate_update_common(const struct inode *); ++reiser4_block_nr estimate_unlink_common(const struct inode *, ++ const struct inode *); ++reiser4_block_nr 
estimate_unlink_common_dir(const struct inode *, ++ const struct inode *); ++char *wire_write_common(struct inode *, char *start); ++char *wire_read_common(char *addr, reiser4_object_on_wire *); ++struct dentry *wire_get_common(struct super_block *, reiser4_object_on_wire *); ++int wire_size_common(struct inode *); ++void wire_done_common(reiser4_object_on_wire *); ++ ++/* dir plugin operations: common implementations */ ++struct dentry *get_parent_common(struct inode *child); ++int is_name_acceptable_common(const struct inode *, const char *name, int len); ++void build_entry_key_common(const struct inode *, ++ const struct qstr *qname, reiser4_key *); ++int build_readdir_key_common(struct file *dir, reiser4_key *); ++int add_entry_common(struct inode *object, struct dentry *where, ++ reiser4_object_create_data *, reiser4_dir_entry_desc *); ++int rem_entry_common(struct inode *object, struct dentry *where, ++ reiser4_dir_entry_desc *); ++int init_common(struct inode *object, struct inode *parent, ++ reiser4_object_create_data *); ++int done_common(struct inode *); ++int attach_common(struct inode *child, struct inode *parent); ++int detach_common(struct inode *object, struct inode *parent); ++reiser4_block_nr estimate_add_entry_common(const struct inode *); ++reiser4_block_nr estimate_rem_entry_common(const struct inode *); ++reiser4_block_nr dir_estimate_unlink_common(const struct inode *, ++ const struct inode *); ++ ++/* these are essential parts of common implementations, they are to make ++ customized implementations easier */ ++int do_prepare_write(struct file *, struct page *, unsigned from, unsigned to); ++ ++/* merely useful functions */ ++int lookup_sd(struct inode *, znode_lock_mode, coord_t *, lock_handle *, ++ const reiser4_key *, int silent); ++ ++ ++/* __FS_REISER4_PLUGIN_OBJECT_H__ */ ++#endif ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +Index: linux-2.6.16/fs/reiser4/plugin/plugin.c +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/plugin/plugin.c +@@ -0,0 +1,533 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++/* Basic plugin infrastructure, lookup etc. */ ++ ++/* PLUGINS: ++ ++ Plugins are internal Reiser4 "modules" or "objects" used to increase ++ extensibility and allow external users to easily adapt reiser4 to ++ their needs. ++ ++ Plugins are classified into several disjoint "types". Plugins ++ belonging to the particular plugin type are termed "instances" of ++ this type. Currently the following types are present: ++ ++ . object plugin ++ . hash plugin ++ . tail plugin ++ . perm plugin ++ . item plugin ++ . node layout plugin ++ ++NIKITA-FIXME-HANS: update this list, and review this entire comment for currency ++ ++ Object (file) plugin determines how given file-system object serves ++ standard VFS requests for read, write, seek, mmap etc. Instances of ++ file plugins are: regular file, directory, symlink. Another example ++ of file plugin is audit plugin, that optionally records accesses to ++ underlying object and forwards requests to it. ++ ++ Hash plugins compute hashes used by reiser4 to store and locate ++ files within directories. Instances of hash plugin type are: r5, ++ tea, rupasov. ++ ++ Tail plugins (or, more precisely, tail policy plugins) determine ++ when last part of the file should be stored in a formatted item. 
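As a hedged illustration of what a tail-policy instance decides (the signature is simplified; the real tail-policy plugins see more context than a size):

        /* "keep tails formatted only for small files" as a yes/no policy */
        static int have_tail_sketch(loff_t new_size)
        {
                return new_size <= 16 * 1024;
        }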
++ ++ Perm plugins control permissions granted for a process accessing a file. ++ ++ Scope and lookup: ++ ++ label such that pair ( type_label, plugin_label ) is unique. This ++ pair is a globally persistent and user-visible plugin ++ identifier. Internally kernel maintains plugins and plugin types in ++ arrays using an index into those arrays as plugin and plugin type ++ identifiers. File-system in turn, also maintains persistent ++ "dictionary" which is mapping from plugin label to numerical ++ identifier which is stored in file-system objects. That is, we ++ store the offset into the plugin array for that plugin type as the ++ plugin id in the stat data of the filesystem object. ++ ++ plugin_labels have meaning for the user interface that assigns ++ plugins to files, and may someday have meaning for dynamic loading of ++ plugins and for copying of plugins from one fs instance to ++ another by utilities like cp and tar. ++ ++ Internal kernel plugin type identifier (index in plugins[] array) is ++ of type reiser4_plugin_type. Set of available plugin types is ++ currently static, but dynamic loading doesn't seem to pose ++ insurmountable problems. ++ ++ Within each type plugins are addressed by the identifiers of type ++ reiser4_plugin_id (indices in ++ reiser4_plugin_type_data.builtin[]). Such identifiers are only ++ required to be unique within one type, not globally. ++ ++ Thus, plugin in memory is uniquely identified by the pair (type_id, ++ id). ++ ++ Usage: ++ ++ There exists only one instance of each plugin instance, but this ++ single instance can be associated with many entities (file-system ++ objects, items, nodes, transactions, file-descriptors etc.). Entity ++ to which plugin of given type is termed (due to the lack of ++ imagination) "subject" of this plugin type and, by abuse of ++ terminology, subject of particular instance of this type to which ++ it's attached currently. For example, inode is subject of object ++ plugin type. Inode representing directory is subject of directory ++ plugin, hash plugin type and some particular instance of hash plugin ++ type. Inode, representing regular file is subject of "regular file" ++ plugin, tail-policy plugin type etc. ++ ++ With each subject the plugin possibly stores some state. For example, ++ the state of a directory plugin (instance of object plugin type) is pointer ++ to hash plugin (if directories always use hashing that is). State of ++ audit plugin is file descriptor (struct file) of log file or some ++ magic value to do logging through printk(). ++ ++ Interface: ++ ++ In addition to a scalar identifier, each plugin type and plugin ++ proper has a "label": short string and a "description"---longer ++ descriptive string. Labels and descriptions of plugin types are ++ hard-coded into plugins[] array, declared and defined in ++ plugin.c. Label and description of plugin are stored in .label and ++ .desc fields of reiser4_plugin_header respectively. It's possible to ++ locate plugin by the pair of labels. ++ ++ Features: ++ ++ . user-level plugin manipulations: ++ + reiser4("filename/..file_plugin<='audit'"); ++ + write(open("filename/..file_plugin"), "audit", 8); ++ ++ . user level utilities lsplug and chplug to manipulate plugins. ++ Utilities are not of primary priority. Possibly they will be not ++ working on v4.0 ++ ++NIKITA-FIXME-HANS: this should be a mkreiserfs option not a mount option, do you agree? I don't think that specifying it at mount time, and then changing it with each mount, is a good model for usage. ++ ++ . 
mount option "plug" to set up plugins of the root directory. ++ "plug=foo:bar" will set "bar" as default plugin of type "foo". ++ ++ Limitations: ++ ++ . each plugin type has to provide at least one builtin ++ plugin. This is a technical limitation and it can be lifted in the ++ future. ++ ++ TODO: ++ ++ New plugin types/plugins: ++ Things we should be able to separately choose to inherit: ++ ++ security plugins ++ ++ stat data ++ ++ file bodies ++ ++ file plugins ++ ++ dir plugins ++ ++ . perm:acl ++ ++ d audi---audit plugin intercepting and possibly logging all ++ accesses to object. Requires putting stub functions in file_operations ++ instead of generic_file_*. ++ ++NIKITA-FIXME-HANS: why make overflows a plugin? ++ . over---handle hash overflows ++ ++ . sqnt---handle different access patterns and instruments read-ahead ++ ++NIKITA-FIXME-HANS: describe the line below in more detail. ++ ++ . hier---handle inheritance of plugins along file-system hierarchy ++ ++ Different kinds of inheritance: on creation vs. on access. ++ Compatible/incompatible plugins. ++ Inheritance for multi-linked files. ++ Layered plugins. ++ Notion of plugin context is abandoned. ++ ++Each file is associated ++ with one plugin and dependent plugins (hash, etc.) are stored as ++ main plugin state. Now, if we have plugins used for regular files ++ but not for directories, how would such plugins be inherited? ++ . always store them with directories also ++ ++NIKITA-FIXME-HANS: Do the line above. It is not exclusive of doing the line below which is also useful. ++ ++ . use inheritance hierarchy, independent of file-system namespace ++ ++*/ ++ ++#include "../debug.h" ++#include "../dformat.h" ++#include "plugin_header.h" ++#include "item/static_stat.h" ++#include "node/node.h" ++#include "security/perm.h" ++#include "space/space_allocator.h" ++#include "disk_format/disk_format.h" ++#include "plugin.h" ++#include "../reiser4.h" ++#include "../jnode.h" ++#include "../inode.h" ++ ++#include <linux/fs.h> /* for struct super_block */ ++ ++/* public interface */ ++ ++/* initialise plugin sub-system. Just call this once on reiser4 startup. */ ++int init_plugins(void); ++int setup_plugins(struct super_block *super, reiser4_plugin ** area); ++int locate_plugin(struct inode *inode, plugin_locator * loc); ++ ++ ++/** ++ * init_plugins - initialize plugins ++ * ++ * Initializes plugin sub-system. It is part of reiser4 module ++ * initialization. For each plugin of each type init method is called and each ++ * plugin is put into list of plugins. ++ */ ++int init_plugins(void) ++{ ++ reiser4_plugin_type type_id; ++ ++ for (type_id = 0; type_id < REISER4_PLUGIN_TYPES; ++type_id) { ++ reiser4_plugin_type_data *ptype; ++ int i; ++ ++ ptype = &plugins[type_id]; ++ assert("nikita-3508", ptype->label != NULL); ++ assert("nikita-3509", ptype->type_id == type_id); ++ ++ INIT_LIST_HEAD(&ptype->plugins_list); ++/* NIKITA-FIXME-HANS: change builtin_num to some other name lacking the term builtin.
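Once initialized this way, plugin ids round-trip through their on-disk format via helpers defined further below; a hedged sketch of that pairing (the wrapper is hypothetical, the endian handling mirrors the patch):

        /* store a plugin id little-endian, as stat-data does, then resolve
           it back through the checked lookup path */
        static reiser4_plugin *roundtrip_sketch(reiser4_plugin *plug)
        {
                d16 area;

                save_plugin_id(plug, &area);
                return plugin_by_unsafe_id(plug->h.type_id,
                                           le16_to_cpu(get_unaligned(&area)));
        }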
*/ ++ for (i = 0; i < ptype->builtin_num; ++i) { ++ reiser4_plugin *plugin; ++ ++ plugin = plugin_at(ptype, i); ++ ++ if (plugin->h.label == NULL) ++ /* uninitialized slot encountered */ ++ continue; ++ assert("nikita-3445", plugin->h.type_id == type_id); ++ plugin->h.id = i; ++ if (plugin->h.pops != NULL && ++ plugin->h.pops->init != NULL) { ++ int result; ++ ++ result = plugin->h.pops->init(plugin); ++ if (result != 0) ++ return result; ++ } ++ INIT_LIST_HEAD(&plugin->h.linkage); ++ list_add_tail(&plugin->h.linkage, &ptype->plugins_list); ++ } ++ } ++ return 0; ++} ++ ++/* true if plugin type id is valid */ ++int is_type_id_valid(reiser4_plugin_type type_id /* plugin type id */ ) ++{ ++ /* "type_id" is unsigned, so no comparison with 0 is ++ necessary */ ++ return (type_id < REISER4_PLUGIN_TYPES); ++} ++ ++/* true if plugin id is valid */ ++int is_plugin_id_valid(reiser4_plugin_type type_id /* plugin type id */ , ++ reiser4_plugin_id id /* plugin id */ ) ++{ ++ assert("nikita-1653", is_type_id_valid(type_id)); ++ return id < plugins[type_id].builtin_num; ++} ++ ++/* return plugin by its @type_id and @id. ++ ++ Both arguments are checked for validness: this is supposed to be called ++ from user-level. ++ ++NIKITA-FIXME-HANS: Do you instead mean that this checks ids created in ++user space, and passed to the filesystem by use of method files? Your ++comment really confused me on the first reading.... ++ ++*/ ++reiser4_plugin *plugin_by_unsafe_id(reiser4_plugin_type type_id /* plugin ++ * type id, ++ * unchecked */ , ++ reiser4_plugin_id id /* plugin id, ++ * unchecked */ ) ++{ ++ if (is_type_id_valid(type_id)) { ++ if (is_plugin_id_valid(type_id, id)) ++ return plugin_at(&plugins[type_id], id); ++ else ++ /* id out of bounds */ ++ warning("nikita-2913", ++ "Invalid plugin id: [%i:%i]", type_id, id); ++ } else ++ /* type_id out of bounds */ ++ warning("nikita-2914", "Invalid type_id: %i", type_id); ++ return NULL; ++} ++ ++/** ++ * save_plugin_id - store plugin id in disk format ++ * @plugin: plugin to convert ++ * @area: where to store result ++ * ++ * Puts id of @plugin in little endian format to address @area. ++ */ ++int save_plugin_id(reiser4_plugin *plugin /* plugin to convert */ , ++ d16 *area /* where to store result */ ) ++{ ++ assert("nikita-1261", plugin != NULL); ++ assert("nikita-1262", area != NULL); ++ ++ put_unaligned(cpu_to_le16(plugin->h.id), area); ++ return 0; ++} ++ ++/* list of all plugins of given type */ ++struct list_head *get_plugin_list(reiser4_plugin_type type_id /* plugin type ++ * id */ ) ++{ ++ assert("nikita-1056", is_type_id_valid(type_id)); ++ return &plugins[type_id].plugins_list; ++} ++ ++int grab_plugin(struct inode *self, struct inode *ancestor, pset_member memb) ++{ ++ reiser4_plugin *plug; ++ reiser4_inode *parent; ++ ++ parent = reiser4_inode_data(ancestor); ++ plug = pset_get(parent->hset, memb) ? 
: pset_get(parent->pset, memb); ++ return grab_plugin_from(self, memb, plug); ++} ++ ++static void update_plugin_mask(reiser4_inode * info, pset_member memb) ++{ ++ struct dentry *rootdir; ++ reiser4_inode *root; ++ ++ rootdir = inode_by_reiser4_inode(info)->i_sb->s_root; ++ if (rootdir != NULL) { ++ root = reiser4_inode_data(rootdir->d_inode); ++ /* ++ * if inode is different from the default one, or we are ++ * changing plugin of root directory, update plugin_mask ++ */ ++ if (pset_get(info->pset, memb) != pset_get(root->pset, memb) || ++ info == root) ++ info->plugin_mask |= (1 << memb); ++ } ++} ++ ++int ++grab_plugin_from(struct inode *self, pset_member memb, reiser4_plugin * plug) ++{ ++ reiser4_inode *info; ++ int result = 0; ++ ++ info = reiser4_inode_data(self); ++ if (pset_get(info->pset, memb) == NULL) { ++ result = pset_set(&info->pset, memb, plug); ++ if (result == 0) ++ update_plugin_mask(info, memb); ++ } ++ return result; ++} ++ ++int force_plugin(struct inode *self, pset_member memb, reiser4_plugin * plug) ++{ ++ reiser4_inode *info; ++ int result = 0; ++ ++ info = reiser4_inode_data(self); ++ if (plug->h.pops != NULL && plug->h.pops->change != NULL) ++ result = plug->h.pops->change(self, plug); ++ else ++ result = pset_set(&info->pset, memb, plug); ++ if (result == 0) ++ update_plugin_mask(info, memb); ++ return result; ++} ++ ++reiser4_plugin_type_data plugins[REISER4_PLUGIN_TYPES] = { ++ /* C90 initializers */ ++ [REISER4_FILE_PLUGIN_TYPE] = { ++ .type_id = REISER4_FILE_PLUGIN_TYPE, ++ .label = "file", ++ .desc = "Object plugins", ++ .builtin_num = sizeof_array(file_plugins), ++ .builtin = file_plugins, ++ .plugins_list = {NULL, NULL}, ++ .size = sizeof(file_plugin) ++ }, ++ [REISER4_DIR_PLUGIN_TYPE] = { ++ .type_id = REISER4_DIR_PLUGIN_TYPE, ++ .label = "dir", ++ .desc = "Directory plugins", ++ .builtin_num = sizeof_array(dir_plugins), ++ .builtin = dir_plugins, ++ .plugins_list = {NULL, NULL}, ++ .size = sizeof(dir_plugin) ++ }, ++ [REISER4_HASH_PLUGIN_TYPE] = { ++ .type_id = REISER4_HASH_PLUGIN_TYPE, ++ .label = "hash", ++ .desc = "Directory hashes", ++ .builtin_num = sizeof_array(hash_plugins), ++ .builtin = hash_plugins, ++ .plugins_list = {NULL, NULL}, ++ .size = sizeof(hash_plugin) ++ }, ++ [REISER4_FIBRATION_PLUGIN_TYPE] = { ++ .type_id = ++ REISER4_FIBRATION_PLUGIN_TYPE, ++ .label = "fibration", ++ .desc = "Directory fibrations", ++ .builtin_num = sizeof_array(fibration_plugins), ++ .builtin = fibration_plugins, ++ .plugins_list = {NULL, NULL}, ++ .size = sizeof(fibration_plugin) ++ }, ++ [REISER4_CIPHER_PLUGIN_TYPE] = { ++ .type_id = REISER4_CIPHER_PLUGIN_TYPE, ++ .label = "cipher", ++ .desc = "Cipher plugins", ++ .builtin_num = sizeof_array(cipher_plugins), ++ .builtin = cipher_plugins, ++ .plugins_list = {NULL, NULL}, ++ .size = sizeof(cipher_plugin) ++ }, ++ [REISER4_DIGEST_PLUGIN_TYPE] = { ++ .type_id = REISER4_DIGEST_PLUGIN_TYPE, ++ .label = "digest", ++ .desc = "Digest plugins", ++ .builtin_num = sizeof_array(digest_plugins), ++ .builtin = digest_plugins, ++ .plugins_list = {NULL, NULL}, ++ .size = sizeof(digest_plugin) ++ }, ++ [REISER4_COMPRESSION_PLUGIN_TYPE] = { ++ .type_id = REISER4_COMPRESSION_PLUGIN_TYPE, ++ .label = "compression", ++ .desc = "Compression plugins", ++ .builtin_num = sizeof_array(compression_plugins), ++ .builtin = compression_plugins, ++ .plugins_list = {NULL, NULL}, ++ .size = sizeof(compression_plugin) ++ }, ++ [REISER4_FORMATTING_PLUGIN_TYPE] = { ++ .type_id = REISER4_FORMATTING_PLUGIN_TYPE, ++ .label = "formatting", ++ .desc = 
"Tail inlining policies", ++ .builtin_num = sizeof_array(formatting_plugins), ++ .builtin = formatting_plugins, ++ .plugins_list = {NULL, NULL}, ++ .size = sizeof(formatting_plugin) ++ }, ++ [REISER4_PERM_PLUGIN_TYPE] = { ++ .type_id = REISER4_PERM_PLUGIN_TYPE, ++ .label = "perm", ++ .desc = "Permission checks", ++ .builtin_num = sizeof_array(perm_plugins), ++ .builtin = perm_plugins, ++ .plugins_list = {NULL, NULL}, ++ .size = sizeof(perm_plugin) ++ }, ++ [REISER4_ITEM_PLUGIN_TYPE] = { ++ .type_id = REISER4_ITEM_PLUGIN_TYPE, ++ .label = "item", ++ .desc = "Item handlers", ++ .builtin_num = sizeof_array(item_plugins), ++ .builtin = item_plugins, ++ .plugins_list = {NULL, NULL}, ++ .size = sizeof(item_plugin) ++ }, ++ [REISER4_NODE_PLUGIN_TYPE] = { ++ .type_id = REISER4_NODE_PLUGIN_TYPE, ++ .label = "node", ++ .desc = "node layout handlers", ++ .builtin_num = sizeof_array(node_plugins), ++ .builtin = node_plugins, ++ .plugins_list = {NULL, NULL}, ++ .size = sizeof(node_plugin) ++ }, ++ [REISER4_SD_EXT_PLUGIN_TYPE] = { ++ .type_id = REISER4_SD_EXT_PLUGIN_TYPE, ++ .label = "sd_ext", ++ .desc = "Parts of stat-data", ++ .builtin_num = sizeof_array(sd_ext_plugins), ++ .builtin = sd_ext_plugins, ++ .plugins_list = {NULL, NULL}, ++ .size = sizeof(sd_ext_plugin) ++ }, ++ [REISER4_FORMAT_PLUGIN_TYPE] = { ++ .type_id = REISER4_FORMAT_PLUGIN_TYPE, ++ .label = "disk_layout", ++ .desc = "defines filesystem on disk layout", ++ .builtin_num = sizeof_array(format_plugins), ++ .builtin = format_plugins, ++ .plugins_list = {NULL, NULL}, ++ .size = sizeof(disk_format_plugin) ++ }, ++ [REISER4_JNODE_PLUGIN_TYPE] = { ++ .type_id = REISER4_JNODE_PLUGIN_TYPE, ++ .label = "jnode", ++ .desc = "defines kind of jnode", ++ .builtin_num = sizeof_array(jnode_plugins), ++ .builtin = jnode_plugins, ++ .plugins_list = {NULL, NULL}, ++ .size = sizeof(jnode_plugin) ++ }, ++ [REISER4_COMPRESSION_MODE_PLUGIN_TYPE] = { ++ .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE, ++ .label = "compression_mode", ++ .desc = "Defines compression mode", ++ .builtin_num = sizeof_array(compression_mode_plugins), ++ .builtin = compression_mode_plugins, ++ .plugins_list = {NULL, NULL}, ++ .size = sizeof(compression_mode_plugin) ++ }, ++ [REISER4_CLUSTER_PLUGIN_TYPE] = { ++ .type_id = REISER4_CLUSTER_PLUGIN_TYPE, ++ .label = "cluster", ++ .desc = "Defines cluster size", ++ .builtin_num = sizeof_array(cluster_plugins), ++ .builtin = cluster_plugins, ++ .plugins_list = {NULL, NULL}, ++ .size = sizeof(cluster_plugin) ++ }, ++ [REISER4_REGULAR_PLUGIN_TYPE] = { ++ .type_id = REISER4_REGULAR_PLUGIN_TYPE, ++ .label = "regular", ++ .desc = "Defines kind of regular file", ++ .builtin_num = ++ sizeof_array(regular_plugins), ++ .builtin = regular_plugins, ++ .plugins_list = {NULL, NULL}, ++ .size = sizeof(regular_plugin) ++ } ++}; ++ ++/* ++ * Local variables: ++ * c-indentation-style: "K&R" ++ * mode-name: "LC" ++ * c-basic-offset: 8 ++ * tab-width: 8 ++ * fill-column: 120 ++ * End: ++ */ +Index: linux-2.6.16/fs/reiser4/plugin/plugin.h +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/plugin/plugin.h +@@ -0,0 +1,936 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++/* Basic plugin data-types. 
++ see fs/reiser4/plugin/plugin.c for details */ ++ ++#if !defined( __FS_REISER4_PLUGIN_TYPES_H__ ) ++#define __FS_REISER4_PLUGIN_TYPES_H__ ++ ++#include "../forward.h" ++#include "../debug.h" ++#include "../dformat.h" ++#include "../key.h" ++#include "compress/compress.h" ++#include "crypto/cipher.h" ++#include "plugin_header.h" ++#include "item/static_stat.h" ++#include "item/internal.h" ++#include "item/sde.h" ++#include "item/cde.h" ++#include "item/item.h" ++#include "node/node.h" ++#include "node/node40.h" ++#include "security/perm.h" ++#include "fibration.h" ++ ++#include "space/bitmap.h" ++#include "space/space_allocator.h" ++ ++#include "disk_format/disk_format40.h" ++#include "disk_format/disk_format.h" ++ ++#include /* for struct super_block, address_space */ ++#include /* for struct page */ ++#include /* for struct buffer_head */ ++#include /* for struct dentry */ ++#include ++#include ++ ++typedef struct reiser4_object_on_wire reiser4_object_on_wire; ++ ++/* ++ * File plugin. Defines the set of methods that file plugins implement, some ++ * of which are optional. ++ * ++ * A file plugin offers to the caller an interface for IO ( writing to and/or ++ * reading from) to what the caller sees as one sequence of bytes. An IO to it ++ * may affect more than one physical sequence of bytes, or no physical sequence ++ * of bytes, it may affect sequences of bytes offered by other file plugins to ++ * the semantic layer, and the file plugin may invoke other plugins and ++ * delegate work to them, but its interface is structured for offering the ++ * caller the ability to read and/or write what the caller sees as being a ++ * single sequence of bytes. ++ * ++ * The file plugin must present a sequence of bytes to the caller, but it does ++ * not necessarily have to store a sequence of bytes, it does not necessarily ++ * have to support efficient tree traversal to any offset in the sequence of ++ * bytes (tail and extent items, whose keys contain offsets, do however provide ++ * efficient non-sequential lookup of any offset in the sequence of bytes). ++ * ++ * Directory plugins provide methods for selecting file plugins by resolving a ++ * name for them. ++ * ++ * The functionality other filesystems call an attribute, and rigidly tie ++ * together, we decompose into orthogonal selectable features of files. Using ++ * the terminology we will define next, an attribute is a perhaps constrained, ++ * perhaps static length, file whose parent has a uni-count-intra-link to it, ++ * which might be grandparent-major-packed, and whose parent has a deletion ++ * method that deletes it. ++ * ++ * File plugins can implement constraints. ++ * ++ * Files can be of variable length (e.g. regular unix files), or of static ++ * length (e.g. static sized attributes). ++ * ++ * An object may have many sequences of bytes, and many file plugins, but, it ++ * has exactly one objectid. It is usually desirable that an object has a ++ * deletion method which deletes every item with that objectid. Items cannot ++ * in general be found by just their objectids. This means that an object must ++ * have either a method built into its deletion plugin method for knowing what ++ * items need to be deleted, or links stored with the object that provide the ++ * plugin with a method for finding those items. Deleting a file within an ++ * object may or may not have the effect of deleting the entire object, ++ * depending on the file plugin's deletion method. 
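++ *
++ * (Editorial illustration, not part of the original patch: for a plain
++ * unix file the deletion method removes the stat-data item and every
++ * body item carrying the file's objectid, so unlinking the last name
++ * deletes the whole object; an attribute-style child object, as
++ * described above, is instead deleted by its parent's deletion method.)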
++ * ++ * LINK TAXONOMY: ++ * ++ * Many objects have a reference count, and when the reference count reaches 0 ++ * the object's deletion method is invoked. Some links embody a reference ++ * count increase ("countlinks"), and others do not ("nocountlinks"). ++ * ++ * Some links are bi-directional links ("bilinks"), and some are ++ * uni-directional("unilinks"). ++ * ++ * Some links are between parts of the same object ("intralinks"), and some are ++ * between different objects ("interlinks"). ++ * ++ * PACKING TAXONOMY: ++ * ++ * Some items of an object are stored with a major packing locality based on ++ * their object's objectid (e.g. unix directory items in plan A), and these are ++ * called "self-major-packed". ++ * ++ * Some items of an object are stored with a major packing locality based on ++ * their semantic parent object's objectid (e.g. unix file bodies in plan A), ++ * and these are called "parent-major-packed". ++ * ++ * Some items of an object are stored with a major packing locality based on ++ * their semantic grandparent, and these are called "grandparent-major-packed". ++ * Now carefully notice that we run into trouble with key length if we have to ++ * store a 8 byte major+minor grandparent based packing locality, an 8 byte ++ * parent objectid, an 8 byte attribute objectid, and an 8 byte offset, all in ++ * a 24 byte key. One of these fields must be sacrificed if an item is to be ++ * grandparent-major-packed, and which to sacrifice is left to the item author ++ * choosing to make the item grandparent-major-packed. You cannot make tail ++ * items and extent items grandparent-major-packed, though you could make them ++ * self-major-packed (usually they are parent-major-packed). ++ * ++ * In the case of ACLs (which are composed of fixed length ACEs which consist ++ * of {subject-type, subject, and permission bitmask} triples), it makes sense ++ * to not have an offset field in the ACE item key, and to allow duplicate keys ++ * for ACEs. Thus, the set of ACES for a given file is found by looking for a ++ * key consisting of the objectid of the grandparent (thus grouping all ACLs in ++ * a directory together), the minor packing locality of ACE, the objectid of ++ * the file, and 0. ++ * ++ * IO involves moving data from one location to another, which means that two ++ * locations must be specified, source and destination. ++ * ++ * This source and destination can be in the filesystem, or they can be a ++ * pointer in the user process address space plus a byte count. ++ * ++ * If both source and destination are in the filesystem, then at least one of ++ * them must be representable as a pure stream of bytes (which we call a flow, ++ * and define as a struct containing a key, a data pointer, and a length). ++ * This may mean converting one of them into a flow. We provide a generic ++ * cast_into_flow() method, which will work for any plugin supporting ++ * read_flow(), though it is inefficiently implemented in that it temporarily ++ * stores the flow in a buffer (Question: what to do with huge flows that ++ * cannot fit into memory? Answer: we must not convert them all at once. ) ++ * ++ * Performing a write requires resolving the write request into a flow defining ++ * the source, and a method that performs the write, and a key that defines ++ * where in the tree the write is to go. 
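++ *
++ * (Editorial sketch, not part of the original patch: per the
++ * definition above, a flow is essentially
++ *
++ *	struct flow {
++ *		reiser4_key key;	/* key of the first byte in the tree *​/
++ *		char *data;		/* bytes being moved *​/
++ *		loff_t length;		/* how many of them *​/
++ *	};
++ *
++ * field names here are illustrative -- the real flow_t is declared
++ * elsewhere in this patch -- but a write then decomposes into exactly
++ * such a flow, a write method, and a destination key.)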
++ * ++ * Performing a read requires resolving the read request into a flow defining ++ * the target, and a method that performs the read, and a key that defines ++ * where in the tree the read is to come from. ++ * ++ * There will exist file plugins which have no pluginid stored on the disk for ++ * them, and which are only invoked by other plugins. ++ */ ++ ++/* builtin file-plugins */ ++typedef enum { ++ /* regular file */ ++ UNIX_FILE_PLUGIN_ID, ++ /* directory */ ++ DIRECTORY_FILE_PLUGIN_ID, ++ /* symlink */ ++ SYMLINK_FILE_PLUGIN_ID, ++ /* for objects completely handled by the VFS: fifos, devices, ++ sockets */ ++ SPECIAL_FILE_PLUGIN_ID, ++ /* regular cryptcompress file */ ++ CRC_FILE_PLUGIN_ID, ++ /* number of file plugins. Used as size of arrays to hold ++ file plugins. */ ++ LAST_FILE_PLUGIN_ID ++} reiser4_file_id; ++ ++typedef struct file_plugin { ++ ++ /* generic fields */ ++ plugin_header h; ++ ++ struct inode_operations inode_ops; ++ struct file_operations file_ops; ++ struct address_space_operations as_ops; ++ ++ /* save inode cached stat-data onto disk. It was called ++ reiserfs_update_sd() in 3.x */ ++ int (*write_sd_by_inode) (struct inode *); ++ ++ /* ++ * private methods: These are optional. If used they will allow you to ++ * minimize the amount of code needed to implement a deviation from ++ * some other method that also uses them. ++ */ ++ ++ /* ++ * Construct flow into @flow according to user-supplied data. ++ * ++ * This is used by read/write methods to construct a flow to ++ * write/read. ->flow_by_inode() is plugin method, rather than single ++ * global implementation, because key in a flow used by plugin may ++ * depend on data in a @buf. ++ * ++ * NIKITA-FIXME-HANS: please create statistics on what functions are ++ * dereferenced how often for the mongo benchmark. You can supervise ++ * Elena doing this for you if that helps. Email me the list of the ++ * top 10, with their counts, and an estimate of the total number of ++ * CPU cycles spent dereferencing as a percentage of CPU cycles spent ++ * processing (non-idle processing). If the total percent is, say, ++ * less than 1%, it will make our coding discussions much easier, and ++ * keep me from questioning whether functions like the below are too ++ * frequently called to be dereferenced. If the total percent is more ++ * than 1%, perhaps private methods should be listed in a "required" ++ * comment at the top of each plugin (with stern language about how if ++ * the comment is missing it will not be accepted by the maintainer), ++ * and implemented using macros not dereferenced functions. How about ++ * replacing this whole private methods part of the struct with a ++ * thorough documentation of what the standard helper functions are for ++ * use in constructing plugins? I think users have been asking for ++ * that, though not in so many words. ++ */ ++ int (*flow_by_inode) (struct inode *, const char __user *buf, ++ int user, loff_t size, ++ loff_t off, rw_op op, flow_t *); ++ ++ /* ++ * Return the key used to retrieve an offset of a file. It is used by ++ * default implementation of ->flow_by_inode() method ++ * (common_build_flow()) and, among other things, to get to the extent ++ * from jnode of unformatted node. ++ */ ++ int (*key_by_inode) (struct inode *, loff_t off, reiser4_key *); ++ ++ /* NIKITA-FIXME-HANS: this comment is not as clear to others as you think.... */ ++ /* ++ * set the plugin for a file. 
Called during file creation in creat() ++ * but not reiser4() unless an inode already exists for the file. ++ */ ++ int (*set_plug_in_inode) (struct inode *inode, struct inode *parent, ++ reiser4_object_create_data *); ++ ++ /* NIKITA-FIXME-HANS: comment and name seem to say different things, ++ * are you setting up the object itself also or just adjusting the ++ * parent?.... */ ++ /* set up plugins for new @object created in @parent. @root is root ++ directory. */ ++ int (*adjust_to_parent) (struct inode *object, struct inode *parent, ++ struct inode *root); ++ /* ++ * this does whatever is necessary to do when object is created. For ++ * instance, for unix files stat data is inserted. It is supposed to be ++ * called by create of struct inode_operations. ++ */ ++ int (*create_object) (struct inode *object, struct inode *parent, ++ reiser4_object_create_data *); ++ ++ /* this does whatever is necessary to do when object is opened */ ++ int (*open_object) (struct inode * inode, struct file * file); ++ /* ++ * this method should check REISER4_NO_SD and set REISER4_NO_SD on ++ * success. Deletion of an object usually includes removal of items ++ * building file body (for directories this is removal of "." and "..") ++ * and removal of stat-data item. ++ */ ++ int (*delete_object) (struct inode *); ++ ++ /* add link from @parent to @object */ ++ int (*add_link) (struct inode *object, struct inode *parent); ++ ++ /* remove link from @parent to @object */ ++ int (*rem_link) (struct inode *object, struct inode *parent); ++ ++ /* ++ * return true if item addressed by @coord belongs to @inode. This is ++ * used by read/write to properly slice flow into items in presence of ++ * multiple key assignment policies, because items of a file are not ++ * necessarily contiguous in a key space, for example, in a plan-b. ++ */ ++ int (*owns_item) (const struct inode *, const coord_t *); ++ ++ /* checks whether yet another hard links to this object can be ++ added */ ++ int (*can_add_link) (const struct inode *); ++ ++ /* checks whether hard links to this object can be removed */ ++ int (*can_rem_link) (const struct inode *); ++ ++ /* not empty for DIRECTORY_FILE_PLUGIN_ID only currently. It calls ++ detach of directory plugin to remove ".." */ ++ int (*detach) (struct inode * child, struct inode * parent); ++ ++ /* called when @child was just looked up in the @parent. It is not ++ empty for DIRECTORY_FILE_PLUGIN_ID only where it calls attach of ++ directory plugin */ ++ int (*bind) (struct inode * child, struct inode * parent); ++ ++ /* process safe-link during mount */ ++ int (*safelink) (struct inode * object, reiser4_safe_link_t link, ++ __u64 value); ++ ++ /* The couple of estimate methods for all file operations */ ++ struct { ++ reiser4_block_nr(*create) (const struct inode *); ++ reiser4_block_nr(*update) (const struct inode *); ++ reiser4_block_nr(*unlink) (const struct inode *, ++ const struct inode *); ++ } estimate; ++ ++ /* ++ * reiser4 specific part of inode has a union of structures which are ++ * specific to a plugin. This method is called when inode is read ++ * (read_inode) and when file is created (common_create_child) so that ++ * file plugin could initialize its inode data ++ */ ++ void (*init_inode_data) (struct inode *, reiser4_object_create_data *, ++ int); ++ ++ /* ++ * This method performs progressive deletion of items and whole nodes ++ * from right to left. 
++	 *
++	 * @tap: the point the deletion process begins from,
++	 * @from_key: the beginning of the deleted key range,
++	 * @to_key: the end of the deleted key range,
++	 * @smallest_removed: the smallest removed key,
++	 *
++	 * @return: 0 on success, error code otherwise; -E_REPEAT means that
++	 * a long cut_tree operation was interrupted to allow an atom commit.
++	 */
++	int (*cut_tree_worker) (tap_t *, const reiser4_key * from_key,
++				const reiser4_key * to_key,
++				reiser4_key * smallest_removed, struct inode *,
++				int, int *);
++
++	/* called from ->destroy_inode() */
++	void (*destroy_inode) (struct inode *);
++
++	/*
++	 * methods to serialize object identity. This is used, for example,
++	 * by reiser4_{en,de}code_fh().
++	 */
++	struct {
++		/* store object's identity at @area */
++		char *(*write) (struct inode * inode, char *area);
++		/* parse object from wire to the @obj */
++		char *(*read) (char *area, reiser4_object_on_wire * obj);
++		/* given object identity in @obj, find or create its dentry */
++		struct dentry *(*get) (struct super_block * s,
++				       reiser4_object_on_wire * obj);
++		/* how many bytes ->wire.write() consumes */
++		int (*size) (struct inode * inode);
++		/* finish with object identity */
++		void (*done) (reiser4_object_on_wire * obj);
++	} wire;
++} file_plugin;
++
++extern file_plugin file_plugins[LAST_FILE_PLUGIN_ID];
++
++struct reiser4_object_on_wire {
++	file_plugin *plugin;
++	union {
++		struct {
++			obj_key_id key_id;
++		} std;
++		void *generic;
++	} u;
++};
++
++/* builtin dir-plugins */
++typedef enum {
++	HASHED_DIR_PLUGIN_ID,
++	SEEKABLE_HASHED_DIR_PLUGIN_ID,
++	LAST_DIR_ID
++} reiser4_dir_id;
++
++typedef struct dir_plugin {
++	/* generic fields */
++	plugin_header h;
++
++	struct inode_operations inode_ops;
++	struct file_operations file_ops;
++	struct address_space_operations as_ops;
++
++	/*
++	 * private methods: These are optional. If used they will allow you
++	 * to minimize the amount of code needed to implement a deviation
++	 * from some other method that uses them. You could logically argue
++	 * that they should be a separate type of plugin.
++	 */
++
++	struct dentry *(*get_parent) (struct inode * childdir);
++
++	/*
++	 * check whether "name" is an acceptable name to be inserted into
++	 * this object. Optionally implemented by directory-like objects.
++	 * Can check for maximal length, reserved symbols, etc.
++	 */
++	int (*is_name_acceptable) (const struct inode * inode, const char *name,
++				   int len);
++
++	void (*build_entry_key) (const struct inode * dir /* directory where
++							   * entry is (or will
++							   * be) in.*/ ,
++				 const struct qstr * name /* name of file
++							   * referenced by this
++							   * entry */ ,
++				 reiser4_key * result /* resulting key of
++						       * directory entry */ );
++	int (*build_readdir_key) (struct file * dir, reiser4_key * result);
++	int (*add_entry) (struct inode * object, struct dentry * where,
++			  reiser4_object_create_data * data,
++			  reiser4_dir_entry_desc * entry);
++	int (*rem_entry) (struct inode * object, struct dentry * where,
++			  reiser4_dir_entry_desc * entry);
++
++	/*
++	 * initialize directory structure for a newly created object. For
++	 * normal unix directories, insert dot and dotdot.
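++	 * (Editorial note, not part of the original patch: creating "." and
++	 * ".." conceptually mirrors two add_entry() calls, one binding "."
++	 * to the object itself and one binding ".." to @parent; done() below
++	 * is the tear-down counterpart.)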
	 */
++	int (*init) (struct inode * object, struct inode * parent,
++		     reiser4_object_create_data * data);
++
++	/* destroy directory */
++	int (*done) (struct inode * child);
++
++	/* called when @subdir was just looked up in @dir */
++	int (*attach) (struct inode * subdir, struct inode * dir);
++	int (*detach) (struct inode * subdir, struct inode * dir);
++
++	struct {
++		reiser4_block_nr(*add_entry) (const struct inode *);
++		reiser4_block_nr(*rem_entry) (const struct inode *);
++		reiser4_block_nr(*unlink) (const struct inode *,
++					   const struct inode *);
++	} estimate;
++} dir_plugin;
++
++extern dir_plugin dir_plugins[LAST_DIR_ID];
++
++typedef struct formatting_plugin {
++	/* generic fields */
++	plugin_header h;
++	/* returns non-zero iff file's tail has to be stored
++	   in a direct item. */
++	int (*have_tail) (const struct inode * inode, loff_t size);
++} formatting_plugin;
++
++typedef struct hash_plugin {
++	/* generic fields */
++	plugin_header h;
++	/* computes hash of the given name */
++	__u64(*hash) (const unsigned char *name, int len);
++} hash_plugin;
++
++typedef struct cipher_plugin {
++	/* generic fields */
++	plugin_header h;
++	struct crypto_tfm * (*alloc) (void);
++	void (*free) (struct crypto_tfm * tfm);
++	/* Offset translator. For each offset this returns (k * offset),
++	   where k (k >= 1) is an expansion factor of the cipher algorithm.
++	   For all symmetric algorithms k == 1. For asymmetric algorithms
++	   (which inflate data) offset translation guarantees that all units
++	   of a disk cluster will have keys smaller than the next cluster's.
++	 */
++	loff_t(*scale) (struct inode * inode, size_t blocksize, loff_t src);
++	/* Cipher algorithms can accept data only in chunks of cipher block
++	   size. This method aligns any flow up to cipher block size when
++	   we pass it to the cipher algorithm. To align means to append
++	   padding of a special format specific to the cipher algorithm */
++	int (*align_stream) (__u8 * tail, int clust_size, int blocksize);
++	/* low-level key manager (check, install, etc.) */
++	int (*setkey) (struct crypto_tfm * tfm, const __u8 * key,
++		       unsigned int keylen);
++	/* main text processing procedures */
++	void (*encrypt) (__u32 * expkey, __u8 * dst, const __u8 * src);
++	void (*decrypt) (__u32 * expkey, __u8 * dst, const __u8 * src);
++} cipher_plugin;
++
++typedef struct digest_plugin {
++	/* generic fields */
++	plugin_header h;
++	/* fingerprint size in bytes */
++	int fipsize;
++	struct crypto_tfm * (*alloc) (void);
++	void (*free) (struct crypto_tfm * tfm);
++} digest_plugin;
++
++typedef struct compression_plugin {
++	/* generic fields */
++	plugin_header h;
++	int (*init) (void);
++	/* the maximum number of bytes by which the size of the "compressed"
++	 * data can exceed the size of the uncompressed data.
*/ ++ int (*overrun) (unsigned src_len); ++ coa_t(*alloc) (tfm_action act); ++ void (*free) (coa_t coa, tfm_action act); ++ /* minimal size of the flow we still try to compress */ ++ int (*min_size_deflate) (void); ++ __u32(*checksum) (char *data, __u32 length); ++ /* main transform procedures */ ++ void (*compress) (coa_t coa, __u8 * src_first, unsigned src_len, ++ __u8 * dst_first, unsigned *dst_len); ++ void (*decompress) (coa_t coa, __u8 * src_first, unsigned src_len, ++ __u8 * dst_first, unsigned *dst_len); ++} compression_plugin; ++ ++typedef struct compression_mode_plugin { ++ /* generic fields */ ++ plugin_header h; ++ /* this is called when estimating compressibility ++ of a logical cluster by its content */ ++ int (*should_deflate) (struct inode * inode, cloff_t index); ++ /* this is called when results of compression should be saved */ ++ int (*accept_hook) (struct inode * inode, cloff_t index); ++ /* this is called when results of compression should be discarded */ ++ int (*discard_hook) (struct inode * inode, cloff_t index); ++} compression_mode_plugin; ++ ++typedef struct regular_plugin { ++ /* generic fields */ ++ plugin_header h; ++ /* file plugin id which implements regular file */ ++ reiser4_file_id id; ++} regular_plugin; ++ ++typedef struct cluster_plugin { ++ /* generic fields */ ++ plugin_header h; ++ int shift; ++} cluster_plugin; ++ ++typedef struct sd_ext_plugin { ++ /* generic fields */ ++ plugin_header h; ++ int (*present) (struct inode * inode, char **area, int *len); ++ int (*absent) (struct inode * inode); ++ int (*save_len) (struct inode * inode); ++ int (*save) (struct inode * inode, char **area); ++ /* alignment requirement for this stat-data part */ ++ int alignment; ++} sd_ext_plugin; ++ ++/* this plugin contains methods to allocate objectid for newly created files, ++ to deallocate objectid when file gets removed, to report number of used and ++ free objectids */ ++typedef struct oid_allocator_plugin { ++ /* generic fields */ ++ plugin_header h; ++ int (*init_oid_allocator) (reiser4_oid_allocator * map, __u64 nr_files, ++ __u64 oids); ++ /* used to report statfs->f_files */ ++ __u64(*oids_used) (reiser4_oid_allocator * map); ++ /* get next oid to use */ ++ __u64(*next_oid) (reiser4_oid_allocator * map); ++ /* used to report statfs->f_ffree */ ++ __u64(*oids_free) (reiser4_oid_allocator * map); ++ /* allocate new objectid */ ++ int (*allocate_oid) (reiser4_oid_allocator * map, oid_t *); ++ /* release objectid */ ++ int (*release_oid) (reiser4_oid_allocator * map, oid_t); ++ /* how many pages to reserve in transaction for allocation of new ++ objectid */ ++ int (*oid_reserve_allocate) (reiser4_oid_allocator * map); ++ /* how many pages to reserve in transaction for freeing of an ++ objectid */ ++ int (*oid_reserve_release) (reiser4_oid_allocator * map); ++ void (*print_info) (const char *, reiser4_oid_allocator *); ++} oid_allocator_plugin; ++ ++/* disk layout plugin: this specifies super block, journal, bitmap (if there ++ are any) locations, etc */ ++typedef struct disk_format_plugin { ++ /* generic fields */ ++ plugin_header h; ++ /* replay journal, initialize super_info_data, etc */ ++ int (*init_format) (struct super_block *, void *data); ++ ++ /* key of root directory stat data */ ++ const reiser4_key *(*root_dir_key) (const struct super_block *); ++ ++ int (*release) (struct super_block *); ++ jnode *(*log_super) (struct super_block *); ++ int (*check_open) (const struct inode * object); ++} disk_format_plugin; ++ ++struct jnode_plugin { ++ /* 
generic fields */ ++ plugin_header h; ++ int (*init) (jnode * node); ++ int (*parse) (jnode * node); ++ struct address_space *(*mapping) (const jnode * node); ++ unsigned long (*index) (const jnode * node); ++ jnode *(*clone) (jnode * node); ++}; ++ ++/* plugin instance. */ ++/* */ ++/* This is "wrapper" union for all types of plugins. Most of the code uses */ ++/* plugins of particular type (file_plugin, dir_plugin, etc.) rather than */ ++/* operates with pointers to reiser4_plugin. This union is only used in */ ++/* some generic code in plugin/plugin.c that operates on all */ ++/* plugins. Technically speaking purpose of this union is to add type */ ++/* safety to said generic code: each plugin type (file_plugin, for */ ++/* example), contains plugin_header as its first memeber. This first member */ ++/* is located at the same place in memory as .h member of */ ++/* reiser4_plugin. Generic code, obtains pointer to reiser4_plugin and */ ++/* looks in the .h which is header of plugin type located in union. This */ ++/* allows to avoid type-casts. */ ++union reiser4_plugin { ++ /* generic fields */ ++ plugin_header h; ++ /* file plugin */ ++ file_plugin file; ++ /* directory plugin */ ++ dir_plugin dir; ++ /* hash plugin, used by directory plugin */ ++ hash_plugin hash; ++ /* fibration plugin used by directory plugin */ ++ fibration_plugin fibration; ++ /* cipher transform plugin, used by file plugin */ ++ cipher_plugin cipher; ++ /* digest transform plugin, used by file plugin */ ++ digest_plugin digest; ++ /* compression transform plugin, used by file plugin */ ++ compression_plugin compression; ++ /* tail plugin, used by file plugin */ ++ formatting_plugin formatting; ++ /* permission plugin */ ++ perm_plugin perm; ++ /* node plugin */ ++ node_plugin node; ++ /* item plugin */ ++ item_plugin item; ++ /* stat-data extension plugin */ ++ sd_ext_plugin sd_ext; ++ /* disk layout plugin */ ++ disk_format_plugin format; ++ /* object id allocator plugin */ ++ oid_allocator_plugin oid_allocator; ++ /* plugin for different jnode types */ ++ jnode_plugin jnode; ++ /* compression mode plugin, used by object plugin */ ++ compression_mode_plugin compression_mode; ++ /* cluster plugin, used by object plugin */ ++ cluster_plugin clust; ++ /* regular plugin, used by directory plugin */ ++ regular_plugin regular; ++ /* place-holder for new plugin types that can be registered ++ dynamically, and used by other dynamically loaded plugins. */ ++ void *generic; ++}; ++ ++struct reiser4_plugin_ops { ++ /* called when plugin is initialized */ ++ int (*init) (reiser4_plugin * plugin); ++ /* called when plugin is unloaded */ ++ int (*done) (reiser4_plugin * plugin); ++ /* load given plugin from disk */ ++ int (*load) (struct inode * inode, ++ reiser4_plugin * plugin, char **area, int *len); ++ /* how many space is required to store this plugin's state ++ in stat-data */ ++ int (*save_len) (struct inode * inode, reiser4_plugin * plugin); ++ /* save persistent plugin-data to disk */ ++ int (*save) (struct inode * inode, reiser4_plugin * plugin, ++ char **area); ++ /* alignment requirement for on-disk state of this plugin ++ in number of bytes */ ++ int alignment; ++ /* install itself into given inode. This can return error ++ (e.g., you cannot change hash of non-empty directory). */ ++ int (*change) (struct inode * inode, reiser4_plugin * plugin); ++ /* install itself into given inode. This can return error ++ (e.g., you cannot change hash of non-empty directory). 
*/ ++ int (*inherit) (struct inode * inode, struct inode * parent, ++ reiser4_plugin * plugin); ++}; ++ ++/* functions implemented in fs/reiser4/plugin/plugin.c */ ++ ++/* stores plugin reference in reiser4-specific part of inode */ ++extern int set_object_plugin(struct inode *inode, reiser4_plugin_id id); ++extern int setup_plugins(struct super_block *super, reiser4_plugin ** area); ++extern int init_plugins(void); ++ ++/* builtin plugins */ ++ ++/* builtin hash-plugins */ ++ ++typedef enum { ++ RUPASOV_HASH_ID, ++ R5_HASH_ID, ++ TEA_HASH_ID, ++ FNV1_HASH_ID, ++ DEGENERATE_HASH_ID, ++ LAST_HASH_ID ++} reiser4_hash_id; ++ ++/* builtin cipher plugins */ ++ ++typedef enum { ++ NONE_CIPHER_ID, ++ AES_CIPHER_ID, ++ LAST_CIPHER_ID ++} reiser4_cipher_id; ++ ++/* builtin digest plugins */ ++ ++typedef enum { ++ SHA256_32_DIGEST_ID, ++ LAST_DIGEST_ID ++} reiser4_digest_id; ++ ++/* builtin compression mode plugins */ ++typedef enum { ++ NONE_COMPRESSION_MODE_ID, ++ COL_8_COMPRESSION_MODE_ID, ++ COL_16_COMPRESSION_MODE_ID, ++ COL_32_COMPRESSION_MODE_ID, ++ COZ_COMPRESSION_MODE_ID, ++ FORCE_COMPRESSION_MODE_ID, ++ TEST_COMPRESSION_MODE_ID, ++ LAST_COMPRESSION_MODE_ID ++} reiser4_compression_mode_id; ++ ++/* builtin cluster plugins */ ++typedef enum { ++ CLUSTER_64K_ID, ++ CLUSTER_32K_ID, ++ CLUSTER_16K_ID, ++ CLUSTER_8K_ID, ++ CLUSTER_4K_ID, ++ LAST_CLUSTER_ID ++} reiser4_cluster_id; ++ ++/* builtin regular plugins */ ++typedef enum { ++ UF_REGULAR_ID, ++ CRC_REGULAR_ID, ++ LAST_REGULAR_ID ++} reiser4_regular_id; ++ ++/* builtin tail-plugins */ ++ ++typedef enum { ++ NEVER_TAILS_FORMATTING_ID, ++ ALWAYS_TAILS_FORMATTING_ID, ++ SMALL_FILE_FORMATTING_ID, ++ LAST_TAIL_FORMATTING_ID ++} reiser4_formatting_id; ++ ++/* compression/clustering specific data */ ++typedef struct compression_data { ++ reiser4_compression_id coa; /* id of the compression algorithm */ ++} compression_data_t; ++ ++typedef __u8 cluster_data_t; /* cluster info */ ++ ++/* data type used to pack parameters that we pass to vfs object creation ++ function create_object() */ ++struct reiser4_object_create_data { ++ /* plugin to control created object */ ++ reiser4_file_id id; ++ /* mode of regular file, directory or special file */ ++/* what happens if some other sort of perm plugin is in use? */ ++ int mode; ++ /* rdev of special file */ ++ dev_t rdev; ++ /* symlink target */ ++ const char *name; ++ /* add here something for non-standard objects you invent, like ++ query for interpolation file etc. */ ++ ++ crypto_stat_t * crypto; ++ compression_data_t *compression; ++ cluster_data_t *cluster; ++ ++ struct inode *parent; ++ struct dentry *dentry; ++}; ++ ++/* description of directory entry being created/destroyed/sought for ++ ++ It is passed down to the directory plugin and farther to the ++ directory item plugin methods. Creation of new directory is done in ++ several stages: first we search for an entry with the same name, then ++ create new one. reiser4_dir_entry_desc is used to store some information ++ collected at some stage of this process and required later: key of ++ item that we want to insert/delete and pointer to an object that will ++ be bound by the new directory entry. Probably some more fields will ++ be added there. ++ ++*/ ++struct reiser4_dir_entry_desc { ++ /* key of directory entry */ ++ reiser4_key key; ++ /* object bound by this entry. 
*/ ++ struct inode *obj; ++}; ++ ++#define MAX_PLUGIN_TYPE_LABEL_LEN 32 ++#define MAX_PLUGIN_PLUG_LABEL_LEN 32 ++ ++/* used for interface with user-land: table-driven parsing in ++ reiser4(). */ ++typedef struct plugin_locator { ++ reiser4_plugin_type type_id; ++ reiser4_plugin_id id; ++ char type_label[MAX_PLUGIN_TYPE_LABEL_LEN]; ++ char plug_label[MAX_PLUGIN_PLUG_LABEL_LEN]; ++} plugin_locator; ++ ++extern int locate_plugin(struct inode *inode, plugin_locator * loc); ++ ++ ++#define PLUGIN_BY_ID(TYPE,ID,FIELD) \ ++static inline TYPE *TYPE ## _by_id( reiser4_plugin_id id ) \ ++{ \ ++ reiser4_plugin *plugin = plugin_by_id ( ID, id ); \ ++ return plugin ? & plugin -> FIELD : NULL; \ ++} \ ++static inline TYPE *TYPE ## _by_disk_id( reiser4_tree *tree, d16 *id ) \ ++{ \ ++ reiser4_plugin *plugin = plugin_by_disk_id ( tree, ID, id ); \ ++ return plugin ? & plugin -> FIELD : NULL; \ ++} \ ++static inline TYPE *TYPE ## _by_unsafe_id( reiser4_plugin_id id ) \ ++{ \ ++ reiser4_plugin *plugin = plugin_by_unsafe_id ( ID, id ); \ ++ return plugin ? & plugin -> FIELD : NULL; \ ++} \ ++static inline reiser4_plugin* TYPE ## _to_plugin( TYPE* plugin ) \ ++{ \ ++ return ( reiser4_plugin * ) plugin; \ ++} \ ++static inline reiser4_plugin_id TYPE ## _id( TYPE* plugin ) \ ++{ \ ++ return TYPE ## _to_plugin (plugin) -> h.id; \ ++} \ ++typedef struct { int foo; } TYPE ## _plugin_dummy ++ ++PLUGIN_BY_ID(item_plugin, REISER4_ITEM_PLUGIN_TYPE, item); ++PLUGIN_BY_ID(file_plugin, REISER4_FILE_PLUGIN_TYPE, file); ++PLUGIN_BY_ID(dir_plugin, REISER4_DIR_PLUGIN_TYPE, dir); ++PLUGIN_BY_ID(node_plugin, REISER4_NODE_PLUGIN_TYPE, node); ++PLUGIN_BY_ID(sd_ext_plugin, REISER4_SD_EXT_PLUGIN_TYPE, sd_ext); ++PLUGIN_BY_ID(perm_plugin, REISER4_PERM_PLUGIN_TYPE, perm); ++PLUGIN_BY_ID(hash_plugin, REISER4_HASH_PLUGIN_TYPE, hash); ++PLUGIN_BY_ID(fibration_plugin, REISER4_FIBRATION_PLUGIN_TYPE, fibration); ++PLUGIN_BY_ID(cipher_plugin, REISER4_CIPHER_PLUGIN_TYPE, cipher); ++PLUGIN_BY_ID(digest_plugin, REISER4_DIGEST_PLUGIN_TYPE, digest); ++PLUGIN_BY_ID(compression_plugin, REISER4_COMPRESSION_PLUGIN_TYPE, compression); ++PLUGIN_BY_ID(formatting_plugin, REISER4_FORMATTING_PLUGIN_TYPE, formatting); ++PLUGIN_BY_ID(disk_format_plugin, REISER4_FORMAT_PLUGIN_TYPE, format); ++PLUGIN_BY_ID(jnode_plugin, REISER4_JNODE_PLUGIN_TYPE, jnode); ++PLUGIN_BY_ID(compression_mode_plugin, REISER4_COMPRESSION_MODE_PLUGIN_TYPE, ++ compression_mode); ++PLUGIN_BY_ID(cluster_plugin, REISER4_CLUSTER_PLUGIN_TYPE, clust); ++PLUGIN_BY_ID(regular_plugin, REISER4_REGULAR_PLUGIN_TYPE, regular); ++ ++extern int save_plugin_id(reiser4_plugin * plugin, d16 * area); ++ ++extern struct list_head *get_plugin_list(reiser4_plugin_type type_id); ++ ++#define for_all_plugins(ptype, plugin) \ ++for (plugin = list_entry(get_plugin_list(ptype)->next, reiser4_plugin, h.linkage); \ ++ get_plugin_list(ptype) != &plugin->h.linkage; \ ++ plugin = list_entry(plugin->h.linkage.next, reiser4_plugin, h.linkage)) ++ ++ ++/* enumeration of fields within plugin_set */ ++typedef enum { ++ PSET_FILE, ++ PSET_DIR, /* PSET_FILE and PSET_DIR should be first elements: ++ * inode.c:read_inode() depends on this. 
*/ ++ PSET_PERM, ++ PSET_FORMATTING, ++ PSET_HASH, ++ PSET_FIBRATION, ++ PSET_SD, ++ PSET_DIR_ITEM, ++ PSET_CIPHER, ++ PSET_DIGEST, ++ PSET_COMPRESSION, ++ PSET_COMPRESSION_MODE, ++ PSET_CLUSTER, ++ PSET_REGULAR_ENTRY, ++ PSET_LAST ++} pset_member; ++ ++int grab_plugin(struct inode *self, struct inode *ancestor, pset_member memb); ++int grab_plugin_from(struct inode *self, pset_member memb, ++ reiser4_plugin * plug); ++int force_plugin(struct inode *self, pset_member memb, reiser4_plugin * plug); ++ ++/* defined in fs/reiser4/plugin/object.c */ ++extern file_plugin file_plugins[LAST_FILE_PLUGIN_ID]; ++/* defined in fs/reiser4/plugin/object.c */ ++extern dir_plugin dir_plugins[LAST_DIR_ID]; ++/* defined in fs/reiser4/plugin/item/static_stat.c */ ++extern sd_ext_plugin sd_ext_plugins[LAST_SD_EXTENSION]; ++/* defined in fs/reiser4/plugin/hash.c */ ++extern hash_plugin hash_plugins[LAST_HASH_ID]; ++/* defined in fs/reiser4/plugin/fibration.c */ ++extern fibration_plugin fibration_plugins[LAST_FIBRATION_ID]; ++/* defined in fs/reiser4/plugin/crypt.c */ ++extern cipher_plugin cipher_plugins[LAST_CIPHER_ID]; ++/* defined in fs/reiser4/plugin/digest.c */ ++extern digest_plugin digest_plugins[LAST_DIGEST_ID]; ++/* defined in fs/reiser4/plugin/compress/compress.c */ ++extern compression_plugin compression_plugins[LAST_COMPRESSION_ID]; ++/* defined in fs/reiser4/plugin/compress/compression_mode.c */ ++extern compression_mode_plugin ++compression_mode_plugins[LAST_COMPRESSION_MODE_ID]; ++/* defined in fs/reiser4/plugin/cluster.c */ ++extern cluster_plugin cluster_plugins[LAST_CLUSTER_ID]; ++/* defined in fs/reiser4/plugin/regular.c */ ++extern regular_plugin regular_plugins[LAST_REGULAR_ID]; ++/* defined in fs/reiser4/plugin/tail.c */ ++extern formatting_plugin formatting_plugins[LAST_TAIL_FORMATTING_ID]; ++/* defined in fs/reiser4/plugin/security/security.c */ ++extern perm_plugin perm_plugins[LAST_PERM_ID]; ++/* defined in fs/reiser4/plugin/item/item.c */ ++extern item_plugin item_plugins[LAST_ITEM_ID]; ++/* defined in fs/reiser4/plugin/node/node.c */ ++extern node_plugin node_plugins[LAST_NODE_ID]; ++/* defined in fs/reiser4/plugin/disk_format/disk_format.c */ ++extern disk_format_plugin format_plugins[LAST_FORMAT_ID]; ++ ++/* __FS_REISER4_PLUGIN_TYPES_H__ */ ++#endif ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +Index: linux-2.6.16/fs/reiser4/plugin/plugin_header.h +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/plugin/plugin_header.h +@@ -0,0 +1,136 @@ ++/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++/* plugin header. Data structures required by all plugin types. 
*/ ++ ++#if !defined( __PLUGIN_HEADER_H__ ) ++#define __PLUGIN_HEADER_H__ ++ ++/* plugin data-types and constants */ ++ ++#include "../debug.h" ++#include "../dformat.h" ++ ++typedef enum { ++ REISER4_FILE_PLUGIN_TYPE, ++ REISER4_DIR_PLUGIN_TYPE, ++ REISER4_ITEM_PLUGIN_TYPE, ++ REISER4_NODE_PLUGIN_TYPE, ++ REISER4_HASH_PLUGIN_TYPE, ++ REISER4_FIBRATION_PLUGIN_TYPE, ++ REISER4_FORMATTING_PLUGIN_TYPE, ++ REISER4_PERM_PLUGIN_TYPE, ++ REISER4_SD_EXT_PLUGIN_TYPE, ++ REISER4_FORMAT_PLUGIN_TYPE, ++ REISER4_JNODE_PLUGIN_TYPE, ++ REISER4_CIPHER_PLUGIN_TYPE, ++ REISER4_DIGEST_PLUGIN_TYPE, ++ REISER4_COMPRESSION_PLUGIN_TYPE, ++ REISER4_COMPRESSION_MODE_PLUGIN_TYPE, ++ REISER4_CLUSTER_PLUGIN_TYPE, ++ REISER4_REGULAR_PLUGIN_TYPE, ++ REISER4_PLUGIN_TYPES ++} reiser4_plugin_type; ++ ++struct reiser4_plugin_ops; ++/* generic plugin operations, supported by each ++ plugin type. */ ++typedef struct reiser4_plugin_ops reiser4_plugin_ops; ++ ++/* the common part of all plugin instances. */ ++typedef struct plugin_header { ++ /* plugin type */ ++ reiser4_plugin_type type_id; ++ /* id of this plugin */ ++ reiser4_plugin_id id; ++ /* plugin operations */ ++ reiser4_plugin_ops *pops; ++/* NIKITA-FIXME-HANS: usage of and access to label and desc is not commented and defined. */ ++ /* short label of this plugin */ ++ const char *label; ++ /* descriptive string.. */ ++ const char *desc; ++ /* list linkage */ ++ struct list_head linkage; ++} plugin_header; ++ ++/* PRIVATE INTERFACES */ ++/* NIKITA-FIXME-HANS: what is this for and why does it duplicate what is in plugin_header? */ ++/* plugin type representation. */ ++typedef struct reiser4_plugin_type_data { ++ /* internal plugin type identifier. Should coincide with ++ index of this item in plugins[] array. */ ++ reiser4_plugin_type type_id; ++ /* short symbolic label of this plugin type. Should be no longer ++ than MAX_PLUGIN_TYPE_LABEL_LEN characters including '\0'. */ ++ const char *label; ++ /* plugin type description longer than .label */ ++ const char *desc; ++ ++/* NIKITA-FIXME-HANS: define built-in */ ++ /* number of built-in plugin instances of this type */ ++ int builtin_num; ++ /* array of built-in plugins */ ++ void *builtin; ++ struct list_head plugins_list; ++ size_t size; ++} reiser4_plugin_type_data; ++ ++extern reiser4_plugin_type_data plugins[REISER4_PLUGIN_TYPES]; ++ ++int is_type_id_valid(reiser4_plugin_type type_id); ++int is_plugin_id_valid(reiser4_plugin_type type_id, reiser4_plugin_id id); ++ ++static inline reiser4_plugin *plugin_at(reiser4_plugin_type_data * ptype, int i) ++{ ++ char *builtin; ++ ++ builtin = ptype->builtin; ++ return (reiser4_plugin *) (builtin + i * ptype->size); ++} ++ ++/* return plugin by its @type_id and @id */ ++static inline reiser4_plugin *plugin_by_id(reiser4_plugin_type type_id ++ /* plugin type id */ , ++ reiser4_plugin_id id /* plugin id */ ++ ) ++{ ++ assert("nikita-1651", is_type_id_valid(type_id)); ++ assert("nikita-1652", is_plugin_id_valid(type_id, id)); ++ return plugin_at(&plugins[type_id], id); ++} ++ ++extern reiser4_plugin *plugin_by_unsafe_id(reiser4_plugin_type type_id, ++ reiser4_plugin_id id); ++ ++/** ++ * plugin_by_disk_id - get reiser4_plugin ++ * @type_id: plugin type id ++ * @did: plugin id in disk format ++ * ++ * Returns reiser4_plugin by plugin type id an dplugin_id. 
++ */
++static inline reiser4_plugin *plugin_by_disk_id(reiser4_tree * tree UNUSED_ARG,
++						reiser4_plugin_type type_id,
++						__le16 *plugin_id)
++{
++	/*
++	 * what we should do properly is to maintain within each file-system
++	 * a dictionary that maps on-disk plugin ids to "universal" ids. This
++	 * dictionary will be resolved at mount time, so that this function
++	 * will perform just one additional array lookup.
++	 */
++	return plugin_by_unsafe_id(type_id, le16_to_cpu(*plugin_id));
++}
++
++/* __PLUGIN_HEADER_H__ */
++#endif
++
++/*
++ * Local variables:
++ * c-indentation-style: "K&R"
++ * mode-name: "LC"
++ * c-basic-offset: 8
++ * tab-width: 8
++ * fill-column: 79
++ * End:
++ */
+Index: linux-2.6.16/fs/reiser4/plugin/plugin_set.c
+===================================================================
+--- /dev/null
++++ linux-2.6.16/fs/reiser4/plugin/plugin_set.c
+@@ -0,0 +1,378 @@
++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
++ * reiser4/README */
++/* NIKITA-FIXME-HANS: you didn't discuss this with me before coding it, did you? Remove plugin-sets from code by March 15th, 2004 */
++/* plugin-sets */
++
++/*
++ * Each inode comes with a whole set of plugins: file plugin, directory
++ * plugin, hash plugin, tail policy plugin, security plugin, etc.
++ *
++ * Storing them (pointers to them, that is) in the inode is a waste of
++ * space. Especially, given that on an average file system the plugins of
++ * the vast majority of files will belong to a few sets (e.g., one set for
++ * regular files, another set for standard directories, etc.)
++ *
++ * A plugin set (pset) is an object containing pointers to all plugins
++ * required by an inode. The inode only stores a pointer to the pset. psets
++ * are "interned", that is, different inodes with the same set of plugins
++ * point to the same pset. This is achieved by storing psets in a global
++ * hash table. Races are avoided by the simple (and efficient so far)
++ * solution of never recycling psets, even when the last inode pointing to
++ * them is destroyed.
++ *
++ */
++
++#include "../debug.h"
++#include "../super.h"
++#include "plugin_set.h"
++
++#include
++#include
++
++/* slab for plugin sets */
++static kmem_cache_t *plugin_set_slab;
++
++static spinlock_t plugin_set_lock[8] __cacheline_aligned_in_smp = {
++	[0 ...
7] = SPIN_LOCK_UNLOCKED ++}; ++ ++/* hash table support */ ++ ++#define PS_TABLE_SIZE (32) ++ ++static inline plugin_set *cast_to(const unsigned long *a) ++{ ++ return container_of(a, plugin_set, hashval); ++} ++ ++static inline int pseq(const unsigned long *a1, const unsigned long *a2) ++{ ++ plugin_set *set1; ++ plugin_set *set2; ++ ++ /* make sure fields are not missed in the code below */ ++ cassert(sizeof *set1 == ++ sizeof set1->hashval + ++ sizeof set1->link + ++ sizeof set1->file + ++ sizeof set1->dir + ++ sizeof set1->perm + ++ sizeof set1->formatting + ++ sizeof set1->hash + ++ sizeof set1->fibration + ++ sizeof set1->sd + ++ sizeof set1->dir_item + ++ sizeof set1->cipher + ++ sizeof set1->digest + ++ sizeof set1->compression + ++ sizeof set1->compression_mode + ++ sizeof set1->cluster + sizeof set1->regular_entry); ++ ++ set1 = cast_to(a1); ++ set2 = cast_to(a2); ++ return ++ set1->hashval == set2->hashval && ++ set1->file == set2->file && ++ set1->dir == set2->dir && ++ set1->perm == set2->perm && ++ set1->formatting == set2->formatting && ++ set1->hash == set2->hash && ++ set1->fibration == set2->fibration && ++ set1->sd == set2->sd && ++ set1->dir_item == set2->dir_item && ++ set1->cipher == set2->cipher && ++ set1->digest == set2->digest && ++ set1->compression == set2->compression && ++ set1->compression_mode == set2->compression_mode && ++ set1->cluster == set2->cluster && ++ set1->regular_entry == set2->regular_entry; ++} ++ ++#define HASH_FIELD(hash, set, field) \ ++({ \ ++ (hash) += (unsigned long)(set)->field >> 2; \ ++}) ++ ++static inline unsigned long calculate_hash(const plugin_set * set) ++{ ++ unsigned long result; ++ ++ result = 0; ++ HASH_FIELD(result, set, file); ++ HASH_FIELD(result, set, dir); ++ HASH_FIELD(result, set, perm); ++ HASH_FIELD(result, set, formatting); ++ HASH_FIELD(result, set, hash); ++ HASH_FIELD(result, set, fibration); ++ HASH_FIELD(result, set, sd); ++ HASH_FIELD(result, set, dir_item); ++ HASH_FIELD(result, set, cipher); ++ HASH_FIELD(result, set, digest); ++ HASH_FIELD(result, set, compression); ++ HASH_FIELD(result, set, compression_mode); ++ HASH_FIELD(result, set, cluster); ++ HASH_FIELD(result, set, regular_entry); ++ return result & (PS_TABLE_SIZE - 1); ++} ++ ++static inline unsigned long ++pshash(ps_hash_table * table, const unsigned long *a) ++{ ++ return *a; ++} ++ ++/* The hash table definition */ ++#define KMALLOC(size) kmalloc((size), get_gfp_mask()) ++#define KFREE(ptr, size) kfree(ptr) ++TYPE_SAFE_HASH_DEFINE(ps, plugin_set, unsigned long, hashval, link, pshash, ++ pseq); ++#undef KFREE ++#undef KMALLOC ++ ++static ps_hash_table ps_table; ++static plugin_set empty_set = { ++ .hashval = 0, ++ .file = NULL, ++ .dir = NULL, ++ .perm = NULL, ++ .formatting = NULL, ++ .hash = NULL, ++ .fibration = NULL, ++ .sd = NULL, ++ .dir_item = NULL, ++ .cipher = NULL, ++ .digest = NULL, ++ .compression = NULL, ++ .compression_mode = NULL, ++ .cluster = NULL, ++ .regular_entry = NULL, ++ .link = {NULL} ++}; ++ ++plugin_set *plugin_set_get_empty(void) ++{ ++ return &empty_set; ++} ++ ++void plugin_set_put(plugin_set * set) ++{ ++} ++ ++static inline unsigned long *pset_field(plugin_set * set, int offset) ++{ ++ return (unsigned long *)(((char *)set) + offset); ++} ++ ++static int plugin_set_field(plugin_set ** set, const unsigned long val, ++ const int offset) ++{ ++ unsigned long *spot; ++ spinlock_t *lock; ++ plugin_set replica; ++ plugin_set *twin; ++ plugin_set *psal; ++ plugin_set *orig; ++ ++ assert("nikita-2902", set != NULL); ++ 
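++	/* Editorial summary, not in the original patch: the code below
++	 * copies *set, patches one member, rehashes the copy, and then
++	 * either reuses an interned twin found in ps_table or inserts the
++	 * freshly allocated copy under a per-bucket spinlock; since psets
++	 * are never freed, the rcu_read_lock()ed lookup stays safe. */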
assert("nikita-2904", *set != NULL); ++ ++ spot = pset_field(*set, offset); ++ if (unlikely(*spot == val)) ++ return 0; ++ ++ replica = *(orig = *set); ++ *pset_field(&replica, offset) = val; ++ replica.hashval = calculate_hash(&replica); ++ rcu_read_lock(); ++ twin = ps_hash_find(&ps_table, &replica.hashval); ++ if (unlikely(twin == NULL)) { ++ rcu_read_unlock(); ++ psal = kmem_cache_alloc(plugin_set_slab, get_gfp_mask()); ++ if (psal == NULL) ++ return RETERR(-ENOMEM); ++ *psal = replica; ++ lock = &plugin_set_lock[replica.hashval & 7]; ++ spin_lock(lock); ++ twin = ps_hash_find(&ps_table, &replica.hashval); ++ if (likely(twin == NULL)) { ++ *set = psal; ++ ps_hash_insert_rcu(&ps_table, psal); ++ } else { ++ *set = twin; ++ kmem_cache_free(plugin_set_slab, psal); ++ } ++ spin_unlock(lock); ++ } else { ++ rcu_read_unlock(); ++ *set = twin; ++ } ++ return 0; ++} ++ ++static struct { ++ int offset; ++ reiser4_plugin_type type; ++} pset_descr[PSET_LAST] = { ++ [PSET_FILE] = { ++ .offset = offsetof(plugin_set, file), ++ .type = REISER4_FILE_PLUGIN_TYPE ++ }, ++ [PSET_DIR] = { ++ .offset = offsetof(plugin_set, dir), ++ .type = REISER4_DIR_PLUGIN_TYPE ++ }, ++ [PSET_PERM] = { ++ .offset = offsetof(plugin_set, perm), ++ .type = REISER4_PERM_PLUGIN_TYPE ++ }, ++ [PSET_FORMATTING] = { ++ .offset = offsetof(plugin_set, formatting), ++ .type = REISER4_FORMATTING_PLUGIN_TYPE ++ }, ++ [PSET_HASH] = { ++ .offset = offsetof(plugin_set, hash), ++ .type = REISER4_HASH_PLUGIN_TYPE ++ }, ++ [PSET_FIBRATION] = { ++ .offset = offsetof(plugin_set, fibration), ++ .type = REISER4_FIBRATION_PLUGIN_TYPE ++ }, ++ [PSET_SD] = { ++ .offset = offsetof(plugin_set, sd), ++ .type = REISER4_ITEM_PLUGIN_TYPE ++ }, ++ [PSET_DIR_ITEM] = { ++ .offset = offsetof(plugin_set, dir_item), ++ .type = REISER4_ITEM_PLUGIN_TYPE ++ }, ++ [PSET_CIPHER] = { ++ .offset = offsetof(plugin_set, cipher), ++ .type = REISER4_CIPHER_PLUGIN_TYPE ++ }, ++ [PSET_DIGEST] = { ++ .offset = offsetof(plugin_set, digest), ++ .type = REISER4_DIGEST_PLUGIN_TYPE ++ }, ++ [PSET_COMPRESSION] = { ++ .offset = offsetof(plugin_set, compression), ++ .type = REISER4_COMPRESSION_PLUGIN_TYPE ++ }, ++ [PSET_COMPRESSION_MODE] = { ++ .offset = offsetof(plugin_set, compression_mode), ++ .type = REISER4_COMPRESSION_MODE_PLUGIN_TYPE ++ }, ++ [PSET_CLUSTER] = { ++ .offset = offsetof(plugin_set, cluster), ++ .type = REISER4_CLUSTER_PLUGIN_TYPE ++ }, ++ [PSET_REGULAR_ENTRY] = { ++ .offset = offsetof(plugin_set, regular_entry), ++ .type = REISER4_REGULAR_PLUGIN_TYPE ++ } ++}; ++ ++#if REISER4_DEBUG ++static reiser4_plugin_type pset_member_to_type(pset_member memb) ++{ ++ assert("nikita-3501", 0 <= memb && memb < PSET_LAST); ++ return pset_descr[memb].type; ++} ++#endif ++ ++reiser4_plugin_type pset_member_to_type_unsafe(pset_member memb) ++{ ++ if (0 <= memb && memb < PSET_LAST) ++ return pset_descr[memb].type; ++ else ++ return REISER4_PLUGIN_TYPES; ++} ++ ++int pset_set(plugin_set ** set, pset_member memb, reiser4_plugin * plugin) ++{ ++ assert("nikita-3492", set != NULL); ++ assert("nikita-3493", *set != NULL); ++ assert("nikita-3494", plugin != NULL); ++ assert("nikita-3495", 0 <= memb && memb < PSET_LAST); ++ assert("nikita-3496", plugin->h.type_id == pset_member_to_type(memb)); ++ ++ return plugin_set_field(set, ++ (unsigned long)plugin, pset_descr[memb].offset); ++} ++ ++reiser4_plugin *pset_get(plugin_set * set, pset_member memb) ++{ ++ assert("nikita-3497", set != NULL); ++ assert("nikita-3498", 0 <= memb && memb < PSET_LAST); ++ ++ return *(reiser4_plugin **) 
(((char *)set) + pset_descr[memb].offset); ++} ++ ++#define DEFINE_PLUGIN_SET(type, field) \ ++int plugin_set_ ## field(plugin_set **set, type *val) \ ++{ \ ++ cassert(sizeof val == sizeof(unsigned long)); \ ++ return plugin_set_field(set, (unsigned long)val, \ ++ offsetof(plugin_set, field)); \ ++} ++ ++DEFINE_PLUGIN_SET(file_plugin, file) ++ DEFINE_PLUGIN_SET(dir_plugin, dir) ++ DEFINE_PLUGIN_SET(formatting_plugin, formatting) ++ DEFINE_PLUGIN_SET(hash_plugin, hash) ++ DEFINE_PLUGIN_SET(fibration_plugin, fibration) ++ DEFINE_PLUGIN_SET(item_plugin, sd) ++ /* DEFINE_PLUGIN_SET(cipher_plugin, cipher) */ ++ /* DEFINE_PLUGIN_SET(digest_plugin, digest) */ ++ DEFINE_PLUGIN_SET(compression_plugin, compression) ++ /* DEFINE_PLUGIN_SET(compression_mode_plugin, compression_mode) */ ++ DEFINE_PLUGIN_SET(cluster_plugin, cluster) ++ /* DEFINE_PLUGIN_SET(regular_plugin, regular_entry) */ ++ ++ ++/** ++ * init_plugin_set - create pset cache and hash table ++ * ++ * Initializes slab cache of plugin_set-s and their hash table. It is part of ++ * reiser4 module initialization. ++ */ ++int init_plugin_set(void) ++{ ++ int result; ++ ++ result = ps_hash_init(&ps_table, PS_TABLE_SIZE); ++ if (result == 0) { ++ plugin_set_slab = kmem_cache_create("plugin_set", ++ sizeof(plugin_set), 0, ++ SLAB_HWCACHE_ALIGN, ++ NULL, NULL); ++ if (plugin_set_slab == NULL) ++ result = RETERR(-ENOMEM); ++ } ++ return result; ++} ++ ++/** ++ * done_plugin_set - delete plugin_set cache and plugin_set hash table ++ * ++ * This is called on reiser4 module unloading or system shutdown. ++ */ ++void done_plugin_set(void) ++{ ++ plugin_set *cur, *next; ++ ++ for_all_in_htable(&ps_table, ps, cur, next) { ++ ps_hash_remove(&ps_table, cur); ++ kmem_cache_free(plugin_set_slab, cur); ++ } ++ destroy_reiser4_cache(&plugin_set_slab); ++ ps_hash_done(&ps_table); ++} ++ ++/* ++ * Local variables: ++ * c-indentation-style: "K&R" ++ * mode-name: "LC" ++ * c-basic-offset: 8 ++ * tab-width: 8 ++ * fill-column: 120 ++ * End: ++ */ +Index: linux-2.6.16/fs/reiser4/plugin/plugin_set.h +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/plugin/plugin_set.h +@@ -0,0 +1,83 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++/* plugin-sets. see fs/reiser4/plugin/plugin_set.c for details */ ++ ++#if !defined( __PLUGIN_SET_H__ ) ++#define __PLUGIN_SET_H__ ++ ++#include "../type_safe_hash.h" ++#include "plugin.h" ++ ++#include ++ ++struct plugin_set; ++typedef struct plugin_set plugin_set; ++ ++TYPE_SAFE_HASH_DECLARE(ps, plugin_set); ++ ++struct plugin_set { ++ unsigned long hashval; ++ /* plugin of file */ ++ file_plugin *file; ++ /* plugin of dir */ ++ dir_plugin *dir; ++ /* perm plugin for this file */ ++ perm_plugin *perm; ++ /* tail policy plugin. Only meaningful for regular files */ ++ formatting_plugin *formatting; ++ /* hash plugin. Only meaningful for directories. */ ++ hash_plugin *hash; ++ /* fibration plugin. Only meaningful for directories. 
*/ ++ fibration_plugin *fibration; ++ /* plugin of stat-data */ ++ item_plugin *sd; ++ /* plugin of items a directory is built of */ ++ item_plugin *dir_item; ++ /* cipher plugin */ ++ cipher_plugin *cipher; ++ /* digest plugin */ ++ digest_plugin *digest; ++ /* compression plugin */ ++ compression_plugin *compression; ++ /* compression mode plugin */ ++ compression_mode_plugin *compression_mode; ++ /* cluster plugin */ ++ cluster_plugin *cluster; ++ /* plugin of regular child should be created */ ++ regular_plugin *regular_entry; ++ ps_hash_link link; ++}; ++ ++extern plugin_set *plugin_set_get_empty(void); ++extern void plugin_set_put(plugin_set * set); ++ ++extern int plugin_set_file(plugin_set ** set, file_plugin * plug); ++extern int plugin_set_dir(plugin_set ** set, dir_plugin * plug); ++extern int plugin_set_formatting(plugin_set ** set, formatting_plugin * plug); ++extern int plugin_set_hash(plugin_set ** set, hash_plugin * plug); ++extern int plugin_set_fibration(plugin_set ** set, fibration_plugin * plug); ++extern int plugin_set_sd(plugin_set ** set, item_plugin * plug); ++extern int plugin_set_compression(plugin_set ** set, compression_plugin * plug); ++extern int plugin_set_cluster(plugin_set ** set, cluster_plugin * plug); ++ ++extern int init_plugin_set(void); ++extern void done_plugin_set(void); ++ ++extern int pset_set(plugin_set ** set, pset_member memb, ++ reiser4_plugin * plugin); ++extern reiser4_plugin *pset_get(plugin_set * set, pset_member memb); ++ ++extern reiser4_plugin_type pset_member_to_type_unsafe(pset_member memb); ++ ++/* __PLUGIN_SET_H__ */ ++#endif ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +Index: linux-2.6.16/fs/reiser4/plugin/regular.c +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/plugin/regular.c +@@ -0,0 +1,44 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++/* Contains Reiser4 regular plugins which: ++ . specify a set of reiser4 regular object plugins, ++ . 
used by directory plugin to create entries powered by specified ++ regular plugins */ ++ ++#include "plugin.h" ++ ++regular_plugin regular_plugins[LAST_REGULAR_ID] = { ++ [UF_REGULAR_ID] = { ++ .h = { ++ .type_id = REISER4_REGULAR_PLUGIN_TYPE, ++ .id = UF_REGULAR_ID, ++ .pops = NULL, ++ .label = "unixfile", ++ .desc = "Unix file regular plugin", ++ .linkage = {NULL, NULL} ++ }, ++ .id = UNIX_FILE_PLUGIN_ID ++ }, ++ [CRC_REGULAR_ID] = { ++ .h = { ++ .type_id = REISER4_REGULAR_PLUGIN_TYPE, ++ .id = CRC_REGULAR_ID, ++ .pops = NULL, ++ .label = "cryptcompress", ++ .desc = "Cryptcompress regular plugin", ++ .linkage = {NULL, NULL} ++ }, ++ .id = CRC_FILE_PLUGIN_ID ++ } ++}; ++ ++/* ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ scroll-step: 1 ++ End: ++*/ +Index: linux-2.6.16/fs/reiser4/plugin/security/Makefile +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/plugin/security/Makefile +@@ -0,0 +1,4 @@ ++obj-$(CONFIG_REISER4_FS) += security_plugins.o ++ ++security_plugins-objs := \ ++ perm.o +Index: linux-2.6.16/fs/reiser4/plugin/security/perm.c +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/plugin/security/perm.c +@@ -0,0 +1,44 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++/* ++ * this file contains implementation of permission plugins. Currently, only ++ * RWX_PERM_ID is implemented ++ */ ++ ++#include "../plugin.h" ++#include "../plugin_header.h" ++#include "../../debug.h" ++ ++perm_plugin perm_plugins[LAST_PERM_ID] = { ++ [NULL_PERM_ID] = { ++ .h = { ++ .type_id = REISER4_PERM_PLUGIN_TYPE, ++ .id = NULL_PERM_ID, ++ .pops = NULL, ++ .label = "null", ++ .desc = "stub permission plugin", ++ .linkage = {NULL, NULL} ++ }, ++ .read_ok = NULL, ++ .write_ok = NULL, ++ .lookup_ok = NULL, ++ .create_ok = NULL, ++ .link_ok = NULL, ++ .unlink_ok = NULL, ++ .delete_ok = NULL, ++ .mask_ok = NULL, ++ .setattr_ok = NULL, ++ .getattr_ok = NULL, ++ .rename_ok = NULL, ++ } ++}; ++ ++/* ++ * Local variables: ++ * c-indentation-style: "K&R" ++ * mode-name: "LC" ++ * c-basic-offset: 8 ++ * tab-width: 8 ++ * fill-column: 79 ++ * End: ++ */ +Index: linux-2.6.16/fs/reiser4/plugin/security/perm.h +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/plugin/security/perm.h +@@ -0,0 +1,82 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++/* Perm (short for "permissions") plugins common stuff. */ ++ ++#if !defined( __REISER4_PERM_H__ ) ++#define __REISER4_PERM_H__ ++ ++#include "../../forward.h" ++#include "../plugin_header.h" ++ ++#include ++#include /* for struct file */ ++#include /* for struct dentry */ ++ ++/* interface for perm plugin. ++ ++ Perm plugin method can be implemented through: ++ ++ 1. consulting ->i_mode bits in stat data ++ ++ 2. obtaining acl from the tree and inspecting it ++ ++ 3. asking some kernel module or user-level program to authorize access. ++ ++ This allows for integration with things like capabilities, SELinux-style ++ secutiry contexts, etc. ++ ++*/ ++/* NIKITA-FIXME-HANS: define what this is targeted for. It does not seem to be intended for use with sys_reiser4. Explain. 
*/ ++typedef struct perm_plugin { ++ /* generic plugin fields */ ++ plugin_header h; ++ ++ /* check permissions for read/write */ ++ int (*read_ok) (struct file *file, const char __user *buf, ++ size_t size, loff_t *off); ++ int (*write_ok) (struct file *file, const char __user *buf, ++ size_t size, loff_t *off); ++ ++ /* check permissions for lookup */ ++ int (*lookup_ok) (struct inode * parent, struct dentry * dentry); ++ ++ /* check permissions for create */ ++ int (*create_ok) (struct inode * parent, struct dentry * dentry, ++ reiser4_object_create_data * data); ++ ++ /* check permissions for linking @where to @existing */ ++ int (*link_ok) (struct dentry * existing, struct inode * parent, ++ struct dentry * where); ++ ++ /* check permissions for unlinking @victim from @parent */ ++ int (*unlink_ok) (struct inode * parent, struct dentry * victim); ++ ++ /* check permissions for deletion of @object whose last reference is ++ by @parent */ ++ int (*delete_ok) (struct inode * parent, struct dentry * victim); ++ int (*mask_ok) (struct inode * inode, int mask); ++ /* check whether attribute change is acceptable */ ++ int (*setattr_ok) (struct dentry * dentry, struct iattr * attr); ++ ++ /* check whether stat(2) is allowed */ ++ int (*getattr_ok) (struct vfsmount * mnt UNUSED_ARG, ++ struct dentry * dentry, struct kstat * stat); ++ /* check whether rename(2) is allowed */ ++ int (*rename_ok) (struct inode * old_dir, struct dentry * old, ++ struct inode * new_dir, struct dentry * new); ++} perm_plugin; ++ ++typedef enum { NULL_PERM_ID, LAST_PERM_ID } reiser4_perm_id; ++ ++/* __REISER4_PERM_H__ */ ++#endif ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +Index: linux-2.6.16/fs/reiser4/plugin/space/Makefile +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/plugin/space/Makefile +@@ -0,0 +1,4 @@ ++obj-$(CONFIG_REISER4_FS) += space_plugins.o ++ ++space_plugins-objs := \ ++ bitmap.o +Index: linux-2.6.16/fs/reiser4/plugin/space/bitmap.c +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/plugin/space/bitmap.c +@@ -0,0 +1,1592 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++#include "../../debug.h" ++#include "../../dformat.h" ++#include "../../txnmgr.h" ++#include "../../jnode.h" ++#include "../../block_alloc.h" ++#include "../../tree.h" ++#include "../../super.h" ++#include "../plugin.h" ++#include "space_allocator.h" ++#include "bitmap.h" ++ ++#include ++#include /* for struct super_block */ ++#include ++#include ++#include ++ ++/* Proposed (but discarded) optimization: dynamic loading/unloading of bitmap ++ * blocks ++ ++ A useful optimization of reiser4 bitmap handling would be dynamic bitmap ++ blocks loading/unloading which is different from v3.x where all bitmap ++ blocks are loaded at mount time. ++ ++ To implement bitmap blocks unloading we need to count bitmap block usage ++ and detect currently unused blocks allowing them to be unloaded. It is not ++ a simple task since we allow several threads to modify one bitmap block ++ simultaneously. ++ ++ Briefly speaking, the following schema is proposed: we count in special ++ variable associated with each bitmap block. That is for counting of block ++ alloc/dealloc operations on that bitmap block. 
With a deferred block ++ deallocation feature of reiser4 all those operation will be represented in ++ atom dirty/deleted lists as jnodes for freshly allocated or deleted ++ nodes. ++ ++ So, we increment usage counter for each new node allocated or deleted, and ++ decrement it at atom commit one time for each node from the dirty/deleted ++ atom's list. Of course, freshly allocated node deletion and node reusing ++ from atom deleted (if we do so) list should decrement bitmap usage counter ++ also. ++ ++ This schema seems to be working but that reference counting is ++ not easy to debug. I think we should agree with Hans and do not implement ++ it in v4.0. Current code implements "on-demand" bitmap blocks loading only. ++ ++ For simplicity all bitmap nodes (both commit and working bitmap blocks) are ++ loaded into memory on fs mount time or each bitmap nodes are loaded at the ++ first access to it, the "dont_load_bitmap" mount option controls whether ++ bimtap nodes should be loaded at mount time. Dynamic unloading of bitmap ++ nodes currently is not supported. */ ++ ++#define CHECKSUM_SIZE 4 ++ ++#define BYTES_PER_LONG (sizeof(long)) ++ ++#if BITS_PER_LONG == 64 ++# define LONG_INT_SHIFT (6) ++#else ++# define LONG_INT_SHIFT (5) ++#endif ++ ++#define LONG_INT_MASK (BITS_PER_LONG - 1UL) ++ ++typedef unsigned long ulong_t; ++ ++#define bmap_size(blocksize) ((blocksize) - CHECKSUM_SIZE) ++#define bmap_bit_count(blocksize) (bmap_size(blocksize) << 3) ++ ++/* Block allocation/deallocation are done through special bitmap objects which ++ are allocated in an array at fs mount. */ ++struct bitmap_node { ++ struct semaphore sema; /* long term lock object */ ++ ++ jnode *wjnode; /* j-nodes for WORKING ... */ ++ jnode *cjnode; /* ... and COMMIT bitmap blocks */ ++ ++ bmap_off_t first_zero_bit; /* for skip_busy option implementation */ ++ ++ atomic_t loaded; /* a flag which shows that bnode is loaded ++ * already */ ++}; ++ ++static inline char *bnode_working_data(struct bitmap_node *bnode) ++{ ++ char *data; ++ ++ data = jdata(bnode->wjnode); ++ assert("zam-429", data != NULL); ++ ++ return data + CHECKSUM_SIZE; ++} ++ ++static inline char *bnode_commit_data(const struct bitmap_node *bnode) ++{ ++ char *data; ++ ++ data = jdata(bnode->cjnode); ++ assert("zam-430", data != NULL); ++ ++ return data + CHECKSUM_SIZE; ++} ++ ++static inline __u32 bnode_commit_crc(const struct bitmap_node *bnode) ++{ ++ char *data; ++ ++ data = jdata(bnode->cjnode); ++ assert("vpf-261", data != NULL); ++ ++ return le32_to_cpu(get_unaligned((d32 *)data)); ++} ++ ++static inline void bnode_set_commit_crc(struct bitmap_node *bnode, __u32 crc) ++{ ++ char *data; ++ ++ data = jdata(bnode->cjnode); ++ assert("vpf-261", data != NULL); ++ ++ put_unaligned(cpu_to_le32(crc), (d32 *)data); ++} ++ ++/* ZAM-FIXME-HANS: is the idea that this might be a union someday? having ++ * written the code, does this added abstraction still have */ ++/* ANSWER(Zam): No, the abstractions is in the level above (exact place is the ++ * reiser4_space_allocator structure) */ ++/* ZAM-FIXME-HANS: I don't understand your english in comment above. */ ++/* FIXME-HANS(Zam): I don't understand the questions like "might be a union ++ * someday?". What they about? If there is a reason to have a union, it should ++ * be a union, if not, it should not be a union. "..might be someday" means no ++ * reason. 
*/ ++struct bitmap_allocator_data { ++ /* an array for bitmap blocks direct access */ ++ struct bitmap_node *bitmap; ++}; ++ ++#define get_barray(super) \ ++(((struct bitmap_allocator_data *)(get_super_private(super)->space_allocator.u.generic)) -> bitmap) ++ ++#define get_bnode(super, i) (get_barray(super) + i) ++ ++/* allocate and initialize jnode with JNODE_BITMAP type */ ++static jnode *bnew(void) ++{ ++ jnode *jal = jalloc(); ++ ++ if (jal) ++ jnode_init(jal, current_tree, JNODE_BITMAP); ++ ++ return jal; ++} ++ ++/* this file contains: ++ - bitmap based implementation of space allocation plugin ++ - all the helper functions like set bit, find_first_zero_bit, etc */ ++ ++/* Audited by: green(2002.06.12) */ ++static int find_next_zero_bit_in_word(ulong_t word, int start_bit) ++{ ++ ulong_t mask = 1UL << start_bit; ++ int i = start_bit; ++ ++ while ((word & mask) != 0) { ++ mask <<= 1; ++ if (++i >= BITS_PER_LONG) ++ break; ++ } ++ ++ return i; ++} ++ ++#include ++ ++#if BITS_PER_LONG == 64 ++ ++#define OFF(addr) (((ulong_t)(addr) & (BYTES_PER_LONG - 1)) << 3) ++#define BASE(addr) ((ulong_t*) ((ulong_t)(addr) & ~(BYTES_PER_LONG - 1))) ++ ++static inline void reiser4_set_bit(int nr, void *addr) ++{ ++ ext2_set_bit(nr + OFF(addr), BASE(addr)); ++} ++ ++static inline void reiser4_clear_bit(int nr, void *addr) ++{ ++ ext2_clear_bit(nr + OFF(addr), BASE(addr)); ++} ++ ++static inline int reiser4_test_bit(int nr, void *addr) ++{ ++ return ext2_test_bit(nr + OFF(addr), BASE(addr)); ++} ++static inline int reiser4_find_next_zero_bit(void *addr, int maxoffset, ++ int offset) ++{ ++ int off = OFF(addr); ++ ++ return ext2_find_next_zero_bit(BASE(addr), maxoffset + off, ++ offset + off) - off; ++} ++ ++#else ++ ++#define reiser4_set_bit(nr, addr) ext2_set_bit(nr, addr) ++#define reiser4_clear_bit(nr, addr) ext2_clear_bit(nr, addr) ++#define reiser4_test_bit(nr, addr) ext2_test_bit(nr, addr) ++ ++#define reiser4_find_next_zero_bit(addr, maxoffset, offset) \ ++ext2_find_next_zero_bit(addr, maxoffset, offset) ++#endif ++ ++/* Search for a set bit in the bit array [@start_offset, @max_offset[, offsets ++ * are counted from @addr, return the offset of the first bit if it is found, ++ * @maxoffset otherwise. */ ++static bmap_off_t __reiser4_find_next_set_bit(void *addr, bmap_off_t max_offset, ++ bmap_off_t start_offset) ++{ ++ ulong_t *base = addr; ++ /* start_offset is in bits, convert it to byte offset within bitmap. */ ++ int word_nr = start_offset >> LONG_INT_SHIFT; ++ /* bit number within the byte. */ ++ int bit_nr = start_offset & LONG_INT_MASK; ++ int max_word_nr = (max_offset - 1) >> LONG_INT_SHIFT; ++ ++ assert("zam-387", max_offset != 0); ++ ++ /* Unaligned @start_offset case. */ ++ if (bit_nr != 0) { ++ bmap_nr_t nr; ++ ++ nr = find_next_zero_bit_in_word(~(base[word_nr]), bit_nr); ++ ++ if (nr < BITS_PER_LONG) ++ return (word_nr << LONG_INT_SHIFT) + nr; ++ ++ ++word_nr; ++ } ++ ++ /* Fast scan trough aligned words. 
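++ One whole machine word is tested per iteration; a bit-by-bit scan is
++ only done inside the first non-zero word.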
*/ ++ while (word_nr <= max_word_nr) { ++ if (base[word_nr] != 0) { ++ return (word_nr << LONG_INT_SHIFT) ++ + find_next_zero_bit_in_word(~(base[word_nr]), 0); ++ } ++ ++ ++word_nr; ++ } ++ ++ return max_offset; ++} ++ ++#if BITS_PER_LONG == 64 ++ ++static bmap_off_t reiser4_find_next_set_bit(void *addr, bmap_off_t max_offset, ++ bmap_off_t start_offset) ++{ ++ bmap_off_t off = OFF(addr); ++ ++ return __reiser4_find_next_set_bit(BASE(addr), max_offset + off, ++ start_offset + off) - off; ++} ++ ++#else ++#define reiser4_find_next_set_bit(addr, max_offset, start_offset) \ ++ __reiser4_find_next_set_bit(addr, max_offset, start_offset) ++#endif ++ ++/* search for the first set bit in single word. */ ++static int find_last_set_bit_in_word(ulong_t word, int start_bit) ++{ ++ ulong_t bit_mask; ++ int nr = start_bit; ++ ++ assert("zam-965", start_bit < BITS_PER_LONG); ++ assert("zam-966", start_bit >= 0); ++ ++ bit_mask = (1UL << nr); ++ ++ while (bit_mask != 0) { ++ if (bit_mask & word) ++ return nr; ++ bit_mask >>= 1; ++ nr--; ++ } ++ return BITS_PER_LONG; ++} ++ ++/* Search bitmap for a set bit in backward direction from the end to the ++ * beginning of given region ++ * ++ * @result: result offset of the last set bit ++ * @addr: base memory address, ++ * @low_off: low end of the search region, edge bit included into the region, ++ * @high_off: high end of the search region, edge bit included into the region, ++ * ++ * @return: 0 - set bit was found, -1 otherwise. ++ */ ++static int ++reiser4_find_last_set_bit(bmap_off_t * result, void *addr, bmap_off_t low_off, ++ bmap_off_t high_off) ++{ ++ ulong_t *base = addr; ++ int last_word; ++ int first_word; ++ int last_bit; ++ int nr; ++ ++ assert("zam-961", high_off >= 0); ++ assert("zam-962", high_off >= low_off); ++ ++ last_word = high_off >> LONG_INT_SHIFT; ++ last_bit = high_off & LONG_INT_MASK; ++ first_word = low_off >> LONG_INT_SHIFT; ++ ++ if (last_bit < BITS_PER_LONG) { ++ nr = find_last_set_bit_in_word(base[last_word], last_bit); ++ if (nr < BITS_PER_LONG) { ++ *result = (last_word << LONG_INT_SHIFT) + nr; ++ return 0; ++ } ++ --last_word; ++ } ++ while (last_word >= first_word) { ++ if (base[last_word] != 0x0) { ++ last_bit = ++ find_last_set_bit_in_word(base[last_word], ++ BITS_PER_LONG - 1); ++ assert("zam-972", last_bit < BITS_PER_LONG); ++ *result = (last_word << LONG_INT_SHIFT) + last_bit; ++ return 0; ++ } ++ --last_word; ++ } ++ ++ return -1; /* set bit not found */ ++} ++ ++/* Search bitmap for a clear bit in backward direction from the end to the ++ * beginning of given region */ ++static int ++reiser4_find_last_zero_bit(bmap_off_t * result, void *addr, bmap_off_t low_off, ++ bmap_off_t high_off) ++{ ++ ulong_t *base = addr; ++ int last_word; ++ int first_word; ++ int last_bit; ++ int nr; ++ ++ last_word = high_off >> LONG_INT_SHIFT; ++ last_bit = high_off & LONG_INT_MASK; ++ first_word = low_off >> LONG_INT_SHIFT; ++ ++ if (last_bit < BITS_PER_LONG) { ++ nr = find_last_set_bit_in_word(~base[last_word], last_bit); ++ if (nr < BITS_PER_LONG) { ++ *result = (last_word << LONG_INT_SHIFT) + nr; ++ return 0; ++ } ++ --last_word; ++ } ++ while (last_word >= first_word) { ++ if (base[last_word] != (ulong_t) (-1)) { ++ *result = (last_word << LONG_INT_SHIFT) + ++ find_last_set_bit_in_word(~base[last_word], ++ BITS_PER_LONG - 1); ++ return 0; ++ } ++ --last_word; ++ } ++ ++ return -1; /* zero bit not found */ ++} ++ ++/* Audited by: green(2002.06.12) */ ++static void reiser4_clear_bits(char *addr, bmap_off_t start, bmap_off_t end) ++{ ++ 
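/* Clear the half-open bit range [start, end): whole bytes in the middle
++ are memset() to zero, the partial first and last bytes are masked
++ individually. */
++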
int first_byte; ++ int last_byte; ++ ++ unsigned char first_byte_mask = 0xFF; ++ unsigned char last_byte_mask = 0xFF; ++ ++ assert("zam-410", start < end); ++ ++ first_byte = start >> 3; ++ last_byte = (end - 1) >> 3; ++ ++ if (last_byte > first_byte + 1) ++ memset(addr + first_byte + 1, 0, ++ (size_t) (last_byte - first_byte - 1)); ++ ++ first_byte_mask >>= 8 - (start & 0x7); ++ last_byte_mask <<= ((end - 1) & 0x7) + 1; ++ ++ if (first_byte == last_byte) { ++ addr[first_byte] &= (first_byte_mask | last_byte_mask); ++ } else { ++ addr[first_byte] &= first_byte_mask; ++ addr[last_byte] &= last_byte_mask; ++ } ++} ++ ++/* Audited by: green(2002.06.12) */ ++/* ZAM-FIXME-HANS: comment this */ ++static void reiser4_set_bits(char *addr, bmap_off_t start, bmap_off_t end) ++{ ++ int first_byte; ++ int last_byte; ++ ++ unsigned char first_byte_mask = 0xFF; ++ unsigned char last_byte_mask = 0xFF; ++ ++ assert("zam-386", start < end); ++ ++ first_byte = start >> 3; ++ last_byte = (end - 1) >> 3; ++ ++ if (last_byte > first_byte + 1) ++ memset(addr + first_byte + 1, 0xFF, ++ (size_t) (last_byte - first_byte - 1)); ++ ++ first_byte_mask <<= start & 0x7; ++ last_byte_mask >>= 7 - ((end - 1) & 0x7); ++ ++ if (first_byte == last_byte) { ++ addr[first_byte] |= (first_byte_mask & last_byte_mask); ++ } else { ++ addr[first_byte] |= first_byte_mask; ++ addr[last_byte] |= last_byte_mask; ++ } ++} ++ ++#define ADLER_BASE 65521 ++#define ADLER_NMAX 5552 ++ ++/* Calculates the adler32 checksum for the data pointed by `data` of the ++ length `len`. This function was originally taken from zlib, version 1.1.3, ++ July 9th, 1998. ++ ++ Copyright (C) 1995-1998 Jean-loup Gailly and Mark Adler ++ ++ This software is provided 'as-is', without any express or implied ++ warranty. In no event will the authors be held liable for any damages ++ arising from the use of this software. ++ ++ Permission is granted to anyone to use this software for any purpose, ++ including commercial applications, and to alter it and redistribute it ++ freely, subject to the following restrictions: ++ ++ 1. The origin of this software must not be misrepresented; you must not ++ claim that you wrote the original software. If you use this software ++ in a product, an acknowledgment in the product documentation would be ++ appreciated but is not required. ++ 2. Altered source versions must be plainly marked as such, and must not be ++ misrepresented as being the original software. ++ 3. This notice may not be removed or altered from any source distribution. ++ ++ Jean-loup Gailly Mark Adler ++ jloup@gzip.org madler@alumni.caltech.edu ++ ++ The above comment applies only to the reiser4_adler32 function. ++*/ ++ ++__u32 reiser4_adler32(char *data, __u32 len) ++{ ++ unsigned char *t = data; ++ __u32 s1 = 1; ++ __u32 s2 = 0; ++ int k; ++ ++ while (len > 0) { ++ k = len < ADLER_NMAX ? 
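++		    /* NMAX caps each chunk so the running sums cannot
++		     * overflow 32 bits before the modulo is applied: */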
len : ADLER_NMAX; ++ len -= k; ++ ++ while (k--) { ++ s1 += *t++; ++ s2 += s1; ++ } ++ ++ s1 %= ADLER_BASE; ++ s2 %= ADLER_BASE; ++ } ++ return (s2 << 16) | s1; ++} ++ ++#define sb_by_bnode(bnode) \ ++ ((struct super_block *)jnode_get_tree(bnode->wjnode)->super) ++ ++static __u32 bnode_calc_crc(const struct bitmap_node *bnode, unsigned long size) ++{ ++ return reiser4_adler32(bnode_commit_data(bnode), bmap_size(size)); ++} ++ ++static int ++bnode_check_adler32(const struct bitmap_node *bnode, unsigned long size) ++{ ++ if (bnode_calc_crc(bnode, size) != bnode_commit_crc(bnode)) { ++ bmap_nr_t bmap; ++ ++ bmap = bnode - get_bnode(sb_by_bnode(bnode), 0); ++ ++ warning("vpf-263", ++ "Checksum for the bitmap block %llu is incorrect", ++ bmap); ++ ++ return RETERR(-EIO); ++ } ++ ++ return 0; ++} ++ ++#define REISER4_CHECK_BMAP_CRC (0) ++ ++#if REISER4_CHECK_BMAP_CRC ++static int bnode_check_crc(const struct bitmap_node *bnode) ++{ ++ return bnode_check_adler32(bnode, ++ bmap_size(sb_by_bnode(bnode)->s_blocksize)); ++} ++ ++/* REISER4_CHECK_BMAP_CRC */ ++#else ++ ++#define bnode_check_crc(bnode) (0) ++ ++/* REISER4_CHECK_BMAP_CRC */ ++#endif ++ ++/* Recalculates the adler32 checksum for only 1 byte change. ++ adler - previous adler checksum ++ old_data, data - old, new byte values. ++ tail == (chunk - offset) : length, checksum was calculated for, - offset of ++ the changed byte within this chunk. ++ This function can be used for checksum calculation optimisation. ++*/ ++ ++static __u32 ++adler32_recalc(__u32 adler, unsigned char old_data, unsigned char data, ++ __u32 tail) ++{ ++ __u32 delta = data - old_data + 2 * ADLER_BASE; ++ __u32 s1 = adler & 0xffff; ++ __u32 s2 = (adler >> 16) & 0xffff; ++ ++ s1 = (delta + s1) % ADLER_BASE; ++ s2 = (delta * tail + s2) % ADLER_BASE; ++ ++ return (s2 << 16) | s1; ++} ++ ++#define LIMIT(val, boundary) ((val) > (boundary) ? (boundary) : (val)) ++ ++/** ++ * get_nr_bitmap - calculate number of bitmap blocks ++ * @super: super block with initialized blocksize and block count ++ * ++ * Calculates number of bitmap blocks of a filesystem which uses bitmaps to ++ * maintain free disk space. It assumes that each bitmap addresses the same ++ * number of blocks which is calculated by bmap_block_count macro defined in ++ * above. Number of blocks in the filesystem has to be initialized in reiser4 ++ * private data of super block already so that it can be obtained via ++ * reiser4_block_count(). Unfortunately, number of blocks addressed by a bitmap ++ * is not power of 2 because 4 bytes are used for checksum. Therefore, we have ++ * to use special function to divide and modulo 64bits filesystem block ++ * counters. ++ * ++ * Example: suppose filesystem have 32768 blocks. Blocksize is 4096. Each bitmap ++ * block addresses (4096 - 4) * 8 = 32736 blocks. Number of bitmaps to address ++ * all 32768 blocks is calculated as (32768 - 1) / 32736 + 1 = 2. 
++ */ ++static bmap_nr_t get_nr_bmap(const struct super_block *super) ++{ ++ u64 quotient; ++ ++ assert("zam-393", reiser4_block_count(super) != 0); ++ ++ quotient = reiser4_block_count(super) - 1; ++ do_div(quotient, bmap_bit_count(super->s_blocksize)); ++ return quotient + 1; ++} ++ ++/** ++ * parse_blocknr - calculate bitmap number and offset in it by block number ++ * @block: pointer to block number to calculate location in bitmap of ++ * @bmap: pointer where to store bitmap block number ++ * @offset: pointer where to store offset within bitmap block ++ * ++ * Calculates location of bit which is responsible for allocation/freeing of ++ * block @*block. That location is represented by bitmap block number and offset ++ * within that bitmap block. ++ */ ++static void ++parse_blocknr(const reiser4_block_nr *block, bmap_nr_t *bmap, ++ bmap_off_t *offset) ++{ ++ struct super_block *super = get_current_context()->super; ++ u64 quotient = *block; ++ ++ *offset = do_div(quotient, bmap_bit_count(super->s_blocksize)); ++ *bmap = quotient; ++ ++ assert("zam-433", *bmap < get_nr_bmap(super)); ++ assert("", *offset < bmap_bit_count(super->s_blocksize)); ++} ++ ++#if REISER4_DEBUG ++/* Audited by: green(2002.06.12) */ ++static void ++check_block_range(const reiser4_block_nr * start, const reiser4_block_nr * len) ++{ ++ struct super_block *sb = reiser4_get_current_sb(); ++ ++ assert("zam-436", sb != NULL); ++ ++ assert("zam-455", start != NULL); ++ assert("zam-437", *start != 0); ++ assert("zam-541", !blocknr_is_fake(start)); ++ assert("zam-441", *start < reiser4_block_count(sb)); ++ ++ if (len != NULL) { ++ assert("zam-438", *len != 0); ++ assert("zam-442", *start + *len <= reiser4_block_count(sb)); ++ } ++} ++ ++static void check_bnode_loaded(const struct bitmap_node *bnode) ++{ ++ assert("zam-485", bnode != NULL); ++ assert("zam-483", jnode_page(bnode->wjnode) != NULL); ++ assert("zam-484", jnode_page(bnode->cjnode) != NULL); ++ assert("nikita-2820", jnode_is_loaded(bnode->wjnode)); ++ assert("nikita-2821", jnode_is_loaded(bnode->cjnode)); ++} ++ ++#else ++ ++# define check_block_range(start, len) do { /* nothing */} while(0) ++# define check_bnode_loaded(bnode) do { /* nothing */} while(0) ++ ++#endif ++ ++/* modify bnode->first_zero_bit (if we free bits before); bnode should be ++ spin-locked */ ++static inline void ++adjust_first_zero_bit(struct bitmap_node *bnode, bmap_off_t offset) ++{ ++ if (offset < bnode->first_zero_bit) ++ bnode->first_zero_bit = offset; ++} ++ ++/* return a physical disk address for logical bitmap number @bmap */ ++/* FIXME-VS: this is somehow related to disk layout? */ ++/* ZAM-FIXME-HANS: your answer is? Use not more than one function dereference ++ * per block allocation so that performance is not affected. Probably this ++ * whole file should be considered part of the disk layout plugin, and other ++ * disk layouts can use other defines and efficiency will not be significantly ++ * affected. */ ++ ++#define REISER4_FIRST_BITMAP_BLOCK \ ++ ((REISER4_MASTER_OFFSET / PAGE_CACHE_SIZE) + 2) ++ ++/* Audited by: green(2002.06.12) */ ++static void ++get_bitmap_blocknr(struct super_block *super, bmap_nr_t bmap, ++ reiser4_block_nr * bnr) ++{ ++ ++ assert("zam-390", bmap < get_nr_bmap(super)); ++ ++#ifdef CONFIG_REISER4_BADBLOCKS ++#define BITMAP_PLUGIN_DISKMAP_ID ((0xc0e1<<16) | (0xe0ff)) ++ /* Check if the diskmap have this already, first. 
*/ ++ if (reiser4_get_diskmap_value(BITMAP_PLUGIN_DISKMAP_ID, bmap, bnr) == 0) ++ return; /* Found it in diskmap */ ++#endif ++ /* FIXME_ZAM: before discussing of disk layouts and disk format ++ plugins I implement bitmap location scheme which is close to scheme ++ used in reiser 3.6 */ ++ if (bmap == 0) { ++ *bnr = REISER4_FIRST_BITMAP_BLOCK; ++ } else { ++ *bnr = bmap * bmap_bit_count(super->s_blocksize); ++ } ++} ++ ++/* construct a fake block number for shadow bitmap (WORKING BITMAP) block */ ++/* Audited by: green(2002.06.12) */ ++static void get_working_bitmap_blocknr(bmap_nr_t bmap, reiser4_block_nr * bnr) ++{ ++ *bnr = ++ (reiser4_block_nr) ((bmap & ~REISER4_BLOCKNR_STATUS_BIT_MASK) | ++ REISER4_BITMAP_BLOCKS_STATUS_VALUE); ++} ++ ++/* bnode structure initialization */ ++static void ++init_bnode(struct bitmap_node *bnode, ++ struct super_block *super UNUSED_ARG, bmap_nr_t bmap UNUSED_ARG) ++{ ++ memset(bnode, 0, sizeof(struct bitmap_node)); ++ ++ sema_init(&bnode->sema, 1); ++ atomic_set(&bnode->loaded, 0); ++} ++ ++static void release(jnode * node) ++{ ++ jrelse(node); ++ JF_SET(node, JNODE_HEARD_BANSHEE); ++ jput(node); ++} ++ ++/* This function is for internal bitmap.c use because it assumes that jnode is ++ in under full control of this thread */ ++static void done_bnode(struct bitmap_node *bnode) ++{ ++ if (bnode) { ++ atomic_set(&bnode->loaded, 0); ++ if (bnode->wjnode != NULL) ++ release(bnode->wjnode); ++ if (bnode->cjnode != NULL) ++ release(bnode->cjnode); ++ bnode->wjnode = bnode->cjnode = NULL; ++ } ++} ++ ++/* ZAM-FIXME-HANS: comment this. Called only by load_and_lock_bnode()*/ ++static int ++prepare_bnode(struct bitmap_node *bnode, jnode ** cjnode_ret, ++ jnode ** wjnode_ret) ++{ ++ struct super_block *super; ++ jnode *cjnode; ++ jnode *wjnode; ++ bmap_nr_t bmap; ++ int ret; ++ ++ super = reiser4_get_current_sb(); ++ ++ *wjnode_ret = wjnode = bnew(); ++ if (wjnode == NULL) { ++ *cjnode_ret = NULL; ++ return RETERR(-ENOMEM); ++ } ++ ++ *cjnode_ret = cjnode = bnew(); ++ if (cjnode == NULL) ++ return RETERR(-ENOMEM); ++ ++ bmap = bnode - get_bnode(super, 0); ++ ++ get_working_bitmap_blocknr(bmap, &wjnode->blocknr); ++ get_bitmap_blocknr(super, bmap, &cjnode->blocknr); ++ ++ jref(cjnode); ++ jref(wjnode); ++ ++ /* load commit bitmap */ ++ ret = jload_gfp(cjnode, GFP_NOFS, 1); ++ ++ if (ret) ++ goto error; ++ ++ /* allocate memory for working bitmap block. Note that for ++ * bitmaps jinit_new() doesn't actually modifies node content, ++ * so parallel calls to this are ok. */ ++ ret = jinit_new(wjnode, GFP_NOFS); ++ ++ if (ret != 0) { ++ jrelse(cjnode); ++ goto error; ++ } ++ ++ return 0; ++ ++ error: ++ jput(cjnode); ++ jput(wjnode); ++ *wjnode_ret = *cjnode_ret = NULL; ++ return ret; ++ ++} ++ ++/* Check the bnode data on read. */ ++static int check_struct_bnode(struct bitmap_node *bnode, __u32 blksize) ++{ ++ void *data; ++ int ret; ++ ++ /* Check CRC */ ++ ret = bnode_check_adler32(bnode, blksize); ++ ++ if (ret) { ++ return ret; ++ } ++ ++ data = jdata(bnode->cjnode) + CHECKSUM_SIZE; ++ ++ /* Check the very first bit -- it must be busy. 
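++ Bit 0 of the first bitmap covers block 0 at the start of the
++ filesystem, and bit 0 of every later bitmap covers that bitmap block
++ itself, so in a consistent filesystem it is always set.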
*/ ++ if (!reiser4_test_bit(0, data)) { ++ warning("vpf-1362", "The allocator block %llu is not marked " ++ "as used.", (unsigned long long)bnode->cjnode->blocknr); ++ ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++/* load bitmap blocks "on-demand" */ ++static int load_and_lock_bnode(struct bitmap_node *bnode) ++{ ++ int ret; ++ ++ jnode *cjnode; ++ jnode *wjnode; ++ ++ assert("nikita-3040", schedulable()); ++ ++/* ZAM-FIXME-HANS: since bitmaps are never unloaded, this does not ++ * need to be atomic, right? Just leave a comment that if bitmaps were ++ * unloadable, this would need to be atomic. */ ++ if (atomic_read(&bnode->loaded)) { ++ /* bitmap is already loaded, nothing to do */ ++ check_bnode_loaded(bnode); ++ down(&bnode->sema); ++ assert("nikita-2827", atomic_read(&bnode->loaded)); ++ return 0; ++ } ++ ++ ret = prepare_bnode(bnode, &cjnode, &wjnode); ++ if (ret == 0) { ++ down(&bnode->sema); ++ ++ if (!atomic_read(&bnode->loaded)) { ++ assert("nikita-2822", cjnode != NULL); ++ assert("nikita-2823", wjnode != NULL); ++ assert("nikita-2824", jnode_is_loaded(cjnode)); ++ assert("nikita-2825", jnode_is_loaded(wjnode)); ++ ++ bnode->wjnode = wjnode; ++ bnode->cjnode = cjnode; ++ ++ ret = check_struct_bnode(bnode, current_blocksize); ++ if (!ret) { ++ cjnode = wjnode = NULL; ++ atomic_set(&bnode->loaded, 1); ++ /* working bitmap is initialized by on-disk ++ * commit bitmap. This should be performed ++ * under semaphore. */ ++ memcpy(bnode_working_data(bnode), ++ bnode_commit_data(bnode), ++ bmap_size(current_blocksize)); ++ } else { ++ up(&bnode->sema); ++ } ++ } else ++ /* race: someone already loaded bitmap while we were ++ * busy initializing data. */ ++ check_bnode_loaded(bnode); ++ } ++ ++ if (wjnode != NULL) { ++ release(wjnode); ++ bnode->wjnode = NULL; ++ } ++ if (cjnode != NULL) { ++ release(cjnode); ++ bnode->cjnode = NULL; ++ } ++ ++ return ret; ++} ++ ++static void release_and_unlock_bnode(struct bitmap_node *bnode) ++{ ++ check_bnode_loaded(bnode); ++ up(&bnode->sema); ++} ++ ++/* This function does all block allocation work but only for one bitmap ++ block.*/ ++/* FIXME_ZAM: It does not allow us to allocate block ranges across bitmap ++ block responsibility zone boundaries. This had no sense in v3.6 but may ++ have it in v4.x */ ++/* ZAM-FIXME-HANS: do you mean search one bitmap block forward? 
*/ ++static int ++search_one_bitmap_forward(bmap_nr_t bmap, bmap_off_t * offset, ++ bmap_off_t max_offset, int min_len, int max_len) ++{ ++ struct super_block *super = get_current_context()->super; ++ struct bitmap_node *bnode = get_bnode(super, bmap); ++ ++ char *data; ++ ++ bmap_off_t search_end; ++ bmap_off_t start; ++ bmap_off_t end; ++ ++ int set_first_zero_bit = 0; ++ ++ int ret; ++ ++ assert("zam-364", min_len > 0); ++ assert("zam-365", max_len >= min_len); ++ assert("zam-366", *offset <= max_offset); ++ ++ ret = load_and_lock_bnode(bnode); ++ ++ if (ret) ++ return ret; ++ ++ data = bnode_working_data(bnode); ++ ++ start = *offset; ++ ++ if (bnode->first_zero_bit >= start) { ++ start = bnode->first_zero_bit; ++ set_first_zero_bit = 1; ++ } ++ ++ while (start + min_len < max_offset) { ++ ++ start = ++ reiser4_find_next_zero_bit((long *)data, max_offset, start); ++ if (set_first_zero_bit) { ++ bnode->first_zero_bit = start; ++ set_first_zero_bit = 0; ++ } ++ if (start >= max_offset) ++ break; ++ ++ search_end = LIMIT(start + max_len, max_offset); ++ end = ++ reiser4_find_next_set_bit((long *)data, search_end, start); ++ if (end >= start + min_len) { ++ /* we can't trust find_next_set_bit result if set bit ++ was not fount, result may be bigger than ++ max_offset */ ++ if (end > search_end) ++ end = search_end; ++ ++ ret = end - start; ++ *offset = start; ++ ++ reiser4_set_bits(data, start, end); ++ ++ /* FIXME: we may advance first_zero_bit if [start, ++ end] region overlaps the first_zero_bit point */ ++ ++ break; ++ } ++ ++ start = end + 1; ++ } ++ ++ release_and_unlock_bnode(bnode); ++ ++ return ret; ++} ++ ++static int ++search_one_bitmap_backward(bmap_nr_t bmap, bmap_off_t * start_offset, ++ bmap_off_t end_offset, int min_len, int max_len) ++{ ++ struct super_block *super = get_current_context()->super; ++ struct bitmap_node *bnode = get_bnode(super, bmap); ++ char *data; ++ bmap_off_t start; ++ int ret; ++ ++ assert("zam-958", min_len > 0); ++ assert("zam-959", max_len >= min_len); ++ assert("zam-960", *start_offset >= end_offset); ++ ++ ret = load_and_lock_bnode(bnode); ++ if (ret) ++ return ret; ++ ++ data = bnode_working_data(bnode); ++ start = *start_offset; ++ ++ while (1) { ++ bmap_off_t end, search_end; ++ ++ /* Find the beginning of the zero filled region */ ++ if (reiser4_find_last_zero_bit(&start, data, end_offset, start)) ++ break; ++ /* Is there more than `min_len' bits from `start' to ++ * `end_offset'? */ ++ if (start < end_offset + min_len - 1) ++ break; ++ ++ /* Do not search to `end_offset' if we need to find less than ++ * `max_len' zero bits. */ ++ if (end_offset + max_len - 1 < start) ++ search_end = start - max_len + 1; ++ else ++ search_end = end_offset; ++ ++ if (reiser4_find_last_set_bit(&end, data, search_end, start)) ++ end = search_end; ++ else ++ end++; ++ ++ if (end + min_len <= start + 1) { ++ if (end < search_end) ++ end = search_end; ++ ret = start - end + 1; ++ *start_offset = end; /* `end' is lowest offset */ ++ assert("zam-987", ++ reiser4_find_next_set_bit(data, start + 1, ++ end) >= start + 1); ++ reiser4_set_bits(data, end, start + 1); ++ break; ++ } ++ ++ if (end <= end_offset) ++ /* left search boundary reached. 
*/ ++ break; ++ start = end - 1; ++ } ++ ++ release_and_unlock_bnode(bnode); ++ return ret; ++} ++ ++/* allocate contiguous range of blocks in bitmap */ ++static int bitmap_alloc_forward(reiser4_block_nr * start, ++ const reiser4_block_nr * end, int min_len, ++ int max_len) ++{ ++ bmap_nr_t bmap, end_bmap; ++ bmap_off_t offset, end_offset; ++ int len; ++ ++ reiser4_block_nr tmp; ++ ++ struct super_block *super = get_current_context()->super; ++ const bmap_off_t max_offset = bmap_bit_count(super->s_blocksize); ++ ++ parse_blocknr(start, &bmap, &offset); ++ ++ tmp = *end - 1; ++ parse_blocknr(&tmp, &end_bmap, &end_offset); ++ ++end_offset; ++ ++ assert("zam-358", end_bmap >= bmap); ++ assert("zam-359", ergo(end_bmap == bmap, end_offset >= offset)); ++ ++ for (; bmap < end_bmap; bmap++, offset = 0) { ++ len = ++ search_one_bitmap_forward(bmap, &offset, max_offset, ++ min_len, max_len); ++ if (len != 0) ++ goto out; ++ } ++ ++ len = ++ search_one_bitmap_forward(bmap, &offset, end_offset, min_len, ++ max_len); ++ out: ++ *start = bmap * max_offset + offset; ++ return len; ++} ++ ++/* allocate contiguous range of blocks in bitmap (from @start to @end in ++ * backward direction) */ ++static int bitmap_alloc_backward(reiser4_block_nr * start, ++ const reiser4_block_nr * end, int min_len, ++ int max_len) ++{ ++ bmap_nr_t bmap, end_bmap; ++ bmap_off_t offset, end_offset; ++ int len; ++ struct super_block *super = get_current_context()->super; ++ const bmap_off_t max_offset = bmap_bit_count(super->s_blocksize); ++ ++ parse_blocknr(start, &bmap, &offset); ++ parse_blocknr(end, &end_bmap, &end_offset); ++ ++ assert("zam-961", end_bmap <= bmap); ++ assert("zam-962", ergo(end_bmap == bmap, end_offset <= offset)); ++ ++ for (; bmap > end_bmap; bmap--, offset = max_offset - 1) { ++ len = ++ search_one_bitmap_backward(bmap, &offset, 0, min_len, ++ max_len); ++ if (len != 0) ++ goto out; ++ } ++ ++ len = ++ search_one_bitmap_backward(bmap, &offset, end_offset, min_len, ++ max_len); ++ out: ++ *start = bmap * max_offset + offset; ++ return len; ++} ++ ++/* plugin->u.space_allocator.alloc_blocks() */ ++static int alloc_blocks_forward(reiser4_blocknr_hint *hint, int needed, ++ reiser4_block_nr *start, reiser4_block_nr *len) ++{ ++ struct super_block *super = get_current_context()->super; ++ int actual_len; ++ ++ reiser4_block_nr search_start; ++ reiser4_block_nr search_end; ++ ++ assert("zam-398", super != NULL); ++ assert("zam-412", hint != NULL); ++ assert("zam-397", hint->blk <= reiser4_block_count(super)); ++ ++ if (hint->max_dist == 0) ++ search_end = reiser4_block_count(super); ++ else ++ search_end = ++ LIMIT(hint->blk + hint->max_dist, ++ reiser4_block_count(super)); ++ ++ /* We use @hint -> blk as a search start and search from it to the end ++ of the disk or in given region if @hint -> max_dist is not zero */ ++ search_start = hint->blk; ++ ++ actual_len = ++ bitmap_alloc_forward(&search_start, &search_end, 1, needed); ++ ++ /* There is only one bitmap search if max_dist was specified or first ++ pass was from the beginning of the bitmap. We also do one pass for ++ scanning bitmap in backward direction. 
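++ Otherwise a second forward pass below wraps around and rescans the
++ range [0, search_start) that the first pass skipped.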
*/ ++ if (!(actual_len != 0 || hint->max_dist != 0 || search_start == 0)) { ++ /* next step is a scanning from 0 to search_start */ ++ search_end = search_start; ++ search_start = 0; ++ actual_len = ++ bitmap_alloc_forward(&search_start, &search_end, 1, needed); ++ } ++ if (actual_len == 0) ++ return RETERR(-ENOSPC); ++ if (actual_len < 0) ++ return RETERR(actual_len); ++ *len = actual_len; ++ *start = search_start; ++ return 0; ++} ++ ++static int alloc_blocks_backward(reiser4_blocknr_hint * hint, int needed, ++ reiser4_block_nr * start, ++ reiser4_block_nr * len) ++{ ++ reiser4_block_nr search_start; ++ reiser4_block_nr search_end; ++ int actual_len; ++ ++ ON_DEBUG(struct super_block *super = reiser4_get_current_sb()); ++ ++ assert("zam-969", super != NULL); ++ assert("zam-970", hint != NULL); ++ assert("zam-971", hint->blk <= reiser4_block_count(super)); ++ ++ search_start = hint->blk; ++ if (hint->max_dist == 0 || search_start <= hint->max_dist) ++ search_end = 0; ++ else ++ search_end = search_start - hint->max_dist; ++ ++ actual_len = ++ bitmap_alloc_backward(&search_start, &search_end, 1, needed); ++ if (actual_len == 0) ++ return RETERR(-ENOSPC); ++ if (actual_len < 0) ++ return RETERR(actual_len); ++ *len = actual_len; ++ *start = search_start; ++ return 0; ++} ++ ++/* plugin->u.space_allocator.alloc_blocks() */ ++int ++alloc_blocks_bitmap(reiser4_space_allocator * allocator UNUSED_ARG, ++ reiser4_blocknr_hint * hint, int needed, ++ reiser4_block_nr * start, reiser4_block_nr * len) ++{ ++ if (hint->backward) ++ return alloc_blocks_backward(hint, needed, start, len); ++ return alloc_blocks_forward(hint, needed, start, len); ++} ++ ++/* plugin->u.space_allocator.dealloc_blocks(). */ ++/* It just frees blocks in WORKING BITMAP. Usually formatted an unformatted ++ nodes deletion is deferred until transaction commit. However, deallocation ++ of temporary objects like wandered blocks and transaction commit records ++ requires immediate node deletion from WORKING BITMAP.*/ ++void ++dealloc_blocks_bitmap(reiser4_space_allocator * allocator UNUSED_ARG, ++ reiser4_block_nr start, reiser4_block_nr len) ++{ ++ struct super_block *super = reiser4_get_current_sb(); ++ ++ bmap_nr_t bmap; ++ bmap_off_t offset; ++ ++ struct bitmap_node *bnode; ++ int ret; ++ ++ assert("zam-468", len != 0); ++ check_block_range(&start, &len); ++ ++ parse_blocknr(&start, &bmap, &offset); ++ ++ assert("zam-469", offset + len <= bmap_bit_count(super->s_blocksize)); ++ ++ bnode = get_bnode(super, bmap); ++ ++ assert("zam-470", bnode != NULL); ++ ++ ret = load_and_lock_bnode(bnode); ++ assert("zam-481", ret == 0); ++ ++ reiser4_clear_bits(bnode_working_data(bnode), offset, ++ (bmap_off_t) (offset + len)); ++ ++ adjust_first_zero_bit(bnode, offset); ++ ++ release_and_unlock_bnode(bnode); ++} ++ ++/* plugin->u.space_allocator.check_blocks(). 
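++ Debug-only sanity check: asserts that every block in the given range is
++ allocated when @desired is non-zero, and free otherwise.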
*/ ++void ++check_blocks_bitmap(const reiser4_block_nr * start, ++ const reiser4_block_nr * len, int desired) ++{ ++#if REISER4_DEBUG ++ struct super_block *super = reiser4_get_current_sb(); ++ ++ bmap_nr_t bmap; ++ bmap_off_t start_offset; ++ bmap_off_t end_offset; ++ ++ struct bitmap_node *bnode; ++ int ret; ++ ++ assert("zam-622", len != NULL); ++ check_block_range(start, len); ++ parse_blocknr(start, &bmap, &start_offset); ++ ++ end_offset = start_offset + *len; ++ assert("nikita-2214", end_offset <= bmap_bit_count(super->s_blocksize)); ++ ++ bnode = get_bnode(super, bmap); ++ ++ assert("nikita-2215", bnode != NULL); ++ ++ ret = load_and_lock_bnode(bnode); ++ assert("zam-626", ret == 0); ++ ++ assert("nikita-2216", jnode_is_loaded(bnode->wjnode)); ++ ++ if (desired) { ++ assert("zam-623", ++ reiser4_find_next_zero_bit(bnode_working_data(bnode), ++ end_offset, start_offset) ++ >= end_offset); ++ } else { ++ assert("zam-624", ++ reiser4_find_next_set_bit(bnode_working_data(bnode), ++ end_offset, start_offset) ++ >= end_offset); ++ } ++ ++ release_and_unlock_bnode(bnode); ++#endif ++} ++ ++/* conditional insertion of @node into atom's overwrite set if it was not there */ ++static void cond_add_to_overwrite_set(txn_atom * atom, jnode * node) ++{ ++ assert("zam-546", atom != NULL); ++ assert("zam-547", atom->stage == ASTAGE_PRE_COMMIT); ++ assert("zam-548", node != NULL); ++ ++ spin_lock_atom(atom); ++ spin_lock_jnode(node); ++ ++ if (node->atom == NULL) { ++ JF_SET(node, JNODE_OVRWR); ++ insert_into_atom_ovrwr_list(atom, node); ++ } else { ++ assert("zam-549", node->atom == atom); ++ } ++ ++ spin_unlock_jnode(node); ++ spin_unlock_atom(atom); ++} ++ ++/* an actor which applies delete set to COMMIT bitmap pages and link modified ++ pages in a single-linked list */ ++static int ++apply_dset_to_commit_bmap(txn_atom * atom, const reiser4_block_nr * start, ++ const reiser4_block_nr * len, void *data) ++{ ++ ++ bmap_nr_t bmap; ++ bmap_off_t offset; ++ int ret; ++ ++ long long *blocks_freed_p = data; ++ ++ struct bitmap_node *bnode; ++ ++ struct super_block *sb = reiser4_get_current_sb(); ++ ++ check_block_range(start, len); ++ ++ parse_blocknr(start, &bmap, &offset); ++ ++ /* FIXME-ZAM: we assume that all block ranges are allocated by this ++ bitmap-based allocator and each block range can't go over a zone of ++ responsibility of one bitmap block; same assumption is used in ++ other journal hooks in bitmap code. */ ++ bnode = get_bnode(sb, bmap); ++ assert("zam-448", bnode != NULL); ++ ++ /* it is safe to unlock atom with is in ASTAGE_PRE_COMMIT */ ++ assert("zam-767", atom->stage == ASTAGE_PRE_COMMIT); ++ ret = load_and_lock_bnode(bnode); ++ if (ret) ++ return ret; ++ ++ /* put bnode into atom's overwrite set */ ++ cond_add_to_overwrite_set(atom, bnode->cjnode); ++ ++ data = bnode_commit_data(bnode); ++ ++ ret = bnode_check_crc(bnode); ++ if (ret != 0) ++ return ret; ++ ++ if (len != NULL) { ++ /* FIXME-ZAM: a check that all bits are set should be there */ ++ assert("zam-443", ++ offset + *len <= bmap_bit_count(sb->s_blocksize)); ++ reiser4_clear_bits(data, offset, (bmap_off_t) (offset + *len)); ++ ++ (*blocks_freed_p) += *len; ++ } else { ++ reiser4_clear_bit(offset, data); ++ (*blocks_freed_p)++; ++ } ++ ++ bnode_set_commit_crc(bnode, bnode_calc_crc(bnode, sb->s_blocksize)); ++ ++ release_and_unlock_bnode(bnode); ++ ++ return 0; ++} ++ ++/* plugin->u.space_allocator.pre_commit_hook(). 
*/
++/* It just applies transaction changes to the fs-wide COMMIT BITMAP, hoping
++   the rest is done by the transaction manager (allocate wandered locations
++   for COMMIT BITMAP blocks, copy COMMIT BITMAP blocks data). */
++/* Only one instance of this function can be running at any given time,
++   because only one transaction can be committed at a time, therefore it is
++   safe to access some global variables without any locking */
++
++int pre_commit_hook_bitmap(void)
++{
++	struct super_block *super = reiser4_get_current_sb();
++	txn_atom *atom;
++
++	long long blocks_freed = 0;
++
++	atom = get_current_atom_locked();
++	assert("zam-876", atom->stage == ASTAGE_PRE_COMMIT);
++	spin_unlock_atom(atom);
++
++	{	/* scan atom's captured list and find all freshly allocated
++		 * nodes, mark corresponding bits in COMMIT BITMAP as used */
++		struct list_head *head = ATOM_CLEAN_LIST(atom);
++		jnode *node = list_entry(head->next, jnode, capture_link);
++
++		while (head != &node->capture_link) {
++			/* we detect freshly allocated jnodes */
++			if (JF_ISSET(node, JNODE_RELOC)) {
++				int ret;
++				bmap_nr_t bmap;
++
++				bmap_off_t offset;
++				bmap_off_t index;
++				struct bitmap_node *bn;
++				__u32 size = bmap_size(super->s_blocksize);
++				__u32 crc;
++				char byte;
++
++				assert("zam-559", !JF_ISSET(node, JNODE_OVRWR));
++				assert("zam-460",
++				       !blocknr_is_fake(&node->blocknr));
++
++				parse_blocknr(&node->blocknr, &bmap, &offset);
++				bn = get_bnode(super, bmap);
++
++				index = offset >> 3;
++				assert("vpf-276", index < size);
++
++				ret = bnode_check_crc(bn);
++				if (ret != 0)
++					return ret;
++
++				check_bnode_loaded(bn);
++				load_and_lock_bnode(bn);
++
++				byte = *(bnode_commit_data(bn) + index);
++				reiser4_set_bit(offset, bnode_commit_data(bn));
++
++				crc = adler32_recalc(bnode_commit_crc(bn), byte,
++						     *(bnode_commit_data(bn) +
++						       index),
++						     size - index);
++				bnode_set_commit_crc(bn, crc);
++
++				release_and_unlock_bnode(bn);
++
++				ret = bnode_check_crc(bn);
++				if (ret != 0)
++					return ret;
++
++				/* correctness of this depends on how
++				   cond_add_to_overwrite_set() inserts the new
++				   j-node into the clean list, because we are
++				   scanning the same list now. It is OK if
++				   insertion is done at the list front */
++				cond_add_to_overwrite_set(atom, bn->cjnode);
++			}
++
++			node = list_entry(node->capture_link.next, jnode, capture_link);
++		}
++	}
++
++	blocknr_set_iterator(atom, &atom->delete_set, apply_dset_to_commit_bmap,
++			     &blocks_freed, 0);
++
++	blocks_freed -= atom->nr_blocks_allocated;
++
++	{
++		reiser4_super_info_data *sbinfo;
++
++		sbinfo = get_super_private(super);
++
++		spin_lock_reiser4_super(sbinfo);
++		sbinfo->blocks_free_committed += blocks_freed;
++		spin_unlock_reiser4_super(sbinfo);
++	}
++
++	return 0;
++}
++
++/* plugin->u.space_allocator.init_allocator
++   constructor of reiser4_space_allocator object.
It is called on fs mount */ ++int ++init_allocator_bitmap(reiser4_space_allocator * allocator, ++ struct super_block *super, void *arg UNUSED_ARG) ++{ ++ struct bitmap_allocator_data *data = NULL; ++ bmap_nr_t bitmap_blocks_nr; ++ bmap_nr_t i; ++ ++ assert("nikita-3039", schedulable()); ++ ++ /* getting memory for bitmap allocator private data holder */ ++ data = ++ kmalloc(sizeof(struct bitmap_allocator_data), GFP_KERNEL); ++ ++ if (data == NULL) ++ return RETERR(-ENOMEM); ++ ++ /* allocation and initialization for the array of bnodes */ ++ bitmap_blocks_nr = get_nr_bmap(super); ++ ++ /* FIXME-ZAM: it is not clear what to do with huge number of bitmaps ++ which is bigger than 2^32 (= 8 * 4096 * 4096 * 2^32 bytes = 5.76e+17, ++ may I never meet someone who still uses the ia32 architecture when ++ storage devices of that size enter the market, and wants to use ia32 ++ with that storage device, much less reiser4. ;-) -Hans). Kmalloc is not possible and, ++ probably, another dynamic data structure should replace a static ++ array of bnodes. */ ++ /*data->bitmap = reiser4_kmalloc((size_t) (sizeof (struct bitmap_node) * bitmap_blocks_nr), GFP_KERNEL); */ ++ data->bitmap = vmalloc(sizeof(struct bitmap_node) * bitmap_blocks_nr); ++ if (data->bitmap == NULL) { ++ kfree(data); ++ return RETERR(-ENOMEM); ++ } ++ ++ for (i = 0; i < bitmap_blocks_nr; i++) ++ init_bnode(data->bitmap + i, super, i); ++ ++ allocator->u.generic = data; ++ ++#if REISER4_DEBUG ++ get_super_private(super)->min_blocks_used += bitmap_blocks_nr; ++#endif ++ ++ /* Load all bitmap blocks at mount time. */ ++ if (!test_bit ++ (REISER4_DONT_LOAD_BITMAP, &get_super_private(super)->fs_flags)) { ++ __u64 start_time, elapsed_time; ++ struct bitmap_node *bnode; ++ int ret; ++ ++ if (REISER4_DEBUG) ++ printk(KERN_INFO "loading reiser4 bitmap..."); ++ start_time = jiffies; ++ ++ for (i = 0; i < bitmap_blocks_nr; i++) { ++ bnode = data->bitmap + i; ++ ret = load_and_lock_bnode(bnode); ++ if (ret) { ++ destroy_allocator_bitmap(allocator, super); ++ return ret; ++ } ++ release_and_unlock_bnode(bnode); ++ } ++ ++ elapsed_time = jiffies - start_time; ++ if (REISER4_DEBUG) ++ printk("...done (%llu jiffies)\n", ++ (unsigned long long)elapsed_time); ++ } ++ ++ return 0; ++} ++ ++/* plugin->u.space_allocator.destroy_allocator ++ destructor. 
It is called on fs unmount */
++int
++destroy_allocator_bitmap(reiser4_space_allocator * allocator,
++			 struct super_block *super)
++{
++	bmap_nr_t bitmap_blocks_nr;
++	bmap_nr_t i;
++
++	struct bitmap_allocator_data *data = allocator->u.generic;
++
++	assert("zam-414", data != NULL);
++	assert("zam-376", data->bitmap != NULL);
++
++	bitmap_blocks_nr = get_nr_bmap(super);
++
++	for (i = 0; i < bitmap_blocks_nr; i++) {
++		struct bitmap_node *bnode = data->bitmap + i;
++
++		down(&bnode->sema);
++
++#if REISER4_DEBUG
++		if (atomic_read(&bnode->loaded)) {
++			jnode *wj = bnode->wjnode;
++			jnode *cj = bnode->cjnode;
++
++			assert("zam-480", jnode_page(cj) != NULL);
++			assert("zam-633", jnode_page(wj) != NULL);
++
++			/* working and commit bitmaps must agree at unmount */
++			assert("zam-634",
++			       memcmp(jdata(wj), jdata(cj),
++				      bmap_size(super->s_blocksize)) == 0);
++
++		}
++#endif
++		done_bnode(bnode);
++		up(&bnode->sema);
++	}
++
++	vfree(data->bitmap);
++	kfree(data);
++
++	allocator->u.generic = NULL;
++
++	return 0;
++}
++
++/*
++  Local variables:
++  c-indentation-style: "K&R"
++  mode-name: "LC"
++  c-basic-offset: 8
++  tab-width: 8
++  fill-column: 80
++  scroll-step: 1
++  End:
++*/
+Index: linux-2.6.16/fs/reiser4/plugin/space/bitmap.h
+===================================================================
+--- /dev/null
++++ linux-2.6.16/fs/reiser4/plugin/space/bitmap.h
+@@ -0,0 +1,47 @@
++/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
++
++#if !defined (__REISER4_PLUGIN_SPACE_BITMAP_H__)
++#define __REISER4_PLUGIN_SPACE_BITMAP_H__
++
++#include "../../dformat.h"
++#include "../../block_alloc.h"
++
++#include <linux/types.h>	/* for __u?? */
++#include <linux/fs.h>	/* for struct super_block */
++/* EDWARD-FIXME-HANS: write something as informative as the below for every .h file lacking it. */
++/* declarations of functions implementing methods of space allocator plugin for
++   bitmap based allocator. The functions themselves are in bitmap.c */
++extern int init_allocator_bitmap(reiser4_space_allocator *,
++				 struct super_block *, void *);
++extern int destroy_allocator_bitmap(reiser4_space_allocator *,
++				    struct super_block *);
++extern int alloc_blocks_bitmap(reiser4_space_allocator *,
++			       reiser4_blocknr_hint *, int needed,
++			       reiser4_block_nr * start,
++			       reiser4_block_nr * len);
++extern void check_blocks_bitmap(const reiser4_block_nr *,
++				const reiser4_block_nr *, int);
++
++extern void dealloc_blocks_bitmap(reiser4_space_allocator *, reiser4_block_nr,
++				  reiser4_block_nr);
++extern int pre_commit_hook_bitmap(void);
++
++#define post_commit_hook_bitmap() do{}while(0)
++#define post_write_back_hook_bitmap() do{}while(0)
++#define print_info_bitmap(pref, al) do{}while(0)
++
++typedef __u64 bmap_nr_t;
++typedef __u32 bmap_off_t;
++
++#endif				/* __REISER4_PLUGIN_SPACE_BITMAP_H__ */
++
++/* Make Linus happy.
++   Local variables:
++   c-indentation-style: "K&R"
++   mode-name: "LC"
++   c-basic-offset: 8
++   tab-width: 8
++   fill-column: 120
++   scroll-step: 1
++   End:
++*/
+Index: linux-2.6.16/fs/reiser4/plugin/space/space_allocator.h
+===================================================================
+--- /dev/null
++++ linux-2.6.16/fs/reiser4/plugin/space/space_allocator.h
+@@ -0,0 +1,80 @@
++/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
++
++#ifndef __SPACE_ALLOCATOR_H__
++#define __SPACE_ALLOCATOR_H__
++
++#include "../../forward.h"
++#include "bitmap.h"
++/* NIKITA-FIXME-HANS: surely this could use a comment. Something about how bitmap is the only space allocator for now,
++ * but...
*/ ++#define DEF_SPACE_ALLOCATOR(allocator) \ ++ \ ++static inline int sa_init_allocator (reiser4_space_allocator * al, struct super_block *s, void * opaque) \ ++{ \ ++ return init_allocator_##allocator (al, s, opaque); \ ++} \ ++ \ ++static inline void sa_destroy_allocator (reiser4_space_allocator *al, struct super_block *s) \ ++{ \ ++ destroy_allocator_##allocator (al, s); \ ++} \ ++ \ ++static inline int sa_alloc_blocks (reiser4_space_allocator *al, reiser4_blocknr_hint * hint, \ ++ int needed, reiser4_block_nr * start, reiser4_block_nr * len) \ ++{ \ ++ return alloc_blocks_##allocator (al, hint, needed, start, len); \ ++} \ ++static inline void sa_dealloc_blocks (reiser4_space_allocator * al, reiser4_block_nr start, reiser4_block_nr len) \ ++{ \ ++ dealloc_blocks_##allocator (al, start, len); \ ++} \ ++ \ ++static inline void sa_check_blocks (const reiser4_block_nr * start, const reiser4_block_nr * end, int desired) \ ++{ \ ++ check_blocks_##allocator (start, end, desired); \ ++} \ ++ \ ++static inline void sa_pre_commit_hook (void) \ ++{ \ ++ pre_commit_hook_##allocator (); \ ++} \ ++ \ ++static inline void sa_post_commit_hook (void) \ ++{ \ ++ post_commit_hook_##allocator (); \ ++} \ ++ \ ++static inline void sa_post_write_back_hook (void) \ ++{ \ ++ post_write_back_hook_##allocator(); \ ++} \ ++ \ ++static inline void sa_print_info(const char * prefix, reiser4_space_allocator * al) \ ++{ \ ++ print_info_##allocator (prefix, al); \ ++} ++ ++DEF_SPACE_ALLOCATOR(bitmap) ++ ++/* this object is part of reiser4 private in-core super block */ ++struct reiser4_space_allocator { ++ union { ++ /* space allocators might use this pointer to reference their ++ * data. */ ++ void *generic; ++ } u; ++}; ++ ++/* __SPACE_ALLOCATOR_H__ */ ++#endif ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ scroll-step: 1 ++ End: ++*/ +Index: linux-2.6.16/fs/reiser4/plugin/tail_policy.c +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/plugin/tail_policy.c +@@ -0,0 +1,113 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++/* Formatting policy plugins */ ++ ++/* ++ * Formatting policy plugin is used by object plugin (of regular file) to ++ * convert file between two representations. 
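++ * (tail items stored in formatted tree nodes vs. extent pointers to
++ * unformatted data blocks)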
++ * ++ * Currently following policies are implemented: ++ * never store file in formatted nodes ++ * always store file in formatted nodes ++ * store file in formatted nodes if file is smaller than 4 blocks (default) ++ */ ++ ++#include "../tree.h" ++#include "../inode.h" ++#include "../super.h" ++#include "object.h" ++#include "plugin.h" ++#include "node/node.h" ++#include "plugin_header.h" ++ ++#include ++#include /* For struct inode */ ++ ++/** ++ * have_formatting_never - ++ * @inode: ++ * @size: ++ * ++ * ++ */ ++/* Never store file's tail as direct item */ ++/* Audited by: green(2002.06.12) */ ++static int have_formatting_never(const struct inode *inode UNUSED_ARG ++ /* inode to operate on */ , ++ loff_t size UNUSED_ARG /* new object size */ ) ++{ ++ return 0; ++} ++ ++/* Always store file's tail as direct item */ ++/* Audited by: green(2002.06.12) */ ++static int ++have_formatting_always(const struct inode *inode UNUSED_ARG ++ /* inode to operate on */ , ++ loff_t size UNUSED_ARG /* new object size */ ) ++{ ++ return 1; ++} ++ ++/* This function makes test if we should store file denoted @inode as tails only or ++ as extents only. */ ++static int ++have_formatting_default(const struct inode *inode UNUSED_ARG ++ /* inode to operate on */ , ++ loff_t size /* new object size */ ) ++{ ++ assert("umka-1253", inode != NULL); ++ ++ if (size > inode->i_sb->s_blocksize * 4) ++ return 0; ++ ++ return 1; ++} ++ ++/* tail plugins */ ++formatting_plugin formatting_plugins[LAST_TAIL_FORMATTING_ID] = { ++ [NEVER_TAILS_FORMATTING_ID] = { ++ .h = { ++ .type_id = REISER4_FORMATTING_PLUGIN_TYPE, ++ .id = NEVER_TAILS_FORMATTING_ID, ++ .pops = NULL, ++ .label = "never", ++ .desc = "Never store file's tail", ++ .linkage = {NULL, NULL} ++ }, ++ .have_tail = have_formatting_never ++ }, ++ [ALWAYS_TAILS_FORMATTING_ID] = { ++ .h = { ++ .type_id = REISER4_FORMATTING_PLUGIN_TYPE, ++ .id = ALWAYS_TAILS_FORMATTING_ID, ++ .pops = NULL, ++ .label = "always", ++ .desc = "Always store file's tail", ++ .linkage = {NULL, NULL} ++ }, ++ .have_tail = have_formatting_always ++ }, ++ [SMALL_FILE_FORMATTING_ID] = { ++ .h = { ++ .type_id = REISER4_FORMATTING_PLUGIN_TYPE, ++ .id = SMALL_FILE_FORMATTING_ID, ++ .pops = NULL, ++ .label = "4blocks", ++ .desc = "store files shorter than 4 blocks in tail items", ++ .linkage = {NULL, NULL} ++ }, ++ .have_tail = have_formatting_default ++ } ++}; ++ ++/* ++ * Local variables: ++ * c-indentation-style: "K&R" ++ * mode-name: "LC" ++ * c-basic-offset: 8 ++ * tab-width: 8 ++ * fill-column: 79 ++ * End: ++ */ +Index: linux-2.6.16/fs/reiser4/pool.c +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/pool.c +@@ -0,0 +1,236 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++/* Fast pool allocation. ++ ++ There are situations when some sub-system normally asks memory allocator ++ for only few objects, but under some circumstances could require much ++ more. Typical and actually motivating example is tree balancing. It needs ++ to keep track of nodes that were involved into it, and it is well-known ++ that in reasonable packed balanced tree most (92.938121%) percent of all ++ balancings end up after working with only few nodes (3.141592 on ++ average). But in rare cases balancing can involve much more nodes ++ (3*tree_height+1 in extremal situation). ++ ++ On the one hand, we don't want to resort to dynamic allocation (slab, ++ malloc(), etc.) 
to allocate data structures required to keep track of ++ nodes during balancing. On the other hand, we cannot statically allocate ++ required amount of space on the stack, because first: it is useless wastage ++ of precious resource, and second: this amount is unknown in advance (tree ++ height can change). ++ ++ Pools, implemented in this file are solution for this problem: ++ ++ - some configurable amount of objects is statically preallocated on the ++ stack ++ ++ - if this preallocated pool is exhausted and more objects is requested ++ they are allocated dynamically. ++ ++ Pools encapsulate distinction between statically and dynamically allocated ++ objects. Both allocation and recycling look exactly the same. ++ ++ To keep track of dynamically allocated objects, pool adds its own linkage ++ to each object. ++ ++ NOTE-NIKITA This linkage also contains some balancing-specific data. This ++ is not perfect. On the other hand, balancing is currently the only client ++ of pool code. ++ ++ NOTE-NIKITA Another desirable feature is to rewrite all pool manipulation ++ functions in the style of tslist/tshash, i.e., make them unreadable, but ++ type-safe. ++ ++ ++*/ ++ ++#include "debug.h" ++#include "pool.h" ++#include "super.h" ++ ++#include ++#include ++ ++/* initialize new pool object */ ++static void reiser4_init_pool_obj(reiser4_pool_header * h /* pool object to ++ * initialize */ ) ++{ ++ INIT_LIST_HEAD(&h->usage_linkage); ++ INIT_LIST_HEAD(&h->level_linkage); ++ INIT_LIST_HEAD(&h->extra_linkage); ++} ++ ++/* initialize new pool */ ++void reiser4_init_pool(reiser4_pool * pool /* pool to initialize */ , ++ size_t obj_size /* size of objects in @pool */ , ++ int num_of_objs /* number of preallocated objects */ , ++ char *data /* area for preallocated objects */ ) ++{ ++ reiser4_pool_header *h; ++ int i; ++ ++ assert("nikita-955", pool != NULL); ++ assert("nikita-1044", obj_size > 0); ++ assert("nikita-956", num_of_objs >= 0); ++ assert("nikita-957", data != NULL); ++ ++ memset(pool, 0, sizeof *pool); ++ pool->obj_size = obj_size; ++ pool->data = data; ++ INIT_LIST_HEAD(&pool->free); ++ INIT_LIST_HEAD(&pool->used); ++ INIT_LIST_HEAD(&pool->extra); ++ memset(data, 0, obj_size * num_of_objs); ++ for (i = 0; i < num_of_objs; ++i) { ++ h = (reiser4_pool_header *) (data + i * obj_size); ++ reiser4_init_pool_obj(h); ++ /* add pool header to the end of pool's free list */ ++ list_add_tail(&h->usage_linkage, &pool->free); ++ } ++} ++ ++/* release pool resources ++ ++ Release all resources acquired by this pool, specifically, dynamically ++ allocated objects. ++ ++*/ ++void reiser4_done_pool(reiser4_pool * pool UNUSED_ARG /* pool to destroy */ ) ++{ ++} ++ ++/* allocate carry object from pool ++ ++ First, try to get preallocated object. If this fails, resort to dynamic ++ allocation. ++ ++*/ ++static void *reiser4_pool_alloc(reiser4_pool * pool /* pool to allocate object ++ * from */ ) ++{ ++ reiser4_pool_header *result; ++ ++ assert("nikita-959", pool != NULL); ++ ++ if (!list_empty(&pool->free)) { ++ struct list_head *linkage; ++ ++ linkage = pool->free.next; ++ list_del(linkage); ++ INIT_LIST_HEAD(linkage); ++ result = list_entry(linkage, reiser4_pool_header, usage_linkage); ++ BUG_ON(!list_empty(&result->level_linkage) || ++ !list_empty(&result->extra_linkage)); ++ } else { ++ /* pool is empty. Extra allocations don't deserve dedicated ++ slab to be served from, as they are expected to be rare. 
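The scheme sketched in this comment (serve requests from a small statically
preallocated area first, fall back to the heap only for the rare overflow,
and make allocation and recycling look identical to the caller) can be
modelled in a few lines of user-space C. This is a hedged stand-in, not the
kernel pool API:

#include <stdlib.h>
#include <string.h>

#define POOL_PREALLOC 4

struct pool {
	int used;			/* slots handed out from the static area */
	char slots[POOL_PREALLOC][64];	/* statically preallocated objects */
};

/* take an object: a preallocated slot if available, else the heap */
static void *pool_alloc(struct pool *p, size_t size, int *dynamic)
{
	if (p->used < POOL_PREALLOC && size <= sizeof p->slots[0]) {
		*dynamic = 0;
		return memset(p->slots[p->used++], 0, size);
	}
	*dynamic = 1;			/* rare overflow path */
	return calloc(1, size);
}

/* recycling looks the same to the caller either way */
static void pool_free(void *obj, int dynamic)
{
	if (dynamic)
		free(obj);
	/* static slots need no action in this simplified sketch */
}

int main(void)
{
	struct pool p = { 0 };
	int dyn;
	void *o = pool_alloc(&p, 48, &dyn);
	pool_free(o, dyn);
	return 0;
}
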
*/ ++ result = kmalloc(pool->obj_size, get_gfp_mask()); ++ if (result != 0) { ++ reiser4_init_pool_obj(result); ++ list_add(&result->extra_linkage, &pool->extra); ++ } else ++ return ERR_PTR(RETERR(-ENOMEM)); ++ BUG_ON(!list_empty(&result->usage_linkage) || ++ !list_empty(&result->level_linkage)); ++ } ++ ++pool->objs; ++ list_add(&result->usage_linkage, &pool->used); ++ memset(result + 1, 0, pool->obj_size - sizeof *result); ++ return result; ++} ++ ++/* return object back to the pool */ ++void reiser4_pool_free(reiser4_pool * pool, reiser4_pool_header * h /* pool to return object back ++ * into */ ) ++{ ++ assert("nikita-961", h != NULL); ++ assert("nikita-962", pool != NULL); ++ ++ --pool->objs; ++ assert("nikita-963", pool->objs >= 0); ++ ++ list_del_init(&h->usage_linkage); ++ list_del_init(&h->level_linkage); ++ ++ if (list_empty(&h->extra_linkage)) ++ /* ++ * pool header is not an extra one. Push it onto free list ++ * using usage_linkage ++ */ ++ list_add(&h->usage_linkage, &pool->free); ++ else { ++ /* remove pool header from pool's extra list and kfree it */ ++ list_del(&h->extra_linkage); ++ kfree(h); ++ } ++} ++ ++/* add new object to the carry level list ++ ++ Carry level is FIFO most of the time, but not always. Complications arise ++ when make_space() function tries to go to the left neighbor and thus adds ++ carry node before existing nodes, and also, when updating delimiting keys ++ after moving data between two nodes, we want left node to be locked before ++ right node. ++ ++ Latter case is confusing at the first glance. Problem is that COP_UPDATE ++ opration that updates delimiting keys is sometimes called with two nodes ++ (when data are moved between two nodes) and sometimes with only one node ++ (when leftmost item is deleted in a node). In any case operation is ++ supplied with at least node whose left delimiting key is to be updated ++ (that is "right" node). ++ ++*/ ++reiser4_pool_header *add_obj(reiser4_pool * pool /* pool from which to ++ * allocate new object */ , ++ struct list_head *list, /* list where to add ++ * object */ ++ pool_ordering order /* where to add */ , ++ reiser4_pool_header * reference /* after (or ++ * before) which ++ * existing ++ * object to ++ * add */ ) ++{ ++ reiser4_pool_header *result; ++ ++ assert("nikita-972", pool != NULL); ++ ++ result = reiser4_pool_alloc(pool); ++ if (IS_ERR(result)) ++ return result; ++ ++ assert("nikita-973", result != NULL); ++ ++ switch (order) { ++ case POOLO_BEFORE: ++ __list_add(&result->level_linkage, ++ reference->level_linkage.prev, ++ &reference->level_linkage); ++ break; ++ case POOLO_AFTER: ++ __list_add(&result->level_linkage, ++ &reference->level_linkage, ++ reference->level_linkage.next); ++ break; ++ case POOLO_LAST: ++ list_add_tail(&result->level_linkage, list); ++ break; ++ case POOLO_FIRST: ++ list_add(&result->level_linkage, list); ++ break; ++ default: ++ wrong_return_value("nikita-927", "order"); ++ } ++ return result; ++} ++ ++/* Make Linus happy. 
++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +Index: linux-2.6.16/fs/reiser4/pool.h +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/pool.h +@@ -0,0 +1,54 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++/* Fast pool allocation */ ++ ++#ifndef __REISER4_POOL_H__ ++#define __REISER4_POOL_H__ ++ ++#include ++ ++typedef struct reiser4_pool { ++ size_t obj_size; ++ int objs; ++ char *data; ++ struct list_head free; ++ struct list_head used; ++ struct list_head extra; ++} reiser4_pool; ++ ++typedef struct reiser4_pool_header { ++ /* object is either on free or "used" lists */ ++ struct list_head usage_linkage; ++ struct list_head level_linkage; ++ struct list_head extra_linkage; ++} reiser4_pool_header; ++ ++typedef enum { ++ POOLO_BEFORE, ++ POOLO_AFTER, ++ POOLO_LAST, ++ POOLO_FIRST ++} pool_ordering; ++ ++/* pool manipulation functions */ ++ ++extern void reiser4_init_pool(reiser4_pool * pool, size_t obj_size, ++ int num_of_objs, char *data); ++extern void reiser4_done_pool(reiser4_pool * pool); ++extern void reiser4_pool_free(reiser4_pool * pool, reiser4_pool_header * h); ++reiser4_pool_header *add_obj(reiser4_pool * pool, struct list_head * list, ++ pool_ordering order, ++ reiser4_pool_header * reference); ++ ++/* __REISER4_POOL_H__ */ ++#endif ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +Index: linux-2.6.16/fs/reiser4/readahead.c +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/readahead.c +@@ -0,0 +1,138 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++#include "forward.h" ++#include "tree.h" ++#include "tree_walk.h" ++#include "super.h" ++#include "inode.h" ++#include "key.h" ++#include "znode.h" ++ ++#include /* for totalram_pages */ ++ ++void init_ra_info(ra_info_t * rai) ++{ ++ rai->key_to_stop = *min_key(); ++} ++ ++/* global formatted node readahead parameter. It can be set by mount option -o readahead:NUM:1 */ ++static inline int ra_adjacent_only(int flags) ++{ ++ return flags & RA_ADJACENT_ONLY; ++} ++ ++/* this is used by formatted_readahead to decide whether read for right neighbor of node is to be issued. It returns 1 ++ if right neighbor's first key is less or equal to readahead's stop key */ ++static int should_readahead_neighbor(znode * node, ra_info_t * info) ++{ ++ int result; ++ ++ read_lock_dk(znode_get_tree(node)); ++ result = keyle(znode_get_rd_key(node), &info->key_to_stop); ++ read_unlock_dk(znode_get_tree(node)); ++ return result; ++} ++ ++#define LOW_MEM_PERCENTAGE (5) ++ ++static int low_on_memory(void) ++{ ++ unsigned int freepages; ++ ++ freepages = nr_free_pages(); ++ return freepages < (totalram_pages * LOW_MEM_PERCENTAGE / 100); ++} ++ ++/* start read for @node and for a few of its right neighbors */ ++void formatted_readahead(znode * node, ra_info_t * info) ++{ ++ ra_params_t *ra_params; ++ znode *cur; ++ int i; ++ int grn_flags; ++ lock_handle next_lh; ++ ++ /* do nothing if node block number has not been assigned to node (which means it is still in cache). 
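The readahead walk implemented in formatted_readahead() is bounded four
ways: by the caller's stop key, by fake (not yet allocated) block numbers,
by the adjacent-only mount flag, and by the configured window size. A
user-space caricature of the same bounded scan, with an array standing in
for the chain of right neighbors (every name here is invented):

#include <stdio.h>

struct node { long block; long max_key; };

/* prefetch to the right until the stop key, a gap, or the window limit */
static void readahead(struct node *n, int count, long stop_key, int window)
{
	int i;

	for (i = 1; i < count && i <= window; i++) {
		if (n[i - 1].max_key > stop_key)
			break;	/* caller will not need keys past this point */
		if (n[i].block != n[i - 1].block + 1)
			break;	/* adjacent-only policy: a seek would be needed */
		printf("start I/O for block %ld\n", n[i].block);
	}
}

int main(void)
{
	struct node chain[] = { {100, 10}, {101, 20}, {102, 30}, {200, 40} };
	readahead(chain, 4, 25, 8);	/* reads 101 and 102; the stop key ends the scan */
	return 0;
}
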
*/ ++ if (blocknr_is_fake(znode_get_block(node))) ++ return; ++ ++ ra_params = get_current_super_ra_params(); ++ ++ if (znode_page(node) == NULL) ++ jstartio(ZJNODE(node)); ++ ++ if (znode_get_level(node) != LEAF_LEVEL) ++ return; ++ ++ /* don't waste memory for read-ahead when low on memory */ ++ if (low_on_memory()) ++ return; ++ ++ /* We can have locked nodes on upper tree levels, in this situation lock ++ priorities do not help to resolve deadlocks, we have to use TRY_LOCK ++ here. */ ++ grn_flags = (GN_CAN_USE_UPPER_LEVELS | GN_TRY_LOCK); ++ ++ i = 0; ++ cur = zref(node); ++ init_lh(&next_lh); ++ while (i < ra_params->max) { ++ const reiser4_block_nr *nextblk; ++ ++ if (!should_readahead_neighbor(cur, info)) ++ break; ++ ++ if (reiser4_get_right_neighbor ++ (&next_lh, cur, ZNODE_READ_LOCK, grn_flags)) ++ break; ++ ++ nextblk = znode_get_block(next_lh.node); ++ if (blocknr_is_fake(nextblk) || ++ (ra_adjacent_only(ra_params->flags) ++ && *nextblk != *znode_get_block(cur) + 1)) { ++ break; ++ } ++ ++ zput(cur); ++ cur = zref(next_lh.node); ++ done_lh(&next_lh); ++ if (znode_page(cur) == NULL) ++ jstartio(ZJNODE(cur)); ++ else ++ /* Do not scan read-ahead window if pages already ++ * allocated (and i/o already started). */ ++ break; ++ ++ i++; ++ } ++ zput(cur); ++ done_lh(&next_lh); ++} ++ ++void reiser4_readdir_readahead_init(struct inode *dir, tap_t * tap) ++{ ++ reiser4_key *stop_key; ++ ++ assert("nikita-3542", dir != NULL); ++ assert("nikita-3543", tap != NULL); ++ ++ stop_key = &tap->ra_info.key_to_stop; ++ /* initialize readdir readahead information: include into readahead ++ * stat data of all files of the directory */ ++ set_key_locality(stop_key, get_inode_oid(dir)); ++ set_key_type(stop_key, KEY_SD_MINOR); ++ set_key_ordering(stop_key, get_key_ordering(max_key())); ++ set_key_objectid(stop_key, get_key_objectid(max_key())); ++ set_key_offset(stop_key, get_key_offset(max_key())); ++} ++ ++/* ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 80 ++ End: ++*/ +Index: linux-2.6.16/fs/reiser4/readahead.h +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/readahead.h +@@ -0,0 +1,48 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++#ifndef __READAHEAD_H__ ++#define __READAHEAD_H__ ++ ++#include "key.h" ++ ++typedef enum { ++ RA_ADJACENT_ONLY = 1, /* only requests nodes which are adjacent. Default is NO (not only adjacent) */ ++} ra_global_flags; ++ ++/* reiser4 super block has a field of this type. It controls readahead during tree traversals */ ++typedef struct formatted_read_ahead_params { ++ unsigned long max; /* request not more than this amount of nodes. Default is totalram_pages / 4 */ ++ int flags; ++} ra_params_t; ++ ++typedef struct { ++ reiser4_key key_to_stop; ++} ra_info_t; ++ ++void formatted_readahead(znode *, ra_info_t *); ++void init_ra_info(ra_info_t * rai); ++ ++struct reiser4_file_ra_state { ++ loff_t start; /* Current window */ ++ loff_t size; ++ loff_t next_size; /* Next window size */ ++ loff_t ahead_start; /* Ahead window */ ++ loff_t ahead_size; ++ loff_t max_window_size; /* Maximum readahead window */ ++ loff_t slow_start; /* enlarging r/a size algorithm. 
*/ ++}; ++ ++extern void reiser4_readdir_readahead_init(struct inode *dir, tap_t * tap); ++ ++/* __READAHEAD_H__ */ ++#endif ++ ++/* ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +Index: linux-2.6.16/fs/reiser4/reiser4.h +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/reiser4.h +@@ -0,0 +1,276 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++/* definitions of common constants used by reiser4 */ ++ ++#if !defined( __REISER4_H__ ) ++#define __REISER4_H__ ++ ++#include ++#include /* for HZ */ ++#include ++#include ++#include ++#include ++#include ++ ++/* ++ * reiser4 compilation options. ++ */ ++ ++#if defined(CONFIG_REISER4_DEBUG) ++/* turn on assertion checks */ ++#define REISER4_DEBUG (1) ++#else ++#define REISER4_DEBUG (0) ++#endif ++ ++#if defined(CONFIG_ZLIB_INFLATE) ++/* turn on zlib */ ++#define REISER4_ZLIB (1) ++#else ++#define REISER4_ZLIB (0) ++#endif ++ ++#if defined(CONFIG_CRYPTO_SHA256) ++#define REISER4_SHA256 (1) ++#else ++#define REISER4_SHA256 (0) ++#endif ++ ++#if defined(CONFIG_CRYPTO_AES_586) ++#define REISER4_AES (1) ++#else ++#define REISER4_AES (0) ++#endif ++ ++/* ++ * Turn on large keys mode. In his mode (which is default), reiser4 key has 4 ++ * 8-byte components. In the old "small key" mode, it's 3 8-byte ++ * components. Additional component, referred to as "ordering" is used to ++ * order items from which given object is composed of. As such, ordering is ++ * placed between locality and objectid. For directory item ordering contains ++ * initial prefix of the file name this item is for. This sorts all directory ++ * items within given directory lexicographically (but see ++ * fibration.[ch]). For file body and stat-data, ordering contains initial ++ * prefix of the name file was initially created with. In the common case ++ * (files with single name) this allows to order file bodies and stat-datas in ++ * the same order as their respective directory entries, thus speeding up ++ * readdir. ++ * ++ * Note, that kernel can only mount file system with the same key size as one ++ * it is compiled for, so flipping this option may render your data ++ * inaccessible. ++ */ ++#define REISER4_LARGE_KEY (1) ++/*#define REISER4_LARGE_KEY (0)*/ ++ ++/*#define GUESS_EXISTS 1*/ ++ ++/* ++ * PLEASE update fs/reiser4/kattr.c:show_options() when adding new compilation ++ * option ++ */ ++ ++extern const char *REISER4_SUPER_MAGIC_STRING; ++extern const int REISER4_MAGIC_OFFSET; /* offset to magic string from the ++ * beginning of device */ ++ ++/* here go tunable parameters that are not worth special entry in kernel ++ configuration */ ++ ++/* default number of slots in coord-by-key caches */ ++#define CBK_CACHE_SLOTS (16) ++/* how many elementary tree operation to carry on the next level */ ++#define CARRIES_POOL_SIZE (5) ++/* size of pool of preallocated nodes for carry process. 
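The REISER4_LARGE_KEY comment above adds an "ordering" component between
locality and objectid, so items created under lexicographically close names
sort close together and a readdir-then-stat pass walks the tree almost
sequentially. A user-space toy model of that comparison (per-component
compare, 8-byte name prefix; nothing below is the kernel's actual layout):

#include <stdio.h>
#include <stdint.h>

/* toy large-key layout: ordering sits between locality and objectid */
struct key { uint64_t locality, ordering, objectid, offset; };

/* compare component by component, most significant first */
static int keycmp(const struct key *a, const struct key *b)
{
	if (a->locality != b->locality)
		return a->locality < b->locality ? -1 : 1;
	if (a->ordering != b->ordering)
		return a->ordering < b->ordering ? -1 : 1;
	if (a->objectid != b->objectid)
		return a->objectid < b->objectid ? -1 : 1;
	return 0;
}

/* pack up to 8 leading name bytes, high byte first, as the ordering */
static uint64_t name_prefix(const char *name)
{
	uint64_t v = 0;
	int i;

	for (i = 0; i < 8 && name[i]; i++)
		v |= (uint64_t)(unsigned char)name[i] << (56 - 8 * i);
	return v;
}

int main(void)
{
	struct key a = { 5, name_prefix("alpha"), 101, 0 };
	struct key b = { 5, name_prefix("beta"), 42, 0 };

	/* "alpha" sorts before "beta" although its objectid is larger */
	printf("%d\n", keycmp(&a, &b));	/* prints -1 */
	return 0;
}
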
*/ ++#define NODES_LOCKED_POOL_SIZE (5) ++ ++#define REISER4_NEW_NODE_FLAGS (COPI_LOAD_LEFT | COPI_LOAD_RIGHT | COPI_GO_LEFT) ++#define REISER4_NEW_EXTENT_FLAGS (COPI_LOAD_LEFT | COPI_LOAD_RIGHT | COPI_GO_LEFT) ++#define REISER4_PASTE_FLAGS (COPI_GO_LEFT) ++#define REISER4_INSERT_FLAGS (COPI_GO_LEFT) ++ ++/* we are supporting reservation of disk space on uid basis */ ++#define REISER4_SUPPORT_UID_SPACE_RESERVATION (0) ++/* we are supporting reservation of disk space for groups */ ++#define REISER4_SUPPORT_GID_SPACE_RESERVATION (0) ++/* we are supporting reservation of disk space for root */ ++#define REISER4_SUPPORT_ROOT_SPACE_RESERVATION (0) ++/* we use rapid flush mode, see flush.c for comments. */ ++#define REISER4_USE_RAPID_FLUSH (1) ++ ++/* ++ * set this to 0 if you don't want to use wait-for-flush in ->writepage(). ++ */ ++#define REISER4_USE_ENTD (1) ++ ++/* key allocation is Plan-A */ ++#define REISER4_PLANA_KEY_ALLOCATION (1) ++/* key allocation follows good old 3.x scheme */ ++#define REISER4_3_5_KEY_ALLOCATION (0) ++ ++/* size of hash-table for znodes */ ++#define REISER4_ZNODE_HASH_TABLE_SIZE (1 << 13) ++ ++/* number of buckets in lnode hash-table */ ++#define LNODE_HTABLE_BUCKETS (1024) ++ ++/* some ridiculously high maximal limit on height of znode tree. This ++ is used in declaration of various per level arrays and ++ to allocate stattistics gathering array for per-level stats. */ ++#define REISER4_MAX_ZTREE_HEIGHT (8) ++ ++#define REISER4_PANIC_MSG_BUFFER_SIZE (1024) ++ ++/* If array contains less than REISER4_SEQ_SEARCH_BREAK elements then, ++ sequential search is on average faster than binary. This is because ++ of better optimization and because sequential search is more CPU ++ cache friendly. This number (25) was found by experiments on dual AMD ++ Athlon(tm), 1400MHz. ++ ++ NOTE: testing in kernel has shown that binary search is more effective than ++ implied by results of the user level benchmarking. Probably because in the ++ node keys are separated by other data. So value was adjusted after few ++ tests. More thorough tuning is needed. ++*/ ++#define REISER4_SEQ_SEARCH_BREAK (3) ++ ++/* don't allow tree to be lower than this */ ++#define REISER4_MIN_TREE_HEIGHT (TWIG_LEVEL) ++ ++/* NOTE NIKITA this is no longer used: maximal atom size is auto-adjusted to ++ * available memory. */ ++/* Default value of maximal atom size. Can be ovewritten by ++ tmgr.atom_max_size mount option. By default infinity. */ ++#define REISER4_ATOM_MAX_SIZE ((unsigned)(~0)) ++ ++/* Default value of maximal atom age (in jiffies). After reaching this age ++ atom will be forced to commit, either synchronously or asynchronously. Can ++ be overwritten by tmgr.atom_max_age mount option. */ ++#define REISER4_ATOM_MAX_AGE (600 * HZ) ++ ++/* sleeping period for ktxnmrgd */ ++#define REISER4_TXNMGR_TIMEOUT (5 * HZ) ++ ++/* timeout to wait for ent thread in writepage. Default: 3 milliseconds. */ ++#define REISER4_ENTD_TIMEOUT (3 * HZ / 1000) ++ ++/* start complaining after that many restarts in coord_by_key(). ++ ++ This either means incredibly heavy contention for this part of a tree, or ++ some corruption or bug. ++*/ ++#define REISER4_CBK_ITERATIONS_LIMIT (100) ++ ++/* return -EIO after that many iterations in coord_by_key(). ++ ++ I have witnessed more than 800 iterations (in 30 thread test) before cbk ++ finished. --nikita ++*/ ++#define REISER4_MAX_CBK_ITERATIONS 500000 ++ ++/* put a per-inode limit on maximal number of directory entries with identical ++ keys in hashed directory. 
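The comment on REISER4_SEQ_SEARCH_BREAK above records a classic crossover:
for a handful of elements, a linear scan beats binary search because it is
branch- and cache-friendly. A self-contained hybrid lookup in that spirit
(the threshold value is a tunable assumption, not a measured constant):

#include <stdio.h>

#define SEQ_SEARCH_BREAK 3	/* tunable crossover, cf. REISER4_SEQ_SEARCH_BREAK */

/* return index of key in a sorted array, or -1 */
static int hybrid_search(const int *a, int n, int key)
{
	int lo = 0, hi = n - 1;

	while (hi - lo + 1 >= SEQ_SEARCH_BREAK) {	/* binary while large */
		int mid = lo + (hi - lo) / 2;
		if (a[mid] < key)
			lo = mid + 1;
		else
			hi = mid;
	}
	for (; lo <= hi; lo++)				/* linear tail */
		if (a[lo] == key)
			return lo;
	return -1;
}

int main(void)
{
	int a[] = { 2, 3, 5, 7, 11, 13, 17 };
	printf("%d\n", hybrid_search(a, 7, 11));	/* prints 4 */
	return 0;
}
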
++ ++ Disable this until inheritance interfaces stabilize: we need some way to ++ set per directory limit. ++*/ ++#define REISER4_USE_COLLISION_LIMIT (0) ++ ++/* If flush finds more than FLUSH_RELOCATE_THRESHOLD adjacent dirty leaf-level blocks it ++ will force them to be relocated. */ ++#define FLUSH_RELOCATE_THRESHOLD 64 ++/* If flush finds can find a block allocation closer than at most FLUSH_RELOCATE_DISTANCE ++ from the preceder it will relocate to that position. */ ++#define FLUSH_RELOCATE_DISTANCE 64 ++ ++/* If we have written this much or more blocks before encountering busy jnode ++ in flush list - abort flushing hoping that next time we get called ++ this jnode will be clean already, and we will save some seeks. */ ++#define FLUSH_WRITTEN_THRESHOLD 50 ++ ++/* The maximum number of nodes to scan left on a level during flush. */ ++#define FLUSH_SCAN_MAXNODES 10000 ++ ++/* per-atom limit of flushers */ ++#define ATOM_MAX_FLUSHERS (1) ++ ++/* default tracing buffer size */ ++#define REISER4_TRACE_BUF_SIZE (1 << 15) ++ ++/* what size units of IO we would like cp, etc., to use, in writing to ++ reiser4. In bytes. ++ ++ Can be overwritten by optimal_io_size mount option. ++*/ ++#define REISER4_OPTIMAL_IO_SIZE (64 * 1024) ++ ++/* see comments in inode.c:oid_to_uino() */ ++#define REISER4_UINO_SHIFT (1 << 30) ++ ++/* Mark function argument as unused to avoid compiler warnings. */ ++#define UNUSED_ARG __attribute__((unused)) ++ ++#if ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 3)) || (__GNUC__ > 3) ++#define NONNULL __attribute__((nonnull)) ++#else ++#define NONNULL ++#endif ++ ++/* master super block offset in bytes.*/ ++#define REISER4_MASTER_OFFSET 65536 ++ ++/* size of VFS block */ ++#define VFS_BLKSIZE 512 ++/* number of bits in size of VFS block (512==2^9) */ ++#define VFS_BLKSIZE_BITS 9 ++ ++#define REISER4_I reiser4_inode_data ++ ++/* implication */ ++#define ergo( antecedent, consequent ) ( !( antecedent ) || ( consequent ) ) ++/* logical equivalence */ ++#define equi( p1, p2 ) ( ergo( ( p1 ), ( p2 ) ) && ergo( ( p2 ), ( p1 ) ) ) ++ ++#define sizeof_array(x) ((int) (sizeof(x) / sizeof(x[0]))) ++ ++#define NOT_YET (0) ++ ++/** Reiser4 specific error codes **/ ++ ++#define REISER4_ERROR_CODE_BASE 500 ++ ++/* Neighbor is not available (side neighbor or parent) */ ++#define E_NO_NEIGHBOR (REISER4_ERROR_CODE_BASE) ++ ++/* Node was not found in cache */ ++#define E_NOT_IN_CACHE (REISER4_ERROR_CODE_BASE + 1) ++ ++/* node has no free space enough for completion of balancing operation */ ++#define E_NODE_FULL (REISER4_ERROR_CODE_BASE + 2) ++ ++/* repeat operation */ ++#define E_REPEAT (REISER4_ERROR_CODE_BASE + 3) ++ ++/* deadlock happens */ ++#define E_DEADLOCK (REISER4_ERROR_CODE_BASE + 4) ++ ++/* operation cannot be performed, because it would block and non-blocking mode ++ * was requested. */ ++#define E_BLOCK (REISER4_ERROR_CODE_BASE + 5) ++ ++/* wait some event (depends on context), then repeat */ ++#define E_WAIT (REISER4_ERROR_CODE_BASE + 6) ++ ++#endif /* __REISER4_H__ */ ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +Index: linux-2.6.16/fs/reiser4/safe_link.c +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/safe_link.c +@@ -0,0 +1,351 @@ ++/* Copyright 2003, 2004 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++/* Safe-links. 
*/ ++ ++/* ++ * Safe-links are used to maintain file system consistency during operations ++ * that spawns multiple transactions. For example: ++ * ++ * 1. Unlink. UNIX supports "open-but-unlinked" files, that is files ++ * without user-visible names in the file system, but still opened by some ++ * active process. What happens here is that unlink proper (i.e., removal ++ * of the last file name) and file deletion (truncate of file body to zero ++ * and deletion of stat-data, that happens when last file descriptor is ++ * closed), may belong to different transactions T1 and T2. If a crash ++ * happens after T1 commit, but before T2 commit, on-disk file system has ++ * a file without name, that is, disk space leak. ++ * ++ * 2. Truncate. Truncate of large file may spawn multiple transactions. If ++ * system crashes while truncate was in-progress, file is left partially ++ * truncated, which violates "atomicity guarantees" of reiser4, viz. that ++ * every system is atomic. ++ * ++ * Safe-links address both above cases. Basically, safe-link is a way post ++ * some operation to be executed during commit of some other transaction than ++ * current one. (Another way to look at the safe-link is to interpret it as a ++ * logical logging.) ++ * ++ * Specifically, at the beginning of unlink safe-link in inserted in the ++ * tree. This safe-link is normally removed by file deletion code (during ++ * transaction T2 in the above terms). Truncate also inserts safe-link that is ++ * normally removed when truncate operation is finished. ++ * ++ * This means, that in the case of "clean umount" there are no safe-links in ++ * the tree. If safe-links are observed during mount, it means that (a) system ++ * was terminated abnormally, and (b) safe-link correspond to the "pending" ++ * (i.e., not finished) operations that were in-progress during system ++ * termination. Each safe-link record enough information to complete ++ * corresponding operation, and mount simply "replays" them (hence, the ++ * analogy with the logical logging). ++ * ++ * Safe-links are implemented as blackbox items (see ++ * plugin/item/blackbox.[ch]). ++ * ++ * For the reference: ext3 also has similar mechanism, it's called "an orphan ++ * list" there. ++ */ ++ ++#include "safe_link.h" ++#include "debug.h" ++#include "inode.h" ++ ++#include "plugin/item/blackbox.h" ++ ++#include ++ ++/* ++ * On-disk format of safe-link. ++ */ ++typedef struct safelink { ++ reiser4_key sdkey; /* key of stat-data for the file safe-link is ++ * for */ ++ d64 size; /* size to which file should be truncated */ ++} safelink_t; ++ ++/* ++ * locality where safe-link items are stored. Next to the objectid of root ++ * directory. ++ */ ++static oid_t safe_link_locality(reiser4_tree * tree) ++{ ++ return get_key_objectid(get_super_private(tree->super)->df_plug-> ++ root_dir_key(tree->super)) + 1; ++} ++ ++/* ++ Construct a key for the safe-link. Key has the following format: ++ ++| 60 | 4 | 64 | 4 | 60 | 64 | +++---------------+---+------------------+---+---------------+------------------+ ++| locality | 0 | 0 | 0 | objectid | link type | +++---------------+---+------------------+---+---------------+------------------+ ++| | | | | ++| 8 bytes | 8 bytes | 8 bytes | 8 bytes | ++ ++ This is in large keys format. In small keys format second 8 byte chunk is ++ out. Locality is a constant returned by safe_link_locality(). objectid is ++ an oid of a file on which operation protected by this safe-link is ++ performed. 
link-type is used to distinguish safe-links for different ++ operations. ++ ++ */ ++static reiser4_key *build_link_key(reiser4_tree * tree, oid_t oid, ++ reiser4_safe_link_t link, reiser4_key * key) ++{ ++ reiser4_key_init(key); ++ set_key_locality(key, safe_link_locality(tree)); ++ set_key_objectid(key, oid); ++ set_key_offset(key, link); ++ return key; ++} ++ ++/* ++ * how much disk space is necessary to insert and remove (in the ++ * error-handling path) safe-link. ++ */ ++static __u64 safe_link_tograb(reiser4_tree * tree) ++{ ++ return ++ /* insert safe link */ ++ estimate_one_insert_item(tree) + ++ /* remove safe link */ ++ estimate_one_item_removal(tree) + ++ /* drill to the leaf level during insertion */ ++ 1 + estimate_one_insert_item(tree) + ++ /* ++ * possible update of existing safe-link. Actually, if ++ * safe-link existed already (we failed to remove it), then no ++ * insertion is necessary, so this term is already "covered", ++ * but for simplicity let's left it. ++ */ ++ 1; ++} ++ ++/* ++ * grab enough disk space to insert and remove (in the error-handling path) ++ * safe-link. ++ */ ++int safe_link_grab(reiser4_tree * tree, reiser4_ba_flags_t flags) ++{ ++ int result; ++ ++ grab_space_enable(); ++ /* The sbinfo->delete semaphore can be taken here. ++ * safe_link_release() should be called before leaving reiser4 ++ * context. */ ++ result = ++ reiser4_grab_reserved(tree->super, safe_link_tograb(tree), flags); ++ grab_space_enable(); ++ return result; ++} ++ ++/* ++ * release unused disk space reserved by safe_link_grab(). ++ */ ++void safe_link_release(reiser4_tree * tree) ++{ ++ reiser4_release_reserved(tree->super); ++} ++ ++/* ++ * insert into tree safe-link for operation @link on inode @inode. ++ */ ++int safe_link_add(struct inode *inode, reiser4_safe_link_t link) ++{ ++ reiser4_key key; ++ safelink_t sl; ++ int length; ++ int result; ++ reiser4_tree *tree; ++ ++ build_sd_key(inode, &sl.sdkey); ++ length = sizeof sl.sdkey; ++ ++ if (link == SAFE_TRUNCATE) { ++ /* ++ * for truncate we have to store final file length also, ++ * expand item. ++ */ ++ length += sizeof(sl.size); ++ put_unaligned(cpu_to_le64(inode->i_size), &sl.size); ++ } ++ tree = tree_by_inode(inode); ++ build_link_key(tree, get_inode_oid(inode), link, &key); ++ ++ result = store_black_box(tree, &key, &sl, length); ++ if (result == -EEXIST) ++ result = update_black_box(tree, &key, &sl, length); ++ return result; ++} ++ ++/* ++ * remove safe-link corresponding to the operation @link on inode @inode from ++ * the tree. ++ */ ++int safe_link_del(reiser4_tree * tree, oid_t oid, reiser4_safe_link_t link) ++{ ++ reiser4_key key; ++ ++ return kill_black_box(tree, build_link_key(tree, oid, link, &key)); ++} ++ ++/* ++ * in-memory structure to keep information extracted from safe-link. This is ++ * used to iterate over all safe-links. ++ */ ++typedef struct { ++ reiser4_tree *tree; /* internal tree */ ++ reiser4_key key; /* safe-link key */ ++ reiser4_key sdkey; /* key of object stat-data */ ++ reiser4_safe_link_t link; /* safe-link type */ ++ oid_t oid; /* object oid */ ++ __u64 size; /* final size for truncate */ ++} safe_link_context; ++ ++/* ++ * start iterating over all safe-links. 
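The long comment above describes safe-links as crash notes: each records the
stat-data key of the affected file plus, for truncate, the intended final
size, and mount-time replay completes whatever a crash interrupted. A
user-space model of the record-and-replay idea (plain structs and printf
stand in for tree items and object plugins; every name is invented):

#include <stdio.h>
#include <stdint.h>

enum link_type { SAFE_UNLINK, SAFE_TRUNCATE };

struct safelink {		/* cf. the on-disk safelink_t */
	uint64_t oid;		/* object the pending operation refers to */
	enum link_type type;
	uint64_t size;		/* final length, SAFE_TRUNCATE only */
};

/* "mount time": complete operations a crash left half-done */
static void replay(const struct safelink *sl, int n)
{
	int i;

	for (i = 0; i < n; i++) {
		if (sl[i].type == SAFE_TRUNCATE)
			printf("oid %llu: finish truncate to %llu\n",
			       (unsigned long long)sl[i].oid,
			       (unsigned long long)sl[i].size);
		else
			printf("oid %llu: finish deletion\n",
			       (unsigned long long)sl[i].oid);
	}
}

int main(void)
{
	struct safelink pending[] = {
		{ 42, SAFE_UNLINK, 0 },
		{ 43, SAFE_TRUNCATE, 4096 },
	};
	replay(pending, 2);	/* a clean unmount would leave this list empty */
	return 0;
}
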
++ */ ++static void safe_link_iter_begin(reiser4_tree * tree, safe_link_context * ctx) ++{ ++ ctx->tree = tree; ++ reiser4_key_init(&ctx->key); ++ set_key_locality(&ctx->key, safe_link_locality(tree)); ++ set_key_objectid(&ctx->key, get_key_objectid(max_key())); ++ set_key_offset(&ctx->key, get_key_offset(max_key())); ++} ++ ++/* ++ * return next safe-link. ++ */ ++static int safe_link_iter_next(safe_link_context * ctx) ++{ ++ int result; ++ safelink_t sl; ++ ++ result = load_black_box(ctx->tree, &ctx->key, &sl, sizeof sl, 0); ++ if (result == 0) { ++ ctx->oid = get_key_objectid(&ctx->key); ++ ctx->link = get_key_offset(&ctx->key); ++ ctx->sdkey = sl.sdkey; ++ if (ctx->link == SAFE_TRUNCATE) ++ ctx->size = le64_to_cpu(get_unaligned(&sl.size)); ++ } ++ return result; ++} ++ ++/* ++ * check are there any more safe-links left in the tree. ++ */ ++static int safe_link_iter_finished(safe_link_context * ctx) ++{ ++ return get_key_locality(&ctx->key) != safe_link_locality(ctx->tree); ++} ++ ++/* ++ * finish safe-link iteration. ++ */ ++static void safe_link_iter_end(safe_link_context * ctx) ++{ ++ /* nothing special */ ++} ++ ++/* ++ * process single safe-link. ++ */ ++static int process_safelink(struct super_block *super, reiser4_safe_link_t link, ++ reiser4_key * sdkey, oid_t oid, __u64 size) ++{ ++ struct inode *inode; ++ int result; ++ ++ /* ++ * obtain object inode by reiser4_iget(), then call object plugin ++ * ->safelink() method to do actual work, then delete safe-link on ++ * success. ++ */ ++ inode = reiser4_iget(super, sdkey, 1); ++ if (!IS_ERR(inode)) { ++ file_plugin *fplug; ++ ++ fplug = inode_file_plugin(inode); ++ assert("nikita-3428", fplug != NULL); ++ assert("", oid == get_inode_oid(inode)); ++ if (fplug->safelink != NULL) { ++ /* txn_restart_current is not necessary because ++ * mounting is signle thread. However, without it ++ * deadlock detection code will complain (see ++ * nikita-3361). */ ++ txn_restart_current(); ++ result = fplug->safelink(inode, link, size); ++ } else { ++ warning("nikita-3430", ++ "Cannot handle safelink for %lli", ++ (unsigned long long)oid); ++ print_key("key", sdkey); ++ result = 0; ++ } ++ if (result != 0) { ++ warning("nikita-3431", ++ "Error processing safelink for %lli: %i", ++ (unsigned long long)oid, result); ++ } ++ reiser4_iget_complete(inode); ++ iput(inode); ++ if (result == 0) { ++ result = safe_link_grab(get_tree(super), BA_CAN_COMMIT); ++ if (result == 0) ++ result = ++ safe_link_del(get_tree(super), oid, link); ++ safe_link_release(get_tree(super)); ++ /* ++ * restart transaction: if there was large number of ++ * safe-links, their processing may fail to fit into ++ * single transaction. ++ */ ++ if (result == 0) ++ txn_restart_current(); ++ } ++ } else ++ result = PTR_ERR(inode); ++ return result; ++} ++ ++/* ++ * iterate over all safe-links in the file-system processing them one by one. ++ */ ++int process_safelinks(struct super_block *super) ++{ ++ safe_link_context ctx; ++ int result; ++ ++ if (rofs_super(super)) ++ /* do nothing on the read-only file system */ ++ return 0; ++ safe_link_iter_begin(&get_super_private(super)->tree, &ctx); ++ result = 0; ++ do { ++ result = safe_link_iter_next(&ctx); ++ if (safe_link_iter_finished(&ctx) || result == -ENOENT) { ++ result = 0; ++ break; ++ } ++ if (result == 0) ++ result = process_safelink(super, ctx.link, ++ &ctx.sdkey, ctx.oid, ++ ctx.size); ++ } while (result == 0); ++ safe_link_iter_end(&ctx); ++ return result; ++} ++ ++/* Make Linus happy. 
++  Local variables:
++  c-indentation-style: "K&R"
++  mode-name: "LC"
++  c-basic-offset: 8
++  tab-width: 8
++  fill-column: 120
++  scroll-step: 1
++  End:
++*/
+Index: linux-2.6.16/fs/reiser4/safe_link.h
+===================================================================
+--- /dev/null
++++ linux-2.6.16/fs/reiser4/safe_link.h
+@@ -0,0 +1,29 @@
++/* Copyright 2003 by Hans Reiser, licensing governed by
++ * reiser4/README */
++
++/* Safe-links. See safe_link.c for details. */
++
++#if !defined( __FS_SAFE_LINK_H__ )
++#define __FS_SAFE_LINK_H__
++
++#include "tree.h"
++
++int safe_link_grab(reiser4_tree * tree, reiser4_ba_flags_t flags);
++void safe_link_release(reiser4_tree * tree);
++int safe_link_add(struct inode *inode, reiser4_safe_link_t link);
++int safe_link_del(reiser4_tree *, oid_t oid, reiser4_safe_link_t link);
++
++int process_safelinks(struct super_block *super);
++
++/* __FS_SAFE_LINK_H__ */
++#endif
++
++/* Make Linus happy.
++   Local variables:
++   c-indentation-style: "K&R"
++   mode-name: "LC"
++   c-basic-offset: 8
++   tab-width: 8
++   fill-column: 120
++   End:
++*/
+Index: linux-2.6.16/fs/reiser4/seal.c
+===================================================================
+--- /dev/null
++++ linux-2.6.16/fs/reiser4/seal.c
+@@ -0,0 +1,217 @@
++/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
++/* Seals implementation. */
++/* Seals are "weak" tree pointers. They are analogous to tree coords in that
++   they allow bypassing tree traversal. But normal usage of coords implies that
++   node pointed to by coord is locked, whereas seals don't keep a lock (or
++   even a reference) to znode. Instead, each znode contains a version number,
++   increased on each znode modification. This version number is copied into a
++   seal when seal is created. Later, one can "validate" seal by calling
++   seal_validate(). If znode is in cache and its version number is still the
++   same, seal is "pristine" and coord associated with it can be re-used
++   immediately.
++
++   If, on the other hand, znode is out of cache, or it is obviously a different
++   one from the znode seal was initially attached to (for example, it is on
++   a different level, or is being removed from the tree), seal is
++   irreparably invalid ("burned") and tree traversal has to be repeated.
++
++   Otherwise, there is some hope that while znode was modified (and seal was
++   "broken" as a result), key attached to the seal is still in the node. This
++   is checked by first comparing this key with delimiting keys of node and, if
++   key is ok, doing intra-node lookup.
++
++   Znode version is maintained in the following way:
++
++   there is reiser4_tree.znode_epoch counter. Whenever new znode is created,
++   znode_epoch is incremented and its new value is stored in ->version field
++   of new znode. Whenever znode is dirtied (which means it was probably
++   modified), znode_epoch is also incremented and its new value is stored in
++   znode->version. This is done so, because just incrementing znode->version
++   on each update is not enough: it may so happen that znode gets deleted, a
++   new znode is allocated for the same disk block and gets the same version
++   counter, tricking seal code into a false positive.
++*/ ++ ++#include "forward.h" ++#include "debug.h" ++#include "key.h" ++#include "coord.h" ++#include "seal.h" ++#include "plugin/item/item.h" ++#include "plugin/node/node.h" ++#include "jnode.h" ++#include "znode.h" ++#include "super.h" ++ ++static znode *seal_node(const seal_t * seal); ++static int seal_matches(const seal_t * seal, znode * node); ++ ++/* initialise seal. This can be called several times on the same seal. @coord ++ and @key can be NULL. */ ++void seal_init(seal_t * seal /* seal to initialise */ , ++ const coord_t * coord /* coord @seal will be attached to */ , ++ const reiser4_key * key UNUSED_ARG /* key @seal will be ++ * attached to */ ) ++{ ++ assert("nikita-1886", seal != NULL); ++ memset(seal, 0, sizeof *seal); ++ if (coord != NULL) { ++ znode *node; ++ ++ node = coord->node; ++ assert("nikita-1987", node != NULL); ++ spin_lock_znode(node); ++ seal->version = node->version; ++ assert("nikita-1988", seal->version != 0); ++ seal->block = *znode_get_block(node); ++#if REISER4_DEBUG ++ seal->coord1 = *coord; ++ if (key != NULL) ++ seal->key = *key; ++#endif ++ spin_unlock_znode(node); ++ } ++} ++ ++/* finish with seal */ ++void seal_done(seal_t * seal /* seal to clear */ ) ++{ ++ assert("nikita-1887", seal != NULL); ++ seal->version = 0; ++} ++ ++/* true if seal was initialised */ ++int seal_is_set(const seal_t * seal /* seal to query */ ) ++{ ++ assert("nikita-1890", seal != NULL); ++ return seal->version != 0; ++} ++ ++#if REISER4_DEBUG ++/* helper function for seal_validate(). It checks that item at @coord has ++ * expected key. This is to detect cases where node was modified but wasn't ++ * marked dirty. */ ++static inline int check_seal_match(const coord_t * coord /* coord to check */ , ++ const reiser4_key * k /* expected key */ ) ++{ ++ reiser4_key ukey; ++ ++ return (coord->between != AT_UNIT) || ++ /* FIXME-VS: we only can compare keys for items whose units ++ represent exactly one key */ ++ ((coord_is_existing_unit(coord)) ++ && (item_is_extent(coord) ++ || keyeq(k, unit_key_by_coord(coord, &ukey)))) ++ || ((coord_is_existing_unit(coord)) && (item_is_ctail(coord)) ++ && keyge(k, unit_key_by_coord(coord, &ukey))); ++} ++#endif ++ ++/* this is used by seal_validate. It accepts return value of ++ * longterm_lock_znode and returns 1 if it can be interpreted as seal ++ * validation failure. For instance, when longterm_lock_znode returns -EINVAL, ++ * seal_validate returns -E_REPEAT and caller will call tre search. We cannot ++ * do this in longterm_lock_znode(), because sometimes we want to distinguish ++ * between -EINVAL and -E_REPEAT. */ ++static int should_repeat(int return_code) ++{ ++ return return_code == -EINVAL; ++} ++ ++/* (re-)validate seal. ++ ++ Checks whether seal is pristine, and try to revalidate it if possible. ++ ++ If seal was burned, or broken irreparably, return -E_REPEAT. ++ ++ NOTE-NIKITA currently seal_validate() returns -E_REPEAT if key we are ++ looking for is in range of keys covered by the sealed node, but item wasn't ++ found by node ->lookup() method. Alternative is to return -ENOENT in this ++ case, but this would complicate callers logic. 
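Everything above reduces to an epoch check: remember a node's version number
when a position is recorded, and trust that position later only while the
version is unchanged; otherwise re-traverse. A compact user-space rendition
with a single node and no locking (names invented for the sketch):

#include <stdio.h>
#include <stdint.h>

struct node { uint64_t version; int items[8]; };

struct seal { uint64_t version; int slot; };	/* weak pointer into a node */

static uint64_t epoch = 1;			/* cf. znode_epoch */

static void node_dirty(struct node *n)		/* any modification bumps it */
{
	n->version = ++epoch;
}

static struct seal seal_make(const struct node *n, int slot)
{
	struct seal s = { n->version, slot };
	return s;
}

/* 1 = pristine, position reusable; 0 = burned, caller must re-search */
static int seal_check(const struct seal *s, const struct node *n)
{
	return s->version == n->version;
}

int main(void)
{
	struct node n = { 0, { 0 } };
	struct seal s;

	node_dirty(&n);
	s = seal_make(&n, 3);
	printf("before change: %d\n", seal_check(&s, &n));	/* 1 */
	node_dirty(&n);
	printf("after change:  %d\n", seal_check(&s, &n));	/* 0 */
	return 0;
}
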
++ ++*/ ++int seal_validate(seal_t * seal /* seal to validate */ , ++ coord_t * coord /* coord to validate against */ , ++ const reiser4_key * key /* key to validate against */ , ++ lock_handle * lh /* resulting lock handle */ , ++ znode_lock_mode mode /* lock node */ , ++ znode_lock_request request /* locking priority */ ) ++{ ++ znode *node; ++ int result; ++ ++ assert("nikita-1889", seal != NULL); ++ assert("nikita-1881", seal_is_set(seal)); ++ assert("nikita-1882", key != NULL); ++ assert("nikita-1883", coord != NULL); ++ assert("nikita-1884", lh != NULL); ++ assert("nikita-1885", keyeq(&seal->key, key)); ++ assert("nikita-1989", coords_equal(&seal->coord1, coord)); ++ ++ /* obtain znode by block number */ ++ node = seal_node(seal); ++ if (node != NULL) { ++ /* znode was in cache, lock it */ ++ result = longterm_lock_znode(lh, node, mode, request); ++ zput(node); ++ if (result == 0) { ++ if (seal_matches(seal, node)) { ++ /* if seal version and znode version ++ coincide */ ++ ON_DEBUG(coord_update_v(coord)); ++ assert("nikita-1990", ++ node == seal->coord1.node); ++ assert("nikita-1898", ++ WITH_DATA_RET(coord->node, 1, ++ check_seal_match(coord, ++ key))); ++ } else ++ result = RETERR(-E_REPEAT); ++ } ++ if (result != 0) { ++ if (should_repeat(result)) ++ result = RETERR(-E_REPEAT); ++ /* unlock node on failure */ ++ done_lh(lh); ++ } ++ } else { ++ /* znode wasn't in cache */ ++ result = RETERR(-E_REPEAT); ++ } ++ return result; ++} ++ ++/* helpers functions */ ++ ++/* obtain reference to znode seal points to, if in cache */ ++static znode *seal_node(const seal_t * seal /* seal to query */ ) ++{ ++ assert("nikita-1891", seal != NULL); ++ return zlook(current_tree, &seal->block); ++} ++ ++/* true if @seal version and @node version coincide */ ++static int seal_matches(const seal_t * seal /* seal to check */ , ++ znode * node /* node to check */ ) ++{ ++ int result; ++ ++ assert("nikita-1991", seal != NULL); ++ assert("nikita-1993", node != NULL); ++ ++ spin_lock_znode(node); ++ result = (seal->version == node->version); ++ spin_unlock_znode(node); ++ return result; ++} ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ scroll-step: 1 ++ End: ++*/ +Index: linux-2.6.16/fs/reiser4/seal.h +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/seal.h +@@ -0,0 +1,49 @@ ++/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++/* Declaration of seals: "weak" tree pointers. See seal.c for comments. */ ++ ++#ifndef __SEAL_H__ ++#define __SEAL_H__ ++ ++#include "forward.h" ++#include "debug.h" ++#include "dformat.h" ++#include "key.h" ++#include "coord.h" ++ ++/* for __u?? types */ ++/*#include */ ++ ++/* seal. See comment at the top of seal.c */ ++typedef struct seal_s { ++ /* version of znode recorder at the time of seal creation */ ++ __u64 version; ++ /* block number of znode attached to this seal */ ++ reiser4_block_nr block; ++#if REISER4_DEBUG ++ /* coord this seal is attached to. For debugging. */ ++ coord_t coord1; ++ /* key this seal is attached to. For debugging. 
*/ ++ reiser4_key key; ++#endif ++} seal_t; ++ ++extern void seal_init(seal_t *, const coord_t *, const reiser4_key *); ++extern void seal_done(seal_t *); ++extern int seal_is_set(const seal_t *); ++extern int seal_validate(seal_t *, coord_t *, ++ const reiser4_key *, lock_handle *, ++ znode_lock_mode mode, znode_lock_request request); ++ ++/* __SEAL_H__ */ ++#endif ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +Index: linux-2.6.16/fs/reiser4/search.c +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/search.c +@@ -0,0 +1,1611 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++#include "forward.h" ++#include "debug.h" ++#include "dformat.h" ++#include "key.h" ++#include "coord.h" ++#include "seal.h" ++#include "plugin/item/item.h" ++#include "plugin/node/node.h" ++#include "plugin/plugin.h" ++#include "jnode.h" ++#include "znode.h" ++#include "block_alloc.h" ++#include "tree_walk.h" ++#include "tree.h" ++#include "reiser4.h" ++#include "super.h" ++#include "inode.h" ++ ++#include ++ ++static const char *bias_name(lookup_bias bias); ++ ++/* tree searching algorithm, intranode searching algorithms are in ++ plugin/node/ */ ++ ++/* tree lookup cache ++ * ++ * The coord by key cache consists of small list of recently accessed nodes ++ * maintained according to the LRU discipline. Before doing real top-to-down ++ * tree traversal this cache is scanned for nodes that can contain key ++ * requested. ++ * ++ * The efficiency of coord cache depends heavily on locality of reference for ++ * tree accesses. Our user level simulations show reasonably good hit ratios ++ * for coord cache under most loads so far. 
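The coord-by-key cache described above is a small look-aside LRU: scan a
handful of recently used nodes before paying for a full root-to-leaf
descent, promote on hit, evict the least recently used slot on insert. A
user-space sketch of that policy, with a fixed array in place of the
kernel's list_head machinery (illustrative only):

#include <stdio.h>
#include <string.h>

#define SLOTS 4

struct cbk_cache { int node[SLOTS]; int n; };	/* node[0] is most recent */

/* hit: move the node to the front; miss: caller must traverse the tree */
static int cache_lookup(struct cbk_cache *c, int node)
{
	int i;

	for (i = 0; i < c->n; i++) {
		if (c->node[i] == node) {
			memmove(&c->node[1], &c->node[0], i * sizeof c->node[0]);
			c->node[0] = node;
			return 1;
		}
	}
	return 0;
}

/* after a real traversal, remember the node, evicting the LRU slot */
static void cache_add(struct cbk_cache *c, int node)
{
	if (c->n < SLOTS)
		c->n++;
	memmove(&c->node[1], &c->node[0], (c->n - 1) * sizeof c->node[0]);
	c->node[0] = node;
}

int main(void)
{
	struct cbk_cache c = { { 0 }, 0 };

	cache_add(&c, 17);
	printf("%d %d\n", cache_lookup(&c, 17), cache_lookup(&c, 99)); /* 1 0 */
	return 0;
}
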
++ */ ++ ++/* Initialise coord cache slot */ ++static void cbk_cache_init_slot(cbk_cache_slot *slot) ++{ ++ assert("nikita-345", slot != NULL); ++ ++ INIT_LIST_HEAD(&slot->lru); ++ slot->node = NULL; ++} ++ ++/* Initialize coord cache */ ++int cbk_cache_init(cbk_cache *cache /* cache to init */ ) ++{ ++ int i; ++ ++ assert("nikita-346", cache != NULL); ++ ++ cache->slot = ++ kmalloc(sizeof(cbk_cache_slot) * cache->nr_slots, GFP_KERNEL); ++ if (cache->slot == NULL) ++ return RETERR(-ENOMEM); ++ ++ INIT_LIST_HEAD(&cache->lru); ++ for (i = 0; i < cache->nr_slots; ++i) { ++ cbk_cache_init_slot(cache->slot + i); ++ list_add_tail(&((cache->slot + i)->lru), &cache->lru); ++ } ++ rwlock_init(&cache->guard); ++ return 0; ++} ++ ++/* free cbk cache data */ ++void cbk_cache_done(cbk_cache * cache /* cache to release */ ) ++{ ++ assert("nikita-2493", cache != NULL); ++ if (cache->slot != NULL) { ++ kfree(cache->slot); ++ cache->slot = NULL; ++ } ++} ++ ++/* macro to iterate over all cbk cache slots */ ++#define for_all_slots(cache, slot) \ ++ for ((slot) = list_entry((cache)->lru.next, cbk_cache_slot, lru); \ ++ &(cache)->lru != &(slot)->lru; \ ++ (slot) = list_entry(slot->lru.next, cbk_cache_slot, lru)) ++ ++ ++#if REISER4_DEBUG ++/* this function assures that [cbk-cache-invariant] invariant holds */ ++static int cbk_cache_invariant(const cbk_cache *cache) ++{ ++ cbk_cache_slot *slot; ++ int result; ++ int unused; ++ ++ if (cache->nr_slots == 0) ++ return 1; ++ ++ assert("nikita-2469", cache != NULL); ++ unused = 0; ++ result = 1; ++ read_lock(&((cbk_cache *)cache)->guard); ++ for_all_slots(cache, slot) { ++ /* in LRU first go all `used' slots followed by `unused' */ ++ if (unused && (slot->node != NULL)) ++ result = 0; ++ if (slot->node == NULL) ++ unused = 1; ++ else { ++ cbk_cache_slot *scan; ++ ++ /* all cached nodes are different */ ++ scan = slot; ++ while (result) { ++ scan = list_entry(scan->lru.next, cbk_cache_slot, lru); ++ if (&cache->lru == &scan->lru) ++ break; ++ if (slot->node == scan->node) ++ result = 0; ++ } ++ } ++ if (!result) ++ break; ++ } ++ read_unlock(&((cbk_cache *)cache)->guard); ++ return result; ++} ++ ++#endif ++ ++/* Remove references, if any, to @node from coord cache */ ++void cbk_cache_invalidate(const znode * node /* node to remove from cache */ , ++ reiser4_tree * tree /* tree to remove node from */ ) ++{ ++ cbk_cache_slot *slot; ++ cbk_cache *cache; ++ int i; ++ ++ assert("nikita-350", node != NULL); ++ assert("nikita-1479", LOCK_CNT_GTZ(rw_locked_tree)); ++ ++ cache = &tree->cbk_cache; ++ assert("nikita-2470", cbk_cache_invariant(cache)); ++ ++ write_lock(&(cache->guard)); ++ for (i = 0, slot = cache->slot; i < cache->nr_slots; ++i, ++slot) { ++ if (slot->node == node) { ++ list_move_tail(&slot->lru, &cache->lru); ++ slot->node = NULL; ++ break; ++ } ++ } ++ write_unlock(&(cache->guard)); ++ assert("nikita-2471", cbk_cache_invariant(cache)); ++} ++ ++/* add to the cbk-cache in the "tree" information about "node". This ++ can actually be update of existing slot in a cache. 
*/ ++static void cbk_cache_add(const znode *node /* node to add to the cache */ ) ++{ ++ cbk_cache *cache; ++ cbk_cache_slot *slot; ++ int i; ++ ++ assert("nikita-352", node != NULL); ++ ++ cache = &znode_get_tree(node)->cbk_cache; ++ assert("nikita-2472", cbk_cache_invariant(cache)); ++ ++ if (cache->nr_slots == 0) ++ return; ++ ++ write_lock(&(cache->guard)); ++ /* find slot to update/add */ ++ for (i = 0, slot = cache->slot; i < cache->nr_slots; ++i, ++slot) { ++ /* oops, this node is already in a cache */ ++ if (slot->node == node) ++ break; ++ } ++ /* if all slots are used, reuse least recently used one */ ++ if (i == cache->nr_slots) { ++ slot = list_entry(cache->lru.prev, cbk_cache_slot, lru); ++ slot->node = (znode *) node; ++ } ++ list_move(&slot->lru, &cache->lru); ++ write_unlock(&(cache->guard)); ++ assert("nikita-2473", cbk_cache_invariant(cache)); ++} ++ ++static int setup_delimiting_keys(cbk_handle * h); ++static lookup_result coord_by_handle(cbk_handle * handle); ++static lookup_result traverse_tree(cbk_handle * h); ++static int cbk_cache_search(cbk_handle * h); ++ ++static level_lookup_result cbk_level_lookup(cbk_handle * h); ++static level_lookup_result cbk_node_lookup(cbk_handle * h); ++ ++/* helper functions */ ++ ++static void update_stale_dk(reiser4_tree * tree, znode * node); ++ ++/* release parent node during traversal */ ++static void put_parent(cbk_handle * h); ++/* check consistency of fields */ ++static int sanity_check(cbk_handle * h); ++/* release resources in handle */ ++static void hput(cbk_handle * h); ++ ++static level_lookup_result search_to_left(cbk_handle * h); ++ ++/* pack numerous (numberous I should say) arguments of coord_by_key() into ++ * cbk_handle */ ++static cbk_handle *cbk_pack(cbk_handle * handle, ++ reiser4_tree * tree, ++ const reiser4_key * key, ++ coord_t * coord, ++ lock_handle * active_lh, ++ lock_handle * parent_lh, ++ znode_lock_mode lock_mode, ++ lookup_bias bias, ++ tree_level lock_level, ++ tree_level stop_level, ++ __u32 flags, ra_info_t * info) ++{ ++ memset(handle, 0, sizeof *handle); ++ ++ handle->tree = tree; ++ handle->key = key; ++ handle->lock_mode = lock_mode; ++ handle->bias = bias; ++ handle->lock_level = lock_level; ++ handle->stop_level = stop_level; ++ handle->coord = coord; ++ /* set flags. See comment in tree.h:cbk_flags */ ++ handle->flags = flags | CBK_TRUST_DK | CBK_USE_CRABLOCK; ++ ++ handle->active_lh = active_lh; ++ handle->parent_lh = parent_lh; ++ handle->ra_info = info; ++ return handle; ++} ++ ++/* main tree lookup procedure ++ ++ Check coord cache. If key we are looking for is not found there, call cbk() ++ to do real tree traversal. ++ ++ As we have extents on the twig level, @lock_level and @stop_level can ++ be different from LEAF_LEVEL and each other. ++ ++ Thread cannot keep any reiser4 locks (tree, znode, dk spin-locks, or znode ++ long term locks) while calling this. ++*/ ++lookup_result coord_by_key(reiser4_tree * tree /* tree to perform search ++ * in. Usually this tree is ++ * part of file-system ++ * super-block */ , ++ const reiser4_key * key /* key to look for */ , ++ coord_t * coord /* where to store found ++ * position in a tree. Fields ++ * in "coord" are only valid if ++ * coord_by_key() returned ++ * "CBK_COORD_FOUND" */ , ++ lock_handle * lh, /* resulting lock handle */ ++ znode_lock_mode lock_mode /* type of lookup we ++ * want on node. 
Pass ++ * ZNODE_READ_LOCK here ++ * if you only want to ++ * read item found and ++ * ZNODE_WRITE_LOCK if ++ * you want to modify ++ * it */ , ++ lookup_bias bias /* what to return if coord ++ * with exactly the @key is ++ * not in the tree */ , ++ tree_level lock_level /* tree level where to start ++ * taking @lock type of ++ * locks */ , ++ tree_level stop_level /* tree level to stop. Pass ++ * LEAF_LEVEL or TWIG_LEVEL ++ * here Item being looked ++ * for has to be between ++ * @lock_level and ++ * @stop_level, inclusive */ , ++ __u32 flags /* search flags */ , ++ ra_info_t * ++ info ++ /* information about desired tree traversal readahead */ ++ ) ++{ ++ cbk_handle handle; ++ lock_handle parent_lh; ++ lookup_result result; ++ ++ init_lh(lh); ++ init_lh(&parent_lh); ++ ++ assert("nikita-3023", schedulable()); ++ ++ assert("nikita-353", tree != NULL); ++ assert("nikita-354", key != NULL); ++ assert("nikita-355", coord != NULL); ++ assert("nikita-356", (bias == FIND_EXACT) ++ || (bias == FIND_MAX_NOT_MORE_THAN)); ++ assert("nikita-357", stop_level >= LEAF_LEVEL); ++ /* no locks can be held during tree traversal */ ++ assert("nikita-2104", lock_stack_isclean(get_current_lock_stack())); ++ ++ cbk_pack(&handle, ++ tree, ++ key, ++ coord, ++ lh, ++ &parent_lh, ++ lock_mode, bias, lock_level, stop_level, flags, info); ++ ++ result = coord_by_handle(&handle); ++ assert("nikita-3247", ++ ergo(!IS_CBKERR(result), coord->node == lh->node)); ++ return result; ++} ++ ++/* like coord_by_key(), but starts traversal from vroot of @object rather than ++ * from tree root. */ ++lookup_result ++object_lookup(struct inode * object, ++ const reiser4_key * key, ++ coord_t * coord, ++ lock_handle * lh, ++ znode_lock_mode lock_mode, ++ lookup_bias bias, ++ tree_level lock_level, ++ tree_level stop_level, __u32 flags, ra_info_t * info) ++{ ++ cbk_handle handle; ++ lock_handle parent_lh; ++ lookup_result result; ++ ++ init_lh(lh); ++ init_lh(&parent_lh); ++ ++ assert("nikita-3023", schedulable()); ++ ++ assert("nikita-354", key != NULL); ++ assert("nikita-355", coord != NULL); ++ assert("nikita-356", (bias == FIND_EXACT) ++ || (bias == FIND_MAX_NOT_MORE_THAN)); ++ assert("nikita-357", stop_level >= LEAF_LEVEL); ++ /* no locks can be held during tree search by key */ ++ assert("nikita-2104", lock_stack_isclean(get_current_lock_stack())); ++ ++ cbk_pack(&handle, ++ object != NULL ? tree_by_inode(object) : current_tree, ++ key, ++ coord, ++ lh, ++ &parent_lh, ++ lock_mode, bias, lock_level, stop_level, flags, info); ++ handle.object = object; ++ ++ result = coord_by_handle(&handle); ++ assert("nikita-3247", ++ ergo(!IS_CBKERR(result), coord->node == lh->node)); ++ return result; ++} ++ ++/* lookup by cbk_handle. Common part of coord_by_key() and object_lookup(). */ ++static lookup_result coord_by_handle(cbk_handle * handle) ++{ ++ /* ++ * first check cbk_cache (which is look-aside cache for our tree) and ++ * of this fails, start traversal. ++ */ ++ /* first check whether "key" is in cache of recent lookups. */ ++ if (cbk_cache_search(handle) == 0) ++ return handle->result; ++ else ++ return traverse_tree(handle); ++} ++ ++/* Execute actor for each item (or unit, depending on @through_units_p), ++ starting from @coord, right-ward, until either: ++ ++ - end of the tree is reached ++ - unformatted node is met ++ - error occurred ++ - @actor returns 0 or less ++ ++ Error code, or last actor return value is returned. 
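The actor convention spelled out above (the callback returns a positive
value to continue, zero or negative to stop, and the iterator returns
whatever ended the walk) is the whole contract. A user-space rendering with
an array in place of the tree, all names invented:

#include <stdio.h>

/* actor contract: > 0 keep going, <= 0 stop and return that value */
typedef int (*actor_t)(int item, void *arg);

static int iterate(const int *items, int n, actor_t actor, void *arg)
{
	int i, ret = 0;

	for (i = 0; i < n; i++) {
		ret = actor(items[i], arg);
		if (ret <= 0)
			break;		/* error, or the actor asked to stop */
	}
	return ret;
}

/* sum items, stopping with 0 once the running total reaches 10 */
static int sum_until(int item, void *arg)
{
	int *sum = arg;

	*sum += item;
	return *sum < 10 ? 1 : 0;
}

int main(void)
{
	int items[] = { 3, 4, 5, 6 }, sum = 0;

	iterate(items, 4, sum_until, &sum);
	printf("%d\n", sum);		/* 12: stopped after the third item */
	return 0;
}
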
++ ++ This is used by plugin/dir/hashe_dir.c:find_entry() to move through ++ sequence of entries with identical keys and alikes. ++*/ ++int iterate_tree(reiser4_tree * tree /* tree to scan */ , ++ coord_t * coord /* coord to start from */ , ++ lock_handle * lh /* lock handle to start with and to ++ * update along the way */ , ++ tree_iterate_actor_t actor /* function to call on each ++ * item/unit */ , ++ void *arg /* argument to pass to @actor */ , ++ znode_lock_mode mode /* lock mode on scanned nodes */ , ++ int through_units_p /* call @actor on each item or on each ++ * unit */ ) ++{ ++ int result; ++ ++ assert("nikita-1143", tree != NULL); ++ assert("nikita-1145", coord != NULL); ++ assert("nikita-1146", lh != NULL); ++ assert("nikita-1147", actor != NULL); ++ ++ result = zload(coord->node); ++ coord_clear_iplug(coord); ++ if (result != 0) ++ return result; ++ if (!coord_is_existing_unit(coord)) { ++ zrelse(coord->node); ++ return -ENOENT; ++ } ++ while ((result = actor(tree, coord, lh, arg)) > 0) { ++ /* move further */ ++ if ((through_units_p && coord_next_unit(coord)) || ++ (!through_units_p && coord_next_item(coord))) { ++ do { ++ lock_handle couple; ++ ++ /* move to the next node */ ++ init_lh(&couple); ++ result = ++ reiser4_get_right_neighbor(&couple, ++ coord->node, ++ (int)mode, ++ GN_CAN_USE_UPPER_LEVELS); ++ zrelse(coord->node); ++ if (result == 0) { ++ ++ result = zload(couple.node); ++ if (result != 0) { ++ done_lh(&couple); ++ return result; ++ } ++ ++ coord_init_first_unit(coord, ++ couple.node); ++ done_lh(lh); ++ move_lh(lh, &couple); ++ } else ++ return result; ++ } while (node_is_empty(coord->node)); ++ } ++ ++ assert("nikita-1149", coord_is_existing_unit(coord)); ++ } ++ zrelse(coord->node); ++ return result; ++} ++ ++/* return locked uber znode for @tree */ ++int get_uber_znode(reiser4_tree * tree, znode_lock_mode mode, ++ znode_lock_request pri, lock_handle * lh) ++{ ++ int result; ++ ++ result = longterm_lock_znode(lh, tree->uber, mode, pri); ++ return result; ++} ++ ++/* true if @key is strictly within @node ++ ++ we are looking for possibly non-unique key and it is item is at the edge of ++ @node. May be it is in the neighbor. ++*/ ++static int znode_contains_key_strict(znode * node /* node to check key ++ * against */ , ++ const reiser4_key * ++ key /* key to check */ , ++ int isunique) ++{ ++ int answer; ++ ++ assert("nikita-1760", node != NULL); ++ assert("nikita-1722", key != NULL); ++ ++ if (keyge(key, &node->rd_key)) ++ return 0; ++ ++ answer = keycmp(&node->ld_key, key); ++ ++ if (isunique) ++ return answer != GREATER_THAN; ++ else ++ return answer == LESS_THAN; ++} ++ ++/* ++ * Virtual Root (vroot) code. ++ * ++ * For given file system object (e.g., regular file or directory) let's ++ * define its "virtual root" as lowest in the tree (that is, furtherest ++ * from the tree root) node such that all body items of said object are ++ * located in a tree rooted at this node. ++ * ++ * Once vroot of object is found all tree lookups for items within body of ++ * this object ("object lookups") can be started from its vroot rather ++ * than from real root. This has following advantages: ++ * ++ * 1. amount of nodes traversed during lookup (and, hence, amount of ++ * key comparisons made) decreases, and ++ * ++ * 2. contention on tree root is decreased. This latter was actually ++ * motivating reason behind vroot, because spin lock of root node, ++ * which is taken when acquiring long-term lock on root node is the ++ * hottest lock in the reiser4. 
++ * ++ * How to find vroot. ++ * ++ * When vroot of object F is not yet determined, all object lookups start ++ * from the root of the tree. At each tree level during traversal we have ++ * a node N such that a key we are looking for (which is the key inside ++ * object's body) is located within N. In function handle_vroot() called ++ * from cbk_level_lookup() we check whether N is possible vroot for ++ * F. Check is trivial---if neither leftmost nor rightmost item of N ++ * belongs to F (and we already have helpful ->owns_item() method of ++ * object plugin for this), then N is possible vroot of F. This, of ++ * course, relies on the assumption that each object occupies contiguous ++ * range of keys in the tree. ++ * ++ * Thus, traversing tree downward and checking each node as we go, we can ++ * find lowest such node, which, by definition, is vroot. ++ * ++ * How to track vroot. ++ * ++ * Nohow. If actual vroot changes, next object lookup will just restart ++ * from the actual tree root, refreshing object's vroot along the way. ++ * ++ */ ++ ++/* ++ * Check whether @node is possible vroot of @object. ++ */ ++static void handle_vroot(struct inode *object, znode * node) ++{ ++ file_plugin *fplug; ++ coord_t coord; ++ ++ fplug = inode_file_plugin(object); ++ assert("nikita-3353", fplug != NULL); ++ assert("nikita-3354", fplug->owns_item != NULL); ++ ++ if (unlikely(node_is_empty(node))) ++ return; ++ ++ coord_init_first_unit(&coord, node); ++ /* ++ * if leftmost item of @node belongs to @object, we cannot be sure ++ * that @node is vroot of @object, because, some items of @object are ++ * probably in the sub-tree rooted at the left neighbor of @node. ++ */ ++ if (fplug->owns_item(object, &coord)) ++ return; ++ coord_init_last_unit(&coord, node); ++ /* mutatis mutandis for the rightmost item */ ++ if (fplug->owns_item(object, &coord)) ++ return; ++ /* otherwise, @node is possible vroot of @object */ ++ inode_set_vroot(object, node); ++} ++ ++/* ++ * helper function used by traverse tree to start tree traversal not from the ++ * tree root, but from @h->object's vroot, if possible. ++ */ ++static int prepare_object_lookup(cbk_handle * h) ++{ ++ znode *vroot; ++ int result; ++ ++ vroot = inode_get_vroot(h->object); ++ if (vroot == NULL) { ++ /* ++ * object doesn't have known vroot, start from real tree root. ++ */ ++ return LOOKUP_CONT; ++ } ++ ++ h->level = znode_get_level(vroot); ++ /* take a long-term lock on vroot */ ++ h->result = longterm_lock_znode(h->active_lh, vroot, ++ cbk_lock_mode(h->level, h), ++ ZNODE_LOCK_LOPRI); ++ result = LOOKUP_REST; ++ if (h->result == 0) { ++ int isunique; ++ int inside; ++ ++ isunique = h->flags & CBK_UNIQUE; ++ /* check that key is inside vroot */ ++ read_lock_dk(h->tree); ++ inside = (znode_contains_key_strict(vroot, h->key, isunique) && ++ !ZF_ISSET(vroot, JNODE_HEARD_BANSHEE)); ++ read_unlock_dk(h->tree); ++ if (inside) { ++ h->result = zload(vroot); ++ if (h->result == 0) { ++ /* search for key in vroot. */ ++ result = cbk_node_lookup(h); ++ zrelse(vroot); /*h->active_lh->node); */ ++ if (h->active_lh->node != vroot) { ++ result = LOOKUP_REST; ++ } else if (result == LOOKUP_CONT) { ++ move_lh(h->parent_lh, h->active_lh); ++ h->flags &= ~CBK_DKSET; ++ } ++ } ++ } ++ } else ++ /* long-term locking failed. Restart. 
*/ ++ ; ++ ++ zput(vroot); ++ ++ if (IS_CBKERR(h->result) || result == LOOKUP_REST) ++ hput(h); ++ return result; ++} ++ ++/* main function that handles common parts of tree traversal: starting ++ (fake znode handling), restarts, error handling, completion */ ++static lookup_result traverse_tree(cbk_handle * h /* search handle */ ) ++{ ++ int done; ++ int iterations; ++ int vroot_used; ++ ++ assert("nikita-365", h != NULL); ++ assert("nikita-366", h->tree != NULL); ++ assert("nikita-367", h->key != NULL); ++ assert("nikita-368", h->coord != NULL); ++ assert("nikita-369", (h->bias == FIND_EXACT) ++ || (h->bias == FIND_MAX_NOT_MORE_THAN)); ++ assert("nikita-370", h->stop_level >= LEAF_LEVEL); ++ assert("nikita-2949", !(h->flags & CBK_DKSET)); ++ assert("zam-355", lock_stack_isclean(get_current_lock_stack())); ++ ++ done = 0; ++ iterations = 0; ++ vroot_used = 0; ++ ++ /* loop for restarts */ ++ restart: ++ ++ assert("nikita-3024", schedulable()); ++ ++ h->result = CBK_COORD_FOUND; ++ /* connect_znode() needs it */ ++ h->ld_key = *min_key(); ++ h->rd_key = *max_key(); ++ h->flags |= CBK_DKSET; ++ h->error = NULL; ++ ++ if (!vroot_used && h->object != NULL) { ++ vroot_used = 1; ++ done = prepare_object_lookup(h); ++ if (done == LOOKUP_REST) { ++ goto restart; ++ } else if (done == LOOKUP_DONE) ++ return h->result; ++ } ++ if (h->parent_lh->node == NULL) { ++ done = ++ get_uber_znode(h->tree, ZNODE_READ_LOCK, ZNODE_LOCK_LOPRI, ++ h->parent_lh); ++ ++ assert("nikita-1637", done != -E_DEADLOCK); ++ ++ h->block = h->tree->root_block; ++ h->level = h->tree->height; ++ h->coord->node = h->parent_lh->node; ++ ++ if (done != 0) ++ return done; ++ } ++ ++ /* loop descending a tree */ ++ while (!done) { ++ ++ if (unlikely((iterations > REISER4_CBK_ITERATIONS_LIMIT) && ++ IS_POW(iterations))) { ++ warning("nikita-1481", "Too many iterations: %i", ++ iterations); ++ print_key("key", h->key); ++ ++iterations; ++ } else if (unlikely(iterations > REISER4_MAX_CBK_ITERATIONS)) { ++ h->error = ++ "reiser-2018: Too many iterations. Tree corrupted, or (less likely) starvation occurring."; ++ h->result = RETERR(-EIO); ++ break; ++ } ++ switch (cbk_level_lookup(h)) { ++ case LOOKUP_CONT: ++ move_lh(h->parent_lh, h->active_lh); ++ continue; ++ default: ++ wrong_return_value("nikita-372", "cbk_level"); ++ case LOOKUP_DONE: ++ done = 1; ++ break; ++ case LOOKUP_REST: ++ hput(h); ++ /* deadlock avoidance is normal case. */ ++ if (h->result != -E_DEADLOCK) ++ ++iterations; ++ preempt_point(); ++ goto restart; ++ } ++ } ++ /* that's all. The rest is error handling */ ++ if (unlikely(h->error != NULL)) { ++ warning("nikita-373", "%s: level: %i, " ++ "lock_level: %i, stop_level: %i " ++ "lock_mode: %s, bias: %s", ++ h->error, h->level, h->lock_level, h->stop_level, ++ lock_mode_name(h->lock_mode), bias_name(h->bias)); ++ reiser4_print_address("block", &h->block); ++ print_key("key", h->key); ++ print_coord_content("coord", h->coord); ++ } ++ /* `unlikely' error case */ ++ if (unlikely(IS_CBKERR(h->result))) { ++ /* failure. do cleanup */ ++ hput(h); ++ } else { ++ assert("nikita-1605", WITH_DATA_RET ++ (h->coord->node, 1, ++ ergo((h->result == CBK_COORD_FOUND) && ++ (h->bias == FIND_EXACT) && ++ (!node_is_empty(h->coord->node)), ++ coord_is_existing_item(h->coord)))); ++ } ++ return h->result; ++} ++ ++/* find delimiting keys of child ++ ++ Determine left and right delimiting keys for child pointed to by ++ @parent_coord. 
++ ++*/ ++static void find_child_delimiting_keys(znode * parent /* parent znode, passed ++ * locked */ , ++ const coord_t * parent_coord /* coord where ++ * pointer to ++ * child is ++ * stored */ , ++ reiser4_key * ld /* where to store left ++ * delimiting key */ , ++ reiser4_key * rd /* where to store right ++ * delimiting key */ ) ++{ ++ coord_t neighbor; ++ ++ assert("nikita-1484", parent != NULL); ++ assert_rw_locked(&(znode_get_tree(parent)->dk_lock)); ++ ++ coord_dup(&neighbor, parent_coord); ++ ++ if (neighbor.between == AT_UNIT) ++ /* imitate item ->lookup() behavior. */ ++ neighbor.between = AFTER_UNIT; ++ ++ if (coord_set_to_left(&neighbor) == 0) ++ unit_key_by_coord(&neighbor, ld); ++ else { ++ assert("nikita-14851", 0); ++ *ld = *znode_get_ld_key(parent); ++ } ++ ++ coord_dup(&neighbor, parent_coord); ++ if (neighbor.between == AT_UNIT) ++ neighbor.between = AFTER_UNIT; ++ if (coord_set_to_right(&neighbor) == 0) ++ unit_key_by_coord(&neighbor, rd); ++ else ++ *rd = *znode_get_rd_key(parent); ++} ++ ++/* ++ * setup delimiting keys for a child ++ * ++ * @parent parent node ++ * ++ * @coord location in @parent where pointer to @child is ++ * ++ * @child child node ++ */ ++int ++set_child_delimiting_keys(znode * parent, const coord_t * coord, znode * child) ++{ ++ reiser4_tree *tree; ++ ++ assert("nikita-2952", ++ znode_get_level(parent) == znode_get_level(coord->node)); ++ ++ /* fast check without taking dk lock. This is safe, because ++ * JNODE_DKSET is never cleared once set. */ ++ if (!ZF_ISSET(child, JNODE_DKSET)) { ++ tree = znode_get_tree(parent); ++ write_lock_dk(tree); ++ if (likely(!ZF_ISSET(child, JNODE_DKSET))) { ++ find_child_delimiting_keys(parent, coord, ++ &child->ld_key, ++ &child->rd_key); ++ ON_DEBUG(child->ld_key_version = ++ atomic_inc_return(&delim_key_version); ++ child->rd_key_version = ++ atomic_inc_return(&delim_key_version);); ++ ZF_SET(child, JNODE_DKSET); ++ } ++ write_unlock_dk(tree); ++ return 1; ++ } ++ return 0; ++} ++ ++/* Perform tree lookup at one level. This is called from cbk_traverse() ++ function that drives lookup through tree and calls cbk_node_lookup() to ++ perform lookup within one node. ++ ++ See comments in a code. ++*/ ++static level_lookup_result cbk_level_lookup(cbk_handle * h /* search handle */ ) ++{ ++ int ret; ++ int setdk; ++ int ldkeyset = 0; ++ reiser4_key ldkey; ++ reiser4_key key; ++ znode *active; ++ ++ assert("nikita-3025", schedulable()); ++ ++ /* acquire reference to @active node */ ++ active = ++ zget(h->tree, &h->block, h->parent_lh->node, h->level, get_gfp_mask()); ++ ++ if (IS_ERR(active)) { ++ h->result = PTR_ERR(active); ++ return LOOKUP_DONE; ++ } ++ ++ /* lock @active */ ++ h->result = longterm_lock_znode(h->active_lh, ++ active, ++ cbk_lock_mode(h->level, h), ++ ZNODE_LOCK_LOPRI); ++ /* longterm_lock_znode() acquires additional reference to znode (which ++ will be later released by longterm_unlock_znode()). Release ++ reference acquired by zget(). ++ */ ++ zput(active); ++ if (unlikely(h->result != 0)) ++ goto fail_or_restart; ++ ++ setdk = 0; ++ /* if @active is accessed for the first time, setup delimiting keys on ++ it. Delimiting keys are taken from the parent node. See ++ setup_delimiting_keys() for details. 
++ */ ++ if (h->flags & CBK_DKSET) { ++ setdk = setup_delimiting_keys(h); ++ h->flags &= ~CBK_DKSET; ++ } else { ++ znode *parent; ++ ++ parent = h->parent_lh->node; ++ h->result = zload(parent); ++ if (unlikely(h->result != 0)) ++ goto fail_or_restart; ++ ++ if (!ZF_ISSET(active, JNODE_DKSET)) ++ setdk = set_child_delimiting_keys(parent, ++ h->coord, active); ++ else { ++ read_lock_dk(h->tree); ++ find_child_delimiting_keys(parent, h->coord, &ldkey, ++ &key); ++ read_unlock_dk(h->tree); ++ ldkeyset = 1; ++ } ++ zrelse(parent); ++ } ++ ++ /* this is ugly kludge. Reminder: this is necessary, because ++ ->lookup() method returns coord with ->between field probably set ++ to something different from AT_UNIT. ++ */ ++ h->coord->between = AT_UNIT; ++ ++ if (znode_just_created(active) && (h->coord->node != NULL)) { ++ write_lock_tree(h->tree); ++ /* if we are going to load znode right now, setup ++ ->in_parent: coord where pointer to this node is stored in ++ parent. ++ */ ++ coord_to_parent_coord(h->coord, &active->in_parent); ++ write_unlock_tree(h->tree); ++ } ++ ++ /* check connectedness without holding tree lock---false negatives ++ * will be re-checked by connect_znode(), and false positives are ++ * impossible---@active cannot suddenly turn into unconnected ++ * state. */ ++ if (!znode_is_connected(active)) { ++ h->result = connect_znode(h->coord, active); ++ if (unlikely(h->result != 0)) { ++ put_parent(h); ++ goto fail_or_restart; ++ } ++ } ++ ++ jload_prefetch(ZJNODE(active)); ++ ++ if (setdk) ++ update_stale_dk(h->tree, active); ++ ++ /* put_parent() cannot be called earlier, because connect_znode() ++ assumes parent node is referenced; */ ++ put_parent(h); ++ ++ if ((!znode_contains_key_lock(active, h->key) && ++ (h->flags & CBK_TRUST_DK)) ++ || ZF_ISSET(active, JNODE_HEARD_BANSHEE)) { ++ /* 1. key was moved out of this node while this thread was ++ waiting for the lock. Restart. More elaborate solution is ++ to determine where key moved (to the left, or to the right) ++ and try to follow it through sibling pointers. ++ ++ 2. or, node itself is going to be removed from the ++ tree. Release lock and restart. ++ */ ++ h->result = -E_REPEAT; ++ } ++ if (h->result == -E_REPEAT) ++ return LOOKUP_REST; ++ ++ h->result = zload_ra(active, h->ra_info); ++ if (h->result) { ++ return LOOKUP_DONE; ++ } ++ ++ /* sanity checks */ ++ if (sanity_check(h)) { ++ zrelse(active); ++ return LOOKUP_DONE; ++ } ++ ++ /* check that key of leftmost item in the @active is the same as in ++ * its parent */ ++ if (ldkeyset && !node_is_empty(active) && ++ !keyeq(leftmost_key_in_node(active, &key), &ldkey)) { ++ warning("vs-3533", "Keys are inconsistent. 
Fsck?"); ++ print_key("inparent", &ldkey); ++ print_key("inchild", &key); ++ h->result = RETERR(-EIO); ++ zrelse(active); ++ return LOOKUP_DONE; ++ } ++ ++ if (h->object != NULL) ++ handle_vroot(h->object, active); ++ ++ ret = cbk_node_lookup(h); ++ ++ /* h->active_lh->node might change, but active is yet to be zrelsed */ ++ zrelse(active); ++ ++ return ret; ++ ++ fail_or_restart: ++ if (h->result == -E_DEADLOCK) ++ return LOOKUP_REST; ++ return LOOKUP_DONE; ++} ++ ++#if REISER4_DEBUG ++/* check left and right delimiting keys of a znode */ ++void check_dkeys(znode * node) ++{ ++ znode *left; ++ znode *right; ++ ++ read_lock_tree(current_tree); ++ read_lock_dk(current_tree); ++ ++ assert("vs-1710", znode_is_any_locked(node)); ++ assert("vs-1197", ++ !keygt(znode_get_ld_key(node), znode_get_rd_key(node))); ++ ++ left = node->left; ++ right = node->right; ++ ++ if (ZF_ISSET(node, JNODE_LEFT_CONNECTED) && ZF_ISSET(node, JNODE_DKSET) ++ && left != NULL && ZF_ISSET(left, JNODE_DKSET)) ++ /* check left neighbor. Note that left neighbor is not locked, ++ so it might get wrong delimiting keys therefore */ ++ assert("vs-1198", ++ (keyeq(znode_get_rd_key(left), znode_get_ld_key(node)) ++ || ZF_ISSET(left, JNODE_HEARD_BANSHEE))); ++ ++ if (ZF_ISSET(node, JNODE_RIGHT_CONNECTED) && ZF_ISSET(node, JNODE_DKSET) ++ && right != NULL && ZF_ISSET(right, JNODE_DKSET)) ++ /* check right neighbor. Note that right neighbor is not ++ locked, so it might get wrong delimiting keys therefore */ ++ assert("vs-1199", ++ (keyeq(znode_get_rd_key(node), znode_get_ld_key(right)) ++ || ZF_ISSET(right, JNODE_HEARD_BANSHEE))); ++ ++ read_unlock_dk(current_tree); ++ read_unlock_tree(current_tree); ++} ++#endif ++ ++/* true if @key is left delimiting key of @node */ ++static int key_is_ld(znode * node, const reiser4_key * key) ++{ ++ int ld; ++ ++ assert("nikita-1716", node != NULL); ++ assert("nikita-1758", key != NULL); ++ ++ read_lock_dk(znode_get_tree(node)); ++ assert("nikita-1759", znode_contains_key(node, key)); ++ ld = keyeq(znode_get_ld_key(node), key); ++ read_unlock_dk(znode_get_tree(node)); ++ return ld; ++} ++ ++/* Process one node during tree traversal. ++ ++ This is called by cbk_level_lookup(). 
*/ ++static level_lookup_result cbk_node_lookup(cbk_handle * h /* search handle */ ) ++{ ++ /* node plugin of @active */ ++ node_plugin *nplug; ++ /* item plugin of item that was found */ ++ item_plugin *iplug; ++ /* search bias */ ++ lookup_bias node_bias; ++ /* node we are operating upon */ ++ znode *active; ++ /* tree we are searching in */ ++ reiser4_tree *tree; ++ /* result */ ++ int result; ++ ++ assert("nikita-379", h != NULL); ++ ++ active = h->active_lh->node; ++ tree = h->tree; ++ ++ nplug = active->nplug; ++ assert("nikita-380", nplug != NULL); ++ ++ ON_DEBUG(check_dkeys(active)); ++ ++ /* return item from "active" node with maximal key not greater than ++ "key" */ ++ node_bias = h->bias; ++ result = nplug->lookup(active, h->key, node_bias, h->coord); ++ if (unlikely(result != NS_FOUND && result != NS_NOT_FOUND)) { ++ /* error occurred */ ++ h->result = result; ++ return LOOKUP_DONE; ++ } ++ if (h->level == h->stop_level) { ++ /* welcome to the stop level */ ++ assert("nikita-381", h->coord->node == active); ++ if (result == NS_FOUND) { ++ /* success of tree lookup */ ++ if (!(h->flags & CBK_UNIQUE) ++ && key_is_ld(active, h->key)) { ++ return search_to_left(h); ++ } else ++ h->result = CBK_COORD_FOUND; ++ } else { ++ h->result = CBK_COORD_NOTFOUND; ++ } ++ if (!(h->flags & CBK_IN_CACHE)) ++ cbk_cache_add(active); ++ return LOOKUP_DONE; ++ } ++ ++ if (h->level > TWIG_LEVEL && result == NS_NOT_FOUND) { ++ h->error = "not found on internal node"; ++ h->result = result; ++ return LOOKUP_DONE; ++ } ++ ++ assert("vs-361", h->level > h->stop_level); ++ ++ if (handle_eottl(h, &result)) { ++ assert("vs-1674", (result == LOOKUP_DONE || ++ result == LOOKUP_REST)); ++ return result; ++ } ++ ++ /* go down to next level */ ++ check_me("vs-12", zload(h->coord->node) == 0); ++ assert("nikita-2116", item_is_internal(h->coord)); ++ iplug = item_plugin_by_coord(h->coord); ++ iplug->s.internal.down_link(h->coord, h->key, &h->block); ++ zrelse(h->coord->node); ++ --h->level; ++ return LOOKUP_CONT; /* continue */ ++} ++ ++/* scan cbk_cache slots looking for a match for @h */ ++static int cbk_cache_scan_slots(cbk_handle * h /* cbk handle */ ) ++{ ++ level_lookup_result llr; ++ znode *node; ++ reiser4_tree *tree; ++ cbk_cache_slot *slot; ++ cbk_cache *cache; ++ tree_level level; ++ int isunique; ++ const reiser4_key *key; ++ int result; ++ ++ assert("nikita-1317", h != NULL); ++ assert("nikita-1315", h->tree != NULL); ++ assert("nikita-1316", h->key != NULL); ++ ++ tree = h->tree; ++ cache = &tree->cbk_cache; ++ if (cache->nr_slots == 0) ++ /* size of cbk cache was set to 0 by mount time option. */ ++ return RETERR(-ENOENT); ++ ++ assert("nikita-2474", cbk_cache_invariant(cache)); ++ node = NULL; /* to keep gcc happy */ ++ level = h->level; ++ key = h->key; ++ isunique = h->flags & CBK_UNIQUE; ++ result = RETERR(-ENOENT); ++ ++ /* ++ * this is time-critical function and dragons had, hence, been settled ++ * here. ++ * ++ * Loop below scans cbk cache slots trying to find matching node with ++ * suitable range of delimiting keys and located at the h->level. ++ * ++ * Scan is done under cbk cache spin lock that protects slot->node ++ * pointers. If suitable node is found we want to pin it in ++ * memory. But slot->node can point to the node with x_count 0 ++ * (unreferenced). Such node can be recycled at any moment, or can ++ * already be in the process of being recycled (within jput()). ++ * ++ * As we found node in the cbk cache, it means that jput() hasn't yet ++ * called cbk_cache_invalidate(). 
++ * ++ * We acquire reference to the node without holding tree lock, and ++ * later, check node's RIP bit. This avoids races with jput(). ++ */ ++ ++ rcu_read_lock(); ++ read_lock(&((cbk_cache *)cache)->guard); ++ ++ slot = list_entry(cache->lru.next, cbk_cache_slot, lru); ++ slot = list_entry(slot->lru.prev, cbk_cache_slot, lru); ++ BUG_ON(&slot->lru != &cache->lru);/*????*/ ++ while (1) { ++ ++ slot = list_entry(slot->lru.next, cbk_cache_slot, lru); ++ ++ if (&cache->lru != &slot->lru) ++ node = slot->node; ++ else ++ node = NULL; ++ ++ if (unlikely(node == NULL)) ++ break; ++ ++ /* ++ * this is (hopefully) the only place in the code where we are ++ * working with delimiting keys without holding dk lock. This ++ * is fine here, because this is only "guess" anyway---keys ++ * are rechecked under dk lock below. ++ */ ++ if (znode_get_level(node) == level && ++ /* min_key < key < max_key */ ++ znode_contains_key_strict(node, key, isunique)) { ++ zref(node); ++ result = 0; ++ spin_lock_prefetch(&tree->tree_lock); ++ break; ++ } ++ } ++ read_unlock(&((cbk_cache *)cache)->guard); ++ ++ assert("nikita-2475", cbk_cache_invariant(cache)); ++ ++ if (unlikely(result == 0 && ZF_ISSET(node, JNODE_RIP))) ++ result = -ENOENT; ++ ++ rcu_read_unlock(); ++ ++ if (result != 0) { ++ h->result = CBK_COORD_NOTFOUND; ++ return RETERR(-ENOENT); ++ } ++ ++ result = ++ longterm_lock_znode(h->active_lh, node, cbk_lock_mode(level, h), ++ ZNODE_LOCK_LOPRI); ++ zput(node); ++ if (result != 0) ++ return result; ++ result = zload(node); ++ if (result != 0) ++ return result; ++ ++ /* recheck keys */ ++ read_lock_dk(tree); ++ result = (znode_contains_key_strict(node, key, isunique) && ++ !ZF_ISSET(node, JNODE_HEARD_BANSHEE)); ++ read_unlock_dk(tree); ++ if (result) { ++ /* do lookup inside node */ ++ llr = cbk_node_lookup(h); ++ /* if cbk_node_lookup() wandered to another node (due to eottl ++ or non-unique keys), adjust @node */ ++ /*node = h->active_lh->node; */ ++ ++ if (llr != LOOKUP_DONE) { ++ /* restart or continue on the next level */ ++ result = RETERR(-ENOENT); ++ } else if (IS_CBKERR(h->result)) ++ /* io or oom */ ++ result = RETERR(-ENOENT); ++ else { ++ /* good. Either item found or definitely not found. */ ++ result = 0; ++ ++ write_lock(&(cache->guard)); ++ if (slot->node == h->active_lh->node /*node */ ) { ++ /* if this node is still in cbk cache---move ++ its slot to the head of the LRU list. */ ++ list_move(&slot->lru, &cache->lru); ++ } ++ write_unlock(&(cache->guard)); ++ } ++ } else { ++ /* race. While this thread was waiting for the lock, node was ++ rebalanced and item we are looking for, shifted out of it ++ (if it ever was here). ++ ++ Continuing scanning is almost hopeless: node key range was ++ moved to, is almost certainly at the beginning of the LRU ++ list at this time, because it's hot, but restarting ++ scanning from the very beginning is complex. Just return, ++ so that cbk() will be performed. This is not that ++ important, because such races should be rare. Are they? ++ */ ++ result = RETERR(-ENOENT); /* -ERAUGHT */ ++ } ++ zrelse(node); ++ assert("nikita-2476", cbk_cache_invariant(cache)); ++ return result; ++} ++ ++/* look for item with given key in the coord cache ++ ++ This function, called by coord_by_key(), scans "coord cache" (&cbk_cache) ++ which is a small LRU list of znodes accessed lately. For each znode in ++ znode in this list, it checks whether key we are looking for fits into key ++ range covered by this node. 
If so, and in addition, node lies at allowed ++ level (this is to handle extents on a twig level), node is locked, and ++ lookup inside it is performed. ++ ++ we need a measurement of the cost of this cache search compared to the cost ++ of coord_by_key. ++ ++*/ ++static int cbk_cache_search(cbk_handle * h /* cbk handle */ ) ++{ ++ int result = 0; ++ tree_level level; ++ ++ /* add CBK_IN_CACHE to the handle flags. This means that ++ * cbk_node_lookup() assumes that cbk_cache is scanned and would add ++ * found node to the cache. */ ++ h->flags |= CBK_IN_CACHE; ++ for (level = h->stop_level; level <= h->lock_level; ++level) { ++ h->level = level; ++ result = cbk_cache_scan_slots(h); ++ if (result != 0) { ++ done_lh(h->active_lh); ++ done_lh(h->parent_lh); ++ } else { ++ assert("nikita-1319", !IS_CBKERR(h->result)); ++ break; ++ } ++ } ++ h->flags &= ~CBK_IN_CACHE; ++ return result; ++} ++ ++/* type of lock we want to obtain during tree traversal. On stop level ++ we want type of lock user asked for, on upper levels: read lock. */ ++znode_lock_mode cbk_lock_mode(tree_level level, cbk_handle * h) ++{ ++ assert("nikita-382", h != NULL); ++ ++ return (level <= h->lock_level) ? h->lock_mode : ZNODE_READ_LOCK; ++} ++ ++/* update outdated delimiting keys */ ++static void stale_dk(reiser4_tree * tree, znode * node) ++{ ++ znode *right; ++ ++ read_lock_tree(tree); ++ write_lock_dk(tree); ++ right = node->right; ++ ++ if (ZF_ISSET(node, JNODE_RIGHT_CONNECTED) && ++ right && ZF_ISSET(right, JNODE_DKSET) && ++ !keyeq(znode_get_rd_key(node), znode_get_ld_key(right))) ++ znode_set_rd_key(node, znode_get_ld_key(right)); ++ ++ write_unlock_dk(tree); ++ read_unlock_tree(tree); ++} ++ ++/* check for possibly outdated delimiting keys, and update them if ++ * necessary. */ ++static void update_stale_dk(reiser4_tree * tree, znode * node) ++{ ++ znode *right; ++ reiser4_key rd; ++ ++ read_lock_tree(tree); ++ read_lock_dk(tree); ++ rd = *znode_get_rd_key(node); ++ right = node->right; ++ if (unlikely(ZF_ISSET(node, JNODE_RIGHT_CONNECTED) && ++ right && ZF_ISSET(right, JNODE_DKSET) && ++ !keyeq(&rd, znode_get_ld_key(right)))) { ++ assert("nikita-38211", ZF_ISSET(node, JNODE_DKSET)); ++ read_unlock_dk(tree); ++ read_unlock_tree(tree); ++ stale_dk(tree, node); ++ return; ++ } ++ read_unlock_dk(tree); ++ read_unlock_tree(tree); ++} ++ ++/* ++ * handle searches a the non-unique key. ++ * ++ * Suppose that we are looking for an item with possibly non-unique key 100. ++ * ++ * Root node contains two pointers: one to a node with left delimiting key 0, ++ * and another to a node with left delimiting key 100. Item we interested in ++ * may well happen in the sub-tree rooted at the first pointer. ++ * ++ * To handle this search_to_left() is called when search reaches stop ++ * level. This function checks it is _possible_ that item we are looking for ++ * is in the left neighbor (this can be done by comparing delimiting keys) and ++ * if so, tries to lock left neighbor (this is low priority lock, so it can ++ * deadlock, tree traversal is just restarted if it did) and then checks ++ * whether left neighbor actually contains items with our key. ++ * ++ * Note that this is done on the stop level only. It is possible to try such ++ * left-check on each level, but as duplicate keys are supposed to be rare ++ * (very unlikely that more than one node is completely filled with items with ++ * duplicate keys), it sis cheaper to scan to the left on the stop level once. 
++ * ++ */ ++static level_lookup_result search_to_left(cbk_handle * h /* search handle */ ) ++{ ++ level_lookup_result result; ++ coord_t *coord; ++ znode *node; ++ znode *neighbor; ++ ++ lock_handle lh; ++ ++ assert("nikita-1761", h != NULL); ++ assert("nikita-1762", h->level == h->stop_level); ++ ++ init_lh(&lh); ++ coord = h->coord; ++ node = h->active_lh->node; ++ assert("nikita-1763", coord_is_leftmost_unit(coord)); ++ ++ h->result = ++ reiser4_get_left_neighbor(&lh, node, (int)h->lock_mode, ++ GN_CAN_USE_UPPER_LEVELS); ++ neighbor = NULL; ++ switch (h->result) { ++ case -E_DEADLOCK: ++ result = LOOKUP_REST; ++ break; ++ case 0:{ ++ node_plugin *nplug; ++ coord_t crd; ++ lookup_bias bias; ++ ++ neighbor = lh.node; ++ h->result = zload(neighbor); ++ if (h->result != 0) { ++ result = LOOKUP_DONE; ++ break; ++ } ++ ++ nplug = neighbor->nplug; ++ ++ coord_init_zero(&crd); ++ bias = h->bias; ++ h->bias = FIND_EXACT; ++ h->result = ++ nplug->lookup(neighbor, h->key, h->bias, &crd); ++ h->bias = bias; ++ ++ if (h->result == NS_NOT_FOUND) { ++ case -E_NO_NEIGHBOR: ++ h->result = CBK_COORD_FOUND; ++ if (!(h->flags & CBK_IN_CACHE)) ++ cbk_cache_add(node); ++ default: /* some other error */ ++ result = LOOKUP_DONE; ++ } else if (h->result == NS_FOUND) { ++ read_lock_dk(znode_get_tree(neighbor)); ++ h->rd_key = *znode_get_ld_key(node); ++ leftmost_key_in_node(neighbor, &h->ld_key); ++ read_unlock_dk(znode_get_tree(neighbor)); ++ h->flags |= CBK_DKSET; ++ ++ h->block = *znode_get_block(neighbor); ++ /* clear coord -> node so that cbk_level_lookup() ++ wouldn't overwrite parent hint in neighbor. ++ ++ Parent hint was set up by ++ reiser4_get_left_neighbor() ++ */ ++ /* FIXME: why do we have to spinlock here? */ ++ write_lock_tree(znode_get_tree(neighbor)); ++ h->coord->node = NULL; ++ write_unlock_tree(znode_get_tree(neighbor)); ++ result = LOOKUP_CONT; ++ } else { ++ result = LOOKUP_DONE; ++ } ++ if (neighbor != NULL) ++ zrelse(neighbor); ++ } ++ } ++ done_lh(&lh); ++ return result; ++} ++ ++/* debugging aid: return symbolic name of search bias */ ++static const char *bias_name(lookup_bias bias /* bias to get name of */ ) ++{ ++ if (bias == FIND_EXACT) ++ return "exact"; ++ else if (bias == FIND_MAX_NOT_MORE_THAN) ++ return "left-slant"; ++/* else if( bias == RIGHT_SLANT_BIAS ) */ ++/* return "right-bias"; */ ++ else { ++ static char buf[30]; ++ ++ sprintf(buf, "unknown: %i", bias); ++ return buf; ++ } ++} ++ ++#if REISER4_DEBUG ++/* debugging aid: print human readable information about @p */ ++void print_coord_content(const char *prefix /* prefix to print */ , ++ coord_t * p /* coord to print */ ) ++{ ++ reiser4_key key; ++ ++ if (p == NULL) { ++ printk("%s: null\n", prefix); ++ return; ++ } ++ if ((p->node != NULL) && znode_is_loaded(p->node) ++ && coord_is_existing_item(p)) ++ printk("%s: data: %p, length: %i\n", prefix, ++ item_body_by_coord(p), item_length_by_coord(p)); ++ if (znode_is_loaded(p->node)) { ++ item_key_by_coord(p, &key); ++ print_key(prefix, &key); ++ } ++} ++ ++/* debugging aid: print human readable information about @block */ ++void reiser4_print_address(const char *prefix /* prefix to print */ , ++ const reiser4_block_nr * block /* block number to print */ ) ++{ ++ printk("%s: %s\n", prefix, sprint_address(block)); ++} ++#endif ++ ++/* return string containing human readable representation of @block */ ++char *sprint_address(const reiser4_block_nr * ++ block /* block number to print */ ) ++{ ++ static char address[30]; ++ ++ if (block == NULL) ++ sprintf(address, "null"); ++ 
else if (blocknr_is_fake(block)) ++ sprintf(address, "%llx", (unsigned long long)(*block)); ++ else ++ sprintf(address, "%llu", (unsigned long long)(*block)); ++ return address; ++} ++ ++/* release parent node during traversal */ ++static void put_parent(cbk_handle * h /* search handle */ ) ++{ ++ assert("nikita-383", h != NULL); ++ if (h->parent_lh->node != NULL) { ++ longterm_unlock_znode(h->parent_lh); ++ } ++} ++ ++/* helper function used by coord_by_key(): release reference to parent znode ++ stored in handle before processing its child. */ ++static void hput(cbk_handle * h /* search handle */ ) ++{ ++ assert("nikita-385", h != NULL); ++ done_lh(h->parent_lh); ++ done_lh(h->active_lh); ++} ++ ++/* Helper function used by cbk(): update delimiting keys of child node (stored ++ in h->active_lh->node) using key taken from parent on the parent level. */ ++static int setup_delimiting_keys(cbk_handle * h /* search handle */ ) ++{ ++ znode *active; ++ reiser4_tree *tree; ++ ++ assert("nikita-1088", h != NULL); ++ ++ active = h->active_lh->node; ++ ++ /* fast check without taking dk lock. This is safe, because ++ * JNODE_DKSET is never cleared once set. */ ++ if (!ZF_ISSET(active, JNODE_DKSET)) { ++ tree = znode_get_tree(active); ++ write_lock_dk(tree); ++ if (!ZF_ISSET(active, JNODE_DKSET)) { ++ znode_set_ld_key(active, &h->ld_key); ++ znode_set_rd_key(active, &h->rd_key); ++ ZF_SET(active, JNODE_DKSET); ++ } ++ write_unlock_dk(tree); ++ return 1; ++ } ++ return 0; ++} ++ ++/* true if @block makes sense for the @tree. Used to detect corrupted node ++ * pointers */ ++static int ++block_nr_is_correct(reiser4_block_nr * block /* block number to check */ , ++ reiser4_tree * tree /* tree to check against */ ) ++{ ++ assert("nikita-757", block != NULL); ++ assert("nikita-758", tree != NULL); ++ ++ /* check to see if it exceeds the size of the device. */ ++ return reiser4_blocknr_is_sane_for(tree->super, block); ++} ++ ++/* check consistency of fields */ ++static int sanity_check(cbk_handle * h /* search handle */ ) ++{ ++ assert("nikita-384", h != NULL); ++ ++ if (h->level < h->stop_level) { ++ h->error = "Buried under leaves"; ++ h->result = RETERR(-EIO); ++ return LOOKUP_DONE; ++ } else if (!block_nr_is_correct(&h->block, h->tree)) { ++ h->error = "bad block number"; ++ h->result = RETERR(-EIO); ++ return LOOKUP_DONE; ++ } else ++ return 0; ++} ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ scroll-step: 1 ++ End: ++*/ +Index: linux-2.6.16/fs/reiser4/status_flags.c +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/status_flags.c +@@ -0,0 +1,176 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++/* Functions that deal with reiser4 status block, query status and update it, if needed */ ++ ++#include ++#include ++#include ++#include ++#include "debug.h" ++#include "dformat.h" ++#include "status_flags.h" ++#include "super.h" ++ ++/* This is our end I/O handler that marks page uptodate if IO was successful. It also ++ unconditionally unlocks the page, so we can see that io was done. ++ We do not free bio, because we hope to reuse that. 
*/ ++static int reiser4_status_endio(struct bio *bio, unsigned int bytes_done, ++ int err) ++{ ++ if (bio->bi_size) ++ return 1; ++ ++ if (test_bit(BIO_UPTODATE, &bio->bi_flags)) { ++ SetPageUptodate(bio->bi_io_vec->bv_page); ++ } else { ++ ClearPageUptodate(bio->bi_io_vec->bv_page); ++ SetPageError(bio->bi_io_vec->bv_page); ++ } ++ unlock_page(bio->bi_io_vec->bv_page); ++ return 0; ++} ++ ++/* Initialise status code. This is expected to be called from the disk format ++ code. block paremeter is where status block lives. */ ++int reiser4_status_init(reiser4_block_nr block) ++{ ++ struct super_block *sb = reiser4_get_current_sb(); ++ struct reiser4_status *statuspage; ++ struct bio *bio; ++ struct page *page; ++ ++ ++ get_super_private(sb)->status_page = NULL; ++ get_super_private(sb)->status_bio = NULL; ++ ++ page = alloc_pages(GFP_KERNEL, 0); ++ if (!page) ++ return -ENOMEM; ++ ++ bio = bio_alloc(GFP_KERNEL, 1); ++ if (bio != NULL) { ++ bio->bi_sector = block * (sb->s_blocksize >> 9); ++ bio->bi_bdev = sb->s_bdev; ++ bio->bi_io_vec[0].bv_page = page; ++ bio->bi_io_vec[0].bv_len = sb->s_blocksize; ++ bio->bi_io_vec[0].bv_offset = 0; ++ bio->bi_vcnt = 1; ++ bio->bi_size = sb->s_blocksize; ++ bio->bi_end_io = reiser4_status_endio; ++ } else { ++ __free_pages(page, 0); ++ return -ENOMEM; ++ } ++ lock_page(page); ++ submit_bio(READ, bio); ++ blk_run_address_space(get_super_fake(sb)->i_mapping); ++ wait_on_page_locked(page); ++ if (!PageUptodate(page)) { ++ warning("green-2007", ++ "I/O error while tried to read status page\n"); ++ return -EIO; ++ } ++ ++ statuspage = (struct reiser4_status *)kmap_atomic(page, KM_USER0); ++ if (memcmp ++ (statuspage->magic, REISER4_STATUS_MAGIC, ++ sizeof(REISER4_STATUS_MAGIC))) { ++ /* Magic does not match. */ ++ kunmap_atomic((char *)statuspage, KM_USER0); ++ warning("green-2008", "Wrong magic in status block\n"); ++ __free_pages(page, 0); ++ bio_put(bio); ++ return -EINVAL; ++ } ++ kunmap_atomic((char *)statuspage, KM_USER0); ++ ++ get_super_private(sb)->status_page = page; ++ get_super_private(sb)->status_bio = bio; ++ return 0; ++} ++ ++/* Query the status of fs. Returns if the FS can be safely mounted. ++ Also if "status" and "extended" parameters are given, it will fill ++ actual parts of status from disk there. */ ++int reiser4_status_query(u64 * status, u64 * extended) ++{ ++ struct super_block *sb = reiser4_get_current_sb(); ++ struct reiser4_status *statuspage; ++ int retval; ++ ++ if (!get_super_private(sb)->status_page) { // No status page? ++ return REISER4_STATUS_MOUNT_UNKNOWN; ++ } ++ statuspage = (struct reiser4_status *) ++ kmap_atomic(get_super_private(sb)->status_page, KM_USER0); ++ switch ((long)le64_to_cpu(get_unaligned(&statuspage->status))) { // FIXME: this cast is a hack for 32 bit arches to work. ++ case REISER4_STATUS_OK: ++ retval = REISER4_STATUS_MOUNT_OK; ++ break; ++ case REISER4_STATUS_CORRUPTED: ++ retval = REISER4_STATUS_MOUNT_WARN; ++ break; ++ case REISER4_STATUS_DAMAGED: ++ case REISER4_STATUS_DESTROYED: ++ case REISER4_STATUS_IOERROR: ++ retval = REISER4_STATUS_MOUNT_RO; ++ break; ++ default: ++ retval = REISER4_STATUS_MOUNT_UNKNOWN; ++ break; ++ } ++ ++ if (status) ++ *status = le64_to_cpu(get_unaligned(&statuspage->status)); ++ if (extended) ++ *extended = le64_to_cpu(get_unaligned(&statuspage->extended_status)); ++ ++ kunmap_atomic((char *)statuspage, KM_USER0); ++ return retval; ++} ++ ++/* This function should be called when something bad happens (e.g. from reiser4_panic). 
++ It fills the status structure and tries to push it to disk. */ ++int reiser4_status_write(__u64 status, __u64 extended_status, char *message) ++{ ++ struct super_block *sb = reiser4_get_current_sb(); ++ struct reiser4_status *statuspage; ++ struct bio *bio = get_super_private(sb)->status_bio; ++ ++ if (!get_super_private(sb)->status_page) { // No status page? ++ return -1; ++ } ++ statuspage = (struct reiser4_status *) ++ kmap_atomic(get_super_private(sb)->status_page, KM_USER0); ++ ++ put_unaligned(cpu_to_le64(status), &statuspage->status); ++ put_unaligned(cpu_to_le64(extended_status), &statuspage->extended_status); ++ strncpy(statuspage->texterror, message, REISER4_TEXTERROR_LEN); ++ ++ kunmap_atomic((char *)statuspage, KM_USER0); ++ bio->bi_bdev = sb->s_bdev; ++ bio->bi_io_vec[0].bv_page = get_super_private(sb)->status_page; ++ bio->bi_io_vec[0].bv_len = sb->s_blocksize; ++ bio->bi_io_vec[0].bv_offset = 0; ++ bio->bi_vcnt = 1; ++ bio->bi_size = sb->s_blocksize; ++ bio->bi_end_io = reiser4_status_endio; ++ lock_page(get_super_private(sb)->status_page); // Safe as nobody should touch our page. ++ /* We can block now, but we have no other choice anyway */ ++ submit_bio(WRITE, bio); ++ blk_run_address_space(get_super_fake(sb)->i_mapping); ++ return 0; // We do not wait for io to finish. ++} ++ ++/* Frees the page with status and bio structure. Should be called by disk format at umount time */ ++int reiser4_status_finish(void) ++{ ++ struct super_block *sb = reiser4_get_current_sb(); ++ ++ __free_pages(get_super_private(sb)->status_page, 0); ++ get_super_private(sb)->status_page = NULL; ++ bio_put(get_super_private(sb)->status_bio); ++ get_super_private(sb)->status_bio = NULL; ++ return 0; ++} +Index: linux-2.6.16/fs/reiser4/status_flags.h +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/status_flags.h +@@ -0,0 +1,43 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++/* Here we declare structures and flags that store reiser4 status on disk. ++ The status that helps us to find out if the filesystem is valid or if it ++ contains some critical, or not so critical errors */ ++ ++#if !defined( __REISER4_STATUS_FLAGS_H__ ) ++#define __REISER4_STATUS_FLAGS_H__ ++ ++#include "dformat.h" ++/* These are major status flags */ ++#define REISER4_STATUS_OK 0 ++#define REISER4_STATUS_CORRUPTED 0x1 ++#define REISER4_STATUS_DAMAGED 0x2 ++#define REISER4_STATUS_DESTROYED 0x4 ++#define REISER4_STATUS_IOERROR 0x8 ++ ++/* Return values for reiser4_status_query() */ ++#define REISER4_STATUS_MOUNT_OK 0 ++#define REISER4_STATUS_MOUNT_WARN 1 ++#define REISER4_STATUS_MOUNT_RO 2 ++#define REISER4_STATUS_MOUNT_UNKNOWN -1 ++ ++#define REISER4_TEXTERROR_LEN 256 ++ ++#define REISER4_STATUS_MAGIC "ReiSeR4StATusBl" ++/* We probably need to keep its size under sector size which is 512 bytes */ ++struct reiser4_status { ++ char magic[16]; ++ d64 status; /* Current FS state */ ++ d64 extended_status; /* Any additional info that might have sense in addition to "status". E.g. 
++ last sector where io error happened if status is "io error encountered" */ ++ d64 stacktrace[10]; /* Last ten functional calls made (addresses) */ ++ char texterror[REISER4_TEXTERROR_LEN]; /* Any error message if appropriate, otherwise filled with zeroes */ ++}; ++ ++int reiser4_status_init(reiser4_block_nr block); ++int reiser4_status_query(u64 * status, u64 * extended); ++int reiser4_status_write(u64 status, u64 extended_status, char *message); ++int reiser4_status_finish(void); ++ ++#endif +Index: linux-2.6.16/fs/reiser4/super.c +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/super.c +@@ -0,0 +1,313 @@ ++/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++/* Super-block manipulations. */ ++ ++#include "debug.h" ++#include "dformat.h" ++#include "key.h" ++#include "plugin/security/perm.h" ++#include "plugin/space/space_allocator.h" ++#include "plugin/plugin.h" ++#include "tree.h" ++#include "vfs_ops.h" ++#include "super.h" ++#include "reiser4.h" ++ ++#include /* for __u?? */ ++#include /* for struct super_block */ ++ ++ ++static __u64 reserved_for_gid(const struct super_block *super, gid_t gid); ++static __u64 reserved_for_uid(const struct super_block *super, uid_t uid); ++static __u64 reserved_for_root(const struct super_block *super); ++ ++/* Return reiser4-specific part of super block */ ++reiser4_super_info_data *get_super_private_nocheck(const struct super_block *super /* super block ++ * queried */ ) ++{ ++ return (reiser4_super_info_data *) super->s_fs_info; ++} ++ ++/* Return reiser4 fstype: value that is returned in ->f_type field by statfs() */ ++long statfs_type(const struct super_block *super UNUSED_ARG /* super block ++ * queried */ ) ++{ ++ assert("nikita-448", super != NULL); ++ assert("nikita-449", is_reiser4_super(super)); ++ return (long)REISER4_SUPER_MAGIC; ++} ++ ++/* functions to read/modify fields of reiser4_super_info_data */ ++ ++/* get number of blocks in file system */ ++__u64 reiser4_block_count(const struct super_block *super /* super block ++ queried */ ) ++{ ++ assert("vs-494", super != NULL); ++ assert("vs-495", is_reiser4_super(super)); ++ return get_super_private(super)->block_count; ++} ++ ++/* ++ * number of blocks in the current file system ++ */ ++__u64 reiser4_current_block_count(void) ++{ ++ return get_current_super_private()->block_count; ++} ++ ++/* set number of block in filesystem */ ++void reiser4_set_block_count(const struct super_block *super, __u64 nr) ++{ ++ assert("vs-501", super != NULL); ++ assert("vs-502", is_reiser4_super(super)); ++ get_super_private(super)->block_count = nr; ++ /* ++ * The proper calculation of the reserved space counter (%5 of device ++ * block counter) we need a 64 bit division which is missing in Linux ++ * on i386 platform. Because we do not need a precise calculation here ++ * we can replace a div64 operation by this combination of ++ * multiplication and shift: 51. / (2^10) == .0498 . ++ * FIXME: this is a bug. It comes up only for very small filesystems ++ * which probably are never used. Nevertheless, it is a bug. Number of ++ * reserved blocks must be not less than maximal number of blocks which ++ * get grabbed with BA_RESERVED. 
++ */ ++ get_super_private(super)->blocks_reserved = ((nr * 51) >> 10); ++} ++ ++/* amount of blocks used (allocated for data) in file system */ ++__u64 reiser4_data_blocks(const struct super_block *super /* super block ++ queried */ ) ++{ ++ assert("nikita-452", super != NULL); ++ assert("nikita-453", is_reiser4_super(super)); ++ return get_super_private(super)->blocks_used; ++} ++ ++/* set number of block used in filesystem */ ++void reiser4_set_data_blocks(const struct super_block *super, __u64 nr) ++{ ++ assert("vs-503", super != NULL); ++ assert("vs-504", is_reiser4_super(super)); ++ get_super_private(super)->blocks_used = nr; ++} ++ ++/* amount of free blocks in file system */ ++__u64 reiser4_free_blocks(const struct super_block *super /* super block ++ queried */ ) ++{ ++ assert("nikita-454", super != NULL); ++ assert("nikita-455", is_reiser4_super(super)); ++ return get_super_private(super)->blocks_free; ++} ++ ++/* set number of blocks free in filesystem */ ++void reiser4_set_free_blocks(const struct super_block *super, __u64 nr) ++{ ++ assert("vs-505", super != NULL); ++ assert("vs-506", is_reiser4_super(super)); ++ get_super_private(super)->blocks_free = nr; ++} ++ ++/* get mkfs unique identifier */ ++__u32 reiser4_mkfs_id(const struct super_block *super /* super block ++ queried */ ) ++{ ++ assert("vpf-221", super != NULL); ++ assert("vpf-222", is_reiser4_super(super)); ++ return get_super_private(super)->mkfs_id; ++} ++ ++/* amount of free blocks in file system */ ++__u64 reiser4_free_committed_blocks(const struct super_block *super) ++{ ++ assert("vs-497", super != NULL); ++ assert("vs-498", is_reiser4_super(super)); ++ return get_super_private(super)->blocks_free_committed; ++} ++ ++/* amount of blocks in the file system reserved for @uid and @gid */ ++long reiser4_reserved_blocks(const struct super_block *super /* super block ++ queried */ , ++ uid_t uid /* user id */ , ++ gid_t gid /* group id */ ) ++{ ++ long reserved; ++ ++ assert("nikita-456", super != NULL); ++ assert("nikita-457", is_reiser4_super(super)); ++ ++ reserved = 0; ++ if (REISER4_SUPPORT_GID_SPACE_RESERVATION) ++ reserved += reserved_for_gid(super, gid); ++ if (REISER4_SUPPORT_UID_SPACE_RESERVATION) ++ reserved += reserved_for_uid(super, uid); ++ if (REISER4_SUPPORT_ROOT_SPACE_RESERVATION && (uid == 0)) ++ reserved += reserved_for_root(super); ++ return reserved; ++} ++ ++/* get/set value of/to grabbed blocks counter */ ++__u64 reiser4_grabbed_blocks(const struct super_block * super) ++{ ++ assert("zam-512", super != NULL); ++ assert("zam-513", is_reiser4_super(super)); ++ ++ return get_super_private(super)->blocks_grabbed; ++} ++ ++__u64 flush_reserved(const struct super_block * super) ++{ ++ assert("vpf-285", super != NULL); ++ assert("vpf-286", is_reiser4_super(super)); ++ ++ return get_super_private(super)->blocks_flush_reserved; ++} ++ ++/* get/set value of/to counter of fake allocated formatted blocks */ ++__u64 reiser4_fake_allocated(const struct super_block * super) ++{ ++ assert("zam-516", super != NULL); ++ assert("zam-517", is_reiser4_super(super)); ++ ++ return get_super_private(super)->blocks_fake_allocated; ++} ++ ++/* get/set value of/to counter of fake allocated unformatted blocks */ ++__u64 reiser4_fake_allocated_unformatted(const struct super_block * super) ++{ ++ assert("zam-516", super != NULL); ++ assert("zam-517", is_reiser4_super(super)); ++ ++ return get_super_private(super)->blocks_fake_allocated_unformatted; ++} ++ ++/* get/set value of/to counter of clustered blocks */ ++__u64 
reiser4_clustered_blocks(const struct super_block * super) ++{ ++ assert("edward-601", super != NULL); ++ assert("edward-602", is_reiser4_super(super)); ++ ++ return get_super_private(super)->blocks_clustered; ++} ++ ++/* space allocator used by this file system */ ++reiser4_space_allocator *get_space_allocator(const struct super_block * super) ++{ ++ assert("nikita-1965", super != NULL); ++ assert("nikita-1966", is_reiser4_super(super)); ++ return &get_super_private(super)->space_allocator; ++} ++ ++/* return fake inode used to bind formatted nodes in the page cache */ ++struct inode *get_super_fake(const struct super_block *super /* super block ++ queried */ ) ++{ ++ assert("nikita-1757", super != NULL); ++ return get_super_private(super)->fake; ++} ++ ++/* return fake inode used to bind copied on capture nodes in the page cache */ ++struct inode *get_cc_fake(const struct super_block *super /* super block ++ queried */ ) ++{ ++ assert("nikita-1757", super != NULL); ++ return get_super_private(super)->cc; ++} ++ ++/* return fake inode used to bind bitmaps and journlal heads */ ++struct inode *get_bitmap_fake(const struct super_block *super) ++{ ++ assert("nikita-17571", super != NULL); ++ return get_super_private(super)->bitmap; ++} ++ ++/* tree used by this file system */ ++reiser4_tree *get_tree(const struct super_block * super /* super block ++ * queried */ ) ++{ ++ assert("nikita-460", super != NULL); ++ assert("nikita-461", is_reiser4_super(super)); ++ return &get_super_private(super)->tree; ++} ++ ++/* Check that @super is (looks like) reiser4 super block. This is mainly for ++ use in assertions. */ ++int is_reiser4_super(const struct super_block *super /* super block ++ * queried */ ) ++{ ++ return ++ super != NULL && ++ get_super_private(super) != NULL && ++ super->s_op == &(get_super_private(super)->ops.super); ++} ++ ++int reiser4_is_set(const struct super_block *super, reiser4_fs_flag f) ++{ ++ return test_bit((int)f, &get_super_private(super)->fs_flags); ++} ++ ++/* amount of blocks reserved for given group in file system */ ++static __u64 reserved_for_gid(const struct super_block *super UNUSED_ARG /* super ++ * block ++ * queried */ , ++ gid_t gid UNUSED_ARG /* group id */ ) ++{ ++ return 0; ++} ++ ++/* amount of blocks reserved for given user in file system */ ++static __u64 reserved_for_uid(const struct super_block *super UNUSED_ARG /* super ++ block ++ queried */ , ++ uid_t uid UNUSED_ARG /* user id */ ) ++{ ++ return 0; ++} ++ ++/* amount of blocks reserved for super user in file system */ ++static __u64 reserved_for_root(const struct super_block *super UNUSED_ARG /* super ++ block ++ queried */ ) ++{ ++ return 0; ++} ++ ++/* ++ * true if block number @blk makes sense for the file system at @super. ++ */ ++int ++reiser4_blocknr_is_sane_for(const struct super_block *super, ++ const reiser4_block_nr * blk) ++{ ++ reiser4_super_info_data *sbinfo; ++ ++ assert("nikita-2957", super != NULL); ++ assert("nikita-2958", blk != NULL); ++ ++ if (blocknr_is_fake(blk)) ++ return 1; ++ ++ sbinfo = get_super_private(super); ++ return *blk < sbinfo->block_count; ++} ++ ++/* ++ * true, if block number @blk makes sense for the current file system ++ */ ++int reiser4_blocknr_is_sane(const reiser4_block_nr * blk) ++{ ++ return reiser4_blocknr_is_sane_for(reiser4_get_current_sb(), blk); ++} ++ ++/* Make Linus happy. 
++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +Index: linux-2.6.16/fs/reiser4/super.h +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/super.h +@@ -0,0 +1,468 @@ ++/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++/* Super-block functions. See super.c for details. */ ++ ++#if !defined( __REISER4_SUPER_H__ ) ++#define __REISER4_SUPER_H__ ++ ++#include "tree.h" ++#include "entd.h" ++#include "wander.h" ++#include "fsdata.h" ++#include "plugin/object.h" ++#include "plugin/space/space_allocator.h" ++ ++/* ++ * Flush algorithms parameters. ++ */ ++typedef struct { ++ unsigned relocate_threshold; ++ unsigned relocate_distance; ++ unsigned written_threshold; ++ unsigned scan_maxnodes; ++} flush_params; ++ ++typedef enum { ++ /* ++ * True if this file system doesn't support hard-links (multiple names) ++ * for directories: this is default UNIX behavior. ++ * ++ * If hard-links on directoires are not allowed, file system is Acyclic ++ * Directed Graph (modulo dot, and dotdot, of course). ++ * ++ * This is used by reiser4_link(). ++ */ ++ REISER4_ADG = 0, ++ /* ++ * set if all nodes in internal tree have the same node layout plugin. ++ * If so, znode_guess_plugin() will return tree->node_plugin in stead ++ * of guessing plugin by plugin id stored in the node. ++ */ ++ REISER4_ONE_NODE_PLUGIN = 1, ++ /* if set, bsd gid assignment is supported. */ ++ REISER4_BSD_GID = 2, ++ /* [mac]_time are 32 bit in inode */ ++ REISER4_32_BIT_TIMES = 3, ++ /* allow concurrent flushes */ ++ REISER4_MTFLUSH = 4, ++ /* load all bitmap blocks at mount time */ ++ REISER4_DONT_LOAD_BITMAP = 5, ++ /* enforce atomicity during write(2) */ ++ REISER4_ATOMIC_WRITE = 6, ++ /* don't use write barriers in the log writer code. */ ++ REISER4_NO_WRITE_BARRIER = 7 ++ ++} reiser4_fs_flag; ++ ++/* ++ * VFS related operation vectors. 
++ */ ++typedef struct object_ops { ++ struct super_operations super; ++ struct dentry_operations dentry; ++ struct export_operations export; ++} object_ops; ++ ++/* reiser4-specific part of super block ++ ++ Locking ++ ++ Fields immutable after mount: ++ ++ ->oid* ++ ->space* ++ ->default_[ug]id ++ ->mkfs_id ++ ->trace_flags ++ ->debug_flags ++ ->fs_flags ++ ->df_plug ++ ->optimal_io_size ++ ->plug ++ ->flush ++ ->u (bad name) ++ ->txnmgr ++ ->ra_params ++ ->fsuid ++ ->journal_header ++ ->journal_footer ++ ++ Fields protected by ->lnode_guard ++ ++ ->lnode_htable ++ ++ Fields protected by per-super block spin lock ++ ++ ->block_count ++ ->blocks_used ++ ->blocks_free ++ ->blocks_free_committed ++ ->blocks_grabbed ++ ->blocks_fake_allocated_unformatted ++ ->blocks_fake_allocated ++ ->blocks_flush_reserved ++ ->eflushed ++ ->blocknr_hint_default ++ ++ After journal replaying during mount, ++ ++ ->last_committed_tx ++ ++ is protected by ->tmgr.commit_semaphore ++ ++ Invariants involving this data-type: ++ ++ [sb-block-counts] ++ [sb-grabbed] ++ [sb-fake-allocated] ++*/ ++struct reiser4_super_info_data { ++ /* ++ * guard spinlock which protects reiser4 super block fields (currently ++ * blocks_free, blocks_free_committed) ++ */ ++ spinlock_t guard; ++ ++ /* next oid that will be returned by oid_allocate() */ ++ oid_t next_to_use; ++ /* total number of used oids */ ++ oid_t oids_in_use; ++ ++ /* space manager plugin */ ++ reiser4_space_allocator space_allocator; ++ ++ /* reiser4 internal tree */ ++ reiser4_tree tree; ++ ++ /* ++ * default user id used for light-weight files without their own ++ * stat-data. ++ */ ++ uid_t default_uid; ++ ++ /* ++ * default group id used for light-weight files without their own ++ * stat-data. ++ */ ++ gid_t default_gid; ++ ++ /* mkfs identifier generated at mkfs time. */ ++ __u32 mkfs_id; ++ /* amount of blocks in a file system */ ++ __u64 block_count; ++ ++ /* inviolable reserve */ ++ __u64 blocks_reserved; ++ ++ /* amount of blocks used by file system data and meta-data. */ ++ __u64 blocks_used; ++ ++ /* ++ * amount of free blocks. This is "working" free blocks counter. It is ++ * like "working" bitmap, please see block_alloc.c for description. ++ */ ++ __u64 blocks_free; ++ ++ /* ++ * free block count for fs committed state. This is "commit" version of ++ * free block counter. ++ */ ++ __u64 blocks_free_committed; ++ ++ /* ++ * number of blocks reserved for further allocation, for all ++ * threads. ++ */ ++ __u64 blocks_grabbed; ++ ++ /* number of fake allocated unformatted blocks in tree. */ ++ __u64 blocks_fake_allocated_unformatted; ++ ++ /* number of fake allocated formatted blocks in tree. */ ++ __u64 blocks_fake_allocated; ++ ++ /* number of blocks reserved for flush operations. */ ++ __u64 blocks_flush_reserved; ++ ++ /* number of blocks reserved for cluster operations. */ ++ __u64 blocks_clustered; ++ ++ /* unique file-system identifier */ ++ __u32 fsuid; ++ ++ /* file-system wide flags. 
See reiser4_fs_flag enum */ ++ unsigned long fs_flags; ++ ++ /* transaction manager */ ++ txn_mgr tmgr; ++ ++ /* ent thread */ ++ entd_context entd; ++ ++ /* fake inode used to bind formatted nodes */ ++ struct inode *fake; ++ /* inode used to bind bitmaps (and journal heads) */ ++ struct inode *bitmap; ++ /* inode used to bind copied on capture nodes */ ++ struct inode *cc; ++ ++ /* disk layout plugin */ ++ disk_format_plugin *df_plug; ++ ++ /* disk layout specific part of reiser4 super info data */ ++ union { ++ format40_super_info format40; ++ } u; ++ ++ /* value we return in st_blksize on stat(2) */ ++ unsigned long optimal_io_size; ++ ++ /* parameters for the flush algorithm */ ++ flush_params flush; ++ ++ /* pointers to jnodes for journal header and footer */ ++ jnode *journal_header; ++ jnode *journal_footer; ++ ++ journal_location jloc; ++ ++ /* head block number of last committed transaction */ ++ __u64 last_committed_tx; ++ ++ /* ++ * we remember last written location for using as a hint for new block ++ * allocation ++ */ ++ __u64 blocknr_hint_default; ++ ++ /* committed number of files (oid allocator state variable ) */ ++ __u64 nr_files_committed; ++ ++ ra_params_t ra_params; ++ ++ /* ++ * A semaphore for serializing cut tree operation if out-of-free-space: ++ * the only one cut_tree thread is allowed to grab space from reserved ++ * area (it is 5% of disk space) ++ */ ++ struct semaphore delete_sema; ++ /* task owning ->delete_sema */ ++ struct task_struct *delete_sema_owner; ++ ++ /* serialize semaphore */ ++ struct semaphore flush_sema; ++ ++ /* Diskmap's blocknumber */ ++ __u64 diskmap_block; ++ ++ /* What to do in case of error */ ++ int onerror; ++ ++ /* operations for objects on this file system */ ++ object_ops ops; ++ ++ /* ++ * structure to maintain d_cursors. See plugin/file_ops_readdir.c for ++ * more details ++ */ ++ d_cursor_info d_info; ++ ++#ifdef CONFIG_REISER4_BADBLOCKS ++ /* Alternative master superblock offset (in bytes) */ ++ unsigned long altsuper; ++#endif ++ struct repacker *repacker; ++ struct page *status_page; ++ struct bio *status_bio; ++ ++#if REISER4_DEBUG ++ /* ++ * minimum used blocks value (includes super blocks, bitmap blocks and ++ * other fs reserved areas), depends on fs format and fs size. ++ */ ++ __u64 min_blocks_used; ++ ++ /* ++ * when debugging is on, all jnodes (including znodes, bitmaps, etc.) ++ * are kept on a list anchored at sbinfo->all_jnodes. This list is ++ * protected by sbinfo->all_guard spin lock. This lock should be taken ++ * with _irq modifier, because it is also modified from interrupt ++ * contexts (by RCU). ++ */ ++ spinlock_t all_guard; ++ /* list of all jnodes */ ++ struct list_head all_jnodes; ++#endif ++ struct dentry *debugfs_root; ++}; ++ ++extern reiser4_super_info_data *get_super_private_nocheck(const struct ++ super_block *super); ++ ++ ++/* Return reiser4-specific part of super block */ ++static inline reiser4_super_info_data *get_super_private(const struct ++ super_block *super) ++{ ++ assert("nikita-447", super != NULL); ++ ++ return (reiser4_super_info_data *) super->s_fs_info; ++} ++ ++/* get ent context for the @super */ ++static inline entd_context *get_entd_context(struct super_block *super) ++{ ++ return &get_super_private(super)->entd; ++} ++ ++ ++/* "Current" super-block: main super block used during current system ++ call. Reference to this super block is stored in reiser4_context. 
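++ ++ A minimal usage sketch (assumes the caller is already inside an active ++ reiser4_context, i.e. between init_context() and reiser4_exit_context()): ++ ++ struct super_block *sb = reiser4_get_current_sb(); ++ reiser4_super_info_data *sbinfo = get_current_super_private();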
*/ ++static inline struct super_block *reiser4_get_current_sb(void) ++{ ++ return get_current_context()->super; ++} ++ ++/* Reiser4-specific part of "current" super-block: main super block used ++ during current system call. Reference to this super block is stored in ++ reiser4_context. */ ++static inline reiser4_super_info_data *get_current_super_private(void) ++{ ++ return get_super_private(reiser4_get_current_sb()); ++} ++ ++static inline ra_params_t *get_current_super_ra_params(void) ++{ ++ return &(get_current_super_private()->ra_params); ++} ++ ++/* ++ * true, if file system on @super is read-only ++ */ ++static inline int rofs_super(struct super_block *super) ++{ ++ return super->s_flags & MS_RDONLY; ++} ++ ++/* ++ * true, if @tree represents read-only file system ++ */ ++static inline int rofs_tree(reiser4_tree * tree) ++{ ++ return rofs_super(tree->super); ++} ++ ++/* ++ * true, if file system where @inode lives on, is read-only ++ */ ++static inline int rofs_inode(struct inode *inode) ++{ ++ return rofs_super(inode->i_sb); ++} ++ ++/* ++ * true, if file system where @node lives on, is read-only ++ */ ++static inline int rofs_jnode(jnode * node) ++{ ++ return rofs_tree(jnode_get_tree(node)); ++} ++ ++extern __u64 reiser4_current_block_count(void); ++ ++extern void build_object_ops(struct super_block *super, object_ops * ops); ++ ++#define REISER4_SUPER_MAGIC 0x52345362 /* (*(__u32 *)"R4Sb"); */ ++ ++static inline void spin_lock_reiser4_super(reiser4_super_info_data *sbinfo) ++{ ++ spin_lock(&(sbinfo->guard)); ++} ++ ++static inline void spin_unlock_reiser4_super(reiser4_super_info_data *sbinfo) ++{ ++ assert_spin_locked(&(sbinfo->guard)); ++ spin_unlock(&(sbinfo->guard)); ++} ++ ++extern __u64 flush_reserved(const struct super_block *); ++extern int reiser4_is_set(const struct super_block *super, reiser4_fs_flag f); ++extern long statfs_type(const struct super_block *super); ++extern __u64 reiser4_block_count(const struct super_block *super); ++extern void reiser4_set_block_count(const struct super_block *super, __u64 nr); ++extern __u64 reiser4_data_blocks(const struct super_block *super); ++extern void reiser4_set_data_blocks(const struct super_block *super, __u64 nr); ++extern __u64 reiser4_free_blocks(const struct super_block *super); ++extern void reiser4_set_free_blocks(const struct super_block *super, __u64 nr); ++extern __u32 reiser4_mkfs_id(const struct super_block *super); ++ ++extern __u64 reiser4_free_committed_blocks(const struct super_block *super); ++ ++extern __u64 reiser4_grabbed_blocks(const struct super_block *); ++extern __u64 reiser4_fake_allocated(const struct super_block *); ++extern __u64 reiser4_fake_allocated_unformatted(const struct super_block *); ++extern __u64 reiser4_clustered_blocks(const struct super_block *); ++ ++extern long reiser4_reserved_blocks(const struct super_block *super, uid_t uid, ++ gid_t gid); ++ ++extern reiser4_space_allocator *get_space_allocator(const struct super_block ++ *super); ++extern reiser4_oid_allocator *get_oid_allocator(const struct super_block ++ *super); ++extern struct inode *get_super_fake(const struct super_block *super); ++extern struct inode *get_cc_fake(const struct super_block *super); ++extern struct inode *get_bitmap_fake(const struct super_block *super); ++extern reiser4_tree *get_tree(const struct super_block *super); ++extern int is_reiser4_super(const struct super_block *super); ++ ++extern int reiser4_blocknr_is_sane(const reiser4_block_nr * blk); ++extern int reiser4_blocknr_is_sane_for(const struct 
super_block *super, ++ const reiser4_block_nr * blk); ++extern int reiser4_fill_super(struct super_block *s, void *data, int silent); ++extern int reiser4_done_super(struct super_block *s); ++ ++/* steps of fill_super */ ++extern int init_fs_info(struct super_block *); ++extern void done_fs_info(struct super_block *); ++extern int init_super_data(struct super_block *, char *opt_string); ++extern int init_read_super(struct super_block *, int silent); ++extern int init_root_inode(struct super_block *); ++ ++ ++/* Maximal possible object id. */ ++#define ABSOLUTE_MAX_OID ((oid_t)~0) ++ ++#define OIDS_RESERVED ( 1 << 16 ) ++int oid_init_allocator(struct super_block *, oid_t nr_files, oid_t next); ++oid_t oid_allocate(struct super_block *); ++int oid_release(struct super_block *, oid_t); ++oid_t oid_next(const struct super_block *); ++void oid_count_allocated(void); ++void oid_count_released(void); ++long oids_used(const struct super_block *); ++ ++#if REISER4_DEBUG ++void print_fs_info(const char *prefix, const struct super_block *); ++#endif ++ ++extern void destroy_reiser4_cache(kmem_cache_t **); ++ ++extern struct super_operations reiser4_super_operations; ++extern struct export_operations reiser4_export_operations; ++extern struct dentry_operations reiser4_dentry_operations; ++extern struct dentry *reiser4_debugfs_root; ++ ++/* __REISER4_SUPER_H__ */ ++#endif ++ ++/* ++ * Local variables: ++ * c-indentation-style: "K&R" ++ * mode-name: "LC" ++ * c-basic-offset: 8 ++ * tab-width: 8 ++ * fill-column: 120 ++ * End: ++ */ +Index: linux-2.6.16/fs/reiser4/super_ops.c +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/super_ops.c +@@ -0,0 +1,721 @@ ++/* Copyright 2005 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++#include "inode.h" ++#include "page_cache.h" ++#include "ktxnmgrd.h" ++#include "flush.h" ++#include "safe_link.h" ++ ++#include ++#include ++#include ++#include ++#include ++ ++/* slab cache for inodes */ ++static kmem_cache_t *inode_cache; ++ ++/** ++ * init_once - constructor for reiser4 inodes ++ * @obj: inode to be initialized ++ * @cache: cache @obj belongs to ++ * @flags: SLAB flags ++ * ++ * Initialization function to be called when a new page is allocated by the ++ * reiser4 inode cache. It is set on inode cache creation. ++ */ ++static void init_once(void *obj, kmem_cache_t *cache, unsigned long flags) ++{ ++ reiser4_inode_object *info; ++ ++ info = obj; ++ ++ if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) == ++ SLAB_CTOR_CONSTRUCTOR) { ++ /* initialize vfs inode */ ++ inode_init_once(&info->vfs_inode); ++ ++ /* ++ * initialize reiser4 specific part of inode. ++ * NOTE-NIKITA add here initializations for locks, list heads, ++ * etc. that will be added to our private inode part. ++ */ ++ INIT_LIST_HEAD(get_readdir_list(&info->vfs_inode)); ++ /* init semaphore which is used during inode loading */ ++ loading_init_once(&info->p); ++ INIT_RADIX_TREE(jnode_tree_by_reiser4_inode(&info->p), ++ GFP_ATOMIC); ++#if REISER4_DEBUG ++ info->p.nr_jnodes = 0; ++#endif ++ } ++} ++ ++/** ++ * init_inodes - create inode cache ++ * ++ * Initializes slab cache of inodes. It is part of reiser4 module initialization.
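++ * ++ * The cache created here is paired with done_inodes() below, which releases ++ * it via destroy_reiser4_cache(); every init_*() step in this file follows ++ * the same create/destroy pattern driven by init_reiser4().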
++ */ ++static int init_inodes(void) ++{ ++ inode_cache = kmem_cache_create("reiser4_inode", ++ sizeof(reiser4_inode_object), ++ 0, ++ SLAB_HWCACHE_ALIGN | ++ SLAB_RECLAIM_ACCOUNT, init_once, NULL); ++ if (inode_cache == NULL) ++ return RETERR(-ENOMEM); ++ return 0; ++} ++ ++/** ++ * done_inodes - delete inode cache ++ * ++ * This is called on reiser4 module unloading or system shutdown. ++ */ ++static void done_inodes(void) ++{ ++ destroy_reiser4_cache(&inode_cache); ++} ++ ++/** ++ * reiser4_alloc_inode - alloc_inode of super operations ++ * @super: super block new inode is allocated for ++ * ++ * Allocates new inode, initializes reiser4 specific part of it. ++ */ ++static struct inode *reiser4_alloc_inode(struct super_block *super) ++{ ++ reiser4_inode_object *obj; ++ ++ assert("nikita-1696", super != NULL); ++ obj = kmem_cache_alloc(inode_cache, SLAB_KERNEL); ++ if (obj != NULL) { ++ reiser4_inode *info; ++ ++ info = &obj->p; ++ ++ info->hset = info->pset = plugin_set_get_empty(); ++ info->extmask = 0; ++ info->locality_id = 0ull; ++ info->plugin_mask = 0; ++#if !REISER4_INO_IS_OID ++ info->oid_hi = 0; ++#endif ++ seal_init(&info->sd_seal, NULL, NULL); ++ coord_init_invalid(&info->sd_coord, NULL); ++ info->flags = 0; ++ spin_lock_init(&info->guard); ++ /* this deals with info's loading semaphore */ ++ loading_alloc(info); ++ info->vroot = UBER_TREE_ADDR; ++ return &obj->vfs_inode; ++ } else ++ return NULL; ++} ++ ++/** ++ * reiser4_destroy_inode - destroy_inode of super operations ++ * @inode: inode being destroyed ++ * ++ * Puts reiser4 specific portion of inode, frees memory occupied by inode. ++ */ ++static void reiser4_destroy_inode(struct inode *inode) ++{ ++ reiser4_inode *info; ++ ++ info = reiser4_inode_data(inode); ++ ++ assert("vs-1220", inode_has_no_jnodes(info)); ++ ++ if (!is_bad_inode(inode) && is_inode_loaded(inode)) { ++ file_plugin *fplug = inode_file_plugin(inode); ++ if (fplug->destroy_inode != NULL) ++ fplug->destroy_inode(inode); ++ } ++ dispose_cursors(inode); ++ if (info->pset) ++ plugin_set_put(info->pset); ++ ++ /* ++ * cannot add similar assertion about ->i_list as prune_icache return ++ * inode into slab with dangling ->list.{next,prev}. This is safe, ++ * because they are re-initialized in the new_inode(). ++ */ ++ assert("nikita-2895", list_empty(&inode->i_dentry)); ++ assert("nikita-2896", hlist_unhashed(&inode->i_hash)); ++ assert("nikita-2898", list_empty_careful(get_readdir_list(inode))); ++ ++ /* this deals with info's loading semaphore */ ++ loading_destroy(info); ++ ++ kmem_cache_free(inode_cache, ++ container_of(info, reiser4_inode_object, p)); ++} ++ ++/** ++ * reiser4_dirty_inode - dirty_inode of super operations ++ * @inode: inode being dirtied ++ * ++ * Updates stat data. ++ */ ++static void reiser4_dirty_inode(struct inode *inode) ++{ ++ int result; ++ ++ if (!is_in_reiser4_context()) ++ return; ++ assert("", !IS_RDONLY(inode)); ++ assert("", (inode_file_plugin(inode)->estimate.update(inode) <= ++ get_current_context()->grabbed_blocks)); ++ ++ result = reiser4_update_sd(inode); ++ if (result) ++ warning("", "failed to dirty inode for %llu: %d", ++ get_inode_oid(inode), result); ++} ++ ++/** ++ * reiser4_delete_inode - delete_inode of super operations ++ * @inode: inode to delete ++ * ++ * Calls file plugin's delete_object method to delete object items from ++ * filesystem tree and calls clear_inode. 
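++ * ++ * Note that, as in reiser4_destroy_inode() above, the plugin method is only ++ * dispatched when the inode is fully loaded and the plugin actually provides ++ * delete_object (see the checks in the body below).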
++ */ ++static void reiser4_delete_inode(struct inode *inode) ++{ ++ reiser4_context *ctx; ++ file_plugin *fplug; ++ ++ ctx = init_context(inode->i_sb); ++ if (IS_ERR(ctx)) { ++ warning("vs-15", "failed to init context"); ++ return; ++ } ++ ++ if (is_inode_loaded(inode)) { ++ fplug = inode_file_plugin(inode); ++ if (fplug != NULL && fplug->delete_object != NULL) ++ fplug->delete_object(inode); ++ } ++ ++ inode->i_blocks = 0; ++ clear_inode(inode); ++ reiser4_exit_context(ctx); ++} ++ ++/** ++ * reiser4_put_super - put_super of super operations ++ * @super: super block to free ++ * ++ * Stops daemons, releases resources; umounts, in short. ++ */ ++static void reiser4_put_super(struct super_block *super) ++{ ++ reiser4_super_info_data *sbinfo; ++ reiser4_context *ctx; ++ ++ sbinfo = get_super_private(super); ++ assert("vs-1699", sbinfo); ++ ++ debugfs_remove(sbinfo->tmgr.debugfs_atom_count); ++ debugfs_remove(sbinfo->tmgr.debugfs_id_count); ++ debugfs_remove(sbinfo->debugfs_root); ++ ++ ctx = init_context(super); ++ if (IS_ERR(ctx)) { ++ warning("vs-17", "failed to init context"); ++ return; ++ } ++ ++ /* let the disk format plugin free its resources */ ++ if (get_super_private(super)->df_plug->release) ++ get_super_private(super)->df_plug->release(super); ++ ++ done_formatted_fake(super); ++ ++ /* stop daemons: ktxnmgr and entd */ ++ done_entd(super); ++ done_ktxnmgrd(super); ++ done_txnmgr(&sbinfo->tmgr); ++ ++ done_fs_info(super); ++ reiser4_exit_context(ctx); ++} ++ ++/** ++ * reiser4_write_super - write_super of super operations ++ * @super: super block to write ++ * ++ * Captures the znode associated with the super block, commits all ++ * transactions. ++ */ ++static void reiser4_write_super(struct super_block *super) ++{ ++ int ret; ++ reiser4_context *ctx; ++ ++ assert("vs-1700", !rofs_super(super)); ++ ++ ctx = init_context(super); ++ if (IS_ERR(ctx)) { ++ warning("vs-16", "failed to init context"); ++ return; ++ } ++ ++ ret = capture_super_block(super); ++ if (ret != 0) ++ warning("vs-1701", ++ "capture_super_block failed in write_super: %d", ret); ++ ret = txnmgr_force_commit_all(super, 0); ++ if (ret != 0) ++ warning("jmacd-77113", ++ "txn_force failed in write_super: %d", ret); ++ ++ super->s_dirt = 0; ++ ++ reiser4_exit_context(ctx); ++} ++ ++/** ++ * reiser4_statfs - statfs of super operations ++ * @super: super block of the file system being queried ++ * @statfs: buffer to fill with statistics ++ * ++ * Returns information about the filesystem. ++ */ ++static int reiser4_statfs(struct super_block *super, struct kstatfs *statfs) ++{ ++ sector_t total; ++ sector_t reserved; ++ sector_t free; ++ sector_t forroot; ++ sector_t deleted; ++ reiser4_context *ctx; ++ ++ assert("nikita-408", super != NULL); ++ assert("nikita-409", statfs != NULL); ++ ++ ctx = init_context(super); ++ if (IS_ERR(ctx)) ++ return PTR_ERR(ctx); ++ ++ statfs->f_type = statfs_type(super); ++ statfs->f_bsize = super->s_blocksize; ++ ++ /* ++ * 5% of total block space is reserved. This is needed for flush and ++ * for truncates (so that we are able to perform truncate/unlink even ++ * on the otherwise completely full file system). If this reservation ++ * were hidden from statfs(2), users would mistakenly guess that they ++ * have enough free space to complete some operation, which is ++ * frustrating. ++ * ++ * Another possible solution is to subtract ->blocks_reserved from ++ * ->f_bfree, but changing available space seems less intrusive than ++ * letting the user see 5% of disk space used directly after ++ * mkfs.
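++ * ++ * A quick worked example (numbers are illustrative, not from any real ++ * fs): with block_count = 1000000 and blocks_reserved = 50000, statfs(2) ++ * reports f_blocks = 950000, and the clamping below guarantees that ++ * f_bfree and f_bavail can never exceed it.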
++ */ ++ total = reiser4_block_count(super); ++ reserved = get_super_private(super)->blocks_reserved; ++ deleted = txnmgr_count_deleted_blocks(); ++ free = reiser4_free_blocks(super) + deleted; ++ forroot = reiser4_reserved_blocks(super, 0, 0); ++ ++ /* ++ * These counters may be in an inconsistent state because we take the ++ * values without keeping any global spinlock. Here we do a sanity ++ * check that the free block counter does not exceed the number of all ++ * blocks. ++ */ ++ if (free > total) ++ free = total; ++ statfs->f_blocks = total - reserved; ++ /* make sure statfs->f_bfree is never larger than statfs->f_blocks */ ++ if (free > reserved) ++ free -= reserved; ++ else ++ free = 0; ++ statfs->f_bfree = free; ++ ++ if (free > forroot) ++ free -= forroot; ++ else ++ free = 0; ++ statfs->f_bavail = free; ++ ++ statfs->f_files = 0; ++ statfs->f_ffree = 0; ++ ++ /* maximal acceptable name length depends on directory plugin. */ ++ assert("nikita-3351", super->s_root->d_inode != NULL); ++ statfs->f_namelen = reiser4_max_filename_len(super->s_root->d_inode); ++ reiser4_exit_context(ctx); ++ return 0; ++} ++ ++/** ++ * reiser4_clear_inode - clear_inode of super operations ++ * @inode: inode about to be destroyed ++ * ++ * Does sanity checks: the inode being destroyed should have all jnodes ++ * detached. ++ */ ++static void reiser4_clear_inode(struct inode *inode) ++{ ++#if REISER4_DEBUG ++ reiser4_inode *r4_inode; ++ ++ r4_inode = reiser4_inode_data(inode); ++ if (!inode_has_no_jnodes(r4_inode)) ++ warning("vs-1732", "reiser4 inode has %ld jnodes\n", ++ r4_inode->nr_jnodes); ++#endif ++} ++ ++/** ++ * reiser4_sync_inodes - sync_inodes of super operations ++ * @super: super block to sync ++ * @wbc: writeback control ++ * ++ * This method is called by background and non-background writeback. Reiser4's ++ * implementation uses generic_sync_sb_inodes to call reiser4_writepages for ++ * each dirty inode. reiser4_writepages handles pages dirtied via a shared ++ * mapping: dirty pages get into atoms. Writeout is called to flush some ++ * atoms. ++ */ ++static void reiser4_sync_inodes(struct super_block *super, ++ struct writeback_control *wbc) ++{ ++ reiser4_context *ctx; ++ long to_write; ++ ++ if (wbc->for_kupdate) ++ /* reiser4 has its own means of periodic write-out */ ++ return; ++ ++ to_write = wbc->nr_to_write; ++ assert("vs-49", wbc->older_than_this == NULL); ++ ++ ctx = init_context(super); ++ if (IS_ERR(ctx)) { ++ warning("vs-13", "failed to init context"); ++ return; ++ } ++ ++ /* ++ * call reiser4_writepages for each dirty inode to turn its dirty pages ++ * into transactions if they are not part of one yet. ++ */ ++ generic_sync_sb_inodes(super, wbc); ++ ++ /* flush goes here */ ++ wbc->nr_to_write = to_write; ++ writeout(super, wbc); ++ ++ /* avoid recursive calls to ->sync_inodes */ ++ context_set_commit_async(ctx); ++ reiser4_exit_context(ctx); ++} ++ ++/** ++ * reiser4_show_options - show_options of super operations ++ * @m: file where to write information ++ * @mnt: mount structure ++ * ++ * Makes reiser4 mount options visible in /proc/mounts.
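++ * ++ * Given the seq_printf() calls below, the options part of a /proc/mounts ++ * line looks like this (hex values are illustrative only): ++ * ++ * ,atom_max_size=0x10000,atom_max_age=0x2500,atom_min_size=0x100,atom_max_flushers=0x1,cbk_cache_slots=0x10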
++ */ ++static int reiser4_show_options(struct seq_file *m, struct vfsmount *mnt) ++{ ++ struct super_block *super; ++ reiser4_super_info_data *sbinfo; ++ ++ super = mnt->mnt_sb; ++ sbinfo = get_super_private(super); ++ ++ seq_printf(m, ",atom_max_size=0x%x", sbinfo->tmgr.atom_max_size); ++ seq_printf(m, ",atom_max_age=0x%x", sbinfo->tmgr.atom_max_age); ++ seq_printf(m, ",atom_min_size=0x%x", sbinfo->tmgr.atom_min_size); ++ seq_printf(m, ",atom_max_flushers=0x%x", ++ sbinfo->tmgr.atom_max_flushers); ++ seq_printf(m, ",cbk_cache_slots=0x%x", ++ sbinfo->tree.cbk_cache.nr_slots); ++ ++ return 0; ++} ++ ++struct super_operations reiser4_super_operations = { ++ .alloc_inode = reiser4_alloc_inode, ++ .destroy_inode = reiser4_destroy_inode, ++ .dirty_inode = reiser4_dirty_inode, ++ .delete_inode = reiser4_delete_inode, ++ .put_super = reiser4_put_super, ++ .write_super = reiser4_write_super, ++ .statfs = reiser4_statfs, ++ .clear_inode = reiser4_clear_inode, ++ .sync_inodes = reiser4_sync_inodes, ++ .show_options = reiser4_show_options ++}; ++ ++/** ++ * fill_super - initialize super block on mount ++ * @super: super block to fill ++ * @data: reiser4 specific mount option ++ * @silent: ++ * ++ * This is to be called by reiser4_get_sb. Mounts filesystem. ++ */ ++static int fill_super(struct super_block *super, void *data, int silent) ++{ ++ reiser4_context ctx; ++ int result; ++ reiser4_super_info_data *sbinfo; ++ ++ assert("zam-989", super != NULL); ++ ++ super->s_op = NULL; ++ init_stack_context(&ctx, super); ++ ++ /* allocate reiser4 specific super block */ ++ if ((result = init_fs_info(super)) != 0) ++ goto failed_init_sinfo; ++ ++ sbinfo = get_super_private(super); ++ /* initialize various reiser4 parameters, parse mount options */ ++ if ((result = init_super_data(super, data)) != 0) ++ goto failed_init_super_data; ++ ++ /* read reiser4 master super block, initialize disk format plugin */ ++ if ((result = init_read_super(super, silent)) != 0) ++ goto failed_init_read_super; ++ ++ /* initialize transaction manager */ ++ init_txnmgr(&sbinfo->tmgr); ++ ++ /* initialize ktxnmgrd context and start kernel thread ktxnmrgd */ ++ if ((result = init_ktxnmgrd(super)) != 0) ++ goto failed_init_ktxnmgrd; ++ ++ /* initialize entd context and start kernel thread entd */ ++ if ((result = init_entd(super)) != 0) ++ goto failed_init_entd; ++ ++ /* initialize address spaces for formatted nodes and bitmaps */ ++ if ((result = init_formatted_fake(super)) != 0) ++ goto failed_init_formatted_fake; ++ ++ /* initialize disk format plugin */ ++ if ((result = get_super_private(super)->df_plug->init_format(super, data)) != 0 ) ++ goto failed_init_disk_format; ++ ++ /* ++ * There are some 'committed' versions of reiser4 super block counters, ++ * which correspond to reiser4 on-disk state. 
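++ * (For example, ->blocks_free_committed shadows ->blocks_free as of the ++ * last committed transaction; see reiser4_super_info_data in super.h.)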
These counters are ++ * initialized here ++ */ ++ sbinfo->blocks_free_committed = sbinfo->blocks_free; ++ sbinfo->nr_files_committed = oids_used(super); ++ ++ /* get inode of root directory */ ++ if ((result = init_root_inode(super)) != 0) ++ goto failed_init_root_inode; ++ ++ process_safelinks(super); ++ reiser4_exit_context(&ctx); ++ ++ sbinfo->debugfs_root = debugfs_create_dir(super->s_id, ++ reiser4_debugfs_root); ++ if (sbinfo->debugfs_root) { ++ sbinfo->tmgr.debugfs_atom_count = ++ debugfs_create_u32("atom_count", S_IFREG|S_IRUSR, ++ sbinfo->debugfs_root, ++ &sbinfo->tmgr.atom_count); ++ sbinfo->tmgr.debugfs_id_count = ++ debugfs_create_u32("id_count", S_IFREG|S_IRUSR, ++ sbinfo->debugfs_root, ++ &sbinfo->tmgr.id_count); ++ } ++ return 0; ++ ++ failed_init_root_inode: ++ if (sbinfo->df_plug->release) ++ sbinfo->df_plug->release(super); ++ failed_init_disk_format: ++ done_formatted_fake(super); ++ failed_init_formatted_fake: ++ done_entd(super); ++ failed_init_entd: ++ done_ktxnmgrd(super); ++ failed_init_ktxnmgrd: ++ done_txnmgr(&sbinfo->tmgr); ++ failed_init_read_super: ++ failed_init_super_data: ++ done_fs_info(super); ++ failed_init_sinfo: ++ reiser4_exit_context(&ctx); ++ return result; ++} ++ ++/** ++ * reiser4_get_sb - get_sb of file_system_type operations ++ * @fs_type: ++ * @flags: mount flags MS_RDONLY, MS_VERBOSE, etc ++ * @dev_name: block device file name ++ * @data: specific mount options ++ * ++ * Reiser4 mount entry. ++ */ ++static struct super_block *reiser4_get_sb(struct file_system_type *fs_type, ++ int flags, ++ const char *dev_name, ++ void *data) ++{ ++ return get_sb_bdev(fs_type, flags, dev_name, data, fill_super); ++} ++ ++/* structure describing the reiser4 filesystem implementation */ ++static struct file_system_type reiser4_fs_type = { ++ .owner = THIS_MODULE, ++ .name = "reiser4", ++ .fs_flags = FS_REQUIRES_DEV, ++ .get_sb = reiser4_get_sb, ++ .kill_sb = kill_block_super, ++ .next = NULL ++}; ++ ++void destroy_reiser4_cache(kmem_cache_t **cachep) ++{ ++ int result; ++ ++ BUG_ON(*cachep == NULL); ++ result = kmem_cache_destroy(*cachep); ++ BUG_ON(result != 0); ++ *cachep = NULL; ++} ++ ++struct dentry *reiser4_debugfs_root = NULL; ++ ++/** ++ * init_reiser4 - reiser4 initialization entry point ++ * ++ * Initializes reiser4 slabs, registers reiser4 filesystem type. It is called ++ * on kernel initialization or during reiser4 module load. ++ */ ++static int __init init_reiser4(void) ++{ ++ int result; ++ ++ printk(KERN_INFO ++ "Loading Reiser4. 
" ++ "See www.namesys.com for a description of Reiser4.\n"); ++ ++ /* initialize slab cache of inodes */ ++ if ((result = init_inodes()) != 0) ++ goto failed_inode_cache; ++ ++ /* initialize cache of znodes */ ++ if ((result = init_znodes()) != 0) ++ goto failed_init_znodes; ++ ++ /* initialize all plugins */ ++ if ((result = init_plugins()) != 0) ++ goto failed_init_plugins; ++ ++ /* initialize cache of plugin_set-s and plugin_set's hash table */ ++ if ((result = init_plugin_set()) != 0) ++ goto failed_init_plugin_set; ++ ++ /* initialize caches of txn_atom-s and txn_handle-s */ ++ if ((result = init_txnmgr_static()) != 0) ++ goto failed_init_txnmgr_static; ++ ++ /* initialize cache of jnodes */ ++ if ((result = init_jnodes()) != 0) ++ goto failed_init_jnodes; ++ ++ /* initialize cache of flush queues */ ++ if ((result = init_fqs()) != 0) ++ goto failed_init_fqs; ++ ++ /* initialize cache of structures attached to dentry->d_fsdata */ ++ if ((result = init_dentry_fsdata()) != 0) ++ goto failed_init_dentry_fsdata; ++ ++ /* initialize cache of structures attached to file->private_data */ ++ if ((result = init_file_fsdata()) != 0) ++ goto failed_init_file_fsdata; ++ ++ /* ++ * initialize cache of d_cursors. See plugin/file_ops_readdir.c for ++ * more details ++ */ ++ if ((result = init_d_cursor()) != 0) ++ goto failed_init_d_cursor; ++ ++ if ((result = register_filesystem(&reiser4_fs_type)) == 0) { ++ reiser4_debugfs_root = debugfs_create_dir("reiser4", NULL); ++ return 0; ++ } ++ ++ done_d_cursor(); ++ failed_init_d_cursor: ++ done_file_fsdata(); ++ failed_init_file_fsdata: ++ done_dentry_fsdata(); ++ failed_init_dentry_fsdata: ++ done_fqs(); ++ failed_init_fqs: ++ done_jnodes(); ++ failed_init_jnodes: ++ done_txnmgr_static(); ++ failed_init_txnmgr_static: ++ done_plugin_set(); ++ failed_init_plugin_set: ++ failed_init_plugins: ++ done_znodes(); ++ failed_init_znodes: ++ done_inodes(); ++ failed_inode_cache: ++ return result; ++} ++ ++/** ++ * done_reiser4 - reiser4 exit entry point ++ * ++ * Unregister reiser4 filesystem type, deletes caches. It is called on shutdown ++ * or at module unload. ++ */ ++static void __exit done_reiser4(void) ++{ ++ int result; ++ ++ debugfs_remove(reiser4_debugfs_root); ++ result = unregister_filesystem(&reiser4_fs_type); ++ BUG_ON(result != 0); ++ done_d_cursor(); ++ done_file_fsdata(); ++ done_dentry_fsdata(); ++ done_fqs(); ++ done_jnodes(); ++ done_txnmgr_static(); ++ done_plugin_set(); ++ done_znodes(); ++ destroy_reiser4_cache(&inode_cache); ++} ++ ++module_init(init_reiser4); ++module_exit(done_reiser4); ++ ++MODULE_DESCRIPTION("Reiser4 filesystem"); ++MODULE_AUTHOR("Hans Reiser "); ++ ++MODULE_LICENSE("GPL"); ++ ++/* ++ * Local variables: ++ * c-indentation-style: "K&R" ++ * mode-name: "LC" ++ * c-basic-offset: 8 ++ * tab-width: 8 ++ * fill-column: 79 ++ * End: ++ */ +Index: linux-2.6.16/fs/reiser4/tap.c +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/tap.c +@@ -0,0 +1,377 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++/* ++ Tree Access Pointer (tap). ++ ++ tap is data structure combining coord and lock handle (mostly). It is ++ useful when one has to scan tree nodes (for example, in readdir, or flush), ++ for tap functions allow to move tap in either direction transparently ++ crossing unit/item/node borders. ++ ++ Tap doesn't provide automatic synchronization of its fields as it is ++ supposed to be per-thread object. 
++*/ ++ ++#include "forward.h" ++#include "debug.h" ++#include "coord.h" ++#include "tree.h" ++#include "context.h" ++#include "tap.h" ++#include "znode.h" ++#include "tree_walk.h" ++ ++#if REISER4_DEBUG ++static int tap_invariant(const tap_t * tap); ++static void tap_check(const tap_t * tap); ++#else ++#define tap_check(tap) noop ++#endif ++ ++/** load node tap is pointing to, if not loaded already */ ++int tap_load(tap_t * tap) ++{ ++ tap_check(tap); ++ if (tap->loaded == 0) { ++ int result; ++ ++ result = zload_ra(tap->coord->node, &tap->ra_info); ++ if (result != 0) ++ return result; ++ coord_clear_iplug(tap->coord); ++ } ++ ++tap->loaded; ++ tap_check(tap); ++ return 0; ++} ++ ++/** release node tap is pointing to. Dual to tap_load() */ ++void tap_relse(tap_t * tap) ++{ ++ tap_check(tap); ++ if (tap->loaded > 0) { ++ --tap->loaded; ++ if (tap->loaded == 0) { ++ zrelse(tap->coord->node); ++ } ++ } ++ tap_check(tap); ++} ++ ++/** ++ * init tap to consist of @coord and @lh. Locks on nodes will be acquired with ++ * @mode ++ */ ++void ++tap_init(tap_t * tap, coord_t * coord, lock_handle * lh, znode_lock_mode mode) ++{ ++ tap->coord = coord; ++ tap->lh = lh; ++ tap->mode = mode; ++ tap->loaded = 0; ++ INIT_LIST_HEAD(&tap->linkage); ++ init_ra_info(&tap->ra_info); ++} ++ ++/** add @tap to the per-thread list of all taps */ ++void tap_monitor(tap_t * tap) ++{ ++ assert("nikita-2623", tap != NULL); ++ tap_check(tap); ++ list_add(&tap->linkage, taps_list()); ++ tap_check(tap); ++} ++ ++/* duplicate @src into @dst. Copy lock handle. @dst is not initially ++ * loaded. */ ++void tap_copy(tap_t * dst, tap_t * src) ++{ ++ assert("nikita-3193", src != NULL); ++ assert("nikita-3194", dst != NULL); ++ ++ *dst->coord = *src->coord; ++ if (src->lh->node) ++ copy_lh(dst->lh, src->lh); ++ dst->mode = src->mode; ++ dst->loaded = 0; ++ INIT_LIST_HEAD(&dst->linkage); ++ dst->ra_info = src->ra_info; ++} ++ ++/** finish with @tap */ ++void tap_done(tap_t * tap) ++{ ++ assert("nikita-2565", tap != NULL); ++ tap_check(tap); ++ if (tap->loaded > 0) ++ zrelse(tap->coord->node); ++ done_lh(tap->lh); ++ tap->loaded = 0; ++ list_del_init(&tap->linkage); ++ tap->coord->node = NULL; ++} ++ ++/** ++ * move @tap to the new node, locked with @target. Load @target, if @tap was ++ * already loaded. ++ */ ++int tap_move(tap_t * tap, lock_handle * target) ++{ ++ int result = 0; ++ ++ assert("nikita-2567", tap != NULL); ++ assert("nikita-2568", target != NULL); ++ assert("nikita-2570", target->node != NULL); ++ assert("nikita-2569", tap->coord->node == tap->lh->node); ++ ++ tap_check(tap); ++ if (tap->loaded > 0) ++ result = zload_ra(target->node, &tap->ra_info); ++ ++ if (result == 0) { ++ if (tap->loaded > 0) ++ zrelse(tap->coord->node); ++ done_lh(tap->lh); ++ copy_lh(tap->lh, target); ++ tap->coord->node = target->node; ++ coord_clear_iplug(tap->coord); ++ } ++ tap_check(tap); ++ return result; ++} ++ ++/** ++ * move @tap to @target. Acquire lock on @target, if @tap was already ++ * loaded. 
++ */ ++static int tap_to(tap_t * tap, znode * target) ++{ ++ int result; ++ ++ assert("nikita-2624", tap != NULL); ++ assert("nikita-2625", target != NULL); ++ ++ tap_check(tap); ++ result = 0; ++ if (tap->coord->node != target) { ++ lock_handle here; ++ ++ init_lh(&here); ++ result = longterm_lock_znode(&here, target, ++ tap->mode, ZNODE_LOCK_HIPRI); ++ if (result == 0) { ++ result = tap_move(tap, &here); ++ done_lh(&here); ++ } ++ } ++ tap_check(tap); ++ return result; ++} ++ ++/** ++ * move @tap to given @target, loading and locking @target->node if ++ * necessary ++ */ ++int tap_to_coord(tap_t * tap, coord_t * target) ++{ ++ int result; ++ ++ tap_check(tap); ++ result = tap_to(tap, target->node); ++ if (result == 0) ++ coord_dup(tap->coord, target); ++ tap_check(tap); ++ return result; ++} ++ ++/** return list of all taps */ ++struct list_head *taps_list(void) ++{ ++ return &get_current_context()->taps; ++} ++ ++/** helper function for go_{next,prev}_{item,unit,node}() */ ++int go_dir_el(tap_t * tap, sideof dir, int units_p) ++{ ++ coord_t dup; ++ coord_t *coord; ++ int result; ++ ++ int (*coord_dir) (coord_t *); ++ int (*get_dir_neighbor) (lock_handle *, znode *, int, int); ++ void (*coord_init) (coord_t *, const znode *); ++ ON_DEBUG(int (*coord_check) (const coord_t *)); ++ ++ assert("nikita-2556", tap != NULL); ++ assert("nikita-2557", tap->coord != NULL); ++ assert("nikita-2558", tap->lh != NULL); ++ assert("nikita-2559", tap->coord->node != NULL); ++ ++ tap_check(tap); ++ if (dir == LEFT_SIDE) { ++ coord_dir = units_p ? coord_prev_unit : coord_prev_item; ++ get_dir_neighbor = reiser4_get_left_neighbor; ++ coord_init = coord_init_last_unit; ++ } else { ++ coord_dir = units_p ? coord_next_unit : coord_next_item; ++ get_dir_neighbor = reiser4_get_right_neighbor; ++ coord_init = coord_init_first_unit; ++ } ++ ON_DEBUG(coord_check = ++ units_p ? coord_is_existing_unit : coord_is_existing_item); ++ assert("nikita-2560", coord_check(tap->coord)); ++ ++ coord = tap->coord; ++ coord_dup(&dup, coord); ++ if (coord_dir(&dup) != 0) { ++ do { ++ /* move to the left neighboring node */ ++ lock_handle dup; ++ ++ init_lh(&dup); ++ result = ++ get_dir_neighbor(&dup, coord->node, (int)tap->mode, ++ GN_CAN_USE_UPPER_LEVELS); ++ if (result == 0) { ++ result = tap_move(tap, &dup); ++ if (result == 0) ++ coord_init(tap->coord, dup.node); ++ done_lh(&dup); ++ } ++ /* skip empty nodes */ ++ } while ((result == 0) && node_is_empty(coord->node)); ++ } else { ++ result = 0; ++ coord_dup(coord, &dup); ++ } ++ assert("nikita-2564", ergo(!result, coord_check(tap->coord))); ++ tap_check(tap); ++ return result; ++} ++ ++/** ++ * move @tap to the next unit, transparently crossing item and node ++ * boundaries ++ */ ++int go_next_unit(tap_t * tap) ++{ ++ return go_dir_el(tap, RIGHT_SIDE, 1); ++} ++ ++/** ++ * move @tap to the previous unit, transparently crossing item and node ++ * boundaries ++ */ ++int go_prev_unit(tap_t * tap) ++{ ++ return go_dir_el(tap, LEFT_SIDE, 1); ++} ++ ++/** ++ * @shift times apply @actor to the @tap. This is used to move @tap by ++ * @shift units (or items, or nodes) in either direction. 
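++ * ++ * For example, rewind_right(tap, 3) below is simply ++ * rewind_to(tap, go_next_unit, 3): load the node, apply go_next_unit() ++ * three times, then release the node.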
++ */ ++static int rewind_to(tap_t * tap, go_actor_t actor, int shift) ++{ ++ int result; ++ ++ assert("nikita-2555", shift >= 0); ++ assert("nikita-2562", tap->coord->node == tap->lh->node); ++ ++ tap_check(tap); ++ result = tap_load(tap); ++ if (result != 0) ++ return result; ++ ++ for (; shift > 0; --shift) { ++ result = actor(tap); ++ assert("nikita-2563", tap->coord->node == tap->lh->node); ++ if (result != 0) ++ break; ++ } ++ tap_relse(tap); ++ tap_check(tap); ++ return result; ++} ++ ++/** move @tap @shift units rightward */ ++int rewind_right(tap_t * tap, int shift) ++{ ++ return rewind_to(tap, go_next_unit, shift); ++} ++ ++/** move @tap @shift units leftward */ ++int rewind_left(tap_t * tap, int shift) ++{ ++ return rewind_to(tap, go_prev_unit, shift); ++} ++ ++#if REISER4_DEBUG ++/** debugging function: print @tap content in human readable form */ ++static void print_tap(const char *prefix, const tap_t * tap) ++{ ++ if (tap == NULL) { ++ printk("%s: null tap\n", prefix); ++ return; ++ } ++ printk("%s: loaded: %i, in-list: %i, node: %p, mode: %s\n", prefix, ++ tap->loaded, (&tap->linkage == tap->linkage.next && ++ &tap->linkage == tap->linkage.prev), ++ tap->lh->node, ++ lock_mode_name(tap->mode)); ++ print_coord("\tcoord", tap->coord, 0); ++} ++ ++/** check [tap-sane] invariant */ ++static int tap_invariant(const tap_t * tap) ++{ ++ /* [tap-sane] invariant */ ++ ++ if (tap == NULL) ++ return 1; ++ /* tap->mode is one of ++ * ++ * {ZNODE_NO_LOCK, ZNODE_READ_LOCK, ZNODE_WRITE_LOCK}, and ++ */ ++ if (tap->mode != ZNODE_NO_LOCK && ++ tap->mode != ZNODE_READ_LOCK && tap->mode != ZNODE_WRITE_LOCK) ++ return 2; ++ /* tap->coord != NULL, and */ ++ if (tap->coord == NULL) ++ return 3; ++ /* tap->lh != NULL, and */ ++ if (tap->lh == NULL) ++ return 4; ++ /* tap->loaded > 0 => znode_is_loaded(tap->coord->node), and */ ++ if (!ergo(tap->loaded, znode_is_loaded(tap->coord->node))) ++ return 5; ++ /* tap->coord->node == tap->lh->node if tap->lh->node is not 0 */ ++ if (tap->lh->node != NULL && tap->coord->node != tap->lh->node) ++ return 6; ++ return 0; ++} ++ ++/** debugging function: check internal @tap consistency */ ++static void tap_check(const tap_t * tap) ++{ ++ int result; ++ ++ result = tap_invariant(tap); ++ if (result != 0) { ++ print_tap("broken", tap); ++ reiser4_panic("nikita-2831", "tap broken: %i\n", result); ++ } ++} ++#endif ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ scroll-step: 1 ++ End: ++*/ +Index: linux-2.6.16/fs/reiser4/tap.h +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/tap.h +@@ -0,0 +1,69 @@ ++/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++/* Tree Access Pointers. See tap.c for more details. */ ++ ++#if !defined( __REISER4_TAP_H__ ) ++#define __REISER4_TAP_H__ ++ ++#include "forward.h" ++#include "readahead.h" ++ ++/** ++ tree_access_pointer aka tap. Data structure combining coord_t and lock ++ handle. ++ Invariants involving this data-type, see doc/lock-ordering for details: ++ ++ [tap-sane] ++ */ ++struct tree_access_pointer { ++ /* coord tap is at */ ++ coord_t *coord; ++ /* lock handle on ->coord->node */ ++ lock_handle *lh; ++ /* mode of lock acquired by this tap */ ++ znode_lock_mode mode; ++ /* incremented by tap_load(). Decremented by tap_relse(). 
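++ In effect this is a reference count on the loaded node: zload/zrelse ++ are only issued on its 0<->1 transitions.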
*/ ++ int loaded; ++ /* list of taps */ ++ struct list_head linkage; ++ /* read-ahead hint */ ++ ra_info_t ra_info; ++}; ++ ++typedef int (*go_actor_t) (tap_t * tap); ++ ++extern int tap_load(tap_t * tap); ++extern void tap_relse(tap_t * tap); ++extern void tap_init(tap_t * tap, coord_t * coord, lock_handle * lh, ++ znode_lock_mode mode); ++extern void tap_monitor(tap_t * tap); ++extern void tap_copy(tap_t * dst, tap_t * src); ++extern void tap_done(tap_t * tap); ++extern int tap_move(tap_t * tap, lock_handle * target); ++extern int tap_to_coord(tap_t * tap, coord_t * target); ++ ++extern int go_dir_el(tap_t * tap, sideof dir, int units_p); ++extern int go_next_unit(tap_t * tap); ++extern int go_prev_unit(tap_t * tap); ++extern int rewind_right(tap_t * tap, int shift); ++extern int rewind_left(tap_t * tap, int shift); ++ ++extern struct list_head *taps_list(void); ++ ++#define for_all_taps(tap) \ ++ for (tap = list_entry(taps_list()->next, tap_t, linkage); \ ++ taps_list() != &tap->linkage; \ ++ tap = list_entry(tap->linkage.next, tap_t, linkage)) ++ ++/* __REISER4_TAP_H__ */ ++#endif ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ scroll-step: 1 ++ End: ++*/ +Index: linux-2.6.16/fs/reiser4/tree.c +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/tree.c +@@ -0,0 +1,1875 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++/* ++ * KEYS IN A TREE. ++ * ++ * The tree consists of nodes located on the disk. A node in the tree is ++ * either formatted or unformatted. A formatted node is one that has a ++ * structure understood by the tree balancing and traversal code. Formatted ++ * nodes are further classified into leaf and internal nodes. The latter ++ * distinction is (almost) of only historical importance: the general ++ * structure of leaves and internal nodes is the same in Reiser4. Unformatted ++ * nodes contain raw data that are part of bodies of ordinary files and ++ * attributes. ++ * ++ * Each node in the tree spans some interval in the key space. Key ranges for ++ * all nodes in the tree are disjoint. Actually, this only holds in some weak ++ * sense, because of the non-unique keys: the intersection of key ranges for ++ * different nodes is either empty, or consists of exactly one key. ++ * ++ * A formatted node consists of a sequence of items. Each item spans some ++ * interval in the key space. Key ranges for all items in a tree are disjoint, ++ * modulo non-unique keys again. Items within nodes are ordered in the key ++ * order of the smallest key in an item. ++ * ++ * A particular type of item can be further split into units. A unit is a ++ * piece of an item that can be cut from it and moved into another item of the ++ * same type. Units are used by the balancing code to repack data during ++ * balancing. ++ * ++ * A unit can be further split into smaller entities (for example, an extent ++ * unit represents several pages, and it is natural for the extent code to ++ * operate on particular pages and even bytes within one unit), but this is of ++ * no relevance to the generic balancing and lookup code. ++ * ++ * Although an item is said to "span" a range or interval of keys, it is not ++ * necessary that the item contain a piece of data addressable by each and ++ * every key in this range.
For example, a compound directory item, consisting of ++ * units corresponding to directory entries and keyed by hashes of file names, ++ * looks more like having a "discrete spectrum": only some disjoint keys ++ * inside the range occupied by this item really address data. ++ * ++ * Nonetheless, each item always has a well-defined least (minimal) key, which ++ * is recorded in the item header, stored in the node this item is in. Also, ++ * an item plugin can optionally define a method ->max_key_inside() returning ++ * the maximal key that can _possibly_ be located within this item. This ++ * method is used (mainly) to determine when a given piece of data should be ++ * merged into an existing item, instead of creating a new one. Because of ++ * this, even though ->max_key_inside() can be larger than any key actually ++ * located in the item, the intervals ++ * ++ * [ min_key( item ), ->max_key_inside( item ) ] ++ * ++ * are still disjoint for all items within the _same_ node. ++ * ++ * In memory, a node is represented by a znode. It plays several roles: ++ * ++ * . something locks are taken on ++ * ++ * . something tracked by the transaction manager (this is going to change) ++ * ++ * . something used to access node data ++ * ++ * . something used to maintain tree structure in memory: sibling and ++ * parental linkage. ++ * ++ * . something used to organize nodes into "slums" ++ * ++ * More on znodes can be found in znode.[ch] ++ * ++ * DELIMITING KEYS ++ * ++ * To simplify balancing, allow some flexibility in locking and speed up the ++ * important coord cache optimization, we keep delimiting keys of nodes in ++ * memory. Depending on the disk format (implemented by the appropriate node ++ * plugin), a node on disk can record both left and right delimiting keys, ++ * only one of them, or none. Still, our balancing and tree traversal code ++ * keeps both delimiting keys for a node that is in memory, stored in the ++ * znode. When a node is first brought into memory during tree traversal, its ++ * left delimiting key is taken from its parent, and its right delimiting key ++ * is either the next key in its parent, or the right delimiting key of the ++ * parent if the node is the rightmost child of the parent. ++ * ++ * Physical consistency of delimiting keys is protected by a special dk ++ * read-write lock. That is, delimiting keys can only be inspected or ++ * modified under this lock. But the dk lock is only sufficient for a fast ++ * "pessimistic" check, because, to simplify code and to decrease lock ++ * contention, balancing (carry) only updates delimiting keys right before ++ * unlocking all locked nodes on the given tree level. For example, the ++ * coord-by-key cache scans the LRU list of recently accessed znodes. For each ++ * node it first does a fast check under the dk lock. If the key looked for is ++ * not between the delimiting keys for this node, the next node is inspected ++ * and so on. If the key is inside the key range, a long term lock is taken on ++ * the node and the key range is rechecked. ++ * ++ * COORDINATES ++ * ++ * To find something in the tree, you supply a key, and the key is resolved ++ * by coord_by_key() into a coord (coordinate) that is valid as long as the ++ * node the coord points to remains locked. As mentioned above, trees ++ * consist of nodes that consist of items that consist of units. A unit is ++ * the smallest and indivisible piece of the tree as far as balancing and tree ++ * search are concerned. Each node, item, and unit can be addressed by ++ * giving its level in the tree and the key occupied by this entity.
A node ++ * knows what the key ranges are of the items within it, and how to find its ++ * items and invoke their item handlers, but it does not know how to access ++ * individual units within its items except through the item handlers. ++ * coord is a structure containing a pointer to the node, the ordinal number ++ * of the item within this node (a sort of item offset), and the ordinal ++ * number of the unit within this item. ++ * ++ * TREE LOOKUP ++ * ++ * There are two types of access to the tree: lookup and modification. ++ * ++ * Lookup is a search for the key in the tree. Search can look for either ++ * exactly the key given to it, or for the largest key that is not greater ++ * than the key given to it. This distinction is determined by "bias" ++ * parameter of search routine (coord_by_key()). coord_by_key() either ++ * returns error (key is not in the tree, or some kind of external error ++ * occurred), or successfully resolves key into coord. ++ * ++ * This resolution is done by traversing tree top-to-bottom from root level ++ * to the desired level. On levels above twig level (level one above the ++ * leaf level) nodes consist exclusively of internal items. Internal item is ++ * nothing more than pointer to the tree node on the child level. On twig ++ * level nodes consist of internal items intermixed with extent ++ * items. Internal items form normal search tree structure used by traversal ++ * to descent through the tree. ++ * ++ * TREE LOOKUP OPTIMIZATIONS ++ * ++ * Tree lookup described above is expensive even if all nodes traversed are ++ * already in the memory: for each node binary search within it has to be ++ * performed and binary searches are CPU consuming and tend to destroy CPU ++ * caches. ++ * ++ * Several optimizations are used to work around this: ++ * ++ * . cbk_cache (look-aside cache for tree traversals, see search.c for ++ * details) ++ * ++ * . seals (see seal.[ch]) ++ * ++ * . vroot (see search.c) ++ * ++ * General search-by-key is layered thusly: ++ * ++ * [check seal, if any] --ok--> done ++ * | ++ * failed ++ * | ++ * V ++ * [vroot defined] --no--> node = tree_root ++ * | | ++ * yes | ++ * | | ++ * V | ++ * node = vroot | ++ * | | ++ * | | ++ * | | ++ * V V ++ * [check cbk_cache for key] --ok--> done ++ * | ++ * failed ++ * | ++ * V ++ * [start tree traversal from node] ++ * ++ */ ++ ++#include "forward.h" ++#include "debug.h" ++#include "dformat.h" ++#include "key.h" ++#include "coord.h" ++#include "plugin/item/static_stat.h" ++#include "plugin/item/item.h" ++#include "plugin/node/node.h" ++#include "plugin/plugin.h" ++#include "txnmgr.h" ++#include "jnode.h" ++#include "znode.h" ++#include "block_alloc.h" ++#include "tree_walk.h" ++#include "carry.h" ++#include "carry_ops.h" ++#include "tap.h" ++#include "tree.h" ++#include "vfs_ops.h" ++#include "page_cache.h" ++#include "super.h" ++#include "reiser4.h" ++#include "inode.h" ++ ++#include /* for struct super_block */ ++#include ++ ++/* Disk address (block number) never ever used for any real tree node. This is ++ used as block number of "uber" znode. ++ ++ Invalid block addresses are 0 by tradition. 
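++ ++ (The uber znode is the artificial parent of the root node; giving it a ++ block number that no real node can occupy keeps it out of the way of ++ block allocation.)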
++ ++*/ ++const reiser4_block_nr UBER_TREE_ADDR = 0ull; ++ ++#define CUT_TREE_MIN_ITERATIONS 64 ++ ++static int find_child_by_addr(znode * parent, znode * child, coord_t * result); ++ ++/* return node plugin of coord->node */ ++node_plugin *node_plugin_by_coord(const coord_t * coord) ++{ ++ assert("vs-1", coord != NULL); ++ assert("vs-2", coord->node != NULL); ++ ++ return coord->node->nplug; ++} ++ ++/* insert item into tree. Fields of @coord are updated so that they can be ++ * used by a subsequent insert operation. */ ++insert_result insert_by_key(reiser4_tree * tree /* tree to insert new item ++ * into */ , ++ const reiser4_key * key /* key of new item */ , ++ reiser4_item_data * data /* parameters for item ++ * creation */ , ++ coord_t * coord /* resulting insertion coord */ , ++ lock_handle * lh /* resulting lock ++ * handle */ , ++ tree_level stop_level /** level where to insert */ , ++ __u32 flags /* insertion flags */ ) ++{ ++ int result; ++ ++ assert("nikita-358", tree != NULL); ++ assert("nikita-360", coord != NULL); ++ ++ result = coord_by_key(tree, key, coord, lh, ZNODE_WRITE_LOCK, ++ FIND_EXACT, stop_level, stop_level, ++ flags | CBK_FOR_INSERT, NULL /*ra_info */ ); ++ switch (result) { ++ default: ++ break; ++ case CBK_COORD_FOUND: ++ result = IBK_ALREADY_EXISTS; ++ break; ++ case CBK_COORD_NOTFOUND: ++ assert("nikita-2017", coord->node != NULL); ++ result = insert_by_coord(coord, data, key, lh, 0 /*flags */ ); ++ break; ++ } ++ return result; ++} ++ ++/* insert item by calling carry. Helper function called if short-cut ++ insertion failed */ ++static insert_result insert_with_carry_by_coord(coord_t * coord, /* coord where to insert */ ++ lock_handle * lh, /* lock handle of insertion ++ * node */ ++ reiser4_item_data * data, /* parameters of new ++ * item */ ++ const reiser4_key * key, /* key of new item */ ++ carry_opcode cop, /* carry operation to perform */ ++ cop_insert_flag flags ++ /* carry flags */ ) ++{ ++ int result; ++ carry_pool *pool; ++ carry_level *lowest_level; ++ carry_insert_data *cdata; ++ carry_op *op; ++ ++ assert("umka-314", coord != NULL); ++ ++ /* allocate carry_pool and 3 carry_level-s */ ++ pool = ++ init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) + ++ sizeof(*cdata)); ++ if (IS_ERR(pool)) ++ return PTR_ERR(pool); ++ lowest_level = (carry_level *) (pool + 1); ++ init_carry_level(lowest_level, pool); ++ ++ op = post_carry(lowest_level, cop, coord->node, 0); ++ if (IS_ERR(op) || (op == NULL)) { ++ done_carry_pool(pool); ++ return RETERR(op ? PTR_ERR(op) : -EIO); ++ } ++ cdata = (carry_insert_data *) (lowest_level + 3); ++ cdata->coord = coord; ++ cdata->data = data; ++ cdata->key = key; ++ op->u.insert.d = cdata; ++ if (flags == 0) ++ flags = znode_get_tree(coord->node)->carry.insert_flags; ++ op->u.insert.flags = flags; ++ op->u.insert.type = COPT_ITEM_DATA; ++ op->u.insert.child = NULL; ++ if (lh != NULL) { ++ assert("nikita-3245", lh->node == coord->node); ++ lowest_level->track_type = CARRY_TRACK_CHANGE; ++ lowest_level->tracked = lh; ++ } ++ ++ result = carry(lowest_level, NULL); ++ done_carry_pool(pool); ++ ++ return result; ++} ++ ++/* form carry queue to perform paste of @data with @key at @coord, and launch ++ its execution by calling carry(). ++ ++ Instruct carry to update @lh if, after balancing, the insertion coord moves ++ into a different block.
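++ ++ The single allocation made via init_carry_pool() in the helpers above ++ and below is laid out as ++ ++ [ carry_pool | 3 x carry_level | carry_insert_data ] ++ ++ which is why the code recovers the levels as (carry_level *)(pool + 1) ++ and the insert data as (carry_insert_data *)(lowest_level + 3).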
++ ++*/ ++static int paste_with_carry(coord_t * coord, /* coord of paste */ ++ lock_handle * lh, /* lock handle of node ++ * where item is ++ * pasted */ ++ reiser4_item_data * data, /* parameters of new ++ * item */ ++ const reiser4_key * key, /* key of new item */ ++ unsigned flags /* paste flags */ ) ++{ ++ int result; ++ carry_pool *pool; ++ carry_level *lowest_level; ++ carry_insert_data *cdata; ++ carry_op *op; ++ ++ assert("umka-315", coord != NULL); ++ assert("umka-316", key != NULL); ++ ++ pool = ++ init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) + ++ sizeof(*cdata)); ++ if (IS_ERR(pool)) ++ return PTR_ERR(pool); ++ lowest_level = (carry_level *) (pool + 1); ++ init_carry_level(lowest_level, pool); ++ ++ op = post_carry(lowest_level, COP_PASTE, coord->node, 0); ++ if (IS_ERR(op) || (op == NULL)) { ++ done_carry_pool(pool); ++ return RETERR(op ? PTR_ERR(op) : -EIO); ++ } ++ cdata = (carry_insert_data *) (lowest_level + 3); ++ cdata->coord = coord; ++ cdata->data = data; ++ cdata->key = key; ++ op->u.paste.d = cdata; ++ if (flags == 0) ++ flags = znode_get_tree(coord->node)->carry.paste_flags; ++ op->u.paste.flags = flags; ++ op->u.paste.type = COPT_ITEM_DATA; ++ if (lh != NULL) { ++ lowest_level->track_type = CARRY_TRACK_CHANGE; ++ lowest_level->tracked = lh; ++ } ++ ++ result = carry(lowest_level, NULL); ++ done_carry_pool(pool); ++ ++ return result; ++} ++ ++/* insert item at the given coord. ++ ++ First try to skip carry by directly calling ->create_item() method of node ++ plugin. If this is impossible (there is not enough free space in the node, ++ or leftmost item in the node is created), call insert_with_carry_by_coord() ++ that will do full carry(). ++ ++*/ ++insert_result insert_by_coord(coord_t * coord /* coord where to ++ * insert. coord->node has ++ * to be write locked by ++ * caller */ , ++ reiser4_item_data * data /* data to be ++ * inserted */ , ++ const reiser4_key * key /* key of new item */ , ++ lock_handle * lh /* lock handle of write ++ * lock on node */ , ++ __u32 flags /* insertion flags */ ) ++{ ++ unsigned item_size; ++ int result; ++ znode *node; ++ ++ assert("vs-247", coord != NULL); ++ assert("vs-248", data != NULL); ++ assert("vs-249", data->length >= 0); ++ assert("nikita-1191", znode_is_write_locked(coord->node)); ++ ++ node = coord->node; ++ coord_clear_iplug(coord); ++ result = zload(node); ++ if (result != 0) ++ return result; ++ ++ item_size = space_needed(node, NULL, data, 1); ++ if (item_size > znode_free_space(node) && ++ (flags & COPI_DONT_SHIFT_LEFT) && (flags & COPI_DONT_SHIFT_RIGHT) ++ && (flags & COPI_DONT_ALLOCATE)) { ++ /* we are forced to use free space of coord->node and new item ++ does not fit into it. ++ ++ Currently we get here only when we allocate and copy units ++ of extent item from a node to its left neighbor during ++ "squalloc"-ing. If @node (this is left neighbor) does not ++ have enough free space - we do not want to attempt any ++ shifting and allocations because we are in squeezing and ++ everything to the left of @node is tightly packed. ++ */ ++ result = -E_NODE_FULL; ++ } else if ((item_size <= znode_free_space(node)) && ++ !coord_is_before_leftmost(coord) && ++ (node_plugin_by_node(node)->fast_insert != NULL) ++ && node_plugin_by_node(node)->fast_insert(coord)) { ++ /* shortcut insertion without carry() overhead. 
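++ ++ (insert_into_item() below uses the same fast-path idea for pastes, via ++ the node and item plugins' fast_paste hooks.)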
++ ++ Only possible if: ++ ++ - there is enough free space ++ ++ - insertion is not into the leftmost position in a node ++ (otherwise it would require updating of delimiting key in a ++ parent) ++ ++ - node plugin agrees with this ++ ++ */ ++ result = ++ node_plugin_by_node(node)->create_item(coord, key, data, ++ NULL); ++ znode_make_dirty(node); ++ } else { ++ /* otherwise do full-fledged carry(). */ ++ result = ++ insert_with_carry_by_coord(coord, lh, data, key, COP_INSERT, ++ flags); ++ } ++ zrelse(node); ++ return result; ++} ++ ++/* @coord is set to leaf level and @data is to be inserted to twig level */ ++insert_result ++insert_extent_by_coord(coord_t * ++ coord ++ /* coord where to insert. coord->node * has to be write * locked by caller */ ++ , ++ reiser4_item_data * data /* data to be inserted */ , ++ const reiser4_key * key /* key of new item */ , ++ lock_handle * ++ lh /* lock handle of write lock on * node */ ) ++{ ++ assert("vs-405", coord != NULL); ++ assert("vs-406", data != NULL); ++ assert("vs-407", data->length > 0); ++ assert("vs-408", znode_is_write_locked(coord->node)); ++ assert("vs-409", znode_get_level(coord->node) == LEAF_LEVEL); ++ ++ return insert_with_carry_by_coord(coord, lh, data, key, COP_EXTENT, ++ 0 /*flags */ ); ++} ++ ++/* Insert into the item at the given coord. ++ ++ First try to skip carry by directly calling ->paste() method of item ++ plugin. If this is impossible (there is not enough free space in the node, ++ or we are pasting into leftmost position in the node), call ++ paste_with_carry() that will do full carry(). ++ ++*/ ++/* paste_into_item */ ++int insert_into_item(coord_t * coord /* coord of pasting */ , ++ lock_handle * lh /* lock handle on node involved */ , ++ const reiser4_key * key /* key of unit being pasted */ , ++ reiser4_item_data * data /* parameters for new unit */ , ++ unsigned flags /* insert/paste flags */ ) ++{ ++ int result; ++ int size_change; ++ node_plugin *nplug; ++ item_plugin *iplug; ++ ++ assert("umka-317", coord != NULL); ++ assert("umka-318", key != NULL); ++ ++ iplug = item_plugin_by_coord(coord); ++ nplug = node_plugin_by_coord(coord); ++ ++ assert("nikita-1480", iplug == data->iplug); ++ ++ size_change = space_needed(coord->node, coord, data, 0); ++ if (size_change > (int)znode_free_space(coord->node) && ++ (flags & COPI_DONT_SHIFT_LEFT) && (flags & COPI_DONT_SHIFT_RIGHT) ++ && (flags & COPI_DONT_ALLOCATE)) { ++ /* we are forced to use free space of coord->node and new data ++ does not fit into it. */ ++ return -E_NODE_FULL; ++ } ++ ++ /* shortcut paste without carry() overhead. ++ ++ Only possible if: ++ ++ - there is enough free space ++ ++ - paste is not into the leftmost unit in a node (otherwise ++ it would require updating of delimiting key in a parent) ++ ++ - node plugin agrees with this ++ ++ - item plugin agrees with us ++ */ ++ if (size_change <= (int)znode_free_space(coord->node) && ++ (coord->item_pos != 0 || ++ coord->unit_pos != 0 || coord->between == AFTER_UNIT) && ++ coord->unit_pos != 0 && nplug->fast_paste != NULL && ++ nplug->fast_paste(coord) && ++ iplug->b.fast_paste != NULL && iplug->b.fast_paste(coord)) { ++ if (size_change > 0) ++ nplug->change_item_size(coord, size_change); ++ /* NOTE-NIKITA: huh? where @key is used? */ ++ result = iplug->b.paste(coord, data, NULL); ++ if (size_change < 0) ++ nplug->change_item_size(coord, size_change); ++ znode_make_dirty(coord->node); ++ } else ++ /* otherwise do full-fledged carry(). 
*/ ++ result = paste_with_carry(coord, lh, data, key, flags); ++ return result; ++} ++ ++/* this either appends or truncates item @coord */ ++int resize_item(coord_t * coord /* coord of item being resized */ , ++ reiser4_item_data * data /* parameters of resize */ , ++ reiser4_key * key /* key of new unit */ , ++ lock_handle * lh /* lock handle of node ++ * being modified */ , ++ cop_insert_flag flags /* carry flags */ ) ++{ ++ int result; ++ znode *node; ++ ++ assert("nikita-362", coord != NULL); ++ assert("nikita-363", data != NULL); ++ assert("vs-245", data->length != 0); ++ ++ node = coord->node; ++ coord_clear_iplug(coord); ++ result = zload(node); ++ if (result != 0) ++ return result; ++ ++ if (data->length < 0) ++ result = node_plugin_by_coord(coord)->shrink_item(coord, ++ -data->length); ++ else ++ result = insert_into_item(coord, lh, key, data, flags); ++ ++ zrelse(node); ++ return result; ++} ++ ++/* insert flow @f */ ++int insert_flow(coord_t * coord, lock_handle * lh, flow_t * f) ++{ ++ int result; ++ carry_pool *pool; ++ carry_level *lowest_level; ++ reiser4_item_data *data; ++ carry_op *op; ++ ++ pool = ++ init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) + ++ sizeof(*data)); ++ if (IS_ERR(pool)) ++ return PTR_ERR(pool); ++ lowest_level = (carry_level *) (pool + 1); ++ init_carry_level(lowest_level, pool); ++ ++ op = post_carry(lowest_level, COP_INSERT_FLOW, coord->node, ++ 0 /* operate directly on coord -> node */ ); ++ if (IS_ERR(op) || (op == NULL)) { ++ done_carry_pool(pool); ++ return RETERR(op ? PTR_ERR(op) : -EIO); ++ } ++ ++ /* these are permanent during insert_flow */ ++ data = (reiser4_item_data *) (lowest_level + 3); ++ data->user = 1; ++ data->iplug = item_plugin_by_id(FORMATTING_ID); ++ data->arg = NULL; ++ /* data.length and data.data will be set before calling paste or ++ insert */ ++ data->length = 0; ++ data->data = NULL; ++ ++ op->u.insert_flow.flags = 0; ++ op->u.insert_flow.insert_point = coord; ++ op->u.insert_flow.flow = f; ++ op->u.insert_flow.data = data; ++ op->u.insert_flow.new_nodes = 0; ++ ++ lowest_level->track_type = CARRY_TRACK_CHANGE; ++ lowest_level->tracked = lh; ++ ++ result = carry(lowest_level, NULL); ++ done_carry_pool(pool); ++ ++ return result; ++} ++ ++/* Given a coord in parent node, obtain a znode for the corresponding child */ ++znode *child_znode(const coord_t * parent_coord /* coord of pointer to ++ * child */ , ++ znode * parent /* parent of child */ , ++ int incore_p /* if !0 only return child if already in ++ * memory */ , ++ int setup_dkeys_p /* if !0 update delimiting keys of ++ * child */ ) ++{ ++ znode *child; ++ ++ assert("nikita-1374", parent_coord != NULL); ++ assert("nikita-1482", parent != NULL); ++#if REISER4_DEBUG ++ if (setup_dkeys_p) ++ assert_rw_not_locked(&(znode_get_tree(parent)->dk_lock)); ++#endif ++ assert("nikita-2947", znode_is_any_locked(parent)); ++ ++ if (znode_get_level(parent) <= LEAF_LEVEL) { ++ /* trying to get child of leaf node */ ++ warning("nikita-1217", "Child of maize?"); ++ return ERR_PTR(RETERR(-EIO)); ++ } ++ if (item_is_internal(parent_coord)) { ++ reiser4_block_nr addr; ++ item_plugin *iplug; ++ reiser4_tree *tree; ++ ++ iplug = item_plugin_by_coord(parent_coord); ++ assert("vs-512", iplug->s.internal.down_link); ++ iplug->s.internal.down_link(parent_coord, NULL, &addr); ++ ++ tree = znode_get_tree(parent); ++ if (incore_p) ++ child = zlook(tree, &addr); ++ else ++ child = ++ zget(tree, &addr, parent, ++ znode_get_level(parent) - 1, get_gfp_mask()); ++ if ((child != NULL) && 
!IS_ERR(child) && setup_dkeys_p) ++ set_child_delimiting_keys(parent, parent_coord, child); ++ } else { ++ warning("nikita-1483", "Internal item expected"); ++ child = ERR_PTR(RETERR(-EIO)); ++ } ++ return child; ++} ++ ++/* remove znode from transaction */ ++static void uncapture_znode(znode * node) ++{ ++ struct page *page; ++ ++ assert("zam-1001", ZF_ISSET(node, JNODE_HEARD_BANSHEE)); ++ ++ if (!blocknr_is_fake(znode_get_block(node))) { ++ int ret; ++ ++ /* An already allocated block goes right to the atom's delete set. */ ++ ret = ++ reiser4_dealloc_block(znode_get_block(node), 0, ++ BA_DEFER | BA_FORMATTED); ++ if (ret) ++ warning("zam-942", ++ "can\'t add a block (%llu) number to atom's delete set\n", ++ (unsigned long long)(*znode_get_block(node))); ++ ++ spin_lock_znode(node); ++ /* Here we return flush reserved block which was reserved at the ++ * moment when this allocated node was marked dirty and still ++ * not used by flush in node relocation procedure. */ ++ if (ZF_ISSET(node, JNODE_FLUSH_RESERVED)) { ++ txn_atom *atom; ++ ++ atom = jnode_get_atom(ZJNODE(node)); ++ assert("zam-939", atom != NULL); ++ spin_unlock_znode(node); ++ flush_reserved2grabbed(atom, (__u64) 1); ++ spin_unlock_atom(atom); ++ } else ++ spin_unlock_znode(node); ++ } else { ++ /* znode has assigned block which is counted as "fake ++ allocated". Return it back to "free blocks") */ ++ fake_allocated2free((__u64) 1, BA_FORMATTED); ++ } ++ ++ /* ++ * uncapture page from transaction. There is a possibility of a race ++ * with ->releasepage(): reiser4_releasepage() detaches page from this ++ * jnode and we have nothing to uncapture. To avoid this, get ++ * reference of node->pg under jnode spin lock. uncapture_page() will ++ * deal with released page itself. ++ */ ++ spin_lock_znode(node); ++ page = znode_page(node); ++ if (likely(page != NULL)) { ++ /* ++ * uncapture_page() can only be called when we are sure that ++ * znode is pinned in memory, which we are, because ++ * forget_znode() is only called from longterm_unlock_znode(). ++ */ ++ page_cache_get(page); ++ spin_unlock_znode(node); ++ lock_page(page); ++ uncapture_page(page); ++ unlock_page(page); ++ page_cache_release(page); ++ } else { ++ txn_atom *atom; ++ ++ /* handle "flush queued" znodes */ ++ while (1) { ++ atom = jnode_get_atom(ZJNODE(node)); ++ assert("zam-943", atom != NULL); ++ ++ if (!ZF_ISSET(node, JNODE_FLUSH_QUEUED) ++ || !atom->nr_running_queues) ++ break; ++ ++ spin_unlock_znode(node); ++ atom_wait_event(atom); ++ spin_lock_znode(node); ++ } ++ ++ uncapture_block(ZJNODE(node)); ++ spin_unlock_atom(atom); ++ zput(node); ++ } ++} ++ ++/* This is called from longterm_unlock_znode() when last lock is released from ++ the node that has been removed from the tree. At this point node is removed ++ from sibling list and its lock is invalidated. */ ++void forget_znode(lock_handle * handle) ++{ ++ znode *node; ++ reiser4_tree *tree; ++ ++ assert("umka-319", handle != NULL); ++ ++ node = handle->node; ++ tree = znode_get_tree(node); ++ ++ assert("vs-164", znode_is_write_locked(node)); ++ assert("nikita-1280", ZF_ISSET(node, JNODE_HEARD_BANSHEE)); ++ assert_rw_locked(&(node->lock.guard)); ++ ++ /* We assume that this node was detached from its parent before ++ * unlocking, it gives no way to reach this node from parent through a ++ * down link. The node should have no children and, thereby, can't be ++ * reached from them by their parent pointers. 
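++	 * (The JNODE_HEARD_BANSHEE bit asserted above is set when the
++	 * pointer to the node is removed from its parent, as in
++	 * delete_node() in this file.)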
The only way to obtain a
++	 * reference to the node is to use sibling pointers from its left and
++	 * right neighbors. In the next several lines we remove the node from
++	 * the sibling list. */
++
++	write_lock_tree(tree);
++	sibling_list_remove(node);
++	znode_remove(node, tree);
++	write_unlock_tree(tree);
++
++	/* Here we set JNODE_DYING and cancel all pending lock requests. It
++	 * forces all lock requestor threads to repeat iterations of getting
++	 * lock on a child, neighbor or parent node. But, those threads can't
++	 * come to this node again, because this node is no longer a child,
++	 * neighbor or parent of any other node. This order of znode
++	 * invalidation does not allow other threads to waste cpu time in a busy
++	 * loop, trying to lock a dying object. The exception is in the flush
++	 * code when we take node directly from atom's capture list.*/
++	invalidate_lock(handle);
++	uncapture_znode(node);
++}
++
++/* Check that internal item at @pointer really contains pointer to @child. */
++int check_tree_pointer(const coord_t * pointer /* would-be pointer to
++						 * @child */ ,
++		       const znode * child /* child znode */ )
++{
++	assert("nikita-1016", pointer != NULL);
++	assert("nikita-1017", child != NULL);
++	assert("nikita-1018", pointer->node != NULL);
++
++	assert("nikita-1325", znode_is_any_locked(pointer->node));
++
++	assert("nikita-2985",
++	       znode_get_level(pointer->node) == znode_get_level(child) + 1);
++
++	coord_clear_iplug((coord_t *) pointer);
++
++	if (coord_is_existing_unit(pointer)) {
++		item_plugin *iplug;
++		reiser4_block_nr addr;
++
++		if (item_is_internal(pointer)) {
++			iplug = item_plugin_by_coord(pointer);
++			assert("vs-513", iplug->s.internal.down_link);
++			iplug->s.internal.down_link(pointer, NULL, &addr);
++			/* check that cached value is correct */
++			if (disk_addr_eq(&addr, znode_get_block(child))) {
++				return NS_FOUND;
++			}
++		}
++	}
++	/* warning ("jmacd-1002", "tree pointer incorrect"); */
++	return NS_NOT_FOUND;
++}
++
++/* find coord of pointer to new @child in @parent.
++
++   Find the &coord_t in the @parent where the pointer to a given @child
++   will be.
++
++*/
++int find_new_child_ptr(znode * parent /* parent znode, passed locked */ ,
++		       znode *
++		       child UNUSED_ARG /* child znode, passed locked */ ,
++		       znode * left /* left brother of new node */ ,
++		       coord_t * result /* where result is stored in */ )
++{
++	int ret;
++
++	assert("nikita-1486", parent != NULL);
++	assert("nikita-1487", child != NULL);
++	assert("nikita-1488", result != NULL);
++
++	ret = find_child_ptr(parent, left, result);
++	if (ret != NS_FOUND) {
++		warning("nikita-1489", "Cannot find brother position: %i", ret);
++		return RETERR(-EIO);
++	} else {
++		result->between = AFTER_UNIT;
++		return RETERR(NS_NOT_FOUND);
++	}
++}
++
++/* find coord of pointer to @child in @parent.
++
++   Find the &coord_t in the @parent where the pointer to a given @child is.
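++
++   Three strategies are tried in order: the coord cached in
++   child->in_parent, a node-plugin lookup by the child's left delimiting
++   key, and finally a linear scan over all units (find_child_by_addr()).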
++
++*/
++int find_child_ptr(znode * parent /* parent znode, passed locked */ ,
++		   znode * child /* child znode, passed locked */ ,
++		   coord_t * result /* where result is stored in */ )
++{
++	int lookup_res;
++	node_plugin *nplug;
++	/* left delimiting key of a child */
++	reiser4_key ld;
++	reiser4_tree *tree;
++
++	assert("nikita-934", parent != NULL);
++	assert("nikita-935", child != NULL);
++	assert("nikita-936", result != NULL);
++	assert("zam-356", znode_is_loaded(parent));
++
++	coord_init_zero(result);
++	result->node = parent;
++
++	nplug = parent->nplug;
++	assert("nikita-939", nplug != NULL);
++
++	tree = znode_get_tree(parent);
++	/* NOTE-NIKITA taking read-lock on tree here assumes that @result is
++	 * not aliased to ->in_parent of some znode. Otherwise,
++	 * parent_coord_to_coord() below would modify data protected by tree
++	 * lock. */
++	read_lock_tree(tree);
++	/* fast path. Try to use cached value. Lock tree to keep
++	   node->pos_in_parent and pos->*_blocknr consistent. */
++	if (child->in_parent.item_pos + 1 != 0) {
++		parent_coord_to_coord(&child->in_parent, result);
++		if (check_tree_pointer(result, child) == NS_FOUND) {
++			read_unlock_tree(tree);
++			return NS_FOUND;
++		}
++
++		child->in_parent.item_pos = (unsigned short)~0;
++	}
++	read_unlock_tree(tree);
++
++	/* if the above failed, find some key from @child. We are looking for
++	   the least key in a child. */
++	read_lock_dk(tree);
++	ld = *znode_get_ld_key(child);
++	read_unlock_dk(tree);
++	/*
++	 * now, lookup parent with key just found. Note that left delimiting
++	 * key doesn't identify node uniquely, because (in extremely rare
++	 * case) two nodes can have equal left delimiting keys, if one of them
++	 * is completely filled with directory entries that all happened to be
++	 * hash collisions. But, we check block number in check_tree_pointer()
++	 * and, so, are safe.
++	 */
++	lookup_res = nplug->lookup(parent, &ld, FIND_EXACT, result);
++	/* update cached pos_in_node */
++	if (lookup_res == NS_FOUND) {
++		write_lock_tree(tree);
++		coord_to_parent_coord(result, &child->in_parent);
++		write_unlock_tree(tree);
++		lookup_res = check_tree_pointer(result, child);
++	}
++	if (lookup_res == NS_NOT_FOUND)
++		lookup_res = find_child_by_addr(parent, child, result);
++	return lookup_res;
++}
++
++/* find coord of pointer to @child in @parent by scanning
++
++   Find the &coord_t in the @parent where the pointer to a given @child
++   is, by scanning all internal items in @parent and comparing block
++   numbers in them with that of @child.
++
++*/
++static int find_child_by_addr(znode * parent /* parent znode, passed locked */ ,
++			      znode * child /* child znode, passed locked */ ,
++			      coord_t * result /* where result is stored in */ )
++{
++	int ret;
++
++	assert("nikita-1320", parent != NULL);
++	assert("nikita-1321", child != NULL);
++	assert("nikita-1322", result != NULL);
++
++	ret = NS_NOT_FOUND;
++
++	for_all_units(result, parent) {
++		if (check_tree_pointer(result, child) == NS_FOUND) {
++			write_lock_tree(znode_get_tree(parent));
++			coord_to_parent_coord(result, &child->in_parent);
++			write_unlock_tree(znode_get_tree(parent));
++			ret = NS_FOUND;
++			break;
++		}
++	}
++	return ret;
++}
++
++/* true if @addr is an "unallocated block number", which is just an address
++   with the highest bit set.
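++
++   E.g., assuming the status mask is just that top bit, a fake block
++   number such as 0x8000000000000001 would test as unallocated, while any
++   real on-disk address would not (illustrative values only).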
*/ ++int is_disk_addr_unallocated(const reiser4_block_nr * addr /* address to ++ * check */ ) ++{ ++ assert("nikita-1766", addr != NULL); ++ cassert(sizeof(reiser4_block_nr) == 8); ++ return (*addr & REISER4_BLOCKNR_STATUS_BIT_MASK) == ++ REISER4_UNALLOCATED_STATUS_VALUE; ++} ++ ++/* returns true if removing bytes of given range of key [from_key, to_key] ++ causes removing of whole item @from */ ++static int ++item_removed_completely(coord_t * from, const reiser4_key * from_key, ++ const reiser4_key * to_key) ++{ ++ item_plugin *iplug; ++ reiser4_key key_in_item; ++ ++ assert("umka-325", from != NULL); ++ assert("", item_is_extent(from)); ++ ++ /* check first key just for case */ ++ item_key_by_coord(from, &key_in_item); ++ if (keygt(from_key, &key_in_item)) ++ return 0; ++ ++ /* check last key */ ++ iplug = item_plugin_by_coord(from); ++ assert("vs-611", iplug && iplug->s.file.append_key); ++ ++ iplug->s.file.append_key(from, &key_in_item); ++ set_key_offset(&key_in_item, get_key_offset(&key_in_item) - 1); ++ ++ if (keylt(to_key, &key_in_item)) ++ /* last byte is not removed */ ++ return 0; ++ return 1; ++} ++ ++/* helper function for prepare_twig_kill(): @left and @right are formatted ++ * neighbors of extent item being completely removed. Load and lock neighbors ++ * and store lock handles into @cdata for later use by kill_hook_extent() */ ++static int ++prepare_children(znode * left, znode * right, carry_kill_data * kdata) ++{ ++ int result; ++ int left_loaded; ++ int right_loaded; ++ ++ result = 0; ++ left_loaded = right_loaded = 0; ++ ++ if (left != NULL) { ++ result = zload(left); ++ if (result == 0) { ++ left_loaded = 1; ++ result = longterm_lock_znode(kdata->left, left, ++ ZNODE_READ_LOCK, ++ ZNODE_LOCK_LOPRI); ++ } ++ } ++ if (result == 0 && right != NULL) { ++ result = zload(right); ++ if (result == 0) { ++ right_loaded = 1; ++ result = longterm_lock_znode(kdata->right, right, ++ ZNODE_READ_LOCK, ++ ZNODE_LOCK_HIPRI | ++ ZNODE_LOCK_NONBLOCK); ++ } ++ } ++ if (result != 0) { ++ done_lh(kdata->left); ++ done_lh(kdata->right); ++ if (left_loaded != 0) ++ zrelse(left); ++ if (right_loaded != 0) ++ zrelse(right); ++ } ++ return result; ++} ++ ++static void done_children(carry_kill_data * kdata) ++{ ++ if (kdata->left != NULL && kdata->left->node != NULL) { ++ zrelse(kdata->left->node); ++ done_lh(kdata->left); ++ } ++ if (kdata->right != NULL && kdata->right->node != NULL) { ++ zrelse(kdata->right->node); ++ done_lh(kdata->right); ++ } ++} ++ ++/* part of cut_node. It is called when cut_node is called to remove or cut part ++ of extent item. When head of that item is removed - we have to update right ++ delimiting of left neighbor of extent. When item is removed completely - we ++ have to set sibling link between left and right neighbor of removed ++ extent. This may return -E_DEADLOCK because of trying to get left neighbor ++ locked. 
So, caller should repeat an attempt ++*/ ++/* Audited by: umka (2002.06.16) */ ++static int ++prepare_twig_kill(carry_kill_data * kdata, znode * locked_left_neighbor) ++{ ++ int result; ++ reiser4_key key; ++ lock_handle left_lh; ++ lock_handle right_lh; ++ coord_t left_coord; ++ coord_t *from; ++ znode *left_child; ++ znode *right_child; ++ reiser4_tree *tree; ++ int left_zloaded_here, right_zloaded_here; ++ ++ from = kdata->params.from; ++ assert("umka-326", from != NULL); ++ assert("umka-327", kdata->params.to != NULL); ++ ++ /* for one extent item only yet */ ++ assert("vs-591", item_is_extent(from)); ++ assert("vs-592", from->item_pos == kdata->params.to->item_pos); ++ ++ if ((kdata->params.from_key ++ && keygt(kdata->params.from_key, item_key_by_coord(from, &key))) ++ || from->unit_pos != 0) { ++ /* head of item @from is not removed, there is nothing to ++ worry about */ ++ return 0; ++ } ++ ++ result = 0; ++ left_zloaded_here = 0; ++ right_zloaded_here = 0; ++ ++ left_child = right_child = NULL; ++ ++ coord_dup(&left_coord, from); ++ init_lh(&left_lh); ++ init_lh(&right_lh); ++ if (coord_prev_unit(&left_coord)) { ++ /* @from is leftmost item in its node */ ++ if (!locked_left_neighbor) { ++ result = ++ reiser4_get_left_neighbor(&left_lh, from->node, ++ ZNODE_READ_LOCK, ++ GN_CAN_USE_UPPER_LEVELS); ++ switch (result) { ++ case 0: ++ break; ++ case -E_NO_NEIGHBOR: ++ /* there is no formatted node to the left of ++ from->node */ ++ warning("vs-605", ++ "extent item has smallest key in " ++ "the tree and it is about to be removed"); ++ return 0; ++ case -E_DEADLOCK: ++ /* need to restart */ ++ default: ++ return result; ++ } ++ ++ /* we have acquired left neighbor of from->node */ ++ result = zload(left_lh.node); ++ if (result) ++ goto done; ++ ++ locked_left_neighbor = left_lh.node; ++ } else { ++ /* squalloc_right_twig_cut should have supplied locked ++ * left neighbor */ ++ assert("vs-834", ++ znode_is_write_locked(locked_left_neighbor)); ++ result = zload(locked_left_neighbor); ++ if (result) ++ return result; ++ } ++ ++ left_zloaded_here = 1; ++ coord_init_last_unit(&left_coord, locked_left_neighbor); ++ } ++ ++ if (!item_is_internal(&left_coord)) { ++ /* what else but extent can be on twig level */ ++ assert("vs-606", item_is_extent(&left_coord)); ++ ++ /* there is no left formatted child */ ++ if (left_zloaded_here) ++ zrelse(locked_left_neighbor); ++ done_lh(&left_lh); ++ return 0; ++ } ++ ++ tree = znode_get_tree(left_coord.node); ++ left_child = child_znode(&left_coord, left_coord.node, 1, 0); ++ ++ if (IS_ERR(left_child)) { ++ result = PTR_ERR(left_child); ++ goto done; ++ } ++ ++ /* left child is acquired, calculate new right delimiting key for it ++ and get right child if it is necessary */ ++ if (item_removed_completely ++ (from, kdata->params.from_key, kdata->params.to_key)) { ++ /* try to get right child of removed item */ ++ coord_t right_coord; ++ ++ assert("vs-607", ++ kdata->params.to->unit_pos == ++ coord_last_unit_pos(kdata->params.to)); ++ coord_dup(&right_coord, kdata->params.to); ++ if (coord_next_unit(&right_coord)) { ++ /* @to is rightmost unit in the node */ ++ result = ++ reiser4_get_right_neighbor(&right_lh, from->node, ++ ZNODE_READ_LOCK, ++ GN_CAN_USE_UPPER_LEVELS); ++ switch (result) { ++ case 0: ++ result = zload(right_lh.node); ++ if (result) ++ goto done; ++ ++ right_zloaded_here = 1; ++ coord_init_first_unit(&right_coord, ++ right_lh.node); ++ item_key_by_coord(&right_coord, &key); ++ break; ++ ++ case -E_NO_NEIGHBOR: ++ /* there is no formatted node 
to the right of ++ from->node */ ++ read_lock_dk(tree); ++ key = *znode_get_rd_key(from->node); ++ read_unlock_dk(tree); ++ right_coord.node = NULL; ++ result = 0; ++ break; ++ default: ++ /* real error */ ++ goto done; ++ } ++ } else { ++ /* there is an item to the right of @from - take its key */ ++ item_key_by_coord(&right_coord, &key); ++ } ++ ++ /* try to get right child of @from */ ++ if (right_coord.node && /* there is right neighbor of @from */ ++ item_is_internal(&right_coord)) { /* it is internal item */ ++ right_child = child_znode(&right_coord, ++ right_coord.node, 1, 0); ++ ++ if (IS_ERR(right_child)) { ++ result = PTR_ERR(right_child); ++ goto done; ++ } ++ ++ } ++ /* whole extent is removed between znodes left_child and right_child. Prepare them for linking and ++ update of right delimiting key of left_child */ ++ result = prepare_children(left_child, right_child, kdata); ++ } else { ++ /* head of item @to is removed. left_child has to get right delimting key update. Prepare it for that */ ++ result = prepare_children(left_child, NULL, kdata); ++ } ++ ++ done: ++ if (right_child) ++ zput(right_child); ++ if (right_zloaded_here) ++ zrelse(right_lh.node); ++ done_lh(&right_lh); ++ ++ if (left_child) ++ zput(left_child); ++ if (left_zloaded_here) ++ zrelse(locked_left_neighbor); ++ done_lh(&left_lh); ++ return result; ++} ++ ++/* this is used to remove part of node content between coordinates @from and @to. Units to which @from and @to are set ++ are to be cut completely */ ++/* for try_to_merge_with_left, delete_copied, delete_node */ ++int cut_node_content(coord_t * from, coord_t * to, const reiser4_key * from_key, /* first key to be removed */ ++ const reiser4_key * to_key, /* last key to be removed */ ++ reiser4_key * ++ smallest_removed /* smallest key actually removed */ ) ++{ ++ int result; ++ carry_pool *pool; ++ carry_level *lowest_level; ++ carry_cut_data *cut_data; ++ carry_op *op; ++ ++ assert("vs-1715", coord_compare(from, to) != COORD_CMP_ON_RIGHT); ++ ++ pool = ++ init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) + ++ sizeof(*cut_data)); ++ if (IS_ERR(pool)) ++ return PTR_ERR(pool); ++ lowest_level = (carry_level *) (pool + 1); ++ init_carry_level(lowest_level, pool); ++ ++ op = post_carry(lowest_level, COP_CUT, from->node, 0); ++ assert("vs-1509", op != 0); ++ if (IS_ERR(op)) { ++ done_carry_pool(pool); ++ return PTR_ERR(op); ++ } ++ ++ cut_data = (carry_cut_data *) (lowest_level + 3); ++ cut_data->params.from = from; ++ cut_data->params.to = to; ++ cut_data->params.from_key = from_key; ++ cut_data->params.to_key = to_key; ++ cut_data->params.smallest_removed = smallest_removed; ++ ++ op->u.cut_or_kill.is_cut = 1; ++ op->u.cut_or_kill.u.cut = cut_data; ++ ++ result = carry(lowest_level, NULL); ++ done_carry_pool(pool); ++ ++ return result; ++} ++ ++/* cut part of the node ++ ++ Cut part or whole content of node. ++ ++ cut data between @from and @to of @from->node and call carry() to make ++ corresponding changes in the tree. @from->node may become empty. If so - ++ pointer to it will be removed. Neighboring nodes are not changed. 
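++   (Unlike cut_node_content() above, this is a "kill": item kill hooks
++   run, and for extents the pages they point to are invalidated.)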
Smallest ++ removed key is stored in @smallest_removed ++ ++*/ ++int kill_node_content(coord_t * from, /* coord of the first unit/item that will be eliminated */ ++ coord_t * to, /* coord of the last unit/item that will be eliminated */ ++ const reiser4_key * from_key, /* first key to be removed */ ++ const reiser4_key * to_key, /* last key to be removed */ ++ reiser4_key * smallest_removed, /* smallest key actually removed */ ++ znode * locked_left_neighbor, /* this is set when kill_node_content is called with left neighbor ++ * locked (in squalloc_right_twig_cut, namely) */ ++ struct inode *inode, /* inode of file whose item (or its part) is to be killed. This is necessary to ++ invalidate pages together with item pointing to them */ ++ int truncate) ++{ /* this call is made for file truncate) */ ++ int result; ++ carry_pool *pool; ++ carry_level *lowest_level; ++ carry_kill_data *kdata; ++ lock_handle *left_child; ++ lock_handle *right_child; ++ carry_op *op; ++ ++ assert("umka-328", from != NULL); ++ assert("vs-316", !node_is_empty(from->node)); ++ assert("nikita-1812", coord_is_existing_unit(from) ++ && coord_is_existing_unit(to)); ++ ++ /* allocate carry_pool, 3 carry_level-s, carry_kill_data and structures for kill_hook_extent */ ++ pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) + ++ sizeof(carry_kill_data) + ++ 2 * sizeof(lock_handle) + ++ 5 * sizeof(reiser4_key) + 2 * sizeof(coord_t)); ++ if (IS_ERR(pool)) ++ return PTR_ERR(pool); ++ ++ lowest_level = (carry_level *) (pool + 1); ++ init_carry_level(lowest_level, pool); ++ ++ kdata = (carry_kill_data *) (lowest_level + 3); ++ left_child = (lock_handle *) (kdata + 1); ++ right_child = left_child + 1; ++ ++ init_lh(left_child); ++ init_lh(right_child); ++ ++ kdata->params.from = from; ++ kdata->params.to = to; ++ kdata->params.from_key = from_key; ++ kdata->params.to_key = to_key; ++ kdata->params.smallest_removed = smallest_removed; ++ kdata->params.truncate = truncate; ++ kdata->flags = 0; ++ kdata->inode = inode; ++ kdata->left = left_child; ++ kdata->right = right_child; ++ /* memory for 5 reiser4_key and 2 coord_t will be used in kill_hook_extent */ ++ kdata->buf = (char *)(right_child + 1); ++ ++ if (znode_get_level(from->node) == TWIG_LEVEL && item_is_extent(from)) { ++ /* left child of extent item may have to get updated right ++ delimiting key and to get linked with right child of extent ++ @from if it will be removed completely */ ++ result = prepare_twig_kill(kdata, locked_left_neighbor); ++ if (result) { ++ done_children(kdata); ++ done_carry_pool(pool); ++ return result; ++ } ++ } ++ ++ op = post_carry(lowest_level, COP_CUT, from->node, 0); ++ if (IS_ERR(op) || (op == NULL)) { ++ done_children(kdata); ++ done_carry_pool(pool); ++ return RETERR(op ? PTR_ERR(op) : -EIO); ++ } ++ ++ op->u.cut_or_kill.is_cut = 0; ++ op->u.cut_or_kill.u.kill = kdata; ++ ++ result = carry(lowest_level, NULL); ++ ++ done_children(kdata); ++ done_carry_pool(pool); ++ return result; ++} ++ ++void ++fake_kill_hook_tail(struct inode *inode, loff_t start, loff_t end, int truncate) ++{ ++ if (inode_get_flag(inode, REISER4_HAS_MMAP)) { ++ pgoff_t start_pg, end_pg; ++ ++ start_pg = start >> PAGE_CACHE_SHIFT; ++ end_pg = (end - 1) >> PAGE_CACHE_SHIFT; ++ ++ if ((start & (PAGE_CACHE_SIZE - 1)) == 0) { ++ /* ++ * kill up to the page boundary. 
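++			 * (start is page-aligned in this branch, so the
++			 * killed range is expected to fit in one page;
++			 * hence the assertion that follows.)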
++ */ ++ assert("vs-123456", start_pg == end_pg); ++ reiser4_invalidate_pages(inode->i_mapping, start_pg, 1, ++ truncate); ++ } else if (start_pg != end_pg) { ++ /* ++ * page boundary is within killed portion of node. ++ */ ++ assert("vs-654321", end_pg - start_pg == 1); ++ reiser4_invalidate_pages(inode->i_mapping, end_pg, ++ end_pg - start_pg, 1); ++ } ++ } ++ inode_sub_bytes(inode, end - start); ++} ++ ++/** ++ * Delete whole @node from the reiser4 tree without loading it. ++ * ++ * @left: locked left neighbor, ++ * @node: node to be deleted, ++ * @smallest_removed: leftmost key of deleted node, ++ * @object: inode pointer, if we truncate a file body. ++ * @truncate: true if called for file truncate. ++ * ++ * @return: 0 if success, error code otherwise. ++ * ++ * NOTE: if @object!=NULL we assume that @smallest_removed != NULL and it ++ * contains the right value of the smallest removed key from the previous ++ * cut_worker() iteration. This is needed for proper accounting of ++ * "i_blocks" and "i_bytes" fields of the @object. ++ */ ++int delete_node(znode * node, reiser4_key * smallest_removed, ++ struct inode *object, int truncate) ++{ ++ lock_handle parent_lock; ++ coord_t cut_from; ++ coord_t cut_to; ++ reiser4_tree *tree; ++ int ret; ++ ++ assert("zam-937", node != NULL); ++ assert("zam-933", znode_is_write_locked(node)); ++ assert("zam-999", smallest_removed != NULL); ++ ++ init_lh(&parent_lock); ++ ++ ret = reiser4_get_parent(&parent_lock, node, ZNODE_WRITE_LOCK); ++ if (ret) ++ return ret; ++ ++ assert("zam-934", !znode_above_root(parent_lock.node)); ++ ++ ret = zload(parent_lock.node); ++ if (ret) ++ goto failed_nozrelse; ++ ++ ret = find_child_ptr(parent_lock.node, node, &cut_from); ++ if (ret) ++ goto failed; ++ ++ /* decrement child counter and set parent pointer to NULL before ++ deleting the list from parent node because of checks in ++ internal_kill_item_hook (we can delete the last item from the parent ++ node, the parent node is going to be deleted and its c_count should ++ be zero). */ ++ ++ tree = znode_get_tree(node); ++ write_lock_tree(tree); ++ init_parent_coord(&node->in_parent, NULL); ++ --parent_lock.node->c_count; ++ write_unlock_tree(tree); ++ ++ assert("zam-989", item_is_internal(&cut_from)); ++ ++ /* @node should be deleted after unlocking. */ ++ ZF_SET(node, JNODE_HEARD_BANSHEE); ++ ++ /* remove a pointer from the parent node to the node being deleted. */ ++ coord_dup(&cut_to, &cut_from); ++ /* FIXME: shouldn't this be kill_node_content */ ++ ret = cut_node_content(&cut_from, &cut_to, NULL, NULL, NULL); ++ if (ret) ++ /* FIXME(Zam): Should we re-connect the node to its parent if ++ * cut_node fails? */ ++ goto failed; ++ ++ { ++ reiser4_tree *tree = current_tree; ++ __u64 start_offset = 0, end_offset = 0; ++ ++ read_lock_tree(tree); ++ write_lock_dk(tree); ++ if (object) { ++ /* We use @smallest_removed and the left delimiting of ++ * the current node for @object->i_blocks, i_bytes ++ * calculation. We assume that the items after the ++ * *@smallest_removed key have been deleted from the ++ * file body. 
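++			 * The two offsets computed below bound the byte
++			 * range that fake_kill_hook_tail() uses for the
++			 * i_bytes and i_blocks update.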
*/
++			start_offset = get_key_offset(znode_get_ld_key(node));
++			end_offset = get_key_offset(smallest_removed);
++		}
++
++		assert("zam-1021", znode_is_connected(node));
++		if (node->left)
++			znode_set_rd_key(node->left, znode_get_rd_key(node));
++
++		*smallest_removed = *znode_get_ld_key(node);
++
++		write_unlock_dk(tree);
++		read_unlock_tree(tree);
++
++		if (object) {
++			/* we used to perform actions which are to be performed on items on their removal from tree in
++			   special item method - kill_hook. Here for optimization reasons we avoid reading node
++			   containing item we remove and cannot call item's kill hook. Instead we call function which
++			   does exactly the same things as tail kill hook in assumption that node we avoid reading
++			   contains only one item and that item is a tail one. */
++			fake_kill_hook_tail(object, start_offset, end_offset,
++					    truncate);
++		}
++	}
++      failed:
++	zrelse(parent_lock.node);
++      failed_nozrelse:
++	done_lh(&parent_lock);
++
++	return ret;
++}
++
++static int can_delete(const reiser4_key *key, znode *node)
++{
++	int result;
++
++	read_lock_dk(current_tree);
++	result = keyle(key, znode_get_ld_key(node));
++	read_unlock_dk(current_tree);
++	return result;
++}
++
++/**
++ * This subroutine is not optimal, but its implementation seems to
++ * be easier.
++ *
++ * @tap: the point deletion process begins from,
++ * @from_key: the beginning of the deleted key range,
++ * @to_key: the end of the deleted key range,
++ * @smallest_removed: the smallest removed key,
++ * @truncate: true if called for file truncate.
++ * @progress: return true if progress in file item deletions was made,
++ * @smallest_removed value is actual in that case.
++ *
++ * @return: 0 if success, error code otherwise, -E_REPEAT means that long
++ * cut_tree operation was interrupted to allow atom commit.
++ */
++int
++cut_tree_worker_common(tap_t * tap, const reiser4_key * from_key,
++		       const reiser4_key * to_key,
++		       reiser4_key * smallest_removed, struct inode *object,
++		       int truncate, int *progress)
++{
++	lock_handle next_node_lock;
++	coord_t left_coord;
++	int result;
++
++	assert("zam-931", tap->coord->node != NULL);
++	assert("zam-932", znode_is_write_locked(tap->coord->node));
++
++	*progress = 0;
++	init_lh(&next_node_lock);
++
++	while (1) {
++		znode *node;	/* node from which items are cut */
++		node_plugin *nplug;	/* node plugin for @node */
++
++		node = tap->coord->node;
++
++		/* Move next_node_lock to the next node on the left. */
++		result =
++		    reiser4_get_left_neighbor(&next_node_lock, node,
++					      ZNODE_WRITE_LOCK,
++					      GN_CAN_USE_UPPER_LEVELS);
++		if (result != 0 && result != -E_NO_NEIGHBOR)
++			break;
++		/* Check whether we can delete the node as a whole.
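++		   That is possible once some progress was made and @from_key
++		   is not greater than the node's left delimiting key, i.e.
++		   every key in this leaf lies inside the deleted range (see
++		   can_delete() above).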
*/
++		if (*progress && znode_get_level(node) == LEAF_LEVEL &&
++		    can_delete(from_key, node)) {
++			result = delete_node(node, smallest_removed, object,
++					     truncate);
++		} else {
++			result = tap_load(tap);
++			if (result)
++				return result;
++
++			/* Prepare the second (right) point for cut_node() */
++			if (*progress)
++				coord_init_last_unit(tap->coord, node);
++
++			else if (item_plugin_by_coord(tap->coord)->b.lookup ==
++				 NULL)
++				/* set rightmost unit for the items without lookup method */
++				tap->coord->unit_pos =
++				    coord_last_unit_pos(tap->coord);
++
++			nplug = node->nplug;
++
++			assert("vs-686", nplug);
++			assert("vs-687", nplug->lookup);
++
++			/* left_coord is leftmost unit cut from @node */
++			result = nplug->lookup(node, from_key,
++					       FIND_MAX_NOT_MORE_THAN,
++					       &left_coord);
++
++			if (IS_CBKERR(result))
++				break;
++
++			/* adjust coordinates so that they are set to existing units */
++			if (coord_set_to_right(&left_coord)
++			    || coord_set_to_left(tap->coord)) {
++				result = 0;
++				break;
++			}
++
++			if (coord_compare(&left_coord, tap->coord) ==
++			    COORD_CMP_ON_RIGHT) {
++				/* keys from @from_key to @to_key are not in the tree */
++				result = 0;
++				break;
++			}
++
++			if (left_coord.item_pos != tap->coord->item_pos) {
++				/* do not allow to cut more than one item. It is added to solve the problem of truncating
++				   partially converted files. If a file is partially converted there may exist a twig node
++				   containing both internal items pointing to leaf nodes with formatting items
++				   and an extent item. We do not want to kill internal items being at a twig node here
++				   because cut_tree_worker assumes killing them from the leaf level */
++				coord_dup(&left_coord, tap->coord);
++				assert("vs-1652",
++				       coord_is_existing_unit(&left_coord));
++				left_coord.unit_pos = 0;
++			}
++
++			/* cut data from one node */
++			// *smallest_removed = *min_key();
++			result =
++			    kill_node_content(&left_coord, tap->coord, from_key,
++					      to_key, smallest_removed,
++					      next_node_lock.node, object,
++					      truncate);
++			tap_relse(tap);
++		}
++		if (result)
++			break;
++
++		++(*progress);
++
++		/* Check whether all items with keys >= from_key were removed
++		 * from the tree. */
++		if (keyle(smallest_removed, from_key))
++			/* result = 0; */
++			break;
++
++		if (next_node_lock.node == NULL)
++			break;
++
++		result = tap_move(tap, &next_node_lock);
++		done_lh(&next_node_lock);
++		if (result)
++			break;
++
++		/* Break long cut_tree operation (deletion of a large file) if
++		 * atom requires commit. */
++		if (*progress > CUT_TREE_MIN_ITERATIONS
++		    && current_atom_should_commit()) {
++			result = -E_REPEAT;
++			break;
++		}
++	}
++	done_lh(&next_node_lock);
++	// assert("vs-301", !keyeq(&smallest_removed, min_key()));
++	return result;
++}
++
++/* there is a fundamental problem with optimizing deletes: VFS does it
++   one file at a time. Another problem is that if an item can be
++   anything, then deleting items must be done one at a time. It just
++   seems clean to write this to specify a from and a to key, and cut
++   everything between them, though. */
++
++/* use this function with care if deleting more than what is part of a single file. */
++/* do not use this when cutting a single item, it is suboptimal for that */
++
++/* You are encouraged to write plugin specific versions of this. It
++   cannot be optimal for all plugins because it works item at a time,
++   and some plugins could sometimes work node at a time. Regular files
++   however are not optimizable to work node at a time because of
++   extents needing to free the blocks they point to.
++ ++ Optimizations compared to v3 code: ++ ++ It does not balance (that task is left to memory pressure code). ++ ++ Nodes are deleted only if empty. ++ ++ Uses extents. ++ ++ Performs read-ahead of formatted nodes whose contents are part of ++ the deletion. ++*/ ++ ++/** ++ * Delete everything from the reiser4 tree between two keys: @from_key and ++ * @to_key. ++ * ++ * @from_key: the beginning of the deleted key range, ++ * @to_key: the end of the deleted key range, ++ * @smallest_removed: the smallest removed key, ++ * @object: owner of cutting items. ++ * @truncate: true if called for file truncate. ++ * @progress: return true if a progress in file items deletions was made, ++ * @smallest_removed value is actual in that case. ++ * ++ * @return: 0 if success, error code otherwise, -E_REPEAT means that long cut_tree ++ * operation was interrupted for allowing atom commit . ++ */ ++ ++int ++cut_tree_object(reiser4_tree * tree, const reiser4_key * from_key, ++ const reiser4_key * to_key, reiser4_key * smallest_removed_p, ++ struct inode *object, int truncate, int *progress) ++{ ++ lock_handle lock; ++ int result; ++ tap_t tap; ++ coord_t right_coord; ++ reiser4_key smallest_removed; ++ int (*cut_tree_worker) (tap_t *, const reiser4_key *, ++ const reiser4_key *, reiser4_key *, ++ struct inode *, int, int *); ++ STORE_COUNTERS; ++ ++ assert("umka-329", tree != NULL); ++ assert("umka-330", from_key != NULL); ++ assert("umka-331", to_key != NULL); ++ assert("zam-936", keyle(from_key, to_key)); ++ ++ if (smallest_removed_p == NULL) ++ smallest_removed_p = &smallest_removed; ++ ++ init_lh(&lock); ++ ++ do { ++ /* Find rightmost item to cut away from the tree. */ ++ result = object_lookup(object, to_key, &right_coord, &lock, ++ ZNODE_WRITE_LOCK, FIND_MAX_NOT_MORE_THAN, ++ TWIG_LEVEL, LEAF_LEVEL, CBK_UNIQUE, ++ NULL /*ra_info */ ); ++ if (result != CBK_COORD_FOUND) ++ break; ++ if (object == NULL ++ || inode_file_plugin(object)->cut_tree_worker == NULL) ++ cut_tree_worker = cut_tree_worker_common; ++ else ++ cut_tree_worker = ++ inode_file_plugin(object)->cut_tree_worker; ++ tap_init(&tap, &right_coord, &lock, ZNODE_WRITE_LOCK); ++ result = ++ cut_tree_worker(&tap, from_key, to_key, smallest_removed_p, ++ object, truncate, progress); ++ tap_done(&tap); ++ ++ preempt_point(); ++ ++ } while (0); ++ ++ done_lh(&lock); ++ ++ if (result) { ++ switch (result) { ++ case -E_NO_NEIGHBOR: ++ result = 0; ++ break; ++ case -E_DEADLOCK: ++ result = -E_REPEAT; ++ case -E_REPEAT: ++ case -ENOMEM: ++ case -ENOENT: ++ break; ++ default: ++ warning("nikita-2861", "failure: %i", result); ++ } ++ } ++ ++ CHECK_COUNTERS; ++ return result; ++} ++ ++/* repeat cut_tree_object until everything is deleted. unlike cut_file_items, it ++ * does not end current transaction if -E_REPEAT is returned by ++ * cut_tree_object. 
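++ * A sketch of a typical call, assuming a file body is being truncated:
++ * cut_tree(tree, &from_key, &to_key, inode, 1); the loop below simply
++ * retries while cut_tree_object() returns -E_REPEAT.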
*/
++int
++cut_tree(reiser4_tree * tree, const reiser4_key * from, const reiser4_key * to,
++	 struct inode *inode, int truncate)
++{
++	int result;
++	int progress;
++
++	do {
++		result =
++		    cut_tree_object(tree, from, to, NULL, inode, truncate,
++				    &progress);
++	} while (result == -E_REPEAT);
++
++	return result;
++}
++
++/* finishing reiser4 initialization */
++int init_tree(reiser4_tree * tree /* pointer to structure being
++				   * initialized */ ,
++	      const reiser4_block_nr * root_block /* address of a root block
++						   * on a disk */ ,
++	      tree_level height /* height of a tree */ ,
++	      node_plugin * nplug /* default node plugin */ )
++{
++	int result;
++
++	assert("nikita-306", tree != NULL);
++	assert("nikita-307", root_block != NULL);
++	assert("nikita-308", height > 0);
++	assert("nikita-309", nplug != NULL);
++	assert("zam-587", tree->super != NULL);
++
++	tree->root_block = *root_block;
++	tree->height = height;
++	tree->estimate_one_insert = calc_estimate_one_insert(height);
++	tree->nplug = nplug;
++
++	tree->znode_epoch = 1ull;
++
++	cbk_cache_init(&tree->cbk_cache);
++
++	result = znodes_tree_init(tree);
++	if (result == 0)
++		result = jnodes_tree_init(tree);
++	if (result == 0) {
++		tree->uber = zget(tree, &UBER_TREE_ADDR, NULL, 0, get_gfp_mask());
++		if (IS_ERR(tree->uber)) {
++			result = PTR_ERR(tree->uber);
++			tree->uber = NULL;
++		}
++	}
++	return result;
++}
++
++/* release resources associated with @tree */
++void done_tree(reiser4_tree * tree /* tree to release */ )
++{
++	if (tree == NULL)
++		return;
++
++	if (tree->uber != NULL) {
++		zput(tree->uber);
++		tree->uber = NULL;
++	}
++	znodes_tree_done(tree);
++	jnodes_tree_done(tree);
++	cbk_cache_done(&tree->cbk_cache);
++}
++
++/* Make Linus happy.
++   Local variables:
++   c-indentation-style: "K&R"
++   mode-name: "LC"
++   c-basic-offset: 8
++   tab-width: 8
++   fill-column: 120
++   scroll-step: 1
++   End:
++*/
+Index: linux-2.6.16/fs/reiser4/tree.h
+===================================================================
+--- /dev/null
++++ linux-2.6.16/fs/reiser4/tree.h
+@@ -0,0 +1,579 @@
++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
++ * reiser4/README */
++
++/* Tree operations. See fs/reiser4/tree.c for comments */
++
++#if !defined( __REISER4_TREE_H__ )
++#define __REISER4_TREE_H__
++
++#include "forward.h"
++#include "debug.h"
++#include "dformat.h"
++#include "plugin/node/node.h"
++#include "plugin/plugin.h"
++#include "znode.h"
++#include "tap.h"
++
++#include <linux/types.h>	/* for __u?? */
++#include <linux/fs.h>	/* for struct super_block */
++#include <linux/spinlock.h>
++#include <linux/sched.h>	/* for struct task_struct */
++
++/* fictive block number never actually used */
++extern const reiser4_block_nr UBER_TREE_ADDR;
++
++/* &cbk_cache_slot - entry in a coord cache.
++
++   This is entry in a coord_by_key (cbk) cache, represented by
++   &cbk_cache.
++
++*/
++typedef struct cbk_cache_slot {
++	/* cached node */
++	znode *node;
++	/* linkage to the next cbk cache slot in a LRU order */
++	struct list_head lru;
++} cbk_cache_slot;
++
++/* &cbk_cache - coord cache. This is part of reiser4_tree.
++
++   cbk_cache is supposed to speed up tree lookups by caching results of recent
++   successful lookups (we don't cache negative results as dentry cache
++   does). Cache consists of relatively small number of entries kept in a LRU
++   order. Each entry (&cbk_cache_slot) contains a pointer to znode, from
++   which we can obtain a range of keys that are covered by this znode.
Before
++   embarking into real tree traversal we scan cbk_cache slot by slot and for
++   each slot check whether the key we are looking for is between the minimal and
++   maximal keys for the node pointed to by this slot. If no match is found, real
++   tree traversal is performed and if the result is successful, an appropriate
++   entry is inserted into the cache, possibly pulling the least recently used
++   entry out of it.
++
++   Tree spin lock is used to protect coord cache. If contention for this
++   lock proves to be too high, finer-grained locking can be added.
++
++   Invariants involving parts of this data-type:
++
++   [cbk-cache-invariant]
++*/
++typedef struct cbk_cache {
++	/* serializator */
++	rwlock_t guard;
++	int nr_slots;
++	/* head of LRU list of cache slots */
++	struct list_head lru;
++	/* actual array of slots */
++	cbk_cache_slot *slot;
++} cbk_cache;
++
++
++/* level_lookup_result - possible outcome of looking up key at some level.
++   This is used by coord_by_key when traversing tree downward. */
++typedef enum {
++	/* continue to the next level */
++	LOOKUP_CONT,
++	/* done. Either required item was found, or we can prove it
++	   doesn't exist, or some error occurred. */
++	LOOKUP_DONE,
++	/* restart traversal from the root. Infamous "repetition". */
++	LOOKUP_REST
++} level_lookup_result;
++
++/* This is representation of internal reiser4 tree where all file-system
++   data and meta-data are stored. This structure is passed to all tree
++   manipulation functions. It's different from the super block because:
++   we don't want to limit ourselves to strictly one-to-one mapping
++   between super blocks and trees, and, because they are logically
++   different: there are things in a super block that have no relation to
++   the tree (bitmaps, journalling area, mount options, etc.) and there
++   are things in a tree that bear no relation to the super block, like
++   tree of znodes.
++
++   At this time, there is only one tree
++   per filesystem, and this struct is part of the super block. We only
++   call the super block the super block for historical reasons (most
++   other filesystems call the per filesystem metadata the super block).
++*/
++
++struct reiser4_tree {
++	/* block_nr == 0 is fake znode. Write lock it, while changing
++	   tree height. */
++	/* disk address of root node of a tree */
++	reiser4_block_nr root_block;
++
++	/* level of the root node. If this is 1, tree consists of root
++	   node only */
++	tree_level height;
++
++	/*
++	 * this is cached here to avoid calling plugins through function
++	 * dereference all the time.
++	 */
++	__u64 estimate_one_insert;
++
++	/* cache of recent tree lookup results */
++	cbk_cache cbk_cache;
++
++	/* hash table to look up znodes by block number. */
++	z_hash_table zhash_table;
++	z_hash_table zfake_table;
++	/* hash table to look up jnodes by inode and offset. */
++	j_hash_table jhash_table;
++
++	/* lock protecting:
++	   - parent pointers,
++	   - sibling pointers,
++	   - znode hash table
++	   - coord cache
++	 */
++	/* NOTE: The "giant" tree lock can be replaced by more spin locks,
++	   hoping they will be less contended. We can use one spin lock per one
++	   znode hash bucket. With adding of some code complexity, sibling
++	   pointers can be protected by both znode spin locks. However, even if
++	   it looks more SMP scalable, we should test this locking change on
++	   n-way (n > 4) SMP machines. Current 4-way machine tests do not show
++	   that the tree lock is contended and a bottleneck (2003.07.25).
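++	   Lock ordering is encoded in the inline helpers at the end of this
++	   header: the tree lock may not be taken while the dk, txnh or lock
++	   stack spinlocks are held.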
*/
++
++	rwlock_t tree_lock;
++
++	/* lock protecting delimiting keys */
++	rwlock_t dk_lock;
++
++	/* spin lock protecting znode_epoch */
++	spinlock_t epoch_lock;
++	/* version stamp used to mark znode updates. See seal.[ch] for more
++	 * information. */
++	__u64 znode_epoch;
++
++	znode *uber;
++	node_plugin *nplug;
++	struct super_block *super;
++	struct {
++		/* carry flags used for insertion of new nodes */
++		__u32 new_node_flags;
++		/* carry flags used for insertion of new extents */
++		__u32 new_extent_flags;
++		/* carry flags used for paste operations */
++		__u32 paste_flags;
++		/* carry flags used for insert operations */
++		__u32 insert_flags;
++	} carry;
++};
++
++extern int init_tree(reiser4_tree * tree,
++		     const reiser4_block_nr * root_block, tree_level height,
++		     node_plugin * default_plugin);
++extern void done_tree(reiser4_tree * tree);
++
++/* cbk flags: options for coord_by_key() */
++typedef enum {
++	/* coord_by_key() is called for insertion. This is necessary because
++	   of extents being located at the twig level. For explanation, see
++	   comment just above is_next_item_internal().
++	 */
++	CBK_FOR_INSERT = (1 << 0),
++	/* coord_by_key() is called with key that is known to be unique */
++	CBK_UNIQUE = (1 << 1),
++	/* coord_by_key() can trust delimiting keys. This option is not user
++	   accessible. coord_by_key() will set it automatically. It will only
++	   be cleared by a special case in extents-on-the-twig-level handling
++	   where it is necessary to insert an item with a key smaller than the
++	   leftmost key in a node. This is necessary because of extents being
++	   located at the twig level. For explanation, see comment just above
++	   is_next_item_internal().
++	 */
++	CBK_TRUST_DK = (1 << 2),
++	CBK_READA = (1 << 3),	/* original: readahead leaves which contain items of certain file */
++	CBK_READDIR_RA = (1 << 4),	/* readdir: readahead whole directory and all its stat datas */
++	CBK_DKSET = (1 << 5),
++	CBK_EXTENDED_COORD = (1 << 6),	/* coord_t is actually */
++	CBK_IN_CACHE = (1 << 7),	/* node is already in cache */
++	CBK_USE_CRABLOCK = (1 << 8)	/* use crab_lock instead of a long-term
++					 * lock */
++} cbk_flags;
++
++/* insertion outcome. IBK = insert by key */
++typedef enum {
++	IBK_INSERT_OK = 0,
++	IBK_ALREADY_EXISTS = -EEXIST,
++	IBK_IO_ERROR = -EIO,
++	IBK_NO_SPACE = -E_NODE_FULL,
++	IBK_OOM = -ENOMEM
++} insert_result;
++
++#define IS_CBKERR(err) ((err) != CBK_COORD_FOUND && (err) != CBK_COORD_NOTFOUND)
++
++typedef int (*tree_iterate_actor_t) (reiser4_tree * tree, coord_t * coord,
++				     lock_handle * lh, void *arg);
++extern int iterate_tree(reiser4_tree * tree, coord_t * coord, lock_handle * lh,
++			tree_iterate_actor_t actor, void *arg,
++			znode_lock_mode mode, int through_units_p);
++extern int get_uber_znode(reiser4_tree * tree, znode_lock_mode mode,
++			  znode_lock_request pri, lock_handle * lh);
++
++/* return node plugin of @node */
++static inline node_plugin *node_plugin_by_node(const znode *
++					       node /* node to query */ )
++{
++	assert("vs-213", node != NULL);
++	assert("vs-214", znode_is_loaded(node));
++
++	return node->nplug;
++}
++
++/* number of items in @node */
++static inline pos_in_node_t node_num_items(const znode * node)
++{
++	assert("nikita-2754", znode_is_loaded(node));
++	assert("nikita-2468",
++	       node_plugin_by_node(node)->num_of_items(node) == node->nr_items);
++
++	return node->nr_items;
++}
++
++/* Return the number of items at the present node. Asserts coord->node !=
++   NULL.
*/ ++static inline unsigned coord_num_items(const coord_t * coord) ++{ ++ assert("jmacd-9805", coord->node != NULL); ++ ++ return node_num_items(coord->node); ++} ++ ++/* true if @node is empty */ ++static inline int node_is_empty(const znode * node) ++{ ++ return node_num_items(node) == 0; ++} ++ ++typedef enum { ++ SHIFTED_SOMETHING = 0, ++ SHIFT_NO_SPACE = -E_NODE_FULL, ++ SHIFT_IO_ERROR = -EIO, ++ SHIFT_OOM = -ENOMEM, ++} shift_result; ++ ++extern node_plugin *node_plugin_by_coord(const coord_t * coord); ++extern int is_coord_in_node(const coord_t * coord); ++extern int key_in_node(const reiser4_key *, const coord_t *); ++extern void coord_item_move_to(coord_t * coord, int items); ++extern void coord_unit_move_to(coord_t * coord, int units); ++ ++/* there are two types of repetitive accesses (ra): intra-syscall ++ (local) and inter-syscall (global). Local ra is used when ++ during single syscall we add/delete several items and units in the ++ same place in a tree. Note that plan-A fragments local ra by ++ separating stat-data and file body in key-space. Global ra is ++ used when user does repetitive modifications in the same place in a ++ tree. ++ ++ Our ra implementation serves following purposes: ++ 1 it affects balancing decisions so that next operation in a row ++ can be performed faster; ++ 2 it affects lower-level read-ahead in page-cache; ++ 3 it allows to avoid unnecessary lookups by maintaining some state ++ across several operations (this is only for local ra); ++ 4 it leaves room for lazy-micro-balancing: when we start a sequence of ++ operations they are performed without actually doing any intra-node ++ shifts, until we finish sequence or scope of sequence leaves ++ current node, only then we really pack node (local ra only). ++*/ ++ ++/* another thing that can be useful is to keep per-tree and/or ++ per-process cache of recent lookups. This cache can be organised as a ++ list of block numbers of formatted nodes sorted by starting key in ++ this node. Balancings should invalidate appropriate parts of this ++ cache. 
++*/ ++ ++lookup_result coord_by_key(reiser4_tree * tree, const reiser4_key * key, ++ coord_t * coord, lock_handle * handle, ++ znode_lock_mode lock, lookup_bias bias, ++ tree_level lock_level, tree_level stop_level, ++ __u32 flags, ra_info_t *); ++ ++lookup_result object_lookup(struct inode *object, ++ const reiser4_key * key, ++ coord_t * coord, ++ lock_handle * lh, ++ znode_lock_mode lock_mode, ++ lookup_bias bias, ++ tree_level lock_level, ++ tree_level stop_level, ++ __u32 flags, ra_info_t * info); ++ ++insert_result insert_by_key(reiser4_tree * tree, const reiser4_key * key, ++ reiser4_item_data * data, coord_t * coord, ++ lock_handle * lh, ++ tree_level stop_level, __u32 flags); ++insert_result insert_by_coord(coord_t * coord, ++ reiser4_item_data * data, const reiser4_key * key, ++ lock_handle * lh, __u32); ++insert_result insert_extent_by_coord(coord_t * coord, ++ reiser4_item_data * data, ++ const reiser4_key * key, lock_handle * lh); ++int cut_node_content(coord_t * from, coord_t * to, const reiser4_key * from_key, ++ const reiser4_key * to_key, ++ reiser4_key * smallest_removed); ++int kill_node_content(coord_t * from, coord_t * to, ++ const reiser4_key * from_key, const reiser4_key * to_key, ++ reiser4_key * smallest_removed, ++ znode * locked_left_neighbor, struct inode *inode, ++ int truncate); ++ ++int resize_item(coord_t * coord, reiser4_item_data * data, ++ reiser4_key * key, lock_handle * lh, cop_insert_flag); ++int insert_into_item(coord_t * coord, lock_handle * lh, const reiser4_key * key, ++ reiser4_item_data * data, unsigned); ++int insert_flow(coord_t * coord, lock_handle * lh, flow_t * f); ++int find_new_child_ptr(znode * parent, znode * child, znode * left, ++ coord_t * result); ++ ++int shift_right_of_but_excluding_insert_coord(coord_t * insert_coord); ++int shift_left_of_and_including_insert_coord(coord_t * insert_coord); ++ ++void fake_kill_hook_tail(struct inode *, loff_t start, loff_t end, int); ++ ++extern int cut_tree_worker_common(tap_t *, const reiser4_key *, ++ const reiser4_key *, reiser4_key *, ++ struct inode *, int, int *); ++extern int cut_tree_object(reiser4_tree *, const reiser4_key *, ++ const reiser4_key *, reiser4_key *, struct inode *, ++ int, int *); ++extern int cut_tree(reiser4_tree * tree, const reiser4_key * from, ++ const reiser4_key * to, struct inode *, int); ++ ++extern int delete_node(znode * node, reiser4_key *, struct inode *, int); ++extern int check_tree_pointer(const coord_t * pointer, const znode * child); ++extern int find_new_child_ptr(znode * parent, znode * child UNUSED_ARG, ++ znode * left, coord_t * result); ++extern int find_child_ptr(znode * parent, znode * child, coord_t * result); ++extern int set_child_delimiting_keys(znode * parent, const coord_t * in_parent, ++ znode * child); ++extern znode *child_znode(const coord_t * in_parent, znode * parent, ++ int incore_p, int setup_dkeys_p); ++ ++extern int cbk_cache_init(cbk_cache * cache); ++extern void cbk_cache_done(cbk_cache * cache); ++extern void cbk_cache_invalidate(const znode * node, reiser4_tree * tree); ++ ++extern char *sprint_address(const reiser4_block_nr * block); ++ ++#if REISER4_DEBUG ++extern void print_coord_content(const char *prefix, coord_t * p); ++extern void reiser4_print_address(const char *prefix, ++ const reiser4_block_nr * block); ++extern void print_tree_rec(const char *prefix, reiser4_tree * tree, ++ __u32 flags); ++extern void check_dkeys(znode *node); ++#else ++#define print_coord_content(p, c) noop ++#define reiser4_print_address(p, b) noop 
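++/* in non-debug builds the printers above compile away to no-ops */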
++#endif ++ ++extern void forget_znode(lock_handle * handle); ++extern int deallocate_znode(znode * node); ++ ++extern int is_disk_addr_unallocated(const reiser4_block_nr * addr); ++ ++/* struct used internally to pack all numerous arguments of tree lookup. ++ Used to avoid passing a lot of arguments to helper functions. */ ++typedef struct cbk_handle { ++ /* tree we are in */ ++ reiser4_tree *tree; ++ /* key we are going after */ ++ const reiser4_key *key; ++ /* coord we will store result in */ ++ coord_t *coord; ++ /* type of lock to take on target node */ ++ znode_lock_mode lock_mode; ++ /* lookup bias. See comments at the declaration of lookup_bias */ ++ lookup_bias bias; ++ /* lock level: level starting from which tree traversal starts taking ++ * write locks. */ ++ tree_level lock_level; ++ /* level where search will stop. Either item will be found between ++ lock_level and stop_level, or CBK_COORD_NOTFOUND will be ++ returned. ++ */ ++ tree_level stop_level; ++ /* level we are currently at */ ++ tree_level level; ++ /* block number of @active node. Tree traversal operates on two ++ nodes: active and parent. */ ++ reiser4_block_nr block; ++ /* put here error message to be printed by caller */ ++ const char *error; ++ /* result passed back to caller */ ++ lookup_result result; ++ /* lock handles for active and parent */ ++ lock_handle *parent_lh; ++ lock_handle *active_lh; ++ reiser4_key ld_key; ++ reiser4_key rd_key; ++ /* flags, passed to the cbk routine. Bits of this bitmask are defined ++ in tree.h:cbk_flags enum. */ ++ __u32 flags; ++ ra_info_t *ra_info; ++ struct inode *object; ++} cbk_handle; ++ ++extern znode_lock_mode cbk_lock_mode(tree_level level, cbk_handle * h); ++ ++/* eottl.c */ ++extern int handle_eottl(cbk_handle *h, int *outcome); ++ ++int lookup_multikey(cbk_handle * handle, int nr_keys); ++int lookup_couple(reiser4_tree * tree, ++ const reiser4_key * key1, const reiser4_key * key2, ++ coord_t * coord1, coord_t * coord2, ++ lock_handle * lh1, lock_handle * lh2, ++ znode_lock_mode lock_mode, lookup_bias bias, ++ tree_level lock_level, tree_level stop_level, __u32 flags, ++ int *result1, int *result2); ++ ++ ++static inline void read_lock_tree(reiser4_tree *tree) ++{ ++ /* check that tree is not locked */ ++ assert("", (LOCK_CNT_NIL(rw_locked_tree) && ++ LOCK_CNT_NIL(read_locked_tree) && ++ LOCK_CNT_NIL(write_locked_tree))); ++ /* check that spinlocks of lower priorities are not held */ ++ assert("", (LOCK_CNT_NIL(spin_locked_txnh) && ++ LOCK_CNT_NIL(rw_locked_dk) && ++ LOCK_CNT_NIL(spin_locked_stack))); ++ ++ read_lock(&(tree->tree_lock)); ++ ++ LOCK_CNT_INC(read_locked_tree); ++ LOCK_CNT_INC(rw_locked_tree); ++ LOCK_CNT_INC(spin_locked); ++} ++ ++static inline void read_unlock_tree(reiser4_tree *tree) ++{ ++ assert("nikita-1375", LOCK_CNT_GTZ(read_locked_tree)); ++ assert("nikita-1376", LOCK_CNT_GTZ(rw_locked_tree)); ++ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked)); ++ ++ LOCK_CNT_DEC(read_locked_tree); ++ LOCK_CNT_DEC(rw_locked_tree); ++ LOCK_CNT_DEC(spin_locked); ++ ++ read_unlock(&(tree->tree_lock)); ++} ++ ++static inline void write_lock_tree(reiser4_tree *tree) ++{ ++ /* check that tree is not locked */ ++ assert("", (LOCK_CNT_NIL(rw_locked_tree) && ++ LOCK_CNT_NIL(read_locked_tree) && ++ LOCK_CNT_NIL(write_locked_tree))); ++ /* check that spinlocks of lower priorities are not held */ ++ assert("", (LOCK_CNT_NIL(spin_locked_txnh) && ++ LOCK_CNT_NIL(rw_locked_dk) && ++ LOCK_CNT_NIL(spin_locked_stack))); ++ ++ write_lock(&(tree->tree_lock)); ++ ++ 
LOCK_CNT_INC(write_locked_tree); ++ LOCK_CNT_INC(rw_locked_tree); ++ LOCK_CNT_INC(spin_locked); ++} ++ ++static inline void write_unlock_tree(reiser4_tree *tree) ++{ ++ assert("nikita-1375", LOCK_CNT_GTZ(write_locked_tree)); ++ assert("nikita-1376", LOCK_CNT_GTZ(rw_locked_tree)); ++ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked)); ++ ++ LOCK_CNT_DEC(write_locked_tree); ++ LOCK_CNT_DEC(rw_locked_tree); ++ LOCK_CNT_DEC(spin_locked); ++ ++ write_unlock(&(tree->tree_lock)); ++} ++ ++static inline void read_lock_dk(reiser4_tree *tree) ++{ ++ /* check that dk is not locked */ ++ assert("", (LOCK_CNT_NIL(rw_locked_dk) && ++ LOCK_CNT_NIL(read_locked_dk) && ++ LOCK_CNT_NIL(write_locked_dk))); ++ /* check that spinlocks of lower priorities are not held */ ++ assert("", LOCK_CNT_NIL(spin_locked_stack)); ++ ++ read_lock(&((tree)->dk_lock)); ++ ++ LOCK_CNT_INC(read_locked_dk); ++ LOCK_CNT_INC(rw_locked_dk); ++ LOCK_CNT_INC(spin_locked); ++} ++ ++static inline void read_unlock_dk(reiser4_tree *tree) ++{ ++ assert("nikita-1375", LOCK_CNT_GTZ(read_locked_dk)); ++ assert("nikita-1376", LOCK_CNT_GTZ(rw_locked_dk)); ++ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked)); ++ ++ LOCK_CNT_DEC(read_locked_dk); ++ LOCK_CNT_DEC(rw_locked_dk); ++ LOCK_CNT_DEC(spin_locked); ++ ++ read_unlock(&(tree->dk_lock)); ++} ++ ++static inline void write_lock_dk(reiser4_tree *tree) ++{ ++ /* check that dk is not locked */ ++ assert("", (LOCK_CNT_NIL(rw_locked_dk) && ++ LOCK_CNT_NIL(read_locked_dk) && ++ LOCK_CNT_NIL(write_locked_dk))); ++ /* check that spinlocks of lower priorities are not held */ ++ assert("", LOCK_CNT_NIL(spin_locked_stack)); ++ ++ write_lock(&((tree)->dk_lock)); ++ ++ LOCK_CNT_INC(write_locked_dk); ++ LOCK_CNT_INC(rw_locked_dk); ++ LOCK_CNT_INC(spin_locked); ++} ++ ++static inline void write_unlock_dk(reiser4_tree *tree) ++{ ++ assert("nikita-1375", LOCK_CNT_GTZ(write_locked_dk)); ++ assert("nikita-1376", LOCK_CNT_GTZ(rw_locked_dk)); ++ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked)); ++ ++ LOCK_CNT_DEC(write_locked_dk); ++ LOCK_CNT_DEC(rw_locked_dk); ++ LOCK_CNT_DEC(spin_locked); ++ ++ write_unlock(&(tree->dk_lock)); ++} ++ ++/* estimate api. Implementation is in estimate.c */ ++reiser4_block_nr estimate_one_insert_item(reiser4_tree *); ++reiser4_block_nr estimate_one_insert_into_item(reiser4_tree *); ++reiser4_block_nr estimate_insert_flow(tree_level); ++reiser4_block_nr estimate_one_item_removal(reiser4_tree *); ++reiser4_block_nr calc_estimate_one_insert(tree_level); ++reiser4_block_nr estimate_dirty_cluster(struct inode *); ++reiser4_block_nr estimate_insert_cluster(struct inode *); ++reiser4_block_nr estimate_update_cluster(struct inode *); ++ ++ ++/* __REISER4_TREE_H__ */ ++#endif ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ scroll-step: 1 ++ End: ++*/ +Index: linux-2.6.16/fs/reiser4/tree_mod.c +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/tree_mod.c +@@ -0,0 +1,383 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++/* ++ * Functions to add/delete new nodes to/from the tree. ++ * ++ * Functions from this file are used by carry (see carry*) to handle: ++ * ++ * . insertion of new formatted node into tree ++ * ++ * . addition of new tree root, increasing tree height ++ * ++ * . 
removing tree root, decreasing tree height
++ *
++ */
++
++#include "forward.h"
++#include "debug.h"
++#include "dformat.h"
++#include "key.h"
++#include "coord.h"
++#include "plugin/plugin.h"
++#include "jnode.h"
++#include "znode.h"
++#include "tree_mod.h"
++#include "block_alloc.h"
++#include "tree_walk.h"
++#include "tree.h"
++#include "super.h"
++
++#include <linux/err.h>
++
++static int add_child_ptr(znode * parent, znode * child);
++/* warning only issued if error is not -E_REPEAT */
++#define ewarning( error, ... ) \
++	if( ( error ) != -E_REPEAT ) \
++		warning( __VA_ARGS__ )
++
++/* allocate a new node at @level, immediately to the right of @brother. */
++znode *new_node(znode * brother /* existing left neighbor of new node */ ,
++		tree_level level /* tree level at which new node is to
++				  * be allocated */ )
++{
++	znode *result;
++	int retcode;
++	reiser4_block_nr blocknr;
++
++	assert("nikita-930", brother != NULL);
++	assert("umka-264", level < REAL_MAX_ZTREE_HEIGHT);
++
++	retcode = assign_fake_blocknr_formatted(&blocknr);
++	if (retcode == 0) {
++		result =
++		    zget(znode_get_tree(brother), &blocknr, NULL, level,
++			 get_gfp_mask());
++		if (IS_ERR(result)) {
++			ewarning(PTR_ERR(result), "nikita-929",
++				 "Cannot allocate znode for carry: %li",
++				 PTR_ERR(result));
++			return result;
++		}
++		/* cheap test, can be executed even when debugging is off */
++		if (!znode_just_created(result)) {
++			warning("nikita-2213",
++				"Allocated already existing block: %llu",
++				(unsigned long long)blocknr);
++			zput(result);
++			return ERR_PTR(RETERR(-EIO));
++		}
++
++		assert("nikita-931", result != NULL);
++		result->nplug = znode_get_tree(brother)->nplug;
++		assert("nikita-933", result->nplug != NULL);
++
++		retcode = zinit_new(result, get_gfp_mask());
++		if (retcode == 0) {
++			ZF_SET(result, JNODE_CREATED);
++			zrelse(result);
++		} else {
++			zput(result);
++			result = ERR_PTR(retcode);
++		}
++	} else {
++		/* failure to allocate a new node during balancing.
++		   This should never happen. Ever. Returning -E_REPEAT
++		   is not a viable solution, because "out of disk space"
++		   is not a transient error that will go away by itself.
++		 */
++		ewarning(retcode, "nikita-928",
++			 "Cannot allocate block for carry: %i", retcode);
++		result = ERR_PTR(retcode);
++	}
++	assert("nikita-1071", result != NULL);
++	return result;
++}
++
++/* allocate new root and add it to the tree
++
++   This helper function is called by add_new_root().
++
++*/
++znode *add_tree_root(znode * old_root /* existing tree root */ ,
++		     znode * fake /* "fake" znode */ )
++{
++	reiser4_tree *tree = znode_get_tree(old_root);
++	znode *new_root = NULL;	/* to shut gcc up */
++	int result;
++
++	assert("nikita-1069", old_root != NULL);
++	assert("umka-262", fake != NULL);
++	assert("umka-263", tree != NULL);
++
++	/* "fake" znode---one always hanging just above the current root.
++	   This node is locked when a new root is created or the existing
++	   root is deleted. Downward tree traversal takes a lock on it
++	   before taking a lock on the root node. This avoids race
++	   conditions with root manipulations.
++
++	 */
++	assert("nikita-1348", znode_above_root(fake));
++	assert("nikita-1211", znode_is_root(old_root));
++
++	result = 0;
++	if (tree->height >= REAL_MAX_ZTREE_HEIGHT) {
++		warning("nikita-1344", "Tree is too tall: %i", tree->height);
++		/* ext2 returns -ENOSPC when it runs out of free inodes with a
++		   following comment (fs/ext2/ialloc.c:441): Is it really
++		   ENOSPC?
++
++		   -EXFULL? -EINVAL?
++		 */
++		result = RETERR(-ENOSPC);
++	} else {
++		/* Allocate block for new root. 
It's not that ++ important where it will be allocated, as root is ++ almost always in memory. Moreover, allocate on ++ flush can be going here. ++ */ ++ assert("nikita-1448", znode_is_root(old_root)); ++ new_root = new_node(fake, tree->height + 1); ++ if (!IS_ERR(new_root) && (result = zload(new_root)) == 0) { ++ lock_handle rlh; ++ ++ init_lh(&rlh); ++ result = ++ longterm_lock_znode(&rlh, new_root, ++ ZNODE_WRITE_LOCK, ++ ZNODE_LOCK_LOPRI); ++ if (result == 0) { ++ parent_coord_t *in_parent; ++ ++ znode_make_dirty(fake); ++ ++ /* new root is a child of "fake" node */ ++ write_lock_tree(tree); ++ ++ ++tree->height; ++ ++ /* recalculate max balance overhead */ ++ tree->estimate_one_insert = ++ estimate_one_insert_item(tree); ++ ++ tree->root_block = *znode_get_block(new_root); ++ in_parent = &new_root->in_parent; ++ init_parent_coord(in_parent, fake); ++ /* manually insert new root into sibling ++ * list. With this all nodes involved into ++ * balancing are connected after balancing is ++ * done---useful invariant to check. */ ++ sibling_list_insert_nolock(new_root, NULL); ++ write_unlock_tree(tree); ++ ++ /* insert into new root pointer to the ++ @old_root. */ ++ assert("nikita-1110", ++ WITH_DATA(new_root, ++ node_is_empty(new_root))); ++ write_lock_dk(tree); ++ znode_set_ld_key(new_root, min_key()); ++ znode_set_rd_key(new_root, max_key()); ++ write_unlock_dk(tree); ++ if (REISER4_DEBUG) { ++ ZF_CLR(old_root, JNODE_LEFT_CONNECTED); ++ ZF_CLR(old_root, JNODE_RIGHT_CONNECTED); ++ ZF_SET(old_root, JNODE_ORPHAN); ++ } ++ result = add_child_ptr(new_root, old_root); ++ done_lh(&rlh); ++ } ++ zrelse(new_root); ++ } ++ } ++ if (result != 0) ++ new_root = ERR_PTR(result); ++ return new_root; ++} ++ ++/* build &reiser4_item_data for inserting child pointer ++ ++ Build &reiser4_item_data that can be later used to insert pointer to @child ++ in its parent. ++ ++*/ ++void build_child_ptr_data(znode * child /* node pointer to which will be ++ * inserted */ , ++ reiser4_item_data * data /* where to store result */ ) ++{ ++ assert("nikita-1116", child != NULL); ++ assert("nikita-1117", data != NULL); ++ ++ /* ++ * NOTE: use address of child's blocknr as address of data to be ++ * inserted. As result of this data gets into on-disk structure in cpu ++ * byte order. internal's create_hook converts it to little endian byte ++ * order. ++ */ ++ data->data = (char *)znode_get_block(child); ++ /* data -> data is kernel space */ ++ data->user = 0; ++ data->length = sizeof(reiser4_block_nr); ++ /* FIXME-VS: hardcoded internal item? */ ++ ++ /* AUDIT: Is it possible that "item_plugin_by_id" may find nothing? */ ++ data->iplug = item_plugin_by_id(NODE_POINTER_ID); ++} ++ ++/* add pointer to @child into empty @parent. ++ ++ This is used when pointer to old root is inserted into new root which is ++ empty. 
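++
++   add_tree_root() above calls this right after installing the new root:
++   it builds the item data with build_child_ptr_data() and creates the
++   first internal item in @parent, keyed by @child's left delimiting key.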
++*/ ++static int add_child_ptr(znode * parent, znode * child) ++{ ++ coord_t coord; ++ reiser4_item_data data; ++ int result; ++ reiser4_key key; ++ ++ assert("nikita-1111", parent != NULL); ++ assert("nikita-1112", child != NULL); ++ assert("nikita-1115", ++ znode_get_level(parent) == znode_get_level(child) + 1); ++ ++ result = zload(parent); ++ if (result != 0) ++ return result; ++ assert("nikita-1113", node_is_empty(parent)); ++ coord_init_first_unit(&coord, parent); ++ ++ build_child_ptr_data(child, &data); ++ data.arg = NULL; ++ ++ read_lock_dk(znode_get_tree(parent)); ++ key = *znode_get_ld_key(child); ++ read_unlock_dk(znode_get_tree(parent)); ++ ++ result = node_plugin_by_node(parent)->create_item(&coord, &key, &data, ++ NULL); ++ znode_make_dirty(parent); ++ zrelse(parent); ++ return result; ++} ++ ++/* actually remove tree root */ ++static int kill_root(reiser4_tree * tree /* tree from which root is being ++ * removed */ , ++ znode * old_root /* root node that is being removed */ , ++ znode * new_root /* new root---sole child of * ++ * @old_root */ , ++ const reiser4_block_nr * new_root_blk /* disk address of ++ * @new_root */ ) ++{ ++ znode *uber; ++ int result; ++ lock_handle handle_for_uber; ++ ++ assert("umka-265", tree != NULL); ++ assert("nikita-1198", new_root != NULL); ++ assert("nikita-1199", ++ znode_get_level(new_root) + 1 == znode_get_level(old_root)); ++ ++ assert("nikita-1201", znode_is_write_locked(old_root)); ++ ++ assert("nikita-1203", ++ disk_addr_eq(new_root_blk, znode_get_block(new_root))); ++ ++ init_lh(&handle_for_uber); ++ /* obtain and lock "fake" znode protecting changes in tree height. */ ++ result = get_uber_znode(tree, ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI, ++ &handle_for_uber); ++ if (result == 0) { ++ uber = handle_for_uber.node; ++ ++ znode_make_dirty(uber); ++ ++ /* don't take long term lock a @new_root. Take spinlock. */ ++ ++ write_lock_tree(tree); ++ ++ tree->root_block = *new_root_blk; ++ --tree->height; ++ ++ /* recalculate max balance overhead */ ++ tree->estimate_one_insert = estimate_one_insert_item(tree); ++ ++ assert("nikita-1202", ++ tree->height == znode_get_level(new_root)); ++ ++ /* new root is child on "fake" node */ ++ init_parent_coord(&new_root->in_parent, uber); ++ ++uber->c_count; ++ ++ /* sibling_list_insert_nolock(new_root, NULL); */ ++ write_unlock_tree(tree); ++ ++ /* reinitialise old root. */ ++ result = node_plugin_by_node(old_root)->init(old_root); ++ znode_make_dirty(old_root); ++ if (result == 0) { ++ assert("nikita-1279", node_is_empty(old_root)); ++ ZF_SET(old_root, JNODE_HEARD_BANSHEE); ++ old_root->c_count = 0; ++ } ++ } ++ done_lh(&handle_for_uber); ++ ++ return result; ++} ++ ++/* remove tree root ++ ++ This function removes tree root, decreasing tree height by one. Tree root ++ and its only child (that is going to become new tree root) are write locked ++ at the entry. ++ ++ To remove tree root we need to take lock on special "fake" znode that ++ protects changes of tree height. See comments in add_tree_root() for more ++ on this. ++ ++ Also parent pointers have to be updated in ++ old and new root. To simplify code, function is split into two parts: outer ++ kill_tree_root() collects all necessary arguments and calls kill_root() ++ to do the actual job. 
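++   Like add_tree_root(), kill_root() first write-locks the "fake" (uber)
++   znode, since that znode guards changes of the tree height.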
++ ++*/ ++int kill_tree_root(znode * old_root /* tree root that we are removing */ ) ++{ ++ int result; ++ coord_t down_link; ++ znode *new_root; ++ reiser4_tree *tree; ++ ++ assert("umka-266", current_tree != NULL); ++ assert("nikita-1194", old_root != NULL); ++ assert("nikita-1196", znode_is_root(old_root)); ++ assert("nikita-1200", node_num_items(old_root) == 1); ++ assert("nikita-1401", znode_is_write_locked(old_root)); ++ ++ coord_init_first_unit(&down_link, old_root); ++ ++ tree = znode_get_tree(old_root); ++ new_root = child_znode(&down_link, old_root, 0, 1); ++ if (!IS_ERR(new_root)) { ++ result = ++ kill_root(tree, old_root, new_root, ++ znode_get_block(new_root)); ++ zput(new_root); ++ } else ++ result = PTR_ERR(new_root); ++ ++ return result; ++} ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ scroll-step: 1 ++ End: ++*/ +Index: linux-2.6.16/fs/reiser4/tree_mod.h +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/tree_mod.h +@@ -0,0 +1,29 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++/* Functions to add/delete new nodes to/from the tree. See tree_mod.c for ++ * comments. */ ++ ++#if !defined( __REISER4_TREE_MOD_H__ ) ++#define __REISER4_TREE_MOD_H__ ++ ++#include "forward.h" ++ ++znode *new_node(znode * brother, tree_level level); ++znode *add_tree_root(znode * old_root, znode * fake); ++int kill_tree_root(znode * old_root); ++void build_child_ptr_data(znode * child, reiser4_item_data * data); ++ ++/* __REISER4_TREE_MOD_H__ */ ++#endif ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ scroll-step: 1 ++ End: ++*/ +Index: linux-2.6.16/fs/reiser4/tree_walk.c +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/tree_walk.c +@@ -0,0 +1,926 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++/* Routines and macros to: ++ ++ get_left_neighbor() ++ ++ get_right_neighbor() ++ ++ get_parent() ++ ++ get_first_child() ++ ++ get_last_child() ++ ++ various routines to walk the whole tree and do things to it like ++ repack it, or move it to tertiary storage. Please make them as ++ generic as is reasonable. ++ ++*/ ++ ++#include "forward.h" ++#include "debug.h" ++#include "dformat.h" ++#include "coord.h" ++#include "plugin/item/item.h" ++#include "jnode.h" ++#include "znode.h" ++#include "tree_walk.h" ++#include "tree.h" ++#include "super.h" ++ ++/* These macros are used internally in tree_walk.c in attempt to make ++ lock_neighbor() code usable to build lock_parent(), lock_right_neighbor, ++ lock_left_neighbor */ ++#define GET_NODE_BY_PTR_OFFSET(node, off) (*(znode**)(((unsigned long)(node)) + (off))) ++#define FIELD_OFFSET(name) offsetof(znode, name) ++#define PARENT_PTR_OFFSET FIELD_OFFSET(in_parent.node) ++#define LEFT_PTR_OFFSET FIELD_OFFSET(left) ++#define RIGHT_PTR_OFFSET FIELD_OFFSET(right) ++ ++/* This is the generic procedure to get and lock `generic' neighbor (left or ++ right neighbor or parent). It implements common algorithm for all cases of ++ getting lock on neighbor node, only znode structure field is different in ++ each case. 
This is parameterized by ptr_offset argument, which is byte ++ offset for the pointer to the desired neighbor within the current node's ++ znode structure. This function should be called with the tree lock held */ ++static int lock_neighbor( ++ /* resulting lock handle */ ++ lock_handle * result, ++ /* znode to lock */ ++ znode * node, ++ /* pointer to neighbor (or parent) znode field offset, in bytes from ++ the base address of znode structure */ ++ int ptr_offset, ++ /* lock mode for longterm_lock_znode call */ ++ znode_lock_mode mode, ++ /* lock request for longterm_lock_znode call */ ++ znode_lock_request req, ++ /* GN_* flags */ ++ int flags, int rlocked) ++{ ++ reiser4_tree *tree = znode_get_tree(node); ++ znode *neighbor; ++ int ret; ++ ++ assert("umka-236", node != NULL); ++ assert("umka-237", tree != NULL); ++ assert_rw_locked(&(tree->tree_lock)); ++ ++ if (flags & GN_TRY_LOCK) ++ req |= ZNODE_LOCK_NONBLOCK; ++ if (flags & GN_SAME_ATOM) ++ req |= ZNODE_LOCK_DONT_FUSE; ++ ++ /* get neighbor's address by using of sibling link, quit while loop ++ (and return) if link is not available. */ ++ while (1) { ++ neighbor = GET_NODE_BY_PTR_OFFSET(node, ptr_offset); ++ ++ /* return -E_NO_NEIGHBOR if parent or side pointer is NULL or if ++ * node pointed by it is not connected. ++ * ++ * However, GN_ALLOW_NOT_CONNECTED option masks "connected" ++ * check and allows passing reference to not connected znode to ++ * subsequent longterm_lock_znode() call. This kills possible ++ * busy loop if we are trying to get longterm lock on locked but ++ * not yet connected parent node. */ ++ if (neighbor == NULL || !((flags & GN_ALLOW_NOT_CONNECTED) ++ || znode_is_connected(neighbor))) { ++ return RETERR(-E_NO_NEIGHBOR); ++ } ++ ++ /* protect it from deletion. */ ++ zref(neighbor); ++ ++ rlocked ? read_unlock_tree(tree) : write_unlock_tree(tree); ++ ++ ret = longterm_lock_znode(result, neighbor, mode, req); ++ ++ /* The lock handle obtains its own reference, release the one from above. */ ++ zput(neighbor); ++ ++ rlocked ? read_lock_tree(tree) : write_lock_tree(tree); ++ ++ /* restart if node we got reference to is being ++ invalidated. we should not get reference to this node ++ again. */ ++ if (ret == -EINVAL) ++ continue; ++ if (ret) ++ return ret; ++ ++ /* check if neighbor link still points to just locked znode; ++ the link could have been changed while the process slept. */ ++ if (neighbor == GET_NODE_BY_PTR_OFFSET(node, ptr_offset)) ++ return 0; ++ ++ /* znode was locked by mistake; unlock it and restart locking ++ process from beginning. */ ++ rlocked ? read_unlock_tree(tree) : write_unlock_tree(tree); ++ longterm_unlock_znode(result); ++ rlocked ? read_lock_tree(tree) : write_lock_tree(tree); ++ } ++} ++ ++/* get parent node with longterm lock, accepts GN* flags. 
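++   A typical call, as a sketch (this is exactly what the
++   reiser4_get_parent() wrapper in tree_walk.h does):
++
++	reiser4_get_parent_flags(&lh, node, ZNODE_READ_LOCK,
++				 GN_ALLOW_NOT_CONNECTED);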
*/
++int reiser4_get_parent_flags(lock_handle * lh /* resulting lock handle */ ,
++			     znode * node /* child node */ ,
++			     znode_lock_mode mode
++			     /* type of lock: read or write */ ,
++			     int flags /* GN_* flags */ )
++{
++	int result;
++
++	read_lock_tree(znode_get_tree(node));
++	result = lock_neighbor(lh, node, PARENT_PTR_OFFSET, mode,
++			       ZNODE_LOCK_HIPRI, flags, 1);
++	read_unlock_tree(znode_get_tree(node));
++	return result;
++}
++
++/* wrapper function to lock the right or left neighbor depending on the
++   GN_GO_LEFT bit in the @flags parameter */
++/* Audited by: umka (2002.06.14) */
++static inline int
++lock_side_neighbor(lock_handle * result,
++		   znode * node, znode_lock_mode mode, int flags, int rlocked)
++{
++	int ret;
++	int ptr_offset;
++	znode_lock_request req;
++
++	if (flags & GN_GO_LEFT) {
++		ptr_offset = LEFT_PTR_OFFSET;
++		req = ZNODE_LOCK_LOPRI;
++	} else {
++		ptr_offset = RIGHT_PTR_OFFSET;
++		req = ZNODE_LOCK_HIPRI;
++	}
++
++	ret =
++	    lock_neighbor(result, node, ptr_offset, mode, req, flags, rlocked);
++
++	if (ret == -E_NO_NEIGHBOR)	/* if we walk left or right, -E_NO_NEIGHBOR
++					 * does not guarantee that the neighbor is
++					 * absent in the tree; in this case we return
++					 * -ENOENT -- it means the neighbor was at
++					 * least not found in the cache */
++		return RETERR(-ENOENT);
++
++	return ret;
++}
++
++#if REISER4_DEBUG
++
++int check_sibling_list(znode * node)
++{
++	znode *scan;
++	znode *next;
++
++	assert("nikita-3283", LOCK_CNT_GTZ(write_locked_tree));
++
++	if (node == NULL)
++		return 1;
++
++	if (ZF_ISSET(node, JNODE_RIP))
++		return 1;
++
++	assert("nikita-3270", node != NULL);
++	assert_rw_write_locked(&(znode_get_tree(node)->tree_lock));
++
++	for (scan = node; znode_is_left_connected(scan); scan = next) {
++		next = scan->left;
++		if (next != NULL && !ZF_ISSET(next, JNODE_RIP)) {
++			assert("nikita-3271", znode_is_right_connected(next));
++			assert("nikita-3272", next->right == scan);
++		} else
++			break;
++	}
++	for (scan = node; znode_is_right_connected(scan); scan = next) {
++		next = scan->right;
++		if (next != NULL && !ZF_ISSET(next, JNODE_RIP)) {
++			assert("nikita-3273", znode_is_left_connected(next));
++			assert("nikita-3274", next->left == scan);
++		} else
++			break;
++	}
++	return 1;
++}
++
++#endif
++
++/* Znode sibling pointer maintenance. */
++
++/* Znode sibling pointers are established between any neighboring nodes which
++   are in cache. There are two znode state bits (JNODE_LEFT_CONNECTED,
++   JNODE_RIGHT_CONNECTED); if the left or right sibling pointer contains an
++   actual value (even NULL), the corresponding JNODE_*_CONNECTED bit is set.
++
++   Reiser4 tree operations which may allocate new znodes (CBK, tree balancing)
++   take care of searching (a hash table lookup may be required) for znode
++   neighbors, establishing sibling pointers between them and setting the
++   JNODE_*_CONNECTED state bits. */
++
++/* adjusting of sibling pointers and `connected' states for two
++   neighbors; works if one neighbor is NULL (was not found). 
*/ ++ ++/* FIXME-VS: this is unstatic-ed to use in tree.c in prepare_twig_cut */ ++void link_left_and_right(znode * left, znode * right) ++{ ++ assert("nikita-3275", check_sibling_list(left)); ++ assert("nikita-3275", check_sibling_list(right)); ++ ++ if (left != NULL) { ++ if (left->right == NULL) { ++ left->right = right; ++ ZF_SET(left, JNODE_RIGHT_CONNECTED); ++ ++ ON_DEBUG(left->right_version = ++ atomic_inc_return(&delim_key_version); ++ ); ++ ++ } else if (ZF_ISSET(left->right, JNODE_HEARD_BANSHEE) ++ && left->right != right) { ++ ++ ON_DEBUG(left->right->left_version = ++ atomic_inc_return(&delim_key_version); ++ left->right_version = ++ atomic_inc_return(&delim_key_version);); ++ ++ left->right->left = NULL; ++ left->right = right; ++ ZF_SET(left, JNODE_RIGHT_CONNECTED); ++ } else ++ /* ++ * there is a race condition in renew_sibling_link() ++ * and assertions below check that it is only one ++ * there. Thread T1 calls renew_sibling_link() without ++ * GN_NO_ALLOC flag. zlook() doesn't find neighbor ++ * node, but before T1 gets to the ++ * link_left_and_right(), another thread T2 creates ++ * neighbor node and connects it. check for ++ * left->right == NULL above protects T1 from ++ * overwriting correct left->right pointer installed ++ * by T2. ++ */ ++ assert("nikita-3302", ++ right == NULL || left->right == right); ++ } ++ if (right != NULL) { ++ if (right->left == NULL) { ++ right->left = left; ++ ZF_SET(right, JNODE_LEFT_CONNECTED); ++ ++ ON_DEBUG(right->left_version = ++ atomic_inc_return(&delim_key_version); ++ ); ++ ++ } else if (ZF_ISSET(right->left, JNODE_HEARD_BANSHEE) ++ && right->left != left) { ++ ++ ON_DEBUG(right->left->right_version = ++ atomic_inc_return(&delim_key_version); ++ right->left_version = ++ atomic_inc_return(&delim_key_version);); ++ ++ right->left->right = NULL; ++ right->left = left; ++ ZF_SET(right, JNODE_LEFT_CONNECTED); ++ ++ } else ++ assert("nikita-3303", ++ left == NULL || right->left == left); ++ } ++ assert("nikita-3275", check_sibling_list(left)); ++ assert("nikita-3275", check_sibling_list(right)); ++} ++ ++/* Audited by: umka (2002.06.14) */ ++static void link_znodes(znode * first, znode * second, int to_left) ++{ ++ if (to_left) ++ link_left_and_right(second, first); ++ else ++ link_left_and_right(first, second); ++} ++ ++/* getting of next (to left or to right, depend on gn_to_left bit in flags) ++ coord's unit position in horizontal direction, even across node ++ boundary. Should be called under tree lock, it protects nonexistence of ++ sibling link on parent level, if lock_side_neighbor() fails with ++ -ENOENT. */ ++static int far_next_coord(coord_t * coord, lock_handle * handle, int flags) ++{ ++ int ret; ++ znode *node; ++ reiser4_tree *tree; ++ ++ assert("umka-243", coord != NULL); ++ assert("umka-244", handle != NULL); ++ assert("zam-1069", handle->node == NULL); ++ ++ ret = ++ (flags & GN_GO_LEFT) ? coord_prev_unit(coord) : ++ coord_next_unit(coord); ++ if (!ret) ++ return 0; ++ ++ ret = ++ lock_side_neighbor(handle, coord->node, ZNODE_READ_LOCK, flags, 0); ++ if (ret) ++ return ret; ++ ++ node = handle->node; ++ tree = znode_get_tree(node); ++ write_unlock_tree(tree); ++ ++ coord_init_zero(coord); ++ ++ /* We avoid synchronous read here if it is specified by flag. 
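++   With GN_ASYNC the caller receives -E_REPEAT once jstartio() has merely
++   started the read, and is expected to retry later instead of blocking
++   (see the GN_ASYNC description in tree_walk.h).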
*/
++	if ((flags & GN_ASYNC) && znode_page(handle->node) == NULL) {
++		ret = jstartio(ZJNODE(handle->node));
++		if (!ret)
++			ret = -E_REPEAT;
++		goto error_locked;
++	}
++
++	/* the corresponding zrelse() should be called by the clients of
++	   far_next_coord(), at the place where this node gets unlocked. */
++	ret = zload(handle->node);
++	if (ret)
++		goto error_locked;
++
++	if (flags & GN_GO_LEFT)
++		coord_init_last_unit(coord, node);
++	else
++		coord_init_first_unit(coord, node);
++
++	if (0) {
++	      error_locked:
++		longterm_unlock_znode(handle);
++	}
++	write_lock_tree(tree);
++	return ret;
++}
++
++/* Very significant function which performs a step in the horizontal
++   direction when a sibling pointer is not available. In fact, it is the
++   only function which does so.
++   Note: this function does not restore the locking status at exit; the
++   caller must take care of proper unlocking and zrelse-ing */
++static int
++renew_sibling_link(coord_t * coord, lock_handle * handle, znode * child,
++		   tree_level level, int flags, int *nr_locked)
++{
++	int ret;
++	int to_left = flags & GN_GO_LEFT;
++	reiser4_block_nr da;
++	/* parent of the neighbor node; we set it to parent until we detect
++	   that child and the neighbor node do not share one parent */
++	znode *side_parent = coord->node;
++	reiser4_tree *tree = znode_get_tree(child);
++	znode *neighbor = NULL;
++
++	assert("umka-245", coord != NULL);
++	assert("umka-246", handle != NULL);
++	assert("umka-247", child != NULL);
++	assert("umka-303", tree != NULL);
++
++	init_lh(handle);
++	write_lock_tree(tree);
++	ret = far_next_coord(coord, handle, flags);
++
++	if (ret) {
++		if (ret != -ENOENT) {
++			write_unlock_tree(tree);
++			return ret;
++		}
++	} else {
++		item_plugin *iplug;
++
++		if (handle->node != NULL) {
++			(*nr_locked)++;
++			side_parent = handle->node;
++		}
++
++		/* does the coord object point to an internal item? We do not
++		   support sibling pointers between znodes for formatted and
++		   unformatted nodes and return -E_NO_NEIGHBOR in that case. */
++		iplug = item_plugin_by_coord(coord);
++		if (!item_is_internal(coord)) {
++			link_znodes(child, NULL, to_left);
++			write_unlock_tree(tree);
++			/* we know there can't be a formatted neighbor */
++			return RETERR(-E_NO_NEIGHBOR);
++		}
++		write_unlock_tree(tree);
++
++		iplug->s.internal.down_link(coord, NULL, &da);
++
++		if (flags & GN_NO_ALLOC) {
++			neighbor = zlook(tree, &da);
++		} else {
++			neighbor =
++			    zget(tree, &da, side_parent, level, get_gfp_mask());
++		}
++
++		if (IS_ERR(neighbor)) {
++			ret = PTR_ERR(neighbor);
++			return ret;
++		}
++
++		if (neighbor)
++			/* update delimiting keys */
++			set_child_delimiting_keys(coord->node, coord, neighbor);
++
++		write_lock_tree(tree);
++	}
++
++	if (likely(neighbor == NULL ||
++		   (znode_get_level(child) == znode_get_level(neighbor)
++		    && child != neighbor)))
++		link_znodes(child, neighbor, to_left);
++	else {
++		warning("nikita-3532",
++			"Sibling nodes on the different levels: %i != %i\n",
++			znode_get_level(child), znode_get_level(neighbor));
++		ret = RETERR(-EIO);
++	}
++
++	write_unlock_tree(tree);
++
++	/* if GN_NO_ALLOC isn't set we keep a reference to the neighbor znode */
++	if (neighbor != NULL && (flags & GN_NO_ALLOC))
++		/* atomic_dec(&ZJNODE(neighbor)->x_count); */
++		zput(neighbor);
++
++	return ret;
++}
++
++/* This function is for establishing a one-side relation. 
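++   connect_znode() below calls it twice: once for the right side and once
++   with GN_GO_LEFT for the left side, always passing GN_NO_ALLOC so that
++   only neighbors already present in the znode cache are linked.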
*/ ++/* Audited by: umka (2002.06.14) */ ++static int connect_one_side(coord_t * coord, znode * node, int flags) ++{ ++ coord_t local; ++ lock_handle handle; ++ int nr_locked; ++ int ret; ++ ++ assert("umka-248", coord != NULL); ++ assert("umka-249", node != NULL); ++ ++ coord_dup_nocheck(&local, coord); ++ ++ init_lh(&handle); ++ ++ ret = ++ renew_sibling_link(&local, &handle, node, znode_get_level(node), ++ flags | GN_NO_ALLOC, &nr_locked); ++ ++ if (handle.node != NULL) { ++ /* complementary operations for zload() and lock() in far_next_coord() */ ++ zrelse(handle.node); ++ longterm_unlock_znode(&handle); ++ } ++ ++ /* we catch error codes which are not interesting for us because we ++ run renew_sibling_link() only for znode connection. */ ++ if (ret == -ENOENT || ret == -E_NO_NEIGHBOR) ++ return 0; ++ ++ return ret; ++} ++ ++/* if @child is not in `connected' state, performs hash searches for left and ++ right neighbor nodes and establishes horizontal sibling links */ ++/* Audited by: umka (2002.06.14), umka (2002.06.15) */ ++int connect_znode(coord_t * parent_coord, znode * child) ++{ ++ reiser4_tree *tree = znode_get_tree(child); ++ int ret = 0; ++ ++ assert("zam-330", parent_coord != NULL); ++ assert("zam-331", child != NULL); ++ assert("zam-332", parent_coord->node != NULL); ++ assert("umka-305", tree != NULL); ++ ++ /* it is trivial to `connect' root znode because it can't have ++ neighbors */ ++ if (znode_above_root(parent_coord->node)) { ++ child->left = NULL; ++ child->right = NULL; ++ ZF_SET(child, JNODE_LEFT_CONNECTED); ++ ZF_SET(child, JNODE_RIGHT_CONNECTED); ++ ++ ON_DEBUG(child->left_version = ++ atomic_inc_return(&delim_key_version); ++ child->right_version = ++ atomic_inc_return(&delim_key_version);); ++ ++ return 0; ++ } ++ ++ /* load parent node */ ++ coord_clear_iplug(parent_coord); ++ ret = zload(parent_coord->node); ++ ++ if (ret != 0) ++ return ret; ++ ++ /* protect `connected' state check by tree_lock */ ++ read_lock_tree(tree); ++ ++ if (!znode_is_right_connected(child)) { ++ read_unlock_tree(tree); ++ /* connect right (default is right) */ ++ ret = connect_one_side(parent_coord, child, GN_NO_ALLOC); ++ if (ret) ++ goto zrelse_and_ret; ++ ++ read_lock_tree(tree); ++ } ++ ++ ret = znode_is_left_connected(child); ++ ++ read_unlock_tree(tree); ++ ++ if (!ret) { ++ ret = ++ connect_one_side(parent_coord, child, ++ GN_NO_ALLOC | GN_GO_LEFT); ++ } else ++ ret = 0; ++ ++ zrelse_and_ret: ++ zrelse(parent_coord->node); ++ ++ return ret; ++} ++ ++/* this function is like renew_sibling_link() but allocates neighbor node if ++ it doesn't exist and `connects' it. It may require making two steps in ++ horizontal direction, first one for neighbor node finding/allocation, ++ second one is for finding neighbor of neighbor to connect freshly allocated ++ znode. */ ++/* Audited by: umka (2002.06.14), umka (2002.06.15) */ ++static int ++renew_neighbor(coord_t * coord, znode * node, tree_level level, int flags) ++{ ++ coord_t local; ++ lock_handle empty[2]; ++ reiser4_tree *tree = znode_get_tree(node); ++ znode *neighbor = NULL; ++ int nr_locked = 0; ++ int ret; ++ ++ assert("umka-250", coord != NULL); ++ assert("umka-251", node != NULL); ++ assert("umka-307", tree != NULL); ++ assert("umka-308", level <= tree->height); ++ ++ /* umka (2002.06.14) ++ Here probably should be a check for given "level" validness. 
++ Something like assert("xxx-yyy", level < REAL_MAX_ZTREE_HEIGHT); ++ */ ++ ++ coord_dup(&local, coord); ++ ++ ret = ++ renew_sibling_link(&local, &empty[0], node, level, ++ flags & ~GN_NO_ALLOC, &nr_locked); ++ if (ret) ++ goto out; ++ ++ /* tree lock is not needed here because we keep parent node(s) locked ++ and reference to neighbor znode incremented */ ++ neighbor = (flags & GN_GO_LEFT) ? node->left : node->right; ++ ++ read_lock_tree(tree); ++ ret = znode_is_connected(neighbor); ++ read_unlock_tree(tree); ++ if (ret) { ++ ret = 0; ++ goto out; ++ } ++ ++ ret = ++ renew_sibling_link(&local, &empty[nr_locked], neighbor, level, ++ flags | GN_NO_ALLOC, &nr_locked); ++ /* second renew_sibling_link() call is used for znode connection only, ++ so we can live with these errors */ ++ if (-ENOENT == ret || -E_NO_NEIGHBOR == ret) ++ ret = 0; ++ ++ out: ++ ++ for (--nr_locked; nr_locked >= 0; --nr_locked) { ++ zrelse(empty[nr_locked].node); ++ longterm_unlock_znode(&empty[nr_locked]); ++ } ++ ++ if (neighbor != NULL) ++ /* decrement znode reference counter without actually ++ releasing it. */ ++ atomic_dec(&ZJNODE(neighbor)->x_count); ++ ++ return ret; ++} ++ ++/* ++ reiser4_get_neighbor() -- lock node's neighbor. ++ ++ reiser4_get_neighbor() locks node's neighbor (left or right one, depends on ++ given parameter) using sibling link to it. If sibling link is not available ++ (i.e. neighbor znode is not in cache) and flags allow read blocks, we go one ++ level up for information about neighbor's disk address. We lock node's ++ parent, if it is common parent for both 'node' and its neighbor, neighbor's ++ disk address is in next (to left or to right) down link from link that points ++ to original node. If not, we need to lock parent's neighbor, read its content ++ and take first(last) downlink with neighbor's disk address. That locking ++ could be done by using sibling link and lock_neighbor() function, if sibling ++ link exists. In another case we have to go level up again until we find ++ common parent or valid sibling link. Then go down ++ allocating/connecting/locking/reading nodes until neighbor of first one is ++ locked. ++ ++ @neighbor: result lock handle, ++ @node: a node which we lock neighbor of, ++ @lock_mode: lock mode {LM_READ, LM_WRITE}, ++ @flags: logical OR of {GN_*} (see description above) subset. ++ ++ @return: 0 if success, negative value if lock was impossible due to an error ++ or lack of neighbor node. 
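++
++   For example, a minimal sketch of read-locking the right neighbor,
++   loading it and climbing through upper levels if the sibling link is
++   missing:
++
++	ret = reiser4_get_neighbor(&lh, node, ZNODE_READ_LOCK,
++				   GN_CAN_USE_UPPER_LEVELS | GN_LOAD_NEIGHBOR);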
++*/ ++ ++/* Audited by: umka (2002.06.14), umka (2002.06.15) */ ++int ++reiser4_get_neighbor(lock_handle * neighbor, znode * node, ++ znode_lock_mode lock_mode, int flags) ++{ ++ reiser4_tree *tree = znode_get_tree(node); ++ lock_handle path[REAL_MAX_ZTREE_HEIGHT]; ++ ++ coord_t coord; ++ ++ tree_level base_level; ++ tree_level h = 0; ++ int ret; ++ ++ assert("umka-252", tree != NULL); ++ assert("umka-253", neighbor != NULL); ++ assert("umka-254", node != NULL); ++ ++ base_level = znode_get_level(node); ++ ++ assert("umka-310", base_level <= tree->height); ++ ++ coord_init_zero(&coord); ++ ++ again: ++ /* first, we try to use simple lock_neighbor() which requires sibling ++ link existence */ ++ read_lock_tree(tree); ++ ret = lock_side_neighbor(neighbor, node, lock_mode, flags, 1); ++ read_unlock_tree(tree); ++ if (!ret) { ++ /* load znode content if it was specified */ ++ if (flags & GN_LOAD_NEIGHBOR) { ++ ret = zload(node); ++ if (ret) ++ longterm_unlock_znode(neighbor); ++ } ++ return ret; ++ } ++ ++ /* only -ENOENT means we may look upward and try to connect ++ @node with its neighbor (if @flags allow us to do it) */ ++ if (ret != -ENOENT || !(flags & GN_CAN_USE_UPPER_LEVELS)) ++ return ret; ++ ++ /* before establishing of sibling link we lock parent node; it is ++ required by renew_neighbor() to work. */ ++ init_lh(&path[0]); ++ ret = reiser4_get_parent(&path[0], node, ZNODE_READ_LOCK); ++ if (ret) ++ return ret; ++ if (znode_above_root(path[0].node)) { ++ longterm_unlock_znode(&path[0]); ++ return RETERR(-E_NO_NEIGHBOR); ++ } ++ ++ while (1) { ++ znode *child = (h == 0) ? node : path[h - 1].node; ++ znode *parent = path[h].node; ++ ++ ret = zload(parent); ++ if (ret) ++ break; ++ ++ ret = find_child_ptr(parent, child, &coord); ++ ++ if (ret) { ++ zrelse(parent); ++ break; ++ } ++ ++ /* try to establish missing sibling link */ ++ ret = renew_neighbor(&coord, child, h + base_level, flags); ++ ++ zrelse(parent); ++ ++ switch (ret) { ++ case 0: ++ /* unlocking of parent znode prevents simple ++ deadlock situation */ ++ done_lh(&path[h]); ++ ++ /* depend on tree level we stay on we repeat first ++ locking attempt ... */ ++ if (h == 0) ++ goto again; ++ ++ /* ... or repeat establishing of sibling link at ++ one level below. */ ++ --h; ++ break; ++ ++ case -ENOENT: ++ /* sibling link is not available -- we go ++ upward. */ ++ init_lh(&path[h + 1]); ++ ret = ++ reiser4_get_parent(&path[h + 1], parent, ++ ZNODE_READ_LOCK); ++ if (ret) ++ goto fail; ++ ++h; ++ if (znode_above_root(path[h].node)) { ++ ret = RETERR(-E_NO_NEIGHBOR); ++ goto fail; ++ } ++ break; ++ ++ case -E_DEADLOCK: ++ /* there was lock request from hi-pri locker. if ++ it is possible we unlock last parent node and ++ re-lock it again. */ ++ for (; check_deadlock(); h--) { ++ done_lh(&path[h]); ++ if (h == 0) ++ goto fail; ++ } ++ ++ break; ++ ++ default: /* other errors. 
*/ ++ goto fail; ++ } ++ } ++ fail: ++ ON_DEBUG(check_lock_node_data(node)); ++ ON_DEBUG(check_lock_data()); ++ ++ /* unlock path */ ++ do { ++ /* FIXME-Zam: when we get here from case -E_DEADLOCK's goto ++ fail; path[0] is already done_lh-ed, therefore ++ longterm_unlock_znode(&path[h]); is not applicable */ ++ done_lh(&path[h]); ++ --h; ++ } while (h + 1 != 0); ++ ++ return ret; ++} ++ ++/* remove node from sibling list */ ++/* Audited by: umka (2002.06.14) */ ++void sibling_list_remove(znode * node) ++{ ++ reiser4_tree *tree; ++ ++ tree = znode_get_tree(node); ++ assert("umka-255", node != NULL); ++ assert_rw_write_locked(&(tree->tree_lock)); ++ assert("nikita-3275", check_sibling_list(node)); ++ ++ write_lock_dk(tree); ++ if (znode_is_right_connected(node) && node->right != NULL && ++ znode_is_left_connected(node) && node->left != NULL) { ++ assert("zam-32245", ++ keyeq(znode_get_rd_key(node), ++ znode_get_ld_key(node->right))); ++ znode_set_rd_key(node->left, znode_get_ld_key(node->right)); ++ } ++ write_unlock_dk(tree); ++ ++ if (znode_is_right_connected(node) && node->right != NULL) { ++ assert("zam-322", znode_is_left_connected(node->right)); ++ node->right->left = node->left; ++ ON_DEBUG(node->right->left_version = ++ atomic_inc_return(&delim_key_version); ++ ); ++ } ++ if (znode_is_left_connected(node) && node->left != NULL) { ++ assert("zam-323", znode_is_right_connected(node->left)); ++ node->left->right = node->right; ++ ON_DEBUG(node->left->right_version = ++ atomic_inc_return(&delim_key_version); ++ ); ++ } ++ ++ ZF_CLR(node, JNODE_LEFT_CONNECTED); ++ ZF_CLR(node, JNODE_RIGHT_CONNECTED); ++ ON_DEBUG(node->left = node->right = NULL; ++ node->left_version = atomic_inc_return(&delim_key_version); ++ node->right_version = atomic_inc_return(&delim_key_version);); ++ assert("nikita-3276", check_sibling_list(node)); ++} ++ ++/* disconnect node from sibling list */ ++void sibling_list_drop(znode * node) ++{ ++ znode *right; ++ znode *left; ++ ++ assert("nikita-2464", node != NULL); ++ assert("nikita-3277", check_sibling_list(node)); ++ ++ right = node->right; ++ if (right != NULL) { ++ assert("nikita-2465", znode_is_left_connected(right)); ++ right->left = NULL; ++ ON_DEBUG(right->left_version = ++ atomic_inc_return(&delim_key_version); ++ ); ++ } ++ left = node->left; ++ if (left != NULL) { ++ assert("zam-323", znode_is_right_connected(left)); ++ left->right = NULL; ++ ON_DEBUG(left->right_version = ++ atomic_inc_return(&delim_key_version); ++ ); ++ } ++ ZF_CLR(node, JNODE_LEFT_CONNECTED); ++ ZF_CLR(node, JNODE_RIGHT_CONNECTED); ++ ON_DEBUG(node->left = node->right = NULL; ++ node->left_version = atomic_inc_return(&delim_key_version); ++ node->right_version = atomic_inc_return(&delim_key_version);); ++} ++ ++/* Insert new node into sibling list. Regular balancing inserts new node ++ after (at right side) existing and locked node (@before), except one case ++ of adding new tree root node. @before should be NULL in that case. 
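++   As the _nolock suffix suggests, the caller is expected to already hold
++   the tree write lock; add_tree_root(), for instance, calls this under
++   write_lock_tree().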
*/ ++void sibling_list_insert_nolock(znode * new, znode * before) ++{ ++ assert("zam-334", new != NULL); ++ assert("nikita-3298", !znode_is_left_connected(new)); ++ assert("nikita-3299", !znode_is_right_connected(new)); ++ assert("nikita-3300", new->left == NULL); ++ assert("nikita-3301", new->right == NULL); ++ assert("nikita-3278", check_sibling_list(new)); ++ assert("nikita-3279", check_sibling_list(before)); ++ ++ if (before != NULL) { ++ assert("zam-333", znode_is_connected(before)); ++ new->right = before->right; ++ new->left = before; ++ ON_DEBUG(new->right_version = ++ atomic_inc_return(&delim_key_version); ++ new->left_version = ++ atomic_inc_return(&delim_key_version);); ++ if (before->right != NULL) { ++ before->right->left = new; ++ ON_DEBUG(before->right->left_version = ++ atomic_inc_return(&delim_key_version); ++ ); ++ } ++ before->right = new; ++ ON_DEBUG(before->right_version = ++ atomic_inc_return(&delim_key_version); ++ ); ++ } else { ++ new->right = NULL; ++ new->left = NULL; ++ ON_DEBUG(new->right_version = ++ atomic_inc_return(&delim_key_version); ++ new->left_version = ++ atomic_inc_return(&delim_key_version);); ++ } ++ ZF_SET(new, JNODE_LEFT_CONNECTED); ++ ZF_SET(new, JNODE_RIGHT_CONNECTED); ++ assert("nikita-3280", check_sibling_list(new)); ++ assert("nikita-3281", check_sibling_list(before)); ++} ++ ++/* ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 80 ++ End: ++*/ +Index: linux-2.6.16/fs/reiser4/tree_walk.h +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/tree_walk.h +@@ -0,0 +1,125 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++/* definitions of reiser4 tree walk functions */ ++ ++#ifndef __FS_REISER4_TREE_WALK_H__ ++#define __FS_REISER4_TREE_WALK_H__ ++ ++#include "debug.h" ++#include "forward.h" ++ ++/* establishes horizontal links between cached znodes */ ++int connect_znode(coord_t * coord, znode * node); ++ ++/* tree traversal functions (reiser4_get_parent(), reiser4_get_neighbor()) ++ have the following common arguments: ++ ++ return codes: ++ ++ @return : 0 - OK, ++ ++ZAM-FIXME-HANS: wrong return code name. Change them all. ++ -ENOENT - neighbor is not in cache, what is detected by sibling ++ link absence. ++ ++ -E_NO_NEIGHBOR - we are sure that neighbor (or parent) node cannot be ++ found (because we are left-/right- most node of the ++ tree, for example). Also, this return code is for ++ reiser4_get_parent() when we see no parent link -- it ++ means that our node is root node. ++ ++ -E_DEADLOCK - deadlock detected (request from high-priority process ++ received), other error codes are conformed to ++ /usr/include/asm/errno.h . ++*/ ++ ++int ++reiser4_get_parent_flags(lock_handle * result, znode * node, ++ znode_lock_mode mode, int flags); ++ ++/* bits definition for reiser4_get_neighbor function `flags' arg. 
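++   The bits are OR-ed together; e.g. (GN_GO_LEFT | GN_LOAD_NEIGHBOR) asks
++   for the left neighbor with its content loaded, as the wrappers further
++   down illustrate.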
*/
++typedef enum {
++	/* If the sibling pointer is NULL, this flag allows get_neighbor() to
++	 * try to find a not-allocated, not-connected neighbor by going through
++	 * upper levels */
++	GN_CAN_USE_UPPER_LEVELS = 0x1,
++	/* lock the left neighbor instead of the right one */
++	GN_GO_LEFT = 0x2,
++	/* automatically load neighbor node content */
++	GN_LOAD_NEIGHBOR = 0x4,
++	/* return -E_REPEAT if can't lock */
++	GN_TRY_LOCK = 0x8,
++	/* used internally in tree_walk.c; causes renew_sibling to not
++	   allocate a neighbor znode, but only search for it in the znode
++	   cache */
++	GN_NO_ALLOC = 0x10,
++	/* do not go across atom boundaries */
++	GN_SAME_ATOM = 0x20,
++	/* allow locking not connected nodes */
++	GN_ALLOW_NOT_CONNECTED = 0x40,
++	/* Avoid synchronous jload; instead, call jstartio() and return -E_REPEAT. */
++	GN_ASYNC = 0x80
++} znode_get_neigbor_flags;
++
++/* A commonly used wrapper for reiser4_get_parent_flags(). */
++static inline int reiser4_get_parent(lock_handle * result, znode * node,
++				     znode_lock_mode mode)
++{
++	return reiser4_get_parent_flags(result, node, mode,
++					GN_ALLOW_NOT_CONNECTED);
++}
++
++int reiser4_get_neighbor(lock_handle * neighbor, znode * node,
++			 znode_lock_mode lock_mode, int flags);
++
++/* there are wrappers for the most common usages of reiser4_get_neighbor() */
++static inline int
++reiser4_get_left_neighbor(lock_handle * result, znode * node, int lock_mode,
++			  int flags)
++{
++	return reiser4_get_neighbor(result, node, lock_mode,
++				    flags | GN_GO_LEFT);
++}
++
++static inline int
++reiser4_get_right_neighbor(lock_handle * result, znode * node, int lock_mode,
++			   int flags)
++{
++	ON_DEBUG(check_lock_node_data(node));
++	ON_DEBUG(check_lock_data());
++	return reiser4_get_neighbor(result, node, lock_mode,
++				    flags & (~GN_GO_LEFT));
++}
++
++extern void sibling_list_remove(znode * node);
++extern void sibling_list_drop(znode * node);
++extern void sibling_list_insert_nolock(znode * new, znode * before);
++extern void link_left_and_right(znode * left, znode * right);
++
++/* Functions called by tree_walk() when tree_walk() ... */
++struct tree_walk_actor {
++	/* ... meets a formatted node, */
++	int (*process_znode) (tap_t *, void *);
++	/* ... meets an extent, */
++	int (*process_extent) (tap_t *, void *);
++	/* ... begins tree traversal or repeats it after -E_REPEAT was returned by
++	 * node or extent processing functions. */
++	int (*before) (void *);
++};
++
++#if REISER4_DEBUG
++int check_sibling_list(znode * node);
++#else
++#define check_sibling_list(n) (1)
++#endif
++
++#endif				/* __FS_REISER4_TREE_WALK_H__ */
++
++/*
++   Local variables:
++   c-indentation-style: "K&R"
++   mode-name: "LC"
++   c-basic-offset: 8
++   tab-width: 8
++   fill-column: 120
++   End:
++*/
+Index: linux-2.6.16/fs/reiser4/txnmgr.c
+===================================================================
+--- /dev/null
++++ linux-2.6.16/fs/reiser4/txnmgr.c
+@@ -0,0 +1,3158 @@
++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
++ * reiser4/README */
++
++/* Joshua MacDonald wrote the first draft of this code. */
++
++/* ZAM-LONGTERM-FIXME-HANS: The locking in this file is badly designed, and a
++filesystem scales only as well as its worst locking design. You need to
++substantially restructure this code. Josh was not as experienced a programmer
++as you. Particularly review how the locking style differs from what you did
++for znodes using hi-lo priority locking, and present to me an opinion on
++whether the differences are well founded. 
*/ ++ ++/* I cannot help but to disagree with the sentiment above. Locking of ++ * transaction manager is _not_ badly designed, and, at the very least, is not ++ * the scaling bottleneck. Scaling bottleneck is _exactly_ hi-lo priority ++ * locking on znodes, especially on the root node of the tree. --nikita, ++ * 2003.10.13 */ ++ ++/* The txnmgr is a set of interfaces that keep track of atoms and transcrash handles. The ++ txnmgr processes capture_block requests and manages the relationship between jnodes and ++ atoms through the various stages of a transcrash, and it also oversees the fusion and ++ capture-on-copy processes. The main difficulty with this task is maintaining a ++ deadlock-free lock ordering between atoms and jnodes/handles. The reason for the ++ difficulty is that jnodes, handles, and atoms contain pointer circles, and the cycle ++ must be broken. The main requirement is that atom-fusion be deadlock free, so once you ++ hold the atom_lock you may then wait to acquire any jnode or handle lock. This implies ++ that any time you check the atom-pointer of a jnode or handle and then try to lock that ++ atom, you must use trylock() and possibly reverse the order. ++ ++ This code implements the design documented at: ++ ++ http://namesys.com/txn-doc.html ++ ++ZAM-FIXME-HANS: update v4.html to contain all of the information present in the above (but updated), and then remove the ++above document and reference the new. Be sure to provide some credit to Josh. I already have some writings on this ++topic in v4.html, but they are lacking in details present in the above. Cure that. Remember to write for the bright 12 ++year old --- define all technical terms used. ++ ++*/ ++ ++/* Thoughts on the external transaction interface: ++ ++ In the current code, a TRANSCRASH handle is created implicitly by init_context() (which ++ creates state that lasts for the duration of a system call and is called at the start ++ of ReiserFS methods implementing VFS operations), and closed by reiser4_exit_context(), ++ occupying the scope of a single system call. We wish to give certain applications an ++ interface to begin and close (commit) transactions. Since our implementation of ++ transactions does not yet support isolation, allowing an application to open a ++ transaction implies trusting it to later close the transaction. Part of the ++ transaction interface will be aimed at enabling that trust, but the interface for ++ actually using transactions is fairly narrow. ++ ++ BEGIN_TRANSCRASH: Returns a transcrash identifier. It should be possible to translate ++ this identifier into a string that a shell-script could use, allowing you to start a ++ transaction by issuing a command. Once open, the transcrash should be set in the task ++ structure, and there should be options (I suppose) to allow it to be carried across ++ fork/exec. A transcrash has several options: ++ ++ - READ_FUSING or WRITE_FUSING: The default policy is for txn-capture to capture only ++ on writes (WRITE_FUSING) and allow "dirty reads". If the application wishes to ++ capture on reads as well, it should set READ_FUSING. ++ ++ - TIMEOUT: Since a non-isolated transcrash cannot be undone, every transcrash must ++ eventually close (or else the machine must crash). If the application dies an ++ unexpected death with an open transcrash, for example, or if it hangs for a long ++ duration, one solution (to avoid crashing the machine) is to simply close it anyway. 
++ This is a dangerous option, but it is one way to solve the problem until isolated ++ transcrashes are available for untrusted applications. ++ ++ It seems to be what databases do, though it is unclear how one avoids a DoS attack ++ creating a vulnerability based on resource starvation. Guaranteeing that some ++ minimum amount of computational resources are made available would seem more correct ++ than guaranteeing some amount of time. When we again have someone to code the work, ++ this issue should be considered carefully. -Hans ++ ++ RESERVE_BLOCKS: A running transcrash should indicate to the transaction manager how ++ many dirty blocks it expects. The reserve_blocks interface should be called at a point ++ where it is safe for the application to fail, because the system may not be able to ++ grant the allocation and the application must be able to back-out. For this reason, ++ the number of reserve-blocks can also be passed as an argument to BEGIN_TRANSCRASH, but ++ the application may also wish to extend the allocation after beginning its transcrash. ++ ++ CLOSE_TRANSCRASH: The application closes the transcrash when it is finished making ++ modifications that require transaction protection. When isolated transactions are ++ supported the CLOSE operation is replaced by either COMMIT or ABORT. For example, if a ++ RESERVE_BLOCKS call fails for the application, it should "abort" by calling ++ CLOSE_TRANSCRASH, even though it really commits any changes that were made (which is ++ why, for safety, the application should call RESERVE_BLOCKS before making any changes). ++ ++ For actually implementing these out-of-system-call-scopped transcrashes, the ++ reiser4_context has a "txn_handle *trans" pointer that may be set to an open ++ transcrash. Currently there are no dynamically-allocated transcrashes, but there is a ++ "kmem_cache_t *_txnh_slab" created for that purpose in this file. ++*/ ++ ++/* Extending the other system call interfaces for future transaction features: ++ ++ Specialized applications may benefit from passing flags to the ordinary system call ++ interface such as read(), write(), or stat(). For example, the application specifies ++ WRITE_FUSING by default but wishes to add that a certain read() command should be ++ treated as READ_FUSING. But which read? Is it the directory-entry read, the stat-data ++ read, or the file-data read? These issues are straight-forward, but there are a lot of ++ them and adding the necessary flags-passing code will be tedious. ++ ++ When supporting isolated transactions, there is a corresponding READ_MODIFY_WRITE (RMW) ++ flag, which specifies that although it is a read operation being requested, a ++ write-lock should be taken. The reason is that read-locks are shared while write-locks ++ are exclusive, so taking a read-lock when a later-write is known in advance will often ++ leads to deadlock. If a reader knows it will write later, it should issue read ++ requests with the RMW flag set. ++*/ ++ ++/* ++ The znode/atom deadlock avoidance. ++ ++ FIXME(Zam): writing of this comment is in progress. ++ ++ The atom's special stage ASTAGE_CAPTURE_WAIT introduces a kind of atom's ++ long-term locking, which makes reiser4 locking scheme more complex. It had ++ deadlocks until we implement deadlock avoidance algorithms. That deadlocks ++ looked as the following: one stopped thread waits for a long-term lock on ++ znode, the thread who owns that lock waits when fusion with another atom will ++ be allowed. 
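++
++   (Concretely: thread T1, whose atom is in ASTAGE_CAPTURE_WAIT, sleeps on
++   a long-term lock for a znode; the lock owner T2 has locked but not
++   captured that znode, and T2 in turn sleeps until fusion with T1's atom
++   is allowed -- neither can make progress.)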
++
++   The source of the deadlocks is an optimization of not capturing index nodes
++   for read. Let's prove it. Suppose we have a dumb node capturing scheme which
++   unconditionally captures each block before locking it.
++
++   That scheme has no deadlocks. Let's begin with a thread whose stage is
++   ASTAGE_CAPTURE_WAIT and which waits for a znode lock. The thread can't wait
++   for a capture because its stage allows fusion with any atom except those
++   which are currently being committed. The process of atom commit can't
++   deadlock because the atom commit procedure does not acquire locks and does
++   not fuse with other atoms. Reiser4 does capturing right before going to
++   sleep inside the longterm_lock_znode() function; it means the znode which
++   we want to lock is already captured and its atom is in the
++   ASTAGE_CAPTURE_WAIT stage. If we continue the analysis we see that no
++   process in the sequence may wait for atom fusion. Thereby there are no
++   deadlocks of the described kind.
++
++   The capturing optimization makes the deadlocks possible. A thread can wait
++   for a lock whose owner did not capture that node. The lock owner's current
++   atom is not fused with the first atom and it does not get the
++   ASTAGE_CAPTURE_WAIT state. A deadlock is possible when that atom meets
++   another one which is in ASTAGE_CAPTURE_WAIT already.
++
++   The deadlock avoidance scheme includes two algorithms:
++
++   The first algorithm is used when a thread captures a node which is locked
++   but not captured by another thread. Those nodes are marked
++   MISSED_IN_CAPTURE at the moment we skip their capturing. If such a node
++   (marked MISSED_IN_CAPTURE) is being captured by a thread whose current atom
++   is in ASTAGE_CAPTURE_WAIT, the routine which forces all lock owners to join
++   the current atom is executed.
++
++   The second algorithm does not allow skipping the capture of already
++   captured nodes.
++
++   Both algorithms together prevent waiting for a longterm lock without atom
++   fusion with the atoms of all lock owners, which is the key ingredient of
++   atom/znode locking deadlocks.
++*/
++
++/*
++ * Transactions and mmap(2).
++ *
++ *     1. Transactions are not supported for accesses through mmap(2), because
++ *     this would effectively amount to user-level transactions whose duration
++ *     is beyond control of the kernel.
++ *
++ *     2. That said, we still want to preserve some decency with regard to
++ *     mmap(2). During a normal write(2) call, the following sequence of
++ *     events happens:
++ *
++ *         1. page is created;
++ *
++ *         2. jnode is created, dirtied and captured into the current atom.
++ *
++ *         3. extent is inserted and modified.
++ *
++ *     Steps (2) and (3) take place under a long term lock on the twig node.
++ *
++ *     When a file is accessed through mmap(2) the page is always created
++ *     during the page fault. After this (in
++ *     reiser4_readpage()->readpage_extent()):
++ *
++ *         1. if the access is made to a non-hole page a new jnode is created
++ *         (if necessary)
++ *
++ *         2. if the access is made to a hole page, the jnode is not created
++ *         (XXX not clear why).
++ *
++ *     Also, even if the page is created by a write page fault it is not
++ *     marked dirty immediately by handle_mm_fault(). Probably this is to
++ *     avoid races with page write-out.
++ *
++ *     The dirty bit installed by hardware is only transferred to the struct
++ *     page later, when the page is unmapped (in zap_pte_range(), or
++ *     try_to_unmap_one()).
++ *
++ *     So, with mmap(2) we have to handle the following irksome situations:
++ *
++ *     1. there exists a modified page (clean or dirty) without a jnode
++ *
++ *     2. 
there exists modified page (clean or dirty) with clean jnode ++ * ++ * 3. clean page which is a part of atom can be transparently modified ++ * at any moment through mapping without becoming dirty. ++ * ++ * (1) and (2) can lead to the out-of-memory situation: ->writepage() ++ * doesn't know what to do with such pages and ->sync_sb()/->writepages() ++ * don't see them, because these methods operate on atoms. ++ * ++ * (3) can lead to the loss of data: suppose we have dirty page with dirty ++ * captured jnode captured by some atom. As part of early flush (for ++ * example) page was written out. Dirty bit was cleared on both page and ++ * jnode. After this page is modified through mapping, but kernel doesn't ++ * notice and just discards page and jnode as part of commit. (XXX ++ * actually it doesn't, because to reclaim page ->releasepage() has to be ++ * called and before this dirty bit will be transferred to the struct ++ * page). ++ * ++ */ ++ ++#include "debug.h" ++#include "txnmgr.h" ++#include "jnode.h" ++#include "znode.h" ++#include "block_alloc.h" ++#include "tree.h" ++#include "wander.h" ++#include "ktxnmgrd.h" ++#include "super.h" ++#include "page_cache.h" ++#include "reiser4.h" ++#include "vfs_ops.h" ++#include "inode.h" ++#include "flush.h" ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include /* for totalram_pages */ ++ ++static void atom_free(txn_atom * atom); ++ ++static int commit_txnh(txn_handle * txnh); ++ ++static void wakeup_atom_waitfor_list(txn_atom * atom); ++static void wakeup_atom_waiting_list(txn_atom * atom); ++ ++static void capture_assign_txnh_nolock(txn_atom * atom, txn_handle * txnh); ++ ++static void capture_assign_block_nolock(txn_atom * atom, jnode * node); ++ ++static void fuse_not_fused_lock_owners(txn_handle * txnh, znode * node); ++ ++static int capture_init_fusion(jnode * node, txn_handle * txnh, ++ txn_capture mode); ++ ++static int capture_fuse_wait(txn_handle *, txn_atom *, txn_atom *, txn_capture); ++ ++static void capture_fuse_into(txn_atom * small, txn_atom * large); ++ ++void invalidate_list(struct list_head *); ++ ++/* GENERIC STRUCTURES */ ++ ++typedef struct _txn_wait_links txn_wait_links; ++ ++struct _txn_wait_links { ++ lock_stack *_lock_stack; ++ struct list_head _fwaitfor_link; ++ struct list_head _fwaiting_link; ++ int (*waitfor_cb) (txn_atom * atom, struct _txn_wait_links * wlinks); ++ int (*waiting_cb) (txn_atom * atom, struct _txn_wait_links * wlinks); ++}; ++ ++/* FIXME: In theory, we should be using the slab cache init & destructor ++ methods instead of, e.g., jnode_init, etc. */ ++static kmem_cache_t *_atom_slab = NULL; ++/* this is for user-visible, cross system-call transactions. */ ++static kmem_cache_t *_txnh_slab = NULL; ++ ++/** ++ * init_txnmgr_static - create transaction manager slab caches ++ * ++ * Initializes caches of txn-atoms and txn_handle. It is part of reiser4 module ++ * initialization. 
++ */ ++int init_txnmgr_static(void) ++{ ++ assert("jmacd-600", _atom_slab == NULL); ++ assert("jmacd-601", _txnh_slab == NULL); ++ ++ ON_DEBUG(atomic_set(&flush_cnt, 0)); ++ ++ _atom_slab = kmem_cache_create("txn_atom", sizeof(txn_atom), 0, ++ SLAB_HWCACHE_ALIGN | ++ SLAB_RECLAIM_ACCOUNT, NULL, NULL); ++ if (_atom_slab == NULL) ++ return RETERR(-ENOMEM); ++ ++ _txnh_slab = kmem_cache_create("txn_handle", sizeof(txn_handle), 0, ++ SLAB_HWCACHE_ALIGN, NULL, NULL); ++ if (_txnh_slab == NULL) { ++ kmem_cache_destroy(_atom_slab); ++ _atom_slab = NULL; ++ return RETERR(-ENOMEM); ++ } ++ ++ return 0; ++} ++ ++/** ++ * done_txnmgr_static - delete txn_atom and txn_handle caches ++ * ++ * This is called on reiser4 module unloading or system shutdown. ++ */ ++void done_txnmgr_static(void) ++{ ++ destroy_reiser4_cache(&_atom_slab); ++ destroy_reiser4_cache(&_txnh_slab); ++} ++ ++/** ++ * init_txnmgr - initialize a new transaction manager ++ * @mgr: pointer to transaction manager embedded in reiser4 super block ++ * ++ * This is called on mount. Makes necessary initializations. ++ */ ++void init_txnmgr(txn_mgr *mgr) ++{ ++ assert("umka-169", mgr != NULL); ++ ++ mgr->atom_count = 0; ++ mgr->id_count = 1; ++ INIT_LIST_HEAD(&mgr->atoms_list); ++ spin_lock_init(&mgr->tmgr_lock); ++ sema_init(&mgr->commit_semaphore, 1); ++} ++ ++/** ++ * done_txnmgr - stop transaction manager ++ * @mgr: pointer to transaction manager embedded in reiser4 super block ++ * ++ * This is called on umount. Does sanity checks. ++ */ ++void done_txnmgr(txn_mgr *mgr) ++{ ++ assert("umka-170", mgr != NULL); ++ assert("umka-1701", list_empty_careful(&mgr->atoms_list)); ++ assert("umka-1702", mgr->atom_count == 0); ++} ++ ++/* Initialize a transaction handle. */ ++/* Audited by: umka (2002.06.13) */ ++static void txnh_init(txn_handle * txnh, txn_mode mode) ++{ ++ assert("umka-171", txnh != NULL); ++ ++ txnh->mode = mode; ++ txnh->atom = NULL; ++ set_gfp_mask(); ++ txnh->flags = 0; ++ spin_lock_init(&txnh->hlock); ++ INIT_LIST_HEAD(&txnh->txnh_link); ++} ++ ++#if REISER4_DEBUG ++/* Check if a transaction handle is clean. */ ++static int txnh_isclean(txn_handle * txnh) ++{ ++ assert("umka-172", txnh != NULL); ++ return txnh->atom == NULL && ++ LOCK_CNT_NIL(spin_locked_txnh); ++} ++#endif ++ ++/* Initialize an atom. */ ++static void atom_init(txn_atom * atom) ++{ ++ int level; ++ ++ assert("umka-173", atom != NULL); ++ ++ memset(atom, 0, sizeof(txn_atom)); ++ ++ atom->stage = ASTAGE_FREE; ++ atom->start_time = jiffies; ++ ++ for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1) ++ INIT_LIST_HEAD(ATOM_DIRTY_LIST(atom, level)); ++ ++ INIT_LIST_HEAD(ATOM_CLEAN_LIST(atom)); ++ INIT_LIST_HEAD(ATOM_OVRWR_LIST(atom)); ++ INIT_LIST_HEAD(ATOM_WB_LIST(atom)); ++ INIT_LIST_HEAD(&atom->inodes); ++ spin_lock_init(&atom->alock); ++ /* list of transaction handles */ ++ INIT_LIST_HEAD(&atom->txnh_list); ++ /* link to transaction manager's list of atoms */ ++ INIT_LIST_HEAD(&atom->atom_link); ++ INIT_LIST_HEAD(&atom->fwaitfor_list); ++ INIT_LIST_HEAD(&atom->fwaiting_list); ++ blocknr_set_init(&atom->delete_set); ++ blocknr_set_init(&atom->wandered_map); ++ ++ init_atom_fq_parts(atom); ++} ++ ++#if REISER4_DEBUG ++/* Check if an atom is clean. 
*/ ++static int atom_isclean(txn_atom * atom) ++{ ++ int level; ++ ++ assert("umka-174", atom != NULL); ++ ++ for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1) { ++ if (!list_empty_careful(ATOM_DIRTY_LIST(atom, level))) { ++ return 0; ++ } ++ } ++ ++ return atom->stage == ASTAGE_FREE && ++ atom->txnh_count == 0 && ++ atom->capture_count == 0 && ++ atomic_read(&atom->refcount) == 0 && ++ (&atom->atom_link == atom->atom_link.next && ++ &atom->atom_link == atom->atom_link.prev) && ++ list_empty_careful(&atom->txnh_list) && ++ list_empty_careful(ATOM_CLEAN_LIST(atom)) && ++ list_empty_careful(ATOM_OVRWR_LIST(atom)) && ++ list_empty_careful(ATOM_WB_LIST(atom)) && ++ list_empty_careful(&atom->fwaitfor_list) && ++ list_empty_careful(&atom->fwaiting_list) && ++ atom_fq_parts_are_clean(atom); ++} ++#endif ++ ++/* Begin a transaction in this context. Currently this uses the reiser4_context's ++ trans_in_ctx, which means that transaction handles are stack-allocated. Eventually ++ this will be extended to allow transaction handles to span several contexts. */ ++/* Audited by: umka (2002.06.13) */ ++void txn_begin(reiser4_context * context) ++{ ++ assert("jmacd-544", context->trans == NULL); ++ ++ context->trans = &context->trans_in_ctx; ++ ++ /* FIXME_LATER_JMACD Currently there's no way to begin a TXN_READ_FUSING ++ transcrash. Default should be TXN_WRITE_FUSING. Also, the _trans variable is ++ stack allocated right now, but we would like to allow for dynamically allocated ++ transcrashes that span multiple system calls. ++ */ ++ txnh_init(context->trans, TXN_WRITE_FUSING); ++} ++ ++/* Finish a transaction handle context. */ ++int txn_end(reiser4_context * context) ++{ ++ long ret = 0; ++ txn_handle *txnh; ++ ++ assert("umka-283", context != NULL); ++ assert("nikita-3012", schedulable()); ++ assert("vs-24", context == get_current_context()); ++ assert("nikita-2967", lock_stack_isclean(get_current_lock_stack())); ++ ++ txnh = context->trans; ++ if (txnh != NULL) { ++ if (txnh->atom != NULL) ++ ret = commit_txnh(txnh); ++ assert("jmacd-633", txnh_isclean(txnh)); ++ context->trans = NULL; ++ } ++ return ret; ++} ++ ++void txn_restart(reiser4_context * context) ++{ ++ txn_end(context); ++ preempt_point(); ++ txn_begin(context); ++} ++ ++void txn_restart_current(void) ++{ ++ txn_restart(get_current_context()); ++} ++ ++/* TXN_ATOM */ ++ ++/* Get the atom belonging to a txnh, which is not locked. Return txnh locked. Locks atom, if atom ++ is not NULL. This performs the necessary spin_trylock to break the lock-ordering cycle. May ++ return NULL. */ ++static txn_atom *txnh_get_atom(txn_handle * txnh) ++{ ++ txn_atom *atom; ++ ++ assert("umka-180", txnh != NULL); ++ assert_spin_not_locked(&(txnh->hlock)); ++ ++ while (1) { ++ spin_lock_txnh(txnh); ++ atom = txnh->atom; ++ ++ if (atom == NULL) ++ break; ++ ++ if (spin_trylock_atom(atom)) ++ break; ++ ++ atomic_inc(&atom->refcount); ++ ++ spin_unlock_txnh(txnh); ++ spin_lock_atom(atom); ++ spin_lock_txnh(txnh); ++ ++ if (txnh->atom == atom) { ++ atomic_dec(&atom->refcount); ++ break; ++ } ++ ++ spin_unlock_txnh(txnh); ++ atom_dec_and_unlock(atom); ++ } ++ ++ return atom; ++} ++ ++/* Get the current atom and spinlock it if current atom present. 
May return NULL */ ++txn_atom *get_current_atom_locked_nocheck(void) ++{ ++ reiser4_context *cx; ++ txn_atom *atom; ++ txn_handle *txnh; ++ ++ cx = get_current_context(); ++ assert("zam-437", cx != NULL); ++ ++ txnh = cx->trans; ++ assert("zam-435", txnh != NULL); ++ ++ atom = txnh_get_atom(txnh); ++ ++ spin_unlock_txnh(txnh); ++ return atom; ++} ++ ++/* Get the atom belonging to a jnode, which is initially locked. Return with ++ both jnode and atom locked. This performs the necessary spin_trylock to ++ break the lock-ordering cycle. Assumes the jnode is already locked, and ++ returns NULL if atom is not set. */ ++txn_atom *jnode_get_atom(jnode * node) ++{ ++ txn_atom *atom; ++ ++ assert("umka-181", node != NULL); ++ ++ while (1) { ++ assert_spin_locked(&(node->guard)); ++ ++ atom = node->atom; ++ /* node is not in any atom */ ++ if (atom == NULL) ++ break; ++ ++ /* If atom is not locked, grab the lock and return */ ++ if (spin_trylock_atom(atom)) ++ break; ++ ++ /* At least one jnode belongs to this atom it guarantees that ++ * atom->refcount > 0, we can safely increment refcount. */ ++ atomic_inc(&atom->refcount); ++ spin_unlock_jnode(node); ++ ++ /* re-acquire spin locks in the right order */ ++ spin_lock_atom(atom); ++ spin_lock_jnode(node); ++ ++ /* check if node still points to the same atom. */ ++ if (node->atom == atom) { ++ atomic_dec(&atom->refcount); ++ break; ++ } ++ ++ /* releasing of atom lock and reference requires not holding ++ * locks on jnodes. */ ++ spin_unlock_jnode(node); ++ ++ /* We do not sure that this atom has extra references except our ++ * one, so we should call proper function which may free atom if ++ * last reference is released. */ ++ atom_dec_and_unlock(atom); ++ ++ /* lock jnode again for getting valid node->atom pointer ++ * value. */ ++ spin_lock_jnode(node); ++ } ++ ++ return atom; ++} ++ ++/* Returns true if @node is dirty and part of the same atom as one of its neighbors. Used ++ by flush code to indicate whether the next node (in some direction) is suitable for ++ flushing. */ ++int ++same_slum_check(jnode * node, jnode * check, int alloc_check, int alloc_value) ++{ ++ int compat; ++ txn_atom *atom; ++ ++ assert("umka-182", node != NULL); ++ assert("umka-183", check != NULL); ++ ++ /* Not sure what this function is supposed to do if supplied with @check that is ++ neither formatted nor unformatted (bitmap or so). */ ++ assert("nikita-2373", jnode_is_znode(check) ++ || jnode_is_unformatted(check)); ++ ++ /* Need a lock on CHECK to get its atom and to check various state bits. ++ Don't need a lock on NODE once we get the atom lock. */ ++ /* It is not enough to lock two nodes and check (node->atom == ++ check->atom) because atom could be locked and being fused at that ++ moment, jnodes of the atom of that state (being fused) can point to ++ different objects, but the atom is the same. */ ++ spin_lock_jnode(check); ++ ++ atom = jnode_get_atom(check); ++ ++ if (atom == NULL) { ++ compat = 0; ++ } else { ++ compat = (node->atom == atom && JF_ISSET(check, JNODE_DIRTY)); ++ ++ if (compat && jnode_is_znode(check)) { ++ compat &= znode_is_connected(JZNODE(check)); ++ } ++ ++ if (compat && alloc_check) { ++ compat &= (alloc_value == jnode_is_flushprepped(check)); ++ } ++ ++ spin_unlock_atom(atom); ++ } ++ ++ spin_unlock_jnode(check); ++ ++ return compat; ++} ++ ++/* Decrement the atom's reference count and if it falls to zero, free it. 
*/ ++void atom_dec_and_unlock(txn_atom * atom) ++{ ++ txn_mgr *mgr = &get_super_private(reiser4_get_current_sb())->tmgr; ++ ++ assert("umka-186", atom != NULL); ++ assert_spin_locked(&(atom->alock)); ++ assert("zam-1039", atomic_read(&atom->refcount) > 0); ++ ++ if (atomic_dec_and_test(&atom->refcount)) { ++ /* take txnmgr lock and atom lock in proper order. */ ++ if (!spin_trylock_txnmgr(mgr)) { ++ /* This atom should exist after we re-acquire its ++ * spinlock, so we increment its reference counter. */ ++ atomic_inc(&atom->refcount); ++ spin_unlock_atom(atom); ++ spin_lock_txnmgr(mgr); ++ spin_lock_atom(atom); ++ ++ if (!atomic_dec_and_test(&atom->refcount)) { ++ spin_unlock_atom(atom); ++ spin_unlock_txnmgr(mgr); ++ return; ++ } ++ } ++ assert_spin_locked(&(mgr->tmgr_lock)); ++ atom_free(atom); ++ spin_unlock_txnmgr(mgr); ++ } else ++ spin_unlock_atom(atom); ++} ++ ++/* Create new atom and connect it to given transaction handle. This adds the ++ atom to the transaction manager's list and sets its reference count to 1, an ++ artificial reference which is kept until it commits. We play strange games ++ to avoid allocation under jnode & txnh spinlocks.*/ ++ ++static int atom_begin_and_assign_to_txnh(txn_atom ** atom_alloc, txn_handle * txnh) ++{ ++ txn_atom *atom; ++ txn_mgr *mgr; ++ ++ if (REISER4_DEBUG && rofs_tree(current_tree)) { ++ warning("nikita-3366", "Creating atom on rofs"); ++ dump_stack(); ++ } ++ ++ if (*atom_alloc == NULL) { ++ (*atom_alloc) = kmem_cache_alloc(_atom_slab, get_gfp_mask()); ++ ++ if (*atom_alloc == NULL) ++ return RETERR(-ENOMEM); ++ } ++ ++ /* and, also, txnmgr spin lock should be taken before jnode and txnh ++ locks. */ ++ mgr = &get_super_private(reiser4_get_current_sb())->tmgr; ++ spin_lock_txnmgr(mgr); ++ spin_lock_txnh(txnh); ++ ++ /* Check whether new atom still needed */ ++ if (txnh->atom != NULL) { ++ /* NOTE-NIKITA probably it is rather better to free ++ * atom_alloc here than thread it up to try_capture(). */ ++ ++ spin_unlock_txnh(txnh); ++ spin_unlock_txnmgr(mgr); ++ ++ return -E_REPEAT; ++ } ++ ++ atom = *atom_alloc; ++ *atom_alloc = NULL; ++ ++ atom_init(atom); ++ ++ assert("jmacd-17", atom_isclean(atom)); ++ ++ /* ++ * do not use spin_lock_atom because we have broken lock ordering here ++ * which is ok, as long as @atom is new and inaccessible for others. ++ */ ++ spin_lock(&(atom->alock)); ++ ++ /* add atom to the end of transaction manager's list of atoms */ ++ list_add_tail(&atom->atom_link, &mgr->atoms_list); ++ atom->atom_id = mgr->id_count++; ++ mgr->atom_count += 1; ++ ++ /* Release txnmgr lock */ ++ spin_unlock_txnmgr(mgr); ++ ++ /* One reference until it commits. */ ++ atomic_inc(&atom->refcount); ++ atom->stage = ASTAGE_CAPTURE_FUSE; ++ atom->super = reiser4_get_current_sb(); ++ capture_assign_txnh_nolock(atom, txnh); ++ ++ spin_unlock(&(atom->alock)); ++ spin_unlock_txnh(txnh); ++ ++ return -E_REPEAT; ++} ++ ++/* Return true if an atom is currently "open". */ ++static int atom_isopen(const txn_atom * atom) ++{ ++ assert("umka-185", atom != NULL); ++ ++ return atom->stage > 0 && atom->stage < ASTAGE_PRE_COMMIT; ++} ++ ++/* Return the number of pointers to this atom that must be updated during fusion. This ++ approximates the amount of work to be done. Fusion chooses the atom with fewer ++ pointers to fuse into the atom with more pointers. */ ++static int atom_pointer_count(const txn_atom * atom) ++{ ++ assert("umka-187", atom != NULL); ++ ++ /* This is a measure of the amount of work needed to fuse this atom ++ * into another. 
*/ ++ return atom->txnh_count + atom->capture_count; ++} ++ ++/* Called holding the atom lock, this removes the atom from the transaction manager list ++ and frees it. */ ++static void atom_free(txn_atom * atom) ++{ ++ txn_mgr *mgr = &get_super_private(reiser4_get_current_sb())->tmgr; ++ ++ assert("umka-188", atom != NULL); ++ assert_spin_locked(&(atom->alock)); ++ ++ /* Remove from the txn_mgr's atom list */ ++ assert_spin_locked(&(mgr->tmgr_lock)); ++ mgr->atom_count -= 1; ++ list_del_init(&atom->atom_link); ++ ++ /* Clean the atom */ ++ assert("jmacd-16", ++ (atom->stage == ASTAGE_INVALID || atom->stage == ASTAGE_DONE)); ++ atom->stage = ASTAGE_FREE; ++ ++ blocknr_set_destroy(&atom->delete_set); ++ blocknr_set_destroy(&atom->wandered_map); ++ ++ assert("jmacd-16", atom_isclean(atom)); ++ ++ spin_unlock_atom(atom); ++ ++ kmem_cache_free(_atom_slab, atom); ++} ++ ++static int atom_is_dotard(const txn_atom * atom) ++{ ++ return time_after(jiffies, atom->start_time + ++ get_current_super_private()->tmgr.atom_max_age); ++} ++ ++static int atom_can_be_committed(txn_atom * atom) ++{ ++ assert_spin_locked(&(atom->alock)); ++ assert("zam-885", atom->txnh_count > atom->nr_waiters); ++ return atom->txnh_count == atom->nr_waiters + 1; ++} ++ ++/* Return true if an atom should commit now. This is determined by aging, atom ++ size or atom flags. */ ++static int atom_should_commit(const txn_atom * atom) ++{ ++ assert("umka-189", atom != NULL); ++ return ++ (atom->flags & ATOM_FORCE_COMMIT) || ++ ((unsigned)atom_pointer_count(atom) > ++ get_current_super_private()->tmgr.atom_max_size) ++ || atom_is_dotard(atom); ++} ++ ++/* return 1 if current atom exists and requires commit. */ ++int current_atom_should_commit(void) ++{ ++ txn_atom *atom; ++ int result = 0; ++ ++ atom = get_current_atom_locked_nocheck(); ++ if (atom) { ++ result = atom_should_commit(atom); ++ spin_unlock_atom(atom); ++ } ++ return result; ++} ++ ++static int atom_should_commit_asap(const txn_atom * atom) ++{ ++ unsigned int captured; ++ unsigned int pinnedpages; ++ ++ assert("nikita-3309", atom != NULL); ++ ++ captured = (unsigned)atom->capture_count; ++ pinnedpages = (captured >> PAGE_CACHE_SHIFT) * sizeof(znode); ++ ++ return (pinnedpages > (totalram_pages >> 3)) || (atom->flushed > 100); ++} ++ ++static jnode *find_first_dirty_in_list(struct list_head *head, int flags) ++{ ++ jnode *first_dirty; ++ ++ list_for_each_entry(first_dirty, head, capture_link) { ++ if (!(flags & JNODE_FLUSH_COMMIT)) { ++ /* ++ * skip jnodes which "heard banshee" or having active ++ * I/O ++ */ ++ if (JF_ISSET(first_dirty, JNODE_HEARD_BANSHEE) || ++ JF_ISSET(first_dirty, JNODE_WRITEBACK)) ++ continue; ++ } ++ return first_dirty; ++ } ++ return NULL; ++} ++ ++/* Get first dirty node from the atom's dirty_nodes[n] lists; return NULL if atom has no dirty ++ nodes on atom's lists */ ++jnode *find_first_dirty_jnode(txn_atom * atom, int flags) ++{ ++ jnode *first_dirty; ++ tree_level level; ++ ++ assert_spin_locked(&(atom->alock)); ++ ++ /* The flush starts from LEAF_LEVEL (=1). */ ++ for (level = 1; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1) { ++ if (list_empty_careful(ATOM_DIRTY_LIST(atom, level))) ++ continue; ++ ++ first_dirty = ++ find_first_dirty_in_list(ATOM_DIRTY_LIST(atom, level), ++ flags); ++ if (first_dirty) ++ return first_dirty; ++ } ++ ++ /* znode-above-root is on the list #0. 
*/ ++ return find_first_dirty_in_list(ATOM_DIRTY_LIST(atom, 0), flags); ++} ++ ++static void dispatch_wb_list(txn_atom * atom, flush_queue_t * fq) ++{ ++ jnode *cur; ++ ++ assert("zam-905", atom_is_protected(atom)); ++ ++ cur = list_entry(ATOM_WB_LIST(atom)->next, jnode, capture_link); ++ while (ATOM_WB_LIST(atom) != &cur->capture_link) { ++ jnode *next = list_entry(cur->capture_link.next, jnode, capture_link); ++ ++ spin_lock_jnode(cur); ++ if (!JF_ISSET(cur, JNODE_WRITEBACK)) { ++ if (JF_ISSET(cur, JNODE_DIRTY)) { ++ queue_jnode(fq, cur); ++ } else { ++ /* move jnode to atom's clean list */ ++ list_move_tail(&cur->capture_link, ++ ATOM_CLEAN_LIST(atom)); ++ } ++ } ++ spin_unlock_jnode(cur); ++ ++ cur = next; ++ } ++} ++ ++/* Scan current atom->writeback_nodes list, re-submit dirty and !writeback ++ * jnodes to disk. */ ++static int submit_wb_list(void) ++{ ++ int ret; ++ flush_queue_t *fq; ++ ++ fq = get_fq_for_current_atom(); ++ if (IS_ERR(fq)) ++ return PTR_ERR(fq); ++ ++ dispatch_wb_list(fq->atom, fq); ++ spin_unlock_atom(fq->atom); ++ ++ ret = write_fq(fq, NULL, 1); ++ fq_put(fq); ++ ++ return ret; ++} ++ ++/* Wait completion of all writes, re-submit atom writeback list if needed. */ ++static int current_atom_complete_writes(void) ++{ ++ int ret; ++ ++ /* Each jnode from that list was modified and dirtied when it had i/o ++ * request running already. After i/o completion we have to resubmit ++ * them to disk again.*/ ++ ret = submit_wb_list(); ++ if (ret < 0) ++ return ret; ++ ++ /* Wait all i/o completion */ ++ ret = current_atom_finish_all_fq(); ++ if (ret) ++ return ret; ++ ++ /* Scan wb list again; all i/o should be completed, we re-submit dirty ++ * nodes to disk */ ++ ret = submit_wb_list(); ++ if (ret < 0) ++ return ret; ++ ++ /* Wait all nodes we just submitted */ ++ return current_atom_finish_all_fq(); ++} ++ ++#define TOOMANYFLUSHES (1 << 13) ++ ++/* Called with the atom locked and no open "active" transaction handlers except ++ ours, this function calls flush_current_atom() until all dirty nodes are ++ processed. Then it initiates commit processing. ++ ++ Called by the single remaining open "active" txnh, which is closing. Other ++ open txnhs belong to processes which wait atom commit in commit_txnh() ++ routine. They are counted as "waiters" in atom->nr_waiters. Therefore as ++ long as we hold the atom lock none of the jnodes can be captured and/or ++ locked. ++ ++ Return value is an error code if commit fails. ++*/ ++static int commit_current_atom(long *nr_submitted, txn_atom ** atom) ++{ ++ reiser4_super_info_data *sbinfo = get_current_super_private(); ++ long ret = 0; ++ /* how many times jnode_flush() was called as a part of attempt to ++ * commit this atom. 
*/ ++ int flushiters; ++ ++ assert("zam-888", atom != NULL && *atom != NULL); ++ assert_spin_locked(&((*atom)->alock)); ++ assert("zam-887", get_current_context()->trans->atom == *atom); ++ assert("jmacd-151", atom_isopen(*atom)); ++ ++ /* lock ordering: delete_sema and commit_sema are unordered */ ++ assert("nikita-3184", ++ get_current_super_private()->delete_sema_owner != current); ++ ++ for (flushiters = 0;; ++flushiters) { ++ ret = ++ flush_current_atom(JNODE_FLUSH_WRITE_BLOCKS | ++ JNODE_FLUSH_COMMIT, ++ LONG_MAX /* nr_to_write */ , ++ nr_submitted, atom, NULL); ++ if (ret != -E_REPEAT) ++ break; ++ ++ /* if atom's dirty list contains one znode which is ++ HEARD_BANSHEE and is locked we have to allow lock owner to ++ continue and uncapture that znode */ ++ preempt_point(); ++ ++ *atom = get_current_atom_locked(); ++ if (flushiters > TOOMANYFLUSHES && IS_POW(flushiters)) { ++ warning("nikita-3176", ++ "Flushing like mad: %i", flushiters); ++ info_atom("atom", *atom); ++ DEBUGON(flushiters > (1 << 20)); ++ } ++ } ++ ++ if (ret) ++ return ret; ++ ++ assert_spin_locked(&((*atom)->alock)); ++ ++ if (!atom_can_be_committed(*atom)) { ++ spin_unlock_atom(*atom); ++ return RETERR(-E_REPEAT); ++ } ++ ++ if ((*atom)->capture_count == 0) ++ goto done; ++ ++ /* Up to this point we have been flushing and after flush is called we ++ return -E_REPEAT. Now we can commit. We cannot return -E_REPEAT ++ at this point, commit should be successful. */ ++ atom_set_stage(*atom, ASTAGE_PRE_COMMIT); ++ ON_DEBUG(((*atom)->committer = current)); ++ spin_unlock_atom(*atom); ++ ++ ret = current_atom_complete_writes(); ++ if (ret) ++ return ret; ++ ++ assert("zam-906", list_empty(ATOM_WB_LIST(*atom))); ++ ++ /* isolate critical code path which should be executed by only one ++ * thread using tmgr semaphore */ ++ down(&sbinfo->tmgr.commit_semaphore); ++ ++ ret = reiser4_write_logs(nr_submitted); ++ if (ret < 0) ++ reiser4_panic("zam-597", "write log failed (%ld)\n", ret); ++ ++ /* The atom->ovrwr_nodes list is processed under commit semaphore held ++ because of bitmap nodes which are captured by special way in ++ bitmap_pre_commit_hook(), that way does not include ++ capture_fuse_wait() as a capturing of other nodes does -- the commit ++ semaphore is used for transaction isolation instead. */ ++ invalidate_list(ATOM_OVRWR_LIST(*atom)); ++ up(&sbinfo->tmgr.commit_semaphore); ++ ++ invalidate_list(ATOM_CLEAN_LIST(*atom)); ++ invalidate_list(ATOM_WB_LIST(*atom)); ++ assert("zam-927", list_empty(&(*atom)->inodes)); ++ ++ spin_lock_atom(*atom); ++ done: ++ atom_set_stage(*atom, ASTAGE_DONE); ++ ON_DEBUG((*atom)->committer = NULL); ++ ++ /* Atom's state changes, so wake up everybody waiting for this ++ event. */ ++ wakeup_atom_waiting_list(*atom); ++ ++ /* Decrement the "until commit" reference, at least one txnh (the caller) is ++ still open. */ ++ atomic_dec(&(*atom)->refcount); ++ ++ assert("jmacd-1070", atomic_read(&(*atom)->refcount) > 0); ++ assert("jmacd-1062", (*atom)->capture_count == 0); ++ BUG_ON((*atom)->capture_count != 0); ++ assert_spin_locked(&((*atom)->alock)); ++ ++ return ret; ++} ++ ++/* TXN_TXNH */ ++ ++/** ++ * force_commit_atom - commit current atom and wait commit completion ++ * @txnh: ++ * ++ * Commits current atom and wait commit completion; current atom and @txnh have ++ * to be spinlocked before call, this function unlocks them on exit. 
++ */ ++int force_commit_atom(txn_handle *txnh) ++{ ++ txn_atom *atom; ++ ++ assert("zam-837", txnh != NULL); ++ assert_spin_locked(&(txnh->hlock)); ++ assert("nikita-2966", lock_stack_isclean(get_current_lock_stack())); ++ ++ atom = txnh->atom; ++ ++ assert("zam-834", atom != NULL); ++ assert_spin_locked(&(atom->alock)); ++ ++ /* ++ * Set flags for atom and txnh: forcing atom commit and waiting for ++ * commit completion ++ */ ++ txnh->flags |= TXNH_WAIT_COMMIT; ++ atom->flags |= ATOM_FORCE_COMMIT; ++ ++ spin_unlock_txnh(txnh); ++ spin_unlock_atom(atom); ++ ++ /* commit is here */ ++ txn_restart_current(); ++ return 0; ++} ++ ++/* Called to force commit of any outstanding atoms. @commit_all_atoms controls ++ * should we commit all atoms including new ones which are created after this ++ * functions is called. */ ++int txnmgr_force_commit_all(struct super_block *super, int commit_all_atoms) ++{ ++ int ret; ++ txn_atom *atom; ++ txn_mgr *mgr; ++ txn_handle *txnh; ++ unsigned long start_time = jiffies; ++ reiser4_context *ctx = get_current_context(); ++ ++ assert("nikita-2965", lock_stack_isclean(get_current_lock_stack())); ++ assert("nikita-3058", commit_check_locks()); ++ ++ txn_restart_current(); ++ ++ mgr = &get_super_private(super)->tmgr; ++ ++ txnh = ctx->trans; ++ ++ again: ++ ++ spin_lock_txnmgr(mgr); ++ ++ list_for_each_entry(atom, &mgr->atoms_list, atom_link) { ++ spin_lock_atom(atom); ++ ++ /* Commit any atom which can be committed. If @commit_new_atoms ++ * is not set we commit only atoms which were created before ++ * this call is started. */ ++ if (commit_all_atoms ++ || time_before_eq(atom->start_time, start_time)) { ++ if (atom->stage <= ASTAGE_POST_COMMIT) { ++ spin_unlock_txnmgr(mgr); ++ ++ if (atom->stage < ASTAGE_PRE_COMMIT) { ++ spin_lock_txnh(txnh); ++ /* Add force-context txnh */ ++ capture_assign_txnh_nolock(atom, txnh); ++ ret = force_commit_atom(txnh); ++ if (ret) ++ return ret; ++ } else ++ /* wait atom commit */ ++ atom_wait_event(atom); ++ ++ goto again; ++ } ++ } ++ ++ spin_unlock_atom(atom); ++ } ++ ++#if REISER4_DEBUG ++ if (commit_all_atoms) { ++ reiser4_super_info_data *sbinfo = get_super_private(super); ++ spin_lock_reiser4_super(sbinfo); ++ assert("zam-813", ++ sbinfo->blocks_fake_allocated_unformatted == 0); ++ assert("zam-812", sbinfo->blocks_fake_allocated == 0); ++ spin_unlock_reiser4_super(sbinfo); ++ } ++#endif ++ ++ spin_unlock_txnmgr(mgr); ++ ++ return 0; ++} ++ ++/* check whether commit_some_atoms() can commit @atom. Locking is up to the ++ * caller */ ++static int atom_is_committable(txn_atom * atom) ++{ ++ return ++ atom->stage < ASTAGE_PRE_COMMIT && ++ atom->txnh_count == atom->nr_waiters && atom_should_commit(atom); ++} ++ ++/* called periodically from ktxnmgrd to commit old atoms. 
Releases ktxnmgrd spin ++ * lock at exit */ ++int commit_some_atoms(txn_mgr * mgr) ++{ ++ int ret = 0; ++ txn_atom *atom; ++ txn_handle *txnh; ++ reiser4_context *ctx; ++ struct list_head *pos, *tmp; ++ ++ ctx = get_current_context(); ++ assert("nikita-2444", ctx != NULL); ++ ++ txnh = ctx->trans; ++ spin_lock_txnmgr(mgr); ++ ++ /* ++ * this is to avoid gcc complain that atom might be used ++ * uninitialized ++ */ ++ atom = NULL; ++ ++ /* look for atom to commit */ ++ list_for_each_safe(pos, tmp, &mgr->atoms_list) { ++ atom = list_entry(pos, txn_atom, atom_link); ++ /* ++ * first test without taking atom spin lock, whether it is ++ * eligible for committing at all ++ */ ++ if (atom_is_committable(atom)) { ++ /* now, take spin lock and re-check */ ++ spin_lock_atom(atom); ++ if (atom_is_committable(atom)) ++ break; ++ spin_unlock_atom(atom); ++ } ++ } ++ ++ ret = (&mgr->atoms_list == pos); ++ spin_unlock_txnmgr(mgr); ++ ++ if (ret) { ++ /* nothing found */ ++ spin_unlock(&mgr->daemon->guard); ++ return 0; ++ } ++ ++ spin_lock_txnh(txnh); ++ ++ BUG_ON(atom == NULL); ++ /* Set the atom to force committing */ ++ atom->flags |= ATOM_FORCE_COMMIT; ++ ++ /* Add force-context txnh */ ++ capture_assign_txnh_nolock(atom, txnh); ++ ++ spin_unlock_txnh(txnh); ++ spin_unlock_atom(atom); ++ ++ /* we are about to release daemon spin lock, notify daemon it ++ has to rescan atoms */ ++ mgr->daemon->rescan = 1; ++ spin_unlock(&mgr->daemon->guard); ++ txn_restart_current(); ++ return 0; ++} ++ ++static int txn_try_to_fuse_small_atom(txn_mgr * tmgr, txn_atom * atom) ++{ ++ int atom_stage; ++ txn_atom *atom_2; ++ int repeat; ++ ++ assert("zam-1051", atom->stage < ASTAGE_PRE_COMMIT); ++ ++ atom_stage = atom->stage; ++ repeat = 0; ++ ++ if (!spin_trylock_txnmgr(tmgr)) { ++ atomic_inc(&atom->refcount); ++ spin_unlock_atom(atom); ++ spin_lock_txnmgr(tmgr); ++ spin_lock_atom(atom); ++ repeat = 1; ++ if (atom->stage != atom_stage) { ++ spin_unlock_txnmgr(tmgr); ++ atom_dec_and_unlock(atom); ++ return -E_REPEAT; ++ } ++ atomic_dec(&atom->refcount); ++ } ++ ++ list_for_each_entry(atom_2, &tmgr->atoms_list, atom_link) { ++ if (atom == atom_2) ++ continue; ++ /* ++ * if trylock does not succeed we just do not fuse with that ++ * atom. ++ */ ++ if (spin_trylock_atom(atom_2)) { ++ if (atom_2->stage < ASTAGE_PRE_COMMIT) { ++ spin_unlock_txnmgr(tmgr); ++ capture_fuse_into(atom_2, atom); ++ /* all locks are lost we can only repeat here */ ++ return -E_REPEAT; ++ } ++ spin_unlock_atom(atom_2); ++ } ++ } ++ atom->flags |= ATOM_CANCEL_FUSION; ++ spin_unlock_txnmgr(tmgr); ++ if (repeat) { ++ spin_unlock_atom(atom); ++ return -E_REPEAT; ++ } ++ return 0; ++} ++ ++/* Calls jnode_flush for current atom if it exists; if not, just take another ++ atom and call jnode_flush() for him. If current transaction handle has ++ already assigned atom (current atom) we have to close current transaction ++ prior to switch to another atom or do something with current atom. This ++ code tries to flush current atom. ++ ++ flush_some_atom() is called as part of memory clearing process. It is ++ invoked from balance_dirty_pages(), pdflushd, and entd. ++ ++ If we can flush no nodes, atom is committed, because this frees memory. ++ ++ If atom is too large or too old it is committed also. 
++*/ ++int ++flush_some_atom(jnode * start, long *nr_submitted, const struct writeback_control *wbc, ++ int flags) ++{ ++ reiser4_context *ctx = get_current_context(); ++ txn_mgr *tmgr = &get_super_private(ctx->super)->tmgr; ++ txn_handle *txnh = ctx->trans; ++ txn_atom *atom; ++ int ret; ++ ++ BUG_ON(wbc->nr_to_write == 0); ++ BUG_ON(*nr_submitted != 0); ++ assert("zam-1042", txnh != NULL); ++ repeat: ++ if (txnh->atom == NULL) { ++ /* current atom is not available, take first from txnmgr */ ++ spin_lock_txnmgr(tmgr); ++ ++ /* traverse the list of all atoms */ ++ list_for_each_entry(atom, &tmgr->atoms_list, atom_link) { ++ /* lock atom before checking its state */ ++ spin_lock_atom(atom); ++ ++ /* ++ * we need an atom which is not being committed and ++ * which has no flushers (jnode_flush() add one flusher ++ * at the beginning and subtract one at the end). ++ */ ++ if (atom->stage < ASTAGE_PRE_COMMIT && ++ atom->nr_flushers == 0) { ++ spin_lock_txnh(txnh); ++ capture_assign_txnh_nolock(atom, txnh); ++ spin_unlock_txnh(txnh); ++ ++ goto found; ++ } ++ ++ spin_unlock_atom(atom); ++ } ++ ++ /* ++ * Write throttling is case of no one atom can be ++ * flushed/committed. ++ */ ++ if (!current_is_pdflush() && !wbc->nonblocking) { ++ list_for_each_entry(atom, &tmgr->atoms_list, atom_link) { ++ spin_lock_atom(atom); ++ /* Repeat the check from the above. */ ++ if (atom->stage < ASTAGE_PRE_COMMIT ++ && atom->nr_flushers == 0) { ++ spin_lock_txnh(txnh); ++ capture_assign_txnh_nolock(atom, txnh); ++ spin_unlock_txnh(txnh); ++ ++ goto found; ++ } ++ if (atom->stage <= ASTAGE_POST_COMMIT) { ++ spin_unlock_txnmgr(tmgr); ++ /* ++ * we just wait until atom's flusher ++ * makes a progress in flushing or ++ * committing the atom ++ */ ++ atom_wait_event(atom); ++ goto repeat; ++ } ++ spin_unlock_atom(atom); ++ } ++ } ++ spin_unlock_txnmgr(tmgr); ++ return 0; ++ found: ++ spin_unlock_txnmgr(tmgr); ++ } else ++ atom = get_current_atom_locked(); ++ ++ BUG_ON(atom->super != ctx->super); ++ assert("vs-35", atom->super == ctx->super); ++ if (start) { ++ spin_lock_jnode(start); ++ ret = (atom == start->atom) ? 1 : 0; ++ spin_unlock_jnode(start); ++ if (ret == 0) ++ start = NULL; ++ } ++ ret = flush_current_atom(flags, wbc->nr_to_write, nr_submitted, &atom, start); ++ if (ret == 0) { ++ /* flush_current_atom returns 0 only if it submitted for write ++ nothing */ ++ BUG_ON(*nr_submitted != 0); ++ if (*nr_submitted == 0 || atom_should_commit_asap(atom)) { ++ if (atom->capture_count < tmgr->atom_min_size && ++ !(atom->flags & ATOM_CANCEL_FUSION)) { ++ ret = txn_try_to_fuse_small_atom(tmgr, atom); ++ if (ret == -E_REPEAT) { ++ preempt_point(); ++ goto repeat; ++ } ++ } ++ /* if early flushing could not make more nodes clean, ++ * or atom is too old/large, ++ * we force current atom to commit */ ++ /* wait for commit completion but only if this ++ * wouldn't stall pdflushd and ent thread. 
*/ ++ if (!wbc->nonblocking && !ctx->entd) ++ txnh->flags |= TXNH_WAIT_COMMIT; ++ atom->flags |= ATOM_FORCE_COMMIT; ++ } ++ spin_unlock_atom(atom); ++ } else if (ret == -E_REPEAT) { ++ if (*nr_submitted == 0) { ++ /* let others who hampers flushing (hold longterm locks, ++ for instance) to free the way for flush */ ++ preempt_point(); ++ goto repeat; ++ } ++ ret = 0; ++ } ++/* ++ if (*nr_submitted > wbc->nr_to_write) ++ warning("", "asked for %ld, written %ld\n", wbc->nr_to_write, *nr_submitted); ++*/ ++ txn_restart(ctx); ++ ++ return ret; ++} ++ ++/* Remove processed nodes from atom's clean list (thereby remove them from transaction). */ ++void invalidate_list(struct list_head *head) ++{ ++ while (!list_empty(head)) { ++ jnode *node; ++ ++ node = list_entry(head->next, jnode, capture_link); ++ spin_lock_jnode(node); ++ uncapture_block(node); ++ jput(node); ++ } ++} ++ ++static void init_wlinks(txn_wait_links * wlinks) ++{ ++ wlinks->_lock_stack = get_current_lock_stack(); ++ INIT_LIST_HEAD(&wlinks->_fwaitfor_link); ++ INIT_LIST_HEAD(&wlinks->_fwaiting_link); ++ wlinks->waitfor_cb = NULL; ++ wlinks->waiting_cb = NULL; ++} ++ ++/* Add atom to the atom's waitfor list and wait for somebody to wake us up; */ ++void atom_wait_event(txn_atom * atom) ++{ ++ txn_wait_links _wlinks; ++ ++ assert_spin_locked(&(atom->alock)); ++ assert("nikita-3156", ++ lock_stack_isclean(get_current_lock_stack()) || ++ atom->nr_running_queues > 0); ++ ++ init_wlinks(&_wlinks); ++ list_add_tail(&_wlinks._fwaitfor_link, &atom->fwaitfor_list); ++ atomic_inc(&atom->refcount); ++ spin_unlock_atom(atom); ++ ++ prepare_to_sleep(_wlinks._lock_stack); ++ go_to_sleep(_wlinks._lock_stack); ++ ++ spin_lock_atom(atom); ++ list_del(&_wlinks._fwaitfor_link); ++ atom_dec_and_unlock(atom); ++} ++ ++void atom_set_stage(txn_atom * atom, txn_stage stage) ++{ ++ assert("nikita-3535", atom != NULL); ++ assert_spin_locked(&(atom->alock)); ++ assert("nikita-3536", ASTAGE_FREE <= stage && stage <= ASTAGE_INVALID); ++ /* Excelsior! */ ++ assert("nikita-3537", stage >= atom->stage); ++ if (atom->stage != stage) { ++ atom->stage = stage; ++ atom_send_event(atom); ++ } ++} ++ ++/* wake all threads which wait for an event */ ++void atom_send_event(txn_atom * atom) ++{ ++ assert_spin_locked(&(atom->alock)); ++ wakeup_atom_waitfor_list(atom); ++} ++ ++/* Informs txn manager code that owner of this txn_handle should wait atom commit completion (for ++ example, because it does fsync(2)) */ ++static int should_wait_commit(txn_handle * h) ++{ ++ return h->flags & TXNH_WAIT_COMMIT; ++} ++ ++typedef struct commit_data { ++ txn_atom *atom; ++ txn_handle *txnh; ++ long nr_written; ++ /* as an optimization we start committing atom by first trying to ++ * flush it few times without switching into ASTAGE_CAPTURE_WAIT. This ++ * allows to reduce stalls due to other threads waiting for atom in ++ * ASTAGE_CAPTURE_WAIT stage. ->preflush is counter of these ++ * preliminary flushes. */ ++ int preflush; ++ /* have we waited on atom. */ ++ int wait; ++ int failed; ++ int wake_ktxnmgrd_up; ++} commit_data; ++ ++/* ++ * Called from commit_txnh() repeatedly, until either error happens, or atom ++ * commits successfully. ++ */ ++static int try_commit_txnh(commit_data * cd) ++{ ++ int result; ++ ++ assert("nikita-2968", lock_stack_isclean(get_current_lock_stack())); ++ ++ /* Get the atom and txnh locked. 
*/ ++ cd->atom = txnh_get_atom(cd->txnh); ++ assert("jmacd-309", cd->atom != NULL); ++ spin_unlock_txnh(cd->txnh); ++ ++ if (cd->wait) { ++ cd->atom->nr_waiters--; ++ cd->wait = 0; ++ } ++ ++ if (cd->atom->stage == ASTAGE_DONE) ++ return 0; ++ ++ if (cd->failed) ++ return 0; ++ ++ if (atom_should_commit(cd->atom)) { ++ /* if atom is _very_ large schedule it for commit as soon as ++ * possible. */ ++ if (atom_should_commit_asap(cd->atom)) { ++ /* ++ * When atom is in PRE_COMMIT or later stage following ++ * invariant (encoded in atom_can_be_committed()) ++ * holds: there is exactly one non-waiter transaction ++ * handle opened on this atom. When thread wants to ++ * wait until atom commits (for example sync()) it ++ * waits on atom event after increasing ++ * atom->nr_waiters (see blow in this function). It ++ * cannot be guaranteed that atom is already committed ++ * after receiving event, so loop has to be ++ * re-started. But if atom switched into PRE_COMMIT ++ * stage and became too large, we cannot change its ++ * state back to CAPTURE_WAIT (atom stage can only ++ * increase monotonically), hence this check. ++ */ ++ if (cd->atom->stage < ASTAGE_CAPTURE_WAIT) ++ atom_set_stage(cd->atom, ASTAGE_CAPTURE_WAIT); ++ cd->atom->flags |= ATOM_FORCE_COMMIT; ++ } ++ if (cd->txnh->flags & TXNH_DONT_COMMIT) { ++ /* ++ * this thread (transaction handle that is) doesn't ++ * want to commit atom. Notify waiters that handle is ++ * closed. This can happen, for example, when we are ++ * under VFS directory lock and don't want to commit ++ * atom right now to avoid stalling other threads ++ * working in the same directory. ++ */ ++ ++ /* Wake the ktxnmgrd up if the ktxnmgrd is needed to ++ * commit this atom: no atom waiters and only one ++ * (our) open transaction handle. */ ++ cd->wake_ktxnmgrd_up = ++ cd->atom->txnh_count == 1 && ++ cd->atom->nr_waiters == 0; ++ atom_send_event(cd->atom); ++ result = 0; ++ } else if (!atom_can_be_committed(cd->atom)) { ++ if (should_wait_commit(cd->txnh)) { ++ /* sync(): wait for commit */ ++ cd->atom->nr_waiters++; ++ cd->wait = 1; ++ atom_wait_event(cd->atom); ++ result = RETERR(-E_REPEAT); ++ } else { ++ result = 0; ++ } ++ } else if (cd->preflush > 0 && !is_current_ktxnmgrd()) { ++ /* ++ * optimization: flush atom without switching it into ++ * ASTAGE_CAPTURE_WAIT. ++ * ++ * But don't do this for ktxnmgrd, because ktxnmgrd ++ * should never block on atom fusion. ++ */ ++ result = flush_current_atom(JNODE_FLUSH_WRITE_BLOCKS, ++ LONG_MAX, &cd->nr_written, ++ &cd->atom, NULL); ++ if (result == 0) { ++ spin_unlock_atom(cd->atom); ++ cd->preflush = 0; ++ result = RETERR(-E_REPEAT); ++ } else /* Atoms wasn't flushed ++ * completely. Rinse. Repeat. */ ++ --cd->preflush; ++ } else { ++ /* We change atom state to ASTAGE_CAPTURE_WAIT to ++ prevent atom fusion and count ourself as an active ++ flusher */ ++ atom_set_stage(cd->atom, ASTAGE_CAPTURE_WAIT); ++ cd->atom->flags |= ATOM_FORCE_COMMIT; ++ ++ result = ++ commit_current_atom(&cd->nr_written, &cd->atom); ++ if (result != 0 && result != -E_REPEAT) ++ cd->failed = 1; ++ } ++ } else ++ result = 0; ++ ++#if REISER4_DEBUG ++ if (result == 0) ++ assert_spin_locked(&(cd->atom->alock)); ++#endif ++ ++ /* perfectly valid assertion, except that when atom/txnh is not locked ++ * fusion can take place, and cd->atom points nowhere. */ ++ /* ++ assert("jmacd-1028", ergo(result != 0, spin_atom_is_not_locked(cd->atom))); ++ */ ++ return result; ++} ++ ++/* Called to commit a transaction handle. 
This decrements the atom's number of open ++ handles and if it is the last handle to commit and the atom should commit, initiates ++ atom commit. if commit does not fail, return number of written blocks */ ++static int commit_txnh(txn_handle * txnh) ++{ ++ commit_data cd; ++ assert("umka-192", txnh != NULL); ++ ++ memset(&cd, 0, sizeof cd); ++ cd.txnh = txnh; ++ cd.preflush = 10; ++ ++ /* calls try_commit_txnh() until either atom commits, or error ++ * happens */ ++ while (try_commit_txnh(&cd) != 0) ++ preempt_point(); ++ ++ spin_lock_txnh(txnh); ++ ++ cd.atom->txnh_count -= 1; ++ txnh->atom = NULL; ++ /* remove transaction handle from atom's list of transaction handles */ ++ list_del_init(&txnh->txnh_link); ++ ++ spin_unlock_txnh(txnh); ++ atom_dec_and_unlock(cd.atom); ++ /* if we don't want to do a commit (TXNH_DONT_COMMIT is set, probably ++ * because it takes time) by current thread, we do that work ++ * asynchronously by ktxnmgrd daemon. */ ++ if (cd.wake_ktxnmgrd_up) ++ ktxnmgrd_kick(&get_current_super_private()->tmgr); ++ ++ return 0; ++} ++ ++/* TRY_CAPTURE */ ++ ++/* This routine attempts a single block-capture request. It may return -E_REPEAT if some ++ condition indicates that the request should be retried, and it may block if the ++ txn_capture mode does not include the TXN_CAPTURE_NONBLOCKING request flag. ++ ++ This routine encodes the basic logic of block capturing described by: ++ ++ http://namesys.com/v4/v4.html ++ ++ Our goal here is to ensure that any two blocks that contain dependent modifications ++ should commit at the same time. This function enforces this discipline by initiating ++ fusion whenever a transaction handle belonging to one atom requests to read or write a ++ block belonging to another atom (TXN_CAPTURE_WRITE or TXN_CAPTURE_READ_ATOMIC). ++ ++ In addition, this routine handles the initial assignment of atoms to blocks and ++ transaction handles. These are possible outcomes of this function: ++ ++ 1. The block and handle are already part of the same atom: return immediate success ++ ++ 2. The block is assigned but the handle is not: call capture_assign_txnh to assign ++ the handle to the block's atom. ++ ++ 3. The handle is assigned but the block is not: call capture_assign_block to assign ++ the block to the handle's atom. ++ ++ 4. Both handle and block are assigned, but to different atoms: call capture_init_fusion ++ to fuse atoms. ++ ++ 5. Neither block nor handle are assigned: create a new atom and assign them both. ++ ++ 6. A read request for a non-captured block: return immediate success. ++ ++ This function acquires and releases the handle's spinlock. This function is called ++ under the jnode lock and if the return value is 0, it returns with the jnode lock still ++ held. If the return is -E_REPEAT or some other error condition, the jnode lock is ++ released. The external interface (try_capture) manages re-aquiring the jnode lock ++ in the failure case. ++*/ ++static int try_capture_block( ++ txn_handle * txnh, jnode * node, txn_capture mode, ++ txn_atom ** atom_alloc) ++{ ++ txn_atom *block_atom; ++ txn_atom *txnh_atom; ++ ++ /* Should not call capture for READ_NONCOM requests, handled in try_capture. */ ++ assert("jmacd-567", CAPTURE_TYPE(mode) != TXN_CAPTURE_READ_NONCOM); ++ ++ /* FIXME-ZAM-HANS: FIXME_LATER_JMACD Should assert that atom->tree == ++ * node->tree somewhere. */ ++ assert("umka-194", txnh != NULL); ++ assert("umka-195", node != NULL); ++ ++ /* The jnode is already locked! Being called from try_capture(). 
*/ ++ assert_spin_locked(&(node->guard)); ++ block_atom = node->atom; ++ ++ /* Get txnh spinlock, this allows us to compare txn_atom pointers but it doesn't ++ let us touch the atoms themselves. */ ++ spin_lock_txnh(txnh); ++ txnh_atom = txnh->atom; ++ /* Process of capturing continues into one of four branches depends on ++ which atoms from (block atom (node->atom), current atom (txnh->atom)) ++ exist. */ ++ if (txnh_atom == NULL) { ++ if (block_atom == NULL) { ++ spin_unlock_txnh(txnh); ++ spin_unlock_jnode(node); ++ /* assign empty atom to the txnh and repeat */ ++ return atom_begin_and_assign_to_txnh(atom_alloc, txnh); ++ } else { ++ atomic_inc(&block_atom->refcount); ++ /* node spin-lock isn't needed anymore */ ++ spin_unlock_jnode(node); ++ if (!spin_trylock_atom(block_atom)) { ++ spin_unlock_txnh(txnh); ++ spin_lock_atom(block_atom); ++ spin_lock_txnh(txnh); ++ } ++ /* re-check state after getting txnh and the node ++ * atom spin-locked */ ++ if (node->atom != block_atom || txnh->atom != NULL) { ++ spin_unlock_txnh(txnh); ++ atom_dec_and_unlock(block_atom); ++ return RETERR(-E_REPEAT); ++ } ++ atomic_dec(&block_atom->refcount); ++ if (block_atom->stage > ASTAGE_CAPTURE_WAIT || ++ (block_atom->stage == ASTAGE_CAPTURE_WAIT && ++ block_atom->txnh_count != 0)) ++ return capture_fuse_wait(txnh, block_atom, NULL, mode); ++ capture_assign_txnh_nolock(block_atom, txnh); ++ spin_unlock_txnh(txnh); ++ spin_unlock_atom(block_atom); ++ return RETERR(-E_REPEAT); ++ } ++ } else { ++ /* It is time to perform deadlock prevention check over the ++ node we want to capture. It is possible this node was locked ++ for read without capturing it. The optimization which allows ++ to do it helps us in keeping atoms independent as long as ++ possible but it may cause lock/fuse deadlock problems. ++ ++ A number of similar deadlock situations with locked but not ++ captured nodes were found. In each situation there are two ++ or more threads: one of them does flushing while another one ++ does routine balancing or tree lookup. The flushing thread ++ (F) sleeps in long term locking request for node (N), another ++ thread (A) sleeps in trying to capture some node already ++ belonging the atom F, F has a state which prevents ++ immediately fusion . ++ ++ Deadlocks of this kind cannot happen if node N was properly ++ captured by thread A. The F thread fuse atoms before locking ++ therefore current atom of thread F and current atom of thread ++ A became the same atom and thread A may proceed. This does ++ not work if node N was not captured because the fusion of ++ atom does not happens. ++ ++ The following scheme solves the deadlock: If ++ longterm_lock_znode locks and does not capture a znode, that ++ znode is marked as MISSED_IN_CAPTURE. A node marked this way ++ is processed by the code below which restores the missed ++ capture and fuses current atoms of all the node lock owners ++ by calling the fuse_not_fused_lock_owners() function. 
*/ ++ if (JF_ISSET(node, JNODE_MISSED_IN_CAPTURE)) { ++ JF_CLR(node, JNODE_MISSED_IN_CAPTURE); ++ if (jnode_is_znode(node) && znode_is_locked(JZNODE(node))) { ++ spin_unlock_txnh(txnh); ++ spin_unlock_jnode(node); ++ fuse_not_fused_lock_owners(txnh, JZNODE(node)); ++ return RETERR(-E_REPEAT); ++ } ++ } ++ if (block_atom == NULL) { ++ atomic_inc(&txnh_atom->refcount); ++ spin_unlock_txnh(txnh); ++ if (!spin_trylock_atom(txnh_atom)) { ++ spin_unlock_jnode(node); ++ spin_lock_atom(txnh_atom); ++ spin_lock_jnode(node); ++ } ++ if (txnh->atom != txnh_atom || node->atom != NULL ++ || JF_ISSET(node, JNODE_IS_DYING)) { ++ spin_unlock_jnode(node); ++ atom_dec_and_unlock(txnh_atom); ++ return RETERR(-E_REPEAT); ++ } ++ atomic_dec(&txnh_atom->refcount); ++ capture_assign_block_nolock(txnh_atom, node); ++ spin_unlock_atom(txnh_atom); ++ } else { ++ if (txnh_atom != block_atom) { ++ if (mode & TXN_CAPTURE_DONT_FUSE) { ++ spin_unlock_txnh(txnh); ++ spin_unlock_jnode(node); ++ /* we are in a "no-fusion" mode and @node is ++ * already part of transaction. */ ++ return RETERR(-E_NO_NEIGHBOR); ++ } ++ return capture_init_fusion(node, txnh, mode); ++ } ++ spin_unlock_txnh(txnh); ++ } ++ } ++ return 0; ++} ++ ++static txn_capture ++build_capture_mode(jnode * node, znode_lock_mode lock_mode, txn_capture flags) ++{ ++ txn_capture cap_mode; ++ ++ assert_spin_locked(&(node->guard)); ++ ++ /* FIXME_JMACD No way to set TXN_CAPTURE_READ_MODIFY yet. */ ++ ++ if (lock_mode == ZNODE_WRITE_LOCK) { ++ cap_mode = TXN_CAPTURE_WRITE; ++ } else if (node->atom != NULL) { ++ cap_mode = TXN_CAPTURE_WRITE; ++ } else if (0 && /* txnh->mode == TXN_READ_FUSING && */ ++ jnode_get_level(node) == LEAF_LEVEL) { ++ /* NOTE-NIKITA TXN_READ_FUSING is not currently used */ ++ /* We only need a READ_FUSING capture at the leaf level. This ++ is because the internal levels of the tree (twigs included) ++ are redundant from the point of the user that asked for a ++ read-fusing transcrash. The user only wants to read-fuse ++ atoms due to reading uncommitted data that another user has ++ written. It is the file system that reads/writes the ++ internal tree levels, the user only reads/writes leaves. */ ++ cap_mode = TXN_CAPTURE_READ_ATOMIC; ++ } else { ++ /* In this case (read lock at a non-leaf) there's no reason to ++ * capture. */ ++ /* cap_mode = TXN_CAPTURE_READ_NONCOM; */ ++ return 0; ++ } ++ ++ cap_mode |= (flags & (TXN_CAPTURE_NONBLOCKING | TXN_CAPTURE_DONT_FUSE)); ++ assert("nikita-3186", cap_mode != 0); ++ return cap_mode; ++} ++ ++/* This is an external interface to try_capture_block(), it calls ++ try_capture_block() repeatedly as long as -E_REPEAT is returned. ++ ++ @node: node to capture, ++ @lock_mode: read or write lock is used in capture mode calculation, ++ @flags: see txn_capture flags enumeration, ++ @can_coc : can copy-on-capture ++ ++ @return: 0 - node was successfully captured, -E_REPEAT - capture request ++ cannot be processed immediately as it was requested in flags, ++ < 0 - other errors. 
++*/ ++int try_capture(jnode *node, znode_lock_mode lock_mode, ++ txn_capture flags) ++{ ++ txn_atom *atom_alloc = NULL; ++ txn_capture cap_mode; ++ txn_handle *txnh = get_current_context()->trans; ++ int ret; ++ ++ assert_spin_locked(&(node->guard)); ++ ++ repeat: ++ if (JF_ISSET(node, JNODE_IS_DYING)) ++ return RETERR(-EINVAL); ++ if (node->atom != NULL && txnh->atom == node->atom) ++ return 0; ++ cap_mode = build_capture_mode(node, lock_mode, flags); ++ if (cap_mode == 0 || ++ (!(cap_mode & TXN_CAPTURE_WTYPES) && node->atom == NULL)) { ++ /* Mark this node as "MISSED". It helps in further deadlock ++ * analysis */ ++ if (jnode_is_znode(node)) ++ JF_SET(node, JNODE_MISSED_IN_CAPTURE); ++ return 0; ++ } ++ /* Repeat try_capture as long as -E_REPEAT is returned. */ ++ ret = try_capture_block(txnh, node, cap_mode, &atom_alloc); ++ /* Regardless of non_blocking: ++ ++ If ret == 0 then jnode is still locked. ++ If ret != 0 then jnode is unlocked. ++ */ ++#if REISER4_DEBUG ++ if (ret == 0) ++ assert_spin_locked(&(node->guard)); ++ else ++ assert_spin_not_locked(&(node->guard)); ++#endif ++ assert_spin_not_locked(&(txnh->guard)); ++ ++ if (ret == -E_REPEAT) { ++ /* E_REPEAT implies all locks were released, therefore we need ++ to take the jnode's lock again. */ ++ spin_lock_jnode(node); ++ ++ /* Although this may appear to be a busy loop, it is not. ++ There are several conditions that cause E_REPEAT to be ++ returned by the call to try_capture_block, all cases ++ indicating some kind of state change that means you should ++ retry the request and will get a different result. In some ++ cases this could be avoided with some extra code, but ++ generally it is done because the necessary locks were ++ released as a result of the operation and repeating is the ++ simplest thing to do (less bug potential). The cases are: ++ atom fusion returns E_REPEAT after it completes (jnode and ++ txnh were unlocked); race conditions in assign_block, ++ assign_txnh, and init_fusion return E_REPEAT (trylock ++ failure); after going to sleep in capture_fuse_wait ++ (request was blocked but may now succeed). I'm not quite ++ sure how capture_copy works yet, but it may also return ++ E_REPEAT. When the request is legitimately blocked, the ++ requestor goes to sleep in fuse_wait, so this is not a busy ++ loop. */ ++ /* NOTE-NIKITA: still don't understand: ++ ++ try_capture_block->capture_assign_txnh->spin_trylock_atom->E_REPEAT ++ ++ looks like busy loop? ++ */ ++ goto repeat; ++ } ++ ++ /* free extra atom object that was possibly allocated by ++ try_capture_block(). ++ ++ Do this before acquiring jnode spin lock to ++ minimize time spent under lock. --nikita */ ++ if (atom_alloc != NULL) { ++ kmem_cache_free(_atom_slab, atom_alloc); ++ } ++ ++ if (ret != 0) { ++ if (ret == -E_BLOCK) { ++ assert("nikita-3360", ++ cap_mode & TXN_CAPTURE_NONBLOCKING); ++ ret = -E_REPEAT; ++ } ++ ++ /* Failure means jnode is not locked. FIXME_LATER_JMACD May ++ want to fix the above code to avoid releasing the lock and ++ re-acquiring it, but there are cases were failure occurs ++ when the lock is not held, and those cases would need to be ++ modified to re-take the lock. */ ++ spin_lock_jnode(node); ++ } ++ ++ /* Jnode is still locked. 
*/ ++ assert_spin_locked(&(node->guard)); ++ return ret; ++} ++ ++static void release_two_atoms(txn_atom *one, txn_atom *two) ++{ ++ spin_unlock_atom(one); ++ atom_dec_and_unlock(two); ++ spin_lock_atom(one); ++ atom_dec_and_unlock(one); ++} ++ ++/* This function sets up a call to try_capture_block and repeats as long as -E_REPEAT is ++ returned by that routine. The txn_capture request mode is computed here depending on ++ the transaction handle's type and the lock request. This is called from the depths of ++ the lock manager with the jnode lock held and it always returns with the jnode lock ++ held. ++*/ ++ ++/* fuse all 'active' atoms of lock owners of given node. */ ++static void fuse_not_fused_lock_owners(txn_handle * txnh, znode * node) ++{ ++ lock_handle *lh; ++ int repeat; ++ txn_atom *atomh, *atomf; ++ reiser4_context *me = get_current_context(); ++ reiser4_context *ctx = NULL; ++ ++ assert_spin_not_locked(&(ZJNODE(node)->guard)); ++ assert_spin_not_locked(&(txnh->hlock)); ++ ++ repeat: ++ repeat = 0; ++ atomh = txnh_get_atom(txnh); ++ spin_unlock_txnh(txnh); ++ assert("zam-692", atomh != NULL); ++ ++ spin_lock_zlock(&node->lock); ++ /* inspect list of lock owners */ ++ list_for_each_entry(lh, &node->lock.owners, owners_link) { ++ ctx = get_context_by_lock_stack(lh->owner); ++ if (ctx == me) ++ continue; ++ /* below we use two assumptions to avoid addition spin-locks ++ for checking the condition : ++ ++ 1) if the lock stack has lock, the transaction should be ++ opened, i.e. ctx->trans != NULL; ++ ++ 2) reading of well-aligned ctx->trans->atom is atomic, if it ++ equals to the address of spin-locked atomh, we take that ++ the atoms are the same, nothing has to be captured. */ ++ if (atomh != ctx->trans->atom) { ++ reiser4_wake_up(lh->owner); ++ repeat = 1; ++ break; ++ } ++ } ++ if (repeat) { ++ if (!spin_trylock_txnh(ctx->trans)) { ++ spin_unlock_zlock(&node->lock); ++ spin_unlock_atom(atomh); ++ goto repeat; ++ } ++ atomf = ctx->trans->atom; ++ if (atomf == NULL) { ++ capture_assign_txnh_nolock(atomh, ctx->trans); ++ /* release zlock lock _after_ assigning the atom to the ++ * transaction handle, otherwise the lock owner thread ++ * may unlock all znodes, exit kernel context and here ++ * we would access an invalid transaction handle. */ ++ spin_unlock_zlock(&node->lock); ++ spin_unlock_atom(atomh); ++ spin_unlock_txnh(ctx->trans); ++ goto repeat; ++ } ++ assert("zam-1059", atomf != atomh); ++ spin_unlock_zlock(&node->lock); ++ atomic_inc(&atomh->refcount); ++ atomic_inc(&atomf->refcount); ++ spin_unlock_txnh(ctx->trans); ++ if (atomf > atomh) { ++ spin_lock_atom(atomf); ++ } else { ++ spin_unlock_atom(atomh); ++ spin_lock_atom(atomf); ++ spin_lock_atom(atomh); ++ } ++ if (atomh == atomf || !atom_isopen(atomh) || !atom_isopen(atomf)) { ++ release_two_atoms(atomf, atomh); ++ goto repeat; ++ } ++ atomic_dec(&atomh->refcount); ++ atomic_dec(&atomf->refcount); ++ capture_fuse_into(atomf, atomh); ++ goto repeat; ++ } ++ spin_unlock_zlock(&node->lock); ++ spin_unlock_atom(atomh); ++} ++ ++/* This is the interface to capture unformatted nodes via their struct page ++ reference. 
Currently it is only used in reiser4_invalidatepage */ ++int try_capture_page_to_invalidate(struct page *pg) ++{ ++ int ret; ++ jnode *node; ++ ++ assert("umka-292", pg != NULL); ++ assert("nikita-2597", PageLocked(pg)); ++ ++ if (IS_ERR(node = jnode_of_page(pg))) { ++ return PTR_ERR(node); ++ } ++ ++ spin_lock_jnode(node); ++ unlock_page(pg); ++ ++ ret = try_capture(node, ZNODE_WRITE_LOCK, 0); ++ spin_unlock_jnode(node); ++ jput(node); ++ lock_page(pg); ++ return ret; ++} ++ ++/* This informs the transaction manager when a node is deleted. Add the block to the ++ atom's delete set and uncapture the block. ++ ++VS-FIXME-HANS: this E_REPEAT paradigm clutters the code and creates a need for ++explanations. find all the functions that use it, and unless there is some very ++good reason to use it (I have not noticed one so far and I doubt it exists, but maybe somewhere somehow....), ++move the loop to inside the function. ++ ++VS-FIXME-HANS: can this code be at all streamlined? In particular, can you lock and unlock the jnode fewer times? ++ */ ++void uncapture_page(struct page *pg) ++{ ++ jnode *node; ++ txn_atom *atom; ++ ++ assert("umka-199", pg != NULL); ++ assert("nikita-3155", PageLocked(pg)); ++ ++ clear_page_dirty_for_io(pg); ++ ++ reiser4_wait_page_writeback(pg); ++ ++ node = jprivate(pg); ++ BUG_ON(node == NULL); ++ ++ spin_lock_jnode(node); ++ ++ atom = jnode_get_atom(node); ++ if (atom == NULL) { ++ assert("jmacd-7111", !JF_ISSET(node, JNODE_DIRTY)); ++ spin_unlock_jnode(node); ++ return; ++ } ++ ++ /* We can remove jnode from transaction even if it is on flush queue ++ * prepped list, we only need to be sure that flush queue is not being ++ * written by write_fq(). write_fq() does not use atom spin lock for ++ * protection of the prepped nodes list, instead write_fq() increments ++ * atom's nr_running_queues counters for the time when prepped list is ++ * not protected by spin lock. Here we check this counter if we want ++ * to remove jnode from flush queue and, if the counter is not zero, ++ * wait all write_fq() for this atom to complete. This is not ++ * significant overhead. */ ++ while (JF_ISSET(node, JNODE_FLUSH_QUEUED) && atom->nr_running_queues) { ++ spin_unlock_jnode(node); ++ /* ++ * at this moment we want to wait for "atom event", viz. wait ++ * until @node can be removed from flush queue. But ++ * atom_wait_event() cannot be called with page locked, because ++ * it deadlocks with jnode_extent_write(). Unlock page, after ++ * making sure (through page_cache_get()) that it cannot be ++ * released from memory. ++ */ ++ page_cache_get(pg); ++ unlock_page(pg); ++ atom_wait_event(atom); ++ lock_page(pg); ++ /* ++ * page may has been detached by ->writepage()->releasepage(). 
++ */ ++ reiser4_wait_page_writeback(pg); ++ spin_lock_jnode(node); ++ page_cache_release(pg); ++ atom = jnode_get_atom(node); ++/* VS-FIXME-HANS: improve the commenting in this function */ ++ if (atom == NULL) { ++ spin_unlock_jnode(node); ++ return; ++ } ++ } ++ uncapture_block(node); ++ spin_unlock_atom(atom); ++ jput(node); ++} ++ ++/* this is used in extent's kill hook to uncapture and unhash jnodes attached to ++ * inode's tree of jnodes */ ++void uncapture_jnode(jnode * node) ++{ ++ txn_atom *atom; ++ ++ assert_spin_locked(&(node->guard)); ++ assert("", node->pg == 0); ++ ++ atom = jnode_get_atom(node); ++ if (atom == NULL) { ++ assert("jmacd-7111", !JF_ISSET(node, JNODE_DIRTY)); ++ spin_unlock_jnode(node); ++ return; ++ } ++ ++ uncapture_block(node); ++ spin_unlock_atom(atom); ++ jput(node); ++} ++ ++/* No-locking version of assign_txnh. Sets the transaction handle's atom pointer, ++ increases atom refcount and txnh_count, adds to txnh_list. */ ++static void capture_assign_txnh_nolock(txn_atom *atom, txn_handle *txnh) ++{ ++ assert("umka-200", atom != NULL); ++ assert("umka-201", txnh != NULL); ++ ++ assert_spin_locked(&(txnh->hlock)); ++ assert_spin_locked(&(atom->alock)); ++ assert("jmacd-824", txnh->atom == NULL); ++ assert("nikita-3540", atom_isopen(atom)); ++ BUG_ON(txnh->atom != NULL); ++ ++ atomic_inc(&atom->refcount); ++ txnh->atom = atom; ++ set_gfp_mask(); ++ list_add_tail(&txnh->txnh_link, &atom->txnh_list); ++ atom->txnh_count += 1; ++} ++ ++/* No-locking version of assign_block. Sets the block's atom pointer, references the ++ block, adds it to the clean or dirty capture_jnode list, increments capture_count. */ ++static void capture_assign_block_nolock(txn_atom *atom, jnode *node) ++{ ++ assert("umka-202", atom != NULL); ++ assert("umka-203", node != NULL); ++ assert_spin_locked(&(node->guard)); ++ assert_spin_locked(&(atom->alock)); ++ assert("jmacd-323", node->atom == NULL); ++ BUG_ON(!list_empty_careful(&node->capture_link)); ++ assert("nikita-3470", !JF_ISSET(node, JNODE_DIRTY)); ++ ++ /* Pointer from jnode to atom is not counted in atom->refcount. */ ++ node->atom = atom; ++ ++ list_add_tail(&node->capture_link, ATOM_CLEAN_LIST(atom)); ++ atom->capture_count += 1; ++ /* reference to jnode is acquired by atom. */ ++ jref(node); ++ ++ ON_DEBUG(count_jnode(atom, node, NOT_CAPTURED, CLEAN_LIST, 1)); ++ ++ LOCK_CNT_INC(t_refs); ++} ++ ++/* common code for dirtying both unformatted jnodes and formatted znodes. */ ++static void do_jnode_make_dirty(jnode * node, txn_atom * atom) ++{ ++ assert_spin_locked(&(node->guard)); ++ assert_spin_locked(&(atom->alock)); ++ assert("jmacd-3981", !JF_ISSET(node, JNODE_DIRTY)); ++ ++ JF_SET(node, JNODE_DIRTY); ++ ++ get_current_context()->nr_marked_dirty++; ++ ++ /* We grab2flush_reserve one additional block only if node was ++ not CREATED and jnode_flush did not sort it into neither ++ relocate set nor overwrite one. If node is in overwrite or ++ relocate set we assume that atom's flush reserved counter was ++ already adjusted. 
*/ ++ if (!JF_ISSET(node, JNODE_CREATED) && !JF_ISSET(node, JNODE_RELOC) ++ && !JF_ISSET(node, JNODE_OVRWR) && jnode_is_leaf(node) ++ && !jnode_is_cluster_page(node)) { ++ assert("vs-1093", !blocknr_is_fake(&node->blocknr)); ++ assert("vs-1506", *jnode_get_block(node) != 0); ++ grabbed2flush_reserved_nolock(atom, (__u64) 1); ++ JF_SET(node, JNODE_FLUSH_RESERVED); ++ } ++ ++ if (!JF_ISSET(node, JNODE_FLUSH_QUEUED)) { ++ /* If the atom is not set yet, it will be added to the appropriate list in ++ capture_assign_block_nolock. */ ++ /* Sometimes a node is set dirty before being captured -- the case for new ++ jnodes. In that case the jnode will be added to the appropriate list ++ in capture_assign_block_nolock. Another reason not to re-link jnode is ++ that jnode is on a flush queue (see flush.c for details) */ ++ ++ int level = jnode_get_level(node); ++ ++ assert("nikita-3152", !JF_ISSET(node, JNODE_OVRWR)); ++ assert("zam-654", atom->stage < ASTAGE_PRE_COMMIT); ++ assert("nikita-2607", 0 <= level); ++ assert("nikita-2606", level <= REAL_MAX_ZTREE_HEIGHT); ++ ++ /* move node to atom's dirty list */ ++ list_move_tail(&node->capture_link, ATOM_DIRTY_LIST(atom, level)); ++ ON_DEBUG(count_jnode ++ (atom, node, NODE_LIST(node), DIRTY_LIST, 1)); ++ } ++} ++ ++/* Set the dirty status for this (spin locked) jnode. */ ++void jnode_make_dirty_locked(jnode * node) ++{ ++ assert("umka-204", node != NULL); ++ assert_spin_locked(&(node->guard)); ++ ++ if (REISER4_DEBUG && rofs_jnode(node)) { ++ warning("nikita-3365", "Dirtying jnode on rofs"); ++ dump_stack(); ++ } ++ ++ /* Fast check for already dirty node */ ++ if (!JF_ISSET(node, JNODE_DIRTY)) { ++ txn_atom *atom; ++ ++ atom = jnode_get_atom(node); ++ assert("vs-1094", atom); ++ /* Check jnode dirty status again because node spin lock might ++ * be released inside jnode_get_atom(). */ ++ if (likely(!JF_ISSET(node, JNODE_DIRTY))) ++ do_jnode_make_dirty(node, atom); ++ spin_unlock_atom(atom); ++ } ++} ++ ++/* Set the dirty status for this znode. */ ++void znode_make_dirty(znode * z) ++{ ++ jnode *node; ++ struct page *page; ++ ++ assert("umka-204", z != NULL); ++ assert("nikita-3290", znode_above_root(z) || znode_is_loaded(z)); ++ assert("nikita-3560", znode_is_write_locked(z)); ++ ++ node = ZJNODE(z); ++ /* znode is longterm locked, we can check dirty bit without spinlock */ ++ if (JF_ISSET(node, JNODE_DIRTY)) { ++ /* znode is dirty already. All we have to do is to change znode version */ ++ z->version = znode_build_version(jnode_get_tree(node)); ++ return; ++ } ++ ++ spin_lock_jnode(node); ++ jnode_make_dirty_locked(node); ++ page = jnode_page(node); ++ if (page != NULL) { ++ /* this is useful assertion (allows one to check that no ++ * modifications are lost due to update of in-flight page), ++ * but it requires locking on page to check PG_writeback ++ * bit. */ ++ /* assert("nikita-3292", ++ !PageWriteback(page) || ZF_ISSET(z, JNODE_WRITEBACK)); */ ++ page_cache_get(page); ++ ++ /* jnode lock is not needed for the rest of ++ * znode_set_dirty(). */ ++ spin_unlock_jnode(node); ++ /* reiser4 file write code calls set_page_dirty for ++ * unformatted nodes, for formatted nodes we do it here. 
*/ ++ set_page_dirty_internal(page); ++ page_cache_release(page); ++ /* bump version counter in znode */ ++ z->version = znode_build_version(jnode_get_tree(node)); ++ } else { ++ assert("zam-596", znode_above_root(JZNODE(node))); ++ spin_unlock_jnode(node); ++ } ++ ++ assert("nikita-1900", znode_is_write_locked(z)); ++ assert("jmacd-9777", node->atom != NULL); ++} ++ ++int sync_atom(txn_atom * atom) ++{ ++ int result; ++ txn_handle *txnh; ++ ++ txnh = get_current_context()->trans; ++ ++ result = 0; ++ if (atom != NULL) { ++ if (atom->stage < ASTAGE_PRE_COMMIT) { ++ spin_lock_txnh(txnh); ++ capture_assign_txnh_nolock(atom, txnh); ++ result = force_commit_atom(txnh); ++ } else if (atom->stage < ASTAGE_POST_COMMIT) { ++ /* wait atom commit */ ++ atom_wait_event(atom); ++ /* try once more */ ++ result = RETERR(-E_REPEAT); ++ } else ++ spin_unlock_atom(atom); ++ } ++ return result; ++} ++ ++#if REISER4_DEBUG ++ ++/* move jnode form one list to another ++ call this after atom->capture_count is updated */ ++void ++count_jnode(txn_atom * atom, jnode * node, atom_list old_list, ++ atom_list new_list, int check_lists) ++{ ++ struct list_head *pos; ++ ++ assert("zam-1018", atom_is_protected(atom)); ++ assert_spin_locked(&(node->guard)); ++ assert("", NODE_LIST(node) == old_list); ++ ++ switch (NODE_LIST(node)) { ++ case NOT_CAPTURED: ++ break; ++ case DIRTY_LIST: ++ assert("", atom->dirty > 0); ++ atom->dirty--; ++ break; ++ case CLEAN_LIST: ++ assert("", atom->clean > 0); ++ atom->clean--; ++ break; ++ case FQ_LIST: ++ assert("", atom->fq > 0); ++ atom->fq--; ++ break; ++ case WB_LIST: ++ assert("", atom->wb > 0); ++ atom->wb--; ++ break; ++ case OVRWR_LIST: ++ assert("", atom->ovrwr > 0); ++ atom->ovrwr--; ++ break; ++ default: ++ impossible("", ""); ++ } ++ ++ switch (new_list) { ++ case NOT_CAPTURED: ++ break; ++ case DIRTY_LIST: ++ atom->dirty++; ++ break; ++ case CLEAN_LIST: ++ atom->clean++; ++ break; ++ case FQ_LIST: ++ atom->fq++; ++ break; ++ case WB_LIST: ++ atom->wb++; ++ break; ++ case OVRWR_LIST: ++ atom->ovrwr++; ++ break; ++ default: ++ impossible("", ""); ++ } ++ ASSIGN_NODE_LIST(node, new_list); ++ if (0 && check_lists) { ++ int count; ++ tree_level level; ++ ++ count = 0; ++ ++ /* flush queue list */ ++ /*check_fq(atom); */ ++ ++ /* dirty list */ ++ count = 0; ++ for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1) { ++ list_for_each(pos, ATOM_DIRTY_LIST(atom, level)) ++ count++; ++ } ++ if (count != atom->dirty) ++ warning("", "dirty counter %d, real %d\n", atom->dirty, ++ count); ++ ++ /* clean list */ ++ count = 0; ++ list_for_each(pos, ATOM_CLEAN_LIST(atom)) ++ count++; ++ if (count != atom->clean) ++ warning("", "clean counter %d, real %d\n", atom->clean, ++ count); ++ ++ /* wb list */ ++ count = 0; ++ list_for_each(pos, ATOM_WB_LIST(atom)) ++ count++; ++ if (count != atom->wb) ++ warning("", "wb counter %d, real %d\n", atom->wb, ++ count); ++ ++ /* overwrite list */ ++ count = 0; ++ list_for_each(pos, ATOM_OVRWR_LIST(atom)) ++ count++; ++ ++ if (count != atom->ovrwr) ++ warning("", "ovrwr counter %d, real %d\n", atom->ovrwr, ++ count); ++ } ++ assert("vs-1624", atom->num_queued == atom->fq); ++ if (atom->capture_count != ++ atom->dirty + atom->clean + atom->ovrwr + atom->wb + atom->fq) { ++ printk ++ ("count %d, dirty %d clean %d ovrwr %d wb %d fq %d\n", ++ atom->capture_count, atom->dirty, atom->clean, atom->ovrwr, ++ atom->wb, atom->fq); ++ assert("vs-1622", ++ atom->capture_count == ++ atom->dirty + atom->clean + atom->ovrwr + atom->wb + ++ atom->fq); ++ } ++} ++ 
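[Editor's note: the "if (0 && check_lists)" above deliberately compiles out the O(n) list walks that cross-check the per-list counters; change the 0 to 1 to re-enable the full verification while debugging list accounting.]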
++#endif
++
++/* Make node OVRWR and put it on atom->overwrite_nodes list, atom lock and jnode
++ * lock should be taken before calling this function. */
++void jnode_make_wander_nolock(jnode * node)
++{
++	txn_atom *atom;
++
++	assert("nikita-2431", node != NULL);
++	assert("nikita-2432", !JF_ISSET(node, JNODE_RELOC));
++	assert("nikita-3153", JF_ISSET(node, JNODE_DIRTY));
++	assert("zam-897", !JF_ISSET(node, JNODE_FLUSH_QUEUED));
++	assert("nikita-3367", !blocknr_is_fake(jnode_get_block(node)));
++
++	atom = node->atom;
++
++	assert("zam-895", atom != NULL);
++	assert("zam-894", atom_is_protected(atom));
++
++	JF_SET(node, JNODE_OVRWR);
++	/* move node to atom's overwrite list */
++	list_move_tail(&node->capture_link, ATOM_OVRWR_LIST(atom));
++	ON_DEBUG(count_jnode(atom, node, DIRTY_LIST, OVRWR_LIST, 1));
++}
++
++/* Same as jnode_make_wander_nolock, but all necessary locks are taken inside
++ * this function. */
++void jnode_make_wander(jnode * node)
++{
++	txn_atom *atom;
++
++	spin_lock_jnode(node);
++	atom = jnode_get_atom(node);
++	assert("zam-913", atom != NULL);
++	assert("zam-914", !JF_ISSET(node, JNODE_RELOC));
++
++	jnode_make_wander_nolock(node);
++	spin_unlock_atom(atom);
++	spin_unlock_jnode(node);
++}
++
++/* this just sets the RELOC bit */
++static void jnode_make_reloc_nolock(flush_queue_t * fq, jnode * node)
++{
++	assert_spin_locked(&(node->guard));
++	assert("zam-916", JF_ISSET(node, JNODE_DIRTY));
++	assert("zam-917", !JF_ISSET(node, JNODE_RELOC));
++	assert("zam-918", !JF_ISSET(node, JNODE_OVRWR));
++	assert("zam-920", !JF_ISSET(node, JNODE_FLUSH_QUEUED));
++	assert("nikita-3367", !blocknr_is_fake(jnode_get_block(node)));
++	jnode_set_reloc(node);
++}
++
++/* Make znode RELOC and put it on flush queue */
++void znode_make_reloc(znode * z, flush_queue_t * fq)
++{
++	jnode *node;
++	txn_atom *atom;
++
++	node = ZJNODE(z);
++	spin_lock_jnode(node);
++
++	atom = jnode_get_atom(node);
++	assert("zam-919", atom != NULL);
++
++	jnode_make_reloc_nolock(fq, node);
++	queue_jnode(fq, node);
++
++	spin_unlock_atom(atom);
++	spin_unlock_jnode(node);
++}
++
++/* Make unformatted node RELOC and put it on flush queue */
++void unformatted_make_reloc(jnode *node, flush_queue_t *fq)
++{
++	assert("vs-1479", jnode_is_unformatted(node));
++
++	jnode_make_reloc_nolock(fq, node);
++	queue_jnode(fq, node);
++}
++
++int capture_super_block(struct super_block *s)
++{
++	int result;
++	znode *uber;
++	lock_handle lh;
++
++	init_lh(&lh);
++	result = get_uber_znode(get_tree(s),
++				ZNODE_WRITE_LOCK, ZNODE_LOCK_LOPRI, &lh);
++	if (result)
++		return result;
++
++	uber = lh.node;
++	/* Grab one block for the superblock */
++	result = reiser4_grab_space_force((__u64) 1, BA_RESERVED);
++	if (result != 0) {
++		/* release the uber-znode lock taken above before bailing out */
++		done_lh(&lh);
++		return result;
++	}
++
++	znode_make_dirty(uber);
++
++	done_lh(&lh);
++	return 0;
++}
++
++/* Wake up every handle on the atom's WAITFOR list */
++static void wakeup_atom_waitfor_list(txn_atom * atom)
++{
++	txn_wait_links *wlinks;
++
++	assert("umka-210", atom != NULL);
++
++	/* atom is locked */
++	list_for_each_entry(wlinks, &atom->fwaitfor_list, _fwaitfor_link) {
++		if (wlinks->waitfor_cb == NULL ||
++		    wlinks->waitfor_cb(atom, wlinks))
++			/* Wake up. 
*/ ++ reiser4_wake_up(wlinks->_lock_stack); ++ } ++} ++ ++/* Wakeup every handle on the atom's WAITING list */ ++static void wakeup_atom_waiting_list(txn_atom * atom) ++{ ++ txn_wait_links *wlinks; ++ ++ assert("umka-211", atom != NULL); ++ ++ /* atom is locked */ ++ list_for_each_entry(wlinks, &atom->fwaiting_list, _fwaiting_link) { ++ if (wlinks->waiting_cb == NULL || ++ wlinks->waiting_cb(atom, wlinks)) ++ /* Wake up. */ ++ reiser4_wake_up(wlinks->_lock_stack); ++ } ++} ++ ++/* helper function used by capture_fuse_wait() to avoid "spurious wake-ups" */ ++static int wait_for_fusion(txn_atom * atom, txn_wait_links * wlinks) ++{ ++ assert("nikita-3330", atom != NULL); ++ assert_spin_locked(&(atom->alock)); ++ ++ /* atom->txnh_count == 1 is for waking waiters up if we are releasing ++ * last transaction handle. */ ++ return atom->stage != ASTAGE_CAPTURE_WAIT || atom->txnh_count == 1; ++} ++ ++/* The general purpose of this function is to wait on the first of two possible events. ++ The situation is that a handle (and its atom atomh) is blocked trying to capture a ++ block (i.e., node) but the node's atom (atomf) is in the CAPTURE_WAIT state. The ++ handle's atom (atomh) is not in the CAPTURE_WAIT state. However, atomh could fuse with ++ another atom or, due to age, enter the CAPTURE_WAIT state itself, at which point it ++ needs to unblock the handle to avoid deadlock. When the txnh is unblocked it will ++ proceed and fuse the two atoms in the CAPTURE_WAIT state. ++ ++ In other words, if either atomh or atomf change state, the handle will be awakened, ++ thus there are two lists per atom: WAITING and WAITFOR. ++ ++ This is also called by capture_assign_txnh with (atomh == NULL) to wait for atomf to ++ close but it is not assigned to an atom of its own. ++ ++ Lock ordering in this method: all four locks are held: JNODE_LOCK, TXNH_LOCK, ++ BOTH_ATOM_LOCKS. Result: all four locks are released. ++*/ ++static int capture_fuse_wait(txn_handle * txnh, txn_atom * atomf, ++ txn_atom * atomh, txn_capture mode) ++{ ++ int ret; ++ txn_wait_links wlinks; ++ ++ assert("umka-213", txnh != NULL); ++ assert("umka-214", atomf != NULL); ++ ++ if ((mode & TXN_CAPTURE_NONBLOCKING) != 0) { ++ spin_unlock_txnh(txnh); ++ spin_unlock_atom(atomf); ++ ++ if (atomh) { ++ spin_unlock_atom(atomh); ++ } ++ ++ return RETERR(-E_BLOCK); ++ } ++ ++ /* Initialize the waiting list links. */ ++ init_wlinks(&wlinks); ++ ++ /* Add txnh to atomf's waitfor list, unlock atomf. */ ++ list_add_tail(&wlinks._fwaitfor_link, &atomf->fwaitfor_list); ++ wlinks.waitfor_cb = wait_for_fusion; ++ atomic_inc(&atomf->refcount); ++ spin_unlock_atom(atomf); ++ ++ if (atomh) { ++ /* Add txnh to atomh's waiting list, unlock atomh. */ ++ list_add_tail(&wlinks._fwaiting_link, &atomh->fwaiting_list); ++ atomic_inc(&atomh->refcount); ++ spin_unlock_atom(atomh); ++ } ++ ++ /* Go to sleep. */ ++ spin_unlock_txnh(txnh); ++ ++ ret = prepare_to_sleep(wlinks._lock_stack); ++ if (ret == 0) { ++ go_to_sleep(wlinks._lock_stack); ++ ret = RETERR(-E_REPEAT); ++ } ++ ++ /* Remove from the waitfor list. */ ++ spin_lock_atom(atomf); ++ ++ list_del(&wlinks._fwaitfor_link); ++ atom_dec_and_unlock(atomf); ++ ++ if (atomh) { ++ /* Remove from the waiting list. 
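[Editor's note: the atomic_inc(&...->refcount) taken on atomf (and atomh) before sleeping in capture_fuse_wait() is what keeps those atoms from being freed while wlinks sits on their wait lists; after waking, the handle delists itself and drops each pin via atom_dec_and_unlock().]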
*/ ++ spin_lock_atom(atomh); ++ list_del(&wlinks._fwaiting_link); ++ atom_dec_and_unlock(atomh); ++ } ++ return ret; ++} ++ ++static void lock_two_atoms(txn_atom * one, txn_atom * two) ++{ ++ assert("zam-1067", one != two); ++ ++ /* lock the atom with lesser address first */ ++ if (one < two) { ++ spin_lock_atom(one); ++ spin_lock_atom(two); ++ } else { ++ spin_lock_atom(two); ++ spin_lock_atom(one); ++ } ++} ++ ++ ++/* Perform the necessary work to prepare for fusing two atoms, which involves ++ * acquiring two atom locks in the proper order. If one of the node's atom is ++ * blocking fusion (i.e., it is in the CAPTURE_WAIT stage) and the handle's ++ * atom is not then the handle's request is put to sleep. If the node's atom ++ * is committing, then the node can be copy-on-captured. Otherwise, pick the ++ * atom with fewer pointers to be fused into the atom with more pointer and ++ * call capture_fuse_into. ++ */ ++static int capture_init_fusion(jnode *node, txn_handle *txnh, txn_capture mode) ++{ ++ txn_atom * txnh_atom = txnh->atom; ++ txn_atom * block_atom = node->atom; ++ ++ atomic_inc(&txnh_atom->refcount); ++ atomic_inc(&block_atom->refcount); ++ ++ spin_unlock_txnh(txnh); ++ spin_unlock_jnode(node); ++ ++ lock_two_atoms(txnh_atom, block_atom); ++ ++ if (txnh->atom != txnh_atom || node->atom != block_atom ) { ++ release_two_atoms(txnh_atom, block_atom); ++ return RETERR(-E_REPEAT); ++ } ++ ++ atomic_dec(&txnh_atom->refcount); ++ atomic_dec(&block_atom->refcount); ++ ++ assert ("zam-1066", atom_isopen(txnh_atom)); ++ ++ if (txnh_atom->stage >= block_atom->stage || ++ (block_atom->stage == ASTAGE_CAPTURE_WAIT && block_atom->txnh_count == 0)) { ++ capture_fuse_into(txnh_atom, block_atom); ++ return RETERR(-E_REPEAT); ++ } ++ spin_lock_txnh(txnh); ++ return capture_fuse_wait(txnh, block_atom, txnh_atom, mode); ++} ++ ++/* This function splices together two jnode lists (small and large) and sets all jnodes in ++ the small list to point to the large atom. Returns the length of the list. */ ++static int ++capture_fuse_jnode_lists(txn_atom *large, struct list_head *large_head, ++ struct list_head *small_head) ++{ ++ int count = 0; ++ jnode *node; ++ ++ assert("umka-218", large != NULL); ++ assert("umka-219", large_head != NULL); ++ assert("umka-220", small_head != NULL); ++ /* small atom should be locked also. */ ++ assert_spin_locked(&(large->alock)); ++ ++ /* For every jnode on small's capture list... */ ++ list_for_each_entry(node, small_head, capture_link) { ++ count += 1; ++ ++ /* With the jnode lock held, update atom pointer. */ ++ spin_lock_jnode(node); ++ node->atom = large; ++ spin_unlock_jnode(node); ++ } ++ ++ /* Splice the lists. */ ++ list_splice_init(small_head, large_head->prev); ++ ++ return count; ++} ++ ++/* This function splices together two txnh lists (small and large) and sets all txn handles in ++ the small list to point to the large atom. Returns the length of the list. */ ++static int ++capture_fuse_txnh_lists(txn_atom *large, struct list_head *large_head, ++ struct list_head *small_head) ++{ ++ int count = 0; ++ txn_handle *txnh; ++ ++ assert("umka-221", large != NULL); ++ assert("umka-222", large_head != NULL); ++ assert("umka-223", small_head != NULL); ++ ++ /* Adjust every txnh to the new atom. */ ++ list_for_each_entry(txnh, small_head, txnh_link) { ++ count += 1; ++ ++ /* With the txnh lock held, update atom pointer. */ ++ spin_lock_txnh(txnh); ++ txnh->atom = large; ++ spin_unlock_txnh(txnh); ++ } ++ ++ /* Splice the txn_handle list. 
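[Editor's aside: lock_two_atoms() above, like the atomf/atomh address comparison in fuse_not_fused_lock_owners(), is the standard address-ordered acquisition that rules out ABBA deadlock. A self-contained userspace model of the idiom, with hypothetical names:]

#include <pthread.h>

/* Two threads locking the same pair in either argument order still acquire
 * the mutexes in one global (address) order, so neither can end up holding
 * one lock while waiting for the other in reverse. */
static void lock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
{
	if (a < b) {
		pthread_mutex_lock(a);
		pthread_mutex_lock(b);
	} else {
		pthread_mutex_lock(b);
		pthread_mutex_lock(a);
	}
}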
*/ ++ list_splice_init(small_head, large_head->prev); ++ ++ return count; ++} ++ ++/* This function fuses two atoms. The captured nodes and handles belonging to SMALL are ++ added to LARGE and their ->atom pointers are all updated. The associated counts are ++ updated as well, and any waiting handles belonging to either are awakened. Finally the ++ smaller atom's refcount is decremented. ++*/ ++static void capture_fuse_into(txn_atom * small, txn_atom * large) ++{ ++ int level; ++ unsigned zcount = 0; ++ unsigned tcount = 0; ++ ++ assert("umka-224", small != NULL); ++ assert("umka-225", small != NULL); ++ ++ assert_spin_locked(&(large->alock)); ++ assert_spin_locked(&(small->alock)); ++ ++ assert("jmacd-201", atom_isopen(small)); ++ assert("jmacd-202", atom_isopen(large)); ++ ++ /* Splice and update the per-level dirty jnode lists */ ++ for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1) { ++ zcount += ++ capture_fuse_jnode_lists(large, ++ ATOM_DIRTY_LIST(large, level), ++ ATOM_DIRTY_LIST(small, level)); ++ } ++ ++ /* Splice and update the [clean,dirty] jnode and txnh lists */ ++ zcount += ++ capture_fuse_jnode_lists(large, ATOM_CLEAN_LIST(large), ++ ATOM_CLEAN_LIST(small)); ++ zcount += ++ capture_fuse_jnode_lists(large, ATOM_OVRWR_LIST(large), ++ ATOM_OVRWR_LIST(small)); ++ zcount += ++ capture_fuse_jnode_lists(large, ATOM_WB_LIST(large), ++ ATOM_WB_LIST(small)); ++ zcount += ++ capture_fuse_jnode_lists(large, &large->inodes, &small->inodes); ++ tcount += ++ capture_fuse_txnh_lists(large, &large->txnh_list, ++ &small->txnh_list); ++ ++ /* Check our accounting. */ ++ assert("jmacd-1063", ++ zcount + small->num_queued == small->capture_count); ++ assert("jmacd-1065", tcount == small->txnh_count); ++ ++ /* sum numbers of waiters threads */ ++ large->nr_waiters += small->nr_waiters; ++ small->nr_waiters = 0; ++ ++ /* splice flush queues */ ++ fuse_fq(large, small); ++ ++ /* update counter of jnode on every atom' list */ ++ ON_DEBUG(large->dirty += small->dirty; ++ small->dirty = 0; ++ large->clean += small->clean; ++ small->clean = 0; ++ large->ovrwr += small->ovrwr; ++ small->ovrwr = 0; ++ large->wb += small->wb; ++ small->wb = 0; ++ large->fq += small->fq; ++ small->fq = 0;); ++ ++ /* count flushers in result atom */ ++ large->nr_flushers += small->nr_flushers; ++ small->nr_flushers = 0; ++ ++ /* update counts of flushed nodes */ ++ large->flushed += small->flushed; ++ small->flushed = 0; ++ ++ /* Transfer list counts to large. */ ++ large->txnh_count += small->txnh_count; ++ large->capture_count += small->capture_count; ++ ++ /* Add all txnh references to large. */ ++ atomic_add(small->txnh_count, &large->refcount); ++ atomic_sub(small->txnh_count, &small->refcount); ++ ++ /* Reset small counts */ ++ small->txnh_count = 0; ++ small->capture_count = 0; ++ ++ /* Assign the oldest start_time, merge flags. */ ++ large->start_time = min(large->start_time, small->start_time); ++ large->flags |= small->flags; ++ ++ /* Merge blocknr sets. 
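[Editor's note: in the Linux list API, list_splice_init(list, head) inserts the spliced entries immediately after head and reinitializes list; passing large_head->prev as the head, as the calls above do, therefore appends the small atom's entries at the tail of the large atom's list, preserving capture order.]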
*/ ++ blocknr_set_merge(&small->delete_set, &large->delete_set); ++ blocknr_set_merge(&small->wandered_map, &large->wandered_map); ++ ++ /* Merge allocated/deleted file counts */ ++ large->nr_objects_deleted += small->nr_objects_deleted; ++ large->nr_objects_created += small->nr_objects_created; ++ ++ small->nr_objects_deleted = 0; ++ small->nr_objects_created = 0; ++ ++ /* Merge allocated blocks counts */ ++ large->nr_blocks_allocated += small->nr_blocks_allocated; ++ ++ large->nr_running_queues += small->nr_running_queues; ++ small->nr_running_queues = 0; ++ ++ /* Merge blocks reserved for overwrite set. */ ++ large->flush_reserved += small->flush_reserved; ++ small->flush_reserved = 0; ++ ++ if (large->stage < small->stage) { ++ /* Large only needs to notify if it has changed state. */ ++ atom_set_stage(large, small->stage); ++ wakeup_atom_waiting_list(large); ++ } ++ ++ atom_set_stage(small, ASTAGE_INVALID); ++ ++ /* Notify any waiters--small needs to unload its wait lists. Waiters ++ actually remove themselves from the list before returning from the ++ fuse_wait function. */ ++ wakeup_atom_waiting_list(small); ++ ++ /* Unlock atoms */ ++ spin_unlock_atom(large); ++ atom_dec_and_unlock(small); ++} ++ ++/* TXNMGR STUFF */ ++ ++/* Release a block from the atom, reversing the effects of being captured, ++ do not release atom's reference to jnode due to holding spin-locks. ++ Currently this is only called when the atom commits. ++ ++ NOTE: this function does not release a (journal) reference to jnode ++ due to locking optimizations, you should call jput() somewhere after ++ calling uncapture_block(). */ ++void uncapture_block(jnode * node) ++{ ++ txn_atom *atom; ++ ++ assert("umka-226", node != NULL); ++ atom = node->atom; ++ assert("umka-228", atom != NULL); ++ ++ assert("jmacd-1021", node->atom == atom); ++ assert_spin_locked(&(node->guard)); ++ assert("jmacd-1023", atom_is_protected(atom)); ++ ++ JF_CLR(node, JNODE_DIRTY); ++ JF_CLR(node, JNODE_RELOC); ++ JF_CLR(node, JNODE_OVRWR); ++ JF_CLR(node, JNODE_CREATED); ++ JF_CLR(node, JNODE_WRITEBACK); ++ JF_CLR(node, JNODE_REPACK); ++ ++ list_del_init(&node->capture_link); ++ if (JF_ISSET(node, JNODE_FLUSH_QUEUED)) { ++ assert("zam-925", atom_isopen(atom)); ++ assert("vs-1623", NODE_LIST(node) == FQ_LIST); ++ ON_DEBUG(atom->num_queued--); ++ JF_CLR(node, JNODE_FLUSH_QUEUED); ++ } ++ atom->capture_count -= 1; ++ ON_DEBUG(count_jnode(atom, node, NODE_LIST(node), NOT_CAPTURED, 1)); ++ node->atom = NULL; ++ ++ spin_unlock_jnode(node); ++ LOCK_CNT_DEC(t_refs); ++} ++ ++/* Unconditional insert of jnode into atom's overwrite list. Currently used in ++ bitmap-based allocator code for adding modified bitmap blocks the ++ transaction. 
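[Editor's sketch, not part of the patch: the caller-side pairing the NOTE above asks for, mirroring uncapture_jnode() -- uncapture_block() drops the jnode spin lock but keeps the journal reference, so the caller supplies the jput():]

	spin_lock_jnode(node);
	atom = jnode_get_atom(node);	/* returns with the atom spin-locked */
	if (atom != NULL) {
		uncapture_block(node);	/* unlocks the jnode */
		spin_unlock_atom(atom);
		jput(node);		/* drop the reference the atom held */
	} else
		spin_unlock_jnode(node);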
@atom and @node are spin locked */ ++void insert_into_atom_ovrwr_list(txn_atom * atom, jnode * node) ++{ ++ assert("zam-538", atom_is_protected(atom)); ++ assert_spin_locked(&(node->guard)); ++ assert("zam-899", JF_ISSET(node, JNODE_OVRWR)); ++ assert("zam-543", node->atom == NULL); ++ assert("vs-1433", !jnode_is_unformatted(node) && !jnode_is_znode(node)); ++ ++ list_add(&node->capture_link, ATOM_OVRWR_LIST(atom)); ++ jref(node); ++ node->atom = atom; ++ atom->capture_count++; ++ ON_DEBUG(count_jnode(atom, node, NODE_LIST(node), OVRWR_LIST, 1)); ++} ++ ++ ++#if REISER4_DEBUG ++ ++void info_atom(const char *prefix, const txn_atom * atom) ++{ ++ if (atom == NULL) { ++ printk("%s: no atom\n", prefix); ++ return; ++ } ++ ++ printk("%s: refcount: %i id: %i flags: %x txnh_count: %i" ++ " capture_count: %i stage: %x start: %lu, flushed: %i\n", prefix, ++ atomic_read(&atom->refcount), atom->atom_id, atom->flags, ++ atom->txnh_count, atom->capture_count, atom->stage, ++ atom->start_time, atom->flushed); ++} ++ ++#endif ++ ++static int count_deleted_blocks_actor(txn_atom * atom, ++ const reiser4_block_nr * a, ++ const reiser4_block_nr * b, void *data) ++{ ++ reiser4_block_nr *counter = data; ++ ++ assert("zam-995", data != NULL); ++ assert("zam-996", a != NULL); ++ if (b == NULL) ++ *counter += 1; ++ else ++ *counter += *b; ++ return 0; ++} ++ ++reiser4_block_nr txnmgr_count_deleted_blocks(void) ++{ ++ reiser4_block_nr result; ++ txn_mgr *tmgr = &get_super_private(reiser4_get_current_sb())->tmgr; ++ txn_atom *atom; ++ ++ result = 0; ++ ++ spin_lock_txnmgr(tmgr); ++ list_for_each_entry(atom, &tmgr->atoms_list, atom_link) { ++ spin_lock_atom(atom); ++ if (atom_isopen(atom)) ++ blocknr_set_iterator( ++ atom, &atom->delete_set, ++ count_deleted_blocks_actor, &result, 0); ++ spin_unlock_atom(atom); ++ } ++ spin_unlock_txnmgr(tmgr); ++ ++ return result; ++} ++ ++/* ++ * Local variables: ++ * c-indentation-style: "K&R" ++ * mode-name: "LC" ++ * c-basic-offset: 8 ++ * tab-width: 8 ++ * fill-column: 79 ++ * End: ++ */ +Index: linux-2.6.16/fs/reiser4/txnmgr.h +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/txnmgr.h +@@ -0,0 +1,704 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++/* data-types and function declarations for transaction manager. See txnmgr.c ++ * for details. */ ++ ++#ifndef __REISER4_TXNMGR_H__ ++#define __REISER4_TXNMGR_H__ ++ ++#include "forward.h" ++#include "dformat.h" ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* TYPE DECLARATIONS */ ++ ++/* This enumeration describes the possible types of a capture request (try_capture). ++ A capture request dynamically assigns a block to the calling thread's transaction ++ handle. */ ++typedef enum { ++ /* A READ_ATOMIC request indicates that a block will be read and that the caller's ++ atom should fuse in order to ensure that the block commits atomically with the ++ caller. */ ++ TXN_CAPTURE_READ_ATOMIC = (1 << 0), ++ ++ /* A READ_NONCOM request indicates that a block will be read and that the caller is ++ willing to read a non-committed block without causing atoms to fuse. */ ++ TXN_CAPTURE_READ_NONCOM = (1 << 1), ++ ++ /* A READ_MODIFY request indicates that a block will be read but that the caller ++ wishes for the block to be captured as it will be written. This capture request ++ mode is not currently used, but eventually it will be useful for preventing ++ deadlock in read-modify-write cycles. 
*/
++	TXN_CAPTURE_READ_MODIFY = (1 << 2),
++
++	/* A WRITE capture request indicates that a block will be modified and that
++	   atoms should fuse to make the commit atomic. */
++	TXN_CAPTURE_WRITE = (1 << 3),
++
++	/* CAPTURE_TYPES is a mask of the four above capture types, used to separate
++	   the exclusive type designation from extra bits that may be supplied -- see
++	   below. */
++	TXN_CAPTURE_TYPES = (TXN_CAPTURE_READ_ATOMIC |
++			     TXN_CAPTURE_READ_NONCOM | TXN_CAPTURE_READ_MODIFY |
++			     TXN_CAPTURE_WRITE),
++
++	/* A subset of CAPTURE_TYPES, CAPTURE_WTYPES is a mask of request types that
++	   indicate modification will occur. */
++	TXN_CAPTURE_WTYPES = (TXN_CAPTURE_READ_MODIFY | TXN_CAPTURE_WRITE),
++
++	/* An option to try_capture, NONBLOCKING indicates that the caller would
++	   prefer not to sleep waiting for an aging atom to commit. */
++	TXN_CAPTURE_NONBLOCKING = (1 << 4),
++
++	/* An option to try_capture to prevent atom fusion; only simple capturing is
++	   allowed */
++	TXN_CAPTURE_DONT_FUSE = (1 << 5)
++
++	/* This macro selects only the exclusive capture request types, stripping out
++	   any options that were supplied (i.e., NONBLOCKING). */
++#define CAPTURE_TYPE(x) ((x) & TXN_CAPTURE_TYPES)
++} txn_capture;
++
++/* There are two kinds of transaction handle: WRITE_FUSING and READ_FUSING; the
++   only difference is in the handling of read requests. A WRITE_FUSING
++   transaction handle defaults read capture requests to TXN_CAPTURE_READ_NONCOM,
++   whereas a READ_FUSING transaction handle defaults to
++   TXN_CAPTURE_READ_ATOMIC. */
++typedef enum {
++	TXN_WRITE_FUSING = (1 << 0),
++	TXN_READ_FUSING = (1 << 1) | TXN_WRITE_FUSING,	/* READ implies WRITE */
++} txn_mode;
++
++/* Every atom has a stage, which is one of these exclusive values: */
++typedef enum {
++	/* Initially an atom is free. */
++	ASTAGE_FREE = 0,
++
++	/* An atom begins by entering the CAPTURE_FUSE stage, where it proceeds to
++	   capture blocks and fuse with other atoms. */
++	ASTAGE_CAPTURE_FUSE = 1,
++
++	/* We need to have an ASTAGE_CAPTURE_SLOW in which an atom fuses with one
++	   node for every X nodes it flushes to disk, where X > 1. */
++
++	/* When an atom reaches a certain age it must do all it can to commit. An
++	   atom in the CAPTURE_WAIT stage refuses new transaction handles and
++	   prevents fusion from atoms in the CAPTURE_FUSE stage. */
++	ASTAGE_CAPTURE_WAIT = 2,
++
++	/* Waiting for I/O before commit. Copy-on-capture (see
++	   http://namesys.com/v4/v4.html). */
++	ASTAGE_PRE_COMMIT = 3,
++
++	/* Post-commit overwrite I/O. Steal-on-capture. */
++	ASTAGE_POST_COMMIT = 4,
++
++	/* Atom which waits for the removal of the last reference to it before it
++	   can be deleted from memory */
++	ASTAGE_DONE = 5,
++
++	/* invalid atom. */
++	ASTAGE_INVALID = 6,
++
++} txn_stage;
++
++/* Certain flags may be set in the txn_atom->flags field. */
++typedef enum {
++	/* Indicates that the atom should commit as soon as possible. */
++	ATOM_FORCE_COMMIT = (1 << 0),
++	/* to avoid an endless loop, mark the atom (which was considered too
++	   small) after a failed attempt to fuse it. 
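[Editor's illustration, not part of the patch: how a capture request composes an exclusive type with option bits, and how CAPTURE_TYPE() and TXN_CAPTURE_WTYPES take it apart again:]

	txn_capture req = TXN_CAPTURE_WRITE | TXN_CAPTURE_NONBLOCKING;

	assert("", CAPTURE_TYPE(req) == TXN_CAPTURE_WRITE);	/* options stripped */
	assert("", (req & TXN_CAPTURE_WTYPES) != 0);		/* request will modify */
	assert("", (req & TXN_CAPTURE_NONBLOCKING) != 0);	/* option preserved */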
*/ ++ ATOM_CANCEL_FUSION = (1 << 1) ++} txn_flags; ++ ++/* Flags for controlling commit_txnh */ ++typedef enum { ++ /* Wait commit atom completion in commit_txnh */ ++ TXNH_WAIT_COMMIT = 0x2, ++ /* Don't commit atom when this handle is closed */ ++ TXNH_DONT_COMMIT = 0x4 ++} txn_handle_flags_t; ++ ++/* TYPE DEFINITIONS */ ++ ++/* A note on lock ordering: the handle & jnode spinlock protects reading of their ->atom ++ fields, so typically an operation on the atom through either of these objects must (1) ++ lock the object, (2) read the atom pointer, (3) lock the atom. ++ ++ During atom fusion, the process holds locks on both atoms at once. Then, it iterates ++ through the list of handles and pages held by the smaller of the two atoms. For each ++ handle and page referencing the smaller atom, the fusing process must: (1) lock the ++ object, and (2) update the atom pointer. ++ ++ You can see that there is a conflict of lock ordering here, so the more-complex ++ procedure should have priority, i.e., the fusing process has priority so that it is ++ guaranteed to make progress and to avoid restarts. ++ ++ This decision, however, means additional complexity for aquiring the atom lock in the ++ first place. ++ ++ The general original procedure followed in the code was: ++ ++ TXN_OBJECT *obj = ...; ++ TXN_ATOM *atom; ++ ++ spin_lock (& obj->_lock); ++ ++ atom = obj->_atom; ++ ++ if (! spin_trylock_atom (atom)) ++ { ++ spin_unlock (& obj->_lock); ++ RESTART OPERATION, THERE WAS A RACE; ++ } ++ ++ ELSE YOU HAVE BOTH ATOM AND OBJ LOCKED ++ ++ It has however been found that this wastes CPU a lot in a manner that is ++ hard to profile. So, proper refcounting was added to atoms, and new ++ standard locking sequence is like following: ++ ++ TXN_OBJECT *obj = ...; ++ TXN_ATOM *atom; ++ ++ spin_lock (& obj->_lock); ++ ++ atom = obj->_atom; ++ ++ if (! spin_trylock_atom (atom)) ++ { ++ atomic_inc (& atom->refcount); ++ spin_unlock (& obj->_lock); ++ spin_lock (&atom->_lock); ++ atomic_dec (& atom->refcount); ++ // HERE atom is locked ++ spin_unlock (&atom->_lock); ++ RESTART OPERATION, THERE WAS A RACE; ++ } ++ ++ ELSE YOU HAVE BOTH ATOM AND OBJ LOCKED ++ ++ (core of this is implemented in trylock_throttle() function) ++ ++ See the jnode_get_atom() function for a common case. ++ ++ As an additional (and important) optimization allowing to avoid restarts, ++ it is possible to re-check required pre-conditions at the HERE point in ++ code above and proceed without restarting if they are still satisfied. ++*/ ++ ++/* A block number set consists of only the list head. */ ++struct blocknr_set { ++ struct list_head entries; ++}; ++ ++/* An atomic transaction: this is the underlying system representation ++ of a transaction, not the one seen by clients. ++ ++ Invariants involving this data-type: ++ ++ [sb-fake-allocated] ++*/ ++struct txn_atom { ++ /* The spinlock protecting the atom, held during fusion and various other state ++ changes. */ ++ spinlock_t alock; ++ ++ /* The atom's reference counter, increasing (in case of a duplication ++ of an existing reference or when we are sure that some other ++ reference exists) may be done without taking spinlock, decrementing ++ of the ref. counter requires a spinlock to be held. ++ ++ Each transaction handle counts in ->refcount. All jnodes count as ++ one reference acquired in atom_begin_andlock(), released in ++ commit_current_atom(). ++ */ ++ atomic_t refcount; ++ ++ /* The atom_id identifies the atom in persistent records such as the log. 
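[Editor's sketch, not part of the patch: the "refcount + trylock" sequence described in the comment above, written out for the jnode case using the spinlock wrappers defined later in this header; compare jnode_get_atom() in txnmgr.c for the real implementation.]

static txn_atom *lock_jnode_atom_sketch(jnode * node)
{
	txn_atom *atom;

repeat:
	spin_lock_jnode(node);
	atom = node->atom;
	if (atom == NULL) {
		spin_unlock_jnode(node);
		return NULL;
	}
	if (!spin_trylock_atom(atom)) {
		/* slow path: pin the atom so it cannot vanish, drop the
		 * object lock (spin_lock_atom() must not be called with a
		 * jnode spinlock held), take the atom lock outright, then
		 * unpin and restart -- or re-check preconditions at that
		 * point and proceed without restarting, as noted above */
		atomic_inc(&atom->refcount);
		spin_unlock_jnode(node);
		spin_lock_atom(atom);
		atom_dec_and_unlock(atom);
		goto repeat;
	}
	spin_unlock_jnode(node);
	return atom;	/* returned spin-locked */
}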
*/ ++ __u32 atom_id; ++ ++ /* Flags holding any of the txn_flags enumerated values (e.g., ++ ATOM_FORCE_COMMIT). */ ++ __u32 flags; ++ ++ /* Number of open handles. */ ++ __u32 txnh_count; ++ ++ /* The number of znodes captured by this atom. Equal to the sum of lengths of the ++ dirty_nodes[level] and clean_nodes lists. */ ++ __u32 capture_count; ++ ++#if REISER4_DEBUG ++ int clean; ++ int dirty; ++ int ovrwr; ++ int wb; ++ int fq; ++#endif ++ ++ __u32 flushed; ++ ++ /* Current transaction stage. */ ++ txn_stage stage; ++ ++ /* Start time. */ ++ unsigned long start_time; ++ ++ /* The atom's delete set. It collects block numbers of the nodes ++ which were deleted during the transaction. */ ++ blocknr_set delete_set; ++ ++ /* The atom's wandered_block mapping. */ ++ blocknr_set wandered_map; ++ ++ /* The transaction's list of dirty captured nodes--per level. Index ++ by (level). dirty_nodes[0] is for znode-above-root */ ++ struct list_head dirty_nodes[REAL_MAX_ZTREE_HEIGHT + 1]; ++ ++ /* The transaction's list of clean captured nodes. */ ++ struct list_head clean_nodes; ++ ++ /* The atom's overwrite set */ ++ struct list_head ovrwr_nodes; ++ ++ /* nodes which are being written to disk */ ++ struct list_head writeback_nodes; ++ ++ /* list of inodes */ ++ struct list_head inodes; ++ ++ /* List of handles associated with this atom. */ ++ struct list_head txnh_list; ++ ++ /* Transaction list link: list of atoms in the transaction manager. */ ++ struct list_head atom_link; ++ ++ /* List of handles waiting FOR this atom: see 'capture_fuse_wait' comment. */ ++ struct list_head fwaitfor_list; ++ ++ /* List of this atom's handles that are waiting: see 'capture_fuse_wait' comment. */ ++ struct list_head fwaiting_list; ++ ++ /* Numbers of objects which were deleted/created in this transaction ++ thereby numbers of objects IDs which were released/deallocated. */ ++ int nr_objects_deleted; ++ int nr_objects_created; ++ /* number of blocks allocated during the transaction */ ++ __u64 nr_blocks_allocated; ++ /* All atom's flush queue objects are on this list */ ++ struct list_head flush_queues; ++#if REISER4_DEBUG ++ /* number of flush queues for this atom. */ ++ int nr_flush_queues; ++ /* Number of jnodes which were removed from atom's lists and put ++ on flush_queue */ ++ int num_queued; ++#endif ++ /* number of threads who wait for this atom to complete commit */ ++ int nr_waiters; ++ /* number of threads which do jnode_flush() over this atom */ ++ int nr_flushers; ++ /* number of flush queues which are IN_USE and jnodes from fq->prepped ++ are submitted to disk by the write_fq() routine. 
*/ ++ int nr_running_queues; ++ /* A counter of grabbed unformatted nodes, see a description of the ++ * reiser4 space reservation scheme at block_alloc.c */ ++ reiser4_block_nr flush_reserved; ++#if REISER4_DEBUG ++ void *committer; ++#endif ++ struct super_block *super; ++}; ++ ++#define ATOM_DIRTY_LIST(atom, level) (&(atom)->dirty_nodes[level]) ++#define ATOM_CLEAN_LIST(atom) (&(atom)->clean_nodes) ++#define ATOM_OVRWR_LIST(atom) (&(atom)->ovrwr_nodes) ++#define ATOM_WB_LIST(atom) (&(atom)->writeback_nodes) ++#define ATOM_FQ_LIST(fq) (&(fq)->prepped) ++ ++#define NODE_LIST(node) (node)->list ++#define ASSIGN_NODE_LIST(node, list) ON_DEBUG(NODE_LIST(node) = list) ++ON_DEBUG(void ++ count_jnode(txn_atom *, jnode *, atom_list old_list, ++ atom_list new_list, int check_lists)); ++ ++typedef struct protected_jnodes { ++ struct list_head inatom; /* link to atom's list these structures */ ++ struct list_head nodes; /* head of list of protected nodes */ ++} protected_jnodes; ++ ++/* A transaction handle: the client obtains and commits this handle which is assigned by ++ the system to a txn_atom. */ ++struct txn_handle { ++ /* Spinlock protecting ->atom pointer */ ++ spinlock_t hlock; ++ ++ /* Flags for controlling commit_txnh() behavior */ ++ /* from txn_handle_flags_t */ ++ txn_handle_flags_t flags; ++ ++ /* Whether it is READ_FUSING or WRITE_FUSING. */ ++ txn_mode mode; ++ ++ /* If assigned, the atom it is part of. */ ++ txn_atom *atom; ++ ++ /* Transaction list link. Head is in txn_atom. */ ++ struct list_head txnh_link; ++}; ++ ++/* The transaction manager: one is contained in the reiser4_super_info_data */ ++struct txn_mgr { ++ /* A spinlock protecting the atom list, id_count, flush_control */ ++ spinlock_t tmgr_lock; ++ ++ /* List of atoms. */ ++ struct list_head atoms_list; ++ ++ /* Number of atoms. */ ++ int atom_count; ++ ++ /* A counter used to assign atom->atom_id values. */ ++ __u32 id_count; ++ ++ /* a semaphore object for commit serialization */ ++ struct semaphore commit_semaphore; ++ ++ /* a list of all txnmrgs served by particular daemon. */ ++ struct list_head linkage; ++ ++ /* description of daemon for this txnmgr */ ++ ktxnmgrd_context *daemon; ++ ++ /* parameters. Adjustable through mount options. */ ++ unsigned int atom_max_size; ++ unsigned int atom_max_age; ++ unsigned int atom_min_size; ++ /* max number of concurrent flushers for one atom, 0 - unlimited. */ ++ unsigned int atom_max_flushers; ++ struct dentry *debugfs_atom_count; ++ struct dentry *debugfs_id_count; ++}; ++ ++/* FUNCTION DECLARATIONS */ ++ ++/* These are the externally (within Reiser4) visible transaction functions, therefore they ++ are prefixed with "txn_". For comments, see txnmgr.c. 
*/ ++ ++extern int init_txnmgr_static(void); ++extern void done_txnmgr_static(void); ++ ++extern void init_txnmgr(txn_mgr *); ++extern void done_txnmgr(txn_mgr *); ++ ++extern int txn_reserve(int reserved); ++ ++extern void txn_begin(reiser4_context * context); ++extern int txn_end(reiser4_context * context); ++ ++extern void txn_restart(reiser4_context * context); ++extern void txn_restart_current(void); ++ ++extern int txnmgr_force_commit_all(struct super_block *, int); ++extern int current_atom_should_commit(void); ++ ++extern jnode *find_first_dirty_jnode(txn_atom *, int); ++ ++extern int commit_some_atoms(txn_mgr *); ++extern int force_commit_atom(txn_handle *); ++extern int flush_current_atom(int, long, long *, txn_atom **, jnode *); ++ ++extern int flush_some_atom(jnode *, long *, const struct writeback_control *, int); ++ ++extern void atom_set_stage(txn_atom * atom, txn_stage stage); ++ ++extern int same_slum_check(jnode * base, jnode * check, int alloc_check, ++ int alloc_value); ++extern void atom_dec_and_unlock(txn_atom * atom); ++ ++extern int try_capture(jnode * node, znode_lock_mode mode, txn_capture flags); ++extern int try_capture_page_to_invalidate(struct page *pg); ++ ++extern void uncapture_page(struct page *pg); ++extern void uncapture_block(jnode *); ++extern void uncapture_jnode(jnode *); ++ ++extern int capture_inode(struct inode *); ++extern int uncapture_inode(struct inode *); ++ ++extern txn_atom *get_current_atom_locked_nocheck(void); ++ ++#if REISER4_DEBUG ++ ++/** ++ * atom_is_protected - make sure that nobody but us can do anything with atom ++ * @atom: atom to be checked ++ * ++ * This is used to assert that atom either entered commit stages or is spin ++ * locked. ++ */ ++static inline int atom_is_protected(txn_atom *atom) ++{ ++ if (atom->stage >= ASTAGE_PRE_COMMIT) ++ return 1; ++ assert_spin_locked(&(atom->alock)); ++ return 1; ++} ++ ++#endif ++ ++/* Get the current atom and spinlock it if current atom present. May not return NULL */ ++static inline txn_atom *get_current_atom_locked(void) ++{ ++ txn_atom *atom; ++ ++ atom = get_current_atom_locked_nocheck(); ++ assert("zam-761", atom != NULL); ++ ++ return atom; ++} ++ ++extern txn_atom *jnode_get_atom(jnode *); ++ ++extern void atom_wait_event(txn_atom *); ++extern void atom_send_event(txn_atom *); ++ ++extern void insert_into_atom_ovrwr_list(txn_atom * atom, jnode * node); ++extern int capture_super_block(struct super_block *s); ++int capture_bulk(jnode **, int count); ++ ++/* See the comment on the function blocknrset.c:blocknr_set_add for the ++ calling convention of these three routines. 
*/ ++extern void blocknr_set_init(blocknr_set * bset); ++extern void blocknr_set_destroy(blocknr_set * bset); ++extern void blocknr_set_merge(blocknr_set * from, blocknr_set * into); ++extern int blocknr_set_add_extent(txn_atom * atom, ++ blocknr_set * bset, ++ blocknr_set_entry ** new_bsep, ++ const reiser4_block_nr * start, ++ const reiser4_block_nr * len); ++extern int blocknr_set_add_pair(txn_atom * atom, blocknr_set * bset, ++ blocknr_set_entry ** new_bsep, ++ const reiser4_block_nr * a, ++ const reiser4_block_nr * b); ++ ++typedef int (*blocknr_set_actor_f) (txn_atom *, const reiser4_block_nr *, ++ const reiser4_block_nr *, void *); ++ ++extern int blocknr_set_iterator(txn_atom * atom, blocknr_set * bset, ++ blocknr_set_actor_f actor, void *data, ++ int delete); ++ ++/* flush code takes care about how to fuse flush queues */ ++extern void flush_init_atom(txn_atom * atom); ++extern void flush_fuse_queues(txn_atom * large, txn_atom * small); ++ ++static inline void spin_lock_atom(txn_atom *atom) ++{ ++ /* check that spinlocks of lower priorities are not held */ ++ assert("", (LOCK_CNT_NIL(spin_locked_txnh) && ++ LOCK_CNT_NIL(spin_locked_jnode) && ++ LOCK_CNT_NIL(spin_locked_zlock) && ++ LOCK_CNT_NIL(rw_locked_dk) && ++ LOCK_CNT_NIL(rw_locked_tree))); ++ ++ spin_lock(&(atom->alock)); ++ ++ LOCK_CNT_INC(spin_locked_atom); ++ LOCK_CNT_INC(spin_locked); ++} ++ ++static inline int spin_trylock_atom(txn_atom *atom) ++{ ++ if (spin_trylock(&(atom->alock))) { ++ LOCK_CNT_INC(spin_locked_atom); ++ LOCK_CNT_INC(spin_locked); ++ return 1; ++ } ++ return 0; ++} ++ ++static inline void spin_unlock_atom(txn_atom *atom) ++{ ++ assert_spin_locked(&(atom->alock)); ++ assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_atom)); ++ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked)); ++ ++ LOCK_CNT_DEC(spin_locked_atom); ++ LOCK_CNT_DEC(spin_locked); ++ ++ spin_unlock(&(atom->alock)); ++} ++ ++static inline void spin_lock_txnh(txn_handle *txnh) ++{ ++ /* check that spinlocks of lower priorities are not held */ ++ assert("", (LOCK_CNT_NIL(rw_locked_dk) && ++ LOCK_CNT_NIL(spin_locked_zlock) && ++ LOCK_CNT_NIL(rw_locked_tree))); ++ ++ spin_lock(&(txnh->hlock)); ++ ++ LOCK_CNT_INC(spin_locked_txnh); ++ LOCK_CNT_INC(spin_locked); ++} ++ ++static inline int spin_trylock_txnh(txn_handle *txnh) ++{ ++ if (spin_trylock(&(txnh->hlock))) { ++ LOCK_CNT_INC(spin_locked_txnh); ++ LOCK_CNT_INC(spin_locked); ++ return 1; ++ } ++ return 0; ++} ++ ++static inline void spin_unlock_txnh(txn_handle *txnh) ++{ ++ assert_spin_locked(&(txnh->hlock)); ++ assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_txnh)); ++ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked)); ++ ++ LOCK_CNT_DEC(spin_locked_txnh); ++ LOCK_CNT_DEC(spin_locked); ++ ++ spin_unlock(&(txnh->hlock)); ++} ++ ++#define spin_ordering_pred_txnmgr(tmgr) \ ++ ( LOCK_CNT_NIL(spin_locked_atom) && \ ++ LOCK_CNT_NIL(spin_locked_txnh) && \ ++ LOCK_CNT_NIL(spin_locked_jnode) && \ ++ LOCK_CNT_NIL(rw_locked_zlock) && \ ++ LOCK_CNT_NIL(rw_locked_dk) && \ ++ LOCK_CNT_NIL(rw_locked_tree) ) ++ ++static inline void spin_lock_txnmgr(txn_mgr *mgr) ++{ ++ /* check that spinlocks of lower priorities are not held */ ++ assert("", (LOCK_CNT_NIL(spin_locked_atom) && ++ LOCK_CNT_NIL(spin_locked_txnh) && ++ LOCK_CNT_NIL(spin_locked_jnode) && ++ LOCK_CNT_NIL(spin_locked_zlock) && ++ LOCK_CNT_NIL(rw_locked_dk) && ++ LOCK_CNT_NIL(rw_locked_tree))); ++ ++ spin_lock(&(mgr->tmgr_lock)); ++ ++ LOCK_CNT_INC(spin_locked_txnmgr); ++ LOCK_CNT_INC(spin_locked); ++} ++ ++static inline int spin_trylock_txnmgr(txn_mgr 
*mgr) ++{ ++ if (spin_trylock(&(mgr->tmgr_lock))) { ++ LOCK_CNT_INC(spin_locked_txnmgr); ++ LOCK_CNT_INC(spin_locked); ++ return 1; ++ } ++ return 0; ++} ++ ++static inline void spin_unlock_txnmgr(txn_mgr *mgr) ++{ ++ assert_spin_locked(&(mgr->tmgr_lock)); ++ assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_txnmgr)); ++ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked)); ++ ++ LOCK_CNT_DEC(spin_locked_txnmgr); ++ LOCK_CNT_DEC(spin_locked); ++ ++ spin_unlock(&(mgr->tmgr_lock)); ++} ++ ++typedef enum { ++ FQ_IN_USE = 0x1 ++} flush_queue_state_t; ++ ++typedef struct flush_queue flush_queue_t; ++ ++/* This is an accumulator for jnodes prepared for writing to disk. A flush queue ++ is filled by the jnode_flush() routine, and written to disk under memory ++ pressure or at atom commit time. */ ++/* LOCKING: fq state and fq->atom are protected by guard spinlock, fq->nr_queued ++ field and fq->prepped list can be modified if atom is spin-locked and fq ++ object is "in-use" state. For read-only traversal of the fq->prepped list ++ and reading of the fq->nr_queued field it is enough to keep fq "in-use" or ++ only have atom spin-locked. */ ++struct flush_queue { ++ /* linkage element is the first in this structure to make debugging ++ easier. See field in atom struct for description of list. */ ++ struct list_head alink; ++ /* A spinlock to protect changes of fq state and fq->atom pointer */ ++ spinlock_t guard; ++ /* flush_queue state: [in_use | ready] */ ++ flush_queue_state_t state; ++ /* A list which contains queued nodes, queued nodes are removed from any ++ * atom's list and put on this ->prepped one. */ ++ struct list_head prepped; ++ /* number of submitted i/o requests */ ++ atomic_t nr_submitted; ++ /* number of i/o errors */ ++ atomic_t nr_errors; ++ /* An atom this flush queue is attached to */ ++ txn_atom *atom; ++ /* A semaphore for waiting on i/o completion */ ++ struct semaphore io_sem; ++#if REISER4_DEBUG ++ /* A thread which took this fq in exclusive use, NULL if fq is free, ++ * used for debugging. */ ++ struct task_struct *owner; ++#endif ++}; ++ ++extern int fq_by_atom(txn_atom *, flush_queue_t **); ++extern void fq_put_nolock(flush_queue_t *); ++extern void fq_put(flush_queue_t *); ++extern void fuse_fq(txn_atom * to, txn_atom * from); ++extern void queue_jnode(flush_queue_t *, jnode *); ++extern void mark_jnode_queued(flush_queue_t *, jnode *); ++ ++extern int write_fq(flush_queue_t *, long *, int); ++extern int current_atom_finish_all_fq(void); ++extern void init_atom_fq_parts(txn_atom *); ++ ++extern reiser4_block_nr txnmgr_count_deleted_blocks(void); ++ ++extern void znode_make_dirty(znode * node); ++extern void jnode_make_dirty_locked(jnode * node); ++ ++extern int sync_atom(txn_atom * atom); ++ ++#if REISER4_DEBUG ++extern int atom_fq_parts_are_clean(txn_atom *); ++#endif ++ ++extern void add_fq_to_bio(flush_queue_t *, struct bio *); ++extern flush_queue_t *get_fq_for_current_atom(void); ++ ++void protected_jnodes_init(protected_jnodes * list); ++void protected_jnodes_done(protected_jnodes * list); ++void invalidate_list(struct list_head * head); ++ ++#if REISER4_DEBUG ++void info_atom(const char *prefix, const txn_atom * atom); ++#else ++#define info_atom(p,a) noop ++#endif ++ ++# endif /* __REISER4_TXNMGR_H__ */ ++ ++/* Make Linus happy. 
++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +Index: linux-2.6.16/fs/reiser4/type_safe_hash.h +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/type_safe_hash.h +@@ -0,0 +1,320 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++/* A hash table class that uses hash chains (singly-linked) and is ++ parametrized to provide type safety. */ ++ ++#ifndef __REISER4_TYPE_SAFE_HASH_H__ ++#define __REISER4_TYPE_SAFE_HASH_H__ ++ ++#include "debug.h" ++ ++#include ++/* Step 1: Use TYPE_SAFE_HASH_DECLARE() to define the TABLE and LINK objects ++ based on the object type. You need to declare the item type before ++ this definition, define it after this definition. */ ++#define TYPE_SAFE_HASH_DECLARE(PREFIX,ITEM_TYPE) \ ++ \ ++typedef struct PREFIX##_hash_table_ PREFIX##_hash_table; \ ++typedef struct PREFIX##_hash_link_ PREFIX##_hash_link; \ ++ \ ++struct PREFIX##_hash_table_ \ ++{ \ ++ ITEM_TYPE **_table; \ ++ __u32 _buckets; \ ++}; \ ++ \ ++struct PREFIX##_hash_link_ \ ++{ \ ++ ITEM_TYPE *_next; \ ++} ++ ++/* Step 2: Define the object type of the hash: give it field of type ++ PREFIX_hash_link. */ ++ ++/* Step 3: Use TYPE_SAFE_HASH_DEFINE to define the hash table interface using ++ the type and field name used in step 3. The arguments are: ++ ++ ITEM_TYPE The item type being hashed ++ KEY_TYPE The type of key being hashed ++ KEY_NAME The name of the key field within the item ++ LINK_NAME The name of the link field within the item, which you must make type PREFIX_hash_link) ++ HASH_FUNC The name of the hash function (or macro, takes const pointer to key) ++ EQ_FUNC The name of the equality function (or macro, takes const pointer to two keys) ++ ++ It implements these functions: ++ ++ prefix_hash_init Initialize the table given its size. ++ prefix_hash_insert Insert an item ++ prefix_hash_insert_index Insert an item w/ precomputed hash_index ++ prefix_hash_find Find an item by key ++ prefix_hash_find_index Find an item w/ precomputed hash_index ++ prefix_hash_remove Remove an item, returns 1 if found, 0 if not found ++ prefix_hash_remove_index Remove an item w/ precomputed hash_index ++ ++ If you'd like something to be done differently, feel free to ask me ++ for modifications. Additional features that could be added but ++ have not been: ++ ++ prefix_hash_remove_key Find and remove an item by key ++ prefix_hash_remove_key_index Find and remove an item by key w/ precomputed hash_index ++ ++ The hash_function currently receives only the key as an argument, ++ meaning it must somehow know the number of buckets. If this is a ++ problem let me know. ++ ++ This hash table uses a single-linked hash chain. This means ++ insertion is fast but deletion requires searching the chain. ++ ++ There is also the doubly-linked hash chain approach, under which ++ deletion requires no search but the code is longer and it takes two ++ pointers per item. ++ ++ The circularly-linked approach has the shortest code but requires ++ two pointers per bucket, doubling the size of the bucket array (in ++ addition to two pointers per item). 
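[Editor's sketch, not part of the patch: minimal use of the three steps above, with a hypothetical item type; the arguments line up with TYPE_SAFE_HASH_DEFINE(PREFIX, ITEM_TYPE, KEY_TYPE, KEY_NAME, LINK_NAME, HASH_FUNC, EQ_FUNC) defined just below.]

typedef struct my_item my_item;

/* Step 1: declare the table and link types */
TYPE_SAFE_HASH_DECLARE(my, my_item);

/* Step 2: embed the link field in the item */
struct my_item {
	__u32 key;
	my_hash_link link;
};

/* hash and equality operate on pointers to keys; the hash sees the table */
#define MY_HASH(hash, k) (*(k) % (hash)->_buckets)
#define MY_EQ(k1, k2) (*(k1) == *(k2))

/* Step 3: generate my_hash_init/insert/find/remove/done */
TYPE_SAFE_HASH_DEFINE(my, my_item, __u32, key, link, MY_HASH, MY_EQ);

static void my_hash_example(void)
{
	my_hash_table table;
	my_item item = {.key = 42};

	my_hash_init(&table, 128);
	my_hash_insert(&table, &item);
	assert("", my_hash_find(&table, &item.key) == &item);
	my_hash_remove(&table, &item);
	my_hash_done(&table);
}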
++*/ ++#define TYPE_SAFE_HASH_DEFINE(PREFIX,ITEM_TYPE,KEY_TYPE,KEY_NAME,LINK_NAME,HASH_FUNC,EQ_FUNC) \ ++ \ ++static __inline__ void \ ++PREFIX##_check_hash (PREFIX##_hash_table *table UNUSED_ARG, \ ++ __u32 hash UNUSED_ARG) \ ++{ \ ++ assert("nikita-2780", hash < table->_buckets); \ ++} \ ++ \ ++static __inline__ int \ ++PREFIX##_hash_init (PREFIX##_hash_table *hash, \ ++ __u32 buckets) \ ++{ \ ++ hash->_table = (ITEM_TYPE**) KMALLOC (sizeof (ITEM_TYPE*) * buckets); \ ++ hash->_buckets = buckets; \ ++ if (hash->_table == NULL) \ ++ { \ ++ return RETERR(-ENOMEM); \ ++ } \ ++ memset (hash->_table, 0, sizeof (ITEM_TYPE*) * buckets); \ ++ ON_DEBUG(printk(#PREFIX "_hash_table: %i buckets\n", buckets)); \ ++ return 0; \ ++} \ ++ \ ++static __inline__ void \ ++PREFIX##_hash_done (PREFIX##_hash_table *hash) \ ++{ \ ++ if (REISER4_DEBUG && hash->_table != NULL) { \ ++ __u32 i; \ ++ for (i = 0 ; i < hash->_buckets ; ++ i) \ ++ assert("nikita-2905", hash->_table[i] == NULL); \ ++ } \ ++ if (hash->_table != NULL) \ ++ KFREE (hash->_table, sizeof (ITEM_TYPE*) * hash->_buckets); \ ++ hash->_table = NULL; \ ++} \ ++ \ ++static __inline__ void \ ++PREFIX##_hash_prefetch_next (ITEM_TYPE *item) \ ++{ \ ++ prefetch(item->LINK_NAME._next); \ ++} \ ++ \ ++static __inline__ void \ ++PREFIX##_hash_prefetch_bucket (PREFIX##_hash_table *hash, \ ++ __u32 index) \ ++{ \ ++ prefetch(hash->_table[index]); \ ++} \ ++ \ ++static __inline__ ITEM_TYPE* \ ++PREFIX##_hash_find_index (PREFIX##_hash_table *hash, \ ++ __u32 hash_index, \ ++ KEY_TYPE const *find_key) \ ++{ \ ++ ITEM_TYPE *item; \ ++ \ ++ PREFIX##_check_hash(hash, hash_index); \ ++ \ ++ for (item = hash->_table[hash_index]; \ ++ item != NULL; \ ++ item = item->LINK_NAME._next) \ ++ { \ ++ prefetch(item->LINK_NAME._next); \ ++ prefetch(item->LINK_NAME._next + offsetof(ITEM_TYPE, KEY_NAME)); \ ++ if (EQ_FUNC (& item->KEY_NAME, find_key)) \ ++ { \ ++ return item; \ ++ } \ ++ } \ ++ \ ++ return NULL; \ ++} \ ++ \ ++static __inline__ ITEM_TYPE* \ ++PREFIX##_hash_find_index_lru (PREFIX##_hash_table *hash, \ ++ __u32 hash_index, \ ++ KEY_TYPE const *find_key) \ ++{ \ ++ ITEM_TYPE ** item = &hash->_table[hash_index]; \ ++ \ ++ PREFIX##_check_hash(hash, hash_index); \ ++ \ ++ while (*item != NULL) { \ ++ prefetch(&(*item)->LINK_NAME._next); \ ++ if (EQ_FUNC (&(*item)->KEY_NAME, find_key)) { \ ++ ITEM_TYPE *found; \ ++ \ ++ found = *item; \ ++ *item = found->LINK_NAME._next; \ ++ found->LINK_NAME._next = hash->_table[hash_index]; \ ++ hash->_table[hash_index] = found; \ ++ return found; \ ++ } \ ++ item = &(*item)->LINK_NAME._next; \ ++ } \ ++ return NULL; \ ++} \ ++ \ ++static __inline__ int \ ++PREFIX##_hash_remove_index (PREFIX##_hash_table *hash, \ ++ __u32 hash_index, \ ++ ITEM_TYPE *del_item) \ ++{ \ ++ ITEM_TYPE ** hash_item_p = &hash->_table[hash_index]; \ ++ \ ++ PREFIX##_check_hash(hash, hash_index); \ ++ \ ++ while (*hash_item_p != NULL) { \ ++ prefetch(&(*hash_item_p)->LINK_NAME._next); \ ++ if (*hash_item_p == del_item) { \ ++ *hash_item_p = (*hash_item_p)->LINK_NAME._next; \ ++ return 1; \ ++ } \ ++ hash_item_p = &(*hash_item_p)->LINK_NAME._next; \ ++ } \ ++ return 0; \ ++} \ ++ \ ++static __inline__ void \ ++PREFIX##_hash_insert_index (PREFIX##_hash_table *hash, \ ++ __u32 hash_index, \ ++ ITEM_TYPE *ins_item) \ ++{ \ ++ PREFIX##_check_hash(hash, hash_index); \ ++ \ ++ ins_item->LINK_NAME._next = hash->_table[hash_index]; \ ++ hash->_table[hash_index] = ins_item; \ ++} \ ++ \ ++static __inline__ void \ ++PREFIX##_hash_insert_index_rcu (PREFIX##_hash_table 
*hash, \ ++ __u32 hash_index, \ ++ ITEM_TYPE *ins_item) \ ++{ \ ++ PREFIX##_check_hash(hash, hash_index); \ ++ \ ++ ins_item->LINK_NAME._next = hash->_table[hash_index]; \ ++ smp_wmb(); \ ++ hash->_table[hash_index] = ins_item; \ ++} \ ++ \ ++static __inline__ ITEM_TYPE* \ ++PREFIX##_hash_find (PREFIX##_hash_table *hash, \ ++ KEY_TYPE const *find_key) \ ++{ \ ++ return PREFIX##_hash_find_index (hash, HASH_FUNC(hash, find_key), find_key); \ ++} \ ++ \ ++static __inline__ ITEM_TYPE* \ ++PREFIX##_hash_find_lru (PREFIX##_hash_table *hash, \ ++ KEY_TYPE const *find_key) \ ++{ \ ++ return PREFIX##_hash_find_index_lru (hash, HASH_FUNC(hash, find_key), find_key); \ ++} \ ++ \ ++static __inline__ int \ ++PREFIX##_hash_remove (PREFIX##_hash_table *hash, \ ++ ITEM_TYPE *del_item) \ ++{ \ ++ return PREFIX##_hash_remove_index (hash, \ ++ HASH_FUNC(hash, &del_item->KEY_NAME), del_item); \ ++} \ ++ \ ++static __inline__ int \ ++PREFIX##_hash_remove_rcu (PREFIX##_hash_table *hash, \ ++ ITEM_TYPE *del_item) \ ++{ \ ++ return PREFIX##_hash_remove (hash, del_item); \ ++} \ ++ \ ++static __inline__ void \ ++PREFIX##_hash_insert (PREFIX##_hash_table *hash, \ ++ ITEM_TYPE *ins_item) \ ++{ \ ++ return PREFIX##_hash_insert_index (hash, \ ++ HASH_FUNC(hash, &ins_item->KEY_NAME), ins_item); \ ++} \ ++ \ ++static __inline__ void \ ++PREFIX##_hash_insert_rcu (PREFIX##_hash_table *hash, \ ++ ITEM_TYPE *ins_item) \ ++{ \ ++ return PREFIX##_hash_insert_index_rcu (hash, HASH_FUNC(hash, &ins_item->KEY_NAME), \ ++ ins_item); \ ++} \ ++ \ ++static __inline__ ITEM_TYPE * \ ++PREFIX##_hash_first (PREFIX##_hash_table *hash, __u32 ind) \ ++{ \ ++ ITEM_TYPE *first; \ ++ \ ++ for (first = NULL; ind < hash->_buckets; ++ ind) { \ ++ first = hash->_table[ind]; \ ++ if (first != NULL) \ ++ break; \ ++ } \ ++ return first; \ ++} \ ++ \ ++static __inline__ ITEM_TYPE * \ ++PREFIX##_hash_next (PREFIX##_hash_table *hash, \ ++ ITEM_TYPE *item) \ ++{ \ ++ ITEM_TYPE *next; \ ++ \ ++ if (item == NULL) \ ++ return NULL; \ ++ next = item->LINK_NAME._next; \ ++ if (next == NULL) \ ++ next = PREFIX##_hash_first (hash, HASH_FUNC(hash, &item->KEY_NAME) + 1); \ ++ return next; \ ++} \ ++ \ ++typedef struct {} PREFIX##_hash_dummy ++ ++#define for_all_ht_buckets(table, head) \ ++for ((head) = &(table) -> _table[ 0 ] ; \ ++ (head) != &(table) -> _table[ (table) -> _buckets ] ; ++ (head)) ++ ++#define for_all_in_bucket(bucket, item, next, field) \ ++for ((item) = *(bucket), (next) = (item) ? (item) -> field._next : NULL ; \ ++ (item) != NULL ; \ ++ (item) = (next), (next) = (item) ? (item) -> field._next : NULL ) ++ ++#define for_all_in_htable(table, prefix, item, next) \ ++for ((item) = prefix ## _hash_first ((table), 0), \ ++ (next) = prefix ## _hash_next ((table), (item)) ; \ ++ (item) != NULL ; \ ++ (item) = (next), \ ++ (next) = prefix ## _hash_next ((table), (item))) ++ ++/* __REISER4_TYPE_SAFE_HASH_H__ */ ++#endif ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +Index: linux-2.6.16/fs/reiser4/vfs_ops.c +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/vfs_ops.c +@@ -0,0 +1,267 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++/* Interface to VFS. Reiser4 {super|export|dentry}_operations are defined ++ here. 
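The PREFIX##_hash_find_index_lru() variant above implements a classic move-to-front scheme: on a hit the matching item is unlinked and re-linked at the head of its bucket chain, so hot items are found faster on the next lookup (the _rcu insert variant additionally publishes the new head with smp_wmb() so readers never see a half-linked item). A minimal user-space sketch of just the move-to-front walk, using hypothetical node/find_lru names; only the pointer-to-pointer traversal is taken from the macro:

#include <stdio.h>

struct node {
	int key;
	struct node *next;
};

/* Look up 'key' in one chain; on a hit, unlink the node and splice it
 * back in at the head, so hot entries migrate to the front, the same
 * move-to-front trick PREFIX##_hash_find_index_lru() uses. */
static struct node *find_lru(struct node **head, int key)
{
	struct node **item = head;

	while (*item != NULL) {
		if ((*item)->key == key) {
			struct node *found = *item;

			*item = found->next;	/* unlink */
			found->next = *head;	/* re-link at the front */
			*head = found;
			return found;
		}
		item = &(*item)->next;
	}
	return NULL;
}

int main(void)
{
	struct node c = { 3, NULL }, b = { 2, &c }, a = { 1, &b };
	struct node *head = &a;

	find_lru(&head, 3);			/* 3 moves to the front */
	printf("front key: %d\n", head->key);	/* prints 3 */
	return 0;
}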
*/ ++ ++#include "forward.h" ++#include "debug.h" ++#include "dformat.h" ++#include "coord.h" ++#include "plugin/item/item.h" ++#include "plugin/file/file.h" ++#include "plugin/security/perm.h" ++#include "plugin/disk_format/disk_format.h" ++#include "plugin/plugin.h" ++#include "plugin/plugin_set.h" ++#include "plugin/object.h" ++#include "txnmgr.h" ++#include "jnode.h" ++#include "znode.h" ++#include "block_alloc.h" ++#include "tree.h" ++#include "vfs_ops.h" ++#include "inode.h" ++#include "page_cache.h" ++#include "ktxnmgrd.h" ++#include "super.h" ++#include "reiser4.h" ++#include "entd.h" ++#include "status_flags.h" ++#include "flush.h" ++#include "dscale.h" ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++ ++/* update inode stat-data by calling plugin */ ++int reiser4_update_sd(struct inode *object) ++{ ++ file_plugin *fplug; ++ ++ assert("nikita-2338", object != NULL); ++ /* check for read-only file system. */ ++ if (IS_RDONLY(object)) ++ return 0; ++ ++ fplug = inode_file_plugin(object); ++ assert("nikita-2339", fplug != NULL); ++ return fplug->write_sd_by_inode(object); ++} ++ ++/* helper function: increase inode nlink count and call plugin method to save ++ updated stat-data. ++ ++ Used by link/create and during creation of dot and dotdot in mkdir ++*/ ++int reiser4_add_nlink(struct inode *object /* object to which link is added */ , ++ struct inode *parent /* parent where new entry will be */ ++ , ++ int write_sd_p /* true if stat-data has to be ++ * updated */ ) ++{ ++ file_plugin *fplug; ++ int result; ++ ++ assert("nikita-1351", object != NULL); ++ ++ fplug = inode_file_plugin(object); ++ assert("nikita-1445", fplug != NULL); ++ ++ /* ask plugin whether it can add yet another link to this ++ object */ ++ if (!fplug->can_add_link(object)) ++ return RETERR(-EMLINK); ++ ++ assert("nikita-2211", fplug->add_link != NULL); ++ /* call plugin to do actual addition of link */ ++ result = fplug->add_link(object, parent); ++ ++ /* optionally update stat data */ ++ if (result == 0 && write_sd_p) ++ result = fplug->write_sd_by_inode(object); ++ return result; ++} ++ ++/* helper function: decrease inode nlink count and call plugin method to save ++ updated stat-data. ++ ++ Used by unlink/create ++*/ ++int reiser4_del_nlink(struct inode *object /* object from which link is ++ * removed */ , ++ struct inode *parent /* parent where entry was */ , ++ int write_sd_p /* true is stat-data has to be ++ * updated */ ) ++{ ++ file_plugin *fplug; ++ int result; ++ ++ assert("nikita-1349", object != NULL); ++ ++ fplug = inode_file_plugin(object); ++ assert("nikita-1350", fplug != NULL); ++ assert("nikita-1446", object->i_nlink > 0); ++ assert("nikita-2210", fplug->rem_link != NULL); ++ ++ /* call plugin to do actual deletion of link */ ++ result = fplug->rem_link(object, parent); ++ ++ /* optionally update stat data */ ++ if (result == 0 && write_sd_p) ++ result = fplug->write_sd_by_inode(object); ++ return result; ++} ++ ++ ++ ++ ++/* Release reiser4 dentry. This is d_op->d_release() method. */ ++static void reiser4_d_release(struct dentry *dentry /* dentry released */ ) ++{ ++ reiser4_free_dentry_fsdata(dentry); ++} ++ ++/* ++ * Called by reiser4_sync_inodes(), during speculative write-back (through ++ * pdflush, or balance_dirty_pages()). 
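Referring back to reiser4_add_nlink() and reiser4_del_nlink() above: both follow one pattern, ask the file plugin whether the operation is allowed, let the plugin perform it, then optionally persist the stat-data. A small self-contained sketch of that guard-then-dispatch shape, with hypothetical object/ops names standing in for inode and file_plugin:

#include <stdio.h>
#include <errno.h>

struct object { int nlink; int nlink_max; };

struct ops {
	int (*can_add_link)(struct object *);
	int (*add_link)(struct object *);
	int (*write_sd)(struct object *);
};

static int can_add(struct object *o)  { return o->nlink < o->nlink_max; }
static int add(struct object *o)      { o->nlink++; return 0; }
static int write_sd(struct object *o) { printf("sd: nlink=%d\n", o->nlink); return 0; }

static const struct ops plugin = { can_add, add, write_sd };

/* same shape as reiser4_add_nlink(): ask the plugin first, mutate via
 * the plugin, then optionally persist the stat-data */
static int add_nlink(struct object *o, int write_sd_p)
{
	int result;

	if (!plugin.can_add_link(o))
		return -EMLINK;
	result = plugin.add_link(o);
	if (result == 0 && write_sd_p)
		result = plugin.write_sd(o);
	return result;
}

int main(void)
{
	struct object o = { 1, 2 };

	printf("%d\n", add_nlink(&o, 1));	/* 0 */
	printf("%d\n", add_nlink(&o, 1));	/* -EMLINK: at the limit */
	return 0;
}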
++ */ ++void writeout(struct super_block *sb, struct writeback_control *wbc) ++{ ++ long written = 0; ++ int repeats = 0; ++ int result; ++ struct address_space *mapping; ++ ++ /* ++ * Performs early flushing, trying to free some memory. If there is ++ * nothing to flush, commits some atoms. ++ */ ++ ++ /* Commit all atoms if reiser4_writepages() is called from sys_sync() or ++ sys_fsync(). */ ++ if (wbc->sync_mode != WB_SYNC_NONE) { ++ txnmgr_force_commit_all(sb, 0); ++ return; ++ } ++ ++ BUG_ON(get_super_fake(sb) == NULL); ++ mapping = get_super_fake(sb)->i_mapping; ++ do { ++ long nr_submitted = 0; ++ jnode *node = NULL; ++ ++ /* do not put more requests to overload write queue */ ++ if (wbc->nonblocking && ++ bdi_write_congested(mapping->backing_dev_info)) { ++ blk_run_address_space(mapping); ++ wbc->encountered_congestion = 1; ++ break; ++ } ++ repeats++; ++ BUG_ON(wbc->nr_to_write <= 0); ++ ++ if (get_current_context()->entd) { ++ entd_context *ent = get_entd_context(sb); ++ ++ if (ent->cur_request->node) ++ /* ++ * this is ent thread and it managed to capture ++ * requested page itself - start flush from ++ * that page ++ */ ++ node = jref(ent->cur_request->node); ++ } ++ ++ result = flush_some_atom(node, &nr_submitted, wbc, ++ JNODE_FLUSH_WRITE_BLOCKS); ++ if (result != 0) ++ warning("nikita-31001", "Flush failed: %i", result); ++ if (node) ++ jput(node); ++ if (!nr_submitted) ++ break; ++ ++ wbc->nr_to_write -= nr_submitted; ++ written += nr_submitted; ++ } while (wbc->nr_to_write > 0); ++} ++ ++ ++void reiser4_throttle_write(struct inode *inode) ++{ ++ txn_restart_current(); ++ balance_dirty_pages_ratelimited(inode->i_mapping); ++} ++ ++const char *REISER4_SUPER_MAGIC_STRING = "ReIsEr4"; ++const int REISER4_MAGIC_OFFSET = 16 * 4096; /* offset to magic string from the ++ * beginning of device */ ++ ++ ++ ++/* ++ * Reiser4 initialization/shutdown. ++ * ++ * Code below performs global reiser4 initialization that is done either as ++ * part of kernel initialization (when reiser4 is statically built-in), or ++ * during reiser4 module load (when compiled as module). ++ */ ++ ++ ++void reiser4_handle_error(void) ++{ ++ struct super_block *sb = reiser4_get_current_sb(); ++ ++ if (!sb) ++ return; ++ reiser4_status_write(REISER4_STATUS_DAMAGED, 0, ++ "Filesystem error occured"); ++ switch (get_super_private(sb)->onerror) { ++ case 0: ++ reiser4_panic("foobar-42", "Filesystem error occured\n"); ++ case 1: ++ default: ++ if (sb->s_flags & MS_RDONLY) ++ return; ++ sb->s_flags |= MS_RDONLY; ++ break; ++ } ++} ++ ++struct dentry_operations reiser4_dentry_operations = { ++ .d_revalidate = NULL, ++ .d_hash = NULL, ++ .d_compare = NULL, ++ .d_delete = NULL, ++ .d_release = reiser4_d_release, ++ .d_iput = NULL, ++}; ++ ++/* Make Linus happy. 
++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +Index: linux-2.6.16/fs/reiser4/vfs_ops.h +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/vfs_ops.h +@@ -0,0 +1,58 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++/* vfs_ops.c's exported symbols */ ++ ++#if !defined( __FS_REISER4_VFS_OPS_H__ ) ++#define __FS_REISER4_VFS_OPS_H__ ++ ++#include "forward.h" ++#include "coord.h" ++#include "seal.h" ++#include "plugin/file/file.h" ++#include "super.h" ++#include "readahead.h" ++ ++#include /* for loff_t */ ++#include /* for struct address_space */ ++#include /* for struct dentry */ ++#include ++#include ++ ++/* address space operations */ ++int reiser4_writepage(struct page *, struct writeback_control *); ++int reiser4_set_page_dirty(struct page *); ++int reiser4_readpages(struct file *, struct address_space *, ++ struct list_head *pages, unsigned nr_pages); ++int reiser4_invalidatepage(struct page *, unsigned long offset); ++int reiser4_releasepage(struct page *, gfp_t); ++ ++extern int reiser4_update_sd(struct inode *); ++extern int reiser4_add_nlink(struct inode *, struct inode *, int); ++extern int reiser4_del_nlink(struct inode *, struct inode *, int); ++ ++ ++extern int reiser4_start_up_io(struct page *page); ++extern void reiser4_throttle_write(struct inode *); ++extern int jnode_is_releasable(jnode *); ++ ++#define CAPTURE_APAGE_BURST (1024l) ++void writeout(struct super_block *, struct writeback_control *); ++ ++ ++extern void reiser4_handle_error(void); ++ ++ ++/* __FS_REISER4_VFS_OPS_H__ */ ++#endif ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ scroll-step: 1 ++ End: ++*/ +Index: linux-2.6.16/fs/reiser4/wander.c +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/wander.c +@@ -0,0 +1,1799 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++/* Reiser4 Wandering Log */ ++ ++/* You should read http://www.namesys.com/txn-doc.html ++ ++ That describes how filesystem operations are performed as atomic ++ transactions, and how we try to arrange it so that we can write most of the ++ data only once while performing the operation atomically. ++ ++ For the purposes of this code, it is enough for it to understand that it ++ has been told a given block should be written either once, or twice (if ++ twice then once to the wandered location and once to the real location). ++ ++ This code guarantees that those blocks that are defined to be part of an ++ atom either all take effect or none of them take effect. ++ ++ Relocate set nodes are submitted to write by the jnode_flush() routine, and ++ the overwrite set is submitted by reiser4_write_log(). This is because with ++ the overwrite set we seek to optimize writes, and with the relocate set we ++ seek to cause disk order to correlate with the parent first pre-order. ++ ++ reiser4_write_log() allocates and writes wandered blocks and maintains ++ additional on-disk structures of the atom as wander records (each wander ++ record occupies one block) for storing of the "wandered map" (a table which ++ contains a relation between wandered and real block numbers) and other ++ information which might be needed at transaction recovery time. 
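The "wandered map" just described is simply a table of (real block, wandered block) pairs. A tiny self-contained illustration with hypothetical block numbers, showing what recovery does with each pair:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

typedef uint64_t block_nr;

/* one wandered-map pair: where a block really lives vs. where its
 * committed copy was parked for the duration of the transaction */
struct wander_pair {
	block_nr original;	/* real location, rewritten during replay */
	block_nr wandered;	/* temporary location written at commit */
};

int main(void)
{
	/* hypothetical numbers: three overwrite-set blocks */
	struct wander_pair map[] = {
		{ 100, 5001 }, { 101, 5002 }, { 260, 5003 },
	};
	size_t i;

	/* replay reads each wandered copy and writes it back in place */
	for (i = 0; i < sizeof map / sizeof map[0]; i++)
		printf("copy block %llu -> %llu\n",
		       (unsigned long long)map[i].wandered,
		       (unsigned long long)map[i].original);
	return 0;
}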
++ ++ The wander records are unidirectionally linked into a circle: each wander ++ record contains a block number of the next wander record, the last wander ++ record points to the first one. ++ ++ One wander record (named "tx head" in this file) has a format which is ++ different from the other wander records. The "tx head" has a reference to the ++ "tx head" block of the previously committed atom. Also, "tx head" contains ++ fs information (the free blocks counter, and the oid allocator state) which ++ is logged in a special way . ++ ++ There are two journal control blocks, named journal header and journal ++ footer which have fixed on-disk locations. The journal header has a ++ reference to the "tx head" block of the last committed atom. The journal ++ footer points to the "tx head" of the last flushed atom. The atom is ++ "played" when all blocks from its overwrite set are written to disk the ++ second time (i.e. written to their real locations). ++ ++ NOTE: People who know reiserfs internals and its journal structure might be ++ confused with these terms journal footer and journal header. There is a table ++ with terms of similar semantics in reiserfs (reiser3) and reiser4: ++ ++ REISER3 TERM | REISER4 TERM | DESCRIPTION ++ --------------------+-----------------------+---------------------------- ++ commit record | journal header | atomic write of this record ++ | | ends transaction commit ++ --------------------+-----------------------+---------------------------- ++ journal header | journal footer | atomic write of this record ++ | | ends post-commit writes. ++ | | After successful ++ | | writing of this journal ++ | | blocks (in reiser3) or ++ | | wandered blocks/records are ++ | | free for re-use. ++ --------------------+-----------------------+---------------------------- ++ ++ The atom commit process is the following: ++ ++ 1. The overwrite set is taken from atom's clean list, and its size is ++ counted. ++ ++ 2. The number of necessary wander records (including tx head) is calculated, ++ and the wander record blocks are allocated. ++ ++ 3. Allocate wandered blocks and populate wander records by wandered map. ++ ++ 4. submit write requests for wander records and wandered blocks. ++ ++ 5. wait until submitted write requests complete. ++ ++ 6. update journal header: change the pointer to the block number of just ++ written tx head, submit an i/o for modified journal header block and wait ++ for i/o completion. ++ ++ NOTE: The special logging for bitmap blocks and some reiser4 super block ++ fields makes processes of atom commit, flush and recovering a bit more ++ complex (see comments in the source code for details). ++ ++ The atom playing process is the following: ++ ++ 1. Write atom's overwrite set in-place. ++ ++ 2. Wait on i/o. ++ ++ 3. Update journal footer: change the pointer to block number of tx head ++ block of the atom we currently flushing, submit an i/o, wait on i/o ++ completion. ++ ++ 4. Free disk space which was used for wandered blocks and wander records. ++ ++ After the freeing of wandered blocks and wander records we have that journal ++ footer points to the on-disk structure which might be overwritten soon. ++ Neither the log writer nor the journal recovery procedure use that pointer ++ for accessing the data. When the journal recovery procedure finds the oldest ++ transaction it compares the journal footer pointer value with the "prev_tx" ++ pointer value in tx head, if values are equal the oldest not flushed ++ transaction is found. 
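A compact sketch of that search for the oldest not flushed transaction, chasing prev_tx pointers until one equals the journal footer's last_flushed_tx; the block numbers and the read_tx() helper are hypothetical stand-ins:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

typedef uint64_t block_nr;

struct tx_head { block_nr blk; block_nr prev_tx; };

/* hypothetical "disk": three committed tx heads, oldest first */
static const struct tx_head disk[] = {
	{ 10, 5 },	/* prev_tx == 5 == footer.last_flushed_tx */
	{ 20, 10 },
	{ 30, 20 },	/* header.last_committed_tx == 30 */
};

static const struct tx_head *read_tx(block_nr blk)
{
	size_t i;

	for (i = 0; i < sizeof disk / sizeof disk[0]; i++)
		if (disk[i].blk == blk)
			return &disk[i];
	return NULL;
}

int main(void)
{
	block_nr last_flushed_tx = 5;		/* from journal footer */
	const struct tx_head *cur = read_tx(30); /* from journal header */

	/* chase prev_tx until it equals the footer pointer: that head is
	 * the oldest committed-but-not-played transaction */
	while (cur->prev_tx != last_flushed_tx)
		cur = read_tx(cur->prev_tx);

	printf("oldest unflushed tx head at block %llu\n",
	       (unsigned long long)cur->blk);	/* block 10 */
	return 0;
}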
++ ++ NOTE on disk space leakage: the information about which blocks and how many ++ blocks are allocated for wandered blocks and wander records is not written to ++ the disk because of the special logging for bitmaps and some super block ++ counters. After a system crash reiser4 does not remember those ++ allocations, and thus there is no disk space leakage of this kind. ++*/ ++ ++/* Special logging of reiser4 super block fields. */ ++ ++/* There are some reiser4 super block fields (the free block count and the OID ++ allocator state, i.e. the number of files and the next free OID) which are logged separately from ++ the super block to avoid unnecessary atom fusion. ++ ++ So, the reiser4 super block may not be captured by a transaction that ++ allocates/deallocates disk blocks or creates/deletes file objects. Moreover, ++ the reiser4 on-disk super block is not touched when such a transaction is ++ committed and flushed. Those "counters logged specially" are logged in "tx ++ head" blocks and in the journal footer block. ++ ++ A step-by-step description of special logging: ++ ++ 0. The per-atom information about deleted or created files and allocated or ++ freed blocks is collected during the transaction. The atom's ++ ->nr_objects_created and ->nr_objects_deleted are for object ++ deletion/creation tracking; the numbers of allocated and freed blocks are ++ calculated using the atom's delete set and the atom's capture list -- all new and ++ relocated nodes should be on the atom's clean list and should have the JNODE_RELOC ++ bit set. ++ ++ 1. The "logged specially" reiser4 super block fields have their "committed" ++ versions in the reiser4 in-memory super block. They get modified only at ++ atom commit time. The atom's commit thread has exclusive access to those ++ "committed" fields because the log writer implementation supports only one ++ atom commit at a time (there is a per-fs "commit" semaphore). At ++ that time the "committed" counters are modified using the per-atom information ++ collected during the transaction. These counters are stored on disk as ++ part of the tx head block when the atom is committed. ++ ++ 2. When the atom is flushed, the value of the free block counter and the OID ++ allocator state get written to the journal footer block. A special journal ++ procedure (journal_recover_sb_data()) takes those values from the journal ++ footer and updates the reiser4 in-memory super block. ++ ++ NOTE: That means the free block count and the OID allocator state are logged ++ separately from the reiser4 super block regardless of the fact that the ++ reiser4 super block has fields to store both the free block counter and the ++ OID allocator state. ++ ++ Writing the whole super block at commit time would require knowing the true values of ++ all its fields without the changes made by not yet committed transactions. This would be ++ possible by keeping a "committed" version of the super block, like the ++ reiser4 bitmap blocks have "committed" and "working" versions. However, ++ another scheme was implemented, which stores the specially logged values in the ++ unused free space inside the transaction head block. In my opinion it has the ++ advantage of not writing the whole super block when only part of it was ++ modified.
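A minimal sketch of step 1 above, with a pthread mutex standing in for the per-fs commit semaphore and hypothetical counter names; it only models the idea that the committed counters are updated by one committing thread at a time, from deltas gathered per atom:

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

/* hypothetical "committed" super block counters */
static uint64_t committed_free_blocks = 1000;
static uint64_t committed_nr_files = 10;
static pthread_mutex_t commit_mutex = PTHREAD_MUTEX_INITIALIZER;

/* per-atom deltas gathered while the transaction ran */
struct atom_stats {
	int64_t blocks_freed_minus_allocated;
	int64_t objects_created_minus_deleted;
};

/* only the committing thread touches the committed counters, and only
 * one commit runs at a time, modelled here with a mutex in place of
 * the per-fs commit semaphore */
static void apply_at_commit(const struct atom_stats *a)
{
	pthread_mutex_lock(&commit_mutex);
	committed_free_blocks += a->blocks_freed_minus_allocated;
	committed_nr_files += a->objects_created_minus_deleted;
	/* these values would now be written into the tx head block */
	pthread_mutex_unlock(&commit_mutex);
}

int main(void)
{
	struct atom_stats a = { -3, 2 }; /* allocated 3 blocks, created 2 files */

	apply_at_commit(&a);
	printf("free=%llu files=%llu\n",
	       (unsigned long long)committed_free_blocks,
	       (unsigned long long)committed_nr_files); /* 997, 12 */
	return 0;
}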
*/ ++ ++#include "debug.h" ++#include "dformat.h" ++#include "txnmgr.h" ++#include "jnode.h" ++#include "znode.h" ++#include "block_alloc.h" ++#include "page_cache.h" ++#include "wander.h" ++#include "reiser4.h" ++#include "super.h" ++#include "vfs_ops.h" ++#include "writeout.h" ++#include "inode.h" ++#include "entd.h" ++ ++#include ++#include /* for struct super_block */ ++#include /* for struct page */ ++#include ++#include /* for struct bio */ ++#include ++ ++static int write_jnodes_to_disk_extent( ++ jnode *, int, const reiser4_block_nr *, flush_queue_t *, int); ++ ++/* The commit_handle is a container for objects needed at atom commit time */ ++struct commit_handle { ++ /* A pointer to atom's list of OVRWR nodes */ ++ struct list_head *overwrite_set; ++ /* atom's overwrite set size */ ++ int overwrite_set_size; ++ /* jnodes for wander record blocks */ ++ struct list_head tx_list; ++ /* number of wander records */ ++ __u32 tx_size; ++ /* 'committed' sb counters are saved here until atom is completely ++ flushed */ ++ __u64 free_blocks; ++ __u64 nr_files; ++ __u64 next_oid; ++ /* A pointer to the atom which is being committed */ ++ txn_atom *atom; ++ /* A pointer to current super block */ ++ struct super_block *super; ++ /* The counter of modified bitmaps */ ++ reiser4_block_nr nr_bitmap; ++}; ++ ++static void init_commit_handle(struct commit_handle *ch, txn_atom *atom) ++{ ++ memset(ch, 0, sizeof(struct commit_handle)); ++ INIT_LIST_HEAD(&ch->tx_list); ++ ++ ch->atom = atom; ++ ch->super = reiser4_get_current_sb(); ++} ++ ++static void done_commit_handle(struct commit_handle *ch) ++{ ++ assert("zam-690", list_empty(&ch->tx_list)); ++} ++ ++static inline int reiser4_use_write_barrier(struct super_block * s) ++{ ++ return !reiser4_is_set(s, REISER4_NO_WRITE_BARRIER); ++} ++ ++static void disable_write_barrier(struct super_block * s) ++{ ++ notice("zam-1055", "%s does not support write barriers," ++ " using synchronous write instead.", s->s_id); ++ set_bit((int)REISER4_NO_WRITE_BARRIER, &get_super_private(s)->fs_flags); ++} ++ ++ ++/* fill journal header block data */ ++static void format_journal_header(struct commit_handle *ch) ++{ ++ struct reiser4_super_info_data *sbinfo; ++ struct journal_header *header; ++ jnode *txhead; ++ ++ sbinfo = get_super_private(ch->super); ++ assert("zam-479", sbinfo != NULL); ++ assert("zam-480", sbinfo->journal_header != NULL); ++ ++ txhead = list_entry(ch->tx_list.next, jnode, capture_link); ++ ++ jload(sbinfo->journal_header); ++ ++ header = (struct journal_header *)jdata(sbinfo->journal_header); ++ assert("zam-484", header != NULL); ++ ++ put_unaligned(cpu_to_le64(*jnode_get_block(txhead)), ++ &header->last_committed_tx); ++ ++ jrelse(sbinfo->journal_header); ++} ++ ++/* fill journal footer block data */ ++static void format_journal_footer(struct commit_handle *ch) ++{ ++ struct reiser4_super_info_data *sbinfo; ++ struct journal_footer *footer; ++ jnode *tx_head; ++ ++ sbinfo = get_super_private(ch->super); ++ ++ tx_head = list_entry(ch->tx_list.next, jnode, capture_link); ++ ++ assert("zam-493", sbinfo != NULL); ++ assert("zam-494", sbinfo->journal_header != NULL); ++ ++ check_me("zam-691", jload(sbinfo->journal_footer) == 0); ++ ++ footer = (struct journal_footer *)jdata(sbinfo->journal_footer); ++ assert("zam-495", footer != NULL); ++ ++ put_unaligned(cpu_to_le64(*jnode_get_block(tx_head)), ++ &footer->last_flushed_tx); ++ put_unaligned(cpu_to_le64(ch->free_blocks), &footer->free_blocks); ++ ++ put_unaligned(cpu_to_le64(ch->nr_files), 
&footer->nr_files); ++ put_unaligned(cpu_to_le64(ch->next_oid), &footer->next_oid); ++ ++ jrelse(sbinfo->journal_footer); ++} ++ ++/* wander record capacity depends on current block size */ ++static int wander_record_capacity(const struct super_block *super) ++{ ++ return (super->s_blocksize - ++ sizeof(struct wander_record_header)) / ++ sizeof(struct wander_entry); ++} ++ ++/* Fill the first wander record (tx head) in accordance with the supplied data */ ++static void format_tx_head(struct commit_handle *ch) ++{ ++ jnode *tx_head; ++ jnode *next; ++ struct tx_header *header; ++ ++ tx_head = list_entry(ch->tx_list.next, jnode, capture_link); ++ assert("zam-692", &ch->tx_list != &tx_head->capture_link); ++ ++ next = list_entry(tx_head->capture_link.next, jnode, capture_link); ++ if (&ch->tx_list == &next->capture_link) ++ next = tx_head; ++ ++ header = (struct tx_header *)jdata(tx_head); ++ ++ assert("zam-460", header != NULL); ++ assert("zam-462", ch->super->s_blocksize >= sizeof(struct tx_header)); ++ ++ memset(jdata(tx_head), 0, (size_t) ch->super->s_blocksize); ++ memcpy(jdata(tx_head), TX_HEADER_MAGIC, TX_HEADER_MAGIC_SIZE); ++ ++ put_unaligned(cpu_to_le32(ch->tx_size), &header->total); ++ put_unaligned(cpu_to_le64(get_super_private(ch->super)->last_committed_tx), ++ &header->prev_tx); ++ put_unaligned(cpu_to_le64(*jnode_get_block(next)), &header->next_block); ++ put_unaligned(cpu_to_le64(ch->free_blocks), &header->free_blocks); ++ put_unaligned(cpu_to_le64(ch->nr_files), &header->nr_files); ++ put_unaligned(cpu_to_le64(ch->next_oid), &header->next_oid); ++} ++ ++/* prepare ordinary wander record block (fill all service fields) */ ++static void ++format_wander_record(struct commit_handle *ch, jnode *node, __u32 serial) ++{ ++ struct wander_record_header *LRH; ++ jnode *next; ++ ++ assert("zam-464", node != NULL); ++ ++ LRH = (struct wander_record_header *)jdata(node); ++ next = list_entry(node->capture_link.next, jnode, capture_link); ++ ++ if (&ch->tx_list == &next->capture_link) ++ next = list_entry(ch->tx_list.next, jnode, capture_link); ++ ++ assert("zam-465", LRH != NULL); ++ assert("zam-463", ++ ch->super->s_blocksize > sizeof(struct wander_record_header)); ++ ++ memset(jdata(node), 0, (size_t) ch->super->s_blocksize); ++ memcpy(jdata(node), WANDER_RECORD_MAGIC, WANDER_RECORD_MAGIC_SIZE); ++ ++ put_unaligned(cpu_to_le32(ch->tx_size), &LRH->total); ++ put_unaligned(cpu_to_le32(serial), &LRH->serial); ++ put_unaligned(cpu_to_le64(*jnode_get_block(next)), &LRH->next_block); ++} ++ ++/* add one wandered map entry to formatted wander record */ ++static void ++store_entry(jnode * node, int index, const reiser4_block_nr * a, ++ const reiser4_block_nr * b) ++{ ++ char *data; ++ struct wander_entry *pairs; ++ ++ data = jdata(node); ++ assert("zam-451", data != NULL); ++ ++ pairs = ++ (struct wander_entry *)(data + sizeof(struct wander_record_header)); ++ ++ put_unaligned(cpu_to_le64(*a), &pairs[index].original); ++ put_unaligned(cpu_to_le64(*b), &pairs[index].wandered); ++} ++ ++/* currently, wander records contain only the wandered map, which depends on the ++ overwrite set size */ ++static void get_tx_size(struct commit_handle *ch) ++{ ++ assert("zam-440", ch->overwrite_set_size != 0); ++ assert("zam-695", ch->tx_size == 0); ++ ++ /* count all ordinary wander records: ++ (overwrite_set_size - 1) / wander_record_capacity + 1, and add one ++ for the tx head block */ ++ ch->tx_size = ++ (ch->overwrite_set_size - 1) / wander_record_capacity(ch->super) + ++ 2; ++} ++ ++/* A special structure for use in store_wmap_actor() for saving its
state ++ between calls */ ++struct store_wmap_params { ++ jnode *cur; /* jnode of current wander record to fill */ ++ int idx; /* free element index in wander record */ ++ int capacity; /* capacity */ ++ ++#if REISER4_DEBUG ++ struct list_head *tx_list; ++#endif ++}; ++ ++/* an actor for use in blocknr_set_iterator routine which populates the list ++ of pre-formatted wander records by wandered map info */ ++static int ++store_wmap_actor(txn_atom * atom UNUSED_ARG, const reiser4_block_nr * a, ++ const reiser4_block_nr * b, void *data) ++{ ++ struct store_wmap_params *params = data; ++ ++ if (params->idx >= params->capacity) { ++ /* a new wander record should be taken from the tx_list */ ++ params->cur = list_entry(params->cur->capture_link.next, jnode, capture_link); ++ assert("zam-454", ++ params->tx_list != ¶ms->cur->capture_link); ++ ++ params->idx = 0; ++ } ++ ++ store_entry(params->cur, params->idx, a, b); ++ params->idx++; ++ ++ return 0; ++} ++ ++/* This function is called after Relocate set gets written to disk, Overwrite ++ set is written to wandered locations and all wander records are written ++ also. Updated journal header blocks contains a pointer (block number) to ++ first wander record of the just written transaction */ ++static int update_journal_header(struct commit_handle *ch, int use_barrier) ++{ ++ struct reiser4_super_info_data *sbinfo = get_super_private(ch->super); ++ jnode *jh = sbinfo->journal_header; ++ jnode *head = list_entry(ch->tx_list.next, jnode, capture_link); ++ int ret; ++ ++ format_journal_header(ch); ++ ++ ret = write_jnodes_to_disk_extent(jh, 1, jnode_get_block(jh), NULL, ++ use_barrier ? WRITEOUT_BARRIER : 0); ++ if (ret) ++ return ret; ++ ++ // blk_run_address_space(sbinfo->fake->i_mapping); ++ /*blk_run_queues(); */ ++ ++ ret = jwait_io(jh, WRITE); ++ ++ if (ret) ++ return ret; ++ ++ sbinfo->last_committed_tx = *jnode_get_block(head); ++ ++ return 0; ++} ++ ++/* This function is called after write-back is finished. We update journal ++ footer block and free blocks which were occupied by wandered blocks and ++ transaction wander records */ ++static int update_journal_footer(struct commit_handle *ch, int use_barrier) ++{ ++ reiser4_super_info_data *sbinfo = get_super_private(ch->super); ++ ++ jnode *jf = sbinfo->journal_footer; ++ ++ int ret; ++ ++ format_journal_footer(ch); ++ ++ ret = write_jnodes_to_disk_extent(jf, 1, jnode_get_block(jf), NULL, ++ use_barrier ? WRITEOUT_BARRIER : 0); ++ if (ret) ++ return ret; ++ ++ // blk_run_address_space(sbinfo->fake->i_mapping); ++ /*blk_run_queue(); */ ++ ++ ret = jwait_io(jf, WRITE); ++ if (ret) ++ return ret; ++ ++ return 0; ++} ++ ++/* free block numbers of wander records of already written in place transaction */ ++static void dealloc_tx_list(struct commit_handle *ch) ++{ ++ while (!list_empty(&ch->tx_list)) { ++ jnode *cur = list_entry(ch->tx_list.next, jnode, capture_link); ++ list_del(&cur->capture_link); ++ ON_DEBUG(INIT_LIST_HEAD(&cur->capture_link)); ++ reiser4_dealloc_block(jnode_get_block(cur), BLOCK_NOT_COUNTED, ++ BA_FORMATTED); ++ ++ unpin_jnode_data(cur); ++ drop_io_head(cur); ++ } ++} ++ ++/* An actor for use in block_nr_iterator() routine which frees wandered blocks ++ from atom's overwrite set. 
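Referring back to wander_record_capacity() and get_tx_size() above, a worked example of the arithmetic; HEADER_SIZE and ENTRY_SIZE are assumed illustrative values, not the real sizeof results from wander.h:

#include <stdio.h>

#define BLOCKSIZE   4096u
#define HEADER_SIZE 32u	/* assumed sizeof(struct wander_record_header) */
#define ENTRY_SIZE  16u	/* two little-endian block numbers per pair */

int main(void)
{
	unsigned capacity = (BLOCKSIZE - HEADER_SIZE) / ENTRY_SIZE; /* 254 */
	unsigned overwrite_set_size = 1000;

	/* ceil(overwrite_set_size / capacity) ordinary records, plus one
	 * more block for the tx head, exactly what get_tx_size() computes */
	unsigned tx_size = (overwrite_set_size - 1) / capacity + 2;

	printf("capacity=%u tx_size=%u\n", capacity, tx_size); /* 254, 5 */
	return 0;
}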
*/ ++static int ++dealloc_wmap_actor(txn_atom * atom UNUSED_ARG, ++ const reiser4_block_nr * a UNUSED_ARG, ++ const reiser4_block_nr * b, void *data UNUSED_ARG) ++{ ++ ++ assert("zam-499", b != NULL); ++ assert("zam-500", *b != 0); ++ assert("zam-501", !blocknr_is_fake(b)); ++ ++ reiser4_dealloc_block(b, BLOCK_NOT_COUNTED, BA_FORMATTED); ++ return 0; ++} ++ ++/* free wandered block locations of already written in place transaction */ ++static void dealloc_wmap(struct commit_handle *ch) ++{ ++ assert("zam-696", ch->atom != NULL); ++ ++ blocknr_set_iterator(ch->atom, &ch->atom->wandered_map, ++ dealloc_wmap_actor, NULL, 1); ++} ++ ++/* helper function for alloc wandered blocks, which refill set of block ++ numbers needed for wandered blocks */ ++static int ++get_more_wandered_blocks(int count, reiser4_block_nr * start, int *len) ++{ ++ reiser4_blocknr_hint hint; ++ int ret; ++ ++ reiser4_block_nr wide_len = count; ++ ++ /* FIXME-ZAM: A special policy needed for allocation of wandered blocks ++ ZAM-FIXME-HANS: yes, what happened to our discussion of using a fixed ++ reserved allocation area so as to get the best qualities of fixed ++ journals? */ ++ blocknr_hint_init(&hint); ++ hint.block_stage = BLOCK_GRABBED; ++ ++ ret = reiser4_alloc_blocks(&hint, start, &wide_len, ++ BA_FORMATTED | BA_USE_DEFAULT_SEARCH_START); ++ *len = (int)wide_len; ++ ++ return ret; ++} ++ ++/* ++ * roll back changes made before issuing BIO in the case of IO error. ++ */ ++static void undo_bio(struct bio *bio) ++{ ++ int i; ++ ++ for (i = 0; i < bio->bi_vcnt; ++i) { ++ struct page *pg; ++ jnode *node; ++ ++ pg = bio->bi_io_vec[i].bv_page; ++ ClearPageWriteback(pg); ++ node = jprivate(pg); ++ spin_lock_jnode(node); ++ JF_CLR(node, JNODE_WRITEBACK); ++ JF_SET(node, JNODE_DIRTY); ++ spin_unlock_jnode(node); ++ } ++ bio_put(bio); ++} ++ ++/* put overwrite set back to atom's clean list */ ++static void put_overwrite_set(struct commit_handle *ch) ++{ ++ jnode *cur; ++ ++ list_for_each_entry(cur, ch->overwrite_set, capture_link) ++ jrelse_tail(cur); ++} ++ ++/* Count overwrite set size, grab disk space for wandered blocks allocation. ++ Since we have a separate list for atom's overwrite set we just scan the list, ++ count bitmap and other not leaf nodes which wandered blocks allocation we ++ have to grab space for. */ ++static int get_overwrite_set(struct commit_handle *ch) ++{ ++ int ret; ++ jnode *cur; ++ __u64 nr_not_leaves = 0; ++#if REISER4_DEBUG ++ __u64 nr_formatted_leaves = 0; ++ __u64 nr_unformatted_leaves = 0; ++#endif ++ ++ assert("zam-697", ch->overwrite_set_size == 0); ++ ++ ch->overwrite_set = ATOM_OVRWR_LIST(ch->atom); ++ cur = list_entry(ch->overwrite_set->next, jnode, capture_link); ++ ++ while (ch->overwrite_set != &cur->capture_link) { ++ jnode *next = list_entry(cur->capture_link.next, jnode, capture_link); ++ ++ /* Count bitmap locks for getting correct statistics what number ++ * of blocks were cleared by the transaction commit. */ ++ if (jnode_get_type(cur) == JNODE_BITMAP) ++ ch->nr_bitmap++; ++ ++ assert("zam-939", JF_ISSET(cur, JNODE_OVRWR) ++ || jnode_get_type(cur) == JNODE_BITMAP); ++ ++ if (jnode_is_znode(cur) && znode_above_root(JZNODE(cur))) { ++ /* we replace fake znode by another (real) ++ znode which is suggested by disk_layout ++ plugin */ ++ ++ /* FIXME: it looks like fake znode should be ++ replaced by jnode supplied by ++ disk_layout. 
*/ ++ ++ struct super_block *s = reiser4_get_current_sb(); ++ reiser4_super_info_data *sbinfo = ++ get_current_super_private(); ++ ++ if (sbinfo->df_plug->log_super) { ++ jnode *sj = sbinfo->df_plug->log_super(s); ++ ++ assert("zam-593", sj != NULL); ++ ++ if (IS_ERR(sj)) ++ return PTR_ERR(sj); ++ ++ spin_lock_jnode(sj); ++ JF_SET(sj, JNODE_OVRWR); ++ insert_into_atom_ovrwr_list(ch->atom, sj); ++ spin_unlock_jnode(sj); ++ ++ /* jload it as the rest of overwrite set */ ++ jload_gfp(sj, get_gfp_mask(), 0); ++ ++ ch->overwrite_set_size++; ++ } ++ spin_lock_jnode(cur); ++ uncapture_block(cur); ++ jput(cur); ++ ++ } else { ++ int ret; ++ ch->overwrite_set_size++; ++ ret = jload_gfp(cur, get_gfp_mask(), 0); ++ if (ret) ++ reiser4_panic("zam-783", ++ "cannot load e-flushed jnode back (ret = %d)\n", ++ ret); ++ } ++ ++ /* Count not leaves here because we have to grab disk space ++ * for wandered blocks. They were not counted as "flush ++ * reserved". Counting should be done _after_ nodes are pinned ++ * into memory by jload(). */ ++ if (!jnode_is_leaf(cur)) ++ nr_not_leaves++; ++ else { ++#if REISER4_DEBUG ++ /* at this point @cur either has JNODE_FLUSH_RESERVED ++ * or is eflushed. Locking is not strong enough to ++ * write an assertion checking for this. */ ++ if (jnode_is_znode(cur)) ++ nr_formatted_leaves++; ++ else ++ nr_unformatted_leaves++; ++#endif ++ JF_CLR(cur, JNODE_FLUSH_RESERVED); ++ } ++ ++ cur = next; ++ } ++ ++ /* Grab space for writing (wandered blocks) of not leaves found in ++ * overwrite set. */ ++ ret = reiser4_grab_space_force(nr_not_leaves, BA_RESERVED); ++ if (ret) ++ return ret; ++ ++ /* Disk space for allocation of wandered blocks of leaf nodes already ++ * reserved as "flush reserved", move it to grabbed space counter. */ ++ spin_lock_atom(ch->atom); ++ assert("zam-940", ++ nr_formatted_leaves + nr_unformatted_leaves <= ++ ch->atom->flush_reserved); ++ flush_reserved2grabbed(ch->atom, ch->atom->flush_reserved); ++ spin_unlock_atom(ch->atom); ++ ++ return ch->overwrite_set_size; ++} ++ ++/** ++ * write_jnodes_to_disk_extent - submit write request ++ * @head: ++ * @first: first jnode of the list ++ * @nr: number of jnodes on the list ++ * @block_p: ++ * @fq: ++ * @flags: used to decide whether page is to get PG_reclaim flag ++ * ++ * Submits a write request for @nr jnodes beginning from the @first, other ++ * jnodes are after the @first on the double-linked "capture" list. All jnodes ++ * will be written to the disk region of @nr blocks starting with @block_p block ++ * number. If @fq is not NULL it means that waiting for i/o completion will be ++ * done more efficiently by using flush_queue_t objects. ++ * This function is the one which writes list of jnodes in batch mode. It does ++ * all low-level things as bio construction and page states manipulation. ++ * ++ * ZAM-FIXME-HANS: brief me on why this function exists, and why bios are ++ * aggregated in this function instead of being left to the layers below ++ * ++ * FIXME: ZAM->HANS: What layer are you talking about? Can you point me to that? ++ * Why that layer needed? Why BIOs cannot be constructed here? ++ */ ++static int write_jnodes_to_disk_extent( ++ jnode *first, int nr, const reiser4_block_nr *block_p, ++ flush_queue_t *fq, int flags) ++{ ++ struct super_block *super = reiser4_get_current_sb(); ++ int write_op = ( flags & WRITEOUT_BARRIER ) ? 
WRITE_BARRIER : WRITE; ++ int max_blocks; ++ jnode *cur = first; ++ reiser4_block_nr block; ++ ++ assert("zam-571", first != NULL); ++ assert("zam-572", block_p != NULL); ++ assert("zam-570", nr > 0); ++ ++ block = *block_p; ++ max_blocks = min(bio_get_nr_vecs(super->s_bdev), BIO_MAX_PAGES); ++ ++ while (nr > 0) { ++ struct bio *bio; ++ int nr_blocks = min(nr, max_blocks); ++ int i; ++ int nr_used; ++ ++ bio = bio_alloc(GFP_NOIO, nr_blocks); ++ if (!bio) ++ return RETERR(-ENOMEM); ++ ++ bio->bi_bdev = super->s_bdev; ++ bio->bi_sector = block * (super->s_blocksize >> 9); ++ for (nr_used = 0, i = 0; i < nr_blocks; i++) { ++ struct page *pg; ++ ++ pg = jnode_page(cur); ++ assert("zam-573", pg != NULL); ++ ++ page_cache_get(pg); ++ ++ lock_and_wait_page_writeback(pg); ++ ++ if (!bio_add_page(bio, pg, super->s_blocksize, 0)) { ++ /* ++ * underlying device is satiated. Stop adding ++ * pages to the bio. ++ */ ++ unlock_page(pg); ++ page_cache_release(pg); ++ break; ++ } ++ ++ spin_lock_jnode(cur); ++ assert("nikita-3166", ++ pg->mapping == jnode_get_mapping(cur)); ++ assert("zam-912", !JF_ISSET(cur, JNODE_WRITEBACK)); ++#if REISER4_DEBUG ++ spin_lock(&cur->load); ++ assert("nikita-3165", !jnode_is_releasable(cur)); ++ spin_unlock(&cur->load); ++#endif ++ JF_SET(cur, JNODE_WRITEBACK); ++ JF_CLR(cur, JNODE_DIRTY); ++ ON_DEBUG(cur->written++); ++ spin_unlock_jnode(cur); ++ ++ ClearPageError(pg); ++ set_page_writeback(pg); ++ ++ if (get_current_context()->entd) { ++ /* this is ent thread */ ++ entd_context *ent = get_entd_context(super); ++ struct wbq *rq, *next; ++ ++ spin_lock(&ent->guard); ++ ++ if (pg == ent->cur_request->page) { ++ /* ++ * entd is called for this page. This ++ * request is not in th etodo list ++ */ ++ ent->cur_request->written = 1; ++ } else { ++ /* ++ * if we have written a page for which writepage ++ * is called for - move request to another list. 
++ */ ++ list_for_each_entry_safe(rq, next, &ent->todo_list, link) { ++ assert("", rq->magic == WBQ_MAGIC); ++ if (pg == rq->page) { ++ /* ++ * remove request from ++ * entd's queue, but do ++ * not wake up a thread ++ * which put this ++ * request ++ */ ++ list_del_init(&rq->link); ++ ent->nr_todo_reqs --; ++ list_add_tail(&rq->link, &ent->done_list); ++ ent->nr_done_reqs ++; ++ rq->written = 1; ++ break; ++ } ++ } ++ } ++ spin_unlock(&ent->guard); ++ } ++ ++ clear_page_dirty_for_io(pg); ++ ++ unlock_page(pg); ++ ++ cur = list_entry(cur->capture_link.next, jnode, capture_link); ++ nr_used++; ++ } ++ if (nr_used > 0) { ++ assert("nikita-3453", ++ bio->bi_size == super->s_blocksize * nr_used); ++ assert("nikita-3454", bio->bi_vcnt == nr_used); ++ ++ /* Check if we are allowed to write at all */ ++ if (super->s_flags & MS_RDONLY) ++ undo_bio(bio); ++ else { ++ int not_supported; ++ ++ add_fq_to_bio(fq, bio); ++ bio_get(bio); ++ reiser4_submit_bio(write_op, bio); ++ not_supported = bio_flagged(bio, BIO_EOPNOTSUPP); ++ bio_put(bio); ++ if (not_supported) ++ return -EOPNOTSUPP; ++ } ++ ++ block += nr_used - 1; ++ update_blocknr_hint_default(super, &block); ++ block += 1; ++ } else { ++ bio_put(bio); ++ } ++ nr -= nr_used; ++ } ++ ++ return 0; ++} ++ ++/* This is a procedure which recovers a contiguous sequences of disk block ++ numbers in the given list of j-nodes and submits write requests on this ++ per-sequence basis */ ++int ++write_jnode_list(struct list_head *head, flush_queue_t *fq, ++ long *nr_submitted, int flags) ++{ ++ int ret; ++ jnode *beg = list_entry(head->next, jnode, capture_link); ++ ++ while (head != &beg->capture_link) { ++ int nr = 1; ++ jnode *cur = list_entry(beg->capture_link.next, jnode, capture_link); ++ ++ while (head != &cur->capture_link) { ++ if (*jnode_get_block(cur) != *jnode_get_block(beg) + nr) ++ break; ++ ++nr; ++ cur = list_entry(cur->capture_link.next, jnode, capture_link); ++ } ++ ++ ret = write_jnodes_to_disk_extent( ++ beg, nr, jnode_get_block(beg), fq, flags); ++ if (ret) ++ return ret; ++ ++ if (nr_submitted) ++ *nr_submitted += nr; ++ ++ beg = cur; ++ } ++ ++ return 0; ++} ++ ++/* add given wandered mapping to atom's wandered map */ ++static int ++add_region_to_wmap(jnode * cur, int len, const reiser4_block_nr * block_p) ++{ ++ int ret; ++ blocknr_set_entry *new_bsep = NULL; ++ reiser4_block_nr block; ++ ++ txn_atom *atom; ++ ++ assert("zam-568", block_p != NULL); ++ block = *block_p; ++ assert("zam-569", len > 0); ++ ++ while ((len--) > 0) { ++ do { ++ atom = get_current_atom_locked(); ++ assert("zam-536", ++ !blocknr_is_fake(jnode_get_block(cur))); ++ ret = ++ blocknr_set_add_pair(atom, &atom->wandered_map, ++ &new_bsep, ++ jnode_get_block(cur), &block); ++ } while (ret == -E_REPEAT); ++ ++ if (ret) { ++ /* deallocate blocks which were not added to wandered ++ map */ ++ reiser4_block_nr wide_len = len; ++ ++ reiser4_dealloc_blocks(&block, &wide_len, ++ BLOCK_NOT_COUNTED, ++ BA_FORMATTED ++ /* formatted, without defer */ ); ++ ++ return ret; ++ } ++ ++ spin_unlock_atom(atom); ++ ++ cur = list_entry(cur->capture_link.next, jnode, capture_link); ++ ++block; ++ } ++ ++ return 0; ++} ++ ++/* Allocate wandered blocks for current atom's OVERWRITE SET and immediately ++ submit IO for allocated blocks. We assume that current atom is in a stage ++ when any atom fusion is impossible and atom is unlocked and it is safe. 
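write_jnode_list() above batches I/O by detecting maximal runs of consecutive block numbers before handing each run to write_jnodes_to_disk_extent(). The run-detection loop in isolation, as a self-contained sketch with hypothetical block numbers:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

typedef uint64_t block_nr;

int main(void)
{
	/* hypothetical block numbers of jnodes in capture-list order */
	block_nr blocks[] = { 100, 101, 102, 200, 201, 500 };
	size_t n = sizeof blocks / sizeof blocks[0];
	size_t beg = 0;

	/* greedily extend each run while block numbers stay consecutive,
	 * then submit the whole extent as one request, the same grouping
	 * write_jnode_list() does */
	while (beg < n) {
		size_t nr = 1;

		while (beg + nr < n && blocks[beg + nr] == blocks[beg] + nr)
			nr++;
		printf("submit %zu block(s) starting at %llu\n",
		       nr, (unsigned long long)blocks[beg]);
		beg += nr;
	}
	return 0;	/* prints runs at 100 (3), 200 (2), 500 (1) */
}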
*/ ++static int alloc_wandered_blocks(struct commit_handle *ch, flush_queue_t *fq) ++{ ++ reiser4_block_nr block; ++ ++ int rest; ++ int len; ++ int ret; ++ ++ jnode *cur; ++ ++ assert("zam-534", ch->overwrite_set_size > 0); ++ ++ rest = ch->overwrite_set_size; ++ ++ cur = list_entry(ch->overwrite_set->next, jnode, capture_link); ++ while (ch->overwrite_set != &cur->capture_link) { ++ assert("zam-567", JF_ISSET(cur, JNODE_OVRWR)); ++ ++ ret = get_more_wandered_blocks(rest, &block, &len); ++ if (ret) ++ return ret; ++ ++ rest -= len; ++ ++ ret = add_region_to_wmap(cur, len, &block); ++ if (ret) ++ return ret; ++ ++ ret = write_jnodes_to_disk_extent(cur, len, &block, fq, 0); ++ if (ret) ++ return ret; ++ ++ while ((len--) > 0) { ++ assert("zam-604", ++ ch->overwrite_set != &cur->capture_link); ++ cur = list_entry(cur->capture_link.next, jnode, capture_link); ++ } ++ } ++ ++ return 0; ++} ++ ++/* allocate given number of nodes over the journal area and link them into a ++ list, return pointer to the first jnode in the list */ ++static int alloc_tx(struct commit_handle *ch, flush_queue_t * fq) ++{ ++ reiser4_blocknr_hint hint; ++ reiser4_block_nr allocated = 0; ++ reiser4_block_nr first, len; ++ jnode *cur; ++ jnode *txhead; ++ int ret; ++ reiser4_context *ctx; ++ reiser4_super_info_data *sbinfo; ++ ++ assert("zam-698", ch->tx_size > 0); ++ assert("zam-699", list_empty_careful(&ch->tx_list)); ++ ++ ctx = get_current_context(); ++ sbinfo = get_super_private(ctx->super); ++ ++ while (allocated < (unsigned)ch->tx_size) { ++ len = (ch->tx_size - allocated); ++ ++ blocknr_hint_init(&hint); ++ ++ hint.block_stage = BLOCK_GRABBED; ++ ++ /* FIXME: there should be some block allocation policy for ++ nodes which contain wander records */ ++ ++ /* We assume that disk space for wandered record blocks can be ++ * taken from reserved area. 
*/ ++ ret = reiser4_alloc_blocks(&hint, &first, &len, ++ BA_FORMATTED | BA_RESERVED | ++ BA_USE_DEFAULT_SEARCH_START); ++ blocknr_hint_done(&hint); ++ ++ if (ret) ++ return ret; ++ ++ allocated += len; ++ ++ /* create jnodes for all wander records */ ++ while (len--) { ++ cur = alloc_io_head(&first); ++ ++ if (cur == NULL) { ++ ret = RETERR(-ENOMEM); ++ goto free_not_assigned; ++ } ++ ++ ret = jinit_new(cur, get_gfp_mask()); ++ ++ if (ret != 0) { ++ jfree(cur); ++ goto free_not_assigned; ++ } ++ ++ pin_jnode_data(cur); ++ ++ list_add_tail(&cur->capture_link, &ch->tx_list); ++ ++ first++; ++ } ++ } ++ ++ { /* format a on-disk linked list of wander records */ ++ int serial = 1; ++ ++ txhead = list_entry(ch->tx_list.next, jnode, capture_link); ++ format_tx_head(ch); ++ ++ cur = list_entry(txhead->capture_link.next, jnode, capture_link); ++ while (&ch->tx_list != &cur->capture_link) { ++ format_wander_record(ch, cur, serial++); ++ cur = list_entry(cur->capture_link.next, jnode, capture_link); ++ } ++ } ++ ++ { /* Fill wander records with Wandered Set */ ++ struct store_wmap_params params; ++ txn_atom *atom; ++ ++ params.cur = list_entry(txhead->capture_link.next, jnode, capture_link); ++ ++ params.idx = 0; ++ params.capacity = ++ wander_record_capacity(reiser4_get_current_sb()); ++ ++ atom = get_current_atom_locked(); ++ blocknr_set_iterator(atom, &atom->wandered_map, ++ &store_wmap_actor, ¶ms, 0); ++ spin_unlock_atom(atom); ++ } ++ ++ { /* relse all jnodes from tx_list */ ++ cur = list_entry(ch->tx_list.next, jnode, capture_link); ++ while (&ch->tx_list != &cur->capture_link) { ++ jrelse(cur); ++ cur = list_entry(cur->capture_link.next, jnode, capture_link); ++ } ++ } ++ ++ ret = write_jnode_list(&ch->tx_list, fq, NULL, 0); ++ ++ return ret; ++ ++ free_not_assigned: ++ /* We deallocate blocks not yet assigned to jnodes on tx_list. The ++ caller takes care about invalidating of tx list */ ++ reiser4_dealloc_blocks(&first, &len, BLOCK_NOT_COUNTED, BA_FORMATTED); ++ ++ return ret; ++} ++ ++static int commit_tx(struct commit_handle *ch) ++{ ++ flush_queue_t *fq; ++ int barrier; ++ int ret; ++ ++ /* Grab more space for wandered records. 
*/ ++ ret = reiser4_grab_space_force((__u64) (ch->tx_size), BA_RESERVED); ++ if (ret) ++ return ret; ++ ++ fq = get_fq_for_current_atom(); ++ if (IS_ERR(fq)) ++ return PTR_ERR(fq); ++ ++ spin_unlock_atom(fq->atom); ++ do { ++ ret = alloc_wandered_blocks(ch, fq); ++ if (ret) ++ break; ++ ret = alloc_tx(ch, fq); ++ if (ret) ++ break; ++ } while (0); ++ ++ fq_put(fq); ++ if (ret) ++ return ret; ++ repeat_wo_barrier: ++ barrier = reiser4_use_write_barrier(ch->super); ++ if (!barrier) { ++ ret = current_atom_finish_all_fq(); ++ if (ret) ++ return ret; ++ } ++ ret = update_journal_header(ch, barrier); ++ if (barrier) { ++ if (ret) { ++ if (ret == -EOPNOTSUPP) { ++ disable_write_barrier(ch->super); ++ goto repeat_wo_barrier; ++ } ++ return ret; ++ } ++ ret = current_atom_finish_all_fq(); ++ } ++ return ret; ++} ++ ++ ++static int write_tx_back(struct commit_handle * ch) ++{ ++ flush_queue_t *fq; ++ int ret; ++ int barrier; ++ ++ post_commit_hook(); ++ fq = get_fq_for_current_atom(); ++ if (IS_ERR(fq)) ++ return PTR_ERR(fq); ++ spin_unlock_atom(fq->atom); ++ ret = write_jnode_list( ++ ch->overwrite_set, fq, NULL, WRITEOUT_FOR_PAGE_RECLAIM); ++ fq_put(fq); ++ if (ret) ++ return ret; ++ repeat_wo_barrier: ++ barrier = reiser4_use_write_barrier(ch->super); ++ if (!barrier) { ++ ret = current_atom_finish_all_fq(); ++ if (ret) ++ return ret; ++ } ++ ret = update_journal_footer(ch, barrier); ++ if (barrier) { ++ if (ret) { ++ if (ret == -EOPNOTSUPP) { ++ disable_write_barrier(ch->super); ++ goto repeat_wo_barrier; ++ } ++ return ret; ++ } ++ ret = current_atom_finish_all_fq(); ++ } ++ if (ret) ++ return ret; ++ post_write_back_hook(); ++ return 0; ++} ++ ++/* We assume that at this moment all captured blocks are marked as RELOC or ++ WANDER (belong to Relocate o Overwrite set), all nodes from Relocate set ++ are submitted to write. ++*/ ++ ++int reiser4_write_logs(long *nr_submitted) ++{ ++ txn_atom *atom; ++ struct super_block *super = reiser4_get_current_sb(); ++ reiser4_super_info_data *sbinfo = get_super_private(super); ++ struct commit_handle ch; ++ int ret; ++ ++ writeout_mode_enable(); ++ ++ /* block allocator may add j-nodes to the clean_list */ ++ ret = pre_commit_hook(); ++ if (ret) ++ return ret; ++ ++ /* No locks are required if we take atom which stage >= ++ * ASTAGE_PRE_COMMIT */ ++ atom = get_current_context()->trans->atom; ++ assert("zam-965", atom != NULL); ++ ++ /* relocate set is on the atom->clean_nodes list after ++ * current_atom_complete_writes() finishes. It can be safely ++ * uncaptured after commit_semaphore is taken, because any atom that ++ * captures these nodes is guaranteed to commit after current one. ++ * ++ * This can only be done after pre_commit_hook(), because it is where ++ * early flushed jnodes with CREATED bit are transferred to the ++ * overwrite list. */ ++ invalidate_list(ATOM_CLEAN_LIST(atom)); ++ spin_lock_atom(atom); ++ /* There might be waiters for the relocate nodes which we have ++ * released, wake them up. 
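commit_tx() and write_tx_back() above share the same fallback shape: try the control-block write with a write barrier; if the device returns -EOPNOTSUPP, disable barriers for this filesystem and redo the step with an explicit wait on outstanding I/O instead. A simplified self-contained model of that retry logic, where submit() and wait_for_io() are hypothetical stand-ins:

#include <errno.h>
#include <stdio.h>

static int barriers_enabled = 1;

/* hypothetical submit: pretend the device rejects barrier writes */
static int submit(int use_barrier)
{
	return use_barrier ? -EOPNOTSUPP : 0;
}

static int wait_for_io(void) { return 0; }

static int write_control_block(void)
{
	int ret;

repeat_wo_barrier:
	if (!barriers_enabled) {
		ret = wait_for_io();	/* drain queued writes first */
		if (ret)
			return ret;
	}
	ret = submit(barriers_enabled);
	if (barriers_enabled) {
		if (ret == -EOPNOTSUPP) {
			barriers_enabled = 0; /* disable_write_barrier() */
			goto repeat_wo_barrier;
		}
		if (ret)
			return ret;
		ret = wait_for_io();
	}
	return ret;
}

int main(void)
{
	printf("ret=%d barriers=%d\n", write_control_block(),
	       barriers_enabled);	/* ret=0 barriers=0 */
	return 0;
}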
*/ ++ atom_send_event(atom); ++ spin_unlock_atom(atom); ++ ++ if (REISER4_DEBUG) { ++ int level; ++ ++ for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; ++level) ++ assert("nikita-3352", ++ list_empty_careful(ATOM_DIRTY_LIST(atom, level))); ++ } ++ ++ sbinfo->nr_files_committed += (unsigned)atom->nr_objects_created; ++ sbinfo->nr_files_committed -= (unsigned)atom->nr_objects_deleted; ++ ++ init_commit_handle(&ch, atom); ++ ++ ch.free_blocks = sbinfo->blocks_free_committed; ++ ch.nr_files = sbinfo->nr_files_committed; ++ /* ZAM-FIXME-HANS: email me what the contention level is for the super ++ * lock. */ ++ ch.next_oid = oid_next(super); ++ ++ /* count overwrite set and place it in a separate list */ ++ ret = get_overwrite_set(&ch); ++ ++ if (ret <= 0) { ++ /* It is possible that overwrite set is empty here, it means ++ all captured nodes are clean */ ++ goto up_and_ret; ++ } ++ ++ /* Inform the caller about what number of dirty pages will be ++ * submitted to disk. */ ++ *nr_submitted += ch.overwrite_set_size - ch.nr_bitmap; ++ ++ /* count all records needed for storing of the wandered set */ ++ get_tx_size(&ch); ++ ++ ret = commit_tx(&ch); ++ if (ret) ++ goto up_and_ret; ++ ++ spin_lock_atom(atom); ++ atom_set_stage(atom, ASTAGE_POST_COMMIT); ++ spin_unlock_atom(atom); ++ ++ ret = write_tx_back(&ch); ++ post_write_back_hook(); ++ ++ up_and_ret: ++ if (ret) { ++ /* there could be fq attached to current atom; the only way to ++ remove them is: */ ++ current_atom_finish_all_fq(); ++ } ++ ++ /* free blocks of flushed transaction */ ++ dealloc_tx_list(&ch); ++ dealloc_wmap(&ch); ++ ++ put_overwrite_set(&ch); ++ ++ done_commit_handle(&ch); ++ ++ writeout_mode_disable(); ++ ++ return ret; ++} ++ ++/* consistency checks for journal data/control blocks: header, footer, log ++ records, transactions head blocks. All functions return zero on success. */ ++ ++static int check_journal_header(const jnode * node UNUSED_ARG) ++{ ++ /* FIXME: journal header has no magic field yet. */ ++ return 0; ++} ++ ++/* wait for write completion for all jnodes from given list */ ++static int wait_on_jnode_list(struct list_head *head) ++{ ++ jnode *scan; ++ int ret = 0; ++ ++ list_for_each_entry(scan, head, capture_link) { ++ struct page *pg = jnode_page(scan); ++ ++ if (pg) { ++ if (PageWriteback(pg)) ++ wait_on_page_writeback(pg); ++ ++ if (PageError(pg)) ++ ret++; ++ } ++ } ++ ++ return ret; ++} ++ ++static int check_journal_footer(const jnode * node UNUSED_ARG) ++{ ++ /* FIXME: journal footer has no magic field yet. 
*/ ++ return 0; ++} ++ ++static int check_tx_head(const jnode * node) ++{ ++ struct tx_header *header = (struct tx_header *)jdata(node); ++ ++ if (memcmp(&header->magic, TX_HEADER_MAGIC, TX_HEADER_MAGIC_SIZE) != 0) { ++ warning("zam-627", "tx head at block %s corrupted\n", ++ sprint_address(jnode_get_block(node))); ++ return RETERR(-EIO); ++ } ++ ++ return 0; ++} ++ ++static int check_wander_record(const jnode * node) ++{ ++ struct wander_record_header *RH = ++ (struct wander_record_header *)jdata(node); ++ ++ if (memcmp(&RH->magic, WANDER_RECORD_MAGIC, WANDER_RECORD_MAGIC_SIZE) != ++ 0) { ++ warning("zam-628", "wander record at block %s corrupted\n", ++ sprint_address(jnode_get_block(node))); ++ return RETERR(-EIO); ++ } ++ ++ return 0; ++} ++ ++/* fill commit_handler structure by everything what is needed for update_journal_footer */ ++static int restore_commit_handle(struct commit_handle *ch, jnode *tx_head) ++{ ++ struct tx_header *TXH; ++ int ret; ++ ++ ret = jload(tx_head); ++ if (ret) ++ return ret; ++ ++ TXH = (struct tx_header *)jdata(tx_head); ++ ++ ch->free_blocks = le64_to_cpu(get_unaligned(&TXH->free_blocks)); ++ ch->nr_files = le64_to_cpu(get_unaligned(&TXH->nr_files)); ++ ch->next_oid = le64_to_cpu(get_unaligned(&TXH->next_oid)); ++ ++ jrelse(tx_head); ++ ++ list_add(&tx_head->capture_link, &ch->tx_list); ++ ++ return 0; ++} ++ ++/* replay one transaction: restore and write overwrite set in place */ ++static int replay_transaction(const struct super_block *s, ++ jnode * tx_head, ++ const reiser4_block_nr * log_rec_block_p, ++ const reiser4_block_nr * end_block, ++ unsigned int nr_wander_records) ++{ ++ reiser4_block_nr log_rec_block = *log_rec_block_p; ++ struct commit_handle ch; ++ LIST_HEAD(overwrite_set); ++ jnode *log; ++ int ret; ++ ++ init_commit_handle(&ch, NULL); ++ ch.overwrite_set = &overwrite_set; ++ ++ restore_commit_handle(&ch, tx_head); ++ ++ while (log_rec_block != *end_block) { ++ struct wander_record_header *header; ++ struct wander_entry *entry; ++ ++ int i; ++ ++ if (nr_wander_records == 0) { ++ warning("zam-631", ++ "number of wander records in the linked list" ++ " greater than number stored in tx head.\n"); ++ ret = RETERR(-EIO); ++ goto free_ow_set; ++ } ++ ++ log = alloc_io_head(&log_rec_block); ++ if (log == NULL) ++ return RETERR(-ENOMEM); ++ ++ ret = jload(log); ++ if (ret < 0) { ++ drop_io_head(log); ++ return ret; ++ } ++ ++ ret = check_wander_record(log); ++ if (ret) { ++ jrelse(log); ++ drop_io_head(log); ++ return ret; ++ } ++ ++ header = (struct wander_record_header *)jdata(log); ++ log_rec_block = le64_to_cpu(get_unaligned(&header->next_block)); ++ ++ entry = (struct wander_entry *)(header + 1); ++ ++ /* restore overwrite set from wander record content */ ++ for (i = 0; i < wander_record_capacity(s); i++) { ++ reiser4_block_nr block; ++ jnode *node; ++ ++ block = le64_to_cpu(get_unaligned(&entry->wandered)); ++ if (block == 0) ++ break; ++ ++ node = alloc_io_head(&block); ++ if (node == NULL) { ++ ret = RETERR(-ENOMEM); ++ /* ++ * FIXME-VS:??? ++ */ ++ jrelse(log); ++ drop_io_head(log); ++ goto free_ow_set; ++ } ++ ++ ret = jload(node); ++ ++ if (ret < 0) { ++ drop_io_head(node); ++ /* ++ * FIXME-VS:??? 
++ */ ++ jrelse(log); ++ drop_io_head(log); ++ goto free_ow_set; ++ } ++ ++ block = le64_to_cpu(get_unaligned(&entry->original)); ++ ++ assert("zam-603", block != 0); ++ ++ jnode_set_block(node, &block); ++ ++ list_add_tail(&node->capture_link, ch.overwrite_set); ++ ++ ++entry; ++ } ++ ++ jrelse(log); ++ drop_io_head(log); ++ ++ --nr_wander_records; ++ } ++ ++ if (nr_wander_records != 0) { ++ warning("zam-632", "number of wander records in the linked list" ++ " less than number stored in tx head.\n"); ++ ret = RETERR(-EIO); ++ goto free_ow_set; ++ } ++ ++ { /* write wandered set in place */ ++ write_jnode_list(ch.overwrite_set, NULL, NULL, 0); ++ ret = wait_on_jnode_list(ch.overwrite_set); ++ ++ if (ret) { ++ ret = RETERR(-EIO); ++ goto free_ow_set; ++ } ++ } ++ ++ ret = update_journal_footer(&ch, 0); ++ ++ free_ow_set: ++ ++ while (!list_empty(ch.overwrite_set)) { ++ jnode *cur = list_entry(ch.overwrite_set->next, jnode, capture_link); ++ list_del_init(&cur->capture_link); ++ jrelse(cur); ++ drop_io_head(cur); ++ } ++ ++ list_del_init(&tx_head->capture_link); ++ ++ done_commit_handle(&ch); ++ ++ return ret; ++} ++ ++/* find oldest committed and not played transaction and play it. The transaction ++ * was committed and journal header block was updated but the blocks from the ++ * process of writing the atom's overwrite set in-place and updating of journal ++ * footer block were not completed. This function completes the process by ++ * recovering the atom's overwrite set from their wandered locations and writes ++ * them in-place and updating the journal footer. */ ++static int replay_oldest_transaction(struct super_block *s) ++{ ++ reiser4_super_info_data *sbinfo = get_super_private(s); ++ jnode *jf = sbinfo->journal_footer; ++ unsigned int total; ++ struct journal_footer *F; ++ struct tx_header *T; ++ ++ reiser4_block_nr prev_tx; ++ reiser4_block_nr last_flushed_tx; ++ reiser4_block_nr log_rec_block = 0; ++ ++ jnode *tx_head; ++ ++ int ret; ++ ++ if ((ret = jload(jf)) < 0) ++ return ret; ++ ++ F = (struct journal_footer *)jdata(jf); ++ ++ last_flushed_tx = le64_to_cpu(get_unaligned(&F->last_flushed_tx)); ++ ++ jrelse(jf); ++ ++ if (sbinfo->last_committed_tx == last_flushed_tx) { ++ /* all transactions are replayed */ ++ return 0; ++ } ++ ++ prev_tx = sbinfo->last_committed_tx; ++ ++ /* searching for oldest not flushed transaction */ ++ while (1) { ++ tx_head = alloc_io_head(&prev_tx); ++ if (!tx_head) ++ return RETERR(-ENOMEM); ++ ++ ret = jload(tx_head); ++ if (ret < 0) { ++ drop_io_head(tx_head); ++ return ret; ++ } ++ ++ ret = check_tx_head(tx_head); ++ if (ret) { ++ jrelse(tx_head); ++ drop_io_head(tx_head); ++ return ret; ++ } ++ ++ T = (struct tx_header *)jdata(tx_head); ++ ++ prev_tx = le64_to_cpu(get_unaligned(&T->prev_tx)); ++ ++ if (prev_tx == last_flushed_tx) ++ break; ++ ++ jrelse(tx_head); ++ drop_io_head(tx_head); ++ } ++ ++ total = le32_to_cpu(get_unaligned(&T->total)); ++ log_rec_block = le64_to_cpu(get_unaligned(&T->next_block)); ++ ++ pin_jnode_data(tx_head); ++ jrelse(tx_head); ++ ++ ret = ++ replay_transaction(s, tx_head, &log_rec_block, ++ jnode_get_block(tx_head), total - 1); ++ ++ unpin_jnode_data(tx_head); ++ drop_io_head(tx_head); ++ ++ if (ret) ++ return ret; ++ return -E_REPEAT; ++} ++ ++/* The reiser4 journal current implementation was optimized to not to capture ++ super block if certain super blocks fields are modified. Currently, the set ++ is (, ). 
These fields are logged by ++ special way which includes storing them in each transaction head block at ++ atom commit time and writing that information to journal footer block at ++ atom flush time. For getting info from journal footer block to the ++ in-memory super block there is a special function ++ reiser4_journal_recover_sb_data() which should be called after disk format ++ plugin re-reads super block after journal replaying. ++*/ ++ ++/* get the information from journal footer in-memory super block */ ++int reiser4_journal_recover_sb_data(struct super_block *s) ++{ ++ reiser4_super_info_data *sbinfo = get_super_private(s); ++ struct journal_footer *jf; ++ int ret; ++ ++ assert("zam-673", sbinfo->journal_footer != NULL); ++ ++ ret = jload(sbinfo->journal_footer); ++ if (ret != 0) ++ return ret; ++ ++ ret = check_journal_footer(sbinfo->journal_footer); ++ if (ret != 0) ++ goto out; ++ ++ jf = (struct journal_footer *)jdata(sbinfo->journal_footer); ++ ++ /* was there at least one flushed transaction? */ ++ if (jf->last_flushed_tx) { ++ ++ /* restore free block counter logged in this transaction */ ++ reiser4_set_free_blocks(s, le64_to_cpu(get_unaligned(&jf->free_blocks))); ++ ++ /* restore oid allocator state */ ++ oid_init_allocator(s, ++ le64_to_cpu(get_unaligned(&jf->nr_files)), ++ le64_to_cpu(get_unaligned(&jf->next_oid))); ++ } ++ out: ++ jrelse(sbinfo->journal_footer); ++ return ret; ++} ++ ++/* reiser4 replay journal procedure */ ++int reiser4_journal_replay(struct super_block *s) ++{ ++ reiser4_super_info_data *sbinfo = get_super_private(s); ++ jnode *jh, *jf; ++ struct journal_header *header; ++ int nr_tx_replayed = 0; ++ int ret; ++ ++ assert("zam-582", sbinfo != NULL); ++ ++ jh = sbinfo->journal_header; ++ jf = sbinfo->journal_footer; ++ ++ if (!jh || !jf) { ++ /* it is possible that disk layout does not support journal ++ structures, we just warn about this */ ++ warning("zam-583", ++ "journal control blocks were not loaded by disk layout plugin. " ++ "journal replaying is not possible.\n"); ++ return 0; ++ } ++ ++ /* Take free block count from journal footer block. 
The free block ++ counter value corresponds the last flushed transaction state */ ++ ret = jload(jf); ++ if (ret < 0) ++ return ret; ++ ++ ret = check_journal_footer(jf); ++ if (ret) { ++ jrelse(jf); ++ return ret; ++ } ++ ++ jrelse(jf); ++ ++ /* store last committed transaction info in reiser4 in-memory super ++ block */ ++ ret = jload(jh); ++ if (ret < 0) ++ return ret; ++ ++ ret = check_journal_header(jh); ++ if (ret) { ++ jrelse(jh); ++ return ret; ++ } ++ ++ header = (struct journal_header *)jdata(jh); ++ sbinfo->last_committed_tx = le64_to_cpu(get_unaligned(&header->last_committed_tx)); ++ ++ jrelse(jh); ++ ++ /* replay committed transactions */ ++ while ((ret = replay_oldest_transaction(s)) == -E_REPEAT) ++ nr_tx_replayed++; ++ ++ return ret; ++} ++ ++/* load journal control block (either journal header or journal footer block) */ ++static int ++load_journal_control_block(jnode ** node, const reiser4_block_nr * block) ++{ ++ int ret; ++ ++ *node = alloc_io_head(block); ++ if (!(*node)) ++ return RETERR(-ENOMEM); ++ ++ ret = jload(*node); ++ ++ if (ret) { ++ drop_io_head(*node); ++ *node = NULL; ++ return ret; ++ } ++ ++ pin_jnode_data(*node); ++ jrelse(*node); ++ ++ return 0; ++} ++ ++/* unload journal header or footer and free jnode */ ++static void unload_journal_control_block(jnode ** node) ++{ ++ if (*node) { ++ unpin_jnode_data(*node); ++ drop_io_head(*node); ++ *node = NULL; ++ } ++} ++ ++/* release journal control blocks */ ++void done_journal_info(struct super_block *s) ++{ ++ reiser4_super_info_data *sbinfo = get_super_private(s); ++ ++ assert("zam-476", sbinfo != NULL); ++ ++ unload_journal_control_block(&sbinfo->journal_header); ++ unload_journal_control_block(&sbinfo->journal_footer); ++ rcu_barrier(); ++} ++ ++/* load journal control blocks */ ++int init_journal_info(struct super_block *s) ++{ ++ reiser4_super_info_data *sbinfo = get_super_private(s); ++ journal_location *loc; ++ int ret; ++ ++ loc = &sbinfo->jloc; ++ ++ assert("zam-651", loc != NULL); ++ assert("zam-652", loc->header != 0); ++ assert("zam-653", loc->footer != 0); ++ ++ ret = load_journal_control_block(&sbinfo->journal_header, &loc->header); ++ ++ if (ret) ++ return ret; ++ ++ ret = load_journal_control_block(&sbinfo->journal_footer, &loc->footer); ++ ++ if (ret) { ++ unload_journal_control_block(&sbinfo->journal_header); ++ } ++ ++ return ret; ++} ++ ++/* Make Linus happy. 
++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 80 ++ End: ++*/ +Index: linux-2.6.16/fs/reiser4/wander.h +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/wander.h +@@ -0,0 +1,135 @@ ++/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++#if !defined (__FS_REISER4_WANDER_H__) ++#define __FS_REISER4_WANDER_H__ ++ ++#include "dformat.h" ++ ++#include /* for struct super_block */ ++ ++/* REISER4 JOURNAL ON-DISK DATA STRUCTURES */ ++ ++#define TX_HEADER_MAGIC "TxMagic4" ++#define WANDER_RECORD_MAGIC "LogMagc4" ++ ++#define TX_HEADER_MAGIC_SIZE (8) ++#define WANDER_RECORD_MAGIC_SIZE (8) ++ ++/* journal header block format */ ++struct journal_header { ++ /* last written transaction head location */ ++ d64 last_committed_tx; ++}; ++ ++typedef struct journal_location { ++ reiser4_block_nr footer; ++ reiser4_block_nr header; ++} journal_location; ++ ++/* The wander.c head comment describes usage and semantic of all these structures */ ++/* journal footer block format */ ++struct journal_footer { ++ /* last flushed transaction location. */ ++ /* This block number is no more valid after the transaction it points ++ to gets flushed, this number is used only at journal replaying time ++ for detection of the end of on-disk list of committed transactions ++ which were not flushed completely */ ++ d64 last_flushed_tx; ++ ++ /* free block counter is written in journal footer at transaction ++ flushing , not in super block because free blocks counter is logged ++ by another way than super block fields (root pointer, for ++ example). */ ++ d64 free_blocks; ++ ++ /* number of used OIDs and maximal used OID are logged separately from ++ super block */ ++ d64 nr_files; ++ d64 next_oid; ++}; ++ ++/* Each wander record (except the first one) has unified format with wander ++ record header followed by an array of log entries */ ++struct wander_record_header { ++ /* when there is no predefined location for wander records, this magic ++ string should help reiser4fsck. */ ++ char magic[WANDER_RECORD_MAGIC_SIZE]; ++ ++ /* transaction id */ ++ d64 id; ++ ++ /* total number of wander records in current transaction */ ++ d32 total; ++ ++ /* this block number in transaction */ ++ d32 serial; ++ ++ /* number of previous block in commit */ ++ d64 next_block; ++}; ++ ++/* The first wander record (transaction head) of written transaction has the ++ special format */ ++struct tx_header { ++ /* magic string makes first block in transaction different from other ++ logged blocks, it should help fsck. 
*/ ++ char magic[TX_HEADER_MAGIC_SIZE]; ++ ++ /* transaction id */ ++ d64 id; ++ ++ /* total number of records (including this first tx head) in the ++ transaction */ ++ d32 total; ++ ++ /* align next field to 8-byte boundary; this field always is zero */ ++ d32 padding; ++ ++ /* block number of previous transaction head */ ++ d64 prev_tx; ++ ++ /* next wander record location */ ++ d64 next_block; ++ ++ /* committed versions of free blocks counter */ ++ d64 free_blocks; ++ ++ /* number of used OIDs (nr_files) and maximal used OID are logged ++ separately from super block */ ++ d64 nr_files; ++ d64 next_oid; ++}; ++ ++/* A transaction gets written to disk as a set of wander records (each wander ++ record size is fs block) */ ++ ++/* As it was told above a wander The rest of wander record is filled by these log entries, unused space filled ++ by zeroes */ ++struct wander_entry { ++ d64 original; /* block original location */ ++ d64 wandered; /* block wandered location */ ++}; ++ ++/* REISER4 JOURNAL WRITER FUNCTIONS */ ++ ++extern int reiser4_write_logs(long *); ++extern int reiser4_journal_replay(struct super_block *); ++extern int reiser4_journal_recover_sb_data(struct super_block *); ++ ++extern int init_journal_info(struct super_block *); ++extern void done_journal_info(struct super_block *); ++ ++extern int write_jnode_list(struct list_head *, flush_queue_t *, long *, int); ++ ++#endif /* __FS_REISER4_WANDER_H__ */ ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 80 ++ scroll-step: 1 ++ End: ++*/ +Index: linux-2.6.16/fs/reiser4/writeout.h +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/writeout.h +@@ -0,0 +1,21 @@ ++/* Copyright 2002, 2003, 2004 by Hans Reiser, licensing governed by reiser4/README */ ++ ++#if !defined (__FS_REISER4_WRITEOUT_H__) ++ ++#define WRITEOUT_SINGLE_STREAM (0x1) ++#define WRITEOUT_FOR_PAGE_RECLAIM (0x2) ++#define WRITEOUT_BARRIER (0x4) ++ ++extern int get_writeout_flags(void); ++ ++#endif /* __FS_REISER4_WRITEOUT_H__ */ ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 80 ++ End: ++*/ +Index: linux-2.6.16/fs/reiser4/znode.c +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/znode.c +@@ -0,0 +1,1028 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++/* Znode manipulation functions. */ ++/* Znode is the in-memory header for a tree node. It is stored ++ separately from the node itself so that it does not get written to ++ disk. In this respect znode is like buffer head or page head. We ++ also use znodes for additional reiser4 specific purposes: ++ ++ . they are organized into tree structure which is a part of whole ++ reiser4 tree. ++ . they are used to implement node grained locking ++ . they are used to keep additional state associated with a ++ node ++ . they contain links to lists used by the transaction manager ++ ++ Znode is attached to some variable "block number" which is instance of ++ fs/reiser4/tree.h:reiser4_block_nr type. Znode can exist without ++ appropriate node being actually loaded in memory. Existence of znode itself ++ is regulated by reference count (->x_count) in it. 
Each time thread ++ acquires reference to znode through call to zget(), ->x_count is ++ incremented and decremented on call to zput(). Data (content of node) are ++ brought in memory through call to zload(), which also increments ->d_count ++ reference counter. zload can block waiting on IO. Call to zrelse() ++ decreases this counter. Also, ->c_count keeps track of number of child ++ znodes and prevents parent znode from being recycled until all of its ++ children are. ->c_count is decremented whenever child goes out of existence ++ (being actually recycled in zdestroy()) which can be some time after last ++ reference to this child dies if we support some form of LRU cache for ++ znodes. ++ ++*/ ++/* EVERY ZNODE'S STORY ++ ++ 1. His infancy. ++ ++ Once upon a time, the znode was born deep inside of zget() by call to ++ zalloc(). At the return from zget() znode had: ++ ++ . reference counter (x_count) of 1 ++ . assigned block number, marked as used in bitmap ++ . pointer to parent znode. Root znode parent pointer points ++ to its father: "fake" znode. This, in turn, has NULL parent pointer. ++ . hash table linkage ++ . no data loaded from disk ++ . no node plugin ++ . no sibling linkage ++ ++ 2. His childhood ++ ++ Each node is either brought into memory as a result of tree traversal, or ++ created afresh, creation of the root being a special case of the latter. In ++ either case it's inserted into sibling list. This will typically require ++ some ancillary tree traversing, but ultimately both sibling pointers will ++ exist and JNODE_LEFT_CONNECTED and JNODE_RIGHT_CONNECTED will be true in ++ zjnode.state. ++ ++ 3. His youth. ++ ++ If znode is bound to already existing node in a tree, its content is read ++ from the disk by call to zload(). At that moment, JNODE_LOADED bit is set ++ in zjnode.state and zdata() function starts to return non null for this ++ znode. zload() further calls zparse() that determines which node layout ++ this node is rendered in, and sets ->nplug on success. ++ ++ If znode is for new node just created, memory for it is allocated and ++ zinit_new() function is called to initialise data, according to selected ++ node layout. ++ ++ 4. His maturity. ++ ++ After this point, znode lingers in memory for some time. Threads can ++ acquire references to znode either by blocknr through call to zget(), or by ++ following a pointer to unallocated znode from internal item. Each time ++ reference to znode is obtained, x_count is increased. Thread can read/write ++ lock znode. Znode data can be loaded through calls to zload(), d_count will ++ be increased appropriately. If all references to znode are released ++ (x_count drops to 0), znode is not recycled immediately. Rather, it is ++ still cached in the hash table in the hope that it will be accessed ++ shortly. ++ ++ There are two ways in which znode existence can be terminated: ++ ++ . sudden death: node bound to this znode is removed from the tree ++ . overpopulation: znode is purged out of memory due to memory pressure ++ ++ 5. His death. ++ ++ Death is complex process. ++ ++ When we irrevocably commit ourselves to decision to remove node from the ++ tree, JNODE_HEARD_BANSHEE bit is set in zjnode.state of corresponding ++ znode. This is done either in ->kill_hook() of internal item or in ++ kill_root() function when tree root is removed. ++ ++ At this moment znode still has: ++ ++ . locks held on it, necessary write ones ++ . references to it ++ . disk block assigned to it ++ . data loaded from the disk ++ . 
pending requests for lock ++ ++ But once JNODE_HEARD_BANSHEE bit set, last call to unlock_znode() does node ++ deletion. Node deletion includes two phases. First all ways to get ++ references to that znode (sibling and parent links and hash lookup using ++ block number stored in parent node) should be deleted -- it is done through ++ sibling_list_remove(), also we assume that nobody uses down link from ++ parent node due to its nonexistence or proper parent node locking and ++ nobody uses parent pointers from children due to absence of them. Second we ++ invalidate all pending lock requests which still are on znode's lock ++ request queue, this is done by invalidate_lock(). Another JNODE_IS_DYING ++ znode status bit is used to invalidate pending lock requests. Once it set ++ all requesters are forced to return -EINVAL from ++ longterm_lock_znode(). Future locking attempts are not possible because all ++ ways to get references to that znode are removed already. Last, node is ++ uncaptured from transaction. ++ ++ When last reference to the dying znode is just about to be released, ++ block number for this lock is released and znode is removed from the ++ hash table. ++ ++ Now znode can be recycled. ++ ++ [it's possible to free bitmap block and remove znode from the hash ++ table when last lock is released. This will result in having ++ referenced but completely orphaned znode] ++ ++ 6. Limbo ++ ++ As have been mentioned above znodes with reference counter 0 are ++ still cached in a hash table. Once memory pressure increases they are ++ purged out of there [this requires something like LRU list for ++ efficient implementation. LRU list would also greatly simplify ++ implementation of coord cache that would in this case morph to just ++ scanning some initial segment of LRU list]. Data loaded into ++ unreferenced znode are flushed back to the durable storage if ++ necessary and memory is freed. Znodes themselves can be recycled at ++ this point too. ++ ++*/ ++ ++#include "debug.h" ++#include "dformat.h" ++#include "key.h" ++#include "coord.h" ++#include "plugin/plugin_header.h" ++#include "plugin/node/node.h" ++#include "plugin/plugin.h" ++#include "txnmgr.h" ++#include "jnode.h" ++#include "znode.h" ++#include "block_alloc.h" ++#include "tree.h" ++#include "tree_walk.h" ++#include "super.h" ++#include "reiser4.h" ++ ++#include ++#include ++#include ++#include ++ ++static z_hash_table *get_htable(reiser4_tree *, ++ const reiser4_block_nr * const blocknr); ++static z_hash_table *znode_get_htable(const znode *); ++static void zdrop(znode *); ++ ++/* hash table support */ ++ ++/* compare two block numbers for equality. Used by hash-table macros */ ++static inline int ++blknreq(const reiser4_block_nr * b1, const reiser4_block_nr * b2) ++{ ++ assert("nikita-534", b1 != NULL); ++ assert("nikita-535", b2 != NULL); ++ ++ return *b1 == *b2; ++} ++ ++/* Hash znode by block number. 
Used by hash-table macros */ ++/* Audited by: umka (2002.06.11) */ ++static inline __u32 ++blknrhashfn(z_hash_table * table, const reiser4_block_nr * b) ++{ ++ assert("nikita-536", b != NULL); ++ ++ return *b & (REISER4_ZNODE_HASH_TABLE_SIZE - 1); ++} ++ ++/* The hash table definition */ ++#define KMALLOC(size) kmalloc((size), GFP_KERNEL) ++#define KFREE(ptr, size) kfree(ptr) ++TYPE_SAFE_HASH_DEFINE(z, znode, reiser4_block_nr, zjnode.key.z, zjnode.link.z, ++ blknrhashfn, blknreq); ++#undef KFREE ++#undef KMALLOC ++ ++/* slab for znodes */ ++static kmem_cache_t *znode_cache; ++ ++int znode_shift_order; ++ ++/** ++ * init_znodes - create znode cache ++ * ++ * Initializes slab cache of znodes. It is part of reiser4 module initialization. ++ */ ++int init_znodes(void) ++{ ++ znode_cache = kmem_cache_create("znode", sizeof(znode), 0, ++ SLAB_HWCACHE_ALIGN | ++ SLAB_RECLAIM_ACCOUNT, NULL, NULL); ++ if (znode_cache == NULL) ++ return RETERR(-ENOMEM); ++ ++ for (znode_shift_order = 0; (1 << znode_shift_order) < sizeof(znode); ++ ++znode_shift_order); ++ --znode_shift_order; ++ return 0; ++} ++ ++/** ++ * done_znodes - delete znode cache ++ * ++ * This is called on reiser4 module unloading or system shutdown. ++ */ ++void done_znodes(void) ++{ ++ destroy_reiser4_cache(&znode_cache); ++} ++ ++/* call this to initialise tree of znodes */ ++int znodes_tree_init(reiser4_tree * tree /* tree to initialise znodes for */ ) ++{ ++ int result; ++ assert("umka-050", tree != NULL); ++ ++ rwlock_init(&tree->dk_lock); ++ ++ result = z_hash_init(&tree->zhash_table, REISER4_ZNODE_HASH_TABLE_SIZE); ++ if (result != 0) ++ return result; ++ result = z_hash_init(&tree->zfake_table, REISER4_ZNODE_HASH_TABLE_SIZE); ++ return result; ++} ++ ++/* free this znode */ ++void zfree(znode * node /* znode to free */ ) ++{ ++ assert("nikita-465", node != NULL); ++ assert("nikita-2120", znode_page(node) == NULL); ++ assert("nikita-2301", list_empty_careful(&node->lock.owners)); ++ assert("nikita-2302", list_empty_careful(&node->lock.requestors)); ++ assert("nikita-2663", (list_empty_careful(&ZJNODE(node)->capture_link) && ++ NODE_LIST(ZJNODE(node)) == NOT_CAPTURED)); ++ assert("nikita-3220", list_empty(&ZJNODE(node)->jnodes)); ++ assert("nikita-3293", !znode_is_right_connected(node)); ++ assert("nikita-3294", !znode_is_left_connected(node)); ++ assert("nikita-3295", node->left == NULL); ++ assert("nikita-3296", node->right == NULL); ++ ++ /* not yet phash_jnode_destroy(ZJNODE(node)); */ ++ ++ kmem_cache_free(znode_cache, node); ++} ++ ++/* call this to free tree of znodes */ ++void znodes_tree_done(reiser4_tree * tree /* tree to finish with znodes of */ ) ++{ ++ znode *node; ++ znode *next; ++ z_hash_table *ztable; ++ ++ /* scan znode hash-tables and kill all znodes, then free hash tables ++ * themselves. 
*/ ++ ++ assert("nikita-795", tree != NULL); ++ ++ ztable = &tree->zhash_table; ++ ++ if (ztable->_table != NULL) { ++ for_all_in_htable(ztable, z, node, next) { ++ node->c_count = 0; ++ node->in_parent.node = NULL; ++ assert("nikita-2179", atomic_read(&ZJNODE(node)->x_count) == 0); ++ zdrop(node); ++ } ++ ++ z_hash_done(&tree->zhash_table); ++ } ++ ++ ztable = &tree->zfake_table; ++ ++ if (ztable->_table != NULL) { ++ for_all_in_htable(ztable, z, node, next) { ++ node->c_count = 0; ++ node->in_parent.node = NULL; ++ assert("nikita-2179", atomic_read(&ZJNODE(node)->x_count) == 0); ++ zdrop(node); ++ } ++ ++ z_hash_done(&tree->zfake_table); ++ } ++} ++ ++/* ZNODE STRUCTURES */ ++ ++/* allocate fresh znode */ ++znode *zalloc(gfp_t gfp_flag /* allocation flag */ ) ++{ ++ znode *node; ++ ++ node = kmem_cache_alloc(znode_cache, gfp_flag); ++ return node; ++} ++ ++/* Initialize fields of znode ++ @node: znode to initialize; ++ @parent: parent znode; ++ @tree: tree we are in. */ ++void zinit(znode * node, const znode * parent, reiser4_tree * tree) ++{ ++ assert("nikita-466", node != NULL); ++ assert("umka-268", current_tree != NULL); ++ ++ memset(node, 0, sizeof *node); ++ ++ assert("umka-051", tree != NULL); ++ ++ jnode_init(&node->zjnode, tree, JNODE_FORMATTED_BLOCK); ++ reiser4_init_lock(&node->lock); ++ init_parent_coord(&node->in_parent, parent); ++} ++ ++/* ++ * remove znode from indices. This is called jput() when last reference on ++ * znode is released. ++ */ ++void znode_remove(znode * node /* znode to remove */ , reiser4_tree * tree) ++{ ++ assert("nikita-2108", node != NULL); ++ assert("nikita-470", node->c_count == 0); ++ assert_rw_write_locked(&(tree->tree_lock)); ++ ++ /* remove reference to this znode from cbk cache */ ++ cbk_cache_invalidate(node, tree); ++ ++ /* update c_count of parent */ ++ if (znode_parent(node) != NULL) { ++ assert("nikita-472", znode_parent(node)->c_count > 0); ++ /* father, onto your hands I forward my spirit... */ ++ znode_parent(node)->c_count--; ++ node->in_parent.node = NULL; ++ } else { ++ /* orphaned znode?! Root? */ ++ } ++ ++ /* remove znode from hash-table */ ++ z_hash_remove_rcu(znode_get_htable(node), node); ++} ++ ++/* zdrop() -- Remove znode from the tree. ++ ++ This is called when znode is removed from the memory. */ ++static void zdrop(znode * node /* znode to finish with */ ) ++{ ++ jdrop(ZJNODE(node)); ++} ++ ++/* ++ * put znode into right place in the hash table. This is called by relocate ++ * code. ++ */ ++int znode_rehash(znode * node /* node to rehash */ , ++ const reiser4_block_nr * new_block_nr /* new block number */ ) ++{ ++ z_hash_table *oldtable; ++ z_hash_table *newtable; ++ reiser4_tree *tree; ++ ++ assert("nikita-2018", node != NULL); ++ ++ tree = znode_get_tree(node); ++ oldtable = znode_get_htable(node); ++ newtable = get_htable(tree, new_block_nr); ++ ++ write_lock_tree(tree); ++ /* remove znode from hash-table */ ++ z_hash_remove_rcu(oldtable, node); ++ ++ /* assertion no longer valid due to RCU */ ++ /* assert("nikita-2019", z_hash_find(newtable, new_block_nr) == NULL); */ ++ ++ /* update blocknr */ ++ znode_set_block(node, new_block_nr); ++ node->zjnode.key.z = *new_block_nr; ++ ++ /* insert it into hash */ ++ z_hash_insert_rcu(newtable, node); ++ write_unlock_tree(tree); ++ return 0; ++} ++ ++/* ZNODE LOOKUP, GET, PUT */ ++ ++/* zlook() - get znode with given block_nr in a hash table or return NULL ++ ++ If result is non-NULL then the znode's x_count is incremented. Internal version ++ accepts pre-computed hash index. 
The hash table is accessed under caller's ++ tree->hash_lock. ++*/ ++znode *zlook(reiser4_tree * tree, const reiser4_block_nr * const blocknr) ++{ ++ znode *result; ++ __u32 hash; ++ z_hash_table *htable; ++ ++ assert("jmacd-506", tree != NULL); ++ assert("jmacd-507", blocknr != NULL); ++ ++ htable = get_htable(tree, blocknr); ++ hash = blknrhashfn(htable, blocknr); ++ ++ rcu_read_lock(); ++ result = z_hash_find_index(htable, hash, blocknr); ++ ++ if (result != NULL) { ++ add_x_ref(ZJNODE(result)); ++ result = znode_rip_check(tree, result); ++ } ++ rcu_read_unlock(); ++ ++ return result; ++} ++ ++/* return hash table where znode with block @blocknr is (or should be) ++ * stored */ ++static z_hash_table *get_htable(reiser4_tree * tree, ++ const reiser4_block_nr * const blocknr) ++{ ++ z_hash_table *table; ++ if (is_disk_addr_unallocated(blocknr)) ++ table = &tree->zfake_table; ++ else ++ table = &tree->zhash_table; ++ return table; ++} ++ ++/* return hash table where znode @node is (or should be) stored */ ++static z_hash_table *znode_get_htable(const znode * node) ++{ ++ return get_htable(znode_get_tree(node), znode_get_block(node)); ++} ++ ++/* zget() - get znode from hash table, allocating it if necessary. ++ ++ First a call to zlook, locating a x-referenced znode if one ++ exists. If znode is not found, allocate new one and return. Result ++ is returned with x_count reference increased. ++ ++ LOCKS TAKEN: TREE_LOCK, ZNODE_LOCK ++ LOCK ORDERING: NONE ++*/ ++znode *zget(reiser4_tree * tree, ++ const reiser4_block_nr * const blocknr, ++ znode * parent, tree_level level, gfp_t gfp_flag) ++{ ++ znode *result; ++ __u32 hashi; ++ ++ z_hash_table *zth; ++ ++ assert("jmacd-512", tree != NULL); ++ assert("jmacd-513", blocknr != NULL); ++ assert("jmacd-514", level < REISER4_MAX_ZTREE_HEIGHT); ++ ++ zth = get_htable(tree, blocknr); ++ hashi = blknrhashfn(zth, blocknr); ++ ++ /* NOTE-NIKITA address-as-unallocated-blocknr still is not ++ implemented. */ ++ ++ z_hash_prefetch_bucket(zth, hashi); ++ ++ rcu_read_lock(); ++ /* Find a matching BLOCKNR in the hash table. If the znode is found, ++ we obtain an reference (x_count) but the znode remains unlocked. ++ Have to worry about race conditions later. */ ++ result = z_hash_find_index(zth, hashi, blocknr); ++ /* According to the current design, the hash table lock protects new ++ znode references. */ ++ if (result != NULL) { ++ add_x_ref(ZJNODE(result)); ++ /* NOTE-NIKITA it should be so, but special case during ++ creation of new root makes such assertion highly ++ complicated. 
*/ ++ assert("nikita-2131", 1 || znode_parent(result) == parent || ++ (ZF_ISSET(result, JNODE_ORPHAN) ++ && (znode_parent(result) == NULL))); ++ result = znode_rip_check(tree, result); ++ } ++ ++ rcu_read_unlock(); ++ ++ if (!result) { ++ znode *shadow; ++ ++ result = zalloc(gfp_flag); ++ if (!result) { ++ return ERR_PTR(RETERR(-ENOMEM)); ++ } ++ ++ zinit(result, parent, tree); ++ ZJNODE(result)->blocknr = *blocknr; ++ ZJNODE(result)->key.z = *blocknr; ++ result->level = level; ++ ++ write_lock_tree(tree); ++ ++ shadow = z_hash_find_index(zth, hashi, blocknr); ++ if (unlikely(shadow != NULL && !ZF_ISSET(shadow, JNODE_RIP))) { ++ jnode_list_remove(ZJNODE(result)); ++ zfree(result); ++ result = shadow; ++ } else { ++ result->version = znode_build_version(tree); ++ z_hash_insert_index_rcu(zth, hashi, result); ++ ++ if (parent != NULL) ++ ++parent->c_count; ++ } ++ ++ add_x_ref(ZJNODE(result)); ++ ++ write_unlock_tree(tree); ++ } ++#if REISER4_DEBUG ++ if (!blocknr_is_fake(blocknr) && *blocknr != 0) ++ reiser4_check_block(blocknr, 1); ++#endif ++ /* Check for invalid tree level, return -EIO */ ++ if (unlikely(znode_get_level(result) != level)) { ++ warning("jmacd-504", ++ "Wrong level for cached block %llu: %i expecting %i", ++ (unsigned long long)(*blocknr), znode_get_level(result), ++ level); ++ zput(result); ++ return ERR_PTR(RETERR(-EIO)); ++ } ++ ++ assert("nikita-1227", znode_invariant(result)); ++ ++ return result; ++} ++ ++/* ZNODE PLUGINS/DATA */ ++ ++/* "guess" plugin for node loaded from the disk. Plugin id of node plugin is ++ stored at the fixed offset from the beginning of the node. */ ++static node_plugin *znode_guess_plugin(const znode * node /* znode to guess ++ * plugin of */ ) ++{ ++ reiser4_tree *tree; ++ ++ assert("nikita-1053", node != NULL); ++ assert("nikita-1055", zdata(node) != NULL); ++ ++ tree = znode_get_tree(node); ++ assert("umka-053", tree != NULL); ++ ++ if (reiser4_is_set(tree->super, REISER4_ONE_NODE_PLUGIN)) { ++ return tree->nplug; ++ } else { ++ return node_plugin_by_disk_id ++ (tree, &((common_node_header *) zdata(node))->plugin_id); ++#ifdef GUESS_EXISTS ++ reiser4_plugin *plugin; ++ ++ /* NOTE-NIKITA add locking here when dynamic plugins will be ++ * implemented */ ++ for_all_plugins(REISER4_NODE_PLUGIN_TYPE, plugin) { ++ if ((plugin->u.node.guess != NULL) ++ && plugin->u.node.guess(node)) ++ return plugin; ++ } ++ warning("nikita-1057", "Cannot guess node plugin"); ++ print_znode("node", node); ++ return NULL; ++#endif ++ } ++} ++ ++/* parse node header and install ->node_plugin */ ++int zparse(znode * node /* znode to parse */ ) ++{ ++ int result; ++ ++ assert("nikita-1233", node != NULL); ++ assert("nikita-2370", zdata(node) != NULL); ++ ++ if (node->nplug == NULL) { ++ node_plugin *nplug; ++ ++ nplug = znode_guess_plugin(node); ++ if (likely(nplug != NULL)) { ++ result = nplug->parse(node); ++ if (likely(result == 0)) ++ node->nplug = nplug; ++ } else { ++ result = RETERR(-EIO); ++ } ++ } else ++ result = 0; ++ return result; ++} ++ ++/* zload with readahead */ ++int zload_ra(znode * node /* znode to load */ , ra_info_t * info) ++{ ++ int result; ++ ++ assert("nikita-484", node != NULL); ++ assert("nikita-1377", znode_invariant(node)); ++ assert("jmacd-7771", !znode_above_root(node)); ++ assert("nikita-2125", atomic_read(&ZJNODE(node)->x_count) > 0); ++ assert("nikita-3016", schedulable()); ++ ++ if (info) ++ formatted_readahead(node, info); ++ ++ result = jload(ZJNODE(node)); ++ assert("nikita-1378", znode_invariant(node)); ++ return result; ++} ++ 
++/* load content of node into memory */ ++int zload(znode * node) ++{ ++ return zload_ra(node, NULL); ++} ++ ++/* call node plugin to initialise newly allocated node. */ ++int zinit_new(znode * node /* znode to initialise */ , gfp_t gfp_flags) ++{ ++ return jinit_new(ZJNODE(node), gfp_flags); ++} ++ ++/* drop reference to node data. When last reference is dropped, data are ++ unloaded. */ ++void zrelse(znode * node /* znode to release references to */ ) ++{ ++ assert("nikita-1381", znode_invariant(node)); ++ ++ jrelse(ZJNODE(node)); ++} ++ ++/* returns free space in node */ ++unsigned znode_free_space(znode * node /* znode to query */ ) ++{ ++ assert("nikita-852", node != NULL); ++ return node_plugin_by_node(node)->free_space(node); ++} ++ ++/* left delimiting key of znode */ ++reiser4_key *znode_get_rd_key(znode * node /* znode to query */ ) ++{ ++ assert("nikita-958", node != NULL); ++ assert_rw_locked(&(znode_get_tree(node)->dk_lock)); ++ assert("nikita-3067", LOCK_CNT_GTZ(rw_locked_dk)); ++ assert("nikita-30671", node->rd_key_version != 0); ++ return &node->rd_key; ++} ++ ++/* right delimiting key of znode */ ++reiser4_key *znode_get_ld_key(znode * node /* znode to query */ ) ++{ ++ assert("nikita-974", node != NULL); ++ assert_rw_locked(&(znode_get_tree(node)->dk_lock)); ++ assert("nikita-3068", LOCK_CNT_GTZ(rw_locked_dk)); ++ assert("nikita-30681", node->ld_key_version != 0); ++ return &node->ld_key; ++} ++ ++ON_DEBUG(atomic_t delim_key_version = ATOMIC_INIT(0); ++ ) ++ ++/* update right-delimiting key of @node */ ++reiser4_key *znode_set_rd_key(znode * node, const reiser4_key * key) ++{ ++ assert("nikita-2937", node != NULL); ++ assert("nikita-2939", key != NULL); ++ assert_rw_write_locked(&(znode_get_tree(node)->dk_lock)); ++ assert("nikita-3069", LOCK_CNT_GTZ(write_locked_dk)); ++ assert("nikita-2944", ++ znode_is_any_locked(node) || ++ znode_get_level(node) != LEAF_LEVEL || ++ keyge(key, &node->rd_key) || ++ keyeq(&node->rd_key, min_key()) || ++ ZF_ISSET(node, JNODE_HEARD_BANSHEE)); ++ ++ node->rd_key = *key; ++ ON_DEBUG(node->rd_key_version = atomic_inc_return(&delim_key_version)); ++ return &node->rd_key; ++} ++ ++/* update left-delimiting key of @node */ ++reiser4_key *znode_set_ld_key(znode * node, const reiser4_key * key) ++{ ++ assert("nikita-2940", node != NULL); ++ assert("nikita-2941", key != NULL); ++ assert_rw_write_locked(&(znode_get_tree(node)->dk_lock)); ++ assert("nikita-3070", LOCK_CNT_GTZ(write_locked_dk)); ++ assert("nikita-2943", ++ znode_is_any_locked(node) || keyeq(&node->ld_key, min_key())); ++ ++ node->ld_key = *key; ++ ON_DEBUG(node->ld_key_version = atomic_inc_return(&delim_key_version)); ++ return &node->ld_key; ++} ++ ++/* true if @key is inside key range for @node */ ++int znode_contains_key(znode * node /* znode to look in */ , ++ const reiser4_key * key /* key to look for */ ) ++{ ++ assert("nikita-1237", node != NULL); ++ assert("nikita-1238", key != NULL); ++ ++ /* left_delimiting_key <= key <= right_delimiting_key */ ++ return keyle(znode_get_ld_key(node), key) ++ && keyle(key, znode_get_rd_key(node)); ++} ++ ++/* same as znode_contains_key(), but lock dk lock */ ++int znode_contains_key_lock(znode * node /* znode to look in */ , ++ const reiser4_key * key /* key to look for */ ) ++{ ++ int result; ++ ++ assert("umka-056", node != NULL); ++ assert("umka-057", key != NULL); ++ ++ read_lock_dk(znode_get_tree(node)); ++ result = znode_contains_key(node, key); ++ read_unlock_dk(znode_get_tree(node)); ++ return result; ++} ++ ++/* get parent pointer, 
assuming tree is not locked */ ++znode *znode_parent_nolock(const znode * node /* child znode */ ) ++{ ++ assert("nikita-1444", node != NULL); ++ return node->in_parent.node; ++} ++ ++/* get parent pointer of znode */ ++znode *znode_parent(const znode * node /* child znode */ ) ++{ ++ assert("nikita-1226", node != NULL); ++ assert("nikita-1406", LOCK_CNT_GTZ(rw_locked_tree)); ++ return znode_parent_nolock(node); ++} ++ ++/* detect uber znode used to protect in-superblock tree root pointer */ ++int znode_above_root(const znode * node /* znode to query */ ) ++{ ++ assert("umka-059", node != NULL); ++ ++ return disk_addr_eq(&ZJNODE(node)->blocknr, &UBER_TREE_ADDR); ++} ++ ++/* check that @node is root---that its block number is recorder in the tree as ++ that of root node */ ++#if REISER4_DEBUG ++static int znode_is_true_root(const znode * node /* znode to query */ ) ++{ ++ assert("umka-060", node != NULL); ++ assert("umka-061", current_tree != NULL); ++ ++ return disk_addr_eq(znode_get_block(node), ++ &znode_get_tree(node)->root_block); ++} ++#endif ++ ++/* check that @node is root */ ++int znode_is_root(const znode * node /* znode to query */ ) ++{ ++ assert("nikita-1206", node != NULL); ++ ++ return znode_get_level(node) == znode_get_tree(node)->height; ++} ++ ++/* Returns true is @node was just created by zget() and wasn't ever loaded ++ into memory. */ ++/* NIKITA-HANS: yes */ ++int znode_just_created(const znode * node) ++{ ++ assert("nikita-2188", node != NULL); ++ return (znode_page(node) == NULL); ++} ++ ++/* obtain updated ->znode_epoch. See seal.c for description. */ ++__u64 znode_build_version(reiser4_tree * tree) ++{ ++ __u64 result; ++ ++ spin_lock(&tree->epoch_lock); ++ result = ++tree->znode_epoch; ++ spin_unlock(&tree->epoch_lock); ++ return result; ++} ++ ++void init_load_count(load_count * dh) ++{ ++ assert("nikita-2105", dh != NULL); ++ memset(dh, 0, sizeof *dh); ++} ++ ++void done_load_count(load_count * dh) ++{ ++ assert("nikita-2106", dh != NULL); ++ if (dh->node != NULL) { ++ for (; dh->d_ref > 0; --dh->d_ref) ++ zrelse(dh->node); ++ dh->node = NULL; ++ } ++} ++ ++static int incr_load_count(load_count * dh) ++{ ++ int result; ++ ++ assert("nikita-2110", dh != NULL); ++ assert("nikita-2111", dh->node != NULL); ++ ++ result = zload(dh->node); ++ if (result == 0) ++ ++dh->d_ref; ++ return result; ++} ++ ++int incr_load_count_znode(load_count * dh, znode * node) ++{ ++ assert("nikita-2107", dh != NULL); ++ assert("nikita-2158", node != NULL); ++ assert("nikita-2109", ++ ergo(dh->node != NULL, (dh->node == node) || (dh->d_ref == 0))); ++ ++ dh->node = node; ++ return incr_load_count(dh); ++} ++ ++int incr_load_count_jnode(load_count * dh, jnode * node) ++{ ++ if (jnode_is_znode(node)) { ++ return incr_load_count_znode(dh, JZNODE(node)); ++ } ++ return 0; ++} ++ ++void copy_load_count(load_count * new, load_count * old) ++{ ++ int ret = 0; ++ done_load_count(new); ++ new->node = old->node; ++ new->d_ref = 0; ++ ++ while ((new->d_ref < old->d_ref) && (ret = incr_load_count(new)) == 0) { ++ } ++ ++ assert("jmacd-87589", ret == 0); ++} ++ ++void move_load_count(load_count * new, load_count * old) ++{ ++ done_load_count(new); ++ new->node = old->node; ++ new->d_ref = old->d_ref; ++ old->node = NULL; ++ old->d_ref = 0; ++} ++ ++/* convert parent pointer into coord */ ++void parent_coord_to_coord(const parent_coord_t * pcoord, coord_t * coord) ++{ ++ assert("nikita-3204", pcoord != NULL); ++ assert("nikita-3205", coord != NULL); ++ ++ coord_init_first_unit_nocheck(coord, 
pcoord->node); ++ coord_set_item_pos(coord, pcoord->item_pos); ++ coord->between = AT_UNIT; ++} ++ ++/* pack coord into parent_coord_t */ ++void coord_to_parent_coord(const coord_t * coord, parent_coord_t * pcoord) ++{ ++ assert("nikita-3206", pcoord != NULL); ++ assert("nikita-3207", coord != NULL); ++ ++ pcoord->node = coord->node; ++ pcoord->item_pos = coord->item_pos; ++} ++ ++/* Initialize a parent hint pointer. (parent hint pointer is a field in znode, ++ look for comments there) */ ++void init_parent_coord(parent_coord_t * pcoord, const znode * node) ++{ ++ pcoord->node = (znode *) node; ++ pcoord->item_pos = (unsigned short)~0; ++} ++ ++#if REISER4_DEBUG ++ ++/* debugging aid: znode invariant */ ++static int znode_invariant_f(const znode * node /* znode to check */ , ++ char const **msg /* where to store error ++ * message, if any */ ) ++{ ++#define _ergo(ant, con) \ ++ ((*msg) = "{" #ant "} ergo {" #con "}", ergo((ant), (con))) ++ ++#define _equi(e1, e2) \ ++ ((*msg) = "{" #e1 "} <=> {" #e2 "}", equi((e1), (e2))) ++ ++#define _check(exp) ((*msg) = #exp, (exp)) ++ ++ return jnode_invariant_f(ZJNODE(node), msg) && ++ /* [znode-fake] invariant */ ++ /* fake znode doesn't have a parent, and */ ++ _ergo(znode_get_level(node) == 0, znode_parent(node) == NULL) && ++ /* there is another way to express this very check, and */ ++ _ergo(znode_above_root(node), znode_parent(node) == NULL) && ++ /* it has special block number, and */ ++ _ergo(znode_get_level(node) == 0, ++ disk_addr_eq(znode_get_block(node), &UBER_TREE_ADDR)) && ++ /* it is the only znode with such block number, and */ ++ _ergo(!znode_above_root(node) && znode_is_loaded(node), ++ !disk_addr_eq(znode_get_block(node), &UBER_TREE_ADDR)) && ++ /* it is parent of the tree root node */ ++ _ergo(znode_is_true_root(node), ++ znode_above_root(znode_parent(node))) && ++ /* [znode-level] invariant */ ++ /* level of parent znode is one larger than that of child, ++ except for the fake znode, and */ ++ _ergo(znode_parent(node) && !znode_above_root(znode_parent(node)), ++ znode_get_level(znode_parent(node)) == ++ znode_get_level(node) + 1) && ++ /* left neighbor is at the same level, and */ ++ _ergo(znode_is_left_connected(node) && node->left != NULL, ++ znode_get_level(node) == znode_get_level(node->left)) && ++ /* right neighbor is at the same level */ ++ _ergo(znode_is_right_connected(node) && node->right != NULL, ++ znode_get_level(node) == znode_get_level(node->right)) && ++ /* [znode-connected] invariant */ ++ _ergo(node->left != NULL, znode_is_left_connected(node)) && ++ _ergo(node->right != NULL, znode_is_right_connected(node)) && ++ _ergo(!znode_is_root(node) && node->left != NULL, ++ znode_is_right_connected(node->left) && ++ node->left->right == node) && ++ _ergo(!znode_is_root(node) && node->right != NULL, ++ znode_is_left_connected(node->right) && ++ node->right->left == node) && ++ /* [znode-c_count] invariant */ ++ /* for any znode, c_count of its parent is greater than 0 */ ++ _ergo(znode_parent(node) != NULL && ++ !znode_above_root(znode_parent(node)), ++ znode_parent(node)->c_count > 0) && ++ /* leaves don't have children */ ++ _ergo(znode_get_level(node) == LEAF_LEVEL, ++ node->c_count == 0) && ++ _check(node->zjnode.jnodes.prev != NULL) && ++ _check(node->zjnode.jnodes.next != NULL) && ++ /* orphan doesn't have a parent */ ++ _ergo(ZF_ISSET(node, JNODE_ORPHAN), znode_parent(node) == 0) && ++ /* [znode-modify] invariant */ ++ /* if znode is not write-locked, its checksum remains ++ * invariant */ ++ /* unfortunately, 
zlock is unordered w.r.t. jnode_lock, so we ++ * cannot check this. */ ++ /* [znode-refs] invariant */ ++ /* only referenced znode can be long-term locked */ ++ _ergo(znode_is_locked(node), ++ atomic_read(&ZJNODE(node)->x_count) != 0); ++} ++ ++/* debugging aid: check znode invariant and panic if it doesn't hold */ ++int znode_invariant(znode * node /* znode to check */ ) ++{ ++ char const *failed_msg; ++ int result; ++ ++ assert("umka-063", node != NULL); ++ assert("umka-064", current_tree != NULL); ++ ++ spin_lock_znode(node); ++ read_lock_tree(znode_get_tree(node)); ++ result = znode_invariant_f(node, &failed_msg); ++ if (!result) { ++ /* print_znode("corrupted node", node); */ ++ warning("jmacd-555", "Condition %s failed", failed_msg); ++ } ++ read_unlock_tree(znode_get_tree(node)); ++ spin_unlock_znode(node); ++ return result; ++} ++ ++/* return non-0 iff data are loaded into znode */ ++int znode_is_loaded(const znode * node /* znode to query */ ) ++{ ++ assert("nikita-497", node != NULL); ++ return jnode_is_loaded(ZJNODE(node)); ++} ++ ++unsigned long znode_times_locked(const znode * z) ++{ ++ return z->times_locked; ++} ++ ++#endif /* REISER4_DEBUG */ ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +Index: linux-2.6.16/fs/reiser4/znode.h +=================================================================== +--- /dev/null ++++ linux-2.6.16/fs/reiser4/znode.h +@@ -0,0 +1,434 @@ ++/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++/* Declaration of znode (Zam's node). See znode.c for more details. */ ++ ++#ifndef __ZNODE_H__ ++#define __ZNODE_H__ ++ ++#include "forward.h" ++#include "debug.h" ++#include "dformat.h" ++#include "key.h" ++#include "coord.h" ++#include "plugin/node/node.h" ++#include "jnode.h" ++#include "lock.h" ++#include "readahead.h" ++ ++#include ++#include ++#include /* for PAGE_CACHE_SIZE */ ++#include ++#include ++ ++/* znode tracks its position within parent (internal item in a parent node, ++ * that contains znode's block number). */ ++typedef struct parent_coord { ++ znode *node; ++ pos_in_node_t item_pos; ++} parent_coord_t; ++ ++/* &znode - node in a reiser4 tree. ++ ++ NOTE-NIKITA fields in this struct have to be rearranged (later) to reduce ++ cacheline pressure. ++ ++ Locking: ++ ++ Long term: data in a disk node attached to this znode are protected ++ by long term, deadlock aware lock ->lock; ++ ++ Spin lock: the following fields are protected by the spin lock: ++ ++ ->lock ++ ++ Following fields are protected by the global tree lock: ++ ++ ->left ++ ->right ++ ->in_parent ++ ->c_count ++ ++ Following fields are protected by the global delimiting key lock (dk_lock): ++ ++ ->ld_key (to update ->ld_key long-term lock on the node is also required) ++ ->rd_key ++ ++ Following fields are protected by the long term lock: ++ ++ ->nr_items ++ ++ ->node_plugin is never changed once set. This means that after code made ++ itself sure that field is valid it can be accessed without any additional ++ locking. ++ ++ ->level is immutable. ++ ++ Invariants involving this data-type: ++ ++ [znode-fake] ++ [znode-level] ++ [znode-connected] ++ [znode-c_count] ++ [znode-refs] ++ [jnode-refs] ++ [jnode-queued] ++ [znode-modify] ++ ++ For this to be made into a clustering or NUMA filesystem, we would want to eliminate all of the global locks. 
++ Suggestions for how to do that are desired.*/ ++struct znode { ++ /* Embedded jnode. */ ++ jnode zjnode; ++ ++ /* contains three subfields, node, pos_in_node, and pos_in_unit. ++ ++ pos_in_node and pos_in_unit are only hints that are cached to ++ speed up lookups during balancing. They are not required to be up to ++ date. Synched in find_child_ptr(). ++ ++ This value allows us to avoid expensive binary searches. ++ ++ in_parent->node points to the parent of this node, and is NOT a ++ hint. ++ */ ++ parent_coord_t in_parent; ++ ++ /* ++ * sibling list pointers ++ */ ++ ++ /* left-neighbor */ ++ znode *left; ++ /* right-neighbor */ ++ znode *right; ++ ++ /* long term lock on node content. This lock supports deadlock ++ detection. See lock.c ++ */ ++ zlock lock; ++ ++ /* You cannot remove from memory a node that has children in ++ memory. This is because we rely on the fact that parent of given ++ node can always be reached without blocking for io. When reading a ++ node into memory you must increase the c_count of its parent, when ++ removing it from memory you must decrease the c_count. This makes ++ the code simpler, and the cases where it is suboptimal are truly ++ obscure. ++ */ ++ int c_count; ++ ++ /* plugin of node attached to this znode. NULL if znode is not ++ loaded. */ ++ node_plugin *nplug; ++ ++ /* version of znode data. This is increased on each modification. This ++ * is necessary to implement seals (see seal.[ch]) efficiently. */ ++ __u64 version; ++ ++ /* left delimiting key. Necessary to efficiently perform ++ balancing with node-level locking. Kept in memory only. */ ++ reiser4_key ld_key; ++ /* right delimiting key. */ ++ reiser4_key rd_key; ++ ++ /* znode's tree level */ ++ __u16 level; ++ /* number of items in this node. This field is modified by node ++ * plugin. */ ++ __u16 nr_items; ++ ++#if REISER4_DEBUG ++ void *creator; ++ reiser4_key first_key; ++ unsigned long times_locked; ++ int left_version; /* when node->left was updated */ ++ int right_version; /* when node->right was updated */ ++ int ld_key_version; /* when node->ld_key was updated */ ++ int rd_key_version; /* when node->rd_key was updated */ ++#endif ++ ++} __attribute__ ((aligned(16))); ++ ++ON_DEBUG(extern atomic_t delim_key_version; ++ ) ++ ++/* In general I think these macros should not be exposed. */ ++#define znode_is_locked(node) (lock_is_locked(&node->lock)) ++#define znode_is_rlocked(node) (lock_is_rlocked(&node->lock)) ++#define znode_is_wlocked(node) (lock_is_wlocked(&node->lock)) ++#define znode_is_wlocked_once(node) (lock_is_wlocked_once(&node->lock)) ++#define znode_can_be_rlocked(node) (lock_can_be_rlocked(&node->lock)) ++#define is_lock_compatible(node, mode) (lock_mode_compatible(&node->lock, mode)) ++/* Macros for accessing the znode state. 
*/ ++#define ZF_CLR(p,f) JF_CLR (ZJNODE(p), (f)) ++#define ZF_ISSET(p,f) JF_ISSET(ZJNODE(p), (f)) ++#define ZF_SET(p,f) JF_SET (ZJNODE(p), (f)) ++extern znode *zget(reiser4_tree * tree, const reiser4_block_nr * const block, ++ znode * parent, tree_level level, gfp_t gfp_flag); ++extern znode *zlook(reiser4_tree * tree, const reiser4_block_nr * const block); ++extern int zload(znode * node); ++extern int zload_ra(znode * node, ra_info_t * info); ++extern int zinit_new(znode * node, gfp_t gfp_flags); ++extern void zrelse(znode * node); ++extern void znode_change_parent(znode * new_parent, reiser4_block_nr * block); ++ ++/* size of data in znode */ ++static inline unsigned ++znode_size(const znode * node UNUSED_ARG /* znode to query */ ) ++{ ++ assert("nikita-1416", node != NULL); ++ return PAGE_CACHE_SIZE; ++} ++ ++extern void parent_coord_to_coord(const parent_coord_t * pcoord, ++ coord_t * coord); ++extern void coord_to_parent_coord(const coord_t * coord, ++ parent_coord_t * pcoord); ++extern void init_parent_coord(parent_coord_t * pcoord, const znode * node); ++ ++extern unsigned znode_free_space(znode * node); ++ ++extern reiser4_key *znode_get_rd_key(znode * node); ++extern reiser4_key *znode_get_ld_key(znode * node); ++ ++extern reiser4_key *znode_set_rd_key(znode * node, const reiser4_key * key); ++extern reiser4_key *znode_set_ld_key(znode * node, const reiser4_key * key); ++ ++/* `connected' state checks */ ++static inline int znode_is_right_connected(const znode * node) ++{ ++ return ZF_ISSET(node, JNODE_RIGHT_CONNECTED); ++} ++ ++static inline int znode_is_left_connected(const znode * node) ++{ ++ return ZF_ISSET(node, JNODE_LEFT_CONNECTED); ++} ++ ++static inline int znode_is_connected(const znode * node) ++{ ++ return znode_is_right_connected(node) && znode_is_left_connected(node); ++} ++ ++extern int znode_shift_order; ++extern int znode_rehash(znode * node, const reiser4_block_nr * new_block_nr); ++extern void znode_remove(znode *, reiser4_tree *); ++extern znode *znode_parent(const znode * node); ++extern znode *znode_parent_nolock(const znode * node); ++extern int znode_above_root(const znode * node); ++extern int init_znodes(void); ++extern void done_znodes(void); ++extern int znodes_tree_init(reiser4_tree * ztree); ++extern void znodes_tree_done(reiser4_tree * ztree); ++extern int znode_contains_key(znode * node, const reiser4_key * key); ++extern int znode_contains_key_lock(znode * node, const reiser4_key * key); ++extern unsigned znode_save_free_space(znode * node); ++extern unsigned znode_recover_free_space(znode * node); ++extern znode *zalloc(gfp_t gfp_flag); ++extern void zinit(znode *, const znode * parent, reiser4_tree *); ++extern int zparse(znode * node); ++ ++ ++extern int znode_just_created(const znode * node); ++ ++extern void zfree(znode * node); ++ ++#if REISER4_DEBUG ++extern void print_znode(const char *prefix, const znode * node); ++#else ++#define print_znode( p, n ) noop ++#endif ++ ++/* Make it look like various znode functions exist instead of treating znodes as ++ jnodes in znode-specific code. 
*/ ++#define znode_page(x) jnode_page ( ZJNODE(x) ) ++#define zdata(x) jdata ( ZJNODE(x) ) ++#define znode_get_block(x) jnode_get_block ( ZJNODE(x) ) ++#define znode_created(x) jnode_created ( ZJNODE(x) ) ++#define znode_set_created(x) jnode_set_created ( ZJNODE(x) ) ++#define znode_convertible(x) jnode_convertible (ZJNODE(x)) ++#define znode_set_convertible(x) jnode_set_convertible (ZJNODE(x)) ++ ++#define znode_is_dirty(x) jnode_is_dirty ( ZJNODE(x) ) ++#define znode_check_dirty(x) jnode_check_dirty ( ZJNODE(x) ) ++#define znode_make_clean(x) jnode_make_clean ( ZJNODE(x) ) ++#define znode_set_block(x, b) jnode_set_block ( ZJNODE(x), (b) ) ++ ++#define spin_lock_znode(x) spin_lock_jnode ( ZJNODE(x) ) ++#define spin_unlock_znode(x) spin_unlock_jnode ( ZJNODE(x) ) ++#define spin_trylock_znode(x) spin_trylock_jnode ( ZJNODE(x) ) ++#define spin_znode_is_locked(x) spin_jnode_is_locked ( ZJNODE(x) ) ++#define spin_znode_is_not_locked(x) spin_jnode_is_not_locked ( ZJNODE(x) ) ++ ++#if REISER4_DEBUG ++extern int znode_x_count_is_protected(const znode * node); ++extern int znode_invariant(znode * node); ++#endif ++ ++/* acquire reference to @node */ ++static inline znode *zref(znode * node) ++{ ++ /* change of x_count from 0 to 1 is protected by tree spin-lock */ ++ return JZNODE(jref(ZJNODE(node))); ++} ++ ++/* release reference to @node */ ++static inline void zput(znode * node) ++{ ++ assert("nikita-3564", znode_invariant(node)); ++ jput(ZJNODE(node)); ++} ++ ++/* get the level field for a znode */ ++static inline tree_level znode_get_level(const znode * node) ++{ ++ return node->level; ++} ++ ++/* get the level field for a jnode */ ++static inline tree_level jnode_get_level(const jnode * node) ++{ ++ if (jnode_is_znode(node)) ++ return znode_get_level(JZNODE(node)); ++ else ++ /* unformatted nodes are all at the LEAF_LEVEL and for ++ "semi-formatted" nodes like bitmaps, level doesn't matter. */ ++ return LEAF_LEVEL; ++} ++ ++/* true if jnode is on leaf level */ ++static inline int jnode_is_leaf(const jnode * node) ++{ ++ if (jnode_is_znode(node)) ++ return (znode_get_level(JZNODE(node)) == LEAF_LEVEL); ++ if (jnode_get_type(node) == JNODE_UNFORMATTED_BLOCK) ++ return 1; ++ return 0; ++} ++ ++/* return znode's tree */ ++static inline reiser4_tree *znode_get_tree(const znode * node) ++{ ++ assert("nikita-2692", node != NULL); ++ return jnode_get_tree(ZJNODE(node)); ++} ++ ++/* resolve race with zput */ ++static inline znode *znode_rip_check(reiser4_tree * tree, znode * node) ++{ ++ jnode *j; ++ ++ j = jnode_rip_sync(tree, ZJNODE(node)); ++ if (likely(j != NULL)) ++ node = JZNODE(j); ++ else ++ node = NULL; ++ return node; ++} ++ ++#if defined(REISER4_DEBUG) ++int znode_is_loaded(const znode * node /* znode to query */ ); ++#endif ++ ++extern __u64 znode_build_version(reiser4_tree * tree); ++ ++/* Data-handles. A data handle object manages pairing calls to zload() and zrelse(). We ++ must load the data for a node in many places. We could do this by simply calling ++ zload() everywhere, the difficulty arises when we must release the loaded data by ++ calling zrelse. In a function with many possible error/return paths, it requires extra ++ work to figure out which exit paths must call zrelse and those which do not. The data ++ handle automatically calls zrelse for every zload that it is responsible for. In that ++ sense, it acts much like a lock_handle. 
++*/ ++typedef struct load_count { ++ znode *node; ++ int d_ref; ++} load_count; ++ ++extern void init_load_count(load_count * lc); /* Initialize a load_count set the current node to NULL. */ ++extern void done_load_count(load_count * dh); /* Finalize a load_count: call zrelse() if necessary */ ++extern int incr_load_count_znode(load_count * dh, znode * node); /* Set the argument znode to the current node, call zload(). */ ++extern int incr_load_count_jnode(load_count * dh, jnode * node); /* If the argument jnode is formatted, do the same as ++ * incr_load_count_znode, otherwise do nothing (unformatted nodes ++ * don't require zload/zrelse treatment). */ ++extern void move_load_count(load_count * new, load_count * old); /* Move the contents of a load_count. Old handle is released. */ ++extern void copy_load_count(load_count * new, load_count * old); /* Copy the contents of a load_count. Old handle remains held. */ ++ ++/* Variable initializers for load_count. */ ++#define INIT_LOAD_COUNT ( load_count * ){ .node = NULL, .d_ref = 0 } ++#define INIT_LOAD_COUNT_NODE( n ) ( load_count ){ .node = ( n ), .d_ref = 0 } ++/* A convenience macro for use in assertions or debug-only code, where loaded ++ data is only required to perform the debugging check. This macro ++ encapsulates an expression inside a pair of calls to zload()/zrelse(). */ ++#define WITH_DATA( node, exp ) \ ++({ \ ++ long __with_dh_result; \ ++ znode *__with_dh_node; \ ++ \ ++ __with_dh_node = ( node ); \ ++ __with_dh_result = zload( __with_dh_node ); \ ++ if( __with_dh_result == 0 ) { \ ++ __with_dh_result = ( long )( exp ); \ ++ zrelse( __with_dh_node ); \ ++ } \ ++ __with_dh_result; \ ++}) ++ ++/* Same as above, but accepts a return value in case zload fails. */ ++#define WITH_DATA_RET( node, ret, exp ) \ ++({ \ ++ int __with_dh_result; \ ++ znode *__with_dh_node; \ ++ \ ++ __with_dh_node = ( node ); \ ++ __with_dh_result = zload( __with_dh_node ); \ ++ if( __with_dh_result == 0 ) { \ ++ __with_dh_result = ( int )( exp ); \ ++ zrelse( __with_dh_node ); \ ++ } else \ ++ __with_dh_result = ( ret ); \ ++ __with_dh_result; \ ++}) ++ ++#define WITH_COORD(coord, exp) \ ++({ \ ++ coord_t *__coord; \ ++ \ ++ __coord = (coord); \ ++ coord_clear_iplug(__coord); \ ++ WITH_DATA(__coord->node, exp); \ ++}) ++ ++#if REISER4_DEBUG ++#define STORE_COUNTERS \ ++ lock_counters_info __entry_counters = *lock_counters() ++#define CHECK_COUNTERS \ ++ON_DEBUG_CONTEXT( \ ++({ \ ++ __entry_counters.x_refs = lock_counters() -> x_refs; \ ++ __entry_counters.t_refs = lock_counters() -> t_refs; \ ++ __entry_counters.d_refs = lock_counters() -> d_refs; \ ++ assert("nikita-2159", \ ++ !memcmp(&__entry_counters, lock_counters(), \ ++ sizeof __entry_counters)); \ ++}) ) ++ ++#else ++#define STORE_COUNTERS ++#define CHECK_COUNTERS noop ++#endif ++ ++/* __ZNODE_H__ */ ++#endif ++ ++/* Make Linus happy. 
++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +Index: linux-2.6.16/include/linux/fs.h +=================================================================== +--- linux-2.6.16.orig/include/linux/fs.h ++++ linux-2.6.16/include/linux/fs.h +@@ -1085,6 +1085,8 @@ struct super_operations { + void (*clear_inode) (struct inode *); + void (*umount_begin) (struct super_block *); + ++ void (*sync_inodes) (struct super_block *sb, ++ struct writeback_control *wbc); + int (*show_options)(struct seq_file *, struct vfsmount *); + + ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t); +@@ -1449,6 +1451,7 @@ extern int invalidate_inode_pages2(struc + extern int invalidate_inode_pages2_range(struct address_space *mapping, + pgoff_t start, pgoff_t end); + extern int write_inode_now(struct inode *, int); ++extern void generic_sync_sb_inodes(struct super_block *, struct writeback_control *); + extern int filemap_fdatawrite(struct address_space *); + extern int filemap_flush(struct address_space *); + extern int filemap_fdatawait(struct address_space *); +Index: linux-2.6.16/lib/radix-tree.c +=================================================================== +--- linux-2.6.16.orig/lib/radix-tree.c ++++ linux-2.6.16/lib/radix-tree.c +@@ -139,6 +139,7 @@ static inline void tag_set(struct radix_ + { + __set_bit(offset, node->tags[tag]); + } ++EXPORT_SYMBOL(radix_tree_preload); + + static inline void tag_clear(struct radix_tree_node *node, int tag, int offset) + { +Index: linux-2.6.16/mm/filemap.c +=================================================================== +--- linux-2.6.16.orig/mm/filemap.c ++++ linux-2.6.16/mm/filemap.c +@@ -119,6 +119,7 @@ void __remove_from_page_cache(struct pag + mapping->nrpages--; + pagecache_acct(-1); + } ++EXPORT_SYMBOL(__remove_from_page_cache); + + void remove_from_page_cache(struct page *page) + { +@@ -130,6 +131,7 @@ void remove_from_page_cache(struct page + __remove_from_page_cache(page); + write_unlock_irq(&mapping->tree_lock); + } ++EXPORT_SYMBOL(remove_from_page_cache); + + static int sync_page(void *word) + { +@@ -272,6 +274,7 @@ static int wait_on_page_writeback_range( + + return ret; + } ++EXPORT_SYMBOL(add_to_page_cache_lru); + + /* + * Write and wait upon all the pages in the passed range. This is a "data +@@ -632,7 +635,6 @@ repeat: + page_cache_release(cached_page); + return page; + } +- + EXPORT_SYMBOL(find_or_create_page); + + /** +@@ -665,6 +667,7 @@ unsigned find_get_pages(struct address_s + read_unlock_irq(&mapping->tree_lock); + return ret; + } ++EXPORT_SYMBOL(find_get_pages); + + /* + * Like find_get_pages, except we only return pages which are tagged with +@@ -686,6 +689,7 @@ unsigned find_get_pages_tag(struct addre + read_unlock_irq(&mapping->tree_lock); + return ret; + } ++EXPORT_SYMBOL(find_get_pages_tag); + + /* + * Same as grab_cache_page, but do not wait if the page is unavailable. +Index: linux-2.6.16/mm/page-writeback.c +=================================================================== +--- linux-2.6.16.orig/mm/page-writeback.c ++++ linux-2.6.16/mm/page-writeback.c +@@ -187,7 +187,7 @@ get_dirty_limits(struct writeback_state + * If we're over `background_thresh' then pdflush is woken to perform some + * writeout. 
+ */ +-static void balance_dirty_pages(struct address_space *mapping) ++void balance_dirty_pages(struct address_space *mapping) + { + struct writeback_state wbs; + long nr_reclaimable; +@@ -253,6 +253,7 @@ static void balance_dirty_pages(struct a + (!laptop_mode && (nr_reclaimable > background_thresh))) + pdflush_operation(background_writeout, 0); + } ++EXPORT_SYMBOL(balance_dirty_pages); + + /** + * balance_dirty_pages_ratelimited - balance dirty memory state +Index: linux-2.6.16/mm/readahead.c +=================================================================== +--- linux-2.6.16.orig/mm/readahead.c ++++ linux-2.6.16/mm/readahead.c +@@ -541,6 +541,7 @@ page_cache_readahead(struct address_spac + out: + return ra->prev_page + 1; + } ++EXPORT_SYMBOL_GPL(page_cache_readahead); + + /* + * handle_ra_miss() is called when it is known that a page which should have +@@ -558,6 +559,7 @@ void handle_ra_miss(struct address_space + ra->flags &= ~RA_FLAG_INCACHE; + ra->cache_hit = 0; + } ++EXPORT_SYMBOL_GPL(handle_ra_miss); + + /* + * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a